4c20552c81bc7fc65757e95b3be4bea2ae2d7937
[linux-2.6.git] / net / ipv4 / netfilter / ip_conntrack_proto_tcp.c
1 /* (C) 1999-2001 Paul `Rusty' Russell
2  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License version 2 as
6  * published by the Free Software Foundation.
7  *
8  * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>:
9  *      - Real stateful connection tracking
10  *      - Modified state transitions table
11  *      - Window scaling support added
12  *      - SACK support added
13  *
14  * Willy Tarreau:
15  *      - State table bugfixes
16  *      - More robust state changes
17  *      - Tuning timer parameters
18  *
19  * version 2.2
20  */
21
22 #include <linux/config.h>
23 #include <linux/types.h>
24 #include <linux/sched.h>
25 #include <linux/timer.h>
26 #include <linux/netfilter.h>
27 #include <linux/module.h>
28 #include <linux/in.h>
29 #include <linux/ip.h>
30 #include <linux/tcp.h>
31 #include <linux/spinlock.h>
32
33 #include <net/tcp.h>
34
35 #include <linux/netfilter.h>
36 #include <linux/netfilter_ipv4.h>
37 #include <linux/netfilter_ipv4/ip_conntrack.h>
38 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
39 #include <linux/netfilter_ipv4/lockhelp.h>
40
/* Debug tracing switch.  With the condition below left at 0, DEBUGP(...)
   expands to nothing.  Changing it to 1 maps DEBUGP onto printk and defines
   DEBUGP_VARS, which enables locals that are only needed by the traces. */
41 #if 0
42 #define DEBUGP printk
43 #define DEBUGP_VARS
44 #else
45 #define DEBUGP(format, args...)
46 #endif
47
48 /* Protects conntrack->proto.tcp.  Readers in this file bracket their
   accesses with READ_LOCK/READ_UNLOCK, writers with WRITE_LOCK/WRITE_UNLOCK. */
49 static DECLARE_RWLOCK(tcp_lock);
50
/*
 * "Be conservative in what you do,
 *  be liberal in what you accept from others."
 * When non-zero, only out-of-window RST segments are marked INVALID.
 */
int ip_ct_tcp_be_liberal;

/*
 * Number of packets that have to pass in each direction before a connection
 * picked up from the middle is assumed to be in sync (if either side uses
 * window scaling, we lost the game).  Zero disables picking up already
 * established connections.
 */
int ip_ct_tcp_loose = 3;

/*
 * Maximum number of retransmitted packets tolerated without an (acceptable)
 * ACK from the destination.  Once this number is reached, a shorter timer
 * is started.
 */
int ip_ct_tcp_max_retrans = 3;
67
68   /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
69      closely.  They're more complex. --RR */
70
/*
 * Printable state names; tcp_print_conntrack() indexes this array with
 * conntrack->proto.tcp.state.
 */
static const char *tcp_conntrack_names[] = {
        "NONE",       "SYN_SENT",   "SYN_RECV", "ESTABLISHED",
        "FIN_WAIT",   "CLOSE_WAIT", "LAST_ACK", "TIME_WAIT",
        "CLOSE",      "LISTEN",
};
83   
/* Postfix time-unit macros: they are written after a number, so that
   "2 MINS" expands to "2 * 60 * HZ".  Each unit is defined in terms of the
   next smaller one. */
84 #define SECS * HZ
85 #define MINS * 60 SECS
86 #define HOURS * 60 MINS
87 #define DAYS * 24 HOURS
88
/* Per-state timeout tunables, expressed in multiples of HZ. */
89 unsigned long ip_ct_tcp_timeout_syn_sent =      2 MINS;
90 unsigned long ip_ct_tcp_timeout_syn_recv =     60 SECS;
/* NOTE(review): 5U * 24 * 60 * 60 * HZ no longer fits into a 32-bit
   unsigned value once HZ gets close to 10000, which is presumably why
   large HZ values fall back to 2 days - confirm. */
91 #if HZ < 9942
92 unsigned long ip_ct_tcp_timeout_established =  5U DAYS;
93 #else
94 unsigned long ip_ct_tcp_timeout_established =  2U DAYS;
95 #endif
96 unsigned long ip_ct_tcp_timeout_fin_wait =      2 MINS;
97 unsigned long ip_ct_tcp_timeout_close_wait =   60 SECS;
98 unsigned long ip_ct_tcp_timeout_last_ack =     30 SECS;
99 unsigned long ip_ct_tcp_timeout_time_wait =     2 MINS;
100 unsigned long ip_ct_tcp_timeout_close =        10 SECS;
101
102 /* RFC1122 says the R2 limit should be at least 100 seconds.
103    Linux uses 15 packets as limit, which corresponds 
104    to ~13-30min depending on RTO. */
105 unsigned long ip_ct_tcp_timeout_max_retrans =     5 MINS;
106  
107 static unsigned long * tcp_timeouts[]
108 = { NULL,                              /*      TCP_CONNTRACK_NONE */
109     &ip_ct_tcp_timeout_syn_sent,       /*      TCP_CONNTRACK_SYN_SENT, */
110     &ip_ct_tcp_timeout_syn_recv,       /*      TCP_CONNTRACK_SYN_RECV, */
111     &ip_ct_tcp_timeout_established,    /*      TCP_CONNTRACK_ESTABLISHED,      */
112     &ip_ct_tcp_timeout_fin_wait,       /*      TCP_CONNTRACK_FIN_WAIT, */
113     &ip_ct_tcp_timeout_close_wait,     /*      TCP_CONNTRACK_CLOSE_WAIT,       */
114     &ip_ct_tcp_timeout_last_ack,       /*      TCP_CONNTRACK_LAST_ACK, */
115     &ip_ct_tcp_timeout_time_wait,      /*      TCP_CONNTRACK_TIME_WAIT,        */
116     &ip_ct_tcp_timeout_close,          /*      TCP_CONNTRACK_CLOSE,    */
117     NULL,                              /*      TCP_CONNTRACK_LISTEN */
118  };
119  
/* Three-letter shorthands for the conntrack states, used to keep the state
   transition table readable.  sIV marks an invalid transition and sIG a
   packet that is to be ignored. */
120 #define sNO TCP_CONNTRACK_NONE
121 #define sSS TCP_CONNTRACK_SYN_SENT
122 #define sSR TCP_CONNTRACK_SYN_RECV
123 #define sES TCP_CONNTRACK_ESTABLISHED
124 #define sFW TCP_CONNTRACK_FIN_WAIT
125 #define sCW TCP_CONNTRACK_CLOSE_WAIT
126 #define sLA TCP_CONNTRACK_LAST_ACK
127 #define sTW TCP_CONNTRACK_TIME_WAIT
128 #define sCL TCP_CONNTRACK_CLOSE
129 #define sLI TCP_CONNTRACK_LISTEN
130 #define sIV TCP_CONNTRACK_MAX
131 #define sIG TCP_CONNTRACK_IGNORE
132
/*
 * Classification of a segment by its RST/SYN/FIN/ACK flags.  The values
 * are produced by get_conntrack_index().
 */
enum tcp_bit_set {
        TCP_SYN_SET    = 0,
        TCP_SYNACK_SET = 1,
        TCP_FIN_SET    = 2,
        TCP_ACK_SET    = 3,
        TCP_RST_SET    = 4,
        TCP_NONE_SET   = 5,
};
142   
143 /*
144  * The TCP state transition table needs a few words...
145  *
146  * We are the man in the middle. All the packets go through us
147  * but might get lost in transit to the destination.
148  * It is assumed that the destinations can't receive segments 
149  * we haven't seen.
150  *
151  * The checked segment is in window, but our windows are *not*
152  * equivalent with the ones of the sender/receiver. We always
153  * try to guess the state of the current sender.
154  *
155  * The meanings of the states are:
156  *
157  * NONE:        initial state
158  * SYN_SENT:    SYN-only packet seen 
159  * SYN_RECV:    SYN-ACK packet seen
160  * ESTABLISHED: ACK packet seen
161  * FIN_WAIT:    FIN packet seen
162  * CLOSE_WAIT:  ACK seen (after FIN) 
163  * LAST_ACK:    FIN seen (after FIN)
164  * TIME_WAIT:   last ACK seen
165  * CLOSE:       closed connection
166  *
167  * LISTEN state is not used.
168  *
169  * Packets marked as IGNORED (sIG):
170  *      if they may be either invalid or valid 
171  *      and the receiver may send back a connection 
172  *      closing RST or a SYN/ACK.
173  *
174  * Packets marked as INVALID (sIV):
175  *      if they are invalid
176  *      or we do not support the request (simultaneous open)
177  */
178 static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
179         {
180 /* ORIGINAL direction. Rows follow the order of enum tcp_bit_set, columns are the current state. */
181 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
182 /*syn*/    { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV },
183 /*
184  *      sNO -> sSS      Initialize a new connection
185  *      sSS -> sSS      Retransmitted SYN
186  *      sSR -> sIG      Late retransmitted SYN?
187  *      sES -> sIG      Error: SYNs in window outside the SYN_SENT state
188  *                      are errors. Receiver will reply with RST 
189  *                      and close the connection.
190  *                      Or we are not in sync and hold a dead connection.
191  *      sFW -> sIG
192  *      sCW -> sIG
193  *      sLA -> sIG
194  *      sTW -> sSS      Reopened connection (RFC 1122).
195  *      sCL -> sSS
196  */
197 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
198 /*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
199 /*
200  * A SYN/ACK from the client is always invalid:
201  *      - either it tries to set up a simultaneous open, which is 
202  *        not supported;
203  *      - or the firewall has just been inserted between the two hosts
204  *        during the session set-up. The SYN will be retransmitted 
205  *        by the true client (or it'll time out).
206  */
207 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
208 /*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
209 /*
210  *      sNO -> sIV      Too late and no reason to do anything...
211  *      sSS -> sIV      Client might not send FIN in this state:
212  *                      we enforce waiting for a SYN/ACK reply first.
213  *      sSR -> sFW      Close started.
214  *      sES -> sFW      
215  *      sFW -> sLA      FIN seen in both directions, waiting for
216  *                      the last ACK. 
217  *                      Might be a retransmitted FIN as well...
218  *      sCW -> sLA
219  *      sLA -> sLA      Retransmitted FIN. Remain in the same state.
220  *      sTW -> sTW
221  *      sCL -> sCL
222  */
223 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
224 /*ack*/    { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
225 /*
226  *      sNO -> sES      Assumed.
227  *      sSS -> sIV      ACK is invalid: we haven't seen a SYN/ACK yet.
228  *      sSR -> sES      Established state is reached.
229  *      sES -> sES      :-)
230  *      sFW -> sCW      Normal close request answered by ACK.
231  *      sCW -> sCW
232  *      sLA -> sTW      Last ACK detected.
233  *      sTW -> sTW      Retransmitted last ACK. Remain in the same state.
234  *      sCL -> sCL
235  */
236 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
237 /*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
238 /*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
239         },
240         {
241 /* REPLY direction. Same row and column layout as above. */
242 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
243 /*syn*/    { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
244 /*
245  *      sNO -> sIV      Never reached.
246  *      sSS -> sIV      Simultaneous open, not supported
247  *      sSR -> sIV      Simultaneous open, not supported.
248  *      sES -> sIV      Server may not initiate a connection.
249  *      sFW -> sIV
250  *      sCW -> sIV
251  *      sLA -> sIV
252  *      sTW -> sIV      Reopened connection, but server may not do it.
253  *      sCL -> sIV
254  */
255 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
256 /*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV },
257 /*
258  *      sSS -> sSR      Standard open.
259  *      sSR -> sSR      Retransmitted SYN/ACK.
260  *      sES -> sIG      Late retransmitted SYN/ACK?
261  *      sFW -> sIG
262  *      sCW -> sIG
263  *      sLA -> sIG
264  *      sTW -> sIG
265  *      sCL -> sIG
266  */
267 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
268 /*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
269 /*
270  *      sSS -> sIV      Server might not send FIN in this state.
271  *      sSR -> sFW      Close started.
272  *      sES -> sFW
273  *      sFW -> sLA      FIN seen in both directions.
274  *      sCW -> sLA
275  *      sLA -> sLA      Retransmitted FIN.
276  *      sTW -> sTW
277  *      sCL -> sCL
278  */
279 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
280 /*ack*/    { sIV, sIG, sIV, sES, sCW, sCW, sTW, sTW, sCL, sIV },
281 /*
282  *      sSS -> sIG      Might be a half-open connection.
283  *      sSR -> sIV      Simultaneous open.
284  *      sES -> sES      :-)
285  *      sFW -> sCW      Normal close request answered by ACK.
286  *      sCW -> sCW
287  *      sLA -> sTW      Last ACK detected.
288  *      sTW -> sTW      Retransmitted last ACK.
289  *      sCL -> sCL
290  */
291 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
292 /*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
293 /*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
294         }
295 };
296
297 static int tcp_pkt_to_tuple(const struct sk_buff *skb,
298                             unsigned int dataoff,
299                             struct ip_conntrack_tuple *tuple)
300 {
301         struct tcphdr _hdr, *hp;
302
303         /* Actually only need first 8 bytes. */
304         hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
305         if (hp == NULL)
306                 return 0;
307
308         tuple->src.u.tcp.port = hp->source;
309         tuple->dst.u.tcp.port = hp->dest;
310
311         return 1;
312 }
313
314 static int tcp_invert_tuple(struct ip_conntrack_tuple *tuple,
315                             const struct ip_conntrack_tuple *orig)
316 {
317         tuple->src.u.tcp.port = orig->dst.u.tcp.port;
318         tuple->dst.u.tcp.port = orig->src.u.tcp.port;
319         return 1;
320 }
321
322 /* Print out the per-protocol part of the tuple. */
323 static int tcp_print_tuple(struct seq_file *s,
324                            const struct ip_conntrack_tuple *tuple)
325 {
326         return seq_printf(s, "sport=%hu dport=%hu ",
327                           ntohs(tuple->src.u.tcp.port),
328                           ntohs(tuple->dst.u.tcp.port));
329 }
330
331 /* Print out the private part of the conntrack. */
332 static int tcp_print_conntrack(struct seq_file *s,
333                                const struct ip_conntrack *conntrack)
334 {
335         enum tcp_conntrack state;
336
337         READ_LOCK(&tcp_lock);
338         state = conntrack->proto.tcp.state;
339         READ_UNLOCK(&tcp_lock);
340
341         return seq_printf(s, "%s ", tcp_conntrack_names[state]);
342 }
343
344 static unsigned int get_conntrack_index(const struct tcphdr *tcph)
345 {
346         if (tcph->rst) return TCP_RST_SET;
347         else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
348         else if (tcph->fin) return TCP_FIN_SET;
349         else if (tcph->ack) return TCP_ACK_SET;
350         else return TCP_NONE_SET;
351 }
352
353 /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
354    in IP Filter' by Guido van Rooij.
355    
356    http://www.nluug.nl/events/sane2000/papers.html
357    http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
358    
359    The boundaries and the conditions are slightly changed:
360    
361         td_maxend = max(sack + max(win,1)) seen in reply packets
362         td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
363         td_end    = max(seq + len) seen in sent packets
364    
365    I.   Upper bound for valid data:     seq + len <= sender.td_maxend
366    II.  Lower bound for valid data:     seq >= sender.td_end - receiver.td_maxwin
367    III. Upper bound for valid ack:      sack <= receiver.td_end
368    IV.  Lower bound for valid ack:      ack >= receiver.td_end - MAXACKWINDOW
369         
370    where sack is the highest right edge of sack block found in the packet.
371         
372    The upper bound limit for a valid ack is not ignored - 
373    we don't have to deal with fragments. 
374 */
375
376 static inline __u32 segment_seq_plus_len(__u32 seq,
377                                          size_t len,
378                                          struct iphdr *iph,
379                                          struct tcphdr *tcph)
380   {
381         return (seq + len - (iph->ihl + tcph->doff)*4
382                 + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
383 }
384   
385 /* Fixme: what about big packets? */
/* MAXACKWINDOW(sender) yields the larger of sender->td_maxwin and
   MAXACKWINCONST.  The argument is evaluated more than once, so it must
   not have side effects. */
386 #define MAXACKWINCONST                  66000
387 #define MAXACKWINDOW(sender)                                            \
388         ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin     \
389                                               : MAXACKWINCONST)
390   
391 /*
392  * Simplified tcp_parse_options routine from tcp_input.c
393  */
394 static void tcp_options(const struct sk_buff *skb,
395                         struct iphdr *iph,
396                         struct tcphdr *tcph, 
397                         struct ip_ct_tcp_state *state)
398 {
399         unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
400         unsigned char *ptr;
401         int length = (tcph->doff*4) - sizeof(struct tcphdr);
402         
403         if (!length)
404                 return;
405
406         ptr = skb_header_pointer(skb,
407                                  (iph->ihl * 4) + sizeof(struct tcphdr),
408                                  length, buff);
409         BUG_ON(ptr == NULL);
410
411         state->td_scale = 
412         state->flags = 0;
413         
414         while (length > 0) {
415                 int opcode=*ptr++;
416                 int opsize;
417                 
418                 switch (opcode) {
419                 case TCPOPT_EOL:
420                         return;
421                 case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
422                         length--;
423                         continue;
424                 default:
425                         opsize=*ptr++;
426                         if (opsize < 2) /* "silly options" */
427                                 return;
428                         if (opsize > length)
429                                 break;  /* don't parse partial options */
430
431                         if (opcode == TCPOPT_SACK_PERM 
432                             && opsize == TCPOLEN_SACK_PERM)
433                                 state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
434                         else if (opcode == TCPOPT_WINDOW
435                                  && opsize == TCPOLEN_WINDOW) {
436                                 state->td_scale = *(u_int8_t *)ptr;
437                                 
438                                 if (state->td_scale > 14) {
439                                         /* See RFC1323 */
440                                         state->td_scale = 14;
441                                 }
442                                 state->flags |=
443                                         IP_CT_TCP_FLAG_WINDOW_SCALE;
444                         }
445                         ptr += opsize - 2;
446                         length -= opsize;
447                 }
448         }
449 }
450
451 static void tcp_sack(struct tcphdr *tcph, __u32 *sack)
452 {
453         __u32 tmp;
454         unsigned char *ptr;
455         int length = (tcph->doff*4) - sizeof(struct tcphdr);
456         
457         /* Fast path for timestamp-only option */
458         if (length == TCPOLEN_TSTAMP_ALIGNED*4
459             && *(__u32 *)(tcph + 1) ==
460                 __constant_ntohl((TCPOPT_NOP << 24) 
461                                  | (TCPOPT_NOP << 16)
462                                  | (TCPOPT_TIMESTAMP << 8)
463                                  | TCPOLEN_TIMESTAMP))
464                 return;
465                 
466         ptr = (unsigned char *)(tcph + 1);
467         while (length > 0) {
468                 int opcode=*ptr++;
469                 int opsize, i;
470                 
471                 switch (opcode) {
472                 case TCPOPT_EOL:
473                         return;
474                 case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
475                         length--;
476                         continue;
477                 default:
478                         opsize=*ptr++;
479                         if (opsize < 2) /* "silly options" */
480                                 return;
481                         if (opsize > length)
482                                 break;  /* don't parse partial options */
483
484                         if (opcode == TCPOPT_SACK 
485                             && opsize >= (TCPOLEN_SACK_BASE 
486                                           + TCPOLEN_SACK_PERBLOCK)
487                             && !((opsize - TCPOLEN_SACK_BASE) 
488                                  % TCPOLEN_SACK_PERBLOCK)) {
489                                 for (i = 0;
490                                      i < (opsize - TCPOLEN_SACK_BASE);
491                                      i += TCPOLEN_SACK_PERBLOCK) {
492                                         tmp = ntohl(*((u_int32_t *)(ptr+i)+1));
493                                         
494                                         if (after(tmp, *sack))
495                                                 *sack = tmp;
496                                 }
497                                 return;
498                         }
499                         ptr += opsize - 2;
500                         length -= opsize;
501                 }
502         }
503 }
504
/*
 * Decide whether a segment fits the tracked windows of its connection and,
 * if it does, update the tracking data.
 *
 * state: per-connection TCP tracking data; seen[dir] describes the sender
 *        of this segment and seen[!dir] its receiver.
 * dir:   direction in which the segment travels.
 * index: tcp_bit_set classification of the segment.  A TCP_FIN_SET segment
 *        whose data had to be trimmed at the right window edge is
 *        downgraded to TCP_ACK_SET.
 * skb, iph, tcph: the packet and its IP and TCP headers.
 *
 * Returns 1 if the segment is accepted, i.e. either side is still in
 * "loose" mode or all four bounds (I-IV, described in the block comment
 * earlier in this file) hold.  Otherwise the packet is logged if
 * LOG_INVALID(IPPROTO_TCP) is true, and the result is non-zero only when
 * ip_ct_tcp_be_liberal is set and the segment is not a RST.
 */
505 static int tcp_in_window(struct ip_ct_tcp *state, 
506                          enum ip_conntrack_dir dir,
507                          unsigned int *index,
508                          const struct sk_buff *skb,
509                          struct iphdr *iph,
510                          struct tcphdr *tcph)
511 {
512         struct ip_ct_tcp_state *sender = &state->seen[dir];
513         struct ip_ct_tcp_state *receiver = &state->seen[!dir];
514         __u32 seq, ack, sack, end, win, swin;
515         int res;
516         
517         /*
518          * Get the required data from the packet.
519          */
520         seq = ntohl(tcph->seq);
521         ack = sack = ntohl(tcph->ack_seq);
522         win = ntohs(tcph->window);
523         end = segment_seq_plus_len(seq, skb->len, iph, tcph);
524         
        /* sack starts out equal to ack; tcp_sack() may raise it to the
           highest right edge found in the SACK blocks. */
525         if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
526                 tcp_sack(tcph, &sack);
527                 
528         DEBUGP("tcp_in_window: START\n");
529         DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
530                "seq=%u ack=%u sack=%u win=%u end=%u\n",
531                 NIPQUAD(iph->saddr), ntohs(tcph->source), 
532                 NIPQUAD(iph->daddr), ntohs(tcph->dest),
533                 seq, ack, sack, win, end);
534         DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
535                "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
536                 sender->td_end, sender->td_maxend, sender->td_maxwin,
537                 sender->td_scale, 
538                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin, 
539                 receiver->td_scale);
540                 
        /* td_end == 0 is used as the "nothing seen from this sender yet"
           marker. */
541         if (sender->td_end == 0) {
542                 /*
543                  * Initialize sender data.
544                  */
545                 if (tcph->syn && tcph->ack) {
546                         /*
547                          * Outgoing SYN-ACK in reply to a SYN.
548                          */
549                         sender->td_end = 
550                         sender->td_maxend = end;
551                         sender->td_maxwin = (win == 0 ? 1 : win);
552
553                         tcp_options(skb, iph, tcph, sender);
554                         /* 
555                          * RFC 1323:
556                          * Both sides must send the Window Scale option
557                          * to enable window scaling in either direction.
558                          */
559                         if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
560                               && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
561                                 sender->td_scale = 
562                                 receiver->td_scale = 0;
563                 } else {
564                         /*
565                          * We are in the middle of a connection,
566                          * its history is lost for us.
567                          * Let's try to use the data from the packet.
568                          */
569                         sender->td_end = end;
570                         sender->td_maxwin = (win == 0 ? 1 : win);
571                         sender->td_maxend = end + sender->td_maxwin;
572                 }
573         } else if (((state->state == TCP_CONNTRACK_SYN_SENT
574                      && dir == IP_CT_DIR_ORIGINAL)
575                     || (state->state == TCP_CONNTRACK_SYN_RECV
576                         && dir == IP_CT_DIR_REPLY))
577                     && after(end, sender->td_end)) {
578                 /*
579                  * RFC 793: "if a TCP is reinitialized ... then it need
580                  * not wait at all; it must only be sure to use sequence 
581                  * numbers larger than those recently used."
582                  */
583                 sender->td_end =
584                 sender->td_maxend = end;
585                 sender->td_maxwin = (win == 0 ? 1 : win);
586
587                 tcp_options(skb, iph, tcph, sender);
588         }
589         
590         if (!(tcph->ack)) {
591                 /*
592                  * If there is no ACK, just pretend it was set and OK.
593                  */
594                 ack = sack = receiver->td_end;
595         } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == 
596                     (TCP_FLAG_ACK|TCP_FLAG_RST)) 
597                    && (ack == 0)) {
598                 /*
599                  * Broken TCP stacks, that set ACK in RST packets as well
600                  * with zero ack value.
601                  */
602                 ack = sack = receiver->td_end;
603         }
604
605         if (seq == end)
606                 /*
607                  * Packets contains no data: we assume it is valid
608                  * and check the ack value only.
609                  */
610                 seq = end = sender->td_end;
611                 
612         DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
613                "seq=%u ack=%u sack =%u win=%u end=%u trim=%u\n",
614                 NIPQUAD(iph->saddr), ntohs(tcph->source),
615                 NIPQUAD(iph->daddr), ntohs(tcph->dest),
616                 seq, ack, sack, win, end, 
617                 after(end, sender->td_maxend) && before(seq, sender->td_maxend)
618                 ? sender->td_maxend : end);
619         DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
620                "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
621                 sender->td_end, sender->td_maxend, sender->td_maxwin,
622                 sender->td_scale, 
623                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
624                 receiver->td_scale);
625         
626         /* Ignore data over the right edge of the receiver's window. */
627         if (after(end, sender->td_maxend) &&
628             before(seq, sender->td_maxend)) {
629                 end = sender->td_maxend;
                /* A FIN segment trimmed here is treated as a plain ACK. */
630                 if (*index == TCP_FIN_SET)
631                         *index = TCP_ACK_SET;
632         }
633         DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
634                 before(end, sender->td_maxend + 1) 
635                     || before(seq, sender->td_maxend + 1),
636                 after(seq, sender->td_end - receiver->td_maxwin - 1) 
637                     || after(end, sender->td_end - receiver->td_maxwin - 1),
638                 before(sack, receiver->td_end + 1),
639                 after(ack, receiver->td_end - MAXACKWINDOW(sender)));
640         
        /* Accept if either side is still loose, or if all four bounds
           (I: end, II: seq, III: sack, IV: ack) hold. */
641         if (sender->loose || receiver->loose ||
642             (before(end, sender->td_maxend + 1) &&
643              after(seq, sender->td_end - receiver->td_maxwin - 1) &&
644              before(sack, receiver->td_end + 1) &&
645              after(ack, receiver->td_end - MAXACKWINDOW(sender)))) {
646                 /*
647                  * Take into account window scaling (RFC 1323).
648                  */
649                 if (!tcph->syn)
650                         win <<= sender->td_scale;
651                 
652                 /*
653                  * Update sender data.
654                  */
655                 swin = win + (sack - ack);
656                 if (sender->td_maxwin < swin)
657                         sender->td_maxwin = swin;
658                 if (after(end, sender->td_end))
659                         sender->td_end = end;
660                 if (after(sack + win, receiver->td_maxend - 1)) {
661                         receiver->td_maxend = sack + win;
662                         if (win == 0)
663                                 receiver->td_maxend++;
664                 }
665
666                 /* 
667                  * Check retransmissions.
668                  */
                /* An ACK-type segment identical to the previous one (same
                   direction, seq, ack and end) counts as a retransmission;
                   anything else restarts the count. */
669                 if (*index == TCP_ACK_SET) {
670                         if (state->last_dir == dir
671                             && state->last_seq == seq
672                             && state->last_ack == ack
673                             && state->last_end == end)
674                                 state->retrans++;
675                         else {
676                                 state->last_dir = dir;
677                                 state->last_seq = seq;
678                                 state->last_ack = ack;
679                                 state->last_end = end;
680                                 state->retrans = 0;
681                         }
682                 }
683                 /*
684                  * Close the window of disabled window tracking :-)
685                  */
686                 if (sender->loose)
687                         sender->loose--;
688                 
689                 res = 1;
690         } else {
                /* The nested conditionals pick the message for the first
                   of the four bounds that failed, checked in the order
                   I, II, III, IV. */
691                 if (LOG_INVALID(IPPROTO_TCP))
692                         nf_log_packet(PF_INET, 0, skb, NULL, NULL,
693                         "ip_ct_tcp: %s ",
694                         before(end, sender->td_maxend + 1) ?
695                         after(seq, sender->td_end - receiver->td_maxwin - 1) ?
696                         before(sack, receiver->td_end + 1) ?
697                         after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG"
698                         : "ACK is under the lower bound (possibly overly delayed ACK)"
699                         : "ACK is over the upper bound (ACKed data has never seen yet)"
700                         : "SEQ is under the lower bound (retransmitted already ACKed data)"
701                         : "SEQ is over the upper bound (over the window of the receiver)");
702
                /* Out-of-window segments pass only in liberal mode, and
                   never if they are RSTs. */
703                 res = ip_ct_tcp_be_liberal && !tcph->rst;
704         }
705   
706         DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u "
707                "receiver end=%u maxend=%u maxwin=%u\n",
708                 res, sender->td_end, sender->td_maxend, sender->td_maxwin, 
709                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
710
711         return res;
712 }
713
#ifdef CONFIG_IP_NF_NAT_NEEDED
/*
 * Called after NAT successfully mangled a packet: recompute the end
 * sequence number of the (possibly resized) segment and, under tcp_lock,
 * advance the td_end of the sending direction if the new end lies after
 * it.  last_end is always set to the new end.
 */
void ip_conntrack_tcp_update(struct sk_buff *skb,
                             struct ip_conntrack *conntrack,
                             enum ip_conntrack_dir dir)
{
        struct iphdr *iph = skb->nh.iph;
        struct tcphdr *tcph = (void *)iph + iph->ihl * 4;
        struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir];
#ifdef DEBUGP_VARS
        struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir];
#endif
        __u32 end;

        end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph);

        WRITE_LOCK(&tcp_lock);
        /*
         * We have to worry for the ack in the reply packet only...
         */
        if (after(end, sender->td_end))
                sender->td_end = end;
        conntrack->proto.tcp.last_end = end;
        WRITE_UNLOCK(&tcp_lock);

        DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
               "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
                sender->td_end, sender->td_maxend, sender->td_maxwin,
                sender->td_scale,
                receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
                receiver->td_scale);
}

#endif
747
/*
 * Bit masks for the TCP flags byte (the byte tcp_error() reads at offset 13
 * of the TCP header), one bit per flag, lowest bit first.
 */
#define TH_FIN  (1 << 0)
#define TH_SYN  (1 << 1)
#define TH_RST  (1 << 2)
#define TH_PUSH (1 << 3)
#define TH_ACK  (1 << 4)
#define TH_URG  (1 << 5)
#define TH_ECE  (1 << 6)
#define TH_CWR  (1 << 7)
756
757 /* table of valid flag combinations - ECE and CWR are always valid */
758 static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] =
759 {
760         [TH_SYN]                        = 1,
761         [TH_SYN|TH_ACK]                 = 1,
762         [TH_RST]                        = 1,
763         [TH_RST|TH_ACK]                 = 1,
764         [TH_RST|TH_ACK|TH_PUSH]         = 1,
765         [TH_FIN|TH_ACK]                 = 1,
766         [TH_ACK]                        = 1,
767         [TH_ACK|TH_PUSH]                = 1,
768         [TH_ACK|TH_URG]                 = 1,
769         [TH_ACK|TH_URG|TH_PUSH]         = 1,
770         [TH_FIN|TH_ACK|TH_PUSH]         = 1,
771         [TH_FIN|TH_ACK|TH_URG]          = 1,
772         [TH_FIN|TH_ACK|TH_URG|TH_PUSH]  = 1,
773 };
774
775 /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c.  */
776 static int tcp_error(struct sk_buff *skb,
777                      enum ip_conntrack_info *ctinfo,
778                      unsigned int hooknum)
779 {
780         struct iphdr *iph = skb->nh.iph;
781         struct tcphdr _tcph, *th;
782         unsigned int tcplen = skb->len - iph->ihl * 4;
783         u_int8_t tcpflags;
784
785         /* Smaller that minimal TCP header? */
786         th = skb_header_pointer(skb, iph->ihl * 4,
787                                 sizeof(_tcph), &_tcph);
788         if (th == NULL) {
789                 if (LOG_INVALID(IPPROTO_TCP))
790                         nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
791                                 "ip_ct_tcp: short packet ");
792                 return -NF_ACCEPT;
793         }
794   
795         /* Not whole TCP header or malformed packet */
796         if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
797                 if (LOG_INVALID(IPPROTO_TCP))
798                         nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
799                                 "ip_ct_tcp: truncated/malformed packet ");
800                 return -NF_ACCEPT;
801         }
802   
803         /* Checksum invalid? Ignore.
804          * We skip checking packets on the outgoing path
805          * because the semantic of CHECKSUM_HW is different there 
806          * and moreover root might send raw packets.
807          */
808         /* FIXME: Source route IP option packets --RR */
809         if (hooknum == NF_IP_PRE_ROUTING
810             && csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP,
811                                  skb->ip_summed == CHECKSUM_HW ? skb->csum
812                                  : skb_checksum(skb, iph->ihl*4, tcplen, 0))) {
813                 if (LOG_INVALID(IPPROTO_TCP))
814                         nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
815                                   "ip_ct_tcp: bad TCP checksum ");
816                 return -NF_ACCEPT;
817         }
818
819         /* Check TCP flags. */
820         tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
821         if (!tcp_valid_flags[tcpflags]) {
822                 if (LOG_INVALID(IPPROTO_TCP))
823                         nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
824                                   "ip_ct_tcp: invalid TCP flag combination ");
825                 return -NF_ACCEPT;
826         }
827
828         return NF_ACCEPT;
829 }
830
/*
 * Run one packet of an already tracked connection through the TCP state
 * table and update the conntrack entry accordingly.
 *
 * conntrack: the connection the packet was matched to
 * skb:       the packet
 * ctinfo:    conntrack info for the packet; the direction is derived from it
 *
 * Returns the verdict for the packet:
 *   NF_ACCEPT   packet accepted (including packets that are deliberately
 *               ignored by the state table),
 *   -NF_ACCEPT  packet is invalid (bad state transition or out of window),
 *   -NF_DROP    an out-of-sync session was killed and the packet is blocked,
 *   -NF_REPEAT  a closed connection was deleted because a new SYN arrived;
 *               the packet is to be looked up again.
 */
static int tcp_packet(struct ip_conntrack *conntrack,
                      const struct sk_buff *skb,
                      enum ip_conntrack_info ctinfo)
{
        enum tcp_conntrack new_state, old_state;
        enum ip_conntrack_dir dir;
        struct iphdr *iph = skb->nh.iph;
        struct tcphdr *th, _tcph;
        unsigned long timeout;
        unsigned int index;

        /* NOTE(review): presumably tcp_error() has already rejected packets
         * too short for this fetch, which is why failure is a BUG here -
         * confirm the call order in the conntrack core. */
        th = skb_header_pointer(skb, iph->ihl * 4,
                                sizeof(_tcph), &_tcph);
        BUG_ON(th == NULL);

        /* tcp_lock protects conntrack->proto.tcp; every return path below
         * drops it before logging or touching the timer. */
        WRITE_LOCK(&tcp_lock);
        old_state = conntrack->proto.tcp.state;
        dir = CTINFO2DIR(ctinfo);
        index = get_conntrack_index(th);
        /* Tentative next state; looked up again after the window check. */
        new_state = tcp_conntracks[dir][index][old_state];

        switch (new_state) {
        case TCP_CONNTRACK_IGNORE:
                /* Either SYN in ORIGINAL
                 * or SYN/ACK in REPLY
                 * or ACK in REPLY direction (half-open connection). */
                if (index == TCP_SYNACK_SET
                    && conntrack->proto.tcp.last_index == TCP_SYN_SET
                    && conntrack->proto.tcp.last_dir != dir
                    && after(ntohl(th->ack_seq),
                             conntrack->proto.tcp.last_seq)) {
                        /* This SYN/ACK acknowledges a SYN that we earlier 
                         * ignored as invalid. This means that the client and
                         * the server are both in sync, while the firewall is
                         * not. We kill this session and block the SYN/ACK so
                         * that the client cannot but retransmit its SYN and 
                         * thus initiate a clean new session.
                         */
                        WRITE_UNLOCK(&tcp_lock);
                        if (LOG_INVALID(IPPROTO_TCP))
                                nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
                                          "ip_ct_tcp: killing out of sync session ");
                        /* If the timeout timer was still pending, run its
                         * handler by hand right now. */
                        if (del_timer(&conntrack->timeout))
                                conntrack->timeout.function((unsigned long)
                                                            conntrack);
                        return -NF_DROP;
                }
                /* Remember what was ignored: the SYN/ACK test above and the
                 * RST test in the CLOSE case read these fields back. */
                conntrack->proto.tcp.last_index = index;
                conntrack->proto.tcp.last_dir = dir;
                conntrack->proto.tcp.last_seq = ntohl(th->seq);

                WRITE_UNLOCK(&tcp_lock);
                if (LOG_INVALID(IPPROTO_TCP))
                        nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
                                  "ip_ct_tcp: invalid packet ignored ");
                return NF_ACCEPT;
        case TCP_CONNTRACK_MAX:
                /* Invalid packet: the state table has no transition for it. */
                DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
                       dir, get_conntrack_index(th),
                       old_state);
                WRITE_UNLOCK(&tcp_lock);
                if (LOG_INVALID(IPPROTO_TCP))
                        nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
                                  "ip_ct_tcp: invalid state ");
                return -NF_ACCEPT;
        case TCP_CONNTRACK_SYN_SENT:
                if (old_state >= TCP_CONNTRACK_TIME_WAIT) {     
                        /* Attempt to reopen a closed connection.
                         * Delete this connection and look up again. */
                        WRITE_UNLOCK(&tcp_lock);
                        if (del_timer(&conntrack->timeout))
                                conntrack->timeout.function((unsigned long)
                                                            conntrack);
                        return -NF_REPEAT;
                }
                break;
        case TCP_CONNTRACK_CLOSE:
                if (index == TCP_RST_SET
                    && ((test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
                         && conntrack->proto.tcp.last_index <= TCP_SYNACK_SET)
                        || (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
                         && conntrack->proto.tcp.last_index == TCP_ACK_SET))
                    && after(ntohl(th->ack_seq),
                             conntrack->proto.tcp.last_seq)) {
                        /* Ignore RST closing down invalid SYN or ACK
                           we had let through: accept the packet but leave
                           the tracked state untouched. */
                        WRITE_UNLOCK(&tcp_lock);
                        if (LOG_INVALID(IPPROTO_TCP))
                                nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
                                          "ip_ct_tcp: invalid RST (ignored) ");
                        return NF_ACCEPT;
                }
                /* Just fall through */
        default:
                /* Keep compilers happy. */
                break;
        }

        /* Window check; index is passed by pointer, so tcp_in_window() may
         * modify it (see the FIN remark below). */
        if (!tcp_in_window(&conntrack->proto.tcp, dir, &index, 
                           skb, iph, th)) {
                WRITE_UNLOCK(&tcp_lock);
                return -NF_ACCEPT;
        }
        /* From now on we have got in-window packets */

        /* If FIN was trimmed off, we don't change state: look the
         * transition up again with the possibly updated index. */
        conntrack->proto.tcp.last_index = index;
        new_state = tcp_conntracks[dir][index][old_state];

        DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
               "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
                NIPQUAD(iph->saddr), ntohs(th->source),
                NIPQUAD(iph->daddr), ntohs(th->dest),
                (th->syn ? 1 : 0), (th->ack ? 1 : 0),
                (th->fin ? 1 : 0), (th->rst ? 1 : 0),
                old_state, new_state);

        conntrack->proto.tcp.state = new_state;
        /* Use the new state's timeout, unless the retransmission limit has
         * been reached and that timeout exceeds the max-retrans timeout, in
         * which case the max-retrans timeout is used instead. */
        timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans
                  && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans
                  ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
        WRITE_UNLOCK(&tcp_lock);

        if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
                /* If only reply is a RST, we can consider ourselves not to
                   have an established connection: this is a fairly common
                   problem case, so we can delete the conntrack
                   immediately.  --RR */
                if (th->rst) {
                        if (del_timer(&conntrack->timeout))
                                conntrack->timeout.function((unsigned long)
                                                            conntrack);
                        return NF_ACCEPT;
                }
        } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
                   && (old_state == TCP_CONNTRACK_SYN_RECV
                       || old_state == TCP_CONNTRACK_ESTABLISHED)
                   && new_state == TCP_CONNTRACK_ESTABLISHED) {
                /* Set ASSURED if we see a valid ack in ESTABLISHED
                   after SYN_RECV or a valid answer for a picked up 
                   connection. */
                        set_bit(IPS_ASSURED_BIT, &conntrack->status);
        }
        /* Re-arm the entry with the timeout chosen above. */
        ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout);

        return NF_ACCEPT;
}
980  
981   /* Called when a new connection for this protocol found. */
982 static int tcp_new(struct ip_conntrack *conntrack,
983                    const struct sk_buff *skb)
984 {
985         enum tcp_conntrack new_state;
986         struct iphdr *iph = skb->nh.iph;
987         struct tcphdr *th, _tcph;
988 #ifdef DEBUGP_VARS
989         struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0];
990         struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1];
991 #endif
992
993         th = skb_header_pointer(skb, iph->ihl * 4,
994                                 sizeof(_tcph), &_tcph);
995         BUG_ON(th == NULL);
996         
997         /* Don't need lock here: this conntrack not in circulation yet */
998         new_state
999                 = tcp_conntracks[0][get_conntrack_index(th)]
1000                 [TCP_CONNTRACK_NONE];
1001
1002         /* Invalid: delete conntrack */
1003         if (new_state >= TCP_CONNTRACK_MAX) {
1004                 DEBUGP("ip_ct_tcp: invalid new deleting.\n");
1005                 return 0;
1006         }
1007
1008         if (new_state == TCP_CONNTRACK_SYN_SENT) {
1009                 /* SYN packet */
1010                 conntrack->proto.tcp.seen[0].td_end =
1011                         segment_seq_plus_len(ntohl(th->seq), skb->len,
1012                                              iph, th);
1013                 conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1014                 if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
1015                         conntrack->proto.tcp.seen[0].td_maxwin = 1;
1016                 conntrack->proto.tcp.seen[0].td_maxend =
1017                         conntrack->proto.tcp.seen[0].td_end;
1018
1019                 tcp_options(skb, iph, th, &conntrack->proto.tcp.seen[0]);
1020                 conntrack->proto.tcp.seen[1].flags = 0;
1021                 conntrack->proto.tcp.seen[0].loose = 
1022                 conntrack->proto.tcp.seen[1].loose = 0;
1023         } else if (ip_ct_tcp_loose == 0) {
1024                 /* Don't try to pick up connections. */
1025                 return 0;
1026         } else {
1027                 /*
1028                  * We are in the middle of a connection,
1029                  * its history is lost for us.
1030                  * Let's try to use the data from the packet.
1031                  */
1032                 conntrack->proto.tcp.seen[0].td_end =
1033                         segment_seq_plus_len(ntohl(th->seq), skb->len,
1034                                              iph, th);
1035                 conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1036                 if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
1037                         conntrack->proto.tcp.seen[0].td_maxwin = 1;
1038                 conntrack->proto.tcp.seen[0].td_maxend =
1039                         conntrack->proto.tcp.seen[0].td_end + 
1040                         conntrack->proto.tcp.seen[0].td_maxwin;
1041                 conntrack->proto.tcp.seen[0].td_scale = 0;
1042
1043                 /* We assume SACK. Should we assume window scaling too? */
1044                 conntrack->proto.tcp.seen[0].flags =
1045                 conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM;
1046                 conntrack->proto.tcp.seen[0].loose = 
1047                 conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose;
1048         }
1049     
1050         conntrack->proto.tcp.seen[1].td_end = 0;
1051         conntrack->proto.tcp.seen[1].td_maxend = 0;
1052         conntrack->proto.tcp.seen[1].td_maxwin = 1;
1053         conntrack->proto.tcp.seen[1].td_scale = 0;      
1054
1055         /* tcp_packet will set them */
1056         conntrack->proto.tcp.state = TCP_CONNTRACK_NONE;
1057         conntrack->proto.tcp.last_index = TCP_NONE_SET;
1058          
1059         DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
1060                "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
1061                 sender->td_end, sender->td_maxend, sender->td_maxwin,
1062                 sender->td_scale, 
1063                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
1064                 receiver->td_scale);
1065         return 1;
1066 }
1067   
1068 struct ip_conntrack_protocol ip_conntrack_protocol_tcp =
1069 {
1070         .proto                  = IPPROTO_TCP,
1071         .name                   = "tcp",
1072         .pkt_to_tuple           = tcp_pkt_to_tuple,
1073         .invert_tuple           = tcp_invert_tuple,
1074         .print_tuple            = tcp_print_tuple,
1075         .print_conntrack        = tcp_print_conntrack,
1076         .packet                 = tcp_packet,
1077         .new                    = tcp_new,
1078         .error                  = tcp_error,
1079 };