vserver 1.9.3
[linux-2.6.git] / net / ipv4 / netfilter / ip_conntrack_proto_tcp.c
1 /* (C) 1999-2001 Paul `Rusty' Russell
2  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License version 2 as
6  * published by the Free Software Foundation.
7  *
8  * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>:
9  *      - Real stateful connection tracking
10  *      - Modified state transitions table
11  *      - Window scaling support added
12  *      - SACK support added
13  *
14  * Willy Tarreau:
15  *      - State table bugfixes
16  *      - More robust state changes
17  *      - Tuning timer parameters
18  *
19  * version 2.2
20  */
21
22 #include <linux/config.h>
23 #include <linux/types.h>
24 #include <linux/sched.h>
25 #include <linux/timer.h>
26 #include <linux/netfilter.h>
27 #include <linux/module.h>
28 #include <linux/in.h>
29 #include <linux/ip.h>
30 #include <linux/tcp.h>
31 #include <linux/spinlock.h>
32
33 #include <net/tcp.h>
34
35 #include <linux/netfilter.h>
36 #include <linux/netfilter_ipv4.h>
37 #include <linux/netfilter_ipv4/ip_conntrack.h>
38 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
39 #include <linux/netfilter_ipv4/lockhelp.h>
40
/*
 * Debug switch.  With "#if 0" DEBUGP() expands to nothing, so neither the
 * call nor its arguments are compiled in.  Changing it to "#if 1" maps
 * DEBUGP to printk and defines DEBUGP_VARS, which enables locals that
 * exist only for the benefit of debug output.
 */
#if 0
#define DEBUGP printk
#define DEBUGP_VARS
#else
#define DEBUGP(format, args...)
#endif

/* Lock guarding every access to conntrack->proto.tcp. */
static DECLARE_RWLOCK(tcp_lock);

/*
 * "Be conservative in what you do,
 *  be liberal in what you accept from others."
 * When non-zero, only out-of-window RST segments are marked INVALID.
 * Defaults to 0 (file-scope objects are zero-initialised).
 */
int ip_ct_tcp_be_liberal;

/*
 * When a connection is picked up from the middle: the number of packets
 * that must pass in each direction before we assume we are in sync.  If
 * either side uses window scaling, we have lost the game.
 * Zero disables picking up already established connections.
 */
int ip_ct_tcp_loose = 3;

/*
 * Maximum number of retransmitted packets tolerated without seeing an
 * (acceptable) ACK from the destination.  Once this count is reached a
 * shorter timer is started.
 */
int ip_ct_tcp_max_retrans = 3;
67
68   /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
69      closely.  They're more complex. --RR */
70
/*
 * Printable names of the TCP conntrack states.  The table is looked up
 * with the numeric state value, so the order of the entries must follow
 * the order of the state constants (NONE first, LISTEN last).
 *
 * Both the strings and the pointer slots are const: nothing may
 * retarget an entry at run time.
 */
static const char *const tcp_conntrack_names[] = {
	"NONE",
	"SYN_SENT",
	"SYN_RECV",
	"ESTABLISHED",
	"FIN_WAIT",
	"CLOSE_WAIT",
	"LAST_ACK",
	"TIME_WAIT",
	"CLOSE",
	"LISTEN"
};
83   
84 #define SECS * HZ
85 #define MINS * 60 SECS
86 #define HOURS * 60 MINS
87 #define DAYS * 24 HOURS
88
89 unsigned long ip_ct_tcp_timeout_syn_sent =      2 MINS;
90 unsigned long ip_ct_tcp_timeout_syn_recv =     60 SECS;
91 unsigned long ip_ct_tcp_timeout_established =   5 DAYS;
92 unsigned long ip_ct_tcp_timeout_fin_wait =      2 MINS;
93 unsigned long ip_ct_tcp_timeout_close_wait =   60 SECS;
94 unsigned long ip_ct_tcp_timeout_last_ack =     30 SECS;
95 unsigned long ip_ct_tcp_timeout_time_wait =     2 MINS;
96 unsigned long ip_ct_tcp_timeout_close =        10 SECS;
97
98 /* RFC1122 says the R2 limit should be at least 100 seconds.
99    Linux uses 15 packets as limit, which corresponds 
100    to ~13-30min depending on RTO. */
101 unsigned long ip_ct_tcp_timeout_max_retrans =     5 MINS;
102  
103 static unsigned long * tcp_timeouts[]
104 = { NULL,                              /*      TCP_CONNTRACK_NONE */
105     &ip_ct_tcp_timeout_syn_sent,       /*      TCP_CONNTRACK_SYN_SENT, */
106     &ip_ct_tcp_timeout_syn_recv,       /*      TCP_CONNTRACK_SYN_RECV, */
107     &ip_ct_tcp_timeout_established,    /*      TCP_CONNTRACK_ESTABLISHED,      */
108     &ip_ct_tcp_timeout_fin_wait,       /*      TCP_CONNTRACK_FIN_WAIT, */
109     &ip_ct_tcp_timeout_close_wait,     /*      TCP_CONNTRACK_CLOSE_WAIT,       */
110     &ip_ct_tcp_timeout_last_ack,       /*      TCP_CONNTRACK_LAST_ACK, */
111     &ip_ct_tcp_timeout_time_wait,      /*      TCP_CONNTRACK_TIME_WAIT,        */
112     &ip_ct_tcp_timeout_close,          /*      TCP_CONNTRACK_CLOSE,    */
113     NULL,                              /*      TCP_CONNTRACK_LISTEN */
114  };
115  
/* Two-letter aliases for the conntrack states, used by the state table. */
#define sNO	TCP_CONNTRACK_NONE
#define sSS	TCP_CONNTRACK_SYN_SENT
#define sSR	TCP_CONNTRACK_SYN_RECV
#define sES	TCP_CONNTRACK_ESTABLISHED
#define sFW	TCP_CONNTRACK_FIN_WAIT
#define sCW	TCP_CONNTRACK_CLOSE_WAIT
#define sLA	TCP_CONNTRACK_LAST_ACK
#define sTW	TCP_CONNTRACK_TIME_WAIT
#define sCL	TCP_CONNTRACK_CLOSE
#define sLI	TCP_CONNTRACK_LISTEN
#define sIV	TCP_CONNTRACK_MAX
#define sIG	TCP_CONNTRACK_IGNORE

/*
 * Classification of a segment by its RST/SYN/FIN/ACK flags.
 * The values are spelled out because they are used as array indices.
 */
enum tcp_bit_set {
	TCP_SYN_SET	= 0,
	TCP_SYNACK_SET	= 1,
	TCP_FIN_SET	= 2,
	TCP_ACK_SET	= 3,
	TCP_RST_SET	= 4,
	TCP_NONE_SET	= 5,
};
138   
139 /*
140  * The TCP state transition table needs a few words...
141  *
142  * We are the man in the middle. All the packets go through us
143  * but might get lost in transit to the destination.
144  * It is assumed that the destinations can't receive segments 
145  * we haven't seen.
146  *
147  * The checked segment is in window, but our windows are *not*
148  * equivalent with the ones of the sender/receiver. We always
149  * try to guess the state of the current sender.
150  *
151  * The meanings of the states are:
152  *
153  * NONE:        initial state
154  * SYN_SENT:    SYN-only packet seen 
155  * SYN_RECV:    SYN-ACK packet seen
156  * ESTABLISHED: ACK packet seen
157  * FIN_WAIT:    FIN packet seen
158  * CLOSE_WAIT:  ACK seen (after FIN) 
159  * LAST_ACK:    FIN seen (after FIN)
160  * TIME_WAIT:   last ACK seen
161  * CLOSE:       closed connection
162  *
163  * LISTEN state is not used.
164  *
165  * Packets marked as IGNORED (sIG):
166  *      if they may be either invalid or valid 
167  *      and the receiver may send back a connection 
168  *      closing RST or a SYN/ACK.
169  *
170  * Packets marked as INVALID (sIV):
171  *      if they are invalid
172  *      or we do not support the request (simultaneous open)
173  */
174 static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
175         {
176 /* ORIGINAL */
177 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
178 /*syn*/    { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV },
179 /*
180  *      sNO -> sSS      Initialize a new connection
181  *      sSS -> sSS      Retransmitted SYN
182  *      sSR -> sIG      Late retransmitted SYN?
183  *      sES -> sIG      Error: SYNs in window outside the SYN_SENT state
184  *                      are errors. Receiver will reply with RST 
185  *                      and close the connection.
186  *                      Or we are not in sync and hold a dead connection.
187  *      sFW -> sIG
188  *      sCW -> sIG
189  *      sLA -> sIG
190  *      sTW -> sSS      Reopened connection (RFC 1122).
191  *      sCL -> sSS
192  */
193 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
194 /*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
195 /*
196  * A SYN/ACK from the client is always invalid:
197  *      - either it tries to set up a simultaneous open, which is 
198  *        not supported;
199  *      - or the firewall has just been inserted between the two hosts
200  *        during the session set-up. The SYN will be retransmitted 
201  *        by the true client (or it'll time out).
202  */
203 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
204 /*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
205 /*
206  *      sNO -> sIV      Too late and no reason to do anything...
207  *      sSS -> sIV      Client migth not send FIN in this state:
208  *                      we enforce waiting for a SYN/ACK reply first.
209  *      sSR -> sFW      Close started.
210  *      sES -> sFW      
211  *      sFW -> sLA      FIN seen in both directions, waiting for
212  *                      the last ACK. 
213  *                      Migth be a retransmitted FIN as well...
214  *      sCW -> sLA
215  *      sLA -> sLA      Retransmitted FIN. Remain in the same state.
216  *      sTW -> sTW
217  *      sCL -> sCL
218  */
219 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
220 /*ack*/    { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
221 /*
222  *      sNO -> sES      Assumed.
223  *      sSS -> sIV      ACK is invalid: we haven't seen a SYN/ACK yet.
224  *      sSR -> sES      Established state is reached.
225  *      sES -> sES      :-)
226  *      sFW -> sCW      Normal close request answered by ACK.
227  *      sCW -> sCW
228  *      sLA -> sTW      Last ACK detected.
229  *      sTW -> sTW      Retransmitted last ACK. Remain in the same state.
230  *      sCL -> sCL
231  */
232 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
233 /*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
234 /*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
235         },
236         {
237 /* REPLY */
238 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
239 /*syn*/    { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
240 /*
241  *      sNO -> sIV      Never reached.
242  *      sSS -> sIV      Simultaneous open, not supported
243  *      sSR -> sIV      Simultaneous open, not supported.
244  *      sES -> sIV      Server may not initiate a connection.
245  *      sFW -> sIV
246  *      sCW -> sIV
247  *      sLA -> sIV
248  *      sTW -> sIV      Reopened connection, but server may not do it.
249  *      sCL -> sIV
250  */
251 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
252 /*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV },
253 /*
254  *      sSS -> sSR      Standard open.
255  *      sSR -> sSR      Retransmitted SYN/ACK.
256  *      sES -> sIG      Late retransmitted SYN/ACK?
257  *      sFW -> sIG
258  *      sCW -> sIG
259  *      sLA -> sIG
260  *      sTW -> sIG
261  *      sCL -> sIG
262  */
263 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
264 /*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
265 /*
266  *      sSS -> sIV      Server might not send FIN in this state.
267  *      sSR -> sFW      Close started.
268  *      sES -> sFW
269  *      sFW -> sLA      FIN seen in both directions.
270  *      sCW -> sLA
271  *      sLA -> sLA      Retransmitted FIN.
272  *      sTW -> sTW
273  *      sCL -> sCL
274  */
275 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
276 /*ack*/    { sIV, sIV, sIV, sES, sCW, sCW, sTW, sTW, sCL, sIV },
277 /*
278  *      sSS -> sIV      ACK is invalid: we haven't seen a SYN/ACK yet.
279  *      sSR -> sIV      Simultaneous open.
280  *      sES -> sES      :-)
281  *      sFW -> sCW      Normal close request answered by ACK.
282  *      sCW -> sCW
283  *      sLA -> sTW      Last ACK detected.
284  *      sTW -> sTW      Retransmitted last ACK.
285  *      sCL -> sCL
286  */
287 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
288 /*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
289 /*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
290         }
291 };
292
293 static int tcp_pkt_to_tuple(const struct sk_buff *skb,
294                             unsigned int dataoff,
295                             struct ip_conntrack_tuple *tuple)
296 {
297         struct tcphdr _hdr, *hp;
298
299         /* Actually only need first 8 bytes. */
300         hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
301         if (hp == NULL)
302                 return 0;
303
304         tuple->src.u.tcp.port = hp->source;
305         tuple->dst.u.tcp.port = hp->dest;
306
307         return 1;
308 }
309
310 static int tcp_invert_tuple(struct ip_conntrack_tuple *tuple,
311                             const struct ip_conntrack_tuple *orig)
312 {
313         tuple->src.u.tcp.port = orig->dst.u.tcp.port;
314         tuple->dst.u.tcp.port = orig->src.u.tcp.port;
315         return 1;
316 }
317
318 /* Print out the per-protocol part of the tuple. */
319 static int tcp_print_tuple(struct seq_file *s,
320                            const struct ip_conntrack_tuple *tuple)
321 {
322         return seq_printf(s, "sport=%hu dport=%hu ",
323                           ntohs(tuple->src.u.tcp.port),
324                           ntohs(tuple->dst.u.tcp.port));
325 }
326
327 /* Print out the private part of the conntrack. */
328 static int tcp_print_conntrack(struct seq_file *s,
329                                const struct ip_conntrack *conntrack)
330 {
331         enum tcp_conntrack state;
332
333         READ_LOCK(&tcp_lock);
334         state = conntrack->proto.tcp.state;
335         READ_UNLOCK(&tcp_lock);
336
337         return seq_printf(s, "%s ", tcp_conntrack_names[state]);
338 }
339
340 static unsigned int get_conntrack_index(const struct tcphdr *tcph)
341 {
342         if (tcph->rst) return TCP_RST_SET;
343         else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
344         else if (tcph->fin) return TCP_FIN_SET;
345         else if (tcph->ack) return TCP_ACK_SET;
346         else return TCP_NONE_SET;
347 }
348
349 /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
350    in IP Filter' by Guido van Rooij.
351    
352    http://www.nluug.nl/events/sane2000/papers.html
353    http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
354    
355    The boundaries and the conditions are slightly changed:
356    
357         td_maxend = max(sack + max(win,1)) seen in reply packets
358         td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
359         td_end    = max(seq + len) seen in sent packets
360    
361    I.   Upper bound for valid data:     seq + len <= sender.td_maxend
362    II.  Lower bound for valid data:     seq >= sender.td_end - receiver.td_maxwin
363    III. Upper bound for valid ack:      sack <= receiver.td_end
364    IV.  Lower bound for valid ack:      ack >= receiver.td_end - MAXACKWINDOW
365         
366    where sack is the highest right edge of sack block found in the packet.
367         
368    The upper bound limit for a valid ack is not ignored - 
369    we don't have to deal with fragments. 
370 */
371
372 static inline __u32 segment_seq_plus_len(__u32 seq,
373                                          size_t len,
374                                          struct iphdr *iph,
375                                          struct tcphdr *tcph)
376   {
377         return (seq + len - (iph->ihl + tcph->doff)*4
378                 + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
379 }
380   
/*
 * Width of the window accepted below the receiver's td_end for ACKs:
 * the sender's td_maxwin, but never less than MAXACKWINCONST.
 * Note that the macro argument is evaluated more than once.
 *
 * Fixme: what about big packets?
 */
#define MAXACKWINCONST			66000
#define MAXACKWINDOW(sender)						\
	((sender)->td_maxwin <= MAXACKWINCONST ? MAXACKWINCONST		\
					       : (sender)->td_maxwin)
386   
387 /*
388  * Simplified tcp_parse_options routine from tcp_input.c
389  */
390 static void tcp_options(const struct sk_buff *skb,
391                         struct iphdr *iph,
392                         struct tcphdr *tcph, 
393                         struct ip_ct_tcp_state *state)
394 {
395         unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
396         unsigned char *ptr;
397         int length = (tcph->doff*4) - sizeof(struct tcphdr);
398         
399         if (!length)
400                 return;
401
402         ptr = skb_header_pointer(skb,
403                                  (iph->ihl * 4) + sizeof(struct tcphdr),
404                                  length, buff);
405         BUG_ON(ptr == NULL);
406
407         state->td_scale = 
408         state->flags = 0;
409         
410         while (length > 0) {
411                 int opcode=*ptr++;
412                 int opsize;
413                 
414                 switch (opcode) {
415                 case TCPOPT_EOL:
416                         return;
417                 case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
418                         length--;
419                         continue;
420                 default:
421                         opsize=*ptr++;
422                         if (opsize < 2) /* "silly options" */
423                                 return;
424                         if (opsize > length)
425                                 break;  /* don't parse partial options */
426
427                         if (opcode == TCPOPT_SACK_PERM 
428                             && opsize == TCPOLEN_SACK_PERM)
429                                 state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
430                         else if (opcode == TCPOPT_WINDOW
431                                  && opsize == TCPOLEN_WINDOW) {
432                                 state->td_scale = *(u_int8_t *)ptr;
433                                 
434                                 if (state->td_scale > 14) {
435                                         /* See RFC1323 */
436                                         state->td_scale = 14;
437                                 }
438                                 state->flags |=
439                                         IP_CT_TCP_STATE_FLAG_WINDOW_SCALE;
440                         }
441                         ptr += opsize - 2;
442                         length -= opsize;
443                 }
444         }
445 }
446
447 static void tcp_sack(struct tcphdr *tcph, __u32 *sack)
448 {
449         __u32 tmp;
450         unsigned char *ptr;
451         int length = (tcph->doff*4) - sizeof(struct tcphdr);
452         
453         /* Fast path for timestamp-only option */
454         if (length == TCPOLEN_TSTAMP_ALIGNED*4
455             && *(__u32 *)(tcph + 1) ==
456                 __constant_ntohl((TCPOPT_NOP << 24) 
457                                  | (TCPOPT_NOP << 16)
458                                  | (TCPOPT_TIMESTAMP << 8)
459                                  | TCPOLEN_TIMESTAMP))
460                 return;
461                 
462         ptr = (unsigned char *)(tcph + 1);
463         while (length > 0) {
464                 int opcode=*ptr++;
465                 int opsize, i;
466                 
467                 switch (opcode) {
468                 case TCPOPT_EOL:
469                         return;
470                 case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
471                         length--;
472                         continue;
473                 default:
474                         opsize=*ptr++;
475                         if (opsize < 2) /* "silly options" */
476                                 return;
477                         if (opsize > length)
478                                 break;  /* don't parse partial options */
479
480                         if (opcode == TCPOPT_SACK 
481                             && opsize >= (TCPOLEN_SACK_BASE 
482                                           + TCPOLEN_SACK_PERBLOCK)
483                             && !((opsize - TCPOLEN_SACK_BASE) 
484                                  % TCPOLEN_SACK_PERBLOCK)) {
485                                 for (i = 0;
486                                      i < (opsize - TCPOLEN_SACK_BASE);
487                                      i += TCPOLEN_SACK_PERBLOCK) {
488                                         tmp = ntohl(*((u_int32_t *)(ptr+i)+1));
489                                         
490                                         if (after(tmp, *sack))
491                                                 *sack = tmp;
492                                 }
493                                 return;
494                         }
495                         ptr += opsize - 2;
496                         length -= opsize;
497                 }
498         }
499 }
500
501 static int tcp_in_window(struct ip_ct_tcp *state, 
502                          enum ip_conntrack_dir dir,
503                          unsigned int *index,
504                          const struct sk_buff *skb,
505                          struct iphdr *iph,
506                          struct tcphdr *tcph)
507 {
508         struct ip_ct_tcp_state *sender = &state->seen[dir];
509         struct ip_ct_tcp_state *receiver = &state->seen[!dir];
510         __u32 seq, ack, sack, end, win, swin;
511         int res;
512         
513         /*
514          * Get the required data from the packet.
515          */
516         seq = ntohl(tcph->seq);
517         ack = sack = ntohl(tcph->ack_seq);
518         win = ntohs(tcph->window);
519         end = segment_seq_plus_len(seq, skb->len, iph, tcph);
520         
521         if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
522                 tcp_sack(tcph, &sack);
523                 
524         DEBUGP("tcp_in_window: START\n");
525         DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
526                "seq=%u ack=%u sack=%u win=%u end=%u\n",
527                 NIPQUAD(iph->saddr), ntohs(tcph->source), 
528                 NIPQUAD(iph->daddr), ntohs(tcph->dest),
529                 seq, ack, sack, win, end);
530         DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
531                "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
532                 sender->td_end, sender->td_maxend, sender->td_maxwin,
533                 sender->td_scale, 
534                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin, 
535                 receiver->td_scale);
536                 
537         if (sender->td_end == 0) {
538                 /*
539                  * Initialize sender data.
540                  */
541                 if (tcph->syn && tcph->ack) {
542                         /*
543                          * Outgoing SYN-ACK in reply to a SYN.
544                          */
545                         sender->td_end = 
546                         sender->td_maxend = end;
547                         sender->td_maxwin = (win == 0 ? 1 : win);
548
549                         tcp_options(skb, iph, tcph, sender);
550                         /* 
551                          * RFC 1323:
552                          * Both sides must send the Window Scale option
553                          * to enable window scaling in either direction.
554                          */
555                         if (!(sender->flags & IP_CT_TCP_STATE_FLAG_WINDOW_SCALE
556                               && receiver->flags & IP_CT_TCP_STATE_FLAG_WINDOW_SCALE))
557                                 sender->td_scale = 
558                                 receiver->td_scale = 0;
559                 } else {
560                         /*
561                          * We are in the middle of a connection,
562                          * its history is lost for us.
563                          * Let's try to use the data from the packet.
564                          */
565                         sender->td_end = end;
566                         sender->td_maxwin = (win == 0 ? 1 : win);
567                         sender->td_maxend = end + sender->td_maxwin;
568                 }
569         } else if (state->state == TCP_CONNTRACK_SYN_SENT
570                    && dir == IP_CT_DIR_ORIGINAL
571                    && after(end, sender->td_end)) {
572                 /*
573                  * RFC 793: "if a TCP is reinitialized ... then it need
574                  * not wait at all; it must only be sure to use sequence 
575                  * numbers larger than those recently used."
576                  */
577                 sender->td_end =
578                 sender->td_maxend = end;
579                 sender->td_maxwin = (win == 0 ? 1 : win);
580
581                 tcp_options(skb, iph, tcph, sender);
582         }
583         
584         if (!(tcph->ack)) {
585                 /*
586                  * If there is no ACK, just pretend it was set and OK.
587                  */
588                 ack = sack = receiver->td_end;
589         } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == 
590                     (TCP_FLAG_ACK|TCP_FLAG_RST)) 
591                    && (ack == 0)) {
592                 /*
593                  * Broken TCP stacks, that set ACK in RST packets as well
594                  * with zero ack value.
595                  */
596                 ack = sack = receiver->td_end;
597         }
598
599         if (seq == end)
600                 /*
601                  * Packets contains no data: we assume it is valid
602                  * and check the ack value only.
603                  */
604                 seq = end = sender->td_end;
605                 
606         DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
607                "seq=%u ack=%u sack =%u win=%u end=%u trim=%u\n",
608                 NIPQUAD(iph->saddr), ntohs(tcph->source),
609                 NIPQUAD(iph->daddr), ntohs(tcph->dest),
610                 seq, ack, sack, win, end, 
611                 after(end, sender->td_maxend) && before(seq, sender->td_maxend)
612                 ? sender->td_maxend : end);
613         DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
614                "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
615                 sender->td_end, sender->td_maxend, sender->td_maxwin,
616                 sender->td_scale, 
617                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
618                 receiver->td_scale);
619         
620         /* Ignore data over the right edge of the receiver's window. */
621         if (after(end, sender->td_maxend) &&
622             before(seq, sender->td_maxend)) {
623                 end = sender->td_maxend;
624                 if (*index == TCP_FIN_SET)
625                         *index = TCP_ACK_SET;
626         }
627         DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
628                 before(end, sender->td_maxend + 1) 
629                     || before(seq, sender->td_maxend + 1),
630                 after(seq, sender->td_end - receiver->td_maxwin - 1) 
631                     || after(end, sender->td_end - receiver->td_maxwin - 1),
632                 before(sack, receiver->td_end + 1),
633                 after(ack, receiver->td_end - MAXACKWINDOW(sender)));
634         
635         if (sender->loose || receiver->loose ||
636             (before(end, sender->td_maxend + 1) &&
637              after(seq, sender->td_end - receiver->td_maxwin - 1) &&
638              before(sack, receiver->td_end + 1) &&
639              after(ack, receiver->td_end - MAXACKWINDOW(sender)))) {
640                 /*
641                  * Take into account window scaling (RFC 1323).
642                  */
643                 if (!tcph->syn)
644                         win <<= sender->td_scale;
645                 
646                 /*
647                  * Update sender data.
648                  */
649                 swin = win + (sack - ack);
650                 if (sender->td_maxwin < swin)
651                         sender->td_maxwin = swin;
652                 if (after(end, sender->td_end))
653                         sender->td_end = end;
654                 if (after(sack + win, receiver->td_maxend - 1)) {
655                         receiver->td_maxend = sack + win;
656                         if (win == 0)
657                                 receiver->td_maxend++;
658                 }
659
660                 /* 
661                  * Check retransmissions.
662                  */
663                 if (*index == TCP_ACK_SET) {
664                         if (state->last_dir == dir
665                             && state->last_seq == seq
666                             && state->last_end == end)
667                                 state->retrans++;
668                         else {
669                                 state->last_dir = dir;
670                                 state->last_seq = seq;
671                                 state->last_end = end;
672                                 state->retrans = 0;
673                         }
674                 }
675                 /*
676                  * Close the window of disabled window tracking :-)
677                  */
678                 if (sender->loose)
679                         sender->loose--;
680                 
681                 res = 1;
682         } else {
683                 if (LOG_INVALID(IPPROTO_TCP))
684                         nf_log_packet(PF_INET, 0, skb, NULL, NULL,
685                         "ip_ct_tcp: %s ",
686                         before(end, sender->td_maxend + 1) ?
687                         after(seq, sender->td_end - receiver->td_maxwin - 1) ?
688                         before(ack, receiver->td_end + 1) ?
689                         after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG"
690                         : "ACK is under the lower bound (possibly overly delayed ACK)"
691                         : "ACK is over the upper bound (ACKed data has never seen yet)"
692                         : "SEQ is under the lower bound (retransmitted already ACKed data)"
693                         : "SEQ is over the upper bound (over the window of the receiver)");
694
695                 res = ip_ct_tcp_be_liberal && !tcph->rst;
696         }
697   
698         DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u "
699                "receiver end=%u maxend=%u maxwin=%u\n",
700                 res, sender->td_end, sender->td_maxend, sender->td_maxwin, 
701                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
702
703         return res;
704 }
705
#ifdef CONFIG_IP_NF_NAT_NEEDED
/*
 * Re-synchronise the tracking data after NAT has successfully mangled
 * a packet: recompute the segment's end sequence number from the packet
 * as it is now and, under tcp_lock, advance seen[dir].td_end if the new
 * end lies beyond it.  last_end is always set to the new value.
 *
 * Always returns 1.
 */
int ip_conntrack_tcp_update(struct sk_buff *skb,
			    struct ip_conntrack *conntrack,
			    int dir)
{
	struct iphdr *iph = skb->nh.iph;
	struct tcphdr *tcph = (void *)iph + iph->ihl*4;
	struct ip_ct_tcp *ct_tcp = &conntrack->proto.tcp;
#ifdef DEBUGP_VARS
	/* Only needed by the DEBUGP() call below. */
	struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir];
	struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir];
#endif
	__u32 end;

	end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph);

	WRITE_LOCK(&tcp_lock);
	/*
	 * We have to worry for the ack in the reply packet only...
	 */
	if (after(end, ct_tcp->seen[dir].td_end))
		ct_tcp->seen[dir].td_end = end;
	ct_tcp->last_end = end;
	WRITE_UNLOCK(&tcp_lock);

	DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
	       "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
		sender->td_end, sender->td_maxend, sender->td_maxwin,
		sender->td_scale,
		receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
		receiver->td_scale);

	return 1;
}

EXPORT_SYMBOL(ip_conntrack_tcp_update);
#endif
742
743 #define TH_FIN  0x01
744 #define TH_SYN  0x02
745 #define TH_RST  0x04
746 #define TH_PUSH 0x08
747 #define TH_ACK  0x10
748 #define TH_URG  0x20
749 #define TH_ECE  0x40
750 #define TH_CWR  0x80
751
752 /* table of valid flag combinations - ECE and CWR are always valid */
753 static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] =
754 {
755         [TH_SYN]                        = 1,
756         [TH_SYN|TH_ACK]                 = 1,
757         [TH_RST]                        = 1,
758         [TH_RST|TH_ACK]                 = 1,
759         [TH_RST|TH_ACK|TH_PUSH]         = 1,
760         [TH_FIN|TH_ACK]                 = 1,
761         [TH_ACK]                        = 1,
762         [TH_ACK|TH_PUSH]                = 1,
763         [TH_ACK|TH_URG]                 = 1,
764         [TH_ACK|TH_URG|TH_PUSH]         = 1,
765         [TH_FIN|TH_ACK|TH_PUSH]         = 1,
766         [TH_FIN|TH_ACK|TH_URG]          = 1,
767         [TH_FIN|TH_ACK|TH_URG|TH_PUSH]  = 1,
768 };
769
770 /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c.  */
771 static int tcp_error(struct sk_buff *skb,
772                      enum ip_conntrack_info *ctinfo,
773                      unsigned int hooknum)
774 {
775         struct iphdr *iph = skb->nh.iph;
776         struct tcphdr _tcph, *th;
777         unsigned int tcplen = skb->len - iph->ihl * 4;
778         u_int8_t tcpflags;
779
780         /* Smaller that minimal TCP header? */
781         th = skb_header_pointer(skb, iph->ihl * 4,
782                                 sizeof(_tcph), &_tcph);
783         if (th == NULL) {
784                 if (LOG_INVALID(IPPROTO_TCP))
785                         nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
786                                 "ip_ct_tcp: short packet ");
787                 return -NF_ACCEPT;
788         }
789   
790         /* Not whole TCP header or malformed packet */
791         if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
792                 if (LOG_INVALID(IPPROTO_TCP))
793                         nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
794                                 "ip_ct_tcp: truncated/malformed packet ");
795                 return -NF_ACCEPT;
796         }
797   
798         /* Checksum invalid? Ignore.
799          * We skip checking packets on the outgoing path
800          * because the semantic of CHECKSUM_HW is different there 
801          * and moreover root might send raw packets.
802          */
803         /* FIXME: Source route IP option packets --RR */
804         if (hooknum == NF_IP_PRE_ROUTING
805             && csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP,
806                                  skb->ip_summed == CHECKSUM_HW ? skb->csum
807                                  : skb_checksum(skb, iph->ihl*4, tcplen, 0))) {
808                 if (LOG_INVALID(IPPROTO_TCP))
809                         nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
810                                   "ip_ct_tcp: bad TCP checksum ");
811                 return -NF_ACCEPT;
812         }
813
814         /* Check TCP flags. */
815         tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
816         if (!tcp_valid_flags[tcpflags]) {
817                 if (LOG_INVALID(IPPROTO_TCP))
818                         nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
819                                   "ip_ct_tcp: invalid TCP flag combination ");
820                 return -NF_ACCEPT;
821         }
822
823         return NF_ACCEPT;
824 }
825
826 /* Returns verdict for packet, or -1 for invalid. */
827 static int tcp_packet(struct ip_conntrack *conntrack,
828                       const struct sk_buff *skb,
829                       enum ip_conntrack_info ctinfo)
830 {
831         enum tcp_conntrack new_state, old_state;
832         enum ip_conntrack_dir dir;
833         struct iphdr *iph = skb->nh.iph;
834         struct tcphdr *th, _tcph;
835         unsigned long timeout;
836         unsigned int index;
837         
838         th = skb_header_pointer(skb, iph->ihl * 4,
839                                 sizeof(_tcph), &_tcph);
840         BUG_ON(th == NULL);
841         
842         WRITE_LOCK(&tcp_lock);
843         old_state = conntrack->proto.tcp.state;
844         dir = CTINFO2DIR(ctinfo);
845         index = get_conntrack_index(th);
846         new_state = tcp_conntracks[dir][index][old_state];
847
848         switch (new_state) {
849         case TCP_CONNTRACK_IGNORE:
850                 /* Either SYN in ORIGINAL, or SYN/ACK in REPLY direction. */
851                 if (index == TCP_SYNACK_SET
852                     && conntrack->proto.tcp.last_index == TCP_SYN_SET
853                     && conntrack->proto.tcp.last_dir != dir
854                     && after(ntohl(th->ack_seq),
855                              conntrack->proto.tcp.last_seq)) {
856                         /* This SYN/ACK acknowledges a SYN that we earlier 
857                          * ignored as invalid. This means that the client and
858                          * the server are both in sync, while the firewall is
859                          * not. We kill this session and block the SYN/ACK so
860                          * that the client cannot but retransmit its SYN and 
861                          * thus initiate a clean new session.
862                          */
863                         WRITE_UNLOCK(&tcp_lock);
864                         if (LOG_INVALID(IPPROTO_TCP))
865                                 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
866                                           "ip_ct_tcp: killing out of sync session ");
867                         if (del_timer(&conntrack->timeout))
868                                 conntrack->timeout.function((unsigned long)
869                                                             conntrack);
870                         return -NF_DROP;
871                 }
872                 conntrack->proto.tcp.last_index = index;
873                 conntrack->proto.tcp.last_dir = dir;
874                 conntrack->proto.tcp.last_seq = ntohl(th->seq);
875                 
876                 WRITE_UNLOCK(&tcp_lock);
877                 if (LOG_INVALID(IPPROTO_TCP))
878                         nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
879                                   "ip_ct_tcp: invalid SYN (ignored) ");
880                 return NF_ACCEPT;
881         case TCP_CONNTRACK_MAX:
882                 /* Invalid packet */
883                 DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
884                        dir, get_conntrack_index(th),
885                        old_state);
886                 WRITE_UNLOCK(&tcp_lock);
887                 if (LOG_INVALID(IPPROTO_TCP))
888                         nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
889                                   "ip_ct_tcp: invalid state ");
890                 return -NF_ACCEPT;
891         case TCP_CONNTRACK_SYN_SENT:
892                 if (old_state >= TCP_CONNTRACK_TIME_WAIT) {     
893                         /* Attempt to reopen a closed connection.
894                         * Delete this connection and look up again. */
895                         WRITE_UNLOCK(&tcp_lock);
896                         if (del_timer(&conntrack->timeout))
897                                 conntrack->timeout.function((unsigned long)
898                                                             conntrack);
899                         return -NF_REPEAT;
900                 }
901                 break;
902         case TCP_CONNTRACK_CLOSE:
903                 if (index == TCP_RST_SET
904                     && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
905                     && conntrack->proto.tcp.last_index <= TCP_SYNACK_SET
906                     && after(ntohl(th->ack_seq),
907                              conntrack->proto.tcp.last_seq)) {
908                         /* Ignore RST closing down invalid SYN 
909                            we had let trough. */ 
910                         WRITE_UNLOCK(&tcp_lock);
911                         if (LOG_INVALID(IPPROTO_TCP))
912                                 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
913                                           "ip_ct_tcp: invalid RST (ignored) ");
914                         return NF_ACCEPT;
915                 }
916                 /* Just fall trough */
917         default:
918                 /* Keep compilers happy. */
919                 break;
920         }
921
922         if (!tcp_in_window(&conntrack->proto.tcp, dir, &index, 
923                            skb, iph, th)) {
924                 WRITE_UNLOCK(&tcp_lock);
925                 return -NF_ACCEPT;
926         }
927         /* From now on we have got in-window packets */
928         
929         /* If FIN was trimmed off, we don't change state. */
930         conntrack->proto.tcp.last_index = index;
931         new_state = tcp_conntracks[dir][index][old_state];
932
933         DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
934                "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
935                 NIPQUAD(iph->saddr), ntohs(th->source),
936                 NIPQUAD(iph->daddr), ntohs(th->dest),
937                 (th->syn ? 1 : 0), (th->ack ? 1 : 0),
938                 (th->fin ? 1 : 0), (th->rst ? 1 : 0),
939                 old_state, new_state);
940
941         conntrack->proto.tcp.state = new_state;
942         timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans
943                   && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans
944                   ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
945         WRITE_UNLOCK(&tcp_lock);
946
947         if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
948                 /* If only reply is a RST, we can consider ourselves not to
949                    have an established connection: this is a fairly common
950                    problem case, so we can delete the conntrack
951                    immediately.  --RR */
952                 if (th->rst) {
953                         if (del_timer(&conntrack->timeout))
954                                 conntrack->timeout.function((unsigned long)
955                                                             conntrack);
956                         return NF_ACCEPT;
957                 }
958         } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
959                    && (old_state == TCP_CONNTRACK_SYN_RECV
960                        || old_state == TCP_CONNTRACK_ESTABLISHED)
961                    && new_state == TCP_CONNTRACK_ESTABLISHED) {
962                 /* Set ASSURED if we see see valid ack in ESTABLISHED 
963                    after SYN_RECV or a valid answer for a picked up 
964                    connection. */
965                         set_bit(IPS_ASSURED_BIT, &conntrack->status);
966         }
967         ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout);
968
969         return NF_ACCEPT;
970 }
971  
972   /* Called when a new connection for this protocol found. */
973 static int tcp_new(struct ip_conntrack *conntrack,
974                    const struct sk_buff *skb)
975 {
976         enum tcp_conntrack new_state;
977         struct iphdr *iph = skb->nh.iph;
978         struct tcphdr *th, _tcph;
979 #ifdef DEBUGP_VARS
980         struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0];
981         struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1];
982 #endif
983
984         th = skb_header_pointer(skb, iph->ihl * 4,
985                                 sizeof(_tcph), &_tcph);
986         BUG_ON(th == NULL);
987         
988         /* Don't need lock here: this conntrack not in circulation yet */
989         new_state
990                 = tcp_conntracks[0][get_conntrack_index(th)]
991                 [TCP_CONNTRACK_NONE];
992
993         /* Invalid: delete conntrack */
994         if (new_state >= TCP_CONNTRACK_MAX) {
995                 DEBUGP("ip_ct_tcp: invalid new deleting.\n");
996                 return 0;
997         }
998
999         if (new_state == TCP_CONNTRACK_SYN_SENT) {
1000                 /* SYN packet */
1001                 conntrack->proto.tcp.seen[0].td_end =
1002                         segment_seq_plus_len(ntohl(th->seq), skb->len,
1003                                              iph, th);
1004                 conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1005                 if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
1006                         conntrack->proto.tcp.seen[0].td_maxwin = 1;
1007                 conntrack->proto.tcp.seen[0].td_maxend =
1008                         conntrack->proto.tcp.seen[0].td_end;
1009
1010                 tcp_options(skb, iph, th, &conntrack->proto.tcp.seen[0]);
1011                 conntrack->proto.tcp.seen[1].flags = 0;
1012                 conntrack->proto.tcp.seen[0].loose = 
1013                 conntrack->proto.tcp.seen[1].loose = 0;
1014         } else if (ip_ct_tcp_loose == 0) {
1015                 /* Don't try to pick up connections. */
1016                 return 0;
1017         } else {
1018                 /*
1019                  * We are in the middle of a connection,
1020                  * its history is lost for us.
1021                  * Let's try to use the data from the packet.
1022                  */
1023                 conntrack->proto.tcp.seen[0].td_end =
1024                         segment_seq_plus_len(ntohl(th->seq), skb->len,
1025                                              iph, th);
1026                 conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1027                 if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
1028                         conntrack->proto.tcp.seen[0].td_maxwin = 1;
1029                 conntrack->proto.tcp.seen[0].td_maxend =
1030                         conntrack->proto.tcp.seen[0].td_end + 
1031                         conntrack->proto.tcp.seen[0].td_maxwin;
1032                 conntrack->proto.tcp.seen[0].td_scale = 0;
1033
1034                 /* We assume SACK. Should we assume window scaling too? */
1035                 conntrack->proto.tcp.seen[0].flags =
1036                 conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM;
1037                 conntrack->proto.tcp.seen[0].loose = 
1038                 conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose;
1039         }
1040     
1041         conntrack->proto.tcp.seen[1].td_end = 0;
1042         conntrack->proto.tcp.seen[1].td_maxend = 0;
1043         conntrack->proto.tcp.seen[1].td_maxwin = 1;
1044         conntrack->proto.tcp.seen[1].td_scale = 0;      
1045
1046         /* tcp_packet will set them */
1047         conntrack->proto.tcp.state = TCP_CONNTRACK_NONE;
1048         conntrack->proto.tcp.last_index = TCP_NONE_SET;
1049          
1050         DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
1051                "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
1052                 sender->td_end, sender->td_maxend, sender->td_maxwin,
1053                 sender->td_scale, 
1054                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
1055                 receiver->td_scale);
1056         return 1;
1057 }
1058   
1059 static int tcp_exp_matches_pkt(struct ip_conntrack_expect *exp,
1060                                const struct sk_buff *skb)
1061 {
1062         const struct iphdr *iph = skb->nh.iph;
1063         struct tcphdr *th, _tcph;
1064         unsigned int datalen;
1065
1066         th = skb_header_pointer(skb, iph->ihl * 4,
1067                                 sizeof(_tcph), &_tcph);
1068         if (th == NULL)
1069                 return 0;
1070         datalen = skb->len - iph->ihl*4 - th->doff*4;
1071
1072         return between(exp->seq, ntohl(th->seq), ntohl(th->seq) + datalen);
1073 }
1074
1075 struct ip_conntrack_protocol ip_conntrack_protocol_tcp =
1076 {
1077         .proto                  = IPPROTO_TCP,
1078         .name                   = "tcp",
1079         .pkt_to_tuple           = tcp_pkt_to_tuple,
1080         .invert_tuple           = tcp_invert_tuple,
1081         .print_tuple            = tcp_print_tuple,
1082         .print_conntrack        = tcp_print_conntrack,
1083         .packet                 = tcp_packet,
1084         .new                    = tcp_new,
1085         .exp_matches_pkt        = tcp_exp_matches_pkt,
1086         .error                  = tcp_error,
1087 };