1 /* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
8 * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>:
9 * - Real stateful connection tracking
10 * - Modified state transitions table
11 * - Window scaling support added
12 * - SACK support added
15 * - State table bugfixes
16 * - More robust state changes
17 * - Tuning timer parameters
22 #include <linux/config.h>
23 #include <linux/types.h>
24 #include <linux/sched.h>
25 #include <linux/timer.h>
26 #include <linux/netfilter.h>
27 #include <linux/module.h>
30 #include <linux/tcp.h>
31 #include <linux/spinlock.h>
35 #include <linux/netfilter.h>
36 #include <linux/netfilter_ipv4.h>
37 #include <linux/netfilter_ipv4/ip_conntrack.h>
38 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
39 #include <linux/netfilter_ipv4/lockhelp.h>
/* Debug tracing is compiled out: DEBUGP() expands to nothing. */
45 #define DEBUGP(format, args...)
48 /* Protects conntrack->proto.tcp: read-locked by tcp_print_conntrack(),
   write-locked by tcp_packet() and ip_conntrack_tcp_update(). */
49 static DECLARE_RWLOCK(tcp_lock);
51 /* "Be conservative in what you do,
52 be liberal in what you accept from others."
53 If it's non-zero, we mark only out of window RST segments as INVALID. */
54 int ip_ct_tcp_be_liberal = 0;
56 /* When connection is picked up from the middle, how many packets are required
57 to pass in each direction when we assume we are in sync - if any side uses
58 window scaling, we lost the game.
59 If it is set to zero, we disable picking up already established
61 int ip_ct_tcp_loose = 3;
63 /* Max number of the retransmitted packets without receiving an (acceptable)
64 ACK from the destination. If this number is reached, a shorter timer
66 int ip_ct_tcp_max_retrans = 3;
68 /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
69 closely. They're more complex. --RR */
71 static const char *tcp_conntrack_names[] = {
/* Default timeout for each tracked TCP state.
   MINS, HOURS and DAYS are postfix multipliers: each one expands to a
   multiplication chained onto the next smaller unit, so 2 MINS becomes
   2 * 60 SECS.  All of them therefore rely on SECS being defined.
   tcp_timeouts[] stores pointers to these variables. */
85 #define MINS * 60 SECS
86 #define HOURS * 60 MINS
87 #define DAYS * 24 HOURS
89 unsigned long ip_ct_tcp_timeout_syn_sent = 2 MINS;
90 unsigned long ip_ct_tcp_timeout_syn_recv = 60 SECS;
91 unsigned long ip_ct_tcp_timeout_established = 5 DAYS;
92 unsigned long ip_ct_tcp_timeout_fin_wait = 2 MINS;
93 unsigned long ip_ct_tcp_timeout_close_wait = 60 SECS;
94 unsigned long ip_ct_tcp_timeout_last_ack = 30 SECS;
95 unsigned long ip_ct_tcp_timeout_time_wait = 2 MINS;
96 unsigned long ip_ct_tcp_timeout_close = 10 SECS;
98 /* RFC1122 says the R2 limit should be at least 100 seconds.
99 Linux uses 15 packets as limit, which corresponds
100 to ~13-30min depending on RTO. */
101 unsigned long ip_ct_tcp_timeout_max_retrans = 5 MINS;
/* Timeout lookup table with one entry per conntrack state, in the order
   named by the per-entry comments.  The entries are pointers to the
   timeout variables, not copies of their values; tcp_packet() dereferences
   the entry of the new state.  NONE and LISTEN have no timeout variable
   and are NULL. */
103 static unsigned long * tcp_timeouts[]
104 = { NULL, /* TCP_CONNTRACK_NONE */
105 &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */
106 &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */
107 &ip_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */
108 &ip_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */
109 &ip_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */
110 &ip_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */
111 &ip_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */
112 &ip_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */
113 NULL, /* TCP_CONNTRACK_LISTEN */
/* Two-letter shorthands for the conntrack states, used as the cell values
   and column labels of the tcp_conntracks transition table.
   sIV (invalid) is TCP_CONNTRACK_MAX and sIG (ignore) is
   TCP_CONNTRACK_IGNORE; neither is a real connection state. */
116 #define sNO TCP_CONNTRACK_NONE
117 #define sSS TCP_CONNTRACK_SYN_SENT
118 #define sSR TCP_CONNTRACK_SYN_RECV
119 #define sES TCP_CONNTRACK_ESTABLISHED
120 #define sFW TCP_CONNTRACK_FIN_WAIT
121 #define sCW TCP_CONNTRACK_CLOSE_WAIT
122 #define sLA TCP_CONNTRACK_LAST_ACK
123 #define sTW TCP_CONNTRACK_TIME_WAIT
124 #define sCL TCP_CONNTRACK_CLOSE
125 #define sLI TCP_CONNTRACK_LISTEN
126 #define sIV TCP_CONNTRACK_MAX
127 #define sIG TCP_CONNTRACK_IGNORE
129 /* What TCP flags are set from RST/SYN/FIN/ACK. */
140 * The TCP state transition table needs a few words...
142 * We are the man in the middle. All the packets go through us
143 * but might get lost in transit to the destination.
144 * It is assumed that the destinations can't receive segments
147 * The checked segment is in window, but our windows are *not*
148 * equivalent with the ones of the sender/receiver. We always
149 * try to guess the state of the current sender.
151 * The meanings of the states are:
153 * NONE: initial state
154 * SYN_SENT: SYN-only packet seen
155 * SYN_RECV: SYN-ACK packet seen
156 * ESTABLISHED: ACK packet seen
157 * FIN_WAIT: FIN packet seen
158 * CLOSE_WAIT: ACK seen (after FIN)
159 * LAST_ACK: FIN seen (after FIN)
160 * TIME_WAIT: last ACK seen
161 * CLOSE: closed connection
163 * LISTEN state is not used.
165 * Packets marked as IGNORED (sIG):
166 * if they may be either invalid or valid
167 * and the receiver may send back a connection
168 * closing RST or a SYN/ACK.
170 * Packets marked as INVALID (sIV):
171 * if they are invalid
172 * or we do not support the request (simultaneous open)
/* State transition table, looked up as
   tcp_conntracks[direction][flag index][current state].
   The flag index is the value produced by get_conntrack_index(); the rows
   appear in the order syn, synack, fin, ack, rst, none.  The first group of
   rows describes packets from the client (tcp_new() reads it with index 0),
   the second group packets from the server.  Each cell is the next state,
   or sIV / sIG for invalid / ignored packets. */
174 static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
177 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
178 /*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV },
180 * sNO -> sSS Initialize a new connection
181 * sSS -> sSS Retransmitted SYN
182 * sSR -> sIG Late retransmitted SYN?
183 * sES -> sIG Error: SYNs in window outside the SYN_SENT state
184 * are errors. Receiver will reply with RST
185 * and close the connection.
186 * Or we are not in sync and hold a dead connection.
190 * sTW -> sSS Reopened connection (RFC 1122).
193 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
194 /*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
196 * A SYN/ACK from the client is always invalid:
197 * - either it tries to set up a simultaneous open, which is
199 * - or the firewall has just been inserted between the two hosts
200 * during the session set-up. The SYN will be retransmitted
201 * by the true client (or it'll time out).
203 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
204 /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
206 * sNO -> sIV Too late and no reason to do anything...
207 * sSS -> sIV Client migth not send FIN in this state:
208 * we enforce waiting for a SYN/ACK reply first.
209 * sSR -> sFW Close started.
211 * sFW -> sLA FIN seen in both directions, waiting for
213 * Migth be a retransmitted FIN as well...
215 * sLA -> sLA Retransmitted FIN. Remain in the same state.
219 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
220 /*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
222 * sNO -> sES Assumed.
223 * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet.
224 * sSR -> sES Established state is reached.
226 * sFW -> sCW Normal close request answered by ACK.
228 * sLA -> sTW Last ACK detected.
229 * sTW -> sTW Retransmitted last ACK. Remain in the same state.
232 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
233 /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
234 /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
238 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
239 /*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
241 * sNO -> sIV Never reached.
242 * sSS -> sIV Simultaneous open, not supported
243 * sSR -> sIV Simultaneous open, not supported.
244 * sES -> sIV Server may not initiate a connection.
248 * sTW -> sIV Reopened connection, but server may not do it.
251 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
252 /*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV },
254 * sSS -> sSR Standard open.
255 * sSR -> sSR Retransmitted SYN/ACK.
256 * sES -> sIG Late retransmitted SYN/ACK?
263 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
264 /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
266 * sSS -> sIV Server might not send FIN in this state.
267 * sSR -> sFW Close started.
269 * sFW -> sLA FIN seen in both directions.
271 * sLA -> sLA Retransmitted FIN.
275 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
276 /*ack*/ { sIV, sIV, sIV, sES, sCW, sCW, sTW, sTW, sCL, sIV },
278 * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet.
279 * sSR -> sIV Simultaneous open.
281 * sFW -> sCW Normal close request answered by ACK.
283 * sLA -> sTW Last ACK detected.
284 * sTW -> sTW Retransmitted last ACK.
287 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
288 /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
289 /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
/* Fill the TCP port fields of @tuple from the TCP header located at offset
   @dataoff in @skb.  The header is obtained through skb_header_pointer(),
   which is handed the on-stack _hdr as its buffer; only 8 bytes are
   requested.  The ports are copied as found in the header, without any
   byte-order conversion. */
293 static int tcp_pkt_to_tuple(const struct sk_buff *skb,
294 unsigned int dataoff,
295 struct ip_conntrack_tuple *tuple)
297 struct tcphdr _hdr, *hp;
299 /* Actually only need first 8 bytes. */
300 hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
304 tuple->src.u.tcp.port = hp->source;
305 tuple->dst.u.tcp.port = hp->dest;
/* Build the TCP part of the reverse-direction tuple: the source port of
   @tuple is the destination port of @orig and vice versa. */
310 static int tcp_invert_tuple(struct ip_conntrack_tuple *tuple,
311 const struct ip_conntrack_tuple *orig)
313 tuple->src.u.tcp.port = orig->dst.u.tcp.port;
314 tuple->dst.u.tcp.port = orig->src.u.tcp.port;
318 /* Print out the per-protocol part of the tuple. */
/* Writes the source and destination port of @tuple to @s, each converted
   with ntohs(), and returns the result of seq_printf(). */
319 static int tcp_print_tuple(struct seq_file *s,
320 const struct ip_conntrack_tuple *tuple)
322 return seq_printf(s, "sport=%hu dport=%hu ",
323 ntohs(tuple->src.u.tcp.port),
324 ntohs(tuple->dst.u.tcp.port));
327 /* Print out the private part of the conntrack. */
/* Writes the name of the current TCP state (from tcp_conntrack_names)
   followed by a space to @s and returns the result of seq_printf().
   The state is copied into a local while tcp_lock is read-held, so the
   lock is already released when seq_printf() runs. */
328 static int tcp_print_conntrack(struct seq_file *s,
329 const struct ip_conntrack *conntrack)
331 enum tcp_conntrack state;
333 READ_LOCK(&tcp_lock);
334 state = conntrack->proto.tcp.state;
335 READ_UNLOCK(&tcp_lock);
337 return seq_printf(s, "%s ", tcp_conntrack_names[state]);
/* Map the flags of @tcph to one of the TCP_*_SET values, which tcp_packet()
   and tcp_new() use as the flag index into tcp_conntracks.
   The tests are ordered by priority: RST wins over everything, then SYN
   (reported as SYN/ACK when ACK is set too), then FIN, then ACK.  A FIN
   that also carries ACK is therefore reported as TCP_FIN_SET.  With none
   of these flags set the result is TCP_NONE_SET. */
340 static unsigned int get_conntrack_index(const struct tcphdr *tcph)
342 if (tcph->rst) return TCP_RST_SET;
343 else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
344 else if (tcph->fin) return TCP_FIN_SET;
345 else if (tcph->ack) return TCP_ACK_SET;
346 else return TCP_NONE_SET;
349 /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
350 in IP Filter' by Guido van Rooij.
352 http://www.nluug.nl/events/sane2000/papers.html
353 http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
355 The boundaries and the conditions are slightly changed:
357 td_maxend = max(sack + max(win,1)) seen in reply packets
358 td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
359 td_end = max(seq + len) seen in sent packets
361 I. Upper bound for valid data: seq + len <= sender.td_maxend
362 II. Lower bound for valid data: seq >= sender.td_end - receiver.td_maxwin
363 III. Upper bound for valid ack: sack <= receiver.td_end
364 IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW
366 where sack is the highest right edge of sack block found in the packet.
368 The upper bound limit for a valid ack is not ignored -
369 we don't have to deal with fragments.
/* Compute the sequence number at the end of a segment: @seq plus the
   payload length, plus one for SYN and one for FIN when set.
   The payload length is len minus the IP and TCP header lengths; ihl and
   doff are multiplied by 4 to turn them into byte counts. */
372 static inline __u32 segment_seq_plus_len(__u32 seq,
377 return (seq + len - (iph->ihl + tcph->doff)*4
378 + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
381 /* Fixme: what about big packets? */
/* MAXACKWINDOW(sender) is the tolerance used for the lower bound of a valid
   ack.  It yields the td_maxwin of @sender when that exceeds MAXACKWINCONST.
   NOTE(review): the other arm of the conditional is presumably
   MAXACKWINCONST itself - confirm against the full macro. */
382 #define MAXACKWINCONST 66000
383 #define MAXACKWINDOW(sender) \
384 ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \
388 * Simplified tcp_parse_options routine from tcp_input.c
/* Scan the TCP options of the packet and record in @state what matters for
   window tracking:
   - a SACK-permitted option of the expected length sets
     IP_CT_TCP_FLAG_SACK_PERM in state->flags;
   - a window scale option of the expected length stores its shift count in
     state->td_scale, clamped to 14, and involves
     IP_CT_TCP_STATE_FLAG_WINDOW_SCALE (presumably OR-ed into state->flags;
     tcp_in_window() tests that bit there).
   The option bytes start right after the fixed TCP header and are fetched
   with skb_header_pointer().  buff is sized as 15 * 4 bytes minus the fixed
   header, and length is the option area size derived from doff. */
390 static void tcp_options(const struct sk_buff *skb,
393 struct ip_ct_tcp_state *state)
395 unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
397 int length = (tcph->doff*4) - sizeof(struct tcphdr);
402 ptr = skb_header_pointer(skb,
403 (iph->ihl * 4) + sizeof(struct tcphdr),
417 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
422 if (opsize < 2) /* "silly options" */
425 break; /* don't parse partial options */
427 if (opcode == TCPOPT_SACK_PERM
428 && opsize == TCPOLEN_SACK_PERM)
429 state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
430 else if (opcode == TCPOPT_WINDOW
431 && opsize == TCPOLEN_WINDOW) {
432 state->td_scale = *(u_int8_t *)ptr;
434 if (state->td_scale > 14) {
436 state->td_scale = 14;
439 IP_CT_TCP_STATE_FLAG_WINDOW_SCALE;
/* Look for SACK blocks in the options of @tcph and compare the right edge
   of each block with *@sack; tcp_in_window() passes the plain ack value in
   and uses the result as the highest acknowledged edge.
   - The options are read directly behind the fixed header (tcph + 1).
   - A fast path recognises an option area that has exactly the aligned
     timestamp length and starts with the NOP / TIMESTAMP pattern.
   - A SACK option is only examined when it holds at least one block and a
     whole number of blocks.  For each block the second 32-bit word is
     converted with ntohl() and tested with after() against *@sack
     (presumably *@sack is then raised to it - confirm). */
447 static void tcp_sack(struct tcphdr *tcph, __u32 *sack)
451 int length = (tcph->doff*4) - sizeof(struct tcphdr);
453 /* Fast path for timestamp-only option */
454 if (length == TCPOLEN_TSTAMP_ALIGNED*4
455 && *(__u32 *)(tcph + 1) ==
456 __constant_ntohl((TCPOPT_NOP << 24)
458 | (TCPOPT_TIMESTAMP << 8)
459 | TCPOLEN_TIMESTAMP))
462 ptr = (unsigned char *)(tcph + 1);
470 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
475 if (opsize < 2) /* "silly options" */
478 break; /* don't parse partial options */
480 if (opcode == TCPOPT_SACK
481 && opsize >= (TCPOLEN_SACK_BASE
482 + TCPOLEN_SACK_PERBLOCK)
483 && !((opsize - TCPOLEN_SACK_BASE)
484 % TCPOLEN_SACK_PERBLOCK)) {
486 i < (opsize - TCPOLEN_SACK_BASE);
487 i += TCPOLEN_SACK_PERBLOCK) {
488 tmp = ntohl(*((u_int32_t *)(ptr+i)+1));
490 if (after(tmp, *sack))
/* Check whether a segment lies inside the tracked windows and, if it does,
   update the per-direction window state kept in @state.
   sender is state->seen[dir], receiver is state->seen[!dir].
   - seq, ack and win come from the TCP header, end from
     segment_seq_plus_len().  sack starts equal to ack and is refined by
     tcp_sack() when the receiver has IP_CT_TCP_FLAG_SACK_PERM set.
   - A sender whose td_end is still 0 is initialised from this packet: a
     SYN/ACK also has its options parsed, anything else is treated as a
     connection picked up in the middle.
   - A segment reaching past sender->td_maxend while starting before it is
     trimmed to td_maxend; *index is then changed from TCP_FIN_SET to
     TCP_ACK_SET.
   - The segment is accepted when either side is still loose, or when all
     four bound checks hold (shown as I to IV in the debug output).  On
     acceptance win is scaled by td_scale and the td_maxwin and td_end of
     the sender and the td_maxend of the receiver are advanced.
   - Otherwise the violated bound is logged when LOG_INVALID(IPPROTO_TCP) is
     set, and the result is ip_ct_tcp_be_liberal && !tcph->rst. */
501 static int tcp_in_window(struct ip_ct_tcp *state,
502 enum ip_conntrack_dir dir,
504 const struct sk_buff *skb,
508 struct ip_ct_tcp_state *sender = &state->seen[dir];
509 struct ip_ct_tcp_state *receiver = &state->seen[!dir];
510 __u32 seq, ack, sack, end, win, swin;
514 * Get the required data from the packet.
516 seq = ntohl(tcph->seq);
517 ack = sack = ntohl(tcph->ack_seq);
518 win = ntohs(tcph->window);
519 end = segment_seq_plus_len(seq, skb->len, iph, tcph);
521 if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
522 tcp_sack(tcph, &sack);
524 DEBUGP("tcp_in_window: START\n");
525 DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
526 "seq=%u ack=%u sack=%u win=%u end=%u\n",
527 NIPQUAD(iph->saddr), ntohs(tcph->source),
528 NIPQUAD(iph->daddr), ntohs(tcph->dest),
529 seq, ack, sack, win, end);
530 DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
531 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
532 sender->td_end, sender->td_maxend, sender->td_maxwin,
534 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
537 if (sender->td_end == 0) {
539 * Initialize sender data.
541 if (tcph->syn && tcph->ack) {
543 * Outgoing SYN-ACK in reply to a SYN.
546 sender->td_maxend = end;
547 sender->td_maxwin = (win == 0 ? 1 : win);
549 tcp_options(skb, iph, tcph, sender);
552 * Both sides must send the Window Scale option
553 * to enable window scaling in either direction.
555 if (!(sender->flags & IP_CT_TCP_STATE_FLAG_WINDOW_SCALE
556 && receiver->flags & IP_CT_TCP_STATE_FLAG_WINDOW_SCALE))
558 receiver->td_scale = 0;
561 * We are in the middle of a connection,
562 * its history is lost for us.
563 * Let's try to use the data from the packet.
565 sender->td_end = end;
566 sender->td_maxwin = (win == 0 ? 1 : win);
567 sender->td_maxend = end + sender->td_maxwin;
569 } else if (state->state == TCP_CONNTRACK_SYN_SENT
570 && dir == IP_CT_DIR_ORIGINAL
571 && after(end, sender->td_end)) {
573 * RFC 793: "if a TCP is reinitialized ... then it need
574 * not wait at all; it must only be sure to use sequence
575 * numbers larger than those recently used."
578 sender->td_maxend = end;
579 sender->td_maxwin = (win == 0 ? 1 : win);
581 tcp_options(skb, iph, tcph, sender);
586 * If there is no ACK, just pretend it was set and OK.
588 ack = sack = receiver->td_end;
589 } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
590 (TCP_FLAG_ACK|TCP_FLAG_RST))
593 * Broken TCP stacks, that set ACK in RST packets as well
594 * with zero ack value.
596 ack = sack = receiver->td_end;
601 * Packets contains no data: we assume it is valid
602 * and check the ack value only.
604 seq = end = sender->td_end;
606 DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
607 "seq=%u ack=%u sack =%u win=%u end=%u trim=%u\n",
608 NIPQUAD(iph->saddr), ntohs(tcph->source),
609 NIPQUAD(iph->daddr), ntohs(tcph->dest),
610 seq, ack, sack, win, end,
611 after(end, sender->td_maxend) && before(seq, sender->td_maxend)
612 ? sender->td_maxend : end);
613 DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
614 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
615 sender->td_end, sender->td_maxend, sender->td_maxwin,
617 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
620 /* Ignore data over the right edge of the receiver's window. */
621 if (after(end, sender->td_maxend) &&
622 before(seq, sender->td_maxend)) {
623 end = sender->td_maxend;
624 if (*index == TCP_FIN_SET)
625 *index = TCP_ACK_SET;
627 DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
628 before(end, sender->td_maxend + 1)
629 || before(seq, sender->td_maxend + 1),
630 after(seq, sender->td_end - receiver->td_maxwin - 1)
631 || after(end, sender->td_end - receiver->td_maxwin - 1),
632 before(sack, receiver->td_end + 1),
633 after(ack, receiver->td_end - MAXACKWINDOW(sender)));
635 if (sender->loose || receiver->loose ||
636 (before(end, sender->td_maxend + 1) &&
637 after(seq, sender->td_end - receiver->td_maxwin - 1) &&
638 before(sack, receiver->td_end + 1) &&
639 after(ack, receiver->td_end - MAXACKWINDOW(sender)))) {
641 * Take into account window scaling (RFC 1323).
644 win <<= sender->td_scale;
647 * Update sender data.
649 swin = win + (sack - ack);
650 if (sender->td_maxwin < swin)
651 sender->td_maxwin = swin;
652 if (after(end, sender->td_end))
653 sender->td_end = end;
654 if (after(sack + win, receiver->td_maxend - 1)) {
655 receiver->td_maxend = sack + win;
657 receiver->td_maxend++;
661 * Check retransmissions.
663 if (*index == TCP_ACK_SET) {
664 if (state->last_dir == dir
665 && state->last_seq == seq
666 && state->last_end == end)
669 state->last_dir = dir;
670 state->last_seq = seq;
671 state->last_end = end;
676 * Close the window of disabled window tracking :-)
683 if (LOG_INVALID(IPPROTO_TCP))
684 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
686 before(end, sender->td_maxend + 1) ?
687 after(seq, sender->td_end - receiver->td_maxwin - 1) ?
688 before(ack, receiver->td_end + 1) ?
689 after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG"
690 : "ACK is under the lower bound (possibly overly delayed ACK)"
691 : "ACK is over the upper bound (ACKed data has never seen yet)"
692 : "SEQ is under the lower bound (retransmitted already ACKed data)"
693 : "SEQ is over the upper bound (over the window of the receiver)");
695 res = ip_ct_tcp_be_liberal && !tcph->rst;
698 DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u "
699 "receiver end=%u maxend=%u maxwin=%u\n",
700 res, sender->td_end, sender->td_maxend, sender->td_maxwin,
701 receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
706 #ifdef CONFIG_IP_NF_NAT_NEEDED
707 /* Update sender->td_end after NAT successfully mangled the packet */
/* The end sequence number of the packet is recomputed with
   segment_seq_plus_len().  With tcp_lock write-held, seen[dir].td_end is
   advanced when the new end is after it, and last_end is set to the new
   end unconditionally.
   The TCP header is located by adding ihl * 4 to skb->nh.iph, without
   skb_header_pointer(); NOTE(review): this assumes the header is directly
   addressable in the skb data - confirm against the callers. */
708 int ip_conntrack_tcp_update(struct sk_buff *skb,
709 struct ip_conntrack *conntrack,
712 struct iphdr *iph = skb->nh.iph;
713 struct tcphdr *tcph = (void *)skb->nh.iph + skb->nh.iph->ihl*4;
716 struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir];
717 struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir];
720 end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph);
722 WRITE_LOCK(&tcp_lock);
724 * We have to worry for the ack in the reply packet only...
726 if (after(end, conntrack->proto.tcp.seen[dir].td_end))
727 conntrack->proto.tcp.seen[dir].td_end = end;
728 conntrack->proto.tcp.last_end = end;
729 WRITE_UNLOCK(&tcp_lock);
730 DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
731 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
732 sender->td_end, sender->td_maxend, sender->td_maxwin,
734 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
740 EXPORT_SYMBOL(ip_conntrack_tcp_update);
752 /* table of valid flag combinations - ECE and CWR are always valid */
/* Indexed by the TCP flag byte after ECE and CWR have been masked off, as
   tcp_error() does; an entry of 1 marks an accepted combination.  The array
   is sized so that every combination of FIN, SYN, RST, PUSH, ACK and URG is
   a valid index. */
753 static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] =
759 [TH_RST|TH_ACK|TH_PUSH] = 1,
762 [TH_ACK|TH_PUSH] = 1,
764 [TH_ACK|TH_URG|TH_PUSH] = 1,
765 [TH_FIN|TH_ACK|TH_PUSH] = 1,
766 [TH_FIN|TH_ACK|TH_URG] = 1,
767 [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1,
770 /* Protect conntrack against broken packets. Code taken from ipt_unclean.c. */
/* Sanity checks applied to a TCP packet, in this order:
   1. the fixed TCP header must be obtainable via skb_header_pointer();
   2. doff must cover at least the fixed header and must not exceed the TCP
      length (skb->len minus the IP header length);
   3. the TCP checksum is verified, but only when @hooknum is
      NF_IP_PRE_ROUTING; with CHECKSUM_HW the precomputed skb->csum is used,
      otherwise skb_checksum() is run over the TCP part;
   4. the flag byte, with ECE and CWR masked off, must be marked valid in
      tcp_valid_flags.
   Every failed check is reported through nf_log_packet() when
   LOG_INVALID(IPPROTO_TCP) is set. */
771 static int tcp_error(struct sk_buff *skb,
772 enum ip_conntrack_info *ctinfo,
773 unsigned int hooknum)
775 struct iphdr *iph = skb->nh.iph;
776 struct tcphdr _tcph, *th;
777 unsigned int tcplen = skb->len - iph->ihl * 4;
780 /* Smaller than minimal TCP header? */
781 th = skb_header_pointer(skb, iph->ihl * 4,
782 sizeof(_tcph), &_tcph);
784 if (LOG_INVALID(IPPROTO_TCP))
785 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
786 "ip_ct_tcp: short packet ");
790 /* Not whole TCP header or malformed packet */
791 if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
792 if (LOG_INVALID(IPPROTO_TCP))
793 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
794 "ip_ct_tcp: truncated/malformed packet ");
798 /* Checksum invalid? Ignore.
799 * We skip checking packets on the outgoing path
800 * because the semantic of CHECKSUM_HW is different there
801 * and moreover root might send raw packets.
803 /* FIXME: Source route IP option packets --RR */
804 if (hooknum == NF_IP_PRE_ROUTING
805 && csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP,
806 skb->ip_summed == CHECKSUM_HW ? skb->csum
807 : skb_checksum(skb, iph->ihl*4, tcplen, 0))) {
808 if (LOG_INVALID(IPPROTO_TCP))
809 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
810 "ip_ct_tcp: bad TCP checksum ");
814 /* Check TCP flags. */
815 tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
816 if (!tcp_valid_flags[tcpflags]) {
817 if (LOG_INVALID(IPPROTO_TCP))
818 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
819 "ip_ct_tcp: invalid TCP flag combination ");
826 /* Returns verdict for packet, or -1 for invalid. */
/* Per-packet state machine for a tracked TCP connection.
   With tcp_lock write-held, the direction comes from @ctinfo, the flag
   index from get_conntrack_index(), and the tentative new state from
   tcp_conntracks[dir][index][old_state].  Special outcomes:
   - TCP_CONNTRACK_IGNORE: a SYN/ACK answering a SYN that was ignored
     earlier kills the out-of-sync session by firing the conntrack timeout
     handler; otherwise index, direction and sequence number are remembered
     in last_index / last_dir / last_seq and the packet is logged as an
     ignored SYN.
   - TCP_CONNTRACK_MAX: logged as an invalid state.
   - TCP_CONNTRACK_SYN_SENT when the old state is TIME_WAIT or later: an
     attempt to reopen, handled by firing the timeout handler.
   - TCP_CONNTRACK_CLOSE: an RST that answers a previously let-through
     SYN or SYN/ACK is logged as ignored.
   The lock is dropped on each of these paths.
   Otherwise the packet must pass tcp_in_window().  The new state is then
   looked up again because tcp_in_window() may have changed index.  The
   timeout is the one of the new state, replaced by
   ip_ct_tcp_timeout_max_retrans when retrans has reached
   ip_ct_tcp_max_retrans and the state timeout is larger.
   After unlocking, the ASSURED bit is set on reaching ESTABLISHED from
   SYN_RECV or ESTABLISHED, and the conntrack is refreshed through
   ip_ct_refresh_acct(). */
827 static int tcp_packet(struct ip_conntrack *conntrack,
828 const struct sk_buff *skb,
829 enum ip_conntrack_info ctinfo)
831 enum tcp_conntrack new_state, old_state;
832 enum ip_conntrack_dir dir;
833 struct iphdr *iph = skb->nh.iph;
834 struct tcphdr *th, _tcph;
835 unsigned long timeout;
838 th = skb_header_pointer(skb, iph->ihl * 4,
839 sizeof(_tcph), &_tcph);
842 WRITE_LOCK(&tcp_lock);
843 old_state = conntrack->proto.tcp.state;
844 dir = CTINFO2DIR(ctinfo);
845 index = get_conntrack_index(th);
846 new_state = tcp_conntracks[dir][index][old_state];
849 case TCP_CONNTRACK_IGNORE:
850 /* Either SYN in ORIGINAL, or SYN/ACK in REPLY direction. */
851 if (index == TCP_SYNACK_SET
852 && conntrack->proto.tcp.last_index == TCP_SYN_SET
853 && conntrack->proto.tcp.last_dir != dir
854 && after(ntohl(th->ack_seq),
855 conntrack->proto.tcp.last_seq)) {
856 /* This SYN/ACK acknowledges a SYN that we earlier
857 * ignored as invalid. This means that the client and
858 * the server are both in sync, while the firewall is
859 * not. We kill this session and block the SYN/ACK so
860 * that the client cannot but retransmit its SYN and
861 * thus initiate a clean new session.
863 WRITE_UNLOCK(&tcp_lock);
864 if (LOG_INVALID(IPPROTO_TCP))
865 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
866 "ip_ct_tcp: killing out of sync session ");
867 if (del_timer(&conntrack->timeout))
868 conntrack->timeout.function((unsigned long)
872 conntrack->proto.tcp.last_index = index;
873 conntrack->proto.tcp.last_dir = dir;
874 conntrack->proto.tcp.last_seq = ntohl(th->seq);
876 WRITE_UNLOCK(&tcp_lock);
877 if (LOG_INVALID(IPPROTO_TCP))
878 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
879 "ip_ct_tcp: invalid SYN (ignored) ");
881 case TCP_CONNTRACK_MAX:
883 DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
884 dir, get_conntrack_index(th),
886 WRITE_UNLOCK(&tcp_lock);
887 if (LOG_INVALID(IPPROTO_TCP))
888 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
889 "ip_ct_tcp: invalid state ");
891 case TCP_CONNTRACK_SYN_SENT:
892 if (old_state >= TCP_CONNTRACK_TIME_WAIT) {
893 /* Attempt to reopen a closed connection.
894 * Delete this connection and look up again. */
895 WRITE_UNLOCK(&tcp_lock);
896 if (del_timer(&conntrack->timeout))
897 conntrack->timeout.function((unsigned long)
902 case TCP_CONNTRACK_CLOSE:
903 if (index == TCP_RST_SET
904 && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
905 && conntrack->proto.tcp.last_index <= TCP_SYNACK_SET
906 && after(ntohl(th->ack_seq),
907 conntrack->proto.tcp.last_seq)) {
908 /* Ignore RST closing down invalid SYN
909 we had let through. */
910 WRITE_UNLOCK(&tcp_lock);
911 if (LOG_INVALID(IPPROTO_TCP))
912 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
913 "ip_ct_tcp: invalid RST (ignored) ");
916 /* Just fall through */
918 /* Keep compilers happy. */
922 if (!tcp_in_window(&conntrack->proto.tcp, dir, &index,
924 WRITE_UNLOCK(&tcp_lock);
927 /* From now on we have got in-window packets */
929 /* If FIN was trimmed off, we don't change state. */
930 conntrack->proto.tcp.last_index = index;
931 new_state = tcp_conntracks[dir][index][old_state];
933 DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
934 "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
935 NIPQUAD(iph->saddr), ntohs(th->source),
936 NIPQUAD(iph->daddr), ntohs(th->dest),
937 (th->syn ? 1 : 0), (th->ack ? 1 : 0),
938 (th->fin ? 1 : 0), (th->rst ? 1 : 0),
939 old_state, new_state);
941 conntrack->proto.tcp.state = new_state;
942 timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans
943 && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans
944 ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
945 WRITE_UNLOCK(&tcp_lock);
947 if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
948 /* If only reply is a RST, we can consider ourselves not to
949 have an established connection: this is a fairly common
950 problem case, so we can delete the conntrack
953 if (del_timer(&conntrack->timeout))
954 conntrack->timeout.function((unsigned long)
958 } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
959 && (old_state == TCP_CONNTRACK_SYN_RECV
960 || old_state == TCP_CONNTRACK_ESTABLISHED)
961 && new_state == TCP_CONNTRACK_ESTABLISHED) {
962 /* Set ASSURED if we see see valid ack in ESTABLISHED
963 after SYN_RECV or a valid answer for a picked up
965 set_bit(IPS_ASSURED_BIT, &conntrack->status);
967 ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout);
972 /* Called when a new connection for this protocol found. */
/* Initialises conntrack->proto.tcp from the first packet of a connection.
   The state this packet would lead to is looked up in the direction-0 part
   of tcp_conntracks with TCP_CONNTRACK_NONE as the current state; a result
   of TCP_CONNTRACK_MAX or above is treated as invalid.
   - SYN_SENT (a proper SYN): seen[0] is set up from the packet with
     td_maxend equal to td_end and td_maxwin at least 1, the TCP options are
     parsed, and both directions are marked not loose.
   - Anything else while ip_ct_tcp_loose is 0: connections are not picked
     up.
   - Otherwise (pick-up in the middle of a connection): seen[0] is estimated
     from the packet with td_maxend = td_end + td_maxwin and no scaling,
     SACK is assumed in both directions, and both loose counters start at
     ip_ct_tcp_loose.
   In the visible code seen[1] starts with td_end, td_maxend and td_scale
   at 0 and td_maxwin at 1.  The state is left at TCP_CONNTRACK_NONE and
   last_index at TCP_NONE_SET, to be set by tcp_packet(). */
973 static int tcp_new(struct ip_conntrack *conntrack,
974 const struct sk_buff *skb)
976 enum tcp_conntrack new_state;
977 struct iphdr *iph = skb->nh.iph;
978 struct tcphdr *th, _tcph;
980 struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0];
981 struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1];
984 th = skb_header_pointer(skb, iph->ihl * 4,
985 sizeof(_tcph), &_tcph);
988 /* Don't need lock here: this conntrack not in circulation yet */
990 = tcp_conntracks[0][get_conntrack_index(th)]
991 [TCP_CONNTRACK_NONE];
993 /* Invalid: delete conntrack */
994 if (new_state >= TCP_CONNTRACK_MAX) {
995 DEBUGP("ip_ct_tcp: invalid new deleting.\n");
999 if (new_state == TCP_CONNTRACK_SYN_SENT) {
1001 conntrack->proto.tcp.seen[0].td_end =
1002 segment_seq_plus_len(ntohl(th->seq), skb->len,
1004 conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1005 if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
1006 conntrack->proto.tcp.seen[0].td_maxwin = 1;
1007 conntrack->proto.tcp.seen[0].td_maxend =
1008 conntrack->proto.tcp.seen[0].td_end;
1010 tcp_options(skb, iph, th, &conntrack->proto.tcp.seen[0]);
1011 conntrack->proto.tcp.seen[1].flags = 0;
1012 conntrack->proto.tcp.seen[0].loose =
1013 conntrack->proto.tcp.seen[1].loose = 0;
1014 } else if (ip_ct_tcp_loose == 0) {
1015 /* Don't try to pick up connections. */
1019 * We are in the middle of a connection,
1020 * its history is lost for us.
1021 * Let's try to use the data from the packet.
1023 conntrack->proto.tcp.seen[0].td_end =
1024 segment_seq_plus_len(ntohl(th->seq), skb->len,
1026 conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1027 if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
1028 conntrack->proto.tcp.seen[0].td_maxwin = 1;
1029 conntrack->proto.tcp.seen[0].td_maxend =
1030 conntrack->proto.tcp.seen[0].td_end +
1031 conntrack->proto.tcp.seen[0].td_maxwin;
1032 conntrack->proto.tcp.seen[0].td_scale = 0;
1034 /* We assume SACK. Should we assume window scaling too? */
1035 conntrack->proto.tcp.seen[0].flags =
1036 conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM;
1037 conntrack->proto.tcp.seen[0].loose =
1038 conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose;
1041 conntrack->proto.tcp.seen[1].td_end = 0;
1042 conntrack->proto.tcp.seen[1].td_maxend = 0;
1043 conntrack->proto.tcp.seen[1].td_maxwin = 1;
1044 conntrack->proto.tcp.seen[1].td_scale = 0;
1046 /* tcp_packet will set them */
1047 conntrack->proto.tcp.state = TCP_CONNTRACK_NONE;
1048 conntrack->proto.tcp.last_index = TCP_NONE_SET;
1050 DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
1051 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
1052 sender->td_end, sender->td_maxend, sender->td_maxwin,
1054 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
1055 receiver->td_scale);
/* Tell whether the expectation @exp refers to data carried by this packet.
   datalen is the TCP payload length (skb->len minus the IP and TCP header
   lengths).  The result is between() applied to exp->seq, the sequence
   number of the packet, and that sequence number plus datalen. */
1059 static int tcp_exp_matches_pkt(struct ip_conntrack_expect *exp,
1060 const struct sk_buff *skb)
1062 const struct iphdr *iph = skb->nh.iph;
1063 struct tcphdr *th, _tcph;
1064 unsigned int datalen;
1066 th = skb_header_pointer(skb, iph->ihl * 4,
1067 sizeof(_tcph), &_tcph);
1070 datalen = skb->len - iph->ihl*4 - th->doff*4;
1072 return between(exp->seq, ntohl(th->seq), ntohl(th->seq) + datalen);
1075 struct ip_conntrack_protocol ip_conntrack_protocol_tcp =
1077 .proto = IPPROTO_TCP,
1079 .pkt_to_tuple = tcp_pkt_to_tuple,
1080 .invert_tuple = tcp_invert_tuple,
1081 .print_tuple = tcp_print_tuple,
1082 .print_conntrack = tcp_print_conntrack,
1083 .packet = tcp_packet,
1085 .exp_matches_pkt = tcp_exp_matches_pkt,