/* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * Jozsef Kadlecsik : * - Real stateful connection tracking * - Modified state transitions table * - Window scaling support added * - SACK support added * * Willy Tarreau: * - State table bugfixes * - More robust state changes * - Tuning timer parameters * * version 2.2 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if 0 #define DEBUGP printk #define DEBUGP_VARS #else #define DEBUGP(format, args...) #endif /* Protects conntrack->proto.tcp */ static DECLARE_RWLOCK(tcp_lock); /* "Be conservative in what you do, be liberal in what you accept from others." If it's non-zero, we mark only out of window RST segments as INVALID. */ int ip_ct_tcp_be_liberal = 0; /* When connection is picked up from the middle, how many packets are required to pass in each direction when we assume we are in sync - if any side uses window scaling, we lost the game. If it is set to zero, we disable picking up already established connections. */ int ip_ct_tcp_loose = 3; /* Max number of the retransmitted packets without receiving an (acceptable) ACK from the destination. If this number is reached, a shorter timer will be started. */ int ip_ct_tcp_max_retrans = 3; /* FIXME: Examine ipfilter's timeouts and conntrack transitions more closely. They're more complex. --RR */ static const char *tcp_conntrack_names[] = { "NONE", "SYN_SENT", "SYN_RECV", "ESTABLISHED", "FIN_WAIT", "CLOSE_WAIT", "LAST_ACK", "TIME_WAIT", "CLOSE", "LISTEN" }; #define SECS * HZ #define MINS * 60 SECS #define HOURS * 60 MINS #define DAYS * 24 HOURS unsigned long ip_ct_tcp_timeout_syn_sent = 2 MINS; unsigned long ip_ct_tcp_timeout_syn_recv = 60 SECS; unsigned long ip_ct_tcp_timeout_established = 5 DAYS; unsigned long ip_ct_tcp_timeout_fin_wait = 2 MINS; unsigned long ip_ct_tcp_timeout_close_wait = 60 SECS; unsigned long ip_ct_tcp_timeout_last_ack = 30 SECS; unsigned long ip_ct_tcp_timeout_time_wait = 2 MINS; unsigned long ip_ct_tcp_timeout_close = 10 SECS; /* RFC1122 says the R2 limit should be at least 100 seconds. Linux uses 15 packets as limit, which corresponds to ~13-30min depending on RTO. */ unsigned long ip_ct_tcp_timeout_max_retrans = 5 MINS; static unsigned long * tcp_timeouts[] = { NULL, /* TCP_CONNTRACK_NONE */ &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ &ip_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */ &ip_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */ &ip_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */ &ip_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */ &ip_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */ &ip_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */ NULL, /* TCP_CONNTRACK_LISTEN */ }; #define sNO TCP_CONNTRACK_NONE #define sSS TCP_CONNTRACK_SYN_SENT #define sSR TCP_CONNTRACK_SYN_RECV #define sES TCP_CONNTRACK_ESTABLISHED #define sFW TCP_CONNTRACK_FIN_WAIT #define sCW TCP_CONNTRACK_CLOSE_WAIT #define sLA TCP_CONNTRACK_LAST_ACK #define sTW TCP_CONNTRACK_TIME_WAIT #define sCL TCP_CONNTRACK_CLOSE #define sLI TCP_CONNTRACK_LISTEN #define sIV TCP_CONNTRACK_MAX #define sIG TCP_CONNTRACK_IGNORE /* What TCP flags are set from RST/SYN/FIN/ACK. */ enum tcp_bit_set { TCP_SYN_SET, TCP_SYNACK_SET, TCP_FIN_SET, TCP_ACK_SET, TCP_RST_SET, TCP_NONE_SET, }; /* * The TCP state transition table needs a few words... * * We are the man in the middle. All the packets go through us * but might get lost in transit to the destination. * It is assumed that the destinations can't receive segments * we haven't seen. * * The checked segment is in window, but our windows are *not* * equivalent with the ones of the sender/receiver. We always * try to guess the state of the current sender. * * The meaning of the states are: * * NONE: initial state * SYN_SENT: SYN-only packet seen * SYN_RECV: SYN-ACK packet seen * ESTABLISHED: ACK packet seen * FIN_WAIT: FIN packet seen * CLOSE_WAIT: ACK seen (after FIN) * LAST_ACK: FIN seen (after FIN) * TIME_WAIT: last ACK seen * CLOSE: closed connection * * LISTEN state is not used. * * Packets marked as IGNORED (sIG): * if they may be either invalid or valid * and the receiver may send back a connection * closing RST or a SYN/ACK. * * Packets marked as INVALID (sIV): * if they are invalid * or we do not support the request (simultaneous open) */ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ /*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV }, /* * sNO -> sSS Initialize a new connection * sSS -> sSS Retransmitted SYN * sSR -> sIG Late retransmitted SYN? * sES -> sIG Error: SYNs in window outside the SYN_SENT state * are errors. Receiver will reply with RST * and close the connection. * Or we are not in sync and hold a dead connection. * sFW -> sIG * sCW -> sIG * sLA -> sIG * sTW -> sSS Reopened connection (RFC 1122). * sCL -> sSS */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ /*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, /* * A SYN/ACK from the client is always invalid: * - either it tries to set up a simultaneous open, which is * not supported; * - or the firewall has just been inserted between the two hosts * during the session set-up. The SYN will be retransmitted * by the true client (or it'll time out). */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sNO -> sIV Too late and no reason to do anything... * sSS -> sIV Client migth not send FIN in this state: * we enforce waiting for a SYN/ACK reply first. * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions, waiting for * the last ACK. * Migth be a retransmitted FIN as well... * sCW -> sLA * sLA -> sLA Retransmitted FIN. Remain in the same state. * sTW -> sTW * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ /*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, /* * sNO -> sES Assumed. * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. * sSR -> sES Established state is reached. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW * sLA -> sTW Last ACK detected. * sTW -> sTW Retransmitted last ACK. Remain in the same state. * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } }, { /* REPLY */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ /*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, /* * sNO -> sIV Never reached. * sSS -> sIV Simultaneous open, not supported * sSR -> sIV Simultaneous open, not supported. * sES -> sIV Server may not initiate a connection. * sFW -> sIV * sCW -> sIV * sLA -> sIV * sTW -> sIV Reopened connection, but server may not do it. * sCL -> sIV */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ /*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV }, /* * sSS -> sSR Standard open. * sSR -> sSR Retransmitted SYN/ACK. * sES -> sIG Late retransmitted SYN/ACK? * sFW -> sIG * sCW -> sIG * sLA -> sIG * sTW -> sIG * sCL -> sIG */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sSS -> sIV Server might not send FIN in this state. * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions. * sCW -> sLA * sLA -> sLA Retransmitted FIN. * sTW -> sTW * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ /*ack*/ { sIV, sIV, sIV, sES, sCW, sCW, sTW, sTW, sCL, sIV }, /* * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. * sSR -> sIV Simultaneous open. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW * sLA -> sTW Last ACK detected. * sTW -> sTW Retransmitted last ACK. * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } } }; static int tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct ip_conntrack_tuple *tuple) { struct tcphdr _hdr, *hp; /* Actually only need first 8 bytes. */ hp = skb_header_pointer(skb, dataoff, 8, &_hdr); if (hp == NULL) return 0; tuple->src.u.tcp.port = hp->source; tuple->dst.u.tcp.port = hp->dest; return 1; } static int tcp_invert_tuple(struct ip_conntrack_tuple *tuple, const struct ip_conntrack_tuple *orig) { tuple->src.u.tcp.port = orig->dst.u.tcp.port; tuple->dst.u.tcp.port = orig->src.u.tcp.port; return 1; } /* Print out the per-protocol part of the tuple. */ static int tcp_print_tuple(struct seq_file *s, const struct ip_conntrack_tuple *tuple) { return seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.tcp.port), ntohs(tuple->dst.u.tcp.port)); } /* Print out the private part of the conntrack. */ static int tcp_print_conntrack(struct seq_file *s, const struct ip_conntrack *conntrack) { enum tcp_conntrack state; READ_LOCK(&tcp_lock); state = conntrack->proto.tcp.state; READ_UNLOCK(&tcp_lock); return seq_printf(s, "%s ", tcp_conntrack_names[state]); } static unsigned int get_conntrack_index(const struct tcphdr *tcph) { if (tcph->rst) return TCP_RST_SET; else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); else if (tcph->fin) return TCP_FIN_SET; else if (tcph->ack) return TCP_ACK_SET; else return TCP_NONE_SET; } /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering in IP Filter' by Guido van Rooij. http://www.nluug.nl/events/sane2000/papers.html http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz The boundaries and the conditions are slightly changed: td_maxend = max(sack + max(win,1)) seen in reply packets td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets td_end = max(seq + len) seen in sent packets I. Upper bound for valid data: seq + len <= sender.td_maxend II. Lower bound for valid data: seq >= sender.td_end - receiver.td_maxwin III. Upper bound for valid ack: sack <= receiver.td_end IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW where sack is the highest right edge of sack block found in the packet. The upper bound limit for a valid ack is not ignored - we doesn't have to deal with fragments. */ static inline __u32 segment_seq_plus_len(__u32 seq, size_t len, struct iphdr *iph, struct tcphdr *tcph) { return (seq + len - (iph->ihl + tcph->doff)*4 + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0)); } /* Fixme: what about big packets? */ #define MAXACKWINCONST 66000 #define MAXACKWINDOW(sender) \ ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \ : MAXACKWINCONST) /* * Simplified tcp_parse_options routine from tcp_input.c */ static void tcp_options(const struct sk_buff *skb, struct iphdr *iph, struct tcphdr *tcph, struct ip_ct_tcp_state *state) { unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; unsigned char *ptr; int length = (tcph->doff*4) - sizeof(struct tcphdr); if (!length) return; ptr = skb_header_pointer(skb, (iph->ihl * 4) + sizeof(struct tcphdr), length, buff); BUG_ON(ptr == NULL); state->td_scale = state->flags = 0; while (length > 0) { int opcode=*ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: opsize=*ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) break; /* don't parse partial options */ if (opcode == TCPOPT_SACK_PERM && opsize == TCPOLEN_SACK_PERM) state->flags |= IP_CT_TCP_FLAG_SACK_PERM; else if (opcode == TCPOPT_WINDOW && opsize == TCPOLEN_WINDOW) { state->td_scale = *(u_int8_t *)ptr; if (state->td_scale > 14) { /* See RFC1323 */ state->td_scale = 14; } state->flags |= IP_CT_TCP_STATE_FLAG_WINDOW_SCALE; } ptr += opsize - 2; length -= opsize; } } } static void tcp_sack(struct tcphdr *tcph, __u32 *sack) { __u32 tmp; unsigned char *ptr; int length = (tcph->doff*4) - sizeof(struct tcphdr); /* Fast path for timestamp-only option */ if (length == TCPOLEN_TSTAMP_ALIGNED*4 && *(__u32 *)(tcph + 1) == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) return; ptr = (unsigned char *)(tcph + 1); while (length > 0) { int opcode=*ptr++; int opsize, i; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: opsize=*ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) break; /* don't parse partial options */ if (opcode == TCPOPT_SACK && opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK) && !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK)) { for (i = 0; i < (opsize - TCPOLEN_SACK_BASE); i += TCPOLEN_SACK_PERBLOCK) { tmp = ntohl(*((u_int32_t *)(ptr+i)+1)); if (after(tmp, *sack)) *sack = tmp; } return; } ptr += opsize - 2; length -= opsize; } } } static int tcp_in_window(struct ip_ct_tcp *state, enum ip_conntrack_dir dir, unsigned int *index, const struct sk_buff *skb, struct iphdr *iph, struct tcphdr *tcph) { struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; __u32 seq, ack, sack, end, win, swin; int res; /* * Get the required data from the packet. */ seq = ntohl(tcph->seq); ack = sack = ntohl(tcph->ack_seq); win = ntohs(tcph->window); end = segment_seq_plus_len(seq, skb->len, iph, tcph); if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) tcp_sack(tcph, &sack); DEBUGP("tcp_in_window: START\n"); DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " "seq=%u ack=%u sack=%u win=%u end=%u\n", NIPQUAD(iph->saddr), ntohs(tcph->source), NIPQUAD(iph->daddr), ntohs(tcph->dest), seq, ack, sack, win, end); DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, sender->td_scale, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); if (sender->td_end == 0) { /* * Initialize sender data. */ if (tcph->syn && tcph->ack) { /* * Outgoing SYN-ACK in reply to a SYN. */ sender->td_end = sender->td_maxend = end; sender->td_maxwin = (win == 0 ? 1 : win); tcp_options(skb, iph, tcph, sender); /* * RFC 1323: * Both sides must send the Window Scale option * to enable window scaling in either direction. */ if (!(sender->flags & IP_CT_TCP_STATE_FLAG_WINDOW_SCALE && receiver->flags & IP_CT_TCP_STATE_FLAG_WINDOW_SCALE)) sender->td_scale = receiver->td_scale = 0; } else { /* * We are in the middle of a connection, * its history is lost for us. * Let's try to use the data from the packet. */ sender->td_end = end; sender->td_maxwin = (win == 0 ? 1 : win); sender->td_maxend = end + sender->td_maxwin; } } else if (state->state == TCP_CONNTRACK_SYN_SENT && dir == IP_CT_DIR_ORIGINAL && after(end, sender->td_end)) { /* * RFC 793: "if a TCP is reinitialized ... then it need * not wait at all; it must only be sure to use sequence * numbers larger than those recently used." */ sender->td_end = sender->td_maxend = end; sender->td_maxwin = (win == 0 ? 1 : win); tcp_options(skb, iph, tcph, sender); } if (!(tcph->ack)) { /* * If there is no ACK, just pretend it was set and OK. */ ack = sack = receiver->td_end; } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == (TCP_FLAG_ACK|TCP_FLAG_RST)) && (ack == 0)) { /* * Broken TCP stacks, that set ACK in RST packets as well * with zero ack value. */ ack = sack = receiver->td_end; } if (seq == end) /* * Packets contains no data: we assume it is valid * and check the ack value only. */ seq = end = sender->td_end; DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " "seq=%u ack=%u sack =%u win=%u end=%u trim=%u\n", NIPQUAD(iph->saddr), ntohs(tcph->source), NIPQUAD(iph->daddr), ntohs(tcph->dest), seq, ack, sack, win, end, after(end, sender->td_maxend) && before(seq, sender->td_maxend) ? sender->td_maxend : end); DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, sender->td_scale, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); /* Ignore data over the right edge of the receiver's window. */ if (after(end, sender->td_maxend) && before(seq, sender->td_maxend)) { end = sender->td_maxend; if (*index == TCP_FIN_SET) *index = TCP_ACK_SET; } DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n", before(end, sender->td_maxend + 1) || before(seq, sender->td_maxend + 1), after(seq, sender->td_end - receiver->td_maxwin - 1) || after(end, sender->td_end - receiver->td_maxwin - 1), before(sack, receiver->td_end + 1), after(ack, receiver->td_end - MAXACKWINDOW(sender))); if (sender->loose || receiver->loose || (before(end, sender->td_maxend + 1) && after(seq, sender->td_end - receiver->td_maxwin - 1) && before(sack, receiver->td_end + 1) && after(ack, receiver->td_end - MAXACKWINDOW(sender)))) { /* * Take into account window scaling (RFC 1323). */ if (!tcph->syn) win <<= sender->td_scale; /* * Update sender data. */ swin = win + (sack - ack); if (sender->td_maxwin < swin) sender->td_maxwin = swin; if (after(end, sender->td_end)) sender->td_end = end; if (after(sack + win, receiver->td_maxend - 1)) { receiver->td_maxend = sack + win; if (win == 0) receiver->td_maxend++; } /* * Check retransmissions. */ if (*index == TCP_ACK_SET) { if (state->last_dir == dir && state->last_seq == seq && state->last_end == end) state->retrans++; else { state->last_dir = dir; state->last_seq = seq; state->last_end = end; state->retrans = 0; } } /* * Close the window of disabled window tracking :-) */ if (sender->loose) sender->loose--; res = 1; } else { if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, "ip_ct_tcp: %s ", before(end, sender->td_maxend + 1) ? after(seq, sender->td_end - receiver->td_maxwin - 1) ? before(ack, receiver->td_end + 1) ? after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG" : "ACK is under the lower bound (possibly overly delayed ACK)" : "ACK is over the upper bound (ACKed data has never seen yet)" : "SEQ is under the lower bound (retransmitted already ACKed data)" : "SEQ is over the upper bound (over the window of the receiver)"); res = ip_ct_tcp_be_liberal && !tcph->rst; } DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u " "receiver end=%u maxend=%u maxwin=%u\n", res, sender->td_end, sender->td_maxend, sender->td_maxwin, receiver->td_end, receiver->td_maxend, receiver->td_maxwin); return res; } #ifdef CONFIG_IP_NF_NAT_NEEDED /* Update sender->td_end after NAT successfully mangled the packet */ int ip_conntrack_tcp_update(struct sk_buff *skb, struct ip_conntrack *conntrack, int dir) { struct iphdr *iph = skb->nh.iph; struct tcphdr *tcph = (void *)skb->nh.iph + skb->nh.iph->ihl*4; __u32 end; #ifdef DEBUGP_VARS struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir]; struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir]; #endif end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph); WRITE_LOCK(&tcp_lock); /* * We have to worry for the ack in the reply packet only... */ if (after(end, conntrack->proto.tcp.seen[dir].td_end)) conntrack->proto.tcp.seen[dir].td_end = end; conntrack->proto.tcp.last_end = end; WRITE_UNLOCK(&tcp_lock); DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, sender->td_scale, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); return 1; } EXPORT_SYMBOL(ip_conntrack_tcp_update); #endif #define TH_FIN 0x01 #define TH_SYN 0x02 #define TH_RST 0x04 #define TH_PUSH 0x08 #define TH_ACK 0x10 #define TH_URG 0x20 #define TH_ECE 0x40 #define TH_CWR 0x80 /* table of valid flag combinations - ECE and CWR are always valid */ static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = { [TH_SYN] = 1, [TH_SYN|TH_ACK] = 1, [TH_RST] = 1, [TH_RST|TH_ACK] = 1, [TH_RST|TH_ACK|TH_PUSH] = 1, [TH_FIN|TH_ACK] = 1, [TH_ACK] = 1, [TH_ACK|TH_PUSH] = 1, [TH_ACK|TH_URG] = 1, [TH_ACK|TH_URG|TH_PUSH] = 1, [TH_FIN|TH_ACK|TH_PUSH] = 1, [TH_FIN|TH_ACK|TH_URG] = 1, [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1, }; /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ static int tcp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, unsigned int hooknum) { struct iphdr *iph = skb->nh.iph; struct tcphdr _tcph, *th; unsigned int tcplen = skb->len - iph->ihl * 4; u_int8_t tcpflags; /* Smaller that minimal TCP header? */ th = skb_header_pointer(skb, iph->ihl * 4, sizeof(_tcph), &_tcph); if (th == NULL) { if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, "ip_ct_tcp: short packet "); return -NF_ACCEPT; } /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, "ip_ct_tcp: truncated/malformed packet "); return -NF_ACCEPT; } /* Checksum invalid? Ignore. * We skip checking packets on the outgoing path * because the semantic of CHECKSUM_HW is different there * and moreover root might send raw packets. */ /* FIXME: Source route IP option packets --RR */ if (hooknum == NF_IP_PRE_ROUTING && csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP, skb->ip_summed == CHECKSUM_HW ? skb->csum : skb_checksum(skb, iph->ihl*4, tcplen, 0))) { if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, "ip_ct_tcp: bad TCP checksum "); return -NF_ACCEPT; } /* Check TCP flags. */ tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); if (!tcp_valid_flags[tcpflags]) { if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, "ip_ct_tcp: invalid TCP flag combination "); return -NF_ACCEPT; } return NF_ACCEPT; } /* Returns verdict for packet, or -1 for invalid. */ static int tcp_packet(struct ip_conntrack *conntrack, const struct sk_buff *skb, enum ip_conntrack_info ctinfo) { enum tcp_conntrack new_state, old_state; enum ip_conntrack_dir dir; struct iphdr *iph = skb->nh.iph; struct tcphdr *th, _tcph; unsigned long timeout; unsigned int index; th = skb_header_pointer(skb, iph->ihl * 4, sizeof(_tcph), &_tcph); BUG_ON(th == NULL); WRITE_LOCK(&tcp_lock); old_state = conntrack->proto.tcp.state; dir = CTINFO2DIR(ctinfo); index = get_conntrack_index(th); new_state = tcp_conntracks[dir][index][old_state]; switch (new_state) { case TCP_CONNTRACK_IGNORE: /* Either SYN in ORIGINAL, or SYN/ACK in REPLY direction. */ if (index == TCP_SYNACK_SET && conntrack->proto.tcp.last_index == TCP_SYN_SET && conntrack->proto.tcp.last_dir != dir && after(ntohl(th->ack_seq), conntrack->proto.tcp.last_seq)) { /* This SYN/ACK acknowledges a SYN that we earlier * ignored as invalid. This means that the client and * the server are both in sync, while the firewall is * not. We kill this session and block the SYN/ACK so * that the client cannot but retransmit its SYN and * thus initiate a clean new session. */ WRITE_UNLOCK(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, "ip_ct_tcp: killing out of sync session "); if (del_timer(&conntrack->timeout)) conntrack->timeout.function((unsigned long) conntrack); return -NF_DROP; } conntrack->proto.tcp.last_index = index; conntrack->proto.tcp.last_dir = dir; conntrack->proto.tcp.last_seq = ntohl(th->seq); WRITE_UNLOCK(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, "ip_ct_tcp: invalid SYN (ignored) "); return NF_ACCEPT; case TCP_CONNTRACK_MAX: /* Invalid packet */ DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); WRITE_UNLOCK(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, "ip_ct_tcp: invalid state "); return -NF_ACCEPT; case TCP_CONNTRACK_SYN_SENT: if (old_state >= TCP_CONNTRACK_TIME_WAIT) { /* Attempt to reopen a closed connection. * Delete this connection and look up again. */ WRITE_UNLOCK(&tcp_lock); if (del_timer(&conntrack->timeout)) conntrack->timeout.function((unsigned long) conntrack); return -NF_REPEAT; } break; case TCP_CONNTRACK_CLOSE: if (index == TCP_RST_SET && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) && conntrack->proto.tcp.last_index <= TCP_SYNACK_SET && after(ntohl(th->ack_seq), conntrack->proto.tcp.last_seq)) { /* Ignore RST closing down invalid SYN we had let trough. */ WRITE_UNLOCK(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, "ip_ct_tcp: invalid RST (ignored) "); return NF_ACCEPT; } /* Just fall trough */ default: /* Keep compilers happy. */ break; } if (!tcp_in_window(&conntrack->proto.tcp, dir, &index, skb, iph, th)) { WRITE_UNLOCK(&tcp_lock); return -NF_ACCEPT; } /* From now on we have got in-window packets */ /* If FIN was trimmed off, we don't change state. */ conntrack->proto.tcp.last_index = index; new_state = tcp_conntracks[dir][index][old_state]; DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", NIPQUAD(iph->saddr), ntohs(th->source), NIPQUAD(iph->daddr), ntohs(th->dest), (th->syn ? 1 : 0), (th->ack ? 1 : 0), (th->fin ? 1 : 0), (th->rst ? 1 : 0), old_state, new_state); conntrack->proto.tcp.state = new_state; timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; WRITE_UNLOCK(&tcp_lock); if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { /* If only reply is a RST, we can consider ourselves not to have an established connection: this is a fairly common problem case, so we can delete the conntrack immediately. --RR */ if (th->rst) { if (del_timer(&conntrack->timeout)) conntrack->timeout.function((unsigned long) conntrack); return NF_ACCEPT; } } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status) && (old_state == TCP_CONNTRACK_SYN_RECV || old_state == TCP_CONNTRACK_ESTABLISHED) && new_state == TCP_CONNTRACK_ESTABLISHED) { /* Set ASSURED if we see see valid ack in ESTABLISHED after SYN_RECV or a valid answer for a picked up connection. */ set_bit(IPS_ASSURED_BIT, &conntrack->status); } ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout); return NF_ACCEPT; } /* Called when a new connection for this protocol found. */ static int tcp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb) { enum tcp_conntrack new_state; struct iphdr *iph = skb->nh.iph; struct tcphdr *th, _tcph; #ifdef DEBUGP_VARS struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0]; struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1]; #endif th = skb_header_pointer(skb, iph->ihl * 4, sizeof(_tcph), &_tcph); BUG_ON(th == NULL); /* Don't need lock here: this conntrack not in circulation yet */ new_state = tcp_conntracks[0][get_conntrack_index(th)] [TCP_CONNTRACK_NONE]; /* Invalid: delete conntrack */ if (new_state >= TCP_CONNTRACK_MAX) { DEBUGP("ip_ct_tcp: invalid new deleting.\n"); return 0; } if (new_state == TCP_CONNTRACK_SYN_SENT) { /* SYN packet */ conntrack->proto.tcp.seen[0].td_end = segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th); conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); if (conntrack->proto.tcp.seen[0].td_maxwin == 0) conntrack->proto.tcp.seen[0].td_maxwin = 1; conntrack->proto.tcp.seen[0].td_maxend = conntrack->proto.tcp.seen[0].td_end; tcp_options(skb, iph, th, &conntrack->proto.tcp.seen[0]); conntrack->proto.tcp.seen[1].flags = 0; conntrack->proto.tcp.seen[0].loose = conntrack->proto.tcp.seen[1].loose = 0; } else if (ip_ct_tcp_loose == 0) { /* Don't try to pick up connections. */ return 0; } else { /* * We are in the middle of a connection, * its history is lost for us. * Let's try to use the data from the packet. */ conntrack->proto.tcp.seen[0].td_end = segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th); conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); if (conntrack->proto.tcp.seen[0].td_maxwin == 0) conntrack->proto.tcp.seen[0].td_maxwin = 1; conntrack->proto.tcp.seen[0].td_maxend = conntrack->proto.tcp.seen[0].td_end + conntrack->proto.tcp.seen[0].td_maxwin; conntrack->proto.tcp.seen[0].td_scale = 0; /* We assume SACK. Should we assume window scaling too? */ conntrack->proto.tcp.seen[0].flags = conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM; conntrack->proto.tcp.seen[0].loose = conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose; } conntrack->proto.tcp.seen[1].td_end = 0; conntrack->proto.tcp.seen[1].td_maxend = 0; conntrack->proto.tcp.seen[1].td_maxwin = 1; conntrack->proto.tcp.seen[1].td_scale = 0; /* tcp_packet will set them */ conntrack->proto.tcp.state = TCP_CONNTRACK_NONE; conntrack->proto.tcp.last_index = TCP_NONE_SET; DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, sender->td_scale, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); return 1; } static int tcp_exp_matches_pkt(struct ip_conntrack_expect *exp, const struct sk_buff *skb) { const struct iphdr *iph = skb->nh.iph; struct tcphdr *th, _tcph; unsigned int datalen; th = skb_header_pointer(skb, iph->ihl * 4, sizeof(_tcph), &_tcph); if (th == NULL) return 0; datalen = skb->len - iph->ihl*4 - th->doff*4; return between(exp->seq, ntohl(th->seq), ntohl(th->seq) + datalen); } struct ip_conntrack_protocol ip_conntrack_protocol_tcp = { .proto = IPPROTO_TCP, .name = "tcp", .pkt_to_tuple = tcp_pkt_to_tuple, .invert_tuple = tcp_invert_tuple, .print_tuple = tcp_print_tuple, .print_conntrack = tcp_print_conntrack, .packet = tcp_packet, .new = tcp_new, .exp_matches_pkt = tcp_exp_matches_pkt, .error = tcp_error, };