+ if (tcph->rst) return TCP_RST_SET;
+ else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
+ else if (tcph->fin) return TCP_FIN_SET;
+ else if (tcph->ack) return TCP_ACK_SET;
+ else return TCP_NONE_SET;
+}
+
+/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
+ in IP Filter' by Guido van Rooij.
+
+ http://www.nluug.nl/events/sane2000/papers.html
+ http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
+
+ The boundaries and the conditions are changed according to RFC793:
+ the packet must intersect the window (i.e. segments may be
+ after the right or before the left edge) and thus receivers may ACK
+ segments after the right edge of the window.
+
+ td_maxend = max(sack + max(win,1)) seen in reply packets
+ td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
+ td_maxwin += seq + len - sender.td_maxend
+ if seq + len > sender.td_maxend
+ td_end = max(seq + len) seen in sent packets
+
+ I. Upper bound for valid data: seq <= sender.td_maxend
+ II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin
+ III. Upper bound for valid ack: sack <= receiver.td_end
+ IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW
+
+ where sack is the highest right edge of sack block found in the packet.
+
+ The upper bound limit for a valid ack is not ignored -
+ we don't have to deal with fragments.
+*/
+
+/*
+ * Sequence number just past the end of a segment: seq plus the number
+ * of payload bytes, with SYN and FIN each taking up one more sequence
+ * number.  The payload size is len minus the IP and TCP header lengths
+ * (ihl and doff, both counted in 32-bit words); callers pass skb->len
+ * as len.
+ */
+static inline __u32 segment_seq_plus_len(__u32 seq,
+					 size_t len,
+					 struct iphdr *iph,
+					 struct tcphdr *tcph)
+{
+	size_t hdr_bytes = (iph->ihl + tcph->doff) * 4;
+	__u32 seg_end = seq + len - hdr_bytes;
+
+	if (tcph->syn)
+		seg_end++;
+	if (tcph->fin)
+		seg_end++;
+
+	return seg_end;
+}
+
+/*
+ * Distance below receiver.td_end within which an ACK is still accepted
+ * (lower bound IV): the td_maxwin of the given side, but never less
+ * than MAXACKWINCONST.
+ * Fixme: what about big packets?
+ */
+#define MAXACKWINCONST			66000
+#define MAXACKWINDOW(sender)						\
+	(MAXACKWINCONST < (sender)->td_maxwin ? (sender)->td_maxwin	\
+					      : MAXACKWINCONST)
+
+/*
+ * Simplified tcp_parse_options routine from tcp_input.c
+ *
+ * Walks the TCP option area of the segment and records in state whether
+ * the SACK-permitted and window-scale options are present; td_scale gets
+ * the announced shift count, capped at 14.  A segment that carries no
+ * options leaves state untouched.
+ *
+ * Parsing stops at end-of-list, at an option whose length byte is below
+ * 2 or missing, and at an option that claims more bytes than are left in
+ * the option area.  Options recognised before that point stay recorded.
+ */
+static void tcp_options(const struct sk_buff *skb,
+			struct iphdr *iph,
+			struct tcphdr *tcph,
+			struct ip_ct_tcp_state *state)
+{
+	unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
+	unsigned char *ptr;
+	int length = (tcph->doff*4) - sizeof(struct tcphdr);
+
+	if (!length)
+		return;
+
+	/* buff is only used if the option bytes have to be copied out. */
+	ptr = skb_header_pointer(skb,
+				 (iph->ihl * 4) + sizeof(struct tcphdr),
+				 length, buff);
+	BUG_ON(ptr == NULL);
+
+	state->td_scale =
+	state->flags = 0;
+
+	while (length > 0) {
+		int opcode=*ptr++;
+		int opsize;
+
+		switch (opcode) {
+		case TCPOPT_EOL:
+			return;
+		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
+			length--;
+			continue;
+		default:
+			/* The length byte itself must be inside the area. */
+			if (length < 2)
+				return;
+			opsize=*ptr++;
+			if (opsize < 2) /* "silly options" */
+				return;
+			/*
+			 * Don't parse partial options.  This must leave the
+			 * function: a 'break' here would only leave the
+			 * switch, and the loop would carry on with ptr
+			 * advanced but length unchanged, letting it walk
+			 * past the end of the option area.
+			 */
+			if (opsize > length)
+				return;
+
+			if (opcode == TCPOPT_SACK_PERM
+			    && opsize == TCPOLEN_SACK_PERM)
+				state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
+			else if (opcode == TCPOPT_WINDOW
+				 && opsize == TCPOLEN_WINDOW) {
+				state->td_scale = *(u_int8_t *)ptr;
+
+				if (state->td_scale > 14) {
+					/* See RFC1323 */
+					state->td_scale = 14;
+				}
+				state->flags |=
+					IP_CT_TCP_FLAG_WINDOW_SCALE;
+			}
+			/* ptr already sits past the kind and length bytes. */
+			ptr += opsize - 2;
+			length -= opsize;
+		}
+	}
+}
+
+/*
+ * Raise *sack to the highest right edge found in the SACK blocks of the
+ * segment, if one lies after the value passed in.  Only the first
+ * well-formed SACK option is examined.  Without such an option *sack is
+ * left alone.
+ *
+ * Parsing stops at end-of-list, at an option whose length byte is below
+ * 2 or missing, and at an option that claims more bytes than are left in
+ * the option area.
+ */
+static void tcp_sack(const struct sk_buff *skb,
+		     struct iphdr *iph,
+		     struct tcphdr *tcph,
+		     __u32 *sack)
+{
+	unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
+	unsigned char *ptr;
+	int length = (tcph->doff*4) - sizeof(struct tcphdr);
+	__u32 tmp;
+
+	if (!length)
+		return;
+
+	/* buff is only used if the option bytes have to be copied out. */
+	ptr = skb_header_pointer(skb,
+				 (iph->ihl * 4) + sizeof(struct tcphdr),
+				 length, buff);
+	BUG_ON(ptr == NULL);
+
+	/*
+	 * Fast path for timestamp-only option.
+	 * length is in bytes and so is TCPOLEN_TSTAMP_ALIGNED; scaling the
+	 * constant by 4 made this test impossible to satisfy.
+	 */
+	if (length == TCPOLEN_TSTAMP_ALIGNED
+	    && *(__be32 *)ptr ==
+		__constant_htonl((TCPOPT_NOP << 24)
+				 | (TCPOPT_NOP << 16)
+				 | (TCPOPT_TIMESTAMP << 8)
+				 | TCPOLEN_TIMESTAMP))
+		return;
+
+	while (length > 0) {
+		int opcode=*ptr++;
+		int opsize, i;
+
+		switch (opcode) {
+		case TCPOPT_EOL:
+			return;
+		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
+			length--;
+			continue;
+		default:
+			/* The length byte itself must be inside the area. */
+			if (length < 2)
+				return;
+			opsize=*ptr++;
+			if (opsize < 2) /* "silly options" */
+				return;
+			/*
+			 * Don't parse partial options.  This must leave the
+			 * function: a 'break' here would only leave the
+			 * switch, and the loop would carry on with ptr
+			 * advanced but length unchanged, letting it walk
+			 * past the end of the option area.
+			 */
+			if (opsize > length)
+				return;
+
+			if (opcode == TCPOPT_SACK
+			    && opsize >= (TCPOLEN_SACK_BASE
+					  + TCPOLEN_SACK_PERBLOCK)
+			    && !((opsize - TCPOLEN_SACK_BASE)
+				 % TCPOLEN_SACK_PERBLOCK)) {
+				/*
+				 * ptr + i is the start of one block; the
+				 * second 32-bit word of the block is the
+				 * edge that gets compared.
+				 */
+				for (i = 0;
+				     i < (opsize - TCPOLEN_SACK_BASE);
+				     i += TCPOLEN_SACK_PERBLOCK) {
+					tmp = ntohl(*((__be32 *)(ptr+i)+1));
+
+					if (after(tmp, *sack))
+						*sack = tmp;
+				}
+				return;
+			}
+			/* ptr already sits past the kind and length bytes. */
+			ptr += opsize - 2;
+			length -= opsize;
+		}
+	}
+}
+/*
+ * Check one TCP segment against the tracked window state of its
+ * connection and, if it is acceptable, update that state.
+ *
+ * state: TCP tracking data of the connection; seen[dir] is treated as
+ *        the sender of this segment, seen[!dir] as the receiver
+ * dir:   direction the segment belongs to
+ * index: flag class of the segment; only TCP_ACK_SET is looked at
+ *        here, for the retransmission counter
+ * skb, iph, tcph: the packet and its IP and TCP headers
+ *
+ * Returns 1 if the segment satisfies bounds I-IV or if either side has
+ * a non-zero loose counter; otherwise returns ip_ct_tcp_be_liberal.
+ */
+static int tcp_in_window(struct ip_ct_tcp *state,
+ enum ip_conntrack_dir dir,
+ unsigned int index,
+ const struct sk_buff *skb,
+ struct iphdr *iph,
+ struct tcphdr *tcph)
+{
+ struct ip_ct_tcp_state *sender = &state->seen[dir];
+ struct ip_ct_tcp_state *receiver = &state->seen[!dir];
+ __u32 seq, ack, sack, end, win, swin;
+ int res;
+
+ /*
+ * Get the required data from the packet.
+ * sack starts out equal to ack; tcp_sack() may raise it below.
+ * end is the sequence number just past this segment.
+ */
+ seq = ntohl(tcph->seq);
+ ack = sack = ntohl(tcph->ack_seq);
+ win = ntohs(tcph->window);
+ end = segment_seq_plus_len(seq, skb->len, iph, tcph);
+
+ if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
+ tcp_sack(skb, iph, tcph, &sack);
+
+ DEBUGP("tcp_in_window: START\n");
+ DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
+ "seq=%u ack=%u sack=%u win=%u end=%u\n",
+ NIPQUAD(iph->saddr), ntohs(tcph->source),
+ NIPQUAD(iph->daddr), ntohs(tcph->dest),
+ seq, ack, sack, win, end);
+ DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
+ "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+ sender->td_end, sender->td_maxend, sender->td_maxwin,
+ sender->td_scale,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+ receiver->td_scale);
+
+ if (sender->td_end == 0) {
+ /*
+ * Initialize sender data.
+ * td_end == 0 is taken to mean that nothing has been
+ * recorded for this direction yet.
+ */
+ if (tcph->syn && tcph->ack) {
+ /*
+ * Outgoing SYN-ACK in reply to a SYN.
+ */
+ sender->td_end =
+ sender->td_maxend = end;
+ sender->td_maxwin = (win == 0 ? 1 : win);
+
+ tcp_options(skb, iph, tcph, sender);
+ /*
+ * RFC 1323:
+ * Both sides must send the Window Scale option
+ * to enable window scaling in either direction.
+ */
+ if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
+ && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
+ sender->td_scale =
+ receiver->td_scale = 0;
+ } else {
+ /*
+ * We are in the middle of a connection,
+ * its history is lost for us.
+ * Let's try to use the data from the packet.
+ */
+ sender->td_end = end;
+ sender->td_maxwin = (win == 0 ? 1 : win);
+ sender->td_maxend = end + sender->td_maxwin;
+ }
+ } else if (((state->state == TCP_CONNTRACK_SYN_SENT
+ && dir == IP_CT_DIR_ORIGINAL)
+ || (state->state == TCP_CONNTRACK_SYN_RECV
+ && dir == IP_CT_DIR_REPLY))
+ && after(end, sender->td_end)) {
+ /*
+ * RFC 793: "if a TCP is reinitialized ... then it need
+ * not wait at all; it must only be sure to use sequence
+ * numbers larger than those recently used."
+ */
+ sender->td_end =
+ sender->td_maxend = end;
+ sender->td_maxwin = (win == 0 ? 1 : win);
+
+ tcp_options(skb, iph, tcph, sender);
+ }
+
+ if (!(tcph->ack)) {
+ /*
+ * If there is no ACK, just pretend it was set and OK.
+ */
+ ack = sack = receiver->td_end;
+ } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
+ (TCP_FLAG_ACK|TCP_FLAG_RST))
+ && (ack == 0)) {
+ /*
+ * Broken TCP stacks, that set ACK in RST packets as well
+ * with zero ack value.
+ */
+ ack = sack = receiver->td_end;
+ }
+
+ if (seq == end
+ && (!tcph->rst
+ || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)))
+ /*
+ * Packet contains no data: we assume it is valid
+ * and check the ack value only.
+ * However RST segments are always validated by their
+ * SEQ number, except when seq == 0 (reset sent answering
+ * SYN).
+ */
+ seq = end = sender->td_end;
+
+ DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
+ "seq=%u ack=%u sack =%u win=%u end=%u\n",
+ NIPQUAD(iph->saddr), ntohs(tcph->source),
+ NIPQUAD(iph->daddr), ntohs(tcph->dest),
+ seq, ack, sack, win, end);
+ DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
+ "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+ sender->td_end, sender->td_maxend, sender->td_maxwin,
+ sender->td_scale,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+ receiver->td_scale);
+
+ DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
+ before(seq, sender->td_maxend + 1),
+ after(end, sender->td_end - receiver->td_maxwin - 1),
+ before(sack, receiver->td_end + 1),
+ after(ack, receiver->td_end - MAXACKWINDOW(sender)));
+
+ if (sender->loose || receiver->loose ||
+ (before(seq, sender->td_maxend + 1) &&
+ after(end, sender->td_end - receiver->td_maxwin - 1) &&
+ before(sack, receiver->td_end + 1) &&
+ after(ack, receiver->td_end - MAXACKWINDOW(sender)))) {
+ /*
+ * Take into account window scaling (RFC 1323).
+ * The window of a SYN segment is used unscaled.
+ */
+ if (!tcph->syn)
+ win <<= sender->td_scale;
+
+ /*
+ * Update sender data.
+ * swin is the announced window plus whatever the SACK
+ * blocks cover beyond ack.
+ */
+ swin = win + (sack - ack);
+ if (sender->td_maxwin < swin)
+ sender->td_maxwin = swin;
+ if (after(end, sender->td_end))
+ sender->td_end = end;
+ /*
+ * Update receiver data.
+ */
+ if (after(end, sender->td_maxend))
+ receiver->td_maxwin += end - sender->td_maxend;
+ if (after(sack + win, receiver->td_maxend - 1)) {
+ receiver->td_maxend = sack + win;
+ if (win == 0)
+ receiver->td_maxend++;
+ }
+
+ /*
+ * Check retransmissions.
+ * state->retrans counts consecutive TCP_ACK_SET segments
+ * that repeat the previously recorded one exactly (same
+ * direction, seq, ack, end and window); any other
+ * TCP_ACK_SET segment is recorded and resets the counter.
+ */
+ if (index == TCP_ACK_SET) {
+ if (state->last_dir == dir
+ && state->last_seq == seq
+ && state->last_ack == ack
+ && state->last_end == end
+ && state->last_win == win)
+ state->retrans++;
+ else {
+ state->last_dir = dir;
+ state->last_seq = seq;
+ state->last_ack = ack;
+ state->last_end = end;
+ state->last_win = win;
+ state->retrans = 0;
+ }
+ }
+ /*
+ * Close the window of disabled window tracking :-)
+ * Each accepted packet from a side whose loose counter is
+ * non-zero uses up one unit of that counter.
+ */
+ if (sender->loose)
+ sender->loose--;
+
+ res = 1;
+ } else {
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+ "ip_ct_tcp: %s ",
+ before(seq, sender->td_maxend + 1) ?
+ after(end, sender->td_end - receiver->td_maxwin - 1) ?
+ before(sack, receiver->td_end + 1) ?
+ after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG"
+ : "ACK is under the lower bound (possible overly delayed ACK)"
+ : "ACK is over the upper bound (ACKed data not seen yet)"
+ : "SEQ is under the lower bound (already ACKed data retransmitted)"
+ : "SEQ is over the upper bound (over the window of the receiver)");
+
+ res = ip_ct_tcp_be_liberal; /* out of window: verdict comes from ip_ct_tcp_be_liberal */
+ }
+
+ DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u "
+ "receiver end=%u maxend=%u maxwin=%u\n",
+ res, sender->td_end, sender->td_maxend, sender->td_maxwin,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
+
+ return res;
+}
+
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+/*
+ * Update sender->td_end after NAT successfully mangled the packet.
+ *
+ * Recomputes the end sequence number of the segment in skb and, while
+ * holding tcp_lock for writing, stores it in last_end and advances
+ * seen[dir].td_end to it if it lies after the recorded value.
+ */
+void ip_conntrack_tcp_update(struct sk_buff *skb,
+			     struct ip_conntrack *conntrack,
+			     enum ip_conntrack_dir dir)
+{
+	struct iphdr *iph = skb->nh.iph;
+	struct tcphdr *tcph = (void *)iph + iph->ihl*4;
+	__u32 end = segment_seq_plus_len(ntohl(tcph->seq), skb->len,
+					 iph, tcph);
+#ifdef DEBUGP_VARS
+	struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir];
+	struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir];
+#endif
+
+	write_lock_bh(&tcp_lock);
+	/*
+	 * We have to worry for the ack in the reply packet only...
+	 */
+	conntrack->proto.tcp.last_end = end;
+	if (after(end, conntrack->proto.tcp.seen[dir].td_end))
+		conntrack->proto.tcp.seen[dir].td_end = end;
+	write_unlock_bh(&tcp_lock);
+
+	DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
+	       "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+		sender->td_end, sender->td_maxend, sender->td_maxwin,
+		sender->td_scale,
+		receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+		receiver->td_scale);
+}
+
+#endif
+/*
+ * TCP flag bits, one per bit of the flags octet of the TCP header
+ * (tcp_error reads that octet as byte 13 of the header).  The low six
+ * of them form the index into tcp_valid_flags.
+ */
+#define TH_FIN 0x01
+#define TH_SYN 0x02
+#define TH_RST 0x04
+#define TH_PUSH 0x08
+#define TH_ACK 0x10
+#define TH_URG 0x20
+#define TH_ECE 0x40
+#define TH_CWR 0x80
+
+/*
+ * Table of valid flag combinations, indexed by the TCP flags octet with
+ * ECE and CWR masked out - ECE and CWR are always valid.  Combinations
+ * not listed are zero, which tcp_error treats as invalid.
+ */
+static const u8 tcp_valid_flags[(TH_URG|TH_ACK|TH_PUSH|TH_RST|TH_SYN|TH_FIN) + 1] =
+{
+	/* SYN and SYN-ACK */
+	[TH_SYN]			= 1,
+	[TH_SYN|TH_URG]			= 1,
+	[TH_SYN|TH_PUSH]		= 1,
+	[TH_SYN|TH_PUSH|TH_URG]		= 1,
+	[TH_SYN|TH_ACK]			= 1,
+	[TH_SYN|TH_ACK|TH_PUSH]		= 1,
+
+	/* RST, with or without ACK */
+	[TH_RST]			= 1,
+	[TH_RST|TH_ACK]			= 1,
+	[TH_RST|TH_ACK|TH_PUSH]		= 1,
+
+	/* plain ACK */
+	[TH_ACK]			= 1,
+	[TH_ACK|TH_PUSH]		= 1,
+	[TH_ACK|TH_URG]			= 1,
+	[TH_ACK|TH_URG|TH_PUSH]		= 1,
+
+	/* FIN, always together with ACK */
+	[TH_FIN|TH_ACK]			= 1,
+	[TH_FIN|TH_ACK|TH_PUSH]		= 1,
+	[TH_FIN|TH_ACK|TH_URG]		= 1,
+	[TH_FIN|TH_ACK|TH_URG|TH_PUSH]	= 1,
+};
+
+/* Protect conntrack against broken packets. Code taken from ipt_unclean.c. */
+static int tcp_error(struct sk_buff *skb,
+ enum ip_conntrack_info *ctinfo,
+ unsigned int hooknum)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct tcphdr _tcph, *th;
+ unsigned int tcplen = skb->len - iph->ihl * 4;
+ u_int8_t tcpflags;
+
+ /* Smaller than minimal TCP header? */
+ th = skb_header_pointer(skb, iph->ihl * 4,
+ sizeof(_tcph), &_tcph);
+ if (th == NULL) {
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+ "ip_ct_tcp: short packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* Not whole TCP header or malformed packet */
+ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+ "ip_ct_tcp: truncated/malformed packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* Checksum invalid? Ignore.
+ * We skip checking packets on the outgoing path
+ * because it is assumed to be correct.
+ */
+ /* FIXME: Source route IP option packets --RR */
+ if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
+ nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) {
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+ "ip_ct_tcp: bad TCP checksum ");
+ return -NF_ACCEPT;
+ }
+
+ /* Check TCP flags. */
+ tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
+ if (!tcp_valid_flags[tcpflags]) {
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+ "ip_ct_tcp: invalid TCP flag combination ");
+ return -NF_ACCEPT;
+ }
+
+ return NF_ACCEPT;