Attempted fix for the VNET/NETNS bug.
[linux-2.6.git] / linux-2.6-522-iptables-connection-tagging.patch
1 diff -Nurb linux-2.6.27-521/include/linux/netfilter/xt_MARK.h linux-2.6.27-522/include/linux/netfilter/xt_MARK.h
2 --- linux-2.6.27-521/include/linux/netfilter/xt_MARK.h  2008-10-09 18:13:53.000000000 -0400
3 +++ linux-2.6.27-522/include/linux/netfilter/xt_MARK.h  2009-12-07 11:02:21.000000000 -0500
4 @@ -11,6 +11,7 @@
5         XT_MARK_SET=0,
6         XT_MARK_AND,
7         XT_MARK_OR,
8 +       XT_MARK_COPYXID,
9  };
10  
11  struct xt_mark_target_info_v1 {
12 diff -Nurb linux-2.6.27-521/include/linux/netfilter/xt_SETXID.h linux-2.6.27-522/include/linux/netfilter/xt_SETXID.h
13 --- linux-2.6.27-521/include/linux/netfilter/xt_SETXID.h        1969-12-31 19:00:00.000000000 -0500
14 +++ linux-2.6.27-522/include/linux/netfilter/xt_SETXID.h        2009-12-07 11:02:21.000000000 -0500
15 @@ -0,0 +1,14 @@
16 +#ifndef _XT_SETXID_H_target
17 +#define _XT_SETXID_H_target
18 +
19 +/* Version 1: values accepted in xt_setxid_target_info_v1.mode */
20 +enum {
21 +       XT_SET_PACKET_XID=0     /* tag the packet (skb_tag) with the rule's mark */
22 +};
23 +
24 +struct xt_setxid_target_info_v1 {       /* NOTE(review): unsigned long differs in size between 32- and 64-bit userspace - confirm compat handling */
25 +       unsigned long mark;     /* xid value the target writes into the packet tag */
26 +       u_int8_t mode;          /* XT_SET_PACKET_XID is the only mode the target handles */
27 +};
28 +
29 +#endif /*_XT_SETXID_H_target*/
30 diff -Nurb linux-2.6.27-521/include/linux/netfilter_ipv4/ipt_MARK.h linux-2.6.27-522/include/linux/netfilter_ipv4/ipt_MARK.h
31 --- linux-2.6.27-521/include/linux/netfilter_ipv4/ipt_MARK.h    2008-10-09 18:13:53.000000000 -0400
32 +++ linux-2.6.27-522/include/linux/netfilter_ipv4/ipt_MARK.h    2009-12-07 11:02:21.000000000 -0500
33 @@ -12,6 +12,7 @@
34  #define IPT_MARK_SET   XT_MARK_SET
35  #define IPT_MARK_AND   XT_MARK_AND
36  #define        IPT_MARK_OR     XT_MARK_OR
37 +#define IPT_MARK_COPYXID       XT_MARK_COPYXID
38  
39  #define ipt_mark_target_info_v1 xt_mark_target_info_v1
40  
41 diff -Nurb linux-2.6.27-521/include/linux/netfilter_ipv4/ipt_SETXID.h linux-2.6.27-522/include/linux/netfilter_ipv4/ipt_SETXID.h
42 --- linux-2.6.27-521/include/linux/netfilter_ipv4/ipt_SETXID.h  1969-12-31 19:00:00.000000000 -0500
43 +++ linux-2.6.27-522/include/linux/netfilter_ipv4/ipt_SETXID.h  2009-12-07 11:02:21.000000000 -0500
44 @@ -0,0 +1,13 @@
45 +#ifndef _IPT_SETXID_H_target
46 +#define _IPT_SETXID_H_target
47 +
48 +/* Backwards compatibility for old userspace */
49 +
50 +#include <linux/netfilter/xt_SETXID.h>
51 +
52 +/* Version 1: ipt_ spellings mapped onto the xt_SETXID definitions */
53 +#define IPT_SET_PACKET_XID     XT_SET_PACKET_XID
54 +
55 +#define ipt_setxid_target_info_v1 xt_setxid_target_info_v1
56 +
57 +#endif /*_IPT_SETXID_H_target*/
58 diff -Nurb linux-2.6.27-521/include/net/netfilter/nf_conntrack.h linux-2.6.27-522/include/net/netfilter/nf_conntrack.h
59 --- linux-2.6.27-521/include/net/netfilter/nf_conntrack.h       2008-10-09 18:13:53.000000000 -0400
60 +++ linux-2.6.27-522/include/net/netfilter/nf_conntrack.h       2009-12-07 11:02:21.000000000 -0500
61 @@ -121,6 +121,9 @@
62         /* Storage reserved for other modules: */
63         union nf_conntrack_proto proto;
64  
65 +       /* PLANETLAB. VNET-specific */
66 +       int xid[IP_CT_DIR_MAX];
67 +
68         /* Extensions */
69         struct nf_ct_ext *ext;
70  
71 diff -Nurb linux-2.6.27-521/net/netfilter/Kconfig linux-2.6.27-522/net/netfilter/Kconfig
72 --- linux-2.6.27-521/net/netfilter/Kconfig      2008-10-09 18:13:53.000000000 -0400
73 +++ linux-2.6.27-522/net/netfilter/Kconfig      2009-12-07 11:02:21.000000000 -0500
74 @@ -477,6 +477,13 @@
75           This option adds a "TCPOPTSTRIP" target, which allows you to strip
76           TCP options from TCP packets.
77  
78 +config NETFILTER_XT_TARGET_SETXID
79 +       tristate '"SETXID" target support'
80 +       depends on NETFILTER_XTABLES
81 +       help
82 +         This option adds a `SETXID' target, which allows you to set the
83 +         xid tag carried by a packet.
84 +
85  config NETFILTER_XT_MATCH_COMMENT
86         tristate  '"comment" match support'
87         depends on NETFILTER_XTABLES
88 diff -Nurb linux-2.6.27-521/net/netfilter/Makefile linux-2.6.27-522/net/netfilter/Makefile
89 --- linux-2.6.27-521/net/netfilter/Makefile     2008-10-09 18:13:53.000000000 -0400
90 +++ linux-2.6.27-522/net/netfilter/Makefile     2009-12-07 11:02:21.000000000 -0500
91 @@ -38,6 +38,7 @@
92  obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
93  
94  # targets
95 +obj-$(CONFIG_NETFILTER_XT_TARGET_SETXID) += xt_SETXID.o
96  obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o
97  obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o
98  obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
99 diff -Nurb linux-2.6.27-521/net/netfilter/nf_conntrack_core.c linux-2.6.27-522/net/netfilter/nf_conntrack_core.c
100 --- linux-2.6.27-521/net/netfilter/nf_conntrack_core.c  2008-10-09 18:13:53.000000000 -0400
101 +++ linux-2.6.27-522/net/netfilter/nf_conntrack_core.c  2009-12-07 11:02:21.000000000 -0500
102 @@ -595,6 +595,9 @@
103         /* Overload tuple linked list to put us in unconfirmed list. */
104         hlist_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode, &unconfirmed);
105  
106 +       ct->xid[IP_CT_DIR_ORIGINAL] = -1;
107 +       ct->xid[IP_CT_DIR_REPLY] = -1;
108 +
109         spin_unlock_bh(&nf_conntrack_lock);
110  
111         if (exp) {
112 diff -Nurb linux-2.6.27-521/net/netfilter/xt_MARK.c linux-2.6.27-522/net/netfilter/xt_MARK.c
113 --- linux-2.6.27-521/net/netfilter/xt_MARK.c    2008-10-09 18:13:53.000000000 -0400
114 +++ linux-2.6.27-522/net/netfilter/xt_MARK.c    2009-12-08 23:52:32.000000000 -0500
115 @@ -13,7 +13,13 @@
116  #include <linux/module.h>
117  #include <linux/skbuff.h>
118  #include <linux/ip.h>
119 +#include <net/udp.h>
120  #include <net/checksum.h>
121 +#include <net/route.h>
122 +#include <net/inet_hashtables.h>
123 +#include <net/net_namespace.h>
124 +
125 +#include <net/netfilter/nf_conntrack.h>
126  
127  #include <linux/netfilter/x_tables.h>
128  #include <linux/netfilter/xt_MARK.h>
129 @@ -24,6 +30,8 @@
130  MODULE_ALIAS("ipt_MARK");
131  MODULE_ALIAS("ip6t_MARK");
132  
133 +DECLARE_PER_CPU(int, sknid_elevator);      /* defined elsewhere; mark_tg() stores the mark here on LOCAL_IN */
134 +
135  static unsigned int
136  mark_tg_v0(struct sk_buff *skb, const struct net_device *in,
137             const struct net_device *out, unsigned int hooknum,
138 @@ -61,14 +69,257 @@
139         return XT_CONTINUE;
140  }
141  
142 +#define PEERCRED_SET(x) (((x) != 0) && ((x) != (unsigned int)-1))  /* true when x is neither 0 nor (unsigned int)-1 */
143 +
144 +
145 +static inline u_int16_t
146 +get_dst_port(struct nf_conntrack_tuple *tuple)
147 +{
148 +       switch (tuple->dst.protonum) {  /* per-protocol destination identifier of the tuple */
149 +       case IPPROTO_GRE:
150 +               /* The GRE key is returned as stored; no conversion is applied here */
151 +               return tuple->dst.u.gre.key;
152 +       case IPPROTO_ICMP:
153 +               /* Bind on ICMP echo ID, which is read from the source side of the tuple */
154 +               return tuple->src.u.icmp.id;
155 +       case IPPROTO_TCP:
156 +               return tuple->dst.u.tcp.port;
157 +       case IPPROTO_UDP:
158 +               return tuple->dst.u.udp.port;
159 +       default:
160 +               return tuple->dst.u.all;
161 +       }
162 +}
163 +
164 +static inline u_int16_t
165 +get_src_port(struct nf_conntrack_tuple *tuple)
166 +{
167 +       switch (tuple->dst.protonum) {  /* per-protocol source identifier of the tuple */
168 +       case IPPROTO_GRE:
169 +               /* Key is already 16 bits wide: return it as stored, like get_dst_port() */
170 +               return tuple->src.u.gre.key;
171 +       case IPPROTO_ICMP:
172 +               /* Bind on ICMP echo ID */
173 +               return tuple->src.u.icmp.id;
174 +       case IPPROTO_TCP:
175 +               return tuple->src.u.tcp.port;
176 +       case IPPROTO_UDP:
177 +               return tuple->src.u.udp.port;
178 +       default:
179 +               return tuple->src.u.all;
180 +       }
181 +}
182 +
183 +static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
184 +               __be16 sport, __be32 daddr, __be16 dport,
185 +               int dif, struct hlist_head udptable[])
186 +{       /* Best-scoring UDP socket in udptable for this packet; returns a held reference or NULL */
187 +       struct sock *sk, *result = NULL;
188 +       struct hlist_node *node;
189 +       unsigned short hnum = ntohs(dport);     /* host-order destination port, used as hash key */
190 +       int badness = -1;                       /* highest score seen so far */
191 +
192 +       read_lock(&udp_hash_lock);
193 +       sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) {
194 +               struct inet_sock *inet = inet_sk(sk);
195 +
196 +               if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
197 +                               !ipv6_only_sock(sk)) {
198 +                       int score = (sk->sk_family == PF_INET ? 1 : 0);
199 +
200 +                       if (inet->rcv_saddr) {  /* socket bound to a local address: must equal daddr */
201 +                               if (inet->rcv_saddr != daddr)
202 +                                       continue;
203 +                               score+=2;
204 +                       } else {
205 +                               /* block non nx_info ips */
206 +                               if (!v4_addr_in_nx_info(sk->sk_nx_info,
207 +                                       daddr, NXA_MASK_BIND))
208 +                                       continue;
209 +                       }
210 +                       if (inet->daddr) {      /* connected socket: peer address must equal saddr */
211 +                               if (inet->daddr != saddr)
212 +                                       continue;
213 +                               score+=2;
214 +                       }
215 +                       if (inet->dport) {      /* connected socket: peer port must equal sport */
216 +                               if (inet->dport != sport)
217 +                                       continue;
218 +                               score+=2;
219 +                       }
220 +                       if (sk->sk_bound_dev_if) {      /* device-bound socket: must match dif */
221 +                               if (sk->sk_bound_dev_if != dif)
222 +                                       continue;
223 +                               score+=2;
224 +                       }
225 +                       if (score == 9) {       /* 1 + 4 * 2: every criterion matched, stop searching */
226 +                               result = sk;
227 +                               break;
228 +                       } else if (score > badness) {
229 +                               result = sk;
230 +                               badness = score;
231 +                       }
232 +               }
233 +       }
234 +
235 +       if (result)
236 +               sock_hold(result);      /* reference taken under the lock; the caller drops it with sock_put() */
237 +       read_unlock(&udp_hash_lock);
238 +       return result;
239 +}
240 +
241 +static int onceonly = 1;   /* cleared by mark_tg() after its first warning for a packet outside init_net */
242 +
243  static unsigned int
244  mark_tg(struct sk_buff *skb, const struct net_device *in,
245          const struct net_device *out, unsigned int hooknum,
246          const struct xt_target *target, const void *targinfo)
247  {
248         const struct xt_mark_tginfo2 *info = targinfo;
249 +    long mark = -1;     /* -1 means "leave skb->mark untouched" */
250 +    enum ip_conntrack_info ctinfo;
251 +    struct sock *connection_sk;
252 +    int dif;
253 +    struct nf_conn *ct;
254 +    extern struct inet_hashinfo tcp_hashinfo;
255 +    enum ip_conntrack_dir dir;
256 +    int *curtag;
257 +    u_int32_t src_ip;
258 +    u_int16_t proto, src_port;
259 +    u_int32_t ip;
260 +    u_int16_t port;
261 +
262 +    /*
263 +     * info->mark == ~0U selects copy-xid mode: the mark comes from the
264 +     * xid recorded on the conntrack entry or on the owning socket.
265 +     * As of 2.6.27.39 (Dec 8 2009) NetNS + VNET = trouble, so packets
266 +     * outside init_net are passed through untouched (warning once).
267 +     */
268 +    if (info->mark == ~0U) {
269 +        /* skb->dev may not be set yet (e.g. local output); use the hook devices */
270 +        struct net *net = dev_net(in ? in : out);
271 +        if (net != &init_net) {
272 +            WARN_ON(onceonly);
273 +            onceonly = 0;
274 +            return XT_CONTINUE;
275 +        }
276 +        /* copy-xid; skb->dst is NULL until a route has been attached */
277 +        dif = skb->dst ? ((struct rtable *)(skb->dst))->rt_iif :
278 +                         (in ? in->ifindex : 0);
279 +
280 +        ct = nf_ct_get(skb, &ctinfo);
281 +        if (!ct)
282 +            goto out_mark_finish;
283 +
284 +        dir = CTINFO2DIR(ctinfo);
285 +        src_ip = ct->tuplehash[dir].tuple.src.u3.ip;
286 +        src_port = get_src_port(&ct->tuplehash[dir].tuple);
287 +        proto = ct->tuplehash[dir].tuple.dst.protonum;
288 +
289 +        ip = ct->tuplehash[dir].tuple.dst.u3.ip;
290 +        port = get_dst_port(&ct->tuplehash[dir].tuple);
291 +
292 +        if (proto == IPPROTO_ICMP) {
293 +            if (skb->mark > 0)
294 +                /* The packet is marked, it's going out */
295 +                ct->xid[0] = skb->mark;
296 +
297 +            if (ct->xid[0] > 0)
298 +                mark = ct->xid[0];
299 +        }
300 +        else if (proto == IPPROTO_UDP) {
301 +            struct sock *sk;
302 +            if (!skb->mark) {
303 +                sk = __udp4_lib_lookup(net,src_ip, src_port,
304 +                        ip, port, dif, udp_hash);
305 +
306 +                if (sk && hooknum == NF_INET_LOCAL_IN)
307 +                    mark = sk->sk_nid;
308 +
309 +                if (sk)
310 +                    sock_put(sk);
311 +            }
312 +            else if (skb->mark > 0)
313 +                /* The packet is marked, it's going out */
314 +                ct->xid[0] = skb->mark;
315 +        }
316 +        else if (proto == IPPROTO_TCP) {
317 +            int sockettype = 0; /* Established socket */
318 +
319 +            /* Looks for an established socket or a listening
320 +               socket corresponding to the 4-tuple, in that order.
321 +               The order is important for Codemux connections
322 +               to be handled properly */
323 +
324 +            connection_sk = inet_lookup_established(net,
325 +                    &tcp_hashinfo, src_ip, src_port, ip, port, dif);
326 +
327 +            if (!connection_sk) {
328 +                connection_sk = inet_lookup_listener(net,
329 +                        &tcp_hashinfo, ip, port, dif);
330 +                sockettype = 1; /* Listening socket */
331 +            } else if (connection_sk->sk_state == TCP_TIME_WAIT) {
332 +                sockettype = 2; /* Time-wait socket: sk_peercred must not be touched */
333 +            }
334 +
335 +            if (connection_sk) {
336 +                /* The peercred is not set. We set it if the other side has an xid. */
337 +                if (sockettype == 0 && ct->xid[!dir] > 0 &&
338 +                        !PEERCRED_SET(connection_sk->sk_peercred.uid)) {
339 +                    connection_sk->sk_peercred.gid =
340 +                        connection_sk->sk_peercred.uid = ct->xid[!dir];
341 +                }
342 +                /* The peercred is set, and is not equal to the XID of 'the other side' */
343 +                else if (sockettype == 0 &&
344 +                        PEERCRED_SET(connection_sk->sk_peercred.uid) &&
345 +                        (connection_sk->sk_peercred.uid != ct->xid[!dir])) {
346 +                    mark = connection_sk->sk_peercred.uid;
347 +                }
348 +
349 +                /* Has this connection already been tagged? */
350 +                if (ct->xid[dir] < 1) {
351 +                    /* No - let's tag it */
352 +                    ct->xid[dir] = connection_sk->sk_nid;
353 +                }
354 +
355 +                if (mark == -1 && (ct->xid[dir] != 0))
356 +                    mark = ct->xid[dir];
357 +
358 +                if (connection_sk->sk_state == TCP_TIME_WAIT) {
359 +                    inet_twsk_put(inet_twsk(connection_sk));
360 +                    goto out_mark_finish;
361 +                } else
362 +                    sock_put(connection_sk);
363 +            }
364 +
365 +            /* All else failed. Is this a connection over raw sockets?
366 +               That explains why we couldn't get anything out of skb->sk,
367 +               or look up a "real" connection. */
368 +            if (ct->xid[dir] < 1) {
369 +                if (skb->skb_tag)
370 +                    ct->xid[dir] = skb->skb_tag;
371 +            }
372 +
373 +            /* Covers CoDemux case */
374 +            if (mark < 1 && (ct->xid[dir] > 0))
375 +                mark = ct->xid[dir];
376 +
377 +            if (mark < 1 && (ct->xid[!dir] > 0))
378 +                mark = ct->xid[!dir];
379 +            goto out_mark_finish;
380 +        }
381 +    }
382 +    else
383 +        mark = (skb->mark & ~info->mask) ^ info->mark;
384 +
385 +out_mark_finish:
386 +    if (mark != -1)
387 +        skb->mark = mark;
388 +
389 +    curtag = &__get_cpu_var(sknid_elevator);    /* per-CPU slot, written below only while it holds -2 */
390 +    if (mark > 0 && *curtag == -2 && hooknum == NF_INET_LOCAL_IN)
391 +        *curtag = mark;
392  
393 -       skb->mark = (skb->mark & ~info->mask) ^ info->mark;
394         return XT_CONTINUE;
395  }
396  
397 diff -Nurb linux-2.6.27-521/net/netfilter/xt_SETXID.c linux-2.6.27-522/net/netfilter/xt_SETXID.c
398 --- linux-2.6.27-521/net/netfilter/xt_SETXID.c  1969-12-31 19:00:00.000000000 -0500
399 +++ linux-2.6.27-522/net/netfilter/xt_SETXID.c  2009-12-07 11:02:21.000000000 -0500
400 @@ -0,0 +1,79 @@
401 +#include <linux/module.h>
402 +#include <linux/skbuff.h>
403 +#include <linux/ip.h>
404 +#include <net/checksum.h>
405 +#include <linux/vs_network.h>
406 +
407 +#include <linux/netfilter/x_tables.h>
408 +#include <linux/netfilter/xt_SETXID.h>
409 +
410 +MODULE_LICENSE("GPL");
411 +MODULE_AUTHOR("");
412 +MODULE_DESCRIPTION("");
413 +MODULE_ALIAS("ipt_SETXID");
414 +
415 +static unsigned int
416 +target_v1(struct sk_buff *skb,     /* the hook passes the skb itself, as for mark_tg() in xt_MARK.c */
417 +         const struct net_device *in,
418 +         const struct net_device *out,
419 +         unsigned int hooknum,
420 +         const struct xt_target *target,
421 +         const void *targinfo)
422 +{
423 +       const struct xt_setxid_target_info_v1 *setxidinfo = targinfo;
424 +
425 +       switch (setxidinfo->mode) {
426 +       case XT_SET_PACKET_XID:
427 +               skb->skb_tag = setxidinfo->mark;        /* tag the packet with the rule's xid */
428 +               break;
429 +       }
430 +       return XT_CONTINUE;     /* never a verdict: rule traversal goes on */
431 +}
432 +
433 +
434 +static bool
435 +checkentry_v1(const char *tablename,
436 +             const void *entry,
437 +             const struct xt_target *target,
438 +             void *targinfo,
439 +             unsigned int hook_mask)
440 +{
441 +       struct xt_setxid_target_info_v1 *setxidinfo = targinfo;
442 +
443 +       if (setxidinfo->mode != XT_SET_PACKET_XID) {    /* the only mode target_v1() handles */
444 +               printk(KERN_WARNING "SETXID: unknown mode %u\n",
445 +                      setxidinfo->mode);
446 +               return false;   /* refuse the rule */
447 +       }
448 +
449 +       return true;
450 +}
451 +
452 +static struct xt_target xt_setxid_target[] = {     /* targets registered by init() and removed by fini() */
453 +       {
454 +               .name           = "SETXID",
455 +               .family         = AF_INET,
456 +               .revision       = 1,
457 +               .checkentry     = checkentry_v1,        /* rejects any mode other than XT_SET_PACKET_XID */
458 +               .target         = target_v1,            /* writes the rule's mark into skb_tag */
459 +               .targetsize     = sizeof(struct xt_setxid_target_info_v1),
460 +               .table          = "mangle",             /* NOTE(review): presumably limits use to the mangle table - confirm against struct xt_target */
461 +               .me             = THIS_MODULE,
462 +       }
463 +};
464 +
465 +static int __init init(void)       /* module entry point: registers the SETXID target table */
466 +{
467 +       int err;
468 +
469 +       err = xt_register_targets(xt_setxid_target, ARRAY_SIZE(xt_setxid_target));
470 +       return err;     /* result of xt_register_targets() is handed back unchanged */
471 +}
472 +
473 +static void __exit fini(void)      /* module exit: unregisters what init() registered */
474 +{
475 +       xt_unregister_targets(xt_setxid_target, ARRAY_SIZE(xt_setxid_target));
476 +}
478 +module_init(init);
479 +module_exit(fini);