use new APIs (untested version)
[linux-2.6.git] / linux-2.6-522-iptables-connection-tagging.patch
1 commit 39e1cee3184d275fa3ec4122de39b90d0d8e9bf4
2 Author: root <root@rhel6.(none)>
3 Date:   Thu Apr 29 19:59:33 2010 -0400
4
5     linux-2.6-522-iptables-connection-tagging.patch
6
7 diff --git a/include/linux/netfilter/xt_SETXID.h b/include/linux/netfilter/xt_SETXID.h
8 new file mode 100644
9 index 0000000..235b9d6
10 --- /dev/null
11 +++ b/include/linux/netfilter/xt_SETXID.h
12 @@ -0,0 +1,13 @@
13 +#ifndef _XT_SETXID_H_target
14 +#define _XT_SETXID_H_target
15 +
16 +enum { /* values accepted in xt_setxid_target_info_v2.mode */
17 +       XT_SET_PACKET_XID=0 /* copy the rule's mark into the packet's skb_tag */
18 +};
19 +
20 +struct xt_setxid_target_info_v2 { /* per-rule argument block of the SETXID target */
21 +       unsigned long mark; /* value the target writes to skb_tag; NOTE(review): unsigned long width differs between 32- and 64-bit ABIs - confirm userspace agrees */
22 +       u_int8_t mode; /* operation selector; only XT_SET_PACKET_XID is handled */
23 +};
24 +
25 +#endif /*_XT_SETXID_H_target*/
26 diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
27 index 5cf7270..95a5fde 100644
28 --- a/include/net/netfilter/nf_conntrack.h
29 +++ b/include/net/netfilter/nf_conntrack.h
30 @@ -119,6 +119,9 @@ struct nf_conn {
31         /* Storage reserved for other modules: */
32         union nf_conntrack_proto proto;
33  
34 +       /* PLANETLAB. VNET-specific */
35 +       int xid[IP_CT_DIR_MAX];
36 +       
37         /* Extensions */
38         struct nf_ct_ext *ext;
39  #ifdef CONFIG_NET_NS
40 diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
41 index 634d14a..a2872f5 100644
42 --- a/net/netfilter/Kconfig
43 +++ b/net/netfilter/Kconfig
44 @@ -543,6 +543,13 @@ config NETFILTER_XT_MATCH_CLUSTER
45           If you say Y or M here, try `iptables -m cluster --help` for
46           more information.
47  
48 +config NETFILTER_XT_TARGET_SETXID
49 +       tristate '"SETXID" target support'
50 +       depends on NETFILTER_XTABLES
51 +       help
52 +         This option adds a `SETXID' target, which allows you to set the
53 +         xid tag carried by a packet.
54 +
55  config NETFILTER_XT_MATCH_COMMENT
56         tristate  '"comment" match support'
57         depends on NETFILTER_ADVANCED
58 diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
59 index 49f62ee..820655e 100644
60 --- a/net/netfilter/Makefile
61 +++ b/net/netfilter/Makefile
62 @@ -41,6 +41,7 @@ obj-$(CONFIG_NETFILTER_TPROXY) += nf_tproxy_core.o
63  obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
64  
65  # targets
66 +obj-$(CONFIG_NETFILTER_XT_TARGET_SETXID) += xt_SETXID.o
67  obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o
68  obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o
69  obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
70 diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
71 index 1e1df20..144e131 100644
72 --- a/net/netfilter/nf_conntrack_core.c
73 +++ b/net/netfilter/nf_conntrack_core.c
74 @@ -673,6 +673,9 @@ init_conntrack(struct net *net,
75         hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
76                        &net->ct.unconfirmed);
77  
78 +       ct->xid[IP_CT_DIR_ORIGINAL] = -1;
79 +       ct->xid[IP_CT_DIR_REPLY] = -1;
80 +
81         spin_unlock_bh(&nf_conntrack_lock);
82  
83         if (exp) {
84 diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c
85 index 225f8d1..7513997 100644
86 --- a/net/netfilter/xt_MARK.c
87 +++ b/net/netfilter/xt_MARK.c
88 @@ -13,7 +13,13 @@
89  #include <linux/module.h>
90  #include <linux/skbuff.h>
91  #include <linux/ip.h>
92 +#include <net/udp.h>
93  #include <net/checksum.h>
94 +#include <net/route.h>
95 +#include <net/inet_hashtables.h>
96 +#include <net/net_namespace.h>
97 +
98 +#include <net/netfilter/nf_conntrack.h>
99  
100  #include <linux/netfilter/x_tables.h>
101  #include <linux/netfilter/xt_MARK.h>
102 @@ -24,22 +30,267 @@ MODULE_DESCRIPTION("Xtables: packet mark modification");
103  MODULE_ALIAS("ipt_MARK");
104  MODULE_ALIAS("ip6t_MARK");
105  
106 +DECLARE_PER_CPU(int, sknid_elevator); /* per-CPU int; mark_tg stores a positive mark into it on LOCAL_IN when it currently holds -2 */
107 +
108 +#define PEERCRED_SET(x) (((x) != 0) && ((x) != (unsigned int)-1)) /* true unless x is 0 or (unsigned int)-1; x is evaluated twice */
109 +
110 +static inline u_int16_t get_dst_port(struct nf_conntrack_tuple *tuple) /* destination-side 16-bit identifier of a conntrack tuple, chosen by protocol */
111 +{
112 +       switch (tuple->dst.protonum) {
113 +       case IPPROTO_GRE:
114 +               /* XXX Truncate 32-bit GRE key to 16 bits. NOTE(review): get_src_port() runs its key through htons(ntohl()) while this one is returned as stored - confirm which is intended */
115 +               return tuple->dst.u.gre.key;
116 +       case IPPROTO_ICMP:
117 +               /* Bind on ICMP echo ID (read from the source half of the tuple, the same field get_src_port() returns) */
118 +               return tuple->src.u.icmp.id;
119 +       case IPPROTO_TCP:
120 +               return tuple->dst.u.tcp.port;
121 +       case IPPROTO_UDP:
122 +               return tuple->dst.u.udp.port;
123 +       default:
124 +               return tuple->dst.u.all; /* any other protocol: the union's generic member */
125 +       }
126 +}
127 +
128 +static inline u_int16_t get_src_port(struct nf_conntrack_tuple *tuple) /* source-side 16-bit identifier of a conntrack tuple, chosen by protocol */
129 +{
130 +       switch (tuple->dst.protonum) { /* the protocol number is read from the destination half of the tuple */
131 +       case IPPROTO_GRE:
132 +               /* XXX Truncate 32-bit GRE key to 16 bits */
133 +               return htons(ntohl(tuple->src.u.gre.key));
134 +       case IPPROTO_ICMP:
135 +               /* Bind on ICMP echo ID */
136 +               return tuple->src.u.icmp.id;
137 +       case IPPROTO_TCP:
138 +               return tuple->src.u.tcp.port;
139 +       case IPPROTO_UDP:
140 +               return tuple->src.u.udp.port;
141 +       default:
142 +               return tuple->src.u.all; /* any other protocol: the union's generic member */
143 +       }
144 +}
145 +
146 +static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
147 +                                     __be16 sport, __be32 daddr, __be16 dport,
148 +                                     int dif, struct hlist_head udptable[]) /* returns the best-scoring UDP socket with a reference held, or NULL */
149 +{
150 +       struct sock *sk, *result = NULL;
151 +       struct hlist_node *node;
152 +       unsigned short hnum = ntohs(dport); /* used both to pick the hash chain and to compare against sk_hash */
153 +       int badness = -1; /* best score seen so far */
154 +
155 +       rcu_read_lock();
156 +       sk_for_each_rcu(sk, node, &udptable[udp_hashfn(net, hnum)]) {
157 +               struct inet_sock *inet = inet_sk(sk);
158 +
159 +               if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
160 +                   !ipv6_only_sock(sk)) {
161 +                       int score = (sk->sk_family == PF_INET ? 1 : 0); /* one point for a PF_INET socket */
162 +
163 +                       if (inet->rcv_saddr) { /* socket has a local address: it must equal daddr */
164 +                               if (inet->rcv_saddr != daddr)
165 +                                       continue;
166 +                               score += 2;
167 +                       } else {
168 +                               /* block non nx_info ips: a socket without a local address is skipped unless v4_addr_in_nx_info() accepts daddr */
169 +                               if (!v4_addr_in_nx_info(sk->sk_nx_info,
170 +                                                       daddr, NXA_MASK_BIND))
171 +                                       continue;
172 +                       }
173 +                       if (inet->daddr) { /* connected peer address, if set, must equal saddr */
174 +                               if (inet->daddr != saddr)
175 +                                       continue;
176 +                               score += 2;
177 +                       }
178 +                       if (inet->dport) { /* connected peer port, if set, must equal sport */
179 +                               if (inet->dport != sport)
180 +                                       continue;
181 +                               score += 2;
182 +                       }
183 +                       if (sk->sk_bound_dev_if) { /* device binding, if set, must equal dif */
184 +                               if (sk->sk_bound_dev_if != dif)
185 +                                       continue;
186 +                               score += 2;
187 +                       }
188 +                       if (score == 9) { /* 1 + 4*2: every criterion matched, stop searching */
189 +                               result = sk;
190 +                               break;
191 +                       } else if (score > badness) {
192 +                               result = sk;
193 +                               badness = score;
194 +                       }
195 +               }
196 +       }
197 +
198 +       if (result)
199 +               sock_hold(result); /* reference taken before leaving the RCU section; mark_tg releases it with sock_put() */
200 +       rcu_read_unlock();
201 +       return result;
202 +}
203 +
204 +static int onceonly = 1; /* file-local flag: non-zero until mark_tg has issued its WARN_ON for a non-init network namespace */
205 +
206  static unsigned int
207  mark_tg(struct sk_buff *skb, const struct xt_target_param *par)
208  {
209         const struct xt_mark_tginfo2 *info = par->targinfo;
210 +       long mark = -1; /* -1 means "leave skb->mark untouched" (tested at out_mark_finish) */
211 +       enum ip_conntrack_info ctinfo;
212 +       struct sock *connection_sk;
213 +       int dif;
214 +       struct nf_conn *ct;
215 +       extern struct inet_hashinfo tcp_hashinfo;
216 +       enum ip_conntrack_dir dir;
217 +       int *curtag;
218 +       u_int32_t src_ip;
219 +       u_int32_t dst_ip;
220 +       u_int16_t proto, src_port;
221 +       u_int32_t ip;
222 +       u_int16_t port;
223 +
224 +       if (info->mark == ~0U) { /* an all-ones mark selects the copy-xid mode; any other value takes the plain mark rewrite in the else branch */
225 +               // As of 2.6.27.39, Dec 8 2009,
226 +               // NetNS + VNET = Trouble
227 +               // Let's handle this as a special case
228 +               struct net *net = dev_net(skb->dev);
229 +               if (!net_eq(net, &init_net)) {
230 +                       WARN_ON(onceonly); /* fires only while onceonly is still non-zero */
231 +                       onceonly = 0;
232 +                       return XT_CONTINUE;
233 +               }
234 +
235 +               /* copy-xid */
236 +               dif = ((struct rtable *)(skb->dst))->rt_iif; /* NOTE(review): skb->dst is dereferenced without a NULL check - confirm a route is always attached here */
237 +
238 +               ct = nf_ct_get(skb, &ctinfo);
239 +               if (!ct)
240 +                       goto out_mark_finish;
241 +
242 +               dir = CTINFO2DIR(ctinfo);
243 +               src_ip = ct->tuplehash[dir].tuple.src.u3.ip;
244 +               dst_ip = ct->tuplehash[dir].tuple.dst.u3.ip;
245 +               src_port = get_src_port(&ct->tuplehash[dir].tuple);
246 +               proto = ct->tuplehash[dir].tuple.dst.protonum;
247 +
248 +               ip = ct->tuplehash[dir].tuple.dst.u3.ip;
249 +               port = get_dst_port(&ct->tuplehash[dir].tuple);
250 +
251 +               if (proto == 1) { /* ICMP */
252 +                       if (skb->mark > 0)
253 +                               /* The packet is marked, it's going out */
254 +                               ct->xid[0] = skb->mark;
255 +
256 +                       if (ct->xid[0] > 0)
257 +                               mark = ct->xid[0];
258 +               } else if (proto == 17) { /* UDP */
259 +                       struct sock *sk;
260 +                       if (!skb->mark) {
261 +                               sk = __udp4_lib_lookup(net, src_ip, src_port,
262 +                                                      ip, port, dif, udp_hash);
263 +
264 +                               if (sk && par->hooknum == NF_INET_LOCAL_IN) /* the hook number is a field of par; this signature has no hooknum argument */
265 +                                       mark = sk->sk_nid;
266 +
267 +                               if (sk)
268 +                                       sock_put(sk); /* drop the reference the lookup took */
269 +                       } else if (skb->mark > 0)
270 +                               /* The packet is marked, it's going out */
271 +                               ct->xid[0] = skb->mark;
272 +               } else if (proto == 6) {        /* TCP */
273 +                       int sockettype = 0;     /* Established socket */
274 +
275 +                       /* Looks for an established socket or a listening
276 +                          socket corresponding to the 4-tuple, in that order.
277 +                          The order is important for Codemux connections
278 +                          to be handled properly */
279 +
280 +                       connection_sk = inet_lookup_established(net,
281 +                                                               &tcp_hashinfo,
282 +                                                               src_ip,
283 +                                                               src_port, ip,
284 +                                                               port, dif);
285 +
286 +                       if (!connection_sk) {
287 +                               connection_sk = inet_lookup_listener(net,
288 +                                                                    &tcp_hashinfo,
289 +                                                                    ip, port,
290 +                                                                    dif);
291 +                               sockettype = 1; /* Listening socket */
292 +                       }
293 +
294 +                       if (connection_sk) {
295 +                               if (connection_sk->sk_state == TCP_TIME_WAIT) {
296 +                                       inet_twsk_put(inet_twsk(connection_sk));
297 +                                       goto out_mark_finish;
298 +                               }
299 +
300 +                               /* The peercred is not set. We set it if the other side has an xid. */
301 +                               if (!PEERCRED_SET
302 +                                   (connection_sk->sk_peercred.uid)
303 +                                   && ct->xid[!dir] > 0 && (sockettype == 0)) {
304 +                                       connection_sk->sk_peercred.gid =
305 +                                           connection_sk->sk_peercred.uid =
306 +                                           ct->xid[!dir];
307 +                               }
308 +
309 +                               /* The peercred is set, and is not equal to the XID of 'the other side' */
310 +                               else if (PEERCRED_SET
311 +                                        (connection_sk->sk_peercred.uid)
312 +                                        && (connection_sk->sk_peercred.uid !=
313 +                                            ct->xid[!dir])
314 +                                        && (sockettype == 0)) {
315 +                                       mark = connection_sk->sk_peercred.uid;
316 +                               }
317 +
318 +                               /* Has this connection already been tagged? */
319 +                               if (ct->xid[dir] < 1) {
320 +                                       /* No - let's tag it */
321 +                                       ct->xid[dir] = connection_sk->sk_nid;
322 +                               }
323 +
324 +                               if (mark == -1 && (ct->xid[dir] != 0))
325 +                                       mark = ct->xid[dir];
326 +
327 +                               sock_put(connection_sk);
328 +                       }
329 +
330 +                       /* All else failed. Is this a connection over raw sockets?
331 +                          That explains why we couldn't get anything out of skb->sk,
332 +                          or look up a "real" connection. */
333 +                       if (ct->xid[dir] < 1) {
334 +                               if (skb->skb_tag)
335 +                                       ct->xid[dir] = skb->skb_tag;
336 +                       }
337 +
338 +                       /* Covers CoDemux case */
339 +                       if (mark < 1 && (ct->xid[dir] > 0))
340 +                               mark = ct->xid[dir];
341 +
342 +                       if (mark < 1 && (ct->xid[!dir] > 0))
343 +                               mark = ct->xid[!dir];
344 +                       goto out_mark_finish;
345 +               }
346 +       } else
347 +               mark = (skb->mark & ~info->mask) ^ info->mark;
348 +
349 +out_mark_finish:
350 +       if (mark != -1)
351 +               skb->mark = mark;
352 +
353 +       curtag = &__get_cpu_var(sknid_elevator);
354 +       if (mark > 0 && *curtag == -2 && par->hooknum == NF_INET_LOCAL_IN) /* publish the mark in the per-CPU slot only when it holds -2 */
355 +               *curtag = mark;
356  
357 -       skb->mark = (skb->mark & ~info->mask) ^ info->mark;
358         return XT_CONTINUE;
359  }
360  
361  static struct xt_target mark_tg_reg __read_mostly = {
362 -       .name           = "MARK",
363 -       .revision       = 2,
364 -       .family         = NFPROTO_UNSPEC,
365 -       .target         = mark_tg,
366 -       .targetsize     = sizeof(struct xt_mark_tginfo2),
367 -       .me             = THIS_MODULE,
368 +       .name = "MARK", /* NOTE(review): the -/+ pairs in this initializer differ only in spacing */
369 +       .revision = 2,
370 +       .family = NFPROTO_UNSPEC,
371 +       .target = mark_tg, /* per-packet handler defined above */
372 +       .targetsize = sizeof(struct xt_mark_tginfo2), /* mark_tg reads par->targinfo as this struct */
373 +       .me = THIS_MODULE,
374  };
375  
376  static int __init mark_tg_init(void)
377 diff --git a/net/netfilter/xt_SETXID.c b/net/netfilter/xt_SETXID.c
378 new file mode 100644
379 index 0000000..f8553c5
380 --- /dev/null
381 +++ b/net/netfilter/xt_SETXID.c
382 @@ -0,0 +1,77 @@
383 +#include <linux/module.h>
384 +#include <linux/skbuff.h>
385 +#include <linux/ip.h>
386 +#include <net/checksum.h>
387 +#include <linux/vs_network.h>
388 +
389 +#include <linux/netfilter/x_tables.h>
390 +#include <linux/netfilter/xt_SETXID.h>
391 +
392 +MODULE_LICENSE("GPL");
393 +MODULE_AUTHOR("");
394 +MODULE_DESCRIPTION("");
395 +MODULE_ALIAS("ipt_SETXID");
396 +
397 +/*
398 + * SETXID target handler. Uses the (skb, par) calling convention that
399 + * mark_tg() in xt_MARK.c has, since both are installed in
400 + * xt_target.target; the rule's argument block arrives in par->targinfo.
401 + * Always returns XT_CONTINUE.
402 + */
403 +static unsigned int
404 +target_v2(struct sk_buff *skb, const struct xt_target_param *par)
405 +{
406 +       const struct xt_setxid_target_info_v2 *setxidinfo = par->targinfo;
407 +
408 +       if (setxidinfo->mode == XT_SET_PACKET_XID)
409 +               skb->skb_tag = setxidinfo->mark; /* tag the packet with the rule's xid */
410 +
411 +       return XT_CONTINUE;
412 +}
413 +
414 +/*
415 + * Rule validation hook, in the compound-argument form that goes with
416 + * struct xt_target_param. Returns true to accept the rule and false
417 + * (after logging the offending mode) to reject it.
418 + */
419 +static bool
420 +checkentry_v2(const struct xt_tgchk_param *par)
421 +{
422 +       const struct xt_setxid_target_info_v2 *setxidinfo = par->targinfo;
423 +
424 +       if (setxidinfo->mode == XT_SET_PACKET_XID)
425 +               return true;
426 +
427 +       printk(KERN_WARNING "SETXID: unknown mode %u\n", setxidinfo->mode);
428 +       return false;
429 +}
430 +
431 +static struct xt_target xt_setxid_target[] = { /* single-entry table; init() and fini() pass it to xt_register_target()/xt_unregister_target() */
432 +       {
433 +        .name = "SETXID",
434 +        .family = AF_INET,
435 +        .revision = 2,
436 +        .checkentry = checkentry_v2, /* rejects any mode other than XT_SET_PACKET_XID */
437 +        .target = target_v2, /* per-packet handler: writes the rule's mark into skb_tag */
438 +        .targetsize = sizeof(struct xt_setxid_target_info_v2),
439 +        .table = "mangle", /* presumably restricts the target to the mangle table - confirm against x_tables */
440 +        .me = THIS_MODULE,
441 +        }
442 +};
443 +
444 +/*
445 + * Module entry point: register the SETXID target with x_tables and
446 + * return the registration status unchanged.
447 + */
448 +static int __init init(void)
449 +{
450 +       return xt_register_target(xt_setxid_target);
451 +}
452 +
453 +static void __exit fini(void) /* module exit: withdraw the SETXID target that init() registered */
454 +{
455 +       xt_unregister_target(xt_setxid_target);
456 +}
457 +
458 +module_init(init); /* names init() as the module's load-time entry point */
459 +module_exit(fini); /* names fini() as the module's unload-time exit point */