Fix for the recently reported accounting problem: Ticket #396 and Ticket #202.
[linux-2.6.git] / linux-2.6-522-iptables-connection-tagging.patch
1 diff -Nurb linux-2.6.22-521/include/linux/netfilter/xt_MARK.h linux-2.6.22-522/include/linux/netfilter/xt_MARK.h
2 --- linux-2.6.22-521/include/linux/netfilter/xt_MARK.h  2007-07-08 19:32:17.000000000 -0400
3 +++ linux-2.6.22-522/include/linux/netfilter/xt_MARK.h  2008-07-28 16:36:24.000000000 -0400
4 @@ -11,6 +11,7 @@
5         XT_MARK_SET=0,
6         XT_MARK_AND,
7         XT_MARK_OR,
8 +       XT_MARK_COPYXID,
9  };
10  
11  struct xt_mark_target_info_v1 {
12 diff -Nurb linux-2.6.22-521/include/linux/netfilter/xt_SETXID.h linux-2.6.22-522/include/linux/netfilter/xt_SETXID.h
13 --- linux-2.6.22-521/include/linux/netfilter/xt_SETXID.h        1969-12-31 19:00:00.000000000 -0500
14 +++ linux-2.6.22-522/include/linux/netfilter/xt_SETXID.h        2008-07-28 16:36:24.000000000 -0400
15 @@ -0,0 +1,14 @@
16 +#ifndef _XT_SETXID_H_target
17 +#define _XT_SETXID_H_target
18 +
19 +/* Version 1 */
20 +enum {
21 +       XT_SET_PACKET_XID=0
22 +};
23 +
24 +struct xt_setxid_target_info_v1 { /* NOTE(review): unsigned long differs in size between 32- and 64-bit builds — confirm compat handling */
25 +       unsigned long mark;     /* value the SETXID target copies into skb->skb_tag */
26 +       u_int8_t mode;          /* operation selector; XT_SET_PACKET_XID is the only value defined above */
27 +};
28 +
29 +#endif /*_XT_SETXID_H_target*/
30 diff -Nurb linux-2.6.22-521/include/linux/netfilter_ipv4/ipt_MARK.h linux-2.6.22-522/include/linux/netfilter_ipv4/ipt_MARK.h
31 --- linux-2.6.22-521/include/linux/netfilter_ipv4/ipt_MARK.h    2007-07-08 19:32:17.000000000 -0400
32 +++ linux-2.6.22-522/include/linux/netfilter_ipv4/ipt_MARK.h    2008-07-28 16:36:24.000000000 -0400
33 @@ -12,6 +12,7 @@
34  #define IPT_MARK_SET   XT_MARK_SET
35  #define IPT_MARK_AND   XT_MARK_AND
36  #define        IPT_MARK_OR     XT_MARK_OR
37 +#define IPT_MARK_COPYXID       XT_MARK_COPYXID
38  
39  #define ipt_mark_target_info_v1 xt_mark_target_info_v1
40  
41 diff -Nurb linux-2.6.22-521/include/linux/netfilter_ipv4/ipt_SETXID.h linux-2.6.22-522/include/linux/netfilter_ipv4/ipt_SETXID.h
42 --- linux-2.6.22-521/include/linux/netfilter_ipv4/ipt_SETXID.h  1969-12-31 19:00:00.000000000 -0500
43 +++ linux-2.6.22-522/include/linux/netfilter_ipv4/ipt_SETXID.h  2008-07-28 16:36:24.000000000 -0400
44 @@ -0,0 +1,13 @@
45 +#ifndef _IPT_SETXID_H_target
46 +#define _IPT_SETXID_H_target
47 +
48 +/* Backwards compatibility for old userspace */
49 +
50 +#include <linux/netfilter/xt_SETXID.h>
51 +
52 +/* Version 1 */
53 +#define IPT_SET_PACKET_XID     XT_SET_PACKET_XID
54 +
55 +#define ipt_setxid_target_info_v1 xt_setxid_target_info_v1
56 +
57 +#endif /*_IPT_SETXID_H_target*/
58 diff -Nurb linux-2.6.22-521/include/net/netfilter/nf_conntrack.h linux-2.6.22-522/include/net/netfilter/nf_conntrack.h
59 --- linux-2.6.22-521/include/net/netfilter/nf_conntrack.h       2007-07-08 19:32:17.000000000 -0400
60 +++ linux-2.6.22-522/include/net/netfilter/nf_conntrack.h       2008-07-28 16:36:24.000000000 -0400
61 @@ -131,6 +131,9 @@
62         /* Storage reserved for other modules: */
63         union nf_conntrack_proto proto;
64  
65 +       /* PLANETLAB. VNET-specific */
66 +       int xid[IP_CT_DIR_MAX];
67 +
68         /* features dynamically at the end: helper, nat (both optional) */
69         char data[0];
70  };
71 diff -Nurb linux-2.6.22-521/net/netfilter/Kconfig linux-2.6.22-522/net/netfilter/Kconfig
72 --- linux-2.6.22-521/net/netfilter/Kconfig      2007-07-08 19:32:17.000000000 -0400
73 +++ linux-2.6.22-522/net/netfilter/Kconfig      2008-07-28 16:36:24.000000000 -0400
74 @@ -389,6 +389,13 @@
75  
76           To compile it as a module, choose M here.  If unsure, say N.
77  
78 +config NETFILTER_XT_TARGET_SETXID
79 +       tristate '"SETXID" target support'
80 +       depends on NETFILTER_XTABLES
81 +       help
82 +         This option adds a `SETXID' target, which allows you to set the
83 +         xid tag (skb_tag) carried by a packet.
84 +
85  config NETFILTER_XT_MATCH_COMMENT
86         tristate  '"comment" match support'
87         depends on NETFILTER_XTABLES
88 diff -Nurb linux-2.6.22-521/net/netfilter/Makefile linux-2.6.22-522/net/netfilter/Makefile
89 --- linux-2.6.22-521/net/netfilter/Makefile     2007-07-08 19:32:17.000000000 -0400
90 +++ linux-2.6.22-522/net/netfilter/Makefile     2008-07-28 16:36:24.000000000 -0400
91 @@ -37,6 +37,7 @@
92  obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
93  
94  # targets
95 +obj-$(CONFIG_NETFILTER_XT_TARGET_SETXID) += xt_SETXID.o
96  obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o
97  obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o
98  obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o
99 diff -Nurb linux-2.6.22-521/net/netfilter/nf_conntrack_core.c linux-2.6.22-522/net/netfilter/nf_conntrack_core.c
100 --- linux-2.6.22-521/net/netfilter/nf_conntrack_core.c  2007-07-08 19:32:17.000000000 -0400
101 +++ linux-2.6.22-522/net/netfilter/nf_conntrack_core.c  2008-07-28 16:36:24.000000000 -0400
102 @@ -726,6 +726,8 @@
103  
104         /* Overload tuple linked list to put us in unconfirmed list. */
105         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
106 +       conntrack->xid[IP_CT_DIR_ORIGINAL] = -1;
107 +       conntrack->xid[IP_CT_DIR_REPLY] = -1;
108  
109         write_unlock_bh(&nf_conntrack_lock);
110  
111 diff -Nurb linux-2.6.22-521/net/netfilter/xt_MARK.c linux-2.6.22-522/net/netfilter/xt_MARK.c
112 --- linux-2.6.22-521/net/netfilter/xt_MARK.c    2007-07-08 19:32:17.000000000 -0400
113 +++ linux-2.6.22-522/net/netfilter/xt_MARK.c    2008-09-14 16:50:22.000000000 -0400
114 @@ -5,13 +5,18 @@
115   * This program is free software; you can redistribute it and/or modify
116   * it under the terms of the GNU General Public License version 2 as
117   * published by the Free Software Foundation.
118 + *
119   */
120  
121  #include <linux/module.h>
122 +#include <linux/version.h>
123  #include <linux/skbuff.h>
124  #include <linux/ip.h>
125  #include <net/checksum.h>
126 +#include <net/route.h>
127 +#include <net/inet_hashtables.h>
128  
129 +#include <net/netfilter/nf_conntrack.h>
130  #include <linux/netfilter/x_tables.h>
131  #include <linux/netfilter/xt_MARK.h>
132  
133 @@ -21,6 +26,50 @@
134  MODULE_ALIAS("ipt_MARK");
135  MODULE_ALIAS("ip6t_MARK");
136  
137 +#define PEERCRED_SET(x) (((x) != 0) && ((x) != (unsigned int)-1)) /* true when x is neither 0 nor (unsigned int)-1; evaluates x twice */
138 +/* Return the 16-bit destination-side identifier of a conntrack tuple, selected by tuple->dst.protonum. */
139 +static inline u_int16_t
140 +get_dst_port(struct nf_conntrack_tuple *tuple)
141 +{
142 +       switch (tuple->dst.protonum) {
143 +       case IPPROTO_GRE:
144 +               /* XXX Kernels older than 2.6.11 (#else branch): truncate 32-bit GRE key to 16 bits */
145 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11)
146 +               return tuple->dst.u.gre.key; /* returned as stored, no conversion */
147 +#else
148 +               return htons(ntohl(tuple->dst.u.gre.key));
149 +#endif  
150 +       case IPPROTO_ICMP:
151 +               /* Bind on ICMP echo ID; note it is read from the src half of the tuple */
152 +               return tuple->src.u.icmp.id;
153 +       case IPPROTO_TCP:
154 +               return tuple->dst.u.tcp.port;
155 +       case IPPROTO_UDP:
156 +               return tuple->dst.u.udp.port;
157 +       default:
158 +               return tuple->dst.u.all; /* other protocols: generic 'all' member */
159 +       }
160 +}
161 +/* Source-side counterpart of get_dst_port(): 16-bit source identifier picked by tuple->dst.protonum. */
162 +static inline u_int16_t
163 +get_src_port(struct nf_conntrack_tuple *tuple)
164 +{
165 +       switch (tuple->dst.protonum) {
166 +       case IPPROTO_GRE:
167 +               /* Return the key as stored, as get_dst_port() does on >= 2.6.11; htons(ntohl()) zeroes a 16-bit key on little-endian */
168 +               return tuple->src.u.gre.key;
169 +       case IPPROTO_ICMP:
170 +               /* Bind on ICMP echo ID */
171 +               return tuple->src.u.icmp.id;
172 +       case IPPROTO_TCP:
173 +               return tuple->src.u.tcp.port;
174 +       case IPPROTO_UDP:
175 +               return tuple->src.u.udp.port;
176 +       default:
177 +               return tuple->src.u.all; /* other protocols: generic 'all' member */
178 +       }
179 +}
180 +
181  static unsigned int
182  target_v0(struct sk_buff **pskb,
183           const struct net_device *in,
184 @@ -35,6 +84,10 @@
185         return XT_CONTINUE;
186  }
187  
188 +DECLARE_PER_CPU(int, sknid_elevator); /* per-CPU tag defined elsewhere; target_v1() may overwrite it on NF_IP_LOCAL_IN */
189 +
190 +#define related(ct) ((ct) == (IP_CT_IS_REPLY + IP_CT_RELATED))
191 +
192  static unsigned int
193  target_v1(struct sk_buff **pskb,
194           const struct net_device *in,
195 @@ -44,7 +97,20 @@
196           const void *targinfo)
197  {
198         const struct xt_mark_target_info_v1 *markinfo = targinfo;
199 -       int mark = 0;
200 +       enum ip_conntrack_info ctinfo;
201 +      struct sock *connection_sk;
202 +      int dif;
203 +      struct nf_conn *ct;
204 +      extern struct inet_hashinfo tcp_hashinfo;
205 +      enum ip_conntrack_dir dir;
206 +      int *curtag;
207 +      u_int32_t src_ip;
208 +      u_int32_t dst_ip;
209 +      u_int16_t proto, src_port;
210 +      u_int32_t ip;
211 +      u_int16_t port;
212 +
213 +       int mark = -1;
214  
215         switch (markinfo->mode) {
216         case XT_MARK_SET:
217 @@ -58,13 +124,107 @@
218         case XT_MARK_OR:
219                 mark = (*pskb)->mark | markinfo->mark;
220                 break;
221 +
222 +                       case XT_MARK_COPYXID: 
223 +                                       dif = ((struct rtable *)(*pskb)->dst)->rt_iif;
224 +
225 +                                       ct = nf_ct_get((*pskb), &ctinfo);
226 +                                       if (!ct) 
227 +                                                       break;
228 +
229 +                                       dir = CTINFO2DIR(ctinfo);
230 +                                       src_ip = ct->tuplehash[dir].tuple.src.u3.ip;
231 +                                       dst_ip = ct->tuplehash[dir].tuple.dst.u3.ip;
232 +                                       src_port = get_src_port(&ct->tuplehash[dir].tuple);
233 +                                       proto = ct->tuplehash[dir].tuple.dst.protonum;
234 +
235 +                                       ip = ct->tuplehash[dir].tuple.dst.u3.ip;
236 +                                       port = get_dst_port(&ct->tuplehash[dir].tuple);
237 +
238 +                                       if (proto == 1 || proto == 17) {
239 +                                                       if ((*pskb)->mark>0) /* The packet is marked, it's going out */
240 +                                                       {
241 +                                                                       ct->xid[0]=(*pskb)->mark;
242         }
243  
244 +                                                       if (ct->xid[0] > 0) {
245 +                                                                       mark = ct->xid[0];
246 +                                                       }
247 +
248 +                                       }
249 +                                       else if (proto == 6) /* TCP */{
250 +                                                       int sockettype=0; /* Established socket */
251 +                                                       /* Looks for an established socket or a listening socket corresponding to the 4-tuple, in
252 +                                                        * that order. The order is important for Codemux connections to be handled properly */
253 +
254 +                                                       connection_sk = inet_lookup_established(&tcp_hashinfo, src_ip, src_port, ip, port, dif);
255 +
256 +                                                       if (!connection_sk) {
257 +                                                               connection_sk = inet_lookup_listener(&tcp_hashinfo, ip, port, dif);
258 +                                                               sockettype=1; /* Listening socket */
259 +                                                       }
260 +
261 +                                                       if (connection_sk) {
262 +                                                                       /* The peercred is not set. We set it if the other side has an xid. */
263 +                                                                       if (!PEERCRED_SET(connection_sk->sk_peercred.uid)
264 +                                                                                                       && ct->xid[!dir]>0 && (sockettype==0)) {
265 +                                                                                       connection_sk->sk_peercred.gid = connection_sk->sk_peercred.uid = ct->xid[!dir];
266 +                                                                       }
267 +
268 +                                                                       /* The peercred is set, and is not equal to the XID of 'the other side' */
269 +                                                                       else if (PEERCRED_SET(connection_sk->sk_peercred.uid) && (connection_sk->sk_peercred.uid != ct->xid[!dir]) && (sockettype==0)) {
270 +                                                                                       mark = connection_sk->sk_peercred.uid;
271 +                                                                       }
272 +
273 +                                                                       /* Has this connection already been tagged? */
274 +                                                                       if (ct->xid[dir] < 1) {
275 +                                                                                       /* No - let's tag it */ 
276 +                                                                                       ct->xid[dir]=connection_sk->sk_nid;
277 +
278 +                                                                       }
279 +
280 +                                                                       if (mark==-1 && (ct->xid[dir]!= 0))
281 +                                                                                       mark = ct->xid[dir];
282 +
283 +                                                                       if (connection_sk->sk_state == TCP_TIME_WAIT) {
284 +                                                                                       inet_twsk_put(inet_twsk(connection_sk));
285 +                                                                                       break;
286 +                                                                       }
287 +                                                                       else
288 +                                                                                       sock_put(connection_sk);
289 +                                                       }
290 +
291 +                                                       /* All else failed. Is this a connection over raw sockets? That explains
292 +                                                        * why we couldn't get anything out of skb->sk, or look up a "real" connection.*/
293 +                                                       if (ct->xid[dir]<1) {
294 +                                                                       if ((*pskb)->skb_tag) {
295 +                                                                                       ct->xid[dir]=(*pskb)->skb_tag;
296 +                                                                       }
297 +                                                       }
298 +
299 +                                                       /* Covers CoDemux case */
300 +                                                       if (mark < 1 && (ct->xid[dir]>0)) {
301 +                                                                       mark = ct->xid[dir];
302 +                                                       }
303 +
304 +                                                       if (mark < 1 && (ct->xid[!dir]>0)) {
305 +                                                                       mark = ct->xid[!dir];
306 +                                                       }
307 +                                                       break;
308 +                                       }
309 +       }
310 +       if (mark != -1) {
311         (*pskb)->mark = mark;
312 +       }
313 +
314 +       curtag=&__get_cpu_var(sknid_elevator);
315 +       if (mark > 0 && *curtag==-2 && hooknum==NF_IP_LOCAL_IN) 
316 +       {
317 +               *curtag = mark;
318 +       }
319         return XT_CONTINUE;
320  }
321  
322 -
323  static int
324  checkentry_v0(const char *tablename,
325               const void *entry,
326 @@ -92,7 +252,8 @@
327  
328         if (markinfo->mode != XT_MARK_SET
329             && markinfo->mode != XT_MARK_AND
330 -           && markinfo->mode != XT_MARK_OR) {
331 +           && markinfo->mode != XT_MARK_OR
332 +           && markinfo->mode != XT_MARK_COPYXID) {
333                 printk(KERN_WARNING "MARK: unknown mode %u\n",
334                        markinfo->mode);
335                 return 0;
336 diff -Nurb linux-2.6.22-521/net/netfilter/xt_SETXID.c linux-2.6.22-522/net/netfilter/xt_SETXID.c
337 --- linux-2.6.22-521/net/netfilter/xt_SETXID.c  1969-12-31 19:00:00.000000000 -0500
338 +++ linux-2.6.22-522/net/netfilter/xt_SETXID.c  2008-07-28 16:36:24.000000000 -0400
339 @@ -0,0 +1,79 @@
340 +#include <linux/module.h>
341 +#include <linux/skbuff.h>
342 +#include <linux/ip.h>
343 +#include <net/checksum.h>
344 +#include <linux/vs_network.h>
345 +
346 +#include <linux/netfilter/x_tables.h>
347 +#include <linux/netfilter/xt_SETXID.h>
348 +
349 +MODULE_LICENSE("GPL");
350 +MODULE_AUTHOR("");
351 +MODULE_DESCRIPTION("x_tables SETXID target: set the xid tag (skb_tag) of a packet");
352 +MODULE_ALIAS("ipt_SETXID");
353 +
354 +static unsigned int /* SETXID target hook: stamps targinfo's mark onto the packet's skb_tag */
355 +target_v1(struct sk_buff **pskb,
356 +         const struct net_device *in,
357 +         const struct net_device *out,
358 +         unsigned int hooknum,
359 +         const struct xt_target *target,
360 +         const void *targinfo)
361 +{
362 +       const struct xt_setxid_target_info_v1 *info = targinfo;
363 +
364 +       /* XT_SET_PACKET_XID is the only mode acted on; */
365 +       /* any other value leaves the skb untouched. */
366 +       if (info->mode == XT_SET_PACKET_XID)
367 +               (*pskb)->skb_tag = info->mark;
368 +
369 +       return XT_CONTINUE; /* returned unconditionally */
370 +}
371 +
372 +
373 +/* Validate targinfo: returns 1 when mode is XT_SET_PACKET_XID, otherwise logs a warning and returns 0. */
374 +static int
375 +checkentry_v1(const char *tablename,
376 +             const void *entry,
377 +             const struct xt_target *target,
378 +             void *targinfo,
379 +             unsigned int hook_mask)
380 +{
381 +       struct xt_setxid_target_info_v1 *info = targinfo;
382 +
383 +       if (info->mode == XT_SET_PACKET_XID)
384 +               return 1;
385 +
386 +       /* Unrecognised mode: log it and return 0. */
387 +       printk(KERN_WARNING "SETXID: unknown mode %u\n", info->mode);
388 +       return 0;
389 +}
390 +/* Targets this module registers: one revision-1 "SETXID" entry for AF_INET, table "mangle". */
391 +static struct xt_target xt_setxid_target[] = {
392 +       {
393 +               .name           = "SETXID",
394 +               .revision       = 1,
395 +               .family         = AF_INET,
396 +               .table          = "mangle",
397 +               .targetsize     = sizeof(struct xt_setxid_target_info_v1),
398 +               .target         = target_v1,
399 +               .checkentry     = checkentry_v1,
400 +               .me             = THIS_MODULE,
401 +       },
402 +};
403 +
404 +/*
405 + * Module load: register xt_setxid_target[]; the xt_register_targets() status is returned as-is.
406 + */
407 +static int __init init(void)
408 +{
409 +       return xt_register_targets(xt_setxid_target, ARRAY_SIZE(xt_setxid_target));
410 +}
411 +/* Module unload (registered via module_exit): unregister every entry of xt_setxid_target. */
412 +static void __exit fini(void)
413 +{
414 +       xt_unregister_targets(xt_setxid_target, ARRAY_SIZE(xt_setxid_target));
415 +}
416 +
417 +module_init(init);
418 +module_exit(fini);