This commit was manufactured by cvs2svn to create branch 'vserver'.
[linux-2.6.git] / net / ipv4 / netfilter / ip_set_iptree.c
1 /* Copyright (C) 2005 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
2  *
3  * This program is free software; you can redistribute it and/or modify
4  * it under the terms of the GNU General Public License version 2 as
5  * published by the Free Software Foundation.  
6  */
7
8 /* Kernel module implementing an IP set type: the iptree type */
9
10 #include <linux/module.h>
11 #include <linux/ip.h>
12 #include <linux/skbuff.h>
13 #include <linux/slab.h>
14 #include <linux/delay.h>
15 #include <linux/netfilter_ipv4/ip_tables.h>
16 #include <linux/netfilter_ipv4/ip_set.h>
17 #include <linux/errno.h>
18 #include <asm/uaccess.h>
19 #include <asm/bitops.h>
20 #include <linux/spinlock.h>
21
22 /* Backward compatibility */
23 #ifndef __nocast
24 #define __nocast
25 #endif
26
27 #include <linux/netfilter_ipv4/ip_set_iptree.h>
28
29 /* Garbage collection interval in seconds: */
30 #define IPTREE_GC_TIME          5*60
31 /* Sleep so many milliseconds before trying again 
32  * to delete the gc timer at destroying/flushing a set */ 
33 #define IPTREE_DESTROY_SLEEP    100
34
35 static kmem_cache_t *branch_cachep;
36 static kmem_cache_t *leaf_cachep;
37
38 #define ABCD(a,b,c,d,addrp) do {                \
39         a = ((unsigned char *)addrp)[3];        \
40         b = ((unsigned char *)addrp)[2];        \
41         c = ((unsigned char *)addrp)[1];        \
42         d = ((unsigned char *)addrp)[0];        \
43 } while (0)
44
45 #define TESTIP_WALK(map, elem, branch) do {     \
46         if ((map)->tree[elem]) {                \
47                 branch = (map)->tree[elem];     \
48         } else                                  \
49                 return 0;                       \
50 } while (0)
51
52 static inline int
53 __testip(struct ip_set *set, ip_set_ip_t ip, ip_set_ip_t *hash_ip)
54 {
55         struct ip_set_iptree *map = (struct ip_set_iptree *) set->data;
56         struct ip_set_iptreeb *btree;
57         struct ip_set_iptreec *ctree;
58         struct ip_set_iptreed *dtree;
59         unsigned char a,b,c,d;
60         
61         *hash_ip = ip;
62         ABCD(a, b, c, d, hash_ip);
63         DP("%u %u %u %u timeout %u", a, b, c, d, map->timeout);
64         TESTIP_WALK(map, a, btree);
65         TESTIP_WALK(btree, b, ctree);
66         TESTIP_WALK(ctree, c, dtree);
67         DP("%lu %lu", dtree->expires[d], jiffies);
68         return !!(map->timeout ? (time_after(dtree->expires[d], jiffies))
69                                : dtree->expires[d]);
70 }
71
72 static int
73 testip(struct ip_set *set, const void *data, size_t size,
74        ip_set_ip_t *hash_ip)
75 {
76         struct ip_set_req_iptree *req = 
77             (struct ip_set_req_iptree *) data;
78
79         if (size != sizeof(struct ip_set_req_iptree)) {
80                 ip_set_printk("data length wrong (want %zu, have %zu)",
81                               sizeof(struct ip_set_req_iptree),
82                               size);
83                 return -EINVAL;
84         }
85         return __testip(set, req->ip, hash_ip);
86 }
87
88 static int
89 testip_kernel(struct ip_set *set, 
90               const struct sk_buff *skb,
91               ip_set_ip_t *hash_ip,
92               const u_int32_t *flags,
93               unsigned char index)
94 {
95         int res;
96         
97         DP("flag: %s src: %u.%u.%u.%u dst: %u.%u.%u.%u",
98            flags[index] & IPSET_SRC ? "SRC" : "DST",
99            NIPQUAD(skb->nh.iph->saddr),
100            NIPQUAD(skb->nh.iph->daddr));
101
102         res =  __testip(set,
103                         ntohl(flags[index] & IPSET_SRC 
104                                 ? skb->nh.iph->saddr 
105                                 : skb->nh.iph->daddr),
106                         hash_ip);
107         return (res < 0 ? 0 : res);
108 }
109
110 #define ADDIP_WALK(map, elem, branch, type, cachep, flags) do { \
111         if ((map)->tree[elem]) {                                \
112                 DP("found %u", elem);                           \
113                 branch = (map)->tree[elem];                     \
114         } else {                                                \
115                 branch = (type *)                               \
116                         kmem_cache_alloc(cachep, flags);        \
117                 if (branch == NULL)                             \
118                         return -ENOMEM;                         \
119                 memset(branch, 0, sizeof(*branch));             \
120                 (map)->tree[elem] = branch;                     \
121                 DP("alloc %u", elem);                           \
122         }                                                       \
123 } while (0)     
124
125 static inline int
126 __addip(struct ip_set *set, ip_set_ip_t ip, unsigned int timeout,
127         ip_set_ip_t *hash_ip,
128         unsigned int __nocast flags)
129 {
130         struct ip_set_iptree *map = (struct ip_set_iptree *) set->data;
131         struct ip_set_iptreeb *btree;
132         struct ip_set_iptreec *ctree;
133         struct ip_set_iptreed *dtree;
134         unsigned char a,b,c,d;
135         int ret = 0;
136         
137         *hash_ip = ip;
138         ABCD(a, b, c, d, hash_ip);
139         DP("%u %u %u %u timeout %u", a, b, c, d, timeout);
140         ADDIP_WALK(map, a, btree, struct ip_set_iptreeb, branch_cachep, flags);
141         ADDIP_WALK(btree, b, ctree, struct ip_set_iptreec, branch_cachep, flags);
142         ADDIP_WALK(ctree, c, dtree, struct ip_set_iptreed, leaf_cachep, flags);
143         if (dtree->expires[d]
144             && (!map->timeout || time_after(dtree->expires[d], jiffies)))
145                 ret = -EEXIST;
146         dtree->expires[d] = map->timeout ? (timeout * HZ + jiffies) : 1;
147         /* Lottery */
148         if (dtree->expires[d] == 0)
149                 dtree->expires[d] = 1;
150         DP("%u %lu", d, dtree->expires[d]);
151         return ret;
152 }
153
154 static int
155 addip(struct ip_set *set, const void *data, size_t size,
156       ip_set_ip_t *hash_ip)
157 {
158         struct ip_set_iptree *map = (struct ip_set_iptree *) set->data;
159         struct ip_set_req_iptree *req = 
160                 (struct ip_set_req_iptree *) data;
161
162         if (size != sizeof(struct ip_set_req_iptree)) {
163                 ip_set_printk("data length wrong (want %zu, have %zu)",
164                               sizeof(struct ip_set_req_iptree),
165                               size);
166                 return -EINVAL;
167         }
168         DP("%u.%u.%u.%u %u", HIPQUAD(req->ip), req->timeout);
169         return __addip(set, req->ip,
170                        req->timeout ? req->timeout : map->timeout,
171                        hash_ip,
172                        GFP_ATOMIC);
173 }
174
175 static int
176 addip_kernel(struct ip_set *set, 
177              const struct sk_buff *skb,
178              ip_set_ip_t *hash_ip,
179              const u_int32_t *flags,
180              unsigned char index)
181 {
182         struct ip_set_iptree *map = (struct ip_set_iptree *) set->data;
183
184         return __addip(set,
185                        ntohl(flags[index] & IPSET_SRC 
186                                 ? skb->nh.iph->saddr 
187                                 : skb->nh.iph->daddr),
188                        map->timeout,
189                        hash_ip,
190                        GFP_ATOMIC);
191 }
192
193 #define DELIP_WALK(map, elem, branch) do {      \
194         if ((map)->tree[elem]) {                \
195                 branch = (map)->tree[elem];     \
196         } else                                  \
197                 return -EEXIST;                 \
198 } while (0)
199
200 static inline int 
201 __delip(struct ip_set *set, ip_set_ip_t ip, ip_set_ip_t *hash_ip)
202 {
203         struct ip_set_iptree *map = (struct ip_set_iptree *) set->data;
204         struct ip_set_iptreeb *btree;
205         struct ip_set_iptreec *ctree;
206         struct ip_set_iptreed *dtree;
207         unsigned char a,b,c,d;
208         
209         *hash_ip = ip;
210         ABCD(a, b, c, d, hash_ip);
211         DELIP_WALK(map, a, btree);
212         DELIP_WALK(btree, b, ctree);
213         DELIP_WALK(ctree, c, dtree);
214
215         if (dtree->expires[d]) {
216                 dtree->expires[d] = 0;
217                 return 0;
218         }
219         return -EEXIST;
220 }
221
222 static int
223 delip(struct ip_set *set, const void *data, size_t size,
224       ip_set_ip_t *hash_ip)
225 {
226         struct ip_set_req_iptree *req =
227             (struct ip_set_req_iptree *) data;
228
229         if (size != sizeof(struct ip_set_req_iptree)) {
230                 ip_set_printk("data length wrong (want %zu, have %zu)",
231                               sizeof(struct ip_set_req_iptree),
232                               size);
233                 return -EINVAL;
234         }
235         return __delip(set, req->ip, hash_ip);
236 }
237
238 static int
239 delip_kernel(struct ip_set *set, 
240              const struct sk_buff *skb,
241              ip_set_ip_t *hash_ip,
242              const u_int32_t *flags,
243              unsigned char index)
244 {
245         return __delip(set,
246                        ntohl(flags[index] & IPSET_SRC 
247                                 ? skb->nh.iph->saddr 
248                                 : skb->nh.iph->daddr),
249                        hash_ip);
250 }
251
252 #define LOOP_WALK_BEGIN(map, i, branch) \
253         for (i = 0; i < 256; i++) {     \
254                 if (!(map)->tree[i])    \
255                         continue;       \
256                 branch = (map)->tree[i]
257
258 #define LOOP_WALK_END }
259
260 static void ip_tree_gc(unsigned long ul_set)
261 {
262         struct ip_set *set = (void *) ul_set;
263         struct ip_set_iptree *map = (struct ip_set_iptree *) set->data;
264         struct ip_set_iptreeb *btree;
265         struct ip_set_iptreec *ctree;
266         struct ip_set_iptreed *dtree;
267         unsigned int a,b,c,d;
268         unsigned char i,j,k;
269
270         i = j = k = 0;
271         DP("gc: %s", set->name);
272         write_lock_bh(&set->lock);
273         LOOP_WALK_BEGIN(map, a, btree);
274         LOOP_WALK_BEGIN(btree, b, ctree);
275         LOOP_WALK_BEGIN(ctree, c, dtree);
276         for (d = 0; d < 256; d++) {
277                 if (dtree->expires[d]) {
278                         DP("gc: %u %u %u %u: expires %lu jiffies %lu",
279                             a, b, c, d,
280                             dtree->expires[d], jiffies);
281                         if (map->timeout
282                             && time_before(dtree->expires[d], jiffies))
283                                 dtree->expires[d] = 0;
284                         else
285                                 k = 1;
286                 }
287         }
288         if (k == 0) {
289                 DP("gc: %s: leaf %u %u %u empty",
290                     set->name, a, b, c);
291                 kmem_cache_free(leaf_cachep, dtree);
292                 ctree->tree[c] = NULL;
293         } else {
294                 DP("gc: %s: leaf %u %u %u not empty",
295                     set->name, a, b, c);
296                 j = 1;
297                 k = 0;
298         }
299         LOOP_WALK_END;
300         if (j == 0) {
301                 DP("gc: %s: branch %u %u empty",
302                     set->name, a, b);
303                 kmem_cache_free(branch_cachep, ctree);
304                 btree->tree[b] = NULL;
305         } else {
306                 DP("gc: %s: branch %u %u not empty",
307                     set->name, a, b);
308                 i = 1;
309                 j = k = 0;
310         }
311         LOOP_WALK_END;
312         if (i == 0) {
313                 DP("gc: %s: branch %u empty",
314                     set->name, a);
315                 kmem_cache_free(branch_cachep, btree);
316                 map->tree[a] = NULL;
317         } else {
318                 DP("gc: %s: branch %u not empty",
319                     set->name, a);
320                 i = j = k = 0;
321         }
322         LOOP_WALK_END;
323         write_unlock_bh(&set->lock);
324         
325         map->gc.expires = jiffies + map->gc_interval * HZ;
326         add_timer(&map->gc);
327 }
328
329 static inline void init_gc_timer(struct ip_set *set)
330 {
331         struct ip_set_iptree *map = (struct ip_set_iptree *) set->data;
332
333         /* Even if there is no timeout for the entries,
334          * we still have to call gc because delete
335          * do not clean up empty branches */
336         map->gc_interval = IPTREE_GC_TIME;
337         init_timer(&map->gc);
338         map->gc.data = (unsigned long) set;
339         map->gc.function = ip_tree_gc;
340         map->gc.expires = jiffies + map->gc_interval * HZ;
341         add_timer(&map->gc);
342 }
343
344 static int create(struct ip_set *set, const void *data, size_t size)
345 {
346         struct ip_set_req_iptree_create *req =
347             (struct ip_set_req_iptree_create *) data;
348         struct ip_set_iptree *map;
349
350         if (size != sizeof(struct ip_set_req_iptree_create)) {
351                 ip_set_printk("data length wrong (want %zu, have %zu)",
352                               sizeof(struct ip_set_req_iptree_create),
353                               size);
354                 return -EINVAL;
355         }
356
357         map = kmalloc(sizeof(struct ip_set_iptree), GFP_KERNEL);
358         if (!map) {
359                 DP("out of memory for %d bytes",
360                    sizeof(struct ip_set_iptree));
361                 return -ENOMEM;
362         }
363         memset(map, 0, sizeof(*map));
364         map->timeout = req->timeout;
365         set->data = map;
366
367         init_gc_timer(set);
368
369         return 0;
370 }
371
372 static void __flush(struct ip_set_iptree *map)
373 {
374         struct ip_set_iptreeb *btree;
375         struct ip_set_iptreec *ctree;
376         struct ip_set_iptreed *dtree;
377         unsigned int a,b,c;
378
379         LOOP_WALK_BEGIN(map, a, btree);
380         LOOP_WALK_BEGIN(btree, b, ctree);
381         LOOP_WALK_BEGIN(ctree, c, dtree);
382         kmem_cache_free(leaf_cachep, dtree);
383         LOOP_WALK_END;
384         kmem_cache_free(branch_cachep, ctree);
385         LOOP_WALK_END;
386         kmem_cache_free(branch_cachep, btree);
387         LOOP_WALK_END;
388 }
389
390 static void destroy(struct ip_set *set)
391 {
392         struct ip_set_iptree *map = (struct ip_set_iptree *) set->data;
393
394         /* gc might be running */
395         while (!del_timer(&map->gc))
396                 msleep(IPTREE_DESTROY_SLEEP);
397         __flush(map);
398         kfree(map);
399         set->data = NULL;
400 }
401
402 static void flush(struct ip_set *set)
403 {
404         struct ip_set_iptree *map = (struct ip_set_iptree *) set->data;
405         unsigned int timeout = map->timeout;
406         
407         /* gc might be running */
408         while (!del_timer(&map->gc))
409                 msleep(IPTREE_DESTROY_SLEEP);
410         __flush(map);
411         memset(map, 0, sizeof(*map));
412         map->timeout = timeout;
413
414         init_gc_timer(set);
415 }
416
417 static void list_header(const struct ip_set *set, void *data)
418 {
419         struct ip_set_iptree *map = (struct ip_set_iptree *) set->data;
420         struct ip_set_req_iptree_create *header =
421             (struct ip_set_req_iptree_create *) data;
422
423         header->timeout = map->timeout;
424 }
425
426 static int list_members_size(const struct ip_set *set)
427 {
428         struct ip_set_iptree *map = (struct ip_set_iptree *) set->data;
429         struct ip_set_iptreeb *btree;
430         struct ip_set_iptreec *ctree;
431         struct ip_set_iptreed *dtree;
432         unsigned int a,b,c,d;
433         unsigned int count = 0;
434
435         LOOP_WALK_BEGIN(map, a, btree);
436         LOOP_WALK_BEGIN(btree, b, ctree);
437         LOOP_WALK_BEGIN(ctree, c, dtree);
438         for (d = 0; d < 256; d++) {
439                 if (dtree->expires[d]
440                     && (!map->timeout || time_after(dtree->expires[d], jiffies)))
441                         count++;
442         }
443         LOOP_WALK_END;
444         LOOP_WALK_END;
445         LOOP_WALK_END;
446
447         DP("members %u", count);
448         return (count * sizeof(struct ip_set_req_iptree));
449 }
450
451 static void list_members(const struct ip_set *set, void *data)
452 {
453         struct ip_set_iptree *map = (struct ip_set_iptree *) set->data;
454         struct ip_set_iptreeb *btree;
455         struct ip_set_iptreec *ctree;
456         struct ip_set_iptreed *dtree;
457         unsigned int a,b,c,d;
458         size_t offset = 0;
459         struct ip_set_req_iptree *entry;
460
461         LOOP_WALK_BEGIN(map, a, btree);
462         LOOP_WALK_BEGIN(btree, b, ctree);
463         LOOP_WALK_BEGIN(ctree, c, dtree);
464         for (d = 0; d < 256; d++) {
465                 if (dtree->expires[d]
466                     && (!map->timeout || time_after(dtree->expires[d], jiffies))) {
467                         entry = (struct ip_set_req_iptree *)(data + offset);
468                         entry->ip = ((a << 24) | (b << 16) | (c << 8) | d);
469                         entry->timeout = !map->timeout ? 0 
470                                 : (dtree->expires[d] - jiffies)/HZ;
471                         offset += sizeof(struct ip_set_req_iptree);
472                 }
473         }
474         LOOP_WALK_END;
475         LOOP_WALK_END;
476         LOOP_WALK_END;
477 }
478
479 static struct ip_set_type ip_set_iptree = {
480         .typename               = SETTYPE_NAME,
481         .features               = IPSET_TYPE_IP | IPSET_DATA_SINGLE,
482         .protocol_version       = IP_SET_PROTOCOL_VERSION,
483         .create                 = &create,
484         .destroy                = &destroy,
485         .flush                  = &flush,
486         .reqsize                = sizeof(struct ip_set_req_iptree),
487         .addip                  = &addip,
488         .addip_kernel           = &addip_kernel,
489         .delip                  = &delip,
490         .delip_kernel           = &delip_kernel,
491         .testip                 = &testip,
492         .testip_kernel          = &testip_kernel,
493         .header_size            = sizeof(struct ip_set_req_iptree_create),
494         .list_header            = &list_header,
495         .list_members_size      = &list_members_size,
496         .list_members           = &list_members,
497         .me                     = THIS_MODULE,
498 };
499
500 MODULE_LICENSE("GPL");
501 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
502 MODULE_DESCRIPTION("iptree type of IP sets");
503
504 static int __init init(void)
505 {
506         int ret;
507         
508         branch_cachep = kmem_cache_create("ip_set_iptreeb",
509                                 sizeof(struct ip_set_iptreeb),
510                                 0, 0, NULL, NULL);
511         if (!branch_cachep) {
512                 printk(KERN_ERR "Unable to create ip_set_iptreeb slab cache\n");
513                 ret = -ENOMEM;
514                 goto out;
515         }
516         leaf_cachep = kmem_cache_create("ip_set_iptreed",
517                                 sizeof(struct ip_set_iptreed),
518                                 0, 0, NULL, NULL);
519         if (!leaf_cachep) {
520                 printk(KERN_ERR "Unable to create ip_set_iptreed slab cache\n");
521                 ret = -ENOMEM;
522                 goto free_branch;
523         }
524         ret = ip_set_register_set_type(&ip_set_iptree);
525         if (ret == 0)
526                 goto out;
527
528         kmem_cache_destroy(leaf_cachep);
529     free_branch:        
530         kmem_cache_destroy(branch_cachep);
531     out:
532         return ret;
533 }
534
535 static void __exit fini(void)
536 {
537         /* FIXME: possible race with ip_set_create() */
538         ip_set_unregister_set_type(&ip_set_iptree);
539         kmem_cache_destroy(leaf_cachep);
540         kmem_cache_destroy(branch_cachep);
541 }
542
543 module_init(init);
544 module_exit(fini);