This commit was manufactured by cvs2svn to create branch 'vserver'.
[linux-2.6.git] / net / ipv4 / netfilter / ip_tables.c
1 /*
2  * Packet matching code.
3  *
4  * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
5  * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License version 2 as
9  * published by the Free Software Foundation.
10  *
11  * 19 Jan 2002 Harald Welte <laforge@gnumonks.org>
12  *      - increase module usage count as soon as we have rules inside
13  *        a table
14  * 08 Oct 2005 Harald Welte <lafore@netfilter.org>
15  *      - Generalize into "x_tables" layer and "{ip,ip6,arp}_tables"
16  */
17 #include <linux/config.h>
18 #include <linux/cache.h>
19 #include <linux/capability.h>
20 #include <linux/skbuff.h>
21 #include <linux/kmod.h>
22 #include <linux/vmalloc.h>
23 #include <linux/netdevice.h>
24 #include <linux/module.h>
25 #include <linux/icmp.h>
26 #include <net/ip.h>
27 #include <asm/uaccess.h>
28 #include <asm/semaphore.h>
29 #include <linux/proc_fs.h>
30 #include <linux/err.h>
31 #include <linux/cpumask.h>
32
33 #include <linux/netfilter/x_tables.h>
34 #include <linux/netfilter_ipv4/ip_tables.h>
35
36 MODULE_LICENSE("GPL");
37 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
38 MODULE_DESCRIPTION("IPv4 packet filter");
39
/*#define DEBUG_IP_FIREWALL*/
/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
/*#define DEBUG_IP_FIREWALL_USER*/

/* dprintf(): debug output on the packet (fast) path; compiles to
 * nothing unless DEBUG_IP_FIREWALL is defined above. */
#ifdef DEBUG_IP_FIREWALL
#define dprintf(format, args...)  printk(format , ## args)
#else
#define dprintf(format, args...)
#endif

/* duprintf(): debug output for the userspace (ruleset load/check)
 * path; compiles to nothing unless DEBUG_IP_FIREWALL_USER is set. */
#ifdef DEBUG_IP_FIREWALL_USER
#define duprintf(format, args...) printk(format , ## args)
#else
#define duprintf(format, args...)
#endif

/* IP_NF_ASSERT(): soft assertion — logs location but does not oops,
 * and is compiled out entirely without CONFIG_NETFILTER_DEBUG. */
#ifdef CONFIG_NETFILTER_DEBUG
#define IP_NF_ASSERT(x)                                         \
do {                                                            \
        if (!(x))                                               \
                printk("IP_NF_ASSERT: %s:%s:%u\n",              \
                       __FUNCTION__, __FILE__, __LINE__);       \
} while(0)
#else
#define IP_NF_ASSERT(x)
#endif

/* Flip to #if 1 to strip `static'/`inline' so that every helper is a
 * visible, non-inlined symbol for the debugger. */
#if 0
/* All the better to debug you with... */
#define static
#define inline
#endif
72
73 /*
74    We keep a set of rules for each CPU, so we can avoid write-locking
75    them in the softirq when updating the counters and therefore
76    only need to read-lock in the softirq; doing a write_lock_bh() in user
77    context stops packets coming through and allows user context to read
78    the counters or update the rules.
79
80    Hence the start of any table is given by get_table() below.  */
81
/* Returns whether matches rule or not. */
static inline int
ip_packet_match(const struct iphdr *ip,
                const char *indev,
                const char *outdev,
                const struct ipt_ip *ipinfo,
                int isfrag)
{
        size_t i;
        unsigned long ret;

/* XOR the raw test result with the rule's inversion bit: yields true
 * when the packet fails the (possibly inverted) condition. */
#define FWINV(bool,invflg) ((bool) ^ !!(ipinfo->invflags & invflg))

        /* Masked source/destination address comparison. */
        if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
                  IPT_INV_SRCIP)
            || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
                     IPT_INV_DSTIP)) {
                dprintf("Source or dest mismatch.\n");

                dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n",
                        NIPQUAD(ip->saddr),
                        NIPQUAD(ipinfo->smsk.s_addr),
                        NIPQUAD(ipinfo->src.s_addr),
                        ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : "");
                dprintf("DST: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n",
                        NIPQUAD(ip->daddr),
                        NIPQUAD(ipinfo->dmsk.s_addr),
                        NIPQUAD(ipinfo->dst.s_addr),
                        ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : "");
                return 0;
        }

        /* Look for ifname matches; this should unroll nicely. */
        /* Compares IFNAMSIZ bytes one unsigned long at a time; the
         * name buffers are long-aligned by construction. */
        for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
                ret |= (((const unsigned long *)indev)[i]
                        ^ ((const unsigned long *)ipinfo->iniface)[i])
                        & ((const unsigned long *)ipinfo->iniface_mask)[i];
        }

        if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
                dprintf("VIA in mismatch (%s vs %s).%s\n",
                        indev, ipinfo->iniface,
                        ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":"");
                return 0;
        }

        /* Same masked comparison for the output interface name. */
        for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
                ret |= (((const unsigned long *)outdev)[i]
                        ^ ((const unsigned long *)ipinfo->outiface)[i])
                        & ((const unsigned long *)ipinfo->outiface_mask)[i];
        }

        if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
                dprintf("VIA out mismatch (%s vs %s).%s\n",
                        outdev, ipinfo->outiface,
                        ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":"");
                return 0;
        }

        /* Check specific protocol */
        if (ipinfo->proto
            && FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
                dprintf("Packet protocol %hi does not match %hi.%s\n",
                        ip->protocol, ipinfo->proto,
                        ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
                return 0;
        }

        /* If we have a fragment rule but the packet is not a fragment
         * then we return zero */
        if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) {
                dprintf("Fragment rule but not fragment.%s\n",
                        ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : "");
                return 0;
        }

        return 1;
}
160
161 static inline int
162 ip_checkentry(const struct ipt_ip *ip)
163 {
164         if (ip->flags & ~IPT_F_MASK) {
165                 duprintf("Unknown flag bits set: %08X\n",
166                          ip->flags & ~IPT_F_MASK);
167                 return 0;
168         }
169         if (ip->invflags & ~IPT_INV_MASK) {
170                 duprintf("Unknown invflag bits set: %08X\n",
171                          ip->invflags & ~IPT_INV_MASK);
172                 return 0;
173         }
174         return 1;
175 }
176
177 static unsigned int
178 ipt_error(struct sk_buff **pskb,
179           const struct net_device *in,
180           const struct net_device *out,
181           unsigned int hooknum,
182           const void *targinfo,
183           void *userinfo)
184 {
185         if (net_ratelimit())
186                 printk("ip_tables: error: `%s'\n", (char *)targinfo);
187
188         return NF_DROP;
189 }
190
191 static inline
192 int do_match(struct ipt_entry_match *m,
193              const struct sk_buff *skb,
194              const struct net_device *in,
195              const struct net_device *out,
196              int offset,
197              int *hotdrop)
198 {
199         /* Stop iteration if it doesn't match */
200         if (!m->u.kernel.match->match(skb, in, out, m->data, offset, 
201             skb->nh.iph->ihl*4, hotdrop))
202                 return 1;
203         else
204                 return 0;
205 }
206
/* Translate a byte offset inside a rules blob into an entry pointer. */
static inline struct ipt_entry *
get_entry(void *base, unsigned int offset)
{
        return (struct ipt_entry *)(base + offset);
}
212
/* Returns one of the generic firewall policies, like NF_ACCEPT. */
unsigned int
ipt_do_table(struct sk_buff **pskb,
             unsigned int hook,
             const struct net_device *in,
             const struct net_device *out,
             struct ipt_table *table,
             void *userdata)
{
        static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
        u_int16_t offset;
        struct iphdr *ip;
        u_int16_t datalen;
        int hotdrop = 0;
        /* Initializing verdict to NF_DROP keeps gcc happy. */
        unsigned int verdict = NF_DROP;
        const char *indev, *outdev;
        void *table_base;
        struct ipt_entry *e, *back;
        struct xt_table_info *private;

        /* Initialization */
        ip = (*pskb)->nh.iph;
        datalen = (*pskb)->len - ip->ihl * 4;
        indev = in ? in->name : nulldevname;
        outdev = out ? out->name : nulldevname;
        /* We handle fragments by dealing with the first fragment as
         * if it was a normal packet.  All other fragments are treated
         * normally, except that they will NEVER match rules that ask
         * things we don't know, ie. tcp syn flag or ports).  If the
         * rule is also a fragment-specific rule, non-fragments won't
         * match it. */
        offset = ntohs(ip->frag_off) & IP_OFFSET;

        read_lock_bh(&table->lock);
        IP_NF_ASSERT(table->valid_hooks & (1 << hook));
        private = table->private;
        /* Rules are replicated per CPU; use this CPU's copy so the
         * counters below need no extra locking. */
        table_base = (void *)private->entries[smp_processor_id()];
        e = get_entry(table_base, private->hook_entry[hook]);

        /* For return from builtin chain */
        back = get_entry(table_base, private->underflow[hook]);

        do {
                IP_NF_ASSERT(e);
                IP_NF_ASSERT(back);
                if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
                        struct ipt_entry_target *t;

                        /* All match extensions must agree, else skip rule. */
                        if (IPT_MATCH_ITERATE(e, do_match,
                                              *pskb, in, out,
                                              offset, &hotdrop) != 0)
                                goto no_match;

                        ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);

                        t = ipt_get_target(e);
                        IP_NF_ASSERT(t->u.kernel.target);
                        /* Standard target? */
                        if (!t->u.kernel.target->target) {
                                int v;

                                v = ((struct ipt_standard_target *)t)->verdict;
                                if (v < 0) {
                                        /* Pop from stack? */
                                        if (v != IPT_RETURN) {
                                                /* Builtin verdict, encoded as -v - 1. */
                                                verdict = (unsigned)(-v) - 1;
                                                break;
                                        }
                                        /* RETURN: resume at saved back pointer. */
                                        e = back;
                                        back = get_entry(table_base,
                                                         back->comefrom);
                                        continue;
                                }
                                if (table_base + v != (void *)e + e->next_offset
                                    && !(e->ip.flags & IPT_F_GOTO)) {
                                        /* Save old back ptr in next entry */
                                        struct ipt_entry *next
                                                = (void *)e + e->next_offset;
                                        next->comefrom
                                                = (void *)back - table_base;
                                        /* set back pointer to next entry */
                                        back = next;
                                }

                                e = get_entry(table_base, v);
                        } else {
                                /* Targets which reenter must return
                                   abs. verdicts */
#ifdef CONFIG_NETFILTER_DEBUG
                                ((struct ipt_entry *)table_base)->comefrom
                                        = 0xeeeeeeec;
#endif
                                verdict = t->u.kernel.target->target(pskb,
                                                                     in, out,
                                                                     hook,
                                                                     t->data,
                                                                     userdata);

#ifdef CONFIG_NETFILTER_DEBUG
                                if (((struct ipt_entry *)table_base)->comefrom
                                    != 0xeeeeeeec
                                    && verdict == IPT_CONTINUE) {
                                        printk("Target %s reentered!\n",
                                               t->u.kernel.target->name);
                                        verdict = NF_DROP;
                                }
                                ((struct ipt_entry *)table_base)->comefrom
                                        = 0x57acc001;
#endif
                                /* Target might have changed stuff. */
                                ip = (*pskb)->nh.iph;
                                datalen = (*pskb)->len - ip->ihl * 4;

                                if (verdict == IPT_CONTINUE)
                                        e = (void *)e + e->next_offset;
                                else
                                        /* Verdict */
                                        break;
                        }
                } else {

                no_match:
                        e = (void *)e + e->next_offset;
                }
        } while (!hotdrop);

        read_unlock_bh(&table->lock);

#ifdef DEBUG_ALLOW_ALL
        return NF_ACCEPT;
#else
        if (hotdrop)
                return NF_DROP;
        else return verdict;
#endif
}
350
351 /* All zeroes == unconditional rule. */
352 static inline int
353 unconditional(const struct ipt_ip *ip)
354 {
355         unsigned int i;
356
357         for (i = 0; i < sizeof(*ip)/sizeof(__u32); i++)
358                 if (((__u32 *)ip)[i])
359                         return 0;
360
361         return 1;
362 }
363
/* Figures out from what hook each rule can be called: returns 0 if
   there are loops.  Puts hook bitmask in comefrom. */
static int
mark_source_chains(struct xt_table_info *newinfo,
                   unsigned int valid_hooks, void *entry0)
{
        unsigned int hook;

        /* No recursion; use packet counter to save back ptrs (reset
           to 0 as we leave), and comefrom to save source hook bitmask */
        for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
                unsigned int pos = newinfo->hook_entry[hook];
                struct ipt_entry *e
                        = (struct ipt_entry *)(entry0 + pos);

                if (!(valid_hooks & (1 << hook)))
                        continue;

                /* Set initial back pointer. */
                e->counters.pcnt = pos;

                for (;;) {
                        struct ipt_standard_target *t
                                = (void *)ipt_get_target(e);

                        /* Bit NF_IP_NUMHOOKS marks "currently being
                         * visited"; seeing it again means a loop. */
                        if (e->comefrom & (1 << NF_IP_NUMHOOKS)) {
                                printk("iptables: loop hook %u pos %u %08X.\n",
                                       hook, pos, e->comefrom);
                                return 0;
                        }
                        e->comefrom
                                |= ((1 << hook) | (1 << NF_IP_NUMHOOKS));

                        /* Unconditional return/END. */
                        if (e->target_offset == sizeof(struct ipt_entry)
                            && (strcmp(t->target.u.user.name,
                                       IPT_STANDARD_TARGET) == 0)
                            && t->verdict < 0
                            && unconditional(&e->ip)) {
                                unsigned int oldpos, size;

                                /* Return: backtrack through the last
                                   big jump. */
                                do {
                                        /* Clear the visiting bit as we leave. */
                                        e->comefrom ^= (1<<NF_IP_NUMHOOKS);
#ifdef DEBUG_IP_FIREWALL_USER
                                        if (e->comefrom
                                            & (1 << NF_IP_NUMHOOKS)) {
                                                duprintf("Back unset "
                                                         "on hook %u "
                                                         "rule %u\n",
                                                         hook, pos);
                                        }
#endif
                                        /* Follow the back pointer stashed in
                                         * counters.pcnt, resetting it. */
                                        oldpos = pos;
                                        pos = e->counters.pcnt;
                                        e->counters.pcnt = 0;

                                        /* We're at the start. */
                                        if (pos == oldpos)
                                                goto next;

                                        e = (struct ipt_entry *)
                                                (entry0 + pos);
                                } while (oldpos == pos + e->next_offset);

                                /* Move along one */
                                size = e->next_offset;
                                e = (struct ipt_entry *)
                                        (entry0 + pos + size);
                                e->counters.pcnt = pos;
                                pos += size;
                        } else {
                                int newpos = t->verdict;

                                if (strcmp(t->target.u.user.name,
                                           IPT_STANDARD_TARGET) == 0
                                    && newpos >= 0) {
                                        /* This a jump; chase it. */
                                        duprintf("Jump rule %u -> %u\n",
                                                 pos, newpos);
                                } else {
                                        /* ... this is a fallthru */
                                        newpos = pos + e->next_offset;
                                }
                                e = (struct ipt_entry *)
                                        (entry0 + newpos);
                                e->counters.pcnt = pos;
                                pos = newpos;
                        }
                }
                next:
                duprintf("Finished chain %u\n", hook);
        }
        return 1;
}
460
461 static inline int
462 cleanup_match(struct ipt_entry_match *m, unsigned int *i)
463 {
464         if (i && (*i)-- == 0)
465                 return 1;
466
467         if (m->u.kernel.match->destroy)
468                 m->u.kernel.match->destroy(m->data,
469                                            m->u.match_size - sizeof(*m));
470         module_put(m->u.kernel.match->me);
471         return 0;
472 }
473
474 static inline int
475 standard_check(const struct ipt_entry_target *t,
476                unsigned int max_offset)
477 {
478         struct ipt_standard_target *targ = (void *)t;
479
480         /* Check standard info. */
481         if (t->u.target_size
482             != IPT_ALIGN(sizeof(struct ipt_standard_target))) {
483                 duprintf("standard_check: target size %u != %u\n",
484                          t->u.target_size,
485                          IPT_ALIGN(sizeof(struct ipt_standard_target)));
486                 return 0;
487         }
488
489         if (targ->verdict >= 0
490             && targ->verdict > max_offset - sizeof(struct ipt_entry)) {
491                 duprintf("ipt_standard_check: bad verdict (%i)\n",
492                          targ->verdict);
493                 return 0;
494         }
495
496         if (targ->verdict < -NF_MAX_VERDICT - 1) {
497                 duprintf("ipt_standard_check: bad negative verdict (%i)\n",
498                          targ->verdict);
499                 return 0;
500         }
501         return 1;
502 }
503
504 static inline int
505 check_match(struct ipt_entry_match *m,
506             const char *name,
507             const struct ipt_ip *ip,
508             unsigned int hookmask,
509             unsigned int *i)
510 {
511         struct ipt_match *match;
512
513         match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name,
514                                                    m->u.user.revision),
515                                         "ipt_%s", m->u.user.name);
516         if (IS_ERR(match) || !match) {
517                 duprintf("check_match: `%s' not found\n", m->u.user.name);
518                 return match ? PTR_ERR(match) : -ENOENT;
519         }
520         m->u.kernel.match = match;
521
522         if (m->u.kernel.match->checkentry
523             && !m->u.kernel.match->checkentry(name, ip, m->data,
524                                               m->u.match_size - sizeof(*m),
525                                               hookmask)) {
526                 module_put(m->u.kernel.match->me);
527                 duprintf("ip_tables: check failed for `%s'.\n",
528                          m->u.kernel.match->name);
529                 return -EINVAL;
530         }
531
532         (*i)++;
533         return 0;
534 }
535
536 static struct ipt_target ipt_standard_target;
537
538 static inline int
539 check_entry(struct ipt_entry *e, const char *name, unsigned int size,
540             unsigned int *i)
541 {
542         struct ipt_entry_target *t;
543         struct ipt_target *target;
544         int ret;
545         unsigned int j;
546
547         if (!ip_checkentry(&e->ip)) {
548                 duprintf("ip_tables: ip check failed %p %s.\n", e, name);
549                 return -EINVAL;
550         }
551
552         j = 0;
553         ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom, &j);
554         if (ret != 0)
555                 goto cleanup_matches;
556
557         t = ipt_get_target(e);
558         target = try_then_request_module(xt_find_target(AF_INET,
559                                                      t->u.user.name,
560                                                      t->u.user.revision),
561                                          "ipt_%s", t->u.user.name);
562         if (IS_ERR(target) || !target) {
563                 duprintf("check_entry: `%s' not found\n", t->u.user.name);
564                 ret = target ? PTR_ERR(target) : -ENOENT;
565                 goto cleanup_matches;
566         }
567         t->u.kernel.target = target;
568
569         if (t->u.kernel.target == &ipt_standard_target) {
570                 if (!standard_check(t, size)) {
571                         ret = -EINVAL;
572                         goto cleanup_matches;
573                 }
574         } else if (t->u.kernel.target->checkentry
575                    && !t->u.kernel.target->checkentry(name, e, t->data,
576                                                       t->u.target_size
577                                                       - sizeof(*t),
578                                                       e->comefrom)) {
579                 module_put(t->u.kernel.target->me);
580                 duprintf("ip_tables: check failed for `%s'.\n",
581                          t->u.kernel.target->name);
582                 ret = -EINVAL;
583                 goto cleanup_matches;
584         }
585
586         (*i)++;
587         return 0;
588
589  cleanup_matches:
590         IPT_MATCH_ITERATE(e, cleanup_match, &j);
591         return ret;
592 }
593
594 static inline int
595 check_entry_size_and_hooks(struct ipt_entry *e,
596                            struct xt_table_info *newinfo,
597                            unsigned char *base,
598                            unsigned char *limit,
599                            const unsigned int *hook_entries,
600                            const unsigned int *underflows,
601                            unsigned int *i)
602 {
603         unsigned int h;
604
605         if ((unsigned long)e % __alignof__(struct ipt_entry) != 0
606             || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) {
607                 duprintf("Bad offset %p\n", e);
608                 return -EINVAL;
609         }
610
611         if (e->next_offset
612             < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) {
613                 duprintf("checking: element %p size %u\n",
614                          e, e->next_offset);
615                 return -EINVAL;
616         }
617
618         /* Check hooks & underflows */
619         for (h = 0; h < NF_IP_NUMHOOKS; h++) {
620                 if ((unsigned char *)e - base == hook_entries[h])
621                         newinfo->hook_entry[h] = hook_entries[h];
622                 if ((unsigned char *)e - base == underflows[h])
623                         newinfo->underflow[h] = underflows[h];
624         }
625
626         /* FIXME: underflows must be unconditional, standard verdicts
627            < 0 (not IPT_RETURN). --RR */
628
629         /* Clear counters and comefrom */
630         e->counters = ((struct xt_counters) { 0, 0 });
631         e->comefrom = 0;
632
633         (*i)++;
634         return 0;
635 }
636
637 static inline int
638 cleanup_entry(struct ipt_entry *e, unsigned int *i)
639 {
640         struct ipt_entry_target *t;
641
642         if (i && (*i)-- == 0)
643                 return 1;
644
645         /* Cleanup all matches */
646         IPT_MATCH_ITERATE(e, cleanup_match, NULL);
647         t = ipt_get_target(e);
648         if (t->u.kernel.target->destroy)
649                 t->u.kernel.target->destroy(t->data,
650                                             t->u.target_size - sizeof(*t));
651         module_put(t->u.kernel.target->me);
652         return 0;
653 }
654
/* Checks and translates the user-supplied table segment (held in
   newinfo) */
static int
translate_table(const char *name,
                unsigned int valid_hooks,
                struct xt_table_info *newinfo,
                void *entry0,
                unsigned int size,
                unsigned int number,
                const unsigned int *hook_entries,
                const unsigned int *underflows)
{
        unsigned int i;
        int ret;

        newinfo->size = size;
        newinfo->number = number;

        /* Init all hooks to impossible value. */
        for (i = 0; i < NF_IP_NUMHOOKS; i++) {
                newinfo->hook_entry[i] = 0xFFFFFFFF;
                newinfo->underflow[i] = 0xFFFFFFFF;
        }

        duprintf("translate_table: size %u\n", newinfo->size);
        i = 0;
        /* Walk through entries, checking offsets. */
        ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
                                check_entry_size_and_hooks,
                                newinfo,
                                entry0,
                                entry0 + size,
                                hook_entries, underflows, &i);
        if (ret != 0)
                return ret;

        /* The walk must have seen exactly the advertised entry count. */
        if (i != number) {
                duprintf("translate_table: %u not %u entries\n",
                         i, number);
                return -EINVAL;
        }

        /* Check hooks all assigned */
        for (i = 0; i < NF_IP_NUMHOOKS; i++) {
                /* Only hooks which are valid */
                if (!(valid_hooks & (1 << i)))
                        continue;
                if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
                        duprintf("Invalid hook entry %u %u\n",
                                 i, hook_entries[i]);
                        return -EINVAL;
                }
                if (newinfo->underflow[i] == 0xFFFFFFFF) {
                        duprintf("Invalid underflow %u %u\n",
                                 i, underflows[i]);
                        return -EINVAL;
                }
        }

        /* Reject rulesets containing chain loops. */
        if (!mark_source_chains(newinfo, valid_hooks, entry0))
                return -ELOOP;

        /* Finally, each sanity check must pass */
        i = 0;
        ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
                                check_entry, name, size, &i);

        if (ret != 0) {
                /* Unwind only the i entries that checked out so far. */
                IPT_ENTRY_ITERATE(entry0, newinfo->size,
                                  cleanup_entry, &i);
                return ret;
        }

        /* And one copy for every other CPU */
        for_each_cpu(i) {
                if (newinfo->entries[i] && newinfo->entries[i] != entry0)
                        memcpy(newinfo->entries[i], entry0, newinfo->size);
        }

        return ret;
}
736
737 /* Gets counters. */
738 static inline int
739 add_entry_to_counter(const struct ipt_entry *e,
740                      struct xt_counters total[],
741                      unsigned int *i)
742 {
743         ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
744
745         (*i)++;
746         return 0;
747 }
748
749 static inline int
750 set_entry_to_counter(const struct ipt_entry *e,
751                      struct ipt_counters total[],
752                      unsigned int *i)
753 {
754         SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
755
756         (*i)++;
757         return 0;
758 }
759
/* Sum the per-CPU rule counters of table t into the counters[] array
 * (one slot per rule).  Caller must hold the table lock so the
 * per-CPU copies cannot change underneath us. */
static void
get_counters(const struct xt_table_info *t,
             struct xt_counters counters[])
{
        unsigned int cpu;
        unsigned int i;
        unsigned int curcpu;

        /* Instead of clearing (by a previous call to memset())
         * the counters and using adds, we set the counters
         * with data used by 'current' CPU
         * We dont care about preemption here.
         */
        curcpu = raw_smp_processor_id();

        /* First pass: SET from this CPU's copy ... */
        i = 0;
        IPT_ENTRY_ITERATE(t->entries[curcpu],
                          t->size,
                          set_entry_to_counter,
                          counters,
                          &i);

        /* ... then ADD every other CPU's copy on top. */
        for_each_cpu(cpu) {
                if (cpu == curcpu)
                        continue;
                i = 0;
                IPT_ENTRY_ITERATE(t->entries[cpu],
                                  t->size,
                                  add_entry_to_counter,
                                  counters,
                                  &i);
        }
}
793
/* Copy the table's rules to userspace, patching in an atomic counter
 * snapshot and replacing kernel match/target pointers with their
 * user-visible names.  Returns 0 or -EFAULT/-ENOMEM. */
static int
copy_entries_to_user(unsigned int total_size,
		     struct ipt_table *table,
		     void __user *userptr)
{
	unsigned int off, num, countersize;
	struct ipt_entry *e;
	struct xt_counters *counters;
	struct xt_table_info *private = table->private;
	int ret = 0;
	void *loc_cpu_entry;

	/* We need atomic snapshot of counters: rest doesn't change
	   (other than comefrom, which userspace doesn't care
	   about). */
	countersize = sizeof(struct xt_counters) * private->number;
	counters = vmalloc_node(countersize, numa_node_id());

	if (counters == NULL)
		return -ENOMEM;

	/* First, sum counters... (writers excluded while we read every
	 * per-cpu copy) */
	write_lock_bh(&table->lock);
	get_counters(private, counters);
	write_unlock_bh(&table->lock);

	/* choose the copy that is on our node/cpu, ...
	 * This choice is lazy (because current thread is
	 * allowed to migrate to another cpu)
	 */
	loc_cpu_entry = private->entries[raw_smp_processor_id()];
	/* ... then copy entire thing ... */
	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
		ret = -EFAULT;
		goto free_counters;
	}

	/* FIXME: use iterator macros --RR */
	/* ... then go back and fix counters and names */
	for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
		unsigned int i;
		struct ipt_entry_match *m;
		struct ipt_entry_target *t;

		e = (struct ipt_entry *)(loc_cpu_entry + off);
		/* Overwrite the entry's counters with the summed snapshot. */
		if (copy_to_user(userptr + off
				 + offsetof(struct ipt_entry, counters),
				 &counters[num],
				 sizeof(counters[num])) != 0) {
			ret = -EFAULT;
			goto free_counters;
		}

		/* Matches sit between the fixed header and target_offset;
		 * replace each kernel match pointer with its name. */
		for (i = sizeof(struct ipt_entry);
		     i < e->target_offset;
		     i += m->u.match_size) {
			m = (void *)e + i;

			if (copy_to_user(userptr + off + i
					 + offsetof(struct ipt_entry_match,
						    u.user.name),
					 m->u.kernel.match->name,
					 strlen(m->u.kernel.match->name)+1)
			    != 0) {
				ret = -EFAULT;
				goto free_counters;
			}
		}

		/* Likewise for the entry's target. */
		t = ipt_get_target(e);
		if (copy_to_user(userptr + off + e->target_offset
				 + offsetof(struct ipt_entry_target,
					    u.user.name),
				 t->u.kernel.target->name,
				 strlen(t->u.kernel.target->name)+1) != 0) {
			ret = -EFAULT;
			goto free_counters;
		}
	}

 free_counters:
	vfree(counters);
	return ret;
}
878
879 static int
880 get_entries(const struct ipt_get_entries *entries,
881             struct ipt_get_entries __user *uptr)
882 {
883         int ret;
884         struct ipt_table *t;
885
886         t = xt_find_table_lock(AF_INET, entries->name);
887         if (t && !IS_ERR(t)) {
888                 struct xt_table_info *private = t->private;
889                 duprintf("t->private->number = %u\n",
890                          private->number);
891                 if (entries->size == private->size)
892                         ret = copy_entries_to_user(private->size,
893                                                    t, uptr->entrytable);
894                 else {
895                         duprintf("get_entries: I've got %u not %u!\n",
896                                  private->size,
897                                  entries->size);
898                         ret = -EINVAL;
899                 }
900                 module_put(t->me);
901                 xt_table_unlock(t);
902         } else
903                 ret = t ? PTR_ERR(t) : -ENOENT;
904
905         return ret;
906 }
907
/* IPT_SO_SET_REPLACE backend: validate and install a complete new
 * ruleset from userspace, returning the old rules' counters. */
static int
do_replace(void __user *user, unsigned int len)
{
	int ret;
	struct ipt_replace tmp;
	struct ipt_table *t;
	struct xt_table_info *newinfo, *oldinfo;
	struct xt_counters *counters;
	void *loc_cpu_entry, *loc_cpu_old_entry;

	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
		return -EFAULT;

	/* Hack: Causes ipchains to give correct error msg --RR */
	if (len != sizeof(tmp) + tmp.size)
		return -ENOPROTOOPT;

	/* overflow check: xt_alloc_table_info allocates size bytes per
	 * possible CPU plus bookkeeping, so bound size accordingly */
	if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS -
			SMP_CACHE_BYTES)
		return -ENOMEM;
	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
		return -ENOMEM;

	newinfo = xt_alloc_table_info(tmp.size);
	if (!newinfo)
		return -ENOMEM;

	/* choose the copy that is our node/cpu */
	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
			   tmp.size) != 0) {
		ret = -EFAULT;
		goto free_newinfo;
	}

	counters = vmalloc(tmp.num_counters * sizeof(struct xt_counters));
	if (!counters) {
		ret = -ENOMEM;
		goto free_newinfo;
	}

	/* Validate the blob and take references on every match/target
	 * module the new rules use. */
	ret = translate_table(tmp.name, tmp.valid_hooks,
			      newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
			      tmp.hook_entry, tmp.underflow);
	if (ret != 0)
		goto free_newinfo_counters;

	duprintf("ip_tables: Translated table\n");

	t = try_then_request_module(xt_find_table_lock(AF_INET, tmp.name),
				    "iptable_%s", tmp.name);
	if (!t || IS_ERR(t)) {
		ret = t ? PTR_ERR(t) : -ENOENT;
		goto free_newinfo_counters_untrans;
	}

	/* You lied! */
	if (tmp.valid_hooks != t->valid_hooks) {
		duprintf("Valid hook crap: %08X vs %08X\n",
			 tmp.valid_hooks, t->valid_hooks);
		ret = -EINVAL;
		goto put_module;
	}

	oldinfo = xt_replace_table(t, tmp.num_counters, newinfo, &ret);
	if (!oldinfo)
		goto put_module;

	/* Update module usage count based on number of rules.
	 * A table with user rules beyond its initial entries pins the
	 * table module with an extra reference (see 19 Jan 2002 note in
	 * the file header).  xt_find_table_lock() above also took one.
	 * First put: drop one ref unless the replacement is the
	 * empty->populated transition (then the find ref becomes the
	 * rules ref). */
	if ((oldinfo->number > oldinfo->initial_entries) || 
	    (newinfo->number <= oldinfo->initial_entries)) 
		module_put(t->me);
	/* Second put only on the populated->empty transition: both the
	 * find ref and the old rules ref must go. */
	if ((oldinfo->number > oldinfo->initial_entries) &&
	    (newinfo->number <= oldinfo->initial_entries))
		module_put(t->me);

	/* Get the old counters. */
	get_counters(oldinfo, counters);
	/* Decrease module usage counts and free resource */
	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
	IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
	xt_free_table_info(oldinfo);
	/* NOTE(review): the new table is already live at this point, so
	 * an -EFAULT here only means the old counters were lost; the
	 * replacement itself is not rolled back. */
	if (copy_to_user(tmp.counters, counters,
			 sizeof(struct xt_counters) * tmp.num_counters) != 0)
		ret = -EFAULT;
	vfree(counters);
	xt_table_unlock(t);
	return ret;

 put_module:
	module_put(t->me);
	xt_table_unlock(t);
 free_newinfo_counters_untrans:
	/* Drop the match/target refs translate_table() took. */
	IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
 free_newinfo_counters:
	vfree(counters);
 free_newinfo:
	xt_free_table_info(newinfo);
	return ret;
}
1011
1012 /* We're lazy, and add to the first CPU; overflow works its fey magic
1013  * and everything is OK. */
1014 static inline int
1015 add_counter_to_entry(struct ipt_entry *e,
1016                      const struct xt_counters addme[],
1017                      unsigned int *i)
1018 {
1019 #if 0
1020         duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
1021                  *i,
1022                  (long unsigned int)e->counters.pcnt,
1023                  (long unsigned int)e->counters.bcnt,
1024                  (long unsigned int)addme[*i].pcnt,
1025                  (long unsigned int)addme[*i].bcnt);
1026 #endif
1027
1028         ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1029
1030         (*i)++;
1031         return 0;
1032 }
1033
/* IPT_SO_SET_ADD_COUNTERS backend: atomically add a userspace-supplied
 * array of byte/packet deltas onto a table's rule counters. */
static int
do_add_counters(void __user *user, unsigned int len)
{
	unsigned int i;
	struct xt_counters_info tmp, *paddc;
	struct ipt_table *t;
	struct xt_table_info *private;
	int ret = 0;
	void *loc_cpu_entry;

	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
		return -EFAULT;

	/* Payload must be exactly one xt_counters per claimed counter. */
	if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct xt_counters))
		return -EINVAL;

	paddc = vmalloc_node(len, numa_node_id());
	if (!paddc)
		return -ENOMEM;

	/* Second read re-fetches the header along with the array; only
	 * paddc->counters is used below, and the loop is bounded by the
	 * kernel's own entry list (private->number == tmp.num_counters
	 * from the first read), so a racing rewrite of num_counters by
	 * userspace cannot overrun the buffer. */
	if (copy_from_user(paddc, user, len) != 0) {
		ret = -EFAULT;
		goto free;
	}

	t = xt_find_table_lock(AF_INET, tmp.name);
	if (!t || IS_ERR(t)) {
		ret = t ? PTR_ERR(t) : -ENOENT;
		goto free;
	}

	/* Exclude the packet path and other writers while updating. */
	write_lock_bh(&t->lock);
	private = t->private;
	if (private->number != tmp.num_counters) {
		ret = -EINVAL;
		goto unlock_up_free;
	}

	i = 0;
	/* Choose the copy that is on our node */
	loc_cpu_entry = private->entries[raw_smp_processor_id()];
	IPT_ENTRY_ITERATE(loc_cpu_entry,
			  private->size,
			  add_counter_to_entry,
			  paddc->counters,
			  &i);
 unlock_up_free:
	write_unlock_bh(&t->lock);
	xt_table_unlock(t);
	module_put(t->me);
 free:
	vfree(paddc);

	return ret;
}
1089
1090 static int
1091 do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1092 {
1093         int ret;
1094
1095         if (!capable(CAP_NET_ADMIN))
1096                 return -EPERM;
1097
1098         switch (cmd) {
1099         case IPT_SO_SET_REPLACE:
1100                 ret = do_replace(user, len);
1101                 break;
1102
1103         case IPT_SO_SET_ADD_COUNTERS:
1104                 ret = do_add_counters(user, len);
1105                 break;
1106
1107         default:
1108                 duprintf("do_ipt_set_ctl:  unknown request %i\n", cmd);
1109                 ret = -EINVAL;
1110         }
1111
1112         return ret;
1113 }
1114
/* getsockopt() dispatcher for the IPT_SO_GET_* options. */
static int
do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
	int ret;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	switch (cmd) {
	/* Return table metadata: hook entry points, underflows, rule
	 * count and total blob size. */
	case IPT_SO_GET_INFO: {
		char name[IPT_TABLE_MAXNAMELEN];
		struct ipt_table *t;

		if (*len != sizeof(struct ipt_getinfo)) {
			duprintf("length %u != %u\n", *len,
				 sizeof(struct ipt_getinfo));
			ret = -EINVAL;
			break;
		}

		if (copy_from_user(name, user, sizeof(name)) != 0) {
			ret = -EFAULT;
			break;
		}
		/* Force termination: userspace need not NUL-terminate. */
		name[IPT_TABLE_MAXNAMELEN-1] = '\0';

		/* Autoload "iptable_<name>" if the table isn't loaded yet. */
		t = try_then_request_module(xt_find_table_lock(AF_INET, name),
					    "iptable_%s", name);
		if (t && !IS_ERR(t)) {
			struct ipt_getinfo info;
			struct xt_table_info *private = t->private;

			info.valid_hooks = t->valid_hooks;
			memcpy(info.hook_entry, private->hook_entry,
			       sizeof(info.hook_entry));
			memcpy(info.underflow, private->underflow,
			       sizeof(info.underflow));
			info.num_entries = private->number;
			info.size = private->size;
			memcpy(info.name, name, sizeof(info.name));

			if (copy_to_user(user, &info, *len) != 0)
				ret = -EFAULT;
			else
				ret = 0;
			xt_table_unlock(t);
			module_put(t->me);
		} else
			ret = t ? PTR_ERR(t) : -ENOENT;
	}
	break;

	/* Return the full rule blob (see get_entries()). */
	case IPT_SO_GET_ENTRIES: {
		struct ipt_get_entries get;

		if (*len < sizeof(get)) {
			duprintf("get_entries: %u < %u\n", *len, sizeof(get));
			ret = -EINVAL;
		} else if (copy_from_user(&get, user, sizeof(get)) != 0) {
			ret = -EFAULT;
		} else if (*len != sizeof(struct ipt_get_entries) + get.size) {
			duprintf("get_entries: %u != %u\n", *len,
				 sizeof(struct ipt_get_entries) + get.size);
			ret = -EINVAL;
		} else
			ret = get_entries(&get, user);
		break;
	}

	/* Probe whether a match/target revision is supported, loading
	 * the "ipt_<name>" module if necessary; result lands in ret. */
	case IPT_SO_GET_REVISION_MATCH:
	case IPT_SO_GET_REVISION_TARGET: {
		struct ipt_get_revision rev;
		int target;

		if (*len != sizeof(rev)) {
			ret = -EINVAL;
			break;
		}
		if (copy_from_user(&rev, user, sizeof(rev)) != 0) {
			ret = -EFAULT;
			break;
		}

		if (cmd == IPT_SO_GET_REVISION_TARGET)
			target = 1;
		else
			target = 0;

		try_then_request_module(xt_find_revision(AF_INET, rev.name,
							 rev.revision,
							 target, &ret),
					"ipt_%s", rev.name);
		break;
	}

	default:
		duprintf("do_ipt_get_ctl: unknown request %i\n", cmd);
		ret = -EINVAL;
	}

	return ret;
}
1217
1218 int ipt_register_table(struct xt_table *table, const struct ipt_replace *repl)
1219 {
1220         int ret;
1221         struct xt_table_info *newinfo;
1222         static struct xt_table_info bootstrap
1223                 = { 0, 0, 0, { 0 }, { 0 }, { } };
1224         void *loc_cpu_entry;
1225
1226         newinfo = xt_alloc_table_info(repl->size);
1227         if (!newinfo)
1228                 return -ENOMEM;
1229
1230         /* choose the copy on our node/cpu
1231          * but dont care of preemption
1232          */
1233         loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1234         memcpy(loc_cpu_entry, repl->entries, repl->size);
1235
1236         ret = translate_table(table->name, table->valid_hooks,
1237                               newinfo, loc_cpu_entry, repl->size,
1238                               repl->num_entries,
1239                               repl->hook_entry,
1240                               repl->underflow);
1241         if (ret != 0) {
1242                 xt_free_table_info(newinfo);
1243                 return ret;
1244         }
1245
1246         if (xt_register_table(table, &bootstrap, newinfo) != 0) {
1247                 xt_free_table_info(newinfo);
1248                 return ret;
1249         }
1250
1251         return 0;
1252 }
1253
1254 void ipt_unregister_table(struct ipt_table *table)
1255 {
1256         struct xt_table_info *private;
1257         void *loc_cpu_entry;
1258
1259         private = xt_unregister_table(table);
1260
1261         /* Decrease module usage counts and free resources */
1262         loc_cpu_entry = private->entries[raw_smp_processor_id()];
1263         IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL);
1264         xt_free_table_info(private);
1265 }
1266
1267 /* Returns 1 if the type and code is matched by the range, 0 otherwise */
1268 static inline int
1269 icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
1270                      u_int8_t type, u_int8_t code,
1271                      int invert)
1272 {
1273         return ((test_type == 0xFF) || (type == test_type && code >= min_code && code <= max_code))
1274                 ^ invert;
1275 }
1276
1277 static int
1278 icmp_match(const struct sk_buff *skb,
1279            const struct net_device *in,
1280            const struct net_device *out,
1281            const void *matchinfo,
1282            int offset,
1283            unsigned int protoff,
1284            int *hotdrop)
1285 {
1286         struct icmphdr _icmph, *ic;
1287         const struct ipt_icmp *icmpinfo = matchinfo;
1288
1289         /* Must not be a fragment. */
1290         if (offset)
1291                 return 0;
1292
1293         ic = skb_header_pointer(skb, protoff, sizeof(_icmph), &_icmph);
1294         if (ic == NULL) {
1295                 /* We've been asked to examine this packet, and we
1296                  * can't.  Hence, no choice but to drop.
1297                  */
1298                 duprintf("Dropping evil ICMP tinygram.\n");
1299                 *hotdrop = 1;
1300                 return 0;
1301         }
1302
1303         return icmp_type_code_match(icmpinfo->type,
1304                                     icmpinfo->code[0],
1305                                     icmpinfo->code[1],
1306                                     ic->type, ic->code,
1307                                     !!(icmpinfo->invflags&IPT_ICMP_INV));
1308 }
1309
1310 /* Called when user tries to insert an entry of this type. */
1311 static int
1312 icmp_checkentry(const char *tablename,
1313            const void *info,
1314            void *matchinfo,
1315            unsigned int matchsize,
1316            unsigned int hook_mask)
1317 {
1318         const struct ipt_ip *ip = info;
1319         const struct ipt_icmp *icmpinfo = matchinfo;
1320
1321         /* Must specify proto == ICMP, and no unknown invflags */
1322         return ip->proto == IPPROTO_ICMP
1323                 && !(ip->invflags & IPT_INV_PROTO)
1324                 && matchsize == IPT_ALIGN(sizeof(struct ipt_icmp))
1325                 && !(icmpinfo->invflags & ~IPT_ICMP_INV);
1326 }
1327
/* The built-in targets: standard (NULL) and error. */
/* Standard target: no target function; its verdict is encoded in the
 * rule itself and handled directly by ipt_do_table(). */
static struct ipt_target ipt_standard_target = {
	.name		= IPT_STANDARD_TARGET,
};

/* Error target: placeholder for unloadable/unknown targets; firing it
 * invokes ipt_error(). */
static struct ipt_target ipt_error_target = {
	.name		= IPT_ERROR_TARGET,
	.target		= ipt_error,
};

/* set/getsockopt hooks covering the IPT_SO_SET_* / IPT_SO_GET_* range. */
static struct nf_sockopt_ops ipt_sockopts = {
	.pf		= PF_INET,
	.set_optmin	= IPT_BASE_CTL,
	.set_optmax	= IPT_SO_SET_MAX+1,
	.set		= do_ipt_set_ctl,
	.get_optmin	= IPT_BASE_CTL,
	.get_optmax	= IPT_SO_GET_MAX+1,
	.get		= do_ipt_get_ctl,
};

/* Built-in "icmp" match (see icmp_match/icmp_checkentry above). */
static struct ipt_match icmp_matchstruct = {
	.name		= "icmp",
	.match		= &icmp_match,
	.checkentry	= &icmp_checkentry,
};
1353
1354 static int __init init(void)
1355 {
1356         int ret;
1357
1358         xt_proto_init(AF_INET);
1359
1360         /* Noone else will be downing sem now, so we won't sleep */
1361         xt_register_target(AF_INET, &ipt_standard_target);
1362         xt_register_target(AF_INET, &ipt_error_target);
1363         xt_register_match(AF_INET, &icmp_matchstruct);
1364
1365         /* Register setsockopt */
1366         ret = nf_register_sockopt(&ipt_sockopts);
1367         if (ret < 0) {
1368                 duprintf("Unable to register sockopts.\n");
1369                 return ret;
1370         }
1371
1372         printk("ip_tables: (C) 2000-2006 Netfilter Core Team\n");
1373         return 0;
1374 }
1375
/* Module exit: tear everything down in the reverse order of init(). */
static void __exit fini(void)
{
	nf_unregister_sockopt(&ipt_sockopts);

	xt_unregister_match(AF_INET, &icmp_matchstruct);
	xt_unregister_target(AF_INET, &ipt_error_target);
	xt_unregister_target(AF_INET, &ipt_standard_target);

	xt_proto_fini(AF_INET);
}
1386
/* Exported for the per-table modules (iptable_filter, iptable_nat,
 * iptable_mangle, ...) which register tables and hook ipt_do_table. */
EXPORT_SYMBOL(ipt_register_table);
EXPORT_SYMBOL(ipt_unregister_table);
EXPORT_SYMBOL(ipt_do_table);
module_init(init);
module_exit(fini);