vserver 1.9.5.x5
[linux-2.6.git] / net / ipv6 / netfilter / ip6_queue.c
1 /*
2  * This is a module which is used for queueing IPv6 packets and
3  * communicating with userspace via netlink.
4  *
5  * (C) 2001 Fernando Anton, this code is GPL.
6  *     IPv64 Project - Work based in IPv64 draft by Arturo Azcorra.
7  *     Universidad Carlos III de Madrid - Leganes (Madrid) - Spain
8  *     Universidad Politecnica de Alcala de Henares - Alcala de H. (Madrid) - Spain
9  *     email: fanton@it.uc3m.es
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License version 2 as
13  * published by the Free Software Foundation.
14  *
15  * 2001-11-06: First try. Working with ip_queue.c for IPv4 and trying
16  *             to adapt it to IPv6
17  *             HEAVILY based in ipqueue.c by James Morris. It's just
18  *             a little modified version of it, so he's nearly the
19  *             real coder of this.
20  *             Few changes needed, mainly the hard_routing code and
21  *             the netlink socket protocol (we're NETLINK_IP6_FW).
22  * 2002-06-25: Code cleanup. [JM: ported cleanup over from ip_queue.c]
23  */
24 #include <linux/module.h>
25 #include <linux/skbuff.h>
26 #include <linux/init.h>
27 #include <linux/ipv6.h>
28 #include <linux/notifier.h>
29 #include <linux/netdevice.h>
30 #include <linux/netfilter.h>
31 #include <linux/netlink.h>
32 #include <linux/spinlock.h>
33 #include <linux/sysctl.h>
34 #include <linux/proc_fs.h>
35 #include <net/sock.h>
36 #include <net/ipv6.h>
37 #include <net/ip6_route.h>
38 #include <linux/netfilter_ipv4/ip_queue.h>
39 #include <linux/netfilter_ipv4/ip_tables.h>
40 #include <linux/netfilter_ipv6/ip6_tables.h>
41
/* Initial value of queue_maxlen, the cap on simultaneously queued packets. */
#define IPQ_QMAX_DEFAULT        1024
/* Name handed to proc_net_create()/proc_net_remove() for the status entry. */
#define IPQ_PROC_FS_NAME        "ip6_queue"
/* Binary sysctl id and procname of the queue_maxlen knob in ipq_table. */
#define NET_IPQ_QMAX            2088
#define NET_IPQ_QMAX_NAME       "ip6_queue_maxlen"
46
47 struct ipq_rt_info {
48         struct in6_addr daddr;
49         struct in6_addr saddr;
50 };
51
52 struct ipq_queue_entry {
53         struct list_head list;
54         struct nf_info *info;
55         struct sk_buff *skb;
56         struct ipq_rt_info rt_info;
57 };
58
/*
 * Predicate used when searching queue_list: called with a queued entry
 * and caller-supplied data, returns non-zero when the entry matches.
 */
typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
60
61 static unsigned char copy_mode = IPQ_COPY_NONE;
62 static unsigned int queue_maxlen = IPQ_QMAX_DEFAULT;
63 static DEFINE_RWLOCK(queue_lock);
64 static int peer_pid;
65 static unsigned int copy_range;
66 static unsigned int queue_total;
67 static struct sock *ipqnl;
68 static LIST_HEAD(queue_list);
69 static DECLARE_MUTEX(ipqnl_sem);
70
71 static void
72 ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
73 {
74         nf_reinject(entry->skb, entry->info, verdict);
75         kfree(entry);
76 }
77
78 static inline int
79 __ipq_enqueue_entry(struct ipq_queue_entry *entry)
80 {
81        if (queue_total >= queue_maxlen) {
82                if (net_ratelimit()) 
83                        printk(KERN_WARNING "ip6_queue: full at %d entries, "
84                               "dropping packet(s).\n", queue_total);
85                return -ENOSPC;
86        }
87        list_add(&entry->list, &queue_list);
88        queue_total++;
89        return 0;
90 }
91
92 /*
93  * Find and return a queued entry matched by cmpfn, or return the last
94  * entry if cmpfn is NULL.
95  */
96 static inline struct ipq_queue_entry *
97 __ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data)
98 {
99         struct list_head *p;
100
101         list_for_each_prev(p, &queue_list) {
102                 struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p;
103                 
104                 if (!cmpfn || cmpfn(entry, data))
105                         return entry;
106         }
107         return NULL;
108 }
109
110 static inline void
111 __ipq_dequeue_entry(struct ipq_queue_entry *entry)
112 {
113         list_del(&entry->list);
114         queue_total--;
115 }
116
117 static inline struct ipq_queue_entry *
118 __ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
119 {
120         struct ipq_queue_entry *entry;
121
122         entry = __ipq_find_entry(cmpfn, data);
123         if (entry == NULL)
124                 return NULL;
125
126         __ipq_dequeue_entry(entry);
127         return entry;
128 }
129
130
131 static inline void
132 __ipq_flush(int verdict)
133 {
134         struct ipq_queue_entry *entry;
135         
136         while ((entry = __ipq_find_dequeue_entry(NULL, 0)))
137                 ipq_issue_verdict(entry, verdict);
138 }
139
140 static inline int
141 __ipq_set_mode(unsigned char mode, unsigned int range)
142 {
143         int status = 0;
144         
145         switch(mode) {
146         case IPQ_COPY_NONE:
147         case IPQ_COPY_META:
148                 copy_mode = mode;
149                 copy_range = 0;
150                 break;
151                 
152         case IPQ_COPY_PACKET:
153                 copy_mode = mode;
154                 copy_range = range;
155                 if (copy_range > 0xFFFF)
156                         copy_range = 0xFFFF;
157                 break;
158                 
159         default:
160                 status = -EINVAL;
161
162         }
163         return status;
164 }
165
166 static inline void
167 __ipq_reset(void)
168 {
169         peer_pid = 0;
170         net_disable_timestamp();
171         __ipq_set_mode(IPQ_COPY_NONE, 0);
172         __ipq_flush(NF_DROP);
173 }
174
175 static struct ipq_queue_entry *
176 ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
177 {
178         struct ipq_queue_entry *entry;
179         
180         write_lock_bh(&queue_lock);
181         entry = __ipq_find_dequeue_entry(cmpfn, data);
182         write_unlock_bh(&queue_lock);
183         return entry;
184 }
185
186 static void
187 ipq_flush(int verdict)
188 {
189         write_lock_bh(&queue_lock);
190         __ipq_flush(verdict);
191         write_unlock_bh(&queue_lock);
192 }
193
194 static struct sk_buff *
195 ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
196 {
197         unsigned char *old_tail;
198         size_t size = 0;
199         size_t data_len = 0;
200         struct sk_buff *skb;
201         struct ipq_packet_msg *pmsg;
202         struct nlmsghdr *nlh;
203
204         read_lock_bh(&queue_lock);
205         
206         switch (copy_mode) {
207         case IPQ_COPY_META:
208         case IPQ_COPY_NONE:
209                 size = NLMSG_SPACE(sizeof(*pmsg));
210                 data_len = 0;
211                 break;
212         
213         case IPQ_COPY_PACKET:
214                 if (copy_range == 0 || copy_range > entry->skb->len)
215                         data_len = entry->skb->len;
216                 else
217                         data_len = copy_range;
218                 
219                 size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
220                 break;
221         
222         default:
223                 *errp = -EINVAL;
224                 read_unlock_bh(&queue_lock);
225                 return NULL;
226         }
227
228         read_unlock_bh(&queue_lock);
229
230         skb = alloc_skb(size, GFP_ATOMIC);
231         if (!skb)
232                 goto nlmsg_failure;
233                 
234         old_tail= skb->tail;
235         nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
236         pmsg = NLMSG_DATA(nlh);
237         memset(pmsg, 0, sizeof(*pmsg));
238
239         pmsg->packet_id       = (unsigned long )entry;
240         pmsg->data_len        = data_len;
241         pmsg->timestamp_sec   = entry->skb->stamp.tv_sec;
242         pmsg->timestamp_usec  = entry->skb->stamp.tv_usec;
243         pmsg->mark            = entry->skb->nfmark;
244         pmsg->hook            = entry->info->hook;
245         pmsg->hw_protocol     = entry->skb->protocol;
246         
247         if (entry->info->indev)
248                 strcpy(pmsg->indev_name, entry->info->indev->name);
249         else
250                 pmsg->indev_name[0] = '\0';
251         
252         if (entry->info->outdev)
253                 strcpy(pmsg->outdev_name, entry->info->outdev->name);
254         else
255                 pmsg->outdev_name[0] = '\0';
256         
257         if (entry->info->indev && entry->skb->dev) {
258                 pmsg->hw_type = entry->skb->dev->type;
259                 if (entry->skb->dev->hard_header_parse)
260                         pmsg->hw_addrlen =
261                                 entry->skb->dev->hard_header_parse(entry->skb,
262                                                                    pmsg->hw_addr);
263         }
264         
265         if (data_len)
266                 if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len))
267                         BUG();
268                 
269         nlh->nlmsg_len = skb->tail - old_tail;
270         return skb;
271
272 nlmsg_failure:
273         if (skb)
274                 kfree_skb(skb);
275         *errp = -EINVAL;
276         printk(KERN_ERR "ip6_queue: error creating packet message\n");
277         return NULL;
278 }
279
280 static int
281 ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
282 {
283         int status = -EINVAL;
284         struct sk_buff *nskb;
285         struct ipq_queue_entry *entry;
286
287         if (copy_mode == IPQ_COPY_NONE)
288                 return -EAGAIN;
289
290         entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
291         if (entry == NULL) {
292                 printk(KERN_ERR "ip6_queue: OOM in ipq_enqueue_packet()\n");
293                 return -ENOMEM;
294         }
295
296         entry->info = info;
297         entry->skb = skb;
298
299         if (entry->info->hook == NF_IP_LOCAL_OUT) {
300                 struct ipv6hdr *iph = skb->nh.ipv6h;
301
302                 entry->rt_info.daddr = iph->daddr;
303                 entry->rt_info.saddr = iph->saddr;
304         }
305
306         nskb = ipq_build_packet_message(entry, &status);
307         if (nskb == NULL)
308                 goto err_out_free;
309                 
310         write_lock_bh(&queue_lock);
311         
312         if (!peer_pid)
313                 goto err_out_free_nskb; 
314
315         /* netlink_unicast will either free the nskb or attach it to a socket */ 
316         status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
317         if (status < 0)
318                 goto err_out_unlock;
319         
320         status = __ipq_enqueue_entry(entry);
321         if (status < 0)
322                 goto err_out_unlock;
323
324         write_unlock_bh(&queue_lock);
325         return status;
326         
327 err_out_free_nskb:
328         kfree_skb(nskb); 
329         
330 err_out_unlock:
331         write_unlock_bh(&queue_lock);
332
333 err_out_free:
334         kfree(entry);
335         return status;
336 }
337
338 static int
339 ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
340 {
341         int diff;
342         struct ipv6hdr *user_iph = (struct ipv6hdr *)v->payload;
343
344         if (v->data_len < sizeof(*user_iph))
345                 return 0;
346         diff = v->data_len - e->skb->len;
347         if (diff < 0)
348                 skb_trim(e->skb, v->data_len);
349         else if (diff > 0) {
350                 if (v->data_len > 0xFFFF)
351                         return -EINVAL;
352                 if (diff > skb_tailroom(e->skb)) {
353                         struct sk_buff *newskb;
354                         
355                         newskb = skb_copy_expand(e->skb,
356                                                  skb_headroom(e->skb),
357                                                  diff,
358                                                  GFP_ATOMIC);
359                         if (newskb == NULL) {
360                                 printk(KERN_WARNING "ip6_queue: OOM "
361                                       "in mangle, dropping packet\n");
362                                 return -ENOMEM;
363                         }
364                         if (e->skb->sk)
365                                 skb_set_owner_w(newskb, e->skb->sk);
366                         kfree_skb(e->skb);
367                         e->skb = newskb;
368                 }
369                 skb_put(e->skb, diff);
370         }
371         if (!skb_ip_make_writable(&e->skb, v->data_len))
372                 return -ENOMEM;
373         memcpy(e->skb->data, v->payload, v->data_len);
374         e->skb->nfcache |= NFC_ALTERED;
375
376         /*
377          * Extra routing may needed on local out, as the QUEUE target never
378          * returns control to the table.
379          * Not a nice way to cmp, but works
380          */
381         if (e->info->hook == NF_IP_LOCAL_OUT) {
382                 struct ipv6hdr *iph = e->skb->nh.ipv6h;
383                 if (!ipv6_addr_equal(&iph->daddr, &e->rt_info.daddr) ||
384                     !ipv6_addr_equal(&iph->saddr, &e->rt_info.saddr))
385                         return ip6_route_me_harder(e->skb);
386         }
387         return 0;
388 }
389
/*
 * ipq_cmpfn matching an entry by packet id; the id is the entry's own
 * address, as placed in packet_id by ipq_build_packet_message().
 */
static inline int
id_cmp(struct ipq_queue_entry *e, unsigned long id)
{
        return (unsigned long)e == id;
}
395
396 static int
397 ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
398 {
399         struct ipq_queue_entry *entry;
400
401         if (vmsg->value > NF_MAX_VERDICT)
402                 return -EINVAL;
403
404         entry = ipq_find_dequeue_entry(id_cmp, vmsg->id);
405         if (entry == NULL)
406                 return -ENOENT;
407         else {
408                 int verdict = vmsg->value;
409                 
410                 if (vmsg->data_len && vmsg->data_len == len)
411                         if (ipq_mangle_ipv6(vmsg, entry) < 0)
412                                 verdict = NF_DROP;
413                 
414                 ipq_issue_verdict(entry, verdict);
415                 return 0;
416         }
417 }
418
419 static int
420 ipq_set_mode(unsigned char mode, unsigned int range)
421 {
422         int status;
423
424         write_lock_bh(&queue_lock);
425         status = __ipq_set_mode(mode, range);
426         write_unlock_bh(&queue_lock);
427         return status;
428 }
429
430 static int
431 ipq_receive_peer(struct ipq_peer_msg *pmsg,
432                  unsigned char type, unsigned int len)
433 {
434         int status = 0;
435
436         if (len < sizeof(*pmsg))
437                 return -EINVAL;
438
439         switch (type) {
440         case IPQM_MODE:
441                 status = ipq_set_mode(pmsg->msg.mode.value,
442                                       pmsg->msg.mode.range);
443                 break;
444                 
445         case IPQM_VERDICT:
446                 if (pmsg->msg.verdict.value > NF_MAX_VERDICT)
447                         status = -EINVAL;
448                 else
449                         status = ipq_set_verdict(&pmsg->msg.verdict,
450                                                  len - sizeof(*pmsg));
451                         break;
452         default:
453                 status = -EINVAL;
454         }
455         return status;
456 }
457
458 static int
459 dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex)
460 {
461         if (entry->info->indev)
462                 if (entry->info->indev->ifindex == ifindex)
463                         return 1;
464                         
465         if (entry->info->outdev)
466                 if (entry->info->outdev->ifindex == ifindex)
467                         return 1;
468
469         return 0;
470 }
471
472 static void
473 ipq_dev_drop(int ifindex)
474 {
475         struct ipq_queue_entry *entry;
476         
477         while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL)
478                 ipq_issue_verdict(entry, NF_DROP);
479 }
480
481 #define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
482
483 static inline void
484 ipq_rcv_skb(struct sk_buff *skb)
485 {
486         int status, type, pid, flags, nlmsglen, skblen;
487         struct nlmsghdr *nlh;
488
489         skblen = skb->len;
490         if (skblen < sizeof(*nlh))
491                 return;
492
493         nlh = (struct nlmsghdr *)skb->data;
494         nlmsglen = nlh->nlmsg_len;
495         if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
496                 return;
497
498         pid = nlh->nlmsg_pid;
499         flags = nlh->nlmsg_flags;
500         
501         if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI)
502                 RCV_SKB_FAIL(-EINVAL);
503                 
504         if (flags & MSG_TRUNC)
505                 RCV_SKB_FAIL(-ECOMM);
506                 
507         type = nlh->nlmsg_type;
508         if (type < NLMSG_NOOP || type >= IPQM_MAX)
509                 RCV_SKB_FAIL(-EINVAL);
510                 
511         if (type <= IPQM_BASE)
512                 return;
513         
514         if (security_netlink_recv(skb))
515                 RCV_SKB_FAIL(-EPERM);   
516
517         write_lock_bh(&queue_lock);
518         
519         if (peer_pid) {
520                 if (peer_pid != pid) {
521                         write_unlock_bh(&queue_lock);
522                         RCV_SKB_FAIL(-EBUSY);
523                 }
524         } else {
525                 net_enable_timestamp();
526                 peer_pid = pid;
527         }
528                 
529         write_unlock_bh(&queue_lock);
530         
531         status = ipq_receive_peer(NLMSG_DATA(nlh), type,
532                                   skblen - NLMSG_LENGTH(0));
533         if (status < 0)
534                 RCV_SKB_FAIL(status);
535                 
536         if (flags & NLM_F_ACK)
537                 netlink_ack(skb, nlh, 0);
538         return;
539 }
540
541 static void
542 ipq_rcv_sk(struct sock *sk, int len)
543 {
544         do {
545                 struct sk_buff *skb;
546
547                 if (down_trylock(&ipqnl_sem))
548                         return;
549                         
550                 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
551                         ipq_rcv_skb(skb);
552                         kfree_skb(skb);
553                 }
554                 
555                 up(&ipqnl_sem);
556
557         } while (ipqnl && ipqnl->sk_receive_queue.qlen);
558 }
559
560 static int
561 ipq_rcv_dev_event(struct notifier_block *this,
562                   unsigned long event, void *ptr)
563 {
564         struct net_device *dev = ptr;
565
566         /* Drop any packets associated with the downed device */
567         if (event == NETDEV_DOWN)
568                 ipq_dev_drop(dev->ifindex);
569         return NOTIFY_DONE;
570 }
571
572 static struct notifier_block ipq_dev_notifier = {
573         .notifier_call  = ipq_rcv_dev_event,
574 };
575
576 static int
577 ipq_rcv_nl_event(struct notifier_block *this,
578                  unsigned long event, void *ptr)
579 {
580         struct netlink_notify *n = ptr;
581
582         if (event == NETLINK_URELEASE &&
583             n->protocol == NETLINK_IP6_FW && n->pid) {
584                 write_lock_bh(&queue_lock);
585                 if (n->pid == peer_pid)
586                         __ipq_reset();
587                 write_unlock_bh(&queue_lock);
588         }
589         return NOTIFY_DONE;
590 }
591
592 static struct notifier_block ipq_nl_notifier = {
593         .notifier_call  = ipq_rcv_nl_event,
594 };
595
596 static struct ctl_table_header *ipq_sysctl_header;
597
598 static ctl_table ipq_table[] = {
599         {
600                 .ctl_name       = NET_IPQ_QMAX,
601                 .procname       = NET_IPQ_QMAX_NAME,
602                 .data           = &queue_maxlen,
603                 .maxlen         = sizeof(queue_maxlen),
604                 .mode           = 0644,
605                 .proc_handler   = proc_dointvec
606         },
607         { .ctl_name = 0 }
608 };
609
610 static ctl_table ipq_dir_table[] = {
611         {
612                 .ctl_name       = NET_IPV6,
613                 .procname       = "ipv6",
614                 .mode           = 0555,
615                 .child          = ipq_table
616         },
617         { .ctl_name = 0 }
618 };
619
620 static ctl_table ipq_root_table[] = {
621         {
622                 .ctl_name       = CTL_NET,
623                 .procname       = "net",
624                 .mode           = 0555,
625                 .child          = ipq_dir_table
626         },
627         { .ctl_name = 0 }
628 };
629
630 static int
631 ipq_get_info(char *buffer, char **start, off_t offset, int length)
632 {
633         int len;
634
635         read_lock_bh(&queue_lock);
636         
637         len = sprintf(buffer,
638                       "Peer PID          : %d\n"
639                       "Copy mode         : %hu\n"
640                       "Copy range        : %u\n"
641                       "Queue length      : %u\n"
642                       "Queue max. length : %u\n",
643                       peer_pid,
644                       copy_mode,
645                       copy_range,
646                       queue_total,
647                       queue_maxlen);
648
649         read_unlock_bh(&queue_lock);
650         
651         *start = buffer + offset;
652         len -= offset;
653         if (len > length)
654                 len = length;
655         else if (len < 0)
656                 len = 0;
657         return len;
658 }
659
660 static int
661 init_or_cleanup(int init)
662 {
663         int status = -ENOMEM;
664         struct proc_dir_entry *proc;
665         
666         if (!init)
667                 goto cleanup;
668
669         netlink_register_notifier(&ipq_nl_notifier);
670         ipqnl = netlink_kernel_create(NETLINK_IP6_FW, ipq_rcv_sk);
671         if (ipqnl == NULL) {
672                 printk(KERN_ERR "ip6_queue: failed to create netlink socket\n");
673                 goto cleanup_netlink_notifier;
674         }
675
676         proc = proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info);
677         if (proc)
678                 proc->owner = THIS_MODULE;
679         else {
680                 printk(KERN_ERR "ip6_queue: failed to create proc entry\n");
681                 goto cleanup_ipqnl;
682         }
683         
684         register_netdevice_notifier(&ipq_dev_notifier);
685         ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
686         
687         status = nf_register_queue_handler(PF_INET6, ipq_enqueue_packet, NULL);
688         if (status < 0) {
689                 printk(KERN_ERR "ip6_queue: failed to register queue handler\n");
690                 goto cleanup_sysctl;
691         }
692         return status;
693
694 cleanup:
695         nf_unregister_queue_handler(PF_INET6);
696         synchronize_net();
697         ipq_flush(NF_DROP);
698         
699 cleanup_sysctl:
700         unregister_sysctl_table(ipq_sysctl_header);
701         unregister_netdevice_notifier(&ipq_dev_notifier);
702         proc_net_remove(IPQ_PROC_FS_NAME);
703         
704 cleanup_ipqnl:
705         sock_release(ipqnl->sk_socket);
706         down(&ipqnl_sem);
707         up(&ipqnl_sem);
708         
709 cleanup_netlink_notifier:
710         netlink_unregister_notifier(&ipq_nl_notifier);
711         return status;
712 }
713
714 static int __init init(void)
715 {
716         
717         return init_or_cleanup(1);
718 }
719
720 static void __exit fini(void)
721 {
722         init_or_cleanup(0);
723 }
724
725 MODULE_DESCRIPTION("IPv6 packet queue handler");
726 MODULE_LICENSE("GPL");
727
728 module_init(init);
729 module_exit(fini);