2 * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * The main interface to build ipfw+dummynet as a linux module.
30 * (and possibly as a windows module as well, though that part
31 * is not complete yet).
33 * The control interface uses the sockopt mechanism
34 * on a socket(AF_INET, SOCK_RAW, IPPROTO_RAW).
36 * The data interface uses the netfilter interface, at the moment
37 * hooked to the PRE_ROUTING and POST_ROUTING hooks.
38 * Unfortunately the netfilter interface is a moving target,
39 * so we need a set of macros to adapt to the various cases.
41 * In the netfilter hook we just mark the packet as 'QUEUE' and then
42 * let the queue handler do all the work (filtering and
43 * possibly emulation).
44 * As we receive packets, we wrap them with an mbuf descriptor
45 * so the existing ipfw+dummynet code runs unmodified.
48 #include <sys/cdefs.h>
49 #include <sys/mbuf.h> /* sizeof struct mbuf */
50 #include <sys/param.h> /* NGROUPS */
55 #include <linux/module.h>
56 #include <linux/kernel.h>
57 #include <linux/netfilter.h>
58 #include <linux/netfilter_ipv4.h> /* NF_IP_PRI_FILTER */
60 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25)
61 #include <net/netfilter/nf_queue.h> /* nf_queue */
64 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
68 #endif /* !__linux__ */
70 #include <netinet/in.h> /* in_addr */
71 #include <netinet/ip_fw.h> /* ip_fw_ctl_t, ip_fw_chk_t */
72 #include <netinet/ip_dummynet.h> /* ip_dn_ctl_t, ip_dn_io_t */
73 #include <net/pfil.h> /* PFIL_IN, PFIL_OUT */
74 #include <net/inet_hashtables.h> /* inet_lookup */
75 #include <net/route.h> /* inet_iif */
78 * Here we allocate some global variables used in the firewall.
80 //ip_dn_ctl_t *ip_dn_ctl_ptr;
81 int (*ip_dn_ctl_ptr)(struct sockopt *);
83 ip_fw_ctl_t *ip_fw_ctl_ptr;
85 int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa);
86 ip_fw_chk_t *ip_fw_chk_ptr;
88 void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
91 * Glue code to implement the registration of children with the parent.
92 * Each child should call my_mod_register() when linking, so that
93 * module_init() and module_exit() can call init_children() and
94 * fini_children() to provide the necessary initialization.
96 #include <sys/module.h>
98 struct moduledata *mod;
103 static unsigned int mod_idx;
104 static struct mod_args mods[10]; /* hard limit to 10 modules */
107 * Data structure to cache our ucred related
108 * information. This structure only gets used if
109 * the user specified UID/GID based constraints in
113 gid_t fw_groups[NGROUPS];
120 * my_mod_register should be called automatically as the init
121 * functions in the submodules. Unfortunately this compiler/linker
122 * trick is not supported yet so we call it manually.
125 my_mod_register(struct moduledata *mod, const char *name, int order)
127 struct mod_args m = { mod, name, order };
129 printf("%s %s called\n", __FUNCTION__, name);
130 if (mod_idx < sizeof(mods) / sizeof(mods[0]))
140 /* Call the functions registered at init time. */
141 printf("%s mod_idx value %d\n", __FUNCTION__, mod_idx);
142 for (i = 0; i < mod_idx; i++) {
143 printf("+++ start module %d %s %s at %p order 0x%x\n",
144 i, mods[i].name, mods[i].mod->name,
145 mods[i].mod, mods[i].order);
146 mods[i].mod->evhand(NULL, MOD_LOAD, mods[i].mod->priv);
155 /* Call the functions registered at init time. */
156 for (i = mod_idx - 1; i >= 0; i--) {
157 printf("+++ end module %d %s %s at %p order 0x%x\n",
158 i, mods[i].name, mods[i].mod->name,
159 mods[i].mod, mods[i].order);
160 mods[i].mod->evhand(NULL, MOD_UNLOAD, mods[i].mod->priv);
163 /*--- end of module binding helper functions ---*/
167 * ipfw_ctl_h() is a wrapper for linux to FreeBSD sockopt call convention.
168 * then call the ipfw handler in order to manage requests.
169 * In turn this is called by the linux set/get handlers.
172 ipfw_ctl_h(struct sockopt *s, int cmd, int dir, int len, void __user *user)
177 memset(s, 0, sizeof(*s)); /* zero the whole struct; sizeof(s) would only be the pointer size */
180 s->sopt_valsize = len;
183 /* sopt_td is not used but it is referenced */
184 memset(&t, 0, sizeof(t));
187 // printf("%s called with cmd %d len %d\n", __FUNCTION__, cmd, len);
189 if (cmd < IP_DUMMYNET_CONFIGURE && ip_fw_ctl_ptr) /* commands below IP_DUMMYNET_CONFIGURE go to ipfw */
190 ret = ip_fw_ctl_ptr(s);
191 else if (cmd >= IP_DUMMYNET_CONFIGURE && ip_dn_ctl_ptr) /* the rest go to dummynet */
192 ret = ip_dn_ctl_ptr(s);
194 return -ret; /* errors are < 0 on linux */
200 netisr_dispatch(int __unused num, struct mbuf *m)
205 ip_output(struct mbuf *m, struct mbuf __unused *opt,
206 struct route __unused *ro, int __unused flags,
207 struct ip_moptions __unused *imo, struct inpcb __unused *inp)
209 netisr_dispatch(0, m);
213 #else /* this is the linux glue */
215 * setsockopt hook has no return value other than the error code.
218 do_ipfw_set_ctl(struct sock __unused *sk, int cmd,
219 void __user *user, unsigned int len)
221 struct sockopt s; /* pass arguments */
223 return ipfw_ctl_h(&s, cmd, SOPT_SET, len, user);
227 * getsockopt can return a block of data in response.
230 do_ipfw_get_ctl(struct sock __unused *sk,
231 int cmd, void __user *user, int *len)
233 struct sockopt s; /* pass arguments */
234 int ret = ipfw_ctl_h(&s, cmd, SOPT_GET, *len, user);
236 *len = s.sopt_valsize; /* return length back to the caller */
241 * declare our [get|set]sockopt hooks
243 static struct nf_sockopt_ops ipfw_sockopts = {
245 .set_optmin = _IPFW_SOCKOPT_BASE,
246 .set_optmax = _IPFW_SOCKOPT_END,
247 .set = do_ipfw_set_ctl,
248 .get_optmin = _IPFW_SOCKOPT_BASE,
249 .get_optmax = _IPFW_SOCKOPT_END,
250 .get = do_ipfw_get_ctl,
251 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
252 .owner = THIS_MODULE,
257 * We need a number of macros to adapt to the various APIs in
258 * different linux versions. Among them:
260 * - the hook names change between macros (NF_IP*) and enum NF_INET_*
262 * - the second argument to the netfilter hook is
263 * struct sk_buff ** in kernels <= 2.6.22
264 * struct sk_buff * in kernels > 2.6.22
266 * - NF_STOP is not defined before 2.6 so we remap it to NF_ACCEPT
268 * - the packet descriptor passed to the queue handler is
269 * struct nf_info in kernels <= 2.6.24
270 * struct nf_queue_entry in kernels >= 2.6.25
272 * - the arguments to the queue handler also change;
276 * declare hook to grab packets from the netfilter interface.
277 * The NF_* names change in different versions of linux, in some
278 * cases they are #defines, in others they are enum, so we
281 #ifndef NF_IP_PRE_ROUTING
282 #define NF_IP_PRE_ROUTING NF_INET_PRE_ROUTING
284 #ifndef NF_IP_POST_ROUTING
285 #define NF_IP_POST_ROUTING NF_INET_POST_ROUTING
289 * ipfw hooks into the POST_ROUTING and the PRE_ROUTING chains.
290 * PlanetLab sets skb_tag to the slice id in the LOCAL_INPUT and
291 * POST_ROUTING chains, so if we want to use that information we
292 * need to hook the LOCAL_INPUT chain instead of the PRE_ROUTING.
293 * However at the moment the skb_tag info is not reliable so
294 * we stay with the standard hooks.
296 #if 0 // defined(IPFW_PLANETLAB)
297 #define IPFW_HOOK_IN NF_IP_LOCAL_IN
299 #define IPFW_HOOK_IN NF_IP_PRE_ROUTING
303 * The main netfilter hook.
304 * To make life simple, we queue everything and then do all the
305 * decision in the queue handler.
307 * XXX note that in 2.4 and up to 2.6.22 the skbuf is passed as sk_buff**
308 * so we have an #ifdef to set the proper argument type.
311 call_ipfw(unsigned int __unused hooknum,
312 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) // in 2.6.22 we have **
313 struct sk_buff __unused **skb,
315 struct sk_buff __unused *skb,
317 const struct net_device __unused *in,
318 const struct net_device __unused *out,
319 int __unused (*okfn)(struct sk_buff *))
324 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
325 #define NF_STOP NF_ACCEPT
328 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
331 * nf_queue_entry is a recent addition, in previous versions
332 * of the code the struct is called nf_info.
334 #define nf_queue_entry nf_info /* for simplicity */
336 /* also, 2.4 and perhaps something else have different arguments */
337 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) /* unsure on the exact boundary */
338 /* on 2.4 we use nf_info */
339 #define QH_ARGS struct sk_buff *skb, struct nf_info *info, void *data
340 #else /* 2.6.1.. 2.6.24 */
341 #define QH_ARGS struct sk_buff *skb, struct nf_info *info, unsigned int qnum, void *data
344 #define DEFINE_SKB /* nothing, already an argument */
345 #define REINJECT(_inf, _verd) nf_reinject(skb, _inf, _verd)
347 #else /* 2.6.25 and above */
349 #define QH_ARGS struct nf_queue_entry *info, unsigned int queuenum
350 #define DEFINE_SKB struct sk_buff *skb = info->skb;
351 #define REINJECT(_inf, _verd) nf_reinject(_inf, _verd)
355 * used by dummynet when dropping packets
356 * XXX use dummynet_send()
359 reinject_drop(struct mbuf* m)
361 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) /* unsure on the exact boundary */
362 struct sk_buff *skb = (struct sk_buff *)m;
364 REINJECT(m->queue_entry, NF_DROP);
368 * The real call to the firewall. nf_queue_entry points to the skbuf,
369 * and eventually we need to return both through nf_reinject().
372 ipfw2_queue_handler(QH_ARGS)
374 DEFINE_SKB /* no semicolon here, goes in the macro */
375 int ret = 0; /* return value */
378 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
379 if (skb->nh.iph == NULL) {
380 printf("null dp, len %d reinject now\n", skb->len);
381 REINJECT(info, NF_ACCEPT);
385 m = malloc(sizeof(*m), 0, 0);
387 printf("malloc fail, len %d reinject now\n", skb->len);
388 REINJECT(info, NF_ACCEPT);
393 m->m_len = skb->len; /* len in this skbuf */
394 m->m_pkthdr.len = skb->len; /* total packet len */
395 m->m_pkthdr.rcvif = info->indev;
396 m->queue_entry = info;
397 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
398 m->m_data = skb->nh.iph;
400 m->m_data = skb_network_header(skb);
403 /* XXX add the interface */
404 if (info->hook == IPFW_HOOK_IN) {
405 ret = ipfw_check_in(NULL, &m, info->indev, PFIL_IN, NULL);
407 ret = ipfw_check_out(NULL, &m, info->outdev, PFIL_OUT, NULL);
410 if (m != NULL) { /* Accept. reinject and free the mbuf */
411 REINJECT(info, NF_STOP);
413 } else if (ret == 0) {
414 /* dummynet has kept the packet, will reinject later. */
417 * Packet dropped by ipfw or dummynet, reinject as NF_DROP
418 * mbuf already released by ipfw itself
420 REINJECT(info, NF_DROP);
430 /* XXX should include prototypes for netisr_dispatch and ip_output */
432 * The reinjection routine after a packet comes out from dummynet.
433 * We must update the skb timestamp so ping reports the right time.
436 netisr_dispatch(int num, struct mbuf *m)
438 struct nf_queue_entry *info = m->queue_entry;
439 struct sk_buff *skb = m->m_skb; /* always used */
443 KASSERT((info != NULL), ("%s info null!\n", __FUNCTION__));
444 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) // XXX above 2.6.x ?
445 __net_timestamp(skb); /* update timestamp */
448 /* XXX to obey one-pass, possibly call the queue handler here */
449 REINJECT(info, ((num == -1)?NF_DROP:NF_STOP)); /* accept but no more firewall */
453 ip_output(struct mbuf *m, struct mbuf __unused *opt,
454 struct route __unused *ro, int __unused flags,
455 struct ip_moptions __unused *imo, struct inpcb __unused *inp)
457 netisr_dispatch(0, m);
462 * socket lookup function for linux.
463 * This code is used to associate uid, gid, jail/xid to packets,
464 * and store the info in a cache *ugp where they can be accessed quickly.
465 * The function returns 1 if the info is found, -1 otherwise.
467 * We do this only on selected protocols: TCP, ...
469 * The chain is the following
470 * sk_buff* sock* socket* file*
471 * skb -> sk ->sk_socket->file ->f_owner ->pid
472 * skb -> sk ->sk_socket->file ->f_uid (direct)
473 * skb -> sk ->sk_socket->file ->f_cred->fsuid (2.6.29+)
476 * linux/skbuff.h struct skbuff
477 * net/sock.h struct sock
478 * linux/net.h struct socket
479 * linux/fs.h struct file
481 * With vserver we may also have sk->sk_xid and sk->sk_nid,
482 * which we store in fw_groups[1] (matches O_JAIL) and fw_groups[2]
485 * Note: for locally generated, outgoing packets we should not
486 * need a lookup because the sk_buff already points to the socket where
489 extern struct inet_hashinfo tcp_hashinfo;
491 linux_lookup(const int proto, const __be32 saddr, const __be16 sport,
492 const __be32 daddr, const __be16 dport,
493 struct sk_buff *skb, int dir, struct ip_fw_ugid *ugp)
496 int ret = -1; /* default return value */
497 int st = -1; /* state */
499 if (proto != IPPROTO_TCP) /* XXX extend for UDP */
502 if ((dir ? (void *)skb->dst : (void *)skb->dev) == NULL) {
503 panic(" -- this should not happen\n");
511 * Try a lookup. On a match, sk has a refcount that we must
512 * release on exit (we know it because skb->sk = NULL).
514 * inet_lookup above 2.6.24 has an additional 'net' parameter
515 * so we use a macro to conditionally supply it.
516 * swap dst and src depending on the direction.
518 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,24)
521 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
522 /* there is no dev_net() on 2.6.25 */
523 #define _OPT_NET_ARG (skb->dev->nd_net),
524 #else /* 2.6.26 and above */
525 #define _OPT_NET_ARG dev_net(skb->dev),
528 sk = (dir) ? /* dir != 0 on output */
529 inet_lookup(_OPT_NET_ARG &tcp_hashinfo,
530 daddr, dport, saddr, sport, // match outgoing
532 inet_lookup(_OPT_NET_ARG &tcp_hashinfo,
533 saddr, sport, daddr, dport, // match incoming
537 if (sk == NULL) /* no match, nothing to be done */
540 ret = 1; /* retrying won't make things better */
542 #ifdef CONFIG_VSERVER
543 ugp->fw_groups[1] = sk->sk_xid;
544 ugp->fw_groups[2] = sk->sk_nid;
546 ugp->fw_groups[1] = ugp->fw_groups[2] = 0;
549 * Exclude tcp states where sk points to an inet_timewait_sock which
550 * has no sk_socket field (surely TCP_TIME_WAIT, perhaps more).
551 * To be safe, use a whitelist and not a blacklist.
552 * Before dereferencing sk_socket grab a lock on sk_callback_lock.
554 * Once again we need conditional code because the UID and GID
555 * location changes between kernels.
557 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,28)
558 /* use the current's real uid/gid */
559 #define _CURR_UID f_uid
560 #define _CURR_GID f_gid
561 #else /* 2.6.29 and above */
562 /* use the current's file access real uid/gid */
563 #define _CURR_UID f_cred->fsuid
564 #define _CURR_GID f_cred->fsgid
567 #ifdef CONFIG_VSERVER
568 ugp->fw_groups[1] = sk->sk_xid;
569 ugp->fw_groups[2] = sk->sk_nid;
572 ugp->fw_groups[2] = 0;
576 #define GOOD_STATES ( \
577 (1<<TCP_LISTEN) | (1<<TCP_SYN_RECV) | (1<<TCP_SYN_SENT) | \
578 (1<<TCP_ESTABLISHED) | (1<<TCP_FIN_WAIT1) | (1<<TCP_FIN_WAIT2) )
579 // surely exclude TCP_CLOSE, TCP_TIME_WAIT, TCP_LAST_ACK
580 // uncertain TCP_CLOSE_WAIT and TCP_CLOSING
582 if ((1<<st) & GOOD_STATES) {
583 read_lock_bh(&sk->sk_callback_lock);
584 if (sk->sk_socket && sk->sk_socket->file) {
585 ugp->fw_uid = sk->sk_socket->file->_CURR_UID;
586 ugp->fw_groups[0] = sk->sk_socket->file->_CURR_GID;
588 read_unlock_bh(&sk->sk_callback_lock);
590 ugp->fw_uid = ugp->fw_groups[0] = 0;
592 if (!skb->sk) /* return the reference that came from the lookup */
601 * Now prepare to hook the various functions.
602 * Linux 2.4 has a different API so we need some adaptation
603 * for register and unregister hooks
605 * the unregister function changed arguments between 2.6.22 and 2.6.24
607 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
609 nf_register_hooks(struct nf_hook_ops *ops, int n)
612 for (i = 0; i < n; i++) {
613 ret = nf_register_hook(ops + i);
621 nf_unregister_hooks(struct nf_hook_ops *ops, int n)
624 for (i = 0; i < n; i++) {
625 nf_unregister_hook(ops + i);
628 #define REG_QH_ARG(fn) fn, NULL /* argument for nf_[un]register_queue_handler */
629 #define UNREG_QH_ARG(fn) //fn /* argument for nf_[un]register_queue_handler */
630 #define SET_MOD_OWNER
632 #else /* linux >= 2.6.0 */
634 struct nf_queue_handler ipfw2_queue_handler_desc = {
635 .outfn = ipfw2_queue_handler,
636 .name = "ipfw2 dummynet queue",
638 #define REG_QH_ARG(fn) &(fn ## _desc)
640 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
641 #define UNREG_QH_ARG(fn) //fn /* argument for nf_[un]register_queue_handler */
643 #define UNREG_QH_ARG(fn) , &(fn ## _desc)
644 #endif /* 2.6.0 < LINUX > 2.6.24 */
646 #define SET_MOD_OWNER .owner = THIS_MODULE,
648 #endif /* !LINUX < 2.6.0 */
650 static struct nf_hook_ops ipfw_ops[] __read_mostly = {
654 .hooknum = IPFW_HOOK_IN,
655 .priority = NF_IP_PRI_FILTER,
661 .hooknum = NF_IP_POST_ROUTING,
662 .priority = NF_IP_PRI_FILTER,
666 #endif /* !__linux__ */
668 /* descriptors for the children */
669 extern moduledata_t *moddesc_ipfw;
670 extern moduledata_t *moddesc_dummynet;
673 * Module glue - init and exit function.
676 ipfw_module_init(void)
680 printf("%s in-hook %d svn id %s\n", __FUNCTION__, IPFW_HOOK_IN, "$Id$");
682 my_mod_register(moddesc_ipfw, "ipfw", 1);
683 my_mod_register(moddesc_dummynet, "dummynet", 2);
689 #else /* linux hook */
690 /* sockopt register, in order to talk with user space */
691 ret = nf_register_sockopt(&ipfw_sockopts);
693 printf("error %d in nf_register_sockopt\n", ret);
697 /* queue handler registration, in order to get network
698 * packet under a private queue */
699 ret = nf_register_queue_handler(PF_INET, REG_QH_ARG(ipfw2_queue_handler) );
700 if (ret < 0) /* queue busy */
701 goto unregister_sockopt;
703 ret = nf_register_hooks(ipfw_ops, ARRAY_SIZE(ipfw_ops));
705 goto unregister_sockopt;
707 printf("%s loaded\n", __FUNCTION__);
711 /* handle errors on load */
713 nf_unregister_queue_handler(PF_INET UNREG_QH_ARG(ipfw2_queue_handler) );
714 nf_unregister_sockopt(&ipfw_sockopts);
718 printf("%s error\n", __FUNCTION__);
724 /* module shutdown */
726 ipfw_module_exit(void)
729 #else /* linux hook */
730 nf_unregister_hooks(ipfw_ops, ARRAY_SIZE(ipfw_ops));
731 /* maybe drain the queue before unregistering ? */
732 nf_unregister_queue_handler(PF_INET UNREG_QH_ARG(ipfw2_queue_handler) );
733 nf_unregister_sockopt(&ipfw_sockopts);
738 printf("%s unloaded\n", __FUNCTION__);
742 module_init(ipfw_module_init)
743 module_exit(ipfw_module_exit)
744 MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */