Added the new version for dummynet.
authormarta <marta@8c455092-636d-4788-adf5-e71def0336e8>
Wed, 6 Jan 2010 19:18:48 +0000 (19:18 +0000)
committermarta <marta@8c455092-636d-4788-adf5-e71def0336e8>
Wed, 6 Jan 2010 19:18:48 +0000 (19:18 +0000)
The new code is located in the dummynet2 directory, and the spec file
was changed to use this latest version.

Major changes related to PlanetLab are the new table lookup support,
a little fix to accept packets after the reinjection and code cleanup.
The new table lookup support will allow a PlanetLab user to jump
directly to their own rule section, avoiding scanning the whole
ruleset list.

41 files changed:
Makefile
Makefile.openwrt
README
dummynet/Makefile
dummynet/bsd_compat.c
dummynet/include/sys/kernel.h
dummynet/include/sys/mbuf.h
dummynet/include/sys/module.h
dummynet/ip_dummynet.c
dummynet/ip_fw2.c
dummynet/ip_fw_pfil.c
dummynet/ipfw2_mod.c
dummynet/missing.h
dummynet/radix.c
dummynet2/Makefile [new file with mode: 0644]
dummynet2/bsd_compat.c [new file with mode: 0644]
dummynet2/in_cksum.c [new file with mode: 0644]
dummynet2/include/netgraph/ng_ipfw.h [new file with mode: 0644]
dummynet2/include/netinet/ip_dummynet.h [new file with mode: 0644]
dummynet2/include/netinet/ip_fw.h [new file with mode: 0644]
dummynet2/include/netinet/ipfw/ip_fw_private.h [new file with mode: 0644]
dummynet2/ip_dummynet.c [new file with mode: 0644]
dummynet2/ip_fw2.c [new file with mode: 0644]
dummynet2/ip_fw_dynamic.c [new file with mode: 0644]
dummynet2/ip_fw_log.c [new file with mode: 0644]
dummynet2/ip_fw_lookup.c [new file with mode: 0644]
dummynet2/ip_fw_nat.c [new file with mode: 0644]
dummynet2/ip_fw_pfil.c [new file with mode: 0644]
dummynet2/ip_fw_sockopt.c [new file with mode: 0644]
dummynet2/ip_fw_table.c [new file with mode: 0644]
dummynet2/ipfw2_mod.c [new file with mode: 0644]
dummynet2/missing.h [new file with mode: 0644]
dummynet2/radix.c [new file with mode: 0644]
glue.h
ipfw/Makefile
ipfw/dummynet.c
ipfw/glue.c
ipfw/include_e/libutil.h [new file with mode: 0644]
ipfw/include_e/sys/sockio.h [new file with mode: 0644]
ipfw/ipfw2.c
planetlab/ipfwroot.spec

index f863838..51a00a9 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -20,6 +20,7 @@ all clean distclean:
        echo target is $(@)
        (cd ipfw && $(MAKE) $(@) )
        (cd dummynet && $(MAKE) $(@) )
+       (cd dummynet2 && $(MAKE) $(@) )
 
 snapshot:
        (cd ..; tar cvzhf /tmp/$(SNAPSHOT_NAME).tgz --exclude .svn \
index 50dae83..b618a52 100644 (file)
@@ -44,7 +44,9 @@ define Build/Prepare
   # $(warning Preparing ipfw sources)
        mkdir -p $(PKG_BUILD_DIR)
        $(CP) -Rp $(IPFW_DIR)/* $(PKG_BUILD_DIR)/
+       (cd $(PKG_BUILD_DIR)/ipfw && $(MAKE) include_e )
        (cd $(PKG_BUILD_DIR)/dummynet && $(MAKE) include_e )
+       (cd $(PKG_BUILD_DIR)/dummynet2 && $(MAKE) include_e )
 endef
 
 define Build/Compile
@@ -54,10 +56,15 @@ define Build/Compile
                ARCH="$(LINUX_KARCH)" \
                SUBDIRS="$(PKG_BUILD_DIR)/dummynet" \
                VER=openwrt modules
+       $(MAKE) -C "$(LINUX_DIR)" \
+               CROSS_COMPILE="$(TARGET_CROSS)" \
+               ARCH="$(LINUX_KARCH)" \
+               SUBDIRS="$(PKG_BUILD_DIR)/dummynet2" \
+               VER=openwrt modules
        # compile the userland part for openwrt
        $(MAKE) -C $(PKG_BUILD_DIR)/ipfw \
                $(TARGET_CONFIGURE_OPTS) \
-               CFLAGS="$(TARGET_CFLAGS) -I./include -include ../glue.h" \
+               CFLAGS="$(TARGET_CFLAGS) -I./include_e -I./include -include ../glue.h" \
                VER=openwrt all
 endef
 
diff --git a/README b/README
index 7ab66bf..0c3b4e8 100644 (file)
--- a/README
+++ b/README
@@ -14,6 +14,9 @@ version in RELENG_7 and HEAD as of December 2009), plus some glue code
 and headers written from scratch.
 Unless specified otherwise, all the code here is under a BSD license.
 
+Note:
+       - the linux version miss the "one_pass" feature
+
 =================== BUILD INSTRUCTIONS ==========================
 
 ***** Linux 2.6.x ******
@@ -35,6 +38,10 @@ Unless specified otherwise, all the code here is under a BSD license.
            Networking options  --->
               [*] Network packet filtering framework (Netfilter)
 
+       If you have not yet compiled your kernel source, you need to
+       prepare the build environment:
+
+       (cd $(KERNELPATH); make oldconfig; make prepare; make scripts)
 
 ***** Linux 2.4.x *****
 
@@ -114,6 +121,10 @@ Unless specified otherwise, all the code here is under a BSD license.
     rmmod ipfw_mod.o                            # remove the module
 
 ***** PLANETLAB BUILD (within a slice) *****
+These instruction can be used by PlanetLab developers to compile the dummynet module
+on a node. To install the module on the node users need root access in root context.
+PlanetLab users that want to use the dummynet package should ask to PlanetLab support
+for nodes with dummynet emulation capabilities.
 
     Follow the instructions below. You can just cut&paste
 
index cac1958..6c6d9f6 100644 (file)
@@ -5,7 +5,6 @@
 #
 # The defaults are set to build without modifications on PlanetLab
 # and possibly 2.6 versions.
-#
 
 # Some variables need to have specific names, because they are used
 # by the build infrastructure on Linux and OpenWrt. They are:
@@ -33,27 +32,40 @@ $(warning including dummynet/Makefile)
 # lets default for 2.6 for planetlab builds
 VER ?= 2.6
 
-# General values
+#--- General values for all types of build ---
+# obj-m is the target module
 obj-m := ipfw_mod.o
 
+#-- the list of source files. IPFW_SRCS is our own name.
+# Original ipfw and dummynet sources + FreeBSD stuff,
+IPFW_SRCS = ip_fw2.c ip_dummynet.c ip_fw_pfil.c in_cksum.c
+IPFW_SRCS += radix.c 
+# Module glue and functions missing in linux
+IPFW_SRCS += ipfw2_mod.c bsd_compat.c
+
 # generic cflags used on all systems
 #ipfw-cflags += -DIPFW_HASHTABLES
-ipfw-cflags += -DIPFIREWALL_DEFAULT_TO_ACCEPT -DTRACE
+ipfw-cflags += -DIPFIREWALL_DEFAULT_TO_ACCEPT
 # _BSD_SOURCE enables __FAVOR_BSD (udp/tcp bsd structs instead of posix)
 ipfw-cflags += -D_BSD_SOURCE
 ipfw-cflags += -DKERNEL_MODULE # build linux kernel module
 # the two header trees for empty and override files
-ipfw-cflags += -I $(M)/include_e -I $(M)/include
+ipfw-cflags += -I $(M)/include_e
+ipfw-cflags += -I $(M)/include
 ipfw-cflags += -include $(M)/../glue.h # headers
+ipfw-cflags += -include $(M)/missing.h # headers
 
 $(warning "---- Building dummynet kernel module for Version $(VER)")
+
 # We have three sections for OpenWrt, Linux 2.4 and Linux 2.6
-#
+
 ifeq ($(VER),openwrt)
+  #--- The Makefile section for openwrt ---
+  # We do not include a dependency on include_e as it is called
+  # by Makefile.openwrt in Build/Prepare
   M=.
-  obj-y := ipfw2_mod.o bsd_compat.o \
-       in_cksum.o ip_dummynet.o ip_fw2.o ip_fw_pfil.o radix.o
-  O_TARGET := ipfw_mod.o
+  obj-y := $(IPFW_SRCS:%.c=%.o)
+  O_TARGET := $(obj-m)
 
   # xcflags-y is a temporary variable where we store build options
   xcflags-y += -O1 -DLINUX_24
@@ -72,22 +84,22 @@ else        # !openwrt, below we do linux builds for 2.4 and 2.6
   # We can override it from the command line, or let the system guess.
 
 ifneq ($(shell echo $(VER)|grep '2.4'),)
-  # The linux 2.4 version
+  # Makefile section for the linux 2.4 version
+  # tested on linux-2.4.35.4, does not work with 2.4.37
+  #
   # guess the kernel path -- or is it under /lib/modules ?
-  KERNELPATH ?= /usr/src/`uname -r`/build
-
-  # Guess the gcc include directory
-  # The gcc version is in the last line returned by gcc -v
-  # gcc version 4.3.2 (Debian 4.3.2-1.1)
-  MYGCC_VER ?= $(shell gcc -v 2>&1 |tail -n 1 | cut -d " " -f 3)
-  # We don't know the exact directory unde /usr/lib/gcc so we guess
+  KERNELPATH ?= /usr/src/`uname -r`
+
+  # We need to figure out the gcc include directory, if not
+  # set by the user through MYGCC_INCLUDE
+  # Find compiler version (3rd field in last line returned by gcc -v)
+  # e.g.       gcc version 4.3.2 (Debian 4.3.2-1.1)
+  MYGCC_VER ?= $(shell $(CC) -v 2>&1 |tail -n 1 | cut -d " " -f 3)
+  # We don't know the exact directory under /usr/lib/gcc so we guess
   MYGCC_INCLUDE ?= $(shell echo /usr/lib/gcc/*/$(MYGCC_VER) | cut -d " " -f 1)/include
   $(warning "---- gcc includes guessed to $(MYGCC_INCLUDE)")
 
   # additional warning
-  #WARN = -Wp,-MD,/home/luigi/ports-luigi/dummynet-branches/ipfw_mod/dummynet/.ipfw2_mod.o.d
-  #WARN += -Iinclude  -include include/linux/autoconf.h
-
   WARN += -Wall -Wundef
   WARN += -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing
   WARN += -fno-common -Werror-implicit-function-declaration
@@ -96,22 +108,29 @@ ifneq ($(shell echo $(VER)|grep '2.4'),)
   WARN += -m32 -msoft-float # -mregparm=3
   #WARN += -freg-struct-return -mpreferred-stack-boundary=2
   WARN += -Wno-sign-compare
-  WARN += -Wdeclaration-after-statement -Wno-pointer-sign
+  WARN += -Wdeclaration-after-statement
+  ifneq ($(MYGCC_VER),3.4.6)
+       WARN += -Wno-pointer-sign
+  endif
 
   ccflags-y += -O1 -DLINUX_24
   CFLAGS = -DMODULE -D__KERNEL__ -nostdinc \
-       -isystem ${KERNELPATH}/include -isystem $(MYGCC_INCLUDE) ${ccflags-y}
+       -isystem ${KERNELPATH}/include -isystem $(MYGCC_INCLUDE) \
+       ${ccflags-y}
   # The Main target
 all: mod24
 
-else
+else # !2.4 --
+
+  # This is the Makefile section for Linux 2.6.x including planetlab
+
 ifeq ($(IPFW_PLANETLAB),1)
   $(warning "---- Building for PlanetLab")
   ipfw-cflags += -DIPFW_PLANETLAB        # PlanetLab compilation
 endif
   # if not set, use the version from the installed system
   KERNELPATH ?= /lib/modules/`uname -r`/build
-  # the latest kernel
+  # Otherwise, if you have kernel sources, try something like this:
   #KERNELPATH = /usr/src/linux-2.6.22
   $(warning "---- Building Version 2.6 $(VER) in $(KERNELPATH)")
   WARN := -O1 -Wall -Werror -DDEBUG_SPINLOCK -DDEBUG_MUTEXES
@@ -119,34 +138,39 @@ endif
 
   # Required by kernel <= 2.6.22, ccflags-y is used on newer version
   LINUX_VERSION_CODE := $(shell grep LINUX_VERSION_CODE $(KERNELPATH)/include/linux/version.h|cut -d " " -f3)
-  ifeq ($(LINUX_VERSION_CODE),132630)
+  ifeq ($(shell if [ -z $(LINUX_VERSION_CODE) ] ; then echo "true"; fi),true)
+    $(warning "---- Perhaps you miss a (cd $(KERNELPATH); make oldconfig; make prepare; make scripts)");
+  endif
+  ifeq ($(shell if [ $(LINUX_VERSION_CODE) -le 132630 ] ; then echo "true"; fi),true)
     EXTRA_CFLAGS += $(ccflags-y)
   endif
 
 all: include_e
        $(MAKE) -C $(KERNELPATH) V=1 M=`pwd` modules
-endif
+endif # !2.4
 
-#-- back to the common section of code
+#-- back to the common section of code for Linux 2.4 and 2.6
 
 # the list of objects used to build the module
 ipfw_mod-y = $(IPFW_SRCS:%.c=%.o)
 
-# Original ipfw and dummynet sources + FreeBSD stuff,
-IPFW_SRCS = ip_fw2.c ip_dummynet.c ip_fw_pfil.c in_cksum.c
-IPFW_SRCS += radix.c 
-# Module glue and functions missing in linux
-IPFW_SRCS += ipfw2_mod.c bsd_compat.c hashtable.c
-
 # additional $(CC) flags
 ccflags-y += $(WARN)
 ccflags-y += $(ipfw-cflags)
+# if we really want debug symbols...
 ccflags-y += -g
 
 mod24: include_e $(obj-m)
 
 $(obj-m): $(ipfw_mod-y)
        $(LD) $(LDFLAGS) -m elf_i386 -r -o $@ $^
+
+# M is the current directory, used in recursive builds
+# so we allow it to be overridden
+M ?= $(shell pwd)
+endif # !openwrt
+
+#--- various common targets
 clean:
        -rm -f *.o *.ko Module.symvers *.mod.c
        -rm -rf include_e
@@ -172,6 +196,7 @@ EFILES += net/vnet.h
 
 EFILES += netinet/ether.h netinet/icmp6.h netinet/if_ether.h
 EFILES += netinet/in.h netinet/in_pcb.h netinet/in_var.h
+EFILES += netinet/in_systm.h
 EFILES += netinet/ip_carp.h netinet/ip_var.h netinet/pim.h
 EFILES += netinet/sctp.h netinet/tcp_timer.h netinet/tcpip.h
 EFILES += netinet/udp_var.h
@@ -184,14 +209,13 @@ EFILES += sys/limits.h sys/lock.h sys/mutex.h sys/priv.h
 EFILES += sys/proc.h sys/rwlock.h sys/socket.h sys/socketvar.h
 EFILES += sys/sysctl.h sys/time.h sys/ucred.h
 
-M ?= $(shell pwd)
 include_e:
        echo "running in $M"
        -@rm -rf $(M)/include_e opt_*
        -@mkdir -p $(M)/include_e
        -@(cd $(M)/include_e; mkdir -p $(EDIRS); touch $(EFILES) )
 
-endif # !openwrt
 
+#--- some other targets for testing purposes
 test_radix: test_radix.o radix.o
-test_radix: CFLAGS=-Wall -Werror -O1
+test_radix: CFLAGS=-Wall -Werror -O2
index 995d60c..cad3c5d 100644 (file)
@@ -24,7 +24,7 @@
  */
 
 /*
- * $Id$
+ * $Id: bsd_compat.c 4508 2009-12-15 21:54:14Z luigi $
  *
  * kernel variables and functions that are not available in linux.
  */
@@ -32,7 +32,6 @@
 #include <sys/cdefs.h>
 #include <asm/div64.h> /* do_div on 2.4 */
 #include <linux/random.h>      /* get_random_bytes on 2.4 */
-#include "missing.h"
 
 /*
  * gettimeofday would be in sys/time.h but it is not
index 61b3bec..fbc9581 100644 (file)
@@ -5,7 +5,13 @@
 #define _SYS_KERNEL_H_
 
 #define SYSINIT(a, b, c, d, e)  \
-        void *dummy_ ## d = d
+        void *sysinit_ ## d = d
+#define VNET_SYSINIT(a, b, c, d, e)  \
+        void *sysinit_ ## d = d
+#define SYSUNINIT(a, b, c, d, e)  \
+        void *sysuninit_ ## d = d
+#define VNET_SYSUNINIT(a, b, c, d, e)  \
+        void *sysuninit_ ## d = d
 
 /*
  * Some enumerated orders; "ANY" sorts last.
index ed3d3a1..12837bf 100644 (file)
@@ -107,11 +107,21 @@ m_tag_prepend(struct mbuf *m, struct m_tag *t)
        SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link);
 }
 
+/*
+ * Return the next tag in the list of tags associated with an mbuf.
+ */
+static __inline struct m_tag *
+m_tag_next(struct mbuf *m, struct m_tag *t)
+{
+        return (SLIST_NEXT(t, m_tag_link));
+}
+
 /*
  * Create an mtag of the given type
  */
 static __inline struct m_tag *
-m_tag_get(int type, int length, int wait)
+m_tag_alloc(uint32_t cookie, int type, int length, int wait)
 {
        int l = length + sizeof(struct m_tag);
        struct m_tag *m = malloc(l, 0, M_NOWAIT);
@@ -119,10 +129,17 @@ m_tag_get(int type, int length, int wait)
                memset(m, 0, l);
                m->m_tag_id = type;
                m->m_tag_len = length;
+               m->m_tag_cookie = cookie;
        }
        return m;
 };
 
+static __inline struct m_tag *
+m_tag_get(int type, int length, int wait)
+{
+       return m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait);
+}
+
 static __inline struct m_tag *
 m_tag_first(struct mbuf *m)
 {
@@ -140,6 +157,7 @@ m_tag_locate(struct mbuf *m, u_int32_t n, int x, struct m_tag *t)
        return NULL;
 };
 
+#define M_SETFIB(_m, _fib)     /* nothing on linux */
 static __inline void
 m_freem(struct mbuf *m)
 {
@@ -156,7 +174,7 @@ m_freem(struct mbuf *m)
 };
 
 /* we cannot pullup */
-#define m_pullup(__m, __i)     (m)
+//#define m_pullup(__m, __i)   (m)
 
 #define M_GETFIB(_m)   0
 
index 5296517..85bf220 100644 (file)
@@ -19,7 +19,6 @@ typedef struct moduledata {
         void            *priv;          /* extra data */
 } moduledata_t;
 
-int my_mod_register(struct moduledata *mod, const char *name, int order);
 /*
  * Hook the module descriptor, md, into our list of things to do.
  * We should in principle respect the order of loading.
index 0b23881..9fd70e2 100644 (file)
@@ -56,8 +56,6 @@ __FBSDID("$FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.110.2.4 2008/10/31 12:58:1
  * include files marked with XXX are probably not needed
  */
 
-#include "missing.h"
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
index 4e46566..21d1b41 100644 (file)
@@ -70,11 +70,6 @@ __FBSDID("$FreeBSD: src/sys/netinet/ip_fw2.c,v 1.175.2.13 2008/10/30 16:29:04 bz
 #include <net/pf_mtag.h>
 #include <net/vnet.h>
 
-#ifdef linux
-#define INP_LOCK_ASSERT                /* define before missing.h otherwise ? */
-#include "missing.h"
-#endif
-
 #define        IPFW_INTERNAL   /* Access to protected data structures in ip_fw.h. */
 
 #include <netinet/in.h>
@@ -104,10 +99,6 @@ __FBSDID("$FreeBSD: src/sys/netinet/ip_fw2.c,v 1.175.2.13 2008/10/30 16:29:04 bz
 
 #include <machine/in_cksum.h>  /* XXX for in_cksum */
 
-#ifdef IPFW_HASHTABLES
-#include "hashtable.h"
-#endif
-
 #ifdef MAC
 #include <security/mac/mac_framework.h>
 #endif
@@ -183,18 +174,14 @@ SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose,
 SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
     CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
     "Set upper limit of matches of ipfw rules logged");
-static unsigned int dummy_default_rule = IPFW_DEFAULT_RULE;
+unsigned int dummy_default_rule = IPFW_DEFAULT_RULE;
 SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
     &dummy_default_rule, IPFW_DEFAULT_RULE,
     "The default/max possible rule number.");
-static unsigned int dummy_tables_max = IPFW_TABLES_MAX;
+unsigned int dummy_tables_max = IPFW_TABLES_MAX;
 SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD,
     &dummy_tables_max, IPFW_TABLES_MAX,
     "The maximum number of tables.");
-static unsigned int skipto_entries = 256;
-SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, skipto_entries,
-    CTLFLAG_RW, &skipto_entries, 0,
-    "Number of entries in the skipto cache");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
     &default_to_accept, 0,
     "Make the default rule accept all packets.");
@@ -1886,61 +1873,6 @@ send_reject(struct ip_fw_args *args, int code, int ip_len, struct ip *ip)
        args->m = NULL;
 }
 
-static void
-set_skipto_table(struct ip_fw_chain *ch)
-{
-       int i, n, sh;
-       struct ip_fw *f, **t, **oldt;
-
-       for (sh = 15; sh > 0; sh--)
-               if (skipto_entries > 1<<sh)
-                       break;
-       sh++;
-       skipto_entries = 1<< (16 - sh) ;
-       /* XXX unsafe and too long */
-       t = malloc(skipto_entries * sizeof(*t), M_IPFW_TBL, M_WAITOK | M_ZERO);
-       if (t == NULL)
-               return;
-       IPFW_RLOCK(ch);
-       /* Store pointers in the table. In the loop i is the next
-        * free slot, n is the slot where the current rule goes.
-        */
-       for (i = 0, f = ch->rules; f; f = f->next) {
-               n = f->rulenum >> sh ;
-               while (i <= n)
-                       t[i++] = f;
-       }
-       V_layer3_chain.skipto_shift = sh;
-       V_layer3_chain.skipto_size = skipto_entries;
-       oldt = V_layer3_chain.skipto_ptrs;
-       V_layer3_chain.skipto_ptrs = t;
-       IPFW_RUNLOCK(ch);
-       if (oldt) {
-               IPFW_WLOCK(ch);
-               IPFW_WUNLOCK(ch);
-               /* now can free oldt */
-               free(oldt, M_IPFW_TBL);
-       }
-}
-#if 0
-/*
- * Map a rule number to a rule pointer, using the skipto table.
- * First lookup the slot, then follow the chain until we find a
- * non-null entry with rulenum >= num. Return default_rule on error.
- */
-static struct ip_fw *
-rule2ptr(struct ip_fw_chain *ch, int num)
-{
-       struct ip_fw *r = NULL;
-       int ix = (num & 0xffff) >> ch->skipto_shift;
-
-       while (ix < ch->skipto_size && (r = ch->skipto_ptrs[ix]) == NULL)
-               ix++;
-       while (r && num < r->rulenum)
-               r = r->next;
-       return (r ? r : ch->default_rule);
-}
-#endif
 /**
  *
  * Given an ip_fw *, lookup_next_rule will return a pointer
@@ -1957,10 +1889,11 @@ rule2ptr(struct ip_fw_chain *ch, int num)
  */
 
 static struct ip_fw *
-lookup_next_rule(struct ip_fw_chain *ch, struct ip_fw *me, uint32_t tablearg)
+lookup_next_rule(struct ip_fw *me, u_int32_t tablearg)
 {
        struct ip_fw *rule = NULL;
        ipfw_insn *cmd;
+       u_int16_t       rulenum;
 
        /* look for action, in case it is a skipto */
        cmd = ACTION_PTR(me);
@@ -1970,19 +1903,21 @@ lookup_next_rule(struct ip_fw_chain *ch, struct ip_fw *me, uint32_t tablearg)
                cmd += F_LEN(cmd);
        if (cmd->opcode == O_TAG)
                cmd += F_LEN(cmd);
-       if (cmd->opcode != O_SKIPTO ) {
-               rule = me->next;
-       } else {
-               tablearg = tablearg ? tablearg : cmd->arg1;
+       if (cmd->opcode == O_SKIPTO ) {
+               if (tablearg != 0) {
+                       rulenum = (u_int16_t)tablearg;
+               } else {
+                       rulenum = cmd->arg1;
+               }
                for (rule = me->next; rule ; rule = rule->next) {
-                       if (rule->rulenum >= tablearg) {
+                       if (rule->rulenum >= rulenum) {
                                break;
                        }
                }
-
-//             rule = rule2ptr(ch, tablearg ? tablearg : cmd->arg1);
        }
-       me->next_rule = rule; /* XXX perhaps unnecessary ? */
+       if (rule == NULL)               /* failure or not a skipto */
+               rule = me->next;
+       me->next_rule = rule;
        return rule;
 }
 
@@ -1994,11 +1929,6 @@ add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
        struct table_entry *ent;
        struct radix_node *rn;
 
-#ifdef IPFW_HASHTABLES
-       if (tbl >= 2*IPFW_TABLES_MAX)
-               return EINVAL;
-       return EINVAL; // XXX to be completed
-#endif
        if (tbl >= IPFW_TABLES_MAX)
                return (EINVAL);
        rnh = ch->tables[tbl];
@@ -2037,11 +1967,6 @@ del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
        struct table_entry *ent;
        struct sockaddr_in sa, mask;
 
-#ifdef IPFW_HASHTABLES
-       if (tbl >= 2*IPFW_TABLES_MAX)
-               return EINVAL;
-       return EINVAL; // XXX to be completed
-#endif
        if (tbl >= IPFW_TABLES_MAX)
                return (EINVAL);
        rnh = ch->tables[tbl];
@@ -2085,11 +2010,6 @@ flush_table(struct ip_fw_chain *ch, uint16_t tbl)
 
        IPFW_WLOCK_ASSERT(ch);
 
-#ifdef IPFW_HASHTABLES
-       if (tbl >= 2*IPFW_TABLES_MAX)
-               return EINVAL;
-       return EINVAL; // XXX to be completed
-#endif
        if (tbl >= IPFW_TABLES_MAX)
                return (EINVAL);
        rnh = ch->tables[tbl];
@@ -2107,10 +2027,6 @@ flush_tables(struct ip_fw_chain *ch)
 
        for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++)
                flush_table(ch, tbl);
-#ifdef IPFW_HASHTABLES
-       for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++)
-               ch->hashtab[tbl] = ipfw_ht_destroy(ch->hashtab[tbl]);
-#endif
 }
 
 static int
@@ -2127,10 +2043,6 @@ init_tables(struct ip_fw_chain *ch)
                        return (ENOMEM);
                }
        }
-#ifdef IPFW_HASHTABLES
-        for (i = 0; i < IPFW_TABLES_MAX; i++)
-               ch->hashtab[i] = ipfw_ht_destroy(ch->hashtab[i]);
-#endif
        return (0);
 }
 
@@ -2767,7 +2679,7 @@ do {                                                                      \
                        f = args->rule->next_rule;
 
                if (f == NULL)
-                       f = lookup_next_rule(chain, args->rule, 0);
+                       f = lookup_next_rule(args->rule, 0);
        } else {
                /*
                 * Find the starting rule. It can be either the first
@@ -2984,7 +2896,7 @@ do {                                                                      \
                                            a = dst_port;
                                        else if (v == 3)
                                            a = src_port;
-                                       else if (v >= 4 && v <= 6) {
+                                       else if (v == 4 || v == 5) {
                                            check_uidgid(
                                                    (ipfw_insn_u32 *)cmd,
                                                    proto, oif,
@@ -2994,16 +2906,12 @@ do {                                                                    \
 #ifdef linux
                                            if (v ==4 /* O_UID */)
                                                a = ucred_cache.uid;
-                                           else if (v == 5 /* O_GID */)
-                                               a = ucred_cache.gid;
-                                           else if (v == 6 /* O_JAIL */)
+                                           else if (v == 5 /* O_JAIL */)
                                                a = ucred_cache.xid;
 #else
                                            if (v ==4 /* O_UID */)
                                                a = (*uc)->cr_uid;
-                                           else if (v == 5 /* O_GID */)
-                                               ; // a = groupmember((gid_t)insn->d[0], *uc);
-                                           else if (v == 6 /* O_JAIL */)
+                                           else if (v == 5 /* O_JAIL */)
                                                a = (*uc)->cr_prison->pr_id;
 #endif
                                        } else
@@ -3590,10 +3498,10 @@ do {                                                                    \
                                }
                                /* handle skipto */
                                if (cmd->arg1 == IP_FW_TABLEARG) {
-                                       f = lookup_next_rule(chain, f, tablearg);
-                               } else {
+                                       f = lookup_next_rule(f, tablearg);
+                               } else { // XXX ?
                                        if (f->next_rule == NULL)
-                                               lookup_next_rule(chain, f, 0);
+                                               lookup_next_rule(f, 0);
                                        f = f->next_rule;
                                }
                                /*
@@ -3883,15 +3791,17 @@ add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
                goto done;
         }
 
+       /*
+        * If rulenum is 0, find highest numbered rule before the
+        * default rule, and add autoinc_step
+        */
        if (V_autoinc_step < 1)
                V_autoinc_step = 1;
        else if (V_autoinc_step > 1000)
                V_autoinc_step = 1000;
        if (rule->rulenum == 0) {
                /*
-                * If rulenum is 0, use highest numbered rule before
-                * the default, adding autoinc_step if room.
-                * Also set the number in the caller.
+                * locate the highest numbered rule before default
                 */
                for (f = chain->rules; f; f = f->next) {
                        if (f->rulenum == IPFW_DEFAULT_RULE)
@@ -3905,7 +3815,6 @@ add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
 
        /*
         * Now insert the new rule in the right place in the sorted list.
-        * XXX TODO also put in the skipto table.
         */
        for (prev = NULL, f = chain->rules; f; prev = f, f = f->next) {
                if (f->rulenum > rule->rulenum) { /* found the location */
@@ -3958,7 +3867,6 @@ remove_rule(struct ip_fw_chain *chain, struct ip_fw *rule,
                prev->next = n;
        V_static_count--;
        V_static_len -= l;
-       // XXX remove from the skipto table
 
        rule->next = chain->reap;
        chain->reap = rule;
@@ -5089,17 +4997,12 @@ ipfw_destroy(void)
        IPFW_WUNLOCK(&V_layer3_chain);
        if (reap != NULL)
                reap_rules(reap);
-       IPFW_DYN_LOCK_DESTROY();
        uma_zdestroy(ipfw_dyn_rule_zone);
+       IPFW_DYN_LOCK_DESTROY();
        if (V_ipfw_dyn_v != NULL)
                free(V_ipfw_dyn_v, M_IPFW);
        IPFW_LOCK_DESTROY(&V_layer3_chain);
 
-#ifdef INET6
-       /* Free IPv6 fw sysctl tree. */
-       sysctl_ctx_free(&ip6_fw_sysctl_ctx);
-#endif
-
        printf("IP firewall unloaded\n");
 }
 
@@ -5154,8 +5057,6 @@ vnet_ipfw_init(const void *unused)
        IPFW_LOCK_INIT(&V_layer3_chain);
        callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE);
 
-       set_skipto_table(&V_layer3_chain);
-
        bzero(&default_rule, sizeof default_rule);
        default_rule.act_ofs = 0;
        default_rule.rulenum = IPFW_DEFAULT_RULE;
index 368192a..b3fcba6 100644 (file)
@@ -43,17 +43,20 @@ __FBSDID("$FreeBSD: src/sys/netinet/ip_fw_pfil.c,v 1.25.2.2 2008/04/25 10:26:30
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
+#include <sys/ucred.h>
 
 #include <net/if.h>
+#include <net/route.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
-#include "missing.h"
-
 #include <netinet/in.h>
+#include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
index 667d487..4b7edd1 100644 (file)
@@ -49,8 +49,6 @@
 #include <sys/mbuf.h>                  /* sizeof struct mbuf */
 #include <sys/param.h>                 /* NGROUPS */
 
-#include "missing.h"
-
 #ifdef __linux__
 #include <linux/module.h>
 #include <linux/kernel.h>
@@ -407,7 +405,7 @@ ipfw2_queue_handler(QH_ARGS)
        }
 
        if (m != NULL) {        /* Accept. reinject and free the mbuf */
-               REINJECT(info, NF_STOP);
+               REINJECT(info, NF_ACCEPT);
                m_freem(m);
        } else if (ret == 0) {
                /* dummynet has kept the packet, will reinject later. */
@@ -502,7 +500,7 @@ linux_lookup(const int proto, const __be32 saddr, const __be16 sport,
        if (proto != IPPROTO_TCP)       /* XXX extend for UDP */
                return -1;
 
-       if ((dir ? (void *)skb->dst : (void *)skb->dev) == NULL) {
+       if ((dir ? (void *)skb_dst(skb) : (void *)skb->dev) == NULL) {
                panic(" -- this should not happen\n");
                return -1;
        }
index d18f503..5b04dce 100644 (file)
@@ -33,6 +33,8 @@
 #ifndef _MISSING_H_
 #define _MISSING_H_
 
+#include <sys/cdefs.h>
+
 #ifdef _WIN32
 
 #ifndef DEFINE_SPINLOCK
@@ -50,6 +52,7 @@
 
 #else  /* __linux__ */
 
+#define MALLOC_DECLARE(x)      /* nothing */
 #include <linux/time.h>                /* do_gettimeofday */
 #include <netinet/ip.h>                /* local version */
 struct inpcb;
@@ -122,7 +125,11 @@ struct malloc_type {
 
 #define CTASSERT(x)
 
-#define log(_level, fmt, arg...)  printk(KERN_ERR fmt, ##arg)
+/* log... does not use the first argument */
+#define        LOG_ERR         0x100
+#define        LOG_INFO        0x200
+#define log(_level, fmt, arg...)  do {                 \
+       int __unused x=_level;printk(KERN_ERR fmt, ##arg); } while (0)
 
 /*
  * gettimeofday would be in sys/time.h but it is not
@@ -263,6 +270,10 @@ int in_cksum(struct mbuf *m, int len);
 #define INADDR_TO_IFP(a, b) b = NULL
 #define pf_find_mtag(a) NULL
 #define pf_get_mtag(a) NULL
+/* we don't pullup, fail */
+#define m_pullup(m, x)                                 \
+       ((m)->m_len >= x ? (m) : (netisr_dispatch(-1, m), NULL))
+
 #ifndef _WIN32
 #define AF_LINK AF_ASH /* ? our sys/socket.h */
 #endif
@@ -389,7 +400,6 @@ struct sock *inet_lookup(
         const __be32 saddr, const __be16 sport,
         const __be32 daddr, const __be16 dport,
         const int dif);
-static int inet_iif(const struct sk_buff *skb);
 struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif);
 #endif /* Linux < 2.6 */
 
@@ -504,4 +514,6 @@ extern  ip_fw_chk_t     *ip_fw_chk_ptr;
 #define SYSCTL_VNET_PROC       SYSCTL_PROC
 #define SYSCTL_VNET_INT                SYSCTL_INT
 
+int my_mod_register(struct moduledata *mod, const char *name, int order);
+
 #endif /* !_MISSING_H_ */
index 575c47c..639a561 100644 (file)
@@ -36,7 +36,6 @@
 #include <sys/param.h>
 #ifdef _KERNEL
 #include <sys/cdefs.h>
-#include "missing.h"
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
@@ -382,7 +381,7 @@ int rn_debug =  1;
  * the leaf (see RNTORT() in route.c), the second one is the parent.
  * This routine initializes the relevant fields of the nodes, so that
  * the leaf is the left child of the parent node, and both nodes have
- * (almost) all all fields filled as appropriate.
+ * (almost) all fields filled as appropriate.
  * (XXX some fields are left unset, see the '#if 0' section).
  * The function returns a pointer to the parent node.
  */
diff --git a/dummynet2/Makefile b/dummynet2/Makefile
new file mode 100644 (file)
index 0000000..2fe1d7b
--- /dev/null
@@ -0,0 +1,226 @@
+#
+# $Id: Makefile 4657 2010-01-04 11:20:53Z marta $
+#
+# gnu Makefile to build linux module for ipfw+dummynet.
+#
+# The defaults are set to build without modifications on PlanetLab
+# and possibly 2.6 versions.
+
+# Some variables need to have specific names, because they are used
+# by the build infrastructure on Linux and OpenWrt. They are:
+# 
+#   ccflags-y  additional $(CC) flags
+#   M          used by Kbuild, we must set it to `pwd`
+#   obj-m      list of .o modules to build
+#   $(MOD)-y   for each $MOD in obj-m, the list of objects
+#   obj-y      same as above, for openwrt
+#   O_TARGET   the link target, for openwrt
+#   EXTRA_CFLAGS as the name says... in openwrt
+#   EXTRA_CFLAGS is used in 2.6.22 module kernel compilation too
+#   KERNELPATH the path to the kernel sources or headers
+#
+# Not sure about this (the name might be reserved)
+#   ipfw-cflags                our flags for building the module
+#
+# Other variables are only private and can be renamed. They include:
+#
+#   VER                linux version we are building for (2.4 2.6 or openwrt)
+#---
+
+$(warning including dummynet2/Makefile)
+
+# let's default to 2.6 for PlanetLab builds
+VER ?= 2.6
+
+#--- General values for all types of build ---
+# obj-m is the target module
+obj-m := ipfw_mod.o
+
+#-- the list of source files. IPFW_SRCS is our own name.
+# Original ipfw and dummynet sources + FreeBSD stuff,
+IPFW_SRCS := ip_fw2.c ip_dummynet.c ip_fw_pfil.c ip_fw_sockopt.c
+IPFW_SRCS += ip_fw_dynamic.c ip_fw_table.c ip_fw_log.c
+IPFW_SRCS += radix.c in_cksum.c
+# Module glue and functions missing in linux
+IPFW_SRCS += ipfw2_mod.c bsd_compat.c
+
+# generic cflags used on all systems
+#ipfw-cflags += -DIPFW_HASHTABLES
+ipfw-cflags += -DIPFIREWALL_DEFAULT_TO_ACCEPT
+# _BSD_SOURCE enables __FAVOR_BSD (udp/tcp bsd structs instead of posix)
+ipfw-cflags += -D_BSD_SOURCE
+ipfw-cflags += -DKERNEL_MODULE # build linux kernel module
+# the two header trees for empty and override files
+ipfw-cflags += -I $(M)/include_e
+ipfw-cflags += -I $(M)/include
+# XXX eventually ../dummynet/include will go away
+ipfw-cflags += -I $(M)/../dummynet/include
+ipfw-cflags += -include $(M)/../glue.h # headers
+ipfw-cflags += -include $(M)/missing.h # headers
+
+$(warning "---- Building dummynet kernel module for Version $(VER)")
+
+# We have three sections for OpenWrt, Linux 2.4 and Linux 2.6
+
+ifeq ($(VER),openwrt)
+  #--- The Makefile section for openwrt ---
+  # We do not include a dependency on include_e as it is called
+  # by Makefile.openwrt in Build/Prepare
+  M=.
+  obj-y := $(IPFW_SRCS:%.c=%.o)
+  O_TARGET := $(obj-m)
+
+  # xcflags-y is a temporary variable where we store build options
+  xcflags-y += -O1 -DLINUX_24
+  xcflags-y += -g
+
+  EXTRA_CFLAGS := $(xcflags-y) $(ipfw-cflags)
+
+  # we should not export anything
+  #export-objs := ipfw2_mod.o
+-include $(TOPDIR)/Rules.make
+
+else   # !openwrt, below we do linux builds for 2.4 and 2.6
+
+  # KERNELPATH is where the kernel headers reside. On PlanetLab
+  # it is set already by the build system.
+  # We can override it from the command line, or let the system guess.
+
+ifneq ($(shell echo $(VER)|grep '2.4'),)
+  # Makefile section for the linux 2.4 version
+  # tested on linux-2.4.35.4, does not work with 2.4.37
+  #
+  # guess the kernel path -- or is it under /lib/modules ?
+  KERNELPATH ?= /usr/src/`uname -r`
+
+  # We need to figure out the gcc include directory, if not
+  # set by the user through MYGCC_INCLUDE
+  # Find compiler version (3rd field in last line returned by gcc -v)
+  # e.g.       gcc version 4.3.2 (Debian 4.3.2-1.1)
+  MYGCC_VER ?= $(shell $(CC) -v 2>&1 |tail -n 1 | cut -d " " -f 3)
+  # We don't know the exact directory under /usr/lib/gcc so we guess
+  MYGCC_INCLUDE ?= $(shell echo /usr/lib/gcc/*/$(MYGCC_VER) | cut -d " " -f 1)/include
+  $(warning "---- gcc includes guessed to $(MYGCC_INCLUDE)")
+
+  # additional warning
+  WARN += -Wall -Wundef
+  WARN += -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing
+  WARN += -fno-common -Werror-implicit-function-declaration
+  # WARN += -O2  -fno-stack-protector -m32 -msoft-float -mregparm=3
+  # -mregparm=3 gives a printk error
+  WARN += -m32 -msoft-float # -mregparm=3
+  #WARN += -freg-struct-return -mpreferred-stack-boundary=2
+  WARN += -Wno-sign-compare
+  WARN += -Wdeclaration-after-statement
+  ifneq ($(MYGCC_VER),3.4.6)
+        WARN += -Wno-pointer-sign
+  endif
+
+  ccflags-y += -O1 -DLINUX_24
+  CFLAGS = -DMODULE -D__KERNEL__ -nostdinc \
+       -isystem ${KERNELPATH}/include -isystem $(MYGCC_INCLUDE) \
+       ${ccflags-y}
+  # The Main target
+all: mod24
+
+else # !2.4 --
+
+  # This is the Makefile section for Linux 2.6.x including planetlab
+
+ifeq ($(IPFW_PLANETLAB),1)
+  $(warning "---- Building for PlanetLab")
+  ipfw-cflags += -DIPFW_PLANETLAB        # PlanetLab compilation
+endif
+  # if not set, use the version from the installed system
+  KERNELPATH ?= /lib/modules/`uname -r`/build
+  # Otherwise, if you have kernel sources, try something like this:
+  #KERNELPATH = /usr/src/linux-2.6.22
+  $(warning "---- Building Version 2.6 $(VER) in $(KERNELPATH)")
+  WARN := -O1 -Wall -Werror -DDEBUG_SPINLOCK -DDEBUG_MUTEXES
+  # The main target
+
+  # Required by kernel <= 2.6.22, ccflags-y is used on newer version
+  LINUX_VERSION_CODE := $(shell grep LINUX_VERSION_CODE $(KERNELPATH)/include/linux/version.h|cut -d " " -f3)
+  ifeq ($(shell if [ -z $(LINUX_VERSION_CODE) ] ; then echo "true"; fi),true)
+    $(warning "---- Perhaps you miss a (cd $(KERNELPATH); make oldconfig; make prepare; make scripts)");
+  endif
+  ifeq ($(shell if [ $(LINUX_VERSION_CODE) -le 132630 ] ; then echo "true"; fi),true)
+    EXTRA_CFLAGS += $(ccflags-y)
+  endif
+
+all: include_e
+       $(MAKE) -C $(KERNELPATH) V=1 M=`pwd` modules
+endif # !2.4
+
+#-- back to the common section of code for Linux 2.4 and 2.6
+
+# the list of objects used to build the module
+ipfw_mod-y = $(IPFW_SRCS:%.c=%.o)
+
+# additional $(CC) flags
+ccflags-y += $(WARN)
+ccflags-y += $(ipfw-cflags)
+# if we really want debug symbols...
+ccflags-y += -g
+
+mod24: include_e $(obj-m)
+
+$(obj-m): $(ipfw_mod-y)
+       $(LD) $(LDFLAGS) -m elf_i386 -r -o $@ $^
+
+# M is the current directory, used in recursive builds
+# so we allow it to be overridden
+M ?= $(shell pwd)
+endif # !openwrt
+
+#--- various common targets
+clean:
+       -rm -f *.o *.ko Module.symvers *.mod.c
+       -rm -rf include_e
+
+distclean: clean
+       -rm -f .*cmd modules.order opt_*
+       -rm -rf .tmp_versions include_e
+       -rm -rf .*.o.d
+
+# support to create empty dirs and files in include_e/
+# EDIRS is the list of directories, EFILES is the list of files.
+
+EDIRS= altq arpa machine net netinet netinet6 sys
+
+EFILES += opt_inet6.h opt_ipfw.h opt_ipsec.h opt_mpath.h
+EFILES += opt_mbuf_stress_test.h opt_param.h
+
+EFILES += altq/if_altq.h
+EFILES += arpa/inet.h
+EFILES += machine/in_cksum.h
+EFILES += net/ethernet.h net/netisr.h net/pf_mtag.h
+EFILES += net/bpf.h net/if_types.h
+EFILES += net/vnet.h
+
+EFILES += netinet/ether.h netinet/icmp6.h netinet/if_ether.h
+EFILES += netinet/in.h netinet/in_pcb.h netinet/in_var.h
+EFILES += netinet/in_systm.h
+EFILES += netinet/ip_carp.h netinet/ip_var.h netinet/pim.h
+EFILES += netinet/sctp.h netinet/tcp_timer.h netinet/tcpip.h
+EFILES += netinet/udp_var.h
+
+EFILES += netinet6/ip6_var.h
+
+EFILES += sys/_lock.h sys/_rwlock.h sys/_mutex.h sys/jail.h
+EFILES += sys/condvar.h sys/eventhandler.h sys/domain.h
+EFILES += sys/limits.h sys/lock.h sys/mutex.h sys/priv.h
+EFILES += sys/proc.h sys/rwlock.h sys/socket.h sys/socketvar.h
+EFILES += sys/sysctl.h sys/time.h sys/ucred.h
+
+include_e:
+       echo "running in $M"
+       -@rm -rf $(M)/include_e opt_*
+       -@mkdir -p $(M)/include_e
+       -@(cd $(M)/include_e; mkdir -p $(EDIRS); touch $(EFILES) )
+
+
+#--- some other targets for testing purposes
+test_radix: test_radix.o radix.o
+test_lookup: ip_fw_lookup.o
+test_radix test_lookup: CFLAGS=-Wall -Werror -O1
diff --git a/dummynet2/bsd_compat.c b/dummynet2/bsd_compat.c
new file mode 100644 (file)
index 0000000..70268bb
--- /dev/null
@@ -0,0 +1,370 @@
+/*
+ * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id: bsd_compat.c 4665 2010-01-04 12:35:39Z luigi $
+ *
+ * kernel variables and functions that are not available in linux.
+ */
+
+#include <sys/cdefs.h>
+#include <asm/div64.h> /* do_div on 2.4 */
+#include <linux/random.h>      /* get_random_bytes on 2.4 */
+
+/*
+ * gettimeofday would be in sys/time.h but it is not
+ * visible if _KERNEL is defined
+ */
+int gettimeofday(struct timeval *, struct timezone *);
+
+int ticks;             /* kernel ticks counter */
+int hz = 1000;         /* default clock time */
+long tick = 1000;      /* XXX is this 100000/hz ? */
+int bootverbose = 0;
+time_t time_uptime = 0;
+struct timeval boottime;
+
+int     ip_defttl;
+int fw_one_pass = 1;
+u_long  in_ifaddrhmask;                         /* mask for hash table */
+struct  in_ifaddrhashhead *in_ifaddrhashtbl;    /* inet addr hash table  */
+
+u_int rt_numfibs = RT_NUMFIBS;
+
+/*
+ * pfil hook support.
+ * We make pfil_head_get return a non-null pointer, which is then ignored
+ * in our 'add-hook' routines.
+ */
+struct pfil_head;
+typedef int (pfil_hook_t)
+       (void *, struct mbuf **, struct ifnet *, int, struct inpcb *);
+
+struct pfil_head *
+pfil_head_get(int proto, u_long flags)
+{
+       static int dummy;
+       return (struct pfil_head *)&dummy;
+}
+int
+pfil_add_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h)
+{
+       return 0;
+}
+
+int
+pfil_remove_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h)
+{
+       return 0;
+}
+
+/* define empty body for kernel function */
+int
+priv_check(struct thread *td, int priv)
+{
+       return 0;
+}
+
+int
+securelevel_ge(struct ucred *cr, int level)
+{
+       return 0;
+}
+
+int
+sysctl_handle_int(SYSCTL_HANDLER_ARGS)
+{
+       return 0;
+}
+
+int
+sysctl_handle_long(SYSCTL_HANDLER_ARGS)
+{
+       return 0;
+}
+
+void
+ether_demux(struct ifnet *ifp, struct mbuf *m)
+{
+       return;
+}
+
+int
+ether_output_frame(struct ifnet *ifp, struct mbuf *m)
+{
+       return 0;
+}
+
+void
+in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum)
+{
+       return;
+}
+
+void
+icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu)
+{
+       return;
+}
+
+u_short
+in_cksum_skip(struct mbuf *m, int len, int skip)
+{
+       return 0;
+}
+
+u_short
+in_cksum_hdr(struct ip *ip)
+{
+       return 0;
+}
+
+/*
+ * we don't really reassemble, just return whatever we had.
+ */
+struct mbuf *
+ip_reass(struct mbuf *clone)
+{
+       return clone;
+}
+#ifdef INP_LOCK_ASSERT
+#undef INP_LOCK_ASSERT
+#define INP_LOCK_ASSERT(a)
+#endif
+
+/* credentials check */
+#include <netinet/ip_fw.h>
+int
+cred_check(void *_insn,  int proto, struct ifnet *oif,
+    struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
+    u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp,
+    struct sk_buff *skb)
+{
+       int match = 0;
+       ipfw_insn_u32 *insn = (ipfw_insn_u32 *)_insn;
+
+       if (*ugid_lookupp == 0) {        /* actively lookup and copy in cache */
+               /* returns null if any element of the chain up to file is null.
+                * if sk != NULL then we also have a reference
+                */
+               *ugid_lookupp = linux_lookup(proto,
+                       src_ip.s_addr, htons(src_port),
+                       dst_ip.s_addr, htons(dst_port),
+                       skb, oif ? 1 : 0, u);
+       }
+       if (*ugid_lookupp < 0)
+               return 0;
+
+       if (insn->o.opcode == O_UID)
+               match = (u->uid == (uid_t)insn->d[0]);
+       else if (insn->o.opcode == O_JAIL)
+               match = (u->xid == (uid_t)insn->d[0]);
+       else if (insn->o.opcode == O_GID)
+               match = (u->gid == (uid_t)insn->d[0]);
+       return match;
+}
+
+int
+jailed(struct ucred *cred)
+{
+       return 0;
+}
+
+/*
+ * Return 1 if an internet address is for a ``local'' host
+ * (one to which we have a connection).  If subnetsarelocal
+ * is true, this includes other subnets of the local net.
+ * Otherwise, it includes only the directly-connected (sub)nets.
+ */
+int
+in_localaddr(struct in_addr in)
+{
+       return 1;
+}
+
+int
+sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
+{
+       size_t valsize = sopt->sopt_valsize;
+
+       if (len < valsize)
+               sopt->sopt_valsize = valsize = len;
+       bcopy(buf, sopt->sopt_val, valsize);
+       return 0;
+}
+
+/*
+ * copy data from userland to kernel
+ */
+int
+sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
+{
+       size_t valsize = sopt->sopt_valsize;
+
+       if (valsize < minlen)
+               return EINVAL;
+       if (valsize > len)
+               sopt->sopt_valsize = valsize = len;
+       bcopy(sopt->sopt_val, buf, valsize);
+       return 0;
+}
+
+void
+getmicrouptime(struct timeval *tv)
+{
+#ifdef _WIN32
+#else
+       do_gettimeofday(tv);
+#endif
+}
+
+
+#include <arpa/inet.h>
+
+char *
+inet_ntoa_r(struct in_addr ina, char *buf)
+{
+#ifdef _WIN32
+#else
+       unsigned char *ucp = (unsigned char *)&ina;
+
+       sprintf(buf, "%d.%d.%d.%d",
+       ucp[0] & 0xff,
+       ucp[1] & 0xff,
+       ucp[2] & 0xff,
+       ucp[3] & 0xff);
+#endif
+       return buf;
+}
+
+char *
+inet_ntoa(struct in_addr ina)
+{
+       static char buf[16];
+       return inet_ntoa_r(ina, buf);
+}
+
+int
+random(void)
+{
+#ifdef _WIN32
+       return 0x123456;
+#else
+       int r;
+       get_random_bytes(&r, sizeof(r));
+       return r & 0x7fffffff; 
+#endif
+}
+
+
+/*
+ * do_div really does a u64 / u32 bit division.
+ * we save the sign and convert to uint before calling.
+ * We are safe just because we always call it with small operands.
+ */
+int64_t
+div64(int64_t a, int64_t b)
+{
+#ifdef _WIN32
+        int a1 = a, b1 = b;
+       return a1/b1;
+#else
+       uint64_t ua, ub;
+       int sign = ((a>0)?1:-1) * ((b>0)?1:-1);
+
+       ua = ((a>0)?a:-a);
+       ub = ((b>0)?b:-b);
+        do_div(ua, ub);
+       return sign*ua;
+#endif
+}
+
+/*
+ * compact version of fnmatch.
+ */
+int
+fnmatch(const char *pattern, const char *string, int flags)
+{
+       char s;
+
+       if (!string || !pattern)
+               return 1;       /* no match */
+       while ( (s = *string++) ) {
+               char p = *pattern++;
+               if (p == '\0')          /* pattern is over, no match */
+                       return 1;
+               if (p == '*')           /* wildcard, match */
+                       return 0;
+               if (p == '.' || p == s) /* char match, continue */
+                       continue;
+               return 1;               /* no match */
+       }
+       /* end of string, make sure the pattern is over too */
+       if (*pattern == '\0' || *pattern == '*')
+               return 0;
+       return 1;       /* no match */
+}
+
+#ifdef _WIN32
+/*
+ * as good as anywhere, place here the missing calls
+ */
+
+void *
+my_alloc(int size)
+{
+       void *_ret = ExAllocatePoolWithTag(0, size, 'wfpi');
+       if (_ret)
+               memset(_ret, 0, size);
+       return _ret;
+}
+
+void
+panic(const char *fmt, ...)
+{
+       printf("%s", fmt);
+       for (;;);
+}
+
+#include <stdarg.h>
+
+extern int _vsnprintf(char *buf, int buf_size, char * fmt, va_list ap);
+
+/*
+ * Windows' _snprintf doesn't terminate buffer with zero if size > buf_size
+ */
+int
+snprintf(char *buf, int buf_size, char *fmt, ...)
+{
+    va_list ap;
+    va_start(ap, fmt);
+    if (_vsnprintf(buf, buf_size, fmt, ap) < 0)
+        buf[buf_size - 1] = '\0';
+    va_end(ap);
+
+    return 0;
+}
+#endif
diff --git a/dummynet2/in_cksum.c b/dummynet2/in_cksum.c
new file mode 100644 (file)
index 0000000..8972cef
--- /dev/null
@@ -0,0 +1,150 @@
+/*-
+ * Copyright (c) 1988, 1992, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)in_cksum.c  8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/netinet/in_cksum.c,v 1.10 2007/10/07 20:44:22 silby Exp $");
+
+#include <sys/param.h>
+#include <sys/mbuf.h>
+
+/*
+ * Checksum routine for Internet Protocol family headers (Portable Version).
+ *
+ * This routine is very heavily used in the network
+ * code and should be modified for each CPU to be as fast as possible.
+ */
+
+#define ADDCARRY(x)  (x > 65535 ? x -= 65535 : x)
+#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);}
+
+int
+in_cksum(struct mbuf *m, int len)
+{
+       register u_short *w;
+       register int sum = 0;
+       register int mlen = 0;
+       int byte_swapped = 0;
+
+       union {
+               char    c[2];
+               u_short s;
+       } s_util;
+       union {
+               u_short s[2];
+               long    l;
+       } l_util;
+
+       for (;m && len; m = m->m_next) {
+               if (m->m_len == 0)
+                       continue;
+               w = mtod(m, u_short *);
+               if (mlen == -1) {
+                       /*
+                        * The first byte of this mbuf is the continuation
+                        * of a word spanning between this mbuf and the
+                        * last mbuf.
+                        *
+                        * s_util.c[0] is already saved when scanning previous
+                        * mbuf.
+                        */
+                       s_util.c[1] = *(char *)w;
+                       sum += s_util.s;
+                       w = (u_short *)((char *)w + 1);
+                       mlen = m->m_len - 1;
+                       len--;
+               } else
+                       mlen = m->m_len;
+               if (len < mlen)
+                       mlen = len;
+               len -= mlen;
+               /*
+                * Force to even boundary.
+                */
+#if defined(CONFIG_X86_64)
+               if ((1 & (long) w) && (mlen > 0)) {
+#else
+               if ((1 & (int) w) && (mlen > 0)) {
+#endif
+                       REDUCE;
+                       sum <<= 8;
+                       s_util.c[0] = *(u_char *)w;
+                       w = (u_short *)((char *)w + 1);
+                       mlen--;
+                       byte_swapped = 1;
+               }
+               /*
+                * Unroll the loop to make overhead from
+                * branches &c small.
+                */
+               while ((mlen -= 32) >= 0) {
+                       sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
+                       sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
+                       sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
+                       sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
+                       w += 16;
+               }
+               mlen += 32;
+               while ((mlen -= 8) >= 0) {
+                       sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
+                       w += 4;
+               }
+               mlen += 8;
+               if (mlen == 0 && byte_swapped == 0)
+                       continue;
+               REDUCE;
+               while ((mlen -= 2) >= 0) {
+                       sum += *w++;
+               }
+               if (byte_swapped) {
+                       REDUCE;
+                       sum <<= 8;
+                       byte_swapped = 0;
+                       if (mlen == -1) {
+                               s_util.c[1] = *(char *)w;
+                               sum += s_util.s;
+                               mlen = 0;
+                       } else
+                               mlen = -1;
+               } else if (mlen == -1)
+                       s_util.c[0] = *(char *)w;
+       }
+       if (len)
+               printf("cksum: out of data\n");
+       if (mlen == -1) {
+               /* The last mbuf has odd # of bytes. Follow the
+                  standard (the odd byte may be shifted left by 8 bits
+                  or not as determined by endian-ness of the machine) */
+               s_util.c[1] = 0;
+               sum += s_util.s;
+       }
+       REDUCE;
+       return (~sum & 0xffff);
+}
diff --git a/dummynet2/include/netgraph/ng_ipfw.h b/dummynet2/include/netgraph/ng_ipfw.h
new file mode 100644 (file)
index 0000000..55fd890
--- /dev/null
@@ -0,0 +1,33 @@
+/*-
+ * Copyright 2005, Gleb Smirnoff <glebius@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/netgraph/ng_ipfw.h,v 1.2 2006/02/17 09:42:49 glebius Exp $
+ */
+#ifndef __NG_IPFW_H
+#define __NG_IPFW_H
+
+#define NG_IPFW_NODE_TYPE    "ipfw"
+#define NGM_IPFW_COOKIE      1105988990
+#endif /* __NG_IPFW_H */
diff --git a/dummynet2/include/netinet/ip_dummynet.h b/dummynet2/include/netinet/ip_dummynet.h
new file mode 100644 (file)
index 0000000..f01bfe2
--- /dev/null
@@ -0,0 +1,374 @@
+/*-
+ * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
+ * Portions Copyright (c) 2000 Akamba Corp.
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/netinet/ip_dummynet.h,v 1.40.2.1 2008/04/25 10:26:30 oleg Exp $
+ */
+
+#ifndef _IP_DUMMYNET_H
+#define _IP_DUMMYNET_H
+
+/*
+ * Definition of dummynet data structures. In the structures, I decided
+ * not to use the macros in <sys/queue.h> in the hope of making the code
+ * easier to port to other architectures. The type of lists and queue we
+ * use here is pretty simple anyways.
+ */
+
+/*
+ * We start with a heap, which is used in the scheduler to decide when
+ * to transmit packets etc.
+ *
+ * The key for the heap is used for two different values:
+ *
+ * 1. timer ticks- max 10K/second, so 32 bits are enough;
+ *
+ * 2. virtual times. These increase in steps of len/x, where len is the
+ *    packet length, and x is either the weight of the flow, or the
+ *    sum of all weights.
+ *    If we limit to max 1000 flows and a max weight of 100, then
+ *    x needs 17 bits. The packet size is 16 bits, so we can easily
+ *    overflow if we do not allow errors.
+ * So we use a key "dn_key" which is 64 bits. Some macros are used to
+ * compare key values and handle wraparounds.
+ * MAX64 returns the largest of two key values.
+ * MY_M is used as a shift count when doing fixed point arithmetic
+ * (a better name would be useful...).
+ */
+typedef u_int64_t dn_key ;      /* sorting key */
+#define DN_KEY_LT(a,b)     ((int64_t)((a)-(b)) < 0)
+#define DN_KEY_LEQ(a,b)    ((int64_t)((a)-(b)) <= 0)
+#define DN_KEY_GT(a,b)     ((int64_t)((a)-(b)) > 0)
+#define DN_KEY_GEQ(a,b)    ((int64_t)((a)-(b)) >= 0)
+#define MAX64(x,y)  (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x)
+#define MY_M   16 /* number of left shift to obtain a larger precision */
+
+/*
+ * XXX With this scaling, max 1000 flows, max weight 100, 1Gbit/s, the
+ * virtual time wraps every 15 days.
+ */
+
+
+/*
+ * The maximum hash table size for queues.  This value must be a power
+ * of 2.
+ */
+#define DN_MAX_HASH_SIZE 65536
+
+/*
+ * A heap entry is made of a key and a pointer to the actual
+ * object stored in the heap.
+ * The heap is an array of dn_heap_entry entries, dynamically allocated.
+ * Current size is "size", with "elements" actually in use.
+ * The heap normally supports only ordered insert and extract from the top.
+ * If we want to extract an object from the middle of the heap, we
+ * have to know where the object itself is located in the heap (or we
+ * need to scan the whole array). To this purpose, an object has a
+ * field (int) which contains the index of the object itself into the
+ * heap. When the object is moved, the field must also be updated.
+ * The offset of the index in the object is stored in the 'offset'
+ * field in the heap descriptor. The assumption is that this offset
+ * is non-zero if we want to support extract from the middle.
+ */
+struct dn_heap_entry {
+    dn_key key ;       /* sorting key. Topmost element is smallest one */
+    void *object ;     /* object pointer */
+} ;
+
+struct dn_heap {
+    int size ;
+    int elements ;
+    int offset ; /* XXX if > 0 this is the offset of direct ptr to obj */
+    struct dn_heap_entry *p ;  /* really an array of "size" entries */
+} ;
+
+#ifdef _KERNEL
+/*
+ * Packets processed by dummynet have an mbuf tag associated with
+ * them that carries their dummynet state.  This is used within
+ * the dummynet code as well as outside when checking for special
+ * processing requirements.
+ * Note that the first part is the reinject info and is common to
+ * other forms of packet reinjection.
+ */
+struct dn_pkt_tag {
+       struct ipfw_rule_ref rule;      /* matching rule */
+
+    /* second part, dummynet specific */
+    int dn_dir;                        /* action when packet comes out. */
+                               /* see ip_fw_private.h */
+
+    dn_key output_time;                /* when the pkt is due for delivery     */
+    struct ifnet *ifp;         /* interface, for ip_output             */
+    struct _ip6dn_args ip6opt; /* XXX ipv6 options                     */
+};
+#endif /* _KERNEL */
+
+/*
+ * Overall structure of dummynet (with WF2Q+):
+
+In dummynet, packets are selected with the firewall rules, and passed
+to two different objects: PIPE or QUEUE.
+
+A QUEUE is just a queue with configurable size and queue management
+policy. It is also associated with a mask (to discriminate among
+different flows), a weight (used to give different shares of the
+bandwidth to different flows) and a "pipe", which essentially
+supplies the transmit clock for all queues associated with that
+pipe.
+
+A PIPE emulates a fixed-bandwidth link, whose bandwidth is
+configurable.  The "clock" for a pipe can come from either an
+internal timer, or from the transmit interrupt of an interface.
+A pipe is also associated with one (or more, if masks are used)
+queue, where all packets for that pipe are stored.
+
+The bandwidth available on the pipe is shared by the queues
+associated with that pipe (only one in case the packet is sent
+to a PIPE) according to the WF2Q+ scheduling algorithm and the
+configured weights.
+
+In general, incoming packets are stored in the appropriate queue,
+which is then placed into one of a few heaps managed by a scheduler
+to decide when the packet should be extracted.
+The scheduler (a function called dummynet()) is run at every timer
+tick, and grabs queues from the head of the heaps when they are
+ready for processing.
+
+There are three data structures defining a pipe and associated queues:
+
+ + dn_pipe, which contains the main configuration parameters related
+   to delay and bandwidth;
+ + dn_flow_set, which contains WF2Q+ configuration, flow
+   masks, plr and RED configuration;
+ + dn_flow_queue, which is the per-flow queue (containing the packets)
+
+Multiple dn_flow_set can be linked to the same pipe, and multiple
+dn_flow_queue can be linked to the same dn_flow_set.
+All data structures are linked in a linear list which is used for
+housekeeping purposes.
+
+During configuration, we create and initialize the dn_flow_set
+and dn_pipe structures (a dn_pipe also contains a dn_flow_set).
+
+At runtime: packets are sent to the appropriate dn_flow_set (either
+WFQ ones, or the one embedded in the dn_pipe for fixed-rate flows),
+which in turn dispatches them to the appropriate dn_flow_queue
+(created dynamically according to the masks).
+
+The transmit clock for fixed rate flows (ready_event()) selects the
+dn_flow_queue to be used to transmit the next packet. For WF2Q,
+wfq_ready_event() extracts a pipe which in turn selects the right
+flow using a number of heaps defined into the pipe itself.
+
+ *
+ */
+
+/*
+ * per flow queue. This contains the flow identifier, the queue
+ * of packets, counters, and parameters used to support both RED and
+ * WF2Q+.
+ *
+ * A dn_flow_queue is created and initialized whenever a packet for
+ * a new flow arrives.
+ */
+struct dn_flow_queue {
+    struct dn_flow_queue *next ;
+    struct ipfw_flow_id id ;
+
+    struct mbuf *head, *tail ; /* queue of packets */
+    u_int len ;
+    u_int len_bytes ;
+
+    /*
+     * When we emulate MAC overheads, or channel unavailability due
+     * to other traffic on a shared medium, we augment the packet at
+     * the head of the queue with an 'extra_bits' field representing
+     * the additional delay the packet will be subject to:
+     *         extra_bits = bw*unavailable_time.
+     * With large bandwidth and large delays, extra_bits (and also numbytes)
+     * can become very large, so better play safe and use 64 bit
+     */
+    uint64_t numbytes ;                /* credit for transmission (dynamic queues) */
+    int64_t extra_bits;                /* extra bits simulating unavailable channel */
+
+    u_int64_t tot_pkts ;       /* statistics counters  */
+    u_int64_t tot_bytes ;
+    u_int32_t drops ;
+
+    int hash_slot ;            /* debugging/diagnostic */
+
+    /* RED parameters */
+    int avg ;                   /* average queue length est. (scaled) */
+    int count ;                 /* arrivals since last RED drop */
+    int random ;                /* random value (scaled) */
+    dn_key idle_time;          /* start of queue idle time */
+
+    /* WF2Q+ support */
+    struct dn_flow_set *fs ;   /* parent flow set */
+    int heap_pos ;             /* position (index) of struct in heap */
+    dn_key sched_time ;                /* current time when queue enters ready_heap */
+
+    dn_key S,F ;               /* start time, finish time */
+    /*
+     * Setting F < S means the timestamp is invalid. We only need
+     * to test this when the queue is empty.
+     */
+} ;
+
+/*
+ * flow_set descriptor. Contains the "template" parameters for the
+ * queue configuration, and pointers to the hash table of dn_flow_queue's.
+ *
+ * The hash table is an array of lists -- we identify the slot by
+ * hashing the flow-id, then scan the list looking for a match.
+ * The size of the hash table (buckets) is configurable on a per-queue
+ * basis.
+ *
+ * A dn_flow_set is created whenever a new queue or pipe is created (in the
+ * latter case, the structure is located inside the struct dn_pipe).
+ */
+struct dn_flow_set {
+    SLIST_ENTRY(dn_flow_set)   next;   /* linked list in a hash slot */
+
+    u_short fs_nr ;             /* flow_set number       */
+    u_short flags_fs;
+#define DN_HAVE_FLOW_MASK      0x0001
+#define DN_IS_RED              0x0002
+#define DN_IS_GENTLE_RED       0x0004
+#define DN_QSIZE_IS_BYTES      0x0008  /* queue size is measured in bytes */
+#define DN_NOERROR             0x0010  /* do not report ENOBUFS on drops  */
+#define        DN_HAS_PROFILE          0x0020  /* the pipe has a delay profile. */
+#define DN_IS_PIPE             0x4000
+#define DN_IS_QUEUE            0x8000
+
+    struct dn_pipe *pipe ;     /* pointer to parent pipe */
+    u_short parent_nr ;                /* parent pipe#, 0 if local to a pipe */
+
+    int weight ;               /* WFQ queue weight */
+    int qsize ;                        /* queue size in slots or bytes */
+    int plr ;                  /* pkt loss rate (2^31-1 means 100%) */
+
+    struct ipfw_flow_id flow_mask ;
+
+    /* hash table of queues onto this flow_set */
+    int rq_size ;              /* number of slots */
+    int rq_elements ;          /* active elements */
+    struct dn_flow_queue **rq; /* array of rq_size entries */
+
+    u_int32_t last_expired ;   /* do not expire too frequently */
+    int backlogged ;           /* #active queues for this flowset */
+
+        /* RED parameters */
+#define SCALE_RED               16
+#define SCALE(x)                ( (x) << SCALE_RED )
+#define SCALE_VAL(x)            ( (x) >> SCALE_RED )
+#define SCALE_MUL(x,y)          ( ( (x) * (y) ) >> SCALE_RED )
+    int w_q ;                  /* queue weight (scaled) */
+    int max_th ;               /* maximum threshold for queue (scaled) */
+    int min_th ;               /* minimum threshold for queue (scaled) */
+    int max_p ;                        /* maximum value for p_b (scaled) */
+    u_int c_1 ;                        /* max_p/(max_th-min_th) (scaled) */
+    u_int c_2 ;                        /* max_p*min_th/(max_th-min_th) (scaled) */
+    u_int c_3 ;                        /* for GRED, (1-max_p)/max_th (scaled) */
+    u_int c_4 ;                        /* for GRED, 1 - 2*max_p (scaled) */
+    u_int * w_q_lookup ;       /* lookup table for computing (1-w_q)^t */
+    u_int lookup_depth ;       /* depth of lookup table */
+    int lookup_step ;          /* granularity inside the lookup table */
+    int lookup_weight ;                /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
+    int avg_pkt_size ;         /* medium packet size */
+    int max_pkt_size ;         /* max packet size */
+};
+SLIST_HEAD(dn_flow_set_head, dn_flow_set);
+
+/*
+ * Pipe descriptor. Contains global parameters, delay-line queue,
+ * and the flow_set used for fixed-rate queues.
+ *
+ * For WF2Q+ support it also has 3 heaps holding dn_flow_queue:
+ *   not_eligible_heap, for queues whose start time is higher
+ *     than the virtual time. Sorted by start time.
+ *   scheduler_heap, for queues eligible for scheduling. Sorted by
+ *     finish time.
+ *   idle_heap, all flows that are idle and can be removed. We
+ *     do that on each tick so we do not slow down too much
+ *     operations during forwarding.
+ *
+ */
+struct dn_pipe {               /* a pipe */
+    SLIST_ENTRY(dn_pipe)       next;   /* linked list in a hash slot */
+
+    int        pipe_nr ;               /* number       */
+    int bandwidth;             /* really, bytes/tick.  */
+    int        delay ;                 /* really, ticks        */
+
+    struct     mbuf *head, *tail ;     /* packets in delay line */
+
+    /* WF2Q+ */
+    struct dn_heap scheduler_heap ; /* top extract - key Finish time*/
+    struct dn_heap not_eligible_heap; /* top extract- key Start time */
+    struct dn_heap idle_heap ; /* random extract - key Start=Finish time */
+
+    dn_key V ;                 /* virtual time */
+    int sum;                   /* sum of weights of all active sessions */
+
+    /* Same as in dn_flow_queue, numbytes can become large */
+    int64_t numbytes;          /* bits I can transmit (more or less). */
+    uint64_t burst;            /* burst size, scaled: bits * hz */
+
+    dn_key sched_time ;                /* time pipe was scheduled in ready_heap */
+    dn_key idle_time;          /* start of pipe idle time */
+
+    /*
+     * When the tx clock comes from an interface (if_name[0] != '\0'), its name
+     * is stored below, whereas the ifp is filled when the rule is configured.
+     */
+    char if_name[IFNAMSIZ];
+    struct ifnet *ifp ;
+    int ready ; /* set if ifp != NULL and we got a signal from it */
+
+    struct dn_flow_set fs ; /* used with fixed-rate flows */
+
+    /* fields to simulate a delay profile */
+
+#define ED_MAX_NAME_LEN                32
+    char name[ED_MAX_NAME_LEN];
+    int loss_level;
+    int samples_no;
+    int *samples;
+};
+
+/* dn_pipe_max is used to pass pipe configuration from userland onto
+ * kernel space and back
+ */
+#define ED_MAX_SAMPLES_NO      1024
+struct dn_pipe_max {
+       struct dn_pipe pipe;
+       int samples[ED_MAX_SAMPLES_NO];
+};
+
+SLIST_HEAD(dn_pipe_head, dn_pipe);
+
+#endif /* _IP_DUMMYNET_H */
diff --git a/dummynet2/include/netinet/ip_fw.h b/dummynet2/include/netinet/ip_fw.h
new file mode 100644 (file)
index 0000000..238601f
--- /dev/null
@@ -0,0 +1,574 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/netinet/ip_fw.h 200580 2009-12-15 16:15:14Z luigi $
+ */
+
+#ifndef _IPFW2_H
+#define _IPFW2_H
+
+/*
+ * The default rule number.  By the design of ip_fw, the default rule
+ * is the last one, so its number can also serve as the highest number
+ * allowed for a rule.  The ip_fw code relies on both meanings of this
+ * constant. 
+ */
+#define        IPFW_DEFAULT_RULE       65535
+
+/*
+ * The number of ipfw tables.  The maximum allowed table number is the
+ * (IPFW_TABLES_MAX - 1).
+ */
+#define        IPFW_TABLES_MAX         128
+
+/*
+ * Most commands (queue, pipe, tag, untag, limit...) can have a 16-bit
+ * argument between 1 and 65534. The value 0 is unused, the value
+ * 65535 (IP_FW_TABLEARG) is used to represent 'tablearg', i.e. the
+ * result of the most recent table() lookup; so the argument
+ * can be 1..65534, or 65535 to indicate the use of a 'tablearg'.
+ * Note that 16bit is only a historical limit, resulting from
+ * the use of a 16-bit fields for that value. In reality, we can have
+ * 2^32 pipes, queues, tag values and so on, and use 0 as a tablearg.
+ */
+#define        IPFW_ARG_MIN            1
+#define        IPFW_ARG_MAX            65534
+#define IP_FW_TABLEARG         65535   /* XXX should use 0 */
+
+/*
+ * The kernel representation of ipfw rules is made of a list of
+ * 'instructions' (for all practical purposes equivalent to BPF
+ * instructions), which specify which fields of the packet
+ * (or its metadata) should be analysed.
+ *
+ * Each instruction is stored in a structure which begins with
+ * "ipfw_insn", and can contain extra fields depending on the
+ * instruction type (listed below).
+ * Note that the code is written so that individual instructions
+ * have a size which is a multiple of 32 bits. This means that, if
+ * such structures contain pointers or other 64-bit entities,
+ * (there is just one instance now) they may end up unaligned on
+ * 64-bit architectures, so they must be handled with care.
+ *
+ * "enum ipfw_opcodes" are the opcodes supported. We can have up
+ * to 256 different opcodes. When adding new opcodes, they should
+ * be appended to the end of the opcode list before O_LAST_OPCODE,
+ * this will prevent the ABI from being broken, otherwise users
+ * will have to recompile ipfw(8) when they update the kernel.
+ */
+
+enum ipfw_opcodes {            /* arguments (4 byte each)      */
+       O_NOP,
+
+       O_IP_SRC,               /* u32 = IP                     */
+       O_IP_SRC_MASK,          /* ip = IP/mask                 */
+       O_IP_SRC_ME,            /* none                         */
+       O_IP_SRC_SET,           /* u32=base, arg1=len, bitmap   */
+
+       O_IP_DST,               /* u32 = IP                     */
+       O_IP_DST_MASK,          /* ip = IP/mask                 */
+       O_IP_DST_ME,            /* none                         */
+       O_IP_DST_SET,           /* u32=base, arg1=len, bitmap   */
+
+       O_IP_SRCPORT,           /* (n)port list:mask 4 byte ea  */
+       O_IP_DSTPORT,           /* (n)port list:mask 4 byte ea  */
+       O_PROTO,                /* arg1=protocol                */
+
+       O_MACADDR2,             /* 2 mac addr:mask              */
+       O_MAC_TYPE,             /* same as srcport              */
+
+       O_LAYER2,               /* none                         */
+       O_IN,                   /* none                         */
+       O_FRAG,                 /* none                         */
+
+       O_RECV,                 /* none                         */
+       O_XMIT,                 /* none                         */
+       O_VIA,                  /* none                         */
+
+       O_IPOPT,                /* arg1 = 2*u8 bitmap           */
+       O_IPLEN,                /* arg1 = len                   */
+       O_IPID,                 /* arg1 = id                    */
+
+       O_IPTOS,                /* arg1 = id                    */
+       O_IPPRECEDENCE,         /* arg1 = precedence << 5       */
+       O_IPTTL,                /* arg1 = TTL                   */
+
+       O_IPVER,                /* arg1 = version               */
+       O_UID,                  /* u32 = id                     */
+       O_GID,                  /* u32 = id                     */
+       O_ESTAB,                /* none (tcp established)       */
+       O_TCPFLAGS,             /* arg1 = 2*u8 bitmap           */
+       O_TCPWIN,               /* arg1 = desired win           */
+       O_TCPSEQ,               /* u32 = desired seq.           */
+       O_TCPACK,               /* u32 = desired seq.           */
+       O_ICMPTYPE,             /* u32 = icmp bitmap            */
+       O_TCPOPTS,              /* arg1 = 2*u8 bitmap           */
+
+       O_VERREVPATH,           /* none                         */
+       O_VERSRCREACH,          /* none                         */
+
+       O_PROBE_STATE,          /* none                         */
+       O_KEEP_STATE,           /* none                         */
+       O_LIMIT,                /* ipfw_insn_limit              */
+       O_LIMIT_PARENT,         /* dyn_type, not an opcode.     */
+
+       /*
+        * These are really 'actions'.
+        */
+
+       O_LOG,                  /* ipfw_insn_log                */
+       O_PROB,                 /* u32 = match probability      */
+
+       O_CHECK_STATE,          /* none                         */
+       O_ACCEPT,               /* none                         */
+       O_DENY,                 /* none                         */
+       O_REJECT,               /* arg1=icmp arg (same as deny) */
+       O_COUNT,                /* none                         */
+       O_SKIPTO,               /* arg1=next rule number        */
+       O_PIPE,                 /* arg1=pipe number             */
+       O_QUEUE,                /* arg1=queue number            */
+       O_DIVERT,               /* arg1=port number             */
+       O_TEE,                  /* arg1=port number             */
+       O_FORWARD_IP,           /* fwd sockaddr                 */
+       O_FORWARD_MAC,          /* fwd mac                      */
+       O_NAT,                  /* nope                         */
+       O_REASS,                /* none                         */
+       
+       /*
+        * More opcodes.
+        */
+       O_IPSEC,                /* has ipsec history            */
+       O_IP_SRC_LOOKUP,        /* arg1=table number, u32=value */
+       O_IP_DST_LOOKUP,        /* arg1=table number, u32=value */
+       O_ANTISPOOF,            /* none                         */
+       O_JAIL,                 /* u32 = id                     */
+       O_ALTQ,                 /* u32 = altq classif. qid      */
+       O_DIVERTED,             /* arg1=bitmap (1:loop, 2:out)  */
+       O_TCPDATALEN,           /* arg1 = tcp data len          */
+       O_IP6_SRC,              /* address without mask         */
+       O_IP6_SRC_ME,           /* my addresses                 */
+       O_IP6_SRC_MASK,         /* address with the mask        */
+       O_IP6_DST,
+       O_IP6_DST_ME,
+       O_IP6_DST_MASK,
+       O_FLOW6ID,              /* for flow id tag in the ipv6 pkt */
+       O_ICMP6TYPE,            /* icmp6 packet type filtering  */
+       O_EXT_HDR,              /* filtering for ipv6 extension header */
+       O_IP6,
+
+       /*
+        * actions for ng_ipfw
+        */
+       O_NETGRAPH,             /* send to ng_ipfw              */
+       O_NGTEE,                /* copy to ng_ipfw              */
+
+       O_IP4,
+
+       O_UNREACH6,             /* arg1=icmpv6 code arg (deny)  */
+
+       O_TAG,                  /* arg1=tag number */
+       O_TAGGED,               /* arg1=tag number */
+
+       O_SETFIB,               /* arg1=FIB number */
+       O_FIB,                  /* arg1=FIB desired fib number */
+
+       O_LAST_OPCODE           /* not an opcode!               */
+};
+
+/*
+ * The extension header are filtered only for presence using a bit
+ * vector with a flag for each header.
+ */
+#define EXT_FRAGMENT   0x1
+#define EXT_HOPOPTS    0x2
+#define EXT_ROUTING    0x4
+#define EXT_AH         0x8
+#define EXT_ESP                0x10
+#define EXT_DSTOPTS    0x20
+#define EXT_RTHDR0             0x40
+#define EXT_RTHDR2             0x80
+
+/*
+ * Template for instructions.
+ *
+ * ipfw_insn is used for all instructions which require no operands,
+ * a single 16-bit value (arg1), or a couple of 8-bit values.
+ *
+ * For other instructions which require different/larger arguments
+ * we have derived structures, ipfw_insn_*.
+ *
+ * The size of the instruction (in 32-bit words) is in the low
+ * 6 bits of "len". The 2 remaining bits are used to implement
+ * NOT and OR on individual instructions. Given a type, you can
+ * compute the length to be put in "len" using F_INSN_SIZE(t)
+ *
+ * F_NOT       negates the match result of the instruction.
+ *
+ * F_OR                is used to build or blocks. By default, instructions
+ *             are evaluated as part of a logical AND. An "or" block
+ *             { X or Y or Z } contains F_OR set in all but the last
+ *             instruction of the block. A match will cause the code
+ *             to skip past the last instruction of the block.
+ *
+ * NOTA BENE: in a couple of places we assume that
+ *     sizeof(ipfw_insn) == sizeof(u_int32_t)
+ * this needs to be fixed.
+ *
+ */
+typedef struct _ipfw_insn {    /* template for instructions */
+       u_int8_t        opcode;
+       u_int8_t        len;    /* number of 32-bit words */
+#define        F_NOT           0x80
+#define        F_OR            0x40
+#define        F_LEN_MASK      0x3f
+#define        F_LEN(cmd)      ((cmd)->len & F_LEN_MASK)
+
+       u_int16_t       arg1;
+} ipfw_insn;
+
+/*
+ * The F_INSN_SIZE(type) computes the size, in 4-byte words, of
+ * a given type.
+ */
+#define        F_INSN_SIZE(t)  ((sizeof (t))/sizeof(u_int32_t))
+
+/*
+ * This is used to store an array of 16-bit entries (ports etc.)
+ */
+typedef struct _ipfw_insn_u16 {
+       ipfw_insn o;
+       u_int16_t ports[2];     /* there may be more */
+} ipfw_insn_u16;
+
+/*
+ * This is used to store an array of 32-bit entries
+ * (uid, single IPv4 addresses etc.)
+ */
+typedef struct _ipfw_insn_u32 {
+       ipfw_insn o;
+       u_int32_t d[1]; /* one or more */
+} ipfw_insn_u32;
+
+/*
+ * This is used to store IP addr-mask pairs.
+ */
+typedef struct _ipfw_insn_ip {
+       ipfw_insn o;
+       struct in_addr  addr;
+       struct in_addr  mask;
+} ipfw_insn_ip;
+
+/*
+ * This is used to forward to a given address (ip).
+ */
+typedef struct  _ipfw_insn_sa {
+       ipfw_insn o;
+       struct sockaddr_in sa;
+} ipfw_insn_sa;
+
+/*
+ * This is used for MAC addr-mask pairs.
+ */
+typedef struct _ipfw_insn_mac {
+       ipfw_insn o;
+       u_char addr[12];        /* dst[6] + src[6] */
+       u_char mask[12];        /* dst[6] + src[6] */
+} ipfw_insn_mac;
+
+/*
+ * This is used for interface match rules (recv xx, xmit xx).
+ */
+typedef struct _ipfw_insn_if {
+       ipfw_insn o;
+       union {
+               struct in_addr ip;
+               int glob;
+       } p;
+       char name[IFNAMSIZ];
+} ipfw_insn_if;
+
+/*
+ * This is used for storing an altq queue id number.
+ */
+typedef struct _ipfw_insn_altq {
+       ipfw_insn       o;
+       u_int32_t       qid;
+} ipfw_insn_altq;
+
+/*
+ * This is used for limit rules.
+ */
+typedef struct _ipfw_insn_limit {
+       ipfw_insn o;
+       u_int8_t _pad;
+       u_int8_t limit_mask;    /* combination of DYN_* below   */
+#define        DYN_SRC_ADDR    0x1
+#define        DYN_SRC_PORT    0x2
+#define        DYN_DST_ADDR    0x4
+#define        DYN_DST_PORT    0x8
+
+       u_int16_t conn_limit;
+} ipfw_insn_limit;
+
+/*
+ * This is used for log instructions.
+ */
+typedef struct  _ipfw_insn_log {
+        ipfw_insn o;
+       u_int32_t max_log;      /* how many do we log -- 0 = all */
+       u_int32_t log_left;     /* how many left to log         */
+} ipfw_insn_log;
+
+/*
+ * Data structures required by both ipfw(8) and ipfw(4) but not part of the
+ * management API are protected by IPFW_INTERNAL.
+ */
+#ifdef IPFW_INTERNAL
+/* Server pool support (LSNAT). */
+struct cfg_spool {
+       LIST_ENTRY(cfg_spool)   _next;          /* chain of spool instances */
+       struct in_addr          addr;
+       u_short                 port;
+};
+#endif
+
+/* Redirect modes id. */
+#define REDIR_ADDR      0x01
+#define REDIR_PORT      0x02
+#define REDIR_PROTO     0x04
+
+#ifdef IPFW_INTERNAL
+/* Nat redirect configuration. */
+struct cfg_redir {
+       LIST_ENTRY(cfg_redir)   _next;          /* chain of redir instances */
+       u_int16_t               mode;           /* type of redirect mode */
+       struct in_addr          laddr;          /* local ip address */
+       struct in_addr          paddr;          /* public ip address */
+       struct in_addr          raddr;          /* remote ip address */
+       u_short                 lport;          /* local port */
+       u_short                 pport;          /* public port */
+       u_short                 rport;          /* remote port  */
+       u_short                 pport_cnt;      /* number of public ports */
+       u_short                 rport_cnt;      /* number of remote ports */
+       int                     proto;          /* protocol: tcp/udp */
+       struct alias_link       **alink;        
+       /* num of entry in spool chain */
+       u_int16_t               spool_cnt;      
+       /* chain of spool instances */
+       LIST_HEAD(spool_chain, cfg_spool) spool_chain;
+};
+#endif
+
+#define NAT_BUF_LEN     1024
+
+#ifdef IPFW_INTERNAL
+/* Nat configuration data struct. */
+struct cfg_nat {
+       /* chain of nat instances */
+       LIST_ENTRY(cfg_nat)     _next;
+       int                     id;                     /* nat id */
+       struct in_addr          ip;                     /* nat ip address */
+       char                    if_name[IF_NAMESIZE];   /* interface name */
+       int                     mode;                   /* aliasing mode */
+       struct libalias         *lib;                   /* libalias instance */
+       /* number of entry in spool chain */
+       int                     redir_cnt;              
+       /* chain of redir instances */
+       LIST_HEAD(redir_chain, cfg_redir) redir_chain;  
+};
+#endif
+
+#define SOF_NAT         sizeof(struct cfg_nat)
+#define SOF_REDIR       sizeof(struct cfg_redir)
+#define SOF_SPOOL       sizeof(struct cfg_spool)
+
+/* Nat command. */
+typedef struct _ipfw_insn_nat {
+       ipfw_insn       o;
+       struct cfg_nat *nat;    
+} ipfw_insn_nat;
+
+/* Apply ipv6 mask on ipv6 addr */
+#define APPLY_MASK(addr,mask)                          \
+    (addr)->__u6_addr.__u6_addr32[0] &= (mask)->__u6_addr.__u6_addr32[0]; \
+    (addr)->__u6_addr.__u6_addr32[1] &= (mask)->__u6_addr.__u6_addr32[1]; \
+    (addr)->__u6_addr.__u6_addr32[2] &= (mask)->__u6_addr.__u6_addr32[2]; \
+    (addr)->__u6_addr.__u6_addr32[3] &= (mask)->__u6_addr.__u6_addr32[3];
+
+/* Structure for ipv6 */
+typedef struct _ipfw_insn_ip6 {
+       ipfw_insn o;
+       struct in6_addr addr6;
+       struct in6_addr mask6;
+} ipfw_insn_ip6;
+
+/* Used to support icmp6 types */
+typedef struct _ipfw_insn_icmp6 {
+       ipfw_insn o;
+       uint32_t d[7]; /* XXX This number is related to the netinet/icmp6.h
+                       *     define ICMP6_MAXTYPE
+                       *     as follows: n = ICMP6_MAXTYPE/32 + 1
+                        *     Actually is 203 
+                       */
+} ipfw_insn_icmp6;
+
+/*
+ * Here we have the structure representing an ipfw rule.
+ *
+ * It starts with a general area (with link fields and counters)
+ * followed by an array of one or more instructions, which the code
+ * accesses as an array of 32-bit values.
+ *
+ * Given a rule pointer  r:
+ *
+ *  r->cmd             is the start of the first instruction.
+ *  ACTION_PTR(r)      is the start of the first action (things to do
+ *                     once a rule matched).
+ *
+ * When assembling instruction, remember the following:
+ *
+ *  + if a rule has a "keep-state" (or "limit") option, then the
+ *     first instruction (at r->cmd) MUST BE an O_PROBE_STATE
+ *  + if a rule has a "log" option, then the first action
+ *     (at ACTION_PTR(r)) MUST be O_LOG
+ *  + if a rule has an "altq" option, it comes after "log"
+ *  + if a rule has an O_TAG option, it comes after "log" and "altq"
+ *
+ * NOTE: we use a simple linked list of rules because we never need
+ *     to delete a rule without scanning the list. We do not use
+ *     queue(3) macros for portability and readability.
+ */
+
+struct ip_fw {
+       struct ip_fw    *x_next;        /* linked list of rules         */
+       struct ip_fw    *next_rule;     /* ptr to next [skipto] rule    */
+       /* 'next_rule' is used to pass up 'set_disable' status          */
+
+       uint16_t        act_ofs;        /* offset of action in 32-bit units */
+       uint16_t        cmd_len;        /* # of 32-bit words in cmd     */
+       uint16_t        rulenum;        /* rule number                  */
+       uint8_t set;            /* rule set (0..31)             */
+#define        RESVD_SET       31      /* set for default and persistent rules */
+       uint8_t         _pad;           /* padding                      */
+       uint32_t        id;             /* rule id */
+
+       /* These fields are present in all rules.                       */
+       uint64_t        pcnt;           /* Packet counter               */
+       uint64_t        bcnt;           /* Byte counter                 */
+       uint32_t        timestamp;      /* tv_sec of last match         */
+
+       ipfw_insn       cmd[1];         /* storage for commands         */
+};
+
+#define ACTION_PTR(rule)                               \
+       (ipfw_insn *)( (u_int32_t *)((rule)->cmd) + ((rule)->act_ofs) )
+
+#define RULESIZE(rule)  (sizeof(struct ip_fw) + \
+       ((struct ip_fw *)(rule))->cmd_len * 4 - 4)
+
+/*
+ * This structure is used as a flow mask and a flow id for various
+ * parts of the code.
+ */
+struct ipfw_flow_id {
+       u_int32_t       dst_ip;
+       u_int32_t       src_ip;
+       u_int16_t       dst_port;
+       u_int16_t       src_port;
+       u_int8_t        fib;
+       u_int8_t        proto;
+       u_int8_t        flags;  /* protocol-specific flags */
+       uint8_t         addr_type; /* 4 = ipv4, 6 = ipv6, 1=ether ? */
+       struct in6_addr dst_ip6;        /* could also store MAC addr! */
+       struct in6_addr src_ip6;
+       u_int32_t       flow_id6;
+       u_int32_t       frag_id6;
+};
+
+#define IS_IP6_FLOW_ID(id)     ((id)->addr_type == 6)
+
+/*
+ * Dynamic ipfw rule.
+ */
+typedef struct _ipfw_dyn_rule ipfw_dyn_rule;
+
+struct _ipfw_dyn_rule {
+       ipfw_dyn_rule   *next;          /* linked list of rules.        */
+       struct ip_fw *rule;             /* pointer to rule              */
+       /* 'rule' is used to pass up the rule number (from the parent)  */
+
+       ipfw_dyn_rule *parent;          /* pointer to parent rule       */
+       u_int64_t       pcnt;           /* packet match counter         */
+       u_int64_t       bcnt;           /* byte match counter           */
+       struct ipfw_flow_id id;         /* (masked) flow id             */
+       u_int32_t       expire;         /* expire time                  */
+       u_int32_t       bucket;         /* which bucket in hash table   */
+       u_int32_t       state;          /* state of this rule (typically a
+                                        * combination of TCP flags)
+                                        */
+       u_int32_t       ack_fwd;        /* most recent ACKs in forward  */
+       u_int32_t       ack_rev;        /* and reverse directions (used */
+                                       /* to generate keepalives)      */
+       u_int16_t       dyn_type;       /* rule type                    */
+       u_int16_t       count;          /* refcount                     */
+};
+
+/*
+ * Definitions for IP option names.
+ */
+#define        IP_FW_IPOPT_LSRR        0x01
+#define        IP_FW_IPOPT_SSRR        0x02
+#define        IP_FW_IPOPT_RR          0x04
+#define        IP_FW_IPOPT_TS          0x08
+
+/*
+ * Definitions for TCP option names.
+ */
+#define        IP_FW_TCPOPT_MSS        0x01
+#define        IP_FW_TCPOPT_WINDOW     0x02
+#define        IP_FW_TCPOPT_SACK       0x04
+#define        IP_FW_TCPOPT_TS         0x08
+#define        IP_FW_TCPOPT_CC         0x10
+
+#define        ICMP_REJECT_RST         0x100   /* fake ICMP code (send a TCP RST) */
+#define        ICMP6_UNREACH_RST       0x100   /* fake ICMPv6 code (send a TCP RST) */
+
+/*
+ * These are used for lookup tables.
+ */
+typedef struct _ipfw_table_entry {
+       in_addr_t       addr;           /* network address              */
+       u_int32_t       value;          /* value                        */
+       u_int16_t       tbl;            /* table number                 */
+       u_int8_t        masklen;        /* mask length                  */
+} ipfw_table_entry;
+
+typedef struct _ipfw_table {
+       u_int32_t       size;           /* size of entries in bytes     */
+       u_int32_t       cnt;            /* # of entries                 */
+       u_int16_t       tbl;            /* table number                 */
+       ipfw_table_entry ent[0];        /* entries                      */
+} ipfw_table;
+
+#endif /* _IPFW2_H */
diff --git a/dummynet2/include/netinet/ipfw/ip_fw_private.h b/dummynet2/include/netinet/ipfw/ip_fw_private.h
new file mode 100644 (file)
index 0000000..41ae845
--- /dev/null
@@ -0,0 +1,329 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/netinet/ipfw/ip_fw_private.h 200601 2009-12-16 10:48:40Z luigi $
+ */
+
+#ifndef _IPFW2_PRIVATE_H
+#define _IPFW2_PRIVATE_H
+
+/*
+ * Internal constants and data structures used by ipfw components
+ * and not meant to be exported outside the kernel.
+ */
+
+#ifdef _KERNEL
+
+#define MTAG_IPFW      1148380143      /* IPFW-tagged cookie */
+#define MTAG_IPFW_RULE 1262273568      /* rule reference */
+
+/* Return values from ipfw_chk() */
+enum {
+       IP_FW_PASS = 0,
+       IP_FW_DENY,
+       IP_FW_DIVERT,
+       IP_FW_TEE,
+       IP_FW_DUMMYNET,
+       IP_FW_NETGRAPH,
+       IP_FW_NGTEE,
+       IP_FW_NAT,
+       IP_FW_REASS,
+};
+
+/*
+ * Structure for collecting parameters to dummynet for ip6_output forwarding
+ */
+struct _ip6dn_args {
+       struct ip6_pktopts *opt_or;
+       struct route_in6 ro_or;
+       int flags_or;
+       struct ip6_moptions *im6o_or;
+       struct ifnet *origifp_or;
+       struct ifnet *ifp_or;
+       struct sockaddr_in6 dst_or;
+       u_long mtu_or;
+       struct route_in6 ro_pmtu_or;
+};
+
+/*
+ * Reference to an ipfw rule that can be carried outside critical sections.
+ * A rule is identified by rulenum:rule_id which is ordered.
+ * In version chain_id the rule can be found in slot 'slot', so
+ * we don't need a lookup if chain_id == chain->id.
+ *
+ * On exit from the firewall this structure refers to the rule after
+ * the matching one (slot points to the new rule; rulenum:rule_id-1
+ * is the matching rule), and additional info (e.g. info often contains
+ * the insn argument or tablearg in the low 16 bits, in host format).
+ * On entry, the structure is valid if slot>0, and refers to the starting
+ * rule. 'info' contains the reason for reinject, e.g. divert port,
+ * divert direction, and so on.
+ */
+struct ipfw_rule_ref {
+       uint32_t        slot;           /* slot for matching rule       */
+       uint32_t        rulenum;        /* matching rule number         */
+       uint32_t        rule_id;        /* matching rule id             */
+       uint32_t        chain_id;       /* ruleset id                   */
+       uint32_t        info;           /* see below                    */
+};
+
+enum {
+       IPFW_INFO_MASK  = 0x0000ffff,
+       IPFW_INFO_OUT   = 0x00000000,   /* outgoing, just for convenience */
+       IPFW_INFO_IN    = 0x80000000,   /* incoming, overloads dir */
+       IPFW_ONEPASS    = 0x40000000,   /* One-pass, do not reinject */
+       IPFW_IS_MASK    = 0x30000000,   /* which source ? */
+       IPFW_IS_DIVERT  = 0x20000000,
+       IPFW_IS_DUMMYNET =0x10000000,
+       IPFW_IS_PIPE    = 0x08000000,   /* pipe = 1, queue = 0 */
+};
+
+/*
+ * Arguments for calling ipfw_chk() and dummynet_io(). We put them
+ * all into a structure because this way it is easier and more
+ * efficient to pass variables around and extend the interface.
+ */
+struct ip_fw_args {
+       struct mbuf     *m;             /* the mbuf chain               */
+       struct ifnet    *oif;           /* output interface             */
+       struct sockaddr_in *next_hop;   /* forward address              */
+
+       /*
+        * On return, it points to the matching rule.
+        * On entry, rule.slot > 0 means the info is valid and
+        * contains the starting rule for an ipfw search.
+        * If chain_id == chain->id && slot >0 then jump to that slot.
+        * Otherwise, we locate the first rule >= rulenum:rule_id
+        */
+       struct ipfw_rule_ref rule;      /* match/restart info           */
+
+       struct ether_header *eh;        /* for bridged packets          */
+
+       struct ipfw_flow_id f_id;       /* grabbed from IP header       */
+       //uint32_t      cookie;         /* a cookie depending on rule action */
+       struct inpcb    *inp;
+
+       struct _ip6dn_args      dummypar; /* dummynet->ip6_output */
+       struct sockaddr_in hopstore;    /* store here if cannot use a pointer */
+};
+
+MALLOC_DECLARE(M_IPFW);
+
+/*
+ * Hooks sometime need to know the direction of the packet
+ * (divert, dummynet, netgraph, ...)
+ * We use a generic definition here, with bit0-1 indicating the
+ * direction, bit 2 indicating layer2 or 3, and bits 3-4
+ * indicating the specific protocol (where one needs to be
+ * identified).
+ */
+enum {
+       DIR_MASK =      0x3,
+       DIR_OUT =       0,
+       DIR_IN =        1,
+       DIR_FWD =       2,
+       DIR_DROP =      3,
+       PROTO_LAYER2 =  0x4, /* set for layer 2 */
+       /* PROTO_DEFAULT = 0, */
+       PROTO_IPV4 =    0x08,
+       PROTO_IPV6 =    0x10,
+       PROTO_IFB =     0x0c, /* layer2 + ifbridge */
+    /*  PROTO_OLDBDG =  0x14, unused, old bridge */
+};
+
+/* wrapper for freeing a packet, in case we need to do more work */
+#ifdef __linux__
+#define FREE_PKT(m)    netisr_dispatch(-1, m)
+#else
+#define FREE_PKT(m)    m_freem(m)
+#endif
+
+/*
+ * Function definitions.
+ */
+
+/* attach (arg = 1) or detach (arg = 0) hooks */
+int ipfw_attach_hooks(int);
+#ifdef NOTYET
+void ipfw_nat_destroy(void);
+#endif
+
+/* In ip_fw_log.c */
+struct ip;
+void ipfw_log_bpf(int);
+void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
+       struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
+       struct ip *ip);
+VNET_DECLARE(u_int64_t, norule_counter);
+#define        V_norule_counter        VNET(norule_counter)
+VNET_DECLARE(int, verbose_limit);
+#define        V_verbose_limit         VNET(verbose_limit)
+
+/* In ip_fw_dynamic.c */
+
+enum { /* result for matching dynamic rules */
+       MATCH_REVERSE = 0,
+       MATCH_FORWARD,
+       MATCH_NONE,
+       MATCH_UNKNOWN,
+};
+
+/*
+ * The lock for dynamic rules is only used once outside the file,
+ * and only to release the result of lookup_dyn_rule().
+ * Eventually we may implement it with a callback on the function.
+ */
+void ipfw_dyn_unlock(void);
+
+struct tcphdr;
+struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *,
+    u_int32_t, u_int32_t, int);
+int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
+    struct ip_fw_args *args, uint32_t tablearg);
+ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt,
+       int *match_direction, struct tcphdr *tcp);
+void ipfw_remove_dyn_children(struct ip_fw *rule);
+void ipfw_get_dynamic(char **bp, const char *ep);
+
+void ipfw_dyn_attach(void);    /* uma_zcreate .... */
+void ipfw_dyn_detach(void);    /* uma_zdestroy ... */
+void ipfw_dyn_init(void);      /* per-vnet initialization */
+void ipfw_dyn_uninit(int);     /* per-vnet deinitialization */
+int ipfw_dyn_len(void);
+
+/* common variables */
+VNET_DECLARE(int, fw_one_pass);
+#define        V_fw_one_pass           VNET(fw_one_pass)
+
+VNET_DECLARE(int, fw_verbose);
+#define        V_fw_verbose            VNET(fw_verbose)
+
+VNET_DECLARE(struct ip_fw_chain, layer3_chain);
+#define        V_layer3_chain          VNET(layer3_chain)
+
+VNET_DECLARE(u_int32_t, set_disable);
+#define        V_set_disable           VNET(set_disable)
+
+VNET_DECLARE(int, autoinc_step);
+#define V_autoinc_step         VNET(autoinc_step)
+
+struct ip_fw_chain {
+       struct ip_fw    *rules;         /* list of rules */
+       struct ip_fw    *reap;          /* list of rules to reap */
+       struct ip_fw    *default_rule;
+       int             n_rules;        /* number of static rules */
+       int             static_len;     /* total len of static rules */
+       struct ip_fw    **map;          /* array of rule ptrs to ease lookup */
+       LIST_HEAD(nat_list, cfg_nat) nat;       /* list of nat entries */
+       struct radix_node_head *tables[IPFW_TABLES_MAX];
+#if defined( __linux__ ) || defined( _WIN32 )
+        spinlock_t rwmtx;
+        spinlock_t uh_lock;
+#else
+       struct rwlock   rwmtx;
+       struct rwlock   uh_lock;        /* lock for upper half */
+#endif
+       uint32_t        id;             /* ruleset id */
+};
+
+struct sockopt;        /* used by tcp_var.h */
+
+/*
+ * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c
+ * so the variable and the macros must be here.
+ */
+
+#define        IPFW_LOCK_INIT(_chain) do {                     \
+       rw_init(&(_chain)->rwmtx, "IPFW static rules"); \
+       rw_init(&(_chain)->uh_lock, "IPFW UH lock");    \
+       } while (0)
+
+#define        IPFW_LOCK_DESTROY(_chain) do {                  \
+       rw_destroy(&(_chain)->rwmtx);                   \
+       rw_destroy(&(_chain)->uh_lock);                 \
+       } while (0)
+
+#define        IPFW_WLOCK_ASSERT(_chain)       rw_assert(&(_chain)->rwmtx, RA_WLOCKED)
+
+#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx)
+#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx)
+#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx)
+#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx)
+
+#define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock)
+#define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock)
+#define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock)
+#define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock)
+
+/* In ip_fw_sockopt.c */
+int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id);
+int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule);
+int ipfw_ctl(struct sockopt *sopt);
+int ipfw_chk(struct ip_fw_args *args);
+void ipfw_reap_rules(struct ip_fw *head);
+
+/* In ip_fw_table.c */
+struct radix_node;
+int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint32_t *val);
+int ipfw_init_tables(struct ip_fw_chain *ch);
+int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl);
+void ipfw_flush_tables(struct ip_fw_chain *ch);
+int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen, uint32_t value);
+int ipfw_dump_table_entry(struct radix_node *rn, void *arg);
+int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen);
+int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt);
+int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl);
+
+/* hooks for divert */
+extern void (*ip_divert_ptr)(struct mbuf *m, int incoming);
+
+/* In ip_fw_nat.c */
+
+extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
+
+typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *);
+typedef int ipfw_nat_cfg_t(struct sockopt *);
+
+extern ipfw_nat_t *ipfw_nat_ptr;
+#define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL)
+
+extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_del_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
+
+/* netgraph prototypes */
+
+typedef int ng_ipfw_input_t(struct mbuf **, int, struct ip_fw_args *, int);
+extern  ng_ipfw_input_t *ng_ipfw_input_p;
+#define NG_IPFW_LOADED  (ng_ipfw_input_p != NULL)
+
+#define TAGSIZ  (sizeof(struct ng_ipfw_tag) - sizeof(struct m_tag))
+
+
+#endif /* _KERNEL */
+#endif /* _IPFW2_PRIVATE_H */
diff --git a/dummynet2/ip_dummynet.c b/dummynet2/ip_dummynet.c
new file mode 100644 (file)
index 0000000..bb34c04
--- /dev/null
@@ -0,0 +1,2370 @@
+/*-
+ * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
+ * Portions Copyright (c) 2000 Akamba Corp.
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_dummynet.c 200601 2009-12-16 10:48:40Z luigi $");
+
+#define        DUMMYNET_DEBUG
+
+#include "opt_inet6.h"
+
+/*
+ * This module implements IP dummynet, a bandwidth limiter/delay emulator
+ * used in conjunction with the ipfw package.
+ * Description of the data structures used is in ip_dummynet.h
+ * Here you mainly find the following blocks of code:
+ *  + variable declarations;
+ *  + heap management functions;
+ *  + scheduler and dummynet functions;
+ *  + configuration and initialization.
+ *
+ * NOTA BENE: critical sections are protected by the "dummynet lock".
+ *
+ * Most important Changes:
+ *
+ * 011004: KLDable
+ * 010124: Fixed WF2Q behaviour
+ * 010122: Fixed spl protection.
+ * 000601: WF2Q support
+ * 000106: large rewrite, use heaps to handle very many pipes.
+ * 980513:     initial release
+ *
+ * include files marked with XXX are probably not needed
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/time.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <net/if.h>    /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <net/netisr.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>                /* ip_len, ip_off */
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/ip_var.h>    /* ip_output(), IP_FORWARDING */
+
+#include <netinet/if_ether.h> /* various ether_* routines */
+
+#include <netinet/ip6.h>       /* for ip6_input, ip6_output prototypes */
+#include <netinet6/ip6_var.h>
+
+/*
+ * We keep a private variable for the simulation time, but we could
+ * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
+ */
+static dn_key curr_time = 0 ; /* current simulation time */
+
+static int dn_hash_size = 64 ; /* default hash size */
+
+/* statistics on number of queue searches and search steps */
+static long searches, search_steps ;
+static int pipe_expire = 1 ;   /* expire queue if empty */
+static int dn_max_ratio = 16 ; /* max queues/buckets ratio */
+
+static long pipe_slot_limit = 100; /* Foot shooting limit for pipe queues. */
+static long pipe_byte_limit = 1024 * 1024;
+
+static int red_lookup_depth = 256;     /* RED - default lookup table depth */
+static int red_avg_pkt_size = 512;      /* RED - default medium packet size */
+static int red_max_pkt_size = 1500;     /* RED - default max packet size */
+
+static struct timeval prev_t, t;
+static long tick_last;                 /* Last tick duration (usec). */
+static long tick_delta;                        /* Last vs standard tick diff (usec). */
+static long tick_delta_sum;            /* Accumulated tick difference (usec).*/
+static long tick_adjustment;           /* Tick adjustments done. */
+static long tick_lost;                 /* Lost(coalesced) ticks number. */
+/* Adjusted vs non-adjusted curr_time difference (ticks). */
+static long tick_diff;
+
+static int             io_fast;
+static unsigned long   io_pkt;
+static unsigned long   io_pkt_fast;
+static unsigned long   io_pkt_drop;
+
+/*
+ * Three heaps contain queues and pipes that the scheduler handles:
+ *
+ * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
+ *
+ * wfq_ready_heap contains the pipes associated with WF2Q flows
+ *
+ * extract_heap contains pipes associated with delay lines.
+ *
+ */
+
+MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
+
+static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
+
+static int     heap_init(struct dn_heap *h, int size);
+static int     heap_insert (struct dn_heap *h, dn_key key1, void *p);
+static void    heap_extract(struct dn_heap *h, void *obj);
+static void    transmit_event(struct dn_pipe *pipe, struct mbuf **head,
+                   struct mbuf **tail);
+static void    ready_event(struct dn_flow_queue *q, struct mbuf **head,
+                   struct mbuf **tail);
+static void    ready_event_wfq(struct dn_pipe *p, struct mbuf **head,
+                   struct mbuf **tail);
+
+#define        HASHSIZE        16
+#define        HASH(num)       ((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f)
+static struct dn_pipe_head     pipehash[HASHSIZE];     /* all pipes */
+static struct dn_flow_set_head flowsethash[HASHSIZE];  /* all flowsets */
+
+static struct callout dn_timeout;
+
+extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
+
+#ifdef SYSCTL_NODE
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
+    CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
+#if 0  /* curr_time is 64 bit */
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, curr_time,
+    CTLFLAG_RD, &curr_time, 0, "Current tick");
+#endif
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
+    CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
+    CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, searches,
+    CTLFLAG_RD, &searches, 0, "Number of queue searches");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, search_steps,
+    CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
+    CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
+    CTLFLAG_RW, &dn_max_ratio, 0,
+    "Max ratio between dynamic queues and buckets");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
+    CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
+    CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
+    CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
+    CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
+    CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
+    CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
+    CTLFLAG_RD, &tick_diff, 0,
+    "Adjusted vs non-adjusted curr_time difference (ticks).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
+    CTLFLAG_RD, &tick_lost, 0,
+    "Number of ticks coalesced by dummynet taskqueue.");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
+    CTLFLAG_RW, &io_fast, 0, "Enable fast dummynet io.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
+    CTLFLAG_RD, &io_pkt, 0,
+    "Number of packets passed to dummynet.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
+    CTLFLAG_RD, &io_pkt_fast, 0,
+    "Number of packets bypassed dummynet scheduler.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
+    CTLFLAG_RD, &io_pkt_drop, 0,
+    "Number of packets dropped by dummynet.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
+    CTLFLAG_RW, &pipe_slot_limit, 0, "Upper limit in slots for pipe queue.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
+    CTLFLAG_RW, &pipe_byte_limit, 0, "Upper limit in bytes for pipe queue.");
+#endif
+
+#ifdef DUMMYNET_DEBUG
+int    dummynet_debug = 0;
+#ifdef SYSCTL_NODE
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
+           0, "control debugging printfs");
+#endif
+#define        DPRINTF(X)      if (dummynet_debug) printf X
+#else
+#define        DPRINTF(X)
+#endif
+
+static struct task     dn_task;
+static struct taskqueue        *dn_tq = NULL;
+static void dummynet_task(void *, int);
+
+#if defined( __linux__ ) || defined( _WIN32 )
+static DEFINE_SPINLOCK(dummynet_mtx);
+#else
+static struct mtx dummynet_mtx;
+#endif
+#define        DUMMYNET_LOCK_INIT() \
+       mtx_init(&dummynet_mtx, "dummynet", NULL, MTX_DEF)
+#define        DUMMYNET_LOCK_DESTROY() mtx_destroy(&dummynet_mtx)
+#define        DUMMYNET_LOCK()         mtx_lock(&dummynet_mtx)
+#define        DUMMYNET_UNLOCK()       mtx_unlock(&dummynet_mtx)
+#define        DUMMYNET_LOCK_ASSERT()  mtx_assert(&dummynet_mtx, MA_OWNED)
+
+static int     config_pipe(struct dn_pipe *p);
+static int     ip_dn_ctl(struct sockopt *sopt);
+
+static void    dummynet(void *);
+static void    dummynet_flush(void);
+static void    dummynet_send(struct mbuf *);
+void           dummynet_drain(void);
+static int     dummynet_io(struct mbuf **, int , struct ip_fw_args *);
+
+/*
+ * Flow queue is idle if:
+ *   1) it's empty for at least 1 tick
+ *   2) it has invalid timestamp (WF2Q case)
+ *   3) parent pipe has no 'exhausted' burst.
+ */
+#define QUEUE_IS_IDLE(q) ((q)->head == NULL && (q)->S == (q)->F + 1 && \
+       curr_time > (q)->idle_time + 1 && \
+       ((q)->numbytes + (curr_time - (q)->idle_time - 1) * \
+       (q)->fs->pipe->bandwidth >= (q)->fs->pipe->burst))
+
+/*
+ * Heap management functions.
+ *
+ * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
+ * Some macros help finding parent/children so we can optimize them.
+ *
+ * heap_init() is called to expand the heap when needed.
+ * Increment size in blocks of 16 entries.
+ * XXX failure to allocate a new element is a pretty bad failure
+ * as we basically stall a whole queue forever!!
+ * Returns 1 on error, 0 on success
+ */
+#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
+#define HEAP_LEFT(x) ( 2*(x) + 1 )
+#define HEAP_IS_LEFT(x) ( (x) & 1 )
+#define HEAP_RIGHT(x) ( 2*(x) + 2 )
+#define        HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
+#define HEAP_INCREMENT 15
+
+static int
+heap_init(struct dn_heap *h, int new_size)
+{
+    struct dn_heap_entry *p;
+
+    if (h->size >= new_size ) {
+       printf("dummynet: %s, Bogus call, have %d want %d\n", __func__,
+               h->size, new_size);
+       return 0 ;
+    }
+    new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
+    p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_NOWAIT);
+    if (p == NULL) {
+       printf("dummynet: %s, resize %d failed\n", __func__, new_size );
+       return 1 ; /* error */
+    }
+    if (h->size > 0) {
+       bcopy(h->p, p, h->size * sizeof(*p) );
+       free(h->p, M_DUMMYNET);
+    }
+    h->p = p ;
+    h->size = new_size ;
+    return 0 ;
+}
+
+/*
+ * Insert element in heap. Normally, p != NULL, we insert p in
+ * a new position and bubble up. If p == NULL, then the element is
+ * already in place, and key is the position where to start the
+ * bubble-up.
+ * Returns 1 on failure (cannot allocate new heap entry)
+ *
+ * If offset > 0 the position (index, int) of the element in the heap is
+ * also stored in the element itself at the given offset in bytes.
+ *
+ * NOTE: the two macros below expand to an unbraced 'if'; use them only
+ * as stand-alone statements, never as the body of an if/else.
+ */
+#define SET_OFFSET(heap, node) \
+    if (heap->offset > 0) \
+           *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
+/*
+ * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
+ */
+#define RESET_OFFSET(heap, node) \
+    if (heap->offset > 0) \
+           *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
+static int
+heap_insert(struct dn_heap *h, dn_key key1, void *p)
+{
+    int son = h->elements ;
+
+    if (p == NULL)     /* data already there, set starting point */
+       son = key1 ;
+    else {             /* insert new element at the end, possibly resize */
+       son = h->elements ;
+       if (son == h->size) /* need resize... */
+           if (heap_init(h, h->elements+1) )
+               return 1 ; /* failure... */
+       h->p[son].object = p ;
+       h->p[son].key = key1 ;
+       h->elements++ ;
+    }
+    /* Restore the heap invariant along the path up to the root. */
+    while (son > 0) {                          /* bubble up */
+       int father = HEAP_FATHER(son) ;
+       struct dn_heap_entry tmp  ;
+
+       if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
+           break ; /* found right position */
+       /* son smaller than father, swap and repeat */
+       HEAP_SWAP(h->p[son], h->p[father], tmp) ;
+       SET_OFFSET(h, son);
+       son = father ;
+    }
+    /* Record the final resting index inside the object, if requested. */
+    SET_OFFSET(h, son);
+    return 0 ;
+}
+
+/*
+ * remove top element from heap, or obj if obj != NULL
+ */
+static void
+heap_extract(struct dn_heap *h, void *obj)
+{
+    int child, father, max = h->elements - 1 ;
+
+    if (max < 0) {
+       printf("dummynet: warning, extract from empty heap 0x%p\n", h);
+       return ;
+    }
+    father = 0 ; /* default: move up smallest child */
+    if (obj != NULL) { /* extract specific element, index is at offset */
+       if (h->offset <= 0)
+           panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
+       father = *((int *)((char *)obj + h->offset)) ;
+       if (father < 0 || father >= h->elements) {
+           printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
+               father, h->elements);
+           panic("dummynet: heap_extract");
+       }
+    }
+    RESET_OFFSET(h, father);
+    /*
+     * Sift the smallest child up into the hole, walking down until
+     * the hole reaches a leaf; 'father' ends up as the hole index.
+     */
+    child = HEAP_LEFT(father) ;                /* left child */
+    while (child <= max) {             /* valid entry */
+       if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
+           child = child+1 ;           /* take right child, otherwise left */
+       h->p[father] = h->p[child] ;
+       SET_OFFSET(h, father);
+       father = child ;
+       child = HEAP_LEFT(child) ;   /* left child for next loop */
+    }
+    h->elements-- ;
+    if (father != max) {
+       /*
+        * Fill hole with last entry and bubble up, reusing the insert code
+        */
+       h->p[father] = h->p[max] ;
+       heap_insert(h, father, NULL); /* this one cannot fail */
+    }
+}
+
+#if 0
+/*
+ * change object position and update references
+ * XXX this one is never used!
+ * (kept compiled out under #if 0 for reference only)
+ */
+static void
+heap_move(struct dn_heap *h, dn_key new_key, void *object)
+{
+    int temp;
+    int i ;
+    int max = h->elements-1 ;
+    struct dn_heap_entry buf ;
+
+    if (h->offset <= 0)
+       panic("cannot move items on this heap");
+
+    /* Current index of the object is stored inside the object itself. */
+    i = *((int *)((char *)object + h->offset));
+    if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
+       h->p[i].key = new_key ;
+       for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
+                i = temp ) { /* bubble up */
+           HEAP_SWAP(h->p[i], h->p[temp], buf) ;
+           SET_OFFSET(h, i);
+       }
+    } else {           /* must move down */
+       h->p[i].key = new_key ;
+       while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
+           if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
+               temp++ ; /* select child with min key */
+           if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
+               HEAP_SWAP(h->p[i], h->p[temp], buf) ;
+               SET_OFFSET(h, i);
+           } else
+               break ;
+           i = temp ;
+       }
+    }
+    SET_OFFSET(h, i);
+}
+#endif /* heap_move, unused */
+
+/*
+ * Rebuild the heap property over the whole backing array by
+ * bubbling every element up in turn. Needed after deleting a
+ * bunch of entries in place.
+ */
+static void
+heapify(struct dn_heap *h)
+{
+    int pos;
+
+    for (pos = 0; pos < h->elements; pos++)
+       heap_insert(h, pos, NULL);
+}
+
+/*
+ * Release the heap's backing storage (if any) and wipe the
+ * descriptor so the heap is left in a pristine, empty state.
+ */
+static void
+heap_free(struct dn_heap *h)
+{
+    struct dn_heap_entry *storage = h->p;
+    int had_storage = (h->size > 0);
+
+    bzero(h, sizeof(*h));
+    if (had_storage)
+       free(storage, M_DUMMYNET);
+}
+
+/*
+ * --- end of heap management functions ---
+ */
+
+/*
+ * Dispose of a whole m_nextpkt-linked chain of packets.
+ * Kept as a single inline helper so that, should packets ever
+ * carry extra state to release, there is one central place to do it.
+ */
+static __inline void dn_free_pkts(struct mbuf *mnext)
+{
+       struct mbuf *cur;
+
+       for (cur = mnext; cur != NULL; cur = mnext) {
+               mnext = cur->m_nextpkt;
+               FREE_PKT(cur);
+       }
+}
+
+/*
+ * Return the mbuf tag holding the dummynet state.  As an optimization
+ * this is assumed to be the first tag on the list.  If this turns out
+ * wrong we'll need to search the list.
+ */
+static struct dn_pkt_tag *
+dn_tag_get(struct mbuf *m)
+{
+    struct m_tag *mtag = m_tag_first(m);
+    /* Every packet sitting on a dummynet queue must carry the tag. */
+    KASSERT(mtag != NULL &&
+           mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
+           mtag->m_tag_id == PACKET_TAG_DUMMYNET,
+           ("packet on dummynet queue w/o dummynet tag!"));
+    /* The dn_pkt_tag payload lives immediately after the m_tag header. */
+    return (struct dn_pkt_tag *)(mtag+1);
+}
+
+/*
+ * Scheduler functions:
+ *
+ * transmit_event() is called when the delay-line needs to enter
+ * the scheduler, either because of existing pkts getting ready,
+ * or new packets entering the queue. The event handled is the delivery
+ * time of the packet.
+ *
+ * ready_event() does something similar with fixed-rate queues, and the
+ * event handled is the finish time of the head pkt.
+ *
+ * wfq_ready_event() does something similar with WF2Q queues, and the
+ * event handled is the start time of the head pkt.
+ *
+ * In all cases, we make sure that the data structures are consistent
+ * before passing pkts out, because this might trigger recursive
+ * invocations of the procedures.
+ */
+static void
+transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail)
+{
+       struct mbuf *m;
+       struct dn_pkt_tag *pkt;
+
+       DUMMYNET_LOCK_ASSERT();
+
+       /* Move every packet whose delivery time has arrived onto the
+        * caller's head/tail list; stop at the first not-yet-due packet. */
+       while ((m = pipe->head) != NULL) {
+               pkt = dn_tag_get(m);
+               if (!DN_KEY_LEQ(pkt->output_time, curr_time))
+                       break;
+
+               pipe->head = m->m_nextpkt;
+               if (*tail != NULL)
+                       (*tail)->m_nextpkt = m;
+               else
+                       *head = m;
+               *tail = m;
+       }
+       if (*tail != NULL)
+               (*tail)->m_nextpkt = NULL;
+
+       /* If there are leftover packets, put into the heap for next event. */
+       if ((m = pipe->head) != NULL) {
+               pkt = dn_tag_get(m);
+               /*
+                * XXX Should check errors on heap_insert, by draining the
+                * whole pipe p and hoping in the future we are more successful.
+                */
+               heap_insert(&extract_heap, pkt->output_time, pipe);
+       }
+}
+
+#ifndef __linux__
+/* On non-Linux builds a plain signed 64-bit division is available. */
+#define div64(a, b)    ((int64_t)(a) / (int64_t)(b))
+#endif
+/*
+ * Compute how many ticks we have to wait before being able to send
+ * a packet. This is computed as the "wire time" for the packet
+ * (length + extra bits), minus the credit available, scaled to ticks.
+ * Check that the result is not negative (it could be if we have
+ * too much leftover credit in q->numbytes).
+ */
+static inline dn_key
+set_ticks(struct mbuf *m, struct dn_flow_queue *q, struct dn_pipe *p)
+{
+       int64_t ret;
+
+       /* Adding (bandwidth - 1) makes the division round up. */
+       ret = div64( (m->m_pkthdr.len * 8 + q->extra_bits) * hz
+               - q->numbytes + p->bandwidth - 1 , p->bandwidth);
+       if (ret < 0)
+               ret = 0;
+       return ret;
+}
+
+/*
+ * Convert the additional MAC overheads/delays into an equivalent
+ * number of bits for the given data rate. The samples are in milliseconds
+ * so we need to divide by 1000.
+ */
+static dn_key
+compute_extra_bits(struct mbuf *pkt, struct dn_pipe *p)
+{
+       int index;
+       dn_key extra_bits;
+
+       /* No sample table configured: no extra overhead to account for. */
+       if (!p->samples || p->samples_no == 0)
+               return 0;
+       /* Draw one sample at random from the empirical distribution. */
+       index  = random() % p->samples_no;
+       extra_bits = div64((dn_key)p->samples[index] * p->bandwidth, 1000);
+       /* Samples at or beyond loss_level model frame loss: mark for drop. */
+       if (index >= p->loss_level) {
+               struct dn_pkt_tag *dt = dn_tag_get(pkt);
+               if (dt)
+                       dt->dn_dir = DIR_DROP;
+       }
+       return extra_bits;
+}
+
+/*
+ * Release a pipe descriptor together with its optional array of
+ * MAC-overhead delay samples.
+ */
+static void
+free_pipe(struct dn_pipe *p)
+{
+       if (p->samples != NULL)
+               free(p->samples, M_DUMMYNET);
+       free(p, M_DUMMYNET);
+}
+
+/*
+ * extract pkt from queue, compute output time (could be now)
+ * and put into delay line (p_queue)
+ */
+static void
+move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, struct dn_pipe *p,
+    int len)
+{
+    struct dn_pkt_tag *dt = dn_tag_get(pkt);
+
+    /* Unlink the packet from the flow queue and update counters. */
+    q->head = pkt->m_nextpkt ;
+    q->len-- ;
+    q->len_bytes -= len ;
+
+    /* Delivery time is now plus the pipe's propagation delay. */
+    dt->output_time = curr_time + p->delay ;
+
+    /* Append at the tail of the pipe's delay line (FIFO). */
+    if (p->head == NULL)
+       p->head = pkt;
+    else
+       p->tail->m_nextpkt = pkt;
+    p->tail = pkt;
+    p->tail->m_nextpkt = NULL;
+}
+
+/*
+ * ready_event() is invoked every time the queue must enter the
+ * scheduler, either because the first packet arrives, or because
+ * a previously scheduled event fired.
+ * On invocation, drain as many pkts as possible (could be 0) and then
+ * if there are leftover packets reinsert the pkt in the scheduler.
+ */
+static void
+ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail)
+{
+       struct mbuf *pkt;
+       struct dn_pipe *p = q->fs->pipe;
+       int p_was_empty;
+
+       DUMMYNET_LOCK_ASSERT();
+
+       if (p == NULL) {
+               printf("dummynet: ready_event- pipe is gone\n");
+               return;
+       }
+       /* Remember delay-line state: if it was empty we must kick
+        * transmit_event() ourselves at the end. */
+       p_was_empty = (p->head == NULL);
+
+       /*
+        * Schedule fixed-rate queues linked to this pipe:
+        * account for the bw accumulated since last scheduling, then
+        * drain as many pkts as allowed by q->numbytes and move to
+        * the delay line (in p) computing output time.
+        * bandwidth==0 (no limit) means we can drain the whole queue,
+        * setting len_scaled = 0 does the job.
+        */
+       q->numbytes += (curr_time - q->sched_time) * p->bandwidth;
+       while ((pkt = q->head) != NULL) {
+               int len = pkt->m_pkthdr.len;
+               dn_key len_scaled = p->bandwidth ? len*8*hz
+                       + q->extra_bits*hz
+                       : 0;
+
+               if (DN_KEY_GT(len_scaled, q->numbytes))
+                       break;
+               q->numbytes -= len_scaled;
+               move_pkt(pkt, q, p, len);
+               /* Pre-compute the MAC overhead for the new head packet. */
+               if (q->head)
+                       q->extra_bits = compute_extra_bits(q->head, p);
+       }
+       /*
+        * If we have more packets queued, schedule next ready event
+        * (can only occur when bandwidth != 0, otherwise we would have
+        * flushed the whole queue in the previous loop).
+        * To this purpose we record the current time and compute how many
+        * ticks to go for the finish time of the packet.
+        */
+       if ((pkt = q->head) != NULL) {  /* this implies bandwidth != 0 */
+               dn_key t = set_ticks(pkt, q, p); /* ticks i have to wait */
+
+               q->sched_time = curr_time;
+               heap_insert(&ready_heap, curr_time + t, (void *)q);
+               /*
+                * XXX Should check errors on heap_insert, and drain the whole
+                * queue on error hoping next time we are luckier.
+                */
+       } else          /* RED needs to know when the queue becomes empty. */
+               q->idle_time = curr_time;
+
+       /*
+        * If the delay line was empty call transmit_event() now.
+        * Otherwise, the scheduler will take care of it.
+        */
+       if (p_was_empty)
+               transmit_event(p, head, tail);
+}
+
+/*
+ * Called when we can transmit packets on WF2Q queues. Take pkts out of
+ * the queues at their start time, and enqueue into the delay line.
+ * Packets are drained until p->numbytes < 0. As long as
+ * len_scaled >= p->numbytes, the packet goes into the delay line
+ * with a deadline p->delay. For the last packet, if p->numbytes < 0,
+ * there is an additional delay.
+ */
+static void
+ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail)
+{
+       int p_was_empty = (p->head == NULL);
+       struct dn_heap *sch = &(p->scheduler_heap);
+       struct dn_heap *neh = &(p->not_eligible_heap);
+       int64_t p_numbytes = p->numbytes;
+
+       /*
+        * p->numbytes is only 32bits in FBSD7, but we might need 64 bits.
+        * Use a local variable for the computations, and write back the
+        * results when done, saturating if needed.
+        * The local variable has no impact on performance and helps
+        * reducing diffs between the various branches.
+        */
+
+       DUMMYNET_LOCK_ASSERT();
+
+       if (p->if_name[0] == 0)         /* tx clock is simulated */
+               p_numbytes += (curr_time - p->sched_time) * p->bandwidth;
+       else {  /*
+                * tx clock is for real,
+                * the ifq must be empty or this is a NOP.
+                */
+#ifdef __linux__
+               /* Linux build has no ifq to inspect: nothing to do. */
+               return;
+#else
+               if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
+                       return;
+               else {
+                       DPRINTF(("dummynet: pipe %d ready from %s --\n",
+                           p->pipe_nr, p->if_name));
+               }
+#endif
+       }
+
+       /*
+        * While we have backlogged traffic AND credit, we need to do
+        * something on the queue.
+        */
+       while (p_numbytes >= 0 && (sch->elements > 0 || neh->elements > 0)) {
+               if (sch->elements > 0) {
+                       /* Have some eligible pkts to send out. */
+                       struct dn_flow_queue *q = sch->p[0].object;
+                       struct mbuf *pkt = q->head;
+                       struct dn_flow_set *fs = q->fs;
+                       uint64_t len = pkt->m_pkthdr.len;
+                       int len_scaled = p->bandwidth ? len * 8 * hz : 0;
+
+                       heap_extract(sch, NULL); /* Remove queue from heap. */
+                       p_numbytes -= len_scaled;
+                       move_pkt(pkt, q, p, len);
+
+                       p->V += div64((len << MY_M), p->sum);   /* Update V. */
+                       q->S = q->F;                    /* Update start time. */
+                       if (q->len == 0) {
+                               /* Flow not backlogged any more. */
+                               fs->backlogged--;
+                               heap_insert(&(p->idle_heap), q->F, q);
+                       } else {
+                               /* Still backlogged. */
+
+                               /*
+                                * Update F and position in backlogged queue,
+                                * then put flow in not_eligible_heap
+                                * (we will fix this later).
+                                */
+                               len = (q->head)->m_pkthdr.len;
+                               q->F += div64((len << MY_M), fs->weight);
+                               if (DN_KEY_LEQ(q->S, p->V))
+                                       heap_insert(neh, q->S, q);
+                               else
+                                       heap_insert(sch, q->F, q);
+                       }
+               }
+               /*
+                * Now compute V = max(V, min(S_i)). Remember that all elements
+                * in sch have by definition S_i <= V so if sch is not empty,
+                * V is surely the max and we must not update it. Conversely,
+                * if sch is empty we only need to look at neh.
+                */
+               if (sch->elements == 0 && neh->elements > 0)
+                       p->V = MAX64(p->V, neh->p[0].key);
+               /* Move from neh to sch any packets that have become eligible */
+               while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V)) {
+                       struct dn_flow_queue *q = neh->p[0].object;
+                       heap_extract(neh, NULL);
+                       heap_insert(sch, q->F, q);
+               }
+
+               if (p->if_name[0] != '\0') { /* Tx clock is from a real thing */
+                       p_numbytes = -1;        /* Mark not ready for I/O. */
+                       break;
+               }
+       }
+       if (sch->elements == 0 && neh->elements == 0 && p_numbytes >= 0) {
+               p->idle_time = curr_time;
+               /*
+                * No traffic and no events scheduled.
+                * We can get rid of idle-heap.
+                */
+               if (p->idle_heap.elements > 0) {
+                       int i;
+
+                       /* Invalidate every idle flow's timestamps (S = F + 1
+                        * is the "invalid" marker) and reset the virtual
+                        * clock so the next busy period starts from scratch. */
+                       for (i = 0; i < p->idle_heap.elements; i++) {
+                               struct dn_flow_queue *q;
+
+                               q = p->idle_heap.p[i].object;
+                               q->F = 0;
+                               q->S = q->F + 1;
+                       }
+                       p->sum = 0;
+                       p->V = 0;
+                       p->idle_heap.elements = 0;
+               }
+       }
+       /*
+        * If we are getting clocks from dummynet (not a real interface) and
+        * If we are under credit, schedule the next ready event.
+        * Also fix the delivery time of the last packet.
+        */
+       if (p->if_name[0]==0 && p_numbytes < 0) { /* This implies bw > 0. */
+               dn_key t = 0;           /* Number of ticks i have to wait. */
+
+               if (p->bandwidth > 0)
+                       t = div64(p->bandwidth - 1 - p_numbytes, p->bandwidth);
+               dn_tag_get(p->tail)->output_time += t;
+               p->sched_time = curr_time;
+               heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
+               /*
+                * XXX Should check errors on heap_insert, and drain the whole
+                * queue on error hoping next time we are luckier.
+                */
+       }
+
+       /* Write back p_numbytes (adjust 64->32bit if necessary). */
+       p->numbytes = p_numbytes;
+
+       /*
+        * If the delay line was empty call transmit_event() now.
+        * Otherwise, the scheduler will take care of it.
+        */
+       if (p_was_empty)
+               transmit_event(p, head, tail);
+}
+
+/*
+ * This is called one tick, after previous run. It is used to
+ * schedule next run.
+ * Callout handler: defer the real work to the taskqueue so it runs
+ * outside callout context.
+ */
+static void
+dummynet(void * __unused unused)
+{
+
+       taskqueue_enqueue(dn_tq, &dn_task);
+}
+
+/*
+ * The main dummynet processing function.
+ * Runs once per tick from the taskqueue: advances the virtual clock,
+ * fires all due events from the three heaps, expires idle WF2Q flows
+ * and finally delivers the collected packets and re-arms the callout.
+ */
+static void
+dummynet_task(void *context, int pending)
+{
+       struct mbuf *head = NULL, *tail = NULL;
+       struct dn_pipe *pipe;
+       struct dn_heap *heaps[3];
+       struct dn_heap *h;
+       void *p;        /* generic parameter to handler */
+       int i;
+
+       DUMMYNET_LOCK();
+
+       heaps[0] = &ready_heap;                 /* fixed-rate queues */
+       heaps[1] = &wfq_ready_heap;             /* wfq queues */
+       heaps[2] = &extract_heap;               /* delay line */
+
+       /* Update number of lost(coalesced) ticks. */
+       tick_lost += pending - 1;
+       getmicrouptime(&t);
+       /* Last tick duration (usec). */
+       tick_last = (t.tv_sec - prev_t.tv_sec) * 1000000 +
+           (t.tv_usec - prev_t.tv_usec);
+       /* Last tick vs standard tick difference (usec). */
+       tick_delta = (tick_last * hz - 1000000) / hz;
+       /* Accumulated tick difference (usec). */
+       tick_delta_sum += tick_delta;
+       prev_t = t;
+       /*
+        * Adjust curr_time if accumulated tick difference greater than
+        * 'standard' tick. Since curr_time should be monotonically increasing,
+        * we do positive adjustment as required and throttle curr_time in
+        * case of negative adjustment.
+        */
+       curr_time++;
+       if (tick_delta_sum - tick >= 0) {
+               int diff = tick_delta_sum / tick;
+               curr_time += diff;
+               tick_diff += diff;
+               tick_delta_sum %= tick;
+               tick_adjustment++;
+       } else if (tick_delta_sum + tick <= 0) {
+               curr_time--;
+               tick_diff--;
+               tick_delta_sum += tick;
+               tick_adjustment++;
+       }
+
+       /* Fire every event whose key is now due, heap by heap. */
+       for (i = 0; i < 3; i++) {
+               h = heaps[i];
+               while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) {
+                       if (h->p[0].key > curr_time)
+                               printf("dummynet: warning, "
+                                   "heap %d is %d ticks late\n",
+                                   i, (int)(curr_time - h->p[0].key));
+                       /* store a copy before heap_extract */
+                       p = h->p[0].object;
+                       /* need to extract before processing */
+                       heap_extract(h, NULL);
+                       if (i == 0)
+                               ready_event(p, &head, &tail);
+                       else if (i == 1) {
+                               struct dn_pipe *pipe = p;
+                               if (pipe->if_name[0] != '\0')
+                                       printf("dummynet: bad ready_event_wfq "
+                                           "for pipe %s\n", pipe->if_name);
+                               else
+                                       ready_event_wfq(p, &head, &tail);
+                       } else
+                               transmit_event(p, &head, &tail);
+               }
+       }
+
+       /* Sweep pipes trying to expire idle flow_queues. */
+       for (i = 0; i < HASHSIZE; i++) {
+               SLIST_FOREACH(pipe, &pipehash[i], next) {
+                       if (pipe->idle_heap.elements > 0 &&
+                           DN_KEY_LT(pipe->idle_heap.p[0].key, pipe->V)) {
+                               struct dn_flow_queue *q =
+                                   pipe->idle_heap.p[0].object;
+
+                               heap_extract(&(pipe->idle_heap), NULL);
+                               /* Mark timestamp as invalid. */
+                               q->S = q->F + 1;
+                               pipe->sum -= q->fs->weight;
+                       }
+               }
+       }
+
+       DUMMYNET_UNLOCK();
+
+       /* Deliver collected packets outside the lock. */
+       if (head != NULL)
+               dummynet_send(head);
+
+       /* Re-arm for the next tick. */
+       callout_reset(&dn_timeout, 1, dummynet, NULL);
+}
+
+/*
+ * Deliver a chain of packets that have left the delay line, each to the
+ * destination recorded in its dummynet tag (reinject into ip_input/
+ * ip_output, bridge, ether demux/output, or drop).
+ */
+static void
+dummynet_send(struct mbuf *m)
+{
+       struct mbuf *n;
+
+       for (; m != NULL; m = n) {
+               struct ifnet *ifp = NULL;
+               int dst;
+               struct m_tag *tag;
+
+               /* Detach the packet from the chain before handing it off. */
+               n = m->m_nextpkt;
+               m->m_nextpkt = NULL;
+               tag = m_tag_first(m);
+               if (tag == NULL) {
+                       /* No tag: we cannot tell where it goes, drop it. */
+                       dst = DIR_DROP;
+               } else {
+                       struct dn_pkt_tag *pkt = dn_tag_get(m);
+                       /* extract the dummynet info, rename the tag */
+                       dst = pkt->dn_dir;
+                       ifp = pkt->ifp;
+                       /* rename the tag so it carries reinject info */
+                       tag->m_tag_cookie = MTAG_IPFW_RULE;
+                       tag->m_tag_id = 0;
+               }
+
+               switch (dst) {
+               case DIR_OUT:
+                       SET_HOST_IPLEN(mtod(m, struct ip *));
+                       ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
+                       break ;
+               case DIR_IN :
+                       /* put header in network format for ip_input() */
+                       //SET_NET_IPLEN(mtod(m, struct ip *));
+                       netisr_dispatch(NETISR_IP, m);
+                       break;
+#ifdef INET6
+               case DIR_IN | PROTO_IPV6:
+                       netisr_dispatch(NETISR_IPV6, m);
+                       break;
+
+               case DIR_OUT | PROTO_IPV6:
+                       /* NOTE(review): SET_HOST_IPLEN casts the header to
+                        * struct ip even though this is an IPv6 packet --
+                        * confirm this is intended on this platform. */
+                       SET_HOST_IPLEN(mtod(m, struct ip *));
+                       ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
+                       break;
+#endif
+               case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */
+                       if (bridge_dn_p != NULL)
+                               ((*bridge_dn_p)(m, ifp));
+                       else
+                               printf("dummynet: if_bridge not loaded\n");
+
+                       break;
+               case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */
+                       /*
+                        * The Ethernet code assumes the Ethernet header is
+                        * contiguous in the first mbuf header.
+                        * Insure this is true.
+                        */
+                       if (m->m_len < ETHER_HDR_LEN &&
+                           (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
+                               printf("dummynet/ether: pullup failed, "
+                                   "dropping packet\n");
+                               break;
+                       }
+                       ether_demux(m->m_pkthdr.rcvif, m);
+                       break;
+               case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */
+                       ether_output_frame(ifp, m);
+                       break;
+
+               case DIR_DROP:
+                       /* drop the packet after some time */
+                       FREE_PKT(m);
+                       break;
+
+               default:
+                       printf("dummynet: bad switch %d!\n", dst);
+                       FREE_PKT(m);
+                       break;
+               }
+       }
+}
+
+/*
+ * Unconditionally expire empty queues in case of shortage.
+ * Returns the number of queues freed.
+ */
+static int
+expire_queues(struct dn_flow_set *fs)
+{
+    struct dn_flow_queue *q, *prev ;
+    int i, initial_elements = fs->rq_elements ;
+
+    /* Rate-limit: run the sweep at most once per second of uptime. */
+    if (fs->last_expired == time_uptime)
+       return 0 ;
+    fs->last_expired = time_uptime ;
+    for (i = 0 ; i <= fs->rq_size ; i++) { /* last one is overflow */
+       for (prev=NULL, q = fs->rq[i] ; q != NULL ; ) {
+           if (!QUEUE_IS_IDLE(q)) {
+               prev = q ;
+               q = q->next ;
+           } else { /* entry is idle, expire it */
+               struct dn_flow_queue *old_q = q ;
+
+               /* Unlink from the hash chain before freeing. */
+               if (prev != NULL)
+                   prev->next = q = q->next ;
+               else
+                   fs->rq[i] = q = q->next ;
+               fs->rq_elements-- ;
+               free(old_q, M_DUMMYNET);
+           }
+       }
+    }
+    return initial_elements - fs->rq_elements ;
+}
+
+/*
+ * If room, create a new queue and put at head of slot i;
+ * otherwise, create or use the default queue.
+ */
+static struct dn_flow_queue *
+create_queue(struct dn_flow_set *fs, int i)
+{
+       struct dn_flow_queue *q;
+
+       if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
+           expire_queues(fs) == 0) {
+               /* No way to get room, use or create overflow queue. */
+               i = fs->rq_size;
+               if (fs->rq[i] != NULL)
+                   return fs->rq[i];
+       }
+       q = malloc(sizeof(*q), M_DUMMYNET, M_NOWAIT | M_ZERO);
+       if (q == NULL) {
+               printf("dummynet: sorry, cannot allocate queue for new flow\n");
+               return (NULL);
+       }
+       q->fs = fs;
+       q->hash_slot = i;
+       /* Insert at the head of the chosen hash chain. */
+       q->next = fs->rq[i];
+       q->S = q->F + 1;        /* hack - mark timestamp as invalid. */
+       /* Start with full burst credit (plus one tick's worth if io_fast). */
+       q->numbytes = fs->pipe->burst + (io_fast ? fs->pipe->bandwidth : 0);
+       fs->rq[i] = q;
+       fs->rq_elements++;
+       return (q);
+}
+
+/*
+ * Given a flow_set and a pkt in last_pkt, find a matching queue
+ * after appropriate masking. The queue is moved to front
+ * so that further searches take less time.
+ */
+static struct dn_flow_queue *
+find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
+{
+    int i = 0 ; /* we need i and q for new allocations */
+    struct dn_flow_queue *q, *prev;
+    int is_v6 = IS_IP6_FLOW_ID(id);
+
+    if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
+       q = fs->rq[0] ;
+    else {
+       /* first, do the masking, then hash */
+       id->dst_port &= fs->flow_mask.dst_port ;
+       id->src_port &= fs->flow_mask.src_port ;
+       id->proto &= fs->flow_mask.proto ;
+       id->flags = 0 ; /* we don't care about this one */
+       if (is_v6) {
+           APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6);
+           APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6);
+           id->flow_id6 &= fs->flow_mask.flow_id6;
+
+           /*
+            * NOTE(review): for 32-bit words, (x << 16) & 0xffff is
+            * always 0, so the last four src_ip6 terms below contribute
+            * nothing to the hash (harmless, but dead). Likewise the
+            * 0xfffff masks (five f's) let bit 16 through; the final
+            * "i % rq_size" keeps the result in range either way.
+            */
+           i = ((id->dst_ip6.__u6_addr.__u6_addr32[0]) & 0xffff)^
+               ((id->dst_ip6.__u6_addr.__u6_addr32[1]) & 0xffff)^
+               ((id->dst_ip6.__u6_addr.__u6_addr32[2]) & 0xffff)^
+               ((id->dst_ip6.__u6_addr.__u6_addr32[3]) & 0xffff)^
+
+               ((id->dst_ip6.__u6_addr.__u6_addr32[0] >> 15) & 0xffff)^
+               ((id->dst_ip6.__u6_addr.__u6_addr32[1] >> 15) & 0xffff)^
+               ((id->dst_ip6.__u6_addr.__u6_addr32[2] >> 15) & 0xffff)^
+               ((id->dst_ip6.__u6_addr.__u6_addr32[3] >> 15) & 0xffff)^
+
+               ((id->src_ip6.__u6_addr.__u6_addr32[0] << 1) & 0xfffff)^
+               ((id->src_ip6.__u6_addr.__u6_addr32[1] << 1) & 0xfffff)^
+               ((id->src_ip6.__u6_addr.__u6_addr32[2] << 1) & 0xfffff)^
+               ((id->src_ip6.__u6_addr.__u6_addr32[3] << 1) & 0xfffff)^
+
+               ((id->src_ip6.__u6_addr.__u6_addr32[0] << 16) & 0xffff)^
+               ((id->src_ip6.__u6_addr.__u6_addr32[1] << 16) & 0xffff)^
+               ((id->src_ip6.__u6_addr.__u6_addr32[2] << 16) & 0xffff)^
+               ((id->src_ip6.__u6_addr.__u6_addr32[3] << 16) & 0xffff)^
+
+               (id->dst_port << 1) ^ (id->src_port) ^
+               (id->proto ) ^
+               (id->flow_id6);
+       } else {
+           id->dst_ip &= fs->flow_mask.dst_ip ;
+           id->src_ip &= fs->flow_mask.src_ip ;
+
+           i = ( (id->dst_ip) & 0xffff ) ^
+               ( (id->dst_ip >> 15) & 0xffff ) ^
+               ( (id->src_ip << 1) & 0xffff ) ^
+               ( (id->src_ip >> 16 ) & 0xffff ) ^
+               (id->dst_port << 1) ^ (id->src_port) ^
+               (id->proto );
+       }
+       i = i % fs->rq_size ;
+       /* finally, scan the current list for a match */
+       searches++ ;
+       for (prev=NULL, q = fs->rq[i] ; q ; ) {
+           search_steps++;
+           if (is_v6 &&
+                   IN6_ARE_ADDR_EQUAL(&id->dst_ip6,&q->id.dst_ip6) &&  
+                   IN6_ARE_ADDR_EQUAL(&id->src_ip6,&q->id.src_ip6) &&  
+                   id->dst_port == q->id.dst_port &&
+                   id->src_port == q->id.src_port &&
+                   id->proto == q->id.proto &&
+                   id->flags == q->id.flags &&
+                   id->flow_id6 == q->id.flow_id6)
+               break ; /* found */
+
+           if (!is_v6 && id->dst_ip == q->id.dst_ip &&
+                   id->src_ip == q->id.src_ip &&
+                   id->dst_port == q->id.dst_port &&
+                   id->src_port == q->id.src_port &&
+                   id->proto == q->id.proto &&
+                   id->flags == q->id.flags)
+               break ; /* found */
+
+           /* No match. Check if we can expire the entry */
+           if (pipe_expire && QUEUE_IS_IDLE(q)) {
+               /* entry is idle and not in any heap, expire it */
+               struct dn_flow_queue *old_q = q ;
+
+               if (prev != NULL)
+                   prev->next = q = q->next ;
+               else
+                   fs->rq[i] = q = q->next ;
+               fs->rq_elements-- ;
+               free(old_q, M_DUMMYNET);
+               continue ;
+           }
+           prev = q ;
+           q = q->next ;
+       }
+       /* Move-to-front so repeated lookups for this flow are cheap. */
+       if (q && prev != NULL) { /* found and not in front */
+           prev->next = q->next ;
+           q->next = fs->rq[i] ;
+           fs->rq[i] = q ;
+       }
+    }
+    if (q == NULL) { /* no match, need to allocate a new entry */
+       q = create_queue(fs, i);
+       /* (the assignment below is the body of the if, despite indent) */
+       if (q != NULL)
+       q->id = *id ;
+    }
+    return q ;
+}
+
+/*
+ * Apply the RED/gentle-RED drop policy to a packet of 'len' bytes queued
+ * on 'q' (a queue of flow_set 'fs').  Updates the average queue estimate
+ * q->avg and the drop bookkeeping fields (q->count, q->random).
+ * Returns 1 if the packet must be dropped, 0 if it is accepted.
+ */
+static int
+red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
+{
+       /*
+        * RED algorithm
+        *
+        * RED calculates the average queue size (avg) using a low-pass filter
+        * with an exponential weighted (w_q) moving average:
+        *      avg  <-  (1-w_q) * avg + w_q * q_size
+        * where q_size is the queue length (measured in bytes or packets).
+        *
+        * If q_size == 0, we compute the idle time for the link, and set
+        *      avg = (1 - w_q)^(idle/s)
+        * where s is the time needed for transmitting a medium-sized packet.
+        *
+        * Now, if avg < min_th the packet is enqueued.
+        * If avg > max_th the packet is dropped. Otherwise, the packet is
+        * dropped with probability P function of avg.
+        */
+
+       int64_t p_b = 0;
+
+       /* Queue in bytes or packets? */
+       u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ?
+           q->len_bytes : q->len;
+
+       DPRINTF(("\ndummynet: %d q: %2u ", (int)curr_time, q_size));
+
+       /* Average queue size estimation. */
+       if (q_size != 0) {
+               /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
+               int diff = SCALE(q_size) - q->avg;
+               int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
+
+               q->avg += (int)v;
+       } else {
+               /*
+                * Queue is empty, find for how long the queue has been
+                * empty and use a lookup table for computing
+                * (1 - w_q)^(idle_time/s) where s is the time to send a
+                * (small) packet.
+                * XXX check wraps...
+                */
+               if (q->avg) {
+                       u_int t = div64(curr_time - q->idle_time,
+                           fs->lookup_step);
+
+                       q->avg = (t < fs->lookup_depth) ?
+                           SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
+               }
+       }
+       DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));
+
+       /* Should we drop? */
+       if (q->avg < fs->min_th) {
+               q->count = -1;
+               return (0);     /* accept packet */
+       }
+       if (q->avg >= fs->max_th) {     /* average queue >=  max threshold */
+               if (fs->flags_fs & DN_IS_GENTLE_RED) {
+                       /*
+                        * According to Gentle-RED, if avg is greater than
+                        * max_th the packet is dropped with a probability
+                        *       p_b = c_3 * avg - c_4
+                        * where c_3 = (1 - max_p) / max_th
+                        *       c_4 = 1 - 2 * max_p
+                        */
+                       p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
+                           fs->c_4;
+               } else {
+                       q->count = -1;
+                       DPRINTF(("dummynet: - drop"));
+                       return (1);
+               }
+       } else if (q->avg > fs->min_th) {
+               /*
+                * We compute p_b using the linear dropping function
+                *       p_b = c_1 * avg - c_2
+                * where c_1 = max_p / (max_th - min_th)
+                *       c_2 = max_p * min_th / (max_th - min_th)
+                */
+               p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
+       }
+
+       /* Scale the drop probability by packet size for byte-based limits. */
+       if (fs->flags_fs & DN_QSIZE_IS_BYTES)
+               p_b = div64(p_b * len, fs->max_pkt_size);
+       if (++q->count == 0)
+               q->random = random() & 0xffff;
+       else {
+               /*
+                * q->count counts packets arrived since last drop, so a greater
+                * value of q->count means a greater packet drop probability.
+                */
+               if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
+                       q->count = 0;
+                       DPRINTF(("dummynet: - red drop"));
+                       /* After a drop we calculate a new random value. */
+                       q->random = random() & 0xffff;
+                       return (1);     /* drop */
+               }
+       }
+       /* End of RED algorithm. */
+
+       return (0);     /* accept */
+}
+
+/*
+ * Look up the flow_set with number 'fs_nr' in its hash bucket.
+ * Returns the descriptor, or NULL if no such flow_set exists.
+ */
+static __inline struct dn_flow_set *
+locate_flowset(int fs_nr)
+{
+       struct dn_flow_set *p;
+
+       for (p = SLIST_FIRST(&flowsethash[HASH(fs_nr)]); p != NULL;
+           p = SLIST_NEXT(p, next)) {
+               if (p->fs_nr == fs_nr)
+                       return (p);
+       }
+       return (NULL);
+}
+
+/*
+ * Look up the pipe with number 'pipe_nr' in its hash bucket.
+ * Returns the descriptor, or NULL if no such pipe exists.
+ */
+static __inline struct dn_pipe *
+locate_pipe(int pipe_nr)
+{
+       struct dn_pipe *p;
+
+       for (p = SLIST_FIRST(&pipehash[HASH(pipe_nr)]); p != NULL;
+           p = SLIST_NEXT(p, next)) {
+               if (p->pipe_nr == pipe_nr)
+                       return (p);
+       }
+       return (NULL);
+}
+
+/*
+ * dummynet hook for packets. Below 'pipe' is a pipe or a queue
+ * depending on whether WF2Q or fixed bw is used.
+ *
+ * pipe_nr     pipe or queue the packet is destined for.
+ * dir         where shall we send the packet after dummynet.
+ * m           the mbuf with the packet
+ * ifp         the 'ifp' parameter from the caller.
+ *             NULL in ip_input, destination interface in ip_output,
+ * rule                matching rule, in case of multiple passes
+ *
+ * Returns 0 when the packet is queued (or sent right away via the fast
+ * path) and ENOBUFS when it is dropped, except that flow_sets with the
+ * DN_NOERROR flag report drops as 0 as well.  On a drop the mbuf is
+ * freed here and *m0 is cleared.
+ */
+static int
+dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
+{
+       struct mbuf *m = *m0, *head = NULL, *tail = NULL;
+       struct dn_pkt_tag *pkt;
+       struct m_tag *mtag;
+       struct dn_flow_set *fs = NULL;
+       struct dn_pipe *pipe;
+       uint64_t len = m->m_pkthdr.len;
+       struct dn_flow_queue *q = NULL;
+       int is_pipe = fwa->rule.info & IPFW_IS_PIPE;
+
+       KASSERT(m->m_nextpkt == NULL,
+           ("dummynet_io: mbuf queue passed to dummynet"));
+
+       DUMMYNET_LOCK();
+       io_pkt++;
+       /*
+        * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
+        */
+       if (is_pipe) {
+               pipe = locate_pipe(fwa->rule.info & IPFW_INFO_MASK);
+               if (pipe != NULL)
+                       fs = &(pipe->fs);
+       } else
+               fs = locate_flowset(fwa->rule.info & IPFW_INFO_MASK);
+
+       if (fs == NULL)
+               goto dropit;    /* This queue/pipe does not exist! */
+       /* A queue caches its parent pipe; resolve it lazily if unset. */
+       pipe = fs->pipe;
+       if (pipe == NULL) {     /* Must be a queue, try find a matching pipe. */
+               pipe = locate_pipe(fs->parent_nr);
+               if (pipe != NULL)
+                       fs->pipe = pipe;
+               else {
+                       printf("dummynet: no pipe %d for queue %d, drop pkt\n",
+                           fs->parent_nr, fs->fs_nr);
+                       goto dropit;
+               }
+       }
+       q = find_queue(fs, &(fwa->f_id));
+       if (q == NULL)
+               goto dropit;            /* Cannot allocate queue. */
+
+       /* Update statistics, then check reasons to drop pkt. */
+       q->tot_bytes += len;
+       q->tot_pkts++;
+       if (fs->plr && random() < fs->plr)
+               goto dropit;            /* Random pkt drop. */
+       if (fs->flags_fs & DN_QSIZE_IS_BYTES) {
+               if (q->len_bytes > fs->qsize)
+                       goto dropit;    /* Queue size overflow. */
+       } else {
+               if (q->len >= fs->qsize)
+                       goto dropit;    /* Queue count overflow. */
+       }
+       if (fs->flags_fs & DN_IS_RED && red_drops(fs, q, len))
+               goto dropit;
+
+       /* XXX expensive to zero, see if we can remove it. */
+       mtag = m_tag_get(PACKET_TAG_DUMMYNET,
+           sizeof(struct dn_pkt_tag), M_NOWAIT | M_ZERO);
+       if (mtag == NULL)
+               goto dropit;            /* Cannot allocate packet header. */
+       m_tag_prepend(m, mtag);         /* Attach to mbuf chain. */
+
+       pkt = (struct dn_pkt_tag *)(mtag + 1);
+       /*
+        * Ok, i can handle the pkt now...
+        * Build and enqueue packet + parameters.
+        */
+       pkt->rule = fwa->rule;
+       pkt->rule.info &= IPFW_ONEPASS; /* only keep this info */
+       pkt->dn_dir = dir;
+       pkt->ifp = fwa->oif;
+
+       /* Append the mbuf to the per-queue packet list. */
+       if (q->head == NULL)
+               q->head = m;
+       else
+               q->tail->m_nextpkt = m;
+       q->tail = m;
+       q->len++;
+       q->len_bytes += len;
+
+       if (q->head != m)               /* Flow was not idle, we are done. */
+               goto done;
+
+       if (is_pipe) {                  /* Fixed rate queues. */
+               if (q->idle_time < curr_time) {
+                       /* Calculate available burst size. */
+                       q->numbytes +=
+                           (curr_time - q->idle_time - 1) * pipe->bandwidth;
+                       if (q->numbytes > pipe->burst)
+                               q->numbytes = pipe->burst;
+                       if (io_fast)
+                               q->numbytes += pipe->bandwidth;
+               }
+       } else {                        /* WF2Q. */
+               if (pipe->idle_time < curr_time &&
+                   pipe->scheduler_heap.elements == 0 &&
+                   pipe->not_eligible_heap.elements == 0) {
+                       /* Calculate available burst size. */
+                       pipe->numbytes +=
+                           (curr_time - pipe->idle_time - 1) * pipe->bandwidth;
+                       if (pipe->numbytes > 0 && pipe->numbytes > pipe->burst)
+                               pipe->numbytes = pipe->burst;
+                       if (io_fast)
+                               pipe->numbytes += pipe->bandwidth;
+               }
+               pipe->idle_time = curr_time;
+       }
+       /* Necessary for both: fixed rate & WF2Q queues. */
+       q->idle_time = curr_time;
+
+       /*
+        * If we reach this point the flow was previously idle, so we need
+        * to schedule it. This involves different actions for fixed-rate or
+        * WF2Q queues.
+        */
+       if (is_pipe) {
+               /* Fixed-rate queue: just insert into the ready_heap. */
+               dn_key t = 0;
+
+               if (pipe->bandwidth) {
+                       q->extra_bits = compute_extra_bits(m, pipe);
+                       t = set_ticks(m, q, pipe);
+               }
+               q->sched_time = curr_time;
+               if (t == 0)             /* Must process it now. */
+                       ready_event(q, &head, &tail);
+               else
+                       heap_insert(&ready_heap, curr_time + t , q);
+       } else {
+               /*
+                * WF2Q. First, compute start time S: if the flow was
+                * idle (S = F + 1) set S to the virtual time V for the
+                * controlling pipe, and update the sum of weights for the pipe;
+                * otherwise, remove flow from idle_heap and set S to max(F,V).
+                * Second, compute finish time F = S + len / weight.
+                * Third, if pipe was idle, update V = max(S, V).
+                * Fourth, count one more backlogged flow.
+                */
+               if (DN_KEY_GT(q->S, q->F)) { /* Means timestamps are invalid. */
+                       q->S = pipe->V;
+                       pipe->sum += fs->weight; /* Add weight of new queue. */
+               } else {
+                       heap_extract(&(pipe->idle_heap), q);
+                       q->S = MAX64(q->F, pipe->V);
+               }
+               q->F = q->S + div64(len << MY_M, fs->weight);
+
+               if (pipe->not_eligible_heap.elements == 0 &&
+                   pipe->scheduler_heap.elements == 0)
+                       pipe->V = MAX64(q->S, pipe->V);
+               fs->backlogged++;
+               /*
+                * Look at eligibility. A flow is not eligible if S>V (when
+                * this happens, it means that there is some other flow already
+                * scheduled for the same pipe, so the scheduler_heap cannot be
+                * empty). If the flow is not eligible we just store it in the
+                * not_eligible_heap. Otherwise, we store in the scheduler_heap
+                * and possibly invoke ready_event_wfq() right now if there is
+                * leftover credit.
+                * Note that for all flows in scheduler_heap (SCH), S_i <= V,
+                * and for all flows in not_eligible_heap (NEH), S_i > V.
+                * So when we need to compute max(V, min(S_i)) forall i in
+                * SCH+NEH, we only need to look into NEH.
+                */
+               if (DN_KEY_GT(q->S, pipe->V)) {         /* Not eligible. */
+                       if (pipe->scheduler_heap.elements == 0)
+                               printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
+                       heap_insert(&(pipe->not_eligible_heap), q->S, q);
+               } else {
+                       heap_insert(&(pipe->scheduler_heap), q->F, q);
+                       if (pipe->numbytes >= 0) {       /* Pipe is idle. */
+                               if (pipe->scheduler_heap.elements != 1)
+                                       printf("dummynet: OUCH! pipe should have been idle!\n");
+                               DPRINTF(("dummynet: waking up pipe %d at %d\n",
+                                   pipe->pipe_nr, (int)(q->F >> MY_M)));
+                               pipe->sched_time = curr_time;
+                               ready_event_wfq(pipe, &head, &tail);
+                       }
+               }
+       }
+done:
+       if (head == m && (dir & PROTO_LAYER2) == 0 ) {
+               /* Fast io. */
+               io_pkt_fast++;
+               if (m->m_nextpkt != NULL)
+                       printf("dummynet: fast io: pkt chain detected!\n");
+               head = m->m_nextpkt = NULL;
+       } else
+               *m0 = NULL;             /* Normal io. */
+
+       DUMMYNET_UNLOCK();
+       if (head != NULL)
+               dummynet_send(head);
+       return (0);
+
+dropit:
+       io_pkt_drop++;
+       if (q)
+               q->drops++;
+       DUMMYNET_UNLOCK();
+       FREE_PKT(m);
+       *m0 = NULL;
+       return ((fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
+}
+
+/*
+ * Dispose all packets and flow_queues on a flow_set.
+ * If all=1, also remove red lookup table and other storage,
+ * including the descriptor itself.
+ * For the one in dn_pipe MUST also cleanup ready_heap...
+ * Caller must hold the dummynet lock (asserted below).
+ */
+static void
+purge_flow_set(struct dn_flow_set *fs, int all)
+{
+       struct dn_flow_queue *q, *qn;
+       int i;
+
+       DUMMYNET_LOCK_ASSERT();
+
+       /* rq has rq_size + 1 buckets (see alloc_hash), hence the <= here. */
+       for (i = 0; i <= fs->rq_size; i++) {
+               for (q = fs->rq[i]; q != NULL; q = qn) {
+                       dn_free_pkts(q->head);
+                       qn = q->next;
+                       free(q, M_DUMMYNET);
+               }
+               fs->rq[i] = NULL;
+       }
+
+       fs->rq_elements = 0;
+       if (all) {
+               /* RED - free lookup table. */
+               if (fs->w_q_lookup != NULL)
+                       free(fs->w_q_lookup, M_DUMMYNET);
+               if (fs->rq != NULL)
+                       free(fs->rq, M_DUMMYNET);
+               /* If this fs is not part of a pipe, free it. */
+               if (fs->pipe == NULL || fs != &(fs->pipe->fs))
+                       free(fs, M_DUMMYNET);
+       }
+}
+
+/*
+ * Release everything attached to a pipe that is about to be deleted:
+ * its embedded flow_set (storage included), any packets still queued
+ * on the pipe itself, and the three per-pipe scheduling heaps.
+ */
+static void
+purge_pipe(struct dn_pipe *pipe)
+{
+       purge_flow_set(&pipe->fs, 1);
+
+       dn_free_pkts(pipe->head);
+
+       heap_free(&pipe->scheduler_heap);
+       heap_free(&pipe->not_eligible_heap);
+       heap_free(&pipe->idle_heap);
+}
+
+/*
+ * Delete all pipes and heaps returning memory. Must also
+ * remove references from all ipfw rules to all pipes.
+ * Acquires and releases the dummynet lock internally, so the
+ * caller must NOT hold it.
+ */
+static void
+dummynet_flush(void)
+{
+       struct dn_pipe *pipe, *pipe1;
+       struct dn_flow_set *fs, *fs1;
+       int i;
+
+       DUMMYNET_LOCK();
+       /* Free heaps so we don't have unwanted events. */
+       heap_free(&ready_heap);
+       heap_free(&wfq_ready_heap);
+       heap_free(&extract_heap);
+
+       /*
+        * Now purge all queued pkts and delete all pipes.
+        *
+        * XXXGL: can we merge the for(;;) cycles into one or not?
+        */
+       for (i = 0; i < HASHSIZE; i++)
+               SLIST_FOREACH_SAFE(fs, &flowsethash[i], next, fs1) {
+                       SLIST_REMOVE(&flowsethash[i], fs, dn_flow_set, next);
+                       purge_flow_set(fs, 1);
+               }
+       for (i = 0; i < HASHSIZE; i++)
+               SLIST_FOREACH_SAFE(pipe, &pipehash[i], next, pipe1) {
+                       SLIST_REMOVE(&pipehash[i], pipe, dn_pipe, next);
+                       purge_pipe(pipe);
+                       free_pipe(pipe);
+               }
+       DUMMYNET_UNLOCK();
+}
+
+/*
+ * Configure RED/GRED parameters on flow_set x from the user-supplied
+ * template p: precompute the scaled thresholds, the drop-probability
+ * coefficients c_1..c_4 and the (1 - w_q)^i lookup table used to age
+ * the average queue length while the queue is idle.
+ * Returns 0 on success, EINVAL on bad parameters, ENOSPC on allocation
+ * failure.  NOTE: the caller (set_fs_parms) currently ignores the result.
+ */
+static int
+config_red(struct dn_flow_set *p, struct dn_flow_set *x)
+{
+       int i;
+
+       /*
+        * Sanity check: c_1 below divides by (max_th - min_th) and the
+        * gentle-RED branch divides by max_th, so reject configurations
+        * that would cause a division by zero.  Unlike the later error
+        * paths we deliberately do not free x here: x may be the flow_set
+        * embedded in a pipe or already linked into the hash.
+        */
+       if (p->max_th <= p->min_th) {
+               printf("\ndummynet: RED needs min_th < max_th\n");
+               return (EINVAL);
+       }
+
+       x->w_q = p->w_q;
+       x->min_th = SCALE(p->min_th);
+       x->max_th = SCALE(p->max_th);
+       x->max_p = p->max_p;
+
+       x->c_1 = p->max_p / (p->max_th - p->min_th);
+       x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
+
+       if (x->flags_fs & DN_IS_GENTLE_RED) {
+               x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
+               x->c_4 = SCALE(1) - 2 * p->max_p;
+       }
+
+       /* If the lookup table already exist, free and create it again. */
+       if (x->w_q_lookup) {
+               free(x->w_q_lookup, M_DUMMYNET);
+               x->w_q_lookup = NULL;
+       }
+       if (red_lookup_depth == 0) {
+               printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth"
+                   "must be > 0\n");
+               /* NOTE(review): freeing x looks unsafe if x is embedded in
+                * a pipe or already hashed — confirm against callers. */
+               free(x, M_DUMMYNET);
+               return (EINVAL);
+       }
+       x->lookup_depth = red_lookup_depth;
+       x->w_q_lookup = (u_int *)malloc(x->lookup_depth * sizeof(int),
+           M_DUMMYNET, M_NOWAIT);
+       if (x->w_q_lookup == NULL) {
+               printf("dummynet: sorry, cannot allocate red lookup table\n");
+               /* NOTE(review): same concern as above about freeing x. */
+               free(x, M_DUMMYNET);
+               return(ENOSPC);
+       }
+
+       /* Fill the lookup table with (1 - w_q)^x */
+       x->lookup_step = p->lookup_step;
+       x->lookup_weight = p->lookup_weight;
+       x->w_q_lookup[0] = SCALE(1) - x->w_q;
+
+       for (i = 1; i < x->lookup_depth; i++)
+               x->w_q_lookup[i] =
+                   SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
+
+       /* Fall back to sane defaults for the sysctl-tunable packet sizes. */
+       if (red_avg_pkt_size < 1)
+               red_avg_pkt_size = 512;
+       x->avg_pkt_size = red_avg_pkt_size;
+       if (red_max_pkt_size < 1)
+               red_max_pkt_size = 1500;
+       x->max_pkt_size = red_max_pkt_size;
+       return (0);
+}
+
+/*
+ * Allocate the per-flow_set queue hash table (x->rq).
+ * With a flow mask the bucket count comes from the user request in pfs
+ * (defaulting to dn_hash_size when zero) and is clamped to the range
+ * [4, DN_MAX_HASH_SIZE]; with a null mask one bucket is enough.
+ * rq_size + 1 slots are allocated and zeroed.
+ * Returns 0 on success, ENOMEM if the table cannot be allocated.
+ */
+static int
+alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
+{
+       int buckets;
+
+       if (x->flags_fs & DN_HAVE_FLOW_MASK) {
+               buckets = (pfs->rq_size == 0) ? dn_hash_size : pfs->rq_size;
+               if (buckets < 4)
+                       buckets = 4;
+               else if (buckets > DN_MAX_HASH_SIZE)
+                       buckets = DN_MAX_HASH_SIZE;
+               x->rq_size = buckets;
+       } else {
+               /* A null mask collapses all traffic into a single queue. */
+               x->rq_size = 1;
+       }
+       x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
+           M_DUMMYNET, M_NOWAIT | M_ZERO);
+       if (x->rq == NULL) {
+               printf("dummynet: sorry, cannot allocate queue\n");
+               return (ENOMEM);
+       }
+       x->rq_elements = 0;
+       return (0);
+}
+
+/*
+ * Copy the user-visible scheduling parameters from src into flow_set x,
+ * sanitizing the queue size and (re)configuring RED when requested.
+ */
+static void
+set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
+{
+       x->flags_fs = src->flags_fs;
+       x->qsize = src->qsize;
+       x->plr = src->plr;
+       x->flow_mask = src->flow_mask;
+
+       if (x->flags_fs & DN_QSIZE_IS_BYTES) {
+               /* Byte-based limit: cap oversized requests at 1MB. */
+               if (x->qsize > pipe_byte_limit)
+                       x->qsize = 1024 * 1024;
+       } else {
+               /*
+                * Slot-based limit: 50 slots is both the default for an
+                * unspecified size and the fallback for requests above
+                * pipe_slot_limit.
+                */
+               if (x->qsize == 0 || x->qsize > pipe_slot_limit)
+                       x->qsize = 50;
+       }
+       /* Configuring RED. */
+       if (x->flags_fs & DN_IS_RED)
+               config_red(src, x);     /* XXX should check errors */
+}
+
+/*
+ * Setup pipe or queue parameters.
+ * Exactly one of p->pipe_nr and p->fs.fs_nr must be nonzero: the former
+ * creates or reconfigures a pipe, the latter a flow_set (queue).
+ * Returns 0 on success or an errno value (EINVAL/ENOMEM).
+ */
+static int
+config_pipe(struct dn_pipe *p)
+{
+       struct dn_flow_set *pfs = &(p->fs);
+       struct dn_flow_queue *q;
+       int i, error;
+
+       /*
+        * The config program passes parameters as follows:
+        * bw = bits/second (0 means no limits),
+        * delay = ms, must be translated into ticks.
+        * qsize = slots/bytes
+        */
+       p->delay = (p->delay * hz) / 1000;
+       /* Scale burst size: bytes -> bits * hz */
+       p->burst *= 8 * hz;
+       /* We need either a pipe number or a flow_set number. */
+       if (p->pipe_nr == 0 && pfs->fs_nr == 0)
+               return (EINVAL);
+       if (p->pipe_nr != 0 && pfs->fs_nr != 0)
+               return (EINVAL);
+       if (p->pipe_nr != 0) {                  /* this is a pipe */
+               struct dn_pipe *pipe;
+
+               DUMMYNET_LOCK();
+               pipe = locate_pipe(p->pipe_nr); /* locate pipe */
+
+               if (pipe == NULL) {             /* new pipe */
+                       pipe = malloc(sizeof(struct dn_pipe), M_DUMMYNET,
+                           M_NOWAIT | M_ZERO);
+                       if (pipe == NULL) {
+                               DUMMYNET_UNLOCK();
+                               printf("dummynet: no memory for new pipe\n");
+                               return (ENOMEM);
+                       }
+                       pipe->pipe_nr = p->pipe_nr;
+                       pipe->fs.pipe = pipe;
+                       /*
+                        * idle_heap is the only one from which
+                        * we extract from the middle.
+                        */
+                       pipe->idle_heap.size = pipe->idle_heap.elements = 0;
+                       pipe->idle_heap.offset =
+                           offsetof(struct dn_flow_queue, heap_pos);
+               } else {
+                       /* Flush accumulated credit for all queues. */
+                       for (i = 0; i <= pipe->fs.rq_size; i++) {
+                               for (q = pipe->fs.rq[i]; q; q = q->next) {
+                                       q->numbytes = p->burst +
+                                           (io_fast ? p->bandwidth : 0);
+                               }
+                       }
+               }
+
+               pipe->bandwidth = p->bandwidth;
+               pipe->burst = p->burst;
+               pipe->numbytes = pipe->burst + (io_fast ? pipe->bandwidth : 0);
+               bcopy(p->if_name, pipe->if_name, sizeof(p->if_name));
+               pipe->ifp = NULL;               /* reset interface ptr */
+               pipe->delay = p->delay;
+               set_fs_parms(&(pipe->fs), pfs);
+
+               /* Handle changes in the delay profile. */
+               if (p->samples_no > 0) {
+                       if (pipe->samples_no != p->samples_no) {
+                               if (pipe->samples != NULL)
+                                       free(pipe->samples, M_DUMMYNET);
+                               pipe->samples =
+                                   malloc(p->samples_no*sizeof(dn_key),
+                                       M_DUMMYNET, M_NOWAIT | M_ZERO);
+                               if (pipe->samples == NULL) {
+                                       DUMMYNET_UNLOCK();
+                                       printf("dummynet: no memory "
+                                               "for new samples\n");
+                                       return (ENOMEM);
+                               }
+                               pipe->samples_no = p->samples_no;
+                       }
+
+                       strncpy(pipe->name,p->name,sizeof(pipe->name));
+                       pipe->loss_level = p->loss_level;
+                       for (i = 0; i<pipe->samples_no; ++i)
+                               pipe->samples[i] = p->samples[i];
+               } else if (pipe->samples != NULL) {
+                       free(pipe->samples, M_DUMMYNET);
+                       pipe->samples = NULL;
+                       pipe->samples_no = 0;
+               }
+
+               if (pipe->fs.rq == NULL) {      /* a new pipe */
+                       error = alloc_hash(&(pipe->fs), pfs);
+                       if (error) {
+                               DUMMYNET_UNLOCK();
+                               free_pipe(pipe);
+                               return (error);
+                       }
+                       SLIST_INSERT_HEAD(&pipehash[HASH(pipe->pipe_nr)],
+                           pipe, next);
+               }
+               DUMMYNET_UNLOCK();
+       } else {                                /* config queue */
+               struct dn_flow_set *fs;
+
+               DUMMYNET_LOCK();
+               fs = locate_flowset(pfs->fs_nr); /* locate flow_set */
+
+               if (fs == NULL) {               /* new */
+                       if (pfs->parent_nr == 0) { /* need link to a pipe */
+                               DUMMYNET_UNLOCK();
+                               return (EINVAL);
+                       }
+                       fs = malloc(sizeof(struct dn_flow_set), M_DUMMYNET,
+                           M_NOWAIT | M_ZERO);
+                       if (fs == NULL) {
+                               DUMMYNET_UNLOCK();
+                               printf(
+                                   "dummynet: no memory for new flow_set\n");
+                               return (ENOMEM);
+                       }
+                       fs->fs_nr = pfs->fs_nr;
+                       fs->parent_nr = pfs->parent_nr;
+                       /* WF2Q weight is clamped to [1, 100]. */
+                       fs->weight = pfs->weight;
+                       if (fs->weight == 0)
+                               fs->weight = 1;
+                       else if (fs->weight > 100)
+                               fs->weight = 100;
+               } else {
+                       /*
+                        * Change parent pipe not allowed;
+                        * must delete and recreate.
+                        */
+                       if (pfs->parent_nr != 0 &&
+                           fs->parent_nr != pfs->parent_nr) {
+                               DUMMYNET_UNLOCK();
+                               return (EINVAL);
+                       }
+               }
+
+               set_fs_parms(fs, pfs);
+
+               if (fs->rq == NULL) {           /* a new flow_set */
+                       error = alloc_hash(fs, pfs);
+                       if (error) {
+                               DUMMYNET_UNLOCK();
+                               free(fs, M_DUMMYNET);
+                               return (error);
+                       }
+                       SLIST_INSERT_HEAD(&flowsethash[HASH(fs->fs_nr)],
+                           fs, next);
+               }
+               DUMMYNET_UNLOCK();
+       }
+       return (0);
+}
+
+/*
+ * Remove from heap h every queue that belongs to flow_set fs.
+ * Each matching slot is overwritten by the last heap element; the heap
+ * invariant is restored with a single heapify() once all are removed.
+ */
+static void
+fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
+{
+       int i = 0, removed = 0;
+
+       while (i < h->elements) {
+               if (((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
+                       /* Swap in the last element and shrink the heap;
+                        * re-examine slot i on the next iteration. */
+                       h->elements--;
+                       h->p[i] = h->p[h->elements];
+                       removed++;
+               } else
+                       i++;
+       }
+       if (removed)
+               heapify(h);
+}
+
+/*
+ * Remove pipe p from heap h; a pipe can appear there at most once.
+ * The vacated slot is filled with the last element and the heap
+ * property is restored before returning.
+ */
+static void
+pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
+{
+       int i;
+
+       for (i = 0; i < h->elements; i++) {
+               if (h->p[i].object != p)
+                       continue;
+               /* Found it: replace with the last element and re-heapify. */
+               h->elements--;
+               h->p[i] = h->p[h->elements];
+               heapify(h);
+               break;
+       }
+}
+
+/*
+ * drain all queues. Called in case of severe mbuf shortage.
+ * The caller must already hold the dummynet lock (asserted below).
+ */
+void
+dummynet_drain(void)
+{
+    struct dn_flow_set *fs;
+    struct dn_pipe *pipe;
+    int i;
+
+    DUMMYNET_LOCK_ASSERT();
+
+    heap_free(&ready_heap);
+    heap_free(&wfq_ready_heap);
+    heap_free(&extract_heap);
+    /* Discard queued packets in every flow_set (keep the descriptors). */
+    for (i = 0; i < HASHSIZE; i++)
+       SLIST_FOREACH(fs, &flowsethash[i], next)
+               purge_flow_set(fs, 0);
+
+    /* Same for every pipe, including packets queued on the pipe itself. */
+    for (i = 0; i < HASHSIZE; i++) {
+       SLIST_FOREACH(pipe, &pipehash[i], next) {
+               purge_flow_set(&(pipe->fs), 0);
+               dn_free_pkts(pipe->head);
+               pipe->head = pipe->tail = NULL;
+       }
+    }
+}
+
+/*
+ * Fully delete a pipe or a queue, cleaning up associated info.
+ * Exactly one of p->pipe_nr and p->fs.fs_nr must be nonzero.
+ * Returns 0 on success, EINVAL on a malformed request and ENOENT
+ * when the pipe/flow_set does not exist.
+ */
+static int
+delete_pipe(struct dn_pipe *p)
+{
+
+    if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
+       return EINVAL ;
+    if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
+       return EINVAL ;
+    if (p->pipe_nr != 0) { /* this is an old-style pipe */
+       struct dn_pipe *pipe;
+       struct dn_flow_set *fs;
+       int i;
+
+       DUMMYNET_LOCK();
+       pipe = locate_pipe(p->pipe_nr); /* locate pipe */
+
+       if (pipe == NULL) {
+           DUMMYNET_UNLOCK();
+           return (ENOENT);    /* not found */
+       }
+
+       /* Unlink from list of pipes. */
+       SLIST_REMOVE(&pipehash[HASH(pipe->pipe_nr)], pipe, dn_pipe, next);
+
+       /* Remove all references to this pipe from flow_sets. */
+       for (i = 0; i < HASHSIZE; i++) {
+           SLIST_FOREACH(fs, &flowsethash[i], next) {
+               if (fs->pipe == pipe) {
+                       printf("dummynet: ++ ref to pipe %d from fs %d\n",
+                           p->pipe_nr, fs->fs_nr);
+                       fs->pipe = NULL ;
+                       purge_flow_set(fs, 0);
+               }
+           }
+       }
+       fs_remove_from_heap(&ready_heap, &(pipe->fs));
+       purge_pipe(pipe); /* remove all data associated to this pipe */
+       /* remove reference to here from extract_heap and wfq_ready_heap */
+       pipe_remove_from_heap(&extract_heap, pipe);
+       pipe_remove_from_heap(&wfq_ready_heap, pipe);
+       DUMMYNET_UNLOCK();
+
+       free_pipe(pipe);
+    } else { /* this is a WF2Q queue (dn_flow_set) */
+       struct dn_flow_set *fs;
+
+       DUMMYNET_LOCK();
+       fs = locate_flowset(p->fs.fs_nr); /* locate set */
+
+       if (fs == NULL) {
+           DUMMYNET_UNLOCK();
+           return (ENOENT); /* not found */
+       }
+
+       /* Unlink from list of flowsets. */
+       SLIST_REMOVE( &flowsethash[HASH(fs->fs_nr)], fs, dn_flow_set, next);
+
+       if (fs->pipe != NULL) {
+           /* Update total weight on parent pipe and cleanup parent heaps. */
+           fs->pipe->sum -= fs->weight * fs->backlogged ;
+           fs_remove_from_heap(&(fs->pipe->not_eligible_heap), fs);
+           fs_remove_from_heap(&(fs->pipe->scheduler_heap), fs);
+#if 1  /* XXX should i remove from idle_heap as well ? */
+           fs_remove_from_heap(&(fs->pipe->idle_heap), fs);
+#endif
+       }
+       purge_flow_set(fs, 1);
+       DUMMYNET_UNLOCK();
+    }
+    return 0 ;
+}
+
+/*
+ * helper function used to copy data from kernel in DUMMYNET_GET.
+ * Copies every dn_flow_queue of 'set' into the buffer starting at
+ * 'bp' (which the caller must have sized via dn_calc_size()),
+ * NULLs out kernel pointers in each copy, and returns the advanced
+ * buffer pointer. The sanity-check printfs only report
+ * inconsistencies; they do not abort the copy.
+ */
+static char *
+dn_copy_set(struct dn_flow_set *set, char *bp)
+{
+    int i, copied = 0 ;
+    struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp;
+
+    DUMMYNET_LOCK_ASSERT();
+
+    for (i = 0 ; i <= set->rq_size ; i++) {
+       for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
+           if (q->hash_slot != i)
+               printf("dummynet: ++ at %d: wrong slot (have %d, "
+                   "should be %d)\n", copied, q->hash_slot, i);
+           /* NOTE(review): this message prints the slot index i where
+            * the one above prints 'copied' — presumably intentional,
+            * but worth confirming against upstream. */
+           if (q->fs != set)
+               printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
+                       i, q->fs, set);
+           copied++ ;
+           bcopy(q, qp, sizeof( *q ) );
+           /* cleanup pointers */
+           qp->next = NULL ;
+           qp->head = qp->tail = NULL ;
+           qp->fs = NULL ;
+       }
+    }
+    if (copied != set->rq_elements)
+       printf("dummynet: ++ wrong count, have %d should be %d\n",
+           copied, set->rq_elements);
+    return (char *)qp ;
+}
+
+/*
+ * Compute the buffer size needed by DUMMYNET_GET: for every pipe and
+ * every flow_set in the hash tables, the descriptor itself plus one
+ * dn_flow_queue per active queue. Must be called with the dummynet
+ * lock held so the counts cannot change underneath us.
+ */
+static size_t
+dn_calc_size(void)
+{
+    struct dn_flow_set *fs;
+    struct dn_pipe *pipe;
+    size_t size = 0;
+    int i;
+
+    DUMMYNET_LOCK_ASSERT();
+    /*
+     * Compute size of data structures: list of pipes and flow_sets.
+     */
+    for (i = 0; i < HASHSIZE; i++) {
+       SLIST_FOREACH(pipe, &pipehash[i], next)
+               size += sizeof(*pipe) +
+                   pipe->fs.rq_elements * sizeof(struct dn_flow_queue);
+       SLIST_FOREACH(fs, &flowsethash[i], next)
+               size += sizeof (*fs) +
+                   fs->rq_elements * sizeof(struct dn_flow_queue);
+    }
+    return size;
+}
+
+/*
+ * Handler for IP_DUMMYNET_GET: snapshot all pipes, flow_sets and their
+ * queues into a malloc'ed buffer and copy it out to userland with
+ * sooptcopyout(). Returns 0, or ENOBUFS if a large-enough buffer
+ * could not be obtained after several attempts.
+ */
+static int
+dummynet_get(struct sockopt *sopt)
+{
+    char *buf, *bp ; /* bp is the "copy-pointer" */
+    size_t size ;
+    struct dn_flow_set *fs;
+    struct dn_pipe *pipe;
+    int error=0, i ;
+
+    /* XXX lock held too long */
+    DUMMYNET_LOCK();
+    /*
+     * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we
+     *      cannot use this flag while holding a mutex.
+     * So: compute the size, drop the lock for the malloc, re-take the
+     * lock and re-check. If the data structures grew in the meantime,
+     * free the buffer and retry, up to 10 times, before giving up.
+     */
+    for (i = 0; i < 10; i++) {
+       size = dn_calc_size();
+       DUMMYNET_UNLOCK();
+       buf = malloc(size, M_TEMP, M_WAITOK);
+       DUMMYNET_LOCK();
+       if (size >= dn_calc_size())
+               break;
+       free(buf, M_TEMP);
+       buf = NULL;
+    }
+    if (buf == NULL) {
+       DUMMYNET_UNLOCK();
+       return ENOBUFS ;
+    }
+    bp = buf;
+    for (i = 0; i < HASHSIZE; i++) {
+       SLIST_FOREACH(pipe, &pipehash[i], next) {
+               struct dn_pipe *pipe_bp = (struct dn_pipe *)bp;
+
+               /*
+                * Copy pipe descriptor into *bp, convert delay back to ms,
+                * then copy the flow_set descriptor(s) one at a time.
+                * After each flow_set, copy the queue descriptor it owns.
+                */
+               bcopy(pipe, bp, sizeof(*pipe));
+               pipe_bp->delay = (pipe_bp->delay * 1000) / hz;
+               pipe_bp->burst = div64(pipe_bp->burst, 8 * hz);
+               /*
+                * XXX the following is a hack based on ->next being the
+                * first field in dn_pipe and dn_flow_set. The correct
+                * solution would be to move the dn_flow_set to the beginning
+                * of struct dn_pipe.
+                */
+               pipe_bp->next.sle_next = (struct dn_pipe *)DN_IS_PIPE;
+               /* Clean pointers. */
+               pipe_bp->head = pipe_bp->tail = NULL;
+               pipe_bp->fs.next.sle_next = NULL;
+               pipe_bp->fs.pipe = NULL;
+               pipe_bp->fs.rq = NULL;
+               pipe_bp->samples = NULL;
+
+               bp += sizeof(*pipe) ;
+               bp = dn_copy_set(&(pipe->fs), bp);
+       }
+    }
+
+    /* Then all the stand-alone flow_sets (WF2Q queues), tagged so
+     * userland can tell them apart from pipes. */
+    for (i = 0; i < HASHSIZE; i++) {
+       SLIST_FOREACH(fs, &flowsethash[i], next) {
+               struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp;
+
+               bcopy(fs, bp, sizeof(*fs));
+               /* XXX same hack as above */
+               fs_bp->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
+               fs_bp->pipe = NULL;
+               fs_bp->rq = NULL;
+               bp += sizeof(*fs);
+               bp = dn_copy_set(fs, bp);
+       }
+    }
+
+    DUMMYNET_UNLOCK();
+
+    error = sooptcopyout(sopt, buf, size);
+    free(buf, M_TEMP);
+    return error ;
+}
+
+/*
+ * Handler for the various dummynet socket options (get, flush, config, del).
+ * Performs a privilege check first, and for SET operations also refuses
+ * service at securelevel >= 3. Returns 0 or an errno.
+ */
+static int
+ip_dn_ctl(struct sockopt *sopt)
+{
+    int error;
+    struct dn_pipe *p = NULL;
+
+    error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET);
+    if (error)
+       return (error);
+
+    /* Disallow sets in really-really secure mode. */
+    if (sopt->sopt_dir == SOPT_SET) {
+#if __FreeBSD_version >= 500034
+       error =  securelevel_ge(sopt->sopt_td->td_ucred, 3);
+       if (error)
+           return (error);
+#else
+       if (securelevel >= 3)
+           return (EPERM);
+#endif
+    }
+
+    switch (sopt->sopt_name) {
+    default :
+       printf("dummynet: -- unknown option %d", sopt->sopt_name);
+       error = EINVAL ;
+       break;
+
+    case IP_DUMMYNET_GET :
+       error = dummynet_get(sopt);
+       break ;
+
+    case IP_DUMMYNET_FLUSH :
+       dummynet_flush() ;
+       break ;
+
+    case IP_DUMMYNET_CONFIGURE :
+       /* Allocate the larger dn_pipe_max so an optional trailing
+        * samples[] array can be copied in along with the pipe. */
+       p = malloc(sizeof(struct dn_pipe_max), M_TEMP, M_WAITOK);
+       error = sooptcopyin(sopt, p, sizeof(struct dn_pipe_max), sizeof *p);
+       if (error)
+           break ;
+       if (p->samples_no > 0)
+           p->samples = &(((struct dn_pipe_max *)p)->samples[0]);
+
+       error = config_pipe(p);
+       break ;
+
+    case IP_DUMMYNET_DEL :     /* remove a pipe or queue */
+       p = malloc(sizeof(struct dn_pipe), M_TEMP, M_WAITOK);
+       error = sooptcopyin(sopt, p, sizeof(struct dn_pipe), sizeof *p);
+       if (error)
+           break ;
+
+       error = delete_pipe(p);
+       break ;
+    }
+
+    /* p was only a scratch copy of the userland request. */
+    if (p != NULL)
+       free(p, M_TEMP);
+
+    return error ;
+}
+
+/*
+ * One-time initialization: set up the lock, hash tables and heaps,
+ * publish the ip_dn_ctl/ip_dn_io hooks, create the taskqueue that
+ * drains dummynet work, and arm the periodic callout.
+ */
+static void
+ip_dn_init(void)
+{
+       int i;
+
+       if (bootverbose)
+               printf("DUMMYNET with IPv6 initialized (040826)\n");
+
+       DUMMYNET_LOCK_INIT();
+
+       for (i = 0; i < HASHSIZE; i++) {
+               SLIST_INIT(&pipehash[i]);
+               SLIST_INIT(&flowsethash[i]);
+       }
+       ready_heap.size = ready_heap.elements = 0;
+       ready_heap.offset = 0;
+
+       wfq_ready_heap.size = wfq_ready_heap.elements = 0;
+       wfq_ready_heap.offset = 0;
+
+       extract_heap.size = extract_heap.elements = 0;
+       extract_heap.offset = 0;
+
+       /* Publish the hooks: from here on ipfw can hand us packets. */
+       ip_dn_ctl_ptr = ip_dn_ctl;
+       ip_dn_io_ptr = dummynet_io;
+
+       TASK_INIT(&dn_task, 0, dummynet_task, NULL);
+       dn_tq = taskqueue_create_fast("dummynet", M_NOWAIT,
+           taskqueue_thread_enqueue, &dn_tq);
+       taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet");
+
+       /* Fire the first tick one tick from now; dummynet() reschedules. */
+       callout_init(&dn_timeout, CALLOUT_MPSAFE);
+       callout_reset(&dn_timeout, 1, dummynet, NULL);
+
+       /* Initialize curr_time adjustment mechanics. */
+       getmicrouptime(&prev_t);
+}
+
+#ifdef KLD_MODULE
+/*
+ * Module teardown, mirror image of ip_dn_init(): unhook from ipfw
+ * first so no new work arrives, stop the callout, drain and destroy
+ * the taskqueue, flush all pipes/queues, then destroy the lock.
+ */
+static void
+ip_dn_destroy(void)
+{
+       ip_dn_ctl_ptr = NULL;
+       ip_dn_io_ptr = NULL;
+
+       /* Stop the tick under the lock so it cannot re-arm mid-stop. */
+       DUMMYNET_LOCK();
+       callout_stop(&dn_timeout);
+       DUMMYNET_UNLOCK();
+       taskqueue_drain(dn_tq, &dn_task);
+       taskqueue_free(dn_tq);
+
+       dummynet_flush();
+
+       DUMMYNET_LOCK_DESTROY();
+}
+#endif /* KLD_MODULE */
+
+/*
+ * Module event handler. Refuses a second load (EEXIST), refuses to
+ * unload when statically compiled (EINVAL), and reports EOPNOTSUPP
+ * for any other event type.
+ */
+static int
+dummynet_modevent(module_t mod, int type, void *data)
+{
+
+       switch (type) {
+       case MOD_LOAD:
+               /* ip_dn_io_ptr doubles as the "already loaded" flag. */
+               if (ip_dn_io_ptr) {
+                   printf("DUMMYNET already loaded\n");
+                   return EEXIST ;
+               }
+               ip_dn_init();
+               break;
+
+       case MOD_UNLOAD:
+#if !defined(KLD_MODULE)
+               printf("dummynet statically compiled, cannot unload\n");
+               return EINVAL ;
+#else
+               ip_dn_destroy();
+#endif
+               break ;
+       default:
+               return EOPNOTSUPP;
+               break ;
+       }
+       return 0 ;
+}
+
+/*
+ * Module registration: attach at SI_SUB_PROTO_IFATTACHDOMAIN and
+ * declare the dependency on ipfw (interface version 2).
+ */
+static moduledata_t dummynet_mod = {
+       "dummynet",
+       dummynet_modevent,
+       NULL
+};
+DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
+MODULE_DEPEND(dummynet, ipfw, 2, 2, 2);
+MODULE_VERSION(dummynet, 1);
+/* end of file */
diff --git a/dummynet2/ip_fw2.c b/dummynet2/ip_fw2.c
new file mode 100644 (file)
index 0000000..3cc08e7
--- /dev/null
@@ -0,0 +1,2466 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw2.c 200601 2009-12-16 10:48:40Z luigi $");
+
+/*
+ * The FreeBSD IP packet firewall, main file
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/condvar.h>
+#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/jail.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/ucred.h>
+#include <net/ethernet.h> /* for ETHERTYPE_IP */
+#include <net/if.h>
+#include <net/route.h>
+#include <net/pf_mtag.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ip_carp.h>
+#include <netinet/pim.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+#include <netinet/sctp.h>
+
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#ifdef INET6
+#include <netinet6/scope6_var.h>
+#include <netinet6/ip6_var.h>
+#endif
+
+#include <machine/in_cksum.h>  /* XXX for in_cksum */
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+/*
+ * static variables followed by global ones.
+ * All ipfw global variables are here.
+ */
+
+/* ipfw_vnet_ready controls when we are open for business */
+static VNET_DEFINE(int, ipfw_vnet_ready) = 0;
+#define        V_ipfw_vnet_ready       VNET(ipfw_vnet_ready)
+
+static VNET_DEFINE(int, fw_deny_unknown_exthdrs);
+#define        V_fw_deny_unknown_exthdrs       VNET(fw_deny_unknown_exthdrs)
+
+#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
+static int default_to_accept = 1;
+#else
+static int default_to_accept;
+#endif
+
+VNET_DEFINE(int, autoinc_step);
+
+/*
+ * Each rule belongs to one of 32 different sets (0..31).
+ * The variable set_disable contains one bit per set.
+ * If the bit is set, all rules in the corresponding set
+ * are disabled. Set RESVD_SET(31) is reserved for the default rule
+ * and rules that are not deleted by the flush command,
+ * and CANNOT be disabled.
+ * Rules in set RESVD_SET can only be deleted individually.
+ */
+VNET_DEFINE(u_int32_t, set_disable);
+#define        V_set_disable                   VNET(set_disable)
+
+VNET_DEFINE(int, fw_verbose);
+/* counter for ipfw_log(NULL...) */
+VNET_DEFINE(u_int64_t, norule_counter);
+VNET_DEFINE(int, verbose_limit);
+
+/* layer3_chain contains the list of rules for layer 3 */
+VNET_DEFINE(struct ip_fw_chain, layer3_chain);
+
+ipfw_nat_t *ipfw_nat_ptr = NULL;
+struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
+ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
+ipfw_nat_cfg_t *ipfw_nat_del_ptr;
+ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
+ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
+
+#ifdef SYSCTL_NODE
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
+    CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
+    "Only do a single pass through ipfw when using dummynet(4)");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
+    CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
+    "Rule number auto-increment step");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose,
+    CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0,
+    "Log matches to ipfw rules");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
+    CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
+    "Set upper limit of matches of ipfw rules logged");
+uint32_t dummy_def = IPFW_DEFAULT_RULE;
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
+    &dummy_def, 0,
+    "The default/max possible rule number.");
+uint32_t dummy_tables_max = IPFW_TABLES_MAX;
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD,
+    &dummy_tables_max, 0,
+    "The maximum number of tables.");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
+    &default_to_accept, 0,
+    "Make the default rule accept all packets.");
+TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept);
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count,
+    CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0,
+    "Number of static rules");
+
+#ifdef INET6
+SYSCTL_DECL(_net_inet6_ip6);
+SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
+SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs,
+    CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0,
+    "Deny packets with unknown IPv6 Extension Headers");
+#endif /* INET6 */
+
+#endif /* SYSCTL_NODE */
+
+
+/*
+ * Some macros used in the various matching options.
+ * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T
+ * Other macros just cast void * into the appropriate type
+ */
+#define        L3HDR(T, ip)    ((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
+#define        TCP(p)          ((struct tcphdr *)(p))
+#define        SCTP(p)         ((struct sctphdr *)(p))
+#define        UDP(p)          ((struct udphdr *)(p))
+#define        ICMP(p)         ((struct icmphdr *)(p))
+#define        ICMP6(p)        ((struct icmp6_hdr *)(p))
+
+/*
+ * Match an ICMP type against the bitmap in cmd->d[0]; only types
+ * 0..ICMP_MAXTYPE can match since the mask is a single 32-bit word.
+ */
+static __inline int
+icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd)
+{
+       int type = icmp->icmp_type;
+
+       return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<<type)) );
+}
+
+/* Bitmap of the ICMP "query" types: echo, router solicit, timestamp,
+ * information request and mask request. */
+#define TT     ( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \
+    (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) )
+
+/* Return nonzero if the ICMP packet is one of the query types above. */
+static int
+is_icmp_query(struct icmphdr *icmp)
+{
+       int type = icmp->icmp_type;
+
+       return (type <= ICMP_MAXTYPE && (TT & (1<<type)) );
+}
+#undef TT
+
+/*
+ * The following checks use two arrays of 8 or 16 bits to store the
+ * bits that we want set or clear, respectively. They are in the
+ * low and high half of cmd->arg1 or cmd->d[0].
+ *
+ * We scan options and store the bits we find set. We succeed if
+ *
+ *     (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
+ *
+ * The code is sometimes optimized not to store additional variables.
+ */
+
+/*
+ * Match 'bits' against cmd->arg1: low byte = bits that must be set,
+ * high byte = bits that must be clear. Returns 1 on match, 0 otherwise.
+ */
+static int
+flags_match(ipfw_insn *cmd, u_int8_t bits)
+{
+       u_char want_clear;
+       /* invert once so both tests below read as "& bits" */
+       bits = ~bits;
+
+       if ( ((cmd->arg1 & 0xff) & bits) != 0)
+               return 0; /* some bits we want set were clear */
+       want_clear = (cmd->arg1 >> 8) & 0xff;
+       if ( (want_clear & bits) != want_clear)
+               return 0; /* some bits we want clear were set */
+       return 1;
+}
+
+/*
+ * Walk the IPv4 options area of 'ip', collect which of LSRR/SSRR/RR/TS
+ * are present into a bitmap, and match it with flags_match().
+ * Returns 0 directly on a malformed (zero-length or truncated) option.
+ */
+static int
+ipopts_match(struct ip *ip, ipfw_insn *cmd)
+{
+       int optlen, bits = 0;
+       u_char *cp = (u_char *)(ip + 1);
+       /* total length of the options area, past the fixed header */
+       int x = (ip->ip_hl << 2) - sizeof (struct ip);
+
+       for (; x > 0; x -= optlen, cp += optlen) {
+               int opt = cp[IPOPT_OPTVAL];
+
+               if (opt == IPOPT_EOL)
+                       break;
+               if (opt == IPOPT_NOP)
+                       optlen = 1;
+               else {
+                       optlen = cp[IPOPT_OLEN];
+                       if (optlen <= 0 || optlen > x)
+                               return 0; /* invalid or truncated */
+               }
+               switch (opt) {
+
+               default:
+                       break;
+
+               case IPOPT_LSRR:
+                       bits |= IP_FW_IPOPT_LSRR;
+                       break;
+
+               case IPOPT_SSRR:
+                       bits |= IP_FW_IPOPT_SSRR;
+                       break;
+
+               case IPOPT_RR:
+                       bits |= IP_FW_IPOPT_RR;
+                       break;
+
+               case IPOPT_TS:
+                       bits |= IP_FW_IPOPT_TS;
+                       break;
+               }
+       }
+       return (flags_match(cmd, bits));
+}
+
+/*
+ * Walk the TCP options, collect which of MSS/WINDOW/SACK/TS are
+ * present into a bitmap, and match it with flags_match().
+ * Unlike ipopts_match(), a bad option length just stops the scan
+ * rather than failing the match outright.
+ */
+static int
+tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd)
+{
+       int optlen, bits = 0;
+       u_char *cp = (u_char *)(tcp + 1);
+       /* length of the options area, past the fixed TCP header */
+       int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
+
+       for (; x > 0; x -= optlen, cp += optlen) {
+               int opt = cp[0];
+               if (opt == TCPOPT_EOL)
+                       break;
+               if (opt == TCPOPT_NOP)
+                       optlen = 1;
+               else {
+                       optlen = cp[1];
+                       if (optlen <= 0)
+                               break;
+               }
+
+               switch (opt) {
+
+               default:
+                       break;
+
+               case TCPOPT_MAXSEG:
+                       bits |= IP_FW_TCPOPT_MSS;
+                       break;
+
+               case TCPOPT_WINDOW:
+                       bits |= IP_FW_TCPOPT_WINDOW;
+                       break;
+
+               case TCPOPT_SACK_PERMITTED:
+               case TCPOPT_SACK:
+                       bits |= IP_FW_TCPOPT_SACK;
+                       break;
+
+               case TCPOPT_TIMESTAMP:
+                       bits |= IP_FW_TCPOPT_TS;
+                       break;
+
+               }
+       }
+       return (flags_match(cmd, bits));
+}
+
+/*
+ * Match an interface against an instruction, either by name
+ * (exact or fnmatch glob) or by one of its IPv4 addresses.
+ * The address walk is compiled out on Linux/Windows builds.
+ * Returns 1 on match, 0 otherwise (including ifp == NULL).
+ */
+static int
+iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
+{
+       if (ifp == NULL)        /* no iface with this packet, match fails */
+               return 0;
+       /* Check by name or by IP address */
+       if (cmd->name[0] != '\0') { /* match by name */
+               /* Check name */
+               if (cmd->p.glob) {
+                       if (fnmatch(cmd->name, ifp->if_xname, 0) == 0)
+                               return(1);
+               } else {
+                       if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
+                               return(1);
+               }
+       } else {
+#if !defined( __linux__ ) && !defined( _WIN32 )
+               struct ifaddr *ia;
+
+               if_addr_rlock(ifp);
+               TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) {
+                       if (ia->ifa_addr->sa_family != AF_INET)
+                               continue;
+                       if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
+                           (ia->ifa_addr))->sin_addr.s_addr) {
+                               if_addr_runlock(ifp);
+                               return(1);      /* match */
+                       }
+               }
+               if_addr_runlock(ifp);
+#endif
+       }
+       return(0);      /* no match, fail ... */
+}
+
+/*
+ * The verify_path function checks if a route to the src exists and
+ * if it is reachable via ifp (when provided).
+ * 
+ * The 'verrevpath' option checks that the interface that an IP packet
+ * arrives on is the same interface that traffic destined for the
+ * packet's source address would be routed out of.
+ * The 'versrcreach' option just checks that the source address is
+ * reachable via any route (except default) in the routing table.
+ * These two are a measure to block forged packets. This is also
+ * commonly known as "anti-spoofing" or Unicast Reverse Path
+ * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs
+ * is purposely reminiscent of the Cisco IOS command,
+ *
+ *   ip verify unicast reverse-path
+ *   ip verify unicast source reachable-via any
+ *
+ * which implements the same functionality. But note that the syntax
+ * is misleading, and the check may be performed on all IP packets
+ * whether unicast, multicast, or broadcast.
+ *
+ * On Linux/Windows builds there is no routing-table access, so the
+ * check always fails (returns 0).
+ */
+static int
+verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
+{
+#if defined( __linux__ ) || defined( _WIN32 )
+       return 0;
+#else
+       struct route ro;
+       struct sockaddr_in *dst;
+
+       bzero(&ro, sizeof(ro));
+
+       dst = (struct sockaddr_in *)&(ro.ro_dst);
+       dst->sin_family = AF_INET;
+       dst->sin_len = sizeof(*dst);
+       dst->sin_addr = src;
+       in_rtalloc_ign(&ro, 0, fib);
+
+       if (ro.ro_rt == NULL)
+               return 0;
+
+       /*
+        * If ifp is provided, check for equality with rtentry.
+        * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp,
+        * in order to pass packets injected back by if_simloop():
+        * if useloopback == 1 routing entry (via lo0) for our own address
+        * may exist, so we need to handle routing assymetry.
+        */
+       if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) {
+               RTFREE(ro.ro_rt);
+               return 0;
+       }
+
+       /* if no ifp provided, check if rtentry is not default route */
+       if (ifp == NULL &&
+            satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) {
+               RTFREE(ro.ro_rt);
+               return 0;
+       }
+
+       /* or if this is a blackhole/reject route */
+       if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+               RTFREE(ro.ro_rt);
+               return 0;
+       }
+
+       /* found valid route */
+       RTFREE(ro.ro_rt);
+       return 1;
+#endif
+}
+
+#ifdef INET6
+/*
+ * ipv6 specific rules here...
+ */
+
+/* Match an ICMPv6 type against the multi-word bitmap in cmd->d[];
+ * ICMP6_MAXTYPE exceeds 32, hence the d[type/32] word indexing. */
+static __inline int
+icmp6type_match (int type, ipfw_insn_u32 *cmd)
+{
+       return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) );
+}
+
+/*
+ * Return 1 if curr_flow equals any of the cmd->o.arg1 + 1 flow-id
+ * values stored in cmd->d[], 0 otherwise (note the inclusive bound).
+ */
+static int
+flow6id_match( int curr_flow, ipfw_insn_u32 *cmd )
+{
+       int i;
+       for (i=0; i <= cmd->o.arg1; ++i )
+               if (curr_flow == cmd->d[i] )
+                       return 1;
+       return 0;
+}
+
+/* support for IP6_*_ME opcodes */
+
+/*
+ * Return 1 if ip6_addr is assigned to any local interface, scanning
+ * every address of every interface. Scope ids are stripped from the
+ * candidate (not from ip6_addr) before comparing.
+ */
+static int
+search_ip6_addr_net (struct in6_addr * ip6_addr)
+{
+       struct ifnet *mdc;
+       struct ifaddr *mdc2;
+       struct in6_ifaddr *fdm;
+       struct in6_addr copia;
+
+       TAILQ_FOREACH(mdc, &V_ifnet, if_link) {
+               if_addr_rlock(mdc);
+               TAILQ_FOREACH(mdc2, &mdc->if_addrhead, ifa_link) {
+                       if (mdc2->ifa_addr->sa_family == AF_INET6) {
+                               fdm = (struct in6_ifaddr *)mdc2;
+                               copia = fdm->ia_addr.sin6_addr;
+                               /* need for leaving scope_id in the sock_addr */
+                               in6_clearscope(&copia);
+                               if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) {
+                                       if_addr_runlock(mdc);
+                                       return 1;
+                               }
+                       }
+               }
+               if_addr_runlock(mdc);
+       }
+       return 0;
+}
+
+/*
+ * IPv6 counterpart of verify_path(): check that a route back to src
+ * exists, and (when ifp is given) that it goes through ifp; when ifp
+ * is NULL, also reject the default route and blackhole/reject routes.
+ * Returns 1 when a valid route is found, 0 otherwise.
+ */
+static int
+verify_path6(struct in6_addr *src, struct ifnet *ifp)
+{
+       struct route_in6 ro;
+       struct sockaddr_in6 *dst;
+
+       bzero(&ro, sizeof(ro));
+
+       dst = (struct sockaddr_in6 * )&(ro.ro_dst);
+       dst->sin6_family = AF_INET6;
+       dst->sin6_len = sizeof(*dst);
+       dst->sin6_addr = *src;
+       /* XXX MRT 0 for ipv6 at this time */
+       rtalloc_ign((struct route *)&ro, 0);
+
+       if (ro.ro_rt == NULL)
+               return 0;
+
+       /* 
+        * if ifp is provided, check for equality with rtentry
+        * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp,
+        * to support the case of sending packets to an address of our own.
+        * (where the former interface is the first argument of if_simloop()
+        *  (=ifp), the latter is lo0)
+        */
+       if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) {
+               RTFREE(ro.ro_rt);
+               return 0;
+       }
+
+       /* if no ifp provided, check if rtentry is not default route */
+       if (ifp == NULL &&
+           IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) {
+               RTFREE(ro.ro_rt);
+               return 0;
+       }
+
+       /* or if this is a blackhole/reject route */
+       if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+               RTFREE(ro.ro_rt);
+               return 0;
+       }
+
+       /* found valid route */
+       RTFREE(ro.ro_rt);
+       return 1;
+
+}
+
+/*
+ * Return 1 if icmp6_type is one of the ICMPv6 query types
+ * (echo request, membership query, who-are-you, FQDN or node
+ * information query), 0 otherwise.
+ */
+static int
+is_icmp6_query(int icmp6_type)
+{
+       if ((icmp6_type <= ICMP6_MAXTYPE) &&
+           (icmp6_type == ICMP6_ECHO_REQUEST ||
+           icmp6_type == ICMP6_MEMBERSHIP_QUERY ||
+           icmp6_type == ICMP6_WRUREQUEST ||
+           icmp6_type == ICMP6_FQDN_QUERY ||
+           icmp6_type == ICMP6_NI_QUERY))
+               return (1);
+
+       return (0);
+}
+
+/*
+ * IPv6 reject: for ICMP6_UNREACH_RST on a TCP packet, synthesize and
+ * send a RST (unless the offending packet already carried RST);
+ * otherwise send an ICMPv6 destination-unreachable with 'code'.
+ * Always consumes the mbuf and clears args->m.
+ */
+static void
+send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6)
+{
+       struct mbuf *m;
+
+       m = args->m;
+       if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) {
+               struct tcphdr *tcp;
+               tcp = (struct tcphdr *)((char *)ip6 + hlen);
+
+               /* never RST a RST, to avoid RST wars */
+               if ((tcp->th_flags & TH_RST) == 0) {
+                       struct mbuf *m0;
+                       m0 = ipfw_send_pkt(args->m, &(args->f_id),
+                           ntohl(tcp->th_seq), ntohl(tcp->th_ack),
+                           tcp->th_flags | TH_RST);
+                       if (m0 != NULL)
+                               ip6_output(m0, NULL, NULL, 0, NULL, NULL,
+                                   NULL);
+               }
+               FREE_PKT(m);
+       } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */
+#if 0
+               /*
+                * Unlike above, the mbufs need to line up with the ip6 hdr,
+                * as the contents are read. We need to m_adj() the
+                * needed amount.
+                * The mbuf will however be thrown away so we can adjust it.
+                * Remember we did an m_pullup on it already so we
+                * can make some assumptions about contiguousness.
+                */
+               if (args->L3offset)
+                       m_adj(m, args->L3offset);
+#endif
+               /* icmp6_error() consumes m */
+               icmp6_error(m, ICMP6_DST_UNREACH, code, 0);
+       } else
+               FREE_PKT(m);
+
+       args->m = NULL;
+}
+
+#endif /* INET6 */
+
+
+/*
+ * sends a reject message, consuming the mbuf passed as an argument.
+ * For ICMP_REJECT_RST on a TCP packet, synthesize and send a RST
+ * (unless the packet already carried RST); otherwise send an ICMP
+ * unreachable with 'code'. Always clears args->m.
+ * NOTE(review): 'iplen' is currently unused by this body.
+ */
+static void
+send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip)
+{
+
+#if 0
+       /* XXX When ip is not guaranteed to be at mtod() we will
+        * need to account for this.
+        * The mbuf will however be thrown away so we can adjust it.
+        * Remember we did an m_pullup on it already so we
+        * can make some assumptions about contiguousness.
+        */
+       if (args->L3offset)
+               m_adj(m, args->L3offset);
+#endif
+       if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
+               /* We need the IP header in host order for icmp_error(). */
+               SET_HOST_IPLEN(ip);
+               icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
+       } else if (args->f_id.proto == IPPROTO_TCP) {
+               struct tcphdr *const tcp =
+                   L3HDR(struct tcphdr, mtod(args->m, struct ip *));
+               /* never RST a RST, to avoid RST wars */
+               if ( (tcp->th_flags & TH_RST) == 0) {
+                       struct mbuf *m;
+                       m = ipfw_send_pkt(args->m, &(args->f_id),
+                               ntohl(tcp->th_seq), ntohl(tcp->th_ack),
+                               tcp->th_flags | TH_RST);
+                       if (m != NULL)
+                               ip_output(m, NULL, NULL, 0, NULL, NULL);
+               }
+               FREE_PKT(args->m);
+       } else
+               FREE_PKT(args->m);
+       args->m = NULL;
+}
+
+/*
+ * Support for uid/gid/jail lookup. These tests are expensive
+ * (because we may need to look into the list of active sockets)
+ * so we cache the results. ugid_lookupp is 0 if we have not
+ * yet done a lookup, 1 if we succeeded, and -1 if we tried
+ * and failed. The function always returns the match value.
+ * We could actually spare the variable and use *uc, setting
+ * it to '(void *)check_uidgid if we have no info, NULL if
+ * we tried and failed, or any other value if successful.
+ *
+ * On Linux the whole job is delegated to cred_check(); note the
+ * inp argument is actually an mbuf there, hence the cast to
+ * reach its m_skb.
+ */
+static int
+check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
+    struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
+    u_int16_t src_port, struct ucred **uc, int *ugid_lookupp,
+    struct inpcb *inp)
+{
+#ifdef __linux__
+       return cred_check(insn, proto, oif,
+       dst_ip, dst_port, src_ip, src_port,
+       (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb);
+#else  /* FreeBSD */
+       struct inpcbinfo *pi;
+       int wildcard;
+       struct inpcb *pcb;
+       int match;
+
+       /*
+        * Check to see if the UDP or TCP stack supplied us with
+        * the PCB. If so, rather then holding a lock and looking
+        * up the PCB, we can use the one that was supplied.
+        */
+       if (inp && *ugid_lookupp == 0) {
+               INP_LOCK_ASSERT(inp);
+               if (inp->inp_socket != NULL) {
+                       /* crhold: caller is responsible for the release */
+                       *uc = crhold(inp->inp_cred);
+                       *ugid_lookupp = 1;
+               } else
+                       *ugid_lookupp = -1;
+       }
+       /*
+        * If we have already been here and the packet has no
+        * PCB entry associated with it, then we can safely
+        * assume that this is a no match.
+        */
+       if (*ugid_lookupp == -1)
+               return (0);
+       /* only TCP and UDP have a PCB to look up */
+       if (proto == IPPROTO_TCP) {
+               wildcard = 0;
+               pi = &V_tcbinfo;
+       } else if (proto == IPPROTO_UDP) {
+               wildcard = INPLOOKUP_WILDCARD;
+               pi = &V_udbinfo;
+       } else
+               return 0;
+       match = 0;
+       if (*ugid_lookupp == 0) {
+               INP_INFO_RLOCK(pi);
+               /* with oif the packet is outbound: swap src/dst roles */
+               pcb =  (oif) ?
+                       in_pcblookup_hash(pi,
+                               dst_ip, htons(dst_port),
+                               src_ip, htons(src_port),
+                               wildcard, oif) :
+                       in_pcblookup_hash(pi,
+                               src_ip, htons(src_port),
+                               dst_ip, htons(dst_port),
+                               wildcard, NULL);
+               if (pcb != NULL) {
+                       *uc = crhold(pcb->inp_cred);
+                       *ugid_lookupp = 1;
+               }
+               INP_INFO_RUNLOCK(pi);
+               if (*ugid_lookupp == 0) {
+                       /*
+                        * We tried and failed, set the variable to -1
+                        * so we will not try again on this packet.
+                        */
+                       *ugid_lookupp = -1;
+                       return (0);
+               }
+       } 
+       /* we now have a credential cached in *uc; test the opcode */
+       if (insn->o.opcode == O_UID)
+               match = ((*uc)->cr_uid == (uid_t)insn->d[0]);
+       else if (insn->o.opcode == O_GID)
+               match = groupmember((gid_t)insn->d[0], *uc);
+       else if (insn->o.opcode == O_JAIL)
+               match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]);
+       return match;
+#endif
+}
+
+/*
+ * Helper function to set args with info on the rule after the matching
+ * one. slot is precise, whereas we guess rule_id as they are
+ * assigned sequentially.
+ *
+ * args   firewall arguments whose args->rule cookie is filled in so a
+ *        later ipfw_chk() pass can resume after this rule.
+ * slot   index (in chain->map) of the matching rule.
+ * chain  ruleset the match was found in; chain->id is recorded so the
+ *        cached slot can be validated against a changed ruleset.
+ */
+static inline void
+set_match(struct ip_fw_args *args, int slot,
+       struct ip_fw_chain *chain)
+{
+       args->rule.chain_id = chain->id;
+       args->rule.slot = slot + 1; /* we use 0 as a marker */
+       /* guess: ids are assigned sequentially, so the rule after the
+        * matching one is assumed to carry id + 1 */
+       args->rule.rule_id = 1 + chain->map[slot]->id;
+       args->rule.rulenum = chain->map[slot]->rulenum;
+}
+
+/*
+ * The main check routine for the firewall.
+ *
+ * All arguments are in args so we can modify them and return them
+ * back to the caller.
+ *
+ * Parameters:
+ *
+ *     args->m (in/out) The packet; we set to NULL when/if we nuke it.
+ *             Starts with the IP header.
+ *     args->eh (in)   Mac header if present, NULL for layer3 packet.
+ *     args->L3offset  Number of bytes bypassed if we came from L2.
+ *                     e.g. often sizeof(eh)  ** NOTYET **
+ *     args->oif       Outgoing interface, NULL if packet is incoming.
+ *             The incoming interface is in the mbuf. (in)
+ *     args->divert_rule (in/out)
+ *             Skip up to the first rule past this rule number;
+ *             upon return, non-zero port number for divert or tee.
+ *
+ *     args->rule      Pointer to the last matching rule (in/out)
+ *     args->next_hop  Socket we are forwarding to (out).
+ *     args->f_id      Addresses grabbed from the packet (out)
+ *     args->rule.info a cookie depending on rule action
+ *
+ * Return value:
+ *
+ *     IP_FW_PASS      the packet must be accepted
+ *     IP_FW_DENY      the packet must be dropped
+ *     IP_FW_DIVERT    divert packet, port in m_tag
+ *     IP_FW_TEE       tee packet, port in m_tag
+ *     IP_FW_DUMMYNET  to dummynet, pipe in args->cookie
+ *     IP_FW_NETGRAPH  into netgraph, cookie args->cookie
+ *             args->rule contains the matching rule,
+ *             args->rule.info has additional information.
+ *
+ */
+int
+ipfw_chk(struct ip_fw_args *args)
+{
+
+       /*
+        * Local variables holding state while processing a packet:
+        *
+        * IMPORTANT NOTE: to speed up the processing of rules, there
+        * are some assumptions on the values of the variables, which
+        * are documented here. Should you change them, please check
+        * the implementation of the various instructions to make sure
+        * that they still work.
+        *
+        * args->eh     The MAC header. It is non-null for a layer2
+        *      packet, it is NULL for a layer-3 packet.
+        * **notyet**
+        * args->L3offset Offset in the packet to the L3 (IP or equiv.) header.
+        *
+        * m | args->m  Pointer to the mbuf, as received from the caller.
+        *      It may change if ipfw_chk() does an m_pullup, or if it
+        *      consumes the packet because it calls send_reject().
+        *      XXX This has to change, so that ipfw_chk() never modifies
+        *      or consumes the buffer.
+        * ip   is the beginning of the ip(4 or 6) header.
+        *      Calculated by adding the L3offset to the start of data.
+        *      (Until we start using L3offset, the packet is
+        *      supposed to start with the ip header).
+        */
+       struct mbuf *m = args->m;
+       struct ip *ip = mtod(m, struct ip *);
+
+       /*
+        * For rules which contain uid/gid or jail constraints, cache
+        * a copy of the users credentials after the pcb lookup has been
+        * executed. This will speed up the processing of rules with
+        * these types of constraints, as well as decrease contention
+        * on pcb related locks.
+        */
+#ifdef __linux__
+       struct bsd_ucred ucred_cache;
+#else
+       struct ucred *ucred_cache = NULL;
+#endif
+       int ucred_lookup = 0;
+
+       /*
+        * oif | args->oif      If NULL, ipfw_chk has been called on the
+        *      inbound path (ether_input, ip_input).
+        *      If non-NULL, ipfw_chk has been called on the outbound path
+        *      (ether_output, ip_output).
+        */
+       struct ifnet *oif = args->oif;
+
+       int f_pos = 0;          /* index of current rule in the array */
+       int retval = 0;
+
+       /*
+        * hlen The length of the IP header.
+        */
+       u_int hlen = 0;         /* hlen >0 means we have an IP pkt */
+
+       /*
+        * offset       The offset of a fragment. offset != 0 means that
+        *      we have a fragment at this offset of an IPv4 packet.
+        *      offset == 0 means that (if this is an IPv4 packet)
+        *      this is the first or only fragment.
+        *      For IPv6 offset == 0 means there is no Fragment Header. 
+        *      If offset != 0 for IPv6 always use correct mask to
+        *      get the correct offset because we add IP6F_MORE_FRAG
+        * to be able to detect the first fragment which would
+        *      otherwise have offset = 0.
+        */
+       u_short offset = 0;
+
+       /*
+        * Local copies of addresses. They are only valid if we have
+        * an IP packet.
+        *
+        * proto        The protocol. Set to 0 for non-ip packets,
+        *      or to the protocol read from the packet otherwise.
+        *      proto != 0 means that we have an IPv4 packet.
+        *
+        * src_port, dst_port   port numbers, in HOST format. Only
+        *      valid for TCP and UDP packets.
+        *
+        * src_ip, dst_ip       ip addresses, in NETWORK format.
+        *      Only valid for IPv4 packets.
+        */
+       uint8_t proto;
+       uint16_t src_port = 0, dst_port = 0;    /* NOTE: host format    */
+       struct in_addr src_ip, dst_ip;          /* NOTE: network format */
+       uint16_t iplen=0;
+       int pktlen;
+       uint16_t        etype = 0;      /* Host order stored ether type */
+
+       /*
+        * dyn_dir = MATCH_UNKNOWN when rules unchecked,
+        *      MATCH_NONE when checked and not matched (q = NULL),
+        *      MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL)
+        */
+       int dyn_dir = MATCH_UNKNOWN;
+       ipfw_dyn_rule *q = NULL;
+       struct ip_fw_chain *chain = &V_layer3_chain;
+
+       /*
+        * We store in ulp a pointer to the upper layer protocol header.
+        * In the ipv4 case this is easy to determine from the header,
+        * but for ipv6 we might have some additional headers in the middle.
+        * ulp is NULL if not found.
+        */
+       void *ulp = NULL;               /* upper layer protocol pointer. */
+       /* XXX ipv6 variables */
+       int is_ipv6 = 0;
+       u_int16_t ext_hd = 0;   /* bits vector for extension header filtering */
+       /* end of ipv6 variables */
+       int is_ipv4 = 0;
+
+       int done = 0;           /* flag to exit the outer loop */
+
+       if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready))
+               return (IP_FW_PASS);    /* accept */
+
+       dst_ip.s_addr = 0;              /* make sure it is initialized */
+       src_ip.s_addr = 0;              /* make sure it is initialized */
+       pktlen = m->m_pkthdr.len;
+       args->f_id.fib = M_GETFIB(m); /* note mbuf not altered) */
+       proto = args->f_id.proto = 0;   /* mark f_id invalid */
+               /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */
+
+/*
+ * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
+ * then it sets p to point at the offset "len" in the mbuf. WARNING: the
+ * pointer might become stale after other pullups (but we never use it
+ * this way).
+ */
+#define PULLUP_TO(_len, p, T)                                  \
+do {                                                           \
+       int x = (_len) + sizeof(T);                             \
+       if ((m)->m_len < x) {                                   \
+               args->m = m = m_pullup(m, x);                   \
+               if (m == NULL)                                  \
+                       goto pullup_failed;                     \
+       }                                                       \
+       p = (mtod(m, char *) + (_len));                         \
+} while (0)
+
+       /*
+        * if we have an ether header,
+        */
+       if (args->eh)
+               etype = ntohs(args->eh->ether_type);
+
+       /* Identify IP packets and fill up variables. */
+       if (pktlen >= sizeof(struct ip6_hdr) &&
+           (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) {
+               struct ip6_hdr *ip6 = (struct ip6_hdr *)ip;
+               is_ipv6 = 1;
+               args->f_id.addr_type = 6;
+               hlen = sizeof(struct ip6_hdr);
+               proto = ip6->ip6_nxt;
+
+               /* Search extension headers to find upper layer protocols */
+               while (ulp == NULL) {
+                       switch (proto) {
+                       case IPPROTO_ICMPV6:
+                               PULLUP_TO(hlen, ulp, struct icmp6_hdr);
+                               args->f_id.flags = ICMP6(ulp)->icmp6_type;
+                               break;
+
+                       case IPPROTO_TCP:
+                               PULLUP_TO(hlen, ulp, struct tcphdr);
+                               dst_port = TCP(ulp)->th_dport;
+                               src_port = TCP(ulp)->th_sport;
+                               args->f_id.flags = TCP(ulp)->th_flags;
+                               break;
+
+                       case IPPROTO_SCTP:
+                               PULLUP_TO(hlen, ulp, struct sctphdr);
+                               src_port = SCTP(ulp)->src_port;
+                               dst_port = SCTP(ulp)->dest_port;
+                               break;
+
+                       case IPPROTO_UDP:
+                               PULLUP_TO(hlen, ulp, struct udphdr);
+                               dst_port = UDP(ulp)->uh_dport;
+                               src_port = UDP(ulp)->uh_sport;
+                               break;
+
+                       case IPPROTO_HOPOPTS:   /* RFC 2460 */
+                               PULLUP_TO(hlen, ulp, struct ip6_hbh);
+                               ext_hd |= EXT_HOPOPTS;
+                               hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
+                               proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_ROUTING:   /* RFC 2460 */
+                               PULLUP_TO(hlen, ulp, struct ip6_rthdr);
+                               switch (((struct ip6_rthdr *)ulp)->ip6r_type) {
+                               case 0:
+                                       ext_hd |= EXT_RTHDR0;
+                                       break;
+                               case 2:
+                                       ext_hd |= EXT_RTHDR2;
+                                       break;
+                               default:
+                                       printf("IPFW2: IPV6 - Unknown Routing "
+                                           "Header type(%d)\n",
+                                           ((struct ip6_rthdr *)ulp)->ip6r_type);
+                                       if (V_fw_deny_unknown_exthdrs)
+                                           return (IP_FW_DENY);
+                                       break;
+                               }
+                               ext_hd |= EXT_ROUTING;
+                               hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
+                               proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_FRAGMENT:  /* RFC 2460 */
+                               PULLUP_TO(hlen, ulp, struct ip6_frag);
+                               ext_hd |= EXT_FRAGMENT;
+                               hlen += sizeof (struct ip6_frag);
+                               proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
+                               offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
+                                       IP6F_OFF_MASK;
+                               /* Add IP6F_MORE_FRAG for offset of first
+                                * fragment to be != 0. */
+                               offset |= ((struct ip6_frag *)ulp)->ip6f_offlg &
+                                       IP6F_MORE_FRAG;
+                               if (offset == 0) {
+                                       printf("IPFW2: IPV6 - Invalid Fragment "
+                                           "Header\n");
+                                       if (V_fw_deny_unknown_exthdrs)
+                                           return (IP_FW_DENY);
+                                       break;
+                               }
+                               args->f_id.frag_id6 =
+                                   ntohl(((struct ip6_frag *)ulp)->ip6f_ident);
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_DSTOPTS:   /* RFC 2460 */
+                               PULLUP_TO(hlen, ulp, struct ip6_hbh);
+                               ext_hd |= EXT_DSTOPTS;
+                               hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
+                               proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_AH:        /* RFC 2402 */
+                               PULLUP_TO(hlen, ulp, struct ip6_ext);
+                               ext_hd |= EXT_AH;
+                               hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
+                               proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_ESP:       /* RFC 2406 */
+                               PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */
+                               /* Anything past Seq# is variable length and
+                                * data past this ext. header is encrypted. */
+                               ext_hd |= EXT_ESP;
+                               break;
+
+                       case IPPROTO_NONE:      /* RFC 2460 */
+                               /*
+                                * Packet ends here, and IPv6 header has
+                                * already been pulled up. If ip6e_len!=0
+                                * then octets must be ignored.
+                                */
+                               ulp = ip; /* non-NULL to get out of loop. */
+                               break;
+
+                       case IPPROTO_OSPFIGP:
+                               /* XXX OSPF header check? */
+                               PULLUP_TO(hlen, ulp, struct ip6_ext);
+                               break;
+
+                       case IPPROTO_PIM:
+                               /* XXX PIM header check? */
+                               PULLUP_TO(hlen, ulp, struct pim);
+                               break;
+
+                       case IPPROTO_CARP:
+                               PULLUP_TO(hlen, ulp, struct carp_header);
+                               if (((struct carp_header *)ulp)->carp_version !=
+                                   CARP_VERSION) 
+                                       return (IP_FW_DENY);
+                               if (((struct carp_header *)ulp)->carp_type !=
+                                   CARP_ADVERTISEMENT) 
+                                       return (IP_FW_DENY);
+                               break;
+
+                       case IPPROTO_IPV6:      /* RFC 2893 */
+                               PULLUP_TO(hlen, ulp, struct ip6_hdr);
+                               break;
+
+                       case IPPROTO_IPV4:      /* RFC 2893 */
+                               PULLUP_TO(hlen, ulp, struct ip);
+                               break;
+
+                       default:
+                               printf("IPFW2: IPV6 - Unknown Extension "
+                                   "Header(%d), ext_hd=%x\n", proto, ext_hd);
+                               if (V_fw_deny_unknown_exthdrs)
+                                   return (IP_FW_DENY);
+                               PULLUP_TO(hlen, ulp, struct ip6_ext);
+                               break;
+                       } /*switch */
+               }
+               ip = mtod(m, struct ip *);
+               ip6 = (struct ip6_hdr *)ip;
+               args->f_id.src_ip6 = ip6->ip6_src;
+               args->f_id.dst_ip6 = ip6->ip6_dst;
+               args->f_id.src_ip = 0;
+               args->f_id.dst_ip = 0;
+               args->f_id.flow_id6 = ntohl(ip6->ip6_flow);
+       } else if (pktlen >= sizeof(struct ip) &&
+           (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) {
+               is_ipv4 = 1;
+               hlen = ip->ip_hl << 2;
+               args->f_id.addr_type = 4;
+
+               /*
+                * Collect parameters into local variables for faster matching.
+                */
+               proto = ip->ip_p;
+               src_ip = ip->ip_src;
+               dst_ip = ip->ip_dst;
+               offset = ntohs(ip->ip_off) & IP_OFFMASK;
+               iplen = ntohs(ip->ip_len);
+               pktlen = iplen < pktlen ? iplen : pktlen;
+
+               if (offset == 0) {
+                       switch (proto) {
+                       case IPPROTO_TCP:
+                               PULLUP_TO(hlen, ulp, struct tcphdr);
+                               dst_port = TCP(ulp)->th_dport;
+                               src_port = TCP(ulp)->th_sport;
+                               args->f_id.flags = TCP(ulp)->th_flags;
+                               break;
+
+                       case IPPROTO_UDP:
+                               PULLUP_TO(hlen, ulp, struct udphdr);
+                               dst_port = UDP(ulp)->uh_dport;
+                               src_port = UDP(ulp)->uh_sport;
+                               break;
+
+                       case IPPROTO_ICMP:
+                               PULLUP_TO(hlen, ulp, struct icmphdr);
+                               args->f_id.flags = ICMP(ulp)->icmp_type;
+                               break;
+
+                       default:
+                               break;
+                       }
+               }
+
+               ip = mtod(m, struct ip *);
+               args->f_id.src_ip = ntohl(src_ip.s_addr);
+               args->f_id.dst_ip = ntohl(dst_ip.s_addr);
+       }
+#undef PULLUP_TO
+       if (proto) { /* we may have port numbers, store them */
+               args->f_id.proto = proto;
+               args->f_id.src_port = src_port = ntohs(src_port);
+               args->f_id.dst_port = dst_port = ntohs(dst_port);
+       }
+
+       IPFW_RLOCK(chain);
+       if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */
+               IPFW_RUNLOCK(chain);
+               return (IP_FW_PASS);    /* accept */
+       }
+       if (args->rule.slot) {
+               /*
+                * Packet has already been tagged as a result of a previous
+                * match on rule args->rule aka args->rule_id (PIPE, QUEUE,
+                * REASS, NETGRAPH, DIVERT/TEE...)
+                * Validate the slot and continue from the next one
+                * if still present, otherwise do a lookup.
+                */
+               f_pos = (args->rule.chain_id == chain->id) ?
+                   args->rule.slot :
+                   ipfw_find_rule(chain, args->rule.rulenum,
+                       args->rule.rule_id);
+       } else {
+               f_pos = 0;
+       }
+
+       /*
+        * Now scan the rules, and parse microinstructions for each rule.
+        * We have two nested loops and an inner switch. Sometimes we
+        * need to break out of one or both loops, or re-enter one of
+        * the loops with updated variables. Loop variables are:
+        *
+        *      f_pos (outer loop) points to the current rule.
+        *              On output it points to the matching rule.
+        *      done (outer loop) is used as a flag to break the loop.
+        *      l (inner loop)  residual length of current rule.
+        *              cmd points to the current microinstruction.
+        *
+        * We break the inner loop by setting l=0 and possibly
+        * cmdlen=0 if we don't want to advance cmd.
+        * We break the outer loop by setting done=1
+        * We can restart the inner loop by setting l>0 and f_pos, f, cmd
+        * as needed.
+        */
+       for (; f_pos < chain->n_rules; f_pos++) {
+               ipfw_insn *cmd;
+               uint32_t tablearg = 0;
+               int l, cmdlen, skip_or; /* skip rest of OR block */
+               struct ip_fw *f;
+
+               f = chain->map[f_pos];
+               if (V_set_disable & (1 << f->set) )
+                       continue;
+
+               skip_or = 0;
+               for (l = f->cmd_len, cmd = f->cmd ; l > 0 ;
+                   l -= cmdlen, cmd += cmdlen) {
+                       int match;
+
+                       /*
+                        * check_body is a jump target used when we find a
+                        * CHECK_STATE, and need to jump to the body of
+                        * the target rule.
+                        */
+
+/* check_body: */
+                       cmdlen = F_LEN(cmd);
+                       /*
+                        * An OR block (insn_1 || .. || insn_n) has the
+                        * F_OR bit set in all but the last instruction.
+                        * The first match will set "skip_or", and cause
+                        * the following instructions to be skipped until
+                        * past the one with the F_OR bit clear.
+                        */
+                       if (skip_or) {          /* skip this instruction */
+                               if ((cmd->len & F_OR) == 0)
+                                       skip_or = 0;    /* next one is good */
+                               continue;
+                       }
+                       match = 0; /* set to 1 if we succeed */
+
+                       switch (cmd->opcode) {
+                       /*
+                        * The first set of opcodes compares the packet's
+                        * fields with some pattern, setting 'match' if a
+                        * match is found. At the end of the loop there is
+                        * logic to deal with F_NOT and F_OR flags associated
+                        * with the opcode.
+                        */
+                       case O_NOP:
+                               match = 1;
+                               break;
+
+                       case O_FORWARD_MAC:
+                               printf("ipfw: opcode %d unimplemented\n",
+                                   cmd->opcode);
+                               break;
+
+                       case O_GID:
+                       case O_UID:
+                       case O_JAIL:
+                               /*
+                                * We only check offset == 0 && proto != 0,
+                                * as this ensures that we have a
+                                * packet with the ports info.
+                                */
+                               if (offset!=0)
+                                       break;
+                               if (is_ipv6) /* XXX to be fixed later */
+                                       break;
+                               if (proto == IPPROTO_TCP ||
+                                   proto == IPPROTO_UDP)
+                                       match = check_uidgid(
+                                                   (ipfw_insn_u32 *)cmd,
+                                                   proto, oif,
+                                                   dst_ip, dst_port,
+                                                   src_ip, src_port, (void *)&ucred_cache,
+                                                   &ucred_lookup, (struct inpcb *)args->m);
+                               break;
+
+                       case O_RECV:
+                               match = iface_match(m->m_pkthdr.rcvif,
+                                   (ipfw_insn_if *)cmd);
+                               break;
+
+                       case O_XMIT:
+                               match = iface_match(oif, (ipfw_insn_if *)cmd);
+                               break;
+
+                       case O_VIA:
+                               match = iface_match(oif ? oif :
+                                   m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
+                               break;
+
+                       case O_MACADDR2:
+                               if (args->eh != NULL) { /* have MAC header */
+                                       u_int32_t *want = (u_int32_t *)
+                                               ((ipfw_insn_mac *)cmd)->addr;
+                                       u_int32_t *mask = (u_int32_t *)
+                                               ((ipfw_insn_mac *)cmd)->mask;
+                                       u_int32_t *hdr = (u_int32_t *)args->eh;
+
+                                       match =
+                                           ( want[0] == (hdr[0] & mask[0]) &&
+                                             want[1] == (hdr[1] & mask[1]) &&
+                                             want[2] == (hdr[2] & mask[2]) );
+                               }
+                               break;
+
+                       case O_MAC_TYPE:
+                               if (args->eh != NULL) {
+                                       u_int16_t *p =
+                                           ((ipfw_insn_u16 *)cmd)->ports;
+                                       int i;
+
+                                       for (i = cmdlen - 1; !match && i>0;
+                                           i--, p += 2)
+                                               match = (etype >= p[0] &&
+                                                   etype <= p[1]);
+                               }
+                               break;
+
+                       case O_FRAG:
+                               match = (offset != 0);
+                               break;
+
+                       case O_IN:      /* "out" is "not in" */
+                               match = (oif == NULL);
+                               break;
+
+                       case O_LAYER2:
+                               match = (args->eh != NULL);
+                               break;
+
+                       case O_DIVERTED:
+                           {
+                               /* For diverted packets, args->rule.info
+                                * contains the divert port (in host format)
+                                * reason and direction.
+                                */
+                               uint32_t i = args->rule.info;
+                               match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT &&
+                                   cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2);
+                           }
+                               break;
+
+                       case O_PROTO:
+                               /*
+                                * We do not allow an arg of 0 so the
+                                * check of "proto" only suffices.
+                                */
+                               match = (proto == cmd->arg1);
+                               break;
+
+                       case O_IP_SRC:
+                               match = is_ipv4 &&
+                                   (((ipfw_insn_ip *)cmd)->addr.s_addr ==
+                                   src_ip.s_addr);
+                               break;
+
+                       case O_IP_SRC_LOOKUP:
+                       case O_IP_DST_LOOKUP:
+                               if (is_ipv4) {
+                                   uint32_t key =
+                                       (cmd->opcode == O_IP_DST_LOOKUP) ?
+                                           dst_ip.s_addr : src_ip.s_addr;
+                                   uint32_t v = 0;
+
+                                   if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) {
+                                       /* generic lookup. The key must be
+                                        * in 32bit big-endian format.
+                                        */
+                                       v = ((ipfw_insn_u32 *)cmd)->d[1];
+                                       if (v == 0)
+                                           key = dst_ip.s_addr;
+                                       else if (v == 1)
+                                           key = src_ip.s_addr;
+                                       else if (offset != 0)
+                                           break;
+                                       else if (proto != IPPROTO_TCP &&
+                                               proto != IPPROTO_UDP)
+                                           break;
+                                       else if (v == 2)
+                                           key = htonl(dst_port);
+                                       else if (v == 3)
+                                           key = htonl(src_port);
+                                       else if (v == 4 || v == 5) {
+                                           check_uidgid(
+                                               (ipfw_insn_u32 *)cmd,
+                                               proto, oif,
+                                               dst_ip, dst_port,
+                                               src_ip, src_port, (void *)&ucred_cache,
+                                               &ucred_lookup, (struct inpcb *)args->m);
+#ifdef __linux__
+                                           if (v ==4 /* O_UID */)
+                                               key = ucred_cache.uid;
+                                           else if (v == 5 /* O_JAIL */)
+                                               key = ucred_cache.xid;
+#else
+                                           if (v == 4 /* O_UID */)
+                                               key = ucred_cache->cr_uid;
+                                           else if (v == 5 /* O_JAIL */)
+                                               key = ucred_cache->cr_prison->pr_id;
+#endif
+                                           key = htonl(key);
+                                       } else
+                                           break;
+                                   }
+                                   match = ipfw_lookup_table(chain,
+                                       cmd->arg1, key, &v);
+                                   if (!match)
+                                       break;
+                                   if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
+                                       match =
+                                           ((ipfw_insn_u32 *)cmd)->d[0] == v;
+                                   else
+                                       tablearg = v;
+                               }
+                               break;
+
+                       case O_IP_SRC_MASK:
+                       case O_IP_DST_MASK:
+                               if (is_ipv4) {
+                                   uint32_t a =
+                                       (cmd->opcode == O_IP_DST_MASK) ?
+                                           dst_ip.s_addr : src_ip.s_addr;
+                                   uint32_t *p = ((ipfw_insn_u32 *)cmd)->d;
+                                   int i = cmdlen-1;
+
+                                   for (; !match && i>0; i-= 2, p+= 2)
+                                       match = (p[0] == (a & p[1]));
+                               }
+                               break;
+
+                       case O_IP_SRC_ME:
+                               if (is_ipv4) {
+                                       struct ifnet *tif;
+
+                                       INADDR_TO_IFP(src_ip, tif);
+                                       match = (tif != NULL);
+                               }
+                               break;
+
+                       case O_IP_DST_SET:
+                       case O_IP_SRC_SET:
+                               if (is_ipv4) {
+                                       u_int32_t *d = (u_int32_t *)(cmd+1);
+                                       u_int32_t addr =
+                                           cmd->opcode == O_IP_DST_SET ?
+                                               args->f_id.dst_ip :
+                                               args->f_id.src_ip;
+
+                                           if (addr < d[0])
+                                                   break;
+                                           addr -= d[0]; /* subtract base */
+                                           match = (addr < cmd->arg1) &&
+                                               ( d[ 1 + (addr>>5)] &
+                                                 (1<<(addr & 0x1f)) );
+                               }
+                               break;
+
+                       case O_IP_DST:
+                               match = is_ipv4 &&
+                                   (((ipfw_insn_ip *)cmd)->addr.s_addr ==
+                                   dst_ip.s_addr);
+                               break;
+
+                       case O_IP_DST_ME:
+                               if (is_ipv4) {
+                                       struct ifnet *tif;
+
+                                       INADDR_TO_IFP(dst_ip, tif);
+                                       match = (tif != NULL);
+                               }
+                               break;
+
+                       case O_IP_SRCPORT:
+                       case O_IP_DSTPORT:
+                               /*
+                                * offset == 0 && proto != 0 is enough
+                                * to guarantee that we have a
+                                * packet with port info.
+                                */
+                               if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP)
+                                   && offset == 0) {
+                                       u_int16_t x =
+                                           (cmd->opcode == O_IP_SRCPORT) ?
+                                               src_port : dst_port ;
+                                       u_int16_t *p =
+                                           ((ipfw_insn_u16 *)cmd)->ports;
+                                       int i;
+
+                                       for (i = cmdlen - 1; !match && i>0;
+                                           i--, p += 2)
+                                               match = (x>=p[0] && x<=p[1]);
+                               }
+                               break;
+
+                       case O_ICMPTYPE:
+                               match = (offset == 0 && proto==IPPROTO_ICMP &&
+                                   icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) );
+                               break;
+
+#ifdef INET6
+                       case O_ICMP6TYPE:
+                               match = is_ipv6 && offset == 0 &&
+                                   proto==IPPROTO_ICMPV6 &&
+                                   icmp6type_match(
+                                       ICMP6(ulp)->icmp6_type,
+                                       (ipfw_insn_u32 *)cmd);
+                               break;
+#endif /* INET6 */
+
+                       case O_IPOPT:
+                               match = (is_ipv4 &&
+                                   ipopts_match(ip, cmd) );
+                               break;
+
+                       case O_IPVER:
+                               match = (is_ipv4 &&
+                                   cmd->arg1 == ip->ip_v);
+                               break;
+
+                       case O_IPID:
+                       case O_IPLEN:
+                       case O_IPTTL:
+                               if (is_ipv4) {  /* only for IP packets */
+                                   uint16_t x;
+                                   uint16_t *p;
+                                   int i;
+
+                                   if (cmd->opcode == O_IPLEN)
+                                       x = iplen;
+                                   else if (cmd->opcode == O_IPTTL)
+                                       x = ip->ip_ttl;
+                                   else /* must be IPID */
+                                       x = ntohs(ip->ip_id);
+                                   if (cmdlen == 1) {
+                                       match = (cmd->arg1 == x);
+                                       break;
+                                   }
+                                   /* otherwise we have ranges */
+                                   p = ((ipfw_insn_u16 *)cmd)->ports;
+                                   i = cmdlen - 1;
+                                   for (; !match && i>0; i--, p += 2)
+                                       match = (x >= p[0] && x <= p[1]);
+                               }
+                               break;
+
+                       case O_IPPRECEDENCE:
+                               match = (is_ipv4 &&
+                                   (cmd->arg1 == (ip->ip_tos & 0xe0)) );
+                               break;
+
+                       case O_IPTOS:
+                               match = (is_ipv4 &&
+                                   flags_match(cmd, ip->ip_tos));
+                               break;
+
+                       case O_TCPDATALEN:
+                               if (proto == IPPROTO_TCP && offset == 0) {
+                                   struct tcphdr *tcp;
+                                   uint16_t x;
+                                   uint16_t *p;
+                                   int i;
+
+                                   tcp = TCP(ulp);
+                                   x = iplen -
+                                       ((ip->ip_hl + tcp->th_off) << 2);
+                                   if (cmdlen == 1) {
+                                       match = (cmd->arg1 == x);
+                                       break;
+                                   }
+                                   /* otherwise we have ranges */
+                                   p = ((ipfw_insn_u16 *)cmd)->ports;
+                                   i = cmdlen - 1;
+                                   for (; !match && i>0; i--, p += 2)
+                                       match = (x >= p[0] && x <= p[1]);
+                               }
+                               break;
+
+                       case O_TCPFLAGS:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   flags_match(cmd, TCP(ulp)->th_flags));
+                               break;
+
+                       case O_TCPOPTS:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   tcpopts_match(TCP(ulp), cmd));
+                               break;
+
+                       case O_TCPSEQ:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   ((ipfw_insn_u32 *)cmd)->d[0] ==
+                                       TCP(ulp)->th_seq);
+                               break;
+
+                       case O_TCPACK:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   ((ipfw_insn_u32 *)cmd)->d[0] ==
+                                       TCP(ulp)->th_ack);
+                               break;
+
+                       case O_TCPWIN:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   cmd->arg1 == TCP(ulp)->th_win);
+                               break;
+
+                       case O_ESTAB:
+                               /* reject packets which have SYN only */
+                               /* XXX should i also check for TH_ACK ? */
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   (TCP(ulp)->th_flags &
+                                    (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
+                               break;
+
+                       case O_ALTQ: {
+                               struct pf_mtag *at;
+                               ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
+
+                               match = 1;
+                               at = pf_find_mtag(m);
+                               if (at != NULL && at->qid != 0)
+                                       break;
+                               at = pf_get_mtag(m);
+                               if (at == NULL) {
+                                       /*
+                                        * Let the packet fall back to the
+                                        * default ALTQ.
+                                        */
+                                       break;
+                               }
+                               at->qid = altq->qid;
+                               if (is_ipv4)
+                                       at->af = AF_INET;
+                               else
+                                       at->af = AF_LINK;
+                               at->hdr = ip;
+                               break;
+                       }
+
+                       case O_LOG:
+                                       ipfw_log(f, hlen, args, m,
+                                           oif, offset, tablearg, ip);
+                               match = 1;
+                               break;
+
+                       case O_PROB:
+                               match = (random()<((ipfw_insn_u32 *)cmd)->d[0]);
+                               break;
+
+                       case O_VERREVPATH:
+                               /* Outgoing packets automatically pass/match */
+                               match = ((oif != NULL) ||
+                                   (m->m_pkthdr.rcvif == NULL) ||
+                                   (
+#ifdef INET6
+                                   is_ipv6 ?
+                                       verify_path6(&(args->f_id.src_ip6),
+                                           m->m_pkthdr.rcvif) :
+#endif
+                                   verify_path(src_ip, m->m_pkthdr.rcvif,
+                                       args->f_id.fib)));
+                               break;
+
+                       case O_VERSRCREACH:
+                               /* Outgoing packets automatically pass/match */
+                               match = (hlen > 0 && ((oif != NULL) ||
+#ifdef INET6
+                                   is_ipv6 ?
+                                       verify_path6(&(args->f_id.src_ip6),
+                                           NULL) :
+#endif
+                                   verify_path(src_ip, NULL, args->f_id.fib)));
+                               break;
+
+                       case O_ANTISPOOF:
+                               /* Outgoing packets automatically pass/match */
+                               if (oif == NULL && hlen > 0 &&
+                                   (  (is_ipv4 && in_localaddr(src_ip))
+#ifdef INET6
+                                   || (is_ipv6 &&
+                                       in6_localaddr(&(args->f_id.src_ip6)))
+#endif
+                                   ))
+                                       match =
+#ifdef INET6
+                                           is_ipv6 ? verify_path6(
+                                               &(args->f_id.src_ip6),
+                                               m->m_pkthdr.rcvif) :
+#endif
+                                           verify_path(src_ip,
+                                               m->m_pkthdr.rcvif,
+                                               args->f_id.fib);
+                               else
+                                       match = 1;
+                               break;
+
+                       case O_IPSEC:
+#ifdef IPSEC
+                               match = (m_tag_find(m,
+                                   PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL);
+#endif
+                               /* otherwise no match */
+                               break;
+
+#ifdef INET6
+                       case O_IP6_SRC:
+                               match = is_ipv6 &&
+                                   IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6,
+                                   &((ipfw_insn_ip6 *)cmd)->addr6);
+                               break;
+
+                       case O_IP6_DST:
+                               match = is_ipv6 &&
+                               IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6,
+                                   &((ipfw_insn_ip6 *)cmd)->addr6);
+                               break;
+                       case O_IP6_SRC_MASK:
+                       case O_IP6_DST_MASK:
+                               if (is_ipv6) {
+                                       int i = cmdlen - 1;
+                                       struct in6_addr p;
+                                       struct in6_addr *d =
+                                           &((ipfw_insn_ip6 *)cmd)->addr6;
+
+                                       for (; !match && i > 0; d += 2,
+                                           i -= F_INSN_SIZE(struct in6_addr)
+                                           * 2) {
+                                               p = (cmd->opcode ==
+                                                   O_IP6_SRC_MASK) ?
+                                                   args->f_id.src_ip6:
+                                                   args->f_id.dst_ip6;
+                                               APPLY_MASK(&p, &d[1]);
+                                               match =
+                                                   IN6_ARE_ADDR_EQUAL(&d[0],
+                                                   &p);
+                                       }
+                               }
+                               break;
+
+                       case O_IP6_SRC_ME:
+                               match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6);
+                               break;
+
+                       case O_IP6_DST_ME:
+                               match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6);
+                               break;
+
+                       case O_FLOW6ID:
+                               match = is_ipv6 &&
+                                   flow6id_match(args->f_id.flow_id6,
+                                   (ipfw_insn_u32 *) cmd);
+                               break;
+
+                       case O_EXT_HDR:
+                               match = is_ipv6 &&
+                                   (ext_hd & ((ipfw_insn *) cmd)->arg1);
+                               break;
+
+                       case O_IP6:
+                               match = is_ipv6;
+                               break;
+#endif
+
+                       case O_IP4:
+                               match = is_ipv4;
+                               break;
+
+                       case O_TAG: {
+                               struct m_tag *mtag;
+                               uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                   tablearg : cmd->arg1;
+
+                               /* Packet is already tagged with this tag? */
+                               mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL);
+
+                               /* We have `untag' action when F_NOT flag is
+                                * present. And we must remove this mtag from
+                                * mbuf and reset `match' to zero (`match' will
+                                * be inversed later).
+                                * Otherwise we should allocate new mtag and
+                                * push it into mbuf.
+                                */
+                               if (cmd->len & F_NOT) { /* `untag' action */
+                                       if (mtag != NULL)
+                                               m_tag_delete(m, mtag);
+                                       match = 0;
+                               } else if (mtag == NULL) {
+                                       if ((mtag = m_tag_alloc(MTAG_IPFW,
+                                           tag, 0, M_NOWAIT)) != NULL)
+                                               m_tag_prepend(m, mtag);
+                                       match = 1;
+                               }
+                               break;
+                       }
+
+                       case O_FIB: /* try match the specified fib */
+                               if (args->f_id.fib == cmd->arg1)
+                                       match = 1;
+                               break;
+
+                       case O_TAGGED: {
+                               struct m_tag *mtag;
+                               uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                   tablearg : cmd->arg1;
+
+                               if (cmdlen == 1) {
+                                       match = m_tag_locate(m, MTAG_IPFW,
+                                           tag, NULL) != NULL;
+                                       break;
+                               }
+
+                               /* we have ranges */
+                               for (mtag = m_tag_first(m);
+                                   mtag != NULL && !match;
+                                   mtag = m_tag_next(m, mtag)) {
+                                       uint16_t *p;
+                                       int i;
+
+                                       if (mtag->m_tag_cookie != MTAG_IPFW)
+                                               continue;
+
+                                       p = ((ipfw_insn_u16 *)cmd)->ports;
+                                       i = cmdlen - 1;
+                                       for(; !match && i > 0; i--, p += 2)
+                                               match =
+                                                   mtag->m_tag_id >= p[0] &&
+                                                   mtag->m_tag_id <= p[1];
+                               }
+                               break;
+                       }
+                               
+                       /*
+                        * The second set of opcodes represents 'actions',
+                        * i.e. the terminal part of a rule once the packet
+                        * matches all previous patterns.
+                        * Typically there is only one action for each rule,
+                        * and the opcode is stored at the end of the rule
+                        * (but there are exceptions -- see below).
+                        *
+                        * In general, here we set retval and terminate the
+                        * outer loop (would be a 'break 3' in some language,
+                        * but we need to set l=0, done=1)
+                        *
+                        * Exceptions:
+                        * O_COUNT and O_SKIPTO actions:
+                        *   instead of terminating, we jump to the next rule
+                        *   (setting l=0), or to the SKIPTO target (setting
+                        *   f/f_len, cmd and l as needed), respectively.
+                        *
+                        * O_TAG, O_LOG and O_ALTQ action parameters:
+                        *   perform some action and set match = 1;
+                        *
+                        * O_LIMIT and O_KEEP_STATE: these opcodes are
+                        *   not real 'actions', and are stored right
+                        *   before the 'action' part of the rule.
+                        *   These opcodes try to install an entry in the
+                        *   state tables; if successful, we continue with
+                        *   the next opcode (match=1; break;), otherwise
+                        *   the packet must be dropped (set retval,
+                        *   break loops with l=0, done=1)
+                        *
+                        * O_PROBE_STATE and O_CHECK_STATE: these opcodes
+                        *   cause a lookup of the state table, and a jump
+                        *   to the 'action' part of the parent rule
+                        *   if an entry is found, or
+                        *   (CHECK_STATE only) a jump to the next rule if
+                        *   the entry is not found.
+                        *   The result of the lookup is cached so that
+                        *   further instances of these opcodes become NOPs.
+                        *   The jump to the next rule is done by setting
+                        *   l=0, cmdlen=0.
+                        */
+                       case O_LIMIT:
+                       case O_KEEP_STATE:
+                               if (ipfw_install_state(f,
+                                   (ipfw_insn_limit *)cmd, args, tablearg)) {
+                                       /* error or limit violation */
+                                       retval = IP_FW_DENY;
+                                       l = 0;  /* exit inner loop */
+                                       done = 1; /* exit outer loop */
+                               }
+                               match = 1;
+                               break;
+
+                       case O_PROBE_STATE:
+                       case O_CHECK_STATE:
+                               /*
+                                * dynamic rules are checked at the first
+                                * keep-state or check-state occurrence,
+                                * with the result being stored in dyn_dir.
+                                * The compiler introduces a PROBE_STATE
+                                * instruction for us when we have a
+                                * KEEP_STATE (because PROBE_STATE needs
+                                * to be run first).
+                                */
+                               if (dyn_dir == MATCH_UNKNOWN &&
+                                   (q = ipfw_lookup_dyn_rule(&args->f_id,
+                                    &dyn_dir, proto == IPPROTO_TCP ?
+                                       TCP(ulp) : NULL))
+                                       != NULL) {
+                                       /*
+                                        * Found dynamic entry, update stats
+                                        * and jump to the 'action' part of
+                                        * the parent rule by setting
+                                        * f, cmd, l and clearing cmdlen.
+                                        */
+                                       q->pcnt++;
+                                       q->bcnt += pktlen;
+                                       /* XXX we would like to have f_pos
+                                        * readily accessible in the dynamic
+                                        * rule, instead of having to
+                                        * lookup q->rule.
+                                        */
+                                       f = q->rule;
+                                       f_pos = ipfw_find_rule(chain,
+                                               f->rulenum, f->id);
+                                       cmd = ACTION_PTR(f);
+                                       l = f->cmd_len - f->act_ofs;
+                                       ipfw_dyn_unlock();
+                                       cmdlen = 0;
+                                       match = 1;
+                                       break;
+                               }
+                               /*
+                                * Dynamic entry not found. If CHECK_STATE,
+                                * skip to next rule, if PROBE_STATE just
+                                * ignore and continue with next opcode.
+                                */
+                               if (cmd->opcode == O_CHECK_STATE)
+                                       l = 0;  /* exit inner loop */
+                               match = 1;
+                               break;
+
+                       case O_ACCEPT:
+                               retval = 0;     /* accept */
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_PIPE:
+                       case O_QUEUE:
+                               set_match(args, f_pos, chain);
+                               args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                       tablearg : cmd->arg1;
+                               if (cmd->opcode == O_PIPE)
+                                       args->rule.info |= IPFW_IS_PIPE;
+                               if (V_fw_one_pass)
+                                       args->rule.info |= IPFW_ONEPASS;
+                               retval = IP_FW_DUMMYNET;
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_DIVERT:
+                       case O_TEE:
+                               if (args->eh) /* not on layer 2 */
+                                   break;
+                               /* otherwise this is terminal */
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               retval = (cmd->opcode == O_DIVERT) ?
+                                       IP_FW_DIVERT : IP_FW_TEE;
+                               set_match(args, f_pos, chain);
+                               args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                   tablearg : cmd->arg1;
+                               break;
+
+                       case O_COUNT:
+                               f->pcnt++;      /* update stats */
+                               f->bcnt += pktlen;
+                               f->timestamp = time_uptime;
+                               l = 0;          /* exit inner loop */
+                               break;
+
+                       case O_SKIPTO:
+                               f->pcnt++;      /* update stats */
+                               f->bcnt += pktlen;
+                               f->timestamp = time_uptime;
+                           /* If possible use cached f_pos (in f->next_rule),
+                            * whose version is written in f->next_rule
+                            * (horrible hacks to avoid changing the ABI).
+                            */
+                           if (cmd->arg1 != IP_FW_TABLEARG &&
+                                   (uintptr_t)f->x_next == chain->id) {
+                               f_pos = (uintptr_t)f->next_rule;
+                               } else {
+                               int i = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                       tablearg : cmd->arg1;
+                               /* make sure we do not jump backward */
+                               if (i <= f->rulenum)
+                                   i = f->rulenum + 1;
+                               f_pos = ipfw_find_rule(chain, i, 0);
+                               /* update the cache */
+                               if (cmd->arg1 != IP_FW_TABLEARG) {
+                                   f->next_rule =
+                                       (void *)(uintptr_t)f_pos;
+                                   f->x_next =
+                                       (void *)(uintptr_t)chain->id;
+                               }
+                               }
+                               /*
+                            * Skip disabled rules, and re-enter
+                            * the inner loop with the correct
+                            * f_pos, f, l and cmd.
+                                * Also clear cmdlen and skip_or
+                                */
+                           for (; f_pos < chain->n_rules - 1 &&
+                                   (V_set_disable &
+                                    (1 << chain->map[f_pos]->set));
+                                   f_pos++)
+                               ;
+                           /* prepare to enter the inner loop */
+                           f = chain->map[f_pos];
+                                       l = f->cmd_len;
+                                       cmd = f->cmd;
+                               match = 1;
+                               cmdlen = 0;
+                               skip_or = 0;
+                               break;
+
+                       case O_REJECT:
+                               /*
+                                * Drop the packet and send a reject notice
+                                * if the packet is not ICMP (or is an ICMP
+                                * query), and it is not multicast/broadcast.
+                                */
+                               if (hlen > 0 && is_ipv4 && offset == 0 &&
+                                   (proto != IPPROTO_ICMP ||
+                                    is_icmp_query(ICMP(ulp))) &&
+                                   !(m->m_flags & (M_BCAST|M_MCAST)) &&
+                                   !IN_MULTICAST(ntohl(dst_ip.s_addr))) {
+                                       send_reject(args, cmd->arg1, iplen, ip);
+                                       m = args->m;
+                               }
+                               /* FALLTHROUGH */
+#ifdef INET6
+                       case O_UNREACH6:
+                               if (hlen > 0 && is_ipv6 &&
+                                   ((offset & IP6F_OFF_MASK) == 0) &&
+                                   (proto != IPPROTO_ICMPV6 ||
+                                    (is_icmp6_query(args->f_id.flags) == 1)) &&
+                                   !(m->m_flags & (M_BCAST|M_MCAST)) &&
+                                   !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) {
+                                       send_reject6(
+                                           args, cmd->arg1, hlen,
+                                           (struct ip6_hdr *)ip);
+                                       m = args->m;
+                               }
+                               /* FALLTHROUGH */
+#endif
+                       case O_DENY:
+                               retval = IP_FW_DENY;
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_FORWARD_IP:
+                               if (args->eh)   /* not valid on layer2 pkts */
+                                       break;
+                               if (!q || dyn_dir == MATCH_FORWARD) {
+                                   struct sockaddr_in *sa;
+                                   sa = &(((ipfw_insn_sa *)cmd)->sa);
+                                   if (sa->sin_addr.s_addr == INADDR_ANY) {
+                                       bcopy(sa, &args->hopstore,
+                                                       sizeof(*sa));
+                                       args->hopstore.sin_addr.s_addr =
+                                                   htonl(tablearg);
+                                       args->next_hop = &args->hopstore;
+                                   } else {
+                                       args->next_hop = sa;
+                                   }
+                               }
+                               retval = IP_FW_PASS;
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_NETGRAPH:
+                       case O_NGTEE:
+                               set_match(args, f_pos, chain);
+                               args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                       tablearg : cmd->arg1;
+                               retval = (cmd->opcode == O_NETGRAPH) ?
+                                   IP_FW_NETGRAPH : IP_FW_NGTEE;
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_SETFIB:
+                               f->pcnt++;      /* update stats */
+                               f->bcnt += pktlen;
+                               f->timestamp = time_uptime;
+                               M_SETFIB(m, cmd->arg1);
+                               args->f_id.fib = cmd->arg1;
+                               l = 0;          /* exit inner loop */
+                               break;
+
+                       case O_NAT:
+                               if (!IPFW_NAT_LOADED) {
+                                   retval = IP_FW_DENY;
+                               } else {
+                                   struct cfg_nat *t;
+                                   int nat_id;
+
+                                   set_match(args, f_pos, chain);
+                                   t = ((ipfw_insn_nat *)cmd)->nat;
+                                   if (t == NULL) {
+                                       nat_id = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                               tablearg : cmd->arg1;
+                                       t = (*lookup_nat_ptr)(&chain->nat, nat_id);
+
+                                       if (t == NULL) {
+                                           retval = IP_FW_DENY;
+                                           l = 0;      /* exit inner loop */
+                                           done = 1;   /* exit outer loop */
+                                           break;
+                                       }
+                                       if (cmd->arg1 != IP_FW_TABLEARG)
+                                           ((ipfw_insn_nat *)cmd)->nat = t;
+                                   }
+                                   retval = ipfw_nat_ptr(args, t, m);
+                               }
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_REASS: {
+                               int ip_off;
+
+                               f->pcnt++;
+                               f->bcnt += pktlen;
+                               l = 0;  /* in any case exit inner loop */
+                               ip_off = ntohs(ip->ip_off);
+
+                               /* if not fragmented, go to next rule */
+                               if ((ip_off & (IP_MF | IP_OFFMASK)) == 0)
+                                   break;
+                               /* 
+                                * ip_reass() expects len & off in host
+                                * byte order.
+                                */
+                               SET_HOST_IPLEN(ip);
+
+                               args->m = m = ip_reass(m);
+
+                               /*
+                                * do IP header checksum fixup.
+                                */
+                               if (m == NULL) { /* fragment got swallowed */
+                                   retval = IP_FW_DENY;
+                               } else { /* good, packet complete */
+                                   int hlen;
+
+                                   ip = mtod(m, struct ip *);
+                                   hlen = ip->ip_hl << 2;
+                                   SET_NET_IPLEN(ip);
+                                   ip->ip_sum = 0;
+                                   if (hlen == sizeof(struct ip))
+                                       ip->ip_sum = in_cksum_hdr(ip);
+                                   else
+                                       ip->ip_sum = in_cksum(m, hlen);
+                                   retval = IP_FW_REASS;
+                                   set_match(args, f_pos, chain);
+                               }
+                               done = 1;       /* exit outer loop */
+                               break;
+                       }
+
+                       default:
+                               panic("-- unknown opcode %d\n", cmd->opcode);
+                       } /* end of switch() on opcodes */
+                       /*
+                        * if we get here with l=0, then match is irrelevant.
+                        */
+
+                       if (cmd->len & F_NOT)
+                               match = !match;
+
+                       if (match) {
+                               if (cmd->len & F_OR)
+                                       skip_or = 1;
+                       } else {
+                               if (!(cmd->len & F_OR)) /* not an OR block, */
+                                       break;          /* try next rule    */
+                       }
+
+               }       /* end of inner loop, scan opcodes */
+
+               if (done)
+                       break;
+
+/* next_rule:; */      /* try next rule                */
+
+       }               /* end of outer for, scan rules */
+
+       if (done) {
+               struct ip_fw *rule = chain->map[f_pos];
+               /* Update statistics */
+               rule->pcnt++;
+               rule->bcnt += pktlen;
+               rule->timestamp = time_uptime;
+       } else {
+               retval = IP_FW_DENY;
+               printf("ipfw: ouch!, skip past end of rules, denying packet\n");
+       }
+       IPFW_RUNLOCK(chain);
+#ifndef __linux__
+       if (ucred_cache != NULL)
+               crfree(ucred_cache);
+#endif
+       return (retval);
+
+pullup_failed:
+       if (V_fw_verbose)
+               printf("ipfw: pullup failed\n");
+       return (IP_FW_DENY);
+}
+
+/*
+ * Module and VNET glue
+ */
+
+/*
+ * Stuff that must be initialised only on boot or module load
+ * (not per-vnet).  Always returns 0: "error" is never modified below.
+ */
+static int
+ipfw_init(void)
+{
+       int error = 0;
+
+       /* Set up the dynamic-rule machinery before announcing ourselves. */
+       ipfw_dyn_attach();
+       /*
+        * Only print out this stuff the first time around,
+        * when called from the sysinit code.
+        */
+       printf("ipfw2 "
+#ifdef INET6
+               "(+ipv6) "
+#endif
+               "initialized, divert %s, nat %s, "
+               "rule-based forwarding "
+#ifdef IPFIREWALL_FORWARD
+               "enabled, "
+#else
+               "disabled, "
+#endif
+               "default to %s, logging ",
+#ifdef IPDIVERT
+               "enabled",
+#else
+               "loadable",
+#endif
+#ifdef IPFIREWALL_NAT
+               "enabled",
+#else
+               "loadable",
+#endif
+               default_to_accept ? "accept" : "deny");
+
+       /*
+        * Note: V_xxx variables can be accessed here but the vnet specific
+        * initializer may not have been called yet for the VIMAGE case.
+        * Tuneables will have been processed. We will print out values for
+        * the default vnet. 
+        * XXX This should all be rationalized AFTER 8.0
+        */
+       /* Finish the "logging " line started by the printf above. */
+       if (V_fw_verbose == 0)
+               printf("disabled\n");
+       else if (V_verbose_limit == 0)
+               printf("unlimited\n");
+       else
+               printf("limited to %d packets/entry by default\n",
+                   V_verbose_limit);
+
+       ipfw_log_bpf(1); /* init */
+       return (error);
+}
+
+/*
+ * Called for the removal of the last instance only on module unload.
+ * Tears down, in reverse order, what ipfw_init() set up.
+ */
+static void
+ipfw_destroy(void)
+{
+
+       ipfw_log_bpf(0); /* uninit */
+       ipfw_dyn_detach();
+       printf("IP firewall unloaded\n");
+}
+
+/*
+ * Stuff that must be initialized for every instance
+ * (including the first of course).
+ * Allocates the rule map and the default rule, initializes the lookup
+ * tables and the dynamic-rule engine, then installs the sockopt/pfil
+ * hooks.  Returns 0 on success, ENOSPC if the initial allocations fail,
+ * or the error from ipfw_attach_hooks().
+ */
+static int
+vnet_ipfw_init(const void *unused)
+{
+       int error;
+       struct ip_fw *rule = NULL;
+       struct ip_fw_chain *chain;
+
+       chain = &V_layer3_chain;
+
+       /* First set up some values that are compile time options */
+       V_autoinc_step = 100;   /* bounded to 1..1000 in add_rule() */
+       V_fw_deny_unknown_exthdrs = 1;
+#ifdef IPFIREWALL_VERBOSE
+       V_fw_verbose = 1;
+#endif
+#ifdef IPFIREWALL_VERBOSE_LIMIT
+       V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
+#endif
+#ifdef IPFIREWALL_NAT
+       LIST_INIT(&chain->nat);
+#endif
+
+       /* insert the default rule and create the initial map */
+       chain->n_rules = 1;
+       chain->static_len = sizeof(struct ip_fw);
+       /*
+        * Both allocations are M_NOWAIT; the second is only attempted
+        * if the first succeeded, and on failure we free whatever was
+        * obtained and report ENOSPC.
+        */
+       chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_NOWAIT | M_ZERO);
+       if (chain->map)
+               rule = malloc(chain->static_len, M_IPFW, M_NOWAIT | M_ZERO);
+       if (rule == NULL) {
+               if (chain->map)
+                       free(chain->map, M_IPFW);
+               printf("ipfw2: ENOSPC initializing default rule "
+                       "(support disabled)\n");
+               return (ENOSPC);
+       }
+       error = ipfw_init_tables(chain);
+       if (error) {
+               panic("init_tables"); /* XXX Marko fix this ! */
+       }
+
+       /* fill and insert the default rule */
+       rule->act_ofs = 0;
+       rule->rulenum = IPFW_DEFAULT_RULE;
+       rule->cmd_len = 1;
+       rule->set = RESVD_SET;
+       rule->cmd[0].len = 1;
+       rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY;
+       /* the default rule is both the head of the list and map slot 0 */
+       chain->rules = chain->default_rule = chain->map[0] = rule;
+       chain->id = rule->id = 1;
+
+       IPFW_LOCK_INIT(chain);
+       ipfw_dyn_init();
+
+       /* First set up some values that are compile time options */
+       V_ipfw_vnet_ready = 1;          /* Open for business */
+
+       /*
+        * Hook the sockopt handler, and the layer2 (V_ip_fw_chk_ptr)
+        * and pfil hooks for ipv4 and ipv6. Even if the latter two fail
+        * we still keep the module alive because the sockopt and
+        * layer2 paths are still useful.
+        * ipfw[6]_hook return 0 on success, ENOENT on failure,
+        * so we can ignore the exact return value and just set a flag.
+        *
+        * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so
+        * changes in the underlying (per-vnet) variables trigger
+        * immediate hook()/unhook() calls.
+        * In layer2 we have the same behaviour, except that V_ether_ipfw
+        * is checked on each packet because there are no pfil hooks.
+        */
+       V_ip_fw_ctl_ptr = ipfw_ctl;
+       V_ip_fw_chk_ptr = ipfw_chk;
+       error = ipfw_attach_hooks(1);
+       return (error);
+}
+
+/*
+ * Called for the removal of each instance.
+ * Unhooks the firewall, drains the dynamic-rule engine, frees every
+ * static rule (including the default rule) and destroys the chain lock.
+ * Always returns 0.
+ */
+static int
+vnet_ipfw_uninit(const void *unused)
+{
+       struct ip_fw *reap, *rule;
+       struct ip_fw_chain *chain = &V_layer3_chain;
+       int i;
+
+       V_ipfw_vnet_ready = 0; /* tell new callers to go away */
+       /*
+        * disconnect from ipv4, ipv6, layer2 and sockopt.
+        * Then grab, release and grab again the WLOCK so we make
+        * sure the update is propagated and nobody will be in.
+        */
+       (void)ipfw_attach_hooks(0 /* detach */);
+       V_ip_fw_chk_ptr = NULL;
+       V_ip_fw_ctl_ptr = NULL;
+       IPFW_UH_WLOCK(chain);
+       IPFW_UH_WUNLOCK(chain);
+       IPFW_UH_WLOCK(chain);
+
+       IPFW_WLOCK(chain);
+       IPFW_WUNLOCK(chain);
+       IPFW_WLOCK(chain);
+
+       ipfw_dyn_uninit(0);     /* run the callout_drain */
+       ipfw_flush_tables(chain);
+       /* thread all rules onto a reap list via x_next, then drop the map */
+       reap = NULL;
+       for (i = 0; i < chain->n_rules; i++) {
+               rule = chain->map[i];
+               rule->x_next = reap;
+               reap = rule;
+       }
+       if (chain->map)
+               free(chain->map, M_IPFW);
+       IPFW_WUNLOCK(chain);
+       IPFW_UH_WUNLOCK(chain);
+       /* the rules themselves are freed outside the locks */
+       if (reap != NULL)
+               ipfw_reap_rules(reap);
+       IPFW_LOCK_DESTROY(chain);
+       ipfw_dyn_uninit(1);     /* free the remaining parts */
+       return 0;
+}
+
+/*
+ * Module event handler.
+ * All real work happens in the SYSINIT/SYSUNINIT and VNET_SYSINIT/
+ * VNET_SYSUNINIT handlers declared below, which express the module and
+ * vnet life cycle more precisely than this callback can, so this is
+ * just a skeleton that acknowledges the events it understands.
+ * Note there is no SYSINIT equivalent of the module SHUTDOWN handler,
+ * but we don't have anything to do in that case anyhow.
+ */
+static int
+ipfw_modevent(module_t mod, int type, void *unused)
+{
+
+       switch (type) {
+       case MOD_LOAD:          /* module load, or system boot if compiled in */
+       case MOD_QUIESCE:       /* before unload; could veto, we never do */
+       case MOD_UNLOAD:        /* unload in progress */
+       case MOD_SHUTDOWN:      /* system shutdown */
+               return 0;
+       default:
+               return EOPNOTSUPP;
+       }
+}
+
+/* Module descriptor: name, event handler, no extra argument. */
+static moduledata_t ipfwmod = {
+       "ipfw",
+       ipfw_modevent,
+       0
+};
+
+/* Define startup order. */
+#define        IPFW_SI_SUB_FIREWALL    SI_SUB_PROTO_IFATTACHDOMAIN
+#define        IPFW_MODEVENT_ORDER     (SI_ORDER_ANY - 255) /* On boot slot in here. */
+#define        IPFW_MODULE_ORDER       (IPFW_MODEVENT_ORDER + 1) /* A little later. */
+#define        IPFW_VNET_ORDER         (IPFW_MODEVENT_ORDER + 2) /* Later still. */
+
+DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER);
+MODULE_VERSION(ipfw, 2);
+/* should declare some dependencies here */
+
+/*
+ * Starting up. Done in order after ipfwmod() has been called.
+ * VNET_SYSINIT is also called for each existing vnet and each new vnet.
+ */
+SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
+           ipfw_init, NULL);
+VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
+           vnet_ipfw_init, NULL);
+/*
+ * Closing up shop. These are done in REVERSE ORDER, but still
+ * after ipfwmod() has been called. Not called on reboot.
+ * VNET_SYSUNINIT is also called for each exiting vnet as it exits,
+ * or when the module is unloaded.
+ */
+SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
+           ipfw_destroy, NULL);
+VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
+           vnet_ipfw_uninit, NULL);
+/* end of file */
diff --git a/dummynet2/ip_fw_dynamic.c b/dummynet2/ip_fw_dynamic.c
new file mode 100644 (file)
index 0000000..9c7d2cd
--- /dev/null
@@ -0,0 +1,1237 @@
+/*-
+ * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_dynamic.c 200601 2009-12-16 10:48:40Z luigi $");
+
+#define        DEB(x)
+#define        DDB(x) x
+
+/*
+ * Dynamic rule support for ipfw
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <net/ethernet.h> /* for ETHERTYPE_IP */
+#include <net/if.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>    /* ip_defttl */
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+
+#include <netinet/ip6.h>       /* IN6_ARE_ADDR_EQUAL */
+#ifdef INET6
+#include <netinet6/in6_var.h>
+#include <netinet6/ip6_var.h>
+#endif
+
+#include <machine/in_cksum.h>  /* XXX for in_cksum */
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+/*
+ * Description of dynamic rules.
+ *
+ * Dynamic rules are stored in lists accessed through a hash table
+ * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
+ * be modified through the sysctl variable dyn_buckets which is
+ * updated when the table becomes empty.
+ *
+ * XXX currently there is only one list, ipfw_dyn.
+ *
+ * When a packet is received, its address fields are first masked
+ * with the mask defined for the rule, then hashed, then matched
+ * against the entries in the corresponding list.
+ * Dynamic rules can be used for different purposes:
+ *  + stateful rules;
+ *  + enforcing limits on the number of sessions;
+ *  + in-kernel NAT (not implemented yet)
+ *
+ * The lifetime of dynamic rules is regulated by dyn_*_lifetime,
+ * measured in seconds and depending on the flags.
+ *
+ * The total number of dynamic rules is stored in dyn_count.
+ * The max number of dynamic rules is dyn_max. When we reach
+ * the maximum number of rules we do not create anymore. This is
+ * done to avoid consuming too much memory, but also too much
+ * time when searching on each packet (ideally, we should try instead
+ * to put a limit on the length of the list on each bucket...).
+ *
+ * Each dynamic rule holds a pointer to the parent ipfw rule so
+ * we know what action to perform. Dynamic rules are removed when
+ * the parent rule is deleted. XXX we should make them survive.
+ *
+ * There are some limitations with dynamic rules -- we do not
+ * obey the 'randomized match', and we do not do multiple
+ * passes through the firewall. XXX check the latter!!!
+ */
+
+/*
+ * Static variables followed by global ones
+ */
+/* hash table of dynamic-rule lists; it has V_curr_dyn_buckets entries */
+static VNET_DEFINE(ipfw_dyn_rule **, ipfw_dyn_v);
+static VNET_DEFINE(u_int32_t, dyn_buckets);
+static VNET_DEFINE(u_int32_t, curr_dyn_buckets);
+static VNET_DEFINE(struct callout, ipfw_timeout);
+#define        V_ipfw_dyn_v                    VNET(ipfw_dyn_v)
+#define        V_dyn_buckets                   VNET(dyn_buckets)
+#define        V_curr_dyn_buckets              VNET(curr_dyn_buckets)
+#define V_ipfw_timeout                  VNET(ipfw_timeout)
+
+/* allocation zone for dynamic rule entries */
+static uma_zone_t ipfw_dyn_rule_zone;
+#if defined( __linux__ ) || defined( _WIN32 )
+DEFINE_SPINLOCK(ipfw_dyn_mtx);
+#else
+static struct mtx ipfw_dyn_mtx;                /* mutex guarding dynamic rules */
+#endif
+
+#define        IPFW_DYN_LOCK_INIT() \
+       mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF)
+#define        IPFW_DYN_LOCK_DESTROY() mtx_destroy(&ipfw_dyn_mtx)
+#define        IPFW_DYN_LOCK()         mtx_lock(&ipfw_dyn_mtx)
+#define        IPFW_DYN_UNLOCK()       mtx_unlock(&ipfw_dyn_mtx)
+#define        IPFW_DYN_LOCK_ASSERT()  mtx_assert(&ipfw_dyn_mtx, MA_OWNED)
+
+/* Non-static wrapper so code outside this file can drop the dyn lock. */
+void
+ipfw_dyn_unlock(void)
+{
+       IPFW_DYN_UNLOCK();
+}
+
+/*
+ * Timeouts for various events in handling dynamic rules.
+ * All values are in seconds; which one applies depends on the
+ * protocol/flags of the matched flow.
+ */
+static VNET_DEFINE(u_int32_t, dyn_ack_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_syn_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_fin_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_rst_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_udp_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_short_lifetime);
+
+#define        V_dyn_ack_lifetime              VNET(dyn_ack_lifetime)
+#define        V_dyn_syn_lifetime              VNET(dyn_syn_lifetime)
+#define        V_dyn_fin_lifetime              VNET(dyn_fin_lifetime)
+#define        V_dyn_rst_lifetime              VNET(dyn_rst_lifetime)
+#define        V_dyn_udp_lifetime              VNET(dyn_udp_lifetime)
+#define        V_dyn_short_lifetime            VNET(dyn_short_lifetime)
+
+/*
+ * Keepalives are sent if dyn_keepalive is set. They are sent every
+ * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
+ * seconds of lifetime of a rule.
+ * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
+ * than dyn_keepalive_period.
+ */
+
+static VNET_DEFINE(u_int32_t, dyn_keepalive_interval);
+static VNET_DEFINE(u_int32_t, dyn_keepalive_period);
+static VNET_DEFINE(u_int32_t, dyn_keepalive);
+
+#define        V_dyn_keepalive_interval        VNET(dyn_keepalive_interval)
+#define        V_dyn_keepalive_period          VNET(dyn_keepalive_period)
+#define        V_dyn_keepalive                 VNET(dyn_keepalive)
+
+static VNET_DEFINE(u_int32_t, dyn_count);      /* # of dynamic rules */
+static VNET_DEFINE(u_int32_t, dyn_max);                /* max # of dynamic rules */
+
+#define        V_dyn_count                     VNET(dyn_count)
+#define        V_dyn_max                       VNET(dyn_max)
+
+#ifdef SYSCTL_NODE
+/* Runtime-tunable knobs for the dynamic-rule engine (net.inet.ip.fw.*). */
+SYSCTL_DECL(_net_inet_ip_fw);
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
+    CTLFLAG_RW, &VNET_NAME(dyn_buckets), 0,
+    "Number of dyn. buckets");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
+    CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0,
+    "Current Number of dyn. buckets");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_count,
+    CTLFLAG_RD, &VNET_NAME(dyn_count), 0,
+    "Number of dyn. rules");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_max,
+    CTLFLAG_RW, &VNET_NAME(dyn_max), 0,
+    "Max number of dyn. rules");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0,
+    "Lifetime of dyn. rules for acks");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0,
+    "Lifetime of dyn. rules for syn");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0,
+    "Lifetime of dyn. rules for fin");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0,
+    "Lifetime of dyn. rules for rst");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0,
+    "Lifetime of dyn. rules for UDP");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0,
+    "Lifetime of dyn. rules for other situations");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive,
+    CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
+    "Enable keepalives for dyn. rules");
+#endif /* SYSCTL_NODE */
+
+
+/*
+ * Hash an IPv6 flow id: XOR the low 64 bits of both addresses
+ * together with the two ports.  XOR is commutative, so the result
+ * is the same for both directions of a flow.
+ */
+static __inline int
+hash_packet6(struct ipfw_flow_id *id)
+{
+       u_int32_t h;
+
+       h  = id->src_ip6.__u6_addr.__u6_addr32[2];
+       h ^= id->src_ip6.__u6_addr.__u6_addr32[3];
+       h ^= id->dst_ip6.__u6_addr.__u6_addr32[2];
+       h ^= id->dst_ip6.__u6_addr.__u6_addr32[3];
+       h ^= id->src_port;
+       h ^= id->dst_port;
+       return h;
+}
+
+/*
+ * IMPORTANT: the hash function for dynamic rules must be commutative
+ * in source and destination (ip,port), because rules are bidirectional
+ * and we want to find both in the same bucket.
+ * Returns a bucket index in [0, V_curr_dyn_buckets).
+ */
+static __inline int
+hash_packet(struct ipfw_flow_id *id)
+{
+       u_int32_t i;
+
+#ifdef INET6
+       if (IS_IP6_FLOW_ID(id)) 
+               i = hash_packet6(id);
+       else
+#endif /* INET6 */
+       /* this statement is the "else" body when INET6 is defined */
+       i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port);
+       /* NOTE(review): mask assumes V_curr_dyn_buckets is a power of 2 */
+       i &= (V_curr_dyn_buckets - 1);
+       return i;
+}
+
+/*
+ * Log the flow of a dynamic rule that is being unlinked, together
+ * with the number of entries that will remain.  Only invoked from
+ * the DEB() debug wrapper in UNLINK_DYN_RULE.
+ */
+static __inline void
+unlink_dyn_rule_print(struct ipfw_flow_id *id)
+{
+       struct in_addr da;
+       /* buffers sized for the longer IPv6 text form when INET6 is on */
+#ifdef INET6
+       char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
+#else
+       char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+
+#ifdef INET6
+       if (IS_IP6_FLOW_ID(id)) {
+               ip6_sprintf(src, &id->src_ip6);
+               ip6_sprintf(dst, &id->dst_ip6);
+       } else
+#endif
+       {
+               /* IPv4 addresses are stored host order; swap for inet_ntoa_r */
+               da.s_addr = htonl(id->src_ip);
+               inet_ntoa_r(da, src);
+               da.s_addr = htonl(id->dst_ip);
+               inet_ntoa_r(da, dst);
+       }
+       printf("ipfw: unlink entry %s %d -> %s %d, %d left\n",
+           src, id->src_port, dst, id->dst_port, V_dyn_count - 1);
+}
+
+/**
+ * unlink a dynamic rule from a chain. prev is a pointer to
+ * the previous one, q is a pointer to the rule to delete,
+ * head is a pointer to the head of the queue.
+ * Modifies q and potentially also head.
+ * Also drops the parent's refcount for O_LIMIT children,
+ * decrements V_dyn_count and frees the entry to its UMA zone.
+ */
+#define UNLINK_DYN_RULE(prev, head, q) {                               \
+       ipfw_dyn_rule *old_q = q;                                       \
+                                                                       \
+       /* remove a refcount to the parent */                           \
+       if (q->dyn_type == O_LIMIT)                                     \
+               q->parent->count--;                                     \
+       DEB(unlink_dyn_rule_print(&q->id);)                             \
+       if (prev != NULL)                                               \
+               prev->next = q = q->next;                               \
+       else                                                            \
+               head = q = q->next;                                     \
+       V_dyn_count--;                                                  \
+       uma_zfree(ipfw_dyn_rule_zone, old_q); }
+
+/* "a <= b" for 32-bit time values, robust to counter wraparound */
+#define TIME_LEQ(a,b)       ((int)((a)-(b)) <= 0)
+
+/**
+ * Remove dynamic rules pointing to "rule", or all of them if rule == NULL.
+ *
+ * If keep_me == NULL, rules are deleted even if not expired,
+ * otherwise only expired rules are removed.
+ *
+ * The value of the second parameter is also used to identify
+ * a rule we absolutely do not want to remove (e.g. because we are
+ * holding a reference to it -- this is the case with O_LIMIT_PARENT
+ * rules). The pointer is only used for comparison, so any non-null
+ * value will do.
+ */
+static void
+remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me)
+{
+       static u_int32_t last_remove = 0;
+
+/* FORCE means unconditional removal: no keep_me sentinel was supplied */
+#define FORCE (keep_me == NULL)
+
+       ipfw_dyn_rule *prev, *q;
+       int i, pass = 0, max_pass = 0;
+
+       IPFW_DYN_LOCK_ASSERT();
+
+       if (V_ipfw_dyn_v == NULL || V_dyn_count == 0)
+               return;
+       /* do not expire more than once per second, it is useless */
+       if (!FORCE && last_remove == time_uptime)
+               return;
+       last_remove = time_uptime;
+
+       /*
+        * because O_LIMIT refer to parent rules, during the first pass only
+        * remove child and mark any pending LIMIT_PARENT, and remove
+        * them in a second pass.
+        */
+next_pass:
+       for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+               for (prev=NULL, q = V_ipfw_dyn_v[i] ; q ; ) {
+                       /*
+                        * Logic can become complex here, so we split tests.
+                        */
+                       if (q == keep_me)
+                               goto next;
+                       if (rule != NULL && rule != q->rule)
+                               goto next; /* not the one we are looking for */
+                       if (q->dyn_type == O_LIMIT_PARENT) {
+                               /*
+                                * handle parent in the second pass,
+                                * record we need one.
+                                */
+                               max_pass = 1;
+                               if (pass == 0)
+                                       goto next;
+                               if (FORCE && q->count != 0 ) {
+                                       /* XXX should not happen! */
+                                       printf("ipfw: OUCH! cannot remove rule,"
+                                            " count %d\n", q->count);
+                               }
+                       } else {
+                               if (!FORCE &&
+                                   !TIME_LEQ( q->expire, time_uptime ))
+                                       goto next;
+                       }
+                       /* safe to unlink: a parent that still has children is kept */
+             if (q->dyn_type != O_LIMIT_PARENT || !q->count) {
+                     UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
+                     continue;
+             }
+next:
+                       prev=q;
+                       q=q->next;
+               }
+       }
+       if (pass++ < max_pass)
+               goto next_pass;
+}
+
+void
+ipfw_remove_dyn_children(struct ip_fw *rule)
+{      /* forcibly flush every dynamic state created by 'rule' */
+       IPFW_DYN_LOCK();
+       remove_dyn_rule(rule, NULL /* force removal */);
+       IPFW_DYN_UNLOCK();
+}
+
+/**
+ * lookup a dynamic rule, locked version: caller must hold IPFW_DYN_LOCK.
+ */
+static ipfw_dyn_rule *
+lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction,
+    struct tcphdr *tcp)
+{
+       /*
+        * stateful ipfw extensions.
+        * Lookup into dynamic session queue
+        */
+#define MATCH_REVERSE  0
+#define MATCH_FORWARD  1
+#define MATCH_NONE     2
+#define MATCH_UNKNOWN  3
+       int i, dir = MATCH_NONE;        /* reported to caller via *match_direction */
+       ipfw_dyn_rule *prev, *q=NULL;
+
+       IPFW_DYN_LOCK_ASSERT();
+
+       if (V_ipfw_dyn_v == NULL)
+               goto done;      /* not found */
+       i = hash_packet( pkt );
+       for (prev=NULL, q = V_ipfw_dyn_v[i] ; q != NULL ; ) {
+               if (q->dyn_type == O_LIMIT_PARENT && q->count)
+                       goto next;      /* parents with children never match a flow */
+               if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */
+                       UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
+                       continue;
+               }
+               if (pkt->proto == q->id.proto &&
+                   q->dyn_type != O_LIMIT_PARENT) {
+                       if (IS_IP6_FLOW_ID(pkt)) {
+                           if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+                               &(q->id.src_ip6)) &&
+                           IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+                               &(q->id.dst_ip6)) &&
+                           pkt->src_port == q->id.src_port &&
+                           pkt->dst_port == q->id.dst_port ) {
+                               dir = MATCH_FORWARD;
+                               break;
+                           }
+                           if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+                                   &(q->id.dst_ip6)) &&
+                               IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+                                   &(q->id.src_ip6)) &&
+                               pkt->src_port == q->id.dst_port &&
+                               pkt->dst_port == q->id.src_port ) {
+                                   dir = MATCH_REVERSE;
+                                   break;
+                           }
+                       } else {
+                           if (pkt->src_ip == q->id.src_ip &&
+                               pkt->dst_ip == q->id.dst_ip &&
+                               pkt->src_port == q->id.src_port &&
+                               pkt->dst_port == q->id.dst_port ) {
+                                   dir = MATCH_FORWARD;
+                                   break;
+                           }
+                           if (pkt->src_ip == q->id.dst_ip &&
+                               pkt->dst_ip == q->id.src_ip &&
+                               pkt->src_port == q->id.dst_port &&
+                               pkt->dst_port == q->id.src_port ) {
+                                   dir = MATCH_REVERSE;
+                                   break;
+                           }
+                       }
+               }
+next:
+               prev = q;
+               q = q->next;
+       }
+       if (q == NULL)
+               goto done; /* q = NULL, not found */
+
+       if ( prev != NULL) { /* found and not in front */
+               prev->next = q->next;
+               q->next = V_ipfw_dyn_v[i];      /* move-to-front of the bucket */
+               V_ipfw_dyn_v[i] = q;
+       }
+       if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
+               u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST);
+
+#define BOTH_SYN       (TH_SYN | (TH_SYN << 8))
+#define BOTH_FIN       (TH_FIN | (TH_FIN << 8))
+               q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8);     /* reverse-dir flags in high byte */
+               switch (q->state) {
+               case TH_SYN:                            /* opening */
+                       q->expire = time_uptime + V_dyn_syn_lifetime;
+                       break;
+
+               case BOTH_SYN:                  /* move to established */
+               case BOTH_SYN | TH_FIN :        /* one side tries to close */
+               case BOTH_SYN | (TH_FIN << 8) :
+                       if (tcp) {
+#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0)
+                           u_int32_t ack = ntohl(tcp->th_ack);
+                           if (dir == MATCH_FORWARD) {
+                               if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd))
+                                   q->ack_fwd = ack;
+                               else { /* ignore out-of-sequence */
+                                   break;
+                               }
+                           } else {
+                               if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev))
+                                   q->ack_rev = ack;
+                               else { /* ignore out-of-sequence */
+                                   break;
+                               }
+                           }
+                       }
+                       q->expire = time_uptime + V_dyn_ack_lifetime;
+                       break;
+
+               case BOTH_SYN | BOTH_FIN:       /* both sides closed */
+                       if (V_dyn_fin_lifetime >= V_dyn_keepalive_period)       /* keep below keepalive period */
+                               V_dyn_fin_lifetime = V_dyn_keepalive_period - 1;
+                       q->expire = time_uptime + V_dyn_fin_lifetime;
+                       break;
+
+               default:
+#if 0
+                       /*
+                        * reset or some invalid combination, but can also
+                        * occur if we use keep-state the wrong way.
+                        */
+                       if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
+                               printf("invalid state: 0x%x\n", q->state);
+#endif
+                       if (V_dyn_rst_lifetime >= V_dyn_keepalive_period)
+                               V_dyn_rst_lifetime = V_dyn_keepalive_period - 1;
+                       q->expire = time_uptime + V_dyn_rst_lifetime;
+                       break;
+               }
+       } else if (pkt->proto == IPPROTO_UDP) {
+               q->expire = time_uptime + V_dyn_udp_lifetime;
+       } else {
+               /* other protocols */
+               q->expire = time_uptime + V_dyn_short_lifetime;
+       }
+done:
+       if (match_direction)
+               *match_direction = dir;
+       return q;
+}
+
+ipfw_dyn_rule *
+ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction,
+    struct tcphdr *tcp)
+{
+       ipfw_dyn_rule *q;
+
+       IPFW_DYN_LOCK();
+       q = lookup_dyn_rule_locked(pkt, match_direction, tcp);
+       if (q == NULL)
+               IPFW_DYN_UNLOCK();      /* miss: nothing for the caller to release */
+       /* NB: return table locked when q is not NULL */
+       return q;
+}
+
+static void
+realloc_dynamic_table(void)
+{
+       IPFW_DYN_LOCK_ASSERT();
+
+       /*
+        * Try reallocation, make sure we have a power of 2 and do
+        * not allow more than 64k entries. In case of overflow,
+        * default to 1024.
+        */
+
+       if (V_dyn_buckets > 65536)
+               V_dyn_buckets = 1024;
+       if ((V_dyn_buckets & (V_dyn_buckets-1)) != 0) { /* not a power of 2 */
+               V_dyn_buckets = V_curr_dyn_buckets; /* reset */
+               return;
+       }
+       V_curr_dyn_buckets = V_dyn_buckets;
+       if (V_ipfw_dyn_v != NULL)
+               free(V_ipfw_dyn_v, M_IPFW);     /* caller ensures table is empty, see add_dyn_rule() */
+       for (;;) {
+               V_ipfw_dyn_v = malloc(V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *),
+                      M_IPFW, M_NOWAIT | M_ZERO);
+               if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2)
+                       break;
+               V_curr_dyn_buckets /= 2;        /* shrink and retry on allocation failure */
+       }
+}
+
+/**
+ * Install state of type 'type' for a dynamic session.
+ * The hash table contains three types of entries:
+ * - regular rules (O_KEEP_STATE)
+ * - rules for sessions with limited number of sess per user
+ *   (O_LIMIT). When they are created, the parent is
+ *   increased by 1, and decreased on delete. In this case,
+ *   the third parameter is the parent rule and not the chain.
+ * - "parent" rules for the above (O_LIMIT_PARENT).
+ */
+static ipfw_dyn_rule *
+add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule)
+{
+       ipfw_dyn_rule *r;
+       int i;
+
+       IPFW_DYN_LOCK_ASSERT();
+
+       if (V_ipfw_dyn_v == NULL ||
+           (V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) {
+               realloc_dynamic_table();
+               if (V_ipfw_dyn_v == NULL)
+                       return NULL; /* failed ! */
+       }
+       i = hash_packet(id);
+
+       r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO);
+       if (r == NULL) {
+               printf ("ipfw: sorry cannot allocate state\n");
+               return NULL;
+       }
+
+       /* increase refcount on parent, and set pointer */
+       if (dyn_type == O_LIMIT) {
+               ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule;  /* see header: 3rd arg is the parent state */
+               if ( parent->dyn_type != O_LIMIT_PARENT)
+                       panic("invalid parent");
+               parent->count++;
+               r->parent = parent;
+               rule = parent->rule;
+       }
+
+       r->id = *id;
+       r->expire = time_uptime + V_dyn_syn_lifetime;   /* initial lifetime, adjusted on later lookups */
+       r->rule = rule;
+       r->dyn_type = dyn_type;
+       r->pcnt = r->bcnt = 0;
+       r->count = 0;
+
+       r->bucket = i;
+       r->next = V_ipfw_dyn_v[i];
+       V_ipfw_dyn_v[i] = r;
+       V_dyn_count++;
+       DEB({
+               struct in_addr da;
+#ifdef INET6
+               char src[INET6_ADDRSTRLEN];
+               char dst[INET6_ADDRSTRLEN];
+#else
+               char src[INET_ADDRSTRLEN];
+               char dst[INET_ADDRSTRLEN];
+#endif
+
+#ifdef INET6
+               if (IS_IP6_FLOW_ID(&(r->id))) {
+                       ip6_sprintf(src, &r->id.src_ip6);
+                       ip6_sprintf(dst, &r->id.dst_ip6);
+               } else
+#endif
+               {
+                       da.s_addr = htonl(r->id.src_ip);
+                       inet_ntoa_r(da, src);
+                       da.s_addr = htonl(r->id.dst_ip);
+                       inet_ntoa_r(da, dst);
+               }
+               printf("ipfw: add dyn entry ty %d %s %d -> %s %d, total %d\n",
+                   dyn_type, src, r->id.src_port, dst, r->id.dst_port,
+                   V_dyn_count);
+       })
+       return r;
+}
+
+/**
+ * lookup dynamic parent rule using pkt and rule as search keys.
+ * If the lookup fails, then install one.
+ */
+static ipfw_dyn_rule *
+lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule)
+{
+       ipfw_dyn_rule *q;
+       int i;
+
+       IPFW_DYN_LOCK_ASSERT();
+
+       if (V_ipfw_dyn_v) {
+               int is_v6 = IS_IP6_FLOW_ID(pkt);
+               i = hash_packet( pkt );
+               for (q = V_ipfw_dyn_v[i] ; q != NULL ; q=q->next)
+                       if (q->dyn_type == O_LIMIT_PARENT &&
+                           rule== q->rule &&
+                           pkt->proto == q->id.proto &&
+                           pkt->src_port == q->id.src_port &&
+                           pkt->dst_port == q->id.dst_port &&
+                           (
+                               (is_v6 &&
+                                IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+                                       &(q->id.src_ip6)) &&
+                                IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+                                       &(q->id.dst_ip6))) ||
+                               (!is_v6 &&
+                                pkt->src_ip == q->id.src_ip &&
+                                pkt->dst_ip == q->id.dst_ip)
+                           )
+                       ) {
+                               q->expire = time_uptime + V_dyn_short_lifetime; /* refresh parent on reuse */
+                               DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);)
+                               return q;
+                       }
+       }
+       return add_dyn_rule(pkt, O_LIMIT_PARENT, rule);
+}
+
+/**
+ * Install dynamic state for rule type cmd->o.opcode
+ *
+ * Returns 1 (failure) if state is not installed because of errors or because
+ * session limitations are enforced.
+ */
+int
+ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
+    struct ip_fw_args *args, uint32_t tablearg)
+{
+       static int last_log;    /* time of last complaint, rate-limits logging */
+       ipfw_dyn_rule *q;
+       struct in_addr da;
+#ifdef INET6
+       char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2];
+#else
+       char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+
+       src[0] = '\0';
+       dst[0] = '\0';
+
+       IPFW_DYN_LOCK();
+
+       DEB(
+#ifdef INET6
+       if (IS_IP6_FLOW_ID(&(args->f_id))) {
+               ip6_sprintf(src, &args->f_id.src_ip6);
+               ip6_sprintf(dst, &args->f_id.dst_ip6);
+       } else
+#endif
+       {
+               da.s_addr = htonl(args->f_id.src_ip);
+               inet_ntoa_r(da, src);
+               da.s_addr = htonl(args->f_id.dst_ip);
+               inet_ntoa_r(da, dst);
+       }
+       printf("ipfw: %s: type %d %s %u -> %s %u\n",
+           __func__, cmd->o.opcode, src, args->f_id.src_port,
+           dst, args->f_id.dst_port);
+       src[0] = '\0';
+       dst[0] = '\0';
+       )
+
+       q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
+
+       if (q != NULL) {        /* should never occur */
+               if (last_log != time_uptime) {
+                       last_log = time_uptime;
+                       printf("ipfw: %s: entry already present, done\n",
+                           __func__);
+               }
+               IPFW_DYN_UNLOCK();
+               return (0);
+       }
+
+       if (V_dyn_count >= V_dyn_max)
+               /* Run out of slots, try to remove any expired rule. */
+               remove_dyn_rule(NULL, (ipfw_dyn_rule *)1);      /* non-NULL sentinel: expire only, no force */
+
+       if (V_dyn_count >= V_dyn_max) {
+               if (last_log != time_uptime) {
+                       last_log = time_uptime;
+                       printf("ipfw: %s: Too many dynamic rules\n", __func__);
+               }
+               IPFW_DYN_UNLOCK();
+               return (1);     /* cannot install, notify caller */
+       }
+
+       switch (cmd->o.opcode) {
+       case O_KEEP_STATE:      /* bidir rule */
+               add_dyn_rule(&args->f_id, O_KEEP_STATE, rule);
+               break;
+
+       case O_LIMIT: {         /* limit number of sessions */
+               struct ipfw_flow_id id;
+               ipfw_dyn_rule *parent;
+               uint32_t conn_limit;
+               uint16_t limit_mask = cmd->limit_mask;
+
+               conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ?
+                   tablearg : cmd->conn_limit;
+
+               DEB(
+               if (cmd->conn_limit == IP_FW_TABLEARG)
+                       printf("ipfw: %s: O_LIMIT rule, conn_limit: %u "
+                           "(tablearg)\n", __func__, conn_limit);
+               else
+                       printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n",
+                           __func__, conn_limit);
+               )
+
+               id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0;
+               id.proto = args->f_id.proto;
+               id.addr_type = args->f_id.addr_type;
+               id.fib = M_GETFIB(args->m);
+
+               if (IS_IP6_FLOW_ID (&(args->f_id))) {
+                       if (limit_mask & DYN_SRC_ADDR)
+                               id.src_ip6 = args->f_id.src_ip6;
+                       if (limit_mask & DYN_DST_ADDR)
+                               id.dst_ip6 = args->f_id.dst_ip6;
+               } else {
+                       if (limit_mask & DYN_SRC_ADDR)
+                               id.src_ip = args->f_id.src_ip;
+                       if (limit_mask & DYN_DST_ADDR)
+                               id.dst_ip = args->f_id.dst_ip;
+               }
+               if (limit_mask & DYN_SRC_PORT)
+                       id.src_port = args->f_id.src_port;
+               if (limit_mask & DYN_DST_PORT)
+                       id.dst_port = args->f_id.dst_port;
+               if ((parent = lookup_dyn_parent(&id, rule)) == NULL) {
+                       printf("ipfw: %s: add parent failed\n", __func__);
+                       IPFW_DYN_UNLOCK();
+                       return (1);
+               }
+
+               if (parent->count >= conn_limit) {
+                       /* See if we can remove some expired rule. */
+                       remove_dyn_rule(rule, parent);
+                       if (parent->count >= conn_limit) {
+                               if (V_fw_verbose && last_log != time_uptime) {
+                                       last_log = time_uptime;
+#ifdef INET6
+                                       /*
+                                        * XXX IPv6 flows are not
+                                        * supported yet.
+                                        */
+                                       if (IS_IP6_FLOW_ID(&(args->f_id))) {
+                                               char ip6buf[INET6_ADDRSTRLEN];
+                                               snprintf(src, sizeof(src),
+                                                   "[%s]", ip6_sprintf(ip6buf,
+                                                       &args->f_id.src_ip6));
+                                               snprintf(dst, sizeof(dst),
+                                                   "[%s]", ip6_sprintf(ip6buf,
+                                                       &args->f_id.dst_ip6));
+                                       } else
+#endif
+                                       {
+                                               da.s_addr =
+                                                   htonl(args->f_id.src_ip);
+                                               inet_ntoa_r(da, src);
+                                               da.s_addr =
+                                                   htonl(args->f_id.dst_ip);
+                                               inet_ntoa_r(da, dst);
+                                       }
+                                       log(LOG_SECURITY | LOG_DEBUG,
+                                           "ipfw: %d %s %s:%u -> %s:%u, %s\n",
+                                           parent->rule->rulenum,
+                                           "drop session",
+                                           src, (args->f_id.src_port),
+                                           dst, (args->f_id.dst_port),
+                                           "too many entries");
+                               }
+                               IPFW_DYN_UNLOCK();
+                               return (1);
+                       }
+               }
+               add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent);
+               break;
+       }
+       default:
+               printf("ipfw: %s: unknown dynamic rule type %u\n",
+                   __func__, cmd->o.opcode);
+               IPFW_DYN_UNLOCK();
+               return (1);
+       }
+
+       /* XXX just set lifetime */
+       lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
+
+       IPFW_DYN_UNLOCK();
+       return (0);
+}
+
+/*
+ * Generate a TCP packet, containing either a RST or a keepalive.
+ * When flags & TH_RST, we are sending a RST packet, because of a
+ * "reset" action matched the packet.
+ * Otherwise we are sending a keepalive, possibly with TH_SYN in flags.
+ * The 'replyto' mbuf is the mbuf being replied to, if any, and is required
+ * so that MAC can label the reply appropriately.
+ */
+struct mbuf *
+ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
+    u_int32_t ack, int flags)
+{
+#ifdef __linux__       // XXX to be revised
+       return NULL;
+#else
+       struct mbuf *m;
+       int len, dir;
+       struct ip *h = NULL;            /* stupid compiler */
+#ifdef INET6
+       struct ip6_hdr *h6 = NULL;
+#endif
+       struct tcphdr *th = NULL;
+
+       MGETHDR(m, M_DONTWAIT, MT_DATA);
+       if (m == NULL)
+               return (NULL);
+
+       M_SETFIB(m, id->fib);
+#ifdef MAC
+       if (replyto != NULL)
+               mac_netinet_firewall_reply(replyto, m);
+       else
+               mac_netinet_firewall_send(m);
+#else
+       (void)replyto;          /* don't warn about unused arg */
+#endif
+
+       switch (id->addr_type) {
+       case 4:
+               len = sizeof(struct ip) + sizeof(struct tcphdr);
+               break;
+#ifdef INET6
+       case 6:
+               len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+               break;
+#endif
+       default:
+               /* XXX: log me?!? */
+               FREE_PKT(m);
+               return (NULL);
+       }
+       dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN);  /* forward direction iff SYN set without RST */
+
+       m->m_data += max_linkhdr;       /* leave room for the link-layer header */
+       m->m_flags |= M_SKIP_FIREWALL;  /* don't re-filter our own packet */
+       m->m_pkthdr.len = m->m_len = len;
+       m->m_pkthdr.rcvif = NULL;
+       bzero(m->m_data, len);
+
+       switch (id->addr_type) {
+       case 4:
+               h = mtod(m, struct ip *);
+
+               /* prepare for checksum */
+               h->ip_p = IPPROTO_TCP;
+               h->ip_len = htons(sizeof(struct tcphdr));
+               if (dir) {
+                       h->ip_src.s_addr = htonl(id->src_ip);
+                       h->ip_dst.s_addr = htonl(id->dst_ip);
+               } else {
+                       h->ip_src.s_addr = htonl(id->dst_ip);
+                       h->ip_dst.s_addr = htonl(id->src_ip);
+               }
+
+               th = (struct tcphdr *)(h + 1);
+               break;
+#ifdef INET6
+       case 6:
+               h6 = mtod(m, struct ip6_hdr *);
+
+               /* prepare for checksum */
+               h6->ip6_nxt = IPPROTO_TCP;
+               h6->ip6_plen = htons(sizeof(struct tcphdr));
+               if (dir) {
+                       h6->ip6_src = id->src_ip6;
+                       h6->ip6_dst = id->dst_ip6;
+               } else {
+                       h6->ip6_src = id->dst_ip6;
+                       h6->ip6_dst = id->src_ip6;
+               }
+
+               th = (struct tcphdr *)(h6 + 1);
+               break;
+#endif
+       }
+
+       if (dir) {
+               th->th_sport = htons(id->src_port);
+               th->th_dport = htons(id->dst_port);
+       } else {
+               th->th_sport = htons(id->dst_port);
+               th->th_dport = htons(id->src_port);
+       }
+       th->th_off = sizeof(struct tcphdr) >> 2;
+
+       if (flags & TH_RST) {
+               if (flags & TH_ACK) {
+                       th->th_seq = htonl(ack);
+                       th->th_flags = TH_RST;
+               } else {
+                       if (flags & TH_SYN)
+                               seq++;  /* the SYN consumes one sequence number */
+                       th->th_ack = htonl(seq);
+                       th->th_flags = TH_RST | TH_ACK;
+               }
+       } else {
+               /*
+                * Keepalive - use caller provided sequence numbers
+                */
+               th->th_seq = htonl(seq);
+               th->th_ack = htonl(ack);
+               th->th_flags = TH_ACK;
+       }
+
+       switch (id->addr_type) {
+       case 4:
+               th->th_sum = in_cksum(m, len);  /* over pseudo-header fields set above */
+
+               /* finish the ip header */
+               h->ip_v = 4;
+               h->ip_hl = sizeof(*h) >> 2;
+               h->ip_tos = IPTOS_LOWDELAY;
+               h->ip_off = 0;
+               h->ip_len = htons(len);
+               h->ip_ttl = V_ip_defttl;
+               h->ip_sum = 0;
+               break;
+#ifdef INET6
+       case 6:
+               th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6),
+                   sizeof(struct tcphdr));
+
+               /* finish the ip6 header */
+               h6->ip6_vfc |= IPV6_VERSION;
+               h6->ip6_hlim = IPV6_DEFHLIM;
+               break;
+#endif
+       }
+
+       return (m);
+#endif /* !__linux__ */
+}
+
+/*
+ * This procedure is only used to handle keepalives. It is invoked
+ * every dyn_keepalive_period
+ */
+static void
+ipfw_tick(void * vnetx)
+{
+       struct mbuf *m0, *m, *mnext, **mtailp;
+#ifdef INET6
+       struct mbuf *m6, **m6_tailp;
+#endif
+       int i;
+       ipfw_dyn_rule *q;
+#ifdef VIMAGE
+       struct vnet *vp = vnetx;
+#endif
+
+       CURVNET_SET(vp);
+       if (V_dyn_keepalive == 0 || V_ipfw_dyn_v == NULL || V_dyn_count == 0)
+               goto done;
+
+       /*
+        * We make a chain of packets to go out here -- not deferring
+        * until after we drop the IPFW dynamic rule lock would result
+        * in a lock order reversal with the normal packet input -> ipfw
+        * call stack.
+        */
+       m0 = NULL;
+       mtailp = &m0;
+#ifdef INET6
+       m6 = NULL;
+       m6_tailp = &m6;
+#endif
+       IPFW_DYN_LOCK();
+       for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+               for (q = V_ipfw_dyn_v[i] ; q ; q = q->next ) {
+                       if (q->dyn_type == O_LIMIT_PARENT)
+                               continue;
+                       if (q->id.proto != IPPROTO_TCP)
+                               continue;
+                       if ( (q->state & BOTH_SYN) != BOTH_SYN)
+                               continue;       /* not established */
+                       if (TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
+                           q->expire))
+                               continue;       /* too early */
+                       if (TIME_LEQ(q->expire, time_uptime))
+                               continue;       /* too late, rule expired */
+
+                       /* one keepalive in each direction of the flow */
+                       m = ipfw_send_pkt(NULL, &(q->id), q->ack_rev - 1,
+                               q->ack_fwd, TH_SYN);
+                       mnext = ipfw_send_pkt(NULL, &(q->id), q->ack_fwd - 1,
+                               q->ack_rev, 0);
+
+                       switch (q->id.addr_type) {
+                       case 4:
+                               if (m != NULL) {
+                                       *mtailp = m;
+                                       mtailp = &(*mtailp)->m_nextpkt;
+                               }
+                               if (mnext != NULL) {
+                                       *mtailp = mnext;
+                                       mtailp = &(*mtailp)->m_nextpkt;
+                               }
+                               break;
+#ifdef INET6
+                       case 6:
+                               if (m != NULL) {
+                                       *m6_tailp = m;
+                                       m6_tailp = &(*m6_tailp)->m_nextpkt;
+                               }
+                               if (mnext != NULL) {
+                                       *m6_tailp = mnext;
+                                       m6_tailp = &(*m6_tailp)->m_nextpkt;
+                               }
+                               break;
+#endif
+                       }
+
+                       m = mnext = NULL;       /* ownership moved to the chains */
+               }
+       }
+       IPFW_DYN_UNLOCK();
+       for (m = mnext = m0; m != NULL; m = mnext) {
+               mnext = m->m_nextpkt;
+               m->m_nextpkt = NULL;
+               ip_output(m, NULL, NULL, 0, NULL, NULL);
+       }
+#ifdef INET6
+       for (m = mnext = m6; m != NULL; m = mnext) {
+               mnext = m->m_nextpkt;
+               m->m_nextpkt = NULL;
+               ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
+       }
+#endif
+done:
+       callout_reset(&V_ipfw_timeout, V_dyn_keepalive_period * hz,
+                     ipfw_tick, vnetx);
+       CURVNET_RESTORE();
+}
+
+void
+ipfw_dyn_attach(void)
+{      /* module load: create the state zone and the dynamic-rule lock */
+        ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule",
+            sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL,
+            UMA_ALIGN_PTR, 0);
+
+        IPFW_DYN_LOCK_INIT();
+}
+
+void
+ipfw_dyn_detach(void)
+{      /* module unload: undo ipfw_dyn_attach() */
+        uma_zdestroy(ipfw_dyn_rule_zone);
+        IPFW_DYN_LOCK_DESTROY();
+}
+
+void
+ipfw_dyn_init(void)
+{      /* per-vnet defaults (lifetimes in seconds) and keepalive timer start */
+        V_ipfw_dyn_v = NULL;
+        V_dyn_buckets = 256;    /* must be power of 2 */
+        V_curr_dyn_buckets = 256; /* must be power of 2 */
+        V_dyn_ack_lifetime = 300;
+        V_dyn_syn_lifetime = 20;
+        V_dyn_fin_lifetime = 1;
+        V_dyn_rst_lifetime = 1;
+        V_dyn_udp_lifetime = 10;
+        V_dyn_short_lifetime = 5;
+
+        V_dyn_keepalive_interval = 20;
+        V_dyn_keepalive_period = 5;
+        V_dyn_keepalive = 1;    /* do send keepalives */
+
+        V_dyn_max = 4096;       /* max # of dynamic rules */
+        callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE);
+        callout_reset(&V_ipfw_timeout, hz, ipfw_tick, curvnet); /* first tick in ~1s */
+}
+
+void
+ipfw_dyn_uninit(int pass)
+{
+       if (pass == 0)  /* first pass: stop the keepalive callout */
+               callout_drain(&V_ipfw_timeout);
+       else {          /* final pass: release the hash table */
+               if (V_ipfw_dyn_v != NULL)
+                       free(V_ipfw_dyn_v, M_IPFW);
+       }
+}
+
+int
+ipfw_dyn_len(void)
+{      /* bytes needed to export all dynamic rules, see ipfw_get_dynamic() */
+       return (V_ipfw_dyn_v == NULL) ? 0 :
+               (V_dyn_count * sizeof(ipfw_dyn_rule));
+}
+
+void
+ipfw_get_dynamic(char **pbp, const char *ep)
+{      /* copy all dynamic rules into [*pbp, ep) for userland, advance *pbp */
+       ipfw_dyn_rule *p, *last = NULL;
+       char *bp;
+       int i;
+
+       if (V_ipfw_dyn_v == NULL)
+               return;
+       bp = *pbp;      /* output cursor, written back before returning */
+
+       IPFW_DYN_LOCK();
+       for (i = 0 ; i < V_curr_dyn_buckets; i++)
+               for (p = V_ipfw_dyn_v[i] ; p != NULL; p = p->next) {
+                       if (bp + sizeof *p <= ep) {     /* skip entries that don't fit */
+                               ipfw_dyn_rule *dst =
+                                       (ipfw_dyn_rule *)bp;
+                               bcopy(p, dst, sizeof *p);
+                               bcopy(&(p->rule->rulenum), &(dst->rule),
+                                   sizeof(p->rule->rulenum));
+                               /*
+                                * store set number into high word of
+                                * dst->rule pointer.
+                                */
+                               bcopy(&(p->rule->set),
+                                   (char *)&dst->rule +
+                                   sizeof(p->rule->rulenum),
+                                   sizeof(p->rule->set));
+                               /*
+                                * store a non-null value in "next".
+                                * The userland code will interpret a
+                                * NULL here as a marker
+                                * for the last dynamic rule.
+                                */
+                               bcopy(&dst, &dst->next, sizeof(dst));
+                               last = dst;
+                               dst->expire =
+                                   TIME_LEQ(dst->expire, time_uptime) ?
+                                       0 : dst->expire - time_uptime ;
+                               bp += sizeof(ipfw_dyn_rule);
+                       }
+               }
+       IPFW_DYN_UNLOCK();
+       if (last != NULL) /* mark last dynamic rule */
+               bzero(&last->next, sizeof(last->next));
+       *pbp = bp;
+}
+/* end of file */
diff --git a/dummynet2/ip_fw_log.c b/dummynet2/ip_fw_log.c
new file mode 100644 (file)
index 0000000..1bc1216
--- /dev/null
@@ -0,0 +1,434 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_log.c 200601 2009-12-16 10:48:40Z luigi $");
+
+/*
+ * Logging support for ipfw
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <net/ethernet.h> /* for ETHERTYPE_IP */
+#include <net/if.h>
+#include <net/vnet.h>
+#include <net/if_types.h>      /* for IFT_ETHER */
+#include <net/bpf.h>           /* for BPF */
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#ifdef INET6
+#include <netinet6/in6_var.h>  /* ip6_sprintf() */
+#endif
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+/*
+ * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T
+ * Other macros just cast void * into the appropriate type
+ */
+#define        L3HDR(T, ip)    ((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
+#define        TCP(p)          ((struct tcphdr *)(p))
+#define        SCTP(p)         ((struct sctphdr *)(p))
+#define        UDP(p)          ((struct udphdr *)(p))
+#define        ICMP(p)         ((struct icmphdr *)(p))
+#define        ICMP6(p)        ((struct icmp6_hdr *)(p))
+
+#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
+#define SNP(buf) buf, sizeof(buf)
+
+#ifdef WITHOUT_BPF
+/*
+ * No-op stub: with WITHOUT_BPF defined there is no ipfw0
+ * pseudo-interface, so enabling/disabling BPF logging does nothing.
+ */
+void
+ipfw_log_bpf(int onoff)
+{
+}
+#else /* !WITHOUT_BPF */
+static struct ifnet *log_if;   /* hook to attach to bpf */
+
+/* we use this dummy function for all ifnet callbacks */
+static int
+log_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr)
+{
+       /* always fail: the ipfw log pseudo-ifnet supports no operations */
+       return EINVAL;
+}
+
+/*
+ * Create (onoff != 0) or destroy (onoff == 0) the "ipfw" pseudo
+ * interface used only as a BPF attachment point for logged packets.
+ * The interface carries no real traffic: every callback is the
+ * failing log_dummy() stub.  Idempotent in both directions.
+ */
+void
+ipfw_log_bpf(int onoff)
+{
+       struct ifnet *ifp;
+
+       if (onoff) {
+               if (log_if)     /* already attached */
+                       return;
+               ifp = if_alloc(IFT_ETHER);
+               if (ifp == NULL)        /* allocation failed: silently skip */
+                       return;
+               if_initname(ifp, "ipfw", 0);
+               ifp->if_mtu = 65536;
+               ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
+               ifp->if_init = (void *)log_dummy;
+               ifp->if_ioctl = log_dummy;
+               ifp->if_start = (void *)log_dummy;
+               ifp->if_output = (void *)log_dummy;
+               ifp->if_addrlen = 6;    /* ethernet-style address/header ... */
+               ifp->if_hdrlen = 14;    /* ... to match DLT_EN10MB below */
+               if_attach(ifp);
+               ifp->if_baudrate = IF_Mbps(10);
+               bpfattach(ifp, DLT_EN10MB, 14);
+               log_if = ifp;
+       } else {
+               if (log_if) {
+                       ether_ifdetach(log_if);
+                       if_free(log_if);
+               }
+               log_if = NULL;
+       }
+}
+#endif /* !WITHOUT_BPF */
+
+/*
+ * We enter here when we have a rule with O_LOG.
+ * XXX this function alone takes about 2Kbytes of code!
+ */
+/*
+ * Log a packet that matched an O_LOG rule 'f' (f == NULL means a
+ * bogus packet with no matching rule).  When net.inet.ip.fw.verbose
+ * is off, the packet is handed to BPF listeners on the ipfw0
+ * pseudo-interface (with a fake ethernet header when there is no
+ * layer2 one); otherwise a text line is emitted via syslog at
+ * LOG_SECURITY|LOG_INFO.  Logging is rate-limited per rule by
+ * max_log/log_left, or globally by V_verbose_limit for rule-less
+ * packets.
+ */
+void
+ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
+    struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
+    struct ip *ip)
+{
+       char *action;
+       int limit_reached = 0;  /* set when this call exhausts the log quota */
+       char action2[40], proto[128], fragment[32];     /* scratch buffers */
+
+       if (V_fw_verbose == 0) {
+#ifndef WITHOUT_BPF
+               struct m_hdr mh;
+
+               if (log_if == NULL || log_if->if_bpf == NULL)
+                       return;
+               /* BPF treats the "mbuf" as read-only */
+               mh.mh_next = m;
+               mh.mh_len = ETHER_HDR_LEN;
+               if (args->eh) { /* layer2, use orig hdr */
+                       mh.mh_data = (char *)args->eh;
+               } else {
+                       /* add fake header. Later we will store
+                        * more info in the header
+                        */
+                       mh.mh_data = "DDDDDDSSSSSS\x08\x00";
+               }
+               BPF_MTAP(log_if, (struct mbuf *)&mh);
+#endif /* !WITHOUT_BPF */
+               return;
+       }
+       /* the old 'log' function */
+       fragment[0] = '\0';
+       proto[0] = '\0';
+
+       if (f == NULL) {        /* bogus pkt */
+               if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit)
+                       return;
+               V_norule_counter++;
+               if (V_norule_counter == V_verbose_limit)
+                       limit_reached = V_verbose_limit;
+               action = "Refuse";
+       } else {        /* O_LOG is the first action, find the real one */
+               ipfw_insn *cmd = ACTION_PTR(f);
+               ipfw_insn_log *l = (ipfw_insn_log *)cmd;
+
+               if (l->max_log != 0 && l->log_left == 0)
+                       return;
+               l->log_left--;
+               if (l->log_left == 0)
+                       limit_reached = l->max_log;
+               cmd += F_LEN(cmd);      /* point to first action */
+               /* skip modifier opcodes that precede the real action */
+               if (cmd->opcode == O_ALTQ) {
+                       ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
+
+                       snprintf(SNPARGS(action2, 0), "Altq %d",
+                               altq->qid);
+                       cmd += F_LEN(cmd);
+               }
+               if (cmd->opcode == O_PROB)
+                       cmd += F_LEN(cmd);
+
+               if (cmd->opcode == O_TAG)
+                       cmd += F_LEN(cmd);
+
+               action = action2;       /* default: formatted text below */
+               switch (cmd->opcode) {
+               case O_DENY:
+                       action = "Deny";
+                       break;
+
+               case O_REJECT:
+                       if (cmd->arg1==ICMP_REJECT_RST)
+                               action = "Reset";
+                       else if (cmd->arg1==ICMP_UNREACH_HOST)
+                               action = "Reject";
+                       else
+                               snprintf(SNPARGS(action2, 0), "Unreach %d",
+                                       cmd->arg1);
+                       break;
+
+               case O_UNREACH6:
+                       if (cmd->arg1==ICMP6_UNREACH_RST)
+                               action = "Reset";
+                       else
+                               snprintf(SNPARGS(action2, 0), "Unreach %d",
+                                       cmd->arg1);
+                       break;
+
+               case O_ACCEPT:
+                       action = "Accept";
+                       break;
+               case O_COUNT:
+                       action = "Count";
+                       break;
+               case O_DIVERT:
+                       snprintf(SNPARGS(action2, 0), "Divert %d",
+                               cmd->arg1);
+                       break;
+               case O_TEE:
+                       snprintf(SNPARGS(action2, 0), "Tee %d",
+                               cmd->arg1);
+                       break;
+               case O_SETFIB:
+                       snprintf(SNPARGS(action2, 0), "SetFib %d",
+                               cmd->arg1);
+                       break;
+               case O_SKIPTO:
+                       snprintf(SNPARGS(action2, 0), "SkipTo %d",
+                               cmd->arg1);
+                       break;
+               case O_PIPE:
+                       snprintf(SNPARGS(action2, 0), "Pipe %d",
+                               cmd->arg1);
+                       break;
+               case O_QUEUE:
+                       snprintf(SNPARGS(action2, 0), "Queue %d",
+                               cmd->arg1);
+                       break;
+               case O_FORWARD_IP: {
+                       ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
+                       int len;
+                       struct in_addr dummyaddr;
+                       /* INADDR_ANY means the target came from a table */
+                       if (sa->sa.sin_addr.s_addr == INADDR_ANY)
+                               dummyaddr.s_addr = htonl(tablearg);
+                       else
+                               dummyaddr.s_addr = sa->sa.sin_addr.s_addr;
+
+                       len = snprintf(SNPARGS(action2, 0), "Forward to %s",
+                               inet_ntoa(dummyaddr));
+
+                       if (sa->sa.sin_port)
+                               snprintf(SNPARGS(action2, len), ":%d",
+                                   sa->sa.sin_port);
+                       }
+                       break;
+               case O_NETGRAPH:
+                       snprintf(SNPARGS(action2, 0), "Netgraph %d",
+                               cmd->arg1);
+                       break;
+               case O_NGTEE:
+                       snprintf(SNPARGS(action2, 0), "Ngtee %d",
+                               cmd->arg1);
+                       break;
+               case O_NAT:
+                       action = "Nat";
+                       break;
+               case O_REASS:
+                       action = "Reass";
+                       break;
+               default:
+                       action = "UNKNOWN";
+                       break;
+               }
+       }
+
+       if (hlen == 0) {        /* non-ip */
+               snprintf(SNPARGS(proto, 0), "MAC");
+
+       } else {
+               int len;
+#ifdef INET6
+               char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2];
+#else
+               char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+               struct icmphdr *icmp;
+               struct tcphdr *tcp;
+               struct udphdr *udp;
+#ifdef INET6
+               struct ip6_hdr *ip6 = NULL;
+               struct icmp6_hdr *icmp6;
+#endif
+               src[0] = '\0';
+               dst[0] = '\0';
+#ifdef INET6
+               if (IS_IP6_FLOW_ID(&(args->f_id))) {
+                       char ip6buf[INET6_ADDRSTRLEN];
+                       snprintf(src, sizeof(src), "[%s]",
+                           ip6_sprintf(ip6buf, &args->f_id.src_ip6));
+                       snprintf(dst, sizeof(dst), "[%s]",
+                           ip6_sprintf(ip6buf, &args->f_id.dst_ip6));
+
+                       ip6 = (struct ip6_hdr *)ip;
+                       tcp = (struct tcphdr *)(((char *)ip) + hlen);
+                       udp = (struct udphdr *)(((char *)ip) + hlen);
+               } else
+#endif
+               {
+                       tcp = L3HDR(struct tcphdr, ip);
+                       udp = L3HDR(struct udphdr, ip);
+
+                       inet_ntoa_r(ip->ip_src, src);
+                       inet_ntoa_r(ip->ip_dst, dst);
+               }
+
+               /* port numbers are only meaningful on the first fragment */
+               switch (args->f_id.proto) {
+               case IPPROTO_TCP:
+                       len = snprintf(SNPARGS(proto, 0), "TCP %s", src);
+                       if (offset == 0)
+                               snprintf(SNPARGS(proto, len), ":%d %s:%d",
+                                   ntohs(tcp->th_sport),
+                                   dst,
+                                   ntohs(tcp->th_dport));
+                       else
+                               snprintf(SNPARGS(proto, len), " %s", dst);
+                       break;
+
+               case IPPROTO_UDP:
+                       len = snprintf(SNPARGS(proto, 0), "UDP %s", src);
+                       if (offset == 0)
+                               snprintf(SNPARGS(proto, len), ":%d %s:%d",
+                                   ntohs(udp->uh_sport),
+                                   dst,
+                                   ntohs(udp->uh_dport));
+                       else
+                               snprintf(SNPARGS(proto, len), " %s", dst);
+                       break;
+
+               case IPPROTO_ICMP:
+                       icmp = L3HDR(struct icmphdr, ip);
+                       if (offset == 0)
+                               len = snprintf(SNPARGS(proto, 0),
+                                   "ICMP:%u.%u ",
+                                   icmp->icmp_type, icmp->icmp_code);
+                       else
+                               len = snprintf(SNPARGS(proto, 0), "ICMP ");
+                       len += snprintf(SNPARGS(proto, len), "%s", src);
+                       snprintf(SNPARGS(proto, len), " %s", dst);
+                       break;
+#ifdef INET6
+               case IPPROTO_ICMPV6:
+                       icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen);
+                       if (offset == 0)
+                               len = snprintf(SNPARGS(proto, 0),
+                                   "ICMPv6:%u.%u ",
+                                   icmp6->icmp6_type, icmp6->icmp6_code);
+                       else
+                               len = snprintf(SNPARGS(proto, 0), "ICMPv6 ");
+                       len += snprintf(SNPARGS(proto, len), "%s", src);
+                       snprintf(SNPARGS(proto, len), " %s", dst);
+                       break;
+#endif
+               default:
+                       len = snprintf(SNPARGS(proto, 0), "P:%d %s",
+                           args->f_id.proto, src);
+                       snprintf(SNPARGS(proto, len), " %s", dst);
+                       break;
+               }
+
+#ifdef INET6
+               if (IS_IP6_FLOW_ID(&(args->f_id))) {
+                       if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG))
+                               snprintf(SNPARGS(fragment, 0),
+                                   " (frag %08x:%d@%d%s)",
+                                   args->f_id.frag_id6,
+                                   ntohs(ip6->ip6_plen) - hlen,
+                                   ntohs(offset & IP6F_OFF_MASK) << 3,
+                                   (offset & IP6F_MORE_FRAG) ? "+" : "");
+               } else
+#endif
+               {
+                       int ipoff, iplen;
+                       ipoff = ntohs(ip->ip_off);
+                       iplen = ntohs(ip->ip_len);
+                       if (ipoff & (IP_MF | IP_OFFMASK))
+                               snprintf(SNPARGS(fragment, 0),
+                                   " (frag %d:%d@%d%s)",
+                                   ntohs(ip->ip_id), iplen - (ip->ip_hl << 2),
+                                   offset << 3,
+                                   (ipoff & IP_MF) ? "+" : "");
+               }
+       }
+       /* on linux m->m_pkthdr.rcvif is unavailable: always the no-if form */
+#ifndef __linux__
+       if (oif || m->m_pkthdr.rcvif)
+               log(LOG_SECURITY | LOG_INFO,
+                   "ipfw: %d %s %s %s via %s%s\n",
+                   f ? f->rulenum : -1,
+                   action, proto, oif ? "out" : "in",
+                   oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
+                   fragment);
+       else
+#endif
+               log(LOG_SECURITY | LOG_INFO,
+                   "ipfw: %d %s %s [no if info]%s\n",
+                   f ? f->rulenum : -1,
+                   action, proto, fragment);
+       if (limit_reached)
+               log(LOG_SECURITY | LOG_NOTICE,
+                   "ipfw: limit %d reached on entry %d\n",
+                   limit_reached, f ? f->rulenum : -1);
+}
+/* end of file */
diff --git a/dummynet2/ip_fw_lookup.c b/dummynet2/ip_fw_lookup.c
new file mode 100644 (file)
index 0000000..bf04cb6
--- /dev/null
@@ -0,0 +1,304 @@
+/*-
+ * Copyright (c) 2009 Luigi Rizzo Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_fw_table.c 200601 2009-12-16 10:48:40Z luigi $");
+
+/*
+ * Rule and pipe lookup support for ipfw.
+ *
+
+ipfw and dummynet need to quickly find objects (rules, pipes)
+that may be dynamically created or destroyed.
+To address the problem, we label each new object with a unique
+32-bit identifier whose low K bits are the index in a lookup
+table. All existing objects are referred by the lookup table,
+and identifiers are chosen so that for each slot there is
+at most one active object (whose identifier points to the slot).
+This is almost a hash table, except that we can pick the
+identifiers after looking at the table's occupation so
+we have a trivial hash function and are collision free.
+
+With this structure, operations are very fast and simple:
+- the table has N entries s[i] with two fields, 'id' and 'ptr',
+  with N <= M = 2^k (M is an upper bound to the size of the table);
+- initially, all slots have s[i].id = i, and the pointers
+  are used to build a freelist (tailq).
+- a slot is considered empty if ptr == NULL or s[0] <= ptr < s[N].
+  This is easy to detect and we can use ptr to build the freelist.
+- when a new object is created, we put it in the empty slot i at the
+  head of the freelist, and set the id to s[i].id;
+- when an object is destroyed, we append its slot i to the end
+  of the freelist, and set s[i].id += M (note M, not N).
+- on a lookup for id = X, we look at slot i = X & (M-1),
+  and consider the lookup successful only if the slot is not
+  empty and s[i].id == X;
+- wraps occur at most every F * 2^32/M operations, where F is
+  the number of free slots. Because F is usually a reasonable
+  fraction of M, we should not worry too much.
+- if the table fills up, we can extend it by increasing N
+- shrinking the table is more difficult as we might create
+  collisions during the rehashing.
+ *
+ */
+
+#include <sys/cdefs.h> /* NOTE(review): duplicate include (also above); harmless */
+#ifdef _KERNEL
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+MALLOC_DEFINE(M_IPFW_LUT, "ipfw_lookup", "IpFw lookup");
+/* kernel wrappers: allocations are tagged M_IPFW_LUT and may sleep */
+#define Malloc(n)      malloc(n, M_IPFW_LUT, M_WAITOK)
+/*
+ * NOTE(review): stock FreeBSD kernels have no calloc(size, type, flags);
+ * this presumably relies on a compat definition in the glue headers,
+ * otherwise it should read malloc(n, M_IPFW_LUT, M_WAITOK | M_ZERO) --
+ * confirm before building with _KERNEL.
+ */
+#define Calloc(n)      calloc(n, M_IPFW_LUT, M_WAITOK | M_ZERO)
+#define Free(p)                free(p, M_IPFW_LUT)
+
+/* in-kernel builds compile the debug logging out entirely */
+#define log(x, arg...)
+
+#else /* !_KERNEL */
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+/* userland (test harness) build: libc allocators, log to stderr */
+#define Malloc(n)      malloc(n)
+#define Calloc(n)      calloc(1, n)
+#define Free(p)                free(p)
+#define log(x, arg...) fprintf(stderr, "%s: " x "\n", __FUNCTION__, ##arg)
+#endif /* !_KERNEL */
+
+/*
+ * One table slot.  'id' is the 32-bit identifier whose low bits index
+ * this slot; 'ptr' either points at the stored object (slot in use)
+ * or is a freelist link / NULL (slot empty, see empty() below).
+ */
+struct entry {
+       uint32_t        id;
+       struct entry    *ptr;
+};
+
+struct lookup_table {
+       int _size;      /* current number of slots, N */
+       int used;       /* number of slots holding a live object */
+       int mask; /* 2^k -1, used for hashing */
+       struct entry *f_head, *f_tail; /* freelist */
+       struct entry *  s;      /* slots, array of N entries */
+};
+
+/*
+ * A slot is empty when its ptr is NULL or points inside the slot
+ * array itself (i.e. it is a freelist link, not a stored object).
+ */
+static __inline int empty(struct lookup_table *head, const void *p)
+{
+       const struct entry *ep = p;
+       return (ep == NULL ||
+               (ep >= head->s && ep < &head->s[head->_size]));
+}
+
+/*
+ * init or reinit a table
+ */
+/*
+ * Create a table (head == NULL) or grow an existing one to new_size
+ * slots.  For a new table, 'mask' bounds the maximum size M = 2^k
+ * (rounded up to a power of two, and raised to new_size if smaller).
+ * On growth the old entries keep their slots/ids and the freelist is
+ * rebuilt from the empty slots.  Returns the table, or NULL on error.
+ */
+struct lookup_table *
+ipfw_lut_init(struct lookup_table *head, int new_size, int mask)
+{
+       int i;
+       struct entry *s;        /* the new slots */
+       struct entry *fh, *ft;  /* the freelist */
+
+       if (head != NULL) {
+               mask = head->mask;      /* mask is fixed after creation */
+               if (new_size <= head->_size)
+                       return head;    /* never shrink: nothing to do */
+               /*
+                * NOTE(review): this rejects new_size == mask+1 (== M),
+                * while the design notes above allow N <= M; confirm
+                * whether '>' was intended.
+                */
+               if (new_size >= mask+1) {
+                       log("size larger than mask");
+                       return NULL;
+               }
+       } else {
+               log("old is null, initialize");
+               head = Calloc(sizeof(*head));
+               if (head == NULL)
+                       return NULL;
+               if (new_size >= mask)
+                       mask = new_size;
+               /* round mask up to the next power of two if needed */
+               if (mask & (mask -1)) {
+                       for (i = 1; i < mask; i += i)
+                           ;
+                       log("mask %d not 2^k, round up to %d", mask, i);
+                       mask = i;
+               }
+               mask = head->mask = mask - 1;
+       }
+
+       s = Calloc(new_size * sizeof(*s));
+       if (s == NULL)
+               return NULL;
+       /*
+        * Fresh table: point head->s at the new (zeroed) array with a
+        * nominal size of 1 so the remap loop below has valid old slots
+        * to read from.
+        */
+       if (!head->s) {
+               head->s = s;
+               head->_size = 1;
+       }
+       fh = ft = NULL;
+       /* remap the entries, adjust the freelist */
+       for (i = 0; i < new_size; i++) {
+               s[i].id = (i >= head->_size) ? i : head->s[i].id;
+               if (i < head->_size && !empty(head, head->s[i].ptr)) {
+                       s[i].ptr = head->s[i].ptr;
+                       continue;
+               }
+               /* empty slot: append it to the new freelist */
+               if (fh == NULL)
+                       fh = &s[i];
+               else
+                       ft->ptr = &s[i];
+               ft = &s[i];
+       }
+       head->f_head = fh;
+       head->f_tail = ft;
+
+       /* write lock on the structure, to protect the readers */
+       /* NOTE(review): no lock is actually taken; the comments only
+        * mark where one would go. */
+       fh = head->s;
+       head->s = s;
+       head->_size = new_size;
+       /* release write lock */
+       if (fh != s)    /* fh == s only on the fresh-table path above */
+               Free(fh);
+       log("done");
+       return head;
+}
+
+/* insert returns the id */
+/*
+ * Store 'd' in the first free slot and return its identifier,
+ * or -1 if the table is full (freelist exhausted).
+ */
+int
+ipfw_lut_insert(struct lookup_table *head, void *d)
+{
+       struct entry *e;
+
+       e = head->f_head;
+       if (e == NULL)
+               return -1;
+       head->f_head = e->ptr;  /* unlink slot from the freelist */
+       e->ptr = d;
+       head->used++;
+       return e->id;
+}
+
+/* delete, returns the original entry */
+/*
+ * Remove the object identified by 'id' and return its pointer, or
+ * NULL if the slot does not currently hold that exact id (stale or
+ * bad identifier).  The slot's id is advanced by M = mask+1 so the
+ * old identifier can never match again, and the slot is appended to
+ * the tail of the freelist (maximizing time before id reuse).
+ */
+void *
+ipfw_lut_delete(struct lookup_table *head, int id)
+{
+       int i = id & head->mask;
+       void *result;
+       struct entry *e;
+
+       if (i >= head->_size)
+               return NULL;
+       e = &head->s[i];
+       if (e->id != id)
+               return NULL;
+       result = e->ptr;
+       /* write lock to invalidate the entry to readers */
+       e->id += head->mask + 1; /* prepare for next insert */
+       e->ptr = NULL;
+       /* release write lock */
+       if (head->f_head == NULL)
+               head->f_head = e;
+       else
+               head->f_tail->ptr = e;
+       head->f_tail = e;
+       head->used--;
+       return result;
+}
+
+/*
+ * Return the object stored under 'id', or NULL if the indexed slot
+ * does not carry that exact id (empty, reused, or out of range).
+ */
+void *
+ipfw_lut_lookup(struct lookup_table *head, int id)
+{
+       int i = id & head->mask;
+       struct entry *e;
+
+       if (i >= head->_size)
+               return NULL;
+       e = &head->s[i];
+       return (e->id == id) ? e->ptr : NULL;
+}
+
+/*
+ * Debug dump: print table geometry, then one line per slot with its
+ * id, an 'E' flag for empty slots, and either the stored pointer or
+ * (for freelist links) the index of the next free slot.
+ * Compiled out in kernel builds where log() is empty.
+ */
+void
+ipfw_lut_dump(struct lookup_table *head)
+{
+       int i;
+
+       log("head %p size %d used %d freelist %d",
+           head, head->_size, head->used, head->f_head ?
+                   head->f_head - head->s : -1);
+       for (i = 0; i < head->_size; i++) {
+               struct entry *e = &head->s[i];
+               char ee = empty(head, e->ptr) ? 'E' : ' ';
+               log("%5d  %5d %c %p", i, e->id, ee,
+                   ee == 'E' && e->ptr != NULL ?
+                   (void *)((struct entry *)e->ptr - head->s) : e->ptr);
+       }
+}
+
+#ifndef _KERNEL
+/*
+ * Test helper: look up every id saved in map[] and print the stored
+ * value.  The pointers really hold small ints (characters of the test
+ * string), hence the pointer-to-int cast and the %c format.
+ */
+void dump_p(struct lookup_table *p, int *map)
+{
+       int i;
+       for (i = 0; i < p->_size; i++) {
+           int id = (int)ipfw_lut_lookup(p, map[i]);
+           log("%3d: %3d: %c", map[i] % 64, i, id);
+       }
+}
+/*
+ * Userland test driver: inserts the characters of a test string as
+ * "objects", churning the freelist with 10 insert/delete pairs per
+ * character to exercise id recycling, then verifies lookups survive
+ * a no-op shrink request and a same-size re-init.
+ */
+int main(int argc, char *argv[])
+{
+       int i, j, l;
+#define S 1000
+       int map[S];     /* id returned for each inserted character */
+       struct lookup_table *p;
+       struct lookup_table *p1;
+       const char *m = "nel mezzo del cammin di nostra vita mi ritrovai"
+               " in una selva oscura e la diritta via era smarrita!";
+
+       fprintf(stderr, "testing lookup\n");
+
+       l = strlen(m);
+
+       p = ipfw_lut_init(NULL, 120, 33);
+
+       ipfw_lut_dump(p);
+       for (i = 0; i < l; i++) {
+           int x = m[i];
+           int id = ipfw_lut_insert(p, (void *)x);
+           //ipfw_lut_dump(p);
+           map[i] = id;
+           /* churn: recycle one slot ten times to advance its id */
+           for (j=0; j < 10; j++) {
+                   id = ipfw_lut_insert(p, (void *)'a');
+                   // ipfw_lut_dump(p);
+                   ipfw_lut_delete(p, id);
+                   // ipfw_lut_dump(p);
+           }
+       //    ipfw_lut_dump(p);
+       } 
+       dump_p(p, map);
+       /* shrink request (23 < 120): must be a no-op returning p */
+       p1 = ipfw_lut_init(p, 23, 0);
+       if (!p1)
+               return 1;
+       dump_p(p1, map);
+       /* same-size re-init: also a no-op, entries must survive */
+       p1 = ipfw_lut_init(p1, 120, 0);
+       if (!p1)
+               return 1;
+       dump_p(p1, map);
+       return 0;
+}
+#endif
+/* end of file */
diff --git a/dummynet2/ip_fw_nat.c b/dummynet2/ip_fw_nat.c
new file mode 100644 (file)
index 0000000..ead46a7
--- /dev/null
@@ -0,0 +1,606 @@
+/*-
+ * Copyright (c) 2008 Paolo Pisati
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_fw_nat.c 200975 2009-12-25 01:15:39Z luigi $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/rwlock.h>
+
+#define        IPFW_INTERNAL   /* Access to protected data structures in ip_fw.h. */
+
+#include <netinet/libalias/alias.h>
+#include <netinet/libalias/alias_local.h>
+
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+
+#include <machine/in_cksum.h>  /* XXX for in_cksum */
+
+static VNET_DEFINE(eventhandler_tag, ifaddr_event_tag);
+#define        V_ifaddr_event_tag      VNET(ifaddr_event_tag)
+
+static void 
+ifaddr_change(void *arg __unused, struct ifnet *ifp)
+{
+       struct cfg_nat *ptr;
+       struct ifaddr *ifa;
+       struct ip_fw_chain *chain;
+
+       chain = &V_layer3_chain;
+       IPFW_WLOCK(chain);
+       /* Check every nat entry... */
+       LIST_FOREACH(ptr, &chain->nat, _next) {
+               /* ...using nic 'ifp->if_xname' as dynamic alias address. */
+               if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) != 0)
+                       continue;
+                       if_addr_rlock(ifp);
+                       TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+                               if (ifa->ifa_addr == NULL)
+                                       continue;
+                               if (ifa->ifa_addr->sa_family != AF_INET)
+                                       continue;
+                               ptr->ip = ((struct sockaddr_in *) 
+                                   (ifa->ifa_addr))->sin_addr;
+                               LibAliasSetAddress(ptr->lib, ptr->ip);
+                       }
+                       if_addr_runlock(ifp);
+               }
+       IPFW_WUNLOCK(chain);
+}
+
+/*
+ * delete the pointers for nat entry ix, or all of them if ix < 0
+ */
+static void
+flush_nat_ptrs(struct ip_fw_chain *chain, const int ix)
+{
+       int i;
+       ipfw_insn_nat *cmd;
+
+       IPFW_WLOCK_ASSERT(chain);
+       for (i = 0; i < chain->n_rules; i++) {
+               cmd = (ipfw_insn_nat *)ACTION_PTR(chain->map[i]);
+               /* XXX skip log and the like ? */
+               if (cmd->o.opcode == O_NAT && cmd->nat != NULL &&
+                           (ix < 0 || cmd->nat->id == ix))
+                       cmd->nat = NULL;
+       }
+}
+
+static void
+del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head)
+{
+       struct cfg_redir *r, *tmp_r;
+       struct cfg_spool *s, *tmp_s;
+       int i, num;
+
+       LIST_FOREACH_SAFE(r, head, _next, tmp_r) {
+               num = 1; /* Number of alias_link to delete. */
+               switch (r->mode) {
+               case REDIR_PORT:
+                       num = r->pport_cnt;
+                       /* FALLTHROUGH */
+               case REDIR_ADDR:
+               case REDIR_PROTO:
+                       /* Delete all libalias redirect entry. */
+                       for (i = 0; i < num; i++)
+                               LibAliasRedirectDelete(n->lib, r->alink[i]);
+                       /* Del spool cfg if any. */
+                       LIST_FOREACH_SAFE(s, &r->spool_chain, _next, tmp_s) {
+                               LIST_REMOVE(s, _next);
+                               free(s, M_IPFW);
+                       }
+                       free(r->alink, M_IPFW);
+                       LIST_REMOVE(r, _next);
+                       free(r, M_IPFW);
+                       break;
+               default:
+                       printf("unknown redirect mode: %u\n", r->mode);                         
+                       /* XXX - panic?!?!? */
+                       break; 
+               }
+       }
+}
+
+static int
+add_redir_spool_cfg(char *buf, struct cfg_nat *ptr)
+{
+       struct cfg_redir *r, *ser_r;
+       struct cfg_spool *s, *ser_s;
+       int cnt, off, i;
+
+       for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) {
+               ser_r = (struct cfg_redir *)&buf[off];
+               r = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO);
+               memcpy(r, ser_r, SOF_REDIR);
+               LIST_INIT(&r->spool_chain);
+               off += SOF_REDIR;
+               r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt,
+                   M_IPFW, M_WAITOK | M_ZERO);
+               switch (r->mode) {
+               case REDIR_ADDR:
+                       r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr,
+                           r->paddr);
+                       break;
+               case REDIR_PORT:
+                       for (i = 0 ; i < r->pport_cnt; i++) {
+                               /* If remotePort is all ports, set it to 0. */
+                               u_short remotePortCopy = r->rport + i;
+                               if (r->rport_cnt == 1 && r->rport == 0)
+                                       remotePortCopy = 0;
+                               r->alink[i] = LibAliasRedirectPort(ptr->lib,
+                                   r->laddr, htons(r->lport + i), r->raddr,
+                                   htons(remotePortCopy), r->paddr, 
+                                   htons(r->pport + i), r->proto);
+                               if (r->alink[i] == NULL) {
+                                       r->alink[0] = NULL;
+                                       break;
+                               }
+                       }
+                       break;
+               case REDIR_PROTO:
+                       r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr,
+                           r->raddr, r->paddr, r->proto);
+                       break;
+               default:
+                       printf("unknown redirect mode: %u\n", r->mode);
+                       break; 
+               }
+               /* XXX perhaps return an error instead of panic ? */
+               if (r->alink[0] == NULL)
+                       panic("LibAliasRedirect* returned NULL");
+               /* LSNAT handling. */
+                       for (i = 0; i < r->spool_cnt; i++) {
+                               ser_s = (struct cfg_spool *)&buf[off];
+                       s = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO);
+                               memcpy(s, ser_s, SOF_SPOOL);
+                               LibAliasAddServer(ptr->lib, r->alink[0], 
+                                   s->addr, htons(s->port));
+                               off += SOF_SPOOL;
+                               /* Hook spool entry. */
+                       LIST_INSERT_HEAD(&r->spool_chain, s, _next);
+                       }
+               /* And finally hook this redir entry. */
+               LIST_INSERT_HEAD(&ptr->redir_chain, r, _next);
+       }
+       return (1);
+}
+
+static int
+ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m)
+{
+       struct mbuf *mcl;
+       struct ip *ip;
+       /* XXX - libalias duct tape */
+       int ldt, retval;
+       char *c;
+
+       ldt = 0;
+       retval = 0;
+       mcl = m_megapullup(m, m->m_pkthdr.len);
+       if (mcl == NULL) {
+               args->m = NULL;
+               return (IP_FW_DENY);
+       }
+       ip = mtod(mcl, struct ip *);
+
+       /* 
+        * XXX - Libalias checksum offload 'duct tape':
+        * 
+        * locally generated packets have only pseudo-header checksum
+        * calculated and libalias will break it[1], so mark them for
+        * later fix.  Moreover there are cases when libalias modifies
+        * tcp packet data[2], mark them for later fix too.
+        *
+        * [1] libalias was never meant to run in kernel, so it does
+        * not have any knowledge about checksum offloading, and
+        * expects a packet with a full internet checksum.
+        * Unfortunately, packets generated locally will have just the
+        * pseudo header calculated, and when libalias tries to adjust
+        * the checksum it will actually compute a wrong value.
+        *
+        * [2] when libalias modifies tcp's data content, full TCP
+        * checksum has to be recomputed: the problem is that
+        * libalias does not have any idea about checksum offloading.
+        * To work around this, we do not do checksumming in LibAlias,
+        * but only mark the packets in th_x2 field. If we receive a
+        * marked packet, we calculate correct checksum for it
+        * aware of offloading.  Why such a terrible hack instead of
+        * recalculating checksum for each packet?
+        * Because the previous checksum was not checked!
+        * Recalculating checksums for EVERY packet will hide ALL
+        * transmission errors. Yes, marked packets still suffer from
+        * this problem. But, sigh, natd(8) has this problem, too.
+        *
+        * TODO: -make libalias mbuf aware (so
+        * it can handle delayed checksum and tso)
+        */
+
+       if (mcl->m_pkthdr.rcvif == NULL && 
+           mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
+               ldt = 1;
+
+       c = mtod(mcl, char *);
+       if (args->oif == NULL)
+               retval = LibAliasIn(t->lib, c, 
+                       mcl->m_len + M_TRAILINGSPACE(mcl));
+       else
+               retval = LibAliasOut(t->lib, c, 
+                       mcl->m_len + M_TRAILINGSPACE(mcl));
+       if (retval == PKT_ALIAS_RESPOND) {
+         m->m_flags |= M_SKIP_FIREWALL;
+         retval = PKT_ALIAS_OK;
+       }
+       if (retval != PKT_ALIAS_OK &&
+           retval != PKT_ALIAS_FOUND_HEADER_FRAGMENT) {
+               /* XXX - should i add some logging? */
+               m_free(mcl);
+               args->m = NULL;
+               return (IP_FW_DENY);
+       }
+       mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len);
+
+       /* 
+        * XXX - libalias checksum offload 
+        * 'duct tape' (see above) 
+        */
+
+       if ((ip->ip_off & htons(IP_OFFMASK)) == 0 && 
+           ip->ip_p == IPPROTO_TCP) {
+               struct tcphdr   *th; 
+
+               th = (struct tcphdr *)(ip + 1);
+               if (th->th_x2) 
+                       ldt = 1;
+       }
+
+       if (ldt) {
+               struct tcphdr   *th;
+               struct udphdr   *uh;
+               u_short cksum;
+
+               /* XXX check if ip_len can stay in net format */
+               cksum = in_pseudo(
+                   ip->ip_src.s_addr,
+                   ip->ip_dst.s_addr, 
+                   htons(ip->ip_p + ntohs(ip->ip_len) - (ip->ip_hl << 2))
+               );
+                                       
+               switch (ip->ip_p) {
+               case IPPROTO_TCP:
+                       th = (struct tcphdr *)(ip + 1);
+                       /* 
+                        * Maybe it was set in 
+                        * libalias... 
+                        */
+                       th->th_x2 = 0;
+                       th->th_sum = cksum;
+                       mcl->m_pkthdr.csum_data = 
+                           offsetof(struct tcphdr, th_sum);
+                       break;
+               case IPPROTO_UDP:
+                       uh = (struct udphdr *)(ip + 1);
+                       uh->uh_sum = cksum;
+                       mcl->m_pkthdr.csum_data = 
+                           offsetof(struct udphdr, uh_sum);
+                       break;                                          
+               }
+               /* No hw checksum offloading: do it ourselves */
+               if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) {
+                       in_delayed_cksum(mcl);
+                       mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+               }
+       }
+       args->m = mcl;
+       return (IP_FW_NAT);
+}
+
+static struct cfg_nat *
+lookup_nat(struct nat_list *l, int nat_id)
+{
+       struct cfg_nat *res;
+
+       LIST_FOREACH(res, l, _next) {
+               if (res->id == nat_id)
+                       break;
+       }
+       return res;
+}
+
+static int 
+ipfw_nat_cfg(struct sockopt *sopt)
+{
+       struct cfg_nat *ptr, *ser_n;
+       char *buf;
+       struct ip_fw_chain *chain = &V_layer3_chain;
+
+       buf = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO);
+       sooptcopyin(sopt, buf, NAT_BUF_LEN, sizeof(struct cfg_nat));
+       ser_n = (struct cfg_nat *)buf;
+
+       /* check valid parameter ser_n->id > 0 ? */
+       /* 
+        * Find/create nat rule.
+        */
+       IPFW_WLOCK(chain);
+       ptr = lookup_nat(&chain->nat, ser_n->id);
+       if (ptr == NULL) {
+               /* New rule: allocate and init new instance. */
+               ptr = malloc(sizeof(struct cfg_nat), 
+                   M_IPFW, M_NOWAIT | M_ZERO);
+               if (ptr == NULL) {
+                       IPFW_WUNLOCK(chain);
+                       free(buf, M_IPFW);
+                       return (ENOSPC);
+               }
+               ptr->lib = LibAliasInit(NULL);
+               if (ptr->lib == NULL) {
+                       IPFW_WUNLOCK(chain);
+                       free(ptr, M_IPFW);
+                       free(buf, M_IPFW);
+                       return (EINVAL);
+               }
+               LIST_INIT(&ptr->redir_chain);
+       } else {
+               /* Entry already present: temporarly unhook it. */
+               LIST_REMOVE(ptr, _next);
+               flush_nat_ptrs(chain, ser_n->id);
+       }
+       IPFW_WUNLOCK(chain);
+
+       /* 
+        * Basic nat configuration.
+        */
+       ptr->id = ser_n->id;
+       /* 
+        * XXX - what if this rule doesn't nat any ip and just 
+        * redirect? 
+        * do we set aliasaddress to 0.0.0.0?
+        */
+       ptr->ip = ser_n->ip;
+       ptr->redir_cnt = ser_n->redir_cnt;
+       ptr->mode = ser_n->mode;
+       LibAliasSetMode(ptr->lib, ser_n->mode, ser_n->mode);
+       LibAliasSetAddress(ptr->lib, ptr->ip);
+       memcpy(ptr->if_name, ser_n->if_name, IF_NAMESIZE);
+
+       /* 
+        * Redir and LSNAT configuration.
+        */
+       /* Delete old cfgs. */
+       del_redir_spool_cfg(ptr, &ptr->redir_chain);
+       /* Add new entries. */
+       add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr);
+       free(buf, M_IPFW);
+       IPFW_WLOCK(chain);
+       LIST_INSERT_HEAD(&chain->nat, ptr, _next);
+       IPFW_WUNLOCK(chain);
+       return (0);
+}
+
+static int
+ipfw_nat_del(struct sockopt *sopt)
+{
+       struct cfg_nat *ptr;
+       struct ip_fw_chain *chain = &V_layer3_chain;
+       int i;
+               
+       sooptcopyin(sopt, &i, sizeof i, sizeof i);
+       /* XXX validate i */
+       IPFW_WLOCK(chain);
+       ptr = lookup_nat(&chain->nat, i);
+       if (ptr == NULL) {
+               IPFW_WUNLOCK(chain);
+               return (EINVAL);
+       }
+       LIST_REMOVE(ptr, _next);
+       flush_nat_ptrs(chain, i);
+       IPFW_WUNLOCK(chain);
+       del_redir_spool_cfg(ptr, &ptr->redir_chain);
+       LibAliasUninit(ptr->lib);
+       free(ptr, M_IPFW);
+       return (0);
+}
+
+static int
+ipfw_nat_get_cfg(struct sockopt *sopt)
+{      
+       uint8_t *data;
+       struct cfg_nat *n;
+       struct cfg_redir *r;
+       struct cfg_spool *s;
+       int nat_cnt, off;
+       struct ip_fw_chain *chain;
+       int err = ENOSPC;
+               
+       chain = &V_layer3_chain;
+       nat_cnt = 0;
+       off = sizeof(nat_cnt);
+
+       data = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO);
+       IPFW_RLOCK(chain);
+       /* Serialize all the data. */
+       LIST_FOREACH(n, &chain->nat, _next) {
+               nat_cnt++;
+               if (off + SOF_NAT >= NAT_BUF_LEN)
+                       goto nospace;
+                       bcopy(n, &data[off], SOF_NAT);
+                       off += SOF_NAT;
+                       LIST_FOREACH(r, &n->redir_chain, _next) {
+                       if (off + SOF_REDIR >= NAT_BUF_LEN)
+                               goto nospace;
+                       bcopy(r, &data[off], SOF_REDIR);
+                                       off += SOF_REDIR;
+                       LIST_FOREACH(s, &r->spool_chain, _next) {
+                               if (off + SOF_SPOOL >= NAT_BUF_LEN)
+                                                       goto nospace;
+                               bcopy(s, &data[off], SOF_SPOOL);
+                               off += SOF_SPOOL;
+                                       }
+                       }
+       }
+       err = 0; /* all good */
+nospace:
+       IPFW_RUNLOCK(chain);
+       if (err == 0) {
+       bcopy(&nat_cnt, data, sizeof(nat_cnt));
+       sooptcopyout(sopt, data, NAT_BUF_LEN);
+       } else {
+       printf("serialized data buffer not big enough:"
+           "please increase NAT_BUF_LEN\n");
+       }
+       free(data, M_IPFW);
+       return (err);
+}
+
+static int
+ipfw_nat_get_log(struct sockopt *sopt)
+{
+       uint8_t *data;
+       struct cfg_nat *ptr;
+       int i, size;
+       struct ip_fw_chain *chain;
+
+       chain = &V_layer3_chain;
+
+       IPFW_RLOCK(chain);
+       /* one pass to count, one to copy the data */
+       i = 0;
+       LIST_FOREACH(ptr, &chain->nat, _next) {
+               if (ptr->lib->logDesc == NULL) 
+                       continue;
+               i++;
+       }
+       size = i * (LIBALIAS_BUF_SIZE + sizeof(int));
+       data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO);
+               if (data == NULL) {
+               IPFW_RUNLOCK(chain);
+                       return (ENOSPC);
+               }
+       i = 0;
+       LIST_FOREACH(ptr, &chain->nat, _next) {
+               if (ptr->lib->logDesc == NULL)
+                       continue;
+               bcopy(&ptr->id, &data[i], sizeof(int));
+               i += sizeof(int);
+               bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE);
+               i += LIBALIAS_BUF_SIZE;
+       }
+       IPFW_RUNLOCK(chain);
+       sooptcopyout(sopt, data, size);
+       free(data, M_IPFW);
+       return(0);
+}
+
+static void
+ipfw_nat_init(void)
+{
+
+       IPFW_WLOCK(&V_layer3_chain);
+       /* init ipfw hooks */
+       ipfw_nat_ptr = ipfw_nat;
+       lookup_nat_ptr = lookup_nat;
+       ipfw_nat_cfg_ptr = ipfw_nat_cfg;
+       ipfw_nat_del_ptr = ipfw_nat_del;
+       ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg;
+       ipfw_nat_get_log_ptr = ipfw_nat_get_log;
+       IPFW_WUNLOCK(&V_layer3_chain);
+       V_ifaddr_event_tag = EVENTHANDLER_REGISTER(
+           ifaddr_event, ifaddr_change,
+           NULL, EVENTHANDLER_PRI_ANY);
+}
+
+static void
+ipfw_nat_destroy(void)
+{
+       struct cfg_nat *ptr, *ptr_temp;
+       struct ip_fw_chain *chain;
+       
+       chain = &V_layer3_chain;
+       IPFW_WLOCK(chain);
+       LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) {
+               LIST_REMOVE(ptr, _next);
+               del_redir_spool_cfg(ptr, &ptr->redir_chain);
+               LibAliasUninit(ptr->lib);
+               free(ptr, M_IPFW);
+       }
+       EVENTHANDLER_DEREGISTER(ifaddr_event, V_ifaddr_event_tag);
+       flush_nat_ptrs(chain, -1 /* flush all */);
+       /* deregister ipfw_nat */
+       ipfw_nat_ptr = NULL;
+       lookup_nat_ptr = NULL;
+       ipfw_nat_cfg_ptr = NULL;
+       ipfw_nat_del_ptr = NULL;
+       ipfw_nat_get_cfg_ptr = NULL;
+       ipfw_nat_get_log_ptr = NULL;
+       IPFW_WUNLOCK(chain);
+}
+
+static int
+ipfw_nat_modevent(module_t mod, int type, void *unused)
+{
+       int err = 0;
+
+       switch (type) {
+       case MOD_LOAD:
+               ipfw_nat_init();
+               break;
+
+       case MOD_UNLOAD:
+               ipfw_nat_destroy();
+               break;
+
+       default:
+               return EOPNOTSUPP;
+               break;
+       }
+       return err;
+}
+
+static moduledata_t ipfw_nat_mod = {
+       "ipfw_nat",
+       ipfw_nat_modevent,
+       0
+};
+
+DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
+MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1);
+MODULE_DEPEND(ipfw_nat, ipfw, 2, 2, 2);
+MODULE_VERSION(ipfw_nat, 1);
+/* end of file */
diff --git a/dummynet2/ip_fw_pfil.c b/dummynet2/ip_fw_pfil.c
new file mode 100644 (file)
index 0000000..db7cec6
--- /dev/null
@@ -0,0 +1,410 @@
+/*-
+ * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_pfil.c 200601 2009-12-16 10:48:40Z luigi $");
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif /* KLD_MODULE */
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/pfil.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ip_dummynet.h>
+#include <netgraph/ng_ipfw.h>
+
+#include <machine/in_cksum.h>
+
+static VNET_DEFINE(int, fw_enable) = 1;
+#define V_fw_enable    VNET(fw_enable)
+
+#ifdef INET6
+static VNET_DEFINE(int, fw6_enable) = 1;
+#define V_fw6_enable   VNET(fw6_enable)
+#endif
+
+int ipfw_chg_hook(SYSCTL_HANDLER_ARGS);
+
+/* Divert hooks. */
+void (*ip_divert_ptr)(struct mbuf *m, int incoming);
+
+/* ng_ipfw hooks. */
+ng_ipfw_input_t *ng_ipfw_input_p = NULL;
+
+/* Forward declarations. */
+static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int);
+
+#ifdef SYSCTL_NODE
+SYSCTL_DECL(_net_inet_ip_fw);
+SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0,
+    ipfw_chg_hook, "I", "Enable ipfw");
+#ifdef INET6
+SYSCTL_DECL(_net_inet6_ip6_fw);
+SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0,
+    ipfw_chg_hook, "I", "Enable ipfw+6");
+#endif /* INET6 */
+#endif /* SYSCTL_NODE */
+
+/*
+ * The pfilter hook to pass packets to ipfw_chk and then to
+ * dummynet, divert, netgraph or other modules.
+ * The packet may be consumed.
+ */            
+int
+ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
+    struct inpcb *inp)
+{
+       struct ip_fw_args args;
+       struct m_tag *tag;
+       int ipfw;
+       int ret;
+
+       /* all the processing now uses ip_len in net format */
+       SET_NET_IPLEN(mtod(*m0, struct ip *));
+
+       /* convert dir to IPFW values */
+       dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT;
+       bzero(&args, sizeof(args));
+
+again:
+       /*
+        * extract and remove the tag if present. If we are left
+        * with onepass, optimize the outgoing path.
+        */
+       tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL);
+       if (tag != NULL) {
+               args.rule = *((struct ipfw_rule_ref *)(tag+1));
+               m_tag_delete(*m0, tag);
+               if (args.rule.info & IPFW_ONEPASS) {
+                       SET_HOST_IPLEN(mtod(*m0, struct ip *));
+                       return 0;
+               }
+       }
+
+       args.m = *m0;
+       args.oif = dir == DIR_OUT ? ifp : NULL;
+       args.inp = inp;
+
+       ipfw = ipfw_chk(&args);
+       *m0 = args.m;
+
+       KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL",
+           __func__));
+
+       /* breaking out of the switch means drop */
+       ret = 0;        /* default return value for pass */
+       switch (ipfw) {
+       case IP_FW_PASS:
+               /* next_hop may be set by ipfw_chk */
+               if (args.next_hop == NULL)
+                       break; /* pass */
+#ifndef IPFIREWALL_FORWARD
+               ret = EACCES;
+#else
+           {
+               struct m_tag *fwd_tag;
+
+               /* Incoming packets should not be tagged so we do not
+                * m_tag_find. Outgoing packets may be tagged, so we
+                * reuse the tag if present.
+                */
+               fwd_tag = (dir == DIR_IN) ? NULL :
+                       m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL);
+               if (fwd_tag != NULL) {
+                       m_tag_unlink(*m0, fwd_tag);
+               } else {
+                       fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD,
+                               sizeof(struct sockaddr_in), M_NOWAIT);
+                       if (fwd_tag == NULL) {
+                               ret = EACCES;
+                               break; /* i.e. drop */
+                       }
+               }
+               bcopy(args.next_hop, (fwd_tag+1), sizeof(struct sockaddr_in));
+               m_tag_prepend(*m0, fwd_tag);
+
+               if (in_localip(args.next_hop->sin_addr))
+                       (*m0)->m_flags |= M_FASTFWD_OURS;
+           }
+#endif
+               break;
+
+       case IP_FW_DENY:
+               ret = EACCES;
+               break; /* i.e. drop */
+
+       case IP_FW_DUMMYNET:
+               ret = EACCES;
+               if (ip_dn_io_ptr == NULL)
+                       break; /* i.e. drop */
+               if (mtod(*m0, struct ip *)->ip_v == 4)
+                       ret = ip_dn_io_ptr(m0, dir, &args);
+               else if (mtod(*m0, struct ip *)->ip_v == 6)
+                       ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args);
+               else
+                       break; /* drop it */
+               /*
+                * XXX should read the return value.
+                * dummynet normally eats the packet and sets *m0=NULL
+                * unless the packet can be sent immediately. In this
+                * case args is updated and we should re-run the
+                * check without clearing args.
+                */
+               if (*m0 != NULL)
+                       goto again;
+               break;
+
+       case IP_FW_TEE:
+       case IP_FW_DIVERT:
+               if (ip_divert_ptr == NULL) {
+                       ret = EACCES;
+                       break; /* i.e. drop */
+               }
+               ret = ipfw_divert(m0, dir, &args.rule,
+                       (ipfw == IP_FW_TEE) ? 1 : 0);
+               /* continue processing for the original packet (tee). */
+               if (*m0)
+                       goto again;
+               break;
+
+       case IP_FW_NGTEE:
+       case IP_FW_NETGRAPH:
+               if (!NG_IPFW_LOADED) {
+                       ret = EACCES;
+                       break; /* i.e. drop */
+               }
+               ret = ng_ipfw_input_p(m0, dir, &args,
+                       (ipfw == IP_FW_NGTEE) ? 1 : 0);
+               if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */
+                       goto again;     /* continue with packet */
+               break;
+               
+       case IP_FW_NAT:
+       case IP_FW_REASS:
+               goto again;             /* continue with packet */
+       
+       default:
+               KASSERT(0, ("%s: unknown retval", __func__));
+       }
+
+       if (ret != 0) {
+               if (*m0)
+                       FREE_PKT(*m0);
+               *m0 = NULL;
+       }
+       if (*m0)
+               SET_HOST_IPLEN(mtod(*m0, struct ip *));
+       return ret;
+}
+
+/* do the divert, return 1 on error 0 on success */
+static int
+ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule,
+       int tee)
+{
+       /*
+        * ipfw_chk() has already tagged the packet with the divert tag.
+        * If tee is set, copy packet and return original.
+        * If not tee, consume packet and send it to divert socket.
+        */
+       struct mbuf *clone;
+       struct ip *ip;
+       struct m_tag *tag;
+
+       /* Cloning needed for tee? */
+       if (tee == 0) {
+               clone = *m0;    /* use the original mbuf */
+               *m0 = NULL;
+       } else {
+               clone = m_dup(*m0, M_DONTWAIT);
+               /* If we cannot duplicate the mbuf, we sacrifice the divert
+                * chain and continue with the tee-ed packet.
+                */
+               if (clone == NULL)
+                       return 1;
+       }
+
+       /*
+        * Divert listeners can normally handle non-fragmented packets,
+        * but we can only reass in the non-tee case.
+        * This means that listeners on a tee rule may get fragments,
+        * and have to live with that.
+        * Note that we now have the 'reass' ipfw option so if we care
+        * we can do it before a 'tee'.
+        */
+       ip = mtod(clone, struct ip *);
+       if (!tee && ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) {
+               int hlen;
+               struct mbuf *reass;
+
+               SET_HOST_IPLEN(ip); /* ip_reass wants host order */
+               reass = ip_reass(clone); /* Reassemble packet. */
+               if (reass == NULL)
+                       return 0; /* not an error */
+               /* if reass = NULL then it was consumed by ip_reass */
+               /*
+                * IP header checksum fixup after reassembly and leave header
+                * in network byte order.
+                */
+               ip = mtod(reass, struct ip *);
+               hlen = ip->ip_hl << 2;
+               SET_NET_IPLEN(ip);
+               ip->ip_sum = 0;
+               if (hlen == sizeof(struct ip))
+                       ip->ip_sum = in_cksum_hdr(ip);
+               else
+                       ip->ip_sum = in_cksum(reass, hlen);
+               clone = reass;
+       }
+       /* attach a tag to the packet with the reinject info */
+       tag = m_tag_alloc(MTAG_IPFW_RULE, 0,
+                   sizeof(struct ipfw_rule_ref), M_NOWAIT);
+       if (tag == NULL) {
+               FREE_PKT(clone);
+               return 1;
+       }
+       *((struct ipfw_rule_ref *)(tag+1)) = *rule;
+       m_tag_prepend(clone, tag);
+
+       /* Do the dirty job... */
+       ip_divert_ptr(clone, incoming);
+       return 0;
+}
+
+/*
+ * attach or detach hooks for a given protocol family
+ */
+static int
+ipfw_hook(int onoff, int pf)
+{
+       struct pfil_head *pfh;
+
+       pfh = pfil_head_get(PFIL_TYPE_AF, pf);
+       if (pfh == NULL)
+               return ENOENT;
+
+       (void) (onoff ? pfil_add_hook : pfil_remove_hook)
+           (ipfw_check_hook, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh);
+
+       return 0;
+}
+
+int
+ipfw_attach_hooks(int arg)
+{
+       int error = 0;
+
+       if (arg == 0) /* detach */
+               ipfw_hook(0, AF_INET);
+       else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) {
+                error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */
+                printf("ipfw_hook() error\n");
+        }
+#ifdef INET6
+       if (arg == 0) /* detach */
+               ipfw_hook(0, AF_INET6);
+       else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) {
+                error = ENOENT;
+                printf("ipfw6_hook() error\n");
+        }
+#endif
+       return error;
+}
+
+int
+ipfw_chg_hook(SYSCTL_HANDLER_ARGS)
+{
+       int enable;
+       int oldenable;
+       int error;
+       int af;
+
+       if (arg1 == &VNET_NAME(fw_enable)) {
+               enable = V_fw_enable;
+               af = AF_INET;
+       }
+#ifdef INET6
+       else if (arg1 == &VNET_NAME(fw6_enable)) {
+               enable = V_fw6_enable;
+               af = AF_INET6;
+       }
+#endif
+       else 
+               return (EINVAL);
+
+       oldenable = enable;
+
+       error = sysctl_handle_int(oidp, &enable, 0, req);
+
+       if (error)
+               return (error);
+
+       enable = (enable) ? 1 : 0;
+
+       if (enable == oldenable)
+               return (0);
+
+       error = ipfw_hook(enable, af);
+       if (error)
+               return (error);
+       if (af == AF_INET)
+               V_fw_enable = enable;
+#ifdef INET6
+       else if (af == AF_INET6)
+               V_fw6_enable = enable;
+#endif
+
+       return (0);
+}
+/* end of file */
diff --git a/dummynet2/ip_fw_sockopt.c b/dummynet2/ip_fw_sockopt.c
new file mode 100644 (file)
index 0000000..086d7f0
--- /dev/null
@@ -0,0 +1,1086 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Supported by: Valeria Paoli
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_sockopt.c 200601 2009-12-16 10:48:40Z luigi $");
+
+/*
+ * Sockopt support for ipfw. The routines here implement
+ * the upper half of the ipfw code.
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>  /* struct m_tag used by nested headers */
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <net/if.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
+
+/*
+ * static variables followed by global ones (none in this file)
+ */
+
+/*
+ * Find the smallest rule >= key, id.
+ * We could use bsearch but it is so simple that we code it directly
+ */
+int
+ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id)
+{
+       int i, lo, hi;
+       struct ip_fw *r;
+
+       /* classic binary search; chain->map is sorted by (rulenum, id) */
+       for (lo = 0, hi = chain->n_rules - 1; lo < hi;) {
+               i = (lo + hi) / 2;
+               r = chain->map[i];
+               if (r->rulenum < key)
+                       lo = i + 1;     /* continue from the next one */
+               else if (r->rulenum > key)
+                       hi = i;         /* this might be good */
+               else if (r->id < id)
+                       lo = i + 1;     /* continue from the next one */
+               else /* r->id >= id */
+                       hi = i;         /* this might be good */
+       };      /* XXX stray ';' -- harmless empty statement */
+       return hi;      /* index of the smallest rule >= (key, id) */
+}
+
+/*
+ * allocate a new map, returns the chain locked. extra is the number
+ * of entries to add or delete.
+ */
+/*
+ * Allocate a rule map sized for chain->n_rules + extra entries
+ * (extra may be negative when deleting rules).  On success the
+ * chain is returned IPFW_UH_WLOCKed -- taken here unless 'locked'
+ * says the caller already holds it.  Because malloc() may sleep,
+ * n_rules is re-checked under the lock and the allocation retried
+ * if another writer grew the ruleset in the meantime.
+ * NOTE(review): with M_WAITOK a kernel malloc should not return
+ * NULL; the NULL check matters mainly for this userland port.
+ */
+static struct ip_fw **
+get_map(struct ip_fw_chain *chain, int extra, int locked)
+{
+
+       for (;;) {
+               struct ip_fw **map;
+               int i;
+
+               i = chain->n_rules + extra;
+               map = malloc(i * sizeof(struct ip_fw *), M_IPFW, M_WAITOK);
+               if (map == NULL) {
+                       printf("%s: cannot allocate map\n", __FUNCTION__);
+                       return NULL;
+               }
+               if (!locked)
+                       IPFW_UH_WLOCK(chain);
+               if (i >= chain->n_rules + extra) /* good */
+                       return map;
+               /* otherwise we lost the race, free and retry */
+               if (!locked)
+                       IPFW_UH_WUNLOCK(chain);
+               free(map, M_IPFW);
+       }
+}
+
+/*
+ * swap the maps. It is supposed to be called with IPFW_UH_WLOCK
+ */
+static struct ip_fw **
+swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len)
+{
+       struct ip_fw **prev;
+
+       IPFW_WLOCK(chain);
+       prev = chain->map;              /* remember the map being replaced */
+       chain->map = new_map;
+       chain->n_rules = new_len;
+       chain->id++;                    /* cached rule references are now stale */
+       IPFW_WUNLOCK(chain);
+       return prev;                    /* caller frees this after unlocking */
+}
+
+/*
+ * Add a new rule to the list. Copy the rule into a malloc'ed area, then
+ * possibly create a rule number and add the rule to the list.
+ * Update the rule_number in the input struct so the caller knows it as well.
+ * XXX DO NOT USE FOR THE DEFAULT RULE.
+ * Must be called without IPFW_UH held
+ */
+int
+ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
+{
+       struct ip_fw *rule;
+       int i, l, insert_before;
+       struct ip_fw **map;     /* the new array of pointers */
+
+       /* reject rule numbers at or above the default rule */
+       if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1)
+               return (EINVAL);
+
+       l = RULESIZE(input_rule);
+       rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO);
+       if (rule == NULL)
+               return (ENOSPC);
+       /* get_map returns with IPFW_UH_WLOCK if successful */
+       map = get_map(chain, 1, 0 /* not locked */);
+       if (map == NULL) {
+               free(rule, M_IPFW);
+               return ENOSPC;
+       }
+
+       bcopy(input_rule, rule, l);
+       /* clear fields not settable from userland */
+       rule->x_next = NULL;
+       rule->next_rule = NULL;
+       rule->pcnt = 0;
+       rule->bcnt = 0;
+       rule->timestamp = 0;
+
+       /* keep the auto-numbering step within a sane range */
+       if (V_autoinc_step < 1)
+               V_autoinc_step = 1;
+       else if (V_autoinc_step > 1000)
+               V_autoinc_step = 1000;
+       /* find the insertion point, we will insert before */
+       insert_before = rule->rulenum ? rule->rulenum + 1 : IPFW_DEFAULT_RULE;
+       i = ipfw_find_rule(chain, insert_before, 0);
+       /* duplicate first part */
+       if (i > 0)
+               bcopy(chain->map, map, i * sizeof(struct ip_fw *));
+       map[i] = rule;
+       /* duplicate remaining part, we always have the default rule */
+       bcopy(chain->map + i, map + i + 1,
+               sizeof(struct ip_fw *) *(chain->n_rules - i));
+       if (rule->rulenum == 0) {
+               /* auto-assign: previous rule's number plus the step,
+                * written back so the caller learns it too */
+               rule->rulenum = i > 0 ? map[i-1]->rulenum : 0;
+               if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step)
+                       rule->rulenum += V_autoinc_step;
+               input_rule->rulenum = rule->rulenum;
+       }
+
+       rule->id = chain->id + 1;       /* matches chain->id after the swap */
+       /* publish the new map; the old one comes back for freeing */
+       map = swap_map(chain, map, chain->n_rules + 1);
+       chain->static_len += l;
+       IPFW_UH_WUNLOCK(chain);
+       if (map)
+               free(map, M_IPFW);      /* free the old map outside the lock */
+       return (0);
+}
+
+/*
+ * Reclaim storage associated with a list of rules.  This is
+ * typically the list created using remove_rule.
+ * A NULL pointer on input is handled correctly.
+ */
+void
+ipfw_reap_rules(struct ip_fw *head)
+{
+       struct ip_fw *next;
+
+       /* walk the x_next chain, releasing each rule in turn */
+       for (; head != NULL; head = next) {
+               next = head->x_next;
+               free(head, M_IPFW);
+       }
+}
+
+/**
+ * Remove all rules with given number, and also do set manipulation.
+ * Assumes chain != NULL && *chain != NULL.
+ *
+ * The argument is a u_int32_t. The low 16 bits are the rule or set number,
+ * the next 8 bits are the new set, the top 8 bits are the command:
+ *
+ *     0       delete rules with given number
+ *     1       delete rules with given set number
+ *     2       move rules with given number to new set
+ *     3       move rules with given set number to new set
+ *     4       swap sets with given numbers
+ *     5       delete rules with given number and with given set number
+ */
+static int
+del_entry(struct ip_fw_chain *chain, u_int32_t arg)
+{
+       struct ip_fw *rule;
+       uint32_t rulenum;       /* rule or old_set */
+       uint8_t cmd, new_set;
+       int start, end = 0, i, ofs, n;
+       struct ip_fw **map = NULL;
+       int error = 0;
+
+       /* decode the argument: low 16 bits rule/set, then set, then cmd */
+       rulenum = arg & 0xffff;
+       cmd = (arg >> 24) & 0xff;
+       new_set = (arg >> 16) & 0xff;
+
+       if (cmd > 5 || new_set > RESVD_SET)
+               return EINVAL;
+       if (cmd == 0 || cmd == 2 || cmd == 5) {
+               if (rulenum >= IPFW_DEFAULT_RULE)
+                       return EINVAL;
+       } else {
+               if (rulenum > RESVD_SET)        /* old_set */
+                       return EINVAL;
+       }
+
+       IPFW_UH_WLOCK(chain); /* prevent conflicts among the writers */
+       chain->reap = NULL;     /* prepare for deletions */
+
+       switch (cmd) {
+       case 0: /* delete rules with given number (0 is special means all) */
+       case 1: /* delete all rules with given set number, rule->set == rulenum */
+       case 5: /* delete rules with given number and with given set number.
+                * rulenum - given rule number;
+                * new_set - given set number.
+                */
+               /* locate first rule to delete (start), the one after the
+                * last one (end), and count how many rules to delete (n)
+                */
+               n = 0;
+               if (cmd == 1) { /* look for a specific set, must scan all */
+                       /* XXX fixed: this loop used chain->map[start], an
+                        * out-of-bounds access with start == -1 which also
+                        * never matched the set; index with i instead.
+                        */
+                       for (start = -1, i = 0; i < chain->n_rules; i++) {
+                               if (chain->map[i]->set != rulenum)
+                                       continue;
+                               if (start < 0)
+                                       start = i;
+                               end = i;
+                               n++;
+                       }
+                       end++;  /* first non-matching */
+               } else {
+                       start = ipfw_find_rule(chain, rulenum, 0);
+                       for (end = start; end < chain->n_rules; end++) {
+                               rule = chain->map[end];
+                               if (rulenum > 0 && rule->rulenum != rulenum)
+                                       break;
+                               if (rule->set != RESVD_SET &&
+                                   (cmd == 0 || rule->set == new_set) )
+                                       n++;
+                       }
+               }
+               if (n == 0 && arg == 0)
+                       break; /* special case, flush on empty ruleset */
+               /* allocate the map, if needed */
+               if (n > 0)
+                       map = get_map(chain, -n, 1 /* locked */);
+               if (n == 0 || map == NULL) {
+                       error = EINVAL;
+                       break;
+               }
+               /* copy the initial part of the map */
+               if (start > 0)
+                       bcopy(chain->map, map, start * sizeof(struct ip_fw *));
+               /* copy active rules between start and end */
+               /* NOTE(review): for cmd 1 this keeps rules whose set differs
+                * from new_set although the counting loop above matched on
+                * rulenum; verify userland encodes the set in both fields.
+                */
+               for (i = ofs = start; i < end; i++) {
+                       rule = chain->map[i];
+                       if (!(rule->set != RESVD_SET &&
+                           (cmd == 0 || rule->set == new_set) ))
+                               map[ofs++] = chain->map[i];
+               }
+               /* finally the tail */
+               bcopy(chain->map + end, map + ofs,
+                       (chain->n_rules - end) * sizeof(struct ip_fw *));
+               map = swap_map(chain, map, chain->n_rules - n);
+               /* now remove the rules deleted: queue them on chain->reap
+                * so they are freed after the locks are dropped */
+               for (i = start; i < end; i++) {
+                       rule = map[i];
+                       if (rule->set != RESVD_SET &&
+                           (cmd == 0 || rule->set == new_set) ) {
+                               int l = RULESIZE(rule);
+
+                               chain->static_len -= l;
+                               ipfw_remove_dyn_children(rule);
+                               rule->x_next = chain->reap;
+                               chain->reap = rule;
+                       }
+               }
+               break;
+
+       /* XXX fixed: cases 2/3/4 re-acquired and released IPFW_UH_WLOCK
+        * although it is already held since the top of this function --
+        * a double acquire (deadlock on a non-recursive write lock) and
+        * a premature release.  The outer lock alone is sufficient.
+        */
+       case 2: /* move rules with given number to new set */
+               for (i = 0; i < chain->n_rules; i++) {
+                       rule = chain->map[i];
+                       if (rule->rulenum == rulenum)
+                               rule->set = new_set;
+               }
+               break;
+
+       case 3: /* move rules with given set number to new set */
+               for (i = 0; i < chain->n_rules; i++) {
+                       rule = chain->map[i];
+                       if (rule->set == rulenum)
+                               rule->set = new_set;
+               }
+               break;
+
+       case 4: /* swap two sets */
+               for (i = 0; i < chain->n_rules; i++) {
+                       rule = chain->map[i];
+                       if (rule->set == rulenum)
+                               rule->set = new_set;
+                       else if (rule->set == new_set)
+                               rule->set = rulenum;
+               }
+               break;
+       }
+       /* reap the deleted rules (if any) outside of the locks */
+       rule = chain->reap;
+       chain->reap = NULL;
+       IPFW_UH_WUNLOCK(chain);
+       ipfw_reap_rules(rule);
+       if (map)
+               free(map, M_IPFW);
+       return error;
+}
+
+/*
+ * Clear counters for a specific rule.
+ * Normally run under IPFW_UH_RLOCK, but these are idempotent ops
+ * so we only care that rules do not disappear.
+ */
+static void
+clear_counters(struct ip_fw *rule, int log_only)
+{
+       /* the action part may or may not start with an O_LOG insn */
+       ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
+
+       if (log_only == 0) {
+               rule->bcnt = rule->pcnt = 0;
+               rule->timestamp = 0;
+       }
+       if (l->o.opcode == O_LOG)
+               l->log_left = l->max_log;       /* refill the logging budget */
+}
+
+/**
+ * Reset some or all counters on firewall rules.
+ * The argument `arg' is a u_int32_t. The low 16 bits are the rule number,
+ * the next 8 bits are the set number, the top 8 bits are the command:
+ *     0       work with rules from all sets;
+ *     1       work with rules only from specified set.
+ * Specified rule number is zero if we want to clear all entries.
+ * log_only is 1 if we only want to reset logs, zero otherwise.
+ */
+static int
+zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
+{
+       struct ip_fw *rule;
+       char *msg;
+       int i;
+
+       uint16_t rulenum = arg & 0xffff;
+       uint8_t set = (arg >> 16) & 0xff;
+       uint8_t cmd = (arg >> 24) & 0xff;
+
+       if (cmd > 1)
+               return (EINVAL);
+       if (cmd == 1 && set > RESVD_SET)
+               return (EINVAL);
+
+       /* counters are only reset, never reallocated, so the UH read
+        * lock is enough to keep the rules from disappearing */
+       IPFW_UH_RLOCK(chain);
+       if (rulenum == 0) {
+               /* clear everything, optionally restricted to one set */
+               V_norule_counter = 0;
+               for (i = 0; i < chain->n_rules; i++) {
+                       rule = chain->map[i];
+                       /* Skip rules not in our set. */
+                       if (cmd == 1 && rule->set != set)
+                               continue;
+                       clear_counters(rule, log_only);
+               }
+               msg = log_only ? "All logging counts reset" :
+                   "Accounting cleared";
+       } else {
+               int cleared = 0;
+
+               /* the map is sorted by rulenum, so we can stop early */
+               for (i = 0; i < chain->n_rules; i++) {
+                       rule = chain->map[i];
+                       if (rule->rulenum == rulenum) {
+                               if (cmd == 0 || rule->set == set)
+                                       clear_counters(rule, log_only);
+                               cleared = 1;
+                       }
+                       if (rule->rulenum > rulenum)
+                               break;
+               }
+               if (!cleared) { /* we did not find any matching rules */
+                       /* XXX fixed: this path called IPFW_WUNLOCK(),
+                        * releasing a lock we do not hold and leaking
+                        * the UH read lock taken above.
+                        */
+                       IPFW_UH_RUNLOCK(chain);
+                       return (EINVAL);
+               }
+               msg = log_only ? "logging count reset" : "cleared";
+       }
+       IPFW_UH_RUNLOCK(chain);
+
+       if (V_fw_verbose) {
+               int lev = LOG_SECURITY | LOG_NOTICE;
+
+               if (rulenum)
+                       log(lev, "ipfw: Entry %d %s.\n", rulenum, msg);
+               else
+                       log(lev, "ipfw: %s.\n", msg);
+       }
+       return (0);
+}
+
+/*
+ * Check validity of the structure before insert.
+ * Rules are simple, so this mostly need to check rule sizes.
+ */
+static int
+check_ipfw_struct(struct ip_fw *rule, int size)
+{
+       int l, cmdlen = 0;
+       int have_action=0;      /* exactly one action opcode is required */
+       ipfw_insn *cmd;
+
+       /* basic sanity: the buffer must hold at least a minimal rule */
+       if (size < sizeof(*rule)) {
+               printf("ipfw: rule too short\n");
+               return (EINVAL);
+       }
+       /* first, check for valid size */
+       l = RULESIZE(rule);
+       if (l != size) {
+               printf("ipfw: size mismatch (have %d want %d)\n", size, l);
+               return (EINVAL);
+       }
+       if (rule->act_ofs >= rule->cmd_len) {
+               printf("ipfw: bogus action offset (%u > %u)\n",
+                   rule->act_ofs, rule->cmd_len - 1);
+               return (EINVAL);
+       }
+       /*
+        * Now go for the individual checks. Very simple ones, basically only
+        * instruction sizes.
+        */
+       for (l = rule->cmd_len, cmd = rule->cmd ;
+                       l > 0 ; l -= cmdlen, cmd += cmdlen) {
+               cmdlen = F_LEN(cmd);
+               if (cmdlen > l) {
+                       printf("ipfw: opcode %d size truncated\n",
+                           cmd->opcode);
+                       return EINVAL;
+               }
+               switch (cmd->opcode) {
+               /* single-word match opcodes */
+               case O_PROBE_STATE:
+               case O_KEEP_STATE:
+               case O_PROTO:
+               case O_IP_SRC_ME:
+               case O_IP_DST_ME:
+               case O_LAYER2:
+               case O_IN:
+               case O_FRAG:
+               case O_DIVERTED:
+               case O_IPOPT:
+               case O_IPTOS:
+               case O_IPPRECEDENCE:
+               case O_IPVER:
+               case O_TCPWIN:
+               case O_TCPFLAGS:
+               case O_TCPOPTS:
+               case O_ESTAB:
+               case O_VERREVPATH:
+               case O_VERSRCREACH:
+               case O_ANTISPOOF:
+               case O_IPSEC:
+#ifdef INET6
+               case O_IP6_SRC_ME:
+               case O_IP6_DST_ME:
+               case O_EXT_HDR:
+               case O_IP6:
+#endif
+               case O_IP4:
+               case O_TAG:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn))
+                               goto bad_size;
+                       break;
+
+               case O_FIB:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn))
+                               goto bad_size;
+                       if (cmd->arg1 >= rt_numfibs) {
+                               printf("ipfw: invalid fib number %d\n",
+                                       cmd->arg1);
+                               return EINVAL;
+                       }
+                       break;
+
+               case O_SETFIB:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn))
+                               goto bad_size;
+                       if (cmd->arg1 >= rt_numfibs) {
+                               printf("ipfw: invalid fib number %d\n",
+                                       cmd->arg1);
+                               return EINVAL;
+                       }
+                       goto check_action;
+
+               case O_UID:
+               case O_GID:
+               case O_JAIL:
+               case O_IP_SRC:
+               case O_IP_DST:
+               case O_TCPSEQ:
+               case O_TCPACK:
+               case O_PROB:
+               case O_ICMPTYPE:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
+                               goto bad_size;
+                       break;
+
+               case O_LIMIT:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
+                               goto bad_size;
+                       break;
+
+               case O_LOG:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
+                               goto bad_size;
+
+                       /* initialize the per-rule log budget */
+                       ((ipfw_insn_log *)cmd)->log_left =
+                           ((ipfw_insn_log *)cmd)->max_log;
+
+                       break;
+
+               case O_IP_SRC_MASK:
+               case O_IP_DST_MASK:
+                       /* only odd command lengths */
+                       if ( !(cmdlen & 1) || cmdlen > 31)
+                               goto bad_size;
+                       break;
+
+               case O_IP_SRC_SET:
+               case O_IP_DST_SET:
+                       if (cmd->arg1 == 0 || cmd->arg1 > 256) {
+                               printf("ipfw: invalid set size %d\n",
+                                       cmd->arg1);
+                               return EINVAL;
+                       }
+                       /* length is header plus one bit per address */
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
+                           (cmd->arg1+31)/32 )
+                               goto bad_size;
+                       break;
+
+               case O_IP_SRC_LOOKUP:
+               case O_IP_DST_LOOKUP:
+                       if (cmd->arg1 >= IPFW_TABLES_MAX) {
+                               printf("ipfw: invalid table number %d\n",
+                                   cmd->arg1);
+                               return (EINVAL);
+                       }
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn) &&
+                           cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 &&
+                           cmdlen != F_INSN_SIZE(ipfw_insn_u32))
+                               goto bad_size;
+                       break;
+
+               case O_MACADDR2:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
+                               goto bad_size;
+                       break;
+
+               /* variable-length list opcodes */
+               case O_NOP:
+               case O_IPID:
+               case O_IPTTL:
+               case O_IPLEN:
+               case O_TCPDATALEN:
+               case O_TAGGED:
+                       if (cmdlen < 1 || cmdlen > 31)
+                               goto bad_size;
+                       break;
+
+               case O_MAC_TYPE:
+               case O_IP_SRCPORT:
+               case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
+                       if (cmdlen < 2 || cmdlen > 31)
+                               goto bad_size;
+                       break;
+
+               case O_RECV:
+               case O_XMIT:
+               case O_VIA:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
+                               goto bad_size;
+                       break;
+
+               case O_ALTQ:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_altq))
+                               goto bad_size;
+                       break;
+
+               case O_PIPE:
+               case O_QUEUE:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn))
+                               goto bad_size;
+                       goto check_action;
+
+               case O_FORWARD_IP:
+#ifdef IPFIREWALL_FORWARD
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_sa))
+                               goto bad_size;
+                       goto check_action;
+#else
+                       return EINVAL;
+#endif
+
+               /* actions depending on optional kernel components */
+               case O_DIVERT:
+               case O_TEE:
+                       if (ip_divert_ptr == NULL)
+                               return EINVAL;
+                       else
+                               goto check_size;
+               case O_NETGRAPH:
+               case O_NGTEE:
+                       if (!NG_IPFW_LOADED)
+                               return EINVAL;
+                       else
+                               goto check_size;
+               case O_NAT:
+                       if (!IPFW_NAT_LOADED)
+                               return EINVAL;
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_nat))
+                               goto bad_size;
+                       goto check_action;
+               case O_FORWARD_MAC: /* XXX not implemented yet */
+               case O_CHECK_STATE:
+               case O_COUNT:
+               case O_ACCEPT:
+               case O_DENY:
+               case O_REJECT:
+#ifdef INET6
+               case O_UNREACH6:
+#endif
+               case O_SKIPTO:
+               case O_REASS:
+/* common size check for the simple (single-word) actions above */
+check_size:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn))
+                               goto bad_size;
+/* common checks for all actions: only one, and it must come last */
+check_action:
+                       if (have_action) {
+                               printf("ipfw: opcode %d, multiple actions"
+                                       " not allowed\n",
+                                       cmd->opcode);
+                               return EINVAL;
+                       }
+                       have_action = 1;
+                       if (l != cmdlen) {
+                               printf("ipfw: opcode %d, action must be"
+                                       " last opcode\n",
+                                       cmd->opcode);
+                               return EINVAL;
+                       }
+                       break;
+#ifdef INET6
+               case O_IP6_SRC:
+               case O_IP6_DST:
+                       if (cmdlen != F_INSN_SIZE(struct in6_addr) +
+                           F_INSN_SIZE(ipfw_insn))
+                               goto bad_size;
+                       break;
+
+               case O_FLOW6ID:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
+                           ((ipfw_insn_u32 *)cmd)->o.arg1)
+                               goto bad_size;
+                       break;
+
+               case O_IP6_SRC_MASK:
+               case O_IP6_DST_MASK:
+                       if ( !(cmdlen & 1) || cmdlen > 127)
+                               goto bad_size;
+                       break;
+               case O_ICMP6TYPE:
+                       if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) )
+                               goto bad_size;
+                       break;
+#endif
+
+               default:
+                       /* IPv6 opcodes in a non-INET6 kernel get a
+                        * specific diagnostic; anything else is unknown */
+                       switch (cmd->opcode) {
+#ifndef INET6
+                       case O_IP6_SRC_ME:
+                       case O_IP6_DST_ME:
+                       case O_EXT_HDR:
+                       case O_IP6:
+                       case O_UNREACH6:
+                       case O_IP6_SRC:
+                       case O_IP6_DST:
+                       case O_FLOW6ID:
+                       case O_IP6_SRC_MASK:
+                       case O_IP6_DST_MASK:
+                       case O_ICMP6TYPE:
+                               printf("ipfw: no IPv6 support in kernel\n");
+                               return EPROTONOSUPPORT;
+#endif
+                       default:
+                               printf("ipfw: opcode %d, unknown opcode\n",
+                                       cmd->opcode);
+                               return EINVAL;
+                       }
+               }
+       }
+       if (have_action == 0) {
+               printf("ipfw: missing action\n");
+               return EINVAL;
+       }
+       return 0;
+
+bad_size:
+       printf("ipfw: opcode %d size %d wrong\n",
+               cmd->opcode, cmdlen);
+       return EINVAL;
+}
+
+/*
+ * Copy the static and dynamic rules to the supplied buffer
+ * and return the amount of space actually used.
+ * Must be run under IPFW_UH_RLOCK
+ */
+static size_t
+ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
+{
+       char *bp = buf;
+       char *ep = bp + space;
+       struct ip_fw *rule, *dst;
+       int l, i;
+       time_t  boot_seconds;
+
+        boot_seconds = boottime.tv_sec;
+       /* copy the static rules first, in map order */
+       for (i = 0; i < chain->n_rules; i++) {
+               rule = chain->map[i];
+               l = RULESIZE(rule);
+               if (bp + l > ep) { /* should not happen */
+                       printf("overflow dumping static rules\n");
+                       break;
+               }
+               dst = (struct ip_fw *)bp;
+               bcopy(rule, dst, l);
+                       /*
+                        * XXX HACK. Store the disable mask in the "next"
+                        * pointer in a wild attempt to keep the ABI the same.
+                        * Why do we do this on EVERY rule?
+                        */
+               bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable));
+               if (dst->timestamp)     /* export as wall-clock time */
+                       dst->timestamp += boot_seconds;
+               bp += l;
+       }
+       ipfw_get_dynamic(&bp, ep); /* protected by the dynamic lock */
+       return (bp - (char *)buf);      /* bytes actually written */
+}
+
+
+/**
+ * {set|get}sockopt parser.
+ */
+int
+ipfw_ctl(struct sockopt *sopt)
+{
+#define        RULE_MAXSIZE    (256*sizeof(u_int32_t))
+       int error;
+       size_t size;
+       struct ip_fw *buf, *rule;
+       struct ip_fw_chain *chain;
+       u_int32_t rulenum[2];
+
+       error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW);
+       if (error)
+               return (error);
+
+       /*
+        * Disallow modifications in really-really secure mode, but still allow
+        * the logging counters to be reset.
+        */
+       if (sopt->sopt_name == IP_FW_ADD ||
+           (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) {
+               error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
+               if (error)
+                       return (error);
+       }
+
+       chain = &V_layer3_chain;
+       error = 0;
+
+       switch (sopt->sopt_name) {
+       case IP_FW_GET:
+               /*
+                * pass up a copy of the current rules. Static rules
+                * come first (the last of which has number IPFW_DEFAULT_RULE),
+                * followed by a possibly empty list of dynamic rule.
+                * The last dynamic rule has NULL in the "next" field.
+                *
+                * Note that the calculated size is used to bound the
+                * amount of data returned to the user.  The rule set may
+                * change between calculating the size and returning the
+                * data in which case we'll just return what fits.
+                */
+               for (;;) {
+                       int len = 0, want;
+
+                       size = chain->static_len;
+                       size += ipfw_dyn_len();
+               if (size >= sopt->sopt_valsize)
+                       break;
+               buf = malloc(size, M_TEMP, M_WAITOK);
+                       if (buf == NULL)
+                               break;
+                       IPFW_UH_RLOCK(chain);
+                       /* check again how much space we need */
+                       want = chain->static_len + ipfw_dyn_len();
+                       if (size >= want)
+                               len = ipfw_getrules(chain, buf, size);
+                       IPFW_UH_RUNLOCK(chain);
+                       if (size >= want)
+                               error = sooptcopyout(sopt, buf, len);
+                       free(buf, M_TEMP);
+                       if (size >= want)
+                               break;
+               }
+               break;
+
+       case IP_FW_FLUSH:
+               /* locking is done within del_entry() */
+               error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */
+               break;
+
+       case IP_FW_ADD:
+               rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK);
+               error = sooptcopyin(sopt, rule, RULE_MAXSIZE,
+                       sizeof(struct ip_fw) );
+               if (error == 0)
+                       error = check_ipfw_struct(rule, sopt->sopt_valsize);
+               if (error == 0) {
+                       /* locking is done within ipfw_add_rule() */
+                       error = ipfw_add_rule(chain, rule);
+                       size = RULESIZE(rule);
+                       if (!error && sopt->sopt_dir == SOPT_GET)
+                               error = sooptcopyout(sopt, rule, size);
+               }
+               free(rule, M_TEMP);
+               break;
+
+       case IP_FW_DEL:
+               /*
+                * IP_FW_DEL is used for deleting single rules or sets,
+                * and (ab)used to atomically manipulate sets. Argument size
+                * is used to distinguish between the two:
+                *    sizeof(u_int32_t)
+                *      delete single rule or set of rules,
+                *      or reassign rules (or sets) to a different set.
+                *    2*sizeof(u_int32_t)
+                *      atomic disable/enable sets.
+                *      first u_int32_t contains sets to be disabled,
+                *      second u_int32_t contains sets to be enabled.
+                */
+               error = sooptcopyin(sopt, rulenum,
+                       2*sizeof(u_int32_t), sizeof(u_int32_t));
+               if (error)
+                       break;
+               size = sopt->sopt_valsize;
+               if (size == sizeof(u_int32_t) && rulenum[0] != 0) {
+                       /* delete or reassign, locking done in del_entry() */
+                       error = del_entry(chain, rulenum[0]);
+               } else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */
+                       IPFW_UH_WLOCK(chain);
+                       V_set_disable =
+                           (V_set_disable | rulenum[0]) & ~rulenum[1] &
+                           ~(1<<RESVD_SET); /* set RESVD_SET always enabled */
+                       IPFW_UH_WUNLOCK(chain);
+               } else
+                       error = EINVAL;
+               break;
+
+       case IP_FW_ZERO:
+       case IP_FW_RESETLOG: /* argument is a u_int32_t, the rule number */
+               rulenum[0] = 0;
+               if (sopt->sopt_val != 0) {
+                   error = sooptcopyin(sopt, rulenum,
+                           sizeof(u_int32_t), sizeof(u_int32_t));
+                   if (error)
+                       break;
+               }
+               error = zero_entry(chain, rulenum[0],
+                       sopt->sopt_name == IP_FW_RESETLOG);
+               break;
+
+       /*--- TABLE manipulations are protected by the IPFW_LOCK ---*/
+       case IP_FW_TABLE_ADD:
+               {
+                       ipfw_table_entry ent;
+
+                       error = sooptcopyin(sopt, &ent,
+                           sizeof(ent), sizeof(ent));
+                       if (error)
+                               break;
+                       error = ipfw_add_table_entry(chain, ent.tbl,
+                           ent.addr, ent.masklen, ent.value);
+               }
+               break;
+
+       case IP_FW_TABLE_DEL:
+               {
+                       ipfw_table_entry ent;
+
+                       error = sooptcopyin(sopt, &ent,
+                           sizeof(ent), sizeof(ent));
+                       if (error)
+                               break;
+                       error = ipfw_del_table_entry(chain, ent.tbl,
+                           ent.addr, ent.masklen);
+               }
+               break;
+
+       case IP_FW_TABLE_FLUSH:
+               {
+                       u_int16_t tbl;
+
+                       error = sooptcopyin(sopt, &tbl,
+                           sizeof(tbl), sizeof(tbl));
+                       if (error)
+                               break;
+                       IPFW_WLOCK(chain);
+                       error = ipfw_flush_table(chain, tbl);
+                       IPFW_WUNLOCK(chain);
+               }
+               break;
+
+       case IP_FW_TABLE_GETSIZE:
+               {
+                       u_int32_t tbl, cnt;
+
+                       if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl),
+                           sizeof(tbl))))
+                               break;
+                       IPFW_RLOCK(chain);
+                       error = ipfw_count_table(chain, tbl, &cnt);
+                       IPFW_RUNLOCK(chain);
+                       if (error)
+                               break;
+                       error = sooptcopyout(sopt, &cnt, sizeof(cnt));
+               }
+               break;
+
+       case IP_FW_TABLE_LIST:
+               {
+                       ipfw_table *tbl;
+
+                       if (sopt->sopt_valsize < sizeof(*tbl)) {
+                               error = EINVAL;
+                               break;
+                       }
+                       size = sopt->sopt_valsize;
+                       tbl = malloc(size, M_TEMP, M_WAITOK);
+                       error = sooptcopyin(sopt, tbl, size, sizeof(*tbl));
+                       if (error) {
+                               free(tbl, M_TEMP);
+                               break;
+                       }
+                       tbl->size = (size - sizeof(*tbl)) /
+                           sizeof(ipfw_table_entry);
+                       IPFW_RLOCK(chain);
+                       error = ipfw_dump_table(chain, tbl);
+                       IPFW_RUNLOCK(chain);
+                       if (error) {
+                               free(tbl, M_TEMP);
+                               break;
+                       }
+                       error = sooptcopyout(sopt, tbl, size);
+                       free(tbl, M_TEMP);
+               }
+               break;
+
+       /*--- NAT operations are protected by the IPFW_LOCK ---*/
+       case IP_FW_NAT_CFG:
+               if (IPFW_NAT_LOADED)
+                       error = ipfw_nat_cfg_ptr(sopt);
+               else {
+                       printf("IP_FW_NAT_CFG: %s\n",
+                           "ipfw_nat not present, please load it");
+                       error = EINVAL;
+               }
+               break;
+
+       case IP_FW_NAT_DEL:
+               if (IPFW_NAT_LOADED)
+                       error = ipfw_nat_del_ptr(sopt);
+               else {
+                       printf("IP_FW_NAT_DEL: %s\n",
+                           "ipfw_nat not present, please load it");
+                       error = EINVAL;
+               }
+               break;
+
+       case IP_FW_NAT_GET_CONFIG:
+               if (IPFW_NAT_LOADED)
+                       error = ipfw_nat_get_cfg_ptr(sopt);
+               else {
+                       printf("IP_FW_NAT_GET_CFG: %s\n",
+                           "ipfw_nat not present, please load it");
+                       error = EINVAL;
+               }
+               break;
+
+       case IP_FW_NAT_GET_LOG:
+               if (IPFW_NAT_LOADED)
+                       error = ipfw_nat_get_log_ptr(sopt);
+               else {
+                       printf("IP_FW_NAT_GET_LOG: %s\n",
+                           "ipfw_nat not present, please load it");
+                       error = EINVAL;
+               }
+               break;
+
+       default:
+               printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name);
+               error = EINVAL;
+       }
+
+       return (error);
+#undef RULE_MAXSIZE
+}
+/* end of file */
diff --git a/dummynet2/ip_fw_table.c b/dummynet2/ip_fw_table.c
new file mode 100644 (file)
index 0000000..8cbf457
--- /dev/null
@@ -0,0 +1,280 @@
+/*-
+ * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_table.c 200601 2009-12-16 10:48:40Z luigi $");
+
+/*
+ * Lookup table support for ipfw
+ *
+ * Lookup tables are implemented (at the moment) using the radix
+ * tree used for routing tables. Tables store key-value entries, where
+ * keys are network prefixes (addr/masklen), and values are integers.
+ * As a degenerate case we can interpret keys as 32-bit integers
+ * (with a /32 mask).
+ *
+ * The table is protected by the IPFW lock even for manipulation coming
+ * from userland, because operations are typically fast.
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <net/if.h>    /* ip_fw.h requires IFNAMSIZ */
+#include <net/radix.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables");
+
+struct table_entry {
+       struct radix_node       rn[2];
+       struct sockaddr_in      addr, mask;
+       u_int32_t               value;
+};
+
+/*
+ * The radix code expects addr and mask to be array of bytes,
+ * with the first byte being the length of the array. rn_inithead
+ * is called with the offset in bits of the lookup key within the
+ * array. If we use a sockaddr_in as the underlying type,
+ * sin_len is conveniently located at offset 0, sin_addr is at
+ * offset 4 and normally aligned.
+ * But for portability, let's avoid assumption and make the code explicit
+ */
+#define KEY_LEN(v)     *((uint8_t *)&(v))
+#define KEY_OFS                (8*offsetof(struct sockaddr_in, sin_addr))
+
+int
+ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen, uint32_t value)
+{
+       struct radix_node_head *rnh;
+       struct table_entry *ent;
+       struct radix_node *rn;
+
+       if (tbl >= IPFW_TABLES_MAX)
+               return (EINVAL);
+       rnh = ch->tables[tbl];
+       ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO);
+       if (ent == NULL)
+               return (ENOMEM);
+       ent->value = value;
+       KEY_LEN(ent->addr) = KEY_LEN(ent->mask) = 8;
+       ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
+       ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr;
+       IPFW_WLOCK(ch);
+       rn = rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent);
+       if (rn == NULL) {
+               IPFW_WUNLOCK(ch);
+               free(ent, M_IPFW_TBL);
+               return (EEXIST);
+       }
+       IPFW_WUNLOCK(ch);
+       return (0);
+}
+
+int
+ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen)
+{
+       struct radix_node_head *rnh;
+       struct table_entry *ent;
+       struct sockaddr_in sa, mask;
+
+       if (tbl >= IPFW_TABLES_MAX)
+               return (EINVAL);
+       rnh = ch->tables[tbl];
+       KEY_LEN(sa) = KEY_LEN(mask) = 8;
+       mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
+       sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
+       IPFW_WLOCK(ch);
+       ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh);
+       if (ent == NULL) {
+               IPFW_WUNLOCK(ch);
+               return (ESRCH);
+       }
+       IPFW_WUNLOCK(ch);
+       free(ent, M_IPFW_TBL);
+       return (0);
+}
+
+static int
+flush_table_entry(struct radix_node *rn, void *arg)
+{
+       struct radix_node_head * const rnh = arg;
+       struct table_entry *ent;
+
+       ent = (struct table_entry *)
+           rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
+       if (ent != NULL)
+               free(ent, M_IPFW_TBL);
+       return (0);
+}
+
+int
+ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl)
+{
+       struct radix_node_head *rnh;
+
+       IPFW_WLOCK_ASSERT(ch);
+
+       if (tbl >= IPFW_TABLES_MAX)
+               return (EINVAL);
+       rnh = ch->tables[tbl];
+       KASSERT(rnh != NULL, ("NULL IPFW table"));
+       rnh->rnh_walktree(rnh, flush_table_entry, rnh);
+       return (0);
+}
+
+void
+ipfw_flush_tables(struct ip_fw_chain *ch)
+{
+       uint16_t tbl;
+
+       IPFW_WLOCK_ASSERT(ch);
+
+       for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++)
+               ipfw_flush_table(ch, tbl);
+}
+
+int
+ipfw_init_tables(struct ip_fw_chain *ch)
+{ 
+       int i;
+       uint16_t j;
+
+       for (i = 0; i < IPFW_TABLES_MAX; i++) {
+               if (!rn_inithead((void **)&ch->tables[i], KEY_OFS)) {
+                       for (j = 0; j < i; j++) {
+                               (void) ipfw_flush_table(ch, j);
+                       }
+                       return (ENOMEM);
+               }
+       }
+       return (0);
+}
+
+int
+ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint32_t *val)
+{
+       struct radix_node_head *rnh;
+       struct table_entry *ent;
+       struct sockaddr_in sa;
+
+       if (tbl >= IPFW_TABLES_MAX)
+               return (0);
+       rnh = ch->tables[tbl];
+       KEY_LEN(sa) = 8;
+       sa.sin_addr.s_addr = addr;
+       ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh));
+       if (ent != NULL) {
+               *val = ent->value;
+               return (1);
+       }
+       return (0);
+}
+
+static int
+count_table_entry(struct radix_node *rn, void *arg)
+{
+       u_int32_t * const cnt = arg;
+
+       (*cnt)++;
+       return (0);
+}
+
+int
+ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt)
+{
+       struct radix_node_head *rnh;
+
+       if (tbl >= IPFW_TABLES_MAX)
+               return (EINVAL);
+       rnh = ch->tables[tbl];
+       *cnt = 0;
+       rnh->rnh_walktree(rnh, count_table_entry, cnt);
+       return (0);
+}
+
+static int
+dump_table_entry(struct radix_node *rn, void *arg)
+{
+       struct table_entry * const n = (struct table_entry *)rn;
+       ipfw_table * const tbl = arg;
+       ipfw_table_entry *ent;
+
+       if (tbl->cnt == tbl->size)
+               return (1);
+       ent = &tbl->ent[tbl->cnt];
+       ent->tbl = tbl->tbl;
+       if (in_nullhost(n->mask.sin_addr))
+               ent->masklen = 0;
+       else
+               ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr));
+       ent->addr = n->addr.sin_addr.s_addr;
+       ent->value = n->value;
+       tbl->cnt++;
+       return (0);
+}
+
+int
+ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl)
+{
+       struct radix_node_head *rnh;
+
+       if (tbl->tbl >= IPFW_TABLES_MAX)
+               return (EINVAL);
+       rnh = ch->tables[tbl->tbl];
+       tbl->cnt = 0;
+       rnh->rnh_walktree(rnh, dump_table_entry, tbl);
+       return (0);
+}
+/* end of file */
diff --git a/dummynet2/ipfw2_mod.c b/dummynet2/ipfw2_mod.c
new file mode 100644 (file)
index 0000000..f59a37c
--- /dev/null
@@ -0,0 +1,768 @@
+/*
+ * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id: ipfw2_mod.c 4671 2010-01-04 17:50:51Z luigi $
+ *
+ * The main interface to build ipfw+dummynet as a linux module.
+ * (and possibly as a windows module as well, though that part
+ * is not complete yet).
+ *
+ * The control interface uses the sockopt mechanism
+ * on a socket(AF_INET, SOCK_RAW, IPPROTO_RAW).
+ *
+ * The data interface uses the netfilter interface, at the moment
+ * hooked to the PRE_ROUTING and POST_ROUTING hooks.
+ * Unfortunately the netfilter interface is a moving target,
+ * so we need a set of macros to adapt to the various cases.
+ *
+ * In the netfilter hook we just mark packet as 'QUEUE' and then
+ * let the queue handler to do the whole work (filtering and
+ * possibly emulation).
+ * As we receive packets, we wrap them with an mbuf descriptor
+ * so the existing ipfw+dummynet code runs unmodified.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/mbuf.h>                  /* sizeof struct mbuf */
+#include <sys/param.h>                 /* NGROUPS */
+
+#ifdef __linux__
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>      /* NF_IP_PRI_FILTER */
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25)
+#include <net/netfilter/nf_queue.h>    /* nf_queue */
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
+#define __read_mostly
+#endif
+
+#endif /* !__linux__ */
+
+#include <netinet/in.h>                        /* in_addr */
+#include <netinet/ip_fw.h>             /* ip_fw_ctl_t, ip_fw_chk_t */
+#include <netinet/ipfw/ip_fw_private.h>                /* ip_fw_ctl_t, ip_fw_chk_t */
+#include <netinet/ip_dummynet.h>       /* ip_dn_ctl_t, ip_dn_io_t */
+#include <net/pfil.h>                  /* PFIL_IN, PFIL_OUT */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+#warning --- inet_hashtables not present on 2.4
+#include <linux/tcp.h>
+#include <net/route.h>
+#include <net/sock.h>
+static inline int inet_iif(const struct sk_buff *skb)
+{
+        return ((struct rtable *)skb->dst)->rt_iif;
+}
+
+#else
+#include <net/inet_hashtables.h>       /* inet_lookup */
+#endif
+#include <net/route.h>                 /* inet_iif */
+
+/*
+ * Here we allocate some global variables used in the firewall.
+ */
+//ip_dn_ctl_t    *ip_dn_ctl_ptr;
+int (*ip_dn_ctl_ptr)(struct sockopt *);
+
+ip_fw_ctl_t    *ip_fw_ctl_ptr;
+
+int    (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa);
+ip_fw_chk_t    *ip_fw_chk_ptr;
+
+void           (*bridge_dn_p)(struct mbuf *, struct ifnet *);
+
+/*---
+ * Glue code to implement the registration of children with the parent.
+ * Each child should call my_mod_register() when linking, so that
+ * module_init() and module_exit() can call init_children() and
+ * fini_children() to provide the necessary initialization.
+ * We use the same mechanism for MODULE_ and SYSINIT_.
+ * The former only get a pointer to the moduledata,
+ * the latter have two function pointers (init/uninit)
+ */
+#include <sys/module.h>
+struct mod_args {
+        const char *name;
+        int order;
+        struct moduledata *mod;
+       void (*init)(void), (*uninit)(void);
+};
+
+static unsigned int mod_idx;
+static struct mod_args mods[10];       /* hard limit to 10 modules */
+
+int
+my_mod_register(const char *name, int order,
+       struct moduledata *mod, void *init, void *uninit);
+/*
+ * my_mod_register should be called automatically as the init
+ * functions in the submodules. Unfortunately this compiler/linker
+ * trick is not supported yet so we call it manually.
+ */
+int
+my_mod_register(const char *name, int order,
+       struct moduledata *mod, void *init, void *uninit)
+{
+       struct mod_args m = { .name = name, .order = order,
+               .mod = mod, .init = init, .uninit = uninit };
+
+       printf("%s %s called\n", __FUNCTION__, name);
+       if (mod_idx < sizeof(mods) / sizeof(mods[0]))
+               mods[mod_idx++] = m;
+       return 0;
+}
+
+static void
+init_children(void)
+{
+       unsigned int i;
+
+        /* Call the functions registered at init time. */
+       printf("%s mod_idx value %d\n", __FUNCTION__, mod_idx);
+        for (i = 0; i < mod_idx; i++) {
+               struct mod_args *m = &mods[i];
+                printf("+++ start module %d %s %s at %p order 0x%x\n",
+                        i, m->name, m->mod ? m->mod->name : "SYSINIT",
+                        m->mod, m->order);
+               if (m->mod && m->mod->evhand)
+                       m->mod->evhand(NULL, MOD_LOAD, m->mod->priv);
+               else if (m->init)
+                       m->init();
+        }
+}
+
+static void
+fini_children(void)
+{
+       int i;
+
+        /* Call the functions registered at init time. */
+        for (i = mod_idx - 1; i >= 0; i--) {
+               struct mod_args *m = &mods[i];
+                printf("+++ end module %d %s %s at %p order 0x%x\n",
+                        i, m->name, m->mod ? m->mod->name : "SYSINIT",
+                        m->mod, m->order);
+               if (m->mod && m->mod->evhand)
+                       m->mod->evhand(NULL, MOD_UNLOAD, m->mod->priv);
+               else if (m->uninit)
+                       m->uninit();
+        }
+}
+/*--- end of module binding helper functions ---*/
+
+/*---
+ * Control hooks:
+ * ipfw_ctl_h() is a wrapper for linux to FreeBSD sockopt call convention.
+ * then call the ipfw handler in order to manage requests.
+ * In turn this is called by the linux set/get handlers.
+ */
+static int
+ipfw_ctl_h(struct sockopt *s, int cmd, int dir, int len, void __user *user)
+{
+       struct thread t;
+       int ret = EINVAL;
+
+       memset(s, 0, sizeof(*s));
+       s->sopt_name = cmd;
+       s->sopt_dir = dir;
+       s->sopt_valsize = len;
+       s->sopt_val = user;
+
+       /* sopt_td is not used but it is referenced */
+       memset(&t, 0, sizeof(t));
+       s->sopt_td = &t;
+       
+       // printf("%s called with cmd %d len %d\n", __FUNCTION__, cmd, len);
+
+       if (cmd < IP_DUMMYNET_CONFIGURE && ip_fw_ctl_ptr)
+               ret = ip_fw_ctl_ptr(s);
+       else if (cmd >= IP_DUMMYNET_CONFIGURE && ip_dn_ctl_ptr)
+               ret = ip_dn_ctl_ptr(s);
+
+       return -ret;    /* errors are < 0 on linux */
+}
+
+#ifdef _WIN32
+
+void
+netisr_dispatch(int __unused num, struct mbuf *m)
+{
+}
+
+int
+ip_output(struct mbuf *m, struct mbuf __unused *opt,
+       struct route __unused *ro, int __unused flags,
+    struct ip_moptions __unused *imo, struct inpcb __unused *inp)
+{
+       netisr_dispatch(0, m);
+       return 0;
+}
+
+#else /* this is the linux glue */
+/*
+ * setsockopt hook has no return value other than the error code.
+ */
+static int
+do_ipfw_set_ctl(struct sock __unused *sk, int cmd,
+       void __user *user, unsigned int len)
+{
+       struct sockopt s;       /* pass arguments */
+
+       return ipfw_ctl_h(&s, cmd, SOPT_SET, len, user);
+}
+
+/*
+ * getsockopt can can return a block of data in response.
+ */
+static int
+do_ipfw_get_ctl(struct sock __unused *sk,
+       int cmd, void __user *user, int *len)
+{
+       struct sockopt s;       /* pass arguments */
+       int ret = ipfw_ctl_h(&s, cmd, SOPT_GET, *len, user);
+
+       *len = s.sopt_valsize;  /* return length back to the caller */
+       return ret;
+}
+
+/*
+ * declare our [get|set]sockopt hooks
+ */
+static struct nf_sockopt_ops ipfw_sockopts = {
+       .pf             = PF_INET,
+       .set_optmin     = _IPFW_SOCKOPT_BASE,
+       .set_optmax     = _IPFW_SOCKOPT_END,
+       .set            = do_ipfw_set_ctl,
+       .get_optmin     = _IPFW_SOCKOPT_BASE,
+       .get_optmax     = _IPFW_SOCKOPT_END,
+       .get            = do_ipfw_get_ctl,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
+       .owner          = THIS_MODULE,
+#endif
+};
+
+/*----
+ * We need a number of macros to adapt to the various APIs in
+ * different linux versions. Among them:
+ *
+ * - the hook names change between macros (NF_IP*) and enum NF_INET_*
+ *
+ * - the second argument to the netfilter hook is
+ *     struct sk_buff **       in kernels <= 2.6.22
+ *     struct sk_buff *        in kernels > 2.6.22
+ *
+ * - NF_STOP is not defined before 2.6 so we remap it to NF_ACCEPT
+ *
+ * - the packet descriptor passed to the queue handler is
+ *     struct nf_info          in kernels <= 2.6.24
+ *     struct nf_queue_entry   in kernels >= 2.6.25
+ *
+ * - the arguments to the queue handler also change;
+ */
+
+/*
+ * declare hook to grab packets from the netfilter interface.
+ * The NF_* names change in different versions of linux, in some
+ * cases they are #defines, in others they are enum, so we
+ * need to adapt.
+ */
+#ifndef NF_IP_PRE_ROUTING
+#define NF_IP_PRE_ROUTING      NF_INET_PRE_ROUTING
+#endif
+#ifndef NF_IP_POST_ROUTING
+#define NF_IP_POST_ROUTING     NF_INET_POST_ROUTING
+#endif
+
+/*
+ * ipfw hooks into the POST_ROUTING and the PRE_ROUTING chains.
+ * PlanetLab sets skb_tag to the slice id in the LOCAL_INPUT and
+ * POST_ROUTING chains, so if we want to use that information we
+ * need to hook the LOCAL_INPUT chain instead of the PRE_ROUTING.
+ * However at the moment the skb_tag info is not reliable so
+ * we stay with the standard hooks.
+ */
+#if 0 // defined(IPFW_PLANETLAB)
+#define IPFW_HOOK_IN NF_IP_LOCAL_IN
+#else
+#define IPFW_HOOK_IN NF_IP_PRE_ROUTING
+#endif
+
+/*
+ * The main netfilter hook.
+ * To make life simple, we queue everything and then do all the
+ * decision in the queue handler.
+ *
+ * XXX note that in 2.4 and up to 2.6.22 the skbuf is passed as sk_buff**
+ * so we have an #ifdef to set the proper argument type.
+ */
+static unsigned int
+call_ipfw(unsigned int __unused hooknum,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) // in 2.6.22 we have **
+       struct sk_buff  __unused **skb,
+#else
+       struct sk_buff  __unused *skb,
+#endif
+       const struct net_device  __unused *in,
+       const struct net_device  __unused *out,
+       int __unused (*okfn)(struct sk_buff *))
+{
+       return NF_QUEUE;
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+#define        NF_STOP         NF_ACCEPT
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
+
+/*
+ * nf_queue_entry is a recent addition, in previous versions
+ * of the code the struct is called nf_info.
+ */
+#define nf_queue_entry nf_info /* for simplicity */
+
+/* also, 2.4 and perhaps something else have different arguments */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) /* unsure on the exact boundary */
+/* on 2.4 we use nf_info */
+#define QH_ARGS                struct sk_buff *skb, struct nf_info *info, void *data
+#else  /* 2.6.1.. 2.6.24 */
+#define QH_ARGS                struct sk_buff *skb, struct nf_info *info, unsigned int qnum, void *data
+#endif
+
+#define DEFINE_SKB     /* nothing, already an argument */
+#define        REINJECT(_inf, _verd)   nf_reinject(skb, _inf, _verd)
+
+#else  /* 2.6.25 and above */
+
+#define QH_ARGS                struct nf_queue_entry *info, unsigned int queuenum
+#define DEFINE_SKB     struct sk_buff *skb = info->skb;
+#define        REINJECT(_inf, _verd)   nf_reinject(_inf, _verd)
+#endif
+
+/*
+ * used by dummynet when dropping packets
+ * XXX use dummynet_send()
+ */
+void
+reinject_drop(struct mbuf* m)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)        /* unsure on the exact boundary */
+       struct sk_buff *skb = (struct sk_buff *)m;
+#endif
+       REINJECT(m->queue_entry, NF_DROP);
+}
+
+/*
+ * The real call to the firewall. nf_queue_entry points to the skbuf,
+ * and eventually we need to return both through nf_reinject().
+ */
+static int
+ipfw2_queue_handler(QH_ARGS)
+{
+       DEFINE_SKB      /* no semicolon here, goes in the macro */
+       int ret = 0;    /* return value */
+       struct mbuf *m;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+       if (skb->nh.iph == NULL) {
+               printf("null dp, len %d reinject now\n", skb->len);
+               REINJECT(info, NF_ACCEPT);
+               return 0;
+       }
+#endif
+       m = malloc(sizeof(*m), 0, 0);
+       if (m == NULL) {
+               printf("malloc fail, len %d reinject now\n", skb->len);
+               REINJECT(info, NF_ACCEPT);
+               return 0;
+       }
+
+       m->m_skb = skb;
+       m->m_len = skb->len;            /* len in this skbuf */
+       m->m_pkthdr.len = skb->len;     /* total packet len */
+       m->m_pkthdr.rcvif = info->indev;
+       m->queue_entry = info;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+       m->m_data = skb->nh.iph;
+#else
+       m->m_data = skb_network_header(skb);
+#endif
+
+       /* XXX add the interface */
+       if (info->hook == IPFW_HOOK_IN) {
+               ret = ipfw_check_hook(NULL, &m, info->indev, PFIL_IN, NULL);
+       } else {
+               ret = ipfw_check_hook(NULL, &m, info->outdev, PFIL_OUT, NULL);
+       }
+
+       if (m != NULL) {        /* Accept. reinject and free the mbuf */
+               REINJECT(info, NF_ACCEPT);
+               m_freem(m);
+       } else if (ret == 0) {
+               /* dummynet has kept the packet, will reinject later. */
+       } else {
+               /*
+                * Packet dropped by ipfw or dummynet. Nothing to do as
+                * FREE_PKT already did a reinject as NF_DROP
+                */
+       }
+       return 0;
+}
+
+struct route;
+struct ip_moptions;
+struct inpcb;
+
+
+/* XXX should include prototypes for netisr_dispatch and ip_output */
+/*
+ * The reinjection routine after a packet comes out from dummynet.
+ * We must update the skb timestamp so ping reports the right time.
+ */
+void
+netisr_dispatch(int num, struct mbuf *m)
+{
+       struct nf_queue_entry *info = m->queue_entry;
+       struct sk_buff *skb = m->m_skb; /* always used */
+
+       /* free the mbuf wrapper now; the skb is handed back to netfilter
+        * below (m_freem presumably releases only the wrapper — the skb
+        * must survive for REINJECT) */
+       m_freem(m);
+
+       KASSERT((info != NULL), ("%s info null!\n", __FUNCTION__));
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)       // XXX above 2.6.x ?
+       __net_timestamp(skb);   /* update timestamp */
+#endif
+
+       /* XXX to obey one-pass, possibly call the queue handler here */
+       /* num == -1 means the packet was dropped by dummynet; anything
+        * else reinjects with NF_STOP, i.e. accept without traversing
+        * the remaining netfilter hooks */
+       REINJECT(info, ((num == -1)?NF_DROP:NF_STOP));  /* accept but no more firewall */
+}
+
+/*
+ * Minimal stub of the BSD ip_output(): all arguments except the mbuf
+ * are ignored, and the packet is simply reinjected into the linux
+ * stack through netisr_dispatch() (num 0, i.e. accept).
+ * Always returns 0 (success).
+ */
+int
+ip_output(struct mbuf *m, struct mbuf __unused *opt,
+       struct route __unused *ro, int __unused flags,
+    struct ip_moptions __unused *imo, struct inpcb __unused *inp)
+{
+       netisr_dispatch(0, m);
+        return 0;
+}
+
+/*
+ * socket lookup function for linux.
+ * This code is used to associate uid, gid, jail/xid to packets,
+ * and store the info in a cache *ugp where they can be accessed quickly.
+ * The function returns 1 if the info is found, -1 otherwise.
+ *
+ * We do this only on selected protocols: TCP, ...
+ *
+ * The chain is the following
+ *   sk_buff*  sock*  socket*    file*
+ *     skb  ->  sk ->sk_socket->file ->f_owner    ->pid
+ *     skb  ->  sk ->sk_socket->file ->f_uid (direct)
+ *     skb  ->  sk ->sk_socket->file ->f_cred->fsuid (2.6.29+)
+ *
+ * Related headers:
+ * linux/skbuff.h      struct skbuff
+ * net/sock.h          struct sock
+ * linux/net.h         struct socket
+ * linux/fs.h          struct file
+ *
+ * With vserver we may have sk->sk_xid and sk->sk_nid, which we
+ * store in fw_groups[1] (matches O_JAIL) and fw_groups[2]
+ * (no matches yet)
+ *
+ * Note- for locally generated, outgoing packets we should not need
+ * a lookup because the sk_buff already points to the socket where
+ * the info is.
+ */
+extern struct inet_hashinfo tcp_hashinfo;
+/* see the block comment above for the sk_buff -> sock -> file chain.
+ * Returns 1 on a successful lookup, -1 otherwise (non-TCP, linux 2.4,
+ * or no matching socket); on success *u is filled with uid/gid/xid/nid. */
+int
+linux_lookup(const int proto, const __be32 saddr, const __be16 sport,
+               const __be32 daddr, const __be16 dport,
+               struct sk_buff *skb, int dir, struct bsd_ucred *u)
+{
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,0)
+       return -1;      /* socket lookup not supported on linux 2.4 */
+#else
+       struct sock *sk;
+       int ret = -1;   /* default return value */
+       int st = -1;    /* state */
+
+
+       if (proto != IPPROTO_TCP)       /* XXX extend for UDP */
+               return -1;
+
+       /* sanity: outgoing (dir != 0) packets must have a dst,
+        * incoming ones a receiving device */
+       if ((dir ? (void *)skb_dst(skb) : (void *)skb->dev) == NULL) {
+               panic(" -- this should not happen\n");
+               return -1;
+       }
+
+       if (skb->sk) {
+               sk = skb->sk;   /* socket already attached (local traffic) */
+       } else {
+               /*
+                * Try a lookup. On a match, sk has a refcount that we must
+                * release on exit (we know it because skb->sk = NULL).
+                *
+                * inet_lookup above 2.6.24 has an additional 'net' parameter
+                * so we use a macro to conditionally supply it.
+                * swap dst and src depending on the direction.
+                */
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,24)
+#define _OPT_NET_ARG
+#else
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
+/* there is no dev_net() on 2.6.25 */
+#define _OPT_NET_ARG (skb->dev->nd_net),
+#else  /* 2.6.26 and above */
+#define _OPT_NET_ARG dev_net(skb->dev),
+#endif
+#endif
+               sk =  (dir) ? /* dir != 0 on output */
+                   inet_lookup(_OPT_NET_ARG &tcp_hashinfo,
+                       daddr, dport, saddr, sport,     // match outgoing
+                       inet_iif(skb)) :
+                   inet_lookup(_OPT_NET_ARG &tcp_hashinfo,
+                       saddr, sport, daddr, dport,     // match incoming
+                       skb->dev->ifindex);
+#undef _OPT_NET_ARG
+
+               if (sk == NULL) /* no match, nothing to be done */
+                       return -1;
+       }
+       ret = 1;        /* retrying won't make things better */
+       st = sk->sk_state;
+#ifdef CONFIG_VSERVER
+       u->xid = sk->sk_xid;
+       u->nid = sk->sk_nid;
+#else
+       u->xid = u->nid = 0;
+#endif
+       /*
+        * Exclude tcp states where sk points to an inet_timewait_sock which
+        * has no sk_socket field (surely TCP_TIME_WAIT, perhaps more).
+        * To be safe, use a whitelist and not a blacklist.
+        * Before dereferencing sk_socket grab a lock on sk_callback_lock.
+        *
+        * Once again we need conditional code because the UID and GID
+        * location changes between kernels.
+        */
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,28)
+/* use the current's real uid/gid */
+#define _CURR_UID f_uid
+#define _CURR_GID f_gid
+#else /* 2.6.29 and above */
+/* use the current's file access real uid/gid */
+#define _CURR_UID f_cred->fsuid
+#define _CURR_GID f_cred->fsgid
+#endif
+
+#define GOOD_STATES (  \
+       (1<<TCP_LISTEN) | (1<<TCP_SYN_RECV)   | (1<<TCP_SYN_SENT)   | \
+       (1<<TCP_ESTABLISHED)  | (1<<TCP_FIN_WAIT1) | (1<<TCP_FIN_WAIT2) )
+       // surely exclude TCP_CLOSE, TCP_TIME_WAIT, TCP_LAST_ACK
+       // uncertain TCP_CLOSE_WAIT and TCP_CLOSING
+
+       if ((1<<st) & GOOD_STATES) {
+               read_lock_bh(&sk->sk_callback_lock);
+               if (sk->sk_socket && sk->sk_socket->file) {
+                       u->uid = sk->sk_socket->file->_CURR_UID;
+                       u->gid = sk->sk_socket->file->_CURR_GID;
+               }
+               read_unlock_bh(&sk->sk_callback_lock);
+       } else {
+               u->uid = u->gid = 0;    /* no usable socket info */
+       }
+       if (!skb->sk) /* return the reference that came from the lookup */
+               sock_put(sk);
+#undef GOOD_STATES
+#undef _CURR_UID
+#undef _CURR_GID
+       return ret;
+
+#endif /* LINUX > 2.4 */
+}
+
+/*
+ * Now prepare to hook the various functions.
+ * Linux 2.4 has a different API so we need some adaptation
+ * for register and unregister hooks
+ *
+ * the unregister function changed arguments between 2.6.22 and 2.6.24
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+/*
+ * Compat shim for linux 2.4, which only has the single-hook API.
+ * Registers the n hooks in order; on the first failure it rolls back
+ * the hooks already registered (matching the semantics of the real
+ * nf_register_hooks() in 2.6) and returns the error code.
+ */
+static int
+nf_register_hooks(struct nf_hook_ops *ops, int n)
+{
+       int i, ret = 0;
+       for (i = 0; i < n; i++) {
+               ret = nf_register_hook(ops + i);
+               if (ret < 0) {
+                       /* undo hooks 0..i-1 so a failed call leaves
+                        * nothing registered behind */
+                       while (--i >= 0)
+                               nf_unregister_hook(ops + i);
+                       break;
+               }
+       }
+       return ret;
+}
+
+/*
+ * Compat shim for linux 2.4: detach every hook in the array,
+ * one at a time, via the single-hook API.
+ */
+static void
+nf_unregister_hooks(struct nf_hook_ops *ops, int n)
+{
+       struct nf_hook_ops *op;
+
+       for (op = ops; op < ops + n; op++)
+               nf_unregister_hook(op);
+}
+#define REG_QH_ARG(fn) fn, NULL        /* argument for nf_[un]register_queue_handler */
+#define UNREG_QH_ARG(fn) //fn  /* argument for nf_[un]register_queue_handler */
+#define SET_MOD_OWNER
+
+#else /* linux >= 2.6.0 */
+
+struct nf_queue_handler ipfw2_queue_handler_desc = {
+        .outfn = ipfw2_queue_handler,
+        .name = "ipfw2 dummynet queue",
+};
+#define REG_QH_ARG(fn) &(fn ## _desc)
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+#define UNREG_QH_ARG(fn) //fn  /* argument for nf_[un]register_queue_handler */
+#else
+#define UNREG_QH_ARG(fn)       , &(fn ## _desc)
+#endif /* 2.6.0 < LINUX > 2.6.24 */
+
+#define SET_MOD_OWNER  .owner = THIS_MODULE,
+
+#endif /* !LINUX < 2.6.0 */
+
+static struct nf_hook_ops ipfw_ops[] __read_mostly = {
+        {      /* incoming packets, grabbed at IPFW_HOOK_IN */
+                .hook           = call_ipfw,
+                .pf             = PF_INET,
+                .hooknum        = IPFW_HOOK_IN,
+                .priority       = NF_IP_PRI_FILTER,
+                SET_MOD_OWNER
+        },
+        {      /* outgoing packets, grabbed after routing */
+                .hook           = call_ipfw,
+                .pf             = PF_INET,
+                .hooknum        = NF_IP_POST_ROUTING,
+                .priority       = NF_IP_PRI_FILTER,
+               SET_MOD_OWNER
+        },
+};
+#endif /* !__linux__ */
+
+/* descriptors for the children, until i find a way for the
+ * linker to produce them
+ */
+extern moduledata_t *moddesc_ipfw;
+extern moduledata_t *moddesc_dummynet;
+extern void *sysinit_ipfw_init;
+extern void *sysuninit_ipfw_destroy;
+extern void *sysinit_vnet_ipfw_init;
+extern void *sysuninit_vnet_ipfw_uninit;
+
+/*
+ * Module glue - init and exit function.
+ */
+static int __init
+ipfw_module_init(void)
+{
+       int ret = 0;
+
+       printf("%s in-hook %d svn id %s\n", __FUNCTION__, IPFW_HOOK_IN, "$Id: ipfw2_mod.c 4671 2010-01-04 17:50:51Z luigi $");
+
+       rn_init(64);
+
+       my_mod_register("ipfw",  1, moddesc_ipfw, NULL, NULL);
+       my_mod_register("sy_ipfw",  2, NULL,
+               sysinit_ipfw_init, sysuninit_ipfw_destroy);
+       my_mod_register("sy_Vnet_ipfw",  3, NULL,
+               sysinit_vnet_ipfw_init, sysuninit_vnet_ipfw_uninit);
+       my_mod_register("dummynet",  4, moddesc_dummynet, NULL, NULL);
+       init_children();
+
+#ifdef _WIN32
+       return ret;
+
+#else  /* linux hook */
+       /* sockopt register, in order to talk with user space */
+       ret = nf_register_sockopt(&ipfw_sockopts);
+        if (ret < 0) {
+               printf("error %d in nf_register_sockopt\n", ret);
+               goto clean_modules;
+       }
+
+       /* queue handler registration, in order to get network
+        * packet under a private queue */
+       ret = nf_register_queue_handler(PF_INET, REG_QH_ARG(ipfw2_queue_handler) );
+        if (ret < 0)   /* queue busy */
+               goto unregister_sockopt;
+
+        ret = nf_register_hooks(ipfw_ops, ARRAY_SIZE(ipfw_ops));
+        if (ret < 0)
+               goto unregister_sockopt;
+
+       printf("%s loaded\n", __FUNCTION__);
+       return 0;
+
+
+/* handle errors on load */
+unregister_sockopt:
+       nf_unregister_queue_handler(PF_INET  UNREG_QH_ARG(ipfw2_queue_handler) );
+       nf_unregister_sockopt(&ipfw_sockopts);
+
+clean_modules:
+       fini_children();
+       printf("%s error\n", __FUNCTION__);
+
+       return ret;
+#endif /* linux */
+}
+
+/* module shutdown: undo everything ipfw_module_init() registered,
+ * in reverse order (hooks, queue handler, sockopt, then children) */
+static void __exit
+ipfw_module_exit(void)
+{
+#ifdef _WIN32
+#else  /* linux hook */
+        nf_unregister_hooks(ipfw_ops, ARRAY_SIZE(ipfw_ops));
+       /* maybe drain the queue before unregistering ? */
+       nf_unregister_queue_handler(PF_INET  UNREG_QH_ARG(ipfw2_queue_handler) );
+       nf_unregister_sockopt(&ipfw_sockopts);
+#endif /* linux */
+
+       fini_children();
+
+       printf("%s unloaded\n", __FUNCTION__);
+}
+
+#ifdef __linux__
+module_init(ipfw_module_init)
+module_exit(ipfw_module_exit)
+MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */
+#endif
diff --git a/dummynet2/missing.h b/dummynet2/missing.h
new file mode 100644 (file)
index 0000000..09ea13a
--- /dev/null
@@ -0,0 +1,562 @@
+/*
+ * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id: missing.h 4666 2010-01-04 12:55:32Z luigi $
+ *
+ * Header for kernel variables and functions that are not available in
+ * userland.
+ */
+
+#ifndef _MISSING_H_
+#define _MISSING_H_
+
+#include <sys/cdefs.h>
+
+/* portability features, to be set before the rest: */
+#define HAVE_NET_IPLEN         /* iplen/ipoff in net format */
+#define WITHOUT_BPF            /* do not use bpf logging */
+
+#ifdef _WIN32
+
+#ifndef DEFINE_SPINLOCK
+#define DEFINE_SPINLOCK(x)     FAST_MUTEX x
+#endif
+/* spinlock --> Guarded Mutex KGUARDED_MUTEX */
+/* http://www.reactos.org/wiki/index.php/Guarded_Mutex */
+#define spin_lock_init(_l)
+#define spin_lock_bh(_l)
+#define spin_unlock_bh(_l)
+
+#include <sys/socket.h>                /* bsd-compat.c */
+#include <netinet/in.h>                /* bsd-compat.c */
+#include <netinet/ip.h>                /* local version */
+
+#else  /* __linux__ */
+
+#define MALLOC_DECLARE(x)      /* nothing */
+#include <linux/time.h>                /* do_gettimeofday */
+#include <netinet/ip.h>                /* local version */
+struct inpcb;
+
+/*
+ * Kernel locking support.
+ * FreeBSD uses mtx in dummynet.c and struct rwlock ip_fw2.c
+ *
+ * In linux we use spinlock_bh to implement both.
+ * For 'struct rwlock' we need an #ifdef to change it to spinlock_t
+ */
+
+#ifndef DEFINE_SPINLOCK        /* this is for linux 2.4 */
+#define DEFINE_SPINLOCK(x)   spinlock_t x = SPIN_LOCK_UNLOCKED
+#endif
+
+#endif /* __linux__ */
+
+#define rw_assert(a, b)
+#define rw_destroy(_l)
+#define rw_init(_l, msg)       spin_lock_init(_l)
+#define rw_rlock(_l)           spin_lock_bh(_l)
+#define rw_runlock(_l)         spin_unlock_bh(_l)
+#define rw_wlock(_l)           spin_lock_bh(_l)
+#define rw_wunlock(_l)         spin_unlock_bh(_l)
+#define rw_init_flags(_l, s, v)
+
+#define mtx_assert(a, b)
+#define        mtx_destroy(m)
+#define mtx_init(m, a,b,c)     spin_lock_init(m)
+#define mtx_lock(_l)           spin_lock_bh(_l)
+#define mtx_unlock(_l)         spin_unlock_bh(_l)
+
+/* end of locking support */
+
+/* in netinet/in.h */
+#define        in_nullhost(x)  ((x).s_addr == INADDR_ANY)
+
+/* bzero not present on linux, but this should go in glue.h */
+#define bzero(s, n) memset(s, 0, n)
+#define bcmp(p1, p2, n) memcmp(p1, p2, n)
+
+/* ethernet stuff */
+#define        ETHERTYPE_IP            0x0800  /* IP protocol */
+#define        ETHER_ADDR_LEN          6       /* length of an Ethernet address */
+struct ether_header {
+        u_char  ether_dhost[ETHER_ADDR_LEN];
+        u_char  ether_shost[ETHER_ADDR_LEN];
+        u_short ether_type;
+};
+
+#define ETHER_ADDR_LEN          6       /* length of an Ethernet address */
+#define ETHER_TYPE_LEN          2       /* length of the Ethernet type field */
+#define ETHER_HDR_LEN           (ETHER_ADDR_LEN*2+ETHER_TYPE_LEN)
+
+/*
+ * Historically, BSD keeps ip_len and ip_off in host format
+ * when doing layer 3 processing, and this often requires
+ * to translate the format back and forth.
+ * To make the process explicit, we define a couple of macros
+ * that also take into account the fact that at some point
+ * we may want to keep those fields always in net format.
+ */
+
+#if (BYTE_ORDER == BIG_ENDIAN) || defined(HAVE_NET_IPLEN)
+#define SET_NET_IPLEN(p)        do {} while (0)
+#define SET_HOST_IPLEN(p)       do {} while (0)
+#else /* never on linux */
+#define SET_NET_IPLEN(p)        do {            \
+        struct ip *h_ip = (p);                  \
+        h_ip->ip_len = htons(h_ip->ip_len);     \
+        h_ip->ip_off = htons(h_ip->ip_off);     \
+        } while (0)
+
+#define SET_HOST_IPLEN(p)       do {            \
+        struct ip *h_ip = (p);                  \
+        h_ip->ip_len = ntohs(h_ip->ip_len);     \
+        h_ip->ip_off = ntohs(h_ip->ip_off);     \
+        } while (0)
+#endif /* !HAVE_NET_IPLEN */
+
+/* ip_dummynet.c */
+#define __FreeBSD_version 500035
+
+#ifdef __linux__
+struct moduledata;
+int my_mod_register(const char *name,
+       int order, struct moduledata *mod, void *init, void *uninit);
+
+/* define some macro for ip_dummynet */
+
+struct malloc_type {
+};
+
+#define MALLOC_DEFINE(type, shortdesc, longdesc)       \
+       struct malloc_type type[1]; void *md_dummy_ ## type = type
+
+#define CTASSERT(x)
+
+/* log... does not use the first argument */
+#define        LOG_ERR         0x100
+#define        LOG_INFO        0x200
+#define log(_level, fmt, arg...)  do {                 \
+       int __unused x=_level;printk(KERN_ERR fmt, ##arg); } while (0)
+
+/*
+ * gettimeofday would be in sys/time.h but it is not
+ * visible if _KERNEL is defined
+ */
+int gettimeofday(struct timeval *, struct timezone *);
+
+#else  /* _WIN32 */
+#define MALLOC_DEFINE(a,b,c)
+#endif /* _WIN32 */
+
+extern int     hz;
+extern long    tick;           /* exists in 2.4 but not in 2.6 */
+extern int     bootverbose;
+extern time_t  time_uptime;
+extern struct timeval boottime;
+
+extern int     max_linkhdr;
+extern int     ip_defttl;
+extern u_long  in_ifaddrhmask;                         /* mask for hash table */
+extern struct in_ifaddrhashhead *in_ifaddrhashtbl;    /* inet addr hash table  */
+
+/*-------------------------------------------------*/
+
+/* define, includes and functions missing in linux */
+/* include and define */
+#include <arpa/inet.h>         /* inet_ntoa */
+
+struct mbuf;
+
+/* used by ip_dummynet.c */
+void reinject_drop(struct mbuf* m);
+
+#include <linux/errno.h>       /* error define */
+#include <linux/if.h>          /* IFNAMESIZ */
+
+void rn_init(int);
+/*
+ * some network structure can be defined in the bsd way
+ * by using the _FAVOR_BSD definition. This is not true
+ * for icmp structure.
+ * XXX struct icmp contains bsd names in 
+ * /usr/include/netinet/ip_icmp.h
+ */
+#ifdef __linux__
+#define icmp_code code
+#define icmp_type type
+
+/* linux in6_addr has no member __u6_addr
+ * replace the whole structure ?
+ */
+#define __u6_addr       in6_u
+#define __u6_addr32     u6_addr32
+#endif /* __linux__ */
+
+/* defined in linux/sctp.h with no bsd definition */
+struct sctphdr {
+        uint16_t src_port;      /* source port */
+        uint16_t dest_port;     /* destination port */
+        uint32_t v_tag;         /* verification tag of packet */
+        uint32_t checksum;      /* Adler32 C-Sum */
+        /* chunks follow... */
+};
+
+/* missing definition */
+#define TH_FIN  0x01
+#define TH_SYN  0x02
+#define TH_RST  0x04
+#define TH_ACK  0x10
+
+#define RTF_CLONING    0x100           /* generate new routes on use */
+
+#define IPPROTO_OSPFIGP         89              /* OSPFIGP */
+#define IPPROTO_CARP            112             /* CARP */
+#ifndef _WIN32
+#define IPPROTO_IPV4            IPPROTO_IPIP    /* for compatibility */
+#endif
+
+#define        CARP_VERSION            2
+#define        CARP_ADVERTISEMENT      0x01
+
+#define PRIV_NETINET_IPFW       491     /* Administer IPFW firewall. */
+
+#define IP_FORWARDING           0x1             /* most of ip header exists */
+
+#define NETISR_IP       2               /* same as AF_INET */
+
+#define PRIV_NETINET_DUMMYNET   494     /* Administer DUMMYNET. */
+
+extern int securelevel;
+
+struct carp_header {
+#if BYTE_ORDER == LITTLE_ENDIAN
+        u_int8_t        carp_type:4,
+                        carp_version:4;
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+        u_int8_t        carp_version:4,
+                        carp_type:4;
+#endif
+};
+
+struct pim {
+       int dummy;      /* windows compiler does not like empty definition */
+};
+
+struct route {
+       struct  rtentry *ro_rt;
+       struct  sockaddr ro_dst;
+};
+
+struct ifaltq {
+       void *ifq_head;
+};
+
+/*
+ * ifnet->if_snd is used in ip_dummynet.c to take the transmission
+ * clock.
+ */
+#if defined( __linux__)
+#define        if_xname        name
+#define        if_snd          XXX
+#elif defined( _WIN32 )
+/* used in ip_dummynet.c */
+struct ifnet {
+       char    if_xname[IFNAMSIZ];     /* external name (name + unit) */
+//        struct ifaltq if_snd;          /* output queue (includes altq) */
+};
+
+struct net_device {
+       char    if_xname[IFNAMSIZ];     /* external name (name + unit) */
+};
+#endif
+
+/* involves mbufs */
+int in_cksum(struct mbuf *m, int len);
+#define divert_cookie(mtag) 0
+#define divert_info(mtag) 0
+#define INADDR_TO_IFP(a, b) b = NULL
+#define pf_find_mtag(a) NULL
+#define pf_get_mtag(a) NULL
+#ifndef _WIN32
+#define AF_LINK AF_ASH /* ? our sys/socket.h */
+#endif
+
+/* we don't pullup, either success or free and fail */
+#define m_pullup(m, x)                                 \
+       ((m)->m_len >= x ? (m) : (FREE_PKT(m), NULL))
+
+struct pf_mtag {
+       void            *hdr;           /* saved hdr pos in mbuf, for ECN */
+       sa_family_t      af;            /* for ECN */
+        u_int32_t        qid;           /* queue id */
+};
+
+#if 0 // ndef radix
+/* radix stuff in radix.h and radix.c */
+struct radix_node {
+       caddr_t rn_key;         /* object of search */
+       caddr_t rn_mask;        /* netmask, if present */
+};
+#endif /* !radix */
+
+/* missing kernel functions */
+char *inet_ntoa(struct in_addr ina);
+int random(void);
+
+/*
+ * Return the result of a/b
+ *
+ * this is used in linux kernel space,
+ * since the 64bit division needs to
+ * be done using a macro
+ */
+int64_t
+div64(int64_t a, int64_t b);
+
+char *
+inet_ntoa_r(struct in_addr ina, char *buf);
+
+/* from bsd sys/queue.h */
+#define TAILQ_FOREACH_SAFE(var, head, field, tvar)                      \
+        for ((var) = TAILQ_FIRST((head));                               \
+            (var) && ((tvar) = TAILQ_NEXT((var), field), 1);            \
+            (var) = (tvar))
+
+#define SLIST_FOREACH_SAFE(var, head, field, tvar)                      \
+        for ((var) = SLIST_FIRST((head));                               \
+            (var) && ((tvar) = SLIST_NEXT((var), field), 1);            \
+            (var) = (tvar))
+
+/* depending of linux version */
+#ifndef ETHERTYPE_IPV6
+#define ETHERTYPE_IPV6          0x86dd          /* IP protocol version 6 */
+#endif
+
+/*-------------------------------------------------*/
+#define RT_NUMFIBS 1
+extern u_int rt_numfibs;
+
+/* involves kernel locking function */
+#ifdef RTFREE
+#undef RTFREE
+#define RTFREE(a) fprintf(stderr, "RTFREE: commented out locks\n");
+#endif
+
+void getmicrouptime(struct timeval *tv);
+
+/* from sys/netinet/ip_output.c */
+struct ip_moptions;
+struct route;
+struct ip;
+
+struct mbuf *ip_reass(struct mbuf *);
+u_short in_cksum_hdr(struct ip *);
+int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
+    struct ip_moptions *imo, struct inpcb *inp);
+
+/* from net/netisr.c */
+void netisr_dispatch(int num, struct mbuf *m);
+
+/* definition moved in missing.c */
+int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len);
+
+int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen);
+
+/* defined in session.c */
+int priv_check(struct thread *td, int priv);
+
+/* struct ucred is in linux/socket.h and has pid, uid, gid.
+ * We need a 'bsd_ucred' to store also the extra info
+ */
+
+struct bsd_ucred {     /* credential info filled in by linux_lookup() */
+       uid_t           uid;    /* uid of the owning socket's file */
+       gid_t           gid;    /* gid of the owning socket's file */
+       uint32_t        xid;    /* vserver context id, 0 without CONFIG_VSERVER */
+       uint32_t        nid;    /* vserver network id, 0 without CONFIG_VSERVER */
+};
+
+int
+cred_check(void *insn, int proto, struct ifnet *oif,
+    struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
+    u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp,
+    struct sk_buff *skb);
+
+int securelevel_ge(struct ucred *cr, int level);
+
+struct sysctl_oid;
+struct sysctl_req;
+
+/*
+ * sysctl are mapped into /sys/module/ipfw_mod parameters
+ */
+#define CTLFLAG_RD             1
+#define CTLFLAG_RDTUN          1
+#define CTLFLAG_RW             2
+#define CTLFLAG_SECURE3                0 // unsupported
+#define CTLFLAG_VNET    0      /* unsupported */
+
+#ifdef _WIN32
+#define module_param_named(_name, _var, _ty, _perm)
+#else
+
+/* Linux 2.4 is mostly for openwrt */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+#include <linux/bitops.h>       /* generic_ffs() used in ip_fw2.c */
+typedef uint32_t __be32;
+typedef uint16_t __be16;
+struct sock;
+struct net;
+struct inet_hashinfo;
+struct sock *inet_lookup(
+       struct inet_hashinfo *hashinfo,
+        const __be32 saddr, const __be16 sport,
+        const __be32 daddr, const __be16 dport,
+        const int dif);
+struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif);
+#endif /* Linux < 2.6 */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
+#define module_param_named(_name, _var, _ty, _perm)    \
+       //module_param(_name, _ty, 0644)
+#endif
+#endif /* __linux__ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
+typedef unsigned long uintptr_t;
+#endif
+
+#define SYSCTL_DECL(_1)
+#define SYSCTL_OID(_1, _2, _3, _4, _5, _6, _7, _8)
+#define SYSCTL_NODE(_1, _2, _3, _4, _5, _6)
+#define _SYSCTL_BASE(_name, _var, _ty, _perm)          \
+       module_param_named(_name, *(_var), _ty,         \
+               ( (_perm) == CTLFLAG_RD) ? 0444: 0644 )
+#define SYSCTL_PROC(_base, _oid, _name, _mode, _var, _val, _desc, _a, _b)
+
+#define SYSCTL_INT(_base, _oid, _name, _mode, _var, _val, _desc)       \
+       _SYSCTL_BASE(_name, _var, int, _mode)
+
+#define SYSCTL_LONG(_base, _oid, _name, _mode, _var, _val, _desc)      \
+       _SYSCTL_BASE(_name, _var, long, _mode)
+
+#define SYSCTL_ULONG(_base, _oid, _name, _mode, _var, _val, _desc)     \
+       _SYSCTL_BASE(_name, _var, ulong, _mode)
+
+#define SYSCTL_UINT(_base, _oid, _name, _mode, _var, _val, _desc)      \
+        _SYSCTL_BASE(_name, _var, uint, _mode)
+
+#define SYSCTL_HANDLER_ARGS            \
+       struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req
+int sysctl_handle_int(SYSCTL_HANDLER_ARGS);
+int sysctl_handle_long(SYSCTL_HANDLER_ARGS); 
+
+#define TUNABLE_INT(_name, _ptr)
+
+void ether_demux(struct ifnet *ifp, struct mbuf *m);
+
+int ether_output_frame(struct ifnet *ifp, struct mbuf *m);
+
+void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum);
+
+void icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu);
+
+void rtfree(struct rtentry *rt);
+
+u_short in_cksum_skip(struct mbuf *m, int len, int skip);
+
+#ifdef INP_LOCK_ASSERT
+#undef INP_LOCK_ASSERT
+#define INP_LOCK_ASSERT(a)
+#endif
+
+int jailed(struct ucred *cred);
+
+/*
+* Return 1 if an internet address is for a ``local'' host
+* (one to which we have a connection).  If subnetsarelocal
+* is true, this includes other subnets of the local net.
+* Otherwise, it includes only the directly-connected (sub)nets.
+*/
+int in_localaddr(struct in_addr in);
+
+/* the prototype is already in the headers */
+//int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); 
+
+int fnmatch(const char *pattern, const char *string, int flags);
+
+int
+linux_lookup(const int proto, const __be32 saddr, const __be16 sport,
+       const __be32 daddr, const __be16 dport,
+       struct sk_buff *skb, int dir, struct bsd_ucred *u);
+
+/* vnet wrappers, in vnet.h and ip_var.h */
+//int ipfw_init(void);
+//void ipfw_destroy(void);
+struct ip_fw_args;
+extern int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa);
+
+#define curvnet                 NULL
+#define        CURVNET_SET(_v)
+#define        CURVNET_RESTORE()
+#define VNET_ASSERT(condition)
+
+#define VNET_NAME(n)            n
+#define VNET_DECLARE(t, n)      extern t n
+#define VNET_DEFINE(t, n)       t n
+#define _VNET_PTR(b, n)         &VNET_NAME(n)
+/*
+ * Virtualized global variable accessor macros.
+ */
+#define VNET_VNET_PTR(vnet, n)          (&(n))
+#define VNET_VNET(vnet, n)              (n)
+
+#define VNET_PTR(n)             (&(n))
+#define VNET(n)                 (n)
+
+int
+ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
+    struct inpcb *inp);
+
+extern int (*ip_dn_ctl_ptr)(struct sockopt *);
+typedef int ip_fw_ctl_t(struct sockopt *);
+extern ip_fw_ctl_t *ip_fw_ctl_ptr;
+
+/* For kernel ipfw_ether and ipfw_bridge. */
+struct ip_fw_args;
+typedef int ip_fw_chk_t(struct ip_fw_args *args);
+extern  ip_fw_chk_t     *ip_fw_chk_ptr;
+
+#define V_ip_fw_chk_ptr         VNET(ip_fw_chk_ptr)
+#define V_ip_fw_ctl_ptr         VNET(ip_fw_ctl_ptr)
+#define        V_tcbinfo               VNET(tcbinfo)
+#define        V_udbinfo               VNET(udbinfo)
+
+#define SYSCTL_VNET_PROC       SYSCTL_PROC
+#define SYSCTL_VNET_INT                SYSCTL_INT
+
+#endif /* !_MISSING_H_ */
diff --git a/dummynet2/radix.c b/dummynet2/radix.c
new file mode 100644 (file)
index 0000000..5d508e4
--- /dev/null
@@ -0,0 +1,1186 @@
+/*-
+ * Copyright (c) 1988, 1989, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)radix.c     8.5 (Berkeley) 5/19/95
+ * $FreeBSD: head/sys/net/radix.c 200354 2009-12-10 10:34:30Z luigi $
+ */
+
+/*
+ * Routines to build and maintain radix trees for routing lookups.
+ */
+#include <sys/param.h>
+#ifdef _KERNEL
+#include <sys/cdefs.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/syslog.h>
+#include <net/radix.h>
+#include "opt_mpath.h"
+#ifdef RADIX_MPATH
+#include <net/radix_mpath.h>
+#endif
+#else /* !_KERNEL */
+#include <stdio.h>
+#include <strings.h>
+#include <stdlib.h>
+#define log(x, arg...) fprintf(stderr, ## arg)
+#define panic(x)       fprintf(stderr, "PANIC: %s", x), exit(1)
+#define min(a, b) ((a) < (b) ? (a) : (b) )
+#include "include/net/radix.h"
+#endif /* !_KERNEL */
+
+static int     rn_walktree_from(struct radix_node_head *h, void *a, void *m,
+                   walktree_f_t *f, void *w);
+static int rn_walktree(struct radix_node_head *, walktree_f_t *, void *);
+static struct radix_node
+        *rn_insert(void *, struct radix_node_head *, int *,
+            struct radix_node [2]),
+        *rn_newpair(void *, int, struct radix_node[2]),
+        *rn_search(void *, struct radix_node *),
+        *rn_search_m(void *, struct radix_node *, void *);
+
+static int     max_keylen;
+static struct radix_mask *rn_mkfreelist;
+static struct radix_node_head *mask_rnhead;
+/*
+ * Work area -- the following point to 3 buffers of size max_keylen,
+ * allocated in this order in a block of memory malloc'ed by rn_init.
+ * rn_zeros, rn_ones are set in rn_init and used in readonly afterwards.
+ * addmask_key is used in rn_addmask in rw mode and not thread-safe.
+ */
+static char *rn_zeros, *rn_ones, *addmask_key;
+
+#define MKGet(m) {                                             \
+       if (rn_mkfreelist) {                                    \
+               m = rn_mkfreelist;                              \
+               rn_mkfreelist = (m)->rm_mklist;                 \
+       } else                                                  \
+               R_Malloc(m, struct radix_mask *, sizeof (struct radix_mask)); }
+#define MKFree(m) { (m)->rm_mklist = rn_mkfreelist; rn_mkfreelist = (m);}
+
+#define rn_masktop (mask_rnhead->rnh_treetop)
+
+static int     rn_lexobetter(void *m_arg, void *n_arg);
+static struct radix_mask *
+               rn_new_radix_mask(struct radix_node *tt,
+                   struct radix_mask *next);
+static int     rn_satisfies_leaf(char *trial, struct radix_node *leaf,
+                   int skip);
+
+/*
+ * The data structure for the keys is a radix tree with one way
+ * branching removed.  The index rn_bit at an internal node n represents a bit
+ * position to be tested.  The tree is arranged so that all descendants
+ * of a node n have keys whose bits all agree up to position rn_bit - 1.
+ * (We say the index of n is rn_bit.)
+ *
+ * There is at least one descendant which has a one bit at position rn_bit,
+ * and at least one with a zero there.
+ *
+ * A route is determined by a pair of key and mask.  We require that the
+ * bit-wise logical and of the key and mask to be the key.
+ * We define the index of a route to associated with the mask to be
+ * the first bit number in the mask where 0 occurs (with bit number 0
+ * representing the highest order bit).
+ *
+ * We say a mask is normal if every bit is 0, past the index of the mask.
+ * If a node n has a descendant (k, m) with index(m) == index(n) == rn_bit,
+ * and m is a normal mask, then the route applies to every descendant of n.
+ * If the index(m) < rn_bit, this implies the trailing last few bits of k
+ * before bit b are all 0, (and hence consequently true of every descendant
+ * of n), so the route applies to all descendants of the node as well.
+ *
+ * Similar logic shows that a non-normal mask m such that
+ * index(m) <= index(n) could potentially apply to many children of n.
+ * Thus, for each non-host route, we attach its mask to a list at an internal
+ * node as high in the tree as we can go.
+ *
+ * The present version of the code makes use of normal routes in short-
+ * circuiting an explict mask and compare operation when testing whether
+ * a key satisfies a normal route, and also in remembering the unique leaf
+ * that governs a subtree.
+ */
+
+/*
+ * Most of the functions in this code assume that the key/mask arguments
+ * are sockaddr-like structures, where the first byte is an u_char
+ * indicating the size of the entire structure.
+ *
+ * To make the assumption more explicit, we use the LEN() macro to access
+ * this field. It is safe to pass an expression with side effects
+ * to LEN() as the argument is evaluated only once.
+ * We cast the result to int as this is the dominant usage.
+ */
+#define LEN(x) ( (int) (*(const u_char *)(x)) )
+
+/*
+ * XXX THIS NEEDS TO BE FIXED
+ * In the code, pointers to keys and masks are passed as either
+ * 'void *' (because callers use to pass pointers of various kinds), or
+ * 'caddr_t' (which is fine for pointer arithmetics, but not very
+ * clean when you dereference it to access data). Furthermore, caddr_t
+ * is really 'char *', while the natural type to operate on keys and
+ * masks would be 'u_char'. This mismatch require a lot of casts and
+ * intermediate variables to adapt types that clutter the code.
+ */
+
+/*
+ * Search a node in the tree matching the key.
+ */
+static struct radix_node *
+rn_search(v_arg, head)
+       void *v_arg;
+       struct radix_node *head;
+{
+       register struct radix_node *x;
+       register caddr_t v;
+
+       for (x = head, v = v_arg; x->rn_bit >= 0;) {
+               if (x->rn_bmask & v[x->rn_offset])
+                       x = x->rn_right;
+               else
+                       x = x->rn_left;
+       }
+       return (x);
+}
+
+/*
+ * Same as above, but with an additional mask.
+ * XXX note this function is used only once.
+ */
+static struct radix_node *
+rn_search_m(v_arg, head, m_arg)
+       struct radix_node *head;
+       void *v_arg, *m_arg;
+{
+       register struct radix_node *x;
+       register caddr_t v = v_arg, m = m_arg;
+
+       for (x = head; x->rn_bit >= 0;) {
+               if ((x->rn_bmask & m[x->rn_offset]) &&
+                   (x->rn_bmask & v[x->rn_offset]))
+                       x = x->rn_right;
+               else
+                       x = x->rn_left;
+       }
+       return x;
+}
+
+int
+rn_refines(m_arg, n_arg)
+       void *m_arg, *n_arg;
+{
+       register caddr_t m = m_arg, n = n_arg;
+       register caddr_t lim, lim2 = lim = n + LEN(n);
+       int longer = LEN(n++) - LEN(m++);
+       int masks_are_equal = 1;
+
+       if (longer > 0)
+               lim -= longer;
+       while (n < lim) {
+               if (*n & ~(*m))
+                       return 0;
+               if (*n++ != *m++)
+                       masks_are_equal = 0;
+       }
+       while (n < lim2)
+               if (*n++)
+                       return 0;
+       if (masks_are_equal && (longer < 0))
+               for (lim2 = m - longer; m < lim2; )
+                       if (*m++)
+                               return 1;
+       return (!masks_are_equal);
+}
+
+struct radix_node *
+rn_lookup(v_arg, m_arg, head)
+       void *v_arg, *m_arg;
+       struct radix_node_head *head;
+{
+       register struct radix_node *x;
+       caddr_t netmask = 0;
+
+       if (m_arg) {
+               x = rn_addmask(m_arg, 1, head->rnh_treetop->rn_offset);
+               if (x == 0)
+                       return (0);
+               netmask = x->rn_key;
+       }
+       x = rn_match(v_arg, head);
+       if (x && netmask) {
+               while (x && x->rn_mask != netmask)
+                       x = x->rn_dupedkey;
+       }
+       return x;
+}
+
+static int
+rn_satisfies_leaf(trial, leaf, skip)
+       char *trial;
+       register struct radix_node *leaf;
+       int skip;
+{
+       register char *cp = trial, *cp2 = leaf->rn_key, *cp3 = leaf->rn_mask;
+       char *cplim;
+       int length = min(LEN(cp), LEN(cp2));
+
+       if (cp3 == NULL)
+               cp3 = rn_ones;
+       else
+               length = min(length, LEN(cp3));
+       cplim = cp + length; cp3 += skip; cp2 += skip;
+       for (cp += skip; cp < cplim; cp++, cp2++, cp3++)
+               if ((*cp ^ *cp2) & *cp3)
+                       return 0;
+       return 1;
+}
+
+struct radix_node *
+rn_match(v_arg, head)
+       void *v_arg;
+       struct radix_node_head *head;
+{
+       caddr_t v = v_arg;
+       register struct radix_node *t = head->rnh_treetop, *x;
+       register caddr_t cp = v, cp2;
+       caddr_t cplim;
+       struct radix_node *saved_t, *top = t;
+       int off = t->rn_offset, vlen = LEN(cp), matched_off;
+       register int test, b, rn_bit;
+
+       /*
+        * Open code rn_search(v, top) to avoid overhead of extra
+        * subroutine call.
+        */
+       for (; t->rn_bit >= 0; ) {
+               if (t->rn_bmask & cp[t->rn_offset])
+                       t = t->rn_right;
+               else
+                       t = t->rn_left;
+       }
+       /*
+        * See if we match exactly as a host destination
+        * or at least learn how many bits match, for normal mask finesse.
+        *
+        * It doesn't hurt us to limit how many bytes to check
+        * to the length of the mask, since if it matches we had a genuine
+        * match and the leaf we have is the most specific one anyway;
+        * if it didn't match with a shorter length it would fail
+        * with a long one.  This wins big for class B&C netmasks which
+        * are probably the most common case...
+        */
+       if (t->rn_mask)
+               vlen = *(u_char *)t->rn_mask;
+       cp += off; cp2 = t->rn_key + off; cplim = v + vlen;
+       for (; cp < cplim; cp++, cp2++)
+               if (*cp != *cp2)
+                       goto on1;
+       /*
+        * This extra grot is in case we are explicitly asked
+        * to look up the default.  Ugh!
+        *
+        * Never return the root node itself, it seems to cause a
+        * lot of confusion.
+        */
+       if (t->rn_flags & RNF_ROOT)
+               t = t->rn_dupedkey;
+       return t;
+on1:
+       test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */
+       for (b = 7; (test >>= 1) > 0;)
+               b--;
+       matched_off = cp - v;
+       b += matched_off << 3;
+       rn_bit = -1 - b;
+       /*
+        * If there is a host route in a duped-key chain, it will be first.
+        */
+       if ((saved_t = t)->rn_mask == 0)
+               t = t->rn_dupedkey;
+       for (; t; t = t->rn_dupedkey)
+               /*
+                * Even if we don't match exactly as a host,
+                * we may match if the leaf we wound up at is
+                * a route to a net.
+                */
+               if (t->rn_flags & RNF_NORMAL) {
+                       if (rn_bit <= t->rn_bit)
+                               return t;
+               } else if (rn_satisfies_leaf(v, t, matched_off))
+                               return t;
+       t = saved_t;
+       /* start searching up the tree */
+       do {
+               register struct radix_mask *m;
+               t = t->rn_parent;
+               m = t->rn_mklist;
+               /*
+                * If non-contiguous masks ever become important
+                * we can restore the masking and open coding of
+                * the search and satisfaction test and put the
+                * calculation of "off" back before the "do".
+                */
+               while (m) {
+                       if (m->rm_flags & RNF_NORMAL) {
+                               if (rn_bit <= m->rm_bit)
+                                       return (m->rm_leaf);
+                       } else {
+                               off = min(t->rn_offset, matched_off);
+                               x = rn_search_m(v, t, m->rm_mask);
+                               while (x && x->rn_mask != m->rm_mask)
+                                       x = x->rn_dupedkey;
+                               if (x && rn_satisfies_leaf(v, x, off))
+                                       return x;
+                       }
+                       m = m->rm_mklist;
+               }
+       } while (t != top);
+       return 0;
+}
+
+#ifdef RN_DEBUG
+int    rn_nodenum;
+struct radix_node *rn_clist;
+int    rn_saveinfo;
+int    rn_debug =  1;
+#endif
+
+/*
+ * Whenever we add a new leaf to the tree, we also add a parent node,
+ * so we allocate them as an array of two elements: the first one must be
+ * the leaf (see RNTORT() in route.c), the second one is the parent.
+ * This routine initializes the relevant fields of the nodes, so that
+ * the leaf is the left child of the parent node, and both nodes have
+ * (almost) all all fields filled as appropriate.
+ * (XXX some fields are left unset, see the '#if 0' section).
+ * The function returns a pointer to the parent node.
+ */
+
+static struct radix_node *
+rn_newpair(v, b, nodes)
+       void *v;
+       int b;
+       struct radix_node nodes[2];
+{
+       register struct radix_node *tt = nodes, *t = tt + 1;
+       t->rn_bit = b;
+       t->rn_bmask = 0x80 >> (b & 7);
+       t->rn_left = tt;
+       t->rn_offset = b >> 3;
+
+#if 0  /* XXX perhaps we should fill these fields as well. */
+       t->rn_parent = t->rn_right = NULL;
+
+       tt->rn_mask = NULL;
+       tt->rn_dupedkey = NULL;
+       tt->rn_bmask = 0;
+#endif
+       tt->rn_bit = -1;
+       tt->rn_key = (caddr_t)v;
+       tt->rn_parent = t;
+       tt->rn_flags = t->rn_flags = RNF_ACTIVE;
+       tt->rn_mklist = t->rn_mklist = 0;
+#ifdef RN_DEBUG
+       tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++;
+       tt->rn_twin = t;
+       tt->rn_ybro = rn_clist;
+       rn_clist = tt;
+#endif
+       return t;
+}
+
+static struct radix_node *
+rn_insert(v_arg, head, dupentry, nodes)
+       void *v_arg;
+       struct radix_node_head *head;
+       int *dupentry;
+       struct radix_node nodes[2];
+{
+       caddr_t v = v_arg;
+       struct radix_node *top = head->rnh_treetop;
+       int head_off = top->rn_offset, vlen = LEN(v);
+       register struct radix_node *t = rn_search(v_arg, top);
+       register caddr_t cp = v + head_off;
+       register int b;
+       struct radix_node *tt;
+       /*
+        * Find first bit at which v and t->rn_key differ
+        */
+    {
+       register caddr_t cp2 = t->rn_key + head_off;
+       register int cmp_res;
+       caddr_t cplim = v + vlen;
+
+       while (cp < cplim)
+               if (*cp2++ != *cp++)
+                       goto on1;
+       *dupentry = 1;
+       return t;
+on1:
+       *dupentry = 0;
+       cmp_res = (cp[-1] ^ cp2[-1]) & 0xff;
+       for (b = (cp - v) << 3; cmp_res; b--)
+               cmp_res >>= 1;
+    }
+    {
+       register struct radix_node *p, *x = top;
+       cp = v;
+       do {
+               p = x;
+               if (cp[x->rn_offset] & x->rn_bmask)
+                       x = x->rn_right;
+               else
+                       x = x->rn_left;
+       } while (b > (unsigned) x->rn_bit);
+                               /* x->rn_bit < b && x->rn_bit >= 0 */
+#ifdef RN_DEBUG
+       if (rn_debug)
+               log(LOG_DEBUG, "rn_insert: Going In:\n"), traverse(p);
+#endif
+       t = rn_newpair(v_arg, b, nodes); 
+       tt = t->rn_left;
+       if ((cp[p->rn_offset] & p->rn_bmask) == 0)
+               p->rn_left = t;
+       else
+               p->rn_right = t;
+       x->rn_parent = t;
+       t->rn_parent = p; /* frees x, p as temp vars below */
+       if ((cp[t->rn_offset] & t->rn_bmask) == 0) {
+               t->rn_right = x;
+       } else {
+               t->rn_right = tt;
+               t->rn_left = x;
+       }
+#ifdef RN_DEBUG
+       if (rn_debug)
+               log(LOG_DEBUG, "rn_insert: Coming Out:\n"), traverse(p);
+#endif
+    }
+       return (tt);
+}
+
+struct radix_node *
+rn_addmask(n_arg, search, skip)
+       int search, skip;
+       void *n_arg;
+{
+       caddr_t netmask = (caddr_t)n_arg;
+       register struct radix_node *x;
+       register caddr_t cp, cplim;
+       register int b = 0, mlen, j;
+       int maskduplicated, m0, isnormal;
+       struct radix_node *saved_x;
+       static int last_zeroed = 0;
+
+       if ((mlen = LEN(netmask)) > max_keylen)
+               mlen = max_keylen;
+       if (skip == 0)
+               skip = 1;
+       if (mlen <= skip)
+               return (mask_rnhead->rnh_nodes);
+       if (skip > 1)
+               bcopy(rn_ones + 1, addmask_key + 1, skip - 1);
+       if ((m0 = mlen) > skip)
+               bcopy(netmask + skip, addmask_key + skip, mlen - skip);
+       /*
+        * Trim trailing zeroes.
+        */
+       for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0;)
+               cp--;
+       mlen = cp - addmask_key;
+       if (mlen <= skip) {
+               if (m0 >= last_zeroed)
+                       last_zeroed = mlen;
+               return (mask_rnhead->rnh_nodes);
+       }
+       if (m0 < last_zeroed)
+               bzero(addmask_key + m0, last_zeroed - m0);
+       *addmask_key = last_zeroed = mlen;
+       x = rn_search(addmask_key, rn_masktop);
+       if (bcmp(addmask_key, x->rn_key, mlen) != 0)
+               x = 0;
+       if (x || search)
+               return (x);
+       R_Zalloc(x, struct radix_node *, max_keylen + 2 * sizeof (*x));
+       if ((saved_x = x) == 0)
+               return (0);
+       netmask = cp = (caddr_t)(x + 2);
+       bcopy(addmask_key, cp, mlen);
+       x = rn_insert(cp, mask_rnhead, &maskduplicated, x);
+       if (maskduplicated) {
+               log(LOG_ERR, "rn_addmask: mask impossibly already in tree");
+               Free(saved_x);
+               return (x);
+       }
+       /*
+        * Calculate index of mask, and check for normalcy.
+        * First find the first byte with a 0 bit, then if there are
+        * more bits left (remember we already trimmed the trailing 0's),
+        * the pattern must be one of those in normal_chars[], or we have
+        * a non-contiguous mask.
+        */
+       cplim = netmask + mlen;
+       isnormal = 1;
+       for (cp = netmask + skip; (cp < cplim) && *(u_char *)cp == 0xff;)
+               cp++;
+       if (cp != cplim) {
+               static char normal_chars[] = {
+                       0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};
+
+               for (j = 0x80; (j & *cp) != 0; j >>= 1)
+                       b++;
+               if (*cp != normal_chars[b] || cp != (cplim - 1))
+                       isnormal = 0;
+       }
+       b += (cp - netmask) << 3;
+       x->rn_bit = -1 - b;
+       if (isnormal)
+               x->rn_flags |= RNF_NORMAL;
+       return (x);
+}
+
+static int     /* XXX: arbitrary ordering for non-contiguous masks */
+rn_lexobetter(m_arg, n_arg)
+       void *m_arg, *n_arg;
+{
+       register u_char *mp = m_arg, *np = n_arg, *lim;
+
+       if (LEN(mp) > LEN(np))
+               return 1;  /* not really, but need to check longer one first */
+       if (LEN(mp) == LEN(np))
+               for (lim = mp + LEN(mp); mp < lim;)
+                       if (*mp++ > *np++)
+                               return 1;
+       return 0;
+}
+
+static struct radix_mask *
+rn_new_radix_mask(tt, next)
+       register struct radix_node *tt;
+       register struct radix_mask *next;
+{
+       register struct radix_mask *m;
+
+       MKGet(m);
+       if (m == 0) {
+               log(LOG_ERR, "Mask for route not entered\n");
+               return (0);
+       }
+       bzero(m, sizeof *m);
+       m->rm_bit = tt->rn_bit;
+       m->rm_flags = tt->rn_flags;
+       if (tt->rn_flags & RNF_NORMAL)
+               m->rm_leaf = tt;
+       else
+               m->rm_mask = tt->rn_mask;
+       m->rm_mklist = next;
+       tt->rn_mklist = m;
+       return m;
+}
+
+struct radix_node *
+rn_addroute(v_arg, n_arg, head, treenodes)
+       void *v_arg, *n_arg;
+       struct radix_node_head *head;
+       struct radix_node treenodes[2];
+{
+       caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg;
+       register struct radix_node *t, *x = 0, *tt;
+       struct radix_node *saved_tt, *top = head->rnh_treetop;
+       short b = 0, b_leaf = 0;
+       int keyduplicated;
+       caddr_t mmask;
+       struct radix_mask *m, **mp;
+
+       /*
+        * In dealing with non-contiguous masks, there may be
+        * many different routes which have the same mask.
+        * We will find it useful to have a unique pointer to
+        * the mask to speed avoiding duplicate references at
+        * nodes and possibly save time in calculating indices.
+        */
+       if (netmask)  {
+               if ((x = rn_addmask(netmask, 0, top->rn_offset)) == 0)
+                       return (0);
+               b_leaf = x->rn_bit;
+               b = -1 - x->rn_bit;
+               netmask = x->rn_key;
+       }
+       /*
+        * Deal with duplicated keys: attach node to previous instance
+        */
+       saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes);
+       if (keyduplicated) {
+               for (t = tt; tt; t = tt, tt = tt->rn_dupedkey) {
+#ifdef RADIX_MPATH
+                       /* permit multipath, if enabled for the family */
+                       if (rn_mpath_capable(head) && netmask == tt->rn_mask) {
+                               /*
+                                * go down to the end of multipaths, so that
+                                * new entry goes into the end of rn_dupedkey
+                                * chain.
+                                */
+                               do {
+                                       t = tt;
+                                       tt = tt->rn_dupedkey;
+                               } while (tt && t->rn_mask == tt->rn_mask);
+                               break;
+                       }
+#endif
+                       if (tt->rn_mask == netmask)
+                               return (0);
+                       if (netmask == 0 ||
+                           (tt->rn_mask &&
+                            ((b_leaf < tt->rn_bit) /* index(netmask) > node */
+                             || rn_refines(netmask, tt->rn_mask)
+                             || rn_lexobetter(netmask, tt->rn_mask))))
+                               break;
+               }
+               /*
+                * If the mask is not duplicated, we wouldn't
+                * find it among possible duplicate key entries
+                * anyway, so the above test doesn't hurt.
+                *
+                * We sort the masks for a duplicated key the same way as
+                * in a masklist -- most specific to least specific.
+                * This may require the unfortunate nuisance of relocating
+                * the head of the list.
+                *
+                * We also reverse, or doubly link the list through the
+                * parent pointer.
+                */
+               if (tt == saved_tt) {
+                       struct  radix_node *xx = x;
+                       /* link in at head of list */
+                       (tt = treenodes)->rn_dupedkey = t;
+                       tt->rn_flags = t->rn_flags;
+                       tt->rn_parent = x = t->rn_parent;
+                       t->rn_parent = tt;                      /* parent */
+                       if (x->rn_left == t)
+                               x->rn_left = tt;
+                       else
+                               x->rn_right = tt;
+                       saved_tt = tt; x = xx;
+               } else {
+                       (tt = treenodes)->rn_dupedkey = t->rn_dupedkey;
+                       t->rn_dupedkey = tt;
+                       tt->rn_parent = t;                      /* parent */
+                       if (tt->rn_dupedkey)                    /* parent */
+                               tt->rn_dupedkey->rn_parent = tt; /* parent */
+               }
+#ifdef RN_DEBUG
+               t=tt+1; tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++;
+               tt->rn_twin = t; tt->rn_ybro = rn_clist; rn_clist = tt;
+#endif
+               tt->rn_key = (caddr_t) v;
+               tt->rn_bit = -1;
+               tt->rn_flags = RNF_ACTIVE;
+       }
+       /*
+        * Put mask in tree.
+        */
+       if (netmask) {
+               tt->rn_mask = netmask;
+               tt->rn_bit = x->rn_bit;
+               tt->rn_flags |= x->rn_flags & RNF_NORMAL;
+       }
+       t = saved_tt->rn_parent;
+       if (keyduplicated)
+               goto on2;
+       b_leaf = -1 - t->rn_bit;
+       if (t->rn_right == saved_tt)
+               x = t->rn_left;
+       else
+               x = t->rn_right;
+       /* Promote general routes from below */
+       if (x->rn_bit < 0) {
+           for (mp = &t->rn_mklist; x; x = x->rn_dupedkey)
+               if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) {
+                       *mp = m = rn_new_radix_mask(x, 0);
+                       if (m)
+                               mp = &m->rm_mklist;
+               }
+       } else if (x->rn_mklist) {
+               /*
+                * Skip over masks whose index is > that of new node
+                */
+               for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist)
+                       if (m->rm_bit >= b_leaf)
+                               break;
+               t->rn_mklist = m; *mp = 0;
+       }
+on2:
+       /* Add new route to highest possible ancestor's list */
+       if ((netmask == 0) || (b > t->rn_bit ))
+               return tt; /* can't lift at all */
+       b_leaf = tt->rn_bit;
+       do {
+               x = t;
+               t = t->rn_parent;
+       } while (b <= t->rn_bit && x != top);
+       /*
+        * Search through routes associated with node to
+        * insert new route according to index.
+        * Need same criteria as when sorting dupedkeys to avoid
+        * double loop on deletion.
+        */
+       for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) {
+               if (m->rm_bit < b_leaf)
+                       continue;
+               if (m->rm_bit > b_leaf)
+                       break;
+               if (m->rm_flags & RNF_NORMAL) {
+                       mmask = m->rm_leaf->rn_mask;
+                       if (tt->rn_flags & RNF_NORMAL) {
+                           log(LOG_ERR,
+                               "Non-unique normal route, mask not entered\n");
+                               return tt;
+                       }
+               } else
+                       mmask = m->rm_mask;
+               if (mmask == netmask) {
+                       m->rm_refs++;
+                       tt->rn_mklist = m;
+                       return tt;
+               }
+               if (rn_refines(netmask, mmask)
+                   || rn_lexobetter(netmask, mmask))
+                       break;
+       }
+       *mp = rn_new_radix_mask(tt, *mp);
+       return tt;
+}
+
+struct radix_node *
+rn_delete(v_arg, netmask_arg, head)
+       void *v_arg, *netmask_arg;
+       struct radix_node_head *head;
+{
+       register struct radix_node *t, *p, *x, *tt;
+       struct radix_mask *m, *saved_m, **mp;
+       struct radix_node *dupedkey, *saved_tt, *top;
+       caddr_t v, netmask;
+       int b, head_off, vlen;
+
+       v = v_arg;
+       netmask = netmask_arg;
+       x = head->rnh_treetop;
+       tt = rn_search(v, x);
+       head_off = x->rn_offset;
+       vlen =  LEN(v);
+       saved_tt = tt;
+       top = x;
+       if (tt == 0 ||
+           bcmp(v + head_off, tt->rn_key + head_off, vlen - head_off))
+               return (0);
+       /*
+        * Delete our route from mask lists.
+        */
+       if (netmask) {
+               if ((x = rn_addmask(netmask, 1, head_off)) == 0)
+                       return (0);
+               netmask = x->rn_key;
+               while (tt->rn_mask != netmask)
+                       if ((tt = tt->rn_dupedkey) == 0)
+                               return (0);
+       }
+       if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0)
+               goto on1;
+       if (tt->rn_flags & RNF_NORMAL) {
+               if (m->rm_leaf != tt || m->rm_refs > 0) {
+                       log(LOG_ERR, "rn_delete: inconsistent annotation\n");
+                       return 0;  /* dangling ref could cause disaster */
+               }
+       } else {
+               if (m->rm_mask != tt->rn_mask) {
+                       log(LOG_ERR, "rn_delete: inconsistent annotation\n");
+                       goto on1;
+               }
+               if (--m->rm_refs >= 0)
+                       goto on1;
+       }
+       b = -1 - tt->rn_bit;
+       t = saved_tt->rn_parent;
+       if (b > t->rn_bit)
+               goto on1; /* Wasn't lifted at all */
+       do {
+               x = t;
+               t = t->rn_parent;
+       } while (b <= t->rn_bit && x != top);
+       for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist)
+               if (m == saved_m) {
+                       *mp = m->rm_mklist;
+                       MKFree(m);
+                       break;
+               }
+       if (m == 0) {
+               log(LOG_ERR, "rn_delete: couldn't find our annotation\n");
+               if (tt->rn_flags & RNF_NORMAL)
+                       return (0); /* Dangling ref to us */
+       }
+on1:
+       /*
+        * Eliminate us from tree
+        */
+       if (tt->rn_flags & RNF_ROOT)
+               return (0);
+#ifdef RN_DEBUG
+       /* Get us out of the creation list */
+       for (t = rn_clist; t && t->rn_ybro != tt; t = t->rn_ybro) {}
+       if (t) t->rn_ybro = tt->rn_ybro;
+#endif
+       t = tt->rn_parent;
+       dupedkey = saved_tt->rn_dupedkey;
+       if (dupedkey) {
+               /*
+                * Here, tt is the deletion target and
+                * saved_tt is the head of the dupekey chain.
+                */
+               if (tt == saved_tt) {
+                       /* remove from head of chain */
+                       x = dupedkey; x->rn_parent = t;
+                       if (t->rn_left == tt)
+                               t->rn_left = x;
+                       else
+                               t->rn_right = x;
+               } else {
+                       /* find node in front of tt on the chain */
+                       for (x = p = saved_tt; p && p->rn_dupedkey != tt;)
+                               p = p->rn_dupedkey;
+                       if (p) {
+                               p->rn_dupedkey = tt->rn_dupedkey;
+                               if (tt->rn_dupedkey)            /* parent */
+                                       tt->rn_dupedkey->rn_parent = p;
+                                                               /* parent */
+                       } else log(LOG_ERR, "rn_delete: couldn't find us\n");
+               }
+               t = tt + 1;
+               if  (t->rn_flags & RNF_ACTIVE) {
+#ifndef RN_DEBUG
+                       *++x = *t;
+                       p = t->rn_parent;
+#else
+                       b = t->rn_info;
+                       *++x = *t;
+                       t->rn_info = b;
+                       p = t->rn_parent;
+#endif
+                       if (p->rn_left == t)
+                               p->rn_left = x;
+                       else
+                               p->rn_right = x;
+                       x->rn_left->rn_parent = x;
+                       x->rn_right->rn_parent = x;
+               }
+               goto out;
+       }
+       if (t->rn_left == tt)
+               x = t->rn_right;
+       else
+               x = t->rn_left;
+       p = t->rn_parent;
+       if (p->rn_right == t)
+               p->rn_right = x;
+       else
+               p->rn_left = x;
+       x->rn_parent = p;
+       /*
+        * Demote routes attached to us.
+        */
+       if (t->rn_mklist) {
+               if (x->rn_bit >= 0) {
+                       for (mp = &x->rn_mklist; (m = *mp);)
+                               mp = &m->rm_mklist;
+                       *mp = t->rn_mklist;
+               } else {
+                       /* If there are any key,mask pairs in a sibling
+                          duped-key chain, some subset will appear sorted
+                          in the same order attached to our mklist */
+                       for (m = t->rn_mklist; m && x; x = x->rn_dupedkey)
+                               if (m == x->rn_mklist) {
+                                       struct radix_mask *mm = m->rm_mklist;
+                                       x->rn_mklist = 0;
+                                       if (--(m->rm_refs) < 0)
+                                               MKFree(m);
+                                       m = mm;
+                               }
+                       if (m)
+                               log(LOG_ERR,
+                                   "rn_delete: Orphaned Mask %p at %p\n",
+                                   (void *)m, (void *)x);
+               }
+       }
+       /*
+        * We may be holding an active internal node in the tree.
+        */
+       x = tt + 1;
+       if (t != x) {
+#ifndef RN_DEBUG
+               *t = *x;
+#else
+               b = t->rn_info;
+               *t = *x;
+               t->rn_info = b;
+#endif
+               t->rn_left->rn_parent = t;
+               t->rn_right->rn_parent = t;
+               p = x->rn_parent;
+               if (p->rn_left == x)
+                       p->rn_left = t;
+               else
+                       p->rn_right = t;
+       }
+out:
+       tt->rn_flags &= ~RNF_ACTIVE;
+       tt[1].rn_flags &= ~RNF_ACTIVE;
+       return (tt);
+}
+
+/*
+ * This is the same as rn_walktree() except for the parameters and the
+ * exit.
+ */
+static int
+rn_walktree_from(h, a, m, f, w)
+       struct radix_node_head *h;
+       void *a, *m;
+       walktree_f_t *f;
+       void *w;
+{
+       int error;
+       struct radix_node *base, *next;
+       u_char *xa = (u_char *)a;
+       u_char *xm = (u_char *)m;
+       register struct radix_node *rn, *last = 0 /* shut up gcc */;
+       int stopping = 0;
+       int lastb;
+
+       /*
+        * rn_search_m is sort-of-open-coded here. We cannot use the
+        * function because we need to keep track of the last node seen.
+        */
+       /* printf("about to search\n"); */
+       for (rn = h->rnh_treetop; rn->rn_bit >= 0; ) {
+               last = rn;
+               /* printf("rn_bit %d, rn_bmask %x, xm[rn_offset] %x\n",
+                      rn->rn_bit, rn->rn_bmask, xm[rn->rn_offset]); */
+               if (!(rn->rn_bmask & xm[rn->rn_offset])) {
+                       break;
+               }
+               if (rn->rn_bmask & xa[rn->rn_offset]) {
+                       rn = rn->rn_right;
+               } else {
+                       rn = rn->rn_left;
+               }
+       }
+       /* printf("done searching\n"); */
+
+       /*
+        * Two cases: either we stepped off the end of our mask,
+        * in which case last == rn, or we reached a leaf, in which
+        * case we want to start from the last node we looked at.
+        * Either way, last is the node we want to start from.
+        */
+       rn = last;
+       lastb = rn->rn_bit;
+
+       /* printf("rn %p, lastb %d\n", rn, lastb);*/
+
+       /*
+        * This gets complicated because we may delete the node
+        * while applying the function f to it, so we need to calculate
+        * the successor node in advance.
+        */
+       while (rn->rn_bit >= 0)
+               rn = rn->rn_left;
+
+       while (!stopping) {
+               /* printf("node %p (%d)\n", rn, rn->rn_bit); */
+               base = rn;
+               /* If at right child go back up, otherwise, go right */
+               while (rn->rn_parent->rn_right == rn
+                      && !(rn->rn_flags & RNF_ROOT)) {
+                       rn = rn->rn_parent;
+
+                       /* if went up beyond last, stop */
+                       if (rn->rn_bit <= lastb) {
+                               stopping = 1;
+                               /* printf("up too far\n"); */
+                               /*
+                                * XXX we should jump to the 'Process leaves'
+                                * part, because the values of 'rn' and 'next'
+                                * we compute will not be used. Not a big deal
+                                * because this loop will terminate, but it is
+                                * inefficient and hard to understand!
+                                */
+                       }
+               }
+               
+               /* 
+                * At the top of the tree, no need to traverse the right
+                * half, prevent the traversal of the entire tree in the
+                * case of default route.
+                */
+               if (rn->rn_parent->rn_flags & RNF_ROOT)
+                       stopping = 1;
+
+               /* Find the next *leaf* since next node might vanish, too */
+               for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;)
+                       rn = rn->rn_left;
+               next = rn;
+               /* Process leaves */
+               while ((rn = base) != 0) {
+                       base = rn->rn_dupedkey;
+                       /* printf("leaf %p\n", rn); */
+                       if (!(rn->rn_flags & RNF_ROOT)
+                           && (error = (*f)(rn, w)))
+                               return (error);
+               }
+               rn = next;
+
+               if (rn->rn_flags & RNF_ROOT) {
+                       /* printf("root, stopping"); */
+                       stopping = 1;
+               }
+
+       }
+       return 0;
+}
+
+static int
+rn_walktree(h, f, w)
+       struct radix_node_head *h;
+       walktree_f_t *f;
+       void *w;
+{
+       int error;
+       struct radix_node *base, *next;
+       register struct radix_node *rn = h->rnh_treetop;
+       /*
+        * This gets complicated because we may delete the node
+        * while applying the function f to it, so we need to calculate
+        * the successor node in advance.
+        */
+
+       /* First time through node, go left */
+       while (rn->rn_bit >= 0)
+               rn = rn->rn_left;
+       for (;;) {
+               base = rn;
+               /* If at right child go back up, otherwise, go right */
+               while (rn->rn_parent->rn_right == rn
+                      && (rn->rn_flags & RNF_ROOT) == 0)
+                       rn = rn->rn_parent;
+               /* Find the next *leaf* since next node might vanish, too */
+               for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;)
+                       rn = rn->rn_left;
+               next = rn;
+               /* Process leaves */
+               while ((rn = base)) {
+                       base = rn->rn_dupedkey;
+                       if (!(rn->rn_flags & RNF_ROOT)
+                           && (error = (*f)(rn, w)))
+                               return (error);
+               }
+               rn = next;
+               if (rn->rn_flags & RNF_ROOT)
+                       return (0);
+       }
+       /* NOTREACHED */
+}
+
+/*
+ * Allocate and initialize an empty tree. This has 3 nodes, which are
+ * part of the radix_node_head (in the order <left,root,right>) and are
+ * marked RNF_ROOT so they cannot be freed.
+ * The leaves have all-zero and all-one keys, with significant
+ * bits starting at 'off'.
+ * Return 1 on success, 0 on error.
+ */
+int
+rn_inithead(head, off)
+       void **head;
+       int off;
+{
+       register struct radix_node_head *rnh;
+       register struct radix_node *t, *tt, *ttt;
+       if (*head)
+               return (1);
+       R_Zalloc(rnh, struct radix_node_head *, sizeof (*rnh));
+       if (rnh == 0)
+               return (0);
+#ifdef _KERNEL
+       RADIX_NODE_HEAD_LOCK_INIT(rnh);
+#endif
+       *head = rnh;
+       t = rn_newpair(rn_zeros, off, rnh->rnh_nodes);
+       ttt = rnh->rnh_nodes + 2;
+       t->rn_right = ttt;
+       t->rn_parent = t;
+       tt = t->rn_left;        /* ... which in turn is rnh->rnh_nodes */
+       tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE;
+       tt->rn_bit = -1 - off;
+       *ttt = *tt;
+       ttt->rn_key = rn_ones;
+       rnh->rnh_addaddr = rn_addroute;
+       rnh->rnh_deladdr = rn_delete;
+       rnh->rnh_matchaddr = rn_match;
+       rnh->rnh_lookup = rn_lookup;
+       rnh->rnh_walktree = rn_walktree;
+       rnh->rnh_walktree_from = rn_walktree_from;
+       rnh->rnh_treetop = t;
+       return (1);
+}
+
+void
+rn_init(int maxk)
+{
+       char *cp, *cplim;
+
+       max_keylen = maxk;
+       if (max_keylen == 0) {
+               log(LOG_ERR,
+                   "rn_init: radix functions require max_keylen be set\n");
+               return;
+       }
+       R_Malloc(rn_zeros, char *, 3 * max_keylen);
+       if (rn_zeros == NULL)
+               panic("rn_init");
+       bzero(rn_zeros, 3 * max_keylen);
+       rn_ones = cp = rn_zeros + max_keylen;
+       addmask_key = cplim = rn_ones + max_keylen;
+       while (cp < cplim)
+               *cp++ = -1;
+       if (rn_inithead((void **)(void *)&mask_rnhead, 0) == 0)
+               panic("rn_init 2");
+}
diff --git a/glue.h b/glue.h
index 1f8aa62..de0ab23 100644 (file)
--- a/glue.h
+++ b/glue.h
@@ -23,7 +23,7 @@
  * SUCH DAMAGE.
  */
 /*
- * $Id: glue.h 4436 2009-12-10 18:31:49Z luigi $
+ * $Id: glue.h 4661 2010-01-04 11:56:12Z luigi $
  *
  * glue code to adapt the FreeBSD version to linux and windows,
  * userland and kernel.
@@ -241,6 +241,11 @@ int
 sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp,
          size_t newlen);
  
+#ifdef __linux__
+/* linux does not have sin_len in sockaddr, we only remap in userland */
+#define        sin_len sin_zero[0]
+#endif /* __linux__ */
+
 #else /* KERNEL_MODULE */
 
 /* linux and windows kernel do not have bcopy ? */
@@ -250,6 +255,11 @@ sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp,
 #include <linux/in6.h>
 #endif
 
+/* skb_dst() was introduced from linux 2.6.31 */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31)        // or 2.4.x
+#define skb_dst(_dummy) skb->dst
+#endif
+
 /* definitions useful for the kernel side */
 
 struct route_in6 { };
@@ -260,10 +270,6 @@ struct route_in6 { };
 
 #define INET_ADDRSTRLEN                16
 
-#ifdef linux
-/* linux does not have sin_len in sockaddr */
-#define        sin_len sin_zero[0]
-#endif /* linux */
 
 /*
  * List of values used for set/getsockopt options.
index 807f2d1..7b4a272 100644 (file)
@@ -9,7 +9,7 @@ $(warning Building userland ipfw for $(VER))
 EXTRA_CFLAGS += -O1
 EXTRA_CFLAGS += -Wall -Werror
 EXTRA_CFLAGS += -include ../glue.h
-EXTRA_CFLAGS += -I ./include
+EXTRA_CFLAGS += -I ./include_e -I ./include
 
 ifneq ($(VER),openwrt)
 OSARCH := $(shell uname)
@@ -41,7 +41,21 @@ all: ipfw
 ipfw: $(OBJS)
        $(CC) $(LDFLAGS) -o $@ $^
 
-$(OBJS) : ipfw2.h ../glue.h include/netinet
+$(OBJS) : ipfw2.h ../glue.h include/netinet include_e
+
+# support to create empty dirs and files in include_e/
+# EDIRS is the list of directories, EFILES is the list of files.
+EDIRS   = sys
+
+EFILES  = sys/sockio.h libutil.h
+M ?= $(shell pwd)
+
+include_e:
+       echo "running in $M"
+       -@rm -rf $(M)/include_e opt_*
+       -@mkdir -p $(M)/include_e
+       -@(cd $(M)/include_e; mkdir -p $(EDIRS); touch $(EFILES) )
 
 include/netinet:
        -@rm -rf include/netinet
index 6cfbff0..c50962d 100644 (file)
@@ -33,6 +33,7 @@
 #include <ctype.h>
 #include <err.h>
 #include <errno.h>
+#include <libutil.h>
 #include <netdb.h>
 #include <stdio.h>
 #include <stdlib.h>
index 19ea71e..fb3d5c3 100644 (file)
@@ -24,7 +24,7 @@
  */
 
 /*
- * $Id: glue.c 4469 2009-12-11 20:23:11Z marta $
+ * $Id: glue.c 4540 2009-12-16 17:22:47Z marta $
  *
  * Userland functions missing in linux
  */
diff --git a/ipfw/include_e/libutil.h b/ipfw/include_e/libutil.h
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ipfw/include_e/sys/sockio.h b/ipfw/include_e/sys/sockio.h
new file mode 100644 (file)
index 0000000..e69de29
index 5d70328..85979f8 100644 (file)
@@ -224,11 +224,14 @@ static struct _s_x rule_action_params[] = {
        { NULL, 0 }     /* terminator */
 };
 
-/* index of 'lookup ... ' keys in the kernel */
+/*
+ * The 'lookup' instruction accepts one of the following arguments.
+ * -1 is a terminator for the list.
+ * Arguments are passed as v[1] in O_DST_LOOKUP options.
+ */
 static int lookup_key[] = {
        TOK_DSTIP, TOK_SRCIP, TOK_DSTPORT, TOK_SRCPORT,
-       TOK_UID, TOK_GID, TOK_JAIL,
-       TOK_PROTO, TOK_MACTYPE, 0, };
+       TOK_UID, TOK_JAIL, -1 };
 
 static struct _s_x rule_options[] = {
        { "tagged",             TOK_TAGGED },
@@ -756,8 +759,8 @@ print_ip(ipfw_insn_ip *cmd, char const *s)
 
                if (d < sizeof(lookup_key)/sizeof(lookup_key[0]))
                        arg = match_value(rule_options, lookup_key[d]);
-               printf("%s lookup %s %d,%d", cmd->o.len & F_NOT ? " not": "",
-                       arg, cmd->o.arg1, a[0]);
+               printf("%s lookup %s %d", cmd->o.len & F_NOT ? " not": "",
+                       arg, cmd->o.arg1);
                return;
        }
        printf("%s%s ", cmd->o.len & F_NOT ? " not": "", s);
@@ -3518,26 +3521,21 @@ read_options:
                        int j;
 
                        if (ac < 2)
-                               errx(EX_USAGE, "format: lookup argument tablenum[,arg]");
+                               errx(EX_USAGE, "format: lookup argument tablenum");
                        cmd->opcode = O_IP_DST_LOOKUP;
                        cmd->len |= F_INSN_SIZE(ipfw_insn) + 2;
                        i = match_token(rule_options, *av);
-                       for (j = 0; lookup_key[j] ; j++) {
+                       for (j = 0; lookup_key[j] >= 0 ; j++) {
                                if (i == lookup_key[j])
                                        break;
                        }
-                       if (lookup_key[j] == 0)
+                       if (lookup_key[j] <= 0)
                                errx(EX_USAGE, "format: cannot lookup on %s", *av);
                        c->d[1] = j; // i converted to option
                        ac--; av++;
-                       p = strchr(*av, ',');
-                       if (p) {
-                               *p++ = '\0';
-                               c->d[0] = strtoul(p, NULL, 0);
-                       } else {
-                               c->d[0] = ~0;
-                       }
-                       cmd->arg1 = strtoul(*av, NULL, 0);
+                       cmd->arg1 = strtoul(*av, &p, 0);
+                       if (p && *p)
+                               errx(EX_USAGE, "format: lookup argument tablenum");
                        ac--; av++;
                    }
                        break;
index 0e5e696..68373b9 100644 (file)
@@ -5,7 +5,7 @@
 # restart crond
 # modprobe ipfw_mod.ko (depmod ?)
 #
-%define url $URL:$
+%define url $URL$
 
 # Marta Carbone <marta.carbone@iet.unipi.it>
 # 2009 - Universita` di Pisa
@@ -32,6 +32,7 @@ Group: System Environment/Kernel
 Source0: %{name}-%{version}.tar.bz2
 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-buildroot
 Requires: vixie-cron
+Requires: vsys-scripts
 
 Vendor: unipi
 Packager: PlanetLab <marta@onelab2.iet.unipi.it>
@@ -56,7 +57,7 @@ rm -rf $RPM_BUILD_ROOT
 %__make KERNELPATH=%kernelpath IPFW_PLANETLAB=1
 
 %install
-install -D -m 755 dummynet/ipfw_mod.ko $RPM_BUILD_ROOT/lib/modules/%{kernel_id}/net/netfilter/ipfw_mod.ko
+install -D -m 755 dummynet2/ipfw_mod.ko $RPM_BUILD_ROOT/lib/modules/%{kernel_id}/net/netfilter/ipfw_mod.ko
 install -D -m 755 ipfw/ipfw $RPM_BUILD_ROOT/sbin/ipfw
 install -D -m 755 planetlab/ipfw-cleanup $RPM_BUILD_ROOT/usr/bin/ipfw-cleanup
 install -D -m 644 planetlab/ipfw.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/ipfw.cron
@@ -76,8 +77,15 @@ rm -rf $RPM_BUILD_ROOT
 %postun
 # unload the module if present
 LOADED=`cat /proc/modules | grep ^ipfw_mod`; if [ -n "$LOADED" ] ; then rmmod ipfw_mod; fi
+# clean the old database and initialize the firewall
+echo "super dbcleanup" | /vsys/ipfw-be 0
+echo "super init" | /vsys/ipfw-be 0
 
 %changelog
+* Wed Jan 06 2010 Marta Carbone <marta.carbone@iet.unipi.it>
+- move to dummynet2, added support for table lookup
+- added the vsys-scripts dependency and the ipfw initialization
+
 * Tue Dec 15 2009 Marta Carbone <marta.carbone@iet.unipi.it>
 - more work on the radix code, added sysctl read/write support