Added the new version for dummynet.
authormarta <marta@8c455092-636d-4788-adf5-e71def0336e8>
Wed, 6 Jan 2010 19:18:48 +0000 (19:18 +0000)
committermarta <marta@8c455092-636d-4788-adf5-e71def0336e8>
Wed, 6 Jan 2010 19:18:48 +0000 (19:18 +0000)
The new code is located in the dummynet2 directory, and the spec file
was changed to use this latest version.

Major changes related to PlanetLab are the new table lookup support,
a little fix to accept packets after the reinjection and code cleanup.
The new table lookup support will allow a PlanetLab user to jump
directly to their own rule section, avoiding scanning the whole
ruleset list.

41 files changed:
Makefile
Makefile.openwrt
README
dummynet/Makefile
dummynet/bsd_compat.c
dummynet/include/sys/kernel.h
dummynet/include/sys/mbuf.h
dummynet/include/sys/module.h
dummynet/ip_dummynet.c
dummynet/ip_fw2.c
dummynet/ip_fw_pfil.c
dummynet/ipfw2_mod.c
dummynet/missing.h
dummynet/radix.c
dummynet2/Makefile [new file with mode: 0644]
dummynet2/bsd_compat.c [new file with mode: 0644]
dummynet2/in_cksum.c [new file with mode: 0644]
dummynet2/include/netgraph/ng_ipfw.h [new file with mode: 0644]
dummynet2/include/netinet/ip_dummynet.h [new file with mode: 0644]
dummynet2/include/netinet/ip_fw.h [new file with mode: 0644]
dummynet2/include/netinet/ipfw/ip_fw_private.h [new file with mode: 0644]
dummynet2/ip_dummynet.c [new file with mode: 0644]
dummynet2/ip_fw2.c [new file with mode: 0644]
dummynet2/ip_fw_dynamic.c [new file with mode: 0644]
dummynet2/ip_fw_log.c [new file with mode: 0644]
dummynet2/ip_fw_lookup.c [new file with mode: 0644]
dummynet2/ip_fw_nat.c [new file with mode: 0644]
dummynet2/ip_fw_pfil.c [new file with mode: 0644]
dummynet2/ip_fw_sockopt.c [new file with mode: 0644]
dummynet2/ip_fw_table.c [new file with mode: 0644]
dummynet2/ipfw2_mod.c [new file with mode: 0644]
dummynet2/missing.h [new file with mode: 0644]
dummynet2/radix.c [new file with mode: 0644]
glue.h
ipfw/Makefile
ipfw/dummynet.c
ipfw/glue.c
ipfw/include_e/libutil.h [new file with mode: 0644]
ipfw/include_e/sys/sockio.h [new file with mode: 0644]
ipfw/ipfw2.c
planetlab/ipfwroot.spec

index f863838..51a00a9 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -20,6 +20,7 @@ all clean distclean:
        echo target is $(@)
        (cd ipfw && $(MAKE) $(@) )
        (cd dummynet && $(MAKE) $(@) )
+       (cd dummynet2 && $(MAKE) $(@) )
 
 snapshot:
        (cd ..; tar cvzhf /tmp/$(SNAPSHOT_NAME).tgz --exclude .svn \
index 50dae83..b618a52 100644 (file)
@@ -44,7 +44,9 @@ define Build/Prepare
   # $(warning Preparing ipfw sources)
        mkdir -p $(PKG_BUILD_DIR)
        $(CP) -Rp $(IPFW_DIR)/* $(PKG_BUILD_DIR)/
+       (cd $(PKG_BUILD_DIR)/ipfw && $(MAKE) include_e )
        (cd $(PKG_BUILD_DIR)/dummynet && $(MAKE) include_e )
+       (cd $(PKG_BUILD_DIR)/dummynet2 && $(MAKE) include_e )
 endef
 
 define Build/Compile
@@ -54,10 +56,15 @@ define Build/Compile
                ARCH="$(LINUX_KARCH)" \
                SUBDIRS="$(PKG_BUILD_DIR)/dummynet" \
                VER=openwrt modules
+       $(MAKE) -C "$(LINUX_DIR)" \
+               CROSS_COMPILE="$(TARGET_CROSS)" \
+               ARCH="$(LINUX_KARCH)" \
+               SUBDIRS="$(PKG_BUILD_DIR)/dummynet2" \
+               VER=openwrt modules
        # compile the userland part for openwrt
        $(MAKE) -C $(PKG_BUILD_DIR)/ipfw \
                $(TARGET_CONFIGURE_OPTS) \
-               CFLAGS="$(TARGET_CFLAGS) -I./include -include ../glue.h" \
+               CFLAGS="$(TARGET_CFLAGS) -I./include_e -I./include -include ../glue.h" \
                VER=openwrt all
 endef
 
diff --git a/README b/README
index 7ab66bf..0c3b4e8 100644 (file)
--- a/README
+++ b/README
@@ -14,6 +14,9 @@ version in RELENG_7 and HEAD as of December 2009), plus some glue code
 and headers written from scratch.
 Unless specified otherwise, all the code here is under a BSD license.
 
+Note:
+       - the linux version miss the "one_pass" feature
+
 =================== BUILD INSTRUCTIONS ==========================
 
 ***** Linux 2.6.x ******
@@ -35,6 +38,10 @@ Unless specified otherwise, all the code here is under a BSD license.
            Networking options  --->
               [*] Network packet filtering framework (Netfilter)
 
+       If you have not yet compiled your kernel source, you need to
+       prepare the build environment:
+
+       (cd $(KERNELPATH); make oldconfig; make prepare; make scripts)
 
 ***** Linux 2.4.x *****
 
@@ -114,6 +121,10 @@ Unless specified otherwise, all the code here is under a BSD license.
     rmmod ipfw_mod.o                            # remove the module
 
 ***** PLANETLAB BUILD (within a slice) *****
+These instruction can be used by PlanetLab developers to compile the dummynet module
+on a node. To install the module on the node users need root access in root context.
+PlanetLab users that want to use the dummynet package should ask to PlanetLab support
+for nodes with dummynet emulation capabilities.
 
     Follow the instructions below. You can just cut&paste
 
index cac1958..6c6d9f6 100644 (file)
@@ -5,7 +5,6 @@
 #
 # The defaults are set to build without modifications on PlanetLab
 # and possibly 2.6 versions.
-#
 
 # Some variables need to have specific names, because they are used
 # by the build infrastructure on Linux and OpenWrt. They are:
@@ -33,27 +32,40 @@ $(warning including dummynet/Makefile)
 # lets default for 2.6 for planetlab builds
 VER ?= 2.6
 
-# General values
+#--- General values for all types of build ---
+# obj-m is the target module
 obj-m := ipfw_mod.o
 
+#-- the list of source files. IPFW_SRCS is our own name.
+# Original ipfw and dummynet sources + FreeBSD stuff,
+IPFW_SRCS = ip_fw2.c ip_dummynet.c ip_fw_pfil.c in_cksum.c
+IPFW_SRCS += radix.c 
+# Module glue and functions missing in linux
+IPFW_SRCS += ipfw2_mod.c bsd_compat.c
+
 # generic cflags used on all systems
 #ipfw-cflags += -DIPFW_HASHTABLES
-ipfw-cflags += -DIPFIREWALL_DEFAULT_TO_ACCEPT -DTRACE
+ipfw-cflags += -DIPFIREWALL_DEFAULT_TO_ACCEPT
 # _BSD_SOURCE enables __FAVOR_BSD (udp/tcp bsd structs instead of posix)
 ipfw-cflags += -D_BSD_SOURCE
 ipfw-cflags += -DKERNEL_MODULE # build linux kernel module
 # the two header trees for empty and override files
-ipfw-cflags += -I $(M)/include_e -I $(M)/include
+ipfw-cflags += -I $(M)/include_e
+ipfw-cflags += -I $(M)/include
 ipfw-cflags += -include $(M)/../glue.h # headers
+ipfw-cflags += -include $(M)/missing.h # headers
 
 $(warning "---- Building dummynet kernel module for Version $(VER)")
+
 # We have three sections for OpenWrt, Linux 2.4 and Linux 2.6
-#
+
 ifeq ($(VER),openwrt)
+  #--- The Makefile section for openwrt ---
+  # We do not include a dependency on include_e as it is called
+  # by Makefile.openwrt in Build/Prepare
   M=.
-  obj-y := ipfw2_mod.o bsd_compat.o \
-       in_cksum.o ip_dummynet.o ip_fw2.o ip_fw_pfil.o radix.o
-  O_TARGET := ipfw_mod.o
+  obj-y := $(IPFW_SRCS:%.c=%.o)
+  O_TARGET := $(obj-m)
 
   # xcflags-y is a temporary variable where we store build options
   xcflags-y += -O1 -DLINUX_24
@@ -72,22 +84,22 @@ else        # !openwrt, below we do linux builds for 2.4 and 2.6
   # We can override it from the command line, or let the system guess.
 
 ifneq ($(shell echo $(VER)|grep '2.4'),)
-  # The linux 2.4 version
+  # Makefile section for the linux 2.4 version
+  # tested on linux-2.4.35.4, does not work with 2.4.37
+  #
   # guess the kernel path -- or is it under /lib/modules ?
-  KERNELPATH ?= /usr/src/`uname -r`/build
-
-  # Guess the gcc include directory
-  # The gcc version is in the last line returned by gcc -v
-  # gcc version 4.3.2 (Debian 4.3.2-1.1)
-  MYGCC_VER ?= $(shell gcc -v 2>&1 |tail -n 1 | cut -d " " -f 3)
-  # We don't know the exact directory unde /usr/lib/gcc so we guess
+  KERNELPATH ?= /usr/src/`uname -r`
+
+  # We need to figure out the gcc include directory, if not
+  # set by the user through MYGCC_INCLUDE
+  # Find compiler version (3rd field in last line returned by gcc -v)
+  # e.g.       gcc version 4.3.2 (Debian 4.3.2-1.1)
+  MYGCC_VER ?= $(shell $(CC) -v 2>&1 |tail -n 1 | cut -d " " -f 3)
+  # We don't know the exact directory under /usr/lib/gcc so we guess
   MYGCC_INCLUDE ?= $(shell echo /usr/lib/gcc/*/$(MYGCC_VER) | cut -d " " -f 1)/include
   $(warning "---- gcc includes guessed to $(MYGCC_INCLUDE)")
 
   # additional warning
-  #WARN = -Wp,-MD,/home/luigi/ports-luigi/dummynet-branches/ipfw_mod/dummynet/.ipfw2_mod.o.d
-  #WARN += -Iinclude  -include include/linux/autoconf.h
-
   WARN += -Wall -Wundef
   WARN += -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing
   WARN += -fno-common -Werror-implicit-function-declaration
@@ -96,22 +108,29 @@ ifneq ($(shell echo $(VER)|grep '2.4'),)
   WARN += -m32 -msoft-float # -mregparm=3
   #WARN += -freg-struct-return -mpreferred-stack-boundary=2
   WARN += -Wno-sign-compare
-  WARN += -Wdeclaration-after-statement -Wno-pointer-sign
+  WARN += -Wdeclaration-after-statement
+  ifneq ($(MYGCC_VER),3.4.6)
+       WARN += -Wno-pointer-sign
+  endif
 
   ccflags-y += -O1 -DLINUX_24
   CFLAGS = -DMODULE -D__KERNEL__ -nostdinc \
-       -isystem ${KERNELPATH}/include -isystem $(MYGCC_INCLUDE) ${ccflags-y}
+       -isystem ${KERNELPATH}/include -isystem $(MYGCC_INCLUDE) \
+       ${ccflags-y}
   # The Main target
 all: mod24
 
-else
+else # !2.4 --
+
+  # This is the Makefile section for Linux 2.6.x including planetlab
+
 ifeq ($(IPFW_PLANETLAB),1)
   $(warning "---- Building for PlanetLab")
   ipfw-cflags += -DIPFW_PLANETLAB        # PlanetLab compilation
 endif
   # if not set, use the version from the installed system
   KERNELPATH ?= /lib/modules/`uname -r`/build
-  # the latest kernel
+  # Otherwise, if you have kernel sources, try something like this:
   #KERNELPATH = /usr/src/linux-2.6.22
   $(warning "---- Building Version 2.6 $(VER) in $(KERNELPATH)")
   WARN := -O1 -Wall -Werror -DDEBUG_SPINLOCK -DDEBUG_MUTEXES
@@ -119,34 +138,39 @@ endif
 
   # Required by kernel <= 2.6.22, ccflags-y is used on newer version
   LINUX_VERSION_CODE := $(shell grep LINUX_VERSION_CODE $(KERNELPATH)/include/linux/version.h|cut -d " " -f3)
-  ifeq ($(LINUX_VERSION_CODE),132630)
+  ifeq ($(shell if [ -z $(LINUX_VERSION_CODE) ] ; then echo "true"; fi),true)
+    $(warning "---- Perhaps you miss a (cd $(KERNELPATH); make oldconfig; make prepare; make scripts)");
+  endif
+  ifeq ($(shell if [ $(LINUX_VERSION_CODE) -le 132630 ] ; then echo "true"; fi),true)
     EXTRA_CFLAGS += $(ccflags-y)
   endif
 
 all: include_e
        $(MAKE) -C $(KERNELPATH) V=1 M=`pwd` modules
-endif
+endif # !2.4
 
-#-- back to the common section of code
+#-- back to the common section of code for Linux 2.4 and 2.6
 
 # the list of objects used to build the module
 ipfw_mod-y = $(IPFW_SRCS:%.c=%.o)
 
-# Original ipfw and dummynet sources + FreeBSD stuff,
-IPFW_SRCS = ip_fw2.c ip_dummynet.c ip_fw_pfil.c in_cksum.c
-IPFW_SRCS += radix.c 
-# Module glue and functions missing in linux
-IPFW_SRCS += ipfw2_mod.c bsd_compat.c hashtable.c
-
 # additional $(CC) flags
 ccflags-y += $(WARN)
 ccflags-y += $(ipfw-cflags)
+# if we really want debug symbols...
 ccflags-y += -g
 
 mod24: include_e $(obj-m)
 
 $(obj-m): $(ipfw_mod-y)
        $(LD) $(LDFLAGS) -m elf_i386 -r -o $@ $^
+
+# M is the current directory, used in recursive builds
+# so we allow it to be overridden
+M ?= $(shell pwd)
+endif # !openwrt
+
+#--- various common targets
 clean:
        -rm -f *.o *.ko Module.symvers *.mod.c
        -rm -rf include_e
@@ -172,6 +196,7 @@ EFILES += net/vnet.h
 
 EFILES += netinet/ether.h netinet/icmp6.h netinet/if_ether.h
 EFILES += netinet/in.h netinet/in_pcb.h netinet/in_var.h
+EFILES += netinet/in_systm.h
 EFILES += netinet/ip_carp.h netinet/ip_var.h netinet/pim.h
 EFILES += netinet/sctp.h netinet/tcp_timer.h netinet/tcpip.h
 EFILES += netinet/udp_var.h
@@ -184,14 +209,13 @@ EFILES += sys/limits.h sys/lock.h sys/mutex.h sys/priv.h
 EFILES += sys/proc.h sys/rwlock.h sys/socket.h sys/socketvar.h
 EFILES += sys/sysctl.h sys/time.h sys/ucred.h
 
-M ?= $(shell pwd)
 include_e:
        echo "running in $M"
        -@rm -rf $(M)/include_e opt_*
        -@mkdir -p $(M)/include_e
        -@(cd $(M)/include_e; mkdir -p $(EDIRS); touch $(EFILES) )
 
-endif # !openwrt
 
+#--- some other targets for testing purposes
 test_radix: test_radix.o radix.o
-test_radix: CFLAGS=-Wall -Werror -O1
+test_radix: CFLAGS=-Wall -Werror -O2
index 995d60c..cad3c5d 100644 (file)
@@ -24,7 +24,7 @@
  */
 
 /*
- * $Id$
+ * $Id: bsd_compat.c 4508 2009-12-15 21:54:14Z luigi $
  *
  * kernel variables and functions that are not available in linux.
  */
@@ -32,7 +32,6 @@
 #include <sys/cdefs.h>
 #include <asm/div64.h> /* do_div on 2.4 */
 #include <linux/random.h>      /* get_random_bytes on 2.4 */
-#include "missing.h"
 
 /*
  * gettimeofday would be in sys/time.h but it is not
index 61b3bec..fbc9581 100644 (file)
@@ -5,7 +5,13 @@
 #define _SYS_KERNEL_H_
 
 #define SYSINIT(a, b, c, d, e)  \
-        void *dummy_ ## d = d
+        void *sysinit_ ## d = d
+#define VNET_SYSINIT(a, b, c, d, e)  \
+        void *sysinit_ ## d = d
+#define SYSUNINIT(a, b, c, d, e)  \
+        void *sysuninit_ ## d = d
+#define VNET_SYSUNINIT(a, b, c, d, e)  \
+        void *sysuninit_ ## d = d
 
 /*
  * Some enumerated orders; "ANY" sorts last.
index ed3d3a1..12837bf 100644 (file)
@@ -107,11 +107,21 @@ m_tag_prepend(struct mbuf *m, struct m_tag *t)
        SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link);
 }
 
+/*
+ * Return the next tag in the list of tags associated with an mbuf.
+ */
+static __inline struct m_tag *
+m_tag_next(struct mbuf *m, struct m_tag *t)
+{
+        return (SLIST_NEXT(t, m_tag_link));
+}
+
 /*
  * Create an mtag of the given type
  */
 static __inline struct m_tag *
-m_tag_get(int type, int length, int wait)
+m_tag_alloc(uint32_t cookie, int type, int length, int wait)
 {
        int l = length + sizeof(struct m_tag);
        struct m_tag *m = malloc(l, 0, M_NOWAIT);
@@ -119,10 +129,17 @@ m_tag_get(int type, int length, int wait)
                memset(m, 0, l);
                m->m_tag_id = type;
                m->m_tag_len = length;
+               m->m_tag_cookie = cookie;
        }
        return m;
 };
 
+static __inline struct m_tag *
+m_tag_get(int type, int length, int wait)
+{
+       return m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait);
+}
+
 static __inline struct m_tag *
 m_tag_first(struct mbuf *m)
 {
@@ -140,6 +157,7 @@ m_tag_locate(struct mbuf *m, u_int32_t n, int x, struct m_tag *t)
        return NULL;
 };
 
+#define M_SETFIB(_m, _fib)     /* nothing on linux */
 static __inline void
 m_freem(struct mbuf *m)
 {
@@ -156,7 +174,7 @@ m_freem(struct mbuf *m)
 };
 
 /* we cannot pullup */
-#define m_pullup(__m, __i)     (m)
+//#define m_pullup(__m, __i)   (m)
 
 #define M_GETFIB(_m)   0
 
index 5296517..85bf220 100644 (file)
@@ -19,7 +19,6 @@ typedef struct moduledata {
         void            *priv;          /* extra data */
 } moduledata_t;
 
-int my_mod_register(struct moduledata *mod, const char *name, int order);
 /*
  * Hook the module descriptor, md, into our list of things to do.
  * We should in principle respect the order of loading.
index 0b23881..9fd70e2 100644 (file)
@@ -56,8 +56,6 @@ __FBSDID("$FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.110.2.4 2008/10/31 12:58:1
  * include files marked with XXX are probably not needed
  */
 
-#include "missing.h"
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
index 4e46566..21d1b41 100644 (file)
@@ -70,11 +70,6 @@ __FBSDID("$FreeBSD: src/sys/netinet/ip_fw2.c,v 1.175.2.13 2008/10/30 16:29:04 bz
 #include <net/pf_mtag.h>
 #include <net/vnet.h>
 
-#ifdef linux
-#define INP_LOCK_ASSERT                /* define before missing.h otherwise ? */
-#include "missing.h"
-#endif
-
 #define        IPFW_INTERNAL   /* Access to protected data structures in ip_fw.h. */
 
 #include <netinet/in.h>
@@ -104,10 +99,6 @@ __FBSDID("$FreeBSD: src/sys/netinet/ip_fw2.c,v 1.175.2.13 2008/10/30 16:29:04 bz
 
 #include <machine/in_cksum.h>  /* XXX for in_cksum */
 
-#ifdef IPFW_HASHTABLES
-#include "hashtable.h"
-#endif
-
 #ifdef MAC
 #include <security/mac/mac_framework.h>
 #endif
@@ -183,18 +174,14 @@ SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose,
 SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
     CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
     "Set upper limit of matches of ipfw rules logged");
-static unsigned int dummy_default_rule = IPFW_DEFAULT_RULE;
+unsigned int dummy_default_rule = IPFW_DEFAULT_RULE;
 SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
     &dummy_default_rule, IPFW_DEFAULT_RULE,
     "The default/max possible rule number.");
-static unsigned int dummy_tables_max = IPFW_TABLES_MAX;
+unsigned int dummy_tables_max = IPFW_TABLES_MAX;
 SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD,
     &dummy_tables_max, IPFW_TABLES_MAX,
     "The maximum number of tables.");
-static unsigned int skipto_entries = 256;
-SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, skipto_entries,
-    CTLFLAG_RW, &skipto_entries, 0,
-    "Number of entries in the skipto cache");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
     &default_to_accept, 0,
     "Make the default rule accept all packets.");
@@ -1886,61 +1873,6 @@ send_reject(struct ip_fw_args *args, int code, int ip_len, struct ip *ip)
        args->m = NULL;
 }
 
-static void
-set_skipto_table(struct ip_fw_chain *ch)
-{
-       int i, n, sh;
-       struct ip_fw *f, **t, **oldt;
-
-       for (sh = 15; sh > 0; sh--)
-               if (skipto_entries > 1<<sh)
-                       break;
-       sh++;
-       skipto_entries = 1<< (16 - sh) ;
-       /* XXX unsafe and too long */
-       t = malloc(skipto_entries * sizeof(*t), M_IPFW_TBL, M_WAITOK | M_ZERO);
-       if (t == NULL)
-               return;
-       IPFW_RLOCK(ch);
-       /* Store pointers in the table. In the loop i is the next
-        * free slot, n is the slot where the current rule goes.
-        */
-       for (i = 0, f = ch->rules; f; f = f->next) {
-               n = f->rulenum >> sh ;
-               while (i <= n)
-                       t[i++] = f;
-       }
-       V_layer3_chain.skipto_shift = sh;
-       V_layer3_chain.skipto_size = skipto_entries;
-       oldt = V_layer3_chain.skipto_ptrs;
-       V_layer3_chain.skipto_ptrs = t;
-       IPFW_RUNLOCK(ch);
-       if (oldt) {
-               IPFW_WLOCK(ch);
-               IPFW_WUNLOCK(ch);
-               /* now can free oldt */
-               free(oldt, M_IPFW_TBL);
-       }
-}
-#if 0
-/*
- * Map a rule number to a rule pointer, using the skipto table.
- * First lookup the slot, then follow the chain until we find a
- * non-null entry with rulenum >= num. Return default_rule on error.
- */
-static struct ip_fw *
-rule2ptr(struct ip_fw_chain *ch, int num)
-{
-       struct ip_fw *r = NULL;
-       int ix = (num & 0xffff) >> ch->skipto_shift;
-
-       while (ix < ch->skipto_size && (r = ch->skipto_ptrs[ix]) == NULL)
-               ix++;
-       while (r && num < r->rulenum)
-               r = r->next;
-       return (r ? r : ch->default_rule);
-}
-#endif
 /**
  *
  * Given an ip_fw *, lookup_next_rule will return a pointer
@@ -1957,10 +1889,11 @@ rule2ptr(struct ip_fw_chain *ch, int num)
  */
 
 static struct ip_fw *
-lookup_next_rule(struct ip_fw_chain *ch, struct ip_fw *me, uint32_t tablearg)
+lookup_next_rule(struct ip_fw *me, u_int32_t tablearg)
 {
        struct ip_fw *rule = NULL;
        ipfw_insn *cmd;
+       u_int16_t       rulenum;
 
        /* look for action, in case it is a skipto */
        cmd = ACTION_PTR(me);
@@ -1970,19 +1903,21 @@ lookup_next_rule(struct ip_fw_chain *ch, struct ip_fw *me, uint32_t tablearg)
                cmd += F_LEN(cmd);
        if (cmd->opcode == O_TAG)
                cmd += F_LEN(cmd);
-       if (cmd->opcode != O_SKIPTO ) {
-               rule = me->next;
-       } else {
-               tablearg = tablearg ? tablearg : cmd->arg1;
+       if (cmd->opcode == O_SKIPTO ) {
+               if (tablearg != 0) {
+                       rulenum = (u_int16_t)tablearg;
+               } else {
+                       rulenum = cmd->arg1;
+               }
                for (rule = me->next; rule ; rule = rule->next) {
-                       if (rule->rulenum >= tablearg) {
+                       if (rule->rulenum >= rulenum) {
                                break;
                        }
                }
-
-//             rule = rule2ptr(ch, tablearg ? tablearg : cmd->arg1);
        }
-       me->next_rule = rule; /* XXX perhaps unnecessary ? */
+       if (rule == NULL)               /* failure or not a skipto */
+               rule = me->next;
+       me->next_rule = rule;
        return rule;
 }
 
@@ -1994,11 +1929,6 @@ add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
        struct table_entry *ent;
        struct radix_node *rn;
 
-#ifdef IPFW_HASHTABLES
-       if (tbl >= 2*IPFW_TABLES_MAX)
-               return EINVAL;
-       return EINVAL; // XXX to be completed
-#endif
        if (tbl >= IPFW_TABLES_MAX)
                return (EINVAL);
        rnh = ch->tables[tbl];
@@ -2037,11 +1967,6 @@ del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
        struct table_entry *ent;
        struct sockaddr_in sa, mask;
 
-#ifdef IPFW_HASHTABLES
-       if (tbl >= 2*IPFW_TABLES_MAX)
-               return EINVAL;
-       return EINVAL; // XXX to be completed
-#endif
        if (tbl >= IPFW_TABLES_MAX)
                return (EINVAL);
        rnh = ch->tables[tbl];
@@ -2085,11 +2010,6 @@ flush_table(struct ip_fw_chain *ch, uint16_t tbl)
 
        IPFW_WLOCK_ASSERT(ch);
 
-#ifdef IPFW_HASHTABLES
-       if (tbl >= 2*IPFW_TABLES_MAX)
-               return EINVAL;
-       return EINVAL; // XXX to be completed
-#endif
        if (tbl >= IPFW_TABLES_MAX)
                return (EINVAL);
        rnh = ch->tables[tbl];
@@ -2107,10 +2027,6 @@ flush_tables(struct ip_fw_chain *ch)
 
        for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++)
                flush_table(ch, tbl);
-#ifdef IPFW_HASHTABLES
-       for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++)
-               ch->hashtab[tbl] = ipfw_ht_destroy(ch->hashtab[tbl]);
-#endif
 }
 
 static int
@@ -2127,10 +2043,6 @@ init_tables(struct ip_fw_chain *ch)
                        return (ENOMEM);
                }
        }
-#ifdef IPFW_HASHTABLES
-        for (i = 0; i < IPFW_TABLES_MAX; i++)
-               ch->hashtab[i] = ipfw_ht_destroy(ch->hashtab[i]);
-#endif
        return (0);
 }
 
@@ -2767,7 +2679,7 @@ do {                                                                      \
                        f = args->rule->next_rule;
 
                if (f == NULL)
-                       f = lookup_next_rule(chain, args->rule, 0);
+                       f = lookup_next_rule(args->rule, 0);
        } else {
                /*
                 * Find the starting rule. It can be either the first
@@ -2984,7 +2896,7 @@ do {                                                                      \
                                            a = dst_port;
                                        else if (v == 3)
                                            a = src_port;
-                                       else if (v >= 4 && v <= 6) {
+                                       else if (v == 4 || v == 5) {
                                            check_uidgid(
                                                    (ipfw_insn_u32 *)cmd,
                                                    proto, oif,
@@ -2994,16 +2906,12 @@ do {                                                                    \
 #ifdef linux
                                            if (v ==4 /* O_UID */)
                                                a = ucred_cache.uid;
-                                           else if (v == 5 /* O_GID */)
-                                               a = ucred_cache.gid;
-                                           else if (v == 6 /* O_JAIL */)
+                                           else if (v == 5 /* O_JAIL */)
                                                a = ucred_cache.xid;
 #else
                                            if (v ==4 /* O_UID */)
                                                a = (*uc)->cr_uid;
-                                           else if (v == 5 /* O_GID */)
-                                               ; // a = groupmember((gid_t)insn->d[0], *uc);
-                                           else if (v == 6 /* O_JAIL */)
+                                           else if (v == 5 /* O_JAIL */)
                                                a = (*uc)->cr_prison->pr_id;
 #endif
                                        } else
@@ -3590,10 +3498,10 @@ do {                                                                    \
                                }
                                /* handle skipto */
                                if (cmd->arg1 == IP_FW_TABLEARG) {
-                                       f = lookup_next_rule(chain, f, tablearg);
-                               } else {
+                                       f = lookup_next_rule(f, tablearg);
+                               } else { // XXX ?
                                        if (f->next_rule == NULL)
-                                               lookup_next_rule(chain, f, 0);
+                                               lookup_next_rule(f, 0);
                                        f = f->next_rule;
                                }
                                /*
@@ -3883,15 +3791,17 @@ add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
                goto done;
         }
 
+       /*
+        * If rulenum is 0, find highest numbered rule before the
+        * default rule, and add autoinc_step
+        */
        if (V_autoinc_step < 1)
                V_autoinc_step = 1;
        else if (V_autoinc_step > 1000)
                V_autoinc_step = 1000;
        if (rule->rulenum == 0) {
                /*
-                * If rulenum is 0, use highest numbered rule before
-                * the default, adding autoinc_step if room.
-                * Also set the number in the caller.
+                * locate the highest numbered rule before default
                 */
                for (f = chain->rules; f; f = f->next) {
                        if (f->rulenum == IPFW_DEFAULT_RULE)
@@ -3905,7 +3815,6 @@ add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
 
        /*
         * Now insert the new rule in the right place in the sorted list.
-        * XXX TODO also put in the skipto table.
         */
        for (prev = NULL, f = chain->rules; f; prev = f, f = f->next) {
                if (f->rulenum > rule->rulenum) { /* found the location */
@@ -3958,7 +3867,6 @@ remove_rule(struct ip_fw_chain *chain, struct ip_fw *rule,
                prev->next = n;
        V_static_count--;
        V_static_len -= l;
-       // XXX remove from the skipto table
 
        rule->next = chain->reap;
        chain->reap = rule;
@@ -5089,17 +4997,12 @@ ipfw_destroy(void)
        IPFW_WUNLOCK(&V_layer3_chain);
        if (reap != NULL)
                reap_rules(reap);
-       IPFW_DYN_LOCK_DESTROY();
        uma_zdestroy(ipfw_dyn_rule_zone);
+       IPFW_DYN_LOCK_DESTROY();
        if (V_ipfw_dyn_v != NULL)
                free(V_ipfw_dyn_v, M_IPFW);
        IPFW_LOCK_DESTROY(&V_layer3_chain);
 
-#ifdef INET6
-       /* Free IPv6 fw sysctl tree. */
-       sysctl_ctx_free(&ip6_fw_sysctl_ctx);
-#endif
-
        printf("IP firewall unloaded\n");
 }
 
@@ -5154,8 +5057,6 @@ vnet_ipfw_init(const void *unused)
        IPFW_LOCK_INIT(&V_layer3_chain);
        callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE);
 
-       set_skipto_table(&V_layer3_chain);
-
        bzero(&default_rule, sizeof default_rule);
        default_rule.act_ofs = 0;
        default_rule.rulenum = IPFW_DEFAULT_RULE;
index 368192a..b3fcba6 100644 (file)
@@ -43,17 +43,20 @@ __FBSDID("$FreeBSD: src/sys/netinet/ip_fw_pfil.c,v 1.25.2.2 2008/04/25 10:26:30
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
+#include <sys/ucred.h>
 
 #include <net/if.h>
+#include <net/route.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
-#include "missing.h"
-
 #include <netinet/in.h>
+#include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
index 667d487..4b7edd1 100644 (file)
@@ -49,8 +49,6 @@
 #include <sys/mbuf.h>                  /* sizeof struct mbuf */
 #include <sys/param.h>                 /* NGROUPS */
 
-#include "missing.h"
-
 #ifdef __linux__
 #include <linux/module.h>
 #include <linux/kernel.h>
@@ -407,7 +405,7 @@ ipfw2_queue_handler(QH_ARGS)
        }
 
        if (m != NULL) {        /* Accept. reinject and free the mbuf */
-               REINJECT(info, NF_STOP);
+               REINJECT(info, NF_ACCEPT);
                m_freem(m);
        } else if (ret == 0) {
                /* dummynet has kept the packet, will reinject later. */
@@ -502,7 +500,7 @@ linux_lookup(const int proto, const __be32 saddr, const __be16 sport,
        if (proto != IPPROTO_TCP)       /* XXX extend for UDP */
                return -1;
 
-       if ((dir ? (void *)skb->dst : (void *)skb->dev) == NULL) {
+       if ((dir ? (void *)skb_dst(skb) : (void *)skb->dev) == NULL) {
                panic(" -- this should not happen\n");
                return -1;
        }
index d18f503..5b04dce 100644 (file)
@@ -33,6 +33,8 @@
 #ifndef _MISSING_H_
 #define _MISSING_H_
 
+#include <sys/cdefs.h>
+
 #ifdef _WIN32
 
 #ifndef DEFINE_SPINLOCK
@@ -50,6 +52,7 @@
 
 #else  /* __linux__ */
 
+#define MALLOC_DECLARE(x)      /* nothing */
 #include <linux/time.h>                /* do_gettimeofday */
 #include <netinet/ip.h>                /* local version */
 struct inpcb;
@@ -122,7 +125,11 @@ struct malloc_type {
 
 #define CTASSERT(x)
 
-#define log(_level, fmt, arg...)  printk(KERN_ERR fmt, ##arg)
+/* log... does not use the first argument */
+#define        LOG_ERR         0x100
+#define        LOG_INFO        0x200
+#define log(_level, fmt, arg...)  do {                 \
+       int __unused x=_level;printk(KERN_ERR fmt, ##arg); } while (0)
 
 /*
  * gettimeofday would be in sys/time.h but it is not
@@ -263,6 +270,10 @@ int in_cksum(struct mbuf *m, int len);
 #define INADDR_TO_IFP(a, b) b = NULL
 #define pf_find_mtag(a) NULL
 #define pf_get_mtag(a) NULL
+/* we don't pullup, fail */
+#define m_pullup(m, x)                                 \
+       ((m)->m_len >= x ? (m) : (netisr_dispatch(-1, m), NULL))
+
 #ifndef _WIN32
 #define AF_LINK AF_ASH /* ? our sys/socket.h */
 #endif
@@ -389,7 +400,6 @@ struct sock *inet_lookup(
         const __be32 saddr, const __be16 sport,
         const __be32 daddr, const __be16 dport,
         const int dif);
-static int inet_iif(const struct sk_buff *skb);
 struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif);
 #endif /* Linux < 2.6 */
 
@@ -504,4 +514,6 @@ extern  ip_fw_chk_t     *ip_fw_chk_ptr;
 #define SYSCTL_VNET_PROC       SYSCTL_PROC
 #define SYSCTL_VNET_INT                SYSCTL_INT
 
+int my_mod_register(struct moduledata *mod, const char *name, int order);
+
 #endif /* !_MISSING_H_ */
index 575c47c..639a561 100644 (file)
@@ -36,7 +36,6 @@
 #include <sys/param.h>
 #ifdef _KERNEL
 #include <sys/cdefs.h>
-#include "missing.h"
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
@@ -382,7 +381,7 @@ int rn_debug =  1;
  * the leaf (see RNTORT() in route.c), the second one is the parent.
  * This routine initializes the relevant fields of the nodes, so that
  * the leaf is the left child of the parent node, and both nodes have
- * (almost) all all fields filled as appropriate.
+ * (almost) all fields filled as appropriate.
  * (XXX some fields are left unset, see the '#if 0' section).
  * The function returns a pointer to the parent node.
  */
diff --git a/dummynet2/Makefile b/dummynet2/Makefile
new file mode 100644 (file)
index 0000000..2fe1d7b
--- /dev/null
@@ -0,0 +1,226 @@
+#
+# $Id: Makefile 4657 2010-01-04 11:20:53Z marta $
+#
+# gnu Makefile to build linux module for ipfw+dummynet.
+#
+# The defaults are set to build without modifications on PlanetLab
+# and possibly 2.6 versions.
+
+# Some variables need to have specific names, because they are used
+# by the build infrastructure on Linux and OpenWrt. They are:
+# 
+#   ccflags-y  additional $(CC) flags
+#   M          used by Kbuild, we must set it to `pwd`
+#   obj-m      list of .o modules to build
+#   $(MOD)-y   for each $MOD in obj-m, the list of objects
+#   obj-y      same as above, for openwrt
+#   O_TARGET   the link target, for openwrt
+#   EXTRA_CFLAGS as the name says... in openwrt
+#   EXTRA_CFLAGS is used in 2.6.22 module kernel compilation too
+#   KERNELPATH the path to the kernel sources or headers
+#
+# Not sure about this (the name might be reserved)
+#   ipfw-cflags                our flags for building the module
+#
+# Other variables are only private and can be renamed. They include:
+#
+#   VER                linux version we are building for (2.4 2.6 or openwrt)
+#---
+
+$(warning including dummynet2/Makefile)
+
+# let's default to 2.6 for PlanetLab builds
+VER ?= 2.6
+
+#--- General values for all types of build ---
+# obj-m is the target module
+obj-m := ipfw_mod.o
+
+#-- the list of source files. IPFW_SRCS is our own name.
+# Original ipfw and dummynet sources + FreeBSD stuff,
+IPFW_SRCS := ip_fw2.c ip_dummynet.c ip_fw_pfil.c ip_fw_sockopt.c
+IPFW_SRCS += ip_fw_dynamic.c ip_fw_table.c ip_fw_log.c
+IPFW_SRCS += radix.c in_cksum.c
+# Module glue and functions missing in linux
+IPFW_SRCS += ipfw2_mod.c bsd_compat.c
+
+# generic cflags used on all systems
+#ipfw-cflags += -DIPFW_HASHTABLES
+ipfw-cflags += -DIPFIREWALL_DEFAULT_TO_ACCEPT
+# _BSD_SOURCE enables __FAVOR_BSD (udp/tcp bsd structs instead of posix)
+ipfw-cflags += -D_BSD_SOURCE
+ipfw-cflags += -DKERNEL_MODULE # build linux kernel module
+# the two header trees for empty and override files
+ipfw-cflags += -I $(M)/include_e
+ipfw-cflags += -I $(M)/include
+# XXX eventually ../dummynet/include will go away
+ipfw-cflags += -I $(M)/../dummynet/include
+ipfw-cflags += -include $(M)/../glue.h # headers
+ipfw-cflags += -include $(M)/missing.h # headers
+
+$(warning "---- Building dummynet kernel module for Version $(VER)")
+
+# We have three sections for OpenWrt, Linux 2.4 and Linux 2.6
+
+ifeq ($(VER),openwrt)
+  #--- The Makefile section for openwrt ---
+  # We do not include a dependency on include_e as it is called
+  # by Makefile.openwrt in Build/Prepare
+  M=.
+  obj-y := $(IPFW_SRCS:%.c=%.o)
+  O_TARGET := $(obj-m)
+
+  # xcflags-y is a temporary variable where we store build options
+  xcflags-y += -O1 -DLINUX_24
+  xcflags-y += -g
+
+  EXTRA_CFLAGS := $(xcflags-y) $(ipfw-cflags)
+
+  # we should not export anything
+  #export-objs := ipfw2_mod.o
+-include $(TOPDIR)/Rules.make
+
+else   # !openwrt, below we do linux builds for 2.4 and 2.6
+
+  # KERNELPATH is where the kernel headers reside. On PlanetLab
+  # it is set already by the build system.
+  # We can override it from the command line, or let the system guess.
+
+ifneq ($(shell echo $(VER)|grep '2.4'),)
+  # Makefile section for the linux 2.4 version
+  # tested on linux-2.4.35.4, does not work with 2.4.37
+  #
+  # guess the kernel path -- or is it under /lib/modules ?
+  KERNELPATH ?= /usr/src/`uname -r`
+
+  # We need to figure out the gcc include directory, if not
+  # set by the user through MYGCC_INCLUDE
+  # Find compiler version (3rd field in last line returned by gcc -v)
+  # e.g.       gcc version 4.3.2 (Debian 4.3.2-1.1)
+  MYGCC_VER ?= $(shell $(CC) -v 2>&1 |tail -n 1 | cut -d " " -f 3)
+  # We don't know the exact directory under /usr/lib/gcc so we guess
+  MYGCC_INCLUDE ?= $(shell echo /usr/lib/gcc/*/$(MYGCC_VER) | cut -d " " -f 1)/include
+  $(warning "---- gcc includes guessed to $(MYGCC_INCLUDE)")
+
+  # additional warning
+  WARN += -Wall -Wundef
+  WARN += -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing
+  WARN += -fno-common -Werror-implicit-function-declaration
+  # WARN += -O2  -fno-stack-protector -m32 -msoft-float -mregparm=3
+  # -mregparm=3 gives a printk error
+  WARN += -m32 -msoft-float # -mregparm=3
+  #WARN += -freg-struct-return -mpreferred-stack-boundary=2
+  WARN += -Wno-sign-compare
+  WARN += -Wdeclaration-after-statement
+  ifneq ($(MYGCC_VER),3.4.6)
+        WARN += -Wno-pointer-sign
+  endif
+
+  ccflags-y += -O1 -DLINUX_24
+  CFLAGS = -DMODULE -D__KERNEL__ -nostdinc \
+       -isystem ${KERNELPATH}/include -isystem $(MYGCC_INCLUDE) \
+       ${ccflags-y}
+  # The Main target
+all: mod24
+
+else # !2.4 --
+
+  # This is the Makefile section for Linux 2.6.x including planetlab
+
+ifeq ($(IPFW_PLANETLAB),1)
+  $(warning "---- Building for PlanetLab")
+  ipfw-cflags += -DIPFW_PLANETLAB        # PlanetLab compilation
+endif
+  # if not set, use the version from the installed system
+  KERNELPATH ?= /lib/modules/`uname -r`/build
+  # Otherwise, if you have kernel sources, try something like this:
+  #KERNELPATH = /usr/src/linux-2.6.22
+  $(warning "---- Building Version 2.6 $(VER) in $(KERNELPATH)")
+  WARN := -O1 -Wall -Werror -DDEBUG_SPINLOCK -DDEBUG_MUTEXES
+  # The main target
+
+  # Required by kernel <= 2.6.22, ccflags-y is used on newer version
+  LINUX_VERSION_CODE := $(shell grep LINUX_VERSION_CODE $(KERNELPATH)/include/linux/version.h|cut -d " " -f3)
+  ifeq ($(shell if [ -z $(LINUX_VERSION_CODE) ] ; then echo "true"; fi),true)
+    $(warning "---- Perhaps you miss a (cd $(KERNELPATH); make oldconfig; make prepare; make scripts)");
+  endif
+  ifeq ($(shell if [ $(LINUX_VERSION_CODE) -le 132630 ] ; then echo "true"; fi),true)
+    EXTRA_CFLAGS += $(ccflags-y)
+  endif
+
+all: include_e
+       $(MAKE) -C $(KERNELPATH) V=1 M=`pwd` modules
+endif # !2.4
+
+#-- back to the common section of code for Linux 2.4 and 2.6
+
+# the list of objects used to build the module
+ipfw_mod-y = $(IPFW_SRCS:%.c=%.o)
+
+# additional $(CC) flags
+ccflags-y += $(WARN)
+ccflags-y += $(ipfw-cflags)
+# if we really want debug symbols...
+ccflags-y += -g
+
+mod24: include_e $(obj-m)
+
+$(obj-m): $(ipfw_mod-y)
+       $(LD) $(LDFLAGS) -m elf_i386 -r -o $@ $^
+
+# M is the current directory, used in recursive builds
+# so we allow it to be overridden
+M ?= $(shell pwd)
+endif # !openwrt
+
+#--- various common targets
+clean:
+       -rm -f *.o *.ko Module.symvers *.mod.c
+       -rm -rf include_e
+
+distclean: clean
+       -rm -f .*cmd modules.order opt_*
+       -rm -rf .tmp_versions include_e
+       -rm -rf .*.o.d
+
+# support to create empty dirs and files in include_e/
+# EDIRS is the list of directories, EFILES is the list of files.
+
+EDIRS= altq arpa machine net netinet netinet6 sys
+
+EFILES += opt_inet6.h opt_ipfw.h opt_ipsec.h opt_mpath.h
+EFILES += opt_mbuf_stress_test.h opt_param.h
+
+EFILES += altq/if_altq.h
+EFILES += arpa/inet.h
+EFILES += machine/in_cksum.h
+EFILES += net/ethernet.h net/netisr.h net/pf_mtag.h
+EFILES += net/bpf.h net/if_types.h
+EFILES += net/vnet.h
+
+EFILES += netinet/ether.h netinet/icmp6.h netinet/if_ether.h
+EFILES += netinet/in.h netinet/in_pcb.h netinet/in_var.h
+EFILES += netinet/in_systm.h
+EFILES += netinet/ip_carp.h netinet/ip_var.h netinet/pim.h
+EFILES += netinet/sctp.h netinet/tcp_timer.h netinet/tcpip.h
+EFILES += netinet/udp_var.h
+
+EFILES += netinet6/ip6_var.h
+
+EFILES += sys/_lock.h sys/_rwlock.h sys/_mutex.h sys/jail.h
+EFILES += sys/condvar.h sys/eventhandler.h sys/domain.h
+EFILES += sys/limits.h sys/lock.h sys/mutex.h sys/priv.h
+EFILES += sys/proc.h sys/rwlock.h sys/socket.h sys/socketvar.h
+EFILES += sys/sysctl.h sys/time.h sys/ucred.h
+
+include_e:
+       echo "running in $M"
+       -@rm -rf $(M)/include_e opt_*
+       -@mkdir -p $(M)/include_e
+       -@(cd $(M)/include_e; mkdir -p $(EDIRS); touch $(EFILES) )
+
+
+#--- some other targets for testing purposes
+test_radix: test_radix.o radix.o
+test_lookup: ip_fw_lookup.o
+test_radix test_lookup: CFLAGS=-Wall -Werror -O1
diff --git a/dummynet2/bsd_compat.c b/dummynet2/bsd_compat.c
new file mode 100644 (file)
index 0000000..70268bb
--- /dev/null
@@ -0,0 +1,370 @@
+/*
+ * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id: bsd_compat.c 4665 2010-01-04 12:35:39Z luigi $
+ *
+ * kernel variables and functions that are not available in linux.
+ */
+
+#include <sys/cdefs.h>
+#include <asm/div64.h> /* do_div on 2.4 */
+#include <linux/random.h>      /* get_random_bytes on 2.4 */
+
+/*
+ * gettimeofday would be in sys/time.h but it is not
+ * visible if _KERNEL is defined
+ */
+int gettimeofday(struct timeval *, struct timezone *);
+
+int ticks;             /* kernel ticks counter */
+int hz = 1000;         /* default clock time */
+long tick = 1000;      /* XXX is this 100000/hz ? */
+int bootverbose = 0;
+time_t time_uptime = 0;
+struct timeval boottime;
+
+int     ip_defttl;
+int fw_one_pass = 1;
+u_long  in_ifaddrhmask;                         /* mask for hash table */
+struct  in_ifaddrhashhead *in_ifaddrhashtbl;    /* inet addr hash table  */
+
+u_int rt_numfibs = RT_NUMFIBS;
+
+/*
+ * pfil hook support.
+ * We make pfil_head_get return a non-null pointer, which is then ignored
+ * in our 'add-hook' routines.
+ */
+struct pfil_head;
+typedef int (pfil_hook_t)
+       (void *, struct mbuf **, struct ifnet *, int, struct inpcb *);
+
+struct pfil_head *
+pfil_head_get(int proto, u_long flags)
+{
+       static int dummy;
+       return (struct pfil_head *)&dummy;
+}
+int
+pfil_add_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h)
+{
+       return 0;
+}
+
+int
+pfil_remove_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h)
+{
+       return 0;
+}
+
+/* define empty body for kernel function */
+int
+priv_check(struct thread *td, int priv)
+{
+       return 0;
+}
+
+int
+securelevel_ge(struct ucred *cr, int level)
+{
+       return 0;
+}
+
+int
+sysctl_handle_int(SYSCTL_HANDLER_ARGS)
+{
+       return 0;
+}
+
+int
+sysctl_handle_long(SYSCTL_HANDLER_ARGS)
+{
+       return 0;
+}
+
+void
+ether_demux(struct ifnet *ifp, struct mbuf *m)
+{
+       return;
+}
+
+int
+ether_output_frame(struct ifnet *ifp, struct mbuf *m)
+{
+       return 0;
+}
+
+void
+in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum)
+{
+       return;
+}
+
+void
+icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu)
+{
+       return;
+}
+
+u_short
+in_cksum_skip(struct mbuf *m, int len, int skip)
+{
+       return 0;
+}
+
+u_short
+in_cksum_hdr(struct ip *ip)
+{
+       return 0;
+}
+
+/*
+ * we don't really reassemble, just return whatever we had.
+ */
+struct mbuf *
+ip_reass(struct mbuf *clone)
+{
+       return clone;
+}
+#ifdef INP_LOCK_ASSERT
+#undef INP_LOCK_ASSERT
+#define INP_LOCK_ASSERT(a)
+#endif
+
+/* credentials check */
+#include <netinet/ip_fw.h>
+int
+cred_check(void *_insn,  int proto, struct ifnet *oif,
+    struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
+    u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp,
+    struct sk_buff *skb)
+{
+       int match = 0;
+       ipfw_insn_u32 *insn = (ipfw_insn_u32 *)_insn;
+
+       if (*ugid_lookupp == 0) {        /* actively lookup and copy in cache */
+               /* returns null if any element of the chain up to file is null.
+                * if sk != NULL then we also have a reference
+                */
+               *ugid_lookupp = linux_lookup(proto,
+                       src_ip.s_addr, htons(src_port),
+                       dst_ip.s_addr, htons(dst_port),
+                       skb, oif ? 1 : 0, u);
+       }
+       if (*ugid_lookupp < 0)
+               return 0;
+
+       if (insn->o.opcode == O_UID)
+               match = (u->uid == (uid_t)insn->d[0]);
+       else if (insn->o.opcode == O_JAIL)
+               match = (u->xid == (uid_t)insn->d[0]);
+       else if (insn->o.opcode == O_GID)
+               match = (u->gid == (uid_t)insn->d[0]);
+       return match;
+}
+
+int
+jailed(struct ucred *cred)
+{
+       return 0;
+}
+
+/*
+ * Return 1 if an internet address is for a ``local'' host
+ * (one to which we have a connection).  If subnetsarelocal
+ * is true, this includes other subnets of the local net.
+ * Otherwise, it includes only the directly-connected (sub)nets.
+ */
+int
+in_localaddr(struct in_addr in)
+{
+       return 1;
+}
+
+int
+sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
+{
+       size_t valsize = sopt->sopt_valsize;
+
+       if (len < valsize)
+               sopt->sopt_valsize = valsize = len;
+       bcopy(buf, sopt->sopt_val, valsize);
+       return 0;
+}
+
+/*
+ * copy data from userland to kernel
+ */
+int
+sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
+{
+       size_t valsize = sopt->sopt_valsize;
+
+       if (valsize < minlen)
+               return EINVAL;
+       if (valsize > len)
+               sopt->sopt_valsize = valsize = len;
+       bcopy(sopt->sopt_val, buf, valsize);
+       return 0;
+}
+
+void
+getmicrouptime(struct timeval *tv)
+{
+#ifdef _WIN32
+#else
+       do_gettimeofday(tv);
+#endif
+}
+
+
+#include <arpa/inet.h>
+
+char *
+inet_ntoa_r(struct in_addr ina, char *buf)
+{
+#ifdef _WIN32
+#else
+       unsigned char *ucp = (unsigned char *)&ina;
+
+       sprintf(buf, "%d.%d.%d.%d",
+       ucp[0] & 0xff,
+       ucp[1] & 0xff,
+       ucp[2] & 0xff,
+       ucp[3] & 0xff);
+#endif
+       return buf;
+}
+
+char *
+inet_ntoa(struct in_addr ina)
+{
+       static char buf[16];
+       return inet_ntoa_r(ina, buf);
+}
+
+int
+random(void)
+{
+#ifdef _WIN32
+       return 0x123456;
+#else
+       int r;
+       get_random_bytes(&r, sizeof(r));
+       return r & 0x7fffffff; 
+#endif
+}
+
+
+/*
+ * do_div really does a u64 / u32 bit division.
+ * we save the sign and convert to uint before calling.
+ * We are safe just because we always call it with small operands.
+ */
+int64_t
+div64(int64_t a, int64_t b)
+{
+#ifdef _WIN32
+        int a1 = a, b1 = b;
+       return a1/b1;
+#else
+       uint64_t ua, ub;
+       int sign = ((a>0)?1:-1) * ((b>0)?1:-1);
+
+       ua = ((a>0)?a:-a);
+       ub = ((b>0)?b:-b);
+        do_div(ua, ub);
+       return sign*ua;
+#endif
+}
+
+/*
+ * compact version of fnmatch.
+ */
+int
+fnmatch(const char *pattern, const char *string, int flags)
+{
+       char s;
+
+       if (!string || !pattern)
+               return 1;       /* no match */
+       while ( (s = *string++) ) {
+               char p = *pattern++;
+               if (p == '\0')          /* pattern is over, no match */
+                       return 1;
+               if (p == '*')           /* wildcard, match */
+                       return 0;
+               if (p == '.' || p == s) /* char match, continue */
+                       continue;
+               return 1;               /* no match */
+       }
+       /* end of string, make sure the pattern is over too */
+       if (*pattern == '\0' || *pattern == '*')
+               return 0;
+       return 1;       /* no match */
+}
+
+#ifdef _WIN32
+/*
+ * as good as anywhere, place here the missing calls
+ */
+
+void *
+my_alloc(int size)
+{
+       void *_ret = ExAllocatePoolWithTag(0, size, 'wfpi');
+       if (_ret)
+               memset(_ret, 0, size);
+       return _ret;
+}
+
+void
+panic(const char *fmt, ...)
+{
+       printf("%s", fmt);
+       for (;;);
+}
+
+#include <stdarg.h>
+
+extern int _vsnprintf(char *buf, int buf_size, char * fmt, va_list ap);
+
+/*
+ * Windows' _snprintf doesn't terminate buffer with zero if size > buf_size
+ */
+int
+snprintf(char *buf, int buf_size, char *fmt, ...)
+{
+    va_list ap;
+    va_start(ap, fmt);
+    if (_vsnprintf(buf, buf_size, fmt, ap) < 0)
+        buf[buf_size - 1] = '\0';
+    va_end(ap);
+
+    return 0;
+}
+#endif
diff --git a/dummynet2/in_cksum.c b/dummynet2/in_cksum.c
new file mode 100644 (file)
index 0000000..8972cef
--- /dev/null
@@ -0,0 +1,150 @@
+/*-
+ * Copyright (c) 1988, 1992, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)in_cksum.c  8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/netinet/in_cksum.c,v 1.10 2007/10/07 20:44:22 silby Exp $");
+
+#include <sys/param.h>
+#include <sys/mbuf.h>
+
+/*
+ * Checksum routine for Internet Protocol family headers (Portable Version).
+ *
+ * This routine is very heavily used in the network
+ * code and should be modified for each CPU to be as fast as possible.
+ */
+
+#define ADDCARRY(x)  (x > 65535 ? x -= 65535 : x)
+#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);}
+
+int
+in_cksum(struct mbuf *m, int len)
+{
+       register u_short *w;
+       register int sum = 0;
+       register int mlen = 0;
+       int byte_swapped = 0;
+
+       union {
+               char    c[2];
+               u_short s;
+       } s_util;
+       union {
+               u_short s[2];
+               long    l;
+       } l_util;
+
+       for (;m && len; m = m->m_next) {
+               if (m->m_len == 0)
+                       continue;
+               w = mtod(m, u_short *);
+               if (mlen == -1) {
+                       /*
+                        * The first byte of this mbuf is the continuation
+                        * of a word spanning between this mbuf and the
+                        * last mbuf.
+                        *
+                        * s_util.c[0] is already saved when scanning previous
+                        * mbuf.
+                        */
+                       s_util.c[1] = *(char *)w;
+                       sum += s_util.s;
+                       w = (u_short *)((char *)w + 1);
+                       mlen = m->m_len - 1;
+                       len--;
+               } else
+                       mlen = m->m_len;
+               if (len < mlen)
+                       mlen = len;
+               len -= mlen;
+               /*
+                * Force to even boundary.
+                */
+#if defined(CONFIG_X86_64)
+               if ((1 & (long) w) && (mlen > 0)) {
+#else
+               if ((1 & (int) w) && (mlen > 0)) {
+#endif
+                       REDUCE;
+                       sum <<= 8;
+                       s_util.c[0] = *(u_char *)w;
+                       w = (u_short *)((char *)w + 1);
+                       mlen--;
+                       byte_swapped = 1;
+               }
+               /*
+                * Unroll the loop to make overhead from
+                * branches &c small.
+                */
+               while ((mlen -= 32) >= 0) {
+                       sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
+                       sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
+                       sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
+                       sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
+                       w += 16;
+               }
+               mlen += 32;
+               while ((mlen -= 8) >= 0) {
+                       sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
+                       w += 4;
+               }
+               mlen += 8;
+               if (mlen == 0 && byte_swapped == 0)
+                       continue;
+               REDUCE;
+               while ((mlen -= 2) >= 0) {
+                       sum += *w++;
+               }
+               if (byte_swapped) {
+                       REDUCE;
+                       sum <<= 8;
+                       byte_swapped = 0;
+                       if (mlen == -1) {
+                               s_util.c[1] = *(char *)w;
+                               sum += s_util.s;
+                               mlen = 0;
+                       } else
+                               mlen = -1;
+               } else if (mlen == -1)
+                       s_util.c[0] = *(char *)w;
+       }
+       if (len)
+               printf("cksum: out of data\n");
+       if (mlen == -1) {
+               /* The last mbuf has odd # of bytes. Follow the
+                  standard (the odd byte may be shifted left by 8 bits
+                  or not as determined by endian-ness of the machine) */
+               s_util.c[1] = 0;
+               sum += s_util.s;
+       }
+       REDUCE;
+       return (~sum & 0xffff);
+}
diff --git a/dummynet2/include/netgraph/ng_ipfw.h b/dummynet2/include/netgraph/ng_ipfw.h
new file mode 100644 (file)
index 0000000..55fd890
--- /dev/null
@@ -0,0 +1,33 @@
+/*-
+ * Copyright 2005, Gleb Smirnoff <glebius@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/netgraph/ng_ipfw.h,v 1.2 2006/02/17 09:42:49 glebius Exp $
+ */
+#ifndef __NG_IPFW_H
+#define __NG_IPFW_H
+
+#define NG_IPFW_NODE_TYPE    "ipfw"
+#define NGM_IPFW_COOKIE      1105988990
+#endif /* __NG_IPFW_H */
diff --git a/dummynet2/include/netinet/ip_dummynet.h b/dummynet2/include/netinet/ip_dummynet.h
new file mode 100644 (file)
index 0000000..f01bfe2
--- /dev/null
@@ -0,0 +1,374 @@
+/*-
+ * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
+ * Portions Copyright (c) 2000 Akamba Corp.
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/netinet/ip_dummynet.h,v 1.40.2.1 2008/04/25 10:26:30 oleg Exp $
+ */
+
+#ifndef _IP_DUMMYNET_H
+#define _IP_DUMMYNET_H
+
+/*
+ * Definition of dummynet data structures. In the structures, I decided
+ * not to use the macros in <sys/queue.h> in the hope of making the code
+ * easier to port to other architectures. The type of lists and queue we
+ * use here is pretty simple anyways.
+ */
+
+/*
+ * We start with a heap, which is used in the scheduler to decide when
+ * to transmit packets etc.
+ *
+ * The key for the heap is used for two different values:
+ *
+ * 1. timer ticks- max 10K/second, so 32 bits are enough;
+ *
+ * 2. virtual times. These increase in steps of len/x, where len is the
+ *    packet length, and x is either the weight of the flow, or the
+ *    sum of all weights.
+ *    If we limit to max 1000 flows and a max weight of 100, then
+ *    x needs 17 bits. The packet size is 16 bits, so we can easily
+ *    overflow if we do not allow errors.
+ * So we use a key "dn_key" which is 64 bits. Some macros are used to
+ * compare key values and handle wraparounds.
+ * MAX64 returns the largest of two key values.
+ * MY_M is used as a shift count when doing fixed point arithmetic
+ * (a better name would be useful...).
+ */
+typedef u_int64_t dn_key ;      /* sorting key */
+#define DN_KEY_LT(a,b)     ((int64_t)((a)-(b)) < 0)
+#define DN_KEY_LEQ(a,b)    ((int64_t)((a)-(b)) <= 0)
+#define DN_KEY_GT(a,b)     ((int64_t)((a)-(b)) > 0)
+#define DN_KEY_GEQ(a,b)    ((int64_t)((a)-(b)) >= 0)
+#define MAX64(x,y)  (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x)
+#define MY_M   16 /* number of left shift to obtain a larger precision */
+
+/*
+ * XXX With this scaling, max 1000 flows, max weight 100, 1Gbit/s, the
+ * virtual time wraps every 15 days.
+ */
+
+
+/*
+ * The maximum hash table size for queues.  This value must be a power
+ * of 2.
+ */
+#define DN_MAX_HASH_SIZE 65536
+
+/*
+ * A heap entry is made of a key and a pointer to the actual
+ * object stored in the heap.
+ * The heap is an array of dn_heap_entry entries, dynamically allocated.
+ * Current size is "size", with "elements" actually in use.
+ * The heap normally supports only ordered insert and extract from the top.
+ * If we want to extract an object from the middle of the heap, we
+ * have to know where the object itself is located in the heap (or we
+ * need to scan the whole array). To this purpose, an object has a
+ * field (int) which contains the index of the object itself into the
+ * heap. When the object is moved, the field must also be updated.
+ * The offset of the index in the object is stored in the 'offset'
+ * field in the heap descriptor. The assumption is that this offset
+ * is non-zero if we want to support extract from the middle.
+ */
+struct dn_heap_entry {
+    dn_key key ;       /* sorting key. Topmost element is smallest one */
+    void *object ;     /* object pointer */
+} ;
+
+struct dn_heap {
+    int size ;
+    int elements ;
+    int offset ; /* XXX if > 0 this is the offset of direct ptr to obj */
+    struct dn_heap_entry *p ;  /* really an array of "size" entries */
+} ;
+
+#ifdef _KERNEL
+/*
+ * Packets processed by dummynet have an mbuf tag associated with
+ * them that carries their dummynet state.  This is used within
+ * the dummynet code as well as outside when checking for special
+ * processing requirements.
+ * Note that the first part is the reinject info and is common to
+ * other forms of packet reinjection.
+ */
+struct dn_pkt_tag {
+       struct ipfw_rule_ref rule;      /* matching rule */
+
+    /* second part, dummynet specific */
+    int dn_dir;                        /* action when packet comes out. */
+                               /* see ip_fw_private.h */
+
+    dn_key output_time;                /* when the pkt is due for delivery     */
+    struct ifnet *ifp;         /* interface, for ip_output             */
+    struct _ip6dn_args ip6opt; /* XXX ipv6 options                     */
+};
+#endif /* _KERNEL */
+
+/*
+ * Overall structure of dummynet (with WF2Q+):
+
+In dummynet, packets are selected with the firewall rules, and passed
+to two different objects: PIPE or QUEUE.
+
+A QUEUE is just a queue with configurable size and queue management
+policy. It is also associated with a mask (to discriminate among
+different flows), a weight (used to give different shares of the
+bandwidth to different flows) and a "pipe", which essentially
+supplies the transmit clock for all queues associated with that
+pipe.
+
+A PIPE emulates a fixed-bandwidth link, whose bandwidth is
+configurable.  The "clock" for a pipe can come from either an
+internal timer, or from the transmit interrupt of an interface.
+A pipe is also associated with one (or more, if masks are used)
+queue, where all packets for that pipe are stored.
+
+The bandwidth available on the pipe is shared by the queues
+associated with that pipe (only one in case the packet is sent
+to a PIPE) according to the WF2Q+ scheduling algorithm and the
+configured weights.
+
+In general, incoming packets are stored in the appropriate queue,
+which is then placed into one of a few heaps managed by a scheduler
+to decide when the packet should be extracted.
+The scheduler (a function called dummynet()) is run at every timer
+tick, and grabs queues from the head of the heaps when they are
+ready for processing.
+
+There are three data structures defining a pipe and associated queues:
+
+ + dn_pipe, which contains the main configuration parameters related
+   to delay and bandwidth;
+ + dn_flow_set, which contains WF2Q+ configuration, flow
+   masks, plr and RED configuration;
+ + dn_flow_queue, which is the per-flow queue (containing the packets)
+
+Multiple dn_flow_set can be linked to the same pipe, and multiple
+dn_flow_queue can be linked to the same dn_flow_set.
+All data structures are linked in a linear list which is used for
+housekeeping purposes.
+
+During configuration, we create and initialize the dn_flow_set
+and dn_pipe structures (a dn_pipe also contains a dn_flow_set).
+
+At runtime: packets are sent to the appropriate dn_flow_set (either
+WFQ ones, or the one embedded in the dn_pipe for fixed-rate flows),
+which in turn dispatches them to the appropriate dn_flow_queue
+(created dynamically according to the masks).
+
+The transmit clock for fixed rate flows (ready_event()) selects the
+dn_flow_queue to be used to transmit the next packet. For WF2Q,
+wfq_ready_event() extracts a pipe which in turn selects the right
+flow using a number of heaps defined into the pipe itself.
+
+ *
+ */
+
+/*
+ * per flow queue. This contains the flow identifier, the queue
+ * of packets, counters, and parameters used to support both RED and
+ * WF2Q+.
+ *
+ * A dn_flow_queue is created and initialized whenever a packet for
+ * a new flow arrives.
+ */
+struct dn_flow_queue {
+    struct dn_flow_queue *next ;
+    struct ipfw_flow_id id ;
+
+    struct mbuf *head, *tail ; /* queue of packets */
+    u_int len ;
+    u_int len_bytes ;
+
+    /*
+     * When we emulate MAC overheads, or channel unavailability due
+     * to other traffic on a shared medium, we augment the packet at
+     * the head of the queue with an 'extra_bits' field representing
+     * the additional delay the packet will be subject to:
+     *         extra_bits = bw*unavailable_time.
+     * With large bandwidth and large delays, extra_bits (and also numbytes)
+     * can become very large, so better play safe and use 64 bit
+     */
+    uint64_t numbytes ;                /* credit for transmission (dynamic queues) */
+    int64_t extra_bits;                /* extra bits simulating unavailable channel */
+
+    u_int64_t tot_pkts ;       /* statistics counters  */
+    u_int64_t tot_bytes ;
+    u_int32_t drops ;
+
+    int hash_slot ;            /* debugging/diagnostic */
+
+    /* RED parameters */
+    int avg ;                   /* average queue length est. (scaled) */
+    int count ;                 /* arrivals since last RED drop */
+    int random ;                /* random value (scaled) */
+    dn_key idle_time;          /* start of queue idle time */
+
+    /* WF2Q+ support */
+    struct dn_flow_set *fs ;   /* parent flow set */
+    int heap_pos ;             /* position (index) of struct in heap */
+    dn_key sched_time ;                /* current time when queue enters ready_heap */
+
+    dn_key S,F ;               /* start time, finish time */
+    /*
+     * Setting F < S means the timestamp is invalid. We only need
+     * to test this when the queue is empty.
+     */
+} ;
+
+/*
+ * flow_set descriptor. Contains the "template" parameters for the
+ * queue configuration, and pointers to the hash table of dn_flow_queue's.
+ *
+ * The hash table is an array of lists -- we identify the slot by
+ * hashing the flow-id, then scan the list looking for a match.
+ * The size of the hash table (buckets) is configurable on a per-queue
+ * basis.
+ *
+ * A dn_flow_set is created whenever a new queue or pipe is created (in the
+ * latter case, the structure is located inside the struct dn_pipe).
+ */
+struct dn_flow_set {
+    SLIST_ENTRY(dn_flow_set)   next;   /* linked list in a hash slot */
+
+    u_short fs_nr ;             /* flow_set number       */
+    u_short flags_fs;
+#define DN_HAVE_FLOW_MASK      0x0001
+#define DN_IS_RED              0x0002
+#define DN_IS_GENTLE_RED       0x0004
+#define DN_QSIZE_IS_BYTES      0x0008  /* queue size is measured in bytes */
+#define DN_NOERROR             0x0010  /* do not report ENOBUFS on drops  */
+#define        DN_HAS_PROFILE          0x0020  /* the pipe has a delay profile. */
+#define DN_IS_PIPE             0x4000
+#define DN_IS_QUEUE            0x8000
+
+    struct dn_pipe *pipe ;     /* pointer to parent pipe */
+    u_short parent_nr ;                /* parent pipe#, 0 if local to a pipe */
+
+    int weight ;               /* WFQ queue weight */
+    int qsize ;                        /* queue size in slots or bytes */
+    int plr ;                  /* pkt loss rate (2^31-1 means 100%) */
+
+    struct ipfw_flow_id flow_mask ;
+
+    /* hash table of queues onto this flow_set */
+    int rq_size ;              /* number of slots */
+    int rq_elements ;          /* active elements */
+    struct dn_flow_queue **rq; /* array of rq_size entries */
+
+    u_int32_t last_expired ;   /* do not expire too frequently */
+    int backlogged ;           /* #active queues for this flowset */
+
+        /* RED parameters */
+#define SCALE_RED               16
+#define SCALE(x)                ( (x) << SCALE_RED )
+#define SCALE_VAL(x)            ( (x) >> SCALE_RED )
+#define SCALE_MUL(x,y)          ( ( (x) * (y) ) >> SCALE_RED )
+    int w_q ;                  /* queue weight (scaled) */
+    int max_th ;               /* maximum threshold for queue (scaled) */
+    int min_th ;               /* minimum threshold for queue (scaled) */
+    int max_p ;                        /* maximum value for p_b (scaled) */
+    u_int c_1 ;                        /* max_p/(max_th-min_th) (scaled) */
+    u_int c_2 ;                        /* max_p*min_th/(max_th-min_th) (scaled) */
+    u_int c_3 ;                        /* for GRED, (1-max_p)/max_th (scaled) */
+    u_int c_4 ;                        /* for GRED, 1 - 2*max_p (scaled) */
+    u_int * w_q_lookup ;       /* lookup table for computing (1-w_q)^t */
+    u_int lookup_depth ;       /* depth of lookup table */
+    int lookup_step ;          /* granularity inside the lookup table */
+    int lookup_weight ;                /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
+    int avg_pkt_size ;         /* medium packet size */
+    int max_pkt_size ;         /* max packet size */
+};
+SLIST_HEAD(dn_flow_set_head, dn_flow_set);
+
+/*
+ * Pipe descriptor. Contains global parameters, delay-line queue,
+ * and the flow_set used for fixed-rate queues.
+ *
+ * For WF2Q+ support it also has 3 heaps holding dn_flow_queue:
+ *   not_eligible_heap, for queues whose start time is higher
+ *     than the virtual time. Sorted by start time.
+ *   scheduler_heap, for queues eligible for scheduling. Sorted by
+ *     finish time.
+ *   idle_heap, all flows that are idle and can be removed. We
+ *     do that on each tick so we do not slow down too much
+ *     operations during forwarding.
+ *
+ */
+struct dn_pipe {               /* a pipe */
+    SLIST_ENTRY(dn_pipe)       next;   /* linked list in a hash slot */
+
+    int        pipe_nr ;               /* number       */
+    int bandwidth;             /* really, bytes/tick.  */
+    int        delay ;                 /* really, ticks        */
+
+    struct     mbuf *head, *tail ;     /* packets in delay line */
+
+    /* WF2Q+ */
+    struct dn_heap scheduler_heap ; /* top extract - key Finish time*/
+    struct dn_heap not_eligible_heap; /* top extract- key Start time */
+    struct dn_heap idle_heap ; /* random extract - key Start=Finish time */
+
+    dn_key V ;                 /* virtual time */
+    int sum;                   /* sum of weights of all active sessions */
+
+    /* Same as in dn_flow_queue, numbytes can become large */
+    int64_t numbytes;          /* bits I can transmit (more or less). */
+    uint64_t burst;            /* burst size, scaled: bits * hz */
+
+    dn_key sched_time ;                /* time pipe was scheduled in ready_heap */
+    dn_key idle_time;          /* start of pipe idle time */
+
+    /*
+     * When the tx clock comes from an interface (if_name[0] != '\0'), its name
+     * is stored below, whereas the ifp is filled when the rule is configured.
+     */
+    char if_name[IFNAMSIZ];
+    struct ifnet *ifp ;
+    int ready ; /* set if ifp != NULL and we got a signal from it */
+
+    struct dn_flow_set fs ; /* used with fixed-rate flows */
+
+    /* fields to simulate a delay profile */
+
+#define ED_MAX_NAME_LEN                32
+    char name[ED_MAX_NAME_LEN];
+    int loss_level;
+    int samples_no;
+    int *samples;
+};
+
+/* dn_pipe_max is used to pass pipe configuration from userland onto
+ * kernel space and back
+ */
+#define ED_MAX_SAMPLES_NO      1024
+struct dn_pipe_max {
+       struct dn_pipe pipe;
+       int samples[ED_MAX_SAMPLES_NO];
+};
+
+SLIST_HEAD(dn_pipe_head, dn_pipe);
+
+#endif /* _IP_DUMMYNET_H */
diff --git a/dummynet2/include/netinet/ip_fw.h b/dummynet2/include/netinet/ip_fw.h
new file mode 100644 (file)
index 0000000..238601f
--- /dev/null
@@ -0,0 +1,574 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/netinet/ip_fw.h 200580 2009-12-15 16:15:14Z luigi $
+ */
+
+#ifndef _IPFW2_H
+#define _IPFW2_H
+
+/*
+ * The default rule number.  By the design of ip_fw, the default rule
+ * is the last one, so its number can also serve as the highest number
+ * allowed for a rule.  The ip_fw code relies on both meanings of this
+ * constant. 
+ */
+#define        IPFW_DEFAULT_RULE       65535
+
+/*
+ * The number of ipfw tables.  The maximum allowed table number is the
+ * (IPFW_TABLES_MAX - 1).
+ */
+#define        IPFW_TABLES_MAX         128
+
+/*
+ * Most commands (queue, pipe, tag, untag, limit...) can have a 16-bit
+ * argument between 1 and 65534. The value 0 is unused, the value
+ * 65535 (IP_FW_TABLEARG) is used to represent 'tablearg', i.e. the
+ * result of the most recent table() lookup; so the argument
+ * can be 1..65534, or 65535 to indicate the use of a 'tablearg'.
+ * Note that 16bit is only a historical limit, resulting from
+ * the use of a 16-bit fields for that value. In reality, we can have
+ * 2^32 pipes, queues, tag values and so on, and use 0 as a tablearg.
+ */
+#define        IPFW_ARG_MIN            1
+#define        IPFW_ARG_MAX            65534
+#define IP_FW_TABLEARG         65535   /* XXX should use 0 */
+
+/*
+ * The kernel representation of ipfw rules is made of a list of
+ * 'instructions' (for all practical purposes equivalent to BPF
+ * instructions), which specify which fields of the packet
+ * (or its metadata) should be analysed.
+ *
+ * Each instruction is stored in a structure which begins with
+ * "ipfw_insn", and can contain extra fields depending on the
+ * instruction type (listed below).
+ * Note that the code is written so that individual instructions
+ * have a size which is a multiple of 32 bits. This means that, if
+ * such structures contain pointers or other 64-bit entities,
+ * (there is just one instance now) they may end up unaligned on
+ * 64-bit architectures, so they must be handled with care.
+ *
+ * "enum ipfw_opcodes" are the opcodes supported. We can have up
+ * to 256 different opcodes. When adding new opcodes, they should
+ * be appended to the end of the opcode list before O_LAST_OPCODE,
+ * this will prevent the ABI from being broken, otherwise users
+ * will have to recompile ipfw(8) when they update the kernel.
+ */
+
+enum ipfw_opcodes {            /* arguments (4 byte each)      */
+       O_NOP,
+
+       O_IP_SRC,               /* u32 = IP                     */
+       O_IP_SRC_MASK,          /* ip = IP/mask                 */
+       O_IP_SRC_ME,            /* none                         */
+       O_IP_SRC_SET,           /* u32=base, arg1=len, bitmap   */
+
+       O_IP_DST,               /* u32 = IP                     */
+       O_IP_DST_MASK,          /* ip = IP/mask                 */
+       O_IP_DST_ME,            /* none                         */
+       O_IP_DST_SET,           /* u32=base, arg1=len, bitmap   */
+
+       O_IP_SRCPORT,           /* (n)port list:mask 4 byte ea  */
+       O_IP_DSTPORT,           /* (n)port list:mask 4 byte ea  */
+       O_PROTO,                /* arg1=protocol                */
+
+       O_MACADDR2,             /* 2 mac addr:mask              */
+       O_MAC_TYPE,             /* same as srcport              */
+
+       O_LAYER2,               /* none                         */
+       O_IN,                   /* none                         */
+       O_FRAG,                 /* none                         */
+
+       O_RECV,                 /* none                         */
+       O_XMIT,                 /* none                         */
+       O_VIA,                  /* none                         */
+
+       O_IPOPT,                /* arg1 = 2*u8 bitmap           */
+       O_IPLEN,                /* arg1 = len                   */
+       O_IPID,                 /* arg1 = id                    */
+
+       O_IPTOS,                /* arg1 = id                    */
+       O_IPPRECEDENCE,         /* arg1 = precedence << 5       */
+       O_IPTTL,                /* arg1 = TTL                   */
+
+       O_IPVER,                /* arg1 = version               */
+       O_UID,                  /* u32 = id                     */
+       O_GID,                  /* u32 = id                     */
+       O_ESTAB,                /* none (tcp established)       */
+       O_TCPFLAGS,             /* arg1 = 2*u8 bitmap           */
+       O_TCPWIN,               /* arg1 = desired win           */
+       O_TCPSEQ,               /* u32 = desired seq.           */
+       O_TCPACK,               /* u32 = desired seq.           */
+       O_ICMPTYPE,             /* u32 = icmp bitmap            */
+       O_TCPOPTS,              /* arg1 = 2*u8 bitmap           */
+
+       O_VERREVPATH,           /* none                         */
+       O_VERSRCREACH,          /* none                         */
+
+       O_PROBE_STATE,          /* none                         */
+       O_KEEP_STATE,           /* none                         */
+       O_LIMIT,                /* ipfw_insn_limit              */
+       O_LIMIT_PARENT,         /* dyn_type, not an opcode.     */
+
+       /*
+        * These are really 'actions'.
+        */
+
+       O_LOG,                  /* ipfw_insn_log                */
+       O_PROB,                 /* u32 = match probability      */
+
+       O_CHECK_STATE,          /* none                         */
+       O_ACCEPT,               /* none                         */
+       O_DENY,                 /* none                         */
+       O_REJECT,               /* arg1=icmp arg (same as deny) */
+       O_COUNT,                /* none                         */
+       O_SKIPTO,               /* arg1=next rule number        */
+       O_PIPE,                 /* arg1=pipe number             */
+       O_QUEUE,                /* arg1=queue number            */
+       O_DIVERT,               /* arg1=port number             */
+       O_TEE,                  /* arg1=port number             */
+       O_FORWARD_IP,           /* fwd sockaddr                 */
+       O_FORWARD_MAC,          /* fwd mac                      */
+       O_NAT,                  /* nope                         */
+       O_REASS,                /* none                         */
+       
+       /*
+        * More opcodes.
+        */
+       O_IPSEC,                /* has ipsec history            */
+       O_IP_SRC_LOOKUP,        /* arg1=table number, u32=value */
+       O_IP_DST_LOOKUP,        /* arg1=table number, u32=value */
+       O_ANTISPOOF,            /* none                         */
+       O_JAIL,                 /* u32 = id                     */
+       O_ALTQ,                 /* u32 = altq classif. qid      */
+       O_DIVERTED,             /* arg1=bitmap (1:loop, 2:out)  */
+       O_TCPDATALEN,           /* arg1 = tcp data len          */
+       O_IP6_SRC,              /* address without mask         */
+       O_IP6_SRC_ME,           /* my addresses                 */
+       O_IP6_SRC_MASK,         /* address with the mask        */
+       O_IP6_DST,
+       O_IP6_DST_ME,
+       O_IP6_DST_MASK,
+       O_FLOW6ID,              /* for flow id tag in the ipv6 pkt */
+       O_ICMP6TYPE,            /* icmp6 packet type filtering  */
+       O_EXT_HDR,              /* filtering for ipv6 extension header */
+       O_IP6,
+
+       /*
+        * actions for ng_ipfw
+        */
+       O_NETGRAPH,             /* send to ng_ipfw              */
+       O_NGTEE,                /* copy to ng_ipfw              */
+
+       O_IP4,
+
+       O_UNREACH6,             /* arg1=icmpv6 code arg (deny)  */
+
+       O_TAG,                  /* arg1=tag number */
+       O_TAGGED,               /* arg1=tag number */
+
+       O_SETFIB,               /* arg1=FIB number */
+       O_FIB,                  /* arg1=FIB desired fib number */
+
+       O_LAST_OPCODE           /* not an opcode!               */
+};
+
+/*
+ * The extension header are filtered only for presence using a bit
+ * vector with a flag for each header.
+ */
+#define EXT_FRAGMENT   0x1
+#define EXT_HOPOPTS    0x2
+#define EXT_ROUTING    0x4
+#define EXT_AH         0x8
+#define EXT_ESP                0x10
+#define EXT_DSTOPTS    0x20
+#define EXT_RTHDR0             0x40
+#define EXT_RTHDR2             0x80
+
+/*
+ * Template for instructions.
+ *
+ * ipfw_insn is used for all instructions which require no operands,
+ * a single 16-bit value (arg1), or a couple of 8-bit values.
+ *
+ * For other instructions which require different/larger arguments
+ * we have derived structures, ipfw_insn_*.
+ *
+ * The size of the instruction (in 32-bit words) is in the low
+ * 6 bits of "len". The 2 remaining bits are used to implement
+ * NOT and OR on individual instructions. Given a type, you can
+ * compute the length to be put in "len" using F_INSN_SIZE(t)
+ *
+ * F_NOT       negates the match result of the instruction.
+ *
+ * F_OR                is used to build or blocks. By default, instructions
+ *             are evaluated as part of a logical AND. An "or" block
+ *             { X or Y or Z } contains F_OR set in all but the last
+ *             instruction of the block. A match will cause the code
+ *             to skip past the last instruction of the block.
+ *
+ * NOTA BENE: in a couple of places we assume that
+ *     sizeof(ipfw_insn) == sizeof(u_int32_t)
+ * this needs to be fixed.
+ *
+ */
+typedef struct _ipfw_insn {    /* template for instructions */
+       u_int8_t        opcode;
+       u_int8_t        len;    /* number of 32-bit words */
+#define        F_NOT           0x80
+#define        F_OR            0x40
+#define        F_LEN_MASK      0x3f
+#define        F_LEN(cmd)      ((cmd)->len & F_LEN_MASK)
+
+       u_int16_t       arg1;
+} ipfw_insn;
+
+/*
+ * The F_INSN_SIZE(type) computes the size, in 4-byte words, of
+ * a given type.
+ */
+#define        F_INSN_SIZE(t)  ((sizeof (t))/sizeof(u_int32_t))
+
+/*
+ * This is used to store an array of 16-bit entries (ports etc.)
+ */
+typedef struct _ipfw_insn_u16 {
+       ipfw_insn o;
+       u_int16_t ports[2];     /* there may be more */
+} ipfw_insn_u16;
+
+/*
+ * This is used to store an array of 32-bit entries
+ * (uid, single IPv4 addresses etc.)
+ */
+typedef struct _ipfw_insn_u32 {
+       ipfw_insn o;
+       u_int32_t d[1]; /* one or more */
+} ipfw_insn_u32;
+
+/*
+ * This is used to store IP addr-mask pairs.
+ */
+typedef struct _ipfw_insn_ip {
+       ipfw_insn o;
+       struct in_addr  addr;
+       struct in_addr  mask;
+} ipfw_insn_ip;
+
+/*
+ * This is used to forward to a given address (ip).
+ */
+typedef struct  _ipfw_insn_sa {
+       ipfw_insn o;
+       struct sockaddr_in sa;
+} ipfw_insn_sa;
+
+/*
+ * This is used for MAC addr-mask pairs.
+ */
+typedef struct _ipfw_insn_mac {
+       ipfw_insn o;
+       u_char addr[12];        /* dst[6] + src[6] */
+       u_char mask[12];        /* dst[6] + src[6] */
+} ipfw_insn_mac;
+
+/*
+ * This is used for interface match rules (recv xx, xmit xx).
+ */
+typedef struct _ipfw_insn_if {
+       ipfw_insn o;
+       union {
+               struct in_addr ip;
+               int glob;
+       } p;
+       char name[IFNAMSIZ];
+} ipfw_insn_if;
+
+/*
+ * This is used for storing an altq queue id number.
+ */
+typedef struct _ipfw_insn_altq {
+       ipfw_insn       o;
+       u_int32_t       qid;
+} ipfw_insn_altq;
+
+/*
+ * This is used for limit rules.
+ */
+typedef struct _ipfw_insn_limit {
+       ipfw_insn o;
+       u_int8_t _pad;
+       u_int8_t limit_mask;    /* combination of DYN_* below   */
+#define        DYN_SRC_ADDR    0x1
+#define        DYN_SRC_PORT    0x2
+#define        DYN_DST_ADDR    0x4
+#define        DYN_DST_PORT    0x8
+
+       u_int16_t conn_limit;
+} ipfw_insn_limit;
+
+/*
+ * This is used for log instructions.
+ */
+typedef struct  _ipfw_insn_log {
+        ipfw_insn o;
+       u_int32_t max_log;      /* how many do we log -- 0 = all */
+       u_int32_t log_left;     /* how many left to log         */
+} ipfw_insn_log;
+
+/*
+ * Data structures required by both ipfw(8) and ipfw(4) but not part of the
+ * management API are protected by IPFW_INTERNAL.
+ */
+#ifdef IPFW_INTERNAL
+/* Server pool support (LSNAT). */
+struct cfg_spool {
+       LIST_ENTRY(cfg_spool)   _next;          /* chain of spool instances */
+       struct in_addr          addr;
+       u_short                 port;
+};
+#endif
+
+/* Redirect modes id. */
+#define REDIR_ADDR      0x01
+#define REDIR_PORT      0x02
+#define REDIR_PROTO     0x04
+
+#ifdef IPFW_INTERNAL
+/* Nat redirect configuration. */
+struct cfg_redir {
+       LIST_ENTRY(cfg_redir)   _next;          /* chain of redir instances */
+       u_int16_t               mode;           /* type of redirect mode */
+       struct in_addr          laddr;          /* local ip address */
+       struct in_addr          paddr;          /* public ip address */
+       struct in_addr          raddr;          /* remote ip address */
+       u_short                 lport;          /* local port */
+       u_short                 pport;          /* public port */
+       u_short                 rport;          /* remote port  */
+       u_short                 pport_cnt;      /* number of public ports */
+       u_short                 rport_cnt;      /* number of remote ports */
+       int                     proto;          /* protocol: tcp/udp */
+       struct alias_link       **alink;        
+       /* num of entry in spool chain */
+       u_int16_t               spool_cnt;      
+       /* chain of spool instances */
+       LIST_HEAD(spool_chain, cfg_spool) spool_chain;
+};
+#endif
+
+#define NAT_BUF_LEN     1024
+
+#ifdef IPFW_INTERNAL
+/* Nat configuration data struct. */
+struct cfg_nat {
+       /* chain of nat instances */
+       LIST_ENTRY(cfg_nat)     _next;
+       int                     id;                     /* nat id */
+       struct in_addr          ip;                     /* nat ip address */
+       char                    if_name[IF_NAMESIZE];   /* interface name */
+       int                     mode;                   /* aliasing mode */
+       struct libalias         *lib;                   /* libalias instance */
+       /* number of entry in spool chain */
+       int                     redir_cnt;              
+       /* chain of redir instances */
+       LIST_HEAD(redir_chain, cfg_redir) redir_chain;  
+};
+#endif
+
+#define SOF_NAT         sizeof(struct cfg_nat)
+#define SOF_REDIR       sizeof(struct cfg_redir)
+#define SOF_SPOOL       sizeof(struct cfg_spool)
+
+/* Nat command. */
+typedef struct _ipfw_insn_nat {
+       ipfw_insn       o;
+       struct cfg_nat *nat;    
+} ipfw_insn_nat;
+
+/* Apply ipv6 mask on ipv6 addr */
+#define APPLY_MASK(addr,mask)                          \
+    (addr)->__u6_addr.__u6_addr32[0] &= (mask)->__u6_addr.__u6_addr32[0]; \
+    (addr)->__u6_addr.__u6_addr32[1] &= (mask)->__u6_addr.__u6_addr32[1]; \
+    (addr)->__u6_addr.__u6_addr32[2] &= (mask)->__u6_addr.__u6_addr32[2]; \
+    (addr)->__u6_addr.__u6_addr32[3] &= (mask)->__u6_addr.__u6_addr32[3];
+
+/* Structure for ipv6 */
+typedef struct _ipfw_insn_ip6 {
+       ipfw_insn o;
+       struct in6_addr addr6;
+       struct in6_addr mask6;
+} ipfw_insn_ip6;
+
+/* Used to support icmp6 types */
+typedef struct _ipfw_insn_icmp6 {
+       ipfw_insn o;
+       uint32_t d[7]; /* XXX This number is related to the netinet/icmp6.h
+                       *     define ICMP6_MAXTYPE
+                       *     as follows: n = ICMP6_MAXTYPE/32 + 1
+                        *     Actually is 203 
+                       */
+} ipfw_insn_icmp6;
+
+/*
+ * Here we have the structure representing an ipfw rule.
+ *
+ * It starts with a general area (with link fields and counters)
+ * followed by an array of one or more instructions, which the code
+ * accesses as an array of 32-bit values.
+ *
+ * Given a rule pointer  r:
+ *
+ *  r->cmd             is the start of the first instruction.
+ *  ACTION_PTR(r)      is the start of the first action (things to do
+ *                     once a rule matched).
+ *
+ * When assembling instruction, remember the following:
+ *
+ *  + if a rule has a "keep-state" (or "limit") option, then the
+ *     first instruction (at r->cmd) MUST BE an O_PROBE_STATE
+ *  + if a rule has a "log" option, then the first action
+ *     (at ACTION_PTR(r)) MUST be O_LOG
+ *  + if a rule has an "altq" option, it comes after "log"
+ *  + if a rule has an O_TAG option, it comes after "log" and "altq"
+ *
+ * NOTE: we use a simple linked list of rules because we never need
+ *     to delete a rule without scanning the list. We do not use
+ *     queue(3) macros for portability and readability.
+ */
+
+struct ip_fw {
+       struct ip_fw    *x_next;        /* linked list of rules         */
+       struct ip_fw    *next_rule;     /* ptr to next [skipto] rule    */
+       /* 'next_rule' is used to pass up 'set_disable' status          */
+
+       uint16_t        act_ofs;        /* offset of action in 32-bit units */
+       uint16_t        cmd_len;        /* # of 32-bit words in cmd     */
+       uint16_t        rulenum;        /* rule number                  */
+       uint8_t set;            /* rule set (0..31)             */
+#define        RESVD_SET       31      /* set for default and persistent rules */
+       uint8_t         _pad;           /* padding                      */
+       uint32_t        id;             /* rule id */
+
+       /* These fields are present in all rules.                       */
+       uint64_t        pcnt;           /* Packet counter               */
+       uint64_t        bcnt;           /* Byte counter                 */
+       uint32_t        timestamp;      /* tv_sec of last match         */
+
+       ipfw_insn       cmd[1];         /* storage for commands         */
+};
+
+#define ACTION_PTR(rule)                               \
+       (ipfw_insn *)( (u_int32_t *)((rule)->cmd) + ((rule)->act_ofs) )
+
+#define RULESIZE(rule)  (sizeof(struct ip_fw) + \
+       ((struct ip_fw *)(rule))->cmd_len * 4 - 4)
+
+/*
+ * This structure is used as a flow mask and a flow id for various
+ * parts of the code.
+ */
+struct ipfw_flow_id {
+       u_int32_t       dst_ip;
+       u_int32_t       src_ip;
+       u_int16_t       dst_port;
+       u_int16_t       src_port;
+       u_int8_t        fib;
+       u_int8_t        proto;
+       u_int8_t        flags;  /* protocol-specific flags */
+       uint8_t         addr_type; /* 4 = ipv4, 6 = ipv6, 1=ether ? */
+       struct in6_addr dst_ip6;        /* could also store MAC addr! */
+       struct in6_addr src_ip6;
+       u_int32_t       flow_id6;
+       u_int32_t       frag_id6;
+};
+
+#define IS_IP6_FLOW_ID(id)     ((id)->addr_type == 6)
+
+/*
+ * Dynamic ipfw rule.
+ */
+typedef struct _ipfw_dyn_rule ipfw_dyn_rule;
+
+struct _ipfw_dyn_rule {
+       ipfw_dyn_rule   *next;          /* linked list of rules.        */
+       struct ip_fw *rule;             /* pointer to rule              */
+       /* 'rule' is used to pass up the rule number (from the parent)  */
+
+       ipfw_dyn_rule *parent;          /* pointer to parent rule       */
+       u_int64_t       pcnt;           /* packet match counter         */
+       u_int64_t       bcnt;           /* byte match counter           */
+       struct ipfw_flow_id id;         /* (masked) flow id             */
+       u_int32_t       expire;         /* expire time                  */
+       u_int32_t       bucket;         /* which bucket in hash table   */
+       u_int32_t       state;          /* state of this rule (typically a
+                                        * combination of TCP flags)
+                                        */
+       u_int32_t       ack_fwd;        /* most recent ACKs in forward  */
+       u_int32_t       ack_rev;        /* and reverse directions (used */
+                                       /* to generate keepalives)      */
+       u_int16_t       dyn_type;       /* rule type                    */
+       u_int16_t       count;          /* refcount                     */
+};
+
+/*
+ * Definitions for IP option names.
+ */
+#define        IP_FW_IPOPT_LSRR        0x01
+#define        IP_FW_IPOPT_SSRR        0x02
+#define        IP_FW_IPOPT_RR          0x04
+#define        IP_FW_IPOPT_TS          0x08
+
+/*
+ * Definitions for TCP option names.
+ */
+#define        IP_FW_TCPOPT_MSS        0x01
+#define        IP_FW_TCPOPT_WINDOW     0x02
+#define        IP_FW_TCPOPT_SACK       0x04
+#define        IP_FW_TCPOPT_TS         0x08
+#define        IP_FW_TCPOPT_CC         0x10
+
+#define        ICMP_REJECT_RST         0x100   /* fake ICMP code (send a TCP RST) */
+#define        ICMP6_UNREACH_RST       0x100   /* fake ICMPv6 code (send a TCP RST) */
+
+/*
+ * These are used for lookup tables.
+ */
+typedef struct _ipfw_table_entry {
+       in_addr_t       addr;           /* network address              */
+       u_int32_t       value;          /* value                        */
+       u_int16_t       tbl;            /* table number                 */
+       u_int8_t        masklen;        /* mask length                  */
+} ipfw_table_entry;
+
+typedef struct _ipfw_table {
+       u_int32_t       size;           /* size of entries in bytes     */
+       u_int32_t       cnt;            /* # of entries                 */
+       u_int16_t       tbl;            /* table number                 */
+       ipfw_table_entry ent[0];        /* entries                      */
+} ipfw_table;
+
+#endif /* _IPFW2_H */
diff --git a/dummynet2/include/netinet/ipfw/ip_fw_private.h b/dummynet2/include/netinet/ipfw/ip_fw_private.h
new file mode 100644 (file)
index 0000000..41ae845
--- /dev/null
@@ -0,0 +1,329 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/netinet/ipfw/ip_fw_private.h 200601 2009-12-16 10:48:40Z luigi $
+ */
+
+#ifndef _IPFW2_PRIVATE_H
+#define _IPFW2_PRIVATE_H
+
+/*
+ * Internal constants and data structures used by ipfw components
+ * and not meant to be exported outside the kernel.
+ */
+
+#ifdef _KERNEL
+
+#define MTAG_IPFW      1148380143      /* IPFW-tagged cookie */
+#define MTAG_IPFW_RULE 1262273568      /* rule reference */
+
+/* Return values from ipfw_chk() */
+enum {
+       IP_FW_PASS = 0,
+       IP_FW_DENY,
+       IP_FW_DIVERT,
+       IP_FW_TEE,
+       IP_FW_DUMMYNET,
+       IP_FW_NETGRAPH,
+       IP_FW_NGTEE,
+       IP_FW_NAT,
+       IP_FW_REASS,
+};
+
+/*
+ * Structure for collecting parameters to dummynet for ip6_output forwarding
+ */
+struct _ip6dn_args {
+       struct ip6_pktopts *opt_or;
+       struct route_in6 ro_or;
+       int flags_or;
+       struct ip6_moptions *im6o_or;
+       struct ifnet *origifp_or;
+       struct ifnet *ifp_or;
+       struct sockaddr_in6 dst_or;
+       u_long mtu_or;
+       struct route_in6 ro_pmtu_or;
+};
+
+/*
+ * Reference to an ipfw rule that can be carried outside critical sections.
+ * A rule is identified by rulenum:rule_id which is ordered.
+ * In version chain_id the rule can be found in slot 'slot', so
+ * we don't need a lookup if chain_id == chain->id.
+ *
+ * On exit from the firewall this structure refers to the rule after
+ * the matching one (slot points to the new rule; rulenum:rule_id-1
+ * is the matching rule), and additional info (e.g. info often contains
+ * the insn argument or tablearg in the low 16 bits, in host format).
+ * On entry, the structure is valid if slot>0, and refers to the starting
+ * rule. 'info' contains the reason for reinject, e.g. divert port,
+ * divert direction, and so on.
+ */
+struct ipfw_rule_ref {
+       uint32_t        slot;           /* slot for matching rule       */
+       uint32_t        rulenum;        /* matching rule number         */
+       uint32_t        rule_id;        /* matching rule id             */
+       uint32_t        chain_id;       /* ruleset id                   */
+       uint32_t        info;           /* see below                    */
+};
+
+enum {
+       IPFW_INFO_MASK  = 0x0000ffff,
+       IPFW_INFO_OUT   = 0x00000000,   /* outgoing, just for convenience */
+       IPFW_INFO_IN    = 0x80000000,   /* incoming, overloads dir */
+       IPFW_ONEPASS    = 0x40000000,   /* One-pass, do not reinject */
+       IPFW_IS_MASK    = 0x30000000,   /* which source ? */
+       IPFW_IS_DIVERT  = 0x20000000,
+       IPFW_IS_DUMMYNET =0x10000000,
+       IPFW_IS_PIPE    = 0x08000000,   /* pipe = 1, queue = 0 */
+};
+
+/*
+ * Arguments for calling ipfw_chk() and dummynet_io(). We put them
+ * all into a structure because this way it is easier and more
+ * efficient to pass variables around and extend the interface.
+ */
+struct ip_fw_args {
+       struct mbuf     *m;             /* the mbuf chain               */
+       struct ifnet    *oif;           /* output interface             */
+       struct sockaddr_in *next_hop;   /* forward address              */
+
+       /*
+        * On return, it points to the matching rule.
+        * On entry, rule.slot > 0 means the info is valid and
+        * contains the starting rule for an ipfw search.
+        * If chain_id == chain->id && slot >0 then jump to that slot.
+        * Otherwise, we locate the first rule >= rulenum:rule_id
+        */
+       struct ipfw_rule_ref rule;      /* match/restart info           */
+
+       struct ether_header *eh;        /* for bridged packets          */
+
+       struct ipfw_flow_id f_id;       /* grabbed from IP header       */
+       //uint32_t      cookie;         /* a cookie depending on rule action */
+       struct inpcb    *inp;
+
+       struct _ip6dn_args      dummypar; /* dummynet->ip6_output */
+       struct sockaddr_in hopstore;    /* store here if cannot use a pointer */
+};
+
+MALLOC_DECLARE(M_IPFW);
+
+/*
+ * Hooks sometime need to know the direction of the packet
+ * (divert, dummynet, netgraph, ...)
+ * We use a generic definition here, with bit0-1 indicating the
+ * direction, bit 2 indicating layer2 or 3, and bits 3-4
+ * indicating the specific protocol (where one needs to be
+ * identified).
+ */
+enum {
+       DIR_MASK =      0x3,
+       DIR_OUT =       0,
+       DIR_IN =        1,
+       DIR_FWD =       2,
+       DIR_DROP =      3,
+       PROTO_LAYER2 =  0x4, /* set for layer 2 */
+       /* PROTO_DEFAULT = 0, */
+       PROTO_IPV4 =    0x08,
+       PROTO_IPV6 =    0x10,
+       PROTO_IFB =     0x0c, /* layer2 + ifbridge */
+    /*  PROTO_OLDBDG =  0x14, unused, old bridge */
+};
+
+/* wrapper for freeing a packet, in case we need to do more work */
+#ifdef __linux__
+#define FREE_PKT(m)    netisr_dispatch(-1, m)
+#else
+#define FREE_PKT(m)    m_freem(m)
+#endif
+
+/*
+ * Function definitions.
+ */
+
+/* attach (arg = 1) or detach (arg = 0) hooks */
+int ipfw_attach_hooks(int);
+#ifdef NOTYET
+void ipfw_nat_destroy(void);
+#endif
+
+/* In ip_fw_log.c */
+struct ip;
+void ipfw_log_bpf(int);
+void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
+       struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
+       struct ip *ip);
+VNET_DECLARE(u_int64_t, norule_counter);
+#define        V_norule_counter        VNET(norule_counter)
+VNET_DECLARE(int, verbose_limit);
+#define        V_verbose_limit         VNET(verbose_limit)
+
+/* In ip_fw_dynamic.c */
+
+enum { /* result for matching dynamic rules */
+       MATCH_REVERSE = 0,
+       MATCH_FORWARD,
+       MATCH_NONE,
+       MATCH_UNKNOWN,
+};
+
+/*
+ * The lock for dynamic rules is only used once outside the file,
+ * and only to release the result of lookup_dyn_rule().
+ * Eventually we may implement it with a callback on the function.
+ */
+void ipfw_dyn_unlock(void);
+
+struct tcphdr;
+struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *,
+    u_int32_t, u_int32_t, int);
+int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
+    struct ip_fw_args *args, uint32_t tablearg);
+ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt,
+       int *match_direction, struct tcphdr *tcp);
+void ipfw_remove_dyn_children(struct ip_fw *rule);
+void ipfw_get_dynamic(char **bp, const char *ep);
+
+void ipfw_dyn_attach(void);    /* uma_zcreate .... */
+void ipfw_dyn_detach(void);    /* uma_zdestroy ... */
+void ipfw_dyn_init(void);      /* per-vnet initialization */
+void ipfw_dyn_uninit(int);     /* per-vnet deinitialization */
+int ipfw_dyn_len(void);
+
+/* common variables */
+VNET_DECLARE(int, fw_one_pass);
+#define        V_fw_one_pass           VNET(fw_one_pass)
+
+VNET_DECLARE(int, fw_verbose);
+#define        V_fw_verbose            VNET(fw_verbose)
+
+VNET_DECLARE(struct ip_fw_chain, layer3_chain);
+#define        V_layer3_chain          VNET(layer3_chain)
+
+VNET_DECLARE(u_int32_t, set_disable);
+#define        V_set_disable           VNET(set_disable)
+
+VNET_DECLARE(int, autoinc_step);
+#define V_autoinc_step         VNET(autoinc_step)
+
+struct ip_fw_chain {
+       struct ip_fw    *rules;         /* list of rules */
+       struct ip_fw    *reap;          /* list of rules to reap */
+       struct ip_fw    *default_rule;
+       int             n_rules;        /* number of static rules */
+       int             static_len;     /* total len of static rules */
+       struct ip_fw    **map;          /* array of rule ptrs to ease lookup */
+       LIST_HEAD(nat_list, cfg_nat) nat;       /* list of nat entries */
+       struct radix_node_head *tables[IPFW_TABLES_MAX];
+#if defined( __linux__ ) || defined( _WIN32 )
+        spinlock_t rwmtx;
+        spinlock_t uh_lock;
+#else
+       struct rwlock   rwmtx;
+       struct rwlock   uh_lock;        /* lock for upper half */
+#endif
+       uint32_t        id;             /* ruleset id */
+};
+
+struct sockopt;        /* used by tcp_var.h */
+
+/*
+ * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c
+ * so the variable and the macros must be here.
+ */
+
+#define        IPFW_LOCK_INIT(_chain) do {                     \
+       rw_init(&(_chain)->rwmtx, "IPFW static rules"); \
+       rw_init(&(_chain)->uh_lock, "IPFW UH lock");    \
+       } while (0)
+
+#define        IPFW_LOCK_DESTROY(_chain) do {                  \
+       rw_destroy(&(_chain)->rwmtx);                   \
+       rw_destroy(&(_chain)->uh_lock);                 \
+       } while (0)
+
+#define        IPFW_WLOCK_ASSERT(_chain)       rw_assert(&(_chain)->rwmtx, RA_WLOCKED)
+
+#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx)
+#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx)
+#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx)
+#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx)
+
+#define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock)
+#define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock)
+#define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock)
+#define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock)
+
+/* In ip_fw_sockopt.c */
+int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id);
+int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule);
+int ipfw_ctl(struct sockopt *sopt);
+int ipfw_chk(struct ip_fw_args *args);
+void ipfw_reap_rules(struct ip_fw *head);
+
+/* In ip_fw_table.c */
+struct radix_node;
+int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint32_t *val);
+int ipfw_init_tables(struct ip_fw_chain *ch);
+int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl);
+void ipfw_flush_tables(struct ip_fw_chain *ch);
+int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen, uint32_t value);
+int ipfw_dump_table_entry(struct radix_node *rn, void *arg);
+int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen);
+int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt);
+int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl);
+
+/* hooks for divert */
+extern void (*ip_divert_ptr)(struct mbuf *m, int incoming);
+
+/* In ip_fw_nat.c */
+
+extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
+
+typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *);
+typedef int ipfw_nat_cfg_t(struct sockopt *);
+
+extern ipfw_nat_t *ipfw_nat_ptr;
+#define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL)
+
+extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_del_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
+
+/* netgraph prototypes */
+
+typedef int ng_ipfw_input_t(struct mbuf **, int, struct ip_fw_args *, int);
+extern  ng_ipfw_input_t *ng_ipfw_input_p;
+#define NG_IPFW_LOADED  (ng_ipfw_input_p != NULL)
+
+#define TAGSIZ  (sizeof(struct ng_ipfw_tag) - sizeof(struct m_tag))
+
+
+#endif /* _KERNEL */
+#endif /* _IPFW2_PRIVATE_H */
diff --git a/dummynet2/ip_dummynet.c b/dummynet2/ip_dummynet.c
new file mode 100644 (file)
index 0000000..bb34c04
--- /dev/null
@@ -0,0 +1,2370 @@
+/*-
+ * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
+ * Portions Copyright (c) 2000 Akamba Corp.
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_dummynet.c 200601 2009-12-16 10:48:40Z luigi $");
+
+#define        DUMMYNET_DEBUG
+
+#include "opt_inet6.h"
+
+/*
+ * This module implements IP dummynet, a bandwidth limiter/delay emulator
+ * used in conjunction with the ipfw package.
+ * Description of the data structures used is in ip_dummynet.h
+ * Here you mainly find the following blocks of code:
+ *  + variable declarations;
+ *  + heap management functions;
+ *  + scheduler and dummynet functions;
+ *  + configuration and initialization.
+ *
+ * NOTA BENE: critical sections are protected by the "dummynet lock".
+ *
+ * Most important Changes:
+ *
+ * 011004: KLDable
+ * 010124: Fixed WF2Q behaviour
+ * 010122: Fixed spl protection.
+ * 000601: WF2Q support
+ * 000106: large rewrite, use heaps to handle very many pipes.
+ * 980513:     initial release
+ *
+ * include files marked with XXX are probably not needed
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/time.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <net/if.h>    /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <net/netisr.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>                /* ip_len, ip_off */
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/ip_var.h>    /* ip_output(), IP_FORWARDING */
+
+#include <netinet/if_ether.h> /* various ether_* routines */
+
+#include <netinet/ip6.h>       /* for ip6_input, ip6_output prototypes */
+#include <netinet6/ip6_var.h>
+
+/*
+ * We keep a private variable for the simulation time, but we could
+ * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
+ */
+static dn_key curr_time = 0 ; /* current simulation time */
+
+static int dn_hash_size = 64 ; /* default hash size */
+
+/* statistics on number of queue searches and search steps */
+static long searches, search_steps ;
+static int pipe_expire = 1 ;   /* expire queue if empty */
+static int dn_max_ratio = 16 ; /* max queues/buckets ratio */
+
+static long pipe_slot_limit = 100; /* Foot shooting limit for pipe queues. */
+static long pipe_byte_limit = 1024 * 1024;
+
+static int red_lookup_depth = 256;     /* RED - default lookup table depth */
+static int red_avg_pkt_size = 512;      /* RED - default medium packet size */
+static int red_max_pkt_size = 1500;     /* RED - default max packet size */
+
+static struct timeval prev_t, t;
+static long tick_last;                 /* Last tick duration (usec). */
+static long tick_delta;                        /* Last vs standard tick diff (usec). */
+static long tick_delta_sum;            /* Accumulated tick difference (usec).*/
+static long tick_adjustment;           /* Tick adjustments done. */
+static long tick_lost;                 /* Lost(coalesced) ticks number. */
+/* Adjusted vs non-adjusted curr_time difference (ticks). */
+static long tick_diff;
+
+static int             io_fast;
+static unsigned long   io_pkt;
+static unsigned long   io_pkt_fast;
+static unsigned long   io_pkt_drop;
+
+/*
+ * Three heaps contain queues and pipes that the scheduler handles:
+ *
+ * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
+ *
+ * wfq_ready_heap contains the pipes associated with WF2Q flows
+ *
+ * extract_heap contains pipes associated with delay lines.
+ *
+ */
+
+MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
+
+static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
+
+static int     heap_init(struct dn_heap *h, int size);
+static int     heap_insert (struct dn_heap *h, dn_key key1, void *p);
+static void    heap_extract(struct dn_heap *h, void *obj);
+static void    transmit_event(struct dn_pipe *pipe, struct mbuf **head,
+                   struct mbuf **tail);
+static void    ready_event(struct dn_flow_queue *q, struct mbuf **head,
+                   struct mbuf **tail);
+static void    ready_event_wfq(struct dn_pipe *p, struct mbuf **head,
+                   struct mbuf **tail);
+
+#define        HASHSIZE        16
+#define        HASH(num)       ((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f)
+static struct dn_pipe_head     pipehash[HASHSIZE];     /* all pipes */
+static struct dn_flow_set_head flowsethash[HASHSIZE];  /* all flowsets */
+
+static struct callout dn_timeout;
+
+extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
+
+#ifdef SYSCTL_NODE
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
+    CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
+#if 0  /* curr_time is 64 bit */
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, curr_time,
+    CTLFLAG_RD, &curr_time, 0, "Current tick");
+#endif
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
+    CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
+    CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, searches,
+    CTLFLAG_RD, &searches, 0, "Number of queue searches");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, search_steps,
+    CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
+    CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
+    CTLFLAG_RW, &dn_max_ratio, 0,
+    "Max ratio between dynamic queues and buckets");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
+    CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
+    CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
+    CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
+    CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
+    CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
+    CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
+    CTLFLAG_RD, &tick_diff, 0,
+    "Adjusted vs non-adjusted curr_time difference (ticks).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
+    CTLFLAG_RD, &tick_lost, 0,
+    "Number of ticks coalesced by dummynet taskqueue.");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
+    CTLFLAG_RW, &io_fast, 0, "Enable fast dummynet io.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
+    CTLFLAG_RD, &io_pkt, 0,
+    "Number of packets passed to dummynet.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
+    CTLFLAG_RD, &io_pkt_fast, 0,
+    "Number of packets bypassed dummynet scheduler.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
+    CTLFLAG_RD, &io_pkt_drop, 0,
+    "Number of packets dropped by dummynet.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
+    CTLFLAG_RW, &pipe_slot_limit, 0, "Upper limit in slots for pipe queue.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
+    CTLFLAG_RW, &pipe_byte_limit, 0, "Upper limit in bytes for pipe queue.");
+#endif
+
+#ifdef DUMMYNET_DEBUG
+int    dummynet_debug = 0;
+#ifdef SYSCTL_NODE
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
+           0, "control debugging printfs");
+#endif
+#define        DPRINTF(X)      if (dummynet_debug) printf X
+#else
+#define        DPRINTF(X)
+#endif
+
+static struct task     dn_task;
+static struct taskqueue        *dn_tq = NULL;
+static void dummynet_task(void *, int);
+
+#if defined( __linux__ ) || defined( _WIN32 )
+static DEFINE_SPINLOCK(dummynet_mtx);
+#else
+static struct mtx dummynet_mtx;
+#endif
+#define        DUMMYNET_LOCK_INIT() \
+       mtx_init(&dummynet_mtx, "dummynet", NULL, MTX_DEF)
+#define        DUMMYNET_LOCK_DESTROY() mtx_destroy(&dummynet_mtx)
+#define        DUMMYNET_LOCK()         mtx_lock(&dummynet_mtx)
+#define        DUMMYNET_UNLOCK()       mtx_unlock(&dummynet_mtx)
+#define        DUMMYNET_LOCK_ASSERT()  mtx_assert(&dummynet_mtx, MA_OWNED)
+
+static int     config_pipe(struct dn_pipe *p);
+static int     ip_dn_ctl(struct sockopt *sopt);
+
+static void    dummynet(void *);
+static void    dummynet_flush(void);
+static void    dummynet_send(struct mbuf *);
+void           dummynet_drain(void);
+static int     dummynet_io(struct mbuf **, int , struct ip_fw_args *);
+
+/*
+ * Flow queue is idle if:
+ *   1) it's empty for at least 1 tick
+ *   2) it has invalid timestamp (WF2Q case)
+ *   3) parent pipe has no 'exhausted' burst.
+ */
+#define QUEUE_IS_IDLE(q) ((q)->head == NULL && (q)->S == (q)->F + 1 && \
+       curr_time > (q)->idle_time + 1 && \
+       ((q)->numbytes + (curr_time - (q)->idle_time - 1) * \
+       (q)->fs->pipe->bandwidth >= (q)->fs->pipe->burst))
+
+/*
+ * Heap management functions.
+ *
+ * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
+ * Some macros help finding parent/children so we can optimize them.
+ *
+ * heap_init() is called to expand the heap when needed.
+ * Increment size in blocks of 16 entries.
+ * XXX failure to allocate a new element is a pretty bad failure
+ * as we basically stall a whole queue forever!!
+ * Returns 1 on error, 0 on success
+ */
+#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
+#define HEAP_LEFT(x) ( 2*(x) + 1 )
+#define HEAP_IS_LEFT(x) ( (x) & 1 )
+#define HEAP_RIGHT(x) ( 2*(x) + 2 )
+#define        HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
+#define HEAP_INCREMENT 15
+
+static int
+heap_init(struct dn_heap *h, int new_size)
+{
+    struct dn_heap_entry *p;
+
+    if (h->size >= new_size ) {
+       printf("dummynet: %s, Bogus call, have %d want %d\n", __func__,
+               h->size, new_size);
+       return 0 ;
+    }
+    new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
+    p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_NOWAIT);
+    if (p == NULL) {
+       printf("dummynet: %s, resize %d failed\n", __func__, new_size );
+       return 1 ; /* error */
+    }
+    if (h->size > 0) {
+       bcopy(h->p, p, h->size * sizeof(*p) );
+       free(h->p, M_DUMMYNET);
+    }
+    h->p = p ;
+    h->size = new_size ;
+    return 0 ;
+}
+
+/*
+ * Insert element in heap. Normally, p != NULL, we insert p in
+ * a new position and bubble up. If p == NULL, then the element is
+ * already in place, and key is the position where to start the
+ * bubble-up.
+ * Returns 1 on failure (cannot allocate new heap entry)
+ *
+ * If offset > 0 the position (index, int) of the element in the heap is
+ * also stored in the element itself at the given offset in bytes.
+ *
+ * NOTE: the two macros below expand to an unbraced 'if'; use them only
+ * as stand-alone statements, never as the body of an if/else.
+ */
+#define SET_OFFSET(heap, node) \
+    if (heap->offset > 0) \
+           *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
+/*
+ * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
+ */
+#define RESET_OFFSET(heap, node) \
+    if (heap->offset > 0) \
+           *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
+static int
+heap_insert(struct dn_heap *h, dn_key key1, void *p)
+{
+    int son = h->elements ;
+
+    if (p == NULL)     /* data already there, set starting point */
+       son = key1 ;
+    else {             /* insert new element at the end, possibly resize */
+       son = h->elements ;
+       if (son == h->size) /* need resize... */
+           if (heap_init(h, h->elements+1) )
+               return 1 ; /* failure... */
+       h->p[son].object = p ;
+       h->p[son].key = key1 ;
+       h->elements++ ;
+    }
+    /* Restore the heap invariant along the path up to the root. */
+    while (son > 0) {                          /* bubble up */
+       int father = HEAP_FATHER(son) ;
+       struct dn_heap_entry tmp  ;
+
+       if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
+           break ; /* found right position */
+       /* son smaller than father, swap and repeat */
+       HEAP_SWAP(h->p[son], h->p[father], tmp) ;
+       SET_OFFSET(h, son);
+       son = father ;
+    }
+    /* Record the final resting index inside the object, if requested. */
+    SET_OFFSET(h, son);
+    return 0 ;
+}
+
+/*
+ * remove top element from heap, or obj if obj != NULL
+ */
+static void
+heap_extract(struct dn_heap *h, void *obj)
+{
+    int child, father, max = h->elements - 1 ;
+
+    if (max < 0) {
+       printf("dummynet: warning, extract from empty heap 0x%p\n", h);
+       return ;
+    }
+    father = 0 ; /* default: move up smallest child */
+    if (obj != NULL) { /* extract specific element, index is at offset */
+       if (h->offset <= 0)
+           panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
+       father = *((int *)((char *)obj + h->offset)) ;
+       if (father < 0 || father >= h->elements) {
+           printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
+               father, h->elements);
+           panic("dummynet: heap_extract");
+       }
+    }
+    RESET_OFFSET(h, father);
+    /*
+     * Sift the smallest child up into the hole, walking down until
+     * the hole reaches a leaf; 'father' ends up as the hole index.
+     */
+    child = HEAP_LEFT(father) ;                /* left child */
+    while (child <= max) {             /* valid entry */
+       if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
+           child = child+1 ;           /* take right child, otherwise left */
+       h->p[father] = h->p[child] ;
+       SET_OFFSET(h, father);
+       father = child ;
+       child = HEAP_LEFT(child) ;   /* left child for next loop */
+    }
+    h->elements-- ;
+    if (father != max) {
+       /*
+        * Fill hole with last entry and bubble up, reusing the insert code
+        */
+       h->p[father] = h->p[max] ;
+       heap_insert(h, father, NULL); /* this one cannot fail */
+    }
+}
+
+#if 0
+/*
+ * change object position and update references
+ * XXX this one is never used!
+ * (kept compiled out under #if 0 for reference only)
+ */
+static void
+heap_move(struct dn_heap *h, dn_key new_key, void *object)
+{
+    int temp;
+    int i ;
+    int max = h->elements-1 ;
+    struct dn_heap_entry buf ;
+
+    if (h->offset <= 0)
+       panic("cannot move items on this heap");
+
+    /* Current index of the object is stored inside the object itself. */
+    i = *((int *)((char *)object + h->offset));
+    if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
+       h->p[i].key = new_key ;
+       for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
+                i = temp ) { /* bubble up */
+           HEAP_SWAP(h->p[i], h->p[temp], buf) ;
+           SET_OFFSET(h, i);
+       }
+    } else {           /* must move down */
+       h->p[i].key = new_key ;
+       while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
+           if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
+               temp++ ; /* select child with min key */
+           if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
+               HEAP_SWAP(h->p[i], h->p[temp], buf) ;
+               SET_OFFSET(h, i);
+           } else
+               break ;
+           i = temp ;
+       }
+    }
+    SET_OFFSET(h, i);
+}
+#endif /* heap_move, unused */
+
+/*
+ * Rebuild the heap property over the whole backing array by
+ * bubbling every element up in turn. Needed after deleting a
+ * bunch of entries in place.
+ */
+static void
+heapify(struct dn_heap *h)
+{
+    int pos;
+
+    for (pos = 0; pos < h->elements; pos++)
+       heap_insert(h, pos, NULL);
+}
+
+/*
+ * Release the heap's backing storage (if any) and wipe the
+ * descriptor so the heap is left in a pristine, empty state.
+ */
+static void
+heap_free(struct dn_heap *h)
+{
+    struct dn_heap_entry *storage = h->p;
+    int had_storage = (h->size > 0);
+
+    bzero(h, sizeof(*h));
+    if (had_storage)
+       free(storage, M_DUMMYNET);
+}
+
+/*
+ * --- end of heap management functions ---
+ */
+
+/*
+ * Dispose of a whole m_nextpkt-linked chain of packets.
+ * Kept as a single inline helper so that, should packets ever
+ * carry extra state to release, there is one central place to do it.
+ */
+static __inline void dn_free_pkts(struct mbuf *mnext)
+{
+       struct mbuf *cur;
+
+       for (cur = mnext; cur != NULL; cur = mnext) {
+               mnext = cur->m_nextpkt;
+               FREE_PKT(cur);
+       }
+}
+
+/*
+ * Return the mbuf tag holding the dummynet state.  As an optimization
+ * this is assumed to be the first tag on the list.  If this turns out
+ * wrong we'll need to search the list.
+ */
+static struct dn_pkt_tag *
+dn_tag_get(struct mbuf *m)
+{
+    struct m_tag *mtag = m_tag_first(m);
+    /* Every packet sitting on a dummynet queue must carry the tag. */
+    KASSERT(mtag != NULL &&
+           mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
+           mtag->m_tag_id == PACKET_TAG_DUMMYNET,
+           ("packet on dummynet queue w/o dummynet tag!"));
+    /* The dn_pkt_tag payload lives immediately after the m_tag header. */
+    return (struct dn_pkt_tag *)(mtag+1);
+}
+
+/*
+ * Scheduler functions:
+ *
+ * transmit_event() is called when the delay-line needs to enter
+ * the scheduler, either because of existing pkts getting ready,
+ * or new packets entering the queue. The event handled is the delivery
+ * time of the packet.
+ *
+ * ready_event() does something similar with fixed-rate queues, and the
+ * event handled is the finish time of the head pkt.
+ *
+ * wfq_ready_event() does something similar with WF2Q queues, and the
+ * event handled is the start time of the head pkt.
+ *
+ * In all cases, we make sure that the data structures are consistent
+ * before passing pkts out, because this might trigger recursive
+ * invocations of the procedures.
+ */
+static void
+transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail)
+{
+       struct mbuf *m;
+       struct dn_pkt_tag *pkt;
+
+       DUMMYNET_LOCK_ASSERT();
+
+       /* Move every packet whose delivery time has arrived onto the
+        * caller's head/tail list; stop at the first not-yet-due packet. */
+       while ((m = pipe->head) != NULL) {
+               pkt = dn_tag_get(m);
+               if (!DN_KEY_LEQ(pkt->output_time, curr_time))
+                       break;
+
+               pipe->head = m->m_nextpkt;
+               if (*tail != NULL)
+                       (*tail)->m_nextpkt = m;
+               else
+                       *head = m;
+               *tail = m;
+       }
+       if (*tail != NULL)
+               (*tail)->m_nextpkt = NULL;
+
+       /* If there are leftover packets, put into the heap for next event. */
+       if ((m = pipe->head) != NULL) {
+               pkt = dn_tag_get(m);
+               /*
+                * XXX Should check errors on heap_insert, by draining the
+                * whole pipe p and hoping in the future we are more successful.
+                */
+               heap_insert(&extract_heap, pkt->output_time, pipe);
+       }
+}
+
+#ifndef __linux__
+/* On non-Linux builds a plain signed 64-bit division is available. */
+#define div64(a, b)    ((int64_t)(a) / (int64_t)(b))
+#endif
+/*
+ * Compute how many ticks we have to wait before being able to send
+ * a packet. This is computed as the "wire time" for the packet
+ * (length + extra bits), minus the credit available, scaled to ticks.
+ * Check that the result is not negative (it could be if we have
+ * too much leftover credit in q->numbytes).
+ */
+static inline dn_key
+set_ticks(struct mbuf *m, struct dn_flow_queue *q, struct dn_pipe *p)
+{
+       int64_t ret;
+
+       /* Adding (bandwidth - 1) makes the division round up. */
+       ret = div64( (m->m_pkthdr.len * 8 + q->extra_bits) * hz
+               - q->numbytes + p->bandwidth - 1 , p->bandwidth);
+       if (ret < 0)
+               ret = 0;
+       return ret;
+}
+
+/*
+ * Convert the additional MAC overheads/delays into an equivalent
+ * number of bits for the given data rate. The samples are in milliseconds
+ * so we need to divide by 1000.
+ */
+static dn_key
+compute_extra_bits(struct mbuf *pkt, struct dn_pipe *p)
+{
+       int index;
+       dn_key extra_bits;
+
+       /* No sample table configured: no extra overhead to account for. */
+       if (!p->samples || p->samples_no == 0)
+               return 0;
+       /* Draw one sample at random from the empirical distribution. */
+       index  = random() % p->samples_no;
+       extra_bits = div64((dn_key)p->samples[index] * p->bandwidth, 1000);
+       /* Samples at or beyond loss_level model frame loss: mark for drop. */
+       if (index >= p->loss_level) {
+               struct dn_pkt_tag *dt = dn_tag_get(pkt);
+               if (dt)
+                       dt->dn_dir = DIR_DROP;
+       }
+       return extra_bits;
+}
+
+/*
+ * Release a pipe descriptor together with its optional array of
+ * MAC-overhead delay samples.
+ */
+static void
+free_pipe(struct dn_pipe *p)
+{
+       if (p->samples != NULL)
+               free(p->samples, M_DUMMYNET);
+       free(p, M_DUMMYNET);
+}
+
+/*
+ * extract pkt from queue, compute output time (could be now)
+ * and put into delay line (p_queue)
+ */
+static void
+move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, struct dn_pipe *p,
+    int len)
+{
+    struct dn_pkt_tag *dt = dn_tag_get(pkt);
+
+    /* Unlink the packet from the flow queue and update counters. */
+    q->head = pkt->m_nextpkt ;
+    q->len-- ;
+    q->len_bytes -= len ;
+
+    /* Delivery time is now plus the pipe's propagation delay. */
+    dt->output_time = curr_time + p->delay ;
+
+    /* Append at the tail of the pipe's delay line (FIFO). */
+    if (p->head == NULL)
+       p->head = pkt;
+    else
+       p->tail->m_nextpkt = pkt;
+    p->tail = pkt;
+    p->tail->m_nextpkt = NULL;
+}
+
+/*
+ * ready_event() is invoked every time the queue must enter the
+ * scheduler, either because the first packet arrives, or because
+ * a previously scheduled event fired.
+ * On invocation, drain as many pkts as possible (could be 0) and then
+ * if there are leftover packets reinsert the pkt in the scheduler.
+ */
+static void
+ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail)
+{
+       struct mbuf *pkt;
+       struct dn_pipe *p = q->fs->pipe;
+       int p_was_empty;
+
+       DUMMYNET_LOCK_ASSERT();
+
+       if (p == NULL) {
+               printf("dummynet: ready_event- pipe is gone\n");
+               return;
+       }
+       /* Remember delay-line state: if it was empty we must kick
+        * transmit_event() ourselves at the end. */
+       p_was_empty = (p->head == NULL);
+
+       /*
+        * Schedule fixed-rate queues linked to this pipe:
+        * account for the bw accumulated since last scheduling, then
+        * drain as many pkts as allowed by q->numbytes and move to
+        * the delay line (in p) computing output time.
+        * bandwidth==0 (no limit) means we can drain the whole queue,
+        * setting len_scaled = 0 does the job.
+        */
+       q->numbytes += (curr_time - q->sched_time) * p->bandwidth;
+       while ((pkt = q->head) != NULL) {
+               int len = pkt->m_pkthdr.len;
+               dn_key len_scaled = p->bandwidth ? len*8*hz
+                       + q->extra_bits*hz
+                       : 0;
+
+               if (DN_KEY_GT(len_scaled, q->numbytes))
+                       break;
+               q->numbytes -= len_scaled;
+               move_pkt(pkt, q, p, len);
+               /* Pre-compute the MAC overhead for the new head packet. */
+               if (q->head)
+                       q->extra_bits = compute_extra_bits(q->head, p);
+       }
+       /*
+        * If we have more packets queued, schedule next ready event
+        * (can only occur when bandwidth != 0, otherwise we would have
+        * flushed the whole queue in the previous loop).
+        * To this purpose we record the current time and compute how many
+        * ticks to go for the finish time of the packet.
+        */
+       if ((pkt = q->head) != NULL) {  /* this implies bandwidth != 0 */
+               dn_key t = set_ticks(pkt, q, p); /* ticks i have to wait */
+
+               q->sched_time = curr_time;
+               heap_insert(&ready_heap, curr_time + t, (void *)q);
+               /*
+                * XXX Should check errors on heap_insert, and drain the whole
+                * queue on error hoping next time we are luckier.
+                */
+       } else          /* RED needs to know when the queue becomes empty. */
+               q->idle_time = curr_time;
+
+       /*
+        * If the delay line was empty call transmit_event() now.
+        * Otherwise, the scheduler will take care of it.
+        */
+       if (p_was_empty)
+               transmit_event(p, head, tail);
+}
+
+/*
+ * Called when we can transmit packets on WF2Q queues. Take pkts out of
+ * the queues at their start time, and enqueue into the delay line.
+ * Packets are drained until p->numbytes < 0. As long as
+ * len_scaled >= p->numbytes, the packet goes into the delay line
+ * with a deadline p->delay. For the last packet, if p->numbytes < 0,
+ * there is an additional delay.
+ */
+static void
+ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail)
+{
+       int p_was_empty = (p->head == NULL);
+       struct dn_heap *sch = &(p->scheduler_heap);
+       struct dn_heap *neh = &(p->not_eligible_heap);
+       int64_t p_numbytes = p->numbytes;
+
+       /*
+        * p->numbytes is only 32bits in FBSD7, but we might need 64 bits.
+        * Use a local variable for the computations, and write back the
+        * results when done, saturating if needed.
+        * The local variable has no impact on performance and helps
+        * reducing diffs between the various branches.
+        */
+
+       DUMMYNET_LOCK_ASSERT();
+
+       if (p->if_name[0] == 0)         /* tx clock is simulated */
+               p_numbytes += (curr_time - p->sched_time) * p->bandwidth;
+       else {  /*
+                * tx clock is for real,
+                * the ifq must be empty or this is a NOP.
+                */
+#ifdef __linux__
+               /* Linux build has no ifq to inspect: nothing to do. */
+               return;
+#else
+               if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
+                       return;
+               else {
+                       DPRINTF(("dummynet: pipe %d ready from %s --\n",
+                           p->pipe_nr, p->if_name));
+               }
+#endif
+       }
+
+       /*
+        * While we have backlogged traffic AND credit, we need to do
+        * something on the queue.
+        */
+       while (p_numbytes >= 0 && (sch->elements > 0 || neh->elements > 0)) {
+               if (sch->elements > 0) {
+                       /* Have some eligible pkts to send out. */
+                       struct dn_flow_queue *q = sch->p[0].object;
+                       struct mbuf *pkt = q->head;
+                       struct dn_flow_set *fs = q->fs;
+                       uint64_t len = pkt->m_pkthdr.len;
+                       int len_scaled = p->bandwidth ? len * 8 * hz : 0;
+
+                       heap_extract(sch, NULL); /* Remove queue from heap. */
+                       p_numbytes -= len_scaled;
+                       move_pkt(pkt, q, p, len);
+
+                       p->V += div64((len << MY_M), p->sum);   /* Update V. */
+                       q->S = q->F;                    /* Update start time. */
+                       if (q->len == 0) {
+                               /* Flow not backlogged any more. */
+                               fs->backlogged--;
+                               heap_insert(&(p->idle_heap), q->F, q);
+                       } else {
+                               /* Still backlogged. */
+
+                               /*
+                                * Update F and position in backlogged queue,
+                                * then put flow in not_eligible_heap
+                                * (we will fix this later).
+                                */
+                               len = (q->head)->m_pkthdr.len;
+                               q->F += div64((len << MY_M), fs->weight);
+                               if (DN_KEY_LEQ(q->S, p->V))
+                                       heap_insert(neh, q->S, q);
+                               else
+                                       heap_insert(sch, q->F, q);
+                       }
+               }
+               /*
+                * Now compute V = max(V, min(S_i)). Remember that all elements
+                * in sch have by definition S_i <= V so if sch is not empty,
+                * V is surely the max and we must not update it. Conversely,
+                * if sch is empty we only need to look at neh.
+                */
+               if (sch->elements == 0 && neh->elements > 0)
+                       p->V = MAX64(p->V, neh->p[0].key);
+               /* Move from neh to sch any packets that have become eligible */
+               while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V)) {
+                       struct dn_flow_queue *q = neh->p[0].object;
+                       heap_extract(neh, NULL);
+                       heap_insert(sch, q->F, q);
+               }
+
+               if (p->if_name[0] != '\0') { /* Tx clock is from a real thing */
+                       p_numbytes = -1;        /* Mark not ready for I/O. */
+                       break;
+               }
+       }
+       if (sch->elements == 0 && neh->elements == 0 && p_numbytes >= 0) {
+               p->idle_time = curr_time;
+               /*
+                * No traffic and no events scheduled.
+                * We can get rid of idle-heap.
+                */
+               if (p->idle_heap.elements > 0) {
+                       int i;
+
+                       /* Invalidate every idle flow's timestamps (S = F + 1
+                        * is the "invalid" marker) and reset the virtual
+                        * clock so the next busy period starts from scratch. */
+                       for (i = 0; i < p->idle_heap.elements; i++) {
+                               struct dn_flow_queue *q;
+
+                               q = p->idle_heap.p[i].object;
+                               q->F = 0;
+                               q->S = q->F + 1;
+                       }
+                       p->sum = 0;
+                       p->V = 0;
+                       p->idle_heap.elements = 0;
+               }
+       }
+       /*
+        * If we are getting clocks from dummynet (not a real interface) and
+        * If we are under credit, schedule the next ready event.
+        * Also fix the delivery time of the last packet.
+        */
+       if (p->if_name[0]==0 && p_numbytes < 0) { /* This implies bw > 0. */
+               dn_key t = 0;           /* Number of ticks i have to wait. */
+
+               if (p->bandwidth > 0)
+                       t = div64(p->bandwidth - 1 - p_numbytes, p->bandwidth);
+               dn_tag_get(p->tail)->output_time += t;
+               p->sched_time = curr_time;
+               heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
+               /*
+                * XXX Should check errors on heap_insert, and drain the whole
+                * queue on error hoping next time we are luckier.
+                */
+       }
+
+       /* Write back p_numbytes (adjust 64->32bit if necessary). */
+       p->numbytes = p_numbytes;
+
+       /*
+        * If the delay line was empty call transmit_event() now.
+        * Otherwise, the scheduler will take care of it.
+        */
+       if (p_was_empty)
+               transmit_event(p, head, tail);
+}
+
+/*
+ * This is called one tick, after previous run. It is used to
+ * schedule next run.
+ * Callout handler: defer the real work to the taskqueue so it runs
+ * outside callout context.
+ */
+static void
+dummynet(void * __unused unused)
+{
+
+       taskqueue_enqueue(dn_tq, &dn_task);
+}
+
+/*
+ * The main dummynet processing function.
+ * Runs once per tick from the taskqueue: advances the virtual clock,
+ * fires all due events from the three heaps, expires idle WF2Q flows
+ * and finally delivers the collected packets and re-arms the callout.
+ */
+static void
+dummynet_task(void *context, int pending)
+{
+       struct mbuf *head = NULL, *tail = NULL;
+       struct dn_pipe *pipe;
+       struct dn_heap *heaps[3];
+       struct dn_heap *h;
+       void *p;        /* generic parameter to handler */
+       int i;
+
+       DUMMYNET_LOCK();
+
+       heaps[0] = &ready_heap;                 /* fixed-rate queues */
+       heaps[1] = &wfq_ready_heap;             /* wfq queues */
+       heaps[2] = &extract_heap;               /* delay line */
+
+       /* Update number of lost(coalesced) ticks. */
+       tick_lost += pending - 1;
+       getmicrouptime(&t);
+       /* Last tick duration (usec). */
+       tick_last = (t.tv_sec - prev_t.tv_sec) * 1000000 +
+           (t.tv_usec - prev_t.tv_usec);
+       /* Last tick vs standard tick difference (usec). */
+       tick_delta = (tick_last * hz - 1000000) / hz;
+       /* Accumulated tick difference (usec). */
+       tick_delta_sum += tick_delta;
+       prev_t = t;
+       /*
+        * Adjust curr_time if accumulated tick difference greater than
+        * 'standard' tick. Since curr_time should be monotonically increasing,
+        * we do positive adjustment as required and throttle curr_time in
+        * case of negative adjustment.
+        */
+       curr_time++;
+       if (tick_delta_sum - tick >= 0) {
+               int diff = tick_delta_sum / tick;
+               curr_time += diff;
+               tick_diff += diff;
+               tick_delta_sum %= tick;
+               tick_adjustment++;
+       } else if (tick_delta_sum + tick <= 0) {
+               curr_time--;
+               tick_diff--;
+               tick_delta_sum += tick;
+               tick_adjustment++;
+       }
+
+       /* Fire every event whose key is now due, heap by heap. */
+       for (i = 0; i < 3; i++) {
+               h = heaps[i];
+               while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) {
+                       if (h->p[0].key > curr_time)
+                               printf("dummynet: warning, "
+                                   "heap %d is %d ticks late\n",
+                                   i, (int)(curr_time - h->p[0].key));
+                       /* store a copy before heap_extract */
+                       p = h->p[0].object;
+                       /* need to extract before processing */
+                       heap_extract(h, NULL);
+                       if (i == 0)
+                               ready_event(p, &head, &tail);
+                       else if (i == 1) {
+                               struct dn_pipe *pipe = p;
+                               if (pipe->if_name[0] != '\0')
+                                       printf("dummynet: bad ready_event_wfq "
+                                           "for pipe %s\n", pipe->if_name);
+                               else
+                                       ready_event_wfq(p, &head, &tail);
+                       } else
+                               transmit_event(p, &head, &tail);
+               }
+       }
+
+       /* Sweep pipes trying to expire idle flow_queues. */
+       for (i = 0; i < HASHSIZE; i++) {
+               SLIST_FOREACH(pipe, &pipehash[i], next) {
+                       if (pipe->idle_heap.elements > 0 &&
+                           DN_KEY_LT(pipe->idle_heap.p[0].key, pipe->V)) {
+                               struct dn_flow_queue *q =
+                                   pipe->idle_heap.p[0].object;
+
+                               heap_extract(&(pipe->idle_heap), NULL);
+                               /* Mark timestamp as invalid. */
+                               q->S = q->F + 1;
+                               pipe->sum -= q->fs->weight;
+                       }
+               }
+       }
+
+       DUMMYNET_UNLOCK();
+
+       /* Deliver collected packets outside the lock. */
+       if (head != NULL)
+               dummynet_send(head);
+
+       /* Re-arm for the next tick. */
+       callout_reset(&dn_timeout, 1, dummynet, NULL);
+}
+
+/*
+ * Deliver a chain of packets that have left the delay line, each to the
+ * destination recorded in its dummynet tag (reinject into ip_input/
+ * ip_output, bridge, ether demux/output, or drop).
+ */
+static void
+dummynet_send(struct mbuf *m)
+{
+       struct mbuf *n;
+
+       for (; m != NULL; m = n) {
+               struct ifnet *ifp = NULL;
+               int dst;
+               struct m_tag *tag;
+
+               /* Detach the packet from the chain before handing it off. */
+               n = m->m_nextpkt;
+               m->m_nextpkt = NULL;
+               tag = m_tag_first(m);
+               if (tag == NULL) {
+                       /* No tag: we cannot tell where it goes, drop it. */
+                       dst = DIR_DROP;
+               } else {
+                       struct dn_pkt_tag *pkt = dn_tag_get(m);
+                       /* extract the dummynet info, rename the tag */
+                       dst = pkt->dn_dir;
+                       ifp = pkt->ifp;
+                       /* rename the tag so it carries reinject info */
+                       tag->m_tag_cookie = MTAG_IPFW_RULE;
+                       tag->m_tag_id = 0;
+               }
+
+               switch (dst) {
+               case DIR_OUT:
+                       SET_HOST_IPLEN(mtod(m, struct ip *));
+                       ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
+                       break ;
+               case DIR_IN :
+                       /* put header in network format for ip_input() */
+                       //SET_NET_IPLEN(mtod(m, struct ip *));
+                       netisr_dispatch(NETISR_IP, m);
+                       break;
+#ifdef INET6
+               case DIR_IN | PROTO_IPV6:
+                       netisr_dispatch(NETISR_IPV6, m);
+                       break;
+
+               case DIR_OUT | PROTO_IPV6:
+                       /* NOTE(review): SET_HOST_IPLEN casts the header to
+                        * struct ip even though this is an IPv6 packet --
+                        * confirm this is intended on this platform. */
+                       SET_HOST_IPLEN(mtod(m, struct ip *));
+                       ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
+                       break;
+#endif
+               case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */
+                       if (bridge_dn_p != NULL)
+                               ((*bridge_dn_p)(m, ifp));
+                       else
+                               printf("dummynet: if_bridge not loaded\n");
+
+                       break;
+               case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */
+                       /*
+                        * The Ethernet code assumes the Ethernet header is
+                        * contiguous in the first mbuf header.
+                        * Insure this is true.
+                        */
+                       if (m->m_len < ETHER_HDR_LEN &&
+                           (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
+                               printf("dummynet/ether: pullup failed, "
+                                   "dropping packet\n");
+                               break;
+                       }
+                       ether_demux(m->m_pkthdr.rcvif, m);
+                       break;
+               case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */
+                       ether_output_frame(ifp, m);
+                       break;
+
+               case DIR_DROP:
+                       /* drop the packet after some time */
+                       FREE_PKT(m);
+                       break;
+
+               default:
+                       printf("dummynet: bad switch %d!\n", dst);
+                       FREE_PKT(m);
+                       break;
+               }
+       }
+}
+
+/*
+ * Unconditionally expire empty queues in case of shortage.
+ * Returns the number of queues freed.
+ */
+static int
+expire_queues(struct dn_flow_set *fs)
+{
+    struct dn_flow_queue *q, *prev ;
+    int i, initial_elements = fs->rq_elements ;
+
+    /* Rate-limit: run the sweep at most once per second of uptime. */
+    if (fs->last_expired == time_uptime)
+       return 0 ;
+    fs->last_expired = time_uptime ;
+    for (i = 0 ; i <= fs->rq_size ; i++) { /* last one is overflow */
+       for (prev=NULL, q = fs->rq[i] ; q != NULL ; ) {
+           if (!QUEUE_IS_IDLE(q)) {
+               prev = q ;
+               q = q->next ;
+           } else { /* entry is idle, expire it */
+               struct dn_flow_queue *old_q = q ;
+
+               /* Unlink from the hash chain before freeing. */
+               if (prev != NULL)
+                   prev->next = q = q->next ;
+               else
+                   fs->rq[i] = q = q->next ;
+               fs->rq_elements-- ;
+               free(old_q, M_DUMMYNET);
+           }
+       }
+    }
+    return initial_elements - fs->rq_elements ;
+}
+
+/*
+ * If room, create a new queue and put at head of slot i;
+ * otherwise, create or use the default queue.
+ */
+static struct dn_flow_queue *
+create_queue(struct dn_flow_set *fs, int i)
+{
+       struct dn_flow_queue *q;
+
+       if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
+           expire_queues(fs) == 0) {
+               /* No way to get room, use or create overflow queue. */
+               i = fs->rq_size;
+               if (fs->rq[i] != NULL)
+                   return fs->rq[i];
+       }
+       q = malloc(sizeof(*q), M_DUMMYNET, M_NOWAIT | M_ZERO);
+       if (q == NULL) {
+               printf("dummynet: sorry, cannot allocate queue for new flow\n");
+               return (NULL);
+       }
+       q->fs = fs;
+       q->hash_slot = i;
+       /* Insert at the head of the chosen hash chain. */
+       q->next = fs->rq[i];
+       q->S = q->F + 1;        /* hack - mark timestamp as invalid. */
+       /* Start with full burst credit (plus one tick's worth if io_fast). */
+       q->numbytes = fs->pipe->burst + (io_fast ? fs->pipe->bandwidth : 0);
+       fs->rq[i] = q;
+       fs->rq_elements++;
+       return (q);
+}
+
+/*
+ * Given a flow_set and a pkt in last_pkt, find a matching queue
+ * after appropriate masking. The queue is moved to front
+ * so that further searches take less time.
+ */
+static struct dn_flow_queue *
+find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
+{
+    int i = 0 ; /* we need i and q for new allocations */
+    struct dn_flow_queue *q, *prev;
+    int is_v6 = IS_IP6_FLOW_ID(id);
+
+    if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
+       q = fs->rq[0] ;
+    else {
+       /* first, do the masking, then hash */
+       id->dst_port &= fs->flow_mask.dst_port ;
+       id->src_port &= fs->flow_mask.src_port ;
+       id->proto &= fs->flow_mask.proto ;
+       id->flags = 0 ; /* we don't care about this one */
+       if (is_v6) {
+           APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6);
+           APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6);
+           id->flow_id6 &= fs->flow_mask.flow_id6;
+
+           /*
+            * NOTE(review): for 32-bit words, (x << 16) & 0xffff is
+            * always 0, so the last four src_ip6 terms below contribute
+            * nothing to the hash (harmless, but dead). Likewise the
+            * 0xfffff masks (five f's) let bit 16 through; the final
+            * "i % rq_size" keeps the result in range either way.
+            */
+           i = ((id->dst_ip6.__u6_addr.__u6_addr32[0]) & 0xffff)^
+               ((id->dst_ip6.__u6_addr.__u6_addr32[1]) & 0xffff)^
+               ((id->dst_ip6.__u6_addr.__u6_addr32[2]) & 0xffff)^
+               ((id->dst_ip6.__u6_addr.__u6_addr32[3]) & 0xffff)^
+
+               ((id->dst_ip6.__u6_addr.__u6_addr32[0] >> 15) & 0xffff)^
+               ((id->dst_ip6.__u6_addr.__u6_addr32[1] >> 15) & 0xffff)^
+               ((id->dst_ip6.__u6_addr.__u6_addr32[2] >> 15) & 0xffff)^
+               ((id->dst_ip6.__u6_addr.__u6_addr32[3] >> 15) & 0xffff)^
+
+               ((id->src_ip6.__u6_addr.__u6_addr32[0] << 1) & 0xfffff)^
+               ((id->src_ip6.__u6_addr.__u6_addr32[1] << 1) & 0xfffff)^
+               ((id->src_ip6.__u6_addr.__u6_addr32[2] << 1) & 0xfffff)^
+               ((id->src_ip6.__u6_addr.__u6_addr32[3] << 1) & 0xfffff)^
+
+               ((id->src_ip6.__u6_addr.__u6_addr32[0] << 16) & 0xffff)^
+               ((id->src_ip6.__u6_addr.__u6_addr32[1] << 16) & 0xffff)^
+               ((id->src_ip6.__u6_addr.__u6_addr32[2] << 16) & 0xffff)^
+               ((id->src_ip6.__u6_addr.__u6_addr32[3] << 16) & 0xffff)^
+
+               (id->dst_port << 1) ^ (id->src_port) ^
+               (id->proto ) ^
+               (id->flow_id6);
+       } else {
+           id->dst_ip &= fs->flow_mask.dst_ip ;
+           id->src_ip &= fs->flow_mask.src_ip ;
+
+           i = ( (id->dst_ip) & 0xffff ) ^
+               ( (id->dst_ip >> 15) & 0xffff ) ^
+               ( (id->src_ip << 1) & 0xffff ) ^
+               ( (id->src_ip >> 16 ) & 0xffff ) ^
+               (id->dst_port << 1) ^ (id->src_port) ^
+               (id->proto );
+       }
+       i = i % fs->rq_size ;
+       /* finally, scan the current list for a match */
+       searches++ ;
+       for (prev=NULL, q = fs->rq[i] ; q ; ) {
+           search_steps++;
+           if (is_v6 &&
+                   IN6_ARE_ADDR_EQUAL(&id->dst_ip6,&q->id.dst_ip6) &&  
+                   IN6_ARE_ADDR_EQUAL(&id->src_ip6,&q->id.src_ip6) &&  
+                   id->dst_port == q->id.dst_port &&
+                   id->src_port == q->id.src_port &&
+                   id->proto == q->id.proto &&
+                   id->flags == q->id.flags &&
+                   id->flow_id6 == q->id.flow_id6)
+               break ; /* found */
+
+           if (!is_v6 && id->dst_ip == q->id.dst_ip &&
+                   id->src_ip == q->id.src_ip &&
+                   id->dst_port == q->id.dst_port &&
+                   id->src_port == q->id.src_port &&
+                   id->proto == q->id.proto &&
+                   id->flags == q->id.flags)
+               break ; /* found */
+
+           /* No match. Check if we can expire the entry */
+           if (pipe_expire && QUEUE_IS_IDLE(q)) {
+               /* entry is idle and not in any heap, expire it */
+               struct dn_flow_queue *old_q = q ;
+
+               if (prev != NULL)
+                   prev->next = q = q->next ;
+               else
+                   fs->rq[i] = q = q->next ;
+               fs->rq_elements-- ;
+               free(old_q, M_DUMMYNET);
+               continue ;
+           }
+           prev = q ;
+           q = q->next ;
+       }
+       /* Move-to-front so repeated lookups for this flow are cheap. */
+       if (q && prev != NULL) { /* found and not in front */
+           prev->next = q->next ;
+           q->next = fs->rq[i] ;
+           fs->rq[i] = q ;
+       }
+    }
+    if (q == NULL) { /* no match, need to allocate a new entry */
+       q = create_queue(fs, i);
+       /* (the assignment below is the body of the if, despite indent) */
+       if (q != NULL)
+       q->id = *id ;
+    }
+    return q ;
+}
+
+/*
+ * Apply the RED/gentle-RED drop policy to a packet of 'len' bytes queued
+ * on 'q' (a queue of flow_set 'fs').  Updates the average queue estimate
+ * q->avg and the drop bookkeeping fields (q->count, q->random).
+ * Returns 1 if the packet must be dropped, 0 if it is accepted.
+ */
+static int
+red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
+{
+       /*
+        * RED algorithm
+        *
+        * RED calculates the average queue size (avg) using a low-pass filter
+        * with an exponential weighted (w_q) moving average:
+        *      avg  <-  (1-w_q) * avg + w_q * q_size
+        * where q_size is the queue length (measured in bytes or packets).
+        *
+        * If q_size == 0, we compute the idle time for the link, and set
+        *      avg = (1 - w_q)^(idle/s)
+        * where s is the time needed for transmitting a medium-sized packet.
+        *
+        * Now, if avg < min_th the packet is enqueued.
+        * If avg > max_th the packet is dropped. Otherwise, the packet is
+        * dropped with probability P function of avg.
+        */
+
+       int64_t p_b = 0;
+
+       /* Queue in bytes or packets? */
+       u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ?
+           q->len_bytes : q->len;
+
+       DPRINTF(("\ndummynet: %d q: %2u ", (int)curr_time, q_size));
+
+       /* Average queue size estimation. */
+       if (q_size != 0) {
+               /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
+               int diff = SCALE(q_size) - q->avg;
+               int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
+
+               q->avg += (int)v;
+       } else {
+               /*
+                * Queue is empty, find for how long the queue has been
+                * empty and use a lookup table for computing
+                * (1 - w_q)^(idle_time/s) where s is the time to send a
+                * (small) packet.
+                * XXX check wraps...
+                */
+               if (q->avg) {
+                       u_int t = div64(curr_time - q->idle_time,
+                           fs->lookup_step);
+
+                       q->avg = (t < fs->lookup_depth) ?
+                           SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
+               }
+       }
+       DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));
+
+       /* Should we drop? */
+       if (q->avg < fs->min_th) {
+               q->count = -1;
+               return (0);     /* accept packet */
+       }
+       if (q->avg >= fs->max_th) {     /* average queue >=  max threshold */
+               if (fs->flags_fs & DN_IS_GENTLE_RED) {
+                       /*
+                        * According to Gentle-RED, if avg is greater than
+                        * max_th the packet is dropped with a probability
+                        *       p_b = c_3 * avg - c_4
+                        * where c_3 = (1 - max_p) / max_th
+                        *       c_4 = 1 - 2 * max_p
+                        */
+                       p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
+                           fs->c_4;
+               } else {
+                       q->count = -1;
+                       DPRINTF(("dummynet: - drop"));
+                       return (1);
+               }
+       } else if (q->avg > fs->min_th) {
+               /*
+                * We compute p_b using the linear dropping function
+                *       p_b = c_1 * avg - c_2
+                * where c_1 = max_p / (max_th - min_th)
+                *       c_2 = max_p * min_th / (max_th - min_th)
+                */
+               p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
+       }
+
+       /* Scale the drop probability by packet size for byte-based limits. */
+       if (fs->flags_fs & DN_QSIZE_IS_BYTES)
+               p_b = div64(p_b * len, fs->max_pkt_size);
+       if (++q->count == 0)
+               q->random = random() & 0xffff;
+       else {
+               /*
+                * q->count counts packets arrived since last drop, so a greater
+                * value of q->count means a greater packet drop probability.
+                */
+               if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
+                       q->count = 0;
+                       DPRINTF(("dummynet: - red drop"));
+                       /* After a drop we calculate a new random value. */
+                       q->random = random() & 0xffff;
+                       return (1);     /* drop */
+               }
+       }
+       /* End of RED algorithm. */
+
+       return (0);     /* accept */
+}
+
+/*
+ * Look up the flow_set with number 'fs_nr' in its hash bucket.
+ * Returns the descriptor, or NULL if no such flow_set exists.
+ */
+static __inline struct dn_flow_set *
+locate_flowset(int fs_nr)
+{
+       struct dn_flow_set *p;
+
+       for (p = SLIST_FIRST(&flowsethash[HASH(fs_nr)]); p != NULL;
+           p = SLIST_NEXT(p, next)) {
+               if (p->fs_nr == fs_nr)
+                       return (p);
+       }
+       return (NULL);
+}
+
+/*
+ * Look up the pipe with number 'pipe_nr' in its hash bucket.
+ * Returns the descriptor, or NULL if no such pipe exists.
+ */
+static __inline struct dn_pipe *
+locate_pipe(int pipe_nr)
+{
+       struct dn_pipe *p;
+
+       for (p = SLIST_FIRST(&pipehash[HASH(pipe_nr)]); p != NULL;
+           p = SLIST_NEXT(p, next)) {
+               if (p->pipe_nr == pipe_nr)
+                       return (p);
+       }
+       return (NULL);
+}
+
+/*
+ * dummynet hook for packets. Below 'pipe' is a pipe or a queue
+ * depending on whether WF2Q or fixed bw is used.
+ *
+ * pipe_nr     pipe or queue the packet is destined for.
+ * dir         where shall we send the packet after dummynet.
+ * m           the mbuf with the packet
+ * ifp         the 'ifp' parameter from the caller.
+ *             NULL in ip_input, destination interface in ip_output,
+ * rule                matching rule, in case of multiple passes
+ *
+ * Returns 0 when the packet is queued (or sent right away via the fast
+ * path) and ENOBUFS when it is dropped, except that flow_sets with the
+ * DN_NOERROR flag report drops as 0 as well.  On a drop the mbuf is
+ * freed here and *m0 is cleared.
+ */
+static int
+dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
+{
+       struct mbuf *m = *m0, *head = NULL, *tail = NULL;
+       struct dn_pkt_tag *pkt;
+       struct m_tag *mtag;
+       struct dn_flow_set *fs = NULL;
+       struct dn_pipe *pipe;
+       uint64_t len = m->m_pkthdr.len;
+       struct dn_flow_queue *q = NULL;
+       int is_pipe = fwa->rule.info & IPFW_IS_PIPE;
+
+       KASSERT(m->m_nextpkt == NULL,
+           ("dummynet_io: mbuf queue passed to dummynet"));
+
+       DUMMYNET_LOCK();
+       io_pkt++;
+       /*
+        * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
+        */
+       if (is_pipe) {
+               pipe = locate_pipe(fwa->rule.info & IPFW_INFO_MASK);
+               if (pipe != NULL)
+                       fs = &(pipe->fs);
+       } else
+               fs = locate_flowset(fwa->rule.info & IPFW_INFO_MASK);
+
+       if (fs == NULL)
+               goto dropit;    /* This queue/pipe does not exist! */
+       /* A queue caches its parent pipe; resolve it lazily if unset. */
+       pipe = fs->pipe;
+       if (pipe == NULL) {     /* Must be a queue, try find a matching pipe. */
+               pipe = locate_pipe(fs->parent_nr);
+               if (pipe != NULL)
+                       fs->pipe = pipe;
+               else {
+                       printf("dummynet: no pipe %d for queue %d, drop pkt\n",
+                           fs->parent_nr, fs->fs_nr);
+                       goto dropit;
+               }
+       }
+       q = find_queue(fs, &(fwa->f_id));
+       if (q == NULL)
+               goto dropit;            /* Cannot allocate queue. */
+
+       /* Update statistics, then check reasons to drop pkt. */
+       q->tot_bytes += len;
+       q->tot_pkts++;
+       if (fs->plr && random() < fs->plr)
+               goto dropit;            /* Random pkt drop. */
+       if (fs->flags_fs & DN_QSIZE_IS_BYTES) {
+               if (q->len_bytes > fs->qsize)
+                       goto dropit;    /* Queue size overflow. */
+       } else {
+               if (q->len >= fs->qsize)
+                       goto dropit;    /* Queue count overflow. */
+       }
+       if (fs->flags_fs & DN_IS_RED && red_drops(fs, q, len))
+               goto dropit;
+
+       /* XXX expensive to zero, see if we can remove it. */
+       mtag = m_tag_get(PACKET_TAG_DUMMYNET,
+           sizeof(struct dn_pkt_tag), M_NOWAIT | M_ZERO);
+       if (mtag == NULL)
+               goto dropit;            /* Cannot allocate packet header. */
+       m_tag_prepend(m, mtag);         /* Attach to mbuf chain. */
+
+       pkt = (struct dn_pkt_tag *)(mtag + 1);
+       /*
+        * Ok, i can handle the pkt now...
+        * Build and enqueue packet + parameters.
+        */
+       pkt->rule = fwa->rule;
+       pkt->rule.info &= IPFW_ONEPASS; /* only keep this info */
+       pkt->dn_dir = dir;
+       pkt->ifp = fwa->oif;
+
+       /* Append the mbuf to the per-queue packet list. */
+       if (q->head == NULL)
+               q->head = m;
+       else
+               q->tail->m_nextpkt = m;
+       q->tail = m;
+       q->len++;
+       q->len_bytes += len;
+
+       if (q->head != m)               /* Flow was not idle, we are done. */
+               goto done;
+
+       if (is_pipe) {                  /* Fixed rate queues. */
+               if (q->idle_time < curr_time) {
+                       /* Calculate available burst size. */
+                       q->numbytes +=
+                           (curr_time - q->idle_time - 1) * pipe->bandwidth;
+                       if (q->numbytes > pipe->burst)
+                               q->numbytes = pipe->burst;
+                       if (io_fast)
+                               q->numbytes += pipe->bandwidth;
+               }
+       } else {                        /* WF2Q. */
+               if (pipe->idle_time < curr_time &&
+                   pipe->scheduler_heap.elements == 0 &&
+                   pipe->not_eligible_heap.elements == 0) {
+                       /* Calculate available burst size. */
+                       pipe->numbytes +=
+                           (curr_time - pipe->idle_time - 1) * pipe->bandwidth;
+                       if (pipe->numbytes > 0 && pipe->numbytes > pipe->burst)
+                               pipe->numbytes = pipe->burst;
+                       if (io_fast)
+                               pipe->numbytes += pipe->bandwidth;
+               }
+               pipe->idle_time = curr_time;
+       }
+       /* Necessary for both: fixed rate & WF2Q queues. */
+       q->idle_time = curr_time;
+
+       /*
+        * If we reach this point the flow was previously idle, so we need
+        * to schedule it. This involves different actions for fixed-rate or
+        * WF2Q queues.
+        */
+       if (is_pipe) {
+               /* Fixed-rate queue: just insert into the ready_heap. */
+               dn_key t = 0;
+
+               if (pipe->bandwidth) {
+                       q->extra_bits = compute_extra_bits(m, pipe);
+                       t = set_ticks(m, q, pipe);
+               }
+               q->sched_time = curr_time;
+               if (t == 0)             /* Must process it now. */
+                       ready_event(q, &head, &tail);
+               else
+                       heap_insert(&ready_heap, curr_time + t , q);
+       } else {
+               /*
+                * WF2Q. First, compute start time S: if the flow was
+                * idle (S = F + 1) set S to the virtual time V for the
+                * controlling pipe, and update the sum of weights for the pipe;
+                * otherwise, remove flow from idle_heap and set S to max(F,V).
+                * Second, compute finish time F = S + len / weight.
+                * Third, if pipe was idle, update V = max(S, V).
+                * Fourth, count one more backlogged flow.
+                */
+               if (DN_KEY_GT(q->S, q->F)) { /* Means timestamps are invalid. */
+                       q->S = pipe->V;
+                       pipe->sum += fs->weight; /* Add weight of new queue. */
+               } else {
+                       heap_extract(&(pipe->idle_heap), q);
+                       q->S = MAX64(q->F, pipe->V);
+               }
+               q->F = q->S + div64(len << MY_M, fs->weight);
+
+               if (pipe->not_eligible_heap.elements == 0 &&
+                   pipe->scheduler_heap.elements == 0)
+                       pipe->V = MAX64(q->S, pipe->V);
+               fs->backlogged++;
+               /*
+                * Look at eligibility. A flow is not eligible if S>V (when
+                * this happens, it means that there is some other flow already
+                * scheduled for the same pipe, so the scheduler_heap cannot be
+                * empty). If the flow is not eligible we just store it in the
+                * not_eligible_heap. Otherwise, we store in the scheduler_heap
+                * and possibly invoke ready_event_wfq() right now if there is
+                * leftover credit.
+                * Note that for all flows in scheduler_heap (SCH), S_i <= V,
+                * and for all flows in not_eligible_heap (NEH), S_i > V.
+                * So when we need to compute max(V, min(S_i)) forall i in
+                * SCH+NEH, we only need to look into NEH.
+                */
+               if (DN_KEY_GT(q->S, pipe->V)) {         /* Not eligible. */
+                       if (pipe->scheduler_heap.elements == 0)
+                               printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
+                       heap_insert(&(pipe->not_eligible_heap), q->S, q);
+               } else {
+                       heap_insert(&(pipe->scheduler_heap), q->F, q);
+                       if (pipe->numbytes >= 0) {       /* Pipe is idle. */
+                               if (pipe->scheduler_heap.elements != 1)
+                                       printf("dummynet: OUCH! pipe should have been idle!\n");
+                               DPRINTF(("dummynet: waking up pipe %d at %d\n",
+                                   pipe->pipe_nr, (int)(q->F >> MY_M)));
+                               pipe->sched_time = curr_time;
+                               ready_event_wfq(pipe, &head, &tail);
+                       }
+               }
+       }
+done:
+       if (head == m && (dir & PROTO_LAYER2) == 0 ) {
+               /* Fast io. */
+               io_pkt_fast++;
+               if (m->m_nextpkt != NULL)
+                       printf("dummynet: fast io: pkt chain detected!\n");
+               head = m->m_nextpkt = NULL;
+       } else
+               *m0 = NULL;             /* Normal io. */
+
+       DUMMYNET_UNLOCK();
+       if (head != NULL)
+               dummynet_send(head);
+       return (0);
+
+dropit:
+       io_pkt_drop++;
+       if (q)
+               q->drops++;
+       DUMMYNET_UNLOCK();
+       FREE_PKT(m);
+       *m0 = NULL;
+       return ((fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
+}
+
+/*
+ * Dispose all packets and flow_queues on a flow_set.
+ * If all=1, also remove red lookup table and other storage,
+ * including the descriptor itself.
+ * For the one in dn_pipe MUST also cleanup ready_heap...
+ * Caller must hold the dummynet lock (asserted below).
+ */
+static void
+purge_flow_set(struct dn_flow_set *fs, int all)
+{
+       struct dn_flow_queue *q, *qn;
+       int i;
+
+       DUMMYNET_LOCK_ASSERT();
+
+       /* rq has rq_size + 1 buckets (see alloc_hash), hence the <= here. */
+       for (i = 0; i <= fs->rq_size; i++) {
+               for (q = fs->rq[i]; q != NULL; q = qn) {
+                       dn_free_pkts(q->head);
+                       qn = q->next;
+                       free(q, M_DUMMYNET);
+               }
+               fs->rq[i] = NULL;
+       }
+
+       fs->rq_elements = 0;
+       if (all) {
+               /* RED - free lookup table. */
+               if (fs->w_q_lookup != NULL)
+                       free(fs->w_q_lookup, M_DUMMYNET);
+               if (fs->rq != NULL)
+                       free(fs->rq, M_DUMMYNET);
+               /* If this fs is not part of a pipe, free it. */
+               if (fs->pipe == NULL || fs != &(fs->pipe->fs))
+                       free(fs, M_DUMMYNET);
+       }
+}
+
+/*
+ * Release everything attached to a pipe that is about to be deleted:
+ * its embedded flow_set (storage included), any packets still queued
+ * on the pipe itself, and the three per-pipe scheduling heaps.
+ */
+static void
+purge_pipe(struct dn_pipe *pipe)
+{
+       purge_flow_set(&pipe->fs, 1);
+
+       dn_free_pkts(pipe->head);
+
+       heap_free(&pipe->scheduler_heap);
+       heap_free(&pipe->not_eligible_heap);
+       heap_free(&pipe->idle_heap);
+}
+
+/*
+ * Delete all pipes and heaps returning memory. Must also
+ * remove references from all ipfw rules to all pipes.
+ * Acquires and releases the dummynet lock internally, so the
+ * caller must NOT hold it.
+ */
+static void
+dummynet_flush(void)
+{
+       struct dn_pipe *pipe, *pipe1;
+       struct dn_flow_set *fs, *fs1;
+       int i;
+
+       DUMMYNET_LOCK();
+       /* Free heaps so we don't have unwanted events. */
+       heap_free(&ready_heap);
+       heap_free(&wfq_ready_heap);
+       heap_free(&extract_heap);
+
+       /*
+        * Now purge all queued pkts and delete all pipes.
+        *
+        * XXXGL: can we merge the for(;;) cycles into one or not?
+        */
+       for (i = 0; i < HASHSIZE; i++)
+               SLIST_FOREACH_SAFE(fs, &flowsethash[i], next, fs1) {
+                       SLIST_REMOVE(&flowsethash[i], fs, dn_flow_set, next);
+                       purge_flow_set(fs, 1);
+               }
+       for (i = 0; i < HASHSIZE; i++)
+               SLIST_FOREACH_SAFE(pipe, &pipehash[i], next, pipe1) {
+                       SLIST_REMOVE(&pipehash[i], pipe, dn_pipe, next);
+                       purge_pipe(pipe);
+                       free_pipe(pipe);
+               }
+       DUMMYNET_UNLOCK();
+}
+
+/*
+ * Configure RED/GRED parameters on flow_set x from the user-supplied
+ * template p: precompute the scaled thresholds, the drop-probability
+ * coefficients c_1..c_4 and the (1 - w_q)^i lookup table used to age
+ * the average queue length while the queue is idle.
+ * Returns 0 on success, EINVAL on bad parameters, ENOSPC on allocation
+ * failure.  NOTE: the caller (set_fs_parms) currently ignores the result.
+ */
+static int
+config_red(struct dn_flow_set *p, struct dn_flow_set *x)
+{
+       int i;
+
+       /*
+        * Sanity check: c_1 below divides by (max_th - min_th) and the
+        * gentle-RED branch divides by max_th, so reject configurations
+        * that would cause a division by zero.  Unlike the later error
+        * paths we deliberately do not free x here: x may be the flow_set
+        * embedded in a pipe or already linked into the hash.
+        */
+       if (p->max_th <= p->min_th) {
+               printf("\ndummynet: RED needs min_th < max_th\n");
+               return (EINVAL);
+       }
+
+       x->w_q = p->w_q;
+       x->min_th = SCALE(p->min_th);
+       x->max_th = SCALE(p->max_th);
+       x->max_p = p->max_p;
+
+       x->c_1 = p->max_p / (p->max_th - p->min_th);
+       x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
+
+       if (x->flags_fs & DN_IS_GENTLE_RED) {
+               x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
+               x->c_4 = SCALE(1) - 2 * p->max_p;
+       }
+
+       /* If the lookup table already exist, free and create it again. */
+       if (x->w_q_lookup) {
+               free(x->w_q_lookup, M_DUMMYNET);
+               x->w_q_lookup = NULL;
+       }
+       if (red_lookup_depth == 0) {
+               printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth"
+                   "must be > 0\n");
+               /* NOTE(review): freeing x looks unsafe if x is embedded in
+                * a pipe or already hashed — confirm against callers. */
+               free(x, M_DUMMYNET);
+               return (EINVAL);
+       }
+       x->lookup_depth = red_lookup_depth;
+       x->w_q_lookup = (u_int *)malloc(x->lookup_depth * sizeof(int),
+           M_DUMMYNET, M_NOWAIT);
+       if (x->w_q_lookup == NULL) {
+               printf("dummynet: sorry, cannot allocate red lookup table\n");
+               /* NOTE(review): same concern as above about freeing x. */
+               free(x, M_DUMMYNET);
+               return(ENOSPC);
+       }
+
+       /* Fill the lookup table with (1 - w_q)^x */
+       x->lookup_step = p->lookup_step;
+       x->lookup_weight = p->lookup_weight;
+       x->w_q_lookup[0] = SCALE(1) - x->w_q;
+
+       for (i = 1; i < x->lookup_depth; i++)
+               x->w_q_lookup[i] =
+                   SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
+
+       /* Fall back to sane defaults for the sysctl-tunable packet sizes. */
+       if (red_avg_pkt_size < 1)
+               red_avg_pkt_size = 512;
+       x->avg_pkt_size = red_avg_pkt_size;
+       if (red_max_pkt_size < 1)
+               red_max_pkt_size = 1500;
+       x->max_pkt_size = red_max_pkt_size;
+       return (0);
+}
+
+/*
+ * Allocate the per-flow_set queue hash table (x->rq).
+ * With a flow mask the bucket count comes from the user request in pfs
+ * (defaulting to dn_hash_size when zero) and is clamped to the range
+ * [4, DN_MAX_HASH_SIZE]; with a null mask one bucket is enough.
+ * rq_size + 1 slots are allocated and zeroed.
+ * Returns 0 on success, ENOMEM if the table cannot be allocated.
+ */
+static int
+alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
+{
+       int buckets;
+
+       if (x->flags_fs & DN_HAVE_FLOW_MASK) {
+               buckets = (pfs->rq_size == 0) ? dn_hash_size : pfs->rq_size;
+               if (buckets < 4)
+                       buckets = 4;
+               else if (buckets > DN_MAX_HASH_SIZE)
+                       buckets = DN_MAX_HASH_SIZE;
+               x->rq_size = buckets;
+       } else {
+               /* A null mask collapses all traffic into a single queue. */
+               x->rq_size = 1;
+       }
+       x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
+           M_DUMMYNET, M_NOWAIT | M_ZERO);
+       if (x->rq == NULL) {
+               printf("dummynet: sorry, cannot allocate queue\n");
+               return (ENOMEM);
+       }
+       x->rq_elements = 0;
+       return (0);
+}
+
+/*
+ * Copy the user-visible scheduling parameters from src into flow_set x,
+ * sanitizing the queue size and (re)configuring RED when requested.
+ */
+static void
+set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
+{
+       x->flags_fs = src->flags_fs;
+       x->qsize = src->qsize;
+       x->plr = src->plr;
+       x->flow_mask = src->flow_mask;
+
+       if (x->flags_fs & DN_QSIZE_IS_BYTES) {
+               /* Byte-based limit: cap oversized requests at 1MB. */
+               if (x->qsize > pipe_byte_limit)
+                       x->qsize = 1024 * 1024;
+       } else {
+               /*
+                * Slot-based limit: 50 slots is both the default for an
+                * unspecified size and the fallback for requests above
+                * pipe_slot_limit.
+                */
+               if (x->qsize == 0 || x->qsize > pipe_slot_limit)
+                       x->qsize = 50;
+       }
+       /* Configuring RED. */
+       if (x->flags_fs & DN_IS_RED)
+               config_red(src, x);     /* XXX should check errors */
+}
+
+/*
+ * Setup pipe or queue parameters.
+ * Exactly one of p->pipe_nr and p->fs.fs_nr must be nonzero: the former
+ * creates or reconfigures a pipe, the latter a flow_set (queue).
+ * Returns 0 on success or an errno value (EINVAL/ENOMEM).
+ */
+static int
+config_pipe(struct dn_pipe *p)
+{
+       struct dn_flow_set *pfs = &(p->fs);
+       struct dn_flow_queue *q;
+       int i, error;
+
+       /*
+        * The config program passes parameters as follows:
+        * bw = bits/second (0 means no limits),
+        * delay = ms, must be translated into ticks.
+        * qsize = slots/bytes
+        */
+       p->delay = (p->delay * hz) / 1000;
+       /* Scale burst size: bytes -> bits * hz */
+       p->burst *= 8 * hz;
+       /* We need either a pipe number or a flow_set number. */
+       if (p->pipe_nr == 0 && pfs->fs_nr == 0)
+               return (EINVAL);
+       if (p->pipe_nr != 0 && pfs->fs_nr != 0)
+               return (EINVAL);
+       if (p->pipe_nr != 0) {                  /* this is a pipe */
+               struct dn_pipe *pipe;
+
+               DUMMYNET_LOCK();
+               pipe = locate_pipe(p->pipe_nr); /* locate pipe */
+
+               if (pipe == NULL) {             /* new pipe */
+                       pipe = malloc(sizeof(struct dn_pipe), M_DUMMYNET,
+                           M_NOWAIT | M_ZERO);
+                       if (pipe == NULL) {
+                               DUMMYNET_UNLOCK();
+                               printf("dummynet: no memory for new pipe\n");
+                               return (ENOMEM);
+                       }
+                       pipe->pipe_nr = p->pipe_nr;
+                       pipe->fs.pipe = pipe;
+                       /*
+                        * idle_heap is the only one from which
+                        * we extract from the middle.
+                        */
+                       pipe->idle_heap.size = pipe->idle_heap.elements = 0;
+                       pipe->idle_heap.offset =
+                           offsetof(struct dn_flow_queue, heap_pos);
+               } else {
+                       /* Flush accumulated credit for all queues. */
+                       for (i = 0; i <= pipe->fs.rq_size; i++) {
+                               for (q = pipe->fs.rq[i]; q; q = q->next) {
+                                       q->numbytes = p->burst +
+                                           (io_fast ? p->bandwidth : 0);
+                               }
+                       }
+               }
+
+               pipe->bandwidth = p->bandwidth;
+               pipe->burst = p->burst;
+               pipe->numbytes = pipe->burst + (io_fast ? pipe->bandwidth : 0);
+               bcopy(p->if_name, pipe->if_name, sizeof(p->if_name));
+               pipe->ifp = NULL;               /* reset interface ptr */
+               pipe->delay = p->delay;
+               set_fs_parms(&(pipe->fs), pfs);
+
+               /* Handle changes in the delay profile. */
+               if (p->samples_no > 0) {
+                       if (pipe->samples_no != p->samples_no) {
+                               if (pipe->samples != NULL)
+                                       free(pipe->samples, M_DUMMYNET);
+                               pipe->samples =
+                                   malloc(p->samples_no*sizeof(dn_key),
+                                       M_DUMMYNET, M_NOWAIT | M_ZERO);
+                               if (pipe->samples == NULL) {
+                                       DUMMYNET_UNLOCK();
+                                       printf("dummynet: no memory "
+                                               "for new samples\n");
+                                       return (ENOMEM);
+                               }
+                               pipe->samples_no = p->samples_no;
+                       }
+
+                       strncpy(pipe->name,p->name,sizeof(pipe->name));
+                       pipe->loss_level = p->loss_level;
+                       for (i = 0; i<pipe->samples_no; ++i)
+                               pipe->samples[i] = p->samples[i];
+               } else if (pipe->samples != NULL) {
+                       free(pipe->samples, M_DUMMYNET);
+                       pipe->samples = NULL;
+                       pipe->samples_no = 0;
+               }
+
+               if (pipe->fs.rq == NULL) {      /* a new pipe */
+                       error = alloc_hash(&(pipe->fs), pfs);
+                       if (error) {
+                               DUMMYNET_UNLOCK();
+                               free_pipe(pipe);
+                               return (error);
+                       }
+                       SLIST_INSERT_HEAD(&pipehash[HASH(pipe->pipe_nr)],
+                           pipe, next);
+               }
+               DUMMYNET_UNLOCK();
+       } else {                                /* config queue */
+               struct dn_flow_set *fs;
+
+               DUMMYNET_LOCK();
+               fs = locate_flowset(pfs->fs_nr); /* locate flow_set */
+
+               if (fs == NULL) {               /* new */
+                       if (pfs->parent_nr == 0) { /* need link to a pipe */
+                               DUMMYNET_UNLOCK();
+                               return (EINVAL);
+                       }
+                       fs = malloc(sizeof(struct dn_flow_set), M_DUMMYNET,
+                           M_NOWAIT | M_ZERO);
+                       if (fs == NULL) {
+                               DUMMYNET_UNLOCK();
+                               printf(
+                                   "dummynet: no memory for new flow_set\n");
+                               return (ENOMEM);
+                       }
+                       fs->fs_nr = pfs->fs_nr;
+                       fs->parent_nr = pfs->parent_nr;
+                       /* WF2Q weight is clamped to [1, 100]. */
+                       fs->weight = pfs->weight;
+                       if (fs->weight == 0)
+                               fs->weight = 1;
+                       else if (fs->weight > 100)
+                               fs->weight = 100;
+               } else {
+                       /*
+                        * Change parent pipe not allowed;
+                        * must delete and recreate.
+                        */
+                       if (pfs->parent_nr != 0 &&
+                           fs->parent_nr != pfs->parent_nr) {
+                               DUMMYNET_UNLOCK();
+                               return (EINVAL);
+                       }
+               }
+
+               set_fs_parms(fs, pfs);
+
+               if (fs->rq == NULL) {           /* a new flow_set */
+                       error = alloc_hash(fs, pfs);
+                       if (error) {
+                               DUMMYNET_UNLOCK();
+                               free(fs, M_DUMMYNET);
+                               return (error);
+                       }
+                       SLIST_INSERT_HEAD(&flowsethash[HASH(fs->fs_nr)],
+                           fs, next);
+               }
+               DUMMYNET_UNLOCK();
+       }
+       return (0);
+}
+
+/*
+ * Remove from heap h every queue that belongs to flow_set fs.
+ * Each matching slot is overwritten by the last heap element; the heap
+ * invariant is restored with a single heapify() once all are removed.
+ */
+static void
+fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
+{
+       int i = 0, removed = 0;
+
+       while (i < h->elements) {
+               if (((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
+                       /* Swap in the last element and shrink the heap;
+                        * re-examine slot i on the next iteration. */
+                       h->elements--;
+                       h->p[i] = h->p[h->elements];
+                       removed++;
+               } else
+                       i++;
+       }
+       if (removed)
+               heapify(h);
+}
+
+/*
+ * Remove pipe p from heap h; a pipe can appear there at most once.
+ * The vacated slot is filled with the last element and the heap
+ * property is restored before returning.
+ */
+static void
+pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
+{
+       int i;
+
+       for (i = 0; i < h->elements; i++) {
+               if (h->p[i].object != p)
+                       continue;
+               /* Found it: replace with the last element and re-heapify. */
+               h->elements--;
+               h->p[i] = h->p[h->elements];
+               heapify(h);
+               break;
+       }
+}
+
+/*
+ * drain all queues. Called in case of severe mbuf shortage.
+ * The caller must already hold the dummynet lock (asserted below).
+ */
+void
+dummynet_drain(void)
+{
+    struct dn_flow_set *fs;
+    struct dn_pipe *pipe;
+    int i;
+
+    DUMMYNET_LOCK_ASSERT();
+
+    heap_free(&ready_heap);
+    heap_free(&wfq_ready_heap);
+    heap_free(&extract_heap);
+    /* Discard queued packets in every flow_set (keep the descriptors). */
+    for (i = 0; i < HASHSIZE; i++)
+       SLIST_FOREACH(fs, &flowsethash[i], next)
+               purge_flow_set(fs, 0);
+
+    /* Same for every pipe, including packets queued on the pipe itself. */
+    for (i = 0; i < HASHSIZE; i++) {
+       SLIST_FOREACH(pipe, &pipehash[i], next) {
+               purge_flow_set(&(pipe->fs), 0);
+               dn_free_pkts(pipe->head);
+               pipe->head = pipe->tail = NULL;
+       }
+    }
+}
+
+/*
+ * Fully delete a pipe or a queue, cleaning up associated info.
+ * Exactly one of p->pipe_nr and p->fs.fs_nr must be nonzero.
+ * Returns 0 on success, EINVAL on a malformed request and ENOENT
+ * when the pipe/flow_set does not exist.
+ */
+static int
+delete_pipe(struct dn_pipe *p)
+{
+
+    if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
+       return EINVAL ;
+    if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
+       return EINVAL ;
+    if (p->pipe_nr != 0) { /* this is an old-style pipe */
+       struct dn_pipe *pipe;
+       struct dn_flow_set *fs;
+       int i;
+
+       DUMMYNET_LOCK();
+       pipe = locate_pipe(p->pipe_nr); /* locate pipe */
+
+       if (pipe == NULL) {
+           DUMMYNET_UNLOCK();
+           return (ENOENT);    /* not found */
+       }
+
+       /* Unlink from list of pipes. */
+       SLIST_REMOVE(&pipehash[HASH(pipe->pipe_nr)], pipe, dn_pipe, next);
+
+       /* Remove all references to this pipe from flow_sets. */
+       for (i = 0; i < HASHSIZE; i++) {
+           SLIST_FOREACH(fs, &flowsethash[i], next) {
+               if (fs->pipe == pipe) {
+                       printf("dummynet: ++ ref to pipe %d from fs %d\n",
+                           p->pipe_nr, fs->fs_nr);
+                       fs->pipe = NULL ;
+                       purge_flow_set(fs, 0);
+               }
+           }
+       }
+       fs_remove_from_heap(&ready_heap, &(pipe->fs));
+       purge_pipe(pipe); /* remove all data associated to this pipe */
+       /* remove reference to here from extract_heap and wfq_ready_heap */
+       pipe_remove_from_heap(&extract_heap, pipe);
+       pipe_remove_from_heap(&wfq_ready_heap, pipe);
+       DUMMYNET_UNLOCK();
+
+       free_pipe(pipe);
+    } else { /* this is a WF2Q queue (dn_flow_set) */
+       struct dn_flow_set *fs;
+
+       DUMMYNET_LOCK();
+       fs = locate_flowset(p->fs.fs_nr); /* locate set */
+
+       if (fs == NULL) {
+           DUMMYNET_UNLOCK();
+           return (ENOENT); /* not found */
+       }
+
+       /* Unlink from list of flowsets. */
+       SLIST_REMOVE( &flowsethash[HASH(fs->fs_nr)], fs, dn_flow_set, next);
+
+       if (fs->pipe != NULL) {
+           /* Update total weight on parent pipe and cleanup parent heaps. */
+           fs->pipe->sum -= fs->weight * fs->backlogged ;
+           fs_remove_from_heap(&(fs->pipe->not_eligible_heap), fs);
+           fs_remove_from_heap(&(fs->pipe->scheduler_heap), fs);
+#if 1  /* XXX should i remove from idle_heap as well ? */
+           fs_remove_from_heap(&(fs->pipe->idle_heap), fs);
+#endif
+       }
+       purge_flow_set(fs, 1);
+       DUMMYNET_UNLOCK();
+    }
+    return 0 ;
+}
+
+/*
+ * helper function used to copy data from kernel in DUMMYNET_GET.
+ * Copies every dn_flow_queue of 'set' into the buffer starting at
+ * 'bp' (which the caller must have sized via dn_calc_size()),
+ * NULLs out kernel pointers in each copy, and returns the advanced
+ * buffer pointer. The sanity-check printfs only report
+ * inconsistencies; they do not abort the copy.
+ */
+static char *
+dn_copy_set(struct dn_flow_set *set, char *bp)
+{
+    int i, copied = 0 ;
+    struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp;
+
+    DUMMYNET_LOCK_ASSERT();
+
+    for (i = 0 ; i <= set->rq_size ; i++) {
+       for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
+           if (q->hash_slot != i)
+               printf("dummynet: ++ at %d: wrong slot (have %d, "
+                   "should be %d)\n", copied, q->hash_slot, i);
+           /* NOTE(review): this message prints the slot index i where
+            * the one above prints 'copied' — presumably intentional,
+            * but worth confirming against upstream. */
+           if (q->fs != set)
+               printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
+                       i, q->fs, set);
+           copied++ ;
+           bcopy(q, qp, sizeof( *q ) );
+           /* cleanup pointers */
+           qp->next = NULL ;
+           qp->head = qp->tail = NULL ;
+           qp->fs = NULL ;
+       }
+    }
+    if (copied != set->rq_elements)
+       printf("dummynet: ++ wrong count, have %d should be %d\n",
+           copied, set->rq_elements);
+    return (char *)qp ;
+}
+
+/*
+ * Compute the buffer size needed by DUMMYNET_GET: for every pipe and
+ * every flow_set in the hash tables, the descriptor itself plus one
+ * dn_flow_queue per active queue. Must be called with the dummynet
+ * lock held so the counts cannot change underneath us.
+ */
+static size_t
+dn_calc_size(void)
+{
+    struct dn_flow_set *fs;
+    struct dn_pipe *pipe;
+    size_t size = 0;
+    int i;
+
+    DUMMYNET_LOCK_ASSERT();
+    /*
+     * Compute size of data structures: list of pipes and flow_sets.
+     */
+    for (i = 0; i < HASHSIZE; i++) {
+       SLIST_FOREACH(pipe, &pipehash[i], next)
+               size += sizeof(*pipe) +
+                   pipe->fs.rq_elements * sizeof(struct dn_flow_queue);
+       SLIST_FOREACH(fs, &flowsethash[i], next)
+               size += sizeof (*fs) +
+                   fs->rq_elements * sizeof(struct dn_flow_queue);
+    }
+    return size;
+}
+
+/*
+ * Handler for IP_DUMMYNET_GET: snapshot all pipes, flow_sets and their
+ * queues into a malloc'ed buffer and copy it out to userland with
+ * sooptcopyout(). Returns 0, or ENOBUFS if a large-enough buffer
+ * could not be obtained after several attempts.
+ */
+static int
+dummynet_get(struct sockopt *sopt)
+{
+    char *buf, *bp ; /* bp is the "copy-pointer" */
+    size_t size ;
+    struct dn_flow_set *fs;
+    struct dn_pipe *pipe;
+    int error=0, i ;
+
+    /* XXX lock held too long */
+    DUMMYNET_LOCK();
+    /*
+     * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we
+     *      cannot use this flag while holding a mutex.
+     * So: compute the size, drop the lock for the malloc, re-take the
+     * lock and re-check. If the data structures grew in the meantime,
+     * free the buffer and retry, up to 10 times, before giving up.
+     */
+    for (i = 0; i < 10; i++) {
+       size = dn_calc_size();
+       DUMMYNET_UNLOCK();
+       buf = malloc(size, M_TEMP, M_WAITOK);
+       DUMMYNET_LOCK();
+       if (size >= dn_calc_size())
+               break;
+       free(buf, M_TEMP);
+       buf = NULL;
+    }
+    if (buf == NULL) {
+       DUMMYNET_UNLOCK();
+       return ENOBUFS ;
+    }
+    bp = buf;
+    for (i = 0; i < HASHSIZE; i++) {
+       SLIST_FOREACH(pipe, &pipehash[i], next) {
+               struct dn_pipe *pipe_bp = (struct dn_pipe *)bp;
+
+               /*
+                * Copy pipe descriptor into *bp, convert delay back to ms,
+                * then copy the flow_set descriptor(s) one at a time.
+                * After each flow_set, copy the queue descriptor it owns.
+                */
+               bcopy(pipe, bp, sizeof(*pipe));
+               pipe_bp->delay = (pipe_bp->delay * 1000) / hz;
+               pipe_bp->burst = div64(pipe_bp->burst, 8 * hz);
+               /*
+                * XXX the following is a hack based on ->next being the
+                * first field in dn_pipe and dn_flow_set. The correct
+                * solution would be to move the dn_flow_set to the beginning
+                * of struct dn_pipe.
+                */
+               pipe_bp->next.sle_next = (struct dn_pipe *)DN_IS_PIPE;
+               /* Clean pointers. */
+               pipe_bp->head = pipe_bp->tail = NULL;
+               pipe_bp->fs.next.sle_next = NULL;
+               pipe_bp->fs.pipe = NULL;
+               pipe_bp->fs.rq = NULL;
+               pipe_bp->samples = NULL;
+
+               bp += sizeof(*pipe) ;
+               bp = dn_copy_set(&(pipe->fs), bp);
+       }
+    }
+
+    /* Then all the stand-alone flow_sets (WF2Q queues), tagged so
+     * userland can tell them apart from pipes. */
+    for (i = 0; i < HASHSIZE; i++) {
+       SLIST_FOREACH(fs, &flowsethash[i], next) {
+               struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp;
+
+               bcopy(fs, bp, sizeof(*fs));
+               /* XXX same hack as above */
+               fs_bp->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
+               fs_bp->pipe = NULL;
+               fs_bp->rq = NULL;
+               bp += sizeof(*fs);
+               bp = dn_copy_set(fs, bp);
+       }
+    }
+
+    DUMMYNET_UNLOCK();
+
+    error = sooptcopyout(sopt, buf, size);
+    free(buf, M_TEMP);
+    return error ;
+}
+
+/*
+ * Handler for the various dummynet socket options (get, flush, config, del).
+ * Performs a privilege check first, and for SET operations also refuses
+ * service at securelevel >= 3. Returns 0 or an errno.
+ */
+static int
+ip_dn_ctl(struct sockopt *sopt)
+{
+    int error;
+    struct dn_pipe *p = NULL;
+
+    error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET);
+    if (error)
+       return (error);
+
+    /* Disallow sets in really-really secure mode. */
+    if (sopt->sopt_dir == SOPT_SET) {
+#if __FreeBSD_version >= 500034
+       error =  securelevel_ge(sopt->sopt_td->td_ucred, 3);
+       if (error)
+           return (error);
+#else
+       if (securelevel >= 3)
+           return (EPERM);
+#endif
+    }
+
+    switch (sopt->sopt_name) {
+    default :
+       printf("dummynet: -- unknown option %d", sopt->sopt_name);
+       error = EINVAL ;
+       break;
+
+    case IP_DUMMYNET_GET :
+       error = dummynet_get(sopt);
+       break ;
+
+    case IP_DUMMYNET_FLUSH :
+       dummynet_flush() ;
+       break ;
+
+    case IP_DUMMYNET_CONFIGURE :
+       /* Allocate the larger dn_pipe_max so an optional trailing
+        * samples[] array can be copied in along with the pipe. */
+       p = malloc(sizeof(struct dn_pipe_max), M_TEMP, M_WAITOK);
+       error = sooptcopyin(sopt, p, sizeof(struct dn_pipe_max), sizeof *p);
+       if (error)
+           break ;
+       if (p->samples_no > 0)
+           p->samples = &(((struct dn_pipe_max *)p)->samples[0]);
+
+       error = config_pipe(p);
+       break ;
+
+    case IP_DUMMYNET_DEL :     /* remove a pipe or queue */
+       p = malloc(sizeof(struct dn_pipe), M_TEMP, M_WAITOK);
+       error = sooptcopyin(sopt, p, sizeof(struct dn_pipe), sizeof *p);
+       if (error)
+           break ;
+
+       error = delete_pipe(p);
+       break ;
+    }
+
+    /* p was only a scratch copy of the userland request. */
+    if (p != NULL)
+       free(p, M_TEMP);
+
+    return error ;
+}
+
+/*
+ * One-time initialization: set up the lock, hash tables and heaps,
+ * publish the ip_dn_ctl/ip_dn_io hooks, create the taskqueue that
+ * drains dummynet work, and arm the periodic callout.
+ */
+static void
+ip_dn_init(void)
+{
+       int i;
+
+       if (bootverbose)
+               printf("DUMMYNET with IPv6 initialized (040826)\n");
+
+       DUMMYNET_LOCK_INIT();
+
+       for (i = 0; i < HASHSIZE; i++) {
+               SLIST_INIT(&pipehash[i]);
+               SLIST_INIT(&flowsethash[i]);
+       }
+       ready_heap.size = ready_heap.elements = 0;
+       ready_heap.offset = 0;
+
+       wfq_ready_heap.size = wfq_ready_heap.elements = 0;
+       wfq_ready_heap.offset = 0;
+
+       extract_heap.size = extract_heap.elements = 0;
+       extract_heap.offset = 0;
+
+       /* Publish the hooks: from here on ipfw can hand us packets. */
+       ip_dn_ctl_ptr = ip_dn_ctl;
+       ip_dn_io_ptr = dummynet_io;
+
+       TASK_INIT(&dn_task, 0, dummynet_task, NULL);
+       dn_tq = taskqueue_create_fast("dummynet", M_NOWAIT,
+           taskqueue_thread_enqueue, &dn_tq);
+       taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet");
+
+       /* Fire the first tick one tick from now; dummynet() reschedules. */
+       callout_init(&dn_timeout, CALLOUT_MPSAFE);
+       callout_reset(&dn_timeout, 1, dummynet, NULL);
+
+       /* Initialize curr_time adjustment mechanics. */
+       getmicrouptime(&prev_t);
+}
+
+#ifdef KLD_MODULE
+/*
+ * Module teardown, mirror image of ip_dn_init(): unhook from ipfw
+ * first so no new work arrives, stop the callout, drain and destroy
+ * the taskqueue, flush all pipes/queues, then destroy the lock.
+ */
+static void
+ip_dn_destroy(void)
+{
+       ip_dn_ctl_ptr = NULL;
+       ip_dn_io_ptr = NULL;
+
+       /* Stop the tick under the lock so it cannot re-arm mid-stop. */
+       DUMMYNET_LOCK();
+       callout_stop(&dn_timeout);
+       DUMMYNET_UNLOCK();
+       taskqueue_drain(dn_tq, &dn_task);
+       taskqueue_free(dn_tq);
+
+       dummynet_flush();
+
+       DUMMYNET_LOCK_DESTROY();
+}
+#endif /* KLD_MODULE */
+
+/*
+ * Module event handler. Refuses a second load (EEXIST), refuses to
+ * unload when statically compiled (EINVAL), and reports EOPNOTSUPP
+ * for any other event type.
+ */
+static int
+dummynet_modevent(module_t mod, int type, void *data)
+{
+
+       switch (type) {
+       case MOD_LOAD:
+               /* ip_dn_io_ptr doubles as the "already loaded" flag. */
+               if (ip_dn_io_ptr) {
+                   printf("DUMMYNET already loaded\n");
+                   return EEXIST ;
+               }
+               ip_dn_init();
+               break;
+
+       case MOD_UNLOAD:
+#if !defined(KLD_MODULE)
+               printf("dummynet statically compiled, cannot unload\n");
+               return EINVAL ;
+#else
+               ip_dn_destroy();
+#endif
+               break ;
+       default:
+               return EOPNOTSUPP;
+               break ;
+       }
+       return 0 ;
+}
+
+/*
+ * Module registration: attach at SI_SUB_PROTO_IFATTACHDOMAIN and
+ * declare the dependency on ipfw (interface version 2).
+ */
+static moduledata_t dummynet_mod = {
+       "dummynet",
+       dummynet_modevent,
+       NULL
+};
+DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
+MODULE_DEPEND(dummynet, ipfw, 2, 2, 2);
+MODULE_VERSION(dummynet, 1);
+/* end of file */
diff --git a/dummynet2/ip_fw2.c b/dummynet2/ip_fw2.c
new file mode 100644 (file)
index 0000000..3cc08e7
--- /dev/null
@@ -0,0 +1,2466 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw2.c 200601 2009-12-16 10:48:40Z luigi $");
+
+/*
+ * The FreeBSD IP packet firewall, main file
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/condvar.h>
+#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/jail.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/ucred.h>
+#include <net/ethernet.h> /* for ETHERTYPE_IP */
+#include <net/if.h>
+#include <net/route.h>
+#include <net/pf_mtag.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ip_carp.h>
+#include <netinet/pim.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+#include <netinet/sctp.h>
+
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#ifdef INET6
+#include <netinet6/scope6_var.h>
+#include <netinet6/ip6_var.h>
+#endif
+
+#include <machine/in_cksum.h>  /* XXX for in_cksum */
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+/*
+ * static variables followed by global ones.
+ * All ipfw global variables are here.
+ */
+
+/* ipfw_vnet_ready controls when we are open for business */
+static VNET_DEFINE(int, ipfw_vnet_ready) = 0;
+#define        V_ipfw_vnet_ready       VNET(ipfw_vnet_ready)
+
+static VNET_DEFINE(int, fw_deny_unknown_exthdrs);
+#define        V_fw_deny_unknown_exthdrs       VNET(fw_deny_unknown_exthdrs)
+
+#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
+static int default_to_accept = 1;
+#else
+static int default_to_accept;
+#endif
+
+VNET_DEFINE(int, autoinc_step);
+
+/*
+ * Each rule belongs to one of 32 different sets (0..31).
+ * The variable set_disable contains one bit per set.
+ * If the bit is set, all rules in the corresponding set
+ * are disabled. Set RESVD_SET(31) is reserved for the default rule
+ * and rules that are not deleted by the flush command,
+ * and CANNOT be disabled.
+ * Rules in set RESVD_SET can only be deleted individually.
+ */
+VNET_DEFINE(u_int32_t, set_disable);
+#define        V_set_disable                   VNET(set_disable)
+
+VNET_DEFINE(int, fw_verbose);
+/* counter for ipfw_log(NULL...) */
+VNET_DEFINE(u_int64_t, norule_counter);
+VNET_DEFINE(int, verbose_limit);
+
+/* layer3_chain contains the list of rules for layer 3 */
+VNET_DEFINE(struct ip_fw_chain, layer3_chain);
+
+ipfw_nat_t *ipfw_nat_ptr = NULL;
+struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
+ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
+ipfw_nat_cfg_t *ipfw_nat_del_ptr;
+ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
+ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
+
+#ifdef SYSCTL_NODE
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
+    CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
+    "Only do a single pass through ipfw when using dummynet(4)");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
+    CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
+    "Rule number auto-increment step");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose,
+    CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0,
+    "Log matches to ipfw rules");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
+    CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
+    "Set upper limit of matches of ipfw rules logged");
+uint32_t dummy_def = IPFW_DEFAULT_RULE;
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
+    &dummy_def, 0,
+    "The default/max possible rule number.");
+uint32_t dummy_tables_max = IPFW_TABLES_MAX;
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD,
+    &dummy_tables_max, 0,
+    "The maximum number of tables.");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
+    &default_to_accept, 0,
+    "Make the default rule accept all packets.");
+TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept);
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count,
+    CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0,
+    "Number of static rules");
+
+#ifdef INET6
+SYSCTL_DECL(_net_inet6_ip6);
+SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
+SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs,
+    CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0,
+    "Deny packets with unknown IPv6 Extension Headers");
+#endif /* INET6 */
+
+#endif /* SYSCTL_NODE */
+
+
+/*
+ * Some macros used in the various matching options.
+ * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T
+ * Other macros just cast void * into the appropriate type
+ */
+#define        L3HDR(T, ip)    ((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
+#define        TCP(p)          ((struct tcphdr *)(p))
+#define        SCTP(p)         ((struct sctphdr *)(p))
+#define        UDP(p)          ((struct udphdr *)(p))
+#define        ICMP(p)         ((struct icmphdr *)(p))
+#define        ICMP6(p)        ((struct icmp6_hdr *)(p))
+
+/*
+ * Match an ICMP type against the bitmap in cmd->d[0]; only types
+ * 0..ICMP_MAXTYPE can match since the mask is a single 32-bit word.
+ */
+static __inline int
+icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd)
+{
+       int type = icmp->icmp_type;
+
+       return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<<type)) );
+}
+
+/* Bitmap of the ICMP "query" types: echo, router solicit, timestamp,
+ * information request and mask request. */
+#define TT     ( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \
+    (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) )
+
+/* Return nonzero if the ICMP packet is one of the query types above. */
+static int
+is_icmp_query(struct icmphdr *icmp)
+{
+       int type = icmp->icmp_type;
+
+       return (type <= ICMP_MAXTYPE && (TT & (1<<type)) );
+}
+#undef TT
+
+/*
+ * The following checks use two arrays of 8 or 16 bits to store the
+ * bits that we want set or clear, respectively. They are in the
+ * low and high half of cmd->arg1 or cmd->d[0].
+ *
+ * We scan options and store the bits we find set. We succeed if
+ *
+ *     (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
+ *
+ * The code is sometimes optimized not to store additional variables.
+ */
+
+/*
+ * Match 'bits' against cmd->arg1: low byte = bits that must be set,
+ * high byte = bits that must be clear. Returns 1 on match, 0 otherwise.
+ */
+static int
+flags_match(ipfw_insn *cmd, u_int8_t bits)
+{
+       u_char want_clear;
+       /* invert once so both tests below read as "& bits" */
+       bits = ~bits;
+
+       if ( ((cmd->arg1 & 0xff) & bits) != 0)
+               return 0; /* some bits we want set were clear */
+       want_clear = (cmd->arg1 >> 8) & 0xff;
+       if ( (want_clear & bits) != want_clear)
+               return 0; /* some bits we want clear were set */
+       return 1;
+}
+
+/*
+ * Walk the IPv4 options area of 'ip', collect which of LSRR/SSRR/RR/TS
+ * are present into a bitmap, and match it with flags_match().
+ * Returns 0 directly on a malformed (zero-length or truncated) option.
+ */
+static int
+ipopts_match(struct ip *ip, ipfw_insn *cmd)
+{
+       int optlen, bits = 0;
+       u_char *cp = (u_char *)(ip + 1);
+       /* total length of the options area, past the fixed header */
+       int x = (ip->ip_hl << 2) - sizeof (struct ip);
+
+       for (; x > 0; x -= optlen, cp += optlen) {
+               int opt = cp[IPOPT_OPTVAL];
+
+               if (opt == IPOPT_EOL)
+                       break;
+               if (opt == IPOPT_NOP)
+                       optlen = 1;
+               else {
+                       optlen = cp[IPOPT_OLEN];
+                       if (optlen <= 0 || optlen > x)
+                               return 0; /* invalid or truncated */
+               }
+               switch (opt) {
+
+               default:
+                       break;
+
+               case IPOPT_LSRR:
+                       bits |= IP_FW_IPOPT_LSRR;
+                       break;
+
+               case IPOPT_SSRR:
+                       bits |= IP_FW_IPOPT_SSRR;
+                       break;
+
+               case IPOPT_RR:
+                       bits |= IP_FW_IPOPT_RR;
+                       break;
+
+               case IPOPT_TS:
+                       bits |= IP_FW_IPOPT_TS;
+                       break;
+               }
+       }
+       return (flags_match(cmd, bits));
+}
+
+/*
+ * Walk the TCP options, collect which of MSS/WINDOW/SACK/TS are
+ * present into a bitmap, and match it with flags_match().
+ * Unlike ipopts_match(), a bad option length just stops the scan
+ * rather than failing the match outright.
+ */
+static int
+tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd)
+{
+       int optlen, bits = 0;
+       u_char *cp = (u_char *)(tcp + 1);
+       /* length of the options area, past the fixed TCP header */
+       int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
+
+       for (; x > 0; x -= optlen, cp += optlen) {
+               int opt = cp[0];
+               if (opt == TCPOPT_EOL)
+                       break;
+               if (opt == TCPOPT_NOP)
+                       optlen = 1;
+               else {
+                       optlen = cp[1];
+                       if (optlen <= 0)
+                               break;
+               }
+
+               switch (opt) {
+
+               default:
+                       break;
+
+               case TCPOPT_MAXSEG:
+                       bits |= IP_FW_TCPOPT_MSS;
+                       break;
+
+               case TCPOPT_WINDOW:
+                       bits |= IP_FW_TCPOPT_WINDOW;
+                       break;
+
+               case TCPOPT_SACK_PERMITTED:
+               case TCPOPT_SACK:
+                       bits |= IP_FW_TCPOPT_SACK;
+                       break;
+
+               case TCPOPT_TIMESTAMP:
+                       bits |= IP_FW_TCPOPT_TS;
+                       break;
+
+               }
+       }
+       return (flags_match(cmd, bits));
+}
+
+/*
+ * Match an interface against an instruction, either by name
+ * (exact or fnmatch glob) or by one of its IPv4 addresses.
+ * The address walk is compiled out on Linux/Windows builds.
+ * Returns 1 on match, 0 otherwise (including ifp == NULL).
+ */
+static int
+iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
+{
+       if (ifp == NULL)        /* no iface with this packet, match fails */
+               return 0;
+       /* Check by name or by IP address */
+       if (cmd->name[0] != '\0') { /* match by name */
+               /* Check name */
+               if (cmd->p.glob) {
+                       if (fnmatch(cmd->name, ifp->if_xname, 0) == 0)
+                               return(1);
+               } else {
+                       if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
+                               return(1);
+               }
+       } else {
+#if !defined( __linux__ ) && !defined( _WIN32 )
+               struct ifaddr *ia;
+
+               if_addr_rlock(ifp);
+               TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) {
+                       if (ia->ifa_addr->sa_family != AF_INET)
+                               continue;
+                       if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
+                           (ia->ifa_addr))->sin_addr.s_addr) {
+                               if_addr_runlock(ifp);
+                               return(1);      /* match */
+                       }
+               }
+               if_addr_runlock(ifp);
+#endif
+       }
+       return(0);      /* no match, fail ... */
+}
+
+/*
+ * The verify_path function checks if a route to the src exists and
+ * if it is reachable via ifp (when provided).
+ * 
+ * The 'verrevpath' option checks that the interface that an IP packet
+ * arrives on is the same interface that traffic destined for the
+ * packet's source address would be routed out of.
+ * The 'versrcreach' option just checks that the source address is
+ * reachable via any route (except default) in the routing table.
+ * These two are a measure to block forged packets. This is also
+ * commonly known as "anti-spoofing" or Unicast Reverse Path
+ * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs
+ * is purposely reminiscent of the Cisco IOS command,
+ *
+ *   ip verify unicast reverse-path
+ *   ip verify unicast source reachable-via any
+ *
+ * which implements the same functionality. But note that the syntax
+ * is misleading, and the check may be performed on all IP packets
+ * whether unicast, multicast, or broadcast.
+ *
+ * On Linux/Windows builds there is no routing-table access, so the
+ * check always fails (returns 0).
+ */
+static int
+verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
+{
+#if defined( __linux__ ) || defined( _WIN32 )
+       return 0;
+#else
+       struct route ro;
+       struct sockaddr_in *dst;
+
+       bzero(&ro, sizeof(ro));
+
+       dst = (struct sockaddr_in *)&(ro.ro_dst);
+       dst->sin_family = AF_INET;
+       dst->sin_len = sizeof(*dst);
+       dst->sin_addr = src;
+       in_rtalloc_ign(&ro, 0, fib);
+
+       if (ro.ro_rt == NULL)
+               return 0;
+
+       /*
+        * If ifp is provided, check for equality with rtentry.
+        * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp,
+        * in order to pass packets injected back by if_simloop():
+        * if useloopback == 1 routing entry (via lo0) for our own address
+        * may exist, so we need to handle routing assymetry.
+        */
+       if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) {
+               RTFREE(ro.ro_rt);
+               return 0;
+       }
+
+       /* if no ifp provided, check if rtentry is not default route */
+       if (ifp == NULL &&
+            satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) {
+               RTFREE(ro.ro_rt);
+               return 0;
+       }
+
+       /* or if this is a blackhole/reject route */
+       if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+               RTFREE(ro.ro_rt);
+               return 0;
+       }
+
+       /* found valid route */
+       RTFREE(ro.ro_rt);
+       return 1;
+#endif
+}
+
+#ifdef INET6
+/*
+ * ipv6 specific rules here...
+ */
+
+/* Match an ICMPv6 type against the multi-word bitmap in cmd->d[];
+ * ICMP6_MAXTYPE exceeds 32, hence the d[type/32] word indexing. */
+static __inline int
+icmp6type_match (int type, ipfw_insn_u32 *cmd)
+{
+       return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) );
+}
+
+/*
+ * Return 1 if curr_flow equals any of the cmd->o.arg1 + 1 flow-id
+ * values stored in cmd->d[], 0 otherwise (note the inclusive bound).
+ */
+static int
+flow6id_match( int curr_flow, ipfw_insn_u32 *cmd )
+{
+       int i;
+       for (i=0; i <= cmd->o.arg1; ++i )
+               if (curr_flow == cmd->d[i] )
+                       return 1;
+       return 0;
+}
+
+/* support for IP6_*_ME opcodes */
+
+/*
+ * Return 1 if ip6_addr is assigned to any local interface, scanning
+ * every address of every interface. Scope ids are stripped from the
+ * candidate (not from ip6_addr) before comparing.
+ */
+static int
+search_ip6_addr_net (struct in6_addr * ip6_addr)
+{
+       struct ifnet *mdc;
+       struct ifaddr *mdc2;
+       struct in6_ifaddr *fdm;
+       struct in6_addr copia;
+
+       TAILQ_FOREACH(mdc, &V_ifnet, if_link) {
+               if_addr_rlock(mdc);
+               TAILQ_FOREACH(mdc2, &mdc->if_addrhead, ifa_link) {
+                       if (mdc2->ifa_addr->sa_family == AF_INET6) {
+                               fdm = (struct in6_ifaddr *)mdc2;
+                               copia = fdm->ia_addr.sin6_addr;
+                               /* need for leaving scope_id in the sock_addr */
+                               in6_clearscope(&copia);
+                               if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) {
+                                       if_addr_runlock(mdc);
+                                       return 1;
+                               }
+                       }
+               }
+               if_addr_runlock(mdc);
+       }
+       return 0;
+}
+
+/*
+ * IPv6 counterpart of verify_path(): check that a route back to src
+ * exists, and (when ifp is given) that it goes through ifp; when ifp
+ * is NULL, also reject the default route and blackhole/reject routes.
+ * Returns 1 when a valid route is found, 0 otherwise.
+ */
+static int
+verify_path6(struct in6_addr *src, struct ifnet *ifp)
+{
+       struct route_in6 ro;
+       struct sockaddr_in6 *dst;
+
+       bzero(&ro, sizeof(ro));
+
+       dst = (struct sockaddr_in6 * )&(ro.ro_dst);
+       dst->sin6_family = AF_INET6;
+       dst->sin6_len = sizeof(*dst);
+       dst->sin6_addr = *src;
+       /* XXX MRT 0 for ipv6 at this time */
+       rtalloc_ign((struct route *)&ro, 0);
+
+       if (ro.ro_rt == NULL)
+               return 0;
+
+       /* 
+        * if ifp is provided, check for equality with rtentry
+        * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp,
+        * to support the case of sending packets to an address of our own.
+        * (where the former interface is the first argument of if_simloop()
+        *  (=ifp), the latter is lo0)
+        */
+       if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) {
+               RTFREE(ro.ro_rt);
+               return 0;
+       }
+
+       /* if no ifp provided, check if rtentry is not default route */
+       if (ifp == NULL &&
+           IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) {
+               RTFREE(ro.ro_rt);
+               return 0;
+       }
+
+       /* or if this is a blackhole/reject route */
+       if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+               RTFREE(ro.ro_rt);
+               return 0;
+       }
+
+       /* found valid route */
+       RTFREE(ro.ro_rt);
+       return 1;
+
+}
+
+/*
+ * Return 1 if icmp6_type is one of the ICMPv6 query types
+ * (echo request, membership query, who-are-you, FQDN or node
+ * information query), 0 otherwise.
+ */
+static int
+is_icmp6_query(int icmp6_type)
+{
+       if ((icmp6_type <= ICMP6_MAXTYPE) &&
+           (icmp6_type == ICMP6_ECHO_REQUEST ||
+           icmp6_type == ICMP6_MEMBERSHIP_QUERY ||
+           icmp6_type == ICMP6_WRUREQUEST ||
+           icmp6_type == ICMP6_FQDN_QUERY ||
+           icmp6_type == ICMP6_NI_QUERY))
+               return (1);
+
+       return (0);
+}
+
+/*
+ * IPv6 reject: for ICMP6_UNREACH_RST on a TCP packet, synthesize and
+ * send a RST (unless the offending packet already carried RST);
+ * otherwise send an ICMPv6 destination-unreachable with 'code'.
+ * Always consumes the mbuf and clears args->m.
+ */
+static void
+send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6)
+{
+       struct mbuf *m;
+
+       m = args->m;
+       if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) {
+               struct tcphdr *tcp;
+               tcp = (struct tcphdr *)((char *)ip6 + hlen);
+
+               /* never RST a RST, to avoid RST wars */
+               if ((tcp->th_flags & TH_RST) == 0) {
+                       struct mbuf *m0;
+                       m0 = ipfw_send_pkt(args->m, &(args->f_id),
+                           ntohl(tcp->th_seq), ntohl(tcp->th_ack),
+                           tcp->th_flags | TH_RST);
+                       if (m0 != NULL)
+                               ip6_output(m0, NULL, NULL, 0, NULL, NULL,
+                                   NULL);
+               }
+               FREE_PKT(m);
+       } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */
+#if 0
+               /*
+                * Unlike above, the mbufs need to line up with the ip6 hdr,
+                * as the contents are read. We need to m_adj() the
+                * needed amount.
+                * The mbuf will however be thrown away so we can adjust it.
+                * Remember we did an m_pullup on it already so we
+                * can make some assumptions about contiguousness.
+                */
+               if (args->L3offset)
+                       m_adj(m, args->L3offset);
+#endif
+               /* icmp6_error() consumes m */
+               icmp6_error(m, ICMP6_DST_UNREACH, code, 0);
+       } else
+               FREE_PKT(m);
+
+       args->m = NULL;
+}
+
+#endif /* INET6 */
+
+
+/*
+ * sends a reject message, consuming the mbuf passed as an argument.
+ * For ICMP_REJECT_RST on a TCP packet, synthesize and send a RST
+ * (unless the packet already carried RST); otherwise send an ICMP
+ * unreachable with 'code'. Always clears args->m.
+ * NOTE(review): 'iplen' is currently unused by this body.
+ */
+static void
+send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip)
+{
+
+#if 0
+       /* XXX When ip is not guaranteed to be at mtod() we will
+        * need to account for this.
+        * The mbuf will however be thrown away so we can adjust it.
+        * Remember we did an m_pullup on it already so we
+        * can make some assumptions about contiguousness.
+        */
+       if (args->L3offset)
+               m_adj(m, args->L3offset);
+#endif
+       if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
+               /* We need the IP header in host order for icmp_error(). */
+               SET_HOST_IPLEN(ip);
+               icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
+       } else if (args->f_id.proto == IPPROTO_TCP) {
+               struct tcphdr *const tcp =
+                   L3HDR(struct tcphdr, mtod(args->m, struct ip *));
+               /* never RST a RST, to avoid RST wars */
+               if ( (tcp->th_flags & TH_RST) == 0) {
+                       struct mbuf *m;
+                       m = ipfw_send_pkt(args->m, &(args->f_id),
+                               ntohl(tcp->th_seq), ntohl(tcp->th_ack),
+                               tcp->th_flags | TH_RST);
+                       if (m != NULL)
+                               ip_output(m, NULL, NULL, 0, NULL, NULL);
+               }
+               FREE_PKT(args->m);
+       } else
+               FREE_PKT(args->m);
+       args->m = NULL;
+}
+
+/*
+ * Support for uid/gid/jail lookup. These tests are expensive
+ * (because we may need to look into the list of active sockets)
+ * so we cache the results. ugid_lookupp is 0 if we have not
+ * yet done a lookup, 1 if we succeeded, and -1 if we tried
+ * and failed. The function always returns the match value.
+ * We could actually spare the variable and use *uc, setting
+ * it to '(void *)check_uidgid if we have no info, NULL if
+ * we tried and failed, or any other value if successful.
+ *
+ * On Linux the whole job is delegated to cred_check(); note the
+ * inp argument is actually an mbuf there, hence the cast to
+ * reach its m_skb.
+ */
+static int
+check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
+    struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
+    u_int16_t src_port, struct ucred **uc, int *ugid_lookupp,
+    struct inpcb *inp)
+{
+#ifdef __linux__
+       return cred_check(insn, proto, oif,
+       dst_ip, dst_port, src_ip, src_port,
+       (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb);
+#else  /* FreeBSD */
+       struct inpcbinfo *pi;
+       int wildcard;
+       struct inpcb *pcb;
+       int match;
+
+       /*
+        * Check to see if the UDP or TCP stack supplied us with
+        * the PCB. If so, rather then holding a lock and looking
+        * up the PCB, we can use the one that was supplied.
+        */
+       if (inp && *ugid_lookupp == 0) {
+               INP_LOCK_ASSERT(inp);
+               if (inp->inp_socket != NULL) {
+                       /* crhold: caller is responsible for the release */
+                       *uc = crhold(inp->inp_cred);
+                       *ugid_lookupp = 1;
+               } else
+                       *ugid_lookupp = -1;
+       }
+       /*
+        * If we have already been here and the packet has no
+        * PCB entry associated with it, then we can safely
+        * assume that this is a no match.
+        */
+       if (*ugid_lookupp == -1)
+               return (0);
+       /* only TCP and UDP have a PCB to look up */
+       if (proto == IPPROTO_TCP) {
+               wildcard = 0;
+               pi = &V_tcbinfo;
+       } else if (proto == IPPROTO_UDP) {
+               wildcard = INPLOOKUP_WILDCARD;
+               pi = &V_udbinfo;
+       } else
+               return 0;
+       match = 0;
+       if (*ugid_lookupp == 0) {
+               INP_INFO_RLOCK(pi);
+               /* with oif the packet is outbound: swap src/dst roles */
+               pcb =  (oif) ?
+                       in_pcblookup_hash(pi,
+                               dst_ip, htons(dst_port),
+                               src_ip, htons(src_port),
+                               wildcard, oif) :
+                       in_pcblookup_hash(pi,
+                               src_ip, htons(src_port),
+                               dst_ip, htons(dst_port),
+                               wildcard, NULL);
+               if (pcb != NULL) {
+                       *uc = crhold(pcb->inp_cred);
+                       *ugid_lookupp = 1;
+               }
+               INP_INFO_RUNLOCK(pi);
+               if (*ugid_lookupp == 0) {
+                       /*
+                        * We tried and failed, set the variable to -1
+                        * so we will not try again on this packet.
+                        */
+                       *ugid_lookupp = -1;
+                       return (0);
+               }
+       } 
+       /* we now have a credential cached in *uc; test the opcode */
+       if (insn->o.opcode == O_UID)
+               match = ((*uc)->cr_uid == (uid_t)insn->d[0]);
+       else if (insn->o.opcode == O_GID)
+               match = groupmember((gid_t)insn->d[0], *uc);
+       else if (insn->o.opcode == O_JAIL)
+               match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]);
+       return match;
+#endif
+}
+
+/*
+ * Helper function to set args with info on the rule after the matching
+ * one. slot is precise, whereas we guess rule_id as they are
+ * assigned sequentially.
+ *
+ * args   firewall arguments whose args->rule cookie is filled in so a
+ *        later ipfw_chk() pass can resume after this rule.
+ * slot   index (in chain->map) of the matching rule.
+ * chain  ruleset the match was found in; chain->id is recorded so the
+ *        cached slot can be validated against a changed ruleset.
+ */
+static inline void
+set_match(struct ip_fw_args *args, int slot,
+       struct ip_fw_chain *chain)
+{
+       args->rule.chain_id = chain->id;
+       args->rule.slot = slot + 1; /* we use 0 as a marker */
+       /* guess: ids are assigned sequentially, so the rule after the
+        * matching one is assumed to carry id + 1 */
+       args->rule.rule_id = 1 + chain->map[slot]->id;
+       args->rule.rulenum = chain->map[slot]->rulenum;
+}
+
+/*
+ * The main check routine for the firewall.
+ *
+ * All arguments are in args so we can modify them and return them
+ * back to the caller.
+ *
+ * Parameters:
+ *
+ *     args->m (in/out) The packet; we set to NULL when/if we nuke it.
+ *             Starts with the IP header.
+ *     args->eh (in)   Mac header if present, NULL for layer3 packet.
+ *     args->L3offset  Number of bytes bypassed if we came from L2.
+ *                     e.g. often sizeof(eh)  ** NOTYET **
+ *     args->oif       Outgoing interface, NULL if packet is incoming.
+ *             The incoming interface is in the mbuf. (in)
+ *     args->divert_rule (in/out)
+ *             Skip up to the first rule past this rule number;
+ *             upon return, non-zero port number for divert or tee.
+ *
+ *     args->rule      Pointer to the last matching rule (in/out)
+ *     args->next_hop  Socket we are forwarding to (out).
+ *     args->f_id      Addresses grabbed from the packet (out)
+ *     args->rule.info a cookie depending on rule action
+ *
+ * Return value:
+ *
+ *     IP_FW_PASS      the packet must be accepted
+ *     IP_FW_DENY      the packet must be dropped
+ *     IP_FW_DIVERT    divert packet, port in m_tag
+ *     IP_FW_TEE       tee packet, port in m_tag
+ *     IP_FW_DUMMYNET  to dummynet, pipe in args->cookie
+ *     IP_FW_NETGRAPH  into netgraph, cookie args->cookie
+ *             args->rule contains the matching rule,
+ *             args->rule.info has additional information.
+ *
+ */
+int
+ipfw_chk(struct ip_fw_args *args)
+{
+
+       /*
+        * Local variables holding state while processing a packet:
+        *
+        * IMPORTANT NOTE: to speed up the processing of rules, there
+        * are some assumptions on the values of the variables, which
+        * are documented here. Should you change them, please check
+        * the implementation of the various instructions to make sure
+        * that they still work.
+        *
+        * args->eh     The MAC header. It is non-null for a layer2
+        *      packet, it is NULL for a layer-3 packet.
+        * **notyet**
+        * args->L3offset Offset in the packet to the L3 (IP or equiv.) header.
+        *
+        * m | args->m  Pointer to the mbuf, as received from the caller.
+        *      It may change if ipfw_chk() does an m_pullup, or if it
+        *      consumes the packet because it calls send_reject().
+        *      XXX This has to change, so that ipfw_chk() never modifies
+        *      or consumes the buffer.
+        * ip   is the beginning of the ip(4 or 6) header.
+        *      Calculated by adding the L3offset to the start of data.
+        *      (Until we start using L3offset, the packet is
+        *      supposed to start with the ip header).
+        */
+       struct mbuf *m = args->m;
+       struct ip *ip = mtod(m, struct ip *);
+
+       /*
+        * For rules which contain uid/gid or jail constraints, cache
+        * a copy of the users credentials after the pcb lookup has been
+        * executed. This will speed up the processing of rules with
+        * these types of constraints, as well as decrease contention
+        * on pcb related locks.
+        */
+#ifdef __linux__
+       struct bsd_ucred ucred_cache;
+#else
+       struct ucred *ucred_cache = NULL;
+#endif
+       int ucred_lookup = 0;
+
+       /*
+        * oif | args->oif      If NULL, ipfw_chk has been called on the
+        *      inbound path (ether_input, ip_input).
+        *      If non-NULL, ipfw_chk has been called on the outbound path
+        *      (ether_output, ip_output).
+        */
+       struct ifnet *oif = args->oif;
+
+       int f_pos = 0;          /* index of current rule in the array */
+       int retval = 0;
+
+       /*
+        * hlen The length of the IP header.
+        */
+       u_int hlen = 0;         /* hlen >0 means we have an IP pkt */
+
+       /*
+        * offset       The offset of a fragment. offset != 0 means that
+        *      we have a fragment at this offset of an IPv4 packet.
+        *      offset == 0 means that (if this is an IPv4 packet)
+        *      this is the first or only fragment.
+        *      For IPv6 offset == 0 means there is no Fragment Header. 
+        *      If offset != 0 for IPv6 always use correct mask to
+        *      get the correct offset because we add IP6F_MORE_FRAG
+        * to be able to detect the first fragment which would
+        *      otherwise have offset = 0.
+        */
+       u_short offset = 0;
+
+       /*
+        * Local copies of addresses. They are only valid if we have
+        * an IP packet.
+        *
+        * proto        The protocol. Set to 0 for non-ip packets,
+        *      or to the protocol read from the packet otherwise.
+        *      proto != 0 means that we have an IPv4 packet.
+        *
+        * src_port, dst_port   port numbers, in HOST format. Only
+        *      valid for TCP and UDP packets.
+        *
+        * src_ip, dst_ip       ip addresses, in NETWORK format.
+        *      Only valid for IPv4 packets.
+        */
+       uint8_t proto;
+       uint16_t src_port = 0, dst_port = 0;    /* NOTE: host format    */
+       struct in_addr src_ip, dst_ip;          /* NOTE: network format */
+       uint16_t iplen=0;
+       int pktlen;
+       uint16_t        etype = 0;      /* Host order stored ether type */
+
+       /*
+        * dyn_dir = MATCH_UNKNOWN when rules unchecked,
+        *      MATCH_NONE when checked and not matched (q = NULL),
+        *      MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL)
+        */
+       int dyn_dir = MATCH_UNKNOWN;
+       ipfw_dyn_rule *q = NULL;
+       struct ip_fw_chain *chain = &V_layer3_chain;
+
+       /*
+        * We store in ulp a pointer to the upper layer protocol header.
+        * In the ipv4 case this is easy to determine from the header,
+        * but for ipv6 we might have some additional headers in the middle.
+        * ulp is NULL if not found.
+        */
+       void *ulp = NULL;               /* upper layer protocol pointer. */
+       /* XXX ipv6 variables */
+       int is_ipv6 = 0;
+       u_int16_t ext_hd = 0;   /* bits vector for extension header filtering */
+       /* end of ipv6 variables */
+       int is_ipv4 = 0;
+
+       int done = 0;           /* flag to exit the outer loop */
+
+       if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready))
+               return (IP_FW_PASS);    /* accept */
+
+       dst_ip.s_addr = 0;              /* make sure it is initialized */
+       src_ip.s_addr = 0;              /* make sure it is initialized */
+       pktlen = m->m_pkthdr.len;
+       args->f_id.fib = M_GETFIB(m); /* note mbuf not altered) */
+       proto = args->f_id.proto = 0;   /* mark f_id invalid */
+               /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */
+
+/*
+ * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
+ * then it sets p to point at the offset "len" in the mbuf. WARNING: the
+ * pointer might become stale after other pullups (but we never use it
+ * this way).
+ */
+#define PULLUP_TO(_len, p, T)                                  \
+do {                                                           \
+       int x = (_len) + sizeof(T);                             \
+       if ((m)->m_len < x) {                                   \
+               args->m = m = m_pullup(m, x);                   \
+               if (m == NULL)                                  \
+                       goto pullup_failed;                     \
+       }                                                       \
+       p = (mtod(m, char *) + (_len));                         \
+} while (0)
+
+       /*
+        * if we have an ether header,
+        */
+       if (args->eh)
+               etype = ntohs(args->eh->ether_type);
+
+       /* Identify IP packets and fill up variables. */
+       if (pktlen >= sizeof(struct ip6_hdr) &&
+           (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) {
+               struct ip6_hdr *ip6 = (struct ip6_hdr *)ip;
+               is_ipv6 = 1;
+               args->f_id.addr_type = 6;
+               hlen = sizeof(struct ip6_hdr);
+               proto = ip6->ip6_nxt;
+
+               /* Search extension headers to find upper layer protocols */
+               while (ulp == NULL) {
+                       switch (proto) {
+                       case IPPROTO_ICMPV6:
+                               PULLUP_TO(hlen, ulp, struct icmp6_hdr);
+                               args->f_id.flags = ICMP6(ulp)->icmp6_type;
+                               break;
+
+                       case IPPROTO_TCP:
+                               PULLUP_TO(hlen, ulp, struct tcphdr);
+                               dst_port = TCP(ulp)->th_dport;
+                               src_port = TCP(ulp)->th_sport;
+                               args->f_id.flags = TCP(ulp)->th_flags;
+                               break;
+
+                       case IPPROTO_SCTP:
+                               PULLUP_TO(hlen, ulp, struct sctphdr);
+                               src_port = SCTP(ulp)->src_port;
+                               dst_port = SCTP(ulp)->dest_port;
+                               break;
+
+                       case IPPROTO_UDP:
+                               PULLUP_TO(hlen, ulp, struct udphdr);
+                               dst_port = UDP(ulp)->uh_dport;
+                               src_port = UDP(ulp)->uh_sport;
+                               break;
+
+                       case IPPROTO_HOPOPTS:   /* RFC 2460 */
+                               PULLUP_TO(hlen, ulp, struct ip6_hbh);
+                               ext_hd |= EXT_HOPOPTS;
+                               hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
+                               proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_ROUTING:   /* RFC 2460 */
+                               PULLUP_TO(hlen, ulp, struct ip6_rthdr);
+                               switch (((struct ip6_rthdr *)ulp)->ip6r_type) {
+                               case 0:
+                                       ext_hd |= EXT_RTHDR0;
+                                       break;
+                               case 2:
+                                       ext_hd |= EXT_RTHDR2;
+                                       break;
+                               default:
+                                       printf("IPFW2: IPV6 - Unknown Routing "
+                                           "Header type(%d)\n",
+                                           ((struct ip6_rthdr *)ulp)->ip6r_type);
+                                       if (V_fw_deny_unknown_exthdrs)
+                                           return (IP_FW_DENY);
+                                       break;
+                               }
+                               ext_hd |= EXT_ROUTING;
+                               hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
+                               proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_FRAGMENT:  /* RFC 2460 */
+                               PULLUP_TO(hlen, ulp, struct ip6_frag);
+                               ext_hd |= EXT_FRAGMENT;
+                               hlen += sizeof (struct ip6_frag);
+                               proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
+                               offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
+                                       IP6F_OFF_MASK;
+                               /* Add IP6F_MORE_FRAG for offset of first
+                                * fragment to be != 0. */
+                               offset |= ((struct ip6_frag *)ulp)->ip6f_offlg &
+                                       IP6F_MORE_FRAG;
+                               if (offset == 0) {
+                                       printf("IPFW2: IPV6 - Invalid Fragment "
+                                           "Header\n");
+                                       if (V_fw_deny_unknown_exthdrs)
+                                           return (IP_FW_DENY);
+                                       break;
+                               }
+                               args->f_id.frag_id6 =
+                                   ntohl(((struct ip6_frag *)ulp)->ip6f_ident);
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_DSTOPTS:   /* RFC 2460 */
+                               PULLUP_TO(hlen, ulp, struct ip6_hbh);
+                               ext_hd |= EXT_DSTOPTS;
+                               hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
+                               proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_AH:        /* RFC 2402 */
+                               PULLUP_TO(hlen, ulp, struct ip6_ext);
+                               ext_hd |= EXT_AH;
+                               hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
+                               proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_ESP:       /* RFC 2406 */
+                               PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */
+                               /* Anything past Seq# is variable length and
+                                * data past this ext. header is encrypted. */
+                               ext_hd |= EXT_ESP;
+                               break;
+
+                       case IPPROTO_NONE:      /* RFC 2460 */
+                               /*
+                                * Packet ends here, and IPv6 header has
+                                * already been pulled up. If ip6e_len!=0
+                                * then octets must be ignored.
+                                */
+                               ulp = ip; /* non-NULL to get out of loop. */
+                               break;
+
+                       case IPPROTO_OSPFIGP:
+                               /* XXX OSPF header check? */
+                               PULLUP_TO(hlen, ulp, struct ip6_ext);
+                               break;
+
+                       case IPPROTO_PIM:
+                               /* XXX PIM header check? */
+                               PULLUP_TO(hlen, ulp, struct pim);
+                               break;
+
+                       case IPPROTO_CARP:
+                               PULLUP_TO(hlen, ulp, struct carp_header);
+                               if (((struct carp_header *)ulp)->carp_version !=
+                                   CARP_VERSION) 
+                                       return (IP_FW_DENY);
+                               if (((struct carp_header *)ulp)->carp_type !=
+                                   CARP_ADVERTISEMENT) 
+                                       return (IP_FW_DENY);
+                               break;
+
+                       case IPPROTO_IPV6:      /* RFC 2893 */
+                               PULLUP_TO(hlen, ulp, struct ip6_hdr);
+                               break;
+
+                       case IPPROTO_IPV4:      /* RFC 2893 */
+                               PULLUP_TO(hlen, ulp, struct ip);
+                               break;
+
+                       default:
+                               printf("IPFW2: IPV6 - Unknown Extension "
+                                   "Header(%d), ext_hd=%x\n", proto, ext_hd);
+                               if (V_fw_deny_unknown_exthdrs)
+                                   return (IP_FW_DENY);
+                               PULLUP_TO(hlen, ulp, struct ip6_ext);
+                               break;
+                       } /*switch */
+               }
+               ip = mtod(m, struct ip *);
+               ip6 = (struct ip6_hdr *)ip;
+               args->f_id.src_ip6 = ip6->ip6_src;
+               args->f_id.dst_ip6 = ip6->ip6_dst;
+               args->f_id.src_ip = 0;
+               args->f_id.dst_ip = 0;
+               args->f_id.flow_id6 = ntohl(ip6->ip6_flow);
+       } else if (pktlen >= sizeof(struct ip) &&
+           (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) {
+               is_ipv4 = 1;
+               hlen = ip->ip_hl << 2;
+               args->f_id.addr_type = 4;
+
+               /*
+                * Collect parameters into local variables for faster matching.
+                */
+               proto = ip->ip_p;
+               src_ip = ip->ip_src;
+               dst_ip = ip->ip_dst;
+               offset = ntohs(ip->ip_off) & IP_OFFMASK;
+               iplen = ntohs(ip->ip_len);
+               pktlen = iplen < pktlen ? iplen : pktlen;
+
+               if (offset == 0) {
+                       switch (proto) {
+                       case IPPROTO_TCP:
+                               PULLUP_TO(hlen, ulp, struct tcphdr);
+                               dst_port = TCP(ulp)->th_dport;
+                               src_port = TCP(ulp)->th_sport;
+                               args->f_id.flags = TCP(ulp)->th_flags;
+                               break;
+
+                       case IPPROTO_UDP:
+                               PULLUP_TO(hlen, ulp, struct udphdr);
+                               dst_port = UDP(ulp)->uh_dport;
+                               src_port = UDP(ulp)->uh_sport;
+                               break;
+
+                       case IPPROTO_ICMP:
+                               PULLUP_TO(hlen, ulp, struct icmphdr);
+                               args->f_id.flags = ICMP(ulp)->icmp_type;
+                               break;
+
+                       default:
+                               break;
+                       }
+               }
+
+               ip = mtod(m, struct ip *);
+               args->f_id.src_ip = ntohl(src_ip.s_addr);
+               args->f_id.dst_ip = ntohl(dst_ip.s_addr);
+       }
+#undef PULLUP_TO
+       if (proto) { /* we may have port numbers, store them */
+               args->f_id.proto = proto;
+               args->f_id.src_port = src_port = ntohs(src_port);
+               args->f_id.dst_port = dst_port = ntohs(dst_port);
+       }
+
+       IPFW_RLOCK(chain);
+       if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */
+               IPFW_RUNLOCK(chain);
+               return (IP_FW_PASS);    /* accept */
+       }
+       if (args->rule.slot) {
+               /*
+                * Packet has already been tagged as a result of a previous
+                * match on rule args->rule aka args->rule_id (PIPE, QUEUE,
+                * REASS, NETGRAPH, DIVERT/TEE...)
+                * Validate the slot and continue from the next one
+                * if still present, otherwise do a lookup.
+                */
+               f_pos = (args->rule.chain_id == chain->id) ?
+                   args->rule.slot :
+                   ipfw_find_rule(chain, args->rule.rulenum,
+                       args->rule.rule_id);
+       } else {
+               f_pos = 0;
+       }
+
+       /*
+        * Now scan the rules, and parse microinstructions for each rule.
+        * We have two nested loops and an inner switch. Sometimes we
+        * need to break out of one or both loops, or re-enter one of
+        * the loops with updated variables. Loop variables are:
+        *
+        *      f_pos (outer loop) points to the current rule.
+        *              On output it points to the matching rule.
+        *      done (outer loop) is used as a flag to break the loop.
+        *      l (inner loop)  residual length of current rule.
+        *              cmd points to the current microinstruction.
+        *
+        * We break the inner loop by setting l=0 and possibly
+        * cmdlen=0 if we don't want to advance cmd.
+        * We break the outer loop by setting done=1
+        * We can restart the inner loop by setting l>0 and f_pos, f, cmd
+        * as needed.
+        */
+       for (; f_pos < chain->n_rules; f_pos++) {
+               ipfw_insn *cmd;
+               uint32_t tablearg = 0;
+               int l, cmdlen, skip_or; /* skip rest of OR block */
+               struct ip_fw *f;
+
+               f = chain->map[f_pos];
+               if (V_set_disable & (1 << f->set) )
+                       continue;
+
+               skip_or = 0;
+               for (l = f->cmd_len, cmd = f->cmd ; l > 0 ;
+                   l -= cmdlen, cmd += cmdlen) {
+                       int match;
+
+                       /*
+                        * check_body is a jump target used when we find a
+                        * CHECK_STATE, and need to jump to the body of
+                        * the target rule.
+                        */
+
+/* check_body: */
+                       cmdlen = F_LEN(cmd);
+                       /*
+                        * An OR block (insn_1 || .. || insn_n) has the
+                        * F_OR bit set in all but the last instruction.
+                        * The first match will set "skip_or", and cause
+                        * the following instructions to be skipped until
+                        * past the one with the F_OR bit clear.
+                        */
+                       if (skip_or) {          /* skip this instruction */
+                               if ((cmd->len & F_OR) == 0)
+                                       skip_or = 0;    /* next one is good */
+                               continue;
+                       }
+                       match = 0; /* set to 1 if we succeed */
+
+                       switch (cmd->opcode) {
+                       /*
+                        * The first set of opcodes compares the packet's
+                        * fields with some pattern, setting 'match' if a
+                        * match is found. At the end of the loop there is
+                        * logic to deal with F_NOT and F_OR flags associated
+                        * with the opcode.
+                        */
+                       case O_NOP:
+                               match = 1;
+                               break;
+
+                       case O_FORWARD_MAC:
+                               printf("ipfw: opcode %d unimplemented\n",
+                                   cmd->opcode);
+                               break;
+
+                       case O_GID:
+                       case O_UID:
+                       case O_JAIL:
+                               /*
+                                * We only check offset == 0 && proto != 0,
+                                * as this ensures that we have a
+                                * packet with the ports info.
+                                */
+                               if (offset!=0)
+                                       break;
+                               if (is_ipv6) /* XXX to be fixed later */
+                                       break;
+                               if (proto == IPPROTO_TCP ||
+                                   proto == IPPROTO_UDP)
+                                       match = check_uidgid(
+                                                   (ipfw_insn_u32 *)cmd,
+                                                   proto, oif,
+                                                   dst_ip, dst_port,
+                                                   src_ip, src_port, (void *)&ucred_cache,
+                                                   &ucred_lookup, (struct inpcb *)args->m);
+                               break;
+
+                       case O_RECV:
+                               match = iface_match(m->m_pkthdr.rcvif,
+                                   (ipfw_insn_if *)cmd);
+                               break;
+
+                       case O_XMIT:
+                               match = iface_match(oif, (ipfw_insn_if *)cmd);
+                               break;
+
+                       case O_VIA:
+                               match = iface_match(oif ? oif :
+                                   m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
+                               break;
+
+                       case O_MACADDR2:
+                               if (args->eh != NULL) { /* have MAC header */
+                                       u_int32_t *want = (u_int32_t *)
+                                               ((ipfw_insn_mac *)cmd)->addr;
+                                       u_int32_t *mask = (u_int32_t *)
+                                               ((ipfw_insn_mac *)cmd)->mask;
+                                       u_int32_t *hdr = (u_int32_t *)args->eh;
+
+                                       match =
+                                           ( want[0] == (hdr[0] & mask[0]) &&
+                                             want[1] == (hdr[1] & mask[1]) &&
+                                             want[2] == (hdr[2] & mask[2]) );
+                               }
+                               break;
+
+                       case O_MAC_TYPE:
+                               if (args->eh != NULL) {
+                                       u_int16_t *p =
+                                           ((ipfw_insn_u16 *)cmd)->ports;
+                                       int i;
+
+                                       for (i = cmdlen - 1; !match && i>0;
+                                           i--, p += 2)
+                                               match = (etype >= p[0] &&
+                                                   etype <= p[1]);
+                               }
+                               break;
+
+                       case O_FRAG:
+                               match = (offset != 0);
+                               break;
+
+                       case O_IN:      /* "out" is "not in" */
+                               match = (oif == NULL);
+                               break;
+
+                       case O_LAYER2:
+                               match = (args->eh != NULL);
+                               break;
+
+                       case O_DIVERTED:
+                           {
+                               /* For diverted packets, args->rule.info
+                                * contains the divert port (in host format)
+                                * reason and direction.
+                                */
+                               uint32_t i = args->rule.info;
+                               match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT &&
+                                   cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2);
+                           }
+                               break;
+
+                       case O_PROTO:
+                               /*
+                                * We do not allow an arg of 0 so the
+                                * check of "proto" only suffices.
+                                */
+                               match = (proto == cmd->arg1);
+                               break;
+
+                       case O_IP_SRC:
+                               match = is_ipv4 &&
+                                   (((ipfw_insn_ip *)cmd)->addr.s_addr ==
+                                   src_ip.s_addr);
+                               break;
+
+                       case O_IP_SRC_LOOKUP:
+                       case O_IP_DST_LOOKUP:
+                               if (is_ipv4) {
+                                   uint32_t key =
+                                       (cmd->opcode == O_IP_DST_LOOKUP) ?
+                                           dst_ip.s_addr : src_ip.s_addr;
+                                   uint32_t v = 0;
+
+                                   if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) {
+                                       /* generic lookup. The key must be
+                                        * in 32bit big-endian format.
+                                        */
+                                       v = ((ipfw_insn_u32 *)cmd)->d[1];
+                                       if (v == 0)
+                                           key = dst_ip.s_addr;
+                                       else if (v == 1)
+                                           key = src_ip.s_addr;
+                                       else if (offset != 0)
+                                           break;
+                                       else if (proto != IPPROTO_TCP &&
+                                               proto != IPPROTO_UDP)
+                                           break;
+                                       else if (v == 2)
+                                           key = htonl(dst_port);
+                                       else if (v == 3)
+                                           key = htonl(src_port);
+                                       else if (v == 4 || v == 5) {
+                                           check_uidgid(
+                                               (ipfw_insn_u32 *)cmd,
+                                               proto, oif,
+                                               dst_ip, dst_port,
+                                               src_ip, src_port, (void *)&ucred_cache,
+                                               &ucred_lookup, (struct inpcb *)args->m);
+#ifdef __linux__
+                                           if (v ==4 /* O_UID */)
+                                               key = ucred_cache.uid;
+                                           else if (v == 5 /* O_JAIL */)
+                                               key = ucred_cache.xid;
+#else
+                                           if (v == 4 /* O_UID */)
+                                               key = ucred_cache->cr_uid;
+                                           else if (v == 5 /* O_JAIL */)
+                                               key = ucred_cache->cr_prison->pr_id;
+#endif
+                                           key = htonl(key);
+                                       } else
+                                           break;
+                                   }
+                                   match = ipfw_lookup_table(chain,
+                                       cmd->arg1, key, &v);
+                                   if (!match)
+                                       break;
+                                   if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
+                                       match =
+                                           ((ipfw_insn_u32 *)cmd)->d[0] == v;
+                                   else
+                                       tablearg = v;
+                               }
+                               break;
+
+                       case O_IP_SRC_MASK:
+                       case O_IP_DST_MASK:
+                               if (is_ipv4) {
+                                   uint32_t a =
+                                       (cmd->opcode == O_IP_DST_MASK) ?
+                                           dst_ip.s_addr : src_ip.s_addr;
+                                   uint32_t *p = ((ipfw_insn_u32 *)cmd)->d;
+                                   int i = cmdlen-1;
+
+                                   for (; !match && i>0; i-= 2, p+= 2)
+                                       match = (p[0] == (a & p[1]));
+                               }
+                               break;
+
+                       case O_IP_SRC_ME:
+                               if (is_ipv4) {
+                                       struct ifnet *tif;
+
+                                       INADDR_TO_IFP(src_ip, tif);
+                                       match = (tif != NULL);
+                               }
+                               break;
+
+                       case O_IP_DST_SET:
+                       case O_IP_SRC_SET:
+                               if (is_ipv4) {
+                                       u_int32_t *d = (u_int32_t *)(cmd+1);
+                                       u_int32_t addr =
+                                           cmd->opcode == O_IP_DST_SET ?
+                                               args->f_id.dst_ip :
+                                               args->f_id.src_ip;
+
+                                           if (addr < d[0])
+                                                   break;
+                                           addr -= d[0]; /* subtract base */
+                                           match = (addr < cmd->arg1) &&
+                                               ( d[ 1 + (addr>>5)] &
+                                                 (1<<(addr & 0x1f)) );
+                               }
+                               break;
+
+                       case O_IP_DST:
+                               match = is_ipv4 &&
+                                   (((ipfw_insn_ip *)cmd)->addr.s_addr ==
+                                   dst_ip.s_addr);
+                               break;
+
+                       case O_IP_DST_ME:
+                               if (is_ipv4) {
+                                       struct ifnet *tif;
+
+                                       INADDR_TO_IFP(dst_ip, tif);
+                                       match = (tif != NULL);
+                               }
+                               break;
+
+                       case O_IP_SRCPORT:
+                       case O_IP_DSTPORT:
+                               /*
+                                * offset == 0 && proto != 0 is enough
+                                * to guarantee that we have a
+                                * packet with port info.
+                                */
+                               if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP)
+                                   && offset == 0) {
+                                       u_int16_t x =
+                                           (cmd->opcode == O_IP_SRCPORT) ?
+                                               src_port : dst_port ;
+                                       u_int16_t *p =
+                                           ((ipfw_insn_u16 *)cmd)->ports;
+                                       int i;
+
+                                       for (i = cmdlen - 1; !match && i>0;
+                                           i--, p += 2)
+                                               match = (x>=p[0] && x<=p[1]);
+                               }
+                               break;
+
+                       case O_ICMPTYPE:
+                               match = (offset == 0 && proto==IPPROTO_ICMP &&
+                                   icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) );
+                               break;
+
+#ifdef INET6
+                       case O_ICMP6TYPE:
+                               match = is_ipv6 && offset == 0 &&
+                                   proto==IPPROTO_ICMPV6 &&
+                                   icmp6type_match(
+                                       ICMP6(ulp)->icmp6_type,
+                                       (ipfw_insn_u32 *)cmd);
+                               break;
+#endif /* INET6 */
+
+                       case O_IPOPT:
+                               match = (is_ipv4 &&
+                                   ipopts_match(ip, cmd) );
+                               break;
+
+                       case O_IPVER:
+                               match = (is_ipv4 &&
+                                   cmd->arg1 == ip->ip_v);
+                               break;
+
+                       case O_IPID:
+                       case O_IPLEN:
+                       case O_IPTTL:
+                               if (is_ipv4) {  /* only for IP packets */
+                                   uint16_t x;
+                                   uint16_t *p;
+                                   int i;
+
+                                   if (cmd->opcode == O_IPLEN)
+                                       x = iplen;
+                                   else if (cmd->opcode == O_IPTTL)
+                                       x = ip->ip_ttl;
+                                   else /* must be IPID */
+                                       x = ntohs(ip->ip_id);
+                                   if (cmdlen == 1) {
+                                       match = (cmd->arg1 == x);
+                                       break;
+                                   }
+                                   /* otherwise we have ranges */
+                                   p = ((ipfw_insn_u16 *)cmd)->ports;
+                                   i = cmdlen - 1;
+                                   for (; !match && i>0; i--, p += 2)
+                                       match = (x >= p[0] && x <= p[1]);
+                               }
+                               break;
+
+                       case O_IPPRECEDENCE:
+                               match = (is_ipv4 &&
+                                   (cmd->arg1 == (ip->ip_tos & 0xe0)) );
+                               break;
+
+                       case O_IPTOS:
+                               match = (is_ipv4 &&
+                                   flags_match(cmd, ip->ip_tos));
+                               break;
+
+                       case O_TCPDATALEN:
+                               if (proto == IPPROTO_TCP && offset == 0) {
+                                   struct tcphdr *tcp;
+                                   uint16_t x;
+                                   uint16_t *p;
+                                   int i;
+
+                                   tcp = TCP(ulp);
+                                   x = iplen -
+                                       ((ip->ip_hl + tcp->th_off) << 2);
+                                   if (cmdlen == 1) {
+                                       match = (cmd->arg1 == x);
+                                       break;
+                                   }
+                                   /* otherwise we have ranges */
+                                   p = ((ipfw_insn_u16 *)cmd)->ports;
+                                   i = cmdlen - 1;
+                                   for (; !match && i>0; i--, p += 2)
+                                       match = (x >= p[0] && x <= p[1]);
+                               }
+                               break;
+
+                       case O_TCPFLAGS:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   flags_match(cmd, TCP(ulp)->th_flags));
+                               break;
+
+                       case O_TCPOPTS:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   tcpopts_match(TCP(ulp), cmd));
+                               break;
+
+                       case O_TCPSEQ:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   ((ipfw_insn_u32 *)cmd)->d[0] ==
+                                       TCP(ulp)->th_seq);
+                               break;
+
+                       case O_TCPACK:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   ((ipfw_insn_u32 *)cmd)->d[0] ==
+                                       TCP(ulp)->th_ack);
+                               break;
+
+                       case O_TCPWIN:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   cmd->arg1 == TCP(ulp)->th_win);
+                               break;
+
+                       case O_ESTAB:
+                               /* reject packets which have SYN only */
+                               /* XXX should i also check for TH_ACK ? */
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   (TCP(ulp)->th_flags &
+                                    (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
+                               break;
+
+                       case O_ALTQ: {
+                               struct pf_mtag *at;
+                               ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
+
+                               match = 1;
+                               at = pf_find_mtag(m);
+                               if (at != NULL && at->qid != 0)
+                                       break;
+                               at = pf_get_mtag(m);
+                               if (at == NULL) {
+                                       /*
+                                        * Let the packet fall back to the
+                                        * default ALTQ.
+                                        */
+                                       break;
+                               }
+                               at->qid = altq->qid;
+                               if (is_ipv4)
+                                       at->af = AF_INET;
+                               else
+                                       at->af = AF_LINK;
+                               at->hdr = ip;
+                               break;
+                       }
+
+                       case O_LOG:
+                                       ipfw_log(f, hlen, args, m,
+                                           oif, offset, tablearg, ip);
+                               match = 1;
+                               break;
+
+                       case O_PROB:
+                               match = (random()<((ipfw_insn_u32 *)cmd)->d[0]);
+                               break;
+
+                       case O_VERREVPATH:
+                               /* Outgoing packets automatically pass/match */
+                               match = ((oif != NULL) ||
+                                   (m->m_pkthdr.rcvif == NULL) ||
+                                   (
+#ifdef INET6
+                                   is_ipv6 ?
+                                       verify_path6(&(args->f_id.src_ip6),
+                                           m->m_pkthdr.rcvif) :
+#endif
+                                   verify_path(src_ip, m->m_pkthdr.rcvif,
+                                       args->f_id.fib)));
+                               break;
+
+                       case O_VERSRCREACH:
+                               /* Outgoing packets automatically pass/match */
+                               match = (hlen > 0 && ((oif != NULL) ||
+#ifdef INET6
+                                   is_ipv6 ?
+                                       verify_path6(&(args->f_id.src_ip6),
+                                           NULL) :
+#endif
+                                   verify_path(src_ip, NULL, args->f_id.fib)));
+                               break;
+
+                       case O_ANTISPOOF:
+                               /* Outgoing packets automatically pass/match */
+                               if (oif == NULL && hlen > 0 &&
+                                   (  (is_ipv4 && in_localaddr(src_ip))
+#ifdef INET6
+                                   || (is_ipv6 &&
+                                       in6_localaddr(&(args->f_id.src_ip6)))
+#endif
+                                   ))
+                                       match =
+#ifdef INET6
+                                           is_ipv6 ? verify_path6(
+                                               &(args->f_id.src_ip6),
+                                               m->m_pkthdr.rcvif) :
+#endif
+                                           verify_path(src_ip,
+                                               m->m_pkthdr.rcvif,
+                                               args->f_id.fib);
+                               else
+                                       match = 1;
+                               break;
+
+                       case O_IPSEC:
+#ifdef IPSEC
+                               match = (m_tag_find(m,
+                                   PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL);
+#endif
+                               /* otherwise no match */
+                               break;
+
+#ifdef INET6
+                       case O_IP6_SRC:
+                               match = is_ipv6 &&
+                                   IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6,
+                                   &((ipfw_insn_ip6 *)cmd)->addr6);
+                               break;
+
+                       case O_IP6_DST:
+                               match = is_ipv6 &&
+                               IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6,
+                                   &((ipfw_insn_ip6 *)cmd)->addr6);
+                               break;
+                       case O_IP6_SRC_MASK:
+                       case O_IP6_DST_MASK:
+                               if (is_ipv6) {
+                                       int i = cmdlen - 1;
+                                       struct in6_addr p;
+                                       struct in6_addr *d =
+                                           &((ipfw_insn_ip6 *)cmd)->addr6;
+
+                                       for (; !match && i > 0; d += 2,
+                                           i -= F_INSN_SIZE(struct in6_addr)
+                                           * 2) {
+                                               p = (cmd->opcode ==
+                                                   O_IP6_SRC_MASK) ?
+                                                   args->f_id.src_ip6:
+                                                   args->f_id.dst_ip6;
+                                               APPLY_MASK(&p, &d[1]);
+                                               match =
+                                                   IN6_ARE_ADDR_EQUAL(&d[0],
+                                                   &p);
+                                       }
+                               }
+                               break;
+
+                       case O_IP6_SRC_ME:
+                               match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6);
+                               break;
+
+                       case O_IP6_DST_ME:
+                               match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6);
+                               break;
+
+                       case O_FLOW6ID:
+                               match = is_ipv6 &&
+                                   flow6id_match(args->f_id.flow_id6,
+                                   (ipfw_insn_u32 *) cmd);
+                               break;
+
+                       case O_EXT_HDR:
+                               match = is_ipv6 &&
+                                   (ext_hd & ((ipfw_insn *) cmd)->arg1);
+                               break;
+
+                       case O_IP6:
+                               match = is_ipv6;
+                               break;
+#endif
+
+                       case O_IP4:
+                               match = is_ipv4;
+                               break;
+
+                       case O_TAG: {
+                               struct m_tag *mtag;
+                               uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                   tablearg : cmd->arg1;
+
+                               /* Packet is already tagged with this tag? */
+                               mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL);
+
+                               /* We have `untag' action when F_NOT flag is
+                                * present. And we must remove this mtag from
+                                * mbuf and reset `match' to zero (`match' will
+                                * be inversed later).
+                                * Otherwise we should allocate new mtag and
+                                * push it into mbuf.
+                                */
+                               if (cmd->len & F_NOT) { /* `untag' action */
+                                       if (mtag != NULL)
+                                               m_tag_delete(m, mtag);
+                                       match = 0;
+                               } else if (mtag == NULL) {
+                                       if ((mtag = m_tag_alloc(MTAG_IPFW,
+                                           tag, 0, M_NOWAIT)) != NULL)
+                                               m_tag_prepend(m, mtag);
+                                       match = 1;
+                               }
+                               break;
+                       }
+
+                       case O_FIB: /* try match the specified fib */
+                               if (args->f_id.fib == cmd->arg1)
+                                       match = 1;
+                               break;
+
+                       case O_TAGGED: {
+                               struct m_tag *mtag;
+                               uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                   tablearg : cmd->arg1;
+
+                               if (cmdlen == 1) {
+                                       match = m_tag_locate(m, MTAG_IPFW,
+                                           tag, NULL) != NULL;
+                                       break;
+                               }
+
+                               /* we have ranges */
+                               for (mtag = m_tag_first(m);
+                                   mtag != NULL && !match;
+                                   mtag = m_tag_next(m, mtag)) {
+                                       uint16_t *p;
+                                       int i;
+
+                                       if (mtag->m_tag_cookie != MTAG_IPFW)
+                                               continue;
+
+                                       p = ((ipfw_insn_u16 *)cmd)->ports;
+                                       i = cmdlen - 1;
+                                       for(; !match && i > 0; i--, p += 2)
+                                               match =
+                                                   mtag->m_tag_id >= p[0] &&
+                                                   mtag->m_tag_id <= p[1];
+                               }
+                               break;
+                       }
+                               
+                       /*
+                        * The second set of opcodes represents 'actions',
+                        * i.e. the terminal part of a rule once the packet
+                        * matches all previous patterns.
+                        * Typically there is only one action for each rule,
+                        * and the opcode is stored at the end of the rule
+                        * (but there are exceptions -- see below).
+                        *
+                        * In general, here we set retval and terminate the
+                        * outer loop (would be a 'break 3' in some language,
+                        * but we need to set l=0, done=1)
+                        *
+                        * Exceptions:
+                        * O_COUNT and O_SKIPTO actions:
+                        *   instead of terminating, we jump to the next rule
+                        *   (setting l=0), or to the SKIPTO target (setting
+                        *   f/f_len, cmd and l as needed), respectively.
+                        *
+                        * O_TAG, O_LOG and O_ALTQ action parameters:
+                        *   perform some action and set match = 1;
+                        *
+                        * O_LIMIT and O_KEEP_STATE: these opcodes are
+                        *   not real 'actions', and are stored right
+                        *   before the 'action' part of the rule.
+                        *   These opcodes try to install an entry in the
+                        *   state tables; if successful, we continue with
+                        *   the next opcode (match=1; break;), otherwise
+                        *   the packet must be dropped (set retval,
+                        *   break loops with l=0, done=1)
+                        *
+                        * O_PROBE_STATE and O_CHECK_STATE: these opcodes
+                        *   cause a lookup of the state table, and a jump
+                        *   to the 'action' part of the parent rule
+                        *   if an entry is found, or
+                        *   (CHECK_STATE only) a jump to the next rule if
+                        *   the entry is not found.
+                        *   The result of the lookup is cached so that
+                        *   further instances of these opcodes become NOPs.
+                        *   The jump to the next rule is done by setting
+                        *   l=0, cmdlen=0.
+                        */
+                       case O_LIMIT:
+                       case O_KEEP_STATE:
+                               if (ipfw_install_state(f,
+                                   (ipfw_insn_limit *)cmd, args, tablearg)) {
+                                       /* error or limit violation */
+                                       retval = IP_FW_DENY;
+                                       l = 0;  /* exit inner loop */
+                                       done = 1; /* exit outer loop */
+                               }
+                               match = 1;
+                               break;
+
+                       case O_PROBE_STATE:
+                       case O_CHECK_STATE:
+                               /*
+                                * dynamic rules are checked at the first
+                                * keep-state or check-state occurrence,
+                                * with the result being stored in dyn_dir.
+                                * The compiler introduces a PROBE_STATE
+                                * instruction for us when we have a
+                                * KEEP_STATE (because PROBE_STATE needs
+                                * to be run first).
+                                */
+                               if (dyn_dir == MATCH_UNKNOWN &&
+                                   (q = ipfw_lookup_dyn_rule(&args->f_id,
+                                    &dyn_dir, proto == IPPROTO_TCP ?
+                                       TCP(ulp) : NULL))
+                                       != NULL) {
+                                       /*
+                                        * Found dynamic entry, update stats
+                                        * and jump to the 'action' part of
+                                        * the parent rule by setting
+                                        * f, cmd, l and clearing cmdlen.
+                                        */
+                                       q->pcnt++;
+                                       q->bcnt += pktlen;
+                                       /* XXX we would like to have f_pos
+                                        * readily accessible in the dynamic
+                                        * rule, instead of having to
+                                        * lookup q->rule.
+                                        */
+                                       f = q->rule;
+                                       f_pos = ipfw_find_rule(chain,
+                                               f->rulenum, f->id);
+                                       cmd = ACTION_PTR(f);
+                                       l = f->cmd_len - f->act_ofs;
+                                       ipfw_dyn_unlock();
+                                       cmdlen = 0;
+                                       match = 1;
+                                       break;
+                               }
+                               /*
+                                * Dynamic entry not found. If CHECK_STATE,
+                                * skip to next rule, if PROBE_STATE just
+                                * ignore and continue with next opcode.
+                                */
+                               if (cmd->opcode == O_CHECK_STATE)
+                                       l = 0;  /* exit inner loop */
+                               match = 1;
+                               break;
+
+                       case O_ACCEPT:
+                               retval = 0;     /* accept */
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_PIPE:
+                       case O_QUEUE:
+                               set_match(args, f_pos, chain);
+                               args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                       tablearg : cmd->arg1;
+                               if (cmd->opcode == O_PIPE)
+                                       args->rule.info |= IPFW_IS_PIPE;
+                               if (V_fw_one_pass)
+                                       args->rule.info |= IPFW_ONEPASS;
+                               retval = IP_FW_DUMMYNET;
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_DIVERT:
+                       case O_TEE:
+                               if (args->eh) /* not on layer 2 */
+                                   break;
+                               /* otherwise this is terminal */
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               retval = (cmd->opcode == O_DIVERT) ?
+                                       IP_FW_DIVERT : IP_FW_TEE;
+                               set_match(args, f_pos, chain);
+                               args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                   tablearg : cmd->arg1;
+                               break;
+
+                       case O_COUNT:
+                               f->pcnt++;      /* update stats */
+                               f->bcnt += pktlen;
+                               f->timestamp = time_uptime;
+                               l = 0;          /* exit inner loop */
+                               break;
+
+                       case O_SKIPTO:
+                               f->pcnt++;      /* update stats */
+                               f->bcnt += pktlen;
+                               f->timestamp = time_uptime;
+                           /* If possible use cached f_pos (in f->next_rule),
+                            * whose version is written in f->next_rule
+                            * (horrible hacks to avoid changing the ABI).
+                            */
+                           if (cmd->arg1 != IP_FW_TABLEARG &&
+                                   (uintptr_t)f->x_next == chain->id) {
+                               f_pos = (uintptr_t)f->next_rule;
+                               } else {
+                               int i = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                       tablearg : cmd->arg1;
+                               /* make sure we do not jump backward */
+                               if (i <= f->rulenum)
+                                   i = f->rulenum + 1;
+                               f_pos = ipfw_find_rule(chain, i, 0);
+                               /* update the cache */
+                               if (cmd->arg1 != IP_FW_TABLEARG) {
+                                   f->next_rule =
+                                       (void *)(uintptr_t)f_pos;
+                                   f->x_next =
+                                       (void *)(uintptr_t)chain->id;
+                               }
+                               }
+                               /*
+                            * Skip disabled rules, and re-enter
+                            * the inner loop with the correct
+                            * f_pos, f, l and cmd.
+                                * Also clear cmdlen and skip_or
+                                */
+                           for (; f_pos < chain->n_rules - 1 &&
+                                   (V_set_disable &
+                                    (1 << chain->map[f_pos]->set));
+                                   f_pos++)
+                               ;
+                           /* prepare to enter the inner loop */
+                           f = chain->map[f_pos];
+                                       l = f->cmd_len;
+                                       cmd = f->cmd;
+                               match = 1;
+                               cmdlen = 0;
+                               skip_or = 0;
+                               break;
+
+                       case O_REJECT:
+                               /*
+                                * Drop the packet and send a reject notice
+                                * if the packet is not ICMP (or is an ICMP
+                                * query), and it is not multicast/broadcast.
+                                */
+                               if (hlen > 0 && is_ipv4 && offset == 0 &&
+                                   (proto != IPPROTO_ICMP ||
+                                    is_icmp_query(ICMP(ulp))) &&
+                                   !(m->m_flags & (M_BCAST|M_MCAST)) &&
+                                   !IN_MULTICAST(ntohl(dst_ip.s_addr))) {
+                                       send_reject(args, cmd->arg1, iplen, ip);
+                                       m = args->m;
+                               }
+                               /* FALLTHROUGH */
+#ifdef INET6
+                       case O_UNREACH6:
+                               if (hlen > 0 && is_ipv6 &&
+                                   ((offset & IP6F_OFF_MASK) == 0) &&
+                                   (proto != IPPROTO_ICMPV6 ||
+                                    (is_icmp6_query(args->f_id.flags) == 1)) &&
+                                   !(m->m_flags & (M_BCAST|M_MCAST)) &&
+                                   !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) {
+                                       send_reject6(
+                                           args, cmd->arg1, hlen,
+                                           (struct ip6_hdr *)ip);
+                                       m = args->m;
+                               }
+                               /* FALLTHROUGH */
+#endif
+                       case O_DENY:
+                               retval = IP_FW_DENY;
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_FORWARD_IP:
+                               if (args->eh)   /* not valid on layer2 pkts */
+                                       break;
+                               if (!q || dyn_dir == MATCH_FORWARD) {
+                                   struct sockaddr_in *sa;
+                                   sa = &(((ipfw_insn_sa *)cmd)->sa);
+                                   if (sa->sin_addr.s_addr == INADDR_ANY) {
+                                       bcopy(sa, &args->hopstore,
+                                                       sizeof(*sa));
+                                       args->hopstore.sin_addr.s_addr =
+                                                   htonl(tablearg);
+                                       args->next_hop = &args->hopstore;
+                                   } else {
+                                       args->next_hop = sa;
+                                   }
+                               }
+                               retval = IP_FW_PASS;
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_NETGRAPH:
+                       case O_NGTEE:
+                               set_match(args, f_pos, chain);
+                               args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                       tablearg : cmd->arg1;
+                               retval = (cmd->opcode == O_NETGRAPH) ?
+                                   IP_FW_NETGRAPH : IP_FW_NGTEE;
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_SETFIB:
+                               f->pcnt++;      /* update stats */
+                               f->bcnt += pktlen;
+                               f->timestamp = time_uptime;
+                               M_SETFIB(m, cmd->arg1);
+                               args->f_id.fib = cmd->arg1;
+                               l = 0;          /* exit inner loop */
+                               break;
+
+                       case O_NAT:
+                               if (!IPFW_NAT_LOADED) {
+                                   retval = IP_FW_DENY;
+                               } else {
+                                   struct cfg_nat *t;
+                                   int nat_id;
+
+                                   set_match(args, f_pos, chain);
+                                   t = ((ipfw_insn_nat *)cmd)->nat;
+                                   if (t == NULL) {
+                                       nat_id = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                               tablearg : cmd->arg1;
+                                       t = (*lookup_nat_ptr)(&chain->nat, nat_id);
+
+                                       if (t == NULL) {
+                                           retval = IP_FW_DENY;
+                                           l = 0;      /* exit inner loop */
+                                           done = 1;   /* exit outer loop */
+                                           break;
+                                       }
+                                       if (cmd->arg1 != IP_FW_TABLEARG)
+                                           ((ipfw_insn_nat *)cmd)->nat = t;
+                                   }
+                                   retval = ipfw_nat_ptr(args, t, m);
+                               }
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_REASS: {
+                               int ip_off;
+
+                               f->pcnt++;
+                               f->bcnt += pktlen;
+                               l = 0;  /* in any case exit inner loop */
+                               ip_off = ntohs(ip->ip_off);
+
+                               /* if not fragmented, go to next rule */
+                               if ((ip_off & (IP_MF | IP_OFFMASK)) == 0)
+                                   break;
+                               /* 
+                                * ip_reass() expects len & off in host
+                                * byte order.
+                                */
+                               SET_HOST_IPLEN(ip);
+
+                               args->m = m = ip_reass(m);
+
+                               /*
+                                * do IP header checksum fixup.
+                                */
+                               if (m == NULL) { /* fragment got swallowed */
+                                   retval = IP_FW_DENY;
+                               } else { /* good, packet complete */
+                                   int hlen;
+
+                                   ip = mtod(m, struct ip *);
+                                   hlen = ip->ip_hl << 2;
+                                   SET_NET_IPLEN(ip);
+                                   ip->ip_sum = 0;
+                                   if (hlen == sizeof(struct ip))
+                                       ip->ip_sum = in_cksum_hdr(ip);
+                                   else
+                                       ip->ip_sum = in_cksum(m, hlen);
+                                   retval = IP_FW_REASS;
+                                   set_match(args, f_pos, chain);
+                               }
+                               done = 1;       /* exit outer loop */
+                               break;
+                       }
+
+                       default:
+                               panic("-- unknown opcode %d\n", cmd->opcode);
+                       } /* end of switch() on opcodes */
+                       /*
+                        * if we get here with l=0, then match is irrelevant.
+                        */
+
+                       if (cmd->len & F_NOT)
+                               match = !match;
+
+                       if (match) {
+                               if (cmd->len & F_OR)
+                                       skip_or = 1;
+                       } else {
+                               if (!(cmd->len & F_OR)) /* not an OR block, */
+                                       break;          /* try next rule    */
+                       }
+
+               }       /* end of inner loop, scan opcodes */
+
+               if (done)
+                       break;
+
+/* next_rule:; */      /* try next rule                */
+
+       }               /* end of outer for, scan rules */
+
+       if (done) {
+               struct ip_fw *rule = chain->map[f_pos];
+               /* Update statistics */
+               rule->pcnt++;
+               rule->bcnt += pktlen;
+               rule->timestamp = time_uptime;
+       } else {
+               retval = IP_FW_DENY;
+               printf("ipfw: ouch!, skip past end of rules, denying packet\n");
+       }
+       IPFW_RUNLOCK(chain);
+#ifndef __linux__
+       if (ucred_cache != NULL)
+               crfree(ucred_cache);
+#endif
+       return (retval);
+
+pullup_failed:
+       if (V_fw_verbose)
+               printf("ipfw: pullup failed\n");
+       return (IP_FW_DENY);
+}
+
+/*
+ * Module and VNET glue
+ */
+
+/*
+ * Stuff that must be initialised only on boot or module load
+ * (not per-vnet).  Always returns 0: "error" is never modified below.
+ */
+static int
+ipfw_init(void)
+{
+       int error = 0;
+
+       /* Set up the dynamic-rule machinery before announcing ourselves. */
+       ipfw_dyn_attach();
+       /*
+        * Only print out this stuff the first time around,
+        * when called from the sysinit code.
+        */
+       printf("ipfw2 "
+#ifdef INET6
+               "(+ipv6) "
+#endif
+               "initialized, divert %s, nat %s, "
+               "rule-based forwarding "
+#ifdef IPFIREWALL_FORWARD
+               "enabled, "
+#else
+               "disabled, "
+#endif
+               "default to %s, logging ",
+#ifdef IPDIVERT
+               "enabled",
+#else
+               "loadable",
+#endif
+#ifdef IPFIREWALL_NAT
+               "enabled",
+#else
+               "loadable",
+#endif
+               default_to_accept ? "accept" : "deny");
+
+       /*
+        * Note: V_xxx variables can be accessed here but the vnet specific
+        * initializer may not have been called yet for the VIMAGE case.
+        * Tuneables will have been processed. We will print out values for
+        * the default vnet. 
+        * XXX This should all be rationalized AFTER 8.0
+        */
+       /* Finish the "logging " line started by the printf above. */
+       if (V_fw_verbose == 0)
+               printf("disabled\n");
+       else if (V_verbose_limit == 0)
+               printf("unlimited\n");
+       else
+               printf("limited to %d packets/entry by default\n",
+                   V_verbose_limit);
+
+       ipfw_log_bpf(1); /* init */
+       return (error);
+}
+
+/*
+ * Called for the removal of the last instance only on module unload.
+ * Tears down, in reverse order, what ipfw_init() set up.
+ */
+static void
+ipfw_destroy(void)
+{
+
+       ipfw_log_bpf(0); /* uninit */
+       ipfw_dyn_detach();
+       printf("IP firewall unloaded\n");
+}
+
+/*
+ * Stuff that must be initialized for every instance
+ * (including the first of course).
+ * Allocates the rule map and the default rule, initializes the lookup
+ * tables and the dynamic-rule engine, then installs the sockopt/pfil
+ * hooks.  Returns 0 on success, ENOSPC if the initial allocations fail,
+ * or the error from ipfw_attach_hooks().
+ */
+static int
+vnet_ipfw_init(const void *unused)
+{
+       int error;
+       struct ip_fw *rule = NULL;
+       struct ip_fw_chain *chain;
+
+       chain = &V_layer3_chain;
+
+       /* First set up some values that are compile time options */
+       V_autoinc_step = 100;   /* bounded to 1..1000 in add_rule() */
+       V_fw_deny_unknown_exthdrs = 1;
+#ifdef IPFIREWALL_VERBOSE
+       V_fw_verbose = 1;
+#endif
+#ifdef IPFIREWALL_VERBOSE_LIMIT
+       V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
+#endif
+#ifdef IPFIREWALL_NAT
+       LIST_INIT(&chain->nat);
+#endif
+
+       /* insert the default rule and create the initial map */
+       chain->n_rules = 1;
+       chain->static_len = sizeof(struct ip_fw);
+       /*
+        * Both allocations are M_NOWAIT; the second is only attempted
+        * if the first succeeded, and on failure we free whatever was
+        * obtained and report ENOSPC.
+        */
+       chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_NOWAIT | M_ZERO);
+       if (chain->map)
+               rule = malloc(chain->static_len, M_IPFW, M_NOWAIT | M_ZERO);
+       if (rule == NULL) {
+               if (chain->map)
+                       free(chain->map, M_IPFW);
+               printf("ipfw2: ENOSPC initializing default rule "
+                       "(support disabled)\n");
+               return (ENOSPC);
+       }
+       error = ipfw_init_tables(chain);
+       if (error) {
+               panic("init_tables"); /* XXX Marko fix this ! */
+       }
+
+       /* fill and insert the default rule */
+       rule->act_ofs = 0;
+       rule->rulenum = IPFW_DEFAULT_RULE;
+       rule->cmd_len = 1;
+       rule->set = RESVD_SET;
+       rule->cmd[0].len = 1;
+       rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY;
+       /* the default rule is both the head of the list and map slot 0 */
+       chain->rules = chain->default_rule = chain->map[0] = rule;
+       chain->id = rule->id = 1;
+
+       IPFW_LOCK_INIT(chain);
+       ipfw_dyn_init();
+
+       /* First set up some values that are compile time options */
+       V_ipfw_vnet_ready = 1;          /* Open for business */
+
+       /*
+        * Hook the sockopt handler, and the layer2 (V_ip_fw_chk_ptr)
+        * and pfil hooks for ipv4 and ipv6. Even if the latter two fail
+        * we still keep the module alive because the sockopt and
+        * layer2 paths are still useful.
+        * ipfw[6]_hook return 0 on success, ENOENT on failure,
+        * so we can ignore the exact return value and just set a flag.
+        *
+        * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so
+        * changes in the underlying (per-vnet) variables trigger
+        * immediate hook()/unhook() calls.
+        * In layer2 we have the same behaviour, except that V_ether_ipfw
+        * is checked on each packet because there are no pfil hooks.
+        */
+       V_ip_fw_ctl_ptr = ipfw_ctl;
+       V_ip_fw_chk_ptr = ipfw_chk;
+       error = ipfw_attach_hooks(1);
+       return (error);
+}
+
+/*
+ * Called for the removal of each instance.
+ * Unhooks the firewall, drains the dynamic-rule engine, frees every
+ * static rule (including the default rule) and destroys the chain lock.
+ * Always returns 0.
+ */
+static int
+vnet_ipfw_uninit(const void *unused)
+{
+       struct ip_fw *reap, *rule;
+       struct ip_fw_chain *chain = &V_layer3_chain;
+       int i;
+
+       V_ipfw_vnet_ready = 0; /* tell new callers to go away */
+       /*
+        * disconnect from ipv4, ipv6, layer2 and sockopt.
+        * Then grab, release and grab again the WLOCK so we make
+        * sure the update is propagated and nobody will be in.
+        */
+       (void)ipfw_attach_hooks(0 /* detach */);
+       V_ip_fw_chk_ptr = NULL;
+       V_ip_fw_ctl_ptr = NULL;
+       IPFW_UH_WLOCK(chain);
+       IPFW_UH_WUNLOCK(chain);
+       IPFW_UH_WLOCK(chain);
+
+       IPFW_WLOCK(chain);
+       IPFW_WUNLOCK(chain);
+       IPFW_WLOCK(chain);
+
+       ipfw_dyn_uninit(0);     /* run the callout_drain */
+       ipfw_flush_tables(chain);
+       /* thread all rules onto a reap list via x_next, then drop the map */
+       reap = NULL;
+       for (i = 0; i < chain->n_rules; i++) {
+               rule = chain->map[i];
+               rule->x_next = reap;
+               reap = rule;
+       }
+       if (chain->map)
+               free(chain->map, M_IPFW);
+       IPFW_WUNLOCK(chain);
+       IPFW_UH_WUNLOCK(chain);
+       /* the rules themselves are freed outside the locks */
+       if (reap != NULL)
+               ipfw_reap_rules(reap);
+       IPFW_LOCK_DESTROY(chain);
+       ipfw_dyn_uninit(1);     /* free the remaining parts */
+       return 0;
+}
+
+/*
+ * Module event handler.
+ * All real work happens in the SYSINIT/SYSUNINIT and VNET_SYSINIT/
+ * VNET_SYSUNINIT handlers declared below, which express the module and
+ * vnet life cycle more precisely than this callback can, so this is
+ * just a skeleton that acknowledges the events it understands.
+ * Note there is no SYSINIT equivalent of the module SHUTDOWN handler,
+ * but we don't have anything to do in that case anyhow.
+ */
+static int
+ipfw_modevent(module_t mod, int type, void *unused)
+{
+
+       switch (type) {
+       case MOD_LOAD:          /* module load, or system boot if compiled in */
+       case MOD_QUIESCE:       /* before unload; could veto, we never do */
+       case MOD_UNLOAD:        /* unload in progress */
+       case MOD_SHUTDOWN:      /* system shutdown */
+               return 0;
+       default:
+               return EOPNOTSUPP;
+       }
+}
+
+/* Module descriptor: name, event handler, no extra argument. */
+static moduledata_t ipfwmod = {
+       "ipfw",
+       ipfw_modevent,
+       0
+};
+
+/* Define startup order. */
+#define        IPFW_SI_SUB_FIREWALL    SI_SUB_PROTO_IFATTACHDOMAIN
+#define        IPFW_MODEVENT_ORDER     (SI_ORDER_ANY - 255) /* On boot slot in here. */
+#define        IPFW_MODULE_ORDER       (IPFW_MODEVENT_ORDER + 1) /* A little later. */
+#define        IPFW_VNET_ORDER         (IPFW_MODEVENT_ORDER + 2) /* Later still. */
+
+DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER);
+MODULE_VERSION(ipfw, 2);
+/* should declare some dependencies here */
+
+/*
+ * Starting up. Done in order after ipfwmod() has been called.
+ * VNET_SYSINIT is also called for each existing vnet and each new vnet.
+ */
+SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
+           ipfw_init, NULL);
+VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
+           vnet_ipfw_init, NULL);
+/*
+ * Closing up shop. These are done in REVERSE ORDER, but still
+ * after ipfwmod() has been called. Not called on reboot.
+ * VNET_SYSUNINIT is also called for each exiting vnet as it exits,
+ * or when the module is unloaded.
+ */
+SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
+           ipfw_destroy, NULL);
+VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
+           vnet_ipfw_uninit, NULL);
+/* end of file */
diff --git a/dummynet2/ip_fw_dynamic.c b/dummynet2/ip_fw_dynamic.c
new file mode 100644 (file)
index 0000000..9c7d2cd
--- /dev/null
@@ -0,0 +1,1237 @@
+/*-
+ * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_dynamic.c 200601 2009-12-16 10:48:40Z luigi $");
+
+#define        DEB(x)
+#define        DDB(x) x
+
+/*
+ * Dynamic rule support for ipfw
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <net/ethernet.h> /* for ETHERTYPE_IP */
+#include <net/if.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>    /* ip_defttl */
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+
+#include <netinet/ip6.h>       /* IN6_ARE_ADDR_EQUAL */
+#ifdef INET6
+#include <netinet6/in6_var.h>
+#include <netinet6/ip6_var.h>
+#endif
+
+#include <machine/in_cksum.h>  /* XXX for in_cksum */
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+/*
+ * Description of dynamic rules.
+ *
+ * Dynamic rules are stored in lists accessed through a hash table
+ * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
+ * be modified through the sysctl variable dyn_buckets which is
+ * updated when the table becomes empty.
+ *
+ * XXX currently there is only one list, ipfw_dyn.
+ *
+ * When a packet is received, its address fields are first masked
+ * with the mask defined for the rule, then hashed, then matched
+ * against the entries in the corresponding list.
+ * Dynamic rules can be used for different purposes:
+ *  + stateful rules;
+ *  + enforcing limits on the number of sessions;
+ *  + in-kernel NAT (not implemented yet)
+ *
+ * The lifetime of dynamic rules is regulated by dyn_*_lifetime,
+ * measured in seconds and depending on the flags.
+ *
+ * The total number of dynamic rules is stored in dyn_count.
+ * The max number of dynamic rules is dyn_max. When we reach
+ * the maximum number of rules we do not create anymore. This is
+ * done to avoid consuming too much memory, but also too much
+ * time when searching on each packet (ideally, we should try instead
+ * to put a limit on the length of the list on each bucket...).
+ *
+ * Each dynamic rule holds a pointer to the parent ipfw rule so
+ * we know what action to perform. Dynamic rules are removed when
+ * the parent rule is deleted. XXX we should make them survive.
+ *
+ * There are some limitations with dynamic rules -- we do not
+ * obey the 'randomized match', and we do not do multiple
+ * passes through the firewall. XXX check the latter!!!
+ */
+
+/*
+ * Static variables followed by global ones
+ */
+/* hash table of dynamic-rule lists; it has V_curr_dyn_buckets entries */
+static VNET_DEFINE(ipfw_dyn_rule **, ipfw_dyn_v);
+static VNET_DEFINE(u_int32_t, dyn_buckets);
+static VNET_DEFINE(u_int32_t, curr_dyn_buckets);
+static VNET_DEFINE(struct callout, ipfw_timeout);
+#define        V_ipfw_dyn_v                    VNET(ipfw_dyn_v)
+#define        V_dyn_buckets                   VNET(dyn_buckets)
+#define        V_curr_dyn_buckets              VNET(curr_dyn_buckets)
+#define V_ipfw_timeout                  VNET(ipfw_timeout)
+
+/* allocation zone for dynamic rule entries */
+static uma_zone_t ipfw_dyn_rule_zone;
+#if defined( __linux__ ) || defined( _WIN32 )
+DEFINE_SPINLOCK(ipfw_dyn_mtx);
+#else
+static struct mtx ipfw_dyn_mtx;                /* mutex guarding dynamic rules */
+#endif
+
+#define        IPFW_DYN_LOCK_INIT() \
+       mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF)
+#define        IPFW_DYN_LOCK_DESTROY() mtx_destroy(&ipfw_dyn_mtx)
+#define        IPFW_DYN_LOCK()         mtx_lock(&ipfw_dyn_mtx)
+#define        IPFW_DYN_UNLOCK()       mtx_unlock(&ipfw_dyn_mtx)
+#define        IPFW_DYN_LOCK_ASSERT()  mtx_assert(&ipfw_dyn_mtx, MA_OWNED)
+
+/* Non-static wrapper so code outside this file can drop the dyn lock. */
+void
+ipfw_dyn_unlock(void)
+{
+       IPFW_DYN_UNLOCK();
+}
+
+/*
+ * Timeouts for various events in handling dynamic rules.
+ * All values are in seconds; which one applies depends on the
+ * protocol/flags of the matched flow.
+ */
+static VNET_DEFINE(u_int32_t, dyn_ack_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_syn_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_fin_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_rst_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_udp_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_short_lifetime);
+
+#define        V_dyn_ack_lifetime              VNET(dyn_ack_lifetime)
+#define        V_dyn_syn_lifetime              VNET(dyn_syn_lifetime)
+#define        V_dyn_fin_lifetime              VNET(dyn_fin_lifetime)
+#define        V_dyn_rst_lifetime              VNET(dyn_rst_lifetime)
+#define        V_dyn_udp_lifetime              VNET(dyn_udp_lifetime)
+#define        V_dyn_short_lifetime            VNET(dyn_short_lifetime)
+
+/*
+ * Keepalives are sent if dyn_keepalive is set. They are sent every
+ * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
+ * seconds of lifetime of a rule.
+ * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
+ * than dyn_keepalive_period.
+ */
+
+static VNET_DEFINE(u_int32_t, dyn_keepalive_interval);
+static VNET_DEFINE(u_int32_t, dyn_keepalive_period);
+static VNET_DEFINE(u_int32_t, dyn_keepalive);
+
+#define        V_dyn_keepalive_interval        VNET(dyn_keepalive_interval)
+#define        V_dyn_keepalive_period          VNET(dyn_keepalive_period)
+#define        V_dyn_keepalive                 VNET(dyn_keepalive)
+
+static VNET_DEFINE(u_int32_t, dyn_count);      /* # of dynamic rules */
+static VNET_DEFINE(u_int32_t, dyn_max);                /* max # of dynamic rules */
+
+#define        V_dyn_count                     VNET(dyn_count)
+#define        V_dyn_max                       VNET(dyn_max)
+
+#ifdef SYSCTL_NODE
+/* Runtime-tunable knobs for the dynamic-rule engine (net.inet.ip.fw.*). */
+SYSCTL_DECL(_net_inet_ip_fw);
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
+    CTLFLAG_RW, &VNET_NAME(dyn_buckets), 0,
+    "Number of dyn. buckets");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
+    CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0,
+    "Current Number of dyn. buckets");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_count,
+    CTLFLAG_RD, &VNET_NAME(dyn_count), 0,
+    "Number of dyn. rules");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_max,
+    CTLFLAG_RW, &VNET_NAME(dyn_max), 0,
+    "Max number of dyn. rules");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0,
+    "Lifetime of dyn. rules for acks");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0,
+    "Lifetime of dyn. rules for syn");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0,
+    "Lifetime of dyn. rules for fin");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0,
+    "Lifetime of dyn. rules for rst");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0,
+    "Lifetime of dyn. rules for UDP");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0,
+    "Lifetime of dyn. rules for other situations");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive,
+    CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
+    "Enable keepalives for dyn. rules");
+#endif /* SYSCTL_NODE */
+
+
+/*
+ * Hash an IPv6 flow id: XOR the low 64 bits of both addresses
+ * together with the two ports.  XOR is commutative, so the result
+ * is the same for both directions of a flow.
+ */
+static __inline int
+hash_packet6(struct ipfw_flow_id *id)
+{
+       u_int32_t h;
+
+       h  = id->src_ip6.__u6_addr.__u6_addr32[2];
+       h ^= id->src_ip6.__u6_addr.__u6_addr32[3];
+       h ^= id->dst_ip6.__u6_addr.__u6_addr32[2];
+       h ^= id->dst_ip6.__u6_addr.__u6_addr32[3];
+       h ^= id->src_port;
+       h ^= id->dst_port;
+       return h;
+}
+
+/*
+ * IMPORTANT: the hash function for dynamic rules must be commutative
+ * in source and destination (ip,port), because rules are bidirectional
+ * and we want to find both in the same bucket.
+ * Returns a bucket index in [0, V_curr_dyn_buckets).
+ */
+static __inline int
+hash_packet(struct ipfw_flow_id *id)
+{
+       u_int32_t i;
+
+#ifdef INET6
+       if (IS_IP6_FLOW_ID(id)) 
+               i = hash_packet6(id);
+       else
+#endif /* INET6 */
+       /* this statement is the "else" body when INET6 is defined */
+       i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port);
+       /* NOTE(review): mask assumes V_curr_dyn_buckets is a power of 2 */
+       i &= (V_curr_dyn_buckets - 1);
+       return i;
+}
+
+/*
+ * Log the flow of a dynamic rule that is being unlinked, together
+ * with the number of entries that will remain.  Only invoked from
+ * the DEB() debug wrapper in UNLINK_DYN_RULE.
+ */
+static __inline void
+unlink_dyn_rule_print(struct ipfw_flow_id *id)
+{
+       struct in_addr da;
+       /* buffers sized for the longer IPv6 text form when INET6 is on */
+#ifdef INET6
+       char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
+#else
+       char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+
+#ifdef INET6
+       if (IS_IP6_FLOW_ID(id)) {
+               ip6_sprintf(src, &id->src_ip6);
+               ip6_sprintf(dst, &id->dst_ip6);
+       } else
+#endif
+       {
+               /* IPv4 addresses are stored host order; swap for inet_ntoa_r */
+               da.s_addr = htonl(id->src_ip);
+               inet_ntoa_r(da, src);
+               da.s_addr = htonl(id->dst_ip);
+               inet_ntoa_r(da, dst);
+       }
+       printf("ipfw: unlink entry %s %d -> %s %d, %d left\n",
+           src, id->src_port, dst, id->dst_port, V_dyn_count - 1);
+}
+
+/**
+ * unlink a dynamic rule from a chain. prev is a pointer to
+ * the previous one, q is a pointer to the rule to delete,
+ * head is a pointer to the head of the queue.
+ * Modifies q and potentially also head.
+ * Also drops the parent's refcount for O_LIMIT children,
+ * decrements V_dyn_count and frees the entry to its UMA zone.
+ */
+#define UNLINK_DYN_RULE(prev, head, q) {                               \
+       ipfw_dyn_rule *old_q = q;                                       \
+                                                                       \
+       /* remove a refcount to the parent */                           \
+       if (q->dyn_type == O_LIMIT)                                     \
+               q->parent->count--;                                     \
+       DEB(unlink_dyn_rule_print(&q->id);)                             \
+       if (prev != NULL)                                               \
+               prev->next = q = q->next;                               \
+       else                                                            \
+               head = q = q->next;                                     \
+       V_dyn_count--;                                                  \
+       uma_zfree(ipfw_dyn_rule_zone, old_q); }
+
+/* "a <= b" for 32-bit time values, robust to counter wraparound */
+#define TIME_LEQ(a,b)       ((int)((a)-(b)) <= 0)
+
+/**
+ * Remove dynamic rules pointing to "rule", or all of them if rule == NULL.
+ *
+ * If keep_me == NULL, rules are deleted even if not expired,
+ * otherwise only expired rules are removed.
+ *
+ * The value of the second parameter is also used to identify
+ * a rule we absolutely do not want to remove (e.g. because we are
+ * holding a reference to it -- this is the case with O_LIMIT_PARENT
+ * rules). The pointer is only used for comparison, so any non-null
+ * value will do.
+ */
+static void
+remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me)
+{
+       static u_int32_t last_remove = 0;
+
+/* FORCE means unconditional removal: no keep_me sentinel was supplied */
+#define FORCE (keep_me == NULL)
+
+       ipfw_dyn_rule *prev, *q;
+       int i, pass = 0, max_pass = 0;
+
+       IPFW_DYN_LOCK_ASSERT();
+
+       if (V_ipfw_dyn_v == NULL || V_dyn_count == 0)
+               return;
+       /* do not expire more than once per second, it is useless */
+       if (!FORCE && last_remove == time_uptime)
+               return;
+       last_remove = time_uptime;
+
+       /*
+        * because O_LIMIT refer to parent rules, during the first pass only
+        * remove child and mark any pending LIMIT_PARENT, and remove
+        * them in a second pass.
+        */
+next_pass:
+       for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+               for (prev=NULL, q = V_ipfw_dyn_v[i] ; q ; ) {
+                       /*
+                        * Logic can become complex here, so we split tests.
+                        */
+                       if (q == keep_me)
+                               goto next;
+                       if (rule != NULL && rule != q->rule)
+                               goto next; /* not the one we are looking for */
+                       if (q->dyn_type == O_LIMIT_PARENT) {
+                               /*
+                                * handle parent in the second pass,
+                                * record we need one.
+                                */
+                               max_pass = 1;
+                               if (pass == 0)
+                                       goto next;
+                               if (FORCE && q->count != 0 ) {
+                                       /* XXX should not happen! */
+                                       printf("ipfw: OUCH! cannot remove rule,"
+                                            " count %d\n", q->count);
+                               }
+                       } else {
+                               if (!FORCE &&
+                                   !TIME_LEQ( q->expire, time_uptime ))
+                                       goto next;
+                       }
+                       /* safe to unlink: a parent that still has children is kept */
+             if (q->dyn_type != O_LIMIT_PARENT || !q->count) {
+                     UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
+                     continue;
+             }
+next:
+                       prev=q;
+                       q=q->next;
+               }
+       }
+       if (pass++ < max_pass)
+               goto next_pass;
+}
+
+void
+ipfw_remove_dyn_children(struct ip_fw *rule)
+{      /* forcibly flush every dynamic state created by 'rule' */
+       IPFW_DYN_LOCK();
+       remove_dyn_rule(rule, NULL /* force removal */);
+       IPFW_DYN_UNLOCK();
+}
+
+/**
+ * lookup a dynamic rule, locked version: caller must hold IPFW_DYN_LOCK.
+ */
+static ipfw_dyn_rule *
+lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction,
+    struct tcphdr *tcp)
+{
+       /*
+        * stateful ipfw extensions.
+        * Lookup into dynamic session queue
+        */
+#define MATCH_REVERSE  0
+#define MATCH_FORWARD  1
+#define MATCH_NONE     2
+#define MATCH_UNKNOWN  3
+       int i, dir = MATCH_NONE;        /* reported to caller via *match_direction */
+       ipfw_dyn_rule *prev, *q=NULL;
+
+       IPFW_DYN_LOCK_ASSERT();
+
+       if (V_ipfw_dyn_v == NULL)
+               goto done;      /* not found */
+       i = hash_packet( pkt );
+       for (prev=NULL, q = V_ipfw_dyn_v[i] ; q != NULL ; ) {
+               if (q->dyn_type == O_LIMIT_PARENT && q->count)
+                       goto next;      /* parents with children never match a flow */
+               if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */
+                       UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
+                       continue;
+               }
+               if (pkt->proto == q->id.proto &&
+                   q->dyn_type != O_LIMIT_PARENT) {
+                       if (IS_IP6_FLOW_ID(pkt)) {
+                           if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+                               &(q->id.src_ip6)) &&
+                           IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+                               &(q->id.dst_ip6)) &&
+                           pkt->src_port == q->id.src_port &&
+                           pkt->dst_port == q->id.dst_port ) {
+                               dir = MATCH_FORWARD;
+                               break;
+                           }
+                           if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+                                   &(q->id.dst_ip6)) &&
+                               IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+                                   &(q->id.src_ip6)) &&
+                               pkt->src_port == q->id.dst_port &&
+                               pkt->dst_port == q->id.src_port ) {
+                                   dir = MATCH_REVERSE;
+                                   break;
+                           }
+                       } else {
+                           if (pkt->src_ip == q->id.src_ip &&
+                               pkt->dst_ip == q->id.dst_ip &&
+                               pkt->src_port == q->id.src_port &&
+                               pkt->dst_port == q->id.dst_port ) {
+                                   dir = MATCH_FORWARD;
+                                   break;
+                           }
+                           if (pkt->src_ip == q->id.dst_ip &&
+                               pkt->dst_ip == q->id.src_ip &&
+                               pkt->src_port == q->id.dst_port &&
+                               pkt->dst_port == q->id.src_port ) {
+                                   dir = MATCH_REVERSE;
+                                   break;
+                           }
+                       }
+               }
+next:
+               prev = q;
+               q = q->next;
+       }
+       if (q == NULL)
+               goto done; /* q = NULL, not found */
+
+       if ( prev != NULL) { /* found and not in front */
+               prev->next = q->next;
+               q->next = V_ipfw_dyn_v[i];      /* move-to-front of the bucket */
+               V_ipfw_dyn_v[i] = q;
+       }
+       if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
+               u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST);
+
+#define BOTH_SYN       (TH_SYN | (TH_SYN << 8))
+#define BOTH_FIN       (TH_FIN | (TH_FIN << 8))
+               q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8);     /* reverse-dir flags in high byte */
+               switch (q->state) {
+               case TH_SYN:                            /* opening */
+                       q->expire = time_uptime + V_dyn_syn_lifetime;
+                       break;
+
+               case BOTH_SYN:                  /* move to established */
+               case BOTH_SYN | TH_FIN :        /* one side tries to close */
+               case BOTH_SYN | (TH_FIN << 8) :
+                       if (tcp) {
+#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0)
+                           u_int32_t ack = ntohl(tcp->th_ack);
+                           if (dir == MATCH_FORWARD) {
+                               if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd))
+                                   q->ack_fwd = ack;
+                               else { /* ignore out-of-sequence */
+                                   break;
+                               }
+                           } else {
+                               if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev))
+                                   q->ack_rev = ack;
+                               else { /* ignore out-of-sequence */
+                                   break;
+                               }
+                           }
+                       }
+                       q->expire = time_uptime + V_dyn_ack_lifetime;
+                       break;
+
+               case BOTH_SYN | BOTH_FIN:       /* both sides closed */
+                       if (V_dyn_fin_lifetime >= V_dyn_keepalive_period)       /* keep below keepalive period */
+                               V_dyn_fin_lifetime = V_dyn_keepalive_period - 1;
+                       q->expire = time_uptime + V_dyn_fin_lifetime;
+                       break;
+
+               default:
+#if 0
+                       /*
+                        * reset or some invalid combination, but can also
+                        * occur if we use keep-state the wrong way.
+                        */
+                       if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
+                               printf("invalid state: 0x%x\n", q->state);
+#endif
+                       if (V_dyn_rst_lifetime >= V_dyn_keepalive_period)
+                               V_dyn_rst_lifetime = V_dyn_keepalive_period - 1;
+                       q->expire = time_uptime + V_dyn_rst_lifetime;
+                       break;
+               }
+       } else if (pkt->proto == IPPROTO_UDP) {
+               q->expire = time_uptime + V_dyn_udp_lifetime;
+       } else {
+               /* other protocols */
+               q->expire = time_uptime + V_dyn_short_lifetime;
+       }
+done:
+       if (match_direction)
+               *match_direction = dir;
+       return q;
+}
+
+ipfw_dyn_rule *
+ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction,
+    struct tcphdr *tcp)
+{
+       ipfw_dyn_rule *q;
+
+       IPFW_DYN_LOCK();
+       q = lookup_dyn_rule_locked(pkt, match_direction, tcp);
+       if (q == NULL)
+               IPFW_DYN_UNLOCK();      /* miss: nothing for the caller to release */
+       /* NB: return table locked when q is not NULL */
+       return q;
+}
+
+static void
+realloc_dynamic_table(void)
+{
+       IPFW_DYN_LOCK_ASSERT();
+
+       /*
+        * Try reallocation, make sure we have a power of 2 and do
+        * not allow more than 64k entries. In case of overflow,
+        * default to 1024.
+        */
+
+       if (V_dyn_buckets > 65536)
+               V_dyn_buckets = 1024;
+       if ((V_dyn_buckets & (V_dyn_buckets-1)) != 0) { /* not a power of 2 */
+               V_dyn_buckets = V_curr_dyn_buckets; /* reset */
+               return;
+       }
+       V_curr_dyn_buckets = V_dyn_buckets;
+       if (V_ipfw_dyn_v != NULL)
+               free(V_ipfw_dyn_v, M_IPFW);     /* caller ensures table is empty, see add_dyn_rule() */
+       for (;;) {
+               V_ipfw_dyn_v = malloc(V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *),
+                      M_IPFW, M_NOWAIT | M_ZERO);
+               if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2)
+                       break;
+               V_curr_dyn_buckets /= 2;        /* shrink and retry on allocation failure */
+       }
+}
+
+/**
+ * Install state of type 'type' for a dynamic session.
+ * The hash table contains three types of entries:
+ * - regular rules (O_KEEP_STATE)
+ * - rules for sessions with limited number of sess per user
+ *   (O_LIMIT). When they are created, the parent is
+ *   increased by 1, and decreased on delete. In this case,
+ *   the third parameter is the parent rule and not the chain.
+ * - "parent" rules for the above (O_LIMIT_PARENT).
+ */
+static ipfw_dyn_rule *
+add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule)
+{
+       ipfw_dyn_rule *r;
+       int i;
+
+       IPFW_DYN_LOCK_ASSERT();
+
+       if (V_ipfw_dyn_v == NULL ||
+           (V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) {
+               realloc_dynamic_table();
+               if (V_ipfw_dyn_v == NULL)
+                       return NULL; /* failed ! */
+       }
+       i = hash_packet(id);
+
+       r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO);
+       if (r == NULL) {
+               printf ("ipfw: sorry cannot allocate state\n");
+               return NULL;
+       }
+
+       /* increase refcount on parent, and set pointer */
+       if (dyn_type == O_LIMIT) {
+               ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule;  /* see header: 3rd arg is the parent state */
+               if ( parent->dyn_type != O_LIMIT_PARENT)
+                       panic("invalid parent");
+               parent->count++;
+               r->parent = parent;
+               rule = parent->rule;
+       }
+
+       r->id = *id;
+       r->expire = time_uptime + V_dyn_syn_lifetime;   /* initial lifetime, adjusted on later lookups */
+       r->rule = rule;
+       r->dyn_type = dyn_type;
+       r->pcnt = r->bcnt = 0;
+       r->count = 0;
+
+       r->bucket = i;
+       r->next = V_ipfw_dyn_v[i];
+       V_ipfw_dyn_v[i] = r;
+       V_dyn_count++;
+       DEB({
+               struct in_addr da;
+#ifdef INET6
+               char src[INET6_ADDRSTRLEN];
+               char dst[INET6_ADDRSTRLEN];
+#else
+               char src[INET_ADDRSTRLEN];
+               char dst[INET_ADDRSTRLEN];
+#endif
+
+#ifdef INET6
+               if (IS_IP6_FLOW_ID(&(r->id))) {
+                       ip6_sprintf(src, &r->id.src_ip6);
+                       ip6_sprintf(dst, &r->id.dst_ip6);
+               } else
+#endif
+               {
+                       da.s_addr = htonl(r->id.src_ip);
+                       inet_ntoa_r(da, src);
+                       da.s_addr = htonl(r->id.dst_ip);
+                       inet_ntoa_r(da, dst);
+               }
+               printf("ipfw: add dyn entry ty %d %s %d -> %s %d, total %d\n",
+                   dyn_type, src, r->id.src_port, dst, r->id.dst_port,
+                   V_dyn_count);
+       })
+       return r;
+}
+
+/**
+ * lookup dynamic parent rule using pkt and rule as search keys.
+ * If the lookup fails, then install one.
+ */
+static ipfw_dyn_rule *
+lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule)
+{
+       ipfw_dyn_rule *q;
+       int i;
+
+       IPFW_DYN_LOCK_ASSERT();
+
+       if (V_ipfw_dyn_v) {
+               int is_v6 = IS_IP6_FLOW_ID(pkt);
+               i = hash_packet( pkt );
+               for (q = V_ipfw_dyn_v[i] ; q != NULL ; q=q->next)
+                       if (q->dyn_type == O_LIMIT_PARENT &&
+                           rule== q->rule &&
+                           pkt->proto == q->id.proto &&
+                           pkt->src_port == q->id.src_port &&
+                           pkt->dst_port == q->id.dst_port &&
+                           (
+                               (is_v6 &&
+                                IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+                                       &(q->id.src_ip6)) &&
+                                IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+                                       &(q->id.dst_ip6))) ||
+                               (!is_v6 &&
+                                pkt->src_ip == q->id.src_ip &&
+                                pkt->dst_ip == q->id.dst_ip)
+                           )
+                       ) {
+                               q->expire = time_uptime + V_dyn_short_lifetime; /* refresh parent on reuse */
+                               DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);)
+                               return q;
+                       }
+       }
+       return add_dyn_rule(pkt, O_LIMIT_PARENT, rule);
+}
+
+/**
+ * Install dynamic state for rule type cmd->o.opcode
+ *
+ * Returns 1 (failure) if state is not installed because of errors or because
+ * session limitations are enforced.
+ */
+int
+ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
+    struct ip_fw_args *args, uint32_t tablearg)
+{
+       static int last_log;    /* time of last complaint, rate-limits logging */
+       ipfw_dyn_rule *q;
+       struct in_addr da;
+#ifdef INET6
+       char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2];
+#else
+       char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+
+       src[0] = '\0';
+       dst[0] = '\0';
+
+       IPFW_DYN_LOCK();
+
+       DEB(
+#ifdef INET6
+       if (IS_IP6_FLOW_ID(&(args->f_id))) {
+               ip6_sprintf(src, &args->f_id.src_ip6);
+               ip6_sprintf(dst, &args->f_id.dst_ip6);
+       } else
+#endif
+       {
+               da.s_addr = htonl(args->f_id.src_ip);
+               inet_ntoa_r(da, src);
+               da.s_addr = htonl(args->f_id.dst_ip);
+               inet_ntoa_r(da, dst);
+       }
+       printf("ipfw: %s: type %d %s %u -> %s %u\n",
+           __func__, cmd->o.opcode, src, args->f_id.src_port,
+           dst, args->f_id.dst_port);
+       src[0] = '\0';
+       dst[0] = '\0';
+       )
+
+       q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
+
+       if (q != NULL) {        /* should never occur */
+               if (last_log != time_uptime) {
+                       last_log = time_uptime;
+                       printf("ipfw: %s: entry already present, done\n",
+                           __func__);
+               }
+               IPFW_DYN_UNLOCK();
+               return (0);
+       }
+
+       if (V_dyn_count >= V_dyn_max)
+               /* Run out of slots, try to remove any expired rule. */
+               remove_dyn_rule(NULL, (ipfw_dyn_rule *)1);      /* non-NULL sentinel: expire only, no force */
+
+       if (V_dyn_count >= V_dyn_max) {
+               if (last_log != time_uptime) {
+                       last_log = time_uptime;
+                       printf("ipfw: %s: Too many dynamic rules\n", __func__);
+               }
+               IPFW_DYN_UNLOCK();
+               return (1);     /* cannot install, notify caller */
+       }
+
+       switch (cmd->o.opcode) {
+       case O_KEEP_STATE:      /* bidir rule */
+               add_dyn_rule(&args->f_id, O_KEEP_STATE, rule);
+               break;
+
+       case O_LIMIT: {         /* limit number of sessions */
+               struct ipfw_flow_id id;
+               ipfw_dyn_rule *parent;
+               uint32_t conn_limit;
+               uint16_t limit_mask = cmd->limit_mask;
+
+               conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ?
+                   tablearg : cmd->conn_limit;
+
+               DEB(
+               if (cmd->conn_limit == IP_FW_TABLEARG)
+                       printf("ipfw: %s: O_LIMIT rule, conn_limit: %u "
+                           "(tablearg)\n", __func__, conn_limit);
+               else
+                       printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n",
+                           __func__, conn_limit);
+               )
+
+               id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0;
+               id.proto = args->f_id.proto;
+               id.addr_type = args->f_id.addr_type;
+               id.fib = M_GETFIB(args->m);
+
+               if (IS_IP6_FLOW_ID (&(args->f_id))) {
+                       if (limit_mask & DYN_SRC_ADDR)
+                               id.src_ip6 = args->f_id.src_ip6;
+                       if (limit_mask & DYN_DST_ADDR)
+                               id.dst_ip6 = args->f_id.dst_ip6;
+               } else {
+                       if (limit_mask & DYN_SRC_ADDR)
+                               id.src_ip = args->f_id.src_ip;
+                       if (limit_mask & DYN_DST_ADDR)
+                               id.dst_ip = args->f_id.dst_ip;
+               }
+               if (limit_mask & DYN_SRC_PORT)
+                       id.src_port = args->f_id.src_port;
+               if (limit_mask & DYN_DST_PORT)
+                       id.dst_port = args->f_id.dst_port;
+               if ((parent = lookup_dyn_parent(&id, rule)) == NULL) {
+                       printf("ipfw: %s: add parent failed\n", __func__);
+                       IPFW_DYN_UNLOCK();
+                       return (1);
+               }
+
+               if (parent->count >= conn_limit) {
+                       /* See if we can remove some expired rule. */
+                       remove_dyn_rule(rule, parent);
+                       if (parent->count >= conn_limit) {
+                               if (V_fw_verbose && last_log != time_uptime) {
+                                       last_log = time_uptime;
+#ifdef INET6
+                                       /*
+                                        * XXX IPv6 flows are not
+                                        * supported yet.
+                                        */
+                                       if (IS_IP6_FLOW_ID(&(args->f_id))) {
+                                               char ip6buf[INET6_ADDRSTRLEN];
+                                               snprintf(src, sizeof(src),
+                                                   "[%s]", ip6_sprintf(ip6buf,
+                                                       &args->f_id.src_ip6));
+                                               snprintf(dst, sizeof(dst),
+                                                   "[%s]", ip6_sprintf(ip6buf,
+                                                       &args->f_id.dst_ip6));
+                                       } else
+#endif
+                                       {
+                                               da.s_addr =
+                                                   htonl(args->f_id.src_ip);
+                                               inet_ntoa_r(da, src);
+                                               da.s_addr =
+                                                   htonl(args->f_id.dst_ip);
+                                               inet_ntoa_r(da, dst);
+                                       }
+                                       log(LOG_SECURITY | LOG_DEBUG,
+                                           "ipfw: %d %s %s:%u -> %s:%u, %s\n",
+                                           parent->rule->rulenum,
+                                           "drop session",
+                                           src, (args->f_id.src_port),
+                                           dst, (args->f_id.dst_port),
+                                           "too many entries");
+                               }
+                               IPFW_DYN_UNLOCK();
+                               return (1);
+                       }
+               }
+               add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent);
+               break;
+       }
+       default:
+               printf("ipfw: %s: unknown dynamic rule type %u\n",
+                   __func__, cmd->o.opcode);
+               IPFW_DYN_UNLOCK();
+               return (1);
+       }
+
+       /* XXX just set lifetime */
+       lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
+
+       IPFW_DYN_UNLOCK();
+       return (0);
+}
+
+/*
+ * Generate a TCP packet, containing either a RST or a keepalive.
+ * When flags & TH_RST, we are sending a RST packet, because of a
+ * "reset" action matched the packet.
+ * Otherwise we are sending a keepalive, possibly with TH_SYN in flags.
+ * The 'replyto' mbuf is the mbuf being replied to, if any, and is required
+ * so that MAC can label the reply appropriately.
+ */
+struct mbuf *
+ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
+    u_int32_t ack, int flags)
+{
+#ifdef __linux__       // XXX to be revised
+       return NULL;
+#else
+       struct mbuf *m;
+       int len, dir;
+       struct ip *h = NULL;            /* stupid compiler */
+#ifdef INET6
+       struct ip6_hdr *h6 = NULL;
+#endif
+       struct tcphdr *th = NULL;
+
+       MGETHDR(m, M_DONTWAIT, MT_DATA);
+       if (m == NULL)
+               return (NULL);
+
+       M_SETFIB(m, id->fib);
+#ifdef MAC
+       if (replyto != NULL)
+               mac_netinet_firewall_reply(replyto, m);
+       else
+               mac_netinet_firewall_send(m);
+#else
+       (void)replyto;          /* don't warn about unused arg */
+#endif
+
+       switch (id->addr_type) {
+       case 4:
+               len = sizeof(struct ip) + sizeof(struct tcphdr);
+               break;
+#ifdef INET6
+       case 6:
+               len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+               break;
+#endif
+       default:
+               /* XXX: log me?!? */
+               FREE_PKT(m);
+               return (NULL);
+       }
+       dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN);  /* forward direction iff SYN set without RST */
+
+       m->m_data += max_linkhdr;       /* leave room for the link-layer header */
+       m->m_flags |= M_SKIP_FIREWALL;  /* don't re-filter our own packet */
+       m->m_pkthdr.len = m->m_len = len;
+       m->m_pkthdr.rcvif = NULL;
+       bzero(m->m_data, len);
+
+       switch (id->addr_type) {
+       case 4:
+               h = mtod(m, struct ip *);
+
+               /* prepare for checksum */
+               h->ip_p = IPPROTO_TCP;
+               h->ip_len = htons(sizeof(struct tcphdr));
+               if (dir) {
+                       h->ip_src.s_addr = htonl(id->src_ip);
+                       h->ip_dst.s_addr = htonl(id->dst_ip);
+               } else {
+                       h->ip_src.s_addr = htonl(id->dst_ip);
+                       h->ip_dst.s_addr = htonl(id->src_ip);
+               }
+
+               th = (struct tcphdr *)(h + 1);
+               break;
+#ifdef INET6
+       case 6:
+               h6 = mtod(m, struct ip6_hdr *);
+
+               /* prepare for checksum */
+               h6->ip6_nxt = IPPROTO_TCP;
+               h6->ip6_plen = htons(sizeof(struct tcphdr));
+               if (dir) {
+                       h6->ip6_src = id->src_ip6;
+                       h6->ip6_dst = id->dst_ip6;
+               } else {
+                       h6->ip6_src = id->dst_ip6;
+                       h6->ip6_dst = id->src_ip6;
+               }
+
+               th = (struct tcphdr *)(h6 + 1);
+               break;
+#endif
+       }
+
+       if (dir) {
+               th->th_sport = htons(id->src_port);
+               th->th_dport = htons(id->dst_port);
+       } else {
+               th->th_sport = htons(id->dst_port);
+               th->th_dport = htons(id->src_port);
+       }
+       th->th_off = sizeof(struct tcphdr) >> 2;
+
+       if (flags & TH_RST) {
+               if (flags & TH_ACK) {
+                       th->th_seq = htonl(ack);
+                       th->th_flags = TH_RST;
+               } else {
+                       if (flags & TH_SYN)
+                               seq++;  /* the SYN consumes one sequence number */
+                       th->th_ack = htonl(seq);
+                       th->th_flags = TH_RST | TH_ACK;
+               }
+       } else {
+               /*
+                * Keepalive - use caller provided sequence numbers
+                */
+               th->th_seq = htonl(seq);
+               th->th_ack = htonl(ack);
+               th->th_flags = TH_ACK;
+       }
+
+       switch (id->addr_type) {
+       case 4:
+               th->th_sum = in_cksum(m, len);  /* over pseudo-header fields set above */
+
+               /* finish the ip header */
+               h->ip_v = 4;
+               h->ip_hl = sizeof(*h) >> 2;
+               h->ip_tos = IPTOS_LOWDELAY;
+               h->ip_off = 0;
+               h->ip_len = htons(len);
+               h->ip_ttl = V_ip_defttl;
+               h->ip_sum = 0;
+               break;
+#ifdef INET6
+       case 6:
+               th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6),
+                   sizeof(struct tcphdr));
+
+               /* finish the ip6 header */
+               h6->ip6_vfc |= IPV6_VERSION;
+               h6->ip6_hlim = IPV6_DEFHLIM;
+               break;
+#endif
+       }
+
+       return (m);
+#endif /* !__linux__ */
+}
+
+/*
+ * This procedure is only used to handle keepalives. It is invoked
+ * every dyn_keepalive_period
+ */
+static void
+ipfw_tick(void * vnetx)
+{
+       struct mbuf *m0, *m, *mnext, **mtailp;
+#ifdef INET6
+       struct mbuf *m6, **m6_tailp;
+#endif
+       int i;
+       ipfw_dyn_rule *q;
+#ifdef VIMAGE
+       struct vnet *vp = vnetx;
+#endif
+
+       CURVNET_SET(vp);
+       if (V_dyn_keepalive == 0 || V_ipfw_dyn_v == NULL || V_dyn_count == 0)
+               goto done;
+
+       /*
+        * We make a chain of packets to go out here -- not deferring
+        * until after we drop the IPFW dynamic rule lock would result
+        * in a lock order reversal with the normal packet input -> ipfw
+        * call stack.
+        */
+       m0 = NULL;
+       mtailp = &m0;
+#ifdef INET6
+       m6 = NULL;
+       m6_tailp = &m6;
+#endif
+       IPFW_DYN_LOCK();
+       for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+               for (q = V_ipfw_dyn_v[i] ; q ; q = q->next ) {
+                       if (q->dyn_type == O_LIMIT_PARENT)
+                               continue;
+                       if (q->id.proto != IPPROTO_TCP)
+                               continue;
+                       if ( (q->state & BOTH_SYN) != BOTH_SYN)
+                               continue;       /* not established */
+                       if (TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
+                           q->expire))
+                               continue;       /* too early */
+                       if (TIME_LEQ(q->expire, time_uptime))
+                               continue;       /* too late, rule expired */
+
+                       /* one keepalive in each direction of the flow */
+                       m = ipfw_send_pkt(NULL, &(q->id), q->ack_rev - 1,
+                               q->ack_fwd, TH_SYN);
+                       mnext = ipfw_send_pkt(NULL, &(q->id), q->ack_fwd - 1,
+                               q->ack_rev, 0);
+
+                       switch (q->id.addr_type) {
+                       case 4:
+                               if (m != NULL) {
+                                       *mtailp = m;
+                                       mtailp = &(*mtailp)->m_nextpkt;
+                               }
+                               if (mnext != NULL) {
+                                       *mtailp = mnext;
+                                       mtailp = &(*mtailp)->m_nextpkt;
+                               }
+                               break;
+#ifdef INET6
+                       case 6:
+                               if (m != NULL) {
+                                       *m6_tailp = m;
+                                       m6_tailp = &(*m6_tailp)->m_nextpkt;
+                               }
+                               if (mnext != NULL) {
+                                       *m6_tailp = mnext;
+                                       m6_tailp = &(*m6_tailp)->m_nextpkt;
+                               }
+                               break;
+#endif
+                       }
+
+                       m = mnext = NULL;       /* ownership moved to the chains */
+               }
+       }
+       IPFW_DYN_UNLOCK();
+       for (m = mnext = m0; m != NULL; m = mnext) {
+               mnext = m->m_nextpkt;
+               m->m_nextpkt = NULL;
+               ip_output(m, NULL, NULL, 0, NULL, NULL);
+       }
+#ifdef INET6
+       for (m = mnext = m6; m != NULL; m = mnext) {
+               mnext = m->m_nextpkt;
+               m->m_nextpkt = NULL;
+               ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
+       }
+#endif
+done:
+       callout_reset(&V_ipfw_timeout, V_dyn_keepalive_period * hz,
+                     ipfw_tick, vnetx);
+       CURVNET_RESTORE();
+}
+
+void
+ipfw_dyn_attach(void)
+{      /* module load: create the state zone and the dynamic-rule lock */
+        ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule",
+            sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL,
+            UMA_ALIGN_PTR, 0);
+
+        IPFW_DYN_LOCK_INIT();
+}
+
+void
+ipfw_dyn_detach(void)
+{      /* module unload: undo ipfw_dyn_attach() */
+        uma_zdestroy(ipfw_dyn_rule_zone);
+        IPFW_DYN_LOCK_DESTROY();
+}
+
+void
+ipfw_dyn_init(void)
+{      /* per-vnet defaults (lifetimes in seconds) and keepalive timer start */
+        V_ipfw_dyn_v = NULL;
+        V_dyn_buckets = 256;    /* must be power of 2 */
+        V_curr_dyn_buckets = 256; /* must be power of 2 */
+        V_dyn_ack_lifetime = 300;
+        V_dyn_syn_lifetime = 20;
+        V_dyn_fin_lifetime = 1;
+        V_dyn_rst_lifetime = 1;
+        V_dyn_udp_lifetime = 10;
+        V_dyn_short_lifetime = 5;
+
+        V_dyn_keepalive_interval = 20;
+        V_dyn_keepalive_period = 5;
+        V_dyn_keepalive = 1;    /* do send keepalives */
+
+        V_dyn_max = 4096;       /* max # of dynamic rules */
+        callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE);
+        callout_reset(&V_ipfw_timeout, hz, ipfw_tick, curvnet); /* first tick in ~1s */
+}
+
+void
+ipfw_dyn_uninit(int pass)
+{
+       if (pass == 0)  /* first pass: stop the keepalive callout */
+               callout_drain(&V_ipfw_timeout);
+       else {          /* final pass: release the hash table */
+               if (V_ipfw_dyn_v != NULL)
+                       free(V_ipfw_dyn_v, M_IPFW);
+       }
+}
+
+int
+ipfw_dyn_len(void)
+{      /* bytes needed to export all dynamic rules, see ipfw_get_dynamic() */
+       return (V_ipfw_dyn_v == NULL) ? 0 :
+               (V_dyn_count * sizeof(ipfw_dyn_rule));
+}
+
+void
+ipfw_get_dynamic(char **pbp, const char *ep)
+{      /* copy all dynamic rules into [*pbp, ep) for userland, advance *pbp */
+       ipfw_dyn_rule *p, *last = NULL;
+       char *bp;
+       int i;
+
+       if (V_ipfw_dyn_v == NULL)
+               return;
+       bp = *pbp;      /* output cursor, written back before returning */
+
+       IPFW_DYN_LOCK();
+       for (i = 0 ; i < V_curr_dyn_buckets; i++)
+               for (p = V_ipfw_dyn_v[i] ; p != NULL; p = p->next) {
+                       if (bp + sizeof *p <= ep) {     /* skip entries that don't fit */
+                               ipfw_dyn_rule *dst =
+                                       (ipfw_dyn_rule *)bp;
+                               bcopy(p, dst, sizeof *p);
+                               bcopy(&(p->rule->rulenum), &(dst->rule),
+                                   sizeof(p->rule->rulenum));
+                               /*
+                                * store set number into high word of
+                                * dst->rule pointer.
+                                */
+                               bcopy(&(p->rule->set),
+                                   (char *)&dst->rule +
+                                   sizeof(p->rule->rulenum),
+                                   sizeof(p->rule->set));
+                               /*
+                                * store a non-null value in "next".
+                                * The userland code will interpret a
+                                * NULL here as a marker
+                                * for the last dynamic rule.
+                                */
+                               bcopy(&dst, &dst->next, sizeof(dst));
+                               last = dst;
+                               dst->expire =
+                                   TIME_LEQ(dst->expire, time_uptime) ?
+                                       0 : dst->expire - time_uptime ;
+                               bp += sizeof(ipfw_dyn_rule);
+                       }
+               }
+       IPFW_DYN_UNLOCK();
+       if (last != NULL) /* mark last dynamic rule */
+               bzero(&last->next, sizeof(last->next));
+       *pbp = bp;
+}
+/* end of file */
diff --git a/dummynet2/ip_fw_log.c b/dummynet2/ip_fw_log.c
new file mode 100644 (file)
index 0000000..1bc1216
--- /dev/null
@@ -0,0 +1,434 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_log.c 200601 2009-12-16 10:48:40Z luigi $");
+
+/*
+ * Logging support for ipfw
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <net/ethernet.h> /* for ETHERTYPE_IP */
+#include <net/if.h>
+#include <net/vnet.h>
+#include <net/if_types.h>      /* for IFT_ETHER */
+#include <net/bpf.h>           /* for BPF */
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#ifdef INET6
+#include <netinet6/in6_var.h>  /* ip6_sprintf() */
+#endif
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+/*
+ * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T
+ * Other macros just cast void * into the appropriate type
+ */
+#define        L3HDR(T, ip)    ((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
+#define        TCP(p)          ((struct tcphdr *)(p))
+#define        SCTP(p)         ((struct sctphdr *)(p))
+#define        UDP(p)          ((struct udphdr *)(p))
+#define        ICMP(p)         ((struct icmphdr *)(p))
+#define        ICMP6(p)        ((struct icmp6_hdr *)(p))
+
+#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
+#define SNP(buf) buf, sizeof(buf)
+
+#ifdef WITHOUT_BPF
+/*
+ * No-op stub: with WITHOUT_BPF defined there is no ipfw0
+ * pseudo-interface, so enabling/disabling BPF logging does nothing.
+ */
+void
+ipfw_log_bpf(int onoff)
+{
+}
+#else /* !WITHOUT_BPF */
+static struct ifnet *log_if;   /* hook to attach to bpf */
+
+/* we use this dummy function for all ifnet callbacks */
+static int
+log_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr)
+{
+       /* always fail: the ipfw log pseudo-ifnet supports no operations */
+       return EINVAL;
+}
+
+/*
+ * Create (onoff != 0) or destroy (onoff == 0) the "ipfw" pseudo
+ * interface used only as a BPF attachment point for logged packets.
+ * The interface carries no real traffic: every callback is the
+ * failing log_dummy() stub.  Idempotent in both directions.
+ */
+void
+ipfw_log_bpf(int onoff)
+{
+       struct ifnet *ifp;
+
+       if (onoff) {
+               if (log_if)     /* already attached */
+                       return;
+               ifp = if_alloc(IFT_ETHER);
+               if (ifp == NULL)        /* allocation failed: silently skip */
+                       return;
+               if_initname(ifp, "ipfw", 0);
+               ifp->if_mtu = 65536;
+               ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
+               ifp->if_init = (void *)log_dummy;
+               ifp->if_ioctl = log_dummy;
+               ifp->if_start = (void *)log_dummy;
+               ifp->if_output = (void *)log_dummy;
+               ifp->if_addrlen = 6;    /* ethernet-style address/header ... */
+               ifp->if_hdrlen = 14;    /* ... to match DLT_EN10MB below */
+               if_attach(ifp);
+               ifp->if_baudrate = IF_Mbps(10);
+               bpfattach(ifp, DLT_EN10MB, 14);
+               log_if = ifp;
+       } else {
+               if (log_if) {
+                       ether_ifdetach(log_if);
+                       if_free(log_if);
+               }
+               log_if = NULL;
+       }
+}
+#endif /* !WITHOUT_BPF */
+
+/*
+ * We enter here when we have a rule with O_LOG.
+ * XXX this function alone takes about 2Kbytes of code!
+ */
+/*
+ * Log a packet that matched an O_LOG rule 'f' (f == NULL means a
+ * bogus packet with no matching rule).  When net.inet.ip.fw.verbose
+ * is off, the packet is handed to BPF listeners on the ipfw0
+ * pseudo-interface (with a fake ethernet header when there is no
+ * layer2 one); otherwise a text line is emitted via syslog at
+ * LOG_SECURITY|LOG_INFO.  Logging is rate-limited per rule by
+ * max_log/log_left, or globally by V_verbose_limit for rule-less
+ * packets.
+ */
+void
+ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
+    struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
+    struct ip *ip)
+{
+       char *action;
+       int limit_reached = 0;  /* set when this call exhausts the log quota */
+       char action2[40], proto[128], fragment[32];     /* scratch buffers */
+
+       if (V_fw_verbose == 0) {
+#ifndef WITHOUT_BPF
+               struct m_hdr mh;
+
+               if (log_if == NULL || log_if->if_bpf == NULL)
+                       return;
+               /* BPF treats the "mbuf" as read-only */
+               mh.mh_next = m;
+               mh.mh_len = ETHER_HDR_LEN;
+               if (args->eh) { /* layer2, use orig hdr */
+                       mh.mh_data = (char *)args->eh;
+               } else {
+                       /* add fake header. Later we will store
+                        * more info in the header
+                        */
+                       mh.mh_data = "DDDDDDSSSSSS\x08\x00";
+               }
+               BPF_MTAP(log_if, (struct mbuf *)&mh);
+#endif /* !WITHOUT_BPF */
+               return;
+       }
+       /* the old 'log' function */
+       fragment[0] = '\0';
+       proto[0] = '\0';
+
+       if (f == NULL) {        /* bogus pkt */
+               if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit)
+                       return;
+               V_norule_counter++;
+               if (V_norule_counter == V_verbose_limit)
+                       limit_reached = V_verbose_limit;
+               action = "Refuse";
+       } else {        /* O_LOG is the first action, find the real one */
+               ipfw_insn *cmd = ACTION_PTR(f);
+               ipfw_insn_log *l = (ipfw_insn_log *)cmd;
+
+               if (l->max_log != 0 && l->log_left == 0)
+                       return;
+               l->log_left--;
+               if (l->log_left == 0)
+                       limit_reached = l->max_log;
+               cmd += F_LEN(cmd);      /* point to first action */
+               /* skip modifier opcodes that precede the real action */
+               if (cmd->opcode == O_ALTQ) {
+                       ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
+
+                       snprintf(SNPARGS(action2, 0), "Altq %d",
+                               altq->qid);
+                       cmd += F_LEN(cmd);
+               }
+               if (cmd->opcode == O_PROB)
+                       cmd += F_LEN(cmd);
+
+               if (cmd->opcode == O_TAG)
+                       cmd += F_LEN(cmd);
+
+               action = action2;       /* default: formatted text below */
+               switch (cmd->opcode) {
+               case O_DENY:
+                       action = "Deny";
+                       break;
+
+               case O_REJECT:
+                       if (cmd->arg1==ICMP_REJECT_RST)
+                               action = "Reset";
+                       else if (cmd->arg1==ICMP_UNREACH_HOST)
+                               action = "Reject";
+                       else
+                               snprintf(SNPARGS(action2, 0), "Unreach %d",
+                                       cmd->arg1);
+                       break;
+
+               case O_UNREACH6:
+                       if (cmd->arg1==ICMP6_UNREACH_RST)
+                               action = "Reset";
+                       else
+                               snprintf(SNPARGS(action2, 0), "Unreach %d",
+                                       cmd->arg1);
+                       break;
+
+               case O_ACCEPT:
+                       action = "Accept";
+                       break;
+               case O_COUNT:
+                       action = "Count";
+                       break;
+               case O_DIVERT:
+                       snprintf(SNPARGS(action2, 0), "Divert %d",
+                               cmd->arg1);
+                       break;
+               case O_TEE:
+                       snprintf(SNPARGS(action2, 0), "Tee %d",
+                               cmd->arg1);
+                       break;
+               case O_SETFIB:
+                       snprintf(SNPARGS(action2, 0), "SetFib %d",
+                               cmd->arg1);
+                       break;
+               case O_SKIPTO:
+                       snprintf(SNPARGS(action2, 0), "SkipTo %d",
+                               cmd->arg1);
+                       break;
+               case O_PIPE:
+                       snprintf(SNPARGS(action2, 0), "Pipe %d",
+                               cmd->arg1);
+                       break;
+               case O_QUEUE:
+                       snprintf(SNPARGS(action2, 0), "Queue %d",
+                               cmd->arg1);
+                       break;
+               case O_FORWARD_IP: {
+                       ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
+                       int len;
+                       struct in_addr dummyaddr;
+                       /* INADDR_ANY means the target came from a table */
+                       if (sa->sa.sin_addr.s_addr == INADDR_ANY)
+                               dummyaddr.s_addr = htonl(tablearg);
+                       else
+                               dummyaddr.s_addr = sa->sa.sin_addr.s_addr;
+
+                       len = snprintf(SNPARGS(action2, 0), "Forward to %s",
+                               inet_ntoa(dummyaddr));
+
+                       if (sa->sa.sin_port)
+                               snprintf(SNPARGS(action2, len), ":%d",
+                                   sa->sa.sin_port);
+                       }
+                       break;
+               case O_NETGRAPH:
+                       snprintf(SNPARGS(action2, 0), "Netgraph %d",
+                               cmd->arg1);
+                       break;
+               case O_NGTEE:
+                       snprintf(SNPARGS(action2, 0), "Ngtee %d",
+                               cmd->arg1);
+                       break;
+               case O_NAT:
+                       action = "Nat";
+                       break;
+               case O_REASS:
+                       action = "Reass";
+                       break;
+               default:
+                       action = "UNKNOWN";
+                       break;
+               }
+       }
+
+       if (hlen == 0) {        /* non-ip */
+               snprintf(SNPARGS(proto, 0), "MAC");
+
+       } else {
+               int len;
+#ifdef INET6
+               char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2];
+#else
+               char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+               struct icmphdr *icmp;
+               struct tcphdr *tcp;
+               struct udphdr *udp;
+#ifdef INET6
+               struct ip6_hdr *ip6 = NULL;
+               struct icmp6_hdr *icmp6;
+#endif
+               src[0] = '\0';
+               dst[0] = '\0';
+#ifdef INET6
+               if (IS_IP6_FLOW_ID(&(args->f_id))) {
+                       char ip6buf[INET6_ADDRSTRLEN];
+                       snprintf(src, sizeof(src), "[%s]",
+                           ip6_sprintf(ip6buf, &args->f_id.src_ip6));
+                       snprintf(dst, sizeof(dst), "[%s]",
+                           ip6_sprintf(ip6buf, &args->f_id.dst_ip6));
+
+                       ip6 = (struct ip6_hdr *)ip;
+                       tcp = (struct tcphdr *)(((char *)ip) + hlen);
+                       udp = (struct udphdr *)(((char *)ip) + hlen);
+               } else
+#endif
+               {
+                       tcp = L3HDR(struct tcphdr, ip);
+                       udp = L3HDR(struct udphdr, ip);
+
+                       inet_ntoa_r(ip->ip_src, src);
+                       inet_ntoa_r(ip->ip_dst, dst);
+               }
+
+               /* port numbers are only meaningful on the first fragment */
+               switch (args->f_id.proto) {
+               case IPPROTO_TCP:
+                       len = snprintf(SNPARGS(proto, 0), "TCP %s", src);
+                       if (offset == 0)
+                               snprintf(SNPARGS(proto, len), ":%d %s:%d",
+                                   ntohs(tcp->th_sport),
+                                   dst,
+                                   ntohs(tcp->th_dport));
+                       else
+                               snprintf(SNPARGS(proto, len), " %s", dst);
+                       break;
+
+               case IPPROTO_UDP:
+                       len = snprintf(SNPARGS(proto, 0), "UDP %s", src);
+                       if (offset == 0)
+                               snprintf(SNPARGS(proto, len), ":%d %s:%d",
+                                   ntohs(udp->uh_sport),
+                                   dst,
+                                   ntohs(udp->uh_dport));
+                       else
+                               snprintf(SNPARGS(proto, len), " %s", dst);
+                       break;
+
+               case IPPROTO_ICMP:
+                       icmp = L3HDR(struct icmphdr, ip);
+                       if (offset == 0)
+                               len = snprintf(SNPARGS(proto, 0),
+                                   "ICMP:%u.%u ",
+                                   icmp->icmp_type, icmp->icmp_code);
+                       else
+                               len = snprintf(SNPARGS(proto, 0), "ICMP ");
+                       len += snprintf(SNPARGS(proto, len), "%s", src);
+                       snprintf(SNPARGS(proto, len), " %s", dst);
+                       break;
+#ifdef INET6
+               case IPPROTO_ICMPV6:
+                       icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen);
+                       if (offset == 0)
+                               len = snprintf(SNPARGS(proto, 0),
+                                   "ICMPv6:%u.%u ",
+                                   icmp6->icmp6_type, icmp6->icmp6_code);
+                       else
+                               len = snprintf(SNPARGS(proto, 0), "ICMPv6 ");
+                       len += snprintf(SNPARGS(proto, len), "%s", src);
+                       snprintf(SNPARGS(proto, len), " %s", dst);
+                       break;
+#endif
+               default:
+                       len = snprintf(SNPARGS(proto, 0), "P:%d %s",
+                           args->f_id.proto, src);
+                       snprintf(SNPARGS(proto, len), " %s", dst);
+                       break;
+               }
+
+#ifdef INET6
+               if (IS_IP6_FLOW_ID(&(args->f_id))) {
+                       if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG))
+                               snprintf(SNPARGS(fragment, 0),
+                                   " (frag %08x:%d@%d%s)",
+                                   args->f_id.frag_id6,
+                                   ntohs(ip6->ip6_plen) - hlen,
+                                   ntohs(offset & IP6F_OFF_MASK) << 3,
+                                   (offset & IP6F_MORE_FRAG) ? "+" : "");
+               } else
+#endif
+               {
+                       int ipoff, iplen;
+                       ipoff = ntohs(ip->ip_off);
+                       iplen = ntohs(ip->ip_len);
+                       if (ipoff & (IP_MF | IP_OFFMASK))
+                               snprintf(SNPARGS(fragment, 0),
+                                   " (frag %d:%d@%d%s)",
+                                   ntohs(ip->ip_id), iplen - (ip->ip_hl << 2),
+                                   offset << 3,
+                                   (ipoff & IP_MF) ? "+" : "");
+               }
+       }
+       /* on linux m->m_pkthdr.rcvif is unavailable: always the no-if form */
+#ifndef __linux__
+       if (oif || m->m_pkthdr.rcvif)
+               log(LOG_SECURITY | LOG_INFO,
+                   "ipfw: %d %s %s %s via %s%s\n",
+                   f ? f->rulenum : -1,
+                   action, proto, oif ? "out" : "in",
+                   oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
+                   fragment);
+       else
+#endif
+               log(LOG_SECURITY | LOG_INFO,
+                   "ipfw: %d %s %s [no if info]%s\n",
+                   f ? f->rulenum : -1,
+                   action, proto, fragment);
+       if (limit_reached)
+               log(LOG_SECURITY | LOG_NOTICE,
+                   "ipfw: limit %d reached on entry %d\n",
+                   limit_reached, f ? f->rulenum : -1);
+}
+/* end of file */
diff --git a/dummynet2/ip_fw_lookup.c b/dummynet2/ip_fw_lookup.c
new file mode 100644 (file)
index 0000000..bf04cb6
--- /dev/null
@@ -0,0 +1,304 @@
+/*-
+ * Copyright (c) 2009 Luigi Rizzo Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_fw_table.c 200601 2009-12-16 10:48:40Z luigi $");
+
+/*
+ * Rule and pipe lookup support for ipfw.
+ *
+
+ipfw and dummynet need to quickly find objects (rules, pipes)
+that may be dynamically created or destroyed.
+To address the problem, we label each new object with a unique
+32-bit identifier whose low K bits are the index in a lookup
+table. All existing objects are referred by the lookup table,
+and identifiers are chosen so that for each slot there is
+at most one active object (whose identifier points to the slot).
+This is almost a hash table, except that we can pick the
+identifiers after looking at the table's occupation so
+we have a trivial hash function and are collision free.
+
+With this structure, operations are very fast and simple:
+- the table has N entries s[i] with two fields, 'id' and 'ptr',
+  with N <= M = 2^k (M is an upper bound to the size of the table);
+- initially, all slots have s[i].id = i, and the pointers
+  are used to build a freelist (tailq).
+- a slot is considered empty if ptr == NULL or s[0] <= ptr < s[N].
+  This is easy to detect and we can use ptr to build the freelist.
+- when a new object is created, we put it in the empty slot i at the
+  head of the freelist, and set the id to s[i].id;
+- when an object is destroyed, we append its slot i to the end
+  of the freelist, and set s[i].id += M (note M, not N).
+- on a lookup for id = X, we look at slot i = X & (M-1),
+  and consider the lookup successful only if the slot is not
+  empty and s[i].id == X;
+- wraps occur at most every F * 2^32/M operations, where F is
+  the number of free slots. Because F is usually a reasonable
+  fraction of M, we should not worry too much.
+- if the table fills up, we can extend it by increasing N
+- shrinking the table is more difficult as we might create
+  collisions during the rehashing.
+ *
+ */
+
+#include <sys/cdefs.h> /* NOTE(review): duplicate include (also above); harmless */
+#ifdef _KERNEL
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+MALLOC_DEFINE(M_IPFW_LUT, "ipfw_lookup", "IpFw lookup");
+/* kernel wrappers: allocations are tagged M_IPFW_LUT and may sleep */
+#define Malloc(n)      malloc(n, M_IPFW_LUT, M_WAITOK)
+/*
+ * NOTE(review): stock FreeBSD kernels have no calloc(size, type, flags);
+ * this presumably relies on a compat definition in the glue headers,
+ * otherwise it should read malloc(n, M_IPFW_LUT, M_WAITOK | M_ZERO) --
+ * confirm before building with _KERNEL.
+ */
+#define Calloc(n)      calloc(n, M_IPFW_LUT, M_WAITOK | M_ZERO)
+#define Free(p)                free(p, M_IPFW_LUT)
+
+/* in-kernel builds compile the debug logging out entirely */
+#define log(x, arg...)
+
+#else /* !_KERNEL */
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+/* userland (test harness) build: libc allocators, log to stderr */
+#define Malloc(n)      malloc(n)
+#define Calloc(n)      calloc(1, n)
+#define Free(p)                free(p)
+#define log(x, arg...) fprintf(stderr, "%s: " x "\n", __FUNCTION__, ##arg)
+#endif /* !_KERNEL */
+
+/*
+ * One table slot.  'id' is the 32-bit identifier whose low bits index
+ * this slot; 'ptr' either points at the stored object (slot in use)
+ * or is a freelist link / NULL (slot empty, see empty() below).
+ */
+struct entry {
+       uint32_t        id;
+       struct entry    *ptr;
+};
+
+struct lookup_table {
+       int _size;      /* current number of slots, N */
+       int used;       /* number of slots holding a live object */
+       int mask; /* 2^k -1, used for hashing */
+       struct entry *f_head, *f_tail; /* freelist */
+       struct entry *  s;      /* slots, array of N entries */
+};
+
+/*
+ * A slot is empty when its ptr is NULL or points inside the slot
+ * array itself (i.e. it is a freelist link, not a stored object).
+ */
+static __inline int empty(struct lookup_table *head, const void *p)
+{
+       const struct entry *ep = p;
+       return (ep == NULL ||
+               (ep >= head->s && ep < &head->s[head->_size]));
+}
+
+/*
+ * init or reinit a table
+ */
+/*
+ * Create a table (head == NULL) or grow an existing one to new_size
+ * slots.  For a new table, 'mask' bounds the maximum size M = 2^k
+ * (rounded up to a power of two, and raised to new_size if smaller).
+ * On growth the old entries keep their slots/ids and the freelist is
+ * rebuilt from the empty slots.  Returns the table, or NULL on error.
+ */
+struct lookup_table *
+ipfw_lut_init(struct lookup_table *head, int new_size, int mask)
+{
+       int i;
+       struct entry *s;        /* the new slots */
+       struct entry *fh, *ft;  /* the freelist */
+
+       if (head != NULL) {
+               mask = head->mask;      /* mask is fixed after creation */
+               if (new_size <= head->_size)
+                       return head;    /* never shrink: nothing to do */
+               /*
+                * NOTE(review): this rejects new_size == mask+1 (== M),
+                * while the design notes above allow N <= M; confirm
+                * whether '>' was intended.
+                */
+               if (new_size >= mask+1) {
+                       log("size larger than mask");
+                       return NULL;
+               }
+       } else {
+               log("old is null, initialize");
+               head = Calloc(sizeof(*head));
+               if (head == NULL)
+                       return NULL;
+               if (new_size >= mask)
+                       mask = new_size;
+               /* round mask up to the next power of two if needed */
+               if (mask & (mask -1)) {
+                       for (i = 1; i < mask; i += i)
+                           ;
+                       log("mask %d not 2^k, round up to %d", mask, i);
+                       mask = i;
+               }
+               mask = head->mask = mask - 1;
+       }
+
+       s = Calloc(new_size * sizeof(*s));
+       if (s == NULL)
+               return NULL;
+       /*
+        * Fresh table: point head->s at the new (zeroed) array with a
+        * nominal size of 1 so the remap loop below has valid old slots
+        * to read from.
+        */
+       if (!head->s) {
+               head->s = s;
+               head->_size = 1;
+       }
+       fh = ft = NULL;
+       /* remap the entries, adjust the freelist */
+       for (i = 0; i < new_size; i++) {
+               s[i].id = (i >= head->_size) ? i : head->s[i].id;
+               if (i < head->_size && !empty(head, head->s[i].ptr)) {
+                       s[i].ptr = head->s[i].ptr;
+                       continue;
+               }
+               /* empty slot: append it to the new freelist */
+               if (fh == NULL)
+                       fh = &s[i];
+               else
+                       ft->ptr = &s[i];
+               ft = &s[i];
+       }
+       head->f_head = fh;
+       head->f_tail = ft;
+
+       /* write lock on the structure, to protect the readers */
+       /* NOTE(review): no lock is actually taken; the comments only
+        * mark where one would go. */
+       fh = head->s;
+       head->s = s;
+       head->_size = new_size;
+       /* release write lock */
+       if (fh != s)    /* fh == s only on the fresh-table path above */
+               Free(fh);
+       log("done");
+       return head;
+}
+
+/* insert returns the id */
+/*
+ * Store 'd' in the first free slot and return its identifier,
+ * or -1 if the table is full (freelist exhausted).
+ */
+int
+ipfw_lut_insert(struct lookup_table *head, void *d)
+{
+       struct entry *e;
+
+       e = head->f_head;
+       if (e == NULL)
+               return -1;
+       head->f_head = e->ptr;  /* unlink slot from the freelist */
+       e->ptr = d;
+       head->used++;
+       return e->id;
+}
+
+/* delete, returns the original entry */
+/*
+ * Remove the object identified by 'id' and return its pointer, or
+ * NULL if the slot does not currently hold that exact id (stale or
+ * bad identifier).  The slot's id is advanced by M = mask+1 so the
+ * old identifier can never match again, and the slot is appended to
+ * the tail of the freelist (maximizing time before id reuse).
+ */
+void *
+ipfw_lut_delete(struct lookup_table *head, int id)
+{
+       int i = id & head->mask;
+       void *result;
+       struct entry *e;
+
+       if (i >= head->_size)
+               return NULL;
+       e = &head->s[i];
+       if (e->id != id)
+               return NULL;
+       result = e->ptr;
+       /* write lock to invalidate the entry to readers */
+       e->id += head->mask + 1; /* prepare for next insert */
+       e->ptr = NULL;
+       /* release write lock */
+       if (head->f_head == NULL)
+               head->f_head = e;
+       else
+               head->f_tail->ptr = e;
+       head->f_tail = e;
+       head->used--;
+       return result;
+}
+
+/*
+ * Return the object stored under 'id', or NULL if the indexed slot
+ * does not carry that exact id (empty, reused, or out of range).
+ */
+void *
+ipfw_lut_lookup(struct lookup_table *head, int id)
+{
+       int i = id & head->mask;
+       struct entry *e;
+
+       if (i >= head->_size)
+               return NULL;
+       e = &head->s[i];
+       return (e->id == id) ? e->ptr : NULL;
+}
+
+/*
+ * Debug dump: print table geometry, then one line per slot with its
+ * id, an 'E' flag for empty slots, and either the stored pointer or
+ * (for freelist links) the index of the next free slot.
+ * Compiled out in kernel builds where log() is empty.
+ */
+void
+ipfw_lut_dump(struct lookup_table *head)
+{
+       int i;
+
+       log("head %p size %d used %d freelist %d",
+           head, head->_size, head->used, head->f_head ?
+                   head->f_head - head->s : -1);
+       for (i = 0; i < head->_size; i++) {
+               struct entry *e = &head->s[i];
+               char ee = empty(head, e->ptr) ? 'E' : ' ';
+               log("%5d  %5d %c %p", i, e->id, ee,
+                   ee == 'E' && e->ptr != NULL ?
+                   (void *)((struct entry *)e->ptr - head->s) : e->ptr);
+       }
+}
+
+#ifndef _KERNEL
+/*
+ * Test helper: look up every id saved in map[] and print the stored
+ * value.  The pointers really hold small ints (characters of the test
+ * string), hence the pointer-to-int cast and the %c format.
+ */
+void dump_p(struct lookup_table *p, int *map)
+{
+       int i;
+       for (i = 0; i < p->_size; i++) {
+           int id = (int)ipfw_lut_lookup(p, map[i]);
+           log("%3d: %3d: %c", map[i] % 64, i, id);
+       }
+}
+/*
+ * Userland test driver: inserts the characters of a test string as
+ * "objects", churning the freelist with 10 insert/delete pairs per
+ * character to exercise id recycling, then verifies lookups survive
+ * a no-op shrink request and a same-size re-init.
+ */
+int main(int argc, char *argv[])
+{
+       int i, j, l;
+#define S 1000
+       int map[S];     /* id returned for each inserted character */
+       struct lookup_table *p;
+       struct lookup_table *p1;
+       const char *m = "nel mezzo del cammin di nostra vita mi ritrovai"
+               " in una selva oscura e la diritta via era smarrita!";
+
+       fprintf(stderr, "testing lookup\n");
+
+       l = strlen(m);
+
+       p = ipfw_lut_init(NULL, 120, 33);
+
+       ipfw_lut_dump(p);
+       for (i = 0; i < l; i++) {
+           int x = m[i];
+           int id = ipfw_lut_insert(p, (void *)x);
+           //ipfw_lut_dump(p);
+           map[i] = id;
+           /* churn: recycle one slot ten times to advance its id */
+           for (j=0; j < 10; j++) {
+                   id = ipfw_lut_insert(p, (void *)'a');
+                   // ipfw_lut_dump(p);
+                   ipfw_lut_delete(p, id);
+                   // ipfw_lut_dump(p);
+           }
+       //    ipfw_lut_dump(p);
+       } 
+       dump_p(p, map);
+       /* shrink request (23 < 120): must be a no-op returning p */
+       p1 = ipfw_lut_init(p, 23, 0);
+       if (!p1)
+               return 1;
+       dump_p(p1, map);
+       /* same-size re-init: also a no-op, entries must survive */
+       p1 = ipfw_lut_init(p1, 120, 0);
+       if (!p1)
+               return 1;
+       dump_p(p1, map);
+       return 0;
+}
+#endif
+/* end of file */
diff --git a/dummynet2/ip_fw_nat.c b/dummynet2/ip_fw_nat.c
new file mode 100644 (file)
index 0000000..ead46a7
--- /dev/null
@@ -0,0 +1,606 @@
+/*-
+ * Copyright (c) 2008 Paolo Pisati
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_fw_nat.c 200975 2009-12-25 01:15:39Z luigi $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/rwlock.h>
+
+#define        IPFW_INTERNAL   /* Access to protected data structures in ip_fw.h. */
+
+#include <netinet/libalias/alias.h>
+#include <netinet/libalias/alias_local.h>
+
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+
+#include <machine/in_cksum.h>  /* XXX for in_cksum */
+
+static VNET_DEFINE(eventhandler_tag, ifaddr_event_tag);
+#define        V_ifaddr_event_tag      VNET(ifaddr_event_tag)
+
+static void 
+ifaddr_change(void *arg __unused, struct ifnet *ifp)
+{
+       struct cfg_nat *ptr;
+       struct ifaddr *ifa;
+       struct ip_fw_chain *chain;
+
+       chain = &V_layer3_chain;
+       IPFW_WLOCK(chain);
+       /* Check every nat entry... */
+       LIST_FOREACH(ptr, &chain->nat, _next) {
+               /* ...using nic 'ifp->if_xname' as dynamic alias address. */
+               if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) != 0)
+                       continue;
+                       if_addr_rlock(ifp);
+                       TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+                               if (ifa->ifa_addr == NULL)
+                                       continue;
+                               if (ifa->ifa_addr->sa_family != AF_INET)
+                                       continue;
+                               ptr->ip = ((struct sockaddr_in *) 
+                                   (ifa->ifa_addr))->sin_addr;
+                               LibAliasSetAddress(ptr->lib, ptr->ip);
+                       }
+                       if_addr_runlock(ifp);
+               }
+       IPFW_WUNLOCK(chain);
+}
+
+/*
+ * delete the pointers for nat entry ix, or all of them if ix < 0
+ */
+static void
+flush_nat_ptrs(struct ip_fw_chain *chain, const int ix)
+{
+       int i;
+       ipfw_insn_nat *cmd;
+
+       IPFW_WLOCK_ASSERT(chain);
+       for (i = 0; i < chain->n_rules; i++) {
+               cmd = (ipfw_insn_nat *)ACTION_PTR(chain->map[i]);
+               /* XXX skip log and the like ? */
+               if (cmd->o.opcode == O_NAT && cmd->nat != NULL &&
+                           (ix < 0 || cmd->nat->id == ix))
+                       cmd->nat = NULL;
+       }
+}
+
+static void
+del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head)
+{
+       struct cfg_redir *r, *tmp_r;
+       struct cfg_spool *s, *tmp_s;
+       int i, num;
+
+       LIST_FOREACH_SAFE(r, head, _next, tmp_r) {
+               num = 1; /* Number of alias_link to delete. */
+               switch (r->mode) {
+               case REDIR_PORT:
+                       num = r->pport_cnt;
+                       /* FALLTHROUGH */
+               case REDIR_ADDR:
+               case REDIR_PROTO:
+                       /* Delete all libalias redirect entry. */
+                       for (i = 0; i < num; i++)
+                               LibAliasRedirectDelete(n->lib, r->alink[i]);
+                       /* Del spool cfg if any. */
+                       LIST_FOREACH_SAFE(s, &r->spool_chain, _next, tmp_s) {
+                               LIST_REMOVE(s, _next);
+                               free(s, M_IPFW);
+                       }
+                       free(r->alink, M_IPFW);
+                       LIST_REMOVE(r, _next);
+                       free(r, M_IPFW);
+                       break;
+               default:
+                       printf("unknown redirect mode: %u\n", r->mode);                         
+                       /* XXX - panic?!?!? */
+                       break; 
+               }
+       }
+}
+
+static int
+add_redir_spool_cfg(char *buf, struct cfg_nat *ptr)
+{
+       struct cfg_redir *r, *ser_r;
+       struct cfg_spool *s, *ser_s;
+       int cnt, off, i;
+
+       for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) {
+               ser_r = (struct cfg_redir *)&buf[off];
+               r = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO);
+               memcpy(r, ser_r, SOF_REDIR);
+               LIST_INIT(&r->spool_chain);
+               off += SOF_REDIR;
+               r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt,
+                   M_IPFW, M_WAITOK | M_ZERO);
+               switch (r->mode) {
+               case REDIR_ADDR:
+                       r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr,
+                           r->paddr);
+                       break;
+               case REDIR_PORT:
+                       for (i = 0 ; i < r->pport_cnt; i++) {
+                               /* If remotePort is all ports, set it to 0. */
+                               u_short remotePortCopy = r->rport + i;
+                               if (r->rport_cnt == 1 && r->rport == 0)
+                                       remotePortCopy = 0;
+                               r->alink[i] = LibAliasRedirectPort(ptr->lib,
+                                   r->laddr, htons(r->lport + i), r->raddr,
+                                   htons(remotePortCopy), r->paddr, 
+                                   htons(r->pport + i), r->proto);
+                               if (r->alink[i] == NULL) {
+                                       r->alink[0] = NULL;
+                                       break;
+                               }
+                       }
+                       break;
+               case REDIR_PROTO:
+                       r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr,
+                           r->raddr, r->paddr, r->proto);
+                       break;
+               default:
+                       printf("unknown redirect mode: %u\n", r->mode);
+                       break; 
+               }
+               /* XXX perhaps return an error instead of panic ? */
+               if (r->alink[0] == NULL)
+                       panic("LibAliasRedirect* returned NULL");
+               /* LSNAT handling. */
+                       for (i = 0; i < r->spool_cnt; i++) {
+                               ser_s = (struct cfg_spool *)&buf[off];
+                       s = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO);
+                               memcpy(s, ser_s, SOF_SPOOL);
+                               LibAliasAddServer(ptr->lib, r->alink[0], 
+                                   s->addr, htons(s->port));
+                               off += SOF_SPOOL;
+                               /* Hook spool entry. */
+                       LIST_INSERT_HEAD(&r->spool_chain, s, _next);
+                       }
+               /* And finally hook this redir entry. */
+               LIST_INSERT_HEAD(&ptr->redir_chain, r, _next);
+       }
+       return (1);
+}
+
+static int
+ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m)
+{
+       struct mbuf *mcl;
+       struct ip *ip;
+       /* XXX - libalias duct tape */
+       int ldt, retval;
+       char *c;
+
+       ldt = 0;
+       retval = 0;
+       mcl = m_megapullup(m, m->m_pkthdr.len);
+       if (mcl == NULL) {
+               args->m = NULL;
+               return (IP_FW_DENY);
+       }
+       ip = mtod(mcl, struct ip *);
+
+       /* 
+        * XXX - Libalias checksum offload 'duct tape':
+        * 
+        * locally generated packets have only pseudo-header checksum
+        * calculated and libalias will break it[1], so mark them for
+        * later fix.  Moreover there are cases when libalias modifies
+        * tcp packet data[2], mark them for later fix too.
+        *
+        * [1] libalias was never meant to run in kernel, so it does
+        * not have any knowledge about checksum offloading, and
+        * expects a packet with a full internet checksum.
+        * Unfortunately, packets generated locally will have just the
+        * pseudo header calculated, and when libalias tries to adjust
+        * the checksum it will actually compute a wrong value.
+        *
+        * [2] when libalias modifies tcp's data content, full TCP
+        * checksum has to be recomputed: the problem is that
+        * libalias does not have any idea about checksum offloading.
+        * To work around this, we do not do checksumming in LibAlias,
+        * but only mark the packets in th_x2 field. If we receive a
+        * marked packet, we calculate correct checksum for it
+        * aware of offloading.  Why such a terrible hack instead of
+        * recalculating checksum for each packet?
+        * Because the previous checksum was not checked!
+        * Recalculating checksums for EVERY packet will hide ALL
+        * transmission errors. Yes, marked packets still suffer from
+        * this problem. But, sigh, natd(8) has this problem, too.
+        *
+        * TODO: -make libalias mbuf aware (so
+        * it can handle delayed checksum and tso)
+        */
+
+       if (mcl->m_pkthdr.rcvif == NULL && 
+           mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
+               ldt = 1;
+
+       c = mtod(mcl, char *);
+       if (args->oif == NULL)
+               retval = LibAliasIn(t->lib, c, 
+                       mcl->m_len + M_TRAILINGSPACE(mcl));
+       else
+               retval = LibAliasOut(t->lib, c, 
+                       mcl->m_len + M_TRAILINGSPACE(mcl));
+       if (retval == PKT_ALIAS_RESPOND) {
+         m->m_flags |= M_SKIP_FIREWALL;
+         retval = PKT_ALIAS_OK;
+       }
+       if (retval != PKT_ALIAS_OK &&
+           retval != PKT_ALIAS_FOUND_HEADER_FRAGMENT) {
+               /* XXX - should i add some logging? */
+               m_free(mcl);
+               args->m = NULL;
+               return (IP_FW_DENY);
+       }
+       mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len);
+
+       /* 
+        * XXX - libalias checksum offload 
+        * 'duct tape' (see above) 
+        */
+
+       if ((ip->ip_off & htons(IP_OFFMASK)) == 0 && 
+           ip->ip_p == IPPROTO_TCP) {
+               struct tcphdr   *th; 
+
+               th = (struct tcphdr *)(ip + 1);
+               if (th->th_x2) 
+                       ldt = 1;
+       }
+
+       if (ldt) {
+               struct tcphdr   *th;
+               struct udphdr   *uh;
+               u_short cksum;
+
+               /* XXX check if ip_len can stay in net format */
+               cksum = in_pseudo(
+                   ip->ip_src.s_addr,
+                   ip->ip_dst.s_addr, 
+                   htons(ip->ip_p + ntohs(ip->ip_len) - (ip->ip_hl << 2))
+               );
+                                       
+               switch (ip->ip_p) {
+               case IPPROTO_TCP:
+                       th = (struct tcphdr *)(ip + 1);
+                       /* 
+                        * Maybe it was set in 
+                        * libalias... 
+                        */
+                       th->th_x2 = 0;
+                       th->th_sum = cksum;
+                       mcl->m_pkthdr.csum_data = 
+                           offsetof(struct tcphdr, th_sum);
+                       break;
+               case IPPROTO_UDP:
+                       uh = (struct udphdr *)(ip + 1);
+                       uh->uh_sum = cksum;
+                       mcl->m_pkthdr.csum_data = 
+                           offsetof(struct udphdr, uh_sum);
+                       break;                                          
+               }
+               /* No hw checksum offloading: do it ourselves */
+               if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) {
+                       in_delayed_cksum(mcl);
+                       mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+               }
+       }
+       args->m = mcl;
+       return (IP_FW_NAT);
+}
+
+static struct cfg_nat *
+lookup_nat(struct nat_list *l, int nat_id)
+{
+       struct cfg_nat *res;
+
+       LIST_FOREACH(res, l, _next) {
+               if (res->id == nat_id)
+                       break;
+       }
+       return res;
+}
+
+static int 
+ipfw_nat_cfg(struct sockopt *sopt)
+{
+       struct cfg_nat *ptr, *ser_n;
+       char *buf;
+       struct ip_fw_chain *chain = &V_layer3_chain;
+
+       buf = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO);
+       sooptcopyin(sopt, buf, NAT_BUF_LEN, sizeof(struct cfg_nat));
+       ser_n = (struct cfg_nat *)buf;
+
+       /* check valid parameter ser_n->id > 0 ? */
+       /* 
+        * Find/create nat rule.
+        */
+       IPFW_WLOCK(chain);
+       ptr = lookup_nat(&chain->nat, ser_n->id);
+       if (ptr == NULL) {
+               /* New rule: allocate and init new instance. */
+               ptr = malloc(sizeof(struct cfg_nat), 
+                   M_IPFW, M_NOWAIT | M_ZERO);
+               if (ptr == NULL) {
+                       IPFW_WUNLOCK(chain);
+                       free(buf, M_IPFW);
+                       return (ENOSPC);
+               }
+               ptr->lib = LibAliasInit(NULL);
+               if (ptr->lib == NULL) {
+                       IPFW_WUNLOCK(chain);
+                       free(ptr, M_IPFW);
+                       free(buf, M_IPFW);
+                       return (EINVAL);
+               }
+               LIST_INIT(&ptr->redir_chain);
+       } else {
+               /* Entry already present: temporarly unhook it. */
+               LIST_REMOVE(ptr, _next);
+               flush_nat_ptrs(chain, ser_n->id);
+       }
+       IPFW_WUNLOCK(chain);
+
+       /* 
+        * Basic nat configuration.
+        */
+       ptr->id = ser_n->id;
+       /* 
+        * XXX - what if this rule doesn't nat any ip and just 
+        * redirect? 
+        * do we set aliasaddress to 0.0.0.0?
+        */
+       ptr->ip = ser_n->ip;
+       ptr->redir_cnt = ser_n->redir_cnt;
+       ptr->mode = ser_n->mode;
+       LibAliasSetMode(ptr->lib, ser_n->mode, ser_n->mode);
+       LibAliasSetAddress(ptr->lib, ptr->ip);
+       memcpy(ptr->if_name, ser_n->if_name, IF_NAMESIZE);
+
+       /* 
+        * Redir and LSNAT configuration.
+        */
+       /* Delete old cfgs. */
+       del_redir_spool_cfg(ptr, &ptr->redir_chain);
+       /* Add new entries. */
+       add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr);
+       free(buf, M_IPFW);
+       IPFW_WLOCK(chain);
+       LIST_INSERT_HEAD(&chain->nat, ptr, _next);
+       IPFW_WUNLOCK(chain);
+       return (0);
+}
+
+static int
+ipfw_nat_del(struct sockopt *sopt)
+{
+       struct cfg_nat *ptr;
+       struct ip_fw_chain *chain = &V_layer3_chain;
+       int i;
+               
+       sooptcopyin(sopt, &i, sizeof i, sizeof i);
+       /* XXX validate i */
+       IPFW_WLOCK(chain);
+       ptr = lookup_nat(&chain->nat, i);
+       if (ptr == NULL) {
+               IPFW_WUNLOCK(chain);
+               return (EINVAL);
+       }
+       LIST_REMOVE(ptr, _next);
+       flush_nat_ptrs(chain, i);
+       IPFW_WUNLOCK(chain);
+       del_redir_spool_cfg(ptr, &ptr->redir_chain);
+       LibAliasUninit(ptr->lib);
+       free(ptr, M_IPFW);
+       return (0);
+}
+
+static int
+ipfw_nat_get_cfg(struct sockopt *sopt)
+{      
+       uint8_t *data;
+       struct cfg_nat *n;
+       struct cfg_redir *r;
+       struct cfg_spool *s;
+       int nat_cnt, off;
+       struct ip_fw_chain *chain;
+       int err = ENOSPC;
+               
+       chain = &V_layer3_chain;
+       nat_cnt = 0;
+       off = sizeof(nat_cnt);
+
+       data = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO);
+       IPFW_RLOCK(chain);
+       /* Serialize all the data. */
+       LIST_FOREACH(n, &chain->nat, _next) {
+               nat_cnt++;
+               if (off + SOF_NAT >= NAT_BUF_LEN)
+                       goto nospace;
+                       bcopy(n, &data[off], SOF_NAT);
+                       off += SOF_NAT;
+                       LIST_FOREACH(r, &n->redir_chain, _next) {
+                       if (off + SOF_REDIR >= NAT_BUF_LEN)
+                               goto nospace;
+                       bcopy(r, &data[off], SOF_REDIR);
+                                       off += SOF_REDIR;
+                       LIST_FOREACH(s, &r->spool_chain, _next) {
+                               if (off + SOF_SPOOL >= NAT_BUF_LEN)
+                                                       goto nospace;
+                               bcopy(s, &data[off], SOF_SPOOL);
+                               off += SOF_SPOOL;
+                                       }
+                       }
+       }
+       err = 0; /* all good */
+nospace:
+       IPFW_RUNLOCK(chain);
+       if (err == 0) {
+       bcopy(&nat_cnt, data, sizeof(nat_cnt));
+       sooptcopyout(sopt, data, NAT_BUF_LEN);
+       } else {
+       printf("serialized data buffer not big enough:"
+           "please increase NAT_BUF_LEN\n");
+       }
+       free(data, M_IPFW);
+       return (err);
+}
+
+static int
+ipfw_nat_get_log(struct sockopt *sopt)
+{
+       uint8_t *data;
+       struct cfg_nat *ptr;
+       int i, size;
+       struct ip_fw_chain *chain;
+
+       chain = &V_layer3_chain;
+
+       IPFW_RLOCK(chain);
+       /* one pass to count, one to copy the data */
+       i = 0;
+       LIST_FOREACH(ptr, &chain->nat, _next) {
+               if (ptr->lib->logDesc == NULL) 
+                       continue;
+               i++;
+       }
+       size = i * (LIBALIAS_BUF_SIZE + sizeof(int));
+       data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO);
+               if (data == NULL) {
+               IPFW_RUNLOCK(chain);
+                       return (ENOSPC);
+               }
+       i = 0;
+       LIST_FOREACH(ptr, &chain->nat, _next) {
+               if (ptr->lib->logDesc == NULL)
+                       continue;
+               bcopy(&ptr->id, &data[i], sizeof(int));
+               i += sizeof(int);
+               bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE);
+               i += LIBALIAS_BUF_SIZE;
+       }
+       IPFW_RUNLOCK(chain);
+       sooptcopyout(sopt, data, size);
+       free(data, M_IPFW);
+       return(0);
+}
+
+static void
+ipfw_nat_init(void)
+{
+
+       IPFW_WLOCK(&V_layer3_chain);
+       /* init ipfw hooks */
+       ipfw_nat_ptr = ipfw_nat;
+       lookup_nat_ptr = lookup_nat;
+       ipfw_nat_cfg_ptr = ipfw_nat_cfg;
+       ipfw_nat_del_ptr = ipfw_nat_del;
+       ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg;
+       ipfw_nat_get_log_ptr = ipfw_nat_get_log;
+       IPFW_WUNLOCK(&V_layer3_chain);
+       V_ifaddr_event_tag = EVENTHANDLER_REGISTER(
+           ifaddr_event, ifaddr_change,
+           NULL, EVENTHANDLER_PRI_ANY);
+}
+
+static void
+ipfw_nat_destroy(void)
+{
+       struct cfg_nat *ptr, *ptr_temp;
+       struct ip_fw_chain *chain;
+       
+       chain = &V_layer3_chain;
+       IPFW_WLOCK(chain);
+       LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) {
+               LIST_REMOVE(ptr, _next);
+               del_redir_spool_cfg(ptr, &ptr->redir_chain);
+               LibAliasUninit(ptr->lib);
+               free(ptr, M_IPFW);
+       }
+       EVENTHANDLER_DEREGISTER(ifaddr_event, V_ifaddr_event_tag);
+       flush_nat_ptrs(chain, -1 /* flush all */);
+       /* deregister ipfw_nat */
+       ipfw_nat_ptr = NULL;
+       lookup_nat_ptr = NULL;
+       ipfw_nat_cfg_ptr = NULL;
+       ipfw_nat_del_ptr = NULL;
+       ipfw_nat_get_cfg_ptr = NULL;
+       ipfw_nat_get_log_ptr = NULL;
+       IPFW_WUNLOCK(chain);
+}
+
+static int
+ipfw_nat_modevent(module_t mod, int type, void *unused)
+{
+       int err = 0;
+
+       switch (type) {
+       case MOD_LOAD:
+               ipfw_nat_init();
+               break;
+
+       case MOD_UNLOAD:
+               ipfw_nat_destroy();
+               break;
+
+       default:
+               return EOPNOTSUPP;
+               break;
+       }
+       return err;
+}
+
+static moduledata_t ipfw_nat_mod = {
+       "ipfw_nat",
+       ipfw_nat_modevent,
+       0
+};
+
+DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
+MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1);
+MODULE_DEPEND(ipfw_nat, ipfw, 2, 2, 2);
+MODULE_VERSION(ipfw_nat, 1);
+/* end of file */
diff --git a/dummynet2/ip_fw_pfil.c b/dummynet2/ip_fw_pfil.c
new file mode 100644 (file)
index 0000000..db7cec6
--- /dev/null
@@ -0,0 +1,410 @@
+/*-
+ * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_pfil.c 200601 2009-12-16 10:48:40Z luigi $");
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif /* KLD_MODULE */
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/pfil.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ip_dummynet.h>
+#include <netgraph/ng_ipfw.h>
+
+#include <machine/in_cksum.h>
+
+static VNET_DEFINE(int, fw_enable) = 1;
+#define V_fw_enable    VNET(fw_enable)
+
+#ifdef INET6
+static VNET_DEFINE(int, fw6_enable) = 1;
+#define V_fw6_enable   VNET(fw6_enable)
+#endif
+
+int ipfw_chg_hook(SYSCTL_HANDLER_ARGS);
+
+/* Divert hooks. */
+void (*ip_divert_ptr)(struct mbuf *m, int incoming);
+
+/* ng_ipfw hooks. */
+ng_ipfw_input_t *ng_ipfw_input_p = NULL;
+
+/* Forward declarations. */
+static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int);
+
+#ifdef SYSCTL_NODE
+SYSCTL_DECL(_net_inet_ip_fw);
+SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0,
+    ipfw_chg_hook, "I", "Enable ipfw");
+#ifdef INET6
+SYSCTL_DECL(_net_inet6_ip6_fw);
+SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0,
+    ipfw_chg_hook, "I", "Enable ipfw+6");
+#endif /* INET6 */
+#endif /* SYSCTL_NODE */
+
+/*
+ * The pfilter hook to pass packets to ipfw_chk and then to
+ * dummynet, divert, netgraph or other modules.
+ * The packet may be consumed.
+ */            
+int
+ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
+    struct inpcb *inp)
+{
+       struct ip_fw_args args;
+       struct m_tag *tag;
+       int ipfw;
+       int ret;
+
+       /* all the processing now uses ip_len in net format */
+       SET_NET_IPLEN(mtod(*m0, struct ip *));
+
+       /* convert dir to IPFW values */
+       dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT;
+       bzero(&args, sizeof(args));
+
+again:
+       /*
+        * extract and remove the tag if present. If we are left
+        * with onepass, optimize the outgoing path.
+        */
+       tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL);
+       if (tag != NULL) {
+               args.rule = *((struct ipfw_rule_ref *)(tag+1));
+               m_tag_delete(*m0, tag);
+               if (args.rule.info & IPFW_ONEPASS) {
+                       SET_HOST_IPLEN(mtod(*m0, struct ip *));
+                       return 0;
+               }
+       }
+
+       args.m = *m0;
+       args.oif = dir == DIR_OUT ? ifp : NULL;
+       args.inp = inp;
+
+       ipfw = ipfw_chk(&args);
+       *m0 = args.m;
+
+       KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL",
+           __func__));
+
+       /* breaking out of the switch means drop */
+       ret = 0;        /* default return value for pass */
+       switch (ipfw) {
+       case IP_FW_PASS:
+               /* next_hop may be set by ipfw_chk */
+               if (args.next_hop == NULL)
+                       break; /* pass */
+#ifndef IPFIREWALL_FORWARD
+               ret = EACCES;
+#else
+           {
+               struct m_tag *fwd_tag;
+
+               /* Incoming packets should not be tagged so we do not
+                * m_tag_find. Outgoing packets may be tagged, so we
+                * reuse the tag if present.
+                */
+               fwd_tag = (dir == DIR_IN) ? NULL :
+                       m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL);
+               if (fwd_tag != NULL) {
+                       m_tag_unlink(*m0, fwd_tag);
+               } else {
+                       fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD,
+                               sizeof(struct sockaddr_in), M_NOWAIT);
+                       if (fwd_tag == NULL) {
+                               ret = EACCES;
+                               break; /* i.e. drop */
+                       }
+               }
+               bcopy(args.next_hop, (fwd_tag+1), sizeof(struct sockaddr_in));
+               m_tag_prepend(*m0, fwd_tag);
+
+               if (in_localip(args.next_hop->sin_addr))
+                       (*m0)->m_flags |= M_FASTFWD_OURS;
+           }
+#endif
+               break;
+
+       case IP_FW_DENY:
+               ret = EACCES;
+               break; /* i.e. drop */
+
+       case IP_FW_DUMMYNET:
+               ret = EACCES;
+               if (ip_dn_io_ptr == NULL)
+                       break; /* i.e. drop */
+               if (mtod(*m0, struct ip *)->ip_v == 4)
+                       ret = ip_dn_io_ptr(m0, dir, &args);
+               else if (mtod(*m0, struct ip *)->ip_v == 6)
+                       ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args);
+               else
+                       break; /* drop it */
+               /*
+                * XXX should read the return value.
+                * dummynet normally eats the packet and sets *m0=NULL
+                * unless the packet can be sent immediately. In this
+                * case args is updated and we should re-run the
+                * check without clearing args.
+                */
+               if (*m0 != NULL)
+                       goto again;
+               break;
+
+       case IP_FW_TEE:
+       case IP_FW_DIVERT:
+               if (ip_divert_ptr == NULL) {
+                       ret = EACCES;
+                       break; /* i.e. drop */
+               }
+               ret = ipfw_divert(m0, dir, &args.rule,
+                       (ipfw == IP_FW_TEE) ? 1 : 0);
+               /* continue processing for the original packet (tee). */
+               if (*m0)
+                       goto again;
+               break;
+
+       case IP_FW_NGTEE:
+       case IP_FW_NETGRAPH:
+               if (!NG_IPFW_LOADED) {
+                       ret = EACCES;
+                       break; /* i.e. drop */
+               }
+               ret = ng_ipfw_input_p(m0, dir, &args,
+                       (ipfw == IP_FW_NGTEE) ? 1 : 0);
+               if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */
+                       goto again;     /* continue with packet */
+               break;
+               
+       case IP_FW_NAT:
+       case IP_FW_REASS:
+               goto again;             /* continue with packet */
+       
+       default:
+               KASSERT(0, ("%s: unknown retval", __func__));
+       }
+
+       if (ret != 0) {
+               if (*m0)
+                       FREE_PKT(*m0);
+               *m0 = NULL;
+       }
+       if (*m0)
+               SET_HOST_IPLEN(mtod(*m0, struct ip *));
+       return ret;
+}
+
+/* do the divert, return 1 on error 0 on success */
+static int
+ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule,
+       int tee)
+{
+       /*
+        * ipfw_chk() has already tagged the packet with the divert tag.
+        * If tee is set, copy packet and return original.
+        * If not tee, consume packet and send it to divert socket.
+        */
+       struct mbuf *clone;
+       struct ip *ip;
+       struct m_tag *tag;
+
+       /* Cloning needed for tee? */
+       if (tee == 0) {
+               clone = *m0;    /* use the original mbuf */
+               *m0 = NULL;
+       } else {
+               clone = m_dup(*m0, M_DONTWAIT);
+               /* If we cannot duplicate the mbuf, we sacrifice the divert
+                * chain and continue with the tee-ed packet.
+                */
+               if (clone == NULL)
+                       return 1;
+       }
+
+       /*
+        * Divert listeners can normally handle non-fragmented packets,
+        * but we can only reass in the non-tee case.
+        * This means that listeners on a tee rule may get fragments,
+        * and have to live with that.
+        * Note that we now have the 'reass' ipfw option so if we care
+        * we can do it before a 'tee'.
+        */
+       ip = mtod(clone, struct ip *);
+       if (!tee && ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) {
+               int hlen;
+               struct mbuf *reass;
+
+               SET_HOST_IPLEN(ip); /* ip_reass wants host order */
+               reass = ip_reass(clone); /* Reassemble packet. */
+               if (reass == NULL)
+                       return 0; /* not an error */
+               /* if reass = NULL then it was consumed by ip_reass */
+               /*
+                * IP header checksum fixup after reassembly and leave header
+                * in network byte order.
+                */
+               ip = mtod(reass, struct ip *);
+               hlen = ip->ip_hl << 2;
+               SET_NET_IPLEN(ip);
+               ip->ip_sum = 0;
+               if (hlen == sizeof(struct ip))
+                       ip->ip_sum = in_cksum_hdr(ip);
+               else
+                       ip->ip_sum = in_cksum(reass, hlen);
+               clone = reass;
+       }
+       /* attach a tag to the packet with the reinject info */
+       tag = m_tag_alloc(MTAG_IPFW_RULE, 0,
+                   sizeof(struct ipfw_rule_ref), M_NOWAIT);
+       if (tag == NULL) {
+               FREE_PKT(clone);
+               return 1;
+       }
+       *((struct ipfw_rule_ref *)(tag+1)) = *rule;
+       m_tag_prepend(clone, tag);
+
+       /* Do the dirty job... */
+       ip_divert_ptr(clone, incoming);
+       return 0;
+}
+
+/*
+ * attach or detach hooks for a given protocol family
+ */
+static int
+ipfw_hook(int onoff, int pf)
+{
+       struct pfil_head *pfh;
+
+       pfh = pfil_head_get(PFIL_TYPE_AF, pf);
+       if (pfh == NULL)
+               return ENOENT;
+
+       (void) (onoff ? pfil_add_hook : pfil_remove_hook)
+           (ipfw_check_hook, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh);
+
+       return 0;
+}
+
+int
+ipfw_attach_hooks(int arg)
+{
+       int error = 0;
+
+       if (arg == 0) /* detach */
+               ipfw_hook(0, AF_INET);
+       else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) {
+                error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */
+                printf("ipfw_hook() error\n");
+        }
+#ifdef INET6
+       if (arg == 0) /* detach */
+               ipfw_hook(0, AF_INET6);
+       else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) {
+                error = ENOENT;
+                printf("ipfw6_hook() error\n");
+        }
+#endif
+       return error;
+}
+
+int
+ipfw_chg_hook(SYSCTL_HANDLER_ARGS)
+{
+       int enable;
+       int oldenable;
+       int error;
+       int af;
+
+       if (arg1 == &VNET_NAME(fw_enable)) {
+               enable = V_fw_enable;
+               af = AF_INET;
+       }
+#ifdef INET6
+       else if (arg1 == &VNET_NAME(fw6_enable)) {
+               enable = V_fw6_enable;
+               af = AF_INET6;
+       }
+#endif
+       else 
+               return (EINVAL);
+
+       oldenable = enable;
+
+       error = sysctl_handle_int(oidp, &enable, 0, req);
+
+       if (error)
+               return (error);
+
+       enable = (enable) ? 1 : 0;
+
+       if (enable == oldenable)
+               return (0);
+
+       error = ipfw_hook(enable, af);
+       if (error)
+               return (error);
+       if (af == AF_INET)
+               V_fw_enable = enable;
+#ifdef INET6
+       else if (af == AF_INET6)
+               V_fw6_enable = enable;
+#endif
+
+       return (0);
+}
+/* end of file */
diff --git a/dummynet2/ip_fw_sockopt.c b/dummynet2/ip_fw_sockopt.c
new file mode 100644 (file)
index 0000000..086d7f0
--- /dev/null
@@ -0,0 +1,1086 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Supported by: Valeria Paoli
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_sockopt.c 200601 2009-12-16 10:48:40Z luigi $");
+
+/*
+ * Sockopt support for ipfw. The routines here implement
+ * the upper half of the ipfw code.
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>  /* struct m_tag used by nested headers */
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <net/if.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
+
+/*
+ * static variables followed by global ones (none in this file)
+ */
+
+/*
+ * Find the smallest rule >= key, id.
+ * We could use bsearch but it is so simple that we code it directly
+ */
+int
+ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id)
+{
+       int i, lo, hi;
+       struct ip_fw *r;
+
+       /* classic binary search; chain->map is sorted by (rulenum, id) */
+       for (lo = 0, hi = chain->n_rules - 1; lo < hi;) {
+               i = (lo + hi) / 2;
+               r = chain->map[i];
+               if (r->rulenum < key)
+                       lo = i + 1;     /* continue from the next one */
+               else if (r->rulenum > key)
+                       hi = i;         /* this might be good */
+               else if (r->id < id)
+                       lo = i + 1;     /* continue from the next one */
+               else /* r->id >= id */
+                       hi = i;         /* this might be good */
+       };      /* XXX stray ';' -- harmless empty statement */
+       return hi;      /* index of the smallest rule >= (key, id) */
+}
+
+/*
+ * allocate a new map, returns the chain locked. extra is the number
+ * of entries to add or delete.
+ */
+/*
+ * Allocate a rule map sized for chain->n_rules + extra entries
+ * (extra may be negative when deleting rules).  On success the
+ * chain is returned IPFW_UH_WLOCKed -- taken here unless 'locked'
+ * says the caller already holds it.  Because malloc() may sleep,
+ * n_rules is re-checked under the lock and the allocation retried
+ * if another writer grew the ruleset in the meantime.
+ * NOTE(review): with M_WAITOK a kernel malloc should not return
+ * NULL; the NULL check matters mainly for this userland port.
+ */
+static struct ip_fw **
+get_map(struct ip_fw_chain *chain, int extra, int locked)
+{
+
+       for (;;) {
+               struct ip_fw **map;
+               int i;
+
+               i = chain->n_rules + extra;
+               map = malloc(i * sizeof(struct ip_fw *), M_IPFW, M_WAITOK);
+               if (map == NULL) {
+                       printf("%s: cannot allocate map\n", __FUNCTION__);
+                       return NULL;
+               }
+               if (!locked)
+                       IPFW_UH_WLOCK(chain);
+               if (i >= chain->n_rules + extra) /* good */
+                       return map;
+               /* otherwise we lost the race, free and retry */
+               if (!locked)
+                       IPFW_UH_WUNLOCK(chain);
+               free(map, M_IPFW);
+       }
+}
+
+/*
+ * swap the maps. It is supposed to be called with IPFW_UH_WLOCK
+ */
+static struct ip_fw **
+swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len)
+{
+       struct ip_fw **prev;
+
+       IPFW_WLOCK(chain);
+       prev = chain->map;              /* remember the map being replaced */
+       chain->map = new_map;
+       chain->n_rules = new_len;
+       chain->id++;                    /* cached rule references are now stale */
+       IPFW_WUNLOCK(chain);
+       return prev;                    /* caller frees this after unlocking */
+}
+
+/*
+ * Add a new rule to the list. Copy the rule into a malloc'ed area, then
+ * possibly create a rule number and add the rule to the list.
+ * Update the rule_number in the input struct so the caller knows it as well.
+ * XXX DO NOT USE FOR THE DEFAULT RULE.
+ * Must be called without IPFW_UH held
+ */
+int
+ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
+{
+       struct ip_fw *rule;
+       int i, l, insert_before;
+       struct ip_fw **map;     /* the new array of pointers */
+
+       /* reject rule numbers at or above the default rule */
+       if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1)
+               return (EINVAL);
+
+       l = RULESIZE(input_rule);
+       rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO);
+       if (rule == NULL)
+               return (ENOSPC);
+       /* get_map returns with IPFW_UH_WLOCK if successful */
+       map = get_map(chain, 1, 0 /* not locked */);
+       if (map == NULL) {
+               free(rule, M_IPFW);
+               return ENOSPC;
+       }
+
+       bcopy(input_rule, rule, l);
+       /* clear fields not settable from userland */
+       rule->x_next = NULL;
+       rule->next_rule = NULL;
+       rule->pcnt = 0;
+       rule->bcnt = 0;
+       rule->timestamp = 0;
+
+       /* keep the auto-numbering step within a sane range */
+       if (V_autoinc_step < 1)
+               V_autoinc_step = 1;
+       else if (V_autoinc_step > 1000)
+               V_autoinc_step = 1000;
+       /* find the insertion point, we will insert before */
+       insert_before = rule->rulenum ? rule->rulenum + 1 : IPFW_DEFAULT_RULE;
+       i = ipfw_find_rule(chain, insert_before, 0);
+       /* duplicate first part */
+       if (i > 0)
+               bcopy(chain->map, map, i * sizeof(struct ip_fw *));
+       map[i] = rule;
+       /* duplicate remaining part, we always have the default rule */
+       bcopy(chain->map + i, map + i + 1,
+               sizeof(struct ip_fw *) *(chain->n_rules - i));
+       if (rule->rulenum == 0) {
+               /* auto-assign: previous rule's number plus the step,
+                * written back so the caller learns it too */
+               rule->rulenum = i > 0 ? map[i-1]->rulenum : 0;
+               if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step)
+                       rule->rulenum += V_autoinc_step;
+               input_rule->rulenum = rule->rulenum;
+       }
+
+       rule->id = chain->id + 1;       /* matches chain->id after the swap */
+       /* publish the new map; the old one comes back for freeing */
+       map = swap_map(chain, map, chain->n_rules + 1);
+       chain->static_len += l;
+       IPFW_UH_WUNLOCK(chain);
+       if (map)
+               free(map, M_IPFW);      /* free the old map outside the lock */
+       return (0);
+}
+
+/*
+ * Reclaim storage associated with a list of rules.  This is
+ * typically the list created using remove_rule.
+ * A NULL pointer on input is handled correctly.
+ */
+void
+ipfw_reap_rules(struct ip_fw *head)
+{
+       struct ip_fw *next;
+
+       /* walk the x_next chain, releasing each rule in turn */
+       for (; head != NULL; head = next) {
+               next = head->x_next;
+               free(head, M_IPFW);
+       }
+}
+
+/**
+ * Remove all rules with given number, and also do set manipulation.
+ * Assumes chain != NULL && *chain != NULL.
+ *
+ * The argument is a u_int32_t. The low 16 bits are the rule or set number,
+ * the next 8 bits are the new set, the top 8 bits are the command:
+ *
+ *     0       delete rules with given number
+ *     1       delete rules with given set number
+ *     2       move rules with given number to new set
+ *     3       move rules with given set number to new set
+ *     4       swap sets with given numbers
+ *     5       delete rules with given number and with given set number
+ */
+static int
+del_entry(struct ip_fw_chain *chain, u_int32_t arg)
+{
+       struct ip_fw *rule;
+       uint32_t rulenum;       /* rule or old_set */
+       uint8_t cmd, new_set;
+       int start, end = 0, i, ofs, n;
+       struct ip_fw **map = NULL;
+       int error = 0;
+
+       /* decode the argument: low 16 bits rule/set, then set, then cmd */
+       rulenum = arg & 0xffff;
+       cmd = (arg >> 24) & 0xff;
+       new_set = (arg >> 16) & 0xff;
+
+       if (cmd > 5 || new_set > RESVD_SET)
+               return EINVAL;
+       if (cmd == 0 || cmd == 2 || cmd == 5) {
+               if (rulenum >= IPFW_DEFAULT_RULE)
+                       return EINVAL;
+       } else {
+               if (rulenum > RESVD_SET)        /* old_set */
+                       return EINVAL;
+       }
+
+       IPFW_UH_WLOCK(chain); /* prevent conflicts among the writers */
+       chain->reap = NULL;     /* prepare for deletions */
+
+       switch (cmd) {
+       case 0: /* delete rules with given number (0 is special means all) */
+       case 1: /* delete all rules with given set number, rule->set == rulenum */
+       case 5: /* delete rules with given number and with given set number.
+                * rulenum - given rule number;
+                * new_set - given set number.
+                */
+               /* locate first rule to delete (start), the one after the
+                * last one (end), and count how many rules to delete (n)
+                */
+               n = 0;
+               if (cmd == 1) { /* look for a specific set, must scan all */
+                       /* XXX fixed: this loop used chain->map[start], an
+                        * out-of-bounds access with start == -1 which also
+                        * never matched the set; index with i instead.
+                        */
+                       for (start = -1, i = 0; i < chain->n_rules; i++) {
+                               if (chain->map[i]->set != rulenum)
+                                       continue;
+                               if (start < 0)
+                                       start = i;
+                               end = i;
+                               n++;
+                       }
+                       end++;  /* first non-matching */
+               } else {
+                       start = ipfw_find_rule(chain, rulenum, 0);
+                       for (end = start; end < chain->n_rules; end++) {
+                               rule = chain->map[end];
+                               if (rulenum > 0 && rule->rulenum != rulenum)
+                                       break;
+                               if (rule->set != RESVD_SET &&
+                                   (cmd == 0 || rule->set == new_set) )
+                                       n++;
+                       }
+               }
+               if (n == 0 && arg == 0)
+                       break; /* special case, flush on empty ruleset */
+               /* allocate the map, if needed */
+               if (n > 0)
+                       map = get_map(chain, -n, 1 /* locked */);
+               if (n == 0 || map == NULL) {
+                       error = EINVAL;
+                       break;
+               }
+               /* copy the initial part of the map */
+               if (start > 0)
+                       bcopy(chain->map, map, start * sizeof(struct ip_fw *));
+               /* copy active rules between start and end */
+               /* NOTE(review): for cmd 1 this keeps rules whose set differs
+                * from new_set although the counting loop above matched on
+                * rulenum; verify userland encodes the set in both fields.
+                */
+               for (i = ofs = start; i < end; i++) {
+                       rule = chain->map[i];
+                       if (!(rule->set != RESVD_SET &&
+                           (cmd == 0 || rule->set == new_set) ))
+                               map[ofs++] = chain->map[i];
+               }
+               /* finally the tail */
+               bcopy(chain->map + end, map + ofs,
+                       (chain->n_rules - end) * sizeof(struct ip_fw *));
+               map = swap_map(chain, map, chain->n_rules - n);
+               /* now remove the rules deleted: queue them on chain->reap
+                * so they are freed after the locks are dropped */
+               for (i = start; i < end; i++) {
+                       rule = map[i];
+                       if (rule->set != RESVD_SET &&
+                           (cmd == 0 || rule->set == new_set) ) {
+                               int l = RULESIZE(rule);
+
+                               chain->static_len -= l;
+                               ipfw_remove_dyn_children(rule);
+                               rule->x_next = chain->reap;
+                               chain->reap = rule;
+                       }
+               }
+               break;
+
+       /* XXX fixed: cases 2/3/4 re-acquired and released IPFW_UH_WLOCK
+        * although it is already held since the top of this function --
+        * a double acquire (deadlock on a non-recursive write lock) and
+        * a premature release.  The outer lock alone is sufficient.
+        */
+       case 2: /* move rules with given number to new set */
+               for (i = 0; i < chain->n_rules; i++) {
+                       rule = chain->map[i];
+                       if (rule->rulenum == rulenum)
+                               rule->set = new_set;
+               }
+               break;
+
+       case 3: /* move rules with given set number to new set */
+               for (i = 0; i < chain->n_rules; i++) {
+                       rule = chain->map[i];
+                       if (rule->set == rulenum)
+                               rule->set = new_set;
+               }
+               break;
+
+       case 4: /* swap two sets */
+               for (i = 0; i < chain->n_rules; i++) {
+                       rule = chain->map[i];
+                       if (rule->set == rulenum)
+                               rule->set = new_set;
+                       else if (rule->set == new_set)
+                               rule->set = rulenum;
+               }
+               break;
+       }
+       /* reap the deleted rules (if any) outside of the locks */
+       rule = chain->reap;
+       chain->reap = NULL;
+       IPFW_UH_WUNLOCK(chain);
+       ipfw_reap_rules(rule);
+       if (map)
+               free(map, M_IPFW);
+       return error;
+}
+
+/*
+ * Clear counters for a specific rule.
+ * Normally run under IPFW_UH_RLOCK, but these are idempotent ops
+ * so we only care that rules do not disappear.
+ */
+static void
+clear_counters(struct ip_fw *rule, int log_only)
+{
+       /* the action part may or may not start with an O_LOG insn */
+       ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
+
+       if (log_only == 0) {
+               rule->bcnt = rule->pcnt = 0;
+               rule->timestamp = 0;
+       }
+       if (l->o.opcode == O_LOG)
+               l->log_left = l->max_log;       /* refill the logging budget */
+}
+
+/**
+ * Reset some or all counters on firewall rules.
+ * The argument `arg' is a u_int32_t. The low 16 bits are the rule number,
+ * the next 8 bits are the set number, the top 8 bits are the command:
+ *     0       work with rules from all sets;
+ *     1       work with rules only from specified set.
+ * Specified rule number is zero if we want to clear all entries.
+ * log_only is 1 if we only want to reset logs, zero otherwise.
+ */
+static int
+zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
+{
+       struct ip_fw *rule;
+       char *msg;
+       int i;
+
+       uint16_t rulenum = arg & 0xffff;
+       uint8_t set = (arg >> 16) & 0xff;
+       uint8_t cmd = (arg >> 24) & 0xff;
+
+       if (cmd > 1)
+               return (EINVAL);
+       if (cmd == 1 && set > RESVD_SET)
+               return (EINVAL);
+
+       /* counters are only reset, never reallocated, so the UH read
+        * lock is enough to keep the rules from disappearing */
+       IPFW_UH_RLOCK(chain);
+       if (rulenum == 0) {
+               /* clear everything, optionally restricted to one set */
+               V_norule_counter = 0;
+               for (i = 0; i < chain->n_rules; i++) {
+                       rule = chain->map[i];
+                       /* Skip rules not in our set. */
+                       if (cmd == 1 && rule->set != set)
+                               continue;
+                       clear_counters(rule, log_only);
+               }
+               msg = log_only ? "All logging counts reset" :
+                   "Accounting cleared";
+       } else {
+               int cleared = 0;
+
+               /* the map is sorted by rulenum, so we can stop early */
+               for (i = 0; i < chain->n_rules; i++) {
+                       rule = chain->map[i];
+                       if (rule->rulenum == rulenum) {
+                               if (cmd == 0 || rule->set == set)
+                                       clear_counters(rule, log_only);
+                               cleared = 1;
+                       }
+                       if (rule->rulenum > rulenum)
+                               break;
+               }
+               if (!cleared) { /* we did not find any matching rules */
+                       /* XXX fixed: this path called IPFW_WUNLOCK(),
+                        * releasing a lock we do not hold and leaking
+                        * the UH read lock taken above.
+                        */
+                       IPFW_UH_RUNLOCK(chain);
+                       return (EINVAL);
+               }
+               msg = log_only ? "logging count reset" : "cleared";
+       }
+       IPFW_UH_RUNLOCK(chain);
+
+       if (V_fw_verbose) {
+               int lev = LOG_SECURITY | LOG_NOTICE;
+
+               if (rulenum)
+                       log(lev, "ipfw: Entry %d %s.\n", rulenum, msg);
+               else
+                       log(lev, "ipfw: %s.\n", msg);
+       }
+       return (0);
+}
+
+/*
+ * Check validity of the structure before insert.
+ * Rules are simple, so this mostly need to check rule sizes.
+ */
+static int
+check_ipfw_struct(struct ip_fw *rule, int size)
+{
+       int l, cmdlen = 0;
+       int have_action=0;      /* exactly one action opcode is required */
+       ipfw_insn *cmd;
+
+       /* basic sanity: the buffer must hold at least a minimal rule */
+       if (size < sizeof(*rule)) {
+               printf("ipfw: rule too short\n");
+               return (EINVAL);
+       }
+       /* first, check for valid size */
+       l = RULESIZE(rule);
+       if (l != size) {
+               printf("ipfw: size mismatch (have %d want %d)\n", size, l);
+               return (EINVAL);
+       }
+       if (rule->act_ofs >= rule->cmd_len) {
+               printf("ipfw: bogus action offset (%u > %u)\n",
+                   rule->act_ofs, rule->cmd_len - 1);
+               return (EINVAL);
+       }
+       /*
+        * Now go for the individual checks. Very simple ones, basically only
+        * instruction sizes.
+        */
+       for (l = rule->cmd_len, cmd = rule->cmd ;
+                       l > 0 ; l -= cmdlen, cmd += cmdlen) {
+               cmdlen = F_LEN(cmd);
+               if (cmdlen > l) {
+                       printf("ipfw: opcode %d size truncated\n",
+                           cmd->opcode);
+                       return EINVAL;
+               }
+               switch (cmd->opcode) {
+               /* single-word match opcodes */
+               case O_PROBE_STATE:
+               case O_KEEP_STATE:
+               case O_PROTO:
+               case O_IP_SRC_ME:
+               case O_IP_DST_ME:
+               case O_LAYER2:
+               case O_IN:
+               case O_FRAG:
+               case O_DIVERTED:
+               case O_IPOPT:
+               case O_IPTOS:
+               case O_IPPRECEDENCE:
+               case O_IPVER:
+               case O_TCPWIN:
+               case O_TCPFLAGS:
+               case O_TCPOPTS:
+               case O_ESTAB:
+               case O_VERREVPATH:
+               case O_VERSRCREACH:
+               case O_ANTISPOOF:
+               case O_IPSEC:
+#ifdef INET6
+               case O_IP6_SRC_ME:
+               case O_IP6_DST_ME:
+               case O_EXT_HDR:
+               case O_IP6:
+#endif
+               case O_IP4:
+               case O_TAG:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn))
+                               goto bad_size;
+                       break;
+
+               case O_FIB:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn))
+                               goto bad_size;
+                       if (cmd->arg1 >= rt_numfibs) {
+                               printf("ipfw: invalid fib number %d\n",
+                                       cmd->arg1);
+                               return EINVAL;
+                       }
+                       break;
+
+               case O_SETFIB:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn))
+                               goto bad_size;
+                       if (cmd->arg1 >= rt_numfibs) {
+                               printf("ipfw: invalid fib number %d\n",
+                                       cmd->arg1);
+                               return EINVAL;
+                       }
+                       goto check_action;
+
+               case O_UID:
+               case O_GID:
+               case O_JAIL:
+               case O_IP_SRC:
+               case O_IP_DST:
+               case O_TCPSEQ:
+               case O_TCPACK:
+               case O_PROB:
+               case O_ICMPTYPE:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
+                               goto bad_size;
+                       break;
+
+               case O_LIMIT:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
+                               goto bad_size;
+                       break;
+
+               case O_LOG:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
+                               goto bad_size;
+
+                       /* initialize the per-rule log budget */
+                       ((ipfw_insn_log *)cmd)->log_left =
+                           ((ipfw_insn_log *)cmd)->max_log;
+
+                       break;
+
+               case O_IP_SRC_MASK:
+               case O_IP_DST_MASK:
+                       /* only odd command lengths */
+                       if ( !(cmdlen & 1) || cmdlen > 31)
+                               goto bad_size;
+                       break;
+
+               case O_IP_SRC_SET:
+               case O_IP_DST_SET:
+                       if (cmd->arg1 == 0 || cmd->arg1 > 256) {
+                               printf("ipfw: invalid set size %d\n",
+                                       cmd->arg1);
+                               return EINVAL;
+                       }
+                       /* length is header plus one bit per address */
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
+                           (cmd->arg1+31)/32 )
+                               goto bad_size;
+                       break;
+
+               case O_IP_SRC_LOOKUP:
+               case O_IP_DST_LOOKUP:
+                       if (cmd->arg1 >= IPFW_TABLES_MAX) {
+                               printf("ipfw: invalid table number %d\n",
+                                   cmd->arg1);
+                               return (EINVAL);
+                       }
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn) &&
+                           cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 &&
+                           cmdlen != F_INSN_SIZE(ipfw_insn_u32))
+                               goto bad_size;
+                       break;
+
+               case O_MACADDR2:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
+                               goto bad_size;
+                       break;
+
+               /* variable-length list opcodes */
+               case O_NOP:
+               case O_IPID:
+               case O_IPTTL:
+               case O_IPLEN:
+               case O_TCPDATALEN:
+               case O_TAGGED:
+                       if (cmdlen < 1 || cmdlen > 31)
+                               goto bad_size;
+                       break;
+
+               case O_MAC_TYPE:
+               case O_IP_SRCPORT:
+               case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
+                       if (cmdlen < 2 || cmdlen > 31)
+                               goto bad_size;
+                       break;
+
+               case O_RECV:
+               case O_XMIT:
+               case O_VIA:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
+                               goto bad_size;
+                       break;
+
+               case O_ALTQ:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_altq))
+                               goto bad_size;
+                       break;
+
+               case O_PIPE:
+               case O_QUEUE:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn))
+                               goto bad_size;
+                       goto check_action;
+
+               case O_FORWARD_IP:
+#ifdef IPFIREWALL_FORWARD
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_sa))
+                               goto bad_size;
+                       goto check_action;
+#else
+                       return EINVAL;
+#endif
+
+               /* actions depending on optional kernel components */
+               case O_DIVERT:
+               case O_TEE:
+                       if (ip_divert_ptr == NULL)
+                               return EINVAL;
+                       else
+                               goto check_size;
+               case O_NETGRAPH:
+               case O_NGTEE:
+                       if (!NG_IPFW_LOADED)
+                               return EINVAL;
+                       else
+                               goto check_size;
+               case O_NAT:
+                       if (!IPFW_NAT_LOADED)
+                               return EINVAL;
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_nat))
+                               goto bad_size;
+                       goto check_action;
+               case O_FORWARD_MAC: /* XXX not implemented yet */
+               case O_CHECK_STATE:
+               case O_COUNT:
+               case O_ACCEPT:
+               case O_DENY:
+               case O_REJECT:
+#ifdef INET6
+               case O_UNREACH6:
+#endif
+               case O_SKIPTO:
+               case O_REASS:
+/* common size check for the simple (single-word) actions above */
+check_size:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn))
+                               goto bad_size;
+/* common checks for all actions: only one, and it must come last */
+check_action:
+                       if (have_action) {
+                               printf("ipfw: opcode %d, multiple actions"
+                                       " not allowed\n",
+                                       cmd->opcode);
+                               return EINVAL;
+                       }
+                       have_action = 1;
+                       if (l != cmdlen) {
+                               printf("ipfw: opcode %d, action must be"
+                                       " last opcode\n",
+                                       cmd->opcode);
+                               return EINVAL;
+                       }
+                       break;
+#ifdef INET6
+               case O_IP6_SRC:
+               case O_IP6_DST:
+                       if (cmdlen != F_INSN_SIZE(struct in6_addr) +
+                           F_INSN_SIZE(ipfw_insn))
+                               goto bad_size;
+                       break;
+
+               case O_FLOW6ID:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
+                           ((ipfw_insn_u32 *)cmd)->o.arg1)
+                               goto bad_size;
+                       break;
+
+               case O_IP6_SRC_MASK:
+               case O_IP6_DST_MASK:
+                       if ( !(cmdlen & 1) || cmdlen > 127)
+                               goto bad_size;
+                       break;
+               case O_ICMP6TYPE:
+                       if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) )
+                               goto bad_size;
+                       break;
+#endif
+
+               default:
+                       /* IPv6 opcodes in a non-INET6 kernel get a
+                        * specific diagnostic; anything else is unknown */
+                       switch (cmd->opcode) {
+#ifndef INET6
+                       case O_IP6_SRC_ME:
+                       case O_IP6_DST_ME:
+                       case O_EXT_HDR:
+                       case O_IP6:
+                       case O_UNREACH6:
+                       case O_IP6_SRC:
+                       case O_IP6_DST:
+                       case O_FLOW6ID:
+                       case O_IP6_SRC_MASK:
+                       case O_IP6_DST_MASK:
+                       case O_ICMP6TYPE:
+                               printf("ipfw: no IPv6 support in kernel\n");
+                               return EPROTONOSUPPORT;
+#endif
+                       default:
+                               printf("ipfw: opcode %d, unknown opcode\n",
+                                       cmd->opcode);
+                               return EINVAL;
+                       }
+               }
+       }
+       if (have_action == 0) {
+               printf("ipfw: missing action\n");
+               return EINVAL;
+       }
+       return 0;
+
+bad_size:
+       printf("ipfw: opcode %d size %d wrong\n",
+               cmd->opcode, cmdlen);
+       return EINVAL;
+}
+
+/*
+ * Copy the static and dynamic rules to the supplied buffer
+ * and return the amount of space actually used.
+ * Must be run under IPFW_UH_RLOCK
+ */
+static size_t
+ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
+{
+       char *bp = buf;
+       char *ep = bp + space;
+       struct ip_fw *rule, *dst;
+       int l, i;
+       time_t  boot_seconds;
+
+        boot_seconds = boottime.tv_sec;
+       /* copy the static rules first, in map order */
+       for (i = 0; i < chain->n_rules; i++) {
+               rule = chain->map[i];
+               l = RULESIZE(rule);
+               if (bp + l > ep) { /* should not happen */
+                       printf("overflow dumping static rules\n");
+                       break;
+               }
+               dst = (struct ip_fw *)bp;
+               bcopy(rule, dst, l);
+                       /*
+                        * XXX HACK. Store the disable mask in the "next"
+                        * pointer in a wild attempt to keep the ABI the same.
+                        * Why do we do this on EVERY rule?
+                        */
+               bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable));
+               if (dst->timestamp)     /* export as wall-clock time */
+                       dst->timestamp += boot_seconds;
+               bp += l;
+       }
+       ipfw_get_dynamic(&bp, ep); /* protected by the dynamic lock */
+       return (bp - (char *)buf);      /* bytes actually written */
+}
+
+
+/**
+ * {set|get}sockopt parser.
+ */
+int
+ipfw_ctl(struct sockopt *sopt)
+{
+#define        RULE_MAXSIZE    (256*sizeof(u_int32_t))
+       int error;
+       size_t size;
+       struct ip_fw *buf, *rule;
+       struct ip_fw_chain *chain;
+       u_int32_t rulenum[2];
+
+       error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW);
+       if (error)
+               return (error);
+
+       /*
+        * Disallow modifications in really-really secure mode, but still allow
+        * the logging counters to be reset.
+        */
+       if (sopt->sopt_name == IP_FW_ADD ||
+           (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) {
+               error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
+               if (error)
+                       return (error);
+       }
+
+       chain = &V_layer3_chain;
+       error = 0;
+
+       switch (sopt->sopt_name) {
+       case IP_FW_GET:
+               /*
+                * pass up a copy of the current rules. Static rules
+                * come first (the last of which has number IPFW_DEFAULT_RULE),
+                * followed by a possibly empty list of dynamic rule.
+                * The last dynamic rule has NULL in the "next" field.
+                *
+                * Note that the calculated size is used to bound the
+                * amount of data returned to the user.  The rule set may
+                * change between calculating the size and returning the
+                * data in which case we'll just return what fits.
+                */
+               for (;;) {
+                       int len = 0, want;
+
+                       size = chain->static_len;
+                       size += ipfw_dyn_len();
+               if (size >= sopt->sopt_valsize)
+                       break;
+               buf = malloc(size, M_TEMP, M_WAITOK);
+                       if (buf == NULL)
+                               break;
+                       IPFW_UH_RLOCK(chain);
+                       /* check again how much space we need */
+                       want = chain->static_len + ipfw_dyn_len();
+                       if (size >= want)
+                               len = ipfw_getrules(chain, buf, size);
+                       IPFW_UH_RUNLOCK(chain);
+                       if (size >= want)
+                               error = sooptcopyout(sopt, buf, len);
+                       free(buf, M_TEMP);
+                       if (size >= want)
+                               break;
+               }
+               break;
+
+       case IP_FW_FLUSH:
+               /* locking is done within del_entry() */
+               error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */
+               break;
+
+       case IP_FW_ADD:
+               rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK);
+               error = sooptcopyin(sopt, rule, RULE_MAXSIZE,
+                       sizeof(struct ip_fw) );
+               if (error == 0)
+                       error = check_ipfw_struct(rule, sopt->sopt_valsize);
+               if (error == 0) {
+                       /* locking is done within ipfw_add_rule() */
+                       error = ipfw_add_rule(chain, rule);
+                       size = RULESIZE(rule);
+                       if (!error && sopt->sopt_dir == SOPT_GET)
+                               error = sooptcopyout(sopt, rule, size);
+               }
+               free(rule, M_TEMP);
+               break;
+
+       case IP_FW_DEL:
+               /*
+                * IP_FW_DEL is used for deleting single rules or sets,
+                * and (ab)used to atomically manipulate sets. Argument size
+                * is used to distinguish between the two:
+                *    sizeof(u_int32_t)
+                *      delete single rule or set of rules,
+                *      or reassign rules (or sets) to a different set.
+                *    2*sizeof(u_int32_t)
+                *      atomic disable/enable sets.
+                *      first u_int32_t contains sets to be disabled,
+                *      second u_int32_t contains sets to be enabled.
+                */
+               error = sooptcopyin(sopt, rulenum,
+                       2*sizeof(u_int32_t), sizeof(u_int32_t));
+               if (error)
+                       break;
+               size = sopt->sopt_valsize;
+               if (size == sizeof(u_int32_t) && rulenum[0] != 0) {
+                       /* delete or reassign, locking done in del_entry() */
+                       error = del_entry(chain, rulenum[0]);
+               } else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */
+                       IPFW_UH_WLOCK(chain);
+                       V_set_disable =
+                           (V_set_disable | rulenum[0]) & ~rulenum[1] &
+                           ~(1<<RESVD_SET); /* set RESVD_SET always enabled */
+                       IPFW_UH_WUNLOCK(chain);
+               } else
+                       error = EINVAL;
+               break;
+
+       case IP_FW_ZERO:
+       case IP_FW_RESETLOG: /* argument is a u_int32_t, the rule number */
+               rulenum[0] = 0;
+               if (sopt->sopt_val != 0) {
+                   error = sooptcopyin(sopt, rulenum,
+                           sizeof(u_int32_t), sizeof(u_int32_t));
+                   if (error)
+                       break;
+               }
+               error = zero_entry(chain, rulenum[0],
+                       sopt->sopt_name == IP_FW_RESETLOG);
+               break;
+
+       /*--- TABLE manipulations are protected by the IPFW_LOCK ---*/
+       case IP_FW_TABLE_ADD:
+               {
+                       ipfw_table_entry ent;
+
+                       error = sooptcopyin(sopt, &ent,
+                           sizeof(ent), sizeof(ent));
+                       if (error)
+                               break;
+                       error = ipfw_add_table_entry(chain, ent.tbl,
+                           ent.addr, ent.masklen, ent.value);
+               }
+               break;
+
+       case IP_FW_TABLE_DEL:
+               {
+                       ipfw_table_entry ent;
+
+                       error = sooptcopyin(sopt, &ent,
+                           sizeof(ent), sizeof(ent));
+                       if (error)
+                               break;
+                       error = ipfw_del_table_entry(chain, ent.tbl,
+                           ent.addr, ent.masklen);
+               }
+               break;
+
+       case IP_FW_TABLE_FLUSH:
+               {
+                       u_int16_t tbl;
+
+                       error = sooptcopyin(sopt, &tbl,
+                           sizeof(tbl), sizeof(tbl));
+                       if (error)
+                               break;
+                       IPFW_WLOCK(chain);
+                       error = ipfw_flush_table(chain, tbl);
+                       IPFW_WUNLOCK(chain);
+               }
+               break;
+
+       case IP_FW_TABLE_GETSIZE:
+               {
+                       u_int32_t tbl, cnt;
+
+                       if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl),
+                           sizeof(tbl))))
+                               break;
+                       IPFW_RLOCK(chain);
+                       error = ipfw_count_table(chain, tbl, &cnt);
+                       IPFW_RUNLOCK(chain);
+                       if (error)
+                               break;
+                       error = sooptcopyout(sopt, &cnt, sizeof(cnt));
+               }
+               break;
+
+       case IP_FW_TABLE_LIST:
+               {
+                       ipfw_table *tbl;
+
+                       if (sopt->sopt_valsize < sizeof(*tbl)) {
+                               error = EINVAL;
+                               break;
+                       }
+                       size = sopt->sopt_valsize;
+                       tbl = malloc(size, M_TEMP, M_WAITOK);
+                       error = sooptcopyin(sopt, tbl, size, sizeof(*tbl));
+                       if (error) {
+                               free(tbl, M_TEMP);
+                               break;
+                       }
+                       tbl->size = (size - sizeof(*tbl)) /
+                           sizeof(ipfw_table_entry);
+                       IPFW_RLOCK(chain);
+                       error = ipfw_dump_table(chain, tbl);
+                       IPFW_RUNLOCK(chain);
+                       if (error) {
+                               free(tbl, M_TEMP);
+                               break;
+                       }
+                       error = sooptcopyout(sopt, tbl, size);
+                       free(tbl, M_TEMP);
+               }
+               break;
+
+       /*--- NAT operations are protected by the IPFW_LOCK ---*/
+       case IP_FW_NAT_CFG:
+               if (IPFW_NAT_LOADED)
+                       error = ipfw_nat_cfg_ptr(sopt);
+               else {
+                       printf("IP_FW_NAT_CFG: %s\n",
+                           "ipfw_nat not present, please load it");
+                       error = EINVAL;
+               }
+               break;
+
+       case IP_FW_NAT_DEL:
+               if (IPFW_NAT_LOADED)
+                       error = ipfw_nat_del_ptr(sopt);
+               else {
+                       printf("IP_FW_NAT_DEL: %s\n",
+                           "ipfw_nat not present, please load it");
+                       error = EINVAL;
+               }
+               break;
+
+       case IP_FW_NAT_GET_CONFIG:
+               if (IPFW_NAT_LOADED)
+                       error = ipfw_nat_get_cfg_ptr(sopt);
+               else {
+                       printf("IP_FW_NAT_GET_CFG: %s\n",
+                           "ipfw_nat not present, please load it");
+                       error = EINVAL;
+               }
+               break;
+
+       case IP_FW_NAT_GET_LOG:
+               if (IPFW_NAT_LOADED)
+                       error = ipfw_nat_get_log_ptr(sopt);
+               else {
+                       printf("IP_FW_NAT_GET_LOG: %s\n",
+                           "ipfw_nat not present, please load it");
+                       error = EINVAL;
+               }
+               break;
+
+       default:
+               printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name);
+               error = EINVAL;
+       }
+
+       return (error);
+#undef RULE_MAXSIZE
+}
+/* end of file */
diff --git a/dummynet2/ip_fw_table.c b/dummynet2/ip_fw_table.c
new file mode 100644 (file)
index 0000000..8cbf457
--- /dev/null
@@ -0,0 +1,280 @@
+/*-
+ * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_table.c 200601 2009-12-16 10:48:40Z luigi $");
+
+/*
+ * Lookup table support for ipfw
+ *
+ * Lookup tables are implemented (at the moment) using the radix
+ * tree used for routing tables. Tables store key-value entries, where
+ * keys are network prefixes (addr/masklen), and values are integers.
+ * As a degenerate case we can interpret keys as 32-bit integers
+ * (with a /32 mask).
+ *
+ * The table is protected by the IPFW lock even for manipulation coming
+ * from userland, because operations are typically fast.
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <net/if.h>    /* ip_fw.h requires IFNAMSIZ */
+#include <net/radix.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables");
+
+struct table_entry {
+       struct radix_node       rn[2];
+       struct sockaddr_in      addr, mask;
+       u_int32_t               value;
+};
+
+/*
+ * The radix code expects addr and mask to be array of bytes,
+ * with the first byte being the length of the array. rn_inithead
+ * is called with the offset in bits of the lookup key within the
+ * array. If we use a sockaddr_in as the underlying type,
+ * sin_len is conveniently located at offset 0, sin_addr is at
+ * offset 4 and normally aligned.
+ * But for portability, let's avoid assumption and make the code explicit
+ */
+#define KEY_LEN(v)     *((uint8_t *)&(v))
+#define KEY_OFS                (8*offsetof(struct sockaddr_in, sin_addr))
+
+int
+ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen, uint32_t value)
+{
+       struct radix_node_head *rnh;
+       struct table_entry *ent;
+       struct radix_node *rn;
+
+       if (tbl >= IPFW_TABLES_MAX)
+               return (EINVAL);
+       rnh = ch->tables[tbl];
+       ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO);
+       if (ent == NULL)
+               return (ENOMEM);
+       ent->value = value;
+       KEY_LEN(ent->addr) = KEY_LEN(ent->mask) = 8;
+       ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
+       ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr;
+       IPFW_WLOCK(ch);
+       rn = rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent);
+       if (rn == NULL) {
+               IPFW_WUNLOCK(ch);
+               free(ent, M_IPFW_TBL);
+               return (EEXIST);
+       }
+       IPFW_WUNLOCK(ch);
+       return (0);
+}
+
+int
+ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen)
+{
+       struct radix_node_head *rnh;
+       struct table_entry *ent;
+       struct sockaddr_in sa, mask;
+
+       if (tbl >= IPFW_TABLES_MAX)
+               return (EINVAL);
+       rnh = ch->tables[tbl];
+       KEY_LEN(sa) = KEY_LEN(mask) = 8;
+       mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
+       sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
+       IPFW_WLOCK(ch);
+       ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh);
+       if (ent == NULL) {
+               IPFW_WUNLOCK(ch);
+               return (ESRCH);
+       }
+       IPFW_WUNLOCK(ch);
+       free(ent, M_IPFW_TBL);
+       return (0);
+}
+
+static int
+flush_table_entry(struct radix_node *rn, void *arg)
+{
+       struct radix_node_head * const rnh = arg;
+       struct table_entry *ent;
+
+       ent = (struct table_entry *)
+           rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
+       if (ent != NULL)
+               free(ent, M_IPFW_TBL);
+       return (0);
+}
+
+int
+ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl)
+{
+       struct radix_node_head *rnh;
+
+       IPFW_WLOCK_ASSERT(ch);
+
+       if (tbl >= IPFW_TABLES_MAX)
+               return (EINVAL);
+       rnh = ch->tables[tbl];
+       KASSERT(rnh != NULL, ("NULL IPFW table"));
+       rnh->rnh_walktree(rnh, flush_table_entry, rnh);
+       return (0);
+}
+
+void
+ipfw_flush_tables(struct ip_fw_chain *ch)
+{
+       uint16_t tbl;
+
+       IPFW_WLOCK_ASSERT(ch);
+
+       for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++)
+               ipfw_flush_table(ch, tbl);
+}
+
+int
+ipfw_init_tables(struct ip_fw_chain *ch)
+{ 
+       int i;
+       uint16_t j;
+
+       for (i = 0; i < IPFW_TABLES_MAX; i++) {
+               if (!rn_inithead((void **)&ch->tables[i], KEY_OFS)) {
+                       for (j = 0; j < i; j++) {
+                               (void) ipfw_flush_table(ch, j);
+                       }
+                       return (ENOMEM);
+               }
+       }
+       return (0);
+}
+
+int
+ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint32_t *val)
+{
+       struct radix_node_head *rnh;
+       struct table_entry *ent;
+       struct sockaddr_in sa;
+
+       if (tbl >= IPFW_TABLES_MAX)
+               return (0);
+       rnh = ch->tables[tbl];
+       KEY_LEN(sa) = 8;
+       sa.sin_addr.s_addr = addr;
+       ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh));
+       if (ent != NULL) {
+               *val = ent->value;
+               return (1);
+       }
+       return (0);
+}
+
+static int
+count_table_entry(struct radix_node *rn, void *arg)
+{
+       u_int32_t * const cnt = arg;
+
+       (*cnt)++;
+       return (0);
+}
+
+int
+ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt)
+{
+       struct radix_node_head *rnh;
+
+       if (tbl >= IPFW_TABLES_MAX)
+               return (EINVAL);
+       rnh = ch->tables[tbl];
+       *cnt = 0;
+       rnh->rnh_walktree(rnh, count_table_entry, cnt);
+       return (0);
+}
+
+static int
+dump_table_entry(struct radix_node *rn, void *arg)
+{
+       struct table_entry * const n = (struct table_entry *)rn;
+       ipfw_table * const tbl = arg;
+       ipfw_table_entry *ent;
+
+       if (tbl->cnt == tbl->size)
+               return (1);
+       ent = &tbl->ent[tbl->cnt];
+       ent->tbl = tbl->tbl;
+       if (in_nullhost(n->mask.sin_addr))
+               ent->masklen = 0;
+       else
+               ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr));
+       ent->addr = n->addr.sin_addr.s_addr;
+       ent->value = n->value;
+       tbl->cnt++;
+       return (0);
+}
+
+int
+ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl)
+{
+       struct radix_node_head *rnh;
+
+       if (tbl->tbl >= IPFW_TABLES_MAX)
+               return (EINVAL);
+       rnh = ch->tables[tbl->tbl];
+       tbl->cnt = 0;
+       rnh->rnh_walktree(rnh, dump_table_entry, tbl);
+       return (0);
+}
+/* end of file */
diff --git a/dummynet2/ipfw2_mod.c b/dummynet2/ipfw2_mod.c
new file mode 100644 (file)
index 0000000..f59a37c
--- /dev/null
@@ -0,0 +1,768 @@
+/*
+ * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id: ipfw2_mod.c 4671 2010-01-04 17:50:51Z luigi $
+ *
+ * The main interface to build ipfw+dummynet as a linux module.
+ * (and possibly as a windows module as well, though that part
+ * is not complete yet).
+ *
+ * The control interface uses the sockopt mechanism
+ * on a socket(AF_INET, SOCK_RAW, IPPROTO_RAW).
+ *
+ * The data interface uses the netfilter interface, at the moment
+ * hooked to the PRE_ROUTING and POST_ROUTING hooks.
+ * Unfortunately the netfilter interface is a moving target,
+ * so we need a set of macros to adapt to the various cases.
+ *
+ * In the netfilter hook we just mark packet as 'QUEUE' and then
+ * let the queue handler to do the whole work (filtering and
+ * possibly emulation).
+ * As we receive packets, we wrap them with an mbuf descriptor
+ * so the existing ipfw+dummynet code runs unmodified.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/mbuf.h>                  /* sizeof struct mbuf */
+#include <sys/param.h>                 /* NGROUPS */
+
+#ifdef __linux__
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>      /* NF_IP_PRI_FILTER */
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25)
+#include <net/netfilter/nf_queue.h>    /* nf_queue */
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
+#define __read_mostly
+#endif
+
+#endif /* !__linux__ */
+
+#include <netinet/in.h>                        /* in_addr */
+#include <netinet/ip_fw.h>             /* ip_fw_ctl_t, ip_fw_chk_t */
+#include <netinet/ipfw/ip_fw_private.h>                /* ip_fw_ctl_t, ip_fw_chk_t */
+#include <netinet/ip_dummynet.h>       /* ip_dn_ctl_t, ip_dn_io_t */
+#include <net/pfil.h>                  /* PFIL_IN, PFIL_OUT */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+#warning --- inet_hashtables not present on 2.4
+#include <linux/tcp.h>
+#include <net/route.h>
+#include <net/sock.h>
+static inline int inet_iif(const struct sk_buff *skb)
+{
+        return ((struct rtable *)skb->dst)->rt_iif;
+}
+
+#else
+#include <net/inet_hashtables.h>       /* inet_lookup */
+#endif
+#include <net/route.h>                 /* inet_iif */
+
+/*
+ * Here we allocate some global variables used in the firewall.
+ */
+//ip_dn_ctl_t    *ip_dn_ctl_ptr;
+int (*ip_dn_ctl_ptr)(struct sockopt *);
+
+ip_fw_ctl_t    *ip_fw_ctl_ptr;
+
+int    (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa);
+ip_fw_chk_t    *ip_fw_chk_ptr;
+
+void           (*bridge_dn_p)(struct mbuf *, struct ifnet *);
+
+/*---
+ * Glue code to implement the registration of children with the parent.
+ * Each child should call my_mod_register() when linking, so that
+ * module_init() and module_exit() can call init_children() and
+ * fini_children() to provide the necessary initialization.
+ * We use the same mechanism for MODULE_ and SYSINIT_.
+ * The former only get a pointer to the moduledata,
+ * the latter have two function pointers (init/uninit)
+ */
+#include <sys/module.h>
+struct mod_args {
+        const char *name;
+        int order;
+        struct moduledata *mod;
+       void (*init)(void), (*uninit)(void);
+};
+
+static unsigned int mod_idx;
+static struct mod_args mods[10];       /* hard limit to 10 modules */
+
+int
+my_mod_register(const char *name, int order,
+       struct moduledata *mod, void *init, void *uninit);
+/*
+ * my_mod_register should be called automatically as the init
+ * functions in the submodules. Unfortunately this compiler/linker
+ * trick is not supported yet so we call it manually.
+ */
+int
+my_mod_register(const char *name, int order,
+       struct moduledata *mod, void *init, void *uninit)
+{
+       struct mod_args m = { .name = name, .order = order,
+               .mod = mod, .init = init, .uninit = uninit };
+
+       printf("%s %s called\n", __FUNCTION__, name);
+       if (mod_idx < sizeof(mods) / sizeof(mods[0]))
+               mods[mod_idx++] = m;
+       return 0;
+}
+
+static void
+init_children(void)
+{
+       unsigned int i;
+
+        /* Call the functions registered at init time. */
+       printf("%s mod_idx value %d\n", __FUNCTION__, mod_idx);
+        for (i = 0; i < mod_idx; i++) {
+               struct mod_args *m = &mods[i];
+                printf("+++ start module %d %s %s at %p order 0x%x\n",
+                        i, m->name, m->mod ? m->mod->name : "SYSINIT",
+                        m->mod, m->order);
+               if (m->mod && m->mod->evhand)
+                       m->mod->evhand(NULL, MOD_LOAD, m->mod->priv);
+               else if (m->init)
+                       m->init();
+        }
+}
+
+static void
+fini_children(void)
+{
+       int i;
+
+        /* Call the functions registered at init time. */
+        for (i = mod_idx - 1; i >= 0; i--) {
+               struct mod_args *m = &mods[i];
+                printf("+++ end module %d %s %s at %p order 0x%x\n",
+                        i, m->name, m->mod ? m->mod->name : "SYSINIT",
+                        m->mod, m->order);
+               if (m->mod && m->mod->evhand)
+                       m->mod->evhand(NULL, MOD_UNLOAD, m->mod->priv);
+               else if (m->uninit)
+                       m->uninit();
+        }
+}
+/*--- end of module binding helper functions ---*/
+
+/*---
+ * Control hooks:
+ * ipfw_ctl_h() is a wrapper for linux to FreeBSD sockopt call convention.
+ * then call the ipfw handler in order to manage requests.
+ * In turn this is called by the linux set/get handlers.
+ */
+static int
+ipfw_ctl_h(struct sockopt *s, int cmd, int dir, int len, void __user *user)
+{
+       struct thread t;
+       int ret = EINVAL;
+
+       memset(s, 0, sizeof(*s));
+       s->sopt_name = cmd;
+       s->sopt_dir = dir;
+       s->sopt_valsize = len;
+       s->sopt_val = user;
+
+       /* sopt_td is not used but it is referenced */
+       memset(&t, 0, sizeof(t));
+       s->sopt_td = &t;
+       
+       // printf("%s called with cmd %d len %d\n", __FUNCTION__, cmd, len);
+
+       if (cmd < IP_DUMMYNET_CONFIGURE && ip_fw_ctl_ptr)
+               ret = ip_fw_ctl_ptr(s);
+       else if (cmd >= IP_DUMMYNET_CONFIGURE && ip_dn_ctl_ptr)
+               ret = ip_dn_ctl_ptr(s);
+
+       return -ret;    /* errors are < 0 on linux */
+}
+
+#ifdef _WIN32
+
+void
+netisr_dispatch(int __unused num, struct mbuf *m)
+{
+}
+
+int
+ip_output(struct mbuf *m, struct mbuf __unused *opt,
+       struct route __unused *ro, int __unused flags,
+    struct ip_moptions __unused *imo, struct inpcb __unused *inp)
+{
+       netisr_dispatch(0, m);
+       return 0;
+}
+
+#else /* this is the linux glue */
+/*
+ * setsockopt hook has no return value other than the error code.
+ */
+static int
+do_ipfw_set_ctl(struct sock __unused *sk, int cmd,
+       void __user *user, unsigned int len)
+{
+       struct sockopt s;       /* pass arguments */
+
+       return ipfw_ctl_h(&s, cmd, SOPT_SET, len, user);
+}
+
+/*
+ * getsockopt can can return a block of data in response.
+ */
+static int
+do_ipfw_get_ctl(struct sock __unused *sk,
+       int cmd, void __user *user, int *len)
+{
+       struct sockopt s;       /* pass arguments */
+       int ret = ipfw_ctl_h(&s, cmd, SOPT_GET, *len, user);
+
+       *len = s.sopt_valsize;  /* return length back to the caller */
+       return ret;
+}
+
+/*
+ * declare our [get|set]sockopt hooks
+ */
+static struct nf_sockopt_ops ipfw_sockopts = {
+       .pf             = PF_INET,
+       .set_optmin     = _IPFW_SOCKOPT_BASE,
+       .set_optmax     = _IPFW_SOCKOPT_END,
+       .set            = do_ipfw_set_ctl,
+       .get_optmin     = _IPFW_SOCKOPT_BASE,
+       .get_optmax     = _IPFW_SOCKOPT_END,
+       .get            = do_ipfw_get_ctl,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
+       .owner          = THIS_MODULE,
+#endif
+};
+
+/*----
+ * We need a number of macros to adapt to the various APIs in
+ * different linux versions. Among them:
+ *
+ * - the hook names change between macros (NF_IP*) and enum NF_INET_*
+ *
+ * - the second argument to the netfilter hook is
+ *     struct sk_buff **       in kernels <= 2.6.22
+ *     struct sk_buff *        in kernels > 2.6.22
+ *
+ * - NF_STOP is not defined before 2.6 so we remap it to NF_ACCEPT
+ *
+ * - the packet descriptor passed to the queue handler is
+ *     struct nf_info          in kernels <= 2.6.24
+ *     struct nf_queue_entry   in kernels >= 2.6.25
+ *
+ * - the arguments to the queue handler also change;
+ */
+
+/*
+ * declare hook to grab packets from the netfilter interface.
+ * The NF_* names change in different versions of linux, in some
+ * cases they are #defines, in others they are enum, so we
+ * need to adapt.
+ */
+#ifndef NF_IP_PRE_ROUTING
+#define NF_IP_PRE_ROUTING      NF_INET_PRE_ROUTING
+#endif
+#ifndef NF_IP_POST_ROUTING
+#define NF_IP_POST_ROUTING     NF_INET_POST_ROUTING
+#endif
+
+/*
+ * ipfw hooks into the POST_ROUTING and the PRE_ROUTING chains.
+ * PlanetLab sets skb_tag to the slice id in the LOCAL_INPUT and
+ * POST_ROUTING chains, so if we want to use that information we
+ * need to hook the LOCAL_INPUT chain instead of the PRE_ROUTING.
+ * However at the moment the skb_tag info is not reliable so
+ * we stay with the standard hooks.
+ */
+#if 0 // defined(IPFW_PLANETLAB)
+#define IPFW_HOOK_IN NF_IP_LOCAL_IN
+#else
+#define IPFW_HOOK_IN NF_IP_PRE_ROUTING
+#endif
+
+/*
+ * The main netfilter hook.
+ * To make life simple, we queue everything and then do all the
+ * decision in the queue handler.
+ *
+ * XXX note that in 2.4 and up to 2.6.22 the skbuf is passed as sk_buff**
+ * so we have an #ifdef to set the proper argument type.
+ */
+static unsigned int
+call_ipfw(unsigned int __unused hooknum,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) // in 2.6.22 we have **
+       struct sk_buff  __unused **skb,
+#else
+       struct sk_buff  __unused *skb,
+#endif
+       const struct net_device  __unused *in,
+       const struct net_device  __unused *out,
+       int __unused (*okfn)(struct sk_buff *))
+{
+       return NF_QUEUE;
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+#define        NF_STOP         NF_ACCEPT
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
+
+/*
+ * nf_queue_entry is a recent addition, in previous versions
+ * of the code the struct is called nf_info.
+ */
+#define nf_queue_entry nf_info /* for simplicity */
+
+/* also, 2.4 and perhaps something else have different arguments */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) /* unsure on the exact boundary */
+/* on 2.4 we use nf_info */
+#define QH_ARGS                struct sk_buff *skb, struct nf_info *info, void *data
+#else  /* 2.6.1.. 2.6.24 */
+#define QH_ARGS                struct sk_buff *skb, struct nf_info *info, unsigned int qnum, void *data
+#endif
+
+#define DEFINE_SKB     /* nothing, already an argument */
+#define        REINJECT(_inf, _verd)   nf_reinject(skb, _inf, _verd)
+
+#else  /* 2.6.25 and above */
+
+#define QH_ARGS                struct nf_queue_entry *info, unsigned int queuenum
+#define DEFINE_SKB     struct sk_buff *skb = info->skb;
+#define        REINJECT(_inf, _verd)   nf_reinject(_inf, _verd)
+#endif
+
+/*
+ * used by dummynet when dropping packets
+ * XXX use dummynet_send()
+ */
+void
+reinject_drop(struct mbuf* m)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)        /* unsure on the exact boundary */
+       struct sk_buff *skb = (struct sk_buff *)m;
+#endif
+       REINJECT(m->queue_entry, NF_DROP);
+}
+
+/*
+ * The real call to the firewall. nf_queue_entry points to the skbuf,
+ * and eventually we need to return both through nf_reinject().
+ */
+static int
+ipfw2_queue_handler(QH_ARGS)
+{
+       DEFINE_SKB      /* no semicolon here, goes in the macro */
+       int ret = 0;    /* return value */
+       struct mbuf *m;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+       if (skb->nh.iph == NULL) {
+               printf("null dp, len %d reinject now\n", skb->len);
+               REINJECT(info, NF_ACCEPT);
+               return 0;
+       }
+#endif
+       m = malloc(sizeof(*m), 0, 0);
+       if (m == NULL) {
+               printf("malloc fail, len %d reinject now\n", skb->len);
+               REINJECT(info, NF_ACCEPT);
+               return 0;
+       }
+
+       m->m_skb = skb;
+       m->m_len = skb->len;            /* len in this skbuf */
+       m->m_pkthdr.len = skb->len;     /* total packet len */
+       m->m_pkthdr.rcvif = info->indev;
+       m->queue_entry = info;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+       m->m_data = skb->nh.iph;
+#else
+       m->m_data = skb_network_header(skb);
+#endif
+
+       /* XXX add the interface */
+       if (info->hook == IPFW_HOOK_IN) {
+               ret = ipfw_check_hook(NULL, &m, info->indev, PFIL_IN, NULL);
+       } else {
+               ret = ipfw_check_hook(NULL, &m, info->outdev, PFIL_OUT, NULL);
+       }
+
+       if (m != NULL) {        /* Accept. reinject and free the mbuf */
+               REINJECT(info, NF_ACCEPT);
+               m_freem(m);
+       } else if (ret == 0) {
+               /* dummynet has kept the packet, will reinject later. */
+       } else {
+               /*
+                * Packet dropped by ipfw or dummynet. Nothing to do as
+                * FREE_PKT already did a reinject as NF_DROP
+                */
+       }
+       return 0;
+}
+
+struct route;
+struct ip_moptions;
+struct inpcb;
+
+
+/* XXX should include prototypes for netisr_dispatch and ip_output */
+/*
+ * The reinjection routine after a packet comes out from dummynet.
+ * We must update the skb timestamp so ping reports the right time.
+ */
+void
+netisr_dispatch(int num, struct mbuf *m)
+{
+       struct nf_queue_entry *info = m->queue_entry;
+       struct sk_buff *skb = m->m_skb; /* always used */
+
+       /* free the mbuf wrapper now; the skb is handed back to netfilter
+        * below (m_freem presumably releases only the wrapper — the skb
+        * must survive for REINJECT) */
+       m_freem(m);
+
+       KASSERT((info != NULL), ("%s info null!\n", __FUNCTION__));
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)       // XXX above 2.6.x ?
+       __net_timestamp(skb);   /* update timestamp */
+#endif
+
+       /* XXX to obey one-pass, possibly call the queue handler here */
+       /* num == -1 means the packet was dropped by dummynet; anything
+        * else reinjects with NF_STOP, i.e. accept without traversing
+        * the remaining netfilter hooks */
+       REINJECT(info, ((num == -1)?NF_DROP:NF_STOP));  /* accept but no more firewall */
+}
+
+/*
+ * Minimal stub of the BSD ip_output(): all arguments except the mbuf
+ * are ignored, and the packet is simply reinjected into the linux
+ * stack through netisr_dispatch() (num 0, i.e. accept).
+ * Always returns 0 (success).
+ */
+int
+ip_output(struct mbuf *m, struct mbuf __unused *opt,
+       struct route __unused *ro, int __unused flags,
+    struct ip_moptions __unused *imo, struct inpcb __unused *inp)
+{
+       netisr_dispatch(0, m);
+        return 0;
+}
+
+/*
+ * socket lookup function for linux.
+ * This code is used to associate uid, gid, jail/xid to packets,
+ * and store the info in a cache *ugp where they can be accessed quickly.
+ * The function returns 1 if the info is found, -1 otherwise.
+ *
+ * We do this only on selected protocols: TCP, ...
+ *
+ * The chain is the following
+ *   sk_buff*  sock*  socket*    file*
+ *     skb  ->  sk ->sk_socket->file ->f_owner    ->pid
+ *     skb  ->  sk ->sk_socket->file ->f_uid (direct)
+ *     skb  ->  sk ->sk_socket->file ->f_cred->fsuid (2.6.29+)
+ *
+ * Related headers:
+ * linux/skbuff.h      struct skbuff
+ * net/sock.h          struct sock
+ * linux/net.h         struct socket
+ * linux/fs.h          struct file
+ *
+ * With vserver we may have sk->sk_xid and sk->sk_nid, which we
+ * store in fw_groups[1] (matches O_JAIL) and fw_groups[2]
+ * (no matches yet)
+ *
+ * Note- for locally generated, outgoing packets we should not need
+ * a lookup because the sk_buff already points to the socket where
+ * the info is.
+ */
+extern struct inet_hashinfo tcp_hashinfo;
+/* see the block comment above for the sk_buff -> sock -> file chain.
+ * Returns 1 on a successful lookup, -1 otherwise (non-TCP, linux 2.4,
+ * or no matching socket); on success *u is filled with uid/gid/xid/nid. */
+int
+linux_lookup(const int proto, const __be32 saddr, const __be16 sport,
+               const __be32 daddr, const __be16 dport,
+               struct sk_buff *skb, int dir, struct bsd_ucred *u)
+{
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,0)
+       return -1;      /* socket lookup not supported on linux 2.4 */
+#else
+       struct sock *sk;
+       int ret = -1;   /* default return value */
+       int st = -1;    /* state */
+
+
+       if (proto != IPPROTO_TCP)       /* XXX extend for UDP */
+               return -1;
+
+       /* sanity: outgoing (dir != 0) packets must have a dst,
+        * incoming ones a receiving device */
+       if ((dir ? (void *)skb_dst(skb) : (void *)skb->dev) == NULL) {
+               panic(" -- this should not happen\n");
+               return -1;
+       }
+
+       if (skb->sk) {
+               sk = skb->sk;   /* socket already attached (local traffic) */
+       } else {
+               /*
+                * Try a lookup. On a match, sk has a refcount that we must
+                * release on exit (we know it because skb->sk = NULL).
+                *
+                * inet_lookup above 2.6.24 has an additional 'net' parameter
+                * so we use a macro to conditionally supply it.
+                * swap dst and src depending on the direction.
+                */
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,24)
+#define _OPT_NET_ARG
+#else
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
+/* there is no dev_net() on 2.6.25 */
+#define _OPT_NET_ARG (skb->dev->nd_net),
+#else  /* 2.6.26 and above */
+#define _OPT_NET_ARG dev_net(skb->dev),
+#endif
+#endif
+               sk =  (dir) ? /* dir != 0 on output */
+                   inet_lookup(_OPT_NET_ARG &tcp_hashinfo,
+                       daddr, dport, saddr, sport,     // match outgoing
+                       inet_iif(skb)) :
+                   inet_lookup(_OPT_NET_ARG &tcp_hashinfo,
+                       saddr, sport, daddr, dport,     // match incoming
+                       skb->dev->ifindex);
+#undef _OPT_NET_ARG
+
+               if (sk == NULL) /* no match, nothing to be done */
+                       return -1;
+       }
+       ret = 1;        /* retrying won't make things better */
+       st = sk->sk_state;
+#ifdef CONFIG_VSERVER
+       u->xid = sk->sk_xid;
+       u->nid = sk->sk_nid;
+#else
+       u->xid = u->nid = 0;
+#endif
+       /*
+        * Exclude tcp states where sk points to an inet_timewait_sock which
+        * has no sk_socket field (surely TCP_TIME_WAIT, perhaps more).
+        * To be safe, use a whitelist and not a blacklist.
+        * Before dereferencing sk_socket grab a lock on sk_callback_lock.
+        *
+        * Once again we need conditional code because the UID and GID
+        * location changes between kernels.
+        */
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,28)
+/* use the current's real uid/gid */
+#define _CURR_UID f_uid
+#define _CURR_GID f_gid
+#else /* 2.6.29 and above */
+/* use the current's file access real uid/gid */
+#define _CURR_UID f_cred->fsuid
+#define _CURR_GID f_cred->fsgid
+#endif
+
+#define GOOD_STATES (  \
+       (1<<TCP_LISTEN) | (1<<TCP_SYN_RECV)   | (1<<TCP_SYN_SENT)   | \
+       (1<<TCP_ESTABLISHED)  | (1<<TCP_FIN_WAIT1) | (1<<TCP_FIN_WAIT2) )
+       // surely exclude TCP_CLOSE, TCP_TIME_WAIT, TCP_LAST_ACK
+       // uncertain TCP_CLOSE_WAIT and TCP_CLOSING
+
+       if ((1<<st) & GOOD_STATES) {
+               read_lock_bh(&sk->sk_callback_lock);
+               if (sk->sk_socket && sk->sk_socket->file) {
+                       u->uid = sk->sk_socket->file->_CURR_UID;
+                       u->gid = sk->sk_socket->file->_CURR_GID;
+               }
+               read_unlock_bh(&sk->sk_callback_lock);
+       } else {
+               u->uid = u->gid = 0;    /* no usable socket info */
+       }
+       if (!skb->sk) /* return the reference that came from the lookup */
+               sock_put(sk);
+#undef GOOD_STATES
+#undef _CURR_UID
+#undef _CURR_GID
+       return ret;
+
+#endif /* LINUX > 2.4 */
+}
+
+/*
+ * Now prepare to hook the various functions.
+ * Linux 2.4 has a different API so we need some adaptation
+ * for register and unregister hooks
+ *
+ * the unregister function changed arguments between 2.6.22 and 2.6.24
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+/*
+ * Compat shim for linux 2.4, which only has the single-hook API.
+ * Registers the n hooks in order; on the first failure it rolls back
+ * the hooks already registered (matching the semantics of the real
+ * nf_register_hooks() in 2.6) and returns the error code.
+ */
+static int
+nf_register_hooks(struct nf_hook_ops *ops, int n)
+{
+       int i, ret = 0;
+       for (i = 0; i < n; i++) {
+               ret = nf_register_hook(ops + i);
+               if (ret < 0) {
+                       /* undo hooks 0..i-1 so a failed call leaves
+                        * nothing registered behind */
+                       while (--i >= 0)
+                               nf_unregister_hook(ops + i);
+                       break;
+               }
+       }
+       return ret;
+}
+
+/*
+ * Compat shim for linux 2.4: detach every hook in the array,
+ * one at a time, via the single-hook API.
+ */
+static void
+nf_unregister_hooks(struct nf_hook_ops *ops, int n)
+{
+       struct nf_hook_ops *op;
+
+       for (op = ops; op < ops + n; op++)
+               nf_unregister_hook(op);
+}
+#define REG_QH_ARG(fn) fn, NULL        /* argument for nf_[un]register_queue_handler */
+#define UNREG_QH_ARG(fn) //fn  /* argument for nf_[un]register_queue_handler */
+#define SET_MOD_OWNER
+
+#else /* linux >= 2.6.0 */
+
+struct nf_queue_handler ipfw2_queue_handler_desc = {
+        .outfn = ipfw2_queue_handler,
+        .name = "ipfw2 dummynet queue",
+};
+#define REG_QH_ARG(fn) &(fn ## _desc)
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+#define UNREG_QH_ARG(fn) //fn  /* argument for nf_[un]register_queue_handler */
+#else
+#define UNREG_QH_ARG(fn)       , &(fn ## _desc)
+#endif /* 2.6.0 < LINUX > 2.6.24 */
+
+#define SET_MOD_OWNER  .owner = THIS_MODULE,
+
+#endif /* !LINUX < 2.6.0 */
+
+static struct nf_hook_ops ipfw_ops[] __read_mostly = {
+        {      /* incoming packets, grabbed at IPFW_HOOK_IN */
+                .hook           = call_ipfw,
+                .pf             = PF_INET,
+                .hooknum        = IPFW_HOOK_IN,
+                .priority       = NF_IP_PRI_FILTER,
+                SET_MOD_OWNER
+        },
+        {      /* outgoing packets, grabbed after routing */
+                .hook           = call_ipfw,
+                .pf             = PF_INET,
+                .hooknum        = NF_IP_POST_ROUTING,
+                .priority       = NF_IP_PRI_FILTER,
+               SET_MOD_OWNER
+        },
+};
+#endif /* !__linux__ */
+
+/* descriptors for the children, until i find a way for the
+ * linker to produce them
+ */
+extern moduledata_t *moddesc_ipfw;
+extern moduledata_t *moddesc_dummynet;
+extern void *sysinit_ipfw_init;
+extern void *sysuninit_ipfw_destroy;
+extern void *sysinit_vnet_ipfw_init;
+extern void *sysuninit_vnet_ipfw_uninit;
+
+/*
+ * Module glue - init and exit function.
+ */
+static int __init
+ipfw_module_init(void)
+{
+       int ret = 0;
+
+       printf("%s in-hook %d svn id %s\n", __FUNCTION__, IPFW_HOOK_IN, "$Id: ipfw2_mod.c 4671 2010-01-04 17:50:51Z luigi $");
+
+       rn_init(64);
+
+       my_mod_register("ipfw",  1, moddesc_ipfw, NULL, NULL);
+       my_mod_register("sy_ipfw",  2, NULL,
+               sysinit_ipfw_init, sysuninit_ipfw_destroy);
+       my_mod_register("sy_Vnet_ipfw",  3, NULL,
+               sysinit_vnet_ipfw_init, sysuninit_vnet_ipfw_uninit);
+       my_mod_register("dummynet",  4, moddesc_dummynet, NULL, NULL);
+       init_children();
+
+#ifdef _WIN32
+       return ret;
+
+#else  /* linux hook */
+       /* sockopt register, in order to talk with user space */
+       ret = nf_register_sockopt(&ipfw_sockopts);
+        if (ret < 0) {
+               printf("error %d in nf_register_sockopt\n", ret);
+               goto clean_modules;
+       }
+
+       /* queue handler registration, in order to get network
+        * packet under a private queue */
+       ret = nf_register_queue_handler(PF_INET, REG_QH_ARG(ipfw2_queue_handler) );
+        if (ret < 0)   /* queue busy */
+               goto unregister_sockopt;
+
+        ret = nf_register_hooks(ipfw_ops, ARRAY_SIZE(ipfw_ops));
+        if (ret < 0)
+               goto unregister_sockopt;
+
+       printf("%s loaded\n", __FUNCTION__);
+       return 0;
+
+
+/* handle errors on load */
+unregister_sockopt:
+       nf_unregister_queue_handler(PF_INET  UNREG_QH_ARG(ipfw2_queue_handler) );
+       nf_unregister_sockopt(&ipfw_sockopts);
+
+clean_modules:
+       fini_children();
+       printf("%s error\n", __FUNCTION__);
+
+       return ret;
+#endif /* linux */
+}
+
+/* module shutdown: undo everything ipfw_module_init() registered,
+ * in reverse order (hooks, queue handler, sockopt, then children) */
+static void __exit
+ipfw_module_exit(void)
+{
+#ifdef _WIN32
+#else  /* linux hook */
+        nf_unregister_hooks(ipfw_ops, ARRAY_SIZE(ipfw_ops));
+       /* maybe drain the queue before unregistering ? */
+       nf_unregister_queue_handler(PF_INET  UNREG_QH_ARG(ipfw2_queue_handler) );
+       nf_unregister_sockopt(&ipfw_sockopts);
+#endif /* linux */
+
+       fini_children();
+
+       printf("%s unloaded\n", __FUNCTION__);
+}
+
+#ifdef __linux__
+module_init(ipfw_module_init)
+module_exit(ipfw_module_exit)
+MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */
+#endif
diff --git a/dummynet2/missing.h b/dummynet2/missing.h
new file mode 100644 (file)
index 0000000..09ea13a
--- /dev/null
@@ -0,0 +1,562 @@
+/*
+ * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id: missing.h 4666 2010-01-04 12:55:32Z luigi $
+ *
+ * Header for kernel variables and functions that are not available in
+ * userland.
+ */
+
+#ifndef _MISSING_H_
+#define _MISSING_H_
+
+#include <sys/cdefs.h>
+
+/* portability features, to be set before the rest: */
+#define HAVE_NET_IPLEN         /* iplen/ipoff in net format */
+#define WITHOUT_BPF            /* do not use bpf logging */
+
+#ifdef _WIN32
+
+#ifndef DEFINE_SPINLOCK
+#define DEFINE_SPINLOCK(x)     FAST_MUTEX x
+#endif
+/* spinlock --> Guarded Mutex KGUARDED_MUTEX */
+/* http://www.reactos.org/wiki/index.php/Guarded_Mutex */
+#define spin_lock_init(_l)
+#define spin_lock_bh(_l)
+#define spin_unlock_bh(_l)
+
+#include <sys/socket.h>                /* bsd-compat.c */
+#include <netinet/in.h>                /* bsd-compat.c */
+#include <netinet/ip.h>                /* local version */
+
+#else  /* __linux__ */
+
+#define MALLOC_DECLARE(x)      /* nothing */
+#include <linux/time.h>                /* do_gettimeofday */
+#include <netinet/ip.h>                /* local version */
+struct inpcb;
+
+/*
+ * Kernel locking support.
+ * FreeBSD uses mtx in dummynet.c and struct rwlock ip_fw2.c
+ *
+ * In linux we use spinlock_bh to implement both.
+ * For 'struct rwlock' we need an #ifdef to change it to spinlock_t
+ */
+
+#ifndef DEFINE_SPINLOCK        /* this is for linux 2.4 */
+#define DEFINE_SPINLOCK(x)   spinlock_t x = SPIN_LOCK_UNLOCKED
+#endif
+
+#endif /* __linux__ */
+
+#define rw_assert(a, b)
+#define rw_destroy(_l)
+#define rw_init(_l, msg)       spin_lock_init(_l)
+#define rw_rlock(_l)           spin_lock_bh(_l)
+#define rw_runlock(_l)         spin_unlock_bh(_l)
+#define rw_wlock(_l)           spin_lock_bh(_l)
+#define rw_wunlock(_l)         spin_unlock_bh(_l)
+#define rw_init_flags(_l, s, v)
+
+#define mtx_assert(a, b)
+#define        mtx_destroy(m)
+#define mtx_init(m, a,b,c)     spin_lock_init(m)
+#define mtx_lock(_l)           spin_lock_bh(_l)
+#define mtx_unlock(_l)         spin_unlock_bh(_l)
+
+/* end of locking support */
+
+/* in netinet/in.h */
+#define        in_nullhost(x)  ((x).s_addr == INADDR_ANY)
+
+/* bzero not present on linux, but this should go in glue.h */
+#define bzero(s, n) memset(s, 0, n)
+#define bcmp(p1, p2, n) memcmp(p1, p2, n)
+
+/* ethernet stuff */
+#define        ETHERTYPE_IP            0x0800  /* IP protocol */
+#define        ETHER_ADDR_LEN          6       /* length of an Ethernet address */
+struct ether_header {
+        u_char  ether_dhost[ETHER_ADDR_LEN];
+        u_char  ether_shost[ETHER_ADDR_LEN];
+        u_short ether_type;
+};
+
+#define ETHER_ADDR_LEN          6       /* length of an Ethernet address */
+#define ETHER_TYPE_LEN          2       /* length of the Ethernet type field */
+#define ETHER_HDR_LEN           (ETHER_ADDR_LEN*2+ETHER_TYPE_LEN)
+
+/*
+ * Historically, BSD keeps ip_len and ip_off in host format
+ * when doing layer 3 processing, and this often requires
+ * to translate the format back and forth.
+ * To make the process explicit, we define a couple of macros
+ * that also take into account the fact that at some point
+ * we may want to keep those fields always in net format.
+ */
+
+#if (BYTE_ORDER == BIG_ENDIAN) || defined(HAVE_NET_IPLEN)
+#define SET_NET_IPLEN(p)        do {} while (0)
+#define SET_HOST_IPLEN(p)       do {} while (0)
+#else /* never on linux */
+#define SET_NET_IPLEN(p)        do {            \
+        struct ip *h_ip = (p);                  \
+        h_ip->ip_len = htons(h_ip->ip_len);     \
+        h_ip->ip_off = htons(h_ip->ip_off);     \
+        } while (0)
+
+#define SET_HOST_IPLEN(p)       do {            \
+        struct ip *h_ip = (p);                  \
+        h_ip->ip_len = ntohs(h_ip->ip_len);     \
+        h_ip->ip_off = ntohs(h_ip->ip_off);     \
+        } while (0)
+#endif /* !HAVE_NET_IPLEN */
+
+/* ip_dummynet.c */
+#define __FreeBSD_version 500035
+
+#ifdef __linux__
+struct moduledata;
+int my_mod_register(const char *name,
+       int order, struct moduledata *mod, void *init, void *uninit);
+
+/* define some macro for ip_dummynet */
+
+struct malloc_type {
+};
+
+#define MALLOC_DEFINE(type, shortdesc, longdesc)       \
+       struct malloc_type type[1]; void *md_dummy_ ## type = type
+
+#define CTASSERT(x)
+
+/* log... does not use the first argument */
+#define        LOG_ERR         0x100
+#define        LOG_INFO        0x200
+#define log(_level, fmt, arg...)  do {                 \
+       int __unused x=_level;printk(KERN_ERR fmt, ##arg); } while (0)
+
+/*
+ * gettimeofday would be in sys/time.h but it is not
+ * visible if _KERNEL is defined
+ */
+int gettimeofday(struct timeval *, struct timezone *);
+
+#else  /* _WIN32 */
+#define MALLOC_DEFINE(a,b,c)
+#endif /* _WIN32 */
+
+extern int     hz;
+extern long    tick;           /* exists in 2.4 but not in 2.6 */
+extern int     bootverbose;
+extern time_t  time_uptime;
+extern struct timeval boottime;
+
+extern int     max_linkhdr;
+extern int     ip_defttl;
+extern u_long  in_ifaddrhmask;                         /* mask for hash table */
+extern struct in_ifaddrhashhead *in_ifaddrhashtbl;    /* inet addr hash table  */
+
+/*-------------------------------------------------*/
+
+/* define, includes and functions missing in linux */
+/* include and define */
+#include <arpa/inet.h>         /* inet_ntoa */
+
+struct mbuf;
+
+/* used by ip_dummynet.c */
+void reinject_drop(struct mbuf* m);
+
+#include <linux/errno.h>       /* error define */
+#include <linux/if.h>          /* IFNAMESIZ */
+
+void rn_init(int);
+/*
+ * some network structure can be defined in the bsd way
+ * by using the _FAVOR_BSD definition. This is not true
+ * for icmp structure.
+ * XXX struct icmp contains bsd names in 
+ * /usr/include/netinet/ip_icmp.h
+ */
+#ifdef __linux__
+#define icmp_code code
+#define icmp_type type
+
+/* linux in6_addr has no member __u6_addr
+ * replace the whole structure ?
+ */
+#define __u6_addr       in6_u
+#define __u6_addr32     u6_addr32
+#endif /* __linux__ */
+
+/* defined in linux/sctp.h with no bsd definition */
+struct sctphdr {
+        uint16_t src_port;      /* source port */
+        uint16_t dest_port;     /* destination port */
+        uint32_t v_tag;         /* verification tag of packet */
+        uint32_t checksum;      /* Adler32 C-Sum */
+        /* chunks follow... */
+};
+
+/* missing definition */
+#define TH_FIN  0x01
+#define TH_SYN  0x02
+#define TH_RST  0x04
+#define TH_ACK  0x10
+
+#define RTF_CLONING    0x100           /* generate new routes on use */
+
+#define IPPROTO_OSPFIGP         89              /* OSPFIGP */
+#define IPPROTO_CARP            112             /* CARP */
+#ifndef _WIN32
+#define IPPROTO_IPV4            IPPROTO_IPIP    /* for compatibility */
+#endif
+
+#define        CARP_VERSION            2
+#define        CARP_ADVERTISEMENT      0x01
+
+#define PRIV_NETINET_IPFW       491     /* Administer IPFW firewall. */
+
+#define IP_FORWARDING           0x1             /* most of ip header exists */
+
+#define NETISR_IP       2               /* same as AF_INET */
+
+#define PRIV_NETINET_DUMMYNET   494     /* Administer DUMMYNET. */
+
+extern int securelevel;
+
+struct carp_header {
+#if BYTE_ORDER == LITTLE_ENDIAN
+        u_int8_t        carp_type:4,
+                        carp_version:4;
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+        u_int8_t        carp_version:4,
+                        carp_type:4;
+#endif
+};
+
+struct pim {
+       int dummy;      /* windows compiler does not like empty definition */
+};
+
+struct route {
+       struct  rtentry *ro_rt;
+       struct  sockaddr ro_dst;
+};
+
+struct ifaltq {
+       void *ifq_head;
+};
+
+/*
+ * ifnet->if_snd is used in ip_dummynet.c to take the transmission
+ * clock.
+ */
+#if defined( __linux__)
+#define        if_xname        name
+#define        if_snd          XXX
+#elif defined( _WIN32 )
+/* used in ip_dummynet.c */
+struct ifnet {
+       char    if_xname[IFNAMSIZ];     /* external name (name + unit) */
+//        struct ifaltq if_snd;          /* output queue (includes altq) */
+};
+
+struct net_device {
+       char    if_xname[IFNAMSIZ];     /* external name (name + unit) */
+};
+#endif
+
+/* involves mbufs */
+int in_cksum(struct mbuf *m, int len);
+#define divert_cookie(mtag) 0
+#define divert_info(mtag) 0
+#define INADDR_TO_IFP(a, b) b = NULL
+#define pf_find_mtag(a) NULL
+#define pf_get_mtag(a) NULL
+#ifndef _WIN32
+#define AF_LINK AF_ASH /* ? our sys/socket.h */
+#endif
+
+/* we don't pullup, either success or free and fail */
+#define m_pullup(m, x)                                 \
+       ((m)->m_len >= x ? (m) : (FREE_PKT(m), NULL))
+
+struct pf_mtag {
+       void            *hdr;           /* saved hdr pos in mbuf, for ECN */
+       sa_family_t      af;            /* for ECN */
+        u_int32_t        qid;           /* queue id */
+};
+
+#if 0 // ndef radix
+/* radix stuff in radix.h and radix.c */
+struct radix_node {
+       caddr_t rn_key;         /* object of search */
+       caddr_t rn_mask;        /* netmask, if present */
+};
+#endif /* !radix */
+
+/* missing kernel functions */
+char *inet_ntoa(struct in_addr ina);
+int random(void);
+
+/*
+ * Return the result of a/b
+ *
+ * this is used in linux kernel space,
+ * since the 64bit division needs to
+ * be done using a macro
+ */
+int64_t
+div64(int64_t a, int64_t b);
+
+char *
+inet_ntoa_r(struct in_addr ina, char *buf);
+
+/* from bsd sys/queue.h */
+#define TAILQ_FOREACH_SAFE(var, head, field, tvar)                      \
+        for ((var) = TAILQ_FIRST((head));                               \
+            (var) && ((tvar) = TAILQ_NEXT((var), field), 1);            \
+            (var) = (tvar))
+
+#define SLIST_FOREACH_SAFE(var, head, field, tvar)                      \
+        for ((var) = SLIST_FIRST((head));                               \
+            (var) && ((tvar) = SLIST_NEXT((var), field), 1);            \
+            (var) = (tvar))
+
+/* depending of linux version */
+#ifndef ETHERTYPE_IPV6
+#define ETHERTYPE_IPV6          0x86dd          /* IP protocol version 6 */
+#endif
+
+/*-------------------------------------------------*/
+#define RT_NUMFIBS 1
+extern u_int rt_numfibs;
+
+/* involves kernel locking function */
+#ifdef RTFREE
+#undef RTFREE
+#define RTFREE(a) fprintf(stderr, "RTFREE: commented out locks\n");
+#endif
+
+void getmicrouptime(struct timeval *tv);
+
+/* from sys/netinet/ip_output.c */
+struct ip_moptions;
+struct route;
+struct ip;
+
+struct mbuf *ip_reass(struct mbuf *);
+u_short in_cksum_hdr(struct ip *);
+int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
+    struct ip_moptions *imo, struct inpcb *inp);
+
+/* from net/netisr.c */
+void netisr_dispatch(int num, struct mbuf *m);
+
+/* definition moved in missing.c */
+int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len);
+
+int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen);
+
+/* defined in session.c */
+int priv_check(struct thread *td, int priv);
+
+/* struct ucred is in linux/socket.h and has pid, uid, gid.
+ * We need a 'bsd_ucred' to store also the extra info
+ */
+
+struct bsd_ucred {     /* credential info filled in by linux_lookup() */
+       uid_t           uid;    /* uid of the owning socket's file */
+       gid_t           gid;    /* gid of the owning socket's file */
+       uint32_t        xid;    /* vserver context id, 0 without CONFIG_VSERVER */
+       uint32_t        nid;    /* vserver network id, 0 without CONFIG_VSERVER */
+};
+
+int
+cred_check(void *insn, int proto, struct ifnet *oif,
+    struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
+    u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp,
+    struct sk_buff *skb);
+
+int securelevel_ge(struct ucred *cr, int level);
+
+struct sysctl_oid;
+struct sysctl_req;
+
+/*
+ * sysctl are mapped into /sys/module/ipfw_mod parameters
+ */
+#define CTLFLAG_RD             1
+#define CTLFLAG_RDTUN          1
+#define CTLFLAG_RW             2
+#define CTLFLAG_SECURE3                0 // unsupported
+#define CTLFLAG_VNET    0      /* unsupported */
+
+#ifdef _WIN32
+#define module_param_named(_name, _var, _ty, _perm)
+#else
+
+/* Linux 2.4 is mostly for openwrt */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+#include <linux/bitops.h>       /* generic_ffs() used in ip_fw2.c */
+typedef uint32_t __be32;
+typedef uint16_t __be16;
+struct sock;
+struct net;
+struct inet_hashinfo;
+struct sock *inet_lookup(
+       struct inet_hashinfo *hashinfo,
+        const __be32 saddr, const __be16 sport,
+        const __be32 daddr, const __be16 dport,
+        const int dif);
+struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif);
+#endif /* Linux < 2.6 */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
+#define module_param_named(_name, _var, _ty, _perm)    \
+       //module_param(_name, _ty, 0644)
+#endif
+#endif /* __linux__ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
+typedef unsigned long uintptr_t;
+#endif
+
+#define SYSCTL_DECL(_1)
+#define SYSCTL_OID(_1, _2, _3, _4, _5, _6, _7, _8)
+#define SYSCTL_NODE(_1, _2, _3, _4, _5, _6)
+#define _SYSCTL_BASE(_name, _var, _ty, _perm)          \
+       module_param_named(_name, *(_var), _ty,         \
+               ( (_perm) == CTLFLAG_RD) ? 0444: 0644 )
+#define SYSCTL_PROC(_base, _oid, _name, _mode, _var, _val, _desc, _a, _b)
+
+#define SYSCTL_INT(_base, _oid, _name, _mode, _var, _val, _desc)       \
+       _SYSCTL_BASE(_name, _var, int, _mode)
+
+#define SYSCTL_LONG(_base, _oid, _name, _mode, _var, _val, _desc)      \
+       _SYSCTL_BASE(_name, _var, long, _mode)
+
+#define SYSCTL_ULONG(_base, _oid, _name, _mode, _var, _val, _desc)     \
+       _SYSCTL_BASE(_name, _var, ulong, _mode)
+
+#define SYSCTL_UINT(_base, _oid, _name, _mode, _var, _val, _desc)      \
+        _SYSCTL_BASE(_name, _var, uint, _mode)
+
+#define SYSCTL_HANDLER_ARGS            \
+       struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req
+int sysctl_handle_int(SYSCTL_HANDLER_ARGS);
+int sysctl_handle_long(SYSCTL_HANDLER_ARGS); 
+
+#define TUNABLE_INT(_name, _ptr)
+
+void ether_demux(struct ifnet *ifp, struct mbuf *m);
+
+int ether_output_frame(struct ifnet *ifp, struct mbuf *m);
+
+void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum);
+
+void icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu);
+
+void rtfree(struct rtentry *rt);
+
+u_short in_cksum_skip(struct mbuf *m, int len, int skip);
+
+#ifdef INP_LOCK_ASSERT
+#undef INP_LOCK_ASSERT
+#define INP_LOCK_ASSERT(a)
+#endif
+
+int jailed(struct ucred *cred);
+
+/*
+* Return 1 if an internet address is for a ``local'' host
+* (one to which we have a connection).  If subnetsarelocal
+* is true, this includes other subnets of the local net.
+* Otherwise, it includes only the directly-connected (sub)nets.
+*/
+int in_localaddr(struct in_addr in);
+
+/* the prototype is already in the headers */
+//int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); 
+
+int fnmatch(const char *pattern, const char *string, int flags);
+
+int
+linux_lookup(const int proto, const __be32 saddr, const __be16 sport,
+       const __be32 daddr, const __be16 dport,
+       struct sk_buff *skb, int dir, struct bsd_ucred *u);
+
+/* vnet wrappers, in vnet.h and ip_var.h */
+//int ipfw_init(void);
+//void ipfw_destroy(void);
+struct ip_fw_args;
+extern int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa);
+
+#define curvnet                 NULL
+#define        CURVNET_SET(_v)
+#define        CURVNET_RESTORE()
+#define VNET_ASSERT(condition)
+
+#define VNET_NAME(n)            n
+#define VNET_DECLARE(t, n)      extern t n
+#define VNET_DEFINE(t, n)       t n
+#define _VNET_PTR(b, n)         &VNET_NAME(n)
+/*
+ * Virtualized global variable accessor macros.
+ */
+#define VNET_VNET_PTR(vnet, n)          (&(n))
+#define VNET_VNET(vnet, n)              (n)
+
+#define VNET_PTR(n)             (&(n))
+#define VNET(n)                 (n)
+
+int
+ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
+    struct inpcb *inp);
+
+extern int (*ip_dn_ctl_ptr)(struct sockopt *);
+typedef int ip_fw_ctl_t(struct sockopt *);
+extern ip_fw_ctl_t *ip_fw_ctl_ptr;
+
+/* For kernel ipfw_ether and ipfw_bridge. */
+struct ip_fw_args;
+typedef int ip_fw_chk_t(struct ip_fw_args *args);
+extern  ip_fw_chk_t     *ip_fw_chk_ptr;
+
+#define V_ip_fw_chk_ptr         VNET(ip_fw_chk_ptr)
+#define V_ip_fw_ctl_ptr         VNET(ip_fw_ctl_ptr)
+#define        V_tcbinfo               VNET(tcbinfo)
+#define        V_udbinfo               VNET(udbinfo)
+
+#define SYSCTL_VNET_PROC       SYSCTL_PROC
+#define SYSCTL_VNET_INT                SYSCTL_INT
+
+#endif /* !_MISSING_H_ */
diff --git a/dummynet2/radix.c b/dummynet2/radix.c
new file mode 100644 (file)
index 0000000..5d508e4
--- /dev/null
@@ -0,0 +1,1186 @@
+/*-
+ * Copyright (c) 1988, 1989, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)radix.c     8.5 (Berkeley) 5/19/95
+ * $FreeBSD: head/sys/net/radix.c 200354 2009-12-10 10:34:30Z luigi $
+ */
+
+/*
+ * Routines to build and maintain radix trees for routing lookups.
+ */
+#include <sys/param.h>
+#ifdef _KERNEL
+#include <sys/cdefs.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/syslog.h>
+#include <net/radix.h>
+#include "opt_mpath.h"
+#ifdef RADIX_MPATH
+#include <net/radix_mpath.h>
+#endif
+#else /* !_KERNEL */
+#include <stdio.h>
+#include <strings.h>
+#include <stdlib.h>
+#define log(x, arg...) fprintf(stderr, ## arg)
+#define panic(x)       fprintf(stderr, "PANIC: %s", x), exit(1)
+#define min(a, b) ((a) < (b) ? (a) : (b) )
+#include "include/net/radix.h"
+#endif /* !_KERNEL */
+
+static int     rn_walktree_from(struct radix_node_head *h, void *a, void *m,
+                   walktree_f_t *f, void *w);
+static int rn_walktree(struct radix_node_head *, walktree_f_t *, void *);
+static struct radix_node
+        *rn_insert(void *, struct radix_node_head *, int *,
+            struct radix_node [2]),
+        *rn_newpair(void *, int, struct radix_node[2]),
+        *rn_search(void *, struct radix_node *),
+        *rn_search_m(void *, struct radix_node *, void *);
+
+static int     max_keylen;
+static struct radix_mask *rn_mkfreelist;
+static struct radix_node_head *mask_rnhead;
+/*
+ * Work area -- the following point to 3 buffers of size max_keylen,
+ * allocated in this order in a block of memory malloc'ed by rn_init.
+ * rn_zeros, rn_ones are set in rn_init and used in readonly afterwards.
+ * addmask_key is used in rn_addmask in rw mode and not thread-safe.
+ */
+static char *rn_zeros, *rn_ones, *addmask_key;
+
+#define MKGet(m) {                                             \
+       if (rn_mkfreelist) {                                    \
+               m = rn_mkfreelist;                              \
+               rn_mkfreelist = (m)->rm_mklist;                 \
+       } else                                                  \
+               R_Malloc(m, struct radix_mask *, sizeof (struct radix_mask)); }
+#define MKFree(m) { (m)->rm_mklist = rn_mkfreelist; rn_mkfreelist = (m);}
+
+#define rn_masktop (mask_rnhead->rnh_treetop)
+
+static int     rn_lexobetter(void *m_arg, void *n_arg);
+static struct radix_mask *
+               rn_new_radix_mask(struct radix_node *tt,
+                   struct radix_mask *next);
+static int     rn_satisfies_leaf(char *trial, struct radix_node *leaf,
+                   int skip);
+
+/*
+ * The data structure for the keys is a radix tree with one way
+ * branching removed.  The index rn_bit at an internal node n represents a bit
+ * position to be tested.  The tree is arranged so that all descendants
+ * of a node n have keys whose bits all agree up to position rn_bit - 1.
+ * (We say the index of n is rn_bit.)
+ *
+ * There is at least one descendant which has a one bit at position rn_bit,
+ * and at least one with a zero there.
+ *
+ * A route is determined by a pair of key and mask.  We require that the
+ * bit-wise logical and of the key and mask to be the key.
+ * We define the index of a route to associated with the mask to be
+ * the first bit number in the mask where 0 occurs (with bit number 0
+ * representing the highest order bit).
+ *
+ * We say a mask is normal if every bit is 0, past the index of the mask.
+ * If a node n has a descendant (k, m) with index(m) == index(n) == rn_bit,
+ * and m is a normal mask, then the route applies to every descendant of n.
+ * If the index(m) < rn_bit, this implies the trailing last few bits of k
+ * before bit b are all 0, (and hence consequently true of every descendant
+ * of n), so the route applies to all descendants of the node as well.
+ *
+ * Similar logic shows that a non-normal mask m such that
+ * index(m) <= index(n) could potentially apply to many children of n.
+ * Thus, for each non-host route, we attach its mask to a list at an internal
+ * node as high in the tree as we can go.
+ *
+ * The present version of the code makes use of normal routes in short-
+ * circuiting an explict mask and compare operation when testing whether
+ * a key satisfies a normal route, and also in remembering the unique leaf
+ * that governs a subtree.
+ */
+
+/*
+ * Most of the functions in this code assume that the key/mask arguments
+ * are sockaddr-like structures, where the first byte is an u_char
+ * indicating the size of the entire structure.
+ *
+ * To make the assumption more explicit, we use the LEN() macro to access
+ * this field. It is safe to pass an expression with side effects
+ * to LEN() as the argument is evaluated only once.
+ * We cast the result to int as this is the dominant usage.
+ */
+#define LEN(x) ( (int) (*(const u_char *)(x)) )
+
+/*
+ * XXX THIS NEEDS TO BE FIXED
+ * In the code, pointers to keys and masks are passed as either
+ * 'void *' (because callers use to pass pointers of various kinds), or
+ * 'caddr_t' (which is fine for pointer arithmetics, but not very
+ * clean when you dereference it to access data). Furthermore, caddr_t
+ * is really 'char *', while the natural type to operate on keys and
+ * masks would be 'u_char'. This mismatch require a lot of casts and
+ * intermediate variables to adapt types that clutter the code.
+ */
+
+/*
+ * Search a node in the tree matching the key.
+ */
+static struct radix_node *
+rn_search(v_arg, head)
+       void *v_arg;
+       struct radix_node *head;
+{
+       register struct radix_node *x;
+       register caddr_t v;
+
+       for (x = head, v = v_arg; x->rn_bit >= 0;) {
+               if (x->rn_bmask & v[x->rn_offset])
+                       x = x->rn_right;
+               else
+                       x = x->rn_left;
+       }
+       return (x);
+}
+
+/*
+ * Same as above, but with an additional mask.
+ * XXX note this function is used only once.
+ */
+static struct radix_node *
+rn_search_m(v_arg, head, m_arg)
+       struct radix_node *head;
+       void *v_arg, *m_arg;
+{
+       register struct radix_node *x;
+       register caddr_t v = v_arg, m = m_arg;
+
+       for (x = head; x->rn_bit >= 0;) {
+               if ((x->rn_bmask & m[x->rn_offset]) &&
+                   (x->rn_bmask & v[x->rn_offset]))
+                       x = x->rn_right;
+               else
+                       x = x->rn_left;
+       }
+       return x;
+}
+
+int
+rn_refines(m_arg, n_arg)
+       void *m_arg, *n_arg;
+{
+       register caddr_t m = m_arg, n = n_arg;
+       register caddr_t lim, lim2 = lim = n + LEN(n);
+       int longer = LEN(n++) - LEN(m++);
+       int masks_are_equal = 1;
+
+       if (longer > 0)
+               lim -= longer;
+       while (n < lim) {
+               if (*n & ~(*m))
+                       return 0;
+               if (*n++ != *m++)
+                       masks_are_equal = 0;
+       }
+       while (n < lim2)
+               if (*n++)
+                       return 0;
+       if (masks_are_equal && (longer < 0))
+               for (lim2 = m - longer; m < lim2; )
+                       if (*m++)
+                               return 1;
+       return (!masks_are_equal);
+}
+
+struct radix_node *
+rn_lookup(v_arg, m_arg, head)
+       void *v_arg, *m_arg;
+       struct radix_node_head *head;
+{
+       register struct radix_node *x;
+       caddr_t netmask = 0;
+
+       if (m_arg) {
+               x = rn_addmask(m_arg, 1, head->rnh_treetop->rn_offset);
+               if (x == 0)
+                       return (0);
+               netmask = x->rn_key;
+       }
+       x = rn_match(v_arg, head);
+       if (x && netmask) {
+               while (x && x->rn_mask != netmask)
+                       x = x->rn_dupedkey;
+       }
+       return x;
+}
+
+static int
+rn_satisfies_leaf(trial, leaf, skip)
+       char *trial;
+       register struct radix_node *leaf;
+       int skip;
+{
+       register char *cp = trial, *cp2 = leaf->rn_key, *cp3 = leaf->rn_mask;
+       char *cplim;
+       int length = min(LEN(cp), LEN(cp2));
+
+       if (cp3 == NULL)
+               cp3 = rn_ones;
+       else
+               length = min(length, LEN(cp3));
+       cplim = cp + length; cp3 += skip; cp2 += skip;
+       for (cp += skip; cp < cplim; cp++, cp2++, cp3++)
+               if ((*cp ^ *cp2) & *cp3)
+                       return 0;
+       return 1;
+}
+
+struct radix_node *
+rn_match(v_arg, head)
+       void *v_arg;
+       struct radix_node_head *head;
+{
+       caddr_t v = v_arg;
+       register struct radix_node *t = head->rnh_treetop, *x;
+       register caddr_t cp = v, cp2;
+       caddr_t cplim;
+       struct radix_node *saved_t, *top = t;
+       int off = t->rn_offset, vlen = LEN(cp), matched_off;
+       register int test, b, rn_bit;
+
+       /*
+        * Open code rn_search(v, top) to avoid overhead of extra
+        * subroutine call.
+        */
+       for (; t->rn_bit >= 0; ) {
+               if (t->rn_bmask & cp[t->rn_offset])
+                       t = t->rn_right;
+               else
+                       t = t->rn_left;
+       }
+       /*
+        * See if we match exactly as a host destination
+        * or at least learn how many bits match, for normal mask finesse.
+        *
+        * It doesn't hurt us to limit how many bytes to check
+        * to the length of the mask, since if it matches we had a genuine
+        * match and the leaf we have is the most specific one anyway;
+        * if it didn't match with a shorter length it would fail
+        * with a long one.  This wins big for class B&C netmasks which
+        * are probably the most common case...
+        */
+       if (t->rn_mask)
+               vlen = *(u_char *)t->rn_mask;
+       cp += off; cp2 = t->rn_key + off; cplim = v + vlen;
+       for (; cp < cplim; cp++, cp2++)
+               if (*cp != *cp2)
+                       goto on1;
+       /*
+        * This extra grot is in case we are explicitly asked
+        * to look up the default.  Ugh!
+        *
+        * Never return the root node itself, it seems to cause a
+        * lot of confusion.
+        */
+       if (t->rn_flags & RNF_ROOT)
+               t = t->rn_dupedkey;
+       return t;
+on1:
+       test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */
+       for (b = 7; (test >>= 1) > 0;)
+               b--;
+       matched_off = cp - v;
+       b += matched_off << 3;
+       rn_bit = -1 - b;
+       /*
+        * If there is a host route in a duped-key chain, it will be first.
+        */
+       if ((saved_t = t)->rn_mask == 0)
+               t = t->rn_dupedkey;
+       for (; t; t = t->rn_dupedkey)
+               /*
+                * Even if we don't match exactly as a host,
+                * we may match if the leaf we wound up at is
+                * a route to a net.
+                */
+               if (t->rn_flags & RNF_NORMAL) {
+                       if (rn_bit <= t->rn_bit)
+                               return t;
+               } else if (rn_satisfies_leaf(v, t, matched_off))
+                               return t;
+       t = saved_t;
+       /* start searching up the tree */
+       do {
+               register struct radix_mask *m;
+               t = t->rn_parent;
+               m = t->rn_mklist;
+               /*
+                * If non-contiguous masks ever become important
+                * we can restore the masking and open coding of
+                * the search and satisfaction test and put the
+                * calculation of "off" back before the "do".
+                */
+               while (m) {
+                       if (m->rm_flags & RNF_NORMAL) {
+                               if (rn_bit <= m->rm_bit)
+                                       return (m->rm_leaf);
+                       } else {
+                               off = min(t->rn_offset, matched_off);
+                               x = rn_search_m(v, t, m->rm_mask);
+                               while (x && x->rn_mask != m->rm_mask)
+                                       x = x->rn_dupedkey;
+                               if (x && rn_satisfies_leaf(v, x, off))
+                                       return x;
+                       }
+                       m = m->rm_mklist;
+               }
+       } while (t != top);
+       return 0;
+}
+
+#ifdef RN_DEBUG
+int    rn_nodenum;
+struct radix_node *rn_clist;
+int    rn_saveinfo;
+int    rn_debug =  1;
+#endif
+
+/*
+ * Whenever we add a new leaf to the tree, we also add a parent node,
+ * so we allocate them as an array of two elements: the first one must be
+ * the leaf (see RNTORT() in route.c), the second one is the parent.
+ * This routine initializes the relevant fields of the nodes, so that
+ * the leaf is the left child of the parent node, and both nodes have
+ * (almost) all all fields filled as appropriate.
+ * (XXX some fields are left unset, see the '#if 0' section).
+ * The function returns a pointer to the parent node.
+ */
+
+static struct radix_node *
+rn_newpair(v, b, nodes)
+       void *v;
+       int b;
+       struct radix_node nodes[2];
+{
+       register struct radix_node *tt = nodes, *t = tt + 1;
+       t->rn_bit = b;
+       t->rn_bmask = 0x80 >> (b & 7);
+       t->rn_left = tt;
+       t->rn_offset = b >> 3;
+
+#if 0  /* XXX perhaps we should fill these fields as well. */
+       t->rn_parent = t->rn_right = NULL;
+
+       tt->rn_mask = NULL;
+       tt->rn_dupedkey = NULL;
+       tt->rn_bmask = 0;
+#endif
+       tt->rn_bit = -1;
+       tt->rn_key = (caddr_t)v;
+       tt->rn_parent = t;
+       tt->rn_flags = t->rn_flags = RNF_ACTIVE;
+       tt->rn_mklist = t->rn_mklist = 0;
+#ifdef RN_DEBUG
+       tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++;
+       tt->rn_twin = t;
+       tt->rn_ybro = rn_clist;
+       rn_clist = tt;
+#endif
+       return t;
+}
+
+static struct radix_node *
+rn_insert(v_arg, head, dupentry, nodes)
+       void *v_arg;
+       struct radix_node_head *head;
+       int *dupentry;
+       struct radix_node nodes[2];
+{
+       caddr_t v = v_arg;
+       struct radix_node *top = head->rnh_treetop;
+       int head_off = top->rn_offset, vlen = LEN(v);
+       register struct radix_node *t = rn_search(v_arg, top);
+       register caddr_t cp = v + head_off;
+       register int b;
+       struct radix_node *tt;
+       /*
+        * Find first bit at which v and t->rn_key differ
+        */
+    {
+       register caddr_t cp2 = t->rn_key + head_off;
+       register int cmp_res;
+       caddr_t cplim = v + vlen;
+
+       while (cp < cplim)
+               if (*cp2++ != *cp++)
+                       goto on1;
+       *dupentry = 1;
+       return t;
+on1:
+       *dupentry = 0;
+       cmp_res = (cp[-1] ^ cp2[-1]) & 0xff;
+       for (b = (cp - v) << 3; cmp_res; b--)
+               cmp_res >>= 1;
+    }
+    {
+       register struct radix_node *p, *x = top;
+       cp = v;
+       do {
+               p = x;
+               if (cp[x->rn_offset] & x->rn_bmask)
+                       x = x->rn_right;
+               else
+                       x = x->rn_left;
+       } while (b > (unsigned) x->rn_bit);
+                               /* x->rn_bit < b && x->rn_bit >= 0 */
+#ifdef RN_DEBUG
+       if (rn_debug)
+               log(LOG_DEBUG, "rn_insert: Going In:\n"), traverse(p);
+#endif
+       t = rn_newpair(v_arg, b, nodes); 
+       tt = t->rn_left;
+       if ((cp[p->rn_offset] & p->rn_bmask) == 0)
+               p->rn_left = t;
+       else
+               p->rn_right = t;
+       x->rn_parent = t;
+       t->rn_parent = p; /* frees x, p as temp vars below */
+       if ((cp[t->rn_offset] & t->rn_bmask) == 0) {
+               t->rn_right = x;
+       } else {
+               t->rn_right = tt;
+               t->rn_left = x;
+       }
+#ifdef RN_DEBUG
+       if (rn_debug)
+               log(LOG_DEBUG, "rn_insert: Coming Out:\n"), traverse(p);
+#endif
+    }
+       return (tt);
+}
+
+struct radix_node *
+rn_addmask(n_arg, search, skip)
+       int search, skip;
+       void *n_arg;
+{
+       caddr_t netmask = (caddr_t)n_arg;
+       register struct radix_node *x;
+       register caddr_t cp, cplim;
+       register int b = 0, mlen, j;
+       int maskduplicated, m0, isnormal;
+       struct radix_node *saved_x;
+       static int last_zeroed = 0;
+
+       if ((mlen = LEN(netmask)) > max_keylen)
+               mlen = max_keylen;
+       if (skip == 0)
+               skip = 1;
+       if (mlen <= skip)
+               return (mask_rnhead->rnh_nodes);
+       if (skip > 1)
+               bcopy(rn_ones + 1, addmask_key + 1, skip - 1);
+       if ((m0 = mlen) > skip)
+               bcopy(netmask + skip, addmask_key + skip, mlen - skip);
+       /*
+        * Trim trailing zeroes.
+        */
+       for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0;)
+               cp--;
+       mlen = cp - addmask_key;
+       if (mlen <= skip) {
+               if (m0 >= last_zeroed)
+                       last_zeroed = mlen;
+               return (mask_rnhead->rnh_nodes);
+       }
+       if (m0 < last_zeroed)
+               bzero(addmask_key + m0, last_zeroed - m0);
+       *addmask_key = last_zeroed = mlen;
+       x = rn_search(addmask_key, rn_masktop);
+       if (bcmp(addmask_key, x->rn_key, mlen) != 0)
+               x = 0;
+       if (x || search)
+               return (x);
+       R_Zalloc(x, struct radix_node *, max_keylen + 2 * sizeof (*x));
+       if ((saved_x = x) == 0)
+               return (0);
+       netmask = cp = (caddr_t)(x + 2);
+       bcopy(addmask_key, cp, mlen);
+       x = rn_insert(cp, mask_rnhead, &maskduplicated, x);
+       if (maskduplicated) {
+               log(LOG_ERR, "rn_addmask: mask impossibly already in tree");
+               Free(saved_x);
+               return (x);
+       }
+       /*
+        * Calculate index of mask, and check for normalcy.
+        * First find the first byte with a 0 bit, then if there are
+        * more bits left (remember we already trimmed the trailing 0's),
+        * the pattern must be one of those in normal_chars[], or we have
+        * a non-contiguous mask.
+        */
+       cplim = netmask + mlen;
+       isnormal = 1;
+       for (cp = netmask + skip; (cp < cplim) && *(u_char *)cp == 0xff;)
+               cp++;
+       if (cp != cplim) {
+               static char normal_chars[] = {
+                       0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};
+
+               for (j = 0x80; (j & *cp) != 0; j >>= 1)
+                       b++;
+               if (*cp != normal_chars[b] || cp != (cplim - 1))
+                       isnormal = 0;
+       }
+       b += (cp - netmask) << 3;
+       x->rn_bit = -1 - b;
+       if (isnormal)
+               x->rn_flags |= RNF_NORMAL;
+       return (x);
+}
+
+static int     /* XXX: arbitrary ordering for non-contiguous masks */
+rn_lexobetter(m_arg, n_arg)
+       void *m_arg, *n_arg;
+{
+       register u_char *mp = m_arg, *np = n_arg, *lim;
+
+       if (LEN(mp) > LEN(np))
+               return 1;  /* not really, but need to check longer one first */
+       if (LEN(mp) == LEN(np))
+               for (lim = mp + LEN(mp); mp < lim;)
+                       if (*mp++ > *np++)
+                               return 1;
+       return 0;
+}
+
+static struct radix_mask *
+rn_new_radix_mask(tt, next)
+       register struct radix_node *tt;
+       register struct radix_mask *next;
+{
+       register struct radix_mask *m;
+
+       MKGet(m);
+       if (m == 0) {
+               log(LOG_ERR, "Mask for route not entered\n");
+               return (0);
+       }
+       bzero(m, sizeof *m);
+       m->rm_bit = tt->rn_bit;
+       m->rm_flags = tt->rn_flags;
+       if (tt->rn_flags & RNF_NORMAL)
+               m->rm_leaf = tt;
+       else
+               m->rm_mask = tt->rn_mask;
+       m->rm_mklist = next;
+       tt->rn_mklist = m;
+       return m;
+}
+
+struct radix_node *
+rn_addroute(v_arg, n_arg, head, treenodes)
+       void *v_arg, *n_arg;
+       struct radix_node_head *head;
+       struct radix_node treenodes[2];
+{
+       caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg;
+       register struct radix_node *t, *x = 0, *tt;
+       struct radix_node *saved_tt, *top = head->rnh_treetop;
+       short b = 0, b_leaf = 0;
+       int keyduplicated;
+       caddr_t mmask;
+       struct radix_mask *m, **mp;
+
+       /*
+        * In dealing with non-contiguous masks, there may be
+        * many different routes which have the same mask.
+        * We will find it useful to have a unique pointer to
+        * the mask to speed avoiding duplicate references at
+        * nodes and possibly save time in calculating indices.
+        */
+       if (netmask)  {
+               if ((x = rn_addmask(netmask, 0, top->rn_offset)) == 0)
+                       return (0);
+               b_leaf = x->rn_bit;
+               b = -1 - x->rn_bit;
+               netmask = x->rn_key;
+       }
+       /*
+        * Deal with duplicated keys: attach node to previous instance
+        */
+       saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes);
+       if (keyduplicated) {
+               for (t = tt; tt; t = tt, tt = tt->rn_dupedkey) {
+#ifdef RADIX_MPATH
+                       /* permit multipath, if enabled for the family */
+                       if (rn_mpath_capable(head) && netmask == tt->rn_mask) {
+                               /*
+                                * go down to the end of multipaths, so that
+                                * new entry goes into the end of rn_dupedkey
+                                * chain.
+                                */
+                               do {
+                                       t = tt;
+                                       tt = tt->rn_dupedkey;
+                               } while (tt && t->rn_mask == tt->rn_mask);
+                               break;
+                       }
+#endif
+                       if (tt->rn_mask == netmask)
+                               return (0);
+                       if (netmask == 0 ||
+                           (tt->rn_mask &&
+                            ((b_leaf < tt->rn_bit) /* index(netmask) > node */
+                             || rn_refines(netmask, tt->rn_mask)
+                             || rn_lexobetter(netmask, tt->rn_mask))))
+                               break;
+               }
+               /*
+                * If the mask is not duplicated, we wouldn't
+                * find it among possible duplicate key entries
+                * anyway, so the above test doesn't hurt.
+                *
+                * We sort the masks for a duplicated key the same way as
+                * in a masklist -- most specific to least specific.
+                * This may require the unfortunate nuisance of relocating
+                * the head of the list.
+                *
+                * We also reverse, or doubly link the list through the
+                * parent pointer.
+                */
+               if (tt == saved_tt) {
+                       struct  radix_node *xx = x;
+                       /* link in at head of list */
+                       (tt = treenodes)->rn_dupedkey = t;
+                       tt->rn_flags = t->rn_flags;
+                       tt->rn_parent = x = t->rn_parent;
+                       t->rn_parent = tt;                      /* parent */
+                       if (x->rn_left == t)
+                               x->rn_left = tt;
+                       else
+                               x->rn_right = tt;
+                       saved_tt = tt; x = xx;
+               } else {
+                       (tt = treenodes)->rn_dupedkey = t->rn_dupedkey;
+                       t->rn_dupedkey = tt;
+                       tt->rn_parent = t;                      /* parent */
+                       if (tt->rn_dupedkey)                    /* parent */
+                               tt->rn_dupedkey->rn_parent = tt; /* parent */
+               }
+#ifdef RN_DEBUG
+               t=tt+1; tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++;
+               tt->rn_twin = t; tt->rn_ybro = rn_clist; rn_clist = tt;
+#endif
+               tt->rn_key = (caddr_t) v;
+               tt->rn_bit = -1;
+               tt->rn_flags = RNF_ACTIVE;
+       }
+       /*
+        * Put mask in tree.
+        */
+       if (netmask) {
+               tt->rn_mask = netmask;
+               tt->rn_bit = x->rn_bit;
+               tt->rn_flags |= x->rn_flags & RNF_NORMAL;
+       }
+       t = saved_tt->rn_parent;
+       if (keyduplicated)
+               goto on2;
+       b_leaf = -1 - t->rn_bit;
+       if (t->rn_right == saved_tt)
+               x = t->rn_left;
+       else
+               x = t->rn_right;
+       /* Promote general routes from below */
+       if (x->rn_bit < 0) {
+           for (mp = &t->rn_mklist; x; x = x->rn_dupedkey)
+               if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) {
+                       *mp = m = rn_new_radix_mask(x, 0);
+                       if (m)
+                               mp = &m->rm_mklist;
+               }
+       } else if (x->rn_mklist) {
+               /*
+                * Skip over masks whose index is > that of new node
+                */
+               for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist)
+                       if (m->rm_bit >= b_leaf)
+                               break;
+               t->rn_mklist = m; *mp = 0;
+       }
+on2:
+       /* Add new route to highest possible ancestor's list */
+       if ((netmask == 0) || (b > t->rn_bit ))
+               return tt; /* can't lift at all */
+       b_leaf = tt->rn_bit;
+       do {
+               x = t;
+               t = t->rn_parent;
+       } while (b <= t->rn_bit && x != top);
+       /*
+        * Search through routes associated with node to
+        * insert new route according to index.
+        * Need same criteria as when sorting dupedkeys to avoid
+        * double loop on deletion.
+        */
+       for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) {
+               if (m->rm_bit < b_leaf)
+                       continue;
+               if (m->rm_bit > b_leaf)
+                       break;
+               if (m->rm_flags & RNF_NORMAL) {
+                       mmask = m->rm_leaf->rn_mask;
+                       if (tt->rn_flags & RNF_NORMAL) {
+                           log(LOG_ERR,
+                               "Non-unique normal route, mask not entered\n");
+                               return tt;
+                       }
+               } else
+                       mmask = m->rm_mask;
+               if (mmask == netmask) {
+                       m->rm_refs++;
+                       tt->rn_mklist = m;
+                       return tt;
+               }
+               if (rn_refines(netmask, mmask)
+                   || rn_lexobetter(netmask, mmask))
+                       break;
+       }
+       *mp = rn_new_radix_mask(tt, *mp);
+       return tt;
+}
+
+struct radix_node *
+rn_delete(v_arg, netmask_arg, head)
+       void *v_arg, *netmask_arg;
+       struct radix_node_head *head;
+{
+       register struct radix_node *t, *p, *x, *tt;
+       struct radix_mask *m, *saved_m, **mp;
+       struct radix_node *dupedkey, *saved_tt, *top;
+       caddr_t v, netmask;
+       int b, head_off, vlen;
+
+       v = v_arg;
+       netmask = netmask_arg;
+       x = head->rnh_treetop;
+       tt = rn_search(v, x);
+       head_off = x->rn_offset;
+       vlen =  LEN(v);
+       saved_tt = tt;
+       top = x;
+       if (tt == 0 ||
+           bcmp(v + head_off, tt->rn_key + head_off, vlen - head_off))
+               return (0);
+       /*
+        * Delete our route from mask lists.
+        */
+       if (netmask) {
+               if ((x = rn_addmask(netmask, 1, head_off)) == 0)
+                       return (0);
+               netmask = x->rn_key;
+               while (tt->rn_mask != netmask)
+                       if ((tt = tt->rn_dupedkey) == 0)
+                               return (0);
+       }
+       if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0)
+               goto on1;
+       if (tt->rn_flags & RNF_NORMAL) {
+               if (m->rm_leaf != tt || m->rm_refs > 0) {
+                       log(LOG_ERR, "rn_delete: inconsistent annotation\n");
+                       return 0;  /* dangling ref could cause disaster */
+               }
+       } else {
+               if (m->rm_mask != tt->rn_mask) {
+                       log(LOG_ERR, "rn_delete: inconsistent annotation\n");
+                       goto on1;
+               }
+               if (--m->rm_refs >= 0)
+                       goto on1;
+       }
+       b = -1 - tt->rn_bit;
+       t = saved_tt->rn_parent;
+       if (b > t->rn_bit)
+               goto on1; /* Wasn't lifted at all */
+       do {
+               x = t;
+               t = t->rn_parent;
+       } while (b <= t->rn_bit && x != top);
+       for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist)
+               if (m == saved_m) {
+                       *mp = m->rm_mklist;
+                       MKFree(m);
+                       break;
+               }
+       if (m == 0) {
+               log(LOG_ERR, "rn_delete: couldn't find our annotation\n");
+               if (tt->rn_flags & RNF_NORMAL)
+                       return (0); /* Dangling ref to us */
+       }
+on1:
+       /*
+        * Eliminate us from tree
+        */
+       if (tt->rn_flags & RNF_ROOT)
+               return (0);
+#ifdef RN_DEBUG
+       /* Get us out of the creation list */
+       for (t = rn_clist; t && t->rn_ybro != tt; t = t->rn_ybro) {}
+       if (t) t->rn_ybro = tt->rn_ybro;
+#endif
+       t = tt->rn_parent;
+       dupedkey = saved_tt->rn_dupedkey;
+       if (dupedkey) {
+               /*
+                * Here, tt is the deletion target and
+                * saved_tt is the head of the dupekey chain.
+                */
+               if (tt == saved_tt) {
+                       /* remove from head of chain */
+                       x = dupedkey; x->rn_parent = t;
+                       if (t->rn_left == tt)
+                               t->rn_left = x;
+                       else
+                               t->rn_right = x;
+               } else {
+                       /* find node in front of tt on the chain */
+                       for (x = p = saved_tt; p && p->rn_dupedkey != tt;)
+                               p = p->rn_dupedkey;
+                       if (p) {
+                               p->rn_dupedkey = tt->rn_dupedkey;
+                               if (tt->rn_dupedkey)            /* parent */
+                                       tt->rn_dupedkey->rn_parent = p;
+                                                               /* parent */
+                       } else log(LOG_ERR, "rn_delete: couldn't find us\n");
+               }
+               t = tt + 1;
+               if  (t->rn_flags & RNF_ACTIVE) {
+#ifndef RN_DEBUG
+                       *++x = *t;
+                       p = t->rn_parent;
+#else
+                       b = t->rn_info;
+                       *++x = *t;
+                       t->rn_info = b;
+                       p = t->rn_parent;
+#endif
+                       if (p->rn_left == t)
+                               p->rn_left = x;
+                       else
+                               p->rn_right = x;
+                       x->rn_left->rn_parent = x;
+                       x->rn_right->rn_parent = x;
+               }
+               goto out;
+       }
+       if (t->rn_left == tt)
+               x = t->rn_right;
+       else
+               x = t->rn_left;
+       p = t->rn_parent;
+       if (p->rn_right == t)
+               p->rn_right = x;
+       else
+               p->rn_left = x;
+       x->rn_parent = p;
+       /*
+        * Demote routes attached to us.
+        */
+       if (t->rn_mklist) {
+               if (x->rn_bit >= 0) {
+                       for (mp = &x->rn_mklist; (m = *mp);)
+                               mp = &m->rm_mklist;
+                       *mp = t->rn_mklist;
+               } else {
+                       /* If there are any key,mask pairs in a sibling
+                          duped-key chain, some subset will appear sorted
+                          in the same order attached to our mklist */
+                       for (m = t->rn_mklist; m && x; x = x->rn_dupedkey)
+                               if (m == x->rn_mklist) {
+                                       struct radix_mask *mm = m->rm_mklist;
+                                       x->rn_mklist = 0;
+                                       if (--(m->rm_refs) < 0)
+                                               MKFree(m);
+                                       m = mm;
+                               }
+                       if (m)
+                               log(LOG_ERR,
+                                   "rn_delete: Orphaned Mask %p at %p\n",
+                                   (void *)m, (void *)x);
+               }
+       }
+       /*
+        * We may be holding an active internal node in the tree.
+        */
+       x = tt + 1;
+       if (t != x) {
+#ifndef RN_DEBUG
+               *t = *x;
+#else
+               b = t->rn_info;
+               *t = *x;
+               t->rn_info = b;
+#endif
+               t->rn_left->rn_parent = t;
+               t->rn_right->rn_parent = t;
+               p = x->rn_parent;
+               if (p->rn_left == x)
+                       p->rn_left = t;
+               else
+                       p->rn_right = t;
+       }
+out:
+       tt->rn_flags &= ~RNF_ACTIVE;
+       tt[1].rn_flags &= ~RNF_ACTIVE;
+       return (tt);
+}
+
+/*
+ * This is the same as rn_walktree() except for the parameters and the
+ * exit.
+ */
+static int
+rn_walktree_from(h, a, m, f, w)
+       struct radix_node_head *h;
+       void *a, *m;
+       walktree_f_t *f;
+       void *w;
+{
+       int error;
+       struct radix_node *base, *next;
+       u_char *xa = (u_char *)a;
+       u_char *xm = (u_char *)m;
+       register struct radix_node *rn, *last = 0 /* shut up gcc */;
+       int stopping = 0;
+       int lastb;
+
+       /*
+        * rn_search_m is sort-of-open-coded here. We cannot use the
+        * function because we need to keep track of the last node seen.
+        */
+       /* printf("about to search\n"); */
+       for (rn = h->rnh_treetop; rn->rn_bit >= 0; ) {
+               last = rn;
+               /* printf("rn_bit %d, rn_bmask %x, xm[rn_offset] %x\n",
+                      rn->rn_bit, rn->rn_bmask, xm[rn->rn_offset]); */
+               if (!(rn->rn_bmask & xm[rn->rn_offset])) {
+                       break;
+               }
+               if (rn->rn_bmask & xa[rn->rn_offset]) {
+                       rn = rn->rn_right;
+               } else {
+                       rn = rn->rn_left;
+               }
+       }
+       /* printf("done searching\n"); */
+
+       /*
+        * Two cases: either we stepped off the end of our mask,
+        * in which case last == rn, or we reached a leaf, in which
+        * case we want to start from the last node we looked at.
+        * Either way, last is the node we want to start from.
+        */
+       rn = last;
+       lastb = rn->rn_bit;
+
+       /* printf("rn %p, lastb %d\n", rn, lastb);*/
+
+       /*
+        * This gets complicated because we may delete the node
+        * while applying the function f to it, so we need to calculate
+        * the successor node in advance.
+        */
+       while (rn->rn_bit >= 0)
+               rn = rn->rn_left;
+
+       while (!stopping) {
+               /* printf("node %p (%d)\n", rn, rn->rn_bit); */
+               base = rn;
+               /* If at right child go back up, otherwise, go right */
+               while (rn->rn_parent->rn_right == rn
+                      && !(rn->rn_flags & RNF_ROOT)) {
+                       rn = rn->rn_parent;
+
+                       /* if went up beyond last, stop */
+                       if (rn->rn_bit <= lastb) {
+                               stopping = 1;
+                               /* printf("up too far\n"); */
+                               /*
+                                * XXX we should jump to the 'Process leaves'
+                                * part, because the values of 'rn' and 'next'
+                                * we compute will not be used. Not a big deal
+                                * because this loop will terminate, but it is
+                                * inefficient and hard to understand!
+                                */
+                       }
+               }
+               
+               /* 
+                * At the top of the tree, no need to traverse the right
+                * half, prevent the traversal of the entire tree in the
+                * case of default route.
+                */
+               if (rn->rn_parent->rn_flags & RNF_ROOT)
+                       stopping = 1;
+
+               /* Find the next *leaf* since next node might vanish, too */
+               for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;)
+                       rn = rn->rn_left;
+               next = rn;
+               /* Process leaves */
+               while ((rn = base) != 0) {
+                       base = rn->rn_dupedkey;
+                       /* printf("leaf %p\n", rn); */
+                       if (!(rn->rn_flags & RNF_ROOT)
+                           && (error = (*f)(rn, w)))
+                               return (error);
+               }
+               rn = next;
+
+               if (rn->rn_flags & RNF_ROOT) {
+                       /* printf("root, stopping"); */
+                       stopping = 1;
+               }
+
+       }
+       return 0;
+}
+
+static int
+rn_walktree(h, f, w)
+       struct radix_node_head *h;
+       walktree_f_t *f;
+       void *w;
+{
+       int error;
+       struct radix_node *base, *next;
+       register struct radix_node *rn = h->rnh_treetop;
+       /*
+        * This gets complicated because we may delete the node
+        * while applying the function f to it, so we need to calculate
+        * the successor node in advance.
+        */
+
+       /* First time through node, go left */
+       while (rn->rn_bit >= 0)
+               rn = rn->rn_left;
+       for (;;) {
+               base = rn;
+               /* If at right child go back up, otherwise, go right */
+               while (rn->rn_parent->rn_right == rn
+                      && (rn->rn_flags & RNF_ROOT) == 0)
+                       rn = rn->rn_parent;
+               /* Find the next *leaf* since next node might vanish, too */
+               for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;)
+                       rn = rn->rn_left;
+               next = rn;
+               /* Process leaves */
+               while ((rn = base)) {
+                       base = rn->rn_dupedkey;
+                       if (!(rn->rn_flags & RNF_ROOT)
+                           && (error = (*f)(rn, w)))
+                               return (error);
+               }
+               rn = next;
+               if (rn->rn_flags & RNF_ROOT)
+                       return (0);
+       }
+       /* NOTREACHED */
+}
+
+/*
+ * Allocate and initialize an empty tree. This has 3 nodes, which are
+ * part of the radix_node_head (in the order <left,root,right>) and are
+ * marked RNF_ROOT so they cannot be freed.
+ * The leaves have all-zero and all-one keys, with significant
+ * bits starting at 'off'.
+ * Return 1 on success, 0 on error.
+ */
+int
+rn_inithead(head, off)
+       void **head;
+       int off;
+{
+       register struct radix_node_head *rnh;
+       register struct radix_node *t, *tt, *ttt;
+       if (*head)
+               return (1);
+       R_Zalloc(rnh, struct radix_node_head *, sizeof (*rnh));
+       if (rnh == 0)
+               return (0);
+#ifdef _KERNEL
+       RADIX_NODE_HEAD_LOCK_INIT(rnh);
+#endif
+       *head = rnh;
+       t = rn_newpair(rn_zeros, off, rnh->rnh_nodes);
+       ttt = rnh->rnh_nodes + 2;
+       t->rn_right = ttt;
+       t->rn_parent = t;
+       tt = t->rn_left;        /* ... which in turn is rnh->rnh_nodes */
+       tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE;
+       tt->rn_bit = -1 - off;
+       *ttt = *tt;
+       ttt->rn_key = rn_ones;
+       rnh->rnh_addaddr = rn_addroute;
+       rnh->rnh_deladdr = rn_delete;
+       rnh->rnh_matchaddr = rn_match;
+       rnh->rnh_lookup = rn_lookup;
+       rnh->rnh_walktree = rn_walktree;
+       rnh->rnh_walktree_from = rn_walktree_from;
+       rnh->rnh_treetop = t;
+       return (1);
+}
+
+void
+rn_init(int maxk)
+{
+       char *cp, *cplim;
+
+       max_keylen = maxk;
+       if (max_keylen == 0) {
+               log(LOG_ERR,
+                   "rn_init: radix functions require max_keylen be set\n");
+               return;
+       }
+       R_Malloc(rn_zeros, char *, 3 * max_keylen);
+       if (rn_zeros == NULL)
+               panic("rn_init");
+       bzero(rn_zeros, 3 * max_keylen);
+       rn_ones = cp = rn_zeros + max_keylen;
+       addmask_key = cplim = rn_ones + max_keylen;
+       while (cp < cplim)
+               *cp++ = -1;
+       if (rn_inithead((void **)(void *)&mask_rnhead, 0) == 0)
+               panic("rn_init 2");
+}
diff --git a/glue.h b/glue.h
index 1f8aa62..de0ab23 100644 (file)
--- a/glue.h
+++ b/glue.h
@@ -23,7 +23,7 @@
  * SUCH DAMAGE.
  */
 /*
- * $Id: glue.h 4436 2009-12-10 18:31:49Z luigi $
+ * $Id: glue.h 4661 2010-01-04 11:56:12Z luigi $
  *
  * glue code to adapt the FreeBSD version to linux and windows,
  * userland and kernel.
@@ -241,6 +241,11 @@ int
 sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp,
          size_t newlen);
  
+#ifdef __linux__
+/* linux does not have sin_len in sockaddr, we only remap in userland */
+#define        sin_len sin_zero[0]
+#endif /* __linux__ */
+
 #else /* KERNEL_MODULE */
 
 /* linux and windows kernel do not have bcopy ? */
@@ -250,6 +255,11 @@ sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp,
 #include <linux/in6.h>
 #endif
 
+/* skb_dst() was introduced from linux 2.6.31 */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31)        // or 2.4.x
+#define skb_dst(_dummy) skb->dst
+#endif
+
 /* definitions useful for the kernel side */
 
 struct route_in6 { };
@@ -260,10 +270,6 @@ struct route_in6 { };
 
 #define INET_ADDRSTRLEN                16
 
-#ifdef linux
-/* linux does not have sin_len in sockaddr */
-#define        sin_len sin_zero[0]
-#endif /* linux */
 
 /*
  * List of values used for set/getsockopt options.
index 807f2d1..7b4a272 100644 (file)
@@ -9,7 +9,7 @@ $(warning Building userland ipfw for $(VER))
 EXTRA_CFLAGS += -O1
 EXTRA_CFLAGS += -Wall -Werror
 EXTRA_CFLAGS += -include ../glue.h
-EXTRA_CFLAGS += -I ./include
+EXTRA_CFLAGS += -I ./include_e -I ./include
 
 ifneq ($(VER),openwrt)
 OSARCH := $(shell uname)
@@ -41,7 +41,21 @@ all: ipfw
 ipfw: $(OBJS)
        $(CC) $(LDFLAGS) -o $@ $^
 
-$(OBJS) : ipfw2.h ../glue.h include/netinet
+$(OBJS) : ipfw2.h ../glue.h include/netinet include_e
+
+# support to create empty dirs and files in include_e/
+# EDIRS is the list of directories, EFILES is the list of files.
+EDIRS   = sys
+
+EFILES  = sys/sockio.h libutil.h
+M ?= $(shell pwd)
+
+include_e:
+       echo "running in $M"
+       -@rm -rf $(M)/include_e opt_*
+       -@mkdir -p $(M)/include_e
+       -@(cd $(M)/include_e; mkdir -p $(EDIRS); touch $(EFILES) )
 
 include/netinet:
        -@rm -rf include/netinet
index 6cfbff0..c50962d 100644 (file)
@@ -33,6 +33,7 @@
 #include <ctype.h>
 #include <err.h>
 #include <errno.h>
+#include <libutil.h>
 #include <netdb.h>
 #include <stdio.h>
 #include <stdlib.h>
index 19ea71e..fb3d5c3 100644 (file)
@@ -24,7 +24,7 @@
  */
 
 /*
- * $Id: glue.c 4469 2009-12-11 20:23:11Z marta $
+ * $Id: glue.c 4540 2009-12-16 17:22:47Z marta $
  *
  * Userland functions missing in linux
  */
diff --git a/ipfw/include_e/libutil.h b/ipfw/include_e/libutil.h
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ipfw/include_e/sys/sockio.h b/ipfw/include_e/sys/sockio.h
new file mode 100644 (file)
index 0000000..e69de29
index 5d70328..85979f8 100644 (file)
@@ -224,11 +224,14 @@ static struct _s_x rule_action_params[] = {
        { NULL, 0 }     /* terminator */
 };
 
-/* index of 'lookup ... ' keys in the kernel */
+/*
+ * The 'lookup' instruction accepts one of the following arguments.
+ * -1 is a terminator for the list.
+ * Arguments are passed as v[1] in O_DST_LOOKUP options.
+ */
 static int lookup_key[] = {
        TOK_DSTIP, TOK_SRCIP, TOK_DSTPORT, TOK_SRCPORT,
-       TOK_UID, TOK_GID, TOK_JAIL,
-       TOK_PROTO, TOK_MACTYPE, 0, };
+       TOK_UID, TOK_JAIL, -1 };
 
 static struct _s_x rule_options[] = {
        { "tagged",             TOK_TAGGED },
@@ -756,8 +759,8 @@ print_ip(ipfw_insn_ip *cmd, char const *s)
 
                if (d < sizeof(lookup_key)/sizeof(lookup_key[0]))
                        arg = match_value(rule_options, lookup_key[d]);
-               printf("%s lookup %s %d,%d", cmd->o.len & F_NOT ? " not": "",
-                       arg, cmd->o.arg1, a[0]);
+               printf("%s lookup %s %d", cmd->o.len & F_NOT ? " not": "",
+                       arg, cmd->o.arg1);
                return;
        }
        printf("%s%s ", cmd->o.len & F_NOT ? " not": "", s);
@@ -3518,26 +3521,21 @@ read_options:
                        int j;
 
                        if (ac < 2)
-                               errx(EX_USAGE, "format: lookup argument tablenum[,arg]");
+                               errx(EX_USAGE, "format: lookup argument tablenum");
                        cmd->opcode = O_IP_DST_LOOKUP;
                        cmd->len |= F_INSN_SIZE(ipfw_insn) + 2;
                        i = match_token(rule_options, *av);
-                       for (j = 0; lookup_key[j] ; j++) {
+                       for (j = 0; lookup_key[j] >= 0 ; j++) {
                                if (i == lookup_key[j])
                                        break;
                        }
-                       if (lookup_key[j] == 0)
+                       if (lookup_key[j] <= 0)
                                errx(EX_USAGE, "format: cannot lookup on %s", *av);
                        c->d[1] = j; // i converted to option
                        ac--; av++;
-                       p = strchr(*av, ',');
-                       if (p) {
-                               *p++ = '\0';
-                               c->d[0] = strtoul(p, NULL, 0);
-                       } else {
-                               c->d[0] = ~0;
-                       }
-                       cmd->arg1 = strtoul(*av, NULL, 0);
+                       cmd->arg1 = strtoul(*av, &p, 0);
+                       if (p && *p)
+                               errx(EX_USAGE, "format: lookup argument tablenum");
                        ac--; av++;
                    }
                        break;
index 0e5e696..68373b9 100644 (file)
@@ -5,7 +5,7 @@
 # restart crond
 # modprobe ipfw_mod.ko (depmod ?)
 #
-%define url $URL:$
+%define url $URL$
 
 # Marta Carbone <marta.carbone@iet.unipi.it>
 # 2009 - Universita` di Pisa
@@ -32,6 +32,7 @@ Group: System Environment/Kernel
 Source0: %{name}-%{version}.tar.bz2
 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-buildroot
 Requires: vixie-cron
+Requires: vsys-scripts
 
 Vendor: unipi
 Packager: PlanetLab <marta@onelab2.iet.unipi.it>
@@ -56,7 +57,7 @@ rm -rf $RPM_BUILD_ROOT
 %__make KERNELPATH=%kernelpath IPFW_PLANETLAB=1
 
 %install
-install -D -m 755 dummynet/ipfw_mod.ko $RPM_BUILD_ROOT/lib/modules/%{kernel_id}/net/netfilter/ipfw_mod.ko
+install -D -m 755 dummynet2/ipfw_mod.ko $RPM_BUILD_ROOT/lib/modules/%{kernel_id}/net/netfilter/ipfw_mod.ko
 install -D -m 755 ipfw/ipfw $RPM_BUILD_ROOT/sbin/ipfw
 install -D -m 755 planetlab/ipfw-cleanup $RPM_BUILD_ROOT/usr/bin/ipfw-cleanup
 install -D -m 644 planetlab/ipfw.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/ipfw.cron
@@ -76,8 +77,15 @@ rm -rf $RPM_BUILD_ROOT
 %postun
 # unload the module if present
 LOADED=`cat /proc/modules | grep ^ipfw_mod`; if [ -n "$LOADED" ] ; then rmmod ipfw_mod; fi
+# clean the old database and initialize the firewall
+echo "super dbcleanup" | /vsys/ipfw-be 0
+echo "super init" | /vsys/ipfw-be 0
 
 %changelog
+* Wed Jan 06 2010 Marta Carbone <marta.carbone@iet.unipi.it>
+- move to dummynet2, added support for table lookup
+- added the vsys-scripts dependency and the ipfw initialization
+
 * Tue Dec 15 2009 Marta Carbone <marta.carbone@iet.unipi.it>
 - more work on the radix code, added sysctl read/write support