initial version, corresponding to ipfw3-2012
authorLuigi Rizzo <rizzo@iet.unipi.it>
Sun, 17 Aug 2014 10:30:20 +0000 (03:30 -0700)
committerLuigi Rizzo <rizzo@iet.unipi.it>
Sun, 17 Aug 2014 10:30:20 +0000 (03:30 -0700)
142 files changed:
020-mips-hz1000.patch [new file with mode: 0644]
Makefile [new file with mode: 0644]
Makefile.inc [new file with mode: 0644]
Makefile.openwrt [new file with mode: 0644]
NOTES [new file with mode: 0644]
README [new file with mode: 0644]
binary/README.txt [new file with mode: 0644]
binary/cygwin1.dll [new file with mode: 0644]
binary/ipfw.exe [new file with mode: 0644]
binary/ipfw.sys [new file with mode: 0644]
binary/netipfw.inf [new file with mode: 0644]
binary/netipfw_m.inf [new file with mode: 0644]
binary/testme.bat [new file with mode: 0644]
binary/wget.exe [new file with mode: 0644]
binary64/ipfw.exe [new file with mode: 0755]
binary64/ipfw.sys [new file with mode: 0755]
configuration/README [new file with mode: 0644]
configuration/change_rules.sh [new file with mode: 0755]
configuration/change_rules_linux.sh [new file with mode: 0755]
configuration/ipfw.conf [new file with mode: 0644]
configuration/ipfw.rules [new file with mode: 0755]
configuration/rc.firewall [new file with mode: 0755]
glue.h [new file with mode: 0644]
ipfw/Makefile [new file with mode: 0644]
ipfw/add_rules [new file with mode: 0755]
ipfw/dummynet.c [new file with mode: 0644]
ipfw/expand_number.c [new file with mode: 0644]
ipfw/glue.c [new file with mode: 0644]
ipfw/humanize_number.c [new file with mode: 0644]
ipfw/include/alias.h [new file with mode: 0644]
ipfw/include/net/if_dl.h [new file with mode: 0644]
ipfw/include/net/pfvar.h [new file with mode: 0644]
ipfw/include/timeconv.h [new file with mode: 0644]
ipfw/ipfw.8 [new file with mode: 0644]
ipfw/ipfw2.c [new file with mode: 0644]
ipfw/ipfw2.h [new file with mode: 0644]
ipfw/ipv6.c [new file with mode: 0644]
ipfw/main.c [new file with mode: 0644]
ipfw/qsort.c [new file with mode: 0644]
ipfw/qsort_r.c [new file with mode: 0644]
ipfw/rule_test.sh [new file with mode: 0755]
ipfw/ws2_32.def [new file with mode: 0644]
kipfw/Makefile [new file with mode: 0644]
kipfw/bsd_compat.c [new file with mode: 0644]
kipfw/debug.c [new file with mode: 0644]
kipfw/ipfw2_mod.c [new file with mode: 0644]
kipfw/md_win.c [new file with mode: 0644]
kipfw/missing.h [new file with mode: 0644]
kipfw/mysetenv.sh [new file with mode: 0644]
kipfw/netipfw.inf [new file with mode: 0644]
kipfw/netipfw_m.inf [new file with mode: 0644]
kipfw/sources [new file with mode: 0644]
kipfw/win-passthru.diff [new file with mode: 0644]
kipfw/winmissing.h [new file with mode: 0644]
kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk [new file with mode: 0644]
modified_passthru/miniport.c [new file with mode: 0644]
modified_passthru/passthru.c [new file with mode: 0644]
modified_passthru/passthru.h [new file with mode: 0644]
modified_passthru/precomp.h [new file with mode: 0644]
modified_passthru/protocol.c [new file with mode: 0644]
original_passthru/makefile [new file with mode: 0644]
original_passthru/miniport.c [new file with mode: 0644]
original_passthru/netsf.inf [new file with mode: 0644]
original_passthru/netsf_m.inf [new file with mode: 0644]
original_passthru/passthru.c [new file with mode: 0644]
original_passthru/passthru.h [new file with mode: 0644]
original_passthru/passthru.htm [new file with mode: 0644]
original_passthru/passthru.rc [new file with mode: 0644]
original_passthru/precomp.h [new file with mode: 0644]
original_passthru/protocol.c [new file with mode: 0644]
original_passthru/sources [new file with mode: 0644]
planetlab/Makefile.planetlab [new file with mode: 0644]
planetlab/check_planetlab_sync [new file with mode: 0755]
planetlab/ipfw [new file with mode: 0755]
planetlab/ipfw.8.gz [new file with mode: 0644]
planetlab/ipfw.cron [new file with mode: 0644]
planetlab/ipfwroot.spec [new file with mode: 0644]
planetlab/ipfwslice.spec [new file with mode: 0644]
planetlab/netconfig [new file with mode: 0755]
planetlab/planetlab-tags.mk [new file with mode: 0644]
planetlab/planetlab.mk [new file with mode: 0644]
planetlab/sample_hook [new file with mode: 0755]
sys/net/if.h [new file with mode: 0644]
sys/net/pfil.h [new file with mode: 0644]
sys/net/radix.c [new file with mode: 0644]
sys/net/radix.h [new file with mode: 0644]
sys/netgraph/ng_ipfw.h [new file with mode: 0644]
sys/netinet/in_cksum.c [new file with mode: 0644]
sys/netinet/ip.h [new file with mode: 0644]
sys/netinet/ip6.h [new file with mode: 0644]
sys/netinet/ip_dummynet.h [new file with mode: 0644]
sys/netinet/ip_fw.h [new file with mode: 0644]
sys/netinet/ip_icmp.h [new file with mode: 0644]
sys/netinet/ipfw/dn_heap.c [new file with mode: 0644]
sys/netinet/ipfw/dn_heap.h [new file with mode: 0644]
sys/netinet/ipfw/dn_sched.h [new file with mode: 0644]
sys/netinet/ipfw/dn_sched_fifo.c [new file with mode: 0644]
sys/netinet/ipfw/dn_sched_prio.c [new file with mode: 0644]
sys/netinet/ipfw/dn_sched_qfq.c [new file with mode: 0644]
sys/netinet/ipfw/dn_sched_rr.c [new file with mode: 0644]
sys/netinet/ipfw/dn_sched_wf2q.c [new file with mode: 0644]
sys/netinet/ipfw/ip_dn_glue.c [new file with mode: 0644]
sys/netinet/ipfw/ip_dn_io.c [new file with mode: 0644]
sys/netinet/ipfw/ip_dn_private.h [new file with mode: 0644]
sys/netinet/ipfw/ip_dummynet.c [new file with mode: 0644]
sys/netinet/ipfw/ip_fw2.c [new file with mode: 0644]
sys/netinet/ipfw/ip_fw_dynamic.c [new file with mode: 0644]
sys/netinet/ipfw/ip_fw_log.c [new file with mode: 0644]
sys/netinet/ipfw/ip_fw_lookup.c [new file with mode: 0644]
sys/netinet/ipfw/ip_fw_nat.c [new file with mode: 0644]
sys/netinet/ipfw/ip_fw_pfil.c [new file with mode: 0644]
sys/netinet/ipfw/ip_fw_private.h [new file with mode: 0644]
sys/netinet/ipfw/ip_fw_sockopt.c [new file with mode: 0644]
sys/netinet/ipfw/ip_fw_table.c [new file with mode: 0644]
sys/netinet/tcp.h [new file with mode: 0644]
sys/netinet/tcp_var.h [new file with mode: 0644]
sys/netinet/udp.h [new file with mode: 0644]
sys/sys/cdefs.h [new file with mode: 0644]
sys/sys/kernel.h [new file with mode: 0644]
sys/sys/malloc.h [new file with mode: 0644]
sys/sys/mbuf.h [new file with mode: 0644]
sys/sys/module.h [new file with mode: 0644]
sys/sys/param.h [new file with mode: 0644]
sys/sys/queue.h [new file with mode: 0644]
sys/sys/syslog.h [new file with mode: 0644]
sys/sys/systm.h [new file with mode: 0644]
sys/sys/taskqueue.h [new file with mode: 0644]
tcc-0.9.25-bsd.zip [new file with mode: 0644]
tcc_glue.h [new file with mode: 0644]
test/Makefile [new file with mode: 0644]
test/basic_ipfw.sh [new file with mode: 0755]
test/dn_test.h [new file with mode: 0644]
test/dynrules.sh [new file with mode: 0644]
test/interpolation.c [new file with mode: 0644]
test/main.c [new file with mode: 0644]
test/memory_leak.sh [new file with mode: 0644]
test/mylist.h [new file with mode: 0644]
test/profile_bench1 [new file with mode: 0644]
test/profile_bench2 [new file with mode: 0644]
test/profile_bench3 [new file with mode: 0644]
test/test_dn_heap.c [new file with mode: 0644]
test/test_dn_sched.c [new file with mode: 0644]

diff --git a/020-mips-hz1000.patch b/020-mips-hz1000.patch
new file mode 100644 (file)
index 0000000..eb54ca2
--- /dev/null
@@ -0,0 +1,11 @@
+--- include/asm-mips/param_orig.h      2010-02-23 12:45:58.000000000 +0100
++++ include/asm-mips/param.h   2010-02-23 12:00:31.000000000 +0100
+@@ -41,7 +41,7 @@
+    counter is increasing.  This value is independent from the external value
+    and can be changed in order to suit the hardware and application
+    requirements.  */
+-#  define HZ 100
++#  define HZ 1000
+ #  define hz_to_std(a) (a)
+ #endif /* Not a DECstation  */
diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..508f1ae
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,149 @@
+# $Id: Makefile 11689 2012-08-12 21:07:34Z luigi $
+#
+# Top level makefile for building ipfw/dummynet (kernel and userspace).
+# You can run it manually or also under the Planetlab build.
+# Planetlab wants also the 'install' target.
+#
+# To build on system with non standard Kernel sources or userland files,
+# you should run this with
+#
+#      make KERNELPATH=/path/to/linux-2.x.y.z USRDIR=/path/to/usr
+#
+# We assume that $(USRDIR) contains include/ and lib/ used to build userland.
+#
+
+include Makefile.inc
+
+DATE ?= $(shell date +%Y%m%d)
+SNAPSHOT_NAME=$(DATE)-ipfw3.tgz
+BINDIST=$(DATE)-dummynet-linux.tgz
+WINDIST=$(DATE)-dummynet-windows.zip
+
+DISTFILES= Makefile Makefile.inc README binary* ipfw kipfw *.h sys
+
+.PHONY: ipfw kipfw
+
+###########################################
+#  windows x86 and x64 specific variables #
+###########################################
+#  DRIVE must be the hard drive letter where DDK is installed
+#  DDKDIR must be the path to the DDK root directory, without drive letter
+#  TARGETOS (x64 only) must be one of the following:
+#  wnet   -> windows server 2003
+#  wlh    -> windows vista and windows server 2008
+#  win7   -> windows 7
+#  future version must be added here
+DRIVE ?= C:
+DDKDIR ?= /WinDDK/7600.16385.1
+DDK = $(DRIVE)$(DDKDIR)
+TARGETOS=win7
+
+export WIN64
+export DDK
+export DRIVE
+export DDKDIR
+
+_all: all
+
+clean distclean:
+       -@(cd ipfw && $(MAKE) $(@) )
+       -@rm -rf kipfw-mod binary64/[A-hj-z]*
+
+all: kipfw ipfw
+       @# -- windows only
+ifeq ($(OSARCH),Windows)       # copy files
+ifeq ($(WIN64),)
+       -@ cp ipfw/ipfw.exe kipfw-mod/$(OBJDIR)/ipfw.sys binary/
+       -@ cp kipfw/*.inf binary/
+else
+       -@ cp binary/* kipfw/*.inf binary64/
+       -@ cp ipfw/ipfw.exe kipfw-mod/objchk_win7_amd64/amd64/ipfw.sys binary64/
+endif  # WIN64
+endif  # Windows
+
+win64:
+       $(MAKE) WIN64=1
+
+# kipfw-src prepares the sources for the kernel part.
+# The windows files (passthru etc.) are modified version of the
+# examples found in the $(DDK)/src/network/ndis/passthru/driver/
+# They can be re-created using the 'ndis-glue' target
+# We need a sed trick to remove carriage returns from the patchfile.
+
+ndis-glue:
+       -@mkdir -p kipfw-mod
+       cp $(DDK)/src/network/ndis/passthru/driver/*.[ch] kipfw-mod
+       cat kipfw/win-passthru.diff | sed "s/$$(printf '\r')//g" | (cd kipfw-mod; patch )
+
+kipfw-src:
+       -@rm -rf kipfw-mod
+       -@mkdir -p kipfw-mod
+       -@cp -Rp kipfw/* kipfw-mod
+       -@cp `find sys -name \*.c` kipfw-mod
+       -@(cd kipfw-mod && $(MAKE) include_e)
+ifeq ($(OSARCH),Windows)
+       make ndis-glue
+endif
+
+snapshot:
+       $(MAKE) distclean
+       (tar cvzhf /tmp/$(SNAPSHOT_NAME) -s':^:ipfw3-2012/:' $(DISTFILES) )
+
+bindist:
+       $(MAKE) clean
+       $(MAKE) all
+       tar cvzf /tmp/$(BINDIST) ipfw/ipfw ipfw/ipfw.8 kipfw-mod/ipfw_mod.ko
+
+windist:
+       $(MAKE) clean
+       -$(MAKE) all
+       -rm /tmp/$(WINDIST)
+       zip -r /tmp/$(WINDIST) binary -x \*.svn\*
+
+
+ipfw:
+       @(cd ipfw && $(MAKE) $(@) )
+
+kipfw: kipfw-src
+ifeq ($(WIN64),)       # linux or windows 32 bit
+       @(cd kipfw-mod && $(MAKE) $(@) )
+else   #--- windows 64 bit, we use build.exe and nmake
+       rm -f kipfw-mod/Makefile
+       mkdir kipfw-mod/tmpbuild                # check mysetenv.sh
+       bash kipfw/mysetenv.sh $(DRIVE) $(DDKDIR) $(TARGETOS)
+endif
+
+openwrt_release:
+       # create a temporary directory
+       $(eval TMPDIR := $(shell mktemp -d -p /tmp/ ipfw3_openwrt_XXXXX))
+       # create the source destination directory
+       $(eval IPFWDIR := ipfw3-$(DATE))
+       $(eval DSTDIR := $(TMPDIR)/$(IPFWDIR))
+       mkdir $(DSTDIR)
+       # copy the package, clean objects and svn info
+       cp -r ./ipfw ./kipfw-mod glue.h Makefile ./configuration README $(DSTDIR)
+       (cd $(DSTDIR); make -s distclean; find . -name .svn | xargs rm -rf)
+       (cd $(TMPDIR); tar czf $(IPFWDIR).tar.gz $(IPFWDIR))
+
+       # create the port files in /tmp/ipfw3-port
+       $(eval PORTDIR := $(TMPDIR)/ipfw3)
+       mkdir -p $(PORTDIR)/patches
+       # generate the Makefile, PKG_VERSION and PKG_MD5SUM
+       md5sum $(DSTDIR).tar.gz | cut -d ' ' -f 1 > $(TMPDIR)/md5sum
+       cat ./OPENWRT/Makefile | \
+               sed s/PKG_VERSION:=/PKG_VERSION:=$(DATE)/ | \
+               sed s/PKG_MD5SUM:=/PKG_MD5SUM:=`cat $(TMPDIR)/md5sum`/ \
+               > $(PORTDIR)/Makefile
+
+       @echo ""
+       @echo "The openwrt port is in $(TMPDIR)/ipfw3-port"
+       @echo "The source file should be copied to the public server:"
+       @echo "scp $(DSTDIR).tar.gz marta@info.iet.unipi.it:~marta/public_html/dummynet"
+       @echo "after this the temporary directory $(TMPDIR) can be removed."
+
+install:
+
+diff:
+       -@(diff -upr $(BSD_HEAD)/sbin/ipfw ipfw)
+       -@(diff -upr $(BSD_HEAD)/sys sys)
+
diff --git a/Makefile.inc b/Makefile.inc
new file mode 100644 (file)
index 0000000..ffa14e9
--- /dev/null
@@ -0,0 +1,23 @@
+# $Id$
+# GNU makefile header for ipfw/kipfw building
+BSD_HEAD ?= ~/FreeBSD/head
+OSARCH := $(shell uname)
+OSARCH := $(findstring $(OSARCH),FreeBSD Linux Darwin)
+ifeq ($(OSARCH),)
+    OSARCH := Windows
+endif
+OBJDIR=mia
+
+KSRC ?= /lib/modules/$(shell uname -r)/build
+ifneq ($V,1) # no echo
+    MSG=@echo
+    HIDE=@
+else
+    MSG=@\#
+    HIDE=
+endif
+
+.c.o:
+       $(MSG) "   CC $<"
+       $(HIDE) $(CC) $(CFLAGS) -c $< -o $@
+
diff --git a/Makefile.openwrt b/Makefile.openwrt
new file mode 100644 (file)
index 0000000..3c7be80
--- /dev/null
@@ -0,0 +1,95 @@
+# Makefile to build the package in openwrt.
+# goes into package/ipfw3/Makefile
+#
+# Edit IPFW_DIR to point to the directory with the sources for ipfw
+
+IPFW_DIR := $(TOPDIR)/../ipfw3
+
+include $(TOPDIR)/rules.mk
+include $(INCLUDE_DIR)/kernel.mk
+
+PKG_NAME:=kmod-ipfw3
+PKG_RELEASE:=1
+
+# MV is undefined
+MV ?= mv
+
+include $(INCLUDE_DIR)/package.mk
+
+#Stuff depending on kernel version
+ifeq ($(KERNEL),2.6)
+
+VERS:=2.6
+IPFW_MOD:=ipfw_mod.ko
+IPFW_SRC_DIR:=M
+
+else
+
+VERS:=openwrt
+CFLAGS_WRT:=-DSYSCTL_NODE -DEMULATE_SYSCTL
+IPFW_MOD:=ipfw_mod.o
+IPFW_SRC_DIR:=SUBDIRS
+
+endif
+
+# Description for the package.
+# The names KernelPackage/ipfw3 must match the arguments to the
+# call $(eval $(call KernelPackage,ipfw3)) used to build it
+
+define KernelPackage/ipfw3
+ SUBMENU:=Other modules
+ TITLE:= IPFW and dummynet
+ # FILES is what makes up the module, both kernel and userland
+ # It must be in the KernelPackage section
+ FILES := $(PKG_BUILD_DIR)/dummynet2/$(IPFW_MOD) $(PKG_BUILD_DIR)/ipfw/ipfw
+ # AUTOLOAD:=$(call AutoLoad,80,ipfw_mod)
+endef
+
+define KernelPackage/ipfw3/description
+ This package contains the ipfw and dummynet module
+endef
+
+# Standard entries for the openwrt builds: Build/Prepare and Build/Compile
+# Remember that commands must start with a tab
+
+# 'prepare' instructions for both kernel and userland
+# We copy the entire subtree, then build include_e/ which
+# contains empty headers used by the kernel sources.
+define Build/Prepare
+  # $(warning Preparing ipfw sources)
+       mkdir -p $(PKG_BUILD_DIR)
+       $(CP) -Rp $(IPFW_DIR)/* $(PKG_BUILD_DIR)/
+       (cd $(PKG_BUILD_DIR)/ipfw && $(MAKE) include_e )
+       (cd $(PKG_BUILD_DIR)/dummynet2 && $(MAKE) include_e )
+endef
+
+define Build/Compile
+       # compile the kernel part for openwrt
+       $(MAKE) -C "$(LINUX_DIR)" \
+               CROSS_COMPILE="$(TARGET_CROSS)" \
+               ARCH="$(LINUX_KARCH)" \
+               $(IPFW_SRC_DIR)="$(PKG_BUILD_DIR)/dummynet2" \
+               VER=$(VERS) modules
+       # compile the userland part for openwrt
+       $(MAKE) -C $(PKG_BUILD_DIR)/ipfw \
+               $(TARGET_CONFIGURE_OPTS) \
+               CFLAGS="$(TARGET_CFLAGS) $(CFLAGS_WRT) -I./include_e -I./include -include ../glue.h -DNO_ALTQ -D__BSD_VISIBLE" \
+               VER=$(VERS) all
+endef
+
+define Package/ipfw3-userland
+  SECTION:=utils
+  CATEGORY:=Utilities
+  TITLE := /sbin/ipfw
+  DESCRIPTION := This is the control program for ipfw and dummynet
+endef
+
+define Package/ipfw3-userland/install
+       $(INSTALL_DIR) $(1) /sbin
+endef
+
+# XXX not entirely clear why the install entry for userland works,
+# given that /sbin/ipfw is in KernelPackage/ipfw3
+
+$(eval $(call Package,ipfw3-userland))
+$(eval $(call KernelPackage,ipfw3))
diff --git a/NOTES b/NOTES
new file mode 100644 (file)
index 0000000..52bb5bf
--- /dev/null
+++ b/NOTES
@@ -0,0 +1,220 @@
+#
+# $Id: NOTES 6552 2010-06-15 11:24:59Z svn_panicucci $
+#
+
+---------------------------------------------------------------------
+---  DEVELOPER NOTES ------------------------------------------------
+
+Both the client and the kernel code use almost unmodified sources
+from FreeBSD (just a very small number of sections #ifdef'ed out
+for features not relevant or not implemented).
+
+In both cases we provide two sets of headers:
+ - one set is made of empty files, automatically generated, to replace
+   FreeBSD headers not available or conflicting on the ported platforms.
+ - one set is made of custom files, sometimes copied verbatim
+   from FreeBSD, sometimes containing only the minimal set of
+   macros/ struct/ prototypes required by the port.
+
+Additionally, we have a small set of .c files providing functions not
+available in the port platforms, and hooks for the sockopt/packet
+data.
+
+
+TODO 20100205:
++ use an appropriate identifier instead of LINUX24
++ find the discharging module hook, in order to force a queue flush
++ better matching on interface names (case insensitive etc ?)
++ match by interface address
++ verify path
++ send keepalives (20100301 marta: implemented)
++ pullup of data in external buffers
++ O_TAG
++ O_DIVERT
++ O_TEE
++ O_SETFIB
++ kmem_cache_alloc 
+
+TODO (OpenWRT) 20090622
++ add a module compilation for 2.6
+
+TODO (FreeBSD, general)
++ New features related to the forthcoming IPv6 are missing, such as IPv6
+support in the lookup tables, which currently handle IPv4 addresses only.
+One of the goals of this project is to add the tables feature to the
+IPv6 protocol.
+
++ The current code implements rules listing requests as a single
+request returning both static and dynamic rules as a whole block. This
+operation requires a lock to be held for the time needed to get the
+full list of rules, regardless of the requested rules.  I propose to
+break up the rule request into two parts, for static and dynamic rules, in
+order to avoid locking the whole struct when only a subset of rules is required.
+
++ Finally, due to improvements and contributions to the code, the tool
+has grown significantly over time with new functionalities and features,
+leaving the general view aside. One possible cleanup is the use of
+dispatch tables instead of some very long switch statements, making the
+resulting code more readable and hopefully faster to execute.
+
++ XXX can't find the ipfw_* indirection...
+
+DETAILED PORTING INFO
+
+--- ipfw (userland) on linux ---
+
+The port is relatively trivial. Communication with the kernel occurs
+through a raw socket using [gs]etsockopt(), and all is needed is the
+availability of ip_fw.h and ip_dummynet.h headers to describe the
+relevant data structures.
+
+--- kernel ipfw on linux ---
+
+Sources are mostly unmodified, except for commenting out
+unsupported features (tables, in-kernel nat...).
+The port requires a rather large number of empty headers.
+Other porting issues are in ipfw2_mod.c
+
+--- build as an Openwrt package
+
+------ WINDOWS PORT ------
+
+We started from the wipfw port available at [WIPFW] , but
+most of the port is done from scratch using the most recent
+version of ipfw+dummynet from HEAD/RELENG_7 as of March 2009
+
+# WIPFW: wipfw.sourceforge.net
+#binary:
+http://downloads.sourceforge.net/wipfw/wipfw-0.3.2b.zip?use_mirror=mesh
+http://downloads.sourceforge.net/wipfw/wipfw-0.2.8-source.zip
+
+--- DEVELOPMENT TOOLS:
+
+At least initially, to build the code you need a pc with
+windows installed and the [WINDDK] from the microsoft site.
+Other tools like the new WDK should work as well.
+
+The 'standard' way used by WDK/WINDDK is to run a 'build'
+script which in turn calls nmake and then the microsoft
+compiler [CL] and linker [LINK]. See the documentation for
+command line switches for these tools, they are similar but
+not the same as the equivalent gcc switches. In particular,
+a / is often used to replace - though both forms are accepted.
+
+The steps to launch the build environment are as follows:
+
+ + download winddk from microsoft.com 
+ + install 
+ + run the Free Build Environment from:
+
+       Start -> All Programs -> WINDDK ->
+       [NT|XP|2000] -> Free Build Environment
+
+ + change dir to .src and type `build' in command line
+
+For our purposes, however, it is much more convenient to use
+cygwin [CYGWIN] and invoke CL and LINK using gmake
+
+A useful debugging tool is DbgView:
+       http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx
+It simply displays the kernel-mode debug output.
+Use the DbgPrint() function, which is similar to printk().
+It can be launched with dbgview.exe.
+
+After a successful compilation and link, you can launch the program
+in user space by simply executing the binary file, while for the kernel
+space you need to perform the following steps:
+
+cp ipfw.sys /cygdrive/c/WINDOWS/system32/drivers/
+ipfw install_drv System32\DRIVERS\ip_fw.sys
+net start ip_fw
+
+
+=======
+--- ARCHITECTURE ---
+
+The main part of the userland program mostly works as the
+unix equivalent, the only issue is to provide empty
+header files to replace those not available in Windows,
+and include the winsock2 headers to access some network
+related functions and headers.
+
+Communication with the kernel module does not use a raw IP socket
+as in the unix version. Instead, we inherit the same method
+used in ipfw -- a replacement for socket() creates a handle
+to access the control structure, and setsockopt/getsockopt
+replacements are also used to communicate with the kernel
+side. This is implemented in win32.c
+
+In order to load the module and activate it, we also use
+the same technique suggested in wipfw -- the main() is
+extended (with a wrapper) so that it can handle additional
+commands to install/control/deinstall the service and
+call the appropriate actions. See svcmain.c for details.
+
+--- PORTING ISSUES:
+
+Most of the unix hierarchy of headers is not available so we
+have to replicate them.
+
+gcc attributes are also not present.
+
+C99 types are not present, remapped in <sys/cdefs.h>
+Also, we don't have C99 initializers which sometimes gives trouble.
+
+--- USEFUL LINKS:
+
+[WIPFW]
+       http://wipfw.sourceforge.net/
+
+[WINDDK]
+       http://www.microsoft.com/whdc/devtools/ddk/default.mspx
+
+[CL]
+       http://msdn.microsoft.com/en-us/library/610ecb4h.aspx
+       command line syntax
+
+[CYGWIN]
+       http://www.cygwin.com/setup.exe
+Windows Driver Kit
+http://www.microsoft.com/whdc/DevTools/WDK/WDKpkg.mspx
+
+Debug Symbols for WinXP SP3
+http://www.microsoft.com/whdc/devtools/debugging/symbolpkg.mspx#d
+
+DbgView
+http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx
+
+Cygwin
+http://www.cygwin.com/
+(install the default packages + the devel category)
+
+Winrar (the WDK is distributed as an .iso file)
+http://www.rarlab.com/download.htm
+
+puttycyg (terminal for cygwin)
+http://code.google.com/p/puttycyg/
+
+Tortoise SVN
+http://tortoisesvn.net/downloads
+
+EditPlus
+http://www.editplus.com/
+
+---------------------------------------------------------------------
+--- OPEN ISSUES/TODO ------------------------------------------------
+
+- Fix the build on OpenWRT for linux 2.6
+  [Forum: https://forum.openwrt.org/viewtopic.php?id=24990]
+- Compilation on 2.6 OpenWRT (target is MIPS Atheros 71xx) gives compilation
+  errors; [Send updates to: https://forum.openwrt.org/viewtopic.php?id=24990]
+- Windows stack corruption [a tricky bug in dummynet]
+- Windows ipv6 port [RE: Windows port of ipv6 in ipfw+dummynet]
+
+NOTE:
+- To allow compilation on OpenWRT with kernel 2.6 only the Makefile.openwrt
+  is modified to guess the kernel version (2.4/2.6)
+- ipfw3 Makefile is not modified.
+- Also compiles on big-endian, but not tested yet...
+- Little changes in source code.
+
diff --git a/README b/README
new file mode 100644 (file)
index 0000000..9791ea1
--- /dev/null
+++ b/README
@@ -0,0 +1,275 @@
+#
+# $Id: README 11691 2012-08-12 21:32:37Z luigi $
+#
+
+This directory contains a port of ipfw and dummynet to Linux and Windows.
+This version of ipfw and dummynet is called "ipfw3" as it is the
+third major rewrite of the code.  The source code here comes straight
+from FreeBSD (roughly the version in HEAD as of February 2010),
+plus some glue code and headers written from scratch.  Unless
+specified otherwise, all the code here is under a BSD license.
+
+Specific build instructions are below, and in general produce
+
+       a kernel module,        ipfw_mod.ko (ipfw.sys on windows)
+       a userland program,     /sbin/ipfw (ipfw.exe on windows)
+
+which you need to install on your system.
+
+CREDITS:
+    Luigi Rizzo (main design and development)
+    Marta Carbone (Linux and Planetlab ports)
+    Riccardo Panicucci (modular scheduler support)
+    Francesco Magno (Windows port)
+    Fabio Checconi (the QFQ scheduler)
+    Funding from Universita` di Pisa (NETOS project),
+       European Commission (ONELAB2 project)
+       ACM SIGCOMM (Sigcomm Community Projects Award, April 2012)
+    
+------ INSTALL/REMOVE INSTRUCTIONS ------
+
+Linux
+    INSTALL:
+       # Do the following as root
+       insmod ./kipfw-mod/ipfw_mod.ko
+       cp ipfw/ipfw /usr/local/sbin
+    REMOVE:
+       rmmod ipfw_mod.ko
+
+OpenWRT
+    INSTALL:   # use the correct name for your system
+       opkg install  kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk #install
+       ls -l /lib/modules/2.4.35.4/ipfw*           # check
+       insmod /lib/modules/2.4.35.4/ipfw_mod.o     # load the module
+       /lib/modules/2.4.35.4/ipfw show             # launch the userspace tool
+    REMOVE:
+       rmmod ipfw_mod.o                            # remove the module
+
+Windows:
+    A pre-built version is in binary/ and binary64/ directories.
+
+    INSTALL THE NDIS DRIVER
+       - open the configuration panel for the network card in use
+         (right click on the icon on the SYSTRAY, or go to
+         Control Panel -> Network and select one card)
+       - click on Properties->Install->Service->Add
+       - click on 'Driver Disk' and select 'netipfw.inf' in this folder
+       - select 'ipfw+dummynet' which is the only service you should see
+       - click accept on the warnings for the installation of an unsigned
+         driver (roughly twice per existing network card)
+
+       Now you are ready to use the emulator. To configure it, open a 'cmd'
+       window (REMEMBER to run it as Administrator)
+       and you can use the ipfw command from the command line.
+       Otherwise click on the 'TESTME.bat' which is a batch program that
+       runs various tests.
+       REMEMBER: you need to run ipfw as administrator.
+
+    REMOVE:
+       - select a network card as above.
+       - click on Properties
+       - select 'ipfw+dummynet'
+       - click on 'Remove'
+
+
+------ BUILD INSTRUCTIONS ------
+
++ Windows 32 bit and 64 bit (XP, Windows7)
+
+    To build your own version of the package you need:
+       - cygwin, http://www.cygwin.com/ with base packages, make,
+         c compiler, possibly an editor and subversion.
+         This is used to build the userspace control program, ipfw.exe
+
+       - Microsoft Windows Driver Kit Version 7.1.0, available from
+           http://www.microsoft.com/en-us/download/details.aspx?id=11800
+           (ISO image, GRMWDK_EN_7600_1.ISO)
+         This is used to build the kernel module.
+
+       - optionally, DbgView if you want to see diagnostics coming from
+         the kernel module. You can find it at
+
+           http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx
+
+    Check the Makefile in the root directory to make sure that the WDK is
+    installed in the place indicated by DRIVE and DDKDIR variables
+    (otherwise pass the correct values to the Makefile).
+    Open a shell from cygwin, move to this directory, and run "make" for
+    the 32-bit version, "make win64" for the 64 bit version.
+    This will produce in the binary/ or binary64/ directory the
+    following files:
+       ipfw.exe (you also need cygwin1.dll)
+       ipfw.sys (an NDIS intermediate filter driver)
+       netipfw.inf and netipfw_m.inf (installer files)
+
+    Cross compilation of the userland side under FreeBSD is possible with
+       gmake TCC=`pwd`/tcc-0.9.25-bsd/win32 CC=`pwd`/tcc-0.9.25-bsd/win32/bin/wintcc
+    (wintcc is a custom version of tcc which produces Windows code)
+
+    NOTE: the 64-bit version is compiled as a 32-bit executable for userspace,
+       with appropriate changes to produce 64-bit pointers.
+       The kernel module is built using the MSC 'build' utility instead
+       of 'make'. THE MODULE IS NOT SIGNED.
+    IMPORTANT: Windows 64-bit will not load unsigned kernel modules unless
+       you boot with 'F8' and disable checks for signed modules.
+
+***** Linux 2.6 and above ******
+
+       make [KSRC=/path/to/linux USRDIR=/path/to/usr]
+
+    where the two variables are optional and point to the linux kernel
+    sources and the /usr directory. Defaults are USRDIR=/usr and
+    KSRC=/lib/modules/`uname -r`/build         --- XXX check ?
+
+    NOTE: make sure CONFIG_NETFILTER is enabled in the kernel
+    configuration file. You need the ncurses devel library,
+    that can be installed according to your distro with:
+       apt-get install ncurses-dev     # for debian based distro
+       yum -y install ncurses-devel    # for fedora based distro
+    You can enable CONFIG_NETFILTER by doing:
+    
+       "(cd ${KSRC}; make menuconfig)"
+
+    and enabling the option listed below:
+
+        Networking --->
+           Networking options  --->
+              [*] Network packet filtering framework (Netfilter)
+
+    If you have not yet compiled your kernel source, you need to
+    prepare the build environment:
+
+       (cd $(KSRC); make oldconfig; make prepare; make scripts)
+
+***** Linux 2.4.x *****
+
+    Almost as above, with an additional VER=2.4
+
+       make VER=2.4 KSRC=...
+
+    For 2.4, if KSRC is not specified then we use
+       KSRC ?= /usr/src/`uname -r`/build
+
+    You need to follow the same instructions as for the 2.6 kernel, enabling
+    netfilter in the kernel options:
+
+    Networking options  --->
+      [*] Network packet filtering (replaces ipchains)
+
+***** Openwrt package *****
+
+    (Tested with kamikaze_8.09.1 and Linux 2.4)
+
+    + Download and extract the OpenWrt package, e.g.
+
+       wget http://downloads.openwrt.org/kamikaze/8.09.1/kamikaze_8.09.1_source.tar.bz2
+       tar xvjf kamikaze_8.09.1_source.tar.bz2
+
+    + move to the directory with the OpenWrt sources (the one that
+      contains Config.in, rules.mk ...)
+
+       cd kamikaze_8.09.1
+
+    + Optional: Add support for 1ms resolution.
+
+       By default OpenWRT kernel is compiled with HZ=100; this implies
+        that all timeouts are rounded to 10ms, too coarse for dummynet.
+        The file 020-mips-hz1000.patch contains a kernel patch to build
+       a kernel with HZ=1000 (i.e. 1ms resolution) as in Linux/FreeBSD.
+        To apply this patch, go in the kernel source directory and
+        patch the kernel
+
+               cd build_dir/linux-brcm-2.4/linux-2.4.35.4
+               cat $IPFW3_SOURCES/020-mips-hz1000.patch | patch -p0
+
+       where IPFW3_SOURCES contains the ipfw3 source code.
+       Now, the next kernel recompilation will use the right HZ value
+
+    + Optional: to be sure that the tools are working, make a first
+      build as follows:
+
+       - run "make menuconfig" and set the correct target device,
+         drivers, and so on;
+       - run "make" to do the build
+
+    + Add ipfw3 to the openwrt package, as follows:
+
+      - copy the code from this directory to the place used for the build:
+
+               cp -Rp /path_to_ipfw3 ../ipfw3; 
+
+       If you want, you can fetch a newer version from the web
+       (cd ..; rm -rf ipfw3; \
+       wget http://info.iet.unipi.it/~luigi/dummynet/ipfw3-latest.tgz;\
+       tar xvzf ipfw3-latest.tgz)
+
+      - run the following commands:
+       (mkdir package/ipfw3; \
+       cp ../ipfw3/Makefile.openwrt package/ipfw3/Makefile)
+
+       to create the package/ipfw3 directory in the OpenWrt source
+       directory, and copy Makefile.openwrt to package/ipfw3/Makefile ;
+
+      - if necessary, edit package/ipfw3/Makefile and set IPFW_DIR to point to
+       the directory ipfw3, which contains the sources;
+
+      - run "make menuconfig" and select kmod-ipfw3 as a module <M> in
+           Kernel Modules -> Other modules -> kmod-ipfw3 
+
+      - run "make" to build the package, "make V=99" for verbose build.
+
+      - to modify the code, assuming you are in directory "kamikaze_8.09.1"
+       
+       (cd ../ipfw3 && vi ...the files you are interested in )
+       rm -rf build_dir/linux-brcm-2.4/kmod-ipfw3
+       make package/ipfw3/compile V=99
+
+    The resulting package is located in bin/packages/mipsel/kmod-ipfw3*,
+    upload the file and install on the target system, as follows:
+
+    opkg install  kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk #install
+    ls -l /lib/modules/2.4.35.4/ipfw*           # check
+    insmod /lib/modules/2.4.35.4/ipfw_mod.o     # load the module
+    /lib/modules/2.4.35.4/ipfw show             # launch the userspace tool
+    rmmod ipfw_mod.o                            # remove the module
+
+***** PLANETLAB BUILD (within a slice) *****
+These instructions can be used by PlanetLab developers to compile
+the dummynet module on a node. To install the module on the node
+users need root access in root context.  PlanetLab users who want
+to use the dummynet package should ask PlanetLab support for
+nodes with dummynet emulation capabilities.
+
+    Follow the instructions below. You can just cut&paste
+
+       # install the various tools if not available
+       sudo yum -y install subversion rpm-build rpm-devel m4 redhat-rpm-config make gcc
+       # new build installation requires the gnupg package
+       sudo yum -y install gnupg
+       # the linux kernel and the ipfw source can be fetched by git
+       sudo yum -y install git
+
+       # create and move to a work directory
+       mkdir -p test
+       # extract a planetlab distribution to directory XYZ
+       (cd test; git clone git://git.onelab.eu/build ./XYZ)
+       # download the specfiles and do some patching.
+       # Results are into SPEC/ (takes 5 minutes)
+       (cd test/XYZ; make stage1=true PLDISTRO=onelab)
+       # Building the slice code is fast, the root code takes longer
+       # as it needs to rebuild the whole kernel
+       (cd test/XYZ; sudo make ipfwslice PLDISTRO=onelab)
+       (cd test/XYZ; sudo make ipfwroot PLDISTRO=onelab)
+
+    The kernel dependency phase is a bit time consuming, but does not
+    need to be redone if we are changing the ipfw sources only.
+    To clean up the code do
+       (cd test/XYZ; sudo make ipfwroot-clean ipfwslice-clean)
+    then after you have updated the repository again
+       (cd test/XYZ; sudo make ipfwslice ipfwroot)
+
+--- References
+[1] https://svn.planet-lab.org/wiki/VserverCentos
+[2] http://wiki.linux-vserver.org/Installation_on_CentOS
+[3] http://mirror.centos.org/centos/5/isos/
+[4] More information is in the /build/README* files
diff --git a/binary/README.txt b/binary/README.txt
new file mode 100644 (file)
index 0000000..0212277
--- /dev/null
@@ -0,0 +1,27 @@
+This directory contains the binaries to install and use IPFW and\r
+DUMMYNET on a Windows Machine. The kernel part is an NDIS module,\r
+whereas the user interface is a command line program.\r
+\r
+1. INSTALL THE NDIS DRIVER\r
+\r
+- open the configuration panel for the network card in use\r
+  (either right click on the icon on the SYSTRAY, or go to\r
+  Control Panel -> Network and select one card)\r
+\r
+- click on Properties->Install->Service->Add\r
+- click on 'Driver Disk' and select 'netipfw.inf' in this folder\r
+- select 'ipfw+dummynet' which is the only service you should see\r
+- click accept on the warnings for the installation of an unknown\r
+  driver (roughly twice per existing network card)\r
+\r
+Now you are ready to use the emulator. To configure it, open a 'cmd'\r
+window and run the ipfw command from the command line.\r
+Alternatively, double-click 'testme.bat', a batch program that\r
+runs various tests.\r
+\r
+2. UNINSTALL THE DRIVER\r
+\r
+- select a network card as above.\r
+- click on Properties\r
+- select 'ipfw+dummynet'\r
+- click on 'Remove'\r
diff --git a/binary/cygwin1.dll b/binary/cygwin1.dll
new file mode 100644 (file)
index 0000000..317c51e
Binary files /dev/null and b/binary/cygwin1.dll differ
diff --git a/binary/ipfw.exe b/binary/ipfw.exe
new file mode 100644 (file)
index 0000000..09bdc37
Binary files /dev/null and b/binary/ipfw.exe differ
diff --git a/binary/ipfw.sys b/binary/ipfw.sys
new file mode 100644 (file)
index 0000000..59e855c
Binary files /dev/null and b/binary/ipfw.sys differ
diff --git a/binary/netipfw.inf b/binary/netipfw.inf
new file mode 100644 (file)
index 0000000..7159403
--- /dev/null
@@ -0,0 +1,81 @@
+; version section\r
+[Version]\r
+Signature  = "$Windows NT$"\r
+Class      = NetService\r
+ClassGUID  = {4D36E974-E325-11CE-BFC1-08002BE10318}\r
+Provider   = %Unipi%\r
+DriverVer  = 26/02/2010,3.0.0.1\r
+\r
+; manufacturer section\r
+[Manufacturer]\r
+%Unipi% = UNIPI,NTx86,NTamd64\r
+\r
+; control flags section\r
+; optional, unused in netipfw.inf, used in netipfw_m.inf\r
+[ControlFlags]\r
+\r
+; models section\r
+[UNIPI] ; Win2k\r
+%Desc% = Ipfw.ndi, unipi_ipfw\r
+[UNIPI.NTx86] ;For WinXP and later\r
+%Desc% = Ipfw.ndi, unipi_ipfw\r
+[UNIPI.NTamd64] ;For x64\r
+%Desc% = Ipfw.ndi, unipi_ipfw\r
+\r
+; ddinstall section\r
+[Ipfw.ndi]\r
+AddReg          = Ipfw.ndi.AddReg, Ipfw.AddReg\r
+Characteristics = 0x4410 ;  NCF_NDIS_PROTOCOL | NCF_FILTER | NCF_NO_SERVICE !--Filter Specific--!!\r
+CopyFiles       = Ipfw.Files.Sys\r
+CopyInf         = netipfw_m.inf\r
+\r
+; remove section\r
+[Ipfw.ndi.Remove]\r
+DelFiles = Ipfw.Files.Sys\r
+\r
+;ddinstall.services section\r
+[Ipfw.ndi.Services]\r
+AddService = Ipfw,,Ipfw.AddService\r
+\r
+[Ipfw.AddService]\r
+DisplayName    = %ServiceDesc%\r
+ServiceType    = 1 ;SERVICE_KERNEL_DRIVER\r
+StartType      = 3 ;SERVICE_DEMAND_START\r
+ErrorControl   = 1 ;SERVICE_ERROR_NORMAL\r
+ServiceBinary  = %12%\ipfw.sys\r
+AddReg         = Ipfw.AddService.AddReg\r
+\r
+[Ipfw.AddService.AddReg]\r
+\r
+;file copy related sections\r
+[SourceDisksNames]\r
+1=%DiskDescription%,"",,\r
+\r
+[SourceDisksFiles]\r
+ipfw.sys=1\r
+\r
+[DestinationDirs]\r
+DefaultDestDir = 12\r
+Ipfw.Files.Sys   = 12   ; %windir%\System32\drivers\r
+\r
+; ddinstall->copyfiles points here\r
+[Ipfw.Files.Sys]\r
+ipfw.sys,,,2\r
+\r
+; ddinstall->addreg points here\r
+[Ipfw.ndi.AddReg]\r
+HKR, Ndi,            HelpText,            , %HELP% ; this is displayed at the bottom of the General page of the Connection Properties dialog box\r
+HKR, Ndi,            FilterClass,         , failover\r
+HKR, Ndi,            FilterDeviceInfId,   , unipi_ipfwmp\r
+HKR, Ndi,            Service,             , Ipfw\r
+HKR, Ndi\Interfaces, UpperRange,          , noupper\r
+HKR, Ndi\Interfaces, LowerRange,          , nolower\r
+HKR, Ndi\Interfaces, FilterMediaTypes,    , "ethernet, tokenring, fddi, wan"\r
+\r
+;strings section\r
+[Strings]\r
+Unipi = "Unipi"\r
+DiskDescription = "Ipfw Driver Disk"\r
+Desc = "ipfw+dummynet"\r
+HELP = "This is ipfw and dummynet network emulator, developed by unipi.it"\r
+ServiceDesc = "ipfw service"\r
diff --git a/binary/netipfw_m.inf b/binary/netipfw_m.inf
new file mode 100644 (file)
index 0000000..350e4d1
--- /dev/null
@@ -0,0 +1,56 @@
+; version section\r
+[Version]\r
+Signature  = "$Windows NT$"\r
+Class      = Net\r
+ClassGUID  = {4D36E972-E325-11CE-BFC1-08002BE10318}\r
+Provider   = %Unipi%\r
+DriverVer  = 26/02/2010,3.0.0.1\r
+\r
+; control flags section\r
+; optional, unused in netipfw.inf, used in netipfw_m.inf\r
+[ControlFlags]\r
+ExcludeFromSelect = unipi_ipfwmp\r
+\r
+; destinationdirs section, optional\r
+[DestinationDirs]\r
+DefaultDestDir=12\r
+; No files to copy \r
+\r
+; manufacturer section\r
+[Manufacturer]\r
+%Unipi% = UNIPI,NTx86,NTamd64\r
+\r
+; models section\r
+[UNIPI] ; Win2k\r
+%Desc% = IpfwMP.ndi, unipi_ipfwmp\r
+[UNIPI.NTx86] ;For WinXP and later\r
+%Desc% = IpfwMP.ndi, unipi_ipfwmp\r
+[UNIPI.NTamd64] ;For x64\r
+%Desc% = IpfwMP.ndi, unipi_ipfwmp\r
+\r
+; ddinstall section\r
+[IpfwMP.ndi]\r
+AddReg  = IpfwMP.ndi.AddReg\r
+Characteristics = 0x29 ;NCF_NOT_USER_REMOVABLE | NCF_VIRTUAL | NCF_HIDDEN\r
+\r
+; ddinstall->addreg points here\r
+[IpfwMP.ndi.AddReg]\r
+HKR, Ndi, Service,  0,  IpfwMP\r
+\r
+;ddinstall.services section\r
+[IpfwMP.ndi.Services]\r
+AddService = IpfwMP,0x2, IpfwMP.AddService\r
+\r
+[IpfwMP.AddService]\r
+ServiceType    = 1 ;SERVICE_KERNEL_DRIVER\r
+StartType      = 3 ;SERVICE_DEMAND_START\r
+ErrorControl   = 1 ;SERVICE_ERROR_NORMAL\r
+ServiceBinary  = %12%\ipfw.sys\r
+AddReg         = IpfwMP.AddService.AddReg\r
+\r
+[IpfwMP.AddService.AddReg]\r
+; None\r
+\r
+[Strings]\r
+Unipi = "Unipi"\r
+Desc = "Ipfw Miniport"
\ No newline at end of file
diff --git a/binary/testme.bat b/binary/testme.bat
new file mode 100644 (file)
index 0000000..5b3de00
--- /dev/null
@@ -0,0 +1,79 @@
+@echo on\r
+@set CYGWIN=nodosfilewarning\r
+\r
+@ipfw -q flush\r
+@ipfw -q pipe flush\r
+@echo ######################################################################\r
+@echo ## Setting delay to 100ms for both incoming and outgoing ip packets ##\r
+@echo ## and sending 4 echo request to Google                             ##\r
+@echo ######################################################################\r
+ipfw pipe 3 config delay 100ms\r
+ipfw add pipe 3 ip from any to any\r
+ipfw pipe show\r
+ping -n 4 www.google.it\r
+\r
+@echo ##############################################\r
+@echo ## Raising delay to 300ms and pinging again ##\r
+@echo ##############################################\r
+ipfw pipe 3 config delay 300ms\r
+ipfw pipe show\r
+ping -n 4 www.google.com\r
+\r
+@echo ##################################\r
+@echo ## Shaping bandwidth to 500kbps ##\r
+@echo ##################################\r
+ipfw pipe 3 config bw 500Kbit/s\r
+ipfw pipe show\r
+wget http://info.iet.unipi.it/~luigi/1m\r
+@del 1m\r
+\r
+@echo ###################################\r
+@echo ## Lowering bandwidth to 250kbps ##\r
+@echo ###################################\r
+ipfw pipe 3 config bw 250Kbit/s\r
+ipfw pipe show\r
+wget http://info.iet.unipi.it/~luigi/1m\r
+@del 1m\r
+\r
+@echo ###################################################################\r
+@echo ## Simulating 50 percent packet loss and sending 15 echo request ##\r
+@echo ###################################################################\r
+@ipfw -q flush\r
+@ipfw -q pipe flush\r
+ipfw add prob 0.5 deny proto icmp in\r
+ping -n 15 -w 300 www.google.it\r
+@ipfw -q flush\r
+\r
+@echo ##############################\r
+@echo ## Showing SYSCTL variables ##\r
+@echo ##############################\r
+ipfw sysctl -a\r
+\r
+@echo #############################################\r
+@echo ## Inserting rules to test command parsing ##\r
+@echo #############################################\r
+@echo -- dropping all packets of a specific protocol --\r
+ipfw add deny proto icmp\r
+@echo -- dropping packets of all protocols except a specific one --\r
+ipfw add deny not proto tcp\r
+@echo -- dropping all packets from IP x to IP y --\r
+ipfw add deny src-ip 1.2.3.4 dst-ip 5.6.7.8\r
+@echo -- dropping all ssh outgoing connections --\r
+ipfw add deny out dst-port 22\r
+@echo -- allowing already opened browser connections --\r
+@echo -- but preventing new ones from being opened   --\r
+ipfw add deny out proto tcp dst-port 80 tcpflags syn\r
+@echo -- another way to do the same thing --\r
+ipfw add allow out proto tcp dst-port 80 established\r
+ipfw add deny out proto tcp dst-port 80 setup\r
+@echo -- checking what rules have been inserted --\r
+ipfw -c show\r
+@ipfw -q flush\r
+\r
+@echo #################\r
+@echo ## Cleaning up ##\r
+@echo #################\r
+ipfw -q flush\r
+ipfw -q pipe flush\r
+\r
+pause\r
diff --git a/binary/wget.exe b/binary/wget.exe
new file mode 100644 (file)
index 0000000..f2a11c1
Binary files /dev/null and b/binary/wget.exe differ
diff --git a/binary64/ipfw.exe b/binary64/ipfw.exe
new file mode 100755 (executable)
index 0000000..35c86d9
Binary files /dev/null and b/binary64/ipfw.exe differ
diff --git a/binary64/ipfw.sys b/binary64/ipfw.sys
new file mode 100755 (executable)
index 0000000..8e2275d
Binary files /dev/null and b/binary64/ipfw.sys differ
diff --git a/configuration/README b/configuration/README
new file mode 100644 (file)
index 0000000..778f7aa
--- /dev/null
@@ -0,0 +1,14 @@
+This directory contains some ipfw configurations and a script
+to safely change the firewall rules.
+
+The firewall configuration comes from the FreeBSD initial script.
+The change_rules_linux.sh script lets you change the ipfw rules and,
+in case of a misconfiguration that makes the remote host
+unreachable, restores the old ruleset.
+
+To configure the firewall behavior, edit the ipfw.conf file and 
+execute the ./change_rules_linux.sh script.
+
+The ipfw program executable should be located in /sbin (XXX)
+
+XXX seems we use something which is not compatible with dash
diff --git a/configuration/change_rules.sh b/configuration/change_rules.sh
new file mode 100755 (executable)
index 0000000..8f23369
--- /dev/null
@@ -0,0 +1,159 @@
+#!/bin/sh
+#
+# Copyright (c) 2000 Alexandre Peixoto
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD: src/share/examples/ipfw/change_rules.sh,v 1.6 2003/09/07 07:52:56 jmg Exp $
+
+# Change ipfw(8) rules with safety guarantees for remote operation
+#
+# Invoke this script to edit ${firewall_script}. It will call ${EDITOR},
+# or vi(1) if the environment variable is not set, for you to edit
+# ${firewall_script}, ask for confirmation, and then run
+# ${firewall_script}. You can then examine the output of ipfw list and
+# confirm whether you want the new version or not.
+#
+# If no answer is received in 30 seconds, the previous
+# ${firewall_script} is run, restoring the old rules (this assumes ipfw
+# flush is present in it).
+#
+# If the new rules are confirmed, they'll replace ${firewall_script} and
+# the previous ones will be copied to ${firewall_script}.{date}. Mail
+# will also be sent to root with a unified diff of the rule change.
+#
+# Unapproved rules are kept in ${firewall_script}.new, and you are
+# offered the option of changing them instead of the present rules when
+# you call this script.
+#
+# This script could be improved by using version control
+# software.
+
+# XXX on linux /etc/rc.conf defines:
+# firewall_type and firewall_script
+
+if [ -r /etc/defaults/rc.conf ]; then
+       . /etc/defaults/rc.conf
+       source_rc_confs
+elif [ -r /etc/rc.conf ]; then
+       . /etc/rc.conf
+fi
+
+EDITOR=${EDITOR:-/usr/bin/vi}
+PAGER=${PAGER:-/usr/bin/more}
+
+# on linux the default mktemp invocation behavior
+# is different, we should change the temporary file creation
+tempfoo=`basename $0`
+#TMPFILE=`mktemp -t ${tempfoo}` || exit 1
+TMPFILE=`mktemp -t ${tempfoo}.XXXXX` || exit 1
+
+get_yes_no() {
+       while true
+       do
+               echo -n "$1 (Y/N) ? " 
+               read -t 30 a
+               if [ $? != 0 ]; then
+                       a="No";
+                       return;
+               fi
+               case $a in
+                       [Yy]) a="Yes";
+                             return;;
+                       [Nn]) a="No";
+                             return;;
+                       *);;
+               esac
+       done
+}
+
+restore_rules() {
+       nohup sh ${firewall_script} </dev/null >/dev/null 2>&1
+       rm ${TMPFILE}
+       exit 1
+}
+
+case "${firewall_type}" in
+[Cc][Ll][Ii][Ee][Nn][Tt]|\
+[Cc][Ll][Oo][Ss][Ee][Dd]|\
+[Oo][Pp][Ee][Nn]|\
+[Ss][Ii][Mm][Pp][Ll][Ee]|\
+[Uu][Nn][Kk][Nn][Oo][Ww][Nn])
+       edit_file="${firewall_script}"
+       rules_edit=no
+       ;;
+*)
+       if [ -r "${firewall_type}" ]; then
+               edit_file="${firewall_type}"
+               rules_edit=yes
+       fi
+       ;;
+esac
+
+if [ -f ${edit_file}.new ]; then
+       get_yes_no "A new rules file already exists, do you want to use it"
+       [ $a = 'No' ] && cp ${edit_file} ${edit_file}.new
+else 
+       cp ${edit_file} ${edit_file}.new
+fi
+
+trap restore_rules SIGHUP
+
+${EDITOR} ${edit_file}.new
+
+get_yes_no "Do you want to install the new rules"
+
+[ $a = 'No' ] && exit 1
+
+cat <<!
+The rules will be changed now. If the message 'Type y to keep the new
+rules' does not appear on the screen or the y key is not pressed in 30
+seconds, the original rules will be restored.
+The TCP/IP connections might be broken during the change. If so, restore
+the ssh/telnet connection being used.
+!
+
+if [ ${rules_edit} = yes ]; then
+       nohup sh ${firewall_script} ${firewall_type}.new \
+           < /dev/null > ${TMPFILE} 2>&1
+else
+       nohup sh ${firewall_script}.new \
+           < /dev/null > ${TMPFILE} 2>&1
+fi
+sleep 2;
+get_yes_no "Would you like to see the resulting new rules"
+[ $a = 'Yes' ] && ${PAGER} ${TMPFILE}
+get_yes_no "Type y to keep the new rules"
+[ $a != 'Yes' ] && restore_rules
+
+DATE=`date "+%Y%m%d%H%M"`
+cp ${edit_file} ${edit_file}.$DATE
+mv ${edit_file}.new ${edit_file} 
+cat <<!
+The new rules are now installed. The previous rules have been preserved in
+the file ${edit_file}.$DATE
+!
+diff -F "^# .*[A-Za-z]" -u ${edit_file}.$DATE ${edit_file} \
+    | mail -s "`hostname` Firewall rule change" root
+rm ${TMPFILE}
+exit 0
diff --git a/configuration/change_rules_linux.sh b/configuration/change_rules_linux.sh
new file mode 100755 (executable)
index 0000000..5d170b3
--- /dev/null
@@ -0,0 +1,12 @@
+#!/bin/sh
+#
+# marta
+# linux wrapper for the FreeBSD change rules program
+# This file loads the linux configuration and calls the
+# original change rules program
+
+if [ -r ./ipfw.conf ]; then
+       . ./ipfw.conf
+fi
+
+. ./change_rules.sh
diff --git a/configuration/ipfw.conf b/configuration/ipfw.conf
new file mode 100644 (file)
index 0000000..c673020
--- /dev/null
@@ -0,0 +1,29 @@
+# ipfw and dummynet configuration file for linux
+# XXX TO BE TESTED ON LINUX
+
+# The firewall_type variable is used to configure the firewall behavior.
+# A detailed description of how each of the following types works is in rc.firewall
+#
+#   open        - will allow anyone in
+#   client      - will try to protect just this machine
+#   simple      - will try to protect a whole network
+#   closed      - totally disables IP services except via lo0 interface
+#   workstation - will try to protect just this machine using stateful
+#                 firewalling. See below for rc.conf variables used
+#   UNKNOWN     - disables the loading of firewall rules.
+#   filename    - will load the rules in the given filename (full path required)
+
+# firewall_type=open
+
+# The following file is an example of how to use a filename to define a firewall
+# and how to configure a simple dummynet pipe to ... XXX shape traffic... etc...
+firewall_type=/home/marta/SVN/ports-luigi/dummynet-branches/ipfw3/configuration/ipfw.rules
+
+# Environment variables expected by the change rules script
+EDITOR=/usr/bin/vi
+PAGER=/bin/more
+
+# The following variable should point to the rc.firewall script
+# XXX TEST
+#firewall_script=`echo "please edit the firewall_script variable in ipfw.conf"`;
+firewall_script="/home/marta/SVN/ports-luigi/dummynet-branches/ipfw3/configuration/rc.firewall"
diff --git a/configuration/ipfw.rules b/configuration/ipfw.rules
new file mode 100755 (executable)
index 0000000..aaec43c
--- /dev/null
@@ -0,0 +1,12 @@
+# This is a simple configuration file
+# add dummynet pipes and a firewall section
+
+# flush all rules ...
+# flush
+
+# dummynet configuration
+
+# firewall configuration
+add 1 allow all from any to any
+# ...
+add 65000 deny all from any to any
diff --git a/configuration/rc.firewall b/configuration/rc.firewall
new file mode 100755 (executable)
index 0000000..2fcc8f4
--- /dev/null
@@ -0,0 +1,400 @@
+#!/bin/sh -
+# Copyright (c) 1996  Poul-Henning Kamp
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD: src/etc/rc.firewall,v 1.52.4.1 2008/01/29 00:22:32 dougb Exp $
+#
+
+#
+# Setup system for ipfw(4) firewall service.
+#
+
+# Suck in the configuration variables.
+if [ -z "${source_rc_confs_defined}" ]; then
+       if [ -r /etc/defaults/rc.conf ]; then
+               . /etc/defaults/rc.conf
+               source_rc_confs
+       elif [ -r /etc/rc.conf ]; then
+               . /etc/rc.conf
+       fi
+fi
+
+############
+# Define the firewall type in /etc/rc.conf.  Valid values are:
+#   open        - will allow anyone in
+#   client      - will try to protect just this machine
+#   simple      - will try to protect a whole network
+#   closed      - totally disables IP services except via lo0 interface
+#   workstation - will try to protect just this machine using stateful
+#                firewalling. See below for rc.conf variables used
+#   UNKNOWN     - disables the loading of firewall rules.
+#   filename    - will load the rules in the given filename (full path required)
+#
+# For ``client'' and ``simple'' the entries below should be customized
+# appropriately.
+
+############
+#
+# If you don't know enough about packet filtering, we suggest that you
+# take time to read this book:
+#
+#      Building Internet Firewalls, 2nd Edition
+#      Brent Chapman and Elizabeth Zwicky
+#
+#      O'Reilly & Associates, Inc
+#      ISBN 1-56592-871-7
+#      http://www.ora.com/
+#      http://www.oreilly.com/catalog/fire2/
+#
+# For a more advanced treatment of Internet Security read:
+#
+#      Firewalls and Internet Security: Repelling the Wily Hacker, 2nd Edition
+#      William R. Cheswick, Steven M. Bellovin, Aviel D. Rubin
+#
+#      Addison-Wesley / Prentice Hall
+#      ISBN 0-201-63466-X
+#      http://www.pearsonhighered.com/
+#      http://www.pearsonhighered.com/educator/academic/product/0,3110,020163466X,00.html
+#
+
+setup_loopback () {
+       ############
+       # Only in rare cases do you want to change these rules
+       #
+       ${fwcmd} add 100 pass all from any to any via lo0
+       ${fwcmd} add 200 deny all from any to 127.0.0.0/8
+       ${fwcmd} add 300 deny ip from 127.0.0.0/8 to any
+}
+
+if [ -n "${1}" ]; then
+       firewall_type="${1}"
+fi
+
+############
+# Set quiet mode if requested
+#
+case ${firewall_quiet} in
+[Yy][Ee][Ss])
+       fwcmd="/sbin/ipfw -q"
+       ;;
+*)
+       fwcmd="/sbin/ipfw"
+       ;;
+esac
+
+############
+# Flush out the list before we begin.
+#
+${fwcmd} -f flush
+
+setup_loopback
+
+############
+# Network Address Translation.  All packets are passed to natd(8)
+# before they encounter your remaining rules.  The firewall rules
+# will then be run again on each packet after translation by natd
+# starting at the rule number following the divert rule.
+#
+# For ``simple'' firewall type the divert rule should be put to a
+# different place to not interfere with address-checking rules.
+#
+case ${firewall_type} in
+[Oo][Pp][Ee][Nn]|[Cc][Ll][Ii][Ee][Nn][Tt])
+       case ${natd_enable} in
+       [Yy][Ee][Ss])
+               if [ -n "${natd_interface}" ]; then
+                       ${fwcmd} add 50 divert natd ip4 from any to any via ${natd_interface}
+               fi
+               ;;
+       esac
+       case ${firewall_nat_enable} in
+       [Yy][Ee][Ss])
+               if [ -n "${firewall_nat_interface}" ]; then
+                       ${fwcmd} nat 123 config if ${firewall_nat_interface} log
+                       ${fwcmd} add 50 nat 123 ip4 from any to any via ${firewall_nat_interface}
+               fi
+               ;;
+       esac
+esac
+
+############
+# If you just configured ipfw in the kernel as a tool to solve network
+# problems or you just want to disallow some particular kinds of traffic
+# then you will want to change the default policy to open.  You can also
+# do this as your only action by setting the firewall_type to ``open''.
+#
+# ${fwcmd} add 65000 pass all from any to any
+
+
+# Prototype setups.
+#
+case ${firewall_type} in
+[Oo][Pp][Ee][Nn])
+       ${fwcmd} add 65000 pass all from any to any
+       ;;
+
+[Cc][Ll][Ii][Ee][Nn][Tt])
+       ############
+       # This is a prototype setup that will protect your system somewhat
+       # against people from outside your own network.
+       ############
+
+       # set these to your network and netmask and ip
+       net="192.0.2.0"
+       mask="255.255.255.0"
+       ip="192.0.2.1"
+
+       # Allow any traffic to or from my own net.
+       ${fwcmd} add pass all from ${ip} to ${net}:${mask}
+       ${fwcmd} add pass all from ${net}:${mask} to ${ip}
+
+       # Allow TCP through if setup succeeded
+       ${fwcmd} add pass tcp from any to any established
+
+       # Allow IP fragments to pass through
+       ${fwcmd} add pass all from any to any frag
+
+       # Allow setup of incoming email
+       ${fwcmd} add pass tcp from any to me 25 setup
+
+       # Allow setup of outgoing TCP connections only
+       ${fwcmd} add pass tcp from me to any setup
+
+       # Disallow setup of all other TCP connections
+       ${fwcmd} add deny tcp from any to any setup
+
+       # Allow DNS queries out in the world
+       ${fwcmd} add pass udp from me to any 53 keep-state
+
+       # Allow NTP queries out in the world
+       ${fwcmd} add pass udp from me to any 123 keep-state
+
+       # Everything else is denied by default, unless the
+       # IPFIREWALL_DEFAULT_TO_ACCEPT option is set in your kernel
+       # config file.
+       ;;
+
+[Ss][Ii][Mm][Pp][Ll][Ee])
+       ############
+       # This is a prototype setup for a simple firewall.  Configure this
+       # machine as a DNS and NTP server, and point all the machines
+       # on the inside at this machine for those services.
+       ############
+
+       # set these to your outside interface network and netmask and ip
+       oif="ed0"
+       onet="192.0.2.0"
+       omask="255.255.255.240"
+       oip="192.0.2.1"
+
+       # set these to your inside interface network and netmask and ip
+       iif="ed1"
+       inet="192.0.2.16"
+       imask="255.255.255.240"
+       iip="192.0.2.17"
+
+       # Stop spoofing
+       ${fwcmd} add deny all from ${inet}:${imask} to any in via ${oif}
+       ${fwcmd} add deny all from ${onet}:${omask} to any in via ${iif}
+
+       # Stop RFC1918 nets on the outside interface
+       ${fwcmd} add deny all from any to 10.0.0.0/8 via ${oif}
+       ${fwcmd} add deny all from any to 172.16.0.0/12 via ${oif}
+       ${fwcmd} add deny all from any to 192.168.0.0/16 via ${oif}
+
+       # Stop draft-manning-dsua-03.txt (1 May 2000) nets (includes RESERVED-1,
+       # DHCP auto-configuration, NET-TEST, MULTICAST (class D), and class E)
+       # on the outside interface
+       ${fwcmd} add deny all from any to 0.0.0.0/8 via ${oif}
+       ${fwcmd} add deny all from any to 169.254.0.0/16 via ${oif}
+       ${fwcmd} add deny all from any to 192.0.2.0/24 via ${oif}
+       ${fwcmd} add deny all from any to 224.0.0.0/4 via ${oif}
+       ${fwcmd} add deny all from any to 240.0.0.0/4 via ${oif}
+
+       # Network Address Translation.  This rule is placed here deliberately
+       # so that it does not interfere with the surrounding address-checking
+       # rules.  If for example one of your internal LAN machines had its IP
+       # address set to 192.0.2.1 then an incoming packet for it after being
+       # translated by natd(8) would match the `deny' rule above.  Similarly
+       # an outgoing packet originated from it before being translated would
+       # match the `deny' rule below.
+       case ${natd_enable} in
+       [Yy][Ee][Ss])
+               if [ -n "${natd_interface}" ]; then
+                       ${fwcmd} add divert natd all from any to any via ${natd_interface}
+               fi
+               ;;
+       esac
+
+       # Stop RFC1918 nets on the outside interface
+       ${fwcmd} add deny all from 10.0.0.0/8 to any via ${oif}
+       ${fwcmd} add deny all from 172.16.0.0/12 to any via ${oif}
+       ${fwcmd} add deny all from 192.168.0.0/16 to any via ${oif}
+
+       # Stop draft-manning-dsua-03.txt (1 May 2000) nets (includes RESERVED-1,
+       # DHCP auto-configuration, NET-TEST, MULTICAST (class D), and class E)
+       # on the outside interface
+       ${fwcmd} add deny all from 0.0.0.0/8 to any via ${oif}
+       ${fwcmd} add deny all from 169.254.0.0/16 to any via ${oif}
+       ${fwcmd} add deny all from 192.0.2.0/24 to any via ${oif}
+       ${fwcmd} add deny all from 224.0.0.0/4 to any via ${oif}
+       ${fwcmd} add deny all from 240.0.0.0/4 to any via ${oif}
+
+       # Allow TCP through if setup succeeded
+       ${fwcmd} add pass tcp from any to any established
+
+       # Allow IP fragments to pass through
+       ${fwcmd} add pass all from any to any frag
+
+       # Allow setup of incoming email
+       ${fwcmd} add pass tcp from any to ${oip} 25 setup
+
+       # Allow access to our DNS
+       ${fwcmd} add pass tcp from any to ${oip} 53 setup
+       ${fwcmd} add pass udp from any to ${oip} 53
+       ${fwcmd} add pass udp from ${oip} 53 to any
+
+       # Allow access to our WWW
+       ${fwcmd} add pass tcp from any to ${oip} 80 setup
+
+       # Reject&Log all setup of incoming connections from the outside
+       ${fwcmd} add deny log tcp from any to any in via ${oif} setup
+
+       # Allow setup of any other TCP connection
+       ${fwcmd} add pass tcp from any to any setup
+
+       # Allow DNS queries out in the world
+       ${fwcmd} add pass udp from ${oip} to any 53 keep-state
+
+       # Allow NTP queries out in the world
+       ${fwcmd} add pass udp from ${oip} to any 123 keep-state
+
+       # Everything else is denied by default, unless the
+       # IPFIREWALL_DEFAULT_TO_ACCEPT option is set in your kernel
+       # config file.
+       ;;
+
+[Ww][Oo][Rr][Kk][Ss][Tt][Aa][Tt][Ii][Oo][Nn])
+       # Configuration:
+       #  firewall_myservices:         List of TCP ports on which this host
+       #                                offers services.
+       #  firewall_allowservices:      List of IPs which have access to
+       #                                $firewall_myservices.
+       #  firewall_trusted:            List of IPs which have full access
+       #                                to this host. Be very careful
+       #                                when setting this. This option can
+       #                                seriously degrade the level of 
+       #                                protection provided by the firewall.
+       #  firewall_logdeny:            Boolean (YES/NO) specifying if the
+       #                                default denied packets should be
+       #                                logged (in /var/log/security).
+       #  firewall_nologports:         List of TCP/UDP ports for which
+       #                                denied incoming packets are not
+       #                                logged.
+       
+       # Allow packets for which a state has been built.
+       ${fwcmd} add check-state
+
+       # For services permitted below.
+       ${fwcmd} add pass tcp  from me to any established
+
+       # Allow any connection out, adding state for each.
+       ${fwcmd} add pass tcp  from me to any setup keep-state
+       ${fwcmd} add pass udp  from me to any       keep-state
+       ${fwcmd} add pass icmp from me to any       keep-state
+
+       # Allow DHCP.
+       ${fwcmd} add pass udp  from 0.0.0.0 68 to 255.255.255.255 67 out
+       ${fwcmd} add pass udp  from any 67     to me 68 in
+       ${fwcmd} add pass udp  from any 67     to 255.255.255.255 68 in
+       # Some servers will ping the IP while trying to decide if it's 
+       # still in use.
+       ${fwcmd} add pass icmp from any to any icmptype 8
+
+       # Allow "mandatory" ICMP in.
+       ${fwcmd} add pass icmp from any to any icmptype 3,4,11
+       
+       # Add permits for this workstation's published services below
+       # Only IPs and nets in firewall_allowservices are allowed in.
+       # If you really wish to let anyone use services on your 
+       # workstation, then set "firewall_allowservices='any'" in /etc/rc.conf
+       #
+       # Note: We don't use keep-state as that would allow DoS of
+       #       our statetable. 
+       #       You can add 'keep-state' to the lines for slightly
+       #       better performance if you feel that DoS of your
+       #       workstation won't be a problem.
+       #
+       for i in ${firewall_allowservices} ; do
+         for j in ${firewall_myservices} ; do
+           ${fwcmd} add pass tcp from $i to me $j
+         done
+       done
+
+       # Allow all connections from trusted IPs.
+       # Playing with the content of firewall_trusted could seriously
+       # degrade the level of protection provided by the firewall.
+       for i in ${firewall_trusted} ; do
+         ${fwcmd} add pass ip from $i to me
+       done
+       
+       ${fwcmd} add 65000 count ip from any to any
+
+       # Drop packets to ports where we don't want logging
+       for i in ${firewall_nologports} ; do
+         ${fwcmd} add deny { tcp or udp } from any to any $i in
+       done
+
+       # Broadcasts and multicasts
+       ${fwcmd} add deny ip  from any to 255.255.255.255
+       ${fwcmd} add deny ip  from any to 224.0.0.0/24 in       # XXX
+
+       # Noise from routers
+       ${fwcmd} add deny udp from any to any 520 in
+
+       # Noise from web browsing.
+       # The stateful filter is a bit aggressive, and will cause some
+       #  connection teardowns to be logged.
+       ${fwcmd} add deny tcp from any 80,443 to any 1024-65535 in
+
+       # Deny and (if wanted) log the rest unconditionally.
+       log=""
+       if [ ${firewall_logdeny:-x} = "YES" -o ${firewall_logdeny:-x} = "yes" ] ; then
+         log="log logamount 500"       # The default of 100 is too low.
+         sysctl net.inet.ip.fw.verbose=1 >/dev/null
+       fi
+       ${fwcmd} add deny $log ip from any to any
+       ;;
+
+[Cc][Ll][Oo][Ss][Ee][Dd])
+       ${fwcmd} add 65000 deny ip from any to any
+       ;;
+[Uu][Nn][Kk][Nn][Oo][Ww][Nn])
+       ;;
+*)
+       if [ -r "${firewall_type}" ]; then
+               ${fwcmd} ${firewall_flags} ${firewall_type}
+       fi
+       ;;
+esac
diff --git a/glue.h b/glue.h
new file mode 100644 (file)
index 0000000..75216cc
--- /dev/null
+++ b/glue.h
@@ -0,0 +1,589 @@
+/*
+ * Copyright (c) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * $Id: glue.h 12501 2014-01-10 01:09:14Z luigi $
+ *
+ * glue code to adapt the FreeBSD version to linux and windows,
+ * userland and kernel.
+ * This is included before any other headers, so we do not have
+ * a chance to override any #define that should appear in other
+ * headers.
+ * First handle headers for userland and kernel. Then common code
+ * (including headers that require a specific order of inclusion),
+ * then the user- and kernel- specific parts.
+ */
+#if defined __FreeBSD__
+#define _GLUE_H
+#endif /* __FreeBSD__ */
+#ifndef _GLUE_H
+#define        _GLUE_H
+
+
+/*
+ * common definitions to allow portability
+ */
+#ifndef __FBSDID
+#define __FBSDID(x)
+#endif  /* FBSDID */
+
+#ifndef KERNEL_MODULE  /* Userland headers */
+
+#if defined(__CYGWIN32__) && !defined(_WIN32)                                   
+#define _WIN32                                                                  
+#endif                                                                          
+
+#if defined(TCC) && defined(_WIN32)
+#include <tcc_glue.h>
+#endif /* TCC */
+
+#include <stdint.h>    /* linux needs it in addition to sys/types.h */
+#include <sys/types.h> /* for size_t */
+#include <sys/ioctl.h>
+#include <time.h>
+#include <errno.h>
+#ifdef __linux__
+#include <netinet/ether.h>     /* linux only 20111031 */
+#endif
+
+#else /* KERNEL_MODULE, kernel headers */
+
+#define        INET            /* want inet support */
+#ifdef __linux__
+
+#include <linux/version.h>
+
+#define ifnet          net_device      /* remap */
+#define        _KERNEL         /* make kernel structure visible */
+#define        KLD_MODULE      /* add the module glue */
+
+#include <linux/stddef.h>      /* linux kernel */
+#include <linux/types.h>       /* linux kernel */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)        // or 2.4.x
+#include <linux/linkage.h>     /* linux/msg.h require this */
+#include <linux/netdevice.h>   /* just MAX_ADDR_LEN 8 on 2.4 32 on 2.6, also brings in byteorder */
+#endif
+
+/* on 2.6.22, msg.h requires spinlock_types.h */
+/* XXX spinlock_type.h was introduced in 2.6.14 */
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,13) && \
+       LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+#include <linux/spinlock_types.h>
+#endif
+/* XXX the m_type define conflicts with include/sys/mbuf.h,
+ * so include msg.h early (to be solved)
+*/
+#include <linux/msg.h> 
+
+#include <linux/list.h>
+#include <linux/in.h>          /* struct in_addr */
+#include <linux/in6.h>         /* struct in6_addr */
+#include <linux/icmp.h>
+/*
+ * LIST_HEAD in queue.h conflicts with linux/list.h;
+ * some previous linux includes need the list.h definition
+ */
+#undef LIST_HEAD
+
+#define        IF_NAMESIZE     (16)
+typedef        uint32_t        in_addr_t;
+
+#define printf(fmt, arg...) printk(KERN_ERR fmt, ##arg)
+#endif /* __linux__ */
+
+#endif /* KERNEL_MODULE end of kernel headers */
+
+
+/*
+ * Part 2: common userland and kernel definitions
+ */
+
+#ifndef ETHER_ADDR_LEN
+#define ETHER_ADDR_LEN (6+0)       /* length of an Ethernet address */
+#endif
+
+#define ICMP6_DST_UNREACH_NOROUTE       0       /* no route to destination */
+#define ICMP6_DST_UNREACH_ADMIN         1       /* administratively prohibited */
+#define ICMP6_DST_UNREACH_ADDR          3       /* address unreachable */
+#define ICMP6_DST_UNREACH_NOPORT        4       /* port unreachable */
+
+/*
+ * linux: sysctl are mapped into /sys/module/ipfw_mod parameters
+ * windows: they are emulated via get/setsockopt
+ */
+#define CTLFLAG_RD             1
+#define CTLFLAG_RDTUN  1
+#define CTLFLAG_RW             2
+#define CTLFLAG_SECURE3        0 // unsupported
+#define CTLFLAG_VNET    0      /* unsupported */
+
+/* if needed, queue.h must be included here after list.h */
+
+/*
+ * struct thread is used in linux and windows kernel.
+ * In windows, we need to emulate the sockopt interface
+ * so also the userland needs to have the struct sockopt defined.
+ * In order to achieve 64 bit compatibility, padding has been inserted.
+ */
+struct thread {
+        void *sopt_td;
+        void *td_ucred;
+};
+
+enum sopt_dir { SOPT_GET, SOPT_SET };
+
+struct  sockopt {
+        enum    sopt_dir sopt_dir; /* is this a get or a set? */
+        int     sopt_level;     /* second arg of [gs]etsockopt */
+        int     sopt_name;      /* third arg of [gs]etsockopt */
+#ifdef _X64EMU
+               void* pad1;
+               void* pad2;
+#endif
+               void   *sopt_val;       /* fourth arg of [gs]etsockopt */
+               size_t  sopt_valsize;   /* (almost) fifth arg of [gs]etsockopt */
+#ifdef _X64EMU
+               void* pad3;
+               void* pad4;
+#endif
+               struct  thread *sopt_td; /* calling thread or null if kernel */
+};
+
+
+#define INET_ADDRSTRLEN                (16)    /* missing in netinet/in.h */
+
+/*
+ * List of values used for set/getsockopt options.
+ * The base value on FreeBSD is defined as a macro,
+ * if not available we will use our own enum.
+ * The TABLE_BASE value is used in the kernel.
+ */
+#ifndef IP_FW_TABLE_ADD
+#define _IPFW_SOCKOPT_BASE     100     /* 40 on freebsd */
+enum ipfw_msg_type {
+       IP_FW_TABLE_ADD         = _IPFW_SOCKOPT_BASE,
+       IP_FW_TABLE_DEL,
+       IP_FW_TABLE_FLUSH,
+       IP_FW_TABLE_GETSIZE,
+       IP_FW_TABLE_LIST,
+       IP_FW_DYN_GET,          /* new addition */
+
+       /* IP_FW3 and IP_DUMMYNET3 are the new API */
+       IP_FW3                  = _IPFW_SOCKOPT_BASE + 8,
+       IP_DUMMYNET3,
+
+       IP_FW_ADD               = _IPFW_SOCKOPT_BASE + 10,
+       IP_FW_DEL,
+       IP_FW_FLUSH,
+       IP_FW_ZERO,
+       IP_FW_GET,
+       IP_FW_RESETLOG,
+
+       IP_FW_NAT_CFG,
+       IP_FW_NAT_DEL,
+       IP_FW_NAT_GET_CONFIG,
+       IP_FW_NAT_GET_LOG,
+
+       IP_DUMMYNET_CONFIGURE,
+       IP_DUMMYNET_DEL ,
+       IP_DUMMYNET_FLUSH,
+       /* 63 is missing */
+       IP_DUMMYNET_GET         = _IPFW_SOCKOPT_BASE + 24,
+       _IPFW_SOCKOPT_END
+};
+#endif /* IP_FW_TABLE_ADD */
+
+/*
+ * Part 3: userland stuff
+ */
+
+#ifndef KERNEL_MODULE
+
+/*
+ * internal names in struct in6_addr (netinet/in6.h) differ,
+ * so we remap the FreeBSD names to the platform-specific ones.
+ */
+#ifndef _WIN32
+#define __u6_addr      in6_u
+#define __u6_addr32    u6_addr32
+#define in6_u __in6_u  /* missing type for ipv6 (linux 2.6.28) */
+#else  /* _WIN32 uses different naming */
+#define __u6_addr      __u6
+#define __u6_addr32    __s6_addr32
+#endif /* _WIN32 */
+
+/* missing in linux netinet/ip.h */
+#define IPTOS_ECN_ECT0 0x02    /* ECN-capable transport (0) */
+#define IPTOS_ECN_CE   0x03    /* congestion experienced */
+
+/* defined in freebsd netinet/icmp6.h */
+#define ICMP6_MAXTYPE  201
+
+/* on freebsd sys/socket.h pf specific */
+#define NET_RT_IFLIST  3               /* survey interface list */
+
+#if defined(__linux__) || defined(__CYGWIN32__)
+/* on freebsd net/if.h XXX used */
+struct if_data {
+       /* ... */
+        u_long ifi_mtu;        /* maximum transmission unit */
+};
+
+/*
+ * Message format for use in obtaining information about interfaces
+ * from getkerninfo and the routing socket.
+ * This is used in nat.c
+ */
+struct if_msghdr {
+        u_short ifm_msglen;     /* to skip over unknown messages */
+        u_char  ifm_version;    /* future binary compatibility */
+        u_char  ifm_type;       /* message type */
+        int     ifm_addrs;      /* like rtm_addrs */
+        int     ifm_flags;      /* value of if_flags */
+        u_short ifm_index;      /* index for associated ifp */
+        struct  if_data ifm_data;/* stats and other ifdata */
+};
+
+/*
+ * Message format for use in obtaining information about interface
+ * addresses from getkerninfo and the routing socket
+ */
+struct ifa_msghdr {
+        u_short ifam_msglen;    /* to skip over unknown messages */
+        u_char  ifam_version;   /* future binary compatibility */
+        u_char  ifam_type;      /* message type */
+        int     ifam_addrs;     /* like rtm_addrs */
+        int     ifam_flags;     /* value of ifa_flags */
+        u_short ifam_index;     /* index for associated ifp */
+        int     ifam_metric;    /* value of ifa_metric */
+};
+
+#ifndef NO_RTM /* conflicting with netlink */
+/* missing in net/route.h */
+#define RTM_VERSION     5       /* Up the ante and ignore older versions */
+#define RTM_IFINFO      0xe     /* iface going up/down etc. */
+#define RTM_NEWADDR     0xc     /* address being added to iface */
+#define RTA_IFA         0x20    /* interface addr sockaddr present */
+#endif /* NO_RTM */
+
+/* SA_SIZE is used in the userland nat.c modified */
+#define SA_SIZE(sa)                                             \
+    (  (!(sa) ) ?      \
+        sizeof(long)            :                               \
+        1 + ( (sizeof(struct sockaddr) - 1) | (sizeof(long) - 1) ) )
+
+/* sys/time.h */
+/*
+ * Getkerninfo clock information structure
+ */
+struct clockinfo {
+        int     hz;             /* clock frequency */
+        int     tick;           /* micro-seconds per hz tick */
+        int     spare;
+        int     stathz;         /* statistics clock frequency */
+        int     profhz;         /* profiling clock frequency */
+};
+
+/* no sin_len in sockaddr, we only remap in userland */
+#define        sin_len sin_zero[0]
+
+#endif /* Linux/Win */
+
+/*
+ * linux does not have a reentrant version of qsort,
+ * so we use the FreeBSD stdlib version.
+ */
+void qsort_r(void *a, size_t n, size_t es, void *thunk,
+       int cmp_t(void *, const void *, const void *));
+
+/* prototypes from libutil */
+/* humanize_number(3) */
+#define HN_DECIMAL              0x01
+#define HN_NOSPACE              0x02
+#define HN_B                    0x04
+#define HN_DIVISOR_1000         0x08
+
+#define HN_GETSCALE             0x10
+#define HN_AUTOSCALE            0x20
+
+int     humanize_number(char *_buf, size_t _len, int64_t _number,
+            const char *_suffix, int _scale, int _flags);
+int     expand_number(const char *_buf, int64_t *_num);
+
+#define setprogname(x) /* not present in linux */
+
+extern int optreset;   /* not present in linux */
+
+size_t strlcpy(char * dst, const char * src, size_t siz);
+long long int strtonum(const char *nptr, long long minval,
+       long long maxval, const char **errstr);
+int sysctlbyname(const char *name, void *oldp, size_t *oldlenp,
+       void *newp, size_t newlen);
+
+#else /* KERNEL_MODULE */
+
+/*
+ * Part 4: kernel stuff
+ */
+
+/* linux and windows kernel do not have bcopy; memmove keeps it overlap-safe */
+#define bcopy(_s, _d, _l)      memmove(_d, _s, _l)
+/* definitions useful for the kernel side */
+struct route_in6 {
+       int dummy;
+};
+
+#ifdef __linux__
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)        // or 2.4.x
+#include <linux/in6.h>
+#endif
+
+/* skb_dst() and skb_dst_set() were introduced in linux 2.6.31 */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31)
+void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst);
+struct dst_entry *skb_dst(const struct sk_buff *skb);
+#endif
+
+/* The struct flowi changed */
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,38)        // check boundaries
+#define flow_daddr fl.u.ip4
+#else
+#define flow_daddr fl.nl_u.ip4_u
+#endif
+
+#endif /* __linux__ */
+
+/* 
+ * Do not load prio_heap.h header because of conflicting names
+ * with our heap functions defined in include/netinet/ipfw/dn_heap.h
+ * However do define struct ptr_heap used in linux 3.12.7 etc.
+ */
+#define _LINUX_PRIO_HEAP_H
+struct ptr_heap;
+
+/* 
+ * The following define prevents the ipv6.h header from being loaded.
+ * Starting from the 2.6.38 kernel the ipv6.h file, which is included
+ * by include/net/inetpeer.h (in turn included by net/route.h),
+ * includes the system tcp.h file while we want to include
+ * our include/net/tcp.h instead.
+ */
+#ifndef _NET_IPV6_H
+#define _NET_IPV6_H
+static inline void ipv6_addr_copy(struct in6_addr *a1, const struct in6_addr *a2)
+{
+        memcpy(a1, a2, sizeof(struct in6_addr));
+}
+#endif /* _NET_IPV6_H */
+
+#endif /* KERNEL_MODULE */
+
+/*
+ * Part 5: windows specific stuff
+ */
+
+#ifdef _WIN32
+#ifndef KERNEL_MODULE
+#define CTL_CODE( DeviceType, Function, Method, Access ) (                 \
+    ((DeviceType) << 16) | ((Access) << 14) | ((Function) << 2) | (Method) \
+)
+
+#define METHOD_BUFFERED                 0
+#define METHOD_IN_DIRECT                1
+#define METHOD_OUT_DIRECT               2
+#define METHOD_NEITHER                  3
+#define FILE_ANY_ACCESS                 0
+#define FILE_READ_DATA            ( 0x0001 )    // file & pipe
+#define FILE_WRITE_DATA           ( 0x0002 )    // file & pipe
+#endif /* !KERNEL_MODULE */
+
+#define FILE_DEVICE_IPFW               0x00654324
+#define IP_FW_BASE_CTL                 0x840
+#define IP_FW_SETSOCKOPT \
+       CTL_CODE(FILE_DEVICE_IPFW, IP_FW_BASE_CTL + 1, METHOD_BUFFERED, FILE_WRITE_DATA)
+#define IP_FW_GETSOCKOPT \
+       CTL_CODE(FILE_DEVICE_IPFW, IP_FW_BASE_CTL + 2, METHOD_BUFFERED, FILE_ANY_ACCESS)
+
+/*********************************
+* missing declarations in altq.c *
+**********************************/
+
+#define _IOWR(x,y,t) _IOW(x,y,t)
+
+/**********************************
+* missing declarations in ipfw2.c *
+***********************************/
+
+#define        ICMP_UNREACH_NET                0       /* bad net */
+#define        ICMP_UNREACH_HOST               1       /* bad host */
+#define        ICMP_UNREACH_PROTOCOL           2       /* bad protocol */
+#define        ICMP_UNREACH_PORT               3       /* bad port */
+#define        ICMP_UNREACH_NEEDFRAG           4       /* IP_DF caused drop */
+#define        ICMP_UNREACH_SRCFAIL            5       /* src route failed */
+#define        ICMP_UNREACH_NET_UNKNOWN        6       /* unknown net */
+#define        ICMP_UNREACH_HOST_UNKNOWN       7       /* unknown host */
+#define        ICMP_UNREACH_ISOLATED           8       /* src host isolated */
+#define        ICMP_UNREACH_NET_PROHIB         9       /* prohibited access */
+#define        ICMP_UNREACH_HOST_PROHIB        10      /* ditto */
+#define        ICMP_UNREACH_TOSNET             11      /* bad tos for net */
+#define        ICMP_UNREACH_TOSHOST            12      /* bad tos for host */
+#define        ICMP_UNREACH_FILTER_PROHIB      13      /* admin prohib */
+#define        ICMP_UNREACH_HOST_PRECEDENCE    14      /* host prec vio. */
+#define        ICMP_UNREACH_PRECEDENCE_CUTOFF  15      /* prec cutoff */
+
+
+struct ether_addr;
+struct ether_addr * ether_aton(const char *a);
+
+/*********************************
+* missing declarations in ipv6.c *
+**********************************/
+
+struct hostent* gethostbyname2(const char *name, int af);
+
+
+/********************
+* windows wrappings *
+*********************/
+
+int my_socket(int domain, int ty, int proto);
+#define socket(_a, _b, _c)     my_socket(_a, _b, _c)
+
+#endif /* _WIN32 */
+/*******************
+* SYSCTL emulation *
+********************/
+#if defined (_WIN32) || defined (EMULATE_SYSCTL)
+#define STRINGIFY(x) #x
+
+/* flag is set with the last 2 bits for access, as defined in glue.h
+ * and the rest for type
+ */
+enum {
+       SYSCTLTYPE_INT = 0,
+       SYSCTLTYPE_UINT,
+       SYSCTLTYPE_SHORT,
+       SYSCTLTYPE_USHORT,
+       SYSCTLTYPE_LONG,
+       SYSCTLTYPE_ULONG,
+       SYSCTLTYPE_STRING,
+};
+
+struct sysctlhead {
+       uint32_t blocklen; //total size of the entry
+       uint32_t namelen; //strlen(name) + '\0'
+       uint32_t flags; //type and access
+       uint32_t datalen;
+};
+
+#ifdef _KERNEL
+
+#ifdef SYSCTL_NODE
+#undef SYSCTL_NODE
+#endif
+#define SYSCTL_NODE(a,b,c,d,e,f)
+#define SYSCTL_DECL(a)
+#define SYSCTL_VNET_PROC(a,b,c,d,e,f,g,h,i)
+
+#define GST_HARD_LIMIT 100
+
+/* In the module, GST is implemented as an array of
+ * sysctlentry, but while passing data to the userland
+ * pointers are useless, the buffer is actually made of:
+ * - sysctlhead (fixed size, containing lengths)
+ * - data (typically 32 bit)
+ * - name (zero-terminated and padded to mod4)
+ */
+
+struct sysctlentry {
+       struct sysctlhead head;
+       char* name;
+       void* data;
+};
+
+struct sysctltable {
+       int count; //number of valid tables
+       int totalsize; //total size of valid entries of all the valid tables
+       void* namebuffer; //a buffer for all chained names
+       struct sysctlentry entry[GST_HARD_LIMIT];
+};
+
+#ifdef SYSBEGIN
+#undef SYSBEGIN
+#endif
+#define SYSBEGIN(x) void sysctl_addgroup_##x() {
+#ifdef SYSEND
+#undef SYSEND
+#endif
+#define SYSEND }
+
+/* XXX remove duplication */
+#define SYSCTL_INT(a,b,c,d,e,f,g)                              \
+       sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1,      \
+               (d) | (SYSCTLTYPE_INT << 2), sizeof(*e), e)
+
+#define SYSCTL_VNET_INT(a,b,c,d,e,f,g)                         \
+       sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1,      \
+               (d) | (SYSCTLTYPE_INT << 2), sizeof(*e), e)
+
+#define SYSCTL_UINT(a,b,c,d,e,f,g)                             \
+       sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1,      \
+               (d) | (SYSCTLTYPE_UINT << 2), sizeof(*e), e)
+
+#define SYSCTL_VNET_UINT(a,b,c,d,e,f,g)                                \
+       sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1,      \
+               (d) | (SYSCTLTYPE_UINT << 2), sizeof(*e), e)
+
+#define SYSCTL_LONG(a,b,c,d,e,f,g)                             \
+       sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1,      \
+               (d) | (SYSCTLTYPE_LONG << 2), sizeof(*e), e)
+
+#define SYSCTL_ULONG(a,b,c,d,e,f,g)                            \
+       sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1,      \
+               (d) | (SYSCTLTYPE_ULONG << 2), sizeof(*e), e)
+#define TUNABLE_INT(a,b)
+
+void keinit_GST(void);
+void keexit_GST(void);
+int kesysctl_emu_set(void* p, int l);
+int kesysctl_emu_get(struct sockopt* sopt);
+void sysctl_pushback(char* name, int flags, int datalen, void* data);
+
+#endif /* _KERNEL */
+
+int sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp,
+         size_t newlen);
+#endif /* _WIN32 || EMULATE_SYSCTL */
+#ifdef _WIN32
+int do_cmd(int optname, void *optval, uintptr_t optlen);
+
+#endif /* _WIN32 */
+
+#define __PAST_END(v, idx)      ((v)[(idx)])    /* element idx of v */
+#endif /* !_GLUE_H */
diff --git a/ipfw/Makefile b/ipfw/Makefile
new file mode 100644 (file)
index 0000000..a32d02a
--- /dev/null
@@ -0,0 +1,128 @@
+#
+# $Id: Makefile 11688 2012-08-12 20:58:26Z luigi $
+#
+# GNUMakefile to build the userland part of ipfw on Linux and Windows
+#
+# Do not set with = or := so we can inherit from the caller
+
+include ../Makefile.inc
+
+all: $(TARGET)
+
+#TCC=c:/path/to/tcc
+
+# common flags
+EXTRA_CFLAGS += -O1
+EXTRA_CFLAGS += -Wall
+EXTRA_CFLAGS += -include ../glue.h
+EXTRA_CFLAGS += -I ./include_e -I ./include
+
+TARGET := ipfw
+ifneq ($(VER),openwrt)
+ifeq ($(OSARCH),Linux)
+    EXTRA_CFLAGS += -D__BSD_VISIBLE
+    EXTRA_CFLAGS += -Werror
+    # Required by GCC 4.6
+    EXTRA_CFLAGS += -Wno-unused-but-set-variable
+endif
+ifeq ($(OSARCH),FreeBSD)
+    EXTRA_CFLAGS += -D__BSD_VISIBLE
+    EXTRA_CFLAGS += -Werror
+endif
+ifeq ($(OSARCH),Darwin)
+    EXTRA_CFLAGS += -D__BSD_VISIBLE
+    EXTRA_CFLAGS += -Werror
+endif
+
+ifeq ($(OSARCH),Windows)
+# we only support Cygwin and tcc as compilers.
+ifeq ($(WIN64),1)
+    EXTRA_CFLAGS += -D_X64EMU
+endif
+
+ifeq ($(TCC),) # cygwin
+    EXTRA_CFLAGS += -I/cygdrive/c/$(DDKDIR)/inc/ddk
+    EXTRA_CFLAGS += -I .
+    EXTRA_CFLAGS += -pipe -Wall
+else           #-- build with tcc
+    # TCC points to the root of tcc tree
+    CC=$(TCC)/tcc.exe
+    EXTRA_CFLAGS += -DTCC -I..
+    EXTRA_CFLAGS += -I$(TCC)/include/winapi -I$(TCC)/include
+    EXTRA_CFLAGS += -nostdinc
+
+    EFILES_. += err.h grp.h netdb.h pwd.h sysexits.h
+    EFILES_arpa += inet.h
+    EFILES_net += if.h
+    EFILES_netinet += in.h in_systm.h ip.h ip_icmp.h
+    EFILES_sys += cdefs.h wait.h ioctl.h socket.h
+
+endif
+    # EXTRA_CFLAGS += -D_WIN32 # see who defines it
+    EXTRA_CFLAGS += -Dsetsockopt=wnd_setsockopt
+    EXTRA_CFLAGS += -Dgetsockopt=wnd_getsockopt
+    EXTRA_CFLAGS += -DEMULATE_SYSCTL
+    EFILES_net += ethernet.h route.h
+    EFILES_netinet += ether.h icmp6.h
+    EFILES_sys += sysctl.h
+    TARGET = ipfw.exe
+ipfw: $(TARGET)
+endif # windows
+endif # !openwrt
+
+CFLAGS += $(EXTRA_CFLAGS)
+# Location of OS headers and libraries. After our stuff.
+USRDIR?= /usr
+ifeq ($(TCC),)
+    CFLAGS += -I$(USRDIR)/include
+    LDFLAGS += -L$(USRDIR)/lib
+else
+    LDFLAGS += -L. -L$(TCC)/lib -lws2_32
+endif
+
+OBJS = ipfw2.o dummynet.o main.o ipv6.o qsort_r.o
+OBJS += expand_number.o humanize_number.o glue.o
+
+# we don't use ALTQ
+CFLAGS += -DNO_ALTQ
+#OBJS += altq.o
+
+all: $(TARGET)
+       -@echo "Done build for $(OSARCH) VER $(VER)"
+
+$(TARGET): $(OBJS)
+       $(MSG) "   LD  $@"
+       $(HIDE)$(CC) $(LDFLAGS) -o $@ $^
+
+$(OBJS) : ipfw2.h ../glue.h include_e
+
+# support to create empty dirs and files in include_e/
+# EDIRS is the list of directories, EFILES is the list of files.
+EFILES_sys  += sockio.h
+EFILES_.    += libutil.h
+EFILES_netinet += __emtpy.h
+
+M ?= $(shell pwd)
+
+# first make a list of directories from variable names
+EDIRS= $(subst EFILES_,,$(filter EFILES_%,$(.VARIABLES)))
+# then prepend the directory name to individual files.
+#       $(empty) serves to interpret the following space literally,
+#       and the ":  = " substitution packs spaces into one.
+EFILES = $(foreach i,$(EDIRS),$(subst $(empty) , $(i)/, $(EFILES_$(i):  = )))
+
+include_e:
+       $(MSG) "building include_e in $M"
+       -@rm -rf $(M)/include_e opt_*
+       -@mkdir -p $(M)/include_e
+       -@(cd $(M)/include_e; mkdir -p $(EDIRS); touch $(EFILES) )
+       -@(cd $(M)/include_e/netinet; \
+               for i in ip_fw.h ip_dummynet.h tcp.h; do \
+               cp ../../../sys/netinet/$$i .; done; )
+
+clean distclean:
+       -@rm -rf $(OBJS) $(TARGET) include_e
+
+diff:
+       -@(diff -upr $(BSD_HEAD)/sbin/ipfw .)
+
diff --git a/ipfw/add_rules b/ipfw/add_rules
new file mode 100755 (executable)
index 0000000..f7866d7
--- /dev/null
@@ -0,0 +1,27 @@
+#!/bin/bash
+#
+# A test script to add rules
+
+PRG=./ipfw
+
+myfun() {
+       $PRG add 10 count icmp from any to 131.114.9.128
+       $PRG add 20 count icmp from 131.114.9.128 to any
+       $PRG add 20 count icmp from any to 131.114.9.130
+       $PRG add 30 count icmp from 131.114.9.130 to any
+       $PRG add 40 count icmp from any to 131.114.9.129
+       $PRG add 50 count icmp from 131.114.9.129 to any
+       $PRG add 60 count icmp from 131.114.9.236 to any
+       sleep 1
+       $PRG del 10
+       $PRG del 20
+       $PRG del 20
+       $PRG del 30
+       $PRG del 40
+       $PRG del 50
+       $PRG del 60
+}
+
+for ((i=0;i<100;i++)) ; do
+       myfun
+done
diff --git a/ipfw/dummynet.c b/ipfw/dummynet.c
new file mode 100644 (file)
index 0000000..15f00b6
--- /dev/null
@@ -0,0 +1,1459 @@
+/*
+ * Copyright (c) 2002-2003,2010 Luigi Rizzo
+ *
+ * Redistribution and use in source forms, with and without modification,
+ * are permitted provided that this entire comment appears intact.
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ *
+ * $FreeBSD: head/sbin/ipfw/dummynet.c 206843 2010-04-19 15:11:45Z luigi $
+ *
+ * dummynet support
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+/* XXX there are several sysctl leftover here */
+#include <sys/sysctl.h>
+
+#include "ipfw2.h"
+
+#include <ctype.h>
+#include <err.h>
+#include <errno.h>
+#include <libutil.h>
+#include <netdb.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+#include <arpa/inet.h> /* inet_ntoa */
+
+
+static struct _s_x dummynet_params[] = {
+       { "plr",                TOK_PLR },
+       { "noerror",            TOK_NOERROR },
+       { "buckets",            TOK_BUCKETS },
+       { "dst-ip",             TOK_DSTIP },
+       { "src-ip",             TOK_SRCIP },
+       { "dst-port",           TOK_DSTPORT },
+       { "src-port",           TOK_SRCPORT },
+       { "proto",              TOK_PROTO },
+       { "weight",             TOK_WEIGHT },
+       { "lmax",               TOK_LMAX },
+       { "maxlen",             TOK_LMAX },
+       { "all",                TOK_ALL },
+       { "mask",               TOK_MASK }, /* alias for both */
+       { "sched_mask",         TOK_SCHED_MASK },
+       { "flow_mask",          TOK_FLOW_MASK },
+       { "droptail",           TOK_DROPTAIL },
+       { "red",                TOK_RED },
+       { "gred",               TOK_GRED },
+       { "bw",                 TOK_BW },
+       { "bandwidth",          TOK_BW },
+       { "delay",              TOK_DELAY },
+       { "link",               TOK_LINK },
+       { "pipe",               TOK_PIPE },
+       { "queue",              TOK_QUEUE },
+       { "flowset",            TOK_FLOWSET },
+       { "sched",              TOK_SCHED },
+       { "pri",                TOK_PRI },
+       { "priority",           TOK_PRI },
+       { "type",               TOK_TYPE },
+       { "flow-id",            TOK_FLOWID},
+       { "dst-ipv6",           TOK_DSTIP6},
+       { "dst-ip6",            TOK_DSTIP6},
+       { "src-ipv6",           TOK_SRCIP6},
+       { "src-ip6",            TOK_SRCIP6},
+       { "profile",            TOK_PROFILE},
+       { "burst",              TOK_BURST},
+       { "dummynet-params",    TOK_NULL },
+       { NULL, 0 }     /* terminator */
+};
+
+#define O_NEXT(p, len) ((void *)((char *)(p) + (len))) /* address len bytes past p */
+
+static void
+oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
+{
+       oid->len = len;
+       oid->type = type;
+       oid->subtype = 0;
+       oid->id = id;
+}
+
+/* make room in the buffer and move the pointer forward */
+static void *
+o_next(struct dn_id **o, int len, int type)
+{
+       struct dn_id *ret = *o;
+       oid_fill(ret, len, type, 0);
+       *o = O_NEXT(*o, len);
+       return ret;
+}
+
+/* handle variable length structures moving back the pointer and fixing length */
+static void *
+o_compact(struct dn_id **o, int len, int real_length, int type)
+{
+        /* move back len bytes from *o, then rewrite the header there */
+        struct dn_id *ret = O_NEXT(*o, -len);
+
+        oid_fill(ret, real_length, type, 0);
+        *o = O_NEXT(ret, real_length);
+        return ret;
+}
+
+#if 0
+static int
+sort_q(void *arg, const void *pa, const void *pb)
+{
+       int rev = (co.do_sort < 0);
+       int field = rev ? -co.do_sort : co.do_sort;
+       long long res = 0;
+       const struct dn_flow_queue *a = pa;
+       const struct dn_flow_queue *b = pb;
+
+       switch (field) {
+       case 1: /* pkts */
+               res = a->len - b->len;
+               break;
+       case 2: /* bytes */
+               res = a->len_bytes - b->len_bytes;
+               break;
+
+       case 3: /* tot pkts */
+               res = a->tot_pkts - b->tot_pkts;
+               break;
+
+       case 4: /* tot bytes */
+               res = a->tot_bytes - b->tot_bytes;
+               break;
+       }
+       if (res < 0)
+               res = -1;
+       if (res > 0)
+               res = 1;
+       return (int)(rev ? res : -res);
+}
+#endif
+
+/* print a mask and header for the subsequent list of flows */
+static void
+print_mask(struct ipfw_flow_id *id)
+{
+       if (!IS_IP6_FLOW_ID(id)) {
+               printf("    "
+                   "mask: %s 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n",
+                   id->extra ? "queue," : "",
+                   id->proto,
+                   id->src_ip, id->src_port,
+                   id->dst_ip, id->dst_port);
+       } else {
+               char buf[255];
+               printf("\n        mask: %sproto: 0x%02x, flow_id: 0x%08x,  ",
+                   id->extra ? "queue," : "",
+                   id->proto, id->flow_id6);
+               inet_ntop(AF_INET6, &(id->src_ip6), buf, sizeof(buf));
+               printf("%s/0x%04x -> ", buf, id->src_port);
+               inet_ntop(AF_INET6, &(id->dst_ip6), buf, sizeof(buf));
+               printf("%s/0x%04x\n", buf, id->dst_port);
+       }
+}
+
+static void
+print_header(struct ipfw_flow_id *id)
+{
+       if (!IS_IP6_FLOW_ID(id))
+               printf("BKT Prot ___Source IP/port____ "
+                   "____Dest. IP/port____ "
+                   "Tot_pkt/bytes Pkt/Byte Drp\n");
+       else
+               printf("BKT ___Prot___ _flow-id_ "
+                   "______________Source IPv6/port_______________ "
+                   "_______________Dest. IPv6/port_______________ "
+                   "Tot_pkt/bytes Pkt/Byte Drp\n");
+}
+
+static void
+list_flow(struct dn_flow *ni, int *print)
+{
+       char buff[255];
+       struct protoent *pe = NULL;
+       struct in_addr ina;
+       struct ipfw_flow_id *id = &ni->fid;
+
+       if (*print) {
+               print_header(&ni->fid);
+               *print = 0;
+       }
+       pe = getprotobynumber(id->proto);
+               /* XXX: Should check for IPv4 flows */
+       printf("%3u%c", (ni->oid.id) & 0xff,
+               id->extra ? '*' : ' ');
+       if (!IS_IP6_FLOW_ID(id)) {
+               if (pe)
+                       printf("%-4s ", pe->p_name);
+               else
+                       printf("%4u ", id->proto);
+               ina.s_addr = htonl(id->src_ip);
+               printf("%15s/%-5d ",
+                   inet_ntoa(ina), id->src_port);
+               ina.s_addr = htonl(id->dst_ip);
+               printf("%15s/%-5d ",
+                   inet_ntoa(ina), id->dst_port);
+       } else {
+               /* Print IPv6 flows */
+               if (pe != NULL)
+                       printf("%9s ", pe->p_name);
+               else
+                       printf("%9u ", id->proto);
+               printf("%7d  %39s/%-5d ", id->flow_id6,
+                   inet_ntop(AF_INET6, &(id->src_ip6), buff, sizeof(buff)),
+                   id->src_port);
+               printf(" %39s/%-5d ",
+                   inet_ntop(AF_INET6, &(id->dst_ip6), buff, sizeof(buff)),
+                   id->dst_port);
+       }
+       pr_u64(&ni->tot_pkts, 4);
+       pr_u64(&ni->tot_bytes, 8);
+       printf("%2u %4u %3u\n",
+           ni->length, ni->len_bytes, ni->drops);
+}
+
+/* print flowset parameters; a non-empty prefix is printed and then cleared */
+static void
+print_flowset_parms(struct dn_fs *fs, char *prefix)
+{
+       int l = fs->qsize;
+       char qs[30];
+       char plr[30];
+       char red[90];   /* Display RED parameters */
+
+       if (fs->flags & DN_QSIZE_BYTES) {
+               if (l >= 8192)
+                       snprintf(qs, sizeof(qs), "%d KB", l / 1024);
+               else
+                       snprintf(qs, sizeof(qs), "%d B", l);
+       } else
+               snprintf(qs, sizeof(qs), "%3d sl.", l);
+       if (fs->plr)
+               snprintf(plr, sizeof(plr), "plr %f",
+                   1.0 * fs->plr / (double)(0x7fffffff));
+       else
+               plr[0] = '\0';
+
+       if (fs->flags & DN_IS_RED)      /* RED parameters */
+               snprintf(red, sizeof(red),
+                   "\n\t %cRED w_q %f min_th %d max_th %d max_p %f",
+                   (fs->flags & DN_IS_GENTLE_RED) ? 'G' : ' ',
+                   1.0 * fs->w_q / (double)(1 << SCALE_RED),
+                   fs->min_th, fs->max_th,
+                   1.0 * fs->max_p / (double)(1 << SCALE_RED));
+       else
+               snprintf(red, sizeof(red), "droptail");
+
+       if (prefix[0]) {
+           printf("%s %s%s %d queues (%d buckets) %s\n",
+               prefix, qs, plr, fs->oid.id, fs->buckets, red);
+           prefix[0] = '\0';   /* later calls with this buffer take the else branch */
+       } else {
+           printf("q%05d %s%s %d flows (%d buckets) sched %d "
+                       "weight %d lmax %d pri %d %s\n",
+               fs->fs_nr, qs, plr, fs->oid.id, fs->buckets,
+               fs->sched_nr, fs->par[0], fs->par[1], fs->par[2], red);
+           if (fs->flags & DN_HAVE_MASK)
+               print_mask(&fs->flow_mask);
+       }
+}
+
+/*
+ * Print name, loss fraction (loss_level divided by the number of
+ * samples) and sample count of a delay profile.
+ * Nothing is printed for a profile without samples.
+ */
+static void
+print_extra_delay_parms(struct dn_profile *p)
+{
+       if (p->samples_no > 0) {
+               double loss = p->loss_level;
+
+               loss /= p->samples_no;
+               printf("\t profile: name \"%s\" loss %f samples %d\n",
+                   p->name, loss, p->samples_no);
+       }
+}
+
+/*
+ * Print the pending text held in 'buf', followed by a newline, and
+ * mark the buffer as empty. An empty buffer prints nothing.
+ */
+static void
+flush_buf(char *buf)
+{
+       if (*buf == '\0')
+               return;
+       printf("%s\n", buf);
+       *buf = '\0';
+}
+
+/*
+ * generic list routine. We expect objects in a specific order, i.e.
+ * PIPES AND SCHEDULERS:
+ *     link; scheduler; internal flowset if any; instances
+ * we can tell a pipe from the number.
+ *
+ * FLOWSETS:
+ *     flowset; queues;
+ * link i (int queue); scheduler i; si(i) { flowsets() : queues }
+ *
+ * Walks the objects from 'oid' up to (not including) 'end', advancing
+ * by each object's own 'len' field, and prints a description chosen
+ * by oid->type. Exits through errx() if an object reports a length
+ * smaller than the dn_id header.
+ */
+static void
+list_pipes(struct dn_id *oid, struct dn_id *end)
+{
+    char buf[160];     /* pending buffer */
+    int toPrint = 1;   /* print header */
+
+    buf[0] = '\0';
+    for (; oid != end; oid = O_NEXT(oid, oid->len)) {
+       /* a shorter length could not even hold the header itself */
+       if (oid->len < sizeof(*oid))
+               errx(1, "invalid oid len %d\n", oid->len);
+
+       switch (oid->type) {
+       default:
+           flush_buf(buf);
+           printf("unrecognized object %d size %d\n", oid->type, oid->len);
+           break;
+       case DN_TEXT: /* list of attached flowsets */
+           {
+               int i, l;
+               /* overlay: header followed by an array of 32-bit ids */
+               struct {
+                       struct dn_id id;
+                       uint32_t p[0];
+               } *d = (void *)oid;
+               /* number of ids that fit after the header */
+               l = (oid->len - sizeof(*oid))/sizeof(d->p[0]);
+               if (l == 0)
+                   break;
+               printf("   Children flowsets: ");
+               for (i = 0; i < l; i++)
+                       printf("%u ", d->p[i]);
+               printf("\n");
+               break;
+           }
+       case DN_CMD_GET:
+           /* NOTE(review): the value labelled "len" is oid->id */
+           if (co.verbose)
+               printf("answer for cmd %d, len %d\n", oid->type, oid->id);
+           break;
+       case DN_SCH: {
+           struct dn_sch *s = (struct dn_sch *)oid;
+           flush_buf(buf);
+           /* the "active" count is taken from s->oid.id */
+           printf(" sched %d type %s flags 0x%x %d buckets %d active\n",
+                       s->sched_nr,
+                       s->name, s->flags, s->buckets, s->oid.id);
+           if (s->flags & DN_HAVE_MASK)
+               print_mask(&s->sched_mask);
+           }
+           break;
+
+       case DN_FLOW:
+           /* toPrint makes list_flow() emit the column header once */
+           list_flow((struct dn_flow *)oid, &toPrint);
+           break;
+
+       case DN_LINK: {
+           struct dn_link *p = (struct dn_link *)oid;
+           double b = p->bandwidth;
+           char bwbuf[30];
+           char burst[5 + 7];
+
+           /* This starts a new object so flush buffer */
+           flush_buf(buf);
+           /* data rate */
+           if (b == 0)
+               sprintf(bwbuf, "unlimited     ");
+           else if (b >= 1000000)
+               sprintf(bwbuf, "%7.3f Mbit/s", b/1000000);
+           else if (b >= 1000)
+               sprintf(bwbuf, "%7.3f Kbit/s", b/1000);
+           else
+               sprintf(bwbuf, "%7.3f bit/s ", b);
+
+           /* plain decimal burst if humanize fails or in verbose mode */
+           if (humanize_number(burst, sizeof(burst), p->burst,
+                   "", HN_AUTOSCALE, 0) < 0 || co.verbose)
+               sprintf(burst, "%d", (int)p->burst);
+           /*
+            * The line is only stored in buf here; it is printed by
+            * the flush_buf() at the bottom of the loop.
+            */
+           sprintf(buf, "%05d: %s %4d ms burst %s",
+               p->link_nr % DN_MAX_ID, bwbuf, p->delay, burst);
+           }
+           break;
+
+       case DN_FS:
+           /*
+            * buf is handed over as the prefix. NOTE(review): since
+            * buf is flushed at the end of every iteration it looks
+            * like it is always empty at this point - confirm.
+            */
+           print_flowset_parms((struct dn_fs *)oid, buf);
+           break;
+       case DN_PROFILE:
+           flush_buf(buf);
+           print_extra_delay_parms((struct dn_profile *)oid);
+       }
+       flush_buf(buf); // XXX does it really go here ?
+    }
+}
+
+/*
+ * Delete pipe, queue or scheduler i.
+ *
+ * do_pipe selects the kind of object: 1 is a link (pipe), 2 is a
+ * flowset (queue), any other value is a scheduler.
+ * Returns 0 on success; on failure prints a warning that names the
+ * object number and returns 1.
+ */
+int
+ipfw_delete_pipe(int do_pipe, int i)
+{
+       struct {
+               struct dn_id oid;
+               uintptr_t a[1]; /* add more if we want a list */
+       } cmd;
+
+       oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION);
+       cmd.oid.subtype = (do_pipe == 1) ? DN_LINK :
+               ( (do_pipe == 2) ? DN_FS : DN_SCH);
+       cmd.a[0] = i;
+       if (do_cmd(IP_DUMMYNET3, &cmd, cmd.oid.len) != 0) {
+               /*
+                * Warn before touching 'i': the message must carry the
+                * number of the object, not the return status.
+                */
+               warn("rule %u: setsockopt(IP_DUMMYNET_DEL)", (unsigned)i);
+               return 1;
+       }
+       return 0;
+}
+
+/*
+ * Code to parse delay profiles.
+ *
+ * Some link types introduce extra delays in the transmission
+ * of a packet, e.g. because of MAC level framing, contention on
+ * the use of the channel, MAC level retransmissions and so on.
+ * From our point of view, the channel is effectively unavailable
+ * for this extra time, which is constant or variable depending
+ * on the link type. Additionally, packets may be dropped after this
+ * time (e.g. on a wireless link after too many retransmissions).
+ * We can model the additional delay with an empirical curve
+ * that represents its distribution.
+ *
+ *      cumulative probability
+ *      1.0 ^
+ *          |
+ *      L   +-- loss-level          x
+ *          |                 ******
+ *          |                *
+ *          |           *****
+ *          |          *
+ *          |        **
+ *          |       *
+ *          +-------*------------------->
+ *                      delay
+ *
+ * The empirical curve may have both vertical and horizontal lines.
+ * Vertical lines represent constant delay for a range of
+ * probabilities; horizontal lines correspond to a discontinuity
+ * in the delay distribution: the link will use the largest delay
+ * for a given probability.
+ *
+ * To pass the curve to dummynet, we must store the parameters
+ * in a file as described below, and issue the command
+ *
+ *      ipfw pipe <n> config ... bw XXX profile <filename> ...
+ *
+ * The file format is the following, with whitespace acting as
+ * a separator and '#' indicating the beginning of a comment:
+ *
+ *     samples N
+ *             the number of samples used in the internal
+ *             representation (2..1024; default 100);
+ *
+ *     loss-level L
+ *             The probability above which packets are lost.
+ *            (0.0 <= L <= 1.0, default 1.0 i.e. no loss);
+ *
+ *     name identifier
+ *             An optional name (listed by "ipfw pipe show")
+ *             to identify the distribution;
+ *
+ *     "delay prob" | "prob delay"
+ *             One of these two lines is mandatory and defines
+ *             the format of the following lines with data points.
+ *
+ *     XXX YYY
+ *             2 or more lines representing points in the curve,
+ *             with either delay or probability first, according
+ *             to the chosen format.
+ *             The unit for delay is milliseconds.
+ *
+ * Data points do not need to be ordered, and their count does not
+ * need to match the number specified in the "samples" line. ipfw
+ * will sort and interpolate the curve as needed.
+ *
+ * Example of a profile file:
+
+       name    bla_bla_bla
+       samples 100
+       loss-level    0.86
+       prob    delay
+       0       200     # minimum overhead is 200ms
+       0.5     200
+       0.5     300
+       0.8     1000
+       0.9     1300
+       1       1300
+
+ * Internally, we will convert the curve to a fixed number of
+ * samples, and when it is time to transmit a packet we will
+ * model the extra delay as extra bits in the packet.
+ *
+ */
+
+/*
+ * Limits and keywords used when parsing a delay profile file.
+ * ED_MAX_LINE_LEN is parenthesized so that it expands safely inside
+ * any expression.
+ */
+#define ED_MAX_LINE_LEN        (256+ED_MAX_NAME_LEN)
+#define ED_TOK_SAMPLES "samples"
+#define ED_TOK_LOSS    "loss-level"
+#define ED_TOK_NAME    "name"
+#define ED_TOK_DELAY   "delay"
+#define ED_TOK_PROB    "prob"
+#define ED_TOK_BW      "bw"
+#define ED_SEPARATORS  " \t\n"         /* token separators on a line */
+#define ED_MIN_SAMPLES_NO      2
+
+/*
+ * returns 1 if s is a non-negative decimal number, i.e. it is made
+ * only of digits and at most one '.'; returns 0 otherwise.
+ * Strings without any digit ("" and ".") are accepted as well.
+ */
+static int
+is_valid_number(const char *s)
+{
+       int dots_found = 0;
+
+       for (; *s != '\0'; s++) {
+               /* isdigit() is undefined for negative char values */
+               if (isdigit((unsigned char)*s))
+                       continue;
+               if (*s != '.' || ++dots_found > 1)
+                       return 0;
+       }
+       return 1;
+}
+
+/*
+ * Take as input a string describing a bandwidth value
+ * and return the numeric bandwidth value.
+ * set clocking interface or bandwidth value
+ *
+ * If 'arg' starts with a lowercase letter it is treated as an
+ * interface name: it is copied into 'if_name' (a buffer of 'namelen'
+ * bytes, always NUL terminated) and *bandwidth is set to 0. The
+ * program exits if the caller supplied no usable buffer.
+ *
+ * Otherwise 'arg' is a number with an optional K/k or M/m multiplier
+ * (1000 and 1000000) and an optional unit; a byte unit multiplies the
+ * value by 8. The result is stored in *bandwidth and 'if_name', when
+ * given, is set to the empty string. Values that do not fit in
+ * *bandwidth cause the program to exit with EX_DATAERR.
+ *
+ * A warning is printed when *bandwidth is not -1 on entry.
+ */
+static void
+read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen)
+{
+       if (*bandwidth != -1)
+               warnx("duplicate token, override bandwidth value!");
+
+       if (arg[0] >= 'a' && arg[0] <= 'z') {
+               /* interface name */
+               if (!if_name || namelen <= 0) {
+                       errx(1, "no if support");
+               }
+               /* warn only if the name really does not fit */
+               if (strlen(arg) >= (size_t)namelen)
+                       warnx("interface name truncated");
+               strncpy(if_name, arg, namelen - 1);
+               if_name[namelen - 1] = '\0';
+               *bandwidth = 0;
+       } else {        /* read bandwidth value */
+               /* largest value accepted for *bandwidth */
+               const unsigned long long bw_max = 0x7fffffff;
+               unsigned long long bw;
+               char *end = NULL;
+
+               bw = strtoul(arg, &end, 0);
+               /*
+                * Reject oversized input before scaling, so that the
+                * multiplications below cannot wrap around.
+                */
+               if (bw > bw_max)
+                       errx(EX_DATAERR, "bandwidth too large");
+               if (*end == 'K' || *end == 'k') {
+                       end++;
+                       bw *= 1000;
+               } else if (*end == 'M' || *end == 'm') {
+                       end++;
+                       bw *= 1000000;
+               }
+               /* 'B' not followed by "it/s", or "bytes": value in bytes */
+               if ((*end == 'B' &&
+                       _substrcmp2(end, "Bi", "Bit/s") != 0) ||
+                   _substrcmp2(end, "by", "bytes") == 0)
+                       bw *= 8;
+
+               if (bw > bw_max)
+                       errx(EX_DATAERR, "bandwidth too large");
+
+               *bandwidth = (int)bw;
+               if (if_name)
+                       if_name[0] = '\0';
+       }
+}
+
+/* One point of the empirical delay curve. */
+struct point {
+       double prob;
+       double delay;
+};
+
+/*
+ * qsort() comparison function for struct point: order by increasing
+ * probability, then by increasing delay for equal probabilities.
+ * Returns -1, 0 or 1.
+ */
+static int
+compare_points(const void *vp1, const void *vp2)
+{
+       const struct point *a = vp1;
+       const struct point *b = vp2;
+       double diff = a->prob - b->prob;
+
+       /* same probability: fall back to the delay */
+       if (diff == 0)
+               diff = a->delay - b->delay;
+
+       return (diff > 0) - (diff < 0);
+}
+
+/*
+ * Expands to the leading arguments of an errx() call reporting a
+ * parse error: exit code, format string, file name and line number.
+ * The message is appended with the # operator, so a quoted literal
+ * passed as 's' shows up in the output with its quotes.
+ * Variables named 'filename' and 'lineno' must be in scope where the
+ * macro is used.
+ */
+#define ED_EFMT(s) EX_DATAERR,"error in %s at line %d: "#s,filename,lineno
+
+/*
+ * Interpolate a set of probability-value tuples.
+ *
+ * This function takes as input a set of <prob, value> tuples and
+ * samples the piecewise linear curve they describe.
+ *
+ * The user defined points are stored in the array p (which is sorted
+ * in place); their number is points_no.
+ * The requested number of samples is samples_no.
+ * The resulting samples are written to the "samples" array, which
+ * must have room for samples_no + 1 entries: one value for each
+ * probability i/samples_no, i = 0..samples_no-1, plus a final entry
+ * holding the delay at the end of the last segment used.
+ *
+ * The curve must reach every sampled probability, which is always
+ * the case when the last point has probability 1. If the points end
+ * earlier the program exits with EX_DATAERR instead of reading past
+ * the end of p.
+ *
+ * filename is used in error messages only.
+ */
+static void
+interpolate_samples(struct point *p, int points_no,
+               int *samples, int samples_no, const char *filename)
+{
+       double dy;              /* delta on the y axis */
+       double y;               /* current value of y */
+       double x;               /* current value of x */
+       double m;               /* the y slope */
+       int i;                  /* samples index */
+       int curr;               /* points current index */
+
+       /* make sure that there are enough points. */
+       /* XXX Duplicated should be removed */
+       if (points_no < 3)
+               errx(EX_DATAERR, "%s too few samples, need at least %d",
+                   filename, 3);
+
+       qsort(p, points_no, sizeof(struct point), compare_points);
+
+       dy = 1.0/samples_no;
+       y = 0;
+
+       for (i=0, curr = 0; i < samples_no; i++, y+=dy) {
+               /*
+                * Move curr to the segment that contains y, skipping
+                * the points with the same probability. p[curr+1] must
+                * stay inside the array, so give up if y lies beyond
+                * the last point.
+                */
+               while (y >= p[curr+1].prob) {
+                       if (curr + 2 >= points_no)
+                               errx(EX_DATAERR,
+                                   "%s: points do not reach probability 1",
+                                   filename);
+                       curr++;
+               }
+
+               /*
+                * compute the slope of the curve.
+                * NOTE: the two points have distinct probabilities here
+                * unless the first two points of the curve share the
+                * same probability greater than 0.
+                */
+               m = (p[curr+1].delay - p[curr].delay) /
+                   (p[curr+1].prob - p[curr].prob);
+               /* compute the x value starting from the current point */
+               x = p[curr].delay + (y - p[curr].prob) * m;
+               samples[i] = x;
+       }
+
+       /* add the last sample */
+       samples[i] = p[curr+1].delay;
+}
+
+/*
+ * Load a delay profile from a file.
+ *
+ * filename is the profile file, in the format described in the long
+ * comment above;
+ * p is the profile to fill in;
+ * link is the link (old pipe) the profile belongs to. Its number is
+ * copied into the profile, and a "bw" line in the file sets the
+ * bandwidth of both the link and the profile.
+ *
+ * Any error in the file terminates the program with a message that
+ * reports the file name and the line number. Missing "samples" and
+ * "loss-level" lines default to 100 and 1 (no loss), with a warning.
+ */
+static void
+load_extra_delays(const char *filename, struct dn_profile *p,
+       struct dn_link *link)
+{
+       char    line[ED_MAX_LINE_LEN];
+       FILE    *f;
+       int     lineno = 0;
+
+       int     samples = -1;
+       double  loss = -1.0;
+       char    profile_name[ED_MAX_NAME_LEN];
+       int     delay_first = -1;
+       int     do_points = 0;
+       struct point    points[ED_MAX_SAMPLES_NO];
+       int     points_no = 0;
+
+       /* XXX link never NULL? */
+       p->link_nr = link->link_nr;
+
+       profile_name[0] = '\0';
+       f = fopen(filename, "r");
+       if (f == NULL)
+               err(EX_UNAVAILABLE, "fopen: %s", filename);
+
+       while (fgets(line, sizeof(line), f)) {   /* read commands */
+               char *s, *cur = line, *name = NULL, *arg = NULL;
+
+               ++lineno;
+
+               /* parse the line: at most two tokens, '#' starts a comment */
+               while (cur) {
+                       s = strsep(&cur, ED_SEPARATORS);
+                       if (s == NULL || *s == '#')
+                               break;
+                       if (*s == '\0')
+                               continue;
+                       if (arg)
+                               errx(ED_EFMT("too many arguments"));
+                       if (name == NULL)
+                               name = s;
+                       else
+                               arg = s;
+               }
+
+               if ((name == NULL) || (*name == '#'))   /* empty line */
+                       continue;
+               if (arg == NULL)
+                       errx(ED_EFMT("missing arg for %s"), name);
+
+               if (!strcasecmp(name, ED_TOK_SAMPLES)) {
+                       if (samples > 0)
+                               errx(ED_EFMT("duplicate ``samples'' line"));
+                       samples = atoi(arg);
+                       if (samples <= 0)
+                               errx(ED_EFMT("invalid number of samples"));
+                       if (samples >= ED_MAX_SAMPLES_NO-1)
+                               errx(ED_EFMT("too many samples, maximum is %d"),
+                                   ED_MAX_SAMPLES_NO-1);
+                       do_points = 0;
+               } else if (!strcasecmp(name, ED_TOK_BW)) {
+                       char buf[IFNAMSIZ];
+                       read_bandwidth(arg, &link->bandwidth, buf, sizeof(buf));
+                       p->bandwidth = link->bandwidth;
+               } else if (!strcasecmp(name, ED_TOK_LOSS)) {
+                       if (loss != -1.0)
+                               errx(ED_EFMT("duplicated token: %s"), name);
+                       if (!is_valid_number(arg))
+                               errx(ED_EFMT("invalid %s"), arg);
+                       loss = atof(arg);
+                       if (loss > 1)
+                               errx(ED_EFMT("%s greater than 1.0"), name);
+                       do_points = 0;
+               } else if (!strcasecmp(name, ED_TOK_NAME)) {
+                       if (profile_name[0] != '\0')
+                               errx(ED_EFMT("duplicated token: %s"), name);
+                       strncpy(profile_name, arg, sizeof(profile_name) - 1);
+                       profile_name[sizeof(profile_name)-1] = '\0';
+                       do_points = 0;
+               } else if (!strcasecmp(name, ED_TOK_DELAY)) {
+                       /* "delay prob": delay is the first column */
+                       if (do_points)
+                               errx(ED_EFMT("duplicated token: %s"), name);
+                       delay_first = 1;
+                       do_points = 1;
+               } else if (!strcasecmp(name, ED_TOK_PROB)) {
+                       /* "prob delay": probability is the first column */
+                       if (do_points)
+                               errx(ED_EFMT("duplicated token: %s"), name);
+                       delay_first = 0;
+                       do_points = 1;
+               } else if (do_points) {
+                       if (!is_valid_number(name) || !is_valid_number(arg))
+                               errx(ED_EFMT("invalid point found"));
+                       /* never write past the end of points[] */
+                       if (points_no >= ED_MAX_SAMPLES_NO)
+                               errx(ED_EFMT("too many points, maximum is %d"),
+                                   ED_MAX_SAMPLES_NO);
+                       if (delay_first) {
+                               points[points_no].delay = atof(name);
+                               points[points_no].prob = atof(arg);
+                       } else {
+                               points[points_no].delay = atof(arg);
+                               points[points_no].prob = atof(name);
+                       }
+                       if (points[points_no].prob > 1.0)
+                               errx(ED_EFMT("probability greater than 1.0"));
+                       ++points_no;
+               } else {
+                       errx(ED_EFMT("unrecognised command '%s'"), name);
+               }
+       }
+
+       fclose (f);
+
+       if (samples == -1) {
+               warnx("'%s' not found, assuming 100", ED_TOK_SAMPLES);
+               samples = 100;
+       }
+
+       if (loss == -1.0) {
+               warnx("'%s' not found, assuming no loss", ED_TOK_LOSS);
+               loss = 1;
+       }
+
+       interpolate_samples(points, points_no, p->samples, samples, filename);
+
+       /*
+        * samples_no gets the requested count; because of the
+        * post-increment the loss level is scaled by that count plus one.
+        */
+       p->samples_no = samples++;
+       p->loss_level = loss * samples;
+       /* make sure the stored name is always NUL terminated */
+       strncpy(p->name, profile_name, sizeof(p->name) - 1);
+       p->name[sizeof(p->name) - 1] = '\0';
+}
+
+/*
+ * configuration of pipes, schedulers, flowsets.
+ * When we configure a new scheduler, an empty pipe is created, so:
+ *
+ * do_pipe = 1 -> "pipe N config ..." only for backward compatibility
+ *     sched N+Delta type fifo sched_mask ...
+ *     pipe N+Delta <parameters>
+ *     flowset N+Delta pipe N+Delta (no parameters)
+ *     sched N type wf2q+ sched_mask ...
+ *     pipe N <parameters>
+ *
+ * do_pipe = 2 -> flowset N config
+ *     flowset N parameters
+ *
+ * do_pipe = 3 -> sched N config
+ *     sched N parameters (default no pipe)
+ *     optional Pipe N config ...
+ * pipe ==>
+ */
+void
+ipfw_config_pipe(int ac, char **av)
+{
+       int i;
+       u_int j;
+       char *end;
+       void *par = NULL;
+       struct dn_id *buf, *base;
+       struct dn_sch *sch = NULL;
+       struct dn_link *p = NULL;
+       struct dn_fs *fs = NULL;
+       struct dn_profile *pf = NULL;
+       struct ipfw_flow_id *mask = NULL;
+       int lmax;
+       uint32_t _foo = 0, *flags = &_foo , *buckets = &_foo;
+       size_t max_pf_size = sizeof(struct dn_profile) + ED_MAX_SAMPLES_NO * sizeof(int);
+
+       /*
+        * allocate space for 1 header,
+        * 1 scheduler, 1 link, 1 flowset, 1 profile
+        */
+       lmax = sizeof(struct dn_id);    /* command header */
+       lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) +
+               sizeof(struct dn_fs);
+       lmax += max_pf_size;
+
+       av++; ac--;
+       /* Pipe number */
+       if (ac && isdigit(**av)) {
+               i = atoi(*av); av++; ac--;
+       } else
+               i = -1;
+       if (i <= 0)
+               errx(EX_USAGE, "need a pipe/flowset/sched number");
+       base = buf = safe_calloc(1, lmax);
+       /* all commands start with a 'CONFIGURE' and a version */
+       o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG);
+       base->id = DN_API_VERSION;
+
+       switch (co.do_pipe) {
+       case 1: /* "pipe N config ..." */
+               /* Allocate space for the WF2Q+ scheduler, its link
+                * and the FIFO flowset. Set the number, but leave
+                * the scheduler subtype and other parameters to 0
+                * so the kernel will use appropriate defaults.
+                * XXX todo: add a flag to record if a parameter
+                * is actually configured.
+                * If we do a 'pipe config' mask -> sched_mask.
+                * The FIFO scheduler and link are derived from the
+                * WF2Q+ one in the kernel.
+                */
+               sch = o_next(&buf, sizeof(*sch), DN_SCH);
+               p = o_next(&buf, sizeof(*p), DN_LINK);
+               fs = o_next(&buf, sizeof(*fs), DN_FS);
+
+               sch->sched_nr = i;
+               sch->oid.subtype = 0;   /* defaults to WF2Q+ */
+               mask = &sch->sched_mask;
+               flags = &sch->flags;
+               buckets = &sch->buckets;
+               *flags |= DN_PIPE_CMD;
+
+               p->link_nr = i;
+
+               /* This flowset is only for the FIFO scheduler */
+               fs->fs_nr = i + 2*DN_MAX_ID;
+               fs->sched_nr = i + DN_MAX_ID;
+               break;
+
+       case 2: /* "queue N config ... " */
+               fs = o_next(&buf, sizeof(*fs), DN_FS);
+               fs->fs_nr = i;
+               mask = &fs->flow_mask;
+               flags = &fs->flags;
+               buckets = &fs->buckets;
+               break;
+
+       case 3: /* "sched N config ..." */
+               sch = o_next(&buf, sizeof(*sch), DN_SCH);
+               fs = o_next(&buf, sizeof(*fs), DN_FS);
+               sch->sched_nr = i;
+               mask = &sch->sched_mask;
+               flags = &sch->flags;
+               buckets = &sch->buckets;
+               /* fs is used only with !MULTIQUEUE schedulers */
+               fs->fs_nr = i + DN_MAX_ID;
+               fs->sched_nr = i;
+               break;
+       }
+       /* set to -1 those fields for which we want to reuse existing
+        * values from the kernel.
+        * Also, *_nr and subtype = 0 mean reuse the value from the kernel.
+        * XXX todo: support reuse of the mask.
+        */
+       if (p)
+               p->bandwidth = -1;
+       for (j = 0; j < sizeof(fs->par)/sizeof(fs->par[0]); j++)
+               fs->par[j] = -1;
+       while (ac > 0) {
+               double d;
+               int tok = match_token(dummynet_params, *av);
+               ac--; av++;
+
+               switch(tok) {
+               case TOK_NOERROR:
+                       NEED(fs, "noerror is only for pipes");
+                       fs->flags |= DN_NOERROR;
+                       break;
+
+               case TOK_PLR:
+                       NEED(fs, "plr is only for pipes");
+                       NEED1("plr needs argument 0..1\n");
+                       d = strtod(av[0], NULL);
+                       if (d > 1)
+                               d = 1;
+                       else if (d < 0)
+                               d = 0;
+                       fs->plr = (int)(d*0x7fffffff);
+                       ac--; av++;
+                       break;
+
+               case TOK_QUEUE:
+                       NEED(fs, "queue is only for pipes or flowsets");
+                       NEED1("queue needs queue size\n");
+                       end = NULL;
+                       fs->qsize = strtoul(av[0], &end, 0);
+                       if (*end == 'K' || *end == 'k') {
+                               fs->flags |= DN_QSIZE_BYTES;
+                               fs->qsize *= 1024;
+                       } else if (*end == 'B' ||
+                           _substrcmp2(end, "by", "bytes") == 0) {
+                               fs->flags |= DN_QSIZE_BYTES;
+                       }
+                       ac--; av++;
+                       break;
+
+               case TOK_BUCKETS:
+                       NEED(fs, "buckets is only for pipes or flowsets");
+                       NEED1("buckets needs argument\n");
+                       *buckets = strtoul(av[0], NULL, 0);
+                       ac--; av++;
+                       break;
+
+               case TOK_FLOW_MASK:
+               case TOK_SCHED_MASK:
+               case TOK_MASK:
+                       NEED(mask, "tok_mask");
+                       NEED1("mask needs mask specifier\n");
+                       /*
+                        * per-flow queue, mask is dst_ip, dst_port,
+                        * src_ip, src_port, proto measured in bits
+                        */
+                       par = NULL;
+
+                       bzero(mask, sizeof(*mask));
+                       end = NULL;
+
+                       while (ac >= 1) {
+                           uint32_t *p32 = NULL;
+                           uint16_t *p16 = NULL;
+                           uint32_t *p20 = NULL;
+                           struct in6_addr *pa6 = NULL;
+                           uint32_t a;
+
+                           tok = match_token(dummynet_params, *av);
+                           ac--; av++;
+                           switch(tok) {
+                           case TOK_ALL:
+                                   /*
+                                    * special case, all bits significant
+                                    * except 'extra' (the queue number)
+                                    */
+                                   mask->dst_ip = ~0;
+                                   mask->src_ip = ~0;
+                                   mask->dst_port = ~0;
+                                   mask->src_port = ~0;
+                                   mask->proto = ~0;
+                                   n2mask(&mask->dst_ip6, 128);
+                                   n2mask(&mask->src_ip6, 128);
+                                   mask->flow_id6 = ~0;
+                                   *flags |= DN_HAVE_MASK;
+                                   goto end_mask;
+
+                           case TOK_QUEUE:
+                                   mask->extra = ~0;
+                                   *flags |= DN_HAVE_MASK;
+                                   goto end_mask;
+
+                           case TOK_DSTIP:
+                                   mask->addr_type = 4;
+                                   p32 = &mask->dst_ip;
+                                   break;
+
+                           case TOK_SRCIP:
+                                   mask->addr_type = 4;
+                                   p32 = &mask->src_ip;
+                                   break;
+
+                           case TOK_DSTIP6:
+                                   mask->addr_type = 6;
+                                   pa6 = &mask->dst_ip6;
+                                   break;
+
+                           case TOK_SRCIP6:
+                                   mask->addr_type = 6;
+                                   pa6 = &mask->src_ip6;
+                                   break;
+
+                           case TOK_FLOWID:
+                                   mask->addr_type = 6;
+                                   p20 = &mask->flow_id6;
+                                   break;
+
+                           case TOK_DSTPORT:
+                                   p16 = &mask->dst_port;
+                                   break;
+
+                           case TOK_SRCPORT:
+                                   p16 = &mask->src_port;
+                                   break;
+
+                           case TOK_PROTO:
+                                   break;
+
+                           default:
+                                   ac++; av--; /* backtrack */
+                                   goto end_mask;
+                           }
+                           if (ac < 1)
+                                   errx(EX_USAGE, "mask: value missing");
+                           if (*av[0] == '/') {
+                                   a = strtoul(av[0]+1, &end, 0);
+                                   if (pa6 == NULL)
+                                           a = (a == 32) ? ~0 : (1 << a) - 1;
+                           } else
+                                   a = strtoul(av[0], &end, 0);
+                           if (p32 != NULL)
+                                   *p32 = a;
+                           else if (p16 != NULL) {
+                                   if (a > 0xFFFF)
+                                           errx(EX_DATAERR,
+                                               "port mask must be 16 bit");
+                                   *p16 = (uint16_t)a;
+                           } else if (p20 != NULL) {
+                                   if (a > 0xfffff)
+                                       errx(EX_DATAERR,
+                                           "flow_id mask must be 20 bit");
+                                   *p20 = (uint32_t)a;
+                           } else if (pa6 != NULL) {
+                                   if (a > 128)
+                                       errx(EX_DATAERR,
+                                           "in6addr invalid mask len");
+                                   else
+                                       n2mask(pa6, a);
+                           } else {
+                                   if (a > 0xFF)
+                                           errx(EX_DATAERR,
+                                               "proto mask must be 8 bit");
+                                   mask->proto = (uint8_t)a;
+                           }
+                           if (a != 0)
+                                   *flags |= DN_HAVE_MASK;
+                           ac--; av++;
+                       } /* end while, config masks */
+end_mask:
+                       break;
+
+               case TOK_RED:
+               case TOK_GRED:
+                       NEED1("red/gred needs w_q/min_th/max_th/max_p\n");
+                       fs->flags |= DN_IS_RED;
+                       if (tok == TOK_GRED)
+                               fs->flags |= DN_IS_GENTLE_RED;
+                       /*
+                        * the format for parameters is w_q/min_th/max_th/max_p
+                        */
+                       if ((end = strsep(&av[0], "/"))) {
+                           double w_q = strtod(end, NULL);
+                           if (w_q > 1 || w_q <= 0)
+                               errx(EX_DATAERR, "0 < w_q <= 1");
+                           fs->w_q = (int) (w_q * (1 << SCALE_RED));
+                       }
+                       if ((end = strsep(&av[0], "/"))) {
+                           fs->min_th = strtoul(end, &end, 0);
+                           if (*end == 'K' || *end == 'k')
+                               fs->min_th *= 1024;
+                       }
+                       if ((end = strsep(&av[0], "/"))) {
+                           fs->max_th = strtoul(end, &end, 0);
+                           if (*end == 'K' || *end == 'k')
+                               fs->max_th *= 1024;
+                       }
+                       if ((end = strsep(&av[0], "/"))) {
+                           double max_p = strtod(end, NULL);
+                           if (max_p > 1 || max_p <= 0)
+                               errx(EX_DATAERR, "0 < max_p <= 1");
+                           fs->max_p = (int)(max_p * (1 << SCALE_RED));
+                       }
+                       ac--; av++;
+                       break;
+
+               case TOK_DROPTAIL:
+                       NEED(fs, "droptail is only for flowsets");
+                       fs->flags &= ~(DN_IS_RED|DN_IS_GENTLE_RED);
+                       break;
+
+               case TOK_BW:
+                       NEED(p, "bw is only for links");
+                       NEED1("bw needs bandwidth or interface\n");
+                       read_bandwidth(av[0], &p->bandwidth, NULL, 0);
+                       ac--; av++;
+                       break;
+
+               case TOK_DELAY:
+                       NEED(p, "delay is only for links");
+                       NEED1("delay needs argument 0..10000ms\n");
+                       p->delay = strtoul(av[0], NULL, 0);
+                       ac--; av++;
+                       break;
+
+               case TOK_TYPE: {
+                       int l;
+                       NEED(sch, "type is only for schedulers");
+                       NEED1("type needs a string");
+                       l = strlen(av[0]);
+                       if (l == 0 || l > 15)
+                               errx(1, "type %s too long\n", av[0]);
+                       strcpy(sch->name, av[0]);
+                       sch->oid.subtype = 0; /* use string */
+                       ac--; av++;
+                       break;
+                   }
+
+               case TOK_WEIGHT:
+                       NEED(fs, "weight is only for flowsets");
+                       NEED1("weight needs argument\n");
+                       fs->par[0] = strtol(av[0], &end, 0);
+                       ac--; av++;
+                       break;
+
+               case TOK_LMAX:
+                       NEED(fs, "lmax is only for flowsets");
+                       NEED1("lmax needs argument\n");
+                       fs->par[1] = strtol(av[0], &end, 0);
+                       ac--; av++;
+                       break;
+
+               case TOK_PRI:
+                       NEED(fs, "priority is only for flowsets");
+                       NEED1("priority needs argument\n");
+                       fs->par[2] = strtol(av[0], &end, 0);
+                       ac--; av++;
+                       break;
+
+               case TOK_SCHED:
+               case TOK_PIPE:
+                       NEED(fs, "pipe/sched");
+                       NEED1("pipe/link/sched needs number\n");
+                       fs->sched_nr = strtoul(av[0], &end, 0);
+                       ac--; av++;
+                       break;
+
+               case TOK_PROFILE:
+                   {
+                       size_t real_length;
+
+                       NEED((!pf), "profile already set");
+                       NEED(p, "profile");
+                       NEED1("extra delay needs the file name\n");
+
+                       /* load the profile structure using the DN_API */
+                       pf = o_next(&buf, max_pf_size, DN_PROFILE);
+                       load_extra_delays(av[0], pf, p); //XXX can't fail?
+
+                       /* compact the dn_id structure */
+                       real_length = sizeof(struct dn_profile) +
+                               pf->samples_no * sizeof(int);
+                       o_compact(&buf, max_pf_size, real_length, DN_PROFILE);
+                       --ac; ++av;
+                   }
+                       break;
+
+               case TOK_BURST:
+                       NEED(p, "burst");
+                       NEED1("burst needs argument\n");
+                       errno = 0;
+                       if (expand_number(av[0], (int64_t *)&p->burst) < 0)
+                               if (errno != ERANGE)
+                                       errx(EX_DATAERR,
+                                           "burst: invalid argument");
+                       if (errno || p->burst > (1ULL << 48) - 1)
+                               errx(EX_DATAERR,
+                                   "burst: out of range (0..2^48-1)");
+                       ac--; av++;
+                       break;
+
+               default:
+                       errx(EX_DATAERR, "unrecognised option ``%s''", av[-1]);
+               }
+       }
+
+       /* check validity of parameters */
+       if (p) {
+               if (p->delay > 10000)
+                       errx(EX_DATAERR, "delay must be < 10000");
+               if (p->bandwidth == -1)
+                       p->bandwidth = 0;
+       }
+       if (fs) {
+               /* XXX accept a 0 scheduler to keep the default */
+           if (fs->flags & DN_QSIZE_BYTES) {
+               size_t len;
+               long limit;
+
+               len = sizeof(limit);
+               if (sysctlbyname("net.inet.ip.dummynet.pipe_byte_limit",
+                       &limit, &len, NULL, 0) == -1)
+                       limit = 1024*1024;
+               if (fs->qsize > limit)
+                       errx(EX_DATAERR, "queue size must be < %ldB", limit);
+           } else {
+               size_t len;
+               long limit;
+
+               len = sizeof(limit);
+               if (sysctlbyname("net.inet.ip.dummynet.pipe_slot_limit",
+                       &limit, &len, NULL, 0) == -1)
+                       limit = 100;
+               if (fs->qsize > limit)
+                       errx(EX_DATAERR, "2 <= queue size <= %ld", limit);
+           }
+
+           if (fs->flags & DN_IS_RED) {
+               size_t len;
+               int lookup_depth, avg_pkt_size;
+               double w_q;
+
+               if (fs->min_th >= fs->max_th)
+                   errx(EX_DATAERR, "min_th %d must be < than max_th %d",
+                       fs->min_th, fs->max_th);
+               if (fs->max_th == 0)
+                   errx(EX_DATAERR, "max_th must be > 0");
+
+               len = sizeof(int);
+               if (sysctlbyname("net.inet.ip.dummynet.red_lookup_depth",
+                       &lookup_depth, &len, NULL, 0) == -1)
+                       lookup_depth = 256;
+               if (lookup_depth == 0)
+                   errx(EX_DATAERR, "net.inet.ip.dummynet.red_lookup_depth"
+                       " must be greater than zero");
+
+               len = sizeof(int);
+               if (sysctlbyname("net.inet.ip.dummynet.red_avg_pkt_size",
+                       &avg_pkt_size, &len, NULL, 0) == -1)
+                       avg_pkt_size = 512;
+
+               if (avg_pkt_size == 0)
+                       errx(EX_DATAERR,
+                           "net.inet.ip.dummynet.red_avg_pkt_size must"
+                           " be greater than zero");
+
+               /*
+                * Ticks needed for sending a medium-sized packet.
+                * Unfortunately, when we are configuring a WF2Q+ queue, we
+                * do not have bandwidth information, because that is stored
+                * in the parent pipe, and also we have multiple queues
+                * competing for it. So we set s=0, which is not very
+                * correct. But on the other hand, why do we want RED with
+                * WF2Q+ ?
+                */
+#if 0
+               if (p.bandwidth==0) /* this is a WF2Q+ queue */
+                       s = 0;
+               else
+                       s = (double)ck.hz * avg_pkt_size * 8 / p.bandwidth;
+#endif
+               /*
+                * max idle time (in ticks) before avg queue size becomes 0.
+                * NOTA:  (3/w_q) is approx the value x so that
+                * (1-w_q)^x < 10^-3.
+                */
+               w_q = ((double)fs->w_q) / (1 << SCALE_RED);
+#if 0 // go in kernel
+               idle = s * 3. / w_q;
+               fs->lookup_step = (int)idle / lookup_depth;
+               if (!fs->lookup_step)
+                       fs->lookup_step = 1;
+               weight = 1 - w_q;
+               for (t = fs->lookup_step; t > 1; --t)
+                       weight *= 1 - w_q;
+               fs->lookup_weight = (int)(weight * (1 << SCALE_RED));
+#endif /* code moved in the kernel */
+           }
+       }
+
+       i = do_cmd(IP_DUMMYNET3, base, (char *)buf - (char *)base);
+
+       if (i)
+               err(1, "setsockopt(%s)", "IP_DUMMYNET_CONFIGURE");
+}
+
+/*
+ * Issue a DN_CMD_FLUSH request through do_cmd(IP_DUMMYNET3, ...).
+ * The request consists of a bare struct dn_id header; the result of
+ * do_cmd() is not inspected.
+ */
+void
+dummynet_flush(void)
+{
+       struct dn_id flush_hdr;
+
+       oid_fill(&flush_hdr, sizeof(flush_hdr), DN_CMD_FLUSH, DN_API_VERSION);
+       do_cmd(IP_DUMMYNET3, &flush_hdr, flush_hdr.len);
+}
+
+/*
+ * Parse input for 'ipfw [pipe|sched|queue] show [range list]'.
+ *
+ * Each string in av[] is a comma-separated list of entries; an entry is
+ * either a single number 'n' (handled as the range n-n) or a range
+ * 'n1-n2'.  Parsing stops at the first NULL entry of av[].
+ *
+ * Accepted ranges are stored as (low, high) pairs in the array v, which
+ * has room for len uint32_t values.  If v is NULL or len < 2 the ranges
+ * are only counted and nothing is stored.  When co.do_pipe == 1 both
+ * ends of every stored range are offset by DN_MAX_ID.
+ *
+ * Entries with high < low or high >= DN_MAX_ID-1 are silently skipped
+ * (since low <= high at that point, low is bounded as well).  An
+ * unexpected character produces a warning, the entry it belongs to is
+ * discarded, and parsing resumes right after that character within the
+ * same argument.
+ *
+ * NOTE(review): an empty entry (e.g. after a trailing comma) parses as
+ * the range 0-0 and is counted; left unchanged here.
+ *
+ * Returns the number of accepted ranges.
+ */
+static int
+parse_range(int ac, char *av[], uint32_t *v, int len)
+{
+       int n = 0;
+       char *endptr, *s;
+       uint32_t base[2];       /* scratch pair used when only counting */
+
+       if (v == NULL || len < 2) {
+               v = base;
+               len = 2;
+       }
+
+       for (s = *av; s != NULL; av++, ac--) {
+               v[0] = strtoul(s, &endptr, 10);
+               v[1] = (*endptr != '-') ? v[0] :
+                        strtoul(endptr+1, &endptr, 10);
+               if (*endptr == '\0') { /* prepare for next round */
+                       s = (ac > 0) ? *(av+1) : NULL;
+               } else {
+                       int bad = (*endptr != ',');
+
+                       /* warnx: errno carries no meaning at this point */
+                       if (bad)
+                               warnx("invalid number: %s", s);
+                       /*
+                        * Keep working on the same argument, right after
+                        * the separator or the offending character: undo
+                        * in advance the av++/ac-- done by the for loop,
+                        * otherwise the next argument would be skipped.
+                        */
+                       s = ++endptr;
+                       ac++;
+                       av--;
+                       if (bad)
+                               continue;       /* entry is discarded */
+               }
+               if (v[1] < v[0] || v[1] >= DN_MAX_ID-1)
+                       continue; /* invalid entry */
+               n++;
+               /* translate if 'pipe list' */
+               if (co.do_pipe == 1) {
+                       v[0] += DN_MAX_ID;
+                       v[1] += DN_MAX_ID;
+               }
+               /* move to the next slot, or to the scratch pair when full */
+               v = (n*2 < len) ? v + 2 : base;
+       }
+       return n;
+}
+
+/* main entry point for dummynet list functions. co.do_pipe indicates
+ * which function we want to support (1: pipes, 2: queues, 3: schedulers,
+ * as selected by the switch in the body).
+ * av may contain filtering arguments, either individual entries
+ * or ranges, or lists (space or commas are valid separators).
+ * Format for a range can be n1-n2 or n3 n4 n5 ...
+ * In a range n1 must be <= n2, otherwise the range is ignored.
+ * A number 'n4' is translated into the range 'n4-n4'.
+ * All numbers must be > 0 and < DN_MAX_ID-1
+ * show_counters is currently ignored.
+ */
+void
+dummynet_list(int ac, char *av[], int show_counters)
+{
+       struct dn_id *oid, *x = NULL;   /* oid: request; x: reply buffer */
+       int ret, i;
+       int n;          /* # of ranges */
+       u_int buflen, l;        /* l is handed to do_cmd() by address */
+       u_int max_size; /* largest obj passed up */
+
+       (void)show_counters;    // XXX unused, but we should use it.
+       ac--;
+       av++;           /* skip 'list' | 'show' word */
+
+       n = parse_range(ac, av, NULL, 0);       /* Count # of ranges. */
+
+       /* Allocate space to store the request header followed by n ranges */
+       l = sizeof(*oid) + sizeof(uint32_t) * n * 2;
+       oid = safe_calloc(1, l);
+       oid_fill(oid, l, DN_CMD_GET, DN_API_VERSION);
+
+       if (n > 0)      /* store ranges in idx */
+               parse_range(ac, av, (uint32_t *)(oid + 1), n*2);
+       /*
+        * Compute the size of the largest object returned. If the
+        * response leaves at least this much spare space in the
+        * buffer, then surely the response is complete; otherwise
+        * there might be a risk of truncation and we will need to
+        * retry with a larger buffer.
+        * XXX don't bother with smaller structs.
+        */
+       max_size = sizeof(struct dn_fs);
+       if (max_size < sizeof(struct dn_sch))
+               max_size = sizeof(struct dn_sch);
+       if (max_size < sizeof(struct dn_flow))
+               max_size = sizeof(struct dn_flow);
+
+       switch (co.do_pipe) {   /* other values leave subtype as set above */
+       case 1:
+               oid->subtype = DN_LINK; /* list pipe */
+               break;
+       case 2:
+               oid->subtype = DN_FS;   /* list queue */
+               break;
+       case 3:
+               oid->subtype = DN_SCH;  /* list sched */
+               break;
+       }
+
+       /*
+        * Ask the kernel for an estimate of the required space (result
+        * in oid.id), unless we are requesting a subset of objects,
+        * in which case the kernel does not give an exact answer.
+        * In any case, space might grow in the meantime due to the
+        * creation of new queues, so we must be prepared to retry.
+        */
+       if (n > 0) {
+               buflen = 4*1024;
+       } else {
+               ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l);
+               if (ret != 0 || oid->id <= sizeof(*oid))
+                       goto done;
+               buflen = oid->id + max_size;
+               oid->len = sizeof(*oid); /* restore */
+       }
+       /*
+        * Try a few times (at most 20), until the buffer fits.
+        * If all attempts are used up, the last response is still
+        * handed to list_pipes() below.
+        */
+       for (i = 0; i < 20; i++) {
+               l = buflen;
+               x = safe_realloc(x, l);
+               bcopy(oid, x, oid->len);        /* request goes at the start of x */
+               ret = do_cmd(-IP_DUMMYNET3, x, (uintptr_t)&l);
+               if (ret != 0 || x->id <= sizeof(*oid))
+                       goto done; /* no response */
+               if (l + max_size <= buflen)
+                       break; /* ok */
+               buflen *= 2;     /* double for next attempt */
+       }
+       list_pipes(x, O_NEXT(x, l));
+done:
+       if (x)
+               free(x);
+       free(oid);
+}
diff --git a/ipfw/expand_number.c b/ipfw/expand_number.c
new file mode 100644 (file)
index 0000000..d557111
--- /dev/null
@@ -0,0 +1,100 @@
+/*-
+ * Copyright (c) 2007 Eric Anderson <anderson@FreeBSD.org>
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+// #include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/lib/libutil/expand_number.c,v 1.2.4.2 2009/06/10 14:52:34 des Exp $");
+
+#include <sys/types.h>
+#include <ctype.h>
+#include <errno.h>
+#include <inttypes.h>
+//#include <libutil.h>
+#include <stdint.h>
+
+/*
+ * Convert an expression of the following forms to a int64_t.
+ *     1) A number.
+ *     2) A number followed by a 'b' or 'B' (mult by 1).
+ *     3) A number followed by a 'k' or 'K' (mult by 1 << 10).
+ *     4) A number followed by a 'm' or 'M' (mult by 1 << 20).
+ *     5) A number followed by a 'g' or 'G' (mult by 1 << 30).
+ *     6) A number followed by a 't' or 'T' (mult by 1 << 40).
+ *     7) A number followed by a 'p' or 'P' (mult by 1 << 50).
+ *     8) A number followed by a 'e' or 'E' (mult by 1 << 60).
+ *
+ * The number is parsed by strtoimax() with base 0.  Only the first
+ * character after the number is examined as a unit.
+ *
+ * Returns 0 and stores the result in *num on success.  On failure
+ * returns -1, leaves *num untouched and sets errno to EINVAL (no
+ * digits, or unrecognized unit) or ERANGE (the scaled value does not
+ * fit in an int64_t).
+ */
+int
+expand_number(const char *buf, int64_t *num)
+{
+       static const char unit[] = "bkmgtpe";
+       char *endptr, s;
+       int64_t number;
+       int i;
+
+       number = strtoimax(buf, &endptr, 0);
+
+       if (endptr == buf) {
+               /* No valid digits. */
+               errno = EINVAL;
+               return (-1);
+       }
+
+       if (*endptr == '\0') {
+               /* No unit. */
+               *num = number;
+               return (0);
+       }
+
+       /* <ctype.h> functions must be given an unsigned char value */
+       s = tolower((unsigned char)*endptr);
+
+       /* The index of the unit in unit[] is the number of 1024 factors. */
+       for (i = 0; unit[i] != '\0' && unit[i] != s; i++)
+               ;
+       if (unit[i] == '\0') {
+               /* Unrecognized unit. */
+               errno = EINVAL;
+               return (-1);
+       }
+
+       for (; i > 0; i--) {
+               /*
+                * Check the bounds before scaling: overflowing a signed
+                * value (by shift or multiplication) is undefined, and
+                * comparing after the fact misses some overflows.
+                */
+               if (number > INT64_MAX / 1024 || number < INT64_MIN / 1024) {
+                       errno = ERANGE;
+                       return (-1);
+               }
+               number *= 1024;
+       }
+
+       *num = number;
+       return (0);
+}
diff --git a/ipfw/glue.c b/ipfw/glue.c
new file mode 100644 (file)
index 0000000..a3ef641
--- /dev/null
@@ -0,0 +1,852 @@
+/*
+ * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id: glue.c 12264 2013-04-27 20:21:06Z luigi $
+ *
+ * Userland functions missing in linux/Windows
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#ifdef _WIN32
+#include <netdb.h>
+#include <windows.h>
+#endif /* _WIN32 */
+
+#ifndef HAVE_NAT
+/* dummy nat functions */
+/* Stand-in used when HAVE_NAT is not defined: only reports on stderr. */
+void
+ipfw_show_nat(int ac, char **av)
+{
+       /* the arguments are accepted but not used */
+       (void)ac;
+       (void)av;
+
+       fprintf(stderr, "%s unsupported\n", __func__);
+}
+
+/* Stand-in used when HAVE_NAT is not defined: only reports on stderr. */
+void
+ipfw_config_nat(int ac, char **av)
+{
+       /* the arguments are accepted but not used */
+       (void)ac;
+       (void)av;
+
+       fprintf(stderr, "%s unsupported\n", __func__);
+}
+#endif
+
+#ifdef __linux__
+int optreset;  /* missing in linux */
+#endif
+
+/*
+ * not implemented in linux.
+ * Same contract as /usr/src/lib/libc/string/strlcpy.c:
+ * copy at most siz-1 bytes of src into dst and NUL-terminate dst
+ * (unless siz is 0, in which case dst is not touched).
+ * Returns the length of src, so truncation occurred iff the
+ * return value is >= siz.
+ */
+size_t
+strlcpy(char *dst, const char *src, size_t siz)
+{
+        size_t pos = 0;
+
+        if (siz != 0) {
+                /* copy while there is room left for the final NUL */
+                while (pos + 1 < siz && src[pos] != '\0') {
+                        dst[pos] = src[pos];
+                        pos++;
+                }
+                dst[pos] = '\0';
+        }
+
+        /* walk the part of src that did not fit, to measure it */
+        while (src[pos] != '\0')
+                pos++;
+
+        return(pos);    /* count does not include NUL */
+}
+
+
+/*
+ * missing in linux and windows
+ *
+ * Replacement for the BSD strtonum(): convert nptr to a number that
+ * must lie in [minval, maxval].
+ *
+ * We accept only a string that represents exactly a number (i.e. with
+ * no trailing characters).  Unlike the BSD version the conversion is
+ * done with base 0, so hexadecimal and octal prefixes are accepted.
+ *
+ * On success the value is returned, *errstr is set to NULL and errno
+ * is left as it was on entry.  On failure 0 is returned, *errstr
+ * points to "invalid", "too small" or "too large" and errno is set to
+ * EINVAL or ERANGE.  errstr may be NULL if the caller does not want
+ * the description.
+ */
+long long int
+strtonum(const char *nptr, long long minval, long long maxval,
+         const char **errstr)
+{
+       long long ret;
+       char *end;
+       const char *msg = NULL;
+       int errno_c = errno;    /* save actual errno */
+
+       errno = 0;
+#ifdef TCC
+       ret = strtol(nptr, &end, 0);
+#else
+       ret = strtoll(nptr, &end, 0);
+#endif
+       if (minval > maxval || end == nptr || *end != '\0' ||
+           (errno != 0 && errno != ERANGE)) {
+               msg = "invalid";
+               errno = EINVAL;
+       } else if (ret < minval || (errno == ERANGE && ret < 0)) {
+               msg = "too small";
+               errno = ERANGE;
+       } else if (ret > maxval || errno == ERANGE) {
+               msg = "too large";
+               errno = ERANGE;
+       } else {
+               errno = errno_c;
+       }
+
+       if (errstr != NULL)
+               *errstr = msg;
+       return (msg != NULL) ? 0 : ret;
+}
+
+#if defined (_WIN32) || defined (EMULATE_SYSCTL)
+//XXX missing prerequisites
+#include <net/if.h>            //openwrt
+#include <netinet/ip.h>        //openwrt
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+#endif
+
+/*
+ * set or get system information
+ * XXX lock acquisition/serialize calls
+ *
+ * we export this as sys/module/ipfw_mod/parameters/___
+ * This function gets and/or sets the value of the sysctl passed by
+ * the name parameter. If the old value is not desired,
+ * oldp and oldlenp should be set to NULL.
+ *
+ * XXX
+ * I do not know how this works in FreeBSD in the case
+ * where there is no write permission on the sysctl var.
+ * We read the value and set the return variables anyway,
+ * but return -1 on write failures, regardless of the
+ * read success.
+ *
+ * Since there is no information on types, in the following
+ * code we assume a length of 4 is a int.
+ *
+ * Returns 0 on success, -1 on errors.  A value length that is not
+ * supported is an error, and so is (in the emulated version) a get
+ * for a name that is not in the table.  Buffers allocated here are
+ * released on every return path.
+ */
+int
+sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp,
+         size_t newlen)
+{
+#if defined (_WIN32) || defined (EMULATE_SYSCTL)
+       /*
+        * we embed the sysctl request in the usual sockopt mechanics.
+        * the sockopt buffer is filled with a dn_id with IP_DUMMYNET3
+        * command, and the special DN_SYSCTL_GET and DN_SYSCTL_SET
+        * subcommands.
+        * the syntax of this function is fully compatible with
+        * POSIX sysctlbyname:
+        * if newp and newlen are != 0 => this is a set
+        * else if oldp and oldlen are != 0 => this is a get
+        *              to avoid too much overhead in the module, the whole
+        *              sysctltable is returned, and the parsing is done in userland,
+        *              a probe request is done to retrieve the size needed to
+        *              transfer the table, before the real request
+        * if both old and new params = 0 => this is a print
+        *              this is a special request, done only by main()
+        *              to implement the extension './ipfw sysctl',
+        *              a command that bypasses the normal getopt, and that
+        *              is available on those platforms that use this
+        *              sysctl emulation.
+        *              in this case, a negative oldlen signals that *oldp
+        *              is actually a FILE* to print somewhere else than stdout
+        */
+
+       int l;
+       int ret;
+       int is_get;
+       size_t namelen;
+       struct dn_id* oid;
+       struct sysctlhead* entry;
+       char* pstring;
+       char* pdata;
+       FILE* fp;
+
+       /*
+        * NOTE(review): *oldlenp is a size_t, so this comparison can
+        * never be true and fp is always stdout.  Left as is because
+        * the way callers encode "negative" is not visible from here.
+        */
+       if((oldlenp != NULL) && (*oldlenp < 0))
+               fp = (FILE*)oldp;
+       else
+               fp = stdout;
+
+       if(newp != NULL && newlen != 0)
+       {
+               //this is a set
+               l = sizeof(struct dn_id) + sizeof(struct sysctlhead) + strlen(name)+1 + newlen;
+               oid = malloc(l);
+               if (oid == NULL)
+                       return -1;
+               oid->len = l;
+               oid->type = DN_SYSCTL_SET;
+               oid->id = DN_API_VERSION;
+
+               /* layout: dn_id, sysctlhead, new value, then the name */
+               entry = (struct sysctlhead*)(oid+1);
+               pdata = (char*)(entry+1);
+               pstring = pdata + newlen;
+
+               entry->blocklen = ((sizeof(struct sysctlhead) + strlen(name)+1 + newlen) + 3) & ~3;
+               entry->namelen = strlen(name)+1;
+               entry->flags = 0;
+               entry->datalen = newlen;
+
+               bcopy(newp, pdata, newlen);
+               bcopy(name, pstring, strlen(name)+1);
+
+               ret = do_cmd(IP_DUMMYNET3, oid, (uintptr_t)l);
+               free(oid);
+               /* the result of the set is the result of the call */
+               return (ret != 0) ? -1 : 0;
+       }
+
+       //this is a get or a print
+
+       /* first a probe with a bare header: the needed size comes back in id */
+       l = sizeof(struct dn_id);
+       oid = malloc(l);
+       if (oid == NULL)
+               return -1;
+       oid->len = l;
+       oid->type = DN_SYSCTL_GET;
+       oid->id = DN_API_VERSION;
+
+       ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l);
+       if (ret != 0) {
+               free(oid);
+               return -1;
+       }
+
+       /* then the real request, with a buffer of the reported size */
+       l=oid->id;
+       free(oid);
+       oid = malloc(l);
+       if (oid == NULL)
+               return -1;
+       oid->len = l;
+       oid->type = DN_SYSCTL_GET;
+       oid->id = DN_API_VERSION;
+
+       ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l);
+       if (ret != 0) {
+               free(oid);
+               return -1;
+       }
+
+       /* get or print does not change while walking the table */
+       is_get = (name != NULL && oldp != NULL &&
+           oldlenp != NULL && *oldlenp > 0);
+       namelen = (name != NULL) ? strlen(name) : 0;
+       /* a get fails unless a matching entry is found below */
+       ret = is_get ? -1 : 0;
+
+       entry = (struct sysctlhead*)(oid+1);
+       while(entry->blocklen != 0)
+       {
+               pdata = (char*)(entry+1);
+               pstring = pdata+entry->datalen;
+
+               if(is_get)
+               {
+                       //this is a get
+                       if(strcmp(name,pstring) == 0)
+                       {
+                               //match found, sanity check on len
+                               if(*oldlenp < entry->datalen)
+                               {
+                                       printf("%s error: buffer too small\n",__FUNCTION__);
+                                       break;
+                               }
+                               *oldlenp = entry->datalen;
+                               bcopy(pdata, oldp, *oldlenp);
+                               ret = 0;
+                               break;
+                       }
+               }
+               else if( name == NULL ||
+                   ( (strncmp(pstring,name,namelen) == 0) &&
+                   ( pstring[namelen]=='\0' || pstring[namelen]=='.' ) ) )
+               {
+                       //this is a print: everything, or name and its subtree
+                       fprintf(fp, "%s: ",pstring);
+                       switch( entry->flags >> 2 )
+                       {
+                               case SYSCTLTYPE_LONG:
+                                       fprintf(fp, "%li ", *(long*)(pdata));
+                                       break;
+                               case SYSCTLTYPE_UINT:
+                                       fprintf(fp, "%u ", *(unsigned int*)(pdata));
+                                       break;
+                               case SYSCTLTYPE_ULONG:
+                                       fprintf(fp, "%lu ", *(unsigned long*)(pdata));
+                                       break;
+                               case SYSCTLTYPE_INT:
+                               default:
+                                       fprintf(fp, "%i ", *(int*)(pdata));
+                       }
+                       if( (entry->flags & 0x00000003) == CTLFLAG_RD )
+                               fprintf(fp, "\t(read only)\n");
+                       else
+                               fprintf(fp, "\n");
+               }
+               entry = (struct sysctlhead*)((unsigned char*)entry + entry->blocklen);
+       }
+       free(oid);
+       return ret;
+
+#else /* __linux__ */
+       FILE *fp;
+       const char *params_dir = "/sys/module/ipfw_mod/parameters/";
+       char filename[256];     /* full filename */
+       const char *varp;
+       int ret = 0;            /* return value */
+       long d;
+
+       if (name == NULL) /* XXX set errno */
+               return -1;
+
+       /* locate the filename: it is the last component of the name */
+       varp = strrchr(name, '.');
+       if (varp == NULL) /* XXX set errno */
+               return -1;
+
+       snprintf(filename, sizeof(filename), "%s%s", params_dir, varp+1);
+
+       /*
+        * XXX we could open the file here, in rw mode
+        * but need to check if a file have write
+        * permissions.
+        */
+
+       /* check parameters */
+       if (oldp && oldlenp) { /* read mode */
+               fp = fopen(filename, "r");
+               if (fp == NULL) {
+                       fprintf(stderr, "%s fopen error reading filename %s\n", __FUNCTION__, filename);
+                       return -1;
+               }
+               if (fscanf(fp, "%ld", &d) != 1) {
+                       ret = -1;
+               } else if (*oldlenp == sizeof(int)) {
+                       int dst = d;
+                       memcpy(oldp, &dst, *oldlenp);
+               } else if (*oldlenp == sizeof(long)) {
+                       memcpy(oldp, &d, *oldlenp);
+               } else {
+                       fprintf(stderr, "unknown paramerer len %d\n",
+                               (int)*oldlenp);
+                       ret = -1;       /* nothing was stored in oldp */
+               }
+               fclose(fp);
+       }
+
+       if (newp && newlen) { /* write */
+               fp = fopen(filename, "w");
+               if (fp == NULL) {
+                       fprintf(stderr, "%s fopen error writing filename %s\n", __FUNCTION__, filename);
+                       return -1;
+               }
+               if (newlen == sizeof(int)) {
+                       if (fprintf(fp, "%d", *(int *)newp) < 1)
+                               ret = -1;
+               } else if (newlen == sizeof(long)) {
+                       if (fprintf(fp, "%ld", *(long *)newp) < 1)
+                               ret = -1;
+               } else {
+                       fprintf(stderr, "unknown paramerer len %d\n",
+                               (int)newlen);
+                       ret = -1;       /* nothing was written */
+               }
+
+               /* stdio buffers the value: a failed write may only show up here */
+               if (fclose(fp) != 0)
+                       ret = -1;
+       }
+
+       return ret;
+#endif /* __linux__ */
+}
+
+#ifdef _WIN32
+/*
+ * On windows, set/getsockopt are mapped to DeviceIoControl()
+ */
+int
+wnd_setsockopt(int s, int level, int sopt_name, const void *optval,
+                socklen_t optlen)
+{
+    /*
+     * Send a SOPT_SET request to the device behind handle 's'
+     * (presumably the value returned by my_socket()).  The request is
+     * one allocation: a struct sockopt header immediately followed by
+     * a copy of the 'optlen' option bytes.  'level' is not used.
+     * Returns 0 when DeviceIoControl() succeeds, -1 otherwise
+     * (including allocation failure).
+     */
+    HANDLE dev = (HANDLE)s;
+    size_t reqlen = sizeof (struct sockopt) + optlen;
+    struct sockopt *req = malloc(reqlen);
+    DWORD returned;
+    BOOL ok;
+
+    if (req == NULL)
+        return -1;
+
+    /* header first, payload right behind it */
+    req->sopt_dir = SOPT_SET;
+    req->sopt_name = sopt_name;
+    req->sopt_valsize = optlen;
+    req->sopt_val = (void *)(req + 1);
+    memcpy(req->sopt_val, optval, optlen);
+
+    ok = DeviceIoControl (dev, IP_FW_SETSOCKOPT, req, reqlen,
+               NULL, 0, &returned, NULL);
+    free (req);
+
+    if (!ok)
+        return -1;
+    return 0;
+}
+
+/*
+ * Send a SOPT_GET request to the device behind handle 's' and copy the
+ * answer back to the caller.
+ *
+ * On entry *optlen is the size of the buffer at 'optval'; its current
+ * contents are passed to the driver as the request payload.  On return
+ * *optlen is the number of bytes stored in 'optval'.  That count is the
+ * size reported by the driver, limited to the size the caller supplied,
+ * so a driver reporting more data than fits can overrun neither the
+ * caller's buffer nor our own allocation.  'level' is not used.
+ *
+ * Returns 0 when DeviceIoControl() succeeds, -1 otherwise (including
+ * allocation failure).
+ */
+int
+wnd_getsockopt(int s, int level, int sopt_name, void *optval,
+                socklen_t *optlen)
+{
+    socklen_t capacity = *optlen;      /* size of the caller's buffer */
+    size_t len = sizeof (struct sockopt) + capacity;
+    size_t outlen;
+    struct sockopt *sock;
+    DWORD n;
+    BOOL result;
+    HANDLE _dev_h = (HANDLE)s;
+
+    sock = malloc(len);
+    if (sock == NULL)
+        return -1;
+
+    sock->sopt_dir = SOPT_GET;
+    sock->sopt_name = sopt_name;
+    sock->sopt_valsize = capacity;
+    sock->sopt_val = (void *)(sock+1);
+
+    memcpy (sock->sopt_val, optval, capacity);
+
+    /* the same block is used as input and output buffer */
+    result = DeviceIoControl (_dev_h, IP_FW_GETSOCKOPT, sock, len,
+               sock, len, &n, NULL);
+
+    /* never copy out more than the caller (and our allocation) can hold */
+    outlen = sock->sopt_valsize;
+    if (outlen > (size_t)capacity)
+        outlen = capacity;
+    *optlen = outlen;
+    memcpy (optval, sock->sopt_val, outlen);
+    free (sock);
+    return (result ? 0 : -1);
+}
+
+int
+my_socket(int domain, int ty, int proto)
+{
+    /*
+     * Stand-in for socket(): open the "\\.\Ipfw" device and return its
+     * handle cast to int.  The three arguments are not used.  On failure
+     * a message with GetLastError() is printed and -1 is returned.
+     */
+    HANDLE dev;
+
+    /* Special Handling For Accessing Device On Windows 2000 Terminal Server
+       See Microsoft KB Article 259131 */
+    dev = CreateFile (TEXT("\\\\.\\Ipfw"),
+               GENERIC_READ | GENERIC_WRITE,
+               0, NULL,
+               OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+    if (dev != INVALID_HANDLE_VALUE)
+        return (int)dev;
+
+    printf("%s failed %u, cannot talk to kernel module\n",
+               __FUNCTION__, (unsigned)GetLastError());
+    return -1;
+}
+
+struct hostent* gethostbyname2(const char *name, int af)
+{
+       /* Compatibility shim: 'af' is ignored, the lookup is plain gethostbyname(). */
+       struct hostent *entry;
+
+       (void)af;
+       entry = gethostbyname(name);
+       return entry;
+}
+
+struct ether_addr* ether_aton(const char *a)
+{
+       /* Placeholder: nothing is parsed; report the call and return NULL. */
+       struct ether_addr *none = NULL;
+
+       (void)a;
+       fprintf(stderr, "%s empty\n", __FUNCTION__);
+       return none;
+}
+
+#ifdef TCC
+int     opterr = 1,             /* if error message should be printed */
+        optind = 1,             /* index into parent argv vector */
+        optopt,                 /* character checked for validity */
+        optreset;               /* reset getopt */
+char    *optarg;                /* argument associated with option */
+
+#define BADCH   (int)'?'
+#define BADARG  (int)':'
+#define EMSG    ""
+
+#define PROGNAME       "ipfw"
+/*
+ * getopt --
+ *      Parse argc/argv argument vector.
+ *
+ *      nargc/nargv is the argument vector, ostr the list of accepted
+ *      option letters; a letter followed by ':' takes an argument.
+ *      Each call returns the next option letter and updates optind,
+ *      optopt and optarg.  Returns -1 when the next word is not an
+ *      option (or is "--"), BADCH ('?') for an unknown letter or a
+ *      missing argument, and BADARG (':') for a missing argument when
+ *      ostr starts with ':'.  Set optreset to restart scanning.
+ */
+int
+getopt(int nargc, char * const nargv[], const char *ostr)
+{
+        static char *place = EMSG;              /* option letter processing */
+        char *oli;                              /* option letter list index */
+
+        if (optreset || *place == 0) {          /* update scanning pointer */
+                optreset = 0;
+                /* check the bound before touching nargv[optind] */
+                if (optind >= nargc) {
+                        /* Argument is absent */
+                        place = EMSG;
+                        return (-1);
+                }
+                place = nargv[optind];
+                if (*place++ != '-') {
+                        /* Argument is not an option */
+                        place = EMSG;
+                        return (-1);
+                }
+                optopt = *place++;
+                if (optopt == '-' && *place == 0) {
+                        /* "--" => end of options */
+                        ++optind;
+                        place = EMSG;
+                        return (-1);
+                }
+                if (optopt == 0) {
+                        /* Solitary '-', treat as a '-' option
+                           if the program (eg su) is looking for it. */
+                        place = EMSG;
+                        if (strchr(ostr, '-') == NULL)
+                                return (-1);
+                        optopt = '-';
+                }
+        } else
+                optopt = *place++;
+
+        /* See if option letter is one the caller wanted... */
+        if (optopt == ':' || (oli = strchr(ostr, optopt)) == NULL) {
+                if (*place == 0)
+                        ++optind;
+                if (opterr && *ostr != ':')
+                        (void)fprintf(stderr,
+                            "%s: illegal option -- %c\n", PROGNAME,
+                            optopt);
+                return (BADCH);
+        }
+
+        /* Does this option need an argument? */
+        if (oli[1] != ':') {
+                /* don't need argument */
+                optarg = NULL;
+                if (*place == 0)
+                        ++optind;
+        } else {
+                /* Option-argument is either the rest of this argument or the
+                   entire next argument. */
+                if (*place)
+                        optarg = place;
+                else if (nargc > ++optind)
+                        optarg = nargv[optind];
+                else {
+                        /* option-argument absent */
+                        place = EMSG;
+                        if (*ostr == ':')
+                                return (BADARG);
+                        if (opterr)
+                                (void)fprintf(stderr,
+                                    "%s: option requires an argument -- %c\n",
+                                    PROGNAME, optopt);
+                        return (BADCH);
+                }
+                place = EMSG;
+                ++optind;
+        }
+        return (optopt);                        /* return option letter */
+}
+
+//static FILE *err_file = stderr;
+void
+verrx(int ex, int eval, const char *fmt, va_list ap)
+{
+        /*
+         * Print "PROGNAME: <formatted message>\n" on stderr; the message
+         * part is skipped when fmt is NULL.  When 'ex' is non-zero the
+         * process then exits with status 'eval'.
+         */
+        fprintf(stderr, "%s: ", PROGNAME);
+        if (fmt)
+                vfprintf(stderr, fmt, ap);
+        fprintf(stderr, "\n");
+
+        if (!ex)
+                return;
+        exit(eval);
+}
+void
+errx(int eval, const char *fmt, ...)
+{
+        /* Print the formatted message via verrx() and exit with 'eval'. */
+        va_list args;
+
+        va_start(args, fmt);
+        verrx(1, eval, fmt, args);
+        va_end(args);
+}
+
+void
+warnx(const char *fmt, ...)
+{
+        /* Print the formatted message via verrx() without exiting. */
+        va_list args;
+
+        va_start(args, fmt);
+        verrx(0, 0, fmt, args);
+        va_end(args);
+}
+
+char *
+strsep(char **stringp, const char *delim)
+{
+        /*
+         * Return the token starting at *stringp and ending at the first
+         * character that occurs in 'delim'.  That delimiter is overwritten
+         * with '\0' and *stringp is advanced just past it.  If the end of
+         * the string is reached first, *stringp becomes NULL.  Returns NULL
+         * when *stringp is already NULL.
+         */
+        char *token = *stringp;
+        char *cur;
+
+        if (token == NULL)
+                return (NULL);
+
+        for (cur = token;; cur++) {
+                const char *d = delim;
+                int ch = *cur;
+                int dc;
+
+                /* the terminator of 'delim' also matches the end of string */
+                for (;;) {
+                        dc = *d++;
+                        if (dc == ch) {
+                                if (ch == 0) {
+                                        *stringp = NULL;
+                                } else {
+                                        *cur = 0;
+                                        *stringp = cur + 1;
+                                }
+                                return (token);
+                        }
+                        if (dc == 0)
+                                break;
+                }
+        }
+        /* NOTREACHED */
+}
+
+static unsigned char
+tolower(unsigned char c)
+{
+       /* Map 'A'..'Z' to 'a'..'z'; every other value is returned unchanged. */
+       if (c < 'A' || c > 'Z')
+               return c;
+       return c + 'a' - 'A';
+}
+
+static int isdigit(unsigned char c)
+{
+       /* 1 for '0'..'9', 0 otherwise. */
+       if (c < '0')
+               return 0;
+       return (c <= '9');
+}
+
+static int isxdigit(unsigned char c)
+{
+       /*
+        * 1 for 0-9, A-F and a-f, 0 otherwise.
+        * strchr() also matches the terminating '\0' of its first
+        * argument, so NUL must be rejected explicitly; otherwise the
+        * end of a string would be classified as a hex digit.
+        */
+       if (c == '\0')
+               return 0;
+       return (strchr("0123456789ABCDEFabcdef", c) ? 1 : 0);
+}
+
+static int isspace(unsigned char c)
+{
+       /*
+        * 1 for the white-space characters of the "C" locale
+        * (space, \t, \n, \v, \f, \r), 0 otherwise.
+        * strchr() also matches the terminating '\0' of its first
+        * argument, so NUL must be rejected explicitly.
+        */
+       if (c == '\0')
+               return 0;
+       return (strchr(" \t\n\v\f\r", c) ? 1 : 0);
+}
+
+static int isascii(unsigned char c)
+{
+       /* 1 when the top bit is clear (value below 128), 0 otherwise. */
+       return ((c & 0x80) == 0);
+}
+
+static int islower(unsigned char c)
+{
+       /* 1 for 'a'..'z', 0 otherwise. */
+       if (c < 'a')
+               return 0;
+       return (c <= 'z');
+}
+
+int
+strcasecmp(const char *s1, const char *s2)
+{
+        /*
+         * Compare two strings byte by byte after folding each byte with
+         * tolower().  Returns 0 when they are equal, otherwise the
+         * difference between the first pair of folded bytes that differ.
+         */
+        const unsigned char *a = (const unsigned char *)s1;
+        const unsigned char *b = (const unsigned char *)s2;
+
+        for (;; a++, b++) {
+                int la = tolower(*a);
+                int lb = tolower(*b);
+
+                if (la != lb)
+                        return (la - lb);
+                if (*a == '\0')
+                        return (0);
+        }
+}
+
+intmax_t
+strtoimax(const char * restrict nptr, char ** restrict endptr, int base)
+{
+       /*
+        * Thin wrapper around strtol(): the result is limited to the
+        * range of 'long' even though the return type is intmax_t.
+        */
+       long parsed = strtol(nptr, endptr, base);
+
+       return parsed;
+}
+
+void
+setservent(int a)
+{
+       /* Intentionally a no-op; the argument is ignored. */
+       (void)a;
+}
+
+/* an IPv4 address in binary form is exactly four octets */
+#define NS_INADDRSZ 4
+
+/*
+ * inet_pton --
+ *      Minimal replacement that only understands AF_INET.  Converts the
+ *      dotted-decimal string 'src' ("a.b.c.d", every part 0..255, no
+ *      redundant leading zeros) into NS_INADDRSZ bytes stored at 'dst',
+ *      first octet first.
+ *
+ *      Returns 1 on success, 0 if 'src' is not a valid address ('dst' is
+ *      left untouched), and -1 with errno set to EINVAL for any other
+ *      address family.
+ */
+int
+inet_pton(int af, const char *src, void *dst)
+{
+        static const char digits[] = "0123456789";
+        int saw_digit, octets, ch;
+        unsigned char tmp[NS_INADDRSZ], *tp;
+
+        if (af != AF_INET) {
+                errno = EINVAL;
+                return -1;
+        }
+
+        saw_digit = 0;
+        octets = 0;
+        *(tp = tmp) = 0;
+        while ((ch = *src++) != '\0') {
+                const char *pch;
+
+                if ((pch = strchr(digits, ch)) != NULL) {
+                        unsigned int value = *tp * 10 + (pch - digits);
+
+                        /* a second digit after a leading '0' is rejected */
+                        if (saw_digit && *tp == 0)
+                                return (0);
+                        if (value > 255)
+                                return (0);
+                        *tp = value;
+                        if (!saw_digit) {
+                                if (++octets > 4)
+                                        return (0);
+                                saw_digit = 1;
+                        }
+                } else if (ch == '.' && saw_digit) {
+                        /* tp never moves past tmp[3]: a 5th part is refused */
+                        if (octets == 4)
+                                return (0);
+                        *++tp = 0;
+                        saw_digit = 0;
+                } else
+                        return (0);
+        }
+        if (octets < 4)
+                return (0);
+        memcpy(dst, tmp, NS_INADDRSZ);
+        return (1);
+}
+
+const char *
+inet_ntop(int af, const void *_src, char *dst, socklen_t size)
+{
+        /*
+         * AF_INET-only replacement: format the four bytes at '_src' as
+         * dotted decimal into 'dst', which holds 'size' bytes.
+         * Returns 'dst', or NULL with errno set to EINVAL (other address
+         * family) or ENOSPC (text plus terminator does not fit).
+         */
+        const u_char *octet = _src;
+        char text[sizeof "255.255.255.255"];
+        int n;
+
+        if (af != AF_INET) {
+                errno = EINVAL;
+                return NULL;
+        }
+
+        n = snprintf(text, sizeof(text), "%u.%u.%u.%u",
+            octet[0], octet[1], octet[2], octet[3]);
+        if (n > 0 && (socklen_t) n < size) {
+                strlcpy(dst, text, size);
+                return (dst);
+        }
+
+        errno = ENOSPC;
+        return (NULL);
+}
+
+/*%
+ * Check whether "cp" is a valid ascii representation
+ * of an Internet address and convert to a binary address.
+ * Returns 1 if the address is valid, 0 if not.
+ * This replaces inet_addr, the return value from which
+ * cannot distinguish between failure and a local broadcast address.
+ */
+int
+inet_aton(const char *cp, struct in_addr *addr) {
+        u_long val;             /* value of the part being parsed */
+        int base, n;
+        char c;                 /* current input character */
+        u_int8_t parts[4];      /* values of the parts already followed by '.' */
+        u_int8_t *pp = parts;   /* next free slot in parts[] */
+        int digit;              /* set once the current part has a digit */
+
+        c = *cp;
+        for (;;) {
+                /*
+                 * Collect number up to ``.''.
+                 * Values are specified as for C:
+                 * 0x=hex, 0=octal, isdigit=decimal.
+                 */
+                if (!isdigit((unsigned char)c))
+                        return (0);
+                val = 0; base = 10; digit = 0;
+                if (c == '0') {
+                        c = *++cp;
+                        if (c == 'x' || c == 'X')
+                                base = 16, c = *++cp;
+                        else {
+                                /* the leading '0' itself counts as a digit */
+                                base = 8;
+                                digit = 1 ;
+                        }
+                }
+                for (;;) {
+                        if (isascii(c) && isdigit((unsigned char)c)) {
+                                if (base == 8 && (c == '8' || c == '9'))
+                                        return (0);
+                                val = (val * base) + (c - '0');
+                                c = *++cp;
+                                digit = 1;
+                        } else if (base == 16 && isascii(c) &&
+                                   isxdigit((unsigned char)c)) {
+                                /*
+                                 * NOTE(review): this relies on isxdigit()
+                                 * rejecting '\0'; if it does not, the scan
+                                 * continues past the end of the string.
+                                 */
+                                val = (val << 4) |
+                                        (c + 10 - (islower((unsigned char)c) ? 'a' : 'A'));
+                                c = *++cp;
+                                digit = 1;
+                        } else
+                                break;
+                }
+                if (c == '.') {
+                        /*
+                         * Internet format:
+                         *      a.b.c.d
+                         *      a.b.c   (with c treated as 16 bits)
+                         *      a.b     (with b treated as 24 bits)
+                         */
+                        /* at most three parts may be followed by a dot */
+                        if (pp >= parts + 3 || val > 0xffU)
+                                return (0);
+                        *pp++ = val;
+                        c = *++cp;
+                } else
+                        break;
+        }
+        /*
+         * Check for trailing characters.
+         */
+        if (c != '\0' && (!isascii(c) || !isspace((unsigned char)c)))
+                return (0);
+        /*
+         * Did we get a valid digit?
+         */
+        if (!digit)
+                return (0);
+        /*
+         * Concoct the address according to
+         * the number of parts specified.
+         */
+        /*
+         * NOTE(review): parts[] elements are u_int8_t, so the shifts below
+         * are done in int; a first part >= 128 shifts into the sign bit
+         * before being OR-ed into val.  Confirm this is acceptable on the
+         * target compilers.
+         */
+        n = pp - parts + 1;
+        switch (n) {
+        case 1:                         /*%< a -- 32 bits */
+                break;
+
+        case 2:                         /*%< a.b -- 8.24 bits */
+                if (val > 0xffffffU)
+                        return (0);
+                val |= parts[0] << 24;
+                break;
+
+        case 3:                         /*%< a.b.c -- 8.8.16 bits */
+                if (val > 0xffffU)
+                        return (0);
+                val |= (parts[0] << 24) | (parts[1] << 16);
+                break;
+
+        case 4:                         /*%< a.b.c.d -- 8.8.8.8 bits */
+                if (val > 0xffU)
+                        return (0);
+                val |= (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8);
+                break;
+        }
+        /* the result is stored in network byte order; addr may be NULL */
+        if (addr != NULL)
+                addr->s_addr = htonl(val);
+        return (1);
+}
+
+#endif /* TCC */
+
+#endif /* _WIN32 */
diff --git a/ipfw/humanize_number.c b/ipfw/humanize_number.c
new file mode 100644 (file)
index 0000000..90aa18b
--- /dev/null
@@ -0,0 +1,153 @@
+/*     $NetBSD: humanize_number.c,v 1.13 2007/12/14 17:26:19 christos Exp $    */
+
+/*
+ * Copyright (c) 1997, 1998, 1999, 2002 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
+ * NASA Ames Research Center, by Luke Mewburn and by Tomas Svensson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by the NetBSD
+ *      Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// #include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/lib/libutil/humanize_number.c,v 1.2.10.1 2008/04/20 16:29:01 antoine Exp $");
+
+#include <sys/types.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+// #include <locale.h>
+//#include <libutil.h>
+
+/*
+ * humanize_number --
+ *     Format the value 'bytes' into 'buf' (capacity 'len') as a number
+ *     followed by a separator, a one-letter scale prefix and 'suffix'.
+ *
+ *     'scale' is the number of times the value is divided by the divisor,
+ *     or carries HN_AUTOSCALE / HN_GETSCALE to let the function pick the
+ *     smallest scale that fits in 'len'.
+ *     'flags':
+ *       HN_DIVISOR_1000  divide by 1000 instead of 1024 (prefix 'k', not 'K')
+ *       HN_B             use "B" as the prefix of the unscaled value
+ *       HN_NOSPACE       no space between the number and the prefix
+ *       HN_DECIMAL       print one decimal digit for small scaled values
+ *
+ *     Returns the snprintf() result, the chosen scale when HN_GETSCALE is
+ *     set, or -1 if the scale is out of range, a pointer is NULL or the
+ *     buffer is too small.
+ */
+int
+humanize_number(char *buf, size_t len, int64_t bytes,
+    const char *suffix, int scale, int flags)
+{
+       const char *prefixes, *sep;
+       int     b, i, r, maxscale, s1, s2, sign;
+       int64_t divisor, max;
+       size_t  baselen;
+
+       assert(buf != NULL);
+       assert(suffix != NULL);
+       assert(scale >= 0);
+
+       /* prefixes is a packed table of 2-byte entries: letter + '\0' */
+       if (flags & HN_DIVISOR_1000) {
+               /* SI for decimal multiplies */
+               divisor = 1000;
+               if (flags & HN_B)
+                       prefixes = "B\0k\0M\0G\0T\0P\0E";
+               else
+                       prefixes = "\0\0k\0M\0G\0T\0P\0E";
+       } else {
+               /*
+                * binary multiplies
+                * XXX IEC 60027-2 recommends Ki, Mi, Gi...
+                */
+               divisor = 1024;
+               if (flags & HN_B)
+                       prefixes = "B\0K\0M\0G\0T\0P\0E";
+               else
+                       prefixes = "\0\0K\0M\0G\0T\0P\0E";
+       }
+
+#define        SCALE2PREFIX(scale)     (&prefixes[(scale) << 1])
+       maxscale = 7;
+
+       if (scale >= maxscale &&
+           (scale & (HN_AUTOSCALE | HN_GETSCALE)) == 0)
+               return (-1);
+
+       /* repeats the asserts above for builds where assert() is disabled */
+       if (buf == NULL || suffix == NULL)
+               return (-1);
+
+       if (len > 0)
+               buf[0] = '\0';
+       /* work on |bytes| * 100 so two extra digits are kept for rounding */
+       if (bytes < 0) {
+               sign = -1;
+               bytes *= -100;
+               baselen = 3;            /* sign, digit, prefix */
+       } else {
+               sign = 1;
+               bytes *= 100;
+               baselen = 2;            /* digit, prefix */
+       }
+       if (flags & HN_NOSPACE)
+               sep = "";
+       else {
+               sep = " ";
+               baselen++;
+       }
+       baselen += strlen(suffix);
+
+       /* Check if enough room for `x y' + suffix + `\0' */
+       if (len < baselen + 1)
+               return (-1);
+
+       if (scale & (HN_AUTOSCALE | HN_GETSCALE)) {
+               /* See if there are additional columns that can be used. */
+               for (max = 100, i = len - baselen; i-- > 0;)
+                       max *= 10;
+
+               /*
+                * Divide the number until it fits the given column.
+                * If there will be an overflow by the rounding below,
+                * divide once more.
+                */
+               for (i = 0; bytes >= max - 50 && i < maxscale; i++)
+                       bytes /= divisor;
+
+               if (scale & HN_GETSCALE)
+                       return (i);
+       } else
+               for (i = 0; i < scale && i < maxscale; i++)
+                       bytes /= divisor;
+
+       /* If a value <= 9.9 after rounding and ... */
+       if (bytes < 995 && i > 0 && flags & HN_DECIMAL) {
+               /* baselen + \0 + .N */
+               if (len < baselen + 1 + 2)
+                       return (-1);
+               /* b is the value in tenths, rounded */
+               b = ((int)bytes + 5) / 10;
+               s1 = b / 10;
+               s2 = b % 10;
+               r = snprintf(buf, len, "%d%s%d%s%s%s",
+                   sign * s1, ".", s2,
+                   sep, SCALE2PREFIX(i), suffix);
+       } else
+               r = snprintf(buf, len, "%" PRId64 "%s%s%s",
+                   sign * ((bytes + 50) / 100),
+                   sep, SCALE2PREFIX(i), suffix);
+
+       return (r);
+}
diff --git a/ipfw/include/alias.h b/ipfw/include/alias.h
new file mode 100644 (file)
index 0000000..888bd0d
--- /dev/null
@@ -0,0 +1,71 @@
+#ifndef _ALIAS_H_
+#define        _ALIAS_H_
+
+#define LIBALIAS_BUF_SIZE 128
+
+/*
+ * If PKT_ALIAS_LOG is set, a message will be printed to /var/log/alias.log
+ * every time a link is created or deleted.  This is useful for debugging.
+ */
+#define        PKT_ALIAS_LOG                   0x01
+
+/*
+ * If PKT_ALIAS_DENY_INCOMING is set, then incoming connections (e.g. to ftp,
+ * telnet or web servers) will be prevented by the aliasing mechanism.
+ */
+#define        PKT_ALIAS_DENY_INCOMING         0x02
+
+/*
+ * If PKT_ALIAS_SAME_PORTS is set, an attempt is made to send packets from
+ * the same port they originated on.  This allows e.g. rsh to work *99% of
+ * the time*, but _not_ 100% (it will be slightly flakey instead of not
+ * working at all).  This mode bit is set by PacketAliasInit(), so it is a
+ * default mode of operation.
+ */
+#define        PKT_ALIAS_SAME_PORTS            0x04
+
+/*
+ * If PKT_ALIAS_USE_SOCKETS is set, then for partially specified links (e.g.
+ * destination port and/or address is zero), the packet aliasing engine will
+ * attempt to allocate a socket for the aliasing port it chooses.  This will
+ * avoid interference with the host machine.  Fully specified links do not
+ * require this.  This bit is set after a call to PacketAliasInit(), so it is
+ * a default mode of operation.
+ */
+#ifndef        NO_USE_SOCKETS
+#define        PKT_ALIAS_USE_SOCKETS           0x08
+#endif
+/*-
+ * If PKT_ALIAS_UNREGISTERED_ONLY is set, then only packets with
+ * unregistered source addresses will be aliased.  Private
+ * addresses are those in the following ranges:
+ *
+ *             10.0.0.0     ->   10.255.255.255
+ *             172.16.0.0   ->   172.31.255.255
+ *             192.168.0.0  ->   192.168.255.255
+ */
+#define        PKT_ALIAS_UNREGISTERED_ONLY     0x10
+
+/*
+ * If PKT_ALIAS_RESET_ON_ADDR_CHANGE is set, then the table of dynamic
+ * aliasing links will be reset whenever PacketAliasSetAddress() changes the
+ * default aliasing address.  If the default aliasing address is left
+ * unchanged by this function call, then the table of dynamic aliasing links
+ * will be left intact.  This bit is set after a call to PacketAliasInit().
+ */
+#define        PKT_ALIAS_RESET_ON_ADDR_CHANGE  0x20
+
+
+/*
+ * If PKT_ALIAS_PROXY_ONLY is set, then NAT will be disabled and only
+ * transparent proxying is performed.
+ */
+#define        PKT_ALIAS_PROXY_ONLY            0x40
+
+/*
+ * If PKT_ALIAS_REVERSE is set, the actions of PacketAliasIn() and
+ * PacketAliasOut() are reversed.
+ */
+#define        PKT_ALIAS_REVERSE               0x80
+
+#endif                         /* !_ALIAS_H_ */
diff --git a/ipfw/include/net/if_dl.h b/ipfw/include/net/if_dl.h
new file mode 100644 (file)
index 0000000..4d2b4f7
--- /dev/null
@@ -0,0 +1,82 @@
+/*-
+ * Copyright (c) 1990, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)if_dl.h     8.1 (Berkeley) 6/10/93
+ * $FreeBSD: src/sys/net/if_dl.h,v 1.14 2005/01/07 01:45:34 imp Exp $
+ */
+
+#ifndef _NET_IF_DL_H_
+#define _NET_IF_DL_H_
+
+/*
+ * A Link-Level Sockaddr may specify the interface in one of two
+ * ways: either by means of a system-provided index number (computed
+ * anew and possibly differently on every reboot), or by a human-readable
+ * string such as "il0" (for managerial convenience).
+ *
+ * Census taking actions, such as something akin to SIOCGCONF would return
+ * both the index and the human name.
+ *
+ * High volume transactions (such as giving a link-level ``from'' address
+ * in a recvfrom or recvmsg call) may be likely only to provide the indexed
+ * form, (which requires fewer copy operations and less space).
+ *
+ * The form and interpretation  of the link-level address is purely a matter
+ * of convention between the device driver and its consumers; however, it is
+ * expected that all drivers for an interface of a given if_type will agree.
+ */
+
+/*
+ * Structure of a Link-Level sockaddr:
+ */
+/*
+ * sdl_data holds the interface name (sdl_nlen bytes) with the link-level
+ * address placed directly after it; see the field comments below.
+ */
+struct sockaddr_dl {
+       u_char  sdl_len;        /* Total length of sockaddr */
+       u_char  sdl_family;     /* AF_LINK */
+       u_short sdl_index;      /* if != 0, system given index for interface */
+       u_char  sdl_type;       /* interface type */
+       u_char  sdl_nlen;       /* interface name length; the name need not
+                                  be NUL-terminated */
+       u_char  sdl_alen;       /* link level address length */
+       u_char  sdl_slen;       /* link layer selector length */
+       char    sdl_data[46];   /* minimum work area, can be larger;
+                                  contains both if name and ll address */
+};
+
+/* Address of the byte in sdl_data that follows the sdl_nlen name bytes. */
+#define LLADDR(s) ((caddr_t)((s)->sdl_data + (s)->sdl_nlen))
+
+#ifndef _KERNEL
+
+#include <sys/cdefs.h>
+
+__BEGIN_DECLS
+void   link_addr(const char *, struct sockaddr_dl *);
+char   *link_ntoa(const struct sockaddr_dl *);
+__END_DECLS
+
+#endif /* !_KERNEL */
+
+#endif
diff --git a/ipfw/include/net/pfvar.h b/ipfw/include/net/pfvar.h
new file mode 100644 (file)
index 0000000..304cb16
--- /dev/null
@@ -0,0 +1,32 @@
+#ifndef _PF_VAR_H_
+#define _PF_VAR_H_
+
+/*
+ * replacement for FreeBSD's pfqueue.h
+ */
+#include <sys/queue.h>
+
+#define DIOCSTARTALTQ   _IO  ('D', 42)
+#define DIOCSTOPALTQ    _IO  ('D', 43)
+
+/*
+ * Cut-down ALTQ queue descriptor: only the list linkage, the queue id
+ * and the queue name are declared here.
+ * NOTE(review): the "..." below presumably stands for members of the
+ * FreeBSD original that were left out -- confirm the layout matches
+ * whatever consumes the DIOCGETALTQ* ioctls.
+ */
+struct pf_altq {
+       TAILQ_ENTRY(pf_altq)     entries;
+       /* ... */
+        u_int32_t                qid;           /* return value */
+
+#define PF_QNAME_SIZE            64
+        char                     qname[PF_QNAME_SIZE];  /* queue name */
+
+};
+
+/* Argument structure of the DIOCGETALTQS / DIOCGETALTQ ioctls defined below. */
+struct pfioc_altq {
+        u_int32_t        action;
+        u_int32_t        ticket;
+        u_int32_t        nr;
+        struct pf_altq   altq;
+};
+
+#define DIOCGETALTQS    _IOWR('D', 47, struct pfioc_altq)
+#define DIOCGETALTQ    _IOWR('D', 48, struct pfioc_altq)
+
+#endif /* !_PF_VAR_H_ */
diff --git a/ipfw/include/timeconv.h b/ipfw/include/timeconv.h
new file mode 100644 (file)
index 0000000..5377ebb
--- /dev/null
@@ -0,0 +1,14 @@
+/*
+ * simple override for _long_to_time()
+ */
+#ifndef _TIMECONV_H_
+#define _TIMECONV_H_
+static __inline time_t
+_long_to_time(long tlong)
+{
+    /*
+     * Convert a long to time_t.  When long is as wide as __int32_t the
+     * value goes through an explicit __int32_t cast first; otherwise it
+     * is cast directly.
+     */
+    return (sizeof(long) == sizeof(__int32_t)) ?
+        (time_t)(__int32_t)(tlong) : (time_t)tlong;
+}
+
+#endif /* _TIMECONV_H_ */
diff --git a/ipfw/ipfw.8 b/ipfw/ipfw.8
new file mode 100644 (file)
index 0000000..bc8d819
--- /dev/null
@@ -0,0 +1,3476 @@
+.\"
+.\" $FreeBSD$
+.\"
+.Dd October 25, 2012
+.Dt IPFW 8
+.Os
+.Sh NAME
+.Nm ipfw
+.Nd User interface for firewall, traffic shaper, packet scheduler,
+in-kernel NAT.
+.Sh SYNOPSIS
+.Ss FIREWALL CONFIGURATION
+.Nm
+.Op Fl cq
+.Cm add
+.Ar rule
+.Nm
+.Op Fl acdefnNStT
+.Op Cm set Ar N
+.Brq Cm list | show
+.Op Ar rule | first-last ...
+.Nm
+.Op Fl f | q
+.Op Cm set Ar N
+.Cm flush
+.Nm
+.Op Fl q
+.Op Cm set Ar N
+.Brq Cm delete | zero | resetlog
+.Op Ar number ...
+.Pp
+.Nm
+.Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ...
+.Nm
+.Cm set move
+.Op Cm rule
+.Ar number Cm to Ar number
+.Nm
+.Cm set swap Ar number number
+.Nm
+.Cm set show
+.Ss SYSCTL SHORTCUTS
+.Nm
+.Cm enable
+.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive
+.Nm
+.Cm disable
+.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive
+.Ss LOOKUP TABLES
+.Nm
+.Cm table Ar number Cm add Ar addr Ns Oo / Ns Ar masklen Oc Op Ar value
+.Nm
+.Cm table Ar number Cm delete Ar addr Ns Op / Ns Ar masklen
+.Nm
+.Cm table
+.Brq Ar number | all
+.Cm flush
+.Nm
+.Cm table
+.Brq Ar number | all
+.Cm list
+.Ss DUMMYNET CONFIGURATION (TRAFFIC SHAPER AND PACKET SCHEDULER)
+.Nm
+.Brq Cm pipe | queue | sched
+.Ar number
+.Cm config
+.Ar config-options
+.Nm
+.Op Fl s Op Ar field
+.Brq Cm pipe | queue | sched
+.Brq Cm delete | list | show
+.Op Ar number ...
+.Ss IN-KERNEL NAT
+.Nm
+.Op Fl q
+.Cm nat
+.Ar number
+.Cm config
+.Ar config-options
+.Pp
+.Nm
+.Op Fl cfnNqS
+.Oo
+.Fl p Ar preproc
+.Oo
+.Ar preproc-flags
+.Oc
+.Oc
+.Ar pathname
+.Sh DESCRIPTION
+The
+.Nm
+utility is the user interface for controlling the
+.Xr ipfw 4
+firewall, the
+.Xr dummynet 4
+traffic shaper/packet scheduler, and the
+in-kernel NAT services.
+.Pp
+A firewall configuration, or
+.Em ruleset ,
+is made of a list of
+.Em rules
+numbered from 1 to 65535.
+Packets are passed to the firewall
+from a number of different places in the protocol stack
+(depending on the source and destination of the packet,
+it is possible for the firewall to be
+invoked multiple times on the same packet).
+The packet passed to the firewall is compared
+against each of the rules in the
+.Em ruleset ,
+in rule-number order
+(multiple rules with the same number are permitted, in which case
+they are processed in order of insertion).
+When a match is found, the action corresponding to the
+matching rule is performed.
+.Pp
+Depending on the action and certain system settings, packets
+can be reinjected into the firewall at some rule after the
+matching one for further processing.
+.Pp
+A ruleset always includes a
+.Em default
+rule (numbered 65535) which cannot be modified or deleted,
+and matches all packets.
+The action associated with the
+.Em default
+rule can be either
+.Cm deny
+or
+.Cm allow
+depending on how the kernel is configured.
+.Pp
+If the ruleset includes one or more rules with the
+.Cm keep-state
+or
+.Cm limit
+option,
+the firewall will have a
+.Em stateful
+behaviour, i.e., upon a match it will create
+.Em dynamic rules ,
+i.e., rules that match packets with the same 5-tuple
+(protocol, source and destination addresses and ports)
+as the packet which caused their creation.
+Dynamic rules, which have a limited lifetime, are checked
+at the first occurrence of a
+.Cm check-state ,
+.Cm keep-state
+or
+.Cm limit
+rule, and are typically used to open the firewall on-demand to
+legitimate traffic only.
+See the
+.Sx STATEFUL FIREWALL
+and
+.Sx EXAMPLES
+Sections below for more information on the stateful behaviour of
+.Nm .
+.Pp
+All rules (including dynamic ones) have a few associated counters:
+a packet count, a byte count, a log count and a timestamp
+indicating the time of the last match.
+Counters can be displayed or reset with
+.Nm
+commands.
+.Pp
+Each rule belongs to one of 32 different
+.Em sets ,
+and there are
+.Nm
+commands to atomically manipulate sets, such as enable,
+disable, swap sets, move all rules in a set to another
+one, delete all rules in a set.
+These can be useful to
+install temporary configurations, or to test them.
+See Section
+.Sx SETS OF RULES
+for more information on
+.Em sets .
+.Pp
+Rules can be added with the
+.Cm add
+command; deleted individually or in groups with the
+.Cm delete
+command, and globally (except those in set 31) with the
+.Cm flush
+command; displayed, optionally with the content of the
+counters, using the
+.Cm show
+and
+.Cm list
+commands.
+Finally, counters can be reset with the
+.Cm zero
+and
+.Cm resetlog
+commands.
+.Pp
+.Ss COMMAND OPTIONS
+The following general options are available when invoking
+.Nm :
+.Bl -tag -width indent
+.It Fl a
+Show counter values when listing rules.
+The
+.Cm show
+command implies this option.
+.It Fl b
+Only show the action and the comment, not the body of a rule.
+Implies
+.Fl c .
+.It Fl c
+When entering or showing rules, print them in compact form,
+i.e., omitting the "ip from any to any" string
+when this does not carry any additional information.
+.It Fl d
+When listing, show dynamic rules in addition to static ones.
+.It Fl e
+When listing and
+.Fl d
+is specified, also show expired dynamic rules.
+.It Fl f
+Do not ask for confirmation for commands that can cause problems
+if misused, i.e.,
+.Cm flush .
+If there is no tty associated with the process, this is implied.
+.It Fl i
+When listing a table (see the
+.Sx LOOKUP TABLES
+section below for more information on lookup tables), format values
+as IP addresses.
+By default, values are shown as integers.
+.It Fl n
+Only check syntax of the command strings, without actually passing
+them to the kernel.
+.It Fl N
+Try to resolve addresses and service names in output.
+.It Fl q
+Be quiet when executing the
+.Cm add ,
+.Cm nat ,
+.Cm zero ,
+.Cm resetlog
+or
+.Cm flush
+commands;
+(implies
+.Fl f ) .
+This is useful when updating rulesets by executing multiple
+.Nm
+commands in a script
+(e.g.,
+.Ql sh\ /etc/rc.firewall ) ,
+or by processing a file with many
+.Nm
+rules across a remote login session.
+It also stops a table add or delete
+from failing if the entry already exists or is not present.
+.Pp
+The reason why this option may be important is that
+for some of these actions,
+.Nm
+may print a message; if the action results in blocking the
+traffic to the remote client,
+the remote login session will be closed
+and the rest of the ruleset will not be processed.
+Access to the console would then be required to recover.
+.It Fl S
+When listing rules, show the
+.Em set
+each rule belongs to.
+If this flag is not specified, disabled rules will not be
+listed.
+.It Fl s Op Ar field
+When listing pipes, sort according to one of the four
+counters (total or current packets or bytes).
+.It Fl t
+When listing, show last match timestamp converted with ctime().
+.It Fl T
+When listing, show last match timestamp as seconds from the epoch.
+This form can be more convenient for postprocessing by scripts.
+.El
+.Ss LIST OF RULES AND PREPROCESSING
+To ease configuration, rules can be put into a file which is
+processed using
+.Nm
+as shown in the last synopsis line.
+An absolute
+.Ar pathname
+must be used.
+The file will be read line by line and applied as arguments to the
+.Nm
+utility.
+.Pp
+Optionally, a preprocessor can be specified using
+.Fl p Ar preproc
+where
+.Ar pathname
+is to be piped through.
+Useful preprocessors include
+.Xr cpp 1
+and
+.Xr m4 1 .
+If
+.Ar preproc
+does not start with a slash
+.Pq Ql /
+as its first character, the usual
+.Ev PATH
+name search is performed.
+Care should be taken with this in environments where not all
+file systems are mounted (yet) by the time
+.Nm
+is being run (e.g.\& when they are mounted over NFS).
+Once
+.Fl p
+has been specified, any additional arguments are passed on to the preprocessor
+for interpretation.
+This allows for flexible configuration files (like conditionalizing
+them on the local hostname) and the use of macros to centralize
+frequently required arguments like IP addresses.
+.Ss TRAFFIC SHAPER CONFIGURATION
+The
+.Nm
+.Cm pipe , queue
+and
+.Cm sched
+commands are used to configure the traffic shaper and packet scheduler.
+See the
+.Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION
+Section below for details.
+.Pp
+If the world and the kernel get out of sync the
+.Nm
+ABI may break, preventing you from being able to add any rules.
+This can adversely affect the booting process.
+You can use
+.Nm
+.Cm disable
+.Cm firewall
+to temporarily disable the firewall to regain access to the network,
+allowing you to fix the problem.
+.Sh PACKET FLOW
+A packet is checked against the active ruleset in multiple places
+in the protocol stack, under control of several sysctl variables.
+These places and variables are shown below, and it is important to
+have this picture in mind in order to design a correct ruleset.
+.Bd -literal -offset indent
+       ^    to upper layers    V
+       |                       |
+       +----------->-----------+
+       ^                       V
+ [ip(6)_input]           [ip(6)_output]     net.inet(6).ip(6).fw.enable=1
+       |                       |
+       ^                       V
+ [ether_demux]        [ether_output_frame]  net.link.ether.ipfw=1
+       |                       |
+       +-->--[bdg_forward]-->--+            net.link.bridge.ipfw=1
+       ^                       V
+       |      to devices       |
+.Ed
+.Pp
+The number of
+times the same packet goes through the firewall can
+vary between 0 and 4 depending on packet source and
+destination, and system configuration.
+.Pp
+Note that as packets flow through the stack, headers can be
+stripped or added to it, and so they may or may not be available
+for inspection.
+E.g., incoming packets will include the MAC header when
+.Nm
+is invoked from
+.Cm ether_demux() ,
+but the same packets will have the MAC header stripped off when
+.Nm
+is invoked from
+.Cm ip_input()
+or
+.Cm ip6_input() .
+.Pp
+Also note that each packet is always checked against the complete ruleset,
+irrespective of the place where the check occurs, or the source of the packet.
+If a rule contains some match patterns or actions which are not valid
+for the place of invocation (e.g.\& trying to match a MAC header within
+.Cm ip_input
+or
+.Cm ip6_input ),
+the match pattern will not match, but a
+.Cm not
+operator in front of such patterns
+.Em will
+cause the pattern to
+.Em always
+match on those packets.
+It is thus the responsibility of
+the programmer, if necessary, to write a suitable ruleset to
+differentiate among the possible places.
+.Cm skipto
+rules can be useful here, as an example:
+.Bd -literal -offset indent
+# packets from ether_demux or bdg_forward
+ipfw add 10 skipto 1000 all from any to any layer2 in
+# packets from ip_input
+ipfw add 10 skipto 2000 all from any to any not layer2 in
+# packets from ip_output
+ipfw add 10 skipto 3000 all from any to any not layer2 out
+# packets from ether_output_frame
+ipfw add 10 skipto 4000 all from any to any layer2 out
+.Ed
+.Pp
+(yes, at the moment there is no way to differentiate between
+ether_demux and bdg_forward).
+.Sh SYNTAX
+In general, each keyword or argument must be provided as
+a separate command line argument, with no leading or trailing
+spaces.
+Keywords are case-sensitive, whereas arguments may
+or may not be case-sensitive depending on their nature
+(e.g.\& uid's are, hostnames are not).
+.Pp
+Some arguments (e.g., port or address lists) are comma-separated
+lists of values.
+In this case, spaces after commas ',' are allowed to make
+the line more readable.
+You can also put the entire
+command (including flags) into a single argument.
+E.g., the following forms are equivalent:
+.Bd -literal -offset indent
+ipfw -q add deny src-ip 10.0.0.0/24,127.0.0.1/8
+ipfw -q add deny src-ip 10.0.0.0/24, 127.0.0.1/8
+ipfw "-q add deny src-ip 10.0.0.0/24, 127.0.0.1/8"
+.Ed
+.Sh RULE FORMAT
+The format of firewall rules is the following:
+.Bd -ragged -offset indent
+.Bk -words
+.Op Ar rule_number
+.Op Cm set Ar set_number
+.Op Cm prob Ar match_probability
+.Ar action
+.Op Cm log Op Cm logamount Ar number
+.Op Cm altq Ar queue
+.Oo
+.Bro Cm tag | untag
+.Brc Ar number
+.Oc
+.Ar body
+.Ek
+.Ed
+.Pp
+where the body of the rule specifies which information is used
+for filtering packets, among the following:
+.Pp
+.Bl -tag -width "Source and dest. addresses and ports" -offset XXX -compact
+.It Layer-2 header fields
+When available
+.It IPv4 and IPv6 Protocol
+TCP, UDP, ICMP, etc.
+.It Source and dest. addresses and ports
+.It Direction
+See Section
+.Sx PACKET FLOW
+.It Transmit and receive interface
+By name or address
+.It Misc. IP header fields
+Version, type of service, datagram length, identification,
+fragment flag (non-zero IP offset),
+Time To Live
+.It IP options
+.It IPv6 Extension headers
+Fragmentation, Hop-by-Hop options,
+Routing Headers, Source routing rthdr0, Mobile IPv6 rthdr2, IPSec options.
+.It IPv6 Flow-ID
+.It Misc. TCP header fields
+TCP flags (SYN, FIN, ACK, RST, etc.),
+sequence number, acknowledgment number,
+window
+.It TCP options
+.It ICMP types
+for ICMP packets
+.It ICMP6 types
+for ICMP6 packets
+.It User/group ID
+When the packet can be associated with a local socket.
+.It Divert status
+Whether a packet came from a divert socket (e.g.,
+.Xr natd 8 ) .
+.It Fib annotation state
+Whether a packet has been tagged for using a specific FIB (routing table)
+in future forwarding decisions.
+.El
+.Pp
+Note that some of the above information, e.g.\& source MAC or IP addresses and
+TCP/UDP ports, can be easily spoofed, so filtering on those fields
+alone might not guarantee the desired results.
+.Bl -tag -width indent
+.It Ar rule_number
+Each rule is associated with a
+.Ar rule_number
+in the range 1..65535, with the latter reserved for the
+.Em default
+rule.
+Rules are checked sequentially by rule number.
+Multiple rules can have the same number, in which case they are
+checked (and listed) according to the order in which they have
+been added.
+If a rule is entered without specifying a number, the kernel will
+assign one in such a way that the rule becomes the last one
+before the
+.Em default
+rule.
+Automatic rule numbers are assigned by incrementing the last
+non-default rule number by the value of the sysctl variable
+.Ar net.inet.ip.fw.autoinc_step
+which defaults to 100.
+If this is not possible (e.g.\& because we would go beyond the
+maximum allowed rule number), the number of the last
+non-default value is used instead.
+.It Cm set Ar set_number
+Each rule is associated with a
+.Ar set_number
+in the range 0..31.
+Sets can be individually disabled and enabled, so this parameter
+is of fundamental importance for atomic ruleset manipulation.
+It can be also used to simplify deletion of groups of rules.
+If a rule is entered without specifying a set number,
+set 0 will be used.
+.br
+Set 31 is special in that it cannot be disabled,
+and rules in set 31 are not deleted by the
+.Nm ipfw flush
+command (but you can delete them with the
+.Nm ipfw delete set 31
+command).
+Set 31 is also used for the
+.Em default
+rule.
+.It Cm prob Ar match_probability
+A match is only declared with the specified probability
+(floating point number between 0 and 1).
+This can be useful for a number of applications such as
+random packet drop or
+(in conjunction with
+.Nm dummynet )
+to simulate the effect of multiple paths leading to out-of-order
+packet delivery.
+.Pp
+Note: this condition is checked before any other condition, including
+ones such as keep-state or check-state which might have side effects.
+.It Cm log Op Cm logamount Ar number
+Packets matching a rule with the
+.Cm log
+keyword will be made available for logging in two ways:
+if the sysctl variable
+.Va net.inet.ip.fw.verbose
+is set to 0 (default), one can use
+.Xr bpf 4
+attached to the
+.Li ipfw0
+pseudo interface.
+This pseudo interface can be created after a boot
+manually by using the following command:
+.Bd -literal -offset indent
+# ifconfig ipfw0 create
+.Ed
+.Pp
+Or, automatically at boot time by adding the following
+line to the
+.Xr rc.conf 5
+file:
+.Bd -literal -offset indent
+firewall_logif="YES"
+.Ed
+.Pp
+There is no overhead if no
+.Xr bpf 4
+is attached to the pseudo interface.
+.Pp
+If
+.Va net.inet.ip.fw.verbose
+is set to 1, packets will be logged to
+.Xr syslogd 8
+with a
+.Dv LOG_SECURITY
+facility up to a maximum of
+.Cm logamount
+packets.
+If no
+.Cm logamount
+is specified, the limit is taken from the sysctl variable
+.Va net.inet.ip.fw.verbose_limit .
+In both cases, a value of 0 means unlimited logging.
+.Pp
+Once the limit is reached, logging can be re-enabled by
+clearing the logging counter or the packet counter for that entry, see the
+.Cm resetlog
+command.
+.Pp
+Note: logging is done after all other packet matching conditions
+have been successfully verified, and before performing the final
+action (accept, deny, etc.) on the packet.
+.It Cm tag Ar number
+When a packet matches a rule with the
+.Cm tag
+keyword, the numeric tag for the given
+.Ar number
+in the range 1..65534 will be attached to the packet.
+The tag acts as an internal marker (it is not sent out over
+the wire) that can be used to identify these packets later on.
+This can be used, for example, to provide trust between interfaces
+and to start doing policy-based filtering.
+A packet can have multiple tags at the same time.
+Tags are "sticky", meaning once a tag is applied to a packet by a
+matching rule it exists until explicit removal.
+Tags are kept with the packet everywhere within the kernel, but are
+lost when the packet leaves the kernel, for example, when transmitting
+the packet out to the network or sending it to a
+.Xr divert 4
+socket.
+.Pp
+To check for previously applied tags, use the
+.Cm tagged
+rule option.
+To delete a previously applied tag, use the
+.Cm untag
+keyword.
+.Pp
+Note: since tags are kept with the packet everywhere in kernelspace,
+they can be set and unset anywhere in the kernel network subsystem
+(using the
+.Xr mbuf_tags 9
+facility), not only by means of the
+.Xr ipfw 4
+.Cm tag
+and
+.Cm untag
+keywords.
+For example, there can be a specialized
+.Xr netgraph 4
+node doing traffic analysis and tagging for later inspection
+in the firewall.
+.It Cm untag Ar number
+When a packet matches a rule with the
+.Cm untag
+keyword, the tag with the number
+.Ar number
+is searched among the tags attached to this packet and,
+if found, removed from it.
+Other tags bound to the packet, if present, are left untouched.
+.It Cm altq Ar queue
+When a packet matches a rule with the
+.Cm altq
+keyword, the ALTQ identifier for the given
+.Ar queue
+(see
+.Xr altq 4 )
+will be attached.
+Note that this ALTQ tag is only meaningful for packets going "out" of IPFW,
+and not being rejected or going to divert sockets.
+Note that if there is insufficient memory at the time the packet is
+processed, it will not be tagged, so it is wise to make your ALTQ
+"default" queue policy account for this.
+If multiple
+.Cm altq
+rules match a single packet, only the first one adds the ALTQ classification
+tag.
+In doing so, traffic may be shaped by using
+.Cm count Cm altq Ar queue
+rules for classification early in the ruleset, then later applying
+the filtering decision.
+For example,
+.Cm check-state
+and
+.Cm keep-state
+rules may come later and provide the actual filtering decisions in
+addition to the fallback ALTQ tag.
+.Pp
+You must run
+.Xr pfctl 8
+to set up the queues before IPFW will be able to look them up by name,
+and if the ALTQ disciplines are rearranged, the rules containing the
+queue identifiers in the kernel will likely have gone stale and need
+to be reloaded.
+Stale queue identifiers will probably result in misclassification.
+.Pp
+All system ALTQ processing can be turned on or off via
+.Nm
+.Cm enable Ar altq
+and
+.Nm
+.Cm disable Ar altq .
+The usage of
+.Va net.inet.ip.fw.one_pass
+is irrelevant to ALTQ traffic shaping, as the actual rule action is always
+followed after adding an ALTQ tag.
+.El
+.Ss RULE ACTIONS
+A rule can be associated with one of the following actions, which
+will be executed when the packet matches the body of the rule.
+.Bl -tag -width indent
+.It Cm allow | accept | pass | permit
+Allow packets that match rule.
+The search terminates.
+.It Cm check-state
+Checks the packet against the dynamic ruleset.
+If a match is found, execute the action associated with
+the rule which generated this dynamic rule, otherwise
+move to the next rule.
+.br
+.Cm Check-state
+rules do not have a body.
+If no
+.Cm check-state
+rule is found, the dynamic ruleset is checked at the first
+.Cm keep-state
+or
+.Cm limit
+rule.
+.It Cm count
+Update counters for all packets that match rule.
+The search continues with the next rule.
+.It Cm deny | drop
+Discard packets that match this rule.
+The search terminates.
+.It Cm divert Ar port
+Divert packets that match this rule to the
+.Xr divert 4
+socket bound to port
+.Ar port .
+The search terminates.
+.It Cm fwd | forward Ar ipaddr | tablearg Ns Op , Ns Ar port
+Change the next-hop on matching packets to
+.Ar ipaddr ,
+which can be an IP address or a host name.
+For IPv4, the next hop can also be supplied by the last table
+looked up for the packet by using the
+.Cm tablearg
+keyword instead of an explicit address.
+The search terminates if this rule matches.
+.Pp
+If
+.Ar ipaddr
+is a local address, then matching packets will be forwarded to
+.Ar port
+(or the port number in the packet if one is not specified in the rule)
+on the local machine.
+.br
+If
+.Ar ipaddr
+is not a local address, then the port number
+(if specified) is ignored, and the packet will be
+forwarded to the remote address, using the route as found in
+the local routing table for that IP.
+.br
+A
+.Cm fwd
+rule will not match layer-2 packets (those received
+on ether_input, ether_output, or bridged).
+.br
+The
+.Cm fwd
+action does not change the contents of the packet at all.
+In particular, the destination address remains unmodified, so
+packets forwarded to another system will usually be rejected by that system
+unless there is a matching rule on that system to capture them.
+For packets forwarded locally,
+the local address of the socket will be
+set to the original destination address of the packet.
+This makes the
+.Xr netstat 1
+entry look rather weird but is intended for
+use with transparent proxy servers.
+.It Cm nat Ar nat_nr | tablearg
+Pass packet to a
+nat instance
+(for network address translation, address redirect, etc.):
+see the
+.Sx NETWORK ADDRESS TRANSLATION (NAT)
+Section for further information.
+.It Cm pipe Ar pipe_nr
+Pass packet to a
+.Nm dummynet
+.Dq pipe
+(for bandwidth limitation, delay, etc.).
+See the
+.Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION
+Section for further information.
+The search terminates; however, on exit from the pipe and if
+the
+.Xr sysctl 8
+variable
+.Va net.inet.ip.fw.one_pass
+is not set, the packet is passed again to the firewall code
+starting from the next rule.
+.It Cm queue Ar queue_nr
+Pass packet to a
+.Nm dummynet
+.Dq queue
+(for bandwidth limitation using WF2Q+).
+.It Cm reject
+(Deprecated).
+Synonym for
+.Cm unreach host .
+.It Cm reset
+Discard packets that match this rule, and if the
+packet is a TCP packet, try to send a TCP reset (RST) notice.
+The search terminates.
+.It Cm reset6
+Discard packets that match this rule, and if the
+packet is a TCP packet, try to send a TCP reset (RST) notice.
+The search terminates.
+.It Cm skipto Ar number | tablearg
+Skip all subsequent rules numbered less than
+.Ar number .
+The search continues with the first rule numbered
+.Ar number
+or higher.
+It is possible to use the
+.Cm tablearg
+keyword with a skipto for a
+.Em computed
+skipto, but care should be used, as no destination caching
+is possible in this case so the rules are always walked to find it,
+starting from the
+.Cm skipto .
+.It Cm call Ar number | tablearg
+The current rule number is saved in the internal stack and
+ruleset processing continues with the first rule numbered
+.Ar number
+or higher.
+If later a rule with the
+.Cm return
+action is encountered, the processing returns to the first rule
+with number of this
+.Cm call
+rule plus one or higher
+(the same behaviour as with packets returning from
+.Xr divert 4
+socket after a
+.Cm divert
+action).
+This can be used to make something like assembly-language
+.Dq subroutine
+calls to rules with common checks for different interfaces, etc.
+.Pp
+A rule with any number can be called, not just forward jumps as with
+.Cm skipto .
+So, to prevent endless loops in case of mistakes, both
+.Cm call
+and
+.Cm return
+actions do not jump and simply go to the next rule if memory
+cannot be allocated or the stack would overflow or underflow.
+.Pp
+Internally, the stack of rule numbers is implemented using the
+.Xr mbuf_tags 9
+facility and currently has a size of 16 entries.
+As mbuf tags are lost when the packet leaves the kernel,
+.Cm divert
+should not be used in subroutines to avoid endless loops
+and other undesired effects.
+.It Cm return
+Takes the rule number saved to the internal stack by the last
+.Cm call
+action and returns ruleset processing to the first rule
+with a number greater than the number of the corresponding
+.Cm call
+rule.
+See description of the
+.Cm call
+action for more details.
+.Pp
+Note that
+.Cm return
+rules usually end a
+.Dq subroutine
+and thus are unconditional, but the
+.Nm
+command-line utility currently requires every action except
+.Cm check-state
+to have a body.
+While it is sometimes useful to return only on some packets,
+usually you want to print just
+.Dq return
+for readability.
+A workaround for this is to use new syntax and
+.Fl c
+switch:
+.Bd -literal -offset indent
+# Add a rule without actual body
+ipfw add 2999 return via any
+
+# List rules without "from any to any" part
+ipfw -c list
+.Ed
+.Pp
+This cosmetic annoyance may be fixed in future releases.
+.It Cm tee Ar port
+Send a copy of packets matching this rule to the
+.Xr divert 4
+socket bound to port
+.Ar port .
+The search continues with the next rule.
+.It Cm unreach Ar code
+Discard packets that match this rule, and try to send an ICMP
+unreachable notice with code
+.Ar code ,
+where
+.Ar code
+is a number from 0 to 255, or one of these aliases:
+.Cm net , host , protocol , port ,
+.Cm needfrag , srcfail , net-unknown , host-unknown ,
+.Cm isolated , net-prohib , host-prohib , tosnet ,
+.Cm toshost , filter-prohib , host-precedence
+or
+.Cm precedence-cutoff .
+The search terminates.
+.It Cm unreach6 Ar code
+Discard packets that match this rule, and try to send an ICMPv6
+unreachable notice with code
+.Ar code ,
+where
+.Ar code
+is one of the numbers 0, 1, 3 or 4, or one of these aliases:
+.Cm no-route , admin-prohib , address
+or
+.Cm port .
+The search terminates.
+.It Cm netgraph Ar cookie
+Divert packet into netgraph with given
+.Ar cookie .
+The search terminates.
+If the packet is later returned from netgraph, it is either
+accepted or continues with the next rule, depending on
+.Va net.inet.ip.fw.one_pass
+sysctl variable.
+.It Cm ngtee Ar cookie
+A copy of the packet is diverted into netgraph; the original
+packet continues with the next rule.
+See
+.Xr ng_ipfw 4
+for more information on
+.Cm netgraph
+and
+.Cm ngtee
+actions.
+.It Cm setfib Ar fibnum | tablearg
+The packet is tagged so as to use the FIB (routing table)
+.Ar fibnum
+in any subsequent forwarding decisions.
+In the current implementation, this is limited to the values 0 through 15, see
+.Xr setfib 2 .
+Processing continues at the next rule.
+It is possible to use the
+.Cm tablearg
+keyword with setfib.
+If the tablearg value is not within the compiled range of fibs,
+the packet's fib is set to 0.
+.It Cm setdscp Ar DSCP | number | tablearg
+Set specified DiffServ codepoint for an IPv4/IPv6 packet.
+Processing continues at the next rule.
+Supported values are:
+.Pp
+.Cm CS0
+.Pq Dv 000000 ,
+.Cm CS1
+.Pq Dv 001000 ,
+.Cm CS2
+.Pq Dv 010000 ,
+.Cm CS3
+.Pq Dv 011000 ,
+.Cm CS4
+.Pq Dv 100000 ,
+.Cm CS5
+.Pq Dv 101000 ,
+.Cm CS6
+.Pq Dv 110000 ,
+.Cm CS7
+.Pq Dv 111000 ,
+.Cm AF11
+.Pq Dv 001010 ,
+.Cm AF12
+.Pq Dv 001100 ,
+.Cm AF13
+.Pq Dv 001110 ,
+.Cm AF21
+.Pq Dv 010010 ,
+.Cm AF22
+.Pq Dv 010100 ,
+.Cm AF23
+.Pq Dv 010110 ,
+.Cm AF31
+.Pq Dv 011010 ,
+.Cm AF32
+.Pq Dv 011100 ,
+.Cm AF33
+.Pq Dv 011110 ,
+.Cm AF41
+.Pq Dv 100010 ,
+.Cm AF42
+.Pq Dv 100100 ,
+.Cm AF43
+.Pq Dv 100110 ,
+.Cm EF
+.Pq Dv 101110 ,
+.Cm BE
+.Pq Dv 000000 .
+Additionally, the DSCP value can be specified by number (0..63).
+It is also possible to use the
+.Cm tablearg
+keyword with setdscp.
+If the tablearg value is not within the 0..63 range, the lower 6 bits of the
+supplied value are used.
+.It Cm reass
+Queue and reassemble IP fragments.
+If the packet is not fragmented, counters are updated and
+processing continues with the next rule.
+If the packet is the last logical fragment, the packet is reassembled and, if
+.Va net.inet.ip.fw.one_pass
+is set to 0, processing continues with the next rule.
+Otherwise, the packet is allowed to pass and the search terminates.
+If the packet is a fragment in the middle of a logical group of fragments,
+it is consumed and
+processing stops immediately.
+.Pp
+Fragment handling can be tuned via
+.Va net.inet.ip.maxfragpackets
+and
+.Va net.inet.ip.maxfragsperpacket
+which limit, respectively, the maximum number of processable
+fragments (default: 800) and
+the maximum number of fragments per packet (default: 16).
+.Pp
+NOTA BENE: since fragments do not contain port numbers,
+they should be avoided with the
+.Nm reass
+rule.
+Alternatively, direction-based (like
+.Nm in
+/
+.Nm out )
+and source-based (like
+.Nm via )
+match patterns can be used to select fragments.
+.Pp
+Usually a simple rule like:
+.Bd -literal -offset indent
+# reassemble incoming fragments
+ipfw add reass all from any to any in
+.Ed
+.Pp
+is all you need at the beginning of your ruleset.
+.El
+.Ss RULE BODY
+The body of a rule contains zero or more patterns (such as
+specific source and destination addresses or ports,
+protocol options, incoming or outgoing interfaces, etc.)
+that the packet must match in order to be recognised.
+In general, the patterns are connected by (implicit)
+.Cm and
+operators -- i.e., all must match in order for the
+rule to match.
+Individual patterns can be prefixed by the
+.Cm not
+operator to reverse the result of the match, as in
+.Pp
+.Dl "ipfw add 100 allow ip from not 1.2.3.4 to any"
+.Pp
+Additionally, sets of alternative match patterns
+.Pq Em or-blocks
+can be constructed by putting the patterns in
+lists enclosed between parentheses ( ) or braces { }, and
+using the
+.Cm or
+operator as follows:
+.Pp
+.Dl "ipfw add 100 allow ip from { x or not y or z } to any"
+.Pp
+Only one level of parentheses is allowed.
+Beware that most shells have special meanings for parentheses
+or braces, so it is advisable to put a backslash \\ in front of them
+to prevent such interpretations.
+.Pp
+The body of a rule must in general include a source and destination
+address specifier.
+The keyword
+.Ar any
+can be used in various places to specify that the content of
+a required field is irrelevant.
+.Pp
+The rule body has the following format:
+.Bd -ragged -offset indent
+.Op Ar proto Cm from Ar src Cm to Ar dst
+.Op Ar options
+.Ed
+.Pp
+The first part (proto from src to dst) is for backward
+compatibility with earlier versions of
+.Fx .
+In modern
+.Fx
+any match pattern (including MAC headers, IP protocols,
+addresses and ports) can be specified in the
+.Ar options
+section.
+.Pp
+Rule fields have the following meaning:
+.Bl -tag -width indent
+.It Ar proto : protocol | Cm { Ar protocol Cm or ... }
+.It Ar protocol : Oo Cm not Oc Ar protocol-name | protocol-number
+An IP protocol specified by number or name
+(for a complete list see
+.Pa /etc/protocols ) ,
+or one of the following keywords:
+.Bl -tag -width indent
+.It Cm ip4 | ipv4
+Matches IPv4 packets.
+.It Cm ip6 | ipv6
+Matches IPv6 packets.
+.It Cm ip | all
+Matches any packet.
+.El
+.Pp
+The
+.Cm ipv6
+in
+.Cm proto
+option will be treated as inner protocol.
+And, the
+.Cm ipv4
+is not available in
+.Cm proto
+option.
+.Pp
+The
+.Cm { Ar protocol Cm or ... }
+format (an
+.Em or-block )
+is provided for convenience only but its use is deprecated.
+.It Ar src No and Ar dst : Bro Cm addr | Cm { Ar addr Cm or ... } Brc Op Oo Cm not Oc Ar ports
+An address (or a list, see below)
+optionally followed by
+.Ar ports
+specifiers.
+.Pp
+The second format
+.Em ( or-block
+with multiple addresses) is provided for convenience only and
+its use is discouraged.
+.It Ar addr : Oo Cm not Oc Bro
+.Cm any | me | me6 |
+.Cm table Ns Pq Ar number Ns Op , Ns Ar value
+.Ar | addr-list | addr-set
+.Brc
+.Bl -tag -width indent
+.It Cm any
+matches any IP address.
+.It Cm me
+matches any IP address configured on an interface in the system.
+.It Cm me6
+matches any IPv6 address configured on an interface in the system.
+The address list is evaluated at the time the packet is
+analysed.
+.It Cm table Ns Pq Ar number Ns Op , Ns Ar value
+Matches any IPv4 address for which an entry exists in the lookup table
+.Ar number .
+If an optional 32-bit unsigned
+.Ar value
+is also specified, an entry will match only if it has this value.
+See the
+.Sx LOOKUP TABLES
+section below for more information on lookup tables.
+.El
+.It Ar addr-list : ip-addr Ns Op Ns , Ns Ar addr-list
+.It Ar ip-addr :
+A host or subnet address specified in one of the following ways:
+.Bl -tag -width indent
+.It Ar numeric-ip | hostname
+Matches a single IPv4 address, specified as dotted-quad or a hostname.
+Hostnames are resolved at the time the rule is added to the firewall list.
+.It Ar addr Ns / Ns Ar masklen
+Matches all addresses with base
+.Ar addr
+(specified as an IP address, a network number, or a hostname)
+and mask width of
+.Cm masklen
+bits.
+As an example, 1.2.3.4/25 or 1.2.3.0/25 will match
+all IP numbers from 1.2.3.0 to 1.2.3.127 .
+.It Ar addr Ns : Ns Ar mask
+Matches all addresses with base
+.Ar addr
+(specified as an IP address, a network number, or a hostname)
+and the mask of
+.Ar mask ,
+specified as a dotted quad.
+As an example, 1.2.3.4:255.0.255.0 or 1.0.3.0:255.0.255.0 will match
+1.*.3.*.
+This form is advised only for non-contiguous
+masks.
+It is better to resort to the
+.Ar addr Ns / Ns Ar masklen
+format for contiguous masks, which is more compact and less
+error-prone.
+.El
+.It Ar addr-set : addr Ns Oo Ns / Ns Ar masklen Oc Ns Cm { Ns Ar list Ns Cm }
+.It Ar list : Bro Ar num | num-num Brc Ns Op Ns , Ns Ar list
+Matches all addresses with base address
+.Ar addr
+(specified as an IP address, a network number, or a hostname)
+and whose last byte is in the list between braces { } .
+Note that there must be no spaces between braces and
+numbers (spaces after commas are allowed).
+Elements of the list can be specified as single entries
+or ranges.
+The
+.Ar masklen
+field is used to limit the size of the set of addresses,
+and can have any value between 24 and 32.
+If not specified,
+it is assumed to be 24.
+.br
+This format is particularly useful to handle sparse address sets
+within a single rule.
+Because the matching occurs using a
+bitmask, it takes constant time and dramatically reduces
+the complexity of rulesets.
+.br
+As an example, an address specified as 1.2.3.4/24{128,35-55,89}
+or 1.2.3.0/24{128,35-55,89}
+will match the following IP addresses:
+.br
+1.2.3.128, 1.2.3.35 to 1.2.3.55, 1.2.3.89 .
+.It Ar addr6-list : ip6-addr Ns Op Ns , Ns Ar addr6-list
+.It Ar ip6-addr :
+A host or subnet specified in one of the following ways:
+.Bl -tag -width indent
+.It Ar numeric-ip | hostname
+Matches a single IPv6 address as allowed by
+.Xr inet_pton 3
+or a hostname.
+Hostnames are resolved at the time the rule is added to the firewall
+list.
+.It Ar addr Ns / Ns Ar masklen
+Matches all IPv6 addresses with base
+.Ar addr
+(specified as allowed by
+.Xr inet_pton 3
+or a hostname)
+and mask width of
+.Cm masklen
+bits.
+.El
+.Pp
+No support for sets of IPv6 addresses is provided because IPv6 addresses
+are typically random past the initial prefix.
+.It Ar ports : Bro Ar port | port Ns \&- Ns Ar port Ns Brc Ns Op , Ns Ar ports
+For protocols which support port numbers (such as TCP and UDP), optional
+.Cm ports
+may be specified as one or more ports or port ranges, separated
+by commas but no spaces, and an optional
+.Cm not
+operator.
+The
+.Ql \&-
+notation specifies a range of ports (including boundaries).
+.Pp
+Service names (from
+.Pa /etc/services )
+may be used instead of numeric port values.
+The length of the port list is limited to 30 ports or ranges,
+though one can specify larger ranges by using an
+.Em or-block
+in the
+.Cm options
+section of the rule.
+.Pp
+A backslash
+.Pq Ql \e
+can be used to escape the dash
+.Pq Ql -
+character in a service name (from a shell, the backslash must be
+typed twice to avoid the shell itself interpreting it as an escape
+character).
+.Pp
+.Dl "ipfw add count tcp from any ftp\e\e-data-ftp to any"
+.Pp
+Fragmented packets which have a non-zero offset (i.e., not the first
+fragment) will never match a rule which has one or more port
+specifications.
+See the
+.Cm frag
+option for details on matching fragmented packets.
+.El
+.Ss RULE OPTIONS (MATCH PATTERNS)
+Additional match patterns can be used within
+rules.
+Zero or more of these so-called
+.Em options
+can be present in a rule, optionally prefixed by the
+.Cm not
+operand, and possibly grouped into
+.Em or-blocks .
+.Pp
+The following match patterns can be used (listed in alphabetical order):
+.Bl -tag -width indent
+.It Cm // this is a comment.
+Inserts the specified text as a comment in the rule.
+Everything following // is considered as a comment and stored in the rule.
+You can have comment-only rules, which are listed as having a
+.Cm count
+action followed by the comment.
+.It Cm bridged
+Alias for
+.Cm layer2 .
+.It Cm diverted
+Matches only packets generated by a divert socket.
+.It Cm diverted-loopback
+Matches only packets coming from a divert socket back into the IP stack
+input for delivery.
+.It Cm diverted-output
+Matches only packets going from a divert socket back outward to the IP
+stack output for delivery.
+.It Cm dst-ip Ar ip-address
+Matches IPv4 packets whose destination IP is one of the address(es)
+specified as argument.
+.It Bro Cm dst-ip6 | dst-ipv6 Brc Ar ip6-address
+Matches IPv6 packets whose destination IP is one of the address(es)
+specified as argument.
+.It Cm dst-port Ar ports
+Matches IP packets whose destination port is one of the port(s)
+specified as argument.
+.It Cm established
+Matches TCP packets that have the RST or ACK bits set.
+.It Cm ext6hdr Ar header
+Matches IPv6 packets containing the extended header given by
+.Ar header .
+Supported headers are:
+.Pp
+Fragment
+.Pq Cm frag ,
+Hop-by-hop options
+.Pq Cm hopopt ,
+any type of Routing Header
+.Pq Cm route ,
+Source routing Routing Header Type 0
+.Pq Cm rthdr0 ,
+Mobile IPv6 Routing Header Type 2
+.Pq Cm rthdr2 ,
+Destination options
+.Pq Cm dstopt ,
+IPsec authentication headers
+.Pq Cm ah ,
+and IPsec encapsulating security payload headers
+.Pq Cm esp .
+.It Cm fib Ar fibnum
+Matches a packet that has been tagged to use
+the given FIB (routing table) number.
+.It Cm flow-id Ar labels
+Matches IPv6 packets containing any of the flow labels given in
+.Ar labels .
+.Ar labels
+is a comma separated list of numeric flow labels.
+.It Cm frag
+Matches packets that are fragments and not the first
+fragment of an IP datagram.
+Note that these packets will not have
+the next protocol header (e.g.\& TCP, UDP) so options that look into
+these headers cannot match.
+.It Cm gid Ar group
+Matches all TCP or UDP packets sent by or received for a
+.Ar group .
+A
+.Ar group
+may be specified by name or number.
+.It Cm jail Ar prisonID
+Matches all TCP or UDP packets sent by or received for the
+jail whose prison ID is
+.Ar prisonID .
+.It Cm icmptypes Ar types
+Matches ICMP packets whose ICMP type is in the list
+.Ar types .
+The list may be specified as any combination of
+individual types (numeric) separated by commas.
+.Em Ranges are not allowed .
+The supported ICMP types are:
+.Pp
+echo reply
+.Pq Cm 0 ,
+destination unreachable
+.Pq Cm 3 ,
+source quench
+.Pq Cm 4 ,
+redirect
+.Pq Cm 5 ,
+echo request
+.Pq Cm 8 ,
+router advertisement
+.Pq Cm 9 ,
+router solicitation
+.Pq Cm 10 ,
+time-to-live exceeded
+.Pq Cm 11 ,
+IP header bad
+.Pq Cm 12 ,
+timestamp request
+.Pq Cm 13 ,
+timestamp reply
+.Pq Cm 14 ,
+information request
+.Pq Cm 15 ,
+information reply
+.Pq Cm 16 ,
+address mask request
+.Pq Cm 17
+and address mask reply
+.Pq Cm 18 .
+.It Cm icmp6types Ar types
+Matches ICMP6 packets whose ICMP6 type is in the list of
+.Ar types .
+The list may be specified as any combination of
+individual types (numeric) separated by commas.
+.Em Ranges are not allowed .
+.It Cm in | out
+Matches incoming or outgoing packets, respectively.
+.Cm in
+and
+.Cm out
+are mutually exclusive (in fact,
+.Cm out
+is implemented as
+.Cm not in Ns No ).
+.It Cm ipid Ar id-list
+Matches IPv4 packets whose
+.Cm ip_id
+field has value included in
+.Ar id-list ,
+which is either a single value or a list of values or ranges
+specified in the same way as
+.Ar ports .
+.It Cm iplen Ar len-list
+Matches IP packets whose total length, including header and data, is
+in the set
+.Ar len-list ,
+which is either a single value or a list of values or ranges
+specified in the same way as
+.Ar ports .
+.It Cm ipoptions Ar spec
+Matches packets whose IPv4 header contains the comma separated list of
+options specified in
+.Ar spec .
+The supported IP options are:
+.Pp
+.Cm ssrr
+(strict source route),
+.Cm lsrr
+(loose source route),
+.Cm rr
+(record packet route) and
+.Cm ts
+(timestamp).
+The absence of a particular option may be denoted
+with a
+.Ql \&! .
+.It Cm ipprecedence Ar precedence
+Matches IPv4 packets whose precedence field is equal to
+.Ar precedence .
+.It Cm ipsec
+Matches packets that have IPSEC history associated with them
+(i.e., the packet comes encapsulated in IPSEC, the kernel
+has IPSEC support and IPSEC_FILTERTUNNEL option, and can correctly
+decapsulate it).
+.Pp
+Note that specifying
+.Cm ipsec
+is different from specifying
+.Cm proto Ar ipsec
+as the latter will only look at the specific IP protocol field,
+irrespective of IPSEC kernel support and the validity of the IPSEC data.
+.Pp
+Further note that this flag is silently ignored in kernels without
+IPSEC support.
+It does not affect rule processing when given and the
+rules are handled as if with no
+.Cm ipsec
+flag.
+.It Cm iptos Ar spec
+Matches IPv4 packets whose
+.Cm tos
+field contains the comma separated list of
+service types specified in
+.Ar spec .
+The supported IP types of service are:
+.Pp
+.Cm lowdelay
+.Pq Dv IPTOS_LOWDELAY ,
+.Cm throughput
+.Pq Dv IPTOS_THROUGHPUT ,
+.Cm reliability
+.Pq Dv IPTOS_RELIABILITY ,
+.Cm mincost
+.Pq Dv IPTOS_MINCOST ,
+.Cm congestion
+.Pq Dv IPTOS_ECN_CE .
+The absence of a particular type may be denoted
+with a
+.Ql \&! .
+.It Cm dscp spec Ns Op , Ns Ar spec
+Matches IPv4/IPv6 packets whose
+.Cm DS
+field value is contained in
+.Ar spec
+mask.
+Multiple values can be specified via
+the comma separated list.
+Value can be one of keywords used in
+.Cm setdscp
+action or exact number.
+.It Cm ipttl Ar ttl-list
+Matches IPv4 packets whose time to live is included in
+.Ar ttl-list ,
+which is either a single value or a list of values or ranges
+specified in the same way as
+.Ar ports .
+.It Cm ipversion Ar ver
+Matches IP packets whose IP version field is
+.Ar ver .
+.It Cm keep-state
+Upon a match, the firewall will create a dynamic rule, whose
+default behaviour is to match bidirectional traffic between
+source and destination IP/port using the same protocol.
+The rule has a limited lifetime (controlled by a set of
+.Xr sysctl 8
+variables), and the lifetime is refreshed every time a matching
+packet is found.
+.It Cm layer2
+Matches only layer2 packets, i.e., those passed to
+.Nm
+from ether_demux() and ether_output_frame().
+.It Cm limit Bro Cm src-addr | src-port | dst-addr | dst-port Brc Ar N
+The firewall will only allow
+.Ar N
+connections with the same
+set of parameters as specified in the rule.
+One or more
+of source and destination addresses and ports can be
+specified.
+Currently,
+only IPv4 flows are supported.
+.It Cm lookup Bro Cm dst-ip | dst-port | src-ip | src-port | uid | jail Brc Ar N
+Search for an entry in lookup table
+.Ar N
+that matches the field specified as argument.
+If not found, the match fails.
+Otherwise, the match succeeds and
+.Cm tablearg
+is set to the value extracted from the table.
+.Pp
+This option can be useful to quickly dispatch traffic based on
+certain packet fields.
+See the
+.Sx LOOKUP TABLES
+section below for more information on lookup tables.
+.It Cm { MAC | mac } Ar dst-mac src-mac
+Match packets with a given
+.Ar dst-mac
+and
+.Ar src-mac
+addresses, specified as the
+.Cm any
+keyword (matching any MAC address), or six groups of hex digits
+separated by colons,
+and optionally followed by a mask indicating the significant bits.
+The mask may be specified using either of the following methods:
+.Bl -enum -width indent
+.It
+A slash
+.Pq /
+followed by the number of significant bits.
+For example, an address with 33 significant bits could be specified as:
+.Pp
+.Dl "MAC 10:20:30:40:50:60/33 any"
+.Pp
+.It
+An ampersand
+.Pq &
+followed by a bitmask specified as six groups of hex digits separated
+by colons.
+For example, an address in which the last 16 bits are significant could
+be specified as:
+.Pp
+.Dl "MAC 10:20:30:40:50:60&00:00:00:00:ff:ff any"
+.Pp
+Note that the ampersand character has a special meaning in many shells
+and should generally be escaped.
+.Pp
+.El
+Note that the order of MAC addresses (destination first,
+source second) is
+the same as on the wire, but the opposite of the one used for
+IP addresses.
+.It Cm mac-type Ar mac-type
+Matches packets whose Ethernet Type field
+corresponds to one of those specified as argument.
+.Ar mac-type
+is specified in the same way as
+.Cm port numbers
+(i.e., one or more comma-separated single values or ranges).
+You can use symbolic names for known values such as
+.Em vlan , ipv4, ipv6 .
+Values can be entered as decimal or hexadecimal (if prefixed by 0x),
+and they are always printed as hexadecimal (unless the
+.Cm -N
+option is used, in which case symbolic resolution will be attempted).
+.It Cm proto Ar protocol
+Matches packets with the corresponding IP protocol.
+.It Cm recv | xmit | via Brq Ar ifX | Ar if Ns Cm * | Ar table Ns Pq Ar number Ns Op , Ns Ar value | Ar ipno | Ar any
+Matches packets received, transmitted or going through,
+respectively, the interface specified by exact name
+.Po Ar ifX Pc ,
+by device name
+.Po Ar if* Pc ,
+by IP address, or through some interface.
+.Pp
+The
+.Cm via
+keyword causes the interface to always be checked.
+If
+.Cm recv
+or
+.Cm xmit
+is used instead of
+.Cm via ,
+then only the receive or transmit interface (respectively)
+is checked.
+By specifying both, it is possible to match packets based on
+both receive and transmit interface, e.g.:
+.Pp
+.Dl "ipfw add deny ip from any to any out recv ed0 xmit ed1"
+.Pp
+The
+.Cm recv
+interface can be tested on either incoming or outgoing packets,
+while the
+.Cm xmit
+interface can only be tested on outgoing packets.
+So
+.Cm out
+is required (and
+.Cm in
+is invalid) whenever
+.Cm xmit
+is used.
+.Pp
+A packet might not have a receive or transmit interface: packets
+originating from the local host have no receive interface,
+while packets destined for the local host have no transmit
+interface.
+.It Cm setup
+Matches TCP packets that have the SYN bit set but no ACK bit.
+This is the short form of
+.Dq Li tcpflags\ syn,!ack .
+.It Cm sockarg
+Matches packets that are associated to a local socket and
+for which the SO_USER_COOKIE socket option has been set
+to a non-zero value.
+As a side effect, the value of the
+option is made available as
+.Cm tablearg
+value, which in turn can be used as
+.Cm skipto
+or
+.Cm pipe
+number.
+.It Cm src-ip Ar ip-address
+Matches IPv4 packets whose source IP is one of the address(es)
+specified as an argument.
+.It Cm src-ip6 Ar ip6-address
+Matches IPv6 packets whose source IP is one of the address(es)
+specified as an argument.
+.It Cm src-port Ar ports
+Matches IP packets whose source port is one of the port(s)
+specified as argument.
+.It Cm tagged Ar tag-list
+Matches packets whose tags are included in
+.Ar tag-list ,
+which is either a single value or a list of values or ranges
+specified in the same way as
+.Ar ports .
+Tags can be applied to the packet using
+.Cm tag
+rule action parameter (see its description for details on tags).
+.It Cm tcpack Ar ack
+TCP packets only.
+Match if the TCP header acknowledgment number field is set to
+.Ar ack .
+.It Cm tcpdatalen Ar tcpdatalen-list
+Matches TCP packets whose length of TCP data is
+.Ar tcpdatalen-list ,
+which is either a single value or a list of values or ranges
+specified in the same way as
+.Ar ports .
+.It Cm tcpflags Ar spec
+TCP packets only.
+Match if the TCP header contains the comma separated list of
+flags specified in
+.Ar spec .
+The supported TCP flags are:
+.Pp
+.Cm fin ,
+.Cm syn ,
+.Cm rst ,
+.Cm psh ,
+.Cm ack
+and
+.Cm urg .
+The absence of a particular flag may be denoted
+with a
+.Ql \&! .
+A rule which contains a
+.Cm tcpflags
+specification can never match a fragmented packet which has
+a non-zero offset.
+See the
+.Cm frag
+option for details on matching fragmented packets.
+.It Cm tcpseq Ar seq
+TCP packets only.
+Match if the TCP header sequence number field is set to
+.Ar seq .
+.It Cm tcpwin Ar tcpwin-list
+Matches TCP packets whose header window field is set to
+.Ar tcpwin-list ,
+which is either a single value or a list of values or ranges
+specified in the same way as
+.Ar ports .
+.It Cm tcpoptions Ar spec
+TCP packets only.
+Match if the TCP header contains the comma separated list of
+options specified in
+.Ar spec .
+The supported TCP options are:
+.Pp
+.Cm mss
+(maximum segment size),
+.Cm window
+(tcp window advertisement),
+.Cm sack
+(selective ack),
+.Cm ts
+(rfc1323 timestamp) and
+.Cm cc
+(rfc1644 t/tcp connection count).
+The absence of a particular option may be denoted
+with a
+.Ql \&! .
+.It Cm uid Ar user
+Match all TCP or UDP packets sent by or received for a
+.Ar user .
+A
+.Ar user
+may be matched by name or identification number.
+.It Cm verrevpath
+For incoming packets,
+a routing table lookup is done on the packet's source address.
+If the interface on which the packet entered the system matches the
+outgoing interface for the route,
+the packet matches.
+If the interfaces do not match up,
+the packet does not match.
+All outgoing packets or packets with no incoming interface match.
+.Pp
+The name and functionality of the option is intentionally similar to
+the Cisco IOS command:
+.Pp
+.Dl ip verify unicast reverse-path
+.Pp
+This option can be used to make anti-spoofing rules to reject all
+packets with source addresses not from this interface.
+See also the option
+.Cm antispoof .
+.It Cm versrcreach
+For incoming packets,
+a routing table lookup is done on the packet's source address.
+If a route to the source address exists, but not the default route
+or a blackhole/reject route, the packet matches.
+Otherwise, the packet does not match.
+All outgoing packets match.
+.Pp
+The name and functionality of the option is intentionally similar to
+the Cisco IOS command:
+.Pp
+.Dl ip verify unicast source reachable-via any
+.Pp
+This option can be used to make anti-spoofing rules to reject all
+packets whose source address is unreachable.
+.It Cm antispoof
+For incoming packets, the packet's source address is checked if it
+belongs to a directly connected network.
+If the network is directly connected, then the interface the packet
+came in on is compared to the interface the network is connected to.
+When incoming interface and directly connected interface are not the
+same, the packet does not match.
+Otherwise, the packet does match.
+All outgoing packets match.
+.Pp
+This option can be used to make anti-spoofing rules to reject all
+packets that pretend to be from a directly connected network but do
+not come in through that interface.
+This option is similar to but more restricted than
+.Cm verrevpath
+because it engages only on packets with source addresses of directly
+connected networks instead of all source addresses.
+.El
+.Sh LOOKUP TABLES
+Lookup tables are useful to handle large sparse sets of
+addresses or other search keys (e.g., ports, jail IDs, interface names).
+In the rest of this section we will use the term ``address''.
+There may be up to 65535 different lookup tables, numbered 0 to 65534.
+.Pp
+Each entry is represented by an
+.Ar addr Ns Op / Ns Ar masklen
+and will match all addresses with base
+.Ar addr
+(specified as an IPv4/IPv6 address, a hostname or an unsigned integer)
+and mask width of
+.Ar masklen
+bits.
+If
+.Ar masklen
+is not specified, it defaults to 32 for IPv4 and 128 for IPv6.
+When looking up an IP address in a table, the most specific
+entry will match.
+Associated with each entry is a 32-bit unsigned
+.Ar value ,
+which can optionally be checked by a rule matching code.
+When adding an entry, if
+.Ar value
+is not specified, it defaults to 0.
+.Pp
+An entry can be added to a table
+.Pq Cm add ,
+or removed from a table
+.Pq Cm delete .
+A table can be examined
+.Pq Cm list
+or flushed
+.Pq Cm flush .
+.Pp
+Internally, each table is stored in a Radix tree, the same way as
+the routing table (see
+.Xr route 4 ) .
+.Pp
+Lookup tables currently support only ports, jail IDs, IPv4/IPv6 addresses
+and interface names.
+Wildcards are not supported for interface names.
+.Pp
+The
+.Cm tablearg
+feature provides the ability to use a value, looked up in the table, as
+the argument for a rule action, action parameter or rule option.
+This can significantly reduce the number of rules in some configurations.
+If two tables are used in a rule, the result of the second (destination)
+is used.
+The
+.Cm tablearg
+argument can be used with the following actions:
+.Cm nat, pipe , queue, divert, tee, netgraph, ngtee, fwd, skipto, setfib,
+action parameters:
+.Cm tag, untag,
+rule options:
+.Cm limit, tagged.
+.Pp
+When used with
+.Cm fwd
+it is possible to supply table entries with values
+that are in the form of IP addresses or hostnames.
+See the
+.Sx EXAMPLES
+Section for example usage of tables and the tablearg keyword.
+.Pp
+When used with the
+.Cm skipto
+action, the user should be aware that the code will walk the ruleset
+up to a rule equal to, or past, the given number,
+and should therefore try to keep the
+ruleset compact between the skipto and the target rules.
+.Sh SETS OF RULES
+Each rule belongs to one of 32 different
+.Em sets ,
+numbered 0 to 31.
+Set 31 is reserved for the default rule.
+.Pp
+By default, rules are put in set 0, unless you use the
+.Cm set N
+attribute when entering a new rule.
+Sets can be individually and atomically enabled or disabled,
+so this mechanism permits an easy way to store multiple configurations
+of the firewall and quickly (and atomically) switch between them.
+The command to enable/disable sets is
+.Bd -ragged -offset indent
+.Nm
+.Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ...
+.Ed
+.Pp
+where multiple
+.Cm enable
+or
+.Cm disable
+sections can be specified.
+Command execution is atomic on all the sets specified in the command.
+By default, all sets are enabled.
+.Pp
+When you disable a set, its rules behave as if they do not exist
+in the firewall configuration, with only one exception:
+.Bd -ragged -offset indent
+dynamic rules created from a rule before it had been disabled
+will still be active until they expire.
+In order to delete
+dynamic rules you have to explicitly delete the parent rule
+which generated them.
+.Ed
+.Pp
+The set number of rules can be changed with the command
+.Bd -ragged -offset indent
+.Nm
+.Cm set move
+.Brq Cm rule Ar rule-number | old-set
+.Cm to Ar new-set
+.Ed
+.Pp
+Also, you can atomically swap two rulesets with the command
+.Bd -ragged -offset indent
+.Nm
+.Cm set swap Ar first-set second-set
+.Ed
+.Pp
+See the
+.Sx EXAMPLES
+Section on some possible uses of sets of rules.
+.Sh STATEFUL FIREWALL
+Stateful operation is a way for the firewall to dynamically
+create rules for specific flows when packets that
+match a given pattern are detected.
+Support for stateful
+operation comes through the
+.Cm check-state , keep-state
+and
+.Cm limit
+options of
+.Nm rules .
+.Pp
+Dynamic rules are created when a packet matches a
+.Cm keep-state
+or
+.Cm limit
+rule, causing the creation of a
+.Em dynamic
+rule which will match all and only packets with
+a given
+.Em protocol
+between a
+.Em src-ip/src-port dst-ip/dst-port
+pair of addresses
+.Em ( src
+and
+.Em dst
+are used here only to denote the initial match addresses, but they
+are completely equivalent afterwards).
+Dynamic rules will be checked at the first
+.Cm check-state , keep-state
+or
+.Cm limit
+occurrence, and the action performed upon a match will be the same
+as in the parent rule.
+.Pp
+Note that no additional attributes other than protocol and IP addresses
+and ports are checked on dynamic rules.
+.Pp
+The typical use of dynamic rules is to keep a closed firewall configuration,
+but let the first TCP SYN packet from the inside network install a
+dynamic rule for the flow so that packets belonging to that session
+will be allowed through the firewall:
+.Pp
+.Dl "ipfw add check-state"
+.Dl "ipfw add allow tcp from my-subnet to any setup keep-state"
+.Dl "ipfw add deny tcp from any to any"
+.Pp
+A similar approach can be used for UDP, where a UDP packet coming
+from the inside will install a dynamic rule to let the response through
+the firewall:
+.Pp
+.Dl "ipfw add check-state"
+.Dl "ipfw add allow udp from my-subnet to any keep-state"
+.Dl "ipfw add deny udp from any to any"
+.Pp
+Dynamic rules expire after some time, which depends on the status
+of the flow and the setting of some
+.Cm sysctl
+variables.
+See Section
+.Sx SYSCTL VARIABLES
+for more details.
+For TCP sessions, dynamic rules can be instructed to periodically
+send keepalive packets to refresh the state of the rule when it is
+about to expire.
+.Pp
+See Section
+.Sx EXAMPLES
+for more examples on how to use dynamic rules.
+.Sh TRAFFIC SHAPER (DUMMYNET) CONFIGURATION
+.Nm
+is also the user interface for the
+.Nm dummynet
+traffic shaper, packet scheduler and network emulator, a subsystem that
+can artificially queue, delay or drop packets
+emulating the behaviour of certain network links
+or queueing systems.
+.Pp
+.Nm dummynet
+operates by first using the firewall to select packets
+using any match pattern that can be used in
+.Nm
+rules.
+Matching packets are then passed to either of two
+different objects, which implement the traffic regulation:
+.Bl -hang -offset XXXX
+.It Em pipe
+A
+.Em pipe
+emulates a
+.Em link
+with given bandwidth and propagation delay,
+driven by a FIFO scheduler and a single queue with programmable
+queue size and packet loss rate.
+Packets are appended to the queue as they come out from
+.Nm ipfw ,
+and then transferred in FIFO order to the link at the desired rate.
+.It Em queue
+A
+.Em queue
+is an abstraction used to implement packet scheduling
+using one of several packet scheduling algorithms.
+Packets sent to a
+.Em queue
+are first grouped into flows according to a mask on the 5-tuple.
+Flows are then passed to the scheduler associated to the
+.Em queue ,
+and each flow uses scheduling parameters (weight and others)
+as configured in the
+.Em queue
+itself.
+A scheduler in turn is connected to an emulated link,
+and arbitrates the link's bandwidth among backlogged flows according to
+weights and to the features of the scheduling algorithm in use.
+.El
+.Pp
+In practice,
+.Em pipes
+can be used to set hard limits to the bandwidth that a flow can use, whereas
+.Em queues
+can be used to determine how different flows share the available bandwidth.
+.Pp
+A graphical representation of the binding of queues,
+flows, schedulers and links is below.
+.Bd -literal -offset indent
+                 (flow_mask|sched_mask)  sched_mask
+         +---------+   weight Wx  +-------------+
+         |         |->-[flow]-->--|             |-+
+    -->--| QUEUE x |   ...        |             | |
+         |         |->-[flow]-->--| SCHEDuler N | |
+         +---------+              |             | |
+             ...                  |             +--[LINK N]-->--
+         +---------+   weight Wy  |             | +--[LINK N]-->--
+         |         |->-[flow]-->--|             | |
+    -->--| QUEUE y |   ...        |             | |
+         |         |->-[flow]-->--|             | |
+         +---------+              +-------------+ |
+                                    +-------------+
+.Ed
+It is important to understand the role of the SCHED_MASK
+and FLOW_MASK, which are configured through the commands
+.Dl "ipfw sched N config mask SCHED_MASK ..."
+and
+.Dl "ipfw queue X config mask FLOW_MASK ..." .
+.Pp
+The SCHED_MASK is used to assign flows to one or more
+scheduler instances, one for each
+value of the packet's 5-tuple after applying SCHED_MASK.
+As an example, using ``src-ip 0xffffff00'' creates one instance
+for each /24 source subnet.
+.Pp
+The FLOW_MASK, together with the SCHED_MASK, is used to split
+packets into flows.
+As an example, using
+``src-ip 0x000000ff''
+together with the previous SCHED_MASK makes a flow for
+each individual source address.
+In turn, flows for each /24
+subnet will be sent to the same scheduler instance.
+.Pp
+The above diagram holds even for the
+.Em pipe
+case, with the only restriction that a
+.Em pipe
+only supports a SCHED_MASK, and forces the use of a FIFO
+scheduler (these are for backward compatibility reasons;
+in fact, internally, a
+.Nm dummynet's
+pipe is implemented exactly as above).
+.Pp
+There are two modes of
+.Nm dummynet
+operation:
+.Dq normal
+and
+.Dq fast .
+The
+.Dq normal
+mode tries to emulate a real link: the
+.Nm dummynet
+scheduler ensures that the packet will not leave the pipe faster than it
+would on the real link with a given bandwidth.
+The
+.Dq fast
+mode allows certain packets to bypass the
+.Nm dummynet
+scheduler (if packet flow does not exceed pipe's bandwidth).
+This is the reason why the
+.Dq fast
+mode requires fewer CPU cycles per packet (on average) and packet latency
+can be significantly lower in comparison to a real link with the same
+bandwidth.
+The default mode is
+.Dq normal .
+The
+.Dq fast
+mode can be enabled by setting the
+.Va net.inet.ip.dummynet.io_fast
+.Xr sysctl 8
+variable to a non-zero value.
+.Pp
+.Ss PIPE, QUEUE AND SCHEDULER CONFIGURATION
+The
+.Em pipe ,
+.Em queue
+and
+.Em scheduler
+configuration commands are the following:
+.Bd -ragged -offset indent
+.Cm pipe Ar number Cm config Ar pipe-configuration
+.Pp
+.Cm queue Ar number Cm config Ar queue-configuration
+.Pp
+.Cm sched Ar number Cm config Ar sched-configuration
+.Ed
+.Pp
+The following parameters can be configured for a pipe:
+.Pp
+.Bl -tag -width indent -compact
+.It Cm bw Ar bandwidth | device
+Bandwidth, measured in
+.Sm off
+.Op Cm K | M
+.Brq Cm bit/s | Byte/s .
+.Sm on
+.Pp
+A value of 0 (default) means unlimited bandwidth.
+The unit must immediately follow the number, as in
+.Pp
+.Dl "ipfw pipe 1 config bw 300Kbit/s"
+.Pp
+If a device name is specified instead of a numeric value, as in
+.Pp
+.Dl "ipfw pipe 1 config bw tun0"
+.Pp
+then the transmit clock is supplied by the specified device.
+At the moment only the
+.Xr tun 4
+device supports this
+functionality, for use in conjunction with
+.Xr ppp 8 .
+.Pp
+.It Cm delay Ar ms-delay
+Propagation delay, measured in milliseconds.
+The value is rounded to the next multiple of the clock tick
+(typically 10ms, but it is a good practice to run kernels
+with
+.Dq "options HZ=1000"
+to reduce
+the granularity to 1ms or less).
+The default value is 0, meaning no delay.
+.Pp
+.It Cm burst Ar size
+If the data to be sent exceeds the pipe's bandwidth limit
+(and the pipe was previously idle), up to
+.Ar size
+bytes of data are allowed to bypass the
+.Nm dummynet
+scheduler, and will be sent as fast as the physical link allows.
+Any additional data will be transmitted at the rate specified
+by the
+.Nm pipe
+bandwidth.
+The burst size depends on how long the pipe has been idle;
+the effective burst size is calculated as follows:
+MAX(
+.Ar size
+,
+.Nm bw
+* pipe_idle_time).
+.Pp
+.It Cm profile Ar filename
+A file specifying the additional overhead incurred in the transmission
+of a packet on the link.
+.Pp
+Some link types introduce extra delays in the transmission
+of a packet, e.g., because of MAC level framing, contention on
+the use of the channel, MAC level retransmissions and so on.
+From our point of view, the channel is effectively unavailable
+for this extra time, which is constant or variable depending
+on the link type.
+Additionally, packets may be dropped after this
+time (e.g., on a wireless link after too many retransmissions).
+We can model the additional delay with an empirical curve
+that represents its distribution.
+.Bd -literal -offset indent
+      cumulative probability
+      1.0 ^
+          |
+      L   +-- loss-level          x
+          |                 ******
+          |                *
+          |           *****
+          |          *
+          |        **
+          |       *
+          +-------*------------------->
+                      delay
+.Ed
+The empirical curve may have both vertical and horizontal lines.
+Vertical lines represent constant delay for a range of
+probabilities.
+Horizontal lines correspond to a discontinuity in the delay
+distribution: the pipe will use the largest delay for a
+given probability.
+.Pp
+The file format is the following, with whitespace acting as
+a separator and '#' indicating the beginning of a comment:
+.Bl -tag -width indent
+.It Cm name Ar identifier
+optional name (listed by "ipfw pipe show")
+to identify the delay distribution;
+.It Cm bw Ar value
+the bandwidth used for the pipe.
+If not specified here, it must be present
+explicitly as a configuration parameter for the pipe;
+.It Cm loss-level Ar L
+the probability above which packets are lost.
+(0.0 <= L <= 1.0, default 1.0 i.e., no loss);
+.It Cm samples Ar N
+the number of samples used in the internal
+representation of the curve (2..1024; default 100);
+.It Cm "delay prob" | "prob delay"
+One of these two lines is mandatory and defines
+the format of the following lines with data points.
+.It Ar XXX Ar YYY
+2 or more lines representing points in the curve,
+with either delay or probability first, according
+to the chosen format.
+The unit for delay is milliseconds.
+Data points do not need to be sorted.
+Also, the number of actual lines can be different
+from the value of the "samples" parameter:
+.Nm
+utility will sort and interpolate
+the curve as needed.
+.El
+.Pp
+Example of a profile file:
+.Bd -literal -offset indent
+name    bla_bla_bla
+samples 100
+loss-level    0.86
+prob    delay
+0       200    # minimum overhead is 200ms
+0.5     200
+0.5     300
+0.8     1000
+0.9     1300
+1       1300
+#configuration file end
+.Ed
+.El
+.Pp
+The following parameters can be configured for a queue:
+.Pp
+.Bl -tag -width indent -compact
+.It Cm pipe Ar pipe_nr
+Connects a queue to the specified pipe.
+Multiple queues (with the same or different weights) can be connected to
+the same pipe, which specifies the aggregate rate for the set of queues.
+.Pp
+.It Cm weight Ar weight
+Specifies the weight to be used for flows matching this queue.
+The weight must be in the range 1..100, and defaults to 1.
+.El
+.Pp
+The following case-insensitive parameters can be configured for a
+scheduler:
+.Pp
+.Bl -tag -width indent -compact
+.It Cm type Ar {fifo | wf2q+ | rr | qfq}
+specifies the scheduling algorithm to use.
+.Bl -tag -width indent -compact
+.It Cm fifo
+is just a FIFO scheduler (which means that all packets
+are stored in the same queue as they arrive to the scheduler).
+FIFO has O(1) per-packet time complexity, with very low
+constants (estimate 60-80ns on a 2GHz desktop machine)
+but gives no service guarantees.
+.It Cm wf2q+
+implements the WF2Q+ algorithm, which is a Weighted Fair Queueing
+algorithm which permits flows to share bandwidth according to
+their weights.
+Note that weights are not priorities; even a flow
+with a minuscule weight will never starve.
+WF2Q+ has O(log N) per-packet processing cost, where N is the number
+of flows, and is the default algorithm used by previous versions
+of dummynet's queues.
+.It Cm rr
+implements the Deficit Round Robin algorithm, which has O(1) processing
+costs (roughly, 100-150ns per packet)
+and permits bandwidth allocation according to weights, but
+with poor service guarantees.
+.It Cm qfq
+implements the QFQ algorithm, which is a very fast variant of
+WF2Q+, with similar service guarantees and O(1) processing
+costs (roughly, 200-250ns per packet).
+.El
+.El
+.Pp
+In addition to the type, all parameters allowed for a pipe can also
+be specified for a scheduler.
+.Pp
+Finally, the following parameters can be configured for both
+pipes and queues:
+.Pp
+.Bl -tag -width XXXX -compact
+.It Cm buckets Ar hash-table-size
+Specifies the size of the hash table used for storing the
+various queues.
+Default value is 64, controlled by the
+.Xr sysctl 8
+variable
+.Va net.inet.ip.dummynet.hash_size ,
+allowed range is 16 to 65536.
+.Pp
+.It Cm mask Ar mask-specifier
+Packets sent to a given pipe or queue by an
+.Nm
+rule can be further classified into multiple flows, each of which is then
+sent to a different
+.Em dynamic
+pipe or queue.
+A flow identifier is constructed by masking the IP addresses,
+ports and protocol types as specified with the
+.Cm mask
+options in the configuration of the pipe or queue.
+For each different flow identifier, a new pipe or queue is created
+with the same parameters as the original object, and matching packets
+are sent to it.
+.Pp
+Thus, when
+.Em dynamic pipes
+are used, each flow will get the same bandwidth as defined by the pipe,
+whereas when
+.Em dynamic queues
+are used, each flow will share the parent's pipe bandwidth evenly
+with other flows generated by the same queue (note that other queues
+with different weights might be connected to the same pipe).
+.br
+Available mask specifiers are a combination of one or more of the following:
+.Pp
+.Cm dst-ip Ar mask ,
+.Cm dst-ip6 Ar mask ,
+.Cm src-ip Ar mask ,
+.Cm src-ip6 Ar mask ,
+.Cm dst-port Ar mask ,
+.Cm src-port Ar mask ,
+.Cm flow-id Ar mask ,
+.Cm proto Ar mask
+or
+.Cm all ,
+.Pp
+where the latter means all bits in all fields are significant.
+.Pp
+.It Cm noerror
+When a packet is dropped by a
+.Nm dummynet
+queue or pipe, the error
+is normally reported to the caller routine in the kernel, in the
+same way as it happens when a device queue fills up.
+Setting this
+option reports the packet as successfully delivered, which can be
+needed for some experimental setups where you want to simulate
+loss or congestion at a remote router.
+.Pp
+.It Cm plr Ar packet-loss-rate
+Packet loss rate.
+Argument
+.Ar packet-loss-rate
+is a floating-point number between 0 and 1, with 0 meaning no
+loss, 1 meaning 100% loss.
+The loss rate is internally represented on 31 bits.
+.Pp
+.It Cm queue Brq Ar slots | size Ns Cm Kbytes
+Queue size, in
+.Ar slots
+or
+.Cm KBytes .
+Default value is 50 slots, which
+is the typical queue size for Ethernet devices.
+Note that for slow speed links you should keep the queue
+size short or your traffic might be affected by a significant
+queueing delay.
+E.g., 50 max-sized ethernet packets (1500 bytes) mean 600Kbit
+or 20s of queue on a 30Kbit/s pipe.
+Even worse effects can result if you get packets from an
+interface with a much larger MTU, e.g.\& the loopback interface
+with its 16KB packets.
+The
+.Xr sysctl 8
+variables
+.Em net.inet.ip.dummynet.pipe_byte_limit
+and
+.Em net.inet.ip.dummynet.pipe_slot_limit
+control the maximum lengths that can be specified.
+.Pp
+.It Cm red | gred Ar w_q Ns / Ns Ar min_th Ns / Ns Ar max_th Ns / Ns Ar max_p
+Make use of the RED (Random Early Detection) queue management algorithm.
+.Ar w_q
+and
+.Ar max_p
+are floating
+point numbers between 0 and 1 (0 not included), while
+.Ar min_th
+and
+.Ar max_th
+are integer numbers specifying thresholds for queue management
+(thresholds are computed in bytes if the queue has been defined
+in bytes, in slots otherwise).
+The
+.Nm dummynet
+also supports the gentle RED variant (gred).
+Three
+.Xr sysctl 8
+variables can be used to control the RED behaviour:
+.Bl -tag -width indent
+.It Va net.inet.ip.dummynet.red_lookup_depth
+specifies the accuracy in computing the average queue
+when the link is idle (defaults to 256, must be greater than zero)
+.It Va net.inet.ip.dummynet.red_avg_pkt_size
+specifies the expected average packet size (defaults to 512, must be
+greater than zero)
+.It Va net.inet.ip.dummynet.red_max_pkt_size
+specifies the expected maximum packet size, only used when queue
+thresholds are in bytes (defaults to 1500, must be greater than zero).
+.El
+.El
+.Pp
+When used with IPv6 data,
+.Nm dummynet
+currently has several limitations.
+Information necessary to route link-local packets to an
+interface is not available after processing by
+.Nm dummynet
+so those packets are dropped in the output path.
+Care should be taken to ensure that link-local packets are not passed to
+.Nm dummynet .
+.Sh CHECKLIST
+Here are some important points to consider when designing your
+rules:
+.Bl -bullet
+.It
+Remember that you filter both packets going
+.Cm in
+and
+.Cm out .
+Most connections need packets going in both directions.
+.It
+Remember to test very carefully.
+It is a good idea to be near the console when doing this.
+If you cannot be near the console,
+use an auto-recovery script such as the one in
+.Pa /usr/share/examples/ipfw/change_rules.sh .
+.It
+Do not forget the loopback interface.
+.El
+.Sh FINE POINTS
+.Bl -bullet
+.It
+There are circumstances where fragmented datagrams are unconditionally
+dropped.
+TCP packets are dropped if they do not contain at least 20 bytes of
+TCP header, UDP packets are dropped if they do not contain a full 8
+byte UDP header, and ICMP packets are dropped if they do not contain
+4 bytes of ICMP header, enough to specify the ICMP type, code, and
+checksum.
+These packets are simply logged as
+.Dq pullup failed
+since there may not be enough good data in the packet to produce a
+meaningful log entry.
+.It
+Another type of packet is unconditionally dropped, a TCP packet with a
+fragment offset of one.
+This is a valid packet, but it only has one use, to try
+to circumvent firewalls.
+When logging is enabled, these packets are
+reported as being dropped by rule -1.
+.It
+If you are logged in over a network, loading the
+.Xr kld 4
+version of
+.Nm
+is probably not as straightforward as you would think.
+The following command line is recommended:
+.Bd -literal -offset indent
+kldload ipfw && \e
+ipfw add 32000 allow ip from any to any
+.Ed
+.Pp
+Along the same lines, doing an
+.Bd -literal -offset indent
+ipfw flush
+.Ed
+.Pp
+in similar surroundings is also a bad idea.
+.It
+The
+.Nm
+filter list may not be modified if the system security level
+is set to 3 or higher
+(see
+.Xr init 8
+for information on system security levels).
+.El
+.Sh PACKET DIVERSION
+A
+.Xr divert 4
+socket bound to the specified port will receive all packets
+diverted to that port.
+If no socket is bound to the destination port, or if the divert module is
+not loaded, or if the kernel was not compiled with divert socket support,
+the packets are dropped.
+.Sh NETWORK ADDRESS TRANSLATION (NAT)
+.Nm
+supports in-kernel NAT using the kernel version of
+.Xr libalias 3 .
+.Pp
+The nat configuration command is the following:
+.Bd -ragged -offset indent
+.Bk -words
+.Cm nat
+.Ar nat_number
+.Cm config
+.Ar nat-configuration
+.Ek
+.Ed
+.Pp
+The following parameters can be configured:
+.Bl -tag -width indent
+.It Cm ip Ar ip_address
+Define an ip address to use for aliasing.
+.It Cm if Ar nic
+Use ip address of NIC for aliasing, dynamically changing
+it if NIC's ip address changes.
+.It Cm log
+Enable logging on this nat instance.
+.It Cm deny_in
+Deny any incoming connection from the outside world.
+.It Cm same_ports
+Try to leave the alias port numbers unchanged from
+the actual local port numbers.
+.It Cm unreg_only
+Traffic on the local network not originating from an
+unregistered address space will be ignored.
+.It Cm reset
+Reset table of the packet aliasing engine on address change.
+.It Cm reverse
+Reverse the way libalias handles aliasing.
+.It Cm proxy_only
+Obey transparent proxy rules only, packet aliasing is not performed.
+.It Cm skip_global
+Skip instance in case of global state lookup (see below).
+.El
+.Pp
+Some special values can be supplied instead of
+.Va nat_number :
+.Bl -tag -width indent
+.It Cm global
+Looks up translation state in all configured nat instances.
+If an entry is found, packet is aliased according to that entry.
+If no entry was found in any of the instances, packet is passed unchanged,
+and no new entry will be created.
+See section
+.Sx MULTIPLE INSTANCES
+in
+.Xr natd 8
+for more information.
+.It Cm tablearg
+Uses argument supplied in lookup table.
+See
+.Sx LOOKUP TABLES
+section below for more information on lookup tables.
+.El
+.Pp
+To let the packet continue after being (de)aliased, set the sysctl variable
+.Va net.inet.ip.fw.one_pass
+to 0.
+For more information about aliasing modes, refer to
+.Xr libalias 3 .
+See Section
+.Sx EXAMPLES
+for some examples about nat usage.
+.Ss REDIRECT AND LSNAT SUPPORT IN IPFW
+Redirect and LSNAT support follow closely the syntax used in
+.Xr natd 8 .
+See Section
+.Sx EXAMPLES
+for some examples on how to do redirect and lsnat.
+.Ss SCTP NAT SUPPORT
+SCTP nat can be configured in a similar manner to TCP through the
+.Nm
+command line tool.
+The main difference is that
+.Nm sctp nat
+does not do port translation.
+Since the local and global side ports will be the same,
+there is no need to specify both.
+Ports are redirected as follows:
+.Bd -ragged -offset indent
+.Bk -words
+.Cm nat
+.Ar nat_number
+.Cm config if
+.Ar nic
+.Cm redirect_port sctp
+.Ar ip_address [,addr_list] {[port | port-port] [,ports]}
+.Ek
+.Ed
+.Pp
+Most
+.Nm sctp nat
+configuration can be done in real-time through the
+.Xr sysctl 8
+interface.
+All may be changed dynamically, though the hash_table size will only
+change for new
+.Nm nat
+instances.
+See
+.Sx SYSCTL VARIABLES
+for more info.
+.Sh LOADER TUNABLES
+Tunables can be set in
+.Xr loader 8
+prompt,
+.Xr loader.conf 5
+or
+.Xr kenv 1
+before ipfw module gets loaded.
+.Bl -tag -width indent
+.It Va net.inet.ip.fw.default_to_accept: No 0
+Defines ipfw last rule behavior.
+This value overrides
+.Cd "options IPFW_DEFAULT_TO_(ACCEPT|DENY)"
+from kernel configuration file.
+.It Va net.inet.ip.fw.tables_max: No 128
+Defines number of tables available in ipfw.
+Number cannot exceed 65534.
+.El
+.Sh SYSCTL VARIABLES
+A set of
+.Xr sysctl 8
+variables controls the behaviour of the firewall and
+associated modules
+.Pq Nm dummynet , bridge , sctp nat .
+These are shown below together with their default value
+(but always check with the
+.Xr sysctl 8
+command what value is actually in use) and meaning:
+.Bl -tag -width indent
+.It Va net.inet.ip.alias.sctp.accept_global_ootb_addip: No 0
+Defines how the
+.Nm nat
+responds to receipt of global OOTB ASCONF-AddIP:
+.Bl -tag -width indent
+.It Cm 0
+No response (unless a partially matching association exists -
+ports and vtags match but global address does not)
+.It Cm 1
+.Nm nat
+will accept and process all OOTB global AddIP messages.
+.El
+.Pp
+Option 1 should never be selected as this forms a security risk.
+An attacker can
+establish multiple fake associations by sending AddIP messages.
+.It Va net.inet.ip.alias.sctp.chunk_proc_limit: No 5
+Defines the maximum number of chunks in an SCTP packet that will be
+parsed for a
+packet that matches an existing association.
+This value is enforced to be greater than or equal to
+.Cm net.inet.ip.alias.sctp.initialising_chunk_proc_limit .
+A high value is
+a DoS risk yet setting too low a value may result in
+important control chunks in
+the packet not being located and parsed.
+.It Va net.inet.ip.alias.sctp.error_on_ootb: No 1
+Defines when the
+.Nm nat
+responds to any Out-of-the-Blue (OOTB) packets with ErrorM packets.
+An OOTB packet is a packet that arrives with no existing association
+registered in the
+.Nm nat
+and is not an INIT or ASCONF-AddIP packet:
+.Bl -tag -width indent
+.It Cm 0
+ErrorM is never sent in response to OOTB packets.
+.It Cm 1
+ErrorM is only sent to OOTB packets received on the local side.
+.It Cm 2
+ErrorM is sent to the local side and on the global side ONLY if there is a
+partial match (ports and vtags match but the source global IP does not).
+This value is only useful if the
+.Nm nat
+is tracking global IP addresses.
+.It Cm 3
+ErrorM is sent in response to all OOTB packets on both
+the local and global side
+(DoS risk).
+.El
+.Pp
+At the moment the default is 0, since the ErrorM packet is not yet
+supported by most SCTP stacks.
+When it is supported, and if not tracking
+global addresses, we recommend setting this value to 1 to allow
+multi-homed local hosts to function with the
+.Nm nat .
+To track global addresses, we recommend setting this value to 2 to
+allow global hosts to be informed when they need to (re)send an
+ASCONF-AddIP.
+Value 3 should never be chosen (except for debugging) as the
+.Nm nat
+will respond to all OOTB global packets (a DoS risk).
+.It Va net.inet.ip.alias.sctp.hashtable_size: No 2003
+Size of hash tables used for
+.Nm nat
+lookups (100 < prime_number < 1000001).
+This value sets the
+.Nm hash table
+size for any future created
+.Nm nat
+instance and therefore must be set prior to creating a
+.Nm nat
+instance.
+The table sizes may be changed to suit specific needs.
+If there will be few
+concurrent associations, and memory is scarce, you may make these smaller.
+If there will be many thousands (or millions) of concurrent associations, you
+should make these larger.
+A prime number is best for the table size.
+The sysctl
+update function will adjust your input value to the next highest prime number.
+.It Va net.inet.ip.alias.sctp.holddown_time:  No 0
+Hold association in table for this many seconds after receiving a
+SHUTDOWN-COMPLETE.
+This allows endpoints to correct shutdown gracefully if a
+shutdown_complete is lost and retransmissions are required.
+.It Va net.inet.ip.alias.sctp.init_timer: No 15
+Timeout value while waiting for (INIT-ACK|AddIP-ACK).
+This value cannot be 0.
+.It Va net.inet.ip.alias.sctp.initialising_chunk_proc_limit: No 2
+Defines the maximum number of chunks in an SCTP packet that will be parsed when
+no existing association exists that matches that packet.
+Ideally this packet
+will only be an INIT or ASCONF-AddIP packet.
+A higher value may become a DoS
+risk as malformed packets can consume processing resources.
+.It Va net.inet.ip.alias.sctp.param_proc_limit: No 25
+Defines the maximum number of parameters within a chunk that will be
+parsed in a
+packet.
+As for other similar sysctl variables, larger values pose a DoS risk.
+.It Va net.inet.ip.alias.sctp.log_level: No 0
+Level of detail in the system log messages (0 \- minimal, 1 \- event,
+2 \- info, 3 \- detail, 4 \- debug, 5 \- max debug).
+May be a good
+option in high loss environments.
+.It Va net.inet.ip.alias.sctp.shutdown_time: No 15
+Timeout value while waiting for SHUTDOWN-COMPLETE.
+This value cannot be 0.
+.It Va net.inet.ip.alias.sctp.track_global_addresses: No 0
+Enables/disables global IP address tracking within the
+.Nm nat
+and places an
+upper limit on the number of addresses tracked for each association:
+.Bl -tag -width indent
+.It Cm 0
+Global tracking is disabled
+.It Cm >0
+Enables tracking, the maximum number of addresses tracked for each
+association is limited to this value
+.El
+.Pp
+This variable is fully dynamic, the new value will be adopted for all newly
+arriving associations, existing associations are treated
+as they were previously.
+Global tracking will decrease the number of collisions within the
+.Nm nat
+at a cost
+of increased processing load, memory usage, complexity, and possible
+.Nm nat
+state
+problems in complex networks with multiple
+.Nm nats .
+We recommend not tracking
+global IP addresses, this will still result in a fully functional
+.Nm nat .
+.It Va net.inet.ip.alias.sctp.up_timer: No 300
+Timeout value to keep an association up with no traffic.
+This value cannot be 0.
+.It Va net.inet.ip.dummynet.expire : No 1
+Lazily delete dynamic pipes/queues once they have no pending traffic.
+You can disable this by setting the variable to 0, in which case
+the pipes/queues will only be deleted when the threshold is reached.
+.It Va net.inet.ip.dummynet.hash_size : No 64
+Default size of the hash table used for dynamic pipes/queues.
+This value is used when no
+.Cm buckets
+option is specified when configuring a pipe/queue.
+.It Va net.inet.ip.dummynet.io_fast : No 0
+If set to a non-zero value,
+the
+.Dq fast
+mode of
+.Nm dummynet
+operation (see above) is enabled.
+.It Va net.inet.ip.dummynet.io_pkt
+Number of packets passed to
+.Nm dummynet .
+.It Va net.inet.ip.dummynet.io_pkt_drop
+Number of packets dropped by
+.Nm dummynet .
+.It Va net.inet.ip.dummynet.io_pkt_fast
+Number of packets that bypassed the
+.Nm dummynet
+scheduler.
+.It Va net.inet.ip.dummynet.max_chain_len : No 16
+Target value for the maximum number of pipes/queues in a hash bucket.
+The product
+.Cm max_chain_len*hash_size
+is used to determine the threshold over which empty pipes/queues
+will be expired even when
+.Cm net.inet.ip.dummynet.expire=0 .
+.It Va net.inet.ip.dummynet.red_lookup_depth : No 256
+.It Va net.inet.ip.dummynet.red_avg_pkt_size : No 512
+.It Va net.inet.ip.dummynet.red_max_pkt_size : No 1500
+Parameters used in the computations of the drop probability
+for the RED algorithm.
+.It Va net.inet.ip.dummynet.pipe_byte_limit : No 1048576
+.It Va net.inet.ip.dummynet.pipe_slot_limit : No 100
+The maximum queue size that can be specified in bytes or packets.
+These limits prevent accidental exhaustion of resources such as mbufs.
+If you raise these limits,
+you should make sure the system is configured so that sufficient resources
+are available.
+.It Va net.inet.ip.fw.autoinc_step : No 100
+Delta between rule numbers when auto-generating them.
+The value must be in the range 1..1000.
+.It Va net.inet.ip.fw.curr_dyn_buckets : Va net.inet.ip.fw.dyn_buckets
+The current number of buckets in the hash table for dynamic rules
+(read-only).
+.It Va net.inet.ip.fw.debug : No 1
+Controls debugging messages produced by
+.Nm .
+.It Va net.inet.ip.fw.default_rule : No 65535
+The default rule number (read-only).
+By the design of
+.Nm ,
+the default rule is the last one, so its number
+can also serve as the highest number allowed for a rule.
+.It Va net.inet.ip.fw.dyn_buckets : No 256
+The number of buckets in the hash table for dynamic rules.
+Must be a power of 2, up to 65536.
+It only takes effect when all dynamic rules have expired, so you
+are advised to use a
+.Cm flush
+command to make sure that the hash table is resized.
+.It Va net.inet.ip.fw.dyn_count : No 3
+Current number of dynamic rules
+(read-only).
+.It Va net.inet.ip.fw.dyn_keepalive : No 1
+Enables generation of keepalive packets for
+.Cm keep-state
+rules on TCP sessions.
+A keepalive is generated to both
+sides of the connection every 5 seconds for the last 20
+seconds of the lifetime of the rule.
+.It Va net.inet.ip.fw.dyn_max : No 8192
+Maximum number of dynamic rules.
+When you hit this limit, no more dynamic rules can be
+installed until old ones expire.
+.It Va net.inet.ip.fw.dyn_ack_lifetime : No 300
+.It Va net.inet.ip.fw.dyn_syn_lifetime : No 20
+.It Va net.inet.ip.fw.dyn_fin_lifetime : No 1
+.It Va net.inet.ip.fw.dyn_rst_lifetime : No 1
+.It Va net.inet.ip.fw.dyn_udp_lifetime : No 5
+.It Va net.inet.ip.fw.dyn_short_lifetime : No 30
+These variables control the lifetime, in seconds, of dynamic
+rules.
+Upon the initial SYN exchange the lifetime is kept short,
+then increased after both SYN have been seen, then decreased
+again during the final FIN exchange or when a RST is received.
+Both
+.Em dyn_fin_lifetime
+and
+.Em dyn_rst_lifetime
+must be strictly lower than 5 seconds, the period of
+repetition of keepalives.
+The firewall enforces that.
+.It Va net.inet.ip.fw.dyn_keep_states: No 0
+Keep dynamic states on rule/set deletion.
+States are relinked to default rule (65535).
+This can be handy for ruleset reload.
+Turned off by default.
+.It Va net.inet.ip.fw.enable : No 1
+Enables the firewall.
+Setting this variable to 0 lets you run your machine without
+firewall even if compiled in.
+.It Va net.inet6.ip6.fw.enable : No 1
+provides the same functionality as above for the IPv6 case.
+.It Va net.inet.ip.fw.one_pass : No 1
+When set, the packet exiting from the
+.Nm dummynet
+pipe or from
+.Xr ng_ipfw 4
+node is not passed through the firewall again.
+Otherwise, after an action, the packet is
+reinjected into the firewall at the next rule.
+.It Va net.inet.ip.fw.tables_max : No 128
+Maximum number of tables.
+.It Va net.inet.ip.fw.verbose : No 1
+Enables verbose messages.
+.It Va net.inet.ip.fw.verbose_limit : No 0
+Limits the number of messages produced by a verbose firewall.
+.It Va net.inet6.ip6.fw.deny_unknown_exthdrs : No 1
+If enabled, packets with unknown IPv6 Extension Headers will be denied.
+.It Va net.link.ether.ipfw : No 0
+Controls whether layer-2 packets are passed to
+.Nm .
+Default is no.
+.It Va net.link.bridge.ipfw : No 0
+Controls whether bridged packets are passed to
+.Nm .
+Default is no.
+.El
+.Sh EXAMPLES
+There are far too many possible uses of
+.Nm
+so this Section will only give a small set of examples.
+.Pp
+.Ss BASIC PACKET FILTERING
+This command adds an entry which denies all tcp packets from
+.Em cracker.evil.org
+to the telnet port of
+.Em wolf.tambov.su
+from being forwarded by the host:
+.Pp
+.Dl "ipfw add deny tcp from cracker.evil.org to wolf.tambov.su telnet"
+.Pp
+This one disallows any connection from the entire cracker's
+network to my host:
+.Pp
+.Dl "ipfw add deny ip from 123.45.67.0/24 to my.host.org"
+.Pp
+A first and efficient way to limit access (not using dynamic rules)
+is the use of the following rules:
+.Pp
+.Dl "ipfw add allow tcp from any to any established"
+.Dl "ipfw add allow tcp from net1 portlist1 to net2 portlist2 setup"
+.Dl "ipfw add allow tcp from net3 portlist3 to net3 portlist3 setup"
+.Dl "..."
+.Dl "ipfw add deny tcp from any to any"
+.Pp
+The first rule will be a quick match for normal TCP packets,
+but it will not match the initial SYN packet, which will be
+matched by the
+.Cm setup
+rules only for selected source/destination pairs.
+All other SYN packets will be rejected by the final
+.Cm deny
+rule.
+.Pp
+If you administer one or more subnets, you can take advantage
+of the address sets and or-blocks and write extremely
+compact rulesets which selectively enable services to blocks
+of clients, as below:
+.Pp
+.Dl "goodguys=\*q{ 10.1.2.0/24{20,35,66,18} or 10.2.3.0/28{6,3,11} }\*q"
+.Dl "badguys=\*q10.1.2.0/24{8,38,60}\*q"
+.Dl ""
+.Dl "ipfw add allow ip from ${goodguys} to any"
+.Dl "ipfw add deny ip from ${badguys} to any"
+.Dl "... normal policies ..."
+.Pp
+The
+.Cm verrevpath
+option could be used to do automated anti-spoofing by adding the
+following to the top of a ruleset:
+.Pp
+.Dl "ipfw add deny ip from any to any not verrevpath in"
+.Pp
+This rule drops all incoming packets that appear to be coming to the
+system on the wrong interface.
+For example, a packet with a source
+address belonging to a host on a protected internal network would be
+dropped if it tried to enter the system from an external interface.
+.Pp
+The
+.Cm antispoof
+option could be used to do similar but more restricted anti-spoofing
+by adding the following to the top of a ruleset:
+.Pp
+.Dl "ipfw add deny ip from any to any not antispoof in"
+.Pp
+This rule drops all incoming packets that appear to be coming from another
+directly connected system but on the wrong interface.
+For example, a packet with a source address of
+.Li 192.168.0.0/24 ,
+configured on
+.Li fxp0 ,
+but coming in on
+.Li fxp1
+would be dropped.
+.Pp
+The
+.Cm setdscp
+option could be used to (re)mark user traffic,
+by adding the following to the appropriate place in ruleset:
+.Pp
+.Dl "ipfw add setdscp be ip from any to any dscp af11,af21"
+.Ss DYNAMIC RULES
+In order to protect a site from flood attacks involving fake
+TCP packets, it is safer to use dynamic rules:
+.Pp
+.Dl "ipfw add check-state"
+.Dl "ipfw add deny tcp from any to any established"
+.Dl "ipfw add allow tcp from my-net to any setup keep-state"
+.Pp
+This will let the firewall install dynamic rules only for
+those connections which start with a regular SYN packet coming
+from the inside of our network.
+Dynamic rules are checked when encountering the first
+occurrence of a
+.Cm check-state ,
+.Cm keep-state
+or
+.Cm limit
+rule.
+A
+.Cm check-state
+rule should usually be placed near the beginning of the
+ruleset to minimize the amount of work scanning the ruleset.
+Your mileage may vary.
+.Pp
+To limit the number of connections a user can open
+you can use the following type of rules:
+.Pp
+.Dl "ipfw add allow tcp from my-net/24 to any setup limit src-addr 10"
+.Dl "ipfw add allow tcp from any to me setup limit src-addr 4"
+.Pp
+The former (assuming it runs on a gateway) will allow each host
+on a /24 network to open at most 10 TCP connections.
+The latter can be placed on a server to make sure that a single
+client does not use more than 4 simultaneous connections.
+.Pp
+.Em BEWARE :
+stateful rules can be subject to denial-of-service attacks
+by a SYN-flood which opens a huge number of dynamic rules.
+The effects of such attacks can be partially limited by
+acting on a set of
+.Xr sysctl 8
+variables which control the operation of the firewall.
+.Pp
+Here is a good usage of the
+.Cm list
+command to see accounting records and timestamp information:
+.Pp
+.Dl ipfw -at list
+.Pp
+or in short form without timestamps:
+.Pp
+.Dl ipfw -a list
+.Pp
+which is equivalent to:
+.Pp
+.Dl ipfw show
+.Pp
+The next rule diverts all incoming packets from 192.168.2.0/24
+to divert port 5000:
+.Pp
+.Dl ipfw add divert 5000 ip from 192.168.2.0/24 to any in
+.Ss TRAFFIC SHAPING
+The following rules show some of the applications of
+.Nm
+and
+.Nm dummynet
+for simulations and the like.
+.Pp
+This rule drops random incoming packets with a probability
+of 5%:
+.Pp
+.Dl "ipfw add prob 0.05 deny ip from any to any in"
+.Pp
+A similar effect can be achieved making use of
+.Nm dummynet
+pipes:
+.Pp
+.Dl "ipfw add pipe 10 ip from any to any"
+.Dl "ipfw pipe 10 config plr 0.05"
+.Pp
+We can use pipes to artificially limit bandwidth, e.g.\& on a
+machine acting as a router, if we want to limit traffic from
+local clients on 192.168.2.0/24 we do:
+.Pp
+.Dl "ipfw add pipe 1 ip from 192.168.2.0/24 to any out"
+.Dl "ipfw pipe 1 config bw 300Kbit/s queue 50KBytes"
+.Pp
+note that we use the
+.Cm out
+modifier so that the rule is not used twice.
+Remember in fact that
+.Nm
+rules are checked both on incoming and outgoing packets.
+.Pp
+Should we want to simulate a bidirectional link with bandwidth
+limitations, the correct way is the following:
+.Pp
+.Dl "ipfw add pipe 1 ip from any to any out"
+.Dl "ipfw add pipe 2 ip from any to any in"
+.Dl "ipfw pipe 1 config bw 64Kbit/s queue 10Kbytes"
+.Dl "ipfw pipe 2 config bw 64Kbit/s queue 10Kbytes"
+.Pp
+The above can be very useful, e.g.\& if you want to see how
+your fancy Web page will look for a residential user who
+is connected only through a slow link.
+You should not use only one pipe for both directions, unless
+you want to simulate a half-duplex medium (e.g.\& AppleTalk,
+Ethernet, IRDA).
+It is not necessary that both pipes have the same configuration,
+so we can also simulate asymmetric links.
+.Pp
+Should we want to verify network performance with the RED queue
+management algorithm:
+.Pp
+.Dl "ipfw add pipe 1 ip from any to any"
+.Dl "ipfw pipe 1 config bw 500Kbit/s queue 100 red 0.002/30/80/0.1"
+.Pp
+Another typical application of the traffic shaper is to
+introduce some delay in the communication.
+This can significantly affect applications which do a lot of Remote
+Procedure Calls, and where the round-trip-time of the
+connection often becomes a limiting factor much more than
+bandwidth:
+.Pp
+.Dl "ipfw add pipe 1 ip from any to any out"
+.Dl "ipfw add pipe 2 ip from any to any in"
+.Dl "ipfw pipe 1 config delay 250ms bw 1Mbit/s"
+.Dl "ipfw pipe 2 config delay 250ms bw 1Mbit/s"
+.Pp
+Per-flow queueing can be useful for a variety of purposes.
+A very simple one is counting traffic:
+.Pp
+.Dl "ipfw add pipe 1 tcp from any to any"
+.Dl "ipfw add pipe 1 udp from any to any"
+.Dl "ipfw add pipe 1 ip from any to any"
+.Dl "ipfw pipe 1 config mask all"
+.Pp
+The above set of rules will create queues (and collect
+statistics) for all traffic.
+Because the pipes have no limitations, the only effect is
+collecting statistics.
+Note that we need 3 rules, not just the last one, because
+when
+.Nm
+tries to match IP packets it will not consider ports, so we
+would not see connections on separate ports as different
+ones.
+.Pp
+A more sophisticated example is limiting the outbound traffic
+on a net with per-host limits, rather than per-network limits:
+.Pp
+.Dl "ipfw add pipe 1 ip from 192.168.2.0/24 to any out"
+.Dl "ipfw add pipe 2 ip from any to 192.168.2.0/24 in"
+.Dl "ipfw pipe 1 config mask src-ip 0x000000ff bw 200Kbit/s queue 20Kbytes"
+.Dl "ipfw pipe 2 config mask dst-ip 0x000000ff bw 200Kbit/s queue 20Kbytes"
+.Ss LOOKUP TABLES
+In the following example, we need to create several traffic bandwidth
+classes and we need different hosts/networks to fall into different classes.
+We create one pipe for each class and configure them accordingly.
+Then we create a single table and fill it with IP subnets and addresses.
+For each subnet/host we set the argument equal to the number of the pipe
+that it should use.
+Then we classify traffic using a single rule:
+.Pp
+.Dl "ipfw pipe 1 config bw 1000Kbyte/s"
+.Dl "ipfw pipe 4 config bw 4000Kbyte/s"
+.Dl "..."
+.Dl "ipfw table 1 add 192.168.2.0/24 1"
+.Dl "ipfw table 1 add 192.168.0.0/27 4"
+.Dl "ipfw table 1 add 192.168.0.2 1"
+.Dl "..."
+.Dl "ipfw add pipe tablearg ip from table(1) to any"
+.Pp
+Using the
+.Cm fwd
+action, the table entries may include hostnames and IP addresses.
+.Pp
+.Dl "ipfw table 1 add 192.168.2.0/24 10.23.2.1"
+.Dl "ipfw table 1 add 192.168.0.0/27 router1.dmz"
+.Dl "..."
+.Dl "ipfw add 100 fwd tablearg ip from any to table(1)"
+.Pp
+In the following example per-interface firewall is created:
+.Pp
+.Dl "ipfw table 10 add vlan20 12000"
+.Dl "ipfw table 10 add vlan30 13000"
+.Dl "ipfw table 20 add vlan20 22000"
+.Dl "ipfw table 20 add vlan30 23000"
+.Dl ".."
+.Dl "ipfw add 100 ipfw skipto tablearg ip from any to any recv 'table(10)' in"
+.Dl "ipfw add 200 ipfw skipto tablearg ip from any to any xmit 'table(10)' out"
+.Ss SETS OF RULES
+To add a set of rules atomically, e.g.\& set 18:
+.Pp
+.Dl "ipfw set disable 18"
+.Dl "ipfw add NN set 18 ...         # repeat as needed"
+.Dl "ipfw set enable 18"
+.Pp
+To delete a set of rules atomically the command is simply:
+.Pp
+.Dl "ipfw delete set 18"
+.Pp
+To test a ruleset and disable it and regain control if something goes wrong:
+.Pp
+.Dl "ipfw set disable 18"
+.Dl "ipfw add NN set 18 ...         # repeat as needed"
+.Dl "ipfw set enable 18; echo done; sleep 30 && ipfw set disable 18"
+.Pp
+Here if everything goes well, you press control-C before the "sleep"
+terminates, and your ruleset will be left active.
+Otherwise, e.g.\& if
+you cannot access your box, the ruleset will be disabled after
+the sleep terminates thus restoring the previous situation.
+.Pp
+To show rules of the specific set:
+.Pp
+.Dl "ipfw set 18 show"
+.Pp
+To show rules of the disabled set:
+.Pp
+.Dl "ipfw -S set 18 show"
+.Pp
+To clear a specific rule counters of the specific set:
+.Pp
+.Dl "ipfw set 18 zero NN"
+.Pp
+To delete a specific rule of the specific set:
+.Pp
+.Dl "ipfw set 18 delete NN"
+.Ss NAT, REDIRECT AND LSNAT
+First redirect all the traffic to nat instance 123:
+.Pp
+.Dl "ipfw add nat 123 all from any to any"
+.Pp
+Then to configure nat instance 123 to alias all the outgoing traffic with ip
+192.168.0.123, blocking all incoming connections, trying to keep
+same ports on both sides, clearing aliasing table on address change
+and keeping a log of traffic/link statistics:
+.Pp
+.Dl "ipfw nat 123 config ip 192.168.0.123 log deny_in reset same_ports"
+.Pp
+Or to change address of instance 123, aliasing table will be cleared (see
+reset option):
+.Pp
+.Dl "ipfw nat 123 config ip 10.0.0.1"
+.Pp
+To see configuration of nat instance 123:
+.Pp
+.Dl "ipfw nat 123 show config"
+.Pp
+To show logs of all the instances in range 111-999:
+.Pp
+.Dl "ipfw nat 111-999 show"
+.Pp
+To see configurations of all instances:
+.Pp
+.Dl "ipfw nat show config"
+.Pp
+Or a redirect rule with mixed modes could looks like:
+.Pp
+.Dl "ipfw nat 123 config redirect_addr 10.0.0.1 10.0.0.66"
+.Dl "                   redirect_port tcp 192.168.0.1:80 500"
+.Dl "                   redirect_proto udp 192.168.1.43 192.168.1.1"
+.Dl "                   redirect_addr 192.168.0.10,192.168.0.11"
+.Dl "                              10.0.0.100  # LSNAT"
+.Dl "                   redirect_port tcp 192.168.0.1:80,192.168.0.10:22"
+.Dl "                              500         # LSNAT"
+.Pp
+or it could be split in:
+.Pp
+.Dl "ipfw nat 1 config redirect_addr 10.0.0.1 10.0.0.66"
+.Dl "ipfw nat 2 config redirect_port tcp 192.168.0.1:80 500"
+.Dl "ipfw nat 3 config redirect_proto udp 192.168.1.43 192.168.1.1"
+.Dl "ipfw nat 4 config redirect_addr 192.168.0.10,192.168.0.11,192.168.0.12"
+.Dl "                                   10.0.0.100"
+.Dl "ipfw nat 5 config redirect_port tcp"
+.Dl "                  192.168.0.1:80,192.168.0.10:22,192.168.0.20:25 500"
+.Sh SEE ALSO
+.Xr cpp 1 ,
+.Xr m4 1 ,
+.Xr altq 4 ,
+.Xr divert 4 ,
+.Xr dummynet 4 ,
+.Xr if_bridge 4 ,
+.Xr ip 4 ,
+.Xr ipfirewall 4 ,
+.Xr ng_ipfw 4 ,
+.Xr protocols 5 ,
+.Xr services 5 ,
+.Xr init 8 ,
+.Xr kldload 8 ,
+.Xr reboot 8 ,
+.Xr sysctl 8 ,
+.Xr syslogd 8
+.Sh HISTORY
+The
+.Nm
+utility first appeared in
+.Fx 2.0 .
+.Nm dummynet
+was introduced in
+.Fx 2.2.8 .
+Stateful extensions were introduced in
+.Fx 4.0 .
+.Nm ipfw2
+was introduced in Summer 2002.
+.Sh AUTHORS
+.An Ugen J. S. Antsilevich ,
+.An Poul-Henning Kamp ,
+.An Alex Nash ,
+.An Archie Cobbs ,
+.An Luigi Rizzo .
+.Pp
+.An -nosplit
+API based upon code written by
+.An Daniel Boulet
+for BSDI.
+.Pp
+Dummynet has been introduced by Luigi Rizzo in 1997-1998.
+.Pp
+Some early work (1999-2000) on the
+.Nm dummynet
+traffic shaper supported by Akamba Corp.
+.Pp
+The ipfw core (ipfw2) has been completely redesigned and
+reimplemented by Luigi Rizzo in summer 2002.
+Further
+actions and
+options have been added by various developer over the years.
+.Pp
+.An -nosplit
+In-kernel NAT support written by
+.An Paolo Pisati Aq piso@FreeBSD.org
+as part of a Summer of Code 2005 project.
+.Pp
+SCTP
+.Nm nat
+support has been developed by
+.An The Centre for Advanced Internet Architectures (CAIA) Aq http://www.caia.swin.edu.au .
+The primary developers and maintainers are David Hayes and Jason But.
+For further information visit:
+.Aq http://www.caia.swin.edu.au/urp/SONATA
+.Pp
+Delay profiles have been developed by Alessandro Cerri and
+Luigi Rizzo, supported by the
+European Commission within Projects Onelab and Onelab2.
+.Sh BUGS
+The syntax has grown over the years and sometimes it might be confusing.
+Unfortunately, backward compatibility prevents cleaning up mistakes
+made in the definition of the syntax.
+.Pp
+.Em !!! WARNING !!!
+.Pp
+Misconfiguring the firewall can put your computer in an unusable state,
+possibly shutting down network services and requiring console access to
+regain control of it.
+.Pp
+Incoming packet fragments diverted by
+.Cm divert
+are reassembled before delivery to the socket.
+The action used on those packet is the one from the
+rule which matches the first fragment of the packet.
+.Pp
+Packets diverted to userland, and then reinserted by a userland process
+may lose various packet attributes.
+The packet source interface name
+will be preserved if it is shorter than 8 bytes and the userland process
+saves and reuses the sockaddr_in
+(as does
+.Xr natd 8 ) ;
+otherwise, it may be lost.
+If a packet is reinserted in this manner, later rules may be incorrectly
+applied, making the order of
+.Cm divert
+rules in the rule sequence very important.
+.Pp
+Dummynet drops all packets with IPv6 link-local addresses.
+.Pp
+Rules using
+.Cm uid
+or
+.Cm gid
+may not behave as expected.
+In particular, incoming SYN packets may
+have no uid or gid associated with them since they do not yet belong
+to a TCP connection, and the uid/gid associated with a packet may not
+be as expected if the associated process calls
+.Xr setuid 2
+or similar system calls.
+.Pp
+Rule syntax is subject to the command line environment and some patterns
+may need to be escaped with the backslash character
+or quoted appropriately.
+.Pp
+Due to the architecture of
+.Xr libalias 3 ,
+ipfw nat is not compatible with the TCP segmentation offloading (TSO).
+Thus, to reliably nat your network traffic, please disable TSO
+on your NICs using
+.Xr ifconfig 8 .
+.Pp
+ICMP error messages are not implicitly matched by dynamic rules
+for the respective conversations.
+To avoid failures of network error detection and path MTU discovery,
+ICMP error messages may need to be allowed explicitly through static
+rules.
+.Pp
+Rules using
+.Cm call
+and
+.Cm return
+actions may lead to confusing behaviour if ruleset has mistakes,
+and/or interaction with other subsystems (netgraph, dummynet, etc.) is used.
+One possible case for this is packet leaving
+.Nm
+in subroutine on the input pass, while later on output encountering unpaired
+.Cm return
+first.
+As the call stack is kept intact after input pass, packet will suddenly
+return to the rule number used on input pass, not on output one.
+Order of processing should be checked carefully to avoid such mistakes.
diff --git a/ipfw/ipfw2.c b/ipfw/ipfw2.c
new file mode 100644 (file)
index 0000000..5dbfd4a
--- /dev/null
@@ -0,0 +1,3994 @@
+/*
+ * Copyright (c) 2002-2003 Luigi Rizzo
+ * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp
+ * Copyright (c) 1994 Ugen J.S.Antsilevich
+ *
+ * Idea and grammar partially left from:
+ * Copyright (c) 1993 Daniel Boulet
+ *
+ * Redistribution and use in source forms, with and without modification,
+ * are permitted provided that this entire comment appears intact.
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ *
+ * NEW command line interface for IP firewall facility
+ *
+ * $FreeBSD: head/sbin/ipfw/ipfw2.c 206843 2010-04-19 15:11:45Z luigi $
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/sysctl.h>
+
+#include "ipfw2.h"
+
+#include <ctype.h>
+#include <err.h>
+#include <errno.h>
+#include <grp.h>
+#include <netdb.h>
+#include <pwd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+#include <time.h>      /* ctime */
+#include <timeconv.h>  /* _long_to_time */
+#include <unistd.h>
+#include <fcntl.h>
+#include <stddef.h>    /* offsetof */
+
+#include <net/ethernet.h>
+#include <net/if.h>            /* only IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/in_systm.h>  /* only n_short, n_long */
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_fw.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+
+/* Program-wide option block; read by the helpers below (e.g. co.test_only). */
+struct cmdline_opts co;        /* global options */
+
+/*
+ * Starts out as RESVD_SET.
+ * NOTE(review): presumably the number of the reserved rule set -- confirm
+ * against the header that defines RESVD_SET.
+ */
+int resvd_set_number = RESVD_SET;
+
+/* Control socket descriptor; stays -1 until do_cmd() opens it on first use. */
+int ipfw_socket = -1;
+
+/* Provide s6_addr32 when the system headers do not define it. */
+#ifndef s6_addr32
+#define s6_addr32 __u6_addr.__u6_addr32
+#endif
+
+#define GET_UINT_ARG(arg, min, max, tok, s_x) do {                     \
+       if (!av[0])                                                     \
+               errx(EX_USAGE, "%s: missing argument", match_value(s_x, tok)); \
+       if (_substrcmp(*av, "tablearg") == 0) {                         \
+               arg = IP_FW_TABLEARG;                                   \
+               break;                                                  \
+       }                                                               \
+                                                                       \
+       {                                                               \
+       long _xval;                                                     \
+       char *end;                                                      \
+                                                                       \
+       _xval = strtol(*av, &end, 10);                                  \
+                                                                       \
+       if (!isdigit(**av) || *end != '\0' || (_xval == 0 && errno == EINVAL)) \
+               errx(EX_DATAERR, "%s: invalid argument: %s",            \
+                   match_value(s_x, tok), *av);                        \
+                                                                       \
+       if (errno == ERANGE || _xval < min || _xval > max)              \
+               errx(EX_DATAERR, "%s: argument is out of range (%u..%u): %s", \
+                   match_value(s_x, tok), min, max, *av);              \
+                                                                       \
+       if (_xval == IP_FW_TABLEARG)                                    \
+               errx(EX_DATAERR, "%s: illegal argument value: %s",      \
+                   match_value(s_x, tok), *av);                        \
+       arg = _xval;                                                    \
+       }                                                               \
+} while (0)
+
+/*
+ * Print an optional prefix followed by a numeric rule argument.
+ * A NULL 'str' prints no prefix; the value IP_FW_TABLEARG is shown as
+ * the keyword "tablearg" instead of a number.
+ */
+static void
+PRINT_UINT_ARG(const char *str, uint32_t arg)
+{
+       if (str != NULL)
+               printf("%s",str);
+
+       if (arg != IP_FW_TABLEARG) {
+               printf("%u", arg);
+               return;
+       }
+       printf("tablearg");
+}
+
+/*
+ * Keyword table for TCP header flags: each name maps to its TH_* bit.
+ * The list ends at the entry whose name is NULL.
+ * NOTE(review): the "tcp flag" entry with value 0 looks like a label for
+ * the table (retrievable with match_value(table, 0)) -- confirm against
+ * the code that uses this table.
+ */
+static struct _s_x f_tcpflags[] = {
+       { "syn", TH_SYN },
+       { "fin", TH_FIN },
+       { "ack", TH_ACK },
+       { "psh", TH_PUSH },
+       { "rst", TH_RST },
+       { "urg", TH_URG },
+       { "tcp flag", 0 },
+       { NULL, 0 }
+};
+
+/*
+ * Keyword table for TCP options, mapping names to IP_FW_TCPOPT_* values.
+ * "mss"/"maxseg" and "ts"/"timestamp" are aliases for the same value.
+ * The list ends at the entry whose name is NULL.
+ * NOTE(review): the "tcp option" entry with value 0 looks like a label
+ * for the table -- confirm against the code that uses it.
+ */
+static struct _s_x f_tcpopts[] = {
+       { "mss",        IP_FW_TCPOPT_MSS },
+       { "maxseg",     IP_FW_TCPOPT_MSS },
+       { "window",     IP_FW_TCPOPT_WINDOW },
+       { "sack",       IP_FW_TCPOPT_SACK },
+       { "ts",         IP_FW_TCPOPT_TS },
+       { "timestamp",  IP_FW_TCPOPT_TS },
+       { "cc",         IP_FW_TCPOPT_CC },
+       { "tcp option", 0 },
+       { NULL, 0 }
+};
+
+/*
+ * IP options span the range 0 to 255 so we need to remap them
+ * (though in fact only the low 5 bits are significant).
+ *
+ * The table maps option keywords to the remapped IP_FW_IPOPT_* values
+ * and ends at the entry whose name is NULL.
+ * NOTE(review): the "ip option" entry with value 0 looks like a label
+ * for the table -- confirm against the code that uses it.
+ */
+static struct _s_x f_ipopts[] = {
+       { "ssrr",       IP_FW_IPOPT_SSRR},
+       { "lsrr",       IP_FW_IPOPT_LSRR},
+       { "rr",         IP_FW_IPOPT_RR},
+       { "ts",         IP_FW_IPOPT_TS},
+       { "ip option",  0 },
+       { NULL, 0 }
+};
+
+/*
+ * Keyword table for the IP type-of-service field, mapping names to the
+ * IPTOS_* constants.  The list ends at the entry whose name is NULL.
+ * NOTE(review): the "ip tos option" entry with value 0 looks like a
+ * label for the table -- confirm against the code that uses it.
+ */
+static struct _s_x f_iptos[] = {
+       { "lowdelay",   IPTOS_LOWDELAY},
+       { "throughput", IPTOS_THROUGHPUT},
+       { "reliability", IPTOS_RELIABILITY},
+       { "mincost",    IPTOS_MINCOST},
+       { "congestion", IPTOS_ECN_CE},
+       { "ecntransport", IPTOS_ECN_ECT0},
+       { "ip tos option", 0},
+       { NULL, 0 }
+};
+
+/*
+ * Keywords naming DYN_* address/port bits; "all" is the OR of the four
+ * individual bits.  The list ends at the entry whose name is NULL.
+ * NOTE(review): presumably the mask keywords of the "limit" rule option
+ * -- confirm against the parser.
+ */
+static struct _s_x limit_masks[] = {
+       {"all",         DYN_SRC_ADDR|DYN_SRC_PORT|DYN_DST_ADDR|DYN_DST_PORT},
+       {"src-addr",    DYN_SRC_ADDR},
+       {"src-port",    DYN_SRC_PORT},
+       {"dst-addr",    DYN_DST_ADDR},
+       {"dst-port",    DYN_DST_PORT},
+       {NULL,          0}
+};
+
+/*
+ * we use IPPROTO_ETHERTYPE as a fake protocol id to call the print routines
+ * This is only used in this code.
+ *
+ * print_port() and strtoport() compare their 'proto' argument against it
+ * to decide whether to consult ether_types[] instead of the services
+ * database.
+ */
+#define IPPROTO_ETHERTYPE      0x1000
+/* Ethernet type names and their 16-bit type codes. */
+static struct _s_x ether_types[] = {
+    /*
+     * Note, we cannot use "-:&/" in the names because they are field
+     * separators in the type specifications. Also, we use s = NULL as
+     * end-delimiter, because a type of 0 can be legal.
+     */
+       { "ip",         0x0800 },
+       { "ipv4",       0x0800 },
+       { "ipv6",       0x86dd },
+       { "arp",        0x0806 },
+       { "rarp",       0x8035 },
+       { "vlan",       0x8100 },
+       { "loop",       0x9000 },
+       { "trail",      0x1000 },
+       { "at",         0x809b },
+       { "atalk",      0x809b },
+       { "aarp",       0x80f3 },
+       { "pppoe_disc", 0x8863 },
+       { "pppoe_sess", 0x8864 },
+       { "ipx_8022",   0x00E0 },
+       { "ipx_8023",   0x0000 },
+       { "ipx_ii",     0x8137 },
+       { "ipx_snap",   0x8137 },
+       { "ipx",        0x8137 },
+       { "ns",         0x0600 },
+       { NULL,         0 }
+};
+
+
+/*
+ * Keywords that name a rule action, mapped to TOK_* values.  Several
+ * spellings share one token (e.g. "accept", "pass", "allow" and "permit"
+ * all give TOK_ACCEPT).  The list ends at the entry whose name is NULL.
+ */
+static struct _s_x rule_actions[] = {
+       { "accept",             TOK_ACCEPT },
+       { "pass",               TOK_ACCEPT },
+       { "allow",              TOK_ACCEPT },
+       { "permit",             TOK_ACCEPT },
+       { "count",              TOK_COUNT },
+       { "pipe",               TOK_PIPE },
+       { "queue",              TOK_QUEUE },
+       { "divert",             TOK_DIVERT },
+       { "tee",                TOK_TEE },
+       { "netgraph",           TOK_NETGRAPH },
+       { "ngtee",              TOK_NGTEE },
+       { "fwd",                TOK_FORWARD },
+       { "forward",            TOK_FORWARD },
+       { "skipto",             TOK_SKIPTO },
+       { "deny",               TOK_DENY },
+       { "drop",               TOK_DENY },
+       { "reject",             TOK_REJECT },
+       { "reset6",             TOK_RESET6 },
+       { "reset",              TOK_RESET },
+       { "unreach6",           TOK_UNREACH6 },
+       { "unreach",            TOK_UNREACH },
+       { "check-state",        TOK_CHECKSTATE },
+       { "//",                 TOK_COMMENT },
+       { "nat",                TOK_NAT },
+       { "reass",              TOK_REASS },
+       { "setfib",             TOK_SETFIB },
+       { "call",               TOK_CALL },
+       { "return",             TOK_RETURN },
+       { NULL, 0 }     /* terminator */
+};
+
+/*
+ * Keywords mapped to the TOK_ALTQ, TOK_LOG, TOK_TAG and TOK_UNTAG tokens.
+ * NOTE(review): presumably modifiers that may accompany a rule action --
+ * confirm against the parser.
+ */
+static struct _s_x rule_action_params[] = {
+       { "altq",               TOK_ALTQ },
+       { "log",                TOK_LOG },
+       { "tag",                TOK_TAG },
+       { "untag",              TOK_UNTAG },
+       { NULL, 0 }     /* terminator */
+};
+
+/*
+ * The 'lookup' instruction accepts one of the following arguments.
+ * -1 is a terminator for the list.
+ * Arguments are passed as v[1] in O_DST_LOOKUP options.
+ *
+ * Unlike the keyword tables around it, this is a plain array of TOK_*
+ * values with no names attached.
+ */
+static int lookup_key[] = {
+       TOK_DSTIP, TOK_SRCIP, TOK_DSTPORT, TOK_SRCPORT,
+       TOK_UID, TOK_JAIL, TOK_DSCP, -1 };
+
+/*
+ * Keywords for rule options, mapped to TOK_* values.  Many options have
+ * more than one spelling (e.g. "frag"/"fragment", "estab"/"established",
+ * "mac"/"MAC").  The entries marked "pseudo option" at the end are the
+ * operators and grouping symbols (not, or, braces/parentheses) rather
+ * than options proper.  The list ends at the entry whose name is NULL.
+ */
+static struct _s_x rule_options[] = {
+       { "tagged",             TOK_TAGGED },
+       { "uid",                TOK_UID },
+       { "gid",                TOK_GID },
+       { "jail",               TOK_JAIL },
+       { "in",                 TOK_IN },
+       { "limit",              TOK_LIMIT },
+       { "keep-state",         TOK_KEEPSTATE },
+       { "bridged",            TOK_LAYER2 },
+       { "layer2",             TOK_LAYER2 },
+       { "out",                TOK_OUT },
+       { "diverted",           TOK_DIVERTED },
+       { "diverted-loopback",  TOK_DIVERTEDLOOPBACK },
+       { "diverted-output",    TOK_DIVERTEDOUTPUT },
+       { "xmit",               TOK_XMIT },
+       { "recv",               TOK_RECV },
+       { "via",                TOK_VIA },
+       { "fragment",           TOK_FRAG },
+       { "frag",               TOK_FRAG },
+       { "fib",                TOK_FIB },
+       { "ipoptions",          TOK_IPOPTS },
+       { "ipopts",             TOK_IPOPTS },
+       { "iplen",              TOK_IPLEN },
+       { "ipid",               TOK_IPID },
+       { "ipprecedence",       TOK_IPPRECEDENCE },
+       { "dscp",               TOK_DSCP },
+       { "iptos",              TOK_IPTOS },
+       { "ipttl",              TOK_IPTTL },
+       { "ipversion",          TOK_IPVER },
+       { "ipver",              TOK_IPVER },
+       { "estab",              TOK_ESTAB },
+       { "established",        TOK_ESTAB },
+       { "setup",              TOK_SETUP },
+       { "sockarg",            TOK_SOCKARG },
+       { "tcpdatalen",         TOK_TCPDATALEN },
+       { "tcpflags",           TOK_TCPFLAGS },
+       { "tcpflgs",            TOK_TCPFLAGS },
+       { "tcpoptions",         TOK_TCPOPTS },
+       { "tcpopts",            TOK_TCPOPTS },
+       { "tcpseq",             TOK_TCPSEQ },
+       { "tcpack",             TOK_TCPACK },
+       { "tcpwin",             TOK_TCPWIN },
+       { "icmptype",           TOK_ICMPTYPES },
+       { "icmptypes",          TOK_ICMPTYPES },
+       { "dst-ip",             TOK_DSTIP },
+       { "src-ip",             TOK_SRCIP },
+       { "dst-port",           TOK_DSTPORT },
+       { "src-port",           TOK_SRCPORT },
+       { "proto",              TOK_PROTO },
+       { "MAC",                TOK_MAC },
+       { "mac",                TOK_MAC },
+       { "mac-type",           TOK_MACTYPE },
+       { "verrevpath",         TOK_VERREVPATH },
+       { "versrcreach",        TOK_VERSRCREACH },
+       { "antispoof",          TOK_ANTISPOOF },
+       { "ipsec",              TOK_IPSEC },
+       { "icmp6type",          TOK_ICMP6TYPES },
+       { "icmp6types",         TOK_ICMP6TYPES },
+       { "ext6hdr",            TOK_EXT6HDR},
+       { "flow-id",            TOK_FLOWID},
+       { "ipv6",               TOK_IPV6},
+       { "ip6",                TOK_IPV6},
+       { "ipv4",               TOK_IPV4},
+       { "ip4",                TOK_IPV4},
+       { "dst-ipv6",           TOK_DSTIP6},
+       { "dst-ip6",            TOK_DSTIP6},
+       { "src-ipv6",           TOK_SRCIP6},
+       { "src-ip6",            TOK_SRCIP6},
+       { "lookup",             TOK_LOOKUP},
+       { "//",                 TOK_COMMENT },
+
+       { "not",                TOK_NOT },              /* pseudo option */
+       { "!", /* escape ? */   TOK_NOT },              /* pseudo option */
+       { "or",                 TOK_OR },               /* pseudo option */
+       { "|", /* escape */     TOK_OR },               /* pseudo option */
+       { "{",                  TOK_STARTBRACE },       /* pseudo option */
+       { "(",                  TOK_STARTBRACE },       /* pseudo option */
+       { "}",                  TOK_ENDBRACE },         /* pseudo option */
+       { ")",                  TOK_ENDBRACE },         /* pseudo option */
+       { NULL, 0 }     /* terminator */
+};
+
+/*
+ * Print, or measure, a 64-bit value that may sit at an unaligned address.
+ *
+ * The value is copied bytewise into an aligned local before use.
+ * With width > 0 it is printed in a field of 'width' columns followed by
+ * one space and the printf() result is returned.  Otherwise nothing is
+ * printed and the number of characters the value would need is returned.
+ */
+int
+pr_u64(uint64_t *pd, int width)
+{
+#ifdef TCC
+#define U64_FMT "I64"
+#else
+#define U64_FMT "llu"
+#endif
+       uint64_t raw;
+       unsigned long long val;
+
+       /* pd may be misaligned, so never dereference it directly */
+       memcpy(&raw, pd, sizeof(raw));
+       val = raw;
+
+       if (width <= 0)
+               return snprintf(NULL, 0, "%" U64_FMT, val);
+       return printf("%*" U64_FMT " ", width, val);
+#undef U64_FMT
+}
+
+/*
+ * calloc() wrapper that never hands NULL back to the caller: when
+ * calloc() returns NULL the program exits through err() with EX_OSERR.
+ */
+void *
+safe_calloc(size_t number, size_t size)
+{
+       void *mem;
+
+       mem = calloc(number, size);
+       if (!mem)
+               err(EX_OSERR, "calloc");
+       return mem;
+}
+
+/*
+ * realloc() wrapper that never hands NULL back to the caller: when
+ * realloc() returns NULL the program exits through err() with EX_OSERR.
+ */
+void *
+safe_realloc(void *ptr, size_t size)
+{
+       void *mem;
+
+       mem = realloc(ptr, size);
+       if (!mem)
+               err(EX_OSERR, "realloc");
+       return mem;
+}
+
+/*
+ * Send one request to the kernel over the ipfw control socket.
+ *
+ * Nothing is sent, and 0 is returned, when co.test_only is set.  The raw
+ * socket is created on first use and kept in ipfw_socket; if it cannot
+ * be created the program exits with EX_UNAVAILABLE.
+ *
+ * The request goes through getsockopt() when optname is negative (its
+ * absolute value is then the real option) or is one of the options
+ * tested below; in that case optlen is not a length but the address of
+ * a socklen_t carried in a uintptr_t.  Every other option is issued
+ * with setsockopt() and optlen is the length of optval.
+ *
+ * Returns whatever getsockopt()/setsockopt() returned.
+ */
+int
+do_cmd(int optname, void *optval, uintptr_t optlen)
+{
+       int use_getsockopt;
+
+       if (co.test_only)
+               return 0;
+
+       if (ipfw_socket == -1)
+               ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
+       if (ipfw_socket < 0)
+               err(EX_UNAVAILABLE, "socket");
+
+       use_getsockopt = (optname < 0 ||
+           optname == IP_FW_GET || optname == IP_DUMMYNET_GET ||
+           optname == IP_FW_ADD || optname == IP_FW_TABLE_LIST ||
+           optname == IP_FW_TABLE_GETSIZE ||
+           optname == IP_FW_NAT_GET_CONFIG ||
+           optname == IP_FW_NAT_GET_LOG);
+
+       if (!use_getsockopt)
+               return setsockopt(ipfw_socket, IPPROTO_IP, optname, optval,
+                   optlen);
+
+       /* a negative option number only selects getsockopt() */
+       if (optname < 0)
+               optname = -optname;
+       return getsockopt(ipfw_socket, IPPROTO_IP, optname, optval,
+           (socklen_t *)optlen);
+}
+
+#if 0 // XXX still unused
+/*
+ * do_setcmd3 - pass ipfw control cmd to kernel
+ * @optname: option name
+ * @optval: pointer to option data
+ * @optlen: option length
+ *
+ * Function encapsulates option value in IP_FW3 socket option
+ * and calls setsockopt().
+ * Function returns 0 on success or -1 otherwise.
+ *
+ * This code is compiled out by the surrounding #if 0.
+ * In test-only mode (co.test_only) nothing is sent and 0 is returned.
+ * The control socket is opened on first use, as in do_cmd(); failure to
+ * open it exits with EX_UNAVAILABLE.
+ */
+static int
+do_setcmd3(int optname, void *optval, socklen_t optlen)
+{
+       socklen_t len;
+       ip_fw3_opheader *op3;
+
+       if (co.test_only)
+               return (0);
+
+       if (ipfw_socket == -1)
+               ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
+       if (ipfw_socket < 0)
+               err(EX_UNAVAILABLE, "socket");
+
+       /* request = header immediately followed by the caller's payload */
+       len = sizeof(ip_fw3_opheader) + optlen;
+       /* alloca() storage is released when this function returns */
+       op3 = alloca(len);
+       /* Zero reserved fields */
+       memset(op3, 0, sizeof(ip_fw3_opheader));
+       /* op3 + 1 is the first byte past the header */
+       memcpy(op3 + 1, optval, optlen);
+       op3->opcode = optname;
+
+       return setsockopt(ipfw_socket, IPPROTO_IP, IP_FW3, op3, len);
+}
+#endif // XXX still unused
+
+/**
+ * match_token takes a table and a string, returns the value associated
+ * with the string (-1 in case of failure).
+ *
+ * The table must end with an entry whose name is NULL.  Names are
+ * compared exactly and case-sensitively; the first matching entry wins.
+ * An empty string never matches and yields -1.
+ *
+ * The comparison uses strcmp() instead of storing strlen() in the
+ * non-standard 'uint' type and recomputing each entry's length.
+ */
+int
+match_token(struct _s_x *table, char *string)
+{
+       struct _s_x *pt;
+
+       if (*string == '\0')
+               return -1;
+
+       for (pt = table; pt->s != NULL; pt++)
+               if (strcmp(string, pt->s) == 0)
+                       return pt->x;
+       return -1;
+}
+
+/**
+ * match_value is the reverse of match_token: it walks the table up to
+ * the NULL-named terminator and returns the name of the first entry
+ * whose value equals 'value', or NULL when there is none.
+ */
+char const *
+match_value(struct _s_x *p, int value)
+{
+       while (p->s != NULL) {
+               if (p->x == value)
+                       return p->s;
+               p++;
+       }
+       return NULL;
+}
+
+/*
+ * _substrcmp compares a user-supplied word (str1) against a keyword
+ * (str2).  It returns 1 when str1 is not a leading part of str2, and 0
+ * when the two are identical or str1 is a proper prefix of str2.  In the
+ * prefix case a deprecation warning is written to stderr.
+ *
+ * This function will be removed in the future through the usual
+ * deprecation process.
+ */
+int
+_substrcmp(const char *str1, const char* str2)
+{
+       size_t len1 = strlen(str1);
+
+       if (strncmp(str1, str2, len1) != 0)
+               return 1;
+
+       /* the first len1 bytes agree; str2 is longer iff it continues here */
+       if (str2[len1] != '\0')
+               warnx("DEPRECATED: '%s' matched '%s' as a sub-string",
+                   str1, str2);
+       return 0;
+}
+
+/*
+ * _substrcmp2 returns 1 when str1 does not begin with str2, and 0 when
+ * it does.  On a match, a deprecation warning is written to stderr
+ * unless str1 is exactly str3, the preferred full spelling.
+ *
+ * This function exists to warn about the bizarre construction
+ * strncmp(str, "by", 2) which is used to allow people to use a shortcut
+ * for "bytes".  The problem is that in addition to accepting "by",
+ * "byt", "byte", and "bytes", it also accepts "by_rabid_dogs" and any
+ * other string beginning with "by".
+ *
+ * This function will be removed in the future through the usual
+ * deprecation process.
+ */
+int
+_substrcmp2(const char *str1, const char* str2, const char* str3)
+{
+       size_t prefix_len = strlen(str2);
+
+       if (strncmp(str1, str2, prefix_len) != 0)
+               return 1;
+
+       if (strcmp(str1, str3) == 0)
+               return 0;
+
+       warnx("DEPRECATED: '%s' matched '%s'",
+           str1, str3);
+       return 0;
+}
+
+/*
+ * Print one port (or, for IPPROTO_ETHERTYPE, one Ethernet type).
+ *
+ * When co.do_resolv is set a symbolic name is tried first: ether_types[]
+ * for Ethernet types, the services database for everything else.  If no
+ * name is found, or name resolution is off, the value is printed
+ * numerically (0x%04x for Ethernet types, decimal for ports).
+ */
+static void
+print_port(int proto, uint16_t port)
+{
+       struct servent *se = NULL;
+
+       if (proto == IPPROTO_ETHERTYPE) {
+               char const *name = NULL;
+
+               if (co.do_resolv)
+                       name = match_value(ether_types, port);
+               if (name != NULL)
+                       printf("%s", name);
+               else
+                       printf("0x%04x", port);
+               return;
+       }
+
+       if (co.do_resolv) {
+               struct protoent *pe = getprotobynumber(proto);
+               const char *pname = (pe != NULL) ? pe->p_name : NULL;
+
+               se = getservbyport(htons(port), pname);
+       }
+       if (se != NULL)
+               printf("%s", se->s_name);
+       else
+               printf("%d", port);
+}
+
+/*
+ * Keyword shown by print_newports() for each opcode whose body is a list
+ * of 16-bit values.  The list ends at the entry whose name is NULL.
+ */
+static struct _s_x _port_name[] = {
+       {"dst-port",    O_IP_DSTPORT},
+       {"src-port",    O_IP_SRCPORT},
+       {"ipid",        O_IPID},
+       {"iplen",       O_IPLEN},
+       {"ipttl",       O_IPTTL},
+       {"mac-type",    O_MAC_TYPE},
+       {"tcpdatalen",  O_TCPDATALEN},
+       {"tcpwin",      O_TCPWIN},
+       {"tagged",      O_TAGGED},
+       {NULL,          0}
+};
+
+/*
+ * Print the list of 16-bit values (or ranges) carried by 'cmd'.
+ *
+ * A non-zero 'opcode' is printed first as its keyword from _port_name[]
+ * ("???" if unknown).  The loop then runs F_LEN(cmd) - 1 times, taking
+ * two uint16_t values per round: a pair with equal ends prints as a
+ * single value, any other pair as "low-high".  Items are separated by
+ * commas and each value goes through print_port().
+ * XXX todo: add support for mask.
+ */
+static void
+print_newports(ipfw_insn_u16 *cmd, int proto, int opcode)
+{
+       uint16_t *range = cmd->ports;
+       char const *sep = " ";
+       int left;
+
+       if (opcode != 0) {
+               char const *name = match_value(_port_name, opcode);
+
+               printf (" %s", name != NULL ? name : "???");
+       }
+
+       for (left = F_LEN((ipfw_insn *)cmd) - 1; left > 0; left--) {
+               printf("%s", sep);
+               print_port(proto, range[0]);
+               if (range[0] != range[1]) {
+                       printf("-");
+                       print_port(proto, range[1]);
+               }
+               sep = ",";
+               range += 2;
+       }
+}
+
+/*
+ * Like strtol, but also translates service names into port numbers
+ * for some protocols.
+ *
+ * A string starting with a digit is handed straight to strtol().
+ * Otherwise the leading run of alphanumeric characters is taken as a
+ * name; a backslash escapes the character after it, and all backslashes
+ * are dropped from the name before it is looked up:
+ *     proto == IPPROTO_ETHERTYPE looks up the internal ether_types table;
+ *     proto == 0 searches the services database under any protocol;
+ *     any other proto is resolved with getprotobynumber() and the
+ *     service is searched under that protocol (or under any protocol
+ *     when the number is unknown).
+ *
+ * On success *end points just past the consumed text and the port is
+ * returned in host byte order.  If nothing is found 0 is returned with
+ * *end == s.
+ *
+ * Characters are converted to unsigned char before being passed to
+ * isdigit()/isalnum(), as <ctype.h> requires.
+ */
+static int
+strtoport(char *s, char **end, int base, int proto)
+{
+       char *p, *buf;
+       char *s1;
+       int i;
+
+       *end = s;               /* default - not found */
+       if (*s == '\0')
+               return 0;       /* not found */
+
+       if (isdigit((unsigned char)*s))
+               return strtol(s, end, base);
+
+       /*
+        * find separator. '\\' escapes the next char.
+        */
+       for (s1 = s; *s1 && (isalnum((unsigned char)*s1) || *s1 == '\\') ; s1++)
+               if (*s1 == '\\' && s1[1] != '\0')
+                       s1++;
+
+       buf = safe_calloc(s1 - s + 1, 1);
+
+       /*
+        * copy into a buffer skipping backslashes
+        */
+       for (p = s, i = 0; p != s1 ; p++)
+               if (*p != '\\')
+                       buf[i++] = *p;
+       buf[i++] = '\0';
+
+       if (proto == IPPROTO_ETHERTYPE) {
+               i = match_token(ether_types, buf);
+               free(buf);
+               if (i != -1) {  /* found */
+                       *end = s1;
+                       return i;
+               }
+       } else {
+               struct protoent *pe = NULL;
+               struct servent *se;
+
+               if (proto != 0)
+                       pe = getprotobynumber(proto);
+               setservent(1);
+               /* a NULL protocol name matches a service of any protocol */
+               se = getservbyname(buf, pe ? pe->p_name : NULL);
+               free(buf);
+               if (se != NULL) {
+                       *end = s1;
+                       return ntohs(se->s_port);
+               }
+       }
+       return 0;       /* not found */
+}
+
+/*
+ * Fill the body of the command with the list of port ranges.
+ *
+ * av is a comma-separated list whose elements are either a single value
+ * or a "low-high" range; each end is parsed with strtoport() for the
+ * given proto. Every element is stored in cmd->ports as a pair of 16-bit
+ * values (a single value is stored with both ends equal).
+ *
+ * Returns the number of elements stored, in which case the instruction
+ * length (element count + 1) is or-ed into cmd->o.len. Returns 0, leaving
+ * cmd->o.len untouched, if av is empty or malformed or if a value does not
+ * fit in 16 bits. Terminates with EX_DATAERR if the list has more
+ * elements than an instruction length can describe.
+ */
+static int
+fill_newports(ipfw_insn_u16 *cmd, char *av, int proto)
+{
+       uint16_t *p = cmd->ports;
+       int a, b;               /* kept as int so the range can be checked */
+       int i = 0;
+       char *s = av;
+
+       while (*s) {
+               a = strtoport(av, &s, 0, proto);
+               if (s == av)                    /* empty or invalid argument */
+                       return (0);
+               if (a < 0 || a > 0xffff)        /* would be truncated */
+                       return (0);
+
+               /*
+                * Check the limit before storing this element rather than
+                * after the whole list has already been written.
+                */
+               if (i + 2 > F_LEN_MASK)
+                       errx(EX_DATAERR, "too many ports/ranges\n");
+
+               switch (*s) {
+               case '-':                       /* a range */
+                       av = s + 1;
+                       b = strtoport(av, &s, 0, proto);
+                       /* Reject expressions like '1-abc' or '1-2-3'. */
+                       if (s == av || (*s != ',' && *s != '\0'))
+                               return (0);
+                       if (b < 0 || b > 0xffff)
+                               return (0);
+                       p[0] = a;
+                       p[1] = b;
+                       break;
+               case ',':                       /* comma separated list */
+               case '\0':
+                       p[0] = p[1] = a;
+                       break;
+               default:
+                       warnx("port list: invalid separator <%c> in <%s>",
+                               *s, av);
+                       return (0);
+               }
+
+               i++;
+               p += 2;
+               av = s + 1;
+       }
+       if (i > 0)
+               cmd->o.len |= i + 1;    /* leave F_NOT and F_OR untouched */
+       return (i);
+}
+
+/*
+ * Symbolic names for the ICMP "destination unreachable" codes.
+ * fill_reject_code() uses the table to translate a name into its code,
+ * print_reject_code() to translate a code back into its name.
+ * The list is terminated by an entry with a NULL name.
+ */
+static struct _s_x icmpcodes[] = {
+      { "net",                 ICMP_UNREACH_NET },
+      { "host",                        ICMP_UNREACH_HOST },
+      { "protocol",            ICMP_UNREACH_PROTOCOL },
+      { "port",                        ICMP_UNREACH_PORT },
+      { "needfrag",            ICMP_UNREACH_NEEDFRAG },
+      { "srcfail",             ICMP_UNREACH_SRCFAIL },
+      { "net-unknown",         ICMP_UNREACH_NET_UNKNOWN },
+      { "host-unknown",                ICMP_UNREACH_HOST_UNKNOWN },
+      { "isolated",            ICMP_UNREACH_ISOLATED },
+      { "net-prohib",          ICMP_UNREACH_NET_PROHIB },
+      { "host-prohib",         ICMP_UNREACH_HOST_PROHIB },
+      { "tosnet",              ICMP_UNREACH_TOSNET },
+      { "toshost",             ICMP_UNREACH_TOSHOST },
+      { "filter-prohib",       ICMP_UNREACH_FILTER_PROHIB },
+      { "host-precedence",     ICMP_UNREACH_HOST_PRECEDENCE },
+      { "precedence-cutoff",   ICMP_UNREACH_PRECEDENCE_CUTOFF },
+      { NULL, 0 }
+};
+
+/*
+ * Parse an ICMP unreachable code given either as a number in the range
+ * 0..255 or as one of the names in icmpcodes, and store it in *codep.
+ * Anything else terminates the program with EX_DATAERR.
+ */
+static void
+fill_reject_code(u_short *codep, char *str)
+{
+       unsigned long num;
+       int val;
+       char *s;
+
+       /*
+        * Range-check the full strtoul() result: narrowing it to int first
+        * would let very large numbers wrap around to a valid code.
+        */
+       num = strtoul(str, &s, 0);
+       if (s == str || *s != '\0' || num >= 0x100)
+               val = match_token(icmpcodes, str);      /* try it as a name */
+       else
+               val = (int)num;
+       if (val < 0)
+               errx(EX_DATAERR, "unknown ICMP unreachable code ``%s''", str);
+       *codep = val;
+}
+
+/*
+ * Print "unreach" followed by the ICMP unreachable code, using the
+ * symbolic name from icmpcodes when there is one and the number otherwise.
+ */
+static void
+print_reject_code(uint16_t code)
+{
+       char const *name;
+
+       name = match_value(icmpcodes, code);
+       if (name == NULL) {
+               printf("unreach %u", code);
+               return;
+       }
+       printf("unreach %s", name);
+}
+
+/*
+ * Length of a contiguous bitmask.
+ *
+ * Counts the run of set bits at the start of the mask and returns its
+ * length, or -1 if any bit is set again after the first clear one
+ * (i.e. the mask is not contiguous). len is the max length in bits.
+ *
+ * Bits are taken byte by byte, most significant bit first: the first bit
+ * is bit 7 of the first byte. The mask is therefore effectively treated
+ * as being in network (big-endian) format. Note that for MAC addresses
+ * the first bit on the wire is bit 0 of the first byte.
+ * XXX this needs a proper fix.
+ */
+int
+contigmask(uint8_t *p, int len)
+{
+       int ones = 0;   /* length of the leading run of set bits */
+       int pos;
+
+       while (ones < len && (p[ones >> 3] & (0x80 >> (ones & 7))) != 0)
+               ones++;
+
+       /* bit 'ones' (if any) is clear: no bit may be set after it */
+       for (pos = ones + 1; pos < len; pos++) {
+               if (p[pos >> 3] & (0x80 >> (pos & 7)))
+                       return -1; /* mask not contiguous */
+       }
+       return ones;
+}
+
+/*
+ * Print the flags encoded in cmd->arg1: the low byte holds the bits that
+ * must be set, the high byte the bits that must be clear. Names come from
+ * 'list'; "clear" flags are printed with a leading '!', and entries are
+ * comma-separated after the option name.
+ * As a special case, for f_tcpflags the combination "SYN set, ACK clear"
+ * is printed as " setup".
+ */
+static void
+print_flags(char const *name, ipfw_insn *cmd, struct _s_x *list)
+{
+       uint8_t want_set = cmd->arg1 & 0xff;
+       uint8_t want_clear = (cmd->arg1 >> 8) & 0xff;
+       char const *sep = "";
+       struct _s_x *f;
+
+       if (list == f_tcpflags && want_set == TH_SYN && want_clear == TH_ACK) {
+               printf(" setup");
+               return;
+       }
+
+       printf(" %s ", name);
+       for (f = list; f->x != 0; f++) {
+               if (want_set & f->x) {
+                       want_set &= ~f->x;
+                       printf("%s%s", sep, f->s);
+                       sep = ",";
+               }
+               if (want_clear & f->x) {
+                       want_clear &= ~f->x;
+                       printf("%s!%s", sep, f->s);
+                       sep = ",";
+               }
+       }
+}
+
+/*
+ * Print the ip address contained in a command.
+ *
+ * cmd is one of the O_IP_SRC* / O_IP_DST* instructions; s is the prefix
+ * to print before the address (e.g. " src-ip", or "" in the from/to part).
+ * Depending on the opcode the output is:
+ *   - "lookup <key> <table>" for an O_IP_DST_LOOKUP longer than an
+ *     ipfw_insn_u32;
+ *   - "me" for the *_ME opcodes;
+ *   - "table(N[,value])" for the other *_LOOKUP instructions;
+ *   - "addr/len{a,b-c,...}" for the *_SET opcodes;
+ *   - otherwise a comma-separated list of addresses, each printed as a
+ *     host name (when resolution is enabled and the mask is full), "any",
+ *     addr, addr/len or addr:mask.
+ */
+static void
+print_ip(ipfw_insn_ip *cmd, char const *s)
+{
+       struct hostent *he;
+       uint32_t len = F_LEN((ipfw_insn *)cmd);
+       uint32_t *a = ((ipfw_insn_u32 *)cmd)->d;
+
+       if (cmd->o.opcode == O_IP_DST_LOOKUP && len > F_INSN_SIZE(ipfw_insn_u32)) {
+               uint32_t d = a[1];
+               const char *arg = NULL;
+
+               if (d < sizeof(lookup_key)/sizeof(lookup_key[0]))
+                       arg = match_value(rule_options, lookup_key[d]);
+               if (arg == NULL)        /* bad index or key without a name */
+                       arg = "<invalid>";
+               printf("%s lookup %s %d", cmd->o.len & F_NOT ? " not": "",
+                       arg, cmd->o.arg1);
+               return;
+       }
+       printf("%s%s ", cmd->o.len & F_NOT ? " not": "", s);
+
+       if (cmd->o.opcode == O_IP_SRC_ME || cmd->o.opcode == O_IP_DST_ME) {
+               printf("me");
+               return;
+       }
+       if (cmd->o.opcode == O_IP_SRC_LOOKUP ||
+           cmd->o.opcode == O_IP_DST_LOOKUP) {
+               printf("table(%u", ((ipfw_insn *)cmd)->arg1);
+               if (len == F_INSN_SIZE(ipfw_insn_u32))
+                       printf(",%u", *a);
+               printf(")");
+               return;
+       }
+       if (cmd->o.opcode == O_IP_SRC_SET || cmd->o.opcode == O_IP_DST_SET) {
+               uint32_t x, *map = (uint32_t *)&(cmd->mask);
+               int i, j;
+               char comma = '{';
+
+               /* arg1 is the set size; derive the prefix length from it */
+               x = cmd->o.arg1 - 1;
+               x = htonl( ~x );
+               /* swap the address in place for printing, then swap it back */
+               cmd->addr.s_addr = htonl(cmd->addr.s_addr);
+               printf("%s/%d", inet_ntoa(cmd->addr),
+                       contigmask((uint8_t *)&x, 32));
+               x = cmd->addr.s_addr = htonl(cmd->addr.s_addr);
+               x &= 0xff; /* base */
+               /*
+                * Print bits and ranges.
+                * Locate first bit set (i), then locate first bit unset (j).
+                * If we have 3+ consecutive bits set, then print them as a
+                * range, otherwise only print the initial bit and rescan.
+                * The bit tests use 1U so that bit 31 does not overflow
+                * a signed int.
+                */
+               for (i=0; i < cmd->o.arg1; i++)
+                       if (map[i/32] & (1U<<(i & 31))) {
+                               for (j=i+1; j < cmd->o.arg1; j++)
+                                       if (!(map[ j/32] & (1U<<(j & 31))))
+                                               break;
+                               printf("%c%d", comma, i+x);
+                               if (j>i+2) { /* range has at least 3 elements */
+                                       printf("-%d", j-1+x);
+                                       i = j-1;
+                               }
+                               comma = ',';
+                       }
+               printf("}");
+               return;
+       }
+       /*
+        * len == 2 indicates a single IP, whereas lists of 1 or more
+        * addr/mask pairs have len = (2n+1). We convert len to n so we
+        * use that to count the number of entries.
+        */
+       for (len = len / 2; len > 0; len--, a += 2) {
+               int mb =        /* mask length */
+                   (cmd->o.opcode == O_IP_SRC || cmd->o.opcode == O_IP_DST) ?
+                       32 : contigmask((uint8_t *)&(a[1]), 32);
+
+               /* never reuse the host name found for a previous entry */
+               he = NULL;
+               /* an IPv4 address is 4 bytes; sizeof(u_long) is 8 on LP64 */
+               if (mb == 32 && co.do_resolv)
+                       he = gethostbyaddr((char *)&(a[0]),
+                           sizeof(struct in_addr), AF_INET);
+               if (he != NULL)         /* resolved to name */
+                       printf("%s", he->h_name);
+               else if (mb == 0)       /* any */
+                       printf("any");
+               else {          /* numeric IP followed by some kind of mask */
+                       printf("%s", inet_ntoa( *((struct in_addr *)&a[0]) ) );
+                       if (mb < 0)
+                               printf(":%s", inet_ntoa( *((struct in_addr *)&a[1]) ) );
+                       else if (mb < 32)
+                               printf("/%d", mb);
+               }
+               if (len > 1)
+                       printf(",");
+       }
+}
+
+/*
+ * Print a MAC address/mask pair: " any" for an all-zero mask, otherwise
+ * the address followed by "/len" for a contiguous mask shorter than
+ * 48 bits, or by "&mask" for a non-contiguous one.
+ */
+static void
+print_mac(uint8_t *addr, uint8_t *mask)
+{
+       int bits = contigmask(mask, 48);
+
+       if (bits == 0) {
+               printf(" any");
+               return;
+       }
+
+       printf(" %02x:%02x:%02x:%02x:%02x:%02x",
+           addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]);
+       if (bits == -1) {
+               /* non-contiguous: the mask has to be spelled out */
+               printf("&%02x:%02x:%02x:%02x:%02x:%02x",
+                   mask[0], mask[1], mask[2],
+                   mask[3], mask[4], mask[5]);
+       } else if (bits < 48) {
+               printf("/%d", bits);
+       }
+}
+
+/*
+ * Build an O_ICMPTYPE instruction from a comma-separated list of ICMP
+ * type numbers (0..31): bit N of cmd->d[0] is set for every type N listed.
+ * A malformed list or an out-of-range type terminates the program with
+ * EX_DATAERR.
+ */
+static void
+fill_icmptypes(ipfw_insn_u32 *cmd, char *av)
+{
+       unsigned long type;     /* wide, so that the range check is reliable */
+       char *end;
+
+       cmd->d[0] = 0;
+       while (*av) {
+               if (*av == ',')
+                       av++;
+
+               type = strtoul(av, &end, 0);
+
+               /*
+                * end == av means no number was found (e.g. an empty list
+                * element), which must not be taken as type 0.
+                */
+               if (end == av || (*end != ',' && *end != '\0'))
+                       errx(EX_DATAERR, "invalid ICMP type");
+
+               if (type > 31)
+                       errx(EX_DATAERR, "ICMP type out of range");
+
+               cmd->d[0] |= 1U << type;        /* unsigned: bit 31 is valid */
+               av = end;
+       }
+       cmd->o.opcode = O_ICMPTYPE;
+       cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32);
+}
+
+/*
+ * Print " icmptypes" followed by the comma-separated list of the ICMP
+ * types (0..31) whose bit is set in cmd->d[0].
+ */
+static void
+print_icmptypes(ipfw_insn_u32 *cmd)
+{
+       int i;
+       char sep= ' ';
+
+       printf(" icmptypes");
+       for (i = 0; i < 32; i++) {
+               /* 1U: shifting a signed 1 by 31 would overflow an int */
+               if ( (cmd->d[0] & (1U << i)) == 0)
+                       continue;
+               printf("%c%d", sep, i);
+               sep = ',';
+       }
+}
+
+/*
+ * show_ipfw() prints the body of an ipfw rule.
+ * Because the standard rule has at least proto src_ip dst_ip, we use
+ * a helper function to produce these entries if not provided explicitly.
+ * The first argument is the list of fields we have, the second is
+ * the list of fields we want to be printed.
+ *
+ * Special cases if we have provided a MAC header:
+ *   + if the rule does not contain IP addresses/ports, do not print them;
+ *   + if the rule does not contain an IP proto, print "all" instead of "ip";
+ *
+ * Once we have 'have_options', IP header fields are printed as options.
+ */
+#define        HAVE_PROTO      0x0001  /* protocol already printed */
+#define        HAVE_SRCIP      0x0002  /* source ("from") already printed */
+#define        HAVE_DSTIP      0x0004  /* destination ("to") already printed */
+#define        HAVE_PROTO4     0x0008  /* default protocol prints as "ip4" */
+#define        HAVE_PROTO6     0x0010  /* default protocol prints as "ip6" */
+#define        HAVE_IP         0x0100  /* turned into HAVE_OPTIONS by the helper */
+#define        HAVE_OPTIONS    0x8000  /* no more defaults are printed */
+
+/*
+ * Print the default proto/source/destination for the fields listed in
+ * 'want' that are not yet recorded in *flags, then record 'want' in
+ * *flags. Nothing is printed once HAVE_OPTIONS is set (HAVE_IP implies
+ * it), and the function does nothing at all in comment-only mode.
+ * The default protocol is printed as " ip4" or " ip6" when the
+ * corresponding HAVE_PROTO4/HAVE_PROTO6 flag is set, " ip" otherwise.
+ */
+static void
+show_prerequisites(int *flags, int want, int cmd)
+{
+       int have;
+
+       (void)cmd;      /* UNUSED */
+       if (co.comment_only)
+               return;
+
+       if ((*flags & HAVE_IP) == HAVE_IP)
+               *flags |= HAVE_OPTIONS;
+       have = *flags;
+
+       if ((have & HAVE_OPTIONS) == 0) {
+               int missing = want & ~have;     /* wanted, not printed yet */
+
+               if (missing & HAVE_PROTO) {
+                       if (have & HAVE_PROTO4)
+                               printf(" ip4");
+                       else if (have & HAVE_PROTO6)
+                               printf(" ip6");
+                       else
+                               printf(" ip");
+               }
+               if (missing & HAVE_SRCIP)
+                       printf(" from any");
+               if (missing & HAVE_DSTIP)
+                       printf(" to any");
+       }
+       *flags = have | want;
+}
+
+static void
+show_ipfw(struct ip_fw *rule, int pcwidth, int bcwidth)
+{
+       static int twidth = 0;
+       int l;
+       ipfw_insn *cmd, *tagptr = NULL;
+       const char *comment = NULL;     /* ptr to comment if we have one */
+       int proto = 0;          /* default */
+       int flags = 0;  /* prerequisites */
+       ipfw_insn_log *logptr = NULL; /* set if we find an O_LOG */
+       ipfw_insn_altq *altqptr = NULL; /* set if we find an O_ALTQ */
+       int or_block = 0;       /* we are in an or block */
+       uint32_t set_disable;
+
+       bcopy(&rule->next_rule, &set_disable, sizeof(set_disable));
+
+       if (set_disable & (1 << rule->set)) { /* disabled */
+               if (!co.show_sets)
+                       return;
+               else
+                       printf("# DISABLED ");
+       }
+       printf("%05u ", rule->rulenum);
+
+       if (pcwidth > 0 || bcwidth > 0) {
+               pr_u64(&rule->pcnt, pcwidth);
+               pr_u64(&rule->bcnt, bcwidth);
+       }
+
+       if (co.do_time == 2)
+               printf("%10u ", rule->timestamp);
+       else if (co.do_time == 1) {
+               char timestr[30];
+               time_t t = (time_t)0;
+
+               if (twidth == 0) {
+                       strcpy(timestr, ctime(&t));
+                       *strchr(timestr, '\n') = '\0';
+                       twidth = strlen(timestr);
+               }
+               if (rule->timestamp) {
+                       t = _long_to_time(rule->timestamp);
+
+                       strcpy(timestr, ctime(&t));
+                       *strchr(timestr, '\n') = '\0';
+                       printf("%s ", timestr);
+               } else {
+                       printf("%*s", twidth, " ");
+               }
+       }
+
+       if (co.show_sets)
+               printf("set %d ", rule->set);
+
+       /*
+        * print the optional "match probability"
+        */
+       if (rule->cmd_len > 0) {
+               cmd = rule->cmd ;
+               if (cmd->opcode == O_PROB) {
+                       ipfw_insn_u32 *p = (ipfw_insn_u32 *)cmd;
+                       double d = 1.0 * p->d[0];
+
+                       d = (d / 0x7fffffff);
+                       printf("prob %f ", d);
+               }
+       }
+
+       /*
+        * first print actions
+        */
+       for (l = rule->cmd_len - rule->act_ofs, cmd = ACTION_PTR(rule);
+                       l > 0 ; l -= F_LEN(cmd), cmd += F_LEN(cmd)) {
+               switch(cmd->opcode) {
+               case O_CHECK_STATE:
+                       printf("check-state");
+                       /* avoid printing anything else */
+                       flags = HAVE_PROTO | HAVE_SRCIP |
+                               HAVE_DSTIP | HAVE_IP;
+                       break;
+
+               case O_ACCEPT:
+                       printf("allow");
+                       break;
+
+               case O_COUNT:
+                       printf("count");
+                       break;
+
+               case O_DENY:
+                       printf("deny");
+                       break;
+
+               case O_REJECT:
+                       if (cmd->arg1 == ICMP_REJECT_RST)
+                               printf("reset");
+                       else if (cmd->arg1 == ICMP_UNREACH_HOST)
+                               printf("reject");
+                       else
+                               print_reject_code(cmd->arg1);
+                       break;
+
+               case O_UNREACH6:
+                       if (cmd->arg1 == ICMP6_UNREACH_RST)
+                               printf("reset6");
+                       else
+                               print_unreach6_code(cmd->arg1);
+                       break;
+
+               case O_SKIPTO:
+                       PRINT_UINT_ARG("skipto ", cmd->arg1);
+                       break;
+
+               case O_PIPE:
+                       PRINT_UINT_ARG("pipe ", cmd->arg1);
+                       break;
+
+               case O_QUEUE:
+                       PRINT_UINT_ARG("queue ", cmd->arg1);
+                       break;
+
+               case O_DIVERT:
+                       PRINT_UINT_ARG("divert ", cmd->arg1);
+                       break;
+
+               case O_TEE:
+                       PRINT_UINT_ARG("tee ", cmd->arg1);
+                       break;
+
+               case O_NETGRAPH:
+                       PRINT_UINT_ARG("netgraph ", cmd->arg1);
+                       break;
+
+               case O_NGTEE:
+                       PRINT_UINT_ARG("ngtee ", cmd->arg1);
+                       break;
+
+               case O_FORWARD_IP:
+                   {
+                       ipfw_insn_sa *s = (ipfw_insn_sa *)cmd;
+
+                       if (s->sa.sin_addr.s_addr == INADDR_ANY) {
+                               printf("fwd tablearg");
+                       } else {
+                               printf("fwd %s", inet_ntoa(s->sa.sin_addr));
+                       }
+                       if (s->sa.sin_port)
+                               printf(",%d", s->sa.sin_port);
+                   }
+                       break;
+
+#if 0 // XXX unused yet
+               case O_FORWARD_IP6:
+                   {
+                       char buf[4 + INET6_ADDRSTRLEN + 1];
+                       ipfw_insn_sa6 *s = (ipfw_insn_sa6 *)cmd;
+
+                       printf("fwd %s", inet_ntop(AF_INET6, &s->sa.sin6_addr,
+                           buf, sizeof(buf)));
+                       if (s->sa.sin6_port)
+                               printf(",%d", s->sa.sin6_port);
+                   }
+                       break;
+#endif // XXX unused yet
+
+
+               case O_LOG: /* O_LOG is printed last */
+                       logptr = (ipfw_insn_log *)cmd;
+                       break;
+
+               case O_ALTQ: /* O_ALTQ is printed after O_LOG */
+                       altqptr = (ipfw_insn_altq *)cmd;
+                       break;
+
+               case O_TAG:
+                       tagptr = cmd;
+                       break;
+
+               case O_NAT:
+                       if (cmd->arg1 != 0)
+                               PRINT_UINT_ARG("nat ", cmd->arg1);
+                       else
+                               printf("nat global");
+                       break;
+
+               case O_SETFIB:
+                       PRINT_UINT_ARG("setfib ", cmd->arg1);
+                       break;
+
+               case O_REASS:
+                       printf("reass");
+                       break;
+
+               case O_CALLRETURN:
+                       if (cmd->len & F_NOT)
+                               printf("return");
+                       else
+                               PRINT_UINT_ARG("call ", cmd->arg1);
+                       break;
+
+               default:
+                       printf("** unrecognized action %d len %d ",
+                               cmd->opcode, cmd->len);
+               }
+       }
+       if (logptr) {
+               if (logptr->max_log > 0)
+                       printf(" log logamount %d", logptr->max_log);
+               else
+                       printf(" log");
+       }
+#ifndef NO_ALTQ
+       if (altqptr) {
+               print_altq_cmd(altqptr);
+       }
+#endif
+       if (tagptr) {
+               if (tagptr->len & F_NOT)
+                       PRINT_UINT_ARG(" untag ", tagptr->arg1);
+               else
+                       PRINT_UINT_ARG(" tag ", tagptr->arg1);
+       }
+
+       /*
+        * then print the body.
+        */
+       for (l = rule->act_ofs, cmd = rule->cmd ;
+                       l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) {
+               if ((cmd->len & F_OR) || (cmd->len & F_NOT))
+                       continue;
+               if (cmd->opcode == O_IP4) {
+                       flags |= HAVE_PROTO4;
+                       break;
+               } else if (cmd->opcode == O_IP6) {
+                       flags |= HAVE_PROTO6;
+                       break;
+               }
+       }
+       if (rule->_pad & 1) {   /* empty rules before options */
+               if (!co.do_compact) {
+                       show_prerequisites(&flags, HAVE_PROTO, 0);
+                       printf(" from any to any");
+               }
+               flags |= HAVE_IP | HAVE_OPTIONS | HAVE_PROTO |
+                        HAVE_SRCIP | HAVE_DSTIP;
+       }
+
+       if (co.comment_only)
+               comment = "...";
+
+       for (l = rule->act_ofs, cmd = rule->cmd ;
+                       l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) {
+               /* useful alias */
+               ipfw_insn_u32 *cmd32 = (ipfw_insn_u32 *)cmd;
+
+               if (co.comment_only) {
+                       if (cmd->opcode != O_NOP)
+                               continue;
+                       printf(" // %s\n", (char *)(cmd + 1));
+                       return;
+               }
+
+               show_prerequisites(&flags, 0, cmd->opcode);
+
+               switch(cmd->opcode) {
+               case O_PROB:
+                       break;  /* done already */
+
+               case O_PROBE_STATE:
+                       break; /* no need to print anything here */
+
+               case O_IP_SRC:
+               case O_IP_SRC_LOOKUP:
+               case O_IP_SRC_MASK:
+               case O_IP_SRC_ME:
+               case O_IP_SRC_SET:
+                       show_prerequisites(&flags, HAVE_PROTO, 0);
+                       if (!(flags & HAVE_SRCIP))
+                               printf(" from");
+                       if ((cmd->len & F_OR) && !or_block)
+                               printf(" {");
+                       print_ip((ipfw_insn_ip *)cmd,
+                               (flags & HAVE_OPTIONS) ? " src-ip" : "");
+                       flags |= HAVE_SRCIP;
+                       break;
+
+               case O_IP_DST:
+               case O_IP_DST_LOOKUP:
+               case O_IP_DST_MASK:
+               case O_IP_DST_ME:
+               case O_IP_DST_SET:
+                       show_prerequisites(&flags, HAVE_PROTO|HAVE_SRCIP, 0);
+                       if (!(flags & HAVE_DSTIP))
+                               printf(" to");
+                       if ((cmd->len & F_OR) && !or_block)
+                               printf(" {");
+                       print_ip((ipfw_insn_ip *)cmd,
+                               (flags & HAVE_OPTIONS) ? " dst-ip" : "");
+                       flags |= HAVE_DSTIP;
+                       break;
+
+               case O_IP6_SRC:
+               case O_IP6_SRC_MASK:
+               case O_IP6_SRC_ME:
+                       show_prerequisites(&flags, HAVE_PROTO, 0);
+                       if (!(flags & HAVE_SRCIP))
+                               printf(" from");
+                       if ((cmd->len & F_OR) && !or_block)
+                               printf(" {");
+                       print_ip6((ipfw_insn_ip6 *)cmd,
+                           (flags & HAVE_OPTIONS) ? " src-ip6" : "");
+                       flags |= HAVE_SRCIP | HAVE_PROTO;
+                       break;
+
+               case O_IP6_DST:
+               case O_IP6_DST_MASK:
+               case O_IP6_DST_ME:
+                       show_prerequisites(&flags, HAVE_PROTO|HAVE_SRCIP, 0);
+                       if (!(flags & HAVE_DSTIP))
+                               printf(" to");
+                       if ((cmd->len & F_OR) && !or_block)
+                               printf(" {");
+                       print_ip6((ipfw_insn_ip6 *)cmd,
+                           (flags & HAVE_OPTIONS) ? " dst-ip6" : "");
+                       flags |= HAVE_DSTIP;
+                       break;
+
+               case O_FLOW6ID:
+               print_flow6id( (ipfw_insn_u32 *) cmd );
+               flags |= HAVE_OPTIONS;
+               break;
+
+               case O_IP_DSTPORT:
+                       show_prerequisites(&flags,
+                               HAVE_PROTO | HAVE_SRCIP |
+                               HAVE_DSTIP | HAVE_IP, 0);
+               case O_IP_SRCPORT:
+                       if (flags & HAVE_DSTIP)
+                               flags |= HAVE_IP;
+                       show_prerequisites(&flags,
+                               HAVE_PROTO | HAVE_SRCIP, 0);
+                       if ((cmd->len & F_OR) && !or_block)
+                               printf(" {");
+                       if (cmd->len & F_NOT)
+                               printf(" not");
+                       print_newports((ipfw_insn_u16 *)cmd, proto,
+                               (flags & HAVE_OPTIONS) ? cmd->opcode : 0);
+                       break;
+
+               case O_PROTO: {
+                       struct protoent *pe = NULL;
+
+                       if ((cmd->len & F_OR) && !or_block)
+                               printf(" {");
+                       if (cmd->len & F_NOT)
+                               printf(" not");
+                       proto = cmd->arg1;
+                       pe = getprotobynumber(cmd->arg1);
+                       if ((flags & (HAVE_PROTO4 | HAVE_PROTO6)) &&
+                           !(flags & HAVE_PROTO))
+                               show_prerequisites(&flags,
+                                   HAVE_PROTO | HAVE_IP | HAVE_SRCIP |
+                                   HAVE_DSTIP | HAVE_OPTIONS, 0);
+                       if (flags & HAVE_OPTIONS)
+                               printf(" proto");
+                       if (pe)
+                               printf(" %s", pe->p_name);
+                       else
+                               printf(" %u", cmd->arg1);
+                       }
+                       flags |= HAVE_PROTO;
+                       break;
+
+               default: /*options ... */
+                       if (!(cmd->len & (F_OR|F_NOT)))
+                               if (((cmd->opcode == O_IP6) &&
+                                   (flags & HAVE_PROTO6)) ||
+                                   ((cmd->opcode == O_IP4) &&
+                                   (flags & HAVE_PROTO4)))
+                                       break;
+                       show_prerequisites(&flags, HAVE_PROTO | HAVE_SRCIP |
+                                   HAVE_DSTIP | HAVE_IP | HAVE_OPTIONS, 0);
+                       if ((cmd->len & F_OR) && !or_block)
+                               printf(" {");
+                       if (cmd->len & F_NOT && cmd->opcode != O_IN)
+                               printf(" not");
+                       switch(cmd->opcode) {
+                       case O_MACADDR2: {
+                               ipfw_insn_mac *m = (ipfw_insn_mac *)cmd;
+
+                               printf(" MAC");
+                               print_mac(m->addr, m->mask);
+                               print_mac(m->addr + 6, m->mask + 6);
+                               }
+                               break;
+
+                       case O_MAC_TYPE:
+                               print_newports((ipfw_insn_u16 *)cmd,
+                                               IPPROTO_ETHERTYPE, cmd->opcode);
+                               break;
+
+
+                       case O_FRAG:
+                               printf(" frag");
+                               break;
+
+                       case O_FIB:
+                               printf(" fib %u", cmd->arg1 );
+                               break;
+                       case O_SOCKARG:
+                               printf(" sockarg");
+                               break;
+
+                       case O_IN:
+                               printf(cmd->len & F_NOT ? " out" : " in");
+                               break;
+
+                       case O_DIVERTED:
+                               switch (cmd->arg1) {
+                               case 3:
+                                       printf(" diverted");
+                                       break;
+                               case 1:
+                                       printf(" diverted-loopback");
+                                       break;
+                               case 2:
+                                       printf(" diverted-output");
+                                       break;
+                               default:
+                                       printf(" diverted-?<%u>", cmd->arg1);
+                                       break;
+                               }
+                               break;
+
+                       case O_LAYER2:
+                               printf(" layer2");
+                               break;
+                       case O_XMIT:
+                       case O_RECV:
+                       case O_VIA:
+                           {
+                               char const *s;
+                               ipfw_insn_if *cmdif = (ipfw_insn_if *)cmd;
+
+                               if (cmd->opcode == O_XMIT)
+                                       s = "xmit";
+                               else if (cmd->opcode == O_RECV)
+                                       s = "recv";
+                               else /* if (cmd->opcode == O_VIA) */
+                                       s = "via";
+                               if (cmdif->name[0] == '\0')
+                                       printf(" %s %s", s,
+                                           inet_ntoa(cmdif->p.ip));
+                               else
+                                       printf(" %s %s", s, cmdif->name);
+
+                               break;
+                           }
+                       case O_IPID:
+                               if (F_LEN(cmd) == 1)
+                                   printf(" ipid %u", cmd->arg1 );
+                               else
+                                   print_newports((ipfw_insn_u16 *)cmd, 0,
+                                       O_IPID);
+                               break;
+
+                       case O_IPTTL:
+                               if (F_LEN(cmd) == 1)
+                                   printf(" ipttl %u", cmd->arg1 );
+                               else
+                                   print_newports((ipfw_insn_u16 *)cmd, 0,
+                                       O_IPTTL);
+                               break;
+
+                       case O_IPVER:
+                               printf(" ipver %u", cmd->arg1 );
+                               break;
+
+                       case O_IPPRECEDENCE:
+                               printf(" ipprecedence %u", (cmd->arg1) >> 5 );
+                               break;
+
+                       case O_IPLEN:
+                               if (F_LEN(cmd) == 1)
+                                   printf(" iplen %u", cmd->arg1 );
+                               else
+                                   print_newports((ipfw_insn_u16 *)cmd, 0,
+                                       O_IPLEN);
+                               break;
+
+                       case O_IPOPT:
+                               print_flags("ipoptions", cmd, f_ipopts);
+                               break;
+
+                       case O_IPTOS:
+                               print_flags("iptos", cmd, f_iptos);
+                               break;
+
+                       case O_ICMPTYPE:
+                               print_icmptypes((ipfw_insn_u32 *)cmd);
+                               break;
+
+                       case O_ESTAB:
+                               printf(" established");
+                               break;
+
+                       case O_TCPDATALEN:
+                               if (F_LEN(cmd) == 1)
+                                   printf(" tcpdatalen %u", cmd->arg1 );
+                               else
+                                   print_newports((ipfw_insn_u16 *)cmd, 0,
+                                       O_TCPDATALEN);
+                               break;
+
+                       case O_TCPFLAGS:
+                               print_flags("tcpflags", cmd, f_tcpflags);
+                               break;
+
+                       case O_TCPOPTS:
+                               print_flags("tcpoptions", cmd, f_tcpopts);
+                               break;
+
+                       case O_TCPWIN:
+                               printf(" tcpwin %d", ntohs(cmd->arg1));
+                               break;
+
+                       case O_TCPACK:
+                               printf(" tcpack %d", ntohl(cmd32->d[0]));
+                               break;
+
+                       case O_TCPSEQ:
+                               printf(" tcpseq %d", ntohl(cmd32->d[0]));
+                               break;
+
+                       case O_UID:
+                           {
+                               struct passwd *pwd = getpwuid(cmd32->d[0]);
+
+                               if (pwd)
+                                       printf(" uid %s", pwd->pw_name);
+                               else
+                                       printf(" uid %u", cmd32->d[0]);
+                           }
+                               break;
+
+                       case O_GID:
+                           {
+                               struct group *grp = getgrgid(cmd32->d[0]);
+
+                               if (grp)
+                                       printf(" gid %s", grp->gr_name);
+                               else
+                                       printf(" gid %u", cmd32->d[0]);
+                           }
+                               break;
+
+                       case O_JAIL:
+                               printf(" jail %d", cmd32->d[0]);
+                               break;
+
+                       case O_VERREVPATH:
+                               printf(" verrevpath");
+                               break;
+
+                       case O_VERSRCREACH:
+                               printf(" versrcreach");
+                               break;
+
+                       case O_ANTISPOOF:
+                               printf(" antispoof");
+                               break;
+
+                       case O_IPSEC:
+                               printf(" ipsec");
+                               break;
+
+                       case O_NOP:
+                               comment = (char *)(cmd + 1);
+                               break;
+
+                       case O_KEEP_STATE:
+                               printf(" keep-state");
+                               break;
+
+                       case O_LIMIT: {
+                               struct _s_x *p = limit_masks;
+                               ipfw_insn_limit *c = (ipfw_insn_limit *)cmd;
+                               uint8_t x = c->limit_mask;
+                               char const *comma = " ";
+
+                               printf(" limit");
+                               for (; p->x != 0 ; p++)
+                                       if ((x & p->x) == p->x) {
+                                               x &= ~p->x;
+                                               printf("%s%s", comma, p->s);
+                                               comma = ",";
+                                       }
+                               PRINT_UINT_ARG(" ", c->conn_limit);
+                               break;
+                       }
+
+                       case O_IP6:
+                               printf(" ip6");
+                               break;
+
+                       case O_IP4:
+                               printf(" ip4");
+                               break;
+
+                       case O_ICMP6TYPE:
+                               print_icmp6types((ipfw_insn_u32 *)cmd);
+                               break;
+
+                       case O_EXT_HDR:
+                               print_ext6hdr( (ipfw_insn *) cmd );
+                               break;
+
+                       case O_TAGGED:
+                               if (F_LEN(cmd) == 1)
+                                       PRINT_UINT_ARG(" tagged ", cmd->arg1);
+                               else
+                                       print_newports((ipfw_insn_u16 *)cmd, 0,
+                                           O_TAGGED);
+                               break;
+
+                       default:
+                               printf(" [opcode %d len %d]",
+                                   cmd->opcode, cmd->len);
+                       }
+               }
+               if (cmd->len & F_OR) {
+                       printf(" or");
+                       or_block = 1;
+               } else if (or_block) {
+                       printf(" }");
+                       or_block = 0;
+               }
+       }
+       show_prerequisites(&flags, HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP
+                                             | HAVE_IP, 0);
+       if (comment)
+               printf(" // %s", comment);
+       printf("\n");
+}
+
+/*
+ * Print one dynamic rule on a single line.
+ *
+ * d           the dynamic rule to print
+ * pcwidth     column width handed to pr_u64() for the packet counter
+ * bcwidth     column width handed to pr_u64() for the byte counter
+ *
+ * Counters and the expire value are printed only when at least one of
+ * the two widths is positive.  Entries whose expire field is zero are
+ * skipped unless co.do_expired is set; O_LIMIT_PARENT entries are always
+ * shown.
+ */
+static void
+show_dyn_ipfw(ipfw_dyn_rule *d, int pcwidth, int bcwidth)
+{
+       struct protoent *pe;
+       struct in_addr a;
+       uint16_t rulenum;
+       char buf[INET6_ADDRSTRLEN];
+
+       if (!co.do_expired) {
+               if (!d->expire && !(d->dyn_type == O_LIMIT_PARENT))
+                       return;
+       }
+       /* the rule number is taken from the first two bytes of d->rule */
+       bcopy(&d->rule, &rulenum, sizeof(rulenum));
+       printf("%05d", rulenum);
+       if (pcwidth > 0 || bcwidth > 0) {
+               printf(" ");
+               pr_u64(&d->pcnt, pcwidth);
+               pr_u64(&d->bcnt, bcwidth);
+               printf("(%ds)", d->expire);
+       }
+       switch (d->dyn_type) {
+       case O_LIMIT_PARENT:
+               printf(" PARENT %d", d->count);
+               break;
+       case O_LIMIT:
+               printf(" LIMIT");
+               break;
+       case O_KEEP_STATE: /* bidir, no mask */
+               printf(" STATE");
+               break;
+       }
+
+       if ((pe = getprotobynumber(d->id.proto)) != NULL)
+               printf(" %s", pe->p_name);
+       else
+               printf(" proto %u", d->id.proto);
+
+       if (d->id.addr_type == 4) {
+               /* IPv4 addresses are converted with htonl() before printing */
+               a.s_addr = htonl(d->id.src_ip);
+               printf(" %s %d", inet_ntoa(a), d->id.src_port);
+
+               a.s_addr = htonl(d->id.dst_ip);
+               printf(" <-> %s %d", inet_ntoa(a), d->id.dst_port);
+       } else if (d->id.addr_type == 6) {
+               /* buf is reused: each printf() completes before the next */
+               printf(" %s %d", inet_ntop(AF_INET6, &d->id.src_ip6, buf,
+                   sizeof(buf)), d->id.src_port);
+               printf(" <-> %s %d", inet_ntop(AF_INET6, &d->id.dst_ip6, buf,
+                   sizeof(buf)), d->id.dst_port);
+       } else
+               /* no newline here: the common one below ends the line */
+               printf(" UNKNOWN <-> UNKNOWN");
+
+       printf("\n");
+}
+
+/*
+ * This one handles all set-related commands
+ *     ipfw set { show | enable | disable }
+ *     ipfw set swap X Y
+ *     ipfw set move X to Y
+ *     ipfw set move rule X to Y
+ *
+ * av[0] is skipped; av[1] is the sub-command and the remaining entries
+ * are its arguments (NULL-terminated).
+ *
+ * swap and move are sent with IP_FW_DEL as a single 32-bit word:
+ *     (cmd << 24) | (destination set << 16) | source
+ * with cmd 4 for swap, 2 for "move rule" and 3 for "move" (whole set).
+ * enable/disable send two bitmasks: masks[0] collects the sets named
+ * after "disable", masks[1] those named after "enable".
+ *
+ * Usage and range errors terminate the program through errx().
+ */
+void
+ipfw_sets_handler(char *av[])
+{
+       uint32_t set_disable, masks[2];
+       int i, nbytes;
+       int src, dst;           /* parsed numbers, before narrowing */
+       uint16_t rulenum;
+       uint8_t cmd, new_set;
+
+       av++;
+
+       if (av[0] == NULL)
+               errx(EX_USAGE, "set needs command");
+       if (_substrcmp(*av, "show") == 0) {
+               void *data = NULL;
+               char const *msg;
+               int nalloc;
+
+               /* grow the buffer until the reply fits with room to spare */
+               nalloc = nbytes = sizeof(struct ip_fw);
+               while (nbytes >= nalloc) {
+                       if (data)
+                               free(data);
+                       nalloc = nalloc * 2 + 200;
+                       nbytes = nalloc;
+                       data = safe_calloc(1, nbytes);
+                       if (do_cmd(IP_FW_GET, data, (uintptr_t)&nbytes) < 0)
+                               err(EX_OSERR, "getsockopt(IP_FW_GET)");
+               }
+
+               /* the disabled-set mask is read from the first rule's next_rule */
+               bcopy(&((struct ip_fw *)data)->next_rule,
+                       &set_disable, sizeof(set_disable));
+               free(data);
+
+               for (i = 0, msg = "disable" ; i < RESVD_SET; i++)
+                       if ((set_disable & (1U<<i))) {
+                               printf("%s %d", msg, i);
+                               msg = "";
+                       }
+               msg = (set_disable) ? " enable" : "enable";
+               for (i = 0; i < RESVD_SET; i++)
+                       if (!(set_disable & (1U<<i))) {
+                               printf("%s %d", msg, i);
+                               msg = "";
+                       }
+               printf("\n");
+       } else if (_substrcmp(*av, "swap") == 0) {
+               av++;
+               if ( av[0] == NULL || av[1] == NULL )
+                       errx(EX_USAGE, "set swap needs 2 set numbers\n");
+               /* validate as int first: narrowing would hide e.g. 257 as 1 */
+               src = atoi(av[0]);
+               dst = atoi(av[1]);
+               if (!isdigit(*(av[0])) || src < 0 || src > RESVD_SET)
+                       errx(EX_DATAERR, "invalid set number %s\n", av[0]);
+               if (!isdigit(*(av[1])) || dst < 0 || dst > RESVD_SET)
+                       errx(EX_DATAERR, "invalid set number %s\n", av[1]);
+               rulenum = src;
+               new_set = dst;
+               masks[0] = (4 << 24) | (new_set << 16) | (rulenum);
+               i = do_cmd(IP_FW_DEL, masks, sizeof(uint32_t));
+               if (i)
+                       warn("set swap: setsockopt(IP_FW_DEL)");
+       } else if (_substrcmp(*av, "move") == 0) {
+               av++;
+               if (av[0] && _substrcmp(*av, "rule") == 0) {
+                       cmd = 2;
+                       av++;
+               } else
+                       cmd = 3;
+               if (av[0] == NULL || av[1] == NULL || av[2] == NULL ||
+                               av[3] != NULL ||  _substrcmp(av[1], "to") != 0)
+                       errx(EX_USAGE, "syntax: set move [rule] X to Y\n");
+               src = atoi(av[0]);
+               dst = atoi(av[2]);
+               /* source is a set number (cmd 3) or a rule number (cmd 2) */
+               if (!isdigit(*(av[0])) || src < 0 ||
+                       (cmd == 3 && src > RESVD_SET) ||
+                       (cmd == 2 && src >= IPFW_DEFAULT_RULE) )
+                       errx(EX_DATAERR, "invalid source number %s\n", av[0]);
+               /* av[1] is the literal "to"; the destination set is av[2] */
+               if (!isdigit(*(av[2])) || dst < 0 || dst > RESVD_SET)
+                       errx(EX_DATAERR, "invalid dest. set %s\n", av[2]);
+               rulenum = src;
+               new_set = dst;
+               masks[0] = (cmd << 24) | (new_set << 16) | (rulenum);
+               i = do_cmd(IP_FW_DEL, masks, sizeof(uint32_t));
+               if (i)
+                       warn("set move: setsockopt(IP_FW_DEL)");
+       } else if (_substrcmp(*av, "disable") == 0 ||
+                  _substrcmp(*av, "enable") == 0 ) {
+               int which = _substrcmp(*av, "enable") == 0 ? 1 : 0;
+
+               av++;
+               masks[0] = masks[1] = 0;
+
+               /* numbers go to the current mask; keywords switch masks */
+               while (av[0]) {
+                       if (isdigit(**av)) {
+                               i = atoi(*av);
+                               if (i < 0 || i > RESVD_SET)
+                                       errx(EX_DATAERR,
+                                           "invalid set number %d\n", i);
+                               /* unsigned shift: i may be 31 */
+                               masks[which] |= (1U<<i);
+                       } else if (_substrcmp(*av, "disable") == 0)
+                               which = 0;
+                       else if (_substrcmp(*av, "enable") == 0)
+                               which = 1;
+                       else
+                               errx(EX_DATAERR,
+                                       "invalid set command %s\n", *av);
+                       av++;
+               }
+               if ( (masks[0] & masks[1]) != 0 )
+                       errx(EX_DATAERR,
+                           "cannot enable and disable the same set\n");
+
+               i = do_cmd(IP_FW_DEL, masks, sizeof(masks));
+               if (i)
+                       warn("set enable/disable: setsockopt(IP_FW_DEL)");
+       } else
+               errx(EX_USAGE, "invalid set command %s\n", *av);
+}
+
+/*
+ * Write 'which' into the sysctl variable selected by the keyword that
+ * follows av[0].  "firewall" updates both the IPv4 and the IPv6 enable
+ * variables; unless NO_ALTQ is defined, "altq" is forwarded to
+ * altq_set_enabled().  A missing or unknown keyword only produces a
+ * warning.  Results of sysctlbyname() are not examined.
+ */
+void
+ipfw_sysctl_handler(char *av[], int which)
+{
+       /* keywords backed by exactly one sysctl variable, in match order */
+       static const struct {
+               const char *keyword;
+               const char *oid;
+       } simple[] = {
+               { "one_pass",           "net.inet.ip.fw.one_pass" },
+               { "debug",              "net.inet.ip.fw.debug" },
+               { "verbose",            "net.inet.ip.fw.verbose" },
+               { "dyn_keepalive",      "net.inet.ip.fw.dyn_keepalive" },
+       };
+       const char *kw = av[1];
+       size_t k;
+
+       if (kw == NULL) {
+               warnx("missing keyword to enable/disable\n");
+               return;
+       }
+
+       if (_substrcmp(kw, "firewall") == 0) {
+               sysctlbyname("net.inet.ip.fw.enable", NULL, 0,
+                   &which, sizeof(which));
+               sysctlbyname("net.inet6.ip6.fw.enable", NULL, 0,
+                   &which, sizeof(which));
+               return;
+       }
+
+       for (k = 0; k < sizeof(simple) / sizeof(simple[0]); k++) {
+               if (_substrcmp(kw, simple[k].keyword) == 0) {
+                       sysctlbyname(simple[k].oid, NULL, 0,
+                           &which, sizeof(which));
+                       return;
+               }
+       }
+
+#ifndef NO_ALTQ
+       if (_substrcmp(kw, "altq") == 0) {
+               altq_set_enabled(which);
+               return;
+       }
+#endif
+       warnx("unrecognize enable/disable keyword: %s\n", kw);
+}
+
+/*
+ * List firewall rules.
+ *
+ * ac, av       argument vector; av[0] is skipped, any remaining entries
+ *              are rule numbers "N" or ranges "N-M" to display.  With no
+ *              further arguments every rule is listed.
+ * show_counters non-zero to compute counter column widths and print
+ *              the packet/byte counters.
+ *
+ * Pipe listings (co.do_pipe) are delegated to dummynet_list().  The
+ * buffer obtained with IP_FW_GET is walked as a sequence of
+ * variable-size static rules ending with rule IPFW_DEFAULT_RULE,
+ * followed by an array of fixed-size dynamic rules.  Rules are filtered
+ * by set when co.use_set is non-zero (it holds set number + 1).
+ *
+ * If a requested rule is malformed or missing the function calls exit()
+ * with EX_USAGE or EX_UNAVAILABLE after printing what it could.
+ */
+void
+ipfw_list(int ac, char *av[], int show_counters)
+{
+       struct ip_fw *r;
+       ipfw_dyn_rule *dynrules, *d;
+
+#define NEXT(r)        ((struct ip_fw *)((char *)r + RULESIZE(r)))
+       char *lim;
+       void *data = NULL;
+       int bcwidth, n, nbytes, nstat, ndyn, pcwidth, width;
+       int exitval = EX_OK;
+       int lac;
+       char **lav;
+       u_long rnum, last;
+       char *endptr;
+       int seen = 0;
+       uint8_t set;
+
+       const int ocmd = co.do_pipe ? IP_DUMMYNET_GET : IP_FW_GET;
+       int nalloc = 1024;      /* start somewhere... */
+
+       last = 0;
+
+       if (co.test_only) {
+               fprintf(stderr, "Testing only, list disabled\n");
+               return;
+       }
+       if (co.do_pipe) {
+               dummynet_list(ac, av, show_counters);
+               return;
+       }
+
+       ac--;
+       av++;
+
+       /* get rules or pipes from kernel, resizing array as necessary */
+       nbytes = nalloc;
+
+       while (nbytes >= nalloc) {
+               nalloc = nalloc * 2 + 200;
+               nbytes = nalloc;
+               data = safe_realloc(data, nbytes);
+               if (do_cmd(ocmd, data, (uintptr_t)&nbytes) < 0)
+                       err(EX_OSERR, "getsockopt(IP_%s_GET)",
+                               co.do_pipe ? "DUMMYNET" : "FW");
+       }
+
+       /*
+        * Count static rules. They have variable size so we
+        * need to scan the list to count them.
+        * The bound is tested before r is dereferenced. nstat starts
+        * at 1 to account for the default rule that ends the scan.
+        */
+       for (nstat = 1, r = data, lim = (char *)data + nbytes;
+                   (char *)r < lim && r->rulenum < IPFW_DEFAULT_RULE;
+                   ++nstat, r = NEXT(r) )
+               ; /* nothing */
+
+       /*
+        * Count dynamic rules. This is easier as they have
+        * fixed size.
+        */
+       r = NEXT(r);            /* step over the default rule */
+       dynrules = (ipfw_dyn_rule *)r ;
+       n = (char *)r - (char *)data;
+       ndyn = (nbytes - n) / sizeof *dynrules;
+
+       /* if showing stats, figure out column widths ahead of time */
+       bcwidth = pcwidth = 0;
+       if (show_counters) {
+               for (n = 0, r = data; n < nstat; n++, r = NEXT(r)) {
+                       /* skip rules from another set */
+                       if (co.use_set && r->set != co.use_set - 1)
+                               continue;
+
+                       /* packet counter */
+                       width = pr_u64(&r->pcnt, 0);
+                       if (width > pcwidth)
+                               pcwidth = width;
+
+                       /* byte counter */
+                       width = pr_u64(&r->bcnt, 0);
+                       if (width > bcwidth)
+                               bcwidth = width;
+               }
+       }
+       if (co.do_dynamic && ndyn) {
+               for (n = 0, d = dynrules; n < ndyn; n++, d++) {
+                       if (co.use_set) {
+                               /* skip rules from another set */
+                               bcopy((char *)&d->rule + sizeof(uint16_t),
+                                     &set, sizeof(uint8_t));
+                               if (set != co.use_set - 1)
+                                       continue;
+                       }
+                       width = pr_u64(&d->pcnt, 0);
+                       if (width > pcwidth)
+                               pcwidth = width;
+
+                       width = pr_u64(&d->bcnt, 0);
+                       if (width > bcwidth)
+                               bcwidth = width;
+               }
+       }
+       /* if no rule numbers were specified, list all rules */
+       if (ac == 0) {
+               for (n = 0, r = data; n < nstat; n++, r = NEXT(r)) {
+                       if (co.use_set && r->set != co.use_set - 1)
+                               continue;
+                       show_ipfw(r, pcwidth, bcwidth);
+               }
+
+               if (co.do_dynamic && ndyn) {
+                       printf("## Dynamic rules (%d):\n", ndyn);
+                       for (n = 0, d = dynrules; n < ndyn; n++, d++) {
+                               if (co.use_set) {
+                                       bcopy((char *)&d->rule + sizeof(uint16_t),
+                                             &set, sizeof(uint8_t));
+                                       if (set != co.use_set - 1)
+                                               continue;
+                               }
+                               show_dyn_ipfw(d, pcwidth, bcwidth);
+                       }
+               }
+               goto done;
+       }
+
+       /* display specific rules requested on command line */
+
+       for (lac = ac, lav = av; lac != 0; lac--) {
+               /* convert command line rule # */
+               last = rnum = strtoul(*lav++, &endptr, 10);
+               if (*endptr == '-')
+                       last = strtoul(endptr+1, &endptr, 10);
+               if (*endptr) {
+                       exitval = EX_USAGE;
+                       warnx("invalid rule number: %s", *(lav - 1));
+                       continue;
+               }
+               for (n = seen = 0, r = data; n < nstat; n++, r = NEXT(r) ) {
+                       if (r->rulenum > last)
+                               break;
+                       if (co.use_set && r->set != co.use_set - 1)
+                               continue;
+                       if (r->rulenum >= rnum && r->rulenum <= last) {
+                               show_ipfw(r, pcwidth, bcwidth);
+                               seen = 1;
+                       }
+               }
+               if (!seen) {
+                       /* give precedence to other error(s) */
+                       if (exitval == EX_OK)
+                               exitval = EX_UNAVAILABLE;
+                       warnx("rule %lu does not exist", rnum);
+               }
+       }
+
+       if (co.do_dynamic && ndyn) {
+               printf("## Dynamic rules:\n");
+               for (lac = ac, lav = av; lac != 0; lac--) {
+                       last = rnum = strtoul(*lav++, &endptr, 10);
+                       if (*endptr == '-')
+                               last = strtoul(endptr+1, &endptr, 10);
+                       if (*endptr)
+                               /* already warned */
+                               continue;
+                       for (n = 0, d = dynrules; n < ndyn; n++, d++) {
+                               uint16_t rulenum;
+
+                               /*
+                                * Filter on the dynamic entry's own rule
+                                * number. Every entry is examined, since
+                                * nothing here establishes that the array
+                                * is ordered by rule number.
+                                */
+                               bcopy(&d->rule, &rulenum, sizeof(rulenum));
+                               if (rulenum < rnum || rulenum > last)
+                                       continue;
+                               if (co.use_set) {
+                                       bcopy((char *)&d->rule + sizeof(uint16_t),
+                                             &set, sizeof(uint8_t));
+                                       if (set != co.use_set - 1)
+                                               continue;
+                               }
+                               show_dyn_ipfw(d, pcwidth, bcwidth);
+                       }
+               }
+       }
+
+done:
+       free(data);
+
+       if (exitval != EX_OK)
+               exit(exitval);
+#undef NEXT
+}
+
+/*
+ * Translate 'host' into an IPv4 address stored in *ipaddr.
+ * Strings accepted by inet_aton() are converted directly; anything
+ * else is looked up with gethostbyname() and the first address of the
+ * result is used.
+ * Returns 0 on success, -1 if the name cannot be resolved.
+ */
+static int
+lookup_host (char *host, struct in_addr *ipaddr)
+{
+       struct hostent *entry;
+
+       /* numeric form: no lookup needed */
+       if (inet_aton(host, ipaddr))
+               return(0);
+
+       entry = gethostbyname(host);
+       if (entry == NULL)
+               return(-1);
+
+       *ipaddr = *(struct in_addr *)entry->h_addr_list[0];
+       return(0);
+}
+
+/*
+ * fills the addr and mask fields in the instruction as appropriate from av.
+ * Update length as appropriate.
+ * The following formats are allowed:
+ *     any     leaves the length at zero
+ *     me      single-word instruction (returns O_IP_*_ME)
+ *     table(N[,V])    table lookup, optionally with a value
+ *     1.2.3.4         single IP address
+ *     1.2.3.4:5.6.7.8 address:mask
+ *     1.2.3.4/24      address/mask
+ *     1.2.3.4/26{1,6,5,4,23}  set of addresses in a subnet
+ * We can have multiple comma-separated address/mask entries.
+ *
+ * av is modified in place while parsing (separators are temporarily
+ * replaced with NUL).  For the table and set forms the opcode is set
+ * to the O_IP_DST_* variant here.
+ * Unknown hosts, bad masks/widths and malformed sets terminate the
+ * program through errx().
+ */
+static void
+fill_ip(ipfw_insn_ip *cmd, char *av)
+{
+       int len = 0;
+       uint32_t *d = ((ipfw_insn_u32 *)cmd)->d;
+
+       cmd->o.len &= ~F_LEN_MASK;      /* zero len */
+
+       if (_substrcmp(av, "any") == 0)
+               return;
+
+       if (_substrcmp(av, "me") == 0) {
+               cmd->o.len |= F_INSN_SIZE(ipfw_insn);
+               return;
+       }
+
+       if (strncmp(av, "table(", 6) == 0) {
+               char *p = strchr(av + 6, ',');
+
+               if (p)
+                       *p++ = '\0';
+               cmd->o.opcode = O_IP_DST_LOOKUP;
+               cmd->o.arg1 = strtoul(av + 6, NULL, 0);
+               if (p) {
+                       cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32);
+                       d[0] = strtoul(p, NULL, 0);
+               } else
+                       cmd->o.len |= F_INSN_SIZE(ipfw_insn);
+               return;
+       }
+
+    while (av) {
+       /*
+        * After the address we can have '/' or ':' indicating a mask,
+        * ',' indicating another address follows, '{' indicating a
+        * set of addresses of unspecified size.
+        */
+       char *t = NULL, *p = strpbrk(av, "/:,{");
+       int masklen;
+       char md, nd = '\0';
+
+       if (p) {
+               md = *p;
+               *p++ = '\0';
+               if ((t = strpbrk(p, ",{")) != NULL) {
+                       nd = *t;
+                       *t = '\0';
+               }
+       } else
+               md = '\0';
+
+       if (lookup_host(av, (struct in_addr *)&d[0]) != 0)
+               errx(EX_NOHOST, "hostname ``%s'' unknown", av);
+       switch (md) {
+       case ':':
+               if (!inet_aton(p, (struct in_addr *)&d[1]))
+                       errx(EX_DATAERR, "bad netmask ``%s''", p);
+               break;
+       case '/':
+               masklen = atoi(p);
+               if (masklen == 0)
+                       d[1] = htonl(0);        /* mask */
+               else if (masklen < 0 || masklen > 32)
+                       /* negative widths would give an invalid shift count */
+                       errx(EX_DATAERR, "bad width ``%s''", p);
+               else
+                       d[1] = htonl(~0U << (32 - masklen));
+               break;
+       case '{':       /* no mask, assume /24 and put back the '{' */
+               d[1] = htonl(~0U << (32 - 24));
+               *(--p) = md;
+               break;
+
+       case ',':       /* single address plus continuation */
+               *(--p) = md;
+               /* FALLTHROUGH */
+       case 0:         /* initialization value */
+       default:
+               d[1] = htonl(~0U);      /* force /32 */
+               break;
+       }
+       d[0] &= d[1];           /* mask base address with mask */
+       if (t)
+               *t = nd;
+       /* find next separator */
+       if (p)
+               p = strpbrk(p, ",{");
+       if (p && *p == '{') {
+               /*
+                * We have a set of addresses. They are stored as follows:
+                *   arg1       is the set size (powers of 2, 2..256)
+                *   addr       is the base address IN HOST FORMAT
+                *   mask..     is an array of arg1 bits (rounded up to
+                *              the next multiple of 32) with bits set
+                *              for each host in the map.
+                */
+               uint32_t *map = (uint32_t *)&cmd->mask;
+               int low, high;
+               int i = contigmask((uint8_t *)&(d[1]), 32);
+
+               if (len > 0)
+                       errx(EX_DATAERR, "address set cannot be in a list");
+               if (i < 24 || i > 31)
+                       errx(EX_DATAERR, "invalid set with mask %d\n", i);
+               cmd->o.arg1 = 1<<(32-i);        /* map length           */
+               d[0] = ntohl(d[0]);             /* base addr in host format */
+               cmd->o.opcode = O_IP_DST_SET;   /* default */
+               cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32) + (cmd->o.arg1+31)/32;
+               for (i = 0; i < (cmd->o.arg1+31)/32 ; i++)
+                       map[i] = 0;     /* clear map */
+
+               av = p + 1;
+               low = d[0] & 0xff;
+               high = low + cmd->o.arg1 - 1;
+               /*
+                * Here, i stores the previous value when we specify a range
+                * of addresses within a mask, e.g. 45-63. i = -1 means we
+                * have no previous value.
+                */
+               i = -1; /* previous value in a range */
+               while (isdigit(*av)) {
+                       char *s;
+                       int a = strtol(av, &s, 0);
+
+                       if (s == av) { /* no parameter */
+                           if (*av != '}')
+                               errx(EX_DATAERR, "set not closed\n");
+                           if (i != -1)
+                               errx(EX_DATAERR, "incomplete range %d-", i);
+                           break;
+                       }
+                       if (a < low || a > high)
+                           errx(EX_DATAERR, "addr %d out of range [%d-%d]\n",
+                               a, low, high);
+                       a -= low;
+                       if (i == -1)    /* no previous in range */
+                           i = a;
+                       else {          /* check that range is valid */
+                           if (i > a)
+                               errx(EX_DATAERR, "invalid range %d-%d",
+                                       i+low, a+low);
+                           if (*s == '-')
+                               errx(EX_DATAERR, "double '-' in range");
+                       }
+                       /* unsigned shift: bit 31 must not overflow an int */
+                       for (; i <= a; i++)
+                           map[i/32] |= 1U<<(i & 31);
+                       i = -1;
+                       if (*s == '-')
+                           i = a;
+                       else if (*s == '}')
+                           break;
+                       av = s+1;
+               }
+               return;
+       }
+       av = p;
+       if (av)                 /* then *av must be a ',' */
+               av++;
+
+       /* Check this entry */
+       if (d[1] == 0) { /* "any", specified as x.x.x.x/0 */
+               /*
+                * 'any' turns the entire list into a NOP.
+                * 'not any' never matches, so it is removed from the
+                * list unless it is the only item, in which case we
+                * report an error.
+                */
+               if (cmd->o.len & F_NOT) {       /* "not any" never matches */
+                       if (av == NULL && len == 0) /* only this entry */
+                               errx(EX_DATAERR, "not any never matches");
+               }
+               /* else do nothing and skip this entry */
+               return;
+       }
+       /* A single IP can be stored in an optimized format */
+       if (d[1] == (uint32_t)~0 && av == NULL && len == 0) {
+               cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32);
+               return;
+       }
+       len += 2;       /* two words... */
+       d += 2;
+    } /* end while */
+    if (len + 1 > F_LEN_MASK)
+       errx(EX_DATAERR, "address list too long");
+    cmd->o.len |= len+1;
+}
+
+
+/*
+ * n2mask sets the first n bits of the mask (counting from the most
+ * significant bit of the first byte) and clears all the others.
+ * Values of n larger than the number of bits in a struct in6_addr are
+ * clamped so that nothing is written past the end of *mask; n <= 0
+ * yields an all-zero mask.
+ */
+void
+n2mask(struct in6_addr *mask, int n)
+{
+       /* partial[k] has the k most significant bits of a byte set */
+       static const unsigned char partial[8] =
+           { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe };
+       const int maxbits = (int)(8 * sizeof(struct in6_addr));
+       unsigned char *p = (unsigned char *)mask;
+
+       memset(mask, 0, sizeof(struct in6_addr));
+       if (n > maxbits)
+               n = maxbits;
+       /* whole bytes first ... */
+       for (; n >= 8; n -= 8)
+               *p++ = 0xff;
+       /* ... then the remaining 1..7 bits, if any */
+       if (n > 0)
+               *p = partial[n];
+}
+
+/*
+ * Parse a comma-separated list of flag names and store the result in
+ * a one-word instruction.  A name prefixed with '!' goes into the
+ * "clear" mask, any other name into the "set" mask; names are looked
+ * up in 'flags' with match_token().  On return cmd->arg1 holds the set
+ * mask in its low byte and the clear mask in the next byte, cmd->opcode
+ * is 'opcode' and the length is 1 with F_NOT/F_OR preserved.
+ * The string p is modified (commas are overwritten with NUL).
+ * An unknown flag name terminates the program through errx().
+ */
+static void
+fill_flags(ipfw_insn *cmd, enum ipfw_opcodes opcode,
+       struct _s_x *flags, char *p)
+{
+       uint8_t want_set = 0, want_clear = 0;
+       char *next;             /* start of the following item, or NULL */
+
+       for (; p != NULL && *p != '\0'; p = next) {
+               int negated = (*p == '!');
+               int bit;
+
+               if (negated)
+                       p++;
+               /* cut the current item off from the rest of the list */
+               next = strchr(p, ',');
+               if (next != NULL)
+                       *next++ = '\0';
+
+               bit = match_token(flags, p);
+               if (bit <= 0)
+                       errx(EX_DATAERR, "invalid flag %s", p);
+               if (negated)
+                       want_clear |= (uint8_t)bit;
+               else
+                       want_set |= (uint8_t)bit;
+       }
+
+       cmd->opcode = opcode;
+       cmd->len =  (cmd->len & (F_NOT | F_OR)) | 1;
+       cmd->arg1 = want_set | (want_clear << 8);
+}
+
+
+/*
+ * Delete the rules, sets, pipes/queues or nat instances whose numbers
+ * follow av[0].  An optional leading "set" keyword selects deletion of
+ * whole sets; it cannot be combined with co.use_set.
+ *
+ * For firewall rules the request word sent with IP_FW_DEL is
+ *     number | (do_set << 24)                          without co.use_set
+ *     number | (5 << 24) | ((co.use_set - 1) << 16)    with co.use_set
+ *
+ * Every number is attempted even if an earlier one failed.  A failure
+ * is remembered for the rest of the run, and the function calls exit()
+ * with a non-zero status at the end if anything failed.
+ */
+void
+ipfw_delete(char *av[])
+{
+       uint32_t rulenum;
+       int i;
+       int exitval = EX_OK;
+       int do_set = 0;
+
+       av++;
+       NEED1("missing rule specification");
+       if ( *av && _substrcmp(*av, "set") == 0) {
+               /* Do not allow using the following syntax:
+                *      ipfw set N delete set M
+                */
+               if (co.use_set)
+                       errx(EX_DATAERR, "invalid syntax");
+               do_set = 1;     /* delete set */
+               av++;
+       }
+
+       /* Rule number */
+       while (*av && isdigit(**av)) {
+               i = atoi(*av); av++;
+               if (co.do_nat) {
+                       /* keep exitval untouched on success */
+                       if (do_cmd(IP_FW_NAT_DEL, &i, sizeof i)) {
+                               exitval = EX_UNAVAILABLE;
+                               warn("rule %u not available", i);
+                       }
+               } else if (co.do_pipe) {
+                       /* any non-zero return is treated as a failure */
+                       int rc = ipfw_delete_pipe(co.do_pipe, i);
+
+                       if (rc != EX_OK)
+                               exitval = rc;
+               } else {
+                       if (co.use_set)
+                               rulenum = (i & 0xffff) | (5 << 24) |
+                                   ((co.use_set - 1) << 16);
+                       else
+                               rulenum = (i & 0xffff) | (do_set << 24);
+                       if (do_cmd(IP_FW_DEL, &rulenum, sizeof rulenum)) {
+                               exitval = EX_UNAVAILABLE;
+                               /* report the number, not the command word */
+                               warn("rule %u: setsockopt(IP_FW_DEL)",
+                                   rulenum & 0xffff);
+                       }
+               }
+       }
+       if (exitval != EX_OK)
+               exit(exitval);
+}
+
+
+/*
+ * Fill the interface instruction 'cmd' from 'arg'.  The name is not
+ * checked against existing interfaces, since interfaces can be created
+ * dynamically and checking at insert time makes little sense.
+ *
+ *   "any"                 cmd->o.len is set to 0, i.e. the instruction
+ *                         is effectively dropped;
+ *   starts with a digit   parsed with inet_aton() into cmd->p.ip,
+ *                         EX_DATAERR on failure;
+ *   anything else         copied (truncated to fit) into cmd->name;
+ *                         cmd->p.glob is 1 when the name contains '*',
+ *                         '?' or '[', which marks it as a shell pattern.
+ */
+static void
+fill_iface(ipfw_insn_if *cmd, char *arg)
+{
+       cmd->name[0] = '\0';
+       cmd->o.len |= F_INSN_SIZE(ipfw_insn_if);
+
+       /* Parse the interface or address */
+       if (strcmp(arg, "any") == 0) {
+               cmd->o.len = 0;         /* effectively ignore this command */
+       } else if (!isdigit((unsigned char)*arg)) {
+               /* the cast keeps isdigit() defined for bytes >= 0x80 */
+               strlcpy(cmd->name, arg, sizeof(cmd->name));
+               cmd->p.glob = strpbrk(arg, "*?[") != NULL ? 1 : 0;
+       } else if (!inet_aton(arg, &cmd->p.ip)) {
+               errx(EX_DATAERR, "bad ip address ``%s''", arg);
+       }
+}
+
+/*
+ * Parse the MAC specification 'p' into the ETHER_ADDR_LEN-byte arrays
+ * 'addr' and 'mask'.  Accepted forms:
+ *     any             address and mask all zero
+ *     ADDR            exact match, every mask byte 0xff
+ *     ADDR/LEN        LEN leading mask bits set, remaining bits zero,
+ *                     0 <= LEN <= ETHER_ADDR_LEN * 8
+ *     ADDR&MASK       explicit mask in MAC notation
+ * Except for "any", the stored address is ANDed with the mask.
+ * Malformed input terminates the program with EX_DATAERR.
+ */
+static void
+get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask)
+{
+       int i;
+       size_t l;
+       char *ap, *ptr, *optr;
+       struct ether_addr *mac;
+       const char *macset = "0123456789abcdefABCDEF:";
+
+       if (strcmp(p, "any") == 0) {
+               for (i = 0; i < ETHER_ADDR_LEN; i++)
+                       addr[i] = mask[i] = 0;
+               return;
+       }
+
+       /* work on a private copy: strsep() writes into its argument */
+       optr = ptr = strdup(p);
+       if (optr == NULL)
+               errx(EX_UNAVAILABLE, "strdup: out of memory");
+       if ((ap = strsep(&ptr, "&/")) != NULL && *ap != 0) {
+               l = strlen(ap);
+               if (strspn(ap, macset) != l || (mac = ether_aton(ap)) == NULL)
+                       errx(EX_DATAERR, "Incorrect MAC address");
+               bcopy(mac, addr, ETHER_ADDR_LEN);
+       } else
+               errx(EX_DATAERR, "Incorrect MAC address");
+
+       if (ptr != NULL) { /* we have mask? */
+               /* look up in the untouched input which separator was hit */
+               if (p[ptr - optr - 1] == '/') { /* mask len */
+                       long ml = strtol(ptr, &ap, 10);
+                       if (*ap != 0 || ml > ETHER_ADDR_LEN * 8 || ml < 0)
+                               errx(EX_DATAERR, "Incorrect mask length");
+                       /* bytes past the prefix must be zero, not stale */
+                       memset(mask, 0, ETHER_ADDR_LEN);
+                       for (i = 0; ml > 0 && i < ETHER_ADDR_LEN; ml -= 8, i++)
+                               mask[i] = (ml >= 8) ? 0xff :
+                                   (uint8_t)(0xff << (8 - ml));
+               } else { /* mask */
+                       l = strlen(ptr);
+                       if (strspn(ptr, macset) != l ||
+                           (mac = ether_aton(ptr)) == NULL)
+                               errx(EX_DATAERR, "Incorrect mask");
+                       bcopy(mac, mask, ETHER_ADDR_LEN);
+               }
+       } else { /* default mask: ff:ff:ff:ff:ff:ff */
+               for (i = 0; i < ETHER_ADDR_LEN; i++)
+                       mask[i] = 0xff;
+       }
+       for (i = 0; i < ETHER_ADDR_LEN; i++)
+               addr[i] &= mask[i];
+
+       free(optr);
+}
+
+/*
+ * Return the slot that follows 'cmd', i.e. F_LEN(cmd) ipfw_insn units
+ * further on.  The first ipfw_insn of that slot is zeroed so that no
+ * stale opcode, length or flag bits survive from earlier use.
+ */
+static ipfw_insn *
+next_cmd(ipfw_insn *cmd)
+{
+       ipfw_insn *following = cmd + F_LEN(cmd);
+
+       memset(following, 0, sizeof(*following));
+       return following;
+}
+
+/*
+ * Turn 'cmd' into an O_NOP instruction carrying a comment: the words
+ * av[0], av[1], ... (up to the NULL terminator) are stored right after
+ * the instruction header, separated by single spaces and terminated by
+ * a NUL.  With no words at all only the opcode and length are set.
+ * The program terminates with EX_DATAERR when the words plus their
+ * separators exceed 84 bytes.
+ */
+static void
+fill_comment(ipfw_insn *cmd, char **av)
+{
+       char *dst = (char *)(cmd + 1);
+       char **word;
+       int total = 0;
+
+       cmd->opcode = O_NOP;
+       cmd->len = (cmd->len & (F_NOT | F_OR));
+
+       /* every word costs its length plus one separator/terminator */
+       for (word = av; *word != NULL; word++)
+               total += strlen(*word) + 1;
+       if (total == 0)
+               return;
+       if (total > 84)
+               errx(EX_DATAERR,
+                   "comment too long (max 80 chars)");
+
+       /* one header unit plus the text rounded up to 4-byte units */
+       cmd->len = (cmd->len & (F_NOT | F_OR)) | (1 + (total + 3) / 4);
+
+       for (word = av; *word != NULL; word++) {
+               size_t n = strlen(*word);
+
+               memcpy(dst, *word, n);
+               dst += n;
+               *dst++ = ' ';
+       }
+       /* the last separator becomes the terminator */
+       dst[-1] = '\0';
+}
+
+/*
+ * Set up a one-word instruction with the given opcode and argument.
+ * The F_NOT/F_OR bits already present in cmd->len are kept and merged
+ * with those passed in 'flags'; every other bit of either is dropped.
+ */
+static void
+fill_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, int flags, uint16_t arg)
+{
+       const int kept = (cmd->len | flags) & (F_NOT | F_OR);
+
+       cmd->opcode = opcode;
+       cmd->arg1 = arg;
+       cmd->len = kept | 1;    /* length is always one */
+}
+
+/*
+ * Build an O_MACADDR2 instruction in 'cmd' from two MAC specifications:
+ * av[0] is the destination and av[1] the source.  The destination
+ * address/mask go in the first ETHER_ADDR_LEN bytes of the addr/mask
+ * arrays, the source ones right after.  Terminates with EX_DATAERR if
+ * either argument is missing.  Returns cmd.
+ */
+static ipfw_insn *
+add_mac(ipfw_insn *cmd, char *av[])
+{
+       ipfw_insn_mac *m = (ipfw_insn_mac *)cmd;
+
+       if (av[0] == NULL || av[1] == NULL)
+               errx(EX_DATAERR, "MAC dst src");
+
+       cmd->opcode = O_MACADDR2;
+       cmd->len = (cmd->len & (F_NOT | F_OR)) | F_INSN_SIZE(ipfw_insn_mac);
+
+       /* destination first, then source */
+       get_mac_addr_mask(av[0], &m->addr[0], &m->mask[0]);
+       get_mac_addr_mask(av[1], &m->addr[ETHER_ADDR_LEN],
+           &m->mask[ETHER_ADDR_LEN]);
+       return cmd;
+}
+
+/*
+ * Build an O_MAC_TYPE instruction from the type list in 'av', which is
+ * parsed by fill_newports() with IPPROTO_ETHERTYPE.  "any" produces no
+ * instruction and returns NULL; a NULL 'av' terminates the program
+ * with EX_DATAERR.  Otherwise returns cmd.
+ */
+static ipfw_insn *
+add_mactype(ipfw_insn *cmd, char *av)
+{
+       if (av == NULL)
+               errx(EX_DATAERR, "missing MAC type");
+       if (strcmp(av, "any") == 0)
+               return NULL;    /* wildcard, nothing to match on */
+
+       fill_newports((ipfw_insn_u16 *)cmd, av, IPPROTO_ETHERTYPE);
+       cmd->opcode = O_MAC_TYPE;
+       return cmd;
+}
+
+/*
+ * Build an O_PROTO instruction from 'av', which is either a decimal
+ * protocol number in the range 1..255 or a name known to
+ * getprotobyname().  Anything that is not such a number (trailing
+ * characters, zero, negative or too large to fit the u_char stored in
+ * *protop) is tried as a name.
+ *
+ * Returns cmd and stores the protocol in *protop on success; returns
+ * NULL, leaving cmd and *protop untouched, when 'av' is neither.
+ */
+static ipfw_insn *
+add_proto0(ipfw_insn *cmd, char *av, u_char *protop)
+{
+       struct protoent *pe;
+       char *ep;
+       long num;
+       int proto;
+
+       /* keep the full long so oversized input cannot wrap into range */
+       num = strtol(av, &ep, 10);
+       if (*ep != '\0' || num <= 0 || num > 255) {
+               if ((pe = getprotobyname(av)) == NULL)
+                       return NULL;
+               proto = pe->p_proto;
+       } else
+               proto = (int)num;
+
+       fill_cmd(cmd, O_PROTO, 0, proto);
+       *protop = proto;
+       return cmd;
+}
+
+/*
+ * Translate the protocol keyword 'av':
+ *   "all" (as matched by _substrcmp) or "ip"   no instruction is
+ *                          written, *protop = IPPROTO_IP;
+ *   "ip4"                  O_IP4, *protop = IPPROTO_IP;
+ *   "ip6"                  O_IP6, *protop = IPPROTO_IPV6;
+ *   anything else          handed to add_proto0().
+ * Returns cmd, or whatever add_proto0() returns.
+ */
+static ipfw_insn *
+add_proto(ipfw_insn *cmd, char *av, u_char *protop)
+{
+       if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) {
+               /* do not set O_IP4 nor O_IP6 */
+               *protop = IPPROTO_IP;
+               return cmd;
+       }
+       if (strcmp(av, "ip4") == 0) {
+               /* explicit "just IPv4" rule */
+               fill_cmd(cmd, O_IP4, 0, 0);
+               *protop = IPPROTO_IP;
+               return cmd;
+       }
+       if (strcmp(av, "ip6") == 0) {
+               /* explicit "just IPv6" rule */
+               fill_cmd(cmd, O_IP6, 0, 0);
+               *protop = IPPROTO_IPV6;
+               return cmd;
+       }
+       return add_proto0(cmd, av, protop);
+}
+
+/*
+ * Same keyword translation as add_proto(), additionally accepting
+ * "ipv4" and "ipv6" as spellings of "ip4" and "ip6":
+ *   "all" (as matched by _substrcmp) or "ip"   no instruction is
+ *                          written, *protop = IPPROTO_IP;
+ *   "ipv4" / "ip4"         O_IP4, *protop = IPPROTO_IP;
+ *   "ipv6" / "ip6"         O_IP6, *protop = IPPROTO_IPV6;
+ *   anything else          handed to add_proto0().
+ * Returns cmd, or whatever add_proto0() returns.
+ */
+static ipfw_insn *
+add_proto_compat(ipfw_insn *cmd, char *av, u_char *protop)
+{
+       int is_v4, is_v6;
+
+       if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) {
+               /* do not set O_IP4 nor O_IP6 */
+               *protop = IPPROTO_IP;
+               return cmd;
+       }
+
+       is_v4 = (strcmp(av, "ipv4") == 0 || strcmp(av, "ip4") == 0);
+       is_v6 = !is_v4 &&
+           (strcmp(av, "ipv6") == 0 || strcmp(av, "ip6") == 0);
+       if (!is_v4 && !is_v6)
+               return add_proto0(cmd, av, protop);
+
+       /* explicit "just IPv4" or "just IPv6" rule */
+       fill_cmd(cmd, is_v4 ? O_IP4 : O_IP6, 0, 0);
+       *protop = is_v4 ? IPPROTO_IP : IPPROTO_IPV6;
+       return cmd;
+}
+
+/*
+ * Parse the IPv4 source specification 'av' with fill_ip() and give the
+ * instruction its source-side opcode.  fill_ip() leaves O_IP_DST_SET or
+ * O_IP_DST_LOOKUP for sets and tables, which are mapped to their SRC
+ * twins; otherwise the opcode is picked from the instruction length:
+ * a bare ipfw_insn is "me", an ipfw_insn_u32 a single address, and
+ * anything else an address/mask list.  Always returns cmd.
+ */
+static ipfw_insn *
+add_srcip(ipfw_insn *cmd, char *av)
+{
+       int words;
+
+       fill_ip((ipfw_insn_ip *)cmd, av);
+
+       if (cmd->opcode == O_IP_DST_SET) {              /* set */
+               cmd->opcode = O_IP_SRC_SET;
+               return cmd;
+       }
+       if (cmd->opcode == O_IP_DST_LOOKUP) {           /* table */
+               cmd->opcode = O_IP_SRC_LOOKUP;
+               return cmd;
+       }
+
+       words = F_LEN(cmd);
+       if (words == F_INSN_SIZE(ipfw_insn))            /* me */
+               cmd->opcode = O_IP_SRC_ME;
+       else if (words == F_INSN_SIZE(ipfw_insn_u32))   /* one IP */
+               cmd->opcode = O_IP_SRC;
+       else                                            /* addr/mask */
+               cmd->opcode = O_IP_SRC_MASK;
+       return cmd;
+}
+
+/*
+ * Parse the IPv4 destination specification 'av' with fill_ip() and
+ * give the instruction its destination-side opcode.  O_IP_DST_SET and
+ * O_IP_DST_LOOKUP, as left by fill_ip(), are kept unchanged; otherwise
+ * the opcode is picked from the instruction length: a bare ipfw_insn
+ * is "me", an ipfw_insn_u32 a single address, and anything else an
+ * address/mask list.  Always returns cmd.
+ */
+static ipfw_insn *
+add_dstip(ipfw_insn *cmd, char *av)
+{
+       int words;
+
+       fill_ip((ipfw_insn_ip *)cmd, av);
+
+       /* set and table opcodes are already the destination ones */
+       if (cmd->opcode == O_IP_DST_SET || cmd->opcode == O_IP_DST_LOOKUP)
+               return cmd;
+
+       words = F_LEN(cmd);
+       if (words == F_INSN_SIZE(ipfw_insn))            /* me */
+               cmd->opcode = O_IP_DST_ME;
+       else if (words == F_INSN_SIZE(ipfw_insn_u32))   /* one IP */
+               cmd->opcode = O_IP_DST;
+       else                                            /* addr/mask */
+               cmd->opcode = O_IP_DST_MASK;
+       return cmd;
+}
+
+/*
+ * Try to parse 'av' as a port list for protocol 'proto' by means of
+ * fill_newports().  On success the instruction gets 'opcode' and cmd
+ * is returned.  NULL is returned for "any" (as matched by _substrcmp)
+ * and when fill_newports() reports failure.
+ */
+static ipfw_insn *
+add_ports(ipfw_insn *cmd, char *av, u_char proto, int opcode)
+{
+       /* XXX "any" is trapped before. Perhaps "to" */
+       if (_substrcmp(av, "any") == 0)
+               return NULL;
+       if (!fill_newports((ipfw_insn_u16 *)cmd, av, proto))
+               return NULL;
+
+       /* XXX todo: check that we have a protocol with ports */
+       cmd->opcode = opcode;
+       return cmd;
+}
+
+/*
+ * Build the source-address instruction for 'av', choosing between the
+ * IPv6 and IPv4 parsers:
+ *  1. add_srcip6() is tried when proto is IPPROTO_IPV6, av is "me6",
+ *     or av with any trailing "/..." removed is an IPv6 literal;
+ *  2. if that yielded nothing, add_srcip() is tried when proto is
+ *     IPPROTO_IP, av is "me", or av is not an IPv6 literal;
+ *  3. if there is still no result and av is not "any", cmd itself is
+ *     returned.
+ * Returns NULL when the scratch copy of av cannot be allocated, or
+ * when no step above produced a result.
+ */
+static ipfw_insn *
+add_src(ipfw_insn *cmd, char *av, u_char proto)
+{
+       struct in6_addr scratch;
+       char *bare, *slash;
+       ipfw_insn *ret = NULL;
+       int v6_literal;
+
+       /* classify the address part only, without the "/mask" suffix */
+       bare = strdup(av);
+       if (bare == NULL)
+               return NULL;
+       slash = strrchr(bare, '/');
+       if (slash != NULL)
+               *slash = '\0';
+       v6_literal = (inet_pton(AF_INET6, bare, &scratch) == 1);
+       free(bare);
+
+       if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || v6_literal)
+               ret = add_srcip6(cmd, av);
+       /* XXX: should check for IPv4, not !IPv6 */
+       if (ret == NULL &&
+           (proto == IPPROTO_IP || strcmp(av, "me") == 0 || !v6_literal))
+               ret = add_srcip(cmd, av);
+       if (ret == NULL && strcmp(av, "any") != 0)
+               ret = cmd;
+
+       return ret;
+}
+
+/*
+ * Build the destination-address instruction for 'av', choosing between
+ * the IPv6 and IPv4 parsers:
+ *  1. add_dstip6() is tried when proto is IPPROTO_IPV6, av is "me6",
+ *     or av with any trailing "/..." removed is an IPv6 literal;
+ *  2. if that yielded nothing, add_dstip() is tried when proto is
+ *     IPPROTO_IP, av is "me", or av is not an IPv6 literal;
+ *  3. if there is still no result and av is not "any", cmd itself is
+ *     returned.
+ * Returns NULL when the scratch copy of av cannot be allocated, or
+ * when no step above produced a result.
+ */
+static ipfw_insn *
+add_dst(ipfw_insn *cmd, char *av, u_char proto)
+{
+       struct in6_addr scratch;
+       char *bare, *slash;
+       ipfw_insn *ret = NULL;
+       int v6_literal;
+
+       /* classify the address part only, without the "/mask" suffix */
+       bare = strdup(av);
+       if (bare == NULL)
+               return NULL;
+       slash = strrchr(bare, '/');
+       if (slash != NULL)
+               *slash = '\0';
+       v6_literal = (inet_pton(AF_INET6, bare, &scratch) == 1);
+       free(bare);
+
+       if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || v6_literal)
+               ret = add_dstip6(cmd, av);
+       /* XXX: should check for IPv4, not !IPv6 */
+       if (ret == NULL &&
+           (proto == IPPROTO_IP || strcmp(av, "me") == 0 || !v6_literal))
+               ret = add_dstip(cmd, av);
+       if (ret == NULL && strcmp(av, "any") != 0)
+               ret = cmd;
+
+       return ret;
+}
+
+/*
+ * Parse arguments and assemble the microinstructions which make up a rule.
+ * Rules are added into the 'rulebuf' and then copied in the correct order
+ * into the actual rule.
+ *
+ * The syntax for a rule starts with the action, followed by
+ * optional action parameters, and the various match patterns.
+ * In the assembled microcode, the first opcode must be an O_PROBE_STATE
+ * (generated if the rule includes a keep-state option), then the
+ * various match patterns, log/altq actions, and the actual action.
+ *
+ */
+void
+ipfw_add(char *av[])
+{
+       /*
+        * rules are added into the 'rulebuf' and then copied in
+        * the correct order into the actual rule.
+        * Some things that need to go out of order (prob, action etc.)
+        * go into actbuf[].
+        */
+       static uint32_t rulebuf[255], actbuf[255], cmdbuf[255];
+
+       ipfw_insn *src, *dst, *cmd, *action, *prev=NULL;
+       ipfw_insn *first_cmd;   /* first match pattern */
+
+       struct ip_fw *rule;
+
+       /*
+        * various flags used to record that we entered some fields.
+        */
+       ipfw_insn *have_state = NULL;   /* check-state or keep-state */
+       ipfw_insn *have_log = NULL, *have_altq = NULL, *have_tag = NULL;
+       size_t len;
+
+       int i;
+
+       int open_par = 0;       /* open parenthesis ( */
+
+       /* proto is here because it is used to fetch ports */
+       u_char proto = IPPROTO_IP;      /* default protocol */
+
+       double match_prob = 1; /* match probability, default is always match */
+
+       bzero(actbuf, sizeof(actbuf));          /* actions go here */
+       bzero(cmdbuf, sizeof(cmdbuf));
+       bzero(rulebuf, sizeof(rulebuf));
+
+       rule = (struct ip_fw *)rulebuf;
+       cmd = (ipfw_insn *)cmdbuf;
+       action = (ipfw_insn *)actbuf;
+
+       av++;
+
+       /* [rule N]     -- Rule number optional */
+       if (av[0] && isdigit(**av)) {
+               rule->rulenum = atoi(*av);
+               av++;
+       }
+
+       /* [set N]      -- set number (0..RESVD_SET), optional */
+       if (av[0] && av[1] && _substrcmp(*av, "set") == 0) {
+               int set = strtoul(av[1], NULL, 10);
+               if (set < 0 || set > RESVD_SET)
+                       errx(EX_DATAERR, "illegal set %s", av[1]);
+               rule->set = set;
+               av += 2;
+       }
+
+       /* [prob D]     -- match probability, optional */
+       if (av[0] && av[1] && _substrcmp(*av, "prob") == 0) {
+               match_prob = strtod(av[1], NULL);
+
+               if (match_prob <= 0 || match_prob > 1)
+                       errx(EX_DATAERR, "illegal match prob. %s", av[1]);
+               av += 2;
+       }
+
+       /* action       -- mandatory */
+       NEED1("missing action");
+       i = match_token(rule_actions, *av);
+       av++;
+       action->len = 1;        /* default */
+       switch(i) {
+       case TOK_CHECKSTATE:
+               have_state = action;
+               action->opcode = O_CHECK_STATE;
+               break;
+
+       case TOK_ACCEPT:
+               action->opcode = O_ACCEPT;
+               break;
+
+       case TOK_DENY:
+               action->opcode = O_DENY;
+               action->arg1 = 0;
+               break;
+
+       case TOK_REJECT:
+               action->opcode = O_REJECT;
+               action->arg1 = ICMP_UNREACH_HOST;
+               break;
+
+       case TOK_RESET:
+               action->opcode = O_REJECT;
+               action->arg1 = ICMP_REJECT_RST;
+               break;
+
+       case TOK_RESET6:
+               action->opcode = O_UNREACH6;
+               action->arg1 = ICMP6_UNREACH_RST;
+               break;
+
+       case TOK_UNREACH:
+               action->opcode = O_REJECT;
+               NEED1("missing reject code");
+               fill_reject_code(&action->arg1, *av);
+               av++;
+               break;
+
+       case TOK_UNREACH6:
+               action->opcode = O_UNREACH6;
+               NEED1("missing unreach code");
+               fill_unreach6_code(&action->arg1, *av);
+               av++;
+               break;
+
+       case TOK_COUNT:
+               action->opcode = O_COUNT;
+               break;
+
+       case TOK_NAT:
+               action->opcode = O_NAT;
+               action->len = F_INSN_SIZE(ipfw_insn_nat);
+               goto chkarg;
+
+       case TOK_QUEUE:
+               action->opcode = O_QUEUE;
+               goto chkarg;
+       case TOK_PIPE:
+               action->opcode = O_PIPE;
+               goto chkarg;
+       case TOK_SKIPTO:
+               action->opcode = O_SKIPTO;
+               goto chkarg;
+       case TOK_NETGRAPH:
+               action->opcode = O_NETGRAPH;
+               goto chkarg;
+       case TOK_NGTEE:
+               action->opcode = O_NGTEE;
+               goto chkarg;
+       case TOK_DIVERT:
+               action->opcode = O_DIVERT;
+               goto chkarg;
+       case TOK_TEE:
+               action->opcode = O_TEE;
+               goto chkarg;
+       case TOK_CALL:
+               action->opcode = O_CALLRETURN;
+chkarg:
+               if (!av[0])
+                       errx(EX_USAGE, "missing argument for %s", *(av - 1));
+               if (isdigit(**av)) {
+                       action->arg1 = strtoul(*av, NULL, 10);
+                       if (action->arg1 <= 0 || action->arg1 >= IP_FW_TABLEARG)
+                               errx(EX_DATAERR, "illegal argument for %s",
+                                   *(av - 1));
+               } else if (_substrcmp(*av, "tablearg") == 0) {
+                       action->arg1 = IP_FW_TABLEARG;
+               } else if (i == TOK_DIVERT || i == TOK_TEE) {
+                       struct servent *s;
+                       setservent(1);
+                       s = getservbyname(av[0], "divert");
+                       if (s != NULL)
+                               action->arg1 = ntohs(s->s_port);
+                       else
+                               errx(EX_DATAERR, "illegal divert/tee port");
+               } else
+                       errx(EX_DATAERR, "illegal argument for %s", *(av - 1));
+               av++;
+               break;
+
+       case TOK_FORWARD: {
+               ipfw_insn_sa *p = (ipfw_insn_sa *)action;
+               char *s, *end;
+
+               NEED1("missing forward address[:port]");
+
+               action->opcode = O_FORWARD_IP;
+               action->len = F_INSN_SIZE(ipfw_insn_sa);
+
+               /*
+                * In the kernel we assume AF_INET and use only
+                * sin_port and sin_addr. Remember to set sin_len as
+                * the routing code seems to use it too.
+                */
+               p->sa.sin_family = AF_INET;
+               p->sa.sin_len = sizeof(struct sockaddr_in);
+               p->sa.sin_port = 0;
+               /*
+                * locate the address-port separator (':' or ',')
+                */
+               s = strchr(*av, ':');
+               if (s == NULL)
+                       s = strchr(*av, ',');
+               if (s != NULL) {
+                       *(s++) = '\0';
+                       i = strtoport(s, &end, 0 /* base */, 0 /* proto */);
+                       if (s == end)
+                               errx(EX_DATAERR,
+                                   "illegal forwarding port ``%s''", s);
+                       p->sa.sin_port = (u_short)i;
+               }
+               if (_substrcmp(*av, "tablearg") == 0)
+                       p->sa.sin_addr.s_addr = INADDR_ANY;
+               else
+                       lookup_host(*av, &(p->sa.sin_addr));
+               av++;
+               break;
+           }
+       case TOK_COMMENT:
+               /* pretend it is a 'count' rule followed by the comment */
+               action->opcode = O_COUNT;
+               av--;           /* go back... */
+               break;
+
+       case TOK_SETFIB:
+           {
+               int numfibs;
+               size_t intsize = sizeof(int);
+
+               action->opcode = O_SETFIB;
+               NEED1("missing fib number");
+               action->arg1 = strtoul(*av, NULL, 10);
+               if (sysctlbyname("net.fibs", &numfibs, &intsize, NULL, 0) == -1)
+                       errx(EX_DATAERR, "fibs not suported.\n");
+               if (action->arg1 >= numfibs)  /* Temporary */
+                       errx(EX_DATAERR, "fib too large.\n");
+               av++;
+               break;
+           }
+
+       case TOK_REASS:
+               action->opcode = O_REASS;
+               break;
+
+       case TOK_RETURN:
+               fill_cmd(action, O_CALLRETURN, F_NOT, 0);
+               break;
+
+       default:
+               errx(EX_DATAERR, "invalid action %s\n", av[-1]);
+       }
+       action = next_cmd(action);
+
+       /*
+        * [altq queuename] -- altq tag, optional
+        * [log [logamount N]]  -- log, optional
+        *
+        * If they exist, they go first in the cmdbuf, but then they are
+        * skipped in the copy section to the end of the buffer.
+        */
+       while (av[0] != NULL && (i = match_token(rule_action_params, *av)) != -1) {
+               av++;
+               switch (i) {
+               case TOK_LOG:
+                   {
+                       ipfw_insn_log *c = (ipfw_insn_log *)cmd;
+                       int l;
+
+                       if (have_log)
+                               errx(EX_DATAERR,
+                                   "log cannot be specified more than once");
+                       have_log = (ipfw_insn *)c;
+                       cmd->len = F_INSN_SIZE(ipfw_insn_log);
+                       cmd->opcode = O_LOG;
+                       if (av[0] && _substrcmp(*av, "logamount") == 0) {
+                               av++;
+                               NEED1("logamount requires argument");
+                               l = atoi(*av);
+                               if (l < 0)
+                                       errx(EX_DATAERR,
+                                           "logamount must be positive");
+                               c->max_log = l;
+                               av++;
+                       } else {
+                               len = sizeof(c->max_log);
+                               if (sysctlbyname("net.inet.ip.fw.verbose_limit",
+                                   &c->max_log, &len, NULL, 0) == -1)
+                                       errx(1, "sysctlbyname(\"%s\")",
+                                           "net.inet.ip.fw.verbose_limit");
+                       }
+                   }
+                       break;
+
+#ifndef NO_ALTQ
+               case TOK_ALTQ:
+                   {
+                       ipfw_insn_altq *a = (ipfw_insn_altq *)cmd;
+
+                       NEED1("missing altq queue name");
+                       if (have_altq)
+                               errx(EX_DATAERR,
+                                   "altq cannot be specified more than once");
+                       have_altq = (ipfw_insn *)a;
+                       cmd->len = F_INSN_SIZE(ipfw_insn_altq);
+                       cmd->opcode = O_ALTQ;
+                       a->qid = altq_name_to_qid(*av);
+                       av++;
+                   }
+                       break;
+#endif
+
+               case TOK_TAG:
+               case TOK_UNTAG: {
+                       uint16_t tag;
+
+                       if (have_tag)
+                               errx(EX_USAGE, "tag and untag cannot be "
+                                   "specified more than once");
+                       GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX, i,
+                          rule_action_params);
+                       have_tag = cmd;
+                       fill_cmd(cmd, O_TAG, (i == TOK_TAG) ? 0: F_NOT, tag);
+                       av++;
+                       break;
+               }
+
+               default:
+                       abort();
+               }
+               cmd = next_cmd(cmd);
+       }
+
+       if (have_state) /* must be a check-state, we are done */
+               goto done;
+
+#define OR_START(target)                                       \
+       if (av[0] && (*av[0] == '(' || *av[0] == '{')) {        \
+               if (open_par)                                   \
+                       errx(EX_USAGE, "nested \"(\" not allowed\n"); \
+               prev = NULL;                                    \
+               open_par = 1;                                   \
+               if ( (av[0])[1] == '\0') {                      \
+                       av++;                                   \
+               } else                                          \
+                       (*av)++;                                \
+       }                                                       \
+       target:                                                 \
+
+
+#define        CLOSE_PAR                                               \
+       if (open_par) {                                         \
+               if (av[0] && (                                  \
+                   strcmp(*av, ")") == 0 ||                    \
+                   strcmp(*av, "}") == 0)) {                   \
+                       prev = NULL;                            \
+                       open_par = 0;                           \
+                       av++;                                   \
+               } else                                          \
+                       errx(EX_USAGE, "missing \")\"\n");      \
+       }
+
+#define NOT_BLOCK                                              \
+       if (av[0] && _substrcmp(*av, "not") == 0) {             \
+               if (cmd->len & F_NOT)                           \
+                       errx(EX_USAGE, "double \"not\" not allowed\n"); \
+               cmd->len |= F_NOT;                              \
+               av++;                                           \
+       }
+
+#define OR_BLOCK(target)                                       \
+       if (av[0] && _substrcmp(*av, "or") == 0) {              \
+               if (prev == NULL || open_par == 0)              \
+                       errx(EX_DATAERR, "invalid OR block");   \
+               prev->len |= F_OR;                              \
+               av++;                                   \
+               goto target;                                    \
+       }                                                       \
+       CLOSE_PAR;
+
+       first_cmd = cmd;
+
+#if 0
+       /*
+        * MAC addresses, optional.
+        * If we have this, we skip the part "proto from src to dst"
+        * and jump straight to the option parsing.
+        */
+       NOT_BLOCK;
+       NEED1("missing protocol");
+       if (_substrcmp(*av, "MAC") == 0 ||
+           _substrcmp(*av, "mac") == 0) {
+               av++;                   /* the "MAC" keyword */
+               add_mac(cmd, av);       /* exits in case of errors */
+               cmd = next_cmd(cmd);
+               av += 2;                /* dst-mac and src-mac */
+               NOT_BLOCK;
+               NEED1("missing mac type");
+               if (add_mactype(cmd, av[0]))
+                       cmd = next_cmd(cmd);
+               av++;                   /* any or mac-type */
+               goto read_options;
+       }
+#endif
+
+       /*
+        * protocol, mandatory
+        */
+    OR_START(get_proto);
+       NOT_BLOCK;
+       NEED1("missing protocol");
+       if (add_proto_compat(cmd, *av, &proto)) {
+               av++;
+               if (F_LEN(cmd) != 0) {
+                       prev = cmd;
+                       cmd = next_cmd(cmd);
+               }
+       } else if (first_cmd != cmd) {
+               errx(EX_DATAERR, "invalid protocol ``%s''", *av);
+       } else
+               goto read_options;
+    OR_BLOCK(get_proto);
+
+       /*
+        * "from", mandatory
+        */
+       if ((av[0] == NULL) || _substrcmp(*av, "from") != 0)
+               errx(EX_USAGE, "missing ``from''");
+       av++;
+
+       /*
+        * source IP, mandatory
+        */
+    OR_START(source_ip);
+       NOT_BLOCK;      /* optional "not" */
+       NEED1("missing source address");
+       if (add_src(cmd, *av, proto)) {
+               av++;
+               if (F_LEN(cmd) != 0) {  /* ! any */
+                       prev = cmd;
+                       cmd = next_cmd(cmd);
+               }
+       } else
+               errx(EX_USAGE, "bad source address %s", *av);
+    OR_BLOCK(source_ip);
+
+       /*
+        * source ports, optional
+        */
+       NOT_BLOCK;      /* optional "not" */
+       if ( av[0] != NULL ) {
+               if (_substrcmp(*av, "any") == 0 ||
+                   add_ports(cmd, *av, proto, O_IP_SRCPORT)) {
+                       av++;
+                       if (F_LEN(cmd) != 0)
+                               cmd = next_cmd(cmd);
+               }
+       }
+
+       /*
+        * "to", mandatory
+        */
+       if ( (av[0] == NULL) || _substrcmp(*av, "to") != 0 )
+               errx(EX_USAGE, "missing ``to''");
+       av++;
+
+       /*
+        * destination, mandatory
+        */
+    OR_START(dest_ip);
+       NOT_BLOCK;      /* optional "not" */
+       NEED1("missing dst address");
+       if (add_dst(cmd, *av, proto)) {
+               av++;
+               if (F_LEN(cmd) != 0) {  /* ! any */
+                       prev = cmd;
+                       cmd = next_cmd(cmd);
+               }
+       } else
+               errx( EX_USAGE, "bad destination address %s", *av);
+    OR_BLOCK(dest_ip);
+
+       /*
+        * dest. ports, optional
+        */
+       NOT_BLOCK;      /* optional "not" */
+       if (av[0]) {
+               if (_substrcmp(*av, "any") == 0 ||
+                   add_ports(cmd, *av, proto, O_IP_DSTPORT)) {
+                       av++;
+                       if (F_LEN(cmd) != 0)
+                               cmd = next_cmd(cmd);
+               }
+       }
+
+read_options:
+       if (av[0] && first_cmd == cmd) {
+               /*
+                * nothing specified so far, store in the rule to ease
+                * printout later.
+                */
+                rule->_pad = 1;
+       }
+       prev = NULL;
+       while ( av[0] != NULL ) {
+               char *s;
+               ipfw_insn_u32 *cmd32;   /* alias for cmd */
+
+               s = *av;
+               cmd32 = (ipfw_insn_u32 *)cmd;
+
+               if (*s == '!') {        /* alternate syntax for NOT */
+                       if (cmd->len & F_NOT)
+                               errx(EX_USAGE, "double \"not\" not allowed\n");
+                       cmd->len = F_NOT;
+                       s++;
+               }
+               i = match_token(rule_options, s);
+               av++;
+               switch(i) {
+               case TOK_NOT:
+                       if (cmd->len & F_NOT)
+                               errx(EX_USAGE, "double \"not\" not allowed\n");
+                       cmd->len = F_NOT;
+                       break;
+
+               case TOK_OR:
+                       if (open_par == 0 || prev == NULL)
+                               errx(EX_USAGE, "invalid \"or\" block\n");
+                       prev->len |= F_OR;
+                       break;
+
+               case TOK_STARTBRACE:
+                       if (open_par)
+                               errx(EX_USAGE, "+nested \"(\" not allowed\n");
+                       open_par = 1;
+                       break;
+
+               case TOK_ENDBRACE:
+                       if (!open_par)
+                               errx(EX_USAGE, "+missing \")\"\n");
+                       open_par = 0;
+                       prev = NULL;
+                       break;
+
+               case TOK_IN:
+                       fill_cmd(cmd, O_IN, 0, 0);
+                       break;
+
+               case TOK_OUT:
+                       cmd->len ^= F_NOT; /* toggle F_NOT */
+                       fill_cmd(cmd, O_IN, 0, 0);
+                       break;
+
+               case TOK_DIVERTED:
+                       fill_cmd(cmd, O_DIVERTED, 0, 3);
+                       break;
+
+               case TOK_DIVERTEDLOOPBACK:
+                       fill_cmd(cmd, O_DIVERTED, 0, 1);
+                       break;
+
+               case TOK_DIVERTEDOUTPUT:
+                       fill_cmd(cmd, O_DIVERTED, 0, 2);
+                       break;
+
+               case TOK_FRAG:
+                       fill_cmd(cmd, O_FRAG, 0, 0);
+                       break;
+
+               case TOK_LAYER2:
+                       fill_cmd(cmd, O_LAYER2, 0, 0);
+                       break;
+
+               case TOK_XMIT:
+               case TOK_RECV:
+               case TOK_VIA:
+                       NEED1("recv, xmit, via require interface name"
+                               " or address");
+                       fill_iface((ipfw_insn_if *)cmd, av[0]);
+                       av++;
+                       if (F_LEN(cmd) == 0)    /* not a valid address */
+                               break;
+                       if (i == TOK_XMIT)
+                               cmd->opcode = O_XMIT;
+                       else if (i == TOK_RECV)
+                               cmd->opcode = O_RECV;
+                       else if (i == TOK_VIA)
+                               cmd->opcode = O_VIA;
+                       break;
+
+               case TOK_ICMPTYPES:
+                       NEED1("icmptypes requires list of types");
+                       fill_icmptypes((ipfw_insn_u32 *)cmd, *av);
+                       av++;
+                       break;
+
+               case TOK_ICMP6TYPES:
+                       NEED1("icmptypes requires list of types");
+                       fill_icmp6types((ipfw_insn_icmp6 *)cmd, *av);
+                       av++;
+                       break;
+
+               case TOK_IPTTL:
+                       NEED1("ipttl requires TTL");
+                       if (strpbrk(*av, "-,")) {
+                           if (!add_ports(cmd, *av, 0, O_IPTTL))
+                               errx(EX_DATAERR, "invalid ipttl %s", *av);
+                       } else
+                           fill_cmd(cmd, O_IPTTL, 0, strtoul(*av, NULL, 0));
+                       av++;
+                       break;
+
+               case TOK_IPID:
+                       NEED1("ipid requires id");
+                       if (strpbrk(*av, "-,")) {
+                           if (!add_ports(cmd, *av, 0, O_IPID))
+                               errx(EX_DATAERR, "invalid ipid %s", *av);
+                       } else
+                           fill_cmd(cmd, O_IPID, 0, strtoul(*av, NULL, 0));
+                       av++;
+                       break;
+
+               case TOK_IPLEN:
+                       NEED1("iplen requires length");
+                       if (strpbrk(*av, "-,")) {
+                           if (!add_ports(cmd, *av, 0, O_IPLEN))
+                               errx(EX_DATAERR, "invalid ip len %s", *av);
+                       } else
+                           fill_cmd(cmd, O_IPLEN, 0, strtoul(*av, NULL, 0));
+                       av++;
+                       break;
+
+               case TOK_IPVER:
+                       NEED1("ipver requires version");
+                       fill_cmd(cmd, O_IPVER, 0, strtoul(*av, NULL, 0));
+                       av++;
+                       break;
+
+               case TOK_IPPRECEDENCE:
+                       NEED1("ipprecedence requires value");
+                       fill_cmd(cmd, O_IPPRECEDENCE, 0,
+                           (strtoul(*av, NULL, 0) & 7) << 5);
+                       av++;
+                       break;
+
+               case TOK_IPOPTS:
+                       NEED1("missing argument for ipoptions");
+                       fill_flags(cmd, O_IPOPT, f_ipopts, *av);
+                       av++;
+                       break;
+
+               case TOK_IPTOS:
+                       NEED1("missing argument for iptos");
+                       fill_flags(cmd, O_IPTOS, f_iptos, *av);
+                       av++;
+                       break;
+
+               case TOK_UID:
+                       NEED1("uid requires argument");
+                   {
+                       char *end;
+                       uid_t uid;
+                       struct passwd *pwd;
+
+                       cmd->opcode = O_UID;
+                       uid = strtoul(*av, &end, 0);
+                       pwd = (*end == '\0') ? getpwuid(uid) : getpwnam(*av);
+                       if (pwd == NULL)
+                               errx(EX_DATAERR, "uid \"%s\" nonexistent", *av);
+                       cmd32->d[0] = pwd->pw_uid;
+                       cmd->len |= F_INSN_SIZE(ipfw_insn_u32);
+                       av++;
+                   }
+                       break;
+
+               case TOK_GID:
+                       NEED1("gid requires argument");
+                   {
+                       char *end;
+                       gid_t gid;
+                       struct group *grp;
+
+                       cmd->opcode = O_GID;
+                       gid = strtoul(*av, &end, 0);
+                       grp = (*end == '\0') ? getgrgid(gid) : getgrnam(*av);
+                       if (grp == NULL)
+                               errx(EX_DATAERR, "gid \"%s\" nonexistent", *av);
+                       cmd32->d[0] = grp->gr_gid;
+                       cmd->len |= F_INSN_SIZE(ipfw_insn_u32);
+                       av++;
+                   }
+                       break;
+
+               case TOK_JAIL:
+                       NEED1("jail requires argument");
+                   {
+                       char *end;
+                       int jid;
+
+                       cmd->opcode = O_JAIL;
+                       jid = (int)strtol(*av, &end, 0);
+                       if (jid < 0 || *end != '\0')
+                               errx(EX_DATAERR, "jail requires prison ID");
+                       cmd32->d[0] = (uint32_t)jid;
+                       cmd->len |= F_INSN_SIZE(ipfw_insn_u32);
+                       av++;
+                   }
+                       break;
+
+               case TOK_ESTAB:
+                       fill_cmd(cmd, O_ESTAB, 0, 0);
+                       break;
+
+               case TOK_SETUP:
+                       fill_cmd(cmd, O_TCPFLAGS, 0,
+                               (TH_SYN) | ( (TH_ACK) & 0xff) <<8 );
+                       break;
+
+               case TOK_TCPDATALEN:
+                       NEED1("tcpdatalen requires length");
+                       if (strpbrk(*av, "-,")) {
+                           if (!add_ports(cmd, *av, 0, O_TCPDATALEN))
+                               errx(EX_DATAERR, "invalid tcpdata len %s", *av);
+                       } else
+                           fill_cmd(cmd, O_TCPDATALEN, 0,
+                                   strtoul(*av, NULL, 0));
+                       av++;
+                       break;
+
+               case TOK_TCPOPTS:
+                       NEED1("missing argument for tcpoptions");
+                       fill_flags(cmd, O_TCPOPTS, f_tcpopts, *av);
+                       av++;
+                       break;
+
+               case TOK_TCPSEQ:
+               case TOK_TCPACK:
+                       NEED1("tcpseq/tcpack requires argument");
+                       cmd->len = F_INSN_SIZE(ipfw_insn_u32);
+                       cmd->opcode = (i == TOK_TCPSEQ) ? O_TCPSEQ : O_TCPACK;
+                       cmd32->d[0] = htonl(strtoul(*av, NULL, 0));
+                       av++;
+                       break;
+
+               case TOK_TCPWIN:
+                       NEED1("tcpwin requires length");
+                       fill_cmd(cmd, O_TCPWIN, 0,
+                           htons(strtoul(*av, NULL, 0)));
+                       av++;
+                       break;
+
+               case TOK_TCPFLAGS:
+                       NEED1("missing argument for tcpflags");
+                       cmd->opcode = O_TCPFLAGS;
+                       fill_flags(cmd, O_TCPFLAGS, f_tcpflags, *av);
+                       av++;
+                       break;
+
+               case TOK_KEEPSTATE:
+                       if (open_par)
+                               errx(EX_USAGE, "keep-state cannot be part "
+                                   "of an or block");
+                       if (have_state)
+                               errx(EX_USAGE, "only one of keep-state "
+                                       "and limit is allowed");
+                       have_state = cmd;
+                       fill_cmd(cmd, O_KEEP_STATE, 0, 0);
+                       break;
+
+               case TOK_LIMIT: {
+                       ipfw_insn_limit *c = (ipfw_insn_limit *)cmd;
+                       int val;
+
+                       if (open_par)
+                               errx(EX_USAGE,
+                                   "limit cannot be part of an or block");
+                       if (have_state)
+                               errx(EX_USAGE, "only one of keep-state and "
+                                   "limit is allowed");
+                       have_state = cmd;
+
+                       cmd->len = F_INSN_SIZE(ipfw_insn_limit);
+                       cmd->opcode = O_LIMIT;
+                       c->limit_mask = c->conn_limit = 0;
+
+                       while ( av[0] != NULL ) {
+                               if ((val = match_token(limit_masks, *av)) <= 0)
+                                       break;
+                               c->limit_mask |= val;
+                               av++;
+                       }
+
+                       if (c->limit_mask == 0)
+                               errx(EX_USAGE, "limit: missing limit mask");
+
+                       GET_UINT_ARG(c->conn_limit, IPFW_ARG_MIN, IPFW_ARG_MAX,
+                           TOK_LIMIT, rule_options);
+
+                       av++;
+                       break;
+               }
+
+               case TOK_PROTO:
+                       NEED1("missing protocol");
+                       if (add_proto(cmd, *av, &proto)) {
+                               av++;
+                       } else
+                               errx(EX_DATAERR, "invalid protocol ``%s''",
+                                   *av);
+                       break;
+
+               case TOK_SRCIP:
+                       NEED1("missing source IP");
+                       if (add_srcip(cmd, *av)) {
+                               av++;
+                       }
+                       break;
+
+               case TOK_DSTIP:
+                       NEED1("missing destination IP");
+                       if (add_dstip(cmd, *av)) {
+                               av++;
+                       }
+                       break;
+
+               case TOK_SRCIP6:
+                       NEED1("missing source IP6");
+                       if (add_srcip6(cmd, *av)) {
+                               av++;
+                       }
+                       break;
+
+               case TOK_DSTIP6:
+                       NEED1("missing destination IP6");
+                       if (add_dstip6(cmd, *av)) {
+                               av++;
+                       }
+                       break;
+
+               case TOK_SRCPORT:
+                       NEED1("missing source port");
+                       if (_substrcmp(*av, "any") == 0 ||
+                           add_ports(cmd, *av, proto, O_IP_SRCPORT)) {
+                               av++;
+                       } else
+                               errx(EX_DATAERR, "invalid source port %s", *av);
+                       break;
+
+               case TOK_DSTPORT:
+                       NEED1("missing destination port");
+                       if (_substrcmp(*av, "any") == 0 ||
+                           add_ports(cmd, *av, proto, O_IP_DSTPORT)) {
+                               av++;
+                       } else
+                               errx(EX_DATAERR, "invalid destination port %s",
+                                   *av);
+                       break;
+
+               case TOK_MAC:
+                       if (add_mac(cmd, av))
+                               av += 2;
+                       break;
+
+               case TOK_MACTYPE:
+                       NEED1("missing mac type");
+                       if (!add_mactype(cmd, *av))
+                               errx(EX_DATAERR, "invalid mac type %s", *av);
+                       av++;
+                       break;
+
+               case TOK_VERREVPATH:
+                       fill_cmd(cmd, O_VERREVPATH, 0, 0);
+                       break;
+
+               case TOK_VERSRCREACH:
+                       fill_cmd(cmd, O_VERSRCREACH, 0, 0);
+                       break;
+
+               case TOK_ANTISPOOF:
+                       fill_cmd(cmd, O_ANTISPOOF, 0, 0);
+                       break;
+
+               case TOK_IPSEC:
+                       fill_cmd(cmd, O_IPSEC, 0, 0);
+                       break;
+
+               case TOK_IPV6:
+                       fill_cmd(cmd, O_IP6, 0, 0);
+                       break;
+
+               case TOK_IPV4:
+                       fill_cmd(cmd, O_IP4, 0, 0);
+                       break;
+
+               case TOK_EXT6HDR:
+                       fill_ext6hdr( cmd, *av );
+                       av++;
+                       break;
+
+               case TOK_FLOWID:
+                       if (proto != IPPROTO_IPV6 )
+                               errx( EX_USAGE, "flow-id filter is active "
+                                   "only for ipv6 protocol\n");
+                       fill_flow6( (ipfw_insn_u32 *) cmd, *av );
+                       av++;
+                       break;
+
+               case TOK_COMMENT:
+                       fill_comment(cmd, av);
+                       av[0]=NULL;
+                       break;
+
+               case TOK_TAGGED:
+                       if (av[0] && strpbrk(*av, "-,")) {
+                               if (!add_ports(cmd, *av, 0, O_TAGGED))
+                                       errx(EX_DATAERR, "tagged: invalid tag"
+                                           " list: %s", *av);
+                       }
+                       else {
+                               uint16_t tag;
+
+                               GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX,
+                                   TOK_TAGGED, rule_options);
+                               fill_cmd(cmd, O_TAGGED, 0, tag);
+                       }
+                       av++;
+                       break;
+
+               case TOK_FIB:
+                       NEED1("fib requires fib number");
+                       fill_cmd(cmd, O_FIB, 0, strtoul(*av, NULL, 0));
+                       av++;
+                       break;
+               case TOK_SOCKARG:
+                       fill_cmd(cmd, O_SOCKARG, 0, 0);
+                       break;
+
+               case TOK_LOOKUP: {
+                       ipfw_insn_u32 *c = (ipfw_insn_u32 *)cmd;
+                       char *p;
+                       int j;
+
+                       if (!av[0] || !av[1])
+                               errx(EX_USAGE, "format: lookup argument tablenum");
+                       cmd->opcode = O_IP_DST_LOOKUP;
+                       cmd->len |= F_INSN_SIZE(ipfw_insn) + 2;
+                       i = match_token(rule_options, *av);
+                       for (j = 0; lookup_key[j] >= 0 ; j++) {
+                               if (i == lookup_key[j])
+                                       break;
+                       }
+                       if (lookup_key[j] <= 0)
+                               errx(EX_USAGE, "format: cannot lookup on %s", *av);
+                       __PAST_END(c->d, 1) = j; // i converted to option
+                       av++;
+                       cmd->arg1 = strtoul(*av, &p, 0);
+                       if (p && *p)
+                               errx(EX_USAGE, "format: lookup argument tablenum");
+                       av++;
+                   }
+                       break;
+
+               default:
+                       errx(EX_USAGE, "unrecognised option [%d] %s\n", i, s);
+               }
+               if (F_LEN(cmd) > 0) {   /* prepare to advance */
+                       prev = cmd;
+                       cmd = next_cmd(cmd);
+               }
+       }
+
+done:
+       /*
+        * Now copy stuff into the rule.
+        * If we have a keep-state option, the first instruction
+        * must be a PROBE_STATE (which is generated here).
+        * If we have a LOG option, it was stored as the first command,
+        * and now must be moved to the top of the action part.
+        */
+       dst = (ipfw_insn *)rule->cmd;
+
+       /*
+        * First thing to write into the command stream is the match probability.
+        */
+       if (match_prob != 1) { /* 1 means always match */
+               dst->opcode = O_PROB;
+               dst->len = 2;
+               *((int32_t *)(dst+1)) = (int32_t)(match_prob * 0x7fffffff);
+               dst += dst->len;
+       }
+
+       /*
+        * generate O_PROBE_STATE if necessary
+        */
+       if (have_state && have_state->opcode != O_CHECK_STATE) {
+               fill_cmd(dst, O_PROBE_STATE, 0, 0);
+               dst = next_cmd(dst);
+       }
+
+       /* copy all commands but O_LOG, O_KEEP_STATE, O_LIMIT, O_ALTQ, O_TAG */
+       for (src = (ipfw_insn *)cmdbuf; src != cmd; src += i) {
+               i = F_LEN(src);
+
+               switch (src->opcode) {
+               case O_LOG:
+               case O_KEEP_STATE:
+               case O_LIMIT:
+               case O_ALTQ:
+               case O_TAG:
+                       break;
+               default:
+                       bcopy(src, dst, i * sizeof(uint32_t));
+                       dst += i;
+               }
+       }
+
+       /*
+        * put back the have_state command as last opcode
+        */
+       if (have_state && have_state->opcode != O_CHECK_STATE) {
+               i = F_LEN(have_state);
+               bcopy(have_state, dst, i * sizeof(uint32_t));
+               dst += i;
+       }
+       /*
+        * start action section
+        */
+       rule->act_ofs = dst - rule->cmd;
+
+       /* put back O_LOG, O_ALTQ, O_TAG if necessary */
+       if (have_log) {
+               i = F_LEN(have_log);
+               bcopy(have_log, dst, i * sizeof(uint32_t));
+               dst += i;
+       }
+       if (have_altq) {
+               i = F_LEN(have_altq);
+               bcopy(have_altq, dst, i * sizeof(uint32_t));
+               dst += i;
+       }
+       if (have_tag) {
+               i = F_LEN(have_tag);
+               bcopy(have_tag, dst, i * sizeof(uint32_t));
+               dst += i;
+       }
+       /*
+        * copy all other actions
+        */
+       for (src = (ipfw_insn *)actbuf; src != action; src += i) {
+               i = F_LEN(src);
+               bcopy(src, dst, i * sizeof(uint32_t));
+               dst += i;
+       }
+
+       rule->cmd_len = (uint32_t *)dst - (uint32_t *)(rule->cmd);
+       i = (char *)dst - (char *)rule;
+       if (do_cmd(IP_FW_ADD, rule, (uintptr_t)&i) == -1)
+               err(EX_UNAVAILABLE, "getsockopt(%s)", "IP_FW_ADD");
+       if (!co.do_quiet)
+               show_ipfw(rule, 0, 0);
+}
+
+/*
+ * Clear the accounting counters (optname == 0, IP_FW_ZERO) or the
+ * logging counters (optname != 0, IP_FW_RESETLOG).
+ *
+ * av[0] is skipped.  With no further arguments a single request without
+ * argument is issued ("clear all entries").  Otherwise every remaining
+ * argument must be a rule number in the range 0..0xffff; each one is sent
+ * in its own request, with the set encoded in the upper bits when
+ * co.use_set is non-zero.
+ *
+ * A failing per-rule request is reported with warn() and the remaining
+ * rules are still processed; the program then exits with EX_UNAVAILABLE.
+ * A malformed rule number terminates the program through errx().
+ */
+void
+ipfw_zero(int ac, char *av[], int optname /* 0 = IP_FW_ZERO, 1 = IP_FW_RESETLOG */)
+{
+       uint32_t arg, saved_arg;
+       int failed = EX_OK;
+       char const *errstr;
+       char const *name = optname ? "RESETLOG" : "ZERO";
+
+       /* map the boolean selector onto the real socket option */
+       optname = optname ? IP_FW_RESETLOG : IP_FW_ZERO;
+
+       av++; ac--;
+
+       if (!ac) {
+               /* clear all entries */
+               if (do_cmd(optname, NULL, 0) < 0)
+                       err(EX_UNAVAILABLE, "setsockopt(IP_FW_%s)", name);
+               if (!co.do_quiet)
+                       printf("%s.\n", optname == IP_FW_ZERO ?
+                           "Accounting cleared":"Logging counts reset");
+
+               return;
+       }
+
+       while (ac) {
+               /*
+                * Rule number.  The cast keeps isdigit() well defined when
+                * the first byte is negative as a plain char.
+                */
+               if (!isdigit((unsigned char)**av))
+                       errx(EX_USAGE, "invalid rule number ``%s''", *av);
+
+               arg = strtonum(*av, 0, 0xffff, &errstr);
+               if (errstr)
+                       errx(EX_DATAERR,
+                           "invalid rule number %s\n", *av);
+               /* keep the plain rule number for the messages below */
+               saved_arg = arg;
+               if (co.use_set)
+                       arg |= (1 << 24) | ((co.use_set - 1) << 16);
+               av++;
+               ac--;
+               if (do_cmd(optname, &arg, sizeof(arg))) {
+                       warn("rule %u: setsockopt(IP_FW_%s)",
+                           saved_arg, name);
+                       failed = EX_UNAVAILABLE;
+               } else if (!co.do_quiet)
+                       /* saved_arg is unsigned: print it with %u */
+                       printf("Entry %u %s.\n", saved_arg,
+                           optname == IP_FW_ZERO ?
+                               "cleared" : "logging count reset");
+       }
+       if (failed != EX_OK)
+               exit(failed);
+}
+
+/*
+ * Flush all firewall rules, or hand over to dummynet_flush() when
+ * co.do_pipe is set.
+ *
+ * Unless 'force' or co.do_quiet is set, the user is asked once for
+ * confirmation on stdin.  Input lines are read until one starts with
+ * 'y' or 'n' (either case).  An answer of 'n', end of file, or a read
+ * error returns without flushing.
+ *
+ * With co.use_set set, only that set is deleted (IP_FW_DEL) instead of
+ * flushing everything.  A failing request terminates the program via err().
+ */
+void
+ipfw_flush(int force)
+{
+       int cmd = co.do_pipe ? IP_DUMMYNET_FLUSH : IP_FW_FLUSH;
+
+       if (!force && !co.do_quiet) { /* need to ask user */
+               int c, rest;
+
+               printf("Are you sure? [yn] ");
+               fflush(stdout);
+               do {
+                       /*
+                        * Test the raw getc() result: EOF covers both end
+                        * of file and a read error, so a failing stdin
+                        * cannot keep this loop spinning.
+                        */
+                       c = getc(stdin);
+                       if (c == EOF)
+                               return; /* and do not flush */
+                       c = toupper(c);
+                       /* discard the remainder of the line */
+                       for (rest = c; rest != '\n'; ) {
+                               rest = getc(stdin);
+                               if (rest == EOF)
+                                       return; /* and do not flush */
+                       }
+               } while (c != 'Y' && c != 'N');
+               printf("\n");
+               if (c == 'N')   /* user said no */
+                       return;
+       }
+       if (co.do_pipe) {
+               dummynet_flush();
+               return;
+       }
+       /*
+        * From here on co.do_pipe is zero (the pipe case returned above),
+        * so the do_pipe selections below always take their "FW"/"rules"
+        * branch.
+        */
+       /* `ipfw set N flush` - is the same that `ipfw delete set N` */
+       if (co.use_set) {
+               uint32_t arg = ((co.use_set - 1) & 0xffff) | (1 << 24);
+               if (do_cmd(IP_FW_DEL, &arg, sizeof(arg)) < 0)
+                       err(EX_UNAVAILABLE, "setsockopt(IP_FW_DEL)");
+       } else if (do_cmd(cmd, NULL, 0) < 0)
+               err(EX_UNAVAILABLE, "setsockopt(IP_%s_FLUSH)",
+                   co.do_pipe ? "DUMMYNET" : "FW");
+       if (!co.do_quiet)
+               printf("Flushed all %s.\n", co.do_pipe ? "pipes" : "rules");
+}
+
+
+static void table_list(ipfw_table_entry ent, int need_header);
+
+/*
+ * This one handles all table-related commands
+ *     ipfw table N add addr[/masklen] [value]
+ *     ipfw table N delete addr[/masklen]
+ *     ipfw table {N | all} flush
+ *     ipfw table {N | all} list
+ */
+void
+ipfw_table_handler(int ac, char *av[])
+{
+       ipfw_table_entry ent;
+       int do_add;
+       int is_all;
+       size_t len;
+       char *p;
+       uint32_t a;
+       uint32_t tables_max;
+
+       len = sizeof(tables_max);
+       if (sysctlbyname("net.inet.ip.fw.tables_max", &tables_max, &len,
+               NULL, 0) == -1) {
+#ifdef IPFW_TABLES_MAX
+               warn("Warn: Failed to get the max tables number via sysctl. "
+                    "Using the compiled in defaults. \nThe reason was");
+               tables_max = IPFW_TABLES_MAX;
+#else
+               errx(1, "Failed sysctlbyname(\"net.inet.ip.fw.tables_max\")");
+#endif
+       }
+
+       ac--; av++;
+       if (ac && isdigit(**av)) {
+               ent.tbl = atoi(*av);
+               is_all = 0;
+               ac--; av++;
+       } else if (ac && _substrcmp(*av, "all") == 0) {
+               ent.tbl = 0;
+               is_all = 1;
+               ac--; av++;
+       } else
+               errx(EX_USAGE, "table number or 'all' keyword required");
+       if (ent.tbl >= tables_max)
+               errx(EX_USAGE, "The table number exceeds the maximum allowed "
+                       "value (%d)", tables_max - 1);
+       NEED1("table needs command");
+       if (is_all && _substrcmp(*av, "list") != 0
+                  && _substrcmp(*av, "flush") != 0)
+               errx(EX_USAGE, "table number required");
+
+       if (_substrcmp(*av, "add") == 0 ||
+           _substrcmp(*av, "delete") == 0) {
+               do_add = **av == 'a';
+               ac--; av++;
+               if (!ac)
+                       errx(EX_USAGE, "IP address required");
+               p = strchr(*av, '/');
+               if (p) {
+                       *p++ = '\0';
+                       ent.masklen = atoi(p);
+                       if (ent.masklen > 32)
+                               errx(EX_DATAERR, "bad width ``%s''", p);
+               } else
+                       ent.masklen = 32;
+               if (lookup_host(*av, (struct in_addr *)&ent.addr) != 0)
+                       errx(EX_NOHOST, "hostname ``%s'' unknown", *av);
+               ac--; av++;
+               if (do_add && ac) {
+                       unsigned int tval;
+                       /* isdigit is a bit of a hack here.. */
+                       if (strchr(*av, (int)'.') == NULL && isdigit(**av))  {
+                               ent.value = strtoul(*av, NULL, 0);
+                       } else {
+                               if (lookup_host(*av, (struct in_addr *)&tval) == 0) {
+                                       /* The value must be stored in host order        *
+                                        * so that the values < 65k can be distinguished */
+                                       ent.value = ntohl(tval);
+                               } else {
+                                       errx(EX_NOHOST, "hostname ``%s'' unknown", *av);
+                               }
+                       }
+               } else
+                       ent.value = 0;
+               if (do_cmd(do_add ? IP_FW_TABLE_ADD : IP_FW_TABLE_DEL,
+                   &ent, sizeof(ent)) < 0) {
+                       /* If running silent, don't bomb out on these errors. */
+                       if (!(co.do_quiet && (errno == (do_add ? EEXIST : ESRCH))))
+                               err(EX_OSERR, "setsockopt(IP_FW_TABLE_%s)",
+                                   do_add ? "ADD" : "DEL");
+                       /* In silent mode, react to a failed add by deleting */
+                       if (do_add) {
+                               do_cmd(IP_FW_TABLE_DEL, &ent, sizeof(ent));
+                               if (do_cmd(IP_FW_TABLE_ADD,
+                                   &ent, sizeof(ent)) < 0)
+                                       err(EX_OSERR,
+                                           "setsockopt(IP_FW_TABLE_ADD)");
+                       }
+               }
+       } else if (_substrcmp(*av, "flush") == 0) {
+               a = is_all ? tables_max : (uint32_t)(ent.tbl + 1);
+               do {
+                       if (do_cmd(IP_FW_TABLE_FLUSH, &ent.tbl,
+                           sizeof(ent.tbl)) < 0)
+                               err(EX_OSERR, "setsockopt(IP_FW_TABLE_FLUSH)");
+               } while (++ent.tbl < a);
+       } else if (_substrcmp(*av, "list") == 0) {
+               a = is_all ? tables_max : (uint32_t)(ent.tbl + 1);
+               do {
+                       table_list(ent, is_all);
+               } while (++ent.tbl < a);
+       } else
+               errx(EX_USAGE, "invalid table command %s", *av);
+}
+
+/*
+ * Print the entries of table ent.tbl, one "addr/masklen value" line each.
+ * Only the tbl member of 'ent' is read.  When need_header is non-zero and
+ * the table is not empty, a "---table(N)---" line is printed first.
+ * The value is shown as a dotted address when co.do_value_as_ip is set,
+ * as an unsigned number otherwise.  A failing request ends the program
+ * through err().
+ */
+static void
+table_list(ipfw_table_entry ent, int need_header)
+{
+       ipfw_table *list;
+       socklen_t optlen;
+       uint32_t count, idx;
+
+       /* pass the table number in; do_cmd() hands the size back in 'count' */
+       count = ent.tbl;
+       optlen = sizeof(count);
+       if (do_cmd(IP_FW_TABLE_GETSIZE, &count, (uintptr_t)&optlen) < 0)
+               err(EX_OSERR, "getsockopt(IP_FW_TABLE_GETSIZE)");
+
+       if (count == 0)
+               return;         /* empty table, nothing to show */
+
+       /* header plus room for 'count' entries */
+       optlen = sizeof(*list) + count * sizeof(ipfw_table_entry);
+       list = safe_calloc(1, optlen);
+       list->tbl = ent.tbl;
+       if (do_cmd(IP_FW_TABLE_LIST, list, (uintptr_t)&optlen) < 0)
+               err(EX_OSERR, "getsockopt(IP_FW_TABLE_LIST)");
+
+       if (need_header && list->cnt)
+               printf("---table(%d)---\n", list->tbl);
+
+       for (idx = 0; idx < list->cnt; idx++) {
+               char addrtxt[128];
+               unsigned int val = list->ent[idx].value;
+
+               /*
+                * Keep a private copy of the address text: inet_ntoa()
+                * is used a second time below when the value is printed
+                * as an address.
+                */
+               strncpy(addrtxt, inet_ntoa(*(struct in_addr *)
+                       &list->ent[idx].addr), sizeof(addrtxt) - 1);
+
+               if (!co.do_value_as_ip) {
+                       printf("%s/%u %u\n", addrtxt,
+                               list->ent[idx].masklen, val);
+                       continue;
+               }
+               /* inet_ntoa expects network order */
+               val = htonl(val);
+               printf("%s/%u %s\n", addrtxt, list->ent[idx].masklen,
+                       inet_ntoa(*(struct in_addr *)&val));
+       }
+       free(list);
+}
diff --git a/ipfw/ipfw2.h b/ipfw/ipfw2.h
new file mode 100644 (file)
index 0000000..1f280f5
--- /dev/null
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2002-2003 Luigi Rizzo
+ * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp
+ * Copyright (c) 1994 Ugen J.S.Antsilevich
+ *
+ * Idea and grammar partially left from:
+ * Copyright (c) 1993 Daniel Boulet
+ *
+ * Redistribution and use in source forms, with and without modification,
+ * are permitted provided that this entire comment appears intact.
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ *
+ * NEW command line interface for IP firewall facility
+ *
+ * $FreeBSD: head/sbin/ipfw/ipfw2.h 206843 2010-04-19 15:11:45Z luigi $
+ */
+
+/*
+ * Options that can be set on the command line.
+ * When reading commands from a file, a subset of the options can also
+ * be applied globally by specifying them before the file name.
+ * After that, each line can contain its own option that changes
+ * the global value.
+ * XXX The context is not restored after each line.
+ */
+
+/*
+ * Command line option state. All fields are plain ints; a single
+ * instance, co, is declared right after the structure.
+ */
+struct cmdline_opts {
+       /* boolean options: */
+       int     do_value_as_ip; /* show table value as IP */
+       int     do_resolv;      /* try to resolve all IPs to names */
+       int     do_time;        /* show time stamps */
+       int     do_quiet;       /* be quiet in add and flush */
+       int     do_pipe;        /* this cmd refers to a pipe/queue/sched */
+       int     do_nat;         /* this cmd refers to a nat config */
+       int     do_dynamic;     /* display dynamic rules */
+       int     do_expired;     /* display expired dynamic rules */
+       int     do_compact;     /* show rules in compact mode */
+       int     do_force;       /* do not ask for confirmation */
+       int     show_sets;      /* display the set each rule belongs to */
+       int     test_only;      /* only check syntax */
+       int     comment_only;   /* only print action and comment */
+       int     verbose;        /* be verbose on some commands */
+
+       /* The options below can have multiple values. */
+
+       int     do_sort;        /* field to sort results by (0 = no sorting) */
+               /* valid fields are 1 and above */
+
+       int     use_set;        /* work with specified set number */
+               /* 0 means all sets, otherwise apply to set use_set - 1 */
+
+};
+
+/* the option state shared by all files that include this header */
+extern struct cmdline_opts co;
+
+/*
+ * _s_x is a structure that stores string <-> token pairs, used in
+ * various places in the parser. Entries are stored in arrays,
+ * terminated by an entry with s=NULL.
+ * The search routines are match_token() and match_value().
+ * Often, an element with x=0 contains an error string.
+ */
+struct _s_x {
+       char const *s;  /* the string; NULL marks the end of a table */
+       int x;          /* the token/value paired with s */
+};
+
+/*
+ * Token values for the parser.
+ * NOTE(review): presumably these are the x values of the struct _s_x
+ * keyword tables defined in the .c files; those tables are not visible
+ * from this header, confirm there.
+ * Only TOK_NULL has an explicit value (0); every other enumerator gets
+ * its value from its position, so inserting or reordering entries
+ * renumbers all the ones that follow.
+ */
+enum tokens {
+       TOK_NULL=0,
+
+       TOK_OR,
+       TOK_NOT,
+       TOK_STARTBRACE,
+       TOK_ENDBRACE,
+
+       TOK_ACCEPT,
+       TOK_COUNT,
+       TOK_PIPE,
+       TOK_LINK,
+       TOK_QUEUE,
+       TOK_FLOWSET,
+       TOK_SCHED,
+       TOK_DIVERT,
+       TOK_TEE,
+       TOK_NETGRAPH,
+       TOK_NGTEE,
+       TOK_FORWARD,
+       TOK_SKIPTO,
+       TOK_DENY,
+       TOK_REJECT,
+       TOK_RESET,
+       TOK_UNREACH,
+       TOK_CHECKSTATE,
+       TOK_NAT,
+       TOK_REASS,
+       TOK_CALL,
+       TOK_RETURN,
+
+       TOK_ALTQ,
+       TOK_LOG,
+       TOK_TAG,
+       TOK_UNTAG,
+
+       TOK_TAGGED,
+       TOK_UID,
+       TOK_GID,
+       TOK_JAIL,
+       TOK_IN,
+       TOK_LIMIT,
+       TOK_KEEPSTATE,
+       TOK_LAYER2,
+       TOK_OUT,
+       TOK_DIVERTED,
+       TOK_DIVERTEDLOOPBACK,
+       TOK_DIVERTEDOUTPUT,
+       TOK_XMIT,
+       TOK_RECV,
+       TOK_VIA,
+       TOK_FRAG,
+       TOK_IPOPTS,
+       TOK_IPLEN,
+       TOK_IPID,
+       TOK_IPPRECEDENCE,
+       TOK_DSCP,
+       TOK_IPTOS,
+       TOK_IPTTL,
+       TOK_IPVER,
+       TOK_ESTAB,
+       TOK_SETUP,
+       TOK_TCPDATALEN,
+       TOK_TCPFLAGS,
+       TOK_TCPOPTS,
+       TOK_TCPSEQ,
+       TOK_TCPACK,
+       TOK_TCPWIN,
+       TOK_ICMPTYPES,
+       TOK_MAC,
+       TOK_MACTYPE,
+       TOK_VERREVPATH,
+       TOK_VERSRCREACH,
+       TOK_ANTISPOOF,
+       TOK_IPSEC,
+       TOK_COMMENT,
+
+       TOK_PLR,
+       TOK_NOERROR,
+       TOK_BUCKETS,
+       TOK_DSTIP,
+       TOK_SRCIP,
+       TOK_DSTPORT,
+       TOK_SRCPORT,
+       TOK_ALL,
+       TOK_MASK,
+       TOK_FLOW_MASK,
+       TOK_SCHED_MASK,
+       TOK_BW,
+       TOK_DELAY,
+       TOK_PROFILE,
+       TOK_BURST,
+       TOK_RED,
+       TOK_GRED,
+       TOK_DROPTAIL,
+       TOK_PROTO,
+       /* dummynet tokens */
+       TOK_WEIGHT,
+       TOK_LMAX,
+       TOK_PRI,
+       TOK_TYPE,
+       TOK_SLOTSIZE,
+
+       TOK_IP,
+       TOK_IF,
+       TOK_ALOG,
+       TOK_DENY_INC,
+       TOK_SAME_PORTS,
+       TOK_UNREG_ONLY,
+       TOK_SKIP_GLOBAL,
+       TOK_RESET_ADDR,
+       TOK_ALIAS_REV,
+       TOK_PROXY_ONLY,
+       TOK_REDIR_ADDR,
+       TOK_REDIR_PORT,
+       TOK_REDIR_PROTO,
+
+       TOK_IPV6,
+       TOK_FLOWID,
+       TOK_ICMP6TYPES,
+       TOK_EXT6HDR,
+       TOK_DSTIP6,
+       TOK_SRCIP6,
+
+       TOK_IPV4,
+       TOK_UNREACH6,
+       TOK_RESET6,
+
+       TOK_FIB,
+       TOK_SETFIB,
+       TOK_LOOKUP,
+       TOK_SOCKARG,
+};
+/*
+ * The following macros exit with a usage error, errx(EX_USAGE, msg),
+ * when we run out of arguments: NEED() tests the expression _p,
+ * NEED1() tests *av, so it can only be used where a variable named
+ * av is in scope.
+ * _p is parenthesized so that an argument such as "a == b" is negated
+ * as a whole. The brace form is kept (rather than do { } while (0))
+ * so that existing call sites keep compiling unchanged.
+ */
+#define NEED(_p, msg)      {if (!(_p)) errx(EX_USAGE, msg);}
+#define NEED1(msg)      {if (!(*av)) errx(EX_USAGE, msg);}
+
+/*
+ * NOTE(review): presumably prints *pd in a field of the given width;
+ * the definition is not in this header, confirm there.
+ */
+int pr_u64(uint64_t *pd, int width);
+
+/* memory allocation support */
+void *safe_calloc(size_t number, size_t size);
+void *safe_realloc(void *ptr, size_t size);
+
+/* string comparison functions used for historical compatibility */
+int _substrcmp(const char *str1, const char* str2);
+int _substrcmp2(const char *str1, const char* str2, const char* str3);
+
+/*
+ * utility functions: lookups in the NULL-terminated struct _s_x tables,
+ * by string (match_token) and by value (match_value)
+ */
+int match_token(struct _s_x *table, char *string);
+char const *match_value(struct _s_x *p, int value);
+
+/*
+ * Issue one request identified by optname with argument buffer optval.
+ * NOTE(review): optlen is a uintptr_t, presumably so that it can carry
+ * either a length or the address of a length variable; confirm against
+ * the definition.
+ */
+int do_cmd(int optname, void *optval, uintptr_t optlen);
+
+/* forward declaration, so <netinet/in.h> is not needed just for this */
+struct in6_addr;
+void n2mask(struct in6_addr *mask, int n);
+int contigmask(uint8_t *p, int len);
+
+/*
+ * Forward declarations to avoid including way too many headers.
+ * C does not allow duplicated typedefs, so we use the base struct
+ * that the typedef points to.
+ * Should the typedefs use a different type, the compiler will
+ * still detect the change when compiling the body of the
+ * functions involved, so we do not lose error checking.
+ */
+struct _ipfw_insn;
+struct _ipfw_insn_altq;
+struct _ipfw_insn_u32;
+struct _ipfw_insn_ip6;
+struct _ipfw_insn_icmp6;
+
+/*
+ * The reserved set number. This is a constant in ip_fw.h
+ * but we store it in a variable so other files do not depend
+ * on that header just for one constant.
+ */
+extern int resvd_set_number;
+
+/* first-level command handlers */
+void ipfw_add(char *av[]);
+void ipfw_show_nat(int ac, char **av);
+void ipfw_config_pipe(int ac, char **av);
+void ipfw_config_nat(int ac, char **av);
+void ipfw_sets_handler(char *av[]);
+void ipfw_table_handler(int ac, char *av[]);
+void ipfw_sysctl_handler(char *av[], int which);
+void ipfw_delete(char *av[]);
+void ipfw_flush(int force);
+void ipfw_zero(int ac, char *av[], int optname);
+void ipfw_list(int ac, char *av[], int show_counters);
+
+/* altq.c */
+void altq_set_enabled(int enabled);
+u_int32_t altq_name_to_qid(const char *name);
+
+void print_altq_cmd(struct _ipfw_insn_altq *altqptr);
+
+/* dummynet.c */
+void dummynet_list(int ac, char *av[], int show_counters);
+void dummynet_flush(void);
+int ipfw_delete_pipe(int pipe_or_queue, int n);
+
+/* ipv6.c: printing of IPv6-related instructions */
+void print_unreach6_code(uint16_t code);
+void print_ip6(struct _ipfw_insn_ip6 *cmd, char const *s);
+void print_flow6id(struct _ipfw_insn_u32 *cmd);
+void print_icmp6types(struct _ipfw_insn_u32 *cmd);
+void print_ext6hdr(struct _ipfw_insn *cmd );
+
+/* ipv6.c: fill cmd from the address (list) in av; both return cmd */
+struct _ipfw_insn *add_srcip6(struct _ipfw_insn *cmd, char *av);
+struct _ipfw_insn *add_dstip6(struct _ipfw_insn *cmd, char *av);
+
+/* ipv6.c: parsing of IPv6-related options into instructions */
+void fill_flow6(struct _ipfw_insn_u32 *cmd, char *av );
+void fill_unreach6_code(u_short *codep, char *str);
+void fill_icmp6types(struct _ipfw_insn_icmp6 *cmd, char *av);
+int fill_ext6hdr(struct _ipfw_insn *cmd, char *av);
diff --git a/ipfw/ipv6.c b/ipfw/ipv6.c
new file mode 100644 (file)
index 0000000..3cfc4df
--- /dev/null
@@ -0,0 +1,501 @@
+/*
+ * Copyright (c) 2002-2003 Luigi Rizzo
+ * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp
+ * Copyright (c) 1994 Ugen J.S.Antsilevich
+ *
+ * Idea and grammar partially left from:
+ * Copyright (c) 1993 Daniel Boulet
+ *
+ * Redistribution and use in source forms, with and without modification,
+ * are permitted provided that this entire comment appears intact.
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ *
+ * NEW command line interface for IP firewall facility
+ *
+ * $FreeBSD: user/luigi/ipfw3-head/sbin/ipfw/ipv6.c 187770 2009-01-27 12:01:30Z luigi $
+ *
+ * ipv6 support
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include "ipfw2.h"
+
+#include <err.h>
+#include <netdb.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/icmp6.h>
+#include <netinet/ip_fw.h>
+#include <arpa/inet.h>
+
+/*
+ * Symbolic names for the ICMPv6 destination-unreachable codes.
+ * Used by fill_unreach6_code() to parse a name and by
+ * print_unreach6_code() to print one; terminated by a NULL entry.
+ */
+static struct _s_x icmp6codes[] = {
+      { "no-route",            ICMP6_DST_UNREACH_NOROUTE },
+      { "admin-prohib",                ICMP6_DST_UNREACH_ADMIN },
+      { "address",             ICMP6_DST_UNREACH_ADDR },
+      { "port",                        ICMP6_DST_UNREACH_NOPORT },
+      { NULL, 0 }
+};
+
+/*
+ * Parse an ICMPv6 unreachable code from str and store it in *codep.
+ * str is either a number below 0x100 (in any base accepted by
+ * strtoul) or one of the names in icmp6codes[]. Anything else
+ * terminates the program with EX_DATAERR.
+ */
+void
+fill_unreach6_code(u_short *codep, char *str)
+{
+       unsigned long num;
+       int val;
+       char *s;
+
+       /*
+        * Range-check the number as an unsigned long: storing it in an
+        * int first would truncate large inputs (with a 64-bit long,
+        * 2^32 becomes 0) and let them through as valid codes.
+        */
+       num = strtoul(str, &s, 0);
+       if (s == str || *s != '\0' || num >= 0x100)
+               val = match_token(icmp6codes, str);
+       else
+               val = (int)num;
+       if (val < 0)
+               errx(EX_DATAERR, "unknown ICMPv6 unreachable code ``%s''", str);
+       *codep = val;
+}
+
+/*
+ * Print "unreach6 " followed by the symbolic name of code when
+ * icmp6codes[] has one, or by its numeric value otherwise.
+ */
+void
+print_unreach6_code(uint16_t code)
+{
+       char const *name;
+
+       name = match_value(icmp6codes, code);
+       if (name == NULL) {
+               printf("unreach6 %u", code);
+               return;
+       }
+       printf("unreach6 %s", name);
+}
+
+/*
+ * Print the IPv6 address(es) contained in a command.
+ * The output starts with an optional " not" and the prefix s, followed
+ * by "me6", " ip6" or a comma-separated list of addresses. Each
+ * address is printed as a host name (when co.do_resolv is set and a
+ * full-length address resolves), "any", addr, addr/len or addr:mask.
+ */
+void
+print_ip6(ipfw_insn_ip6 *cmd, char const *s)
+{
+       struct hostent *he;
+       int len = F_LEN((ipfw_insn *) cmd) - 1;
+       struct in6_addr *a = &(cmd->addr6);
+       char trad[255];
+
+       printf("%s%s ", cmd->o.len & F_NOT ? " not": "", s);
+
+       if (cmd->o.opcode == O_IP6_SRC_ME || cmd->o.opcode == O_IP6_DST_ME) {
+               printf("me6");
+               return;
+       }
+       if (cmd->o.opcode == O_IP6) {
+               printf(" ip6");
+               return;
+       }
+
+       /*
+        * len is the instruction length without its header. Dividing
+        * it by 4 gives the number of in6_addr slots that follow: 1 for
+        * a single address, 2n for a list of n addr/mask pairs. Each
+        * iteration consumes one pair (or the single address).
+        */
+
+       for (len = len / 4; len > 0; len -= 2, a += 2) {
+           int mb =        /* mask length */
+               (cmd->o.opcode == O_IP6_SRC || cmd->o.opcode == O_IP6_DST) ?
+               128 : contigmask((uint8_t *)&(a[1]), 128);
+
+           /*
+            * Start every entry unresolved, otherwise a name found for
+            * an earlier entry would be printed again for this one.
+            */
+           he = NULL;
+           if (mb == 128 && co.do_resolv)
+               he = gethostbyaddr((char *)a, sizeof(*a), AF_INET6);
+           if (he != NULL)             /* resolved to name */
+               printf("%s", he->h_name);
+           else if (mb == 0)           /* any */
+               printf("any");
+           else {          /* numeric IP followed by some kind of mask */
+               if (inet_ntop(AF_INET6,  a, trad, sizeof( trad ) ) == NULL) {
+                   printf("Error ntop in print_ip6\n");
+                   trad[0] = '\0';     /* do not print stale bytes */
+               }
+               printf("%s",  trad );
+               if (mb < 0) {   /* XXX not really legal... */
+                   if (inet_ntop(AF_INET6, &a[1], trad,
+                       sizeof(trad)) == NULL)
+                       trad[0] = '\0';
+                   printf(":%s", trad);
+               } else if (mb < 128)
+                   printf("/%d", mb);
+           }
+           if (len > 2)
+               printf(",");
+       }
+}
+
+/*
+ * Fill cmd with an O_ICMP6TYPE instruction from av, a comma-separated
+ * list of ICMPv6 type numbers. Each listed type sets one bit in the
+ * cmd->d[] bitmap. A malformed list or a type above ICMP6_MAXTYPE
+ * terminates the program with EX_DATAERR.
+ */
+void
+fill_icmp6types(ipfw_insn_icmp6 *cmd, char *av)
+{
+       /*
+        * Keep the full strtoul() result: with a uint8_t an input such
+        * as 300 would wrap to 44 before the range check below.
+        */
+       unsigned long type;
+
+       bzero(cmd, sizeof(*cmd));
+       while (*av) {
+               if (*av == ',')
+                       av++;
+               type = strtoul(av, &av, 0);
+               if (*av != ',' && *av != '\0')
+                       errx(EX_DATAERR, "invalid ICMP6 type");
+               /*
+                * XXX: shouldn't this be 0xFF?  I can't see any reason
+                * why we shouldn't be able to filter all possible
+                * values regardless of the ability of the rest of the
+                * kernel to do anything useful with them.
+                */
+               if (type > ICMP6_MAXTYPE)
+                       errx(EX_DATAERR, "ICMP6 type out of range");
+               /* 1U: shifting a signed 1 by 31 would overflow */
+               cmd->d[type / 32] |= (1U << (type % 32));
+       }
+       cmd->o.opcode = O_ICMP6TYPE;
+       cmd->o.len |= F_INSN_SIZE(ipfw_insn_icmp6);
+}
+
+
+/*
+ * Print " ip6 icmp6types" followed by the comma-separated list of the
+ * type numbers whose bit is set in the 7 * 32 bit map at cmd->d[].
+ * NOTE(review): the parameter is declared as ipfw_insn_u32 while the
+ * map is filled through an ipfw_insn_icmp6 in fill_icmp6types();
+ * confirm the two layouts agree.
+ */
+void
+print_icmp6types(ipfw_insn_u32 *cmd)
+{
+       int i, j;
+       char sep= ' ';
+
+       printf(" ip6 icmp6types");
+       for (i = 0; i < 7; i++)
+               for (j=0; j < 32; ++j) {
+                       /* 1U: shifting a signed 1 by 31 would overflow */
+                       if ( (cmd->d[i] & (1U << (j))) == 0)
+                               continue;
+                       printf("%c%d", sep, (i*32 + j));
+                       sep = ',';
+               }
+}
+
+/*
+ * Print " flow-id " followed by the cmd->o.arg1 flow ids stored in
+ * cmd->d[], separated by commas and terminated by a blank.
+ */
+void
+print_flow6id( ipfw_insn_u32 *cmd)
+{
+       uint16_t i, limit = cmd->o.arg1;
+       char sep = ',';
+
+       printf(" flow-id ");
+       for( i=0; i < limit; ++i) {
+               if (i == limit - 1)
+                       sep = ' ';
+               /* the ids are unsigned 32-bit values, so print with %u */
+               printf("%u%c", cmd->d[i], sep);
+       }
+}
+
+/*
+ * Names of the IPv6 extension headers accepted by fill_ext6hdr(),
+ * each paired with the EXT_* flag that is OR-ed into arg1 of the
+ * instruction. Terminated by a NULL entry.
+ */
+static struct _s_x ext6hdrcodes[] = {
+       { "frag",       EXT_FRAGMENT },
+       { "hopopt",     EXT_HOPOPTS },
+       { "route",      EXT_ROUTING },
+       { "dstopt",     EXT_DSTOPTS },
+       { "ah",         EXT_AH },
+       { "esp",        EXT_ESP },
+       { "rthdr0",     EXT_RTHDR0 },
+       { "rthdr2",     EXT_RTHDR2 },
+       { NULL,         0 }
+};
+
+/*
+ * Build the extension-header filter from av, a comma-separated list
+ * of names from ext6hdrcodes[]. The flags of all the names are
+ * collected in cmd->arg1; an unknown name terminates the program with
+ * EX_DATAERR. Returns 0, leaving opcode and length untouched, when no
+ * flag ended up set, and 1 once the O_EXT_HDR instruction is complete.
+ * The string in av is modified by strsep().
+ */
+int
+fill_ext6hdr( ipfw_insn *cmd, char *av)
+{
+       char *rest = av;
+       char *name;
+       int tok;
+
+       cmd->arg1 = 0;
+
+       while (rest != NULL) {
+               name = strsep(&rest, ",");
+               tok = match_token(ext6hdrcodes, name);
+               switch (tok) {
+               case EXT_FRAGMENT:
+               case EXT_HOPOPTS:
+               case EXT_ROUTING:
+               case EXT_DSTOPTS:
+               case EXT_AH:
+               case EXT_ESP:
+               case EXT_RTHDR0:
+               case EXT_RTHDR2:
+                       /* the token value is the flag itself */
+                       cmd->arg1 |= tok;
+                       break;
+
+               default:
+                       errx( EX_DATAERR,
+                           "invalid option for ipv6 exten header" );
+                       break;
+               }
+       }
+       if (cmd->arg1 == 0)
+               return 0;
+       cmd->opcode = O_EXT_HDR;
+       cmd->len |= F_INSN_SIZE( ipfw_insn );
+       return 1;
+}
+
+/*
+ * Print " extension header:" followed by the names of the EXT_* flags
+ * set in cmd->arg1. The first name is preceded by a blank, the
+ * following ones by a comma.
+ */
+void
+print_ext6hdr( ipfw_insn *cmd )
+{
+       /* flags in output order, each with its printf format */
+       static const struct {
+               int             flag;
+               const char      *fmt;
+       } hdrs[] = {
+               { EXT_FRAGMENT, "%cfragmentation" },
+               { EXT_HOPOPTS,  "%chop options" },
+               { EXT_ROUTING,  "%crouting options" },
+               { EXT_RTHDR0,   "%crthdr0" },
+               { EXT_RTHDR2,   "%crthdr2" },
+               { EXT_DSTOPTS,  "%cdestination options" },
+               { EXT_AH,       "%cauthentication header" },
+               { EXT_ESP,      "%cencapsulated security payload" },
+       };
+       char sep = ' ';
+       size_t i;
+
+       printf(" extension header:");
+       for (i = 0; i < sizeof(hdrs) / sizeof(hdrs[0]); i++) {
+               if ((cmd->arg1 & hdrs[i].flag) == 0)
+                       continue;
+               printf(hdrs[i].fmt, sep);
+               sep = ',';
+       }
+}
+
+/*
+ * Store in *ip6addr the IPv6 address for host, which is either a
+ * numeric address or a host name to be resolved.
+ * Returns 0 on success, -1 if host is neither.
+ */
+static int
+lookup_host6 (char *host, struct in6_addr *ip6addr)
+{
+       struct hostent *he;
+
+       /*
+        * inet_pton() returns 1 on success, 0 if host is not a valid
+        * address and -1 on error: only 1 means *ip6addr was filled.
+        */
+       if (inet_pton(AF_INET6, host, ip6addr) == 1)
+               return(0);
+       he = gethostbyname2(host, AF_INET6);
+       if (he == NULL || he->h_addr_list[0] == NULL)
+               return(-1);
+       memcpy(ip6addr, he->h_addr_list[0], sizeof( struct in6_addr));
+       return(0);
+}
+
+
+/*
+ * Fill the addr and mask fields in the instruction as appropriate from av.
+ * Update length as appropriate.
+ * The following formats are allowed:
+ *     any     matches any IP6. Actually returns an empty instruction.
+ *     me      returns O_IP6_*_ME
+ *     me6     same as me
+ *
+ *     03f1::234:123:0342                single IP6 address
+ *     03f1::234:123:0342/24            address/mask
+ *     03f1::234:123:0342/24,03f1::234:123:0343/               List of addresses
+ *
+ * Sets of addresses (as in ipv4) are not supported because ipv6
+ * addresses are typically random past the initial prefix.
+ *
+ * Only the length bits of cmd->o.len and the address data are written
+ * here; the opcode is chosen by the caller from the resulting length
+ * (see add_srcip6() and add_dstip6()).
+ * Returns 1; a bad address or mask width terminates the program with
+ * EX_DATAERR.
+ */
+static int
+fill_ip6(ipfw_insn_ip6 *cmd, char *av)
+{
+       int len = 0;
+       struct in6_addr *d = &(cmd->addr6);
+       char *buf;      /* private copy of av, the loop writes into it */
+       /*
+        * d is needed for multiple addresses.
+        * Note d[1] points to struct in6_addr mask6 of cmd
+        */
+
+       cmd->o.len &= ~F_LEN_MASK;      /* zero len */
+
+       if (strcmp(av, "any") == 0)
+               return (1);
+
+       /* "me" and "me6" are both stored as a bare instruction header */
+       if (strcmp(av, "me") == 0 || strcmp(av, "me6") == 0) {
+               cmd->o.len |= F_INSN_SIZE(ipfw_insn);
+               return (1);
+       }
+
+       /*
+        * av walks through the copy and is NULL when the loop is over,
+        * so keep the start of the copy in buf to be able to free it.
+        */
+       buf = strdup(av);
+       if (buf == NULL)
+               err(EX_OSERR, "strdup");
+       av = buf;
+       while (av) {
+               /*
+                * After the address we can have '/' indicating a mask,
+                * or ',' indicating another address follows.
+                */
+
+               char *p;
+               int masklen;
+               char md = '\0';
+
+               if ((p = strpbrk(av, "/,")) ) {
+                       md = *p;        /* save the separator */
+                       *p = '\0';      /* terminate address string */
+                       p++;            /* and skip past it */
+               }
+               /* now p points to NULL, mask or next entry */
+
+               /* lookup stores address in *d as a side effect */
+               if (lookup_host6(av, d) != 0) {
+                       /* errx() exits, so buf need not be freed here */
+                       errx(EX_DATAERR, "bad address \"%s\"", av);
+               }
+               /* next, look at the mask, if any */
+               masklen = (md == '/') ? atoi(p) : 128;
+               if (masklen > 128 || masklen < 0)
+                       errx(EX_DATAERR, "bad width \"%s\"", p);
+               else
+                       n2mask(&d[1], masklen);
+
+               APPLY_MASK(d, &d[1])   /* mask base address with mask */
+
+               /* find next separator */
+
+               if (md == '/') {        /* find separator past the mask */
+                       p = strpbrk(p, ",");
+                       if (p != NULL)
+                               p++;
+               }
+               av = p;
+
+               /* Check this entry */
+               if (masklen == 0) {
+                       /*
+                        * A zero-width entry is not stored.
+                        * 'not any' never matches, so it is removed from the
+                        * list unless it is the only item, in which case we
+                        * report an error.
+                        */
+                       if (cmd->o.len & F_NOT && av == NULL && len == 0)
+                               errx(EX_DATAERR, "not any never matches");
+                       continue;
+               }
+
+               /*
+                * A single IP can be stored alone
+                */
+               if (masklen == 128 && av == NULL && len == 0) {
+                       len = F_INSN_SIZE(struct in6_addr);
+                       break;
+               }
+
+               /* Update length and pointer to arguments */
+               len += F_INSN_SIZE(struct in6_addr)*2;
+               d += 2;
+       } /* end while */
+
+       free(buf);
+
+       /*
+        * Nothing was stored: every entry had a zero-width mask, which
+        * is the same as "any". Leave the length at zero; a length of 1
+        * would be taken for "me" by the callers.
+        */
+       if (len == 0)
+               return (1);
+
+       /*
+        * Total length of the command, remember that 1 is the size of
+        * the base command.
+        */
+       if (len + 1 > F_LEN_MASK)
+               errx(EX_DATAERR, "address list too long");
+       cmd->o.len |= len+1;
+       return (1);
+}
+
+/*
+ * Fill cmd with an O_FLOW6ID instruction from av, a comma-separated
+ * list of IPv6 flow numbers. Each number must fit in the 20 bit flow
+ * label and is stored in one element of the u_int32_t array cmd->d[];
+ * the number of stored elements goes in o.arg1.
+ * A malformed or out-of-range number terminates the program with
+ * EX_DATAERR. The string in av is modified by strsep().
+ * NOTE(review): nothing here limits the number of flow ids to the
+ * space available behind cmd; confirm the caller's buffer size.
+ */
+void
+fill_flow6( ipfw_insn_u32 *cmd, char *av )
+{
+       unsigned long type;      /* Current flow number */
+       u_int16_t nflow = 0;    /* Current flow index */
+       char *s = av;
+       char *tok, *end;
+
+       while (s) {
+               tok = strsep( &s, ",") ;
+               /*
+                * Check the full strtoul() result: a u_int32_t would
+                * truncate a huge input before the range check.
+                */
+               type = strtoul(tok, &end, 0);
+               if (end == tok || *end != '\0')
+                       errx(EX_DATAERR, "invalid ipv6 flow number %s", tok);
+               if (type > 0xfffff)
+                       errx(EX_DATAERR, "flow number out of range %s", tok);
+               /* plain store: d[nflow] was not initialized by us */
+               cmd->d[nflow] = type;
+               nflow++;
+       }
+       if (nflow == 0)         /* only possible with a NULL av */
+               errx(EX_DATAERR, "missing ipv6 flow number");
+       cmd->o.opcode = O_FLOW6ID;
+       cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32) + nflow;
+       cmd->o.arg1 = nflow;
+}
+
+/*
+ * Fill cmd with the source address(es) in av and pick the opcode from
+ * the length set by fill_ip6(). Returns cmd.
+ */
+ipfw_insn *
+add_srcip6(ipfw_insn *cmd, char *av)
+{
+       int words;
+
+       fill_ip6((ipfw_insn_ip6 *)cmd, av);
+       words = F_LEN(cmd);
+       if (words == 0)                 /* any: opcode is left alone */
+               return cmd;
+       if (words == F_INSN_SIZE(ipfw_insn))            /* "me" */
+               cmd->opcode = O_IP6_SRC_ME;
+       else if (words ==
+           (F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn)))
+               cmd->opcode = O_IP6_SRC;        /* single IP, no mask */
+       else
+               cmd->opcode = O_IP6_SRC_MASK;   /* addr/mask list */
+       return cmd;
+}
+
+/*
+ * Fill cmd with the destination address(es) in av and pick the opcode
+ * from the length set by fill_ip6(). Returns cmd.
+ */
+ipfw_insn *
+add_dstip6(ipfw_insn *cmd, char *av)
+{
+       int words;
+
+       fill_ip6((ipfw_insn_ip6 *)cmd, av);
+       words = F_LEN(cmd);
+       if (words == 0)                 /* any: opcode is left alone */
+               return cmd;
+       if (words == F_INSN_SIZE(ipfw_insn))            /* "me" */
+               cmd->opcode = O_IP6_DST_ME;
+       else if (words ==
+           (F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn)))
+               cmd->opcode = O_IP6_DST;        /* single IP, no mask */
+       else
+               cmd->opcode = O_IP6_DST_MASK;   /* addr/mask list */
+       return cmd;
+}
diff --git a/ipfw/main.c b/ipfw/main.c
new file mode 100644 (file)
index 0000000..7bd9105
--- /dev/null
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2002-2003,2010 Luigi Rizzo
+ * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp
+ * Copyright (c) 1994 Ugen J.S.Antsilevich
+ *
+ * Idea and grammar partially left from:
+ * Copyright (c) 1993 Daniel Boulet
+ *
+ * Redistribution and use in source forms, with and without modification,
+ * are permitted provided that this entire comment appears intact.
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ *
+ * Command line interface for IP firewall facility
+ *
+ * $FreeBSD: head/sbin/ipfw/main.c 206494 2010-04-12 08:27:53Z luigi $
+ */
+
+#include <sys/wait.h>
+#include <ctype.h>
+#include <err.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include "ipfw2.h"
+
+static void
+help(void)     /* print the built-in syntax summary on stderr, then exit */
+{
+       fprintf(stderr,
+"ipfw syntax summary (but please do read the ipfw(8) manpage):\n\n"
+"\tipfw [-abcdefhnNqStTv] <command>\n\n"
+"where <command> is one of the following:\n\n"
+"add [num] [set N] [prob x] RULE-BODY\n"
+"{pipe|queue} N config PIPE-BODY\n"
+"[pipe|queue] {zero|delete|show} [N{,N}]\n"
+"nat N config {ip IPADDR|if IFNAME|log|deny_in|same_ports|unreg_only|reset|\n"
+"              reverse|proxy_only|redirect_addr linkspec|\n"
+"              redirect_port linkspec|redirect_proto linkspec}\n"
+"set [disable N... enable N...] | move [rule] X to Y | swap X Y | show\n"
+"set N {show|list|zero|resetlog|delete} [N{,N}] | flush\n"
+"table N {add ip[/bits] [value] | delete ip[/bits] | flush | list}\n"
+"table all {flush | list}\n"
+"\n"
+"RULE-BODY:    check-state [PARAMS] | ACTION [PARAMS] ADDR [OPTION_LIST]\n"
+"ACTION:       check-state | allow | count | deny | unreach{,6} CODE |\n"
+"               skipto N | {divert|tee} PORT | forward ADDR |\n"
+"               pipe N | queue N | nat N | setfib FIB | reass\n"
+"PARAMS:       [log [logamount LOGLIMIT]] [altq QUEUE_NAME]\n"
+"ADDR:         [ MAC dst src ether_type ] \n"
+"              [ ip from IPADDR [ PORT ] to IPADDR [ PORTLIST ] ]\n"
+"              [ ipv6|ip6 from IP6ADDR [ PORT ] to IP6ADDR [ PORTLIST ] ]\n"
+"IPADDR:       [not] { any | me | ip/bits{x,y,z} | table(t[,v]) | IPLIST }\n"
+"IP6ADDR:      [not] { any | me | me6 | ip6/bits | IP6LIST }\n"
+"IP6LIST:      { ip6 | ip6/bits }[,IP6LIST]\n"
+"IPLIST:       { ip | ip/bits | ip:mask }[,IPLIST]\n"
+"OPTION_LIST:  OPTION [OPTION_LIST]\n"
+"OPTION:       bridged | diverted | diverted-loopback | diverted-output |\n"
+"      {dst-ip|src-ip} IPADDR | {dst-ip6|src-ip6|dst-ipv6|src-ipv6} IP6ADDR |\n"
+"      {dst-port|src-port} LIST |\n"
+"      estab | frag | {gid|uid} N | icmptypes LIST | in | out | ipid LIST |\n"
+"      iplen LIST | ipoptions SPEC | ipprecedence | ipsec | iptos SPEC |\n"
+"      ipttl LIST | ipversion VER | keep-state | layer2 | limit ... |\n"
+"      icmp6types LIST | ext6hdr LIST | flow-id N[,N] | fib FIB |\n"
+"      mac ... | mac-type LIST | proto LIST | {recv|xmit|via} {IF|IPADDR} |\n"
+"      setup | {tcpack|tcpseq|tcpwin} NN | tcpflags SPEC | tcpoptions SPEC |\n"
+"      tcpdatalen LIST | verrevpath | versrcreach | antispoof\n"
+);
+       /* Asking for help is not an error: leave with a success status. */
+       exit(0);
+}
+
+/*
+ * Called with the arguments, including program name because getopt
+ * wants it to be present.
+ * Returns 0 if successful, 1 if empty command, errx() in case of errors.
+ * First thing we do is process parameters creating an argv[] array
+ * which includes the program name and a NULL entry at the end.
+ * If we are called with a single string, we split it on whitespace.
+ * Also, arguments with a trailing ',' are joined to the next one.
+ * The pointers (av[]) and data are in a single chunk of memory.
+ * av[0] points to the original program name, all other entries
+ * point into the allocated chunk.
+ */
+static int
+ipfw_main(int oldac, char **oldav)
+{
+       int ch, ac;
+       const char *errstr;
+       char **av, **save_av;
+       int do_acct = 0;                /* Show packet/byte count */
+       int try_next = 0;               /* set if pipe cmd not found */
+       int av_size;                    /* compute the av size */
+       char *av_p;                     /* used to build the av list */
+
+#define WHITESP                " \t\f\v\n\r"
+       if (oldac < 2)
+               return 1;       /* need at least one argument */
+
+       if (oldac == 2) {
+               /*
+                * If we are called with one argument, try to split it into
+                * words for subsequent parsing. Spaces after a ',' are
+                * removed by copying the string in-place.
+                */
+               char *arg = oldav[1];   /* The string is the first arg. */
+               int l = strlen(arg);
+               int copy = 0;           /* 1 if we need to copy, 0 otherwise */
+               int i, j;
+
+               for (i = j = 0; i < l; i++) {
+                       if (arg[i] == '#')      /* comment marker */
+                               break;
+                       if (copy) {
+                               arg[j++] = arg[i];
+                               copy = !strchr("," WHITESP, arg[i]);
+                       } else {
+                               copy = !strchr(WHITESP, arg[i]);
+                               if (copy)
+                                       arg[j++] = arg[i];
+                       }
+               }
+               if (!copy && j > 0)     /* last char was a 'blank', remove it */
+                       j--;
+               l = j;                  /* the new argument length */
+               arg[j++] = '\0';
+               if (l == 0)             /* empty string! */
+                       return 1;
+
+               /*
+                * First, count number of arguments. Because of the previous
+                * processing, this is just the number of blanks plus 1.
+                */
+               for (i = 0, ac = 1; i < l; i++)
+                       if (strchr(WHITESP, arg[i]) != NULL)
+                               ac++;
+
+               /*
+                * Allocate the argument list structure as a single block
+                * of memory, containing pointers and the argument
+                * strings. We include one entry for the program name
+                * because getopt expects it, and a NULL at the end
+                * to simplify further parsing.
+                */
+               ac++;           /* add 1 for the program name */
+               av_size = (ac+1) * sizeof(char *) + l + 1;
+               av = safe_calloc(av_size, 1);
+
+               /*
+                * Init the argument pointer to the end of the array
+                * and copy arguments from arg[] to av[]. For each one,
+                * j is the initial character, i is the one past the end.
+                */
+               av_p = (char *)&av[ac+1];
+               for (ac = 1, i = j = 0; i < l; i++) {
+                       if (strchr(WHITESP, arg[i]) != NULL || i == l-1) {
+                               if (i == l-1)
+                                       i++;
+                               bcopy(arg+j, av_p, i-j);
+                               av[ac] = av_p;
+                               av_p += i-j;    /* the length of the string */
+                               *av_p++ = '\0';
+                               ac++;
+                               j = i + 1;
+                       }
+               }
+       } else {
+               /*
+                * If an argument ends with ',' join with the next one.
+                */
+               int first, i, l=0;
+
+               /*
+                * Allocate the argument list structure as a single block
+                * of memory, containing both pointers and the argument
+                * strings. We include some space for the program name
+                * because getopt expects it.
+                * We add an extra pointer to the end of the array,
+                * to make simpler further parsing.
+                */
+               for (i=0; i<oldac; i++)
+                       l += strlen(oldav[i]);
+
+               av_size = (oldac+1) * sizeof(char *) + l + oldac;
+               av = safe_calloc(av_size, 1);
+
+               /*
+                * Init the argument pointer to the end of the array
+                * and copy arguments from oldav[] to av[]
+                */
+               av_p = (char *)&av[oldac+1];
+               for (first = i = ac = 1; i < oldac; i++) {
+                       char *arg = oldav[i];
+                       int k = strlen(arg);
+
+                       /* k == 0 (empty argument) must not index arg[-1]. */
+                       if (k == 0 || arg[k-1] != ',' || i == oldac-1) {
+                               /* Time to copy. */
+                               av[ac] = av_p;
+                               for (; first <= i; first++) {
+                                       strcat(av_p, oldav[first]);
+                                       av_p += strlen(oldav[first]);
+                               }
+                               *av_p++ = '\0';
+                               ac++;
+                               /* The next group starts after this argument. */
+                               first = i+1;
+                       }
+               }
+       }
+
+       /*
+        * set the progname pointer to the original string
+        * and terminate the array with null
+        */
+       av[0] = oldav[0];
+       av[ac] = NULL;
+
+       /* Set the force flag for non-interactive processes */
+       if (!co.do_force)
+               co.do_force = !isatty(STDIN_FILENO);
+
+#ifdef EMULATE_SYSCTL /* sysctl emulation */
+       if ( ac >= 2 && !strcmp(av[1], "sysctl")) {
+               char *s;
+               int i;
+
+               if (ac != 3) {
+                       printf( "sysctl emulation usage:\n"
+                               "       ipfw sysctl name[=value]\n"
+                               "       ipfw sysctl -a\n");
+                       return 0;
+               }
+               s = strchr(av[2], '=');
+               if (s == NULL) {
+                       s = !strcmp(av[2], "-a") ? NULL : av[2];
+                       sysctlbyname(s, NULL, NULL, NULL, 0);
+               } else {        /* ipfw sysctl x.y.z=value */
+                       /* assume an INT value, will extend later */
+                       if (s[1] == '\0') {
+                               printf("ipfw sysctl: missing value\n\n");
+                               return 0;
+                       }
+                       *s = '\0';
+                       i = strtol(s+1, NULL, 0);
+                       sysctlbyname(av[2], NULL, NULL, &i, sizeof(int));
+               }
+               return 0;
+       }
+#endif
+
+       /* Save arguments for final freeing of memory. */
+       save_av = av;
+
+       optind = optreset = 1;  /* restart getopt() */
+       while ((ch = getopt(ac, av, "abcdefhinNp:qs:STtv")) != -1)
+               switch (ch) {
+               case 'a':
+                       do_acct = 1;
+                       break;
+
+               case 'b':
+                       co.comment_only = 1;
+                       co.do_compact = 1;
+                       break;
+
+               case 'c':
+                       co.do_compact = 1;
+                       break;
+
+               case 'd':
+                       co.do_dynamic = 1;
+                       break;
+
+               case 'e':
+                       co.do_expired = 1;
+                       break;
+
+               case 'f':
+                       co.do_force = 1;
+                       break;
+
+               case 'h': /* help */
+                       free(save_av);
+                       help();
+                       break;  /* NOTREACHED */
+
+               case 'i':
+                       co.do_value_as_ip = 1;
+                       break;
+
+               case 'n':
+                       co.test_only = 1;
+                       break;
+
+               case 'N':
+                       co.do_resolv = 1;
+                       break;
+
+               case 'q':
+                       co.do_quiet = 1;
+                       break;
+
+               case 'p':
+                       errx(EX_USAGE, "An absolute pathname must be used "
+                           "with -p option.");
+                       /* NOTREACHED */
+
+               case 's': /* sort */
+                       co.do_sort = atoi(optarg);
+                       break;
+
+               case 'S':
+                       co.show_sets = 1;
+                       break;
+
+               case 't':
+                       co.do_time = 1;
+                       break;
+
+               case 'T':
+                       co.do_time = 2; /* numeric timestamp */
+                       break;
+
+               case 'v': /* verbose */
+                       co.verbose = 1;
+                       break;
+
+               default:
+                       free(save_av);
+                       return 1;
+               }
+
+       ac -= optind;
+       av += optind;
+       NEED1("bad arguments, for usage summary ``ipfw''");
+
+       /*
+        * An undocumented behaviour of ipfw1 was to allow rule numbers first,
+        * e.g. "100 add allow ..." instead of "add 100 allow ...".
+        * In case, swap first and second argument to get the normal form.
+        */
+       if (ac > 1 && isdigit((unsigned char)*av[0])) {
+               char *p = av[0];
+
+               av[0] = av[1];
+               av[1] = p;
+       }
+
+       /*
+        * Optional: pipe, queue or nat.
+        */
+       co.do_nat = 0;
+       co.do_pipe = 0;
+       co.use_set = 0;
+       if (!strncmp(*av, "nat", strlen(*av)))
+               co.do_nat = 1;
+       else if (!strncmp(*av, "pipe", strlen(*av)))
+               co.do_pipe = 1;
+       else if (_substrcmp(*av, "queue") == 0)
+               co.do_pipe = 2;
+       else if (_substrcmp(*av, "flowset") == 0)
+               co.do_pipe = 2;
+       else if (_substrcmp(*av, "sched") == 0)
+               co.do_pipe = 3;
+       else if (!strncmp(*av, "set", strlen(*av))) {
+               if (ac > 1 && isdigit((unsigned char)av[1][0])) {
+                       co.use_set = strtonum(av[1], 0, resvd_set_number,
+                                       &errstr);
+                       if (errstr)
+                               errx(EX_DATAERR,
+                                   "invalid set number %s\n", av[1]);
+                       ac -= 2; av += 2; co.use_set++;
+               }
+       }
+
+       if (co.do_pipe || co.do_nat) {
+               ac--;
+               av++;
+       }
+       NEED1("missing command");
+
+       /*
+        * For pipes, queues and nats we normally say 'nat|pipe NN config'
+        * but the code is easier to parse as 'nat|pipe config NN'
+        * so we swap the two arguments.
+        */
+       if ((co.do_pipe || co.do_nat) && ac > 1 &&
+           isdigit((unsigned char)*av[0])) {
+               char *p = av[0];
+               av[0] = av[1];
+               av[1] = p;
+       }
+
+       if (co.use_set == 0) {
+               if (_substrcmp(*av, "add") == 0)
+                       ipfw_add(av);
+               else if (co.do_nat && _substrcmp(*av, "show") == 0)
+                       ipfw_show_nat(ac, av);
+               else if (co.do_pipe && _substrcmp(*av, "config") == 0)
+                       ipfw_config_pipe(ac, av);
+               else if (co.do_nat && _substrcmp(*av, "config") == 0)
+                       ipfw_config_nat(ac, av);
+               else if (_substrcmp(*av, "set") == 0)
+                       ipfw_sets_handler(av);
+               else if (_substrcmp(*av, "table") == 0)
+                       ipfw_table_handler(ac, av);
+               else if (_substrcmp(*av, "enable") == 0)
+                       ipfw_sysctl_handler(av, 1);
+               else if (_substrcmp(*av, "disable") == 0)
+                       ipfw_sysctl_handler(av, 0);
+               else
+                       try_next = 1;
+       }
+
+       if (co.use_set || try_next) {
+               if (_substrcmp(*av, "delete") == 0)
+                       ipfw_delete(av);
+               else if (_substrcmp(*av, "flush") == 0)
+                       ipfw_flush(co.do_force);
+               else if (_substrcmp(*av, "zero") == 0)
+                       ipfw_zero(ac, av, 0 /* IP_FW_ZERO */);
+               else if (_substrcmp(*av, "resetlog") == 0)
+                       ipfw_zero(ac, av, 1 /* IP_FW_RESETLOG */);
+               else if (_substrcmp(*av, "print") == 0 ||
+                        _substrcmp(*av, "list") == 0)
+                       ipfw_list(ac, av, do_acct);
+               else if (_substrcmp(*av, "show") == 0)
+                       ipfw_list(ac, av, 1 /* show counters */);
+               else
+                       errx(EX_USAGE, "bad command `%s'", *av);
+       }
+
+       /* Free memory allocated in the argument parsing. */
+       free(save_av);
+       return 0;
+}
+
+
+static void
+ipfw_readfile(int ac, char *av[])
+{
+       /* Feed each line of a file, optionally preprocessed, to ipfw_main(). */
+       char buf[4096];
+       char *progname = av[0];         /* original program name */
+       const char *cmd = NULL;         /* preprocessor name, if any */
+       const char *filename = av[ac-1]; /* file to read */
+       int     c, lineno=0;
+       FILE    *f = NULL;
+       pid_t   preproc = 0;
+
+       while ((c = getopt(ac, av, "cfNnp:qS")) != -1) {
+               switch(c) {
+               case 'c':
+                       co.do_compact = 1;
+                       break;
+
+               case 'f':
+                       co.do_force = 1;
+                       break;
+
+               case 'N':
+                       co.do_resolv = 1;
+                       break;
+
+               case 'n':
+                       co.test_only = 1;
+                       break;
+
+               case 'p':
+                       /*
+                        * ipfw -p cmd [args] filename
+                        *
+                        * We are done with getopt(). All arguments
+                        * except the filename go to the preprocessor,
+                        * so we need to do the following:
+                        * - check that a filename is actually present;
+                        * - advance av by optind-1 to skip arguments
+                        *   already processed;
+                        * - decrease ac by optind, to remove the args
+                        *   already processed and the final filename;
+                        * - set the last entry in av[] to NULL so
+                        *   popen() can detect the end of the array;
+                        * - set optind=ac to let getopt() terminate.
+                        */
+                       if (optind == ac)
+                               errx(EX_USAGE, "no filename argument");
+                       cmd = optarg;
+                       av[ac-1] = NULL;
+                       av += optind - 1;
+                       ac -= optind;
+                       optind = ac;
+                       break;
+
+               case 'q':
+                       co.do_quiet = 1;
+                       break;
+
+               case 'S':
+                       co.show_sets = 1;
+                       break;
+
+               default:
+                       errx(EX_USAGE, "bad arguments, for usage"
+                            " summary ``ipfw''");
+               }
+
+       }
+
+       if (cmd == NULL && ac != optind + 1)
+               errx(EX_USAGE, "extraneous filename arguments %s", av[ac-1]);
+
+       if ((f = fopen(filename, "r")) == NULL)
+               err(EX_UNAVAILABLE, "fopen: %s", filename);
+
+       if (cmd != NULL) {                      /* pipe through preprocessor */
+               int pipedes[2];
+
+               if (pipe(pipedes) == -1)
+                       err(EX_OSERR, "cannot create pipe");
+
+               preproc = fork();
+               if (preproc == -1)
+                       err(EX_OSERR, "cannot fork");
+
+               if (preproc == 0) {
+                       /*
+                        * Child, will run the preprocessor with the
+                        * file on stdin and the pipe on stdout.
+                        */
+                       if (dup2(fileno(f), 0) == -1
+                           || dup2(pipedes[1], 1) == -1)
+                               err(EX_OSERR, "dup2()");
+                       fclose(f);
+                       close(pipedes[1]);
+                       close(pipedes[0]);
+                       execvp(cmd, av);
+                       err(EX_OSERR, "execvp(%s) failed", cmd);
+               } else { /* parent, will reopen f as the pipe */
+                       fclose(f);
+                       close(pipedes[1]);
+                       if ((f = fdopen(pipedes[0], "r")) == NULL) {
+                               int savederrno = errno;
+
+                               (void)kill(preproc, SIGTERM);
+                               errno = savederrno;
+                               err(EX_OSERR, "fdopen()");
+                       }
+               }
+       }
+
+       while (fgets(buf, sizeof(buf), f)) {            /* read commands */
+               char linename[20];
+               char *args[2];
+
+               lineno++;
+               snprintf(linename, sizeof(linename), "Line %d", lineno);
+               setprogname(linename); /* XXX */
+               args[0] = progname;
+               args[1] = buf;
+               ipfw_main(2, args);     /* one-string form: split on blanks */
+       }
+       fclose(f);
+       if (cmd != NULL) {
+               int status;
+
+               if (waitpid(preproc, &status, 0) == -1)
+                       err(EX_OSERR, "waitpid()");     /* err(): keep errno */
+               if (WIFEXITED(status) && WEXITSTATUS(status) != EX_OK)
+                       errx(EX_UNAVAILABLE,
+                           "preprocessor exited with status %d",
+                           WEXITSTATUS(status));
+               else if (WIFSIGNALED(status))
+                       errx(EX_UNAVAILABLE,
+                           "preprocessor exited with signal %d",
+                           WTERMSIG(status));
+       }
+}
+
+int
+main(int ac, char *av[])
+{
+#if defined(_WIN32) && defined(TCC)
+       {
+               WSADATA wsaData;
+               unsigned short wVersionRequested = MAKEWORD(2, 2);
+               int ret = WSAStartup(wVersionRequested, &wsaData);
+
+               if (ret != 0) {
+                       /* No usable Winsock DLL could be started: */
+                       /* report the error code and give up.      */
+                       printf("WSAStartup failed with error: %d\n", ret);
+                       return 1;
+               }
+       }
+#endif
+       /*
+        * An absolute pathname as last argument is taken as a file of
+        * commands: hand the whole command line to ipfw_readfile().
+        */
+       if (ac > 1 && av[ac - 1][0] == '/') {
+               if (access(av[ac - 1], R_OK) != 0)
+                       err(EX_USAGE, "pathname: %s", av[ac - 1]);
+               ipfw_readfile(ac, av);
+               return EX_OK;
+       }
+
+       /* Otherwise the arguments are a single ipfw command. */
+       if (ipfw_main(ac, av) != 0) {
+               errx(EX_USAGE,
+                   "usage: ipfw [options]\n"
+                   "do \"ipfw -h\" or \"man ipfw\" for details");
+       }
+       return EX_OK;
+}
diff --git a/ipfw/qsort.c b/ipfw/qsort.c
new file mode 100644 (file)
index 0000000..4258b8c
--- /dev/null
@@ -0,0 +1,195 @@
+/*-
+ * Copyright (c) 1992, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)qsort.c    8.1 (Berkeley) 6/4/93";
+#endif /* LIBC_SCCS and not lint */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/lib/libc/stdlib/qsort.c,v 1.15 2008/01/14 09:21:34 das Exp $");
+
+#include <stdlib.h>
+
+#ifdef I_AM_QSORT_R
+typedef int             cmp_t(void *, const void *, const void *);
+#else
+typedef int             cmp_t(const void *, const void *);
+#endif
+static inline char     *med3(char *, char *, char *, cmp_t *, void *);
+static inline void      swapfunc(char *, char *, int, int);
+
+#define min(a, b)      ((a) < (b) ? (a) : (b)) /* fully parenthesized */
+
+/*
+ * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
+ */
+#define swapcode(TYPE, parmi, parmj, n) {              \
+       long i = (n) / sizeof (TYPE);                   \
+       TYPE *pi = (TYPE *) (parmi);            \
+       TYPE *pj = (TYPE *) (parmj);            \
+       do {                                            \
+               TYPE    t = *pi;                \
+               *pi++ = *pj;                            \
+               *pj++ = t;                              \
+        } while (--i > 0);                             \
+}
+
+#define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \
+       es % sizeof(long) ? 2 : es == sizeof(long)? 0 : 1;
+
+/* Exchange n bytes at a and b: long-wise if swaptype <= 1, else byte-wise. */
+static inline void
+swapfunc(char *a, char *b, int n, int swaptype)
+{
+       /* swapcode() expands to a braced block, hence no ';' before else. */
+       if (swaptype <= 1)
+               swapcode(long, a, b, n)
+       else
+               swapcode(char, a, b, n)
+}
+
+#define swap(a, b)                                     \
+       if (swaptype == 0) {                            \
+               long t = *(long *)(a);                  \
+               *(long *)(a) = *(long *)(b);            \
+               *(long *)(b) = t;                       \
+       } else                                          \
+               swapfunc(a, b, es, swaptype)
+
+#define vecswap(a, b, n)       if ((n) > 0) swapfunc(a, b, n, swaptype)
+
+#ifdef I_AM_QSORT_R
+#define        CMP(t, x, y) (cmp((t), (x), (y)))
+#else
+#define        CMP(t, x, y) (cmp((x), (y)))
+#endif
+
+static inline char *
+med3(char *a, char *b, char *c, cmp_t *cmp, void *thunk
+#ifndef I_AM_QSORT_R
+__unused /* in the plain qsort build CMP() drops its first argument */
+#endif
+)
+{
+       return CMP(thunk, a, b) < 0 ?   /* median of a, b, c according to cmp */
+              (CMP(thunk, b, c) < 0 ? b : (CMP(thunk, a, c) < 0 ? c : a ))
+              :(CMP(thunk, b, c) > 0 ? b : (CMP(thunk, a, c) < 0 ? a : c ));
+}
+
+#ifdef I_AM_QSORT_R
+void
+qsort_r(void *a, size_t n, size_t es, void *thunk, cmp_t *cmp)
+#else
+#define thunk NULL     /* no caller context here; CMP() ignores it */
+void
+qsort(void *a, size_t n, size_t es, cmp_t *cmp)
+#endif
+{
+       char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
+       size_t d, r;
+       int cmp_result;
+       int swaptype, swap_cnt;
+
+loop:  SWAPINIT(a, es);        /* re-entered by goto with a new a and n */
+       swap_cnt = 0;
+       if (n < 7) {            /* short array: insertion sort only */
+               for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es)
+                       for (pl = pm;
+                            pl > (char *)a && CMP(thunk, pl - es, pl) > 0;
+                            pl -= es)
+                               swap(pl, pl - es);
+               return;
+       }
+       pm = (char *)a + (n / 2) * es;  /* middle element is the default pivot */
+       if (n > 7) {
+               pl = a;
+               pn = (char *)a + (n - 1) * es;
+               if (n > 40) {   /* larger array: median of three medians */
+                       d = (n / 8) * es;
+                       pl = med3(pl, pl + d, pl + 2 * d, cmp, thunk);
+                       pm = med3(pm - d, pm, pm + d, cmp, thunk);
+                       pn = med3(pn - 2 * d, pn - d, pn, cmp, thunk);
+               }
+               pm = med3(pl, pm, pn, cmp, thunk);
+       }
+       swap(a, pm);            /* the pivot now sits in the first slot */
+       pa = pb = (char *)a + es;
+
+       pc = pd = (char *)a + (n - 1) * es;
+       for (;;) {
+               while (pb <= pc && (cmp_result = CMP(thunk, pb, a)) <= 0) {
+                       if (cmp_result == 0) {  /* equal to pivot: park at left */
+                               swap_cnt = 1;
+                               swap(pa, pb);
+                               pa += es;
+                       }
+                       pb += es;
+               }
+               while (pb <= pc && (cmp_result = CMP(thunk, pc, a)) >= 0) {
+                       if (cmp_result == 0) {  /* equal to pivot: park at right */
+                               swap_cnt = 1;
+                               swap(pc, pd);
+                               pd -= es;
+                       }
+                       pc -= es;
+               }
+               if (pb > pc)    /* the two scans have crossed */
+                       break;
+               swap(pb, pc);
+               swap_cnt = 1;
+               pb += es;
+               pc -= es;
+       }
+       if (swap_cnt == 0) {  /* Switch to insertion sort */
+               for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es)
+                       for (pl = pm;
+                            pl > (char *)a && CMP(thunk, pl - es, pl) > 0;
+                            pl -= es)
+                               swap(pl, pl - es);
+               return;
+       }
+
+       pn = (char *)a + n * es;        /* one past the last element */
+       r = min(pa - (char *)a, pb - pa);
+       vecswap(a, pb - r, r);          /* bring the left parked run inward */
+       r = min(pd - pc, pn - pd - es);
+       vecswap(pb, pn - r, r);         /* bring the right parked run inward */
+       if ((r = pb - pa) > es)         /* recurse on the first part */
+#ifdef I_AM_QSORT_R
+               qsort_r(a, r / es, es, thunk, cmp);
+#else
+               qsort(a, r / es, es, cmp);
+#endif
+       if ((r = pd - pc) > es) {
+               /* Iterate rather than recurse to save stack space */
+               a = pn - r;
+               n = r / es;
+               goto loop;
+       }
+/*             qsort(pn - r, r / es, es, cmp);*/
+}
diff --git a/ipfw/qsort_r.c b/ipfw/qsort_r.c
new file mode 100644 (file)
index 0000000..f7c0e54
--- /dev/null
@@ -0,0 +1,8 @@
+/*
+ * This file is in the public domain.  Originally written by Garrett
+ * A. Wollman.
+ *
+ * $FreeBSD: src/lib/libc/stdlib/qsort_r.c,v 1.1 2002/09/10 02:04:49 wollman Exp $
+ */
+#define I_AM_QSORT_R
+#include "qsort.c"
diff --git a/ipfw/rule_test.sh b/ipfw/rule_test.sh
new file mode 100755 (executable)
index 0000000..d5ad6be
--- /dev/null
@@ -0,0 +1,83 @@
+#!/bin/bash
+# Runs a sequence of ipfw rule, table and set commands and prints the results.
+COMMAND=ipfw
+
+
+echo .########## Set $COMMAND mode .##########
+$COMMAND add allow ip from any to any
+$COMMAND -q flush
+
+echo .########## empty rules .##########
+$COMMAND list
+$COMMAND add allow ip from any to any
+$COMMAND add allow ip from any to { 1.2.3.4 or 2.3.4.5 }
+$COMMAND add allow { dst-ip 1.2.3.4 or dst-ip 2.3.4.5 }
+
+echo .########## listing 3 rules .##########
+$COMMAND list
+
+$COMMAND delete 200
+echo .########## listing 2 rules .##########
+$COMMAND list
+
+$COMMAND table 10 add 1.2.3.4
+$COMMAND table 10 add 1.2.3.5
+$COMMAND table 10 add 1.2.3.6
+$COMMAND table 10 add 1.2.3.7/13
+$COMMAND table 10 add 1.2.3.7/20
+$COMMAND table 10 add 1.2.3.7/28
+
+echo .########## listing table 10 with 6 elements .##########
+$COMMAND table 10 list
+$COMMAND table 10 delete 1.2.3.6
+
+echo .########## listing table 10 with 5 elements .##########
+$COMMAND table 10 list
+$COMMAND table 10 flush
+
+echo .########## table 10 empty .##########
+$COMMAND table 10 list
+
+echo .########## move rule 100 to set 1 300 to 3 .##########
+$COMMAND set move rule 100 to 1
+$COMMAND set move rule 300 to 3
+$COMMAND -S show
+
+echo .########## move rule 200 to 2 but 200 do not exist .######
+$COMMAND set move rule 200 to 2
+
+echo .########## add some rules .##########
+$COMMAND add 200 queue 2 proto ip
+$COMMAND add 300 queue 5 proto ip
+$COMMAND add 400 queue 40 proto ip
+$COMMAND add 400 queue 50 proto ip
+
+echo .########## move rule 200 to 2 .######
+$COMMAND set move rule 200 to 2
+
+echo .########## move rule 400 to 5 .######
+$COMMAND set move rule 400 to 5
+
+echo .########## set 5 show 2 rules .######
+$COMMAND set 5 show
+
+echo .########## flush set 5 .######
+$COMMAND -q set 5 flush
+
+echo .########## set 5 show 0 rule .######
+$COMMAND set 5 show
+
+echo .########## disable set 1 .######
+$COMMAND set disable 1
+
+echo .########## show all rules except set 1 .######
+$COMMAND -S show
+
+echo .########## enable set 1 .######
+$COMMAND set enable 1
+
+echo .########## show all rules .######
+$COMMAND -S show
+
+
+
diff --git a/ipfw/ws2_32.def b/ipfw/ws2_32.def
new file mode 100644 (file)
index 0000000..3813911
--- /dev/null
@@ -0,0 +1,120 @@
+LIBRARY ws2_32.dll\r
+\r
+EXPORTS\r
+FreeAddrInfoW\r
+GetAddrInfoW\r
+GetNameInfoW\r
+WEP\r
+WPUCompleteOverlappedRequest\r
+WSAAccept\r
+WSAAddressToStringA\r
+WSAAddressToStringW\r
+WSAAsyncGetHostByAddr\r
+WSAAsyncGetHostByName\r
+WSAAsyncGetProtoByName\r
+WSAAsyncGetProtoByNumber\r
+WSAAsyncGetServByName\r
+WSAAsyncGetServByPort\r
+WSAAsyncSelect\r
+WSACancelAsyncRequest\r
+WSACancelBlockingCall\r
+WSACleanup\r
+WSACloseEvent\r
+WSAConnect\r
+WSACreateEvent\r
+WSADuplicateSocketA\r
+WSADuplicateSocketW\r
+WSAEnumNameSpaceProvidersA\r
+WSAEnumNameSpaceProvidersW\r
+WSAEnumNetworkEvents\r
+WSAEnumProtocolsA\r
+WSAEnumProtocolsW\r
+WSAEventSelect\r
+WSAGetLastError\r
+WSAGetOverlappedResult\r
+WSAGetQOSByName\r
+WSAGetServiceClassInfoA\r
+WSAGetServiceClassInfoW\r
+WSAGetServiceClassNameByClassIdA\r
+WSAGetServiceClassNameByClassIdW\r
+WSAHtonl\r
+WSAHtons\r
+WSAInstallServiceClassA\r
+WSAInstallServiceClassW\r
+WSAIoctl\r
+WSAIsBlocking\r
+WSAJoinLeaf\r
+WSALookupServiceBeginA\r
+WSALookupServiceBeginW\r
+WSALookupServiceEnd\r
+WSALookupServiceNextA\r
+WSALookupServiceNextW\r
+WSANSPIoctl\r
+WSANtohl\r
+WSANtohs\r
+WSAProviderConfigChange\r
+WSARecv\r
+WSARecvDisconnect\r
+WSARecvFrom\r
+WSARemoveServiceClass\r
+WSAResetEvent\r
+WSASend\r
+WSASendDisconnect\r
+WSASendTo\r
+WSASetBlockingHook\r
+WSASetEvent\r
+WSASetLastError\r
+WSASetServiceA\r
+WSASetServiceW\r
+WSASocketA\r
+WSASocketW\r
+WSAStartup\r
+WSAStringToAddressA\r
+WSAStringToAddressW\r
+WSAUnhookBlockingHook\r
+WSAWaitForMultipleEvents\r
+WSApSetPostRoutine\r
+WSCDeinstallProvider\r
+WSCEnableNSProvider\r
+WSCEnumProtocols\r
+WSCGetProviderPath\r
+WSCInstallNameSpace\r
+WSCInstallProvider\r
+WSCUnInstallNameSpace\r
+WSCUpdateProvider\r
+WSCWriteNameSpaceOrder\r
+WSCWriteProviderOrder\r
+__WSAFDIsSet\r
+accept\r
+bind\r
+closesocket\r
+connect\r
+freeaddrinfo\r
+getaddrinfo\r
+gethostbyaddr\r
+gethostbyname\r
+gethostname\r
+getnameinfo\r
+getpeername\r
+getprotobyname\r
+getprotobynumber\r
+getservbyname\r
+getservbyport\r
+getsockname\r
+getsockopt\r
+htonl\r
+htons\r
+inet_addr\r
+inet_ntoa\r
+ioctlsocket\r
+listen\r
+ntohl\r
+ntohs\r
+recv\r
+recvfrom\r
+select\r
+send\r
+sendto\r
+setsockopt\r
+shutdown\r
+socket\r
diff --git a/kipfw/Makefile b/kipfw/Makefile
new file mode 100644 (file)
index 0000000..6ca0562
--- /dev/null
@@ -0,0 +1,367 @@
+# $Id: Makefile 12257 2013-04-26 21:13:24Z luigi $
+# gnu Makefile to build linux/Windows module for ipfw+dummynet.
+#
+# The defaults are set to build without modifications on PlanetLab
+# and possibly 2.6 versions.
+# On Windows, we use gnu-make and MSC
+
+# Some variables need to have specific names, because they are used
+# by the build infrastructure on Linux and OpenWrt. They are:
+# 
+#   ccflags-y  additional $(CC) flags
+#   M          used by Kbuild, we must set it to `pwd`
+#   obj-m      list of .o modules to build
+#   $(MOD)-y   for each $MOD in obj-m, the list of objects
+#   obj-y      same as above, for openwrt
+#   O_TARGET   the link target, for openwrt
+#   EXTRA_CFLAGS as the name says... in openwrt
+#   EXTRA_CFLAGS is used in 2.6.22 module kernel compilation too
+#   KERNELPATH the path to the kernel sources or headers
+#      (on planetlab it is set already by the build system,
+#      for other systems we take KSRC which is either guessed
+#      or taken from the command line).
+#
+# Not sure about this (the name might be reserved)
+#   ipfw-cflags                our flags for building the module
+#
+# Other variables are only private and can be renamed. They include:
+#
+#   VER                linux version we are building for (2.4 2.6 or openwrt)
+#
+#---
+#
+# The windows files (passthru etc.) are modified version of the
+# examples found in the $(DDK)/src/network/ndis/passthru/driver/
+# They can be re-created using the 'ndis-glue' target in the 
+
+include $(PWD)/../Makefile.inc
+
+TARGET = kipfw
+
+# let's default to 2.6 for planetlab builds
+VER ?= 2.6
+
+#--- General values for all types of build ---
+# obj-m is the target module
+obj-m := ipfw_mod.o
+
+#-- the list of source files. IPFW_SRCS is our own name.
+# Original ipfw and dummynet sources + FreeBSD stuff,
+IPFW_SRCS := ip_fw2.c ip_fw_pfil.c ip_fw_sockopt.c
+IPFW_SRCS += ip_fw_dynamic.c ip_fw_table.c ip_fw_log.c
+IPFW_SRCS += radix.c in_cksum.c
+IPFW_SRCS += ip_dummynet.c ip_dn_io.c ip_dn_glue.c
+IPFW_SRCS += dn_heap.c
+IPFW_SRCS += dn_sched_fifo.c dn_sched_wf2q.c
+IPFW_SRCS += dn_sched_rr.c dn_sched_qfq.c
+IPFW_SRCS += dn_sched_prio.c
+# Module glue and functions missing in linux
+IPFW_SRCS += ipfw2_mod.c bsd_compat.c
+
+# generic cflags used on all systems
+#ipfw-cflags += -DIPFW_HASHTABLES
+ipfw-cflags += -DIPFIREWALL_DEFAULT_TO_ACCEPT
+# _BSD_SOURCE enables __FAVOR_BSD (udp/tcp bsd structs instead of posix)
+ipfw-cflags += -D_BSD_SOURCE
+ipfw-cflags += -DKERNEL_MODULE # build linux kernel module
+# the two header trees for empty and override files
+ipfw-cflags += -I $(M)/include_e
+ipfw-cflags += -I $(M)/../sys
+ipfw-cflags += -include $(M)/../glue.h # headers
+ipfw-cflags += -include $(M)/missing.h # headers
+
+ifeq ($(OSARCH),Windows)       #--- {  Windows block
+ifeq ($(VER),win64)
+    $(warning ---- building for 64-bit windows ---)
+    win_arch= -DAMD64=1
+else
+    win_arch= -Di386=1
+endif
+    M ?= $(shell pwd)
+    WIN_SRCS += md_win.c
+    WIN_SRCS += miniport.c protocol.c passthru.c debug.c
+    #compiler, linker, target, sources and objects
+    #DDK is exported from the root makefile
+    #DDK = C:/WinDDK/7600.16385.1
+
+    CSOURCES = $(IPFW_SRCS) $(WIN_SRCS)
+
+    COBJS := $(CSOURCES:.c=.obj)
+    COBJS := $(addprefix $(OBJDIR)/,$(COBJS))
+
+    #include paths
+    INCLUDE_PATHS = -Ii386 -I../sys -Iinclude_e -I.
+    # INCLUDE_PATHS += -I$(OBJDIR)
+    INCLUDE_PATHS += -I$(DDK)/inc/api
+    INCLUDE_PATHS += -I$(DDK)/inc/ddk
+    INCLUDE_PATHS += -I$(DDK)/inc/crt
+
+    # #preprocessor MS defines
+    PREPROC  = -D_X86_=1 -Di386=1 -DSTD_CALL -DCONDITION_HANDLING=1
+    PREPROC += -DNT_UP=0 -DNT_INST=0 -DWIN32=100 -D_NT1X_=100 -DWINNT=1
+    PREPROC += -D_WIN32_WINNT=0x0501 -DWINVER=0x0501 -D_WIN32_IE=0x0603
+    PREPROC += -DWIN32_LEAN_AND_MEAN=1 
+    PREPROC += -D__BUILDMACHINE__=WinDDK -DFPO=0 -D_DLL=1
+    PREPROC += -DNDIS_MINIPORT_DRIVER -DNDIS_WDM=1
+    PREPROC += -DNDIS51_MINIPORT=1 -DNDIS51=1
+    PREPROC += -DMSC_NOOPT -DNTDDI_VERSION=0x05010200
+    PREPROC += -DKMDF_MAJOR_VERSION_STRING=01 -DKMDF_MINOR_VERSION_STRING=009
+    #PREPROC += -DDBG=1 #debug
+    PREPROC += -DNDEBUG #always up, seems no effect, possibly no debug?
+    PREPROC += -DDEVL=1 #always up, seems no effect
+    #macroing module name, WARNING: must match the one in .inf files
+    PREPROC += -DMODULENAME=Ipfw 
+
+    #our defines
+    OUR_PREPROC  = -D_KERNEL -DKERNEL_MODULE -DKLD_MODULE
+    OUR_PREPROC += -D__BSD_VISIBLE -DIPFIREWALL_DEFAULT_TO_ACCEPT
+    OUR_PREPROC += -D__LITTLE_ENDIAN -DSYSCTL_NODE -DEMULATE_SYSCTL
+
+ifeq ($(TCC),) # Microsoft C compiler
+    CC = $(DDK)/bin/x86/x86/cl.exe
+    LD = $(DDK)/bin/x86/x86/link.exe
+    # #compiler options
+    CFLAGS  = -Fo$(OBJDIR)/  -c -FC -Zc:wchar_t-
+    CFLAGS += -Zl -Zp8 -Gy -Gm- -GF -cbstring -Gz -hotpatch -EHs-c-
+    CFLAGS += -W2 # -W3 gives too many conversion errors
+    CFLAGS += -GR- -GF -GS -Zi # XXX do we need this ?
+    CFLAGS += -Fd$(OBJDIR)/
+    CFLAGS += -wd4603 -wd4627 -typedil-
+    CFLAGS += -FI $(DDK)/inc/api/warning.h
+    CFLAGS += -FI winmissing.h
+    CFLAGS += -FI missing.h    # headers
+    CFLAGS += -FI ../glue.h    # headers
+
+    #optimization options
+    OPTIMIZE = -Od -Oi -Oy-
+
+    #linker options
+    LDFLAGS  = /MERGE:_PAGE=PAGE /MERGE:_TEXT=.text
+    LDFLAGS += /SECTION:INIT,d /OPT:REF /OPT:ICF
+    LDFLAGS += /IGNORE:4198,4010,4037,4039,4065,4070,4078,4087,4089,4221
+    LDFLAGS += /INCREMENTAL:NO /release /NODEFAULTLIB /WX
+    LDFLAGS += /debug /debugtype:cv,fixup,pdata
+    LDFLAGS += /version:6.1 /osversion:6.1 /functionpadmin:5
+    LDFLAGS += /safeseh /pdbcompress
+    LDFLAGS += /STACK:0x40000,0x1000 /driver /base:0x10000 /align:0x80
+    LDFLAGS += /stub:$(DDK)\\lib\\wxp\\stub512.com
+    LDFLAGS += /subsystem:native,5.01 /entry:GsDriverEntry@8
+    LDFLAGS += /out:$(OBJDIR)/ipfw.sys
+
+    #libraries to build against
+    LIBS  = $(DDK)/lib/wxp/i386/BufferOverflowK.lib
+    LIBS += $(DDK)/lib/wxp/i386/ntoskrnl.lib
+    LIBS += $(DDK)/lib/wxp/i386/hal.lib
+    LIBS += $(DDK)/lib/wxp/i386/wmilib.lib
+    LIBS += $(DDK)/lib/wxp/i386/ndis.lib
+    LIBS += $(DDK)/lib/wxp/i386/sehupd.lib
+else   # use tcc. not working yet for the kernel module.
+    # TCC points to the root of tcc tree
+    CC=$(TCC)/bin/wintcc
+    EXTRA_CFLAGS += -DTCC -I..
+    EXTRA_CFLAGS += -I$(TCC)/include/winapi -I$(TCC)/include
+    EXTRA_CFLAGS += -nostdinc
+
+    CFLAGS += -include winmissing.h -include missing.h -include ../glue.h
+    CFLAGS += -I../../inc/api -I../../inc/ddk -I../../inc/crt
+    CFLAGS += -DRC_INVOKED
+endif # use tcc
+
+    #empty include directory to be built
+    M ?= $(shell pwd)
+    EFILES_asm += div64.h
+    EFILES_linux += if.h random.h errno.h
+    EFILES_net += if_types.h inet_hashtables.h route.h
+
+    #targets
+all: $(TARGET)
+
+$(TARGET): include_e
+       # XXX dangerous rm -rf $(OBJDIR)
+       mkdir -p $(OBJDIR)
+       $(MSG) "  CC [$(CC)] $(CSOURCES)"
+       $(HIDE) $(CC) $(INCLUDE_PATHS) $(PREPROC) $(OUR_PREPROC) $(CFLAGS) $(OPTIMIZE) $(CSOURCES)
+       $(MSG) "  LD [$(LD)] $(COBJS)"
+       $(HIDE) $(LD) $(LDFLAGS) $(COBJS) $(LIBS)
+
+else # } { linux variables and targets
+
+# We have three sections: OpenWrt, Linux 2.4 and Linux 2.6
+
+ifeq ($(VER),openwrt)  #--- { The Makefile section for openwrt ---
+  # We do not include a dependency on include_e as it is called
+  # by Makefile.openwrt in Build/Prepare
+  M=.
+  obj-y := $(IPFW_SRCS:%.c=%.o)
+  O_TARGET := $(obj-m)
+
+  # xcflags-y is a temporary variable where we store build options
+  xcflags-y += -O1 -DLINUX_24
+  xcflags-y += -g
+
+  EXTRA_CFLAGS := $(xcflags-y) $(ipfw-cflags) -DSYSCTL_NODE -DEMULATE_SYSCTL
+
+  # we should not export anything
+  #export-objs := ipfw2_mod.o
+-include $(TOPDIR)/Rules.make
+endif # ---- } end openwrt version
+
+
+ifneq ($(shell echo $(VER)|grep '2.4'),)       #--- {
+  # Makefile section for the linux 2.4 version
+  # tested on linux-2.4.35.4, does not work with 2.4.37
+  #
+  # guess the kernel path -- or is it under /lib/modules ?
+  KERNELPATH ?= $(KSRC)
+
+  # We need to figure out the gcc include directory, if not
+  # set by the user through MYGCC_INCLUDE
+  # Find compiler version (3rd field in last line returned by gcc -v)
+  # e.g.       gcc version 4.3.2 (Debian 4.3.2-1.1)
+  MYGCC_VER ?= $(shell $(CC) -v 2>&1 |tail -n 1 | cut -d " " -f 3)
+  # We don't know the exact directory under /usr/lib/gcc so we guess
+  MYGCC_INCLUDE ?= $(shell echo /usr/lib/gcc/*/$(MYGCC_VER) | cut -d " " -f 1)/include
+  $(warning "---- gcc includes guessed to $(MYGCC_INCLUDE)")
+
+  # additional warning
+  WARN += -Wall -Wundef
+  WARN += -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing
+  WARN += -fno-common -Werror-implicit-function-declaration
+  # WARN += -O2  -fno-stack-protector -m32 -msoft-float -mregparm=3
+  # -mregparm=3 gives a printk error
+  WARN += -m32 -msoft-float # -mregparm=3
+  #WARN += -freg-struct-return -mpreferred-stack-boundary=2
+  WARN += -Wno-sign-compare
+  WARN += -Wdeclaration-after-statement
+  ifneq ($(MYGCC_VER),3.4.6)
+        WARN += -Wno-pointer-sign
+  endif
+
+  ccflags-y += -O1 -DLINUX_24
+  CFLAGS = -DMODULE -D__KERNEL__ -nostdinc \
+       -isystem ${KERNELPATH}/include -isystem $(MYGCC_INCLUDE) \
+       ${ccflags-y}
+  # The Main target
+all: mod24
+
+else # --- } {  linux 2.6 and newer
+
+  # This is the Makefile section for Linux 2.6.x including planetlab
+
+ifeq ($(IPFW_PLANETLAB),1)
+  $(warning "---- Building for PlanetLab")
+  ipfw-cflags += -DIPFW_PLANETLAB        # PlanetLab compilation
+endif
+  # if not set, use the version from the installed system
+  KERNELPATH ?= $(KSRC)
+#  $(warning "---- Building Version 2.6 $(VER) in $(KERNELPATH)")
+  WARN := -O1 -Wall -Werror -DDEBUG_SPINLOCK -DDEBUG_MUTEXES
+  # The main target
+
+  # Required by GCC 4.6
+  ccflags-y += -Wno-unused-but-set-variable
+
+  # extract version number (decimal). Newer linuxes have a different dir
+  LINUX_VERSION_CODE := $(shell V=linux/version.h; G=. ; \
+        [ -f $(KERNELPATH)/include/$${V} ] || G=generated/uapi ;\
+        grep LINUX_VERSION_CODE $(KERNELPATH)/include/$${G}/linux/version.h | \
+        awk '{printf "%d", $$3} ')
+
+  #     awk '{printf "%d %03x%02d", $$3, $$3/256, $$3%256} ')
+  # $(warning version $(LINUX_VERSION_CODE))
+
+  ifeq ($(shell if [ -z $(LINUX_VERSION_CODE) ] ; then echo "true"; fi),true)
+    $(warning "---- Perhaps you miss a (cd $(KERNELPATH); make oldconfig; make prepare; make scripts)");
+  endif
+
+  # Required by kernel <= 2.6.22, ccflags-y is used on newer version
+  ifeq ($(shell if [ "$(LINUX_VERSION_CODE)" -le 132630 ] ; then echo "true"; fi),true)
+    EXTRA_CFLAGS += $(ccflags-y)
+  endif
+
+  $(warning $(shell [ "$(LINUX_VERSION_CODE)" -le 132635 ] && \
+       [ `$(MAKE) -version | head -1 | cut -d " " -f 3` != '3.81' ] && \
+       echo "****   need make 3.81 *****") )
+  # $(warning make is $(MAKE) version is $(shell $(MAKE) -version | head -1) )
+
+all: $(TARGET)
+$(TARGET):     include_e
+       $(MAKE) -C $(KERNELPATH) V=$(V) M=`pwd` modules
+
+endif # } --- linux 2.6 and newer
+
+#-- back to the common section for linux
+
+# the list of objects used to build the module
+ipfw_mod-y = $(IPFW_SRCS:%.c=%.o)
+
+# additional $(CC) flags
+ccflags-y += $(WARN)
+ccflags-y += $(ipfw-cflags)
+# if we really want debug symbols...
+ccflags-y += -g
+
+mod24: include_e $(obj-m)
+
+$(obj-m): $(ipfw_mod-y)
+       $(LD) $(LDFLAGS) -m elf_i386 -r -o $@ $^
+
+# M is the current directory, used in recursive builds
+# so we allow it to be overridden
+M ?= $(shell pwd)
+
+endif # } ----- end of the non-Windows block
+
+ifneq ($(OBJDIR),mia)
+    $(error objdir set to $(OBJDIR))
+endif
+
+#--- various common targets
+clean:
+       -@rm -f *.o *.ko Module.symvers *.mod.c
+       -@# rm -rf $(OBJDIR)
+       -@rm -rf include_e
+
+distclean: clean
+       -@rm -f .*cmd modules.order opt_*
+       -@rm -rf .tmp_versions .*.o.d _CL_*
+
+# support to create empty dirs and files in include_e/
+# EFILES_foo/bar is the list of files to be created in foo/bar
+# (/ and . are allowed in gmake variable names)
+
+EFILES_. += opt_inet.h opt_inet6.h opt_ipfw.h opt_ipsec.h opt_mpath.h
+EFILES_. += opt_mbuf_stress_test.h opt_param.h opt_ipdivert.h
+
+EFILES_altq += if_altq.h
+EFILES_arpa += inet.h
+EFILES_machine += in_cksum.h
+EFILES_net += ethernet.h netisr.h pf_mtag.h bpf.h if_types.h vnet.h
+
+EFILES_netinet += ether.h icmp6.h if_ether.h in.h in_pcb.h in_var.h
+EFILES_netinet += in_systm.h ip_carp.h ip_var.h pim.h
+EFILES_netinet += sctp.h tcp_timer.h tcpip.h udp_var.h
+EFILES_netinet6 += ip6_var.h
+
+EFILES_sys += _lock.h _rwlock.h rmlock.h _mutex.h jail.h
+EFILES_sys += condvar.h eventhandler.h domain.h
+EFILES_sys += limits.h lock.h mutex.h priv.h
+EFILES_sys += proc.h rwlock.h socket.h socketvar.h
+EFILES_sys += sysctl.h time.h ucred.h
+
+# first make a list of directories from variable names
+EDIRS= $(subst EFILES_,,$(filter EFILES_%,$(.VARIABLES)))
+# then prepend the directory name to individual files.
+#       $(empty) serves to interpret the following space literally,
+#       and the ":  = " substitution packs spaces into one.
+EFILES = $(foreach i,$(EDIRS),$(subst $(empty) , $(i)/, $(EFILES_$(i):  = )))
+
+include_e:
+       -@rm -rf $(M)/include_e opt_*
+       -@mkdir -p $(M)/include_e
+       -@(cd $(M)/include_e; mkdir -p $(EDIRS); touch $(EFILES) )
+
+#--- some other targets for testing purposes
+test_radix: test_radix.o radix.o
+test_lookup: ip_fw_lookup.o
+test_radix test_lookup: CFLAGS=-Wall -Werror -O1
diff --git a/kipfw/bsd_compat.c b/kipfw/bsd_compat.c
new file mode 100644 (file)
index 0000000..ed2ce5d
--- /dev/null
@@ -0,0 +1,568 @@
+/*
+ * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id: bsd_compat.c 11530 2012-08-01 10:29:32Z luigi $
+ *
+ * kernel variables and functions that are not available in linux.
+ */
+
+#include <sys/cdefs.h>
+#include <asm/div64.h> /* do_div on 2.4 */
+#include <linux/random.h>      /* get_random_bytes on 2.4 */
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+#include <sys/malloc.h>
+
+/*
+ * gettimeofday would be in sys/time.h but it is not
+ * visible if _KERNEL is defined
+ */
+int gettimeofday(struct timeval *, struct timezone *);
+
+int ticks;             /* kernel ticks counter */
+int hz = 1000;         /* default clock time */
+long tick = 1000;      /* XXX is this 1000000/hz ? */
+int bootverbose = 0;
+struct timeval boottime;
+
+int     ip_defttl = 64;        /* XXX set default value */
+int    max_linkhdr = 16;
+int fw_one_pass = 1;
+u_long  in_ifaddrhmask;                         /* mask for hash table */
+struct  in_ifaddrhashhead *in_ifaddrhashtbl;    /* inet addr hash table  */
+
+u_int rt_numfibs = RT_NUMFIBS;
+
+/*
+ * pfil hook support.
+ * We make pfil_head_get return a non-null pointer, which is then ignored
+ * in our 'add-hook' routines.
+ */
+struct pfil_head;
+typedef int (pfil_hook_t)
+       (void *, struct mbuf **, struct ifnet *, int, struct inpcb *);
+
+struct pfil_head *
+pfil_head_get(int proto, u_long flags)
+{
+       static int dummy;
+       return (struct pfil_head *)&dummy;
+}
+int
+pfil_add_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h)
+{
+       return 0;
+}
+
+int
+pfil_remove_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h)
+{
+       return 0;
+}
+
+/* define empty body for kernel function */
+int
+priv_check(struct thread *td, int priv)
+{
+       return 0;
+}
+
+int
+securelevel_ge(struct ucred *cr, int level)
+{
+       return 0;
+}
+
+int
+sysctl_handle_int(SYSCTL_HANDLER_ARGS)
+{
+       return 0;
+}
+
+int
+sysctl_handle_long(SYSCTL_HANDLER_ARGS)
+{
+       return 0;
+}
+
+void
+ether_demux(struct ifnet *ifp, struct mbuf *m)
+{
+       return;
+}
+
+int
+ether_output_frame(struct ifnet *ifp, struct mbuf *m)
+{
+       return 0;
+}
+
+void
+in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum)
+{
+       return;
+}
+
+void
+icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu)
+{
+       return;
+}
+
+u_short
+in_cksum_skip(struct mbuf *m, int len, int skip)
+{
+       return 0;
+}
+
+u_short
+in_cksum_hdr(struct ip *ip)
+{
+       return 0;
+}
+
+/*
+ * we don't really reassemble, just return whatever we had.
+ */
+struct mbuf *
+ip_reass(struct mbuf *clone)
+{
+       return clone;
+}
+#ifdef INP_LOCK_ASSERT
+#undef INP_LOCK_ASSERT
+#define INP_LOCK_ASSERT(a)
+#endif
+
+/* credentials check */
+#include <netinet/ip_fw.h>
+#ifdef __linux__
+int
+cred_check(void *_insn,  int proto, struct ifnet *oif,
+    struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
+    u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp,
+    struct sk_buff *skb)
+{
+       int match = 0;
+       ipfw_insn_u32 *insn = (ipfw_insn_u32 *)_insn;
+
+       if (*ugid_lookupp == 0) {        /* actively lookup and copy in cache */
+               /* returns null if any element of the chain up to file is null.
+                * if sk != NULL then we also have a reference
+                */
+               *ugid_lookupp = linux_lookup(proto,
+                       src_ip.s_addr, htons(src_port),
+                       dst_ip.s_addr, htons(dst_port),
+                       skb, oif ? 1 : 0, u);
+       }
+       if (*ugid_lookupp < 0)
+               return 0;
+
+       if (insn->o.opcode == O_UID)
+               match = (u->uid == (uid_t)insn->d[0]);
+       else if (insn->o.opcode == O_JAIL)
+               match = (u->xid == (uid_t)insn->d[0]);
+       else if (insn->o.opcode == O_GID)
+               match = (u->gid == (uid_t)insn->d[0]);
+       return match;
+}
+#endif /* __linux__ */
+
+int
+jailed(struct ucred *cred)
+{
+       return 0;
+}
+
+/*
+* Return 1 if an internet address is for a ``local'' host
+* (one to which we have a connection).  If subnetsarelocal
+* is true, this includes other subnets of the local net.
+* Otherwise, it includes only the directly-connected (sub)nets.
+* NOTE: this compatibility version does no lookup and always returns 1.
+*/
+int
+in_localaddr(struct in_addr in)
+{
+       return 1;
+}
+
+int
+sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
+{
+       size_t valsize = sopt->sopt_valsize;
+
+       if (len < valsize)
+               sopt->sopt_valsize = valsize = len;
+       //printf("copyout buf = %p, sopt = %p, soptval = %p, len = %d \n", buf, sopt, sopt->sopt_val, len);
+       bcopy(buf, sopt->sopt_val, valsize);
+       return 0;
+}
+
+/*
+ * copy data from userland to kernel
+ */
+int
+sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
+{
+       size_t valsize = sopt->sopt_valsize;
+
+       if (valsize < minlen)
+               return EINVAL;
+       if (valsize > len)
+               sopt->sopt_valsize = valsize = len;
+       //printf("copyin buf = %p, sopt = %p, soptval = %p, len = %d \n", buf, sopt, sopt->sopt_val, len);
+       bcopy(sopt->sopt_val, buf, valsize);
+       return 0;
+}
+
+void
+getmicrouptime(struct timeval *tv)
+{
+       do_gettimeofday(tv);
+}
+
+
+#include <arpa/inet.h>
+
+/*
+ * Format the four bytes of ina (in memory order) as a dotted-decimal
+ * string into buf and return buf.
+ * buf must have room for at least 16 bytes ("255.255.255.255" plus NUL).
+ * On Windows no conversion is done and an empty string is returned.
+ */
+char *
+inet_ntoa_r(struct in_addr ina, char *buf)
+{
+#ifdef _WIN32
+       /* conversion not implemented here: hand back an empty string
+        * rather than whatever the caller's buffer happened to contain */
+       buf[0] = '\0';
+#else
+       unsigned char *ucp = (unsigned char *)&ina;
+
+       sprintf(buf, "%d.%d.%d.%d",
+       ucp[0] & 0xff,
+       ucp[1] & 0xff,
+       ucp[2] & 0xff,
+       ucp[3] & 0xff);
+#endif
+       return buf;
+}
+
+char *
+inet_ntoa(struct in_addr ina)
+{
+       static char buf[16];
+       return inet_ntoa_r(ina, buf);
+}
+
+int
+random(void)
+{
+#ifdef _WIN32
+       static unsigned long seed;
+       if (seed == 0) {
+               LARGE_INTEGER tm;
+               KeQuerySystemTime(&tm);
+               seed = tm.LowPart;
+       }
+       return RtlRandomEx(&seed) & 0x7fffffff;
+#else
+       int r;
+       get_random_bytes(&r, sizeof(r));
+       return r & 0x7fffffff; 
+#endif
+}
+
+
+/*
+ * do_div really does a u64 / u32 bit division.
+ * We save the sign and convert to unsigned before calling.
+ * We are safe just because we always call it with small operands.
+ */
+int64_t
+div64(int64_t a, int64_t b)
+{
+#ifdef _WIN32
+        int a1 = a, b1 = b;
+       return a1/b1;
+#else
+       uint64_t ua, ub;
+       int sign = ((a>0)?1:-1) * ((b>0)?1:-1);
+
+       ua = ((a>0)?a:-a);
+       ub = ((b>0)?b:-b);
+        do_div(ua, ub);
+       return sign*ua;
+#endif
+}
+
+#ifdef __MIPSEL__
+size_t
+strlcpy(char *dst, const char *src, size_t siz)
+{
+        char *d = dst;
+        const char *s = src;
+        size_t n = siz;
+        /* Copy as many bytes as will fit */
+        if (n != 0 && --n != 0) {
+                do {
+                        if ((*d++ = *s++) == 0)
+                                break;
+                } while (--n != 0);
+        }
+
+        /* Not enough room in dst, add NUL and traverse rest of src */
+        if (n == 0) {
+                if (siz != 0)
+                        *d = '\0';              /* NUL-terminate dst */
+                while (*s++)
+                        ;
+        }
+
+        return(s - src - 1);    /* count does not include NUL */
+}
+#endif // __MIPSEL__
+
+/*
+ * compact version of fnmatch.
+ */
+int
+fnmatch(const char *pattern, const char *string, int flags)
+{
+       char s;
+
+       if (!string || !pattern)
+               return 1;       /* no match */
+       while ( (s = *string++) ) {
+               char p = *pattern++;
+               if (p == '\0')          /* pattern is over, no match */
+                       return 1;
+               if (p == '*')           /* wildcard, match */
+                       return 0;
+               if (p == '.' || p == s) /* char match, continue */
+                       continue;
+               return 1;               /* no match */
+       }
+       /* end of string, make sure the pattern is over too */
+       if (*pattern == '\0' || *pattern == '*')
+               return 0;
+       return 1;       /* no match */
+}
+
+
+/*
+ * linux 2.6.31 and later define these functions to access
+ * skbuff internal structures. Define the missing
+ * functions for the previous versions too.
+ */
+#ifdef linux
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31)
+inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
+{
+        skb->dst = dst;
+}
+
+inline struct dst_entry *skb_dst(const struct sk_buff *skb)
+{
+        return (struct dst_entry *)skb->dst;
+}
+#endif /* < 2.6.31 */
+#endif /* linux */
+
+
+/* support for sysctl emulation.
+ * XXX this is actually MI code that should be enabled also on openwrt
+ */
+#ifdef EMULATE_SYSCTL
+static struct sysctltable GST;
+
+int
+kesysctl_emu_get(struct sockopt* sopt)
+{
+       struct dn_id* oid = sopt->sopt_val;
+       struct sysctlhead* entry;
+       int sizeneeded = sizeof(struct dn_id) + GST.totalsize +
+               sizeof(struct sysctlhead);
+       unsigned char* pstring;
+       unsigned char* pdata;
+       int i;
+       
+       if (sopt->sopt_valsize < sizeneeded) {
+               // this is a probe to retrieve the space needed for
+               // a dump of the sysctl table
+               oid->id = sizeneeded;
+               sopt->sopt_valsize = sizeof(struct dn_id);
+               return 0;
+       }
+       
+       entry = (struct sysctlhead*)(oid+1);
+       for( i=0; i<GST.count; i++) {
+               entry->blocklen = GST.entry[i].head.blocklen;
+               entry->namelen = GST.entry[i].head.namelen;
+               entry->flags = GST.entry[i].head.flags;
+               entry->datalen = GST.entry[i].head.datalen;
+               pdata = (unsigned char*)(entry+1);
+               pstring = pdata+GST.entry[i].head.datalen;
+               bcopy(GST.entry[i].data, pdata, GST.entry[i].head.datalen);
+               bcopy(GST.entry[i].name, pstring, GST.entry[i].head.namelen);
+               entry = (struct sysctlhead*)
+                       ((unsigned char*)(entry) + GST.entry[i].head.blocklen);
+       }
+       sopt->sopt_valsize = sizeneeded;
+       return 0;
+}
+
+/*
+ * Set one emulated sysctl variable from a request buffer.
+ * p points to a struct dn_id followed by a struct sysctlhead, then
+ * datalen bytes of data and then the NUL-terminated variable name.
+ * l is taken to be the number of valid bytes at p (NOTE(review): the
+ * previous version ignored it; confirm against the caller).
+ * Returns 0 on success or when no entry has that name, -1 if the
+ * request is malformed, the data length differs from the table entry,
+ * or the entry is read only.
+ */
+int
+kesysctl_emu_set(void* p, int l)
+{
+       struct sysctlhead* entry;
+       unsigned char* pdata;
+       unsigned char* pstring;
+       size_t avail, namemax, k;
+       int i = 0;
+
+       /* never look at the headers unless the buffer can hold them */
+       if (p == NULL || l < 0 ||
+           (size_t)l < sizeof(struct dn_id) + sizeof(struct sysctlhead)) {
+               printf("%s: request too short\n", __FUNCTION__);
+               return -1;
+       }
+
+       entry = (struct sysctlhead*)(((struct dn_id*)p)+1);
+       pdata = (unsigned char*)(entry+1);
+
+       /* bytes left after the two headers: data followed by the name */
+       avail = (size_t)l - sizeof(struct dn_id) - sizeof(struct sysctlhead);
+       if ((size_t)entry->datalen >= avail) {
+               printf("%s: data length exceeds request\n", __FUNCTION__);
+               return -1;
+       }
+       pstring = pdata + entry->datalen;
+
+       /* the name must be NUL-terminated inside the buffer,
+        * otherwise strcmp() below would run past its end */
+       namemax = avail - (size_t)entry->datalen;
+       for (k = 0; k < namemax && pstring[k] != '\0'; k++)
+               ;
+       if (k == namemax) {
+               printf("%s: name is not terminated\n", __FUNCTION__);
+               return -1;
+       }
+
+       for (i=0; i<GST.count; i++) {
+               if (strcmp(GST.entry[i].name, pstring) != 0)
+                       continue;
+               printf("%s: match found! %s\n",__FUNCTION__,pstring);
+               //sanity check on len, not really useful now since
+               //we only accept int32
+               if (entry->datalen != GST.entry[i].head.datalen) {
+                       printf("%s: len mismatch, user %d vs kernel %d\n",
+                               __FUNCTION__, entry->datalen,
+                               GST.entry[i].head.datalen);
+                       return -1;
+               }
+               // check access (at the moment flags handles only the R/W
+               // rights, later on it will be type + access)
+               if( (GST.entry[i].head.flags & 3) == CTLFLAG_RD) {
+                       printf("%s: the entry %s is read only\n",
+                               __FUNCTION__,GST.entry[i].name);
+                       return -1;
+               }
+               bcopy(pdata, GST.entry[i].data, GST.entry[i].head.datalen);
+               return 0;
+       }
+       printf("%s: match not found\n",__FUNCTION__);
+       return 0;
+}
+
+/* convert all _ to . until the first . */
+static void
+underscoretopoint(char* s)
+{
+       for (; *s && *s != '.'; s++)
+               if (*s == '_')
+                       *s = '.';
+}
+
+static int
+formatnames()
+{
+       int i;
+       int size=0;
+       char* name;
+
+       for (i=0; i<GST.count; i++)
+               size += GST.entry[i].head.namelen;
+       GST.namebuffer = malloc(size, 0, 0);
+       if (GST.namebuffer == NULL)
+               return -1;
+       name = GST.namebuffer;
+       for (i=0; i<GST.count; i++) {
+               bcopy(GST.entry[i].name, name, GST.entry[i].head.namelen);
+               underscoretopoint(name);
+               GST.entry[i].name = name;
+               name += GST.entry[i].head.namelen;
+       }
+       return 0;
+}
+
+static void
+dumpGST()
+{
+       int i;
+
+       for (i=0; i<GST.count; i++) {
+               printf("SYSCTL: entry %i\n", i);
+               printf("name %s\n", GST.entry[i].name);
+               printf("namelen %i\n", GST.entry[i].head.namelen);
+               printf("type %i access %i\n",
+                       GST.entry[i].head.flags >> 2,
+                       GST.entry[i].head.flags & 0x00000003);
+               printf("data %i\n", *(int*)(GST.entry[i].data));
+               printf("datalen %i\n", GST.entry[i].head.datalen);
+               printf("blocklen %i\n", GST.entry[i].head.blocklen);
+       }
+}
+
+void sysctl_addgroup_f1();
+void sysctl_addgroup_f2();
+void sysctl_addgroup_f3();
+void sysctl_addgroup_f4();
+
+void
+keinit_GST()
+{
+       int ret;
+
+       sysctl_addgroup_f1();
+       sysctl_addgroup_f2();
+       sysctl_addgroup_f3();
+       sysctl_addgroup_f4();
+       ret = formatnames();
+       if (ret != 0)
+               printf("conversion of names failed for some reason\n");
+       //dumpGST();
+       printf("*** Global Sysctl Table entries = %i, total size = %i ***\n",
+               GST.count, GST.totalsize);
+}
+
+void
+keexit_GST()
+{
+       if (GST.namebuffer != NULL)
+               free(GST.namebuffer,0);
+       bzero(&GST, sizeof(GST));
+}
+
+void
+sysctl_pushback(char* name, int flags, int datalen, void* data)
+{
+       if (GST.count >= GST_HARD_LIMIT) {
+               printf("WARNING: global sysctl table full, this entry will not be added,"
+                               "please recompile the module increasing the table size\n");
+               return;
+       }
+       GST.entry[GST.count].head.namelen = strlen(name)+1; //add space for '\0'
+       GST.entry[GST.count].name = name;
+       GST.entry[GST.count].head.flags = flags;
+       GST.entry[GST.count].data = data;
+       GST.entry[GST.count].head.datalen = datalen;
+       GST.entry[GST.count].head.blocklen =
+               ((sizeof(struct sysctlhead) + GST.entry[GST.count].head.namelen +
+                       GST.entry[GST.count].head.datalen)+3) & ~3;
+       GST.totalsize += GST.entry[GST.count].head.blocklen;
+       GST.count++;
+}
+#endif /* EMULATE_SYSCTL */
diff --git a/kipfw/debug.c b/kipfw/debug.c
new file mode 100644 (file)
index 0000000..67a4f23
--- /dev/null
@@ -0,0 +1,67 @@
+#include <ntddk.h>
+
+/*
+ * Translate a sockopt command number into its symbolic name, for
+ * debugging output. Numbers that are not listed yield "BOH".
+ */
+const char* texify_cmd(int i)
+{
+       switch (i) {
+       case 108:
+               return "IP_FW3";
+       case 109:
+               return "IP_DUMMYNET3";
+       case 110:
+               return "IP_FW_ADD";
+       case 111:
+               return "IP_FW_DEL";
+       case 112:
+               return "IP_FW_FLUSH";
+       case 113:
+               return "IP_FW_ZERO";
+       case 114:
+               return "IP_FW_GET";
+       case 115:
+               return "IP_FW_RESETLOG";
+       case 116:
+               return "IP_FW_NAT_CFG";
+       case 117:
+               return "IP_FW_NAT_DEL";
+       case 118:
+               return "IP_FW_NAT_GET_CONFIG";
+       case 119:
+               return "IP_FW_NAT_GET_LOG";
+       case 120:
+               return "IP_DUMMYNET_CONFIGURE";
+       case 121:
+               return "IP_DUMMYNET_DEL";
+       case 122:
+               return "IP_DUMMYNET_FLUSH";
+       case 124:       /* 123 has no name here */
+               return "IP_DUMMYNET_GET";
+       default:
+               break;
+       }
+       return "BOH";
+}
+
+/*
+ * Translate an IP protocol number into a short name for debugging
+ * output; anything but 1, 6 and 17 is reported as "OTHER".
+ */
+const char* texify_proto(unsigned int p)
+{
+       switch (p) {
+       case 1:
+               return "ICMP";
+       case 6:
+               return "TCP";
+       case 17:
+               return "UDP";
+       default:
+               break;
+       }
+       return "OTHER";
+}
+
+/*
+ * Print len bytes starting at addr through DbgPrint(), 8 bytes per
+ * row, each row prefixed with its offset. msg labels the dump in the
+ * heading line.
+ *
+ * A short last row is padded with 00. Nothing but the heading is
+ * printed when len is not positive or addr is NULL; the source buffer
+ * is never read past addr+len.
+ */
+void hexdump(unsigned char* addr, int len, const char *msg)
+{
+       int off;                /* offset of the current row */
+       int chunk;              /* bytes available for the current row */
+       unsigned char d[8];
+
+       DbgPrint("%s at %p len %d\n", msg, addr, len);
+       if (addr != NULL) {
+               for (off = 0; off < len; off += 8) {
+                       chunk = len - off;
+                       if (chunk > 8)
+                               chunk = 8;
+                       /* work on a zeroed copy so short rows are padded */
+                       bzero(d, sizeof(d));
+                       bcopy(addr + off, d, chunk);
+                       DbgPrint("%04X %02X %02X %02X %02X %02X %02X %02X %02X\n",
+                               off, d[0], d[1], d[2], d[3], d[4],
+                               d[5], d[6], d[7]);
+               }
+       }
+       DbgPrint("\n");
+}
diff --git a/kipfw/ipfw2_mod.c b/kipfw/ipfw2_mod.c
new file mode 100644 (file)
index 0000000..d0824ce
--- /dev/null
@@ -0,0 +1,955 @@
+/*
+ * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id: ipfw2_mod.c 12501 2014-01-10 01:09:14Z luigi $
+ *
+ * The main interface to build ipfw+dummynet as a linux module.
+ * (and possibly as a windows module as well, though that part
+ * is not complete yet).
+ *
+ * The control interface uses the sockopt mechanism
+ * on a socket(AF_INET, SOCK_RAW, IPPROTO_RAW).
+ *
+ * The data interface uses the netfilter interface, at the moment
+ * hooked to the PRE_ROUTING and POST_ROUTING hooks.
+ * Unfortunately the netfilter interface is a moving target,
+ * so we need a set of macros to adapt to the various cases.
+ *
+ * In the netfilter hook we just mark the packet as 'QUEUE' and then
+ * let the queue handler do all the work (filtering and
+ * possibly emulation).
+ * As we receive packets, we wrap them with an mbuf descriptor
+ * so the existing ipfw+dummynet code runs unmodified.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/mbuf.h>                  /* sizeof struct mbuf */
+#include <sys/param.h>                 /* NGROUPS */
+
+#ifndef D
+#define ND(fmt, ...) do {} while (0)
+#define D1(fmt, ...) do {} while (0)
+#define D(fmt, ...) printf("%-10s " fmt "\n",      \
+        __FUNCTION__, ## __VA_ARGS__)
+#endif
+
+#ifdef __linux__
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#ifndef CONFIG_NETFILTER
+#error should configure netfilter (broken on 2.6.26 and below ?)
+#endif
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>      /* NF_IP_PRI_FILTER */
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25)
+#include <net/netfilter/nf_queue.h>    /* nf_queue */
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14)
+#define __read_mostly
+#endif
+
+#endif /* !__linux__ */
+
+#include <netinet/in.h>                        /* in_addr */
+#include <netinet/ip_fw.h>             /* ip_fw_ctl_t, ip_fw_chk_t */
+#include <netinet/ipfw/ip_fw_private.h>                /* ip_fw_ctl_t, ip_fw_chk_t */
+#include <netinet/ip_dummynet.h>       /* ip_dn_ctl_t, ip_dn_io_t */
+#include <net/pfil.h>                  /* PFIL_IN, PFIL_OUT */
+
+#ifdef __linux__
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,13)
+/* XXX was < 2.6.0:  inet_hashtables.h is introduced in 2.6.14 */
+// #warning --- inet_hashtables not present on 2.4
+#include <linux/tcp.h>
+#include <net/route.h>
+#include <net/sock.h>
+/*
+ * Local replacement for inet_iif(), compiled only on the old kernels
+ * selected by the enclosing #if: return the rt_iif field of the
+ * routing entry attached to skb (skb->dst viewed as a struct rtable).
+ */
+static inline int inet_iif(const struct sk_buff *skb)
+{
+        return ((struct rtable *)skb->dst)->rt_iif;
+}
+
+#else
+#include <net/inet_hashtables.h>       /* inet_lookup */
+#endif
+#endif /* __linux__ */
+
+#include <net/route.h>                 /* inet_iif */
+
+/*
+ * Here we allocate some global variables used in the firewall.
+ */
+//ip_dn_ctl_t    *ip_dn_ctl_ptr;
+int (*ip_dn_ctl_ptr)(struct sockopt *);
+
+ip_fw_ctl_t    *ip_fw_ctl_ptr;
+
+int    (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa);
+ip_fw_chk_t    *ip_fw_chk_ptr;
+
+void           (*bridge_dn_p)(struct mbuf *, struct ifnet *);
+
+/* Divert hooks. */
+void (*ip_divert_ptr)(struct mbuf *m, int incoming);
+
+/* ng_ipfw hooks. */
+ng_ipfw_input_t *ng_ipfw_input_p = NULL;
+
+/*---
+ * Glue code to implement the registration of children with the parent.
+ * Each child should call my_mod_register() when linking, so that
+ * module_init() and module_exit() can call init_children() and
+ * fini_children() to provide the necessary initialization.
+ * We use the same mechanism for MODULE_ and SYSINIT_.
+ * The former only get a pointer to the moduledata,
+ * the latter have two function pointers (init/uninit)
+ */
+#include <sys/module.h>
+/* One registered child, as recorded by my_mod_register(). */
+struct mod_args {
+        const char *name;              /* label printed when the child is started/stopped */
+        int order;                     /* value supplied at registration, only printed */
+        struct moduledata *mod;        /* MODULE_ style child: its evhand is called; may be NULL */
+       void (*init)(void), (*uninit)(void);    /* SYSINIT style child: used when there is no evhand */
+};
+
+static unsigned int mod_idx;
+static struct mod_args mods[10];       /* hard limit to 10 modules */
+
+int
+my_mod_register(const char *name, int order,
+       struct moduledata *mod, void *init, void *uninit);
+/*
+ * my_mod_register should be called automatically as the init
+ * functions in the submodules. Unfortunately this compiler/linker
+ * trick is not supported yet so we call it manually.
+ */
+/*
+ * Record one child so that init_children() / fini_children() can
+ * start and stop it later.
+ *
+ * name    label used in the log messages
+ * order   stored and printed only
+ * mod     module descriptor (MODULE_ style children), or NULL
+ * init    function run at startup when mod has no event handler, or NULL
+ * uninit  function run at shutdown when mod has no event handler, or NULL
+ *
+ * Returns 0 on success. When the table is already full the child is
+ * NOT recorded: a warning is printed and EINVAL is returned.
+ */
+int
+my_mod_register(const char *name, int order,
+       struct moduledata *mod, void *init, void *uninit)
+{
+       struct mod_args m;
+
+       m.name = name;
+       m.order = order;
+       m.mod = mod;
+       m.init = init;
+       m.uninit = uninit;
+
+       printf("%s %s called\n", __FUNCTION__, name);
+       if (mod_idx >= sizeof(mods) / sizeof(mods[0])) {
+               /* do not lose the registration silently */
+               printf("%s: module table full, %s not registered\n",
+                       __FUNCTION__, name);
+               return EINVAL;
+       }
+       mods[mod_idx++] = m;
+       return 0;
+}
+
+/*
+ * Start every registered child, in registration order. A child with a
+ * module descriptor carrying an event handler receives MOD_LOAD;
+ * otherwise its init function, if any, is called.
+ */
+static void
+init_children(void)
+{
+       struct mod_args *cur = mods;
+       unsigned int n;
+
+       printf("%s mod_idx value %d\n", __FUNCTION__, mod_idx);
+       for (n = 0; n < mod_idx; n++, cur++) {
+               printf("+++ start module %d %s %s at %p order 0x%x\n",
+                       n, cur->name, cur->mod ? cur->mod->name : "SYSINIT",
+                       cur->mod, cur->order);
+               if (cur->mod != NULL && cur->mod->evhand != NULL) {
+                       cur->mod->evhand(NULL, MOD_LOAD, cur->mod->priv);
+                       continue;
+               }
+               if (cur->init != NULL)
+                       cur->init();
+       }
+}
+
+/*
+ * Stop every registered child, last registered first. A child with a
+ * module descriptor carrying an event handler receives MOD_UNLOAD;
+ * otherwise its uninit function, if any, is called.
+ */
+static void
+fini_children(void)
+{
+       int i = (int)mod_idx;
+
+       while (--i >= 0) {
+               struct mod_args *cur = mods + i;
+
+               printf("+++ end module %d %s %s at %p order 0x%x\n",
+                       i, cur->name, cur->mod ? cur->mod->name : "SYSINIT",
+                       cur->mod, cur->order);
+               if (cur->mod != NULL && cur->mod->evhand != NULL) {
+                       cur->mod->evhand(NULL, MOD_UNLOAD, cur->mod->priv);
+                       continue;
+               }
+               if (cur->uninit != NULL)
+                       cur->uninit();
+       }
+}
+/*--- end of module binding helper functions ---*/
+
+/*---
+ * Control hooks:
+ * ipfw_ctl_h() is a wrapper for linux to FreeBSD sockopt call convention.
+ * then call the ipfw handler in order to manage requests.
+ * In turn this is called by the linux set/get handlers.
+ */
+/*
+ * Common backend of do_ipfw_set_ctl() and do_ipfw_get_ctl().
+ *
+ * s     sockopt descriptor supplied by the caller; cleared and filled here
+ * cmd   option number, stored in sopt_name and used to pick the handler
+ * dir   stored in sopt_dir (the callers pass SOPT_SET or SOPT_GET)
+ * len   stored in sopt_valsize
+ * user  user buffer, stored in sopt_val
+ *
+ * Returns the negated result of the selected handler, or -EINVAL when
+ * no installed handler matches cmd.
+ */
+static int
+ipfw_ctl_h(struct sockopt *s, int cmd, int dir, int len, void __user *user)
+{
+       struct thread t;
+       int ret = EINVAL;       /* result when no handler takes the command */
+
+       memset(s, 0, sizeof(*s));
+       s->sopt_name = cmd;
+       s->sopt_dir = dir;
+       s->sopt_valsize = len;
+       s->sopt_val = user;
+
+       /* sopt_td is not used but it is referenced */
+       memset(&t, 0, sizeof(t));
+       s->sopt_td = &t;
+       
+       //printf("%s called with cmd %d len %d sopt %p user %p\n", __FUNCTION__, cmd, len, s, user);
+
+       /*
+        * IP_FW3 and everything below IP_DUMMYNET_CONFIGURE except
+        * IP_DUMMYNET3 goes to ipfw; IP_DUMMYNET3 and everything from
+        * IP_DUMMYNET_CONFIGURE up goes to dummynet.
+        */
+       if (ip_fw_ctl_ptr && cmd != IP_DUMMYNET3 && (cmd == IP_FW3 ||
+           cmd < IP_DUMMYNET_CONFIGURE))
+               ret = ip_fw_ctl_ptr(s);
+       else if (ip_dn_ctl_ptr && (cmd == IP_DUMMYNET3 ||
+           cmd >= IP_DUMMYNET_CONFIGURE))
+               ret = ip_dn_ctl_ptr(s);
+       
+       return -ret;    /* errors are < 0 on linux */
+}
+
+#ifdef linux
+/*
+ * Convert an mbuf into an skbuff
+ * At the moment this only works for ip packets fully contained
+ * in a single mbuf. We assume that on entry ip_len and ip_off are
+ * in host format, and the ip checksum is not computed.
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) /* check boundary */
+/* 2.4 kernels: stubs only, nothing is ever built or sent */
+int dst_output(struct skbuff *s)
+{
+       return 0;
+}
+
+struct sk_buff *
+mbuf2skbuff(struct mbuf* m)
+{
+       return NULL;
+}
+#else
+/*
+ * Build a new sk_buff carrying a copy of the IP packet held in m.
+ *
+ * The ip_len/ip_off fields are byte-swapped and the IP header checksum
+ * is recomputed in the mbuf before the copy. The destination address
+ * is looked up in the routing table and the resulting route (and its
+ * device) is attached to the sk_buff; room for an ethernet header is
+ * reserved in front of the data.
+ *
+ * Returns the new sk_buff, or NULL if no route is found or the
+ * allocation fails. The mbuf is not freed here.
+ */
+struct sk_buff *
+mbuf2skbuff(struct mbuf* m)
+{
+       struct sk_buff *skb;
+       size_t len = m->m_pkthdr.len;
+
+       /* used to lookup the routing table */
+       struct rtable *r = NULL;
+       struct flowi fl;
+       int ret = 0;    /* success for ip_route_output_key() */
+
+       struct ip *ip = mtod(m, struct ip *);
+
+       /* XXX ip_output has ip_len and ip_off in network format,
+        * linux expects host format */
+       ip->ip_len = ntohs(ip->ip_len);
+       ip->ip_off = ntohs(ip->ip_off);
+
+       ip->ip_sum = 0;
+       ip->ip_sum = in_cksum(m, ip->ip_hl<<2);
+
+       /* fill flowi struct, we need just the dst addr, see XXX */
+       bzero(&fl, sizeof(fl));
+       flow_daddr.daddr = ip->ip_dst.s_addr;
+
+       /*
+        * ip_route_output_key() should increment
+        * r->u.dst.__use and call a dst_hold(dst).
+        * The reference is handed over to the sk_buff by skb_dst_set()
+        * below, so it must be dropped by hand only if we bail out
+        * before that point.
+        */
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,38) /* check boundary */
+       r = ip_route_output_key(&init_net, &fl.u.ip4);
+       /* this flavour reports a failure as ERR_PTR(), never as NULL */
+       if (IS_ERR(r))
+               r = NULL;
+#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26) /* check boundary */
+       ret = ip_route_output_key(&init_net, &r, &fl);
+#else
+       ret = ip_route_output_key(&r, &fl);
+#endif
+       if (ret != 0 || r == NULL ) {
+               printf("NO ROUTE FOUND\n");
+               return NULL;
+       }
+
+       /* allocate the skbuff and the data */
+       skb = alloc_skb(len + sizeof(struct ethhdr), GFP_ATOMIC);
+       if (skb == NULL) {
+               printf("%s: can not allocate SKB buffers.\n", __FUNCTION__);
+               ip_rt_put(r);   /* do not leak the route reference */
+               return NULL;
+       }
+
+       skb->protocol = htons(ETH_P_IP); // XXX 8 or 16 bit ?
+       /* sk_dst_set XXX take the lock (?) */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+       skb_dst_set(skb, &r->u.dst);
+#else
+       skb_dst_set(skb, &r->dst);
+#endif
+       skb->dev = skb_dst(skb)->dev;
+
+       /* reserve space for ethernet header */
+       skb_reserve(skb, sizeof(struct ethhdr));
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
+       skb_reset_network_header(skb); // skb->network_header = skb->data - skb->head
+#else
+       skb->nh.raw = skb->data;
+#endif
+       /* set skbuff tail pointers and copy content */
+       skb_put(skb, len);
+       memcpy(skb->data, m->m_data, len);
+
+       return skb;
+}
+#endif /* linux 2.6+ */
+#endif /* linux */
+
+
+/*
+ * This function is called to reinject packets into the
+ * kernel stack within the linux netfilter system
+ * or to send a newly created mbuf.
+ * In the first case we have a valid sk_buff pointer
+ * encapsulated within the fake mbuf, so we can call
+ * the reinject function through netisr_dispatch.
+ * In the second case we need to build a sk_buff from scratch
+ * before sending out the packet.
+ */
+/*
+ * Send out an mbuf. An mbuf that wraps an sk_buff (m_skb set) is a
+ * reinjected packet and is handed to netisr_dispatch(). Any other mbuf
+ * was generated locally: on linux it is converted with mbuf2skbuff()
+ * and passed to dst_output(), elsewhere this is not implemented; in
+ * both cases the mbuf is then released with FREE_PKT().
+ * Only m is used, the remaining arguments are ignored. Always returns 0.
+ */
+int
+ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
+    struct ip_moptions *imo, struct inpcb *inp)
+{
+       (void)opt; (void)ro; (void)flags; (void)imo; (void)inp; /* UNUSED */
+
+       if (m->m_skb != NULL) {
+               /* reinjected packet, the dispatcher does everything */
+               ND("sending... ");
+               netisr_dispatch(0, m);
+               return 0;
+       }
+
+       /* self-generated packet, wrap as appropriate and send */
+#ifdef __linux__
+       {
+               struct sk_buff *out = mbuf2skbuff(m);
+
+               if (out != NULL)
+                       dst_output(out);
+       }
+#else /* Windows */
+       D("unimplemented.");
+#endif
+       FREE_PKT(m);
+       return 0;
+}
+
+/*
+ * setsockopt hook has no return value other than the error code.
+ */
+int
+do_ipfw_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+       struct sockopt sopt;    /* filled in by ipfw_ctl_h() */
+       int err;
+
+       (void)sk;               /* the socket is not needed */
+       err = ipfw_ctl_h(&sopt, cmd, SOPT_SET, len, user);
+       return err;
+}
+
+/*
+ * getsockopt can return a block of data in response.
+ */
+int
+do_ipfw_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+       struct sockopt sopt;    /* filled in by ipfw_ctl_h() */
+       int err;
+
+       (void)sk;               /* the socket is not needed */
+       err = ipfw_ctl_h(&sopt, cmd, SOPT_GET, *len, user);
+       /* report the size left in the descriptor back to the caller */
+       *len = sopt.sopt_valsize;
+       return err;
+}
+
+#ifdef __linux__
+
+/*
+ * declare our [get|set]sockopt hooks
+ */
+/*
+ * Both directions cover the same option range,
+ * _IPFW_SOCKOPT_BASE .. _IPFW_SOCKOPT_END.
+ */
+static struct nf_sockopt_ops ipfw_sockopts = {
+       .pf             = PF_INET,
+       .set_optmin     = _IPFW_SOCKOPT_BASE,
+       .set_optmax     = _IPFW_SOCKOPT_END,
+       .set            = do_ipfw_set_ctl,      /* setsockopt handler */
+       .get_optmin     = _IPFW_SOCKOPT_BASE,
+       .get_optmax     = _IPFW_SOCKOPT_END,
+       .get            = do_ipfw_get_ctl,      /* getsockopt handler */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
+       .owner          = THIS_MODULE,  /* field is only set on 2.6.24 and above */
+#endif
+};
+
+/*----
+ * We need a number of macros to adapt to the various APIs in
+ * different linux versions. Among them:
+ *
+ * - the hook names change between macros (NF_IP*) and enum NF_INET_*
+ *
+ * - the second argument to the netfilter hook is
+ *     struct sk_buff **       in kernels <= 2.6.22
+ *     struct sk_buff *        in kernels > 2.6.22
+ *
+ * - NF_STOP is not defined before 2.6 so we remap it to NF_ACCEPT
+ *
+ * - the packet descriptor passed to the queue handler is
+ *     struct nf_info          in kernels <= 2.6.24
+ *     struct nf_queue_entry   in kernels > 2.6.24
+ *
+ * - the arguments to the queue handler also change;
+ */
+
+/*
+ * declare hook to grab packets from the netfilter interface.
+ * The NF_* names change in different versions of linux, in some
+ * cases they are #defines, in others they are enum, so we
+ * need to adapt.
+ */
+#ifndef NF_IP_PRE_ROUTING
+#define NF_IP_PRE_ROUTING      NF_INET_PRE_ROUTING
+#endif
+#ifndef NF_IP_POST_ROUTING
+#define NF_IP_POST_ROUTING     NF_INET_POST_ROUTING
+#endif
+
+/*
+ * ipfw hooks into the POST_ROUTING and the PRE_ROUTING chains.
+ * PlanetLab sets skb_tag to the slice id in the LOCAL_INPUT and
+ * POST_ROUTING chains, so if we want to use that information we
+ * need to hook the LOCAL_INPUT chain instead of the PRE_ROUTING.
+ * However at the moment the skb_tag info is not reliable so
+ * we stay with the standard hooks.
+ */
+#if 0 // defined(IPFW_PLANETLAB)
+#define IPFW_HOOK_IN NF_IP_LOCAL_IN
+#else
+#define IPFW_HOOK_IN NF_IP_PRE_ROUTING
+#endif
+
+/*
+ * The main netfilter hook.
+ * To make life simple, we queue everything and then do all the
+ * decision in the queue handler.
+ *
+ * XXX note that in 2.4 and up to 2.6.22 the skbuf is passed as sk_buff**
+ * so we have an #ifdef to set the proper argument type.
+ */
+/*
+ * Netfilter hook body: every argument is ignored and NF_QUEUE is
+ * returned unconditionally. Only the argument types depend on the
+ * kernel version.
+ */
+static unsigned int
+call_ipfw(
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0)
+       unsigned int hooknum,
+#else
+       const struct nf_hook_ops *hooknum,      /* 3.13 and above pass the ops instead */
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) // in 2.6.22 we have **
+       struct sk_buff  **skb,
+#else
+       struct sk_buff  *skb,
+#endif
+       const struct net_device *in, const struct net_device *out,
+       int (*okfn)(struct sk_buff *))
+{
+       (void)hooknum; (void)skb; (void)in; (void)out; (void)okfn; /* UNUSED */
+       return NF_QUEUE;
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12)        /* XXX was 2.6.0 */
+#define        NF_STOP         NF_ACCEPT
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
+
+/*
+ * nf_queue_entry is a recent addition, in previous versions
+ * of the code the struct is called nf_info.
+ */
+#define nf_queue_entry nf_info /* for simplicity */
+
+/* also, 2.4 and perhaps something else have different arguments */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14)        /* XXX unsure */
+/* on 2.4 we use nf_info */
+#define QH_ARGS                struct sk_buff *skb, struct nf_info *info, void *data
+#else  /* 2.6.14. 2.6.24 */
+#define QH_ARGS                struct sk_buff *skb, struct nf_info *info, unsigned int qnum, void *data
+#endif
+
+#define DEFINE_SKB     /* nothing, already an argument */
+#define        REINJECT(_inf, _verd)   nf_reinject(skb, _inf, _verd)
+
+#else  /* 2.6.25 and above */
+
+#define QH_ARGS                struct nf_queue_entry *info, unsigned int queuenum
+#define DEFINE_SKB     struct sk_buff *skb = info->skb;
+#define        REINJECT(_inf, _verd)   nf_reinject(_inf, _verd)
+#endif
+
+/*
+ * used by dummynet when dropping packets
+ * XXX use dummynet_send()
+ */
+/*
+ * Hand the packet wrapped by m back to netfilter with an NF_DROP
+ * verdict. Nothing is done for an mbuf that has no queue entry (one
+ * that does not come from the queue handler). The mbuf itself is not
+ * freed here.
+ */
+void
+reinject_drop(struct mbuf* m)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)        /* unsure on the exact boundary */
+       /*
+        * On these kernels REINJECT() expands to a call that also uses
+        * a local variable named 'skb': it must be the sk_buff wrapped
+        * by the mbuf, not the mbuf itself.
+        */
+       struct sk_buff *skb = m->m_skb;
+#endif
+       if (m->queue_entry == NULL)
+               return;
+       REINJECT(m->queue_entry, NF_DROP);
+}
+
+/*
+ * The real call to the firewall. nf_queue_entry points to the skbuf,
+ * and eventually we need to return both through nf_reinject().
+ */
+/*
+ * Queue handler: wrap the queued sk_buff into a freshly allocated
+ * mbuf, run ipfw_check_hook() on it and act on the outcome.
+ * The argument list (QH_ARGS) depends on the kernel version.
+ * Always returns 0.
+ */
+static int
+ipfw2_queue_handler(QH_ARGS)
+{
+       DEFINE_SKB      /* no semicolon here, goes in the macro */
+       int ret = 0;    /* return value */
+       struct mbuf *m;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+       if (skb->nh.iph == NULL) {
+               printf("null dp, len %d reinject now\n", skb->len);
+               REINJECT(info, NF_ACCEPT);
+               return 0;
+       }
+#endif
+       m = malloc(sizeof(*m), 0, 0);
+       if (m == NULL) {
+               /* cannot run the firewall: let the packet through */
+               printf("malloc fail, len %d reinject now\n", skb->len);
+               REINJECT(info, NF_ACCEPT);
+               return 0;
+       }
+
+       /* the mbuf only describes the packet, the data stays in the skb */
+       m->m_skb = skb;
+       m->m_len = skb->len;            /* len from ip header to end */
+       m->m_pkthdr.len = skb->len;     /* total packet len */
+       m->m_pkthdr.rcvif = info->indev;
+       m->queue_entry = info;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)        /* XXX was 2.6.0 */
+       m->m_data = (char *)skb->nh.iph;
+#else
+       m->m_data = (char *)skb_network_header(skb);    // XXX unsigned ? */
+#endif
+
+       /* XXX add the interface */
+       /* the hook the packet was queued from tells the direction */
+       if (info->hook == IPFW_HOOK_IN) {
+               ret = ipfw_check_hook(NULL, &m, info->indev, PFIL_IN, NULL);
+       } else {
+               ret = ipfw_check_hook(NULL, &m, info->outdev, PFIL_OUT, NULL);
+       }
+
+       if (m != NULL) {        /* Accept. reinject and free the mbuf */
+               REINJECT(info, NF_ACCEPT);
+               m_freem(m);
+       } else if (ret == 0) {
+               /* dummynet has kept the packet, will reinject later. */
+       } else {
+               /*
+                * Packet dropped by ipfw or dummynet. Nothing to do as
+                * FREE_PKT already did a reinject as NF_DROP
+                */
+       }
+       return 0;
+}
+
+struct route;
+struct ip_moptions;
+struct inpcb;
+
+/* XXX should include prototypes for netisr_dispatch and ip_output */
+/*
+ * The reinjection routine after a packet comes out from dummynet.
+ * We must update the skb timestamp so ping reports the right time.
+ * This routine is also used (with num == -1) as FREE_PKT. XXX
+ */
+/*
+ * Free the mbuf and, when it carries a queue entry, give the packet
+ * back to netfilter: verdict NF_DROP when num is -1, NF_STOP for any
+ * other value of num.
+ */
+void
+netisr_dispatch(int num, struct mbuf *m)
+{
+       /* both fields are read now, the mbuf is freed just below */
+       struct nf_queue_entry *info = m->queue_entry;
+       struct sk_buff *skb = m->m_skb; /* always used */
+
+       /*
+        * This function can be called by the FREE_PKT()
+        * used when ipfw generate their own mbuf packets
+        * or by the mbuf2skbuff() function.
+        */
+       m_freem(m);
+
+       /* XXX check
+        * info is null in the case of a real mbuf
+        * (one created by the ipfw code without a
+        * valid sk_buff pointer
+        */
+       if (info == NULL)
+               return;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)       // XXX above 2.6.x ?
+       __net_timestamp(skb);   /* update timestamp */
+#endif
+
+       /* XXX to obey one-pass, possibly call the queue handler here */
+       REINJECT(info, ((num == -1)?NF_DROP:NF_STOP));  /* accept but no more firewall */
+}
+
+/*
+ * socket lookup function for linux.
+ * This code is used to associate uid, gid, jail/xid to packets,
+ * and store the info in a cache *ugp where they can be accessed quickly.
+ * The function returns 1 if the info is found, -1 otherwise.
+ *
+ * We do this only on selected protocols: TCP, ...
+ *
+ * The chain is the following
+ *   sk_buff*  sock*  socket*    file*
+ *     skb  ->  sk ->sk_socket->file ->f_owner    ->pid
+ *     skb  ->  sk ->sk_socket->file ->f_uid (direct)
+ *     skb  ->  sk ->sk_socket->file ->f_cred->fsuid (2.6.29+)
+ *
+ * Related headers:
+ * linux/skbuff.h      struct skbuff
+ * net/sock.h          struct sock
+ * linux/net.h         struct socket
+ * linux/fs.h          struct file
+ *
+ * With vserver we may also have sk->sk_xid and sk->sk_nid,
+ * which we store in fw_groups[1] (matches O_JAIL) and fw_groups[2]
+ * (no matches yet).
+ *
+ * Note - for locally generated, outgoing packets we should not
+ * need a lookup because the sk_buff already points to the socket where
+ * the info is.
+ */
+extern struct inet_hashinfo tcp_hashinfo;
+/*
+ * proto         IP protocol; anything but IPPROTO_TCP returns -1
+ * saddr, sport  source address and port of the packet
+ * daddr, dport  destination address and port of the packet
+ * skb           the packet; its own socket is used when present
+ * dir           non-zero for outgoing packets
+ * u             result: xid, nid, uid and gid are stored here
+ *
+ * Returns 1 once a socket has been found, -1 otherwise. On kernels up
+ * to 2.6.13 no lookup is done and -1 is returned right away.
+ */
+int
+linux_lookup(const int proto, const __be32 saddr, const __be16 sport,
+               const __be32 daddr, const __be16 dport,
+               struct sk_buff *skb, int dir, struct bsd_ucred *u)
+{
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,13)       /* XXX was 2.6.0 */
+       return -1;
+#else
+       struct sock *sk;
+       int ret = -1;   /* default return value */
+       int st = -1;    /* state */
+
+
+       if (proto != IPPROTO_TCP)       /* XXX extend for UDP */
+               return -1;
+
+       /* outgoing packets must have a route, incoming ones a device */
+       if ((dir ? (void *)skb_dst(skb) : (void *)skb->dev) == NULL) {
+               panic(" -- this should not happen\n");
+               return -1;
+       }
+
+       if (skb->sk) {
+               sk = skb->sk;
+       } else {
+               /*
+                * Try a lookup. On a match, sk has a refcount that we must
+                * release on exit (we know it because skb->sk = NULL).
+                *
+                * inet_lookup above 2.6.24 has an additional 'net' parameter
+                * so we use a macro to conditionally supply it.
+                * swap dst and src depending on the direction.
+                */
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,24)
+#define _OPT_NET_ARG
+#else
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
+/* there is no dev_net() on 2.6.25 */
+#define _OPT_NET_ARG (skb->dev->nd_net),
+#else  /* 2.6.26 and above */
+#define _OPT_NET_ARG dev_net(skb->dev),
+#endif
+#endif
+               sk =  (dir) ? /* dir != 0 on output */
+                   inet_lookup(_OPT_NET_ARG &tcp_hashinfo,
+                       daddr, dport, saddr, sport,     // match outgoing
+                       inet_iif(skb)) :
+                   inet_lookup(_OPT_NET_ARG &tcp_hashinfo,
+                       saddr, sport, daddr, dport,     // match incoming
+                       skb->dev->ifindex);
+#undef _OPT_NET_ARG
+
+               if (sk == NULL) /* no match, nothing to be done */
+                       return -1;
+       }
+       ret = 1;        /* retrying won't make things better */
+       st = sk->sk_state;
+#ifdef CONFIG_VSERVER
+       u->xid = sk->sk_xid;
+       u->nid = sk->sk_nid;
+#else
+       u->xid = u->nid = 0;
+#endif
+       /*
+        * Exclude tcp states where sk points to a inet_timewait_sock which
+        * has no sk_socket field (surely TCP_TIME_WAIT, perhaps more).
+        * To be safe, use a whitelist and not a blacklist.
+        * Before dereferencing sk_socket grab a lock on sk_callback_lock.
+        *
+        * Once again we need conditional code because the UID and GID
+        * location changes between kernels.
+        */
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,28)
+/* use the current's real uid/gid */
+#define _CURR_UID f_uid
+#define _CURR_GID f_gid
+#else /* 2.6.29 and above */
+/* use the current's file access real uid/gid */
+#define _CURR_UID f_cred->fsuid
+#define _CURR_GID f_cred->fsgid
+#endif
+
+#define GOOD_STATES (  \
+       (1<<TCP_LISTEN) | (1<<TCP_SYN_RECV)   | (1<<TCP_SYN_SENT)   | \
+       (1<<TCP_ESTABLISHED)  | (1<<TCP_FIN_WAIT1) | (1<<TCP_FIN_WAIT2) )
+       // surely exclude TCP_CLOSE, TCP_TIME_WAIT, TCP_LAST_ACK
+       // uncertain TCP_CLOSE_WAIT and TCP_CLOSING
+
+       if ((1<<st) & GOOD_STATES) {
+               read_lock_bh(&sk->sk_callback_lock);
+               /*
+                * NOTE(review): when the socket or its file is missing,
+                * u->uid and u->gid are left as the caller passed them.
+                */
+               if (sk->sk_socket && sk->sk_socket->file) {
+                       u->uid = sk->sk_socket->file->_CURR_UID;
+                       u->gid = sk->sk_socket->file->_CURR_GID;
+               }
+               read_unlock_bh(&sk->sk_callback_lock);
+       } else {
+               u->uid = u->gid = 0;
+       }
+       if (!skb->sk) /* return the reference that came from the lookup */
+               sock_put(sk);
+#undef GOOD_STATES
+#undef _CURR_UID
+#undef _CURR_GID
+       return ret;
+
+#endif /* LINUX > 2.4 */
+}
+
+/*
+ * Now prepare to hook the various functions.
+ * Linux 2.4 has a different API so we need some adaptation
+ * for register and unregister hooks
+ *
+ * the unregister function changed arguments between 2.6.22 and 2.6.24
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14)
+/* descriptor through which ipfw2_queue_handler() is registered */
+struct nf_queue_handler ipfw2_queue_handler_desc = {
+        .outfn = ipfw2_queue_handler,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,2)
+        .name = "ipfw2 dummynet queue",        /* field is only set before 3.8.2 */
+#endif
+};
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,2)
+#define REG_QH_ARG(pf, fn)     pf, &(fn ## _desc)
+#else
+#define REG_QH_ARG(pf, fn)     &(fn ## _desc)
+#endif
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) /* XXX was 2.6.0 */
+/*
+ * Replacement for nf_register_hooks() on kernels that lack it:
+ * register the n hooks in ops[] one at a time.
+ * Returns the result of the last nf_register_hook() call (0 when n is
+ * not positive). If one registration fails, the hooks registered so
+ * far are unregistered again, so on failure nothing is left installed.
+ */
+static int
+nf_register_hooks(struct nf_hook_ops *ops, int n)
+{
+       int i, ret = 0;
+
+       for (i = 0; i < n; i++) {
+               ret = nf_register_hook(ops + i);
+               if (ret < 0) {
+                       /* roll back ops[0..i-1], most recent first */
+                       while (--i >= 0)
+                               nf_unregister_hook(ops + i);
+                       break;
+               }
+       }
+       return ret;
+}
+
+/*
+ * Replacement for nf_unregister_hooks() on kernels that lack it:
+ * unregister the n hooks in ops[], first to last.
+ */
+static void
+nf_unregister_hooks(struct nf_hook_ops *ops, int n)
+{
+       struct nf_hook_ops *cur = ops;
+       int left;
+
+       for (left = n; left > 0; left--)
+               nf_unregister_hook(cur++);
+}
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) /* XXX was 2.6.0 */
+#define REG_QH_ARG(pf, fn)     pf, fn, NULL
+#endif
+#define UNREG_QH_ARG(pf, fn) //fn      /* argument for nf_[un]register_queue_handler */
+#define SET_MOD_OWNER
+
+#else /* linux > 2.6.17 */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+#define UNREG_QH_ARG(pf, fn) //fn
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(3,8,2)
+#define UNREG_QH_ARG(pf, fn)   pf, &(fn ## _desc)
+#else
+#define UNREG_QH_ARG(pf, fn)
+#endif /* 2.6.0 < LINUX > 2.6.24 */
+
+#define SET_MOD_OWNER  .owner = THIS_MODULE,
+
+#endif /* !LINUX < 2.6.0 */
+
+/*
+ * The two netfilter hooks: both call call_ipfw() at filter priority,
+ * one on the input hook (IPFW_HOOK_IN), one on POST_ROUTING.
+ */
+static struct nf_hook_ops ipfw_ops[] __read_mostly = {
+        {
+                .hook           = call_ipfw,
+                .pf             = PF_INET,
+                .hooknum        = IPFW_HOOK_IN,
+                .priority       = NF_IP_PRI_FILTER,
+                SET_MOD_OWNER
+        },
+        {
+                .hook           = call_ipfw,
+                .pf             = PF_INET,
+                .hooknum        = NF_IP_POST_ROUTING,
+                .priority       = NF_IP_PRI_FILTER,
+               SET_MOD_OWNER
+        },
+};
+#endif /* __linux__ */
+
+/* descriptors for the children, until i find a way for the
+ * linker to produce them
+ */
+extern moduledata_t *moddesc_ipfw;
+extern moduledata_t *moddesc_dummynet;
+extern moduledata_t *moddesc_dn_fifo;
+extern moduledata_t *moddesc_dn_wf2qp;
+extern moduledata_t *moddesc_dn_rr;
+extern moduledata_t *moddesc_dn_qfq;
+extern moduledata_t *moddesc_dn_prio;
+extern void *sysinit_ipfw_init;
+extern void *sysuninit_ipfw_destroy;
+extern void *sysinit_vnet_ipfw_init;
+extern void *sysuninit_vnet_ipfw_uninit;
+
+/*
+ * Module glue - init and exit function.
+ */
+/*
+ * Module load: register and start the children, then (linux only)
+ * register the sockopt handlers, the queue handler and the netfilter
+ * hooks, in this order.
+ *
+ * Returns 0 on success. On linux a failing step undoes exactly the
+ * steps completed before it (queue handler, sockopt, sysctl table,
+ * children) and its negative error code is returned.
+ */
+int __init
+ipfw_module_init(void)
+{
+       int ret = 0;
+#ifdef _WIN32
+       unsigned long resolution;
+#endif
+
+       rn_init(64);
+       my_mod_register("ipfw",  1, moddesc_ipfw, NULL, NULL);
+       my_mod_register("sy_ipfw",  2, NULL,
+               sysinit_ipfw_init, sysuninit_ipfw_destroy);
+       my_mod_register("sy_Vnet_ipfw",  3, NULL,
+               sysinit_vnet_ipfw_init, sysuninit_vnet_ipfw_uninit);
+       my_mod_register("dummynet",  4, moddesc_dummynet, NULL, NULL);
+       my_mod_register("dn_fifo",  5, moddesc_dn_fifo, NULL, NULL);
+       my_mod_register("dn_wf2qp",  6, moddesc_dn_wf2qp, NULL, NULL);
+       my_mod_register("dn_rr",  7, moddesc_dn_rr, NULL, NULL);
+       my_mod_register("dn_qfq",  8, moddesc_dn_qfq, NULL, NULL);
+       my_mod_register("dn_prio",  9, moddesc_dn_prio, NULL, NULL);
+       init_children();
+
+#ifdef _WIN32
+       resolution = ExSetTimerResolution(1, TRUE);
+       /* resolution is an unsigned long, print it as such */
+       printf("*** ExSetTimerResolution: resolution set to %lu n-sec ***\n",resolution);
+#endif
+#ifdef EMULATE_SYSCTL
+       keinit_GST();
+#endif
+
+#ifdef __linux__
+       /* sockopt register, in order to talk with user space */
+       ret = nf_register_sockopt(&ipfw_sockopts);
+       if (ret < 0) {
+               printf("error %d in nf_register_sockopt\n", ret);
+               goto clean_modules;
+       }
+
+       /* queue handler registration, in order to get network
+        * packet under a private queue */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,2)
+       ret =
+#endif
+           nf_register_queue_handler(REG_QH_ARG(PF_INET, ipfw2_queue_handler) );
+       if (ret < 0)    /* queue busy: our handler was not installed */
+               goto unregister_sockopt;
+
+       ret = nf_register_hooks(ipfw_ops, ARRAY_SIZE(ipfw_ops));
+       if (ret < 0)
+               goto unregister_queue_handler;
+
+       printf("%s loaded\n", __FUNCTION__);
+       return 0;
+
+
+/* handle errors on load: undo the completed steps in reverse order */
+unregister_queue_handler:
+       nf_unregister_queue_handler(UNREG_QH_ARG(PF_INET, ipfw2_queue_handler) );
+
+unregister_sockopt:
+       nf_unregister_sockopt(&ipfw_sockopts);
+
+clean_modules:
+#ifdef EMULATE_SYSCTL
+       keexit_GST();
+#endif
+       fini_children();
+       printf("%s error\n", __FUNCTION__);
+
+#endif /* __linux__ */
+       return ret;
+}
+
+/* module shutdown */
+/*
+ * Undo ipfw_module_init(): drop the emulated sysctl table, detach
+ * from the host system (timer resolution on Windows; hooks, queue
+ * handler and sockopt otherwise) and finally stop the children.
+ */
+void __exit
+ipfw_module_exit(void)
+{
+#ifdef EMULATE_SYSCTL
+       keexit_GST();
+#endif
+#ifdef _WIN32
+       ExSetTimerResolution(0,FALSE);
+
+#else  /* linux hook */
+       /* reverse order of registration: hooks, queue handler, sockopt */
+        nf_unregister_hooks(ipfw_ops, ARRAY_SIZE(ipfw_ops));
+       /* maybe drain the queue before unregistering ? */
+       nf_unregister_queue_handler(UNREG_QH_ARG(PF_INET, ipfw2_queue_handler) );
+       nf_unregister_sockopt(&ipfw_sockopts);
+#endif /* __linux__ */
+
+       fini_children();
+
+       printf("%s unloaded\n", __FUNCTION__);
+}
+
+#ifdef __linux__
+module_init(ipfw_module_init)
+module_exit(ipfw_module_exit)
+MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */
+#endif
diff --git a/kipfw/md_win.c b/kipfw/md_win.c
new file mode 100644 (file)
index 0000000..9e66889
--- /dev/null
@@ -0,0 +1,636 @@
+/*
+ * Copyright (C) 2010 Luigi Rizzo, Francesco Magno, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * kernel variables and functions that are not available in Windows.
+ */
+
+#include <net/pfil.h> /* provides PFIL_IN and PFIL_OUT */
+#include <arpa/inet.h>
+#include <netinet/in.h>                        /* in_addr */
+#include <ndis.h>
+#include <sys/mbuf.h>
+#include <passthru.h>
+
+/*
+ * Credentials check, stub version for this Windows file.
+ * No lookup is performed: every argument is ignored and the
+ * function always returns 0.
+ */
+int
+cred_check(void *_insn,  int proto, struct ifnet *oif,
+    struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
+    u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp,
+    struct sk_buff *skb)
+{
+       const int no_match = 0;
+
+       /* none of the arguments is used by the stub */
+       (void)_insn;
+       (void)proto;
+       (void)oif;
+       (void)dst_ip;
+       (void)dst_port;
+       (void)src_ip;
+       (void)src_port;
+       (void)u;
+       (void)ugid_lookupp;
+       (void)skb;
+
+       return no_match;
+}
+
+/*
+ * as good as anywhere, place here the missing calls
+ */
+
+/*
+ * Allocate 'size' bytes from the non-paged pool (tag 'wfpi') and
+ * return them zero-filled. Returns NULL when the allocation fails.
+ */
+void *
+my_alloc(int size)
+{
+       void *buf;
+
+       buf = ExAllocatePoolWithTag(NonPagedPool, size, 'wfpi');
+       if (buf == NULL)
+               return buf;
+
+       /* callers get cleared memory */
+       memset(buf, 0, size);
+       return buf;
+}
+
+/*
+ * Minimal panic(): print the format string as-is and then spin forever.
+ * The variable arguments are not expanded, only 'fmt' itself is printed.
+ * This function never returns.
+ */
+void
+panic(const char *fmt, ...)
+{
+       printf("%s", fmt);
+
+       /* halt here: there is nothing to return to */
+       while (1)
+               ;
+}
+
+/* 'securelevel' is one of the kernel variables missing on Windows; it is defined here with value 0. */
+int securelevel = 0;
+
+/*
+ * Find first set: return the 1-based position of the least
+ * significant bit set in 'bits', or 0 when no bit is set.
+ */
+int ffs(int bits)
+{
+       int pos = 1;
+
+       if (bits == 0)
+               return (0);
+
+       /* skip the low-order zero bits; at least one bit is set */
+       while ((bits & 1) == 0) {
+               bits >>= 1;
+               pos++;
+       }
+       return (pos);
+}
+
+/*
+ * Fill 'tv' with the current system time, split in seconds and
+ * microseconds.
+ *
+ * The value returned by KeQuerySystemTime() is in 100-nsec units.
+ * When it is identical to the one seen on the previous update, the
+ * time elapsed since then is estimated from the performance counter
+ * and added to it; when it differs, the saved time/counter pair is
+ * refreshed and the system time is returned as is.
+ * The saved state lives in function-level statics and is accessed
+ * without any locking.
+ */
+void
+do_gettimeofday(struct timeval *tv)
+{
+       static LARGE_INTEGER prevtime;  /* last system time seen, 100-nsec units */
+       static LARGE_INTEGER prevcount; /* performance counter at that time */
+       static LARGE_INTEGER freq;      /* performance counter frequency */
+
+       LARGE_INTEGER now;
+       LARGE_INTEGER ticks;
+
+       if (prevtime.QuadPart == 0) {
+               /* first call: just record the reference values */
+               KeQuerySystemTime(&prevtime);
+               prevcount = KeQueryPerformanceCounter(&freq);
+               now.QuadPart = prevtime.QuadPart;
+       } else {
+               KeQuerySystemTime(&now);
+               ticks = KeQueryPerformanceCounter(&freq);
+               if (now.QuadPart != prevtime.QuadPart) {
+                       /* system time moved: refresh the reference values */
+                       prevtime.QuadPart = now.QuadPart;
+                       prevcount.QuadPart = ticks.QuadPart;
+               } else {
+                       /*
+                        * system time did not move: interpolate using the
+                        * counter ticks elapsed, converted to 100-nsec units.
+                        * The reference values are NOT updated here.
+                        */
+                       LONGLONG elapsed = ticks.QuadPart - prevcount.QuadPart;
+
+                       elapsed *= 10000000;
+                       now.QuadPart += elapsed / freq.QuadPart;
+               }
+       }
+
+       /* 100-nsec units -> usec, then split */
+       now.QuadPart /= 10;
+       tv->tv_sec = now.QuadPart / (LONGLONG)1000000;
+       tv->tv_usec = now.QuadPart % (LONGLONG)1000000;
+}
+
+/*
+ * Return the current system time expressed in seconds, as an int.
+ *
+ * KeQuerySystemTime() reports the time in 100-nsec units, so one
+ * second corresponds to 10,000,000 units (the previous divisor of
+ * 1,000,000 produced tenths of a second).
+ * The quotient does not fit in an int and is truncated by the cast;
+ * callers should only rely on differences between two readings.
+ * NOTE(review): despite the name, the value is derived from the system
+ * time and not from the time elapsed since boot.
+ */
+int time_uptime_w32()
+{
+       LARGE_INTEGER tm;
+
+       KeQuerySystemTime(&tm);
+       return (int)(tm.QuadPart / (LONGLONG)10000000);
+}
+
+
+/*
+ * Windows version of the firewall hook. We receive a partial copy of
+ * the packet which points to the original buffers. On output,
+ * the refcount has already been incremented.
+ * The function reconstructs
+ * the whole packet in a contiguous memory area, builds a fake mbuf,
+ * calls the firewall, does whatever cleanup is needed and returns
+ * to MiniportSend or ProtocolReceive, which will silently return
+ * (dropping the packet) or continue execution (allowing the packet).
+ * The memory area contains:
+ * - the fake mbuf, filled with data needed by ipfw, and information
+ *   for reinjection
+ * - the packet data
+ */
+/* debugging helper, defined elsewhere */
+void hexdump(PUCHAR,int, const char *);
+/*
+ * Placeholder objects whose addresses are passed to ipfw where an
+ * interface pointer is expected: _if_in is stored as rcvif of incoming
+ * packets, _if_out is passed as the output interface of outgoing ones.
+ */
+static char _if_in[] = "incoming";
+static char _if_out[] = "outgoing";
+
+/*
+ * Firewall hook for packets described by an NDIS_PACKET.
+ *
+ * pNdisPacket: the packet to be examined
+ * direction:   INCOMING or OUTGOING
+ * Context:     the adapter (PADAPT) the packet belongs to
+ *
+ * The packet is copied into a single buffer allocated together with a
+ * fake mbuf, and handed to ipfw_check_hook(). Returns:
+ *   PASS      the packet is accepted (or is not IPv4, or is too short
+ *             to carry a MAC header) and the caller continues with it;
+ *   DUMMYNET  the packet has been kept by dummynet for later reinjection;
+ *   DROP      the packet has been dropped, or an error occurred.
+ */
+int
+ipfw2_qhandler_w32(PNDIS_PACKET pNdisPacket, int direction,
+       NDIS_HANDLE Context)
+{
+       unsigned int            BufferCount = 0;
+       unsigned                TotalPacketLength = 0;
+       PNDIS_BUFFER            pCurrentBuffer = NULL;
+       PNDIS_BUFFER            pNextBuffer = NULL;
+       struct mbuf*            m = NULL;
+       unsigned char*          payload = NULL;
+       unsigned int            ofs, l;
+       unsigned short          EtherType = 0;
+       unsigned int            i = 0;
+       int                     ret = 0;
+       PNDIS_BUFFER            pNdisBuffer, old_head, old_tail;
+       NDIS_HANDLE             PacketPool;
+       PADAPT                  pAdapt;
+       NDIS_STATUS             Status;
+
+       /* In NDIS, packets are a chain of NDIS_BUFFER. We query
+        * the packet to get a pointer of chain's head, the length
+        * of the chain, and the length of the packet itself.
+        * Then allocate a buffer for the mbuf and the payload.
+        */
+       NdisQueryPacket(pNdisPacket, NULL, &BufferCount,
+               &pCurrentBuffer, &TotalPacketLength);
+
+       /* A frame without a complete 14-byte MAC header cannot be IP:
+        * let it through like any other non-IP frame, instead of
+        * computing a negative length and reading past the buffer.
+        */
+       if (TotalPacketLength < 14)
+               return PASS;
+
+       m = malloc(sizeof(struct mbuf) + TotalPacketLength, 0, 0 );
+       if (m == NULL) //resource shortage, drop the packet
+               goto drop_pkt;
+
+       /* set mbuf fields to point past the MAC header.
+        * Also set additional W32 info
+        */
+       payload = (unsigned char*)(m + 1);
+       m->m_len = m->m_pkthdr.len = TotalPacketLength-14;
+       m->m_pkthdr.rcvif = (void *)((direction==INCOMING) ? _if_in : NULL);
+       m->m_data = payload + 14; /* past the MAC header */
+       m->direction = direction;
+       m->context = Context;
+       m->pkt = pNdisPacket;
+
+       /* m_skb != NULL is used in the ip_output routine to check
+        * for packets that come from the stack and differentiate
+        * from those internally generated by ipfw.
+        * The pointer is not used, just needs to be non-null.
+        */
+       m->m_skb = (void *)pNdisPacket;
+       /*
+        * Now copy the data from the Windows buffers to the mbuf.
+        */
+       for (i=0, ofs = 0; i < BufferCount; i++) {
+               unsigned char* src = NULL;
+
+               l = 0;
+               NdisQueryBufferSafe(pCurrentBuffer, &src, &l,
+                       NormalPagePriority);
+               /* the mapping can fail when resources are low */
+               if (l != 0 && src == NULL)
+                       goto drop_pkt;
+               /* never write past the area allocated above */
+               if (l > TotalPacketLength - ofs)
+                       goto drop_pkt;
+               bcopy(src, payload + ofs, l);
+               ofs += l;
+               NdisGetNextBuffer(pCurrentBuffer, &pNextBuffer);
+               pCurrentBuffer = pNextBuffer;
+       }
+       /*
+        * Identify EtherType. If the packet is not IP, simply allow
+        * and don't bother the firewall. XXX should be done before.
+        */
+       EtherType = *(unsigned short*)(payload + 12);
+       EtherType = RtlUshortByteSwap(EtherType);
+       if (EtherType != 0x0800) {
+               //DbgPrint("ethertype = %X, skipping ipfw\n",EtherType);
+               free(m, 0);
+               return PASS;
+       }
+
+       /*
+        * Now build a buffer descriptor to replace the original chain.
+        */
+       pAdapt = Context;
+       PacketPool = direction == OUTGOING ?
+               pAdapt->SendPacketPoolHandle : pAdapt->RecvPacketPoolHandle;
+       NdisAllocateBuffer(&Status, &pNdisBuffer,
+               PacketPool, payload, m->m_pkthdr.len+14);
+       if (Status != NDIS_STATUS_SUCCESS)
+               goto drop_pkt;
+       /*
+        * Save the old buffer pointers, and put the new one
+        * into the chain.
+        */
+       pNdisBuffer->Next = NULL;
+       old_head = NDIS_PACKET_FIRST_NDIS_BUFFER(pNdisPacket);
+       old_tail = NDIS_PACKET_LAST_NDIS_BUFFER(pNdisPacket);
+       NdisReinitializePacket(pNdisPacket);
+       NdisChainBufferAtFront(pNdisPacket, pNdisBuffer);
+#if 0
+       if (direction == INCOMING) {
+               DBGPRINT(("incoming: proto %u (%s), src %08X, dst %08X, sport %u, dport %u, len %u\n", *(payload+14+9), texify_proto(*(payload+14+9)), *(unsigned int*)(payload+14+12), *(unsigned int*)(payload+14+16), ntohs((*((unsigned short int*)(payload+14+20)))), ntohs((*((unsigned short int*)(payload+14+22)))), TotalPacketLength));
+       } else {
+               DBGPRINT(("outgoing: proto %u (%s), src %08X, dst %08X, sport %u, dport %u, len %u\n", *(payload+14+9), texify_proto(*(payload+14+9)), *(unsigned int*)(payload+14+12), *(unsigned int*)(payload+14+16), ntohs((*((unsigned short int*)(payload+14+20)))), ntohs((*((unsigned short int*)(payload+14+22)))), TotalPacketLength));
+       }
+#endif
+       if (direction == INCOMING)
+               ret = ipfw_check_hook(NULL, &m, NULL, PFIL_IN, NULL);
+       else
+               ret = ipfw_check_hook(NULL, &m, (struct ifnet*)_if_out, PFIL_OUT, NULL);
+
+       if (m != NULL) {
+               /* Accept. Restore the old buffer chain, free
+                * the mbuf and return PASS.
+                */
+               //DBGPRINT(("accepted\n"));
+               NdisReinitializePacket(pNdisPacket);
+               NDIS_PACKET_FIRST_NDIS_BUFFER(pNdisPacket) = old_head;
+               NDIS_PACKET_LAST_NDIS_BUFFER(pNdisPacket) = old_tail;
+               NdisFreeBuffer(pNdisBuffer);
+               m_freem(m);
+               return PASS;
+       } else if (ret == 0) {
+               /* dummynet has kept the packet, will reinject later. */
+               //DBGPRINT(("kept by dummynet\n"));
+               return DUMMYNET;
+       } else {
+               /*
+                * Packet dropped by ipfw or dummynet. Nothing to do as
+                * FREE_PKT already freed the fake mbuf
+                */
+               //DBGPRINT(("dropped by dummynet, ret = %i\n", ret));
+               return DROP;
+       }
+drop_pkt:
+       /* for some reason we cannot proceed. Free any resources
+        * including those received from above, and return
+        * faking success. XXX this must be fixed later.
+        * This label is only reached before the firewall is called,
+        * so the fake mbuf, if allocated, is still ours to release.
+        */
+       if (m != NULL)
+               free(m, 0);
+       NdisFreePacket(pNdisPacket);
+       return DROP;
+}
+
+/*
+ * Windows reinjection function.
+ * The packet is already available as m->pkt, so we only
+ * need to send it to the right place.
+ * Normally a ndis intermediate driver allocates
+ * a fresh descriptor, while the actual data's ownership is
+ * retained by the protocol, or the miniport below.
+ * Since an intermediate driver behaves as a miniport driver
+ * at the upper edge (towards the protocol), and as a protocol
+ * driver at the lower edge (towards the NIC), when we handle a
+ * packet we have a reserved area in both directions (we can use
+ * only one for each direction at our own discretion).
+ * Normally this area is used to save a pointer to the original
+ * packet, so when the driver is done with it, the original descriptor
+ * can be retrieved, and the resources freed (packet descriptor,
+ * buffer descriptor(s) and the actual data). In our driver this
+ * area is used to mark the reinjected packets as 'orphan', because
+ * the original descriptor is gone long ago. This way we can handle
+ * correctly the resource freeing when the callback function
+ * is called by NDIS.
+ */
+
+/*
+ * Reinject the packet carried by the fake mbuf 'm'.
+ * A negative 'num' means the packet must just be released.
+ * Outgoing packets are handed to NdisSend() and their resources are
+ * released by PtSendComplete(); incoming packets are indicated to the
+ * upper layer and then released here, at the drop_pkt label.
+ */
+void 
+netisr_dispatch(int num, struct mbuf *m)
+{
+       unsigned char*          payload = (unsigned char*)(m+1);
+       PADAPT                          pAdapt = m->context;
+       NDIS_STATUS                     Status;
+       PNDIS_PACKET            pPacket = m->pkt;
+       PNDIS_BUFFER            pNdisBuffer;
+       NDIS_HANDLE                     PacketPool;     /* not used in this function */
+
+       if (num < 0)
+               goto drop_pkt;
+
+       //debug print
+#if 0
+       DbgPrint("reinject %s\n", m->direction == OUTGOING ?
+               "outgoing" : "incoming");
+#endif
+       /* the adapter state is checked while holding its lock */
+       NdisAcquireSpinLock(&pAdapt->Lock);
+       if (m->direction == OUTGOING) {
+               //we must first check if the adapter is going down,
+               // in this case abort the reinjection
+               if (pAdapt->PTDeviceState > NdisDeviceStateD0) {
+                       pAdapt->OutstandingSends--;
+                       // XXX should we notify up ?
+                       NdisReleaseSpinLock(&pAdapt->Lock);
+                       goto drop_pkt;
+               }
+       } else {
+               /* if the upper miniport edge is not initialized or
+                * the miniport edge is in low power state, abort
+                * XXX we should notify the error.
+                */
+               if (!pAdapt->MiniportHandle ||
+                   pAdapt->MPDeviceState > NdisDeviceStateD0) {
+                       NdisReleaseSpinLock(&pAdapt->Lock);
+                       goto drop_pkt;
+               }
+       }
+       NdisReleaseSpinLock(&pAdapt->Lock);
+
+       if (m->direction == OUTGOING) {
+               PSEND_RSVD      SendRsvd;
+               /* use the 8-byte protocol reserved area: the first
+                * field is used to mark the packet as 'orphan', the
+                * second stores the pointer to the mbuf, so in the
+                * SendComplete handler we know that this is a
+                * reinjected packet and can free it correctly.
+                */
+               SendRsvd = (PSEND_RSVD)(pPacket->ProtocolReserved);
+               SendRsvd->OriginalPkt = NULL;
+               SendRsvd->pMbuf = m;
+               //do the actual send
+               NdisSend(&Status, pAdapt->BindingHandle, pPacket);
+               if (Status != NDIS_STATUS_PENDING) {
+                       /* done, call the callback now */
+                       PtSendComplete(m->context, m->pkt, Status);
+               }
+               return; /* unconditional return here. */
+       } else {
+               /* There's no need to check the 8-byte miniport
+                * reserved area since the path going up will be always
+                * synchronous, and all the cleanup will be done inline.
+                * If the reinjected packet comes from a PtReceivePacket,
+                * there will be no callback.
+                * Otherwise PtReceiveComplete will be called but will just
+                * return since all the cleaning is already done */
+               // do the actual receive.
+               ULONG Proc = KeGetCurrentProcessorNumber();
+               pAdapt->ReceivedIndicationFlags[Proc] = TRUE;
+               NdisMEthIndicateReceive(pAdapt->MiniportHandle, NULL, payload, 14, payload+14, m->m_len, m->m_len);
+               NdisMEthIndicateReceiveComplete(pAdapt->MiniportHandle);
+               pAdapt->ReceivedIndicationFlags[Proc] = FALSE;
+       }
+drop_pkt:
+       /* NDIS_PACKET exists and must be freed only if
+        * the packet comes from a PtReceivePacket, otherwise
+        * m->pkt will be null.
+        */
+       if (m->pkt != NULL)
+       {
+               NdisUnchainBufferAtFront(m->pkt, &pNdisBuffer);
+               NdisFreeBuffer(pNdisBuffer);
+               NdisFreePacket(m->pkt);
+       }
+       m_freem(m);
+}
+
+void win_freem(void *);        /* wrapper for m_freem() for protocol.c */
+
+/*
+ * Release the mbuf passed as an untyped pointer: it simply
+ * forwards to m_freem().
+ */
+void
+win_freem(void *_m)
+{
+       m_freem((struct mbuf *)_m);
+}
+
+/*
+ * strlcpy() replacement for platforms that lack it.
+ * Same contract as the libc version
+ * (/usr/src/lib/libc/string/strlcpy.c):
+ * copy at most siz-1 bytes of 'src' into 'dst', always NUL-terminate
+ * the result when siz is not 0, and return the length of 'src'.
+ * A return value >= siz means that the copy was truncated.
+ */
+size_t
+strlcpy(char *dst, const char *src, size_t siz)
+{
+        size_t srclen = 0;
+        size_t ncopy;
+        size_t k;
+
+        /* the return value is always the full length of src */
+        while (src[srclen] != '\0')
+                srclen++;
+
+        if (siz != 0) {
+                /* leave room for the terminator */
+                ncopy = (srclen < siz - 1) ? srclen : siz - 1;
+                for (k = 0; k < ncopy; k++)
+                        dst[k] = src[k];
+                dst[ncopy] = '\0';
+        }
+
+        return(srclen);         /* count does not include NUL */
+}
+
+/*
+ * Release everything attached to a reinjected packet: the buffer
+ * descriptor at the front of 'Packet', the mbuf 'm' and the packet
+ * descriptor itself; finally decrement the count of pending sends
+ * on 'pAdapt'.
+ */
+void CleanupReinjected(PNDIS_PACKET Packet, struct mbuf* m, PADAPT pAdapt)
+{
+       PNDIS_BUFFER pNdisBuffer;
+
+       /* the value stored here is overwritten by the unchain call below */
+       NdisQueryPacket(Packet, NULL, NULL, &pNdisBuffer, NULL);
+       NdisUnchainBufferAtFront(Packet, &pNdisBuffer);
+       NdisFreeBuffer(pNdisBuffer);
+       win_freem(m);
+       NdisFreePacket(Packet);
+       ADAPT_DECR_PENDING_SENDS(pAdapt);
+}
+
+/*
+ * Firewall hook for packets notified with the old style NDIS calls,
+ * i.e. as a MAC header buffer plus a lookahead buffer, without an
+ * NDIS_PACKET.
+ *
+ * The two buffers are copied one after the other behind a fake mbuf,
+ * which is then passed to ipfw_check_hook() as an incoming packet.
+ * Returns:
+ *   PASS      the packet is accepted (or is not IPv4, or is too short
+ *             to carry a MAC header);
+ *   DUMMYNET  the packet has been kept by dummynet for later reinjection;
+ *   DROP      the packet has been dropped, or memory is not available.
+ * PacketSize is not used.
+ */
+int
+ipfw2_qhandler_w32_oldstyle(int direction,
+       NDIS_HANDLE         ProtocolBindingContext,
+    unsigned char*      HeaderBuffer,
+    unsigned int        HeaderBufferSize,
+    unsigned char*      LookAheadBuffer,
+    unsigned int        LookAheadBufferSize,
+    unsigned int        PacketSize)
+{
+       struct mbuf* m;
+       unsigned char*          payload = NULL;
+       unsigned short          EtherType = 0;
+       int                     ret = 0;
+
+       /* Without a complete 14-byte MAC header this cannot be an IP
+        * packet: let it through like any other non-IP frame, instead
+        * of computing a negative length and reading past the buffer.
+        */
+       if (HeaderBufferSize + LookAheadBufferSize < 14)
+               return PASS;
+
+       /* We are in a special case when NIC signals an incoming
+        * packet using old style calls. This is done passing
+        * a pointer to the MAC header and a pointer to the
+        * rest of the packet.
+        * We simply allocate space for the mbuf and the
+        * subsequent payload section.
+        */
+       m = malloc(sizeof(struct mbuf) + HeaderBufferSize + LookAheadBufferSize, 0, 0 );
+       if (m == NULL) //resource shortage, drop the packet
+               return DROP;
+
+       /* set mbuf fields to point past the MAC header.
+        * Also set additional W32 info.
+        * m->pkt here is set to null because the notification
+        * from the NIC has come with a header+lookahead buffer,
+        * no NDIS_PACKET has been provided.
+        * NOTE(review): unlike ipfw2_qhandler_w32(), rcvif and m_skb
+        * are not set here - confirm that this is intended.
+        */
+       payload = (unsigned char*)(m + 1);
+       m->m_len = m->m_pkthdr.len = HeaderBufferSize+LookAheadBufferSize-14;
+       m->m_data = payload + 14; /* past the MAC header */
+       m->direction = direction;
+       m->context = ProtocolBindingContext;
+       m->pkt = NULL;
+
+       /*
+        * Now copy the data from the Windows buffers to the mbuf.
+        */
+       bcopy(HeaderBuffer, payload, HeaderBufferSize);
+       bcopy(LookAheadBuffer, payload+HeaderBufferSize, LookAheadBufferSize);
+       //hexdump(payload,HeaderBufferSize+LookAheadBufferSize,"qhandler");
+       /*
+        * Identify EtherType. If the packet is not IP, simply allow
+        * and don't bother the firewall. XXX should be done before.
+        */
+       EtherType = *(unsigned short*)(payload + 12);
+       EtherType = RtlUshortByteSwap(EtherType);
+       if (EtherType != 0x0800) {
+               //DbgPrint("ethertype = %X, skipping ipfw\n",EtherType);
+               free(m, 0);
+               return PASS;
+       }
+
+       //DbgPrint("incoming_raw: proto %u (%s), src %08X, dst %08X, sport %u, dport %u, len %u\n", *(payload+14+9), texify_proto(*(payload+14+9)), *(unsigned int*)(payload+14+12), *(unsigned int*)(payload+14+16), ntohs((*((unsigned short int*)(payload+14+20)))), ntohs((*((unsigned short int*)(payload+14+22)))), HeaderBufferSize+LookAheadBufferSize);
+
+       /* Query the firewall */
+       ret = ipfw_check_hook(NULL, &m, NULL, PFIL_IN, NULL);
+
+       if (m != NULL) {
+               /* Accept. Free the mbuf and return PASS. */
+               //DbgPrint("accepted\n");
+               m_freem(m);
+               return PASS;
+       } else if (ret == 0) {
+               /* dummynet has kept the packet, will reinject later. */
+               //DbgPrint("kept by dummynet\n");
+               return DUMMYNET;
+       } else {
+               /*
+                * Packet dropped by ipfw or dummynet. Nothing to do as
+                * FREE_PKT already freed the fake mbuf
+                */
+               //DbgPrint("dropped by dummynet, ret = %i\n", ret);
+               return DROP;
+       }
+}
+
+/* forward declaration because those functions are used only here,
+ * no point to make them visible in passthru/protocol/miniport */
+int do_ipfw_set_ctl(struct sock *sk, int cmd,
+       void __user *user, unsigned int len);
+int do_ipfw_get_ctl(struct sock *sk, int cmd,
+       void __user *user, int *len);
+
+NTSTATUS
+DevIoControl(
+    IN PDEVICE_OBJECT    pDeviceObject,
+    IN PIRP              pIrp
+    )
+/*++
+
+Routine Description:
+
+    This is the dispatch routine for handling device ioctl requests.
+    It emulates the get/setsockopt interface of ipfw: the I/O buffer
+    holds a struct sockopt followed by the option data.
+
+    The request is always completed before returning, also when it is
+    rejected. Requests whose buffer is too small for a struct sockopt
+    or whose sopt_valsize exceeds the space available after it are
+    refused without calling into ipfw.
+
+Arguments:
+
+    pDeviceObject - Pointer to the device object (not used).
+
+    pIrp - Pointer to the request packet.
+
+Return Value:
+
+    Status is returned: STATUS_SUCCESS for the two supported control
+    codes, STATUS_INVALID_PARAMETER for an inconsistent sopt_valsize,
+    STATUS_NOT_SUPPORTED otherwise.
+
+--*/
+{
+    PIO_STACK_LOCATION  pIrpSp;
+    NTSTATUS            NtStatus = STATUS_SUCCESS;
+    unsigned long       BytesReturned = 0;
+    unsigned long       FunctionCode;
+    unsigned long       buflen;     /* size of the input buffer */
+    unsigned long       outlen;     /* size of the output buffer */
+    unsigned long       maxdata;    /* room available after the header */
+    unsigned long       len;
+    int                 optlen;
+    struct sockopt      *sopt;
+    int                 ret = 0;
+
+    UNREFERENCED_PARAMETER(pDeviceObject);
+
+    pIrpSp = IoGetCurrentIrpStackLocation(pIrp);
+
+    /*
+     * Using METHOD_BUFFERED as communication method, the userland
+     * side calls DeviceIoControl passing an input buffer and an output
+     * and their respective length (ipfw uses the same length for both).
+     * The system creates a single I/O buffer, with len=max(inlen,outlen).
+     * In the kernel we can read information from this buffer (which is
+     * directly accessible), overwrite it with our results, and set
+     * IoStatus.Information with the number of bytes that the system must
+     * copy back to userland.
+     * In our sockopt emulation, the initial part of the buffer contains
+     * a struct sockopt, followed by the data area.
+     */
+
+    buflen = pIrpSp->Parameters.DeviceIoControl.InputBufferLength;
+    outlen = pIrpSp->Parameters.DeviceIoControl.OutputBufferLength;
+    FunctionCode = pIrpSp->Parameters.DeviceIoControl.IoControlCode;
+    sopt = pIrp->AssociatedIrp.SystemBuffer;
+
+    if (sopt == NULL || buflen < sizeof(struct sockopt))
+    {
+       NtStatus = STATUS_NOT_SUPPORTED; // XXX find better value
+       goto done;
+    }
+
+    /*
+     * sopt_valsize comes from userland: do not let the handlers
+     * read or write beyond the end of the system buffer.
+     */
+    maxdata = buflen - sizeof(struct sockopt);
+    if (sopt->sopt_valsize > maxdata)
+    {
+       NtStatus = STATUS_INVALID_PARAMETER;
+       goto done;
+    }
+    len = (unsigned long)sopt->sopt_valsize;
+
+    /*
+     * NOTE(review): the value returned by the handlers is stored in
+     * 'ret' but, as before, it is not reported to the caller.
+     */
+    switch (FunctionCode)
+    {
+               case IP_FW_SETSOCKOPT:
+                       ret = do_ipfw_set_ctl(NULL, sopt->sopt_name, sopt+1, len);
+                       break;
+
+               case IP_FW_GETSOCKOPT:
+                       /* the handler wants an int *, not an unsigned long * */
+                       optlen = (int)len;
+                       ret = do_ipfw_get_ctl(NULL, sopt->sopt_name, sopt+1, &optlen);
+                       len = (unsigned long)optlen;
+                       sopt->sopt_valsize = len;
+                       //sanity check on len
+                       if (len <= maxdata)
+                               BytesReturned = len + sizeof(struct sockopt);
+                       else
+                               BytesReturned = buflen;
+                       /* never report more than the output buffer can hold */
+                       if (BytesReturned > outlen)
+                               BytesReturned = outlen;
+                       break;
+
+               default:
+                               NtStatus = STATUS_NOT_SUPPORTED;
+                               break;
+    }
+
+done:
+    /* every path must complete the request, failures included */
+    pIrp->IoStatus.Information = BytesReturned;
+    pIrp->IoStatus.Status = NtStatus;
+    IoCompleteRequest(pIrp, IO_NO_INCREMENT);
+
+    return NtStatus;
+} 
+
+void dummynet(void * unused);
+void ipfw_tick(void * vnetx);
+
+/*
+ * DPC routine that runs dummynet(). None of the DPC arguments
+ * is used: dummynet() is always called with a NULL argument.
+ */
+VOID dummynet_dpc(
+    __in struct _KDPC  *Dpc,
+    __in_opt PVOID  DeferredContext,
+    __in_opt PVOID  SystemArgument1,
+    __in_opt PVOID  SystemArgument2
+    )
+{
+       UNREFERENCED_PARAMETER(Dpc);
+       UNREFERENCED_PARAMETER(DeferredContext);
+       UNREFERENCED_PARAMETER(SystemArgument1);
+       UNREFERENCED_PARAMETER(SystemArgument2);
+
+       dummynet(NULL);
+}
+
+/*
+ * DPC routine that runs ipfw_tick(), passing along the context
+ * the DPC was set up with. The other arguments are not used.
+ */
+VOID ipfw_dpc(
+    __in struct _KDPC  *Dpc,
+    __in_opt PVOID  DeferredContext,
+    __in_opt PVOID  SystemArgument1,
+    __in_opt PVOID  SystemArgument2
+    )
+{
+       PVOID vnetx = DeferredContext;
+
+       UNREFERENCED_PARAMETER(Dpc);
+       UNREFERENCED_PARAMETER(SystemArgument1);
+       UNREFERENCED_PARAMETER(SystemArgument2);
+
+       ipfw_tick(vnetx);
+}
diff --git a/kipfw/missing.h b/kipfw/missing.h
new file mode 100644 (file)
index 0000000..237c1dc
--- /dev/null
@@ -0,0 +1,645 @@
+/*
+ * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id: missing.h 12256 2013-04-26 21:12:44Z luigi $
+ *
+ * Header for kernel variables and functions that are not available in
+ * userland.
+ */
+
+#ifndef _MISSING_H_
+#define _MISSING_H_
+
+#include <sys/cdefs.h>
+#ifdef __linux__       /* 'linux' is not predefined in strict ISO modes; the rest of this header tests __linux__ */
+#include <linux/sysctl.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#endif /* __linux__ */
+
+/* portability features, to be set before the rest: */
+#define HAVE_NET_IPLEN         /* iplen/ipoff in net format */
+#define WITHOUT_BPF            /* do not use bpf logging */
+
+#ifdef _WIN32
+
+#ifndef DEFINE_SPINLOCK
+#define DEFINE_SPINLOCK(x)     FAST_MUTEX x
+#endif
+/* spinlock --> Guarded Mutex KGUARDED_MUTEX */
+/* http://www.reactos.org/wiki/index.php/Guarded_Mutex */
+#define spin_lock_init(_l)
+#define spin_lock_bh(_l)
+#define spin_unlock_bh(_l)
+
+#include <sys/socket.h>                /* bsd-compat.c */
+#include <netinet/in.h>                /* bsd-compat.c */
+#include <netinet/ip.h>                /* local version */
+#define INADDR_TO_IFP(a, b) ((b) = NULL)       /* _WIN32 branch: no lookup, b is always set to NULL */
+
+#else  /* __linux__ */
+
+#define MALLOC_DECLARE(x)      /* nothing */
+#include <linux/time.h>                /* do_gettimeofday */
+#include <netinet/ip.h>                /* local version */
+struct inpcb;
+
+/*
+ * Kernel locking support.
+ * FreeBSD uses mtx in dummynet.c and struct rwlock ip_fw2.c
+ *
+ * In linux we use spinlock_bh to implement both.
+ * For 'struct rwlock' we need an #ifdef to change it to spinlock_t
+ */
+
+#ifndef DEFINE_SPINLOCK        /* this is for linux 2.4 */
+#define DEFINE_SPINLOCK(x)   spinlock_t x = SPIN_LOCK_UNLOCKED
+#endif
+
+
+#define rw_assert(a, b)
+#define rw_destroy(_l)
+#define rw_init(_l, msg)       spin_lock_init(_l)
+#define rw_rlock(_l)           spin_lock_bh(_l)
+#define rw_runlock(_l)         spin_unlock_bh(_l)
+#define rw_wlock(_l)           spin_lock_bh(_l)
+#define rw_wunlock(_l)         spin_unlock_bh(_l)
+#define rw_init_flags(_l, s, v)
+
+#define mtx_assert(a, b)
+#define        mtx_destroy(m)
+#define mtx_init(m, a,b,c)     spin_lock_init(m)
+#define mtx_lock(_l)           spin_lock_bh(_l)
+#define mtx_unlock(_l)         spin_unlock_bh(_l)
+
+#endif /* __linux__ */
+/* end of locking support */
+
+/*
+ * Reference to an ipfw rule that can be carried outside critical sections.
+ * A rule is identified by rulenum:rule_id which is ordered.
+ * In version chain_id the rule can be found in slot 'slot', so
+ * we don't need a lookup if chain_id == chain->id.
+ *
+ * On exit from the firewall this structure refers to the rule after
+ * the matching one (slot points to the new rule; rulenum:rule_id-1
+ * is the matching rule), and additional info (e.g. info often contains
+ * the insn argument or tablearg in the low 16 bits, in host format).
+ * On entry, the structure is valid if slot>0, and refers to the starting
+ * rules. 'info' contains the reason for reinject, e.g. divert port,
+ * divert direction, and so on.
+ */
+struct ipfw_rule_ref {
+       uint32_t        slot;           /* slot for matching rule       */
+       uint32_t        rulenum;        /* matching rule number         */
+       uint32_t        rule_id;        /* matching rule id             */
+       uint32_t        chain_id;       /* ruleset id                   */
+       uint32_t        info;           /* see below                    */
+};
+
+enum {
+       IPFW_INFO_MASK  = 0x0000ffff,
+       IPFW_INFO_OUT   = 0x00000000,   /* outgoing, just for convenience */
+       IPFW_INFO_IN    = 0x80000000,   /* incoming, overloads dir */
+       IPFW_ONEPASS    = 0x40000000,   /* One-pass, do not reinject */
+       IPFW_IS_MASK    = 0x30000000,   /* which source ? */
+       IPFW_IS_DIVERT  = 0x20000000,
+       IPFW_IS_DUMMYNET =0x10000000,
+       IPFW_IS_PIPE    = 0x08000000,   /* pipe=1, queue = 0 */
+};
+
+/* in netinet/in.h */
+#define        in_nullhost(x)  ((x).s_addr == INADDR_ANY)
+
+/* bzero not present on linux, but this should go in glue.h */
+#define bzero(s, n) memset(s, 0, n)
+#define bcmp(p1, p2, n) memcmp(p1, p2, n)
+
+/* ethernet stuff */
+#define        ETHERTYPE_IP            0x0800  /* IP protocol */
+//#define      ETHER_ADDR_LEN          6       /* length of an Ethernet address */
+struct ether_header {
+        u_char  ether_dhost[ETHER_ADDR_LEN];
+        u_char  ether_shost[ETHER_ADDR_LEN];
+        u_short ether_type;
+};
+
+#define ETHER_TYPE_LEN          2       /* length of the Ethernet type field */
+#define ETHER_HDR_LEN           (ETHER_ADDR_LEN*2+ETHER_TYPE_LEN)
+
+/*
+ * Historically, BSD keeps ip_len and ip_off in host format
+ * when doing layer 3 processing, and this often requires
+ * to translate the format back and forth.
+ * To make the process explicit, we define a couple of macros
+ * that also take into account the fact that at some point
+ * we may want to keep those fields always in net format.
+ */
+
+#if (BYTE_ORDER == BIG_ENDIAN) || defined(HAVE_NET_IPLEN)
+#define SET_NET_IPLEN(p)        do {} while (0)
+#define SET_HOST_IPLEN(p)       do {} while (0)
+#else /* never on linux */
+#define SET_NET_IPLEN(p)        do {            \
+        struct ip *h_ip = (p);                  \
+        h_ip->ip_len = htons(h_ip->ip_len);     \
+        h_ip->ip_off = htons(h_ip->ip_off);     \
+        } while (0)
+
+#define SET_HOST_IPLEN(p)       do {            \
+        struct ip *h_ip = (p);                  \
+        h_ip->ip_len = ntohs(h_ip->ip_len);     \
+        h_ip->ip_off = ntohs(h_ip->ip_off);     \
+        } while (0)
+#endif /* !HAVE_NET_IPLEN */
+
+/* ip_dummynet.c */
+#define __FreeBSD_version 500035
+
+#ifdef __linux__
+struct moduledata;
+int my_mod_register(const char *name,
+       int order, struct moduledata *mod, void *init, void *uninit);
+
+/* define some macro for ip_dummynet */
+
+struct malloc_type {
+};
+
+#define MALLOC_DEFINE(type, shortdesc, longdesc)       \
+       struct malloc_type type[1]; void *md_dummy_ ## type = type
+
+#define CTASSERT(x)
+
+/* log... does not use the first argument */
+#define        LOG_ERR         0x100
+#define        LOG_INFO        0x200
+#define log(_level, fmt, arg...)  do {                 \
+       int _qwerty=_level;(void)_qwerty; printk(KERN_ERR fmt, ##arg); } while (0)
+
+/*
+ * gettimeofday would be in sys/time.h but it is not
+ * visible if _KERNEL is defined
+ */
+int gettimeofday(struct timeval *, struct timezone *);
+
+#else  /* _WIN32 */
+#define MALLOC_DEFINE(a,b,c)
+#endif /* _WIN32 */
+
+extern int     hz;
+extern long    tick;           /* exists in 2.4 but not in 2.6 */
+extern int     bootverbose;
+extern struct timeval boottime;
+
+/* time_uptime is a FreeBSD variable incremented once per second */
+#ifdef __linux__
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,37) /* revise boundaries */
+#define time_uptime get_seconds()
+#else /* OpenWRT */
+#define time_uptime CURRENT_TIME
+#endif
+#else /* WIN32 */
+#define time_uptime time_uptime_w32()
+#endif
+
+extern int     max_linkhdr;
+extern int     ip_defttl;
+extern u_long  in_ifaddrhmask;                         /* mask for hash table */
+extern struct in_ifaddrhashhead *in_ifaddrhashtbl;    /* inet addr hash table  */
+
+/*-------------------------------------------------*/
+
+/* define, includes and functions missing in linux */
+/* include and define */
+#include <arpa/inet.h>         /* inet_ntoa */
+
+struct mbuf;
+
+/* used by ip_dummynet.c */
+void reinject_drop(struct mbuf* m);
+
+#include <linux/errno.h>       /* error define */
+#include <linux/if.h>          /* IFNAMESIZ */
+
+void rn_init(int);
+/*
+ * Some network structures can be defined the BSD way
+ * by defining _FAVOR_BSD. This is not true
+ * for the icmp structure.
+ * XXX struct icmp contains bsd names in 
+ * /usr/include/netinet/ip_icmp.h
+ */
+#ifdef __linux__
+#define icmp_code code
+#define icmp_type type
+
+/* linux in6_addr has no member __u6_addr
+ * replace the whole structure ?
+ */
+#define __u6_addr       in6_u
+#define __u6_addr32     u6_addr32
+#endif /* __linux__ */
+
+/* defined in linux/sctp.h with no bsd definition */
+struct sctphdr {
+        uint16_t src_port;      /* source port */
+        uint16_t dest_port;     /* destination port */
+        uint32_t v_tag;         /* verification tag of packet */
+        uint32_t checksum;      /* Adler32 C-Sum */
+        /* chunks follow... */
+};
+
+/* missing definition */
+#define TH_FIN  0x01
+#define TH_SYN  0x02
+#define TH_RST  0x04
+#define TH_ACK  0x10
+
+#define RTF_CLONING    0x100           /* generate new routes on use */
+
+#define IPPROTO_OSPFIGP         89              /* OSPFIGP */
+#define IPPROTO_CARP            112             /* CARP */
+#ifndef _WIN32
+#define IPPROTO_IPV4            IPPROTO_IPIP    /* for compatibility */
+#endif
+
+#define        CARP_VERSION            2
+#define        CARP_ADVERTISEMENT      0x01
+
+#define PRIV_NETINET_IPFW       491     /* Administer IPFW firewall. */
+
+#define IP_FORWARDING           0x1             /* most of ip header exists */
+
+#define NETISR_IP       2               /* same as AF_INET */
+
+#define PRIV_NETINET_DUMMYNET   494     /* Administer DUMMYNET. */
+
+extern int securelevel;
+
+struct carp_header {
+#if BYTE_ORDER == LITTLE_ENDIAN
+        u_int8_t        carp_type:4,
+                        carp_version:4;
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+        u_int8_t        carp_version:4,
+                        carp_type:4;
+#endif
+};
+
+struct pim {
+       int dummy;      /* windows compiler does not like empty definition */
+};
+
+#ifndef _WIN32
+struct route {
+       struct  rtentry *ro_rt;
+       struct  sockaddr ro_dst;
+};
+#endif
+
+struct ifaltq {
+       void *ifq_head;
+};
+
+/*
+ * ifnet->if_snd is used in ip_dummynet.c to take the transmission
+ * clock.
+ */
+#if defined( __linux__)
+#define        if_xname        name
+#define        if_snd          XXX
+/* search local the ip addresses, used for the "me" keyword */
+#include <linux/inetdevice.h>
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)        /* older ip_dev_find() takes only the address */
+#define INADDR_TO_IFP(ip, b)   \
+       ((b) = ip_dev_find((ip).s_addr))
+#else  /* newer ip_dev_find() also takes a struct net * */
+#define INADDR_TO_IFP(ip, b)   \
+       ((b) = ip_dev_find((struct net *)&init_net, (ip).s_addr))
+#endif
+
+#elif defined( _WIN32 )
+/* used in ip_dummynet.c */
+struct ifnet {
+       char    if_xname[IFNAMSIZ];     /* external name (name + unit) */
+//        struct ifaltq if_snd;          /* output queue (includes altq) */
+};
+
+struct net_device {
+       char    if_xname[IFNAMSIZ];     /* external name (name + unit) */
+};
+#endif
+
+/* involves mbufs */
+int in_cksum(struct mbuf *m, int len);
+#define divert_cookie(mtag) 0
+#define divert_info(mtag) 0
+#define pf_find_mtag(a) NULL
+#define pf_get_mtag(a) NULL
+#ifndef _WIN32
+#define AF_LINK AF_ASH /* ? our sys/socket.h */
+#endif
+
+/* no real pullup: yield m if m_len already covers x bytes, otherwise FREE_PKT(m) and yield NULL */
+#define m_pullup(m, x)                                 \
+       ((m)->m_len >= (x) ? (m) : (FREE_PKT(m), NULL)) /* note: m is evaluated more than once */
+
+struct pf_mtag {
+       void            *hdr;           /* saved hdr pos in mbuf, for ECN */
+       sa_family_t      af;            /* for ECN */
+        u_int32_t        qid;           /* queue id */
+};
+
+#if 0 // ndef radix
+/* radix stuff in radix.h and radix.c */
+struct radix_node {
+       caddr_t rn_key;         /* object of search */
+       caddr_t rn_mask;        /* netmask, if present */
+};
+#endif /* !radix */
+
+/* missing kernel functions */
+char *inet_ntoa(struct in_addr ina);
+int random(void);
+
+/*
+ * Return the result of a/b
+ *
+ * this is used in linux kernel space,
+ * since the 64bit division needs to
+ * be done using a macro
+ */
+int64_t
+div64(int64_t a, int64_t b);
+
+char *
+inet_ntoa_r(struct in_addr ina, char *buf);
+
+/* from bsd sys/queue.h */
+#define TAILQ_FOREACH_SAFE(var, head, field, tvar)                      \
+        for ((var) = TAILQ_FIRST((head));                               \
+            (var) && ((tvar) = TAILQ_NEXT((var), field), 1);            \
+            (var) = (tvar))
+
+#define SLIST_FOREACH_SAFE(var, head, field, tvar)                      \
+        for ((var) = SLIST_FIRST((head));                               \
+            (var) && ((tvar) = SLIST_NEXT((var), field), 1);            \
+            (var) = (tvar))
+
+/* depending on the linux version */
+#ifndef ETHERTYPE_IPV6
+#define ETHERTYPE_IPV6          0x86dd          /* IP protocol version 6 */
+#endif
+
+/*-------------------------------------------------*/
+#define RT_NUMFIBS 1
+extern u_int rt_numfibs;
+
+/* involves kernel locking function: replace any existing RTFREE with a diagnostic stub */
+#ifdef RTFREE
+#undef RTFREE
+#define RTFREE(a) fprintf(stderr, "RTFREE: commented out locks\n")     /* no ';' here: the caller supplies it */
+#endif
+
+void getmicrouptime(struct timeval *tv);
+
+/* from sys/netinet/ip_output.c */
+struct ip_moptions;
+struct route;
+struct ip;
+
+struct mbuf *ip_reass(struct mbuf *);
+u_short in_cksum_hdr(struct ip *);
+int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
+    struct ip_moptions *imo, struct inpcb *inp);
+
+/* from net/netisr.c */
+void netisr_dispatch(int num, struct mbuf *m);
+
+/* definition moved in missing.c */
+int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len);
+
+int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen);
+
+/* defined in session.c */
+int priv_check(struct thread *td, int priv);
+
+/* struct ucred is in linux/socket.h and has pid, uid, gid.
+ * We need a 'bsd_ucred' to store also the extra info
+ */
+
+struct bsd_ucred {
+       uid_t           uid;
+       gid_t           gid;
+       uint32_t        xid;
+       uint32_t        nid;
+};
+
+int
+cred_check(void *insn, int proto, struct ifnet *oif,
+    struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
+    u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp,
+    struct sk_buff *skb);
+
+int securelevel_ge(struct ucred *cr, int level);
+
+struct sysctl_oid;
+struct sysctl_req;
+
+#ifdef _WIN32
+#define module_param_named(_name, _var, _ty, _perm)
+#else /* !_WIN32 */
+
+/* Linux 2.4 is mostly for openwrt */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+#include <linux/bitops.h>       /* generic_ffs() used in ip_fw2.c */
+typedef uint32_t __be32;
+typedef uint16_t __be16;
+struct sock;
+struct net;
+struct inet_hashinfo;
+struct sock *inet_lookup(
+       struct inet_hashinfo *hashinfo,
+        const __be32 saddr, const __be16 sport,
+        const __be32 daddr, const __be16 dport,
+        const int dif);
+struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif);
+#endif /* Linux < 2.6 */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) &&     \
+       LINUX_VERSION_CODE > KERNEL_VERSION(2,6,16)     /* XXX NOT sure, in 2.6.9 give an error */
+#define module_param_named(_name, _var, _ty, _perm)    \
+       //module_param(_name, _ty, 0644)
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+typedef unsigned long uintptr_t;
+
+#ifdef __i386__
+static inline unsigned long __fls(unsigned long word)  /* bit index of the highest set bit in word */
+{
+        asm("bsr %1,%0"        /* x86 BSR: the destination is undefined when word == 0 */
+            : "=r" (word)
+            : "rm" (word));
+        return word;
+}
+#endif
+
+#endif /* LINUX < 2.6.24 */
+
+#endif /* !_WIN32 so maybe __linux__ */
+
+#if defined (__linux__) && !defined (EMULATE_SYSCTL)
+#define SYSCTL_DECL(_1)
+#define SYSCTL_OID(_1, _2, _3, _4, _5, _6, _7, _8)
+#define SYSCTL_NODE(_1, _2, _3, _4, _5, _6)
+#define _SYSCTL_BASE(_name, _var, _ty, _perm)          \
+       module_param_named(_name, *(_var), _ty,         \
+               ( (_perm) == CTLFLAG_RD) ? 0444: 0644 )
+#define SYSCTL_PROC(_base, _oid, _name, _mode, _var, _val, _desc, _a, _b)
+
+#define SYSCTL_INT(_base, _oid, _name, _mode, _var, _val, _desc)       \
+       _SYSCTL_BASE(_name, _var, int, _mode)
+
+#define SYSCTL_LONG(_base, _oid, _name, _mode, _var, _val, _desc)      \
+       _SYSCTL_BASE(_name, _var, long, _mode)
+
+#define SYSCTL_ULONG(_base, _oid, _name, _mode, _var, _val, _desc)     \
+       _SYSCTL_BASE(_name, _var, ulong, _mode)
+
+#define SYSCTL_UINT(_base, _oid, _name, _mode, _var, _val, _desc)      \
+        _SYSCTL_BASE(_name, _var, uint, _mode)
+
+#define TUNABLE_INT(_name, _ptr)
+
+#define SYSCTL_VNET_PROC               SYSCTL_PROC
+#define SYSCTL_VNET_INT                        SYSCTL_INT
+#define SYSCTL_VNET_UINT               SYSCTL_UINT
+
+#endif
+
+#define SYSCTL_HANDLER_ARGS            \
+       struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req
+int sysctl_handle_int(SYSCTL_HANDLER_ARGS);
+int sysctl_handle_long(SYSCTL_HANDLER_ARGS); 
+
+
+void ether_demux(struct ifnet *ifp, struct mbuf *m);
+
+int ether_output_frame(struct ifnet *ifp, struct mbuf *m);
+
+void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum);
+
+void icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu);
+
+void rtfree(struct rtentry *rt);
+
+u_short in_cksum_skip(struct mbuf *m, int len, int skip);
+
+#ifdef INP_LOCK_ASSERT
+#undef INP_LOCK_ASSERT
+#define INP_LOCK_ASSERT(a)
+#endif
+
+int jailed(struct ucred *cred);
+
+/*
+* Return 1 if an internet address is for a ``local'' host
+* (one to which we have a connection).  If subnetsarelocal
+* is true, this includes other subnets of the local net.
+* Otherwise, it includes only the directly-connected (sub)nets.
+*/
+int in_localaddr(struct in_addr in);
+
+/* the prototype is already in the headers */
+//int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); 
+
+int fnmatch(const char *pattern, const char *string, int flags);
+
+int
+linux_lookup(const int proto, const __be32 saddr, const __be16 sport,
+       const __be32 daddr, const __be16 dport,
+       struct sk_buff *skb, int dir, struct bsd_ucred *u);
+
+/* vnet wrappers, in vnet.h and ip_var.h */
+//int ipfw_init(void);
+//void ipfw_destroy(void);
+
+#define        MTAG_IPFW       1148380143      /* IPFW-tagged cookie */
+#define        MTAG_IPFW_RULE  1262273568      /* rule reference */
+
+struct ip_fw_args;
+extern int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa);
+
+#define curvnet                 NULL
+#define        CURVNET_SET(_v)
+#define        CURVNET_RESTORE()
+#define VNET_ASSERT(condition)
+
+#define VNET_NAME(n)            n
+#define VNET_DECLARE(t, n)      extern t n
+#define VNET_DEFINE(t, n)       t n
+#define _VNET_PTR(b, n)         &VNET_NAME(n)
+/*
+ * Virtualized global variable accessor macros.
+ */
+#define VNET_VNET_PTR(vnet, n)          (&(n))
+#define VNET_VNET(vnet, n)              (n)
+
+#define VNET_PTR(n)             (&(n))
+#define VNET(n)                 (n)
+
+VNET_DECLARE(int, ip_defttl);
+#define V_ip_defttl    VNET(ip_defttl) /* no trailing ';' so the macro also works inside expressions */
+
+int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp,
+       int dir, struct inpcb *inp);
+
+/* hooks for divert */
+extern void (*ip_divert_ptr)(struct mbuf *m, int incoming);
+
+extern int (*ip_dn_ctl_ptr)(struct sockopt *);
+typedef int ip_fw_ctl_t(struct sockopt *);
+extern ip_fw_ctl_t *ip_fw_ctl_ptr;
+
+/* netgraph prototypes */
+typedef int ng_ipfw_input_t(struct mbuf **, int, struct ip_fw_args *, int);
+extern  ng_ipfw_input_t *ng_ipfw_input_p;
+
+/* For kernel ipfw_ether and ipfw_bridge. */
+struct ip_fw_args;
+typedef int ip_fw_chk_t(struct ip_fw_args *args);
+extern  ip_fw_chk_t     *ip_fw_chk_ptr;
+
+#define V_ip_fw_chk_ptr         VNET(ip_fw_chk_ptr)
+#define V_ip_fw_ctl_ptr         VNET(ip_fw_ctl_ptr)
+#define        V_tcbinfo               VNET(tcbinfo)
+#define        V_udbinfo               VNET(udbinfo)
+
+#endif /* !_MISSING_H_ */
diff --git a/kipfw/mysetenv.sh b/kipfw/mysetenv.sh
new file mode 100644 (file)
index 0000000..baea772
--- /dev/null
@@ -0,0 +1,129 @@
+#!/bin/bash
+
+# bash script to set a suitable environment to call MSVC's build
+# to build a 64-bit version of the kernel.
+#
+# inspired by C:/winddk/7600.16385.1/bin/setenv.bat
+# see http://www.osronline.com/ddkx/ddtools/build_ref_0kqb.htm
+
+#############################################################
+#  edit these variables to meet your configuration          #
+#  - DRIVE is the hard drive letter where DDK is installed  #
+#  - DDK is the path to the DDK's root directory            #
+#  - CYGDDK is the complete cygwin path to DDK              #
+#############################################################
+if [ $# -ne 3 ]; then  # exactly three arguments are required: DRIVE DDK TARGETOS
+echo "invalid params: expected DRIVE DDK TARGETOS" >&2; exit 1
+fi
+DRIVE=$1
+DDK=$2
+CYGDDK=/cygdrive/c/${DDK}
+TARGETOS=$3
+MYDIR=`pwd`    # XXX luigi
+
+case "$TARGETOS" in    # short target name -> DDK target OS and NT version; other names set nothing
+wnet)
+export DDK_TARGET_OS=WinNET
+export _NT_TARGET_VERSION=0x502
+;;
+wlh)
+export DDK_TARGET_OS=WinLH
+export _NT_TARGET_VERSION=0x600
+;;
+win7)
+export DDK_TARGET_OS=Win7
+export _NT_TARGET_VERSION=0x601
+;;
+esac
+
+
+#############################################################
+#  don't edit anything else below this point                #
+#############################################################
+
+D=${DRIVE}${DDK}
+DB=${D}/bin
+DI=${D}/inc
+DL=${D}/lib
+
+
+export AMD64=1
+export ATL_INC_PATH=$DI                                # defaults to DDKROOT/inc
+export ATL_INC_ROOT=$DI                                # XXX redundant ?
+export ATL_LIB_PATH=${DL}/atl/*
+export BASEDIR=$D                              # default
+export BUFFER_OVERFLOW_CHECKS=1
+export BUILD_ALLOW_COMPILER_WARNINGS=1
+export BUILD_ALT_DIR=chk_${TARGETOS}_AMD64
+export BUILD_DEFAULT="-ei -nmake -i -nosqm"    # can go on the command line
+export BUILD_DEFAULT_TARGETS="-amd64"          # can also go on the command line
+export BUILD_MAKE_PROGRAM=nmake.exe            # default to nmake
+export BUILD_MULTIPROCESSOR=1                  # parallel make, same as -M
+export BUILD_OPTIONS=" ~imca ~toastpkg"
+export COFFBASE_TXT_FILE=${DB}/coffbase.txt
+export CPU=AMD64
+export CRT_INC_PATH=${DI}/crt                  # default
+export CRT_LIB_PATH=${DL}/crt/*                        # not default, it seems uses lib/{wnet,win7}/*
+export DDKBUILDENV=chk                         # checked or free
+export DDK_INC_PATH=${DI}/ddk
+export DDK_LIB_DEST=${DL}/${TARGETOS}
+export DDK_LIB_PATH=${DL}/${TARGETOS}/*
+export DEPRECATE_DDK_FUNCTIONS=1
+export DRIVER_INC_PATH=${DI}/ddk
+export HALKIT_INC_PATH=${DI}/ddk
+export HALKIT_LIB_PATH=${DL}/${TARGETOS}/*
+export IFSKIT_INC_PATH=${DI}/ddk
+export IFSKIT_LIB_DEST=${DL}/${TARGETOS}
+export IFSKIT_LIB_PATH=${DL}/${TARGETOS}/*
+export Include=${DI}/api
+export KMDF_INC_PATH=${DI}/wdf/kmdf
+export KMDF_LIB_PATH=${DL}/wdf/kmdf/*
+export LANGUAGE_NEUTRAL=0
+export Lib=${DL}
+export LINK_LIB_IGNORE=4198
+export MFC_INC_PATH=${DI}/mfc42
+export MFC_LIB_PATH=${DL}/mfc/*
+export MSC_OPTIMIZATION="/Od /Oi" 
+export NEW_CRTS=1
+export NO_BINPLACE=TRUE
+export NO_BROWSER_FILE=TRUE
+export NTDBGFILES=1
+export NTDEBUG=ntsd
+export NTDEBUGTYPE=both
+# need NTMAKEENV to point to the binary dir
+export NTMAKEENV=${DB}
+export OAK_INC_PATH=${DI}/api
+
+export PATH="${CYGDDK}/bin/amd64:${CYGDDK}/tools/sdv/bin:${CYGDDK}/tools/pfd/bin/bin/x86_AMD64\
+:${CYGDDK}/bin/SelfSign:${CYGDDK}/bin/x86/amd64:${CYGDDK}/bin/x86\
+:${CYGDDK}/tools/pfd/bin/bin/AMD64:${CYGDDK}/tools/tracing/amd64:$PATH"
+
+export PATHEXT=".COM;.EXE;.BAT;.CMD;.VBS;.VBE;.JS;.JSE;.WSF;.WSH;.MSC"
+export PROJECT_ROOT=${D}/src
+export PUBLIC_ROOT=${D}
+export RAZZLETOOLPATH=${DB}
+export RCNOFONTMAP=1
+export SDK_INC_PATH=${DI}/api
+export SDK_LIB_DEST=${DL}/${TARGETOS}
+export SDK_LIB_PATH=${DL}/${TARGETOS}/*
+export SDV=${D}/tools/sdv
+export separate_object_root=FALSE
+export TEMP=tmpbuild
+export TMP=tmpbuild
+export UMDF_INC_PATH=${DI}/wdf/umdf
+export USE_OBJECT_ROOT=1
+export WDM_INC_PATH=${DI}/ddk
+export WPP_CONFIG_PATH=${DB}/wppconfig
+export _AMD64bit=true
+export _BUILDARCH=AMD64
+export _BuildType=chk
+export _NTDRIVE=${DRIVE}
+export _NTROOT=${DDK}
+#
+# --- XXX note, it spams  C:/winddk/7600.16385.1/build.dat
+# -c: delete objs, -e: generate build.* logfiles, -f rescan sources, -g color errors
+unset MAKEFLAGS
+echo "emv ${MAKE} flags ${MAKEFLAGS}"
+cd kipfw-mod && build -cefg 
+echo "done"
+#cp objchk_${TARGETOS}_amd64/amd64/ipfw.sys ../binary/ipfw.sys
diff --git a/kipfw/netipfw.inf b/kipfw/netipfw.inf
new file mode 100644 (file)
index 0000000..5dee2c8
--- /dev/null
@@ -0,0 +1,81 @@
+; version section\r
+[Version]\r
+Signature  = "$Windows NT$"\r
+Class      = NetService\r
+ClassGUID  = {4D36E974-E325-11CE-BFC1-08002BE10318}\r
+Provider   = %Unipi%\r
+DriverVer  = 08/12/2012,3.0.1.1\r
+\r
+; manufacturer section\r
+[Manufacturer]\r
+%Unipi% = UNIPI,NTx86,NTamd64\r
+\r
+; control flags section\r
+; optional, unused in netipfw.inf, used in netipfw_m.inf\r
+[ControlFlags]\r
+\r
+; models section\r
+[UNIPI] ; Win2k\r
+%Desc% = Ipfw.ndi, unipi_ipfw\r
+[UNIPI.NTx86] ;For WinXP and later\r
+%Desc% = Ipfw.ndi, unipi_ipfw\r
+[UNIPI.NTamd64] ;For x64\r
+%Desc% = Ipfw.ndi, unipi_ipfw\r
+\r
+; ddinstall section\r
+[Ipfw.ndi]\r
+AddReg          = Ipfw.ndi.AddReg, Ipfw.AddReg\r
+Characteristics = 0x4410 ;  NCF_NDIS_PROTOCOL (0x4000) | NCF_FILTER (0x400) | NCF_NO_SERVICE (0x10) !--Filter Specific--!!\r
+CopyFiles       = Ipfw.Files.Sys\r
+CopyInf         = netipfw_m.inf\r
+\r
+; remove section\r
+[Ipfw.ndi.Remove]\r
+DelFiles = Ipfw.Files.Sys\r
+\r
+;ddinstall.services section\r
+[Ipfw.ndi.Services]\r
+AddService = Ipfw,,Ipfw.AddService\r
+\r
+[Ipfw.AddService]\r
+DisplayName    = %ServiceDesc%\r
+ServiceType    = 1 ;SERVICE_KERNEL_DRIVER\r
+StartType      = 3 ;SERVICE_DEMAND_START\r
+ErrorControl   = 1 ;SERVICE_ERROR_NORMAL\r
+ServiceBinary  = %12%\ipfw.sys\r
+AddReg         = Ipfw.AddService.AddReg\r
+\r
+[Ipfw.AddService.AddReg]\r
+\r
+;file copy related sections\r
+[SourceDisksNames]\r
+1=%DiskDescription%,"",,\r
+\r
+[SourceDisksFiles]\r
+ipfw.sys=1\r
+\r
+[DestinationDirs]\r
+DefaultDestDir = 12\r
+Ipfw.Files.Sys   = 12   ; %windir%\System32\drivers\r
+\r
+; ddinstall->copyfiles points here\r
+[Ipfw.Files.Sys]\r
+ipfw.sys,,,2\r
+\r
+; ddinstall->addreg points here\r
+[Ipfw.ndi.AddReg]\r
+HKR, Ndi,            HelpText,            , %HELP% ; this is displayed at the bottom of the General page of the Connection Properties dialog box\r
+HKR, Ndi,            FilterClass,         , failover\r
+HKR, Ndi,            FilterDeviceInfId,   , unipi_ipfwmp\r
+HKR, Ndi,            Service,             , Ipfw\r
+HKR, Ndi\Interfaces, UpperRange,          , noupper\r
+HKR, Ndi\Interfaces, LowerRange,          , nolower\r
+HKR, Ndi\Interfaces, FilterMediaTypes,    , "ethernet, tokenring, fddi, wan"\r
+\r
+;strings section\r
+[Strings]\r
+Unipi = "Unipi"\r
+DiskDescription = "Ipfw Driver Disk"\r
+Desc = "ipfw+dummynet"\r
+HELP = "This is ipfw and dummynet network emulator, developed by unipi.it"\r
+ServiceDesc = "ipfw service"\r
diff --git a/kipfw/netipfw_m.inf b/kipfw/netipfw_m.inf
new file mode 100644 (file)
index 0000000..a299b12
--- /dev/null
@@ -0,0 +1,56 @@
+; version section\r
+[Version]\r
+Signature  = "$Windows NT$"\r
+Class      = Net\r
+ClassGUID  = {4D36E972-E325-11CE-BFC1-08002BE10318}\r
+Provider   = %Unipi%\r
+DriverVer  = 08/12/2012,3.0.1.1\r
+\r
+; control flags section\r
+; optional, unused in netipfw.inf, used in netipfw_m.inf\r
+[ControlFlags]\r
+ExcludeFromSelect = unipi_ipfwmp\r
+\r
+; destinationdirs section, optional\r
+[DestinationDirs]\r
+DefaultDestDir=12\r
+; No files to copy \r
+\r
+; manufacturer section\r
+[Manufacturer]\r
+%Unipi% = UNIPI,NTx86,NTamd64\r
+\r
+; models section\r
+[UNIPI] ; Win2k\r
+%Desc% = IpfwMP.ndi, unipi_ipfwmp\r
+[UNIPI.NTx86] ;For WinXP and later\r
+%Desc% = IpfwMP.ndi, unipi_ipfwmp\r
+[UNIPI.NTamd64] ;For x64\r
+%Desc% = IpfwMP.ndi, unipi_ipfwmp\r
+\r
+; ddinstall section\r
+[IpfwMP.ndi]\r
+AddReg  = IpfwMP.ndi.AddReg\r
+Characteristics = 0x29 ;NCF_NOT_USER_REMOVABLE | NCF_VIRTUAL | NCF_HIDDEN\r
+\r
+; ddinstall->addreg points here\r
+[IpfwMP.ndi.AddReg]\r
+HKR, Ndi, Service,  0,  IpfwMP\r
+\r
+;ddinstall.services section\r
+[IpfwMP.ndi.Services]\r
+AddService = IpfwMP,0x2, IpfwMP.AddService\r
+\r
+[IpfwMP.AddService]\r
+ServiceType    = 1 ;SERVICE_KERNEL_DRIVER\r
+StartType      = 3 ;SERVICE_DEMAND_START\r
+ErrorControl   = 1 ;SERVICE_ERROR_NORMAL\r
+ServiceBinary  = %12%\ipfw.sys\r
+AddReg         = IpfwMP.AddService.AddReg\r
+\r
+[IpfwMP.AddService.AddReg]\r
+; None\r
+\r
+[Strings]\r
+Unipi = "Unipi"\r
+Desc = "Ipfw Miniport"
diff --git a/kipfw/sources b/kipfw/sources
new file mode 100644 (file)
index 0000000..9481e75
--- /dev/null
@@ -0,0 +1,20 @@
+TARGETNAME=ipfw\r
+TARGETTYPE=DRIVER\r
+\r
+C_DEFINES=$(C_DEFINES) -DNDIS_MINIPORT_DRIVER -DNDIS_WDM=1\r
+\r
+MSC_WARNING_LEVEL=/W2\r
+\r
+# The driver is built in the XP or .NET build environment\r
+# So let us build NDIS 5.1 version.\r
+C_DEFINES=$(C_DEFINES) -DNDIS51_MINIPORT=1\r
+C_DEFINES=$(C_DEFINES) -DNDIS51=1\r
+\r
+# Enable dummynet preprocessing macros\r
+C_DEFINES=$(C_DEFINES) /D_WIN32 /DMODULENAME=Ipfw /D_BSD_SOURCE /DKERNEL_MODULE /D_KERNEL /DKLD_MODULE /D__BSD_VISIBLE /DIPFIREWALL_DEFAULT_TO_ACCEPT /D__LITTLE_ENDIAN /DSYSCTL_NODE /DEMULATE_SYSCTL -FIwinmissing.h -FImissing.h -FI../glue.h /DWIN32_LEAN_AND_MEAN=1\r
+\r
+TARGETLIBS=$(DDK_LIB_PATH)\ndis.lib\r
+\r
+INCLUDES= include_e ; ../sys\r
+\r
+SOURCES= ip_fw2.c ip_fw_pfil.c ip_fw_sockopt.c ip_fw_dynamic.c ip_fw_table.c ip_fw_log.c radix.c in_cksum.c ip_dummynet.c ip_dn_io.c ip_dn_glue.c dn_heap.c dn_sched_fifo.c dn_sched_wf2q.c dn_sched_rr.c dn_sched_qfq.c dn_sched_prio.c ipfw2_mod.c bsd_compat.c md_win.c miniport.c protocol.c passthru.c debug.c\r
diff --git a/kipfw/win-passthru.diff b/kipfw/win-passthru.diff
new file mode 100644 (file)
index 0000000..eeb211b
--- /dev/null
@@ -0,0 +1,251 @@
+diff -ubwrp original_passthru/miniport.c kipfw/miniport.c
+--- original_passthru/miniport.c       2012-08-01 14:34:15.096679600 +0200
++++ kipfw/miniport.c   2012-08-01 14:34:11.377929600 +0200
+@@ -223,6 +223,7 @@ Return Value:
+     //\r
+     // Use NDIS 5.1 packet stacking:\r
+     //\r
++    if (0)    // XXX IPFW - make sure we don't go in here
+     {\r
+         PNDIS_PACKET_STACK        pStack;\r
+         BOOLEAN                   Remaining;\r
+@@ -347,6 +348,25 @@ Return Value:
+                                                 MediaSpecificInfo,\r
+                                                 MediaSpecificInfoSize);\r
+         }\r
++#if 1 /* IPFW: query the firewall */
++      /* if dummynet keeps the packet, we mimic success.
++       * otherwise continue as usual.
++       */
++              {
++                      int ret = ipfw2_qhandler_w32(MyPacket, OUTGOING,
++                                      MiniportAdapterContext);
++                      if (ret != PASS) {
++                              if (ret == DROP)
++                                      return NDIS_STATUS_FAILURE;
++                              else {  //dummynet kept the packet
++#ifndef WIN9X
++                                      NdisIMCopySendCompletePerPacketInfo (Packet, MyPacket);
++#endif
++                                      return NDIS_STATUS_SUCCESS; //otherwise simply continue
++                              }
++                      }
++              }
++#endif        /* end of IPFW code */
\r
+         NdisSend(&Status,\r
+                  pAdapt->BindingHandle,\r
+diff -ubwrp original_passthru/passthru.c kipfw/passthru.c
+--- original_passthru/passthru.c       2012-08-01 14:34:15.268554600 +0200
++++ kipfw/passthru.c   2012-08-01 14:34:11.534179600 +0200
+@@ -47,8 +47,15 @@ NDIS_HANDLE        NdisWrapperHandle;
+ // To support ioctls from user-mode:\r
+ //\r
\r
+-#define LINKNAME_STRING     L"\\DosDevices\\Passthru"\r
+-#define NTDEVICE_STRING     L"\\Device\\Passthru"\r
++#define STR2(x) #x
++#define STR(x) STR2(x)
++#define DOSPREFIX "\\DosDevices\\"
++#define NTPREFIX "\\Device\\"
++#define WIDEN2(x) L ## x
++#define WIDEN(x) WIDEN2(x)
++#define LINKNAME_STRING                       WIDEN(DOSPREFIX) WIDEN(STR(MODULENAME))
++#define NTDEVICE_STRING                       WIDEN(NTPREFIX) WIDEN(STR(MODULENAME))
++#define PROTOCOLNAME_STRING           WIDEN(STR(MODULENAME))
\r
+ NDIS_HANDLE     NdisDeviceHandle = NULL;\r
+ PDEVICE_OBJECT  ControlDeviceObject = NULL;\r
+@@ -136,8 +143,8 @@ Return Value:
+         // Either the Send or the SendPackets handler should be specified.\r
+         // If SendPackets handler is specified, SendHandler is ignored\r
+         //\r
+-        MChars.SendHandler = NULL;    // MPSend;\r
+-        MChars.SendPacketsHandler = MPSendPackets;\r
++        MChars.SendHandler = MPSend;    // IPFW: use MPSend, not SendPackets
++        MChars.SendPacketsHandler = NULL;
\r
+         Status = NdisIMRegisterLayeredMiniport(NdisWrapperHandle,\r
+                                                   &MChars,\r
+@@ -165,7 +172,7 @@ Return Value:
+         // This is needed to ensure that NDIS can correctly determine\r
+         // the binding and call us to bind to miniports below.\r
+         //\r
+-        NdisInitUnicodeString(&Name, L"Passthru");    // Protocol name\r
++        NdisInitUnicodeString(&Name, PROTOCOLNAME_STRING);    // Protocol name
+         PChars.Name = Name;\r
+         PChars.OpenAdapterCompleteHandler = PtOpenAdapterComplete;\r
+         PChars.CloseAdapterCompleteHandler = PtCloseAdapterComplete;\r
+@@ -205,6 +212,8 @@ Return Value:
+         NdisTerminateWrapper(NdisWrapperHandle, NULL);\r
+     }\r
\r
++    ipfw_module_init();       // IPFW - start the system
++
+     return(Status);\r
+ }\r
\r
+@@ -276,7 +285,8 @@ Return Value:
+         DispatchTable[IRP_MJ_CREATE] = PtDispatch;\r
+         DispatchTable[IRP_MJ_CLEANUP] = PtDispatch;\r
+         DispatchTable[IRP_MJ_CLOSE] = PtDispatch;\r
+-        DispatchTable[IRP_MJ_DEVICE_CONTROL] = PtDispatch;\r
++      // IPFW we use DevIoControl ?
++        DispatchTable[IRP_MJ_DEVICE_CONTROL] = DevIoControl;
+         \r
\r
+         NdisInitUnicodeString(&DeviceName, NTDEVICE_STRING);\r
+@@ -453,6 +463,7 @@ PtUnload(
+     \r
+     NdisFreeSpinLock(&GlobalLock);\r
\r
++    ipfw_module_exit(); // IPFW unloading dummynet
++
+     DBGPRINT(("PtUnload: done!\n"));\r
+ }\r
+-\r
+diff -ubwrp original_passthru/passthru.h kipfw/passthru.h
+--- original_passthru/passthru.h       2012-08-01 14:34:15.049804600 +0200
++++ kipfw/passthru.h   2012-08-01 14:34:11.362304600 +0200
+@@ -61,6 +61,13 @@ PtDispatch(
+     IN PIRP                      Irp\r
+     );\r
\r
++DRIVER_DISPATCH DevIoControl;\r
++NTSTATUS\r
++DevIoControl(\r
++    IN PDEVICE_OBJECT            pDeviceObject,\r
++    IN PIRP                      pIrp\r
++    );\r
++\r
+ NDIS_STATUS\r
+ PtRegisterDevice(\r
+     VOID\r
+@@ -366,6 +373,7 @@ PtDereferenceAdapt(
+ typedef struct _SEND_RSVD\r
+ {\r
+     PNDIS_PACKET    OriginalPkt;\r
++    struct mbuf*    pMbuf; // IPFW extension, reference to the mbuf\r
+ } SEND_RSVD, *PSEND_RSVD;\r
\r
+ //\r
+@@ -376,6 +384,7 @@ typedef struct _SEND_RSVD
+ typedef struct _RECV_RSVD\r
+ {\r
+     PNDIS_PACKET    OriginalPkt;\r
++    struct mbuf*    pMbuf; // IPFW extension, reference to the mbuf\r
+ } RECV_RSVD, *PRECV_RSVD;\r
\r
+ C_ASSERT(sizeof(RECV_RSVD) <= sizeof(((PNDIS_PACKET)0)->MiniportReserved));\r
+@@ -475,3 +484,17 @@ IsIMDeviceStateOn(
+ */\r
+ #define IsIMDeviceStateOn(_pP)        ((_pP)->MPDeviceState == NdisDeviceStateD0 && (_pP)->PTDeviceState == NdisDeviceStateD0 ) \r
\r
++#include "winmissing.h"\r
++\r
++int ipfw_module_init(void);\r
++void ipfw_module_exit(void);\r
++int ipfw2_qhandler_w32(PNDIS_PACKET pNdisPacket, int direction,\r
++      NDIS_HANDLE Context);\r
++int ipfw2_qhandler_w32_oldstyle(int direction, NDIS_HANDLE ProtocolBindingContext,\r
++              unsigned char* HeaderBuffer, unsigned int HeaderBufferSize,\r
++              unsigned char* LookAheadBuffer, unsigned int LookAheadBufferSize,\r
++          unsigned int PacketSize);\r
++void CleanupReinjected(PNDIS_PACKET Packet, struct mbuf* m, PADAPT pAdapt);\r
++void hexdump(PUCHAR,int, const char *);\r
++void my_init();\r
++void my_exit();
+\ Manca newline alla fine del file
+Solo in original_passthru: passthru.htm
+Solo in original_passthru: passthru.rc
+diff -ubwrp original_passthru/protocol.c kipfw/protocol.c
+--- original_passthru/protocol.c       2012-08-01 14:34:15.112304600 +0200
++++ kipfw/protocol.c   2012-08-01 14:34:11.409179600 +0200
+@@ -841,6 +841,14 @@ Return Value:
+         SendRsvd = (PSEND_RSVD)(Packet->ProtocolReserved);\r
+         Pkt = SendRsvd->OriginalPkt;\r
+     \r
++#if 1 // IPFW - new code
++      //DbgPrint("SendComplete: packet %p pkt %p\n", Packet, Pkt);
++      if (Pkt == NULL) { //this is a reinjected packet, with no 'father'
++              CleanupReinjected(Packet, SendRsvd->pMbuf, pAdapt);
++              return;
++      }
++#endif /* IPFW */
++    
+ #ifndef WIN9X\r
+         NdisIMCopySendCompletePerPacketInfo (Pkt, Packet);\r
+ #endif\r
+@@ -1021,6 +1029,13 @@ Return Value:
\r
+                 if (pAdapt->MiniportHandle != NULL)\r
+                 {\r
++#if 1 /* IPFW: query the firewall */
++                                      int     ret;
++                                      ret = ipfw2_qhandler_w32(MyPacket, INCOMING,
++                                              ProtocolBindingContext);
++                                      if (ret != PASS)
++                                      return 0; //otherwise simply continue
++#endif /* end of IPFW code */
+                     NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &MyPacket, 1);\r
+                 }\r
\r
+@@ -1055,6 +1070,13 @@ Return Value:
+         {\r
+             case NdisMedium802_3:\r
+             case NdisMediumWan:\r
++                              //DbgPrint("EthIndicateReceive context %p, header at %p len %u, lookahead at %p len %u, packetsize %u\n",ProtocolBindingContext,HeaderBuffer,HeaderBufferSize,LookAheadBuffer,LookAheadBufferSize,PacketSize);
++                              //hexdump(HeaderBuffer,HeaderBufferSize+LookAheadBufferSize,"EthIndicateReceive");
++                      {
++                              int ret = ipfw2_qhandler_w32_oldstyle(INCOMING, ProtocolBindingContext, HeaderBuffer, HeaderBufferSize, LookAheadBuffer, LookAheadBufferSize, PacketSize);
++                              if (ret != PASS)
++                                      return NDIS_STATUS_SUCCESS;
++                      }
+                 NdisMEthIndicateReceive(pAdapt->MiniportHandle,\r
+                                              MacReceiveContext,\r
+                                              HeaderBuffer,\r
+@@ -1120,6 +1142,21 @@ Return Value:
+     PADAPT        pAdapt =(PADAPT)ProtocolBindingContext;\r
+     ULONG         Proc = KeGetCurrentProcessorNumber();      \r
\r
++      /* Warning: this is a poor implementation of the PtReceiveComplete
++       * made by MS, and it's a well known (but never fixed) issue.
++       * Since the ProcessorNumber here can be different from the one
++       * that processed the PtReceive, sometimes NdisMEthIndicateReceiveComplete
++       * will not be called, causing poor performance in the incoming traffic.
++       * In our driver, PtReceive is called for IP packets ONLY by particulary 
++       * old NIC drivers, and the poor performance can be seen even 
++       * in traffic not handled by ipfw or dummynet.
++       * Fortunately, this is quite rare, all the incoming IP packets
++       * will arrive through PtReceivePacket, and this callback will never
++       * be called. For reinjected traffic, a workaround is done
++       * commuting the ReceivedIndicationFlag and calling
++       * NdisMEthIndicateReceiveComplete manually for each packet.
++       */
++
+     if (((pAdapt->MiniportHandle != NULL)\r
+                 && (pAdapt->MPDeviceState == NdisDeviceStateD0))\r
+                 && (pAdapt->ReceivedIndicationFlags[Proc]))\r
+@@ -1199,7 +1236,7 @@ Return Value:
+     // See also: PtReceive(). \r
+     //\r
+     (VOID)NdisIMGetCurrentPacketStack(Packet, &Remaining);\r
+-    if (Remaining)\r
++    if (0 && Remaining)
+     {\r
+         //\r
+         // We can reuse "Packet". Indicate it up and be done with it.\r
+@@ -1247,6 +1284,13 @@ Return Value:
\r
+         if (pAdapt->MiniportHandle != NULL)\r
+         {\r
++#if 1 /* IPFW: query the firewall */
++          int ret;
++          ret = ipfw2_qhandler_w32(MyPacket, INCOMING,
++                      ProtocolBindingContext);
++          if (ret != PASS)
++                      return 0; //otherwise simply continue
++#endif /* end of IPFW code */
+             NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &MyPacket, 1);\r
+         }\r
\r
diff --git a/kipfw/winmissing.h b/kipfw/winmissing.h
new file mode 100644 (file)
index 0000000..5870264
--- /dev/null
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2010 Francesco Magno, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id: winmissing.h 11647 2012-08-06 23:20:21Z luigi $
+ * definitions and other things needed to build freebsd kernel
+ * modules in Windows (with the MSVC compiler)
+ */
+
+#ifndef _WINMISSING_H_
+#define _WINMISSING_H_
+
+#include <ntifs.h>
+#include <ntddk.h>
+#include <basetsd.h>
+#include <windef.h>
+#include <stdio.h>
+#include <ndis.h>
+
+typedef UCHAR  u_char;
+typedef UCHAR  u_int8_t;
+typedef UCHAR  uint8_t;
+typedef USHORT u_short;
+typedef USHORT u_int16_t;
+typedef USHORT uint16_t;
+typedef USHORT n_short;
+typedef UINT   u_int;
+typedef INT32  int32_t;
+typedef UINT32 u_int32_t;
+typedef UINT32 uint32_t;
+typedef ULONG  u_long;
+typedef ULONG  n_long;
+typedef UINT64 uint64_t;
+typedef UINT64 u_int64_t;
+typedef INT64  int64_t;
+
+typedef UINT32 in_addr_t;
+typedef UCHAR  sa_family_t;
+typedef        USHORT  in_port_t;
+typedef UINT32 __gid_t;
+typedef UINT32 gid_t;
+typedef UINT32 __uid_t;
+typedef UINT32 uid_t;
+typedef ULONG  n_time;
+typedef char*  caddr_t;
+
+/* linux_lookup uses __be32 and __be16 in the prototype */
+typedef uint32_t __be32; /* XXX __u32 __bitwise __be32 */
+typedef uint16_t __be16; /* XXX */
+
+//*** DEBUG STUFF ***
+/*
+ * To see the debugging messages you need DbgView
+ * http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx
+ */
+#define printf         DbgPrint
+#define log(lev, ...)  DbgPrint(__VA_ARGS__)
+const char* texify_cmd(int i);
+const char* texify_proto(unsigned int p);
+//*** end DEBUG STUFF ***
+
+#define snprintf _snprintf
+#define timespec timeval
+struct timeval {
+       long tv_sec;
+       long tv_usec;
+};
+
+struct in_addr {
+       in_addr_t s_addr;
+};
+
+struct sockaddr_in {
+       uint8_t sin_len;
+       sa_family_t     sin_family;
+       in_port_t       sin_port;
+       struct  in_addr sin_addr;
+       char    sin_zero[8];
+};
+
+/* XXX watch out, windows names are actually longer */
+#define IFNAMSIZ       16
+#define IF_NAMESIZE    16
+
+#define ETHER_ADDR_LEN 6
+
+/* we do not include the windows headers for in6_addr so
+ * we need to provide our own definition for the kernel.
+ */
+struct in6_addr {
+        union {
+                uint8_t         __u6_addr8[16];
+                uint16_t        __u6_addr16[8]; 
+                uint32_t        __u6_addr32[4];
+        } __u6_addr;                    /* 128-bit IP6 address */
+};
+
+#define        htons(x) RtlUshortByteSwap(x)
+#define        ntohs(x) RtlUshortByteSwap(x)
+#define        htonl(x) RtlUlongByteSwap(x)
+#define        ntohl(x) RtlUlongByteSwap(x)
+
+#define ENOSPC          28      /* No space left on device */
+#define        EOPNOTSUPP      45      /* Operation not supported */
+#define        EACCES          13      /* Permission denied */
+#define        ENOENT          2       /* No such file or directory */
+#define EINVAL          22      /* Invalid argument */
+#define        EPROTONOSUPPORT 43      /* Protocol not supported */
+#define        ENOMEM          12      /* Cannot allocate memory */
+#define        EEXIST          17      /* File exists */
+#define ESRCH          3
+#define        ENOBUFS         55      /* No buffer space available */
+#define        EBUSY           16      /* Module busy */
+
+
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+#define __packed                /* expands to nothing */
+#define __aligned(x)            /* expands to nothing; a stray ';' here used to inject an empty statement at every use */
+#define __user
+#define __init
+#define __exit
+#define __func__ __FUNCTION__
+#define inline __inline
+
+struct sockaddr_in6 {
+       int dummy;
+};
+
+//SPINLOCKS
+#define DEFINE_SPINLOCK(x)             NDIS_SPIN_LOCK x
+#define mtx_init(m,a,b,c)              NdisAllocateSpinLock(m)
+#define mtx_lock(_l)                   NdisAcquireSpinLock(_l)
+#define mtx_unlock(_l)                 NdisReleaseSpinLock(_l)
+#define        mtx_destroy(m)                  NdisFreeSpinLock(m)
+#define mtx_assert(a, b)
+
+#define rw_rlock(_l)                   NdisAcquireSpinLock(_l)
+#define rw_runlock(_l)                 NdisReleaseSpinLock(_l)
+#define rw_assert(a, b)
+#define rw_wlock(_l)                   NdisAcquireSpinLock(_l)
+#define rw_wunlock(_l)                 NdisReleaseSpinLock(_l)
+#define rw_destroy(_l)                 NdisFreeSpinLock(_l)
+#define rw_init(_l, msg)               NdisAllocateSpinLock(_l)
+#define rw_init_flags(_l, s, v)                NdisAllocateSpinLock(_l)
+
+#define rwlock_t NDIS_SPIN_LOCK
+#define spinlock_t NDIS_SPIN_LOCK
+
+#define s6_addr   __u6_addr.__u6_addr8
+
+
+struct icmphdr {
+       u_char  icmp_type;              /* type of message, see below */
+       u_char  icmp_code;              /* type sub code */
+       u_short icmp_cksum;             /* ones complement cksum of struct */
+};
+
+#define        ICMP_ECHO               8               /* echo service */
+
+#define IPOPT_OPTVAL            0               /* option ID */
+#define IPOPT_OLEN              1               /* option length */
+#define IPOPT_EOL               0               /* end of option list */
+#define IPOPT_NOP               1               /* no operation */
+#define IPOPT_LSRR              131             /* loose source route */
+#define IPOPT_SSRR              137             /* strict source route */
+#define IPOPT_RR                7               /* record packet route */
+#define IPOPT_TS                68              /* timestamp */
+
+#define        IPPROTO_ICMP    1               /* control message protocol */
+#define        IPPROTO_TCP             6               /* tcp */
+#define        IPPROTO_UDP             17              /* user datagram protocol */
+#define        IPPROTO_ICMPV6          58              /* ICMP6 */
+#define        IPPROTO_SCTP            132             /* SCTP */
+#define        IPPROTO_HOPOPTS         0               /* IP6 hop-by-hop options */
+#define        IPPROTO_ROUTING         43              /* IP6 routing header */
+#define        IPPROTO_FRAGMENT        44              /* IP6 fragmentation header */
+#define        IPPROTO_DSTOPTS         60              /* IP6 destination option */
+#define        IPPROTO_AH              51              /* IP6 Auth Header */
+#define        IPPROTO_ESP             50              /* IP6 Encap Sec. Payload */
+#define        IPPROTO_NONE            59              /* IP6 no next header */
+#define        IPPROTO_PIM             103             /* Protocol Independent Mcast */
+
+#define IPPROTO_IPV6           41
+#define        IPPROTO_IPV4            4               /* IPv4 encapsulation */
+
+
+#define        INADDR_ANY              (uint32_t)0x00000000
+
+#define        AF_INET         2               /* internetwork: UDP, TCP, etc. */
+#define        AF_LINK         18              /* Link layer interface */
+
+#define        IN_CLASSD(i)            (((uint32_t)(i) & 0xf0000000) == 0xe0000000)
+#define        IN_MULTICAST(i)         IN_CLASSD(i)
+
+#define DROP 0
+#define PASS 1
+#define DUMMYNET 2
+#define INCOMING 0
+#define OUTGOING 1
+
+size_t strlcpy(char *dst, const char *src, size_t siz);
+void do_gettimeofday(struct timeval *tv);
+int ffs(int bits);
+int time_uptime_w32();
+
+#endif /* _WINMISSING_H_ */
diff --git a/kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk b/kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk
new file mode 100644 (file)
index 0000000..de31ced
Binary files /dev/null and b/kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk differ
diff --git a/modified_passthru/miniport.c b/modified_passthru/miniport.c
new file mode 100644 (file)
index 0000000..3baff88
--- /dev/null
@@ -0,0 +1,1481 @@
+/*++
+
+Copyright (c) 1992-2000  Microsoft Corporation
+
+Module Name:
+
+    miniport.c
+
+Abstract:
+
+    Ndis Intermediate Miniport driver sample. This is a passthru driver.
+
+Author:
+
+Environment:
+
+
+Revision History:
+
+
+--*/
+
+#include "precomp.h"
+#pragma hdrstop
+
+
+
+NDIS_STATUS
+MPInitialize(
+    OUT PNDIS_STATUS             OpenErrorStatus,
+    OUT PUINT                    SelectedMediumIndex,
+    IN  PNDIS_MEDIUM             MediumArray,
+    IN  UINT                     MediumArraySize,
+    IN  NDIS_HANDLE              MiniportAdapterHandle,
+    IN  NDIS_HANDLE              WrapperConfigurationContext
+    )
+/*++
+
+Routine Description:
+
+    This is the initialize handler which gets called as a result of
+    the BindAdapter handler calling NdisIMInitializeDeviceInstanceEx.
+    The context parameter which we pass there is the adapter structure
+    which we retrieve here.
+
+    Arguments:
+
+    OpenErrorStatus            Receives a copy of the status we return
+    SelectedMediumIndex        Receives the index in MediumArray of the medium we use
+    MediumArray                Array of ndis media passed down to us to pick from
+    MediumArraySize            Size of the array
+    MiniportAdapterHandle    The handle NDIS uses to refer to us
+    WrapperConfigurationContext    Unreferenced in this routine
+
+Return Value:
+
+    NDIS_STATUS_SUCCESS unless something goes wrong
+    NDIS_STATUS_UNSUPPORTED_MEDIA if our medium is not found in MediumArray
+--*/
+{
+    UINT            i;
+    PADAPT          pAdapt;
+    NDIS_STATUS     Status = NDIS_STATUS_FAILURE;
+    NDIS_MEDIUM     Medium;
+
+    UNREFERENCED_PARAMETER(WrapperConfigurationContext);
+    
+    do
+    {
+        //
+        // Start off by retrieving our adapter context and storing
+        // the Miniport handle in it.
+        //
+        pAdapt = NdisIMGetDeviceContext(MiniportAdapterHandle); // NOTE(review): dereferenced on the next line without a NULL check
+        pAdapt->MiniportIsHalted = FALSE;
+
+        DBGPRINT(("==> Miniport Initialize: Adapt %p\n", pAdapt));
+
+        //
+        // Usually we export the medium type of the adapter below as our
+        // virtual miniport's medium type. However if the adapter below us
+        // is a WAN device, then we claim to be of medium type 802.3.
+        //
+        Medium = pAdapt->Medium;
+
+        if (Medium == NdisMediumWan)
+        {
+            Medium = NdisMedium802_3;
+        }
+
+        for (i = 0; i < MediumArraySize; i++)
+        {
+            if (MediumArray[i] == Medium)
+            {
+                *SelectedMediumIndex = i;
+                break;
+            }
+        }
+
+        if (i == MediumArraySize)
+        {
+            Status = NDIS_STATUS_UNSUPPORTED_MEDIA;
+            break; // leave the do/while; the common exit code below still runs
+        }
+
+
+        //
+        // Set the attributes now. NDIS_ATTRIBUTE_DESERIALIZE enables us
+        // to make up-calls to NDIS without having to call NdisIMSwitchToMiniport
+        // or NdisIMQueueCallBack. This also forces us to protect our data using
+        // spinlocks where appropriate. Also in this case NDIS does not queue
+        // packets on our behalf. Since this is a very simple pass-thru
+        // miniport, we do not have a need to protect anything. However in
+        // a general case there will be a need to use per-adapter spin-locks
+        // for the packet queues at the very least.
+        //
+        NdisMSetAttributesEx(MiniportAdapterHandle,
+                             pAdapt,
+                             0,                                        // CheckForHangTimeInSeconds
+                             NDIS_ATTRIBUTE_IGNORE_PACKET_TIMEOUT    |
+                                NDIS_ATTRIBUTE_IGNORE_REQUEST_TIMEOUT|
+                                NDIS_ATTRIBUTE_INTERMEDIATE_DRIVER |
+                                NDIS_ATTRIBUTE_DESERIALIZE |
+                                NDIS_ATTRIBUTE_NO_HALT_ON_SUSPEND,
+                             0);
+
+        pAdapt->MiniportHandle = MiniportAdapterHandle;
+        //
+        // Initialize LastIndicatedStatus to be NDIS_STATUS_MEDIA_CONNECT
+        //
+        pAdapt->LastIndicatedStatus = NDIS_STATUS_MEDIA_CONNECT;
+        
+        //
+        // Initialize the power states for both the lower binding (PTDeviceState)
+        // and our miniport edge to Powered On.
+        //
+        pAdapt->MPDeviceState = NdisDeviceStateD0;
+        pAdapt->PTDeviceState = NdisDeviceStateD0;
+
+        //
+        // Add this adapter to the head of the global pAdapt List
+        //
+        NdisAcquireSpinLock(&GlobalLock);
+
+        pAdapt->Next = pAdaptList;
+        pAdaptList = pAdapt;
+
+        NdisReleaseSpinLock(&GlobalLock);
+        
+        //
+        // Create an ioctl interface. The returned status is discarded, so a
+        // failure here does not fail the initialization.
+        (VOID)PtRegisterDevice();
+
+        Status = NDIS_STATUS_SUCCESS;
+    }
+    while (FALSE);
+
+    //
+    // If we had received an UnbindAdapter notification on the underlying
+    // adapter, we would have blocked that thread waiting for the IM Init
+    // process to complete. Wake up any such thread.
+    //
+    ASSERT(pAdapt->MiniportInitPending == TRUE);
+    pAdapt->MiniportInitPending = FALSE;
+    NdisSetEvent(&pAdapt->MiniportInitEvent);
+
+    if (Status == NDIS_STATUS_SUCCESS)
+    {
+        PtReferenceAdapt(pAdapt); // a reference is taken only on the success path
+    }
+
+    DBGPRINT(("<== Miniport Initialize: Adapt %p, Status %x\n", pAdapt, Status));
+
+    *OpenErrorStatus = Status;
+
+    
+    return Status;
+}
+
+
+NDIS_STATUS
+MPSend(
+    IN NDIS_HANDLE             MiniportAdapterContext,
+    IN PNDIS_PACKET            Packet,
+    IN UINT                    Flags
+    )
+/*++
+
+Routine Description:
+
+    Send Packet handler. Either this or our SendPackets (array) handler is called
+    based on which one is enabled in our Miniport Characteristics.
+
+Arguments:
+
+    MiniportAdapterContext    Pointer to the adapter
+    Packet                    Packet to send
+    Flags                     Copied into the flags of the wrapper packet
+
+Return Value:
+    NDIS_STATUS_FAILURE if either edge is in a low power state or ipfw returns DROP,
+    NDIS_STATUS_SUCCESS if ipfw returns a verdict other than PASS or DROP,
+    otherwise the status from NdisAllocatePacket or NdisSend
+--*/
+{
+    PADAPT              pAdapt = (PADAPT)MiniportAdapterContext;
+    NDIS_STATUS         Status;
+    PNDIS_PACKET        MyPacket;
+    PVOID               MediaSpecificInfo = NULL;
+    ULONG               MediaSpecificInfoSize = 0;
+
+    //
+    // The driver should fail the send if the virtual miniport is in low
+    // power state
+    //
+    if (pAdapt->MPDeviceState > NdisDeviceStateD0)
+    {
+         return NDIS_STATUS_FAILURE;
+    }
+
+#ifdef NDIS51
+    //
+    // Use NDIS 5.1 packet stacking:
+    //
+    if (0)     // IPFW: packet stacking is disabled, so every send takes the wrapper-packet path below
+    {
+        PNDIS_PACKET_STACK        pStack;
+        BOOLEAN                   Remaining;
+
+        //
+        // Packet stacks: Check if we can use the same packet for sending down.
+        //
+
+        pStack = NdisIMGetCurrentPacketStack(Packet, &Remaining);
+        if (Remaining)
+        {
+            //
+            // We can reuse "Packet".
+            //
+            // NOTE: if we needed to keep per-packet information in packets
+            // sent down, we can use pStack->IMReserved[].
+            //
+            ASSERT(pStack);
+            //
+            // If the below miniport is going to low power state, stop sending down any packet.
+            //
+            NdisAcquireSpinLock(&pAdapt->Lock);
+            if (pAdapt->PTDeviceState > NdisDeviceStateD0)
+            {
+                NdisReleaseSpinLock(&pAdapt->Lock);
+                return NDIS_STATUS_FAILURE;
+            }
+            pAdapt->OutstandingSends++;
+            NdisReleaseSpinLock(&pAdapt->Lock);
+            NdisSend(&Status,
+                     pAdapt->BindingHandle,
+                     Packet);
+
+            if (Status != NDIS_STATUS_PENDING)
+            {
+                ADAPT_DECR_PENDING_SENDS(pAdapt);
+            }
+
+            return(Status);
+        }
+    }
+#endif // NDIS51
+
+    //
+    // We are either not using packet stacks, or there isn't stack space
+    // in the original packet passed down to us. Allocate a new packet
+    // to wrap the data with.
+    //
+    //
+    // If the below miniport is going to low power state, stop sending down any packet.
+    //
+    NdisAcquireSpinLock(&pAdapt->Lock);
+    if (pAdapt->PTDeviceState > NdisDeviceStateD0)
+    {
+        NdisReleaseSpinLock(&pAdapt->Lock);
+        return NDIS_STATUS_FAILURE;
+    
+    }
+    pAdapt->OutstandingSends++; // counted under the lock; undone below on completion or allocation failure
+    NdisReleaseSpinLock(&pAdapt->Lock);
+    
+    NdisAllocatePacket(&Status,
+                       &MyPacket,
+                       pAdapt->SendPacketPoolHandle);
+
+    if (Status == NDIS_STATUS_SUCCESS)
+    {
+        PSEND_RSVD            SendRsvd;
+
+        //
+        // Save a pointer to the original packet in our reserved
+        // area in the new packet. This is needed so that we can
+        // get back to the original packet when the new packet's send
+        // is completed.
+        //
+        SendRsvd = (PSEND_RSVD)(MyPacket->ProtocolReserved);
+        SendRsvd->OriginalPkt = Packet;
+
+        NdisGetPacketFlags(MyPacket) = Flags;
+
+        //
+        // Set up the new packet so that it describes the same
+        // data as the original packet.
+        //
+        NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet);
+        NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet);
+#ifdef WIN9X
+        //
+        // Work around the fact that NDIS does not initialize this
+        // to FALSE on Win9x.
+        //
+        NDIS_PACKET_VALID_COUNTS(MyPacket) = FALSE;
+#endif
+
+        //
+        // Copy the OOB Offset from the original packet to the new
+        // packet.
+        //
+        NdisMoveMemory(NDIS_OOB_DATA_FROM_PACKET(MyPacket),
+                       NDIS_OOB_DATA_FROM_PACKET(Packet),
+                       sizeof(NDIS_PACKET_OOB_DATA));
+
+#ifndef WIN9X
+        //
+        // Copy the right parts of per packet info into the new packet.
+        // This API is not available on Win9x since task offload is
+        // not supported on that platform.
+        //
+        NdisIMCopySendPerPacketInfo(MyPacket, Packet);
+#endif
+        
+        //
+        // Copy the Media specific information
+        //
+        NDIS_GET_PACKET_MEDIA_SPECIFIC_INFO(Packet,
+                                            &MediaSpecificInfo,
+                                            &MediaSpecificInfoSize);
+
+        if (MediaSpecificInfo || MediaSpecificInfoSize)
+        {
+            NDIS_SET_PACKET_MEDIA_SPECIFIC_INFO(MyPacket,
+                                                MediaSpecificInfo,
+                                                MediaSpecificInfoSize);
+               }
+#if 1  /* IPFW: hand the wrapper packet to the firewall before sending */
+       /* PASS falls through to NdisSend, DROP fails the send, and any other
+        * verdict (dummynet kept the packet) is reported to NDIS as success.
+        */
+               {
+                       int ret = ipfw2_qhandler_w32(MyPacket, OUTGOING,
+                                       MiniportAdapterContext);
+                       if (ret != PASS) {
+                               if (ret == DROP)
+                                       return NDIS_STATUS_FAILURE; // NOTE(review): MyPacket is not freed and OutstandingSends is not decremented on this path - confirm ipfw2_qhandler_w32 does it
+                               else {  //dummynet kept the packet
+#ifndef WIN9X
+                                       NdisIMCopySendCompletePerPacketInfo (Packet, MyPacket);
+#endif
+                                       return NDIS_STATUS_SUCCESS; // NOTE(review): as for DROP, nothing here releases MyPacket or the pending-send count - confirm who does
+                               }
+                       }
+               }
+#endif /* end of IPFW code */
+
+        NdisSend(&Status,
+                 pAdapt->BindingHandle,
+                 MyPacket);
+
+
+        if (Status != NDIS_STATUS_PENDING)
+        {
+#ifndef WIN9X
+            NdisIMCopySendCompletePerPacketInfo (Packet, MyPacket);
+#endif
+            NdisFreePacket(MyPacket);
+            ADAPT_DECR_PENDING_SENDS(pAdapt);
+        }
+    }
+    else
+    {
+        ADAPT_DECR_PENDING_SENDS(pAdapt);
+        //
+        // We are out of packets. Silently drop it. Alternatively we can deal with it:
+        //    - By keeping separate send and receive pools
+        //    - Dynamically allocate more pools as needed and free them when not needed
+        //
+    }
+
+    return(Status);
+}
+
+
+VOID
+MPSendPackets(
+    IN NDIS_HANDLE             MiniportAdapterContext,
+    IN PPNDIS_PACKET           PacketArray,
+    IN UINT                    NumberOfPackets
+    )
+/*++
+
+Routine Description:
+
+    Send Packet Array handler. Either this or our SendPacket handler is called
+    based on which one is enabled in our Miniport Characteristics.
+
+    Each packet in the array is handled independently:
+
+      - If our miniport edge (MPDeviceState) is not at D0, the packet is
+        completed with NDIS_STATUS_FAILURE.
+      - When built with NDIS51 and NdisIMGetCurrentPacketStack reports
+        remaining stack space, the original packet itself is sent down.
+      - Otherwise a packet is allocated from SendPacketPoolHandle, pointed at
+        the original's buffer chain, given a copy of the original's flags,
+        OOB data, per-packet info and media specific info, and sent down in
+        place of the original.
+
+    Whenever the send does not return NDIS_STATUS_PENDING, the original packet
+    is completed back to the caller with NdisMSendComplete before the loop
+    moves on to the next entry.
+
+    NOTE(review): the single-packet send handler earlier in this file calls
+    ipfw2_qhandler_w32 before NdisSend; this routine does not. Confirm that
+    this handler is not the one registered, or that bypassing the firewall
+    here is intended.
+
+Arguments:
+
+    MiniportAdapterContext     Pointer to our adapter
+    PacketArray                Set of packets to send
+    NumberOfPackets            Number of entries in PacketArray
+
+Return Value:
+
+    None
+
+--*/
+{
+    PADAPT              pAdapt = (PADAPT)MiniportAdapterContext;
+    NDIS_STATUS         Status;
+    UINT                i;
+    PVOID               MediaSpecificInfo = NULL;
+    UINT                MediaSpecificInfoSize = 0;
+    
+
+    for (i = 0; i < NumberOfPackets; i++)
+    {
+        PNDIS_PACKET    Packet, MyPacket;
+
+        Packet = PacketArray[i];
+        //
+        // The driver should fail the send if the virtual miniport is in low 
+        // power state
+        //
+        if (pAdapt->MPDeviceState > NdisDeviceStateD0)
+        {
+            NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt),
+                            Packet,
+                            NDIS_STATUS_FAILURE);
+            continue;
+        }
+
+#ifdef NDIS51
+
+        //
+        // Use NDIS 5.1 packet stacking:
+        //
+        {
+            PNDIS_PACKET_STACK        pStack;
+            BOOLEAN                   Remaining;
+
+            //
+            // Packet stacks: Check if we can use the same packet for sending down.
+            //
+            pStack = NdisIMGetCurrentPacketStack(Packet, &Remaining);
+            if (Remaining)
+            {
+                //
+                // We can reuse "Packet".
+                //
+                // NOTE: if we needed to keep per-packet information in packets
+                // sent down, we can use pStack->IMReserved[].
+                //
+                ASSERT(pStack);
+                //
+                // If the below miniport is going to low power state, stop sending down any packet.
+                //
+                NdisAcquireSpinLock(&pAdapt->Lock);
+                if (pAdapt->PTDeviceState > NdisDeviceStateD0)
+                {
+                    NdisReleaseSpinLock(&pAdapt->Lock);
+                    NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt),
+                                        Packet,
+                                        NDIS_STATUS_FAILURE);
+                }
+                else
+                {
+                    pAdapt->OutstandingSends++; // incremented while pAdapt->Lock is held
+                    NdisReleaseSpinLock(&pAdapt->Lock);
+                
+                    NdisSend(&Status,
+                              pAdapt->BindingHandle,
+                              Packet);
+        
+                    if (Status != NDIS_STATUS_PENDING)
+                    {
+                        NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt),
+                                            Packet,
+                                            Status);
+                   
+                        ADAPT_DECR_PENDING_SENDS(pAdapt);
+                    }
+                }
+                continue; // stacked path is finished with this packet either way
+            }
+        }
+#endif
+        do 
+        {
+            NdisAcquireSpinLock(&pAdapt->Lock);
+            //
+            // If the below miniport is going to low power state, stop sending down any packet.
+            //
+            if (pAdapt->PTDeviceState > NdisDeviceStateD0)
+            {
+                NdisReleaseSpinLock(&pAdapt->Lock);
+                Status = NDIS_STATUS_FAILURE;
+                break; // falls out to the NdisMSendComplete call after the do/while
+            }
+            pAdapt->OutstandingSends++; // incremented while pAdapt->Lock is held
+            NdisReleaseSpinLock(&pAdapt->Lock);
+            
+            NdisAllocatePacket(&Status,
+                               &MyPacket,
+                               pAdapt->SendPacketPoolHandle);
+
+            if (Status == NDIS_STATUS_SUCCESS)
+            {
+                PSEND_RSVD        SendRsvd;
+
+                SendRsvd = (PSEND_RSVD)(MyPacket->ProtocolReserved);
+                SendRsvd->OriginalPkt = Packet; // keep the original's pointer in MyPacket's ProtocolReserved area
+
+                NdisGetPacketFlags(MyPacket) = NdisGetPacketFlags(Packet);
+
+                NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet); // same buffer chain, no data copy
+                NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet);
+#ifdef WIN9X
+                //
+                // Work around the fact that NDIS does not initialize this
+                // to FALSE on Win9x.
+                //
+                NDIS_PACKET_VALID_COUNTS(MyPacket) = FALSE;
+#endif // WIN9X
+
+                //
+                // Copy the OOB data from the original packet to the new
+                // packet.
+                //
+                NdisMoveMemory(NDIS_OOB_DATA_FROM_PACKET(MyPacket),
+                            NDIS_OOB_DATA_FROM_PACKET(Packet),
+                            sizeof(NDIS_PACKET_OOB_DATA));
+                //
+                // Copy relevant parts of the per packet info into the new packet
+                //
+#ifndef WIN9X
+                NdisIMCopySendPerPacketInfo(MyPacket, Packet);
+#endif
+
+                //
+                // Copy the Media specific information
+                //
+                NDIS_GET_PACKET_MEDIA_SPECIFIC_INFO(Packet,
+                                                    &MediaSpecificInfo,
+                                                    &MediaSpecificInfoSize);
+
+                if (MediaSpecificInfo || MediaSpecificInfoSize)
+                {
+                    NDIS_SET_PACKET_MEDIA_SPECIFIC_INFO(MyPacket,
+                                                        MediaSpecificInfo,
+                                                        MediaSpecificInfoSize);
+                }
+
+                NdisSend(&Status,
+                         pAdapt->BindingHandle,
+                         MyPacket);
+
+                if (Status != NDIS_STATUS_PENDING)
+                {
+#ifndef WIN9X
+                    NdisIMCopySendCompletePerPacketInfo (Packet, MyPacket);
+#endif
+                    NdisFreePacket(MyPacket);
+                    ADAPT_DECR_PENDING_SENDS(pAdapt);
+                }
+            }
+            else
+            {
+                //
+                // The driver cannot allocate a packet. Status still holds the
+                // allocation failure and is used to complete the original below.
+                // 
+                ADAPT_DECR_PENDING_SENDS(pAdapt);
+            }
+        }
+        while (FALSE); // single-pass block; exists only so 'break' can leave early
+
+        if (Status != NDIS_STATUS_PENDING) // not pended: complete the original packet to the caller now
+        {
+            NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt),
+                              Packet,
+                              Status);
+        }
+    }
+}
+
+
+NDIS_STATUS
+MPQueryInformation(
+    IN NDIS_HANDLE                MiniportAdapterContext,
+    IN NDIS_OID                   Oid,
+    IN PVOID                      InformationBuffer,
+    IN ULONG                      InformationBufferLength,
+    OUT PULONG                    BytesWritten,
+    OUT PULONG                    BytesNeeded
+    )
+/*++
+
+Routine Description:
+
+    Entry point called by NDIS to query for the value of the specified OID.
+    Typical processing is to forward the query down to the underlying miniport.
+
+    The following OIDs are filtered here:
+
+    OID_PNP_QUERY_POWER - return success right here
+
+    OID_GEN_SUPPORTED_GUIDS - do not forward, otherwise we will show up
+    multiple instances of private GUIDs supported by the underlying miniport.
+
+    OID_PNP_CAPABILITIES - we do send this down to the lower miniport, but
+    the values returned are postprocessed before we complete this request;
+    see PtRequestComplete.
+
+    NOTE on OID_TCP_TASK_OFFLOAD - if this IM driver modifies the contents
+    of data it passes through such that a lower miniport may not be able
+    to perform TCP task offload, then it should not forward this OID down,
+    but fail it here with the status NDIS_STATUS_NOT_SUPPORTED. This is to
+    avoid performing incorrect transformations on data.
+
+    If our miniport edge (upper edge) is at a low-power state, fail the request.
+
+    If our protocol edge (lower edge) has been notified of a low-power state,
+    we pend this request until the miniport below has been set to D0. Since
+    requests to miniports are serialized always, at most a single request will
+    be pended.
+
+    The request is described in the single pAdapt->Request structure, and the
+    caller's BytesWritten/BytesNeeded pointers are saved in the adapter. If
+    NdisRequest completes synchronously, PtRequestComplete is called from here
+    and NDIS_STATUS_PENDING is returned to the caller.
+
+Arguments:
+
+    MiniportAdapterContext    Pointer to the adapter structure
+    Oid                       Oid for this query
+    InformationBuffer         Buffer for information
+    InformationBufferLength   Size of this buffer
+    BytesWritten              Specifies how much info is written
+    BytesNeeded               In case the buffer is smaller than what we need, tell them how much is needed
+
+
+Return Value:
+
+    NDIS_STATUS_SUCCESS         for OID_PNP_QUERY_POWER
+    NDIS_STATUS_NOT_SUPPORTED   for OID_GEN_SUPPORTED_GUIDS
+    NDIS_STATUS_FAILURE         if unbinding is in progress, our miniport edge
+                                is not at D0, or StandingBy is set
+    NDIS_STATUS_PENDING         if the request was queued because the lower
+                                edge is not at D0, or was handed to NdisRequest
+
+--*/
+{
+    PADAPT        pAdapt = (PADAPT)MiniportAdapterContext;
+    NDIS_STATUS   Status = NDIS_STATUS_FAILURE;
+
+    do
+    {
+        if (Oid == OID_PNP_QUERY_POWER)
+        {
+            //
+            //  Do not forward this.
+            //
+            Status = NDIS_STATUS_SUCCESS;
+            break;
+        }
+
+        if (Oid == OID_GEN_SUPPORTED_GUIDS)
+        {
+            //
+            //  Do not forward this, otherwise we will end up with multiple
+            //  instances of private GUIDs that the underlying miniport
+            //  supports.
+            //
+            Status = NDIS_STATUS_NOT_SUPPORTED;
+            break;
+        }
+
+        if (Oid == OID_TCP_TASK_OFFLOAD)
+        {
+            //
+            // Fail this -if- this driver performs data transformations
+            // that can interfere with a lower driver's ability to offload
+            // TCP tasks. (Currently disabled: the OID is forwarded.)
+            //
+            // Status = NDIS_STATUS_NOT_SUPPORTED;
+            // break;
+            //
+        }
+        //
+        // If the miniport below is unbinding, just fail any request
+        //
+        NdisAcquireSpinLock(&pAdapt->Lock);
+        if (pAdapt->UnbindingInProcess == TRUE)
+        {
+            NdisReleaseSpinLock(&pAdapt->Lock);
+            Status = NDIS_STATUS_FAILURE;
+            break;
+        }
+        NdisReleaseSpinLock(&pAdapt->Lock);
+        //
+        // All other queries are failed, if the miniport is not at D0,
+        //
+        if (pAdapt->MPDeviceState > NdisDeviceStateD0) 
+        {
+            Status = NDIS_STATUS_FAILURE;
+            break;
+        }
+
+        pAdapt->Request.RequestType = NdisRequestQueryInformation;
+        pAdapt->Request.DATA.QUERY_INFORMATION.Oid = Oid;
+        pAdapt->Request.DATA.QUERY_INFORMATION.InformationBuffer = InformationBuffer;
+        pAdapt->Request.DATA.QUERY_INFORMATION.InformationBufferLength = InformationBufferLength;
+        pAdapt->BytesNeeded = BytesNeeded;          // caller's out-pointers are saved in the adapter
+        pAdapt->BytesReadOrWritten = BytesWritten;
+
+        //
+        // If the miniport below is unbinding, fail the request
+        // (checked again, this time together with the power-state tests below)
+        //
+        NdisAcquireSpinLock(&pAdapt->Lock);
+            
+        if (pAdapt->UnbindingInProcess == TRUE)
+        {
+            NdisReleaseSpinLock(&pAdapt->Lock);
+            Status = NDIS_STATUS_FAILURE;
+            break;
+        }
+        //
+        // If the Protocol device state is OFF, mark this request as being 
+        // pended. We queue this until the device state is back to D0. 
+        //
+        if ((pAdapt->PTDeviceState > NdisDeviceStateD0) 
+                && (pAdapt->StandingBy == FALSE))
+        {
+            pAdapt->QueuedRequest = TRUE;
+            NdisReleaseSpinLock(&pAdapt->Lock);
+            Status = NDIS_STATUS_PENDING;
+            break;
+        }
+        //
+        // This is in the process of powering down the system, always fail the request
+        // 
+        if (pAdapt->StandingBy == TRUE)
+        {
+            NdisReleaseSpinLock(&pAdapt->Lock);
+            Status = NDIS_STATUS_FAILURE;
+            break;
+        }
+        pAdapt->OutstandingRequests = TRUE; // set while pAdapt->Lock is still held
+        
+        NdisReleaseSpinLock(&pAdapt->Lock);
+
+        //
+        // default case, most requests will be passed to the miniport below
+        //
+        NdisRequest(&Status,
+                    pAdapt->BindingHandle,
+                    &pAdapt->Request);
+
+
+        if (Status != NDIS_STATUS_PENDING)
+        {
+            PtRequestComplete(pAdapt, &pAdapt->Request, Status); // synchronous completion is routed through the completion handler
+            Status = NDIS_STATUS_PENDING;                         // and the caller is told the request pended
+        }
+
+    } while (FALSE);
+
+    return(Status);
+
+}
+
+
+VOID
+MPQueryPNPCapabilities(
+    IN OUT PADAPT            pAdapt,
+    OUT PNDIS_STATUS         pStatus
+    )
+/*++
+
+Routine Description:
+
+    Fix up the result of an OID_PNP_CAPABILITIES query held in
+    pAdapt->Request after the underlying miniport has completed it.
+
+    When the caller's buffer can hold an NDIS_PNP_CAPABILITIES structure, the
+    three wake-up fields are forced to NdisDeviceStateUnspecified, the byte
+    counts are filled in, both device states are reset to D0 and success is
+    reported. Otherwise only the required size is reported.
+
+Arguments:
+
+    pAdapt  - Adapter whose Request holds the completed query
+    pStatus - Receives NDIS_STATUS_SUCCESS, or NDIS_STATUS_RESOURCES when the
+              buffer is too small
+
+Return Value:
+
+    None.
+
+--*/
+
+{
+    PNDIS_PNP_CAPABILITIES           pCaps;
+    PNDIS_PM_WAKE_UP_CAPABILITIES    pWakeUp;
+
+    //
+    // Guard: the buffer must be able to hold the whole structure.
+    //
+    if (pAdapt->Request.DATA.QUERY_INFORMATION.InformationBufferLength < sizeof(NDIS_PNP_CAPABILITIES))
+    {
+        *pAdapt->BytesNeeded = sizeof(NDIS_PNP_CAPABILITIES);
+        *pStatus = NDIS_STATUS_RESOURCES;
+        return;
+    }
+
+    pCaps = (PNDIS_PNP_CAPABILITIES)(pAdapt->Request.DATA.QUERY_INFORMATION.InformationBuffer);
+    pWakeUp = &pCaps->WakeUpCapabilities;
+
+    //
+    // An IM driver must overwrite these wake-up capabilities.
+    //
+    pWakeUp->MinMagicPacketWakeUp = NdisDeviceStateUnspecified;
+    pWakeUp->MinPatternWakeUp = NdisDeviceStateUnspecified;
+    pWakeUp->MinLinkChangeWakeUp = NdisDeviceStateUnspecified;
+
+    *pAdapt->BytesReadOrWritten = sizeof(NDIS_PNP_CAPABILITIES);
+    *pAdapt->BytesNeeded = 0;
+
+    //
+    // Internal flags: by default both edges are considered ON.
+    //
+    pAdapt->MPDeviceState = NdisDeviceStateD0;
+    pAdapt->PTDeviceState = NdisDeviceStateD0;
+
+    *pStatus = NDIS_STATUS_SUCCESS;
+}
+
+
+NDIS_STATUS
+MPSetInformation(
+    IN NDIS_HANDLE                                  MiniportAdapterContext,
+    IN NDIS_OID                                     Oid,
+    __in_bcount(InformationBufferLength) IN PVOID   InformationBuffer,
+    IN ULONG                                        InformationBufferLength,
+    OUT PULONG                                      BytesRead,
+    OUT PULONG                                      BytesNeeded
+    )
+/*++
+
+Routine Description:
+
+    Miniport SetInfo handler.
+
+    In the case of OID_PNP_SET_POWER, record the power state and return the
+    OID. Do not pass it below.
+    If the device is suspended, do not block the SET_POWER_OID
+    as it is used to reactivate the Passthru miniport.
+
+    PM - If the MP is not ON (DeviceState > D0) return immediately (except for
+         'query power' and 'set power').
+         If MP is ON, but the PT is not at D0, then queue the request for
+         later processing.
+
+    Requests to miniports are always serialized.
+
+    The request is described in the single pAdapt->Request structure, and the
+    caller's BytesRead/BytesNeeded pointers are saved in the adapter. If
+    NdisRequest completes synchronously, the byte counts are copied back and
+    OutstandingRequests is cleared right here.
+
+Arguments:
+
+    MiniportAdapterContext    Pointer to the adapter structure
+    Oid                       Oid for this query
+    InformationBuffer         Buffer for information
+    InformationBufferLength   Size of this buffer
+    BytesRead                 Specifies how much info is read
+    BytesNeeded               In case the buffer is smaller than what we need, tell them how much is needed
+
+Return Value:
+
+    For OID_PNP_SET_POWER, the status produced by MPProcessSetPowerOid.
+    NDIS_STATUS_FAILURE if unbinding is in progress, our miniport edge is not
+    at D0, or StandingBy is set.
+    NDIS_STATUS_PENDING if the request was queued because the lower edge is
+    not at D0.
+    Otherwise the status returned by NdisRequest.
+
+--*/
+{
+    PADAPT        pAdapt = (PADAPT)MiniportAdapterContext;
+    NDIS_STATUS   Status;
+
+    Status = NDIS_STATUS_FAILURE;
+
+    do
+    {
+        //
+        // The Set Power should not be sent to the miniport below the Passthru, but is handled internally
+        //
+        if (Oid == OID_PNP_SET_POWER)
+        {
+            MPProcessSetPowerOid(&Status, 
+                                 pAdapt, 
+                                 InformationBuffer, 
+                                 InformationBufferLength, 
+                                 BytesRead, 
+                                 BytesNeeded);
+            break;
+
+        }
+
+        //
+        // If the miniport below is unbinding, fail the request
+        //
+        NdisAcquireSpinLock(&pAdapt->Lock);     
+        if (pAdapt->UnbindingInProcess == TRUE)
+        {
+            NdisReleaseSpinLock(&pAdapt->Lock);
+            Status = NDIS_STATUS_FAILURE;
+            break;
+        }
+        NdisReleaseSpinLock(&pAdapt->Lock);
+        //
+        // All other Set Information requests are failed, if the miniport is
+        // not at D0 or is transitioning to a device state greater than D0.
+        //
+        if (pAdapt->MPDeviceState > NdisDeviceStateD0)
+        {
+            Status = NDIS_STATUS_FAILURE;
+            break;
+        }
+
+        // Set up the Request and return the result
+        pAdapt->Request.RequestType = NdisRequestSetInformation;
+        pAdapt->Request.DATA.SET_INFORMATION.Oid = Oid;
+        pAdapt->Request.DATA.SET_INFORMATION.InformationBuffer = InformationBuffer;
+        pAdapt->Request.DATA.SET_INFORMATION.InformationBufferLength = InformationBufferLength;
+        pAdapt->BytesNeeded = BytesNeeded;      // caller's out-pointers are saved in the adapter
+        pAdapt->BytesReadOrWritten = BytesRead;
+
+        //
+        // If the miniport below is unbinding, fail the request
+        // (checked again, this time together with the power-state tests below)
+        //
+        NdisAcquireSpinLock(&pAdapt->Lock);     
+        if (pAdapt->UnbindingInProcess == TRUE)
+        {
+            NdisReleaseSpinLock(&pAdapt->Lock);
+            Status = NDIS_STATUS_FAILURE;
+            break;
+        }
+            
+        //
+        // If the device below is at a low power state, we cannot send it the
+        // request now, and must pend it.
+        //
+        if ((pAdapt->PTDeviceState > NdisDeviceStateD0) 
+                && (pAdapt->StandingBy == FALSE))
+        {
+            pAdapt->QueuedRequest = TRUE;
+            NdisReleaseSpinLock(&pAdapt->Lock);
+            Status = NDIS_STATUS_PENDING;
+            break;
+        }
+        //
+        // This is in the process of powering down the system, always fail the request
+        // 
+        if (pAdapt->StandingBy == TRUE)
+        {
+            NdisReleaseSpinLock(&pAdapt->Lock);
+            Status = NDIS_STATUS_FAILURE;
+            break;
+        }
+        pAdapt->OutstandingRequests = TRUE; // set while pAdapt->Lock is still held
+        
+        NdisReleaseSpinLock(&pAdapt->Lock);
+        //
+        // Forward the request to the device below.
+        //
+        NdisRequest(&Status,
+                    pAdapt->BindingHandle,
+                    &pAdapt->Request);
+
+        if (Status != NDIS_STATUS_PENDING)
+        {
+            *BytesRead = pAdapt->Request.DATA.SET_INFORMATION.BytesRead;     // completed synchronously: report counts now
+            *BytesNeeded = pAdapt->Request.DATA.SET_INFORMATION.BytesNeeded;
+            pAdapt->OutstandingRequests = FALSE;                             // note: cleared without taking pAdapt->Lock
+        }
+
+    } while (FALSE);
+
+    return(Status);
+}
+
+
+VOID
+MPProcessSetPowerOid(
+    IN OUT PNDIS_STATUS                             pNdisStatus,
+    IN PADAPT                                       pAdapt,
+    __in_bcount(InformationBufferLength) IN PVOID   InformationBuffer,
+    IN ULONG                                        InformationBufferLength,
+    OUT PULONG                                      BytesRead,
+    OUT PULONG                                      BytesNeeded
+    )
+/*++
+
+Routine Description:
+    Handles a request carrying the Set Power OID. The miniport accepts the
+    new power state and records it; the request is never passed to the
+    miniport below.
+
+    If the IM miniport is going into a low power state, there is no guarantee
+    that it will ever be asked to go back to D0 before getting halted. No
+    requests should be pended or queued.
+
+Arguments:
+    pNdisStatus             - Receives the status of the operation
+    pAdapt                  - The Adapter structure
+    InformationBuffer       - Holds the new device power state
+    InformationBufferLength - Size of InformationBuffer in bytes
+    BytesRead               - Receives the number of bytes consumed
+    BytesNeeded             - Receives the number of bytes needed
+
+Return Value:
+    None. *pNdisStatus is NDIS_STATUS_SUCCESS when the new state was recorded,
+    NDIS_STATUS_INVALID_LENGTH when the buffer is too short, and
+    NDIS_STATUS_FAILURE when a non-D0 state is requested while the miniport
+    is already in a non-D0 state.
+
+--*/
+{
+    NDIS_DEVICE_POWER_STATE NewDeviceState;
+
+    DBGPRINT(("==>MPProcessSetPowerOid: Adapt %p\n", pAdapt));
+
+    ASSERT (InformationBuffer != NULL);
+
+    *pNdisStatus = NDIS_STATUS_FAILURE;
+
+    if (InformationBufferLength < sizeof(NDIS_DEVICE_POWER_STATE))
+    {
+        //
+        // Buffer cannot hold a power state.
+        //
+        *pNdisStatus = NDIS_STATUS_INVALID_LENGTH;
+    }
+    else
+    {
+        NewDeviceState = (*(PNDIS_DEVICE_POWER_STATE)InformationBuffer);
+
+        if ((pAdapt->MPDeviceState > NdisDeviceStateD0) && (NewDeviceState != NdisDeviceStateD0))
+        {
+            //
+            // A miniport in a non-D0 state may only be sent a Set Power to D0.
+            // The assertion below is false whenever this branch is entered.
+            //
+            ASSERT (!(pAdapt->MPDeviceState > NdisDeviceStateD0) && (NewDeviceState != NdisDeviceStateD0));
+
+            *pNdisStatus = NDIS_STATUS_FAILURE;
+        }
+        else
+        {
+            if (pAdapt->MPDeviceState == NdisDeviceStateD0 && NewDeviceState > NdisDeviceStateD0)
+            {
+                //
+                // D0 -> low power: raise StandingBy (block all incoming requests).
+                //
+                pAdapt->StandingBy = TRUE;
+            }
+            else if (pAdapt->MPDeviceState > NdisDeviceStateD0 && NewDeviceState == NdisDeviceStateD0)
+            {
+                //
+                // Low power -> D0: drop StandingBy. Incoming requests will be
+                // pended until the physical miniport turns ON.
+                //
+                pAdapt->StandingBy = FALSE;
+            }
+
+            //
+            // Record the new state.
+            //
+            pAdapt->MPDeviceState = NewDeviceState;
+
+            *pNdisStatus = NDIS_STATUS_SUCCESS;
+        }
+    }
+
+    if (*pNdisStatus != NDIS_STATUS_SUCCESS)
+    {
+        *BytesRead = 0;
+        *BytesNeeded = sizeof (NDIS_DEVICE_POWER_STATE);
+    }
+    else
+    {
+        if (pAdapt->StandingBy == FALSE)
+        {
+            //
+            // Resuming from low power: indicate the media connect state if it
+            // differs from what was last indicated and we still have a
+            // miniport handle.
+            //
+            if ((pAdapt->LastIndicatedStatus != pAdapt->LatestUnIndicateStatus) &&
+                (pAdapt->MiniportHandle != NULL))
+            {
+                NdisMIndicateStatus(pAdapt->MiniportHandle,
+                                    pAdapt->LatestUnIndicateStatus,
+                                    (PVOID)NULL,
+                                    0);
+                NdisMIndicateStatusComplete(pAdapt->MiniportHandle);
+                pAdapt->LastIndicatedStatus = pAdapt->LatestUnIndicateStatus;
+            }
+        }
+        else
+        {
+            //
+            // Going to standby: initialize LatestUnIndicateStatus.
+            //
+            pAdapt->LatestUnIndicateStatus = pAdapt->LastIndicatedStatus;
+        }
+
+        *BytesRead = sizeof(NDIS_DEVICE_POWER_STATE);
+        *BytesNeeded = 0;
+    }
+
+    DBGPRINT(("<==MPProcessSetPowerOid: Adapt %p\n", pAdapt));
+}
+
+
+VOID
+MPReturnPacket(
+    IN NDIS_HANDLE             MiniportAdapterContext,
+    IN PNDIS_PACKET            Packet
+    )
+/*++
+
+Routine Description:
+
+    NDIS Miniport entry point invoked when a protocol above has finished
+    with a packet that we indicated up and that it had held on to.
+
+Arguments:
+
+    MiniportAdapterContext    - pointer to ADAPT structure
+    Packet                    - packet being returned
+
+Return Value:
+
+    None.
+
+--*/
+{
+    PADAPT            pAdapt = (PADAPT)MiniportAdapterContext;
+
+#ifdef NDIS51
+    //
+    // Packet stacking: a packet that is not from our receive pool is the
+    // original packet, reused in the receive indication. Hand it straight
+    // back to the miniport below.
+    //
+    if (NdisGetPoolFromPacket(Packet) != pAdapt->RecvPacketPoolHandle)
+    {
+        NdisReturnPackets(&Packet, 1);
+        return;
+    }
+#endif // NDIS51
+
+    {
+        //
+        // The packet came from this IM's receive packet pool: recover the
+        // original from our reserved area, free our packet, and return the
+        // original to the driver below.
+        //
+        PRECV_RSVD      pReserved = (PRECV_RSVD)(Packet->MiniportReserved);
+        PNDIS_PACKET    pLowerPacket = pReserved->OriginalPkt;
+
+        NdisFreePacket(Packet);
+        NdisReturnPackets(&pLowerPacket, 1);
+    }
+}
+
+
+NDIS_STATUS
+MPTransferData(
+    OUT PNDIS_PACKET            Packet,
+    OUT PUINT                   BytesTransferred,
+    IN NDIS_HANDLE              MiniportAdapterContext,
+    IN NDIS_HANDLE              MiniportReceiveContext,
+    IN UINT                     ByteOffset,
+    IN UINT                     BytesToTransfer
+    )
+/*++
+
+Routine Description:
+
+    Miniport's transfer data handler. The request is passed to the binding
+    below through NdisTransferData unless the device is OFF.
+
+Arguments:
+
+    Packet                    Destination packet
+    BytesTransferred          Receives how much data was copied
+    MiniportAdapterContext    Pointer to the adapter structure
+    MiniportReceiveContext    Context handed on to NdisTransferData
+    ByteOffset                Offset into the packet for copying data
+    BytesToTransfer           How much to copy
+
+Return Value:
+
+    NDIS_STATUS_FAILURE if the device is OFF, otherwise the status reported
+    by NdisTransferData.
+
+--*/
+{
+    NDIS_STATUS   TransferStatus = NDIS_STATUS_FAILURE;
+    PADAPT        pAdapt = (PADAPT)MiniportAdapterContext;
+
+    //
+    // Nothing can be transferred while the device is OFF.
+    //
+    if (IsIMDeviceStateOn(pAdapt) == FALSE)
+    {
+        return TransferStatus;
+    }
+
+    NdisTransferData(&TransferStatus,
+                     pAdapt->BindingHandle,
+                     MiniportReceiveContext,
+                     ByteOffset,
+                     BytesToTransfer,
+                     Packet,
+                     BytesTransferred);
+
+    return TransferStatus;
+}
+
+VOID
+MPHalt(
+    IN NDIS_HANDLE                MiniportAdapterContext
+    )
+/*++
+
+Routine Description:
+
+    Halt handler. All the hard-work for clean-up is done here.
+
+    In order, this routine:
+      1. Clears MiniportHandle and sets MiniportIsHalted.
+      2. Unlinks the adapter from pAdaptList while holding GlobalLock.
+      3. Calls PtDeregisterDevice for the ioctl interface.
+      4. If BindingHandle is set, closes the binding below with
+         NdisCloseAdapter (waiting on pAdapt->Event if the close pends),
+         clears BindingHandle and calls PtDereferenceAdapt.
+      5. Calls PtDereferenceAdapt once more; if that returns nonzero the
+         local pAdapt pointer is cleared before the exit trace.
+
+Arguments:
+
+    MiniportAdapterContext    Pointer to the Adapter
+
+Return Value:
+
+    None.
+
+--*/
+{
+    PADAPT             pAdapt = (PADAPT)MiniportAdapterContext;
+    NDIS_STATUS        Status;
+    PADAPT            *ppCursor;
+
+    DBGPRINT(("==>MiniportHalt: Adapt %p\n", pAdapt));
+
+    pAdapt->MiniportHandle = NULL;
+    pAdapt->MiniportIsHalted = TRUE;
+
+    //
+    // Remove this adapter from the global list
+    //
+    NdisAcquireSpinLock(&GlobalLock);
+
+    for (ppCursor = &pAdaptList; *ppCursor != NULL; ppCursor = &(*ppCursor)->Next) // ppCursor points at the link to patch
+    {
+        if (*ppCursor == pAdapt)
+        {
+            *ppCursor = pAdapt->Next;
+            break;
+        }
+    }
+
+    NdisReleaseSpinLock(&GlobalLock);
+
+    //
+    // Delete the ioctl interface that was created when the miniport
+    // was created. The return value is deliberately ignored.
+    //
+    (VOID)PtDeregisterDevice();
+
+    //
+    // If we have a valid bind, close the miniport below the protocol
+    //
+#pragma prefast(suppress: __WARNING_DEREF_NULL_PTR, "pAdapt cannot be NULL")
+    if (pAdapt->BindingHandle != NULL)
+    {
+        //
+        // Close the binding below. and wait for it to complete
+        //
+        NdisResetEvent(&pAdapt->Event);
+
+        NdisCloseAdapter(&Status, pAdapt->BindingHandle);
+
+        if (Status == NDIS_STATUS_PENDING)
+        {
+            NdisWaitEvent(&pAdapt->Event, 0); // presumably signalled by the close-complete handler, which also fills pAdapt->Status - confirm
+            Status = pAdapt->Status;
+        }
+
+        ASSERT (Status == NDIS_STATUS_SUCCESS);
+
+        pAdapt->BindingHandle = NULL;
+        
+        PtDereferenceAdapt(pAdapt); // NOTE(review): presumably drops the reference held for the binding - confirm
+    }
+
+    if (PtDereferenceAdapt(pAdapt)) // nonzero result: stop using the pointer (presumably the adapter was freed - confirm)
+    {
+        pAdapt = NULL;
+    }
+        
+    
+    DBGPRINT(("<== MiniportHalt: pAdapt %p\n", pAdapt));
+}
+
+
+#ifdef NDIS51_MINIPORT
+
+VOID
+MPCancelSendPackets(
+    IN NDIS_HANDLE            MiniportAdapterContext,
+    IN PVOID                  CancelId
+    )
+/*++
+
+Routine Description:
+
+    Miniport entry point for cancelling every send packet that carries the
+    given CancelId.
+
+    A driver that queues packets itself would dequeue the matching ones here
+    and complete each with NdisMSendComplete and a status of
+    NDIS_STATUS_REQUEST_ABORTED. This driver keeps no such queue, so it only
+    forwards the cancellation to the lower binding, letting the miniports
+    below cancel any matching packets.
+
+Arguments:
+
+    MiniportAdapterContext    - pointer to ADAPT structure
+    CancelId                  - ID of packets to be cancelled
+
+Return Value:
+
+    None
+
+--*/
+{
+    //
+    // If packets were queued on the adapter structure, this is where its
+    // spinlock would be taken, the packets matching CancelId unlinked, the
+    // lock released, and NdisMSendComplete called with
+    // NDIS_STATUS_REQUEST_ABORTED for each unlinked packet.
+    //
+
+    //
+    // Let the miniport(s) below cancel whatever they may have queued.
+    //
+    NdisCancelSendPackets(((PADAPT)MiniportAdapterContext)->BindingHandle, CancelId);
+}
+
+VOID
+MPDevicePnPEvent(
+    IN NDIS_HANDLE              MiniportAdapterContext,
+    IN NDIS_DEVICE_PNP_EVENT    DevicePnPEvent,
+    IN PVOID                    InformationBuffer,
+    IN ULONG                    InformationBufferLength
+    )
+/*++
+
+Routine Description:
+
+    Notification of a PnP event directed to our miniport device object.
+    No processing is done; every argument is ignored.
+
+Arguments:
+
+    MiniportAdapterContext    - pointer to ADAPT structure
+    DevicePnPEvent            - the event
+    InformationBuffer         - points to additional event-specific information
+    InformationBufferLength   - length of InformationBuffer
+
+Return Value:
+
+    None
+--*/
+{
+    // TBD - add code/comments about processing this.
+
+    UNREFERENCED_PARAMETER(InformationBufferLength);
+    UNREFERENCED_PARAMETER(InformationBuffer);
+    UNREFERENCED_PARAMETER(DevicePnPEvent);
+    UNREFERENCED_PARAMETER(MiniportAdapterContext);
+}
+
+VOID
+MPAdapterShutdown(
+    IN NDIS_HANDLE                MiniportAdapterContext
+    )
+/*++
+
+Routine Description:
+
+    Notification of an impending system shutdown. Nothing is done here.
+
+Arguments:
+
+    MiniportAdapterContext    - pointer to ADAPT structure (unused)
+
+Return Value:
+
+    None
+--*/
+{
+    UNREFERENCED_PARAMETER(MiniportAdapterContext);
+}
+
+#endif
+
+
+VOID
+MPFreeAllPacketPools(
+    IN PADAPT                    pAdapt
+    )
+/*++
+
+Routine Description:
+
+    Release the receive and send packet pools of the given adapter, if
+    present, and clear the corresponding handles.
+
+Arguments:
+
+    pAdapt    - pointer to ADAPT structure
+
+Return Value:
+
+    None
+
+--*/
+{
+    NDIS_HANDLE    PoolHandle;
+
+    //
+    // Pool used to indicate receives.
+    //
+    PoolHandle = pAdapt->RecvPacketPoolHandle;
+    if (NULL != PoolHandle)
+    {
+        NdisFreePacketPool(PoolHandle);
+        pAdapt->RecvPacketPoolHandle = NULL;
+    }
+
+    //
+    // Pool used to send packets below.
+    //
+    PoolHandle = pAdapt->SendPacketPoolHandle;
+    if (NULL != PoolHandle)
+    {
+        NdisFreePacketPool(PoolHandle);
+        pAdapt->SendPacketPoolHandle = NULL;
+    }
+}
+
diff --git a/modified_passthru/passthru.c b/modified_passthru/passthru.c
new file mode 100644 (file)
index 0000000..c366173
--- /dev/null
@@ -0,0 +1,469 @@
+/*++
+
+Copyright (c) 1992-2000  Microsoft Corporation
+Module Name:
+    passthru.c
+
+Abstract:
+
+    Ndis Intermediate Miniport driver sample. This is a passthru driver.
+
+Author:
+
+Environment:
+
+
+Revision History:
+
+
+--*/
+
+
+#include "precomp.h"
+#pragma hdrstop
+
+#pragma NDIS_INIT_FUNCTION(DriverEntry)
+
+//
+// Handles obtained in DriverEntry:
+//   ProtHandle   - filled in by NdisRegisterProtocol (protocol edge)
+//   DriverHandle - filled in by NdisIMRegisterLayeredMiniport (miniport edge)
+//
+NDIS_HANDLE         ProtHandle = NULL;
+NDIS_HANDLE         DriverHandle = NULL;
+
+//
+// The media types known to this driver.
+//
+NDIS_MEDIUM         MediumArray[4] =
+                    {
+                        NdisMedium802_3,    // Ethernet
+                        NdisMedium802_5,    // Token-ring
+                        NdisMediumFddi,     // Fddi
+                        NdisMediumWan       // NDISWAN
+                    };
+
+//
+// Driver-wide spin lock. It is initialised in DriverEntry, released in
+// PtUnload, and held by PtRegisterDevice/PtDeregisterDevice while they
+// update MiniportCount and ControlDeviceState.
+//
+NDIS_SPIN_LOCK     GlobalLock;
+
+//
+// pAdaptList    - list of ADAPT structures (presumably linked through
+//                 ADAPT.Next and maintained by the bind/unbind code -
+//                 not visible in this file, confirm).
+// MiniportCount - number of miniport instances that have called
+//                 PtRegisterDevice and not yet PtDeregisterDevice.
+//
+PADAPT             pAdaptList = NULL;
+LONG               MiniportCount = 0;
+
+//
+// Wrapper handle returned by NdisMInitializeWrapper in DriverEntry; it is
+// passed to the miniport, unload-handler and device registration calls.
+//
+NDIS_HANDLE        NdisWrapperHandle;
+
+//
+// To support ioctls from user-mode:
+//
+// The macros below build the wide-character device and symbolic link names
+// at compile time from MODULENAME (not defined in this file - presumably
+// supplied by the build; confirm).
+//
+//   STR(x)   - expands x first, then turns the result into a string literal
+//              (the extra STR2 level is what forces the expansion).
+//   WIDEN(x) - expands x first, then prefixes the resulting string literal
+//              with L to make it a wide string (again via a second level).
+//
+// Adjacent wide literals are concatenated by the compiler, giving e.g.
+// L"\\DosDevices\\" L"<MODULENAME>".
+//
+
+#define STR2(x) #x
+#define STR(x) STR2(x)
+#define DOSPREFIX "\\DosDevices\\"
+#define NTPREFIX "\\Device\\"
+#define WIDEN2(x) L ## x
+#define WIDEN(x) WIDEN2(x)
+#define LINKNAME_STRING                        WIDEN(DOSPREFIX) WIDEN(STR(MODULENAME))
+#define NTDEVICE_STRING                        WIDEN(NTPREFIX) WIDEN(STR(MODULENAME))
+#define PROTOCOLNAME_STRING            WIDEN(STR(MODULENAME))
+
+//
+// Outputs of NdisMRegisterDevice in PtRegisterDevice. NdisDeviceHandle is
+// handed back to NdisMDeregisterDevice in PtDeregisterDevice.
+//
+NDIS_HANDLE     NdisDeviceHandle = NULL;
+PDEVICE_OBJECT  ControlDeviceObject = NULL;
+
+//
+// State of the single control device. It is changed only with GlobalLock
+// held and lets PtRegisterDevice wait for a deletion that is in progress.
+//
+enum _DEVICE_STATE
+{
+    PS_DEVICE_STATE_READY = 0,    // ready for create/delete
+    PS_DEVICE_STATE_CREATING,    // create operation in progress
+    PS_DEVICE_STATE_DELETING    // delete operation in progress
+} ControlDeviceState = PS_DEVICE_STATE_READY;
+
+
+
+NTSTATUS
+DriverEntry(
+    IN PDRIVER_OBJECT        DriverObject,
+    IN PUNICODE_STRING       RegistryPath
+    )
+/*++
+
+Routine Description:
+
+    First entry point to be called, when this driver is loaded.
+    Register with NDIS as an intermediate driver.
+
+Arguments:
+
+    DriverObject - pointer to the system's driver object structure
+        for this driver
+    
+    RegistryPath - system's registry path for this driver
+    
+Return Value:
+
+    STATUS_SUCCESS if all initialization is successful, STATUS_XXX
+    error code if not.
+
+--*/
+{
+    NDIS_STATUS                        Status;
+    NDIS_PROTOCOL_CHARACTERISTICS      PChars;
+    NDIS_MINIPORT_CHARACTERISTICS      MChars;
+    NDIS_STRING                        Name;
+
+    Status = NDIS_STATUS_SUCCESS;
+    NdisAllocateSpinLock(&GlobalLock);
+
+    NdisMInitializeWrapper(&NdisWrapperHandle, DriverObject, RegistryPath, NULL);
+
+    do
+    {
+        //
+        // Register the miniport with NDIS. Note that it is the miniport
+        // which was started as a driver and not the protocol. Also the miniport
+        // must be registered prior to the protocol since the protocol's BindAdapter
+        // handler can be initiated anytime and when it is, it must be ready to
+        // start driver instances.
+        //
+
+        NdisZeroMemory(&MChars, sizeof(NDIS_MINIPORT_CHARACTERISTICS));
+
+        MChars.MajorNdisVersion = PASSTHRU_MAJOR_NDIS_VERSION;
+        MChars.MinorNdisVersion = PASSTHRU_MINOR_NDIS_VERSION;
+
+        MChars.InitializeHandler = MPInitialize;
+        MChars.QueryInformationHandler = MPQueryInformation;
+        MChars.SetInformationHandler = MPSetInformation;
+        MChars.ResetHandler = NULL;
+        MChars.TransferDataHandler = MPTransferData;
+        MChars.HaltHandler = MPHalt;
+#ifdef NDIS51_MINIPORT
+        MChars.CancelSendPacketsHandler = MPCancelSendPackets;
+        MChars.PnPEventNotifyHandler = MPDevicePnPEvent;
+        MChars.AdapterShutdownHandler = MPAdapterShutdown;
+#endif // NDIS51_MINIPORT
+
+        //
+        // We will disable the check for hang timeout so we do not
+        // need a check for hang handler!
+        //
+        MChars.CheckForHangHandler = NULL;
+        MChars.ReturnPacketHandler = MPReturnPacket;
+
+        //
+        // Either the Send or the SendPackets handler should be specified.
+        // If SendPackets handler is specified, SendHandler is ignored
+        //
+        MChars.SendHandler = MPSend;    // IPFW: use MPSend, not SendPackets
+        MChars.SendPacketsHandler = NULL;
+
+        Status = NdisIMRegisterLayeredMiniport(NdisWrapperHandle,
+                                                  &MChars,
+                                                  sizeof(MChars),
+                                                  &DriverHandle);
+        if (Status != NDIS_STATUS_SUCCESS)
+        {
+            break;
+        }
+
+#ifndef WIN9X
+        NdisMRegisterUnloadHandler(NdisWrapperHandle, PtUnload);
+#endif
+
+        //
+        // Now register the protocol.
+        //
+        NdisZeroMemory(&PChars, sizeof(NDIS_PROTOCOL_CHARACTERISTICS));
+        PChars.MajorNdisVersion = PASSTHRU_PROT_MAJOR_NDIS_VERSION;
+        PChars.MinorNdisVersion = PASSTHRU_PROT_MINOR_NDIS_VERSION;
+
+        //
+        // Make sure the protocol-name matches the service-name
+        // (from the INF) under which this protocol is installed.
+        // This is needed to ensure that NDIS can correctly determine
+        // the binding and call us to bind to miniports below.
+        //
+        NdisInitUnicodeString(&Name, PROTOCOLNAME_STRING);    // Protocol name
+        PChars.Name = Name;
+        PChars.OpenAdapterCompleteHandler = PtOpenAdapterComplete;
+        PChars.CloseAdapterCompleteHandler = PtCloseAdapterComplete;
+        PChars.SendCompleteHandler = PtSendComplete;
+        PChars.TransferDataCompleteHandler = PtTransferDataComplete;
+    
+        PChars.ResetCompleteHandler = PtResetComplete;
+        PChars.RequestCompleteHandler = PtRequestComplete;
+        PChars.ReceiveHandler = PtReceive;
+        PChars.ReceiveCompleteHandler = PtReceiveComplete;
+        PChars.StatusHandler = PtStatus;
+        PChars.StatusCompleteHandler = PtStatusComplete;
+        PChars.BindAdapterHandler = PtBindAdapter;
+        PChars.UnbindAdapterHandler = PtUnbindAdapter;
+        PChars.UnloadHandler = PtUnloadProtocol;
+
+        PChars.ReceivePacketHandler = PtReceivePacket;
+        PChars.PnPEventHandler= PtPNPHandler;
+
+        NdisRegisterProtocol(&Status,
+                             &ProtHandle,
+                             &PChars,
+                             sizeof(NDIS_PROTOCOL_CHARACTERISTICS));
+
+        if (Status != NDIS_STATUS_SUCCESS)
+        {
+            NdisIMDeregisterLayeredMiniport(DriverHandle);
+            break;
+        }
+
+        NdisIMAssociateMiniport(DriverHandle, ProtHandle);
+    }
+    while (FALSE);
+
+    if (Status != NDIS_STATUS_SUCCESS)
+    {
+        NdisTerminateWrapper(NdisWrapperHandle, NULL);
+    }
+       
+    ipfw_module_init();        // IPFW - start the system
+
+    return(Status);
+}
+
+
+NDIS_STATUS
+PtRegisterDevice(
+    VOID
+    )
+/*++
+
+Routine Description:
+
+    Register an ioctl interface - a device object to be used for this
+    purpose is created by NDIS when we call NdisMRegisterDevice.
+
+    This routine is called whenever a new miniport instance is
+    initialized. However, we only create one global device object,
+    when the first miniport instance is initialized. This routine
+    handles potential race conditions with PtDeregisterDevice via
+    the ControlDeviceState and MiniportCount variables.
+
+    NOTE: do not call this from DriverEntry; it will prevent the driver
+    from being unloaded (e.g. on uninstall).
+
+    NOTE: MiniportCount is incremented on every call and is not rolled
+    back here if NdisMRegisterDevice fails.
+
+Arguments:
+
+    None
+
+Return Value:
+
+    NDIS_STATUS_SUCCESS if we successfully register a device object, or
+    if this is not the first miniport instance (nothing is registered in
+    that case). Otherwise the status returned by NdisMRegisterDevice.
+
+--*/
+{
+    NDIS_STATUS            Status = NDIS_STATUS_SUCCESS;
+    UNICODE_STRING         DeviceName;
+    UNICODE_STRING         DeviceLinkUnicodeString;
+    PDRIVER_DISPATCH       DispatchTable[IRP_MJ_MAXIMUM_FUNCTION+1];
+
+    DBGPRINT(("==>PtRegisterDevice\n"));
+
+    NdisAcquireSpinLock(&GlobalLock);
+
+    ++MiniportCount;
+    
+    //
+    // Only the first miniport instance creates the control device.
+    //
+    if (1 == MiniportCount)
+    {
+        ASSERT(ControlDeviceState != PS_DEVICE_STATE_CREATING);
+
+        //
+        // Another thread could be running PtDeregisterDevice on
+        // behalf of another miniport instance. If so, wait for
+        // it to exit.
+        //
+        while (ControlDeviceState != PS_DEVICE_STATE_READY)
+        {
+            // Drop the lock while sleeping so the other thread can finish.
+            NdisReleaseSpinLock(&GlobalLock);
+            NdisMSleep(1);
+            NdisAcquireSpinLock(&GlobalLock);
+        }
+
+        ControlDeviceState = PS_DEVICE_STATE_CREATING;
+
+        //
+        // The registration below is done without the spin lock held; the
+        // CREATING state marks the operation as in progress meanwhile.
+        //
+        NdisReleaseSpinLock(&GlobalLock);
+
+    
+        NdisZeroMemory(DispatchTable, (IRP_MJ_MAXIMUM_FUNCTION+1) * sizeof(PDRIVER_DISPATCH));
+
+        DispatchTable[IRP_MJ_CREATE] = PtDispatch;
+        DispatchTable[IRP_MJ_CLEANUP] = PtDispatch;
+        DispatchTable[IRP_MJ_CLOSE] = PtDispatch;
+       // IPFW: device-control requests go to DevIoControl, not PtDispatch
+        DispatchTable[IRP_MJ_DEVICE_CONTROL] = DevIoControl;
+        
+
+        NdisInitUnicodeString(&DeviceName, NTDEVICE_STRING);
+        NdisInitUnicodeString(&DeviceLinkUnicodeString, LINKNAME_STRING);
+
+        //
+        // Create a device object and register our dispatch handlers
+        //
+        
+        Status = NdisMRegisterDevice(
+                    NdisWrapperHandle, 
+                    &DeviceName,
+                    &DeviceLinkUnicodeString,
+                    &DispatchTable[0],
+                    &ControlDeviceObject,
+                    &NdisDeviceHandle
+                    );
+
+        //
+        // Re-take the lock to publish the READY state; it is released by
+        // the common NdisReleaseSpinLock call below.
+        //
+        NdisAcquireSpinLock(&GlobalLock);
+
+        ControlDeviceState = PS_DEVICE_STATE_READY;
+    }
+
+    NdisReleaseSpinLock(&GlobalLock);
+
+    DBGPRINT(("<==PtRegisterDevice: %x\n", Status));
+
+    return (Status);
+}
+
+
+NTSTATUS
+PtDispatch(
+    IN PDEVICE_OBJECT    DeviceObject,
+    IN PIRP              Irp
+    )
+/*++
+
+Routine Description:
+
+    Dispatch routine for IRPs sent to the control device. No major
+    function needs any work here, so every IRP is completed right away
+    with STATUS_SUCCESS.
+
+    PtRegisterDevice installs this routine for IRP_MJ_CREATE,
+    IRP_MJ_CLEANUP and IRP_MJ_CLOSE, and installs DevIoControl for
+    IRP_MJ_DEVICE_CONTROL, so the DEVICE_CONTROL label below is only a
+    placeholder.
+
+Arguments:
+
+    DeviceObject - pointer to a device object (unused)
+    Irp          - pointer to an I/O Request Packet
+
+Return Value:
+
+    NTSTATUS - STATUS_SUCCESS always - change this when adding
+    real code to handle ioctls.
+
+--*/
+{
+    NTSTATUS            ntStatus;
+    PIO_STACK_LOCATION  pStackLocation;
+
+    UNREFERENCED_PARAMETER(DeviceObject);
+
+    DBGPRINT(("==>Pt Dispatch\n"));
+
+    ntStatus = STATUS_SUCCESS;
+    pStackLocation = IoGetCurrentIrpStackLocation(Irp);
+
+    switch (pStackLocation->MajorFunction)
+    {
+        case IRP_MJ_CREATE:
+        case IRP_MJ_CLEANUP:
+        case IRP_MJ_CLOSE:
+        case IRP_MJ_DEVICE_CONTROL:
+            //
+            // Add code here to handle requests (e.g. ioctl commands)
+            // sent to passthru. Nothing to do for now.
+            //
+        default:
+            break;
+    }
+
+    //
+    // Complete the request; the IRP must not be touched afterwards.
+    //
+    Irp->IoStatus.Status = ntStatus;
+    IoCompleteRequest(Irp, IO_NO_INCREMENT);
+
+    DBGPRINT(("<== Pt Dispatch\n"));
+
+    return ntStatus;
+}
+
+
+NDIS_STATUS
+PtDeregisterDevice(
+    VOID
+    )
+/*++
+
+Routine Description:
+
+    Deregister the ioctl interface. This is called whenever a miniport
+    instance is halted. When the last miniport instance is halted, we
+    request NDIS to delete the device object
+
+Arguments:
+
+    None. The handle to deregister is taken from the global
+    NdisDeviceHandle (set by NdisMRegisterDevice in PtRegisterDevice).
+
+Return Value:
+
+    NDIS_STATUS_SUCCESS if everything worked ok (including the case
+    where other miniport instances remain and nothing is deregistered);
+    otherwise the status returned by NdisMDeregisterDevice.
+
+--*/
+{
+    NDIS_STATUS Status = NDIS_STATUS_SUCCESS;
+
+    DBGPRINT(("==>PassthruDeregisterDevice\n"));
+
+    NdisAcquireSpinLock(&GlobalLock);
+
+    ASSERT(MiniportCount > 0);
+
+    --MiniportCount;
+    
+    if (0 == MiniportCount)
+    {
+        //
+        // All miniport instances have been halted. Deregister
+        // the control device.
+        //
+
+        ASSERT(ControlDeviceState == PS_DEVICE_STATE_READY);
+
+        //
+        // Block PtRegisterDevice() while we release the control
+        // device lock and deregister the device.
+        // 
+        ControlDeviceState = PS_DEVICE_STATE_DELETING;
+
+        NdisReleaseSpinLock(&GlobalLock);
+
+        //
+        // The handle is NULL if no device was registered (it is initialised
+        // to NULL and reset to NULL after each deregistration).
+        // Note that ControlDeviceObject is not reset here.
+        //
+        if (NdisDeviceHandle != NULL)
+        {
+            Status = NdisMDeregisterDevice(NdisDeviceHandle);
+            NdisDeviceHandle = NULL;
+        }
+
+        //
+        // Re-take the lock to publish the READY state; it is released by
+        // the common NdisReleaseSpinLock call below.
+        //
+        NdisAcquireSpinLock(&GlobalLock);
+        ControlDeviceState = PS_DEVICE_STATE_READY;
+    }
+
+    NdisReleaseSpinLock(&GlobalLock);
+
+    DBGPRINT(("<== PassthruDeregisterDevice: %x\n", Status));
+    return Status;
+    
+}
+
+VOID
+PtUnload(
+    IN PDRIVER_OBJECT        DriverObject
+    )
+//
+// PassThru driver unload function. It is registered from DriverEntry via
+// NdisMRegisterUnloadHandler (unless WIN9X is defined) and undoes what
+// DriverEntry set up: the protocol edge, the miniport edge, the global
+// spin lock and finally the IPFW engine.
+//
+// DriverObject is not used.
+//
+{
+    UNREFERENCED_PARAMETER(DriverObject);
+    
+    DBGPRINT(("PtUnload: entered\n"));   
+    
+    // Tear down the protocol edge first ...
+    PtUnloadProtocol();
+    
+    // ... then the miniport edge registered in DriverEntry.
+    NdisIMDeregisterLayeredMiniport(DriverHandle);
+    
+    // Counterpart of NdisAllocateSpinLock(&GlobalLock) in DriverEntry.
+    NdisFreeSpinLock(&GlobalLock);
+       
+    // Counterpart of ipfw_module_init() in DriverEntry.
+    ipfw_module_exit(); // IPFW unloading dummynet
+
+    DBGPRINT(("PtUnload: done!\n"));
+}
diff --git a/modified_passthru/passthru.h b/modified_passthru/passthru.h
new file mode 100644 (file)
index 0000000..6e79db7
--- /dev/null
@@ -0,0 +1,500 @@
+/*++\r
+\r
+Copyright (c) 1992-2000  Microsoft Corporation\r
+\r
+Module Name:\r
+\r
+    passthru.h\r
+\r
+Abstract:\r
+\r
+    Ndis Intermediate Miniport driver sample. This is a passthru driver.\r
+\r
+Author:\r
+\r
+Environment:\r
+\r
+\r
+Revision History:\r
+\r
\r
+--*/\r
+\r
+//\r
+// NDIS version advertised by the miniport edge: 5.1 when NDIS51_MINIPORT\r
+// is defined, 4.0 otherwise. DriverEntry copies these values into the\r
+// miniport characteristics (MChars.MajorNdisVersion/MinorNdisVersion).\r
+//\r
+#ifdef NDIS51_MINIPORT\r
+#define PASSTHRU_MAJOR_NDIS_VERSION            5\r
+#define PASSTHRU_MINOR_NDIS_VERSION            1\r
+#else\r
+#define PASSTHRU_MAJOR_NDIS_VERSION            4\r
+#define PASSTHRU_MINOR_NDIS_VERSION            0\r
+#endif\r
+\r
+//\r
+// NDIS version advertised by the protocol edge: 5.0 when NDIS51 is\r
+// defined, 4.0 otherwise (the minor version is 0 in both cases).\r
+// DriverEntry copies these values into the protocol characteristics.\r
+//\r
+#ifdef NDIS51\r
+#define PASSTHRU_PROT_MAJOR_NDIS_VERSION    5\r
+#define PASSTHRU_PROT_MINOR_NDIS_VERSION    0\r
+#else\r
+#define PASSTHRU_PROT_MAJOR_NDIS_VERSION    4\r
+#define PASSTHRU_PROT_MINOR_NDIS_VERSION    0\r
+#endif\r
+\r
+// NOTE(review): no user of MAX_BUNDLEID_LENGTH is visible in this part of the sources.\r
+#define MAX_BUNDLEID_LENGTH 50\r
+\r
+// TAG is a four-character constant, presumably a memory allocation tag - confirm against its users.\r
+#define TAG 'ImPa'\r
+// NOTE(review): presumably a timeout value meaning "wait forever" - confirm against its users.\r
+#define WAIT_INFINITE 0\r
+\r
+\r
+\r
+//advance declaration\r
+typedef struct _ADAPT ADAPT, *PADAPT;\r
+\r
+DRIVER_INITIALIZE DriverEntry;\r
+extern\r
+NTSTATUS\r
+DriverEntry(\r
+    IN PDRIVER_OBJECT            DriverObject,\r
+    IN PUNICODE_STRING           RegistryPath\r
+    );\r
+\r
+DRIVER_DISPATCH PtDispatch;\r
+NTSTATUS\r
+PtDispatch(\r
+    IN PDEVICE_OBJECT            DeviceObject,\r
+    IN PIRP                      Irp\r
+    );\r
+\r
+DRIVER_DISPATCH DevIoControl;\r
+NTSTATUS\r
+DevIoControl(\r
+    IN PDEVICE_OBJECT            pDeviceObject,\r
+    IN PIRP                      pIrp\r
+    );\r
+\r
+NDIS_STATUS\r
+PtRegisterDevice(\r
+    VOID\r
+    );\r
+\r
+NDIS_STATUS\r
+PtDeregisterDevice(\r
+    VOID\r
+   );\r
+\r
+DRIVER_UNLOAD PtUnload;\r
+VOID\r
+PtUnloadProtocol(\r
+    VOID\r
+    );\r
+\r
+//\r
+// Protocol proto-types\r
+//\r
+extern\r
+VOID\r
+PtOpenAdapterComplete(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN NDIS_STATUS                Status,\r
+    IN NDIS_STATUS                OpenErrorStatus\r
+    );\r
+\r
+extern\r
+VOID\r
+PtCloseAdapterComplete(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN NDIS_STATUS                Status\r
+    );\r
+\r
+extern\r
+VOID\r
+PtResetComplete(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN NDIS_STATUS                Status\r
+    );\r
+\r
+extern\r
+VOID\r
+PtRequestComplete(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN PNDIS_REQUEST              NdisRequest,\r
+    IN NDIS_STATUS                Status\r
+    );\r
+\r
+extern\r
+VOID\r
+PtStatus(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN NDIS_STATUS                GeneralStatus,\r
+    IN PVOID                      StatusBuffer,\r
+    IN UINT                       StatusBufferSize\r
+    );\r
+\r
+extern\r
+VOID\r
+PtStatusComplete(\r
+    IN NDIS_HANDLE                ProtocolBindingContext\r
+    );\r
+\r
+extern\r
+VOID\r
+PtSendComplete(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN PNDIS_PACKET               Packet,\r
+    IN NDIS_STATUS                Status\r
+    );\r
+\r
+extern\r
+VOID\r
+PtTransferDataComplete(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN PNDIS_PACKET               Packet,\r
+    IN NDIS_STATUS                Status,\r
+    IN UINT                       BytesTransferred\r
+    );\r
+\r
+extern\r
+NDIS_STATUS\r
+PtReceive(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN NDIS_HANDLE                MacReceiveContext,\r
+    IN PVOID                      HeaderBuffer,\r
+    IN UINT                       HeaderBufferSize,\r
+    IN PVOID                      LookAheadBuffer,\r
+    IN UINT                       LookaheadBufferSize,\r
+    IN UINT                       PacketSize\r
+    );\r
+\r
+extern\r
+VOID\r
+PtReceiveComplete(\r
+    IN NDIS_HANDLE                ProtocolBindingContext\r
+    );\r
+\r
+extern\r
+INT\r
+PtReceivePacket(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN PNDIS_PACKET               Packet\r
+    );\r
+\r
+extern\r
+VOID\r
+PtBindAdapter(\r
+    OUT PNDIS_STATUS              Status,\r
+    IN  NDIS_HANDLE               BindContext,\r
+    IN  PNDIS_STRING              DeviceName,\r
+    IN  PVOID                     SystemSpecific1,\r
+    IN  PVOID                     SystemSpecific2\r
+    );\r
+\r
+extern\r
+VOID\r
+PtUnbindAdapter(\r
+    OUT PNDIS_STATUS              Status,\r
+    IN  NDIS_HANDLE               ProtocolBindingContext,\r
+    IN  NDIS_HANDLE               UnbindContext\r
+    );\r
+    \r
+VOID\r
+PtUnload(\r
+    IN PDRIVER_OBJECT             DriverObject\r
+    );\r
+\r
+\r
+\r
+extern \r
+NDIS_STATUS\r
+PtPNPHandler(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN PNET_PNP_EVENT             pNetPnPEvent\r
+    );\r
+\r
+\r
+\r
+\r
+NDIS_STATUS\r
+PtPnPNetEventReconfigure(\r
+    IN PADAPT            pAdapt,\r
+    IN PNET_PNP_EVENT    pNetPnPEvent\r
+    );    \r
+\r
+NDIS_STATUS \r
+PtPnPNetEventSetPower (\r
+    IN PADAPT                    pAdapt,\r
+    IN PNET_PNP_EVENT            pNetPnPEvent\r
+    );\r
+    \r
+\r
+//\r
+// Miniport proto-types\r
+//\r
+NDIS_STATUS\r
+MPInitialize(\r
+    OUT PNDIS_STATUS             OpenErrorStatus,\r
+    OUT PUINT                    SelectedMediumIndex,\r
+    IN PNDIS_MEDIUM              MediumArray,\r
+    IN UINT                      MediumArraySize,\r
+    IN NDIS_HANDLE               MiniportAdapterHandle,\r
+    IN NDIS_HANDLE               WrapperConfigurationContext\r
+    );\r
+\r
+VOID\r
+MPSendPackets(\r
+    IN NDIS_HANDLE                MiniportAdapterContext,\r
+    IN PPNDIS_PACKET              PacketArray,\r
+    IN UINT                       NumberOfPackets\r
+    );\r
+\r
+NDIS_STATUS\r
+MPSend(\r
+    IN NDIS_HANDLE                MiniportAdapterContext,\r
+    IN PNDIS_PACKET               Packet,\r
+    IN UINT                       Flags\r
+    );\r
+\r
+NDIS_STATUS\r
+MPQueryInformation(\r
+    IN NDIS_HANDLE                MiniportAdapterContext,\r
+    IN NDIS_OID                   Oid,\r
+    IN PVOID                      InformationBuffer,\r
+    IN ULONG                      InformationBufferLength,\r
+    OUT PULONG                    BytesWritten,\r
+    OUT PULONG                    BytesNeeded\r
+    );\r
+\r
+NDIS_STATUS\r
+MPSetInformation(\r
+    IN NDIS_HANDLE                                      MiniportAdapterContext,\r
+    IN NDIS_OID                                         Oid,\r
+    __in_bcount(InformationBufferLength) IN PVOID       InformationBuffer,\r
+    IN ULONG                                            InformationBufferLength,\r
+    OUT PULONG                                          BytesRead,\r
+    OUT PULONG                                          BytesNeeded\r
+    );\r
+\r
+VOID\r
+MPReturnPacket(\r
+    IN NDIS_HANDLE                MiniportAdapterContext,\r
+    IN PNDIS_PACKET               Packet\r
+    );\r
+\r
+NDIS_STATUS\r
+MPTransferData(\r
+    OUT PNDIS_PACKET              Packet,\r
+    OUT PUINT                     BytesTransferred,\r
+    IN NDIS_HANDLE                MiniportAdapterContext,\r
+    IN NDIS_HANDLE                MiniportReceiveContext,\r
+    IN UINT                       ByteOffset,\r
+    IN UINT                       BytesToTransfer\r
+    );\r
+\r
+VOID\r
+MPHalt(\r
+    IN NDIS_HANDLE                MiniportAdapterContext\r
+    );\r
+\r
+\r
+VOID\r
+MPQueryPNPCapabilities(  \r
+    OUT PADAPT                    MiniportProtocolContext, \r
+    OUT PNDIS_STATUS              Status\r
+    );\r
+\r
+\r
+#ifdef NDIS51_MINIPORT\r
+\r
+VOID\r
+MPCancelSendPackets(\r
+    IN NDIS_HANDLE            MiniportAdapterContext,\r
+    IN PVOID                  CancelId\r
+    );\r
+\r
+VOID\r
+MPAdapterShutdown(\r
+    IN NDIS_HANDLE                MiniportAdapterContext\r
+    );\r
+\r
+VOID\r
+MPDevicePnPEvent(\r
+    IN NDIS_HANDLE                MiniportAdapterContext,\r
+    IN NDIS_DEVICE_PNP_EVENT      DevicePnPEvent,\r
+    IN PVOID                      InformationBuffer,\r
+    IN ULONG                      InformationBufferLength\r
+    );\r
+\r
+#endif // NDIS51_MINIPORT\r
+\r
+VOID\r
+MPFreeAllPacketPools(\r
+    IN PADAPT                    pAdapt\r
+    );\r
+\r
+\r
+VOID\r
+MPProcessSetPowerOid(\r
+    IN OUT PNDIS_STATUS                             pNdisStatus,\r
+    IN PADAPT                                       pAdapt,\r
+    __in_bcount(InformationBufferLength) IN PVOID   InformationBuffer,\r
+    IN ULONG                                        InformationBufferLength,\r
+    OUT PULONG                                      BytesRead,\r
+    OUT PULONG                                      BytesNeeded\r
+    );\r
+\r
+VOID\r
+PtReferenceAdapt(\r
+    IN PADAPT     pAdapt\r
+    );\r
+\r
+BOOLEAN\r
+PtDereferenceAdapt(\r
+    IN PADAPT     pAdapt\r
+    );\r
+\r
+//\r
+// There should be no DbgPrint's in the Free version of the driver\r
+//\r
+#if DBG\r
+\r
+#define DBGPRINT(Fmt)                                        \\r
+    {                                                        \\r
+       DbgPrint("Passthru: ");                              \
+        DbgPrint Fmt;                                        \\r
+    }\r
+\r
+#else // if DBG\r
+\r
+#define DBGPRINT(Fmt)                                            \r
+\r
+#endif // if DBG \r
+\r
+#define    NUM_PKTS_IN_POOL    256\r
+\r
+\r
+//\r
+// Protocol reserved part of a sent packet that is allocated by us.\r
+//\r
+typedef struct _SEND_RSVD\r
+{\r
+    PNDIS_PACKET    OriginalPkt;   // the packet this one was created for\r
+    struct mbuf*    pMbuf; // IPFW extension, reference to the mbuf\r
+} SEND_RSVD, *PSEND_RSVD;\r
+\r
+//\r
+// Miniport reserved part of a received packet that is allocated by\r
+// us. Note that this should fit into the MiniportReserved space\r
+// in an NDIS_PACKET; the C_ASSERT below enforces this at compile time.\r
+//\r
+typedef struct _RECV_RSVD\r
+{\r
+    PNDIS_PACKET    OriginalPkt;   // the packet this one was created for\r
+    struct mbuf*    pMbuf; // IPFW extension, reference to the mbuf\r
+} RECV_RSVD, *PRECV_RSVD;\r
+\r
+// Break the build if RECV_RSVD no longer fits into NDIS_PACKET.MiniportReserved.\r
+C_ASSERT(sizeof(RECV_RSVD) <= sizeof(((PNDIS_PACKET)0)->MiniportReserved));\r
+\r
+//\r
+// Event Codes related to the PassthruEvent Structure\r
+//\r
+// NOTE: the typedef names are misspelled (PASSSTHRU_..., PPASTHRU_...).\r
+// They are kept as they are because other code may refer to them.\r
+//\r
+\r
+typedef enum \r
+{\r
+    Passthru_Invalid,\r
+    Passthru_SetPower,\r
+    Passthru_Unbind\r
+\r
+} PASSSTHRU_EVENT_CODE, *PPASTHRU_EVENT_CODE; \r
+\r
+//\r
+// Passthru event: an NDIS event paired with a code, presumably stating\r
+// why the event has been set (the original wording was garbled) - confirm\r
+// against the users of this structure.\r
+//\r
+\r
+typedef struct _PASSTHRU_EVENT\r
+{\r
+    NDIS_EVENT Event;\r
+    PASSSTHRU_EVENT_CODE Code;\r
+\r
+} PASSTHRU_EVENT, *PPASSTHRU_EVENT;\r
+\r
+\r
+//\r
+// Structure used by both the miniport as well as the protocol part of the intermediate driver\r
+// to represent an adapter and its corresponding lower binding.\r
+//\r
+typedef struct _ADAPT\r
+{\r
+    struct _ADAPT *                Next;              // Next adapter in a list (presumably the one headed by pAdaptList - confirm)\r
+    \r
+    NDIS_HANDLE                    BindingHandle;    // To the lower miniport\r
+    NDIS_HANDLE                    MiniportHandle;    // NDIS handle for miniport up-calls\r
+    NDIS_HANDLE                    SendPacketPoolHandle;    // Pool used to send packets below\r
+    NDIS_HANDLE                    RecvPacketPoolHandle;    // Pool used to indicate receives\r
+    NDIS_STATUS                    Status;            // Open Status\r
+    NDIS_EVENT                     Event;            // Used by bind/halt for Open/Close Adapter synch.\r
+    NDIS_MEDIUM                    Medium;\r
+    NDIS_REQUEST                   Request;        // This is used to wrap a request coming down\r
+                                                // to us. This exploits the fact that requests\r
+                                                // are serialized down to us.\r
+    PULONG                         BytesNeeded;\r
+    PULONG                         BytesReadOrWritten;\r
+    BOOLEAN                        ReceivedIndicationFlags[32];\r
+    \r
+    BOOLEAN                        OutstandingRequests;      // TRUE iff a request is pending\r
+                                                        // at the miniport below\r
+    BOOLEAN                        QueuedRequest;            // TRUE iff a request is queued at\r
+                                                        // this IM miniport\r
+\r
+    BOOLEAN                        StandingBy;                // True - When the miniport or protocol is transitioning from a D0 to Standby (>D0) State\r
+                                                        // False - At all other times, - Flag is cleared after a transition to D0\r
+    BOOLEAN                        UnbindingInProcess;\r
+    NDIS_SPIN_LOCK                 Lock;                      // Held by ADAPT_DECR_PENDING_SENDS while updating OutstandingSends\r
+\r
+    NDIS_DEVICE_POWER_STATE        MPDeviceState;            // Miniport's Device State\r
+    NDIS_DEVICE_POWER_STATE        PTDeviceState;            // Protocol's Device State\r
+    NDIS_STRING                    DeviceName;                // For initializing the miniport edge\r
+    NDIS_EVENT                     MiniportInitEvent;        // For blocking UnbindAdapter while\r
+                                                        // an IM Init is in progress.\r
+    BOOLEAN                        MiniportInitPending;    // TRUE iff IMInit in progress\r
+    NDIS_STATUS                    LastIndicatedStatus;    // The last indicated media status\r
+    NDIS_STATUS                    LatestUnIndicateStatus; // The latest suppressed media status\r
+    ULONG                          OutstandingSends;       // Decremented under Lock by ADAPT_DECR_PENDING_SENDS\r
+    LONG                           RefCount;               // Presumably managed by PtReferenceAdapt/PtDereferenceAdapt - confirm\r
+    BOOLEAN                        MiniportIsHalted;\r
+} ADAPT, *PADAPT;\r
+\r
+extern    NDIS_HANDLE                        ProtHandle, DriverHandle;\r
+extern    NDIS_MEDIUM                        MediumArray[4];\r
+extern    PADAPT                             pAdaptList;\r
+extern    NDIS_SPIN_LOCK                     GlobalLock;\r
+\r
+\r
+#define ADAPT_MINIPORT_HANDLE(_pAdapt)    ((_pAdapt)->MiniportHandle)\r
+#define ADAPT_DECR_PENDING_SENDS(_pAdapt)     \\r
+    {                                         \\r
+        NdisAcquireSpinLock(&(_pAdapt)->Lock);   \\r
+        (_pAdapt)->OutstandingSends--;           \\r
+        NdisReleaseSpinLock(&(_pAdapt)->Lock);   \\r
+    }\r
+\r
+//\r
+// Custom Macros to be used by the passthru driver\r
+//\r
+// IsIMDeviceStateOn behaves like the following function:\r
+//\r
+/*\r
+BOOLEAN\r
+IsIMDeviceStateOn(\r
+   PADAPT\r
+   )\r
+\r
+*/\r
+//\r
+// It is true only when both the miniport edge (MPDeviceState) and the\r
+// protocol edge (PTDeviceState) of the adapter are in power state D0.\r
+// The argument is evaluated twice, so do not pass an expression with\r
+// side effects.\r
+//\r
+#define IsIMDeviceStateOn(_pP)        ((_pP)->MPDeviceState == NdisDeviceStateD0 && (_pP)->PTDeviceState == NdisDeviceStateD0 ) \r
+\r
+#include "winmissing.h"\r
+\r
+//\r
+// IPFW glue. ipfw_module_init() is called from DriverEntry and\r
+// ipfw_module_exit() from PtUnload; the remaining routines are defined\r
+// and used outside this header.\r
+//\r
+int ipfw_module_init(void);\r
+void ipfw_module_exit(void);\r
+int ipfw2_qhandler_w32(PNDIS_PACKET pNdisPacket, int direction,\r
+       NDIS_HANDLE Context);\r
+int ipfw2_qhandler_w32_oldstyle(int direction, NDIS_HANDLE ProtocolBindingContext,\r
+               unsigned char* HeaderBuffer, unsigned int HeaderBufferSize,\r
+               unsigned char* LookAheadBuffer, unsigned int LookAheadBufferSize,\r
+           unsigned int PacketSize);\r
+void CleanupReinjected(PNDIS_PACKET Packet, struct mbuf* m, PADAPT pAdapt);\r
+void hexdump(PUCHAR,int, const char *);\r
+// (void) rather than (): in C an empty parameter list is not a prototype.\r
+void my_init(void);\r
+void my_exit(void);
\ No newline at end of file
diff --git a/modified_passthru/precomp.h b/modified_passthru/precomp.h
new file mode 100644 (file)
index 0000000..b2870d1
--- /dev/null
@@ -0,0 +1,11 @@
+#pragma warning(disable:4214)   // bit field types other than int\r
+\r
+#pragma warning(disable:4201)   // nameless struct/union\r
+#pragma warning(disable:4115)   // named type definition in parentheses\r
+#pragma warning(disable:4127)   // conditional expression is constant\r
+#pragma warning(disable:4054)   // cast of function pointer to PVOID\r
+#pragma warning(disable:4244)   // conversion from 'int' to 'BOOLEAN', possible loss of data\r
+\r
+#include <ndis.h>\r
+#include "passthru.h"\r
+\r
diff --git a/modified_passthru/protocol.c b/modified_passthru/protocol.c
new file mode 100644 (file)
index 0000000..9db4c36
--- /dev/null
@@ -0,0 +1,1670 @@
+/*++
+
+Copyright(c) 1992-2000  Microsoft Corporation
+
+Module Name:
+
+    protocol.c
+
+Abstract:
+
+    Ndis Intermediate Miniport driver sample. This is a passthru driver.
+
+Author:
+
+Environment:
+
+
+Revision History:
+
+
+--*/
+
+
+#include "precomp.h"
+#pragma hdrstop
+
+//
+// Sizing of the send and receive packet pools allocated in PtBindAdapter:
+// MIN_PACKET_POOL_SIZE is passed as the descriptor count and the
+// difference (MAX_PACKET_POOL_SIZE - MIN_PACKET_POOL_SIZE) as the
+// overflow descriptor count of NdisAllocatePacketPoolEx.
+//
+#define MAX_PACKET_POOL_SIZE 0x0000FFFF
+#define MIN_PACKET_POOL_SIZE 0x000000FF
+
+//
+// NDIS version as 0xMMMMmmmm, where M=Major/m=minor (0x00050001 = 5.1);
+// initially unknown (0). PtBindAdapter fills it in from the "NdisVersion"
+// configuration value while it is still 0.
+//
+ULONG       NdisDotSysVersion =  0x0;
+
+
+//
+// Value of NdisDotSysVersion for NDIS.SYS 5.1. PtRequestComplete compares
+// against it before clearing NDIS_MAC_OPTION_NO_LOOPBACK.
+//
+#define NDIS_SYS_VERSION_51       0x00050001
+
+
+VOID
+PtBindAdapter(
+    OUT PNDIS_STATUS            Status,
+    IN  NDIS_HANDLE             BindContext,
+    IN  PNDIS_STRING            DeviceName,
+    IN  PVOID                   SystemSpecific1,
+    IN  PVOID                   SystemSpecific2
+    )
+/*++
+
+Routine Description:
+
+    Called by NDIS to bind to a miniport below.
+
+Arguments:
+
+    Status            - Return status of bind here.
+    BindContext        - Can be passed to NdisCompleteBindAdapter if this call is pended.
+    DeviceName         - Device name to bind to. This is passed to NdisOpenAdapter.
+    SystemSpecific1    - Can be passed to NdisOpenProtocolConfiguration to read per-binding information
+    SystemSpecific2    - Unused
+
+Return Value:
+
+    NDIS_STATUS_PENDING    if this call is pended. In this case call NdisCompleteBindAdapter
+    to complete.
+    Anything else          Completes this call synchronously
+
+--*/
+{
+    NDIS_HANDLE                     ConfigHandle = NULL;
+    PNDIS_CONFIGURATION_PARAMETER   Param;
+    NDIS_STRING                     DeviceStr = NDIS_STRING_CONST("UpperBindings");
+    NDIS_STRING                     NdisVersionStr = NDIS_STRING_CONST("NdisVersion");
+    PADAPT                          pAdapt = NULL;
+    NDIS_STATUS                     Sts;
+    UINT                            MediumIndex;
+    ULONG                           TotalSize;
+    BOOLEAN                         NoCleanUpNeeded = FALSE;
+
+
+    UNREFERENCED_PARAMETER(BindContext);
+    UNREFERENCED_PARAMETER(SystemSpecific2);
+    
+    DBGPRINT(("==> Protocol BindAdapter\n"));
+
+    do
+    {
+        //
+        // Access the configuration section for our binding-specific
+        // parameters.
+        //
+        NdisOpenProtocolConfiguration(Status,
+                                       &ConfigHandle,
+                                       SystemSpecific1);
+
+        if (*Status != NDIS_STATUS_SUCCESS)
+        {
+            break;
+        }
+        if (NdisDotSysVersion == 0)
+        {
+            NdisReadConfiguration(Status,
+                                  &Param,
+                                  ConfigHandle,
+                                  &NdisVersionStr,        // "NdisVersion"
+                                  NdisParameterInteger);
+            if (*Status != NDIS_STATUS_SUCCESS)
+            {
+                break;
+            }
+            
+            NdisDotSysVersion = Param->ParameterData.IntegerData;
+        }
+                        
+
+        //
+        // Read the "UpperBindings" reserved key that contains a list
+        // of device names representing our miniport instances corresponding
+        // to this lower binding. Since this is a 1:1 IM driver, this key
+        // contains exactly one name.
+        //
+        // If we want to implement a N:1 mux driver (N adapter instances
+        // over a single lower binding), then UpperBindings will be a
+        // MULTI_SZ containing a list of device names - we would loop through
+        // this list, calling NdisIMInitializeDeviceInstanceEx once for
+        // each name in it.
+        //
+        NdisReadConfiguration(Status,
+                              &Param,
+                              ConfigHandle,
+                              &DeviceStr,
+                              NdisParameterString);
+        if (*Status != NDIS_STATUS_SUCCESS)
+        {
+            break;
+        }
+
+        //
+        // Allocate memory for the Adapter structure. This represents both the
+        // protocol context as well as the adapter structure when the miniport
+        // is initialized.
+        //
+        // In addition to the base structure, allocate space for the device
+        // instance string.
+        //
+        TotalSize = sizeof(ADAPT) + Param->ParameterData.StringData.MaximumLength;
+
+        NdisAllocateMemoryWithTag(&pAdapt, TotalSize, TAG);
+
+        if (pAdapt == NULL)
+        {
+            *Status = NDIS_STATUS_RESOURCES;
+            break;
+        }
+
+        //
+        // Initialize the adapter structure. We copy in the IM device
+        // name as well, because we may need to use it in a call to
+        // NdisIMCancelInitializeDeviceInstance. The string returned
+        // by NdisReadConfiguration is active (i.e. available) only
+        // for the duration of this call to our BindAdapter handler.
+        //
+        NdisZeroMemory(pAdapt, TotalSize);
+        pAdapt->DeviceName.MaximumLength = Param->ParameterData.StringData.MaximumLength;
+        pAdapt->DeviceName.Length = Param->ParameterData.StringData.Length;
+        pAdapt->DeviceName.Buffer = (PWCHAR)((ULONG_PTR)pAdapt + sizeof(ADAPT));
+        NdisMoveMemory(pAdapt->DeviceName.Buffer,
+                       Param->ParameterData.StringData.Buffer,
+                       Param->ParameterData.StringData.MaximumLength);
+
+
+
+        NdisInitializeEvent(&pAdapt->Event);
+        NdisAllocateSpinLock(&pAdapt->Lock);
+
+        //
+        // Allocate a packet pool for sends. We need this to pass sends down.
+        // We cannot use the same packet descriptor that came down to our send
+        // handler (see also NDIS 5.1 packet stacking).
+        //
+        NdisAllocatePacketPoolEx(Status,
+                                   &pAdapt->SendPacketPoolHandle,
+                                   MIN_PACKET_POOL_SIZE,
+                                   MAX_PACKET_POOL_SIZE - MIN_PACKET_POOL_SIZE,
+                                   sizeof(SEND_RSVD));
+
+        if (*Status != NDIS_STATUS_SUCCESS)
+        {
+            break;
+        }
+
+        //
+        // Allocate a packet pool for receives. We need this to indicate receives.
+        // Same consideration as sends (see also NDIS 5.1 packet stacking).
+        //
+        NdisAllocatePacketPoolEx(Status,
+                                   &pAdapt->RecvPacketPoolHandle,
+                                   MIN_PACKET_POOL_SIZE,
+                                   MAX_PACKET_POOL_SIZE - MIN_PACKET_POOL_SIZE,
+                                   PROTOCOL_RESERVED_SIZE_IN_PACKET);
+
+        if (*Status != NDIS_STATUS_SUCCESS)
+        {
+            break;
+        }
+
+        //
+        // Now open the adapter below and complete the initialization
+        //
+        NdisOpenAdapter(Status,
+                          &Sts,
+                          &pAdapt->BindingHandle,
+                          &MediumIndex,
+                          MediumArray,
+                          sizeof(MediumArray)/sizeof(NDIS_MEDIUM),
+                          ProtHandle,
+                          pAdapt,
+                          DeviceName,
+                          0,
+                          NULL);
+
+        if (*Status == NDIS_STATUS_PENDING)
+        {
+            NdisWaitEvent(&pAdapt->Event, 0);
+            *Status = pAdapt->Status;
+        }
+
+        if (*Status != NDIS_STATUS_SUCCESS)
+        {
+            break;
+        }
+        PtReferenceAdapt(pAdapt);
+
+#pragma prefast(suppress: __WARNING_POTENTIAL_BUFFER_OVERFLOW, "Ndis guarantees MediumIndex to be within bounds");
+        pAdapt->Medium = MediumArray[MediumIndex];
+
+        //
+        // Now ask NDIS to initialize our miniport (upper) edge.
+        // Set the flag below to synchronize with a possible call
+        // to our protocol Unbind handler that may come in before
+        // our miniport initialization happens.
+        //
+        pAdapt->MiniportInitPending = TRUE;
+        NdisInitializeEvent(&pAdapt->MiniportInitEvent);
+
+        PtReferenceAdapt(pAdapt);
+
+        *Status = NdisIMInitializeDeviceInstanceEx(DriverHandle,
+                                           &pAdapt->DeviceName,
+                                           pAdapt);
+
+        if (*Status != NDIS_STATUS_SUCCESS)
+        {
+            if (pAdapt->MiniportIsHalted == TRUE)
+            {
+                NoCleanUpNeeded = TRUE;
+            }
+            
+            DBGPRINT(("BindAdapter: Adapt %p, IMInitializeDeviceInstance error %x\n",
+                pAdapt, *Status));
+            
+            if (PtDereferenceAdapt(pAdapt))
+            {
+                pAdapt = NULL;
+            }
+            
+            break;
+        }
+        
+        PtDereferenceAdapt(pAdapt);
+
+    } while(FALSE);
+
+    //
+    // Close the configuration handle now - see comments above with
+    // the call to NdisIMInitializeDeviceInstanceEx.
+    //
+    if (ConfigHandle != NULL)
+    {
+        NdisCloseConfiguration(ConfigHandle);
+    }
+
+    if ((*Status != NDIS_STATUS_SUCCESS) && (NoCleanUpNeeded == FALSE))
+    {
+        if (pAdapt != NULL)
+        {
+            if (pAdapt->BindingHandle != NULL)
+            {
+                NDIS_STATUS    LocalStatus;
+
+                //
+                // Close the binding we opened above.
+                //
+
+                NdisResetEvent(&pAdapt->Event);
+                
+                NdisCloseAdapter(&LocalStatus, pAdapt->BindingHandle);
+                pAdapt->BindingHandle = NULL;
+
+                if (LocalStatus == NDIS_STATUS_PENDING)
+                {
+                     NdisWaitEvent(&pAdapt->Event, 0);
+                     LocalStatus = pAdapt->Status;
+
+                     
+                }
+                if (PtDereferenceAdapt(pAdapt))
+                {
+                     pAdapt = NULL;
+                }
+            }
+        }
+    }
+
+
+    DBGPRINT(("<== Protocol BindAdapter: pAdapt %p, Status %x\n", pAdapt, *Status));
+}
+
+
+VOID
+PtOpenAdapterComplete(
+    IN  NDIS_HANDLE             ProtocolBindingContext,
+    IN  NDIS_STATUS             Status,
+    IN  NDIS_STATUS             OpenErrorStatus
+    )
+/*++
+
+Routine Description:
+
+    Completion handler for an NdisOpenAdapter call that pended. Stores the
+    final status in the adapter structure and signals the adapter event so
+    that the thread waiting in PtBindAdapter can continue.
+
+Arguments:
+
+    ProtocolBindingContext    Pointer to the adapter structure
+    Status                    Final status of the NdisOpenAdapter call
+    OpenErrorStatus           Secondary status (not used)
+
+Return Value:
+
+    None
+
+--*/
+{
+    PADAPT      Adapter;
+
+    UNREFERENCED_PARAMETER(OpenErrorStatus);
+
+    Adapter = (PADAPT)ProtocolBindingContext;
+
+    DBGPRINT(("==> PtOpenAdapterComplete: Adapt %p, Status %x\n", Adapter, Status));
+
+    //
+    // Publish the result first, then wake up the waiter.
+    //
+    Adapter->Status = Status;
+    NdisSetEvent(&Adapter->Event);
+}
+
+
+VOID
+PtUnbindAdapter(
+    OUT PNDIS_STATUS           Status,
+    IN  NDIS_HANDLE            ProtocolBindingContext,
+    IN  NDIS_HANDLE            UnbindContext
+    )
+/*++
+
+Routine Description:
+
+    Called by NDIS when we are required to unbind to the adapter below.
+    This functions shares functionality with the miniport's HaltHandler.
+    The code should ensure that NdisCloseAdapter and NdisFreeMemory is called
+    only once between the two functions
+
+Arguments:
+
+    Status                    Placeholder for return status
+    ProtocolBindingContext    Pointer to the adapter structure
+    UnbindContext            Context for NdisUnbindComplete() if this pends
+
+Return Value:
+
+    Status for NdisIMDeinitializeDeviceContext
+
+--*/
+{
+    PADAPT         pAdapt =(PADAPT)ProtocolBindingContext;
+    NDIS_STATUS    LocalStatus;
+
+    UNREFERENCED_PARAMETER(UnbindContext);
+    
+    DBGPRINT(("==> PtUnbindAdapter: Adapt %p\n", pAdapt));
+
+    //
+    // Set the flag that the miniport below is unbinding, so the request handlers will
+    // fail any request comming later
+    // 
+    NdisAcquireSpinLock(&pAdapt->Lock);
+    pAdapt->UnbindingInProcess = TRUE;
+    if (pAdapt->QueuedRequest == TRUE)
+    {
+        pAdapt->QueuedRequest = FALSE;
+        NdisReleaseSpinLock(&pAdapt->Lock);
+
+        PtRequestComplete(pAdapt,
+                         &pAdapt->Request,
+                         NDIS_STATUS_FAILURE );
+
+    }
+    else
+    {
+        NdisReleaseSpinLock(&pAdapt->Lock);
+    }
+#ifndef WIN9X
+    //
+    // Check if we had called NdisIMInitializeDeviceInstanceEx and
+    // we are awaiting a call to MiniportInitialize.
+    //
+    if (pAdapt->MiniportInitPending == TRUE)
+    {
+        //
+        // Try to cancel the pending IMInit process.
+        //
+        LocalStatus = NdisIMCancelInitializeDeviceInstance(
+                        DriverHandle,
+                        &pAdapt->DeviceName);
+
+        if (LocalStatus == NDIS_STATUS_SUCCESS)
+        {
+            //
+            // Successfully cancelled IM Initialization; our
+            // Miniport Initialize routine will not be called
+            // for this device.
+            //
+            pAdapt->MiniportInitPending = FALSE;
+            ASSERT(pAdapt->MiniportHandle == NULL);
+        }
+        else
+        {
+            //
+            // Our Miniport Initialize routine will be called
+            // (may be running on another thread at this time).
+            // Wait for it to finish.
+            //
+            NdisWaitEvent(&pAdapt->MiniportInitEvent, 0);
+            ASSERT(pAdapt->MiniportInitPending == FALSE);
+        }
+
+    }
+#endif // !WIN9X
+
+    //
+    // Call NDIS to remove our device-instance. We do most of the work
+    // inside the HaltHandler.
+    //
+    // The Handle will be NULL if our miniport Halt Handler has been called or
+    // if the IM device was never initialized
+    //
+    
+    if (pAdapt->MiniportHandle != NULL)
+    {
+        *Status = NdisIMDeInitializeDeviceInstance(pAdapt->MiniportHandle);
+
+        if (*Status != NDIS_STATUS_SUCCESS)
+        {
+            *Status = NDIS_STATUS_FAILURE;
+        }
+    }
+    else
+    {
+        //
+        // We need to do some work here. 
+        // Close the binding below us 
+        // and release the memory allocated.
+        //
+        
+        if(pAdapt->BindingHandle != NULL)
+        {
+            NdisResetEvent(&pAdapt->Event);
+
+            NdisCloseAdapter(Status, pAdapt->BindingHandle);
+
+            //
+            // Wait for it to complete
+            //
+            if(*Status == NDIS_STATUS_PENDING)
+            {
+                 NdisWaitEvent(&pAdapt->Event, 0);
+                 *Status = pAdapt->Status;
+            }
+            pAdapt->BindingHandle = NULL;
+        }
+        else
+        {
+            //
+            // Both Our MiniportHandle and Binding Handle  should not be NULL.
+            //
+            *Status = NDIS_STATUS_FAILURE;
+            ASSERT(0);
+        }
+
+        //
+        //    Free the memory here, if was not released earlier(by calling the HaltHandler)
+        //
+        MPFreeAllPacketPools(pAdapt);
+        NdisFreeSpinLock(&pAdapt->Lock);
+        NdisFreeMemory(pAdapt, 0, 0);
+    }
+
+    DBGPRINT(("<== PtUnbindAdapter: Adapt %p\n", pAdapt));
+}
+
+VOID
+PtUnloadProtocol(
+    VOID
+)
+/*++
+
+Routine Description:
+
+    Deregisters our protocol edge from NDIS if it is still registered and
+    clears the global protocol handle so the deregistration happens only
+    once.
+
+Arguments:
+
+    None
+
+Return Value:
+
+    None. The status reported by NdisDeregisterProtocol is not examined.
+
+--*/
+{
+    NDIS_STATUS DeregisterStatus;
+
+    if (ProtHandle != NULL)
+    {
+        NdisDeregisterProtocol(&DeregisterStatus, ProtHandle);
+        ProtHandle = NULL;
+    }
+
+    DBGPRINT(("PtUnloadProtocol: done!\n"));
+}
+
+
+
+VOID
+PtCloseAdapterComplete(
+    IN    NDIS_HANDLE            ProtocolBindingContext,
+    IN    NDIS_STATUS            Status
+    )
+/*++
+
+Routine Description:
+
+    Completion handler for an NdisCloseAdapter call that pended. Stores the
+    final status in the adapter structure and signals the adapter event so
+    that the thread waiting for the close can continue.
+
+Arguments:
+
+    ProtocolBindingContext    Pointer to the adapter structure
+    Status                    Completion status
+
+Return Value:
+
+    None.
+
+--*/
+{
+    PADAPT      Adapter;
+
+    Adapter = (PADAPT)ProtocolBindingContext;
+
+    DBGPRINT(("CloseAdapterComplete: Adapt %p, Status %x\n", Adapter, Status));
+
+    //
+    // Publish the result first, then wake up the waiter.
+    //
+    Adapter->Status = Status;
+    NdisSetEvent(&Adapter->Event);
+}
+
+
+VOID
+PtResetComplete(
+    IN  NDIS_HANDLE            ProtocolBindingContext,
+    IN  NDIS_STATUS            Status
+    )
+/*++
+
+Routine Description:
+
+    Completion handler for a reset of the adapter below. This driver never
+    issues a reset, so reaching this routine is unexpected and only
+    triggers an assertion.
+
+Arguments:
+
+    ProtocolBindingContext    Pointer to the adapter structure (unused)
+    Status                    Completion status (unused)
+
+Return Value:
+
+    None.
+
+--*/
+{
+    UNREFERENCED_PARAMETER(Status);
+    UNREFERENCED_PARAMETER(ProtocolBindingContext);
+
+    //
+    // No reset is ever requested by us, so nothing should complete here.
+    //
+    ASSERT(0);
+}
+
+
+VOID
+PtRequestComplete(
+    IN  NDIS_HANDLE            ProtocolBindingContext,
+    IN  PNDIS_REQUEST          NdisRequest,
+    IN  NDIS_STATUS            Status
+    )
+/*++
+
+Routine Description:
+
+    Completion handler for a request previously posted to the miniport
+    below. The byte counts of the request are copied back to the locations
+    remembered in the adapter structure and the matching query/set is
+    completed on our miniport edge.
+
+    Two query OIDs get extra treatment before completion:
+      - OID_PNP_CAPABILITIES (on success): MPQueryPNPCapabilities is called
+        and may change the completion status.
+      - OID_GEN_MAC_OPTIONS (on success, NDIS 5.1 or later): the
+        NDIS_MAC_OPTION_NO_LOOPBACK bit is cleared in the returned options.
+
+Arguments:
+
+    ProtocolBindingContext    Pointer to the adapter structure
+    NdisRequest               The posted request
+    Status                    Completion status
+
+Return Value:
+
+    None
+
+--*/
+{
+    PADAPT        Adapter = (PADAPT)ProtocolBindingContext;
+    NDIS_OID      RequestOid = Adapter->Request.DATA.SET_INFORMATION.Oid;
+
+    //
+    // The request is no longer outstanding.
+    //
+    ASSERT(Adapter->OutstandingRequests == TRUE);
+    Adapter->OutstandingRequests = FALSE;
+
+    if (NdisRequest->RequestType == NdisRequestQueryInformation)
+    {
+        //
+        // OID_PNP_QUERY_POWER is never passed down.
+        //
+        ASSERT(RequestOid != OID_PNP_QUERY_POWER);
+
+        if ((RequestOid == OID_PNP_CAPABILITIES) && (Status == NDIS_STATUS_SUCCESS))
+        {
+            MPQueryPNPCapabilities(Adapter, &Status);
+        }
+
+        *Adapter->BytesReadOrWritten = NdisRequest->DATA.QUERY_INFORMATION.BytesWritten;
+        *Adapter->BytesNeeded = NdisRequest->DATA.QUERY_INFORMATION.BytesNeeded;
+
+        //
+        // Only do this on Windows XP or greater (NDIS.SYS v 5.1);
+        // do not do it on Windows 2000 (NDIS.SYS v 5.0).
+        //
+        if (((RequestOid == OID_GEN_MAC_OPTIONS)
+              && (Status == NDIS_STATUS_SUCCESS))
+              && (NdisDotSysVersion >= NDIS_SYS_VERSION_51))
+        {
+            PULONG    MacOptions;
+
+            //
+            // Remove the no-loopback bit from mac-options. In essence we are
+            // telling NDIS that we can handle loopback. We don't, but the
+            // interface below us does. If we do not do this, then loopback
+            // processing happens both below us and above us. This is wasteful
+            // at best and if Netmon is running, it will see multiple copies
+            // of loopback packets when sniffing above us.
+            //
+            // Only the lowest miniport in a stack of layered miniports should
+            // ever report this bit set to NDIS.
+            //
+            MacOptions = (PULONG)NdisRequest->DATA.QUERY_INFORMATION.InformationBuffer;
+            *MacOptions &= ~NDIS_MAC_OPTION_NO_LOOPBACK;
+        }
+
+        NdisMQueryInformationComplete(Adapter->MiniportHandle, Status);
+    }
+    else if (NdisRequest->RequestType == NdisRequestSetInformation)
+    {
+        ASSERT( RequestOid != OID_PNP_SET_POWER);
+
+        *Adapter->BytesReadOrWritten = NdisRequest->DATA.SET_INFORMATION.BytesRead;
+        *Adapter->BytesNeeded = NdisRequest->DATA.SET_INFORMATION.BytesNeeded;
+
+        NdisMSetInformationComplete(Adapter->MiniportHandle, Status);
+    }
+    else
+    {
+        //
+        // Only query and set requests are expected here.
+        //
+        ASSERT(0);
+    }
+}
+
+
+VOID
+PtStatus(
+    IN  NDIS_HANDLE         ProtocolBindingContext,
+    IN  NDIS_STATUS         GeneralStatus,
+    IN  PVOID               StatusBuffer,
+    IN  UINT                StatusBufferSize
+    )
+/*++
+
+Routine Description:
+
+    Status handler for the lower-edge (protocol). Status indications from
+    the miniport below are forwarded to the protocols above only while our
+    miniport edge exists and both edges are in power state D0. Media
+    connect/disconnect indications are additionally remembered: in
+    LastIndicatedStatus when they are forwarded, in LatestUnIndicateStatus
+    when they are held back.
+
+Arguments:
+
+    ProtocolBindingContext    Pointer to the adapter structure
+    GeneralStatus             Status code
+    StatusBuffer              Status buffer
+    StatusBufferSize          Size of the status buffer
+
+Return Value:
+
+    None
+
+--*/
+{
+    PADAPT      Adapter = (PADAPT)ProtocolBindingContext;
+    BOOLEAN     IsMediaEvent;
+
+    //
+    // Without an initialized upper edge there is nothing to indicate to
+    // and nothing to remember.
+    //
+    if (Adapter->MiniportHandle == NULL)
+    {
+        return;
+    }
+
+    IsMediaEvent = (BOOLEAN)((GeneralStatus == NDIS_STATUS_MEDIA_CONNECT) ||
+                             (GeneralStatus == NDIS_STATUS_MEDIA_DISCONNECT));
+
+    if ((Adapter->MPDeviceState == NdisDeviceStateD0) &&
+        (Adapter->PTDeviceState == NdisDeviceStateD0))
+    {
+        //
+        // Both edges are powered on: pass the indication up.
+        //
+        if (IsMediaEvent)
+        {
+            Adapter->LastIndicatedStatus = GeneralStatus;
+        }
+        NdisMIndicateStatus(Adapter->MiniportHandle,
+                            GeneralStatus,
+                            StatusBuffer,
+                            StatusBufferSize);
+    }
+    else if (IsMediaEvent)
+    {
+        //
+        // Not at D0: do not indicate, only save the latest media status.
+        //
+        Adapter->LatestUnIndicateStatus = GeneralStatus;
+    }
+}
+
+
+VOID
+PtStatusComplete(
+    IN NDIS_HANDLE            ProtocolBindingContext
+    )
+/*++
+
+Routine Description:
+
+    Status-complete handler for the lower-edge (protocol). Forwards the
+    indication to the protocols above under the same conditions PtStatus
+    uses for forwarding status indications.
+
+Arguments:
+
+    ProtocolBindingContext    Pointer to the adapter structure
+
+Return Value:
+
+    None
+
+--*/
+{
+    PADAPT      Adapter = (PADAPT)ProtocolBindingContext;
+
+    //
+    // Drop the indication unless the upper edge miniport is initialized
+    // and both edges are powered on (D0). This also ignores indications
+    // sent by the lower miniport while it isn't at D0.
+    //
+    if ((Adapter->MiniportHandle == NULL) ||
+        (Adapter->MPDeviceState != NdisDeviceStateD0) ||
+        (Adapter->PTDeviceState != NdisDeviceStateD0))
+    {
+        return;
+    }
+
+    NdisMIndicateStatusComplete(Adapter->MiniportHandle);
+}
+
+
+VOID
+PtSendComplete(
+    IN  NDIS_HANDLE            ProtocolBindingContext,
+    IN  PNDIS_PACKET           Packet,
+    IN  NDIS_STATUS            Status
+    )
+/*++
+
+Routine Description:
+
+    Called by NDIS when the miniport below had completed a send. We should
+    complete the corresponding upper-edge send this represents.
+
+Arguments:
+
+    ProtocolBindingContext - Points to ADAPT structure
+    Packet - Low level packet being completed
+    Status - status of send
+
+Return Value:
+
+    None
+
+--*/
+{
+    PADAPT            pAdapt = (PADAPT)ProtocolBindingContext;
+    PNDIS_PACKET      Pkt; 
+    NDIS_HANDLE       PoolHandle;
+
+#ifdef NDIS51
+    //
+    // Packet stacking:
+    //
+    // Determine if the packet we are completing is the one we allocated. If so, then
+    // get the original packet from the reserved area and completed it and free the
+    // allocated packet. If this is the packet that was sent down to us, then just
+    // complete it
+    //
+    PoolHandle = NdisGetPoolFromPacket(Packet);
+    if (PoolHandle != pAdapt->SendPacketPoolHandle)
+    {
+        //
+        // We had passed down a packet belonging to the protocol above us.
+        //
+        // DBGPRINT(("PtSendComp: Adapt %p, Stacked Packet %p\n", pAdapt, Packet));
+
+        NdisMSendComplete(pAdapt->MiniportHandle,
+                          Packet,
+                          Status);
+    }
+    else
+#endif // NDIS51
+    {
+        PSEND_RSVD        SendRsvd;
+
+        SendRsvd = (PSEND_RSVD)(Packet->ProtocolReserved);
+        Pkt = SendRsvd->OriginalPkt;
+
+#if 1  // IPFW - new code
+       //DbgPrint("SendComplete: packet %p pkt %p\n", Packet, Pkt);
+       if (Pkt == NULL) { //this is a reinjected packet, with no 'father'
+               CleanupReinjected(Packet, SendRsvd->pMbuf, pAdapt);
+               return;
+       }
+#endif /* IPFW */
+    
+#ifndef WIN9X
+        NdisIMCopySendCompletePerPacketInfo (Pkt, Packet);
+#endif
+    
+        NdisDprFreePacket(Packet);
+
+        NdisMSendComplete(pAdapt->MiniportHandle,
+                                 Pkt,
+                                 Status);
+    }
+    //
+    // Decrease the outstanding send count
+    //
+    ADAPT_DECR_PENDING_SENDS(pAdapt);
+}       
+
+
+VOID
+PtTransferDataComplete(
+    IN  NDIS_HANDLE         ProtocolBindingContext,
+    IN  PNDIS_PACKET        Packet,
+    IN  NDIS_STATUS         Status,
+    IN  UINT                BytesTransferred
+    )
+/*++
+
+Routine Description:
+
+    Entry point called by NDIS to indicate completion of a call by us
+    to NdisTransferData. The completion is passed on to our miniport edge
+    if that edge exists; otherwise it is dropped.
+
+    See notes under SendComplete.
+
+Arguments:
+
+    ProtocolBindingContext    Pointer to the adapter structure
+    Packet                    Packet the data was transferred into
+    Status                    Completion status
+    BytesTransferred          Number of bytes copied into the packet
+
+Return Value:
+
+    None
+
+--*/
+{
+    PADAPT      Adapter = (PADAPT)ProtocolBindingContext;
+
+    //
+    // No upper edge, nobody to report the completion to.
+    //
+    if (!Adapter->MiniportHandle)
+    {
+        return;
+    }
+
+    NdisMTransferDataComplete(Adapter->MiniportHandle,
+                              Packet,
+                              Status,
+                              BytesTransferred);
+}
+
+
+NDIS_STATUS
+PtReceive(
+    IN  NDIS_HANDLE         ProtocolBindingContext,
+    IN  NDIS_HANDLE         MacReceiveContext,
+    IN  PVOID               HeaderBuffer,
+    IN  UINT                HeaderBufferSize,
+    IN  PVOID               LookAheadBuffer,
+    IN  UINT                LookAheadBufferSize,
+    IN  UINT                PacketSize
+    )
+/*++
+
+Routine Description:
+
+    Handle receive data indicated up by the miniport below. We pass
+    it along to the protocol above us.
+
+    If the miniport below indicates packets, NDIS would more
+    likely call us at our ReceivePacket handler. However we
+    might be called here in certain situations even though
+    the miniport below has indicated a receive packet, e.g.
+    if the miniport had set packet status to NDIS_STATUS_RESOURCES.
+        
+Arguments:
+
+    <see DDK ref page for ProtocolReceive>
+
+Return Value:
+
+    NDIS_STATUS_SUCCESS if we processed the receive successfully,
+    NDIS_STATUS_XXX error code if we discarded it.
+
+--*/
+{
+    PADAPT            pAdapt = (PADAPT)ProtocolBindingContext;
+    PNDIS_PACKET      MyPacket, Packet = NULL;
+    NDIS_STATUS       Status = NDIS_STATUS_SUCCESS;
+    ULONG             Proc = KeGetCurrentProcessorNumber();      
+    
+    if ((!pAdapt->MiniportHandle) || (pAdapt->MPDeviceState > NdisDeviceStateD0))
+    {
+        Status = NDIS_STATUS_FAILURE;
+    }
+    else do
+    {
+        //
+        // Get at the packet, if any, indicated up by the miniport below.
+        //
+        Packet = NdisGetReceivedPacket(pAdapt->BindingHandle, MacReceiveContext);
+        if (Packet != NULL)
+        {              
+            //
+            // The miniport below did indicate up a packet. Use information
+            // from that packet to construct a new packet to indicate up.
+            //
+
+#ifdef NDIS51
+            //
+            // NDIS 5.1 NOTE: Do not reuse the original packet in indicating
+            // up a receive, even if there is sufficient packet stack space.
+            // If we had to do so, we would have had to overwrite the
+            // status field in the original packet to NDIS_STATUS_RESOURCES,
+            // and it is not allowed for protocols to overwrite this field
+            // in received packets.
+            //
+#endif // NDIS51
+
+            //
+            // Get a packet off the pool and indicate that up
+            //
+            NdisDprAllocatePacket(&Status,
+                                &MyPacket,
+                                pAdapt->RecvPacketPoolHandle);
+
+            if (Status == NDIS_STATUS_SUCCESS)
+            {
+                //
+                // Make our packet point to data from the original
+                // packet. NOTE: this works only because we are
+                // indicating a receive directly from the context of
+                // our receive indication. If we need to queue this
+                // packet and indicate it from another thread context,
+                // we will also have to allocate a new buffer and copy
+                // over the packet contents, OOB data and per-packet
+                // information. This is because the packet data
+                // is available only for the duration of this
+                // receive indication call.
+                //
+                NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet);
+                NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet);
+
+                //
+                // Get the original packet (it could be the same packet as the
+                // one received or a different one based on the number of layered
+                // miniports below) and set it on the indicated packet so the OOB
+                // data is visible correctly at protocols above.  If the IM driver 
+                // modifies the packet in any way it should not set the new packet's
+                // original packet equal to the original packet of the packet that 
+                // was indicated to it from the underlying driver, in this case, the 
+                // IM driver should also ensure that the related per packet info should
+                // be copied to the new packet.
+                // we can set the original packet to the original packet of the packet
+                // indicated from the underlying driver because the driver doesn't modify
+                // the data content in the packet.
+                //
+                NDIS_SET_ORIGINAL_PACKET(MyPacket, NDIS_GET_ORIGINAL_PACKET(Packet));
+                NDIS_SET_PACKET_HEADER_SIZE(MyPacket, HeaderBufferSize);
+
+                //
+                // Copy packet flags.
+                //
+                NdisGetPacketFlags(MyPacket) = NdisGetPacketFlags(Packet);
+
+                //
+                // Force protocols above to make a copy if they want to hang
+                // on to data in this packet. This is because we are in our
+                // Receive handler (not ReceivePacket) and we can't return a
+                // ref count from here.
+                //
+                NDIS_SET_PACKET_STATUS(MyPacket, NDIS_STATUS_RESOURCES);
+
+                //
+                // By setting NDIS_STATUS_RESOURCES, we also know that we can reclaim
+                // this packet as soon as the call to NdisMIndicateReceivePacket
+                // returns.
+                //
+
+                if (pAdapt->MiniportHandle != NULL)
+                {
+#if 1  /* IPFW: query the firewall */
+                                       int     ret;
+                                       ret = ipfw2_qhandler_w32(MyPacket, INCOMING,
+                                               ProtocolBindingContext);
+                                       if (ret != PASS)
+                                       return 0; //otherwise simply continue
+#endif /* end of IPFW code */
+                    NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &MyPacket, 1);
+                }
+
+                //
+                // Reclaim the indicated packet. Since we had set its status
+                // to NDIS_STATUS_RESOURCES, we are guaranteed that protocols
+                // above are done with it.
+                //
+                NdisDprFreePacket(MyPacket);
+
+                break;
+            }
+        }
+        else
+        {
+            //
+            // The miniport below us uses the old-style (not packet)
+            // receive indication. Fall through.
+            //
+        }
+
+        //
+        // Fall through if the miniport below us has either not
+        // indicated a packet or we could not allocate one
+        //
+        pAdapt->ReceivedIndicationFlags[Proc] = TRUE;
+        if (pAdapt->MiniportHandle == NULL)
+        {
+            break;
+        }
+        switch (pAdapt->Medium)
+        {
+            case NdisMedium802_3:
+            case NdisMediumWan:
+                               //DbgPrint("EthIndicateReceive context %p, header at %p len %u, lookahead at %p len %u, packetsize %u\n",ProtocolBindingContext,HeaderBuffer,HeaderBufferSize,LookAheadBuffer,LookAheadBufferSize,PacketSize);
+                               //hexdump(HeaderBuffer,HeaderBufferSize+LookAheadBufferSize,"EthIndicateReceive");
+                       {
+                               int ret = ipfw2_qhandler_w32_oldstyle(INCOMING, ProtocolBindingContext, HeaderBuffer, HeaderBufferSize, LookAheadBuffer, LookAheadBufferSize, PacketSize);
+                               if (ret != PASS)
+                                       return NDIS_STATUS_SUCCESS;
+                       }
+                NdisMEthIndicateReceive(pAdapt->MiniportHandle,
+                                             MacReceiveContext,
+                                             HeaderBuffer,
+                                             HeaderBufferSize,
+                                             LookAheadBuffer,
+                                             LookAheadBufferSize,
+                                             PacketSize);
+                break;
+
+            case NdisMedium802_5:
+                NdisMTrIndicateReceive(pAdapt->MiniportHandle,
+                                            MacReceiveContext,
+                                            HeaderBuffer,
+                                            HeaderBufferSize,
+                                            LookAheadBuffer,
+                                            LookAheadBufferSize,
+                                            PacketSize);
+                break;
+
+#if FDDI
+                 case NdisMediumFddi:
+                        NdisMFddiIndicateReceive(pAdapt->MiniportHandle,
+                                                                                         MacReceiveContext,
+                                                                                         HeaderBuffer,
+                                                                                         HeaderBufferSize,
+                                                                                         LookAheadBuffer,
+                                                                                         LookAheadBufferSize,
+                                                                                         PacketSize);
+                        break;
+#endif
+                 default:
+                        ASSERT(FALSE);
+                        break;
+               }
+
+    } while(FALSE);
+
+    return Status;
+}
+
+
+VOID
+PtReceiveComplete(
+    IN NDIS_HANDLE        ProtocolBindingContext
+    )
+/*++
+
+Routine Description:
+
+    Called by the adapter below us when it is done indicating a batch of
+    received packets.
+
+Arguments:
+
+    ProtocolBindingContext    Pointer to our adapter structure.
+
+Return Value:
+
+    None
+
+--*/
+{
+    PADAPT        pAdapt =(PADAPT)ProtocolBindingContext;
+    ULONG         Proc = KeGetCurrentProcessorNumber();      
+       
+       /* Warning: this is a poor implementation of the PtReceiveComplete
+        * made by MS, and it's a well known (but never fixed) issue.
+        * Since the ProcessorNumber here can be different from the one
+        * that processed the PtReceive, sometimes NdisMEthIndicateReceiveComplete
+        * will not be called, causing poor performance in the incoming traffic.
+        * In our driver, PtReceive is called for IP packets ONLY by particulary 
+        * old NIC drivers, and the poor performance can be seen even 
+        * in traffic not handled by ipfw or dummynet.
+        * Fortunately, this is quite rare, all the incoming IP packets
+        * will arrive through PtReceivePacket, and this callback will never
+        * be called. For reinjected traffic, a workaround is done
+        * commuting the ReceivedIndicationFlag and calling
+        * NdisMEthIndicateReceiveComplete manually for each packet.
+        */
+
+    if (((pAdapt->MiniportHandle != NULL)
+                && (pAdapt->MPDeviceState == NdisDeviceStateD0))
+                && (pAdapt->ReceivedIndicationFlags[Proc]))
+    {
+        switch (pAdapt->Medium)
+        {
+            case NdisMedium802_3:
+            case NdisMediumWan:
+                NdisMEthIndicateReceiveComplete(pAdapt->MiniportHandle);
+                break;
+
+                 case NdisMedium802_5:
+                       NdisMTrIndicateReceiveComplete(pAdapt->MiniportHandle);
+                       break;
+#if FDDI
+                 case NdisMediumFddi:
+                       NdisMFddiIndicateReceiveComplete(pAdapt->MiniportHandle);
+                       break;
+#endif
+                 default:
+                       ASSERT(FALSE);
+                       break;
+               }
+       }
+
+    pAdapt->ReceivedIndicationFlags[Proc] = FALSE;
+}
+
+
+INT
+PtReceivePacket(
+    IN NDIS_HANDLE            ProtocolBindingContext,
+    IN PNDIS_PACKET           Packet
+    )
+/*++
+
+Routine Description:
+
+    ReceivePacket handler. Called by NDIS if the miniport below supports
+    NDIS 4.0 style receives. Re-package the buffer chain in a new packet
+    and indicate the new packet to protocols above us. Any context for
+    packets indicated up must be kept in the MiniportReserved field.
+
+    NDIS 5.1 - packet stacking - if there is sufficient "stack space" in
+    the packet passed to us, we can use the same packet in a receive
+    indication. In this version that path is compiled but disabled
+    (the test is "0 && Remaining"), so every packet is re-packaged.
+
+    IPFW: the re-packaged packet is first handed to ipfw2_qhandler_w32()
+    with direction INCOMING. Only when it returns PASS is the packet
+    indicated up with NdisMIndicateReceivePacket. For any other result
+    this routine returns 0 at once, without indicating and without
+    freeing MyPacket here.
+    NOTE(review): presumably ipfw2_qhandler_w32() takes care of MyPacket
+    when it does not return PASS - confirm, otherwise the packet taken
+    from RecvPacketPoolHandle is never returned to the pool.
+
+Arguments:
+
+    ProtocolBindingContext - Pointer to our adapter structure.
+    Packet - Pointer to the packet
+
+Return Value:
+
+    == 0 -> We are done with the packet
+    != 0 -> We will keep the packet and call NdisReturnPackets() this
+            many times when done.
+
+    As coded: 1 when the packet went through the PASS path and the status
+    of the original packet was not NDIS_STATUS_RESOURCES; 0 in all other
+    cases (miniport edge missing or above D0, no packet available in the
+    pool, firewall result other than PASS, or NDIS_STATUS_RESOURCES).
+--*/
+{
+    PADAPT              pAdapt =(PADAPT)ProtocolBindingContext;
+    NDIS_STATUS         Status;
+    PNDIS_PACKET        MyPacket;
+    BOOLEAN             Remaining;
+
+    //
+    // Drop the packet silently if the upper miniport edge isn't initialized or
+    // the miniport edge is in low power state
+    //
+    if ((!pAdapt->MiniportHandle) || (pAdapt->MPDeviceState > NdisDeviceStateD0))
+    {
+          return 0;
+    }
+
+#ifdef NDIS51
+    //
+    // Check if we can reuse the same packet for indicating up.
+    // See also: PtReceive(). 
+    // The "0 &&" in the test below makes this reuse path unreachable.
+    (VOID)NdisIMGetCurrentPacketStack(Packet, &Remaining);
+    if (0 && Remaining)
+    {
+        //
+        // We can reuse "Packet". Indicate it up and be done with it.
+        //
+        Status = NDIS_GET_PACKET_STATUS(Packet);
+        NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &Packet, 1);
+        return((Status != NDIS_STATUS_RESOURCES) ? 1 : 0);
+    }
+#endif // NDIS51
+
+    //
+    // Get a packet off the pool and indicate that up
+    //
+    NdisDprAllocatePacket(&Status,
+                           &MyPacket,
+                           pAdapt->RecvPacketPoolHandle);
+
+    if (Status == NDIS_STATUS_SUCCESS)
+    {
+        PRECV_RSVD            RecvRsvd;
+
+        RecvRsvd = (PRECV_RSVD)(MyPacket->MiniportReserved);
+        RecvRsvd->OriginalPkt = Packet;
+
+        NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet);
+        NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet);
+
+        //
+        // Get the original packet (it could be the same packet as the one
+        // received or a different one based on the number of layered miniports
+        // below) and set it on the indicated packet so the OOB data is visible
+        // correctly to protocols above us.
+        //
+        NDIS_SET_ORIGINAL_PACKET(MyPacket, NDIS_GET_ORIGINAL_PACKET(Packet));
+
+        //
+        // Set Packet Flags
+        //
+        NdisGetPacketFlags(MyPacket) = NdisGetPacketFlags(Packet);
+
+        Status = NDIS_GET_PACKET_STATUS(Packet);
+
+        NDIS_SET_PACKET_STATUS(MyPacket, Status);
+        NDIS_SET_PACKET_HEADER_SIZE(MyPacket, NDIS_GET_PACKET_HEADER_SIZE(Packet));
+
+        if (pAdapt->MiniportHandle != NULL)
+        {
+#if 1  /* IPFW: query the firewall; only a PASS verdict lets the packet go up */
+           int ret;
+           ret = ipfw2_qhandler_w32(MyPacket, INCOMING,
+                       ProtocolBindingContext);
+           if (ret != PASS)
+                       return 0; // not PASS: stop here, MyPacket is not freed on this path; otherwise simply continue
+#endif /* end of IPFW code */
+            NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &MyPacket, 1);
+        }
+
+        //
+        // Check if we had indicated up the packet with NDIS_STATUS_RESOURCES
+        // NOTE -- do not use NDIS_GET_PACKET_STATUS(MyPacket) for this since
+        // it might have changed! Use the value saved in the local variable.
+        //
+        if (Status == NDIS_STATUS_RESOURCES)
+        {
+            //
+            // Our ReturnPackets handler will not be called for this packet.
+            // We should reclaim it right here.
+            //
+            NdisDprFreePacket(MyPacket);
+        }
+
+        return((Status != NDIS_STATUS_RESOURCES) ? 1 : 0);
+    }
+    else
+    {
+        //
+        // We are out of packets. Silently drop it.
+        //
+        return(0);
+    }
+}
+
+
+NDIS_STATUS
+PtPNPHandler(
+    IN NDIS_HANDLE        ProtocolBindingContext,
+    IN PNET_PNP_EVENT     pNetPnPEvent
+    )
+
+/*++
+Routine Description:
+
+    This is called by NDIS to notify us of a PNP event related to a lower
+    binding. Based on the event, this dispatches to other helper routines.
+
+    NDIS 5.1: forward this event to the upper protocol(s) by calling
+    NdisIMNotifyPnPEvent.
+
+Arguments:
+
+    ProtocolBindingContext - Pointer to our adapter structure. Can be NULL
+                for "global" notifications
+
+    pNetPnPEvent - Pointer to the PNP event to be processed.
+
+Return Value:
+
+    NDIS_STATUS code indicating status of event processing.
+
+--*/
+{
+    PADAPT            pAdapt  =(PADAPT)ProtocolBindingContext;
+    NDIS_STATUS       Status  = NDIS_STATUS_SUCCESS;
+
+    DBGPRINT(("PtPnPHandler: Adapt %p, Event %d\n", pAdapt, pNetPnPEvent->NetEvent));
+
+    switch (pNetPnPEvent->NetEvent)
+    {
+        case NetEventSetPower:
+            Status = PtPnPNetEventSetPower(pAdapt, pNetPnPEvent);
+            break;
+
+         case NetEventReconfigure:
+            Status = PtPnPNetEventReconfigure(pAdapt, pNetPnPEvent);
+            break;
+
+         default:
+#ifdef NDIS51
+            //
+            // Pass on this notification to protocol(s) above, before
+            // doing anything else with it.
+            //
+            if (pAdapt && pAdapt->MiniportHandle)
+            {
+                Status = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent);
+            }
+#else
+            Status = NDIS_STATUS_SUCCESS;
+
+#endif // NDIS51
+
+            break;
+    }
+
+    return Status;
+}
+
+
+NDIS_STATUS
+PtPnPNetEventReconfigure(
+    IN PADAPT            pAdapt,
+    IN PNET_PNP_EVENT    pNetPnPEvent
+    )
+/*++
+Routine Description:
+
+    This routine is called from NDIS to notify our protocol edge of a
+    reconfiguration of parameters for either a specific binding (pAdapt
+    is not NULL), or global parameters if any (pAdapt is NULL).
+
+Arguments:
+
+    pAdapt - Pointer to our adapter structure.
+    pNetPnPEvent - the reconfigure event
+
+Return Value:
+
+    NDIS_STATUS_SUCCESS
+
+--*/
+{
+    NDIS_STATUS    ReconfigStatus = NDIS_STATUS_SUCCESS;
+    NDIS_STATUS    ReturnStatus = NDIS_STATUS_SUCCESS;
+
+    do
+    {
+        //
+        // Is this is a global reconfiguration notification ?
+        //
+        if (pAdapt == NULL)
+        {
+            //
+            // An important event that causes this notification to us is if
+            // one of our upper-edge miniport instances was enabled after being
+            // disabled earlier, e.g. from Device Manager in Win2000. Note that
+            // NDIS calls this because we had set up an association between our
+            // miniport and protocol entities by calling NdisIMAssociateMiniport.
+            //
+            // Since we would have torn down the lower binding for that miniport,
+            // we need NDIS' assistance to re-bind to the lower miniport. The
+            // call to NdisReEnumerateProtocolBindings does exactly that.
+            //
+            NdisReEnumerateProtocolBindings (ProtHandle);        
+            
+            break;
+        }
+
+#ifdef NDIS51
+        //
+        // Pass on this notification to protocol(s) above before doing anything
+        // with it.
+        //
+        if (pAdapt->MiniportHandle)
+        {
+            ReturnStatus = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent);
+        }
+#endif // NDIS51
+
+        ReconfigStatus = NDIS_STATUS_SUCCESS;
+
+    } while(FALSE);
+
+    DBGPRINT(("<==PtPNPNetEventReconfigure: pAdapt %p\n", pAdapt));
+
+#ifdef NDIS51
+    //
+    // Overwrite status with what upper-layer protocol(s) returned.
+    //
+    ReconfigStatus = ReturnStatus;
+#endif
+
+    return ReconfigStatus;
+}
+
+
+NDIS_STATUS
+PtPnPNetEventSetPower(
+    IN PADAPT            pAdapt,
+    IN PNET_PNP_EVENT    pNetPnPEvent
+    )
+/*++
+Routine Description:
+
+    This is a notification to our protocol edge of the power state
+    of the lower miniport. If it is going to a low-power state, we must
+    wait here for all outstanding sends and requests to complete.
+
+    NDIS 5.1:  Since we use packet stacking, it is not sufficient to
+    check usage of our local send packet pool to detect whether or not
+    all outstanding sends have completed. For this, use the new API
+    NdisQueryPendingIOCount.
+
+    NDIS 5.1: Use the 5.1 API NdisIMNotifyPnPEvent to pass on PnP
+    notifications to upper protocol(s).
+
+    Summary of what the code does:
+
+    The new state read from pNetPnPEvent->Buffer is stored in
+    pAdapt->PTDeviceState while holding pAdapt->Lock.
+
+    New state above D0 (going to low power): StandingBy is set if the
+    previous state was D0; the lock is released; NDIS 5.1 builds notify
+    the upper protocol(s); the routine then loops on NdisMSleep(2) until
+    OutstandingSends is 0 and OutstandingRequests is no longer TRUE; last,
+    a queued request, if any, is completed with NDIS_STATUS_FAILURE
+    through PtRequestComplete.
+
+    New state D0 (powering up): StandingBy is cleared if the previous
+    state was above D0; a queued request, if any, is marked outstanding
+    and submitted with NdisRequest (and completed here through
+    PtRequestComplete unless it returns NDIS_STATUS_PENDING); NDIS 5.1
+    builds then notify the upper protocol(s).
+
+    NOTE(review): pAdapt is dereferenced unconditionally, so this routine
+    must never be reached with a NULL binding context - confirm against
+    the caller. The two wait loops read OutstandingSends and
+    OutstandingRequests without holding pAdapt->Lock - confirm this is
+    intended.
+
+Arguments:
+
+    pAdapt            -    Pointer to the adapter structure
+    pNetPnPEvent    -    The Net Pnp Event. this contains the new device state
+
+Return Value:
+
+    NDIS_STATUS_SUCCESS or the status returned by upper-layer protocols
+    (the latter only in NDIS 5.1 builds, when a miniport handle exists).
+
+--*/
+{
+    PNDIS_DEVICE_POWER_STATE       pDeviceState  =(PNDIS_DEVICE_POWER_STATE)(pNetPnPEvent->Buffer);
+    NDIS_DEVICE_POWER_STATE        PrevDeviceState = pAdapt->PTDeviceState;  
+    NDIS_STATUS                    Status;
+    NDIS_STATUS                    ReturnStatus;
+
+    ReturnStatus = NDIS_STATUS_SUCCESS;
+
+    //
+    // Set the Internal Device State, this blocks all new sends or receives
+    //
+    NdisAcquireSpinLock(&pAdapt->Lock);
+    pAdapt->PTDeviceState = *pDeviceState;
+
+    //
+    // Check if the miniport below is going to a low power state.
+    //
+    if (pAdapt->PTDeviceState > NdisDeviceStateD0)
+    {
+        //
+        // If the miniport below is going to standby, fail all incoming requests
+        //
+        if (PrevDeviceState == NdisDeviceStateD0)
+        {
+            pAdapt->StandingBy = TRUE;
+        }
+
+        NdisReleaseSpinLock(&pAdapt->Lock);
+
+#ifdef NDIS51
+        //
+        // Notify upper layer protocol(s) first.
+        //
+        if (pAdapt->MiniportHandle != NULL)
+        {
+            ReturnStatus = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent);
+        }
+#endif // NDIS51
+
+        //
+        // Wait for outstanding sends and requests to complete.
+        //
+        while (pAdapt->OutstandingSends != 0)
+        {
+            NdisMSleep(2);
+        }
+
+        while (pAdapt->OutstandingRequests == TRUE)
+        {
+            //
+            // sleep till outstanding requests complete
+            //
+            NdisMSleep(2);
+        }
+
+        //
+        // If the below miniport is going to low power state, complete the queued request
+        //
+        NdisAcquireSpinLock(&pAdapt->Lock);
+        if (pAdapt->QueuedRequest)
+        {
+            pAdapt->QueuedRequest = FALSE;
+            NdisReleaseSpinLock(&pAdapt->Lock);
+            PtRequestComplete(pAdapt, &pAdapt->Request, NDIS_STATUS_FAILURE);
+        }
+        else
+        {
+            NdisReleaseSpinLock(&pAdapt->Lock);
+        }
+            
+
+        ASSERT(NdisPacketPoolUsage(pAdapt->SendPacketPoolHandle) == 0);
+        ASSERT(pAdapt->OutstandingRequests == FALSE);
+    }
+    else
+    {
+        //
+        // If the physical miniport is powering up (from Low power state to D0), 
+        // clear the flag
+        //
+        if (PrevDeviceState > NdisDeviceStateD0)
+        {
+            pAdapt->StandingBy = FALSE;
+        }
+        //
+        // The device below is being turned on. If we had a request
+        // pending, send it down now.
+        //
+        if (pAdapt->QueuedRequest == TRUE)
+        {
+            pAdapt->QueuedRequest = FALSE;
+        
+            pAdapt->OutstandingRequests = TRUE;
+            NdisReleaseSpinLock(&pAdapt->Lock);
+
+            NdisRequest(&Status,
+                        pAdapt->BindingHandle,
+                        &pAdapt->Request);
+
+            if (Status != NDIS_STATUS_PENDING)
+            {
+                PtRequestComplete(pAdapt,
+                                  &pAdapt->Request,
+                                  Status);
+                
+            }
+        }
+        else
+        {
+            NdisReleaseSpinLock(&pAdapt->Lock);
+        }
+
+
+#ifdef NDIS51
+        //
+        // Pass on this notification to protocol(s) above
+        //
+        if (pAdapt->MiniportHandle)
+        {
+            ReturnStatus = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent);
+        }
+#endif // NDIS51
+
+    }
+
+    return ReturnStatus;
+}
+
+VOID
+PtReferenceAdapt(
+    IN PADAPT     pAdapt
+    )
+/*++
+
+Routine Description:
+
+    Takes one reference on an adapter structure: RefCount is incremented
+    while holding the adapter spin lock.
+
+Arguments:
+
+    pAdapt    Pointer to the adapter structure to reference.
+
+Return Value:
+
+    None
+
+--*/
+{
+    NdisAcquireSpinLock(&pAdapt->Lock);
+
+    //
+    // The count must never have gone negative.
+    //
+    ASSERT(pAdapt->RefCount >= 0);
+    pAdapt->RefCount += 1;
+
+    NdisReleaseSpinLock(&pAdapt->Lock);
+}
+
+
+BOOLEAN
+PtDereferenceAdapt(
+    IN PADAPT     pAdapt
+    )
+/*++
+
+Routine Description:
+
+    Drops one reference on an adapter structure. RefCount is decremented
+    while holding the adapter spin lock; when it reaches zero the packet
+    pools, the spin lock and the structure itself are released.
+
+Arguments:
+
+    pAdapt    Pointer to the adapter structure to dereference.
+
+Return Value:
+
+    TRUE  if this was the last reference and the structure was freed
+          (pAdapt must not be used afterwards).
+    FALSE if other references remain.
+
+--*/
+{
+    BOOLEAN       LastReference;
+
+    NdisAcquireSpinLock(&pAdapt->Lock);
+
+    ASSERT(pAdapt->RefCount > 0);
+    pAdapt->RefCount--;
+    LastReference = (BOOLEAN)(pAdapt->RefCount == 0);
+
+    NdisReleaseSpinLock(&pAdapt->Lock);
+
+    if (!LastReference)
+    {
+        return FALSE;
+    }
+
+    //
+    // Nobody else holds the adapter: free all resources attached to it,
+    // then the structure itself.
+    //
+    MPFreeAllPacketPools(pAdapt);
+    NdisFreeSpinLock(&pAdapt->Lock);
+    NdisFreeMemory(pAdapt, 0, 0);
+
+    return TRUE;
+}
+
+
diff --git a/original_passthru/makefile b/original_passthru/makefile
new file mode 100644 (file)
index 0000000..c6c9e94
--- /dev/null
@@ -0,0 +1,22 @@
+#\r
+# DO NOT EDIT THIS FILE!!!  Edit .\sources. if you want to add a new source\r
+# file to this component.  This file merely indirects to the real make file\r
+# that is shared by all the components of NT\r
+#\r
+\r
+#!INCLUDE $(NTMAKEENV)\makefile.def\r
+\r
+\r
+!IF DEFINED(_NT_TARGET_VERSION)\r
+!      IF $(_NT_TARGET_VERSION)>=0x501\r
+!              INCLUDE $(NTMAKEENV)\makefile.def\r
+!      ELSE\r
+#               Only warn once per directory\r
+!               INCLUDE $(NTMAKEENV)\makefile.plt\r
+!               IF "$(BUILD_PASS)"=="PASS1"\r
+!                  message BUILDMSG: Warning : The sample "$(MAKEDIR)" is not valid for the current OS target.\r
+!               ENDIF\r
+!      ENDIF\r
+!ELSE\r
+!      INCLUDE $(NTMAKEENV)\makefile.def\r
+!ENDIF\r
diff --git a/original_passthru/miniport.c b/original_passthru/miniport.c
new file mode 100644 (file)
index 0000000..a7f3bbc
--- /dev/null
@@ -0,0 +1,1461 @@
+/*++\r
+\r
+Copyright (c) 1992-2000  Microsoft Corporation\r
+\r
+Module Name:\r
+\r
+    miniport.c\r
+\r
+Abstract:\r
+\r
+    Ndis Intermediate Miniport driver sample. This is a passthru driver.\r
+\r
+Author:\r
+\r
+Environment:\r
+\r
+\r
+Revision History:\r
+\r
+\r
+--*/\r
+\r
+#include "precomp.h"\r
+#pragma hdrstop\r
+\r
+\r
+\r
+NDIS_STATUS\r
+MPInitialize(\r
+    OUT PNDIS_STATUS             OpenErrorStatus,\r
+    OUT PUINT                    SelectedMediumIndex,\r
+    IN  PNDIS_MEDIUM             MediumArray,\r
+    IN  UINT                     MediumArraySize,\r
+    IN  NDIS_HANDLE              MiniportAdapterHandle,\r
+    IN  NDIS_HANDLE              WrapperConfigurationContext\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    This is the initialize handler which gets called as a result of\r
+    the BindAdapter handler calling NdisIMInitializeDeviceInstanceEx.\r
+    The context parameter which we pass there is the adapter structure\r
+    which we retrieve here.\r
+\r
+    On success the adapter is given its miniport handle, both power\r
+    states are set to D0, the adapter is linked at the head of the global\r
+    pAdaptList (under GlobalLock), PtRegisterDevice is called and a\r
+    reference is taken with PtReferenceAdapt.\r
+\r
+    On every path, success or failure, MiniportInitPending is cleared and\r
+    MiniportInitEvent is set before returning.\r
+\r
+    NOTE(review): the pointer returned by NdisIMGetDeviceContext is used\r
+    without a NULL check, and the result of PtRegisterDevice is discarded\r
+    - confirm both are intended.\r
+\r
+    Arguments:\r
+\r
+    OpenErrorStatus            Receives a copy of the returned status.\r
+    SelectedMediumIndex        Place-holder for what media we are using\r
+    MediumArray                Array of ndis media passed down to us to pick from\r
+    MediumArraySize            Size of the array\r
+    MiniportAdapterHandle    The handle NDIS uses to refer to us\r
+    WrapperConfigurationContext    For use by NdisOpenConfiguration (unreferenced here)\r
+\r
+Return Value:\r
+\r
+    NDIS_STATUS_SUCCESS unless something goes wrong;\r
+    NDIS_STATUS_UNSUPPORTED_MEDIA when the medium we export is not found\r
+    in MediumArray.\r
+\r
+--*/\r
+{\r
+    UINT            i;\r
+    PADAPT          pAdapt;\r
+    NDIS_STATUS     Status = NDIS_STATUS_FAILURE;\r
+    NDIS_MEDIUM     Medium;\r
+\r
+    UNREFERENCED_PARAMETER(WrapperConfigurationContext);\r
+    \r
+    do\r
+    {\r
+        //\r
+        // Start off by retrieving our adapter context and storing\r
+        // the Miniport handle in it.\r
+        //\r
+        pAdapt = NdisIMGetDeviceContext(MiniportAdapterHandle);\r
+        pAdapt->MiniportIsHalted = FALSE;\r
+\r
+        DBGPRINT(("==> Miniport Initialize: Adapt %p\n", pAdapt));\r
+\r
+        //\r
+        // Usually we export the medium type of the adapter below as our\r
+        // virtual miniport's medium type. However if the adapter below us\r
+        // is a WAN device, then we claim to be of medium type 802.3.\r
+        //\r
+        Medium = pAdapt->Medium;\r
+\r
+        if (Medium == NdisMediumWan)\r
+        {\r
+            Medium = NdisMedium802_3;\r
+        }\r
+\r
+        for (i = 0; i < MediumArraySize; i++)\r
+        {\r
+            if (MediumArray[i] == Medium)\r
+            {\r
+                *SelectedMediumIndex = i;\r
+                break;\r
+            }\r
+        }\r
+\r
+        if (i == MediumArraySize)\r
+        {\r
+            Status = NDIS_STATUS_UNSUPPORTED_MEDIA;\r
+            break;\r
+        }\r
+\r
+\r
+        //\r
+        // Set the attributes now. NDIS_ATTRIBUTE_DESERIALIZE enables us\r
+        // to make up-calls to NDIS without having to call NdisIMSwitchToMiniport\r
+        // or NdisIMQueueCallBack. This also forces us to protect our data using\r
+        // spinlocks where appropriate. Also in this case NDIS does not queue\r
+        // packets on our behalf. Since this is a very simple pass-thru\r
+        // miniport, we do not have a need to protect anything. However in\r
+        // a general case there will be a need to use per-adapter spin-locks\r
+        // for the packet queues at the very least.\r
+        //\r
+        NdisMSetAttributesEx(MiniportAdapterHandle,\r
+                             pAdapt,\r
+                             0,                                        // CheckForHangTimeInSeconds\r
+                             NDIS_ATTRIBUTE_IGNORE_PACKET_TIMEOUT    |\r
+                                NDIS_ATTRIBUTE_IGNORE_REQUEST_TIMEOUT|\r
+                                NDIS_ATTRIBUTE_INTERMEDIATE_DRIVER |\r
+                                NDIS_ATTRIBUTE_DESERIALIZE |\r
+                                NDIS_ATTRIBUTE_NO_HALT_ON_SUSPEND,\r
+                             0);\r
+\r
+        pAdapt->MiniportHandle = MiniportAdapterHandle;\r
+        //\r
+        // Initialize LastIndicatedStatus to be NDIS_STATUS_MEDIA_CONNECT\r
+        //\r
+        pAdapt->LastIndicatedStatus = NDIS_STATUS_MEDIA_CONNECT;\r
+        \r
+        //\r
+        // Initialize the power states for both the lower binding (PTDeviceState)\r
+        // and our miniport edge to Powered On.\r
+        //\r
+        pAdapt->MPDeviceState = NdisDeviceStateD0;\r
+        pAdapt->PTDeviceState = NdisDeviceStateD0;\r
+\r
+        //\r
+        // Add this adapter to the global pAdapt List\r
+        //\r
+        NdisAcquireSpinLock(&GlobalLock);\r
+\r
+        pAdapt->Next = pAdaptList;\r
+        pAdaptList = pAdapt;\r
+\r
+        NdisReleaseSpinLock(&GlobalLock);\r
+        \r
+        //\r
+        // Create an ioctl interface\r
+        //\r
+        (VOID)PtRegisterDevice();\r
+\r
+        Status = NDIS_STATUS_SUCCESS;\r
+    }\r
+    while (FALSE);\r
+\r
+    //\r
+    // If we had received an UnbindAdapter notification on the underlying\r
+    // adapter, we would have blocked that thread waiting for the IM Init\r
+    // process to complete. Wake up any such thread.\r
+    //\r
+    ASSERT(pAdapt->MiniportInitPending == TRUE);\r
+    pAdapt->MiniportInitPending = FALSE;\r
+    NdisSetEvent(&pAdapt->MiniportInitEvent);\r
+\r
+    if (Status == NDIS_STATUS_SUCCESS)\r
+    {\r
+        PtReferenceAdapt(pAdapt);\r
+    }\r
+\r
+    DBGPRINT(("<== Miniport Initialize: Adapt %p, Status %x\n", pAdapt, Status));\r
+\r
+    *OpenErrorStatus = Status;\r
+\r
+    \r
+    return Status;\r
+}\r
+\r
+\r
+NDIS_STATUS\r
+MPSend(\r
+    IN NDIS_HANDLE             MiniportAdapterContext,\r
+    IN PNDIS_PACKET            Packet,\r
+    IN UINT                    Flags\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Send Packet handler. Either this or our SendPackets (array) handler is called\r
+    based on which one is enabled in our Miniport Characteristics.\r
+\r
+    The send is refused with NDIS_STATUS_FAILURE while either our miniport\r
+    edge (MPDeviceState) or the lower binding (PTDeviceState) is above D0.\r
+    Otherwise OutstandingSends is incremented under pAdapt->Lock and the\r
+    packet goes down with NdisSend: as is when NDIS 5.1 packet stacking\r
+    has room left, else wrapped in a packet taken from\r
+    SendPacketPoolHandle that points at the same buffer chain and carries\r
+    a pointer to the original packet in its ProtocolReserved area.\r
+\r
+    ADAPT_DECR_PENDING_SENDS is invoked here whenever NdisSend returns\r
+    something other than NDIS_STATUS_PENDING, and also when no wrapper\r
+    packet could be allocated. In the non-pending wrapper case the wrapper\r
+    is freed here as well.\r
+\r
+Arguments:\r
+\r
+    MiniportAdapterContext    Pointer to the adapter\r
+    Packet                    Packet to send\r
+    Flags                     Stored as the packet flags of the wrapper\r
+                              packet; not used on the packet-stacking path\r
+\r
+Return Value:\r
+\r
+    Return code from NdisSend, or NDIS_STATUS_FAILURE when a power state\r
+    is above D0, or the status set by NdisAllocatePacket when no wrapper\r
+    packet is available.\r
+\r
+--*/\r
+{\r
+    PADAPT              pAdapt = (PADAPT)MiniportAdapterContext;\r
+    NDIS_STATUS         Status;\r
+    PNDIS_PACKET        MyPacket;\r
+    PVOID               MediaSpecificInfo = NULL;\r
+    ULONG               MediaSpecificInfoSize = 0;\r
+\r
+    //\r
+    // The driver should fail the send if the virtual miniport is in low \r
+    // power state\r
+    //\r
+    if (pAdapt->MPDeviceState > NdisDeviceStateD0)\r
+    {\r
+         return NDIS_STATUS_FAILURE;\r
+    }\r
+\r
+#ifdef NDIS51\r
+    //\r
+    // Use NDIS 5.1 packet stacking:\r
+    //\r
+    {\r
+        PNDIS_PACKET_STACK        pStack;\r
+        BOOLEAN                   Remaining;\r
+\r
+        //\r
+        // Packet stacks: Check if we can use the same packet for sending down.\r
+        //\r
+\r
+        pStack = NdisIMGetCurrentPacketStack(Packet, &Remaining);\r
+        if (Remaining)\r
+        {\r
+            //\r
+            // We can reuse "Packet".\r
+            //\r
+            // NOTE: if we needed to keep per-packet information in packets\r
+            // sent down, we can use pStack->IMReserved[].\r
+            //\r
+            ASSERT(pStack);\r
+            //\r
+            // If the below miniport is going to low power state, stop sending down any packet.\r
+            //\r
+            NdisAcquireSpinLock(&pAdapt->Lock);\r
+            if (pAdapt->PTDeviceState > NdisDeviceStateD0)\r
+            {\r
+                NdisReleaseSpinLock(&pAdapt->Lock);\r
+                return NDIS_STATUS_FAILURE;\r
+            }\r
+            pAdapt->OutstandingSends++;\r
+            NdisReleaseSpinLock(&pAdapt->Lock);\r
+            NdisSend(&Status,\r
+                     pAdapt->BindingHandle,\r
+                     Packet);\r
+\r
+            if (Status != NDIS_STATUS_PENDING)\r
+            {\r
+                ADAPT_DECR_PENDING_SENDS(pAdapt);\r
+            }\r
+\r
+            return(Status);\r
+        }\r
+    }\r
+#endif // NDIS51\r
+\r
+    //\r
+    // We are either not using packet stacks, or there isn't stack space\r
+    // in the original packet passed down to us. Allocate a new packet\r
+    // to wrap the data with.\r
+    //\r
+    //\r
+    // If the below miniport is going to low power state, stop sending down any packet.\r
+    //\r
+    NdisAcquireSpinLock(&pAdapt->Lock);\r
+    if (pAdapt->PTDeviceState > NdisDeviceStateD0)\r
+    {\r
+        NdisReleaseSpinLock(&pAdapt->Lock);\r
+        return NDIS_STATUS_FAILURE;\r
+    \r
+    }\r
+    pAdapt->OutstandingSends++;\r
+    NdisReleaseSpinLock(&pAdapt->Lock);\r
+    \r
+    NdisAllocatePacket(&Status,\r
+                       &MyPacket,\r
+                       pAdapt->SendPacketPoolHandle);\r
+\r
+    if (Status == NDIS_STATUS_SUCCESS)\r
+    {\r
+        PSEND_RSVD            SendRsvd;\r
+\r
+        //\r
+        // Save a pointer to the original packet in our reserved\r
+        // area in the new packet. This is needed so that we can\r
+        // get back to the original packet when the new packet's send\r
+        // is completed.\r
+        //\r
+        SendRsvd = (PSEND_RSVD)(MyPacket->ProtocolReserved);\r
+        SendRsvd->OriginalPkt = Packet;\r
+\r
+        NdisGetPacketFlags(MyPacket) = Flags;\r
+\r
+        //\r
+        // Set up the new packet so that it describes the same\r
+        // data as the original packet.\r
+        //\r
+        NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet);\r
+        NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet);\r
+#ifdef WIN9X\r
+        //\r
+        // Work around the fact that NDIS does not initialize this\r
+        // to FALSE on Win9x.\r
+        //\r
+        NDIS_PACKET_VALID_COUNTS(MyPacket) = FALSE;\r
+#endif\r
+\r
+        //\r
+        // Copy the OOB Offset from the original packet to the new\r
+        // packet.\r
+        //\r
+        NdisMoveMemory(NDIS_OOB_DATA_FROM_PACKET(MyPacket),\r
+                       NDIS_OOB_DATA_FROM_PACKET(Packet),\r
+                       sizeof(NDIS_PACKET_OOB_DATA));\r
+\r
+#ifndef WIN9X\r
+        //\r
+        // Copy the right parts of per packet info into the new packet.\r
+        // This API is not available on Win9x since task offload is\r
+        // not supported on that platform.\r
+        //\r
+        NdisIMCopySendPerPacketInfo(MyPacket, Packet);\r
+#endif\r
+        \r
+        //\r
+        // Copy the Media specific information\r
+        //\r
+        NDIS_GET_PACKET_MEDIA_SPECIFIC_INFO(Packet,\r
+                                            &MediaSpecificInfo,\r
+                                            &MediaSpecificInfoSize);\r
+\r
+        if (MediaSpecificInfo || MediaSpecificInfoSize)\r
+        {\r
+            NDIS_SET_PACKET_MEDIA_SPECIFIC_INFO(MyPacket,\r
+                                                MediaSpecificInfo,\r
+                                                MediaSpecificInfoSize);\r
+        }\r
+\r
+        NdisSend(&Status,\r
+                 pAdapt->BindingHandle,\r
+                 MyPacket);\r
+\r
+\r
+        if (Status != NDIS_STATUS_PENDING)\r
+        {\r
+#ifndef WIN9X\r
+            NdisIMCopySendCompletePerPacketInfo (Packet, MyPacket);\r
+#endif\r
+            NdisFreePacket(MyPacket);\r
+            ADAPT_DECR_PENDING_SENDS(pAdapt);\r
+        }\r
+    }\r
+    else\r
+    {\r
+        ADAPT_DECR_PENDING_SENDS(pAdapt);\r
+        //\r
+        // We are out of packets. The allocation status is returned to the caller. Alternatively we can deal with it:\r
+        //    - By keeping separate send and receive pools\r
+        //    - Dynamically allocate more pools as needed and free them when not needed\r
+        //\r
+    }\r
+\r
+    return(Status);\r
+}\r
+\r
+\r
+VOID\r
+MPSendPackets(\r
+    IN NDIS_HANDLE             MiniportAdapterContext,\r
+    IN PPNDIS_PACKET           PacketArray,\r
+    IN UINT                    NumberOfPackets\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Send Packet Array handler. Either this or our SendPacket handler is called\r
+    based on which one is enabled in our Miniport Characteristics.\r
+\r
+    Each packet in the array is processed independently, in order:\r
+\r
+    - If our virtual miniport (MPDeviceState) is not at D0, the packet is\r
+      completed immediately with NDIS_STATUS_FAILURE.\r
+    - With NDIS51 packet stacking, if a stack location remains, the original\r
+      packet itself is sent down.\r
+    - Otherwise a packet is allocated from SendPacketPoolHandle, pointed at\r
+      the original's buffer chain, given copies of the flags, OOB data,\r
+      per-packet info (non-Win9x) and media-specific info, and sent down in\r
+      place of the original.\r
+\r
+    On both send paths, if the lower edge (PTDeviceState) is not at D0 the\r
+    packet is failed instead of being sent. A packet whose send does not\r
+    return NDIS_STATUS_PENDING is completed back to the caller with\r
+    NdisMSendComplete before the loop moves on to the next packet.\r
+\r
+Arguments:\r
+\r
+    MiniportAdapterContext     Pointer to our adapter\r
+    PacketArray                Set of packets to send\r
+    NumberOfPackets            Number of entries in PacketArray\r
+\r
+Return Value:\r
+\r
+    None\r
+\r
+--*/\r
+{\r
+    PADAPT              pAdapt = (PADAPT)MiniportAdapterContext;\r
+    NDIS_STATUS         Status;\r
+    UINT                i;\r
+    PVOID               MediaSpecificInfo = NULL;\r
+    UINT                MediaSpecificInfoSize = 0;\r
+    \r
+\r
+    for (i = 0; i < NumberOfPackets; i++)\r
+    {\r
+        PNDIS_PACKET    Packet, MyPacket;\r
+\r
+        Packet = PacketArray[i];\r
+        //\r
+        // The driver should fail the send if the virtual miniport is in low \r
+        // power state\r
+        //\r
+        if (pAdapt->MPDeviceState > NdisDeviceStateD0)\r
+        {\r
+            NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt),\r
+                            Packet,\r
+                            NDIS_STATUS_FAILURE);\r
+            continue;\r
+        }\r
+\r
+#ifdef NDIS51\r
+\r
+        //\r
+        // Use NDIS 5.1 packet stacking:\r
+        //\r
+        {\r
+            PNDIS_PACKET_STACK        pStack;\r
+            BOOLEAN                   Remaining;\r
+\r
+            //\r
+            // Packet stacks: Check if we can use the same packet for sending down.\r
+            //\r
+            pStack = NdisIMGetCurrentPacketStack(Packet, &Remaining);\r
+            if (Remaining)\r
+            {\r
+                //\r
+                // We can reuse "Packet".\r
+                //\r
+                // NOTE: if we needed to keep per-packet information in packets\r
+                // sent down, we can use pStack->IMReserved[].\r
+                //\r
+                ASSERT(pStack);\r
+                //\r
+                // If the miniport below is going to a low power state, do not send the packet down.\r
+                //\r
+                NdisAcquireSpinLock(&pAdapt->Lock);\r
+                if (pAdapt->PTDeviceState > NdisDeviceStateD0)\r
+                {\r
+                    NdisReleaseSpinLock(&pAdapt->Lock);\r
+                    NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt),\r
+                                        Packet,\r
+                                        NDIS_STATUS_FAILURE);\r
+                }\r
+                else\r
+                {\r
+                    pAdapt->OutstandingSends++;\r
+                    NdisReleaseSpinLock(&pAdapt->Lock);\r
+                \r
+                    NdisSend(&Status,\r
+                              pAdapt->BindingHandle,\r
+                              Packet);\r
+        \r
+                    if (Status != NDIS_STATUS_PENDING)\r
+                    {\r
+                        NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt),\r
+                                            Packet,\r
+                                            Status);\r
+                   \r
+                        ADAPT_DECR_PENDING_SENDS(pAdapt);\r
+                    }\r
+                }\r
+                continue;\r
+            }\r
+        }\r
+#endif\r
+        do \r
+        {\r
+            NdisAcquireSpinLock(&pAdapt->Lock);\r
+            //\r
+            // If the miniport below is going to a low power state, do not send the packet down.\r
+            //\r
+            if (pAdapt->PTDeviceState > NdisDeviceStateD0)\r
+            {\r
+                NdisReleaseSpinLock(&pAdapt->Lock);\r
+                Status = NDIS_STATUS_FAILURE;\r
+                break;\r
+            }\r
+            pAdapt->OutstandingSends++;\r
+            NdisReleaseSpinLock(&pAdapt->Lock);\r
+            \r
+            NdisAllocatePacket(&Status,\r
+                               &MyPacket,\r
+                               pAdapt->SendPacketPoolHandle);\r
+\r
+            if (Status == NDIS_STATUS_SUCCESS)\r
+            {\r
+                PSEND_RSVD        SendRsvd;\r
+\r
+                SendRsvd = (PSEND_RSVD)(MyPacket->ProtocolReserved);\r
+                SendRsvd->OriginalPkt = Packet;\r
+\r
+                NdisGetPacketFlags(MyPacket) = NdisGetPacketFlags(Packet);\r
+\r
+                NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet);\r
+                NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet);\r
+#ifdef WIN9X\r
+                //\r
+                // Work around the fact that NDIS does not initialize this\r
+                // to FALSE on Win9x.\r
+                //\r
+                NDIS_PACKET_VALID_COUNTS(MyPacket) = FALSE;\r
+#endif // WIN9X\r
+\r
+                //\r
+                // Copy the OOB data from the original packet to the new\r
+                // packet.\r
+                //\r
+                NdisMoveMemory(NDIS_OOB_DATA_FROM_PACKET(MyPacket),\r
+                            NDIS_OOB_DATA_FROM_PACKET(Packet),\r
+                            sizeof(NDIS_PACKET_OOB_DATA));\r
+                //\r
+                // Copy relevant parts of the per packet info into the new packet\r
+                //\r
+#ifndef WIN9X\r
+                NdisIMCopySendPerPacketInfo(MyPacket, Packet);\r
+#endif\r
+\r
+                //\r
+                // Copy the Media specific information\r
+                //\r
+                NDIS_GET_PACKET_MEDIA_SPECIFIC_INFO(Packet,\r
+                                                    &MediaSpecificInfo,\r
+                                                    &MediaSpecificInfoSize);\r
+\r
+                if (MediaSpecificInfo || MediaSpecificInfoSize)\r
+                {\r
+                    NDIS_SET_PACKET_MEDIA_SPECIFIC_INFO(MyPacket,\r
+                                                        MediaSpecificInfo,\r
+                                                        MediaSpecificInfoSize);\r
+                }\r
+\r
+                NdisSend(&Status,\r
+                         pAdapt->BindingHandle,\r
+                         MyPacket);\r
+\r
+                if (Status != NDIS_STATUS_PENDING)\r
+                {\r
+#ifndef WIN9X\r
+                    NdisIMCopySendCompletePerPacketInfo (Packet, MyPacket);\r
+#endif\r
+                    NdisFreePacket(MyPacket);\r
+                    ADAPT_DECR_PENDING_SENDS(pAdapt);\r
+                }\r
+            }\r
+            else\r
+            {\r
+                //\r
+                // The driver cannot allocate a packet. Status holds the allocation\r
+                // failure, which is reported by the NdisMSendComplete call below.\r
+                //\r
+                ADAPT_DECR_PENDING_SENDS(pAdapt);\r
+            }\r
+        }\r
+        while (FALSE);\r
+\r
+        if (Status != NDIS_STATUS_PENDING)\r
+        {\r
+            NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt),\r
+                              Packet,\r
+                              Status);\r
+        }\r
+    }\r
+}\r
+\r
+\r
+NDIS_STATUS\r
+MPQueryInformation(\r
+    IN NDIS_HANDLE                MiniportAdapterContext,\r
+    IN NDIS_OID                   Oid,\r
+    IN PVOID                      InformationBuffer,\r
+    IN ULONG                      InformationBufferLength,\r
+    OUT PULONG                    BytesWritten,\r
+    OUT PULONG                    BytesNeeded\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Entry point called by NDIS to query for the value of the specified OID.\r
+    Typical processing is to forward the query down to the underlying miniport.\r
+\r
+    The following OIDs are filtered here:\r
+\r
+    OID_PNP_QUERY_POWER - return success right here\r
+\r
+    OID_GEN_SUPPORTED_GUIDS - do not forward, otherwise we will show up\r
+    multiple instances of private GUIDs supported by the underlying miniport.\r
+\r
+    OID_PNP_CAPABILITIES - we do send this down to the lower miniport, but\r
+    the values returned are postprocessed before we complete this request;\r
+    see PtRequestComplete.\r
+\r
+    NOTE on OID_TCP_TASK_OFFLOAD - if this IM driver modifies the contents\r
+    of data it passes through such that a lower miniport may not be able\r
+    to perform TCP task offload, then it should not forward this OID down,\r
+    but fail it here with the status NDIS_STATUS_NOT_SUPPORTED. This is to\r
+    avoid performing incorrect transformations on data.\r
+\r
+    If our miniport edge (upper edge) is at a low-power state, fail the request.\r
+\r
+    If our protocol edge (lower edge) has been notified of a low-power state,\r
+    we pend this request until the miniport below has been set to D0. Since\r
+    requests to miniports are serialized always, at most a single request will\r
+    be pended.\r
+\r
+Arguments:\r
+\r
+    MiniportAdapterContext    Pointer to the adapter structure\r
+    Oid                       Oid for this query\r
+    InformationBuffer         Buffer for information\r
+    InformationBufferLength   Size of this buffer\r
+    BytesWritten              Specifies how much info is written\r
+    BytesNeeded               In case the buffer is smaller than what we need, tell them how much is needed\r
+\r
+\r
+Return Value:\r
+\r
+    NDIS_STATUS_SUCCESS for OID_PNP_QUERY_POWER.\r
+    NDIS_STATUS_NOT_SUPPORTED for OID_GEN_SUPPORTED_GUIDS.\r
+    NDIS_STATUS_FAILURE if the adapter is unbinding or standing by, or if our\r
+    miniport edge is not at D0.\r
+    NDIS_STATUS_PENDING otherwise: either the request was marked as queued, or\r
+    it was handed to NdisRequest. A request that NdisRequest completes\r
+    synchronously is finished from here through PtRequestComplete and is still\r
+    reported to the caller as pending.\r
+\r
+--*/\r
+{\r
+    PADAPT        pAdapt = (PADAPT)MiniportAdapterContext;\r
+    NDIS_STATUS   Status = NDIS_STATUS_FAILURE;\r
+\r
+    do\r
+    {\r
+        if (Oid == OID_PNP_QUERY_POWER)\r
+        {\r
+            //\r
+            //  Do not forward this.\r
+            //\r
+            Status = NDIS_STATUS_SUCCESS;\r
+            break;\r
+        }\r
+\r
+        if (Oid == OID_GEN_SUPPORTED_GUIDS)\r
+        {\r
+            //\r
+            //  Do not forward this, otherwise we will end up with multiple\r
+            //  instances of private GUIDs that the underlying miniport\r
+            //  supports.\r
+            //\r
+            Status = NDIS_STATUS_NOT_SUPPORTED;\r
+            break;\r
+        }\r
+\r
+        if (Oid == OID_TCP_TASK_OFFLOAD)\r
+        {\r
+            //\r
+            // Fail this -if- this driver performs data transformations\r
+            // that can interfere with a lower driver's ability to offload\r
+            // TCP tasks.\r
+            //\r
+            // Status = NDIS_STATUS_NOT_SUPPORTED;\r
+            // break;\r
+            //\r
+        }\r
+        //\r
+        // If the miniport below is unbinding, just fail any request\r
+        //\r
+        NdisAcquireSpinLock(&pAdapt->Lock);\r
+        if (pAdapt->UnbindingInProcess == TRUE)\r
+        {\r
+            NdisReleaseSpinLock(&pAdapt->Lock);\r
+            Status = NDIS_STATUS_FAILURE;\r
+            break;\r
+        }\r
+        NdisReleaseSpinLock(&pAdapt->Lock);\r
+        //\r
+        // All other queries are failed if our miniport edge is not at D0.\r
+        //\r
+        if (pAdapt->MPDeviceState > NdisDeviceStateD0) \r
+        {\r
+            Status = NDIS_STATUS_FAILURE;\r
+            break;\r
+        }\r
+\r
+        pAdapt->Request.RequestType = NdisRequestQueryInformation;\r
+        pAdapt->Request.DATA.QUERY_INFORMATION.Oid = Oid;\r
+        pAdapt->Request.DATA.QUERY_INFORMATION.InformationBuffer = InformationBuffer;\r
+        pAdapt->Request.DATA.QUERY_INFORMATION.InformationBufferLength = InformationBufferLength;\r
+        pAdapt->BytesNeeded = BytesNeeded;\r
+        pAdapt->BytesReadOrWritten = BytesWritten;\r
+\r
+        //\r
+        // Re-check under the lock: if the miniport below is unbinding, fail the request\r
+        //\r
+        NdisAcquireSpinLock(&pAdapt->Lock);\r
+            \r
+        if (pAdapt->UnbindingInProcess == TRUE)\r
+        {\r
+            NdisReleaseSpinLock(&pAdapt->Lock);\r
+            Status = NDIS_STATUS_FAILURE;\r
+            break;\r
+        }\r
+        //\r
+        // If the Protocol device state is OFF, mark this request as being \r
+        // pended. We queue this until the device state is back to D0. \r
+        //\r
+        if ((pAdapt->PTDeviceState > NdisDeviceStateD0) \r
+                && (pAdapt->StandingBy == FALSE))\r
+        {\r
+            pAdapt->QueuedRequest = TRUE;\r
+            NdisReleaseSpinLock(&pAdapt->Lock);\r
+            Status = NDIS_STATUS_PENDING;\r
+            break;\r
+        }\r
+        //\r
+        // This is in the process of powering down the system, always fail the request\r
+        // \r
+        if (pAdapt->StandingBy == TRUE)\r
+        {\r
+            NdisReleaseSpinLock(&pAdapt->Lock);\r
+            Status = NDIS_STATUS_FAILURE;\r
+            break;\r
+        }\r
+        pAdapt->OutstandingRequests = TRUE;\r
+        \r
+        NdisReleaseSpinLock(&pAdapt->Lock);\r
+\r
+        //\r
+        // default case, most requests will be passed to the miniport below\r
+        //\r
+        NdisRequest(&Status,\r
+                    pAdapt->BindingHandle,\r
+                    &pAdapt->Request);\r
+\r
+\r
+        if (Status != NDIS_STATUS_PENDING)\r
+        {\r
+            PtRequestComplete(pAdapt, &pAdapt->Request, Status);\r
+            Status = NDIS_STATUS_PENDING;\r
+        }\r
+\r
+    } while (FALSE);\r
+\r
+    return(Status);\r
+\r
+}\r
+\r
+\r
+VOID\r
+MPQueryPNPCapabilities(\r
+    IN OUT PADAPT            pAdapt,\r
+    OUT PNDIS_STATUS         pStatus\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Postprocess a request for OID_PNP_CAPABILITIES that was forwarded\r
+    down to the underlying miniport, and has been completed by it.\r
+\r
+Arguments:\r
+\r
+    pAdapt - Pointer to the adapter structure\r
+    pStatus - Place to return final status\r
+\r
+Return Value:\r
+\r
+    None.\r
+\r
+--*/\r
+\r
+{\r
+    PNDIS_PNP_CAPABILITIES           pPNPCapabilities;\r
+    PNDIS_PM_WAKE_UP_CAPABILITIES    pPMstruct;\r
+\r
+    if (pAdapt->Request.DATA.QUERY_INFORMATION.InformationBufferLength >= sizeof(NDIS_PNP_CAPABILITIES))\r
+    {\r
+        pPNPCapabilities = (PNDIS_PNP_CAPABILITIES)(pAdapt->Request.DATA.QUERY_INFORMATION.InformationBuffer);\r
+\r
+        //\r
+        // The following fields must be overwritten by an IM driver.\r
+        //\r
+        pPMstruct= & pPNPCapabilities->WakeUpCapabilities;\r
+        pPMstruct->MinMagicPacketWakeUp = NdisDeviceStateUnspecified;\r
+        pPMstruct->MinPatternWakeUp = NdisDeviceStateUnspecified;\r
+        pPMstruct->MinLinkChangeWakeUp = NdisDeviceStateUnspecified;\r
+        *pAdapt->BytesReadOrWritten = sizeof(NDIS_PNP_CAPABILITIES);\r
+        *pAdapt->BytesNeeded = 0;\r
+\r
+\r
+        //\r
+        // Setting our internal flags\r
+        // Default, device is ON\r
+        //\r
+        pAdapt->MPDeviceState = NdisDeviceStateD0;\r
+        pAdapt->PTDeviceState = NdisDeviceStateD0;\r
+\r
+        *pStatus = NDIS_STATUS_SUCCESS;\r
+    }\r
+    else\r
+    {\r
+        *pAdapt->BytesNeeded= sizeof(NDIS_PNP_CAPABILITIES);\r
+        *pStatus = NDIS_STATUS_RESOURCES;\r
+    }\r
+}\r
+\r
+\r
+NDIS_STATUS\r
+MPSetInformation(\r
+    IN NDIS_HANDLE                                  MiniportAdapterContext,\r
+    IN NDIS_OID                                     Oid,\r
+    __in_bcount(InformationBufferLength) IN PVOID   InformationBuffer,\r
+    IN ULONG                                        InformationBufferLength,\r
+    OUT PULONG                                      BytesRead,\r
+    OUT PULONG                                      BytesNeeded\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Miniport SetInfo handler.\r
+\r
+    OID_PNP_SET_POWER is handled locally by MPProcessSetPowerOid, which records\r
+    the new power state; it is never passed to the miniport below.\r
+    It is processed before any device-state check, so it is not blocked when\r
+    the device is suspended: it is what reactivates the Passthru miniport.\r
+\r
+    \r
+    PM- If the MP is not ON (MPDeviceState > D0) fail immediately (except for 'set power')\r
+         If MP is ON, but the PT is not at D0 (and we are not standing by), mark the request as queued for later processing\r
+\r
+    Requests to miniports are always serialized\r
+\r
+\r
+Arguments:\r
+\r
+    MiniportAdapterContext    Pointer to the adapter structure\r
+    Oid                       Oid for this query\r
+    InformationBuffer         Buffer for information\r
+    InformationBufferLength   Size of this buffer\r
+    BytesRead                 Specifies how much info is read\r
+    BytesNeeded               In case the buffer is smaller than what we need, tell them how much is needed\r
+\r
+Return Value:\r
+\r
+    For OID_PNP_SET_POWER, the status produced by MPProcessSetPowerOid.\r
+    NDIS_STATUS_FAILURE if the adapter is unbinding or standing by, or if our\r
+    miniport edge is not at D0.\r
+    NDIS_STATUS_PENDING if the request was marked as queued.\r
+    Otherwise, the return code from the NdisRequest below.\r
+\r
+--*/\r
+{\r
+    PADAPT        pAdapt = (PADAPT)MiniportAdapterContext;\r
+    NDIS_STATUS   Status;\r
+\r
+    Status = NDIS_STATUS_FAILURE;\r
+\r
+    do\r
+    {\r
+        //\r
+        // The Set Power should not be sent to the miniport below the Passthru, but is handled internally\r
+        //\r
+        if (Oid == OID_PNP_SET_POWER)\r
+        {\r
+            MPProcessSetPowerOid(&Status, \r
+                                 pAdapt, \r
+                                 InformationBuffer, \r
+                                 InformationBufferLength, \r
+                                 BytesRead, \r
+                                 BytesNeeded);\r
+            break;\r
+\r
+        }\r
+\r
+        //\r
+        // If the miniport below is unbinding, fail the request\r
+        //\r
+        NdisAcquireSpinLock(&pAdapt->Lock);     \r
+        if (pAdapt->UnbindingInProcess == TRUE)\r
+        {\r
+            NdisReleaseSpinLock(&pAdapt->Lock);\r
+            Status = NDIS_STATUS_FAILURE;\r
+            break;\r
+        }\r
+        NdisReleaseSpinLock(&pAdapt->Lock);\r
+        //\r
+        // All other Set Information requests are failed, if the miniport is\r
+        // not at D0 or is transitioning to a device state greater than D0.\r
+        //\r
+        if (pAdapt->MPDeviceState > NdisDeviceStateD0)\r
+        {\r
+            Status = NDIS_STATUS_FAILURE;\r
+            break;\r
+        }\r
+\r
+        // Fill in the adapter's single NDIS_REQUEST and remember the caller's output pointers\r
+        pAdapt->Request.RequestType = NdisRequestSetInformation;\r
+        pAdapt->Request.DATA.SET_INFORMATION.Oid = Oid;\r
+        pAdapt->Request.DATA.SET_INFORMATION.InformationBuffer = InformationBuffer;\r
+        pAdapt->Request.DATA.SET_INFORMATION.InformationBufferLength = InformationBufferLength;\r
+        pAdapt->BytesNeeded = BytesNeeded;\r
+        pAdapt->BytesReadOrWritten = BytesRead;\r
+\r
+        //\r
+        // Re-check under the lock: if the miniport below is unbinding, fail the request\r
+        //\r
+        NdisAcquireSpinLock(&pAdapt->Lock);     \r
+        if (pAdapt->UnbindingInProcess == TRUE)\r
+        {\r
+            NdisReleaseSpinLock(&pAdapt->Lock);\r
+            Status = NDIS_STATUS_FAILURE;\r
+            break;\r
+        }\r
+            \r
+        //\r
+        // If the device below is at a low power state, we cannot send it the\r
+        // request now, and must pend it.\r
+        //\r
+        if ((pAdapt->PTDeviceState > NdisDeviceStateD0) \r
+                && (pAdapt->StandingBy == FALSE))\r
+        {\r
+            pAdapt->QueuedRequest = TRUE;\r
+            NdisReleaseSpinLock(&pAdapt->Lock);\r
+            Status = NDIS_STATUS_PENDING;\r
+            break;\r
+        }\r
+        //\r
+        // This is in the process of powering down the system, always fail the request\r
+        // \r
+        if (pAdapt->StandingBy == TRUE)\r
+        {\r
+            NdisReleaseSpinLock(&pAdapt->Lock);\r
+            Status = NDIS_STATUS_FAILURE;\r
+            break;\r
+        }\r
+        pAdapt->OutstandingRequests = TRUE;\r
+        \r
+        NdisReleaseSpinLock(&pAdapt->Lock);\r
+        //\r
+        // Forward the request to the device below.\r
+        //\r
+        NdisRequest(&Status,\r
+                    pAdapt->BindingHandle,\r
+                    &pAdapt->Request);\r
+\r
+        if (Status != NDIS_STATUS_PENDING)\r
+        {\r
+            *BytesRead = pAdapt->Request.DATA.SET_INFORMATION.BytesRead;\r
+            *BytesNeeded = pAdapt->Request.DATA.SET_INFORMATION.BytesNeeded;\r
+            pAdapt->OutstandingRequests = FALSE;\r
+        }\r
+\r
+    } while (FALSE);\r
+\r
+    return(Status);\r
+}\r
+\r
+\r
+VOID\r
+MPProcessSetPowerOid(\r
+    IN OUT PNDIS_STATUS                             pNdisStatus,\r
+    IN PADAPT                                       pAdapt,\r
+    __in_bcount(InformationBufferLength) IN PVOID   InformationBuffer,\r
+    IN ULONG                                        InformationBufferLength,\r
+    OUT PULONG                                      BytesRead,\r
+    OUT PULONG                                      BytesNeeded\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+    This routine does all the processing for a request with a SetPower Oid.\r
+    The miniport should accept the Set Power and transition to the new state.\r
+\r
+    The Set Power should not be passed to the miniport below.\r
+\r
+    If the IM miniport is going into a low power state, then there is no guarantee that it will ever\r
+    be asked to go back to D0 before getting halted. No requests should be pended or queued.\r
+\r
+    When the transition succeeds and the adapter is not standing by, a status\r
+    that differs from the last one indicated is indicated up through\r
+    NdisMIndicateStatus (only if MiniportHandle is non-NULL).\r
+\r
+Arguments:\r
+    pNdisStatus           - Receives the status of the operation:\r
+                            NDIS_STATUS_SUCCESS, NDIS_STATUS_INVALID_LENGTH if\r
+                            the buffer is too short, or NDIS_STATUS_FAILURE for\r
+                            a non-D0 request while already in a non-D0 state\r
+    pAdapt                - The Adapter structure\r
+    InformationBuffer     - The New DeviceState\r
+    InformationBufferLength - Size of InformationBuffer in bytes\r
+    BytesRead             - No of bytes read (0 on failure)\r
+    BytesNeeded           - No of bytes needed (0 on success)\r
+\r
+\r
+Return Value:\r
+    None. The result is returned through pNdisStatus.\r
+\r
+--*/\r
+{\r
+    NDIS_DEVICE_POWER_STATE NewDeviceState;\r
+\r
+    DBGPRINT(("==>MPProcessSetPowerOid: Adapt %p\n", pAdapt)); \r
+\r
+    ASSERT (InformationBuffer != NULL);\r
+\r
+    *pNdisStatus = NDIS_STATUS_FAILURE;\r
+\r
+    do \r
+    {\r
+        //\r
+        // Check for invalid length\r
+        //\r
+        if (InformationBufferLength < sizeof(NDIS_DEVICE_POWER_STATE))\r
+        {\r
+            *pNdisStatus = NDIS_STATUS_INVALID_LENGTH;\r
+            break;\r
+        }\r
+\r
+        NewDeviceState = (*(PNDIS_DEVICE_POWER_STATE)InformationBuffer);\r
+\r
+        //\r
+        // Check for invalid device state\r
+        //\r
+        if ((pAdapt->MPDeviceState > NdisDeviceStateD0) && (NewDeviceState != NdisDeviceStateD0))\r
+        {\r
+            //\r
+            // If the miniport is in a non-D0 state, the miniport can only receive a Set Power to D0.\r
+            // Flag the illegal transition in checked builds. The negation must cover the\r
+            // whole condition; previously it was applied to the first comparison only.\r
+            //\r
+            ASSERT (!((pAdapt->MPDeviceState > NdisDeviceStateD0) && (NewDeviceState != NdisDeviceStateD0)));\r
+\r
+            *pNdisStatus = NDIS_STATUS_FAILURE;\r
+            break;\r
+        }    \r
+\r
+        //\r
+        // Is the miniport transitioning from an On (D0) state to a Low Power State (>D0)?\r
+        // If so, then set the StandingBy Flag - (Block all incoming requests)\r
+        //\r
+        if (pAdapt->MPDeviceState == NdisDeviceStateD0 && NewDeviceState > NdisDeviceStateD0)\r
+        {\r
+            pAdapt->StandingBy = TRUE;\r
+        }\r
+\r
+        //\r
+        // If the miniport is transitioning from a low power state to ON (D0), then clear the StandingBy flag\r
+        // All incoming requests will be pended until the physical miniport turns ON.\r
+        //\r
+        if (pAdapt->MPDeviceState > NdisDeviceStateD0 &&  NewDeviceState == NdisDeviceStateD0)\r
+        {\r
+            pAdapt->StandingBy = FALSE;\r
+        }\r
+        \r
+        //\r
+        // Now update the state in the pAdapt structure;\r
+        //\r
+        pAdapt->MPDeviceState = NewDeviceState;\r
+        \r
+        *pNdisStatus = NDIS_STATUS_SUCCESS;\r
+    \r
+\r
+    } while (FALSE);    \r
+        \r
+    if (*pNdisStatus == NDIS_STATUS_SUCCESS)\r
+    {\r
+        //\r
+        // The miniport is not standing by (e.g. it resumed from a low power state)\r
+        // \r
+        if (pAdapt->StandingBy == FALSE)\r
+        {\r
+            //\r
+            // Indicate the latest status if it differs from the last one indicated\r
+            // (this is how a media connect state change is reported)\r
+            // \r
+            if (pAdapt->LastIndicatedStatus != pAdapt->LatestUnIndicateStatus)\r
+            {\r
+               if (pAdapt->MiniportHandle != NULL)\r
+               {\r
+                   NdisMIndicateStatus(pAdapt->MiniportHandle,\r
+                                            pAdapt->LatestUnIndicateStatus,\r
+                                            (PVOID)NULL,\r
+                                            0);\r
+                   NdisMIndicateStatusComplete(pAdapt->MiniportHandle);\r
+                   pAdapt->LastIndicatedStatus = pAdapt->LatestUnIndicateStatus;\r
+               }\r
+            }\r
+        }\r
+        else\r
+        {\r
+            //\r
+            // Initialize LatestUnIndicateStatus\r
+            //\r
+            pAdapt->LatestUnIndicateStatus = pAdapt->LastIndicatedStatus;\r
+        }\r
+        *BytesRead = sizeof(NDIS_DEVICE_POWER_STATE);\r
+        *BytesNeeded = 0;\r
+    }\r
+    else\r
+    {\r
+        *BytesRead = 0;\r
+        *BytesNeeded = sizeof (NDIS_DEVICE_POWER_STATE);\r
+    }\r
+\r
+    DBGPRINT(("<==MPProcessSetPowerOid: Adapt %p\n", pAdapt)); \r
+}\r
+\r
+\r
+VOID\r
+MPReturnPacket(\r
+    IN NDIS_HANDLE             MiniportAdapterContext,\r
+    IN PNDIS_PACKET            Packet\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    NDIS Miniport entry point called whenever protocols are done with\r
+    a packet that we had indicated up and they had queued up for returning\r
+    later.\r
+\r
+Arguments:\r
+\r
+    MiniportAdapterContext    - pointer to ADAPT structure\r
+    Packet    - packet being returned.\r
+\r
+Return Value:\r
+\r
+    None.\r
+\r
+--*/\r
+{\r
+    PADAPT            pAdapt = (PADAPT)MiniportAdapterContext;\r
+\r
+#ifdef NDIS51\r
+    //\r
+    // Packet stacking: Check if this packet belongs to us.\r
+    //\r
+    if (NdisGetPoolFromPacket(Packet) != pAdapt->RecvPacketPoolHandle)\r
+    {\r
+        //\r
+        // We reused the original packet in a receive indication.\r
+        // Simply return it to the miniport below us.\r
+        //\r
+        NdisReturnPackets(&Packet, 1);\r
+    }\r
+    else\r
+#endif // NDIS51\r
+    {\r
+        //\r
+        // This is a packet allocated from this IM's receive packet pool.\r
+        // Reclaim our packet, and return the original to the driver below.\r
+        //\r
+\r
+        PNDIS_PACKET    MyPacket;\r
+        PRECV_RSVD      RecvRsvd;\r
+    \r
+        RecvRsvd = (PRECV_RSVD)(Packet->MiniportReserved);\r
+        MyPacket = RecvRsvd->OriginalPkt;\r
+    \r
+        NdisFreePacket(Packet);\r
+        NdisReturnPackets(&MyPacket, 1);\r
+    }\r
+}\r
+\r
+\r
+NDIS_STATUS\r
+MPTransferData(\r
+    OUT PNDIS_PACKET            Packet,\r
+    OUT PUINT                   BytesTransferred,\r
+    IN NDIS_HANDLE              MiniportAdapterContext,\r
+    IN NDIS_HANDLE              MiniportReceiveContext,\r
+    IN UINT                     ByteOffset,\r
+    IN UINT                     BytesToTransfer\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Miniport's transfer data handler.\r
+\r
+Arguments:\r
+\r
+    Packet                    Destination packet\r
+    BytesTransferred          Place-holder for how much data was copied\r
+    MiniportAdapterContext    Pointer to the adapter structure\r
+    MiniportReceiveContext    Context\r
+    ByteOffset                Offset into the packet for copying data\r
+    BytesToTransfer           How much to copy.\r
+\r
+Return Value:\r
+\r
+    Status of transfer\r
+\r
+--*/\r
+{\r
+    PADAPT        pAdapt = (PADAPT)MiniportAdapterContext;\r
+    NDIS_STATUS   Status;\r
+\r
+    //\r
+    // Return, if the device is OFF\r
+    //\r
+\r
+    if (IsIMDeviceStateOn(pAdapt) == FALSE)\r
+    {\r
+        return NDIS_STATUS_FAILURE;\r
+    }\r
+\r
+    NdisTransferData(&Status,\r
+                     pAdapt->BindingHandle,\r
+                     MiniportReceiveContext,\r
+                     ByteOffset,\r
+                     BytesToTransfer,\r
+                     Packet,\r
+                     BytesTransferred);\r
+\r
+    return(Status);\r
+}\r
+\r
+VOID\r
+MPHalt(\r
+    IN NDIS_HANDLE                MiniportAdapterContext\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Halt handler. All the hard-work for clean-up is done here:\r
+\r
+    - clear MiniportHandle and set MiniportIsHalted;\r
+    - unlink the adapter from the global pAdaptList, under GlobalLock;\r
+    - call PtDeregisterDevice for the ioctl interface;\r
+    - if BindingHandle is non-NULL, close the binding with NdisCloseAdapter,\r
+      waiting on pAdapt->Event when the close pends, then clear BindingHandle\r
+      and call PtDereferenceAdapt;\r
+    - call PtDereferenceAdapt once more, unconditionally.\r
+\r
+    NOTE(review): the two PtDereferenceAdapt calls presumably drop the\r
+    references held for the binding and for the miniport instance - confirm\r
+    against PtDereferenceAdapt. When the final call returns TRUE the local\r
+    pAdapt pointer is cleared and must not be used.\r
+\r
+Arguments:\r
+\r
+    MiniportAdapterContext    Pointer to the Adapter\r
+\r
+Return Value:\r
+\r
+    None.\r
+\r
+--*/\r
+{\r
+    PADAPT             pAdapt = (PADAPT)MiniportAdapterContext;\r
+    NDIS_STATUS        Status;\r
+    PADAPT            *ppCursor;\r
+\r
+    DBGPRINT(("==>MiniportHalt: Adapt %p\n", pAdapt));\r
+\r
+    pAdapt->MiniportHandle = NULL;\r
+    pAdapt->MiniportIsHalted = TRUE;\r
+\r
+    //\r
+    // Remove this adapter from the global list\r
+    //\r
+    NdisAcquireSpinLock(&GlobalLock);\r
+\r
+    for (ppCursor = &pAdaptList; *ppCursor != NULL; ppCursor = &(*ppCursor)->Next)\r
+    {\r
+        if (*ppCursor == pAdapt)\r
+        {\r
+            *ppCursor = pAdapt->Next;\r
+            break;\r
+        }\r
+    }\r
+\r
+    NdisReleaseSpinLock(&GlobalLock);\r
+\r
+    //\r
+    // Delete the ioctl interface that was created when the miniport\r
+    // was created.\r
+    //\r
+    (VOID)PtDeregisterDevice();\r
+\r
+    //\r
+    // If we have a valid bind, close the miniport below the protocol\r
+    //\r
+#pragma prefast(suppress: __WARNING_DEREF_NULL_PTR, "pAdapt cannot be NULL")\r
+    if (pAdapt->BindingHandle != NULL)\r
+    {\r
+        //\r
+        // Close the binding below and wait for it to complete\r
+        //\r
+        NdisResetEvent(&pAdapt->Event);\r
+\r
+        NdisCloseAdapter(&Status, pAdapt->BindingHandle);\r
+\r
+        if (Status == NDIS_STATUS_PENDING)\r
+        {\r
+            NdisWaitEvent(&pAdapt->Event, 0);\r
+            Status = pAdapt->Status;\r
+        }\r
+\r
+        ASSERT (Status == NDIS_STATUS_SUCCESS);\r
+\r
+        pAdapt->BindingHandle = NULL;\r
+        \r
+        PtDereferenceAdapt(pAdapt);\r
+    }\r
+\r
+    if (PtDereferenceAdapt(pAdapt))\r
+    {\r
+        pAdapt = NULL;\r
+    }\r
+        \r
+    \r
+    DBGPRINT(("<== MiniportHalt: pAdapt %p\n", pAdapt));\r
+}\r
+\r
+\r
+#ifdef NDIS51_MINIPORT\r
+\r
+VOID\r
+MPCancelSendPackets(\r
+    IN NDIS_HANDLE            MiniportAdapterContext,\r
+    IN PVOID                  CancelId\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    The miniport entry point to handle cancellation of all send packets\r
+    that match the given CancelId. If we have queued any packets that match\r
+    this, then we should dequeue them and call NdisMSendComplete for all\r
+    such packets, with a status of NDIS_STATUS_REQUEST_ABORTED.\r
+\r
+    We should also call NdisCancelSendPackets in turn, on each lower binding\r
+    that this adapter corresponds to. This is to let miniports below cancel\r
+    any matching packets.\r
+\r
+Arguments:\r
+\r
+    MiniportAdapterContext    - pointer to ADAPT structure\r
+    CancelId    - ID of packets to be cancelled.\r
+\r
+Return Value:\r
+\r
+    None\r
+\r
+--*/\r
+{\r
+    PADAPT    pAdapt = (PADAPT)MiniportAdapterContext;\r
+\r
+    //\r
+    // If we queue packets on our adapter structure, this would be \r
+    // the place to acquire a spinlock to it, unlink any packets whose\r
+    // Id matches CancelId, release the spinlock and call NdisMSendComplete\r
+    // with NDIS_STATUS_REQUEST_ABORTED for all unlinked packets.\r
+    //\r
+\r
+    //\r
+    // Next, pass this down so that we let the miniport(s) below cancel\r
+    // any packets that they might have queued.\r
+    //\r
+    NdisCancelSendPackets(pAdapt->BindingHandle, CancelId);\r
+\r
+    return;\r
+}\r
+\r
+VOID\r
+MPDevicePnPEvent(\r
+    IN NDIS_HANDLE              MiniportAdapterContext,\r
+    IN NDIS_DEVICE_PNP_EVENT    DevicePnPEvent,\r
+    IN PVOID                    InformationBuffer,\r
+    IN ULONG                    InformationBufferLength\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    This handler is called to notify us of PnP events directed to\r
+    our miniport device object.\r
+\r
+Arguments:\r
+\r
+    MiniportAdapterContext    - pointer to ADAPT structure\r
+    DevicePnPEvent - the event\r
+    InformationBuffer - Points to additional event-specific information\r
+    InformationBufferLength - length of above\r
+\r
+Return Value:\r
+\r
+    None\r
+--*/\r
+{\r
+    // TBD - add code/comments about processing this.\r
+\r
+    UNREFERENCED_PARAMETER(MiniportAdapterContext);\r
+    UNREFERENCED_PARAMETER(DevicePnPEvent);\r
+    UNREFERENCED_PARAMETER(InformationBuffer);\r
+    UNREFERENCED_PARAMETER(InformationBufferLength);\r
+    \r
+    return;\r
+}\r
+\r
+VOID\r
+MPAdapterShutdown(\r
+    IN NDIS_HANDLE                MiniportAdapterContext\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Handler through which we are told that the system is about to shut\r
+    down. Nothing is done here.\r
+\r
+Arguments:\r
+\r
+    MiniportAdapterContext    - pointer to ADAPT structure\r
+\r
+Return Value:\r
+\r
+    None\r
+--*/\r
+{\r
+    UNREFERENCED_PARAMETER(MiniportAdapterContext);\r
+}\r
+\r
+#endif\r
+\r
+\r
+VOID\r
+MPFreeAllPacketPools(\r
+    IN PADAPT                    pAdapt\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Free all packet pools on the specified adapter.\r
+    \r
+Arguments:\r
+\r
+    pAdapt    - pointer to ADAPT structure\r
+\r
+Return Value:\r
+\r
+    None\r
+\r
+--*/\r
+{\r
+    if (pAdapt->RecvPacketPoolHandle != NULL)\r
+    {\r
+        //\r
+        // Free the packet pool that is used to indicate receives\r
+        //\r
+        NdisFreePacketPool(pAdapt->RecvPacketPoolHandle);\r
+\r
+        pAdapt->RecvPacketPoolHandle = NULL;\r
+    }\r
+\r
+    if (pAdapt->SendPacketPoolHandle != NULL)\r
+    {\r
+\r
+        //\r
+        //  Free the packet pool that is used to send packets below\r
+        //\r
+\r
+        NdisFreePacketPool(pAdapt->SendPacketPoolHandle);\r
+\r
+        pAdapt->SendPacketPoolHandle = NULL;\r
+\r
+    }\r
+}\r
+\r
diff --git a/original_passthru/netsf.inf b/original_passthru/netsf.inf
new file mode 100644 (file)
index 0000000..5e03a01
--- /dev/null
@@ -0,0 +1,165 @@
+; -- NETSF.INF --\r
+;\r
+; Passthru driver INF file - this is the INF for the service (protocol)\r
+; part.\r
+;\r
+; Copyright (c) 1993-2001, Microsoft Corporation\r
+;\r
+; ----------------------------------------------------------------------\r
+; Notes:\r
+; 0. The term "filter" is used in this INF to refer to an NDIS IM driver that\r
+;    implements a 1:1 relationship between upper and lower bindings.\r
+;\r
+; 1. Items specifically required for a filter have been marked with\r
+;    "!!--Filter Specific--!!" keyword\r
+; 2. In general a filter DOES NOT require a notify object for proper installation.\r
+;    A notify object is only required if one wants to have better control\r
+;    over binding operations or if one wants to receive notifications\r
+;    when other components get installed/removed/bound/unbound.\r
+;    Since Windows 2000 systems do not have support for CopyINF directive,\r
+;    a notify object is required to programmatically copy the miniport INF  \r
+;    file to the system INF directory. Previous versions of this INF file\r
+;    erroneously used to copy the INF files directly by using the CopyFiles \r
+;    directive.\r
+;    On Windows XP, you can install a filter IM without a notify object\r
+;    by following the instructions in (4).\r
+;\r
+; 3. If you want to use this INF file with your own IM driver, please\r
+;    make the following modifications:\r
+;    File netsf.inf\r
+;    --------------\r
+;    a. In section [SourceDisksFiles] and [Passthru.Files.Sys]\r
+;       change passthru.sys to the name of your own driver binary.\r
+;    b. In section [Passthru.ndi.AddReg], change values of\r
+;       BindForm and MiniportId to appropriate values.\r
+;    File netsf_m.inf\r
+;    ----------------\r
+;    a. Replace MS_PassthruMP with InfId of your miniport.\r
+;    b. In section [PassthruMP.AddService],\r
+;       change ServiceBinary appropriately.\r
+;    c. In section [PassthruMP.ndi.AddReg],\r
+;       change "Passthru" in the line having "Service"\r
+;       to reflect the appropriate name\r
+;\r
+;\r
+; ----------------------------------------------------------------------\r
+\r
+[Version]\r
+Signature  = "$Windows NT$"\r
+Class      = NetService\r
+ClassGUID  = {4D36E974-E325-11CE-BFC1-08002BE10318}\r
+Provider   = %Msft%\r
+DriverVer  =10/01/2002,6.0.5019.0\r
+\r
+[Manufacturer]\r
+%Msft% = MSFT,NTx86,NTia64,NTamd64\r
+\r
+[ControlFlags]\r
+\r
+;=========================================================================\r
+;\r
+;=========================================================================\r
+;For Win2K\r
+\r
+[MSFT]\r
+%Passthru_Desc% = Passthru.ndi, ms_passthru\r
\r
+;For WinXP and later\r
+\r
+[MSFT.NTx86]\r
+%Passthru_Desc% = Passthru.ndi, ms_passthru\r
+\r
+[MSFT.NTia64]\r
+%Passthru_Desc% = Passthru.ndi, ms_passthru\r
+\r
+[MSFT.NTamd64]\r
+%Passthru_Desc% = Passthru.ndi, ms_passthru\r
+\r
+\r
+[Passthru.ndi]\r
+AddReg          = Passthru.ndi.AddReg, Passthru.AddReg\r
+Characteristics = 0x4410 ;  NCF_FILTER | NCF_NDIS_PROTOCOL !!--Filter Specific--!!\r
+CopyFiles       = Passthru.Files.Sys\r
+CopyInf         = netsf_m.inf\r
+\r
+[Passthru.ndi.Remove]\r
+DelFiles = Passthru.Files.Sys\r
+\r
+[Passthru.ndi.Services]\r
+AddService = Passthru,, Passthru.AddService\r
+\r
+[Passthru.AddService]\r
+DisplayName    = %PassthruService_Desc%\r
+ServiceType    = 1 ;SERVICE_KERNEL_DRIVER\r
+StartType      = 3 ;SERVICE_DEMAND_START\r
+ErrorControl   = 1 ;SERVICE_ERROR_NORMAL\r
+ServiceBinary  = %12%\passthru.sys\r
+AddReg         = Passthru.AddService.AddReg\r
+\r
+\r
+[Passthru.AddService.AddReg]\r
+; ----------------------------------------------------------------------\r
+; Add any miniport-specific parameters here.  These are params that your\r
+; filter device is going to use.\r
+;\r
+;HKR, Parameters, ParameterName,  0x10000, "MultiSz", "Parameter", "Value"\r
+;HKR, Parameters, ParameterName2, 0x10001, 4\r
+\r
+\r
+; ----------------------------------------------------------------------\r
+; File copy\r
+;\r
+[SourceDisksNames]\r
+1=%DiskDescription%,"",,\r
+\r
+[SourceDisksFiles]\r
+passthru.sys=1\r
+\r
+[DestinationDirs]\r
+DefaultDestDir = 12\r
+Passthru.Files.Sys   = 12   ; %windir%\System32\drivers\r
+\r
+[Passthru.Files.Sys]\r
+passthru.sys,,,2\r
+\r
+; ----------------------------------------------------------------------\r
+; Filter Install\r
+;\r
+\r
+[Passthru.ndi.AddReg]\r
+HKR, Ndi, HelpText, , %Passthru_HELP%\r
+\r
+; ----------------------------------------------------------------------\r
+; !!--Filter Specific--!!\r
+;\r
+; Note:\r
+; 1. Other components may also have UpperRange/LowerRange but for filters\r
+;    the value of both of them must be noupper/nolower\r
+; 2. The value FilterClass is required.\r
+; 3. The value Service is required\r
+; 4. FilterDeviceInfId is the InfId of the filter device (miniport) that will\r
+;    be installed for each filtered adapter.\r
+;    In this case this is ms_passthrump (refer to netsf_m.inf)\r
+;\r
+HKR, Ndi,            FilterClass,         , failover\r
+HKR, Ndi,            FilterDeviceInfId,   , ms_passthrump\r
+HKR, Ndi,            Service,             , Passthru\r
+HKR, Ndi\Interfaces, UpperRange,          , noupper\r
+HKR, Ndi\Interfaces, LowerRange,          , nolower\r
+HKR, Ndi\Interfaces, FilterMediaTypes,    , "ethernet, tokenring, fddi, wan"\r
+\r
+[Passthru.AddReg]\r
+; The following key is Required\r
+; The following key is Passthru specific\r
+HKR, Parameters, Param1, 0, 4\r
+\r
+; ----------------------------------------------------------------------\r
+[Strings]\r
+Msft = "Microsoft"\r
+DiskDescription = "Microsoft Passthru Driver Disk"\r
+\r
+Passthru_Desc = "Passthru Driver"\r
+Passthru_HELP = "Passthru Driver"\r
+PassthruService_Desc = "Passthru Service"\r
+\r
+\r
diff --git a/original_passthru/netsf_m.inf b/original_passthru/netsf_m.inf
new file mode 100644 (file)
index 0000000..6605a02
--- /dev/null
@@ -0,0 +1,93 @@
+; -- NETSF_M.INF --\r
+;\r
+; Passthru Miniport INF file\r
+;\r
+; Copyright (c) 1993-1999, Microsoft Corporation\r
+\r
+; ----------------------------------------------------------------------\r
+; Notes:\r
+; 0. The term "filter" is used here to refer to an NDIS IM driver that\r
+;    implements a 1:1 relationship between upper and lower bindings.\r
+; 1. Items specifically required for a filter have been marked with\r
+;    "!!--Filter Specific--!!" keyword\r
+; 2. A filter DOES NOT require a notify object for proper installation.\r
+;    A notify object is only required if one wants to have better control\r
+;    over binding operations or if one wants to receive notifications\r
+;    when other components get installed/removed/bound/unbound.\r
+;    This sample uses a notify object as an example only. If you do not\r
+;    want to use a notify object, please comment out the lines that add\r
+;    ClsId and ComponentDll registry keys.\r
+; ----------------------------------------------------------------------\r
+\r
+[Version]\r
+signature  = "$Windows NT$"\r
+Class      = Net\r
+ClassGUID  = {4d36e972-e325-11ce-bfc1-08002be10318}\r
+Provider   = %Msft%\r
+DriverVer  =10/01/2002,6.0.5019.0\r
+\r
+[ControlFlags]\r
+ExcludeFromSelect = ms_passthrump\r
+\r
+[DestinationDirs]\r
+DefaultDestDir=12\r
+; No files to copy \r
+\r
+[Manufacturer]\r
+%Msft% = MSFT,NTx86,NTia64,NTamd64\r
+\r
+;For Win2K\r
+\r
+[MSFT]\r
+%PassthruMP_Desc% = PassthruMP.ndi, ms_passthrump\r
+\r
+;For WinXP and later\r
+\r
+[MSFT.NTx86]\r
+%PassthruMP_Desc% = PassthruMP.ndi, ms_passthrump\r
+\r
+[MSFT.NTia64]\r
+%PassthruMP_Desc% = PassthruMP.ndi, ms_passthrump\r
+\r
+[MSFT.NTamd64]\r
+%PassthruMP_Desc% = PassthruMP.ndi, ms_passthrump\r
+\r
+\r
+[PassthruMP.ndi]\r
+AddReg  = PassthruMP.ndi.AddReg\r
+Characteristics = 0x29 ;NCF_NOT_USER_REMOVABLE | NCF_VIRTUAL | NCF_HIDDEN\r
+\r
+[PassthruMP.ndi.AddReg]\r
+HKR, Ndi, Service,  0,  PassthruMP\r
+\r
+[PassthruMP.ndi.Services]\r
+AddService = PassthruMP,0x2, PassthruMP.AddService\r
+\r
+\r
+[PassthruMP.AddService]\r
+ServiceType    = 1 ;SERVICE_KERNEL_DRIVER\r
+StartType      = 3 ;SERVICE_DEMAND_START\r
+ErrorControl   = 1 ;SERVICE_ERROR_NORMAL\r
+ServiceBinary  = %12%\passthru.sys\r
+AddReg         = PassthruMP.AddService.AddReg\r
+\r
+\r
+[PassthruMP.AddService.AddReg]\r
+; ----------------------------------------------------------------------\r
+; Add any miniport-specific parameters here.  These are params that your\r
+; filter device is going to use.\r
+;\r
+;HKR, Parameters, ParameterName,  0x10000, "MultiSz", "Parameter", "Value"\r
+;HKR, Parameters, ParameterName2, 0x10001, 4\r
+\r
+[Strings]\r
+Msft = "Microsoft"\r
+PassthruMP_Desc = "Passthru Miniport"\r
+\r
+[SourceDisksNames]\r
+;None \r
+\r
+[SourceDisksFiles]\r
+;None\r
+\r
+\r
diff --git a/original_passthru/passthru.c b/original_passthru/passthru.c
new file mode 100644 (file)
index 0000000..f614f2a
--- /dev/null
@@ -0,0 +1,458 @@
+/*++\r
+\r
+Copyright (c) 1992-2000  Microsoft Corporation\r
\r
+Module Name:\r
\r
+    passthru.c\r
+\r
+Abstract:\r
+\r
+    Ndis Intermediate Miniport driver sample. This is a passthru driver.\r
+\r
+Author:\r
+\r
+Environment:\r
+\r
+\r
+Revision History:\r
+\r
+\r
+--*/\r
+\r
+\r
+#include "precomp.h"\r
+#pragma hdrstop\r
+\r
+#pragma NDIS_INIT_FUNCTION(DriverEntry)\r
+\r
+NDIS_HANDLE         ProtHandle = NULL;\r
+NDIS_HANDLE         DriverHandle = NULL;\r
+NDIS_MEDIUM         MediumArray[4] =\r
+                    {\r
+                        NdisMedium802_3,    // Ethernet\r
+                        NdisMedium802_5,    // Token-ring\r
+                        NdisMediumFddi,     // Fddi\r
+                        NdisMediumWan       // NDISWAN\r
+                    };\r
+\r
+NDIS_SPIN_LOCK     GlobalLock;\r
+\r
+PADAPT             pAdaptList = NULL;\r
+LONG               MiniportCount = 0;\r
+\r
+NDIS_HANDLE        NdisWrapperHandle;\r
+\r
+//\r
+// To support ioctls from user-mode: device and link names, the handles that\r
+// NdisMRegisterDevice fills in, and a state flag changed under GlobalLock.\r
+\r
+#define LINKNAME_STRING     L"\\DosDevices\\Passthru"\r
+#define NTDEVICE_STRING     L"\\Device\\Passthru"\r
+\r
+NDIS_HANDLE     NdisDeviceHandle = NULL;\r
+PDEVICE_OBJECT  ControlDeviceObject = NULL;\r
+\r
+enum _DEVICE_STATE\r
+{\r
+    PS_DEVICE_STATE_READY = 0,    // ready for create/delete\r
+    PS_DEVICE_STATE_CREATING,    // create operation in progress\r
+    PS_DEVICE_STATE_DELETING    // delete operation in progress\r
+} ControlDeviceState = PS_DEVICE_STATE_READY;\r
+\r
+\r
+\r
+NTSTATUS\r
+DriverEntry(\r
+    IN PDRIVER_OBJECT        DriverObject,\r
+    IN PUNICODE_STRING       RegistryPath\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    First entry point, called when this driver is loaded. Registers with NDIS\r
+    as an intermediate driver: miniport, then protocol, then associates both.\r
+\r
+Arguments:\r
+\r
+    DriverObject - pointer to the system's driver object structure\r
+        for this driver\r
+    \r
+    RegistryPath - system's registry path for this driver\r
+    \r
+Return Value:\r
+\r
+    NDIS_STATUS_SUCCESS if both registrations succeed; otherwise the status\r
+    of the registration that failed (the NDIS wrapper is terminated first).\r
+\r
+--*/\r
+{\r
+    NDIS_STATUS                        Status;\r
+    NDIS_PROTOCOL_CHARACTERISTICS      PChars;\r
+    NDIS_MINIPORT_CHARACTERISTICS      MChars;\r
+    NDIS_STRING                        Name;\r
+\r
+    Status = NDIS_STATUS_SUCCESS;\r
+    NdisAllocateSpinLock(&GlobalLock);\r
+\r
+    NdisMInitializeWrapper(&NdisWrapperHandle, DriverObject, RegistryPath, NULL);\r
+\r
+    do\r
+    {\r
+        //\r
+        // Register the miniport with NDIS. Note that it is the miniport\r
+        // which was started as a driver and not the protocol. Also the miniport\r
+        // must be registered prior to the protocol since the protocol's BindAdapter\r
+        // handler can be initiated anytime and when it is, it must be ready to\r
+        // start driver instances.\r
+        //\r
+\r
+        NdisZeroMemory(&MChars, sizeof(NDIS_MINIPORT_CHARACTERISTICS));\r
+\r
+        MChars.MajorNdisVersion = PASSTHRU_MAJOR_NDIS_VERSION;\r
+        MChars.MinorNdisVersion = PASSTHRU_MINOR_NDIS_VERSION;\r
+\r
+        MChars.InitializeHandler = MPInitialize;\r
+        MChars.QueryInformationHandler = MPQueryInformation;\r
+        MChars.SetInformationHandler = MPSetInformation;\r
+        MChars.ResetHandler = NULL;\r
+        MChars.TransferDataHandler = MPTransferData;\r
+        MChars.HaltHandler = MPHalt;\r
+#ifdef NDIS51_MINIPORT\r
+        MChars.CancelSendPacketsHandler = MPCancelSendPackets;\r
+        MChars.PnPEventNotifyHandler = MPDevicePnPEvent;\r
+        MChars.AdapterShutdownHandler = MPAdapterShutdown;\r
+#endif // NDIS51_MINIPORT\r
+\r
+        //\r
+        // We will disable the check for hang timeout so we do not\r
+        // need a check for hang handler!\r
+        //\r
+        MChars.CheckForHangHandler = NULL;\r
+        MChars.ReturnPacketHandler = MPReturnPacket;\r
+\r
+        //\r
+        // Either the Send or the SendPackets handler should be specified.\r
+        // If SendPackets handler is specified, SendHandler is ignored\r
+        //\r
+        MChars.SendHandler = NULL;    // MPSend;\r
+        MChars.SendPacketsHandler = MPSendPackets;\r
+\r
+        Status = NdisIMRegisterLayeredMiniport(NdisWrapperHandle,\r
+                                                  &MChars,\r
+                                                  sizeof(MChars),\r
+                                                  &DriverHandle);\r
+        if (Status != NDIS_STATUS_SUCCESS)\r
+        {\r
+            break;\r
+        }\r
+\r
+#ifndef WIN9X\r
+        NdisMRegisterUnloadHandler(NdisWrapperHandle, PtUnload);\r
+#endif\r
+\r
+        //\r
+        // Now register the protocol.\r
+        //\r
+        NdisZeroMemory(&PChars, sizeof(NDIS_PROTOCOL_CHARACTERISTICS));\r
+        PChars.MajorNdisVersion = PASSTHRU_PROT_MAJOR_NDIS_VERSION;\r
+        PChars.MinorNdisVersion = PASSTHRU_PROT_MINOR_NDIS_VERSION;\r
+\r
+        //\r
+        // Make sure the protocol-name matches the service-name\r
+        // (from the INF) under which this protocol is installed.\r
+        // This is needed to ensure that NDIS can correctly determine\r
+        // the binding and call us to bind to miniports below.\r
+        //\r
+        NdisInitUnicodeString(&Name, L"Passthru");    // Protocol name\r
+        PChars.Name = Name;\r
+        PChars.OpenAdapterCompleteHandler = PtOpenAdapterComplete;\r
+        PChars.CloseAdapterCompleteHandler = PtCloseAdapterComplete;\r
+        PChars.SendCompleteHandler = PtSendComplete;\r
+        PChars.TransferDataCompleteHandler = PtTransferDataComplete;\r
+    \r
+        PChars.ResetCompleteHandler = PtResetComplete;\r
+        PChars.RequestCompleteHandler = PtRequestComplete;\r
+        PChars.ReceiveHandler = PtReceive;\r
+        PChars.ReceiveCompleteHandler = PtReceiveComplete;\r
+        PChars.StatusHandler = PtStatus;\r
+        PChars.StatusCompleteHandler = PtStatusComplete;\r
+        PChars.BindAdapterHandler = PtBindAdapter;\r
+        PChars.UnbindAdapterHandler = PtUnbindAdapter;\r
+        PChars.UnloadHandler = PtUnloadProtocol;\r
+\r
+        PChars.ReceivePacketHandler = PtReceivePacket;\r
+        PChars.PnPEventHandler= PtPNPHandler;\r
+\r
+        NdisRegisterProtocol(&Status,\r
+                             &ProtHandle,\r
+                             &PChars,\r
+                             sizeof(NDIS_PROTOCOL_CHARACTERISTICS));\r
+\r
+        if (Status != NDIS_STATUS_SUCCESS)\r
+        {\r
+            NdisIMDeregisterLayeredMiniport(DriverHandle);\r
+            break;\r
+        }\r
+\r
+        NdisIMAssociateMiniport(DriverHandle, ProtHandle);\r
+    }\r
+    while (FALSE);\r
+\r
+    if (Status != NDIS_STATUS_SUCCESS)\r
+    {\r
+        NdisTerminateWrapper(NdisWrapperHandle, NULL);\r
+    }\r
+\r
+    return(Status);\r
+}\r
+\r
+\r
+NDIS_STATUS\r
+PtRegisterDevice(\r
+    VOID\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Register an ioctl interface - a device object to be used for this\r
+    purpose is created by NDIS when we call NdisMRegisterDevice.\r
+\r
+    This routine is called whenever a new miniport instance is\r
+    initialized. However, we only create one global device object,\r
+    when the first miniport instance is initialized. This routine\r
+    handles potential race conditions with PtDeregisterDevice via\r
+    the ControlDeviceState and MiniportCount variables.\r
+\r
+    NOTE: do not call this from DriverEntry; it will prevent the driver\r
+    from being unloaded (e.g. on uninstall).\r
+\r
+Arguments:\r
+\r
+    None\r
+\r
+Return Value:\r
+\r
+    NDIS_STATUS_SUCCESS unless NdisMRegisterDevice is attempted and fails.\r
+\r
+--*/\r
+{\r
+    NDIS_STATUS            Status = NDIS_STATUS_SUCCESS;\r
+    UNICODE_STRING         DeviceName;\r
+    UNICODE_STRING         DeviceLinkUnicodeString;\r
+    PDRIVER_DISPATCH       DispatchTable[IRP_MJ_MAXIMUM_FUNCTION+1];\r
+\r
+    DBGPRINT(("==>PtRegisterDevice\n"));\r
+\r
+    NdisAcquireSpinLock(&GlobalLock);\r
+\r
+    ++MiniportCount;\r
+    \r
+    if (1 == MiniportCount)\r
+    {\r
+        ASSERT(ControlDeviceState != PS_DEVICE_STATE_CREATING);\r
+\r
+        //\r
+        // Another thread could be running PtDeregisterDevice on\r
+        // behalf of another miniport instance. If so, wait for\r
+        // it to exit.\r
+        //\r
+        while (ControlDeviceState != PS_DEVICE_STATE_READY)\r
+        {\r
+            NdisReleaseSpinLock(&GlobalLock);\r
+            NdisMSleep(1);\r
+            NdisAcquireSpinLock(&GlobalLock);\r
+        }\r
+\r
+        ControlDeviceState = PS_DEVICE_STATE_CREATING;\r
+\r
+        NdisReleaseSpinLock(&GlobalLock);\r
+\r
+    \r
+        NdisZeroMemory(DispatchTable, (IRP_MJ_MAXIMUM_FUNCTION+1) * sizeof(PDRIVER_DISPATCH));\r
+\r
+        DispatchTable[IRP_MJ_CREATE] = PtDispatch;\r
+        DispatchTable[IRP_MJ_CLEANUP] = PtDispatch;\r
+        DispatchTable[IRP_MJ_CLOSE] = PtDispatch;\r
+        DispatchTable[IRP_MJ_DEVICE_CONTROL] = PtDispatch;\r
+        \r
+\r
+        NdisInitUnicodeString(&DeviceName, NTDEVICE_STRING);\r
+        NdisInitUnicodeString(&DeviceLinkUnicodeString, LINKNAME_STRING);\r
+\r
+        //\r
+        // Create a device object and register our dispatch handlers\r
+        //\r
+        \r
+        Status = NdisMRegisterDevice(\r
+                    NdisWrapperHandle, \r
+                    &DeviceName,\r
+                    &DeviceLinkUnicodeString,\r
+                    &DispatchTable[0],\r
+                    &ControlDeviceObject,\r
+                    &NdisDeviceHandle\r
+                    );\r
+\r
+        NdisAcquireSpinLock(&GlobalLock);\r
+\r
+        ControlDeviceState = PS_DEVICE_STATE_READY;\r
+    }\r
+\r
+    NdisReleaseSpinLock(&GlobalLock);\r
+\r
+    DBGPRINT(("<==PtRegisterDevice: %x\n", Status));\r
+\r
+    return (Status);\r
+}\r
+\r
+\r
+NTSTATUS\r
+PtDispatch(\r
+    IN PDEVICE_OBJECT    DeviceObject,\r
+    IN PIRP              Irp\r
+    )\r
+/*++\r
+Routine Description:\r
+\r
+    Process IRPs sent to this device. Each IRP is completed immediately.\r
+\r
+Arguments:\r
+\r
+    DeviceObject - pointer to a device object (unused here)\r
+    Irp      - pointer to an I/O Request Packet\r
+\r
+Return Value:\r
+\r
+    NTSTATUS - STATUS_SUCCESS always - change this when adding\r
+    real code to handle ioctls.\r
+\r
+--*/\r
+{\r
+    PIO_STACK_LOCATION  irpStack;\r
+    NTSTATUS            status = STATUS_SUCCESS;\r
+\r
+    UNREFERENCED_PARAMETER(DeviceObject);\r
+    \r
+    DBGPRINT(("==>Pt Dispatch\n"));\r
+    irpStack = IoGetCurrentIrpStackLocation(Irp);\r
+      \r
+\r
+    switch (irpStack->MajorFunction)\r
+    {\r
+        case IRP_MJ_CREATE:\r
+            break;\r
+            \r
+        case IRP_MJ_CLEANUP:\r
+            break;\r
+            \r
+        case IRP_MJ_CLOSE:\r
+            break;        \r
+            \r
+        case IRP_MJ_DEVICE_CONTROL:\r
+            //\r
+            // Add code here to handle ioctl commands sent to passthru.\r
+            //\r
+            break;        \r
+        default:\r
+            break;\r
+    }\r
+\r
+    Irp->IoStatus.Status = status;\r
+    IoCompleteRequest(Irp, IO_NO_INCREMENT);\r
+\r
+    DBGPRINT(("<== Pt Dispatch\n"));\r
+\r
+    return status;\r
+\r
+} \r
+\r
+\r
+NDIS_STATUS\r
+PtDeregisterDevice(\r
+    VOID\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Deregister the ioctl interface. This is called whenever a miniport\r
+    instance is halted. When the last miniport instance is halted, we\r
+    request NDIS to delete the device object.\r
+\r
+Arguments:\r
+\r
+    None (operates on the global NdisDeviceHandle set by NdisMRegisterDevice)\r
+\r
+Return Value:\r
+\r
+    NDIS_STATUS_SUCCESS, or the status returned by NdisMDeregisterDevice\r
+\r
+--*/\r
+{\r
+    NDIS_STATUS Status = NDIS_STATUS_SUCCESS;\r
+\r
+    DBGPRINT(("==>PassthruDeregisterDevice\n"));\r
+\r
+    NdisAcquireSpinLock(&GlobalLock);\r
+\r
+    ASSERT(MiniportCount > 0);\r
+\r
+    --MiniportCount;\r
+    \r
+    if (0 == MiniportCount)\r
+    {\r
+        //\r
+        // All miniport instances have been halted. Deregister\r
+        // the control device.\r
+        //\r
+\r
+        ASSERT(ControlDeviceState == PS_DEVICE_STATE_READY);\r
+\r
+        //\r
+        // Block PtRegisterDevice() while we release the control\r
+        // device lock and deregister the device.\r
+        // \r
+        ControlDeviceState = PS_DEVICE_STATE_DELETING;\r
+\r
+        NdisReleaseSpinLock(&GlobalLock);\r
+\r
+        if (NdisDeviceHandle != NULL)\r
+        {\r
+            Status = NdisMDeregisterDevice(NdisDeviceHandle);\r
+            NdisDeviceHandle = NULL;\r
+        }\r
+\r
+        NdisAcquireSpinLock(&GlobalLock);\r
+        ControlDeviceState = PS_DEVICE_STATE_READY;\r
+    }\r
+\r
+    NdisReleaseSpinLock(&GlobalLock);\r
+\r
+    DBGPRINT(("<== PassthruDeregisterDevice: %x\n", Status));\r
+    return Status;\r
+    \r
+}\r
+\r
+VOID\r
+PtUnload(\r
+    IN PDRIVER_OBJECT        DriverObject\r
+    )\r
+//\r
+// PassThru driver unload function: calls PtUnloadProtocol, deregisters the\r
+// layered miniport, then frees GlobalLock. DriverObject is unused.\r
+{\r
+    UNREFERENCED_PARAMETER(DriverObject);\r
+    \r
+    DBGPRINT(("PtUnload: entered\n"));\r
+    \r
+    PtUnloadProtocol();\r
+    \r
+    NdisIMDeregisterLayeredMiniport(DriverHandle);\r
+    \r
+    NdisFreeSpinLock(&GlobalLock);\r
+\r
+    DBGPRINT(("PtUnload: done!\n"));\r
+}\r
+\r
diff --git a/original_passthru/passthru.h b/original_passthru/passthru.h
new file mode 100644 (file)
index 0000000..badde8a
--- /dev/null
@@ -0,0 +1,477 @@
+/*++\r
+\r
+Copyright (c) 1992-2000  Microsoft Corporation\r
+\r
+Module Name:\r
+\r
+    passthru.h\r
+\r
+Abstract:\r
+\r
+    Ndis Intermediate Miniport driver sample. This is a passthru driver.\r
+\r
+Author:\r
+\r
+Environment:\r
+\r
+\r
+Revision History:\r
+\r
\r
+--*/\r
+\r
+#ifdef NDIS51_MINIPORT\r
+#define PASSTHRU_MAJOR_NDIS_VERSION            5\r
+#define PASSTHRU_MINOR_NDIS_VERSION            1\r
+#else\r
+#define PASSTHRU_MAJOR_NDIS_VERSION            4\r
+#define PASSTHRU_MINOR_NDIS_VERSION            0\r
+#endif\r
+\r
+#ifdef NDIS51\r
+#define PASSTHRU_PROT_MAJOR_NDIS_VERSION    5\r
+#define PASSTHRU_PROT_MINOR_NDIS_VERSION    0\r
+#else\r
+#define PASSTHRU_PROT_MAJOR_NDIS_VERSION    4\r
+#define PASSTHRU_PROT_MINOR_NDIS_VERSION    0\r
+#endif\r
+\r
+#define MAX_BUNDLEID_LENGTH 50\r
+\r
+#define TAG 'ImPa'\r
+#define WAIT_INFINITE 0\r
+\r
+\r
+\r
+//advance declaration\r
+typedef struct _ADAPT ADAPT, *PADAPT;\r
+\r
+DRIVER_INITIALIZE DriverEntry;\r
+extern\r
+NTSTATUS\r
+DriverEntry(\r
+    IN PDRIVER_OBJECT            DriverObject,\r
+    IN PUNICODE_STRING           RegistryPath\r
+    );\r
+\r
+DRIVER_DISPATCH PtDispatch;\r
+NTSTATUS\r
+PtDispatch(\r
+    IN PDEVICE_OBJECT            DeviceObject,\r
+    IN PIRP                      Irp\r
+    );\r
+\r
+NDIS_STATUS\r
+PtRegisterDevice(\r
+    VOID\r
+    );\r
+\r
+NDIS_STATUS\r
+PtDeregisterDevice(\r
+    VOID\r
+   );\r
+\r
+DRIVER_UNLOAD PtUnload;\r
+VOID\r
+PtUnloadProtocol(\r
+    VOID\r
+    );\r
+\r
+//\r
+// Protocol proto-types\r
+//\r
+extern\r
+VOID\r
+PtOpenAdapterComplete(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN NDIS_STATUS                Status,\r
+    IN NDIS_STATUS                OpenErrorStatus\r
+    );\r
+\r
+extern\r
+VOID\r
+PtCloseAdapterComplete(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN NDIS_STATUS                Status\r
+    );\r
+\r
+extern\r
+VOID\r
+PtResetComplete(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN NDIS_STATUS                Status\r
+    );\r
+\r
+extern\r
+VOID\r
+PtRequestComplete(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN PNDIS_REQUEST              NdisRequest,\r
+    IN NDIS_STATUS                Status\r
+    );\r
+\r
+extern\r
+VOID\r
+PtStatus(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN NDIS_STATUS                GeneralStatus,\r
+    IN PVOID                      StatusBuffer,\r
+    IN UINT                       StatusBufferSize\r
+    );\r
+\r
+extern\r
+VOID\r
+PtStatusComplete(\r
+    IN NDIS_HANDLE                ProtocolBindingContext\r
+    );\r
+\r
+extern\r
+VOID\r
+PtSendComplete(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN PNDIS_PACKET               Packet,\r
+    IN NDIS_STATUS                Status\r
+    );\r
+\r
+extern\r
+VOID\r
+PtTransferDataComplete(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN PNDIS_PACKET               Packet,\r
+    IN NDIS_STATUS                Status,\r
+    IN UINT                       BytesTransferred\r
+    );\r
+\r
+extern\r
+NDIS_STATUS\r
+PtReceive(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN NDIS_HANDLE                MacReceiveContext,\r
+    IN PVOID                      HeaderBuffer,\r
+    IN UINT                       HeaderBufferSize,\r
+    IN PVOID                      LookAheadBuffer,\r
+    IN UINT                       LookaheadBufferSize,\r
+    IN UINT                       PacketSize\r
+    );\r
+\r
+extern\r
+VOID\r
+PtReceiveComplete(\r
+    IN NDIS_HANDLE                ProtocolBindingContext\r
+    );\r
+\r
+extern\r
+INT\r
+PtReceivePacket(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN PNDIS_PACKET               Packet\r
+    );\r
+\r
+extern\r
+VOID\r
+PtBindAdapter(\r
+    OUT PNDIS_STATUS              Status,\r
+    IN  NDIS_HANDLE               BindContext,\r
+    IN  PNDIS_STRING              DeviceName,\r
+    IN  PVOID                     SystemSpecific1,\r
+    IN  PVOID                     SystemSpecific2\r
+    );\r
+\r
+extern\r
+VOID\r
+PtUnbindAdapter(\r
+    OUT PNDIS_STATUS              Status,\r
+    IN  NDIS_HANDLE               ProtocolBindingContext,\r
+    IN  NDIS_HANDLE               UnbindContext\r
+    );\r
+    \r
+VOID\r
+PtUnload(\r
+    IN PDRIVER_OBJECT             DriverObject\r
+    );\r
+\r
+\r
+\r
+extern \r
+NDIS_STATUS\r
+PtPNPHandler(\r
+    IN NDIS_HANDLE                ProtocolBindingContext,\r
+    IN PNET_PNP_EVENT             pNetPnPEvent\r
+    );\r
+\r
+\r
+\r
+\r
+NDIS_STATUS\r
+PtPnPNetEventReconfigure(\r
+    IN PADAPT            pAdapt,\r
+    IN PNET_PNP_EVENT    pNetPnPEvent\r
+    );    \r
+\r
+NDIS_STATUS \r
+PtPnPNetEventSetPower (\r
+    IN PADAPT                    pAdapt,\r
+    IN PNET_PNP_EVENT            pNetPnPEvent\r
+    );\r
+    \r
+\r
+//\r
+// Miniport proto-types\r
+//\r
+NDIS_STATUS\r
+MPInitialize(\r
+    OUT PNDIS_STATUS             OpenErrorStatus,\r
+    OUT PUINT                    SelectedMediumIndex,\r
+    IN PNDIS_MEDIUM              MediumArray,\r
+    IN UINT                      MediumArraySize,\r
+    IN NDIS_HANDLE               MiniportAdapterHandle,\r
+    IN NDIS_HANDLE               WrapperConfigurationContext\r
+    );\r
+\r
+VOID\r
+MPSendPackets(\r
+    IN NDIS_HANDLE                MiniportAdapterContext,\r
+    IN PPNDIS_PACKET              PacketArray,\r
+    IN UINT                       NumberOfPackets\r
+    );\r
+\r
+NDIS_STATUS\r
+MPSend(\r
+    IN NDIS_HANDLE                MiniportAdapterContext,\r
+    IN PNDIS_PACKET               Packet,\r
+    IN UINT                       Flags\r
+    );\r
+\r
+NDIS_STATUS\r
+MPQueryInformation(\r
+    IN NDIS_HANDLE                MiniportAdapterContext,\r
+    IN NDIS_OID                   Oid,\r
+    IN PVOID                      InformationBuffer,\r
+    IN ULONG                      InformationBufferLength,\r
+    OUT PULONG                    BytesWritten,\r
+    OUT PULONG                    BytesNeeded\r
+    );\r
+\r
+NDIS_STATUS\r
+MPSetInformation(\r
+    IN NDIS_HANDLE                                      MiniportAdapterContext,\r
+    IN NDIS_OID                                         Oid,\r
+    __in_bcount(InformationBufferLength) IN PVOID       InformationBuffer,\r
+    IN ULONG                                            InformationBufferLength,\r
+    OUT PULONG                                          BytesRead,\r
+    OUT PULONG                                          BytesNeeded\r
+    );\r
+\r
+VOID\r
+MPReturnPacket(\r
+    IN NDIS_HANDLE                MiniportAdapterContext,\r
+    IN PNDIS_PACKET               Packet\r
+    );\r
+\r
+NDIS_STATUS\r
+MPTransferData(\r
+    OUT PNDIS_PACKET              Packet,\r
+    OUT PUINT                     BytesTransferred,\r
+    IN NDIS_HANDLE                MiniportAdapterContext,\r
+    IN NDIS_HANDLE                MiniportReceiveContext,\r
+    IN UINT                       ByteOffset,\r
+    IN UINT                       BytesToTransfer\r
+    );\r
+\r
+VOID\r
+MPHalt(\r
+    IN NDIS_HANDLE                MiniportAdapterContext\r
+    );\r
+\r
+\r
+VOID\r
+MPQueryPNPCapabilities(  \r
+    OUT PADAPT                    MiniportProtocolContext, \r
+    OUT PNDIS_STATUS              Status\r
+    );\r
+\r
+\r
+#ifdef NDIS51_MINIPORT\r
+\r
+VOID\r
+MPCancelSendPackets(\r
+    IN NDIS_HANDLE            MiniportAdapterContext,\r
+    IN PVOID                  CancelId\r
+    );\r
+\r
+VOID\r
+MPAdapterShutdown(\r
+    IN NDIS_HANDLE                MiniportAdapterContext\r
+    );\r
+\r
+VOID\r
+MPDevicePnPEvent(\r
+    IN NDIS_HANDLE                MiniportAdapterContext,\r
+    IN NDIS_DEVICE_PNP_EVENT      DevicePnPEvent,\r
+    IN PVOID                      InformationBuffer,\r
+    IN ULONG                      InformationBufferLength\r
+    );\r
+\r
+#endif // NDIS51_MINIPORT\r
+\r
+VOID\r
+MPFreeAllPacketPools(\r
+    IN PADAPT                    pAdapt\r
+    );\r
+\r
+\r
+VOID\r
+MPProcessSetPowerOid(\r
+    IN OUT PNDIS_STATUS                             pNdisStatus,\r
+    IN PADAPT                                       pAdapt,\r
+    __in_bcount(InformationBufferLength) IN PVOID   InformationBuffer,\r
+    IN ULONG                                        InformationBufferLength,\r
+    OUT PULONG                                      BytesRead,\r
+    OUT PULONG                                      BytesNeeded\r
+    );\r
+\r
+VOID\r
+PtReferenceAdapt(\r
+    IN PADAPT     pAdapt\r
+    );\r
+\r
+BOOLEAN\r
+PtDereferenceAdapt(\r
+    IN PADAPT     pAdapt\r
+    );\r
+\r
+// No DbgPrint's in the Free version of the driver. Fmt is a parenthesized\r
+// argument list: DBGPRINT(("x=%d\n", x)). The DBG form expands to a { }\r
+// block, so do not use it as the unbraced body of an if that has an else.\r
+#if DBG\r
+\r
+#define DBGPRINT(Fmt)                                        \\r
+    {                                                        \\r
+        DbgPrint("Passthru: ");                                \\r
+        DbgPrint Fmt;                                        \\r
+    }\r
+\r
+#else // if DBG\r
+\r
+#define DBGPRINT(Fmt)                                            \r
+\r
+#endif // if DBG \r
+\r
+#define    NUM_PKTS_IN_POOL    256\r
+\r
+\r
+//\r
+// Protocol reserved part of a sent packet that is allocated by us.\r
+//\r
+typedef struct _SEND_RSVD\r
+{\r
+    PNDIS_PACKET    OriginalPkt;\r
+} SEND_RSVD, *PSEND_RSVD;\r
+\r
+//\r
+// Miniport reserved part of a received packet that is allocated by\r
+// us. Note that this should fit into the MiniportReserved space\r
+// in an NDIS_PACKET.\r
+//\r
+typedef struct _RECV_RSVD\r
+{\r
+    PNDIS_PACKET    OriginalPkt;\r
+} RECV_RSVD, *PRECV_RSVD;\r
+\r
+// Compile-time check of the size constraint above: RECV_RSVD must not be larger than the MiniportReserved member of NDIS_PACKET.\r
+C_ASSERT(sizeof(RECV_RSVD) <= sizeof(((PNDIS_PACKET)0)->MiniportReserved));\r
+\r
+//\r
+// Event Codes related to the PassthruEvent Structure\r
+//\r
+// NOTE: both typedef names below are misspelled (PASSSTHRU_EVENT_CODE has\r
+// three S's, PPASTHRU_EVENT_CODE has one). PASSTHRU_EVENT declares its Code\r
+// member with the PASSSTHRU_EVENT_CODE spelling, so the names are kept as is.\r
+//\r
+\r
+typedef enum \r
+{\r
+    Passthru_Invalid,\r
+    Passthru_SetPower,\r
+    Passthru_Unbind\r
+\r
+} PASSSTHRU_EVENT_CODE, *PPASTHRU_EVENT_CODE; \r
+\r
+//\r
+// An NDIS event paired with a code (one of the PASSSTHRU_EVENT_CODE values)\r
+// giving the reason associated with the event.\r
+//\r
+\r
+typedef struct _PASSTHRU_EVENT\r
+{\r
+    NDIS_EVENT Event;\r
+    PASSSTHRU_EVENT_CODE Code;\r
+\r
+} PASSTHRU_EVENT, *PPASSTHRU_EVENT;\r
+\r
+\r
+//\r
+// Structure used by both the miniport and the protocol part of the intermediate driver\r
+// to represent an adapter and its corresponding lower bindings\r
+//\r
+typedef struct _ADAPT\r
+{\r
+    struct _ADAPT *                Next;\r
+    \r
+    NDIS_HANDLE                    BindingHandle;    // To the lower miniport\r
+    NDIS_HANDLE                    MiniportHandle;    // NDIS handle for miniport up-calls\r
+    NDIS_HANDLE                    SendPacketPoolHandle;\r
+    NDIS_HANDLE                    RecvPacketPoolHandle;\r
+    NDIS_STATUS                    Status;            // Open Status\r
+    NDIS_EVENT                     Event;            // Used by bind/halt for Open/Close Adapter synch.\r
+    NDIS_MEDIUM                    Medium;\r
+    NDIS_REQUEST                   Request;        // This is used to wrap a request coming down\r
+                                                // to us. This exploits the fact that requests\r
+                                                // are serialized down to us.\r
+    PULONG                         BytesNeeded;\r
+    PULONG                         BytesReadOrWritten;\r
+    BOOLEAN                        ReceivedIndicationFlags[32];\r
+    \r
+    BOOLEAN                        OutstandingRequests;      // TRUE iff a request is pending\r
+                                                        // at the miniport below\r
+    BOOLEAN                        QueuedRequest;            // TRUE iff a request is queued at\r
+                                                        // this IM miniport\r
+\r
+    BOOLEAN                        StandingBy;                // True - When the miniport or protocol is transitioning from a D0 to Standby (>D0) State\r
+                                                        // False - At all other times, - Flag is cleared after a transition to D0\r
+    BOOLEAN                        UnbindingInProcess;\r
+    NDIS_SPIN_LOCK                 Lock;\r
+\r
+    NDIS_DEVICE_POWER_STATE        MPDeviceState;            // Miniport's Device State \r
+    NDIS_DEVICE_POWER_STATE        PTDeviceState;            // Protocol's Device State \r
+    NDIS_STRING                    DeviceName;                // For initializing the miniport edge\r
+    NDIS_EVENT                     MiniportInitEvent;        // For blocking UnbindAdapter while\r
+                                                        // an IM Init is in progress.\r
+    BOOLEAN                        MiniportInitPending;    // TRUE iff IMInit in progress\r
+    NDIS_STATUS                    LastIndicatedStatus;    // The last indicated media status\r
+    NDIS_STATUS                    LatestUnIndicateStatus; // The latest suppressed media status\r
+    ULONG                          OutstandingSends;\r
+    LONG                           RefCount;\r
+    BOOLEAN                        MiniportIsHalted;\r
+} ADAPT, *PADAPT;\r
+\r
+extern    NDIS_HANDLE                        ProtHandle, DriverHandle;\r
+extern    NDIS_MEDIUM                        MediumArray[4];\r
+extern    PADAPT                             pAdaptList;\r
+extern    NDIS_SPIN_LOCK                     GlobalLock;\r
+\r
+\r
+// Expands to the MiniportHandle member of the ADAPT pointed to by _pAdapt.\r
+#define ADAPT_MINIPORT_HANDLE(_pAdapt)    ((_pAdapt)->MiniportHandle)\r
+#define ADAPT_DECR_PENDING_SENDS(_pAdapt)     \\r
+    {                                         \\r
+        NdisAcquireSpinLock(&(_pAdapt)->Lock);   \\r
+        (_pAdapt)->OutstandingSends--;           \\r
+        NdisReleaseSpinLock(&(_pAdapt)->Lock);   \\r
+    }\r
+\r
+//\r
+// Custom Macros to be used by the passthru driver \r
+//\r
+/*\r
+BOOLEAN\r
+IsIMDeviceStateOn(\r
+   PADAPT \r
+   )\r
+\r
+Non-zero only when both MPDeviceState and PTDeviceState of the given\r
+adapter equal NdisDeviceStateD0. The argument is expanded twice, so it\r
+must not have side effects.\r
+*/\r
+#define IsIMDeviceStateOn(_pP)        ((_pP)->MPDeviceState == NdisDeviceStateD0 && (_pP)->PTDeviceState == NdisDeviceStateD0 ) \r
+\r
diff --git a/original_passthru/passthru.htm b/original_passthru/passthru.htm
new file mode 100644 (file)
index 0000000..ee23278
--- /dev/null
@@ -0,0 +1,486 @@
+<html xmlns:v="urn:schemas-microsoft-com:vml"\r
+xmlns:o="urn:schemas-microsoft-com:office:office"\r
+xmlns:w="urn:schemas-microsoft-com:office:word"\r
+xmlns:st1="urn:schemas-microsoft-com:office:smarttags"\r
+xmlns="http://www.w3.org/TR/REC-html40">\r
+\r
+<head>\r
+<meta http-equiv=Content-Type content="text/html; charset=windows-1252">\r
+<meta name=ProgId content=Word.Document>\r
+<meta name=Generator content="Microsoft Word 10">\r
+<meta name=Originator content="Microsoft Word 10">\r
+<link rel=File-List href="passthru_files/filelist.xml">\r
+<title>passthru</title>\r
+<o:SmartTagType namespaceuri="urn:schemas-microsoft-com:office:smarttags"\r
+ name="place"/>\r
+<o:SmartTagType namespaceuri="urn:schemas-microsoft-com:office:smarttags"\r
+ name="PlaceType"/>\r
+<o:SmartTagType namespaceuri="urn:schemas-microsoft-com:office:smarttags"\r
+ name="PlaceName"/>\r
+<!--[if gte mso 9]><xml>\r
+ <w:WordDocument>\r
+  <w:SpellingState>Clean</w:SpellingState>\r
+  <w:GrammarState>Clean</w:GrammarState>\r
+  <w:Compatibility>\r
+   <w:UseFELayout/>\r
+  </w:Compatibility>\r
+  <w:BrowserLevel>MicrosoftInternetExplorer4</w:BrowserLevel>\r
+ </w:WordDocument>\r
+</xml><![endif]--><!--[if !mso]><object\r
+ classid="clsid:38481807-CA0E-42D2-BF39-B33AF135CC4D" id=ieooui></object>\r
+<style>\r
+st1\:*{behavior:url(#ieooui) }\r
+</style>\r
+<![endif]-->\r
+<style>\r
+<!--\r
+ /* Font Definitions */\r
+ @font-face\r
+       {font-family:"MS Mincho";\r
+       panose-1:2 2 6 9 4 2 5 8 3 4;\r
+       mso-font-alt:"\FF2D\FF33 \660E\671D";\r
+       mso-font-charset:128;\r
+       mso-generic-font-family:modern;\r
+       mso-font-pitch:fixed;\r
+       mso-font-signature:-1610612033 1757936891 16 0 131231 0;}\r
+@font-face\r
+       {font-family:Verdana;\r
+       panose-1:2 11 6 4 3 5 4 4 2 4;\r
+       mso-font-charset:0;\r
+       mso-generic-font-family:swiss;\r
+       mso-font-pitch:variable;\r
+       mso-font-signature:536871559 0 0 0 415 0;}\r
+@font-face\r
+       {font-family:"\@MS Mincho";\r
+       panose-1:2 2 6 9 4 2 5 8 3 4;\r
+       mso-font-charset:128;\r
+       mso-generic-font-family:modern;\r
+       mso-font-pitch:fixed;\r
+       mso-font-signature:-1610612033 1757936891 16 0 131231 0;}\r
+@font-face\r
+       {font-family:"MS Sans Serif";\r
+       panose-1:0 0 0 0 0 0 0 0 0 0;\r
+       mso-font-charset:0;\r
+       mso-generic-font-family:swiss;\r
+       mso-font-format:other;\r
+       mso-font-pitch:variable;\r
+       mso-font-signature:3 0 0 0 1 0;}\r
+ /* Style Definitions */\r
+ p.MsoNormal, li.MsoNormal, div.MsoNormal\r
+       {mso-style-parent:"";\r
+       margin:0in;\r
+       margin-bottom:.0001pt;\r
+       mso-pagination:widow-orphan;\r
+       font-size:12.0pt;\r
+       font-family:"Times New Roman";\r
+       mso-fareast-font-family:"Times New Roman";\r
+       color:black;}\r
+h2\r
+       {mso-margin-top-alt:auto;\r
+       margin-right:0in;\r
+       mso-margin-bottom-alt:auto;\r
+       margin-left:0in;\r
+       mso-pagination:widow-orphan;\r
+       mso-outline-level:2;\r
+       font-size:18.0pt;\r
+       font-family:"Times New Roman";\r
+       mso-fareast-font-family:"MS Mincho";\r
+       color:black;\r
+       font-weight:bold;}\r
+h3\r
+       {mso-margin-top-alt:auto;\r
+       margin-right:0in;\r
+       mso-margin-bottom-alt:auto;\r
+       margin-left:0in;\r
+       mso-pagination:widow-orphan;\r
+       mso-outline-level:3;\r
+       font-size:13.5pt;\r
+       font-family:"Times New Roman";\r
+       mso-fareast-font-family:"MS Mincho";\r
+       color:black;\r
+       font-weight:bold;}\r
+h4\r
+       {mso-margin-top-alt:auto;\r
+       margin-right:0in;\r
+       mso-margin-bottom-alt:auto;\r
+       margin-left:0in;\r
+       mso-pagination:widow-orphan;\r
+       mso-outline-level:4;\r
+       font-size:12.0pt;\r
+       font-family:"Times New Roman";\r
+       mso-fareast-font-family:"MS Mincho";\r
+       color:black;\r
+       font-weight:bold;}\r
+a:link, span.MsoHyperlink\r
+       {color:blue;\r
+       text-decoration:underline;\r
+       text-underline:single;}\r
+a:visited, span.MsoHyperlinkFollowed\r
+       {color:purple;\r
+       text-decoration:underline;\r
+       text-underline:single;}\r
+p\r
+       {mso-margin-top-alt:auto;\r
+       margin-right:0in;\r
+       mso-margin-bottom-alt:auto;\r
+       margin-left:0in;\r
+       mso-pagination:widow-orphan;\r
+       font-size:12.0pt;\r
+       font-family:"Times New Roman";\r
+       mso-fareast-font-family:"Times New Roman";\r
+       color:black;}\r
+pre\r
+       {margin:0in;\r
+       margin-bottom:.0001pt;\r
+       mso-pagination:widow-orphan;\r
+       tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt;\r
+       font-size:10.0pt;\r
+       font-family:"Courier New";\r
+       mso-fareast-font-family:"Courier New";\r
+       color:black;}\r
+span.SpellE\r
+       {mso-style-name:"";\r
+       mso-spl-e:yes;}\r
+span.GramE\r
+       {mso-style-name:"";\r
+       mso-gram-e:yes;}\r
+@page Section1\r
+       {size:8.5in 11.0in;\r
+       margin:1.0in 1.25in 1.0in 1.25in;\r
+       mso-header-margin:.5in;\r
+       mso-footer-margin:.5in;\r
+       mso-paper-source:0;}\r
+div.Section1\r
+       {page:Section1;}\r
+-->\r
+</style>\r
+<!--[if gte mso 10]>\r
+<style>\r
+ /* Style Definitions */\r
+ table.MsoNormalTable\r
+       {mso-style-name:"Table Normal";\r
+       mso-tstyle-rowband-size:0;\r
+       mso-tstyle-colband-size:0;\r
+       mso-style-noshow:yes;\r
+       mso-style-parent:"";\r
+       mso-padding-alt:0in 5.4pt 0in 5.4pt;\r
+       mso-para-margin:0in;\r
+       mso-para-margin-bottom:.0001pt;\r
+       mso-pagination:widow-orphan;\r
+       font-size:10.0pt;\r
+       font-family:"Times New Roman";}\r
+</style>\r
+<![endif]-->\r
+<meta name=Template content="C:\Program Files\Microsoft Office\Office\html.dot">\r
+<!--[if gte mso 9]><xml>\r
+ <o:shapedefaults v:ext="edit" spidmax="3074"/>\r
+</xml><![endif]--><!--[if gte mso 9]><xml>\r
+ <o:shapelayout v:ext="edit">\r
+  <o:idmap v:ext="edit" data="1"/>\r
+ </o:shapelayout></xml><![endif]-->\r
+</head>\r
+\r
+<body bgcolor=white lang=EN-US link=blue vlink=purple style='tab-interval:.5in'>\r
+\r
+<div class=Section1>\r
+\r
+<h2><a name=MYSAMPLE></a><a name=top></a><span style='mso-bookmark:MYSAMPLE'>\r
+\r
+<!doctype HTML>\r
+\r
+<span style='font-family:Verdana'><! ---------------- Snip Snip ---------------- >PASSTHRU.SYS\r
+- Sample NDIS Intermediate Driver</span></span><span style='font-family:Verdana'><o:p></o:p></span></h2>\r
+\r
+<h3><span style='font-family:Verdana'>SUMMARY<o:p></o:p></span></h3>\r
+\r
+<p><st1:place><st1:PlaceName><span class=SpellE><b><span style='font-family:\r
+  Verdana'>Passthru</span></b></span></st1:PlaceName><b><span style='font-family:\r
+ Verdana'> </span></b><st1:PlaceName><b><span style='font-family:Verdana'>Intermediate</span></b></st1:PlaceName><b><span\r
+ style='font-family:Verdana'> </span></b><st1:PlaceType><b><span\r
+  style='font-family:Verdana'>Miniport</span></b></st1:PlaceType></st1:place><b><span\r
+style='font-family:Verdana'> Driver<o:p></o:p></span></b></p>\r
+\r
+<p><span style='font-size:10.0pt;font-family:Verdana'>The <span class=SpellE>Passthru</span>\r
+sample is a do-nothing pass-through NDIS 5 driver that demonstrates the basic\r
+principles underlying an NDIS Intermediate Miniport (IM) driver. This driver\r
+exposes a virtual adapter for each binding to a real or virtual NDIS adapter.\r
+Protocols bind to these virtual adapters as if they are real adapters. <o:p></o:p></span></p>\r
+\r
+<p><span style='font-size:10.0pt;font-family:Verdana'>The <span class=SpellE>Passthru</span>\r
+driver re-packages and sends down all requests and sends submitted to this\r
+virtual adapter. The <span class=SpellE>Passthru</span> driver can be modified\r
+to change the data before passing it along. For example, it could\r
+encrypt/compress outgoing and decrypt/decompress incoming data.<o:p></o:p></span></p>\r
+\r
+<p><span class=SpellE><span style='font-size:10.0pt;font-family:Verdana'>Passthru</span></span><span\r
+style='font-size:10.0pt;font-family:Verdana'> also re-packages and indicates up\r
+all received data and status indications that it receives at its lower\r
+(protocol) edge.<o:p></o:p></span></p>\r
+\r
+<h3><span style='font-family:Verdana'>BUILDING THE SAMPLE<o:p></o:p></span></h3>\r
+\r
+<p><span style='font-size:10.0pt;font-family:Verdana'>Run the <b>build</b>\r
+command from this directory to build the sample&mdash;it creates the binary <span\r
+class=SpellE>Passthru.sys</span>. <o:p></o:p></span></p>\r
+\r
+<p><span style='font-size:10.0pt;font-family:Verdana'>To install this driver on\r
+Windows® 2000, use the PASSTHRU sample notification object and <span\r
+class=SpellE>INFs</span>, also found in this DDK.<o:p></o:p></span></p>\r
+\r
+<h3><span style='font-family:Verdana'>INSTALLING THE SAMPLE<o:p></o:p></span></h3>\r
+\r
+<p><span class=SpellE><span style='font-size:10.0pt;font-family:Verdana'>Passthru</span></span><span\r
+style='font-size:10.0pt;font-family:Verdana'> is installed as a service (called\r
+&ldquo;<span class=SpellE>Passthru</span> Driver&rdquo; in the supplied <span class=SpellE>INFs</span>/notification\r
+object). To install, follow the steps below.<o:p></o:p></span></p>\r
+\r
+<p><span style='font-size:10.0pt;font-family:Verdana'>Prepare a floppy disk (or\r
+installation directory) that contains these files: <span class=SpellE>netsf.inf</span>,\r
+<span class=SpellE>netsf_m.inf</span> and <span class=SpellE>passthru.sys</span>.<o:p></o:p></span></p>\r
+\r
+<p><span style='font-size:10.0pt;font-family:Verdana'>On the desktop,\r
+right-click the <b>My Network Places</b> icon and choose <b>Properties</b>. <o:p></o:p></span></p>\r
+\r
+<p><span style='font-size:10.0pt;font-family:Verdana'>Right-click on the\r
+relevant Local Area Connection icon and choose <b>Properties</b>. <o:p></o:p></span></p>\r
+\r
+<p><span style='font-size:10.0pt;font-family:Verdana'>Click <b>Install</b>,\r
+then <b>Service</b>, then <b>Add</b>, <span class=GramE>then</span> <b>Have Disk</b>.\r
+<o:p></o:p></span></p>\r
+\r
+<p><span style='font-size:10.0pt;font-family:Verdana'>Browse to the\r
+drive/directory containing the files listed above. Click <b>OK</b>. This should\r
+show &ldquo;<span class=SpellE>Passthru</span> Driver&rdquo; in a list of Network Services.\r
+Highlight this and click <b>OK</b>. This should install the <span class=SpellE>Passthru</span>\r
+driver. <o:p></o:p></span></p>\r
+\r
+<p><span style='font-size:10.0pt;font-family:Verdana'>Click <b>OK</b> or <span\r
+class=GramE><b>Yes</b></span> each time the system prompts with a warning\r
+regarding installation of unsigned files. This is necessary because binaries\r
+generated via the DDK build environment are not signed.<o:p></o:p></span></p>\r
+\r
+<p><span style='font-size:10.0pt;font-family:Verdana'>Two .INF files are needed\r
+rather than one because <span class=SpellE>Passthru</span> is installed both as\r
+a protocol and a miniport.<o:p></o:p></span></p>\r
+\r
+<h3><span style='font-family:Verdana'>CODE TOUR<o:p></o:p></span></h3>\r
+\r
+<h4><span style='font-family:Verdana'>File Manifest<o:p></o:p></span></h4>\r
+\r
+<pre><u>File<span style='mso-tab-count:2'>           </span>Description<o:p></o:p></u></pre><pre><o:p>&nbsp;</o:p></pre><pre><span\r
+class=SpellE>Makefile</span><span style='mso-tab-count:1'>       </span>Used during compilation to create the object and sys files</pre><pre><span\r
+class=SpellE>Miniport.c</span><span style='mso-tab-count:1'>     </span>Miniport related functions of the <span\r
+class=SpellE>passthru</span> driver</pre><pre><span class=SpellE>Netsf.inf</span><span\r
+style='mso-tab-count:1'>      </span>Installation INF for the service (protocol side installation)</pre><pre><span\r
+class=SpellE>Netsf_m.inf</span><span style='mso-tab-count:1'>    </span>Installation INF for the miniport (virtual device installation)</pre><pre><span\r
+class=SpellE>Passthru.c</span><span style='mso-tab-count:1'>     </span><span\r
+class=SpellE>DriverEntry</span> routine and any routines common to the <span\r
+class=SpellE>passthru</span> miniport and protocol </pre><pre><span\r
+class=SpellE>Passthru.h</span><span style='mso-tab-count:1'>     </span>Prototypes of all functions and data structures used by the <span\r
+class=SpellE>Passthru</span> driver</pre><pre>Passthru.htm<span\r
+style='mso-tab-count:1'>   </span>Documentation for the <span class=SpellE>Passthru</span> driver (this file)</pre><pre><span\r
+class=SpellE>Passthru.rc</span><span style='mso-tab-count:1'>    </span>Resource <span\r
+class=GramE>file</span> for the <span class=SpellE>Passthru</span> driver</pre><pre><span\r
+class=SpellE>Precomp.h</span><span style='mso-tab-count:1'>      </span><span\r
+class=SpellE>Precompile</span> header file</pre><pre><span class=SpellE>Protocol.c</span><span\r
+style='mso-tab-count:1'>     </span>Protocol related functions of the <span\r
+class=SpellE>Passthru</span> driver</pre><pre>Sources<span style='mso-tab-count:\r
+2'>        </span>List of source files that are compiled and linked to create the <span\r
+class=SpellE>passthru</span> driver. This can be modified to create binaries that operate on previous Windows versions (e.g. Windows 2000).</pre>\r
+\r
+<h4 style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-family:Verdana'>Programming Tour<o:p></o:p></span></h4>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>Basic steps in initializing and\r
+halting of <span class=SpellE>Passthru</span> driver:<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>1) During <span class=SpellE>DriverEntry</span>,\r
+the <span class=SpellE>Passthru</span> driver registers as a protocol and an\r
+Intermediate miniport driver.<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>2) Later on, NDIS calls <span\r
+class=SpellE>Passthru&rsquo;s</span> <span class=SpellE>BindAdapterHandler</span>, <span\r
+class=SpellE>PtBindAdapter</span>, for each underlying NDIS adapter to which it\r
+is configured to bind.<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>3) In the context of <span\r
+class=SpellE>BindAdapterHandler</span> and after successfully opening a binding\r
+to the underlying adapter, the <span class=SpellE>Passthru</span> driver\r
+queries the reserved keyword &quot;<span class=SpellE>UpperBindings</span>&quot;\r
+to get a list of device names for the virtual adapters that this particular\r
+binding is to expose. Since this driver implements a 1:1 relationship between\r
+lower bindings and virtual adapters, this list contains a single name. &ldquo;<span\r
+class=SpellE>Mux</span>&rdquo; IM drivers that expose multiple virtual adapters over\r
+a single underlying adapter will process multiple entries in <span\r
+class=SpellE>UpperBindings</span>.<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>4) For each device name, the <span\r
+class=SpellE>Passthru</span> driver calls <span class=SpellE>NdisIMInitializeDeviceInstanceEx</span>.<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>5) In response, NDIS will\r
+eventually call back <span class=SpellE>Passthru</span> miniport&rsquo;s <span\r
+class=SpellE>MiniportInitialize</span> entry point, <span class=SpellE>MPInitialize</span>.<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>6) After <span class=SpellE>MPInitialize</span>\r
+successfully returns, NDIS takes care of getting upper-layer protocols to bind\r
+to the newly created virtual adapter(s).<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>7) All requests and sends coming\r
+from upper-layer protocols for the <span class=SpellE>Passthru</span> miniport\r
+driver are repackaged and sent down to NDIS, to be passed to the underlying\r
+NDIS adapter.<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>8) All indications arriving from\r
+bindings to an underlying NDIS adapter are forwarded up as if they were generated\r
+from <span class=SpellE>Passthru&rsquo;s</span> virtual adapters.<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>9) NDIS calls the <span\r
+class=SpellE>Passthru</span> driver&rsquo;s <span class=SpellE>ProtocolUnbind</span>\r
+entry point to request it to close the binding between an underlying adapter\r
+and <span class=SpellE>Passthru</span> protocol. In processing this, the <span\r
+class=SpellE>Passthru</span> driver first calls <span class=SpellE>NdisIMDeInitializeDeviceInstance</span>\r
+for the virtual adapter(s) representing that particular binding.<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>10) NDIS in turn will close all\r
+the bindings between upper-layer protocols and virtual <span class=SpellE>Passthru</span>\r
+adapter.<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>11) After all the bindings are\r
+closed, NDIS calls the <span class=SpellE>Passthru</span> driver&rsquo;s <span\r
+class=SpellE>MiniportHalt</span> entry point (<span class=SpellE>MPHalt</span>)\r
+for the virtual adapter.<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>12) The <span class=SpellE>Passthru</span>\r
+protocol then closes the binding to the underlying adapter by calling <span\r
+class=SpellE>NdisCloseAdapter</span>, and completes the unbind request issued\r
+in step 9.<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>13) <b>Handling Power Management</b><o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>13.1 During initialization, the <span\r
+class=SpellE>Passthru</span> miniport should set the Attribute '<i>NDIS_ATTRIBUTE_NO_HALT_ON_SUSPEND</i>'\r
+in its call to <span class=SpellE>NdisMSetAttributesEx</span>. <o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>13.2 When the <span class=SpellE>Passthru</span>\r
+miniport is requested to report its Plug and Play capabilities\r
+(OID_PNP_CAPABILITIES), the <span class=SpellE>Passthru</span> miniport must\r
+pass the request to the underlying miniport. If this request succeeds, then the\r
+<span class=SpellE>Passthru</span> miniport should overwrite the following\r
+fields before successfully completing the original request: <o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>NDIS_DEVICE_POWER_STATE<span\r
+style='mso-tab-count:1'>          </span><span class=SpellE>MinMagicPacketWakeUp</span>\r
+= <span class=SpellE>NdisDeviceStateUnspecified</span>;<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>NDIS_DEVICE_POWER_STATE<span\r
+style='mso-tab-count:1'>          </span><span class=SpellE>MinPatternWakeUp</span>=\r
+<span class=SpellE>NdisDeviceStateUnspecified</span>;<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>NDIS_DEVICE_POWER_STATE<span\r
+style='mso-tab-count:1'>          </span><span class=SpellE>MinLinkChangeWakeUp</span>=<span\r
+class=SpellE>NdisDeviceStateUnspecified</span><o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>If the miniport below the <span\r
+class=SpellE>Passthru</span> protocol fails this request, then the status that\r
+was returned should be used to respond to the original request that was made to\r
+the <span class=SpellE>Passthru</span> miniport. <o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>13.3 OID_PNP_SET_POWER and OID_PNP_QUERY_POWER\r
+should not be passed to the miniport below the <span class=SpellE>Passthru</span>\r
+protocol, as those <span class=SpellE>miniports</span> will receive independent\r
+requests from NDIS.<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>13.4 NDIS calls the <span\r
+class=SpellE>Passthru</span> driver&rsquo;s <span class=SpellE>ProtocolPnPEvent</span>\r
+entry point (<span class=SpellE>PtPnPHandler</span>) whenever the underlying adapter\r
+is transitioned to a different power state. If the underlying adapter is\r
+transitioning to a low power state, the IM driver should wait for all\r
+outstanding sends and requests to complete.<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>14) <b>NDIS 5.1 Features</b><o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>14.1 All NDIS 5.1 features in <span\r
+class=SpellE>Passthru</span> are identified by #<span class=SpellE>ifdef</span>\r
+NDIS51 compiler directives. The following major features are illustrated (refer\r
+to the DDK documentation for more information on these):<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><b><span\r
+style='font-size:10.0pt;font-family:Verdana'>Packet stacking</span></b><span\r
+style='font-size:10.0pt;font-family:Verdana'>: this allows an IM driver to\r
+reuse a packet submitted to its protocol or miniport edge to forward data down\r
+(or up) to the adjacent layer.<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><b><span\r
+style='font-size:10.0pt;font-family:Verdana'>Canceling Sends</span></b><span\r
+style='font-size:10.0pt;font-family:Verdana'>: <span class=SpellE>Passthru</span>\r
+propagates send cancellations from protocols above it to lower <span\r
+class=SpellE>miniports</span>.<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><b><span\r
+style='font-size:10.0pt;font-family:Verdana'>PnP Event Propagation</span></b><span\r
+style='font-size:10.0pt;font-family:Verdana'>: <span class=SpellE>Passthru</span>\r
+propagates PnP events arriving at its protocol (lower) edge to higher layer\r
+protocols that are bound to its virtual adapter.<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+class=SpellE><b><span style='font-size:10.0pt;font-family:Verdana'>NdisQueryPendingIOCount</span></b></span><span\r
+style='font-size:10.0pt;font-family:Verdana'>: <span class=SpellE>Passthru</span>\r
+uses this new API to determine if any I/O operations are in progress on its\r
+lower binding.<o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'>15) For Win2K SP2 and <span\r
+class=SpellE>WinXP</span>, the <span class=SpellE>Passthru</span> sample no\r
+longer requires a Notify Object. The Notify Object has been removed. <o:p></o:p></span></p>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:10.0pt;font-family:Verdana'><span\r
+style='mso-spacerun:yes'> </span><o:p></o:p></span></p>\r
+\r
+<p align=center style='text-align:center;tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><a\r
+href="#top"><span style='font-size:10.0pt;font-family:Verdana'>Top of page</span></a><span\r
+style='font-size:10.0pt;font-family:Verdana'> <o:p></o:p></span></p>\r
+\r
+<table class=MsoNormalTable border=0 cellspacing=0 cellpadding=0 width=624\r
+ style='width:6.5in;mso-cellspacing:0in;mso-padding-alt:0in 0in 0in 0in'>\r
+ <tr style='mso-yfti-irow:0;mso-yfti-lastrow:yes;height:1.5pt'>\r
+  <td style='background:aqua;padding:.75pt .75pt .75pt .75pt;height:1.5pt'>\r
+  <p class=MsoNormal><o:p>&nbsp;</o:p></p>\r
+  </td>\r
+ </tr>\r
+</table>\r
+\r
+<p style='tab-stops:45.8pt 91.6pt 137.4pt 183.2pt 229.0pt 274.8pt 320.6pt 366.4pt 412.2pt 458.0pt 503.8pt 549.6pt 595.4pt 641.2pt 687.0pt 732.8pt'><span\r
+style='font-size:7.5pt;font-family:"MS Sans Serif"'>© 1999 Microsoft\r
+Corporation</span><span style='font-size:10.0pt;font-family:Verdana'> <o:p></o:p></span></p>\r
+\r
+</div>\r
+\r
+</body>\r
+\r
+</html>\r
+\r
diff --git a/original_passthru/passthru.rc b/original_passthru/passthru.rc
new file mode 100644 (file)
index 0000000..6ae427c
--- /dev/null
@@ -0,0 +1,41 @@
+#include <windows.h>\r
+#include <ntverp.h>\r
+\r
+/*-----------------------------------------------*/\r
+/* the following lines are specific to this file */\r
+/*-----------------------------------------------*/\r
+\r
+/* VER_FILETYPE, VER_FILESUBTYPE, VER_FILEDESCRIPTION_STR\r
+ * and VER_INTERNALNAME_STR must be defined before including COMMON.VER\r
+ * The strings don't need a '\0', since common.ver has them.\r
+ */\r
+#define        VER_FILETYPE    VFT_DRV\r
+/* possible values:            VFT_UNKNOWN\r
+                               VFT_APP\r
+                               VFT_DLL\r
+                               VFT_DRV\r
+                               VFT_FONT\r
+                               VFT_VXD\r
+                               VFT_STATIC_LIB\r
+*/\r
+#define        VER_FILESUBTYPE VFT2_DRV_NETWORK\r
+/* possible values             VFT2_UNKNOWN\r
+                               VFT2_DRV_PRINTER\r
+                               VFT2_DRV_KEYBOARD\r
+                               VFT2_DRV_LANGUAGE\r
+                               VFT2_DRV_DISPLAY\r
+                               VFT2_DRV_MOUSE\r
+                               VFT2_DRV_NETWORK\r
+                               VFT2_DRV_SYSTEM\r
+                               VFT2_DRV_INSTALLABLE\r
+                               VFT2_DRV_SOUND\r
+                               VFT2_DRV_COMM\r
+*/\r
+#define VER_FILEDESCRIPTION_STR     "Sample NDIS 4.0 Intermediate Miniport Driver"\r
+#define VER_INTERNALNAME_STR        "PASSTHRU.SYS"\r
+#define VER_ORIGINALFILENAME_STR    "PASSTHRU.SYS"\r
+#define VER_LANGNEUTRAL\r
+\r
+#include "common.ver"\r
+\r
+\1a\r
diff --git a/original_passthru/precomp.h b/original_passthru/precomp.h
new file mode 100644 (file)
index 0000000..b2870d1
--- /dev/null
@@ -0,0 +1,11 @@
+#pragma warning(disable:4214)   // bit field types other than int\r
+\r
+#pragma warning(disable:4201)   // nameless struct/union\r
+#pragma warning(disable:4115)   // named type definition in parentheses\r
+#pragma warning(disable:4127)   // conditional expression is constant\r
+#pragma warning(disable:4054)   // cast of function pointer to PVOID\r
+#pragma warning(disable:4244)   // conversion from 'int' to 'BOOLEAN', possible loss of data\r
+\r
+#include <ndis.h>\r
+#include "passthru.h"\r
+\r
diff --git a/original_passthru/protocol.c b/original_passthru/protocol.c
new file mode 100644 (file)
index 0000000..213924c
--- /dev/null
@@ -0,0 +1,1626 @@
+/*++\r
+\r
+Copyright(c) 1992-2000  Microsoft Corporation\r
+\r
+Module Name:\r
+\r
+    protocol.c\r
+\r
+Abstract:\r
+\r
+    Ndis Intermediate Miniport driver sample. This is a passthru driver.\r
+\r
+Author:\r
+\r
+Environment:\r
+\r
+\r
+Revision History:\r
+\r
+\r
+--*/\r
+\r
+\r
+#include "precomp.h"\r
+#pragma hdrstop\r
+\r
+#define MAX_PACKET_POOL_SIZE 0x0000FFFF\r
+#define MIN_PACKET_POOL_SIZE 0x000000FF\r
+\r
+//\r
+// NDIS version as 0xMMMMmmmm, where M=Major/m=minor (0x00050001 = 5.1); \r
+// initially unknown (0)\r
+// \r
+ULONG       NdisDotSysVersion =  0x0;\r
+\r
+\r
+#define NDIS_SYS_VERSION_51       0x00050001\r
+\r
+\r
+VOID\r
+PtBindAdapter(\r
+    OUT PNDIS_STATUS            Status,\r
+    IN  NDIS_HANDLE             BindContext,\r
+    IN  PNDIS_STRING            DeviceName,\r
+    IN  PVOID                   SystemSpecific1,\r
+    IN  PVOID                   SystemSpecific2\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Called by NDIS to bind to a miniport below.\r
+\r
+    The routine reads the "NdisVersion" (first call only) and "UpperBindings"\r
+    configuration values, allocates and initializes the ADAPT structure with a\r
+    copy of the IM device name, allocates the send and receive packet pools,\r
+    opens the adapter below, and finally asks NDIS to initialize our miniport\r
+    edge via NdisIMInitializeDeviceInstanceEx.\r
+\r
+Arguments:\r
+\r
+    Status            - Return status of bind here.\r
+    BindContext        - Can be passed to NdisCompleteBindAdapter if this call is pended.\r
+                         (Unreferenced in this routine.)\r
+    DeviceName         - Device name to bind to. This is passed to NdisOpenAdapter.\r
+    SystemSpecific1    - Can be passed to NdisOpenProtocolConfiguration to read per-binding information\r
+    SystemSpecific2    - Unused\r
+\r
+Return Value:\r
+\r
+    NDIS_STATUS_PENDING    if this call is pended. In this case call NdisCompleteBindAdapter\r
+    to complete.\r
+    Anything else          Completes this call synchronously\r
+\r
+    When NdisOpenAdapter returns NDIS_STATUS_PENDING this routine does not\r
+    pend the bind: it blocks on pAdapt->Event and then reports pAdapt->Status\r
+    through *Status.\r
+\r
+    NOTE(review): the failure path at the end of this routine only acts when\r
+    pAdapt->BindingHandle is non-NULL (it closes the binding and calls\r
+    PtDereferenceAdapt). If a step fails after pAdapt has been allocated but\r
+    before the adapter is opened, nothing in this routine frees pAdapt, its\r
+    spin lock or its packet pools - confirm whether that is a leak.\r
+\r
+--*/\r
+{\r
+    NDIS_HANDLE                     ConfigHandle = NULL;\r
+    PNDIS_CONFIGURATION_PARAMETER   Param;\r
+    NDIS_STRING                     DeviceStr = NDIS_STRING_CONST("UpperBindings");\r
+    NDIS_STRING                     NdisVersionStr = NDIS_STRING_CONST("NdisVersion");\r
+    PADAPT                          pAdapt = NULL;\r
+    NDIS_STATUS                     Sts;\r
+    UINT                            MediumIndex;\r
+    ULONG                           TotalSize;\r
+    BOOLEAN                         NoCleanUpNeeded = FALSE;\r
+\r
+\r
+    UNREFERENCED_PARAMETER(BindContext);\r
+    UNREFERENCED_PARAMETER(SystemSpecific2);\r
+    \r
+    DBGPRINT(("==> Protocol BindAdapter\n"));\r
+\r
+    do\r
+    {\r
+        //\r
+        // Access the configuration section for our binding-specific\r
+        // parameters.\r
+        //\r
+        NdisOpenProtocolConfiguration(Status,\r
+                                       &ConfigHandle,\r
+                                       SystemSpecific1);\r
+\r
+        if (*Status != NDIS_STATUS_SUCCESS)\r
+        {\r
+            break;\r
+        }\r
+        if (NdisDotSysVersion == 0)\r
+        {\r
+            NdisReadConfiguration(Status,\r
+                                  &Param,\r
+                                  ConfigHandle,\r
+                                  &NdisVersionStr,        // "NdisVersion"\r
+                                  NdisParameterInteger);\r
+            if (*Status != NDIS_STATUS_SUCCESS)\r
+            {\r
+                break;\r
+            }\r
+            \r
+            NdisDotSysVersion = Param->ParameterData.IntegerData;\r
+        }\r
+                        \r
+\r
+        //\r
+        // Read the "UpperBindings" reserved key that contains a list\r
+        // of device names representing our miniport instances corresponding\r
+        // to this lower binding. Since this is a 1:1 IM driver, this key\r
+        // contains exactly one name.\r
+        //\r
+        // If we want to implement a N:1 mux driver (N adapter instances\r
+        // over a single lower binding), then UpperBindings will be a\r
+        // MULTI_SZ containing a list of device names - we would loop through\r
+        // this list, calling NdisIMInitializeDeviceInstanceEx once for\r
+        // each name in it.\r
+        //\r
+        NdisReadConfiguration(Status,\r
+                              &Param,\r
+                              ConfigHandle,\r
+                              &DeviceStr,\r
+                              NdisParameterString);\r
+        if (*Status != NDIS_STATUS_SUCCESS)\r
+        {\r
+            break;\r
+        }\r
+\r
+        //\r
+        // Allocate memory for the Adapter structure. This represents both the\r
+        // protocol context as well as the adapter structure when the miniport\r
+        // is initialized.\r
+        //\r
+        // In addition to the base structure, allocate space for the device\r
+        // instance string.\r
+        //\r
+        TotalSize = sizeof(ADAPT) + Param->ParameterData.StringData.MaximumLength;\r
+\r
+        NdisAllocateMemoryWithTag(&pAdapt, TotalSize, TAG);\r
+\r
+        if (pAdapt == NULL)\r
+        {\r
+            *Status = NDIS_STATUS_RESOURCES;\r
+            break;\r
+        }\r
+\r
+        //\r
+        // Initialize the adapter structure. We copy in the IM device\r
+        // name as well, because we may need to use it in a call to\r
+        // NdisIMCancelInitializeDeviceInstance. The string returned\r
+        // by NdisReadConfiguration is active (i.e. available) only\r
+        // for the duration of this call to our BindAdapter handler.\r
+        //\r
+        NdisZeroMemory(pAdapt, TotalSize);\r
+        pAdapt->DeviceName.MaximumLength = Param->ParameterData.StringData.MaximumLength;\r
+        pAdapt->DeviceName.Length = Param->ParameterData.StringData.Length;\r
+        pAdapt->DeviceName.Buffer = (PWCHAR)((ULONG_PTR)pAdapt + sizeof(ADAPT));\r
+        NdisMoveMemory(pAdapt->DeviceName.Buffer,\r
+                       Param->ParameterData.StringData.Buffer,\r
+                       Param->ParameterData.StringData.MaximumLength);\r
+\r
+\r
+\r
+        NdisInitializeEvent(&pAdapt->Event);\r
+        NdisAllocateSpinLock(&pAdapt->Lock);\r
+\r
+        //\r
+        // Allocate a packet pool for sends. We need this to pass sends down.\r
+        // We cannot use the same packet descriptor that came down to our send\r
+        // handler (see also NDIS 5.1 packet stacking).\r
+        //\r
+        NdisAllocatePacketPoolEx(Status,\r
+                                   &pAdapt->SendPacketPoolHandle,\r
+                                   MIN_PACKET_POOL_SIZE,\r
+                                   MAX_PACKET_POOL_SIZE - MIN_PACKET_POOL_SIZE,\r
+                                   sizeof(SEND_RSVD));\r
+\r
+        if (*Status != NDIS_STATUS_SUCCESS)\r
+        {\r
+            break;\r
+        }\r
+\r
+        //\r
+        // Allocate a packet pool for receives. We need this to indicate receives.\r
+        // Same consideration as sends (see also NDIS 5.1 packet stacking).\r
+        //\r
+        NdisAllocatePacketPoolEx(Status,\r
+                                   &pAdapt->RecvPacketPoolHandle,\r
+                                   MIN_PACKET_POOL_SIZE,\r
+                                   MAX_PACKET_POOL_SIZE - MIN_PACKET_POOL_SIZE,\r
+                                   PROTOCOL_RESERVED_SIZE_IN_PACKET);\r
+\r
+        if (*Status != NDIS_STATUS_SUCCESS)\r
+        {\r
+            break;\r
+        }\r
+\r
+        //\r
+        // Now open the adapter below and complete the initialization\r
+        //\r
+        NdisOpenAdapter(Status,\r
+                          &Sts,\r
+                          &pAdapt->BindingHandle,\r
+                          &MediumIndex,\r
+                          MediumArray,\r
+                          sizeof(MediumArray)/sizeof(NDIS_MEDIUM),\r
+                          ProtHandle,\r
+                          pAdapt,\r
+                          DeviceName,\r
+                          0,\r
+                          NULL);\r
+\r
+        if (*Status == NDIS_STATUS_PENDING)\r
+        {\r
+            NdisWaitEvent(&pAdapt->Event, 0);\r
+            *Status = pAdapt->Status;\r
+        }\r
+\r
+        if (*Status != NDIS_STATUS_SUCCESS)\r
+        {\r
+            break;\r
+        }\r
+        PtReferenceAdapt(pAdapt);\r
+\r
+#pragma prefast(suppress: __WARNING_POTENTIAL_BUFFER_OVERFLOW, "Ndis guarantees MediumIndex to be within bounds");\r
+        pAdapt->Medium = MediumArray[MediumIndex];\r
+\r
+        //\r
+        // Now ask NDIS to initialize our miniport (upper) edge.\r
+        // Set the flag below to synchronize with a possible call\r
+        // to our protocol Unbind handler that may come in before\r
+        // our miniport initialization happens.\r
+        //\r
+        pAdapt->MiniportInitPending = TRUE;\r
+        NdisInitializeEvent(&pAdapt->MiniportInitEvent);\r
+\r
+        PtReferenceAdapt(pAdapt);\r
+\r
+        *Status = NdisIMInitializeDeviceInstanceEx(DriverHandle,\r
+                                           &pAdapt->DeviceName,\r
+                                           pAdapt);\r
+\r
+        if (*Status != NDIS_STATUS_SUCCESS)\r
+        {\r
+            if (pAdapt->MiniportIsHalted == TRUE)\r
+            {\r
+                NoCleanUpNeeded = TRUE;\r
+            }\r
+            \r
+            DBGPRINT(("BindAdapter: Adapt %p, IMInitializeDeviceInstance error %x\n",\r
+                pAdapt, *Status));\r
+            \r
+            if (PtDereferenceAdapt(pAdapt))\r
+            {\r
+                pAdapt = NULL;\r
+            }\r
+            \r
+            break;\r
+        }\r
+        \r
+        PtDereferenceAdapt(pAdapt);\r
+\r
+    } while(FALSE);\r
+\r
+    //\r
+    // Close the configuration handle now - see comments above with\r
+    // the call to NdisIMInitializeDeviceInstanceEx.\r
+    //\r
+    if (ConfigHandle != NULL)\r
+    {\r
+        NdisCloseConfiguration(ConfigHandle);\r
+    }\r
+\r
+    if ((*Status != NDIS_STATUS_SUCCESS) && (NoCleanUpNeeded == FALSE))\r
+    {\r
+        if (pAdapt != NULL)\r
+        {\r
+            if (pAdapt->BindingHandle != NULL)\r
+            {\r
+                NDIS_STATUS    LocalStatus;\r
+\r
+                //\r
+                // Close the binding we opened above.\r
+                //\r
+\r
+                NdisResetEvent(&pAdapt->Event);\r
+                \r
+                NdisCloseAdapter(&LocalStatus, pAdapt->BindingHandle);\r
+                pAdapt->BindingHandle = NULL;\r
+\r
+                if (LocalStatus == NDIS_STATUS_PENDING)\r
+                {\r
+                     NdisWaitEvent(&pAdapt->Event, 0);\r
+                     LocalStatus = pAdapt->Status;\r
+\r
+                     \r
+                }\r
+                if (PtDereferenceAdapt(pAdapt))\r
+                {\r
+                     pAdapt = NULL;\r
+                }\r
+            }\r
+        }\r
+    }\r
+\r
+\r
+    DBGPRINT(("<== Protocol BindAdapter: pAdapt %p, Status %x\n", pAdapt, *Status));\r
+}\r
+\r
+\r
+VOID\r
+PtOpenAdapterComplete(\r
+    IN  NDIS_HANDLE             ProtocolBindingContext,\r
+    IN  NDIS_STATUS             Status,\r
+    IN  NDIS_STATUS             OpenErrorStatus\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Completion routine for NdisOpenAdapter issued from within PtBindAdapter.\r
+    Stores the completion status in pAdapt->Status and signals pAdapt->Event,\r
+    which PtBindAdapter waits on when NdisOpenAdapter returns\r
+    NDIS_STATUS_PENDING, thereby unblocking the caller.\r
+\r
+Arguments:\r
+\r
+    ProtocolBindingContext    Pointer to the adapter\r
+    Status                    Status of the NdisOpenAdapter call\r
+    OpenErrorStatus           Secondary status (ignored by us).\r
+\r
+Return Value:\r
+\r
+    None\r
+\r
+--*/\r
+{\r
+    PADAPT      pAdapt =(PADAPT)ProtocolBindingContext;\r
+    \r
+    UNREFERENCED_PARAMETER(OpenErrorStatus);\r
+    \r
+    DBGPRINT(("==> PtOpenAdapterComplete: Adapt %p, Status %x\n", pAdapt, Status));\r
+    pAdapt->Status = Status;\r
+    NdisSetEvent(&pAdapt->Event);\r
+}\r
+\r
+\r
+VOID\r
+PtUnbindAdapter(\r
+    OUT PNDIS_STATUS           Status,\r
+    IN  NDIS_HANDLE            ProtocolBindingContext,\r
+    IN  NDIS_HANDLE            UnbindContext\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Called by NDIS when we are required to unbind from the adapter below.\r
+    This function shares functionality with the miniport's HaltHandler.\r
+    The code should ensure that NdisCloseAdapter and NdisFreeMemory are called\r
+    only once between the two functions.\r
+\r
+Arguments:\r
+\r
+    Status                    Placeholder for return status\r
+    ProtocolBindingContext    Pointer to the adapter structure\r
+    UnbindContext             Context for NdisUnbindComplete() if this pends\r
+                              (unreferenced in this routine)\r
+\r
+Return Value:\r
+\r
+    None. *Status receives the result of NdisIMDeInitializeDeviceInstance\r
+    (any error is reported as NDIS_STATUS_FAILURE) when the miniport handle\r
+    is set; otherwise the result of NdisCloseAdapter, or NDIS_STATUS_FAILURE\r
+    if the binding handle is NULL as well.\r
+\r
+--*/\r
+{\r
+    PADAPT         pAdapt =(PADAPT)ProtocolBindingContext;\r
+    NDIS_STATUS    LocalStatus;\r
+\r
+    UNREFERENCED_PARAMETER(UnbindContext);\r
+    \r
+    DBGPRINT(("==> PtUnbindAdapter: Adapt %p\n", pAdapt));\r
+\r
+    //\r
+    // Set the flag that the miniport below is unbinding, so the request handlers will\r
+    // fail any request coming later. A request that is still queued is failed here.\r
+    // \r
+    NdisAcquireSpinLock(&pAdapt->Lock);\r
+    pAdapt->UnbindingInProcess = TRUE;\r
+    if (pAdapt->QueuedRequest == TRUE)\r
+    {\r
+        pAdapt->QueuedRequest = FALSE;\r
+        NdisReleaseSpinLock(&pAdapt->Lock);\r
+\r
+        PtRequestComplete(pAdapt,\r
+                         &pAdapt->Request,\r
+                         NDIS_STATUS_FAILURE );\r
+\r
+    }\r
+    else\r
+    {\r
+        NdisReleaseSpinLock(&pAdapt->Lock);\r
+    }\r
+#ifndef WIN9X\r
+    //\r
+    // Check if we had called NdisIMInitializeDeviceInstanceEx and\r
+    // we are awaiting a call to MiniportInitialize.\r
+    //\r
+    if (pAdapt->MiniportInitPending == TRUE)\r
+    {\r
+        //\r
+        // Try to cancel the pending IMInit process.\r
+        //\r
+        LocalStatus = NdisIMCancelInitializeDeviceInstance(\r
+                        DriverHandle,\r
+                        &pAdapt->DeviceName);\r
+\r
+        if (LocalStatus == NDIS_STATUS_SUCCESS)\r
+        {\r
+            //\r
+            // Successfully cancelled IM Initialization; our\r
+            // Miniport Initialize routine will not be called\r
+            // for this device.\r
+            //\r
+            pAdapt->MiniportInitPending = FALSE;\r
+            ASSERT(pAdapt->MiniportHandle == NULL);\r
+        }\r
+        else\r
+        {\r
+            //\r
+            // Our Miniport Initialize routine will be called\r
+            // (may be running on another thread at this time).\r
+            // Wait for it to finish.\r
+            //\r
+            NdisWaitEvent(&pAdapt->MiniportInitEvent, 0);\r
+            ASSERT(pAdapt->MiniportInitPending == FALSE);\r
+        }\r
+\r
+    }\r
+#endif // !WIN9X\r
+\r
+    //\r
+    // Call NDIS to remove our device-instance. We do most of the work\r
+    // inside the HaltHandler.\r
+    //\r
+    // The Handle will be NULL if our miniport Halt Handler has been called or\r
+    // if the IM device was never initialized\r
+    //\r
+    \r
+    if (pAdapt->MiniportHandle != NULL)\r
+    {\r
+        *Status = NdisIMDeInitializeDeviceInstance(pAdapt->MiniportHandle);\r
+\r
+        if (*Status != NDIS_STATUS_SUCCESS)\r
+        {\r
+            *Status = NDIS_STATUS_FAILURE;\r
+        }\r
+    }\r
+    else\r
+    {\r
+        //\r
+        // No miniport instance exists, so the cleanup falls to us:\r
+        // close the binding below us\r
+        // and release the memory allocated.\r
+        //\r
+        \r
+        if(pAdapt->BindingHandle != NULL)\r
+        {\r
+            NdisResetEvent(&pAdapt->Event);\r
+\r
+            NdisCloseAdapter(Status, pAdapt->BindingHandle);\r
+\r
+            //\r
+            // Wait for it to complete\r
+            //\r
+            if(*Status == NDIS_STATUS_PENDING)\r
+            {\r
+                 NdisWaitEvent(&pAdapt->Event, 0);\r
+                 *Status = pAdapt->Status;\r
+            }\r
+            pAdapt->BindingHandle = NULL;\r
+        }\r
+        else\r
+        {\r
+            //\r
+            // MiniportHandle and BindingHandle should never both be NULL.\r
+            //\r
+            *Status = NDIS_STATUS_FAILURE;\r
+            ASSERT(0);\r
+        }\r
+\r
+        //\r
+        //    Free the memory here, if it was not released earlier (by calling the HaltHandler)\r
+        //\r
+        MPFreeAllPacketPools(pAdapt);\r
+        NdisFreeSpinLock(&pAdapt->Lock);\r
+        NdisFreeMemory(pAdapt, 0, 0);\r
+    }\r
+\r
+    DBGPRINT(("<== PtUnbindAdapter: Adapt %p\n", pAdapt));\r
+}\r
+\r
+VOID\r
+PtUnloadProtocol(\r
+    VOID\r
+)\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Deregisters the protocol edge: if ProtHandle is non-NULL, calls\r
+    NdisDeregisterProtocol on it and clears ProtHandle so that a repeated\r
+    call does nothing. The status returned by NdisDeregisterProtocol is\r
+    not examined.\r
+\r
+Arguments:\r
+\r
+    None\r
+\r
+Return Value:\r
+\r
+    None\r
+\r
+--*/\r
+{\r
+    NDIS_STATUS Status;\r
+\r
+    if (ProtHandle != NULL)\r
+    {\r
+        NdisDeregisterProtocol(&Status, ProtHandle);\r
+        ProtHandle = NULL;\r
+    }\r
+\r
+    DBGPRINT(("PtUnloadProtocol: done!\n"));\r
+}\r
+\r
+\r
+\r
+VOID\r
+PtCloseAdapterComplete(\r
+    IN    NDIS_HANDLE            ProtocolBindingContext,\r
+    IN    NDIS_STATUS            Status\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Completion for the CloseAdapter call. Stores the completion status in\r
+    pAdapt->Status and signals pAdapt->Event; PtBindAdapter (failure path)\r
+    and PtUnbindAdapter wait on that event when NdisCloseAdapter returns\r
+    NDIS_STATUS_PENDING.\r
+\r
+Arguments:\r
+\r
+    ProtocolBindingContext    Pointer to the adapter structure\r
+    Status                    Completion status\r
+\r
+Return Value:\r
+\r
+    None.\r
+\r
+--*/\r
+{\r
+    PADAPT      pAdapt =(PADAPT)ProtocolBindingContext;\r
+\r
+    DBGPRINT(("CloseAdapterComplete: Adapt %p, Status %x\n", pAdapt, Status));\r
+    pAdapt->Status = Status;\r
+    NdisSetEvent(&pAdapt->Event);\r
+}\r
+\r
+\r
+VOID\r
+PtResetComplete(\r
+    IN  NDIS_HANDLE            ProtocolBindingContext,\r
+    IN  NDIS_STATUS            Status\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Completion for the reset. This driver never issues a reset, so the\r
+    routine only asserts; it performs no other work.\r
+\r
+Arguments:\r
+\r
+    ProtocolBindingContext    Pointer to the adapter structure (unused)\r
+    Status                    Completion status (unused)\r
+\r
+Return Value:\r
+\r
+    None.\r
+\r
+--*/\r
+{\r
+\r
+    UNREFERENCED_PARAMETER(ProtocolBindingContext);\r
+    UNREFERENCED_PARAMETER(Status);\r
+    //\r
+    // We never issue a reset, so we should not be here.\r
+    //\r
+    ASSERT(0);\r
+}\r
+\r
+\r
+VOID\r
+PtRequestComplete(\r
+    IN  NDIS_HANDLE            ProtocolBindingContext,\r
+    IN  PNDIS_REQUEST          NdisRequest,\r
+    IN  NDIS_STATUS            Status\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Completion handler for the previously posted request. All OIDs\r
+    are completed by and sent to the same miniport that they were requested for.\r
+    The request is completed upward with NdisMQueryInformationComplete or\r
+    NdisMSetInformationComplete after the byte counts have been copied out.\r
+\r
+    A successful OID_PNP_CAPABILITIES query is first post-processed by\r
+    MPQueryPNPCapabilities. OID_PNP_QUERY_POWER and OID_PNP_SET_POWER are\r
+    asserted never to reach this routine.\r
+\r
+    NOTE(review): an earlier version of this text said that for\r
+    OID_PNP_QUERY_POWER the data structure is returned with all entries =\r
+    NdisDeviceStateUnspecified; that presumably describes what\r
+    MPQueryPNPCapabilities does for OID_PNP_CAPABILITIES - confirm.\r
+\r
+    The OID is read from pAdapt->Request rather than from the NdisRequest\r
+    argument.\r
+\r
+Arguments:\r
+\r
+    ProtocolBindingContext    Pointer to the adapter structure\r
+    NdisRequest               The posted request\r
+    Status                    Completion status\r
+\r
+Return Value:\r
+\r
+    None\r
+\r
+--*/\r
+{\r
+    PADAPT        pAdapt = (PADAPT)ProtocolBindingContext;\r
+    NDIS_OID      Oid = pAdapt->Request.DATA.SET_INFORMATION.Oid ;\r
+\r
+    //\r
+    // Since our request is not outstanding anymore\r
+    //\r
+    ASSERT(pAdapt->OutstandingRequests == TRUE);\r
+\r
+    pAdapt->OutstandingRequests = FALSE;\r
+\r
+    //\r
+    // Complete the Set or Query, and fill in the buffer for OID_PNP_CAPABILITIES, if need be.\r
+    //\r
+    switch (NdisRequest->RequestType)\r
+    {\r
+      case NdisRequestQueryInformation:\r
+\r
+        //\r
+        // We never pass OID_PNP_QUERY_POWER down.\r
+        //\r
+        ASSERT(Oid != OID_PNP_QUERY_POWER);\r
+\r
+        if ((Oid == OID_PNP_CAPABILITIES) && (Status == NDIS_STATUS_SUCCESS))\r
+        {\r
+            MPQueryPNPCapabilities(pAdapt, &Status);\r
+        }\r
+        *pAdapt->BytesReadOrWritten = NdisRequest->DATA.QUERY_INFORMATION.BytesWritten;\r
+        *pAdapt->BytesNeeded = NdisRequest->DATA.QUERY_INFORMATION.BytesNeeded;\r
+\r
+        if (((Oid == OID_GEN_MAC_OPTIONS) \r
+              && (Status == NDIS_STATUS_SUCCESS))\r
+              && (NdisDotSysVersion >= NDIS_SYS_VERSION_51))\r
+        {\r
+            //\r
+            // Only do this on Windows XP or greater (NDIS.SYS v 5.1); \r
+            // do not do in Windows 2000 (NDIS.SYS v 5.0))\r
+            //\r
+                \r
+            //\r
+            // Remove the no-loopback bit from mac-options. In essence we are\r
+            // telling NDIS that we can handle loopback. We don't, but the\r
+            // interface below us does. If we do not do this, then loopback\r
+            // processing happens both below us and above us. This is wasteful\r
+            // at best and if Netmon is running, it will see multiple copies\r
+            // of loopback packets when sniffing above us.\r
+            //\r
+            // Only the lowest miniport in a stack of layered miniports should\r
+            // ever report this bit set to NDIS.\r
+            //\r
+            *(PULONG)NdisRequest->DATA.QUERY_INFORMATION.InformationBuffer &= ~NDIS_MAC_OPTION_NO_LOOPBACK;\r
+        }\r
+\r
+        NdisMQueryInformationComplete(pAdapt->MiniportHandle,\r
+                                      Status);\r
+        break;\r
+\r
+      case NdisRequestSetInformation:\r
+\r
+        ASSERT( Oid != OID_PNP_SET_POWER);\r
+\r
+        *pAdapt->BytesReadOrWritten = NdisRequest->DATA.SET_INFORMATION.BytesRead;\r
+        *pAdapt->BytesNeeded = NdisRequest->DATA.SET_INFORMATION.BytesNeeded;\r
+        NdisMSetInformationComplete(pAdapt->MiniportHandle,\r
+                                    Status);\r
+        break;\r
+\r
+      default:\r
+        ASSERT(0);\r
+        break;\r
+    }\r
+    \r
+}\r
+\r
+\r
+VOID\r
+PtStatus(\r
+    IN  NDIS_HANDLE         ProtocolBindingContext,\r
+    IN  NDIS_STATUS         GeneralStatus,\r
+    IN  PVOID               StatusBuffer,\r
+    IN  UINT                StatusBufferSize\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Status handler for the lower-edge(protocol).\r
+\r
+    When the miniport edge exists and both MPDeviceState and PTDeviceState\r
+    are D0, the indication is forwarded with NdisMIndicateStatus; a media\r
+    connect/disconnect status is also recorded in LastIndicatedStatus.\r
+    Otherwise the indication is dropped, except that a media\r
+    connect/disconnect status is remembered in LatestUnIndicateStatus when\r
+    the miniport handle is set.\r
+\r
+Arguments:\r
+\r
+    ProtocolBindingContext    Pointer to the adapter structure\r
+    GeneralStatus             Status code\r
+    StatusBuffer              Status buffer\r
+    StatusBufferSize          Size of the status buffer\r
+\r
+Return Value:\r
+\r
+    None\r
+\r
+--*/\r
+{\r
+    PADAPT      pAdapt = (PADAPT)ProtocolBindingContext;\r
+\r
+    //\r
+    // Pass up this indication only if the upper edge miniport is initialized\r
+    // and powered on. Also ignore indications that might be sent by the lower\r
+    // miniport when it isn't at D0.\r
+    //\r
+    if ((pAdapt->MiniportHandle != NULL)  &&\r
+        (pAdapt->MPDeviceState == NdisDeviceStateD0) &&\r
+        (pAdapt->PTDeviceState == NdisDeviceStateD0))    \r
+    {\r
+        if ((GeneralStatus == NDIS_STATUS_MEDIA_CONNECT) || \r
+            (GeneralStatus == NDIS_STATUS_MEDIA_DISCONNECT))\r
+        {\r
+            \r
+            pAdapt->LastIndicatedStatus = GeneralStatus;\r
+        }\r
+        NdisMIndicateStatus(pAdapt->MiniportHandle,\r
+                            GeneralStatus,\r
+                            StatusBuffer,\r
+                            StatusBufferSize);\r
+    }\r
+    //\r
+    // Not indicated up: remember the latest media status we withheld\r
+    //\r
+    else\r
+    {\r
+        if ((pAdapt->MiniportHandle != NULL) && \r
+        ((GeneralStatus == NDIS_STATUS_MEDIA_CONNECT) || \r
+            (GeneralStatus == NDIS_STATUS_MEDIA_DISCONNECT)))\r
+        {\r
+            pAdapt->LatestUnIndicateStatus = GeneralStatus;\r
+        }\r
+    }\r
+    \r
+}\r
+\r
+\r
+VOID\r
+PtStatusComplete(\r
+    IN NDIS_HANDLE            ProtocolBindingContext\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Status-complete handler for the lower edge (protocol). Forwards the\r
+    notification with NdisMIndicateStatusComplete under the same condition\r
+    PtStatus uses for forwarding: the miniport handle is set and both\r
+    MPDeviceState and PTDeviceState are D0. Otherwise it does nothing.\r
+\r
+Arguments:\r
+\r
+    ProtocolBindingContext    Pointer to the adapter structure\r
+\r
+Return Value:\r
+\r
+    None\r
+\r
+--*/\r
+{\r
+    PADAPT      pAdapt = (PADAPT)ProtocolBindingContext;\r
+\r
+    //\r
+    // Pass up this indication only if the upper edge miniport is initialized\r
+    // and powered on. Also ignore indications that might be sent by the lower\r
+    // miniport when it isn't at D0.\r
+    //\r
+    if ((pAdapt->MiniportHandle != NULL)  &&\r
+        (pAdapt->MPDeviceState == NdisDeviceStateD0) &&\r
+        (pAdapt->PTDeviceState == NdisDeviceStateD0))    \r
+    {\r
+        NdisMIndicateStatusComplete(pAdapt->MiniportHandle);\r
+    }\r
+}\r
+\r
+\r
+VOID\r
+PtSendComplete(\r
+    IN  NDIS_HANDLE            ProtocolBindingContext,\r
+    IN  PNDIS_PACKET           Packet,\r
+    IN  NDIS_STATUS            Status\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Called by NDIS when the miniport below has completed a send. We should\r
+    complete the corresponding upper-edge send this represents.\r
+\r
+    If the packet came from our own send pool, the original packet is\r
+    recovered from the SEND_RSVD area in ProtocolReserved, our packet is\r
+    freed, and the original is completed. Under NDIS51, a packet that is not\r
+    from our pool is completed directly. In both cases the pending-send\r
+    count is decremented.\r
+\r
+Arguments:\r
+\r
+    ProtocolBindingContext - Points to ADAPT structure\r
+    Packet - Low level packet being completed\r
+    Status - status of send\r
+\r
+Return Value:\r
+\r
+    None\r
+\r
+--*/\r
+{\r
+    PADAPT            pAdapt = (PADAPT)ProtocolBindingContext;\r
+    PNDIS_PACKET      Pkt; \r
+    NDIS_HANDLE       PoolHandle;\r
+\r
+#ifdef NDIS51\r
+    //\r
+    // Packet stacking:\r
+    //\r
+    // Determine if the packet we are completing is the one we allocated. If so, then\r
+    // get the original packet from the reserved area, complete it, and free the\r
+    // allocated packet. If this is the packet that was sent down to us, then just\r
+    // complete it.\r
+    //\r
+    PoolHandle = NdisGetPoolFromPacket(Packet);\r
+    if (PoolHandle != pAdapt->SendPacketPoolHandle)\r
+    {\r
+        //\r
+        // We had passed down a packet belonging to the protocol above us.\r
+        //\r
+        // DBGPRINT(("PtSendComp: Adapt %p, Stacked Packet %p\n", pAdapt, Packet));\r
+\r
+        NdisMSendComplete(pAdapt->MiniportHandle,\r
+                          Packet,\r
+                          Status);\r
+    }\r
+    else\r
+#endif // NDIS51\r
+    {\r
+        PSEND_RSVD        SendRsvd;\r
+\r
+        SendRsvd = (PSEND_RSVD)(Packet->ProtocolReserved);\r
+        Pkt = SendRsvd->OriginalPkt;\r
+    \r
+#ifndef WIN9X\r
+        NdisIMCopySendCompletePerPacketInfo (Pkt, Packet);\r
+#endif\r
+    \r
+        NdisDprFreePacket(Packet);\r
+\r
+        NdisMSendComplete(pAdapt->MiniportHandle,\r
+                                 Pkt,\r
+                                 Status);\r
+    }\r
+    //\r
+    // Decrease the outstanding send count\r
+    //\r
+    ADAPT_DECR_PENDING_SENDS(pAdapt);\r
+}       \r
+\r
+\r
+VOID\r
+PtTransferDataComplete(\r
+    IN  NDIS_HANDLE         ProtocolBindingContext,\r
+    IN  PNDIS_PACKET        Packet,\r
+    IN  NDIS_STATUS         Status,\r
+    IN  UINT                BytesTransferred\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Entry point called by NDIS to indicate completion of a call by us\r
+    to NdisTransferData. The completion is passed up unchanged with\r
+    NdisMTransferDataComplete, provided the miniport handle is set;\r
+    otherwise it is dropped.\r
+\r
+    See notes under SendComplete.\r
+\r
+Arguments:\r
+\r
+    ProtocolBindingContext    Pointer to the adapter structure\r
+    Packet                    Packet passed through to NdisMTransferDataComplete\r
+    Status                    Completion status\r
+    BytesTransferred          Byte count passed through to NdisMTransferDataComplete\r
+\r
+Return Value:\r
+\r
+    None\r
+\r
+--*/\r
+{\r
+    PADAPT      pAdapt =(PADAPT)ProtocolBindingContext;\r
+\r
+    if(pAdapt->MiniportHandle)\r
+    {\r
+        NdisMTransferDataComplete(pAdapt->MiniportHandle,\r
+                                  Packet,\r
+                                  Status,\r
+                                  BytesTransferred);\r
+    }\r
+}\r
+\r
+\r
+NDIS_STATUS\r
+PtReceive(\r
+    IN  NDIS_HANDLE         ProtocolBindingContext,\r
+    IN  NDIS_HANDLE         MacReceiveContext,\r
+    IN  PVOID               HeaderBuffer,\r
+    IN  UINT                HeaderBufferSize,\r
+    IN  PVOID               LookAheadBuffer,\r
+    IN  UINT                LookAheadBufferSize,\r
+    IN  UINT                PacketSize\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Handle receive data indicated up by the miniport below. We pass\r
+    it along to the protocol above us.\r
+\r
+    If the miniport below indicates packets, NDIS would more\r
+    likely call us at our ReceivePacket handler. However we\r
+    might be called here in certain situations even though\r
+    the miniport below has indicated a receive packet, e.g.\r
+    if the miniport had set packet status to NDIS_STATUS_RESOURCES.\r
+        \r
+Arguments:\r
+\r
+    <see DDK ref page for ProtocolReceive>\r
+\r
+Return Value:\r
+\r
+    NDIS_STATUS_SUCCESS if we processed the receive successfully,\r
+    NDIS_STATUS_XXX error code if we discarded it.\r
+\r
+--*/\r
+{\r
+    PADAPT            pAdapt = (PADAPT)ProtocolBindingContext;\r
+    PNDIS_PACKET      MyPacket, Packet = NULL;\r
+    NDIS_STATUS       Status = NDIS_STATUS_SUCCESS;\r
+    ULONG             Proc = KeGetCurrentProcessorNumber();      \r
+    \r
+    if ((!pAdapt->MiniportHandle) || (pAdapt->MPDeviceState > NdisDeviceStateD0))\r
+    {\r
+        Status = NDIS_STATUS_FAILURE;\r
+    }\r
+    else do\r
+    {\r
+        //\r
+        // Get at the packet, if any, indicated up by the miniport below.\r
+        //\r
+        Packet = NdisGetReceivedPacket(pAdapt->BindingHandle, MacReceiveContext);\r
+        if (Packet != NULL)\r
+        {\r
+            //\r
+            // The miniport below did indicate up a packet. Use information\r
+            // from that packet to construct a new packet to indicate up.\r
+            //\r
+\r
+#ifdef NDIS51\r
+            //\r
+            // NDIS 5.1 NOTE: Do not reuse the original packet in indicating\r
+            // up a receive, even if there is sufficient packet stack space.\r
+            // If we had to do so, we would have had to overwrite the\r
+            // status field in the original packet to NDIS_STATUS_RESOURCES,\r
+            // and it is not allowed for protocols to overwrite this field\r
+            // in received packets.\r
+            //\r
+#endif // NDIS51\r
+\r
+            //\r
+            // Get a packet off the pool and indicate that up\r
+            //\r
+            NdisDprAllocatePacket(&Status,\r
+                                &MyPacket,\r
+                                pAdapt->RecvPacketPoolHandle);\r
+\r
+            if (Status == NDIS_STATUS_SUCCESS)\r
+            {\r
+                //\r
+                // Make our packet point to data from the original\r
+                // packet. NOTE: this works only because we are\r
+                // indicating a receive directly from the context of\r
+                // our receive indication. If we need to queue this\r
+                // packet and indicate it from another thread context,\r
+                // we will also have to allocate a new buffer and copy\r
+                // over the packet contents, OOB data and per-packet\r
+                // information. This is because the packet data\r
+                // is available only for the duration of this\r
+                // receive indication call.\r
+                //\r
+                NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet);\r
+                NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet);\r
+\r
+                //\r
+                // Get the original packet (it could be the same packet as the\r
+                // one received or a different one based on the number of layered\r
+                // miniports below) and set it on the indicated packet so the OOB\r
+                // data is visible correctly at protocols above.  If the IM driver \r
+                // modifies the packet in any way it should not set the new packet's\r
+                // original packet equal to the original packet of the packet that \r
+                // was indicated to it from the underlying driver, in this case, the \r
+                // IM driver should also ensure that the related per packet info should\r
+                // be copied to the new packet.\r
+                // we can set the original packet to the original packet of the packet\r
+                // indicated from the underlying driver because the driver doesn't modify\r
+                // the data content in the packet.\r
+                //\r
+                NDIS_SET_ORIGINAL_PACKET(MyPacket, NDIS_GET_ORIGINAL_PACKET(Packet));\r
+                NDIS_SET_PACKET_HEADER_SIZE(MyPacket, HeaderBufferSize);\r
+\r
+                //\r
+                // Copy packet flags.\r
+                //\r
+                NdisGetPacketFlags(MyPacket) = NdisGetPacketFlags(Packet);\r
+\r
+                //\r
+                // Force protocols above to make a copy if they want to hang\r
+                // on to data in this packet. This is because we are in our\r
+                // Receive handler (not ReceivePacket) and we can't return a\r
+                // ref count from here.\r
+                //\r
+                NDIS_SET_PACKET_STATUS(MyPacket, NDIS_STATUS_RESOURCES);\r
+\r
+                //\r
+                // By setting NDIS_STATUS_RESOURCES, we also know that we can reclaim\r
+                // this packet as soon as the call to NdisMIndicateReceivePacket\r
+                // returns.\r
+                //\r
+\r
+                if (pAdapt->MiniportHandle != NULL)\r
+                {\r
+                    NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &MyPacket, 1);\r
+                }\r
+\r
+                //\r
+                // Reclaim the indicated packet. Since we had set its status\r
+                // to NDIS_STATUS_RESOURCES, we are guaranteed that protocols\r
+                // above are done with it.\r
+                //\r
+                NdisDprFreePacket(MyPacket);\r
+\r
+                break;\r
+            }\r
+        }\r
+        else\r
+        {\r
+            //\r
+            // The miniport below us uses the old-style (not packet)\r
+            // receive indication. Fall through.\r
+            //\r
+        }\r
+\r
+        //\r
+        // Fall through if the miniport below us has either not\r
+        // indicated a packet or we could not allocate one\r
+        //\r
+        pAdapt->ReceivedIndicationFlags[Proc] = TRUE;\r
+        if (pAdapt->MiniportHandle == NULL)\r
+        {\r
+            break;\r
+        }\r
+        switch (pAdapt->Medium)\r
+        {\r
+            case NdisMedium802_3:\r
+            case NdisMediumWan:\r
+                NdisMEthIndicateReceive(pAdapt->MiniportHandle,\r
+                                             MacReceiveContext,\r
+                                             HeaderBuffer,\r
+                                             HeaderBufferSize,\r
+                                             LookAheadBuffer,\r
+                                             LookAheadBufferSize,\r
+                                             PacketSize);\r
+                break;\r
+\r
+            case NdisMedium802_5:\r
+                NdisMTrIndicateReceive(pAdapt->MiniportHandle,\r
+                                            MacReceiveContext,\r
+                                            HeaderBuffer,\r
+                                            HeaderBufferSize,\r
+                                            LookAheadBuffer,\r
+                                            LookAheadBufferSize,\r
+                                            PacketSize);\r
+                break;\r
+\r
+#if FDDI\r
+                 case NdisMediumFddi:\r
+                        NdisMFddiIndicateReceive(pAdapt->MiniportHandle,\r
+                                                                                         MacReceiveContext,\r
+                                                                                         HeaderBuffer,\r
+                                                                                         HeaderBufferSize,\r
+                                                                                         LookAheadBuffer,\r
+                                                                                         LookAheadBufferSize,\r
+                                                                                         PacketSize);\r
+                        break;\r
+#endif\r
+                 default:\r
+                        ASSERT(FALSE);\r
+                        break;\r
+               }\r
+\r
+    } while(FALSE);\r
+\r
+    return Status;\r
+}\r
+\r
+\r
+VOID\r
+PtReceiveComplete(\r
+    IN NDIS_HANDLE        ProtocolBindingContext\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    Called by the adapter below us when it is done indicating a batch of\r
+    received packets.\r
+\r
+Arguments:\r
+\r
+    ProtocolBindingContext    Pointer to our adapter structure.\r
+\r
+Return Value:\r
+\r
+    None\r
+\r
+--*/\r
+{\r
+    PADAPT        pAdapt =(PADAPT)ProtocolBindingContext;\r
+    ULONG         Proc = KeGetCurrentProcessorNumber();      \r
+    // Pass the completion up only if the miniport edge is up (D0) and PtReceive flagged an indication on this processor.\r
+    if (((pAdapt->MiniportHandle != NULL)\r
+                && (pAdapt->MPDeviceState == NdisDeviceStateD0))\r
+                && (pAdapt->ReceivedIndicationFlags[Proc]))\r
+    {\r
+        switch (pAdapt->Medium)\r
+        {\r
+            case NdisMedium802_3:\r
+            case NdisMediumWan:\r
+                NdisMEthIndicateReceiveComplete(pAdapt->MiniportHandle);\r
+                break;\r
+\r
+                 case NdisMedium802_5:\r
+                       NdisMTrIndicateReceiveComplete(pAdapt->MiniportHandle);\r
+                       break;\r
+#if FDDI\r
+                 case NdisMediumFddi:\r
+                       NdisMFddiIndicateReceiveComplete(pAdapt->MiniportHandle);\r
+                       break;\r
+#endif\r
+                 default:\r
+                       ASSERT(FALSE);\r
+                       break;\r
+               }\r
+       }\r
+    // Clear this processor's indication flag unconditionally.\r
+    pAdapt->ReceivedIndicationFlags[Proc] = FALSE;\r
+}\r
+\r
+\r
+INT\r
+PtReceivePacket(\r
+    IN NDIS_HANDLE            ProtocolBindingContext,\r
+    IN PNDIS_PACKET           Packet\r
+    )\r
+/*++\r
+\r
+Routine Description:\r
+\r
+    ReceivePacket handler. Called by NDIS if the miniport below supports\r
+    NDIS 4.0 style receives. Re-package the buffer chain in a new packet\r
+    and indicate the new packet to protocols above us. Any context for\r
+    packets indicated up must be kept in the MiniportReserved field.\r
+\r
+    NDIS 5.1 - packet stacking - if there is sufficient "stack space" in\r
+    the packet passed to us, we can use the same packet in a receive\r
+    indication.\r
+\r
+Arguments:\r
+\r
+    ProtocolBindingContext - Pointer to our adapter structure.\r
+    Packet - Pointer to the packet\r
+\r
+Return Value:\r
+\r
+    == 0 -> We are done with the packet\r
+    != 0 -> We will keep the packet and call NdisReturnPackets() this\r
+            many times when done.\r
+--*/\r
+{\r
+    PADAPT              pAdapt =(PADAPT)ProtocolBindingContext;\r
+    NDIS_STATUS         Status;\r
+    PNDIS_PACKET        MyPacket;\r
+    BOOLEAN             Remaining;\r
+    // NOTE: Remaining is referenced only when NDIS51 is defined.\r
+    //\r
+    // Drop the packet silently if the upper miniport edge isn't initialized or\r
+    // the miniport edge is in low power state\r
+    //\r
+    if ((!pAdapt->MiniportHandle) || (pAdapt->MPDeviceState > NdisDeviceStateD0))\r
+    {\r
+          return 0;\r
+    }\r
+\r
+#ifdef NDIS51\r
+    //\r
+    // Check if we can reuse the same packet for indicating up.\r
+    // See also: PtReceive(). \r
+    //\r
+    (VOID)NdisIMGetCurrentPacketStack(Packet, &Remaining);\r
+    if (Remaining)\r
+    {\r
+        //\r
+        // We can reuse "Packet". Indicate it up and be done with it.\r
+        // The status is read before indicating; it decides the return value.\r
+        Status = NDIS_GET_PACKET_STATUS(Packet);\r
+        NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &Packet, 1);\r
+        return((Status != NDIS_STATUS_RESOURCES) ? 1 : 0);\r
+    }\r
+#endif // NDIS51\r
+\r
+    //\r
+    // Get a packet off the pool and indicate that up\r
+    //\r
+    NdisDprAllocatePacket(&Status,\r
+                           &MyPacket,\r
+                           pAdapt->RecvPacketPoolHandle);\r
+\r
+    if (Status == NDIS_STATUS_SUCCESS)\r
+    {\r
+        PRECV_RSVD            RecvRsvd;\r
+        // Remember the packet we were given in our MiniportReserved context.\r
+        RecvRsvd = (PRECV_RSVD)(MyPacket->MiniportReserved);\r
+        RecvRsvd->OriginalPkt = Packet;\r
+\r
+        NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet);\r
+        NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet);\r
+\r
+        //\r
+        // Get the original packet (it could be the same packet as the one\r
+        // received or a different one based on the number of layered miniports\r
+        // below) and set it on the indicated packet so the OOB data is visible\r
+        // correctly to protocols above us.\r
+        //\r
+        NDIS_SET_ORIGINAL_PACKET(MyPacket, NDIS_GET_ORIGINAL_PACKET(Packet));\r
+\r
+        //\r
+        // Set Packet Flags\r
+        //\r
+        NdisGetPacketFlags(MyPacket) = NdisGetPacketFlags(Packet);\r
+\r
+        Status = NDIS_GET_PACKET_STATUS(Packet);\r
+\r
+        NDIS_SET_PACKET_STATUS(MyPacket, Status);\r
+        NDIS_SET_PACKET_HEADER_SIZE(MyPacket, NDIS_GET_PACKET_HEADER_SIZE(Packet));\r
+\r
+        if (pAdapt->MiniportHandle != NULL)\r
+        {\r
+            NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &MyPacket, 1);\r
+        }\r
+\r
+        //\r
+        // Check if we had indicated up the packet with NDIS_STATUS_RESOURCES\r
+        // NOTE -- do not use NDIS_GET_PACKET_STATUS(MyPacket) for this since\r
+        // it might have changed! Use the value saved in the local variable.\r
+        //\r
+        if (Status == NDIS_STATUS_RESOURCES)\r
+        {\r
+            //\r
+            // Our ReturnPackets handler will not be called for this packet.\r
+            // We should reclaim it right here.\r
+            //\r
+            NdisDprFreePacket(MyPacket);\r
+        }\r
+\r
+        return((Status != NDIS_STATUS_RESOURCES) ? 1 : 0);\r
+    }\r
+    else\r
+    {\r
+        //\r
+        // We are out of packets. Silently drop it.\r
+        //\r
+        return(0);\r
+    }\r
+}\r
+\r
+\r
+NDIS_STATUS\r
+PtPNPHandler(\r
+    IN NDIS_HANDLE        ProtocolBindingContext,\r
+    IN PNET_PNP_EVENT     pNetPnPEvent\r
+    )\r
+\r
+/*++\r
+Routine Description:\r
+\r
+    This is called by NDIS to notify us of a PNP event related to a lower\r
+    binding. Based on the event, this dispatches to other helper routines.\r
+\r
+    NDIS 5.1: forward this event to the upper protocol(s) by calling\r
+    NdisIMNotifyPnPEvent.\r
+\r
+Arguments:\r
+\r
+    ProtocolBindingContext - Pointer to our adapter structure. Can be NULL\r
+                for "global" notifications\r
+\r
+    pNetPnPEvent - Pointer to the PNP event to be processed.\r
+\r
+Return Value:\r
+\r
+    NDIS_STATUS code indicating status of event processing.\r
+\r
+--*/\r
+{\r
+    PADAPT            pAdapt  =(PADAPT)ProtocolBindingContext;\r
+    NDIS_STATUS       Status  = NDIS_STATUS_SUCCESS;\r
+\r
+    DBGPRINT(("PtPnPHandler: Adapt %p, Event %d\n", pAdapt, pNetPnPEvent->NetEvent));\r
+    // NOTE(review): PtPnPNetEventSetPower dereferences pAdapt; confirm it is never NULL for SetPower.\r
+    switch (pNetPnPEvent->NetEvent)\r
+    {\r
+        case NetEventSetPower:\r
+            Status = PtPnPNetEventSetPower(pAdapt, pNetPnPEvent);\r
+            break;\r
+\r
+         case NetEventReconfigure:\r
+            Status = PtPnPNetEventReconfigure(pAdapt, pNetPnPEvent);\r
+            break;\r
+\r
+         default:\r
+#ifdef NDIS51\r
+            //\r
+            // Pass on this notification to protocol(s) above, before\r
+            // doing anything else with it. If there is no adapter or no\r
+            // miniport handle, Status keeps its initial SUCCESS value.\r
+            if (pAdapt && pAdapt->MiniportHandle)\r
+            {\r
+                Status = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent);\r
+            }\r
+#else\r
+            Status = NDIS_STATUS_SUCCESS;\r
+\r
+#endif // NDIS51\r
+\r
+            break;\r
+    }\r
+\r
+    return Status;\r
+}\r
+\r
+\r
+NDIS_STATUS\r
+PtPnPNetEventReconfigure(\r
+    IN PADAPT            pAdapt,\r
+    IN PNET_PNP_EVENT    pNetPnPEvent\r
+    )\r
+/*++\r
+Routine Description:\r
+\r
+    Called from PtPNPHandler to notify our protocol edge of a\r
+    reconfiguration of parameters for either a specific binding (pAdapt\r
+    is not NULL), or global parameters if any (pAdapt is NULL).\r
+\r
+Arguments:\r
+\r
+    pAdapt - Pointer to our adapter structure.\r
+    pNetPnPEvent - the reconfigure event\r
+\r
+Return Value:\r
+\r
+    NDIS_STATUS_SUCCESS, or (NDIS 5.1) the status from NdisIMNotifyPnPEvent.\r
+\r
+--*/\r
+{\r
+    NDIS_STATUS    ReconfigStatus = NDIS_STATUS_SUCCESS;\r
+    NDIS_STATUS    ReturnStatus = NDIS_STATUS_SUCCESS;\r
+\r
+    do\r
+    {\r
+        //\r
+        // Is this a global reconfiguration notification ?\r
+        //\r
+        if (pAdapt == NULL)\r
+        {\r
+            //\r
+            // An important event that causes this notification to us is if\r
+            // one of our upper-edge miniport instances was enabled after being\r
+            // disabled earlier, e.g. from Device Manager in Win2000. Note that\r
+            // NDIS calls this because we had set up an association between our\r
+            // miniport and protocol entities by calling NdisIMAssociateMiniport.\r
+            //\r
+            // Since we would have torn down the lower binding for that miniport,\r
+            // we need NDIS' assistance to re-bind to the lower miniport. The\r
+            // call to NdisReEnumerateProtocolBindings does exactly that.\r
+            //\r
+            NdisReEnumerateProtocolBindings (ProtHandle);        \r
+            \r
+            break;\r
+        }\r
+\r
+#ifdef NDIS51\r
+        //\r
+        // Pass on this notification to protocol(s) above before doing anything\r
+        // with it.\r
+        //\r
+        if (pAdapt->MiniportHandle)\r
+        {\r
+            ReturnStatus = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent);\r
+        }\r
+#endif // NDIS51\r
+\r
+        ReconfigStatus = NDIS_STATUS_SUCCESS;\r
+\r
+    } while(FALSE);\r
+\r
+    DBGPRINT(("<==PtPNPNetEventReconfigure: pAdapt %p\n", pAdapt));\r
+\r
+#ifdef NDIS51\r
+    //\r
+    // Overwrite status with what upper-layer protocol(s) returned.\r
+    //\r
+    ReconfigStatus = ReturnStatus;\r
+#endif\r
+\r
+    return ReconfigStatus;\r
+}\r
+\r
+\r
+NDIS_STATUS\r
+PtPnPNetEventSetPower(\r
+    IN PADAPT            pAdapt,\r
+    IN PNET_PNP_EVENT    pNetPnPEvent\r
+    )\r
+/*++\r
+Routine Description:\r
+\r
+    This is a notification to our protocol edge of the power state\r
+    of the lower miniport. If it is going to a low-power state, we must\r
+    wait here for all outstanding sends and requests to complete.\r
+\r
+    NDIS 5.1:  Since we use packet stacking, it is not sufficient to\r
+    check usage of our local send packet pool to detect whether or not\r
+    all outstanding sends have completed. For this, use the new API\r
+    NdisQueryPendingIOCount.\r
+\r
+    NDIS 5.1: Use the 5.1 API NdisIMNotifyPnPEvent to pass on PnP\r
+    notifications to upper protocol(s).\r
+\r
+Arguments:\r
+\r
+    pAdapt            -    Pointer to the adapter structure\r
+    pNetPnPEvent    -    The Net PnP event. This contains the new device state\r
+\r
+Return Value:\r
+\r
+    NDIS_STATUS_SUCCESS or the status returned by upper-layer protocols.\r
+\r
+--*/\r
+{\r
+    PNDIS_DEVICE_POWER_STATE       pDeviceState  =(PNDIS_DEVICE_POWER_STATE)(pNetPnPEvent->Buffer);\r
+    NDIS_DEVICE_POWER_STATE        PrevDeviceState = pAdapt->PTDeviceState;  \r
+    NDIS_STATUS                    Status;\r
+    NDIS_STATUS                    ReturnStatus;\r
+\r
+    ReturnStatus = NDIS_STATUS_SUCCESS;\r
+\r
+    //\r
+    // Set the Internal Device State, this blocks all new sends or receives\r
+    //\r
+    NdisAcquireSpinLock(&pAdapt->Lock);\r
+    pAdapt->PTDeviceState = *pDeviceState;\r
+\r
+    //\r
+    // Check if the miniport below is going to a low power state.\r
+    //\r
+    if (pAdapt->PTDeviceState > NdisDeviceStateD0)\r
+    {\r
+        //\r
+        // If the miniport below is going to standby, fail all incoming requests\r
+        //\r
+        if (PrevDeviceState == NdisDeviceStateD0)\r
+        {\r
+            pAdapt->StandingBy = TRUE;\r
+        }\r
+\r
+        NdisReleaseSpinLock(&pAdapt->Lock);\r
+\r
+#ifdef NDIS51\r
+        //\r
+        // Notify upper layer protocol(s) first.\r
+        //\r
+        if (pAdapt->MiniportHandle != NULL)\r
+        {\r
+            ReturnStatus = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent);\r
+        }\r
+#endif // NDIS51\r
+\r
+        //\r
+        // Wait for outstanding sends and requests to complete.\r
+        //\r
+        while (pAdapt->OutstandingSends != 0)\r
+        {\r
+            NdisMSleep(2);\r
+        }\r
+\r
+        while (pAdapt->OutstandingRequests == TRUE)\r
+        {\r
+            //\r
+            // sleep till outstanding requests complete\r
+            //\r
+            NdisMSleep(2);\r
+        }\r
+\r
+        //\r
+        // If the below miniport is going to low power state, complete the queued request\r
+        //\r
+        NdisAcquireSpinLock(&pAdapt->Lock);\r
+        if (pAdapt->QueuedRequest)\r
+        {\r
+            pAdapt->QueuedRequest = FALSE;\r
+            NdisReleaseSpinLock(&pAdapt->Lock);\r
+            PtRequestComplete(pAdapt, &pAdapt->Request, NDIS_STATUS_FAILURE);\r
+        }\r
+        else\r
+        {\r
+            NdisReleaseSpinLock(&pAdapt->Lock);\r
+        }\r
+            \r
+        // NOTE(review): the header mentions NdisQueryPendingIOCount; the waits above poll pAdapt fields instead.\r
+        ASSERT(NdisPacketPoolUsage(pAdapt->SendPacketPoolHandle) == 0);\r
+        ASSERT(pAdapt->OutstandingRequests == FALSE);\r
+    }\r
+    else\r
+    {\r
+        //\r
+        // If the physical miniport is powering up (from Low power state to D0), \r
+        // clear the flag\r
+        //\r
+        if (PrevDeviceState > NdisDeviceStateD0)\r
+        {\r
+            pAdapt->StandingBy = FALSE;\r
+        }\r
+        //\r
+        // The device below is being turned on. If we had a request\r
+        // pending, send it down now.\r
+        //\r
+        if (pAdapt->QueuedRequest == TRUE)\r
+        {\r
+            pAdapt->QueuedRequest = FALSE;\r
+        \r
+            pAdapt->OutstandingRequests = TRUE;\r
+            NdisReleaseSpinLock(&pAdapt->Lock);\r
+\r
+            NdisRequest(&Status,\r
+                        pAdapt->BindingHandle,\r
+                        &pAdapt->Request);\r
+\r
+            if (Status != NDIS_STATUS_PENDING)\r
+            {\r
+                PtRequestComplete(pAdapt,\r
+                                  &pAdapt->Request,\r
+                                  Status);\r
+                \r
+            }\r
+        }\r
+        else\r
+        {\r
+            NdisReleaseSpinLock(&pAdapt->Lock);\r
+        }\r
+\r
+\r
+#ifdef NDIS51\r
+        //\r
+        // Pass on this notification to protocol(s) above\r
+        //\r
+        if (pAdapt->MiniportHandle)\r
+        {\r
+            ReturnStatus = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent);\r
+        }\r
+#endif // NDIS51\r
+\r
+    }\r
+\r
+    return ReturnStatus;\r
+}\r
+\r
+VOID\r
+PtReferenceAdapt(\r
+    IN PADAPT     pAdapt\r
+    )\r
+{\r
+    NdisAcquireSpinLock(&pAdapt->Lock);\r
+    // Add one reference to the adapter; RefCount is updated under pAdapt->Lock.\r
+    ASSERT(pAdapt->RefCount >= 0);\r
+\r
+    pAdapt->RefCount ++;\r
+    NdisReleaseSpinLock(&pAdapt->Lock);\r
+}\r
+\r
+\r
+BOOLEAN\r
+PtDereferenceAdapt(\r
+    IN PADAPT     pAdapt\r
+    )\r
+{\r
+    NdisAcquireSpinLock(&pAdapt->Lock);\r
+\r
+    ASSERT(pAdapt->RefCount > 0);\r
+\r
+    pAdapt->RefCount--;\r
+\r
+    if (pAdapt->RefCount == 0)\r
+    {\r
+        NdisReleaseSpinLock(&pAdapt->Lock);\r
+        \r
+        //\r
+        // Free all resources on this adapter structure.\r
+        //\r
+        MPFreeAllPacketPools (pAdapt);;\r
+        NdisFreeSpinLock(&pAdapt->Lock);\r
+        NdisFreeMemory(pAdapt, 0 , 0);\r
+        \r
+        return TRUE;\r
+        \r
+    }\r
+    else\r
+    {\r
+        NdisReleaseSpinLock(&pAdapt->Lock);\r
+\r
+        return FALSE;\r
+    }\r
+}\r
+\r
+\r
diff --git a/original_passthru/sources b/original_passthru/sources
new file mode 100644 (file)
index 0000000..d52d78f
--- /dev/null
@@ -0,0 +1,39 @@
+TARGETNAME=passthru\r
+TARGETTYPE=DRIVER\r
+\r
+C_DEFINES=$(C_DEFINES) -DNDIS_MINIPORT_DRIVER -DNDIS_WDM=1\r
+\r
+MSC_WARNING_LEVEL=/WX /W4\r
+\r
+!if "$(DDK_TARGET_OS)"=="Win2K"\r
+#\r
+# The driver is built in the Win2K build environment\r
+#\r
+C_DEFINES=$(C_DEFINES) -DNDIS40_MINIPORT=1\r
+C_DEFINES=$(C_DEFINES) -DNDIS40=1\r
+!else \r
+#\r
+# The driver is built in the XP or .NET build environment\r
+# So let us build NDIS 5.1 version.\r
+#\r
+C_DEFINES=$(C_DEFINES) -DNDIS51_MINIPORT=1\r
+C_DEFINES=$(C_DEFINES) -DNDIS51=1\r
+!endif\r
+\r
+# Uncomment the following to build for Win98/SE/WinMe\r
+# This causes several APIs that are not present in Win9X to be\r
+# ifdef'ed out.\r
+# C_DEFINES=$(C_DEFINES) -DWIN9X=1\r
+\r
+PRECOMPILED_INCLUDE=precomp.h\r
+\r
+TARGETLIBS=$(DDK_LIB_PATH)\ndis.lib\r
+\r
+INCLUDES=\r
+\r
+SOURCES=\\r
+    miniport.c \\r
+    passthru.c \\r
+    passthru.rc \\r
+    protocol.c\r
+\r
diff --git a/planetlab/Makefile.planetlab b/planetlab/Makefile.planetlab
new file mode 100644 (file)
index 0000000..f341262
--- /dev/null
@@ -0,0 +1,181 @@
+# $Id: Makefile 11687 2012-08-12 20:51:25Z luigi $
+#
+# Top level makefile for building ipfw/dummynet (kernel and userspace).
+# You can run it manually or also under the Planetlab build.
+# Planetlab wants also the 'install' target.
+#
+# To build on system with non standard Kernel sources or userland files,
+# you should run this with
+#
+#      make KERNELPATH=/path/to/linux-2.x.y.z USRDIR=/path/to/usr
+#
+# We assume that $(USRDIR) contains include/ and lib/ used to build userland.
+#
+
+include Makefile.inc
+
+DATE ?= $(shell date +%Y%m%d)
+SNAPSHOT_NAME=$(DATE)-ipfw3.tgz
+BINDIST=$(DATE)-dummynet-linux.tgz
+WINDIST=$(DATE)-dummynet-windows.zip
+
+.PHONY: ipfw kipfw
+
+###########################################
+#  windows x86 and x64 specific variables #
+###########################################
+#  DRIVE must be the hard drive letter where DDK is installed
+#  DDKDIR must be the path to the DDK root directory, without drive letter
+#  TARGETOS (x64 only) must be one of the following:
+#  wnet   -> windows server 2003
+#  wlh    -> windows vista and windows server 2008
+#  win7   -> windows 7
+#  future version must be added here
+export WIN64
+export DDK
+export DRIVE
+export DDKDIR
+DRIVE ?= C:
+DDKDIR ?= /WinDDK/7600.16385.1
+DDK = $(DRIVE)$(DDKDIR)
+
+TARGETOS=win7
+
+_all: all
+
+clean distclean:
+       -@(cd ipfw && $(MAKE) $(@) )
+       -@rm -rf kipfw-mod binary64/[A-hj-z]*
+
+all: kipfw ipfw
+       @# -- windows only
+ifeq ($(OSARCH),Windows)       # copy files
+ifeq ($(WIN64),)
+       -@ cp ipfw/ipfw.exe kipfw-mod/$(OBJDIR)/ipfw.sys binary/
+       -@ cp kipfw/*.inf binary/
+else
+       -@ cp binary/* binary64/
+       -@ cp ipfw/ipfw.exe kipfw-mod/objchk_win7_amd64/amd64/ipfw.sys binary64/
+endif  # WIN64
+endif  # Windows
+
+win64:
+       $(MAKE) WIN64=1
+
+# kipfw-src prepares the sources for the kernel part.
+# The windows files (passthru etc.) are modified version of the
+# examples found in the $(DDK)/src/network/ndis/passthru/driver/
+# They can be re-created using the 'ndis-glue' target
+# We need a sed trick to remove carriage returns from the patchfile.
+
+ndis-glue:
+       -@mkdir -p kipfw-mod
+       cp $(DDK)/src/network/ndis/passthru/driver/*.[ch] kipfw-mod
+       cat kipfw/win-passthru.diff | sed "s/$$(printf '\r')//g" | (cd kipfw-mod; patch )
+
+kipfw-src:
+       -@rm -rf kipfw-mod
+       -@mkdir -p kipfw-mod
+       -@cp -Rp kipfw/* kipfw-mod
+       -@cp `find sys -name \*.c` kipfw-mod
+       -@(cd kipfw-mod && $(MAKE) include_e)
+ifeq ($(OSARCH),Windows)
+       $(MAKE) ndis-glue
+endif
+
+snapshot:
+       $(MAKE) distclean
+       (cd ..; tar cvzhf /tmp/$(SNAPSHOT_NAME) --exclude .svn \
+               --exclude README.openwrt --exclude tags --exclude NOTES \
+               --exclude tcc-0.9.25-bsd \
+               --exclude original_passthru \
+               --exclude ipfw3.diff --exclude add_rules \
+               --exclude test --exclude test_ \
+               ipfw3-2012 )
+
+bindist:
+       $(MAKE) clean
+       $(MAKE) all
+       tar cvzf /tmp/$(BINDIST) ipfw/ipfw ipfw/ipfw.8 kipfw-mod/ipfw_mod.ko
+
+windist:
+       $(MAKE) clean
+       -$(MAKE) all
+       -rm /tmp/$(WINDIST)
+       zip -r /tmp/$(WINDIST) binary -x \*.svn\*
+
+
+ipfw:
+       @(cd ipfw && $(MAKE) $(@) )
+
+kipfw: kipfw-src
+ifeq ($(WIN64),)       # linux or windows 32 bit
+       @(cd kipfw-mod && $(MAKE) $(@) )
+else   #--- windows 64 bit, we use build.exe and nmake
+       rm -f kipfw-mod/Makefile
+       mkdir kipfw-mod/tmpbuild                # check mysetenv.sh
+       bash kipfw/mysetenv.sh $(DRIVE) $(DDKDIR) $(TARGETOS)
+endif
+
+IPFW3_REPO ?= svn+ssh://some.host/some/path/ipfw3-2012
+
+planetlab_update:
+       # clean and create a local working directory
+       rm -rf /tmp/pl-tmp
+       mkdir -p /tmp/pl-tmp/pl
+       mkdir -p /tmp/pl-tmp/ol2
+       # get the trunk version of the PlanetLab repository
+       # to specify the sshkey use the .ssh/config file
+       (cd /tmp/pl-tmp/pl; \
+               svn co svn+ssh://svn.planet-lab.org/svn/ipfw/trunk)
+       # get an updated copy of the main ipfw repository
+       (cd /tmp/pl-tmp/ol2; svn export $(IPFW3_REPO) )
+       # copy the new version over the old one
+       (cd /tmp/pl-tmp; cp -rP ol2/ipfw3/* pl/trunk)
+       # files cleanup in the old version
+       (cd /tmp/pl-tmp; diff -r ol2/ipfw3 pl/trunk | \
+               grep -v "svn" | awk '{print $$3 $$4}' | \
+               sed 's/:/\//' | xargs rm -rf)
+       # local adjustments here
+       rm -rf /tmp/pl-tmp/pl/trunk/planetlab/check_planetlab_sync
+       # commit to the remote repo
+       @echo "Please, revise the update with the commands:"
+       @echo "(cd /tmp/pl-tmp/pl/trunk; svn diff)"
+       @echo "(cd /tmp/pl-tmp/pl/trunk; svn status)"
+       @echo "and commit with:"
+       @echo "(cd /tmp/pl-tmp/pl/trunk; svn ci -m 'Update from the mail ipfw repo.')"
+
+openwrt_release:
+       # create a temporary directory
+       $(eval TMPDIR := $(shell mktemp -d -p /tmp/ ipfw3_openwrt_XXXXX))
+       # create the source destination directory
+       $(eval IPFWDIR := ipfw3-$(DATE))
+       $(eval DSTDIR := $(TMPDIR)/$(IPFWDIR))
+       mkdir $(DSTDIR)
+       # copy the package, clean objects and svn info
+       cp -r ./ipfw ./kipfw-mod glue.h Makefile ./configuration README $(DSTDIR)
+       (cd $(DSTDIR) || exit 1; make -s distclean; find . -name .svn | xargs rm -rf)
+       (cd $(TMPDIR) && tar czf $(IPFWDIR).tar.gz $(IPFWDIR))
+
+       # create the port files in $(TMPDIR)/ipfw3
+       $(eval PORTDIR := $(TMPDIR)/ipfw3)
+       mkdir -p $(PORTDIR)/patches
+       # generate the Makefile, PKG_VERSION and PKG_MD5SUM
+       md5sum $(DSTDIR).tar.gz | cut -d ' ' -f 1 > $(TMPDIR)/md5sum
+       cat ./OPENWRT/Makefile | \
+               sed s/PKG_VERSION:=/PKG_VERSION:=$(DATE)/ | \
+               sed s/PKG_MD5SUM:=/PKG_MD5SUM:=`cat $(TMPDIR)/md5sum`/ \
+               > $(PORTDIR)/Makefile
+
+       @echo ""
+       @echo "The openwrt port is in $(PORTDIR)"
+       @echo "The source file should be copied to the public server:"
+       @echo "scp $(DSTDIR).tar.gz marta@info.iet.unipi.it:~marta/public_html/dummynet"
+       @echo "after this the temporary directory $(TMPDIR) can be removed."
+
+install:
+
+diff:
+       -@(diff -upr $(BSD_HEAD)/sbin/ipfw ipfw)
+       -@(diff -upr $(BSD_HEAD)/sys sys)
+
diff --git a/planetlab/check_planetlab_sync b/planetlab/check_planetlab_sync
new file mode 100755 (executable)
index 0000000..f59853f
--- /dev/null
@@ -0,0 +1,22 @@
+#!/bin/sh
+
+#
+# This script is used to check the sync of the local repo
+# with the remote planetlab repository
+
+tmpfile=/tmp/chech_planetlab_sync.tmp
+
+# check for local copy sync
+svn diff > /tmp/chech_planetlab_sync.tmp
+if [ -s $tmpfile ] ; then
+       echo "Local repo unsynced, can not continue"
+       exit -1
+       rm $tmpfile
+fi
+
+# export remote copy
+svn --force export http://svn.planet-lab.org/svn/ipfw/trunk ./ >> /dev/null
+
+# check diffs again, output to the user
+svn diff 
+svn status | grep -v check_planetlab_sync
diff --git a/planetlab/ipfw b/planetlab/ipfw
new file mode 100755 (executable)
index 0000000..114cafb
--- /dev/null
@@ -0,0 +1,84 @@
+#!/bin/sh
+#
+# ipfw init the emulation service
+#
+# chkconfig: 2345 09 91
+# description: ipfw init and shutdown
+#
+
+# Source function library.
+. /etc/init.d/functions
+
+IPFW=ipfw
+IPFW_BACKEND=/vsys/ipfw-be
+IPFW_MOD=ipfw_mod
+
+if [ ! -x /sbin/$IPFW ] || [ ! -x ${IPFW_BACKEND} ]; then
+    echo -n "/sbin/$IPFW does not exist."; warning; echo
+    exit 0
+fi
+
+# Load the ipfw module, and initialize netconfig
+start() {
+       # load the module
+       modprobe $IPFW_MOD >& /dev/null
+       let ret=$?;
+        [ $ret -eq 0 ] && success || failure
+
+       # init netconfig
+       echo "super dbcleanup" | ${IPFW_BACKEND} root >& /dev/null
+       echo "super init" | ${IPFW_BACKEND} root >& /dev/null
+
+       return $ret
+}
+
+stop() {
+       # clean netconfig stuff
+       echo "super dbcleanup" | ${IPFW_BACKEND} root >& /dev/null
+       echo "Unloading $IPFW_MOD module: "
+
+       # unload the ipfw module
+       rmmod ${IPFW_MOD}
+       let ret=$?;
+       [ $ret -eq 0 ] && success || failure
+
+       return $ret
+}
+
+# echo the ipfw status
+status() {
+       # check for module presence
+       grep '^ipfw_mod$' /proc/modules >& /dev/null || echo "ipfw not loaded" && return 0
+
+       # Show active users
+       USERS=$(grep BLOCK /tmp/ff | wc -l)
+       echo "ipfw is loaded and there are currently ${USERS} with active emulation."
+       return 0
+}
+
+# main
+case "$1" in
+    start)
+       start
+       RETVAL=$?
+       ;;
+    stop)
+       stop
+       RETVAL=$?
+       ;;
+    restart)
+       stop
+       start
+       RETVAL=$?
+       ;;
+    status)
+       status
+       RETVAL=$?
+       ;;
+    *)
+       echo $"Usage: $0 {start|stop|restart|status}"
+       exit 1
+       ;;
+esac
+
+exit $RETVAL
diff --git a/planetlab/ipfw.8.gz b/planetlab/ipfw.8.gz
new file mode 100644 (file)
index 0000000..c2db923
Binary files /dev/null and b/planetlab/ipfw.8.gz differ
diff --git a/planetlab/ipfw.cron b/planetlab/ipfw.cron
new file mode 100644 (file)
index 0000000..1b09340
--- /dev/null
@@ -0,0 +1,3 @@
+# Runs every 5 minutes and cleans up expired ipfw rules
+# $Id: ipfw.cron 6069 2010-04-15 09:35:33Z marta $
+*/5 * * * * root     echo "super killexpired" | /vsys/ipfw-be root > /dev/null 2>&1
diff --git a/planetlab/ipfwroot.spec b/planetlab/ipfwroot.spec
new file mode 100644 (file)
index 0000000..1170ed7
--- /dev/null
@@ -0,0 +1,135 @@
+#
+# $Id: ipfwroot.spec 16174 2009-12-15 13:38:15Z marta $
+#
+# TODO:
+# restart crond
+#
+%define url $URL: svn+ssh://onelab2/home/svn/ports-luigi/ipfw3-2012/planetlab/ipfwroot.spec $
+
+# Marta Carbone <marta.carbone@iet.unipi.it>
+# 2009 - Universita` di Pisa
+# License is BSD.
+
+# kernel_release, kernel_version and kernel_arch are expected to be set by the build to e.g.
+# kernel_release : vs2.3.0.29.1.planetlab
+# kernel_version : 2.6.22.14
+
+%define name ipfwroot
+%define version 0.9
+%define taglevel 11
+
+%define release %{kernel_version}.%{taglevel}%{?pldistro:.%{pldistro}}%{?date:.%{date}}
+%define kernel_id_arch %{kernel_version}-%{kernel_release}-%{kernel_arch}
+%define kernel_id %{kernel_version}-%{kernel_release}
+
+Summary: ipfw and dummynet for Linux
+Name: %{name}
+Version: %{version}
+Release: %{release}
+License: BSD
+Group: System Environment/Kernel
+Source0: %{name}-%{version}.tar.bz2
+BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-buildroot
+Requires: vixie-cron
+Requires: vsys-scripts
+
+Vendor: unipi
+Packager: PlanetLab <marta@onelab2.iet.unipi.it>
+# XXX ask 
+Distribution: PlanetLab %{plrelease}
+URL: %(echo %{url} | cut -d ' ' -f 2)
+
+%description
+ipfw is the Linux port of the FreeBSD ipfw and dummynet packages
+
+%prep
+%setup
+
+%build
+# clean the rpm build directory
+rm -rf $RPM_BUILD_ROOT
+
+# with the new build, we use the kernel-devel rpm for building
+%define kernelpath /usr/src/kernels/%{kernel_id_arch}
+
+%__make KERNELPATH=%kernelpath clean
+%__make KERNELPATH=%kernelpath IPFW_PLANETLAB=1
+
+%install
+install -D -m 755 dummynet2/ipfw_mod.ko $RPM_BUILD_ROOT/lib/modules/%{kernel_id}/net/netfilter/ipfw_mod.ko
+install -D -m 755 ipfw/ipfw $RPM_BUILD_ROOT/sbin/ipfw
+install -D -m 644 planetlab/ipfw.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/ipfw.cron
+install -D -m 755 planetlab/ipfw $RPM_BUILD_ROOT/etc/rc.d/init.d/ipfw
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%post
+### this script is also triggered while the node image is being created at build-time
+# some parts of the script do not make sense in this context
+# this is why the build exports PL_BOOTCD=1 in such cases
+depmod -a
+/sbin/chkconfig --add ipfw
+# start the service if not building; an if-block is used so that the
+# scriptlet does not exit non-zero merely because PL_BOOTCD is set
+if [ -z "$PL_BOOTCD" ]; then
+    service ipfw start
+fi
+
+%preun
+# The first argument is the number of package instances left after this
+# operation: 0 means a real removal, anything else is an upgrade, during
+# which the service must be left running.
+# This is done before the files are removed, while the init script is
+# still installed. Failures are ignored so that they cannot block the
+# removal of the package.
+if [ "$1" -eq 0 ]; then
+    # stop the service if not building
+    if [ -z "$PL_BOOTCD" ]; then
+        service ipfw stop || :
+    fi
+    /sbin/chkconfig --del ipfw || :
+fi
+
+# here there is a list of the final installation directories
+%files
+%defattr(-,root,root)
+%dir /lib/modules/%{kernel_id}
+/lib/modules/%{kernel_id}/net/netfilter/ipfw_mod.ko
+/sbin/ipfw
+%{_sysconfdir}/cron.d/ipfw.cron
+/etc/rc.d/init.d/ipfw
+
+%changelog
+* Mon Apr 12 2010 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-11
+- add ipfw initialization script to chkconfig
+
+* Wed Mar 03 2010 Talip Baris Metin <Talip-Baris.Metin@sophia.inria.fr> - ipfw-0.9-10
+- - Load module at installation - Marta
+
+* Mon Jan 11 2010 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-9
+- consistent with vsys-scripts-0.95-13
+
+* Mon Jan 11 2010 Marta Carbone <marta.carbone@iet.unipi.it>
+- Integrated the ipfw rules cleanup into the backend
+
+* Sat Jan 09 2010 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-8
+- builds on 2.6.22 & 2.6.27 - for 32 and 64 bits
+
+* Wed Jan 06 2010 Marta Carbone <marta.carbone@iet.unipi.it>
+- move to dummynet2, added support for table lookup
+- added the vsys-script dependencies and the ipfw initialization
+
+* Tue Dec 15 2009 Marta Carbone <marta.carbone@iet.unipi.it>
+- more work on the radix code, added sysctl read/write support
+
+* Sun Nov 29 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-7
+- added missing qsort.c - tag 0.9-6 was broken
+
+* Thu Nov 26 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-6
+- root: removed goto into the main ipfw switch, enabled slice_id matching
+- slice: completely move netconfig checks into the backend
+
+* Mon Nov 09 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-5
+- additional features on matching packets, including uid match
+
+* Mon Sep 07 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-4
+- on behalf of Marta Carbone, more options and features
+
+* Thu Jul 23 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-3
+- fixed memory usage issue
+
+* Wed Jul 15 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-2
+- patch for building on x86_64
+
+* Thu Jun 25 2009 Marta Carbone <marta.carbone@iet.unipi.it>
+- post installation removed for deployment, moved manpages to the slice package
+
+* Fri Apr 17 2009 Marta Carbone <marta.carbone@iet.unipi.it>
+- Initial release
diff --git a/planetlab/ipfwslice.spec b/planetlab/ipfwslice.spec
new file mode 100644 (file)
index 0000000..cd98b89
--- /dev/null
@@ -0,0 +1,94 @@
+#
+# $Id: ipfwslice.spec 16174 2009-12-15 13:38:15Z marta $
+#
+# TODO:
+# restart crond
+# modprobe ipfw_mod.ko (depmod ?)
+#
+%define url $URL: svn+ssh://onelab2/home/svn/ports-luigi/ipfw3-2012/planetlab/ipfwslice.spec $
+
+# Marta Carbone <marta.carbone@iet.unipi.it>
+# 2009 - Universita` di Pisa
+# License is BSD.
+
+%define name ipfwslice
+%define version 0.9
+%define taglevel 11
+
+%define release %{kernel_version}.%{taglevel}%{?pldistro:.%{pldistro}}%{?date:.%{date}}
+%define kernel_id_arch %{kernel_version}-%{kernel_release}-%{kernel_arch}
+%define kernel_id %{kernel_version}-%{kernel_release}
+
+Summary: ipfw and dummynet for Linux
+Name: %{name}
+Version: %{version}
+Release: %{release}
+License: BSD
+Group: System Environment/Kernel
+Source0: %{name}-%{version}.tar.bz2
+BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-buildroot
+
+Vendor: unipi
+Packager: PlanetLab <marta@onelab2.iet.unipi.it>
+Distribution: PlanetLab %{plrelease}
+URL: %(echo %{url} | cut -d ' ' -f 2)
+
+%description
+the frontend part of the ipfw planetlab package
+
+%prep
+%setup
+
+%build
+rm -rf $RPM_BUILD_ROOT
+
+%install
+install -D -m 755 planetlab/netconfig $RPM_BUILD_ROOT/sbin/netconfig
+# the manual page is data, not a program: install it without exec bits
+install -D -m 644 planetlab/ipfw.8.gz $RPM_BUILD_ROOT/%{_mandir}/man8/ipfw.8.gz
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+# here there is a list of the final installation directories
+%files
+%defattr(-,root,root)
+/sbin/netconfig
+%{_mandir}/man8/ipfw.8*
+
+%changelog
+* Mon Apr 12 2010 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-11
+- add ipfw initialization script to chkconfig
+
+* Wed Mar 03 2010 Talip Baris Metin <Talip-Baris.Metin@sophia.inria.fr> - ipfw-0.9-10
+- - Load module at installation - Marta
+
+* Mon Jan 11 2010 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-9
+- consistent with vsys-scripts-0.95-13
+
+* Sat Jan 09 2010 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-8
+- builds on 2.6.22 & 2.6.27 - for 32 and 64 bits
+
+* Tue Dec 15 2009 Marta Carbone <marta.carbone@iet.unipi.it>
+- more work on the radix code, added sysctl read/write support
+
+* Sun Nov 29 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-7
+- added missing qsort.c - tag 0.9-6 was broken
+
+* Thu Nov 26 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-6
+- root: removed goto into the main ipfw switch, enabled slice_id matching
+- slice: completely move netconfig checks into the backend
+
+* Mon Nov 09 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-5
+- additional features on matching packets, including uid match
+
+* Mon Sep 07 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-4
+- on behalf of Marta Carbone, more options and features
+
+* Thu Jul 23 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-3
+- fixed memory usage issue
+
+* Wed Jul 15 2009 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - ipfw-0.9-2
+- patch for building on x86_64
+
+* Thu Jun 25 2009 Marta Carbone <marta.carbone@iet.unipi.it>
+- Initial release
diff --git a/planetlab/netconfig b/planetlab/netconfig
new file mode 100755 (executable)
index 0000000..7108582
--- /dev/null
@@ -0,0 +1,14 @@
+#!/bin/sh
+#
+# Marta Carbone, Luigi Rizzo
+# Copyright (C) 2009 Universita` di Pisa
+# $Id: netconfig 4533 2009-12-16 14:39:23Z luigi $
+#
+# This script is the frontend to be used with the vsys system.
+# It simply passes information to the backend and gets back the reply
+
+# vsys pipes used to send a request to the backend and to read its reply
+PIPE_IN=/vsys/ipfw-be.in
+PIPE_OUT=/vsys/ipfw-be.out
+
+# Hand the request to the privileged shell as a positional parameter
+# instead of splicing it into the command string: characters such as
+# ';', '|', '>', '*' or '$' in the arguments are then sent to the backend
+# verbatim rather than being interpreted by the root shell.
+# Inside the inner shell, $0 is the pipe name and $1 the request.
+sudo sh -c 'echo "$1" >> "$0"' "${PIPE_IN}" "$*"
+# Print the reply of the backend.
+sudo sh -c "cat ${PIPE_OUT}"
diff --git a/planetlab/planetlab-tags.mk b/planetlab/planetlab-tags.mk
new file mode 100644 (file)
index 0000000..25eff0e
--- /dev/null
@@ -0,0 +1,6 @@
+# $Id: planetlab-tags.mk 7450 2010-10-18 11:17:43Z marta $
+# These are good to build the ipfw modules from svn on kernels 2.6.22
+# and are used to fetch files from the onelab2 repository.
+linux-2.6-SVNBRANCH    := 22
+linux-2.6-SVNPATH      := http://svn.planet-lab.org/svn/linux-2.6/tags/linux-2.6-22-39-1
+ipfwsrc-SVNPATH                := svn+ssh://luigi%40onelab2.iet.unipi.it/home/svn/ports-luigi/dummynet-branches/ipfw3
diff --git a/planetlab/planetlab.mk b/planetlab/planetlab.mk
new file mode 100644 (file)
index 0000000..6d3504b
--- /dev/null
@@ -0,0 +1,26 @@
+# $Id: planetlab.mk 4533 2009-12-16 14:39:23Z luigi $
+# .mk file to build a module
+kernel-MODULES := linux-2.6
+kernel-SPEC := kernel-2.6.spec 
+kernel-BUILD-FROM-SRPM := yes
+ifeq "$(HOSTARCH)" "i386"
+kernel-RPMFLAGS:= --target i686
+else
+kernel-RPMFLAGS:= --target $(HOSTARCH)
+endif
+ALL += kernel
+
+ipfwroot-MODULES := ipfwsrc
+ipfwroot-SPEC := planetlab/ipfwroot.spec
+ipfwroot-DEPEND-DEVEL-RPMS := kernel-devel
+ipfwroot-SPECVARS = kernel_version=$(kernel.rpm-version) \
+        kernel_release=$(kernel.rpm-release) \
+        kernel_arch=$(kernel.rpm-arch)
+ALL += ipfwroot 
+
+ipfwslice-MODULES := ipfwsrc
+ipfwslice-SPEC := planetlab/ipfwslice.spec
+ipfwslice-SPECVARS = kernel_version=$(kernel.rpm-version) \
+        kernel_release=$(kernel.rpm-release) \
+        kernel_arch=$(kernel.rpm-arch)
+ALL += ipfwslice
diff --git a/planetlab/sample_hook b/planetlab/sample_hook
new file mode 100755 (executable)
index 0000000..b47c8de
--- /dev/null
@@ -0,0 +1,34 @@
+#!/bin/sh
+
+#
+# Marta Carbone <marta.carbone@iet.unipi.it>
+# 2009 - Universita` di Pisa
+#
+# This is a sample hook file in charge of collecting
+# statistical information on netconfig usage. It appends
+# to a log file the slice name, the port and the configuration
+# string used to configure a dummynet experiment.
+#
+# Each time a user configures a dummynet port, this file
+# will be executed.
+# The following variables will be passed as arguments:
+# 
+# ${SLICE} ${PORT} ${CONFIG_STRING} 
+# ${SLICE} The slicename executing the netconfig command
+# ${PORT} The port to be configured
+# ${CONFIG_STRING} The configuration string
+#
+# Note that this script can get additional information
+# by executing the ipfw command, e.g.
+# ipfw list            # list of installed rules
+# ipfw show            # list of rules and statistical information
+# ipfw pipe show       # list of pipes
+#
+# a complete list of ipfw commands is available at:
+# http://www.freebsd.org/cgi/man.cgi?query=ipfw&sektion=8
+
+# logfile
+LOG_FILE=/tmp/ipfw_hook.log
+
+echo -e `date` >> ${LOG_FILE}
+echo "$*" >> ${LOG_FILE}
diff --git a/sys/net/if.h b/sys/net/if.h
new file mode 100644 (file)
index 0000000..1aa8e7b
--- /dev/null
@@ -0,0 +1 @@
+#include <linux/if.h>
diff --git a/sys/net/pfil.h b/sys/net/pfil.h
new file mode 100644 (file)
index 0000000..af26a79
--- /dev/null
@@ -0,0 +1,121 @@
+/*     $FreeBSD: src/sys/net/pfil.h,v 1.16 2007/06/08 12:43:25 gallatin Exp $ */
+/*     $NetBSD: pfil.h,v 1.22 2003/06/23 12:57:08 martin Exp $ */
+
+/*-
+ * Copyright (c) 1996 Matthew R. Green
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NET_PFIL_H_
+#define _NET_PFIL_H_
+
+#include <sys/systm.h>
+#include <sys/queue.h>
+#include <sys/_lock.h>
+#include <sys/_mutex.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+
+struct mbuf;
+struct ifnet;
+struct inpcb;
+
+/*
+ * The packet filter hooks are designed for anything to call them to
+ * possibly intercept the packet.
+ */
+/* One registered hook: an entry of a pfil_list_t tail queue. */
+struct packet_filter_hook {
+        TAILQ_ENTRY(packet_filter_hook) pfil_link;     /* tail queue linkage */
+       /* the hook function itself */
+       int     (*pfil_func)(void *, struct mbuf **, struct ifnet *, int,
+                   struct inpcb *);
+       /* presumably handed back as the first argument of pfil_func -- confirm */
+       void    *pfil_arg;
+};
+
+#define PFIL_IN                0x00000001
+#define PFIL_OUT       0x00000002
+#define PFIL_WAITOK    0x00000004
+#define PFIL_ALL       (PFIL_IN|PFIL_OUT)
+
+typedef        TAILQ_HEAD(pfil_list, packet_filter_hook) pfil_list_t;
+
+#define        PFIL_TYPE_AF            1       /* key is AF_* type */
+#define        PFIL_TYPE_IFNET         2       /* key is ifnet pointer */
+
+/*
+ * Head of the hook lists of one filtering point. The point is identified
+ * by ph_type (PFIL_TYPE_*) together with the key stored in ph_un.
+ *
+ * NOTE(review): on Linux and Windows the lock member is called ph_mtx,
+ * while the PFIL_LOCK_INIT/PFIL_RLOCK/PFIL_WLOCK family of macros further
+ * down in this header refers to ph_lock; those macros cannot compile if
+ * expanded on these platforms -- confirm they are unused there.
+ */
+struct pfil_head {
+       pfil_list_t     ph_in;          /* hooks returned for PFIL_IN */
+       pfil_list_t     ph_out;         /* hooks returned for PFIL_OUT */
+       int             ph_type;        /* one of PFIL_TYPE_* */
+       int             ph_nhooks;      /* tested by PFIL_HOOKED() */
+#if defined( __linux__ ) || defined( _WIN32 )
+       rwlock_t        ph_mtx;
+#else
+       struct rmlock   ph_lock;
+#endif
+       /* the key: a value for PFIL_TYPE_AF, a pointer for PFIL_TYPE_IFNET */
+       union {
+               u_long          phu_val;
+               void            *phu_ptr;
+       } ph_un;
+#define        ph_af           ph_un.phu_val
+#define        ph_ifnet        ph_un.phu_ptr
+       LIST_ENTRY(pfil_head) ph_list;  /* linkage in a list of heads */
+};
+
+int    pfil_add_hook(int (*func)(void *, struct mbuf **, struct ifnet *,
+           int, struct inpcb *), void *, int, struct pfil_head *);
+int    pfil_remove_hook(int (*func)(void *, struct mbuf **, struct ifnet *,
+           int, struct inpcb *), void *, int, struct pfil_head *);
+int    pfil_run_hooks(struct pfil_head *, struct mbuf **, struct ifnet *,
+           int, struct inpcb *inp);
+
+int    pfil_head_register(struct pfil_head *);
+int    pfil_head_unregister(struct pfil_head *);
+
+struct pfil_head *pfil_head_get(int, u_long);
+
+/* True when ph_nhooks of head p is positive. */
+#define        PFIL_HOOKED(p) ((p)->ph_nhooks > 0)
+/*
+ * Locking wrappers. The per-head ones operate on (p)->ph_lock through the
+ * rm_*() calls; the PFIL_LIST_* ones take pfil_global_lock, which is not
+ * declared in this header.
+ * NOTE(review): struct pfil_head has a ph_lock member only when neither
+ * __linux__ nor _WIN32 is defined (it has ph_mtx otherwise), so the
+ * per-head macros cannot compile if expanded on those platforms.
+ */
+#define        PFIL_LOCK_INIT(p) \
+    rm_init_flags(&(p)->ph_lock, "PFil hook read/write mutex", RM_RECURSE)
+#define        PFIL_LOCK_DESTROY(p) rm_destroy(&(p)->ph_lock)
+#define PFIL_RLOCK(p, t) rm_rlock(&(p)->ph_lock, (t))
+#define PFIL_WLOCK(p) rm_wlock(&(p)->ph_lock)
+#define PFIL_RUNLOCK(p, t) rm_runlock(&(p)->ph_lock, (t))
+#define PFIL_WUNLOCK(p) rm_wunlock(&(p)->ph_lock)
+#define PFIL_LIST_LOCK() mtx_lock(&pfil_global_lock)
+#define PFIL_LIST_UNLOCK() mtx_unlock(&pfil_global_lock)
+
+/*
+ * Return the first hook registered on ph for direction dir, which must be
+ * exactly PFIL_IN or PFIL_OUT; any other value (PFIL_ALL included) yields
+ * NULL.
+ */
+static __inline struct packet_filter_hook *
+pfil_hook_get(int dir, struct pfil_head *ph)
+{
+
+       switch (dir) {
+       case PFIL_IN:
+               return (TAILQ_FIRST(&ph->ph_in));
+       case PFIL_OUT:
+               return (TAILQ_FIRST(&ph->ph_out));
+       default:
+               return (NULL);
+       }
+}
+
+#endif /* _NET_PFIL_H_ */
diff --git a/sys/net/radix.c b/sys/net/radix.c
new file mode 100644 (file)
index 0000000..22bac2b
--- /dev/null
@@ -0,0 +1,1203 @@
+/*-
+ * Copyright (c) 1988, 1989, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)radix.c     8.5 (Berkeley) 5/19/95
+ * $FreeBSD: head/sys/net/radix.c 200354 2009-12-10 10:34:30Z luigi $
+ */
+
+/*
+ * Routines to build and maintain radix trees for routing lookups.
+ */
+#include <sys/param.h>
+#ifdef _KERNEL
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/syslog.h>
+#include <net/radix.h>
+#include "opt_mpath.h"
+#ifdef RADIX_MPATH
+#include <net/radix_mpath.h>
+#endif
+#else /* !_KERNEL */
+#include <stdio.h>
+#include <strings.h>
+#include <stdlib.h>
+#define log(x, arg...) fprintf(stderr, ## arg)
+#define panic(x)       fprintf(stderr, "PANIC: %s", x), exit(1)
+#define min(a, b) ((a) < (b) ? (a) : (b) )
+#include <net/radix.h>
+#endif /* !_KERNEL */
+
+static int     rn_walktree_from(struct radix_node_head *h, void *a, void *m,
+                   walktree_f_t *f, void *w);
+static int rn_walktree(struct radix_node_head *, walktree_f_t *, void *);
+static struct radix_node
+        *rn_insert(void *, struct radix_node_head *, int *,
+            struct radix_node [2]),
+        *rn_newpair(void *, int, struct radix_node[2]),
+        *rn_search(void *, struct radix_node *),
+        *rn_search_m(void *, struct radix_node *, void *);
+
+static int     max_keylen;
+static struct radix_mask *rn_mkfreelist;
+static struct radix_node_head *mask_rnhead;
+/*
+ * Work area -- the following point to 3 buffers of size max_keylen,
+ * allocated in this order in a block of memory malloc'ed by rn_init.
+ * rn_zeros, rn_ones are set in rn_init and used in readonly afterwards.
+ * addmask_key is used in rn_addmask in rw mode and not thread-safe.
+ */
+static char *rn_zeros, *rn_ones, *addmask_key;
+
+/*
+ * MKGet(m): set m to a struct radix_mask, taken from the head of the
+ * rn_mkfreelist free list when that list is not empty and obtained through
+ * R_Malloc() otherwise.
+ * MKFree(m): push m on the head of rn_mkfreelist for later reuse.
+ * Both expand to a bare brace block rather than to do { } while (0), so
+ * "if (c) MKGet(m); else ..." would not parse: use them as full statements.
+ */
+#define MKGet(m) {                                             \
+       if (rn_mkfreelist) {                                    \
+               m = rn_mkfreelist;                              \
+               rn_mkfreelist = (m)->rm_mklist;                 \
+       } else                                                  \
+               R_Malloc(m, struct radix_mask *, sizeof (struct radix_mask)); }
+#define MKFree(m) { (m)->rm_mklist = rn_mkfreelist; rn_mkfreelist = (m);}
+
+#define rn_masktop (mask_rnhead->rnh_treetop)
+
+static int     rn_lexobetter(void *m_arg, void *n_arg);
+static struct radix_mask *
+               rn_new_radix_mask(struct radix_node *tt,
+                   struct radix_mask *next);
+static int     rn_satisfies_leaf(char *trial, struct radix_node *leaf,
+                   int skip);
+
+/*
+ * The data structure for the keys is a radix tree with one way
+ * branching removed.  The index rn_bit at an internal node n represents a bit
+ * position to be tested.  The tree is arranged so that all descendants
+ * of a node n have keys whose bits all agree up to position rn_bit - 1.
+ * (We say the index of n is rn_bit.)
+ *
+ * There is at least one descendant which has a one bit at position rn_bit,
+ * and at least one with a zero there.
+ *
+ * A route is determined by a pair of key and mask.  We require that the
+ * bit-wise logical and of the key and mask to be the key.
+ * We define the index of a route associated with the mask to be
+ * the first bit number in the mask where 0 occurs (with bit number 0
+ * representing the highest order bit).
+ *
+ * We say a mask is normal if every bit is 0, past the index of the mask.
+ * If a node n has a descendant (k, m) with index(m) == index(n) == rn_bit,
+ * and m is a normal mask, then the route applies to every descendant of n.
+ * If the index(m) < rn_bit, this implies the trailing last few bits of k
+ * before bit b are all 0, (and hence consequently true of every descendant
+ * of n), so the route applies to all descendants of the node as well.
+ *
+ * Similar logic shows that a non-normal mask m such that
+ * index(m) <= index(n) could potentially apply to many children of n.
+ * Thus, for each non-host route, we attach its mask to a list at an internal
+ * node as high in the tree as we can go.
+ *
+ * The present version of the code makes use of normal routes in short-
+ * circuiting an explicit mask and compare operation when testing whether
+ * a key satisfies a normal route, and also in remembering the unique leaf
+ * that governs a subtree.
+ */
+
+/*
+ * Most of the functions in this code assume that the key/mask arguments
+ * are sockaddr-like structures, where the first byte is an u_char
+ * indicating the size of the entire structure.
+ *
+ * To make the assumption more explicit, we use the LEN() macro to access
+ * this field. It is safe to pass an expression with side effects
+ * to LEN() as the argument is evaluated only once.
+ * We cast the result to int as this is the dominant usage.
+ */
+#define LEN(x) ( (int) (*(const u_char *)(x)) )
+
+/*
+ * XXX THIS NEEDS TO BE FIXED
+ * In the code, pointers to keys and masks are passed as either
+ * 'void *' (because callers use to pass pointers of various kinds), or
+ * 'caddr_t' (which is fine for pointer arithmetics, but not very
+ * clean when you dereference it to access data). Furthermore, caddr_t
+ * is really 'char *', while the natural type to operate on keys and
+ * masks would be 'u_char'. This mismatch require a lot of casts and
+ * intermediate variables to adapt types that clutter the code.
+ */
+
+/*
+ * Search a node in the tree matching the key.
+ */
+static struct radix_node *
+rn_search(void *v_arg, struct radix_node *head)
+{
+       caddr_t key = v_arg;
+       struct radix_node *node = head;
+
+       /* Descend while on internal nodes, i.e. those with rn_bit >= 0. */
+       while (node->rn_bit >= 0) {
+               /* The key bit tested by this node selects the branch. */
+               node = (node->rn_bmask & key[node->rn_offset]) ?
+                   node->rn_right : node->rn_left;
+       }
+       return (node);
+}
+
+/*
+ * Same as above, but with an additional mask.
+ * XXX note this function is used only once.
+ */
+static struct radix_node *
+rn_search_m(void *v_arg, struct radix_node *head, void *m_arg)
+{
+       caddr_t key = v_arg, mask = m_arg;
+       struct radix_node *node = head;
+
+       while (node->rn_bit >= 0) {
+               /*
+                * Go right only when the tested bit is set in both the
+                * mask and the key; a bit outside the mask counts as 0.
+                */
+               if (!(node->rn_bmask & mask[node->rn_offset]) ||
+                   !(node->rn_bmask & key[node->rn_offset]))
+                       node = node->rn_left;
+               else
+                       node = node->rn_right;
+       }
+       return (node);
+}
+
+/*
+ * Return 1 when mask m_arg is strictly more specific than mask n_arg,
+ * i.e. every bit set in n is also set in m and the two masks are not
+ * equal; return 0 otherwise. Both arguments start with a length byte
+ * (see LEN()).
+ */
+int
+rn_refines(m_arg, n_arg)
+       void *m_arg, *n_arg;
+{
+       register caddr_t m = m_arg, n = n_arg;
+       /* lim2: end of n; lim: end of the part of n that m also covers */
+       register caddr_t lim, lim2 = lim = n + LEN(n);
+       /* > 0 when n is the longer one; also steps m and n past the length byte */
+       int longer = LEN(n++) - LEN(m++);
+       int masks_are_equal = 1;
+
+       if (longer > 0)
+               lim -= longer;
+       /* common part: n must not have any bit that m lacks */
+       while (n < lim) {
+               if (*n & ~(*m))
+                       return 0;
+               if (*n++ != *m++)
+                       masks_are_equal = 0;
+       }
+       /* bytes of n beyond the end of m must all be zero */
+       while (n < lim2)
+               if (*n++)
+                       return 0;
+       /* equal so far, but m is longer: any non-zero extra byte makes m finer */
+       if (masks_are_equal && (longer < 0))
+               for (lim2 = m - longer; m < lim2; )
+                       if (*m++)
+                               return 1;
+       return (!masks_are_equal);
+}
+
+/*
+ * Look up key v_arg in head. Without a mask this is just rn_match(); with
+ * a mask, the chain of duplicate keys starting at the matched leaf is
+ * scanned for the entry whose rn_mask is the very same pointer as the key
+ * of the node returned by rn_addmask() for m_arg. Returns NULL when
+ * rn_addmask() yields no node or no entry carries that mask.
+ */
+struct radix_node *
+rn_lookup(void *v_arg, void *m_arg, struct radix_node_head *head)
+{
+       struct radix_node *leaf;
+       caddr_t netmask = NULL;
+
+       if (m_arg != NULL) {
+               struct radix_node *mask_node =
+                   rn_addmask(m_arg, 1, head->rnh_treetop->rn_offset);
+
+               if (mask_node == NULL)
+                       return (NULL);
+               netmask = mask_node->rn_key;
+       }
+       leaf = rn_match(v_arg, head);
+       if (netmask == NULL)
+               return (leaf);
+       /* masks are compared by address, not by content */
+       for (; leaf != NULL; leaf = leaf->rn_dupedkey)
+               if (leaf->rn_mask == netmask)
+                       break;
+       return (leaf);
+}
+
+/*
+ * Return 1 when trial agrees with the key of leaf on every bit selected by
+ * the mask of leaf (rn_ones when the leaf has no mask), 0 otherwise. Only
+ * bytes from offset skip up to the shortest of the lengths involved are
+ * examined.
+ */
+static int
+rn_satisfies_leaf(char *trial, struct radix_node *leaf, int skip)
+{
+       char *key = leaf->rn_key;
+       char *mask = leaf->rn_mask;
+       int len = min(LEN(trial), LEN(key));
+       int i;
+
+       if (mask != NULL)
+               len = min(len, LEN(mask));
+       else
+               mask = rn_ones;
+       for (i = skip; i < len; i++)
+               if ((trial[i] ^ key[i]) & mask[i])
+                       return (0);
+       return (1);
+}
+
+/*
+ * Find an entry matching key v_arg in the tree of head.
+ * The tree is first descended to a leaf. If the key equals the key of that
+ * leaf (compared up to the length of the leaf mask, when it has one) the
+ * leaf is returned, or its first duplicate when the leaf is a root node.
+ * Otherwise the duplicate-key chain of the leaf is tried, and then the
+ * mask lists of its ancestors up to the top of the tree.
+ * Returns 0 when nothing matches.
+ */
+struct radix_node *
+rn_match(v_arg, head)
+       void *v_arg;
+       struct radix_node_head *head;
+{
+       caddr_t v = v_arg;
+       register struct radix_node *t = head->rnh_treetop, *x;
+       register caddr_t cp = v, cp2;
+       caddr_t cplim;
+       struct radix_node *saved_t, *top = t;
+       int off = t->rn_offset, vlen = LEN(cp), matched_off;
+       register int test, b, rn_bit;
+
+       /*
+        * Open code rn_search(v, top) to avoid overhead of extra
+        * subroutine call.
+        */
+       for (; t->rn_bit >= 0; ) {
+               if (t->rn_bmask & cp[t->rn_offset])
+                       t = t->rn_right;
+               else
+                       t = t->rn_left;
+       }
+       /*
+        * See if we match exactly as a host destination
+        * or at least learn how many bits match, for normal mask finesse.
+        *
+        * It doesn't hurt us to limit how many bytes to check
+        * to the length of the mask, since if it matches we had a genuine
+        * match and the leaf we have is the most specific one anyway;
+        * if it didn't match with a shorter length it would fail
+        * with a long one.  This wins big for class B&C netmasks which
+        * are probably the most common case...
+        */
+       if (t->rn_mask)
+               vlen = *(u_char *)t->rn_mask;
+       cp += off; cp2 = t->rn_key + off; cplim = v + vlen;
+       for (; cp < cplim; cp++, cp2++)
+               if (*cp != *cp2)
+                       goto on1;
+       /*
+        * This extra grot is in case we are explicitly asked
+        * to look up the default.  Ugh!
+        *
+        * Never return the root node itself, it seems to cause a
+        * lot of confusion.
+        */
+       if (t->rn_flags & RNF_ROOT)
+               t = t->rn_dupedkey;
+       return t;
+on1:
+       test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */
+       /* b: position, counted from the most significant bit, of that bit */
+       for (b = 7; (test >>= 1) > 0;)
+               b--;
+       matched_off = cp - v;
+       /* make b the index of the differing bit within the whole key */
+       b += matched_off << 3;
+       /*
+        * Negative form of b, compared below with the rn_bit of leaves and
+        * the rm_bit of masks (presumably stored with the same -1 - index
+        * encoding -- confirm in rn_addroute).
+        */
+       rn_bit = -1 - b;
+       /*
+        * If there is a host route in a duped-key chain, it will be first.
+        */
+       if ((saved_t = t)->rn_mask == 0)
+               t = t->rn_dupedkey;
+       for (; t; t = t->rn_dupedkey)
+               /*
+                * Even if we don't match exactly as a host,
+                * we may match if the leaf we wound up at is
+                * a route to a net.
+                */
+               if (t->rn_flags & RNF_NORMAL) {
+                       if (rn_bit <= t->rn_bit)
+                               return t;
+               } else if (rn_satisfies_leaf(v, t, matched_off))
+                               return t;
+       t = saved_t;
+       /* start searching up the tree */
+       do {
+               register struct radix_mask *m;
+               t = t->rn_parent;
+               m = t->rn_mklist;
+               /*
+                * If non-contiguous masks ever become important
+                * we can restore the masking and open coding of
+                * the search and satisfaction test and put the
+                * calculation of "off" back before the "do".
+                */
+               while (m) {
+                       if (m->rm_flags & RNF_NORMAL) {
+                               if (rn_bit <= m->rm_bit)
+                                       return (m->rm_leaf);
+                       } else {
+                               /*
+                                * Search below t using only the key bits
+                                * covered by this mask, then pick the
+                                * duplicate carrying this very mask.
+                                */
+                               off = min(t->rn_offset, matched_off);
+                               x = rn_search_m(v, t, m->rm_mask);
+                               while (x && x->rn_mask != m->rm_mask)
+                                       x = x->rn_dupedkey;
+                               if (x && rn_satisfies_leaf(v, x, off))
+                                       return x;
+                       }
+                       m = m->rm_mklist;
+               }
+       } while (t != top);
+       return 0;
+}
+
+#ifdef RN_DEBUG
+int    rn_nodenum;
+struct radix_node *rn_clist;
+int    rn_saveinfo;
+int    rn_debug =  1;
+#endif
+
+/*
+ * Whenever we add a new leaf to the tree, we also add a parent node,
+ * so we allocate them as an array of two elements: the first one must be
+ * the leaf (see RNTORT() in route.c), the second one is the parent.
+ * This routine initializes the relevant fields of the nodes, so that
+ * the leaf is the left child of the parent node, and both nodes have
+ * (almost) all fields filled as appropriate.
+ * (XXX some fields are left unset, see the '#if 0' section).
+ * The function returns a pointer to the parent node.
+ */
+
+static struct radix_node *
+rn_newpair(v, b, nodes)
+       void *v;
+       int b;
+       struct radix_node nodes[2];
+{
+       register struct radix_node *tt = nodes, *t = tt + 1;
+       t->rn_bit = b;
+       t->rn_bmask = 0x80 >> (b & 7);
+       t->rn_left = tt;
+       t->rn_offset = b >> 3;
+
+#if 0  /* XXX perhaps we should fill these fields as well. */
+       t->rn_parent = t->rn_right = NULL;
+
+       tt->rn_mask = NULL;
+       tt->rn_dupedkey = NULL;
+       tt->rn_bmask = 0;
+#endif
+       tt->rn_bit = -1;
+       tt->rn_key = (caddr_t)v;
+       tt->rn_parent = t;
+       tt->rn_flags = t->rn_flags = RNF_ACTIVE;
+       tt->rn_mklist = t->rn_mklist = 0;
+#ifdef RN_DEBUG
+       tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++;
+       tt->rn_twin = t;
+       tt->rn_ybro = rn_clist;
+       rn_clist = tt;
+#endif
+       return t;
+}
+
+static struct radix_node *     /* leaf for v_arg: the existing one or nodes[0] */
+rn_insert(v_arg, head, dupentry, nodes)
+       void *v_arg;
+       struct radix_node_head *head;
+       int *dupentry;                  /* out: 1 if key already in tree, else 0 */
+       struct radix_node nodes[2];     /* storage handed to rn_newpair() */
+{
+       caddr_t v = v_arg;
+       struct radix_node *top = head->rnh_treetop;
+       int head_off = top->rn_offset, vlen = LEN(v);
+       register struct radix_node *t = rn_search(v_arg, top);
+       register caddr_t cp = v + head_off;
+       register int b;
+       struct radix_node *tt;
+       /*
+        * Find first bit at which v and t->rn_key differ
+        */
+    {
+       register caddr_t cp2 = t->rn_key + head_off;
+       register int cmp_res;
+       caddr_t cplim = v + vlen;
+
+       while (cp < cplim)
+               if (*cp2++ != *cp++)
+                       goto on1;
+       *dupentry = 1;                  /* every compared byte matched */
+       return t;                       /* nodes[] is not touched in this case */
+on1:
+       *dupentry = 0;
+       cmp_res = (cp[-1] ^ cp2[-1]) & 0xff;    /* bits differing in that byte */
+       for (b = (cp - v) << 3; cmp_res; b--)   /* back up to the highest one */
+               cmp_res >>= 1;
+    }
+    {
+       register struct radix_node *p, *x = top;
+       cp = v;
+       do {
+               p = x;
+               if (cp[x->rn_offset] & x->rn_bmask)
+                       x = x->rn_right;
+               else
+                       x = x->rn_left;
+       } while (b > (unsigned) x->rn_bit);
+                               /* continue while 0 <= x->rn_bit < b; the cast makes a leaf's negative rn_bit end the loop */
+#ifdef RN_DEBUG
+       if (rn_debug)
+               log(LOG_DEBUG, "rn_insert: Going In:\n"), traverse(p);
+#endif
+       t = rn_newpair(v_arg, b, nodes); /* t: new internal node testing bit b */
+       tt = t->rn_left;                /* tt: new leaf holding v_arg */
+       if ((cp[p->rn_offset] & p->rn_bmask) == 0)
+               p->rn_left = t;         /* t takes x's place under p */
+       else
+               p->rn_right = t;
+       x->rn_parent = t;
+       t->rn_parent = p; /* frees x, p as temp vars below */
+       if ((cp[t->rn_offset] & t->rn_bmask) == 0) {
+               t->rn_right = x;        /* bit b of v is 0: new leaf stays left */
+       } else {
+               t->rn_right = tt;       /* bit b of v is 1: new leaf goes right */
+               t->rn_left = x;
+       }
+#ifdef RN_DEBUG
+       if (rn_debug)
+               log(LOG_DEBUG, "rn_insert: Coming Out:\n"), traverse(p);
+#endif
+    }
+       return (tt);
+}
+
+struct radix_node *            /* mask-tree leaf for the mask, or 0 */
+rn_addmask(n_arg, search, skip)
+       int search, skip;       /* search != 0: look up only, never insert */
+       void *n_arg;
+{
+       caddr_t netmask = (caddr_t)n_arg;
+       register struct radix_node *x;
+       register caddr_t cp, cplim;
+       register int b = 0, mlen, j;
+       int maskduplicated, m0, isnormal;
+       struct radix_node *saved_x;
+       static int last_zeroed = 0;     /* extent of addmask_key dirtied by earlier calls */
+
+       if ((mlen = LEN(netmask)) > max_keylen)
+               mlen = max_keylen;
+       if (skip == 0)
+               skip = 1;               /* byte 0 holds the length */
+       if (mlen <= skip)
+               return (mask_rnhead->rnh_nodes);        /* no mask bytes past skip */
+       if (skip > 1)
+               bcopy(rn_ones + 1, addmask_key + 1, skip - 1);  /* skipped bytes become 0xff */
+       if ((m0 = mlen) > skip)
+               bcopy(netmask + skip, addmask_key + skip, mlen - skip);
+       /*
+        * Trim trailing zeroes.
+        */
+       for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0;)
+               cp--;
+       mlen = cp - addmask_key;
+       if (mlen <= skip) {             /* mask is all zero past skip */
+               if (m0 >= last_zeroed)
+                       last_zeroed = mlen;
+               return (mask_rnhead->rnh_nodes);
+       }
+       if (m0 < last_zeroed)           /* wipe leftovers of a longer earlier mask */
+               bzero(addmask_key + m0, last_zeroed - m0);
+       *addmask_key = last_zeroed = mlen;      /* store trimmed length in byte 0 */
+       x = rn_search(addmask_key, rn_masktop);
+       if (bcmp(addmask_key, x->rn_key, mlen) != 0)
+               x = 0;                  /* closest leaf is not an exact match */
+       if (x || search)
+               return (x);
+       R_Zalloc(x, struct radix_node *, max_keylen + 2 * sizeof (*x));
+       if ((saved_x = x) == 0)
+               return (0);
+       netmask = cp = (caddr_t)(x + 2);        /* key bytes follow the two nodes */
+       bcopy(addmask_key, cp, mlen);
+       x = rn_insert(cp, mask_rnhead, &maskduplicated, x);
+       if (maskduplicated) {
+               log(LOG_ERR, "rn_addmask: mask impossibly already in tree");
+               Free(saved_x);
+               return (x);
+       }
+       /*
+        * Calculate index of mask, and check for normalcy.
+        * First find the first byte with a 0 bit, then if there are
+        * more bits left (remember we already trimmed the trailing 0's),
+        * the pattern must be one of those in normal_chars[], or we have
+        * a non-contiguous mask.
+        */
+       cplim = netmask + mlen;
+       isnormal = 1;
+       for (cp = netmask + skip; (cp < cplim) && *(u_char *)cp == 0xff;)
+               cp++;
+       if (cp != cplim) {
+               static char normal_chars[] = {
+                       0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};
+
+               for (j = 0x80; (j & *cp) != 0; j >>= 1) /* count leading 1 bits */
+                       b++;
+               if (*cp != normal_chars[b] || cp != (cplim - 1))
+                       isnormal = 0;   /* stray bits, or more bytes follow */
+       }
+       b += (cp - netmask) << 3;       /* add the bits of the all-ones bytes */
+       x->rn_bit = -1 - b;             /* leaves store the index as -1 - b */
+       if (isnormal)
+               x->rn_flags |= RNF_NORMAL;
+       return (x);
+}
+
+static int     /* XXX: arbitrary ordering for non-contiguous masks */
+rn_lexobetter(m_arg, n_arg)
+       void *m_arg, *n_arg;
+{
+       register u_char *a = m_arg, *b = n_arg, *end = a + LEN(a);
+
+       if (LEN(a) != LEN(b))   /* not really an order: longer one goes first */
+               return (LEN(a) > LEN(b));
+       /* Same length: 1 as soon as a byte of m exceeds its peer in n. */
+       while (a < end)
+               if (*a++ > *b++)
+                       return (1);
+       return (0);
+}
+
+static struct radix_mask *
+rn_new_radix_mask(tt, next)
+       register struct radix_node *tt;
+       register struct radix_mask *next;
+{
+       register struct radix_mask *rm;
+
+       MKGet(rm);
+       if (rm == 0) {
+               log(LOG_ERR, "Mask for route not entered\n");
+               return (0);
+       }
+       bzero(rm, sizeof *rm);          /* every field not set below stays 0 */
+       rm->rm_mklist = next;           /* chain in front of 'next' */
+       tt->rn_mklist = rm;             /* and remember it in the leaf */
+       rm->rm_flags = tt->rn_flags;
+       rm->rm_bit = tt->rn_bit;
+       if ((tt->rn_flags & RNF_NORMAL) == 0)
+               rm->rm_mask = tt->rn_mask;      /* otherwise keep the mask */
+       else
+               rm->rm_leaf = tt;       /* RNF_NORMAL: refer to the leaf */
+       return (rm);
+}
+
+struct radix_node *
+rn_addroute(v_arg, n_arg, head, treenodes)
+       void *v_arg, *n_arg;
+       struct radix_node_head *head;
+       struct radix_node treenodes[2];
+{
+       caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg;
+       register struct radix_node *t, *x = 0, *tt;
+       struct radix_node *saved_tt, *top = head->rnh_treetop;
+       short b = 0, b_leaf = 0;
+       int keyduplicated;
+       caddr_t mmask;
+       struct radix_mask *m, **mp;
+
+       /*
+        * In dealing with non-contiguous masks, there may be
+        * many different routes which have the same mask.
+        * We will find it useful to have a unique pointer to
+        * the mask to speed avoiding duplicate references at
+        * nodes and possibly save time in calculating indices.
+        */
+       if (netmask)  {
+               if ((x = rn_addmask(netmask, 0, top->rn_offset)) == 0)
+                       return (0);
+               b_leaf = x->rn_bit;
+               b = -1 - x->rn_bit;
+               netmask = x->rn_key;
+       }
+       /*
+        * Deal with duplicated keys: attach node to previous instance
+        */
+       saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes);
+       if (keyduplicated) {
+               for (t = tt; tt; t = tt, tt = tt->rn_dupedkey) {
+#ifdef RADIX_MPATH
+                       /* permit multipath, if enabled for the family */
+                       if (rn_mpath_capable(head) && netmask == tt->rn_mask) {
+                               /*
+                                * go down to the end of multipaths, so that
+                                * new entry goes into the end of rn_dupedkey
+                                * chain.
+                                */
+                               do {
+                                       t = tt;
+                                       tt = tt->rn_dupedkey;
+                               } while (tt && t->rn_mask == tt->rn_mask);
+                               break;
+                       }
+#endif
+                       if (tt->rn_mask == netmask)
+                               return (0);
+                       if (netmask == 0 ||
+                           (tt->rn_mask &&
+                            ((b_leaf < tt->rn_bit) /* index(netmask) > node */
+                             || rn_refines(netmask, tt->rn_mask)
+                             || rn_lexobetter(netmask, tt->rn_mask))))
+                               break;
+               }
+               /*
+                * If the mask is not duplicated, we wouldn't
+                * find it among possible duplicate key entries
+                * anyway, so the above test doesn't hurt.
+                *
+                * We sort the masks for a duplicated key the same way as
+                * in a masklist -- most specific to least specific.
+                * This may require the unfortunate nuisance of relocating
+                * the head of the list.
+                *
+                * We also reverse, or doubly link the list through the
+                * parent pointer.
+                */
+               if (tt == saved_tt) {
+                       struct  radix_node *xx = x;
+                       /* link in at head of list */
+                       (tt = treenodes)->rn_dupedkey = t;
+                       tt->rn_flags = t->rn_flags;
+                       tt->rn_parent = x = t->rn_parent;
+                       t->rn_parent = tt;                      /* parent */
+                       if (x->rn_left == t)
+                               x->rn_left = tt;
+                       else
+                               x->rn_right = tt;
+                       saved_tt = tt; x = xx;
+               } else {
+                       (tt = treenodes)->rn_dupedkey = t->rn_dupedkey;
+                       t->rn_dupedkey = tt;
+                       tt->rn_parent = t;                      /* parent */
+                       if (tt->rn_dupedkey)                    /* parent */
+                               tt->rn_dupedkey->rn_parent = tt; /* parent */
+               }
+#ifdef RN_DEBUG
+               t=tt+1; tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++;
+               tt->rn_twin = t; tt->rn_ybro = rn_clist; rn_clist = tt;
+#endif
+               tt->rn_key = (caddr_t) v;
+               tt->rn_bit = -1;
+               tt->rn_flags = RNF_ACTIVE;
+       }
+       /*
+        * Put mask in tree.
+        */
+       if (netmask) {
+               tt->rn_mask = netmask;
+               tt->rn_bit = x->rn_bit;
+               tt->rn_flags |= x->rn_flags & RNF_NORMAL;
+       }
+       t = saved_tt->rn_parent;
+       if (keyduplicated)
+               goto on2;
+       b_leaf = -1 - t->rn_bit;
+       if (t->rn_right == saved_tt)
+               x = t->rn_left;
+       else
+               x = t->rn_right;
+       /* Promote general routes from below */
+       if (x->rn_bit < 0) {
+           for (mp = &t->rn_mklist; x; x = x->rn_dupedkey)
+               if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) {
+                       *mp = m = rn_new_radix_mask(x, 0);
+                       if (m)
+                               mp = &m->rm_mklist;
+               }
+       } else if (x->rn_mklist) {
+               /*
+                * Skip over masks whose index is > that of new node
+                */
+               for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist)
+                       if (m->rm_bit >= b_leaf)
+                               break;
+               t->rn_mklist = m; *mp = 0;
+       }
+on2:
+       /* Add new route to highest possible ancestor's list */
+       if ((netmask == 0) || (b > t->rn_bit ))
+               return tt; /* can't lift at all */
+       b_leaf = tt->rn_bit;
+       do {
+               x = t;
+               t = t->rn_parent;
+       } while (b <= t->rn_bit && x != top);
+       /*
+        * Search through routes associated with node to
+        * insert new route according to index.
+        * Need same criteria as when sorting dupedkeys to avoid
+        * double loop on deletion.
+        */
+       for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) {
+               if (m->rm_bit < b_leaf)
+                       continue;
+               if (m->rm_bit > b_leaf)
+                       break;
+               if (m->rm_flags & RNF_NORMAL) {
+                       mmask = m->rm_leaf->rn_mask;
+                       if (tt->rn_flags & RNF_NORMAL) {
+#if !defined(RADIX_MPATH)
+                           log(LOG_ERR,
+                               "Non-unique normal route, mask not entered\n");
+#endif
+                               return tt;
+                       }
+               } else
+                       mmask = m->rm_mask;
+               if (mmask == netmask) {
+                       m->rm_refs++;
+                       tt->rn_mklist = m;
+                       return tt;
+               }
+               if (rn_refines(netmask, mmask)
+                   || rn_lexobetter(netmask, mmask))
+                       break;
+       }
+       *mp = rn_new_radix_mask(tt, *mp);
+       return tt;
+}
+
+struct radix_node *            /* the unlinked leaf, or 0 if nothing was removed */
+rn_delete(v_arg, netmask_arg, head)
+       void *v_arg, *netmask_arg;
+       struct radix_node_head *head;
+{
+       register struct radix_node *t, *p, *x, *tt;
+       struct radix_mask *m, *saved_m, **mp;
+       struct radix_node *dupedkey, *saved_tt, *top;
+       caddr_t v, netmask;
+       int b, head_off, vlen;
+
+       v = v_arg;
+       netmask = netmask_arg;
+       x = head->rnh_treetop;
+       tt = rn_search(v, x);
+       head_off = x->rn_offset;
+       vlen =  LEN(v);
+       saved_tt = tt;                  /* head of the duped-key chain for v */
+       top = x;
+       if (tt == 0 ||
+           bcmp(v + head_off, tt->rn_key + head_off, vlen - head_off))
+               return (0);             /* key is not in the tree */
+       /*
+        * Delete our route from mask lists.
+        */
+       if (netmask) {
+               if ((x = rn_addmask(netmask, 1, head_off)) == 0) /* lookup only */
+                       return (0);
+               netmask = x->rn_key;
+               while (tt->rn_mask != netmask)  /* find the leaf with this mask */
+                       if ((tt = tt->rn_dupedkey) == 0)
+                               return (0);
+       }
+       if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0)
+               goto on1;               /* no annotation to remove */
+       if (tt->rn_flags & RNF_NORMAL) {
+               if (m->rm_leaf != tt || m->rm_refs > 0) {
+                       log(LOG_ERR, "rn_delete: inconsistent annotation\n");
+                       return 0;  /* dangling ref could cause disaster */
+               }
+       } else {
+               if (m->rm_mask != tt->rn_mask) {
+                       log(LOG_ERR, "rn_delete: inconsistent annotation\n");
+                       goto on1;
+               }
+               if (--m->rm_refs >= 0)
+                       goto on1;       /* record still used by other leaves */
+       }
+       b = -1 - tt->rn_bit;
+       t = saved_tt->rn_parent;
+       if (b > t->rn_bit)
+               goto on1; /* Wasn't lifted at all */
+       do {                            /* climb to the node holding the record */
+               x = t;
+               t = t->rn_parent;
+       } while (b <= t->rn_bit && x != top);
+       for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist)
+               if (m == saved_m) {
+                       *mp = m->rm_mklist;
+                       MKFree(m);
+                       break;
+               }
+       if (m == 0) {
+               log(LOG_ERR, "rn_delete: couldn't find our annotation\n");
+               if (tt->rn_flags & RNF_NORMAL)
+                       return (0); /* Dangling ref to us */
+       }
+on1:
+       /*
+        * Eliminate us from tree
+        */
+       if (tt->rn_flags & RNF_ROOT)
+               return (0);             /* root nodes are never removed */
+#ifdef RN_DEBUG
+       /* Get us out of the creation list */
+       for (t = rn_clist; t && t->rn_ybro != tt; t = t->rn_ybro) {}
+       if (t) t->rn_ybro = tt->rn_ybro;
+#endif
+       t = tt->rn_parent;
+       dupedkey = saved_tt->rn_dupedkey;
+       if (dupedkey) {
+               /*
+                * Here, tt is the deletion target and
+                * saved_tt is the head of the dupekey chain.
+                */
+               if (tt == saved_tt) {
+                       /* remove from head of chain */
+                       x = dupedkey; x->rn_parent = t;
+                       if (t->rn_left == tt)
+                               t->rn_left = x;
+                       else
+                               t->rn_right = x;
+               } else {
+                       /* find node in front of tt on the chain */
+                       for (x = p = saved_tt; p && p->rn_dupedkey != tt;)
+                               p = p->rn_dupedkey;
+                       if (p) {
+                               p->rn_dupedkey = tt->rn_dupedkey;
+                               if (tt->rn_dupedkey)            /* parent */
+                                       tt->rn_dupedkey->rn_parent = p;
+                                                               /* parent */
+                       } else log(LOG_ERR, "rn_delete: couldn't find us\n");
+               }
+               t = tt + 1;             /* internal node stored right after tt */
+               if  (t->rn_flags & RNF_ACTIVE) {
+#ifndef RN_DEBUG
+                       *++x = *t;      /* move it into the node after x */
+                       p = t->rn_parent;
+#else
+                       b = t->rn_info;
+                       *++x = *t;
+                       t->rn_info = b;
+                       p = t->rn_parent;
+#endif
+                       if (p->rn_left == t)
+                               p->rn_left = x;
+                       else
+                               p->rn_right = x;
+                       x->rn_left->rn_parent = x;
+                       x->rn_right->rn_parent = x;
+               }
+               goto out;
+       }
+       if (t->rn_left == tt)           /* x: the sibling of the leaf going away */
+               x = t->rn_right;
+       else
+               x = t->rn_left;
+       p = t->rn_parent;
+       if (p->rn_right == t)           /* splice the sibling in place of t */
+               p->rn_right = x;
+       else
+               p->rn_left = x;
+       x->rn_parent = p;
+       /*
+        * Demote routes attached to us.
+        */
+       if (t->rn_mklist) {
+               if (x->rn_bit >= 0) {
+                       for (mp = &x->rn_mklist; (m = *mp);)
+                               mp = &m->rm_mklist;
+                       *mp = t->rn_mklist;     /* append t's list to x's */
+               } else {
+                       /* If there are any key,mask pairs in a sibling
+                          duped-key chain, some subset will appear sorted
+                          in the same order attached to our mklist */
+                       for (m = t->rn_mklist; m && x; x = x->rn_dupedkey)
+                               if (m == x->rn_mklist) {
+                                       struct radix_mask *mm = m->rm_mklist;
+                                       x->rn_mklist = 0;
+                                       if (--(m->rm_refs) < 0)
+                                               MKFree(m);
+                                       m = mm;
+                               }
+                       if (m)
+                               log(LOG_ERR,
+                                   "rn_delete: Orphaned Mask %p at %p\n",
+                                   m, x);
+               }
+       }
+       /*
+        * We may be holding an active internal node in the tree.
+        */
+       x = tt + 1;
+       if (t != x) {                   /* tt's paired node is in use elsewhere */
+#ifndef RN_DEBUG
+               *t = *x;                /* relocate it into the now-free t */
+#else
+               b = t->rn_info;
+               *t = *x;
+               t->rn_info = b;
+#endif
+               t->rn_left->rn_parent = t;
+               t->rn_right->rn_parent = t;
+               p = x->rn_parent;
+               if (p->rn_left == x)
+                       p->rn_left = t;
+               else
+                       p->rn_right = t;
+       }
+out:
+       tt->rn_flags &= ~RNF_ACTIVE;    /* both nodes of the pair are now unused */
+       tt[1].rn_flags &= ~RNF_ACTIVE;
+       return (tt);
+}
+
+/*
+ * Same leaf walk as rn_walktree(), except for the parameters and the exit:
+ * it starts below the node picked by address a under mask m and stops on
+ * leaving that subtree.  A nonzero return from f ends the walk early.
+ */
+static int
+rn_walktree_from(h, a, m, f, w)
+       struct radix_node_head *h;
+       void *a, *m;
+       walktree_f_t *f;
+       void *w;                        /* passed unchanged to every f() call */
+{
+       int error;
+       struct radix_node *base, *next;
+       u_char *xa = (u_char *)a;
+       u_char *xm = (u_char *)m;
+       register struct radix_node *rn, *last = 0 /* shut up gcc */;
+       int stopping = 0;
+       int lastb;
+
+       /*
+        * rn_search_m is sort-of-open-coded here. We cannot use the
+        * function because we need to keep track of the last node seen.
+        */
+       /* printf("about to search\n"); */
+       for (rn = h->rnh_treetop; rn->rn_bit >= 0; ) {
+               last = rn;
+               /* printf("rn_bit %d, rn_bmask %x, xm[rn_offset] %x\n",
+                      rn->rn_bit, rn->rn_bmask, xm[rn->rn_offset]); */
+               if (!(rn->rn_bmask & xm[rn->rn_offset])) {
+                       break;          /* tested bit is outside the mask */
+               }
+               if (rn->rn_bmask & xa[rn->rn_offset]) {
+                       rn = rn->rn_right;
+               } else {
+                       rn = rn->rn_left;
+               }
+       }
+       /* printf("done searching\n"); */
+
+       /*
+        * Two cases: either we stepped off the end of our mask,
+        * in which case last == rn, or we reached a leaf, in which
+        * case we want to start from the last node we looked at.
+        * Either way, last is the node we want to start from.
+        */
+       rn = last;
+       lastb = rn->rn_bit;             /* climbing to rn_bit <= lastb ends the walk */
+
+       /* printf("rn %p, lastb %d\n", rn, lastb);*/
+
+       /*
+        * This gets complicated because we may delete the node
+        * while applying the function f to it, so we need to calculate
+        * the successor node in advance.
+        */
+       while (rn->rn_bit >= 0)         /* leftmost leaf below the start node */
+               rn = rn->rn_left;
+
+       while (!stopping) {
+               /* printf("node %p (%d)\n", rn, rn->rn_bit); */
+               base = rn;
+               /* If at right child go back up, otherwise, go right */
+               while (rn->rn_parent->rn_right == rn
+                      && !(rn->rn_flags & RNF_ROOT)) {
+                       rn = rn->rn_parent;
+
+                       /* if went up beyond last, stop */
+                       if (rn->rn_bit <= lastb) {
+                               stopping = 1;
+                               /* printf("up too far\n"); */
+                               /*
+                                * XXX we should jump to the 'Process leaves'
+                                * part, because the values of 'rn' and 'next'
+                                * we compute will not be used. Not a big deal
+                                * because this loop will terminate, but it is
+                                * inefficient and hard to understand!
+                                */
+                       }
+               }
+               
+               /* 
+                * At the top of the tree, no need to traverse the right
+                * half, prevent the traversal of the entire tree in the
+                * case of default route.
+                */
+               if (rn->rn_parent->rn_flags & RNF_ROOT)
+                       stopping = 1;
+
+               /* Find the next *leaf* since next node might vanish, too */
+               for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;)
+                       rn = rn->rn_left;
+               next = rn;
+               /* Process leaves */
+               while ((rn = base) != 0) {
+                       base = rn->rn_dupedkey; /* fetched before f() may free rn */
+                       /* printf("leaf %p\n", rn); */
+                       if (!(rn->rn_flags & RNF_ROOT)
+                           && (error = (*f)(rn, w)))
+                               return (error);
+               }
+               rn = next;
+
+               if (rn->rn_flags & RNF_ROOT) {
+                       /* printf("root, stopping"); */
+                       stopping = 1;
+               }
+
+       }
+       return 0;
+}
+
+static int     /* 0 after a full walk, else the first nonzero f() result */
+rn_walktree(h, f, w)
+       struct radix_node_head *h;
+       walktree_f_t *f;                /* called as (*f)(leaf, w) on non-root leaves */
+       void *w;                        /* passed unchanged to every f() call */
+{
+       int error;
+       struct radix_node *base, *next;
+       register struct radix_node *rn = h->rnh_treetop;
+       /*
+        * This gets complicated because we may delete the node
+        * while applying the function f to it, so we need to calculate
+        * the successor node in advance.
+        */
+
+       /* First time through node, go left */
+       while (rn->rn_bit >= 0)
+               rn = rn->rn_left;
+       for (;;) {
+               base = rn;              /* first leaf of the chain to process */
+               /* If at right child go back up, otherwise, go right */
+               while (rn->rn_parent->rn_right == rn
+                      && (rn->rn_flags & RNF_ROOT) == 0)
+                       rn = rn->rn_parent;
+               /* Find the next *leaf* since next node might vanish, too */
+               for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;)
+                       rn = rn->rn_left;
+               next = rn;
+               /* Process leaves */
+               while ((rn = base)) {
+                       base = rn->rn_dupedkey; /* fetched before f() may free rn */
+                       if (!(rn->rn_flags & RNF_ROOT)
+                           && (error = (*f)(rn, w)))
+                               return (error);
+               }
+               rn = next;
+               if (rn->rn_flags & RNF_ROOT)
+                       return (0);     /* successor is a root leaf: walk is done */
+       }
+       /* NOTREACHED */
+}
+
+/*
+ * Allocate and initialize an empty tree. This has 3 nodes, which are
+ * part of the radix_node_head (in the order <left,root,right>) and are
+ * marked RNF_ROOT so they cannot be freed.
+ * The leaves have all-zero and all-one keys, with significant
+ * bits starting at 'off'.
+ * Return 1 on success, 0 on error.
+ */
+int
+rn_inithead(head, off)
+       void **head;
+       int off;
+{
+       register struct radix_node_head *rnh;
+       register struct radix_node *t, *tt, *ttt;
+       if (*head)
+               return (1);
+       R_Zalloc(rnh, struct radix_node_head *, sizeof (*rnh));
+       if (rnh == 0)
+               return (0);
+#ifdef _KERNEL
+       RADIX_NODE_HEAD_LOCK_INIT(rnh);
+#endif
+       *head = rnh;
+       t = rn_newpair(rn_zeros, off, rnh->rnh_nodes);
+       ttt = rnh->rnh_nodes + 2;
+       t->rn_right = ttt;
+       t->rn_parent = t;
+       tt = t->rn_left;        /* ... which in turn is rnh->rnh_nodes */
+       tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE;
+       tt->rn_bit = -1 - off;
+       *ttt = *tt;
+       ttt->rn_key = rn_ones;
+       rnh->rnh_addaddr = rn_addroute;
+       rnh->rnh_deladdr = rn_delete;
+       rnh->rnh_matchaddr = rn_match;
+       rnh->rnh_lookup = rn_lookup;
+       rnh->rnh_walktree = rn_walktree;
+       rnh->rnh_walktree_from = rn_walktree_from;
+       rnh->rnh_treetop = t;
+       return (1);
+}
+
+int
+rn_detachhead(void **head)
+{
+       struct radix_node_head *victim;
+
+       /* Catch a detach of a head that is not (or no longer) set. */
+       KASSERT((head != NULL && *head != NULL),
+           ("%s: head already freed", __func__));
+
+       /* The <left,root,right> nodes are freed together with the head. */
+       victim = *head;
+       Free(victim);
+       *head = NULL;
+       return (1);
+}
+
+void
+rn_init(int maxk)
+{
+       char *fill, *stop;
+
+       max_keylen = maxk;
+       if (max_keylen == 0) {
+               log(LOG_ERR,
+                   "rn_init: radix functions require max_keylen be set\n");
+               return;
+       }
+       R_Malloc(rn_zeros, char *, 3 * max_keylen); /* zeros|ones|addmask_key */
+       if (rn_zeros == NULL)
+               panic("rn_init");
+       bzero(rn_zeros, 3 * max_keylen);
+       rn_ones = rn_zeros + max_keylen;        /* second slice of the buffer */
+       addmask_key = rn_ones + max_keylen;     /* third slice, stays zeroed */
+       for (fill = rn_ones, stop = addmask_key; fill < stop; fill++)
+               *fill = -1;                     /* set every bit of rn_ones */
+       if (rn_inithead((void **)(void *)&mask_rnhead, 0) == 0)
+               panic("rn_init 2");
+}
diff --git a/sys/net/radix.h b/sys/net/radix.h
new file mode 100644 (file)
index 0000000..4102c99
--- /dev/null
@@ -0,0 +1,181 @@
+/*-
+ * Copyright (c) 1988, 1989, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)radix.h     8.2 (Berkeley) 10/31/94
+ * $FreeBSD: head/sys/net/radix.h 185747 2008-12-07 21:15:43Z kmacy $
+ */
+
+#ifndef _RADIX_H_
+#define        _RADIX_H_
+
+#ifdef _KERNEL
+#include <sys/_lock.h>
+#include <sys/_mutex.h>
+#include <sys/_rwlock.h>
+#endif
+
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_RTABLE);
+#endif
+
+/*
+ * Radix search tree node layout.
+ *
+ * The same structure is used for leaves and for internal nodes; the
+ * rn_u union holds the leaf-only fields (rn_leaf) or the internal-node
+ * fields (rn_node).  The lower-case rn_* macros defined after the
+ * structure hide the union path.
+ */
+
+struct radix_node {
+       struct  radix_mask *rn_mklist;  /* list of masks contained in subtree */
+       struct  radix_node *rn_parent;  /* parent */
+       short   rn_bit;                 /* bit offset; -1-index(netmask) */
+       char    rn_bmask;               /* node: mask for bit test */
+       u_char  rn_flags;               /* RNF_* bits, enumerated next */
+#define RNF_NORMAL     1               /* leaf contains normal route */
+#define RNF_ROOT       2               /* leaf is root leaf for tree */
+#define RNF_ACTIVE     4               /* This node is alive (for rtfree) */
+       union {
+               struct {                        /* leaf only data: */
+                       caddr_t rn_Key;         /* object of search */
+                       caddr_t rn_Mask;        /* netmask, if present */
+                       struct  radix_node *rn_Dupedkey; /* see rn_dupedkey */
+               } rn_leaf;
+               struct {                        /* node only data: */
+                       int     rn_Off;         /* where to start compare */
+                       struct  radix_node *rn_L;/* progeny, see rn_left */
+                       struct  radix_node *rn_R;/* progeny, see rn_right */
+               } rn_node;
+       }               rn_u;
+#ifdef RN_DEBUG
+       /* Extra fields compiled in only when RN_DEBUG is defined. */
+       int rn_info;
+       struct radix_node *rn_twin;
+       struct radix_node *rn_ybro;
+#endif
+};
+
+/* Shorthand accessors for the members of rn_u. */
+#define        rn_dupedkey     rn_u.rn_leaf.rn_Dupedkey
+#define        rn_key          rn_u.rn_leaf.rn_Key
+#define        rn_mask         rn_u.rn_leaf.rn_Mask
+#define        rn_offset       rn_u.rn_node.rn_Off
+#define        rn_left         rn_u.rn_node.rn_L
+#define        rn_right        rn_u.rn_node.rn_R
+
+/*
+ * Annotations to tree concerning potential routes applying to subtrees.
+ *
+ * Entries are chained through rm_mklist.  rm_rmu holds either the mask
+ * itself (rm_mask) or, for normal routes, a pointer to the leaf
+ * (rm_leaf).
+ */
+
+struct radix_mask {
+       short   rm_bit;                 /* bit offset; -1-index(netmask) */
+       char    rm_unused;              /* cf. rn_bmask */
+       u_char  rm_flags;               /* cf. rn_flags */
+       struct  radix_mask *rm_mklist;  /* more masks to try */
+       union   {
+               caddr_t rmu_mask;               /* the mask */
+               struct  radix_node *rmu_leaf;   /* for normal routes */
+       }       rm_rmu;
+       int     rm_refs;                /* # of references to this struct */
+};
+
+/* Shorthand accessors for the members of rm_rmu. */
+#define        rm_mask rm_rmu.rmu_mask
+#define        rm_leaf rm_rmu.rmu_leaf         /* extra field would make 32 bytes */
+
+/*
+ * Callback type handed to rnh_walktree / rnh_walktree_from together
+ * with an opaque argument.  Presumably invoked once per visited node
+ * with that argument -- confirm against the walker implementations.
+ */
+typedef int walktree_f_t(struct radix_node *, void *);
+
+/*
+ * Head of one radix tree: the root pointer, a few attributes, the
+ * method table used to operate on the tree, and the three nodes that
+ * make up an empty tree (rnh_nodes).  In kernel builds it also carries
+ * the lock protecting the tree.
+ */
+struct radix_node_head {
+       struct  radix_node *rnh_treetop;
+       u_int   rnh_gen;                /* generation counter */
+       int     rnh_multipath;          /* multipath capable ? */
+       int     rnh_addrsize;           /* permit, but not require fixed keys */
+       int     rnh_pktsize;            /* permit, but not require fixed keys */
+       struct  radix_node *(*rnh_addaddr)      /* add based on sockaddr */
+               (void *v, void *mask,
+                    struct radix_node_head *head, struct radix_node nodes[]);
+       struct  radix_node *(*rnh_addpkt)       /* add based on packet hdr */
+               (void *v, void *mask,
+                    struct radix_node_head *head, struct radix_node nodes[]);
+       struct  radix_node *(*rnh_deladdr)      /* remove based on sockaddr */
+               (void *v, void *mask, struct radix_node_head *head);
+       struct  radix_node *(*rnh_delpkt)       /* remove based on packet hdr */
+               (void *v, void *mask, struct radix_node_head *head);
+       struct  radix_node *(*rnh_matchaddr)    /* locate based on sockaddr */
+               (void *v, struct radix_node_head *head);
+       struct  radix_node *(*rnh_lookup)       /* locate based on sockaddr and mask */
+               (void *v, void *mask, struct radix_node_head *head);
+       struct  radix_node *(*rnh_matchpkt)     /* locate based on packet hdr */
+               (void *v, struct radix_node_head *head);
+       int     (*rnh_walktree)                 /* traverse tree */
+               (struct radix_node_head *head, walktree_f_t *f, void *w);
+       int     (*rnh_walktree_from)            /* traverse tree below a */
+               (struct radix_node_head *head, void *a, void *m,
+                    walktree_f_t *f, void *w);
+       void    (*rnh_close)    /* do something when the last ref drops */
+               (struct radix_node *rn, struct radix_node_head *head);
+       struct  radix_node rnh_nodes[3];        /* empty tree for common case */
+#ifdef _KERNEL
+#if defined( __linux__ ) || defined( _WIN32 )
+        spinlock_t rnh_lock;                    /* lock type used on Linux/Windows */
+#else
+       struct  rwlock rnh_lock;                /* locks entire radix tree */
+#endif /* __linux__ || _WIN32 */
+#endif
+};
+
+/*
+ * Allocation wrappers.
+ *
+ * R_Malloc(p, t, n) and R_Zalloc(p, t, n) allocate n bytes, cast the
+ * result to type t and assign it to p; R_Zalloc returns zero-filled
+ * memory.  Userland builds map onto malloc/calloc/free, kernel builds
+ * onto malloc/free with the M_RTABLE type and M_NOWAIT.  The callers
+ * check p against NULL after allocating.
+ *
+ * The pointer argument is parenthesized in every expansion so that
+ * expressions such as Free(base + off) are cast as a whole.
+ *
+ * Free() keeps its historical trailing semicolon for compatibility:
+ * "Free(x);" expands to the call followed by an empty statement, so it
+ * must not be used as the unbraced body of an "if" that has an "else".
+ */
+#ifndef _KERNEL
+#define R_Malloc(p, t, n) ((p) = (t) malloc((unsigned int)(n)))
+#define R_Zalloc(p, t, n) ((p) = (t) calloc(1,(unsigned int)(n)))
+#define Free(p) free((char *)(p));
+#else
+#define R_Malloc(p, t, n) ((p) = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT))
+#define R_Zalloc(p, t, n) ((p) = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT | M_ZERO))
+#define Free(p) free((caddr_t)(p), M_RTABLE);
+
+/*
+ * Wrappers around the rw_*() lock primitives applied to the rnh_lock
+ * member of a radix_node_head.  LOCK/UNLOCK take and release the lock
+ * for writing, RLOCK/RUNLOCK for reading.
+ */
+#define        RADIX_NODE_HEAD_LOCK_INIT(rnh)  \
+    rw_init_flags(&(rnh)->rnh_lock, "radix node head", 0)
+#define        RADIX_NODE_HEAD_LOCK(rnh)       rw_wlock(&(rnh)->rnh_lock)
+#define        RADIX_NODE_HEAD_UNLOCK(rnh)     rw_wunlock(&(rnh)->rnh_lock)
+#define        RADIX_NODE_HEAD_RLOCK(rnh)      rw_rlock(&(rnh)->rnh_lock)
+#define        RADIX_NODE_HEAD_RUNLOCK(rnh)    rw_runlock(&(rnh)->rnh_lock)
+#define        RADIX_NODE_HEAD_LOCK_TRY_UPGRADE(rnh)   rw_try_upgrade(&(rnh)->rnh_lock)
+
+
+#define        RADIX_NODE_HEAD_DESTROY(rnh)    rw_destroy(&(rnh)->rnh_lock)
+#define        RADIX_NODE_HEAD_LOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_LOCKED)
+#define        RADIX_NODE_HEAD_WLOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_WLOCKED)
+#endif /* _KERNEL */
+
+/*
+ * Entry points of the radix code.
+ *
+ * rn_init() takes the maximum key length; rn_inithead() and
+ * rn_detachhead() create and release a tree head through a void **
+ * handle.  The functions returning struct radix_node * share one
+ * declaration below.
+ */
+void    rn_init(int);
+int     rn_inithead(void **, int);
+int     rn_detachhead(void **);
+int     rn_refines(void *, void *);
+struct radix_node
+        *rn_addmask(void *, int, int),
+        *rn_addroute (void *, void *, struct radix_node_head *,
+                       struct radix_node [2]),
+        *rn_delete(void *, void *, struct radix_node_head *),
+        *rn_lookup (void *v_arg, void *m_arg,
+                       struct radix_node_head *head),
+        *rn_match(void *, struct radix_node_head *);
+
+#endif /* _RADIX_H_ */
diff --git a/sys/netgraph/ng_ipfw.h b/sys/netgraph/ng_ipfw.h
new file mode 100644 (file)
index 0000000..de74d4e
--- /dev/null
@@ -0,0 +1,33 @@
+/*-
+ * Copyright 2005, Gleb Smirnoff <glebius@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/netgraph/ng_ipfw.h,v 1.2 2006/02/17 09:42:49 glebius Exp $
+ */
+
+#ifndef _NG_IPFW_H
+#define _NG_IPFW_H
+/*
+ * Node type name and message cookie for the ipfw netgraph node.
+ * NOTE(review): presumably these must match the values used by the
+ * netgraph side -- confirm before changing either one.
+ */
+#define NG_IPFW_NODE_TYPE    "ipfw"
+#define NGM_IPFW_COOKIE      1105988990
+#endif /* _NG_IPFW_H */
diff --git a/sys/netinet/in_cksum.c b/sys/netinet/in_cksum.c
new file mode 100644 (file)
index 0000000..d6acf87
--- /dev/null
@@ -0,0 +1,146 @@
+/*-
+ * Copyright (c) 1988, 1992, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)in_cksum.c  8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/netinet/in_cksum.c,v 1.10 2007/10/07 20:44:22 silby Exp $");
+
+#include <sys/param.h>
+#include <sys/mbuf.h>
+
+/*
+ * Checksum routine for Internet Protocol family headers (Portable Version).
+ *
+ * This routine is very heavily used in the network
+ * code and should be modified for each CPU to be as fast as possible.
+ */
+
+/* Fold a value that overflowed 16 bits back into 16 bits (end-around carry). */
+#define ADDCARRY(x)  ((x) > 65535 ? (x) -= 65535 : (x))
+/* Add the two 16-bit halves of the 32-bit running sum, then fold the carry. */
+#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);}
+
+/*
+ * Compute the 16-bit one's-complement Internet checksum over the first
+ * 'len' bytes of the mbuf chain 'm'.
+ *
+ * Returns the complemented sum in the low 16 bits.  If the chain holds
+ * fewer than 'len' bytes, "cksum: out of data" is printed and the
+ * checksum of the available bytes is returned.
+ *
+ * State carried between mbufs:
+ *   mlen == -1    the previous mbuf ended on an odd byte, saved in
+ *                 s_util.c[0], waiting for its partner byte;
+ *   byte_swapped  the data pointer was odd, so the running sum was
+ *                 shifted by one byte and must be shifted back.
+ */
+int
+in_cksum(struct mbuf *m, int len)
+{
+       register u_short *w;
+       register int sum = 0;
+       register int mlen = 0;
+       int byte_swapped = 0;
+
+       union {
+               char    c[2];
+               u_short s;
+       } s_util;
+       /*
+        * The overlay must be exactly two u_shorts wide.  With a 'long'
+        * member, s[0] and s[1] alias the upper (always zero) half of the
+        * value on big-endian LP64 machines and REDUCE would discard the
+        * sum.
+        */
+       union {
+               u_short s[2];
+               uint32_t l;
+       } l_util;
+
+       for (;m && len; m = m->m_next) {
+               if (m->m_len == 0)
+                       continue;
+               w = mtod(m, u_short *);
+               if (mlen == -1) {
+                       /*
+                        * The first byte of this mbuf is the continuation
+                        * of a word spanning between this mbuf and the
+                        * last mbuf.
+                        *
+                        * s_util.c[0] is already saved when scanning previous
+                        * mbuf.
+                        */
+                       s_util.c[1] = *(char *)w;
+                       sum += s_util.s;
+                       w = (u_short *)((char *)w + 1);
+                       mlen = m->m_len - 1;
+                       len--;
+               } else
+                       mlen = m->m_len;
+               /* Never sum past the requested length. */
+               if (len < mlen)
+                       mlen = len;
+               len -= mlen;
+               /*
+                * Force to even boundary.
+                */
+               if ((1 & (uintptr_t) w) && (mlen > 0)) {
+                       REDUCE;
+                       sum <<= 8;
+                       s_util.c[0] = *(u_char *)w;
+                       w = (u_short *)((char *)w + 1);
+                       mlen--;
+                       byte_swapped = 1;
+               }
+               /*
+                * Unroll the loop to make overhead from
+                * branches &c small.
+                */
+               while ((mlen -= 32) >= 0) {
+                       sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
+                       sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
+                       sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
+                       sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
+                       w += 16;
+               }
+               mlen += 32;
+               while ((mlen -= 8) >= 0) {
+                       sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
+                       w += 4;
+               }
+               mlen += 8;
+               if (mlen == 0 && byte_swapped == 0)
+                       continue;
+               REDUCE;
+               /* Remaining whole 16-bit words (at most three). */
+               while ((mlen -= 2) >= 0) {
+                       sum += *w++;
+               }
+               if (byte_swapped) {
+                       /* Undo the one-byte shift applied at the odd start. */
+                       REDUCE;
+                       sum <<= 8;
+                       byte_swapped = 0;
+                       if (mlen == -1) {
+                               s_util.c[1] = *(char *)w;
+                               sum += s_util.s;
+                               mlen = 0;
+                       } else
+                               mlen = -1;
+               } else if (mlen == -1)
+                       s_util.c[0] = *(char *)w;
+       }
+       if (len)
+               printf("cksum: out of data\n");
+       if (mlen == -1) {
+               /* The last mbuf has odd # of bytes. Follow the
+                  standard (the odd byte may be shifted left by 8 bits
+                  or not as determined by endian-ness of the machine) */
+               s_util.c[1] = 0;
+               sum += s_util.s;
+       }
+       REDUCE;
+       return (~sum & 0xffff);
+}
diff --git a/sys/netinet/ip.h b/sys/netinet/ip.h
new file mode 100644 (file)
index 0000000..c9da4d8
--- /dev/null
@@ -0,0 +1,49 @@
+#ifndef _NETINET_IP_H_
+#define _NETINET_IP_H_
+
+/*
+ * Local byte-order definitions.  BYTE_ORDER is selected from the
+ * platform macros __BIG_ENDIAN and __LITTLE_ENDIAN, tested in that
+ * order; compilation stops if neither is defined.
+ * NOTE(review): if an environment defines both macros, BIG_ENDIAN is
+ * selected regardless of the real byte order -- confirm that every
+ * build target defines exactly one of them.
+ */
+#define LITTLE_ENDIAN   1234
+#define BIG_ENDIAN      4321
+#if defined(__BIG_ENDIAN)
+#define BYTE_ORDER      BIG_ENDIAN
+//#warning we are in bigendian
+#elif defined(__LITTLE_ENDIAN)
+//#warning we are in littleendian
+#define BYTE_ORDER      LITTLE_ENDIAN
+#else
+#error no platform
+#endif
+
+/* XXX endianness doesn't belong here */
+// #define LITTLE_ENDIAN   1234
+// #define BIG_ENDIAN      4321
+// #define BYTE_ORDER      LITTLE_ENDIAN
+
+/*
+ * Structure of an internet header, naked of options.
+ *
+ * The first byte holds two 4-bit fields; their declaration order is
+ * chosen from BYTE_ORDER so that ip_v and ip_hl name the intended
+ * halves of that byte.  The structure is declared packed and 4-byte
+ * aligned.  The IP_* constants are the flag bits and offset mask of
+ * the ip_off field.
+ */
+struct ip {
+#if BYTE_ORDER == LITTLE_ENDIAN
+        u_char  ip_hl:4,                /* header length */
+                ip_v:4;                 /* version */
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+        u_char  ip_v:4,                 /* version */
+                ip_hl:4;                /* header length */
+#endif
+        u_char  ip_tos;                 /* type of service */
+        u_short ip_len;                 /* total length */
+        u_short ip_id;                  /* identification */
+        u_short ip_off;                 /* fragment offset field */
+#define IP_RF 0x8000                    /* reserved fragment flag */
+#define IP_DF 0x4000                    /* dont fragment flag */
+#define IP_MF 0x2000                    /* more fragments flag */
+#define IP_OFFMASK 0x1fff               /* mask for fragmenting bits */
+        u_char  ip_ttl;                 /* time to live */
+        u_char  ip_p;                   /* protocol */
+        u_short ip_sum;                 /* checksum */
+        struct  in_addr ip_src,ip_dst;  /* source and dest address */
+} __packed __aligned(4);
+
+#define        IPTOS_LOWDELAY          0x10
+
+#endif /* _NETINET_IP_H_ */
diff --git a/sys/netinet/ip6.h b/sys/netinet/ip6.h
new file mode 100644 (file)
index 0000000..88b42a4
--- /dev/null
@@ -0,0 +1,58 @@
+#ifndef _NETINET_IP6_H_
+#define _NETINET_IP6_H_
+/*
+ * True when the two struct in6_addr objects pointed to by a and b hold
+ * the same bytes (memcmp over sizeof(struct in6_addr)).
+ */
+#define IN6_ARE_ADDR_EQUAL(a, b)                        \
+(memcmp(&(a)->s6_addr[0], &(b)->s6_addr[0], sizeof(struct in6_addr)) == 0)
+
+/*
+ * IPv6 header.  ip6_ctlun overlays two views of the leading bytes: the
+ * ip6_un1 record (flow, payload length, next header, hop limit) and
+ * ip6_un2_vfc, which aliases the first byte.  Source and destination
+ * addresses follow.  ip6_nxt and ip6_flow are shorthand for the
+ * corresponding ip6_un1 members.
+ */
+struct ip6_hdr {
+        union {
+                struct ip6_hdrctl {
+                        u_int32_t ip6_un1_flow; /* 20 bits of flow-ID */  
+                        u_int16_t ip6_un1_plen; /* payload length */
+                        u_int8_t  ip6_un1_nxt;  /* next header */
+                        u_int8_t  ip6_un1_hlim; /* hop limit */
+                } ip6_un1;
+                u_int8_t ip6_un2_vfc;   /* 4 bits version, top 4 bits class */
+        } ip6_ctlun;
+        struct in6_addr ip6_src;        /* source address */
+        struct in6_addr ip6_dst;        /* destination address */
+};
+#define ip6_nxt         ip6_ctlun.ip6_un1.ip6_un1_nxt
+#define ip6_flow        ip6_ctlun.ip6_un1.ip6_un1_flow
+
+
+/*
+ * ICMPv6 header: type, code and checksum, followed by four
+ * type-specific bytes that can be accessed as one 32-bit, two 16-bit
+ * or four 8-bit values through icmp6_dataun.
+ */
+struct icmp6_hdr {
+        u_int8_t        icmp6_type;     /* type field */
+        u_int8_t        icmp6_code;     /* code field */
+        u_int16_t       icmp6_cksum;    /* checksum field */
+        union {
+                u_int32_t       icmp6_un_data32[1]; /* type-specific field */
+                u_int16_t       icmp6_un_data16[2]; /* type-specific field */
+                u_int8_t        icmp6_un_data8[4];  /* type-specific field */
+        } icmp6_dataun;
+};
+
+/* Fixed leading part of the IPv6 hop-by-hop options header. */
+struct ip6_hbh {
+        u_int8_t ip6h_nxt;      /* next header */
+        u_int8_t ip6h_len;      /* length in units of 8 octets */
+        /* followed by options */
+}; 
+/* Fixed leading part of the IPv6 routing header. */
+struct ip6_rthdr {
+        u_int8_t  ip6r_nxt;     /* next header */
+        u_int8_t  ip6r_len;     /* length in units of 8 octets */
+        u_int8_t  ip6r_type;    /* routing type */
+        u_int8_t  ip6r_segleft; /* segments left */
+        /* followed by routing type specific data */
+};
+/*
+ * IPv6 fragment header, with the masks that split ip6f_offlg into the
+ * fragment offset and the more-fragments flag.
+ * NOTE(review): the two masks are fixed values that match the field
+ * only when it is read in big-endian (network) order.  Confirm that
+ * users on little-endian hosts convert ip6f_offlg with ntohs() before
+ * masking, or that byte-swapped masks are provided for those hosts.
+ */
+struct ip6_frag {
+        u_int8_t  ip6f_nxt;             /* next header */
+        u_int8_t  ip6f_reserved;        /* reserved field */
+        u_int16_t ip6f_offlg;           /* offset, reserved, and flag */
+        u_int32_t ip6f_ident;           /* identification */
+};
+#define IP6F_OFF_MASK           0xfff8  /* mask out offset from _offlg */
+#define IP6F_MORE_FRAG          0x0001  /* more-fragments flag */
+/*
+ * Two-byte prefix of an IPv6 extension header, with the same shape as
+ * the first two fields of ip6_hbh and ip6_rthdr.
+ */
+struct  ip6_ext {
+        u_int8_t ip6e_nxt;      /* cf. ip6h_nxt */
+        u_int8_t ip6e_len;      /* cf. ip6h_len */
+};
+#endif /* _NETINET_IP6_H_ */
diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h
new file mode 100644 (file)
index 0000000..eab28f6
--- /dev/null
@@ -0,0 +1,263 @@
+/*-
+ * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa
+ * Portions Copyright (c) 2000 Akamba Corp.
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: user/luigi/ipfw3-head/sys/netinet/ip_dummynet.h 203321 2010-01-31 21:39:25Z luigi $
+ */
+
+#ifndef _IP_DUMMYNET_H
+#define _IP_DUMMYNET_H
+
+/*
+ * Definition of the kernel-userland API for dummynet.
+ *
+ * Setsockopt() and getsockopt() pass a batch of objects, each
+ * of them starting with a "struct dn_id" which should fully identify
+ * the object and its relation with others in the sequence.
+ * The first object in each request should have
+ *      type= DN_CMD_*, id = DN_API_VERSION.
+ * For other objects, type and subtype specify the object, len indicates
+ * the total length including the header, and 'id' identifies the specific
+ * object.
+ *
+ * Most objects are numbered with an identifier in the range 1..65535.
+ * DN_MAX_ID indicates the first value outside the range.
+ */
+
+#define        DN_API_VERSION  12500000
+#define        DN_MAX_ID       0x10000
+
+/*
+ * Header that starts every object exchanged with the kernel.
+ * Because len is 16 bits wide, a single object (header included)
+ * cannot exceed 65535 bytes.
+ */
+struct dn_id {
+       uint16_t        len;    /* total obj len including this header */
+       uint8_t         type;   /* one of the DN_* object/command types */
+       uint8_t         subtype;
+       uint32_t        id;     /* generic id */
+};
+
+/*
+ * These values are in the type field of struct dn_id.
+ * To preserve the ABI, never rearrange the list or delete
+ * entries with the exception of DN_LAST
+ *
+ * Object types count up from 1, command types from 0x80; the implicit
+ * values are spelled out in the comments because they are part of the
+ * ABI.
+ */
+enum {
+       DN_NONE = 0,
+       DN_LINK = 1,
+       DN_FS,                  /* 2 */
+       DN_SCH,                 /* 3 */
+       DN_SCH_I,               /* 4 */
+       DN_QUEUE,               /* 5 */
+       DN_DELAY_LINE,          /* 6 */
+       DN_PROFILE,             /* 7 */
+       DN_FLOW,                /* 8, struct dn_flow */
+       DN_TEXT,                /* 9, opaque text is the object */
+
+       DN_CMD_CONFIG = 0x80,   /* objects follow */
+       DN_CMD_DELETE,          /* 0x81, subtype + list of entries */
+       DN_CMD_GET,             /* 0x82, subtype + list of entries */
+       DN_CMD_FLUSH,           /* 0x83 */
+       /* for compatibility with FreeBSD 7.2/8 */
+       DN_COMPAT_PIPE,         /* 0x84 */
+       DN_COMPAT_QUEUE,        /* 0x85 */
+       DN_GET_COMPAT,          /* 0x86 */
+
+       /* special commands for emulation of sysctl variables */
+       DN_SYSCTL_GET,          /* 0x87 */
+       DN_SYSCTL_SET,          /* 0x88 */
+
+       DN_LAST,                /* 0x89, end marker, may move */
+};
+
+/*
+ * Only the built-in values are listed here; further scheduler
+ * subtypes are defined by the individual scheduler modules.
+ */
+enum { /* subtype for schedulers, flowset and the like */
+       DN_SCHED_UNKNOWN = 0,
+       DN_SCHED_FIFO = 1,
+       DN_SCHED_WF2QP = 2,
+       /* others are in individual modules */
+};
+
+/* Bit flags; each value is a distinct bit so they can be OR-ed together. */
+enum { /* user flags */
+       DN_HAVE_MASK    = 0x0001,       /* fs or sched has a mask */
+       DN_NOERROR      = 0x0002,       /* do not report errors */
+       DN_QHT_HASH     = 0x0004,       /* qht is a hash table */
+       DN_QSIZE_BYTES  = 0x0008,       /* queue size is in bytes */
+       DN_HAS_PROFILE  = 0x0010,       /* a link has a profile */
+       DN_IS_RED       = 0x0020,
+       DN_IS_GENTLE_RED= 0x0040,
+       DN_PIPE_CMD     = 0x1000,       /* pipe config... */
+};
+
+/*
+ * link template.
+ *
+ * The units of bandwidth and delay depend on which side of the API
+ * holds the structure; see the comment inside.
+ */
+struct dn_link {
+       struct dn_id oid;
+
+       /*
+        * Userland sets bw and delay in bits/s and milliseconds.
+        * The kernel converts this back and forth to bits/tick and ticks.
+        * XXX what about burst ?
+        */
+       int32_t         link_nr;
+       int             bandwidth;      /* bit/s or bits/tick.   */
+       int             delay;          /* ms and ticks */
+       uint64_t        burst;          /* scaled. bits*Hz  XXX */
+};
+
+/*
+ * A flowset, which is a template for flows. Contains parameters
+ * from the command line: id, target scheduler, queue sizes, plr,
+ * flow masks, buckets for the flow hash, and possibly scheduler-
+ * specific parameters (weight, quantum and so on).
+ */
+struct dn_fs {
+       struct dn_id oid;
+       uint32_t fs_nr;         /* the flowset number */
+       uint32_t flags;         /* userland flags */
+       int qsize;              /* queue size in slots or bytes */
+       int32_t plr;            /* PLR, pkt loss rate (2^31-1 means 100%) */
+       uint32_t buckets;       /* buckets used for the queue hash table */
+
+       struct ipfw_flow_id flow_mask;
+       uint32_t sched_nr;      /* the scheduler we attach to */
+       /* generic scheduler parameters. Leave them at -1 if unset.
+        * Now we use 0: weight, 1: lmax, 2: priority
+        */
+       int par[4];
+
+       /* RED/GRED parameters.
+        * weight and probabilities are in the range 0..1 represented
+        * in fixed point arithmetic with SCALE_RED fractional bits.
+        *
+        * SCALE(x) converts an integer to that fixed-point form,
+        * SCALE_VAL(x) drops the fractional bits again, and
+        * SCALE_MUL(x,y) multiplies two scaled values.
+        */
+#define SCALE_RED      16
+#define SCALE(x)       ( (x) << SCALE_RED )
+#define SCALE_VAL(x)   ( (x) >> SCALE_RED )
+#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED )
+       int w_q ;               /* queue weight (scaled) */
+       int max_th ;            /* maximum threshold for queue (scaled) */
+       int min_th ;            /* minimum threshold for queue (scaled) */
+       int max_p ;             /* maximum value for p_b (scaled) */
+
+};
+
+/*
+ * dn_flow collects the flow_id and statistics of queues and scheduler
+ * instances, and is used to pass this information to userland.
+ * oid.type/oid.subtype describe the object, oid.id is the number
+ * of the parent object.
+ */
+struct dn_flow {
+       struct dn_id    oid;
+       struct ipfw_flow_id fid;
+       uint64_t        tot_pkts; /* statistics counters  */
+       uint64_t        tot_bytes;
+       uint32_t        length; /* Queue length, in packets */
+       uint32_t        len_bytes; /* Queue length, in bytes */
+       uint32_t        drops;
+};
+
+
+/*
+ * Scheduler template, mostly indicating the name, number,
+ * sched_mask and buckets.
+ */
+struct dn_sch {
+       struct dn_id    oid;
+       uint32_t        sched_nr; /* N, scheduler number */
+       uint32_t        buckets; /* number of buckets for the instances */
+       uint32_t        flags;  /* have_mask, ... */
+
+       char name[16];  /* null terminated, so at most 15 characters */
+       /* mask to select the appropriate scheduler instance */
+       struct ipfw_flow_id sched_mask; /* M */
+};
+
+
+/* A delay profile is attached to a link.
+ * Note that a profile, as any other object, must fit in the 16-bit
+ * length field of its dn_id header, i.e. it cannot reach 2^16 bytes.
+ */
+#define        ED_MAX_SAMPLES_NO       1024
+struct dn_profile {
+       struct dn_id    oid;
+       /* fields to simulate a delay profile */
+#define ED_MAX_NAME_LEN                32
+       char    name[ED_MAX_NAME_LEN];
+       int     link_nr;
+       int     loss_level;
+       int     bandwidth;                      // XXX use link bandwidth?
+       int     samples_no;                     /* actual len of samples[] */
+       int     samples[0];                     /* trailing array; may be shorter */
+};
+
+
+
+/*
+ * Overall structure of dummynet
+
+In dummynet, packets are selected with the firewall rules, and passed
+to two different objects: PIPE or QUEUE (bad name).
+
+A QUEUE defines a classifier, which groups packets into flows
+according to a 'mask', puts them into independent queues (one
+per flow) with configurable size and queue management policy,
+and passes flows to a scheduler:
+
+                  (flow_mask|sched_mask)  sched_mask
+         +---------+   weight Wx  +-------------+
+         |         |->-[flow]-->--|             |-+
+    -->--| QUEUE x |   ...        |             | |
+         |         |->-[flow]-->--| SCHEDuler N | |
+         +---------+              |             | |
+             ...                  |             +--[LINK N]-->--
+         +---------+   weight Wy  |             | +--[LINK N]-->--
+         |         |->-[flow]-->--|             | |
+    -->--| QUEUE y |   ...        |             | |
+         |         |->-[flow]-->--|             | |
+         +---------+              +-------------+ |
+                                    +-------------+
+
+Many QUEUE objects can connect to the same scheduler, each
+QUEUE object can have its own set of parameters.
+
+In turn, the SCHEDuler 'forks' multiple instances according
+to a 'sched_mask', each instance manages its own set of queues
+and transmits on a private instance of a configurable LINK.
+
+A PIPE is a simplified version of the above, where there
+is no flow_mask, and each scheduler instance handles a single queue.
+
+The following data structures (visible from userland) describe
+the objects used by dummynet:
+
+ + dn_link, contains the main configuration parameters related
+   to delay and bandwidth;
+ + dn_profile describes a delay profile;
+ + dn_flow describes the flow status (flow id, statistics)
+   
+ + dn_sch describes a scheduler
+ + dn_fs describes a flowset (mask, weight, queue parameters)
+
+ *
+ */
+
+#endif /* _IP_DUMMYNET_H */
diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h
new file mode 100644 (file)
index 0000000..9bfe775
--- /dev/null
@@ -0,0 +1,646 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: user/luigi/ipfw3-head/sys/netinet/ip_fw.h 202072 2010-01-11 10:12:35Z luigi $
+ */
+
+#ifndef _IPFW2_H
+#define _IPFW2_H
+
+/*
+ * The default rule number.  By the design of ip_fw, the default rule
+ * is the last one, so its number can also serve as the highest number
+ * allowed for a rule.  The ip_fw code relies on both meanings of this
+ * constant. 
+ */
+#define        IPFW_DEFAULT_RULE       65535
+
+/*
+ * The number of ipfw tables.  The maximum allowed table number is
+ * (IPFW_TABLES_MAX - 1).
+ */
+#define        IPFW_TABLES_MAX         128
+
+/*
+ * Most commands (queue, pipe, tag, untag, limit...) can have a 16-bit
+ * argument between 1 and 65534. The value 0 is unused, the value
+ * 65535 (IP_FW_TABLEARG) is used to represent 'tablearg', i.e. the
+ * result of the most recent table() lookup. A valid argument is thus
+ * 1..65534, or 65535 to indicate the use of 'tablearg'.
+ * Note that 16bit is only a historical limit, resulting from
+ * the use of 16-bit fields for that value. In reality, we can have
+ * 2^32 pipes, queues, tag values and so on, and use 0 as a tablearg.
+ */
+#define        IPFW_ARG_MIN            1
+#define        IPFW_ARG_MAX            65534
+#define IP_FW_TABLEARG         65535   /* XXX should use 0 */
+
+/*
+ * Number of entries in the call stack of the call/return commands.
+ * The call stack is currently a uint16_t array of rule numbers.
+ */
+#define        IPFW_CALLSTACK_SIZE     16
+
+/* IP_FW3 operation header, 8 bytes; embedded at the start of ipfw_xtable */
+typedef struct _ip_fw3_opheader {
+       uint16_t opcode;        /* Operation opcode */
+       uint16_t reserved[3];   /* Pad header to 64 bits (4 x uint16_t) */
+} ip_fw3_opheader;
+
+
+/* IPFW extended tables support XXX what namespace ? */
+#define        IP_FW_TABLE_XADD        86      /* add entry */
+#define        IP_FW_TABLE_XDEL        87      /* delete entry */
+#define        IP_FW_TABLE_XGETSIZE    88      /* get table size */
+#define        IP_FW_TABLE_XLIST       89      /* list table contents */
+
+/*
+ * The kernel representation of ipfw rules is made of a list of
+ * 'instructions' (for all practical purposes equivalent to BPF
+ * instructions), which specify which fields of the packet
+ * (or its metadata) should be analysed.
+ *
+ * Each instruction is stored in a structure which begins with
+ * "ipfw_insn", and can contain extra fields depending on the
+ * instruction type (listed below).
+ * Note that the code is written so that individual instructions
+ * have a size which is a multiple of 32 bits. This means that, if
+ * such structures contain pointers or other 64-bit entities,
+ * (there is just one instance now) they may end up unaligned on
+ * 64-bit architectures, so they must be handled with care.
+ *
+ * "enum ipfw_opcodes" are the opcodes supported. We can have up
+ * to 256 different opcodes. When adding new opcodes, they should
+ * be appended to the end of the opcode list before O_LAST_OPCODE,
+ * this will prevent the ABI from being broken, otherwise users
+ * will have to recompile ipfw(8) when they update the kernel.
+ */
+
+enum ipfw_opcodes {            /* arguments (4 byte each)      */
+       O_NOP,
+
+       O_IP_SRC,               /* u32 = IP                     */
+       O_IP_SRC_MASK,          /* ip = IP/mask                 */
+       O_IP_SRC_ME,            /* none                         */
+       O_IP_SRC_SET,           /* u32=base, arg1=len, bitmap   */
+
+       O_IP_DST,               /* u32 = IP                     */
+       O_IP_DST_MASK,          /* ip = IP/mask                 */
+       O_IP_DST_ME,            /* none                         */
+       O_IP_DST_SET,           /* u32=base, arg1=len, bitmap   */
+
+       O_IP_SRCPORT,           /* (n)port list:mask 4 byte ea  */
+       O_IP_DSTPORT,           /* (n)port list:mask 4 byte ea  */
+       O_PROTO,                /* arg1=protocol                */
+
+       O_MACADDR2,             /* 2 mac addr:mask              */
+       O_MAC_TYPE,             /* same as srcport              */
+
+       O_LAYER2,               /* none                         */
+       O_IN,                   /* none                         */
+       O_FRAG,                 /* none                         */
+
+       O_RECV,                 /* none                         */
+       O_XMIT,                 /* none                         */
+       O_VIA,                  /* none                         */
+
+       O_IPOPT,                /* arg1 = 2*u8 bitmap           */
+       O_IPLEN,                /* arg1 = len                   */
+       O_IPID,                 /* arg1 = id                    */
+
+       O_IPTOS,                /* arg1 = id (XXX tos? verify)  */
+       O_IPPRECEDENCE,         /* arg1 = precedence << 5       */
+       O_IPTTL,                /* arg1 = TTL                   */
+
+       O_IPVER,                /* arg1 = version               */
+       O_UID,                  /* u32 = id                     */
+       O_GID,                  /* u32 = id                     */
+       O_ESTAB,                /* none (tcp established)       */
+       O_TCPFLAGS,             /* arg1 = 2*u8 bitmap           */
+       O_TCPWIN,               /* arg1 = desired win           */
+       O_TCPSEQ,               /* u32 = desired seq.           */
+       O_TCPACK,               /* u32 = desired seq.           */
+       O_ICMPTYPE,             /* u32 = icmp bitmap            */
+       O_TCPOPTS,              /* arg1 = 2*u8 bitmap           */
+
+       O_VERREVPATH,           /* none                         */
+       O_VERSRCREACH,          /* none                         */
+
+       O_PROBE_STATE,          /* none                         */
+       O_KEEP_STATE,           /* none                         */
+       O_LIMIT,                /* ipfw_insn_limit              */
+       O_LIMIT_PARENT,         /* dyn_type, not an opcode.     */
+
+       /*
+        * These are really 'actions'.
+        */
+
+       O_LOG,                  /* ipfw_insn_log                */
+       O_PROB,                 /* u32 = match probability      */
+
+       O_CHECK_STATE,          /* none                         */
+       O_ACCEPT,               /* none                         */
+       O_DENY,                 /* none                         */
+       O_REJECT,               /* arg1=icmp arg (same as deny) */
+       O_COUNT,                /* none                         */
+       O_SKIPTO,               /* arg1=next rule number        */
+       O_PIPE,                 /* arg1=pipe number             */
+       O_QUEUE,                /* arg1=queue number            */
+       O_DIVERT,               /* arg1=port number             */
+       O_TEE,                  /* arg1=port number             */
+       O_FORWARD_IP,           /* fwd sockaddr                 */
+       O_FORWARD_MAC,          /* fwd mac                      */
+       O_NAT,                  /* nope (sic) -- XXX args?      */
+       O_REASS,                /* none                         */
+
+       /*
+        * More opcodes.
+        */
+       O_IPSEC,                /* has ipsec history            */
+       O_IP_SRC_LOOKUP,        /* arg1=table number, u32=value */
+       O_IP_DST_LOOKUP,        /* arg1=table number, u32=value */
+       O_ANTISPOOF,            /* none                         */
+       O_JAIL,                 /* u32 = id                     */
+       O_ALTQ,                 /* u32 = altq classif. qid      */
+       O_DIVERTED,             /* arg1=bitmap (1:loop, 2:out)  */
+       O_TCPDATALEN,           /* arg1 = tcp data len          */
+       O_IP6_SRC,              /* address without mask         */
+       O_IP6_SRC_ME,           /* my addresses                 */
+       O_IP6_SRC_MASK,         /* address with the mask        */
+       O_IP6_DST,
+       O_IP6_DST_ME,
+       O_IP6_DST_MASK,
+       O_FLOW6ID,              /* for flow id tag in the ipv6 pkt */
+       O_ICMP6TYPE,            /* icmp6 packet type filtering  */
+       O_EXT_HDR,              /* filtering for ipv6 extension header */
+       O_IP6,
+
+       /*
+        * actions for ng_ipfw
+        */
+       O_NETGRAPH,             /* send to ng_ipfw              */
+       O_NGTEE,                /* copy to ng_ipfw              */
+
+       O_IP4,
+
+       O_UNREACH6,             /* arg1=icmpv6 code arg (deny)  */
+
+       O_TAG,                  /* arg1=tag number */
+       O_TAGGED,               /* arg1=tag number */
+
+       O_SETFIB,               /* arg1=FIB number */
+       O_FIB,                  /* arg1=desired FIB number */
+
+       O_SOCKARG,              /* socket argument */
+
+       O_CALLRETURN,           /* arg1=called rule number */
+
+       O_FORWARD_IP6,          /* fwd sockaddr_in6             */
+
+       O_LAST_OPCODE           /* not an opcode!               */
+};
+
+
+/*
+ * Extension headers are matched only for presence, using a bit
+ * vector with one flag per header type.
+ */
+#define EXT_FRAGMENT   0x1
+#define EXT_HOPOPTS    0x2
+#define EXT_ROUTING    0x4
+#define EXT_AH         0x8
+#define EXT_ESP                0x10
+#define EXT_DSTOPTS    0x20
+#define EXT_RTHDR0             0x40
+#define EXT_RTHDR2             0x80
+
+/*
+ * Template for instructions.
+ *
+ * ipfw_insn is used for all instructions which require no operands,
+ * a single 16-bit value (arg1), or a couple of 8-bit values.
+ *
+ * For other instructions which require different/larger arguments
+ * we have derived structures, ipfw_insn_*.
+ *
+ * The size of the instruction (in 32-bit words) is in the low
+ * 6 bits of "len". The 2 remaining bits are used to implement
+ * NOT and OR on individual instructions. Given a type, you can
+ * compute the length to be put in "len" using F_INSN_SIZE(t)
+ *
+ * F_NOT       negates the match result of the instruction.
+ *
+ * F_OR                is used to build or blocks. By default, instructions
+ *             are evaluated as part of a logical AND. An "or" block
+ *             { X or Y or Z } contains F_OR set in all but the last
+ *             instruction of the block. A match will cause the code
+ *             to skip past the last instruction of the block.
+ *
+ * NOTA BENE: in a couple of places we assume that
+ *     sizeof(ipfw_insn) == sizeof(u_int32_t)
+ * this needs to be fixed.
+ *
+ */
+typedef struct _ipfw_insn {    /* template for instructions */
+       u_int8_t        opcode; /* one of enum ipfw_opcodes */
+       u_int8_t        len;    /* F_NOT | F_OR | size in 32-bit words */
+#define        F_NOT           0x80    /* negate the match result */
+#define        F_OR            0x40    /* not the last insn of an "or" block */
+#define        F_LEN_MASK      0x3f    /* low 6 bits of len: insn size */
+#define        F_LEN(cmd)      ((cmd)->len & F_LEN_MASK)       /* size, flags stripped */
+
+       u_int16_t       arg1;   /* 16-bit operand, meaning depends on opcode */
+} ipfw_insn;
+
+/*
+ * The F_INSN_SIZE(type) computes the size, in 4-byte words, of
+ * a given type.
+ */
+#define        F_INSN_SIZE(t)  ((sizeof (t))/sizeof(u_int32_t))
+
+/*
+ * This is used to store an array of 16-bit entries (ports etc.)
+ */
+typedef struct _ipfw_insn_u16 {
+       ipfw_insn o;
+       u_int16_t ports[2];     /* there may be more */
+} ipfw_insn_u16;
+
+/*
+ * This is used to store an array of 32-bit entries
+ * (uid, single IPv4 addresses etc.)
+ */
+typedef struct _ipfw_insn_u32 {
+       ipfw_insn o;
+       u_int32_t d[1]; /* one or more */
+} ipfw_insn_u32;
+
+/*
+ * This is used to store IP addr-mask pairs.
+ */
+typedef struct _ipfw_insn_ip {
+       ipfw_insn o;
+       struct in_addr  addr;
+       struct in_addr  mask;
+} ipfw_insn_ip;
+
+/*
+ * This is used to forward to a given address (ip).
+ */
+typedef struct  _ipfw_insn_sa {
+       ipfw_insn o;
+       struct sockaddr_in sa;
+} ipfw_insn_sa;
+
+/*
+ * This is used to forward to a given address (ipv6).
+ */
+typedef struct _ipfw_insn_sa6 {
+       ipfw_insn o;
+       struct sockaddr_in6 sa;
+} ipfw_insn_sa6;
+
+/*
+ * This is used for MAC addr-mask pairs.
+ */
+typedef struct _ipfw_insn_mac {
+       ipfw_insn o;
+       u_char addr[12];        /* dst[6] + src[6] */
+       u_char mask[12];        /* dst[6] + src[6] */
+} ipfw_insn_mac;
+
+/*
+ * This is used for interface match rules (recv xx, xmit xx).
+ */
+typedef struct _ipfw_insn_if {
+       ipfw_insn o;
+       union {
+               struct in_addr ip;
+               int glob;
+       } p;
+       char name[IFNAMSIZ];
+} ipfw_insn_if;
+
+/*
+ * This is used for storing an altq queue id number.
+ */
+typedef struct _ipfw_insn_altq {
+       ipfw_insn       o;
+       u_int32_t       qid;
+} ipfw_insn_altq;
+
+/*
+ * This is used for limit rules.
+ */
+typedef struct _ipfw_insn_limit {
+       ipfw_insn o;
+       u_int8_t _pad;
+       u_int8_t limit_mask;    /* combination of DYN_* below   */
+#define        DYN_SRC_ADDR    0x1
+#define        DYN_SRC_PORT    0x2
+#define        DYN_DST_ADDR    0x4
+#define        DYN_DST_PORT    0x8
+
+       u_int16_t conn_limit;
+} ipfw_insn_limit;
+
+/*
+ * This is used for log instructions.
+ */
+typedef struct  _ipfw_insn_log {
+        ipfw_insn o;
+       u_int32_t max_log;      /* how many do we log -- 0 = all */
+       u_int32_t log_left;     /* how many left to log         */
+} ipfw_insn_log;
+
+/*
+ * Data structures required by both ipfw(8) and ipfw(4) but not part of the
+ * management API are protected by IPFW_INTERNAL.
+ */
+#ifdef IPFW_INTERNAL
+/* Server pool support (LSNAT). */
+struct cfg_spool {
+       LIST_ENTRY(cfg_spool)   _next;          /* chain of spool instances */
+       struct in_addr          addr;
+       u_short                 port;
+};
+#endif
+
+/* Redirect modes id. */
+#define REDIR_ADDR      0x01
+#define REDIR_PORT      0x02
+#define REDIR_PROTO     0x04
+
+#ifdef IPFW_INTERNAL
+/* Nat redirect configuration. */
+struct cfg_redir {
+       LIST_ENTRY(cfg_redir)   _next;          /* chain of redir instances */
+       u_int16_t               mode;           /* redirect mode (REDIR_*) */
+       struct in_addr          laddr;          /* local ip address */
+       struct in_addr          paddr;          /* public ip address */
+       struct in_addr          raddr;          /* remote ip address */
+       u_short                 lport;          /* local port */
+       u_short                 pport;          /* public port */
+       u_short                 rport;          /* remote port  */
+       u_short                 pport_cnt;      /* number of public ports */
+       u_short                 rport_cnt;      /* number of remote ports */
+       int                     proto;          /* protocol: tcp/udp */
+       struct alias_link       **alink;        /* XXX undocumented; presumably libalias links */
+       /* number of entries in spool chain */
+       u_int16_t               spool_cnt;
+       /* chain of spool instances */
+       LIST_HEAD(spool_chain, cfg_spool) spool_chain;
+};
+#endif
+
+#define NAT_BUF_LEN     1024
+
+#ifdef IPFW_INTERNAL
+/* Nat configuration data struct. */
+struct cfg_nat {
+       /* chain of nat instances */
+       LIST_ENTRY(cfg_nat)     _next;
+       int                     id;                     /* nat id */
+       struct in_addr          ip;                     /* nat ip address */
+       char                    if_name[IF_NAMESIZE];   /* interface name */
+       int                     mode;                   /* aliasing mode */
+       struct libalias         *lib;                   /* libalias instance */
+       /* number of entries in redir chain */
+       int                     redir_cnt;
+       /* chain of redir instances */
+       LIST_HEAD(redir_chain, cfg_redir) redir_chain;
+};
+#endif
+
+#define SOF_NAT         sizeof(struct cfg_nat)
+#define SOF_REDIR       sizeof(struct cfg_redir)
+#define SOF_SPOOL       sizeof(struct cfg_spool)
+
+/* Nat command. */
+typedef struct _ipfw_insn_nat {
+       ipfw_insn       o;
+       struct cfg_nat *nat;    
+} ipfw_insn_nat;
+
+/* Apply ipv6 mask on ipv6 addr; expands to 4 statements (brace the caller) */
+#define APPLY_MASK(addr,mask)                          \
+    (addr)->__u6_addr.__u6_addr32[0] &= (mask)->__u6_addr.__u6_addr32[0]; \
+    (addr)->__u6_addr.__u6_addr32[1] &= (mask)->__u6_addr.__u6_addr32[1]; \
+    (addr)->__u6_addr.__u6_addr32[2] &= (mask)->__u6_addr.__u6_addr32[2]; \
+    (addr)->__u6_addr.__u6_addr32[3] &= (mask)->__u6_addr.__u6_addr32[3];
+
+/* Structure for ipv6 */
+typedef struct _ipfw_insn_ip6 {
+       ipfw_insn o;
+       struct in6_addr addr6;
+       struct in6_addr mask6;
+} ipfw_insn_ip6;
+
+/* Used to support icmp6 types */
+typedef struct _ipfw_insn_icmp6 {
+       ipfw_insn o;
+       uint32_t d[7]; /* XXX This number is related to the netinet/icmp6.h
+                       *     define ICMP6_MAXTYPE
+                       *     as follows: n = ICMP6_MAXTYPE/32 + 1
+                       *     (ICMP6_MAXTYPE is currently 203, so n = 7)
+                       */
+} ipfw_insn_icmp6;
+
+/*
+ * Here we have the structure representing an ipfw rule.
+ *
+ * It starts with a general area (with link fields and counters)
+ * followed by an array of one or more instructions, which the code
+ * accesses as an array of 32-bit values.
+ *
+ * Given a rule pointer  r:
+ *
+ *  r->cmd             is the start of the first instruction.
+ *  ACTION_PTR(r)      is the start of the first action (things to do
+ *                     once a rule matched).
+ *
+ * When assembling instructions, remember the following:
+ *
+ *  + if a rule has a "keep-state" (or "limit") option, then the
+ *     first instruction (at r->cmd) MUST BE an O_PROBE_STATE
+ *  + if a rule has a "log" option, then the first action
+ *     (at ACTION_PTR(r)) MUST be O_LOG
+ *  + if a rule has an "altq" option, it comes after "log"
+ *  + if a rule has an O_TAG option, it comes after "log" and "altq"
+ *
+ * NOTE: we use a simple linked list of rules because we never need
+ *     to delete a rule without scanning the list. We do not use
+ *     queue(3) macros for portability and readability.
+ */
+
+struct ip_fw {
+#ifdef _X64EMU
+               int32_t pad1;   /* XXX presumably pads x_next to 64 bits; confirm */
+#endif
+       struct ip_fw    *x_next;        /* linked list of rules         */
+#ifdef _X64EMU
+               int32_t pad2;   /* XXX presumably pads next_rule to 64 bits; confirm */
+#endif
+       struct ip_fw    *next_rule;     /* ptr to next [skipto] rule    */
+       /* 'next_rule' is used to pass up 'set_disable' status          */
+
+       uint16_t        act_ofs;        /* offset of action in 32-bit units, from cmd */
+       uint16_t        cmd_len;        /* # of 32-bit words in cmd     */
+       uint16_t        rulenum;        /* rule number                  */
+       uint8_t set;            /* rule set (0..31)             */
+#define        RESVD_SET       31      /* set for default and persistent rules */
+       uint8_t         _pad;           /* padding                      */
+       uint32_t        id;             /* rule id */
+
+       /* These fields are present in all rules.                       */
+       uint64_t        pcnt;           /* Packet counter               */
+       uint64_t        bcnt;           /* Byte counter                 */
+       uint32_t        timestamp;      /* tv_sec of last match         */
+
+       ipfw_insn       cmd[1];         /* first of cmd_len words of instructions */
+};
+
+#define ACTION_PTR(rule) /* -> first action insn; parenthesized for postfix use */ \
+       ((ipfw_insn *)( (u_int32_t *)((rule)->cmd) + ((rule)->act_ofs) ))
+
+#define RULESIZE(rule)  (sizeof(struct ip_fw) + \
+       ((struct ip_fw *)(rule))->cmd_len * 4 - 4)
+
+#if 1 // should be moved to in.h
+/*
+ * This structure is used as a flow mask and a flow id for various
+ * parts of the code.
+ * addr_type is used in userland and kernel to mark the address type.
+ * fib is used in the kernel to record the fib in use.
+ * _flags is used in the kernel to store tcp flags for dynamic rules.
+ */
+struct ipfw_flow_id {
+       uint32_t        dst_ip;
+       uint32_t        src_ip;
+       uint16_t        dst_port;
+       uint16_t        src_port;
+       uint8_t         fib;
+       uint8_t         proto;
+       uint8_t         _flags; /* protocol-specific flags */
+       uint8_t         addr_type; /* 4=ip4, 6=ip6, 1=ether ? */
+       struct in6_addr dst_ip6;
+       struct in6_addr src_ip6;
+       uint32_t        flow_id6;
+       uint32_t        extra; /* queue/pipe or frag_id */
+};
+#endif
+
+#define IS_IP6_FLOW_ID(id)     ((id)->addr_type == 6)
+
+/*
+ * Dynamic ipfw rule.
+ */
+typedef struct _ipfw_dyn_rule ipfw_dyn_rule;
+
+struct _ipfw_dyn_rule {
+       ipfw_dyn_rule   *next;          /* linked list of rules.        */
+       struct ip_fw *rule;             /* pointer to rule              */
+       /* 'rule' is used to pass up the rule number (from the parent)  */
+
+       ipfw_dyn_rule *parent;          /* pointer to parent rule       */
+       u_int64_t       pcnt;           /* packet match counter         */
+       u_int64_t       bcnt;           /* byte match counter           */
+       struct ipfw_flow_id id;         /* (masked) flow id             */
+       u_int32_t       expire;         /* expire time                  */
+       u_int32_t       bucket;         /* which bucket in hash table   */
+       u_int32_t       state;          /* state of this rule (typically a
+                                        * combination of TCP flags)
+                                        */
+       u_int32_t       ack_fwd;        /* most recent ACKs in forward  */
+       u_int32_t       ack_rev;        /* and reverse directions (used */
+                                       /* to generate keepalives)      */
+       u_int16_t       dyn_type;       /* rule type                    */
+       u_int16_t       count;          /* refcount                     */
+};
+
+/*
+ * Definitions for IP option names.
+ */
+#define        IP_FW_IPOPT_LSRR        0x01
+#define        IP_FW_IPOPT_SSRR        0x02
+#define        IP_FW_IPOPT_RR          0x04
+#define        IP_FW_IPOPT_TS          0x08
+
+/*
+ * Definitions for TCP option names.
+ */
+#define        IP_FW_TCPOPT_MSS        0x01
+#define        IP_FW_TCPOPT_WINDOW     0x02
+#define        IP_FW_TCPOPT_SACK       0x04
+#define        IP_FW_TCPOPT_TS         0x08
+#define        IP_FW_TCPOPT_CC         0x10
+
+#define        ICMP_REJECT_RST         0x100   /* fake ICMP code (send a TCP RST) */
+#define        ICMP6_UNREACH_RST       0x100   /* fake ICMPv6 code (send a TCP RST) */
+
+/*
+ * These are used for lookup tables.
+ */
+
+#define        IPFW_TABLE_CIDR         1       /* Table for holding IPv4/IPv6 prefixes */
+#define        IPFW_TABLE_INTERFACE    2       /* Table for holding interface names */
+#define        IPFW_TABLE_MAXTYPE      2       /* Maximum valid number */
+
+typedef struct _ipfw_table_entry {
+       in_addr_t       addr;           /* network address              */
+       u_int32_t       value;          /* value                        */
+       u_int16_t       tbl;            /* table number                 */
+       u_int8_t        masklen;        /* mask length                  */
+} ipfw_table_entry;
+
+typedef struct _ipfw_table_xentry {
+       uint16_t        len;            /* Total entry length           */
+       uint8_t         type;           /* entry type                   */
+       uint8_t         masklen;        /* mask length                  */
+       uint16_t        tbl;            /* table number                 */
+       uint32_t        value;          /* value                        */
+       union {
+               /* Longest field needs to be aligned by 4-byte boundary */
+               struct  in6_addr addr6; /* IPv6 address                 */
+               char    iface[IF_NAMESIZE];     /* interface name       */
+       } k;
+} ipfw_table_xentry;
+
+typedef struct _ipfw_table {
+       u_int32_t       size;           /* size of entries in bytes     */
+       u_int32_t       cnt;            /* # of entries                 */
+       u_int16_t       tbl;            /* table number                 */
+       ipfw_table_entry ent[0];        /* entries                      */
+} ipfw_table;
+
+typedef struct _ipfw_xtable {
+       ip_fw3_opheader opheader;       /* eXtended tables are controlled via IP_FW3 */
+       uint32_t        size;           /* size of entries in bytes     */
+       uint32_t        cnt;            /* # of entries                 */
+       uint16_t        tbl;            /* table number                 */
+       uint8_t         type;           /* table type                   */
+       ipfw_table_xentry xent[0];      /* entries                      */
+} ipfw_xtable;
+
+#endif /* _IPFW2_H */
diff --git a/sys/netinet/ip_icmp.h b/sys/netinet/ip_icmp.h
new file mode 100644 (file)
index 0000000..5c7b851
--- /dev/null
@@ -0,0 +1,17 @@
+/*
+ * Additional defines not present on Linux;
+ * these should eventually move to glue.h.
+ */
+#ifndef _NETINET_IP_ICMP_H_
+#define _NETINET_IP_ICMP_H_
+
+#define ICMP_MAXTYPE            40      /* defined as 18 in compat.h */
+#define ICMP_ROUTERSOLICIT      10              /* router solicitation */
+#define ICMP_TSTAMP             13              /* timestamp request */
+#define ICMP_IREQ               15              /* information request */
+#define ICMP_MASKREQ            17              /* address mask request */
+#define         ICMP_UNREACH_HOST       1               /* bad host */
+
+#define ICMP_UNREACH            3               /* dest unreachable, codes: */
+
+#endif /* _NETINET_IP_ICMP_H_ */
diff --git a/sys/netinet/ipfw/dn_heap.c b/sys/netinet/ipfw/dn_heap.c
new file mode 100644 (file)
index 0000000..fe2e971
--- /dev/null
@@ -0,0 +1,588 @@
+/*-
+ * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Binary heap and hash tables, used in dummynet
+ *
+ * $Id: dn_heap.c 11480 2012-07-31 08:02:00Z luigi $
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#ifdef _KERNEL /* kernel build: use the kernel allocator and headers */
+__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/dn_heap.c 203279 2010-01-31 12:20:29Z luigi $");
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <netinet/ipfw/dn_heap.h>
+#ifndef log
+#define log(x, arg...) /* expands to nothing when no log() is provided */
+#endif
+
+#else /* !_KERNEL */
+
+#include <stdio.h>
+#include <dn_test.h>
+#include <strings.h>
+#include <stdlib.h>
+
+#include  "dn_heap.h"
+#define log(x, arg...) fprintf(stderr, ## arg) /* first argument x is dropped */
+#define panic(x...)    fprintf(stderr, ## x), exit(1)  /* print the message and terminate */
+#define MALLOC_DEFINE(a, b, c) /* no malloc types in userspace */
+static void *my_malloc(int s) {        return malloc(s); }     /* defined before malloc() is redefined below, so this calls the libc one */
+static void my_free(void *p) { free(p); }      /* same for free() */
+#define malloc(s, t, w)        my_malloc(s)    /* accept the 3-argument form, ignore type and flags */
+#define free(p, t)     my_free(p)      /* accept the 2-argument form, ignore type */
+#endif /* !_KERNEL */
+
+MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap");
+
+/*
+ * Heap management functions.
+ *
+ * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
+ * Some macros help finding parent/children so we can optimize them.
+ *
+ * heap_resize() is called to expand the heap when needed.
+ * If the array already has room for new_size entries nothing is done.
+ * Otherwise new_size is rounded up to a value of the form 2^k - 1,
+ * a new array is allocated with M_NOWAIT, the old entries are copied
+ * into it and the old array is freed. On allocation failure the
+ * existing array is left untouched.
+ * Returns 1 on error, 0 on success
+ */
+#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )     /* index of the parent of x */
+#define HEAP_LEFT(x) ( (x)+(x) + 1 )   /* index of the left child of x */
+#define        HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }   /* swap a and b, buffer is scratch */
+#define HEAP_INCREMENT 15      /* mask, only used by the disabled rounding code below */
+
+static int
+heap_resize(struct dn_heap *h, unsigned int new_size)
+{
+       struct dn_heap_entry *p;
+
+       if (h->size >= new_size )       /* have enough room */
+               return 0;
+#if 1  /* copy the top bit into all lower bits: rounds up to 2^k - 1 */
+       new_size |= new_size >> 1;
+       new_size |= new_size >> 2;
+       new_size |= new_size >> 4;
+       new_size |= new_size >> 8;
+       new_size |= new_size >> 16;
+#else
+       new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT;      /* disabled: round to a multiple of 16 */
+#endif
+       p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT);
+       if (p == NULL) {
+               printf("--- %s, resize %d failed\n", __func__, new_size );
+               return 1; /* error */
+       }
+       if (h->size > 0) {      /* there is an old array: move its content and release it */
+               bcopy(h->p, p, h->size * sizeof(*p) );
+               free(h->p, M_DN_HEAP);
+       }
+       h->p = p;
+       h->size = new_size;
+       return 0;
+}
+
+int
+heap_init(struct dn_heap *h, int size, int ofs)        /* returns 0 on success, 1 if the array cannot be allocated */
+{
+       if (heap_resize(h, size))       /* reads h->size (and h->p if size > 0), so h must already hold valid values */
+               return 1;
+       h->elements = 0;        /* the heap is empty after (re)initialization */
+       h->ofs = ofs;   /* offset of the index stored in each object; values <= 0 disable it */
+       return 0;
+}
+
+/*
+ * Insert element in heap. Normally, p != NULL, we insert p in
+ * a new position and bubble up. If p == NULL, then the element is
+ * already in place, and key1 is the position where to start the
+ * bubble-up.
+ * Returns 1 on failure (cannot allocate new heap entry), 0 otherwise.
+ *
+ * If ofs > 0 the position (index, int32_t) of the element in the heap
+ * is also stored in the element itself at the given offset in bytes.
+ */
+#define SET_OFFSET(h, i) do {                                  \
+       if (h->ofs > 0)                                         \
+           *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = i;      \
+       } while (0)
+/*
+ * RESET_OFFSET is used for sanity checks. It overwrites the index
+ * stored in the object with an invalid value (-16).
+ */
+#define RESET_OFFSET(h, i) do {                                        \
+       if (h->ofs > 0)                                         \
+           *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16;    \
+       } while (0)
+
+int
+heap_insert(struct dn_heap *h, uint64_t key1, void *p)
+{
+       int son = h->elements;
+
+       //log("%s key %llu p %p\n", __FUNCTION__, key1, p);
+       if (p == NULL) { /* data already there, set starting point */
+               son = key1;
+       } else { /* insert new element at the end, possibly resize */
+               son = h->elements;
+               if (son == h->size) /* need resize... */
+                       // XXX expand by 16 or so
+                       if (heap_resize(h, h->elements+16) )
+                               return 1; /* failure... */
+               h->p[son].object = p;
+               h->p[son].key = key1;
+               h->elements++;
+       }
+       /* make sure that son >= father along the path */
+       while (son > 0) {
+               int father = HEAP_FATHER(son);
+               struct dn_heap_entry tmp;
+
+               if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
+                       break; /* found right position */
+               /* son smaller than (or equal to) father, swap and repeat */
+               HEAP_SWAP(h->p[son], h->p[father], tmp);
+               SET_OFFSET(h, son);     /* the old father now lives at index son */
+               son = father;
+       }
+       SET_OFFSET(h, son);     /* record the final position of the moved element */
+       return 0;
+}
+
+/*
+ * Remove the top element from the heap, or obj if obj != NULL.
+ *
+ * Removing a specific obj requires h->ofs > 0: the index of the entry
+ * is read from the object at that offset, and the function panics if
+ * the offset is not set or the index is out of range.
+ * On an empty heap a message is printed and nothing else happens.
+ * Nothing is returned, so a caller interested in the top entry must
+ * read it before calling this function.
+ */
+void
+heap_extract(struct dn_heap *h, void *obj)
+{
+       int child, father, max = h->elements - 1;
+
+       if (max < 0) {
+               printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h);
+               return;
+       }
+       if (obj == NULL)
+               father = 0; /* default: move up smallest child */
+       else { /* extract specific element, index is at offset */
+               if (h->ofs <= 0)
+                       panic("%s: extract from middle not set on %p\n",
+                               __FUNCTION__, h);
+               father = *((int *)((char *)obj + h->ofs));
+               if (father < 0 || father >= h->elements) {
+                       panic("%s: father %d out of bound 0..%d\n",
+                               __FUNCTION__, father, h->elements);
+               }
+       }
+       /*
+        * below, father is the index of the empty element, which
+        * we replace at each step with the smallest child until we
+        * reach the bottom level.
+        */
+       // XXX why removing RESET_OFFSET increases runtime by 10% ?
+       RESET_OFFSET(h, father);
+       while ( (child = HEAP_LEFT(father)) <= max ) {
+               if (child != max &&
+                   DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
+                       child++; /* take right child, otherwise left */
+               h->p[father] = h->p[child];
+               SET_OFFSET(h, father);
+               father = child;
+       }
+       h->elements--;
+       if (father != max) {
+               /*
+                * Fill hole with last entry and bubble up,
+                * reusing the insert code
+                */
+               h->p[father] = h->p[max];
+               heap_insert(h, father, NULL);
+       }
+}
+
+#if 0
+/*
+ * Change the key of an object that is already in the heap, moving
+ * the entry up or down as needed and updating the stored indexes.
+ * XXX this one is never used! It is compiled out, and it would not
+ * build if enabled: the DN_KEY_LT() test in the move-down loop has
+ * a stray '>' in front of p[temp].key.
+ */
+static void
+heap_move(struct dn_heap *h, uint64_t new_key, void *object)
+{
+       int temp, i, max = h->elements-1;
+       struct dn_heap_entry *p, buf;
+
+       if (h->ofs <= 0)
+               panic("cannot move items on this heap");
+       p = h->p;       /* shortcut */
+
+       i = *((int *)((char *)object + h->ofs));
+       if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */
+               p[i].key = new_key;
+               for (; i>0 &&
+                   DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key);
+                   i = temp ) { /* bubble up */
+                       HEAP_SWAP(p[i], p[temp], buf);
+                       SET_OFFSET(h, i);
+               }
+       } else {                /* must move down */
+               p[i].key = new_key;
+               while ( (temp = HEAP_LEFT(i)) <= max ) {
+                       /* found left child */
+                       if (temp != max &&
+                           DN_KEY_LT(p[temp+1].key, p[temp].key))
+                               temp++; /* select child with min key */
+                       if (DN_KEY_LT(>p[temp].key, new_key)) {
+                               /* go down */
+                               HEAP_SWAP(p[i], p[temp], buf);
+                               SET_OFFSET(h, i);
+                       } else
+                               break;
+                       i = temp;
+               }
+       }
+       SET_OFFSET(h, i);
+}
+#endif /* heap_move, unused */
+
+/*
+ * heapify() will reorganize data inside an array to maintain the
+ * heap property. It is needed when we delete a bunch of entries.
+ * It re-runs the bubble-up step of heap_insert() on every position,
+ * from the first to the last.
+ */
+static void
+heapify(struct dn_heap *h)
+{
+       int i;
+
+       for (i = 0; i < h->elements; i++ )
+               heap_insert(h, i , NULL);       /* p == NULL: entry i is already in place, only bubble it up */
+}
+
+int
+heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t),
+       uintptr_t arg)  /* calls fn(object, arg) on the entries; returns how many were removed */
+{
+       int i, ret, found;
+
+       for (i = found = 0 ; i < h->elements ;) {
+               ret = fn(h->p[i].object, arg);
+               if (ret & HEAP_SCAN_DEL) {
+                       h->elements-- ; /* drop entry i ... */
+                       h->p[i] = h->p[h->elements] ;   /* ... refilling its slot with the last entry */
+                       found++ ;
+               } else
+                       i++ ;   /* advance only when slot i was kept, a refilled slot is visited again */
+               if (ret & HEAP_SCAN_END)
+                       break;
+       }
+       if (found)
+               heapify(h);     /* removals broke the ordering, rebuild it */
+       return found;
+}
+
+/*
+ * Release the array of entries, if one was allocated, and zero the
+ * heap descriptor. The struct dn_heap itself is not freed.
+ */
+void
+heap_free(struct dn_heap *h)
+{
+       if (h->size >0 )
+               free(h->p, M_DN_HEAP);
+       bzero(h, sizeof(*h) );
+}
+
+/*
+ * hash table support.
+ *
+ * Descriptor of a hash table. The objects of a bucket are chained
+ * through a link field stored in the objects themselves, 'ofs' bytes
+ * from their start.
+ */
+
+struct dn_ht {
+        int buckets;            /* number of buckets - 1, used as a mask on the hash value */
+        int entries;            /* how many entries */
+        int ofs;               /* offset of link field */
+        uint32_t (*hash)(uintptr_t, int, void *arg);
+        int (*match)(void *_el, uintptr_t key, int, void *);
+        void *(*newh)(uintptr_t, int, void *);
+        void **ht;              /* one pointer per bucket (tail of its list), NULL if empty */
+};
+/*
+ * Initialize, allocating bucket pointers inline.
+ * Recycle the previous record if it has enough buckets, otherwise
+ * free it and allocate a new one.
+ * If the 'newh' function is not supplied, we assume that the
+ * key passed to dn_ht_find() is the object to be stored.
+ *
+ * Returns the table, or NULL if the hash or match callback is
+ * missing, if 'buckets' is outside 1..65536, or if the allocation
+ * fails.
+ */
+struct dn_ht *
+dn_ht_init(struct dn_ht *ht, int buckets, int ofs,
+        uint32_t (*h)(uintptr_t, int, void *),
+        int (*match)(void *, uintptr_t, int, void *),
+       void *(*newh)(uintptr_t, int, void *))
+{
+       int l;
+
+       /*
+        * Notes about rounding bucket size to a power of two.
+        * Given the original bucket size, we compute the nearest lower and
+        * higher power of two, minus 1 (respectively b_min and b_max) because
+        * this value will be used to do an AND with the index returned
+        * by the hash function.
+        * To choose between these two values, the original bucket size is
+        * compared with b_min. If the original size is greater than 4/3 b_min,
+        * we round the bucket size to b_max, else to b_min.
+        * This ratio tries to round to the nearest power of two, favouring
+        * the greater size if the difference between the two powers is
+        * relatively big.
+        * Rounding the bucket size to a power of two avoids the use of
+        * a modulo when calculating the correct bucket.
+        * The ht->buckets variable stores the bucket size - 1 to simply
+        * do an AND between the index returned by the hash function and
+        * ht->buckets instead of a modulo.
+        */
+       int b_min; /* min buckets */
+       int b_max; /* max buckets */
+       int b_ori; /* original buckets */
+
+       if (h == NULL || match == NULL) {
+               printf("--- missing hash or match function");
+               return NULL;
+       }
+       if (buckets < 1 || buckets > 65536)
+               return NULL;
+
+       b_ori = buckets;
+       /* calculate next power of 2, - 1*/
+       buckets |= buckets >> 1;
+       buckets |= buckets >> 2;
+       buckets |= buckets >> 4;
+       buckets |= buckets >> 8;
+       buckets |= buckets >> 16;
+
+       b_max = buckets; /* Next power */
+       b_min = buckets >> 1; /* Previous power */
+
+       /* Calculate the 'nearest' bucket size */
+       if (b_min * 4000 / 3000 < b_ori)
+               buckets = b_max;
+       else
+               buckets = b_min;
+
+       if (ht) {       /* see if we can reuse */
+               if (buckets <= ht->buckets) {
+                       ht->buckets = buckets;  /* reuse as is: bucket pointers and 'entries' are not cleared here */
+               } else {
+                       /* free pointers if not allocated inline */
+                       if (ht->ht != (void *)(ht + 1))
+                               free(ht->ht, M_DN_HEAP);
+                       free(ht, M_DN_HEAP);
+                       ht = NULL;
+               }
+       }
+       if (ht == NULL) {
+               /* Allocate buckets + 1 entries because buckets is used as
+                * the mask in the AND with the index returned by the hash
+                * function, so valid indexes are 0..buckets
+                */
+               l = sizeof(*ht) + (buckets + 1) * sizeof(void **);
+               ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO);
+       }
+       if (ht) {
+               ht->ht = (void **)(ht + 1);     /* bucket pointers live right after the descriptor */
+               ht->buckets = buckets;
+               ht->ofs = ofs;
+               ht->hash = h;
+               ht->match = match;
+               ht->newh = newh;
+       }
+       return ht;
+}
+
+/*
+ * dummy callback for dn_ht_free to unlink all: it ignores both
+ * arguments and always asks for the removal of the object
+ */
+static int
+do_del(void *obj, void *arg)
+{
+       return DNHT_SCAN_DEL;
+}
+
+void
+dn_ht_free(struct dn_ht *ht, int flags)        /* with DNHT_REMOVE unlink all objects, otherwise free the table */
+{
+       if (ht == NULL)
+               return;
+       if (flags & DNHT_REMOVE) {
+               (void)dn_ht_scan(ht, do_del, NULL);     /* objects are only unlinked; the table stays allocated */
+       } else {
+               if (ht->ht && ht->ht != (void *)(ht + 1))       /* bucket pointers not allocated inline */
+                       free(ht->ht, M_DN_HEAP);
+               free(ht, M_DN_HEAP);
+       }
+}
+
+int
+dn_ht_entries(struct dn_ht *ht)        /* number of entries in the table, 0 for a NULL table */
+{
+       return ht ? ht->entries : 0;
+}
+
+/*
+ * Helper function to scan a bucket in the hash table, it
+ * can only be called on a non-empty bucket for a valid table.
+ *
+ * In lookup and scan, consider ht->ht[i] as pointing to the tail
+ * of the queue (head is NEXT(tail)). The 'empty' value is irrelevant.
+ * While searching, start analysing p = head, end when p == tail.
+ * Note that 'tail' is a cache of the _original_ ht->ht[i]
+ * and is used to check for loop termination. If you remove
+ * it, you must also adjust 'p' when deleting the 'tail' element.
+ *
+ * The link of the current object is read before calling fn() on it.
+ * Returns the number of objects unlinked. *bucket is advanced to the
+ * next bucket, unless the callback stopped the scan with
+ * DNHT_SCAN_END: in that case it is left unchanged.
+ */
+#define NEXT(_h, _p) *((void **)((char *)(_p) + (_h)->ofs))    /* link field of object _p */
+static int
+dn_ht_scan_body(struct dn_ht *ht, int *bucket,
+       int (*fn)(void *, void *), void *arg)
+{
+       int ret, found = 0, i = *bucket;
+       void *tail, *pp, *p, *nextp;
+
+       pp = tail = ht->ht[i];
+       do {
+               p = NEXT(ht, pp);
+               nextp = NEXT(ht, p);
+               ret = fn(p, arg);
+               if ((ret & DNHT_SCAN_DEL) == 0) {
+                       pp = p;  /* prepare for next loop */
+               } else {
+                       found++;
+                       ht->entries--;
+                       /* skip current element */
+                       if (pp != p)
+                               /* pp == p implies p == tail */
+                               NEXT(ht, pp) = nextp;
+                       if (p == tail)
+                               ht->ht[i] = (pp != p) ? pp : NULL;
+               }
+               if (ret & DNHT_SCAN_END) {
+                       /* Update ht->ht[i] before returning */
+                       ht->ht[i] = (ht->ht[i] == NULL) ? NULL : pp;
+                       return found;
+               }
+       } while (p != tail);
+
+       (*bucket)++;
+       return found;
+}
+
+/*
+ * lookup and optionally create or delete element.
+ * This is an optimized version of the scan so it is coded
+ * inline.
+ *
+ * Returns the object found (after unlinking it if DNHT_REMOVE is
+ * set), or the newly inserted object if nothing matched and
+ * DNHT_INSERT is set, or NULL.
+ */
+void *
+dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg)
+{
+       int i, found;
+       void *tail, *pp, *p; /* pp is the prev element, p is current */
+
+       if (ht == NULL) /* easy on an empty hash */
+               return NULL;
+       i = (ht->buckets == 1) ? 0 :
+               (ht->hash(key, flags, arg) & ht->buckets);
+
+       pp = tail = ht->ht[i];
+       if (tail) { /* non empty, try a lookup */
+               do {
+                       p = NEXT(ht, pp);
+                       found = (flags & DNHT_MATCH_PTR) ? key == (uintptr_t)p :
+                                       ht->match(p, key, flags, arg);
+                       if (!found)
+                               continue;       /* jumps to the loop test, which also advances pp */
+                       if (flags & DNHT_REMOVE) {
+                               ht->entries--;
+                               if (p != pp)    /* skip current element */
+                                       NEXT(ht, pp) = NEXT(ht, p);
+                               if (p == tail)
+                                       ht->ht[i] = (pp != p) ? pp : NULL;
+                       }
+                       return p;
+               } while ( (pp = p) != tail);
+       }
+       /* not found */
+       if ((flags & DNHT_INSERT) == 0)
+               return NULL;
+       p = ht->newh ? ht->newh(key, flags, arg) : (void *)key;
+       if (p) {
+               ht->entries++;
+               if (tail == NULL) {
+                       ht->ht[i] = NEXT(ht, p) = p;    /* only element: it points to itself */
+               } else {
+                       NEXT(ht, p) = NEXT(ht, tail);   /* link right after the tail, i.e. as the new head */
+                       NEXT(ht, tail) = p;
+               }
+       }
+
+       return p;
+}
+
+/*
+ * do a scan with the option to delete the object.
+ * Similar to the lookup, but the match function is different,
+ * and we extract 'next' before running the callback because
+ * the element may be destroyed there.
+ * Returns the number of objects removed; 0 if ht or fn is NULL.
+ */
+int
+dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg)
+{
+       int i, bucket, found = 0;
+
+       if (ht == NULL || fn == NULL)
+               return 0;
+       for (i = 0; i <= ht->buckets; i++) {
+               if (ht->ht[i] == NULL)
+                       continue; /* empty  bucket */
+               bucket = i;
+               found += dn_ht_scan_body(ht, &bucket, fn, arg);
+               if (bucket == i) /* early exit: bucket not advanced, the callback asked to stop */
+                               return found;
+       }
+       return found;
+}
+
+/*
+ * Similar to dn_ht_scan(), except that the scan is performed only
+ * in the bucket '*bucket'. If the bucket number is out of range it
+ * is reset to 0 before scanning.
+ * On return *bucket has been advanced to the next bucket, unless the
+ * callback returned DNHT_SCAN_END: in that case ht->ht[i] is moved
+ * to the last entry processed and *bucket is left unchanged.
+ * Returns the number of objects removed; 0 if ht or fn is NULL.
+ */
+int
+dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *),
+                void *arg)
+{
+       if (ht == NULL || fn == NULL)
+               return 0;
+       if (*bucket > ht->buckets || *bucket < 0)
+               *bucket = 0;
+       if (ht->ht[*bucket] == NULL) {
+               (*bucket)++;    /* empty bucket: just move on */
+               return 0;
+       } else
+               return dn_ht_scan_body(ht, bucket, fn, arg);
+}
diff --git a/sys/netinet/ipfw/dn_heap.h b/sys/netinet/ipfw/dn_heap.h
new file mode 100644 (file)
index 0000000..09b2ac7
--- /dev/null
@@ -0,0 +1,191 @@
+/*-
+ * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Binary heap and hash tables, header file
+ *
+ * $FreeBSD: head/sys/netinet/ipfw/dn_heap.h 204865 2010-03-08 11:27:08Z luigi $
+ */
+
+#ifndef _IP_DN_HEAP_H
+#define _IP_DN_HEAP_H
+
+#define DN_KEY_LT(a,b)     ((int64_t)((a)-(b)) < 0)    /* a before b: tests the sign of the difference as int64_t */
+#define DN_KEY_LEQ(a,b)    ((int64_t)((a)-(b)) <= 0)   /* a before or equal to b, same technique */
+
+/*
+ * This module implements a binary heap supporting random extraction.
+ *
+ * A heap entry contains an uint64_t key and a pointer to object.
+ * DN_KEY_LT(a,b) returns true if key 'a' is smaller than 'b'
+ *
+ * The heap is a struct dn_heap plus a dynamically allocated
+ * array of dn_heap_entry entries. 'size' represents the size of
+ * the array, 'elements' count entries in use. The topmost
+ * element has the smallest key.
+ * The heap supports ordered insert, and extract from the top.
+ * To extract an object from the middle of the heap, the object
+ * must reserve an 'int32_t' to store its own position in the
+ * heap, and the location of this field must be passed as an
+ * argument to heap_init() -- use -1 if the feature is not used.
+ */
+struct dn_heap_entry {
+       uint64_t key;   /* sorting key, smallest comes first */
+       void *object;   /* object pointer */
+};
+
+struct dn_heap {
+       int size;       /* the size of the array, in entries */
+       int elements;   /* elements in use */
+       int ofs;        /* offset in the object of heap index */
+       struct dn_heap_entry *p;        /* array of "size" entries */
+};
+
+enum { /* bits returned by the heap_scan() callback, can be combined */
+       HEAP_SCAN_DEL = 1,      /* remove the current element */
+       HEAP_SCAN_END = 2,      /* terminate the scan */
+};
+
+/*
+ * heap_init() reinitializes the heap setting the size and the offset
+ *     of the index for random extraction (use -1 if not used).
+ *     The 'elements' counter is set to 0.
+ *
+ * SET_HEAP_OFS() indicates where, in the object, is stored the index
+ *     for random extractions from the heap.
+ *
+ * heap_free() frees the memory associated to a heap.
+ *
+ * heap_insert() adds a key-pointer pair to the heap
+ *
+ * HEAP_TOP() returns a pointer to the top element of the heap,
+ *     but makes no checks on its existence (XXX should we change ?)
+ *
+ * heap_extract() removes the entry at the top, or the given object
+ *     if one is passed. It returns nothing, so the key and the
+ *     pointer should have been read before.
+ *
+ * heap_scan() invokes a callback on each entry of the heap.
+ *     The callback can return a combination of HEAP_SCAN_DEL and
+ *     HEAP_SCAN_END. HEAP_SCAN_DEL means the current element must
+ *     be removed, and HEAP_SCAN_END means to terminate the scan.
+ *     heap_scan() returns the number of elements removed.
+ *     Because the order is not guaranteed, we should use heap_scan()
+ *     only as a last resort mechanism.
+ */
+#define HEAP_TOP(h)    ((h)->p)
+#define SET_HEAP_OFS(h, n)     do { (h)->ofs = n; } while (0)
+int     heap_init(struct dn_heap *h, int size, int ofs);
+int     heap_insert(struct dn_heap *h, uint64_t key1, void *p);
+void    heap_extract(struct dn_heap *h, void *obj);
+void heap_free(struct dn_heap *h);
+int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t);
+
+/*------------------------------------------------------
+ * This module implements a generic hash table with support for
+ * running callbacks on the entire table. To avoid allocating
+ * memory during hash table operations, objects must reserve
+ * space for a link field. XXX if the heap is moderately full,
+ * an SLIST suffices, and we can tolerate the cost of a hash
+ * computation on each removal.
+ *
+ * dn_ht_init() initializes the table, setting the number of
+ *     buckets, the offset of the link field, the main callbacks.
+ *     Callbacks are:
+ * 
+ *     hash(key, flags, arg) called to return a bucket index.
+ *     match(obj, key, flags, arg) called to determine if key
+ *             matches the current 'obj' in the table
+ *     newh(key, flags, arg) optional, used to allocate a new
+ *             object during insertions.
+ *
+ * dn_ht_free() frees the table or unlinks its elements.
+ *     DNHT_REMOVE unlinks the elements, 0 frees the table.
+ *     You need two calls to do both.
+ *
+ * dn_ht_find() is the main lookup function, which can also be
+ *     used to insert or delete elements in the hash table.
+ *     The final 'arg' is passed to all callbacks.
+ *
+ * dn_ht_scan() is used to invoke a callback on all entries of
+ *     the table. The callback is invoked with a pointer to the
+ *     object, and may return DNHT_SCAN_DEL and/or DNHT_SCAN_END
+ *     to request the removal of the object from the table and
+ *     the end of the scan, respectively.
+ *
+ * dn_ht_scan_bucket() is similar to dn_ht_scan(), except that it scans
+ *     only the specific bucket of the table. The bucket is an in-out
+ *     parameter and returns a valid bucket number if the original
+ *     is invalid.
+ *
+ * A combination of flags can be used to modify the operation
+ * of the dn_ht_find(), and of the callbacks:
+ *
+ * DNHT_KEY_IS_OBJ     means the key is the object pointer.
+ *     It is usually of interest for the hash and match functions.
+ *
+ * DNHT_MATCH_PTR      during a lookup, match pointers instead
+ *     of calling match(). Normally used when removing specific
+ *     entries. Does not imply KEY_IS_OBJ as the latter _is_ used
+ *     by the match function.
+ *
+ * DNHT_INSERT         insert the element if not found.
+ *     Calls newh() to allocate a new object unless
+ *     DNHT_KEY_IS_OBJ is set.
+ *
+ * DNHT_UNIQUE         only insert if object not found.
+ *     XXX should it imply DNHT_INSERT ?
+ *
+ * DNHT_REMOVE         remove objects if we find them.
+ */
+struct dn_ht;  /* should be opaque */
+
+struct dn_ht *dn_ht_init(struct dn_ht *, int buckets, int ofs, 
+        uint32_t (*hash)(uintptr_t, int, void *),
+        int (*match)(void *, uintptr_t, int, void *),
+        void *(*newh)(uintptr_t, int, void *));
+void dn_ht_free(struct dn_ht *, int flags);
+
+void *dn_ht_find(struct dn_ht *, uintptr_t, int, void *);
+int dn_ht_scan(struct dn_ht *, int (*)(void *, void *), void *);
+int dn_ht_scan_bucket(struct dn_ht *, int * , int (*)(void *, void *), void *);
+int dn_ht_entries(struct dn_ht *);
+
+enum {  /* flags values, one bit each so they can be combined.
+        * The first two are returned by the scan callback to ask
+        * to delete the matching element or to end the scan
+        */
+        DNHT_SCAN_DEL  = 0x0001,
+        DNHT_SCAN_END  = 0x0002,
+        DNHT_KEY_IS_OBJ        = 0x0004,       /* key is the obj pointer */
+        DNHT_MATCH_PTR = 0x0008,       /* match by pointer, not match() */
+        DNHT_INSERT    = 0x0010,       /* insert if not found */
+        DNHT_UNIQUE    = 0x0020,       /* report error if already there */
+        DNHT_REMOVE    = 0x0040,       /* remove on find or dn_ht_free */
+}; 
+
+#endif /* _IP_DN_HEAP_H */
diff --git a/sys/netinet/ipfw/dn_sched.h b/sys/netinet/ipfw/dn_sched.h
new file mode 100644 (file)
index 0000000..a755e86
--- /dev/null
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * The API to write a packet scheduling algorithm for dummynet.
+ *
+ * $FreeBSD: head/sys/netinet/ipfw/dn_sched.h 204591 2010-03-02 17:40:48Z luigi $
+ */
+
+#ifndef _DN_SCHED_H
+#define _DN_SCHED_H
+
+#define        DN_MULTIQUEUE   0x01
+/*
+ * Descriptor for a scheduling algorithm.
+ * Contains all function pointers for a given scheduler
+ * This is typically created when a module is loaded, and stored
+ * in a global list of schedulers.
+ */
+struct dn_alg {
+       uint32_t type;           /* the scheduler type */
+       const char *name;   /* scheduler name */
+       uint32_t flags; /* DN_MULTIQUEUE if supports multiple queues */
+
+       /*
+        * The following define the size of 3 optional data structures
+        * that may need to be allocated at runtime, and are appended
+        * to each of the base data structures: scheduler, sched.inst,
+        * and queue. We don't have a per-flowset structure.
+        */
+       /*    + parameters attached to the template, e.g.
+        *      default queue sizes, weights, quantum size, and so on;
+        */
+       size_t schk_datalen;
+
+       /*    + per-instance parameters, such as timestamps,
+        *      containers for queues, etc;
+        */
+       size_t si_datalen;
+
+       size_t q_datalen;       /* per-queue parameters (e.g. S,F) */
+
+       /*
+        * Methods implemented by the scheduler:
+        * enqueue      enqueue packet 'm' on scheduler 's', queue 'q'.
+        *      q is NULL for !MULTIQUEUE.
+        *      Return 0 on success, 1 on drop (packet consumed anyways).
+        *      Note that q should be interpreted only as a hint
+        *      on the flow that the mbuf belongs to: while a
+        *      scheduler will normally enqueue m into q, it is ok
+        *      to leave q alone and put the mbuf elsewhere.
+        *      This function is called in two cases:
+        *       - when a new packet arrives to the scheduler;
+        *       - when a scheduler is reconfigured. In this case the
+        *         call is issued by the new_queue callback, with a 
+        *         non empty queue (q) and m pointing to the first
+        *         mbuf in the queue. For this reason, the function
+        *         should internally check for (m != q->mq.head)
+        *         before calling dn_enqueue().
+        *
+        * dequeue      Called when scheduler instance 's' can
+        *      dequeue a packet. Return NULL if none are available.
+        *      XXX what about non work-conserving ?
+        *
+        * config       called on 'sched X config ...', normally writes
+        *      in the area of size sch_arg
+        *
+        * destroy      called on 'sched delete', frees everything
+        *      in sch_arg (other parts are handled by more specific
+        *      functions)
+        *
+        * new_sched    called when a new instance is created, e.g.
+        *      to create the local queue for !MULTIQUEUE, set V or
+        *      copy parameters for WFQ, and so on.
+        *
+        * free_sched   called when deleting an instance, cleans
+        *      extra data in the per-instance area.
+        *
+        * new_fsk      called when a flowset is linked to a scheduler,
+        *      e.g. to validate parameters such as weights etc.
+        * free_fsk     when a flowset is unlinked from a scheduler.
+        *      (probably unnecessary)
+        *
+        * new_queue    called to set the per-queue parameters,
+        *      e.g. S and F, adjust sum of weights in the parent, etc.
+        *
+        *      The new_queue callback is normally called from when
+        *      creating a new queue. In some cases (such as a
+        *      scheduler change or reconfiguration) it can be called
+        *      with a non empty queue. In this case, the queue
+        *      In case of non empty queue, the new_queue callback could
+        *      need to call the enqueue function. In this case,
+        *      the callback should eventually call enqueue() passing
+        *      as m the first element in the queue.
+        *
+        * free_queue   actions related to a queue removal, e.g. undo
+        *      all the above. If the queue has data in it, also remove
+        *      from the scheduler. This can e.g. happen during a reconfigure.
+        *      If safe == 1 remove the queue only if the scheduler no longer
+        *      need it, otherwise delete it even if the scheduler is using
+        *      it. Usually, the flag safe is set when the drain routine is
+        *      running to delete idle queues.
+        */
+       int (*enqueue)(struct dn_sch_inst *, struct dn_queue *,
+               struct mbuf *);
+       struct mbuf * (*dequeue)(struct dn_sch_inst *);
+
+       int (*config)(struct dn_schk *);
+       int (*destroy)(struct dn_schk*);
+       int (*new_sched)(struct dn_sch_inst *);
+       int (*free_sched)(struct dn_sch_inst *);
+       int (*new_fsk)(struct dn_fsk *f);
+       int (*free_fsk)(struct dn_fsk *f);
+       int (*new_queue)(struct dn_queue *q);
+       int (*free_queue)(struct dn_queue *q, int safe);
+
+       /* run-time fields */
+       int ref_count;      /* XXX number of instances in the system */
+       SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */
+};
+
+/* MSVC does not support initializers so we need this ugly macro */
+#ifdef _WIN32
+#define _SI(fld)
+#else
+#define _SI(fld)       fld
+#endif
+
+/*
+ * Additionally, dummynet exports some functions and macros
+ * to be used by schedulers:
+ */
+
+void dn_free_pkts(struct mbuf *mnext);
+int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop);
+/* bound a variable between min and max */
+int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg);
+
+/*
+ * Extract the head of queue q (NULL if q is empty) and update the
+ * length/byte counters of q and, when q->_si is set, of that scheduler
+ * instance. Must be the very last thing done on a dequeue as the queue
+ * itself may go away.
+ */
+static __inline struct mbuf*
+dn_dequeue(struct dn_queue *q)
+{
+       struct mbuf *m = q->mq.head;
+       if (m == NULL)
+               return NULL;
+       q->mq.head = m->m_nextpkt;
+       /* note: q->mq.tail and m->m_nextpkt are left untouched */
+       /* Update stats for the queue */
+       q->ni.length--;
+       q->ni.len_bytes -= m->m_pkthdr.len;
+       /* When the queue becomes idle, record the time in q_time (used by
+        * RED) and count it among the idle queues (garbage collection).
+        */
+       if (q->ni.length == 0) {
+               dn_cfg.idle_queue++;
+               q->q_time = dn_cfg.curr_time;
+       }
+       if (q->_si) {
+               struct dn_flow *ni = &(q->_si->ni);
+               /* same accounting for the scheduler instance */
+               ni->length--;
+               ni->len_bytes -= m->m_pkthdr.len;
+               if (ni->length == 0)
+                       dn_cfg.idle_si++;
+       }
+       return m;
+}
+
+int dn_sched_modevent(module_t mod, int cmd, void *arg);
+
+#define DECLARE_DNSCHED_MODULE(name, dnsched)                  \
+       static moduledata_t name##_mod = {                      \
+               #name, dn_sched_modevent, dnsched               \
+       };                                                      \
+       DECLARE_MODULE(name, name##_mod,                        \
+               SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);     \
+        MODULE_DEPEND(name, dummynet, 3, 3, 3);
+#endif /* _DN_SCHED_H */
diff --git a/sys/netinet/ipfw/dn_sched_fifo.c b/sys/netinet/ipfw/dn_sched_fifo.c
new file mode 100644 (file)
index 0000000..d8733c9
--- /dev/null
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id: dn_sched_fifo.c 11480 2012-07-31 08:02:00Z luigi $
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h>    /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h>            /* ipfw_rule_ref */
+#include <netinet/ip_fw.h>     /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+/*
+ * This file implements a FIFO scheduler for a single queue.
+ * The queue is allocated as part of the scheduler instance,
+ * and there is a single flowset is in the template which stores
+ * queue size and policy.
+ * Enqueue and dequeue use the default library functions.
+ */
+static int 
+fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m)
+{
+       /* XXX if called with q != NULL and m=NULL, this is a
+        * re-enqueue from an existing scheduler, which we should
+        * handle.
+        */
+       return dn_enqueue((struct dn_queue *)(si+1), m, 0);
+}
+
+static struct mbuf *
+fifo_dequeue(struct dn_sch_inst *si)
+{
+       return dn_dequeue((struct dn_queue *)(si + 1));
+}
+
+static int
+fifo_new_sched(struct dn_sch_inst *si)
+{
+       /* This scheduler instance contains the queue */
+       struct dn_queue *q = (struct dn_queue *)(si + 1);
+
+        set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q));
+       q->_si = si;
+       q->fs = si->sched->fs;
+       return 0;
+}
+
+static int
+fifo_free_sched(struct dn_sch_inst *si)
+{
+       struct dn_queue *q = (struct dn_queue *)(si + 1);
+       dn_free_pkts(q->mq.head);
+       bzero(q, sizeof(*q));
+       return 0;
+}
+
+/*
+ * FIFO scheduler descriptor
+ * contains the type of the scheduler, the name, the size of extra
+ * data structures, and function pointers.
+ */
+static struct dn_alg fifo_desc = {
+       _SI( .type = )  DN_SCHED_FIFO,
+       _SI( .name = )  "FIFO",
+       _SI( .flags = ) 0,
+
+       _SI( .schk_datalen = ) 0,
+       _SI( .si_datalen = )  sizeof(struct dn_queue),
+       _SI( .q_datalen = )  0,
+
+       _SI( .enqueue = )  fifo_enqueue,
+       _SI( .dequeue = )  fifo_dequeue,
+       _SI( .config = )  NULL,
+       _SI( .destroy = )  NULL,
+       _SI( .new_sched = )  fifo_new_sched,
+       _SI( .free_sched = )  fifo_free_sched,
+       _SI( .new_fsk = )  NULL,
+       _SI( .free_fsk = )  NULL,
+       _SI( .new_queue = )  NULL,
+       _SI( .free_queue = )  NULL,
+};
+
+DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc);
diff --git a/sys/netinet/ipfw/dn_sched_prio.c b/sys/netinet/ipfw/dn_sched_prio.c
new file mode 100644 (file)
index 0000000..7bc67ea
--- /dev/null
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id: dn_sched_prio.c 11480 2012-07-31 08:02:00Z luigi $
+ */
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h>    /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h>            /* ipfw_rule_ref */
+#include <netinet/ip_fw.h>     /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#define DN_SCHED_PRIO  5 //XXX
+
+#if !defined(_KERNEL) || !defined(__linux__)
+#define test_bit(ix, pData)    (((*(pData)) & (1UL<<(ix))) != 0)
+#define __set_bit(ix, pData)   ((*(pData)) |= (1UL<<(ix)))
+#define __clear_bit(ix, pData) ((*(pData)) &= ~(1UL<<(ix)))
+#endif
+/* 1UL, not 1: the bitmap is an unsigned long and ix can exceed 31 */
+#ifdef __MIPSEL__
+#define __clear_bit(ix, pData) ((*(pData)) &= ~(1UL<<(ix)))
+#endif
+
+/* Size of the array of queues pointers. */
+#define BITMAP_T       unsigned long
+#define MAXPRIO                (sizeof(BITMAP_T) * 8)
+
+/*
+ * The scheduler instance contains an array of pointers to queues,
+ * one for each priority, and a bitmap listing backlogged queues.
+ */
+struct prio_si {
+       BITMAP_T bitmap;                        /* array bitmap */
+       struct dn_queue *q_array[MAXPRIO];      /* Array of queues pointers */
+};
+
+/*
+ * If a queue with the same priority is already backlogged, use
+ * that one instead of the queue passed as argument.
+ */
+static int
+prio_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
+{
+       struct prio_si *si = (struct prio_si *)(_si + 1);
+       int prio = q->fs->fs.par[0];
+
+       if (test_bit(prio, &si->bitmap) != 0)
+               q = si->q_array[prio];  /* use the backlogged queue */
+       if (dn_enqueue(q, m, 0))
+               return 1;       /* dropped: leave the bitmap untouched */
+       /* Mark the priority only once a packet is really queued: an
+        * empty marked queue would make prio_dequeue() return NULL.
+        */
+       __set_bit(prio, &si->bitmap);
+       si->q_array[prio] = q;
+       return 0;
+}
+
+/*
+ * Packets are dequeued only from the highest priority queue.
+ * The lowest bit set in the bitmap is the index in q_array of the
+ * pointer to the highest priority backlogged queue. The bitmap is
+ * scanned directly because ffs() only takes an int.
+ * After the dequeue, if this queue becomes empty, its index is removed
+ * from the bitmap.
+ * Scheduler is idle if the bitmap is empty
+ *
+ * NOTE: highest priority is 0, lowest is MAXPRIO - 1
+ */
+static struct mbuf *
+prio_dequeue(struct dn_sch_inst *_si)
+{
+       struct prio_si *si = (struct prio_si *)(_si + 1);
+       struct mbuf *m;
+       struct dn_queue *q;
+       int prio;
+
+       if (si->bitmap == 0) /* scheduler idle */
+               return NULL;
+       /* bitmap != 0 here, so the scan stops before MAXPRIO */
+       for (prio = 0; (si->bitmap & (1UL << prio)) == 0; prio++)
+               ;
+       /* Take the highest priority queue in the scheduler */
+       q = si->q_array[prio];
+       // assert(q)
+
+       m = dn_dequeue(q);
+       if (q->mq.head == NULL) {
+               /* Queue is now empty, remove from scheduler
+                * and mark it
+                */
+               si->q_array[prio] = NULL;
+               __clear_bit(prio, &si->bitmap);
+       }
+       return m;
+}
+
+static int
+prio_new_sched(struct dn_sch_inst *_si)
+{
+       struct prio_si *si = (struct prio_si *)(_si + 1);
+
+       bzero(si->q_array, sizeof(si->q_array));
+       si->bitmap = 0;
+
+       return 0;
+}
+
+static int
+prio_new_fsk(struct dn_fsk *fs)
+{
+       /* Check if the prioritiy is between 0 and MAXPRIO-1 */
+       ipdn_bound_var(&fs->fs.par[0], 0, 0, MAXPRIO - 1, "PRIO priority");
+       return 0;
+}
+
+static int
+prio_new_queue(struct dn_queue *q)
+{
+       struct prio_si *si = (struct prio_si *)(q->_si + 1);
+       int prio = q->fs->fs.par[0];
+       struct dn_queue *oldq;
+
+       q->ni.oid.subtype = DN_SCHED_PRIO;
+       /* an empty queue needs no further setup */
+       if (q->mq.head == NULL)
+               return 0;
+
+       /* Queue is not empty, must insert in the scheduler or append
+        * mbufs to existing queue. This partly duplicates prio_enqueue
+        */
+       if (test_bit(prio, &si->bitmap) == 0) {
+               /* No queue with this priority, insert */
+               __set_bit(prio, &si->bitmap);
+               si->q_array[prio] = q;
+       } else if ( (oldq = si->q_array[prio]) != q) {
+               /* must append to the existing queue.
+                * can simply append q->mq.head to oldq->mq.tail
+                * and add the counters to those of oldq
+                */
+               oldq->mq.tail->m_nextpkt = q->mq.head;
+               oldq->mq.tail = q->mq.tail;
+               oldq->ni.length += q->ni.length;
+               q->ni.length = 0;
+               oldq->ni.len_bytes += q->ni.len_bytes;
+               q->ni.len_bytes = 0;
+               q->mq.tail = q->mq.head = NULL;
+       }
+       return 0;
+}
+
+static int
+prio_free_queue(struct dn_queue *q, int safe)
+{
+       int prio = q->fs->fs.par[0];
+       struct prio_si *si = (struct prio_si *)(q->_si + 1);
+
+       if (si->q_array[prio] == q) {
+               si->q_array[prio] = NULL;
+               __clear_bit(prio, &si->bitmap);
+       }
+       return 0;
+}
+
+
+static struct dn_alg prio_desc = {
+       _SI( .type = ) DN_SCHED_PRIO,
+       _SI( .name = ) "PRIO",
+       _SI( .flags = ) DN_MULTIQUEUE,
+
+       /* we need extra space in the si and the queue */
+       _SI( .schk_datalen = ) 0,
+       _SI( .si_datalen = ) sizeof(struct prio_si),
+       _SI( .q_datalen = ) 0,
+
+       _SI( .enqueue = ) prio_enqueue,
+       _SI( .dequeue = ) prio_dequeue,
+
+       _SI( .config = )  NULL,
+       _SI( .destroy = )  NULL,
+       _SI( .new_sched = ) prio_new_sched,
+       _SI( .free_sched = ) NULL,
+
+       _SI( .new_fsk = ) prio_new_fsk,
+       _SI( .free_fsk = )  NULL,
+
+       _SI( .new_queue = ) prio_new_queue,
+       _SI( .free_queue = ) prio_free_queue,
+};
+
+
+DECLARE_DNSCHED_MODULE(dn_prio, &prio_desc);
diff --git a/sys/netinet/ipfw/dn_sched_qfq.c b/sys/netinet/ipfw/dn_sched_qfq.c
new file mode 100644 (file)
index 0000000..eaf0478
--- /dev/null
@@ -0,0 +1,864 @@
+/*
+ * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id: dn_sched_qfq.c 11656 2012-08-07 08:39:06Z luigi $
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h>    /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h>            /* ipfw_rule_ref */
+#include <netinet/ip_fw.h>     /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#ifdef QFQ_DEBUG
+struct qfq_sched;
+static void dump_sched(struct qfq_sched *q, const char *msg);
+#define        NO(x)   x
+#else
+#define NO(x)
+#endif
+#define DN_SCHED_QFQ   4 // XXX Where?
+typedef        unsigned long   bitmap;
+
+/*
+ * bitmaps ops are critical. Some linux versions have __fls
+ * and the bitmap ops. Some machines have ffs
+ */
+#if defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24))
+int fls(unsigned int n)
+{
+       int i = 0;
+       for (i = 0; n > 0; n >>= 1, i++)
+               ;
+       return i;
+}
+#endif
+
+#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24))
+static inline unsigned long __fls(unsigned long word)
+{
+       return fls(word) - 1;
+}
+#endif
+
+#if !defined(_KERNEL) || !defined(__linux__)
+#ifdef QFQ_DEBUG
+int test_bit(int ix, bitmap *p)
+{
+       if (ix < 0 || ix > 31)
+               D("bad index %d", ix);
+       return (*p & (1UL<<ix)) != 0;
+}
+void __set_bit(int ix, bitmap *p)
+{
+       if (ix < 0 || ix > 31)
+               D("bad index %d", ix);
+       *p |= (1UL<<ix);
+}
+void __clear_bit(int ix, bitmap *p)
+{
+       if (ix < 0 || ix > 31)
+               D("bad index %d", ix);
+       *p &= ~(1UL<<ix);
+}
+#else /* !QFQ_DEBUG */
+/* XXX do we have fast version, or leave it to the compiler ? */
+#define test_bit(ix, pData)    (((*(pData)) & (1UL<<(ix))) != 0)
+#define __set_bit(ix, pData)   ((*(pData)) |= (1UL<<(ix)))
+#define __clear_bit(ix, pData) ((*(pData)) &= ~(1UL<<(ix)))
+#endif /* !QFQ_DEBUG */
+#endif /* !__linux__ */
+/* masks use 1UL: (1<<31) is a negative int that sign-extends in a long */
+#ifdef __MIPSEL__
+#define __clear_bit(ix, pData) ((*(pData)) &= ~(1UL<<(ix)))
+#endif
+
+/*-------------------------------------------*/
+/*
+
+Virtual time computations.
+
+S, F and V are all computed in fixed point arithmetic with
+FRAC_BITS decimal bits.
+
+   QFQ_MAX_INDEX is the maximum index allowed for a group. We need
+       one bit per index.
+   QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
+   The layout of the bits is as below:
+  
+                   [ MTU_SHIFT ][      FRAC_BITS    ]
+                   [ MAX_INDEX    ][ MIN_SLOT_SHIFT ]
+                                ^.__grp->index = 0
+                                *.__grp->slot_shift
+  
+   where MIN_SLOT_SHIFT is derived by difference from the others.
+
+The max group index corresponds to Lmax/w_min, where
+Lmax=1<<MTU_SHIFT, w_min = 1 .
+From this, and knowing how many groups (MAX_INDEX) we want,
+we can derive the shift corresponding to each group.
+
+Because we often need to compute
+       F = S + len/w_i  and V = V + len/wsum
+instead of storing w_i store the value
+       inv_w = (1<<FRAC_BITS)/w_i
+so we can do F = S + len * inv_w * wsum.
+We use W_TOT in the formulas so we can easily move between
+static and adaptive weight sum.
+
+The per-scheduler-instance data contain all the data structures
+for the scheduler: bitmaps and bucket lists.
+
+ */
+/*
+ * Maximum number of consecutive slots occupied by backlogged classes
+ * inside a group. This is approx lmax/lmin + 5.
+ * XXX check because it poses constraints on MAX_INDEX
+ */
+#define QFQ_MAX_SLOTS  32
+/*
+ * Shifts used for class<->group mapping. Class weights are
+ * in the range [1, QFQ_MAX_WEIGHT], we to map each class i to the
+ * group with the smallest index that can support the L_i / r_i
+ * configured for the class.
+ *
+ * grp->index is the index of the group; and grp->slot_shift
+ * is the shift for the corresponding (scaled) sigma_i.
+ *
+ * When computing the group index, we do (len<<FP_SHIFT)/weight,
+ * then compute an FLS (which is like a log2()), and if the result
+ * is below the MAX_INDEX region we use 0 (which is the same as
+ * using a larger len).
+ */
+#define QFQ_MAX_INDEX          19
+#define QFQ_MAX_WSHIFT         16      /* log2(max_weight) */
+
+#define        QFQ_MAX_WEIGHT          (1<<QFQ_MAX_WSHIFT)
+#define QFQ_MAX_WSUM           (2*QFQ_MAX_WEIGHT)
+//#define IWSUM        (q->i_wsum)
+#define IWSUM  ((1<<FRAC_BITS)/QFQ_MAX_WSUM)
+
+#define FRAC_BITS              30      /* fixed point arithmetic */
+#define ONE_FP                 (1UL << FRAC_BITS)
+
+#define QFQ_MTU_SHIFT          11      /* log2(max_len) */
+#define QFQ_MIN_SLOT_SHIFT     (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX)
+
+/*
+ * Possible group states, also indexes for the bitmaps array in
+ * struct qfq_queue. We rely on ER, IR, EB, IB being numbered 0..3
+ */
+enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
+
+struct qfq_group;
+/*
+ * additional queue info. Some of this info should come from
+ * the flowset, we copy them here for faster processing.
+ * This is an overlay of the struct dn_queue
+ */
+struct qfq_class {
+       struct dn_queue _q;
+       uint64_t S, F;          /* flow timestamps (exact) */
+       struct qfq_class *next; /* Link for the slot list. */
+
+       /* group we belong to. In principle we would need the index,
+        * which is log_2(lmax/weight), but we never reference it
+        * directly, only the group.
+        */
+       struct qfq_group *grp;
+
+       /* these are copied from the flowset. */
+       uint32_t        inv_w;  /* ONE_FP/weight */
+       uint32_t        lmax;   /* Max packet size for this flow. */
+};
+
+/* Group descriptor, see the paper for details.
+ * Basically this contains the bucket lists
+ */
+struct qfq_group {
+       uint64_t S, F;                  /* group timestamps (approx). */
+       unsigned int slot_shift;        /* Slot shift. */
+       unsigned int index;             /* Group index. */
+       unsigned int front;             /* Index of the front slot. */
+       bitmap full_slots;              /* non-empty slots */
+
+       /* Array of lists of active classes. */
+       struct qfq_class *slots[QFQ_MAX_SLOTS];
+};
+
+/* scheduler instance descriptor. */
+struct qfq_sched {
+       uint64_t        V;              /* Precise virtual time. */
+       uint32_t        wsum;           /* weight sum */
+       NO(uint32_t     i_wsum;         /* ONE_FP/w_sum */
+       uint32_t        _queued;        /* debugging */
+       uint32_t        loops;  /* debugging */)
+       bitmap bitmaps[QFQ_MAX_STATE];  /* Group bitmaps. */
+       struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
+};
+
+/*---- support functions ----------------------------*/
+
+/* Generic comparison function, handling wraparound. */
+static inline int qfq_gt(uint64_t a, uint64_t b)
+{
+       return (int64_t)(a - b) > 0;
+}
+
+/* Round a precise timestamp to its slotted value. */
+static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift)
+{
+       return ts & ~((1ULL << shift) - 1);
+}
+
+/* return the pointer to the group with lowest index in the bitmap */
+static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
+                                       unsigned long bitmap)
+{
+       int index = ffs(bitmap) - 1; // zero-based
+       return &q->groups[index];
+}
+
+/*
+ * Calculate a flow index, given its weight and maximum packet length.
+ * index = log_2(maxlen/weight) but we need to apply the scaling.
+ * This is used only once at flow creation.
+ */
+static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen)
+{
+       uint64_t slot_size = (uint64_t)maxlen *inv_w;
+       unsigned long size_map;
+       int index = 0;
+
+       size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT);
+       if (!size_map)
+               goto out;
+
+       index = __fls(size_map) + 1;    // basically a log_2()
+       index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)));
+
+       if (index < 0)
+               index = 0;
+
+out:
+       ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index);
+       return index;
+}
+/*---- end support functions ----*/
+
+/*-------- API calls --------------------------------*/
+/*
+ * Validate and copy parameters from flowset.
+ */
+static int
+qfq_new_queue(struct dn_queue *_q)
+{
+       struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
+       struct qfq_class *cl = (struct qfq_class *)_q;
+       int i;
+       uint32_t w;     /* approximated weight */
+
+       /* import parameters from the flowset. They should be correct
+        * already.
+        */
+       w = _q->fs->fs.par[0];
+       cl->lmax = _q->fs->fs.par[1];
+       if (!w || w > QFQ_MAX_WEIGHT) {
+               w = 1;
+               D("rounding weight to 1");
+       }
+       cl->inv_w = ONE_FP/w;
+       w = ONE_FP/cl->inv_w;   
+       if (q->wsum + w > QFQ_MAX_WSUM)
+               return EINVAL;
+
+       i = qfq_calc_index(cl->inv_w, cl->lmax);
+       cl->grp = &q->groups[i];
+       q->wsum += w;
+       // XXX cl->S = q->V; ?
+       // XXX compute q->i_wsum
+       return 0;
+}
+
+/* remove an empty queue */
+static int
+qfq_free_queue(struct dn_queue *_q, int safe)
+{
+       struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
+       struct qfq_class *cl = (struct qfq_class *)_q;
+       if (cl->inv_w) {
+               q->wsum -= ONE_FP/cl->inv_w;
+               cl->inv_w = 0; /* reset weight to avoid run twice */
+       }
+       return 0;
+}
+
+/*
+ * Return bitmap with all the bits below position 'from' cleared,
+ * to mimic what would be ffs_from() when the result is searched
+ * for its first set bit.
+ */
+static inline unsigned long
+mask_from(unsigned long bitmap, int from)
+{
+       unsigned long low_bits = (1UL << from) - 1;
+
+       return bitmap & ~low_bits;
+}
+
+/*
+ * Compute the state of group grp; callers use the result as an
+ * index into q->bitmaps[].
+ *
+ * The state computation relies on ER=0, IR=1, EB=2, IB=3
+ * First compute eligibility comparing grp->S, q->V,
+ * then check if someone is blocking us and possibly add EB
+ */
+static inline unsigned int
+qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp)
+{
+       /* if S > V we are not eligible */
+       unsigned int state = qfq_gt(grp->S, q->V);
+       /* groups in ER with index not below ours */
+       unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
+       struct qfq_group *next;
+
+       if (mask) {
+               /* presumably the lowest-index group in mask; see qfq_ffs() */
+               next = qfq_ffs(q, mask);
+               /* blocked when our finish time is after that group's */
+               if (qfq_gt(grp->F, next->F))
+                       state |= EB;
+       }
+
+       return state;
+}
+
+/*
+ * Transfer the groups selected by mask from set src to set dst:
+ * their bits are added to q->bitmaps[dst] and cleared in
+ * q->bitmaps[src].
+ * XXX we should make sure that src != dst
+ */
+static inline void
+qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst)
+{
+       unsigned long moving = q->bitmaps[src] & mask;
+
+       q->bitmaps[dst] |= moving;
+       q->bitmaps[src] &= ~mask;
+}
+
+/*
+ * Unblock the groups with index below 'index' after the group at
+ * 'index', whose finish time was old_finish, changed or went away.
+ * Nothing is done if a group in ER with a higher index has a finish
+ * time not greater than old_finish; otherwise the lower groups are
+ * moved from EB to ER and from IB to IR.
+ */
+static inline void
+qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish)
+{
+       /* groups in ER with index strictly above ours */
+       unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
+       struct qfq_group *next;
+
+       if (mask) {
+               next = qfq_ffs(q, mask);
+               if (!qfq_gt(next->F, old_finish))
+                       return;
+       }
+
+       /* all the groups with index below ours */
+       mask = (1UL << index) - 1;
+       qfq_move_groups(q, mask, EB, ER);
+       qfq_move_groups(q, mask, IB, IR);
+}
+
+/*
+ * Move groups from the ineligible sets to the eligible ones after the
+ * virtual time changed from old_V to q->V. Both values are reduced to
+ * slot numbers (shift by QFQ_MIN_SLOT_SHIFT); if they differ, all the
+ * groups with index up to the highest differing bit are moved from
+ * IR to ER and from IB to EB.
+ *
+ * A possible alternative formulation, perhaps:
+ *
+       old_V ^= q->V;
+       old_V >>= QFQ_MIN_SLOT_SHIFT;
+       if (old_V) {
+               ...
+       }
+ *
+ */
+static inline void
+qfq_make_eligible(struct qfq_sched *q, uint64_t old_V)
+{
+       unsigned long mask, vslot, old_vslot;
+
+       vslot = q->V >> QFQ_MIN_SLOT_SHIFT;
+       old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT;
+
+       if (vslot != old_vslot) {
+               /* bits 0 .. __fls(vslot ^ old_vslot), inclusive */
+               mask = (2UL << (__fls(vslot ^ old_vslot))) - 1;
+               qfq_move_groups(q, mask, IR, ER);
+               qfq_move_groups(q, mask, IB, EB);
+       }
+}
+
+/*
+ * Insert class cl in the bucket of group grp that corresponds to
+ * start time roundedS, and mark that bucket as non-empty.
+ *
+ * XXX we should make sure that slot becomes less than 32.
+ * This is guaranteed by the input values.
+ * roundedS is always cl->S rounded on grp->slot_shift bits.
+ */
+static inline void
+qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS)
+{
+       /* distance from the group's start time, in slots */
+       uint64_t slot = (roundedS - grp->S) >> grp->slot_shift;
+       /* position in the circular slots[] array, which starts at front */
+       unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS;
+
+       /* push cl at the head of the bucket's list */
+       cl->next = grp->slots[i];
+       grp->slots[i] = cl;
+       /* full_slots is indexed by distance from front, not by i */
+       __set_bit(slot, &grp->full_slots);
+}
+
+/*
+ * Unlink the first class from the front bucket of grp; when the
+ * bucket becomes empty, clear bit 0 of full_slots.
+ * The front bucket must not be empty.
+ */
+static inline void
+qfq_front_slot_remove(struct qfq_group *grp)
+{
+       struct qfq_class *rest = grp->slots[grp->front]->next;
+
+       grp->slots[grp->front] = rest;
+       if (rest == NULL)
+               __clear_bit(0, &grp->full_slots);
+}
+
+/*
+ * Returns the first full queue in a group. As a side effect,
+ * adjust the bucket list so the first non-empty bucket is at
+ * position 0 in full_slots.
+ * Returns NULL when no bucket of the group is marked full.
+ */
+static inline struct qfq_class *
+qfq_slot_scan(struct qfq_group *grp)
+{
+       int i;
+
+       ND("grp %d full %x", grp->index, grp->full_slots);
+       if (!grp->full_slots)
+               return NULL;
+
+       i = ffs(grp->full_slots) - 1; // zero-based
+       if (i > 0) {
+               /* skip the i empty buckets: advance front, realign the map */
+               grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
+               grp->full_slots >>= i;
+       }
+
+       return grp->slots[grp->front];
+}
+
+/*
+ * adjust the bucket list. When the start time of a group decreases,
+ * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
+ * move the objects. The mask of occupied slots must be shifted
+ * because we use ffs() to find the first non-empty slot.
+ * This covers decreases in the group's start time, but what about
+ * increases of the start time ?
+ * Here too we should make sure that i is less than 32
+ *
+ * roundedS is the new, lower start time; grp->S itself is not
+ * written here. The q argument is not used.
+ */
+static inline void
+qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS)
+{
+       /* number of slots between the new and the current start time */
+       unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
+
+       grp->full_slots <<= i;
+       grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
+}
+
+
+/*
+ * Promote ineligible groups after the virtual time moved on from old_V.
+ * When no group is in ER, V is first pushed forward to the start time
+ * of the group qfq_ffs() returns for the ineligible set, if that start
+ * time is ahead of V. Nothing is done if no group is in IR or IB.
+ */
+static inline void
+qfq_update_eligible(struct qfq_sched *q, uint64_t old_V)
+{
+       bitmap waiting = q->bitmaps[IR] | q->bitmaps[IB];
+
+       if (!waiting)
+               return;         /* nothing to promote */
+
+       if (!q->bitmaps[ER]) {
+               struct qfq_group *first = qfq_ffs(q, waiting);
+
+               if (qfq_gt(first->S, q->V))
+                       q->V = first->S;
+       }
+       qfq_make_eligible(q, old_V);
+}
+
+/*
+ * Updates the class, returns true if also the group needs to be updated.
+ *
+ * Called after a packet was extracted from cl, which sits in the front
+ * bucket of grp. The class start time becomes its previous finish
+ * time. If the class has no more packets it is removed from the bucket;
+ * otherwise its finish time is recomputed from the next packet and the
+ * class is moved to another bucket unless its rounded start time still
+ * equals grp->S (in which case 0 is returned).
+ * The q argument is not used.
+ */
+static inline int
+qfq_update_class(struct qfq_sched *q, struct qfq_group *grp,
+           struct qfq_class *cl)
+{
+
+       cl->S = cl->F;
+       if (cl->_q.mq.head == NULL)  {
+               /* queue drained: the class leaves the group */
+               qfq_front_slot_remove(grp);
+       } else {
+               unsigned int len;
+               uint64_t roundedS;
+
+               len = cl->_q.mq.head->m_pkthdr.len;
+               cl->F = cl->S + (uint64_t)len * cl->inv_w;
+               roundedS = qfq_round_down(cl->S, grp->slot_shift);
+               if (roundedS == grp->S)
+                       return 0;       /* same bucket, group unchanged */
+
+               /* relocate the class to the bucket of its new start time */
+               qfq_front_slot_remove(grp);
+               qfq_slot_insert(grp, cl, roundedS);
+       }
+       return 1;
+}
+
+/*
+ * Scheduler dequeue callback: return the next packet to send, or NULL
+ * when no group is in ER.
+ *
+ * The packet comes from the class at the head of the front bucket of
+ * the group qfq_ffs() selects in ER. After the extraction the virtual
+ * time V advances by len * IWSUM, then class, group and the four group
+ * sets are brought up to date.
+ */
+static struct mbuf *
+qfq_dequeue(struct dn_sch_inst *si)
+{
+       /* the QFQ private data follows the generic instance */
+       struct qfq_sched *q = (struct qfq_sched *)(si + 1);
+       struct qfq_group *grp;
+       struct qfq_class *cl;
+       struct mbuf *m;
+       uint64_t old_V;
+
+       NO(q->loops++;)
+       if (!q->bitmaps[ER]) {
+               NO(if (q->queued)
+                       dump_sched(q, "start dequeue");)
+               return NULL;
+       }
+
+       grp = qfq_ffs(q, q->bitmaps[ER]);
+
+       cl = grp->slots[grp->front];
+       /* extract from the first bucket in the bucket list */
+       m = dn_dequeue(&cl->_q);
+
+       if (!m) {
+               D("BUG/* non-workconserving leaf */");
+               return NULL;
+       }
+       NO(q->queued--;)
+       /* remember V before the update, qfq_update_eligible() needs it */
+       old_V = q->V;
+       q->V += (uint64_t)m->m_pkthdr.len * IWSUM;
+       ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V);
+
+       if (qfq_update_class(q, grp, cl)) {
+               /* the front bucket changed: recompute the group state */
+               uint64_t old_F = grp->F;
+               cl = qfq_slot_scan(grp);
+               if (!cl) { /* group gone, remove from ER */
+                       __clear_bit(grp->index, &q->bitmaps[ER]);
+                       // grp->S = grp->F + 1; // XXX debugging only
+               } else {
+                       uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift);
+                       unsigned int s;
+
+                       /* same start time: the group stays where it is */
+                       if (grp->S == roundedS)
+                               goto skip_unblock;
+                       grp->S = roundedS;
+                       grp->F = roundedS + (2ULL << grp->slot_shift);
+                       /* remove from ER and put in the new set */
+                       __clear_bit(grp->index, &q->bitmaps[ER]);
+                       s = qfq_calc_state(q, grp);
+                       __set_bit(grp->index, &q->bitmaps[s]);
+               }
+               /* we need to unblock even if the group has gone away */
+               qfq_unblock_groups(q, grp->index, old_F);
+       }
+
+skip_unblock:
+       qfq_update_eligible(q, old_V);
+       NO(if (!q->bitmaps[ER] && q->queued)
+               dump_sched(q, "end dequeue");)
+
+       return m;
+}
+
+/*
+ * Assign a reasonable start time for a new flow k in group i.
+ * Admissible values for \hat(F) are multiples of \sigma_i
+ * no greater than V+\sigma_i . Larger values mean that
+ * we had a wraparound so we consider the timestamp to be stale.
+ *
+ * If F is not stale and F >= V then we set S = F.
+ * Otherwise we should assign S = V, but this may violate
+ * the ordering in ER. So, if we have groups in ER, set S to
+ * the F_j of the first group j which would be blocking us.
+ * We are guaranteed not to move S backward because
+ * otherwise our group i would still be blocked.
+ *
+ * q is the scheduler instance (V and the ER set are read); cl is the
+ * class being activated: cl->F and cl->grp are read, cl->S is written.
+ */
+static inline void
+qfq_update_start(struct qfq_sched *q, struct qfq_class *cl)
+{
+       unsigned long mask;
+       uint64_t limit, roundedF;
+       int slot_shift = cl->grp->slot_shift;
+
+       roundedF = qfq_round_down(cl->F, slot_shift);
+       /*
+        * Timestamps are 64 bits wide, so the slot size \sigma_i must be
+        * built in 64 bits too, in the same way grp->F is computed with
+        * 2ULL << slot_shift. An unsigned long constant would make the
+        * shift 32 bits wide where unsigned long is a 32-bit type.
+        */
+       limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
+
+       if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) {
+               /* timestamp was stale */
+               mask = mask_from(q->bitmaps[ER], cl->grp->index);
+               if (mask) {
+                       struct qfq_group *next = qfq_ffs(q, mask);
+                       if (qfq_gt(roundedF, next->F)) {
+                               cl->S = next->F;
+                               return;
+                       }
+               }
+               cl->S = q->V;
+       } else { /* timestamp is not stale */
+               cl->S = cl->F;
+       }
+}
+
+/*
+ * Scheduler enqueue callback: add packet m to queue _q.
+ * Returns 1 if the packet was dropped by dn_enqueue(), 0 otherwise.
+ *
+ * When m is already the head of the queue it is not enqueued again.
+ * The scheduling state is only touched when m ends up at the head of
+ * the queue, i.e. the class was idle: the class gets new start and
+ * finish times and is inserted in a bucket of its group, and the
+ * group state is recomputed if its start time changes.
+ */
+static int
+qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m)
+{
+       /* the QFQ private data follows the generic instance */
+       struct qfq_sched *q = (struct qfq_sched *)(si + 1);
+       struct qfq_group *grp;
+       struct qfq_class *cl = (struct qfq_class *)_q;
+       uint64_t roundedS;
+       int s;
+
+       NO(q->loops++;)
+       DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len,
+               _q, cl->inv_w, cl->grp->index);
+       /* XXX verify that the packet obeys the parameters */
+       if (m != _q->mq.head) {
+               if (dn_enqueue(_q, m, 0)) /* packet was dropped */
+                       return 1;
+               NO(q->queued++;)
+               /* queue was already backlogged, no scheduling change */
+               if (m != _q->mq.head)
+                       return 0;
+       }
+       /* If we reach this point, queue q was idle */
+       grp = cl->grp;
+       qfq_update_start(q, cl); /* adjust start time */
+       /* compute new finish time and rounded start. */
+       cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w;
+       roundedS = qfq_round_down(cl->S, grp->slot_shift);
+
+       /*
+        * insert cl in the correct bucket.
+        * If cl->S >= grp->S we don't need to adjust the
+        * bucket list and simply go to the insertion phase.
+        * Otherwise grp->S is decreasing, we must make room
+        * in the bucket list, and also recompute the group state.
+        * Finally, if there were no flows in this group and nobody
+        * was in ER make sure to adjust V.
+        */
+       if (grp->full_slots) {
+               if (!qfq_gt(grp->S, cl->S))
+                       goto skip_update;
+               /* create a slot for this cl->S */
+               qfq_slot_rotate(q, grp, roundedS);
+               /* group was surely ineligible, remove */
+               __clear_bit(grp->index, &q->bitmaps[IR]);
+               __clear_bit(grp->index, &q->bitmaps[IB]);
+       } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V))
+               q->V = roundedS;
+
+       /* new group timestamps, then file the group in the matching set */
+       grp->S = roundedS;
+       grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i
+       s = qfq_calc_state(q, grp);
+       __set_bit(grp->index, &q->bitmaps[s]);
+       ND("new state %d 0x%x", s, q->bitmaps[s]);
+       ND("S %llx F %llx V %llx", cl->S, cl->F, q->V);
+skip_update:
+       qfq_slot_insert(grp, cl, roundedS);
+
+       return 0;
+}
+
+
+#if 0
+/*
+ * Unlink class cl from the bucket it occupies in grp, clearing the
+ * bucket's bit in full_slots when the bucket becomes empty.
+ * pprev is the address of the pointer that refers to cl in the bucket
+ * list; the code that would look it up when pprev is NULL is only
+ * compiled with "notyet". The q argument is not used.
+ * NOTE: this function is currently compiled out (#if 0).
+ */
+static inline void
+qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
+       struct qfq_class *cl, struct qfq_class **pprev)
+{
+       unsigned int i, offset;
+       uint64_t roundedS;
+
+       /* same bucket computation as in qfq_slot_insert() */
+       roundedS = qfq_round_down(cl->S, grp->slot_shift);
+       offset = (roundedS - grp->S) >> grp->slot_shift;
+       i = (grp->front + offset) % QFQ_MAX_SLOTS;
+
+#ifdef notyet
+       if (!pprev) {
+               pprev = &grp->slots[i];
+               while (*pprev && *pprev != cl)
+                       pprev = &(*pprev)->next;
+       }
+#endif
+
+       *pprev = cl->next;
+       if (!grp->slots[i])
+               __clear_bit(offset, &grp->full_slots);
+}
+
+/*
+ * called to forcibly destroy a queue.
+ * If the queue is not in the front bucket, or if it has
+ * other queues in the front bucket, we can simply remove
+ * the queue with no other side effects.
+ * Otherwise we must propagate the event up.
+ * XXX description to be completed.
+ *
+ * NOTE: this function is currently compiled out (#if 0).
+ * NOTE(review): the group is looked up through cl->index while the
+ * live code reaches it through cl->grp; confirm before enabling.
+ */
+static void
+qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl,
+                                struct qfq_class **pprev)
+{
+       struct qfq_group *grp = &q->groups[cl->index];
+       unsigned long mask;
+       uint64_t roundedS;
+       int s;
+
+       cl->F = cl->S;  // not needed if the class goes away.
+       qfq_slot_remove(q, grp, cl, pprev);
+
+       if (!grp->full_slots) {
+               /* nothing left in the group, remove from all sets.
+                * Do ER last because if we were blocking other groups
+                * we must unblock them.
+                */
+               __clear_bit(grp->index, &q->bitmaps[IR]);
+               __clear_bit(grp->index, &q->bitmaps[EB]);
+               __clear_bit(grp->index, &q->bitmaps[IB]);
+
+               /* taken when no other group in ER has an index above grp's */
+               if (test_bit(grp->index, &q->bitmaps[ER]) &&
+                   !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
+                       mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
+                       if (mask)
+                               mask = ~((1UL << __fls(mask)) - 1);
+                       else
+                               mask = ~0UL;
+                       qfq_move_groups(q, mask, EB, ER);
+                       qfq_move_groups(q, mask, IB, IR);
+               }
+               __clear_bit(grp->index, &q->bitmaps[ER]);
+       } else if (!grp->slots[grp->front]) {
+               /* front bucket emptied: the group may get a new start time */
+               cl = qfq_slot_scan(grp);
+               roundedS = qfq_round_down(cl->S, grp->slot_shift);
+               if (grp->S != roundedS) {
+                       __clear_bit(grp->index, &q->bitmaps[ER]);
+                       __clear_bit(grp->index, &q->bitmaps[IR]);
+                       __clear_bit(grp->index, &q->bitmaps[EB]);
+                       __clear_bit(grp->index, &q->bitmaps[IB]);
+                       grp->S = roundedS;
+                       grp->F = roundedS + (2ULL << grp->slot_shift);
+                       s = qfq_calc_state(q, grp);
+                       __set_bit(grp->index, &q->bitmaps[s]);
+               }
+       }
+       qfq_update_eligible(q, q->V);
+}
+#endif
+
+/*
+ * Flowset callback: bound the two parameters QFQ reads from a flowset,
+ * par[0] (the weight) and par[1] (the maximum packet length).
+ * Always returns 0.
+ */
+static int
+qfq_new_fsk(struct dn_fsk *f)
+{
+       /* arguments after the variable are presumably default, min, max */
+       ipdn_bound_var(f->fs.par + 0, 1, 1, QFQ_MAX_WEIGHT, "qfq weight");
+       ipdn_bound_var(f->fs.par + 1, 1500, 1, 2000, "qfq maxlen");
+       ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]);
+
+       return 0;
+}
+
+/*
+ * Initialize a new scheduler instance: number the groups and give
+ * each one its slot_shift, which grows by one with the group index
+ * and reaches QFQ_MTU_SHIFT + FRAC_BITS for the last group.
+ * Always returns 0.
+ */
+static int
+qfq_new_sched(struct dn_sch_inst *si)
+{
+       struct qfq_sched *q = (struct qfq_sched *)(si + 1);
+       int idx;
+
+       for (idx = 0; idx <= QFQ_MAX_INDEX; idx++) {
+               q->groups[idx].index = idx;
+               q->groups[idx].slot_shift =
+                   QFQ_MTU_SHIFT + FRAC_BITS - (QFQ_MAX_INDEX - idx);
+       }
+       return 0;
+}
+
+/*
+ * QFQ scheduler descriptor: type, name, sizes of the private data
+ * and the callbacks implemented in this file.
+ * The initializers are written through _SI(), so the entries are kept
+ * in this order.
+ */
+static struct dn_alg qfq_desc = {
+       _SI( .type = ) DN_SCHED_QFQ,
+       _SI( .name = ) "QFQ",
+       _SI( .flags = ) DN_MULTIQUEUE,
+
+       /* no scheduler-wide data; qfq_sched is accessed at (si + 1) */
+       _SI( .schk_datalen = ) 0,
+       _SI( .si_datalen = ) sizeof(struct qfq_sched),
+       /* only the part of qfq_class that extends dn_queue */
+       _SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue),
+
+       _SI( .enqueue = ) qfq_enqueue,
+       _SI( .dequeue = ) qfq_dequeue,
+
+       _SI( .config = )  NULL,
+       _SI( .destroy = )  NULL,
+       _SI( .new_sched = ) qfq_new_sched,
+       _SI( .free_sched = )  NULL,
+       _SI( .new_fsk = ) qfq_new_fsk,
+       _SI( .free_fsk = )  NULL,
+       _SI( .new_queue = ) qfq_new_queue,
+       _SI( .free_queue = ) qfq_free_queue,
+};
+
+DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc);
+
+#ifdef QFQ_DEBUG
+/*
+ * Debugging aid: for every group whose bit is set in mask print the
+ * non-empty buckets, the full_slots map and the S and F timestamps.
+ */
+static void
+dump_groups(struct qfq_sched *q, uint32_t mask)
+{
+       int gi;
+
+       for (gi = 0; gi <= QFQ_MAX_INDEX; gi++) {
+               struct qfq_group *g;
+               int slot;
+
+               if ((mask & (1<<gi)) == 0)
+                       continue;       /* group not requested */
+
+               g = &q->groups[gi];
+               for (slot = 0; slot < QFQ_MAX_SLOTS; slot++) {
+                       if (g->slots[slot] != NULL)
+                               D("    bucket %d %p", slot, g->slots[slot]);
+               }
+               D("full_slots 0x%x", g->full_slots);
+               D("        %2d S 0x%20llx F 0x%llx %c", gi,
+                       g->S, g->F,
+                       mask & (1<<gi) ? '1' : '0');
+       }
+}
+
+/*
+ * Debugging aid: print msg, the four group sets and then every group
+ * of the scheduler (through dump_groups() with a full mask).
+ */
+static void
+dump_sched(struct qfq_sched *q, const char *msg)
+{
+       D("--- in %s: ---", msg);
+       ND("loops %d queued %d V 0x%llx", q->loops, q->queued, q->V);
+       D("    ER 0x%08x", q->bitmaps[ER]);
+       D("    EB 0x%08x", q->bitmaps[EB]);
+       D("    IR 0x%08x", q->bitmaps[IR]);
+       D("    IB 0x%08x", q->bitmaps[IB]);
+       dump_groups(q, 0xffffffff);
+}
+#endif /* QFQ_DEBUG */
diff --git a/sys/netinet/ipfw/dn_sched_rr.c b/sys/netinet/ipfw/dn_sched_rr.c
new file mode 100644 (file)
index 0000000..2a93746
--- /dev/null
@@ -0,0 +1,310 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id: dn_sched_rr.c 11480 2012-07-31 08:02:00Z luigi $
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h>    /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h>            /* ipfw_rule_ref */
+#include <netinet/ip_fw.h>     /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#define DN_SCHED_RR    3 // XXX Where?
+
+/*
+ * Per-queue state of the round robin scheduler. The generic dn_queue
+ * comes first because the code casts struct dn_queue pointers to
+ * struct rr_queue pointers.
+ */
+struct rr_queue {
+       struct dn_queue q;              /* Standard queue */
+       int status;                     /* 1: queue is in the list */
+       int credit;                     /* Bytes the queue may still send */
+       int quantum;                    /* Bytes added to credit: par[0] * par[1] */
+       struct rr_queue *qnext;         /* Next queue in the circular list */
+};
+
+/* struct rr_schk contains global config parameters
+ * and is right after dn_schk.
+ * The values are set in rr_config() and used in rr_new_fsk() to
+ * bound the quantum of a flowset.
+ */
+struct rr_schk {
+       int min_q;              /* Min quantum */
+       int max_q;              /* Max quantum */
+       int q_bytes;            /* Default quantum, in bytes */
+};
+
+/* per-instance round robin list, right after dn_sch_inst.
+ * The list is circular: the qnext of tail points back to head.
+ * Both pointers are NULL when the list is empty.
+ */
+struct rr_si {
+       struct rr_queue *head, *tail;   /* Current queue and the one before it */
+};
+
+/*
+ * Link queue q at the tail of the circular list of si, mark it as
+ * being in the list and give it a full quantum of credit.
+ */
+static inline void
+rr_append(struct rr_queue *q, struct rr_si *si)
+{
+       if (si->head != NULL)
+               si->tail->qnext = q;    /* goes after the current tail */
+       else
+               si->head = q;           /* first element of the list */
+
+       si->tail = q;
+       q->qnext = si->head;            /* close the ring */
+
+       q->credit = q->quantum;
+       q->status = 1;                  /* now in the rr list */
+}
+
+/*
+ * Take the head queue out of the circular list and clear its status.
+ * Does nothing on an empty list.
+ */
+static inline void
+rr_remove_head(struct rr_si *si)
+{
+       struct rr_queue *old = si->head;
+
+       if (old == NULL)
+               return;
+
+       old->status = 0;
+       if (old != si->tail) {
+               /* the successor becomes the head, the tail points to it */
+               si->head = old->qnext;
+               si->tail->qnext = si->head;
+       } else {
+               /* it was the only element */
+               si->head = si->tail = NULL;
+       }
+}
+
+/* Remove a queue from the circular list.
+ * Does nothing when the queue is not marked as being in the list.
+ * The search stops after one lap, so a queue that is marked as
+ * listed but is not on the ring cannot make it spin forever.
+ * XXX see if it can be merged with remove_queue()
+ */
+static inline void
+remove_queue_q(struct rr_queue *q, struct rr_si *si)
+{
+       struct rr_queue *prev;
+
+       if (q->status != 1)
+               return;
+       if (q == si->head) {
+               rr_remove_head(si);
+               return;
+       }
+
+       /* look for the element that precedes q */
+       for (prev = si->head; prev; prev = prev->qnext) {
+               if (prev->qnext == q) {
+                       prev->qnext = q->qnext;
+                       if (q == si->tail)
+                               si->tail = prev;
+                       q->status = 0;
+                       break;
+               }
+               /* back at the head: the whole ring has been visited */
+               if (prev->qnext == si->head)
+                       break;
+       }
+}
+
+
+/*
+ * Rotate the circular list by one position: head and tail both move
+ * to their successors. Does nothing on an empty list.
+ */
+static inline void
+next_pointer(struct rr_si *si)
+{
+       if (si->head != NULL) {
+               si->head = si->head->qnext;
+               si->tail = si->tail->qnext;
+       }
+}
+
+/*
+ * Scheduler enqueue callback: add packet m to queue q and, if the
+ * queue was idle, put the queue in the round robin list.
+ * When m is already the head of the queue it is not enqueued again.
+ * Returns 1 if the packet was dropped, 0 otherwise.
+ */
+static int
+rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
+{
+       struct rr_queue *rrq = (struct rr_queue *)q;
+
+       if (m != q->mq.head) {
+               if (dn_enqueue(q, m, 0) != 0)
+                       return 1;       /* packet was dropped */
+               if (m != q->mq.head)
+                       return 0;       /* queue was already backlogged */
+       }
+
+       /* The queue was idle: list it unless it is already there */
+       if (rrq->status != 1)
+               rr_append(rrq, (struct rr_si *)(_si + 1));
+
+       return 0;
+}
+
+/*
+ * Scheduler dequeue callback: return the next packet according to the
+ * round robin discipline, or NULL when the list is empty.
+ *
+ * Empty queues found at the head are dropped from the list. A queue
+ * whose first packet is larger than its credit gets one more quantum
+ * and the turn passes to the next queue; otherwise the packet length
+ * is charged to the credit and the packet is extracted.
+ */
+static struct mbuf *
+rr_dequeue(struct dn_sch_inst *_si)
+{
+       /* Access scheduler instance private data */
+       struct rr_si *si = (struct rr_si *)(_si + 1);
+       struct rr_queue *rrq;
+       uint64_t len;
+
+       for (rrq = si->head; rrq != NULL; rrq = si->head) {
+               struct mbuf *m = rrq->q.mq.head;
+
+               if (m == NULL) {
+                       /* nothing to send here, forget the queue */
+                       rr_remove_head(si);
+                       continue;
+               }
+
+               len = m->m_pkthdr.len;
+               if (len <= rrq->credit) {
+                       /* the packet fits in the remaining credit */
+                       rrq->credit -= len;
+                       return dn_dequeue(&rrq->q);
+               }
+
+               /* Packet too big: recharge and try the next queue */
+               rrq->credit += rrq->quantum;
+               next_pointer(si);
+       }
+
+       /* no packet to dequeue */
+       return NULL;
+}
+
+/*
+ * Scheduler configuration callback: store the quantum limits in the
+ * rr_schk that follows _schk. The values are fixed: quantum between
+ * 64 and 2048 bytes, 1500 by default. Always returns 0.
+ */
+static int
+rr_config(struct dn_schk *_schk)
+{
+       struct rr_schk *cfg = (struct rr_schk *)(_schk + 1);
+
+       ND("called");
+
+       cfg->q_bytes = 1500;    /* default quantum */
+       cfg->max_q = 2048;
+       cfg->min_q = 64;
+
+       return 0;
+}
+
+/*
+ * Initialize a new scheduler instance: the round robin list starts
+ * empty. Always returns 0.
+ */
+static int
+rr_new_sched(struct dn_sch_inst *_si)
+{
+       struct rr_si *si = (struct rr_si *)(_si + 1);
+
+       ND("called");
+       si->tail = NULL;
+       si->head = NULL;
+
+       return 0;
+}
+
+/*
+ * Scheduler instance destructor. The instance holds no resources of
+ * its own, so there is nothing to release. Always returns 0.
+ */
+static int
+rr_free_sched(struct dn_sch_inst *_si)
+{
+       (void)_si;      /* not needed */
+       ND("called");
+       return 0;
+}
+
+/*
+ * Flowset callback: bound the two parameters RR reads from a flowset.
+ * par[0] is the weight, par[1] is the quantum step; the limits for the
+ * latter come from the rr_schk that follows the scheduler.
+ * Always returns 0.
+ */
+static int
+rr_new_fsk(struct dn_fsk *fs)
+{
+       struct rr_schk *cfg = (struct rr_schk *)(fs->sched + 1);
+
+       ipdn_bound_var(&fs->fs.par[0], 1, 1, 65536, "RR weight");
+       ipdn_bound_var(&fs->fs.par[1], cfg->q_bytes, cfg->min_q,
+               cfg->max_q, "RR quantum");
+
+       return 0;
+}
+
+/*
+ * Queue constructor: compute the quantum as weight times quantum step
+ * (par[0] * par[1] of the flowset) and start with a full credit.
+ * A queue that already holds packets is put in the round robin list
+ * right away. Always returns 0.
+ */
+static int
+rr_new_queue(struct dn_queue *_q)
+{
+       struct rr_queue *q = (struct rr_queue *)_q;
+
+       _q->ni.oid.subtype = DN_SCHED_RR;
+
+       q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1];
+       ND("called, q->quantum %d", q->quantum);
+       q->status = 0;
+       q->credit = q->quantum;
+
+       if (_q->mq.head == NULL)
+               return 0;       /* empty queue, listed on first enqueue */
+
+       /* Queue NOT empty, insert in the queue list */
+       rr_append(q, (struct rr_si *)(_q->_si + 1));
+       return 0;
+}
+
+/*
+ * Queue destructor. With 'safe' set nothing is changed and the queue
+ * status is returned, so the result is 0 only for a queue that is not
+ * in the round robin list. Otherwise the queue is taken out of the
+ * list if needed and 0 is returned.
+ */
+static int
+rr_free_queue(struct dn_queue *_q, int safe)
+{
+       struct rr_queue *q = (struct rr_queue *)_q;
+
+       ND("called");
+       if (safe)       /* Delete only if status == 0 */
+               return q->status;
+
+       if (q->status != 1)
+               return 0;       /* not in the list */
+
+       remove_queue_q(q, (struct rr_si *)(_q->_si + 1));
+       return 0;
+}
+
+/*
+ * RR scheduler descriptor
+ * contains the type of the scheduler, the name, the size of the
+ * structures and function pointers.
+ * The initializers are written through _SI(), so the entries are kept
+ * in this order.
+ */
+static struct dn_alg rr_desc = {
+       _SI( .type = ) DN_SCHED_RR,
+       _SI( .name = ) "RR",
+       _SI( .flags = ) DN_MULTIQUEUE,
+
+       /*
+        * rr_config() writes a struct rr_schk right after dn_schk and
+        * rr_new_fsk() reads it back, so room for it must be requested
+        */
+       _SI( .schk_datalen = ) sizeof(struct rr_schk),
+       _SI( .si_datalen = ) sizeof(struct rr_si),
+       /* only the part of rr_queue that extends dn_queue */
+       _SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue),
+
+       _SI( .enqueue = ) rr_enqueue,
+       _SI( .dequeue = ) rr_dequeue,
+
+       _SI( .config = ) rr_config,
+       _SI( .destroy = ) NULL,
+       _SI( .new_sched = ) rr_new_sched,
+       _SI( .free_sched = ) rr_free_sched,
+       _SI( .new_fsk = ) rr_new_fsk,
+       _SI( .free_fsk = ) NULL,
+       _SI( .new_queue = ) rr_new_queue,
+       _SI( .free_queue = ) rr_free_queue,
+};
+
+
+DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc);
diff --git a/sys/netinet/ipfw/dn_sched_wf2q.c b/sys/netinet/ipfw/dn_sched_wf2q.c
new file mode 100644 (file)
index 0000000..86d0d57
--- /dev/null
@@ -0,0 +1,378 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id: dn_sched_wf2q.c 11480 2012-07-31 08:02:00Z luigi $
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h>    /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h>            /* ipfw_rule_ref */
+#include <netinet/ip_fw.h>     /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#ifndef MAX64
+/*
+ * Pick the later of two 64-bit timestamps. The comparison is done on
+ * the signed difference (y - x), the same scheme used for all the
+ * timestamp comparisons in this file. Both arguments are evaluated
+ * more than once, so they must be free of side effects. The whole
+ * expansion is parenthesized so that the macro can be used as an
+ * operand of a larger expression.
+ */
+#define MAX64(x,y)  ((( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x))
+#endif
+
+/*
+ * timestamps are computed on 64 bit using fixed point arithmetic.
+ * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len
+ * and sum of weights, respectively. FRAC_BITS is the number of
+ * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large
+ * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w
+ * using an unsigned 32-bit division, and to avoid wraparounds we need
+ * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64
+ * (here ">>" and "<<" mean "much larger/smaller than", not shifts).
+ * As an example
+ * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19
+ * Note that the value actually configured below is 28, not 26.
+ *
+ * ONE_FP is the fixed point representation of 1.0. It is defined only
+ * inside this #ifndef, so a build that predefines FRAC_BITS must
+ * provide ONE_FP as well.
+ */
+#ifndef FRAC_BITS
+#define FRAC_BITS    28 /* shift for fixed point arithmetic */
+#define        ONE_FP  (1UL << FRAC_BITS)
+#endif
+
+/*
+ * Private information for the scheduler instance:
+ * sch_heap (key is Finish time) returns the next queue to serve
+ * ne_heap (key is Start time) stores not-eligible queues
+ * idle_heap (key=start/finish time) stores idle flows. It must
+ *     support extract-from-middle.
+ * A flow is in at most one of the three heaps at any time.
+ * The structure lives right after the generic struct dn_sch_inst,
+ * which is why the code reaches it as (struct wf2qp_si *)(_si + 1).
+ * XXX todo: use a more efficient data structure, e.g. a tree sorted
+ * by F with min_subtree(S) in each node
+ */
+struct wf2qp_si {
+    struct dn_heap sch_heap;   /* top extract - key Finish  time */
+    struct dn_heap ne_heap;    /* top extract - key Start   time */
+    struct dn_heap idle_heap;  /* random extract - key Start=Finish time */
+    uint64_t V;                        /* virtual time */
+    uint32_t inv_wsum;         /* ONE_FP / wsum; not refreshed when wsum drops to 0 */
+    uint32_t wsum;             /* sum of weights */
+};
+
+/*
+ * Per-queue scheduler state. The generic struct dn_queue is the first
+ * member, so the code casts freely between struct dn_queue * and
+ * struct wf2qp_queue *.
+ * S == F + 1 is the marker for "timestamps invalid": the queue is new
+ * or has been drained from the idle heap and is in none of the heaps.
+ */
+struct wf2qp_queue {
+    struct dn_queue _q;
+    uint64_t S, F;             /* start time, finish time */
+    uint32_t inv_w;            /* ONE_FP / weight */
+    int32_t heap_pos;          /* position (index) of struct in heap */
+};
+
+/*
+ * This file implements a WF2Q+ scheduler as it has been in dummynet
+ * since 2000.
+ * The scheduler supports per-flow queues and has O(log N) complexity.
+ *
+ * The sum of weights must only account for flows that are still
+ * relevant, so entries have to be drained from the idle heap from
+ * time to time. idle_check() removes at most n entries from the top
+ * of the idle heap. Unless 'force' is set, it stops at the first
+ * entry whose key is not before the current virtual time. Each
+ * removed queue gets its timestamps invalidated (S = F + 1) and its
+ * weight subtracted from wsum; inv_wsum is recomputed only while
+ * wsum stays positive.
+ */
+static void
+idle_check(struct wf2qp_si *si, int n, int force)
+{
+    struct dn_heap *idle = &si->idle_heap;
+
+    for (; n > 0; n--) {
+       struct dn_queue *q;
+       struct wf2qp_queue *wq;
+
+       if (!(idle->elements > 0))
+           break;              /* heap exhausted */
+       if (!force && !DN_KEY_LT(HEAP_TOP(idle)->key, si->V))
+           break;              /* top entry is not expired yet */
+
+       q = HEAP_TOP(idle)->object;
+       wq = (struct wf2qp_queue *)q;
+
+       heap_extract(idle, NULL);
+       /* XXX to let the flowset delete the queue we should
+        * mark it as 'unused' by the scheduler.
+        */
+       wq->S = wq->F + 1;      /* timestamps are now invalid */
+       si->wsum -= q->fs->fs.par[0];   /* forget the weight */
+       if (si->wsum > 0)
+           si->inv_wsum = ONE_FP/si->wsum;
+    }
+}
+
+/*
+ * Enqueue packet m on queue q of scheduler instance _si.
+ *
+ * Unless m is already the head of the queue (this is how
+ * wf2qp_new_queue() calls us for a queue that already has packets),
+ * the packet is first handed to dn_enqueue(). Scheduler state is only
+ * touched when m ends up being the head of the queue, i.e. the queue
+ * was idle: the start/finish timestamps are computed and the queue is
+ * put in the eligible (sch) or not-eligible (ne) heap.
+ *
+ * Returns 1 if dn_enqueue() dropped the packet, 0 otherwise.
+ */
+static int
+wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
+{
+    struct dn_fsk *fs = q->fs;
+    struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+    struct wf2qp_queue *alg_fq;
+    uint64_t len = m->m_pkthdr.len;
+
+    if (m != q->mq.head) {
+       if (dn_enqueue(q, m, 0)) /* packet was dropped */
+           return 1;
+       if (m != q->mq.head)    /* queue was already busy */
+           return 0;
+    }
+
+    /* If reach this point, queue q was idle */
+    alg_fq = (struct wf2qp_queue *)q;
+
+    if (DN_KEY_LT(alg_fq->F, alg_fq->S)) {
+        /* F<S means timestamps are invalid ->brand new queue. */
+        alg_fq->S = si->V;             /* init start time */
+        si->wsum += fs->fs.par[0];     /* add weight of new queue. */
+       si->inv_wsum = ONE_FP/si->wsum;
+    } else { /* if it was idle then it was in the idle heap */
+        heap_extract(&si->idle_heap, q);
+        alg_fq->S = MAX64(alg_fq->F, si->V);   /* compute new S */
+    }
+    /* finish time: start plus packet length scaled by 1/weight */
+    alg_fq->F = alg_fq->S + len * alg_fq->inv_w;
+
+    /* if nothing is backlogged, make sure this flow is eligible */
+    if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0)
+        si->V = MAX64(alg_fq->S, si->V);
+
+    /*
+     * Look at eligibility. A flow is not eligible if S>V (when
+     * this happens, it means that there is some other flow already
+     * scheduled for the same pipe, so the sch_heap cannot be
+     * empty). If the flow is not eligible we just store it in the
+     * ne_heap. Otherwise, we store in the sch_heap.
+     * Note that for all flows in sch_heap (SCH), S_i <= V,
+     * and for all flows in ne_heap (NEH), S_i > V.
+     * So when we need to compute max(V, min(S_i)) forall i in
+     * SCH+NEH, we only need to look into NEH.
+     */
+    if (DN_KEY_LT(si->V, alg_fq->S)) {
+        /* S>V means flow Not eligible. */
+        if (si->sch_heap.elements == 0)
+            D("++ ouch! not eligible but empty scheduler!");
+        heap_insert(&si->ne_heap, alg_fq->S, q);
+    } else {
+        heap_insert(&si->sch_heap, alg_fq->F, q);
+    }
+    return 0;
+}
+
+/*
+ * Extract the next packet to transmit from scheduler instance _si.
+ *
+ * Returns NULL when neither the sch heap nor the ne heap holds a queue;
+ * in that case the idle heap is flushed and V and wsum are reset.
+ * Otherwise the packet comes from the queue on top of the sch heap
+ * (smallest finish time), V advances by len * inv_wsum, and the queue
+ * is re-filed: idle heap if it became empty, sch or ne heap depending
+ * on its new start time.
+ *
+ * The for(;;) loop body runs exactly twice: the eligibility fix-up at
+ * its top is executed once before and once after the extraction, and
+ * the second pass leaves through the "if (m) break".
+ *
+ * XXX invariant: sch > 0 || V >= min(S in neh)
+ */
+static struct mbuf *
+wf2qp_dequeue(struct dn_sch_inst *_si)
+{
+       /* Access scheduler instance private data */
+       struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+       struct mbuf *m;
+       struct dn_queue *q;
+       struct dn_heap *sch = &si->sch_heap;
+       struct dn_heap *neh = &si->ne_heap;
+       struct wf2qp_queue *alg_fq;
+
+       if (sch->elements == 0 && neh->elements == 0) {
+               /* we have nothing to do. We could kill the idle heap
+                * altogether and reset V
+                */
+               idle_check(si, 0x7fffffff, 1);
+               si->V = 0;
+               si->wsum = 0;   /* should be set already */
+               return NULL;    /* quick return if nothing to do */
+       }
+       idle_check(si, 1, 0);   /* drain something from the idle heap */
+
+       /* make sure at least one element is eligible, bumping V
+        * and moving entries that have become eligible.
+        * We need to repeat the first part twice, before and
+        * after extracting the candidate, or enqueue() will
+        * find the data structure in a wrong state.
+        */
+  m = NULL;
+  for(;;) {
+       /*
+        * Compute V = max(V, min(S_i)). Remember that all elements
+        * in sch have by definition S_i <= V so if sch is not empty,
+        * V is surely the max and we must not update it. Conversely,
+        * if sch is empty we only need to look at neh.
+        * We don't need to move the queues, as it will be done at the
+        * next enqueue
+        */
+       if (sch->elements == 0 && neh->elements > 0) {
+               si->V = MAX64(si->V, HEAP_TOP(neh)->key);
+       }
+       /* move to sch every queue whose start time is now <= V */
+       while (neh->elements > 0 &&
+                   DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) {
+               q = HEAP_TOP(neh)->object;
+               alg_fq = (struct wf2qp_queue *)q;
+               heap_extract(neh, NULL);
+               heap_insert(sch, alg_fq->F, q);
+       }
+       if (m) /* pkt found in previous iteration */
+               break;
+       /* ok we have at least one eligible pkt */
+       q = HEAP_TOP(sch)->object;
+       alg_fq = (struct wf2qp_queue *)q;
+       m = dn_dequeue(q);
+       heap_extract(sch, NULL); /* Remove queue from heap. */
+       si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum;
+       alg_fq->S = alg_fq->F;  /* Update start time. */
+       if (q->mq.head == 0) {  /* not backlogged any more. */
+               heap_insert(&si->idle_heap, alg_fq->F, q);
+       } else {                        /* Still backlogged. */
+               /* Update F, store in neh or sch */
+               uint64_t len = q->mq.head->m_pkthdr.len;
+               alg_fq->F += len * alg_fq->inv_w;
+               if (DN_KEY_LEQ(alg_fq->S, si->V)) {
+                       heap_insert(sch, alg_fq->F, q);
+               } else {
+                       heap_insert(neh, alg_fq->S, q);
+               }
+       }
+    }
+       return m;
+}
+
+/*
+ * Prepare the three heaps of a new scheduler instance. Every heap is
+ * created with 16 initial slots and is given the offset of heap_pos
+ * inside struct wf2qp_queue, so all of them support extraction from
+ * the middle. Initialization stops at the first failure; in that case
+ * all three heaps are released and ENOMEM is returned.
+ */
+static int
+wf2qp_new_sched(struct dn_sch_inst *_si)
+{
+       struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+       int pos_ofs = offsetof(struct wf2qp_queue, heap_pos);
+       int failed;
+
+       failed = heap_init(&si->idle_heap, 16, pos_ofs) ||
+           heap_init(&si->sch_heap, 16, pos_ofs) ||
+           heap_init(&si->ne_heap, 16, pos_ofs);
+       if (!failed)
+               return 0;
+
+       heap_free(&si->ne_heap);
+       heap_free(&si->sch_heap);
+       heap_free(&si->idle_heap);
+       return ENOMEM;
+}
+
+/*
+ * Release the three heaps owned by a scheduler instance.
+ * Always returns 0.
+ */
+static int
+wf2qp_free_sched(struct dn_sch_inst *_si)
+{
+       struct wf2qp_si *priv = (struct wf2qp_si *)(_si + 1);
+
+       heap_free(&priv->sch_heap);
+       heap_free(&priv->ne_heap);
+       heap_free(&priv->idle_heap);
+       return 0;
+}
+
+/*
+ * Sanitize the weight (flowset parameter 0) of a flowset attached to
+ * this scheduler. Always returns 0.
+ */
+static int
+wf2qp_new_fsk(struct dn_fsk *fs)
+{
+       /* NOTE(review): arguments look like (default, min, max) -- confirm */
+       ipdn_bound_var(&fs->fs.par[0], 1, 1, 100, "WF2Q+ weight");
+       return 0;
+}
+
+/*
+ * Initialize the WF2Q+ part of a new queue: tag the subtype, mark the
+ * timestamps as invalid (S = F + 1) and precompute the inverse of the
+ * weight. If the queue already contains packets it is scheduled at
+ * once through wf2qp_enqueue(). Always returns 0.
+ */
+static int
+wf2qp_new_queue(struct dn_queue *_q)
+{
+       struct wf2qp_queue *wq = (struct wf2qp_queue *)_q;
+
+       _q->ni.oid.subtype = DN_SCHED_WF2QP;
+       wq->F = 0;              /* not strictly necessary */
+       wq->S = wq->F + 1;      /* invalid timestamps */
+       wq->inv_w = ONE_FP / _q->fs->fs.par[0];
+
+       if (_q->mq.head == NULL)
+               return 0;
+
+       wf2qp_enqueue(_q->_si, _q, _q->mq.head);
+       return 0;
+}
+
+/*
+ * Invoked when the infrastructure drops a queue (e.g. because the
+ * flowset is reconfigured).
+ * A queue with invalid timestamps is in no heap: return 0 at once.
+ * A queue that sits in a heap is left alone in safe mode (return 1);
+ * otherwise its weight is subtracted from the sum of weights and the
+ * queue is pulled out of the heap that holds it (return 0).
+ */
+static int
+wf2qp_free_queue(struct dn_queue *q, int safe)
+{
+       struct wf2qp_queue *wq = (struct wf2qp_queue *)q;
+       struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1);
+       struct dn_heap *owner;
+
+       if (wq->S >= wq->F + 1)
+               return 0;       /* we do not own this queue */
+
+       if (safe)
+               return 1;       /* in a heap, refuse to delete */
+
+       si->wsum -= q->fs->fs.par[0];
+       if (si->wsum > 0)
+               si->inv_wsum = ONE_FP/si->wsum;
+
+       /* XXX TODO we may need to adjust V to make sure the
+        * invariants hold after the extraction.
+        */
+       if (q->mq.head == NULL)
+               owner = &si->idle_heap;         /* no backlog */
+       else if (DN_KEY_LT(si->V, wq->S))
+               owner = &si->ne_heap;           /* S > V, not eligible */
+       else
+               owner = &si->sch_heap;
+       heap_extract(owner, q);
+       return 0;
+}
+
+/*
+ * WF2Q+ scheduler descriptor
+ * contains the type of the scheduler, the name, the size of the
+ * structures and function pointers.
+ *
+ * No config, destroy or free_fsk hooks are provided (NULL entries).
+ * NOTE(review): the field names are wrapped in _SI(), presumably so they
+ * can be dropped on compilers without designated initializers; if so the
+ * entries must stay in structure order -- confirm against the macro.
+ */
+static struct dn_alg wf2qp_desc = {
+       _SI( .type = ) DN_SCHED_WF2QP,
+       _SI( .name = ) "WF2Q+",
+       _SI( .flags = ) DN_MULTIQUEUE,
+
+       /* we need extra space in the si and the queue */
+       _SI( .schk_datalen = ) 0,
+       _SI( .si_datalen = ) sizeof(struct wf2qp_si),
+       _SI( .q_datalen = ) sizeof(struct wf2qp_queue) -
+                               sizeof(struct dn_queue),
+
+       _SI( .enqueue = ) wf2qp_enqueue,
+       _SI( .dequeue = ) wf2qp_dequeue,
+
+       _SI( .config = )  NULL,
+       _SI( .destroy = )  NULL,
+       _SI( .new_sched = ) wf2qp_new_sched,
+       _SI( .free_sched = ) wf2qp_free_sched,
+
+       _SI( .new_fsk = ) wf2qp_new_fsk,
+       _SI( .free_fsk = )  NULL,
+
+       _SI( .new_queue = ) wf2qp_new_queue,
+       _SI( .free_queue = ) wf2qp_free_queue,
+};
+
+
+/* hand the descriptor to the scheduler-module glue under the name dn_wf2qp */
+DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc);
diff --git a/sys/netinet/ipfw/ip_dn_glue.c b/sys/netinet/ipfw/ip_dn_glue.c
new file mode 100644 (file)
index 0000000..aa0ac90
--- /dev/null
@@ -0,0 +1,846 @@
+/*-
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id: ip_dn_glue.c 12500 2013-12-11 23:07:58Z luigi $
+ *
+ * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8
+ */
+
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/time.h>
+#include <sys/taskqueue.h>
+#include <net/if.h>    /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <netinet/in.h>
+#include <netinet/ip_var.h>    /* ip_output(), IP_FORWARDING */
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+
+/* FREEBSD7.2 ip_dummynet.h r191715*/
+
+/*
+ * One slot of the old-style heap (see struct dn_heap7). Kept here only
+ * to reproduce the layout of the legacy structures.
+ */
+struct dn_heap_entry7 {
+       int64_t key;        /* sorting key. Topmost element is smallest one */
+       void *object;      /* object pointer */
+};
+
+/*
+ * Old-style heap header, embedded three times in both struct dn_pipe7
+ * and struct dn_pipe8.
+ */
+struct dn_heap7 {
+       int size;
+       int elements;
+       int offset; /* XXX if > 0 this is the offset of direct ptr to obj */
+       struct dn_heap_entry7 *p;   /* really an array of "size" entries */
+};
+
+/*
+ * Common to 7.2 and 8.
+ * Old-style flowset, embedded as member 'fs' at the end of the fixed
+ * part of both struct dn_pipe7 and struct dn_pipe8. The DNOLD_* flag
+ * values are translated to and from the current DN_* flags by
+ * convertflags2new() and convertflags2old().
+ */
+struct dn_flow_set {
+       SLIST_ENTRY(dn_flow_set)    next;   /* linked list in a hash slot */
+
+       u_short fs_nr ;             /* flow_set number       */
+       u_short flags_fs;
+#define DNOLD_HAVE_FLOW_MASK   0x0001
+#define DNOLD_IS_RED       0x0002
+#define DNOLD_IS_GENTLE_RED    0x0004
+#define DNOLD_QSIZE_IS_BYTES   0x0008  /* queue size is measured in bytes */
+#define DNOLD_NOERROR      0x0010  /* do not report ENOBUFS on drops  */
+#define DNOLD_HAS_PROFILE      0x0020  /* the pipe has a delay profile. */
+#define DNOLD_IS_PIPE      0x4000
+#define DNOLD_IS_QUEUE     0x8000
+
+       struct dn_pipe7 *pipe ;  /* pointer to parent pipe */
+       u_short parent_nr ;     /* parent pipe#, 0 if local to a pipe */
+
+       int weight ;        /* WFQ queue weight */
+       int qsize ;         /* queue size in slots or bytes */
+       int plr ;           /* pkt loss rate (2^31-1 means 100%) */
+
+       struct ipfw_flow_id flow_mask ;
+
+       /* hash table of queues onto this flow_set */
+       int rq_size ;       /* number of slots */
+       int rq_elements ;       /* active elements */
+       struct dn_flow_queue7 **rq;  /* array of rq_size entries */
+
+       u_int32_t last_expired ;    /* do not expire too frequently */
+       int backlogged ;        /* #active queues for this flowset */
+
+        /* RED parameters */
+#define SCALE_RED               16
+#define SCALE(x)                ( (x) << SCALE_RED )
+#define SCALE_VAL(x)            ( (x) >> SCALE_RED )
+#define SCALE_MUL(x,y)          ( ( (x) * (y) ) >> SCALE_RED )
+       int w_q ;           /* queue weight (scaled) */
+       int max_th ;        /* maximum threshold for queue (scaled) */
+       int min_th ;        /* minimum threshold for queue (scaled) */
+       int max_p ;         /* maximum value for p_b (scaled) */
+       u_int c_1 ;         /* max_p/(max_th-min_th) (scaled) */
+       u_int c_2 ;         /* max_p*min_th/(max_th-min_th) (scaled) */
+       u_int c_3 ;         /* for GRED, (1-max_p)/max_th (scaled) */
+       u_int c_4 ;         /* for GRED, 1 - 2*max_p (scaled) */
+       u_int * w_q_lookup ;    /* lookup table for computing (1-w_q)^t */
+       u_int lookup_depth ;    /* depth of lookup table */
+       int lookup_step ;       /* granularity inside the lookup table */
+       int lookup_weight ;     /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
+       int avg_pkt_size ;      /* medium packet size */
+       int max_pkt_size ;      /* max packet size */
+};
+SLIST_HEAD(dn_flow_set_head, dn_flow_set);
+
+/* same values as DNOLD_IS_PIPE / DNOLD_IS_QUEUE defined with dn_flow_set */
+#define DN_IS_PIPE             0x4000
+#define DN_IS_QUEUE            0x8000
+/*
+ * FreeBSD 7.2 layout of a flow queue. Compared to the FreeBSD 8 variant
+ * (struct dn_flow_queue8) numbytes is a u_long, there is no extra_bits
+ * member and the idle timestamp is the 32-bit q_time.
+ */
+struct dn_flow_queue7 {
+       struct dn_flow_queue7 *next ;
+       struct ipfw_flow_id id ;
+
+       struct mbuf *head, *tail ;  /* queue of packets */
+       u_int len ;
+       u_int len_bytes ;
+
+       u_long numbytes;
+
+       u_int64_t tot_pkts ;    /* statistics counters  */
+       u_int64_t tot_bytes ;
+       u_int32_t drops ;
+
+       int hash_slot ;     /* debugging/diagnostic */
+
+       /* RED parameters */
+       int avg ;                   /* average queue length est. (scaled) */
+       int count ;                 /* arrivals since last RED drop */
+       int random ;                /* random value (scaled) */
+       u_int32_t q_time;      /* start of queue idle time */
+
+       /* WF2Q+ support */
+       struct dn_flow_set *fs ;    /* parent flow set */
+       int heap_pos ;      /* position (index) of struct in heap */
+       int64_t sched_time ;     /* current time when queue enters ready_heap */
+
+       int64_t S,F ;        /* start time, finish time */
+};
+
+/*
+ * FreeBSD 7.2 layout of a pipe. The members up to and including 'delay'
+ * are declared identically in struct dn_pipe8, which is what lets the
+ * compat code read pipe_nr, bandwidth and delay through a dn_pipe7
+ * pointer regardless of the client version.
+ */
+struct dn_pipe7 {        /* a pipe */
+       SLIST_ENTRY(dn_pipe7)    next;   /* linked list in a hash slot */
+
+       int pipe_nr ;       /* number   */
+       int bandwidth;      /* really, bytes/tick.  */
+       int delay ;         /* really, ticks    */
+
+       struct  mbuf *head, *tail ; /* packets in delay line */
+
+       /* WF2Q+ */
+       struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
+       struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
+       struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */
+
+       int64_t V ;          /* virtual time */
+       int sum;            /* sum of weights of all active sessions */
+
+       int numbytes;
+
+       int64_t sched_time ;     /* time pipe was scheduled in ready_heap */
+
+       /*
+       * When the tx clock come from an interface (if_name[0] != '\0'), its name
+       * is stored below, whereas the ifp is filled when the rule is configured.
+       */
+       char if_name[IFNAMSIZ];
+       struct ifnet *ifp ;
+       int ready ; /* set if ifp != NULL and we got a signal from it */
+
+       struct dn_flow_set fs ; /* used with fixed-rate flows */
+};
+SLIST_HEAD(dn_pipe_head7, dn_pipe7);
+
+
+/* FREEBSD8 ip_dummynet.h r196045 */
+/*
+ * FreeBSD 8 layout of a flow queue. Compared to struct dn_flow_queue7:
+ * numbytes is 64 bit, extra_bits was added, and the 32-bit q_time was
+ * replaced by the 64-bit idle_time.
+ */
+struct dn_flow_queue8 {
+       struct dn_flow_queue8 *next ;
+       struct ipfw_flow_id id ;
+
+       struct mbuf *head, *tail ;  /* queue of packets */
+       u_int len ;
+       u_int len_bytes ;
+
+       uint64_t numbytes ;     /* credit for transmission (dynamic queues) */
+       int64_t extra_bits;     /* extra bits simulating unavailable channel */
+
+       u_int64_t tot_pkts ;    /* statistics counters  */
+       u_int64_t tot_bytes ;
+       u_int32_t drops ;
+
+       int hash_slot ;     /* debugging/diagnostic */
+
+       /* RED parameters */
+       int avg ;                   /* average queue length est. (scaled) */
+       int count ;                 /* arrivals since last RED drop */
+       int random ;                /* random value (scaled) */
+       int64_t idle_time;       /* start of queue idle time */
+
+       /* WF2Q+ support */
+       struct dn_flow_set *fs ;    /* parent flow set */
+       int heap_pos ;      /* position (index) of struct in heap */
+       int64_t sched_time ;     /* current time when queue enters ready_heap */
+
+       int64_t S,F ;        /* start time, finish time */
+};
+
+/*
+ * FreeBSD 8 layout of a pipe. Compared to struct dn_pipe7: numbytes is
+ * 64 bit, burst and idle_time were added, and the delay-profile fields
+ * (name, loss_level, samples_no, samples) follow the embedded flowset.
+ */
+struct dn_pipe8 {        /* a pipe */
+       SLIST_ENTRY(dn_pipe8)    next;   /* linked list in a hash slot */
+
+       int pipe_nr ;       /* number   */
+       int bandwidth;      /* really, bytes/tick.  */
+       int delay ;         /* really, ticks    */
+
+       struct  mbuf *head, *tail ; /* packets in delay line */
+
+       /* WF2Q+ */
+       struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
+       struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
+       struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */
+
+       int64_t V ;          /* virtual time */
+       int sum;            /* sum of weights of all active sessions */
+
+       /* Same as in dn_flow_queue, numbytes can become large */
+       int64_t numbytes;       /* bits I can transmit (more or less). */
+       uint64_t burst;     /* burst size, scaled: bits * hz */
+
+       int64_t sched_time ;     /* time pipe was scheduled in ready_heap */
+       int64_t idle_time;       /* start of pipe idle time */
+
+       char if_name[IFNAMSIZ];
+       struct ifnet *ifp ;
+       int ready ; /* set if ifp != NULL and we got a signal from it */
+
+       struct dn_flow_set fs ; /* used with fixed-rate flows */
+
+    /* fields to simulate a delay profile */
+#define ED_MAX_NAME_LEN     32
+       char name[ED_MAX_NAME_LEN];
+       int loss_level;
+       int samples_no;
+       int *samples;   /* set by dn_compat_config_profile() to the array in dn_pipe_max8 */
+};
+
+/*
+ * A FreeBSD 8 pipe immediately followed by room for the largest
+ * possible delay profile. dn_compat_config_profile() reinterprets the
+ * request buffer as this structure to locate the samples.
+ */
+#define ED_MAX_SAMPLES_NO   1024
+struct dn_pipe_max8 {
+       struct dn_pipe8 pipe;
+       int samples[ED_MAX_SAMPLES_NO];
+};
+SLIST_HEAD(dn_pipe_head8, dn_pipe8);
+
+/*
+ * Changes from 7.2 to 8:
+ * dn_pipe:
+ *      numbytes from int to int64_t
+ *      add burst (uint64_t)
+ *      add idle_time (int64_t)
+ *      add profile
+ *      add struct dn_pipe_max
+ *      add flag DN_HAS_PROFILE
+ *
+ * dn_flow_queue
+ *      numbytes from u_long to uint64_t
+ *      add extra_bits (int64_t)
+ *      q_time from u_int32_t to int64_t and name idle_time
+ *
+ * dn_flow_set unchanged
+ *
+ */
+
+/*
+ * NOTE:XXX copied from dummynet.c
+ * Return, as a void *, the address len bytes past p. Both arguments
+ * are parenthesized so that expressions such as O_NEXT(ptr + 1, n)
+ * are cast and added as a whole.
+ */
+#define O_NEXT(p, len) ((void *)((char *)(p) + (len)))
+/*
+ * Initialize an object header: store length, type and id as given and
+ * clear the subtype.
+ */
+static void
+oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
+{
+       oid->subtype = 0;
+       oid->type = type;
+       oid->len = len;
+       oid->id = id;
+}
+/*
+ * Reserve len bytes at the current position *o of the buffer: the
+ * header found there is filled in with the given length and type (id
+ * 0), *o is moved len bytes forward and the start of the reserved
+ * area is returned.
+ */
+static void *
+o_next(struct dn_id **o, int len, int type)
+{
+       struct dn_id *cur = *o;
+
+       oid_fill(cur, len, type, 0);
+       *o = O_NEXT(cur, len);
+       return cur;
+}
+
+
+/* sizes of the legacy pipe structures (7.2, 8, and 8 with a full profile) */
+static size_t pipesize7 = sizeof(struct dn_pipe7);
+static size_t pipesize8 = sizeof(struct dn_pipe8);
+static size_t pipesizemax8 = sizeof(struct dn_pipe_max8);
+
+/* Indicates the version of the 'ipfw' client:
+ * 1: from FreeBSD 7.2
+ * 0: from FreeBSD 8
+ * -1: unknown (currently unused)
+ *
+ * It is updated when an IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request
+ * arrives.
+ * NOTE: if an IP_DUMMYNET_GET arrives and the 'ipfw' version is unknown,
+ *       it is assumed to be the FreeBSD 8 version.
+ */
+static int is7 = 0;
+
+/*
+ * Translate a legacy flags_fs bitmask (DNOLD_*) into the current DN_*
+ * flowset flags. Bits without a counterpart (DNOLD_IS_PIPE,
+ * DNOLD_IS_QUEUE) are dropped.
+ */
+static int
+convertflags2new(int src)
+{
+       int out = 0;
+
+       out |= (src & DNOLD_HAVE_FLOW_MASK) ? DN_HAVE_MASK : 0;
+       out |= (src & DNOLD_QSIZE_IS_BYTES) ? DN_QSIZE_BYTES : 0;
+       out |= (src & DNOLD_NOERROR) ? DN_NOERROR : 0;
+       out |= (src & DNOLD_IS_RED) ? DN_IS_RED : 0;
+       out |= (src & DNOLD_IS_GENTLE_RED) ? DN_IS_GENTLE_RED : 0;
+       out |= (src & DNOLD_HAS_PROFILE) ? DN_HAS_PROFILE : 0;
+
+       return out;
+}
+
+/*
+ * Translate current DN_* flowset flags into the legacy flags_fs
+ * bitmask (DNOLD_*). Inverse of convertflags2new() for the six flags
+ * that exist in both sets.
+ */
+static int
+convertflags2old(int src)
+{
+       int out = 0;
+
+       out |= (src & DN_HAVE_MASK) ? DNOLD_HAVE_FLOW_MASK : 0;
+       out |= (src & DN_IS_RED) ? DNOLD_IS_RED : 0;
+       out |= (src & DN_IS_GENTLE_RED) ? DNOLD_IS_GENTLE_RED : 0;
+       out |= (src & DN_NOERROR) ? DNOLD_NOERROR : 0;
+       out |= (src & DN_HAS_PROFILE) ? DNOLD_HAS_PROFILE : 0;
+       out |= (src & DN_QSIZE_BYTES) ? DNOLD_QSIZE_IS_BYTES : 0;
+
+       return out;
+}
+
+/*
+ * Handle a legacy delete request. v points to a struct dn_pipe7 or a
+ * struct dn_pipe8 depending on is7. Exactly one of the pipe number
+ * and the flowset number must be non-zero, otherwise EINVAL is
+ * returned. The request is turned into a DN_CMD_DELETE command for a
+ * link (pipe) or a flowset (queue) and passed to do_config(), whose
+ * result is returned.
+ */
+static int
+dn_compat_del(void *v)
+{
+       struct dn_pipe7 *p = (struct dn_pipe7 *) v;
+       struct dn_pipe8 *p8 = (struct dn_pipe8 *) v;
+       struct {
+               struct dn_id oid;
+               uintptr_t a[1]; /* add more if we want a list */
+       } cmd;
+       int pipe_nr, fs_nr;
+
+       /* XXX DN_API_VERSION ??? */
+       oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION);
+
+       /* fetch both numbers through the layout matching the client */
+       if (is7) {
+               pipe_nr = p->pipe_nr;
+               fs_nr = p->fs.fs_nr;
+       } else {
+               pipe_nr = p8->pipe_nr;
+               fs_nr = p8->fs.fs_nr;
+       }
+       /* both zero or both set: the request is ambiguous */
+       if ((pipe_nr == 0) == (fs_nr == 0))
+               return EINVAL;
+
+       if (p->pipe_nr != 0) {  /* pipe x delete */
+               cmd.oid.subtype = DN_LINK;
+               cmd.a[0] = p->pipe_nr;
+       } else {                /* queue x delete */
+               cmd.oid.subtype = DN_FS;
+               cmd.a[0] = fs_nr;
+       }
+
+       return do_config(&cmd, cmd.oid.len);
+}
+
+/*
+ * Fill the new-style flowset *fs from the legacy flowset embedded in
+ * the request v (a struct dn_pipe7 or struct dn_pipe8 depending on
+ * is7). The RED parameters are copied only when one of the RED flags
+ * is set. Always returns 0.
+ */
+static int
+dn_compat_config_queue(struct dn_fs *fs, void* v)
+{
+       struct dn_flow_set *old;
+
+       old = is7 ? &((struct dn_pipe7 *)v)->fs : &((struct dn_pipe8 *)v)->fs;
+
+       fs->fs_nr = old->fs_nr;
+       fs->sched_nr = old->parent_nr;
+       fs->flow_mask = old->flow_mask;
+       fs->buckets = old->rq_size;
+       fs->qsize = old->qsize;
+       fs->plr = old->plr;
+       fs->par[0] = old->weight;
+       fs->flags = convertflags2new(old->flags_fs);
+
+       if (!(fs->flags & (DN_IS_GENTLE_RED | DN_IS_RED)))
+               return 0;       /* no RED, we are done */
+
+       fs->w_q = old->w_q;
+       fs->max_th = old->max_th;
+       fs->min_th = old->min_th;
+       fs->max_p = old->max_p;
+       return 0;
+}
+
+/*
+ * Translate a legacy 'pipe N config' request (v) into the three
+ * new-style objects: scheduler N, link N and the FIFO flowset
+ * N + 2*DN_MAX_ID attached to scheduler N + DN_MAX_ID. Bucket count
+ * and flow mask, which the old API kept in the flowset, are moved to
+ * the scheduler. Always returns 0.
+ */
+static int
+dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p,
+                     struct dn_fs *fs, void* v)
+{
+       struct dn_pipe7 *old7 = (struct dn_pipe7 *)v;
+       struct dn_pipe8 *old8 = (struct dn_pipe8 *)v;
+       int nr = old7->pipe_nr;
+
+       sch->sched_nr = nr;
+       sch->oid.subtype = 0;
+       p->link_nr = nr;
+       fs->fs_nr = nr + 2*DN_MAX_ID;
+       fs->sched_nr = nr + DN_MAX_ID;
+
+       /* bandwidth and delay are read through the 7.2 layout */
+       p->bandwidth = old7->bandwidth;
+       p->delay = old7->delay;
+       if (!is7)
+               p->burst = old8->burst; /* FreeBSD 8 has burst  */
+
+       /* fill the fifo flowset, then restore the numbers it overwrote */
+       dn_compat_config_queue(fs, v);
+       fs->fs_nr = nr + 2*DN_MAX_ID;
+       fs->sched_nr = nr + DN_MAX_ID;
+
+       /* Move scheduler related parameter from fs to sch */
+       sch->buckets = fs->buckets; /*XXX*/
+       fs->buckets = 0;
+       if (!(fs->flags & DN_HAVE_MASK))
+               return 0;
+
+       sch->flags |= DN_HAVE_MASK;
+       fs->flags &= ~DN_HAVE_MASK;
+       sch->sched_mask = fs->flow_mask;
+       bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id));
+       return 0;
+}
+
+/*
+ * Fill the new-style delay profile *pf for link p from a FreeBSD 8
+ * request v. The request is treated as a struct dn_pipe_max8, i.e. the
+ * samples are expected right after the pipe; p8->samples is pointed at
+ * them before copying. Always returns 0.
+ *
+ * The profile name is explicitly NUL-terminated after the copy:
+ * strncpy() does not add a terminator when the source fills the whole
+ * destination, and the name comes from the request buffer.
+ */
+static int
+dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p,
+                        void *v)
+{
+       struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+
+       p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]);
+
+       pf->link_nr = p->link_nr;
+       pf->loss_level = p8->loss_level;
+//     pf->bandwidth = p->bandwidth; //XXX bandwidth redundant?
+       pf->samples_no = p8->samples_no;
+       strncpy(pf->name, p8->name,sizeof(pf->name));
+       pf->name[sizeof(pf->name) - 1] = '\0';
+       /* the whole sample array is copied, whatever samples_no says */
+       bcopy(p8->samples, pf->samples, sizeof(pf->samples));
+
+       return 0;
+}
+
+/*
+ * Translate a legacy configuration request into a new-style
+ * DN_CMD_CONFIG command and submit it with do_config().
+ *
+ * If p->pipe_nr != 0 the command is 'pipe x config', so need to create
+ * the three main struct (scheduler, link, flowset) plus, for FreeBSD 8
+ * clients that supply samples, a delay profile; else only a flowset is
+ * created.
+ *
+ * Returns the error of the first helper that fails, otherwise the
+ * result of do_config().
+ *
+ * 'buf' is a cursor that o_next() advances through the command buffer;
+ * the allocation itself is 'base', and that is the pointer that must
+ * be handed to free() on every exit path.
+ */
+static int
+dn_compat_configure(void *v)
+{
+       struct dn_id *buf = NULL, *base;
+       struct dn_sch *sch = NULL;
+       struct dn_link *p = NULL;
+       struct dn_fs *fs = NULL;
+       struct dn_profile *pf = NULL;
+       int lmax;
+       int error;
+
+       struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
+       struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+
+       int i; /* number of object to configure */
+
+       lmax = sizeof(struct dn_id);    /* command header */
+       lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) +
+               sizeof(struct dn_fs) + sizeof(struct dn_profile);
+
+       base = buf = malloc(lmax, M_DUMMYNET, M_WAITOK|M_ZERO);
+       if (base == NULL)       /* cannot happen where M_WAITOK sleeps */
+               return ENOMEM;
+       o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG);
+       base->id = DN_API_VERSION;
+
+       /* pipe_nr is the same in p7 and p8 */
+       i = p7->pipe_nr;
+       if (i != 0) { /* pipe config */
+               sch = o_next(&buf, sizeof(*sch), DN_SCH);
+               p = o_next(&buf, sizeof(*p), DN_LINK);
+               fs = o_next(&buf, sizeof(*fs), DN_FS);
+
+               error = dn_compat_config_pipe(sch, p, fs, v);
+               if (error)
+                       goto done;
+               if (!is7 && p8->samples_no > 0) {
+                       /* Add profiles*/
+                       pf = o_next(&buf, sizeof(*pf), DN_PROFILE);
+                       error = dn_compat_config_profile(pf, p, v);
+                       if (error)
+                               goto done;
+               }
+       } else { /* queue config */
+               fs = o_next(&buf, sizeof(*fs), DN_FS);
+               error = dn_compat_config_queue(fs, v);
+               if (error)
+                       goto done;
+       }
+       /* buf now points past the last object: the difference is the length */
+       error = do_config(base, (char *)buf - (char *)base);
+
+done:
+       free(base, M_DUMMYNET);
+       return error;
+}
+
+/*
+ * Compute how many bytes are reserved to export the current
+ * configuration in the old (FreeBSD 8 sized) format.
+ * The estimate is built from the object counters kept in dn_cfg.
+ */
+int
+dn_compat_calc_size(void)
+{
+       int total;
+
+       /* XXX use FreeBSD 8 struct size */
+
+       /* half scheduler: schk_count/2 */
+       total = dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2;
+       /* all flowset: fsk_count */
+       total += dn_cfg.fsk_count * sizeof(struct dn_flow_set);
+       /* all pipe queue: si_count */
+       total += dn_cfg.si_count * sizeof(struct dn_flow_queue8);
+       /* all flowset queues: queue_count */
+       total += dn_cfg.queue_count * sizeof(struct dn_flow_queue8);
+
+       return total;
+}
+
+/*
+ * Export one flow (_ni, a struct dn_flow) as an old-style queue record
+ * at the current output position *a->start, then advance that position
+ * by the size of the record written (7.2 or 8 layout, chosen by is7).
+ * arg is the struct copy_args describing the output buffer.
+ * Always returns 0.
+ */
+int
+dn_c_copy_q (void *_ni, void *arg)
+{
+       struct copy_args *a = arg;
+       struct dn_flow *flow = (struct dn_flow *)_ni;
+       /* two views of the same output record */
+       struct dn_flow_queue7 *q7 = (struct dn_flow_queue7 *)*a->start;
+       struct dn_flow_queue8 *q8 = (struct dn_flow_queue8 *)*a->start;
+       int used;
+
+       /* XXX hash slot not set */
+       /* No difference between 7.2/8 for these, written through q7 */
+       q7->len = flow->length;
+       q7->len_bytes = flow->len_bytes;
+       q7->id = flow->fid;
+
+       /* the counters are stored through the view matching the client */
+       if (!is7) {
+               q8->tot_pkts = flow->tot_pkts;
+               q8->tot_bytes = flow->tot_bytes;
+               q8->drops = flow->drops;
+               used = sizeof(struct dn_flow_queue8);
+       } else {
+               q7->tot_pkts = flow->tot_pkts;
+               q7->tot_bytes = flow->tot_bytes;
+               q7->drops = flow->drops;
+               used = sizeof(struct dn_flow_queue7);
+       }
+
+       *a->start += used;
+       return 0;
+}
+
+/*
+ * Export scheduler s as an old-style pipe record (7.2 or 8 layout,
+ * chosen by is7) at *a->start, then advance *a->start past it.
+ * nq is stored as the number of queue records (rq_elements).
+ * Always returns 0.
+ */
+int
+dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq)
+{
+       struct dn_link *link = &s->link;
+       struct dn_fsk *fsk = s->fs;
+       /* two views of the same output record */
+       struct dn_pipe7 *out7 = (struct dn_pipe7 *)*a->start;
+       struct dn_pipe8 *out8 = (struct dn_pipe8 *)*a->start;
+       struct dn_flow_set *ofs;
+       int used;
+
+       /* These 4 field are the same in pipe7 and pipe8 */
+       out7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE;
+       out7->bandwidth = link->bandwidth;
+       out7->delay = link->delay * 1000 / hz;
+       out7->pipe_nr = link->link_nr - DN_MAX_ID;
+
+       if (is7) {
+               ofs = &out7->fs;
+               used = sizeof(struct dn_pipe7);
+       } else {
+               ofs = &out8->fs;
+               used = sizeof(struct dn_pipe8);
+               if (s->profile) {
+                       struct dn_profile *prof = s->profile;
+
+                       /*
+                        * NOTE(review): the bound is the size of the
+                        * source name; presumably both name arrays have
+                        * the same size -- confirm.
+                        */
+                       strncpy(out8->name, prof->name, sizeof(prof->name));
+                       out8->loss_level = prof->loss_level;
+                       out8->samples_no = prof->samples_no;
+               }
+               out8->burst = div64(link->burst , 8 * hz);
+       }
+
+       /* embedded flowset: mask and bucket count come from the scheduler */
+       ofs->flow_mask = s->sch.sched_mask;
+       if (s->sch.buckets)
+               ofs->rq_size = s->sch.buckets;
+       else
+               ofs->rq_size = 1;
+       ofs->rq_elements = nq;
+       ofs->parent_nr = link->link_nr - DN_MAX_ID;
+
+       /* queue management parameters come from the scheduler's flowset */
+       ofs->qsize = fsk->fs.qsize;
+       ofs->plr = fsk->fs.plr;
+       ofs->w_q = fsk->fs.w_q;
+       ofs->max_p = fsk->fs.max_p;
+       ofs->max_th = fsk->max_th;
+       ofs->min_th = fsk->min_th;
+
+       ofs->flags_fs = convertflags2old(fsk->fs.flags);
+
+       *a->start += used;
+       return 0;
+}
+
+
+/*
+ * Export one scheduler (_o, a struct dn_schk) and its queues in the
+ * old pipe format into the buffer described by a.
+ * Returns 1 without writing anything if the remaining space is smaller
+ * than one dn_pipe8 plus one dn_flow_queue8 per queue, 0 otherwise.
+ */
+int
+dn_compat_copy_pipe(struct copy_args *a, void *_o)
+{
+       struct dn_schk *sched = (struct dn_schk *)_o;
+       int avail = a->end - *a->start;
+       int pipe_bytes = sizeof(struct dn_pipe8);
+       int queue_bytes = sizeof(struct dn_flow_queue8);
+       int count;      /* number of queues */
+       int required;
+
+       /*
+        * Needed space:
+        * - struct dn_pipe
+        * - if there are instances, dn_queue * n_instances
+        */
+       if (sched->sch.flags & DN_HAVE_MASK)
+               count = dn_ht_entries(sched->siht);
+       else
+               count = sched->siht ? 1 : 0;
+       required = pipe_bytes + queue_bytes * count;
+       if (avail < required) {
+               D("have %d < need %d", avail, required);
+               return 1;
+       }
+
+       /* the pipe record first ... */
+       dn_c_copy_pipe(sched, a, count);
+
+       /* ... then its queues: a hash table scan, or the single entry */
+       if (sched->sch.flags & DN_HAVE_MASK) {
+               dn_ht_scan(sched->siht, dn_c_copy_q, a);
+               return 0;
+       }
+       if (sched->siht)
+               dn_c_copy_q(sched->siht, a);
+       return 0;
+}
+
+/*
+ * Export flowset f as an old-style struct dn_flow_set at *a->start,
+ * then advance *a->start past it.
+ * nq is stored as the number of queue records (rq_elements).
+ * Always returns 0.
+ */
+int
+dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq)
+{
+       struct dn_flow_set *out = (struct dn_flow_set *)*a->start;
+
+       /* identity and placement */
+       out->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
+       out->fs_nr = f->fs.fs_nr;
+       out->parent_nr = f->fs.sched_nr;
+       out->weight = f->fs.par[0];
+       out->flow_mask = f->fs.flow_mask;
+
+       /* queue table: at least one bucket is always reported */
+       if (f->fs.buckets)
+               out->rq_size = f->fs.buckets;
+       else
+               out->rq_size = 1;
+       out->rq_elements = nq;
+
+       /* queue management parameters */
+       out->qsize = f->fs.qsize;
+       out->plr = f->fs.plr;
+       out->w_q = f->fs.w_q;
+       out->max_p = f->fs.max_p;
+       out->max_th = f->max_th;
+       out->min_th = f->min_th;
+
+       out->flags_fs = convertflags2old(f->fs.flags);
+       *a->start += sizeof(struct dn_flow_set);
+       return 0;
+}
+
+/*
+ * Export one flowset (_o, a struct dn_fsk) and its queues in the old
+ * format into the buffer described by a.
+ * Returns 1 without writing anything if the remaining space is smaller
+ * than one dn_flow_set plus one dn_flow_queue8 per queue, 0 otherwise.
+ */
+int
+dn_compat_copy_queue(struct copy_args *a, void *_o)
+{
+       struct dn_fsk *fsk = (struct dn_fsk *)_o;
+       int avail = a->end - *a->start;
+       int set_bytes = sizeof(struct dn_flow_set);
+       int queue_bytes = sizeof(struct dn_flow_queue8);
+       int count;      /* number of queues */
+       int required;
+
+       if (fsk->fs.flags & DN_HAVE_MASK)
+               count = dn_ht_entries(fsk->qht);
+       else
+               count = fsk->qht ? 1 : 0;
+
+       required = set_bytes + queue_bytes * count;
+       if (avail < required) {
+               D("have < need");
+               return 1;
+       }
+
+       /* the flowset record first ... */
+       dn_c_copy_fs(fsk, a, count);
+
+       /* ... then its queues: a hash table scan, or the single entry */
+       if (fsk->fs.flags & DN_HAVE_MASK) {
+               dn_ht_scan(fsk->qht, dn_c_copy_q, a);
+               return 0;
+       }
+       if (fsk->qht)
+               dn_c_copy_q(fsk->qht, a);
+
+       return 0;
+}
+
+/*
+ * Per-object callback used to export the configuration in the old
+ * format. _arg is the struct copy_args; its type field selects whether
+ * _o is handled as a scheduler (DN_COMPAT_PIPE) or as a flowset
+ * (DN_COMPAT_QUEUE).
+ * Returns DNHT_SCAN_END when the copy routine reports failure,
+ * 0 in every other case (including objects that are skipped).
+ */
+int
+copy_data_helper_compat(void *_o, void *_arg)
+{
+       struct copy_args *a = _arg;
+
+       if (a->type == DN_COMPAT_PIPE) {
+               struct dn_schk *s = _o;
+
+               /* anything else is not old type and is skipped */
+               if (s->sch.oid.subtype == 1 && s->sch.sched_nr > DN_MAX_ID) {
+                       /* copy pipe parameters, and if instance exists,
+                        * copy other parameters and eventually queues.
+                        */
+                       if (dn_compat_copy_pipe(a, _o))
+                               return DNHT_SCAN_END;
+               }
+               return 0;
+       }
+       if (a->type == DN_COMPAT_QUEUE) {
+               struct dn_fsk *fs = _o;
+
+               if (fs->fs.fs_nr < DN_MAX_ID) {
+                       if (dn_compat_copy_queue(a, _o))
+                               return DNHT_SCAN_END;
+               }
+               return 0;
+       }
+       return 0;
+}
+
+/*
+ * Main function to manage old requests.
+ *
+ * sopt->sopt_name selects the command (flush, delete, configure, get).
+ * sopt->sopt_valsize is compared with the known pipe sizes to decide
+ * which userland version is talking to us; the answer is stored in the
+ * global is7 and is left untouched when the size matches none of them.
+ *
+ * Returns 0 on success or an errno value (EINVAL for an unknown
+ * option, otherwise whatever the copyin/handler reported).
+ *
+ * NOTE(review): for CONFIGURE the buffer is read as struct dn_pipe7/8
+ * without checking that len covers those structs -- confirm whether
+ * the size is validated elsewhere.
+ */
+int
+ip_dummynet_compat(struct sockopt *sopt)
+{
+       int error=0;
+       void *v = NULL;
+       struct dn_id oid;
+
+       /* Length of data, used to find the ipfw version... */
+       int len = sopt->sopt_valsize;
+
+       /* len can be 0 if command was dummynet_flush */
+       if (len == pipesize7) {
+               D("setting compatibility with FreeBSD 7.2");
+               is7 = 1;
+       }
+       else if (len == pipesize8 || len == pipesizemax8) {
+               D("setting compatibility with FreeBSD 8");
+               is7 = 0;
+       }
+
+       switch (sopt->sopt_name) {
+       default:
+               printf("dummynet: -- unknown option %d", sopt->sopt_name);
+               error = EINVAL;
+               break;
+
+       case IP_DUMMYNET_FLUSH:
+               oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION);
+               do_config(&oid, oid.len);
+               break;
+
+       case IP_DUMMYNET_DEL:
+               v = malloc(len, M_TEMP, M_WAITOK);
+               error = sooptcopyin(sopt, v, len, len);
+               if (error == 0)
+                       error = dn_compat_del(v);
+               /* the temporary copy is released on failure too */
+               free(v, M_TEMP);
+               break;
+
+       case IP_DUMMYNET_CONFIGURE:
+               v = malloc(len, M_TEMP, M_WAITOK);
+               error = sooptcopyin(sopt, v, len, len);
+               if (error == 0)
+                       error = dn_compat_configure(v);
+               /* the temporary copy is released on failure too */
+               free(v, M_TEMP);
+               break;
+
+       case IP_DUMMYNET_GET: {
+               void *buf;
+               int ret;
+               int original_size = sopt->sopt_valsize;
+               int size;
+
+               ret = dummynet_get(sopt, &buf);
+               if (ret)
+                       return 0;//XXX ?
+               /*
+                * dummynet_get() leaves the produced size in sopt_valsize;
+                * save it and restore the caller's size before copying out.
+                */
+               size = sopt->sopt_valsize;
+               sopt->sopt_valsize = original_size;
+               D("size=%d, buf=%p", size, buf);
+               /* a copyout failure is only logged, not returned */
+               ret = sooptcopyout(sopt, buf, size);
+               if (ret)
+                       printf("  %s ERROR sooptcopyout\n", __FUNCTION__);
+               if (buf)
+                       free(buf, M_DUMMYNET);
+           }
+       }
+
+       return error;
+}
+
+
diff --git a/sys/netinet/ipfw/ip_dn_io.c b/sys/netinet/ipfw/ip_dn_io.c
new file mode 100644 (file)
index 0000000..fd0dbeb
--- /dev/null
@@ -0,0 +1,962 @@
+/*-
+ * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Dummynet portions related to packet handling.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_dn_io.c 203321 2010-01-31 21:39:25Z luigi $");
+
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>    /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <net/netisr.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>                /* ip_len, ip_off */
+#include <netinet/ip_var.h>    /* ip_output(), IP_FORWARDING */
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+
+#include <netinet/if_ether.h> /* various ether_* routines */
+
+#include <netinet/ip6.h>       /* for ip6_input, ip6_output prototypes */
+#include <netinet6/ip6_var.h>
+
+/*
+ * We keep a private variable for the simulation time, but we could
+ * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
+ * instead of dn_cfg.curr_time
+ */
+
+/*
+ * Global dummynet state and tunables; several of its fields are
+ * exported through the sysctl entries declared in this file.
+ */
+struct dn_parms dn_cfg;
+//VNET_DEFINE(struct dn_parms, _base_dn_cfg);
+
+/* Tick bookkeeping, exported read-only through sysctl. */
+static long tick_last;         /* Last tick duration (usec). */
+static long tick_delta;                /* Last vs standard tick diff (usec). */
+static long tick_delta_sum;    /* Accumulated tick difference (usec).*/
+static long tick_adjustment;   /* Tick adjustments done. */
+static long tick_lost;         /* Lost(coalesced) ticks number. */
+/* Adjusted vs non-adjusted curr_time difference (ticks). */
+static long tick_diff;
+
+/* Packet counters, exported read-only through sysctl. */
+static unsigned long   io_pkt;         /* packets passed to dummynet */
+static unsigned long   io_pkt_fast;    /* packets that bypassed the scheduler */
+static unsigned long   io_pkt_drop;    /* packets dropped by dummynet */
+
+/*
+ * We use a heap to store entities for which we have pending timer events.
+ * The heap is checked at every tick and all entities with expired events
+ * are extracted.
+ */
+  
+/* malloc type used to tag dummynet allocations */
+MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
+
+/*
+ * Function pointer defined outside this file.
+ * NOTE(review): presumably the hook used to hand packets back to the
+ * bridge code -- confirm against its definition.
+ */
+extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
+
+/*
+ * sysctl interface: everything below is published under
+ * net.inet.ip.dummynet and is only compiled when SYSCTL_NODE exists.
+ * Entries declared CTLFLAG_RW are tunables, CTLFLAG_RD ones are
+ * read-only.
+ */
+#ifdef SYSCTL_NODE
+
+SYSBEGIN(f4)
+
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
+
+/* wrapper to pass dn_cfg fields to SYSCTL_* */
+//#define DC(x)        (&(VNET_NAME(_base_dn_cfg).x))
+#define DC(x)  (&(dn_cfg.x))
+/* parameters */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
+    CTLFLAG_RW, DC(hash_size), 0, "Default hash table size");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
+    CTLFLAG_RW, DC(slot_limit), 0,
+    "Upper limit in slots for pipe queue.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
+    CTLFLAG_RW, DC(byte_limit), 0,
+    "Upper limit in bytes for pipe queue.");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
+    CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io.");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug,
+    CTLFLAG_RW, DC(debug), 0, "Dummynet debug level");
+
+/* RED parameters */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
+    CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
+    CTLFLAG_RD, DC(red_avg_pkt_size), 0, "RED Medium packet size");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
+    CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size");
+
+/* time adjustment */
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
+    CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
+    CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
+    CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
+    CTLFLAG_RD, &tick_diff, 0,
+    "Adjusted vs non-adjusted curr_time difference (ticks).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
+    CTLFLAG_RD, &tick_lost, 0,
+    "Number of ticks coalesced by dummynet taskqueue.");
+
+/* Drain parameters */
+SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire,
+    CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes");
+SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle,
+    CTLFLAG_RD, DC(expire_cycle), 0, "Expire cycle for queues/pipes");
+SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_object,
+    CTLFLAG_RW, DC(expire_object), 0, "Min # of objects before start drain routine");
+SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, object_idle_tick,
+    CTLFLAG_RD, DC(object_idle_tick), 0, "Time (in ticks) to cosiderer an object as idle");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, drain_ratio,
+    CTLFLAG_RD, DC(drain_ratio), 0, "% of dummynet_task() to dedicate to drain routine");
+
+/* statistics */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count,
+    CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count,
+    CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count,
+    CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count,
+    CTLFLAG_RD, DC(queue_count), 0, "Number of queues");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
+    CTLFLAG_RD, &io_pkt, 0,
+    "Number of packets passed to dummynet.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
+    CTLFLAG_RD, &io_pkt_fast, 0,
+    "Number of packets bypassed dummynet scheduler.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
+    CTLFLAG_RD, &io_pkt_drop, 0,
+    "Number of packets dropped by dummynet.");
+#undef DC
+SYSEND
+
+#endif
+
+/* Forward declaration; being static, it is defined in this same file. */
+static void    dummynet_send(struct mbuf *);
+
+/*
+ * Packets processed by dummynet have an mbuf tag associated with
+ * them that carries their dummynet state.
+ * Outside dummynet, only the 'rule' field is relevant, and it must
+ * be at the beginning of the structure.
+ */
+struct dn_pkt_tag {
+       struct ipfw_rule_ref rule;      /* matching rule        */
+
+       /* second part, dummynet specific */
+       int dn_dir;             /* action when packet comes out.*/
+                               /* see ip_fw_private.h          */
+       uint64_t output_time;   /* when the pkt is due for delivery*/
+       struct ifnet *ifp;      /* interface, for ip_output     */
+       struct _ip6dn_args ip6opt;      /* XXX ipv6 options     */
+};
+
+/*
+ * Fetch the dummynet state attached to packet m.
+ * The state is expected in the first mbuf tag of the packet; this is
+ * only verified through KASSERT. The returned pointer addresses the
+ * memory right after the tag header.
+ */
+static struct dn_pkt_tag *
+dn_tag_get(struct mbuf *m)
+{
+       struct m_tag *first = m_tag_first(m);
+
+       KASSERT(first != NULL &&
+           first->m_tag_cookie == MTAG_ABI_COMPAT &&
+           first->m_tag_id == PACKET_TAG_DUMMYNET,
+           ("packet on dummynet queue w/o dummynet tag!"));
+       /* skip the tag header to reach the payload */
+       return (struct dn_pkt_tag *)(first + 1);
+}
+
+/*
+ * Append packet m at the end of the packet list q.
+ * q->tail is only dereferenced when the list is not empty.
+ */
+static inline void
+mq_append(struct mq *q, struct mbuf *m)
+{
+       if (q->head != NULL) {
+               /* non-empty list: link after the current last packet */
+               q->tail->m_nextpkt = m;
+       } else {
+               /* empty list: m also becomes the first packet */
+               q->head = m;
+       }
+       q->tail = m;
+       m->m_nextpkt = NULL;
+}
+
+/*
+ * Release every packet of a list linked through m_nextpkt.
+ * Kept as a function so that any extra per-packet work has a single
+ * place to live.
+ */
+void dn_free_pkts(struct mbuf *mnext)
+{
+        struct mbuf *cur;
+
+        for (cur = mnext; cur != NULL; cur = mnext) {
+                /* read the link before the packet goes away */
+                mnext = cur->m_nextpkt;
+                FREE_PKT(cur);
+        }
+}
+
+/*
+ * RED (Random Early Detection) drop decision for queue q.
+ *
+ * q:   the queue; its avg, count and random fields are updated here,
+ *      the parameters come from the flowset q->fs.
+ * len: length of the arriving packet, only used to scale the drop
+ *      probability when the queue is measured in bytes.
+ *
+ * Returns 1 if the packet must be dropped, 0 if it is accepted.
+ */
+static int
+red_drops (struct dn_queue *q, int len)
+{
+       /*
+        * RED algorithm
+        *
+        * RED calculates the average queue size (avg) using a low-pass filter
+        * with an exponential weighted (w_q) moving average:
+        *      avg  <-  (1-w_q) * avg + w_q * q_size
+        * where q_size is the queue length (measured in bytes or packets).
+        *
+        * If q_size == 0, we compute the idle time for the link, and set
+        *      avg = (1 - w_q)^(idle/s)
+        * where s is the time needed for transmitting a medium-sized packet.
+        *
+        * Now, if avg < min_th the packet is enqueued.
+        * If avg > max_th the packet is dropped. Otherwise, the packet is
+        * dropped with probability P function of avg.
+        */
+
+       struct dn_fsk *fs = q->fs;
+       int64_t p_b = 0;
+
+       /* Queue in bytes or packets? */
+       uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ?
+           q->ni.len_bytes : q->ni.length;
+
+       /* Average queue size estimation. */
+       if (q_size != 0) {
+               /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
+               int diff = SCALE(q_size) - q->avg;
+               int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
+
+               q->avg += (int)v;
+       } else {
+               /*
+                * Queue is empty, find for how long the queue has been
+                * empty and use a lookup table for computing
+                * (1 - w_q)^(idle_time/s) where s is the time to send a
+                * (small) packet.
+                * XXX check wraps...
+                */
+               if (q->avg) {
+                       u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step);
+
+                       /* idle times beyond the table reset the average */
+                       q->avg = (t < fs->lookup_depth) ?
+                           SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
+               }
+       }
+
+       /* Should i drop? */
+       if (q->avg < fs->min_th) {
+               q->count = -1;
+               return (0);     /* accept packet */
+       }
+       if (q->avg >= fs->max_th) {     /* average queue >=  max threshold */
+               if (fs->fs.flags & DN_IS_GENTLE_RED) {
+                       /*
+                        * According to Gentle-RED, if avg is greater than
+                        * max_th the packet is dropped with a probability
+                        *       p_b = c_3 * avg - c_4
+                        * where c_3 = (1 - max_p) / max_th
+                        *       c_4 = 1 - 2 * max_p
+                        */
+                       p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
+                           fs->c_4;
+               } else {
+                       q->count = -1;
+                       return (1);
+               }
+       } else if (q->avg > fs->min_th) {
+               /*
+                * We compute p_b using the linear dropping function
+                *       p_b = c_1 * avg - c_2
+                * where c_1 = max_p / (max_th - min_th)
+                *       c_2 = max_p * min_th / (max_th - min_th)
+                */
+               p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
+       }
+
+       /* in byte mode the probability is scaled by the packet length */
+       if (fs->fs.flags & DN_QSIZE_BYTES)
+               p_b = div64((p_b * len) , fs->max_pkt_size);
+       /* count was -1: just start a new sequence with a fresh threshold */
+       if (++q->count == 0)
+               q->random = random() & 0xffff;
+       else {
+               /*
+                * q->count counts packets arrived since last drop, so a greater
+                * value of q->count means a greater packet drop probability.
+                */
+               if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
+                       q->count = 0;
+                       /* After a drop we calculate a new random value. */
+                       q->random = random() & 0xffff;
+                       return (1);     /* drop */
+               }
+       }
+       /* End of RED algorithm. */
+
+       return (0);     /* accept */
+
+}
+
+/*
+ * Try to store packet m in queue q, applying the queue management
+ * policy of the flowset q->fs (forced drop, random loss, RED, size
+ * limit, checked in this order).
+ * Arrival statistics of the queue and of its scheduler instance are
+ * updated before the drop checks, so dropped packets are counted too.
+ * Return 0 on success, 1 on drop. The packet is consumed anyways.
+ */
+int
+dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
+{
+       struct dn_fs *fs;
+       struct dn_flow *si_stats;       /* stats for scheduler instance */
+       uint64_t pktlen;
+       int full;
+
+       if (q->fs == NULL || q->_si == NULL) {
+               printf("%s fs %p si %p, dropping\n",
+                       __FUNCTION__, q->fs, q->_si);
+               FREE_PKT(m);
+               return 1;
+       }
+       fs = &(q->fs->fs);
+       si_stats = &q->_si->ni;
+       pktlen = m->m_pkthdr.len;
+
+       /* Update statistics, then check reasons to drop pkt. */
+       q->ni.tot_bytes += pktlen;
+       q->ni.tot_pkts++;
+       si_stats->tot_bytes += pktlen;
+       si_stats->tot_pkts++;
+
+       if (drop ||
+           (fs->plr && random() < fs->plr) ||
+           ((fs->flags & DN_IS_RED) && red_drops(q, m->m_pkthdr.len)))
+               goto dropped;
+
+       /* the size limit is expressed either in bytes or in packets */
+       if (fs->flags & DN_QSIZE_BYTES)
+               full = (q->ni.len_bytes > fs->qsize);
+       else
+               full = (q->ni.length >= fs->qsize);
+       if (full)
+               goto dropped;
+
+       mq_append(&q->mq, m);
+       if (q->ni.length == 0) {        /* queue was idle */
+               dn_cfg.idle_queue--;
+               if (si_stats->length == 0)      /* scheduler was idle */
+                       dn_cfg.idle_si--;
+       }
+       q->ni.length++;
+       q->ni.len_bytes += pktlen;
+       si_stats->length++;
+       si_stats->len_bytes += pktlen;
+       return 0;
+
+dropped:
+       io_pkt_drop++;
+       q->ni.drops++;
+       si_stats->drops++;
+       FREE_PKT(m);
+       return 1;
+}
+
+/*
+ * Move to q every packet at the head of the delay line whose
+ * output_time is not after 'now'. If packets are left, the delay line
+ * is put back in the event heap, keyed by the output_time of the
+ * first remaining packet.
+ * Runs under scheduler lock.
+ */
+static void
+transmit_event(struct mq *q, struct delay_line *dline, uint64_t now)
+{
+       struct dn_pkt_tag *tag = NULL;
+       struct mbuf *pkt;
+
+       dline->oid.subtype = 0; /* not in heap */
+       for (pkt = dline->mq.head; pkt != NULL; pkt = dline->mq.head) {
+               tag = dn_tag_get(pkt);
+               if (!DN_KEY_LEQ(tag->output_time, now))
+                       break;          /* first packet not due yet */
+               dline->mq.head = pkt->m_nextpkt;
+               mq_append(q, pkt);
+       }
+       if (pkt == NULL)
+               return;                 /* delay line fully drained */
+
+       dline->oid.subtype = 1; /* in heap */
+       heap_insert(&dn_cfg.evheap, tag->output_time, dline);
+}
+
+/*
+ * Pick a random sample from the profile of scheduler s and convert it
+ * into an equivalent number of bits for the link bandwidth. The
+ * samples are in milliseconds so we need to divide by 1000.
+ * When the chosen index is at or above the profile's loss_level the
+ * packet tag is marked DIR_DROP.
+ * Returns 0 if there is no profile or it has no samples.
+ */
+static uint64_t
+extra_bits(struct mbuf *m, struct dn_schk *s)
+{
+       struct dn_profile *prof = s->profile;
+       struct dn_pkt_tag *tag;
+       uint64_t extra;
+       int slot;
+
+       if (prof == NULL || prof->samples_no == 0)
+               return 0;
+
+       slot = random() % prof->samples_no;
+       extra = div64((uint64_t)prof->samples[slot] * s->link.bandwidth, 1000);
+       if (slot < prof->loss_level)
+               return extra;
+
+       /* sample in the loss region: flag the packet for dropping */
+       tag = dn_tag_get(m);
+       if (tag)
+               tag->dn_dir = DIR_DROP;
+       return extra;
+}
+
+/*
+ * Send traffic from a scheduler instance due by 'now'.
+ *
+ * q:   list receiving the packets that leave the delay line; when NULL
+ *      a local list is used instead.
+ * si:  the scheduler instance to serve.
+ * now: current time, used to refill the credit and to key heap events.
+ *
+ * Return a pointer to the head of the queue, or NULL when the
+ * scheduler asked to be woken up later (see below).
+ */
+static struct mbuf *
+serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now)
+{
+       struct mq def_q;
+       struct dn_schk *s = si->sched;
+       struct mbuf *m = NULL;
+       int delay_line_idle = (si->dline.mq.head == NULL);
+       int done, bw;
+
+       if (q == NULL) {
+               q = &def_q;
+               q->head = NULL;
+       }
+
+       bw = s->link.bandwidth;
+       si->kflags &= ~DN_ACTIVE;
+
+       /* credit grows with elapsed time times bandwidth; none if bw is 0 */
+       if (bw > 0)
+               si->credit += (now - si->sched_time) * bw;
+       else
+               si->credit = 0;
+       si->sched_time = now;
+       done = 0;
+       while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) {
+               uint64_t len_scaled;
+
+               /*
+                * Some schedulers might want wake up the scheduler later.
+                * To support this the dequeue routine returns an mbuf
+                * with len < 0; this will result in a new wake up of the
+                * scheduler instance in -m->m_pkthdr.len ticks.
+                */
+               if (m->m_pkthdr.len < 0) {
+                       si->kflags |= DN_ACTIVE;
+                       heap_insert(&dn_cfg.evheap, now - m->m_pkthdr.len, si);
+                       if (delay_line_idle && done)
+                               transmit_event(q, &si->dline, now);
+                       return NULL;
+               }
+
+               /* a regular mbuf received */
+               done++;
+               /* cost of the packet in credit units: bits times hz */
+               len_scaled = (bw == 0) ? 0 : hz *
+                       (m->m_pkthdr.len * 8 + extra_bits(m, s));
+               si->credit -= len_scaled;
+               /* Move packet in the delay line */
+               dn_tag_get(m)->output_time = dn_cfg.curr_time + s->link.delay;
+               mq_append(&si->dline.mq, m);
+       }
+
+       /*
+        * If credit >= 0 the instance is idle, mark time.
+        * Otherwise put back in the heap, and adjust the output
+        * time of the last inserted packet, m, which was too early.
+        */
+       if (si->credit >= 0) {
+               si->idle_time = now;
+       } else {
+               uint64_t t;
+               KASSERT (bw > 0, ("bw=0 and credit<0 ?"));
+               /* ticks needed to repay the missing credit, rounded up */
+               t = div64(bw - 1 - si->credit, bw);
+               if (m)
+                       dn_tag_get(m)->output_time += t;
+               si->kflags |= DN_ACTIVE;
+               heap_insert(&dn_cfg.evheap, now + t, si);
+       }
+       /* the delay line was empty on entry and got packets: start it */
+       if (delay_line_idle && done)
+               transmit_event(q, &si->dline, now);
+       return q->head;
+}
+
+/*
+ * Support function to read the TSC (or equivalent). We use this
+ * high resolution timer to adapt the amount of work done for
+ * expiring the clock.
+ * Supports Linux and FreeBSD on both i386 and amd64 platforms.
+ * Supports the OpenWRT mips architecture.
+ *
+ * On SMP no special work is needed in these cases:
+ * - In linux 2.6 timers will always run on the same cpu that added them. See
+ * (http://book.opensourceproject.org.cn/kernel/kernel3rd/opensource/0596005652/understandlk-chp-6-sect-5.html)
+ * - FreeBSD8 has a new callout_reset_on() which specifies the cpu on which
+ *   the timer must be run
+ * - Windows runs dummynet_task() on cpu0.
+ *
+ * - Linux 2.4 doesn't assure to run a timer on the same cpu every time.
+ *
+ * Returns the counter value, or 0 when none of the platform branches
+ * below is compiled in.
+ */
+#ifdef HAVE_TSC
+uint64_t
+readTSC (void)
+{
+       uint64_t a=0;
+
+#ifdef __linux__
+       /* Linux and openwrt have a macro to read the tsc for i386 and
+        * amd64.
+        * Openwrt have patched the kernel and allow use of tsc with mips
+        * and other platforms
+        * rdtscll() is a macro defined in include/asm-xxx/msr.h,
+        * where xxx is the architecture (x86, mips).
+        */
+       rdtscll(a);
+#elif defined(_WIN32)
+       /* Microsoft recommends the use of KeQueryPerformanceCounter()
+        * instead of rdtsc().
+        */
+       KeQueryPerformanceCounter((PLARGE_INTEGER)&a);  //XXX not tested!
+#elif defined(__FreeBSD__)
+       /* FreeBSD (i386/amd64) has macro rdtsc() defined in machine/cpufunc.h.
+        * We could use the macro instead of explicit assembly XXX
+        */
+       return rdtsc();
+#endif
+       return a;
+}
+#endif /* HAVE_TSC */
+
+/*
+ * compute avg task period.
+ * We could do something more complex, possibly.
+ *
+ * Reads the TSC, stores the distance from the previous reading in
+ * dn_cfg.cycle_task and folds it into dn_cfg.cycle_task_avg.
+ * Does nothing when HAVE_TSC is not defined.
+ */
+static void
+do_update_cycle(void)
+{
+#ifdef HAVE_TSC
+       uint64_t tmp = readTSC();
+#if defined (LINUX_24) && defined(CONFIG_SMP)
+       /* on LINUX24 and SMP, we have no guarantees on which cpu runs
+        * the timer callbacks. If the difference between new and
+        * old value is negative, we assume that the values come from
+        * different cpus so we adjust 'new' accordingly.
+        */
+       if (tmp <= dn_cfg.cycle_task_new)
+               dn_cfg.cycle_task_new = tmp - dn_cfg.cycle_task;
+#endif /* LINUX_24 && CONFIG_SMP */
+       dn_cfg.cycle_task_old = dn_cfg.cycle_task_new;
+       dn_cfg.cycle_task_new = tmp;
+       dn_cfg.cycle_task = dn_cfg.cycle_task_new - dn_cfg.cycle_task_old;
+
+       /* Update the average
+        * avg = (2^N * avg + new - avg) / 2^N
+        * N==4 seems to be a good compromise between following clock
+        *      changes and filtering 'spurious' cycle_task values
+        */
+#define DN_N   4
+       dn_cfg.cycle_task_avg = (dn_cfg.cycle_task_avg << DN_N) +
+                               dn_cfg.cycle_task - dn_cfg.cycle_task_avg;
+       dn_cfg.cycle_task_avg = dn_cfg.cycle_task_avg >> DN_N;
+#undef DN_N
+
+#endif /* HAVE_TSC */
+}
+
+static void
+do_drain(void)
+{
+#ifdef HAVE_TSC
+       uint64_t dt_max;
+#endif
+       if (!dn_cfg.expire || ++dn_cfg.expire_cycle < dn_cfg.expire)
+               return;
+       /* Every 'expire' calls (never if expire == 0): check the drain routines */
+       dn_cfg.expire_cycle = 0;
+
+       dn_cfg.idle_queue_wait = 0;
+       dn_cfg.idle_si_wait = 0;
+       /* The loop body runs at least once, even if there isn't time to do it */
+#ifdef HAVE_TSC
+       dt_max = dn_cfg.cycle_task_avg * dn_cfg.drain_ratio; /* budget, x100 */
+#endif
+       for (;;) {
+               int done = 0;
+
+               if (dn_cfg.idle_queue > dn_cfg.expire_object &&
+                   dn_cfg.idle_queue_wait < dn_cfg.idle_queue) {
+                       dn_drain_queue();
+                       done = 1;
+               }
+               if (dn_cfg.idle_si > dn_cfg.expire_object &&
+                   dn_cfg.idle_si_wait < dn_cfg.idle_si) {
+                       dn_drain_scheduler();
+                       done = 1;
+               }
+               /* time to end ? */
+#ifndef HAVE_TSC
+               /* If tsc does not exist, do only one drain cycle and exit */
+               break;
+#else
+               /* Exit when nothing was done or the time budget is consumed */
+               if ( (done == 0) || 
+                    ((readTSC() -  dn_cfg.cycle_task_new) * 100 > dt_max) )
+                       break;
+#endif /* HAVE_TSC */
+       }
+}
+
+/*
+ * The timer handler for dummynet. Time is computed in ticks, but
+ * the code is tolerant to the actual rate at which this is called.
+ * Once complete, the function reschedules itself for the next tick.
+ */
+void
+dummynet_task(void *context, int pending)
+{
+       struct timeval t;
+       struct mq q = { NULL, NULL }; /* queue to accumulate results */
+
+       CURVNET_SET((struct vnet *)context);
+
+       do_update_cycle();      /* compute avg. tick duration */
+
+       DN_BH_WLOCK();
+
+       /* Update number of lost(coalesced) ticks. */
+       tick_lost += pending - 1;
+
+       getmicrouptime(&t);
+       /* Last tick duration (usec). */
+       tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 +
+       (t.tv_usec - dn_cfg.prev_t.tv_usec);
+       /* Last tick vs standard tick difference (usec). */
+       tick_delta = (tick_last * hz - 1000000) / hz;
+       /* Accumulated tick difference (usec). */
+       tick_delta_sum += tick_delta;
+
+       dn_cfg.prev_t = t;
+
+       /*
+       * Adjust curr_time if the accumulated tick difference is
+       * greater than the 'standard' tick. Since curr_time should
+       * be monotonically increasing, we do positive adjustments
+       * as required, and throttle curr_time in case of negative
+       * adjustment.
+       */
+       dn_cfg.curr_time++;
+       if (tick_delta_sum - tick >= 0) {
+               int diff = tick_delta_sum / tick;
+
+               dn_cfg.curr_time += diff;
+               tick_diff += diff;
+               tick_delta_sum %= tick;
+               tick_adjustment++;
+       } else if (tick_delta_sum + tick <= 0) {
+               dn_cfg.curr_time--;
+               tick_diff--;
+               tick_delta_sum += tick;
+               tick_adjustment++;
+       }
+
+       /* serve events whose key is not after curr_time, accumulate in q */
+       for (;;) {
+               struct dn_id *p;    /* generic parameter to handler */
+
+               if (dn_cfg.evheap.elements == 0 ||
+                   DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key))
+                       break;
+               p = HEAP_TOP(&dn_cfg.evheap)->object;
+               heap_extract(&dn_cfg.evheap, NULL);
+
+               if (p->type == DN_SCH_I) {
+                       serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time);
+               } else { /* extracted a delay line */
+                       transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time);
+               }
+       }
+       do_drain();
+
+       DN_BH_WUNLOCK();
+       dn_reschedule();
+       if (q.head != NULL)
+               dummynet_send(q.head);
+       CURVNET_RESTORE();
+}
+
+/*
+ * forward a chain of packets to the proper destination.
+ * This runs outside the dummynet lock.
+ */
+static void
+dummynet_send(struct mbuf *m)
+{
+       struct mbuf *n;
+
+       for (; m != NULL; m = n) {
+               struct ifnet *ifp = NULL;       /* gcc 3.4.6 complains */
+               struct m_tag *tag;
+               int dst;
+
+               n = m->m_nextpkt;
+               m->m_nextpkt = NULL;
+               tag = m_tag_first(m);
+               if (tag == NULL) { /* should not happen */
+                       dst = DIR_DROP;
+               } else {
+                       struct dn_pkt_tag *pkt = dn_tag_get(m);
+                       /* extract the dummynet info, rename the tag
+                        * to carry reinject info.
+                        */
+                       dst = pkt->dn_dir;
+                       ifp = pkt->ifp;
+                       tag->m_tag_cookie = MTAG_IPFW_RULE;
+                       tag->m_tag_id = 0;
+               }
+
+               switch (dst) {
+               case DIR_OUT:
+                       SET_HOST_IPLEN(mtod(m, struct ip *));
+                       ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
+                       break ;
+
+               case DIR_IN :
+                       /* put header in network format for ip_input() */
+                       //SET_NET_IPLEN(mtod(m, struct ip *));
+                       netisr_dispatch(NETISR_IP, m);
+                       break;
+
+#ifdef INET6
+               case DIR_IN | PROTO_IPV6:
+                       netisr_dispatch(NETISR_IPV6, m);
+                       break;
+
+               case DIR_OUT | PROTO_IPV6:
+                       SET_HOST_IPLEN(mtod(m, struct ip *));
+                       ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
+                       break;
+#endif
+
+               case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */
+                       if (bridge_dn_p != NULL)
+                               ((*bridge_dn_p)(m, ifp));
+                       else
+                               printf("dummynet: if_bridge not loaded\n");
+
+                       break;
+
+               case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */
+                       /*
+                        * The Ethernet code assumes the Ethernet header is
+                        * contiguous in the first mbuf header.
+                        * Insure this is true.
+                        */
+                       if (m->m_len < ETHER_HDR_LEN &&
+                           (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
+                               printf("dummynet/ether: pullup failed, "
+                                   "dropping packet\n");
+                               break;
+                       }
+                       ether_demux(m->m_pkthdr.rcvif, m);
+                       break;
+
+               case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */
+                       ether_output_frame(ifp, m);
+                       break;
+
+               case DIR_DROP:
+                       /* drop the packet after some time */
+                       FREE_PKT(m);
+                       break;
+
+               default:
+                       printf("dummynet: bad switch %d!\n", dst);
+                       FREE_PKT(m);
+                       break;
+               }
+       }
+}
+
+static inline int
+tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa)
+{
+       struct dn_pkt_tag *dt;
+       struct m_tag *mtag;
+
+       mtag = m_tag_get(PACKET_TAG_DUMMYNET,
+                   sizeof(*dt), M_NOWAIT | M_ZERO);
+       if (mtag == NULL)
+               return 1;               /* Cannot allocate the packet tag. */
+       m_tag_prepend(m, mtag);         /* Attach to mbuf chain. */
+       dt = (struct dn_pkt_tag *)(mtag + 1);   /* payload follows the tag */
+       dt->rule = fwa->rule;
+       dt->rule.info &= IPFW_ONEPASS;  /* only keep this info */
+       dt->dn_dir = dir;
+       dt->ifp = fwa->oif;
+       /* dt->output_time is updated as we move through */
+       dt->output_time = dn_cfg.curr_time;
+       return 0;
+}
+
+
+/*
+ * dummynet hook for packets.
+ * We use the argument to locate the flowset fs and the sched_set sch
+ * associated to it. Then we apply flow_mask and sched_mask to
+ * determine the queue and scheduler instances.
+ *
+ * dir         where shall we send the packet after dummynet.
+ * *m0         the mbuf with the packet
+ * fwa         ipfw args; fwa->oif is saved as the packet's 'ifp':
+ *             NULL in ip_input, destination interface in ip_output,
+ */
+int
+dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
+{
+       struct mbuf *m = *m0;
+       struct dn_fsk *fs = NULL;
+       struct dn_sch_inst *si;
+       struct dn_queue *q = NULL;      /* default */
+
+       int fs_id = (fwa->rule.info & IPFW_INFO_MASK) +
+               ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0);
+       DN_BH_WLOCK();
+       io_pkt++;
+       /* we could actually tag outside the lock, but who cares... */
+       if (tag_mbuf(m, dir, fwa))
+               goto dropit;
+       if (dn_cfg.busy) {
+               /* if the upper half is busy doing something expensive,
+                * lets queue the packet and move forward
+                */
+               mq_append(&dn_cfg.pending, m);
+               m = *m0 = NULL; /* consumed */
+               goto done; /* left on the pending queue for later */
+       }
+       /* XXX locate_flowset could be optimised with a direct ref. */
+       fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL);
+       if (fs == NULL)
+               goto dropit;    /* This queue/pipe does not exist! */
+       if (fs->sched == NULL)  /* should not happen */
+               goto dropit;
+       /*
+        * If the scheduler supports multiple queues, find the right one
+        * (otherwise it will be ignored by enqueue).
+        */
+       if (fs->sched->fp->flags & DN_MULTIQUEUE) {
+               q = ipdn_q_find(fs, &(fwa->f_id));
+               if (q == NULL)
+                       goto dropit;
+               /* The scheduler instance lookup is done only for new queue.
+                * The callback q_new() will create the scheduler instance
+                * if needed.
+                */
+               si = q->_si;
+       } else
+               si = ipdn_si_find(fs->sched, &(fwa->f_id));
+
+       if (si == NULL)
+               goto dropit;
+       if (fs->sched->fp->enqueue(si, q, m)) {
+               /* packet was dropped by enqueue() */
+               m = *m0 = NULL;
+               goto dropit;
+       }
+
+       if (si->kflags & DN_ACTIVE) {
+               m = *m0 = NULL; /* consumed */
+               goto done; /* already active, nothing to do */
+       }
+
+       /* compute the initial allowance */
+       if (si->idle_time < dn_cfg.curr_time) {
+           /* Do this only on the first packet on an idle pipe */
+           struct dn_link *p = &fs->sched->link;
+
+           si->sched_time = dn_cfg.curr_time;
+           si->credit = dn_cfg.io_fast ? p->bandwidth : 0;
+           if (p->burst) {
+               uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth;
+               if (burst > p->burst)
+                       burst = p->burst;
+               si->credit += burst;
+           }
+       }
+       /* pass through scheduler and delay line */
+       m = serve_sched(NULL, si, dn_cfg.curr_time);
+
+       /* optimization -- pass it back to ipfw for immediate send */
+       /* XXX Don't call dummynet_send() if the scheduler returns the packet
+        *     just enqueued. This avoids a lock order reversal.
+        *     
+        */
+       if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) {
+               /* fast io, rename the tag to carry reinject info. */
+               struct m_tag *tag = m_tag_first(m);
+
+               tag->m_tag_cookie = MTAG_IPFW_RULE;
+               tag->m_tag_id = 0;
+               io_pkt_fast++;
+               if (m->m_nextpkt != NULL) {
+                       printf("dummynet: fast io: pkt chain detected!\n");
+                       m->m_nextpkt = NULL;
+               }
+               m = NULL;
+       } else {
+               *m0 = NULL;
+       }
+done:
+       DN_BH_WUNLOCK();
+       if (m)
+               dummynet_send(m);
+       return 0;
+
+dropit:
+       io_pkt_drop++;
+       DN_BH_WUNLOCK();
+       if (m)
+               FREE_PKT(m);
+       *m0 = NULL;
+       return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS;
+}
diff --git a/sys/netinet/ipfw/ip_dn_private.h b/sys/netinet/ipfw/ip_dn_private.h
new file mode 100644 (file)
index 0000000..ecb4fe2
--- /dev/null
@@ -0,0 +1,419 @@
+/*-
+ * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * internal dummynet APIs.
+ *
+ * $FreeBSD: head/sys/netinet/ipfw/ip_dn_private.h 204591 2010-03-02 17:40:48Z luigi $
+ */
+
+#ifndef _IP_DN_PRIVATE_H
+#define _IP_DN_PRIVATE_H
+
+/* debugging support
+ * use ND() to remove debugging, D() to print a line,
+ * DX(level, ...) to print above a certain level
+ * If you redefine D() you are expected to redefine all.
+ */
+#ifndef D
+#define ND(fmt, ...) do {} while (0)
+#define D1(fmt, ...) do {} while (0)
+#define D(fmt, ...) printf("%-10s " fmt "\n",      \
+        __FUNCTION__, ## __VA_ARGS__)
+#define DX(lev, fmt, ...) do {              \
+        if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0)
+#endif
+
+MALLOC_DECLARE(M_DUMMYNET);
+
+#ifndef __linux__
+#define div64(a, b)  ((int64_t)(a) / (int64_t)(b))
+#endif
+
+#define DN_LOCK_INIT() do {                            \
+       mtx_init(&dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF);       \
+       mtx_init(&dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF);       \
+       } while (0)
+#define DN_LOCK_DESTROY() do {                         \
+       mtx_destroy(&dn_cfg.uh_mtx);                    \
+       mtx_destroy(&dn_cfg.bh_mtx);                    \
+       } while (0)
+#if 0 /* not used yet */
+#define DN_UH_RLOCK()          mtx_lock(&dn_cfg.uh_mtx)
+#define DN_UH_RUNLOCK()                mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_UH_WLOCK()          mtx_lock(&dn_cfg.uh_mtx)
+#define DN_UH_WUNLOCK()                mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_UH_LOCK_ASSERT()    mtx_assert(&dn_cfg.uh_mtx, MA_OWNED)
+#endif
+
+#define DN_BH_RLOCK()          mtx_lock(&dn_cfg.uh_mtx) /* NOTE(review): BH macros take uh_mtx, not bh_mtx - confirm */
+#define DN_BH_RUNLOCK()                mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_BH_WLOCK()          mtx_lock(&dn_cfg.uh_mtx)
+#define DN_BH_WUNLOCK()                mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_BH_LOCK_ASSERT()    mtx_assert(&dn_cfg.uh_mtx, MA_OWNED)
+
+SLIST_HEAD(dn_schk_head, dn_schk);
+SLIST_HEAD(dn_sch_inst_head, dn_sch_inst);
+SLIST_HEAD(dn_fsk_head, dn_fsk);
+SLIST_HEAD(dn_queue_head, dn_queue);
+SLIST_HEAD(dn_alg_head, dn_alg);
+
+struct mq {    /* a basic queue of packets*/
+        struct mbuf *head, *tail;
+};
+
+static inline void
+set_oid(struct dn_id *o, int type, int len)
+{
+        o->type = type;         /* initialise the object header 'o' */
+        o->len = len;
+        o->subtype = 0;         /* subtype is always reset */
+}
+
+uint64_t readTSC (void);
+/*
+ * see if tsc (or other timer) is supported.
+ * - FreeBSD has rdtsc macro for i386 and amd64
+ * - Linux has rdtscll and/or rdtsc (also for openWRT patched kernel source)
+ * - Windows has KeQueryPerformanceCounter() function that uses tsc or other
+ *   timer
+ */
+#if defined(rdtscll) || defined(rdtsc) || defined(_WIN32)
+#define HAVE_TSC
+#endif
+/*
+ * configuration and global data for a dummynet instance
+ *
+ * When a configuration is modified from userland, 'id' is incremented
+ * so we can use the value to check for stale pointers.
+ */
+struct dn_parms {
+       uint32_t        id;             /* configuration version */
+
+       /* defaults (sysctl-accessible) */
+       int     red_lookup_depth;
+       int     red_avg_pkt_size;
+       int     red_max_pkt_size;
+       int     hash_size;
+       int     max_hash_size;
+       long    byte_limit;             /* max queue sizes */
+       long    slot_limit;
+
+       int     io_fast;
+       int     debug;
+
+       /* timekeeping */
+       struct timeval prev_t;          /* last time dummynet_task ran */
+       struct dn_heap  evheap;         /* scheduled events */
+
+       /* counters of objects -- used for reporting space */
+       int     schk_count;
+       int     si_count;
+       int     fsk_count;
+       int     queue_count;
+
+       /* ticks and other stuff */
+       uint64_t        curr_time;      /* in ticks */
+
+       /*
+        * Variables to manage the time spent in the drain routines.
+        * drain_ratio is the max fraction of a tick (0..100) to be used
+        * for draining.
+        * We also need some variables to store the average number of
+        * timecounter ticks between calls to the periodic task, etc.
+        */
+       int drain_ratio;
+       uint64_t cycle_task_new;        /* TSC when dummynet_task() starts */
+       uint64_t cycle_task_old;        /* TSC when prev. dummynet_task() starts */
+       uint64_t cycle_task;            /* cycle_task_new - cycle_task_old */
+       uint64_t cycle_task_avg;        /* Moving average of cycle_task */
+
+       /* flowsets and schedulers are in hash tables, with 'hash_size'
+        * buckets. fshash is looked up at every packet arrival
+        * so better be generous if we expect many entries.
+        */
+       struct dn_ht    *fshash;
+       struct dn_ht    *schedhash;
+       /* list of flowsets without a scheduler -- use sch_chain */
+       struct dn_fsk_head      fsu;    /* list of unlinked flowsets */
+       struct dn_alg_head      schedlist;      /* list of algorithms */
+
+       /* Counter of idle objects -- used by drain routine
+        * We scan when idle_queue (or idle_si) > expire_object.
+        * The drain routine is called every 'expire' cycles (the counter
+        * used is expire_cycle).
+        * We can disable the expire routine by setting expire to 0.
+        * An object is kept alive for at least object_idle_tick after it
+        * becomes idle. During the scan, we count the number of objects
+        * that are idle but not ready in 'idle_si_wait' and 'idle_queue_wait'
+        */
+       int     idle_queue;
+       int     idle_queue_wait;                /* idle but not expired yet */
+       int     idle_si;
+       int     idle_si_wait;                   /* idle but not expired yet */
+       uint32_t expire_object;                 /* threshold for expires */
+       uint32_t expire;                        /* how often to expire */
+       uint32_t expire_cycle;
+       uint32_t object_idle_tick;              /* lifetime of objs */
+       uint32_t expire_object_examined;        /* Burst of object examined */
+
+       /* drain_fs and drain_sch point to the next bucket to scan when
+        * draining.
+        */
+       uint32_t drain_fs;
+       uint32_t drain_sch;
+
+       int init_done;
+
+       /* if the upper half is busy doing something long,
+        * can set the busy flag and we will enqueue packets in
+        * a queue for later processing.
+        */
+       int     busy;
+       struct  mq      pending;
+
+#ifdef _KERNEL
+       /*
+        * This file is normally used in the kernel, unless we do
+        * some userland tests, in which case we do not need a mtx.
+        * uh_mtx arbitrates between system calls and also
+        * protects fshash, schedhash and fsu.
+        * These structures are readonly for the lower half.
+        * bh_mtx protects all other structures which may be
+        * modified upon packet arrivals
+        */
+#if defined( __linux__ ) || defined( _WIN32 )
+       spinlock_t uh_mtx;
+       spinlock_t bh_mtx;
+#else
+       struct mtx uh_mtx;
+       struct mtx bh_mtx;
+#endif
+
+#endif /* _KERNEL */
+};
+
+/*
+ * Delay line, contains all packets on output from a link.
+ * Every scheduler instance has one.
+ */
+struct delay_line {
+       struct dn_id oid;
+       struct dn_sch_inst *si;
+       struct mq mq;
+};
+
+/*
+ * The kernel side of a flowset. It is linked in a hash table
+ * of flowsets, and in a list of children of their parent scheduler.
+ * qht is either the queue or (if HAVE_MASK) a hash table of queues.
+ * Note that the mask to use is the (flow_mask|sched_mask), which
+ * changes as we attach/detach schedulers. So we store it here.
+ *
+ * XXX If we want to add scheduler-specific parameters, we need to
+ * put them in external storage because the scheduler may not be
+ * available when the fsk is created.
+ */
+struct dn_fsk { /* kernel side of a flowset */
+       struct dn_fs fs;
+       SLIST_ENTRY(dn_fsk) fsk_next;   /* hash chain for fshash */
+
+       struct ipfw_flow_id fsk_mask;
+
+       /* qht is a hash table of queues, or just a single queue
+        * a bit in fs.flags tells us which one
+        */
+       struct dn_ht    *qht;
+       struct dn_schk *sched;          /* Sched we are linked to */
+       SLIST_ENTRY(dn_fsk) sch_chain;  /* list of fsk attached to sched */
+
+       /* bucket index used by drain routine to drain queues for this
+        * flowset
+        */
+       int drain_bucket;
+       /* Parameters related to RED / GRED */
+       /* original values are in dn_fs */
+       int w_q ;               /* queue weight (scaled) */
+       int max_th ;            /* maximum threshold for queue (scaled) */
+       int min_th ;            /* minimum threshold for queue (scaled) */
+       int max_p ;             /* maximum value for p_b (scaled) */
+
+       u_int c_1 ;             /* max_p/(max_th-min_th) (scaled) */
+       u_int c_2 ;             /* max_p*min_th/(max_th-min_th) (scaled) */
+       u_int c_3 ;             /* for GRED, (1-max_p)/max_th (scaled) */
+       u_int c_4 ;             /* for GRED, 1 - 2*max_p (scaled) */
+       u_int * w_q_lookup ;    /* lookup table for computing (1-w_q)^t */
+       u_int lookup_depth ;    /* depth of lookup table */
+       int lookup_step ;       /* granularity inside the lookup table */
+       int lookup_weight ;     /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
+       int avg_pkt_size ;      /* average packet size */
+       int max_pkt_size ;      /* max packet size */
+};
+
+/*
+ * A queue is created as a child of a flowset unless it belongs to
+ * a !MULTIQUEUE scheduler. It is normally in a hash table in the
+ * flowset. fs always points to the parent flowset.
+ * _si normally points to the sch_inst, unless the flowset has been
+ * detached from the scheduler -- in this case _si == NULL and we
+ * should not enqueue.
+ */
+struct dn_queue {
+       struct dn_flow ni;      /* oid, flow_id, stats */
+       struct mq mq;   /* packets queue */
+       struct dn_sch_inst *_si;        /* owner scheduler instance */
+       SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */
+       struct dn_fsk *fs;              /* parent flowset. */
+
+       /* RED parameters */
+       int avg;                /* average queue length est. (scaled) */
+       int count;              /* arrivals since last RED drop */
+       int random;             /* random value (scaled) */
+       uint64_t q_time;        /* start of queue idle time */
+
+};
+
+/*
+ * The kernel side of a scheduler. Contains the userland config,
+ * a link, pointer to extra config arguments from command line,
+ * kernel flags, and a pointer to the scheduler methods.
+ * It is stored in a hash table, and holds a list of all
+ * flowsets and scheduler instances.
+ * XXX sch must be at the beginning, see schk_hash().
+ */
+struct dn_schk {
+       struct dn_sch sch;
+       struct dn_alg *fp;      /* Pointer to scheduler functions */
+       struct dn_link link;    /* The link, embedded */
+       struct dn_profile *profile; /* delay profile, if any */
+       struct dn_id *cfg;      /* extra config arguments */
+
+       SLIST_ENTRY(dn_schk) schk_next;  /* hash chain for schedhash */
+
+       struct dn_fsk_head fsk_list;  /* all fsk linked to me */
+       struct dn_fsk *fs;      /* Flowset for !MULTIQUEUE */
+
+       /* bucket index used by the drain routine to drain the scheduler
+        * instance for this flowset.
+        */
+       int drain_bucket;
+
+       /* Hash table of all instances (through sch.sched_mask)
+        * or single instance if no mask. Always valid.
+        */
+       struct dn_ht    *siht;
+};
+
+
+/*
+ * Scheduler instance.
+ * Contains variables and all queues relative to this instance.
+ * This struct is created at runtime.
+ */
+struct dn_sch_inst {
+       struct dn_flow  ni;     /* oid, flowid and stats */
+       SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */
+       struct delay_line dline;
+       struct dn_schk *sched;  /* the template */
+       int             kflags; /* DN_ACTIVE */
+
+       int64_t credit;         /* bits I can transmit (more or less). */
+       uint64_t sched_time;    /* time link was scheduled in ready_heap */
+       uint64_t idle_time;     /* start of scheduler instance idle time */
+
+       /* q_count is the number of queues that this instance is using.
+        * The counter is incremented or decremented when
+        * a reference from the queue is created or deleted.
+        * It is used to make sure that a scheduler instance can be safely
+        * deleted by the drain routine.
+        */
+       int q_count;
+
+};
+
+
+/* kernel-side flags. Linux has DN_DELETE in fcntl.h
+ */
+enum {
+       /* 1 and 2 are reserved for the SCAN flags */
+       DN_DESTROY      = 0x0004, /* destroy */
+       DN_DELETE_FS    = 0x0008, /* destroy flowset */
+       DN_DETACH       = 0x0010,
+       DN_ACTIVE       = 0x0020, /* object is in evheap */
+       DN_F_DLINE      = 0x0040, /* object is a delay line */
+       DN_DEL_SAFE     = 0x0080, /* delete a queue only if no longer needed
+                                  * by scheduler */
+       DN_QHT_IS_Q     = 0x0100, /* in flowset, qht is a single queue */
+};
+
+extern struct dn_parms dn_cfg;
+//VNET_DECLARE(struct dn_parms, _base_dn_cfg);
+//#define dn_cfg       VNET(_base_dn_cfg)
+
+int dummynet_io(struct mbuf **, int , struct ip_fw_args *);
+void dummynet_task(void *context, int pending);
+void dn_reschedule(void);
+
+struct dn_queue *ipdn_q_find(struct dn_fsk *, struct ipfw_flow_id *);
+struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *);
+
+/*
+ * copy_range is a template for requests for ranges of pipes/queues/scheds.
+ * The number of ranges is variable and can be derived by o.len.
+ * As a default, we use a small number of entries so that the struct
+ * fits easily on the stack and is sufficient for most common requests.
+ */
+#define DEFAULT_RANGES 5
+struct copy_range {
+        struct dn_id o;
+        uint32_t       r[ 2 * DEFAULT_RANGES ];
+};
+
+struct copy_args {
+       char **start;
+       char *end;
+       int flags;
+       int type;
+       struct copy_range *extra;       /* extra filtering */
+};
+
+struct sockopt;
+int ip_dummynet_compat(struct sockopt *sopt);
+int dummynet_get(struct sockopt *sopt, void **compat);
+int dn_c_copy_q (void *_ni, void *arg);
+int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq);
+int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq);
+int dn_compat_copy_queue(struct copy_args *a, void *_o);
+int dn_compat_copy_pipe(struct copy_args *a, void *_o);
+int copy_data_helper_compat(void *_o, void *_arg);
+int dn_compat_calc_size(void);
+int do_config(void *p, int l);
+
+/* function to drain idle object */
+void dn_drain_scheduler(void);
+void dn_drain_queue(void);
+
+#endif /* _IP_DN_PRIVATE_H */
diff --git a/sys/netinet/ipfw/ip_dummynet.c b/sys/netinet/ipfw/ip_dummynet.c
new file mode 100644 (file)
index 0000000..7c63a2d
--- /dev/null
@@ -0,0 +1,2396 @@
+/*-
+ * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
+ * Portions Copyright (c) 2000 Akamba Corp.
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_dummynet.c 203340 2010-02-01 12:06:37Z luigi $");
+
+/*
+ * Configuration and internal object management for dummynet.
+ */
+
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/time.h>
+#include <sys/taskqueue.h>
+#include <net/if.h>    /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <netinet/in.h>
+#include <netinet/ip_var.h>    /* ip_output(), IP_FORWARDING */
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+
+/* which objects to copy */
+#define DN_C_LINK      0x01
+#define DN_C_SCH       0x02
+#define DN_C_FLOW      0x04
+#define DN_C_FS                0x08
+#define DN_C_QUEUE     0x10
+
+/* Argument passed to schk_new() when a scheduler is created. */
+struct schk_new_arg {
+       struct dn_alg *fp;      /* scheduler algorithm descriptor */
+       struct dn_sch *sch;     /* initial values, copied into the new dn_schk */
+};
+
+/*---- callout hooks. ----*/
+static struct callout dn_timeout;
+static struct task     dn_task;
+static struct taskqueue        *dn_tq = NULL;
+
+/*
+ * Callout handler installed by dn_reschedule(): it only enqueues
+ * dn_task on the dn_tq taskqueue. The argument is ignored.
+ * dummynet and ipfw_tick can't be static in windows.
+ */
+void
+dummynet(void * arg)
+{
+
+       (void)arg;      /* UNUSED */
+       taskqueue_enqueue(dn_tq, &dn_task);
+}
+
+/*
+ * Re-arm the dn_timeout callout so that dummynet() is invoked again,
+ * with a delay argument of 1 (presumably one tick -- see callout(9)).
+ */
+void
+dn_reschedule(void)
+{
+       callout_reset_on(&dn_timeout, 1, dummynet, NULL, 0);
+}
+/*----- end of callout hooks -----*/
+
+/*
+ * Return a scheduler descriptor given the type or name.
+ * Walks dn_cfg.schedlist and returns the first entry whose type equals
+ * 'type' or, when 'name' is not NULL, whose name matches 'name'
+ * ignoring case. Returns NULL if no entry matches.
+ */
+static struct dn_alg *
+find_sched_type(int type, char *name)
+{
+       struct dn_alg *d;
+
+       SLIST_FOREACH(d, &dn_cfg.schedlist, next) {
+               if (d->type == type || (name && !strcasecmp(d->name, name)))
+                       return d;
+       }
+       return NULL; /* not found */
+}
+
+/*
+ * Force *v into the range [lo, hi] and return the resulting value.
+ * A value below lo is replaced by dflt (itself first clamped to
+ * [lo, hi]); a value above hi is replaced by hi. When the value is
+ * changed and msg is not NULL the adjustment is reported via printf().
+ */
+int
+ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg)
+{
+       const int before = *v;
+       const char *action;
+
+       if (before >= lo && before <= hi)
+               return before;          /* already within bounds */
+
+       if (before < lo) {
+               /* the default must respect the bounds too */
+               if (dflt < lo)
+                       dflt = lo;
+               if (dflt > hi)
+                       dflt = hi;
+               *v = dflt;
+               action = "Bump";
+       } else {
+               *v = hi;
+               action = "Clamp";
+       }
+       if (msg != NULL)
+               printf("%s %s to %d (was %d)\n", action, msg, *v, before);
+       return *v;
+}
+
+/*---- flow_id mask, hash and compare functions ---*/
+/*
+ * The flow_id includes the 5-tuple, the queue/pipe number
+ * which we store in the extra area in host order,
+ * and for ipv6 also the flow_id6.
+ * XXX see if we want the tos byte (can store in 'flags')
+ */
+/*
+ * Apply 'mask' to 'id' in place and return 'id'.
+ * Ports, proto and extra are always ANDed with the mask; the address
+ * fields handled depend on the address family of 'id' (for IPv6 the
+ * addresses go through APPLY_MASK and flow_id6 is masked as well).
+ */
+static struct ipfw_flow_id *
+flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id)
+{
+       int is_v6 = IS_IP6_FLOW_ID(id);
+
+       id->dst_port &= mask->dst_port;
+       id->src_port &= mask->src_port;
+       id->proto &= mask->proto;
+       id->extra &= mask->extra;
+       if (is_v6) {
+               APPLY_MASK(&id->dst_ip6, &mask->dst_ip6);
+               APPLY_MASK(&id->src_ip6, &mask->src_ip6);
+               id->flow_id6 &= mask->flow_id6;
+       } else {
+               id->dst_ip &= mask->dst_ip;
+               id->src_ip &= mask->src_ip;
+       }
+       return id;
+}
+
+/*
+ * Computes an OR of two masks, result in dst and also returned.
+ * Which address fields are merged (IPv4 or IPv6) is decided by the
+ * address family of dst.
+ */
+static struct ipfw_flow_id *
+flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst)
+{
+       int is_v6 = IS_IP6_FLOW_ID(dst);
+
+       dst->dst_port |= src->dst_port;
+       dst->src_port |= src->src_port;
+       dst->proto |= src->proto;
+       dst->extra |= src->extra;
+       if (is_v6) {
+               /* local helper: OR the four 32-bit words of an IPv6 address */
+#define OR_MASK(_d, _s)                          \
+    (_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \
+    (_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \
+    (_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \
+    (_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3];
+               OR_MASK(&dst->dst_ip6, &src->dst_ip6);
+               OR_MASK(&dst->src_ip6, &src->src_ip6);
+#undef OR_MASK
+               dst->flow_id6 |= src->flow_id6;
+       } else {
+               dst->dst_ip |= src->dst_ip;
+               dst->src_ip |= src->src_ip;
+       }
+       return dst;
+}
+
+/*
+ * Return non-zero if any field of the mask 'm' is set, 0 if the mask
+ * is all zeroes. Only the address fields of m's own address family
+ * are examined.
+ */
+static int
+nonzero_mask(struct ipfw_flow_id *m)
+{
+       /* fields common to both address families */
+       if (m->dst_port || m->src_port || m->proto || m->extra)
+               return 1;
+       if (IS_IP6_FLOW_ID(m)) {
+               return
+                       m->dst_ip6.__u6_addr.__u6_addr32[0] ||
+                       m->dst_ip6.__u6_addr.__u6_addr32[1] ||
+                       m->dst_ip6.__u6_addr.__u6_addr32[2] ||
+                       m->dst_ip6.__u6_addr.__u6_addr32[3] ||
+                       m->src_ip6.__u6_addr.__u6_addr32[0] ||
+                       m->src_ip6.__u6_addr.__u6_addr32[1] ||
+                       m->src_ip6.__u6_addr.__u6_addr32[2] ||
+                       m->src_ip6.__u6_addr.__u6_addr32[3] ||
+                       m->flow_id6;
+       } else {
+               return m->dst_ip || m->src_ip;
+       }
+}
+
+/*
+ * Compute a 32-bit hash of a flow id by XORing shifted copies of the
+ * addresses (IPv4 or IPv6 depending on the id), the ports, the
+ * protocol and the extra field; for IPv6 flow_id6 is mixed in too.
+ * The value is not reduced to a bucket index here.
+ * XXX we may want a better hash function
+ */
+static uint32_t
+flow_id_hash(struct ipfw_flow_id *id)
+{
+    uint32_t i;
+
+    if (IS_IP6_FLOW_ID(id)) {
+       /* view each IPv6 address as four 32-bit words */
+       uint32_t *d = (uint32_t *)&id->dst_ip6;
+       uint32_t *s = (uint32_t *)&id->src_ip6;
+        i = (d[0]      ) ^ (d[1])       ^
+            (d[2]      ) ^ (d[3])       ^
+            (d[0] >> 15) ^ (d[1] >> 15) ^
+            (d[2] >> 15) ^ (d[3] >> 15) ^
+            (s[0] <<  1) ^ (s[1] <<  1) ^
+            (s[2] <<  1) ^ (s[3] <<  1) ^
+            (s[0] << 16) ^ (s[1] << 16) ^
+            (s[2] << 16) ^ (s[3] << 16) ^
+            (id->dst_port << 1) ^ (id->src_port) ^
+           (id->extra) ^
+            (id->proto ) ^ (id->flow_id6);
+    } else {
+        i = (id->dst_ip)        ^ (id->dst_ip >> 15) ^
+            (id->src_ip << 1)   ^ (id->src_ip >> 16) ^
+           (id->extra) ^
+            (id->dst_port << 1) ^ (id->src_port)     ^ (id->proto);
+    }
+    return i;
+}
+
+/*
+ * Compare two flow ids. Like bcmp, the result is 0 when they match
+ * and 1 when they differ. The address family of id1 selects which
+ * address fields are compared; an IPv4 id1 never matches an IPv6 id2.
+ */
+static int
+flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2)
+{
+       if (IS_IP6_FLOW_ID(id1)) {
+               /* the ipv6 case: addresses and flow label */
+               if (bcmp(&id1->dst_ip6, &id2->dst_ip6, sizeof(id1->dst_ip6)))
+                       return 1;
+               if (bcmp(&id1->src_ip6, &id2->src_ip6, sizeof(id1->src_ip6)))
+                       return 1;
+               if (id1->flow_id6 != id2->flow_id6)
+                       return 1;
+       } else {
+               if (IS_IP6_FLOW_ID(id2))
+                       return 1; /* different address families */
+               if (id1->dst_ip != id2->dst_ip || id1->src_ip != id2->src_ip)
+                       return 1;
+       }
+       /* fields shared by both address families */
+       if (id1->dst_port != id2->dst_port || id1->src_port != id2->src_port)
+               return 1;
+       if (id1->proto != id2->proto || id1->extra != id2->extra)
+               return 1;
+       return 0;
+}
+/*--------- end of flow-id mask, hash and compare ---------*/
+
+/*--- support functions for the qht hashtable ----
+ * Entries are hashed by flow-id
+ */
+/*
+ * Hash callback for the queue hash table. The flow id to hash is
+ * either the key itself or, with DNHT_KEY_IS_OBJ, the one stored in
+ * the dn_queue that key points to. 'arg' is not used.
+ */
+static uint32_t
+q_hash(uintptr_t key, int flags, void *arg)
+{
+       struct ipfw_flow_id *fid;
+
+       if (flags & DNHT_KEY_IS_OBJ)
+               fid = &((struct dn_queue *)key)->ni.fid;
+       else
+               fid = (struct ipfw_flow_id *)key;
+       return flow_id_hash(fid);
+}
+
+/*
+ * Match callback for the queue hash table: returns non-zero when the
+ * flow id of queue 'obj' equals the flow id described by 'key'.
+ * 'arg' is not used.
+ */
+static int
+q_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+       struct dn_queue *o = (struct dn_queue *)obj;
+       struct ipfw_flow_id *id2;
+
+       if (flags & DNHT_KEY_IS_OBJ) {
+               /* key is a dn_queue, use the flow id stored in it */
+               id2 = &((struct dn_queue *)key)->ni.fid;
+       } else {
+               id2 = (struct ipfw_flow_id *)key;
+       }
+       return (0 == flow_id_cmp(&o->ni.fid,  id2));
+}
+
+/*
+ * create a new queue instance for the given 'key'.
+ *
+ * 'arg' is a template dn_queue: template->fs is the owning flowset and
+ * template->ni.fid is the flow id handed to ipdn_si_find() to locate
+ * the scheduler instance, so the caller must have filled in both.
+ * 'key' is dereferenced (as a flow id) only when the flowset has
+ * DN_QHT_HASH set. Returns the new queue, or NULL if memory for the
+ * queue or the scheduler instance is not available.
+ */
+static void *
+q_new(uintptr_t key, int flags, void *arg)
+{   
+       struct dn_queue *q, *template = arg;
+       struct dn_fsk *fs = template->fs;
+       /* room for the scheduler's per-queue private data follows the queue */
+       int size = sizeof(*q) + fs->sched->fp->q_datalen;
+
+       q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO);
+       if (q == NULL) {
+               D("no memory for new queue");
+               return NULL;
+       }
+
+       set_oid(&q->ni.oid, DN_QUEUE, size);
+       if (fs->fs.flags & DN_QHT_HASH)
+               q->ni.fid = *(struct ipfw_flow_id *)key;
+       q->fs = fs;
+       q->_si = ipdn_si_find(q->fs->sched, &(template->ni.fid));
+       if (q->_si == NULL) {
+               D("no memory for new si");
+               free (q, M_DUMMYNET);
+               return NULL;
+       }
+
+       q->_si->q_count++;
+
+       /* let the scheduler initialize its private part, if it wants to */
+       if (fs->sched->fp->new_queue)
+               fs->sched->fp->new_queue(q);
+       dn_cfg.queue_count++;
+       dn_cfg.idle_queue++;
+       return q;
+}
+
+/*
+ * Notify schedulers that a queue is going away.
+ * If (flags & DN_DESTROY), also free the packets.
+ * The version for callbacks is called q_delete_cb().
+ * Returns 1 if the queue is NOT deleted (usually when
+ * the drain routine tries to delete a queue that a scheduler
+ * instance needs), 0 otherwise.
+ * NOTE: flag DN_DEL_SAFE means that the queue should be
+ *       deleted only if the scheduler no longer needs it
+ */
+static int
+dn_delete_queue(struct dn_queue *q, int flags)
+{
+       struct dn_fsk *fs = q->fs;
+
+       // D("fs %p si %p\n", fs, q->_si);
+       /* notify the parent scheduler that the queue is going away */
+       if (fs && fs->sched->fp->free_queue)
+               if (fs->sched->fp->free_queue(q, flags & DN_DEL_SAFE) == 1)
+                       return 1;       /* queue NOT deleted */
+       /* the queue no longer counts against its scheduler instance */
+       q->_si->q_count--;
+       q->_si = NULL;
+       if (flags & DN_DESTROY) {
+               /* a queue without packets was accounted as idle */
+               if (q->mq.head)
+                       dn_free_pkts(q->mq.head);
+               else
+                       dn_cfg.idle_queue--;
+               bzero(q, sizeof(*q));   // safety
+               free(q, M_DUMMYNET);
+               dn_cfg.queue_count--;
+       }
+       return 0;
+}
+
+/*
+ * Hash-table scan callback wrapping dn_delete_queue(); 'arg' carries
+ * the flags. Returns DNHT_SCAN_DEL when DN_DESTROY is set, 0 otherwise.
+ * NOTE(review): the return value of dn_delete_queue() is ignored, so
+ * DNHT_SCAN_DEL is returned even if the scheduler refused the delete.
+ */
+static int
+q_delete_cb(void *q, void *arg)
+{
+       int flags = (int)(uintptr_t)arg;
+       dn_delete_queue(q, flags);
+       return (flags & DN_DESTROY) ? DNHT_SCAN_DEL : 0;
+}
+
+/*
+ * calls dn_delete_queue/q_delete_cb on all queues,
+ * which notifies the parent scheduler and possibly drains packets.
+ * flags & DN_DESTROY: drains queues and destroy qht;
+ * Does nothing if the flowset has no qht.
+ */
+static void
+qht_delete(struct dn_fsk *fs, int flags)
+{
+       ND("fs %d start flags %d qht %p",
+               fs->fs.fs_nr, flags, fs->qht);
+       if (!fs->qht)
+               return;
+       if (fs->fs.flags & DN_QHT_HASH) {
+               /* qht is a real hash table of queues */
+               dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags);
+               if (flags & DN_DESTROY) {
+                       dn_ht_free(fs->qht, 0);
+                       fs->qht = NULL;
+               }
+       } else {
+               /* without DN_QHT_HASH, qht points to the single queue */
+               dn_delete_queue((struct dn_queue *)(fs->qht), flags);
+               if (flags & DN_DESTROY)
+                       fs->qht = NULL;
+       }
+}
+
+/*
+ * Find and possibly create the queue for a MULTIQUEUE scheduler.
+ * We never call it for !MULTIQUEUE (the queue is in the sch_inst).
+ *
+ * 'fs' is the flowset and 'id' the flow id of the packet. With
+ * DN_QHT_HASH the queue is looked up (and created on demand) in the
+ * flowset hash table, using 'id' masked with the flowset mask;
+ * otherwise the flowset owns a single queue, created on first use.
+ * Returns NULL if the hash table cannot be allocated; otherwise the
+ * result of dn_ht_find()/q_new().
+ */
+struct dn_queue *
+ipdn_q_find(struct dn_fsk *fs, struct ipfw_flow_id *id)
+{
+       struct dn_queue template;
+
+       template.fs = fs;
+       /*
+        * q_new() hands template.ni.fid to ipdn_si_find() to select the
+        * scheduler instance, so it must hold the (unmasked) flow id of
+        * the packet. It used to be left uninitialized, which made
+        * q_new() read garbage from the stack.
+        */
+       if (id != NULL)
+               template.ni.fid = *id;
+       else
+               bzero(&template.ni.fid, sizeof(template.ni.fid));
+
+       if (fs->fs.flags & DN_QHT_HASH) {
+               struct ipfw_flow_id masked_id;
+               if (fs->qht == NULL) {
+                       /* first use: create the table lazily */
+                       fs->qht = dn_ht_init(NULL, fs->fs.buckets,
+                               offsetof(struct dn_queue, q_next),
+                               q_hash, q_match, q_new);
+                       if (fs->qht == NULL)
+                               return NULL;
+               }
+               /* mask a copy so the caller's flow id is preserved */
+               masked_id = *id;
+               flow_id_mask(&fs->fsk_mask, &masked_id);
+               return dn_ht_find(fs->qht, (uintptr_t)&masked_id,
+                       DNHT_INSERT, &template);
+       } else {
+               /* no hash: qht points directly to the only queue */
+               if (fs->qht == NULL)
+                       fs->qht = q_new(0, 0, &template);
+               return (struct dn_queue *)fs->qht;
+       }
+}
+/*--- end of queue hash table ---*/
+
+/*--- support functions for the sch_inst hashtable ----
+ *
+ * These are hashed by flow-id
+ */
+/*
+ * Hash callback for the scheduler-instance hash table. The flow id is
+ * the key itself or, with DNHT_KEY_IS_OBJ, the one stored in the
+ * dn_sch_inst that key points to. 'arg' is not used.
+ */
+static uint32_t
+si_hash(uintptr_t key, int flags, void *arg)
+{
+       /* compute the hash slot from the flow id */
+       struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ?
+               &((struct dn_sch_inst *)key)->ni.fid :
+               (struct ipfw_flow_id *)key;
+
+       return flow_id_hash(id);
+}
+
+/*
+ * Match callback for the scheduler-instance hash table: non-zero when
+ * the flow id of instance 'obj' equals the one described by 'key'
+ * (a flow id, or a dn_sch_inst with DNHT_KEY_IS_OBJ). 'arg' is unused.
+ */
+static int
+si_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+       struct dn_sch_inst *si = obj;
+       struct ipfw_flow_id *wanted;
+
+       if (flags & DNHT_KEY_IS_OBJ)
+               wanted = &((struct dn_sch_inst *)key)->ni.fid;
+       else
+               wanted = (struct ipfw_flow_id *)key;
+       return (flow_id_cmp(&si->ni.fid, wanted) == 0);
+}
+
+static int si_reset_credit(void *_si, void *arg); // XXX si_new use this
+
+/*
+ * create a new instance for the given 'key'
+ * Allocate memory for instance, delay line and scheduler private data.
+ *
+ * 'arg' is the parent dn_schk. 'key' is dereferenced (as a flow id)
+ * only when the scheduler has DN_HAVE_MASK set. Returns the new
+ * instance, or NULL if the allocation or the scheduler's new_sched
+ * hook fails.
+ */
+static void *
+si_new(uintptr_t key, int flags, void *arg)
+{
+       struct dn_schk *s = arg;
+       struct dn_sch_inst *si;
+       /* scheduler private data is allocated right after the instance */
+       int l = sizeof(*si) + s->fp->si_datalen;
+
+       si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO);
+       if (si == NULL)
+               goto error;
+
+       /* Set length only for the part passed up to userland. */
+       set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow));
+       set_oid(&(si->dline.oid), DN_DELAY_LINE,
+               sizeof(struct delay_line));
+       /* mark si and dline as outside the event queue */
+       si->ni.oid.id = si->dline.oid.id = -1;
+
+       si->sched = s;
+       si->dline.si = si;
+
+       /* a non-zero return from the new_sched hook is an error */
+       if (s->fp->new_sched && s->fp->new_sched(si)) {
+               D("new_sched error");
+               goto error;
+       }
+       if (s->sch.flags & DN_HAVE_MASK)
+               si->ni.fid = *(struct ipfw_flow_id *)key;
+
+       si_reset_credit(si, NULL);
+       dn_cfg.si_count++;
+       dn_cfg.idle_si++;
+       return si;
+
+error:
+       if (si) {
+               bzero(si, sizeof(*si)); // safety
+               free(si, M_DUMMYNET);
+       }
+        return NULL;
+}
+
+/*
+ * Callback from siht to delete all scheduler instances. Remove
+ * si and delay line from the system heap, destroy all queues.
+ * We assume that all flowset have been notified and do not
+ * point to us anymore.
+ * 'arg' is not used. Always returns DNHT_SCAN_DEL.
+ */
+static int
+si_destroy(void *_si, void *arg)
+{
+       struct dn_sch_inst *si = _si;
+       struct dn_schk *s = si->sched;
+       struct delay_line *dl = &si->dline;
+
+       if (dl->oid.subtype) /* remove delay line from event heap */
+               heap_extract(&dn_cfg.evheap, dl);
+       /* an instance with no queued data was accounted as idle */
+       if (si->ni.length == 0)
+               dn_cfg.idle_si--;
+       dn_free_pkts(dl->mq.head);      /* drain delay line */
+       if (si->kflags & DN_ACTIVE) /* remove si from event heap */
+               heap_extract(&dn_cfg.evheap, si);
+       /* give the scheduler a chance to release its private data */
+       if (s->fp->free_sched)
+               s->fp->free_sched(si);
+       bzero(si, sizeof(*si)); /* safety */
+       free(si, M_DUMMYNET);
+       dn_cfg.si_count--;
+       return DNHT_SCAN_DEL;
+}
+
+/*
+ * Find the scheduler instance for this packet. If we need to apply
+ * a mask, do on a local copy of the flow_id to preserve the original.
+ * Assume siht is always initialized if we have a mask.
+ * Without a mask, siht points directly to the single instance, which
+ * is created on first use; 'id' is not looked at in that case.
+ */
+struct dn_sch_inst *
+ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id)
+{
+
+       if (s->sch.flags & DN_HAVE_MASK) {
+               struct ipfw_flow_id id_t = *id;
+               flow_id_mask(&s->sch.sched_mask, &id_t);
+               /* look up the masked id, creating the instance if needed */
+               return dn_ht_find(s->siht, (uintptr_t)&id_t,
+                       DNHT_INSERT, s);
+       }
+       if (!s->siht)
+               s->siht = si_new(0, 0, s);
+       return (struct dn_sch_inst *)s->siht;
+}
+
+/*
+ * callback to flush credit for the scheduler instance.
+ * Sets idle_time to the current dummynet time and credit to the link
+ * burst, plus the link bandwidth when dn_cfg.io_fast is set.
+ * 'arg' is not used. Always returns 0.
+ */
+static int
+si_reset_credit(void *_si, void *arg)
+{
+       struct dn_sch_inst *si = _si;
+       struct dn_link *p = &si->sched->link;
+
+       si->idle_time = dn_cfg.curr_time;
+       si->credit = p->burst + (dn_cfg.io_fast ?  p->bandwidth : 0);
+       return 0;
+}
+
+/*
+ * Reset the credit of every instance of scheduler 's': all entries of
+ * the hash table when the scheduler has a mask, otherwise the single
+ * instance siht points to (if it exists).
+ */
+static void
+schk_reset_credit(struct dn_schk *s)
+{
+       if ((s->sch.flags & DN_HAVE_MASK) != 0) {
+               dn_ht_scan(s->siht, si_reset_credit, NULL);
+               return;
+       }
+       if (s->siht != NULL)
+               si_reset_credit(s->siht, NULL);
+}
+/*---- end of sch_inst hashtable ---------------------*/
+
+/*-------------------------------------------------------
+ * flowset hash (fshash) support. Entries are hashed by fs_nr.
+ * New allocations are put in the fsunlinked list, from which
+ * they are removed when they point to a specific scheduler.
+ */
+/*
+ * Hash callback for fshash. The value hashed is the flowset number:
+ * the key itself or, with DNHT_KEY_IS_OBJ, fs.fs_nr of the dn_fsk
+ * that key points to. 'arg' is not used.
+ */
+static uint32_t
+fsk_hash(uintptr_t key, int flags, void *arg)
+{
+       uint32_t nr;
+
+       if (flags & DNHT_KEY_IS_OBJ)
+               nr = ((struct dn_fsk *)key)->fs.fs_nr;
+       else
+               nr = key;
+       return (nr ^ (nr >> 4) ^ (nr >> 8));
+}
+
+/*
+ * Match callback for fshash: non-zero when the number of flowset
+ * 'obj' equals the number described by 'key' (the number itself, or a
+ * dn_fsk with DNHT_KEY_IS_OBJ). 'arg' is not used.
+ */
+static int
+fsk_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+       struct dn_fsk *fs = obj;
+       int i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+               ((struct dn_fsk *)key)->fs.fs_nr;
+
+       return (fs->fs.fs_nr == i);
+}
+
+/*
+ * Allocate a zeroed flowset and put it on the list of unlinked
+ * flowsets (dn_cfg.fsu). None of the arguments is used here.
+ * Returns the new flowset, or NULL if memory is not available.
+ */
+static void *
+fsk_new(uintptr_t key, int flags, void *arg)
+{
+       struct dn_fsk *fs;
+
+       fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO);
+       if (fs) {
+               set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs));
+               dn_cfg.fsk_count++;
+               fs->drain_bucket = 0;
+               /* not attached to any scheduler yet */
+               SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain);
+       }
+       return fs;
+}
+
+/*
+ * detach flowset from its current scheduler. Flags as follows:
+ * DN_DETACH removes from the fsk_list
+ * DN_DESTROY deletes individual queues
+ * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked).
+ * DN_DELETE_FS implies DN_DESTROY.
+ */
+static void
+fsk_detach(struct dn_fsk *fs, int flags)
+{
+       if (flags & DN_DELETE_FS)
+               flags |= DN_DESTROY;
+       ND("fs %d from sched %d flags %s %s %s",
+               fs->fs.fs_nr, fs->fs.sched_nr,
+               (flags & DN_DELETE_FS) ? "DEL_FS":"",
+               (flags & DN_DESTROY) ? "DEL":"",
+               (flags & DN_DETACH) ? "DET":"");
+       if (flags & DN_DETACH) { /* detach from the list */
+               struct dn_fsk_head *h;
+               /* a flowset without scheduler sits on the unlinked list */
+               h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu;
+               SLIST_REMOVE(h, fs, dn_fsk, sch_chain);
+       }
+       /* Free the RED parameters, they will be recomputed on
+        * subsequent attach if needed.
+        */
+       if (fs->w_q_lookup)
+               free(fs->w_q_lookup, M_DUMMYNET);
+       fs->w_q_lookup = NULL;
+       qht_delete(fs, flags);
+       /* let the scheduler drop its per-flowset state */
+       if (fs->sched && fs->sched->fp->free_fsk)
+               fs->sched->fp->free_fsk(fs);
+       fs->sched = NULL;
+       if (flags & DN_DELETE_FS) {
+               bzero(fs, sizeof(*fs)); /* safety */
+               free(fs, M_DUMMYNET);
+               dn_cfg.fsk_count--;
+       } else {
+               /* keep the flowset around, on the unlinked list */
+               SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain);
+       }
+}
+
+/*
+ * Detach or destroy every flowset found on list 'h'. Each entry is
+ * taken off the head of the list and handed to fsk_detach() together
+ * with 'flags':
+ * DN_DESTROY: flush all queues
+ * DN_DELETE_FS:       DN_DESTROY + destroy flowset
+ *     DN_DELETE_FS implies DN_DESTROY
+ */
+static void
+fsk_detach_list(struct dn_fsk_head *h, int flags)
+{
+       struct dn_fsk *fs;
+       int n; /* only for stats */
+
+       ND("head %p flags %x", h, flags);
+       for (n = 0; (fs = SLIST_FIRST(h)) != NULL; n++) {
+               SLIST_REMOVE_HEAD(h, sch_chain);
+               fsk_detach(fs, flags);
+       }
+       ND("done %d flowsets", n);
+}
+
+/*
+ * Handler for 'queue X delete': take flowset 'i' out of fshash,
+ * delete all of its queues and destroy the flowset itself.
+ * The DN_BH write lock is taken here unless 'locked' is non-zero.
+ * Returns 0 on success, EINVAL if there is no such flowset.
+ */
+static int
+delete_fs(int i, int locked)
+{
+       struct dn_fsk *fs;
+       int err = EINVAL;       /* assume the flowset does not exist */
+
+       if (!locked)
+               DN_BH_WLOCK();
+       fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL);
+       /* release the table itself once it holds no entries */
+       if (dn_ht_entries(dn_cfg.fshash) == 0) {
+               dn_ht_free(dn_cfg.fshash, 0);
+               dn_cfg.fshash = NULL;
+       }
+       ND("fs %d found %p", i, fs);
+       if (fs != NULL) {
+               fsk_detach(fs, DN_DETACH | DN_DELETE_FS);
+               err = 0;
+       }
+       if (!locked)
+               DN_BH_WUNLOCK();
+       return err;
+}
+
+/*----- end of flowset hashtable support -------------*/
+
+/*------------------------------------------------------------
+ * Scheduler hash. When searching by index we pass sched_nr,
+ * otherwise we pass struct dn_sch * which is the first field in
+ * struct dn_schk so we can cast between the two. We use this trick
+ * in the create phase (but it should be fixed).
+ */
+/*
+ * Hash callback for schedhash. The value hashed is the scheduler
+ * number: the key itself or, with DNHT_KEY_IS_OBJ, sch.sched_nr of the
+ * object that key points to. '_arg' is not used.
+ */
+static uint32_t
+schk_hash(uintptr_t key, int flags, void *_arg)
+{
+       uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+               ((struct dn_schk *)key)->sch.sched_nr;
+       return ( (i>>8)^(i>>4)^i );
+}
+
+/*
+ * Match callback for schedhash: non-zero when the number of scheduler
+ * 'obj' equals the number described by 'key' (the number itself, or
+ * an object with DNHT_KEY_IS_OBJ). '_arg' is not used.
+ */
+static int
+schk_match(void *obj, uintptr_t key, int flags, void *_arg)
+{
+       struct dn_schk *s = (struct dn_schk *)obj;
+       int i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+               ((struct dn_schk *)key)->sch.sched_nr;
+       return (s->sch.sched_nr == i);
+}
+
+/*
+ * Create the entry and initialize with the sched hash if needed.
+ * Leave s->fp unset so we can tell whether a dn_ht_find() returns
+ * a new object or a previously existing one.
+ *
+ * 'arg' is a struct schk_new_arg carrying the algorithm descriptor
+ * and the initial dn_sch values. Returns the new scheduler, or NULL
+ * if memory for it or for its instance hash table is not available.
+ */
+static void *
+schk_new(uintptr_t key, int flags, void *arg)
+{
+       struct schk_new_arg *a = arg;
+       struct dn_schk *s;
+       /* algorithm private data is allocated right after the dn_schk */
+       int l = sizeof(*s) +a->fp->schk_datalen;
+
+       s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO);
+       if (s == NULL)
+               return NULL;
+       set_oid(&s->link.oid, DN_LINK, sizeof(s->link));
+       s->sch = *a->sch; // copy initial values
+       s->link.link_nr = s->sch.sched_nr;
+       SLIST_INIT(&s->fsk_list);
+       /* initialize the hash table or create the single instance */
+       s->fp = a->fp;  /* si_new needs this */
+       s->drain_bucket = 0;
+       if (s->sch.flags & DN_HAVE_MASK) {
+               s->siht = dn_ht_init(NULL, s->sch.buckets,
+                       offsetof(struct dn_sch_inst, si_next),
+                       si_hash, si_match, si_new);
+               if (s->siht == NULL) {
+                       free(s, M_DUMMYNET);
+                       return NULL;
+               }
+       }
+       s->fp = NULL;   /* mark as a new scheduler */
+       dn_cfg.schk_count++;
+       return s;
+}
+
+/*
+ * Callback for sched delete. Notify all attached flowsets to
+ * detach from the scheduler, destroy the internal flowset, and
+ * all instances. The scheduler goes away too.
+ * arg is 0 (only detach flowsets and destroy instances)
+ * DN_DESTROY (detach & delete queues, delete schk)
+ * or DN_DELETE_FS (delete queues and flowsets, delete schk)
+ * NOTE(review): any non-zero arg is passed on as DN_DESTROY only, so
+ * DN_DELETE_FS does not reach fsk_detach_list() from here.
+ * Always returns DNHT_SCAN_DEL.
+ */
+static int
+schk_delete_cb(void *obj, void *arg)
+{
+       struct dn_schk *s = obj;
+#if 0
+       int a = (int)arg;
+       ND("sched %d arg %s%s",
+               s->sch.sched_nr,
+               a&DN_DESTROY ? "DEL ":"",
+               a&DN_DELETE_FS ? "DEL_FS":"");
+#endif
+       fsk_detach_list(&s->fsk_list, arg ? DN_DESTROY : 0);
+       /* no more flowset pointing to us now */
+       if (s->sch.flags & DN_HAVE_MASK) {
+               /* siht is a hash table of instances */
+               dn_ht_scan(s->siht, si_destroy, NULL);
+               dn_ht_free(s->siht, 0);
+       }
+       else if (s->siht)
+               si_destroy(s->siht, NULL);      /* the single instance */
+       if (s->profile) {
+               free(s->profile, M_DUMMYNET);
+               s->profile = NULL;
+       }
+       s->siht = NULL;
+       /* s->fp is dereferenced unconditionally: it must be set here */
+       if (s->fp->destroy)
+               s->fp->destroy(s);
+       bzero(s, sizeof(*s));   // safety
+       free(obj, M_DUMMYNET);
+       dn_cfg.schk_count--;
+       return DNHT_SCAN_DEL;
+}
+
+/*
+ * called on a 'sched X delete' command. Deletes a single scheduler.
+ * This is done by removing from the schedhash, unlinking all
+ * flowsets and deleting their traffic.
+ * Returns 0 on success, EINVAL if scheduler 'i' does not exist.
+ * No lock is taken here; delete_fs() is called with locked=1.
+ */
+static int
+delete_schk(int i)
+{
+       struct dn_schk *s;
+
+       s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL);
+       /* release the table itself once it holds no entries */
+       if (dn_ht_entries(dn_cfg.schedhash) == 0) {
+               dn_ht_free(dn_cfg.schedhash, 0);
+               dn_cfg.schedhash = NULL;
+       }
+       ND("%d %p", i, s);
+       if (!s)
+               return EINVAL;
+       delete_fs(i + DN_MAX_ID, 1); /* first delete internal fs */
+       /* then detach flowsets, delete traffic */
+       schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY);
+       return 0;
+}
+/*--- end of schk hashtable support ---*/
+
+/*
+ * Copy object '_o' (which starts with a struct dn_id giving its type
+ * and length) to *start and advance *start by the object length.
+ * 'end' is the end of the destination buffer; 'msg' and 'i' are only
+ * used in diagnostics. Returns 0 on success, 1 if the object does not
+ * fit or has a zero length or type.
+ */
+static int
+copy_obj(char **start, char *end, void *_o, const char *msg, int i)
+{
+       struct dn_id *o = _o;
+       int have = end - *start;        /* room left in the buffer */
+
+       if (have < o->len || o->len == 0 || o->type == 0) {
+               D("(WARN) type %d %s %d have %d need %d",
+                       o->type, msg, i, have, o->len);
+               return 1;
+       }
+       ND("type %d %s %d len %d", o->type, msg, i, o->len);
+       bcopy(_o, *start, o->len);
+       /* the fixups below are applied to the copy, not to the original */
+       if (o->type == DN_LINK) {
+               /* Adjust burst parameter for link */
+               struct dn_link *l = (struct dn_link *)*start;
+               l->burst =  div64(l->burst, 8 * hz);
+       } else if (o->type == DN_SCH) {
+               /* Set id->id to the number of instances */
+               struct dn_schk *s = _o;
+               struct dn_id *id = (struct dn_id *)(*start);
+               id->id = (s->sch.flags & DN_HAVE_MASK) ?
+                       dn_ht_entries(s->siht) : (s->siht ? 1 : 0);
+       }
+       *start += o->len;
+       return 0;
+}
+
+/* Specific function to copy a queue.
+ * Copies only the user-visible part of a queue (which is in
+ * a struct dn_flow), and sets len accordingly.
+ * Returns 0 on success, 1 if there is not enough room, the object
+ * length is zero or the object is not a DN_QUEUE.
+ */
+static int
+copy_obj_q(char **start, char *end, void *_o, const char *msg, int i)
+{
+       struct dn_id *o = _o;
+       int have = end - *start;        /* room left in the buffer */
+       int len = sizeof(struct dn_flow); /* see above comment */
+
+       if (have < len || o->len == 0 || o->type != DN_QUEUE) {
+               D("ERROR type %d %s %d have %d need %d",
+                       o->type, msg, i, have, len);
+               return 1;
+       }
+       ND("type %d %s %d len %d", o->type, msg, i, len);
+       bcopy(_o, *start, len);
+       /* the copy advertises only the length actually copied */
+       ((struct dn_id*)(*start))->len = len;
+       *start += len;
+       return 0;
+}
+
+/*
+ * Scan callback: copy the dn_flow part of queue 'obj' into the buffer
+ * described by 'arg' (a struct copy_args), then relabel the copy as
+ * DN_FLOW and store the hash of its flow id in oid.id.
+ * Returns 0, or DNHT_SCAN_END if the copy fails.
+ */
+static int
+copy_q_cb(void *obj, void *arg)
+{
+       struct dn_queue *q = obj;
+       struct copy_args *a = arg;
+       /* remember where the copy will land, before *a->start moves */
+       struct dn_flow *ni = (struct dn_flow *)(*a->start);
+        if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1))
+                return DNHT_SCAN_END;
+        ni->oid.type = DN_FLOW; /* override the DN_QUEUE */
+        ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL);
+        return 0;
+}
+
+/*
+ * Copy all the queues of flowset 'fs' into the buffer described by
+ * 'a': every entry of the hash table with DN_QHT_HASH, otherwise the
+ * single queue qht points to. 'flags' is not used. Always returns 0.
+ */
+static int
+copy_q(struct copy_args *a, struct dn_fsk *fs, int flags)
+{
+       if (fs->qht != NULL) {
+               if ((fs->fs.flags & DN_QHT_HASH) == 0)
+                       copy_q_cb(fs->qht, a);
+               else
+                       dn_ht_scan(fs->qht, copy_q_cb, a);
+       }
+       return 0;
+}
+
+/*
+ * This routine only copies the initial part of a profile ? XXX
+ * XXX marta: I think this routine is called to print a summary
+ * of the pipe configuration and does not need to show the
+ * profile samples list.
+ *
+ * Copies sizeof(struct dn_profile) bytes of 'p' to *a->start, sets the
+ * length of the copy to that size and advances *a->start.
+ * Returns 0 on success or when p is NULL, 1 if there is no room.
+ */
+static int
+copy_profile(struct copy_args *a, struct dn_profile *p)
+{
+       int have = a->end - *a->start;  /* room left in the buffer */
+       /* XXX here we check for max length */
+       int profile_len = sizeof(struct dn_profile);
+
+       if (p == NULL)
+               return 0;
+       if (have < profile_len) {
+               D("error have %d need %d", have, profile_len);
+               return 1;
+       }
+       bcopy(p, *a->start, profile_len);
+       ((struct dn_id *)(*a->start))->len = profile_len;
+       *a->start += profile_len;
+       return 0;
+}
+
+/*
+ * Copy flowset 'fs' into the buffer described by 'a' and store the
+ * number of its queues in the oid.id of the copy. When 'flags' is
+ * non-zero the queues are copied as well.
+ * Returns 0 on success or when fs is NULL, DNHT_SCAN_END if the
+ * flowset cannot be copied.
+ */
+static int
+copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags)
+{
+       /* remember where the copy will land, before *a->start moves */
+       struct dn_fs *ufs = (struct dn_fs *)(*a->start);
+       if (!fs)
+               return 0;
+       ND("flowset %d", fs->fs.fs_nr);
+       if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr))
+               return DNHT_SCAN_END;
+       ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ?
+               dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0);
+       if (flags) {    /* copy queues */
+               copy_q(a, fs, 0);
+       }
+       return 0;
+}
+
+/*
+ * Scan callback: copy the ni part of scheduler instance 'obj' into
+ * the buffer described by 'arg' (a struct copy_args), then relabel
+ * the copy as DN_FLOW and store the hash of the instance in oid.id.
+ * Returns 0, or DNHT_SCAN_END if the copy fails.
+ */
+static int
+copy_si_cb(void *obj, void *arg)
+{
+       struct dn_sch_inst *si = obj;
+       struct copy_args *a = arg;
+       /* remember where the copy will land, before *a->start moves */
+       struct dn_flow *ni = (struct dn_flow *)(*a->start);
+       if (copy_obj(a->start, a->end, &si->ni, "inst",
+                       si->sched->sch.sched_nr))
+               return DNHT_SCAN_END;
+       ni->oid.type = DN_FLOW; /* override the DN_SCH_I */
+       ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL);
+       return 0;
+}
+
+/*
+ * Copy all the instances of scheduler 's' into the buffer described
+ * by 'a': every entry of the hash table when the scheduler has a
+ * mask, otherwise the single instance siht points to (if any).
+ * 'flags' is not used. Always returns 0.
+ */
+static int
+copy_si(struct copy_args *a, struct dn_schk *s, int flags)
+{
+       if ((s->sch.flags & DN_HAVE_MASK) != 0) {
+               dn_ht_scan(s->siht, copy_si_cb, a);
+               return 0;
+       }
+       if (s->siht != NULL)
+               copy_si_cb(s->siht, a);
+       return 0;
+}
+
+/*
+ * compute a list of children of a scheduler and copy up
+ *
+ * Writes at *a->start a struct dn_id of type DN_TEXT followed by the
+ * numbers (uint32_t) of the flowsets attached to 's' whose fs_nr is
+ * below DN_MAX_ID, and advances *a->start past them.
+ * 'flags' is not used. Returns 0 on success, DNHT_SCAN_END if there
+ * is not enough room in the buffer.
+ */
+static int
+copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags)
+{
+       struct dn_fsk *fs;
+       struct dn_id *o;
+       uint32_t *p;
+
+       /* first pass: count the entries to learn the space needed */
+       int n = 0, space = sizeof(*o);
+       SLIST_FOREACH(fs, &s->fsk_list, sch_chain) {
+               if (fs->fs.fs_nr < DN_MAX_ID)
+                       n++;
+       }
+       space += n * sizeof(uint32_t);
+       DX(3, "sched %d has %d flowsets", s->sch.sched_nr, n);
+       if (a->end - *(a->start) < space)
+               return DNHT_SCAN_END;
+       o = (struct dn_id *)(*(a->start));
+       o->len = space;
+       *a->start += o->len;
+       o->type = DN_TEXT;
+       /* second pass: the numbers are stored right after the header */
+       p = (uint32_t *)(o+1);
+       SLIST_FOREACH(fs, &s->fsk_list, sch_chain)
+               if (fs->fs.fs_nr < DN_MAX_ID)
+                       *p++ = fs->fs.fs_nr;
+       return 0;
+}
+
+/*
+ * Per-object export helper; the (void *, void *) signature matches the
+ * other scan callbacks in this file (presumably used with dn_ht_scan).
+ *
+ * _arg is a struct copy_args. a->extra holds the request: a header
+ * (a->extra->o) followed by pairs of uint32_t [low, high] ranges that
+ * end a->extra->o.len bytes from the start of a->extra.
+ * _o is a struct dn_schk when a->type is DN_LINK or DN_SCH, and a
+ * struct dn_fsk when a->type is DN_FS.
+ *
+ * The object is exported only if its number falls inside one of the
+ * ranges (bounds inclusive); at most one range is used per object.
+ * Returns DNHT_SCAN_END as soon as one of the checked copy helpers
+ * fails, 0 otherwise (including when the object is skipped).
+ */
+static int
+copy_data_helper(void *_o, void *_arg)
+{
+       struct copy_args *a = _arg;
+       uint32_t *r = a->extra->r; /* start of first range */
+       uint32_t *lim;  /* first invalid pointer */
+       int n;
+
+       lim = (uint32_t *)((char *)(a->extra) + a->extra->o.len);
+
+       if (a->type == DN_LINK || a->type == DN_SCH) {
+               /* pipe|sched show, we receive a dn_schk */
+               struct dn_schk *s = _o;
+
+               n = s->sch.sched_nr;
+               /* 'sched show' only lists numbers below DN_MAX_ID ... */
+               if (a->type == DN_SCH && n >= DN_MAX_ID)
+                       return 0;       /* not a scheduler */
+               /* ... while 'pipe show' only lists numbers above it */
+               if (a->type == DN_LINK && n <= DN_MAX_ID)
+                   return 0;   /* not a pipe */
+
+               /* see if the object is within one of our ranges */
+               for (;r < lim; r += 2) {
+                       if (n < r[0] || n > r[1])
+                               continue;
+                       /* Found a valid entry, copy and we are done */
+                       if (a->flags & DN_C_LINK) {
+                               if (copy_obj(a->start, a->end,
+                                   &s->link, "link", n))
+                                       return DNHT_SCAN_END;
+                               if (copy_profile(a, s->profile))
+                                       return DNHT_SCAN_END;
+                               if (copy_flowset(a, s->fs, 0))
+                                       return DNHT_SCAN_END;
+                       }
+                       if (a->flags & DN_C_SCH) {
+                               if (copy_obj(a->start, a->end,
+                                   &s->sch, "sched", n))
+                                       return DNHT_SCAN_END;
+                               /* list all attached flowsets */
+                               if (copy_fsk_list(a, s, 0))
+                                       return DNHT_SCAN_END;
+                       }
+                       /* the result of copy_si is not checked here */
+                       if (a->flags & DN_C_FLOW)
+                               copy_si(a, s, 0);
+                       break;
+               }
+       } else if (a->type == DN_FS) {
+               /* queue show, skip internal flowsets */
+               struct dn_fsk *fs = _o;
+
+               n = fs->fs.fs_nr;
+               if (n >= DN_MAX_ID)
+                       return 0;
+               /* see if the object is within one of our ranges */
+               for (;r < lim; r += 2) {
+                       if (n < r[0] || n > r[1])
+                               continue;
+                       if (copy_flowset(a, fs, 0))
+                               return DNHT_SCAN_END;
+                       /* queues are exported here; result not checked */
+                       copy_q(a, fs, 0);
+                       break; /* we are done */
+               }
+       }
+       return 0;
+}
+
+/*
+ * Look up scheduler number i in dn_cfg.schedhash without creating it.
+ * Returns whatever dn_ht_find returns for a plain lookup.
+ */
+static inline struct dn_schk *
+locate_scheduler(int i)
+{
+       struct dn_schk *found;
+
+       found = dn_ht_find(dn_cfg.schedhash, i, 0, NULL);
+       return found;
+}
+
+/*
+ * Derive the RED/GRED working parameters of flowset fs from its
+ * user-supplied configuration (fs->fs) and rebuild the (1 - w_q)^x
+ * lookup table. RED parameters are in fixed point arithmetic.
+ *
+ * Every value used as a divisor (red_lookup_depth, w_q, the threshold
+ * distance, max_th for gentle RED) is checked before the division, and
+ * the packet-size sysctls are sanitized before they enter the math.
+ *
+ * Returns 0 on success. On failure RED is disabled on the flowset
+ * (DN_IS_RED and DN_IS_GENTLE_RED are cleared), any previous lookup
+ * table has been released, and an errno is returned: EINVAL for an
+ * unusable parameter, ENOSPC if the lookup table cannot be allocated.
+ */
+static int
+config_red(struct dn_fsk *fs)
+{
+       int64_t s, idle, weight, w0;
+       int t, i;
+       int err;
+
+       ND("called");
+       /* If the lookup table already exists, free it; it is rebuilt below. */
+       if (fs->w_q_lookup) {
+               free(fs->w_q_lookup, M_DUMMYNET);
+               fs->w_q_lookup = NULL;
+       }
+
+       /* Reject the values that would otherwise be used as zero divisors. */
+       if (dn_cfg.red_lookup_depth == 0) {
+               printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth "
+                   "must be > 0\n");
+               err = EINVAL;
+               goto fail;
+       }
+       if (fs->fs.w_q == 0) {
+               printf("dummynet: red w_q must be non-zero\n");
+               err = EINVAL;
+               goto fail;
+       }
+       if ((fs->fs.flags & DN_IS_GENTLE_RED) && fs->fs.max_th == 0) {
+               printf("dummynet: gentle red max_th must be non-zero\n");
+               err = EINVAL;
+               goto fail;
+       }
+
+       /* Fix up the packet-size tunables before they are used below. */
+       if (dn_cfg.red_avg_pkt_size < 1)
+               dn_cfg.red_avg_pkt_size = 512;
+       if (dn_cfg.red_max_pkt_size < 1)
+               dn_cfg.red_max_pkt_size = 1500;
+
+       fs->w_q = fs->fs.w_q;
+       fs->max_p = fs->fs.max_p;
+       /* Doing stuff that was in userland */
+       i = fs->sched->link.bandwidth;
+       s = (i <= 0) ? 0 :
+               hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i;
+
+       idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */
+       fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth);
+       /* fs->lookup_step not scaled, */
+       if (!fs->lookup_step)
+               fs->lookup_step = 1;
+       w0 = weight = SCALE(1) - fs->w_q; /* fs->w_q scaled */
+
+       /* lookup_weight = (1 - w_q)^lookup_step, in fixed point */
+       for (t = fs->lookup_step; t > 1; --t)
+               weight = SCALE_MUL(weight, w0);
+       fs->lookup_weight = (int)(weight); /* scaled */
+
+       /* Now doing stuff that was in kernel land */
+       fs->min_th = SCALE(fs->fs.min_th);
+       fs->max_th = SCALE(fs->fs.max_th);
+
+       /* Equal thresholds leave no ramp to divide by: use max_p as slope. */
+       if (fs->fs.max_th == fs->fs.min_th)
+               fs->c_1 = fs->max_p;
+       else
+               fs->c_1 = fs->max_p / (fs->fs.max_th - fs->fs.min_th);
+       fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th));
+
+       if (fs->fs.flags & DN_IS_GENTLE_RED) {
+               /* max_th != 0 was verified above */
+               fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th;
+               fs->c_4 = SCALE(1) - 2 * fs->max_p;
+       }
+
+       fs->lookup_depth = dn_cfg.red_lookup_depth;
+       fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int),
+           M_DUMMYNET, M_NOWAIT);
+       if (fs->w_q_lookup == NULL) {
+               printf("dummynet: sorry, cannot allocate red lookup table\n");
+               err = ENOSPC;
+               goto fail;
+       }
+
+       /* Fill the lookup table with (1 - w_q)^x */
+       fs->w_q_lookup[0] = SCALE(1) - fs->w_q;
+
+       for (i = 1; i < fs->lookup_depth; i++)
+               fs->w_q_lookup[i] =
+                   SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight);
+
+       fs->avg_pkt_size = dn_cfg.red_avg_pkt_size;
+       fs->max_pkt_size = dn_cfg.red_max_pkt_size;
+       ND("exit");
+       return 0;
+
+fail:
+       /* RED cannot run with this configuration: switch it off. */
+       fs->fs.flags &= ~DN_IS_RED;
+       fs->fs.flags &= ~DN_IS_GENTLE_RED;
+       return (err);
+}
+
+/*
+ * Walk the flowsets attached to scheduler s and recompute the RED
+ * state of each one that has DN_IS_RED set. The result of config_red
+ * is not checked.
+ */
+static void
+update_red(struct dn_schk *s)
+{
+       struct dn_fsk *cur;
+
+       SLIST_FOREACH(cur, &s->fsk_list, sch_chain) {
+               if (!(cur->fs.flags & DN_IS_RED))
+                       continue;
+               config_red(cur);
+       }
+}
+
+/*
+ * Attach flowset fs to scheduler s: move it from the unlinked list
+ * (dn_cfg.fsu) to the scheduler's list, notify the scheduler type,
+ * compute the effective flow mask and the resulting queue-table type,
+ * and refresh the RED state if the flowset uses RED.
+ */
+static void
+fsk_attach(struct dn_fsk *fs, struct dn_schk *s)
+{
+       ND("remove fs %d from fsunlinked, link to sched %d",
+               fs->fs.fs_nr, s->sch.sched_nr);
+       SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain);
+       fs->sched = s;
+       SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain);
+       if (s->fp->new_fsk != NULL)
+               s->fp->new_fsk(fs);
+
+       /* XXX compute fsk_mask: flowset mask, merged with the sched mask */
+       fs->fsk_mask = fs->fs.flow_mask;
+       if (fs->sched->sch.flags & DN_HAVE_MASK)
+               flow_id_or(&fs->sched->sch.sched_mask, &fs->fsk_mask);
+
+       if (fs->qht != NULL) {
+               /*
+                * Queues should be drained according to the old type
+                * and reinserted according to the new one, which in
+                * general means reclassifying every packet. That is
+                * not implemented: the reference is simply dropped,
+                * in the hope that qht is never set at this point.
+                */
+               D("XXX TODO requeue from fs %d to sch %d",
+                       fs->fs.fs_nr, s->sch.sched_nr);
+               fs->qht = NULL;
+       }
+
+       /* set the new type for qht: hashed only with a non-empty mask */
+       fs->fs.flags &= ~DN_QHT_HASH;
+       if (nonzero_mask(&fs->fsk_mask))
+               fs->fs.flags |= DN_QHT_HASH;
+
+       /* XXX config_red() can fail; its result is ignored here */
+       if (fs->fs.flags & DN_IS_RED)
+               config_red(fs);
+}
+
+/*
+ * Scan the list of unlinked flowsets (dn_cfg.fsu) and attach to s
+ * every one whose configured scheduler number matches s. The _SAFE
+ * iterator is used because fsk_attach removes the entry from the list.
+ */
+static void
+update_fs(struct dn_schk *s)
+{
+       struct dn_fsk *cur, *nxt;
+
+       SLIST_FOREACH_SAFE(cur, &dn_cfg.fsu, sch_chain, nxt) {
+               if (s->sch.sched_nr == cur->fs.sched_nr) {
+                       fsk_attach(cur, s);
+               } else {
+                       D("fs %d for sch %d not %d still unlinked",
+                               cur->fs.fs_nr, cur->fs.sched_nr,
+                               s->sch.sched_nr);
+               }
+       }
+}
+
+/*
+ * Configuration -- to preserve backward compatibility we use
+ * the following scheme (N is 65536)
+ *     NUMBER          SCHED   LINK    FLOWSET
+ *        1 ..  N-1    (1)WFQ  (2)WFQ  (3)queue
+ *      N+1 .. 2N-1    (4)FIFO (5)FIFO (6)FIFO for sched 1..N-1
+ *     2N+1 .. 3N-1    --      --      (7)FIFO for sched N+1..2N-1
+ *
+ * "pipe i config" configures #1, #2 and #3
+ * "sched i config" configures #1 and possibly #6
+ * "queue i config" configures #3
+ * #1 is configured with 'pipe i config' or 'sched i config'
+ * #2 is configured with 'pipe i config', and created if not
+ *     existing with 'sched i config'
+ * #3 is configured with 'queue i config'
+ * #4 is automatically configured after #1, can only be FIFO
+ * #5 is automatically configured after #2
+ * #6 is automatically created when #1 is !MULTIQUEUE,
+ *     and can be updated.
+ * #7 is automatically configured after #2
+ */
+
+/*
+ * Configure a link and its FIFO twin (same number plus DN_MAX_ID).
+ *
+ * p is validated (exact length, number in 1..DN_MAX_ID-1) and then
+ * converted in place from the units used by the config program:
+ *   delay: ms -> ticks; burst: bytes -> bits * hz.
+ * Both schedulers must already exist, otherwise EINVAL is returned.
+ * Any delay profile attached to them is discarded. The arg parameter
+ * is not used. Returns 0 on success.
+ */
+static int
+config_link(struct dn_link *p, struct dn_id *arg)
+{
+       struct dn_schk *sch;
+       int nr;
+
+       if (p->oid.len != sizeof(*p)) {
+               D("invalid pipe len %d", p->oid.len);
+               return EINVAL;
+       }
+       nr = p->link_nr;
+       if (nr <= 0 || nr >= DN_MAX_ID)
+               return EINVAL;
+
+       p->delay = (p->delay * hz) / 1000;
+       /* Scale burst size: bytes -> bits * hz */
+       p->burst *= 8 * hz;
+
+       DN_BH_WLOCK();
+       /* two passes: the base link first, then the FIFO link */
+       while (nr < 2*DN_MAX_ID) {
+               sch = locate_scheduler(nr);
+               if (sch == NULL) {
+                       DN_BH_WUNLOCK();
+                       D("sched %d not found", nr);
+                       return EINVAL;
+               }
+               /* remove profile if exists */
+               if (sch->profile != NULL) {
+                       free(sch->profile, M_DUMMYNET);
+                       sch->profile = NULL;
+               }
+               /* copy all parameters */
+               sch->link.oid = p->oid;
+               sch->link.link_nr = nr;
+               sch->link.delay = p->delay;
+               if (sch->link.bandwidth != p->bandwidth) {
+                       /* bandwidth changes, RED params depend on it */
+                       sch->link.bandwidth = p->bandwidth;
+                       update_red(sch);
+               }
+               sch->link.burst = p->burst;
+               schk_reset_credit(sch);
+               nr += DN_MAX_ID;
+       }
+       dn_cfg.id++;
+       DN_BH_WUNLOCK();
+       return 0;
+}
+
+/*
+ * Configure a flowset. Can be called from inside with locked=1,
+ * in which case DN_BH_WLOCK is neither taken nor released here.
+ *
+ * nfs is the requested configuration; it is adjusted in place (queue
+ * size and bucket count are bounded, a zero sched_nr and par[] entries
+ * equal to -1 are replaced with the values of the existing flowset).
+ * The arg parameter is not used.
+ *
+ * Returns the kernel flowset, or NULL if nfs has the wrong length, the
+ * number is outside 1 .. 3*DN_MAX_ID-1, or the flowset lookup fails.
+ */
+static struct dn_fsk *
+config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked)
+{
+       int i;
+       struct dn_fsk *fs;
+
+       if (nfs->oid.len != sizeof(*nfs)) {
+               D("invalid flowset len %d", nfs->oid.len);
+               return NULL;
+       }
+       i = nfs->fs_nr;
+       if (i <= 0 || i >= 3*DN_MAX_ID)
+               return NULL;
+       ND("flowset %d", i);
+       /* XXX other sanity checks */
+        if (nfs->flags & DN_QSIZE_BYTES) {
+               ipdn_bound_var(&nfs->qsize, 16384,
+                   1500, dn_cfg.byte_limit, NULL); // "queue byte size");
+        } else {
+               ipdn_bound_var(&nfs->qsize, 50,
+                   1, dn_cfg.slot_limit, NULL); // "queue slot size");
+        }
+       if (nfs->flags & DN_HAVE_MASK) {
+               /* make sure we have some buckets */
+               ipdn_bound_var((int *)&nfs->buckets, dn_cfg.hash_size,
+                       1, dn_cfg.max_hash_size, "flowset buckets");
+       } else {
+               nfs->buckets = 1;       /* we only need 1 */
+       }
+       if (!locked)
+               DN_BH_WLOCK();
+       /* the flowset hash table is created on first use */
+       if (dn_cfg.fshash == NULL)
+               dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size,
+                                       offsetof(struct dn_fsk, fsk_next),
+                                       fsk_hash, fsk_match, fsk_new);
+       do { /* exit with break when done */
+           struct dn_schk *s;
+           /* only create a missing flowset if a scheduler was named */
+           int flags = nfs->sched_nr ? DNHT_INSERT : 0;
+           int j;
+           /* used below to tell whether the lookup created a new entry */
+           int oldc = dn_cfg.fsk_count;
+           fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL);
+           if (fs == NULL) {
+               D("missing sched for flowset %d", i);
+               break;
+           }
+           /* grab some defaults from the existing one */
+           if (nfs->sched_nr == 0) /* reuse */
+               nfs->sched_nr = fs->fs.sched_nr;
+           for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) {
+               if (nfs->par[j] == -1) /* reuse */
+                   nfs->par[j] = fs->fs.par[j];
+           }
+           if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) {
+               ND("flowset %d unchanged", i);
+               break; /* no change, nothing to do */
+           }
+           if (oldc != dn_cfg.fsk_count)       /* new item */
+               dn_cfg.id++;
+           s = locate_scheduler(nfs->sched_nr);
+           /* detach from old scheduler if needed, preserving
+            * queues if we need to reattach. Then update the
+            * configuration, and possibly attach to the new sched.
+            */
+           DX(2, "fs %d changed sched %d@%p to %d@%p",
+               fs->fs.fs_nr,
+               fs->fs.sched_nr, fs->sched, nfs->sched_nr, s);
+           if (fs->sched) {
+               /* note: this inner 'flags' shadows the lookup flags above */
+               int flags = s ? DN_DETACH : (DN_DETACH | DN_DESTROY);
+               /* for now DN_DESTROY is always set, whatever s is */
+               flags |= DN_DESTROY; /* XXX temporary */
+               fsk_detach(fs, flags);
+           }
+           fs->fs = *nfs; /* copy configuration */
+           /* with no matching scheduler the flowset is left unattached */
+           if (s != NULL)
+               fsk_attach(fs, s);
+       } while (0);
+       if (!locked)
+               DN_BH_WUNLOCK();
+       return fs;
+}
+
+/*
+ * config/reconfig a scheduler and its FIFO variant.
+ * For !MULTIQUEUE schedulers, also set up the flowset.
+ *
+ * The body runs once for scheduler number i and once more for its FIFO
+ * twin (i + DN_MAX_ID), using the 'again' and 'next' labels.
+ *
+ * On reconfigurations (detected because s->fp is set and the type or
+ * the parameters differ), the link and the profile of the old
+ * scheduler are saved, the old scheduler is removed from the hash and
+ * deleted (for now together with its queues), and a new one is
+ * created on which link and profile are restored.
+ *
+ * _nsch is the requested configuration and is modified in place.
+ * arg is stored in s->cfg.
+ *
+ * Returns 0 on success, EINVAL for a bad length, number or scheduler
+ * type, ENOMEM when the scheduler, a profile copy or the internal
+ * flowset cannot be created.
+ */
+static int
+config_sched(struct dn_sch *_nsch, struct dn_id *arg)
+{
+       struct dn_schk *s;
+       struct schk_new_arg a; /* argument for schk_new */
+       int i;
+       struct dn_link p;       /* copy of oldlink */
+       struct dn_profile *pf = NULL;   /* copy of old link profile */
+       /* Used to preserve mask parameter */
+       struct ipfw_flow_id new_mask;
+       int new_buckets = 0;
+       int new_flags = 0;
+       int pipe_cmd;
+       int err = ENOMEM;
+
+       a.sch = _nsch;
+       if (a.sch->oid.len != sizeof(*a.sch)) {
+               D("bad sched len %d", a.sch->oid.len);
+               return EINVAL;
+       }
+       i = a.sch->sched_nr;
+       if (i <= 0 || i >= DN_MAX_ID)
+               return EINVAL;
+       /* make sure we have some buckets */
+       if (a.sch->flags & DN_HAVE_MASK)
+               ipdn_bound_var((int *)&a.sch->buckets, dn_cfg.hash_size,
+                       1, dn_cfg.max_hash_size, "sched buckets");
+       /* XXX other sanity checks */
+       bzero(&p, sizeof(p));
+
+       pipe_cmd = a.sch->flags & DN_PIPE_CMD;
+       a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if is not set?
+       if (pipe_cmd) {
+               /* Copy mask parameter; new_mask is only read if pipe_cmd */
+               new_mask = a.sch->sched_mask;
+               new_buckets = a.sch->buckets;
+               new_flags = a.sch->flags;
+       }
+       DN_BH_WLOCK();
+       /* the scheduler hash table is created on first use */
+       if (dn_cfg.schedhash == NULL)
+               dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size,
+                                       offsetof(struct dn_schk, schk_next),
+                                       schk_hash, schk_match, schk_new);
+again: /* run twice, for wfq and fifo */
+       /*
+        * lookup the type. If not supplied, use the previous one
+        * or default to WF2Q+. Otherwise, return an error.
+        */
+       dn_cfg.id++;
+       a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name);
+       if (a.fp != NULL) {
+               /* found. Lookup or create entry */
+               s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a);
+       } else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) {
+               /* No type. search existing s* or retry with WF2Q+ */
+               s = dn_ht_find(dn_cfg.schedhash, i, 0, &a);
+               if (s != NULL) {
+                       a.fp = s->fp;
+                       /* Scheduler exists, skip to FIFO scheduler
+                        * if command was pipe config...
+                        */
+                       if (pipe_cmd)
+                               goto next;
+               } else {
+                       /* New scheduler, create a wf2q+ with no mask
+                        * if command was pipe config...
+                        */
+                       if (pipe_cmd) {
+                               /* clear mask parameter */
+                               bzero(&a.sch->sched_mask, sizeof(new_mask));
+                               a.sch->buckets = 0;
+                               a.sch->flags &= ~DN_HAVE_MASK;
+                       }
+                       a.sch->oid.subtype = DN_SCHED_WF2QP;
+                       goto again;
+               }
+       } else {
+               D("invalid scheduler type %d %s",
+                       a.sch->oid.subtype, a.sch->name);
+               err = EINVAL;
+               goto error;
+       }
+       /* normalize name and subtype */
+       a.sch->oid.subtype = a.fp->type;
+       bzero(a.sch->name, sizeof(a.sch->name));
+       strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name));
+       if (s == NULL) {
+               D("cannot allocate scheduler %d", i);
+               goto error;
+       }
+       /* restore existing link if any */
+       if (p.link_nr) {
+               s->link = p;
+               if (!pf || pf->link_nr != p.link_nr) { /* no saved value */
+                       s->profile = NULL; /* XXX maybe not needed */
+               } else {
+                       /*
+                        * pf holds exactly sizeof(*pf) bytes saved from
+                        * the old scheduler, so allocate and copy that
+                        * much. s->profile must not be read here: it is
+                        * about to be overwritten and is presumably
+                        * still unset on a freshly created scheduler.
+                        */
+                       s->profile = malloc(sizeof(*pf),
+                                            M_DUMMYNET, M_NOWAIT | M_ZERO);
+                       if (s->profile == NULL) {
+                               D("cannot allocate profile");
+                               goto error; //XXX
+                       }
+                       bcopy(pf, s->profile, sizeof(*pf));
+               }
+       }
+       p.link_nr = 0;
+       if (s->fp == NULL) {
+               DX(2, "sched %d new type %s", i, a.fp->name);
+       } else if (s->fp != a.fp ||
+                       bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) {
+               /* already existing. */
+               DX(2, "sched %d type changed from %s to %s",
+                       i, s->fp->name, a.fp->name);
+               DX(4, "   type/sub %d/%d -> %d/%d",
+                       s->sch.oid.type, s->sch.oid.subtype,
+                       a.sch->oid.type, a.sch->oid.subtype);
+               if (s->link.link_nr == 0)
+                       D("XXX WARNING link 0 for sched %d", i);
+               p = s->link;    /* preserve link */
+               if (s->profile) {/* preserve profile */
+                       if (!pf)
+                               pf = malloc(sizeof(*pf),
+                                   M_DUMMYNET, M_NOWAIT | M_ZERO);
+                       if (pf) /* XXX should issue a warning otherwise */
+                               bcopy(s->profile, pf, sizeof(*pf));
+               }
+               /* remove from the hash */
+               dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL);
+               /* Detach flowsets, preserve queues. */
+               // schk_delete_cb(s, NULL);
+               // XXX temporarily, kill queues
+               schk_delete_cb(s, (void *)DN_DESTROY);
+               /* recreate the scheduler; link/profile are restored above */
+               goto again;
+       } else {
+               DX(4, "sched %d unchanged type %s", i, a.fp->name);
+       }
+       /* complete initialization */
+       s->sch = *a.sch;
+       s->fp = a.fp;
+       s->cfg = arg;
+       // XXX schk_reset_credit(s);
+       /* create the internal flowset if needed,
+        * trying to reuse existing ones if available
+        */
+       if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) {
+               s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL);
+               if (!s->fs) {
+                       struct dn_fs fs;
+                       bzero(&fs, sizeof(fs));
+                       set_oid(&fs.oid, DN_FS, sizeof(fs));
+                       fs.fs_nr = i + DN_MAX_ID;
+                       fs.sched_nr = i;
+                       s->fs = config_fs(&fs, NULL, 1 /* locked */);
+               }
+               if (!s->fs) {
+                       schk_delete_cb(s, (void *)DN_DESTROY);
+                       D("error creating internal fs for %d", i);
+                       goto error;
+               }
+       }
+       /* call init function after the flowset is created */
+       if (s->fp->config)
+               s->fp->config(s);
+       update_fs(s);
+next:
+       if (i < DN_MAX_ID) { /* now configure the FIFO instance */
+               i += DN_MAX_ID;
+               if (pipe_cmd) {
+                       /* Restore mask parameter for FIFO */
+                       a.sch->sched_mask = new_mask;
+                       a.sch->buckets = new_buckets;
+                       a.sch->flags = new_flags;
+               } else {
+                       /* sched config shouldn't modify the FIFO scheduler */
+                       if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) {
+                               /* FIFO already exist, don't touch it */
+                               err = 0; /* and this is not an error */
+                               goto error;
+                       }
+               }
+               a.sch->sched_nr = i;
+               a.sch->oid.subtype = DN_SCHED_FIFO;
+               bzero(a.sch->name, sizeof(a.sch->name));
+               goto again;
+       }
+       err = 0;
+error:
+       /* common exit: also reached on success, with err == 0 */
+       DN_BH_WUNLOCK();
+       if (pf)
+               free(pf, M_DUMMYNET);
+       return err;
+}
+
+/*
+ * Attach the delay profile pf to link pf->link_nr and to its FIFO twin
+ * (number + DN_MAX_ID). A profile with samples_no == 0 removes the
+ * existing one. The arg parameter is not used.
+ *
+ * Returns 0 on success, EINVAL if pf is too short, the link number is
+ * out of range or a scheduler is missing, ENOMEM if the copy cannot
+ * be allocated.
+ */
+static int
+config_profile(struct dn_profile *pf, struct dn_id *arg)
+{
+       struct dn_schk *sch;
+       int nr, keep_len, err = 0;
+
+       if (pf->oid.len < sizeof(*pf)) {
+               D("short profile len %d", pf->oid.len);
+               return EINVAL;
+       }
+       nr = pf->link_nr;
+       if (nr <= 0 || nr >= DN_MAX_ID)
+               return EINVAL;
+       /* XXX other sanity checks */
+       DN_BH_WLOCK();
+       for (; nr < 2*DN_MAX_ID; nr += DN_MAX_ID) {
+               sch = locate_scheduler(nr);
+               if (sch == NULL) {
+                       err = EINVAL;
+                       break;
+               }
+               dn_cfg.id++;
+               /*
+                * Release the current profile when it is being deleted
+                * or when its buffer is too small for the new one.
+                */
+               if (sch->profile != NULL) {
+                       if (pf->samples_no == 0 ||
+                           sch->profile->oid.len < pf->oid.len) {
+                               free(sch->profile, M_DUMMYNET);
+                               sch->profile = NULL;
+                       }
+               }
+               if (pf->samples_no == 0)
+                       continue;       /* deletion only */
+               /* no reusable buffer: get a new one of the needed size */
+               if (sch->profile == NULL)
+                       sch->profile = malloc(pf->oid.len,
+                                   M_DUMMYNET, M_NOWAIT | M_ZERO);
+               if (sch->profile == NULL) {
+                       D("no memory for profile %d", nr);
+                       err = ENOMEM;
+                       break;
+               }
+               /*
+                * The copy overwrites oid.len, so pick the larger of
+                * the two lengths first and store it back afterwards.
+                * XXX double check
+                */
+               keep_len = (sch->profile->oid.len < pf->oid.len) ?
+                   pf->oid.len : sch->profile->oid.len;
+               bcopy(pf, sch->profile, pf->oid.len);
+               sch->profile->oid.len = keep_len;
+       }
+       DN_BH_WUNLOCK();
+       return err;
+}
+
+/*
+ * Delete all objects: schedulers (with their links, queues and
+ * flowsets), the flowsets that are not attached to any scheduler,
+ * and finally reset the event heap.
+ * No locking is done here; do_config calls this with DN_BH_WLOCK held.
+ */
+static void
+dummynet_flush(void)
+{
+
+       /* delete all schedulers and related links/queues/flowsets */
+       dn_ht_scan(dn_cfg.schedhash, schk_delete_cb,
+               (void *)(uintptr_t)DN_DELETE_FS);
+       /* delete all remaining (unlinked) flowsets */
+       DX(4, "still %d unlinked fs", dn_cfg.fsk_count);
+       dn_ht_free(dn_cfg.fshash, DNHT_REMOVE);
+       fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS);
+
+       /* the scheduler table is released only after the scan above */
+       dn_ht_free(dn_cfg.schedhash, DNHT_REMOVE);
+       /* Reinitialize system heap... */
+       heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id));
+}
+
+/*
+ * Main handler for configuration. We are guaranteed to be called
+ * with an oid which is at least a dn_id.
+ * - the first object is the command (config, delete, flush, ...)
+ * - config_link must be issued after the corresponding config_sched
+ * - parameters (DN_TEXT) for an object must precede the object;
+ *   they are passed as 'arg' to the handler of the next object only.
+ *
+ * p points to a sequence of objects, each starting with a struct dn_id
+ * whose len field gives the object size; l is the total length in
+ * bytes. The id of the first object must be DN_API_VERSION.
+ * Processing stops at the first object whose handler reports an error.
+ * Returns 0 or that error (EINVAL for a malformed sequence).
+ */
+int
+do_config(void *p, int l)
+{
+       struct dn_id *next, *o;
+       int err = 0, err2 = 0;
+       struct dn_id *arg = NULL;
+       uintptr_t *a;
+
+       o = p;
+       if (o->id != DN_API_VERSION) {
+               D("invalid api version got %d need %d",
+                       o->id, DN_API_VERSION);
+               return EINVAL;
+       }
+       for (; l >= sizeof(*o); o = next) {
+               /* non-NULL if a DN_TEXT block was pending before this object */
+               struct dn_id *prev = arg;
+               /* each object must hold a header and fit in what is left */
+               if (o->len < sizeof(*o) || l < o->len) {
+                       D("bad len o->len %d len %d", o->len, l);
+                       err = EINVAL;
+                       break;
+               }
+               l -= o->len;
+               next = (struct dn_id *)((char *)o + o->len);
+               err = 0;
+               switch (o->type) {
+               default:
+                       /* unknown objects are logged and skipped, not an error */
+                       D("cmd %d not implemented", o->type);
+                       break;
+
+#ifdef EMULATE_SYSCTL
+               /* sysctl emulation.
+                * if we recognize the command, jump to the correct
+                * handler and return
+                */
+               case DN_SYSCTL_SET:
+                       err = kesysctl_emu_set(p, l);
+                       return err;
+#endif
+
+               case DN_CMD_CONFIG: /* simply a header */
+                       break;
+
+               case DN_CMD_DELETE:
+                       /* the argument is in the first uintptr_t after o */
+                       a = (uintptr_t *)(o+1);
+                       if (o->len < sizeof(*o) + sizeof(*a)) {
+                               err = EINVAL;
+                               break;
+                       }
+                       switch (o->subtype) {
+                       case DN_LINK:
+                               /* delete base and derived schedulers */
+                               DN_BH_WLOCK();
+                               err = delete_schk(*a);
+                               err2 = delete_schk(*a + DN_MAX_ID);
+                               DN_BH_WUNLOCK();
+                               /* report the first of the two failures */
+                               if (!err)
+                                       err = err2;
+                               break;
+
+                       default:
+                               D("invalid delete type %d",
+                                       o->subtype);
+                               err = EINVAL;
+                               break;
+
+                       case DN_FS:
+                               /* only numbers 1 .. DN_MAX_ID-1 can be deleted */
+                               err = (*a <1 || *a >= DN_MAX_ID) ?
+                                       EINVAL : delete_fs(*a, 0) ;
+                               break;
+                       }
+                       break;
+
+               case DN_CMD_FLUSH:
+                       DN_BH_WLOCK();
+                       dummynet_flush();
+                       DN_BH_WUNLOCK();
+                       break;
+               case DN_TEXT:   /* store argument the next block */
+                       /* clearing prev keeps arg alive for the next object */
+                       prev = NULL;
+                       arg = o;
+                       break;
+               case DN_LINK:
+                       err = config_link((struct dn_link *)o, arg);
+                       break;
+               case DN_PROFILE:
+                       err = config_profile((struct dn_profile *)o, arg);
+                       break;
+               case DN_SCH:
+                       err = config_sched((struct dn_sch *)o, arg);
+                       break;
+               case DN_FS:
+                       /* config_fs returns a pointer: NULL maps to err = 1 */
+                       err = (NULL==config_fs((struct dn_fs *)o, arg, 0));
+                       break;
+               }
+               /* an argument is consumed by the object that follows it */
+               if (prev)
+                       arg = NULL;
+               if (err != 0)
+                       break;
+       }
+       return err;
+}
+
+/*
+ * Estimate the buffer size needed to answer the 'show' request cmd and
+ * store in a->flags the set of object classes (DN_C_*) to export.
+ * Returns the size in bytes, or -1 for an unknown cmd->subtype.
+ *
+ * The estimate is built from the global counters:
+ *   schk_count   schedulers; halved wherever it is used, since each
+ *                user-visible scheduler also has a FIFO twin
+ *   si_count     scheduler instances
+ *   fsk_count    flowsets
+ *   queue_count  queues
+ *
+ * - 'pipe show' and 'sched show' are sized identically:
+ *     (schk_count/2) * (dn_fs + dn_profile + dn_sch + dn_id + dn_link)
+ *     + fsk_count * uint32_t   (flowset list of each scheduler)
+ *     + si_count * dn_flow
+ *   'sched show' could also list the queues of a scheduler; this is
+ *   not done for now.
+ * - 'queue show':
+ *     fsk_count * dn_fs + queue_count * dn_flow
+ * - compatibility mode: whatever dn_compat_calc_size() reports.
+ */
+static int
+compute_space(struct dn_id *cmd, struct copy_args *a)
+{
+       int x = 0, need = 0;
+       int profile_size = sizeof(struct dn_profile);
+
+       switch (cmd->subtype) {
+       case DN_LINK:   /* pipe show */
+       case DN_SCH:    /* sched show */
+               x = DN_C_LINK | DN_C_SCH | DN_C_FLOW;
+               need += dn_cfg.schk_count *
+                       (sizeof(struct dn_fs) + profile_size) / 2;
+               need += dn_cfg.fsk_count * sizeof(uint32_t);
+               break;
+       case DN_FS:     /* queue show */
+               x = DN_C_FS | DN_C_QUEUE;
+               break;
+       case DN_GET_COMPAT:     /* compatibility mode */
+               need = dn_compat_calc_size();
+               break;
+       default:
+               return -1;
+       }
+       a->flags = x;
+
+       /* add the per-class contributions selected above */
+       if (x & DN_C_SCH) {
+               need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2;
+               /* header of the flowset list attached to each sched */
+               need += dn_cfg.schk_count * sizeof(struct dn_id) / 2;
+       }
+       if (x & DN_C_FS)
+               need += dn_cfg.fsk_count * sizeof(struct dn_fs);
+       if (x & DN_C_LINK)
+               need += dn_cfg.schk_count * sizeof(struct dn_link) / 2;
+       /*
+        * Queues and scheduler instances are both exported to userland
+        * as a struct dn_flow, the only visible part.
+        */
+       if (x & DN_C_QUEUE)
+               need += dn_cfg.queue_count * sizeof(struct dn_flow);
+       if (x & DN_C_FLOW)
+               need += dn_cfg.si_count * (sizeof(struct dn_flow));
+       return need;
+}
+
+/*
+ * If compat != NULL dummynet_get is called in compatibility mode.
+ * *compat will be the pointer to the buffer to pass to ipfw
+ */
+int
+dummynet_get(struct sockopt *sopt, void **compat)
+{
+       int have, i, need, error;
+       char *start = NULL, *buf;
+       size_t sopt_valsize;
+       struct dn_id *cmd;
+       struct copy_args a;
+       struct copy_range r;
+       int l = sizeof(struct dn_id);
+
+       bzero(&a, sizeof(a));
+       bzero(&r, sizeof(r));
+
+       /* save and restore original sopt_valsize around copyin */
+       sopt_valsize = sopt->sopt_valsize;
+
+       cmd = &r.o;
+
+       if (!compat) {
+               /* copy at least an oid, and possibly a full object */
+               error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd));
+               sopt->sopt_valsize = sopt_valsize;
+               if (error)
+                       goto done;
+               l = cmd->len;
+#ifdef EMULATE_SYSCTL
+               /* sysctl emulation. */
+               if (cmd->type == DN_SYSCTL_GET)
+                       return kesysctl_emu_get(sopt);
+#endif
+               if (l > sizeof(r)) {
+                       /* request larger than default, allocate buffer */
+                       cmd = malloc(l,  M_DUMMYNET, M_WAITOK);
+                       error = sooptcopyin(sopt, cmd, l, l);
+                       sopt->sopt_valsize = sopt_valsize;
+                       if (error)
+                               goto done;
+               }
+       } else { /* compatibility */
+               error = 0;
+               cmd->type = DN_CMD_GET;
+               cmd->len = sizeof(struct dn_id);
+               cmd->subtype = DN_GET_COMPAT;
+               // cmd->id = sopt_valsize;
+               D("compatibility mode");
+       }
+       a.extra = (struct copy_range *)cmd;
+       if (cmd->len == sizeof(*cmd)) { /* no range, create a default */
+               uint32_t *rp = (uint32_t *)(cmd + 1);
+               cmd->len += 2* sizeof(uint32_t);
+               rp[0] = 1;
+               rp[1] = DN_MAX_ID - 1;
+               if (cmd->subtype == DN_LINK) {
+                       rp[0] += DN_MAX_ID;
+                       rp[1] += DN_MAX_ID;
+               }
+       }
+       /* Count space (under lock) and allocate (outside lock).
+        * Exit with lock held if we manage to get enough buffer.
+        * Try a few times then give up.
+        */
+       for (have = 0, i = 0; i < 10; i++) {
+               DN_BH_WLOCK();
+               need = compute_space(cmd, &a);
+
+               /* if there is a range, ignore value from compute_space() */
+               if (l > sizeof(*cmd))
+                       need = sopt_valsize - sizeof(*cmd);
+
+               if (need < 0) {
+                       DN_BH_WUNLOCK();
+                       error = EINVAL;
+                       goto done;
+               }
+               need += sizeof(*cmd);
+               cmd->id = need;
+               if (have >= need)
+                       break;
+
+               DN_BH_WUNLOCK();
+               if (start)
+                       free(start, M_DUMMYNET);
+               start = NULL;
+               if (need > sopt_valsize)
+                       break;
+
+               have = need;
+               start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO);
+       }
+
+       if (start == NULL) {
+               if (compat) {
+                       *compat = NULL;
+                       error =  1; // XXX
+               } else {
+                       error = sooptcopyout(sopt, cmd, sizeof(*cmd));
+               }
+               goto done;
+       }
+       ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, "
+               "%d:%d si %d, %d:%d queues %d",
+               dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH,
+               dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK,
+               dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS,
+               dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I,
+               dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE);
+       sopt->sopt_valsize = sopt_valsize;
+       a.type = cmd->subtype;
+
+       if (compat == NULL) {
+               bcopy(cmd, start, sizeof(*cmd));
+               ((struct dn_id*)(start))->len = sizeof(struct dn_id);
+               buf = start + sizeof(*cmd);
+       } else
+               buf = start;
+       a.start = &buf;
+       a.end = start + have;
+       /* start copying other objects */
+       if (compat) {
+               a.type = DN_COMPAT_PIPE;
+               dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a);
+               a.type = DN_COMPAT_QUEUE;
+               dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a);
+       } else if (a.type == DN_FS) {
+               dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a);
+       } else {
+               dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a);
+       }
+       DN_BH_WUNLOCK();
+
+       if (compat) {
+               *compat = start;
+               sopt->sopt_valsize = buf - start;
+               /* free() is done by ip_dummynet_compat() */
+               start = NULL; //XXX hack
+       } else {
+               error = sooptcopyout(sopt, start, buf - start);
+       }
+done:
+       if (cmd && cmd != &r.o)
+               free(cmd, M_DUMMYNET);
+       if (start)
+               free(start, M_DUMMYNET);
+       return error;
+}
+
+/*
+ * Functions to drain idle objects -- see dummynet_task() for some notes
+ */
+/*
+ * Callback called on scheduler instance to delete it if idle.
+ *
+ * _si  - the scheduler instance (struct dn_sch_inst *).
+ * _arg - pointer to an int counting the objects examined so far; once
+ *        it exceeds dn_cfg.expire_object_examined the scan is stopped.
+ *
+ * Returns DNHT_SCAN_END to stop the scan, the value of si_destroy()
+ * when the instance is removed, 0 otherwise.
+ */
+static int
+drain_scheduler_cb(void *_si, void *_arg)
+{
+       struct dn_sch_inst *si = _si;
+       int *arg = _arg;
+       int empty;
+
+       /* increment the counter, not the pointer ('*arg++' does the latter) */
+       if ((*arg)++ > dn_cfg.expire_object_examined)
+               return DNHT_SCAN_END;
+
+       /* instance still in use, or packets pending in the delay line */
+       if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL)
+               return 0;
+
+       /*
+        * if the scheduler is multiqueue, q_count also reflects empty
+        * queues that point to si, so we need to check si->q_count to
+        * tell whether we can remove the instance.
+        */
+       if (si->ni.length == 0) {
+               /* si was marked as idle:
+                * remove it or increment idle_si_wait counter
+                */
+               empty = (si->sched->fp->flags & DN_MULTIQUEUE) ?
+                               (si->q_count == 0) : 1;
+               if (empty &&
+                       (si->idle_time < dn_cfg.curr_time - dn_cfg.object_idle_tick))
+                               return si_destroy(si, NULL);
+               else
+                       dn_cfg.idle_si_wait++;
+       }
+       return 0;
+}
+
+/*
+ * Callback called on scheduler to check if it has instances.
+ *
+ * _s   - the scheduler (struct dn_schk *).
+ * _arg - pointer to the int counter of examined objects, shared with
+ *        drain_scheduler_cb().
+ *
+ * With DN_HAVE_MASK the instances live in a hash table and one bucket
+ * of it is scanned; otherwise s->siht is passed directly as the only
+ * instance and is cleared if the callback reports it deleted.
+ * Returns DNHT_SCAN_END once the counter exceeds
+ * dn_cfg.expire_object_examined, 0 otherwise.
+ */
+static int
+drain_scheduler_sch_cb(void *_s, void *_arg)
+{
+       struct dn_schk *s = _s;
+       int *arg = _arg;
+
+       if (s->sch.flags & DN_HAVE_MASK) {
+               dn_ht_scan_bucket(s->siht, &s->drain_bucket,
+                               drain_scheduler_cb, _arg);
+       } else {
+               if (s->siht) {
+                       if (drain_scheduler_cb(s->siht, _arg) == DNHT_SCAN_DEL)
+                               s->siht = NULL;
+               }
+       }
+       /* increment the counter, not the pointer ('*arg++' does the latter) */
+       return ((*arg)++ > dn_cfg.expire_object_examined) ? DNHT_SCAN_END : 0;
+}
+
+/*
+ * Called every tick: scan one bucket of the scheduler hash table
+ * (position kept in dn_cfg.drain_sch) and try to delete idle objects.
+ */
+void
+dn_drain_scheduler(void)
+{
+       int examined;   /* counter handed to the scan callbacks */
+
+       examined = 0;
+       dn_ht_scan_bucket(dn_cfg.schedhash,
+           (int *)&dn_cfg.drain_sch,
+           drain_scheduler_sch_cb,
+           &examined);
+}
+
+/*
+ * Callback called on queue to delete if it is idle.
+ *
+ * _q   - the queue (struct dn_queue *).
+ * _arg - pointer to an int counting the objects examined so far; once
+ *        it exceeds dn_cfg.expire_object_examined the scan is stopped.
+ *
+ * Returns DNHT_SCAN_END to stop the scan, DNHT_SCAN_DEL if the queue
+ * was deleted, 0 otherwise.
+ */
+static int
+drain_queue_cb(void *_q, void *_arg)
+{
+       struct dn_queue *q = _q;
+       int *arg = _arg;
+
+       /* increment the counter, not the pointer ('*arg++' does the latter) */
+       if ((*arg)++ > dn_cfg.expire_object_examined)
+               return DNHT_SCAN_END;
+
+       if (q->ni.length == 0) {
+               /* empty queue: delete it if idle long enough, else count it */
+               if (q->q_time < dn_cfg.curr_time - dn_cfg.object_idle_tick) {
+                       if (dn_delete_queue(q, DN_DESTROY | DN_DEL_SAFE) == 0)
+                               return DNHT_SCAN_DEL; /* queue is deleted */
+               } else
+                       dn_cfg.idle_queue_wait++;
+       }
+
+       return 0; /* queue isn't deleted */
+}
+
+/*
+ * Callback called on flowset used to check if it has queues.
+ *
+ * _fs  - the flowset (struct dn_fsk *).
+ * _arg - pointer to the int counter of examined objects, shared with
+ *        drain_queue_cb().
+ *
+ * Returns DNHT_SCAN_END once the counter exceeds
+ * dn_cfg.expire_object_examined, 0 otherwise.
+ */
+static int
+drain_queue_fs_cb(void *_fs, void *_arg)
+{
+       struct dn_fsk *fs = _fs;
+       int *arg = _arg;
+
+       if (fs->fs.flags & DN_QHT_HASH) {
+               /* Flowset has a hash table for queues */
+               dn_ht_scan_bucket(fs->qht, &fs->drain_bucket,
+                               drain_queue_cb, _arg);
+       } else {
+               /* No hash table for this flowset, null the pointer
+                * if the queue is deleted
+                */
+               if (fs->qht) {
+                       if (drain_queue_cb(fs->qht, _arg) == DNHT_SCAN_DEL)
+                               fs->qht = NULL;
+               }
+       }
+       /* increment the counter, not the pointer ('*arg++' does the latter) */
+       return ((*arg)++ > dn_cfg.expire_object_examined) ? DNHT_SCAN_END : 0;
+}
+
+/*
+ * Called every tick: scan one bucket of the flowset hash table
+ * (position kept in dn_cfg.drain_fs) and try to delete idle queues.
+ */
+void
+dn_drain_queue(void)
+{
+       int examined;   /* counter handed to the scan callbacks */
+
+       examined = 0;
+       dn_ht_scan_bucket(dn_cfg.fshash,
+           (int *)&dn_cfg.drain_fs,
+           drain_queue_fs_cb,
+           &examined);
+}
+
+/*
+ * Handler for the various dummynet socket options.
+ *
+ * sopt - the socket option request. The caller must hold the
+ *        PRIV_NETINET_DUMMYNET privilege, and SOPT_SET requests are
+ *        refused when securelevel_ge(..., 3) reports an error.
+ *
+ * The old IP_DUMMYNET_* options are forwarded to ip_dummynet_compat().
+ * IP_DUMMYNET3 is served by dummynet_get() for SOPT_GET, otherwise the
+ * argument is copied in and passed to do_config().
+ *
+ * Returns 0 on success, an error code otherwise (EINVAL for unknown
+ * options and for arguments with an invalid length).
+ */
+static int
+ip_dn_ctl(struct sockopt *sopt)
+{
+       void *p = NULL;
+       int error, l;
+
+       error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET);
+       if (error)
+               return (error);
+
+       /* Disallow sets in really-really secure mode. */
+       if (sopt->sopt_dir == SOPT_SET) {
+               error =  securelevel_ge(sopt->sopt_td->td_ucred, 3);
+               if (error)
+                       return (error);
+       }
+
+       switch (sopt->sopt_name) {
+       default :
+               D("dummynet: unknown option %d", sopt->sopt_name);
+               error = EINVAL;
+               break;
+
+       case IP_DUMMYNET_FLUSH:
+       case IP_DUMMYNET_CONFIGURE:
+       case IP_DUMMYNET_DEL:   /* remove a pipe or queue */
+       case IP_DUMMYNET_GET:
+               D("dummynet: compat option %d", sopt->sopt_name);
+               error = ip_dummynet_compat(sopt);
+               break;
+
+       case IP_DUMMYNET3 :
+               if (sopt->sopt_dir == SOPT_GET) {
+                       error = dummynet_get(sopt, NULL);
+                       break;
+               }
+               l = sopt->sopt_valsize;
+               /*
+                * Validate the unsigned original: a huge value could
+                * wrap to a negative int and slip through a signed test.
+                */
+               if (sopt->sopt_valsize < sizeof(struct dn_id) ||
+                   sopt->sopt_valsize > 12000) {
+                       D("argument len %d invalid", l);
+                       /* do not report success for a rejected request */
+                       error = EINVAL;
+                       break;
+               }
+               p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ?
+               error = sooptcopyin(sopt, p, l, l);
+               if (error)
+                       break ;
+               error = do_config(p, l);
+               break;
+       }
+
+       if (p != NULL)
+               free(p, M_TEMP);
+
+       return error ;
+}
+
+
+/*
+ * One-time initialization of the dummynet state in dn_cfg.
+ * Does nothing if dn_cfg.init_done is already set, so it can be called
+ * more than once (load_dn_sched() calls it "just in case").
+ * Sets the default limits and RED parameters, initializes the event
+ * heap, the lists and the lock, then creates the "dummynet" taskqueue
+ * with one thread and arms the dn_timeout callout.
+ * Tunables that are already non-zero (hash_size, expire_object, ...)
+ * are left untouched.
+ */
+static void
+ip_dn_init(void)
+{
+       if (dn_cfg.init_done)
+               return;
+       printf("DUMMYNET %p with IPv6 initialized (100409)\n", curvnet);
+       dn_cfg.init_done = 1;
+       /* Set defaults here. MSVC does not accept initializers,
+        * and this is also useful for vimages
+        */
+       /* queue limits */
+       dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */
+       dn_cfg.byte_limit = 1024 * 1024;
+       dn_cfg.expire = 1;
+
+       /* RED parameters */
+       dn_cfg.red_lookup_depth = 256;  /* default lookup table depth */
+       dn_cfg.red_avg_pkt_size = 512;  /* default medium packet size */
+       dn_cfg.red_max_pkt_size = 1500; /* default max packet size */
+
+       /* hash tables */
+       dn_cfg.max_hash_size = 1024;    /* max in the hash tables */
+
+       if (dn_cfg.hash_size == 0) /* XXX or <= 0 ? */
+               dn_cfg.hash_size = 64;          /* default hash size */
+
+       /* hash tables for schedulers and flowsets are created
+        * when the first scheduler/flowset is inserted.
+        * This is done to allow to use the right hash_size value.
+        * When the last object is deleted, the table is destroyed,
+        * so a new hash_size value can be used.
+        * XXX rehash is not supported for now
+        */
+       dn_cfg.schedhash = NULL;
+       dn_cfg.fshash = NULL;
+       /* bucket index to drain object */
+       dn_cfg.drain_fs = 0;
+       dn_cfg.drain_sch = 0;
+
+       /* defaults for the idle-object drain logic, only if still unset */
+       if (dn_cfg.expire_object == 0)
+               dn_cfg.expire_object = 50;
+       if (dn_cfg.object_idle_tick == 0)
+               dn_cfg.object_idle_tick = 1000;
+       if (dn_cfg.expire_object_examined == 0)
+               dn_cfg.expire_object_examined = 10;
+       if (dn_cfg.drain_ratio == 0)
+               dn_cfg.drain_ratio = 1;
+
+       // XXX what if we don't have a tsc ?
+#ifdef HAVE_TSC
+       dn_cfg.cycle_task_new = dn_cfg.cycle_task_old = readTSC();
+#endif
+       heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id));
+       SLIST_INIT(&dn_cfg.fsu);
+       SLIST_INIT(&dn_cfg.schedlist);
+
+       DN_LOCK_INIT();
+
+       /* NOTE(review): the result of taskqueue_create_fast() is not checked */
+       TASK_INIT(&dn_task, 0, dummynet_task, curvnet);
+       dn_tq = taskqueue_create_fast("dummynet", M_NOWAIT,
+           taskqueue_thread_enqueue, &dn_tq);
+       taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet");
+
+       /* arm the periodic timer, its handler is dummynet() */
+       callout_init(&dn_timeout, CALLOUT_MPSAFE);
+       callout_reset_on(&dn_timeout, 1, dummynet, NULL, 0);
+
+       /* Initialize curr_time adjustment mechanics. */
+       getmicrouptime(&dn_cfg.prev_t);
+}
+
+#ifdef KLD_MODULE
+/*
+ * Tear down the dummynet state (only built for loadable modules).
+ * last - if non-zero, also clear the ip_dn_ctl_ptr and ip_dn_io_ptr
+ *        hooks.
+ * The timer is drained first, then the configuration is flushed under
+ * the lock, and finally the taskqueue, hash tables, event heap and
+ * lock are released.
+ */
+static void
+ip_dn_destroy(int last)
+{
+       /* stop the periodic timer before touching anything else */
+       callout_drain(&dn_timeout);
+
+       DN_BH_WLOCK();
+       if (last) {
+               ND("removing last instance\n");
+               ip_dn_ctl_ptr = NULL;
+               ip_dn_io_ptr = NULL;
+       }
+
+       dummynet_flush();
+       DN_BH_WUNLOCK();
+       /* wait for a pending dn_task before freeing the queue */
+       taskqueue_drain(dn_tq, &dn_task);
+       taskqueue_free(dn_tq);
+
+       dn_ht_free(dn_cfg.schedhash, 0);
+       dn_ht_free(dn_cfg.fshash, 0);
+       heap_free(&dn_cfg.evheap);
+
+       DN_LOCK_DESTROY();
+}
+#endif /* KLD_MODULE */
+
+static int
+dummynet_modevent(module_t mod, int type, void *data)
+{
+
+       if (type == MOD_LOAD) {
+               if (ip_dn_io_ptr) {
+                       printf("DUMMYNET already loaded\n");
+                       return EEXIST ;
+               }
+               ip_dn_init();
+               ip_dn_ctl_ptr = ip_dn_ctl;
+               ip_dn_io_ptr = dummynet_io;
+               return 0;
+       } else if (type == MOD_UNLOAD) {
+#if !defined(KLD_MODULE)
+               printf("dummynet statically compiled, cannot unload\n");
+               return EINVAL ;
+#else
+               ip_dn_destroy(1 /* last */);
+               return 0;
+#endif
+       } else
+               return EOPNOTSUPP;
+}
+
+/* modevent helpers for the modules */
+
+/*
+ * Register the scheduler algorithm 'd' in dn_cfg.schedlist.
+ * Returns 0 on success, 1 if d is NULL, lacks enqueue/dequeue, or a
+ * scheduler with the same name is already registered.
+ */
+static int
+load_dn_sched(struct dn_alg *d)
+{
+       struct dn_alg *cur;
+       int dup;
+
+       if (d == NULL)
+               return 1; /* error */
+       ip_dn_init();   /* just in case, we need the lock */
+
+       /* enqueue and dequeue are mandatory */
+       if (d->enqueue == NULL || d->dequeue == NULL) {
+               D("missing enqueue or dequeue for %s", d->name);
+               return 1;
+       }
+
+       DN_BH_WLOCK();
+       /* cur is left NULL when no scheduler has the same name */
+       SLIST_FOREACH(cur, &dn_cfg.schedlist, next) {
+               if (strcmp(cur->name, d->name) != 0)
+                       continue;
+               D("%s already loaded", d->name);
+               break;
+       }
+       dup = (cur != NULL);
+       if (!dup)
+               SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next);
+       DN_BH_WUNLOCK();
+
+       D("dn_sched %s %sloaded", d->name, dup ? "not ":"");
+       return dup;
+}
+
+/*
+ * Unregister the scheduler algorithm named s->name.
+ * Returns 0 if it was removed, EBUSY if its ref_count is not zero,
+ * EINVAL if no scheduler with that name is registered.
+ */
+static int
+unload_dn_sched(struct dn_alg *s)
+{
+       struct dn_alg *cur, *nxt;
+       int err = EINVAL;
+
+       ND("called for %s", s->name);
+
+       DN_BH_WLOCK();
+       SLIST_FOREACH_SAFE(cur, &dn_cfg.schedlist, next, nxt) {
+               if (strcmp(s->name, cur->name) == 0) {
+                       ND("ref_count = %d", cur->ref_count);
+                       if (cur->ref_count != 0) {
+                               err = EBUSY;
+                       } else {
+                               err = 0;
+                               SLIST_REMOVE(&dn_cfg.schedlist, cur,
+                                   dn_alg, next);
+                       }
+                       break;
+               }
+       }
+       DN_BH_WUNLOCK();
+
+       D("dn_sched %s %sunloaded", s->name, err ? "not ":"");
+       return err;
+}
+
+/*
+ * Module event handler shared by the scheduler modules.
+ * arg is the struct dn_alg describing the scheduler; MOD_LOAD and
+ * MOD_UNLOAD register/unregister it, anything else returns EINVAL.
+ */
+int
+dn_sched_modevent(module_t mod, int cmd, void *arg)
+{
+       struct dn_alg *alg = arg;
+
+       switch (cmd) {
+       case MOD_LOAD:
+               return load_dn_sched(alg);
+       case MOD_UNLOAD:
+               return unload_dn_sched(alg);
+       default:
+               return EINVAL;
+       }
+}
+
+/* Module descriptor: name, event handler, no private data. */
+static moduledata_t dummynet_mod = {
+       "dummynet", dummynet_modevent, NULL
+};
+
+/* Subsystem and order used to register the module (ordered after ipfw). */
+#define        DN_SI_SUB       SI_SUB_PROTO_IFATTACHDOMAIN
+#define        DN_MODEV_ORD    (SI_ORDER_ANY - 128) /* after ipfw */
+DECLARE_MODULE(dummynet, dummynet_mod, DN_SI_SUB, DN_MODEV_ORD);
+/* dummynet depends on the ipfw module */
+MODULE_DEPEND(dummynet, ipfw, 2, 2, 2);
+MODULE_VERSION(dummynet, 3);
+
+/*
+ * Starting up. Done in order after dummynet_modevent() has been called.
+ * VNET_SYSINIT is also called for each existing vnet and each new vnet.
+ */
+//VNET_SYSINIT(vnet_dn_init, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_init, NULL);
+
+/*
+ * Shutdown handlers close up shop. These are done in REVERSE ORDER, but still
+ * after dummynet_modevent() has been called. Not called on reboot.
+ * VNET_SYSUNINIT is also called for each exiting vnet as it exits.
+ * or when the module is unloaded.
+ */
+//VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL);
+
+/* end of file */
diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
new file mode 100644 (file)
index 0000000..c95f896
--- /dev/null
@@ -0,0 +1,2491 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw2.c 200601 2009-12-16 10:48:40Z luigi $");
+
+/*
+ * The FreeBSD IP packet firewall, main file
+ */
+
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_inet.h"
+#ifndef INET
+#error "IPFIREWALL requires INET"
+#endif /* INET */
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/condvar.h>
+#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/jail.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/ucred.h>
+#include <net/ethernet.h> /* for ETHERTYPE_IP */
+#include <net/if.h>
+#include <net/route.h>
+#include <net/pf_mtag.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ip_carp.h>
+#include <netinet/pim.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+#include <netinet/sctp.h>
+
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#ifdef INET6
+#include <netinet6/in6_pcb.h>
+#include <netinet6/scope6_var.h>
+#include <netinet6/ip6_var.h>
+#endif
+
+#include <machine/in_cksum.h>  /* XXX for in_cksum */
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+/*
+ * static variables followed by global ones.
+ * All ipfw global variables are here.
+ */
+
+/* ipfw_vnet_ready controls when we are open for business */
+static VNET_DEFINE(int, ipfw_vnet_ready) = 0;
+#define        V_ipfw_vnet_ready       VNET(ipfw_vnet_ready)
+
+/* if set, deny packets with unknown IPv6 extension headers (sysctl) */
+static VNET_DEFINE(int, fw_deny_unknown_exthdrs);
+#define        V_fw_deny_unknown_exthdrs       VNET(fw_deny_unknown_exthdrs)
+
+/* if set, the default rule accepts all packets (compile option or tunable) */
+#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
+static int default_to_accept = 1;
+#else
+static int default_to_accept;
+#endif
+
+/* rule number auto-increment step (sysctl) */
+VNET_DEFINE(int, autoinc_step);
+
+/*
+ * Each rule belongs to one of 32 different sets (0..31).
+ * The variable set_disable contains one bit per set.
+ * If the bit is set, all rules in the corresponding set
+ * are disabled. Set RESVD_SET(31) is reserved for the default rule
+ * and rules that are not deleted by the flush command,
+ * and CANNOT be disabled.
+ * Rules in set RESVD_SET can only be deleted individually.
+ */
+VNET_DEFINE(u_int32_t, set_disable);
+#define        V_set_disable                   VNET(set_disable)
+
+/* if set, log matches to ipfw rules (sysctl) */
+VNET_DEFINE(int, fw_verbose);
+/* counter for ipfw_log(NULL...) */
+VNET_DEFINE(u_int64_t, norule_counter);
+/* upper limit of matches of ipfw rules logged (sysctl) */
+VNET_DEFINE(int, verbose_limit);
+
+/* layer3_chain contains the list of rules for layer 3 */
+VNET_DEFINE(struct ip_fw_chain, layer3_chain);
+
+/*
+ * Hooks for the NAT support; only ipfw_nat_ptr is explicitly
+ * initialized to NULL here.
+ * NOTE(review): presumably filled in by the ipfw_nat module -- confirm.
+ */
+ipfw_nat_t *ipfw_nat_ptr = NULL;
+struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
+ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
+ipfw_nat_cfg_t *ipfw_nat_del_ptr;
+ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
+ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
+
+/*
+ * sysctl variables under net.inet.ip.fw (and net.inet6.ip6.fw when
+ * INET6 is defined), only built when SYSCTL_NODE is available.
+ */
+#ifdef SYSCTL_NODE
+/* read-only copies of compile-time constants, exported below */
+uint32_t dummy_def = IPFW_DEFAULT_RULE;
+uint32_t dummy_tables_max = IPFW_TABLES_MAX;
+
+SYSBEGIN(f3)
+
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
+    CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
+    "Only do a single pass through ipfw when using dummynet(4)");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
+    CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
+    "Rule number auto-increment step");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose,
+    CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0,
+    "Log matches to ipfw rules");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
+    CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
+    "Set upper limit of matches of ipfw rules logged");
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
+    &dummy_def, 0,
+    "The default/max possible rule number.");
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD,
+    &dummy_tables_max, 0,
+    "The maximum number of tables.");
+/* default_to_accept is also registered as a tunable */
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
+    &default_to_accept, 0,
+    "Make the default rule accept all packets.");
+TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept);
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count,
+    CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0,
+    "Number of static rules");
+
+#ifdef INET6
+SYSCTL_DECL(_net_inet6_ip6);
+SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
+SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs,
+    CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0,
+    "Deny packets with unknown IPv6 Extension Headers");
+#endif /* INET6 */
+
+SYSEND
+
+#endif /* SYSCTL_NODE */
+
+
+/*
+ * Some macros used in the various matching options.
+ * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T
+ * Other macros just cast void * into the appropriate type
+ *
+ * L3HDR advances 'ip' by ip_hl units of 32 bits (the pointer is cast
+ * to u_int32_t * before the addition). Note that 'ip' is evaluated
+ * twice.
+ */
+#define        L3HDR(T, ip)    ((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
+#define        TCP(p)          ((struct tcphdr *)(p))
+#define        SCTP(p)         ((struct sctphdr *)(p))
+#define        UDP(p)          ((struct udphdr *)(p))
+#define        ICMP(p)         ((struct icmphdr *)(p))
+#define        ICMP6(p)        ((struct icmp6_hdr *)(p))
+
+/*
+ * Match the ICMP type of the packet against the bitmap in cmd->d[0],
+ * which has one bit per type (bit N set means "type N matches").
+ * Returns non-zero on match.
+ *
+ * The bitmap is 32 bits wide, so only types 0..31 can be represented:
+ * larger types never match. Without the explicit bound, '1 << type'
+ * would be undefined whenever ICMP_MAXTYPE allows a type above 31.
+ */
+static __inline int
+icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd)
+{
+       int type = icmp->icmp_type;
+
+       if (type > ICMP_MAXTYPE || type >= 32)
+               return 0;
+       return ((cmd->d[0] & (1U << type)) != 0);
+}
+
+/* bitmap of the ICMP types that are considered queries */
+#define TT     ( (1U << ICMP_ECHO) | (1U << ICMP_ROUTERSOLICIT) | \
+    (1U << ICMP_TSTAMP) | (1U << ICMP_IREQ) | (1U << ICMP_MASKREQ) )
+
+/*
+ * Return non-zero if the ICMP message is a query (echo, router
+ * solicitation, timestamp, information or mask request).
+ *
+ * The bitmap is 32 bits wide: types above 31 are never queries, and
+ * the explicit bound avoids an undefined shift when ICMP_MAXTYPE
+ * allows a type above 31.
+ */
+static int
+is_icmp_query(struct icmphdr *icmp)
+{
+       int type = icmp->icmp_type;
+
+       if (type > ICMP_MAXTYPE || type >= 32)
+               return 0;
+       return ((TT & (1U << type)) != 0);
+}
+#undef TT
+
+/*
+ * The following checks use two arrays of 8 or 16 bits to store the
+ * bits that we want set or clear, respectively. They are in the
+ * low and high half of cmd->arg1 or cmd->d[0].
+ *
+ * We scan options and store the bits we find set. We succeed if
+ *
+ *     (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
+ *
+ * The code is sometimes optimized not to store additional variables.
+ */
+
+/*
+ * Check the 8 flag bits found in a packet against an instruction.
+ * The low byte of cmd->arg1 holds the bits that must be set, the high
+ * byte the bits that must be clear.
+ * Returns 1 if both requirements are met, 0 otherwise.
+ */
+static int
+flags_match(ipfw_insn *cmd, u_int8_t bits)
+{
+       u_int8_t absent = ~bits;        /* bits NOT present in the packet */
+       u_char want_set = cmd->arg1 & 0xff;
+       u_char want_clear = (cmd->arg1 >> 8) & 0xff;
+
+       /* some bits we want set were clear */
+       if ((want_set & absent) != 0)
+               return 0;
+       /* fails if some bits we want clear were set */
+       return ((want_clear & absent) == want_clear);
+}
+
+/*
+ * Walk the IP options of the packet, record which of LSRR, SSRR, RR
+ * and TS are present and compare the result with the instruction by
+ * means of flags_match().
+ * Returns 0 if an option is invalid or truncated.
+ */
+static int
+ipopts_match(struct ip *ip, ipfw_insn *cmd)
+{
+       u_char *optp = (u_char *)(ip + 1);
+       int left = (ip->ip_hl << 2) - sizeof (struct ip);
+       int found = 0;
+       int optlen;
+
+       while (left > 0) {
+               int opt = optp[IPOPT_OPTVAL];
+
+               if (opt == IPOPT_EOL)
+                       break;
+               if (opt == IPOPT_NOP) {
+                       optlen = 1;
+               } else {
+                       optlen = optp[IPOPT_OLEN];
+                       if (optlen <= 0 || optlen > left)
+                               return 0; /* invalid or truncated */
+               }
+
+               if (opt == IPOPT_LSRR)
+                       found |= IP_FW_IPOPT_LSRR;
+               else if (opt == IPOPT_SSRR)
+                       found |= IP_FW_IPOPT_SSRR;
+               else if (opt == IPOPT_RR)
+                       found |= IP_FW_IPOPT_RR;
+               else if (opt == IPOPT_TS)
+                       found |= IP_FW_IPOPT_TS;
+
+               /* move to the next option */
+               left -= optlen;
+               optp += optlen;
+       }
+       return (flags_match(cmd, found));
+}
+
+/*
+ * Walk the TCP options of the segment, record which of MSS, window
+ * scale, SACK and timestamp are present and compare the result with
+ * the instruction by means of flags_match().
+ *
+ * Parsing stops at the first malformed option: one whose length byte
+ * would lie past the end of the header, whose length is below the
+ * 2-byte minimum (kind + length), or which extends past the end of
+ * the header. Such an option is not recorded.
+ */
+static int
+tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd)
+{
+       int optlen, bits = 0;
+       u_char *cp = (u_char *)(tcp + 1);
+       int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
+
+       for (; x > 0; x -= optlen, cp += optlen) {
+               int opt = cp[0];
+               if (opt == TCPOPT_EOL)
+                       break;
+               if (opt == TCPOPT_NOP)
+                       optlen = 1;
+               else {
+                       /* do not read the length byte past the header */
+                       if (x < 2)
+                               break;
+                       optlen = cp[1];
+                       /* invalid or truncated option */
+                       if (optlen < 2 || optlen > x)
+                               break;
+               }
+
+               switch (opt) {
+
+               default:
+                       break;
+
+               case TCPOPT_MAXSEG:
+                       bits |= IP_FW_TCPOPT_MSS;
+                       break;
+
+               case TCPOPT_WINDOW:
+                       bits |= IP_FW_TCPOPT_WINDOW;
+                       break;
+
+               case TCPOPT_SACK_PERMITTED:
+               case TCPOPT_SACK:
+                       bits |= IP_FW_TCPOPT_SACK;
+                       break;
+
+               case TCPOPT_TIMESTAMP:
+                       bits |= IP_FW_TCPOPT_TS;
+                       break;
+
+               }
+       }
+       return (flags_match(cmd, bits));
+}
+
+static int
+iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
+{
+       if (ifp == NULL)        /* no iface with this packet, match fails */
+               return 0;
+       /* Check by name or by IP address */
+       if (cmd->name[0] != '\0') { /* match by name */
+               /* Check name */
+               if (cmd->p.glob) {
+                       if (fnmatch(cmd->name, ifp->if_xname, 0) == 0)
+                               return(1);
+               } else {
+                       if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
+                               return(1);
+               }
+       } else {
+#ifdef __FreeBSD__     /* and OSX too ? */
+               struct ifaddr *ia;
+
+               if_addr_rlock(ifp);
+               TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) {
+                       if (ia->ifa_addr->sa_family != AF_INET)
+                               continue;
+                       if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
+                           (ia->ifa_addr))->sin_addr.s_addr) {
+                               if_addr_runlock(ifp);
+                               return(1);      /* match */
+                       }
+               }
+               if_addr_runlock(ifp);
+#endif /* __FreeBSD__ */
+       }
+       return(0);      /* no match, fail ... */
+}
+
+/*
+ * The verify_path function checks if a route to the src exists and
+ * if it is reachable via ifp (when provided).
+ * 
+ * The 'verrevpath' option checks that the interface that an IP packet
+ * arrives on is the same interface that traffic destined for the
+ * packet's source address would be routed out of.
+ * The 'versrcreach' option just checks that the source address is
+ * reachable via any route (except default) in the routing table.
+ * These two are a measure to block forged packets. This is also
+ * commonly known as "anti-spoofing" or Unicast Reverse Path
+ * Forwarding (Unicast RPF) in Cisco-ese. The name of the knobs
+ * is purposely reminiscent of the Cisco IOS command,
+ *
+ *   ip verify unicast reverse-path
+ *   ip verify unicast source reachable-via any
+ *
+ * which implements the same functionality. But note that the syntax
+ * is misleading, and the check may be performed on all IP packets
+ * whether unicast, multicast, or broadcast.
+ */
+/*
+ * Look up a route to 'src' in routing table 'fib' and return 1 if it
+ * is acceptable, 0 otherwise:
+ *  - with ifp != NULL the route must go through ifp;
+ *  - with ifp == NULL the route must be neither the default route nor
+ *    a reject/blackhole route.
+ * Always returns 0 when not built for FreeBSD.
+ */
+static int
+verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
+{
+#ifndef __FreeBSD__
+       return 0;
+#else
+       struct route ro;
+       struct sockaddr_in *sin;
+       int ok;
+
+       bzero(&ro, sizeof(ro));
+
+       sin = (struct sockaddr_in *)&(ro.ro_dst);
+       sin->sin_family = AF_INET;
+       sin->sin_len = sizeof(*sin);
+       sin->sin_addr = src;
+       in_rtalloc_ign(&ro, 0, fib);
+
+       /* no route at all */
+       if (ro.ro_rt == NULL)
+               return 0;
+
+       if (ifp != NULL) {
+               /*
+                * Compare with rt->rt_ifa->ifa_ifp rather than
+                * rt->rt_ifp, in order to pass packets injected back
+                * by if_simloop(): if useloopback == 1 a routing entry
+                * (via lo0) for our own address may exist, so we need
+                * to handle routing asymmetry.
+                */
+               ok = (ro.ro_rt->rt_ifa->ifa_ifp == ifp);
+       } else if (satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) {
+               /* the default route does not count */
+               ok = 0;
+       } else if (ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+               /* neither does a blackhole/reject route */
+               ok = 0;
+       } else {
+               /* found valid route */
+               ok = 1;
+       }
+
+       RTFREE(ro.ro_rt);
+       return ok;
+#endif /* __FreeBSD__ */
+}
+
+#ifdef INET6
+/*
+ * ipv6 specific rules here...
+ */
+/*
+ * Test whether ICMPv6 type 'type' is selected in the bitmap stored in
+ * cmd->d[]: word type/32, bit type%32.
+ *
+ * Returns non-zero on match. Types outside 0..ICMP6_MAXTYPE never
+ * match; the lower bound keeps a negative value from producing a
+ * negative array index or shift count.
+ */
+static __inline int
+icmp6type_match (int type, ipfw_insn_u32 *cmd)
+{
+       /* 1U: shifting a signed 1 into bit 31 would be undefined. */
+       return (type >= 0 && type <= ICMP6_MAXTYPE &&
+           (cmd->d[type/32] & (1U << (type%32))) );
+}
+
+/*
+ * Look for 'curr_flow' among the flow ids stored in cmd->d[].
+ * Returns 1 on the first equal entry, 0 if none matches.
+ *
+ * NOTE(review): the bound is inclusive, so entries d[0]..d[arg1] are
+ * examined (arg1 + 1 of them) -- confirm against the code filling the
+ * instruction that arg1 is the last index and not the entry count.
+ */
+static int
+flow6id_match( int curr_flow, ipfw_insn_u32 *cmd )
+{
+       int idx = 0;
+
+       while (idx <= cmd->o.arg1) {
+               if (curr_flow == cmd->d[idx])
+                       return 1;
+               ++idx;
+       }
+       return 0;
+}
+
+/*
+ * Support for IP6_*_ME opcodes.
+ *
+ * Walks every interface in V_ifnet and every address on it, looking
+ * for an AF_INET6 address equal to *ip6_addr. Each interface's address
+ * list is read under if_addr_rlock(). Returns 1 if found, 0 otherwise.
+ */
+static int
+search_ip6_addr_net (struct in6_addr * ip6_addr)
+{
+       struct ifnet *ifp;
+       struct ifaddr *ifa;
+       struct in6_addr addr;
+       int found;
+
+       TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+               found = 0;
+               if_addr_rlock(ifp);
+               TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+                       if (ifa->ifa_addr->sa_family != AF_INET6)
+                               continue;
+                       /*
+                        * Compare against a private copy: in6_clearscope()
+                        * is applied to the copy, so the address kept in
+                        * the interface's sockaddr is left as it is.
+                        */
+                       addr = ((struct in6_ifaddr *)ifa)->ia_addr.sin6_addr;
+                       in6_clearscope(&addr);
+                       if (IN6_ARE_ADDR_EQUAL(ip6_addr, &addr)) {
+                               found = 1;
+                               break;
+                       }
+               }
+               if_addr_runlock(ifp);
+               if (found)
+                       return 1;
+       }
+       return 0;
+}
+
+/*
+ * IPv6 counterpart of the reverse-path check: look up a route towards
+ * *src and decide whether it is acceptable.
+ *
+ * With ifp != NULL the route must be attached to that interface; with
+ * ifp == NULL it must be neither the default route nor a
+ * blackhole/reject route. Returns 1 for a valid route, 0 otherwise.
+ */
+static int
+verify_path6(struct in6_addr *src, struct ifnet *ifp)
+{
+       struct route_in6 ro;
+       struct sockaddr_in6 *sin6;
+       int valid;
+
+       bzero(&ro, sizeof(ro));
+       sin6 = (struct sockaddr_in6 * )&(ro.ro_dst);
+       sin6->sin6_family = AF_INET6;
+       sin6->sin6_len = sizeof(*sin6);
+       sin6->sin6_addr = *src;
+       /* XXX MRT 0 for ipv6 at this time */
+       rtalloc_ign((struct route *)&ro, 0);
+
+       if (ro.ro_rt == NULL)
+               return 0;
+
+       if (ifp != NULL) {
+               /*
+                * Compare against rt->rt_ifa->ifa_ifp rather than
+                * rt->rt_ifp, to support the case of sending packets to
+                * an address of our own (where the former interface is
+                * the first argument of if_simloop() (=ifp), the latter
+                * is lo0).
+                */
+               valid = (ro.ro_rt->rt_ifa->ifa_ifp == ifp);
+       } else {
+               /* Reject the default route and blackhole/reject routes. */
+               valid =
+                   !IN6_IS_ADDR_UNSPECIFIED(
+                       &satosin6(rt_key(ro.ro_rt))->sin6_addr) &&
+                   (ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) == 0;
+       }
+
+       /* The route is released exactly once, whatever the verdict. */
+       RTFREE(ro.ro_rt);
+       return valid;
+}
+
+/*
+ * Return 1 if 'icmp6_type' is one of the ICMPv6 query types listed
+ * below, 0 otherwise. Values above ICMP6_MAXTYPE never qualify.
+ *
+ * A chain of comparisons is used on purpose rather than a switch:
+ * nothing here guarantees the constants are pairwise distinct, and
+ * duplicate case labels would not compile.
+ */
+static int
+is_icmp6_query(int icmp6_type)
+{
+       if (icmp6_type > ICMP6_MAXTYPE)
+               return (0);
+
+       return (icmp6_type == ICMP6_ECHO_REQUEST ||
+           icmp6_type == ICMP6_MEMBERSHIP_QUERY ||
+           icmp6_type == ICMP6_WRUREQUEST ||
+           icmp6_type == ICMP6_FQDN_QUERY ||
+           icmp6_type == ICMP6_NI_QUERY);
+}
+
+/*
+ * IPv6 reject: consumes the packet in args->m and clears args->m.
+ *
+ *  - code != ICMP6_UNREACH_RST: the packet is handed to icmp6_error()
+ *    as an ICMP6_DST_UNREACH with the given code.
+ *  - code == ICMP6_UNREACH_RST: for a TCP packet that does not itself
+ *    carry RST, a reset segment is built with ipfw_send_pkt() and sent
+ *    through ip6_output(); the original packet is then freed. Non-TCP
+ *    packets are simply freed.
+ *
+ * 'hlen' is the offset from 'ip6' to the TCP header.
+ */
+static void
+send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6)
+{
+       struct mbuf *pkt = args->m;
+
+       if (code != ICMP6_UNREACH_RST) {
+               /* Send an ICMPv6 unreach. */
+#if 0
+               /*
+                * Unlike the reset case, the mbufs need to line up with
+                * the ip6 hdr, as the contents are read. We need to
+                * m_adj() the needed amount.
+                * The mbuf will however be thrown away so we can adjust it.
+                * Remember we did an m_pullup on it already so we
+                * can make some assumptions about contiguousness.
+                */
+               if (args->L3offset)
+                       m_adj(pkt, args->L3offset);
+#endif
+               icmp6_error(pkt, ICMP6_DST_UNREACH, code, 0);
+       } else {
+               if (args->f_id.proto == IPPROTO_TCP) {
+                       struct tcphdr *th;
+
+                       th = (struct tcphdr *)((char *)ip6 + hlen);
+                       /* Never answer a reset with another reset. */
+                       if ((th->th_flags & TH_RST) == 0) {
+                               struct mbuf *rst;
+
+                               rst = ipfw_send_pkt(args->m, &(args->f_id),
+                                   ntohl(th->th_seq), ntohl(th->th_ack),
+                                   th->th_flags | TH_RST);
+                               if (rst != NULL)
+                                       ip6_output(rst, NULL, NULL, 0, NULL,
+                                           NULL, NULL);
+                       }
+               }
+               FREE_PKT(pkt);
+       }
+
+       args->m = NULL;
+}
+
+#endif /* INET6 */
+
+
+/*
+ * Sends a reject message, consuming the mbuf passed in args->m;
+ * args->m is NULL on return.
+ *
+ *  - code != ICMP_REJECT_RST: SET_HOST_IPLEN(ip) is applied and the
+ *    packet is handed to icmp_error() as an ICMP_UNREACH with 'code'.
+ *  - code == ICMP_REJECT_RST on a TCP packet: unless the segment itself
+ *    carries RST, a reset is built with ipfw_send_pkt() and sent via
+ *    ip_output(); the original packet is then freed.
+ *  - anything else: the packet is freed.
+ *
+ * 'iplen' is not referenced directly in this function. The TCP header
+ * is located from mtod(args->m), not from the 'ip' argument.
+ */
+static void
+send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip)
+{
+
+#if 0
+       /*
+        * XXX When ip is not guaranteed to be at mtod() we will
+        * need to account for this.
+        * The mbuf will however be thrown away so we can adjust it.
+        * Remember we did an m_pullup on it already so we
+        * can make some assumptions about contiguousness.
+        */
+       if (args->L3offset)
+               m_adj(args->m, args->L3offset);
+#endif
+       if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
+               /* We need the IP header in host order for icmp_error(). */
+               SET_HOST_IPLEN(ip);
+               icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
+       } else if (args->f_id.proto == IPPROTO_TCP) {
+               struct tcphdr *const tcp =
+                   L3HDR(struct tcphdr, mtod(args->m, struct ip *));
+               /* Never answer a reset with another reset. */
+               if ( (tcp->th_flags & TH_RST) == 0) {
+                       struct mbuf *m;
+                       m = ipfw_send_pkt(args->m, &(args->f_id),
+                               ntohl(tcp->th_seq), ntohl(tcp->th_ack),
+                               tcp->th_flags | TH_RST);
+                       if (m != NULL)
+                               ip_output(m, NULL, NULL, 0, NULL, NULL);
+               }
+               FREE_PKT(args->m);
+       } else
+               FREE_PKT(args->m);
+       args->m = NULL;
+}
+
+/*
+ * Support for uid/gid/jail lookup. These tests are expensive
+ * (because we may need to look into the list of active sockets)
+ * so we cache the results. ugid_lookupp is 0 if we have not
+ * yet done a lookup, 1 if we succeeded, and -1 if we tried
+ * and failed. The function always returns the match value.
+ * We could actually spare the variable and use *uc, setting
+ * it to '(void *)check_uidgid if we have no info, NULL if
+ * we tried and failed, or any other value if successful.
+ *
+ * Arguments:
+ *     insn            the O_UID / O_GID / O_JAIL instruction; insn->d[0]
+ *                     holds the id to compare against.
+ *     proto           IP protocol; on FreeBSD only TCP and UDP are
+ *                     looked up, anything else returns 0.
+ *     oif             selects the order in which the address/port
+ *                     pairs are passed to the PCB lookup (see below).
+ *     dst_ip/dst_port, src_ip/src_port
+ *                     the packet's endpoints; htons() is applied to
+ *                     the ports before the lookup.
+ *     ugid_lookupp    in/out lookup state as described above.
+ *     uc              in/out cached credential. On FreeBSD it is
+ *                     obtained with crhold() and is not released in
+ *                     this function -- presumably the caller drops the
+ *                     reference; confirm.
+ *     inp             PCB supplied by the protocol stack, or NULL.
+ *
+ * Returns non-zero if the credential matches the instruction, else 0.
+ */
+static int
+check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
+    struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
+    u_int16_t src_port, int *ugid_lookupp,
+    struct ucred **uc, struct inpcb *inp)
+{
+#ifndef __FreeBSD__
+       /*
+        * Non-FreeBSD builds delegate everything to cred_check(). Here
+        * 'uc' is reinterpreted as a struct bsd_ucred * and 'inp' as a
+        * struct mbuf * whose m_skb member is passed on.
+        */
+       return cred_check(insn, proto, oif,
+           dst_ip, dst_port, src_ip, src_port,
+           (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb);
+#else  /* FreeBSD */
+       struct inpcbinfo *pi;
+       int wildcard;
+       struct inpcb *pcb;
+       int match;
+
+       /*
+        * Check to see if the UDP or TCP stack supplied us with
+        * the PCB. If so, rather than holding a lock and looking
+        * up the PCB, we can use the one that was supplied.
+        */
+       if (inp && *ugid_lookupp == 0) {
+               INP_LOCK_ASSERT(inp);
+               if (inp->inp_socket != NULL) {
+                       *uc = crhold(inp->inp_cred);
+                       *ugid_lookupp = 1;
+               } else
+                       *ugid_lookupp = -1;
+       }
+       /*
+        * If we have already been here and the packet has no
+        * PCB entry associated with it, then we can safely
+        * assume that this is a no match.
+        */
+       if (*ugid_lookupp == -1)
+               return (0);
+       /* Pick the PCB table and the lookup flags for this protocol. */
+       if (proto == IPPROTO_TCP) {
+               wildcard = 0;
+               pi = &V_tcbinfo;
+       } else if (proto == IPPROTO_UDP) {
+               wildcard = INPLOOKUP_WILDCARD;
+               pi = &V_udbinfo;
+       } else
+               return 0;
+       match = 0;
+       if (*ugid_lookupp == 0) {
+               /*
+                * No cached credential yet: search the PCB hash under
+                * the info read lock. With oif set the destination pair
+                * is passed first, otherwise the source pair is.
+                */
+               INP_INFO_RLOCK(pi);
+               pcb =  (oif) ?
+                       in_pcblookup_hash(pi,
+                               dst_ip, htons(dst_port),
+                               src_ip, htons(src_port),
+                               wildcard, oif) :
+                       in_pcblookup_hash(pi,
+                               src_ip, htons(src_port),
+                               dst_ip, htons(dst_port),
+                               wildcard, NULL);
+               if (pcb != NULL) {
+                       /* Take the reference before the lock is dropped. */
+                       *uc = crhold(pcb->inp_cred);
+                       *ugid_lookupp = 1;
+               }
+               INP_INFO_RUNLOCK(pi);
+               if (*ugid_lookupp == 0) {
+                       /*
+                        * We tried and failed, set the variable to -1
+                        * so we will not try again on this packet.
+                        */
+                       *ugid_lookupp = -1;
+                       return (0);
+               }
+       } 
+       /* A credential is available in *uc; any other opcode leaves match 0. */
+       if (insn->o.opcode == O_UID)
+               match = ((*uc)->cr_uid == (uid_t)insn->d[0]);
+       else if (insn->o.opcode == O_GID)
+               match = groupmember((gid_t)insn->d[0], *uc);
+       else if (insn->o.opcode == O_JAIL)
+               match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]);
+       return match;
+#endif /* __FreeBSD__ */
+}
+
+/*
+ * Record in args->rule where rule processing should resume after the
+ * rule that matched at index 'slot' of chain->map[].
+ *
+ * The slot is exact and stored as slot + 1 (0 is used as a marker).
+ * rule_id is a guess: the matching rule's id plus one, on the premise
+ * that ids are assigned sequentially. rulenum is the matching rule's
+ * own number, and chain_id identifies the chain the slot refers to.
+ */
+static inline void
+set_match(struct ip_fw_args *args, int slot,
+       struct ip_fw_chain *chain)
+{
+       struct ip_fw *matched = chain->map[slot];
+
+       args->rule.chain_id = chain->id;
+       args->rule.slot = slot + 1; /* we use 0 as a marker */
+       args->rule.rule_id = matched->id + 1;
+       args->rule.rulenum = matched->rulenum;
+}
+
+/*
+ * The main check routine for the firewall.
+ *
+ * All arguments are in args so we can modify them and return them
+ * back to the caller.
+ *
+ * Parameters:
+ *
+ *     args->m (in/out) The packet; we set to NULL when/if we nuke it.
+ *             Starts with the IP header.
+ *     args->eh (in)   Mac header if present, NULL for layer3 packet.
+ *     args->L3offset  Number of bytes bypassed if we came from L2.
+ *                     e.g. often sizeof(eh)  ** NOTYET **
+ *     args->oif       Outgoing interface, NULL if packet is incoming.
+ *             The incoming interface is in the mbuf. (in)
+ *     args->divert_rule (in/out)
+ *             Skip up to the first rule past this rule number;
+ *             upon return, non-zero port number for divert or tee.
+ *
+ *     args->rule      Pointer to the last matching rule (in/out)
+ *     args->next_hop  Socket we are forwarding to (out).
+ *     args->f_id      Addresses grabbed from the packet (out)
+ *     args->rule.info a cookie depending on rule action
+ *
+ * Return value:
+ *
+ *     IP_FW_PASS      the packet must be accepted
+ *     IP_FW_DENY      the packet must be dropped
+ *     IP_FW_DIVERT    divert packet, port in m_tag
+ *     IP_FW_TEE       tee packet, port in m_tag
+ *     IP_FW_DUMMYNET  to dummynet, pipe in args->cookie
+ *     IP_FW_NETGRAPH  into netgraph, cookie args->cookie
+ *             args->rule contains the matching rule,
+ *             args->rule.info has additional information.
+ *
+ */
+int
+ipfw_chk(struct ip_fw_args *args)
+{
+
+       /*
+        * Local variables holding state while processing a packet:
+        *
+        * IMPORTANT NOTE: to speed up the processing of rules, there
+        * are some assumption on the values of the variables, which
+        * are documented here. Should you change them, please check
+        * the implementation of the various instructions to make sure
+        * that they still work.
+        *
+        * args->eh     The MAC header. It is non-null for a layer2
+        *      packet, it is NULL for a layer-3 packet.
+        * **notyet**
+        * args->L3offset Offset in the packet to the L3 (IP or equiv.) header.
+        *
+        * m | args->m  Pointer to the mbuf, as received from the caller.
+        *      It may change if ipfw_chk() does an m_pullup, or if it
+        *      consumes the packet because it calls send_reject().
+        *      XXX This has to change, so that ipfw_chk() never modifies
+        *      or consumes the buffer.
+        * ip   is the beginning of the ip(4 or 6) header.
+        *      Calculated by adding the L3offset to the start of data.
+        *      (Until we start using L3offset, the packet is
+        *      supposed to start with the ip header).
+        */
+       struct mbuf *m = args->m;
+       struct ip *ip = mtod(m, struct ip *);
+
+       /*
+        * For rules which contain uid/gid or jail constraints, cache
+        * a copy of the users credentials after the pcb lookup has been
+        * executed. This will speed up the processing of rules with
+        * these types of constraints, as well as decrease contention
+        * on pcb related locks.
+        */
+#ifndef __FreeBSD__
+       struct bsd_ucred ucred_cache;
+#else
+       struct ucred *ucred_cache = NULL;
+#endif
+       int ucred_lookup = 0;
+
+       /*
+        * oif | args->oif      If NULL, ipfw_chk has been called on the
+        *      inbound path (ether_input, ip_input).
+        *      If non-NULL, ipfw_chk has been called on the outbound path
+        *      (ether_output, ip_output).
+        */
+       struct ifnet *oif = args->oif;
+
+       int f_pos = 0;          /* index of current rule in the array */
+       int retval = 0;
+
+       /*
+        * hlen The length of the IP header.
+        */
+       u_int hlen = 0;         /* hlen >0 means we have an IP pkt */
+
+       /*
+        * offset       The offset of a fragment. offset != 0 means that
+        *      we have a fragment at this offset of an IPv4 packet.
+        *      offset == 0 means that (if this is an IPv4 packet)
+        *      this is the first or only fragment.
+        *      For IPv6 offset == 0 means there is no Fragment Header. 
+        *      If offset != 0 for IPv6 always use correct mask to
+        *      get the correct offset because we add IP6F_MORE_FRAG
+        *      to be able to detect the first fragment which would
+        *      otherwise have offset = 0.
+        */
+       u_short offset = 0;
+
+       /*
+        * Local copies of addresses. They are only valid if we have
+        * an IP packet.
+        *
+        * proto        The protocol. Set to 0 for non-ip packets,
+        *      or to the protocol read from the packet otherwise.
+        *      proto != 0 means that we have an IPv4 packet.
+        *
+        * src_port, dst_port   port numbers, in HOST format. Only
+        *      valid for TCP and UDP packets.
+        *
+        * src_ip, dst_ip       ip addresses, in NETWORK format.
+        *      Only valid for IPv4 packets.
+        */
+       uint8_t proto;
+       uint16_t src_port = 0, dst_port = 0;    /* NOTE: host format    */
+       struct in_addr src_ip, dst_ip;          /* NOTE: network format */
+       uint16_t iplen=0;
+       int pktlen;
+       uint16_t        etype = 0;      /* Host order stored ether type */
+
+       /*
+        * dyn_dir = MATCH_UNKNOWN when rules unchecked,
+        *      MATCH_NONE when checked and not matched (q = NULL),
+        *      MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL)
+        */
+       int dyn_dir = MATCH_UNKNOWN;
+       ipfw_dyn_rule *q = NULL;
+       struct ip_fw_chain *chain = &V_layer3_chain;
+
+       /*
+        * We store in ulp a pointer to the upper layer protocol header.
+        * In the ipv4 case this is easy to determine from the header,
+        * but for ipv6 we might have some additional headers in the middle.
+        * ulp is NULL if not found.
+        */
+       void *ulp = NULL;               /* upper layer protocol pointer. */
+
+       /* XXX ipv6 variables */
+       int is_ipv6 = 0;
+       uint8_t icmp6_type = 0;
+       uint16_t ext_hd = 0;    /* bits vector for extension header filtering */
+       /* end of ipv6 variables */
+
+       int is_ipv4 = 0;
+
+       int done = 0;           /* flag to exit the outer loop */
+
+       if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready))
+               return (IP_FW_PASS);    /* accept */
+
+       dst_ip.s_addr = 0;              /* make sure it is initialized */
+       src_ip.s_addr = 0;              /* make sure it is initialized */
+       pktlen = m->m_pkthdr.len;
+       args->f_id.fib = M_GETFIB(m); /* note mbuf not altered) */
+       proto = args->f_id.proto = 0;   /* mark f_id invalid */
+               /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */
+
+/*
+ * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
+ * then it sets p to point at the offset "len" in the mbuf. WARNING: the
+ * pointer might become stale after other pullups (but we never use it
+ * this way).
+ */
+#define PULLUP_TO(_len, p, T)                                  \
+do {                                                           \
+       int x = (_len) + sizeof(T);                             \
+       if ((m)->m_len < x) {                                   \
+               args->m = m = m_pullup(m, x);                   \
+               if (m == NULL)                                  \
+                       goto pullup_failed;                     \
+       }                                                       \
+       p = (mtod(m, char *) + (_len));                         \
+} while (0)
+
+       /*
+        * if we have an ether header,
+        */
+       if (args->eh)
+               etype = ntohs(args->eh->ether_type);
+
+       /* Identify IP packets and fill up variables. */
+       if (pktlen >= sizeof(struct ip6_hdr) &&
+           (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) {
+               struct ip6_hdr *ip6 = (struct ip6_hdr *)ip;
+               is_ipv6 = 1;
+               args->f_id.addr_type = 6;
+               hlen = sizeof(struct ip6_hdr);
+               proto = ip6->ip6_nxt;
+
+               /* Search extension headers to find upper layer protocols */
+               while (ulp == NULL) {
+                       switch (proto) {
+                       case IPPROTO_ICMPV6:
+                               PULLUP_TO(hlen, ulp, struct icmp6_hdr);
+                               icmp6_type = ICMP6(ulp)->icmp6_type;
+                               break;
+
+                       case IPPROTO_TCP:
+                               PULLUP_TO(hlen, ulp, struct tcphdr);
+                               dst_port = TCP(ulp)->th_dport;
+                               src_port = TCP(ulp)->th_sport;
+                               /* save flags for dynamic rules */
+                               args->f_id._flags = TCP(ulp)->th_flags;
+                               break;
+
+                       case IPPROTO_SCTP:
+                               PULLUP_TO(hlen, ulp, struct sctphdr);
+                               src_port = SCTP(ulp)->src_port;
+                               dst_port = SCTP(ulp)->dest_port;
+                               break;
+
+                       case IPPROTO_UDP:
+                               PULLUP_TO(hlen, ulp, struct udphdr);
+                               dst_port = UDP(ulp)->uh_dport;
+                               src_port = UDP(ulp)->uh_sport;
+                               break;
+
+                       case IPPROTO_HOPOPTS:   /* RFC 2460 */
+                               PULLUP_TO(hlen, ulp, struct ip6_hbh);
+                               ext_hd |= EXT_HOPOPTS;
+                               hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
+                               proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_ROUTING:   /* RFC 2460 */
+                               PULLUP_TO(hlen, ulp, struct ip6_rthdr);
+                               switch (((struct ip6_rthdr *)ulp)->ip6r_type) {
+                               case 0:
+                                       ext_hd |= EXT_RTHDR0;
+                                       break;
+                               case 2:
+                                       ext_hd |= EXT_RTHDR2;
+                                       break;
+                               default:
+                                       printf("IPFW2: IPV6 - Unknown Routing "
+                                           "Header type(%d)\n",
+                                           ((struct ip6_rthdr *)ulp)->ip6r_type);
+                                       if (V_fw_deny_unknown_exthdrs)
+                                           return (IP_FW_DENY);
+                                       break;
+                               }
+                               ext_hd |= EXT_ROUTING;
+                               hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
+                               proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_FRAGMENT:  /* RFC 2460 */
+                               PULLUP_TO(hlen, ulp, struct ip6_frag);
+                               ext_hd |= EXT_FRAGMENT;
+                               hlen += sizeof (struct ip6_frag);
+                               proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
+                               offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
+                                       IP6F_OFF_MASK;
+                               /* Add IP6F_MORE_FRAG for offset of first
+                                * fragment to be != 0. */
+                               offset |= ((struct ip6_frag *)ulp)->ip6f_offlg &
+                                       IP6F_MORE_FRAG;
+                               if (offset == 0) {
+                                       printf("IPFW2: IPV6 - Invalid Fragment "
+                                           "Header\n");
+                                       if (V_fw_deny_unknown_exthdrs)
+                                           return (IP_FW_DENY);
+                                       break;
+                               }
+                               args->f_id.extra =
+                                   ntohl(((struct ip6_frag *)ulp)->ip6f_ident);
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_DSTOPTS:   /* RFC 2460 */
+                               PULLUP_TO(hlen, ulp, struct ip6_hbh);
+                               ext_hd |= EXT_DSTOPTS;
+                               hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
+                               proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_AH:        /* RFC 2402 */
+                               PULLUP_TO(hlen, ulp, struct ip6_ext);
+                               ext_hd |= EXT_AH;
+                               hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
+                               proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_ESP:       /* RFC 2406 */
+                               PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */
+                               /* Anything past Seq# is variable length and
+                                * data past this ext. header is encrypted. */
+                               ext_hd |= EXT_ESP;
+                               break;
+
+                       case IPPROTO_NONE:      /* RFC 2460 */
+                               /*
+                                * Packet ends here, and IPv6 header has
+                                * already been pulled up. If ip6e_len!=0
+                                * then octets must be ignored.
+                                */
+                               ulp = ip; /* non-NULL to get out of loop. */
+                               break;
+
+                       case IPPROTO_OSPFIGP:
+                               /* XXX OSPF header check? */
+                               PULLUP_TO(hlen, ulp, struct ip6_ext);
+                               break;
+
+                       case IPPROTO_PIM:
+                               /* XXX PIM header check? */
+                               PULLUP_TO(hlen, ulp, struct pim);
+                               break;
+
+                       case IPPROTO_CARP:
+                               PULLUP_TO(hlen, ulp, struct carp_header);
+                               if (((struct carp_header *)ulp)->carp_version !=
+                                   CARP_VERSION) 
+                                       return (IP_FW_DENY);
+                               if (((struct carp_header *)ulp)->carp_type !=
+                                   CARP_ADVERTISEMENT) 
+                                       return (IP_FW_DENY);
+                               break;
+
+                       case IPPROTO_IPV6:      /* RFC 2893 */
+                               PULLUP_TO(hlen, ulp, struct ip6_hdr);
+                               break;
+
+                       case IPPROTO_IPV4:      /* RFC 2893 */
+                               PULLUP_TO(hlen, ulp, struct ip);
+                               break;
+
+                       default:
+                               printf("IPFW2: IPV6 - Unknown Extension "
+                                   "Header(%d), ext_hd=%x\n", proto, ext_hd);
+                               if (V_fw_deny_unknown_exthdrs)
+                                   return (IP_FW_DENY);
+                               PULLUP_TO(hlen, ulp, struct ip6_ext);
+                               break;
+                       } /*switch */
+               }
+               ip = mtod(m, struct ip *);
+               ip6 = (struct ip6_hdr *)ip;
+               args->f_id.src_ip6 = ip6->ip6_src;
+               args->f_id.dst_ip6 = ip6->ip6_dst;
+               args->f_id.src_ip = 0;
+               args->f_id.dst_ip = 0;
+               args->f_id.flow_id6 = ntohl(ip6->ip6_flow);
+       } else if (pktlen >= sizeof(struct ip) &&
+           (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) {
+               is_ipv4 = 1;
+               hlen = ip->ip_hl << 2;
+               args->f_id.addr_type = 4;
+
+               /*
+                * Collect parameters into local variables for faster matching.
+                */
+               proto = ip->ip_p;
+               src_ip = ip->ip_src;
+               dst_ip = ip->ip_dst;
+               offset = ntohs(ip->ip_off) & IP_OFFMASK;
+               iplen = ntohs(ip->ip_len);
+               pktlen = iplen < pktlen ? iplen : pktlen;
+
+               if (offset == 0) {
+                       switch (proto) {
+                       case IPPROTO_TCP:
+                               PULLUP_TO(hlen, ulp, struct tcphdr);
+                               dst_port = TCP(ulp)->th_dport;
+                               src_port = TCP(ulp)->th_sport;
+                               /* save flags for dynamic rules */
+                               args->f_id._flags = TCP(ulp)->th_flags;
+                               break;
+
+                       case IPPROTO_UDP:
+                               PULLUP_TO(hlen, ulp, struct udphdr);
+                               dst_port = UDP(ulp)->uh_dport;
+                               src_port = UDP(ulp)->uh_sport;
+                               break;
+
+                       case IPPROTO_ICMP:
+                               PULLUP_TO(hlen, ulp, struct icmphdr);
+                               //args->f_id.flags = ICMP(ulp)->icmp_type;
+                               break;
+
+                       default:
+                               break;
+                       }
+               }
+
+               ip = mtod(m, struct ip *);
+               args->f_id.src_ip = ntohl(src_ip.s_addr);
+               args->f_id.dst_ip = ntohl(dst_ip.s_addr);
+       }
+#undef PULLUP_TO
+       if (proto) { /* we may have port numbers, store them */
+               args->f_id.proto = proto;
+               args->f_id.src_port = src_port = ntohs(src_port);
+               args->f_id.dst_port = dst_port = ntohs(dst_port);
+       }
+
+       IPFW_RLOCK(chain);
+       if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */
+               IPFW_RUNLOCK(chain);
+               return (IP_FW_PASS);    /* accept */
+       }
+       if (args->rule.slot) {
+               /*
+                * Packet has already been tagged as a result of a previous
+                * match on rule args->rule aka args->rule_id (PIPE, QUEUE,
+                * REASS, NETGRAPH, DIVERT/TEE...)
+                * Validate the slot and continue from the next one
+                * if still present, otherwise do a lookup.
+                */
+               f_pos = (args->rule.chain_id == chain->id) ?
+                   args->rule.slot :
+                   ipfw_find_rule(chain, args->rule.rulenum,
+                       args->rule.rule_id);
+       } else {
+               f_pos = 0;
+       }
+
+       /*
+        * Now scan the rules, and parse microinstructions for each rule.
+        * We have two nested loops and an inner switch. Sometimes we
+        * need to break out of one or both loops, or re-enter one of
+        * the loops with updated variables. Loop variables are:
+        *
+        *      f_pos (outer loop) points to the current rule.
+        *              On output it points to the matching rule.
+        *      done (outer loop) is used as a flag to break the loop.
+        *      l (inner loop)  residual length of current rule.
+        *              cmd points to the current microinstruction.
+        *
+        * We break the inner loop by setting l=0 and possibly
+        * cmdlen=0 if we don't want to advance cmd.
+        * We break the outer loop by setting done=1
+        * We can restart the inner loop by setting l>0 and f_pos, f, cmd
+        * as needed.
+        */
+       for (; f_pos < chain->n_rules; f_pos++) {
+               ipfw_insn *cmd;
+               uint32_t tablearg = 0;
+               int l, cmdlen, skip_or; /* skip rest of OR block */
+               struct ip_fw *f;
+
+               f = chain->map[f_pos];
+               if (V_set_disable & (1 << f->set) )
+                       continue;
+
+               skip_or = 0;
+               for (l = f->cmd_len, cmd = f->cmd ; l > 0 ;
+                   l -= cmdlen, cmd += cmdlen) {
+                       int match;
+
+                       /*
+                        * check_body is a jump target used when we find a
+                        * CHECK_STATE, and need to jump to the body of
+                        * the target rule.
+                        */
+
+/* check_body: */
+                       cmdlen = F_LEN(cmd);
+                       /*
+                        * An OR block (insn_1 || .. || insn_n) has the
+                        * F_OR bit set in all but the last instruction.
+                        * The first match will set "skip_or", and cause
+                        * the following instructions to be skipped until
+                        * past the one with the F_OR bit clear.
+                        */
+                       if (skip_or) {          /* skip this instruction */
+                               if ((cmd->len & F_OR) == 0)
+                                       skip_or = 0;    /* next one is good */
+                               continue;
+                       }
+                       match = 0; /* set to 1 if we succeed */
+
+                       switch (cmd->opcode) {
+                       /*
+                        * The first set of opcodes compares the packet's
+                        * fields with some pattern, setting 'match' if a
+                        * match is found. At the end of the loop there is
+                        * logic to deal with F_NOT and F_OR flags associated
+                        * with the opcode.
+                        */
+                       case O_NOP:
+                               match = 1;
+                               break;
+
+                       case O_FORWARD_MAC:
+                               printf("ipfw: opcode %d unimplemented\n",
+                                   cmd->opcode);
+                               break;
+
+                       case O_GID:
+                       case O_UID:
+                       case O_JAIL:
+                               /*
+                                * We only check offset == 0 && proto != 0,
+                                * as this ensures that we have a
+                                * packet with the ports info.
+                                */
+                               if (offset!=0)
+                                       break;
+                               if (is_ipv6) /* XXX to be fixed later */
+                                       break;
+                               if (proto == IPPROTO_TCP ||
+                                   proto == IPPROTO_UDP)
+                                       match = check_uidgid(
+                                                   (ipfw_insn_u32 *)cmd,
+                                                   proto, oif,
+                                                   dst_ip, dst_port,
+                                                   src_ip, src_port, &ucred_lookup,
+#ifdef __FreeBSD__
+                                                   &ucred_cache, args->inp);
+#else
+                                                   (void *)&ucred_cache,
+                                                   (struct inpcb *)args->m);
+#endif
+                               break;
+
+                       case O_RECV:
+                               match = iface_match(m->m_pkthdr.rcvif,
+                                   (ipfw_insn_if *)cmd);
+                               break;
+
+                       case O_XMIT:
+                               match = iface_match(oif, (ipfw_insn_if *)cmd);
+                               break;
+
+                       case O_VIA:
+                               match = iface_match(oif ? oif :
+                                   m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
+                               break;
+
+                       case O_MACADDR2:
+                               if (args->eh != NULL) { /* have MAC header */
+                                       u_int32_t *want = (u_int32_t *)
+                                               ((ipfw_insn_mac *)cmd)->addr;
+                                       u_int32_t *mask = (u_int32_t *)
+                                               ((ipfw_insn_mac *)cmd)->mask;
+                                       u_int32_t *hdr = (u_int32_t *)args->eh;
+
+                                       match =
+                                           ( want[0] == (hdr[0] & mask[0]) &&
+                                             want[1] == (hdr[1] & mask[1]) &&
+                                             want[2] == (hdr[2] & mask[2]) );
+                               }
+                               break;
+
+                       case O_MAC_TYPE:
+                               if (args->eh != NULL) {
+                                       u_int16_t *p =
+                                           ((ipfw_insn_u16 *)cmd)->ports;
+                                       int i;
+
+                                       for (i = cmdlen - 1; !match && i>0;
+                                           i--, p += 2)
+                                               match = (etype >= p[0] &&
+                                                   etype <= p[1]);
+                               }
+                               break;
+
+                       case O_FRAG:
+                               match = (offset != 0);
+                               break;
+
+                       case O_IN:      /* "out" is "not in" */
+                               match = (oif == NULL);
+                               break;
+
+                       case O_LAYER2:
+                               match = (args->eh != NULL);
+                               break;
+
+                       case O_DIVERTED:
+                           {
+                               /* For diverted packets, args->rule.info
+                                * contains the divert port (in host format)
+                                * reason and direction.
+                                */
+                               uint32_t i = args->rule.info;
+                               match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT &&
+                                   cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2);
+                           }
+                               break;
+
+                       case O_PROTO:
+                               /*
+                                * We do not allow an arg of 0 so the
+                                * check of "proto" only suffices.
+                                */
+                               match = (proto == cmd->arg1);
+                               break;
+
+                       case O_IP_SRC:
+                               match = is_ipv4 &&
+                                   (((ipfw_insn_ip *)cmd)->addr.s_addr ==
+                                   src_ip.s_addr);
+                               break;
+
+                       case O_IP_SRC_LOOKUP:
+                       case O_IP_DST_LOOKUP:
+                               if (is_ipv4) {
+                                   uint32_t key =
+                                       (cmd->opcode == O_IP_DST_LOOKUP) ?
+                                           dst_ip.s_addr : src_ip.s_addr;
+                                   uint32_t v = 0;
+
+                                   if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) {
+                                       /* generic lookup. The key must be
+                                        * in 32bit big-endian format.
+                                        */
+                                       v = ((ipfw_insn_u32 *)cmd)->d[1];
+                                       if (v == 0)
+                                           key = dst_ip.s_addr;
+                                       else if (v == 1)
+                                           key = src_ip.s_addr;
+                                       else if (v == 6) /* dscp */
+                                           key = (ip->ip_tos >> 2) & 0x3f;
+                                       else if (offset != 0)
+                                           break;
+                                       else if (proto != IPPROTO_TCP &&
+                                               proto != IPPROTO_UDP)
+                                           break;
+                                       else if (v == 2)
+                                           key = htonl(dst_port);
+                                       else if (v == 3)
+                                           key = htonl(src_port);
+                                       else if (v == 4 || v == 5) {
+                                           check_uidgid(
+                                               (ipfw_insn_u32 *)cmd,
+                                               proto, oif,
+                                               dst_ip, dst_port,
+                                               src_ip, src_port, &ucred_lookup,
+#ifdef __FreeBSD__
+                                               &ucred_cache, args->inp);
+                                           if (v == 4 /* O_UID */)
+                                               key = ucred_cache->cr_uid;
+                                           else if (v == 5 /* O_JAIL */)
+                                               key = ucred_cache->cr_prison->pr_id;
+#else /* !__FreeBSD__ */
+                                               (void *)&ucred_cache,
+                                               (struct inpcb *)args->m);
+                                           if (v ==4 /* O_UID */)
+                                               key = ucred_cache.uid;
+                                           else if (v == 5 /* O_JAIL */)
+                                               key = ucred_cache.xid;
+#endif /* !__FreeBSD__ */
+                                           key = htonl(key);
+                                       } else
+                                           break;
+                                   }
+                                   match = ipfw_lookup_table(chain,
+                                       cmd->arg1, key, &v);
+                                   if (!match)
+                                       break;
+                                   if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
+                                       match =
+                                           ((ipfw_insn_u32 *)cmd)->d[0] == v;
+                                   else
+                                       tablearg = v;
+                               }
+                               break;
+
+                       case O_IP_SRC_MASK:
+                       case O_IP_DST_MASK:
+                               if (is_ipv4) {
+                                   uint32_t a =
+                                       (cmd->opcode == O_IP_DST_MASK) ?
+                                           dst_ip.s_addr : src_ip.s_addr;
+                                   uint32_t *p = ((ipfw_insn_u32 *)cmd)->d;
+                                   int i = cmdlen-1;
+
+                                   for (; !match && i>0; i-= 2, p+= 2)
+                                       match = (p[0] == (a & p[1]));
+                               }
+                               break;
+
+                       case O_IP_SRC_ME:
+                               if (is_ipv4) {
+                                       struct ifnet *tif;
+
+                                       INADDR_TO_IFP(src_ip, tif);
+                                       match = (tif != NULL);
+                                       break;
+                               }
+#ifdef INET6
+                               /* FALLTHROUGH */
+                       case O_IP6_SRC_ME:
+                               match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6);
+#endif
+                               break;
+
+                       case O_IP_DST_SET:
+                       case O_IP_SRC_SET:
+                               if (is_ipv4) {
+                                       u_int32_t *d = (u_int32_t *)(cmd+1);
+                                       u_int32_t addr =
+                                           cmd->opcode == O_IP_DST_SET ?
+                                               args->f_id.dst_ip :
+                                               args->f_id.src_ip;
+
+                                           if (addr < d[0])
+                                                   break;
+                                           addr -= d[0]; /* subtract base */
+                                           match = (addr < cmd->arg1) &&
+                                               ( d[ 1 + (addr>>5)] &
+                                                 (1<<(addr & 0x1f)) );
+                               }
+                               break;
+
+                       case O_IP_DST:
+                               match = is_ipv4 &&
+                                   (((ipfw_insn_ip *)cmd)->addr.s_addr ==
+                                   dst_ip.s_addr);
+                               break;
+
+                       case O_IP_DST_ME:
+                               if (is_ipv4) {
+                                       struct ifnet *tif;
+
+                                       INADDR_TO_IFP(dst_ip, tif);
+                                       match = (tif != NULL);
+                                       break;
+                               }
+#ifdef INET6
+                               /* FALLTHROUGH */
+                       case O_IP6_DST_ME:
+                               match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6);
+#endif
+                               break;
+
+
+                       case O_IP_SRCPORT:
+                       case O_IP_DSTPORT:
+                               /*
+                                * offset == 0 && proto != 0 is enough
+                                * to guarantee that we have a
+                                * packet with port info.
+                                */
+                               if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP)
+                                   && offset == 0) {
+                                       u_int16_t x =
+                                           (cmd->opcode == O_IP_SRCPORT) ?
+                                               src_port : dst_port ;
+                                       u_int16_t *p =
+                                           ((ipfw_insn_u16 *)cmd)->ports;
+                                       int i;
+
+                                       for (i = cmdlen - 1; !match && i>0;
+                                           i--, p += 2)
+                                               match = (x>=p[0] && x<=p[1]);
+                               }
+                               break;
+
+                       case O_ICMPTYPE:
+                               match = (offset == 0 && proto==IPPROTO_ICMP &&
+                                   icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) );
+                               break;
+
+#ifdef INET6
+                       case O_ICMP6TYPE:
+                               match = is_ipv6 && offset == 0 &&
+                                   proto==IPPROTO_ICMPV6 &&
+                                   icmp6type_match(
+                                       ICMP6(ulp)->icmp6_type,
+                                       (ipfw_insn_u32 *)cmd);
+                               break;
+#endif /* INET6 */
+
+                       case O_IPOPT:
+                               match = (is_ipv4 &&
+                                   ipopts_match(ip, cmd) );
+                               break;
+
+                       case O_IPVER:
+                               match = (is_ipv4 &&
+                                   cmd->arg1 == ip->ip_v);
+                               break;
+
+                       case O_IPID:
+                       case O_IPLEN:
+                       case O_IPTTL:
+                               if (is_ipv4) {  /* only for IP packets */
+                                   uint16_t x;
+                                   uint16_t *p;
+                                   int i;
+
+                                   if (cmd->opcode == O_IPLEN)
+                                       x = iplen;
+                                   else if (cmd->opcode == O_IPTTL)
+                                       x = ip->ip_ttl;
+                                   else /* must be IPID */
+                                       x = ntohs(ip->ip_id);
+                                   if (cmdlen == 1) {
+                                       match = (cmd->arg1 == x);
+                                       break;
+                                   }
+                                   /* otherwise we have ranges */
+                                   p = ((ipfw_insn_u16 *)cmd)->ports;
+                                   i = cmdlen - 1;
+                                   for (; !match && i>0; i--, p += 2)
+                                       match = (x >= p[0] && x <= p[1]);
+                               }
+                               break;
+
+                       case O_IPPRECEDENCE:
+                               match = (is_ipv4 &&
+                                   (cmd->arg1 == (ip->ip_tos & 0xe0)) );
+                               break;
+
+                       case O_IPTOS:
+                               match = (is_ipv4 &&
+                                   flags_match(cmd, ip->ip_tos));
+                               break;
+
+                       case O_TCPDATALEN:
+                               if (proto == IPPROTO_TCP && offset == 0) {
+                                   struct tcphdr *tcp;
+                                   uint16_t x;
+                                   uint16_t *p;
+                                   int i;
+
+                                   tcp = TCP(ulp);
+                                   x = iplen -
+                                       ((ip->ip_hl + tcp->th_off) << 2);
+                                   if (cmdlen == 1) {
+                                       match = (cmd->arg1 == x);
+                                       break;
+                                   }
+                                   /* otherwise we have ranges */
+                                   p = ((ipfw_insn_u16 *)cmd)->ports;
+                                   i = cmdlen - 1;
+                                   for (; !match && i>0; i--, p += 2)
+                                       match = (x >= p[0] && x <= p[1]);
+                               }
+                               break;
+
+                       case O_TCPFLAGS:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   flags_match(cmd, TCP(ulp)->th_flags));
+                               break;
+
+                       case O_TCPOPTS:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   tcpopts_match(TCP(ulp), cmd));
+                               break;
+
+                       case O_TCPSEQ:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   ((ipfw_insn_u32 *)cmd)->d[0] ==
+                                       TCP(ulp)->th_seq);
+                               break;
+
+                       case O_TCPACK:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   ((ipfw_insn_u32 *)cmd)->d[0] ==
+                                       TCP(ulp)->th_ack);
+                               break;
+
+                       case O_TCPWIN:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   cmd->arg1 == TCP(ulp)->th_win);
+                               break;
+
+                       case O_ESTAB:
+                               /* reject packets which have SYN only */
+                               /* XXX should i also check for TH_ACK ? */
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   (TCP(ulp)->th_flags &
+                                    (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
+                               break;
+
+                       case O_ALTQ: {
+                               struct pf_mtag *at;
+                               ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
+
+                               match = 1;
+                               at = pf_find_mtag(m);
+                               if (at != NULL && at->qid != 0)
+                                       break;
+                               at = pf_get_mtag(m);
+                               if (at == NULL) {
+                                       /*
+                                        * Let the packet fall back to the
+                                        * default ALTQ.
+                                        */
+                                       break;
+                               }
+                               at->qid = altq->qid;
+                               if (is_ipv4)
+                                       at->af = AF_INET;
+                               else
+                                       at->af = AF_LINK;
+                               at->hdr = ip;
+                               break;
+                       }
+
+                       case O_LOG:
+                               ipfw_log(f, hlen, args, m,
+                                           oif, offset, tablearg, ip);
+                               match = 1;
+                               break;
+
+                       case O_PROB:
+                               match = (random()<((ipfw_insn_u32 *)cmd)->d[0]);
+                               break;
+
+                       case O_VERREVPATH:
+                               /* Outgoing packets automatically pass/match */
+                               match = ((oif != NULL) ||
+                                   (m->m_pkthdr.rcvif == NULL) ||
+                                   (
+#ifdef INET6
+                                   is_ipv6 ?
+                                       verify_path6(&(args->f_id.src_ip6),
+                                           m->m_pkthdr.rcvif) :
+#endif
+                                   verify_path(src_ip, m->m_pkthdr.rcvif,
+                                       args->f_id.fib)));
+                               break;
+
+                       case O_VERSRCREACH:
+                               /* Outgoing packets automatically pass/match */
+                               match = (hlen > 0 && ((oif != NULL) ||
+#ifdef INET6
+                                   is_ipv6 ?
+                                       verify_path6(&(args->f_id.src_ip6),
+                                           NULL) :
+#endif
+                                   verify_path(src_ip, NULL, args->f_id.fib)));
+                               break;
+
+                       case O_ANTISPOOF:
+                               /* Outgoing packets automatically pass/match */
+                               if (oif == NULL && hlen > 0 &&
+                                   (  (is_ipv4 && in_localaddr(src_ip))
+#ifdef INET6
+                                   || (is_ipv6 &&
+                                       in6_localaddr(&(args->f_id.src_ip6)))
+#endif
+                                   ))
+                                       match =
+#ifdef INET6
+                                           is_ipv6 ? verify_path6(
+                                               &(args->f_id.src_ip6),
+                                               m->m_pkthdr.rcvif) :
+#endif
+                                           verify_path(src_ip,
+                                               m->m_pkthdr.rcvif,
+                                               args->f_id.fib);
+                               else
+                                       match = 1;
+                               break;
+
+                       case O_IPSEC:
+#ifdef IPSEC
+                               match = (m_tag_find(m,
+                                   PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL);
+#endif
+                               /* otherwise no match */
+                               break;
+
+#ifdef INET6
+                       case O_IP6_SRC:
+                               match = is_ipv6 &&
+                                   IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6,
+                                   &((ipfw_insn_ip6 *)cmd)->addr6);
+                               break;
+
+                       case O_IP6_DST:
+                               match = is_ipv6 &&
+                               IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6,
+                                   &((ipfw_insn_ip6 *)cmd)->addr6);
+                               break;
+                       case O_IP6_SRC_MASK:
+                       case O_IP6_DST_MASK:
+                               if (is_ipv6) {
+                                       int i = cmdlen - 1;
+                                       struct in6_addr p;
+                                       struct in6_addr *d =
+                                           &((ipfw_insn_ip6 *)cmd)->addr6;
+
+                                       for (; !match && i > 0; d += 2,
+                                           i -= F_INSN_SIZE(struct in6_addr)
+                                           * 2) {
+                                               p = (cmd->opcode ==
+                                                   O_IP6_SRC_MASK) ?
+                                                   args->f_id.src_ip6:
+                                                   args->f_id.dst_ip6;
+                                               APPLY_MASK(&p, &d[1]);
+                                               match =
+                                                   IN6_ARE_ADDR_EQUAL(&d[0],
+                                                   &p);
+                                       }
+                               }
+                               break;
+
+                       case O_FLOW6ID:
+                               match = is_ipv6 &&
+                                   flow6id_match(args->f_id.flow_id6,
+                                   (ipfw_insn_u32 *) cmd);
+                               break;
+
+                       case O_EXT_HDR:
+                               match = is_ipv6 &&
+                                   (ext_hd & ((ipfw_insn *) cmd)->arg1);
+                               break;
+
+                       case O_IP6:
+                               match = is_ipv6;
+                               break;
+#endif
+
+                       case O_IP4:
+                               match = is_ipv4;
+                               break;
+
+                       case O_TAG: {
+                               struct m_tag *mtag;
+                               uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                   tablearg : cmd->arg1;
+
+                               /* Packet is already tagged with this tag? */
+                               mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL);
+
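+                               /* With the F_NOT flag set this is the `untag'
+                                * action: remove the mtag from the mbuf and
+                                * reset `match' to zero (`match' will be
+                                * inverted later).
+                                * Otherwise, if the packet does not carry
+                                * this tag yet, allocate a new mtag and push
+                                * it into the mbuf.
+                                * NOTE(review): when the tag is already
+                                * present and F_NOT is clear, `match' is left
+                                * at 0 -- confirm this is intended.
+                                */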
+                               if (cmd->len & F_NOT) { /* `untag' action */
+                                       if (mtag != NULL)
+                                               m_tag_delete(m, mtag);
+                                       match = 0;
+                               } else if (mtag == NULL) {
+                                       if ((mtag = m_tag_alloc(MTAG_IPFW,
+                                           tag, 0, M_NOWAIT)) != NULL)
+                                               m_tag_prepend(m, mtag);
+                                       match = 1;
+                               }
+                               break;
+                       }
+
+                       case O_FIB: /* try match the specified fib */
+                               if (args->f_id.fib == cmd->arg1)
+                                       match = 1;
+                               break;
+
+                       case O_TAGGED: {
+                               struct m_tag *mtag;
+                               uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                   tablearg : cmd->arg1;
+
+                               if (cmdlen == 1) {
+                                       match = m_tag_locate(m, MTAG_IPFW,
+                                           tag, NULL) != NULL;
+                                       break;
+                               }
+
+                               /* we have ranges */
+                               for (mtag = m_tag_first(m);
+                                   mtag != NULL && !match;
+                                   mtag = m_tag_next(m, mtag)) {
+                                       uint16_t *p;
+                                       int i;
+
+                                       if (mtag->m_tag_cookie != MTAG_IPFW)
+                                               continue;
+
+                                       p = ((ipfw_insn_u16 *)cmd)->ports;
+                                       i = cmdlen - 1;
+                                       for(; !match && i > 0; i--, p += 2)
+                                               match =
+                                                   mtag->m_tag_id >= p[0] &&
+                                                   mtag->m_tag_id <= p[1];
+                               }
+                               break;
+                       }
+                               
+                       /*
+                        * The second set of opcodes represents 'actions',
+                        * i.e. the terminal part of a rule once the packet
+                        * matches all previous patterns.
+                        * Typically there is only one action for each rule,
+                        * and the opcode is stored at the end of the rule
+                        * (but there are exceptions -- see below).
+                        *
+                        * In general, here we set retval and terminate the
+                        * outer loop (would be a 'break 3' in some language,
+                        * but we need to set l=0, done=1)
+                        *
+                        * Exceptions:
+                        * O_COUNT and O_SKIPTO actions:
+                        *   instead of terminating, we jump to the next rule
+                        *   (setting l=0), or to the SKIPTO target (setting
+                        *   f/f_len, cmd and l as needed), respectively.
+                        *
+                        * O_TAG, O_LOG and O_ALTQ action parameters:
+                        *   perform some action and set match = 1;
+                        *
+                        * O_LIMIT and O_KEEP_STATE: these opcodes are
+                        *   not real 'actions', and are stored right
+                        *   before the 'action' part of the rule.
+                        *   These opcodes try to install an entry in the
+                        *   state tables; if successful, we continue with
+                        *   the next opcode (match=1; break;), otherwise
+                        *   the packet must be dropped (set retval,
+                        *   break loops with l=0, done=1)
+                        *
+                        * O_PROBE_STATE and O_CHECK_STATE: these opcodes
+                        *   cause a lookup of the state table, and a jump
+                        *   to the 'action' part of the parent rule
+                        *   if an entry is found, or
+                        *   (CHECK_STATE only) a jump to the next rule if
+                        *   the entry is not found.
+                        *   The result of the lookup is cached so that
+                        *   further instances of these opcodes become NOPs.
+                        *   The jump to the next rule is done by setting
+                        *   l=0, cmdlen=0.
+                        */
+                       case O_LIMIT:
+                       case O_KEEP_STATE:
+                               if (ipfw_install_state(f,
+                                   (ipfw_insn_limit *)cmd, args, tablearg)) {
+                                       /* error or limit violation */
+                                       retval = IP_FW_DENY;
+                                       l = 0;  /* exit inner loop */
+                                       done = 1; /* exit outer loop */
+                               }
+                               match = 1;
+                               break;
+
+                       case O_PROBE_STATE:
+                       case O_CHECK_STATE:
+                               /*
+                                * dynamic rules are checked at the first
+                                * keep-state or check-state occurrence,
+                                * with the result being stored in dyn_dir.
+                                * The compiler introduces a PROBE_STATE
+                                * instruction for us when we have a
+                                * KEEP_STATE (because PROBE_STATE needs
+                                * to be run first).
+                                */
+                               if (dyn_dir == MATCH_UNKNOWN &&
+                                   (q = ipfw_lookup_dyn_rule(&args->f_id,
+                                    &dyn_dir, proto == IPPROTO_TCP ?
+                                       TCP(ulp) : NULL))
+                                       != NULL) {
+                                       /*
+                                        * Found dynamic entry, update stats
+                                        * and jump to the 'action' part of
+                                        * the parent rule by setting
+                                        * f, cmd, l and clearing cmdlen.
+                                        */
+                                       q->pcnt++;
+                                       q->bcnt += pktlen;
+                                       /* XXX we would like to have f_pos
+                                        * readily accessible in the dynamic
+                                        * rule, instead of having to
+                                        * lookup q->rule.
+                                        */
+                                       f = q->rule;
+                                       f_pos = ipfw_find_rule(chain,
+                                               f->rulenum, f->id);
+                                       cmd = ACTION_PTR(f);
+                                       l = f->cmd_len - f->act_ofs;
+                                       ipfw_dyn_unlock();
+                                       cmdlen = 0;
+                                       match = 1;
+                                       break;
+                               }
+                               /*
+                                * Dynamic entry not found. If CHECK_STATE,
+                                * skip to next rule, if PROBE_STATE just
+                                * ignore and continue with next opcode.
+                                */
+                               if (cmd->opcode == O_CHECK_STATE)
+                                       l = 0;  /* exit inner loop */
+                               match = 1;
+                               break;
+
+                       case O_ACCEPT:
+                               retval = 0;     /* accept */
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_PIPE:
+                       case O_QUEUE:
+                               set_match(args, f_pos, chain);
+                               args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                       tablearg : cmd->arg1;
+                               if (cmd->opcode == O_PIPE)
+                                       args->rule.info |= IPFW_IS_PIPE;
+                               if (V_fw_one_pass)
+                                       args->rule.info |= IPFW_ONEPASS;
+                               retval = IP_FW_DUMMYNET;
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_DIVERT:
+                       case O_TEE:
+                               if (args->eh) /* not on layer 2 */
+                                   break;
+                               /* otherwise this is terminal */
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               retval = (cmd->opcode == O_DIVERT) ?
+                                       IP_FW_DIVERT : IP_FW_TEE;
+                               set_match(args, f_pos, chain);
+                               args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                   tablearg : cmd->arg1;
+                               break;
+
+                       case O_COUNT:
+                               f->pcnt++;      /* update stats */
+                               f->bcnt += pktlen;
+                               f->timestamp = time_uptime;
+                               l = 0;          /* exit inner loop */
+                               break;
+
+                       case O_SKIPTO:
+                           f->pcnt++;  /* update stats */
+                           f->bcnt += pktlen;
+                           f->timestamp = time_uptime;
+                           /* If possible use cached f_pos (in f->next_rule),
+                            * whose version is written in f->x_next
+                            * (horrible hacks to avoid changing the ABI).
+                            */
+                           if (cmd->arg1 != IP_FW_TABLEARG &&
+                                   (uintptr_t)f->x_next == chain->id) {
+                               f_pos = (uintptr_t)f->next_rule;
+                           } else {
+                               int i = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                       tablearg : cmd->arg1;
+                               /* make sure we do not jump backward */
+                               if (i <= f->rulenum)
+                                   i = f->rulenum + 1;
+                               f_pos = ipfw_find_rule(chain, i, 0);
+                               /* update the cache */
+                               if (cmd->arg1 != IP_FW_TABLEARG) {
+                                   f->next_rule =
+                                       (void *)(uintptr_t)f_pos;
+                                   f->x_next =
+                                       (void *)(uintptr_t)chain->id;
+                               }
+                           }
+                           /*
+                            * Skip disabled rules, and re-enter
+                            * the inner loop with the correct
+                            * f_pos, f, l and cmd.
+                            * Also clear cmdlen and skip_or
+                            */
+                           for (; f_pos < chain->n_rules - 1 &&
+                                   (V_set_disable &
+                                    (1 << chain->map[f_pos]->set));
+                                   f_pos++)
+                               ;
+                           /* Re-enter the inner loop at the skipto rule. */
+                           f = chain->map[f_pos];
+                           l = f->cmd_len;
+                           cmd = f->cmd;
+                           match = 1;
+                           cmdlen = 0;
+                           skip_or = 0;
+                           continue;
+                           break;      /* not reached */
+
+                       case O_REJECT:
+                               /*
+                                * Drop the packet and send a reject notice
+                                * if the packet is not ICMP (or is an ICMP
+                                * query), and it is not multicast/broadcast.
+                                */
+                               if (hlen > 0 && is_ipv4 && offset == 0 &&
+                                   (proto != IPPROTO_ICMP ||
+                                    is_icmp_query(ICMP(ulp))) &&
+                                   !(m->m_flags & (M_BCAST|M_MCAST)) &&
+                                   !IN_MULTICAST(ntohl(dst_ip.s_addr))) {
+                                       send_reject(args, cmd->arg1, iplen, ip);
+                                       m = args->m;
+                               }
+                               /* FALLTHROUGH */
+#ifdef INET6
+                       case O_UNREACH6:
+                               if (hlen > 0 && is_ipv6 &&
+                                   ((offset & IP6F_OFF_MASK) == 0) &&
+                                   (proto != IPPROTO_ICMPV6 ||
+                                    (is_icmp6_query(icmp6_type) == 1)) &&
+                                   !(m->m_flags & (M_BCAST|M_MCAST)) &&
+                                   !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) {
+                                       send_reject6(
+                                           args, cmd->arg1, hlen,
+                                           (struct ip6_hdr *)ip);
+                                       m = args->m;
+                               }
+                               /* FALLTHROUGH */
+#endif
+                       case O_DENY:
+                               retval = IP_FW_DENY;
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_FORWARD_IP:
+                               if (args->eh)   /* not valid on layer2 pkts */
+                                       break;
+                               if (!q || dyn_dir == MATCH_FORWARD) {
+                                   struct sockaddr_in *sa;
+                                   sa = &(((ipfw_insn_sa *)cmd)->sa);
+                                   if (sa->sin_addr.s_addr == INADDR_ANY) {
+                                       bcopy(sa, &args->hopstore,
+                                                       sizeof(*sa));
+                                       args->hopstore.sin_addr.s_addr =
+                                                   htonl(tablearg);
+                                       args->next_hop = &args->hopstore;
+                                   } else {
+                                       args->next_hop = sa;
+                                   }
+                               }
+                               retval = IP_FW_PASS;
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_NETGRAPH:
+                       case O_NGTEE:
+                               set_match(args, f_pos, chain);
+                               args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                       tablearg : cmd->arg1;
+                               if (V_fw_one_pass)
+                                       args->rule.info |= IPFW_ONEPASS;
+                               retval = (cmd->opcode == O_NETGRAPH) ?
+                                   IP_FW_NETGRAPH : IP_FW_NGTEE;
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_SETFIB:
+                               f->pcnt++;      /* update stats */
+                               f->bcnt += pktlen;
+                               f->timestamp = time_uptime;
+                               M_SETFIB(m, cmd->arg1);
+                               args->f_id.fib = cmd->arg1;
+                               l = 0;          /* exit inner loop */
+                               break;
+
+                       case O_NAT:
+                               if (!IPFW_NAT_LOADED) {
+                                   retval = IP_FW_DENY;
+                               } else {
+                                   struct cfg_nat *t;
+                                   int nat_id;
+
+                                   set_match(args, f_pos, chain);
+                                   t = ((ipfw_insn_nat *)cmd)->nat;
+                                   if (t == NULL) {
+                                       nat_id = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                               tablearg : cmd->arg1;
+                                       t = (*lookup_nat_ptr)(&chain->nat, nat_id);
+
+                                       if (t == NULL) {
+                                           retval = IP_FW_DENY;
+                                           l = 0;      /* exit inner loop */
+                                           done = 1;   /* exit outer loop */
+                                           break;
+                                       }
+                                       if (cmd->arg1 != IP_FW_TABLEARG)
+                                           ((ipfw_insn_nat *)cmd)->nat = t;
+                                   }
+                                   retval = ipfw_nat_ptr(args, t, m);
+                               }
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_REASS: {
+                               int ip_off;
+
+                               f->pcnt++;
+                               f->bcnt += pktlen;
+                               l = 0;  /* in any case exit inner loop */
+                               ip_off = ntohs(ip->ip_off);
+
+                               /* if not fragmented, go to next rule */
+                               if ((ip_off & (IP_MF | IP_OFFMASK)) == 0)
+                                   break;
+                               /* 
+                                * ip_reass() expects len & off in host
+                                * byte order.
+                                */
+                               SET_HOST_IPLEN(ip);
+
+                               args->m = m = ip_reass(m);
+
+                               /*
+                                * do IP header checksum fixup.
+                                */
+                               if (m == NULL) { /* fragment got swallowed */
+                                   retval = IP_FW_DENY;
+                               } else { /* good, packet complete */
+                                   int hlen;
+
+                                   ip = mtod(m, struct ip *);
+                                   hlen = ip->ip_hl << 2;
+                                   SET_NET_IPLEN(ip);
+                                   ip->ip_sum = 0;
+                                   if (hlen == sizeof(struct ip))
+                                       ip->ip_sum = in_cksum_hdr(ip);
+                                   else
+                                       ip->ip_sum = in_cksum(m, hlen);
+                                   retval = IP_FW_REASS;
+                                   set_match(args, f_pos, chain);
+                               }
+                               done = 1;       /* exit outer loop */
+                               break;
+                       }
+
+                       default:
+                               panic("-- unknown opcode %d\n", cmd->opcode);
+                       } /* end of switch() on opcodes */
+                       /*
+                        * if we get here with l=0, then match is irrelevant.
+                        */
+
+                       if (cmd->len & F_NOT)
+                               match = !match;
+
+                       if (match) {
+                               if (cmd->len & F_OR)
+                                       skip_or = 1;
+                       } else {
+                               if (!(cmd->len & F_OR)) /* not an OR block, */
+                                       break;          /* try next rule    */
+                       }
+
+               }       /* end of inner loop, scan opcodes */
+
+               if (done)
+                       break;
+
+/* next_rule:; */      /* try next rule                */
+
+       }               /* end of outer for, scan rules */
+
+       if (done) {
+               struct ip_fw *rule = chain->map[f_pos];
+               /* Update statistics */
+               rule->pcnt++;
+               rule->bcnt += pktlen;
+               rule->timestamp = time_uptime;
+       } else {
+               retval = IP_FW_DENY;
+               printf("ipfw: ouch!, skip past end of rules, denying packet\n");
+       }
+       IPFW_RUNLOCK(chain);
+#ifdef __FreeBSD__
+       if (ucred_cache != NULL)
+               crfree(ucred_cache);
+#endif
+       return (retval);
+
+pullup_failed:
+       if (V_fw_verbose)
+               printf("ipfw: pullup failed\n");
+       return (IP_FW_DENY);
+}
+
+/*
+ * Module and VNET glue
+ */
+
+/*
+ * Stuff that must be initialised only on boot or module load
+ * (registered through SYSINIT() at IPFW_MODULE_ORDER near the end of
+ * this file).
+ *
+ * Calls ipfw_dyn_attach(), prints a one-line banner describing the
+ * compile-time options (INET6, IPFIREWALL_FORWARD, IPDIVERT,
+ * IPFIREWALL_NAT), the default policy and the logging settings, then
+ * calls ipfw_log_bpf(1).
+ *
+ * Returns 0 unconditionally: 'error' is never assigned after its
+ * initialisation.
+ */
+static int
+ipfw_init(void)
+{
+       int error = 0;  /* never modified below */
+
+       ipfw_dyn_attach();
+       /*
+        * Only print out this stuff the first time around,
+        * when called from the sysinit code.
+        * The format string is assembled at compile time from adjacent
+        * literals; its three %s are filled, in order, by the divert
+        * string, the nat string and the default policy string.
+        */
+       printf("ipfw2 "
+#ifdef INET6
+               "(+ipv6) "
+#endif
+               "initialized, divert %s, nat %s, "
+               "rule-based forwarding "
+#ifdef IPFIREWALL_FORWARD
+               "enabled, "
+#else
+               "disabled, "
+#endif
+               "default to %s, logging ",      /* line is completed below */
+#ifdef IPDIVERT
+               "enabled",      /* argument for "divert %s" */
+#else
+               "loadable",     /* argument for "divert %s" */
+#endif
+#ifdef IPFIREWALL_NAT
+               "enabled",      /* argument for "nat %s" */
+#else
+               "loadable",     /* argument for "nat %s" */
+#endif
+               default_to_accept ? "accept" : "deny");
+
+       /*
+        * Note: V_xxx variables can be accessed here but the vnet specific
+        * initializer may not have been called yet for the VIMAGE case.
+        * Tuneables will have been processed. We will print out values for
+        * the default vnet.
+        * XXX This should all be rationalized AFTER 8.0
+        *
+        * The printf below finishes the banner line started above, which
+        * ends in "logging " without a newline.
+        */
+       if (V_fw_verbose == 0)
+               printf("disabled\n");
+       else if (V_verbose_limit == 0)
+               printf("unlimited\n");
+       else
+               printf("limited to %d packets/entry by default\n",
+                   V_verbose_limit);
+
+       ipfw_log_bpf(1); /* init */
+       return (error);
+}
+
+/*
+ * Called for the removal of the last instance only on module unload
+ * (registered through SYSUNINIT() near the end of this file).
+ *
+ * Mirrors ipfw_init() in reverse order: ipfw_log_bpf(0) first, then
+ * ipfw_dyn_detach(), and finally a console message.
+ */
+static void
+ipfw_destroy(void)
+{
+
+       ipfw_log_bpf(0); /* uninit */
+       ipfw_dyn_detach();      /* counterpart of ipfw_dyn_attach() in ipfw_init() */
+       printf("IP firewall unloaded\n");
+}
+
+/*
+ * Stuff that must be initialized for every instance
+ * (including the first of course).
+ *
+ * Sets the compile-time defaults, builds the initial rule map holding only
+ * the default rule, initializes tables, locks and dynamic-rule state, then
+ * publishes the ctl/chk entry points and attaches the hooks.
+ *
+ * Returns ENOSPC if the map or the default rule cannot be allocated, the
+ * error from ipfw_init_tables() if table setup fails (both leave the chain
+ * empty and the instance not ready), otherwise the result of
+ * ipfw_attach_hooks(1).
+ */
+static int
+vnet_ipfw_init(const void *unused)
+{
+       int error;
+       struct ip_fw *rule = NULL;
+       struct ip_fw_chain *chain;
+
+       chain = &V_layer3_chain;
+
+       /* First set up some values that are compile time options */
+       V_autoinc_step = 100;   /* bounded to 1..1000 in add_rule() */
+       V_fw_deny_unknown_exthdrs = 1;
+#ifdef IPFIREWALL_VERBOSE
+       V_fw_verbose = 1;
+#endif
+#ifdef IPFIREWALL_VERBOSE_LIMIT
+       V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
+#endif
+#ifdef IPFIREWALL_NAT
+       LIST_INIT(&chain->nat);
+#endif
+
+       /* insert the default rule and create the initial map */
+       chain->n_rules = 1;
+       chain->static_len = sizeof(struct ip_fw);
+       chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_NOWAIT | M_ZERO);
+       if (chain->map)
+               rule = malloc(chain->static_len, M_IPFW, M_NOWAIT | M_ZERO);
+       if (rule == NULL) {
+               printf("ipfw2: ENOSPC initializing default rule "
+                       "(support disabled)\n");
+               error = ENOSPC;
+               goto fail;
+       }
+       error = ipfw_init_tables(chain);
+       if (error) {
+               /*
+                * Report the failure to the caller instead of panicking,
+                * exactly as is done for the allocation failure above.
+                * NOTE(review): assumes ipfw_init_tables() releases
+                * whatever it had set up before failing -- confirm.
+                */
+               printf("ipfw2: setting up tables failed "
+                       "(support disabled)\n");
+               goto fail;
+       }
+
+       /* fill and insert the default rule */
+       rule->act_ofs = 0;
+       rule->rulenum = IPFW_DEFAULT_RULE;
+       rule->cmd_len = 1;
+       rule->set = RESVD_SET;
+       rule->cmd[0].len = 1;
+       rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY;
+       chain->rules = chain->default_rule = chain->map[0] = rule;
+       chain->id = rule->id = 1;
+
+       IPFW_LOCK_INIT(chain);
+       ipfw_dyn_init();
+
+       /* The chain is fully set up: mark this instance as usable. */
+       V_ipfw_vnet_ready = 1;          /* Open for business */
+
+       /*
+        * Hook the sockopt handler, and the layer2 (V_ip_fw_chk_ptr)
+        * and pfil hooks for ipv4 and ipv6. Even if the latter two fail
+        * we still keep the module alive because the sockopt and
+        * layer2 paths are still useful.
+        * ipfw[6]_hook return 0 on success, ENOENT on failure,
+        * so we can ignore the exact return value and just set a flag.
+        *
+        * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so
+        * changes in the underlying (per-vnet) variables trigger
+        * immediate hook()/unhook() calls.
+        * In layer2 we have the same behaviour, except that V_ether_ipfw
+        * is checked on each packet because there are no pfil hooks.
+        */
+       V_ip_fw_ctl_ptr = ipfw_ctl;
+       V_ip_fw_chk_ptr = ipfw_chk;
+       error = ipfw_attach_hooks(1);
+       return (error);
+
+fail:
+       /*
+        * Undo the partial setup and leave the chain empty, so that no
+        * stale map pointer or rule count survives the failure.
+        */
+       if (rule != NULL)
+               free(rule, M_IPFW);
+       if (chain->map != NULL)
+               free(chain->map, M_IPFW);
+       chain->map = NULL;
+       chain->n_rules = 0;
+       chain->static_len = 0;
+       return (error);
+}
+
+/*
+ * Called for the removal of each instance
+ * (registered through VNET_SYSUNINIT() near the end of this file).
+ *
+ * Marks the instance as not ready, detaches the hooks and clears the
+ * chk/ctl pointers, cycles both chain write locks, then tears down the
+ * dynamic-rule state, the tables, the rule map and the rules themselves.
+ * Always returns 0; the result of ipfw_attach_hooks(0) is discarded.
+ *
+ * NOTE(review): chain->map is tested for NULL only after the loop has
+ * already read chain->map[i]; if map can be NULL while n_rules > 0 the
+ * loop faults before the test is reached -- confirm against the failure
+ * paths of vnet_ipfw_init().
+ */
+static int
+vnet_ipfw_uninit(const void *unused)
+{
+       struct ip_fw *reap, *rule;
+       struct ip_fw_chain *chain = &V_layer3_chain;
+       int i;
+
+       V_ipfw_vnet_ready = 0; /* tell new callers to go away */
+       /*
+        * disconnect from ipv4, ipv6, layer2 and sockopt.
+        * Then grab, release and grab again the WLOCK so we make
+        * sure the update is propagated and nobody will be in.
+        * The same lock/unlock/lock sequence is applied first to the
+        * UH lock and then to the main chain lock; both stay held
+        * until the map has been freed below.
+        */
+       (void)ipfw_attach_hooks(0 /* detach */);
+       V_ip_fw_chk_ptr = NULL;
+       V_ip_fw_ctl_ptr = NULL;
+       IPFW_UH_WLOCK(chain);
+       IPFW_UH_WUNLOCK(chain);
+       IPFW_UH_WLOCK(chain);
+
+       IPFW_WLOCK(chain);
+       IPFW_WUNLOCK(chain);
+       IPFW_WLOCK(chain);
+
+       ipfw_dyn_uninit(0);     /* run the callout_drain */
+       ipfw_destroy_tables(chain);
+       reap = NULL;    /* head of the list of rules to free, linked via x_next */
+       for (i = 0; i < chain->n_rules; i++) {
+               rule = chain->map[i];
+               rule->x_next = reap;    /* push the rule on the front of the list */
+               reap = rule;
+       }
+       if (chain->map)
+               free(chain->map, M_IPFW);
+       IPFW_WUNLOCK(chain);
+       IPFW_UH_WUNLOCK(chain);
+       if (reap != NULL)
+               ipfw_reap_rules(reap);  /* called only after both locks are dropped */
+       IPFW_LOCK_DESTROY(chain);
+       ipfw_dyn_uninit(1);     /* free the remaining parts */
+       return 0;
+}
+
+/*
+ * Module event handler.
+ * All the real work is done by the (VNET_)SYS(UN)INIT handlers registered
+ * at the end of this file, which express the flow of control during module
+ * and vnet operations better than this callback can. This is therefore
+ * only a skeleton: the known events are accepted without doing anything,
+ * and any other event type is rejected with EOPNOTSUPP.
+ * There is no SYSINIT equivalent of the MOD_SHUTDOWN event, but nothing
+ * has to be done in that case anyhow.
+ */
+static int
+ipfw_modevent(module_t mod, int type, void *unused)
+{
+       switch (type) {
+       case MOD_LOAD:          /* module load, or boot if compiled in */
+       case MOD_QUIESCE:       /* before unload; a veto would go here */
+       case MOD_UNLOAD:        /* during unload */
+       case MOD_SHUTDOWN:      /* during system shutdown */
+               return (0);
+       default:
+               return (EOPNOTSUPP);
+       }
+}
+
+static moduledata_t ipfwmod = {
+       "ipfw",                 /* presumably the module name -- see module(9) */
+       ipfw_modevent,          /* the skeleton event handler defined above */
+       0                       /* presumably: no private data -- see module(9) */
+};
+
+/*
+ * Define startup order. All handlers below share one subsystem and are
+ * told apart only by three consecutive order values.
+ */
+#define        IPFW_SI_SUB_FIREWALL    SI_SUB_PROTO_IFATTACHDOMAIN
+#define        IPFW_MODEVENT_ORDER     (SI_ORDER_ANY - 255) /* On boot slot in here. */
+#define        IPFW_MODULE_ORDER       (IPFW_MODEVENT_ORDER + 1) /* A little later. */
+#define        IPFW_VNET_ORDER         (IPFW_MODEVENT_ORDER + 2) /* Later still. */
+
+DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER);
+MODULE_VERSION(ipfw, 2);
+/* should declare some dependencies here */
+
+/*
+ * Starting up. Done in order after the module event handler registered
+ * through ipfwmod (ipfw_modevent()) has been called.
+ * VNET_SYSINIT is also called for each existing vnet and each new vnet.
+ */
+SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
+           ipfw_init, NULL);
+VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
+           vnet_ipfw_init, NULL);
+/*
+ * Closing up shop. These are done in REVERSE ORDER, but still
+ * after the module event handler has been called. Not called on reboot.
+ * VNET_SYSUNINIT is also called for each exiting vnet as it exits,
+ * or when the module is unloaded.
+ */
+SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
+           ipfw_destroy, NULL);
+VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
+           vnet_ipfw_uninit, NULL);
+/* end of file */
+/* end of file */
diff --git a/sys/netinet/ipfw/ip_fw_dynamic.c b/sys/netinet/ipfw/ip_fw_dynamic.c
new file mode 100644 (file)
index 0000000..2bdd299
--- /dev/null
@@ -0,0 +1,1241 @@
+/*-
+ * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_dynamic.c 200601 2009-12-16 10:48:40Z luigi $");
+
+#define        DEB(x)
+#define        DDB(x) x
+
+/*
+ * Dynamic rule support for ipfw
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <net/ethernet.h> /* for ETHERTYPE_IP */
+#include <net/if.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>    /* ip_defttl */
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+
+#include <netinet/ip6.h>       /* IN6_ARE_ADDR_EQUAL */
+#ifdef INET6
+#include <netinet6/in6_var.h>
+#include <netinet6/ip6_var.h>
+#endif
+
+#include <machine/in_cksum.h>  /* XXX for in_cksum */
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+/*
+ * Description of dynamic rules.
+ *
+ * Dynamic rules are stored in lists accessed through a hash table
+ * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
+ * be modified through the sysctl variable dyn_buckets which is
+ * updated when the table becomes empty.
+ *
+ * XXX currently there is only one list, ipfw_dyn.
+ *
+ * When a packet is received, its address fields are first masked
+ * with the mask defined for the rule, then hashed, then matched
+ * against the entries in the corresponding list.
+ * Dynamic rules can be used for different purposes:
+ *  + stateful rules;
+ *  + enforcing limits on the number of sessions;
+ *  + in-kernel NAT (not implemented yet)
+ *
+ * The lifetime of dynamic rules is regulated by dyn_*_lifetime,
+ * measured in seconds and depending on the flags.
+ *
+ * The total number of dynamic rules is stored in dyn_count.
+ * The max number of dynamic rules is dyn_max. When we reach
+ * the maximum number of rules we do not create any more. This is
+ * done to avoid consuming too much memory, but also too much
+ * time when searching on each packet (ideally, we should try instead
+ * to put a limit on the length of the list on each bucket...).
+ *
+ * Each dynamic rule holds a pointer to the parent ipfw rule so
+ * we know what action to perform. Dynamic rules are removed when
+ * the parent rule is deleted. XXX we should make them survive.
+ *
+ * There are some limitations with dynamic rules -- we do not
+ * obey the 'randomized match', and we do not do multiple
+ * passes through the firewall. XXX check the latter!!!
+ */
+
+/*
+ * Static variables followed by global ones
+ */
+/* hash table of dynamic rules: one singly linked list head per bucket */
+static VNET_DEFINE(ipfw_dyn_rule **, ipfw_dyn_v);
+/* bucket count requested via sysctl; applied by realloc_dynamic_table() */
+static VNET_DEFINE(u_int32_t, dyn_buckets);
+/* bucket count of the table currently in use (exported read-only) */
+static VNET_DEFINE(u_int32_t, curr_dyn_buckets);
+/* callout handle; armed and serviced outside this part of the file */
+static VNET_DEFINE(struct callout, ipfw_timeout);
+#define        V_ipfw_dyn_v                    VNET(ipfw_dyn_v)
+#define        V_dyn_buckets                   VNET(dyn_buckets)
+#define        V_curr_dyn_buckets              VNET(curr_dyn_buckets)
+#define V_ipfw_timeout                  VNET(ipfw_timeout)
+
+/* UMA zone every ipfw_dyn_rule is allocated from and freed to */
+static uma_zone_t ipfw_dyn_rule_zone;
+/*
+ * A single lock, not per-vnet, guards the dynamic rule tables.
+ * Non-FreeBSD builds declare it with DEFINE_SPINLOCK(); the
+ * IPFW_DYN_* macros below use the mtx_*() names in both cases.
+ */
+#ifndef __FreeBSD__
+DEFINE_SPINLOCK(ipfw_dyn_mtx);
+#else
+static struct mtx ipfw_dyn_mtx;                /* mutex guarding dynamic rules */
+#endif
+
+#define        IPFW_DYN_LOCK_INIT() \
+       mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF)
+#define        IPFW_DYN_LOCK_DESTROY() mtx_destroy(&ipfw_dyn_mtx)
+#define        IPFW_DYN_LOCK()         mtx_lock(&ipfw_dyn_mtx)
+#define        IPFW_DYN_UNLOCK()       mtx_unlock(&ipfw_dyn_mtx)
+#define        IPFW_DYN_LOCK_ASSERT()  mtx_assert(&ipfw_dyn_mtx, MA_OWNED)
+
+/*
+ * Release the dynamic-rules lock.
+ * Non-static wrapper around IPFW_DYN_UNLOCK(); presumably meant for
+ * callers of ipfw_lookup_dyn_rule(), which returns with the lock held
+ * when a rule is found.
+ */
+void
+ipfw_dyn_unlock(void)
+{
+       IPFW_DYN_UNLOCK();
+}
+
+/*
+ * Timeouts for various events in handling dynamic rules.
+ * lookup_dyn_rule_locked() adds one of these to time_uptime to refresh
+ * the expiry of a matched rule:
+ *   syn   - TCP, only the initial SYN seen (also the lifetime given to
+ *           every freshly added rule);
+ *   ack   - TCP, SYN seen in both directions (established or half-closed);
+ *   fin   - TCP, FIN seen in both directions;
+ *   rst   - TCP, any other flag combination;
+ *   udp   - UDP flows;
+ *   short - all other protocols, and O_LIMIT_PARENT entries.
+ */
+static VNET_DEFINE(u_int32_t, dyn_ack_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_syn_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_fin_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_rst_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_udp_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_short_lifetime);
+
+#define        V_dyn_ack_lifetime              VNET(dyn_ack_lifetime)
+#define        V_dyn_syn_lifetime              VNET(dyn_syn_lifetime)
+#define        V_dyn_fin_lifetime              VNET(dyn_fin_lifetime)
+#define        V_dyn_rst_lifetime              VNET(dyn_rst_lifetime)
+#define        V_dyn_udp_lifetime              VNET(dyn_udp_lifetime)
+#define        V_dyn_short_lifetime            VNET(dyn_short_lifetime)
+
+/*
+ * Keepalives are sent if dyn_keepalive is set. They are sent every
+ * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
+ * seconds of lifetime of a rule.
+ * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
+ * than dyn_keepalive_period; lookup_dyn_rule_locked() clamps them to
+ * dyn_keepalive_period - 1 when they are not.
+ */
+
+static VNET_DEFINE(u_int32_t, dyn_keepalive_interval);
+static VNET_DEFINE(u_int32_t, dyn_keepalive_period);
+static VNET_DEFINE(u_int32_t, dyn_keepalive);
+
+#define        V_dyn_keepalive_interval        VNET(dyn_keepalive_interval)
+#define        V_dyn_keepalive_period          VNET(dyn_keepalive_period)
+#define        V_dyn_keepalive                 VNET(dyn_keepalive)
+
+static VNET_DEFINE(u_int32_t, dyn_count);      /* # of dynamic rules */
+static VNET_DEFINE(u_int32_t, dyn_max);                /* max # of dynamic rules */
+
+#define        V_dyn_count                     VNET(dyn_count)
+#define        V_dyn_max                       VNET(dyn_max)
+
+#ifdef SYSCTL_NODE
+
+/*
+ * Per-vnet sysctl variables attached to the _net_inet_ip_fw node.
+ * dyn_buckets, dyn_max, the six lifetimes and dyn_keepalive are
+ * read-write; curr_dyn_buckets and dyn_count are read-only.
+ * dyn_keepalive_interval and dyn_keepalive_period are not exported here.
+ */
+SYSBEGIN(f2)
+
+SYSCTL_DECL(_net_inet_ip_fw);
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
+    CTLFLAG_RW, &VNET_NAME(dyn_buckets), 0,
+    "Number of dyn. buckets");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
+    CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0,
+    "Current Number of dyn. buckets");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_count,
+    CTLFLAG_RD, &VNET_NAME(dyn_count), 0,
+    "Number of dyn. rules");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_max,
+    CTLFLAG_RW, &VNET_NAME(dyn_max), 0,
+    "Max number of dyn. rules");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0,
+    "Lifetime of dyn. rules for acks");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0,
+    "Lifetime of dyn. rules for syn");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0,
+    "Lifetime of dyn. rules for fin");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0,
+    "Lifetime of dyn. rules for rst");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0,
+    "Lifetime of dyn. rules for UDP");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0,
+    "Lifetime of dyn. rules for other situations");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive,
+    CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
+    "Enable keepalives for dyn. rules");
+
+SYSEND
+
+#endif /* SYSCTL_NODE */
+
+
+/*
+ * Fold an IPv6 flow id into a single word: XOR of the last two 32-bit
+ * words of each address with both ports. Exchanging source and
+ * destination yields the same value. The caller reduces the result
+ * to a bucket index.
+ */
+static __inline int
+hash_packet6(struct ipfw_flow_id *id)
+{
+       u_int32_t dst_bits, src_bits;
+
+       dst_bits = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^
+           (id->dst_ip6.__u6_addr.__u6_addr32[3]);
+       src_bits = (id->src_ip6.__u6_addr.__u6_addr32[2]) ^
+           (id->src_ip6.__u6_addr.__u6_addr32[3]);
+       return (dst_bits ^ src_bits ^ (id->dst_port) ^ (id->src_port));
+}
+
+/*
+ * Map a flow id to a bucket index of the dynamic rule hash table.
+ *
+ * IMPORTANT: the hash must give the same result when source and
+ * destination (ip,port) are exchanged, because rules are bidirectional
+ * and both directions must land in the same bucket.
+ */
+static __inline int
+hash_packet(struct ipfw_flow_id *id)
+{
+       u_int32_t bucket;
+
+#ifdef INET6
+       if (IS_IP6_FLOW_ID(id)) {
+               bucket = hash_packet6(id);
+       } else
+#endif /* INET6 */
+       {
+               bucket = (id->dst_ip) ^ (id->src_ip) ^
+                   (id->dst_port) ^ (id->src_port);
+       }
+       /* the mask only selects a valid slot if the bucket count is 2^k */
+       return (bucket & (V_curr_dyn_buckets - 1));
+}
+
+/*
+ * Debug helper: print the endpoints of the flow about to be unlinked
+ * together with the number of dynamic rules that will remain.
+ * Only referenced from UNLINK_DYN_RULE through DEB(), which this file
+ * defines as empty.
+ */
+static __inline void
+unlink_dyn_rule_print(struct ipfw_flow_id *id)
+{
+       struct in_addr da;
+#ifdef INET6
+       char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
+#else
+       char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+
+#ifdef INET6
+       if (IS_IP6_FLOW_ID(id)) {
+               ip6_sprintf(src, &id->src_ip6);
+               ip6_sprintf(dst, &id->dst_ip6);
+       } else
+#endif
+       {
+               /* byte-swap with htonl() before formatting the IPv4 address */
+               da.s_addr = htonl(id->src_ip);
+               inet_ntoa_r(da, src);
+               da.s_addr = htonl(id->dst_ip);
+               inet_ntoa_r(da, dst);
+       }
+       /* printed before the caller decrements dyn_count, hence the "- 1" */
+       printf("ipfw: unlink entry %s %d -> %s %d, %d left\n",
+           src, id->src_port, dst, id->dst_port, V_dyn_count - 1);
+}
+
+/**
+ * Unlink a dynamic rule from a bucket list and free it.
+ * prev is a pointer to the previous entry (NULL when q is first),
+ * q is a pointer to the rule to delete, head is the list head.
+ * On exit q points to the successor of the removed rule, and head is
+ * updated when the first entry was removed; both must be lvalues.
+ * An O_LIMIT rule drops the session count of its parent, and the
+ * global dyn_count is decremented.
+ *
+ * NOTE(review): this expands to a bare { } block rather than
+ * do { } while (0), so it must not be used as the unbraced body of an
+ * if that has an else branch.
+ */
+#define UNLINK_DYN_RULE(prev, head, q) {                               \
+       ipfw_dyn_rule *old_q = q;                                       \
+                                                                       \
+       /* remove a refcount to the parent */                           \
+       if (q->dyn_type == O_LIMIT)                                     \
+               q->parent->count--;                                     \
+       DEB(unlink_dyn_rule_print(&q->id);)                             \
+       if (prev != NULL)                                               \
+               prev->next = q = q->next;                               \
+       else                                                            \
+               head = q = q->next;                                     \
+       V_dyn_count--;                                                  \
+       uma_zfree(ipfw_dyn_rule_zone, old_q); }
+
+/*
+ * True when timestamp a is not later than b. The difference is taken
+ * first and then converted to int, so the comparison looks at the sign
+ * of (a - b) instead of comparing the raw values.
+ */
+#define TIME_LEQ(a,b)       ((int)((a)-(b)) <= 0)
+
+/**
+ * Remove dynamic rules pointing to "rule", or all of them if rule == NULL.
+ *
+ * If keep_me == NULL, rules are deleted even if not expired ("forced"
+ * mode), otherwise only expired rules are removed, and at most one such
+ * sweep is done per value of time_uptime.
+ *
+ * The second parameter also identifies a rule we absolutely do not
+ * want to remove (e.g. because we are holding a reference to it --
+ * this is the case with O_LIMIT_PARENT rules). The pointer is only
+ * used for comparison, so any non-null value will do.
+ *
+ * O_LIMIT_PARENT entries are skipped in the first pass and handled in
+ * a second one, after their children had a chance to go away. In that
+ * second pass a parent whose count is 0 is unlinked without looking at
+ * its expire time; a parent with count != 0 is always kept.
+ *
+ * Must be called with the dynamic-rules lock held.
+ */
+static void
+remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me)
+{
+       /* time of the last sweep; a single static shared by all callers */
+       static u_int32_t last_remove = 0;
+
+       /* NOTE(review): FORCE is not #undef'd at the end of this function */
+#define FORCE (keep_me == NULL)
+
+       ipfw_dyn_rule *prev, *q;
+       int i, pass = 0, max_pass = 0;
+
+       IPFW_DYN_LOCK_ASSERT();
+
+       if (V_ipfw_dyn_v == NULL || V_dyn_count == 0)
+               return;
+       /* do not expire more than once per second, it is useless */
+       if (!FORCE && last_remove == time_uptime)
+               return;
+       last_remove = time_uptime;
+
+       /*
+        * because O_LIMIT refer to parent rules, during the first pass only
+        * remove child and mark any pending LIMIT_PARENT, and remove
+        * them in a second pass.
+        */
+next_pass:
+       for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+               /* no increment clause: UNLINK_DYN_RULE advances q itself */
+               for (prev=NULL, q = V_ipfw_dyn_v[i] ; q ; ) {
+                       /*
+                        * Logic can become complex here, so we split tests.
+                        */
+                       if (q == keep_me)
+                               goto next;
+                       if (rule != NULL && rule != q->rule)
+                               goto next; /* not the one we are looking for */
+                       if (q->dyn_type == O_LIMIT_PARENT) {
+                               /*
+                                * handle parent in the second pass,
+                                * record we need one.
+                                */
+                               max_pass = 1;
+                               if (pass == 0)
+                                       goto next;
+                               if (FORCE && q->count != 0 ) {
+                                       /* XXX should not happen! */
+                                       printf("ipfw: OUCH! cannot remove rule,"
+                                            " count %d\n", q->count);
+                               }
+                       } else {
+                               /* non-parent entry: keep it unless forced or expired */
+                               if (!FORCE &&
+                                   !TIME_LEQ( q->expire, time_uptime ))
+                                       goto next;
+                       }
+             /* a parent still referenced by children is never unlinked */
+             if (q->dyn_type != O_LIMIT_PARENT || !q->count) {
+                     UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
+                     continue;
+             }
+next:
+                       prev=q;
+                       q=q->next;
+               }
+       }
+       /* run the second pass only if some O_LIMIT_PARENT was seen */
+       if (pass++ < max_pass)
+               goto next_pass;
+}
+
+/*
+ * Remove every dynamic rule that points to the given static rule
+ * (or all dynamic rules when rule is NULL), expired or not.
+ * Takes and releases the dynamic-rules lock around the sweep.
+ */
+void
+ipfw_remove_dyn_children(struct ip_fw *rule)
+{
+       IPFW_DYN_LOCK();
+       remove_dyn_rule(rule, NULL /* force removal */);
+       IPFW_DYN_UNLOCK();
+}
+
+/**
+ * lookup a dynamic rule, locked version
+ *
+ * pkt is the flow id of the packet. If match_direction is not NULL it
+ * receives MATCH_FORWARD or MATCH_REVERSE on a hit and MATCH_NONE on a
+ * miss. tcp, when not NULL, is the TCP header of the packet and is
+ * used to ignore out-of-sequence ACKs when refreshing the lifetime.
+ *
+ * Expired entries met while walking the bucket are unlinked and freed.
+ * A matching entry is moved to the front of its bucket and its expire
+ * time is refreshed according to protocol and TCP state.
+ *
+ * Returns the matching rule, or NULL if none. Must be called with the
+ * dynamic-rules lock held.
+ */
+static ipfw_dyn_rule *
+lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction,
+    struct tcphdr *tcp)
+{
+       /*
+        * stateful ipfw extensions.
+        * Lookup into dynamic session queue
+        */
+#define MATCH_REVERSE  0
+#define MATCH_FORWARD  1
+#define MATCH_NONE     2
+#define MATCH_UNKNOWN  3
+       int i, dir = MATCH_NONE;
+       ipfw_dyn_rule *prev, *q=NULL;
+
+       IPFW_DYN_LOCK_ASSERT();
+
+       if (V_ipfw_dyn_v == NULL)
+               goto done;      /* not found */
+       i = hash_packet( pkt );
+       /* no increment clause: UNLINK_DYN_RULE advances q itself */
+       for (prev=NULL, q = V_ipfw_dyn_v[i] ; q != NULL ; ) {
+               /* a parent with live children is neither matched nor expired */
+               if (q->dyn_type == O_LIMIT_PARENT && q->count)
+                       goto next;
+               if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */
+                       UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
+                       continue;
+               }
+               if (pkt->proto == q->id.proto &&
+                   q->dyn_type != O_LIMIT_PARENT) {
+                       if (IS_IP6_FLOW_ID(pkt)) {
+                           if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+                               &(q->id.src_ip6)) &&
+                           IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+                               &(q->id.dst_ip6)) &&
+                           pkt->src_port == q->id.src_port &&
+                           pkt->dst_port == q->id.dst_port ) {
+                               dir = MATCH_FORWARD;
+                               break;
+                           }
+                           if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+                                   &(q->id.dst_ip6)) &&
+                               IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+                                   &(q->id.src_ip6)) &&
+                               pkt->src_port == q->id.dst_port &&
+                               pkt->dst_port == q->id.src_port ) {
+                                   dir = MATCH_REVERSE;
+                                   break;
+                           }
+                       } else {
+                           if (pkt->src_ip == q->id.src_ip &&
+                               pkt->dst_ip == q->id.dst_ip &&
+                               pkt->src_port == q->id.src_port &&
+                               pkt->dst_port == q->id.dst_port ) {
+                                   dir = MATCH_FORWARD;
+                                   break;
+                           }
+                           if (pkt->src_ip == q->id.dst_ip &&
+                               pkt->dst_ip == q->id.src_ip &&
+                               pkt->src_port == q->id.dst_port &&
+                               pkt->dst_port == q->id.src_port ) {
+                                   dir = MATCH_REVERSE;
+                                   break;
+                           }
+                       }
+               }
+next:
+               prev = q;
+               q = q->next;
+       }
+       if (q == NULL)
+               goto done; /* q = NULL, not found */
+
+       /* move the hit to the head of its bucket */
+       if ( prev != NULL) { /* found and not in front */
+               prev->next = q->next;
+               q->next = V_ipfw_dyn_v[i];
+               V_ipfw_dyn_v[i] = q;
+       }
+       if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
+               u_char flags = pkt->_flags & (TH_FIN|TH_SYN|TH_RST);
+
+               /*
+                * q->state accumulates the flags seen so far: forward
+                * direction in the low byte, reverse direction in the
+                * next byte.
+                */
+#define BOTH_SYN       (TH_SYN | (TH_SYN << 8))
+#define BOTH_FIN       (TH_FIN | (TH_FIN << 8))
+               q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8);
+               switch (q->state) {
+               case TH_SYN:                            /* opening */
+                       q->expire = time_uptime + V_dyn_syn_lifetime;
+                       break;
+
+               case BOTH_SYN:                  /* move to established */
+               case BOTH_SYN | TH_FIN :        /* one side tries to close */
+               case BOTH_SYN | (TH_FIN << 8) :
+                       if (tcp) {
+                           /*
+                            * Subtract first, then convert: the unsigned
+                            * difference wraps instead of overflowing a
+                            * signed int. Same form as TIME_LEQ.
+                            */
+#define _SEQ_GE(a,b) ((int)((a) - (b)) >= 0)
+                           u_int32_t ack = ntohl(tcp->th_ack);
+                           if (dir == MATCH_FORWARD) {
+                               if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd))
+                                   q->ack_fwd = ack;
+                               else { /* ignore out-of-sequence */
+                                   break;
+                               }
+                           } else {
+                               if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev))
+                                   q->ack_rev = ack;
+                               else { /* ignore out-of-sequence */
+                                   break;
+                               }
+                           }
+                       }
+                       q->expire = time_uptime + V_dyn_ack_lifetime;
+                       break;
+
+               case BOTH_SYN | BOTH_FIN:       /* both sides closed */
+                       /* keep the lifetime strictly below the keepalive period */
+                       if (V_dyn_fin_lifetime >= V_dyn_keepalive_period)
+                               V_dyn_fin_lifetime = V_dyn_keepalive_period - 1;
+                       q->expire = time_uptime + V_dyn_fin_lifetime;
+                       break;
+
+               default:
+#if 0
+                       /*
+                        * reset or some invalid combination, but can also
+                        * occur if we use keep-state the wrong way.
+                        */
+                       if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
+                               printf("invalid state: 0x%x\n", q->state);
+#endif
+                       /* keep the lifetime strictly below the keepalive period */
+                       if (V_dyn_rst_lifetime >= V_dyn_keepalive_period)
+                               V_dyn_rst_lifetime = V_dyn_keepalive_period - 1;
+                       q->expire = time_uptime + V_dyn_rst_lifetime;
+                       break;
+               }
+       } else if (pkt->proto == IPPROTO_UDP) {
+               q->expire = time_uptime + V_dyn_udp_lifetime;
+       } else {
+               /* other protocols */
+               q->expire = time_uptime + V_dyn_short_lifetime;
+       }
+done:
+       if (match_direction)
+               *match_direction = dir;
+       return q;
+}
+
+/*
+ * Unlocked entry point for dynamic rule lookup: takes the
+ * dynamic-rules lock and calls lookup_dyn_rule_locked().
+ * On a miss the lock is released and NULL is returned.
+ * On a hit the rule is returned WITH THE LOCK STILL HELD; the caller
+ * has to release it (see ipfw_dyn_unlock()).
+ */
+ipfw_dyn_rule *
+ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction,
+    struct tcphdr *tcp)
+{
+       ipfw_dyn_rule *found;
+
+       IPFW_DYN_LOCK();
+       found = lookup_dyn_rule_locked(pkt, match_direction, tcp);
+       if (found != NULL)
+               return found;   /* NB: table stays locked for the caller */
+       IPFW_DYN_UNLOCK();
+       return NULL;
+}
+
+/*
+ * (Re)allocate the hash table of dynamic rules with the number of
+ * buckets requested in dyn_buckets.
+ *
+ * The old table is freed without looking at its contents, so this must
+ * only be called while no dynamic rule exists (add_dyn_rule() calls it
+ * when the table is missing or dyn_count is 0).
+ * On return V_ipfw_dyn_v may be NULL if no memory could be obtained
+ * or if the requested size was rejected and no table existed yet.
+ * Must be called with the dynamic-rules lock held.
+ */
+static void
+realloc_dynamic_table(void)
+{
+       IPFW_DYN_LOCK_ASSERT();
+
+       /*
+        * Try reallocation, make sure we have a power of 2 and do
+        * not allow more than 64k entries. In case of overflow,
+        * default to 1024.
+        */
+
+       if (V_dyn_buckets > 65536)
+               V_dyn_buckets = 1024;
+       /*
+        * Zero passes the power-of-2 bit test below but would give an
+        * empty table and a hash mask of all ones, so reject it too.
+        */
+       if (V_dyn_buckets == 0 ||
+           (V_dyn_buckets & (V_dyn_buckets-1)) != 0) { /* not a power of 2 */
+               V_dyn_buckets = V_curr_dyn_buckets; /* reset */
+               return;
+       }
+       V_curr_dyn_buckets = V_dyn_buckets;
+       if (V_ipfw_dyn_v != NULL)
+               free(V_ipfw_dyn_v, M_IPFW);
+       /* on allocation failure retry with half the size, down to 2 buckets */
+       for (;;) {
+               V_ipfw_dyn_v = malloc(V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *),
+                      M_IPFW, M_NOWAIT | M_ZERO);
+               if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2)
+                       break;
+               V_curr_dyn_buckets /= 2;
+       }
+}
+
+/**
+ * Install state of type 'type' for a dynamic session.
+ * The hash table contains three types of rules:
+ * - regular rules (O_KEEP_STATE)
+ * - rules for sessions with limited number of sess per user
+ *   (O_LIMIT). When they are created, the parent is
+ *   increased by 1, and decreased on delete. In this case,
+ *   the third parameter is the parent rule and not the chain.
+ * - "parent" rules for the above (O_LIMIT_PARENT).
+ *
+ * The new entry is inserted at the head of its bucket with an initial
+ * lifetime of dyn_syn_lifetime. Returns the new entry, or NULL when
+ * the hash table or the entry itself cannot be allocated.
+ * Must be called with the dynamic-rules lock held.
+ */
+static ipfw_dyn_rule *
+add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule)
+{
+       ipfw_dyn_rule *r;
+       int i;
+
+       IPFW_DYN_LOCK_ASSERT();
+
+       /*
+        * Allocate the table on first use, and resize it when a new
+        * bucket count was requested and no dynamic rule exists.
+        */
+       if (V_ipfw_dyn_v == NULL ||
+           (V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) {
+               realloc_dynamic_table();
+               if (V_ipfw_dyn_v == NULL)
+                       return NULL; /* failed ! */
+       }
+       /* hash after the possible resize: the result depends on the size */
+       i = hash_packet(id);
+
+       r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO);
+       if (r == NULL) {
+               printf ("ipfw: sorry cannot allocate state\n");
+               return NULL;
+       }
+
+       /* increase refcount on parent, and set pointer */
+       if (dyn_type == O_LIMIT) {
+               /* for O_LIMIT the 'rule' argument really is the parent entry */
+               ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule;
+               if ( parent->dyn_type != O_LIMIT_PARENT)
+                       panic("invalid parent");
+               parent->count++;
+               r->parent = parent;
+               rule = parent->rule;
+       }
+
+       r->id = *id;
+       r->expire = time_uptime + V_dyn_syn_lifetime;
+       r->rule = rule;
+       r->dyn_type = dyn_type;
+       r->pcnt = r->bcnt = 0;
+       r->count = 0;
+
+       /* remember the bucket and link the entry at its head */
+       r->bucket = i;
+       r->next = V_ipfw_dyn_v[i];
+       V_ipfw_dyn_v[i] = r;
+       V_dyn_count++;
+       /* debug trace; DEB() is defined as empty in this file */
+       DEB({
+               struct in_addr da;
+#ifdef INET6
+               char src[INET6_ADDRSTRLEN];
+               char dst[INET6_ADDRSTRLEN];
+#else
+               char src[INET_ADDRSTRLEN];
+               char dst[INET_ADDRSTRLEN];
+#endif
+
+#ifdef INET6
+               if (IS_IP6_FLOW_ID(&(r->id))) {
+                       ip6_sprintf(src, &r->id.src_ip6);
+                       ip6_sprintf(dst, &r->id.dst_ip6);
+               } else
+#endif
+               {
+                       da.s_addr = htonl(r->id.src_ip);
+                       inet_ntoa_r(da, src);
+                       da.s_addr = htonl(r->id.dst_ip);
+                       inet_ntoa_r(da, dst);
+               }
+               printf("ipfw: add dyn entry ty %d %s %d -> %s %d, total %d\n",
+                   dyn_type, src, r->id.src_port, dst, r->id.dst_port,
+                   V_dyn_count);
+       })
+       return r;
+}
+
+/**
+ * Find the O_LIMIT_PARENT entry that belongs to 'rule' and matches the
+ * (already masked) flow id 'pkt' exactly, in the forward direction
+ * only. A hit gets its lifetime refreshed with dyn_short_lifetime.
+ * If there is no such entry, or no hash table yet, one is created
+ * through add_dyn_rule(), whose result (possibly NULL) is returned.
+ * Must be called with the dynamic-rules lock held.
+ */
+static ipfw_dyn_rule *
+lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule)
+{
+       ipfw_dyn_rule *cand;
+       int bucket, v6, same_addrs;
+
+       IPFW_DYN_LOCK_ASSERT();
+
+       if (V_ipfw_dyn_v == NULL)
+               return add_dyn_rule(pkt, O_LIMIT_PARENT, rule);
+
+       v6 = IS_IP6_FLOW_ID(pkt);
+       bucket = hash_packet( pkt );
+       for (cand = V_ipfw_dyn_v[bucket]; cand != NULL; cand = cand->next) {
+               /* only parent entries of this very rule qualify */
+               if (cand->dyn_type != O_LIMIT_PARENT || rule != cand->rule)
+                       continue;
+               if (pkt->proto != cand->id.proto ||
+                   pkt->src_port != cand->id.src_port ||
+                   pkt->dst_port != cand->id.dst_port)
+                       continue;
+               if (v6)
+                       same_addrs =
+                           IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+                               &(cand->id.src_ip6)) &&
+                           IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+                               &(cand->id.dst_ip6));
+               else
+                       same_addrs =
+                           pkt->src_ip == cand->id.src_ip &&
+                           pkt->dst_ip == cand->id.dst_ip;
+               if (!same_addrs)
+                       continue;
+
+               cand->expire = time_uptime + V_dyn_short_lifetime;
+               DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",cand);)
+               return cand;
+       }
+       return add_dyn_rule(pkt, O_LIMIT_PARENT, rule);
+}
+
+/**
+ * Install dynamic state for rule type cmd->o.opcode
+ *
+ * Returns 1 (failure) if state is not installed because of errors or because
+ * session limitations are enforced.
+ */
+int
+ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
+    struct ip_fw_args *args, uint32_t tablearg)
+{
+       static int last_log;
+       ipfw_dyn_rule *q;
+       struct in_addr da;
+#ifdef INET6
+       char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2];
+#else
+       char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+
+       src[0] = '\0';
+       dst[0] = '\0';
+
+       IPFW_DYN_LOCK();
+
+       DEB(
+#ifdef INET6
+       if (IS_IP6_FLOW_ID(&(args->f_id))) {
+               ip6_sprintf(src, &args->f_id.src_ip6);
+               ip6_sprintf(dst, &args->f_id.dst_ip6);
+       } else
+#endif
+       {
+               da.s_addr = htonl(args->f_id.src_ip);
+               inet_ntoa_r(da, src);
+               da.s_addr = htonl(args->f_id.dst_ip);
+               inet_ntoa_r(da, dst);
+       }
+       printf("ipfw: %s: type %d %s %u -> %s %u\n",
+           __func__, cmd->o.opcode, src, args->f_id.src_port,
+           dst, args->f_id.dst_port);
+       src[0] = '\0';
+       dst[0] = '\0';
+       )
+
+       q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
+
+       if (q != NULL) {        /* should never occur */
+               if (last_log != time_uptime) {
+                       last_log = time_uptime;
+                       printf("ipfw: %s: entry already present, done\n",
+                           __func__);
+               }
+               IPFW_DYN_UNLOCK();
+               return (0);
+       }
+
+       if (V_dyn_count >= V_dyn_max)
+               /* Run out of slots, try to remove any expired rule. */
+               remove_dyn_rule(NULL, (ipfw_dyn_rule *)1);
+
+       if (V_dyn_count >= V_dyn_max) {
+               if (last_log != time_uptime) {
+                       last_log = time_uptime;
+                       printf("ipfw: %s: Too many dynamic rules\n", __func__);
+               }
+               IPFW_DYN_UNLOCK();
+               return (1);     /* cannot install, notify caller */
+       }
+
+       switch (cmd->o.opcode) {
+       case O_KEEP_STATE:      /* bidir rule */
+               add_dyn_rule(&args->f_id, O_KEEP_STATE, rule);
+               break;
+
+       case O_LIMIT: {         /* limit number of sessions */
+               struct ipfw_flow_id id;
+               ipfw_dyn_rule *parent;
+               uint32_t conn_limit;
+               uint16_t limit_mask = cmd->limit_mask;
+
+               conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ?
+                   tablearg : cmd->conn_limit;
+                 
+               DEB(
+               if (cmd->conn_limit == IP_FW_TABLEARG)
+                       printf("ipfw: %s: O_LIMIT rule, conn_limit: %u "
+                           "(tablearg)\n", __func__, conn_limit);
+               else
+                       printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n",
+                           __func__, conn_limit);
+               )
+
+               id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0;
+               id.proto = args->f_id.proto;
+               id.addr_type = args->f_id.addr_type;
+               id.fib = M_GETFIB(args->m);
+
+               if (IS_IP6_FLOW_ID (&(args->f_id))) {
+                       if (limit_mask & DYN_SRC_ADDR)
+                               id.src_ip6 = args->f_id.src_ip6;
+                       if (limit_mask & DYN_DST_ADDR)
+                               id.dst_ip6 = args->f_id.dst_ip6;
+               } else {
+                       if (limit_mask & DYN_SRC_ADDR)
+                               id.src_ip = args->f_id.src_ip;
+                       if (limit_mask & DYN_DST_ADDR)
+                               id.dst_ip = args->f_id.dst_ip;
+               }
+               if (limit_mask & DYN_SRC_PORT)
+                       id.src_port = args->f_id.src_port;
+               if (limit_mask & DYN_DST_PORT)
+                       id.dst_port = args->f_id.dst_port;
+               if ((parent = lookup_dyn_parent(&id, rule)) == NULL) {
+                       printf("ipfw: %s: add parent failed\n", __func__);
+                       IPFW_DYN_UNLOCK();
+                       return (1);
+               }
+
+               if (parent->count >= conn_limit) {
+                       /* See if we can remove some expired rule. */
+                       remove_dyn_rule(rule, parent);
+                       if (parent->count >= conn_limit) {
+                               if (V_fw_verbose && last_log != time_uptime) {
+                                       last_log = time_uptime;
+#ifdef INET6
+                                       /*
+                                        * XXX IPv6 flows are not
+                                        * supported yet.
+                                        */
+                                       if (IS_IP6_FLOW_ID(&(args->f_id))) {
+                                               char ip6buf[INET6_ADDRSTRLEN];
+                                               snprintf(src, sizeof(src),
+                                                   "[%s]", ip6_sprintf(ip6buf,
+                                                       &args->f_id.src_ip6));
+                                               snprintf(dst, sizeof(dst),
+                                                   "[%s]", ip6_sprintf(ip6buf,
+                                                       &args->f_id.dst_ip6));
+                                       } else
+#endif
+                                       {
+                                               da.s_addr =
+                                                   htonl(args->f_id.src_ip);
+                                               inet_ntoa_r(da, src);
+                                               da.s_addr =
+                                                   htonl(args->f_id.dst_ip);
+                                               inet_ntoa_r(da, dst);
+                                       }
+                                       log(LOG_SECURITY | LOG_DEBUG,
+                                           "ipfw: %d %s %s:%u -> %s:%u, %s\n",
+                                           parent->rule->rulenum,
+                                           "drop session",
+                                           src, (args->f_id.src_port),
+                                           dst, (args->f_id.dst_port),
+                                           "too many entries");
+                               }
+                               IPFW_DYN_UNLOCK();
+                               return (1);
+                       }
+               }
+               add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent);
+               break;
+       }
+       default:
+               printf("ipfw: %s: unknown dynamic rule type %u\n",
+                   __func__, cmd->o.opcode);
+               IPFW_DYN_UNLOCK();
+               return (1);
+       }
+
+       /* XXX just set lifetime */
+       lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
+
+       IPFW_DYN_UNLOCK();
+       return (0);
+}
+
+/*
+ * Generate a TCP packet, containing either a RST or a keepalive.
+ * When flags & TH_RST, we are sending a RST packet, because of a
+ * "reset" action matched the packet.
+ * Otherwise we are sending a keepalive, and flags & TH_SYN selects the
+ * direction: when set the packet goes from id->src to id->dst, when
+ * clear it goes from id->dst to id->src (RST packets always use the
+ * latter, reversed, direction).
+ * The 'replyto' mbuf is the mbuf being replied to, if any, and is required
+ * so that MAC can label the reply appropriately.
+ *
+ * Returns the newly built mbuf (it is not transmitted here), or NULL if
+ * no mbuf could be obtained or id->addr_type is not a supported family.
+ */
+struct mbuf *
+ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
+    u_int32_t ack, int flags)
+{
+       struct mbuf *m = NULL;          /* stupid compiler */
+       int len, dir;
+       struct ip *h = NULL;            /* stupid compiler */
+#ifdef INET6
+       struct ip6_hdr *h6 = NULL;
+#endif
+       struct tcphdr *th = NULL;
+
+       /* Non-blocking allocation: give up if no mbuf is available. */
+       MGETHDR(m, M_DONTWAIT, MT_DATA);
+       if (m == NULL)
+               return (NULL);
+
+       M_SETFIB(m, id->fib);
+#ifdef MAC
+       if (replyto != NULL)
+               mac_netinet_firewall_reply(replyto, m);
+       else
+               mac_netinet_firewall_send(m);
+#else
+       (void)replyto;          /* don't warn about unused arg */
+#endif
+
+       /*
+        * Total packet length: network header plus a bare TCP header.
+        * Unsupported address families drop the mbuf and fail.
+        */
+       switch (id->addr_type) {
+       case 4:
+               len = sizeof(struct ip) + sizeof(struct tcphdr);
+               break;
+#ifdef INET6
+       case 6:
+               len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+               break;
+#endif
+       default:
+               /* XXX: log me?!? */
+               FREE_PKT(m);
+               return (NULL);
+       }
+       /* dir != 0 only when TH_SYN is set and TH_RST is not: src -> dst. */
+       dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN);
+
+       /* Advance the data pointer by max_linkhdr and zero the headers. */
+       m->m_data += max_linkhdr;
+       m->m_flags |= M_SKIP_FIREWALL;
+       m->m_pkthdr.len = m->m_len = len;
+       m->m_pkthdr.rcvif = NULL;
+       bzero(m->m_data, len);
+
+       /*
+        * Fill in only the fields that take part in the TCP checksum
+        * (protocol, TCP length, addresses); the rest of the network
+        * header stays zero until the checksum has been computed below.
+        */
+       switch (id->addr_type) {
+       case 4:
+               h = mtod(m, struct ip *);
+
+               /* prepare for checksum */
+               h->ip_p = IPPROTO_TCP;
+               h->ip_len = htons(sizeof(struct tcphdr));
+               if (dir) {
+                       h->ip_src.s_addr = htonl(id->src_ip);
+                       h->ip_dst.s_addr = htonl(id->dst_ip);
+               } else {
+                       h->ip_src.s_addr = htonl(id->dst_ip);
+                       h->ip_dst.s_addr = htonl(id->src_ip);
+               }
+
+               th = (struct tcphdr *)(h + 1);
+               break;
+#ifdef INET6
+       case 6:
+               h6 = mtod(m, struct ip6_hdr *);
+
+               /* prepare for checksum */
+               h6->ip6_nxt = IPPROTO_TCP;
+               h6->ip6_plen = htons(sizeof(struct tcphdr));
+               if (dir) {
+                       h6->ip6_src = id->src_ip6;
+                       h6->ip6_dst = id->dst_ip6;
+               } else {
+                       h6->ip6_src = id->dst_ip6;
+                       h6->ip6_dst = id->src_ip6;
+               }
+
+               th = (struct tcphdr *)(h6 + 1);
+               break;
+#endif
+       }
+
+       /* Ports follow the same direction as the addresses. */
+       if (dir) {
+               th->th_sport = htons(id->src_port);
+               th->th_dport = htons(id->dst_port);
+       } else {
+               th->th_sport = htons(id->dst_port);
+               th->th_dport = htons(id->src_port);
+       }
+       th->th_off = sizeof(struct tcphdr) >> 2;
+
+       if (flags & TH_RST) {
+               if (flags & TH_ACK) {
+                       /* Offending packet carried an ACK: RST uses it as seq. */
+                       th->th_seq = htonl(ack);
+                       th->th_flags = TH_RST;
+               } else {
+                       /* No ACK: acknowledge seq (plus one if TH_SYN is set). */
+                       if (flags & TH_SYN)
+                               seq++;
+                       th->th_ack = htonl(seq);
+                       th->th_flags = TH_RST | TH_ACK;
+               }
+       } else {
+               /*
+                * Keepalive - use caller provided sequence numbers
+                */
+               th->th_seq = htonl(seq);
+               th->th_ack = htonl(ack);
+               th->th_flags = TH_ACK;
+       }
+
+       /* Compute the TCP checksum, then complete the network header. */
+       switch (id->addr_type) {
+       case 4:
+               th->th_sum = in_cksum(m, len);
+
+               /* finish the ip header */
+               h->ip_v = 4;
+               h->ip_hl = sizeof(*h) >> 2;
+               h->ip_tos = IPTOS_LOWDELAY;
+               h->ip_off = 0;
+               /* ip_len must be in host format for ip_output */
+               h->ip_len = len;
+               h->ip_ttl = V_ip_defttl;
+               h->ip_sum = 0;
+               break;
+#ifdef INET6
+       case 6:
+               th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6),
+                   sizeof(struct tcphdr));
+
+               /* finish the ip6 header */
+               h6->ip6_vfc |= IPV6_VERSION;
+               h6->ip6_hlim = IPV6_DEFHLIM;
+               break;
+#endif
+       }
+
+       return (m);
+}
+
+/*
+ * Keepalive timer handler. It re-arms itself on every invocation so
+ * that it fires again after V_dyn_keepalive_period seconds.
+ *
+ * For each TCP dynamic rule that is about to expire it builds one
+ * keepalive per direction, queues them while the dynamic-rule lock is
+ * held, and transmits them only after the lock has been released.
+ */
+/* dummynet() and ipfw_tick() can't be static in windows */
+void
+ipfw_tick(void * vnetx)
+{
+       struct mbuf *v4_head, **v4_tail;
+#ifdef INET6
+       struct mbuf *v6_head, **v6_tail;
+#endif
+       struct mbuf *fwd, *rev, *cur, *follow;
+       ipfw_dyn_rule *q;
+       int bucket;
+#ifdef VIMAGE
+       struct vnet *vp = vnetx;
+#endif
+
+       CURVNET_SET(vp);
+       /* Nothing to do if keepalives are off or there are no rules. */
+       if (V_dyn_keepalive == 0 || V_ipfw_dyn_v == NULL || V_dyn_count == 0)
+               goto done;
+
+       /*
+        * The packets are collected on private chains and sent after
+        * the dynamic rule lock is dropped: sending with the lock held
+        * would result in a lock order reversal with the normal
+        * packet input -> ipfw call stack.
+        */
+       v4_head = NULL;
+       v4_tail = &v4_head;
+#ifdef INET6
+       v6_head = NULL;
+       v6_tail = &v6_head;
+#endif
+
+       IPFW_DYN_LOCK();
+       for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
+               for (q = V_ipfw_dyn_v[bucket]; q != NULL; q = q->next) {
+                       /*
+                        * Skip limit parents, non-TCP flows and entries
+                        * whose state lacks any of the BOTH_SYN bits.
+                        */
+                       if (q->dyn_type == O_LIMIT_PARENT ||
+                           q->id.proto != IPPROTO_TCP ||
+                           (q->state & BOTH_SYN) != BOTH_SYN)
+                               continue;
+                       /* too early */
+                       if (TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
+                           q->expire))
+                               continue;
+                       /* too late, rule expired */
+                       if (TIME_LEQ(q->expire, time_uptime))
+                               continue;
+
+                       /* One keepalive for each direction of the flow. */
+                       fwd = ipfw_send_pkt(NULL, &(q->id), q->ack_rev - 1,
+                           q->ack_fwd, TH_SYN);
+                       rev = ipfw_send_pkt(NULL, &(q->id), q->ack_fwd - 1,
+                           q->ack_rev, 0);
+
+                       if (q->id.addr_type == 4) {
+                               if (fwd != NULL) {
+                                       *v4_tail = fwd;
+                                       v4_tail = &fwd->m_nextpkt;
+                               }
+                               if (rev != NULL) {
+                                       *v4_tail = rev;
+                                       v4_tail = &rev->m_nextpkt;
+                               }
+                       }
+#ifdef INET6
+                       else if (q->id.addr_type == 6) {
+                               if (fwd != NULL) {
+                                       *v6_tail = fwd;
+                                       v6_tail = &fwd->m_nextpkt;
+                               }
+                               if (rev != NULL) {
+                                       *v6_tail = rev;
+                                       v6_tail = &rev->m_nextpkt;
+                               }
+                       }
+#endif
+               }
+       }
+       IPFW_DYN_UNLOCK();
+
+       /* Lock released: unlink each queued packet and hand it over. */
+       cur = v4_head;
+       while (cur != NULL) {
+               follow = cur->m_nextpkt;
+               cur->m_nextpkt = NULL;
+               ip_output(cur, NULL, NULL, 0, NULL, NULL);
+               cur = follow;
+       }
+#ifdef INET6
+       cur = v6_head;
+       while (cur != NULL) {
+               follow = cur->m_nextpkt;
+               cur->m_nextpkt = NULL;
+               ip6_output(cur, NULL, NULL, 0, NULL, NULL, NULL);
+               cur = follow;
+       }
+#endif
+
+done:
+       /* Schedule the next run. */
+       callout_reset_on(&V_ipfw_timeout, V_dyn_keepalive_period * hz,
+           ipfw_tick, vnetx, 0);
+       CURVNET_RESTORE();
+}
+
+/*
+ * Create the UMA zone stored in ipfw_dyn_rule_zone (objects of
+ * sizeof(ipfw_dyn_rule) bytes) and initialize the dynamic-rule lock.
+ */
+void
+ipfw_dyn_attach(void)
+{
+       ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule",
+           sizeof(ipfw_dyn_rule),
+           NULL, NULL, NULL, NULL,
+           UMA_ALIGN_PTR, 0);
+
+       IPFW_DYN_LOCK_INIT();
+}
+
+/*
+ * Counterpart of ipfw_dyn_attach(): destroy the dynamic-rule UMA zone,
+ * then the dynamic-rule lock.
+ */
+void
+ipfw_dyn_detach(void)
+{
+       uma_zdestroy(ipfw_dyn_rule_zone);
+
+       IPFW_DYN_LOCK_DESTROY();
+}
+
+/*
+ * Load the default values of the dynamic-rule tunables and start the
+ * keepalive timer, whose first ipfw_tick() run is scheduled hz ticks
+ * from now.
+ */
+void
+ipfw_dyn_init(void)
+{
+       /* Hash table: allocated elsewhere; sizes must be powers of 2. */
+       V_ipfw_dyn_v = NULL;
+       V_dyn_buckets = 256;
+       V_curr_dyn_buckets = 256;
+       V_dyn_max = 4096;               /* max # of dynamic rules */
+
+       /* Lifetimes of the entries, by connection state / protocol. */
+       V_dyn_syn_lifetime = 20;
+       V_dyn_ack_lifetime = 300;
+       V_dyn_fin_lifetime = 1;
+       V_dyn_rst_lifetime = 1;
+       V_dyn_udp_lifetime = 10;
+       V_dyn_short_lifetime = 5;
+
+       /* Keepalives are sent by default. */
+       V_dyn_keepalive = 1;
+       V_dyn_keepalive_interval = 20;
+       V_dyn_keepalive_period = 5;
+
+       callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE);
+       callout_reset_on(&V_ipfw_timeout, hz, ipfw_tick, curvnet, 0);
+}
+
+/*
+ * Two-pass teardown of the dynamic-rule state.
+ * pass == 0: drain the keepalive callout.
+ * any other pass: free the hash table, if one was allocated.
+ */
+void
+ipfw_dyn_uninit(int pass)
+{
+       if (pass == 0) {
+               callout_drain(&V_ipfw_timeout);
+               return;
+       }
+
+       if (V_ipfw_dyn_v != NULL)
+               free(V_ipfw_dyn_v, M_IPFW);
+}
+
+/*
+ * Return the number of bytes needed to hold all current dynamic rules
+ * (V_dyn_count entries of sizeof(ipfw_dyn_rule) each), or 0 when the
+ * hash table has not been allocated.
+ */
+int
+ipfw_dyn_len(void)
+{
+       if (V_ipfw_dyn_v == NULL)
+               return (0);
+
+       return (V_dyn_count * sizeof(ipfw_dyn_rule));
+}
+
+/*
+ * Copy the dynamic rules into the buffer that starts at *pbp and ends
+ * at ep, one ipfw_dyn_rule per entry; entries that do not fit are
+ * left out. On return *pbp points just past the last copied entry.
+ *
+ * Pointer fields of each copy are overwritten for export:
+ *  - 'rule' receives the parent rule number followed by its set number;
+ *  - 'next' is non-NULL for every entry except the last copied one;
+ *  - 'expire' becomes the remaining lifetime (0 if already expired).
+ */
+void
+ipfw_get_dynamic(char **pbp, const char *ep)
+{
+       ipfw_dyn_rule *src, *out, *tail = NULL;
+       char *cursor;
+       int bucket;
+
+       if (V_ipfw_dyn_v == NULL)
+               return;
+       cursor = *pbp;
+
+       IPFW_DYN_LOCK();
+       for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
+               for (src = V_ipfw_dyn_v[bucket]; src != NULL;
+                   src = src->next) {
+                       /* No room left for this entry. */
+                       if (cursor + sizeof(*src) > ep)
+                               continue;
+
+                       out = (ipfw_dyn_rule *)cursor;
+                       bcopy(src, out, sizeof(*src));
+
+                       /* Rule number goes in the low part of out->rule... */
+                       bcopy(&(src->rule->rulenum), &(out->rule),
+                           sizeof(src->rule->rulenum));
+                       /* ...and the set number right after it. */
+                       bcopy(&(src->rule->set),
+                           (char *)&out->rule + sizeof(src->rule->rulenum),
+                           sizeof(src->rule->set));
+
+                       /*
+                        * Any non-NULL value will do for "next": userland
+                        * interprets NULL as the end-of-list marker.
+                        */
+                       bcopy(&out, &out->next, sizeof(out));
+                       tail = out;
+
+                       if (TIME_LEQ(out->expire, time_uptime))
+                               out->expire = 0;
+                       else
+                               out->expire = out->expire - time_uptime;
+
+                       cursor += sizeof(ipfw_dyn_rule);
+               }
+       }
+       IPFW_DYN_UNLOCK();
+
+       /* Mark the last dynamic rule. */
+       if (tail != NULL)
+               bzero(&tail->next, sizeof(tail));
+       *pbp = cursor;
+}
+/* end of file */
diff --git a/sys/netinet/ipfw/ip_fw_log.c b/sys/netinet/ipfw/ip_fw_log.c
new file mode 100644 (file)
index 0000000..55b5c26
--- /dev/null
@@ -0,0 +1,449 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_log.c 209845 2010-07-09 11:27:33Z glebius $");
+
+/*
+ * Logging support for ipfw
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <net/ethernet.h> /* for ETHERTYPE_IP */
+#include <net/if.h>
+#include <net/vnet.h>
+#include <net/if_types.h>      /* for IFT_ETHER */
+#include <net/bpf.h>           /* for BPF */
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#ifdef INET6
+#include <netinet6/in6_var.h>  /* ip6_sprintf() */
+#endif
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+/*
+ * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T
+ * Other macros just cast void * into the appropriate type
+ */
+#define        L3HDR(T, ip)    ((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
+#define        TCP(p)          ((struct tcphdr *)(p))
+#define        SCTP(p)         ((struct sctphdr *)(p))
+#define        UDP(p)          ((struct udphdr *)(p))
+#define        ICMP(p)         ((struct icmphdr *)(p))
+#define        ICMP6(p)        ((struct icmp6_hdr *)(p))
+
+#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
+#define SNP(buf) buf, sizeof(buf)
+
+#ifdef WITHOUT_BPF
+/*
+ * Built WITHOUT_BPF: there is no bpf log interface to create or
+ * destroy, so the request is ignored.
+ */
+void
+ipfw_log_bpf(int onoff)
+{
+}
+#else /* !WITHOUT_BPF */
+static struct ifnet *log_if;   /* hook to attach to bpf */
+
+/*
+ * Placeholder for the ifnet callbacks of the log interface that have
+ * nothing to do; it ignores its arguments and always fails with EINVAL.
+ */
+static int
+log_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr)
+{
+       return (EINVAL);
+}
+
+/*
+ * if_output handler of the log interface: nothing can be transmitted
+ * through it, so the packet (if any) is freed and EINVAL is returned.
+ */
+static int
+ipfw_log_output(struct ifnet *ifp, struct mbuf *m,
+    struct sockaddr *dst, struct route *ro)
+{
+       if (m == NULL)
+               return (EINVAL);
+
+       m_freem(m);
+       return (EINVAL);
+}
+
+/*
+ * if_start handler of the log interface. It must never run: being
+ * called at all is treated as a fatal error.
+ */
+static void
+ipfw_log_start(struct ifnet *ifp)
+{
+       panic("ipfw_log_start() must not be called");
+}
+
+/* Broadcast address advertised by the log pseudo-interface. */
+static const u_char ipfwbroadcastaddr[6] = {
+       0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+/*
+ * Create (onoff != 0) or destroy (onoff == 0) the "ipfw0" pseudo
+ * interface that logged packets are tapped to through bpf.
+ * Creating it when it already exists, or destroying it when it does
+ * not, is a no-op. A failed allocation is silently ignored.
+ */
+void
+ipfw_log_bpf(int onoff)
+{
+       struct ifnet *ifp;
+
+       if (!onoff) {
+               /*
+                * NOTE(review): the interface is set up below with
+                * if_attach() + bpfattach() but torn down with
+                * ether_ifdetach() -- confirm the pairing is intended.
+                */
+               if (log_if) {
+                       ether_ifdetach(log_if);
+                       if_free(log_if);
+               }
+               log_if = NULL;
+               return;
+       }
+
+       /* Already enabled. */
+       if (log_if)
+               return;
+
+       ifp = if_alloc(IFT_ETHER);
+       if (ifp == NULL)
+               return;
+
+       /* Looks like an ethernet device, but every handler is a stub. */
+       if_initname(ifp, "ipfw", 0);
+       ifp->if_mtu = 65536;
+       ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
+       ifp->if_init = (void *)log_dummy;
+       ifp->if_ioctl = log_dummy;
+       ifp->if_start = ipfw_log_start;
+       ifp->if_output = ipfw_log_output;
+       ifp->if_addrlen = 6;
+       ifp->if_hdrlen = 14;
+       if_attach(ifp);
+
+       /* These are set only after the interface has been attached. */
+       ifp->if_broadcastaddr = ipfwbroadcastaddr;
+       ifp->if_baudrate = IF_Mbps(10);
+       bpfattach(ifp, DLT_EN10MB, 14);
+
+       log_if = ifp;
+}
+#endif /* !WITHOUT_BPF */
+
+/*
+ * We enter here when we have a rule with O_LOG.
+ * XXX this function alone takes about 2Kbytes of code!
+ */
+void
+ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
+    struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
+    struct ip *ip)
+{
+       char *action;
+       int limit_reached = 0;
+       char action2[40], proto[128], fragment[32];
+
+       if (V_fw_verbose == 0) {
+#ifndef WITHOUT_BPF
+
+               if (log_if == NULL || log_if->if_bpf == NULL)
+                       return;
+
+               if (args->eh) /* layer2, use orig hdr */
+                       BPF_MTAP2(log_if, args->eh, ETHER_HDR_LEN, m);
+               else
+                       /* Add fake header. Later we will store
+                        * more info in the header.
+                        */
+                       BPF_MTAP2(log_if, "DDDDDDSSSSSS\x08\x00", ETHER_HDR_LEN, m);
+#endif /* !WITHOUT_BPF */
+               return;
+       }
+       /* the old 'log' function */
+       fragment[0] = '\0';
+       proto[0] = '\0';
+
+       if (f == NULL) {        /* bogus pkt */
+               if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit)
+                       return;
+               V_norule_counter++;
+               if (V_norule_counter == V_verbose_limit)
+                       limit_reached = V_verbose_limit;
+               action = "Refuse";
+       } else {        /* O_LOG is the first action, find the real one */
+               ipfw_insn *cmd = ACTION_PTR(f);
+               ipfw_insn_log *l = (ipfw_insn_log *)cmd;
+
+               if (l->max_log != 0 && l->log_left == 0)
+                       return;
+               l->log_left--;
+               if (l->log_left == 0)
+                       limit_reached = l->max_log;
+               cmd += F_LEN(cmd);      /* point to first action */
+               if (cmd->opcode == O_ALTQ) {
+                       ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
+
+                       snprintf(SNPARGS(action2, 0), "Altq %d",
+                               altq->qid);
+                       cmd += F_LEN(cmd);
+               }
+               if (cmd->opcode == O_PROB)
+                       cmd += F_LEN(cmd);
+
+               if (cmd->opcode == O_TAG)
+                       cmd += F_LEN(cmd);
+
+               action = action2;
+               switch (cmd->opcode) {
+               case O_DENY:
+                       action = "Deny";
+                       break;
+
+               case O_REJECT:
+                       if (cmd->arg1==ICMP_REJECT_RST)
+                               action = "Reset";
+                       else if (cmd->arg1==ICMP_UNREACH_HOST)
+                               action = "Reject";
+                       else
+                               snprintf(SNPARGS(action2, 0), "Unreach %d",
+                                       cmd->arg1);
+                       break;
+
+               case O_UNREACH6:
+                       if (cmd->arg1==ICMP6_UNREACH_RST)
+                               action = "Reset";
+                       else
+                               snprintf(SNPARGS(action2, 0), "Unreach %d",
+                                       cmd->arg1);
+                       break;
+
+               case O_ACCEPT:
+                       action = "Accept";
+                       break;
+               case O_COUNT:
+                       action = "Count";
+                       break;
+               case O_DIVERT:
+                       snprintf(SNPARGS(action2, 0), "Divert %d",
+                               cmd->arg1);
+                       break;
+               case O_TEE:
+                       snprintf(SNPARGS(action2, 0), "Tee %d",
+                               cmd->arg1);
+                       break;
+               case O_SETFIB:
+                       snprintf(SNPARGS(action2, 0), "SetFib %d",
+                               cmd->arg1);
+                       break;
+               case O_SKIPTO:
+                       snprintf(SNPARGS(action2, 0), "SkipTo %d",
+                               cmd->arg1);
+                       break;
+               case O_PIPE:
+                       snprintf(SNPARGS(action2, 0), "Pipe %d",
+                               cmd->arg1);
+                       break;
+               case O_QUEUE:
+                       snprintf(SNPARGS(action2, 0), "Queue %d",
+                               cmd->arg1);
+                       break;
+               case O_FORWARD_IP: {
+                       ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
+                       int len;
+                       struct in_addr dummyaddr;
+                       if (sa->sa.sin_addr.s_addr == INADDR_ANY)
+                               dummyaddr.s_addr = htonl(tablearg);
+                       else
+                               dummyaddr.s_addr = sa->sa.sin_addr.s_addr;
+
+                       len = snprintf(SNPARGS(action2, 0), "Forward to %s",
+                               inet_ntoa(dummyaddr));
+
+                       if (sa->sa.sin_port)
+                               snprintf(SNPARGS(action2, len), ":%d",
+                                   sa->sa.sin_port);
+                       }
+                       break;
+               case O_NETGRAPH:
+                       snprintf(SNPARGS(action2, 0), "Netgraph %d",
+                               cmd->arg1);
+                       break;
+               case O_NGTEE:
+                       snprintf(SNPARGS(action2, 0), "Ngtee %d",
+                               cmd->arg1);
+                       break;
+               case O_NAT:
+                       action = "Nat";
+                       break;
+               case O_REASS:
+                       action = "Reass";
+                       break;
+               default:
+                       action = "UNKNOWN";
+                       break;
+               }
+       }
+
+       if (hlen == 0) {        /* non-ip */
+               snprintf(SNPARGS(proto, 0), "MAC");
+
+       } else {
+               int len;
+#ifdef INET6
+               char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2];
+#else
+               char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+               struct icmphdr *icmp;
+               struct tcphdr *tcp;
+               struct udphdr *udp;
+#ifdef INET6
+               struct ip6_hdr *ip6 = NULL;
+               struct icmp6_hdr *icmp6;
+#endif
+               src[0] = '\0';
+               dst[0] = '\0';
+#ifdef INET6
+               if (IS_IP6_FLOW_ID(&(args->f_id))) {
+                       char ip6buf[INET6_ADDRSTRLEN];
+                       snprintf(src, sizeof(src), "[%s]",
+                           ip6_sprintf(ip6buf, &args->f_id.src_ip6));
+                       snprintf(dst, sizeof(dst), "[%s]",
+                           ip6_sprintf(ip6buf, &args->f_id.dst_ip6));
+
+                       ip6 = (struct ip6_hdr *)ip;
+                       tcp = (struct tcphdr *)(((char *)ip) + hlen);
+                       udp = (struct udphdr *)(((char *)ip) + hlen);
+               } else
+#endif
+               {
+                       tcp = L3HDR(struct tcphdr, ip);
+                       udp = L3HDR(struct udphdr, ip);
+
+                       inet_ntoa_r(ip->ip_src, src);
+                       inet_ntoa_r(ip->ip_dst, dst);
+               }
+
+               switch (args->f_id.proto) {
+               case IPPROTO_TCP:
+                       len = snprintf(SNPARGS(proto, 0), "TCP %s", src);
+                       if (offset == 0)
+                               snprintf(SNPARGS(proto, len), ":%d %s:%d",
+                                   ntohs(tcp->th_sport),
+                                   dst,
+                                   ntohs(tcp->th_dport));
+                       else
+                               snprintf(SNPARGS(proto, len), " %s", dst);
+                       break;
+
+               case IPPROTO_UDP:
+                       len = snprintf(SNPARGS(proto, 0), "UDP %s", src);
+                       if (offset == 0)
+                               snprintf(SNPARGS(proto, len), ":%d %s:%d",
+                                   ntohs(udp->uh_sport),
+                                   dst,
+                                   ntohs(udp->uh_dport));
+                       else
+                               snprintf(SNPARGS(proto, len), " %s", dst);
+                       break;
+
+               case IPPROTO_ICMP:
+                       icmp = L3HDR(struct icmphdr, ip);
+                       if (offset == 0)
+                               len = snprintf(SNPARGS(proto, 0),
+                                   "ICMP:%u.%u ",
+                                   icmp->icmp_type, icmp->icmp_code);
+                       else
+                               len = snprintf(SNPARGS(proto, 0), "ICMP ");
+                       len += snprintf(SNPARGS(proto, len), "%s", src);
+                       snprintf(SNPARGS(proto, len), " %s", dst);
+                       break;
+#ifdef INET6
+               case IPPROTO_ICMPV6:
+                       icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen);
+                       if (offset == 0)
+                               len = snprintf(SNPARGS(proto, 0),
+                                   "ICMPv6:%u.%u ",
+                                   icmp6->icmp6_type, icmp6->icmp6_code);
+                       else
+                               len = snprintf(SNPARGS(proto, 0), "ICMPv6 ");
+                       len += snprintf(SNPARGS(proto, len), "%s", src);
+                       snprintf(SNPARGS(proto, len), " %s", dst);
+                       break;
+#endif
+               default:
+                       len = snprintf(SNPARGS(proto, 0), "P:%d %s",
+                           args->f_id.proto, src);
+                       snprintf(SNPARGS(proto, len), " %s", dst);
+                       break;
+               }
+
+#ifdef INET6
+               if (IS_IP6_FLOW_ID(&(args->f_id))) {
+                       if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG))
+                               snprintf(SNPARGS(fragment, 0),
+                                   " (frag %08x:%d@%d%s)",
+                                   args->f_id.extra,
+                                   ntohs(ip6->ip6_plen) - hlen,
+                                   ntohs(offset & IP6F_OFF_MASK) << 3,
+                                   (offset & IP6F_MORE_FRAG) ? "+" : "");
+               } else
+#endif
+               {
+                       int ipoff, iplen;
+                       ipoff = ntohs(ip->ip_off);
+                       iplen = ntohs(ip->ip_len);
+                       if (ipoff & (IP_MF | IP_OFFMASK))
+                               snprintf(SNPARGS(fragment, 0),
+                                   " (frag %d:%d@%d%s)",
+                                   ntohs(ip->ip_id), iplen - (ip->ip_hl << 2),
+                                   offset << 3,
+                                   (ipoff & IP_MF) ? "+" : "");
+               }
+       }
+#ifdef __FreeBSD__
+       if (oif || m->m_pkthdr.rcvif)
+               log(LOG_SECURITY | LOG_INFO,
+                   "ipfw: %d %s %s %s via %s%s\n",
+                   f ? f->rulenum : -1,
+                   action, proto, oif ? "out" : "in",
+                   oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
+                   fragment);
+       else
+#endif
+               log(LOG_SECURITY | LOG_INFO,
+                   "ipfw: %d %s %s [no if info]%s\n",
+                   f ? f->rulenum : -1,
+                   action, proto, fragment);
+       if (limit_reached)
+               log(LOG_SECURITY | LOG_NOTICE,
+                   "ipfw: limit %d reached on entry %d\n",
+                   limit_reached, f ? f->rulenum : -1);
+}
+/* end of file */
diff --git a/sys/netinet/ipfw/ip_fw_lookup.c b/sys/netinet/ipfw/ip_fw_lookup.c
new file mode 100644 (file)
index 0000000..bf04cb6
--- /dev/null
@@ -0,0 +1,304 @@
+/*-
+ * Copyright (c) 2009 Luigi Rizzo Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_fw_table.c 200601 2009-12-16 10:48:40Z luigi $");
+
+/*
+ * Rule and pipe lookup support for ipfw.
+ *
+
+ipfw and dummynet need to quickly find objects (rules, pipes)
+that may be dynamically created or destroyed.
+To address the problem, we label each new object with a unique
+32-bit identifier whose low K bits are the index in a lookup
+table. All existing objects are referred by the lookup table,
+and identifiers are chosen so that for each slot there is
+at most one active object (whose identifier points to the slot).
+This is almost a hash table, except that we can pick the
+identifiers after looking at the table's occupation so
+we have a trivial hash function and are collision free.
+
+With this structure, operations are very fast and simple:
+- the table has N entries s[i] with two fields, 'id' and 'ptr',
+  with N <= M = 2^k (M is an upper bound to the size of the table);
+- initially, all slots have s[i].id = i, and the pointers
+  are used to build a freelist (tailq).
+- a slot is considered empty if ptr == NULL or s[0] <= ptr < s[N].
+  This is easy to detect and we can use ptr to build the freelist.
+- when a new object is created, we put it in the empty slot i at the
+  head of the freelist, and set the id to s[i].id;
+- when an object is destroyed, we append its slot i to the end
+  of the freelist, and set s[i].id += M (note M, not N).
+- on a lookup for id = X, we look at slot i = X & (M-1),
+  and consider the lookup successful only if the slot is not
+  empty and s[i].id == X;
+- wraps occur at most every F * 2^32/M operations, where F is
+  the number of free slots. Because F is usually a reasonable
+  fraction of M, we should not worry too much.
+- if the table fills up, we can extend it by increasing N
+- shrinking the table is more difficult as we might create
+  collisions during the rehashing.
+ *
+ */
+
+#include <sys/cdefs.h>
+#ifdef _KERNEL
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+/* Kernel build: table memory is accounted to the M_IPFW_LUT malloc type. */
+MALLOC_DEFINE(M_IPFW_LUT, "ipfw_lookup", "IpFw lookup");
+#define Malloc(n)      malloc(n, M_IPFW_LUT, M_WAITOK)
+/* NOTE(review): relies on a 3-argument kernel calloc(); confirm it exists. */
+#define Calloc(n)      calloc(n, M_IPFW_LUT, M_WAITOK | M_ZERO)
+#define Free(p)                free(p, M_IPFW_LUT)
+
+/* Debug messages are compiled out in the kernel build. */
+#define log(x, arg...)
+
+#else /* !_KERNEL */
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+/* Userland build: plain libc allocators; Calloc(n) returns n zeroed bytes. */
+#define Malloc(n)      malloc(n)
+#define Calloc(n)      calloc(1, n)
+#define Free(p)                free(p)
+/* Debug messages go to stderr, prefixed with the calling function's name. */
+#define log(x, arg...) fprintf(stderr, "%s: " x "\n", __FUNCTION__, ##arg)
+#endif /* !_KERNEL */
+
+/*
+ * One slot of the lookup table.
+ * An occupied slot holds the object's identifier and the object pointer.
+ * A free slot holds the identifier that the next insert into it will
+ * return, and 'ptr' is either NULL or the next free slot in the freelist.
+ */
+struct entry {
+       uint32_t        id;
+       struct entry    *ptr;
+};
+
+/*
+ * Table header: the slot array plus the freelist threaded through the
+ * 'ptr' field of the free slots.
+ */
+struct lookup_table {
+       int _size;      /* number of slots in s[] */
+       int used;       /* incremented by insert, decremented by delete */
+       int mask; /* 2^k -1, used for hashing */
+       struct entry *f_head, *f_tail; /* freelist */
+       struct entry *  s;      /* slots, array of N entries */
+};
+
+/*
+ * Tell whether the value 'p' stored in a slot marks the slot as free:
+ * either NULL, or a pointer that falls inside the table's own slot array
+ * (i.e. a freelist link). Returns 1 for a free slot, 0 otherwise.
+ */
+static __inline int empty(struct lookup_table *head, const void *p)
+{
+       const struct entry *cand = p;
+
+       if (cand == NULL)
+               return 1;
+       /* a pointer into s[0 .. _size-1] is a freelist link, not an object */
+       return (cand >= head->s && cand < head->s + head->_size);
+}
+
+/*
+ * Create a lookup table, or grow an existing one.
+ *
+ * head:     table to grow, or NULL to allocate a new one.
+ * new_size: requested number of slots N.
+ * mask:     used only when head is NULL, as a lower bound for M (the
+ *           id stride): it is raised to new_size if smaller and rounded
+ *           up to a power of 2. For an existing table the stored mask
+ *           is used and this argument is ignored.
+ *
+ * Returns the table (head itself when head was not NULL), or NULL if
+ * memory cannot be allocated or new_size exceeds M. A request that does
+ * not grow an existing table returns head unchanged. Occupied slots keep
+ * their id and pointer; the freelist is rebuilt in slot order.
+ */
+struct lookup_table *
+ipfw_lut_init(struct lookup_table *head, int new_size, int mask)
+{
+       int i;
+       int fresh = 0;          /* head was allocated by this call */
+       struct entry *s;        /* the new slots */
+       struct entry *fh, *ft;  /* the freelist */
+
+       if (head != NULL) {
+               mask = head->mask;
+               if (new_size <= head->_size)
+                       return head;
+               /* N == M is legal: slots 0..M-1 are all reachable. */
+               if (new_size > mask + 1) {
+                       log("size larger than mask");
+                       return NULL;
+               }
+       } else {
+               log("old is null, initialize");
+               head = Calloc(sizeof(*head));
+               if (head == NULL)
+                       return NULL;
+               fresh = 1;
+               if (new_size >= mask)
+                       mask = new_size;
+               if (mask & (mask -1)) {
+                       /* round up to the next power of 2 */
+                       for (i = 1; i < mask; i += i)
+                           ;
+                       log("mask %d not 2^k, round up to %d", mask, i);
+                       mask = i;
+               }
+               mask = head->mask = mask - 1;
+       }
+
+       s = Calloc(new_size * sizeof(*s));
+       if (s == NULL) {
+               /* do not leak a header allocated above */
+               if (fresh)
+                       Free(head);
+               return NULL;
+       }
+       if (!head->s) {
+               /*
+                * New table: alias the old array to the new one with a
+                * single slot, so the loop below sees slot 0 as empty
+                * and every other slot as new.
+                */
+               head->s = s;
+               head->_size = 1;
+       }
+       fh = ft = NULL;
+       /* remap the entries, adjust the freelist */
+       for (i = 0; i < new_size; i++) {
+               s[i].id = (i >= head->_size) ? i : head->s[i].id;
+               if (i < head->_size && !empty(head, head->s[i].ptr)) {
+                       /* occupied slot: carry the object over */
+                       s[i].ptr = head->s[i].ptr;
+                       continue;
+               }
+               /* free slot: append it to the new freelist */
+               if (fh == NULL)
+                       fh = &s[i];
+               else
+                       ft->ptr = &s[i];
+               ft = &s[i];
+       }
+       head->f_head = fh;
+       head->f_tail = ft;
+
+       /* write lock on the structure, to protect the readers */
+       fh = head->s;
+       head->s = s;
+       head->_size = new_size;
+       /* release write lock */
+       if (fh != s)
+               Free(fh);
+       log("done");
+       return head;
+}
+
+/*
+ * Store object 'd' in the slot at the head of the freelist.
+ * Returns the identifier held by that slot, or -1 if there is no
+ * free slot.
+ */
+int
+ipfw_lut_insert(struct lookup_table *head, void *d)
+{
+       struct entry *slot = head->f_head;
+
+       if (slot != NULL) {
+               /* pop the slot off the freelist, then fill it */
+               head->f_head = slot->ptr;
+               slot->ptr = d;
+               head->used += 1;
+               return slot->id;
+       }
+       return -1;
+}
+
+/*
+ * Remove the object with identifier 'id' from the table.
+ * Returns the object pointer that was stored, or NULL if 'id' does not
+ * name an object currently in the table (index beyond the table, stale
+ * identifier, or slot already free).
+ * The freed slot gets a new identifier (old + M) and is appended to the
+ * tail of the freelist.
+ */
+void *
+ipfw_lut_delete(struct lookup_table *head, int id)
+{
+       int i = id & head->mask;
+       void *result;
+       struct entry *e;
+
+       if (i >= head->_size)
+               return NULL;
+       e = &head->s[i];
+       if (e->id != (uint32_t)id)
+               return NULL;
+       /*
+        * A free slot already carries the id of its next insert: without
+        * this check, deleting such an id would cut the freelist at this
+        * slot and queue the slot twice.
+        */
+       if (empty(head, e->ptr))
+               return NULL;
+       result = e->ptr;
+       /* write lock to invalidate the entry to readers */
+       e->id += head->mask + 1; /* prepare for next insert */
+       e->ptr = NULL;
+       /* release write lock */
+       if (head->f_head == NULL)
+               head->f_head = e;
+       else
+               head->f_tail->ptr = e;
+       head->f_tail = e;
+       head->used--;
+       return result;
+}
+
+/*
+ * Find the object with identifier 'id'.
+ * The lookup succeeds only if the slot selected by the low bits of 'id'
+ * is occupied and holds exactly that identifier; otherwise NULL is
+ * returned.
+ */
+void *
+ipfw_lut_lookup(struct lookup_table *head, int id)
+{
+       int i = id & head->mask;
+       struct entry *e;
+
+       if (i >= head->_size)
+               return NULL;
+       e = &head->s[i];
+       if (e->id != (uint32_t)id)
+               return NULL;
+       /* a free slot stores a freelist link here, never hand it out */
+       if (empty(head, e->ptr))
+               return NULL;
+       return e->ptr;
+}
+
+/*
+ * Debugging aid: print the table header, then one line per slot with
+ * its index, identifier, an 'E' marker for free slots, and either the
+ * stored pointer or (for a linked free slot) the index of the next
+ * free slot. Produces no output when log() is compiled out.
+ */
+void
+ipfw_lut_dump(struct lookup_table *head)
+{
+       int i;
+
+       /* casts make each argument match its conversion specifier */
+       log("head %p size %d used %d freelist %d",
+           (void *)head, head->_size, head->used, head->f_head ?
+                   (int)(head->f_head - head->s) : -1);
+       for (i = 0; i < head->_size; i++) {
+               struct entry *e = &head->s[i];
+               char ee = empty(head, e->ptr) ? 'E' : ' ';
+               log("%5d  %5d %c %p", i, (int)e->id, ee,
+                   ee == 'E' && e->ptr != NULL ?
+                   (void *)(e->ptr - head->s) : (void *)e->ptr);
+       }
+}
+
+#ifndef _KERNEL
+/*
+ * Test helper: for every slot index of the table, look up map[index]
+ * and print the identifier modulo 64, the index, and the stored value
+ * shown as a character.
+ */
+void dump_p(struct lookup_table *p, int *map)
+{
+       int slot = 0;
+
+       while (slot < p->_size) {
+               int val = (int)ipfw_lut_lookup(p, map[slot]);
+
+               log("%3d: %3d: %c", map[slot] % 64, slot, val);
+               slot++;
+       }
+}
+/*
+ * Userland self-test: insert the characters of a sentence as objects,
+ * interleaving ten insert/delete pairs per character to churn the
+ * freelist, then dump the table before and after two resize requests.
+ * Returns 0 on success, 1 if a table cannot be created or resized.
+ */
+int main(int argc, char *argv[])
+{
+       int i, j, l;
+#define S 1000
+       /* zeroed: dump_p() reads one entry per slot, more than we fill */
+       int map[S] = { 0 };
+       struct lookup_table *p;
+       struct lookup_table *p1;
+       const char *m = "nel mezzo del cammin di nostra vita mi ritrovai"
+               " in una selva oscura e la diritta via era smarrita!";
+
+       (void)argc;
+       (void)argv;
+       fprintf(stderr, "testing lookup\n");
+
+       l = strlen(m);
+
+       p = ipfw_lut_init(NULL, 120, 33);
+       if (!p)
+               return 1;
+
+       ipfw_lut_dump(p);
+       for (i = 0; i < l; i++) {
+               int x = m[i];
+               int id = ipfw_lut_insert(p, (void *)x);
+
+               map[i] = id;
+               for (j=0; j < 10; j++) {
+                       id = ipfw_lut_insert(p, (void *)'a');
+                       ipfw_lut_delete(p, id);
+               }
+       }
+       dump_p(p, map);
+       p1 = ipfw_lut_init(p, 23, 0);
+       if (!p1)
+               return 1;
+       dump_p(p1, map);
+       p1 = ipfw_lut_init(p1, 120, 0);
+       if (!p1)
+               return 1;
+       dump_p(p1, map);
+       return 0;
+}
+#endif
+/* end of file */
diff --git a/sys/netinet/ipfw/ip_fw_nat.c b/sys/netinet/ipfw/ip_fw_nat.c
new file mode 100644 (file)
index 0000000..d093924
--- /dev/null
@@ -0,0 +1,605 @@
+/*-
+ * Copyright (c) 2008 Paolo Pisati
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_fw_nat.c 200975 2009-12-25 01:15:39Z luigi $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/rwlock.h>
+
+#define        IPFW_INTERNAL   /* Access to protected data structures in ip_fw.h. */
+
+#include <netinet/libalias/alias.h>
+#include <netinet/libalias/alias_local.h>
+
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+
+#include <machine/in_cksum.h>  /* XXX for in_cksum */
+
+/* Tag of the ifaddr_event handler, saved at init and used to deregister it. */
+static VNET_DEFINE(eventhandler_tag, ifaddr_event_tag);
+#define        V_ifaddr_event_tag      VNET(ifaddr_event_tag)
+
+/*
+ * ifaddr_event handler: refresh the alias address of every nat instance
+ * whose if_name matches the interface 'ifp', using the AF_INET addresses
+ * configured on that interface. 'arg' is unused.
+ */
+static void
+ifaddr_change(void *arg, struct ifnet *ifp)
+{
+       struct ip_fw_chain *chain = &V_layer3_chain;
+       struct cfg_nat *nat;
+       struct ifaddr *ifa;
+
+       (void)arg;
+       IPFW_WLOCK(chain);
+       LIST_FOREACH(nat, &chain->nat, _next) {
+               /* Only instances using this nic as dynamic alias address. */
+               if (strncmp(nat->if_name, ifp->if_xname, IF_NAMESIZE) == 0) {
+                       if_addr_rlock(ifp);
+                       TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+                               if (ifa->ifa_addr == NULL ||
+                                   ifa->ifa_addr->sa_family != AF_INET)
+                                       continue;
+                               nat->ip = ((struct sockaddr_in *)
+                                   (ifa->ifa_addr))->sin_addr;
+                               LibAliasSetAddress(nat->lib, nat->ip);
+                       }
+                       if_addr_runlock(ifp);
+               }
+       }
+       IPFW_WUNLOCK(chain);
+}
+
+/*
+ * Clear the cached nat pointer in the action of every rule that refers
+ * to nat instance 'ix'; a negative 'ix' clears it in all nat rules.
+ * The chain write lock must be held (asserted below).
+ */
+static void
+flush_nat_ptrs(struct ip_fw_chain *chain, const int ix)
+{
+       ipfw_insn_nat *act;
+       int n;
+
+       IPFW_WLOCK_ASSERT(chain);
+       for (n = 0; n < chain->n_rules; n++) {
+               act = (ipfw_insn_nat *)ACTION_PTR(chain->map[n]);
+               /* XXX skip log and the like ? */
+               if (act->o.opcode != O_NAT || act->nat == NULL)
+                       continue;
+               if (ix < 0 || act->nat->id == ix)
+                       act->nat = NULL;
+       }
+}
+
+/*
+ * Tear down every redirect entry on 'head': delete its libalias links
+ * from n->lib, free its spool entries and its link array, then unlink
+ * and free the entry itself. Entries with an unrecognised mode are
+ * reported and left on the list untouched.
+ */
+static void
+del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head)
+{
+       struct cfg_redir *redir, *next_redir;
+       struct cfg_spool *spool, *next_spool;
+       int k, nlinks;
+
+       LIST_FOREACH_SAFE(redir, head, _next, next_redir) {
+               if (redir->mode != REDIR_PORT && redir->mode != REDIR_ADDR &&
+                   redir->mode != REDIR_PROTO) {
+                       printf("unknown redirect mode: %u\n", redir->mode);
+                       /* XXX - panic?!?!? */
+                       continue;
+               }
+               /* One alias_link per public port, a single one otherwise. */
+               nlinks = (redir->mode == REDIR_PORT) ? redir->pport_cnt : 1;
+               /* Delete all libalias redirect entry. */
+               for (k = 0; k < nlinks; k++)
+                       LibAliasRedirectDelete(n->lib, redir->alink[k]);
+               /* Del spool cfg if any. */
+               LIST_FOREACH_SAFE(spool, &redir->spool_chain, _next,
+                   next_spool) {
+                       LIST_REMOVE(spool, _next);
+                       free(spool, M_IPFW);
+               }
+               free(redir->alink, M_IPFW);
+               LIST_REMOVE(redir, _next);
+               free(redir, M_IPFW);
+       }
+}
+
+/*
+ * Rebuild the redirect (and LSNAT spool) configuration of 'ptr' from the
+ * serialized records in 'buf': ptr->redir_cnt cfg_redir records, each
+ * followed by its spool_cnt cfg_spool records.
+ * Every record is copied into freshly allocated memory, registered with
+ * libalias and hooked on ptr->redir_chain. Always returns 1; panics if
+ * libalias refuses a redirect or the mode is unknown.
+ *
+ * NOTE(review): 'off' is never checked against the size of 'buf', so the
+ * counts found in the buffer are trusted; confirm the caller validates them.
+ */
+static int
+add_redir_spool_cfg(char *buf, struct cfg_nat *ptr)
+{
+       struct cfg_redir *r, *ser_r;
+       struct cfg_spool *s, *ser_s;
+       int cnt, off, i, nlinks;
+
+       for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) {
+               ser_r = (struct cfg_redir *)&buf[off];
+               r = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO);
+               memcpy(r, ser_r, SOF_REDIR);
+               LIST_INIT(&r->spool_chain);
+               off += SOF_REDIR;
+               /*
+                * REDIR_ADDR and REDIR_PROTO store their single link in
+                * alink[0], so make room for it even if pport_cnt is 0.
+                */
+               nlinks = (r->pport_cnt > 0) ? r->pport_cnt : 1;
+               r->alink = malloc(sizeof(struct alias_link *) * nlinks,
+                   M_IPFW, M_WAITOK | M_ZERO);
+               switch (r->mode) {
+               case REDIR_ADDR:
+                       r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr,
+                           r->paddr);
+                       break;
+               case REDIR_PORT:
+                       for (i = 0 ; i < r->pport_cnt; i++) {
+                               /* If remotePort is all ports, set it to 0. */
+                               u_short remotePortCopy = r->rport + i;
+                               if (r->rport_cnt == 1 && r->rport == 0)
+                                       remotePortCopy = 0;
+                               r->alink[i] = LibAliasRedirectPort(ptr->lib,
+                                   r->laddr, htons(r->lport + i), r->raddr,
+                                   htons(remotePortCopy), r->paddr,
+                                   htons(r->pport + i), r->proto);
+                               if (r->alink[i] == NULL) {
+                                       /* flag the failure for the check below */
+                                       r->alink[0] = NULL;
+                                       break;
+                               }
+                       }
+                       break;
+               case REDIR_PROTO:
+                       r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr,
+                           r->raddr, r->paddr, r->proto);
+                       break;
+               default:
+                       printf("unknown redirect mode: %u\n", r->mode);
+                       break;
+               }
+               /* XXX perhaps return an error instead of panic ? */
+               if (r->alink[0] == NULL)
+                       panic("LibAliasRedirect* returned NULL");
+               /* LSNAT handling. */
+               for (i = 0; i < r->spool_cnt; i++) {
+                       ser_s = (struct cfg_spool *)&buf[off];
+                       /* a spool record, so size it as one */
+                       s = malloc(SOF_SPOOL, M_IPFW, M_WAITOK | M_ZERO);
+                       memcpy(s, ser_s, SOF_SPOOL);
+                       LibAliasAddServer(ptr->lib, r->alink[0],
+                           s->addr, htons(s->port));
+                       off += SOF_SPOOL;
+                       /* Hook spool entry. */
+                       LIST_INSERT_HEAD(&r->spool_chain, s, _next);
+               }
+               /* And finally hook this redir entry. */
+               LIST_INSERT_HEAD(&ptr->redir_chain, r, _next);
+       }
+       return (1);
+}
+
+/*
+ * Run packet 'm' through the libalias instance of nat 't'.
+ * The packet is first made contiguous with m_megapullup(); from then on
+ * only the returned mbuf 'mcl' is used. LibAliasIn() is used when
+ * args->oif is NULL, LibAliasOut() otherwise.
+ * On success args->m is set to the translated packet and IP_FW_NAT is
+ * returned; on failure args->m is set to NULL and IP_FW_DENY is returned.
+ */
+static int
+ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m)
+{
+       struct mbuf *mcl;
+       struct ip *ip;
+       /* XXX - libalias duct tape */
+       int ldt, retval;
+       char *c;
+
+       ldt = 0;
+       retval = 0;
+       mcl = m_megapullup(m, m->m_pkthdr.len);
+       if (mcl == NULL) {
+               args->m = NULL;
+               return (IP_FW_DENY);
+       }
+       ip = mtod(mcl, struct ip *);
+
+       /*
+        * XXX - Libalias checksum offload 'duct tape':
+        *
+        * locally generated packets have only pseudo-header checksum
+        * calculated and libalias will break it[1], so mark them for
+        * later fix.  Moreover there are cases when libalias modifies
+        * tcp packet data[2], mark them for later fix too.
+        *
+        * [1] libalias was never meant to run in kernel, so it does
+        * not have any knowledge about checksum offloading, and
+        * expects a packet with a full internet checksum.
+        * Unfortunately, packets generated locally will have just the
+        * pseudo header calculated, and when libalias tries to adjust
+        * the checksum it will actually compute a wrong value.
+        *
+        * [2] when libalias modifies tcp's data content, full TCP
+        * checksum has to be recomputed: the problem is that
+        * libalias does not have any idea about checksum offloading.
+        * To work around this, we do not do checksumming in LibAlias,
+        * but only mark the packets in th_x2 field. If we receive a
+        * marked packet, we calculate correct checksum for it
+        * aware of offloading.  Why such a terrible hack instead of
+        * recalculating checksum for each packet?
+        * Because the previous checksum was not checked!
+        * Recalculating checksums for EVERY packet will hide ALL
+        * transmission errors. Yes, marked packets still suffer from
+        * this problem. But, sigh, natd(8) has this problem, too.
+        *
+        * TODO: -make libalias mbuf aware (so
+        * it can handle delayed checksum and tso)
+        */
+
+       if (mcl->m_pkthdr.rcvif == NULL &&
+           mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
+               ldt = 1;
+
+       c = mtod(mcl, char *);
+       if (args->oif == NULL)
+               retval = LibAliasIn(t->lib, c,
+                       mcl->m_len + M_TRAILINGSPACE(mcl));
+       else
+               retval = LibAliasOut(t->lib, c,
+                       mcl->m_len + M_TRAILINGSPACE(mcl));
+       if (retval == PKT_ALIAS_RESPOND) {
+               /*
+                * Flag the packet we hand back (mcl), not 'm': the
+                * original chain was given to m_megapullup() above.
+                */
+               mcl->m_flags |= M_SKIP_FIREWALL;
+               retval = PKT_ALIAS_OK;
+       }
+       if (retval != PKT_ALIAS_OK &&
+           retval != PKT_ALIAS_FOUND_HEADER_FRAGMENT) {
+               /* XXX - should i add some logging? */
+               m_free(mcl);
+               args->m = NULL;
+               return (IP_FW_DENY);
+       }
+       /* libalias may have changed the packet length */
+       mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len);
+
+       /*
+        * XXX - libalias checksum offload
+        * 'duct tape' (see above)
+        */
+
+       if ((ip->ip_off & htons(IP_OFFMASK)) == 0 &&
+           ip->ip_p == IPPROTO_TCP) {
+               struct tcphdr   *th;
+
+               th = (struct tcphdr *)(ip + 1);
+               if (th->th_x2)
+                       ldt = 1;
+       }
+
+       if (ldt) {
+               struct tcphdr   *th;
+               struct udphdr   *uh;
+               u_short cksum;
+
+               /* ip_len is kept in host order while the sum is rebuilt */
+               ip->ip_len = ntohs(ip->ip_len);
+               cksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+                   htons(ip->ip_p + ip->ip_len - (ip->ip_hl << 2)));
+
+               switch (ip->ip_p) {
+               case IPPROTO_TCP:
+                       th = (struct tcphdr *)(ip + 1);
+                       /*
+                        * Maybe it was set in
+                        * libalias...
+                        */
+                       th->th_x2 = 0;
+                       th->th_sum = cksum;
+                       mcl->m_pkthdr.csum_data =
+                           offsetof(struct tcphdr, th_sum);
+                       break;
+               case IPPROTO_UDP:
+                       uh = (struct udphdr *)(ip + 1);
+                       uh->uh_sum = cksum;
+                       mcl->m_pkthdr.csum_data =
+                           offsetof(struct udphdr, uh_sum);
+                       break;
+               }
+               /* No hw checksum offloading: do it ourselves */
+               if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) {
+                       in_delayed_cksum(mcl);
+                       mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+               }
+               ip->ip_len = htons(ip->ip_len);
+       }
+       args->m = mcl;
+       return (IP_FW_NAT);
+}
+
+/*
+ * Return the nat instance on list 'l' whose id equals 'nat_id',
+ * or NULL if there is none.
+ */
+static struct cfg_nat *
+lookup_nat(struct nat_list *l, int nat_id)
+{
+       struct cfg_nat *cur;
+
+       LIST_FOREACH(cur, l, _next)
+               if (cur->id == nat_id)
+                       return cur;
+       return NULL;
+}
+
+/*
+ * Create or reconfigure a nat instance from the serialized configuration
+ * supplied through 'sopt': a struct cfg_nat followed by its redirect and
+ * spool records.
+ * An existing instance with the same id is unhooked from the chain (and
+ * the rules' cached pointers to it are flushed) while it is updated, then
+ * hooked back.
+ * Returns 0 on success, the sooptcopyin() error if the request cannot be
+ * copied in, ENOSPC if the instance cannot be allocated, EINVAL if
+ * libalias cannot be initialized.
+ */
+static int
+ipfw_nat_cfg(struct sockopt *sopt)
+{
+       struct cfg_nat *ptr, *ser_n;
+       char *buf;
+       struct ip_fw_chain *chain = &V_layer3_chain;
+       int error;
+
+       buf = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO);
+       error = sooptcopyin(sopt, buf, NAT_BUF_LEN, sizeof(struct cfg_nat));
+       if (error != 0) {
+               /* do not configure anything from a buffer we failed to read */
+               free(buf, M_IPFW);
+               return (error);
+       }
+       ser_n = (struct cfg_nat *)buf;
+
+       /* check valid parameter ser_n->id > 0 ? */
+       /*
+        * Find/create nat rule.
+        */
+       IPFW_WLOCK(chain);
+       ptr = lookup_nat(&chain->nat, ser_n->id);
+       if (ptr == NULL) {
+               /* New rule: allocate and init new instance. */
+               ptr = malloc(sizeof(struct cfg_nat),
+                   M_IPFW, M_NOWAIT | M_ZERO);
+               if (ptr == NULL) {
+                       IPFW_WUNLOCK(chain);
+                       free(buf, M_IPFW);
+                       return (ENOSPC);
+               }
+               ptr->lib = LibAliasInit(NULL);
+               if (ptr->lib == NULL) {
+                       IPFW_WUNLOCK(chain);
+                       free(ptr, M_IPFW);
+                       free(buf, M_IPFW);
+                       return (EINVAL);
+               }
+               LIST_INIT(&ptr->redir_chain);
+       } else {
+               /* Entry already present: temporarily unhook it. */
+               LIST_REMOVE(ptr, _next);
+               flush_nat_ptrs(chain, ser_n->id);
+       }
+       IPFW_WUNLOCK(chain);
+
+       /*
+        * Basic nat configuration.
+        */
+       ptr->id = ser_n->id;
+       /*
+        * XXX - what if this rule doesn't nat any ip and just
+        * redirect?
+        * do we set aliasaddress to 0.0.0.0?
+        */
+       ptr->ip = ser_n->ip;
+       ptr->redir_cnt = ser_n->redir_cnt;
+       ptr->mode = ser_n->mode;
+       LibAliasSetMode(ptr->lib, ser_n->mode, ser_n->mode);
+       LibAliasSetAddress(ptr->lib, ptr->ip);
+       memcpy(ptr->if_name, ser_n->if_name, IF_NAMESIZE);
+
+       /*
+        * Redir and LSNAT configuration.
+        */
+       /* Delete old cfgs. */
+       del_redir_spool_cfg(ptr, &ptr->redir_chain);
+       /* Add new entries. */
+       add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr);
+       free(buf, M_IPFW);
+       IPFW_WLOCK(chain);
+       LIST_INSERT_HEAD(&chain->nat, ptr, _next);
+       IPFW_WUNLOCK(chain);
+       return (0);
+}
+
+/*
+ * Delete the nat instance whose id is passed (as an int) through 'sopt'.
+ * The instance is unhooked and the rules' cached pointers to it are
+ * flushed under the chain write lock; its redirects, libalias state and
+ * memory are released afterwards.
+ * Returns 0 on success, the sooptcopyin() error if the id cannot be
+ * copied in, EINVAL if no instance has that id.
+ */
+static int
+ipfw_nat_del(struct sockopt *sopt)
+{
+       struct cfg_nat *ptr;
+       struct ip_fw_chain *chain = &V_layer3_chain;
+       int i, error;
+
+       error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
+       if (error != 0)
+               return (error); /* 'i' was not filled in */
+       /* XXX validate i */
+       IPFW_WLOCK(chain);
+       ptr = lookup_nat(&chain->nat, i);
+       if (ptr == NULL) {
+               IPFW_WUNLOCK(chain);
+               return (EINVAL);
+       }
+       LIST_REMOVE(ptr, _next);
+       flush_nat_ptrs(chain, i);
+       IPFW_WUNLOCK(chain);
+       del_redir_spool_cfg(ptr, &ptr->redir_chain);
+       LibAliasUninit(ptr->lib);
+       free(ptr, M_IPFW);
+       return (0);
+}
+
+/*
+ * Serialize the configuration of all nat instances into a NAT_BUF_LEN
+ * buffer and copy it out through 'sopt'. Layout: an int with the number
+ * of instances, then for each instance its cfg_nat record followed by
+ * its cfg_redir records, each followed by its cfg_spool records.
+ * Returns 0 on success, ENOSPC if the buffer is too small, or the
+ * sooptcopyout() error.
+ */
+static int
+ipfw_nat_get_cfg(struct sockopt *sopt)
+{
+       uint8_t *data;
+       struct cfg_nat *n;
+       struct cfg_redir *r;
+       struct cfg_spool *s;
+       int nat_cnt, off;
+       struct ip_fw_chain *chain;
+       int err = ENOSPC;
+
+       chain = &V_layer3_chain;
+       nat_cnt = 0;
+       off = sizeof(nat_cnt);  /* room for the counter, written last */
+
+       data = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO);
+       IPFW_RLOCK(chain);
+       /* Serialize all the data. */
+       LIST_FOREACH(n, &chain->nat, _next) {
+               nat_cnt++;
+               if (off + SOF_NAT >= NAT_BUF_LEN)
+                       goto nospace;
+               bcopy(n, &data[off], SOF_NAT);
+               off += SOF_NAT;
+               LIST_FOREACH(r, &n->redir_chain, _next) {
+                       if (off + SOF_REDIR >= NAT_BUF_LEN)
+                               goto nospace;
+                       bcopy(r, &data[off], SOF_REDIR);
+                       off += SOF_REDIR;
+                       LIST_FOREACH(s, &r->spool_chain, _next) {
+                               if (off + SOF_SPOOL >= NAT_BUF_LEN)
+                                       goto nospace;
+                               bcopy(s, &data[off], SOF_SPOOL);
+                               off += SOF_SPOOL;
+                       }
+               }
+       }
+       err = 0; /* all good */
+nospace:
+       IPFW_RUNLOCK(chain);
+       if (err == 0) {
+               bcopy(&nat_cnt, data, sizeof(nat_cnt));
+               /* report a failed copyout instead of claiming success */
+               err = sooptcopyout(sopt, data, NAT_BUF_LEN);
+       } else {
+               printf("serialized data buffer not big enough:"
+                   "please increase NAT_BUF_LEN\n");
+       }
+       free(data, M_IPFW);
+       return (err);
+}
+
+/*
+ * Copy out, through 'sopt', the libalias log of every nat instance that
+ * has one: for each such instance an int with its id followed by
+ * LIBALIAS_BUF_SIZE bytes of log text.
+ * Returns 0 on success, ENOSPC if the buffer cannot be allocated, or the
+ * sooptcopyout() error.
+ */
+static int
+ipfw_nat_get_log(struct sockopt *sopt)
+{
+       uint8_t *data;
+       struct cfg_nat *ptr;
+       int i, size, error;
+       struct ip_fw_chain *chain;
+
+       chain = &V_layer3_chain;
+
+       IPFW_RLOCK(chain);
+       /* one pass to count, one to copy the data */
+       i = 0;
+       LIST_FOREACH(ptr, &chain->nat, _next) {
+               if (ptr->lib->logDesc == NULL)
+                       continue;
+               i++;
+       }
+       size = i * (LIBALIAS_BUF_SIZE + sizeof(int));
+       data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO);
+       if (data == NULL) {
+               IPFW_RUNLOCK(chain);
+               return (ENOSPC);
+       }
+       /* from here on 'i' is the write offset into data[] */
+       i = 0;
+       LIST_FOREACH(ptr, &chain->nat, _next) {
+               if (ptr->lib->logDesc == NULL)
+                       continue;
+               bcopy(&ptr->id, &data[i], sizeof(int));
+               i += sizeof(int);
+               bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE);
+               i += LIBALIAS_BUF_SIZE;
+       }
+       IPFW_RUNLOCK(chain);
+       /* report a failed copyout instead of claiming success */
+       error = sooptcopyout(sopt, data, size);
+       free(data, M_IPFW);
+       return (error);
+}
+
+/*
+ * Module load: publish the nat entry points through the ipfw hook
+ * pointers (under the chain write lock), then register ifaddr_change()
+ * as an ifaddr_event handler and save its tag.
+ */
+static void
+ipfw_nat_init(void)
+{
+       struct ip_fw_chain *chain = &V_layer3_chain;
+
+       IPFW_WLOCK(chain);
+       /* init ipfw hooks */
+       ipfw_nat_ptr = ipfw_nat;
+       lookup_nat_ptr = lookup_nat;
+       ipfw_nat_cfg_ptr = ipfw_nat_cfg;
+       ipfw_nat_del_ptr = ipfw_nat_del;
+       ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg;
+       ipfw_nat_get_log_ptr = ipfw_nat_get_log;
+       IPFW_WUNLOCK(chain);
+
+       V_ifaddr_event_tag = EVENTHANDLER_REGISTER(ifaddr_event,
+           ifaddr_change, NULL, EVENTHANDLER_PRI_ANY);
+}
+
+/*
+ * Module unload: under the chain write lock, release every nat instance
+ * (redirects, libalias state, memory), deregister the ifaddr_event
+ * handler, clear the nat pointers cached in the rules and reset all the
+ * ipfw hook pointers to NULL.
+ */
+static void
+ipfw_nat_destroy(void)
+{
+       struct cfg_nat *ptr, *ptr_temp;
+       struct ip_fw_chain *chain;
+
+       chain = &V_layer3_chain;
+       IPFW_WLOCK(chain);
+       /* _SAFE variant: each entry is unlinked and freed inside the loop */
+       LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) {
+               LIST_REMOVE(ptr, _next);
+               del_redir_spool_cfg(ptr, &ptr->redir_chain);
+               LibAliasUninit(ptr->lib);
+               free(ptr, M_IPFW);
+       }
+       EVENTHANDLER_DEREGISTER(ifaddr_event, V_ifaddr_event_tag);
+       /* the rules still cache pointers to the instances freed above */
+       flush_nat_ptrs(chain, -1 /* flush all */);
+       /* deregister ipfw_nat */
+       ipfw_nat_ptr = NULL;
+       lookup_nat_ptr = NULL;
+       ipfw_nat_cfg_ptr = NULL;
+       ipfw_nat_del_ptr = NULL;
+       ipfw_nat_get_cfg_ptr = NULL;
+       ipfw_nat_get_log_ptr = NULL;
+       IPFW_WUNLOCK(chain);
+}
+
+/*
+ * Module event handler: initialize on MOD_LOAD, tear down on MOD_UNLOAD.
+ * Returns 0 for those two events and EOPNOTSUPP for any other.
+ * 'mod' and 'unused' are not used.
+ */
+static int
+ipfw_nat_modevent(module_t mod, int type, void *unused)
+{
+       if (type == MOD_LOAD)
+               ipfw_nat_init();
+       else if (type == MOD_UNLOAD)
+               ipfw_nat_destroy();
+       else
+               return EOPNOTSUPP;
+       return 0;
+}
+
+/* Module descriptor: name, event handler, and no extra argument. */
+static moduledata_t ipfw_nat_mod = {
+       "ipfw_nat",
+       ipfw_nat_modevent,
+       0
+};
+
+/* Register the module and declare its dependencies on libalias and ipfw. */
+DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
+MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1);
+MODULE_DEPEND(ipfw_nat, ipfw, 2, 2, 2);
+MODULE_VERSION(ipfw_nat, 1);
+/* end of file */
diff --git a/sys/netinet/ipfw/ip_fw_pfil.c b/sys/netinet/ipfw/ip_fw_pfil.c
new file mode 100644 (file)
index 0000000..a125ef2
--- /dev/null
@@ -0,0 +1,415 @@
+/*-
+ * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_pfil.c 200601 2009-12-16 10:48:40Z luigi $");
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif /* KLD_MODULE */
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/pfil.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netgraph/ng_ipfw.h>
+
+#include <machine/in_cksum.h>
+
+/* Enable flag for the AF_INET hook (see ipfw_chg_hook); starts at 1. */
+static VNET_DEFINE(int, fw_enable) = 1;
+#define V_fw_enable    VNET(fw_enable)
+
+#ifdef INET6
+/* Enable flag for the AF_INET6 hook (see ipfw_chg_hook); starts at 1. */
+static VNET_DEFINE(int, fw6_enable) = 1;
+#define V_fw6_enable   VNET(fw6_enable)
+#endif
+
+int ipfw_chg_hook(SYSCTL_HANDLER_ARGS);
+
+/* Forward declarations. */
+static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int);
+
+#ifdef SYSCTL_NODE
+
+/*
+ * Publish the two enable flags as read-write "enable" sysctls under
+ * _net_inet_ip_fw and, with INET6, _net_inet6_ip6_fw. Both are served
+ * by ipfw_chg_hook(), which tells them apart by the variable address
+ * it receives as arg1.
+ */
+SYSBEGIN(f1)
+
+SYSCTL_DECL(_net_inet_ip_fw);
+SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0,
+    ipfw_chg_hook, "I", "Enable ipfw");
+#ifdef INET6
+SYSCTL_DECL(_net_inet6_ip6_fw);
+SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0,
+    ipfw_chg_hook, "I", "Enable ipfw+6");
+#endif /* INET6 */
+
+SYSEND
+
+#endif /* SYSCTL_NODE */
+
+/*
+ * The pfilter hook to pass packets to ipfw_chk and then to
+ * dummynet, divert, netgraph or other modules.
+ * The packet may be consumed.
+ */
+int
+ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
+    struct inpcb *inp)
+{
+       struct ip_fw_args args;
+       struct m_tag *tag;
+       int ipfw;
+       int ret;
+
+       /* all the processing now uses ip_len in net format */
+       if (mtod(*m0, struct ip *)->ip_v == 4)
+               SET_NET_IPLEN(mtod(*m0, struct ip *));
+
+       /* convert dir to IPFW values */
+       dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT;
+       bzero(&args, sizeof(args));
+
+again:
+       /*
+        * extract and remove the tag if present. If we are left
+        * with onepass, optimize the outgoing path.
+        */
+       tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL);
+       if (tag != NULL) {
+               args.rule = *((struct ipfw_rule_ref *)(tag+1));
+               m_tag_delete(*m0, tag);
+               if (args.rule.info & IPFW_ONEPASS) {
+                       SET_HOST_IPLEN(mtod(*m0, struct ip *));
+                       return 0;
+               }
+       }
+
+       args.m = *m0;
+       args.oif = dir == DIR_OUT ? ifp : NULL;
+       args.inp = inp;
+
+       ipfw = ipfw_chk(&args);
+       *m0 = args.m;
+
+       KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL",
+           __func__));
+
+       /* breaking out of the switch means drop */
+       ret = 0;        /* default return value for pass */
+       switch (ipfw) {
+       case IP_FW_PASS:
+               /* next_hop may be set by ipfw_chk */
+               if (args.next_hop == NULL)
+                       break; /* pass */
+#ifndef IPFIREWALL_FORWARD
+               ret = EACCES;
+#else
+           {
+               struct m_tag *fwd_tag;
+
+               /* Incoming packets should not be tagged so we do not
+                * m_tag_find. Outgoing packets may be tagged, so we
+                * reuse the tag if present.
+                */
+               fwd_tag = (dir == DIR_IN) ? NULL :
+                       m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL);
+               if (fwd_tag != NULL) {
+                       m_tag_unlink(*m0, fwd_tag);
+               } else {
+                       fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD,
+                               sizeof(struct sockaddr_in), M_NOWAIT);
+                       if (fwd_tag == NULL) {
+                               ret = EACCES;
+                               break; /* i.e. drop */
+                       }
+               }
+               bcopy(args.next_hop, (fwd_tag+1), sizeof(struct sockaddr_in));
+               m_tag_prepend(*m0, fwd_tag);
+
+               if (in_localip(args.next_hop->sin_addr))
+                       (*m0)->m_flags |= M_FASTFWD_OURS;
+           }
+#endif
+               break;
+
+       case IP_FW_DENY:
+               ret = EACCES;
+               break; /* i.e. drop */
+
+       case IP_FW_DUMMYNET:
+               ret = EACCES;
+               if (ip_dn_io_ptr == NULL)
+                       break; /* i.e. drop */
+               if (mtod(*m0, struct ip *)->ip_v == 4)
+                       ret = ip_dn_io_ptr(m0, dir, &args);
+               else if (mtod(*m0, struct ip *)->ip_v == 6)
+                       ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args);
+               else
+                       break; /* drop it */
+               /*
+                * XXX should read the return value.
+                * dummynet normally eats the packet and sets *m0=NULL
+                * unless the packet can be sent immediately. In this
+                * case args is updated and we should re-run the
+                * check without clearing args.
+                */
+               if (*m0 != NULL)
+                       goto again;
+               break;
+
+       case IP_FW_TEE:
+       case IP_FW_DIVERT:
+               if (ip_divert_ptr == NULL) {
+                       ret = EACCES;
+                       break; /* i.e. drop */
+               }
+               ret = ipfw_divert(m0, dir, &args.rule,
+                       (ipfw == IP_FW_TEE) ? 1 : 0);
+               /* continue processing for the original packet (tee). */
+               if (*m0)
+                       goto again;
+               break;
+
+       case IP_FW_NGTEE:
+       case IP_FW_NETGRAPH:
+               if (ng_ipfw_input_p == NULL) {
+                       ret = EACCES;
+                       break; /* i.e. drop */
+               }
+               ret = ng_ipfw_input_p(m0, dir, &args,
+                       (ipfw == IP_FW_NGTEE) ? 1 : 0);
+               if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */
+                       goto again;     /* continue with packet */
+               break;
+
+       case IP_FW_NAT:
+               /* honor one-pass in case of successful nat */
+               if (V_fw_one_pass)
+                       break; /* ret is already 0 */
+               goto again;
+
+       case IP_FW_REASS:
+               goto again;             /* continue with packet */
+       
+       default:
+               KASSERT(0, ("%s: unknown retval", __func__));
+       }
+
+       if (ret != 0) {
+               if (*m0)
+                       FREE_PKT(*m0);
+               *m0 = NULL;
+       }
+       if (*m0 && mtod(*m0, struct ip *)->ip_v == 4)
+               SET_HOST_IPLEN(mtod(*m0, struct ip *));
+       return ret;
+}
+
+/*
+ * Deliver a packet to the divert hook; returns 1 on error, 0 on success.
+ *
+ * ipfw_chk() has already put the divert tag on the packet. With tee
+ * set a duplicate is diverted and *m0 is left to the caller; without
+ * tee the original is taken over and *m0 is cleared.
+ */
+static int
+ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule,
+       int tee)
+{
+       struct mbuf *pkt;       /* the mbuf that goes to divert */
+       struct m_tag *mtag;
+       struct ip *iph;
+
+       if (tee) {
+               /*
+                * Work on a copy. If the copy cannot be made, give up
+                * on diverting and let the caller go on with *m0.
+                */
+               pkt = m_dup(*m0, M_DONTWAIT);
+               if (pkt == NULL)
+                       return 1;
+       } else {
+               /* take over the original mbuf */
+               pkt = *m0;
+               *m0 = NULL;
+       }
+
+       /*
+        * Listeners normally expect whole packets, but reassembly is
+        * only possible when we own the original (non-tee). Listeners
+        * on a tee rule may therefore see fragments; a 'reass' rule
+        * placed before the 'tee' avoids that.
+        */
+       iph = mtod(pkt, struct ip *);
+       if (tee == 0 && (ntohs(iph->ip_off) & (IP_MF | IP_OFFMASK)) != 0) {
+               struct mbuf *whole;
+               int hdrlen;
+
+               /* ip_reass wants host order */
+               SET_HOST_IPLEN(iph);
+               whole = ip_reass(pkt);
+               /* NULL means ip_reass kept the fragment: not an error */
+               if (whole == NULL)
+                       return 0;
+               /*
+                * Put the header back in network byte order and
+                * recompute its checksum after reassembly.
+                */
+               iph = mtod(whole, struct ip *);
+               hdrlen = iph->ip_hl << 2;
+               SET_NET_IPLEN(iph);
+               iph->ip_sum = 0;
+               if (hdrlen == sizeof(struct ip))
+                       iph->ip_sum = in_cksum_hdr(iph);
+               else
+                       iph->ip_sum = in_cksum(whole, hdrlen);
+               pkt = whole;
+       }
+
+       /* record the matching rule so the packet can be reinjected */
+       mtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
+                   sizeof(struct ipfw_rule_ref), M_NOWAIT);
+       if (mtag == NULL) {
+               FREE_PKT(pkt);
+               return 1;
+       }
+       *((struct ipfw_rule_ref *)(mtag+1)) = *rule;
+       m_tag_prepend(pkt, mtag);
+
+       /* hand the packet over */
+       ip_divert_ptr(pkt, incoming);
+       return 0;
+}
+
+/*
+ * Attach (onoff != 0) or detach (onoff == 0) ipfw_check_hook for the
+ * protocol family pf, in both directions.
+ * Returns ENOENT if the family has no pfil head, 0 otherwise; the
+ * result of the add/remove call itself is discarded.
+ */
+static int
+ipfw_hook(int onoff, int pf)
+{
+       struct pfil_head *head = pfil_head_get(PFIL_TYPE_AF, pf);
+
+       if (head == NULL)
+               return ENOENT;
+
+       if (onoff)
+               (void) pfil_add_hook(ipfw_check_hook, NULL,
+                   PFIL_IN | PFIL_OUT | PFIL_WAITOK, head);
+       else
+               (void) pfil_remove_hook(ipfw_check_hook, NULL,
+                   PFIL_IN | PFIL_OUT | PFIL_WAITOK, head);
+
+       return 0;
+}
+
+int
+ipfw_attach_hooks(int arg)
+{
+       int error = 0;
+
+       if (arg == 0) /* detach */
+               ipfw_hook(0, AF_INET);
+       else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) {
+                error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */
+                printf("ipfw_hook() error\n");
+        }
+#ifdef INET6
+       if (arg == 0) /* detach */
+               ipfw_hook(0, AF_INET6);
+       else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) {
+                error = ENOENT;
+                printf("ipfw6_hook() error\n");
+        }
+#endif
+       return error;
+}
+
+int
+ipfw_chg_hook(SYSCTL_HANDLER_ARGS)
+{
+       int enable;
+       int oldenable;
+       int error;
+       int af;
+
+       if (arg1 == &VNET_NAME(fw_enable)) {
+               enable = V_fw_enable;
+               af = AF_INET;
+       }
+#ifdef INET6
+       else if (arg1 == &VNET_NAME(fw6_enable)) {
+               enable = V_fw6_enable;
+               af = AF_INET6;
+       }
+#endif
+       else 
+               return (EINVAL);
+
+       oldenable = enable;
+
+       error = sysctl_handle_int(oidp, &enable, 0, req);
+
+       if (error)
+               return (error);
+
+       enable = (enable) ? 1 : 0;
+
+       if (enable == oldenable)
+               return (0);
+
+       error = ipfw_hook(enable, af);
+       if (error)
+               return (error);
+       if (af == AF_INET)
+               V_fw_enable = enable;
+#ifdef INET6
+       else if (af == AF_INET6)
+               V_fw6_enable = enable;
+#endif
+
+       return (0);
+}
+/* end of file */
diff --git a/sys/netinet/ipfw/ip_fw_private.h b/sys/netinet/ipfw/ip_fw_private.h
new file mode 100644 (file)
index 0000000..334face
--- /dev/null
@@ -0,0 +1,301 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/netinet/ipfw/ip_fw_private.h 200601 2009-12-16 10:48:40Z luigi $
+ */
+
+#ifndef _IPFW2_PRIVATE_H
+#define _IPFW2_PRIVATE_H
+
+/*
+ * Internal constants and data structures used by ipfw components
+ * and not meant to be exported outside the kernel.
+ */
+
+#ifdef _KERNEL
+
+/*
+ * For platforms that do not have SYSCTL support, we wrap the
+ * SYSCTL_* into a function (one per file) to collect the values
+ * into an array at module initialization. The wrapping macros,
+ * SYSBEGIN() and SYSEND, are empty in the default case.
+ */
+#ifndef SYSBEGIN
+#define SYSBEGIN(x)
+#endif
+#ifndef SYSEND
+#define SYSEND
+#endif
+
+/*
+ * Return values from ipfw_chk().
+ * ipfw_check_hook() in ip_fw_pfil.c acts on them as follows:
+ * PASS lets the packet through (tagging it when a next hop was set),
+ * DENY drops it, DIVERT/TEE go to the divert hook, DUMMYNET to the
+ * dummynet hook, NETGRAPH/NGTEE to the netgraph hook, REASS re-runs
+ * the check, and NAT re-runs the check unless one_pass is set.
+ */
+enum {
+       IP_FW_PASS = 0,
+       IP_FW_DENY,
+       IP_FW_DIVERT,
+       IP_FW_TEE,
+       IP_FW_DUMMYNET,
+       IP_FW_NETGRAPH,
+       IP_FW_NGTEE,
+       IP_FW_NAT,
+       IP_FW_REASS,
+};
+
+/*
+ * Structure for collecting parameters to dummynet for ip6_output forwarding
+ * NOTE(review): the "_or" suffix presumably stands for "original",
+ * i.e. the values in effect when the packet was handed to dummynet --
+ * confirm against the dummynet/ip6_output callers.
+ */
+struct _ip6dn_args {
+       struct ip6_pktopts *opt_or;
+       struct route_in6 ro_or;
+       int flags_or;
+       struct ip6_moptions *im6o_or;
+       struct ifnet *origifp_or;
+       struct ifnet *ifp_or;
+       struct sockaddr_in6 dst_or;
+       u_long mtu_or;
+       struct route_in6 ro_pmtu_or;
+};
+
+
+/*
+ * Arguments for calling ipfw_chk() and dummynet_io(). We put them
+ * all into a structure because this way it is easier and more
+ * efficient to pass variables around and extend the interface.
+ */
+struct ip_fw_args {
+       struct mbuf     *m;             /* the mbuf chain               */
+       struct ifnet    *oif;           /* output interface             */
+       struct sockaddr_in *next_hop;   /* forward address              */
+
+       /*
+        * On return, it points to the matching rule.
+        * On entry, rule.slot > 0 means the info is valid and
+        * contains the starting rule for an ipfw search.
+        * If chain_id == chain->id && slot > 0 then jump to that slot.
+        * Otherwise, we locate the first rule >= rulenum:rule_id
+        */
+       struct ipfw_rule_ref rule;      /* match/restart info           */
+
+       struct ether_header *eh;        /* for bridged packets          */
+
+       struct ipfw_flow_id f_id;       /* grabbed from IP header       */
+       //uint32_t      cookie;         /* a cookie depending on rule action */
+       struct inpcb    *inp;           /* as received by the pfil hook */
+
+       struct _ip6dn_args      dummypar; /* dummynet->ip6_output */
+       struct sockaddr_in hopstore;    /* store here if cannot use a pointer */
+};
+
+MALLOC_DECLARE(M_IPFW);
+
+/*
+ * Hooks sometimes need to know the direction of the packet
+ * (divert, dummynet, netgraph, ...)
+ * We use a generic definition here, with bits 0-1 indicating the
+ * direction, bit 2 indicating layer 2 or 3, and bits 3-4 indicating
+ * the specific protocol (if necessary).
+ */
+enum {
+       DIR_MASK =      0x3,
+       DIR_OUT =       0,
+       DIR_IN =        1,
+       DIR_FWD =       2,
+       DIR_DROP =      3,
+       PROTO_LAYER2 =  0x4, /* set for layer 2 */
+       /* PROTO_DEFAULT = 0, */
+       PROTO_IPV4 =    0x08,
+       PROTO_IPV6 =    0x10,
+       PROTO_IFB =     0x0c, /* layer2 + ifbridge */
+   /*  PROTO_OLDBDG =  0x14, unused, old bridge */
+};
+
+/*
+ * wrapper for freeing a packet, in case we need to do more work.
+ * A FREE_PKT defined earlier wins. Otherwise Linux and Windows builds
+ * pass the packet to netisr_dispatch() with -1 as first argument, and
+ * all other builds call m_freem().
+ */
+#ifndef FREE_PKT
+#if defined(__linux__) || defined(_WIN32)
+#define FREE_PKT(m)    netisr_dispatch(-1, m)
+#else
+#define FREE_PKT(m)    m_freem(m)
+#endif
+#endif /* !FREE_PKT */
+
+/*
+ * Function definitions.
+ */
+
+/* attach (arg = 1) or detach (arg = 0) hooks */
+int ipfw_attach_hooks(int);
+#ifdef NOTYET
+void ipfw_nat_destroy(void);
+#endif
+
+/* In ip_fw_log.c */
+struct ip;
+void ipfw_log_bpf(int);
+void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
+       struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
+       struct ip *ip);
+VNET_DECLARE(u_int64_t, norule_counter);
+#define        V_norule_counter        VNET(norule_counter)
+VNET_DECLARE(int, verbose_limit);
+#define        V_verbose_limit         VNET(verbose_limit)
+
+/* In ip_fw_dynamic.c */
+
+enum { /* result for matching dynamic rules */
+       MATCH_REVERSE = 0,
+       MATCH_FORWARD,
+       MATCH_NONE,
+       MATCH_UNKNOWN,
+};
+
+/*
+ * The lock for dynamic rules is only used once outside the file,
+ * and only to release the result of lookup_dyn_rule().
+ * Eventually we may implement it with a callback on the function.
+ */
+void ipfw_dyn_unlock(void);
+
+struct tcphdr;
+struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *,
+    u_int32_t, u_int32_t, int);
+int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
+    struct ip_fw_args *args, uint32_t tablearg);
+ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt,
+       int *match_direction, struct tcphdr *tcp);
+void ipfw_remove_dyn_children(struct ip_fw *rule);
+void ipfw_get_dynamic(char **bp, const char *ep);
+
+void ipfw_dyn_attach(void);    /* uma_zcreate .... */
+void ipfw_dyn_detach(void);    /* uma_zdestroy ... */
+void ipfw_dyn_init(void);      /* per-vnet initialization */
+void ipfw_dyn_uninit(int);     /* per-vnet deinitialization */
+int ipfw_dyn_len(void);
+
+/* common variables */
+VNET_DECLARE(int, fw_one_pass);
+#define        V_fw_one_pass           VNET(fw_one_pass)
+
+VNET_DECLARE(int, fw_verbose);
+#define        V_fw_verbose            VNET(fw_verbose)
+
+VNET_DECLARE(struct ip_fw_chain, layer3_chain);
+#define        V_layer3_chain          VNET(layer3_chain)
+
+VNET_DECLARE(u_int32_t, set_disable);
+#define        V_set_disable           VNET(set_disable)
+
+VNET_DECLARE(int, autoinc_step);
+#define V_autoinc_step         VNET(autoinc_step)
+
+/*
+ * State of one ruleset: the rules, the lookup map over them, the NAT
+ * instances, the tables and the two locks protecting all of it.
+ */
+struct ip_fw_chain {
+       struct ip_fw    *rules;         /* list of rules */
+       struct ip_fw    *reap;          /* list of rules to reap */
+       struct ip_fw    *default_rule;
+       int             n_rules;        /* number of static rules */
+       int             static_len;     /* total len of static rules */
+       struct ip_fw    **map;          /* array of rule ptrs to ease lookup */
+       LIST_HEAD(nat_list, cfg_nat) nat;       /* list of nat entries */
+       struct radix_node_head *tables[IPFW_TABLES_MAX];
+#if defined( __linux__ ) || defined( _WIN32 )
+       spinlock_t rwmtx;               /* taken by IPFW_RLOCK/IPFW_WLOCK */
+       spinlock_t uh_lock;             /* taken by IPFW_UH_RLOCK/IPFW_UH_WLOCK */
+#else
+       struct rwlock   rwmtx;          /* taken by IPFW_RLOCK/IPFW_WLOCK */
+       struct rwlock   uh_lock;        /* lock for upper half */
+#endif
+       uint32_t        id;             /* ruleset id */
+};
+
+struct sockopt;        /* used by tcp_var.h */
+
+/*
+ * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c
+ * so the variable and the macros must be here.
+ *
+ * Each chain carries two rwlocks: rwmtx, driven by the IPFW_* macros,
+ * and uh_lock (the "upper half" lock), driven by the IPFW_UH_* macros.
+ */
+
+/* initialize both locks of a chain */
+#define        IPFW_LOCK_INIT(_chain) do {                     \
+       rw_init(&(_chain)->rwmtx, "IPFW static rules"); \
+       rw_init(&(_chain)->uh_lock, "IPFW UH lock");    \
+       } while (0)
+
+/* destroy both locks of a chain */
+#define        IPFW_LOCK_DESTROY(_chain) do {                  \
+       rw_destroy(&(_chain)->rwmtx);                   \
+       rw_destroy(&(_chain)->uh_lock);                 \
+       } while (0)
+
+/* assert that rwmtx is held for writing */
+#define        IPFW_WLOCK_ASSERT(_chain)       rw_assert(&(_chain)->rwmtx, RA_WLOCKED)
+
+/* read/write acquire and release of rwmtx */
+#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx)
+#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx)
+#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx)
+#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx)
+
+/* read/write acquire and release of the upper-half lock */
+#define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock)
+#define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock)
+#define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock)
+#define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock)
+
+/* In ip_fw_sockopt.c */
+int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id);
+int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule);
+int ipfw_ctl(struct sockopt *sopt);
+int ipfw_chk(struct ip_fw_args *args);
+void ipfw_reap_rules(struct ip_fw *head);
+
+/* In ip_fw_pfil */
+int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
+     struct inpcb *inp);
+
+/* In ip_fw_table.c */
+struct radix_node;
+int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint32_t *val);
+int ipfw_init_tables(struct ip_fw_chain *ch);
+void ipfw_destroy_tables(struct ip_fw_chain *ch);
+int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl);
+int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen, uint32_t value);
+int ipfw_dump_table_entry(struct radix_node *rn, void *arg);
+int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen);
+int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt);
+int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl);
+
+/* In ip_fw_nat.c -- XXX to be moved to ip_var.h */
+
+extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
+
+typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *);
+typedef int ipfw_nat_cfg_t(struct sockopt *);
+
+extern ipfw_nat_t *ipfw_nat_ptr;
+#define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL)
+
+extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_del_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
+
+#endif /* _KERNEL */
+#endif /* _IPFW2_PRIVATE_H */
diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c
new file mode 100644 (file)
index 0000000..6938aca
--- /dev/null
@@ -0,0 +1,1343 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Supported by: Valeria Paoli
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_sockopt.c 206339 2010-04-07 08:23:58Z luigi $");
+
+/*
+ * Sockopt support for ipfw. The routines here implement
+ * the upper half of the ipfw code.
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>  /* struct m_tag used by nested headers */
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <net/if.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* hooks */
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
+
+/*
+ * static variables followed by global ones (none in this file)
+ */
+
+/*
+ * Binary search over chain->map for the first slot whose
+ * (rulenum, id) pair is not smaller than (key, id).
+ * Returns the index of that slot; when no such slot is found the
+ * search ends on the last index, n_rules - 1.
+ */
+int
+ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id)
+{
+       int low = 0;
+       int high = chain->n_rules - 1;
+
+       while (low < high) {
+               int mid = (low + high) / 2;
+               struct ip_fw *cur = chain->map[mid];
+
+               if (cur->rulenum < key ||
+                   (cur->rulenum == key && cur->id < id))
+                       low = mid + 1;  /* too small, look to the right */
+               else
+                       high = mid;     /* candidate, keep it in range */
+       }
+       return high;
+}
+
+/*
+ * Allocate a new map with room for n_rules + extra pointers.
+ * extra is the number of entries to add or delete.
+ *
+ * With locked == 0 the allocation may sleep (M_WAITOK) and, on
+ * success, the function returns holding IPFW_UH_WLOCK; if the rule
+ * count grew while allocating, the map is freed and the attempt is
+ * repeated. With locked != 0 the lock is never touched and the
+ * allocation uses M_NOWAIT.
+ * Returns NULL, without taking the lock, if the allocation fails.
+ */
+static struct ip_fw **
+get_map(struct ip_fw_chain *chain, int extra, int locked)
+{
+       const int how = locked ? M_NOWAIT : M_WAITOK;
+
+       for (;;) {
+               int wanted = chain->n_rules + extra;
+               struct ip_fw **fresh;
+
+               fresh = malloc(wanted * sizeof(struct ip_fw *), M_IPFW, how);
+               if (fresh == NULL) {
+                       printf("%s: cannot allocate map\n", __FUNCTION__);
+                       return NULL;
+               }
+               if (!locked)
+                       IPFW_UH_WLOCK(chain);
+               /* still large enough now that the count is stable? */
+               if (wanted >= chain->n_rules + extra)
+                       return fresh;
+               /* lost the race: drop the lock and the map, try again */
+               if (!locked)
+                       IPFW_UH_WUNLOCK(chain);
+               free(fresh, M_IPFW);
+       }
+}
+
+/*
+ * Install new_map (holding new_len rules) as the chain's map and bump
+ * the ruleset id, all under IPFW_WLOCK. Returns the map that was
+ * replaced. It is supposed to be called with IPFW_UH_WLOCK held.
+ */
+static struct ip_fw **
+swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len)
+{
+       struct ip_fw **previous;
+
+       IPFW_WLOCK(chain);
+       previous = chain->map;
+       chain->map = new_map;
+       chain->n_rules = new_len;
+       chain->id++;
+       IPFW_WUNLOCK(chain);
+
+       return previous;
+}
+
+/*
+ * Add a new rule to the list. Copy the rule into a malloc'ed area, then
+ * possibly create a rule number and add the rule to the list.
+ * Update the rule_number in the input struct so the caller knows it as well.
+ * XXX DO NOT USE FOR THE DEFAULT RULE.
+ * Must be called without IPFW_UH held
+ *
+ * Returns 0 on success, EINVAL if the chain has no rules or the
+ * requested rule number is above IPFW_DEFAULT_RULE - 1, and ENOSPC if
+ * the rule or the new map cannot be allocated.
+ */
+int
+ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
+{
+       struct ip_fw *rule;
+       int i, l, insert_before;
+       struct ip_fw **map;     /* the new array of pointers */
+
+       if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1)
+               return (EINVAL);
+
+       l = RULESIZE(input_rule);
+       rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO);
+       if (rule == NULL)
+               return (ENOSPC);
+       /* get_map returns with IPFW_UH_WLOCK if successful */
+       map = get_map(chain, 1, 0 /* not locked */);
+       if (map == NULL) {
+               free(rule, M_IPFW);
+               return ENOSPC;
+       }
+
+       bcopy(input_rule, rule, l);
+       /* clear fields not settable from userland */
+       rule->x_next = NULL;
+       rule->next_rule = NULL;
+       rule->pcnt = 0;
+       rule->bcnt = 0;
+       rule->timestamp = 0;
+
+       /* keep the auto-increment step within 1..1000 */
+       if (V_autoinc_step < 1)
+               V_autoinc_step = 1;
+       else if (V_autoinc_step > 1000)
+               V_autoinc_step = 1000;
+       /* find the insertion point, we will insert before */
+       /* rulenum 0 means "pick a number": insert before the default rule */
+       insert_before = rule->rulenum ? rule->rulenum + 1 : IPFW_DEFAULT_RULE;
+       i = ipfw_find_rule(chain, insert_before, 0);
+       /* duplicate first part */
+       if (i > 0)
+               bcopy(chain->map, map, i * sizeof(struct ip_fw *));
+       map[i] = rule;
+       /* duplicate remaining part, we always have the default rule */
+       bcopy(chain->map + i, map + i + 1,
+               sizeof(struct ip_fw *) *(chain->n_rules - i));
+       if (rule->rulenum == 0) {
+               /* write back the number */
+               /* previous rule's number plus the step, if that stays */
+               /* below the default rule */
+               rule->rulenum = i > 0 ? map[i-1]->rulenum : 0;
+               if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step)
+                       rule->rulenum += V_autoinc_step;
+               input_rule->rulenum = rule->rulenum;
+       }
+
+       /* swap_map() below increments chain->id to this same value */
+       rule->id = chain->id + 1;
+       /* from here on map is the old array, to be freed */
+       map = swap_map(chain, map, chain->n_rules + 1);
+       chain->static_len += l;
+       IPFW_UH_WUNLOCK(chain);
+       if (map)
+               free(map, M_IPFW);
+       return (0);
+}
+
+/*
+ * Free every rule on a list linked through x_next, typically the
+ * list built while removing rules. An empty (NULL) list is fine.
+ */
+void
+ipfw_reap_rules(struct ip_fw *head)
+{
+       struct ip_fw *following;
+
+       for (; head != NULL; head = following) {
+               /* fetch the link before the rule goes away */
+               following = head->x_next;
+               free(head, M_IPFW);
+       }
+}
+
+/*
+ * Decide whether del_entry() must leave a rule in place.
+ * Returns 1 to keep the rule, 0 to let it be deleted.
+ *
+ * cmd is one of {0,1,5}:
+ *   0  select by rule number only; the set argument is ignored;
+ *   1  select by set only; the number argument is ignored;
+ *   5  select by both rule number and set.
+ * n == 0 acts as "any rule number"; sets have no wildcard.
+ *
+ * A rule is kept when any of the following holds:
+ *   - it is the default rule (rulenum == IPFW_DEFAULT_RULE);
+ *   - the request is a plain flush (cmd == 0, n == 0) and the rule
+ *     lives in RESVD_SET;
+ *   - it fails the set test    (cmd == 0 || rule->set == set);
+ *   - it fails the number test (cmd == 1 || n == 0 || n == rule->rulenum).
+ */
+static int
+keep_rule(struct ip_fw *rule, uint8_t cmd, uint8_t set, uint32_t n)
+{
+       int match_set, match_number;
+
+       /* the default rule is always protected */
+       if (rule->rulenum == IPFW_DEFAULT_RULE)
+               return 1;
+
+       /* "ipfw flush" (cmd == 0, n == 0) spares RESVD_SET */
+       if (cmd == 0 && n == 0 && rule->set == RESVD_SET)
+               return 1;
+
+       match_set = (cmd == 0 || rule->set == set);
+       match_number = (cmd == 1 || n == 0 || n == rule->rulenum);
+
+       /* only a rule selected on both criteria may go */
+       return !(match_set && match_number);
+}
+
+/**
+ * Remove all rules with given number, or do set manipulation.
+ * Assumes chain != NULL && *chain != NULL.
+ *
+ * The argument is an uint32_t. The low 16 bits are the rule or set number;
+ * the next 8 bits are the new set; the top 8 bits indicate the command:
+ *
+ *     0       delete rules numbered "rulenum"
+ *     1       delete rules in set "rulenum"
+ *     2       move rules "rulenum" to set "new_set"
+ *     3       move rules from set "rulenum" to set "new_set"
+ *     4       swap sets "rulenum" and "new_set"
+ *     5       delete rules "rulenum" and set "new_set"
+ *
+ * Returns 0 on success, or EINVAL when the command or one of the
+ * numbers is out of range, when a delete request other than a flush
+ * (arg == 0) matches nothing, or when get_map() returns NULL.
+ *
+ * Takes IPFW_UH_WLOCK itself. Deleted rules are chained on chain->reap
+ * while the lock is held; the list is detached before unlocking and
+ * freed with ipfw_reap_rules() only after the lock has been released.
+ */
+static int
+del_entry(struct ip_fw_chain *chain, uint32_t arg)
+{
+       struct ip_fw *rule;
+       uint32_t num;   /* rule number or old_set */
+       uint8_t cmd, new_set;
+       int start, end, i, ofs, n;
+       struct ip_fw **map = NULL;
+       int error = 0;
+
+       num = arg & 0xffff;
+       cmd = (arg >> 24) & 0xff;
+       new_set = (arg >> 16) & 0xff;
+
+       if (cmd > 5 || new_set > RESVD_SET)
+               return EINVAL;
+       if (cmd == 0 || cmd == 2 || cmd == 5) {
+               if (num >= IPFW_DEFAULT_RULE)
+                       return EINVAL;
+       } else {
+               if (num > RESVD_SET)    /* old_set */
+                       return EINVAL;
+       }
+
+       IPFW_UH_WLOCK(chain);   /* arbitrate writers */
+       chain->reap = NULL;     /* prepare for deletions */
+
+       switch (cmd) {
+       case 0: /* delete rules "num" (num == 0 matches all) */
+       case 1: /* delete all rules in set N */
+       case 5: /* delete rules with number N and set "new_set". */
+
+               /*
+                * Locate first rule to delete (start), the rule after
+                * the last one to delete (end), and count how many
+                * rules to delete (n). Always use keep_rule() to
+                * determine which rules to keep.
+                */
+               n = 0;
+               if (cmd == 1) {
+                       /* look for a specific set including RESVD_SET.
+                        * Must scan the entire range, ignore num.
+                        * From here on new_set holds the set to delete,
+                        * which is what keep_rule() receives below.
+                        */
+                       new_set = num;
+                       for (start = -1, end = i = 0; i < chain->n_rules; i++) {
+                               if (keep_rule(chain->map[i], cmd, new_set, 0))
+                                       continue;
+                               if (start < 0)
+                                       start = i;
+                               end = i;
+                               n++;
+                       }
+                       end++;  /* first non-matching */
+               } else {
+                       /* Optimized search on rule numbers */
+                       start = ipfw_find_rule(chain, num, 0);
+                       for (end = start; end < chain->n_rules; end++) {
+                               rule = chain->map[end];
+                               if (num > 0 && rule->rulenum != num)
+                                       break;
+                               if (!keep_rule(rule, cmd, new_set, num))
+                                       n++;
+                       }
+               }
+
+               if (n == 0) {
+                       /* A flush request (arg == 0) on empty ruleset
+                        * returns with no error. On the contrary,
+                        * if there is no match on a specific request,
+                        * we return EINVAL.
+                        */
+                       error = (arg == 0) ? 0 : EINVAL;
+                       break;
+               }
+
+               /* We have something to delete. Allocate the new map */
+               map = get_map(chain, -n, 1 /* locked */);
+               if (map == NULL) {
+                       error = EINVAL; /* no new map available */
+                       break;
+               }
+
+               /* 1. bcopy the initial part of the map */
+               if (start > 0)
+                       bcopy(chain->map, map, start * sizeof(struct ip_fw *));
+               /* 2. copy active rules between start and end */
+               for (i = ofs = start; i < end; i++) {
+                       rule = chain->map[i];
+                       if (keep_rule(rule, cmd, new_set, num))
+                               map[ofs++] = rule;
+               }
+               /* 3. copy the final part of the map */
+               bcopy(chain->map + end, map + ofs,
+                       (chain->n_rules - end) * sizeof(struct ip_fw *));
+               /* 4. swap the maps (under BH_LOCK) */
+               map = swap_map(chain, map, chain->n_rules - n);
+               /* 5. now remove the rules deleted from the old map.
+                * map is whatever swap_map() handed back and is walked
+                * with the pre-deletion indexes start..end; each rule
+                * that keep_rule() rejects is pushed on chain->reap.
+                */
+               for (i = start; i < end; i++) {
+                       int l;
+                       rule = map[i];
+                       if (keep_rule(rule, cmd, new_set, num))
+                               continue;
+                       l = RULESIZE(rule);
+                       chain->static_len -= l;
+                       ipfw_remove_dyn_children(rule);
+                       rule->x_next = chain->reap;
+                       chain->reap = rule;
+               }
+               break;
+
+       /*
+        * In the next 3 cases the loop stops at (n_rules - 1)
+        * because the default rule is never eligible.
+        */
+
+       case 2: /* move rules with given RULE number to new set */
+               for (i = 0; i < chain->n_rules - 1; i++) {
+                       rule = chain->map[i];
+                       if (rule->rulenum == num)
+                               rule->set = new_set;
+               }
+               break;
+
+       case 3: /* move rules with given SET number to new set */
+               for (i = 0; i < chain->n_rules - 1; i++) {
+                       rule = chain->map[i];
+                       if (rule->set == num)
+                               rule->set = new_set;
+               }
+               break;
+
+       case 4: /* swap two sets */
+               for (i = 0; i < chain->n_rules - 1; i++) {
+                       rule = chain->map[i];
+                       if (rule->set == num)
+                               rule->set = new_set;
+                       else if (rule->set == new_set)
+                               rule->set = num;
+               }
+               break;
+       }
+
+       rule = chain->reap;     /* detach the reap list while still locked */
+       chain->reap = NULL;
+       IPFW_UH_WUNLOCK(chain);
+       ipfw_reap_rules(rule);  /* free the deleted rules, lock released */
+       if (map)
+               free(map, M_IPFW);
+       return error;
+}
+
+/*
+ * Reset the counters of a single rule.
+ * With log_only == 0 the packet counter, byte counter and timestamp
+ * are zeroed. In every case, when the instruction at the rule's
+ * action offset is O_LOG, its log_left is reloaded from max_log.
+ * Normally run under IPFW_UH_RLOCK; the operations are idempotent,
+ * so the only requirement is that the rule does not disappear.
+ */
+static void
+clear_counters(struct ip_fw *rule, int log_only)
+{
+       ipfw_insn_log *logcmd;
+
+       if (!log_only) {
+               rule->pcnt = 0;
+               rule->bcnt = 0;
+               rule->timestamp = 0;
+       }
+
+       logcmd = (ipfw_insn_log *)ACTION_PTR(rule);
+       if (logcmd->o.opcode != O_LOG)
+               return;
+       logcmd->log_left = logcmd->max_log;
+}
+
+/**
+ * Reset some or all counters on firewall rules.
+ *
+ * `arg' packs three fields: bits 0-15 are the rule number (0 means
+ * every rule), bits 16-23 are the set number, bits 24-31 are the command:
+ *     0       act on rules regardless of their set;
+ *     1       act only on rules belonging to the given set.
+ * log_only != 0 resets just the logging counts, otherwise the
+ * accounting counters are cleared as well.
+ *
+ * Returns 0 on success, EINVAL for a bad command or set number, or
+ * when a specific rule number is requested and no rule carries it.
+ *
+ * NOTE(review): for a specific rule number, a rule with that number
+ * counts as found even when cmd == 1 and its set differs, so 0 can be
+ * returned (and "cleared" logged) without any counter being touched.
+ */
+static int
+zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
+{
+       const uint16_t rulenum = arg & 0xffff;
+       const uint8_t set = (arg >> 16) & 0xff;
+       const uint8_t cmd = (arg >> 24) & 0xff;
+       struct ip_fw *r;
+       const char *msg;
+       int idx, found;
+
+       if (cmd > 1 || (cmd == 1 && set > RESVD_SET))
+               return (EINVAL);
+
+       IPFW_UH_RLOCK(chain);
+       if (rulenum != 0) {
+               /* one rule number: stop at the first higher number */
+               found = 0;
+               for (idx = 0; idx < chain->n_rules; idx++) {
+                       r = chain->map[idx];
+                       if (r->rulenum > rulenum)
+                               break;
+                       if (r->rulenum != rulenum)
+                               continue;
+                       found = 1;
+                       if (cmd == 0 || r->set == set)
+                               clear_counters(r, log_only);
+               }
+               if (!found) {
+                       /* no rule carries this number */
+                       IPFW_UH_RUNLOCK(chain);
+                       return (EINVAL);
+               }
+               msg = log_only ? "logging count reset" : "cleared";
+       } else {
+               /* wildcard: walk the whole map, optionally filtered by set */
+               V_norule_counter = 0;
+               for (idx = 0; idx < chain->n_rules; idx++) {
+                       r = chain->map[idx];
+                       if (cmd != 1 || r->set == set)
+                               clear_counters(r, log_only);
+               }
+               msg = log_only ? "All logging counts reset" :
+                   "Accounting cleared";
+       }
+       IPFW_UH_RUNLOCK(chain);
+
+       if (!V_fw_verbose)
+               return (0);
+
+       if (rulenum != 0)
+               log(LOG_SECURITY | LOG_NOTICE, "ipfw: Entry %d %s.\n",
+                   rulenum, msg);
+       else
+               log(LOG_SECURITY | LOG_NOTICE, "ipfw: %s.\n", msg);
+       return (0);
+}
+
+/*
+ * Check validity of the structure before insert.
+ * Rules are simple, so this mostly need to check rule sizes.
+ *
+ * rule is the candidate rule, size the number of bytes supplied for it.
+ * The checks, in order:
+ *   - size must be at least sizeof(*rule) and equal to RULESIZE(rule);
+ *   - act_ofs must be smaller than cmd_len;
+ *   - every instruction must fit in the remaining length and have a
+ *     length that is legal for its opcode;
+ *   - exactly one action opcode must be present and it must be the
+ *     last instruction of the rule.
+ * As a side effect, log_left of each O_LOG instruction is reloaded
+ * from its max_log.
+ *
+ * Returns 0 if the rule is acceptable, EINVAL on any failed check
+ * (a message is printed for most of them), or EPROTONOSUPPORT when an
+ * IPv6 opcode is used and INET6 is not compiled in.
+ */
+static int
+check_ipfw_struct(struct ip_fw *rule, int size)
+{
+       int l, cmdlen = 0;
+       int have_action=0;
+       ipfw_insn *cmd;
+
+       if (size < sizeof(*rule)) {
+               printf("ipfw: rule too short\n");
+               return (EINVAL);
+       }
+       /* first, check for valid size */
+       l = RULESIZE(rule);
+       if (l != size) {
+               printf("ipfw: size mismatch (have %d want %d)\n", size, l);
+               return (EINVAL);
+       }
+       if (rule->act_ofs >= rule->cmd_len) {
+               printf("ipfw: bogus action offset (%u > %u)\n",
+                   rule->act_ofs, rule->cmd_len - 1);
+               return (EINVAL);
+       }
+       /*
+        * Now go for the individual checks. Very simple ones, basically only
+        * instruction sizes.
+        * l counts the instruction words still to be examined and cmdlen
+        * is the length of the current instruction, in the same units.
+        */
+       for (l = rule->cmd_len, cmd = rule->cmd ;
+                       l > 0 ; l -= cmdlen, cmd += cmdlen) {
+               cmdlen = F_LEN(cmd);
+               if (cmdlen > l) {
+                       printf("ipfw: opcode %d size truncated\n",
+                           cmd->opcode);
+                       return EINVAL;
+               }
+               switch (cmd->opcode) {
+               case O_PROBE_STATE:
+               case O_KEEP_STATE:
+               case O_PROTO:
+               case O_IP_SRC_ME:
+               case O_IP_DST_ME:
+               case O_LAYER2:
+               case O_IN:
+               case O_FRAG:
+               case O_DIVERTED:
+               case O_IPOPT:
+               case O_IPTOS:
+               case O_IPPRECEDENCE:
+               case O_IPVER:
+               case O_TCPWIN:
+               case O_TCPFLAGS:
+               case O_TCPOPTS:
+               case O_ESTAB:
+               case O_VERREVPATH:
+               case O_VERSRCREACH:
+               case O_ANTISPOOF:
+               case O_IPSEC:
+#ifdef INET6
+               case O_IP6_SRC_ME:
+               case O_IP6_DST_ME:
+               case O_EXT_HDR:
+               case O_IP6:
+#endif
+               case O_IP4:
+               case O_TAG:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn))
+                               goto bad_size;
+                       break;
+
+               case O_FIB:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn))
+                               goto bad_size;
+                       if (cmd->arg1 >= rt_numfibs) {
+                               printf("ipfw: invalid fib number %d\n",
+                                       cmd->arg1);
+                               return EINVAL;
+                       }
+                       break;
+
+               case O_SETFIB:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn))
+                               goto bad_size;
+                       if (cmd->arg1 >= rt_numfibs) {
+                               printf("ipfw: invalid fib number %d\n",
+                                       cmd->arg1);
+                               return EINVAL;
+                       }
+                       goto check_action;
+
+               case O_UID:
+               case O_GID:
+               case O_JAIL:
+               case O_IP_SRC:
+               case O_IP_DST:
+               case O_TCPSEQ:
+               case O_TCPACK:
+               case O_PROB:
+               case O_ICMPTYPE:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
+                               goto bad_size;
+                       break;
+
+               case O_LIMIT:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
+                               goto bad_size;
+                       break;
+
+               case O_LOG:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
+                               goto bad_size;
+
+                       ((ipfw_insn_log *)cmd)->log_left =
+                           ((ipfw_insn_log *)cmd)->max_log;
+
+                       break;
+
+               case O_IP_SRC_MASK:
+               case O_IP_DST_MASK:
+                       /* only odd command lengths */
+                       if ( !(cmdlen & 1) || cmdlen > 31)
+                               goto bad_size;
+                       break;
+
+               case O_IP_SRC_SET:
+               case O_IP_DST_SET:
+                       if (cmd->arg1 == 0 || cmd->arg1 > 256) {
+                               printf("ipfw: invalid set size %d\n",
+                                       cmd->arg1);
+                               return EINVAL;
+                       }
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
+                           (cmd->arg1+31)/32 )
+                               goto bad_size;
+                       break;
+
+               case O_IP_SRC_LOOKUP:
+               case O_IP_DST_LOOKUP:
+                       if (cmd->arg1 >= IPFW_TABLES_MAX) {
+                               printf("ipfw: invalid table number %d\n",
+                                   cmd->arg1);
+                               return (EINVAL);
+                       }
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn) &&
+                           cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 &&
+                           cmdlen != F_INSN_SIZE(ipfw_insn_u32))
+                               goto bad_size;
+                       break;
+
+               case O_MACADDR2:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
+                               goto bad_size;
+                       break;
+
+               case O_NOP:
+               case O_IPID:
+               case O_IPTTL:
+               case O_IPLEN:
+               case O_TCPDATALEN:
+               case O_TAGGED:
+                       if (cmdlen < 1 || cmdlen > 31)
+                               goto bad_size;
+                       break;
+
+               case O_MAC_TYPE:
+               case O_IP_SRCPORT:
+               case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
+                       if (cmdlen < 2 || cmdlen > 31)
+                               goto bad_size;
+                       break;
+
+               case O_RECV:
+               case O_XMIT:
+               case O_VIA:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
+                               goto bad_size;
+                       break;
+
+               case O_ALTQ:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_altq))
+                               goto bad_size;
+                       break;
+
+               case O_PIPE:
+               case O_QUEUE:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn))
+                               goto bad_size;
+                       goto check_action;
+
+               case O_FORWARD_IP:
+#ifdef IPFIREWALL_FORWARD
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_sa))
+                               goto bad_size;
+                       goto check_action;
+#else
+                       return EINVAL;  /* rejected without IPFIREWALL_FORWARD */
+#endif
+
+               case O_DIVERT:
+               case O_TEE:
+                       if (ip_divert_ptr == NULL)      /* hook not set */
+                               return EINVAL;
+                       else
+                               goto check_size;
+               case O_NETGRAPH:
+               case O_NGTEE:
+                       if (ng_ipfw_input_p == NULL)    /* hook not set */
+                               return EINVAL;
+                       else
+                               goto check_size;
+               case O_NAT:
+                       if (!IPFW_NAT_LOADED)
+                               return EINVAL;
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_nat))
+                               goto bad_size;          
+                       goto check_action;
+               case O_FORWARD_MAC: /* XXX not implemented yet */
+               case O_CHECK_STATE:
+               case O_COUNT:
+               case O_ACCEPT:
+               case O_DENY:
+               case O_REJECT:
+#ifdef INET6
+               case O_UNREACH6:
+#endif
+               case O_SKIPTO:
+               case O_REASS:
+check_size:    /* actions made of a single plain ipfw_insn */
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn))
+                               goto bad_size;
+check_action:  /* common tail for all actions: unique and last */
+                       if (have_action) {
+                               printf("ipfw: opcode %d, multiple actions"
+                                       " not allowed\n",
+                                       cmd->opcode);
+                               return EINVAL;
+                       }
+                       have_action = 1;
+                       if (l != cmdlen) {
+                               printf("ipfw: opcode %d, action must be"
+                                       " last opcode\n",
+                                       cmd->opcode);
+                               return EINVAL;
+                       }
+                       break;
+#ifdef INET6
+               case O_IP6_SRC:
+               case O_IP6_DST:
+                       if (cmdlen != F_INSN_SIZE(struct in6_addr) +
+                           F_INSN_SIZE(ipfw_insn))
+                               goto bad_size;
+                       break;
+
+               case O_FLOW6ID:
+                       if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
+                           ((ipfw_insn_u32 *)cmd)->o.arg1)
+                               goto bad_size;
+                       break;
+
+               case O_IP6_SRC_MASK:
+               case O_IP6_DST_MASK:
+                       if ( !(cmdlen & 1) || cmdlen > 127)
+                               goto bad_size;
+                       break;
+               case O_ICMP6TYPE:
+                       if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) )
+                               goto bad_size;
+                       break;
+#endif
+
+               default:
+                       switch (cmd->opcode) {  /* tell IPv6 opcodes from unknown ones */
+#ifndef INET6
+                       case O_IP6_SRC_ME:
+                       case O_IP6_DST_ME:
+                       case O_EXT_HDR:
+                       case O_IP6:
+                       case O_UNREACH6:
+                       case O_IP6_SRC:
+                       case O_IP6_DST:
+                       case O_FLOW6ID:
+                       case O_IP6_SRC_MASK:
+                       case O_IP6_DST_MASK:
+                       case O_ICMP6TYPE:
+                               printf("ipfw: no IPv6 support in kernel\n");
+                               return EPROTONOSUPPORT;
+#endif
+                       default:
+                               printf("ipfw: opcode %d, unknown opcode\n",
+                                       cmd->opcode);
+                               return EINVAL;
+                       }
+               }
+       }
+       if (have_action == 0) {
+               printf("ipfw: missing action\n");
+               return EINVAL;
+       }
+       return 0;
+
+bad_size:      /* shared exit for every per-opcode length failure */
+       printf("ipfw: opcode %d size %d wrong\n",
+               cmd->opcode, cmdlen);
+       return EINVAL;
+}
+
+
+/*
+ * Translation of requests for compatibility with FreeBSD 7.2/8.
+ * A static variable (is7) tells us if we have an old client from
+ * userland, and if necessary we translate requests and responses
+ * between the two formats.
+ *
+ * struct ip_fw7 below is the rule layout of the old format; the
+ * commented-out "id" member marks the field that only the newer
+ * layout has.
+ *
+ * RULESIZE7(rule) is the size in bytes of a rule in the old layout:
+ * sizeof(struct ip_fw7) plus cmd_len 32-bit words, minus 4 bytes
+ * (presumably the one ipfw_insn already counted in cmd[1] -- confirm
+ * against the definition of ipfw_insn).
+ */
+static int is7 = 0;    /* non-zero: userland speaks the old format */
+
+struct ip_fw7 {
+       struct ip_fw7   *next;          /* linked list of rules     */
+       struct ip_fw7   *next_rule;     /* ptr to next [skipto] rule    */
+       /* 'next_rule' is used to pass up 'set_disable' status      */
+
+       uint16_t        act_ofs;        /* offset of action in 32-bit units */
+       uint16_t        cmd_len;        /* # of 32-bit words in cmd */
+       uint16_t        rulenum;        /* rule number          */
+       uint8_t         set;            /* rule set (0..31)     */
+       // #define RESVD_SET   31  /* set for default and persistent rules */
+       uint8_t         _pad;           /* padding          */
+       // uint32_t        id;             /* rule id, only in v.8 */
+       /* These fields are present in all rules.           */
+       uint64_t        pcnt;           /* Packet counter       */
+       uint64_t        bcnt;           /* Byte counter         */
+       uint32_t        timestamp;      /* tv_sec of last match     */
+
+       ipfw_insn       cmd[1];         /* storage for commands     */
+};
+
+       int convert_rule_to_7(struct ip_fw *rule);      /* defined elsewhere */
+int convert_rule_to_8(struct ip_fw *rule);     /* defined elsewhere */
+
+#ifndef RULESIZE7
+#define RULESIZE7(rule)  (sizeof(struct ip_fw7) + \
+       ((struct ip_fw7 *)(rule))->cmd_len * 4 - 4)
+#endif
+
+
+/*
+ * Serialize the static rules of "chain" into "buf" (at most "space"
+ * bytes), let ipfw_get_dynamic() append the dynamic rules, and return
+ * the number of bytes written.
+ *
+ * Every copied rule gets V_set_disable stored over its next_rule
+ * field, and a non-zero timestamp is offset by boottime.tv_sec.
+ * When is7 is set each rule is converted to the FreeBSD 7.2 layout;
+ * a rule that does not fit is silently skipped and a conversion
+ * error makes the function return 0. In the native layout a rule
+ * that does not fit stops the dump of static rules.
+ *
+ * Must be run under IPFW_UH_RLOCK
+ */
+static size_t
+ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
+{
+       char *const start = buf;
+       char *const limit = start + space;
+       char *cur = start;
+       const time_t boot_seconds = boottime.tv_sec;
+       int idx;
+
+       for (idx = 0; idx < chain->n_rules; idx++) {
+               struct ip_fw *src = chain->map[idx];
+               struct ip_fw *out;
+               struct ip_fw7 *old;
+               int len;
+
+               if (!is7) {
+                       /* native layout, the rule is copied unchanged */
+                       len = RULESIZE(src);
+                       if (cur + len > limit) { /* should not happen */
+                               printf("overflow dumping static rules\n");
+                               break;
+                       }
+                       out = (struct ip_fw *)cur;
+                       bcopy(src, out, len);
+                       /*
+                        * XXX HACK. The disable mask travels in the
+                        * "next_rule" pointer of every rule to keep the
+                        * ABI unchanged.
+                        */
+                       bcopy(&V_set_disable, &out->next_rule,
+                           sizeof(V_set_disable));
+                       if (out->timestamp)
+                               out->timestamp += boot_seconds;
+                       cur += len;
+                       continue;
+               }
+
+               /* FreeBSD 7.2 layout */
+               len = RULESIZE7(src);
+               /* the copy made before conversion is one uint32_t longer */
+               if (cur + len + sizeof(uint32_t) > limit)
+                       continue; /* go to next rule */
+               bcopy(src, cur, len + sizeof(uint32_t));
+               if (convert_rule_to_7((struct ip_fw *)cur))
+                       return 0; /*XXX correct? */
+               old = (struct ip_fw7 *)cur;
+               /* same disable-mask hack as in the native layout */
+               bcopy(&V_set_disable, &old->next_rule, sizeof(V_set_disable));
+               if (old->timestamp)
+                       old->timestamp += boot_seconds;
+               cur += len;
+       }
+       ipfw_get_dynamic(&cur, limit); /* protected by the dynamic lock */
+       return (cur - start);
+}
+
+
+/**
+ * {set|get}sockopt parser.
+ */
+int
+ipfw_ctl(struct sockopt *sopt)
+{
+#define        RULE_MAXSIZE    (256*sizeof(u_int32_t))
+       int error;
+       size_t size;
+       struct ip_fw *buf, *rule;
+       struct ip_fw_chain *chain;
+       u_int32_t rulenum[2];
+
+       error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW);
+       if (error)
+               return (error);
+
+       /*
+        * Disallow modifications in really-really secure mode, but still allow
+        * the logging counters to be reset.
+        */
+       if (sopt->sopt_name == IP_FW_ADD ||
+           (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) {
+               error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
+               if (error)
+                       return (error);
+       }
+
+       chain = &V_layer3_chain;
+       error = 0;
+
+       switch (sopt->sopt_name) {
+       case IP_FW_GET:
+               /*
+                * pass up a copy of the current rules. Static rules
+                * come first (the last of which has number IPFW_DEFAULT_RULE),
+                * followed by a possibly empty list of dynamic rule.
+                * The last dynamic rule has NULL in the "next" field.
+                *
+                * Note that the calculated size is used to bound the
+                * amount of data returned to the user.  The rule set may
+                * change between calculating the size and returning the
+                * data in which case we'll just return what fits.
+                */
+               for (;;) {
+                       int len = 0, want;
+
+                       size = chain->static_len;
+                       size += ipfw_dyn_len();
+                       if (size >= sopt->sopt_valsize)
+                               break;
+                       buf = malloc(size, M_TEMP, M_WAITOK);
+                       if (buf == NULL)
+                               break;
+                       IPFW_UH_RLOCK(chain);
+                       /* check again how much space we need */
+                       want = chain->static_len + ipfw_dyn_len();
+                       if (size >= want)
+                               len = ipfw_getrules(chain, buf, size);
+                       IPFW_UH_RUNLOCK(chain);
+                       if (size >= want)
+                               error = sooptcopyout(sopt, buf, len);
+                       free(buf, M_TEMP);
+                       if (size >= want)
+                               break;
+               }
+               break;
+
+       case IP_FW_FLUSH:
+               /* locking is done within del_entry() */
+               error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */
+               break;
+
+       case IP_FW_ADD:
+               rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK);
+               error = sooptcopyin(sopt, rule, RULE_MAXSIZE,
+                       sizeof(struct ip_fw7) );
+
+               /*
+                * If the size of commands equals RULESIZE7 then we assume
+                * a FreeBSD7.2 binary is talking to us (set is7=1).
+                * is7 is persistent so the next 'ipfw list' command
+                * will use this format.
+                * NOTE: If wrong version is guessed (this can happen if
+                *       the first ipfw command is 'ipfw [pipe] list')
+                *       the ipfw binary may crash or loop infinitely...
+                */
+               if (sopt->sopt_valsize == RULESIZE7(rule)) {
+                   is7 = 1;
+                   error = convert_rule_to_8(rule);
+                   if (error)
+                       return error;
+                   if (error == 0)
+                       error = check_ipfw_struct(rule, RULESIZE(rule));
+               } else {
+                   is7 = 0;
+               if (error == 0)
+                       error = check_ipfw_struct(rule, sopt->sopt_valsize);
+               }
+               if (error == 0) {
+                       /* locking is done within ipfw_add_rule() */
+                       error = ipfw_add_rule(chain, rule);
+                       size = RULESIZE(rule);
+                       if (!error && sopt->sopt_dir == SOPT_GET) {
+                               if (is7) {
+                                       error = convert_rule_to_7(rule);
+                                       size = RULESIZE7(rule);
+                                       if (error)
+                                               return error;
+                               }
+                               error = sooptcopyout(sopt, rule, size);
+               }
+               }
+               free(rule, M_TEMP);
+               break;
+
+       case IP_FW_DEL:
+               /*
+                * IP_FW_DEL is used for deleting single rules or sets,
+                * and (ab)used to atomically manipulate sets. Argument size
+                * is used to distinguish between the two:
+                *    sizeof(u_int32_t)
+                *      delete single rule or set of rules,
+                *      or reassign rules (or sets) to a different set.
+                *    2*sizeof(u_int32_t)
+                *      atomic disable/enable sets.
+                *      first u_int32_t contains sets to be disabled,
+                *      second u_int32_t contains sets to be enabled.
+                */
+               error = sooptcopyin(sopt, rulenum,
+                       2*sizeof(u_int32_t), sizeof(u_int32_t));
+               if (error)
+                       break;
+               size = sopt->sopt_valsize;
+               if (size == sizeof(u_int32_t) && rulenum[0] != 0) {
+                       /* delete or reassign, locking done in del_entry() */
+                       error = del_entry(chain, rulenum[0]);
+               } else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */
+                       IPFW_UH_WLOCK(chain);
+                       V_set_disable =
+                           (V_set_disable | rulenum[0]) & ~rulenum[1] &
+                           ~(1<<RESVD_SET); /* set RESVD_SET always enabled */
+                       IPFW_UH_WUNLOCK(chain);
+               } else
+                       error = EINVAL;
+               break;
+
+       case IP_FW_ZERO:
+       case IP_FW_RESETLOG: /* argument is an u_int_32, the rule number */
+               rulenum[0] = 0;
+               if (sopt->sopt_val != 0) {
+                   error = sooptcopyin(sopt, rulenum,
+                           sizeof(u_int32_t), sizeof(u_int32_t));
+                   if (error)
+                       break;
+               }
+               error = zero_entry(chain, rulenum[0],
+                       sopt->sopt_name == IP_FW_RESETLOG);
+               break;
+
+       /*--- TABLE manipulations are protected by the IPFW_LOCK ---*/
+       case IP_FW_TABLE_ADD:
+               {
+                       ipfw_table_entry ent;
+
+                       error = sooptcopyin(sopt, &ent,
+                           sizeof(ent), sizeof(ent));
+                       if (error)
+                               break;
+                       error = ipfw_add_table_entry(chain, ent.tbl,
+                           ent.addr, ent.masklen, ent.value);
+               }
+               break;
+
+       case IP_FW_TABLE_DEL:
+               {
+                       ipfw_table_entry ent;
+
+                       error = sooptcopyin(sopt, &ent,
+                           sizeof(ent), sizeof(ent));
+                       if (error)
+                               break;
+                       error = ipfw_del_table_entry(chain, ent.tbl,
+                           ent.addr, ent.masklen);
+               }
+               break;
+
+       case IP_FW_TABLE_FLUSH:
+               {
+                       u_int16_t tbl;
+
+                       error = sooptcopyin(sopt, &tbl,
+                           sizeof(tbl), sizeof(tbl));
+                       if (error)
+                               break;
+                       IPFW_WLOCK(chain);
+                       error = ipfw_flush_table(chain, tbl);
+                       IPFW_WUNLOCK(chain);
+               }
+               break;
+
+       case IP_FW_TABLE_GETSIZE:
+               {
+                       u_int32_t tbl, cnt;
+
+                       if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl),
+                           sizeof(tbl))))
+                               break;
+                       IPFW_RLOCK(chain);
+                       error = ipfw_count_table(chain, tbl, &cnt);
+                       IPFW_RUNLOCK(chain);
+                       if (error)
+                               break;
+                       error = sooptcopyout(sopt, &cnt, sizeof(cnt));
+               }
+               break;
+
+       case IP_FW_TABLE_LIST:
+               {
+                       ipfw_table *tbl;
+
+                       if (sopt->sopt_valsize < sizeof(*tbl)) {
+                               error = EINVAL;
+                               break;
+                       }
+                       size = sopt->sopt_valsize;
+                       tbl = malloc(size, M_TEMP, M_WAITOK);
+                       error = sooptcopyin(sopt, tbl, size, sizeof(*tbl));
+                       if (error) {
+                               free(tbl, M_TEMP);
+                               break;
+                       }
+                       tbl->size = (size - sizeof(*tbl)) /
+                           sizeof(ipfw_table_entry);
+                       IPFW_RLOCK(chain);
+                       error = ipfw_dump_table(chain, tbl);
+                       IPFW_RUNLOCK(chain);
+                       if (error) {
+                               free(tbl, M_TEMP);
+                               break;
+                       }
+                       error = sooptcopyout(sopt, tbl, size);
+                       free(tbl, M_TEMP);
+               }
+               break;
+
+       /*--- NAT operations are protected by the IPFW_LOCK ---*/
+       case IP_FW_NAT_CFG:
+               if (IPFW_NAT_LOADED)
+                       error = ipfw_nat_cfg_ptr(sopt);
+               else {
+                       printf("IP_FW_NAT_CFG: %s\n",
+                           "ipfw_nat not present, please load it");
+                       error = EINVAL;
+               }
+               break;
+
+       case IP_FW_NAT_DEL:
+               if (IPFW_NAT_LOADED)
+                       error = ipfw_nat_del_ptr(sopt);
+               else {
+                       printf("IP_FW_NAT_DEL: %s\n",
+                           "ipfw_nat not present, please load it");
+                       error = EINVAL;
+               }
+               break;
+
+       case IP_FW_NAT_GET_CONFIG:
+               if (IPFW_NAT_LOADED)
+                       error = ipfw_nat_get_cfg_ptr(sopt);
+               else {
+                       printf("IP_FW_NAT_GET_CFG: %s\n",
+                           "ipfw_nat not present, please load it");
+                       error = EINVAL;
+               }
+               break;
+
+       case IP_FW_NAT_GET_LOG:
+               if (IPFW_NAT_LOADED)
+                       error = ipfw_nat_get_log_ptr(sopt);
+               else {
+                       printf("IP_FW_NAT_GET_LOG: %s\n",
+                           "ipfw_nat not present, please load it");
+                       error = EINVAL;
+               }
+               break;
+
+       default:
+               printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name);
+               error = EINVAL;
+       }
+
+       return (error);
+#undef RULE_MAXSIZE
+}
+
+
+#define        RULE_MAXSIZE    (256*sizeof(u_int32_t))
+
+/* Functions to convert rules 7.2 <==> 8.0 */
+/*
+ * Convert a rule from the version 8 layout (struct ip_fw) to the 7.2
+ * layout (struct ip_fw7), in place.
+ *
+ * RULE_MAXSIZE bytes are read from the buffer pointed to by rule, so the
+ * caller must pass a buffer at least that large.
+ *
+ * Returns 0 on success, ENOMEM if the scratch copy cannot be allocated,
+ * or EINVAL if an instruction has a zero length or extends past cmd_len.
+ * On EINVAL the buffer is left partially converted.
+ */
+int
+convert_rule_to_7(struct ip_fw *rule)
+{
+       /* The caller's buffer seen through the 7.2 layout (the output). */
+       struct ip_fw7 *rule7 = (struct ip_fw7 *)rule;
+       /* Scratch copy of the original rule, version 8. */
+       struct ip_fw *tmp;
+       /* Cursors used to copy the instruction stream. */
+       ipfw_insn *ccmd, *dst;
+       int ll, ccmdlen;
+       int error = 0;
+
+       tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
+       if (tmp == NULL)
+               return ENOMEM;
+       bcopy(rule, tmp, RULE_MAXSIZE);
+
+       /* Copy the header fields. */
+       rule7->_pad = tmp->_pad;
+       rule7->set = tmp->set;
+       rule7->rulenum = tmp->rulenum;
+       rule7->cmd_len = tmp->cmd_len;
+       rule7->act_ofs = tmp->act_ofs;
+       rule7->next_rule = (struct ip_fw7 *)tmp->next_rule;
+       rule7->next = (struct ip_fw7 *)tmp->x_next;
+       rule7->pcnt = tmp->pcnt;
+       rule7->bcnt = tmp->bcnt;
+       rule7->timestamp = tmp->timestamp;
+
+       /* Copy the instructions, one at a time. */
+       for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd;
+           ll > 0; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
+               ccmdlen = F_LEN(ccmd);
+
+               /*
+                * Validate before copying: a zero length would never
+                * advance the cursors (endless loop), and a length larger
+                * than what is left would copy past the end of the rule.
+                */
+               if (ccmdlen == 0 || ccmdlen > ll) {
+                       printf("ipfw: opcode %d size truncated\n",
+                           ccmd->opcode);
+                       error = EINVAL;
+                       break;
+               }
+
+               bcopy(ccmd, dst, ccmdlen * sizeof(uint32_t));
+
+               /*
+                * O_REASS doesn't exist in the 7.2 version, so
+                * decrement the opcode if it comes after O_NAT.
+                */
+               if (dst->opcode > O_NAT)
+                       dst->opcode--;
+       }
+
+       /* Single exit point: the scratch copy is released on every path. */
+       free(tmp, M_TEMP);
+       return error;
+}
+
+/*
+ * Convert a rule from the 7.2 layout (struct ip_fw7) to the version 8
+ * layout (struct ip_fw), in place.
+ *
+ * RULE_MAXSIZE bytes are read from the buffer pointed to by rule, so the
+ * caller must pass a buffer at least that large.
+ * NOTE(review): cmd_len is taken from the buffer as-is; nothing here checks
+ * that the converted instruction stream still fits in the caller's buffer.
+ * Confirm that the caller bounds it.
+ *
+ * Returns 0 on success, ENOMEM if the scratch copy cannot be allocated,
+ * or EINVAL if an instruction has a zero length or extends past cmd_len.
+ * On EINVAL the header is not converted and the instruction area may be
+ * partially rewritten.
+ */
+int
+convert_rule_to_8(struct ip_fw *rule)
+{
+       /* The caller's buffer seen through the 7.2 layout (the input). */
+       struct ip_fw7 *rule7 = (struct ip_fw7 *)rule;
+       /* Scratch copy of the original 7.2 rule. */
+       struct ip_fw7 *tmp;
+       /* Cursors used to copy the instruction stream. */
+       ipfw_insn *ccmd, *dst;
+       int ll, ccmdlen;
+       int error = 0;
+
+       tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
+       if (tmp == NULL)
+               return ENOMEM;
+       bcopy(rule7, tmp, RULE_MAXSIZE);
+
+       /* Copy the instructions, one at a time. */
+       for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd;
+           ll > 0; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
+               ccmdlen = F_LEN(ccmd);
+
+               /*
+                * Validate before copying: a zero length would never
+                * advance the cursors (endless loop), and a length larger
+                * than what is left would copy past the end of the rule.
+                */
+               if (ccmdlen == 0 || ccmdlen > ll) {
+                       printf("ipfw: opcode %d size truncated\n",
+                           ccmd->opcode);
+                       error = EINVAL;
+                       goto done;
+               }
+
+               bcopy(ccmd, dst, ccmdlen * sizeof(uint32_t));
+
+               /*
+                * O_REASS doesn't exist in the 7.2 version, so
+                * increment the opcode if it comes after O_NAT.
+                */
+               if (dst->opcode > O_NAT)
+                       dst->opcode++;
+       }
+
+       /* Copy the header fields; they are read from the scratch copy. */
+       rule->_pad = tmp->_pad;
+       rule->set = tmp->set;
+       rule->rulenum = tmp->rulenum;
+       rule->cmd_len = tmp->cmd_len;
+       rule->act_ofs = tmp->act_ofs;
+       rule->next_rule = (struct ip_fw *)tmp->next_rule;
+       rule->x_next = (struct ip_fw *)tmp->next;
+       rule->id = 0; /* XXX see if is ok = 0 */
+       rule->pcnt = tmp->pcnt;
+       rule->bcnt = tmp->bcnt;
+       rule->timestamp = tmp->timestamp;
+
+done:
+       /* Single exit point: the scratch copy is released on every path. */
+       free(tmp, M_TEMP);
+       return error;
+}
+
+/* end of file */
diff --git a/sys/netinet/ipfw/ip_fw_table.c b/sys/netinet/ipfw/ip_fw_table.c
new file mode 100644 (file)
index 0000000..d8973d5
--- /dev/null
@@ -0,0 +1,286 @@
+/*-
+ * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_table.c 200601 2009-12-16 10:48:40Z luigi $");
+
+/*
+ * Lookup table support for ipfw
+ *
+ * Lookup tables are implemented (at the moment) using the radix
+ * tree used for routing tables. Tables store key-value entries, where
+ * keys are network prefixes (addr/masklen), and values are integers.
+ * As a degenerate case we can interpret keys as 32-bit integers
+ * (with a /32 mask).
+ *
+ * The table is protected by the IPFW lock even for manipulation coming
+ * from userland, because operations are typically fast.
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <net/if.h>    /* ip_fw.h requires IFNAMSIZ */
+#include <net/radix.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip_var.h>    /* struct ipfw_rule_ref */
+#include <netinet/ip_fw.h>
+#include <sys/queue.h> /* LIST_HEAD */
+#include <netinet/ipfw/ip_fw_private.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables");
+
+/*
+ * One lookup-table entry as stored in the radix tree.
+ * The radix nodes are the first member, which is what lets the code in
+ * this file cast a struct radix_node pointer back to a table_entry.
+ */
+struct table_entry {
+       struct radix_node       rn[2];          /* must stay first */
+       struct sockaddr_in      addr, mask;     /* key and netmask given to the tree */
+       u_int32_t               value;          /* value returned by a lookup */
+};
+
+/*
+ * The radix code expects addr and mask to be arrays of bytes,
+ * with the first byte being the length of the array. rn_inithead
+ * is called with the offset in bits of the lookup key within the
+ * array. If we use a sockaddr_in as the underlying type,
+ * sin_len is conveniently located at offset 0, sin_addr is at
+ * offset 4 and normally aligned.
+ * But for portability, let's avoid relying on that layout and make
+ * the code explicit:
+ *   KEY_LEN(v) is an lvalue for the first byte of v (the length byte);
+ *   KEY_OFS    is the offset of sin_addr within a sockaddr_in, in bits.
+ * The KEY_LEN expansion is fully parenthesized so that it stays a single
+ * operand wherever it is used.
+ */
+#define KEY_LEN(v)     (*((uint8_t *)&(v)))
+#define KEY_OFS                (8*offsetof(struct sockaddr_in, sin_addr))
+
+/*
+ * Add the prefix addr/mlen with the given value to table tbl.
+ *
+ * addr is masked with the netmask derived from mlen before insertion.
+ * The chain write lock is taken and released here.
+ *
+ * Returns 0 on success, EINVAL if tbl is out of range or mlen is larger
+ * than 32, ENOMEM if the entry cannot be allocated, and EEXIST if the
+ * radix tree refuses the insertion.
+ */
+int
+ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen, uint32_t value)
+{
+       struct radix_node_head *rnh;
+       struct table_entry *ent;
+       struct radix_node *rn;
+
+       if (tbl >= IPFW_TABLES_MAX)
+               return (EINVAL);
+       /* A longer prefix would make the shift count below negative. */
+       if (mlen > 32)
+               return (EINVAL);
+       rnh = ch->tables[tbl];
+       ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO);
+       if (ent == NULL)
+               return (ENOMEM);
+       ent->value = value;
+       KEY_LEN(ent->addr) = KEY_LEN(ent->mask) = 8;
+       /* Unsigned arithmetic: a signed 1 << 31 (mlen == 1) would overflow. */
+       ent->mask.sin_addr.s_addr =
+           htonl(mlen ? ~((1U << (32 - mlen)) - 1) : 0);
+       ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr;
+       IPFW_WLOCK(ch);
+       rn = rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent);
+       IPFW_WUNLOCK(ch);
+       if (rn == NULL) {
+               /* Not inserted, so the entry is still ours to release. */
+               free(ent, M_IPFW_TBL);
+               return (EEXIST);
+       }
+       return (0);
+}
+
+/*
+ * Remove the prefix addr/mlen from table tbl and free its entry.
+ *
+ * addr is masked with the netmask derived from mlen before the removal.
+ * The chain write lock is taken and released here.
+ *
+ * Returns 0 on success, EINVAL if tbl is out of range or mlen is larger
+ * than 32, and ESRCH if the radix tree has no such entry.
+ */
+int
+ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen)
+{
+       struct radix_node_head *rnh;
+       struct table_entry *ent;
+       struct sockaddr_in sa, mask;
+
+       if (tbl >= IPFW_TABLES_MAX)
+               return (EINVAL);
+       /* A longer prefix would make the shift count below negative. */
+       if (mlen > 32)
+               return (EINVAL);
+       rnh = ch->tables[tbl];
+       KEY_LEN(sa) = KEY_LEN(mask) = 8;
+       /* Unsigned arithmetic: a signed 1 << 31 (mlen == 1) would overflow. */
+       mask.sin_addr.s_addr = htonl(mlen ? ~((1U << (32 - mlen)) - 1) : 0);
+       sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
+       IPFW_WLOCK(ch);
+       ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh);
+       IPFW_WUNLOCK(ch);
+       if (ent == NULL)
+               return (ESRCH);
+       free(ent, M_IPFW_TBL);
+       return (0);
+}
+
+/*
+ * Tree-walk callback used by ipfw_flush_table(): remove the visited node
+ * from the radix head passed in arg and free the entry that comes back.
+ * Always returns 0.
+ */
+static int
+flush_table_entry(struct radix_node *rn, void *arg)
+{
+       struct radix_node_head * const head = arg;
+       struct table_entry *removed;
+
+       removed = (struct table_entry *)
+           head->rnh_deladdr(rn->rn_key, rn->rn_mask, head);
+       if (removed == NULL)
+               return (0);
+       free(removed, M_IPFW_TBL);
+       return (0);
+}
+
+/*
+ * Remove and free every entry of table tbl.
+ * Asserts that the chain write lock is held.
+ * Returns 0 on success, EINVAL if tbl is out of range.
+ */
+int
+ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl)
+{
+       struct radix_node_head *head;
+
+       IPFW_WLOCK_ASSERT(ch);
+
+       if (tbl < IPFW_TABLES_MAX) {
+               head = ch->tables[tbl];
+               KASSERT(head != NULL, ("NULL IPFW table"));
+               head->rnh_walktree(head, flush_table_entry, head);
+               return (0);
+       }
+       return (EINVAL);
+}
+
+/*
+ * Flush every table of the chain and detach its radix head.
+ * Asserts that the chain write lock is held.
+ */
+void
+ipfw_destroy_tables(struct ip_fw_chain *ch)
+{
+       uint16_t tbl;
+
+       IPFW_WLOCK_ASSERT(ch);
+
+       for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++) {
+               ipfw_flush_table(ch, tbl);
+               /*
+                * Hand rn_detachhead() the table slot itself, mirroring
+                * the rn_inithead() call in ipfw_init_tables(). Detaching
+                * through a local copy would leave the slot holding a
+                * pointer to a head that is no longer attached.
+                */
+               rn_detachhead((void **)&ch->tables[tbl]);
+       }
+}
+
+/*
+ * Create the radix head of every table of the chain.
+ *
+ * Returns 0 on success. If a head cannot be created, the heads created
+ * so far are detached again and ENOMEM is returned.
+ */
+int
+ipfw_init_tables(struct ip_fw_chain *ch)
+{
+       int i, j;
+
+       for (i = 0; i < IPFW_TABLES_MAX; i++) {
+               if (!rn_inithead((void **)&ch->tables[i], KEY_OFS)) {
+                       /*
+                        * The tables created so far are still empty, so
+                        * there is nothing to flush (and ipfw_flush_table()
+                        * asserts the write lock). What has to be undone
+                        * is the creation of the heads themselves.
+                        * NOTE(review): assumes the caller does not also
+                        * tear the tables down on failure - confirm.
+                        */
+                       for (j = 0; j < i; j++)
+                               rn_detachhead((void **)&ch->tables[j]);
+                       return (ENOMEM);
+               }
+       }
+       return (0);
+}
+
+/*
+ * Look addr up in table tbl.
+ * Returns 1 and stores the entry's value in *val when an entry is found;
+ * returns 0 (leaving *val untouched) when tbl is out of range or nothing
+ * matches.
+ */
+int
+ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint32_t *val)
+{
+       struct radix_node_head *head;
+       struct table_entry *hit;
+       struct sockaddr_in key;
+
+       if (tbl >= IPFW_TABLES_MAX)
+               return (0);
+       head = ch->tables[tbl];
+       KEY_LEN(key) = 8;
+       key.sin_addr.s_addr = addr;
+       hit = (struct table_entry *)(head->rnh_lookup(&key, NULL, head));
+       if (hit == NULL)
+               return (0);
+       *val = hit->value;
+       return (1);
+}
+
+/*
+ * Tree-walk callback used by ipfw_count_table(): bump the counter that
+ * arg points to once per visited node. Always returns 0.
+ */
+static int
+count_table_entry(struct radix_node *rn, void *arg)
+{
+       u_int32_t * const counter = arg;
+
+       *counter += 1;
+       return (0);
+}
+
+/*
+ * Store the number of entries of table tbl in *cnt.
+ * Returns 0 on success, EINVAL (leaving *cnt untouched) if tbl is out
+ * of range.
+ */
+int
+ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt)
+{
+       struct radix_node_head *head;
+
+       if (tbl >= IPFW_TABLES_MAX)
+               return (EINVAL);
+       *cnt = 0;
+       head = ch->tables[tbl];
+       head->rnh_walktree(head, count_table_entry, cnt);
+       return (0);
+}
+
+/*
+ * Tree-walk callback used by ipfw_dump_table(): append the visited entry
+ * to the ipfw_table that arg points to.
+ * Returns 1 once the output array is full (cnt == size), 0 otherwise.
+ */
+static int
+dump_table_entry(struct radix_node *rn, void *arg)
+{
+       struct table_entry * const node = (struct table_entry *)rn;
+       ipfw_table * const out = arg;
+       ipfw_table_entry *slot;
+
+       if (out->cnt == out->size)
+               return (1);
+       /* Claim the next free slot of the output array. */
+       slot = &out->ent[out->cnt++];
+       slot->tbl = out->tbl;
+       /* Mask length derived from the lowest set bit of the netmask. */
+       slot->masklen = in_nullhost(node->mask.sin_addr) ? 0 :
+           33 - ffs(ntohl(node->mask.sin_addr.s_addr));
+       slot->addr = node->addr.sin_addr.s_addr;
+       slot->value = node->value;
+       return (0);
+}
+
+/*
+ * Fill tbl->ent[] with the entries of table tbl->tbl, resetting tbl->cnt
+ * first; dump_table_entry() stops adding once tbl->size entries are stored.
+ * Returns 0 on success, EINVAL if tbl->tbl is out of range.
+ */
+int
+ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl)
+{
+       struct radix_node_head *head;
+
+       if (tbl->tbl < IPFW_TABLES_MAX) {
+               head = ch->tables[tbl->tbl];
+               tbl->cnt = 0;
+               head->rnh_walktree(head, dump_table_entry, tbl);
+               return (0);
+       }
+       return (EINVAL);
+}
+/* end of file */
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
new file mode 100644 (file)
index 0000000..5af35a7
--- /dev/null
@@ -0,0 +1,228 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)tcp.h       8.1 (Berkeley) 6/10/93
+ * $FreeBSD: src/sys/netinet/tcp.h,v 1.40.2.2 2008/07/31 06:10:25 kmacy Exp $
+ */
+
+#ifndef _NETINET_TCP_H_
+#define _NETINET_TCP_H_
+
+#include <sys/cdefs.h>
+
+#define __BSD_VISIBLE 1
+
+#if __BSD_VISIBLE
+
+typedef        u_int32_t tcp_seq;
+
+#define tcp6_seq       tcp_seq /* for KAME src sync over BSD*'s */
+#define tcp6hdr                tcphdr  /* for KAME src sync over BSD*'s */
+
+/*
+ * TCP header.
+ * Per RFC 793, September, 1981.
+ */
+struct tcphdr {
+       u_short th_sport;               /* source port */
+       u_short th_dport;               /* destination port */
+       tcp_seq th_seq;                 /* sequence number */
+       tcp_seq th_ack;                 /* acknowledgement number */
+       /*
+        * th_off and th_x2 share one byte; the declaration order of the
+        * two bit-fields depends on BYTE_ORDER.
+        */
+#if BYTE_ORDER == LITTLE_ENDIAN
+       u_char  th_x2:4,                /* (unused) */
+               th_off:4;               /* data offset */
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+       u_char  th_off:4,               /* data offset */
+               th_x2:4;                /* (unused) */
+#endif
+       u_char  th_flags;               /* flag bits, see the TH_* masks below */
+#define        TH_FIN  0x01
+#define        TH_SYN  0x02
+#define        TH_RST  0x04
+#define        TH_PUSH 0x08
+#define        TH_ACK  0x10
+#define        TH_URG  0x20
+#define        TH_ECE  0x40
+#define        TH_CWR  0x80
+       /* TH_FLAGS is the OR of all eight flag bits. */
+#define        TH_FLAGS        (TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|TH_ECE|TH_CWR)
+       /* NOTE(review): presumably a bit-name string for a %b style printf - confirm. */
+#define        PRINT_TH_FLAGS  "\20\1FIN\2SYN\3RST\4PUSH\5ACK\6URG\7ECE\10CWR"
+
+       u_short th_win;                 /* window */
+       u_short th_sum;                 /* checksum */
+       u_short th_urp;                 /* urgent pointer */
+};
+
+#define        TCPOPT_EOL              0
+#define           TCPOLEN_EOL                  1
+#define        TCPOPT_PAD              0               /* padding after EOL */
+#define           TCPOLEN_PAD                  1
+#define        TCPOPT_NOP              1
+#define           TCPOLEN_NOP                  1
+#define        TCPOPT_MAXSEG           2
+#define    TCPOLEN_MAXSEG              4
+#define TCPOPT_WINDOW          3
+#define    TCPOLEN_WINDOW              3
+#define TCPOPT_SACK_PERMITTED  4
+#define    TCPOLEN_SACK_PERMITTED      2
+#define TCPOPT_SACK            5
+#define           TCPOLEN_SACKHDR              2
+#define    TCPOLEN_SACK                        8       /* 2*sizeof(tcp_seq) */
+#define TCPOPT_TIMESTAMP       8
+#define    TCPOLEN_TIMESTAMP           10
+#define    TCPOLEN_TSTAMP_APPA         (TCPOLEN_TIMESTAMP+2) /* appendix A */
+#define        TCPOPT_SIGNATURE        19              /* Keyed MD5: RFC 2385 */
+#define           TCPOLEN_SIGNATURE            18
+
+/* Miscellaneous constants */
+#define        MAX_SACK_BLKS   6       /* Max # SACK blocks stored at receiver side */
+#define        TCP_MAX_SACK    4       /* MAX # SACKs sent in any segment */
+
+
+/*
+ * Default maximum segment size for TCP.
+ * With an IP MTU of 576, this is 536,
+ * but 512 is probably more convenient.
+ * This should be defined as MIN(512, IP_MSS - sizeof (struct tcpiphdr)).
+ */
+#define        TCP_MSS 512
+/*
+ * TCP_MINMSS is defined to be 216 which is fine for the smallest
+ * link MTU (256 bytes, AX.25 packet radio) in the Internet.
+ * However it is very unlikely to come across such low MTU interfaces
+ * these days (anno dato 2003).
+ * See tcp_subr.c tcp_minmss SYSCTL declaration for more comments.
+ * Setting this to "0" disables the minmss check.
+ */
+#define        TCP_MINMSS 216
+
+/*
+ * Default maximum segment size for TCP6.
+ * With an IP6 MSS of 1280, this is 1220,
+ * but 1024 is probably more convenient. (xxx kazu in doubt)
+ * This should be defined as MIN(1024, IP6_MSS - sizeof (struct tcpip6hdr))
+ */
+#define        TCP6_MSS        1024
+
+#define        TCP_MAXWIN      65535   /* largest value for (unscaled) window */
+#define        TTCP_CLIENT_SND_WND     4096    /* dflt send window for T/TCP client */
+
+#define TCP_MAX_WINSHIFT       14      /* maximum window shift */
+
+#define TCP_MAXBURST           4       /* maximum segments in a burst */
+
+#define TCP_MAXHLEN    (0xf<<2)        /* max length of header in bytes */
+#define TCP_MAXOLEN    (TCP_MAXHLEN - sizeof(struct tcphdr))
+                                       /* max space left for options */
+#endif /* __BSD_VISIBLE */
+
+/*
+ * User-settable options (used with setsockopt).
+ */
+#define        TCP_NODELAY     0x01    /* don't delay send to coalesce packets */
+#if __BSD_VISIBLE
+#define        TCP_MAXSEG      0x02    /* set maximum segment size */
+#define TCP_NOPUSH     0x04    /* don't push last block of write */
+#define TCP_NOOPT      0x08    /* don't use TCP options */
+#define TCP_MD5SIG     0x10    /* use MD5 digests (RFC2385) */
+#define        TCP_INFO        0x20    /* retrieve tcp_info structure */
+#define        TCP_CONGESTION  0x40    /* get/set congestion control algorithm */
+
+#define        TCP_CA_NAME_MAX 16      /* max congestion control name length */
+
+#define        TCPI_OPT_TIMESTAMPS     0x01
+#define        TCPI_OPT_SACK           0x02
+#define        TCPI_OPT_WSCALE         0x04
+#define        TCPI_OPT_ECN            0x08
+#define        TCPI_OPT_TOE            0x10
+
+/*
+ * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits
+ * the caller to query certain information about the state of a TCP
+ * connection.  We provide an overlapping set of fields with the Linux
+ * implementation, but since this is a fixed size structure, room has been
+ * left for growth.  In order to maximize potential future compatibility with
+ * the Linux API, the same variable names and order have been adopted, and
+ * padding left to make room for omitted fields in case they are added later.
+ *
+ * XXX: This is currently an unstable ABI/API, in that it is expected to
+ * change.
+ */
+/*
+ * NOTE(review): the members whose names start with "__tcpi_" look like
+ * placeholders kept only to preserve the layout - confirm before
+ * relying on their contents.
+ */
+struct tcp_info {
+       u_int8_t        tcpi_state;             /* TCP FSM state. */
+       u_int8_t        __tcpi_ca_state;
+       u_int8_t        __tcpi_retransmits;
+       u_int8_t        __tcpi_probes;
+       u_int8_t        __tcpi_backoff;
+       u_int8_t        tcpi_options;           /* Options enabled on conn. */
+       u_int8_t        tcpi_snd_wscale:4,      /* RFC1323 send shift value. */
+                       tcpi_rcv_wscale:4;      /* RFC1323 recv shift value. */
+
+       u_int32_t       __tcpi_rto;
+       u_int32_t       __tcpi_ato;
+       u_int32_t       __tcpi_snd_mss;
+       u_int32_t       __tcpi_rcv_mss;
+
+       u_int32_t       __tcpi_unacked;
+       u_int32_t       __tcpi_sacked;
+       u_int32_t       __tcpi_lost;
+       u_int32_t       __tcpi_retrans;
+       u_int32_t       __tcpi_fackets;
+
+       /* Times; measurements in usecs. */
+       u_int32_t       __tcpi_last_data_sent;
+       u_int32_t       __tcpi_last_ack_sent;   /* Also unimpl. on Linux? */
+       u_int32_t       __tcpi_last_data_recv;
+       u_int32_t       __tcpi_last_ack_recv;
+
+       /* Metrics; variable units. */
+       u_int32_t       __tcpi_pmtu;
+       u_int32_t       __tcpi_rcv_ssthresh;
+       u_int32_t       tcpi_rtt;               /* Smoothed RTT in usecs. */
+       u_int32_t       tcpi_rttvar;            /* RTT variance in usecs. */
+       u_int32_t       tcpi_snd_ssthresh;      /* Slow start threshold. */
+       u_int32_t       tcpi_snd_cwnd;          /* Send congestion window. */
+       u_int32_t       __tcpi_advmss;
+       u_int32_t       __tcpi_reordering;
+
+       u_int32_t       __tcpi_rcv_rtt;
+       u_int32_t       tcpi_rcv_space;         /* Advertised recv window. */
+
+       /* FreeBSD extensions to tcp_info. */
+       u_int32_t       tcpi_snd_wnd;           /* Advertised send window. */
+       u_int32_t       tcpi_snd_bwnd;          /* Bandwidth send window. */
+       u_int32_t       tcpi_snd_nxt;           /* Next egress seqno */
+       u_int32_t       tcpi_rcv_nxt;           /* Next ingress seqno */
+       u_int32_t       tcpi_toe_tid;           /* HWTID for TOE endpoints */
+       
+       /* Padding to grow without breaking ABI. */
+       u_int32_t       __tcpi_pad[29];         /* Padding. */
+};
+#endif
+
+#endif /* !_NETINET_TCP_H_ */
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
new file mode 100644 (file)
index 0000000..35196a2
--- /dev/null
@@ -0,0 +1,4 @@
+#ifndef _NETINET_TCP_VAR_H_
+#define _NETINET_TCP_VAR_H_
+#include <netinet/tcp.h>
+#endif /* !_NETINET_TCP_VAR_H_ */
diff --git a/sys/netinet/udp.h b/sys/netinet/udp.h
new file mode 100644 (file)
index 0000000..cd75bd1
--- /dev/null
@@ -0,0 +1,67 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ *     The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)udp.h       8.1 (Berkeley) 6/10/93
+ * $FreeBSD: src/sys/netinet/udp.h,v 1.10 2007/02/20 10:13:11 rwatson Exp $
+ */
+
+#ifndef _NETINET_UDP_H_
+#define        _NETINET_UDP_H_
+
+/*
+ * UDP protocol header.
+ * Per RFC 768, September, 1981.
+ */
+/*
+ * Fixed UDP header: four u_short fields.
+ * NOTE(review): presumably stored in network byte order - confirm at the
+ * places that read or write these fields.
+ */
+struct udphdr {
+       u_short uh_sport;               /* source port */
+       u_short uh_dport;               /* destination port */
+       u_short uh_ulen;                /* udp length */
+       u_short uh_sum;                 /* udp checksum */
+};
+
+/* 
+ * User-settable options (used with setsockopt).
+ */
+#define        UDP_ENCAP                       0x01
+
+
+/*
+ * UDP Encapsulation of IPsec Packets options.
+ */
+/* Encapsulation types. */
+#define        UDP_ENCAP_ESPINUDP_NON_IKE      1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
+#define        UDP_ENCAP_ESPINUDP              2 /* draft-ietf-ipsec-udp-encaps-02+ */
+
+/* Default ESP in UDP encapsulation port. */
+#define        UDP_ENCAP_ESPINUDP_PORT         500
+
+/* Maximum UDP fragment size for ESP over UDP. */
+#define        UDP_ENCAP_ESPINUDP_MAXFRAGLEN   552
+
+#endif
diff --git a/sys/sys/cdefs.h b/sys/sys/cdefs.h
new file mode 100644 (file)
index 0000000..82c9851
--- /dev/null
@@ -0,0 +1,29 @@
+#ifndef _CDEFS_H_
+#define _CDEFS_H_
+
+/*
+ * various compiler macros and common functions
+ */
+
+/*
+ * GCC-style attribute shorthands.  Each one is defined only when the
+ * including environment has not already provided a macro of that name.
+ */
+#ifndef __packed
+#define __packed       __attribute__ ((__packed__))
+#endif
+
+/* __aligned(x): forwards x to the compiler's __aligned__ attribute. */
+#ifndef __aligned
+#define __aligned(x) __attribute__((__aligned__(x)))
+#endif
+
+/*
+ * defined as assert
+ * (only declared here; takes a format string plus variadic arguments)
+ */
+void panic(const char *fmt, ...);
+
+/*
+ * KASSERT(exp, msg): when exp evaluates to false, call panic with msg.
+ * msg is pasted directly after the word "panic", so it must be a complete
+ * parenthesized argument list, e.g. KASSERT(p != NULL, ("p is NULL")).
+ * __predict_false is defined further down; this is fine because the macro
+ * body is only expanded where KASSERT is used.
+ */
+#define KASSERT(exp,msg) do {                                           \
+        if (__predict_false(!(exp)))                                    \
+                panic msg;                                              \
+} while (0)
+
+/*
+ * don't bother to optimize: unless already defined, __predict_false(x)
+ * expands to plain (x) with no branch hint.
+ */
+#ifndef __predict_false
+#define __predict_false(x)   (x)       /* __builtin_expect((x), 0) */
+#endif
+
+#endif /* !_CDEFS_H_ */
diff --git a/sys/sys/kernel.h b/sys/sys/kernel.h
new file mode 100644 (file)
index 0000000..fbc9581
--- /dev/null
@@ -0,0 +1,26 @@
+/*
+ * from freebsd's kernel.h
+ */
+#ifndef _SYS_KERNEL_H_
+#define _SYS_KERNEL_H_
+
+/*
+ * Stand-ins for the SYSINIT family.  Each expands to the definition of a
+ * void * variable named sysinit_<d> (or sysuninit_<d>) initialized with d;
+ * the other arguments (a, b, c, e) are discarded.  The expansion has no
+ * trailing semicolon, so the invocation must supply it.
+ */
+#define SYSINIT(a, b, c, d, e)  \
+        void *sysinit_ ## d = d
+#define VNET_SYSINIT(a, b, c, d, e)  \
+        void *sysinit_ ## d = d
+#define SYSUNINIT(a, b, c, d, e)  \
+        void *sysuninit_ ## d = d
+#define VNET_SYSUNINIT(a, b, c, d, e)  \
+        void *sysuninit_ ## d = d
+
+/*
+ * Some enumerated orders; "ANY" sorts last.
+ *
+ * The SYSINIT stand-ins in this header discard their order argument, so
+ * within this header these values do not influence any expansion.
+ */
+enum sysinit_elem_order {
+        SI_ORDER_FIRST          = 0x0000000,    /* first*/
+        SI_ORDER_SECOND         = 0x0000001,    /* second*/
+        SI_ORDER_THIRD          = 0x0000002,    /* third*/
+        SI_ORDER_MIDDLE         = 0x1000000,    /* somewhere in the middle */
+        SI_ORDER_ANY            = 0xfffffff     /* last*/
+};
+#endif
diff --git a/sys/sys/malloc.h b/sys/sys/malloc.h
new file mode 100644 (file)
index 0000000..ac16aed
--- /dev/null
@@ -0,0 +1,59 @@
+#ifndef _SYS_MALLOC_H_
+#define _SYS_MALLOC_H_
+
+/*
+ * No matter what, try to get clear memory and be non-blocking.
+ * XXX check if 2.4 has a native way to zero memory,
+ * XXX obey the flags (M_NOWAIT <-> GFP_ATOMIC, M_WAIT <-> GFP_KERNEL)
+ */
+#ifndef _WIN32 /* this is the linux version */
+
+/*
+ * XXX On zeroshell (2.6.25.17) we get a load error
+ *     __you_cannot_kmalloc_that_much
+ * which is triggered when kmalloc() is called with a large
+ * compile-time constant argument (include/linux/slab_def.h)
+ *
+ * I think it may be a compiler (or source) bug because there is no
+ * evidence that such a large request is made.
+ * Making the _size argument to kmalloc volatile prevents the compiler
+ * from making the mistake, though it is clearly not ideal.
+ */
+
+/*
+ * malloc(_size, type, flags): three-argument allocator mapped onto kmalloc.
+ * Both variants ignore type and flags, always allocate with GFP_ATOMIC and
+ * evaluate to the pointer kmalloc returned.  They are statement
+ * expressions; _size is evaluated once and stored in an int.
+ */
+#if !defined (LINUX_24) && LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22)
+/* zeroing is requested from kmalloc itself through __GFP_ZERO */
+#define malloc(_size, type, flags)                     \
+       ({ volatile int _v = _size; kmalloc(_v, GFP_ATOMIC | __GFP_ZERO); })
+#else /* LINUX <= 2.6.22 and LINUX_24 */
+/* linux 2.6.22 does not zero allocated memory */
+/* so the block is cleared with memset when the allocation succeeded */
+#define malloc(_size, type, flags)                     \
+       ({ int _s = _size;                              \
+       void *_ret = kmalloc(_s, GFP_ATOMIC);           \
+       if (_ret) memset(_ret, 0, _s);                  \
+        (_ret);                                                \
+        })
+#endif /* LINUX <= 2.6.22 */
+
+/*
+ * calloc(_n, _s): allocation of size _n * _s through the malloc() macro
+ * above.  Both operands are parenthesized so that expression arguments
+ * such as calloc(a + b, sz) multiply as intended.  The product is not
+ * checked for overflow.
+ */
+#define calloc(_n, _s) malloc(((_n) * (_s)), NULL, GFP_ATOMIC | __GFP_ZERO)
+/* free(_var, type): release with kfree(); the type argument is ignored. */
+#define free(_var, type) kfree(_var)
+
+#else /* _WIN32, the windows version */
+
+/*
+ * ntddk.h uses win_malloc() and MmFreeContiguousMemory().
+ * wipfw uses
+ * ExAllocatePoolWithTag(, pool, len, tag)
+ * ExFreePoolWithTag(ptr, tag)
+ */
+/*
+ * Windows mapping: both macros forward only their first argument to
+ * my_alloc() and drop the other two.
+ * NOTE(review): calloc takes three arguments here, while the Linux branch
+ * of this header defines calloc(_n, _s) with two, and nothing here
+ * multiplies a count by an element size -- confirm which form callers use.
+ */
+#define malloc(_size, _type, _flags) my_alloc(_size)
+#define calloc(_size, _type, _flags) my_alloc(_size)
+
+/* allocator behind the two macros above; only declared in this header */
+void *my_alloc(int _size);
+/* the 'tag' version does not work without -Gz in the linker */
+#define free(_var, type) ExFreePool(_var)
+//#define free(_var, type) ExFreePoolWithTag(_var, 'wfpi')
+
+#endif /* _WIN32 */
+
+/*
+ * Allocation flag values for callers of malloc().  The Linux and Windows
+ * malloc() macros in this header do not use their flags argument.
+ */
+#define M_NOWAIT        0x0001          /* do not block */
+#define M_ZERO          0x0100          /* bzero the allocation */
+#endif /* _SYS_MALLOC_H_ */
diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h
new file mode 100644 (file)
index 0000000..894b221
--- /dev/null
@@ -0,0 +1,267 @@
+/*
+ * Copyright (C) 2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * BSD copyright.
+ *
+ * A simple compatibility interface to map mbufs onto sk_buff
+ */
+
+#ifndef _SYS_MBUF_H_
+#define        _SYS_MBUF_H_
+
+#include <sys/malloc.h>                /* we use free() */
+/* hopefully queue.h is already included by someone else */
+#include <sys/queue.h>
+#ifdef _KERNEL
+
+/* bzero not present on linux, but this should go in glue.h */
+// #define bzero(s, n) memset(s, 0, n)
+
+/*
+ * We implement a very simplified UMA allocator where the backend
+ * is simply malloc, and uma_zone only stores the length of the components.
+ */
+typedef int uma_zone_t;                /* the zone size */
+
+/* "creating" a zone just yields len; the other seven arguments are dropped */
+#define uma_zcreate(name, len, _3, _4, _5, _6, _7, _8) (len)
+
+
+/* free an item; the zone argument is not used */
+#define uma_zfree(zone, item)  free(item, M_IPFW)
+/* the zone value itself is passed to malloc() as the size */
+#define uma_zalloc(zone, flags) malloc(zone, M_IPFW, flags)
+/* nothing to tear down: a zone is only an int */
+#define uma_zdestroy(zone)     do {} while (0)
+
+/*-
+ * Macros for type conversion:
+ * mtod(m, t)  -- Convert mbuf pointer to data pointer of correct type.
+ *
+ * The macro only casts m->m_data to t; no length or alignment check is made.
+ */
+#define        mtod(m, t)      ((t)((m)->m_data))
+
+#endif /* _KERNEL */
+
+/*
+ * Packet tag structure (see below for details).
+ *
+ * Tags are chained through m_tag_link.  On Linux/Windows builds,
+ * m_tag_alloc() in this header fills in id, len and cookie and leaves
+ * m_tag_free zeroed, and m_freem() releases tags with free() rather than
+ * through m_tag_free.
+ */
+struct m_tag {
+       SLIST_ENTRY(m_tag)      m_tag_link;     /* List of packet tags */
+       u_int16_t               m_tag_id;       /* Tag ID */
+       u_int16_t               m_tag_len;      /* Length of data */
+       u_int32_t               m_tag_cookie;   /* ABI/Module ID */
+       void                    (*m_tag_free)(struct m_tag *);
+};
+
+#if defined(__linux__) || defined( _WIN32 )
+
+/*
+ * Auxiliary structure to store values from the sk_buff.
+ * Note that we should not alter the sk_buff, and if we do
+ * so make sure to keep the values in sync between the mbuf
+ * and the sk_buff (especially m_len and m_pkthdr.len).
+ *
+ * The platform-specific members are selected at compile time: the Linux
+ * build carries a netfilter queue entry (its type depends on the kernel
+ * version), the Windows build carries NDIS handles instead.
+ */
+
+struct mbuf {
+       struct mbuf *m_next;
+       struct mbuf *m_nextpkt;
+       char *m_data; // XXX was void *
+       int m_len;      /* length in this mbuf */
+       int m_flags;
+#ifdef __linux__
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
+       struct nf_info *queue_entry;
+#else
+       struct nf_queue_entry *queue_entry;
+#endif
+#else /* _WIN32 */
+       int             direction;      /* could go in rcvif */
+       NDIS_HANDLE     context;        /* replaces queue_entry or skb ?*/
+       PNDIS_PACKET    pkt;
+#endif
+       struct sk_buff *m_skb;
+       /* per-packet header: receiving interface, total length, tag list */
+       struct {
+#ifdef __linux__
+               struct net_device *rcvif;
+#else
+               struct ifnet *rcvif;
+#endif
+               int len;        /* total packet len */
+               SLIST_HEAD (packet_tags, m_tag) tags;
+       } m_pkthdr;
+};
+
+/* mbuf flag bits (presumably stored in m_flags -- confirm at the users) */
+#define M_SKIP_FIREWALL        0x01            /* skip firewall processing */
+#define M_BCAST         0x02 /* send/received as link-level broadcast */
+#define M_MCAST         0x04 /* send/received as link-level multicast */
+
+/* alias for the M_NOWAIT allocation flag */
+#define M_DONTWAIT      M_NOWAIT       /* should not be here... */
+
+
+/*
+ * m_dup() is used in the TEE case, currently unsupported so we
+ * just return.
+ *
+ * Stub: both arguments are ignored and the result is always NULL, so
+ * callers must treat the duplication as having failed.
+ */
+static __inline struct mbuf    *m_dup(struct mbuf *m, int n)
+{
+       (void)m; (void)n;
+       return NULL;
+}
+
+#define        MTAG_ABI_COMPAT         0               /* compatibility ABI */
+/*
+ * Stub lookup: all three arguments are ignored and the result is always
+ * NULL, i.e. no tag is ever reported as found through this function.
+ */
+static __inline struct m_tag *
+m_tag_find(struct mbuf *m, int type, struct m_tag *start)
+{
+       (void)m; (void)type; (void)start;
+       return NULL;
+}
+
+
+/*
+ * Push tag t onto the front of m's packet-tag list.
+ */
+static __inline void
+m_tag_prepend(struct mbuf *m, struct m_tag *t)
+{
+       /* head insertion spelled out: link t to the old head, then swap */
+       SLIST_NEXT(t, m_tag_link) = SLIST_FIRST(&m->m_pkthdr.tags);
+       SLIST_FIRST(&m->m_pkthdr.tags) = t;
+}
+
+/*
+ * Successor of tag t on its tag list (NULL after the last tag).
+ * The mbuf argument is not consulted.
+ */
+static __inline struct m_tag *
+m_tag_next(struct mbuf *m, struct m_tag *t)
+{
+       struct m_tag *succ = SLIST_NEXT(t, m_tag_link);
+
+       return succ;
+}
+
+/*
+ * Create an mtag of the given type.
+ *
+ * cookie, type: stored in m_tag_cookie and m_tag_id.
+ * length: size of the data area allocated after the struct m_tag header;
+ *     stored in m_tag_len.  A negative length is rejected.
+ * wait: ignored; the allocation is always requested with M_NOWAIT.
+ *
+ * Returns a zero-filled tag with the three fields above set, or NULL when
+ * length is negative or the allocation fails.  The tag is not linked to
+ * any mbuf.
+ */
+static __inline struct m_tag *
+m_tag_alloc(uint32_t cookie, int type, int length, int wait)
+{
+       struct m_tag *m;
+       int l;
+
+       (void)wait;
+       /* a negative length would under-allocate and wrap m_tag_len */
+       if (length < 0)
+               return NULL;
+
+       l = length + sizeof(struct m_tag);
+       m = malloc(l, 0, M_NOWAIT);
+       if (m) {
+               memset(m, 0, l);
+               m->m_tag_id = type;
+               m->m_tag_len = length;
+               m->m_tag_cookie = cookie;
+       }
+       return m;
+}
+
+/*
+ * Allocate a tag under the MTAG_ABI_COMPAT cookie; type, length and wait
+ * are handed to m_tag_alloc() unchanged.
+ */
+static __inline struct m_tag *
+m_tag_get(int type, int length, int wait)
+{
+       struct m_tag *tag;
+
+       tag = m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait);
+       return tag;
+}
+
+/*
+ * Return the first tag on m's tag list, or NULL when the list is empty.
+ */
+static __inline struct m_tag *
+m_tag_first(struct mbuf *m)
+{
+       return SLIST_FIRST(&m->m_pkthdr.tags);
+}
+
+/*
+ * Intentionally a no-op: the tag is neither unlinked nor freed here.
+ * Tags that remain on the list are released by m_freem().
+ * NOTE(review): callers may keep using t after this call -- confirm
+ * before turning this into a real unlink-and-free.
+ */
+static __inline void
+m_tag_delete(struct mbuf *m, struct m_tag *t)
+{
+       (void)m; (void)t;
+}
+
+/*
+ * Find a tag on m whose cookie equals n and whose id equals x.
+ *
+ * t: when NULL the search starts at the head of m's tag list; otherwise
+ *    it resumes with the tag after t, so repeated calls can enumerate
+ *    every matching tag.
+ *
+ * Returns the first matching tag in list order, or NULL if none matches.
+ * The whole list is walked: tags are prepended, so the wanted tag may sit
+ * behind tags that were attached later.
+ */
+static __inline struct m_tag *
+m_tag_locate(struct mbuf *m, u_int32_t n, int x, struct m_tag *t)
+{
+       struct m_tag *tag;
+
+       tag = (t != NULL) ? m_tag_next(m, t) : m_tag_first(m);
+       for (; tag != NULL; tag = m_tag_next(m, tag)) {
+               if (tag->m_tag_cookie == n && tag->m_tag_id == x)
+                       return tag;
+       }
+       return NULL;
+}
+
+/* expands to nothing, so neither argument is evaluated */
+#define M_SETFIB(_m, _fib)     /* nothing on linux */
+
+/*
+ * Release an mbuf together with every tag still on its tag list.
+ * A NULL m is accepted and ignored.  Only this one mbuf is freed; m_next
+ * and m_nextpkt are not followed, and tags are released with free()
+ * without calling their m_tag_free hook.
+ */
+static __inline void
+m_freem(struct mbuf *m)
+{
+       struct m_tag *t;
+
+       if (m == NULL)
+               return;
+
+       /* free the m_tag chain */
+       while ( (t = SLIST_FIRST(&m->m_pkthdr.tags) ) ) {
+               SLIST_REMOVE_HEAD(&m->m_pkthdr.tags, m_tag_link);
+               free(t, 0);
+       }
+
+       /* free the mbuf */
+       free(m, M_IPFW);
+}
+
+/* m_pullup is not supported, there is a macro in missing.h */
+
+/* always evaluates to 0; the mbuf argument is not used */
+#define M_GETFIB(_m)   0
+
+/* macro used to create a new mbuf */
+#define MT_DATA         1       /* dynamic (data) allocation */
+#define MSIZE           256     /* size of an mbuf */
+/* stores the result of m_gethdr(_how, _type) in _m (NULL on failure) */
+#define MGETHDR(_m, _how, _type)   ((_m) = m_gethdr((_how), (_type)))
+
+/*
+ * allocate and init a new mbuf using the same structure of FreeBSD
+ *
+ * how, type: accepted for source compatibility and ignored; the
+ *     allocation is always requested with M_NOWAIT.
+ *
+ * Returns an MSIZE-byte block, or NULL if the allocation fails.  The block
+ * is cleared explicitly (as m_tag_alloc() does for tags) so that the link
+ * pointers, lengths and the tag list start out empty whatever the
+ * underlying allocator returns.  m_data points just past the struct mbuf,
+ * leaving MSIZE - sizeof(struct mbuf) bytes of storage.
+ */
+static __inline struct mbuf *
+m_gethdr(int how, short type)
+{
+       struct mbuf *m;
+
+       (void)how; (void)type;
+
+       m = malloc(MSIZE, M_IPFW, M_NOWAIT);
+       if (m == NULL)
+               return NULL;
+
+       memset(m, 0, MSIZE);
+
+       /* here we have MSIZE - sizeof(struct mbuf) available */
+       m->m_data = (char *)(m + 1);
+
+       return m;
+}
+
+#endif /* __linux__ || _WIN32 */
+
+/*
+ * Persistent tags stay with an mbuf until the mbuf is reclaimed.  Otherwise
+ * tags are expected to ``vanish'' when they pass through a network
+ * interface.  For most interfaces this happens normally as the tags are
+ * reclaimed when the mbuf is free'd.  However in some special cases
+ * reclaiming must be done manually.  An example is packets that pass through
+ * the loopback interface.  Also, one must be careful to do this when
+ * ``turning around'' packets (e.g., icmp_reflect).
+ *
+ * To mark a tag persistent bit-or this flag in when defining the tag id.
+ * The tag will then be treated as described above.
+ */
+#define        MTAG_PERSISTENT                         0x800
+
+#define        PACKET_TAG_NONE                         0  /* Nadda */
+
+/* Packet tags for use with MTAG_ABI_COMPAT. */
+#define        PACKET_TAG_IPSEC_IN_DONE                1  /* IPsec applied, in */
+#define        PACKET_TAG_IPSEC_OUT_DONE               2  /* IPsec applied, out */
+#define        PACKET_TAG_IPSEC_IN_CRYPTO_DONE         3  /* NIC IPsec crypto done */
+#define        PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED      4  /* NIC IPsec crypto req'ed */
+#define        PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO     5  /* NIC notifies IPsec */
+#define        PACKET_TAG_IPSEC_PENDING_TDB            6  /* Reminder to do IPsec */
+#define        PACKET_TAG_BRIDGE                       7  /* Bridge processing done */
+#define        PACKET_TAG_GIF                          8  /* GIF processing done */
+#define        PACKET_TAG_GRE                          9  /* GRE processing done */
+#define        PACKET_TAG_IN_PACKET_CHECKSUM           10 /* NIC checksumming done */
+#define        PACKET_TAG_ENCAP                        11 /* Encap.  processing */
+#define        PACKET_TAG_IPSEC_SOCKET                 12 /* IPSEC socket ref */
+#define        PACKET_TAG_IPSEC_HISTORY                13 /* IPSEC history */
+#define        PACKET_TAG_IPV6_INPUT                   14 /* IPV6 input processing */
+#define        PACKET_TAG_DUMMYNET                     15 /* dummynet info */
+#define        PACKET_TAG_DIVERT                       17 /* divert info */
+#define        PACKET_TAG_IPFORWARD                    18 /* ipforward info */
+#define        PACKET_TAG_MACLABEL     (19 | MTAG_PERSISTENT) /* MAC label */
+#define        PACKET_TAG_PF                           21 /* PF + ALTQ information */
+#define        PACKET_TAG_RTSOCKFAM                    25 /* rtsock sa family */
+#define        PACKET_TAG_IPOPTIONS                    27 /* Saved IP options */
+#define        PACKET_TAG_CARP                         28 /* CARP info */
+
+#endif /* !_SYS_MBUF_H_ */
diff --git a/sys/sys/module.h b/sys/sys/module.h
new file mode 100644 (file)
index 0000000..85bf220
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * trivial module support
+ */
+#ifndef _SYS_MODULE_H_
+#define _SYS_MODULE_H_
+/* opaque handle: struct module is never defined in this header */
+typedef struct module *module_t;
+/*
+ * Event handler signature.  The event is passed as a plain int because
+ * modeventtype_t is only declared below.
+ */
+typedef int (*modeventhand_t)(module_t, int /* modeventtype_t */, void *);
+/* events a handler can be asked to process */
+typedef enum modeventtype {
+        MOD_LOAD,
+        MOD_UNLOAD,
+        MOD_SHUTDOWN,
+        MOD_QUIESCE
+} modeventtype_t;
+/* module descriptor referenced by DECLARE_MODULE() */
+typedef struct moduledata {
+        const char      *name;          /* module name */
+        modeventhand_t  evhand;         /* event handler */
+        void            *priv;          /* extra data */
+} moduledata_t;
+
+/*
+ * Hook the module descriptor, md, into our list of things to do.
+ * We should in principle respect the order of loading.
+ *
+ * XXX use the gcc .init functions
+ *
+ * As written, the macro only defines a pointer named moddesc_<a> that
+ * holds the address of md; arguments c and d are dropped.  The expansion
+ * already ends in ';', so "DECLARE_MODULE(...);" leaves an extra empty
+ * declaration behind.
+ */
+#define DECLARE_MODULE(a, md, c,d)                             \
+    moduledata_t *moddesc_##a = &md;
+
+/*
+ * XXX MODULE_VERSION is defined in linux too
+ *
+ * MODULE_DEPEND always expands to nothing.  On Linux and Windows any
+ * existing MODULE_VERSION is dropped and replaced by an empty macro.
+ */
+#define MODULE_DEPEND(a,b,c,d,e)
+#if defined( __linux__ ) || defined( _WIN32 )
+#undef MODULE_VERSION
+#define MODULE_VERSION(a,b)
+#endif
+
+#endif /* _SYS_MODULE_H_ */
+
diff --git a/sys/sys/param.h b/sys/sys/param.h
new file mode 100644 (file)
index 0000000..f068998
--- /dev/null
@@ -0,0 +1,11 @@
+#ifndef _SYS_PARAM_H_
+#define _SYS_PARAM_H_
+
+/*
+ * number of additional groups
+ *
+ * Defined here unless LINUX_24 is set.
+ * NOTE(review): presumably the 2.4 kernel headers already provide
+ * NGROUPS -- confirm.
+ */
+#ifndef LINUX_24
+#define NGROUPS                16
+#endif
+
+#endif /* _SYS_PARAM_H_ */
diff --git a/sys/sys/queue.h b/sys/sys/queue.h
new file mode 100644 (file)
index 0000000..3630218
--- /dev/null
@@ -0,0 +1,623 @@
+/*-
+ * Copyright (c) 1991, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)queue.h     8.5 (Berkeley) 8/20/94
+ * $FreeBSD: src/sys/sys/queue.h,v 1.68 2006/10/24 11:20:29 ru Exp $
+ */
+
+#ifndef _SYS_QUEUE_H_
+#define        _SYS_QUEUE_H_
+
+//#include <sys/cdefs.h>
+
+/*
+ * This file defines four types of data structures: singly-linked lists,
+ * singly-linked tail queues, lists and tail queues.
+ *
+ * A singly-linked list is headed by a single forward pointer. The elements
+ * are singly linked for minimum space and pointer manipulation overhead at
+ * the expense of O(n) removal for arbitrary elements. New elements can be
+ * added to the list after an existing element or at the head of the list.
+ * Elements being removed from the head of the list should use the explicit
+ * macro for this purpose for optimum efficiency. A singly-linked list may
+ * only be traversed in the forward direction.  Singly-linked lists are ideal
+ * for applications with large datasets and few or no removals or for
+ * implementing a LIFO queue.
+ *
+ * A singly-linked tail queue is headed by a pair of pointers, one to the
+ * head of the list and the other to the tail of the list. The elements are
+ * singly linked for minimum space and pointer manipulation overhead at the
+ * expense of O(n) removal for arbitrary elements. New elements can be added
+ * to the list after an existing element, at the head of the list, or at the
+ * end of the list. Elements being removed from the head of the tail queue
+ * should use the explicit macro for this purpose for optimum efficiency.
+ * A singly-linked tail queue may only be traversed in the forward direction.
+ * Singly-linked tail queues are ideal for applications with large datasets
+ * and few or no removals or for implementing a FIFO queue.
+ *
+ * A list is headed by a single forward pointer (or an array of forward
+ * pointers for a hash table header). The elements are doubly linked
+ * so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before
+ * or after an existing element or at the head of the list. A list
+ * may only be traversed in the forward direction.
+ *
+ * A tail queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or
+ * after an existing element, at the head of the list, or at the end of
+ * the list. A tail queue may be traversed in either direction.
+ *
+ * For details on the use of these macros, see the queue(3) manual page.
+ *
+ *
+ *                             SLIST   LIST    STAILQ  TAILQ
+ * _HEAD                       +       +       +       +
+ * _HEAD_INITIALIZER           +       +       +       +
+ * _ENTRY                      +       +       +       +
+ * _INIT                       +       +       +       +
+ * _EMPTY                      +       +       +       +
+ * _FIRST                      +       +       +       +
+ * _NEXT                       +       +       +       +
+ * _PREV                       -       -       -       +
+ * _LAST                       -       -       +       +
+ * _FOREACH                    +       +       +       +
+ * _FOREACH_SAFE               +       +       +       +
+ * _FOREACH_REVERSE            -       -       -       +
+ * _FOREACH_REVERSE_SAFE       -       -       -       +
+ * _INSERT_HEAD                        +       +       +       +
+ * _INSERT_BEFORE              -       +       -       +
+ * _INSERT_AFTER               +       +       +       +
+ * _INSERT_TAIL                        -       -       +       +
+ * _CONCAT                     -       -       +       +
+ * _REMOVE_HEAD                        +       -       +       -
+ * _REMOVE                     +       +       +       +
+ *
+ */
+/*
+ * Optional debugging support, enabled by defining QUEUE_MACRO_DEBUG.
+ * Without it all four helper macros below expand to nothing.
+ */
+#ifdef QUEUE_MACRO_DEBUG
+/* Store the last 2 places the queue element or head was altered */
+struct qm_trace {
+       char * lastfile;
+       int lastline;
+       char * prevfile;
+       int prevline;
+};
+
+/* member declaration for embedding a trace record (includes the ';') */
+#define        TRACEBUF        struct qm_trace trace;
+/* overwrite a stale link with an all-ones pointer value */
+#define        TRASHIT(x)      do {(x) = (void *)-1;} while (0)
+
+/* shift the "last" file/line into "prev" and record the current location */
+#define        QMD_TRACE_HEAD(head) do {                                       \
+       (head)->trace.prevline = (head)->trace.lastline;                \
+       (head)->trace.prevfile = (head)->trace.lastfile;                \
+       (head)->trace.lastline = __LINE__;                              \
+       (head)->trace.lastfile = __FILE__;                              \
+} while (0)
+
+/* same bookkeeping as QMD_TRACE_HEAD, applied to an element */
+#define        QMD_TRACE_ELEM(elem) do {                                       \
+       (elem)->trace.prevline = (elem)->trace.lastline;                \
+       (elem)->trace.prevfile = (elem)->trace.lastfile;                \
+       (elem)->trace.lastline = __LINE__;                              \
+       (elem)->trace.lastfile = __FILE__;                              \
+} while (0)
+
+#else
+#define        QMD_TRACE_ELEM(elem)
+#define        QMD_TRACE_HEAD(head)
+#define        TRACEBUF
+#define        TRASHIT(x)
+#endif /* QUEUE_MACRO_DEBUG */
+
+/*
+ * Singly-linked List declarations.
+ *
+ * SLIST_HEAD declares "struct name" holding a pointer to the first
+ * element; SLIST_ENTRY declares the anonymous link struct to embed in
+ * each element.  Neither expansion ends in a semicolon.
+ */
+#define        SLIST_HEAD(name, type)                                          \
+struct name {                                                          \
+       struct type *slh_first; /* first element */                     \
+}
+
+/* initializer for an empty list; the head argument is not used */
+#define        SLIST_HEAD_INITIALIZER(head)                                    \
+       { NULL }
+
+/* on Windows, drop any SLIST_ENTRY defined earlier so ours takes effect */
+#if defined( _WIN32 ) && defined(SLIST_ENTRY)
+#undef SLIST_ENTRY
+#endif
+#define        SLIST_ENTRY(type)                                               \
+struct {                                                               \
+       struct type *sle_next;  /* next element */                      \
+}
+
+/*
+ * Singly-linked List functions.
+ */
+#define        SLIST_EMPTY(head)       ((head)->slh_first == NULL)
+
+/* expands to the head's first-element member, so it can also be assigned */
+#define        SLIST_FIRST(head)       ((head)->slh_first)
+
+/* forward traversal; var is NULL once the loop has run to completion */
+#define        SLIST_FOREACH(var, head, field)                                 \
+       for ((var) = SLIST_FIRST((head));                               \
+           (var);                                                      \
+           (var) = SLIST_NEXT((var), field))
+
+/*
+ * Like SLIST_FOREACH, but the successor is loaded into tvar before the
+ * body runs, so the body may unlink var without breaking the walk.
+ */
+#define        SLIST_FOREACH_SAFE(var, head, field, tvar)                      \
+       for ((var) = SLIST_FIRST((head));                               \
+           (var) && ((tvar) = SLIST_NEXT((var), field), 1);            \
+           (var) = (tvar))
+
+/*
+ * Traversal that also exposes varp, the address of the pointer that
+ * currently refers to var (the head's first pointer or the previous
+ * element's next pointer).
+ */
+#define        SLIST_FOREACH_PREVPTR(var, varp, head, field)                   \
+       for ((varp) = &SLIST_FIRST((head));                             \
+           ((var) = *(varp)) != NULL;                                  \
+           (varp) = &SLIST_NEXT((var), field))
+
+/* make the list empty */
+#define        SLIST_INIT(head) do {                                           \
+       SLIST_FIRST((head)) = NULL;                                     \
+} while (0)
+
+/* link elm directly after slistelm */
+#define        SLIST_INSERT_AFTER(slistelm, elm, field) do {                   \
+       SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field);       \
+       SLIST_NEXT((slistelm), field) = (elm);                          \
+} while (0)
+
+/* make elm the new first element; the old first becomes its successor */
+#define        SLIST_INSERT_HEAD(head, elm, field) do {                        \
+       SLIST_NEXT((elm), field) = SLIST_FIRST((head));                 \
+       SLIST_FIRST((head)) = (elm);                                    \
+} while (0)
+
+/* expands to elm's next-pointer member, so it can also be assigned */
+#define        SLIST_NEXT(elm, field)  ((elm)->field.sle_next)
+
+/*
+ * Unlink elm.  If it is not the first element, the list is walked from the
+ * head to find its predecessor.  elm must be on the list: the walk has no
+ * NULL check.  With QUEUE_MACRO_DEBUG the stale next pointer of elm is
+ * overwritten afterwards.
+ */
+#define        SLIST_REMOVE(head, elm, type, field) do {                       \
+       if (SLIST_FIRST((head)) == (elm)) {                             \
+               SLIST_REMOVE_HEAD((head), field);                       \
+       }                                                               \
+       else {                                                          \
+               struct type *curelm = SLIST_FIRST((head));              \
+               while (SLIST_NEXT(curelm, field) != (elm))              \
+                       curelm = SLIST_NEXT(curelm, field);             \
+               SLIST_NEXT(curelm, field) =                             \
+                   SLIST_NEXT(SLIST_NEXT(curelm, field), field);       \
+       }                                                               \
+       TRASHIT((elm)->field.sle_next);                                 \
+} while (0)
+
+/*
+ * Unlink the first element.  The list must not be empty: the first
+ * element is dereferenced unconditionally.  The element is not freed.
+ */
+#define        SLIST_REMOVE_HEAD(head, field) do {                             \
+       SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field);   \
+} while (0)
+
+/*
+ * Singly-linked Tail queue declarations.
+ *
+ * The head keeps stqh_last, the address of the last element's next
+ * pointer (or of stqh_first while the queue is empty); this is what lets
+ * STAILQ_INSERT_TAIL append without walking the queue.
+ */
+#define        STAILQ_HEAD(name, type)                                         \
+struct name {                                                          \
+       struct type *stqh_first;/* first element */                     \
+       struct type **stqh_last;/* addr of last next element */         \
+}
+
+/* initializer for an empty queue; head must name the variable itself */
+#define        STAILQ_HEAD_INITIALIZER(head)                                   \
+       { NULL, &(head).stqh_first }
+
+#define        STAILQ_ENTRY(type)                                              \
+struct {                                                               \
+       struct type *stqe_next; /* next element */                      \
+}
+
+/*
+ * Singly-linked Tail queue functions.
+ */
+/*
+ * Append all elements of head2 to head1 and leave head2 empty.
+ * Does nothing when head2 is already empty.
+ */
+#define        STAILQ_CONCAT(head1, head2) do {                                \
+       if (!STAILQ_EMPTY((head2))) {                                   \
+               *(head1)->stqh_last = (head2)->stqh_first;              \
+               (head1)->stqh_last = (head2)->stqh_last;                \
+               STAILQ_INIT((head2));                                   \
+       }                                                               \
+} while (0)
+
+#define        STAILQ_EMPTY(head)      ((head)->stqh_first == NULL)
+
+#define        STAILQ_FIRST(head)      ((head)->stqh_first)
+
+/* forward traversal; var is NULL once the loop has run to completion */
+#define        STAILQ_FOREACH(var, head, field)                                \
+       for((var) = STAILQ_FIRST((head));                               \
+          (var);                                                       \
+          (var) = STAILQ_NEXT((var), field))
+
+
+/* traversal that loads the successor into tvar before the body runs */
+#define        STAILQ_FOREACH_SAFE(var, head, field, tvar)                     \
+       for ((var) = STAILQ_FIRST((head));                              \
+           (var) && ((tvar) = STAILQ_NEXT((var), field), 1);           \
+           (var) = (tvar))
+
+/* make the queue empty: no first element, tail pointer back at the head */
+#define        STAILQ_INIT(head) do {                                          \
+       STAILQ_FIRST((head)) = NULL;                                    \
+       (head)->stqh_last = &STAILQ_FIRST((head));                      \
+} while (0)
+
+/* link elm after tqelm; if elm becomes the last element, move the tail */
+#define        STAILQ_INSERT_AFTER(head, tqelm, elm, field) do {               \
+       if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\
+               (head)->stqh_last = &STAILQ_NEXT((elm), field);         \
+       STAILQ_NEXT((tqelm), field) = (elm);                            \
+} while (0)
+
+/* make elm the first element; on an empty queue it is also the tail */
+#define        STAILQ_INSERT_HEAD(head, elm, field) do {                       \
+       if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \
+               (head)->stqh_last = &STAILQ_NEXT((elm), field);         \
+       STAILQ_FIRST((head)) = (elm);                                   \
+} while (0)
+
+/* append elm through the stored tail pointer, without walking the queue */
+#define        STAILQ_INSERT_TAIL(head, elm, field) do {                       \
+       STAILQ_NEXT((elm), field) = NULL;                               \
+       *(head)->stqh_last = (elm);                                     \
+       (head)->stqh_last = &STAILQ_NEXT((elm), field);                 \
+} while (0)
+
+/*
+ * Last element, or NULL for an empty queue.  stqh_last points at the
+ * entry field inside the last element, so the element address is
+ * recovered by subtracting the offset of that field.
+ * NOTE(review): relies on __offsetof, which this header does not define
+ * (the sys/cdefs.h include above is commented out) -- confirm that it is
+ * provided elsewhere before using this macro.
+ */
+#define        STAILQ_LAST(head, type, field)                                  \
+       (STAILQ_EMPTY((head)) ?                                         \
+               NULL :                                                  \
+               ((struct type *)(void *)                                \
+               ((char *)((head)->stqh_last) - __offsetof(struct type, field))))
+
+#define        STAILQ_NEXT(elm, field) ((elm)->field.stqe_next)
+
+/*
+ * Unlink elm.  If it is not the first element, the queue is walked from
+ * the head to find its predecessor; elm must be on the queue, as the walk
+ * has no NULL check.  The tail pointer is moved back when elm was the last
+ * element.
+ */
+#define        STAILQ_REMOVE(head, elm, type, field) do {                      \
+       if (STAILQ_FIRST((head)) == (elm)) {                            \
+               STAILQ_REMOVE_HEAD((head), field);                      \
+       }                                                               \
+       else {                                                          \
+               struct type *curelm = STAILQ_FIRST((head));             \
+               while (STAILQ_NEXT(curelm, field) != (elm))             \
+                       curelm = STAILQ_NEXT(curelm, field);            \
+               if ((STAILQ_NEXT(curelm, field) =                       \
+                    STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\
+                       (head)->stqh_last = &STAILQ_NEXT((curelm), field);\
+       }                                                               \
+       TRASHIT((elm)->field.stqe_next);                                \
+} while (0)
+
+/*
+ * Unlink the first element and reset the tail pointer if the queue became
+ * empty.  The queue must not be empty: the first element is dereferenced
+ * unconditionally.
+ */
+#define        STAILQ_REMOVE_HEAD(head, field) do {                            \
+       if ((STAILQ_FIRST((head)) =                                     \
+            STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL)         \
+               (head)->stqh_last = &STAILQ_FIRST((head));              \
+} while (0)
+
+#ifndef LIST_HEAD
+/*
+ * List declarations.
+ *
+ * Each entry stores le_prev, the address of the pointer that refers to
+ * it (the head's lh_first or the previous element's le_next).
+ */
+#define        LIST_HEAD(name, type)                                           \
+struct name {                                                          \
+       struct type *lh_first;  /* first element */                     \
+}
+
+/* initializer for an empty list; the head argument is not used */
+#define        LIST_HEAD_INITIALIZER(head)                                     \
+       { NULL }
+
+#define        LIST_ENTRY(type)                                                \
+struct {                                                               \
+       struct type *le_next;   /* next element */                      \
+       struct type **le_prev;  /* address of previous next element */  \
+}
+
+/*
+ * List functions.
+ */
+
+/*
+ * Consistency checks used by the LIST insert macros.  They are compiled in
+ * only when both _KERNEL and INVARIANTS are defined and call panic() when
+ * a back pointer does not match; otherwise they expand to nothing.
+ */
+#if (defined(_KERNEL) && defined(INVARIANTS))
+/* the first element's le_prev must point back at the head's first pointer */
+#define        QMD_LIST_CHECK_HEAD(head, field) do {                           \
+       if (LIST_FIRST((head)) != NULL &&                               \
+           LIST_FIRST((head))->field.le_prev !=                        \
+            &LIST_FIRST((head)))                                       \
+               panic("Bad list head %p first->prev != head", (head));  \
+} while (0)
+
+/* the successor's le_prev must point back at elm's le_next */
+#define        QMD_LIST_CHECK_NEXT(elm, field) do {                            \
+       if (LIST_NEXT((elm), field) != NULL &&                          \
+           LIST_NEXT((elm), field)->field.le_prev !=                   \
+            &((elm)->field.le_next))                                   \
+               panic("Bad link elm %p next->prev != elm", (elm));      \
+} while (0)
+
+/* the pointer that elm's le_prev refers to must point at elm */
+#define        QMD_LIST_CHECK_PREV(elm, field) do {                            \
+       if (*(elm)->field.le_prev != (elm))                             \
+               panic("Bad link elm %p prev->next != elm", (elm));      \
+} while (0)
+#else
+#define        QMD_LIST_CHECK_HEAD(head, field)
+#define        QMD_LIST_CHECK_NEXT(elm, field)
+#define        QMD_LIST_CHECK_PREV(elm, field)
+#endif /* (_KERNEL && INVARIANTS) */
+
+#define        LIST_EMPTY(head)        ((head)->lh_first == NULL)
+
+#define        LIST_FIRST(head)        ((head)->lh_first)
+
+/* forward traversal; var is NULL once the loop has run to completion */
+#define        LIST_FOREACH(var, head, field)                                  \
+       for ((var) = LIST_FIRST((head));                                \
+           (var);                                                      \
+           (var) = LIST_NEXT((var), field))
+
+/* traversal that loads the successor into tvar before the body runs */
+#define        LIST_FOREACH_SAFE(var, head, field, tvar)                       \
+       for ((var) = LIST_FIRST((head));                                \
+           (var) && ((tvar) = LIST_NEXT((var), field), 1);             \
+           (var) = (tvar))
+
+/* make the list empty */
+#define        LIST_INIT(head) do {                                            \
+       LIST_FIRST((head)) = NULL;                                      \
+} while (0)
+
+/*
+ * Link elm directly after listelm: elm takes over listelm's successor,
+ * the successor's back pointer (if any) is redirected to elm, and elm's
+ * back pointer is set to listelm's next pointer.
+ */
+#define        LIST_INSERT_AFTER(listelm, elm, field) do {                     \
+       QMD_LIST_CHECK_NEXT(listelm, field);                            \
+       if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\
+               LIST_NEXT((listelm), field)->field.le_prev =            \
+                   &LIST_NEXT((elm), field);                           \
+       LIST_NEXT((listelm), field) = (elm);                            \
+       (elm)->field.le_prev = &LIST_NEXT((listelm), field);            \
+} while (0)
+
+/*
+ * Link elm directly before listelm.  No head argument is needed: the
+ * pointer that referred to listelm is reached through listelm's le_prev.
+ */
+#define        LIST_INSERT_BEFORE(listelm, elm, field) do {                    \
+       QMD_LIST_CHECK_PREV(listelm, field);                            \
+       (elm)->field.le_prev = (listelm)->field.le_prev;                \
+       LIST_NEXT((elm), field) = (listelm);                            \
+       *(listelm)->field.le_prev = (elm);                              \
+       (listelm)->field.le_prev = &LIST_NEXT((elm), field);            \
+} while (0)
+
+#define        LIST_INSERT_HEAD(head, elm, field) do {                         \
+       QMD_LIST_CHECK_HEAD((head), field);                             \
+       if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL)     \
+               LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\
+       LIST_FIRST((head)) = (elm);                                     \
+       (elm)->field.le_prev = &LIST_FIRST((head));                     \
+} while (0)
+
+#define        LIST_NEXT(elm, field)   ((elm)->field.le_next)
+
+#define        LIST_REMOVE(elm, field) do {                                    \
+       QMD_LIST_CHECK_NEXT(elm, field);                                \
+       QMD_LIST_CHECK_PREV(elm, field);                                \
+       if (LIST_NEXT((elm), field) != NULL)                            \
+               LIST_NEXT((elm), field)->field.le_prev =                \
+                   (elm)->field.le_prev;                               \
+       *(elm)->field.le_prev = LIST_NEXT((elm), field);                \
+       TRASHIT((elm)->field.le_next);                                  \
+       TRASHIT((elm)->field.le_prev);                                  \
+} while (0)
+#endif /* LIST_HEAD */
+
+/*
+ * Tail queue declarations.
+ */
+#define        TAILQ_HEAD(name, type)                                          \
+struct name {                                                          \
+       struct type *tqh_first; /* first element */                     \
+       struct type **tqh_last; /* addr of last next element */         \
+       TRACEBUF                                                        \
+}
+
+#define        TAILQ_HEAD_INITIALIZER(head)                                    \
+       { NULL, &(head).tqh_first }
+
+#define        TAILQ_ENTRY(type)                                               \
+struct {                                                               \
+       struct type *tqe_next;  /* next element */                      \
+       struct type **tqe_prev; /* address of previous next element */  \
+       TRACEBUF                                                        \
+}
+
+/*
+ * Tail queue functions.
+ */
+#if (defined(_KERNEL) && defined(INVARIANTS))
+#define        QMD_TAILQ_CHECK_HEAD(head, field) do {                          \
+       if (!TAILQ_EMPTY(head) &&                                       \
+           TAILQ_FIRST((head))->field.tqe_prev !=                      \
+            &TAILQ_FIRST((head)))                                      \
+               panic("Bad tailq head %p first->prev != head", (head)); \
+} while (0)
+
+#define        QMD_TAILQ_CHECK_TAIL(head, field) do {                          \
+       if (*(head)->tqh_last != NULL)                                  \
+               panic("Bad tailq NEXT(%p->tqh_last) != NULL", (head));  \
+} while (0)
+
+#define        QMD_TAILQ_CHECK_NEXT(elm, field) do {                           \
+       if (TAILQ_NEXT((elm), field) != NULL &&                         \
+           TAILQ_NEXT((elm), field)->field.tqe_prev !=                 \
+            &((elm)->field.tqe_next))                                  \
+               panic("Bad link elm %p next->prev != elm", (elm));      \
+} while (0)
+
+#define        QMD_TAILQ_CHECK_PREV(elm, field) do {                           \
+       if (*(elm)->field.tqe_prev != (elm))                            \
+               panic("Bad link elm %p prev->next != elm", (elm));      \
+} while (0)
+#else
+#define        QMD_TAILQ_CHECK_HEAD(head, field)
+#define        QMD_TAILQ_CHECK_TAIL(head, headname)
+#define        QMD_TAILQ_CHECK_NEXT(elm, field)
+#define        QMD_TAILQ_CHECK_PREV(elm, field)
+#endif /* (_KERNEL && INVARIANTS) */
+
+#define        TAILQ_CONCAT(head1, head2, field) do {                          \
+       if (!TAILQ_EMPTY(head2)) {                                      \
+               *(head1)->tqh_last = (head2)->tqh_first;                \
+               (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \
+               (head1)->tqh_last = (head2)->tqh_last;                  \
+               TAILQ_INIT((head2));                                    \
+               QMD_TRACE_HEAD(head1);                                  \
+               QMD_TRACE_HEAD(head2);                                  \
+       }                                                               \
+} while (0)
+
+#define        TAILQ_EMPTY(head)       ((head)->tqh_first == NULL)
+
+#define        TAILQ_FIRST(head)       ((head)->tqh_first)
+
+#define        TAILQ_FOREACH(var, head, field)                                 \
+       for ((var) = TAILQ_FIRST((head));                               \
+           (var);                                                      \
+           (var) = TAILQ_NEXT((var), field))
+
+#define        TAILQ_FOREACH_SAFE(var, head, field, tvar)                      \
+       for ((var) = TAILQ_FIRST((head));                               \
+           (var) && ((tvar) = TAILQ_NEXT((var), field), 1);            \
+           (var) = (tvar))
+
+#define        TAILQ_FOREACH_REVERSE(var, head, headname, field)               \
+       for ((var) = TAILQ_LAST((head), headname);                      \
+           (var);                                                      \
+           (var) = TAILQ_PREV((var), headname, field))
+
+#define        TAILQ_FOREACH_REVERSE_SAFE(var, head, headname, field, tvar)    \
+       for ((var) = TAILQ_LAST((head), headname);                      \
+           (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1);  \
+           (var) = (tvar))
+
+#define        TAILQ_INIT(head) do {                                           \
+       TAILQ_FIRST((head)) = NULL;                                     \
+       (head)->tqh_last = &TAILQ_FIRST((head));                        \
+       QMD_TRACE_HEAD(head);                                           \
+} while (0)
+
+#define        TAILQ_INSERT_AFTER(head, listelm, elm, field) do { /* link elm right after listelm */ \
+       QMD_TAILQ_CHECK_NEXT(listelm, field);                           \
+       if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\
+               TAILQ_NEXT((elm), field)->field.tqe_prev =              \
+                   &TAILQ_NEXT((elm), field);                          \
+       else {  /* listelm was the tail: elm becomes the new tail */    \
+               (head)->tqh_last = &TAILQ_NEXT((elm), field);           \
+               QMD_TRACE_HEAD(head);                                   \
+       }                                                               \
+       TAILQ_NEXT((listelm), field) = (elm);                           \
+       (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field);          \
+       QMD_TRACE_ELEM(&(elm)->field);                                  \
+       QMD_TRACE_ELEM(&(listelm)->field); /* parenthesized like elm */ \
+} while (0)
+
+#define        TAILQ_INSERT_BEFORE(listelm, elm, field) do { /* link elm right before listelm */ \
+       QMD_TAILQ_CHECK_PREV(listelm, field);                           \
+       (elm)->field.tqe_prev = (listelm)->field.tqe_prev;              \
+       TAILQ_NEXT((elm), field) = (listelm);                           \
+       *(listelm)->field.tqe_prev = (elm); /* predecessor's next now points at elm */ \
+       (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field);          \
+       QMD_TRACE_ELEM(&(elm)->field);                                  \
+       QMD_TRACE_ELEM(&(listelm)->field); /* parenthesized like elm */ \
+} while (0)
+
+#define        TAILQ_INSERT_HEAD(head, elm, field) do {                        \
+       QMD_TAILQ_CHECK_HEAD(head, field);                              \
+       if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL)   \
+               TAILQ_FIRST((head))->field.tqe_prev =                   \
+                   &TAILQ_NEXT((elm), field);                          \
+       else                                                            \
+               (head)->tqh_last = &TAILQ_NEXT((elm), field);           \
+       TAILQ_FIRST((head)) = (elm);                                    \
+       (elm)->field.tqe_prev = &TAILQ_FIRST((head));                   \
+       QMD_TRACE_HEAD(head);                                           \
+       QMD_TRACE_ELEM(&(elm)->field);                                  \
+} while (0)
+
+#define        TAILQ_INSERT_TAIL(head, elm, field) do {                        \
+       QMD_TAILQ_CHECK_TAIL(head, field);                              \
+       TAILQ_NEXT((elm), field) = NULL;                                \
+       (elm)->field.tqe_prev = (head)->tqh_last;                       \
+       *(head)->tqh_last = (elm);                                      \
+       (head)->tqh_last = &TAILQ_NEXT((elm), field);                   \
+       QMD_TRACE_HEAD(head);                                           \
+       QMD_TRACE_ELEM(&(elm)->field);                                  \
+} while (0)
+
+#define        TAILQ_LAST(head, headname)                                      \
+       (*(((struct headname *)((head)->tqh_last))->tqh_last))
+
+#define        TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)
+
+#define        TAILQ_PREV(elm, headname, field)                                \
+       (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))
+
+#define        TAILQ_REMOVE(head, elm, field) do {                             \
+       QMD_TAILQ_CHECK_NEXT(elm, field);                               \
+       QMD_TAILQ_CHECK_PREV(elm, field);                               \
+       if ((TAILQ_NEXT((elm), field)) != NULL)                         \
+               TAILQ_NEXT((elm), field)->field.tqe_prev =              \
+                   (elm)->field.tqe_prev;                              \
+       else {                                                          \
+               (head)->tqh_last = (elm)->field.tqe_prev;               \
+               QMD_TRACE_HEAD(head);                                   \
+       }                                                               \
+       *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field);              \
+       TRASHIT((elm)->field.tqe_next);                                 \
+       TRASHIT((elm)->field.tqe_prev);                                 \
+       QMD_TRACE_ELEM(&(elm)->field);                                  \
+} while (0)
+
+
+#ifdef _KERNEL
+
+/*
+ * XXX insque() and remque() are an old way of handling certain queues.
+ * They bogusly assume that all queue heads look alike.
+ */
+
+struct quehead {
+       struct quehead *qh_link;
+       struct quehead *qh_rlink;
+};
+
+#ifdef __CC_SUPPORTS___INLINE
+
+static __inline void
+insque(void *a, void *b)
+{
+       struct quehead *element = (struct quehead *)a,
+                *head = (struct quehead *)b;
+
+       element->qh_link = head->qh_link;
+       element->qh_rlink = head;
+       head->qh_link = element;
+       element->qh_link->qh_rlink = element;
+}
+
+static __inline void
+remque(void *a)
+{
+       struct quehead *element = (struct quehead *)a;
+
+       element->qh_link->qh_rlink = element->qh_rlink;
+       element->qh_rlink->qh_link = element->qh_link;
+       element->qh_rlink = 0;
+}
+
+#else /* !__CC_SUPPORTS___INLINE */
+
+void   insque(void *a, void *b);
+void   remque(void *a);
+
+#endif /* __CC_SUPPORTS___INLINE */
+
+#endif /* _KERNEL */
+
+#endif /* !_SYS_QUEUE_H_ */
diff --git a/sys/sys/syslog.h b/sys/sys/syslog.h
new file mode 100644 (file)
index 0000000..143df1f
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef _SYS_SYSLOG_H_
+#define _SYS_SYSLOG_H_
+/* XXX find linux equivalent */
+#define LOG_SECURITY 0
+#define LOG_NOTICE 0
+#define LOG_DEBUG 0
+#endif /* _SYS_SYSLOG_H_ */
diff --git a/sys/sys/systm.h b/sys/sys/systm.h
new file mode 100644 (file)
index 0000000..e98335e
--- /dev/null
@@ -0,0 +1,126 @@
+#ifndef _SYS_SYSTM_H_
+#define _SYS_SYSTM_H_
+
+#define CALLOUT_ACTIVE          0x0002 /* callout is currently active */
+#define CALLOUT_MPSAFE          0x0008 /* callout handler is mp safe */
+
+#ifndef _WIN32 /* this is the linux version */
+/* callout support, in <sys/callout.h> on FreeBSD */
+/*
+ * callout support on linux module is done using timers
+ */
+#include <linux/timer.h>
+#ifdef LINUX_24
+#include <linux/sched.h>        /* jiffies definition is here in 2.4 */
+#endif
+#define callout timer_list
+static __inline int
+callout_reset_on(struct callout *co, int ticks, void (*fn)(void *), void *arg, int cpu)
+{
+        co->expires = jiffies + ticks;
+        co->function = (void (*)(unsigned long))fn;
+        co->data = (unsigned long)arg;
+       /*
+        * Linux 2.6.31 and above has add_timer_on(co, cpu),
+        * otherwise add_timer() always schedules a callout on the same
+        * CPU used the first time, so we don't need more.
+        */
+        add_timer(co);
+        return 0;
+}
+
+#define callout_init(co, safe)  init_timer(co) /* 'safe' is ignored */
+#define callout_drain(co)       del_timer(co)  /* NOTE(review): identical to callout_stop; presumably does not wait for a handler that is already running -- confirm whether del_timer_sync() is needed */
+#define callout_stop(co)        del_timer(co)  /* deactivate the timer */
+
+#else /* _WIN32 */
+#include <ndis.h>
+
+/* This is the windows part for callout support */
+struct callout {
+       KTIMER thetimer;
+       KDPC timerdpc;
+       int dpcinitialized;
+       LARGE_INTEGER duetime;
+};
+
+void dummynet (void*);
+VOID dummynet_dpc(
+    __in struct _KDPC  *Dpc,
+    __in_opt PVOID  DeferredContext,
+    __in_opt PVOID  SystemArgument1,
+    __in_opt PVOID  SystemArgument2
+    );
+
+VOID ipfw_dpc(
+    __in struct _KDPC  *Dpc,
+    __in_opt PVOID  DeferredContext,
+    __in_opt PVOID  SystemArgument1,
+    __in_opt PVOID  SystemArgument2
+    );
+
+/* callout_reset must handle two problems:
+ * - dummynet() scheduler must be run always on the same processor
+ * because do_gettimeofday() is based on cpu performance counter, and
+ * _occasionally_ can leap backward in time if we query another cpu.
+ * typically this won't happen that much, and the cpu will almost always
+ * be the same even without the affinity restriction, but better to be sure.
+ * - ipfw_tick() does not have the granularity requirements of dummynet()
+ * but we need to pass a pointer as argument.
+ *
+ * for these reasons, if we are called for dummynet() timer,
+ * KeInitializeDpc is called only once as it should be, and the thread
+ * is forced on cpu0 (which is always present), while if we're called
+ * for ipfw_tick(), we re-initialize the DPC each time, using
+ * parameter DeferredContext to pass the needed pointer. since this
+ * timer is called only once a sec, this won't hurt that much.
+ */
+static __inline int
+callout_reset_on(struct callout *co, int ticks, void (*fn)(void *), void *arg, int cpu)
+{
+       if(fn == &dummynet)
+       {
+               if(co->dpcinitialized == 0)     /* dummynet DPC is set up only once */
+               {
+                       KeInitializeDpc(&co->timerdpc, dummynet_dpc, NULL);
+                       KeSetTargetProcessorDpc(&co->timerdpc, cpu);
+                       co->dpcinitialized = 1;
+               }
+       }
+       else
+       {
+               KeInitializeDpc(&co->timerdpc, ipfw_dpc, arg);  /* re-initialized on every call so that arg reaches the DPC */
+       }
+       co->duetime.QuadPart = -(LONGLONG)ticks * 10000;        /* widen before scaling: the old int product overflowed for large tick counts; negative DueTime is relative (see KeSetTimer) */
+       KeSetTimer(&co->thetimer, co->duetime, &co->timerdpc);
+       return 0;       /* always reports success */
+}
+
+static __inline void
+callout_init(struct callout* co, int safe)
+{
+       printf("%s: initializing timer at %p\n",__FUNCTION__,co);
+       KeInitializeTimer(&co->thetimer);
+}
+
+static __inline int
+callout_drain(struct callout* co)      /* cancel co's timer, retrying until a cancel succeeds; always returns 0 */
+{
+       BOOLEAN canceled = KeCancelTimer(&co->thetimer);
+       while (canceled != TRUE)        /* NOTE(review): busy-waits; looks like it never exits if the timer is not pending and is never re-armed -- confirm against callers */
+       {
+               canceled = KeCancelTimer(&co->thetimer);
+       }
+       printf("%s: stopping timer at %p\n",__FUNCTION__,co);
+       return 0;
+}
+
+static __inline int
+callout_stop(struct callout* co)
+{
+       return callout_drain(co);
+}
+
+#endif /* _WIN32 */
+
+#endif /* _SYS_SYSTM_H_ */
diff --git a/sys/sys/taskqueue.h b/sys/sys/taskqueue.h
new file mode 100644 (file)
index 0000000..43efdd5
--- /dev/null
@@ -0,0 +1,34 @@
+#ifndef _SYS_TASKQUEUE_H_
+#define _SYS_TASKQUEUE_H_
+
+/*
+ * Remap taskqueue to direct calls
+ */
+
+#ifdef _WIN32
+struct task {
+       void (*func)(void*, int);
+};
+#define taskqueue_enqueue(tq, ta)      (ta)->func(NULL,1)
+#define TASK_INIT(a,b,c,d) do {                                \
+       (a)->func = (c); } while (0)
+#else
+struct task {
+       void (*func)(void);
+};
+#define taskqueue_enqueue(tq, ta)      (ta)->func()
+#define TASK_INIT(a,b,c,d) do { /* b and d are ignored */      \
+       (a)->func = (void (*)(void))(c); } while (0)
+
+#endif
+#define taskqueue_create_fast(_a, _b, _c, _d)  NULL
+#define taskqueue_start_threads(_a, _b, _c, _d)
+
+#define        taskqueue_drain(_a, _b) /* XXX to be completed */
+#define        taskqueue_free(_a)      /* XXX to be completed */
+
+#define PRI_MIN                 (0)             /* Highest priority. */
+#define PRI_MIN_ITHD            (PRI_MIN)
+#define PI_NET                  (PRI_MIN_ITHD + 16)
+
+#endif /* !_SYS_TASKQUEUE_H_ */
diff --git a/tcc-0.9.25-bsd.zip b/tcc-0.9.25-bsd.zip
new file mode 100644 (file)
index 0000000..06d7c37
Binary files /dev/null and b/tcc-0.9.25-bsd.zip differ
diff --git a/tcc_glue.h b/tcc_glue.h
new file mode 100644 (file)
index 0000000..db757ed
--- /dev/null
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2010 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * headers to build userland ipfw under tcc.
+ */
+#ifndef _TCC_GLUE_H
+#define        _TCC_GLUE_H
+
+//#define      __restrict
+#define        NULL    ((void *)0)
+typedef int size_t;
+typedef unsigned char  u_char;
+typedef unsigned char  uint8_t;
+typedef unsigned char  u_int8_t;
+typedef unsigned short u_short;
+typedef unsigned short uint16_t;
+typedef unsigned short u_int16_t;
+typedef int            __int32_t;
+typedef int            int32_t;
+typedef int            socklen_t;
+typedef int            pid_t;
+typedef unsigned int   time_t;
+typedef unsigned int   uint;
+typedef unsigned int   u_int;
+typedef unsigned int   uint32_t;
+typedef unsigned int   u_int32_t;
+typedef unsigned int   gid_t;
+typedef unsigned int   uid_t;
+typedef unsigned long  u_long;
+typedef unsigned long  uintptr_t;
+typedef long long int  int64_t;
+typedef unsigned long long     int uint64_t;
+typedef unsigned long long     int u_int64_t;
+
+typedef uint32_t       in_addr_t;
+struct in_addr {
+       uint32_t        s_addr;
+};
+struct sockaddr_in {
+       uint8_t _sin_len;
+        uint8_t        sin_family;
+        uint16_t       sin_port;
+        struct  in_addr sin_addr;
+        char    sin_zero[8];
+};
+#define IFNAMSIZ       16
+#define INET6_ADDRSTRLEN       64
+
+struct in6_addr {
+        union {
+                uint8_t         __s6_addr8[16];
+                uint16_t        __s6_addr16[8];
+                uint32_t        __s6_addr32[4];
+        } __u6; // _addr;                    /* 128-bit IP6 address */
+};
+
+
+#define LITTLE_ENDIAN 1234
+#define BYTE_ORDER LITTLE_ENDIAN
+
+/* to be revised */
+#define        EX_OK           0
+#define        EX_DATAERR      1
+#define        EX_OSERR        2
+#define        EX_UNAVAILABLE  3
+#define        EX_USAGE        4
+#define        EX_NOHOST       5
+
+#define        EEXIST          1
+#define        EINVAL          2
+#define        ERANGE          3
+#define        ESRCH           4
+
+#define        IPPROTO_IP              1
+#define        IPPROTO_IPV6            2
+#define        IPPROTO_RAW             100
+
+#define        IPTOS_LOWDELAY          100
+#define        IPTOS_MINCOST           101
+#define        IPTOS_RELIABILITY       102
+#define        IPTOS_THROUGHPUT        103
+#define        SOCK_RAW                12
+#define        AF_INET                 2
+#define        AF_INET6                28
+
+#define        INADDR_ANY              0
+
+
+#define bcmp(src, dst, len)    memcmp(src, dst, len)
+#define bcopy(src, dst, len)   memcpy(dst, src, len)
+#define bzero(p, len)  memset(p, 0, len)
+#define index(s, c)    strchr(s, c)
+
+char *strsep(char **stringp, const char *delim);
+
+void    warn(const char *, ...);
+//void    warnx(const char *, ...);
+#define warnx warn
+void    err(int, const char *, ...);
+#define        errx err
+
+uint16_t       htons(uint16_t)__attribute__ ((stdcall));       /* all prototypes in this group are declared stdcall */
+uint16_t       ntohs(uint16_t)__attribute__ ((stdcall));
+uint32_t       htonl(uint32_t)__attribute__ ((stdcall));
+uint32_t       ntohl(uint32_t)__attribute__ ((stdcall));
+int inet_aton(const char *cp, struct in_addr *pin)__attribute__ ((stdcall));
+char * inet_ntoa(struct in_addr)__attribute__ ((stdcall));
+const char * inet_ntop(int af, const void * src, char * dst,
+         socklen_t size)__attribute__ ((stdcall));
+int inet_pton(int af, const char * src, void * dst)__attribute__ ((stdcall));
+
+struct group {
+       gid_t   gr_gid;
+       char    gr_name[16];
+};
+struct passwd {
+       uid_t   pw_uid;
+       char    pw_name[16];
+};
+
+#define getpwnam(s)    (NULL)
+#define getpwuid(s)    (NULL)
+
+#define getgrnam(x) (NULL)
+#define getgrgid(x) (NULL)
+
+int getopt(int argc, char * const argv[], const char *optstring);
+
+int getsockopt(int s, int level, int optname, void * optval,
+         socklen_t * optlen);
+
+int setsockopt(int s, int level, int optname, const void *optval,
+         socklen_t optlen);
+
+struct  protoent {
+        char    *p_name;           /* official protocol name */
+        char    **p_aliases;  /* alias list */
+        short   p_proto;                /* protocol # */
+};
+
+struct  servent {
+        char    *s_name;           /* official service name */
+        char    **s_aliases;  /* alias list */
+        short   s_port;                 /* port # */
+        char    *s_proto;          /* protocol to use */
+};
+
+struct  hostent {
+        char    *h_name;           /* official name of host */
+        char    **h_aliases;  /* alias list */
+        short   h_addrtype;             /* host address type */
+        short   h_length;               /* length of address */
+        char    **h_addr_list; /* list of addresses */
+#define h_addr  h_addr_list[0]          /* address, for backward compat */
+};
+
+struct hostent* gethostbyaddr(const char* addr, int len, int type)__attribute__ ((stdcall));
+struct hostent* gethostbyname(const char *name)__attribute__ ((stdcall));
+
+struct protoent* getprotobynumber(int number)__attribute__ ((stdcall));
+struct protoent* getprotobyname(const char* name)__attribute__ ((stdcall));
+
+struct servent* getservbyport(int port, const char* proto)__attribute__ ((stdcall));
+struct servent* getservbyname(const char* name, const char* proto) __attribute__ ((stdcall));
+
+extern int optind;
+extern char *optarg;
+
+#include <windef.h>
+
+#define WSADESCRIPTION_LEN      256
+#define WSASYS_STATUS_LEN       128
+
+typedef struct WSAData {
+        WORD                    wVersion;
+        WORD                    wHighVersion;
+        char                    szDescription[WSADESCRIPTION_LEN+1];
+        char                    szSystemStatus[WSASYS_STATUS_LEN+1];
+        unsigned short          iMaxSockets;
+        unsigned short          iMaxUdpDg;
+        char FAR *              lpVendorInfo;
+} WSADATA, * LPWSADATA;
+
+int WSAStartup(
+    WORD wVersionRequested,
+    LPWSADATA lpWSAData
+    );
+
+int
+WSACleanup(void);
+
+int WSAGetLastError(void);     /* explicit (void): a real prototype, matching WSACleanup above */
+
+/* return error on process handling */
+#define        pipe(f)         (-1)
+#define        kill(p, s)      (-1)
+#define        waitpid(w,s,o)  (-1)
+#define fork(x)                (-1)
+#define execvp(f, a)   (-1)
+
+#define _W_INT(i)       (i)
+#define _WSTATUS(x)     (_W_INT(x) & 0177)
+#define WIFEXITED(x)    (_WSTATUS(x) == 0)
+#define WEXITSTATUS(x)  (_W_INT(x) >> 8)
+#define _WSTOPPED       0177            /* _WSTATUS if process is stopped */
+#define WIFSIGNALED(x)  (_WSTATUS(x) != _WSTOPPED && _WSTATUS(x) != 0)
+#define WTERMSIG(x)     (_WSTATUS(x))
+
+#endif /* _TCC_GLUE_H */
diff --git a/test/Makefile b/test/Makefile
new file mode 100644 (file)
index 0000000..9ed47f8
--- /dev/null
@@ -0,0 +1,53 @@
+#
+# $Id: Makefile 5626 2010-03-04 21:55:22Z luigi $
+#
+# Makefile for building userland tests
+# this is written in a form compatible with gmake
+
+SCHED_SRCS = test_dn_sched.c
+SCHED_SRCS += dn_sched_fifo.c
+SCHED_SRCS += dn_sched_wf2q.c
+SCHED_SRCS += dn_sched_qfq.c
+SCHED_SRCS += dn_sched_rr.c
+SCHED_SRCS += dn_heap.c
+SCHED_SRCS += main.c
+
+SCHED_OBJS=$(SCHED_SRCS:.c=.o)
+
+HEAP_SRCS = dn_heap.c test_dn_heap.c
+HEAP_OBJS=$(HEAP_SRCS:.c=.o)
+
+VPATH= .:../dummynet2
+
+#CFLAGS = -I../dummynet2/include -I. -Wall -Werror -O3 -DIPFW
+CFLAGS = -I. -I../dummynet2/include/netinet/ipfw -DIPFW
+CFLAGS +=  -Wall -Werror
+CFLAGS += -g -O3
+TARGETS= test_sched # no test_heap by default
+
+all: $(TARGETS)
+
+test_heap : $(HEAP_OBJS)
+       $(CC) -o $@ $(HEAP_OBJS)
+
+test_sched : $(SCHED_OBJS)
+       $(CC) -o $@ $(SCHED_OBJS)
+
+$(SCHED_OBJS): dn_test.h
+main.o: mylist.h
+
+clean:
+       - rm *.o $(TARGETS) *.core
+
+ALLSRCS = $(SCHED_SRCS) dn_test.h mylist.h \
+       dn_sched.h dn_heap.h ip_dn_private.h Makefile
+TMPBASE = /tmp/testXYZ
+TMPDIR = $(TMPBASE)/test
+
+tgz:
+       -rm -rf $(TMPDIR)
+       mkdir -p $(TMPDIR)
+       -cp -p $(ALLSRCS) $(TMPDIR)
+       -(cd ..; cp -p $(ALLSRCS) $(TMPDIR))
+       ls -la  $(TMPDIR)
+       (cd $(TMPBASE); tar cvzf /tmp/test.tgz test)
diff --git a/test/basic_ipfw.sh b/test/basic_ipfw.sh
new file mode 100755 (executable)
index 0000000..08b66f9
--- /dev/null
@@ -0,0 +1,72 @@
+#!/bin/sh
+
+IPFW=./ipfw/ipfw
+PING=/bin/ping
+RH=127.0.0.1           # remote host
+R=10                   # test rule number
+P=1                    # test pipe number
+
+abort()
+{ 
+echo $* 
+}
+
+#insmod dummynet2/ipfw_mod.ko
+#$IPFW show > /dev/null
+#$IPFW pipe show 
+echo "Flushing rules, do you agree ?"
+$IPFW flush
+
+# test_msg rule counter
+clean() 
+{ 
+       $IPFW delete $R 2> /dev/null
+       $IPFW pipe $P delete 2> /dev/null
+}
+
+# simple counter/allow test
+echo -n "counter/allow test..."
+clean
+$IPFW add $R allow icmp from any to 127.0.0.1 > /dev/null
+$PING -f -c100 $RH > /dev/null
+counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f3`
+[ ! $counter -eq 400 ] && abort "Wrong counter $counter 400"
+echo "...OK"
+
+# simple drop test
+echo -n "deny test..."
+clean
+$IPFW add $R deny icmp from any to 127.0.0.1 > /dev/null
+$PING -f -c10 -W 1 $RH > /dev/null
+counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4`
+[ ! $counter -eq 10 ] && abort "Wrong counter $counter 10"
+echo "...OK"
+
+# pipe delay test: read the rule counter right after the pings, then again after waiting out the pipe delay
+echo -n "pipe delay test..."
+clean
+$IPFW pipe $P config delay 2000ms >/dev/null
+$IPFW add $R pipe $P icmp from any to $RH >/dev/null
+$PING -f -c10 -W 1 $RH > /dev/null
+counter1=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4`
+sleep 2        # matches the 2000ms delay configured on the pipe
+counter2=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4`
+[ ! $counter1 -eq 10 ] && abort "Wrong counter $counter1 10"   # report the value actually tested
+[ ! $counter2 -eq 20 ] && abort "Wrong counter $counter2 20"   # report the value actually tested
+echo "...OK"
+
+# pipe bw test
+echo -n "pipe bw test..."
+clean
+$IPFW pipe $P config bw 2Kbit/s >/dev/null
+$IPFW add $R pipe $P icmp from any to $RH >/dev/null
+$PING -i 0.1 -c10 -W 1 $RH > /dev/null
+counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4`
+[ $counter -gt 30 ] && abort "Wrong counter $counter should be < 30"
+sleep 1
+counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4`
+[ $counter -gt 30 ] && abort "Wrong counter $counter should be < 30"
+echo "...OK"
+
+# Final clean
+clean
diff --git a/test/dn_test.h b/test/dn_test.h
new file mode 100644 (file)
index 0000000..f2a4a51
--- /dev/null
@@ -0,0 +1,157 @@
+/*
+ * $Id: dn_test.h 5626 2010-03-04 21:55:22Z luigi $
+ *
+ * userspace compatibility code for dummynet schedulers
+ */
+
+#ifndef _DN_TEST_H
+#define _DN_TEST_H
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>   /* bzero, ffs, ... */
+#include <string.h>    /* strcmp */
+#include <errno.h>
+#include <sys/queue.h>
+#include <sys/time.h>
+
+extern int debug;
+#define ND(fmt, args...) do {} while (0)
+#define D1(fmt, args...) do {} while (0)
+#define D(fmt, args...) fprintf(stderr, "%-8s " fmt "\n",      \
+        __FUNCTION__, ## args)
+#define DX(lev, fmt, args...) do {              \
+        if (debug > lev) D(fmt, ## args); } while (0)
+
+
+#define offsetof(t,m) (int)((&((t *)0L)->m))
+
+#include <mylist.h>
+
+/* prevent include of other system headers */
+#define        _NETINET_IP_VAR_H_      /* ip_fw_args */
+#define _IPFW2_H
+#define _SYS_MBUF_H_
+
+enum   {
+       DN_QUEUE,
+};
+
+enum   {
+       DN_SCHED_FIFO,
+       DN_SCHED_WF2QP,
+};
+
+struct dn_id {
+       int type, subtype, len, id;
+};
+struct dn_fs {
+       int par[4];     /* flowset parameters */
+
+       /* simulation entries.
+        * 'index' is not strictly necessary
+        * y is used for the inverse mapping ,
+        */
+       int index;
+       int y;  /* inverse mapping */
+       int base_y;     /* inverse mapping */
+       int next_y;     /* inverse mapping */
+       int n_flows;
+       int first_flow;
+       int next_flow;  /* first_flow + n_flows */
+       /*
+        * when generating, let 'cur' go from 0 to n_flows-1,
+        * then point to flow first_flow + cur
+        */
+       int     cur;
+};
+struct dn_sch {
+};
+struct dn_flow {
+       struct dn_id oid;
+       int length;
+       int len_bytes;
+       int drops;
+       uint64_t tot_bytes;
+       uint32_t flow_id;
+       struct list_head h;     /* used by the generator */
+};
+struct dn_link {
+};
+
+struct ip_fw_args {
+};
+
+struct mbuf {
+        struct {
+                int len;
+        } m_pkthdr;
+        struct mbuf *m_nextpkt;
+       int flow_id;    /* for testing, index of a flow */
+       //int flowset_id;       /* for testing, index of a flowset */
+       void *cfg;      /* config args */
+};
+
+#define MALLOC_DECLARE(x)
+/*
+ * Userspace stand-in for KASSERT: when x is false, print the
+ * parenthesized printf argument list y and terminate the program.
+ * The inner braces keep exit() under the if, so a true assertion
+ * falls through instead of ending the program.
+ */
+#define KASSERT(x, y)  do { if (!(x)) { printf y ; exit(0); } } while (0)
+struct ipfw_flow_id {
+};
+
+typedef void * module_t;
+struct _md_t {
+       const char *name;
+       int (*f)(module_t, int, void *);
+       void *p;
+};
+typedef struct _md_t moduledata_t;
+#define DECLARE_MODULE(name, b, c, d)  \
+       moduledata_t *_g_##name = & b
+#define MODULE_DEPEND(a, b, c, d, e)
+
+#ifdef IPFW
+#include <dn_heap.h>
+#include <ip_dn_private.h>
+#include <dn_sched.h>
+#else
+struct dn_queue {
+        struct dn_fsk *fs;             /* parent flowset. */
+        struct dn_sch_inst *_si;       /* parent sched instance. */
+};
+struct dn_schk {
+};
+struct dn_fsk {
+       struct dn_fs fs;
+       struct dn_schk *sched;
+};
+struct dn_sch_inst {
+       struct dn_schk *sched;
+};
+struct dn_alg {
+       int type;
+       const char *name;
+       void *enqueue, *dequeue;
+       int q_datalen, si_datalen, schk_datalen;
+       int (*config)(struct dn_schk *);
+       int (*new_sched)(struct dn_sch_inst *);
+       int (*new_fsk)(struct dn_fsk *);
+        int (*new_queue)(struct dn_queue *q);
+};
+
+#endif
+
+#ifndef __FreeBSD__
+int fls(int);
+#endif
+
+/*
+ * Append mbuf m at the tail of queue q: m becomes the new tail and its
+ * m_nextpkt link is cleared.
+ */
+static inline void
+mq_append(struct mq *q, struct mbuf *m)
+{
+       if (q->head != NULL)
+               q->tail->m_nextpkt = m;
+       else
+               q->head = m;
+       q->tail = m;
+       m->m_nextpkt = NULL;
+}
+
+#endif /* _DN_TEST_H */
diff --git a/test/dynrules.sh b/test/dynrules.sh
new file mode 100644 (file)
index 0000000..98f5fe6
--- /dev/null
@@ -0,0 +1,20 @@
+#!/bin/sh
+#
+# 20100507 marta, quick test for dyn rules
+# ./ipfw/ipfw -d show |grep \ 80
+
+IPFW_MOD=dummynet2/ipfw_mod.ko
+IPFW=ipfw/ipfw
+
+# main
+# remove any previous loaded module
+/sbin/rmmod ipfw_mod 
+/sbin/insmod ${IPFW_MOD}
+echo "25" >  /sys/module/ipfw_mod/parameters/dyn_ack_lifetime
+${IPFW} add 1 check-state
+${IPFW} add 9 allow all from any to any keep-state
+${IPFW} add 10 allow all from any to onelab1.iet.unipi.it keep-state
+
+telnet 72.14.234.104 80 
+
+
diff --git a/test/interpolation.c b/test/interpolation.c
new file mode 100644 (file)
index 0000000..d6731f1
--- /dev/null
@@ -0,0 +1,335 @@
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* gcc interpolation.c -o interpolation */
+
+/*
+ * Minimal stand-in for err(3): print the formatted message followed by
+ * the description of the current errno on stderr, then exit with
+ * status eval.  Does not return.
+ */
+void
+err(int eval, const char *fmt, ...)
+{
+       int saved_errno = errno;        /* stdio calls may overwrite errno */
+       va_list ap;
+
+       if (fmt != NULL) {
+               va_start(ap, fmt);
+               vfprintf(stderr, fmt, ap);
+               va_end(ap);
+               fprintf(stderr, ": ");
+       }
+       fprintf(stderr, "%s\n", strerror(saved_errno));
+       exit(eval);
+}
+/*
+ * Minimal stand-in for errx(3): print the formatted message on stderr
+ * and exit with status eval.  Does not return.
+ */
+void
+errx(int eval, const char *fmt, ...)
+{
+       va_list ap;
+
+       if (fmt != NULL) {
+               va_start(ap, fmt);
+               vfprintf(stderr, fmt, ap);
+               va_end(ap);
+       }
+       fprintf(stderr, "\n");
+       exit(eval);
+}
+        
+
+#define ED_MAX_SAMPLES_NO 1000
+#define ED_MAX_LINE_LEN 128
+#define EX_DATAERR 1
+#define EX_UNAVAILABLE  3
+#define ED_TOK_DELAY    "delay"
+#define ED_TOK_PROB     "prob"
+#define ED_SEPARATORS   " \t\n"
+#define ED_TOK_PROFILE_NO "profile_no"
+
+
+struct point {
+       double prob;            /* y */
+       double delay;           /* x */
+};
+
+struct profile {
+        char    filename[128];                   /* profile filename */
+        int     samples[ED_MAX_SAMPLES_NO+1];    /* may be shorter */
+        int     samples_no;                     /* actual len of samples[] */
+};
+
+/*
+ * Validation stub: always returns 1.
+ *
+ * The disabled code below would accept only strings made of digits with
+ * at most one '.'; it is compiled out with #if 0, so callers currently
+ * get no validation at all.
+ * NOTE(review): the disabled code calls strlen(s), so it would need a
+ * NULL check before being re-enabled if callers can pass a NULL string.
+ */
+static int
+is_valid_number(const char *s)
+{
+#if 0
+        int i, dots_found = 0;
+        int len = strlen(s);
+
+        for (i = 0; i<len; ++i)
+                if (!isdigit(s[i]) && (s[i] !='.' || ++dots_found > 1))
+                        return 0;
+#endif
+        return 1;
+}
+
+/*
+ * qsort-style comparator for struct point: order by prob first and by
+ * delay when the probabilities are equal.  Returns -1, 0 or 1.
+ */
+static int
+compare_points(const void *vp1, const void *vp2)
+{
+       const struct point *a = vp1;
+       const struct point *b = vp2;
+       double diff = a->prob - b->prob;
+
+       if (diff == 0)
+               diff = a->delay - b->delay;
+       if (diff < 0)
+               return -1;
+       return (diff > 0) ? 1 : 0;
+}
+
+#define ED_EFMT(s) 1,"error in %s at line %d: "#s,filename,lineno
+
+/*
+ * The points defined by the user are stored in the points structure.
+ * The number of user defined points is stored in points_no.
+ *       We assume that the points are ordered by increasing probability
+ *       and that the last point, for the '1' value of the probability,
+ *       is defined.
+ * The user defined sampling value is stored in samples_no.
+ * The resulting samples are in the "samples" pointer, which must have
+ * room for samples_no + 1 entries.
+ * Nothing is written when there are fewer than two points or when
+ * samples_no is not positive.
+ */
+static void
+interpolate_samples(struct point *p, int points_no,
+               int *samples, int samples_no, const char *filename)
+{
+       double dy;              /* delta on the y axis */
+       double y;               /* current value of y */
+       double x;               /* current value of x */
+       double m;               /* the y slope */
+       double dp;              /* probability span of the current segment */
+       int i;                  /* samples index */
+       int curr;               /* points current index */
+
+       /* a segment needs two points, and dy divides by samples_no */
+       if (points_no < 2 || samples_no <= 0) {
+               errx(EX_DATAERR, "%s too few points or samples", filename);
+               return;         /* in case errx() returns */
+       }
+
+       dy = 1.0/samples_no;
+       y = 0;
+
+       for (i=0, curr = 0; i < samples_no; i++, y+=dy) {
+               /* Move curr to the segment that contains y, skipping
+                * the points with the same probability.  The bound on
+                * curr keeps p[curr+1] inside the array even when the
+                * last point has a probability below 1. */
+               while (curr + 2 < points_no && y >= p[curr+1].prob)
+                       curr++;
+
+               dp = p[curr+1].prob - p[curr].prob;
+               if (dp != 0) {
+                       /* compute the slope of the curve */
+                       m = (p[curr+1].delay - p[curr].delay) / dp;
+                       /* compute the x value starting from the current point */
+                       x = p[curr].delay + (y - p[curr].prob) * m;
+               } else {
+                       /* vertical step: no slope, take the end of the step */
+                       x = p[curr+1].delay;
+               }
+               samples[i] = x;
+       }
+
+       /* add the last sample */
+       samples[i] = p[curr+1].delay;
+}
+
+#if 0
+static void
+interpolate_samples_old(struct point *points, int points_no, 
+               int *samples, int samples_no, const char *filename)
+{
+       int i;          /* pointer to the sampled array */
+       int j = 0;      /* pointer to user defined samples */
+       double dy;      /* delta y */
+       double y;       /* current value of y */
+       int x;          /* computed value of x */
+       double m;       /* slope of the line */
+       double y1, x1, y2, x2;  /* two points of the current line */
+
+       /* make sure that there are enough points. */
+       /* XXX Duplicated shoule be removed */
+       if (points_no < 3)
+           errx(EX_DATAERR, "%s too few samples, need at least %d",
+               filename, 3);
+
+       qsort(points, points_no, sizeof(struct point), compare_points);
+
+       samples_no--;
+       dy = 1.0/samples_no;
+       printf("\nsamples no is %d dy is %f ", samples_no, dy);
+
+       /* start with the first two points */
+       y1 = points[j].prob * samples_no;
+       x1 = points[j].delay;
+       j++;
+       y2 = points[j].prob * samples_no;
+       x2 = points[j].delay;
+
+       m = (y2-y1)/(x2-x1);
+       printf("\nStart");
+       printf("\n\tCurrent points x1 y1 %f %f next point x2y2 %f %f m %f\n",
+                x1, y1, x2, y2, m);
+
+       y = 0;
+       x = x1;
+
+       for(i=0; i < samples_no+1; i++, y+=dy) {
+               printf("\ni:%d j:%d y:%f real y:%f", i, j, y, y*samples_no);
+               if ( (y*samples_no) >= y2 ) { /* move to the next point */
+                       j++;
+                       if ( j >= points_no ) {
+                               printf("\n\tNo more points, exit with j: %d i: %d and y:%f %f\n",
+                                        j, i, y, (y*samples_no));
+                               break;  /* no more user defined points */
+                       }
+                       /* load a new point */
+                       y1 = y2;
+                       x1 = x2;
+                       y2 = points[j].prob * samples_no;
+                       x2 = points[j].delay;
+                       m = (y2-y1)/(x2-x1);
+                       if (x1==x2) { /* m = infinito */
+                               m = -1;
+                               x = x2;
+                       }
+                       /* very small m problem */
+                       printf ("\ndelta %f\n", (y1 - y2));
+                       if (abs(y1 - y2) < 0.00001) { /* m = 0 XXX Should this magic number depend on samples_no ? */
+                               m = 0;
+                               x = x2;
+                       }
+                       printf("\n\tCurrent points x1 y1 %f %f next point x2y2 %f %f (%f/%f)=m \n",
+                                x1, y1, x2, y2, (y2-y1), (x2-x1), m);
+               }
+               printf("\n\tcompute step y %f x[%d]=%d ",
+                       y, i, x);
+               if ((m != -1) && ( m != 0 )) {
+                       x = x + (dy * samples_no)/m;
+               }
+               samples[i] = x;
+               printf(" dy %f x new %d\n", dy*samples_no, x);
+               printf(" m %f (dy * samples_no)/m %f \n", m, (dy * samples_no)/m);
+       }
+
+       x = samples[i-1];
+       printf("Finish i is %d samples_no is %d\n", i, samples_no);
+       /* The last point has a probability less than 1 */
+       for (; i <= samples_no; i++)
+               samples[i] = x;
+}
+#endif
+
+/*
+ * Parse the profile file named by p->filename and fill p->samples[]
+ * and p->samples_no.
+ *
+ * The file holds one command per line; a token starting with '#' ends
+ * the line:
+ *   profile_no N    number of samples to generate, clamped to
+ *                   1..ED_MAX_SAMPLES_NO (100 when missing or invalid)
+ *   delay | prob    header line giving the column order of the points
+ *   <a> <b>         one point, in the order given by the header
+ *
+ * The points are sorted with compare_points() before interpolation.
+ * When the file cannot be opened or holds fewer than two points,
+ * p->samples_no is set to 0 and p->samples[0] to 0.
+ */
+static void
+load_profile(struct profile *p)
+{
+       enum { MAX_POINTS_NO = 1000 };
+
+       FILE    *f;                     /* file handler */
+       char    line[ED_MAX_LINE_LEN];
+       int     lineno = 0;
+       int     do_points = 0;
+       int     delay_first = -1;
+       int     samples_no = 100;       /* used when profile_no is missing */
+       int i;
+
+       struct  point points[MAX_POINTS_NO];
+       int     points_no = 0;
+
+       char *filename = p->filename;
+
+       /* give the caller a defined result on every early return */
+       p->samples_no = 0;
+       p->samples[0] = 0;
+
+       f = fopen(filename, "r");
+       if (f == NULL) {
+               err(EX_UNAVAILABLE, "fopen: %s", filename);
+               return;         /* in case err() returns */
+       }
+
+       while (fgets(line, ED_MAX_LINE_LEN, f)) {         /* read commands */
+               char *s, *cur = line, *name = NULL, *arg = NULL;
+
+               ++lineno;
+
+               /* parse the line */
+               while (cur) {
+                       s = strsep(&cur, ED_SEPARATORS);
+                       if (s == NULL || *s == '#')
+                               break;
+                       if (*s == '\0')
+                               continue;
+                       if (arg)
+                               errx(ED_EFMT("too many arguments"));
+                       if (name == NULL)
+                               name = s;
+                       else
+                               arg = s;
+               }
+
+               if (name == NULL)
+                       continue;
+
+               if (!strcasecmp(name, ED_TOK_DELAY)) {
+                       if (do_points)
+                               errx(ED_EFMT("duplicated token: %s"), name);
+                       delay_first = 1;
+                       do_points = 1;
+                       continue;
+               } else if (!strcasecmp(name, ED_TOK_PROB)) {
+                       if (do_points)
+                               errx(ED_EFMT("duplicated token: %s"), name);
+                       delay_first = 0;
+                       do_points = 1;
+                       continue;
+               }
+               if (!strcasecmp(name, ED_TOK_PROFILE_NO)) {
+                       /* a missing argument is handled as an invalid value */
+                       double v = arg ? atof(arg) : 0;
+
+                       /* compare as double: converting an out-of-range
+                        * value to int first would be undefined */
+                       if (!(v >= 1)) {
+                               samples_no = 100;
+                               printf("invalid interpolation samples, using %d\n",
+                                        samples_no);
+                       } else if (v >= ED_MAX_SAMPLES_NO + 1) {
+                               samples_no = ED_MAX_SAMPLES_NO;
+                               printf("invalid interpolation samples, using %d\n",
+                                        samples_no);
+                       } else {
+                               samples_no = (int)v;
+                       }
+                       continue;
+
+               } else if (do_points) {
+                       if (arg == NULL || !is_valid_number(name) ||
+                           !is_valid_number(arg)) {
+                               errx(ED_EFMT("invalid point found"));
+                               continue;       /* in case errx() returns */
+                       }
+                       if (points_no >= MAX_POINTS_NO) {
+                               errx(ED_EFMT("too many points, max %d"),
+                                   MAX_POINTS_NO);
+                               continue;       /* in case errx() returns */
+                       }
+                       if (delay_first) {
+                               points[points_no].delay = atof(name);
+                               points[points_no].prob = atof(arg);
+                       } else {
+                               points[points_no].delay = atof(arg);
+                               points[points_no].prob = atof(name);
+                       }
+                       if (points[points_no].prob > 1.0)
+                               errx(ED_EFMT("probability greater than 1.0"));
+                       ++points_no;
+                       continue;
+               } else {
+                       errx(ED_EFMT("unrecognised command '%s'"), name);
+               }
+       }
+       fclose(f);
+
+       if (points_no >= 2) {
+               /* the interpolation walks the points by increasing prob */
+               qsort(points, points_no, sizeof(points[0]), compare_points);
+
+               p->samples_no = samples_no;
+               /* recognizable filler for entries not yet computed */
+               for(i=0; i < p->samples_no; i++) {
+                       p->samples[i] = 666;
+               }
+
+               /* This code assumes the user defines a value of X for the
+                * sampling value, and that:
+                * - the value stored in the emulator structure is X;
+                * - the allocated structure for the samples is X+1;
+                */
+               interpolate_samples(points, points_no, p->samples,
+                   p->samples_no, filename);
+       } else {
+               errx(EX_DATAERR, "%s too few points, need at least %d",
+                   filename, 2);
+       }
+
+       // User defined samples
+       printf("\nLoaded %d points:\n", points_no);
+       for(i=0; i < points_no; i++) {
+               printf("%f %f\n", points[i].prob, points[i].delay);
+       }
+       printf("\n");
+       printf("The sample value is %d \n", p->samples_no);
+
+}
+
+/*
+ * Load the profile named on the command line and print the
+ * interpolated samples, one "index value" pair per line.
+ * Returns -1 on a usage error or when the filename does not fit.
+ */
+int main(int argc, char **argv)
+{
+       struct profile p;
+       int i, n;
+
+       if (argc < 2) {
+               printf("Usage: ./interpolation <filename>\n");
+               return -1;
+       }
+
+       /* start from a known state for every field */
+       memset(&p, 0, sizeof(p));
+
+       /* bounded copy that always NUL-terminates, unlike strncpy */
+       n = snprintf(p.filename, sizeof(p.filename), "%s", argv[1]);
+       if (n < 0 || (size_t)n >= sizeof(p.filename)) {
+               printf("filename too long: %s\n", argv[1]);
+               return -1;
+       }
+
+       load_profile(&p);
+       printf("-----------\n");
+       for (i=0; i<=p.samples_no; i++)
+               printf("%d %d\n", i, p.samples[i]);
+       printf("-----------\n");
+       return 0;
+}
diff --git a/test/main.c b/test/main.c
new file mode 100644 (file)
index 0000000..85fc621
--- /dev/null
@@ -0,0 +1,636 @@
+/*
+ * $Id: main.c 5626 2010-03-04 21:55:22Z luigi $
+ *
+ * Testing program for schedulers
+ *
+ * The framework include a simple controller which, at each
+ * iteration, decides whether we can enqueue and/or dequeue.
+ * Then the mainloop runs the required number of tests,
+ * keeping track of statistics.
+ */
+
+#include "dn_test.h"
+
+struct q_list {
+       struct list_head h;
+};
+
+struct cfg_s {
+       int ac;
+       char * const *av;
+
+       const char *name;
+       int loops;
+       struct timeval time;
+
+       /* running counters */
+       uint32_t        _enqueue;
+       uint32_t        drop;
+       uint32_t        pending;
+       uint32_t        dequeue;
+
+       /* generator parameters */
+       int th_min, th_max;
+       int maxburst;
+       int lmin, lmax; /* packet len */
+       int flows;      /* number of flows */
+       int flowsets;   /* number of flowsets */
+       int wsum;       /* sum of weights of all flows */
+       int max_y;      /* max random number in the generation */
+       int cur_y, cur_fs;      /* used in generation, between 0 and max_y - 1 */
+       const char *fs_config; /* flowset config */
+       int can_dequeue;
+       int burst;      /* count of packets sent in a burst */
+       struct mbuf *tosend;    /* packet to send -- also flag to enqueue */
+
+       struct mbuf *freelist;
+
+       struct mbuf *head, *tail;       /* a simple tailq */
+
+       /* scheduler hooks */
+       int (*enq)(struct dn_sch_inst *, struct dn_queue *,
+               struct mbuf *);
+       struct mbuf * (*deq)(struct dn_sch_inst *);
+       /* size of the three fields including sched-specific areas */
+       int schk_len;
+       int q_len; /* size of a queue including sched-fields */
+       int si_len; /* size of a sch_inst including sched-fields */
+       char *q;        /* array of flow queues */
+               /* use a char* because size is variable */
+       struct dn_fsk *fs;      /* array of flowsets */
+       struct dn_sch_inst *si;
+       struct dn_schk *sched;
+
+       /* generator state */
+       int state;              /* 0 = going up, 1: going down */
+
+       /*
+        * We keep lists for each backlog level, and always serve
+        * the one with shortest backlog. llmask contains a bitmap
+        * of lists, and ll are the heads of the lists. The last
+        * entry (BACKLOG) contains all entries considered 'full'
+        * XXX to optimize things, entry i could contain queues with
+        * 2^{i-1}+1 .. 2^i entries.
+        */
+#define BACKLOG        30
+       uint32_t        llmask;
+       struct list_head ll[BACKLOG + 10];
+};
+
+/* FI2Q and Q2FI converts from flow_id to dn_queue and back.
+ * We cannot easily use pointer arithmetic because it is variable size.
+  */
+#define FI2Q(c, i)     ((struct dn_queue *)((c)->q + (c)->q_len * (i)))
+#define Q2FI(c, q)     (((char *)(q) - (c)->q)/(c)->q_len)
+
+int debug = 0;
+
+struct dn_parms dn_cfg;
+
+static void controller(struct cfg_s *c);
+
+/*
+ * Release a packet: count the drop, move the packet's queue to the
+ * backlog list matching its current length (only when that length is
+ * below BACKLOG) and push the mbuf on the freelist.  Always returns 0.
+ */
+int
+drop(struct cfg_s *c, struct mbuf *m)
+{
+       struct dn_queue *q = FI2Q(c, m->flow_id);
+       int len;
+
+       c->drop++;
+       len = q->ni.length; // XXX or ffs...
+
+       ND("q %p id %d current length %d", q, m->flow_id, len);
+       if (len < BACKLOG) {
+               struct list_head *entry = &q->ni.h;
+
+               /* clear the bit for list len+1, set the one for list len */
+               c->llmask = (c->llmask & ~(1<<(len+1))) | (1<<(len));
+               list_del(entry);
+               list_add_tail(entry, &c->ll[len]);
+       }
+       /* the freelist is chained through m_nextpkt */
+       m->m_nextpkt = c->freelist;
+       c->freelist = m;
+       return 0;
+}
+
+/*
+ * Enqueue packet _m.  When a scheduler hook is set (c->enq) its return
+ * value is passed through; mainloop() treats a non-zero value as a
+ * dropped packet.  Otherwise the packet is appended to the built-in
+ * tail queue and 0 is returned.
+ */
+static int
+enqueue(struct cfg_s *c, void *_m)
+{
+       struct mbuf *m = _m;
+       if (c->enq)
+               return c->enq(c->si, FI2Q(c, m->flow_id), m);
+       if (c->head == NULL)
+               c->head = m;
+       else
+               c->tail->m_nextpkt = m;
+       c->tail = m;
+       /* the tail must not keep a stale link: drop() chains freed
+        * mbufs through m_nextpkt */
+       m->m_nextpkt = NULL;
+       return 0; /* default - success */
+}
+
+/*
+ * Fetch the next packet: from the scheduler hook when c->deq is set,
+ * otherwise from the head of the built-in tail queue.
+ * Returns NULL when no packet is available.
+ */
+static void *
+dequeue(struct cfg_s *c)
+{
+       struct mbuf *pkt;
+
+       if (c->deq)
+               return c->deq(c->si);
+       pkt = c->head;
+       if (pkt == NULL)
+               return NULL;
+       /* unlink the head */
+       c->head = pkt->m_nextpkt;
+       pkt->m_nextpkt = NULL;
+       return pkt;
+}
+
+/*
+ * Run c->loops iterations.  Each iteration asks the controller what to
+ * do, then enqueues c->tosend (if set) and dequeues one packet (if
+ * c->can_dequeue is set), updating the running counters.
+ * Always returns 0.
+ */
+static int
+mainloop(struct cfg_s *c)
+{
+       int i = 0;
+       struct mbuf *m;
+
+       while (i < c->loops) {
+               /* implement hysteresis */
+               controller(c);
+               DX(3, "loop %d enq %d send %p rx %d",
+                       i, c->_enqueue, c->tosend, c->can_dequeue);
+               m = c->tosend;
+               if (m != NULL) {
+                       c->_enqueue++;
+                       if (enqueue(c, m) == 0) {
+                               ND("enqueue ok");
+                               c->pending++;
+                       } else {
+                               drop(c, m);
+                               ND("loop %d enqueue fail", i );
+                       }
+               }
+               if (c->can_dequeue) {
+                       c->dequeue++;
+                       m = dequeue(c);
+                       if (m != NULL) {
+                               c->pending--;
+                               /* drop() recycles the mbuf; undo its
+                                * drop count, this is a delivery */
+                               drop(c, m);
+                               c->drop--;      /* compensate */
+                       }
+               }
+               i++;
+       }
+       DX(1, "mainloop ends %d", i);
+       return 0;
+}
+
+/*
+ * Log the total byte count of every flow queue and the number of loops
+ * (printed only when debug > 1).  Always returns 0.
+ */
+int
+dump(struct cfg_s *c)
+{
+       int i;
+       struct dn_queue *q;
+
+       for (i=0; i < c->flows; i++) {
+               q = FI2Q(c, i);
+               /* cast so the argument matches %lld on every platform */
+               DX(1, "queue %4d tot %10lld", i, (long long)q->ni.tot_bytes);
+       }
+       DX(1, "done %d loops\n", c->loops);
+       return 0;
+}
+
+/*
+ * Interpret a number in human form.
+ *
+ * s     string to parse (base detected by strtol); may be NULL or empty
+ * next  when not NULL, receives a pointer to the text left after the
+ *       number and its suffix, or NULL when nothing is left
+ * key   name of the value, used only in log messages; may be NULL
+ *
+ * One optional suffix character is consumed:
+ *   K = *1000, M = *1000000, k = *1024, m = *1024*1024,
+ *   w = no change, n = negate the value
+ * Any other character is logged and left in place for the caller.
+ * Returns 0 when s is NULL, empty or parses to a negative number.
+ */
+static long
+getnum(const char *s, char **next, const char *key)
+{
+       const char *what = key ? key : "-";     /* never pass NULL to %s */
+       char *end = NULL;
+       int known = 1;          /* cleared when the suffix is not recognized */
+       long l;
+
+       if (next)       /* default */
+               *next = NULL;
+       if (s && *s) {
+               DX(3, "token is <%s> %s", s, what);
+               l = strtol(s, &end, 0);
+       } else {
+               DX(3, "empty string");
+               l = -1;
+       }
+       if (l < 0) {
+               DX(2, "invalid %s for %s", s ? s : "NULL", (key ? key : "") );
+               return 0;       // invalid
+       }
+       if (!end || !*end)
+               return l;
+
+       switch (*end) {
+       case 'n':
+               l = -l; /* multiply by n */
+               break;
+       case 'K':
+               l = l*1000;
+               break;
+       case 'M':
+               l = l*1000000;
+               break;
+       case 'k':
+               l = l*1024;
+               break;
+       case 'm':
+               l = l*1024*1024;
+               break;
+       case 'w':
+               break;
+       default:        /* not recognized */
+               D("suffix %s for %s, next %p", end, what, (void *)next);
+               known = 0;
+               break;
+       }
+       if (known)
+               end++;  /* skip the suffix that was consumed */
+       DX(3, "suffix now %s for %s, next %p", end, what, (void *)next);
+       if (next && *end) {
+               DX(3, "setting next to %s for %s", end, what);
+               *next = end;
+       }
+       return l;
+}
+
+/*
+ * flowsets are a comma-separated list of
+ *     weight:maxlen:flows
+ * indicating how many flows are hooked to that fs.
+ * Both weight and maxlen can be min-max-steps.
+ * In a first pass (pass == 0) we just count the number of flowsets and
+ * flows and remember the configuration string in c->fs_config;
+ * in a second pass we complete the setup from c->fs_config, filling
+ * c->fs[] and linking every queue to its parent flowset.
+ * The working copy of the string is released on every return path.
+ */
+static void
+parse_flowsets(struct cfg_s *c, const char *fs, int pass)
+{
+       char *s, *cur, *next;
+       int n_flows = 0, n_fs = 0, wsum = 0;
+       int i, j;
+       struct dn_fs *prev = NULL;
+
+       DX(3, "--- pass %d flows %d flowsets %d", pass, c->flows, c->flowsets);
+       if (pass == 0)
+               c->fs_config = fs;
+       /* strsep() modifies its input, so work on a private copy */
+       s = c->fs_config ? strdup(c->fs_config) : NULL;
+       if (s == NULL) {
+               if (pass == 0)
+                       D("no fsconfig");
+               return;
+       }
+       for (next = s; (cur = strsep(&next, ","));) {
+               char *p = NULL;
+               int w, w_h, w_steps, wi;
+               int len, len_h, l_steps, li;
+               int flows;
+
+               w = getnum(strsep(&cur, ":"), &p, "weight");
+               if (w <= 0)
+                       w = 1;
+               w_h = p ? getnum(p+1, &p, "weight_max") : w;
+               w_steps = p ? getnum(p+1, &p, "w_steps") : (w_h == w ?1:2);
+               len = getnum(strsep(&cur, ":"), &p, "len");
+               if (len <= 0)
+                       len = 1000;
+               len_h = p ? getnum(p+1, &p, "len_max") : len;
+               l_steps = p ? getnum(p+1, &p, "l_steps") : (len_h == len ? 1 : 2);
+               flows = getnum(strsep(&cur, ":"), NULL, "flows");
+               if (flows == 0)
+                       flows = 1;
+               DX(4, "weight %d..%d (%d) len %d..%d (%d) flows %d",
+                       w, w_h, w_steps, len, len_h, l_steps, flows);
+               if (w == 0 || w_h < w || len == 0 || len_h < len ||
+                               flows == 0) {
+                       /* log the string being parsed; the fs argument
+                        * is only meaningful in pass 0 */
+                       DX(4,"wrong parameters %s", c->fs_config);
+                       goto done;
+               }
+               n_flows += flows * w_steps * l_steps;
+               for (i = 0; i < w_steps; i++) {
+                       wi = w + ((w_h - w)* i)/(w_steps == 1 ? 1 : (w_steps-1));
+                       for (j = 0; j < l_steps; j++, n_fs++) {
+                               struct dn_fs *dfs;
+                               int x;
+
+                               li = len + ((len_h - len)* j)/(l_steps == 1 ? 1 : (l_steps-1));
+                               x = (wi*2048)/li;
+                               DX(3, "----- fs %4d weight %4d lmax %4d X %4d flows %d",
+                                       n_fs, wi, li, x, flows);
+                               if (pass == 0)
+                                       continue;
+                               if (c->fs == NULL || c->flowsets <= n_fs) {
+                                       D("error in number of flowsets");
+                                       goto done;
+                               }
+                               /* c->fs[] is known to be valid only here */
+                               dfs = &c->fs[n_fs].fs;
+                               wsum += wi * flows;
+                               dfs->par[0] = wi;
+                               dfs->par[1] = li;
+                               dfs->index = n_fs;
+                               dfs->n_flows = flows;
+                               dfs->cur = dfs->first_flow = prev==NULL ? 0 : prev->next_flow;
+                               dfs->next_flow = dfs->first_flow + dfs->n_flows;
+                               dfs->y = x * flows;
+                               dfs->base_y = (prev == NULL) ? 0 : prev->next_y;
+                               dfs->next_y = dfs->base_y + dfs->y;
+                               prev = dfs;
+                       }
+               }
+       }
+       c->max_y = prev ? prev->base_y + prev->y : 0;
+       c->flows = n_flows;
+       c->flowsets = n_fs;
+       c->wsum = wsum;
+       if (pass == 0)
+               goto done;
+
+       /* now link all flows to their parent flowsets */
+       DX(1,"%d flows on %d flowsets max_y %d", c->flows, c->flowsets, c->max_y);
+       for (i=0; i < c->flowsets; i++) {
+               struct dn_fs *dfs = &c->fs[i].fs;
+               DX(1, "fs %3d w %5d l %4d flow %5d .. %5d y %6d .. %6d",
+                       i, dfs->par[0], dfs->par[1],
+                       dfs->first_flow, dfs->next_flow,
+                       dfs->base_y, dfs->next_y);
+               for (j = dfs->first_flow; j < dfs->next_flow; j++) {
+                       struct dn_queue *q = FI2Q(c, j);
+                       q->fs = &c->fs[i];
+               }
+       }
+done:
+       free(s);
+}
+
+static int
+init(struct cfg_s *c)
+{
+       int i;
+       int ac = c->ac;
+       char * const *av = c->av;
+
+       c->si_len = sizeof(struct dn_sch_inst);
+       c->q_len = sizeof(struct dn_queue);
+       moduledata_t *mod = NULL;
+       struct dn_alg *p = NULL;
+
+       c->th_min = 0;
+       c->th_max = -20;/* 20 packets per flow */
+       c->lmin = c->lmax = 1280;       /* packet len */
+       c->flows = 1;
+       c->flowsets = 1;
+       c->name = "null";
+       ac--; av++;
+       while (ac > 1) {
+               if (!strcmp(*av, "-n")) {
+                       c->loops = getnum(av[1], NULL, av[0]);
+               } else if (!strcmp(*av, "-d")) {
+                       debug = atoi(av[1]);
+               } else if (!strcmp(*av, "-alg")) {
+                       extern moduledata_t *_g_dn_fifo;
+                       extern moduledata_t *_g_dn_wf2qp;
+                       extern moduledata_t *_g_dn_rr;
+                       extern moduledata_t *_g_dn_qfq;
+#ifdef WITH_KPS
+                       extern moduledata_t *_g_dn_kps;
+#endif
+                       if (!strcmp(av[1], "rr"))
+                               mod = _g_dn_rr;
+                       else if (!strcmp(av[1], "wf2qp"))
+                               mod = _g_dn_wf2qp;
+                       else if (!strcmp(av[1], "fifo"))
+                               mod = _g_dn_fifo;
+                       else if (!strcmp(av[1], "qfq"))
+                               mod = _g_dn_qfq;
+#ifdef WITH_KPS
+                       else if (!strcmp(av[1], "kps"))
+                               mod = _g_dn_kps;
+#endif
+                       else
+                               mod = NULL;
+                       c->name = mod ? mod->name : "NULL";
+                       DX(3, "using scheduler %s", c->name);
+               } else if (!strcmp(*av, "-len")) {
+                       c->lmin = getnum(av[1], NULL, av[0]);
+                       c->lmax = c->lmin;
+                       DX(3, "setting max to %d", c->th_max);
+               } else if (!strcmp(*av, "-burst")) {
+                       c->maxburst = getnum(av[1], NULL, av[0]);
+                       DX(3, "setting max to %d", c->th_max);
+               } else if (!strcmp(*av, "-qmax")) {
+                       c->th_max = getnum(av[1], NULL, av[0]);
+                       DX(3, "setting max to %d", c->th_max);
+               } else if (!strcmp(*av, "-qmin")) {
+                       c->th_min = getnum(av[1], NULL, av[0]);
+                       DX(3, "setting min to %d", c->th_min);
+               } else if (!strcmp(*av, "-flows")) {
+                       c->flows = getnum(av[1], NULL, av[0]);
+                       DX(3, "setting flows to %d", c->flows);
+               } else if (!strcmp(*av, "-flowsets")) {
+                       parse_flowsets(c, av[1], 0);
+                       DX(3, "setting flowsets to %d", c->flowsets);
+               } else {
+                       D("option %s not recognised, ignore", *av);
+               }
+               ac -= 2; av += 2;
+       }
+       if (c->maxburst <= 0)
+               c->maxburst = 1;
+       if (c->loops <= 0)
+               c->loops = 1;
+       if (c->flows <= 0)
+               c->flows = 1;
+       if (c->flowsets <= 0)
+               c->flowsets = 1;
+       if (c->lmin <= 0)
+               c->lmin = 1;
+       if (c->lmax <= 0)
+               c->lmax = 1;
+       /* multiply by N */
+       if (c->th_min < 0)
+               c->th_min = c->flows * -c->th_min;
+       if (c->th_max < 0)
+               c->th_max = c->flows * -c->th_max;
+       if (c->th_max <= c->th_min)
+               c->th_max = c->th_min + 1;
+       if (mod) {
+               p = mod->p;
+               DX(3, "using module %s f %p p %p", mod->name, mod->f, mod->p);
+               DX(3, "modname %s ty %d", p->name, p->type);
+               c->enq = p->enqueue;
+               c->deq = p->dequeue;
+               c->si_len += p->si_datalen;
+               c->q_len += p->q_datalen;
+               c->schk_len += p->schk_datalen;
+       }
+       /* allocate queues, flowsets and one scheduler */
+       c->q = calloc(c->flows, c->q_len);
+       c->fs = calloc(c->flowsets, sizeof(struct dn_fsk));
+       c->si = calloc(1, c->si_len);
+       c->sched = calloc(c->flows, c->schk_len);
+       if (c->q == NULL || c->fs == NULL) {
+               D("error allocating memory for flows");
+               exit(1);
+       }
+       c->si->sched = c->sched;
+       if (p) {
+               if (p->config)
+                       p->config(c->sched);
+               if (p->new_sched)
+                       p->new_sched(c->si);
+       }
+       /* parse_flowsets links queues to their flowsets */
+       parse_flowsets(c, av[1], 1);
+       /* complete the work calling new_fsk */
+       for (i = 0; i < c->flowsets; i++) {
+               if (c->fs[i].fs.par[1] == 0)
+                       c->fs[i].fs.par[1] = 1000;      /* default pkt len */
+               c->fs[i].sched = c->sched;
+               if (p && p->new_fsk)
+                       p->new_fsk(&c->fs[i]);
+       }
+
+       /* initialize the lists for the generator, and put
+        * all flows in the list for backlog = 0
+        */
+       for (i=0; i <= BACKLOG+5; i++)
+               INIT_LIST_HEAD(&c->ll[i]);
+
+       for (i = 0; i < c->flows; i++) {
+               struct dn_queue *q = FI2Q(c, i);
+               if (q->fs == NULL)
+                       q->fs = &c->fs[0]; /* XXX */
+               q->_si = c->si;
+               if (p && p->new_queue)
+                       p->new_queue(q);
+               INIT_LIST_HEAD(&q->ni.h);
+               list_add_tail(&q->ni.h, &c->ll[0]);
+       }
+       c->llmask = 1;
+       return 0;
+}
+
+
+int
+main(int ac, char *av[])
+{      /* benchmark driver: init(), a timed mainloop(), then the report */
+       struct cfg_s c;
+       struct timeval end;
+       double ll;      /* elapsed time, later nanoseconds per enqueue */
+       int i;
+       char msg[40];   /* default flowset description, "1::<flows>" */
+
+       bzero(&c, sizeof(c));
+       c.ac = ac;
+       c.av = av;
+       init(&c);
+       gettimeofday(&c.time, NULL);
+       mainloop(&c);
+       gettimeofday(&end, NULL);
+       end.tv_sec -= c.time.tv_sec;
+       end.tv_usec -= c.time.tv_usec;
+       if (end.tv_usec < 0) {  /* borrow one second */
+               end.tv_usec += 1000000;
+               end.tv_sec--;
+       }
+       c.time = end;   /* from here on c.time holds the elapsed time */
+       ll = (double)end.tv_sec * 1000000 + end.tv_usec; /* double: no long overflow */
+       ll *= 1000;     /* convert to nanoseconds */
+       ll /= c._enqueue;       /* NOTE(review): no guard against _enqueue == 0 */
+       snprintf(msg, sizeof(msg), "1::%d", c.flows);
+       D("%-8s n %d %d time %d.%06d %8.3f qlen %d %d flows %s drops %d",
+               c.name, c._enqueue, c.loops,
+               (int)c.time.tv_sec, (int)c.time.tv_usec, ll,
+               c.th_min, c.th_max,
+               c.fs_config ? c.fs_config : msg, c.drop);
+       dump(&c);
+       DX(1, "done ac %d av %p", ac, (void *)av);      /* %p requires a void * */
+       for (i=0; i < ac; i++)
+               DX(1, "arg %d %s", i, av[i]);
+       return 0;
+}
+
+/*
+ * The controller decides whether this iteration should send (a packet is
+ * left in c->tosend, otherwise NULL) and/or receive (flag c->can_dequeue).
+ */
+static void
+controller(struct cfg_s *c)
+{
+       struct mbuf *m;
+       struct dn_fs *fs;
+       int flow_id;
+
+       /* hysteresis between max and min: state 1 = dequeue only, 0 = send only */
+       if (c->state == 0 && c->pending >= c->th_max)
+               c->state = 1;
+       else if (c->state == 1 && c->pending <= c->th_min)
+               c->state = 0;
+       ND(1, "state %d pending %2d", c->state, c->pending);
+       c->can_dequeue = c->state;
+       c->tosend = NULL;
+       if (c->state)
+               return;
+
+    if (1) {
+       int i;
+       struct dn_queue *q;
+       struct list_head *h;
+
+       i = ffs(c->llmask) - 1; /* lowest bit set in llmask, -1 if none */
+       if (i < 0) {
+               DX(2, "no candidate");
+               c->can_dequeue = 1;
+               return;
+       }
+       h = &c->ll[i];
+       ND(1, "backlog %d p %p prev %p next %p", i, h, h->prev, h->next);
+       q = list_first_entry(h, struct dn_queue, ni.h);
+       list_del(&q->ni.h);
+       flow_id = Q2FI(c, q);
+       DX(2, "extracted flow %p %d backlog %d", q, flow_id, i);
+       if (list_empty(h)) {
+               ND(2, "backlog %d empty", i);
+               c->llmask &= ~(1<<i);
+       }
+       ND(1, "before %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next);
+       list_add_tail(&q->ni.h, h+1);   /* the queue moves from list i to list i+1 */
+       ND(1, " after %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next);
+       if (i < BACKLOG) {      /* bits above BACKLOG are never set here */
+               ND(2, "backlog %d full", i+1);
+               c->llmask |= 1<<(1+i);
+       }
+       fs = &q->fs->fs;
+       c->cur_fs = q->fs - c->fs;
+       fs->cur = flow_id;
+    } else {
+       /* XXX this does not work ? */
+       /* now decide whom to send the packet, and the length */
+       /* lookup in the flow table */
+       if (c->cur_y >= c->max_y) {     /* handle wraparound */
+               c->cur_y = 0;
+               c->cur_fs = 0;
+       }
+       fs = &c->fs[c->cur_fs].fs;
+       flow_id = fs->cur++;
+       if (fs->cur >= fs->next_flow)
+               fs->cur = fs->first_flow;
+       c->cur_y++;
+       if (c->cur_y >= fs->next_y)
+               c->cur_fs++;
+    }
+
+       /* construct a packet */
+       if (c->freelist) {      /* take the head of the free list */
+               m = c->tosend = c->freelist;
+               c->freelist = c->freelist->m_nextpkt;
+       } else {
+               m = c->tosend = calloc(1, sizeof(struct mbuf));
+       }
+       if (m == NULL)  /* calloc failed: c->tosend stays NULL */
+               return;
+
+       m->cfg = c;
+       m->m_nextpkt = NULL;
+       m->m_pkthdr.len = fs->par[1]; // XXX maxlen
+       m->flow_id = flow_id;
+
+       ND(2,"y %6d flow %5d fs %3d weight %4d len %4d",
+               c->cur_y, m->flow_id, c->cur_fs,
+               fs->par[0], m->m_pkthdr.len);
+
+}
+
+/*
+Packet allocation:
+to achieve a distribution that matches weights, for each X=w/lmax class
+we should generate a number of packets proportional to Y = X times the number
+of flows in the class.
+So we construct an array with the cumulative distribution of Y's,
+and use it to identify the flow via inverse mapping (if the Y's are
+not too many we can use an array for the lookup). In practice,
+each flow will have X entries [virtually] pointing to it.
+
+*/
diff --git a/test/memory_leak.sh b/test/memory_leak.sh
new file mode 100644 (file)
index 0000000..9bdf093
--- /dev/null
@@ -0,0 +1,26 @@
+#!/bin/sh
+# This script executes the commands CMD1 and CMD2 N times,
+# saving the memory usage to files under /tmp.
+# The value of the Dirty memory should not increase
+# between tests.
+
+BASE_NAME=ipfw_r5808_
+N=10000
+CMD1="/sbin/insmod ../dummynet2/ipfw_mod.ko"
+CMD2="/sbin/rmmod ipfw_mod"
+
+# main
+# remove any previously loaded module
+/sbin/rmmod ipfw_mod 
+
+# pre: load/unload loop; /proc/meminfo is saved at iterations 10, 100 and 1000
+
+for n in `seq $N`; do
+       $CMD1
+       $CMD2
+       [ $n = 10 ] && cat /proc/meminfo > /tmp/${BASE_NAME}_${n}
+       [ $n = 100 ] && cat /proc/meminfo > /tmp/${BASE_NAME}_${n}
+       [ $n = 1000 ] && cat /proc/meminfo > /tmp/${BASE_NAME}_${n}
+done;
+
+# post
diff --git a/test/mylist.h b/test/mylist.h
new file mode 100644 (file)
index 0000000..b546fc2
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ * $Id: mylist.h 5626 2010-03-04 21:55:22Z luigi $
+ *
+ * linux-like bidirectional lists
+ */
+
+#ifndef _MYLIST_H
+#define _MYLIST_H
+struct list_head {      /* node of a circular doubly-linked list */
+        struct list_head *prev, *next;
+};
+/* NOTE: the includer must provide offsetof(), NULL and the ND() macro. */
+#define INIT_LIST_HEAD(l) do {  (l)->prev = (l)->next = (l); } while (0)
+#define list_empty(l)   ( (l)->next == (l) )    /* head points to itself */
+static inline void      /* insert o between the adjacent nodes prev and next */
+__list_add(struct list_head *o, struct list_head *prev,
+        struct list_head *next)
+{
+        next->prev = o;
+        o->next = next;
+        o->prev = prev;
+        prev->next = o;
+}
+static inline void      /* insert o just before head, i.e. at the list tail */
+list_add_tail(struct list_head *o, struct list_head *head)
+{
+        __list_add(o, head->prev, head);
+}
+/* First element of a non-empty list, as a (ty *) whose list_head is member. */
+#define list_first_entry(pL, ty, member)        \
+        ((ty *)((char *)((pL)->next) - offsetof(ty, member)))
+
+static inline void      /* make prev and next point to each other */
+__list_del(struct list_head *prev, struct list_head *next)
+{
+        next->prev = prev;
+        prev->next = next;
+}
+
+static inline void      /* unlink entry and reset its pointers to NULL */
+list_del(struct list_head *entry)
+{
+       ND("called on %p", entry);
+        __list_del(entry->prev, entry->next);
+        entry->next = entry->prev = NULL;
+}
+
+#endif /* _MYLIST_H */
diff --git a/test/profile_bench1 b/test/profile_bench1
new file mode 100644 (file)
index 0000000..797650f
--- /dev/null
@@ -0,0 +1,26 @@
+profile_no 100
+delay prob
+207 0.000264
+255 0.034117
+270 0.072280
+279 0.106749
+288 0.148604
+298 0.184304
+302 0.202194
+353 0.384541
+423 0.588842
+510 0.782126
+516 0.800970
+545 0.845706
+553 0.861411
+573 0.889430
+586 0.912117
+620 0.920003
+661 0.938308
+695 0.944191
+740 0.949112
+765 0.952598
+848 0.957109
+1379 0.983768
+1555 0.983778
+1649 1
diff --git a/test/profile_bench2 b/test/profile_bench2
new file mode 100644 (file)
index 0000000..c733868
--- /dev/null
@@ -0,0 +1,7 @@
+samples 10
+delay prob
+0 0
+250 0
+250 0.5
+500 0.5
+500 1
diff --git a/test/profile_bench3 b/test/profile_bench3
new file mode 100644 (file)
index 0000000..5d1722e
--- /dev/null
@@ -0,0 +1,5 @@
+profile_no 100
+delay prob
+0 0
+50 0.5
+100 1
diff --git a/test/test_dn_heap.c b/test/test_dn_heap.c
new file mode 100644 (file)
index 0000000..7d3dc05
--- /dev/null
@@ -0,0 +1,162 @@
+/*-
+ * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Userland code for testing binary heaps and hash tables
+ *
+ * $Id: test_dn_heap.c 6131 2010-04-22 15:37:36Z svn_panicucci $
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+
+#include <stdio.h>
+#include <strings.h>
+#include <stdlib.h>
+#include "dn_test.h"
+#include  "dn_heap.h"
+#define log(x, arg...) fprintf(stderr, ## arg) /* the first argument is dropped */
+#define panic(x...)    fprintf(stderr, ## x), exit(1)  /* print, then exit(1) */
+
+#include <string.h>
+
+struct x {     /* test element: a link pointer followed by an inline string */
+       struct x *ht_link;      /* NOTE(review): presumably the dn_ht chain link */
+       char buf[];     /* NUL-terminated key; C99 flexible array member */
+};
+
+uint32_t hf(uintptr_t key, int flags, void *arg)
+{      /* hash = first character of the key string (or of the object's buf) */
+       char *s = (flags & DNHT_KEY_IS_OBJ) ? ((struct x *)key)->buf : (char *)key;
+       return *s;
+}
+
+int matchf(void *obj, uintptr_t key, int flags, void *arg)
+{      /* nonzero when the element's string equals the key's string */
+       const char *mine = ((struct x *)obj)->buf;
+       const char *wanted = (flags & DNHT_KEY_IS_OBJ) ? ((struct x *)key)->buf : (char *)key;
+       return !strcmp(mine, wanted);
+}
+
+void *newfn(uintptr_t key, int flags, void *arg)
+{      /* allocate an element holding a copy of the string at key; NULL on failure */
+       size_t need = strlen((char *)key) + 1;  /* string plus its NUL */
+       struct x *elem = malloc(sizeof(*elem) + need);
+       if (elem != NULL)
+               strcpy(elem->buf, (char *)key);
+       return elem;
+}
+
+char *strings[] = {    /* test keys, terminated by NULL */
+       "undici", "unico", "doppio", "devoto",
+       "uno", "due", "tre", "quattro", "cinque", "sei",
+       "uno", "due", "tre", "quattro", "cinque", "sei",        /* repeated keys */
+       NULL,
+};
+
+int doprint(void *_x, void *arg)
+{      /* callback passed to dn_ht_scan(): print one element, return arg as int */
+       struct x *x = _x;
+       printf("found element <%s>\n", x->buf);
+       return (int)(intptr_t)arg;      /* via intptr_t: no pointer-to-int size mismatch */
+}
+
+static void
+test_hash(void)
+{      /* exercise dn_ht: insert, scan and remove, by string key and by object */
+       char **p;
+       struct dn_ht *h;
+       uintptr_t x = 0;        /* first object inserted in the second table */
+       uintptr_t x1 = 0;       /* second key string, removed by key below */
+
+       /* first, find and allocate */
+       h = dn_ht_init(NULL, 10, 0, hf, matchf, newfn);
+
+       for (p = strings; *p; p++) {
+               dn_ht_find(h, (uintptr_t)*p, DNHT_INSERT, NULL);
+       }
+       dn_ht_scan(h, doprint, 0);
+       printf("/* second -- find without allocate */\n");
+       h = dn_ht_init(NULL, 10, 0, hf, matchf, NULL);  /* first table is not freed */
+       for (p = strings; *p; p++) {
+               void **y = newfn((uintptr_t)*p, 0, NULL);
+               if (x == 0)
+                       x = (uintptr_t)y;
+               else {
+                       if (x1 == 0)
+                               x1 = (uintptr_t)*p;
+               }
+               dn_ht_find(h, (uintptr_t)y, DNHT_INSERT | DNHT_KEY_IS_OBJ, NULL);
+       }
+       dn_ht_scan(h, doprint, 0);
+       printf("remove %p gives %p\n", (void *)x,
+               dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL));
+       printf("remove %p gives %p\n", (void *)x,
+               dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL));
+       printf("remove %p gives %p\n", (void *)x1,      /* report the key really removed */
+               dn_ht_find(h, x1, DNHT_REMOVE, NULL));
+       printf("remove %p gives %p\n", (void *)x1,
+               dn_ht_find(h, x1, DNHT_REMOVE, NULL));
+       dn_ht_scan(h, doprint, 0);
+}
+
+int
+main(int argc, char *argv[])
+{
+       struct dn_heap h;
+       int i, n, n2, n3;
+
+       test_hash();
+       return 0;       /* XXX the heap test below is currently unreachable */
+
+       /* n = elements, n2 = cycles, n3 != 0 = descending keys (else random) */
+       n = (argc > 1) ? atoi(argv[1]) : 0;
+       if (n <= 0 || n > 1000000)
+               n = 100;
+       n2 = (argc > 2) ? atoi(argv[2]) : 0;
+       if (n2 <= 0)
+               n2 = 1000000;   /* default cycle count; must not clobber n */
+       n3 = (argc > 3) ? atoi(argv[3]) : 0;
+       bzero(&h, sizeof(h));
+       heap_init(&h, n, -1);
+       while (n2-- > 0) {
+               uint64_t prevk = 0;
+               for (i=0; i < n; i++)
+                       heap_insert(&h, n3 ? n-i: random(), (void *)(uintptr_t)(100+i));
+               
+               for (i=0; h.elements > 0; i++) {
+                       uint64_t k = h.p[0].key;
+                       if (k < prevk)  /* keys must be extracted in non-decreasing order */
+                               panic("wrong sequence\n");
+                       prevk = k;
+                       if (0)
+                       printf("%d key %llu, val %p\n",
+                               i, (unsigned long long)h.p[0].key, h.p[0].object);
+                       heap_extract(&h, NULL);
+               }
+       }
+       return 0;
+}
diff --git a/test/test_dn_sched.c b/test/test_dn_sched.c
new file mode 100644 (file)
index 0000000..65bbf18
--- /dev/null
@@ -0,0 +1,89 @@
+/*
+ * $Id: test_dn_sched.c 5626 2010-03-04 21:55:22Z luigi $
+ *
+ * library functions for userland testing of dummynet schedulers
+ */
+
+#include "dn_test.h"
+
+void
+m_freem(struct mbuf *m)
+{      /* test stub: only logs the pointer, nothing is released here */
+       printf("free %p\n", (void *)m); /* %p requires a void * argument */
+}
+
+int
+dn_sched_modevent(module_t mod, int cmd, void *arg)
+{      /* stub: every argument is ignored and 0 is returned */
+       return 0;
+}
+
+void
+dn_free_pkts(struct mbuf *m)
+{      /* hand every packet of the m_nextpkt chain starting at m to m_freem() */
+       struct mbuf *next;
+       for (; m != NULL; m = next) {
+               next = m->m_nextpkt;    /* fetch the link before m is released */
+               m_freem(m);
+       }
+}
+               
+int
+dn_delete_queue(void *_q, void *do_free)
+{      /* release the packets still queued on _q, then the queue itself */
+       struct dn_queue *victim = _q;
+
+       dn_free_pkts(victim->mq.head);  /* a NULL head is an empty chain: no-op */
+       free(victim);
+       return 0;       /* do_free is not used by this test version */
+}
+
+/*
+ * Simplified enqueue for testing: no random loss, and the only queue
+ * management is a fixed limit of 200 packets per queue.
+ * If drop is nonzero or the limit is reached, q's drop counter is
+ * incremented and m is left untouched; otherwise m is appended to q
+ * and q's length and byte counters are updated.
+ * Return 0 on success, 1 on drop.
+ */
+int
+dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
+{
+        /* forced drop, or the queue already holds 200 packets */
+        if (drop || q->ni.length >= 200) {
+                q->ni.drops++;
+                return 1;
+        }
+
+        /* accept the packet and account for it */
+        mq_append(&q->mq, m);
+        q->ni.length += 1;
+        q->ni.tot_bytes += m->m_pkthdr.len;
+
+        return 0;
+}
+
+int
+ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg)
+{       /* bound *v in place and return the result; msg is unused here */
+        if (*v < lo)
+                return (*v = dflt);     /* too small: use the default */
+        if (*v > hi)
+                return (*v = hi);       /* too large: clamp to hi */
+
+        return *v;      /* already within [lo, hi] */
+}
+
+#ifndef __FreeBSD__
+int
+fls(int mask)
+{       /* 1-based position of the most significant set bit, 0 if mask == 0 */
+        unsigned int rest = (unsigned int)mask;
+        int pos;
+
+        for (pos = 0; rest != 0; rest >>= 1)
+                pos++;
+        return (pos);
+}
+#endif