From 7708b180bd347c489bd769f15b0d59ec0d0bb8a5 Mon Sep 17 00:00:00 2001 From: martacarbone Date: Thu, 21 Jun 2012 17:48:49 +0200 Subject: [PATCH] Initial import from onelab2 repository, Rev. 11325. --- Makefile | 151 + Makefile.openwrt | 95 + NOTES | 220 + README | 282 ++ configuration/README | 14 + configuration/change_rules.sh | 159 + configuration/change_rules_linux.sh | 12 + configuration/ipfw.conf | 29 + configuration/ipfw.rules | 12 + configuration/rc.firewall | 400 ++ dummynet2/Makefile | 352 ++ dummynet2/bsd_compat.c | 566 +++ dummynet2/debug.c | 67 + dummynet2/dn_heap.c | 588 +++ dummynet2/dn_sched_fifo.c | 120 + dummynet2/dn_sched_prio.c | 229 + dummynet2/dn_sched_qfq.c | 864 ++++ dummynet2/dn_sched_rr.c | 310 ++ dummynet2/dn_sched_wf2q.c | 378 ++ dummynet2/in_cksum.c | 150 + dummynet2/include/net/if.h | 1 + dummynet2/include/net/pfil.h | 121 + dummynet2/include/net/radix.h | 180 + dummynet2/include/netgraph/ng_ipfw.h | 33 + dummynet2/include/netinet/ip.h | 49 + dummynet2/include/netinet/ip6.h | 58 + dummynet2/include/netinet/ip_dummynet.h | 261 ++ dummynet2/include/netinet/ip_fw.h | 585 +++ dummynet2/include/netinet/ip_icmp.h | 17 + dummynet2/include/netinet/ipfw/dn_heap.h | 191 + dummynet2/include/netinet/ipfw/dn_sched.h | 206 + .../include/netinet/ipfw/ip_dn_private.h | 419 ++ .../include/netinet/ipfw/ip_fw_private.h | 301 ++ dummynet2/include/netinet/tcp.h | 228 + dummynet2/include/netinet/tcp_var.h | 4 + dummynet2/include/netinet/udp.h | 67 + dummynet2/include/sys/cdefs.h | 33 + dummynet2/include/sys/kernel.h | 26 + dummynet2/include/sys/malloc.h | 59 + dummynet2/include/sys/mbuf.h | 265 ++ dummynet2/include/sys/module.h | 41 + dummynet2/include/sys/param.h | 11 + dummynet2/include/sys/queue.h | 623 +++ dummynet2/include/sys/syslog.h | 7 + dummynet2/include/sys/systm.h | 126 + dummynet2/include/sys/taskqueue.h | 34 + dummynet2/ip_dn_glue.c | 845 ++++ dummynet2/ip_dn_io.c | 963 ++++ dummynet2/ip_dummynet.c | 2400 ++++++++++ dummynet2/ip_fw2.c | 2493 
+++++++++++ dummynet2/ip_fw_dynamic.c | 1241 ++++++ dummynet2/ip_fw_log.c | 449 ++ dummynet2/ip_fw_lookup.c | 304 ++ dummynet2/ip_fw_nat.c | 604 +++ dummynet2/ip_fw_pfil.c | 415 ++ dummynet2/ip_fw_sockopt.c | 1343 ++++++ dummynet2/ip_fw_table.c | 286 ++ dummynet2/ipfw2_mod.c | 921 ++++ dummynet2/md_win.c | 630 +++ dummynet2/miniport.c | 1481 +++++++ dummynet2/missing.h | 639 +++ dummynet2/netipfw.inf | 79 + dummynet2/netipfw_m.inf | 54 + dummynet2/passthru.c | 469 ++ dummynet2/passthru.h | 500 +++ dummynet2/precomp.h | 11 + dummynet2/protocol.c | 1670 +++++++ dummynet2/radix.c | 1204 +++++ dummynet2/winmissing.h | 225 + glue.h | 580 +++ ipfw/Makefile | 120 + ipfw/add_rules | 27 + ipfw/dummynet.c | 1456 ++++++ ipfw/expand_number.c | 100 + ipfw/glue.c | 841 ++++ ipfw/humanize_number.c | 153 + ipfw/include/alias.h | 71 + ipfw/include/net/if_dl.h | 82 + ipfw/include/net/pfvar.h | 32 + ipfw/include/timeconv.h | 29 + ipfw/ipfw.8 | 3218 ++++++++++++++ ipfw/ipfw2.c | 3914 +++++++++++++++++ ipfw/ipfw2.h | 288 ++ ipfw/ipv6.c | 501 +++ ipfw/main.c | 615 +++ ipfw/qsort.c | 195 + ipfw/qsort_r.c | 8 + ipfw/rule_test.sh | 83 + ipfw/ws2_32.def | 120 + planetlab/check_planetlab_sync | 22 + planetlab/ipfw | 84 + planetlab/ipfw.8.gz | Bin 0 -> 26396 bytes planetlab/ipfw.cron | 3 + planetlab/ipfwroot.spec | 135 + planetlab/ipfwslice.spec | 94 + planetlab/netconfig | 14 + planetlab/planetlab-tags.mk | 6 + planetlab/planetlab.mk | 26 + planetlab/sample_hook | 34 + test/Makefile | 53 + test/basic_ipfw.sh | 72 + test/dn_test.h | 157 + test/dynrules.sh | 20 + test/interpolation.c | 335 ++ test/main.c | 636 +++ test/memory_leak.sh | 26 + test/mylist.h | 49 + test/profile_bench1 | 26 + test/profile_bench2 | 7 + test/profile_bench3 | 5 + test/test_dn_heap.c | 162 + test/test_dn_sched.c | 89 + 112 files changed, 42658 insertions(+) create mode 100644 Makefile create mode 100644 Makefile.openwrt create mode 100644 NOTES create mode 100644 README create mode 100644 configuration/README create 
mode 100755 configuration/change_rules.sh create mode 100755 configuration/change_rules_linux.sh create mode 100644 configuration/ipfw.conf create mode 100755 configuration/ipfw.rules create mode 100755 configuration/rc.firewall create mode 100644 dummynet2/Makefile create mode 100644 dummynet2/bsd_compat.c create mode 100644 dummynet2/debug.c create mode 100644 dummynet2/dn_heap.c create mode 100644 dummynet2/dn_sched_fifo.c create mode 100755 dummynet2/dn_sched_prio.c create mode 100644 dummynet2/dn_sched_qfq.c create mode 100644 dummynet2/dn_sched_rr.c create mode 100644 dummynet2/dn_sched_wf2q.c create mode 100644 dummynet2/in_cksum.c create mode 100644 dummynet2/include/net/if.h create mode 100644 dummynet2/include/net/pfil.h create mode 100644 dummynet2/include/net/radix.h create mode 100644 dummynet2/include/netgraph/ng_ipfw.h create mode 100644 dummynet2/include/netinet/ip.h create mode 100644 dummynet2/include/netinet/ip6.h create mode 100644 dummynet2/include/netinet/ip_dummynet.h create mode 100644 dummynet2/include/netinet/ip_fw.h create mode 100644 dummynet2/include/netinet/ip_icmp.h create mode 100644 dummynet2/include/netinet/ipfw/dn_heap.h create mode 100644 dummynet2/include/netinet/ipfw/dn_sched.h create mode 100644 dummynet2/include/netinet/ipfw/ip_dn_private.h create mode 100644 dummynet2/include/netinet/ipfw/ip_fw_private.h create mode 100644 dummynet2/include/netinet/tcp.h create mode 100644 dummynet2/include/netinet/tcp_var.h create mode 100644 dummynet2/include/netinet/udp.h create mode 100644 dummynet2/include/sys/cdefs.h create mode 100644 dummynet2/include/sys/kernel.h create mode 100644 dummynet2/include/sys/malloc.h create mode 100644 dummynet2/include/sys/mbuf.h create mode 100644 dummynet2/include/sys/module.h create mode 100644 dummynet2/include/sys/param.h create mode 100644 dummynet2/include/sys/queue.h create mode 100644 dummynet2/include/sys/syslog.h create mode 100644 dummynet2/include/sys/systm.h create mode 100644 
dummynet2/include/sys/taskqueue.h create mode 100644 dummynet2/ip_dn_glue.c create mode 100644 dummynet2/ip_dn_io.c create mode 100644 dummynet2/ip_dummynet.c create mode 100644 dummynet2/ip_fw2.c create mode 100644 dummynet2/ip_fw_dynamic.c create mode 100644 dummynet2/ip_fw_log.c create mode 100644 dummynet2/ip_fw_lookup.c create mode 100644 dummynet2/ip_fw_nat.c create mode 100644 dummynet2/ip_fw_pfil.c create mode 100644 dummynet2/ip_fw_sockopt.c create mode 100644 dummynet2/ip_fw_table.c create mode 100644 dummynet2/ipfw2_mod.c create mode 100644 dummynet2/md_win.c create mode 100644 dummynet2/miniport.c create mode 100644 dummynet2/missing.h create mode 100644 dummynet2/netipfw.inf create mode 100644 dummynet2/netipfw_m.inf create mode 100644 dummynet2/passthru.c create mode 100644 dummynet2/passthru.h create mode 100644 dummynet2/precomp.h create mode 100644 dummynet2/protocol.c create mode 100644 dummynet2/radix.c create mode 100644 dummynet2/winmissing.h create mode 100644 glue.h create mode 100644 ipfw/Makefile create mode 100755 ipfw/add_rules create mode 100644 ipfw/dummynet.c create mode 100644 ipfw/expand_number.c create mode 100644 ipfw/glue.c create mode 100644 ipfw/humanize_number.c create mode 100644 ipfw/include/alias.h create mode 100644 ipfw/include/net/if_dl.h create mode 100644 ipfw/include/net/pfvar.h create mode 100644 ipfw/include/timeconv.h create mode 100644 ipfw/ipfw.8 create mode 100644 ipfw/ipfw2.c create mode 100644 ipfw/ipfw2.h create mode 100644 ipfw/ipv6.c create mode 100644 ipfw/main.c create mode 100644 ipfw/qsort.c create mode 100644 ipfw/qsort_r.c create mode 100755 ipfw/rule_test.sh create mode 100644 ipfw/ws2_32.def create mode 100755 planetlab/check_planetlab_sync create mode 100755 planetlab/ipfw create mode 100644 planetlab/ipfw.8.gz create mode 100644 planetlab/ipfw.cron create mode 100644 planetlab/ipfwroot.spec create mode 100644 planetlab/ipfwslice.spec create mode 100755 planetlab/netconfig create mode 100644 
planetlab/planetlab-tags.mk create mode 100644 planetlab/planetlab.mk create mode 100755 planetlab/sample_hook create mode 100644 test/Makefile create mode 100755 test/basic_ipfw.sh create mode 100644 test/dn_test.h create mode 100644 test/dynrules.sh create mode 100644 test/interpolation.c create mode 100644 test/main.c create mode 100644 test/memory_leak.sh create mode 100644 test/mylist.h create mode 100644 test/profile_bench1 create mode 100644 test/profile_bench2 create mode 100644 test/profile_bench3 create mode 100644 test/test_dn_heap.c create mode 100644 test/test_dn_sched.c diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..b2ed479 --- /dev/null +++ b/Makefile @@ -0,0 +1,151 @@ +# $Id: Makefile 8654 2011-05-23 08:39:50Z marta $ +# +# Top level makefile for building ipfw kernel and userspace. +# You can run it manually or also under the Planetlab build. +# Planetlab wants also the 'install' target. +# +# To build on system with non standard Kernel sources or userland files, +# you should run this with +# +# make KERNELPATH=/path/to/linux-2.x.y.z USRDIR=/path/to/usr +# +# We assume that $(USRDIR) contains include/ and lib/ used to build userland. 
+ +DATE ?= $(shell date +%Y%m%d) +SNAPSHOT_NAME=$(DATE)-ipfw3.tgz +BINDIST=$(DATE)-dummynet-linux.tgz +WINDIST=$(DATE)-dummynet-windows.zip + +########################################### +# windows x86 and x64 specific variables # +########################################### +# DRIVE must be the hard drive letter where DDK is installed +# DDKDIR must be the path to the DDK root directory, without drive letter +# TARGETOS (x64 only) must be one of the following: +# wnet -> windows server 2003 +# wlh -> windows vista and windows server 2008 +# win7 -> windows 7 +# future version must be added here +export DDK +export DRIVE +export DDKDIR +DRIVE = C: +DDKDIR = /WinDDK/7600.16385.1 +DDK = $(DRIVE)$(DDKDIR) + +TARGETOS=win7 + +_all: all + +clean distclean: + echo target is $(@) + (cd ipfw && $(MAKE) $(@) ) + (cd dummynet2 && $(MAKE) $(@) ) + # -- windows x64 only + - rm -rf dummynet2-64 + - rm -rf ipfw-64 + - rm -rf binary64 + +all: + echo target is $(@) + (cd ipfw && $(MAKE) $(@) ) + (cd dummynet2 && $(MAKE) $(@) ) + # -- windows only + - [ -f ipfw/ipfw.exe ] && cp ipfw/ipfw.exe binary/ipfw.exe + - [ -f dummynet2/objchk_wxp_x86/i386/ipfw.sys ] && \ + cp dummynet2/objchk_wxp_x86/i386/ipfw.sys binary/ipfw.sys + +snapshot: + $(MAKE) distclean + (cd ..; tar cvzhf /tmp/$(SNAPSHOT_NAME) --exclude .svn \ + --exclude README.openwrt --exclude tags --exclude NOTES \ + --exclude tcc-0.9.25-bsd \ + --exclude original_passthru \ + --exclude ipfw3.diff --exclude add_rules \ + --exclude test --exclude test_ \ + ipfw3 ) + +bindist: + $(MAKE) clean + $(MAKE) all + tar cvzf /tmp/$(BINDIST) ipfw/ipfw ipfw/ipfw.8 dummynet2/ipfw_mod.ko + +windist: + $(MAKE) clean + -$(MAKE) all + -rm /tmp/$(WINDIST) + zip -r /tmp/$(WINDIST) binary -x \*.svn\* + +win64: clean + (cd dummynet2 && $(MAKE) include_e) + cp -r ipfw ipfw-64 + echo "EXTRA_CFLAGS += -D_X64EMU" >> ipfw-64/Makefile + (cd ipfw-64 && $(MAKE) all) + cp -r dummynet2 dummynet2-64 + rm -f dummynet2-64/Makefile + cp win64/sources 
dummynet2-64/sources + mkdir dummynet2-64/tmpbuild + mkdir binary64 + win64/mysetenv.sh $(DRIVE) $(DDKDIR) $(TARGETOS) + cp binary/cygwin1.dll binary64/cygwin1.dll + cp ipfw-64/ipfw.exe binary64/ipfw.exe + cp win64/*.inf binary64 + cp binary/testme.bat binary64/testme.bat + cp binary/wget.exe binary64/wget.exe + +planetlab_update: + # clean and create a local working directory + rm -rf /tmp/pl-tmp + mkdir -p /tmp/pl-tmp/pl + mkdir -p /tmp/pl-tmp/ol2 + # get the trunk version of the PlanetLab repository + # to specify the sshkey use the .ssh/config file + (cd /tmp/pl-tmp/pl; \ + svn co svn+ssh://svn.planet-lab.org/svn/ipfw/trunk) + # get an updated copy of the main ipfw repository + (cd /tmp/pl-tmp/ol2; \ + svn export svn+ssh://onelab2.iet.unipi.it/home/svn/ports-luigi/dummynet-branches/ipfw3) + # copy the new version over the old one + (cd /tmp/pl-tmp; cp -rP ol2/ipfw3/* pl/trunk) + # files cleanup in the old version + (cd /tmp/pl-tmp; diff -r ol2/ipfw3 pl/trunk | \ + grep -v "svn" | awk '{print $$3 $$4}' | \ + sed 's/:/\//' | xargs rm -rf) + # local adjustmens here + rm -rf /tmp/pl-tmp/pl/trunk/planetlab/check_planetlab_sync + # commit to the remote repo + @echo "Please, revise the update with the commands:" + @echo "(cd /tmp/pl-tmp/pl/trunk; svn diff)" + @echo "(cd /tmp/pl-tmp/pl/trunk; svn status)" + @echo "and commit with:" + @echo "(cd /tmp/pl-tmp/pl/trunk; svn ci -m 'Update from the mail ipfw repo.')" + +openwrt_release: + # create a temporary directory + $(eval TMPDIR := $(shell mktemp -d -p /tmp/ ipfw3_openwrt_XXXXX)) + # create the source destination directory + $(eval IPFWDIR := ipfw3-$(DATE)) + $(eval DSTDIR := $(TMPDIR)/$(IPFWDIR)) + mkdir $(DSTDIR) + # copy the package, clean objects and svn info + cp -r ./ipfw ./dummynet2 glue.h Makefile ./configuration README $(DSTDIR) + (cd $(DSTDIR); make -s distclean; find . 
-name .svn | xargs rm -rf) + (cd $(TMPDIR); tar czf $(IPFWDIR).tar.gz $(IPFWDIR)) + + # create the port files in /tmp/ipfw3-port + $(eval PORTDIR := $(TMPDIR)/ipfw3) + mkdir -p $(PORTDIR)/patches + # generate the Makefile, PKG_VERSION and PKG_MD5SUM + md5sum $(DSTDIR).tar.gz | cut -d ' ' -f 1 > $(TMPDIR)/md5sum + cat ./OPENWRT/Makefile | \ + sed s/PKG_VERSION:=/PKG_VERSION:=$(DATE)/ | \ + sed s/PKG_MD5SUM:=/PKG_MD5SUM:=`cat $(TMPDIR)/md5sum`/ \ + > $(PORTDIR)/Makefile + + @echo "" + @echo "The openwrt port is in $(TMPDIR)/ipfw3-port" + @echo "The source file should be copied to the public server:" + @echo "scp $(DSTDIR).tar.gz marta@info.iet.unipi.it:~marta/public_html/dummynet" + @echo "after this the temporary directory $(TMPDIR) can be removed." + +install: diff --git a/Makefile.openwrt b/Makefile.openwrt new file mode 100644 index 0000000..3c7be80 --- /dev/null +++ b/Makefile.openwrt @@ -0,0 +1,95 @@ +# Makefile to build the package in openwrt. +# goes into package/ipfw3/Makefile +# +# Edit IPFW_DIR to point to the directory with the sources for ipfw + +IPFW_DIR := $(TOPDIR)/../ipfw3 + +include $(TOPDIR)/rules.mk +include $(INCLUDE_DIR)/kernel.mk + +PKG_NAME:=kmod-ipfw3 +PKG_RELEASE:=1 + +# MV is undefined +MV ?= mv + +include $(INCLUDE_DIR)/package.mk + +#Stuff depending on kernel version +ifeq ($(KERNEL),2.6) + +VERS:=2.6 +IPFW_MOD:=ipfw_mod.ko +IPFW_SRC_DIR:=M + +else + +VERS:=openwrt +CFLAGS_WRT:=-DSYSCTL_NODE -DEMULATE_SYSCTL +IPFW_MOD:=ipfw_mod.o +IPFW_SRC_DIR:=SUBDIRS + +endif + +# Description for the package. 
+# The names KernelPackage/ipfw3 must match the arguments to the +# call $(eval $(call KernelPackage,ipfw3)) used to build it + +define KernelPackage/ipfw3 + SUBMENU:=Other modules + TITLE:= IPFW and dummynet + # FILES is what makes up the module, both kernel and userland + # It must be in the KernelPackage section + FILES := $(PKG_BUILD_DIR)/dummynet2/$(IPFW_MOD) $(PKG_BUILD_DIR)/ipfw/ipfw + # AUTOLOAD:=$(call AutoLoad,80,ipfw_mod) +endef + +define KernelPackage/ipfw3/description + This package contains the ipfw and dummynet module +endef + +# Standard entries for the openwrt builds: Build/Prepare and Build/Compile +# Remember that commands must start with a tab + +# 'prepare' instructions for both kernel and userland +# We copy the entire subtree, then build include_e/ which +# contains empty headers used by the kernel sources. +define Build/Prepare + # $(warning Preparing ipfw sources) + mkdir -p $(PKG_BUILD_DIR) + $(CP) -Rp $(IPFW_DIR)/* $(PKG_BUILD_DIR)/ + (cd $(PKG_BUILD_DIR)/ipfw && $(MAKE) include_e ) + (cd $(PKG_BUILD_DIR)/dummynet2 && $(MAKE) include_e ) +endef + +define Build/Compile + # compile the kernel part for openwrt + $(MAKE) -C "$(LINUX_DIR)" \ + CROSS_COMPILE="$(TARGET_CROSS)" \ + ARCH="$(LINUX_KARCH)" \ + $(IPFW_SRC_DIR)="$(PKG_BUILD_DIR)/dummynet2" \ + VER=$(VERS) modules + # compile the userland part for openwrt + $(MAKE) -C $(PKG_BUILD_DIR)/ipfw \ + $(TARGET_CONFIGURE_OPTS) \ + CFLAGS="$(TARGET_CFLAGS) $(CFLAGS_WRT) -I./include_e -I./include -include ../glue.h -DNO_ALTQ -D__BSD_VISIBLE" \ + VER=$(VERS) all +endef + +define Package/ipfw3-userland + SECTION:=utils + CATEGORY:=Utilities + TITLE := /sbin/ipfw + DESCRIPTION := This is the control program for ipfw and dummynet +endef + +define Package/ipfw3-userland/install + $(INSTALL_DIR) $(1) /sbin +endef + +# XXX not entirely clear why the install entry for userland works, +# given that /sbin/ipfw is in KernelPackage/ipfw3 + +$(eval $(call Package,ipfw3-userland)) +$(eval $(call 
KernelPackage,ipfw3)) diff --git a/NOTES b/NOTES new file mode 100644 index 0000000..52bb5bf --- /dev/null +++ b/NOTES @@ -0,0 +1,220 @@ +# +# $Id: NOTES 6552 2010-06-15 11:24:59Z svn_panicucci $ +# + +--------------------------------------------------------------------- +--- DEVELOPER NOTES ------------------------------------------------ + +Both the client and the kernel code use almost unmodified sources +from FreeBSD (just a very small number of sections #ifdef'ed out +for features not relevant or not implemented). + +In both cases we provide two set of headers: + - one set is made of empty files, automatically generated, to replace + FreeBSD headers not available or conflicting on the ported platforms. + - one set is made of custom files, sometimes copied verbatim + from FreeBSD, sometimes containing only the minimal set of + macros/ struct/ prototypes required by the port. + +Additionally, we have a small set of .c files providing functions not +available in the port platforms, and hooks for the sockopt/packet +data. + + +TODO 20100205: ++ use an appropriate identifier instead of LINUX24 ++ find the discharging module hook, in order to force a queue flush ++ better matching on interface names (case insensitive etc ?) ++ match by interface address ++ verify path ++ send keepalives (20100301 marta: implemented) ++ pullup of data in external buffers ++ O_TAG ++ O_DIVERT ++ O_TEE ++ O_SETFIB ++ kmem_cache_alloc + +TODO (OpenWRT) 20090622 ++ add a module compilation for 2.6 + +TODO (FreeBSD, general) ++ New features related to the forthcoming IPv6 are missing, as the IPv6 +support for lookup tables that currently support IPv4 addresses only. +One of the goal of this project is to add the tables feature to the +IPv6 protocol. + ++ The current code implements rules listing requests as a single +request returning both static and dynamic rules as a whole block. 
This +operation requires a lock to be held for the time needed to get the +full list of rules, regardless of the requested rules. I propose to +break up the rule request in two parts, for static and dynamic rules, in +order to avoid to lock the whole struct for a subset of rules required. + ++ At last, due to improvement and contribution to the code, the tool +significantly grown over the time with new functionalities and features, +leaving the general view aside. An example of this will be the use of +dispatching table instead some very long switch case, making the resulting +code more readable and hopefully a faster execution. + ++ XXX can't find the ipfw_* indirection... + +DETAILED PORTING INFO + +--- ipfw (userland) on linux --- + +The port is relatively trivial. Communication with the kernel occurs +through a raw socket using [gs]etsockopt(), and all is needed is the +availability of ip_fw.h and ip_dummynet.h headers to describe the +relevant data structures. + +--- kernel ipfw on linux --- + +Sources are mostly unmodified, except for commenting out +unsupported features (tables, in-kernel nat...). +The port requires a rather large number of empty headers. +Other porting issues are in ipfw2_mod.c + +--- build as an Openwrt package + +------ WINDOWS PORT ------ + +We started from the wipfw port available at [WIPFW] , but +most of the port is done from scratch using the most recent +version of ipfw+dummynet from HEAD/RELENG_7 as of March 2009 + +# WIPFW: wipfw.sourceforge.net +#binary: +http://downloads.sourceforge.net/wipfw/wipfw-0.3.2b.zip?use_mirror=mesh +http://downloads.sourceforge.net/wipfw/wipfw-0.2.8-source.zip + +--- DEVELOPMENT TOOLS: + +At least initially, to build the code you need a pc with +windows installed and the [WINDDK] from the microsoft site. +Other tools like the new WDK should work as well. 
+ +The 'standard' way used by WDK/WINDDK is to run a 'build' +script which in turn calls nmake and then the microsoft +compiler [CL] and linker [LINK]. See the documentation for +command line switches for these tools, they are similar but +not the same as the equivalent gcc switches. In particular, +a / is often used to replace - though both forms are accepted. + +The steps to launch the build environment are as follows: + + + download winddk from microsoft.com + + install + + run the Free Build Environment from: + + Start -> All Programs -> WINDDK -> + [NT|XP|2000] -> Free Build Environment + + + change dir to .src and type `build' in command line + +For our purposes, however, it is much more convenient to use +cygwin [CYGWIN] and invoke CL and LINK using gmake + +A debugging tool is: + http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx +it simply displays the kernel-mode debug output. +Use the DbgPrint() function, that is something similar to printk(). +Can be launched with dbgview.exe. + +After a successful compilation and link, you can launch the program +in user space simply executing the binary file, while for the kernel +space you need to do the following steps: + +cp ipfw.sys /cygdrive/c/WINDOWS/system32/drivers/ +ipfw install_drv System32\DRIVERS\ip_fw.sys +net start ip_fw + + +======= +--- ARCHITECTURE --- + +The main part of the userland program mostly works as the +unix equivalent, the only issue is to provide empty +header files to replace those not available in Windows, +and include the winsock2 headers to access some network +related functions and headers. + +Communication with the kernel module does not use a raw IP socket +as in the unix version. Instead, we inherit the same method +used in ipfw -- a replacement for socket() creates a handle +to access the control structure, and setsockopt/getsockopt +replacements are also used to communicate with the kernel +side.
This is implemented in win32.c + +In order to load the module and activate it, we also use +the same technique suggested in wipfw -- the main() is +extended (with a wrapper) so that it can handle additional +commands to install/control/deinstall the service and +call the appropriate actions. See svcmain.c for details. + +--- PORTING ISSUES: + +Most of the unix hierarchy of headers is not available so we +have to replicate them. + +gcc attributes are also not present. + +C99 types are not present, remapped in +Also, we don't have C99 initializers which sometimes gives trouble. + +--- USEFUL LINKS: + +[WIPFW] + http://wipfw.sourceforge.net/ + +[WINDDK] + http://www.microsoft.com/whdc/devtools/ddk/default.mspx + +[CL] + http://msdn.microsoft.com/en-us/library/610ecb4h.aspx + command line syntax + +[CYGWIN] + http://www.cygwin.com/setup.exe +Windows Driver Kit +http://www.microsoft.com/whdc/DevTools/WDK/WDKpkg.mspx + +Debug Symbols for WinXP SP3 +http://www.microsoft.com/whdc/devtools/debugging/symbolpkg.mspx#d + +DbgView +http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx + +Cygwin +http://www.cygwin.com/ +(installazione pacchetti di default + categoria devel) + +Winrar (il WDK e' distribuito in un file .iso) +http://www.rarlab.com/download.htm + +puttycyg (terminale per cygwin) +http://code.google.com/p/puttycyg/ + +Tortoise SVN +http://tortoisesvn.net/downloads + +EditPlus +http://www.editplus.com/ + +--------------------------------------------------------------------- +--- OPEN ISSUES/TODO ------------------------------------------------ + +- Fix the build on OpenWRT for linux 2.6 + [Forum: https://forum.openwrt.org/viewtopic.php?id=24990] +- Compilation on 2.6 OpenWRT (target is MIPS Artheros 71xx) gives compilation + errors; [Send updates to: https://forum.openwrt.org/viewtopic.php?id=24990] +- Windows stack corruption [a tricky bug in dummynet] +- Windows ipv6 port [RE: Windows port of ipv6 in ipfw+dummynet] + +NOTE: +- To allow compilation on 
OpenWRT with kernel 2.6 only the Makefile.openwrt + is modified to guess the kernel version (2.4/2.6) +- ipfw3 Makefile is not modified. +- Also compiles on big-endian, but not tested yet... +- Little changes in source code. + diff --git a/README b/README new file mode 100644 index 0000000..9c33bab --- /dev/null +++ b/README @@ -0,0 +1,282 @@ +# +# $Id: README 8977 2011-07-04 11:47:59Z luigi $ +# + +This directory contains a port of ipfw and dummynet to Linux/OpenWrt +(including PlanetLab) and Windows. This version of ipfw and dummynet +is called "ipfw3" as it is the third major rewrite of the code. +The source code here comes straight from FreeBSD (roughly the +version in HEAD as of February 2010), plus some glue code +and headers written from scratch. +Unless specified otherwise, all the code here is under a BSD license. + +Specific build instructions are below, and in general produce + + a kernel module, ipfw_mod.ko (ipfw.sys on windows) + a userland program, /sbin/ipfw (ipfw.exe on windows) + +which you need to install on your system.
+ +CREDITS: + Luigi Rizzo (main design and development) + Marta Carbone (Linux and Planetlab ports) + Riccardo Panicucci (modular scheduler support) + Francesco Magno (Windows port) + Fabio Checconi (the QFQ scheduler) + Funding from Universita` di Pisa (NETOS project), + European Commission (ONELAB2 project) + +=========== INSTALL/REMOVE INSTRUCTIONS ======================== + +FreeBSD, OSX: + INSTALL: + kldload ipfw.ko ; kldload dummynet.ko + REMOVE: + kldunload dummynet.ko; kldunload ipfw.ko + +Linux + INSTALL: + # Do the following as root + insmod ./dummynet2/ipfw_mod.ko + cp ipfw/ipfw /usr/local/sbin + REMOVE: + rmmod ipfw_mod.ko + +OpenWRT + INSTALL: # use the correct name for your system + opkg install kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk #install + ls -l ls -l /lib/modules/2.4.35.4/ipfw* # check + insmod /lib/modules/2.4.35.4/ipfw_mod.o # load the module + /lib/modules/2.4.35.4/ipfw show # launch the userspace tool + REMOVE: + rmmod ipfw_mod.o # remove the module + +Windows: + INSTALL THE NDIS DRIVER + + - open the configuration panel for the network card in use + (right click on the icon on the SYSTRAY, or go to + Control Panel -> Network and select one card) + + - click on Properties->Install->Service->Add + - click on 'Driver Disk' and select 'netipfw.inf' in this folder + - select 'ipfw+dummynet' which is the only service you should see + - click accept on the warnings for the installation of an unknown + driver (roughly twice per existing network card) + + Now you are ready to use the emulator. To configure it, open a 'cmd' + window and you can use the ipfw command from the command line. + Otherwise click on the 'TESTME.bat' which is a batch program that + runs various tests. + + REMOVE: + - select a network card as above. 
+ - click on Properties + - select 'ipfw+dummynet' + - click on 'Remove' + + +=================== BUILD INSTRUCTIONS ========================== + +***** Windows (XP, Windows7) ****** + You can find a pre-built version in the binary/ subdirectory. + To build your own version of the package you need: + - MSVC DDK available from + http://msdn.microsoft.com/en-us/windows/hardware/gg487463.aspx + + - optionally, DbgView if you want to see diagnostic output + http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx + + - cygwin, http://www.cygwin.com/ + with base packages, make, c compiler, possibly an editor + and subversion (suggest: tortoiseSvn) + + Edit Makefile in the root directory, and set configuration + variables to match your current system (hard drive + and path where DDK is installed) + Open a shell from cygwin, move to this directory, and simply + run "make". The output of the build will be in this + directory, made of 4 files: + ipfw.exe (you also need cygwin1.dll) + ipfw.sys (an NDIS intermediate filter driver) + netipfw.inf and netipfw_m.inf (installer files) + + Cross compilation of the userland side under FreeBSD is possible with + gmake TCC=`pwd`/tcc-0.9.25-bsd/win32 CC=`pwd`/tcc-0.9.25-bsd/win32/bin/wintcc + (wintcc is a custom version of tcc which produces Windows code) + +***** Windows crosscompilation for 64 bit using DDK ****** + Edit root directory's Makefile and set target + operating system + From the root directory, run 'make win64', this will: + - create ipfw-64 and dummynet2-64 subdirs + - patch ipfw makefile to support communication + with 64bit module and build it + - replace dummynet makefile with proprietary + WinDDK one, named 'sources', and build the module + - create a binary64 directory containing + module and .inf install files, program + binary and relative cygwin dll + - install the driver from this directory in the + usual way.
+ +***** Linux 2.6.x ****** + + make KERNELPATH=/path/to/linux USRDIR=/path/to/usr + + where the two variables are optional and point to the linux kernel + sources and the /usr directory. Defaults are USRDIR=/usr and + KERNELPATH=/lib/modules/`uname -r`/build --- XXX check ? + + NOTE: make sure CONFIG_NETFILTER is enabled in the kernel + configuration file. You need the ncurses devel library, + that can be installed according to your distro with: + apt-get install ncurses-dev # for debian based distro + yum -y install ncurses-devel # for fedora based distro + You can enable CONFIG_NETFILTER by doing: + + "(cd ${KERNELPATH}; make menuconfig)" + + and enabling the option listed below: + + Networking ---> + Networking options ---> + [*] Network packet filtering framework (Netfilter) + + If you have not yet compiled your kernel source, you need to + prepare the build environment: + + (cd ${KERNELPATH}; make oldconfig; make prepare; make scripts) + +***** Linux 2.4.x ***** + + Almost as above, with an additional VER=2.4 + + make VER=2.4 KERNELPATH=... + + For 2.4, if KERNELPATH is not specified then we use + KERNELPATH ?= /usr/src/`uname -r`/build + + You need to follow the same instructions as for the 2.6 kernel, enabling + netfilter in the kernel options: + + Networking options ---> + [*] Network packet filtering (replaces ipchains) + +***** Openwrt package ***** + + (Tested with kamikaze_8.09.1 and Linux 2.4) + + + Download and extract the OpenWrt package, e.g. + + wget http://downloads.openwrt.org/kamikaze/8.09.1/kamikaze_8.09.1_source.tar.bz2 + tar xvjf kamikaze_8.09.1_source.tar.bz2 + + + move to the directory with the OpenWrt sources (the one that + contains Config.in, rules.mk ...) + + cd kamikaze_8.09.1 + + + Optional: Add support for 1ms resolution. + + By default OpenWRT kernel is compiled with HZ=100; this implies + that all timeouts are rounded to 10ms, too coarse for dummynet.
+ The file 020-mips-hz1000.patch contains a kernel patch to build + a kernel with HZ=1000 (i.e. 1ms resolution) as in Linux/FreeBSD. + To apply this patch, go in the kernel source directory and + patch the kernel + + cd build_dir/linux-brcm-2.4/linux-2.4.35.4 + cat $IPFW3_SOURCES/020-mips-hz1000.patch | patch -p0 + + where IPFW3_SOURCES contains the ipfw3 source code. + Now, the next kernel recompilation will use the right HZ value + + + Optional: to be sure that the tools are working, make a first + build as follows: + + - run "make menuconfig" and set the correct target device, + drivers, and so on; + - run "make" to do the build + + + Add ipfw3 to the openwrt package, as follows: + + - copy the code from this directory to the place used for the build: + + cp -Rp /path_to_ipfw3 ../ipfw3; + + If you want, you can fetch a newer version from the web + (cd ..; rm -rf ipfw3; \ + wget http://info.iet.unipi.it/~luigi/dummynet/ipfw3-latest.tgz;\ + tar xvzf ipfw3-latest.tgz) + + - run the following commands: + (mkdir package/ipfw3; \ + cp ../ipfw3/Makefile.openwrt package/ipfw3/Makefile) + + to create the package/ipfw3 directory in the OpenWrt source + directory, and copy Makefile.openwrt to package/ipfw3/Makefile ; + + - if necessary, edit package/ipfw3/Makefile and set IPFW_DIR to point to + the directory ipfw3, which contains the sources; + + - run "make menuconfig" and select kmod-ipfw3 as a module in + Kernel Modules -> Other modules -> kmod-ipfw3 + + - run "make" to build the package, "make V=99" for verbose build. 
+ + - to modify the code, assuming you are in directory "kamikaze_8.09.1" + + (cd ../ipfw3 && vi ...the files you are interested in ) + rm -rf build_dir/linux-brcm-2.4/kmod-ipfw3 + make package/ipfw3/compile V=99 + + The resulting package is located in bin/packages/mipsel/kmod-ipfw3*, + upload the file and install on the target system, as follows: + + opkg install kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk #install + ls -l /lib/modules/2.4.35.4/ipfw* # check + insmod /lib/modules/2.4.35.4/ipfw_mod.o # load the module + /lib/modules/2.4.35.4/ipfw show # launch the userspace tool + rmmod ipfw_mod.o # remove the module + +***** PLANETLAB BUILD (within a slice) ***** +These instructions can be used by PlanetLab developers to compile +the dummynet module on a node. To install the module on the node +users need root access in root context. PlanetLab users that want +to use the dummynet package should ask PlanetLab support for +nodes with dummynet emulation capabilities. + + Follow the instructions below. You can just cut&paste + + # install the various tools if not available + sudo yum -y install subversion rpm-build rpm-devel m4 redhat-rpm-config make gcc + # new build installation requires the gnupg package + sudo yum -y install gnupg + # the linux kernel and the ipfw source can be fetched by git + sudo yum -y install git + + # create and move to a work directory + mkdir -p test + # extract a planetlab distribution to directory XYZ + (cd test; git clone git://git.onelab.eu/build ./XYZ) + # download the specfiles and do some patching.
+ # Results are in SPEC/ (takes 5 minutes) + (cd test/XYZ; make stage1=true PLDISTRO=onelab) + # Building the slice code is fast, the root code takes longer + # as it needs to rebuild the whole kernel + (cd test/XYZ; sudo make ipfwslice PLDISTRO=onelab) + (cd test/XYZ; sudo make ipfwroot PLDISTRO=onelab) + + The kernel dependency phase is a bit time consuming, but does not + need to be redone if we are changing the ipfw sources only. + To clean up the code do + (cd test/XYZ; sudo make ipfwroot-clean ipfwslice-clean) + then after you have updated the repository again + (cd test/XYZ; sudo make ipfwslice ipfwroot) + +--- References +[1] https://svn.planet-lab.org/wiki/VserverCentos +[2] http://wiki.linux-vserver.org/Installation_on_CentOS +[3] http://mirror.centos.org/centos/5/isos/ +[4] More information is in /build/README* files diff --git a/configuration/README b/configuration/README new file mode 100644 index 0000000..778f7aa --- /dev/null +++ b/configuration/README @@ -0,0 +1,14 @@ +This directory contains some ipfw configurations and a script +to safely change the firewall rules. + +The firewall configuration comes from the FreeBSD initial script. +The change_rules_linux.sh script lets you change the ipfw rules and, +in case of a misconfiguration that prevents reaching the remote +host, restores the old ruleset. + +To configure the firewall behavior, edit the ipfw.conf file and +execute the ./change_rules_linux.sh script. + +The ipfw program executable should be located in /sbin (XXX) + +XXX seems we use something which is not compatible with dash diff --git a/configuration/change_rules.sh b/configuration/change_rules.sh new file mode 100755 index 0000000..8f23369 --- /dev/null +++ b/configuration/change_rules.sh @@ -0,0 +1,159 @@ +#!/bin/sh +# +# Copyright (c) 2000 Alexandre Peixoto +# All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# $FreeBSD: src/share/examples/ipfw/change_rules.sh,v 1.6 2003/09/07 07:52:56 jmg Exp $ + +# Change ipfw(8) rules with safety guarantees for remote operation +# +# Invoke this script to edit ${firewall_script}. It will call ${EDITOR}, +# or vi(1) if the environment variable is not set, for you to edit +# ${firewall_script}, ask for confirmation, and then run +# ${firewall_script}. You can then examine the output of ipfw list and +# confirm whether you want the new version or not. +# +# If no answer is received in 30 seconds, the previous +# ${firewall_script} is run, restoring the old rules (this assumes ipfw +# flush is present in it). 
+# +# If the new rules are confirmed, they'll replace ${firewall_script} and +# the previous ones will be copied to ${firewall_script}.{date}. Mail +# will also be sent to root with a unified diff of the rule change. +# +# Unapproved rules are kept in ${firewall_script}.new, and you are +# offered the option of changing them instead of the present rules when +# you call this script. +# +# This script could be improved by using version control +# software. + +# XXX on linux /etc/rc.conf defines: +# firewall_type and firewall_script + +if [ -r /etc/defaults/rc.conf ]; then + . /etc/defaults/rc.conf + source_rc_confs +elif [ -r /etc/rc.conf ]; then + . /etc/rc.conf +fi + +EDITOR=${EDITOR:-/usr/bin/vi} +PAGER=${PAGER:-/usr/bin/more} + +# on linux the default mktemp invocation behavior +# is different, we should change the temporary file creation +tempfoo=`basename $0` +#TMPFILE=`mktemp -t ${tempfoo}` || exit 1 +TMPFILE=`mktemp -t ${tempfoo}.XXXXX` || exit 1 + +get_yes_no() { + while true + do + echo -n "$1 (Y/N) ? " + read -t 30 a + if [ $? 
!= 0 ]; then + a="No"; + return; + fi + case $a in + [Yy]) a="Yes"; + return;; + [Nn]) a="No"; + return;; + *);; + esac + done +} + +restore_rules() { + nohup sh ${firewall_script} /dev/null 2>&1 + rm ${TMPFILE} + exit 1 +} + +case "${firewall_type}" in +[Cc][Ll][Ii][Ee][Nn][Tt]|\ +[Cc][Ll][Oo][Ss][Ee][Dd]|\ +[Oo][Pp][Ee][Nn]|\ +[Ss][Ii][Mm][Pp][Ll][Ee]|\ +[Uu][Nn][Kk][Nn][Oo][Ww][Nn]) + edit_file="${firewall_script}" + rules_edit=no + ;; +*) + if [ -r "${firewall_type}" ]; then + edit_file="${firewall_type}" + rules_edit=yes + fi + ;; +esac + +if [ -f ${edit_file}.new ]; then + get_yes_no "A new rules file already exists, do you want to use it" + [ $a = 'No' ] && cp ${edit_file} ${edit_file}.new +else + cp ${edit_file} ${edit_file}.new +fi + +trap restore_rules SIGHUP + +${EDITOR} ${edit_file}.new + +get_yes_no "Do you want to install the new rules" + +[ $a = 'No' ] && exit 1 + +cat < ${TMPFILE} 2>&1 +else + nohup sh ${firewall_script}.new \ + < /dev/null > ${TMPFILE} 2>&1 +fi +sleep 2; +get_yes_no "Would you like to see the resulting new rules" +[ $a = 'Yes' ] && ${PAGER} ${TMPFILE} +get_yes_no "Type y to keep the new rules" +[ $a != 'Yes' ] && restore_rules + +DATE=`date "+%Y%m%d%H%M"` +cp ${edit_file} ${edit_file}.$DATE +mv ${edit_file}.new ${edit_file} +cat </dev/null + fi + ${fwcmd} add deny $log ip from any to any + ;; + +[Cc][Ll][Oo][Ss][Ee][Dd]) + ${fwcmd} add 65000 deny ip from any to any + ;; +[Uu][Nn][Kk][Nn][Oo][Ww][Nn]) + ;; +*) + if [ -r "${firewall_type}" ]; then + ${fwcmd} ${firewall_flags} ${firewall_type} + fi + ;; +esac diff --git a/dummynet2/Makefile b/dummynet2/Makefile new file mode 100644 index 0000000..caee67b --- /dev/null +++ b/dummynet2/Makefile @@ -0,0 +1,352 @@ +# $Id: Makefile 11277 2012-06-10 17:44:15Z marta $ +# gnu Makefile to build linux/Windows module for ipfw+dummynet. +# +# The defaults are set to build without modifications on PlanetLab +# and possibly 2.6 versions. 
+# On Windows, we use gnu-make and MSC + +# Some variables need to have specific names, because they are used +# by the build infrastructure on Linux and OpenWrt. They are: +# +# ccflags-y additional $(CC) flags +# M used by Kbuild, we must set it to `pwd` +# obj-m list of .o modules to build +# $(MOD)-y for each $MOD in obj-m, the list of objects +# obj-y same as above, for openwrt +# O_TARGET the link target, for openwrt +# EXTRA_CFLAGS as the name says... in openwrt +# EXTRA_CFLAGS is used in 2.6.22 module kernel compilation too +# KERNELPATH the path to the kernel sources or headers +# +# Not sure about this (the name might be reserved) +# ipfw-cflags our flags for building the module +# +# Other variables are only private and can be renamed. They include: +# +# VER linux version we are building for (2.4 2.6 or openwrt) +#--- + +UNAME:=$(shell uname) +$(warning including dummynet/Makefile) + +# lets default for 2.6 for planetlab builds +VER ?= 2.6 + +#--- General values for all types of build --- +# obj-m is the target module +obj-m := ipfw_mod.o + +#-- the list of source files. IPFW_SRCS is our own name. 
+# Original ipfw and dummynet sources + FreeBSD stuff, +IPFW_SRCS := ip_fw2.c ip_fw_pfil.c ip_fw_sockopt.c +IPFW_SRCS += ip_fw_dynamic.c ip_fw_table.c ip_fw_log.c +IPFW_SRCS += radix.c in_cksum.c +IPFW_SRCS += ip_dummynet.c ip_dn_io.c ip_dn_glue.c +IPFW_SRCS += dn_heap.c +IPFW_SRCS += dn_sched_fifo.c dn_sched_wf2q.c +IPFW_SRCS += dn_sched_rr.c dn_sched_qfq.c +IPFW_SRCS += dn_sched_prio.c +# Module glue and functions missing in linux +IPFW_SRCS += ipfw2_mod.c bsd_compat.c + +# generic cflags used on all systems +#ipfw-cflags += -DIPFW_HASHTABLES +ipfw-cflags += -DIPFIREWALL_DEFAULT_TO_ACCEPT +# _BSD_SOURCE enables __FAVOR_BSD (udp/tcp bsd structs instead of posix) +ipfw-cflags += -D_BSD_SOURCE +ipfw-cflags += -DKERNEL_MODULE # build linux kernel module +# the two header trees for empty and override files +ipfw-cflags += -I $(M)/include_e +ipfw-cflags += -I $(M)/include +ipfw-cflags += -include $(M)/../glue.h # headers +ipfw-cflags += -include $(M)/missing.h # headers + +$(warning "---- Building dummynet kernel module for Version $(VER)") + +ifneq (,$(findstring CYGWIN,$(shell uname))) + ISWIN=1 +endif +ifneq ($(TCC),) + ISWIN=1 +endif +ifeq ($(ISWIN),1) + M ?= $(shell pwd) + WIN_SRCS += md_win.c + WIN_SRCS += miniport.c protocol.c passthru.c debug.c + #compiler, linker, target, sources and objects + #DDK is exported from the root makefile + #DDK = C:/WinDDK/7600.16385.1 + OBJDIR=objchk_wxp_x86/i386/ + + TARGET = ipfw + + CSOURCES = $(IPFW_SRCS) $(WIN_SRCS) + + COBJS := $(CSOURCES:.c=.obj) + COBJS := $(addprefix $(OBJDIR),$(COBJS)) + + #include paths + INCLUDE_PATHS = -Ii386 -Iinclude -Iinclude_e -I. 
+ # INCLUDE_PATHS += -I$(OBJDIR) + INCLUDE_PATHS += -I$(DDK)/inc/api + INCLUDE_PATHS += -I$(DDK)/inc/ddk + INCLUDE_PATHS += -I$(DDK)/inc/crt + + # #preprocessor MS defines + PREPROC = -D_X86_=1 -Di386=1 -DSTD_CALL -DCONDITION_HANDLING=1 + PREPROC += -DNT_UP=0 -DNT_INST=0 -DWIN32=100 -D_NT1X_=100 -DWINNT=1 + PREPROC += -D_WIN32_WINNT=0x0501 -DWINVER=0x0501 -D_WIN32_IE=0x0603 + PREPROC += -DWIN32_LEAN_AND_MEAN=1 + PREPROC += -D__BUILDMACHINE__=WinDDK -DFPO=0 -D_DLL=1 + PREPROC += -DNDIS_MINIPORT_DRIVER -DNDIS_WDM=1 + PREPROC += -DNDIS51_MINIPORT=1 -DNDIS51=1 + PREPROC += -DMSC_NOOPT -DNTDDI_VERSION=0x05010200 + PREPROC += -DKMDF_MAJOR_VERSION_STRING=01 -DKMDF_MINOR_VERSION_STRING=009 + #PREPROC += -DDBG=1 #debug + PREPROC += -DNDEBUG #always up, seems no effect, possibly no debug? + PREPROC += -DDEVL=1 #always up, seems no effect + #macroing module name, WARNING: must match the one in .inf files + PREPROC += -DMODULENAME=Ipfw + + #our defines + OUR_PREPROC = -D_KERNEL -DKERNEL_MODULE -DKLD_MODULE + OUR_PREPROC += -D__BSD_VISIBLE -DIPFIREWALL_DEFAULT_TO_ACCEPT + OUR_PREPROC += -D__LITTLE_ENDIAN -DSYSCTL_NODE -DEMULATE_SYSCTL + +ifeq ($(TCC),) + CC = $(DDK)/bin/x86/x86/cl.exe + LD = $(DDK)/bin/x86/x86/link.exe + # #complier options + CFLAGS = -Fo$(OBJDIR) -c -FC -Zc:wchar_t- + CFLAGS += -Zl -Zp8 -Gy -Gm- -GF -cbstring -Gz -hotpatch -EHs-c- + CFLAGS += -W2 # -W3 gives too many conversion errors + CFLAGS += -GR- -GF -GS -Zi # XXX do we need this ? 
+ CFLAGS += -Fd$(OBJDIR) + CFLAGS += -wd4603 -wd4627 -typedil- + CFLAGS += -FI $(DDK)/inc/api/warning.h + CFLAGS += -FI winmissing.h + CFLAGS += -FI missing.h # headers + CFLAGS += -FI ../glue.h # headers + + #optimization options + OPTIMIZE = -Od -Oi -Oy- + + #linker options + LDFLAGS = /MERGE:_PAGE=PAGE /MERGE:_TEXT=.text + LDFLAGS += /SECTION:INIT,d /OPT:REF /OPT:ICF + LDFLAGS += /IGNORE:4198,4010,4037,4039,4065,4070,4078,4087,4089,4221 + LDFLAGS += /INCREMENTAL:NO /release /NODEFAULTLIB /WX + LDFLAGS += /debug /debugtype:cv,fixup,pdata + LDFLAGS += /version:6.1 /osversion:6.1 /functionpadmin:5 + LDFLAGS += /safeseh /pdbcompress + LDFLAGS += /STACK:0x40000,0x1000 /driver /base:0x10000 /align:0x80 + LDFLAGS += /stub:$(DDK)\\lib\\wxp\\stub512.com + LDFLAGS += /subsystem:native,5.01 /entry:GsDriverEntry@8 + LDFLAGS += /out:$(OBJDIR)/ipfw.sys + + #libraries to build against + LIBS = $(DDK)/lib/wxp/i386/BufferOverflowK.lib + LIBS += $(DDK)/lib/wxp/i386/ntoskrnl.lib + LIBS += $(DDK)/lib/wxp/i386/hal.lib + LIBS += $(DDK)/lib/wxp/i386/wmilib.lib + LIBS += $(DDK)/lib/wxp/i386/ndis.lib + LIBS += $(DDK)/lib/wxp/i386/sehupd.lib +else + # TCC points to the root of tcc tree + CC=$(TCC)/bin/wintcc + EXTRA_CFLAGS += -DTCC -I.. 
+ EXTRA_CFLAGS += -I$(TCC)/include/winapi -I$(TCC)/include + EXTRA_CFLAGS += -nostdinc + + CFLAGS += -include winmissing.h -include missing.h -include ../glue.h + CFLAGS += -I../../inc/api -I../../inc/ddk -I../../inc/crt + CFLAGS += -DRC_INVOKED +endif + + #empty include directory to be built + M ?= $(shell pwd) + EDIRS += asm linux + EFILES += asm/div64.h + EFILES += linux/if.h linux/random.h linux/errno.h + EFILES += net/if_types.h net/inet_hashtables.h net/route.h + + #targets +all: $(TARGET) + +$(TARGET): include_e + rm -rf objchk_wxp_x86 + mkdir -p objchk_wxp_x86/i386 + $(CC) $(INCLUDE_PATHS) $(PREPROC) $(OUR_PREPROC) $(CFLAGS) $(OPTIMIZE) $(CSOURCES) + $(LD) $(LDFLAGS) $(COBJS) $(LIBS) + +else # !windows + +# We have three sections for OpenWrt, Linux 2.4 and Linux 2.6 + +ifeq ($(VER),openwrt) + #--- The Makefile section for openwrt --- + # We do not include a dependency on include_e as it is called + # by Makefile.openwrt in Build/Prepare + M=. + obj-y := $(IPFW_SRCS:%.c=%.o) + O_TARGET := $(obj-m) + + # xcflags-y is a temporary variable where we store build options + xcflags-y += -O1 -DLINUX_24 + xcflags-y += -g + + EXTRA_CFLAGS := $(xcflags-y) $(ipfw-cflags) -DSYSCTL_NODE -DEMULATE_SYSCTL + + # we should not export anything + #export-objs := ipfw2_mod.o +-include $(TOPDIR)/Rules.make + +else # !openwrt, below we do linux builds for 2.4 and 2.6 + + # KERNELPATH is where the kernel headers reside. On PlanetLab + # it is set already by the build system. + # We can override it from the command line, or let the system guess. + +ifneq ($(shell echo $(VER)|grep '2.4'),) + # Makefile section for the linux 2.4 version + # tested on linux-2.4.35.4, does not work with 2.4.37 + # + # guess the kernel path -- or is it under /lib/modules ? + KERNELPATH ?= /usr/src/`uname -r` + + # We need to figure out the gcc include directory, if not + # set by the user through MYGCC_INCLUDE + # Find compiler version (3rd field in last line returned by gcc -v) + # e.g. 
gcc version 4.3.2 (Debian 4.3.2-1.1) + MYGCC_VER ?= $(shell $(CC) -v 2>&1 |tail -n 1 | cut -d " " -f 3) + # We don't know the exact directory under /usr/lib/gcc so we guess + MYGCC_INCLUDE ?= $(shell echo /usr/lib/gcc/*/$(MYGCC_VER) | cut -d " " -f 1)/include + $(warning "---- gcc includes guessed to $(MYGCC_INCLUDE)") + + # additional warning + WARN += -Wall -Wundef + WARN += -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing + WARN += -fno-common -Werror-implicit-function-declaration + # WARN += -O2 -fno-stack-protector -m32 -msoft-float -mregparm=3 + # -mregparm=3 gives a printk error + WARN += -m32 -msoft-float # -mregparm=3 + #WARN += -freg-struct-return -mpreferred-stack-boundary=2 + WARN += -Wno-sign-compare + WARN += -Wdeclaration-after-statement + ifneq ($(MYGCC_VER),3.4.6) + WARN += -Wno-pointer-sign + endif + + ccflags-y += -O1 -DLINUX_24 + CFLAGS = -DMODULE -D__KERNEL__ -nostdinc \ + -isystem ${KERNELPATH}/include -isystem $(MYGCC_INCLUDE) \ + ${ccflags-y} + # The Main target +all: mod24 + +else # !2.4 -- + + # This is the Makefile section for Linux 2.6.x including planetlab + +ifeq ($(IPFW_PLANETLAB),1) + $(warning "---- Building for PlanetLab") + ipfw-cflags += -DIPFW_PLANETLAB # PlanetLab compilation +endif + # if not set, use the version from the installed system + KERNELPATH ?= /lib/modules/`uname -r`/build + # Otherwise, if you have kernel sources, try something like this: + #KERNELPATH = /usr/src/linux-2.6.22 + $(warning "---- Building Version 2.6 $(VER) in $(KERNELPATH)") + WARN := -O1 -Wall -Werror -DDEBUG_SPINLOCK -DDEBUG_MUTEXES + # The main target + + # Required by GCC 4.6 + ccflags-y += -Wno-unused-but-set-variable + + # Required by kernel <= 2.6.22, ccflags-y is used on newer version + LINUX_VERSION_CODE := $(shell grep LINUX_VERSION_CODE $(KERNELPATH)/include/linux/version.h|cut -d " " -f3) + ifeq ($(shell if [ -z $(LINUX_VERSION_CODE) ] ; then echo "true"; fi),true) + $(warning "---- Perhaps you miss a (cd $(KERNELPATH); make 
oldconfig; make prepare; make scripts)"); + endif + ifeq ($(shell if [ "$(LINUX_VERSION_CODE)" -le 132630 ] ; then echo "true"; fi),true) + EXTRA_CFLAGS += $(ccflags-y) + endif + +all: include_e + $(MAKE) -C $(KERNELPATH) V=1 M=`pwd` modules +endif # !2.4 + +#-- back to the common section of code for Linux 2.4 and 2.6 + +# the list of objects used to build the module +ipfw_mod-y = $(IPFW_SRCS:%.c=%.o) + +# additional $(CC) flags +ccflags-y += $(WARN) +ccflags-y += $(ipfw-cflags) +# if we really want debug symbols... +ccflags-y += -g + +mod24: include_e $(obj-m) + +$(obj-m): $(ipfw_mod-y) + $(LD) $(LDFLAGS) -m elf_i386 -r -o $@ $^ + +# M is the current directory, used in recursive builds +# so we allow it to be overridden +M ?= $(shell pwd) +endif # !openwrt +endif # !windows + +#--- various common targets +clean: + -rm -f *.o *.ko Module.symvers *.mod.c + -rm -rf objchk_wxp_x86 + -rm -rf include_e + +distclean: clean + -rm -f .*cmd modules.order opt_* + -rm -rf .tmp_versions include_e + -rm -rf .*.o.d _CL_* + +# support to create empty dirs and files in include_e/ +# EDIRS is the list of directories, EFILES is the list of files. 
+ +EDIRS += altq arpa machine net netinet netinet6 sys + +EFILES += opt_inet6.h opt_ipfw.h opt_ipsec.h opt_mpath.h +EFILES += opt_mbuf_stress_test.h opt_param.h + +EFILES += altq/if_altq.h +EFILES += arpa/inet.h +EFILES += machine/in_cksum.h +EFILES += net/ethernet.h net/netisr.h net/pf_mtag.h +EFILES += net/bpf.h net/if_types.h +EFILES += net/vnet.h + +EFILES += netinet/ether.h netinet/icmp6.h netinet/if_ether.h +EFILES += netinet/in.h netinet/in_pcb.h netinet/in_var.h +EFILES += netinet/in_systm.h +EFILES += netinet/ip_carp.h netinet/ip_var.h netinet/pim.h +EFILES += netinet/sctp.h netinet/tcp_timer.h netinet/tcpip.h +EFILES += netinet/udp_var.h + +EFILES += netinet6/ip6_var.h + +EFILES += sys/_lock.h sys/_rwlock.h sys/rmlock.h sys/_mutex.h sys/jail.h +EFILES += sys/condvar.h sys/eventhandler.h sys/domain.h +EFILES += sys/limits.h sys/lock.h sys/mutex.h sys/priv.h +EFILES += sys/proc.h sys/rwlock.h sys/socket.h sys/socketvar.h +EFILES += sys/sysctl.h sys/time.h sys/ucred.h + +include_e: + echo "running in $M" + -@rm -rf $(M)/include_e opt_* + -@mkdir -p $(M)/include_e + -@(cd $(M)/include_e; mkdir -p $(EDIRS); touch $(EFILES) ) + + +#--- some other targets for testing purposes +test_radix: test_radix.o radix.o +test_lookup: ip_fw_lookup.o +test_radix test_lookup: CFLAGS=-Wall -Werror -O1 diff --git a/dummynet2/bsd_compat.c b/dummynet2/bsd_compat.c new file mode 100644 index 0000000..1397951 --- /dev/null +++ b/dummynet2/bsd_compat.c @@ -0,0 +1,566 @@ +/* + * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: bsd_compat.c 6320 2010-05-24 11:54:36Z svn_panicucci $ + * + * kernel variables and functions that are not available in linux. + */ + +#include +#include /* do_div on 2.4 */ +#include /* get_random_bytes on 2.4 */ +#include +#include +#include + +/* + * gettimeofday would be in sys/time.h but it is not + * visible if _KERNEL is defined + */ +int gettimeofday(struct timeval *, struct timezone *); + +int ticks; /* kernel ticks counter */ +int hz = 1000; /* default clock time */ +long tick = 1000; /* XXX is this 1000000/hz ? (hz * tick == 1000000 with the values set here) */ +int bootverbose = 0; +struct timeval boottime; + +int ip_defttl = 64; /* XXX set default value */ +int max_linkhdr = 16; +int fw_one_pass = 1; +u_long in_ifaddrhmask; /* mask for hash table */ +struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ + +u_int rt_numfibs = RT_NUMFIBS; + +/* + * pfil hook support. + * We make pfil_head_get return a non-null pointer, which is then ignored + * in our 'add-hook' routines.
+ */ +struct pfil_head; +typedef int (pfil_hook_t) + (void *, struct mbuf **, struct ifnet *, int, struct inpcb *); + +struct pfil_head * +pfil_head_get(int proto, u_long flags) +{ + static int dummy; + return (struct pfil_head *)&dummy; +} + +int +pfil_add_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h) +{ + return 0; +} + +int +pfil_remove_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h) +{ + return 0; +} + +/* define empty body for kernel function */ +int +priv_check(struct thread *td, int priv) +{ + return 0; +} + +int +securelevel_ge(struct ucred *cr, int level) +{ + return 0; +} + +int +sysctl_handle_int(SYSCTL_HANDLER_ARGS) +{ + return 0; +} + +int +sysctl_handle_long(SYSCTL_HANDLER_ARGS) +{ + return 0; +} + +void +ether_demux(struct ifnet *ifp, struct mbuf *m) +{ + return; +} + +int +ether_output_frame(struct ifnet *ifp, struct mbuf *m) +{ + return 0; +} + +void +in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum) +{ + return; +} + +void +icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu) +{ + return; +} + +u_short +in_cksum_skip(struct mbuf *m, int len, int skip) +{ + return 0; +} + +u_short +in_cksum_hdr(struct ip *ip) +{ + return 0; +} + +/* + * we don't really reassemble, just return whatever we had. + */ +struct mbuf * +ip_reass(struct mbuf *clone) +{ + return clone; +} +#ifdef INP_LOCK_ASSERT +#undef INP_LOCK_ASSERT +#define INP_LOCK_ASSERT(a) +#endif + +/* credentials check */ +#include +#ifdef __linux__ +int +cred_check(void *_insn, int proto, struct ifnet *oif, + struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, + u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp, + struct sk_buff *skb) +{ + int match = 0; + ipfw_insn_u32 *insn = (ipfw_insn_u32 *)_insn; + + if (*ugid_lookupp == 0) { /* actively lookup and copy in cache */ + /* returns null if any element of the chain up to file is null. 
+ * if sk != NULL then we also have a reference + */ + *ugid_lookupp = linux_lookup(proto, + src_ip.s_addr, htons(src_port), + dst_ip.s_addr, htons(dst_port), + skb, oif ? 1 : 0, u); + } + if (*ugid_lookupp < 0) + return 0; + + if (insn->o.opcode == O_UID) + match = (u->uid == (uid_t)insn->d[0]); + else if (insn->o.opcode == O_JAIL) + match = (u->xid == (uid_t)insn->d[0]); + else if (insn->o.opcode == O_GID) + match = (u->gid == (uid_t)insn->d[0]); + return match; +} +#endif /* __linux__ */ + +int +jailed(struct ucred *cred) +{ + return 0; +} + +/* +* Return 1 if an internet address is for a ``local'' host +* (one to which we have a connection). If subnetsarelocal +* is true, this includes other subnets of the local net. +* Otherwise, it includes only the directly-connected (sub)nets. +*/ +int +in_localaddr(struct in_addr in) +{ + return 1; +} + +int +sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) +{ + size_t valsize = sopt->sopt_valsize; + + if (len < valsize) + sopt->sopt_valsize = valsize = len; + //printf("copyout buf = %p, sopt = %p, soptval = %p, len = %d \n", buf, sopt, sopt->sopt_val, len); + bcopy(buf, sopt->sopt_val, valsize); + return 0; +} + +/* + * copy data from userland to kernel + */ +int +sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) +{ + size_t valsize = sopt->sopt_valsize; + + if (valsize < minlen) + return EINVAL; + if (valsize > len) + sopt->sopt_valsize = valsize = len; + //printf("copyin buf = %p, sopt = %p, soptval = %p, len = %d \n", buf, sopt, sopt->sopt_val, len); + bcopy(sopt->sopt_val, buf, valsize); + return 0; +} + +void +getmicrouptime(struct timeval *tv) +{ + do_gettimeofday(tv); +} + + +#include + +char * +inet_ntoa_r(struct in_addr ina, char *buf) +{ +#ifdef _WIN32 +#else + unsigned char *ucp = (unsigned char *)&ina; + + sprintf(buf, "%d.%d.%d.%d", + ucp[0] & 0xff, + ucp[1] & 0xff, + ucp[2] & 0xff, + ucp[3] & 0xff); +#endif + return buf; +} + +char * +inet_ntoa(struct in_addr ina) +{ + 
static char buf[16]; + return inet_ntoa_r(ina, buf); +} + +int +random(void) +{ +#ifdef _WIN32 + static unsigned long seed; + if (seed == 0) { + LARGE_INTEGER tm; + KeQuerySystemTime(&tm); + seed = tm.LowPart; + } + return RtlRandomEx(&seed) & 0x7fffffff; +#else + int r; + get_random_bytes(&r, sizeof(r)); + return r & 0x7fffffff; +#endif +} + + +/* + * do_div really does a u64 / u32 bit division. + * we save the sign and convert to uint befor calling. + * We are safe just because we always call it with small operands. + */ +int64_t +div64(int64_t a, int64_t b) +{ +#ifdef _WIN32 + int a1 = a, b1 = b; + return a1/b1; +#else + uint64_t ua, ub; + int sign = ((a>0)?1:-1) * ((b>0)?1:-1); + + ua = ((a>0)?a:-a); + ub = ((b>0)?b:-b); + do_div(ua, ub); + return sign*ua; +#endif +} + +#ifdef __MIPSEL__ +size_t +strlcpy(char *dst, const char *src, size_t siz) +{ + char *d = dst; + const char *s = src; + size_t n = siz; + + /* Copy as many bytes as will fit */ + if (n != 0 && --n != 0) { + do { + if ((*d++ = *s++) == 0) + break; + } while (--n != 0); + } + + /* Not enough room in dst, add NUL and traverse rest of src */ + if (n == 0) { + if (siz != 0) + *d = '\0'; /* NUL-terminate dst */ + while (*s++) + ; + } + + return(s - src - 1); /* count does not include NUL */ +} +#endif // __MIPSEL__ + +/* + * compact version of fnmatch. + */ +int +fnmatch(const char *pattern, const char *string, int flags) +{ + char s; + + if (!string || !pattern) + return 1; /* no match */ + while ( (s = *string++) ) { + char p = *pattern++; + if (p == '\0') /* pattern is over, no match */ + return 1; + if (p == '*') /* wildcard, match */ + return 0; + if (p == '.' || p == s) /* char match, continue */ + continue; + return 1; /* no match */ + } + /* end of string, make sure the pattern is over too */ + if (*pattern == '\0' || *pattern == '*') + return 0; + return 1; /* no match */ +} + + +/* + * linux 2.6.33 defines these functions to access to + * skbuff internal structures. 
Define the missing + * function for the previous versions too. + */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31) +inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) +{ + skb->dst = dst; +} + +inline struct dst_entry *skb_dst(const struct sk_buff *skb) +{ + return (struct dst_entry *)skb->dst; +} +#endif + + +/* support for sysctl emulation. + * XXX this is actually MI code that should be enabled also on openwrt + */ +#ifdef EMULATE_SYSCTL +static struct sysctltable GST; + +int +kesysctl_emu_get(struct sockopt* sopt) +{ + struct dn_id* oid = sopt->sopt_val; + struct sysctlhead* entry; + int sizeneeded = sizeof(struct dn_id) + GST.totalsize + + sizeof(struct sysctlhead); + unsigned char* pstring; + unsigned char* pdata; + int i; + + if (sopt->sopt_valsize < sizeneeded) { + // this is a probe to retrieve the space needed for + // a dump of the sysctl table + oid->id = sizeneeded; + sopt->sopt_valsize = sizeof(struct dn_id); + return 0; + } + + entry = (struct sysctlhead*)(oid+1); + for( i=0; iblocklen = GST.entry[i].head.blocklen; + entry->namelen = GST.entry[i].head.namelen; + entry->flags = GST.entry[i].head.flags; + entry->datalen = GST.entry[i].head.datalen; + pdata = (unsigned char*)(entry+1); + pstring = pdata+GST.entry[i].head.datalen; + bcopy(GST.entry[i].data, pdata, GST.entry[i].head.datalen); + bcopy(GST.entry[i].name, pstring, GST.entry[i].head.namelen); + entry = (struct sysctlhead*) + ((unsigned char*)(entry) + GST.entry[i].head.blocklen); + } + sopt->sopt_valsize = sizeneeded; + return 0; +} + +int +kesysctl_emu_set(void* p, int l) +{ + struct sysctlhead* entry; + unsigned char* pdata; + unsigned char* pstring; + int i = 0; + + entry = (struct sysctlhead*)(((struct dn_id*)p)+1); + pdata = (unsigned char*)(entry+1); + pstring = pdata + entry->datalen; + + for (i=0; idatalen != GST.entry[i].head.datalen) { + printf("%s: len mismatch, user %d vs kernel %d\n", + __FUNCTION__, entry->datalen, + GST.entry[i].head.datalen); + return -1; + 
} + // check access (at the moment flags handles only the R/W rights + //later on will be type + access + if( (GST.entry[i].head.flags & 3) == CTLFLAG_RD) { + printf("%s: the entry %s is read only\n", + __FUNCTION__,GST.entry[i].name); + return -1; + } + bcopy(pdata, GST.entry[i].data, GST.entry[i].head.datalen); + return 0; + } + printf("%s: match not found\n",__FUNCTION__); + return 0; +} + +/* convert all _ to . until the first . */ +static void +underscoretopoint(char* s) +{ + for (; *s && *s != '.'; s++) + if (*s == '_') + *s = '.'; +} + +static int +formatnames() +{ + int i; + int size=0; + char* name; + + for (i=0; i> 2, + GST.entry[i].head.flags & 0x00000003); + printf("data %i\n", *(int*)(GST.entry[i].data)); + printf("datalen %i\n", GST.entry[i].head.datalen); + printf("blocklen %i\n", GST.entry[i].head.blocklen); + } +} + +void sysctl_addgroup_f1(); +void sysctl_addgroup_f2(); +void sysctl_addgroup_f3(); +void sysctl_addgroup_f4(); + +void +keinit_GST() +{ + int ret; + + sysctl_addgroup_f1(); + sysctl_addgroup_f2(); + sysctl_addgroup_f3(); + sysctl_addgroup_f4(); + ret = formatnames(); + if (ret != 0) + printf("conversion of names failed for some reason\n"); + //dumpGST(); + printf("*** Global Sysctl Table entries = %i, total size = %i ***\n", + GST.count, GST.totalsize); +} + +void +keexit_GST() +{ + if (GST.namebuffer != NULL) + free(GST.namebuffer,0); + bzero(&GST, sizeof(GST)); +} + +void +sysctl_pushback(char* name, int flags, int datalen, void* data) +{ + if (GST.count >= GST_HARD_LIMIT) { + printf("WARNING: global sysctl table full, this entry will not be added," + "please recompile the module increasing the table size\n"); + return; + } + GST.entry[GST.count].head.namelen = strlen(name)+1; //add space for '\0' + GST.entry[GST.count].name = name; + GST.entry[GST.count].head.flags = flags; + GST.entry[GST.count].data = data; + GST.entry[GST.count].head.datalen = datalen; + GST.entry[GST.count].head.blocklen = + ((sizeof(struct sysctlhead) + 
GST.entry[GST.count].head.namelen + + GST.entry[GST.count].head.datalen)+3) & ~3; + GST.totalsize += GST.entry[GST.count].head.blocklen; + GST.count++; +} +#endif /* EMULATE_SYSCTL */ diff --git a/dummynet2/debug.c b/dummynet2/debug.c new file mode 100644 index 0000000..67a4f23 --- /dev/null +++ b/dummynet2/debug.c @@ -0,0 +1,67 @@ +#include + +const char* texify_cmd(int i) +{ + if (i==110) + return("IP_FW_ADD"); + if (i==111) + return("IP_FW_DEL"); + if (i==112) + return("IP_FW_FLUSH"); + if (i==113) + return("IP_FW_ZERO"); + if (i==114) + return("IP_FW_GET"); + if (i==115) + return("IP_FW_RESETLOG"); + if (i==116) + return("IP_FW_NAT_CFG"); + if (i==117) + return("IP_FW_NAT_DEL"); + if (i==118) + return("IP_FW_NAT_GET_CONFIG"); + if (i==119) + return("IP_FW_NAT_GET_LOG"); + if (i==120) + return("IP_DUMMYNET_CONFIGURE"); + if (i==121) + return("IP_DUMMYNET_DEL"); + if (i==122) + return("IP_DUMMYNET_FLUSH"); + if (i==124) + return("IP_DUMMYNET_GET"); + if (i==108) + return("IP_FW3"); + if (i==109) + return("IP_DUMMYNET3"); + return ("BOH"); +} + +const char* texify_proto(unsigned int p) +{ + if (p==1) + return("ICMP"); + if (p==6) + return("TCP"); + if (p==17) + return("UDP"); + return("OTHER"); +} + +void hexdump(unsigned char* addr, int len, const char *msg) +{ + int i; + const int cicli = len/8; + const int resto = len%8; + unsigned char d[8]; + + DbgPrint("%s at %p len %d\n", msg, addr, len); + for (i=0; i<=cicli; i++) { + bzero(d, 8); + bcopy(addr+i*8, d, i < cicli ? 
8 : resto); + DbgPrint("%04X %02X %02X %02X %02X %02X %02X %02X %02X\n", + i*8, d[0], d[1], d[2], d[3], d[4], + d[5], d[6], d[7]); + } + DbgPrint("\n"); +} diff --git a/dummynet2/dn_heap.c b/dummynet2/dn_heap.c new file mode 100644 index 0000000..a56d185 --- /dev/null +++ b/dummynet2/dn_heap.c @@ -0,0 +1,588 @@ +/*- + * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * Binary heap and hash tables, used in dummynet + * + * $Id: dn_heap.c 7119 2010-07-15 13:51:07Z luigi $ + */ + +#include +#include +#ifdef _KERNEL +__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/dn_heap.c 203279 2010-01-31 12:20:29Z luigi $"); +#include +#include +#include +#include +/* kernel build: log() expands to nothing unless something already defined it */ +#ifndef log +#define log(x, arg...) +#endif + +#else /* !_KERNEL */ + +#include +#include +#include +#include + +#include "dn_heap.h" +/* + * Non-kernel build: log() and panic() print to stderr (panic() also + * calls exit(1)) and MALLOC_DEFINE() expands to nothing. + * The kernel-style malloc(size, type, flags) and free(ptr, type) calls + * used below are mapped onto my_malloc()/my_free(), which drop the + * extra arguments. The two wrappers are defined before the macros so + * that inside them malloc/free still name the plain functions. + */ +#define log(x, arg...) fprintf(stderr, ## arg) +#define panic(x...) fprintf(stderr, ## x), exit(1) +#define MALLOC_DEFINE(a, b, c) +static void *my_malloc(int s) { return malloc(s); } +static void my_free(void *p) { free(p); } +#define malloc(s, t, w) my_malloc(s) +#define free(p, t) my_free(p) +#endif /* !_KERNEL */ + +MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap"); + +/* + * Heap management functions. + * + * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. + * Some macros help finding parent/children so we can optimize them. + * + * heap_resize() is called to expand the heap when needed. + * Increment size in blocks of 16 entries.
+ * Returns 1 on error, 0 on success + */ +#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) +#define HEAP_LEFT(x) ( (x)+(x) + 1 ) +/* swap two entries through the caller-supplied temporary 'buffer' */ +#define HEAP_SWAP(a, b, buffer) do { (buffer) = (a) ; (a) = (b) ; (b) = (buffer) ; } while (0) +#define HEAP_INCREMENT 15 + +/* + * Make room for at least new_size entries in heap h. + * Nothing happens when the array is already large enough; the heap is + * never shrunk. Otherwise a larger array is allocated, the old entries + * are copied into it and the old array is released. + * Returns 0 on success, 1 if the allocation fails (h is then unchanged). + */ +static int +heap_resize(struct dn_heap *h, unsigned int new_size) +{ + struct dn_heap_entry *p; + + if (h->size >= new_size ) /* have enough room */ + return 0; +#if 1 /* set every bit below the highest one: the result is 2^k - 1 */ + new_size |= new_size >> 1; + new_size |= new_size >> 2; + new_size |= new_size >> 4; + new_size |= new_size >> 8; + new_size |= new_size >> 16; +#else + new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT; +#endif + p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT); + if (p == NULL) { + /* new_size is unsigned, hence %u */ + printf("--- %s, resize %u failed\n", __func__, new_size ); + return 1; /* error */ + } + if (h->size > 0) { /* carry the old entries over, then drop the old array */ + bcopy(h->p, p, h->size * sizeof(*p) ); + free(h->p, M_DN_HEAP); + } + h->p = p; + h->size = new_size; + return 0; +} + +/* + * Set up h as an empty heap with room for at least 'size' entries. + * 'ofs' is the byte offset, inside each stored object, of the int that + * receives the object's heap index (see SET_OFFSET); a value <= 0 + * turns that bookkeeping off. + * h must be zero-filled or already initialised, because heap_resize() + * reads h->size and h->p. + * Returns 0 on success, 1 on a negative size or allocation failure. + */ +int +heap_init(struct dn_heap *h, int size, int ofs) +{ + /* a negative size would become a huge unsigned request below */ + if (size < 0) + return 1; + if (heap_resize(h, (unsigned int)size)) + return 1; + h->elements = 0; + h->ofs = ofs; + return 0; +} + +/* + * Insert element in heap. Normally, p != NULL, we insert p in + * a new position and bubble up. If p == NULL, then the element is + * already in place, and key is the position where to start the + * bubble-up. + * Returns 1 on failure (cannot allocate new heap entry) + * + * If ofs > 0 the position (index, int) of the element in the heap is + * also stored in the element itself at the given offset in bytes. + */ +#define SET_OFFSET(h, i) do { \ + if ((h)->ofs > 0) \ + *((int32_t *)((char *)((h)->p[(i)].object) + (h)->ofs)) = (i); \ + } while (0) +/* + * RESET_OFFSET is used for sanity checks. It sets ofs + * to an invalid value.
+ */ +#define RESET_OFFSET(h, i) do { \ + if (h->ofs > 0) \ + *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16; \ + } while (0) + +int +heap_insert(struct dn_heap *h, uint64_t key1, void *p) +{ + int son = h->elements; + + //log("%s key %llu p %p\n", __FUNCTION__, key1, p); + if (p == NULL) { /* data already there, set starting point */ + son = key1; + } else { /* insert new element at the end, possibly resize */ + son = h->elements; + if (son == h->size) /* need resize... */ + // XXX expand by 16 or so + if (heap_resize(h, h->elements+16) ) + return 1; /* failure... */ + h->p[son].object = p; + h->p[son].key = key1; + h->elements++; + } + /* make sure that son >= father along the path */ + while (son > 0) { + int father = HEAP_FATHER(son); + struct dn_heap_entry tmp; + + if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) + break; /* found right position */ + /* son smaller than father, swap and repeat */ + HEAP_SWAP(h->p[son], h->p[father], tmp); + SET_OFFSET(h, son); + son = father; + } + SET_OFFSET(h, son); + return 0; +} + +/* + * remove top element from heap, or obj if obj != NULL + */ +void +heap_extract(struct dn_heap *h, void *obj) +{ + int child, father, max = h->elements - 1; + + if (max < 0) { + printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h); + return; + } + if (obj == NULL) + father = 0; /* default: move up smallest child */ + else { /* extract specific element, index is at offset */ + if (h->ofs <= 0) + panic("%s: extract from middle not set on %p\n", + __FUNCTION__, h); + father = *((int *)((char *)obj + h->ofs)); + if (father < 0 || father >= h->elements) { + panic("%s: father %d out of bound 0..%d\n", + __FUNCTION__, father, h->elements); + } + } + /* + * below, father is the index of the empty element, which + * we replace at each step with the smallest child until we + * reach the bottom level. + */ + // XXX why removing RESET_OFFSET increases runtime by 10% ? 
+ RESET_OFFSET(h, father); + while ( (child = HEAP_LEFT(father)) <= max ) { + if (child != max && + DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) + child++; /* take right child, otherwise left */ + h->p[father] = h->p[child]; + SET_OFFSET(h, father); + father = child; + } + h->elements--; + if (father != max) { + /* + * Fill hole with last entry and bubble up, + * reusing the insert code + */ + h->p[father] = h->p[max]; + heap_insert(h, father, NULL); + } +} + +#if 0 +/* + * change object position and update references + * XXX this one is never used! + */ +static void +heap_move(struct dn_heap *h, uint64_t new_key, void *object) +{ + int temp, i, max = h->elements-1; + struct dn_heap_entry *p, buf; + + if (h->ofs <= 0) + panic("cannot move items on this heap"); + p = h->p; /* shortcut */ + + i = *((int *)((char *)object + h->ofs)); + if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */ + p[i].key = new_key; + for (; i>0 && + DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key); + i = temp ) { /* bubble up */ + HEAP_SWAP(p[i], p[temp], buf); + SET_OFFSET(h, i); + } + } else { /* must move down */ + p[i].key = new_key; + while ( (temp = HEAP_LEFT(i)) <= max ) { + /* found left child */ + if (temp != max && + DN_KEY_LT(p[temp+1].key, p[temp].key)) + temp++; /* select child with min key */ + if (DN_KEY_LT(>p[temp].key, new_key)) { + /* go down */ + HEAP_SWAP(p[i], p[temp], buf); + SET_OFFSET(h, i); + } else + break; + i = temp; + } + } + SET_OFFSET(h, i); +} +#endif /* heap_move, unused */ + +/* + * heapify() will reorganize data inside an array to maintain the + * heap property. It is needed when we delete a bunch of entries. 
+ */ +static void +heapify(struct dn_heap *h) +{ + int i; + + for (i = 0; i < h->elements; i++ ) + heap_insert(h, i , NULL); +} + +int +heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t), + uintptr_t arg) +{ + int i, ret, found; + + for (i = found = 0 ; i < h->elements ;) { + ret = fn(h->p[i].object, arg); + if (ret & HEAP_SCAN_DEL) { + h->elements-- ; + h->p[i] = h->p[h->elements] ; + found++ ; + } else + i++ ; + if (ret & HEAP_SCAN_END) + break; + } + if (found) + heapify(h); + return found; +} + +/* + * cleanup the heap and free data structure + */ +void +heap_free(struct dn_heap *h) +{ + if (h->size >0 ) + free(h->p, M_DN_HEAP); + bzero(h, sizeof(*h) ); +} + +/* + * hash table support. + */ + +struct dn_ht { + int buckets; /* how many buckets, really buckets - 1*/ + int entries; /* how many entries */ + int ofs; /* offset of link field */ + uint32_t (*hash)(uintptr_t, int, void *arg); + int (*match)(void *_el, uintptr_t key, int, void *); + void *(*newh)(uintptr_t, int, void *); + void **ht; /* bucket heads */ +}; +/* + * Initialize, allocating bucket pointers inline. + * Recycle previous record if possible. + * If the 'newh' function is not supplied, we assume that the + * key passed to ht_find is the same object to be stored in. + */ +struct dn_ht * +dn_ht_init(struct dn_ht *ht, int buckets, int ofs, + uint32_t (*h)(uintptr_t, int, void *), + int (*match)(void *, uintptr_t, int, void *), + void *(*newh)(uintptr_t, int, void *)) +{ + int l; + + /* + * Notes about rounding bucket size to a power of two. + * Given the original bucket size, we compute the nearest lower and + * higher power of two, minus 1 (respectively b_min and b_max) because + * this value will be used to do an AND with the index returned + * by hash function. + * To choice between these two values, the original bucket size is + * compared with b_min. If the original size is greater than 4/3 b_min, + * we round the bucket size to b_max, else to b_min. 
+ * This ratio try to round to the nearest power of two, advantaging + * the greater size if the different between two power is relatively + * big. + * Rounding the bucket size to a power of two avoid the use of + * module when calculating the correct bucket. + * The ht->buckets variable store the bucket size - 1 to simply + * do an AND between the index returned by hash function and ht->bucket + * instead of a module. + */ + int b_min; /* min buckets */ + int b_max; /* max buckets */ + int b_ori; /* original buckets */ + + if (h == NULL || match == NULL) { + printf("--- missing hash or match function"); + return NULL; + } + if (buckets < 1 || buckets > 65536) + return NULL; + + b_ori = buckets; + /* calculate next power of 2, - 1*/ + buckets |= buckets >> 1; + buckets |= buckets >> 2; + buckets |= buckets >> 4; + buckets |= buckets >> 8; + buckets |= buckets >> 16; + + b_max = buckets; /* Next power */ + b_min = buckets >> 1; /* Previous power */ + + /* Calculate the 'nearest' bucket size */ + if (b_min * 4000 / 3000 < b_ori) + buckets = b_max; + else + buckets = b_min; + + if (ht) { /* see if we can reuse */ + if (buckets <= ht->buckets) { + ht->buckets = buckets; + } else { + /* free pointers if not allocated inline */ + if (ht->ht != (void *)(ht + 1)) + free(ht->ht, M_DN_HEAP); + free(ht, M_DN_HEAP); + ht = NULL; + } + } + if (ht == NULL) { + /* Allocate buckets + 1 entries because buckets is use to + * do the AND with the index returned by hash function + */ + l = sizeof(*ht) + (buckets + 1) * sizeof(void **); + ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO); + } + if (ht) { + ht->ht = (void **)(ht + 1); + ht->buckets = buckets; + ht->ofs = ofs; + ht->hash = h; + ht->match = match; + ht->newh = newh; + } + return ht; +} + +/* dummy callback for dn_ht_free to unlink all */ +static int +do_del(void *obj, void *arg) +{ + return DNHT_SCAN_DEL; +} + +void +dn_ht_free(struct dn_ht *ht, int flags) +{ + if (ht == NULL) + return; + if (flags & DNHT_REMOVE) { + 
(void)dn_ht_scan(ht, do_del, NULL); + } else { + if (ht->ht && ht->ht != (void *)(ht + 1)) + free(ht->ht, M_DN_HEAP); + free(ht, M_DN_HEAP); + } +} + +int +dn_ht_entries(struct dn_ht *ht) +{ + return ht ? ht->entries : 0; +} + +/* + * Helper function to scan a bucket in the hash table, it + * can only be called on a non-empty bucket for a valid table. + * + * In lookup and scan, consider ht->ht[i] as pointing to the tail + * of the queue (head is NEXTP(tail). The 'empty' value is irrelevant. + * While searching, start analysing p = head, end when p == tail. + * Note that 'tail' is a cache of the _original_ ht->ht[i] + * and is used to check for loop termination. If you remove + * it, you must also adjust 'p' when deleting the 'tail' element. + */ +#define NEXT(_h, _p) *((void **)((char *)(_p) + (_h)->ofs)) +static int +dn_ht_scan_body(struct dn_ht *ht, int *bucket, + int (*fn)(void *, void *), void *arg) +{ + int ret, found = 0, i = *bucket; + void *tail, *pp, *p, *nextp; + + pp = tail = ht->ht[i]; + do { + p = NEXT(ht, pp); + nextp = NEXT(ht, p); + ret = fn(p, arg); + if ((ret & DNHT_SCAN_DEL) == 0) { + pp = p; /* prepare for next loop */ + } else { + found++; + ht->entries--; + /* skip current element */ + if (pp != p) + /* pp == p implies p == tail */ + NEXT(ht, pp) = nextp; + if (p == tail) + ht->ht[i] = (pp != p) ? pp : NULL; + } + if (ret & DNHT_SCAN_END) { + /* Update ht->ht[i] before returning */ + ht->ht[i] = (ht->ht[i] == NULL) ? NULL : pp; + return found; + } + } while (p != tail); + + (*bucket)++; + return found; +} + +/* + * lookup and optionally create or delete element. + * This is an optimized version of the scan so it is coded + * inline. + */ +void * +dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg) +{ + int i, found; + void *tail, *pp, *p; /* pp is the prev element, pp is current */ + + if (ht == NULL) /* easy on an empty hash */ + return NULL; + i = (ht->buckets == 1) ? 
0 : + (ht->hash(key, flags, arg) & ht->buckets); + + pp = tail = ht->ht[i]; + if (tail) { /* non empty, try a lookup */ + do { + p = NEXT(ht, pp); + found = (flags & DNHT_MATCH_PTR) ? key == (uintptr_t)p : + ht->match(p, key, flags, arg); + if (!found) + continue; + if (flags & DNHT_REMOVE) { + ht->entries--; + if (p != pp) /* skip current element */ + NEXT(ht, pp) = NEXT(ht, p); + if (p == tail) + ht->ht[i] = (pp != p) ? pp : NULL; + } + return p; + } while ( (pp = p) != tail); + } + /* not found */ + if ((flags & DNHT_INSERT) == 0) + return NULL; + p = ht->newh ? ht->newh(key, flags, arg) : (void *)key; + if (p) { + ht->entries++; + if (tail == NULL) { + ht->ht[i] = NEXT(ht, p) = p; + } else { + NEXT(ht, p) = NEXT(ht, tail); + NEXT(ht, tail) = p; + } + } + + return p; +} + +/* + * do a scan with the option to delete the object. + * Similar to the lookup, but the match function is different, + * and we extract 'next' before running the callback because + * the element may be destroyed there. + */ +int +dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg) +{ + int i, bucket, found = 0; + + if (ht == NULL || fn == NULL) + return 0; + for (i = 0; i <= ht->buckets; i++) { + if (ht->ht[i] == NULL) + continue; /* empty bucket */ + bucket = i; + found += dn_ht_scan_body(ht, &bucket, fn, arg); + if (bucket == i) /* early exit */ + return found; + } + return found; +} + +/* + * Similar to dn_ht_scan(), except that the scan is performed only + * in the bucket 'bucket'. The function returns a correct bucket number if + * the original is invalid. + * If the callback returns DNHT_SCAN_END, the function move the ht->ht[i] + * pointer to the last entry processed. Moreover, the bucket number passed + * by caller is decremented, because usually the caller increment it. 
+ */ +int +dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *), + void *arg) +{ + /* nothing to walk without a table or a callback */ + if (!ht || !fn) + return 0; + /* an index outside 0..ht->buckets restarts from the first bucket */ + if (*bucket < 0 || *bucket > ht->buckets) + *bucket = 0; + /* non-empty bucket: let the common scan body run the callback */ + if (ht->ht[*bucket] != NULL) + return dn_ht_scan_body(ht, bucket, fn, arg); + /* empty bucket: step to the next one, nothing was removed */ + (*bucket)++; + return 0; +} diff --git a/dummynet2/dn_sched_fifo.c b/dummynet2/dn_sched_fifo.c new file mode 100644 index 0000000..d7d923e --- /dev/null +++ b/dummynet2/dn_sched_fifo.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE.
+ */ + +/* + * $Id: dn_sched_fifo.c 5621 2010-03-04 16:51:27Z luigi $ + */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include +#include +#include +#include +#else +#include +#endif + +/* + * FIFO scheduler built around exactly one queue. + * The queue sits right after the scheduler instance (at si + 1), + * and a single flowset in the template stores queue size and policy. + * Packets go in and out through dn_enqueue() and dn_dequeue(). + */ +static int +fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m) +{ + /* the q argument is not used: everything goes to the built-in queue */ + struct dn_queue *fifo_q = (struct dn_queue *)(si + 1); + + /* XXX a call with q != NULL and m == NULL is a re-enqueue from + * an existing scheduler, which we should handle. + */ + return dn_enqueue(fifo_q, m, 0); +} + +static struct mbuf * +fifo_dequeue(struct dn_sch_inst *si) +{ + struct dn_queue *fifo_q = (struct dn_queue *)(si + 1); + + return dn_dequeue(fifo_q); +} + +static int +fifo_new_sched(struct dn_sch_inst *si) +{ + /* the queue lives in the extra space that follows the instance */ + struct dn_queue *fifo_q = (struct dn_queue *)(si + 1); + + set_oid(&fifo_q->ni.oid, DN_QUEUE, sizeof(*fifo_q)); + fifo_q->fs = si->sched->fs; /* flowset kept in the template */ + fifo_q->_si = si; /* back pointer to the owning instance */ + return 0; +} + +static int +fifo_free_sched(struct dn_sch_inst *si) +{ + struct dn_queue *fifo_q = (struct dn_queue *)(si + 1); + + /* hand the pending packet chain to dn_free_pkts(), then wipe the queue */ + dn_free_pkts(fifo_q->mq.head); + bzero(fifo_q, sizeof(*fifo_q)); + return 0; +} + +/* + * FIFO scheduler descriptor + * contains the type of the scheduler, the name, the size of extra + * data structures, and function pointers.
+ */ +/* NOTE(review): _SI() is defined elsewhere; the entries look positional when it expands to nothing, so keep them in this order - confirm */ +static struct dn_alg fifo_desc = { + _SI( .type = ) DN_SCHED_FIFO, + _SI( .name = ) "FIFO", + _SI( .flags = ) 0, + + /* the only extra space requested is one struct dn_queue per instance */ + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct dn_queue), + _SI( .q_datalen = ) 0, + + /* entries left NULL are hooks this scheduler does not provide */ + _SI( .enqueue = ) fifo_enqueue, + _SI( .dequeue = ) fifo_dequeue, + _SI( .config = ) NULL, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) fifo_new_sched, + _SI( .free_sched = ) fifo_free_sched, + _SI( .new_fsk = ) NULL, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) NULL, + _SI( .free_queue = ) NULL, +}; + +DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc); diff --git a/dummynet2/dn_sched_prio.c b/dummynet2/dn_sched_prio.c new file mode 100755 index 0000000..72af5da --- /dev/null +++ b/dummynet2/dn_sched_prio.c @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: dn_sched_prio.c 6338 2010-05-26 15:06:34Z svn_panicucci $ + */ +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include +#include +#include +#include +#else +#include +#endif + +#define DN_SCHED_PRIO 5 //XXX + +/* + * Single-word bit helpers, defined here whenever this is not a Linux + * kernel build. pData points to the BITMAP_T word. + * The mask is built from 1UL: with a plain int 1, bit 31 sign-extends + * into the upper half of a 64-bit word and a shift by 32 or more is + * undefined. + */ +#if !defined(_KERNEL) || !defined(__linux__) +#define test_bit(ix, pData) ((*(pData)) & (1UL << (ix))) +#define __set_bit(ix, pData) ((*(pData)) |= (1UL << (ix))) +#define __clear_bit(ix, pData) ((*(pData)) &= ~(1UL << (ix))) +#endif + +/* kept token-for-token identical to the definition above so that both may be active */ +#ifdef __MIPSEL__ +#define __clear_bit(ix, pData) ((*(pData)) &= ~(1UL << (ix))) +#endif + +/* + * Size of the array of queues pointers. + * prio_dequeue() finds the backlogged queue with ffs(), which only + * looks at an int, so no more priorities are exposed than an int has + * bits. With 64-bit longs the previous sizeof(BITMAP_T) * 8 accepted + * priorities 32..63 that ffs() could never find. + * NOTE(review): BITMAP_T stays unsigned long, presumably because the + * Linux kernel bit helpers work on unsigned long words - confirm. + */ +#define BITMAP_T unsigned long +#define MAXPRIO (sizeof(int) * 8) + +/* + * The scheduler instance contains an array of pointers to queues, + * one for each priority, and a bitmap listing backlogged queues. + */ +struct prio_si { + BITMAP_T bitmap; /* bit i set: q_array[i] holds the backlogged queue */ + struct dn_queue *q_array[MAXPRIO]; /* one queue pointer per priority */ +}; + +/* + * If a queue with the same priority is already backlogged, use + * that one instead of the queue passed as argument.
+ */ +static int +prio_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) +{ + struct prio_si *si = (struct prio_si *)(_si + 1); + int prio = q->fs->fs.par[0]; + + if (test_bit(prio, &si->bitmap) == 0) { + /* No queue with this priority, insert */ + __set_bit(prio, &si->bitmap); + si->q_array[prio] = q; + } else { /* use the existing queue */ + q = si->q_array[prio]; + } + if (dn_enqueue(q, m, 0)) + return 1; + return 0; +} + +/* + * Packets are dequeued only from the highest priority queue. + * The function ffs() return the lowest bit in the bitmap that rapresent + * the array index (-1) which contains the pointer to the highest priority + * queue. + * After the dequeue, if this queue become empty, it is index is removed + * from the bitmap. + * Scheduler is idle if the bitmap is empty + * + * NOTE: highest priority is 0, lowest is sched->max_prio_q + */ +static struct mbuf * +prio_dequeue(struct dn_sch_inst *_si) +{ + struct prio_si *si = (struct prio_si *)(_si + 1); + struct mbuf *m; + struct dn_queue *q; + int prio; + + if (si->bitmap == 0) /* scheduler idle */ + return NULL; + + prio = ffs(si->bitmap) - 1; + + /* Take the highest priority queue in the scheduler */ + q = si->q_array[prio]; + // assert(q) + + m = dn_dequeue(q); + if (q->mq.head == NULL) { + /* Queue is now empty, remove from scheduler + * and mark it + */ + si->q_array[prio] = NULL; + __clear_bit(prio, &si->bitmap); + } + return m; +} + +static int +prio_new_sched(struct dn_sch_inst *_si) +{ + struct prio_si *si = (struct prio_si *)(_si + 1); + + bzero(si->q_array, sizeof(si->q_array)); + si->bitmap = 0; + + return 0; +} + +static int +prio_new_fsk(struct dn_fsk *fs) +{ + /* Check if the prioritiy is between 0 and MAXPRIO-1 */ + ipdn_bound_var(&fs->fs.par[0], 0, 0, MAXPRIO - 1, "PRIO priority"); + return 0; +} + +static int +prio_new_queue(struct dn_queue *q) +{ + struct prio_si *si = (struct prio_si *)(q->_si + 1); + int prio = q->fs->fs.par[0]; + struct dn_queue *oldq; + + 
q->ni.oid.subtype = DN_SCHED_PRIO;

	/* an empty queue needs no bookkeeping in the scheduler instance */
	if (q->mq.head == NULL)
		return 0;

	/* Queue already holds packets, must insert in the scheduler or
	 * append mbufs to existing queue. This partly duplicates prio_enqueue
	 */
	if (test_bit(prio, &si->bitmap) == 0) {
		/* No queue with this priority, insert */
		__set_bit(prio, &si->bitmap);
		si->q_array[prio] = q;
	} else if ( (oldq = si->q_array[prio]) != q) {
		/* must append to the existing queue.
		 * can simply append q->mq.head to oldq's packet list
		 * and add the counters to those of oldq; q is left empty.
		 */
		oldq->mq.tail->m_nextpkt = q->mq.head;
		oldq->mq.tail = q->mq.tail;
		oldq->ni.length += q->ni.length;
		q->ni.length = 0;
		oldq->ni.len_bytes += q->ni.len_bytes;
		q->ni.len_bytes = 0;
		q->mq.tail = q->mq.head = NULL;
	}
	return 0;
}

/*
 * Detach q from the scheduler instance if it is the queue currently
 * registered for its priority.  The 'safe' argument is not used here.
 * Always returns 0.
 */
static int
prio_free_queue(struct dn_queue *q, int safe)
{
	int prio = q->fs->fs.par[0];
	struct prio_si *si = (struct prio_si *)(q->_si + 1);

	if (si->q_array[prio] == q) {
		si->q_array[prio] = NULL;
		__clear_bit(prio, &si->bitmap);
	}
	return 0;
}


/*
 * PRIO scheduler descriptor: type, name, sizes of the private areas
 * and the callbacks implemented above.
 */
static struct dn_alg prio_desc = {
	_SI( .type = ) DN_SCHED_PRIO,
	_SI( .name = ) "PRIO",
	_SI( .flags = ) DN_MULTIQUEUE,

	/* extra space is requested only in the scheduler instance */
	_SI( .schk_datalen = ) 0,
	_SI( .si_datalen = ) sizeof(struct prio_si),
	_SI( .q_datalen = ) 0,

	_SI( .enqueue = ) prio_enqueue,
	_SI( .dequeue = ) prio_dequeue,

	_SI( .config = ) NULL,
	_SI( .destroy = ) NULL,
	_SI( .new_sched = ) prio_new_sched,
	_SI( .free_sched = ) NULL,

	_SI( .new_fsk = ) prio_new_fsk,
	_SI( .free_fsk = ) NULL,

	_SI( .new_queue = ) prio_new_queue,
	_SI( .free_queue = ) prio_free_queue,
};


DECLARE_DNSCHED_MODULE(dn_prio, &prio_desc);
diff --git a/dummynet2/dn_sched_qfq.c b/dummynet2/dn_sched_qfq.c
new file mode 100644
index 0000000..eddb472
--- /dev/null
+++ b/dummynet2/dn_sched_qfq.c
@@ -0,0 +1,864 @@
/*
 * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente
 * All rights reserved
 *
 * Redistribution
and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: dn_sched_qfq.c 6552 2010-06-15 11:24:59Z svn_panicucci $ + */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include +#include +#include +#include +#else +#include +#endif + +#ifdef QFQ_DEBUG +struct qfq_sched; +static void dump_sched(struct qfq_sched *q, const char *msg); +#define NO(x) x +#else +#define NO(x) +#endif +#define DN_SCHED_QFQ 4 // XXX Where? +typedef unsigned long bitmap; + +/* + * bitmaps ops are critical. Some linux versions have __fls + * and the bitmap ops. 
Some machines have ffs + */ +#if defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24)) +int fls(unsigned int n) +{ + int i = 0; + for (i = 0; n > 0; n >>= 1, i++) + ; + return i; +} +#endif + +#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24)) +static inline unsigned long __fls(unsigned long word) +{ + return fls(word) - 1; +} +#endif + +#if !defined(_KERNEL) || !defined(__linux__) +#ifdef QFQ_DEBUG +int test_bit(int ix, bitmap *p) +{ + if (ix < 0 || ix > 31) + D("bad index %d", ix); + return *p & (1< 31) + D("bad index %d", ix); + *p |= (1< 31) + D("bad index %d", ix); + *p &= ~(1<index = 0 + *.__grp->slot_shift + + where MIN_SLOT_SHIFT is derived by difference from the others. + +The max group index corresponds to Lmax/w_min, where +Lmax=1<group mapping. Class weights are + * in the range [1, QFQ_MAX_WEIGHT], we to map each class i to the + * group with the smallest index that can support the L_i / r_i + * configured for the class. + * + * grp->index is the index of the group; and grp->slot_shift + * is the shift for the corresponding (scaled) sigma_i. + * + * When computing the group index, we do (len<i_wsum) +#define IWSUM ((1< 0; +} + +/* Round a precise timestamp to its slotted value. */ +static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift) +{ + return ts & ~((1ULL << shift) - 1); +} + +/* return the pointer to the group with lowest index in the bitmap */ +static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, + unsigned long bitmap) +{ + int index = ffs(bitmap) - 1; // zero-based + return &q->groups[index]; +} + +/* + * Calculate a flow index, given its weight and maximum packet length. + * index = log_2(maxlen/weight) but we need to apply the scaling. + * This is used only once at flow creation. 
*/
static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen)
{
	uint64_t slot_size = (uint64_t)maxlen *inv_w;
	unsigned long size_map;
	int index = 0;

	size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT);
	/* slot_size below 2^QFQ_MIN_SLOT_SHIFT: keep index 0 */
	if (!size_map)
		goto out;

	index = __fls(size_map) + 1; // basically a log_2()
	/* subtract one when slot_size is exactly 2^(index+QFQ_MIN_SLOT_SHIFT-1) */
	index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)));

	if (index < 0)
		index = 0;

out:
	ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index);
	return index;
}
/*---- end support functions ----*/

/*-------- API calls --------------------------------*/
/*
 * Validate and copy parameters from flowset.
 *
 * par[0] is used as the weight and par[1] as the maximum packet
 * length.  A weight of 0 or above QFQ_MAX_WEIGHT is replaced by 1.
 * Returns EINVAL when adding the (approximated) weight would push the
 * instance's wsum above QFQ_MAX_WSUM, 0 otherwise.
 */
static int
qfq_new_queue(struct dn_queue *_q)
{
	struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
	struct qfq_class *cl = (struct qfq_class *)_q;
	int i;
	uint32_t w;	/* approximated weight */

	/* import parameters from the flowset. They should be correct
	 * already.
	 */
	w = _q->fs->fs.par[0];
	cl->lmax = _q->fs->fs.par[1];
	if (!w || w > QFQ_MAX_WEIGHT) {
		w = 1;
		D("rounding weight to 1");
	}
	cl->inv_w = ONE_FP/w;
	/* recompute w from the rounded inverse, so that the amount added
	 * here matches what qfq_free_queue() subtracts later
	 */
	w = ONE_FP/cl->inv_w;
	if (q->wsum + w > QFQ_MAX_WSUM)
		return EINVAL;

	i = qfq_calc_index(cl->inv_w, cl->lmax);
	cl->grp = &q->groups[i];
	q->wsum += w;
	// XXX cl->S = q->V; ?
	// XXX compute q->i_wsum
	return 0;
}

/*
 * remove an empty queue: give its weight back to the instance's wsum.
 * The 'safe' argument is not used here.  Always returns 0.
 */
static int
qfq_free_queue(struct dn_queue *_q, int safe)
{
	struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
	struct qfq_class *cl = (struct qfq_class *)_q;
	if (cl->inv_w) {
		q->wsum -= ONE_FP/cl->inv_w;
		cl->inv_w = 0; /* reset weight to avoid run twice */
	}
	return 0;
}

/* Calculate a mask to mimic what would be ffs_from().
*/
/* Returns bitmap with bits 0 .. from-1 cleared. */
static inline unsigned long
mask_from(unsigned long bitmap, int from)
{
	return bitmap & ~((1UL << from) - 1);
}

/*
 * The state computation relies on ER=0, IR=1, EB=2, IB=3
 * First compute eligibility comparing grp->S, q->V,
 * then check if someone is blocking us and possibly add EB
 *
 * The returned value is used by callers as an index into q->bitmaps[].
 */
static inline unsigned int
qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp)
{
	/* if S > V we are not eligible */
	unsigned int state = qfq_gt(grp->S, q->V);
	/* groups in ER whose index is >= grp->index */
	unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
	struct qfq_group *next;

	if (mask) {
		/* lowest-index such group; it blocks us if it finishes earlier */
		next = qfq_ffs(q, mask);
		if (qfq_gt(grp->F, next->F))
			state |= EB;
	}

	return state;
}

/*
 * In principle
 *	q->bitmaps[dst] |= q->bitmaps[src] & mask;
 *	q->bitmaps[src] &= ~mask;
 * but we should make sure that src != dst
 */
static inline void
qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst)
{
	q->bitmaps[dst] |= q->bitmaps[src] & mask;
	q->bitmaps[src] &= ~mask;
}

/*
 * Unblock the groups with index below 'index': move them from EB to ER
 * and from IB to IR.  Nothing is done when ER contains a group with a
 * higher index whose finish time is not greater than old_finish.
 */
static inline void
qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish)
{
	unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
	struct qfq_group *next;

	if (mask) {
		next = qfq_ffs(q, mask);
		if (!qfq_gt(next->F, old_finish))
			return;
	}

	/* all groups with index < 'index' */
	mask = (1UL << index) - 1;
	qfq_move_groups(q, mask, EB, ER);
	qfq_move_groups(q, mask, IB, IR);
}

/*
 * perhaps
 *
	old_V ^= q->V;
	old_V >>= QFQ_MIN_SLOT_SHIFT;
	if (old_V) {
		...
	}
 *
 */
/*
 * Called after V moved from old_V to q->V.  When the two values fall
 * in different slots (after dropping QFQ_MIN_SLOT_SHIFT bits), groups
 * are moved from IR to ER and from IB to EB.
 */
static inline void
qfq_make_eligible(struct qfq_sched *q, uint64_t old_V)
{
	unsigned long mask, vslot, old_vslot;

	vslot = q->V >> QFQ_MIN_SLOT_SHIFT;
	old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT;

	if (vslot != old_vslot) {
		/* every group index up to the result of __fls() on the
		 * changed bits (presumably the highest changed bit)
		 */
		mask = (2UL << (__fls(vslot ^ old_vslot))) - 1;
		qfq_move_groups(q, mask, IR, ER);
		qfq_move_groups(q, mask, IB, EB);
	}
}

/*
 * XXX we should make sure that slot becomes less than 32.
 * This is guaranteed by the input values.
* roundedS is always cl->S rounded on grp->slot_shift bits.
 */
static inline void
qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS)
{
	/* distance, in slots, from the group's start time */
	uint64_t slot = (roundedS - grp->S) >> grp->slot_shift;
	/* position in the circular slots[] array, relative to front */
	unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS;

	/* push cl at the head of the list hanging off slots[i] */
	cl->next = grp->slots[i];
	grp->slots[i] = cl;
	__set_bit(slot, &grp->full_slots);
}

/*
 * remove the entry from the slot
 *
 * Unlinks the first class of the front slot; bit 0 of full_slots is
 * cleared when that slot becomes empty.
 */
static inline void
qfq_front_slot_remove(struct qfq_group *grp)
{
	struct qfq_class **h = &grp->slots[grp->front];

	*h = (*h)->next;
	if (!*h)
		__clear_bit(0, &grp->full_slots);
}

/*
 * Returns the first full queue in a group. As a side effect,
 * adjust the bucket list so the first non-empty bucket is at
 * position 0 in full_slots.
 * Returns NULL when no slot is marked as full.
 */
static inline struct qfq_class *
qfq_slot_scan(struct qfq_group *grp)
{
	int i;

	ND("grp %d full %x", grp->index, grp->full_slots);
	if (!grp->full_slots)
		return NULL;

	i = ffs(grp->full_slots) - 1; // zero-based
	if (i > 0) {
		/* advance front to the first full slot and realign the mask */
		grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
		grp->full_slots >>= i;
	}

	return grp->slots[grp->front];
}

/*
 * adjust the bucket list. When the start time of a group decreases,
 * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
 * move the objects. The mask of occupied slots must be shifted
 * because we use ffs() to find the first non-empty slot.
 * This covers decreases in the group's start time, but what about
 * increases of the start time ?
* Here too we should make sure that i is less than 32
 */
static inline void
qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS)
{
	/* number of slots by which the group's start time moves back */
	unsigned int i = (grp->S - roundedS) >> grp->slot_shift;

	grp->full_slots <<= i;
	/* unsigned wrap-around followed by the modulo; NOTE(review): this
	 * is only a correct circular decrement if QFQ_MAX_SLOTS divides
	 * 2^32 (e.g. is a power of two) -- confirm against its definition
	 */
	grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
}


/*
 * If some group is ineligible (in IR or IB): when ER is empty, first
 * raise V to the start time of the lowest-index ineligible group (if
 * that is greater than V), then let qfq_make_eligible() move groups
 * according to the change from old_V to the current V.
 */
static inline void
qfq_update_eligible(struct qfq_sched *q, uint64_t old_V)
{
	bitmap ineligible;

	ineligible = q->bitmaps[IR] | q->bitmaps[IB];
	if (ineligible) {
		if (!q->bitmaps[ER]) {
			struct qfq_group *grp;
			grp = qfq_ffs(q, ineligible);
			if (qfq_gt(grp->S, q->V))
				q->V = grp->S;
		}
		qfq_make_eligible(q, old_V);
	}
}

/*
 * Updates the class, returns true if also the group needs to be updated.
 *
 * The class start time becomes its previous finish time.  An empty
 * class is removed from the front slot; a backlogged one gets a new
 * finish time and, if its rounded start differs from grp->S, is moved
 * to the matching slot.
 */
static inline int
qfq_update_class(struct qfq_sched *q, struct qfq_group *grp,
	struct qfq_class *cl)
{

	cl->S = cl->F;
	if (cl->_q.mq.head == NULL) {
		qfq_front_slot_remove(grp);
	} else {
		unsigned int len;
		uint64_t roundedS;

		len = cl->_q.mq.head->m_pkthdr.len;
		cl->F = cl->S + (uint64_t)len * cl->inv_w;
		roundedS = qfq_round_down(cl->S, grp->slot_shift);
		/* still in the front slot: the group is unchanged */
		if (roundedS == grp->S)
			return 0;

		qfq_front_slot_remove(grp);
		qfq_slot_insert(grp, cl, roundedS);
	}
	return 1;
}

/*
 * Dequeue one packet from the first class in the front slot of the
 * lowest-index group in ER.  Returns NULL when ER is empty or when
 * dn_dequeue() returns no packet.
 */
static struct mbuf *
qfq_dequeue(struct dn_sch_inst *si)
{
	struct qfq_sched *q = (struct qfq_sched *)(si + 1);
	struct qfq_group *grp;
	struct qfq_class *cl;
	struct mbuf *m;
	uint64_t old_V;

	NO(q->loops++;)
	if (!q->bitmaps[ER]) {
		NO(if (q->queued)
			dump_sched(q, "start dequeue");)
		return NULL;
	}

	grp = qfq_ffs(q, q->bitmaps[ER]);

	cl = grp->slots[grp->front];
	/* extract from the first bucket in the bucket list */
	m = dn_dequeue(&cl->_q);

	if (!m) {
		D("BUG/* non-workconserving leaf */");
		return NULL;
	}
	NO(q->queued--;)
	/* advance the virtual time by the packet length scaled by IWSUM */
	old_V = q->V;
	q->V += (uint64_t)m->m_pkthdr.len * IWSUM;
	ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V);

	if (qfq_update_class(q, grp, cl)) {
		uint64_t old_F = grp->F;
		cl =
qfq_slot_scan(grp);
		if (!cl) { /* group gone, remove from ER */
			__clear_bit(grp->index, &q->bitmaps[ER]);
			// grp->S = grp->F + 1; // XXX debugging only
		} else {
			uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift);
			unsigned int s;

			/* group start time unchanged: nothing to unblock */
			if (grp->S == roundedS)
				goto skip_unblock;
			grp->S = roundedS;
			/* finish time is the start plus two slot widths */
			grp->F = roundedS + (2ULL << grp->slot_shift);
			/* remove from ER and put in the new set */
			__clear_bit(grp->index, &q->bitmaps[ER]);
			s = qfq_calc_state(q, grp);
			__set_bit(grp->index, &q->bitmaps[s]);
		}
		/* we need to unblock even if the group has gone away */
		qfq_unblock_groups(q, grp->index, old_F);
	}

skip_unblock:
	/* V has advanced: see whether ineligible groups became eligible */
	qfq_update_eligible(q, old_V);
	NO(if (!q->bitmaps[ER] && q->queued)
		dump_sched(q, "end dequeue");)

	return m;
}

/*
 * Assign a reasonable start time for a new flow k in group i.
 * Admissible values for \hat(F) are multiples of \sigma_i
 * no greater than V+\sigma_i . Larger values mean that
 * we had a wraparound so we consider the timestamp to be stale.
 *
 * If F is not stale and F >= V then we set S = F.
 * Otherwise we should assign S = V, but this may violate
 * the ordering in ER. So, if we have groups in ER, set S to
 * the F_j of the first group j which would be blocking us.
 * We are guaranteed not to move S backward because
 * otherwise our group i would still be blocked.
+ */ +static inline void +qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) +{ + unsigned long mask; + uint32_t limit, roundedF; + int slot_shift = cl->grp->slot_shift; + + roundedF = qfq_round_down(cl->F, slot_shift); + limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift); + + if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { + /* timestamp was stale */ + mask = mask_from(q->bitmaps[ER], cl->grp->index); + if (mask) { + struct qfq_group *next = qfq_ffs(q, mask); + if (qfq_gt(roundedF, next->F)) { + cl->S = next->F; + return; + } + } + cl->S = q->V; + } else { /* timestamp is not stale */ + cl->S = cl->F; + } +} + +static int +qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m) +{ + struct qfq_sched *q = (struct qfq_sched *)(si + 1); + struct qfq_group *grp; + struct qfq_class *cl = (struct qfq_class *)_q; + uint64_t roundedS; + int s; + + NO(q->loops++;) + DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len, + _q, cl->inv_w, cl->grp->index); + /* XXX verify that the packet obeys the parameters */ + if (m != _q->mq.head) { + if (dn_enqueue(_q, m, 0)) /* packet was dropped */ + return 1; + NO(q->queued++;) + if (m != _q->mq.head) + return 0; + } + /* If reach this point, queue q was idle */ + grp = cl->grp; + qfq_update_start(q, cl); /* adjust start time */ + /* compute new finish time and rounded start. */ + cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w; + roundedS = qfq_round_down(cl->S, grp->slot_shift); + + /* + * insert cl in the correct bucket. + * If cl->S >= grp->S we don't need to adjust the + * bucket list and simply go to the insertion phase. + * Otherwise grp->S is decreasing, we must make room + * in the bucket list, and also recompute the group state. + * Finally, if there were no flows in this group and nobody + * was in ER make sure to adjust V. 
*/
	if (grp->full_slots) {
		if (!qfq_gt(grp->S, cl->S))
			goto skip_update;
		/* create a slot for this cl->S */
		qfq_slot_rotate(q, grp, roundedS);
		/* group was surely ineligible, remove */
		__clear_bit(grp->index, &q->bitmaps[IR]);
		__clear_bit(grp->index, &q->bitmaps[IB]);
	} else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V))
		q->V = roundedS;

	/* new group timestamps, then file the group under its new state */
	grp->S = roundedS;
	grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i
	s = qfq_calc_state(q, grp);
	__set_bit(grp->index, &q->bitmaps[s]);
	ND("new state %d 0x%x", s, q->bitmaps[s]);
	ND("S %llx F %llx V %llx", cl->S, cl->F, q->V);
skip_update:
	qfq_slot_insert(grp, cl, roundedS);

	return 0;
}


/* The code below, up to the matching #endif, is compiled out. */
#if 0
static inline void
qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
	struct qfq_class *cl, struct qfq_class **pprev)
{
	unsigned int i, offset;
	uint64_t roundedS;

	roundedS = qfq_round_down(cl->S, grp->slot_shift);
	offset = (roundedS - grp->S) >> grp->slot_shift;
	i = (grp->front + offset) % QFQ_MAX_SLOTS;

#ifdef notyet
	if (!pprev) {
		pprev = &grp->slots[i];
		while (*pprev && *pprev != cl)
			pprev = &(*pprev)->next;
	}
#endif

	*pprev = cl->next;
	if (!grp->slots[i])
		__clear_bit(offset, &grp->full_slots);
}

/*
 * called to forcibly destroy a queue.
 * If the queue is not in the front bucket, or if it has
 * other queues in the front bucket, we can simply remove
 * the queue with no other side effects.
 * Otherwise we must propagate the event up.
 * XXX description to be completed.
 */
static void
qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl,
	struct qfq_class **pprev)
{
	struct qfq_group *grp = &q->groups[cl->index];
	unsigned long mask;
	uint64_t roundedS;
	int s;

	cl->F = cl->S; // not needed if the class goes away.
	qfq_slot_remove(q, grp, cl, pprev);

	if (!grp->full_slots) {
		/* nothing left in the group, remove from all sets.
* Do ER last because if we were blocking other groups
		 * we must unblock them.
		 */
		__clear_bit(grp->index, &q->bitmaps[IR]);
		__clear_bit(grp->index, &q->bitmaps[EB]);
		__clear_bit(grp->index, &q->bitmaps[IB]);

		if (test_bit(grp->index, &q->bitmaps[ER]) &&
		    !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
			mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
			if (mask)
				mask = ~((1UL << __fls(mask)) - 1);
			else
				mask = ~0UL;
			qfq_move_groups(q, mask, EB, ER);
			qfq_move_groups(q, mask, IB, IR);
		}
		__clear_bit(grp->index, &q->bitmaps[ER]);
	} else if (!grp->slots[grp->front]) {
		cl = qfq_slot_scan(grp);
		roundedS = qfq_round_down(cl->S, grp->slot_shift);
		if (grp->S != roundedS) {
			__clear_bit(grp->index, &q->bitmaps[ER]);
			__clear_bit(grp->index, &q->bitmaps[IR]);
			__clear_bit(grp->index, &q->bitmaps[EB]);
			__clear_bit(grp->index, &q->bitmaps[IB]);
			grp->S = roundedS;
			grp->F = roundedS + (2ULL << grp->slot_shift);
			s = qfq_calc_state(q, grp);
			__set_bit(grp->index, &q->bitmaps[s]);
		}
	}
	qfq_update_eligible(q, q->V);
}
#endif

/*
 * Bound the flowset parameters: par[0] is the weight, par[1] the
 * maximum packet length.
 * NOTE(review): the numeric arguments of ipdn_bound_var() are
 * presumably (default, min, max) -- confirm against its definition.
 */
static int
qfq_new_fsk(struct dn_fsk *f)
{
	ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight");
	ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen");
	ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]);
	return 0;
}

/*
 * initialize a new scheduler instance
 *
 * Each group gets its own index and a slot_shift that grows by one
 * with the index; the last group (QFQ_MAX_INDEX) uses
 * QFQ_MTU_SHIFT + FRAC_BITS.
 */
static int
qfq_new_sched(struct dn_sch_inst *si)
{
	struct qfq_sched *q = (struct qfq_sched *)(si + 1);
	struct qfq_group *grp;
	int i;

	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
		grp = &q->groups[i];
		grp->index = i;
		grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS -
					(QFQ_MAX_INDEX - i);
	}
	return 0;
}

/*
 * QFQ scheduler descriptor
 */
static struct dn_alg qfq_desc = {
	_SI( .type = ) DN_SCHED_QFQ,
	_SI( .name = ) "QFQ",
	_SI( .flags = ) DN_MULTIQUEUE,

	_SI( .schk_datalen = ) 0,
	_SI( .si_datalen = ) sizeof(struct qfq_sched),
	_SI( .q_datalen = ) sizeof(struct qfq_class) -
sizeof(struct dn_queue), + + _SI( .enqueue = ) qfq_enqueue, + _SI( .dequeue = ) qfq_dequeue, + + _SI( .config = ) NULL, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) qfq_new_sched, + _SI( .free_sched = ) NULL, + _SI( .new_fsk = ) qfq_new_fsk, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) qfq_new_queue, + _SI( .free_queue = ) qfq_free_queue, +}; + +DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc); + +#ifdef QFQ_DEBUG +static void +dump_groups(struct qfq_sched *q, uint32_t mask) +{ + int i, j; + + for (i = 0; i < QFQ_MAX_INDEX + 1; i++) { + struct qfq_group *g = &q->groups[i]; + + if (0 == (mask & (1<slots[j]) + D(" bucket %d %p", j, g->slots[j]); + } + D("full_slots 0x%x", g->full_slots); + D(" %2d S 0x%20llx F 0x%llx %c", i, + g->S, g->F, + mask & (1<loops, q->queued, q->V); + D(" ER 0x%08x", q->bitmaps[ER]); + D(" EB 0x%08x", q->bitmaps[EB]); + D(" IR 0x%08x", q->bitmaps[IR]); + D(" IB 0x%08x", q->bitmaps[IB]); + dump_groups(q, 0xffffffff); +}; +#endif /* QFQ_DEBUG */ diff --git a/dummynet2/dn_sched_rr.c b/dummynet2/dn_sched_rr.c new file mode 100644 index 0000000..2b58cf0 --- /dev/null +++ b/dummynet2/dn_sched_rr.c @@ -0,0 +1,310 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: dn_sched_rr.c 6338 2010-05-26 15:06:34Z svn_panicucci $ + */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include +#include +#include +#include +#else +#include +#endif + +#define DN_SCHED_RR 3 // XXX Where? 
+ +struct rr_queue { + struct dn_queue q; /* Standard queue */ + int status; /* 1: queue is in the list */ + int credit; /* Number of bytes to transmit */ + int quantum; /* quantum * C */ + struct rr_queue *qnext; /* */ +}; + +/* struct rr_schk contains global config parameters + * and is right after dn_schk + */ +struct rr_schk { + int min_q; /* Min quantum */ + int max_q; /* Max quantum */ + int q_bytes; /* Bytes per quantum */ +}; + +/* per-instance round robin list, right after dn_sch_inst */ +struct rr_si { + struct rr_queue *head, *tail; /* Pointer to current queue */ +}; + +/* Append a queue to the rr list */ +static inline void +rr_append(struct rr_queue *q, struct rr_si *si) +{ + q->status = 1; /* mark as in-rr_list */ + q->credit = q->quantum; /* initialize credit */ + + /* append to the tail */ + if (si->head == NULL) + si->head = q; + else + si->tail->qnext = q; + si->tail = q; /* advance the tail pointer */ + q->qnext = si->head; /* make it circular */ +} + +/* Remove the head queue from circular list. */ +static inline void +rr_remove_head(struct rr_si *si) +{ + if (si->head == NULL) + return; /* empty queue */ + si->head->status = 0; + + if (si->head == si->tail) { + si->head = si->tail = NULL; + return; + } + + si->head = si->head->qnext; + si->tail->qnext = si->head; +} + +/* Remove a queue from circular list. 
+ * XXX see if ti can be merge with remove_queue() + */ +static inline void +remove_queue_q(struct rr_queue *q, struct rr_si *si) +{ + struct rr_queue *prev; + + if (q->status != 1) + return; + if (q == si->head) { + rr_remove_head(si); + return; + } + + for (prev = si->head; prev; prev = prev->qnext) { + if (prev->qnext != q) + continue; + prev->qnext = q->qnext; + if (q == si->tail) + si->tail = prev; + q->status = 0; + break; + } +} + + +static inline void +next_pointer(struct rr_si *si) +{ + if (si->head == NULL) + return; /* empty queue */ + + si->head = si->head->qnext; + si->tail = si->tail->qnext; +} + +static int +rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) +{ + struct rr_si *si; + struct rr_queue *rrq; + + if (m != q->mq.head) { + if (dn_enqueue(q, m, 0)) /* packet was dropped */ + return 1; + if (m != q->mq.head) + return 0; + } + + /* If reach this point, queue q was idle */ + si = (struct rr_si *)(_si + 1); + rrq = (struct rr_queue *)q; + + if (rrq->status == 1) /* Queue is already in the queue list */ + return 0; + + /* Insert the queue in the queue list */ + rr_append(rrq, si); + + return 0; +} + +static struct mbuf * +rr_dequeue(struct dn_sch_inst *_si) +{ + /* Access scheduler instance private data */ + struct rr_si *si = (struct rr_si *)(_si + 1); + struct rr_queue *rrq; + uint64_t len; + + while ( (rrq = si->head) ) { + struct mbuf *m = rrq->q.mq.head; + if ( m == NULL) { + /* empty queue, remove from list */ + rr_remove_head(si); + continue; + } + len = m->m_pkthdr.len; + + if (len > rrq->credit) { + /* Packet too big */ + rrq->credit += rrq->quantum; + /* Try next queue */ + next_pointer(si); + } else { + rrq->credit -= len; + return dn_dequeue(&rrq->q); + } + } + + /* no packet to dequeue*/ + return NULL; +} + +static int +rr_config(struct dn_schk *_schk) +{ + struct rr_schk *schk = (struct rr_schk *)(_schk + 1); + ND("called"); + + /* use reasonable quantums (64..2k bytes, default 1500) */ + schk->min_q = 64; + 
schk->max_q = 2048; + schk->q_bytes = 1500; /* quantum */ + + return 0; +} + +static int +rr_new_sched(struct dn_sch_inst *_si) +{ + struct rr_si *si = (struct rr_si *)(_si + 1); + + ND("called"); + si->head = si->tail = NULL; + + return 0; +} + +static int +rr_free_sched(struct dn_sch_inst *_si) +{ + ND("called"); + /* Nothing to do? */ + return 0; +} + +static int +rr_new_fsk(struct dn_fsk *fs) +{ + struct rr_schk *schk = (struct rr_schk *)(fs->sched + 1); + /* par[0] is the weight, par[1] is the quantum step */ + ipdn_bound_var(&fs->fs.par[0], 1, + 1, 65536, "RR weight"); + ipdn_bound_var(&fs->fs.par[1], schk->q_bytes, + schk->min_q, schk->max_q, "RR quantum"); + return 0; +} + +static int +rr_new_queue(struct dn_queue *_q) +{ + struct rr_queue *q = (struct rr_queue *)_q; + + _q->ni.oid.subtype = DN_SCHED_RR; + + q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1]; + ND("called, q->quantum %d", q->quantum); + q->credit = q->quantum; + q->status = 0; + + if (_q->mq.head != NULL) { + /* Queue NOT empty, insert in the queue list */ + rr_append(q, (struct rr_si *)(_q->_si + 1)); + } + return 0; +} + +static int +rr_free_queue(struct dn_queue *_q, int safe) +{ + struct rr_queue *q = (struct rr_queue *)_q; + + ND("called"); + if (safe) /* Delete only if status == 0 */ + return q->status; + + if (q->status == 1) { + struct rr_si *si = (struct rr_si *)(_q->_si + 1); + remove_queue_q(q, si); + } + return 0; +} + +/* + * RR scheduler descriptor + * contains the type of the scheduler, the name, the size of the + * structures and function pointers. 
+ */ +static struct dn_alg rr_desc = { + _SI( .type = ) DN_SCHED_RR, + _SI( .name = ) "RR", + _SI( .flags = ) DN_MULTIQUEUE, + + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct rr_si), + _SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue), + + _SI( .enqueue = ) rr_enqueue, + _SI( .dequeue = ) rr_dequeue, + + _SI( .config = ) rr_config, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) rr_new_sched, + _SI( .free_sched = ) rr_free_sched, + _SI( .new_fsk = ) rr_new_fsk, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) rr_new_queue, + _SI( .free_queue = ) rr_free_queue, +}; + + +DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc); diff --git a/dummynet2/dn_sched_wf2q.c b/dummynet2/dn_sched_wf2q.c new file mode 100644 index 0000000..c42969e --- /dev/null +++ b/dummynet2/dn_sched_wf2q.c @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: dn_sched_wf2q.c 6338 2010-05-26 15:06:34Z svn_panicucci $ + */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include +#include +#include +#include +#else +#include +#endif + +#ifndef MAX64 +#define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x) +#endif + +/* + * timestamps are computed on 64 bit using fixed point arithmetic. + * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len + * and sum of weights, respectively. FRAC_BITS is the number of + * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large + * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w + * using an unsigned 32-bit division, and to avoid wraparounds we need + * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64 + * As an example + * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19 + */ +#ifndef FRAC_BITS +#define FRAC_BITS 28 /* shift for fixed point arithmetic */ +#define ONE_FP (1UL << FRAC_BITS) +#endif + +/* + * Private information for the scheduler instance: + * sch_heap (key is Finish time) returns the next queue to serve + * ne_heap (key is Start time) stores not-eligible queues + * idle_heap (key=start/finish time) stores idle flows. It must + * support extract-from-middle. + * A flow is only in 1 of the three heaps. + * XXX todo: use a more efficient data structure, e.g. 
a tree sorted
 * by F with min_subtree(S) in each node
 */
struct wf2qp_si {
	struct dn_heap sch_heap;	/* top extract - key Finish time */
	struct dn_heap ne_heap;	/* top extract - key Start time */
	struct dn_heap idle_heap;	/* random extract - key Start=Finish time */
	uint64_t V;			/* virtual time */
	uint32_t inv_wsum;		/* inverse of sum of weights */
	uint32_t wsum;			/* sum of weights */
};

/* Per-flow state: the generic queue followed by the WF2Q+ fields. */
struct wf2qp_queue {
	struct dn_queue _q;
	uint64_t S, F;		/* start time, finish time */
	uint32_t inv_w;		/* ONE_FP / weight */
	int32_t heap_pos;	/* position (index) of struct in heap */
};

/*
 * This file implements a WF2Q+ scheduler as it has been in dummynet
 * since 2000.
 * The scheduler supports per-flow queues and has O(log N) complexity.
 *
 * WF2Q+ needs to drain entries from the idle heap so that we
 * can keep the sum of weights up to date. We can do it whenever
 * we get a chance, or periodically, or following some other
 * strategy. The function idle_check() drains at most N elements
 * from the idle heap.
 *
 * Parameters of idle_check():
 *	si	scheduler instance whose idle heap is drained
 *	n	maximum number of entries to extract
 *	force	when non-zero, extract entries regardless of their key;
 *		otherwise only while the top key is before si->V
 */
static void
idle_check(struct wf2qp_si *si, int n, int force)
{
	struct dn_heap *h = &si->idle_heap;
	while (n-- > 0 && h->elements > 0 &&
		(force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) {
		struct dn_queue *q = HEAP_TOP(h)->object;
		struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;

		heap_extract(h, NULL);
		/* XXX to let the flowset delete the queue we should
		 * mark it as 'unused' by the scheduler.
		 */
		alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid.
*/ + si->wsum -= q->fs->fs.par[0]; /* adjust sum of weights */ + if (si->wsum > 0) + si->inv_wsum = ONE_FP/si->wsum; + } +} + +/* + * Enqueue packet m on queue q of this scheduler instance. + * Returns 1 if dn_enqueue() reports that the packet was dropped, + * 0 otherwise. + * If m is already the head of q (as when called from + * wf2qp_new_queue()) the packet is not enqueued again and q is + * handled as a queue that has just become backlogged: its S and F + * are computed and it is stored in ne_heap or sch_heap. + */ +static int +wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) +{ + struct dn_fsk *fs = q->fs; + struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); + struct wf2qp_queue *alg_fq; + uint64_t len = m->m_pkthdr.len; + + if (m != q->mq.head) { + if (dn_enqueue(q, m, 0)) /* packet was dropped */ + return 1; + if (m != q->mq.head) /* queue was already busy */ + return 0; + } + + /* If we reach this point, queue q was idle */ + alg_fq = (struct wf2qp_queue *)q; + + if (DN_KEY_LT(alg_fq->F, alg_fq->S)) { + /* Brand new queue (S > F marks an invalid timestamp). */ + alg_fq->S = si->V; /* init start time */ + si->wsum += fs->fs.par[0]; /* add weight of new queue. */ + si->inv_wsum = ONE_FP/si->wsum; + } else { /* if it was idle then it was in the idle heap */ + heap_extract(&si->idle_heap, q); + alg_fq->S = MAX64(alg_fq->F, si->V); /* compute new S */ + } + alg_fq->F = alg_fq->S + len * alg_fq->inv_w; + + /* if nothing is backlogged, make sure this flow is eligible */ + if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0) + si->V = MAX64(alg_fq->S, si->V); + + /* + * Look at eligibility. A flow is not eligible if S>V (when + * this happens, it means that there is some other flow already + * scheduled for the same pipe, so the sch_heap cannot be + * empty). If the flow is not eligible we just store it in the + * ne_heap. Otherwise, we store in the sch_heap. + * Note that for all flows in sch_heap (SCH), S_i <= V, + * and for all flows in ne_heap (NEH), S_i > V. + * So when we need to compute max(V, min(S_i)) forall i in + * SCH+NEH, we only need to look into NEH. + */ + if (DN_KEY_LT(si->V, alg_fq->S)) { + /* S>V means flow Not eligible. */ + if (si->sch_heap.elements == 0) + D("++ ouch!
not eligible but empty scheduler!"); + heap_insert(&si->ne_heap, alg_fq->S, q); + } else { + heap_insert(&si->sch_heap, alg_fq->F, q); + } + return 0; +} + +/* XXX invariant: sch > 0 || V >= min(S in neh) */ +static struct mbuf * +wf2qp_dequeue(struct dn_sch_inst *_si) +{ + /* Access scheduler instance private data */ + struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); + struct mbuf *m; + struct dn_queue *q; + struct dn_heap *sch = &si->sch_heap; + struct dn_heap *neh = &si->ne_heap; + struct wf2qp_queue *alg_fq; + + if (sch->elements == 0 && neh->elements == 0) { + /* we have nothing to do. We could kill the idle heap + * altogether and reset V + */ + idle_check(si, 0x7fffffff, 1); + si->V = 0; + si->wsum = 0; /* should be set already */ + return NULL; /* quick return if nothing to do */ + } + idle_check(si, 1, 0); /* drain something from the idle heap */ + + /* make sure at least one element is eligible, bumping V + * and moving entries that have become eligible. + * We need to repeat the first part twice, before and + * after extracting the candidate, or enqueue() will + * find the data structure in a wrong state. + */ + m = NULL; + for(;;) { + /* + * Compute V = max(V, min(S_i)). Remember that all elements + * in sch have by definition S_i <= V so if sch is not empty, + * V is surely the max and we must not update it. Conversely, + * if sch is empty we only need to look at neh. 
+ * We don't need to move the queues, as it will be done at the + * next enqueue + */ + if (sch->elements == 0 && neh->elements > 0) { + si->V = MAX64(si->V, HEAP_TOP(neh)->key); + } + while (neh->elements > 0 && + DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) { + q = HEAP_TOP(neh)->object; + alg_fq = (struct wf2qp_queue *)q; + heap_extract(neh, NULL); + heap_insert(sch, alg_fq->F, q); + } + if (m) /* pkt found in previous iteration */ + break; + /* ok we have at least one eligible pkt */ + q = HEAP_TOP(sch)->object; + alg_fq = (struct wf2qp_queue *)q; + m = dn_dequeue(q); + heap_extract(sch, NULL); /* Remove queue from heap. */ + si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum; + alg_fq->S = alg_fq->F; /* Update start time. */ + if (q->mq.head == 0) { /* not backlogged any more. */ + heap_insert(&si->idle_heap, alg_fq->F, q); + } else { /* Still backlogged. */ + /* Update F, store in neh or sch */ + uint64_t len = q->mq.head->m_pkthdr.len; + alg_fq->F += len * alg_fq->inv_w; + if (DN_KEY_LEQ(alg_fq->S, si->V)) { + heap_insert(sch, alg_fq->F, q); + } else { + heap_insert(neh, alg_fq->S, q); + } + } + } + return m; +} + +/* + * Set up the private data of a new scheduler instance: initialize + * the three heaps, all of them indexed through the heap_pos field + * of struct wf2qp_queue. + * Returns 0 on success. If any heap_init() fails, all three heaps + * are released with heap_free() and ENOMEM is returned. + */ +static int +wf2qp_new_sched(struct dn_sch_inst *_si) +{ + struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); + int ofs = offsetof(struct wf2qp_queue, heap_pos); + + /* all heaps support extract from middle */ + if (heap_init(&si->idle_heap, 16, ofs) || + heap_init(&si->sch_heap, 16, ofs) || + heap_init(&si->ne_heap, 16, ofs)) { + heap_free(&si->ne_heap); + heap_free(&si->sch_heap); + heap_free(&si->idle_heap); + return ENOMEM; + } + return 0; +} + +/* + * Release the three heaps of a scheduler instance. Always returns 0. + */ +static int +wf2qp_free_sched(struct dn_sch_inst *_si) +{ + struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); + + heap_free(&si->sch_heap); + heap_free(&si->ne_heap); + heap_free(&si->idle_heap); + + return 0; +} + +/* + * Called for a new flowset: pass the weight (fs.par[0]) through + * ipdn_bound_var(). Always returns 0. + * NOTE(review): the arguments 1, 1, 100 are presumably default, + * minimum and maximum weight -- confirm against ipdn_bound_var(). + */ +static int +wf2qp_new_fsk(struct dn_fsk *fs) +{ + ipdn_bound_var(&fs->fs.par[0], 1, + 1, 100, "WF2Q+ weight"); + return 0; +} + +static int +wf2qp_new_queue(struct dn_queue *_q) +{ + struct
wf2qp_queue *q = (struct wf2qp_queue *)_q; + + _q->ni.oid.subtype = DN_SCHED_WF2QP; + q->F = 0; /* not strictly necessary */ + q->S = q->F + 1; /* mark timestamp as invalid. */ + q->inv_w = ONE_FP / _q->fs->fs.par[0]; + if (_q->mq.head != NULL) { + wf2qp_enqueue(_q->_si, _q, _q->mq.head); + } + return 0; +} + +/* + * Called when the infrastructure removes a queue (e.g. flowset + * is reconfigured). Nothing to do if we did not 'own' the queue, + * otherwise remove it from the right heap and adjust the sum + * of weights. + */ +static int +wf2qp_free_queue(struct dn_queue *q, int safe) +{ + struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; + struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1); + + if (alg_fq->S >= alg_fq->F + 1) + return 0; /* nothing to do, not in any heap */ + + /* queue is in a scheduler heap */ + if (safe) /* do not delete in safe mode */ + return 1; + + si->wsum -= q->fs->fs.par[0]; + if (si->wsum > 0) + si->inv_wsum = ONE_FP/si->wsum; + + /* extract from the heap. XXX TODO we may need to adjust V + * to make sure the invariants hold. + */ + if (q->mq.head == NULL) { + heap_extract(&si->idle_heap, q); + } else if (DN_KEY_LT(si->V, alg_fq->S)) { + heap_extract(&si->ne_heap, q); + } else { + heap_extract(&si->sch_heap, q); + } + return 0; +} + +/* + * WF2Q+ scheduler descriptor + * contains the type of the scheduler, the name, the size of the + * structures and function pointers. 
+ */ +static struct dn_alg wf2qp_desc = { + _SI( .type = ) DN_SCHED_WF2QP, + _SI( .name = ) "WF2Q+", + _SI( .flags = ) DN_MULTIQUEUE, + + /* we need extra space in the si and the queue */ + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct wf2qp_si), + _SI( .q_datalen = ) sizeof(struct wf2qp_queue) - + sizeof(struct dn_queue), + + _SI( .enqueue = ) wf2qp_enqueue, + _SI( .dequeue = ) wf2qp_dequeue, + + _SI( .config = ) NULL, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) wf2qp_new_sched, + _SI( .free_sched = ) wf2qp_free_sched, + + _SI( .new_fsk = ) wf2qp_new_fsk, + _SI( .free_fsk = ) NULL, + + _SI( .new_queue = ) wf2qp_new_queue, + _SI( .free_queue = ) wf2qp_free_queue, +}; + + +DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc); diff --git a/dummynet2/in_cksum.c b/dummynet2/in_cksum.c new file mode 100644 index 0000000..8972cef --- /dev/null +++ b/dummynet2/in_cksum.c @@ -0,0 +1,150 @@ +/*- + * Copyright (c) 1988, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93 + */ + +#include +__FBSDID("$FreeBSD: src/sys/netinet/in_cksum.c,v 1.10 2007/10/07 20:44:22 silby Exp $"); + +#include +#include + +/* + * Checksum routine for Internet Protocol family headers (Portable Version). + * + * This routine is very heavily used in the network + * code and should be modified for each CPU to be as fast as possible. + */ + +#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x) +#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);} + +/* + * Compute the checksum over the first len bytes of the mbuf chain m. + * Returns the complemented sum masked to 16 bits. + */ +int +in_cksum(struct mbuf *m, int len) +{ + register u_short *w; + register int sum = 0; + register int mlen = 0; + int byte_swapped = 0; + + union { + char c[2]; + u_short s; + } s_util; + /* + * REDUCE folds 'sum' through this union. 'l' has the same type + * as 'sum' so that s[0] and s[1] overlay the whole value; with + * a wider member (e.g. a 64-bit long on a big-endian machine) + * s[] would alias only the upper, unused half. + * Assumes int is twice the size of u_short. + */ + union { + u_short s[2]; + int l; + } l_util; + + for (;m && len; m = m->m_next) { + if (m->m_len == 0) + continue; + w = mtod(m, u_short *); + if (mlen == -1) { + /* + * The first byte of this mbuf is the continuation + * of a word spanning between this mbuf and the + * last mbuf. + * + * s_util.c[0] is already saved when scanning previous + * mbuf. + */ + s_util.c[1] = *(char *)w; + sum += s_util.s; + w = (u_short *)((char *)w + 1); + mlen = m->m_len - 1; + len--; + } else + mlen = m->m_len; + if (len < mlen) + mlen = len; + len -= mlen; + /* + * Force to even boundary.
+ */ +#if defined(CONFIG_X86_64) + if ((1 & (long) w) && (mlen > 0)) { +#else + if ((1 & (int) w) && (mlen > 0)) { +#endif + REDUCE; + sum <<= 8; + s_util.c[0] = *(u_char *)w; + w = (u_short *)((char *)w + 1); + mlen--; + byte_swapped = 1; + } + /* + * Unroll the loop to make overhead from + * branches &c small. + */ + while ((mlen -= 32) >= 0) { + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; + sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; + sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11]; + sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15]; + w += 16; + } + mlen += 32; + while ((mlen -= 8) >= 0) { + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; + w += 4; + } + mlen += 8; + if (mlen == 0 && byte_swapped == 0) + continue; + REDUCE; + while ((mlen -= 2) >= 0) { + sum += *w++; + } + if (byte_swapped) { + REDUCE; + sum <<= 8; + byte_swapped = 0; + if (mlen == -1) { + s_util.c[1] = *(char *)w; + sum += s_util.s; + mlen = 0; + } else + mlen = -1; + } else if (mlen == -1) + s_util.c[0] = *(char *)w; + } + if (len) + printf("cksum: out of data\n"); + if (mlen == -1) { + /* The last mbuf has odd # of bytes. Follow the + standard (the odd byte may be shifted left by 8 bits + or not as determined by endian-ness of the machine) */ + s_util.c[1] = 0; + sum += s_util.s; + } + REDUCE; + return (~sum & 0xffff); +} diff --git a/dummynet2/include/net/if.h b/dummynet2/include/net/if.h new file mode 100644 index 0000000..1aa8e7b --- /dev/null +++ b/dummynet2/include/net/if.h @@ -0,0 +1 @@ +#include diff --git a/dummynet2/include/net/pfil.h b/dummynet2/include/net/pfil.h new file mode 100644 index 0000000..af26a79 --- /dev/null +++ b/dummynet2/include/net/pfil.h @@ -0,0 +1,121 @@ +/* $FreeBSD: src/sys/net/pfil.h,v 1.16 2007/06/08 12:43:25 gallatin Exp $ */ +/* $NetBSD: pfil.h,v 1.22 2003/06/23 12:57:08 martin Exp $ */ + +/*- + * Copyright (c) 1996 Matthew R. Green + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _NET_PFIL_H_ +#define _NET_PFIL_H_ + +#include +#include +#include +#include +#include +#include + +struct mbuf; +struct ifnet; +struct inpcb; + +/* + * The packet filter hooks are designed for anything to call them to + * possibly intercept the packet. 
+ */ +struct packet_filter_hook { + TAILQ_ENTRY(packet_filter_hook) pfil_link; + int (*pfil_func)(void *, struct mbuf **, struct ifnet *, int, + struct inpcb *); + void *pfil_arg; +}; + +#define PFIL_IN 0x00000001 +#define PFIL_OUT 0x00000002 +#define PFIL_WAITOK 0x00000004 +#define PFIL_ALL (PFIL_IN|PFIL_OUT) + +typedef TAILQ_HEAD(pfil_list, packet_filter_hook) pfil_list_t; + +#define PFIL_TYPE_AF 1 /* key is AF_* type */ +#define PFIL_TYPE_IFNET 2 /* key is ifnet pointer */ + +struct pfil_head { + pfil_list_t ph_in; + pfil_list_t ph_out; + int ph_type; + int ph_nhooks; +#if defined( __linux__ ) || defined( _WIN32 ) + rwlock_t ph_mtx; +#else + struct rmlock ph_lock; +#endif + union { + u_long phu_val; + void *phu_ptr; + } ph_un; +#define ph_af ph_un.phu_val +#define ph_ifnet ph_un.phu_ptr + LIST_ENTRY(pfil_head) ph_list; +}; + +int pfil_add_hook(int (*func)(void *, struct mbuf **, struct ifnet *, + int, struct inpcb *), void *, int, struct pfil_head *); +int pfil_remove_hook(int (*func)(void *, struct mbuf **, struct ifnet *, + int, struct inpcb *), void *, int, struct pfil_head *); +int pfil_run_hooks(struct pfil_head *, struct mbuf **, struct ifnet *, + int, struct inpcb *inp); + +int pfil_head_register(struct pfil_head *); +int pfil_head_unregister(struct pfil_head *); + +struct pfil_head *pfil_head_get(int, u_long); + +#define PFIL_HOOKED(p) ((p)->ph_nhooks > 0) +#define PFIL_LOCK_INIT(p) \ + rm_init_flags(&(p)->ph_lock, "PFil hook read/write mutex", RM_RECURSE) +#define PFIL_LOCK_DESTROY(p) rm_destroy(&(p)->ph_lock) +#define PFIL_RLOCK(p, t) rm_rlock(&(p)->ph_lock, (t)) +#define PFIL_WLOCK(p) rm_wlock(&(p)->ph_lock) +#define PFIL_RUNLOCK(p, t) rm_runlock(&(p)->ph_lock, (t)) +#define PFIL_WUNLOCK(p) rm_wunlock(&(p)->ph_lock) +#define PFIL_LIST_LOCK() mtx_lock(&pfil_global_lock) +#define PFIL_LIST_UNLOCK() mtx_unlock(&pfil_global_lock) + +static __inline struct packet_filter_hook * +pfil_hook_get(int dir, struct pfil_head *ph) +{ + + if (dir == PFIL_IN) + 
return (TAILQ_FIRST(&ph->ph_in)); + else if (dir == PFIL_OUT) + return (TAILQ_FIRST(&ph->ph_out)); + else + return (NULL); +} + +#endif /* _NET_PFIL_H_ */ diff --git a/dummynet2/include/net/radix.h b/dummynet2/include/net/radix.h new file mode 100644 index 0000000..e5b8ecc --- /dev/null +++ b/dummynet2/include/net/radix.h @@ -0,0 +1,180 @@ +/*- + * Copyright (c) 1988, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)radix.h 8.2 (Berkeley) 10/31/94 + * $FreeBSD: head/sys/net/radix.h 185747 2008-12-07 21:15:43Z kmacy $ + */ + +#ifndef _RADIX_H_ +#define _RADIX_H_ + +#ifdef _KERNEL +#include +#include +#include +#endif + +#ifdef MALLOC_DECLARE +MALLOC_DECLARE(M_RTABLE); +#endif + +/* + * Radix search tree node layout. + */ + +struct radix_node { + struct radix_mask *rn_mklist; /* list of masks contained in subtree */ + struct radix_node *rn_parent; /* parent */ + short rn_bit; /* bit offset; -1-index(netmask) */ + char rn_bmask; /* node: mask for bit test*/ + u_char rn_flags; /* enumerated next */ +#define RNF_NORMAL 1 /* leaf contains normal route */ +#define RNF_ROOT 2 /* leaf is root leaf for tree */ +#define RNF_ACTIVE 4 /* This node is alive (for rtfree) */ + union { + struct { /* leaf only data: */ + caddr_t rn_Key; /* object of search */ + caddr_t rn_Mask; /* netmask, if present */ + struct radix_node *rn_Dupedkey; + } rn_leaf; + struct { /* node only data: */ + int rn_Off; /* where to start compare */ + struct radix_node *rn_L;/* progeny */ + struct radix_node *rn_R;/* progeny */ + } rn_node; + } rn_u; +#ifdef RN_DEBUG + int rn_info; + struct radix_node *rn_twin; + struct radix_node *rn_ybro; +#endif +}; + +#define rn_dupedkey rn_u.rn_leaf.rn_Dupedkey +#define rn_key rn_u.rn_leaf.rn_Key +#define rn_mask rn_u.rn_leaf.rn_Mask +#define rn_offset rn_u.rn_node.rn_Off +#define rn_left rn_u.rn_node.rn_L +#define rn_right rn_u.rn_node.rn_R + +/* + * Annotations to tree concerning potential routes applying to subtrees. + */ + +struct radix_mask { + short rm_bit; /* bit offset; -1-index(netmask) */ + char rm_unused; /* cf. rn_bmask */ + u_char rm_flags; /* cf. 
rn_flags */ + struct radix_mask *rm_mklist; /* more masks to try */ + union { + caddr_t rmu_mask; /* the mask */ + struct radix_node *rmu_leaf; /* for normal routes */ + } rm_rmu; + int rm_refs; /* # of references to this struct */ +}; + +#define rm_mask rm_rmu.rmu_mask +#define rm_leaf rm_rmu.rmu_leaf /* extra field would make 32 bytes */ + +typedef int walktree_f_t(struct radix_node *, void *); + +struct radix_node_head { + struct radix_node *rnh_treetop; + int rnh_addrsize; /* permit, but not require fixed keys */ + int rnh_pktsize; /* permit, but not require fixed keys */ + struct radix_node *(*rnh_addaddr) /* add based on sockaddr */ + (void *v, void *mask, + struct radix_node_head *head, struct radix_node nodes[]); + struct radix_node *(*rnh_addpkt) /* add based on packet hdr */ + (void *v, void *mask, + struct radix_node_head *head, struct radix_node nodes[]); + struct radix_node *(*rnh_deladdr) /* remove based on sockaddr */ + (void *v, void *mask, struct radix_node_head *head); + struct radix_node *(*rnh_delpkt) /* remove based on packet hdr */ + (void *v, void *mask, struct radix_node_head *head); + struct radix_node *(*rnh_matchaddr) /* locate based on sockaddr */ + (void *v, struct radix_node_head *head); + struct radix_node *(*rnh_lookup) /* locate based on sockaddr */ + (void *v, void *mask, struct radix_node_head *head); + struct radix_node *(*rnh_matchpkt) /* locate based on packet hdr */ + (void *v, struct radix_node_head *head); + int (*rnh_walktree) /* traverse tree */ + (struct radix_node_head *head, walktree_f_t *f, void *w); + int (*rnh_walktree_from) /* traverse tree below a */ + (struct radix_node_head *head, void *a, void *m, + walktree_f_t *f, void *w); + void (*rnh_close) /* do something when the last ref drops */ + (struct radix_node *rn, struct radix_node_head *head); + struct radix_node rnh_nodes[3]; /* empty tree for common case */ + int rnh_multipath; /* multipath capable ? 
*/ +#ifdef _KERNEL +#if defined( __linux__ ) || defined( _WIN32 ) + spinlock_t rnh_lock; +#else + struct rwlock rnh_lock; /* locks entire radix tree */ +#endif /* !__linux__ */ +#endif +}; + +#ifndef _KERNEL +#define R_Malloc(p, t, n) (p = (t) malloc((unsigned int)(n))) +#define R_Zalloc(p, t, n) (p = (t) calloc(1,(unsigned int)(n))) +#define Free(p) free((char *)p); +#else +#define R_Malloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT)) +#define R_Zalloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT | M_ZERO)) +#define Free(p) free((caddr_t)p, M_RTABLE); + +#define RADIX_NODE_HEAD_LOCK_INIT(rnh) \ + rw_init_flags(&(rnh)->rnh_lock, "radix node head", 0) +#define RADIX_NODE_HEAD_LOCK(rnh) rw_wlock(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_UNLOCK(rnh) rw_wunlock(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_RLOCK(rnh) rw_rlock(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_RUNLOCK(rnh) rw_runlock(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_LOCK_TRY_UPGRADE(rnh) rw_try_upgrade(&(rnh)->rnh_lock) + + +#define RADIX_NODE_HEAD_DESTROY(rnh) rw_destroy(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_LOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_LOCKED) +#define RADIX_NODE_HEAD_WLOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_WLOCKED) +#endif /* _KERNEL */ + +void rn_init(int); +int rn_inithead(void **, int); +int rn_detachhead(void **); +int rn_refines(void *, void *); +struct radix_node + *rn_addmask(void *, int, int), + *rn_addroute (void *, void *, struct radix_node_head *, + struct radix_node [2]), + *rn_delete(void *, void *, struct radix_node_head *), + *rn_lookup (void *v_arg, void *m_arg, + struct radix_node_head *head), + *rn_match(void *, struct radix_node_head *); + +#endif /* _RADIX_H_ */ diff --git a/dummynet2/include/netgraph/ng_ipfw.h b/dummynet2/include/netgraph/ng_ipfw.h new file mode 100644 index 0000000..55fd890 --- /dev/null +++ b/dummynet2/include/netgraph/ng_ipfw.h @@ -0,0 +1,33 @@ +/*- + * Copyright 2005, Gleb Smirnoff + * 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: src/sys/netgraph/ng_ipfw.h,v 1.2 2006/02/17 09:42:49 glebius Exp $ + */ +#ifndef __NG_IPFW_H +#define __NG_IPFW_H + +#define NG_IPFW_NODE_TYPE "ipfw" +#define NGM_IPFW_COOKIE 1105988990 +#endif /* __NG_IPFW_H */ diff --git a/dummynet2/include/netinet/ip.h b/dummynet2/include/netinet/ip.h new file mode 100644 index 0000000..c9da4d8 --- /dev/null +++ b/dummynet2/include/netinet/ip.h @@ -0,0 +1,49 @@ +#ifndef _NETINET_IP_H_ +#define _NETINET_IP_H_ + +/* + * NOTE(review): the test below relies on exactly one of __BIG_ENDIAN / + * __LITTLE_ENDIAN being defined by the build environment. Some + * userland headers define both as constants, in which case the first + * branch would always be taken -- confirm this header is only used + * where that assumption holds. + */ +#define LITTLE_ENDIAN 1234 +#define BIG_ENDIAN 4321 +#if defined(__BIG_ENDIAN) +#define BYTE_ORDER BIG_ENDIAN +//#warning we are in bigendian +#elif defined(__LITTLE_ENDIAN) +//#warning we are in littleendian +#define BYTE_ORDER LITTLE_ENDIAN +#else +#error no platform +#endif + +/* XXX endianness doesn't belong here */ +// #define LITTLE_ENDIAN 1234 +// #define BIG_ENDIAN 4321 +// #define BYTE_ORDER LITTLE_ENDIAN + +/* + * Structure of an internet header, naked of options. + */ +struct ip { +#if BYTE_ORDER == LITTLE_ENDIAN + u_char ip_hl:4, /* header length */ + ip_v:4; /* version */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_char ip_v:4, /* version */ + ip_hl:4; /* header length */ +#endif + u_char ip_tos; /* type of service */ + u_short ip_len; /* total length */ + u_short ip_id; /* identification */ + u_short ip_off; /* fragment offset field */ +#define IP_RF 0x8000 /* reserved fragment flag */ +#define IP_DF 0x4000 /* dont fragment flag */ +#define IP_MF 0x2000 /* more fragments flag */ +#define IP_OFFMASK 0x1fff /* mask for fragmenting bits */ + u_char ip_ttl; /* time to live */ + u_char ip_p; /* protocol */ + u_short ip_sum; /* checksum */ + struct in_addr ip_src,ip_dst; /* source and dest address */ +} __packed __aligned(4); + +#define IPTOS_LOWDELAY 0x10 + +#endif /* _NETINET_IP_H_ */ diff --git a/dummynet2/include/netinet/ip6.h b/dummynet2/include/netinet/ip6.h new file mode 100644 index 0000000..88b42a4 --- /dev/null +++ b/dummynet2/include/netinet/ip6.h @@ -0,0 +1,58 @@ +#ifndef _NETINET_IP6_H_
+#define _NETINET_IP6_H_ +#define IN6_ARE_ADDR_EQUAL(a, b) \ +(memcmp(&(a)->s6_addr[0], &(b)->s6_addr[0], sizeof(struct in6_addr)) == 0) + +struct ip6_hdr { + union { + struct ip6_hdrctl { + u_int32_t ip6_un1_flow; /* 20 bits of flow-ID */ + u_int16_t ip6_un1_plen; /* payload length */ + u_int8_t ip6_un1_nxt; /* next header */ + u_int8_t ip6_un1_hlim; /* hop limit */ + } ip6_un1; + u_int8_t ip6_un2_vfc; /* 4 bits version, top 4 bits class */ + } ip6_ctlun; + struct in6_addr ip6_src; /* source address */ + struct in6_addr ip6_dst; /* destination address */ +}; +#define ip6_nxt ip6_ctlun.ip6_un1.ip6_un1_nxt +#define ip6_flow ip6_ctlun.ip6_un1.ip6_un1_flow + + +struct icmp6_hdr { + u_int8_t icmp6_type; /* type field */ + u_int8_t icmp6_code; /* code field */ + u_int16_t icmp6_cksum; /* checksum field */ + union { + u_int32_t icmp6_un_data32[1]; /* type-specific field */ + u_int16_t icmp6_un_data16[2]; /* type-specific field */ + u_int8_t icmp6_un_data8[4]; /* type-specific field */ + } icmp6_dataun; +}; + +struct ip6_hbh { + u_int8_t ip6h_nxt; /* next header */ + u_int8_t ip6h_len; /* length in units of 8 octets */ + /* followed by options */ +}; +struct ip6_rthdr { + u_int8_t ip6r_nxt; /* next header */ + u_int8_t ip6r_len; /* length in units of 8 octets */ + u_int8_t ip6r_type; /* routing type */ + u_int8_t ip6r_segleft; /* segments left */ + /* followed by routing type specific data */ +}; +struct ip6_frag { + u_int8_t ip6f_nxt; /* next header */ + u_int8_t ip6f_reserved; /* reserved field */ + u_int16_t ip6f_offlg; /* offset, reserved, and flag */ + u_int32_t ip6f_ident; /* identification */ +}; +#define IP6F_OFF_MASK 0xfff8 /* mask out offset from _offlg */ +#define IP6F_MORE_FRAG 0x0001 /* more-fragments flag */ +struct ip6_ext { + u_int8_t ip6e_nxt; + u_int8_t ip6e_len; +}; +#endif /* _NETINET_IP6_H_ */ diff --git a/dummynet2/include/netinet/ip_dummynet.h b/dummynet2/include/netinet/ip_dummynet.h new file mode 100644 index 0000000..961f850 --- /dev/null +++ 
b/dummynet2/include/netinet/ip_dummynet.h @@ -0,0 +1,261 @@ +/*- + * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa + * Portions Copyright (c) 2000 Akamba Corp. + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: user/luigi/ipfw3-head/sys/netinet/ip_dummynet.h 203321 2010-01-31 21:39:25Z luigi $ + */ + +#ifndef _IP_DUMMYNET_H +#define _IP_DUMMYNET_H + +/* + * Definition of the kernel-userland API for dummynet. + * + * Setsockopt() and getsockopt() pass a batch of objects, each + * of them starting with a "struct dn_id" which should fully identify + * the object and its relation with others in the sequence. 
+ * The first object in each request should have + * type= DN_CMD_*, id = DN_API_VERSION. + * For other objects, type and subtype specify the object, len indicates + * the total length including the header, and 'id' identifies the specific + * object. + * + * Most objects are numbered with an identifier in the range 1..65535. + * DN_MAX_ID indicates the first value outside the range. + */ + +#define DN_API_VERSION 12500000 +#define DN_MAX_ID 0x10000 + +struct dn_id { + uint16_t len; /* total obj len including this header */ + uint8_t type; + uint8_t subtype; + uint32_t id; /* generic id */ +}; + +/* + * These values are in the type field of struct dn_id. + * To preserve the ABI, never rearrange the list or delete + * entries with the exception of DN_LAST + */ +enum { + DN_NONE = 0, + DN_LINK = 1, + DN_FS, + DN_SCH, + DN_SCH_I, + DN_QUEUE, + DN_DELAY_LINE, + DN_PROFILE, + DN_FLOW, /* struct dn_flow */ + DN_TEXT, /* opaque text is the object */ + + DN_CMD_CONFIG = 0x80, /* objects follow */ + DN_CMD_DELETE, /* subtype + list of entries */ + DN_CMD_GET, /* subtype + list of entries */ + DN_CMD_FLUSH, + /* for compatibility with FreeBSD 7.2/8 */ + DN_COMPAT_PIPE, + DN_COMPAT_QUEUE, + DN_GET_COMPAT, + + /* special commands for emulation of sysctl variables */ + DN_SYSCTL_GET, + DN_SYSCTL_SET, + + DN_LAST, +}; + +enum { /* subtype for schedulers, flowset and the like */ + DN_SCHED_UNKNOWN = 0, + DN_SCHED_FIFO = 1, + DN_SCHED_WF2QP = 2, + /* others are in individual modules */ +}; + +enum { /* user flags */ + DN_HAVE_MASK = 0x0001, /* fs or sched has a mask */ + DN_NOERROR = 0x0002, /* do not report errors */ + DN_QHT_HASH = 0x0004, /* qht is a hash table */ + DN_QSIZE_BYTES = 0x0008, /* queue size is in bytes */ + DN_HAS_PROFILE = 0x0010, /* a link has a profile */ + DN_IS_RED = 0x0020, + DN_IS_GENTLE_RED= 0x0040, + DN_PIPE_CMD = 0x1000, /* pipe config... */ +}; + +/* + * link template. 
+ */ +struct dn_link { + struct dn_id oid; + + /* + * Userland sets bw and delay in bits/s and milliseconds. + * The kernel converts this back and forth to bits/tick and ticks. + * XXX what about burst ? + */ + int32_t link_nr; + int bandwidth; /* bit/s or bits/tick. */ + int delay; /* ms and ticks */ + uint64_t burst; /* scaled. bits*Hz XXX */ +}; + +/* + * A flowset, which is a template for flows. Contains parameters + * from the command line: id, target scheduler, queue sizes, plr, + * flow masks, buckets for the flow hash, and possibly scheduler- + * specific parameters (weight, quantum and so on). + */ +struct dn_fs { + struct dn_id oid; + uint32_t fs_nr; /* the flowset number */ + uint32_t flags; /* userland flags */ + int qsize; /* queue size in slots or bytes */ + int32_t plr; /* PLR, pkt loss rate (2^31-1 means 100%) */ + uint32_t buckets; /* buckets used for the queue hash table */ + + struct ipfw_flow_id flow_mask; + uint32_t sched_nr; /* the scheduler we attach to */ + /* generic scheduler parameters. Leave them at -1 if unset. + * Now we use 0: weight, 1: lmax, 2: priority + */ + int par[4]; + + /* RED/GRED parameters. + * weight and probabilities are in the range 0..1 represented + * in fixed point arithmetic with SCALE_RED decimal bits. + */ +#define SCALE_RED 16 +#define SCALE(x) ( (x) << SCALE_RED ) +#define SCALE_VAL(x) ( (x) >> SCALE_RED ) +#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ + +}; + +/* + * dn_flow collects flow_id and stats for queues and scheduler + * instances, and is used to pass these info to userland. + * oid.type/oid.subtype describe the object, oid.id is number + * of the parent object. 
+ */ +struct dn_flow { + struct dn_id oid; + struct ipfw_flow_id fid; + uint64_t tot_pkts; /* statistics counters */ + uint64_t tot_bytes; + uint32_t length; /* Queue length, in packets */ + uint32_t len_bytes; /* Queue length, in bytes */ + uint32_t drops; +}; + + +/* + * Scheduler template, mostly indicating the name, number, + * sched_mask and buckets. + */ +struct dn_sch { + struct dn_id oid; + uint32_t sched_nr; /* N, scheduler number */ + uint32_t buckets; /* number of buckets for the instances */ + uint32_t flags; /* have_mask, ... */ + + char name[16]; /* null terminated */ + /* mask to select the appropriate scheduler instance */ + struct ipfw_flow_id sched_mask; /* M */ +}; + + +/* A delay profile is attached to a link. + * Note that a profile, as any other object, cannot be longer than 2^16 + */ +#define ED_MAX_SAMPLES_NO 1024 +struct dn_profile { + struct dn_id oid; + /* fields to simulate a delay profile */ +#define ED_MAX_NAME_LEN 32 + char name[ED_MAX_NAME_LEN]; + int link_nr; + int loss_level; + int bandwidth; // XXX use link bandwidth? + int samples_no; /* actual len of samples[] */ + int samples[0]; /* may be shorter */ +}; + +/* + * Overall structure of dummynet + +In dummynet, packets are selected with the firewall rules, and passed +to two different objects: PIPE or QUEUE (bad name). + +A QUEUE defines a classifier, which groups packets into flows +according to a 'mask', puts them into independent queues (one +per flow) with configurable size and queue management policy, +and passes flows to a scheduler: + + (flow_mask|sched_mask) sched_mask + +---------+ weight Wx +-------------+ + | |->-[flow]-->--| |-+ + -->--| QUEUE x | ... | | | + | |->-[flow]-->--| SCHEDuler N | | + +---------+ | | | + ... | +--[LINK N]-->-- + +---------+ weight Wy | | +--[LINK N]-->-- + | |->-[flow]-->--| | | + -->--| QUEUE y | ... 
| | | + | |->-[flow]-->--| | | + +---------+ +-------------+ | + +-------------+ + +Many QUEUE objects can connect to the same scheduler, each +QUEUE object can have its own set of parameters. + +In turn, the SCHEDuler 'forks' multiple instances according +to a 'sched_mask', each instance manages its own set of queues +and transmits on a private instance of a configurable LINK. + +A PIPE is a simplified version of the above, where there +is no flow_mask, and each scheduler instance handles a single queue. + +The following data structures (visible from userland) describe +the objects used by dummynet: + + + dn_link, contains the main configuration parameters related + to delay and bandwidth; + + dn_profile describes a delay profile; + + dn_flow describes the flow status (flow id, statistics) + + + dn_sch describes a scheduler + + dn_fs describes a flowset (msk, weight, queue parameters) + + * + */ + +#endif /* _IP_DUMMYNET_H */ diff --git a/dummynet2/include/netinet/ip_fw.h b/dummynet2/include/netinet/ip_fw.h new file mode 100644 index 0000000..5e77119 --- /dev/null +++ b/dummynet2/include/netinet/ip_fw.h @@ -0,0 +1,585 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: user/luigi/ipfw3-head/sys/netinet/ip_fw.h 202072 2010-01-11 10:12:35Z luigi $ + */ + +#ifndef _IPFW2_H +#define _IPFW2_H + +/* + * The default rule number. By the design of ip_fw, the default rule + * is the last one, so its number can also serve as the highest number + * allowed for a rule. The ip_fw code relies on both meanings of this + * constant. + */ +#define IPFW_DEFAULT_RULE 65535 + +/* + * The number of ipfw tables. The maximum allowed table number is the + * (IPFW_TABLES_MAX - 1). + */ +#define IPFW_TABLES_MAX 128 + +/* + * Most commands (queue, pipe, tag, untag, limit...) can have a 16-bit + * argument between 1 and 65534. The value 0 is unused, the value + * 65535 (IP_FW_TABLEARG) is used to represent 'tablearg', i.e. the + * result of the most recent table() lookup. So the argument + * can be 1..65534, or 65535 to indicate the use of a 'tablearg'. + * Note that 16bit is only a historical limit, resulting from + * the use of a 16-bit field for that value. In reality, we can have + * 2^32 pipes, queues, tag values and so on, and use 0 as a tablearg. + */ +#define IPFW_ARG_MIN 1 +#define IPFW_ARG_MAX 65534 +#define IP_FW_TABLEARG 65535 /* XXX should use 0 */ + +/* + * The kernel representation of ipfw rules is made of a list of + * 'instructions' (for all practical purposes equivalent to BPF + * instructions), which specify which fields of the packet + * (or its metadata) should be analysed. 
+ * + * Each instruction is stored in a structure which begins with + * "ipfw_insn", and can contain extra fields depending on the + * instruction type (listed below). + * Note that the code is written so that individual instructions + * have a size which is a multiple of 32 bits. This means that, if + * such structures contain pointers or other 64-bit entities, + * (there is just one instance now) they may end up unaligned on + * 64-bit architectures, so they must be handled with care. + * + * "enum ipfw_opcodes" are the opcodes supported. We can have up + * to 256 different opcodes. When adding new opcodes, they should + * be appended to the end of the opcode list before O_LAST_OPCODE, + * this will prevent the ABI from being broken, otherwise users + * will have to recompile ipfw(8) when they update the kernel. + */ + +enum ipfw_opcodes { /* arguments (4 byte each) */ + O_NOP, + + O_IP_SRC, /* u32 = IP */ + O_IP_SRC_MASK, /* ip = IP/mask */ + O_IP_SRC_ME, /* none */ + O_IP_SRC_SET, /* u32=base, arg1=len, bitmap */ + + O_IP_DST, /* u32 = IP */ + O_IP_DST_MASK, /* ip = IP/mask */ + O_IP_DST_ME, /* none */ + O_IP_DST_SET, /* u32=base, arg1=len, bitmap */ + + O_IP_SRCPORT, /* (n)port list:mask 4 byte ea */ + O_IP_DSTPORT, /* (n)port list:mask 4 byte ea */ + O_PROTO, /* arg1=protocol */ + + O_MACADDR2, /* 2 mac addr:mask */ + O_MAC_TYPE, /* same as srcport */ + + O_LAYER2, /* none */ + O_IN, /* none */ + O_FRAG, /* none */ + + O_RECV, /* none */ + O_XMIT, /* none */ + O_VIA, /* none */ + + O_IPOPT, /* arg1 = 2*u8 bitmap */ + O_IPLEN, /* arg1 = len */ + O_IPID, /* arg1 = id */ + + O_IPTOS, /* arg1 = id */ + O_IPPRECEDENCE, /* arg1 = precedence << 5 */ + O_IPTTL, /* arg1 = TTL */ + + O_IPVER, /* arg1 = version */ + O_UID, /* u32 = id */ + O_GID, /* u32 = id */ + O_ESTAB, /* none (tcp established) */ + O_TCPFLAGS, /* arg1 = 2*u8 bitmap */ + O_TCPWIN, /* arg1 = desired win */ + O_TCPSEQ, /* u32 = desired seq. */ + O_TCPACK, /* u32 = desired seq. 
*/ + O_ICMPTYPE, /* u32 = icmp bitmap */ + O_TCPOPTS, /* arg1 = 2*u8 bitmap */ + + O_VERREVPATH, /* none */ + O_VERSRCREACH, /* none */ + + O_PROBE_STATE, /* none */ + O_KEEP_STATE, /* none */ + O_LIMIT, /* ipfw_insn_limit */ + O_LIMIT_PARENT, /* dyn_type, not an opcode. */ + + /* + * These are really 'actions'. + */ + + O_LOG, /* ipfw_insn_log */ + O_PROB, /* u32 = match probability */ + + O_CHECK_STATE, /* none */ + O_ACCEPT, /* none */ + O_DENY, /* none */ + O_REJECT, /* arg1=icmp arg (same as deny) */ + O_COUNT, /* none */ + O_SKIPTO, /* arg1=next rule number */ + O_PIPE, /* arg1=pipe number */ + O_QUEUE, /* arg1=queue number */ + O_DIVERT, /* arg1=port number */ + O_TEE, /* arg1=port number */ + O_FORWARD_IP, /* fwd sockaddr */ + O_FORWARD_MAC, /* fwd mac */ + O_NAT, /* nope */ + O_REASS, /* none */ + + /* + * More opcodes. + */ + O_IPSEC, /* has ipsec history */ + O_IP_SRC_LOOKUP, /* arg1=table number, u32=value */ + O_IP_DST_LOOKUP, /* arg1=table number, u32=value */ + O_ANTISPOOF, /* none */ + O_JAIL, /* u32 = id */ + O_ALTQ, /* u32 = altq classif. qid */ + O_DIVERTED, /* arg1=bitmap (1:loop, 2:out) */ + O_TCPDATALEN, /* arg1 = tcp data len */ + O_IP6_SRC, /* address without mask */ + O_IP6_SRC_ME, /* my addresses */ + O_IP6_SRC_MASK, /* address with the mask */ + O_IP6_DST, + O_IP6_DST_ME, + O_IP6_DST_MASK, + O_FLOW6ID, /* for flow id tag in the ipv6 pkt */ + O_ICMP6TYPE, /* icmp6 packet type filtering */ + O_EXT_HDR, /* filtering for ipv6 extension header */ + O_IP6, + + /* + * actions for ng_ipfw + */ + O_NETGRAPH, /* send to ng_ipfw */ + O_NGTEE, /* copy to ng_ipfw */ + + O_IP4, + + O_UNREACH6, /* arg1=icmpv6 code arg (deny) */ + + O_TAG, /* arg1=tag number */ + O_TAGGED, /* arg1=tag number */ + + O_SETFIB, /* arg1=FIB number */ + O_FIB, /* arg1=FIB desired fib number */ + + O_LAST_OPCODE /* not an opcode! */ +}; + +/* + * The extension header are filtered only for presence using a bit + * vector with a flag for each header. 
+ */ +#define EXT_FRAGMENT 0x1 +#define EXT_HOPOPTS 0x2 +#define EXT_ROUTING 0x4 +#define EXT_AH 0x8 +#define EXT_ESP 0x10 +#define EXT_DSTOPTS 0x20 +#define EXT_RTHDR0 0x40 +#define EXT_RTHDR2 0x80 + +/* + * Template for instructions. + * + * ipfw_insn is used for all instructions which require no operands, + * a single 16-bit value (arg1), or a couple of 8-bit values. + * + * For other instructions which require different/larger arguments + * we have derived structures, ipfw_insn_*. + * + * The size of the instruction (in 32-bit words) is in the low + * 6 bits of "len". The 2 remaining bits are used to implement + * NOT and OR on individual instructions. Given a type, you can + * compute the length to be put in "len" using F_INSN_SIZE(t) + * + * F_NOT negates the match result of the instruction. + * + * F_OR is used to build or blocks. By default, instructions + * are evaluated as part of a logical AND. An "or" block + * { X or Y or Z } contains F_OR set in all but the last + * instruction of the block. A match will cause the code + * to skip past the last instruction of the block. + * + * NOTA BENE: in a couple of places we assume that + * sizeof(ipfw_insn) == sizeof(u_int32_t) + * this needs to be fixed. + * + */ +typedef struct _ipfw_insn { /* template for instructions */ + u_int8_t opcode; + u_int8_t len; /* number of 32-bit words */ +#define F_NOT 0x80 +#define F_OR 0x40 +#define F_LEN_MASK 0x3f +#define F_LEN(cmd) ((cmd)->len & F_LEN_MASK) + + u_int16_t arg1; +} ipfw_insn; + +/* + * The F_INSN_SIZE(type) computes the size, in 4-byte words, of + * a given type. + */ +#define F_INSN_SIZE(t) ((sizeof (t))/sizeof(u_int32_t)) + +/* + * This is used to store an array of 16-bit entries (ports etc.) + */ +typedef struct _ipfw_insn_u16 { + ipfw_insn o; + u_int16_t ports[2]; /* there may be more */ +} ipfw_insn_u16; + +/* + * This is used to store an array of 32-bit entries + * (uid, single IPv4 addresses etc.) 
+ */ +typedef struct _ipfw_insn_u32 { + ipfw_insn o; + u_int32_t d[1]; /* one or more */ +} ipfw_insn_u32; + +/* + * This is used to store IP addr-mask pairs. + */ +typedef struct _ipfw_insn_ip { + ipfw_insn o; + struct in_addr addr; + struct in_addr mask; +} ipfw_insn_ip; + +/* + * This is used to forward to a given address (ip). + */ +typedef struct _ipfw_insn_sa { + ipfw_insn o; + struct sockaddr_in sa; +} ipfw_insn_sa; + +/* + * This is used for MAC addr-mask pairs. + */ +typedef struct _ipfw_insn_mac { + ipfw_insn o; + u_char addr[12]; /* dst[6] + src[6] */ + u_char mask[12]; /* dst[6] + src[6] */ +} ipfw_insn_mac; + +/* + * This is used for interface match rules (recv xx, xmit xx). + */ +typedef struct _ipfw_insn_if { + ipfw_insn o; + union { + struct in_addr ip; + int glob; + } p; + char name[IFNAMSIZ]; +} ipfw_insn_if; + +/* + * This is used for storing an altq queue id number. + */ +typedef struct _ipfw_insn_altq { + ipfw_insn o; + u_int32_t qid; +} ipfw_insn_altq; + +/* + * This is used for limit rules. + */ +typedef struct _ipfw_insn_limit { + ipfw_insn o; + u_int8_t _pad; + u_int8_t limit_mask; /* combination of DYN_* below */ +#define DYN_SRC_ADDR 0x1 +#define DYN_SRC_PORT 0x2 +#define DYN_DST_ADDR 0x4 +#define DYN_DST_PORT 0x8 + + u_int16_t conn_limit; +} ipfw_insn_limit; + +/* + * This is used for log instructions. + */ +typedef struct _ipfw_insn_log { + ipfw_insn o; + u_int32_t max_log; /* how many do we log -- 0 = all */ + u_int32_t log_left; /* how many left to log */ +} ipfw_insn_log; + +/* + * Data structures required by both ipfw(8) and ipfw(4) but not part of the + * management API are protected by IPFW_INTERNAL. + */ +#ifdef IPFW_INTERNAL +/* Server pool support (LSNAT). */ +struct cfg_spool { + LIST_ENTRY(cfg_spool) _next; /* chain of spool instances */ + struct in_addr addr; + u_short port; +}; +#endif + +/* Redirect modes id. 
*/ +#define REDIR_ADDR 0x01 +#define REDIR_PORT 0x02 +#define REDIR_PROTO 0x04 + +#ifdef IPFW_INTERNAL +/* Nat redirect configuration. */ +struct cfg_redir { + LIST_ENTRY(cfg_redir) _next; /* chain of redir instances */ + u_int16_t mode; /* type of redirect mode */ + struct in_addr laddr; /* local ip address */ + struct in_addr paddr; /* public ip address */ + struct in_addr raddr; /* remote ip address */ + u_short lport; /* local port */ + u_short pport; /* public port */ + u_short rport; /* remote port */ + u_short pport_cnt; /* number of public ports */ + u_short rport_cnt; /* number of remote ports */ + int proto; /* protocol: tcp/udp */ + struct alias_link **alink; + /* num of entry in spool chain */ + u_int16_t spool_cnt; + /* chain of spool instances */ + LIST_HEAD(spool_chain, cfg_spool) spool_chain; +}; +#endif + +#define NAT_BUF_LEN 1024 + +#ifdef IPFW_INTERNAL +/* Nat configuration data struct. */ +struct cfg_nat { + /* chain of nat instances */ + LIST_ENTRY(cfg_nat) _next; + int id; /* nat id */ + struct in_addr ip; /* nat ip address */ + char if_name[IF_NAMESIZE]; /* interface name */ + int mode; /* aliasing mode */ + struct libalias *lib; /* libalias instance */ + /* number of entry in spool chain */ + int redir_cnt; + /* chain of redir instances */ + LIST_HEAD(redir_chain, cfg_redir) redir_chain; +}; +#endif + +#define SOF_NAT sizeof(struct cfg_nat) +#define SOF_REDIR sizeof(struct cfg_redir) +#define SOF_SPOOL sizeof(struct cfg_spool) + +/* Nat command. 
*/ +typedef struct _ipfw_insn_nat { + ipfw_insn o; + struct cfg_nat *nat; +} ipfw_insn_nat; + +/* Apply ipv6 mask on ipv6 addr. Expands to four separate statements and evaluates addr/mask four times each: always brace it under if/else/loops. */ +#define APPLY_MASK(addr,mask) \ + (addr)->__u6_addr.__u6_addr32[0] &= (mask)->__u6_addr.__u6_addr32[0]; \ + (addr)->__u6_addr.__u6_addr32[1] &= (mask)->__u6_addr.__u6_addr32[1]; \ + (addr)->__u6_addr.__u6_addr32[2] &= (mask)->__u6_addr.__u6_addr32[2]; \ + (addr)->__u6_addr.__u6_addr32[3] &= (mask)->__u6_addr.__u6_addr32[3]; + +/* Structure for ipv6 */ +typedef struct _ipfw_insn_ip6 { + ipfw_insn o; + struct in6_addr addr6; + struct in6_addr mask6; +} ipfw_insn_ip6; + +/* Used to support icmp6 types */ +typedef struct _ipfw_insn_icmp6 { + ipfw_insn o; + uint32_t d[7]; /* XXX This number is related to the netinet/icmp6.h + * define ICMP6_MAXTYPE + * as follows: n = ICMP6_MAXTYPE/32 + 1 + * Currently it is 203 + */ +} ipfw_insn_icmp6; + +/* + * Here we have the structure representing an ipfw rule. + * + * It starts with a general area (with link fields and counters) + * followed by an array of one or more instructions, which the code + * accesses as an array of 32-bit values. + * + * Given a rule pointer r: + * + * r->cmd is the start of the first instruction. + * ACTION_PTR(r) is the start of the first action (things to do + * once a rule matched). + * + * When assembling instruction, remember the following: + * + * + if a rule has a "keep-state" (or "limit") option, then the + * first instruction (at r->cmd) MUST BE an O_PROBE_STATE + * + if a rule has a "log" option, then the first action + * (at ACTION_PTR(r)) MUST be O_LOG + * + if a rule has an "altq" option, it comes after "log" + * + if a rule has an O_TAG option, it comes after "log" and "altq" + * + * NOTE: we use a simple linked list of rules because we never need + * to delete a rule without scanning the list. We do not use + * queue(3) macros for portability and readability. 
+ */ + +struct ip_fw { +#ifdef _X64EMU + int32_t pad1; +#endif + struct ip_fw *x_next; /* linked list of rules */ +#ifdef _X64EMU + int32_t pad2; +#endif + struct ip_fw *next_rule; /* ptr to next [skipto] rule */ + /* 'next_rule' is used to pass up 'set_disable' status */ + + uint16_t act_ofs; /* offset of action in 32-bit units */ + uint16_t cmd_len; /* # of 32-bit words in cmd */ + uint16_t rulenum; /* rule number */ + uint8_t set; /* rule set (0..31) */ +#define RESVD_SET 31 /* set for default and persistent rules */ + uint8_t _pad; /* padding */ + uint32_t id; /* rule id */ + + /* These fields are present in all rules. */ + uint64_t pcnt; /* Packet counter */ + uint64_t bcnt; /* Byte counter */ + uint32_t timestamp; /* tv_sec of last match */ + + ipfw_insn cmd[1]; /* storage for commands */ +}; + +#define ACTION_PTR(rule) \ + (ipfw_insn *)( (u_int32_t *)((rule)->cmd) + ((rule)->act_ofs) ) + +#define RULESIZE(rule) (sizeof(struct ip_fw) + \ + ((struct ip_fw *)(rule))->cmd_len * 4 - 4) + +#if 1 // should be moved to in.h +/* + * This structure is used as a flow mask and a flow id for various + * parts of the code. + * addr_type is used in userland and kernel to mark the address type. + * fib is used in the kernel to record the fib in use. + * _flags is used in the kernel to store tcp flags for dynamic rules. + */ +struct ipfw_flow_id { + uint32_t dst_ip; + uint32_t src_ip; + uint16_t dst_port; + uint16_t src_port; + uint8_t fib; + uint8_t proto; + uint8_t _flags; /* protocol-specific flags */ + uint8_t addr_type; /* 4=ip4, 6=ip6, 1=ether ? */ + struct in6_addr dst_ip6; + struct in6_addr src_ip6; + uint32_t flow_id6; + uint32_t extra; /* queue/pipe or frag_id */ +}; +#endif + +#define IS_IP6_FLOW_ID(id) ((id)->addr_type == 6) + +/* + * Dynamic ipfw rule. + */ +typedef struct _ipfw_dyn_rule ipfw_dyn_rule; + +struct _ipfw_dyn_rule { + ipfw_dyn_rule *next; /* linked list of rules. 
*/ + struct ip_fw *rule; /* pointer to rule */ + /* 'rule' is used to pass up the rule number (from the parent) */ + + ipfw_dyn_rule *parent; /* pointer to parent rule */ + u_int64_t pcnt; /* packet match counter */ + u_int64_t bcnt; /* byte match counter */ + struct ipfw_flow_id id; /* (masked) flow id */ + u_int32_t expire; /* expire time */ + u_int32_t bucket; /* which bucket in hash table */ + u_int32_t state; /* state of this rule (typically a + * combination of TCP flags) + */ + u_int32_t ack_fwd; /* most recent ACKs in forward */ + u_int32_t ack_rev; /* and reverse directions (used */ + /* to generate keepalives) */ + u_int16_t dyn_type; /* rule type */ + u_int16_t count; /* refcount */ +}; + +/* + * Definitions for IP option names. + */ +#define IP_FW_IPOPT_LSRR 0x01 +#define IP_FW_IPOPT_SSRR 0x02 +#define IP_FW_IPOPT_RR 0x04 +#define IP_FW_IPOPT_TS 0x08 + +/* + * Definitions for TCP option names. + */ +#define IP_FW_TCPOPT_MSS 0x01 +#define IP_FW_TCPOPT_WINDOW 0x02 +#define IP_FW_TCPOPT_SACK 0x04 +#define IP_FW_TCPOPT_TS 0x08 +#define IP_FW_TCPOPT_CC 0x10 + +#define ICMP_REJECT_RST 0x100 /* fake ICMP code (send a TCP RST) */ +#define ICMP6_UNREACH_RST 0x100 /* fake ICMPv6 code (send a TCP RST) */ + +/* + * These are used for lookup tables. 
+ */ +typedef struct _ipfw_table_entry { + in_addr_t addr; /* network address */ + u_int32_t value; /* value */ + u_int16_t tbl; /* table number */ + u_int8_t masklen; /* mask length */ +} ipfw_table_entry; + +typedef struct _ipfw_table { + u_int32_t size; /* size of entries in bytes */ + u_int32_t cnt; /* # of entries */ + u_int16_t tbl; /* table number */ + ipfw_table_entry ent[0]; /* entries */ +} ipfw_table; + +#endif /* _IPFW2_H */ diff --git a/dummynet2/include/netinet/ip_icmp.h b/dummynet2/include/netinet/ip_icmp.h new file mode 100644 index 0000000..5c7b851 --- /dev/null +++ b/dummynet2/include/netinet/ip_icmp.h @@ -0,0 +1,17 @@ +/* + * additional define not present in linux + * should go in glue.h + */ +#ifndef _NETINET_IP_ICMP_H_ +#define _NETINET_IP_ICMP_H_ + +#define ICMP_MAXTYPE 40 /* defined as 18 in compat.h */ +#define ICMP_ROUTERSOLICIT 10 /* router solicitation */ +#define ICMP_TSTAMP 13 /* timestamp request */ +#define ICMP_IREQ 15 /* information request */ +#define ICMP_MASKREQ 17 /* address mask request */ +#define ICMP_UNREACH_HOST 1 /* bad host */ + +#define ICMP_UNREACH 3 /* dest unreachable, codes: */ + +#endif /* _NETINET_IP_ICMP_H_ */ diff --git a/dummynet2/include/netinet/ipfw/dn_heap.h b/dummynet2/include/netinet/ipfw/dn_heap.h new file mode 100644 index 0000000..09b2ac7 --- /dev/null +++ b/dummynet2/include/netinet/ipfw/dn_heap.h @@ -0,0 +1,191 @@ +/*- + * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Binary heap and hash tables, header file + * + * $FreeBSD: head/sys/netinet/ipfw/dn_heap.h 204865 2010-03-08 11:27:08Z luigi $ + */ + +#ifndef _IP_DN_HEAP_H +#define _IP_DN_HEAP_H + +#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0) +#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0) + +/* + * This module implements a binary heap supporting random extraction. + * + * A heap entry contains an uint64_t key and a pointer to object. + * DN_KEY_LT(a,b) returns true if key 'a' is smaller than 'b' + * + * The heap is a struct dn_heap plus a dynamically allocated + * array of dn_heap_entry entries. 'size' represents the size of + * the array, 'elements' count entries in use. The topmost + * element has the smallest key. + * The heap supports ordered insert, and extract from the top. 
+ * To extract an object from the middle of the heap, the object + * must reserve an 'int32_t' to store the position of the object + * in the heap itself, and the location of this field must be + * passed as an argument to heap_init() -- use -1 if the feature + * is not used. + */ +struct dn_heap_entry { + uint64_t key; /* sorting key, smallest comes first */ + void *object; /* object pointer */ +}; + +struct dn_heap { + int size; /* the size of the array */ + int elements; /* elements in use */ + int ofs; /* offset in the object of heap index */ + struct dn_heap_entry *p; /* array of "size" entries */ +}; + +enum { + HEAP_SCAN_DEL = 1, + HEAP_SCAN_END = 2, +}; + +/* + * heap_init() reinitializes the heap setting the size and the offset + * of the index for random extraction (use -1 if not used). + * The 'elements' counter is set to 0. + * + * SET_HEAP_OFS() indicates where, in the object, is stored the index + * for random extractions from the heap. + * + * heap_free() frees the memory associated to a heap. + * + * heap_insert() adds a key-pointer pair to the heap + * + * HEAP_TOP() returns a pointer to the top element of the heap, + * but makes no checks on its existence (XXX should we change ?) + * + * heap_extract() removes the entry at the top. It returns void, so + * the key and the object pointer should have been read before. + * + * heap_scan() invokes a callback on each entry of the heap. + * The callback can return a combination of HEAP_SCAN_DEL and + * HEAP_SCAN_END. HEAP_SCAN_DEL means the current element must + * be removed, and HEAP_SCAN_END means to terminate the scan. + * heap_scan() returns the number of elements removed. + * Because the order is not guaranteed, we should use heap_scan() + * only as a last resort mechanism. 
+ */ +#define HEAP_TOP(h) ((h)->p) +#define SET_HEAP_OFS(h, n) do { (h)->ofs = n; } while (0) +int heap_init(struct dn_heap *h, int size, int ofs); +int heap_insert(struct dn_heap *h, uint64_t key1, void *p); +void heap_extract(struct dn_heap *h, void *obj); +void heap_free(struct dn_heap *h); +int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t); + +/*------------------------------------------------------ + * This module implements a generic hash table with support for + * running callbacks on the entire table. To avoid allocating + * memory during hash table operations, objects must reserve + * space for a link field. XXX if the heap is moderately full, + * an SLIST suffices, and we can tolerate the cost of a hash + * computation on each removal. + * + * dn_ht_init() initializes the table, setting the number of + * buckets, the offset of the link field, the main callbacks. + * Callbacks are: + * + * hash(key, flags, arg) called to return a bucket index. + * match(obj, key, flags, arg) called to determine if key + * matches the current 'obj' in the heap + * newh(key, flags, arg) optional, used to allocate a new + * object during insertions. + * + * dn_ht_free() frees the heap or unlink elements. + * DNHT_REMOVE unlink elements, 0 frees the heap. + * You need two calls to do both. + * + * dn_ht_find() is the main lookup function, which can also be + * used to insert or delete elements in the hash table. + * The final 'arg' is passed to all callbacks. + * + * dn_ht_scan() is used to invoke a callback on all entries of + * the heap, or possibly on just one bucket. The callback + * is invoked with a pointer to the object, and must return + * one of DNHT_SCAN_DEL or DNHT_SCAN_END to request the + * removal of the object from the heap and the end of the + * scan, respectively. + * + * dn_ht_scan_bucket() is similar to dn_ht_scan(), except that it scans + * only the specific bucket of the table. 
The bucket is an in-out + * parameter and returns a valid bucket number if the original + * is invalid. + * + * A combination of flags can be used to modify the operation + * of the dn_ht_find(), and of the callbacks: + * + * DNHT_KEY_IS_OBJ means the key is the object pointer. + * It is usually of interest for the hash and match functions. + * + * DNHT_MATCH_PTR during a lookup, match pointers instead + * of calling match(). Normally used when removing specific + * entries. Does not imply KEY_IS_OBJ as the latter _is_ used + * by the match function. + * + * DNHT_INSERT insert the element if not found. + * Calls newh() to allocate a new object unless + * DNHT_KEY_IS_OBJ is set. + * + * DNHT_UNIQUE only insert if object not found. + * XXX should it imply DNHT_INSERT ? + * + * DNHT_REMOVE remove objects if we find them. + */ +struct dn_ht; /* should be opaque */ + +struct dn_ht *dn_ht_init(struct dn_ht *, int buckets, int ofs, + uint32_t (*hash)(uintptr_t, int, void *), + int (*match)(void *, uintptr_t, int, void *), + void *(*newh)(uintptr_t, int, void *)); +void dn_ht_free(struct dn_ht *, int flags); + +void *dn_ht_find(struct dn_ht *, uintptr_t, int, void *); +int dn_ht_scan(struct dn_ht *, int (*)(void *, void *), void *); +int dn_ht_scan_bucket(struct dn_ht *, int * , int (*)(void *, void *), void *); +int dn_ht_entries(struct dn_ht *); + +enum { /* flags values. 
+ * first two are returned by the scan callback to indicate + * to delete the matching element or to end the scan + */ + DNHT_SCAN_DEL = 0x0001, + DNHT_SCAN_END = 0x0002, + DNHT_KEY_IS_OBJ = 0x0004, /* key is the obj pointer */ + DNHT_MATCH_PTR = 0x0008, /* match by pointer, not match() */ + DNHT_INSERT = 0x0010, /* insert if not found */ + DNHT_UNIQUE = 0x0020, /* report error if already there */ + DNHT_REMOVE = 0x0040, /* remove on find or dn_ht_free */ +}; + +#endif /* _IP_DN_HEAP_H */ diff --git a/dummynet2/include/netinet/ipfw/dn_sched.h b/dummynet2/include/netinet/ipfw/dn_sched.h new file mode 100644 index 0000000..a755e86 --- /dev/null +++ b/dummynet2/include/netinet/ipfw/dn_sched.h @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * The API to write a packet scheduling algorithm for dummynet. + * + * $FreeBSD: head/sys/netinet/ipfw/dn_sched.h 204591 2010-03-02 17:40:48Z luigi $ + */ + +#ifndef _DN_SCHED_H +#define _DN_SCHED_H + +#define DN_MULTIQUEUE 0x01 +/* + * Descriptor for a scheduling algorithm. + * Contains all function pointers for a given scheduler. + * This is typically created when a module is loaded, and stored + * in a global list of schedulers. + */ +struct dn_alg { + uint32_t type; /* the scheduler type */ + const char *name; /* scheduler name */ + uint32_t flags; /* DN_MULTIQUEUE if supports multiple queues */ + + /* + * The following define the size of 3 optional data structures + * that may need to be allocated at runtime, and are appended + * to each of the base data structures: scheduler, sched.inst, + * and queue. We don't have a per-flowset structure. + */ + /* + parameters attached to the template, e.g. + * default queue sizes, weights, quantum size, and so on; + */ + size_t schk_datalen; + + /* + per-instance parameters, such as timestamps, + * containers for queues, etc; + */ + size_t si_datalen; + + size_t q_datalen; /* per-queue parameters (e.g. S,F) */ + + /* + * Methods implemented by the scheduler: + * enqueue enqueue packet 'm' on scheduler 's', queue 'q'. + * q is NULL for !MULTIQUEUE. + * Return 0 on success, 1 on drop (packet consumed anyways).
+ * Note that q should be interpreted only as a hint + * on the flow that the mbuf belongs to: while a + * scheduler will normally enqueue m into q, it is ok + * to leave q alone and put the mbuf elsewhere. + * This function is called in two cases: + * - when a new packet arrives to the scheduler; + * - when a scheduler is reconfigured. In this case the + * call is issued by the new_queue callback, with a + * non empty queue (q) and m pointing to the first + * mbuf in the queue. For this reason, the function + * should internally check for (m != q->mq.head) + * before calling dn_enqueue(). + * + * dequeue Called when scheduler instance 's' can + * dequeue a packet. Return NULL if none are available. + * XXX what about non work-conserving ? + * + * config called on 'sched X config ...', normally writes + * in the area of size sch_arg + * + * destroy called on 'sched delete', frees everything + * in sch_arg (other parts are handled by more specific + * functions) + * + * new_sched called when a new instance is created, e.g. + * to create the local queue for !MULTIQUEUE, set V or + * copy parameters for WFQ, and so on. + * + * free_sched called when deleting an instance, cleans + * extra data in the per-instance area. + * + * new_fsk called when a flowset is linked to a scheduler, + * e.g. to validate parameters such as weights etc. + * free_fsk when a flowset is unlinked from a scheduler. + * (probably unnecessary) + * + * new_queue called to set the per-queue parameters, + * e.g. S and F, adjust sum of weights in the parent, etc. + * + * The new_queue callback is normally called when + * creating a new queue. In some cases (such as a + * scheduler change or reconfiguration) it can be called + * with a non empty queue. + * In case of non empty queue, the new_queue callback could + * need to call the enqueue function. In this case, + * the callback should eventually call enqueue() passing + * as m the first element in the queue.
+ * + * free_queue actions related to a queue removal, e.g. undo + * all the above. If the queue has data in it, also remove + * from the scheduler. This can e.g. happen during a reconfigure. + * If safe == 1 remove the queue only if the scheduler no longer + * needs it, otherwise delete it even if the scheduler is using + * it. Usually, the flag safe is set when the drain routine is + * running to delete idle queues. + */ + int (*enqueue)(struct dn_sch_inst *, struct dn_queue *, + struct mbuf *); + struct mbuf * (*dequeue)(struct dn_sch_inst *); + + int (*config)(struct dn_schk *); + int (*destroy)(struct dn_schk*); + int (*new_sched)(struct dn_sch_inst *); + int (*free_sched)(struct dn_sch_inst *); + int (*new_fsk)(struct dn_fsk *f); + int (*free_fsk)(struct dn_fsk *f); + int (*new_queue)(struct dn_queue *q); + int (*free_queue)(struct dn_queue *q, int safe); + + /* run-time fields */ + int ref_count; /* XXX number of instances in the system */ + SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */ +}; + +/* MSVC does not support initializers so we need this ugly macro */ +#ifdef _WIN32 +#define _SI(fld) +#else +#define _SI(fld) fld +#endif + +/* + * Additionally, dummynet exports some functions and macros + * to be used by schedulers: + */ + +void dn_free_pkts(struct mbuf *mnext); +int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop); +/* bound a variable between min and max */ +int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg); + +/* + * Extract the head of a queue, update stats. Must be the very last + * thing done on a dequeue as the queue itself may go away.
+ */ +static __inline struct mbuf* +dn_dequeue(struct dn_queue *q) +{ + struct mbuf *m = q->mq.head; + if (m == NULL) + return NULL; + q->mq.head = m->m_nextpkt; + + /* Update the packet and byte counters of the queue */ + q->ni.length--; + q->ni.len_bytes -= m->m_pkthdr.len; + /* When the queue becomes empty, store the current time in q_time + * (used by RED) and bump the count of idle queues (for garbage collection). + */ + if (q->ni.length == 0) { + dn_cfg.idle_queue++; + q->q_time = dn_cfg.curr_time; + } + if (q->_si) { + struct dn_flow *ni = &(q->_si->ni); + /* the queue has a scheduler instance: update its stats too, and + * count the instance as idle once it holds no more packets + */ + ni->length--; + ni->len_bytes -= m->m_pkthdr.len; + if (ni->length == 0) + dn_cfg.idle_si++; + } + return m; +} + +int dn_sched_modevent(module_t mod, int cmd, void *arg); + +#define DECLARE_DNSCHED_MODULE(name, dnsched) \ + static moduledata_t name##_mod = { \ + #name, dn_sched_modevent, dnsched \ + }; \ + DECLARE_MODULE(name, name##_mod, \ + SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \ + MODULE_DEPEND(name, dummynet, 3, 3, 3); +#endif /* _DN_SCHED_H */ diff --git a/dummynet2/include/netinet/ipfw/ip_dn_private.h b/dummynet2/include/netinet/ipfw/ip_dn_private.h new file mode 100644 index 0000000..ecb4fe2 --- /dev/null +++ b/dummynet2/include/netinet/ipfw/ip_dn_private.h @@ -0,0 +1,419 @@ +/*- + * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * internal dummynet APIs. + * + * $FreeBSD: head/sys/netinet/ipfw/ip_dn_private.h 204591 2010-03-02 17:40:48Z luigi $ + */ + +#ifndef _IP_DN_PRIVATE_H +#define _IP_DN_PRIVATE_H + +/* debugging support + * use ND() to remove debugging, D() to print a line, + * DX(level, ...) to print above a certain level + * If you redefine D() you are expected to redefine all. + */ +#ifndef D +#define ND(fmt, ...) do {} while (0) +#define D1(fmt, ...) do {} while (0) +#define D(fmt, ...) printf("%-10s " fmt "\n", \ + __FUNCTION__, ## __VA_ARGS__) +#define DX(lev, fmt, ...) 
do { \ + if (dn_cfg.debug > (lev)) D(fmt, ## __VA_ARGS__); } while (0) +#endif + +MALLOC_DECLARE(M_DUMMYNET); + +#ifndef __linux__ +#define div64(a, b) ((int64_t)(a) / (int64_t)(b)) +#endif + +#define DN_LOCK_INIT() do { \ + mtx_init(&dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF); \ + mtx_init(&dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF); \ + } while (0) +#define DN_LOCK_DESTROY() do { \ + mtx_destroy(&dn_cfg.uh_mtx); \ + mtx_destroy(&dn_cfg.bh_mtx); \ + } while (0) +#if 0 /* not used yet */ +#define DN_UH_RLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_UH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_UH_WLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_UH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_UH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED) +#endif /* NOTE(review): the DN_BH_* macros below take uh_mtx, not bh_mtx -- confirm this is intended */ + +#define DN_BH_RLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_BH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_BH_WLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_BH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_BH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED) + +SLIST_HEAD(dn_schk_head, dn_schk); +SLIST_HEAD(dn_sch_inst_head, dn_sch_inst); +SLIST_HEAD(dn_fsk_head, dn_fsk); +SLIST_HEAD(dn_queue_head, dn_queue); +SLIST_HEAD(dn_alg_head, dn_alg); + +struct mq { /* a basic queue of packets: head and tail mbuf pointers */ + struct mbuf *head, *tail; +}; + +static inline void /* fill in a dn_id header: set type and len, reset subtype to 0 */ +set_oid(struct dn_id *o, int type, int len) +{ + o->type = type; + o->len = len; + o->subtype = 0; +} + +uint64_t readTSC (void); +/* + * see if tsc (or other timer) is supported. + * - FreeBSD has rdtsc macro for i386 and amd64 + * - Linux has rdtscll and/or rdtsc (also for openWRT patched kernel source) + * - Windows has KeQueryPerformanceCounter() function that uses tsc or other + * timer + */ +#if defined(rdtscll) || defined(rdtsc) || defined(_WIN32) +#define HAVE_TSC +#endif +/* + * configuration and global data for a dummynet instance + * + * When a configuration is modified from userland, 'id' is incremented + * so we can use the value to check for stale pointers.
+ */ +struct dn_parms { + uint32_t id; /* configuration version */ + + /* defaults (sysctl-accessible) */ + int red_lookup_depth; + int red_avg_pkt_size; + int red_max_pkt_size; + int hash_size; + int max_hash_size; + long byte_limit; /* max queue sizes */ + long slot_limit; + + int io_fast; + int debug; + + /* timekeeping */ + struct timeval prev_t; /* last time dummynet_tick ran */ + struct dn_heap evheap; /* scheduled events */ + + /* counters of objects -- used for reporting space */ + int schk_count; + int si_count; + int fsk_count; + int queue_count; + + /* ticks and other stuff */ + uint64_t curr_time; /* in ticks */ + + /* + * Variables to manage the time spent in the drain routines. + * max_drain is the max fraction of a tick (0..100) to be used + * for draining. + * We also need some variables to store the average number of + * timecounter ticks between calls to the periodic task, etc. + */ + int drain_ratio; + uint64_t cycle_task_new; /* TSC when dummynet_task() starts */ + uint64_t cycle_task_old; /* TSC when prev. dummynet_task() starts */ + uint64_t cycle_task; + uint64_t cycle_task_avg; /* Moving average of cycle_task */ + + /* flowsets and schedulers are in hash tables, with 'hash_size' + * buckets. fshash is looked up at every packet arrival + * so better be generous if we expect many entries. + */ + struct dn_ht *fshash; + struct dn_ht *schedhash; + /* list of flowsets without a scheduler -- use sch_chain */ + struct dn_fsk_head fsu; /* list of unlinked flowsets */ + struct dn_alg_head schedlist; /* list of algorithms */ + + /* Counter of idle objects -- used by drain routine + * We scan when idle_queue (or idle_si) > expire_object. + * The drain routine is called every 'expire' cycles (the counter + * used is expire_cycle). + * We can disable the expire routine by setting expire to 0. + * An object is kept alive for at least object_idle_tick after it + * becomes idle.
During the scan, we count the number of objects + * that are idle but not ready in 'idle_si_wait' and 'idle_queue_wait' + */ + int idle_queue; + int idle_queue_wait; /* idle but not expired yet */ + int idle_si; + int idle_si_wait; /* idle but not expired yet */ + uint32_t expire_object; /* threshold for expires */ + uint32_t expire; /* how often to expire */ + uint32_t expire_cycle; + uint32_t object_idle_tick; /* lifetime of objs */ + uint32_t expire_object_examined; /* Burst of objects examined */ + + /* drain_fs and drain_sch point to the next bucket to scan when + * draining. + */ + uint32_t drain_fs; + uint32_t drain_sch; + + int init_done; + + /* if the upper half is busy doing something long, + * can set the busy flag and we will enqueue packets in + * a queue for later processing. + */ + int busy; + struct mq pending; + +#ifdef _KERNEL + /* + * This file is normally used in the kernel, unless we do + * some userland tests, in which case we do not need a mtx. + * uh_mtx arbitrates between system calls and also + * protects fshash, schedhash and fsu (the unlinked flowsets). + * These structures are readonly for the lower half. + * bh_mtx protects all other structures which may be + * modified upon packet arrivals + */ +#if defined( __linux__ ) || defined( _WIN32 ) + spinlock_t uh_mtx; + spinlock_t bh_mtx; +#else + struct mtx uh_mtx; + struct mtx bh_mtx; +#endif + +#endif /* _KERNEL */ +}; + +/* + * Delay line, contains all packets on output from a link. + * Every scheduler instance has one. + */ +struct delay_line { + struct dn_id oid; + struct dn_sch_inst *si; + struct mq mq; +}; + +/* + * The kernel side of a flowset. It is linked in a hash table + * of flowsets, and in a list of children of their parent scheduler. + * qht is either the queue or (if HAVE_MASK) a hash table queues. + * Note that the mask to use is the (flow_mask|sched_mask), which + * changes as we attach/detach schedulers. So we store it here.
+ * + * XXX If we want to add scheduler-specific parameters, we need to + * put them in external storage because the scheduler may not be + * available when the fsk is created. + */ +struct dn_fsk { /* kernel side of a flowset */ + struct dn_fs fs; + SLIST_ENTRY(dn_fsk) fsk_next; /* hash chain for fshash */ + + struct ipfw_flow_id fsk_mask; + + /* qht is a hash table of queues, or just a single queue; + * a bit in fs.flags tells us which one + */ + struct dn_ht *qht; + struct dn_schk *sched; /* Sched we are linked to */ + SLIST_ENTRY(dn_fsk) sch_chain; /* list of fsk attached to sched */ + + /* bucket index used by drain routine to drain queues for this + * flowset + */ + int drain_bucket; + /* Parameters related to RED / GRED */ + /* original values are in dn_fs */ + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ + + u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ + u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ + u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ + u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ + u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ + u_int lookup_depth ; /* depth of lookup table */ + int lookup_step ; /* granularity inside the lookup table */ + int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ + int avg_pkt_size ; /* medium packet size */ + int max_pkt_size ; /* max packet size */ +}; + +/* + * A queue is created as a child of a flowset unless it belongs to + * a !MULTIQUEUE scheduler. It is normally in a hash table in the + * flowset. fs always points to the parent flowset. + * si normally points to the sch_inst, unless the flowset has been + * detached from the scheduler -- in this case si == NULL and we + * should not enqueue.
+ */ +struct dn_queue { + struct dn_flow ni; /* oid, flow_id, stats */ + struct mq mq; /* packets queue */ + struct dn_sch_inst *_si; /* owner scheduler instance */ + SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */ + struct dn_fsk *fs; /* parent flowset. */ + + /* RED parameters */ + int avg; /* average queue length est. (scaled) */ + int count; /* arrivals since last RED drop */ + int random; /* random value (scaled) */ + uint64_t q_time; /* start of queue idle time */ + +}; + +/* + * The kernel side of a scheduler. Contains the userland config, + * a link, pointer to extra config arguments from command line, + * kernel flags, and a pointer to the scheduler methods. + * It is stored in a hash table, and holds a list of all + * flowsets and scheduler instances. + * XXX sch must be at the beginning, see schk_hash(). + */ +struct dn_schk { + struct dn_sch sch; + struct dn_alg *fp; /* Pointer to scheduler functions */ + struct dn_link link; /* The link, embedded */ + struct dn_profile *profile; /* delay profile, if any */ + struct dn_id *cfg; /* extra config arguments */ + + SLIST_ENTRY(dn_schk) schk_next; /* hash chain for schedhash */ + + struct dn_fsk_head fsk_list; /* all fsk linked to me */ + struct dn_fsk *fs; /* Flowset for !MULTIQUEUE */ + + /* bucket index used by the drain routine to drain the scheduler + * instance for this flowset. + */ + int drain_bucket; + + /* Hash table of all instances (through sch.sched_mask) + * or single instance if no mask. Always valid. + */ + struct dn_ht *siht; +}; + + +/* + * Scheduler instance. + * Contains variables and all queues relative to this instance. + * This struct is created at runtime. + */ +struct dn_sch_inst { + struct dn_flow ni; /* oid, flowid and stats */ + SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */ + struct delay_line dline; + struct dn_schk *sched; /* the template */ + int kflags; /* DN_ACTIVE */ + + int64_t credit; /* bits I can transmit (more or less).
*/ + uint64_t sched_time; /* time link was scheduled in ready_heap */ + uint64_t idle_time; /* start of scheduler instance idle time */ + + /* q_count is the number of queues that this instance is using. + * The counter is incremented or decremented when + * a reference from the queue is created or deleted. + * It is used to make sure that a scheduler instance can be safely + * deleted by the drain routine. + */ + int q_count; + +}; + + +/* kernel-side flags. Linux has DN_DELETE in fcntl.h + */ +enum { + /* 1 and 2 are reserved for the SCAN flags */ + DN_DESTROY = 0x0004, /* destroy */ + DN_DELETE_FS = 0x0008, /* destroy flowset */ + DN_DETACH = 0x0010, + DN_ACTIVE = 0x0020, /* object is in evheap */ + DN_F_DLINE = 0x0040, /* object is a delay line */ + DN_DEL_SAFE = 0x0080, /* delete a queue only if no longer needed + * by scheduler */ + DN_QHT_IS_Q = 0x0100, /* in flowset, qht is a single queue */ +}; + +extern struct dn_parms dn_cfg; +//VNET_DECLARE(struct dn_parms, _base_dn_cfg); +//#define dn_cfg VNET(_base_dn_cfg) + +int dummynet_io(struct mbuf **, int , struct ip_fw_args *); +void dummynet_task(void *context, int pending); +void dn_reschedule(void); + +struct dn_queue *ipdn_q_find(struct dn_fsk *, struct ipfw_flow_id *); +struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *); + +/* + * copy_range is a template for requests for ranges of pipes/queues/scheds. + * The number of ranges is variable and can be derived by o.len. + * As a default, we use a small number of entries so that the struct + * fits easily on the stack and is sufficient for most common requests. 
+ */ +#define DEFAULT_RANGES 5 +struct copy_range { + struct dn_id o; + uint32_t r[ 2 * DEFAULT_RANGES ]; +}; + +struct copy_args { + char **start; + char *end; + int flags; + int type; + struct copy_range *extra; /* extra filtering */ +}; + +struct sockopt; +int ip_dummynet_compat(struct sockopt *sopt); +int dummynet_get(struct sockopt *sopt, void **compat); +int dn_c_copy_q (void *_ni, void *arg); +int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq); +int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq); +int dn_compat_copy_queue(struct copy_args *a, void *_o); +int dn_compat_copy_pipe(struct copy_args *a, void *_o); +int copy_data_helper_compat(void *_o, void *_arg); +int dn_compat_calc_size(void); +int do_config(void *p, int l); + +/* function to drain idle object */ +void dn_drain_scheduler(void); +void dn_drain_queue(void); + +#endif /* _IP_DN_PRIVATE_H */ diff --git a/dummynet2/include/netinet/ipfw/ip_fw_private.h b/dummynet2/include/netinet/ipfw/ip_fw_private.h new file mode 100644 index 0000000..334face --- /dev/null +++ b/dummynet2/include/netinet/ipfw/ip_fw_private.h @@ -0,0 +1,301 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/netinet/ipfw/ip_fw_private.h 200601 2009-12-16 10:48:40Z luigi $ + */ + +#ifndef _IPFW2_PRIVATE_H +#define _IPFW2_PRIVATE_H + +/* + * Internal constants and data structures used by ipfw components + * and not meant to be exported outside the kernel. + */ + +#ifdef _KERNEL + +/* + * For platforms that do not have SYSCTL support, we wrap the + * SYSCTL_* into a function (one per file) to collect the values + * into an array at module initialization. The wrapping macros, + * SYSBEGIN() and SYSEND, are empty in the default case. + */ +#ifndef SYSBEGIN +#define SYSBEGIN(x) +#endif +#ifndef SYSEND +#define SYSEND +#endif + +/* Return values from ipfw_chk() */ +enum { + IP_FW_PASS = 0, + IP_FW_DENY, + IP_FW_DIVERT, + IP_FW_TEE, + IP_FW_DUMMYNET, + IP_FW_NETGRAPH, + IP_FW_NGTEE, + IP_FW_NAT, + IP_FW_REASS, +}; + +/* + * Structure for collecting parameters to dummynet for ip6_output forwarding + */ +struct _ip6dn_args { + struct ip6_pktopts *opt_or; + struct route_in6 ro_or; + int flags_or; + struct ip6_moptions *im6o_or; + struct ifnet *origifp_or; + struct ifnet *ifp_or; + struct sockaddr_in6 dst_or; + u_long mtu_or; + struct route_in6 ro_pmtu_or; +}; + + +/* + * Arguments for calling ipfw_chk() and dummynet_io(). We put them + * all into a structure because this way it is easier and more + * efficient to pass variables around and extend the interface. 
+ */ +struct ip_fw_args { + struct mbuf *m; /* the mbuf chain */ + struct ifnet *oif; /* output interface */ + struct sockaddr_in *next_hop; /* forward address */ + + /* + * On return, it points to the matching rule. + * On entry, rule.slot > 0 means the info is valid and + * contains the starting rule for an ipfw search. + * If chain_id == chain->id && slot >0 then jump to that slot. + * Otherwise, we locate the first rule >= rulenum:rule_id + */ + struct ipfw_rule_ref rule; /* match/restart info */ + + struct ether_header *eh; /* for bridged packets */ + + struct ipfw_flow_id f_id; /* grabbed from IP header */ + //uint32_t cookie; /* a cookie depending on rule action */ + struct inpcb *inp; + + struct _ip6dn_args dummypar; /* dummynet->ip6_output */ + struct sockaddr_in hopstore; /* store here if cannot use a pointer */ +}; + +MALLOC_DECLARE(M_IPFW); + +/* + * Hooks sometimes need to know the direction of the packet + * (divert, dummynet, netgraph, ...) + * We use a generic definition here, with bit0-1 indicating the + * direction, bit 2 indicating layer2 or 3, bit 3-4 indicating the + * specific protocol + * (if necessary) + */ +enum { + DIR_MASK = 0x3, + DIR_OUT = 0, + DIR_IN = 1, + DIR_FWD = 2, + DIR_DROP = 3, + PROTO_LAYER2 = 0x4, /* set for layer 2 */ + /* PROTO_DEFAULT = 0, */ + PROTO_IPV4 = 0x08, + PROTO_IPV6 = 0x10, + PROTO_IFB = 0x0c, /* layer2 + ifbridge */ + /* PROTO_OLDBDG = 0x14, unused, old bridge */ +}; + +/* wrapper for freeing a packet, in case we need to do more work */ +#ifndef FREE_PKT +#if defined(__linux__) || defined(_WIN32) +#define FREE_PKT(m) netisr_dispatch(-1, m) +#else +#define FREE_PKT(m) m_freem(m) +#endif +#endif /* !FREE_PKT */ + +/* + * Function definitions.
+ */ + +/* attach (arg = 1) or detach (arg = 0) hooks */ +int ipfw_attach_hooks(int); +#ifdef NOTYET +void ipfw_nat_destroy(void); +#endif + +/* In ip_fw_log.c */ +struct ip; +void ipfw_log_bpf(int); +void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, + struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, + struct ip *ip); +VNET_DECLARE(u_int64_t, norule_counter); +#define V_norule_counter VNET(norule_counter) +VNET_DECLARE(int, verbose_limit); +#define V_verbose_limit VNET(verbose_limit) + +/* In ip_fw_dynamic.c */ + +enum { /* result for matching dynamic rules */ + MATCH_REVERSE = 0, + MATCH_FORWARD, + MATCH_NONE, + MATCH_UNKNOWN, +}; + +/* + * The lock for dynamic rules is only used once outside the file, + * and only to release the result of lookup_dyn_rule(). + * Eventually we may implement it with a callback on the function. + */ +void ipfw_dyn_unlock(void); + +struct tcphdr; +struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *, + u_int32_t, u_int32_t, int); +int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, + struct ip_fw_args *args, uint32_t tablearg); +ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, + int *match_direction, struct tcphdr *tcp); +void ipfw_remove_dyn_children(struct ip_fw *rule); +void ipfw_get_dynamic(char **bp, const char *ep); + +void ipfw_dyn_attach(void); /* uma_zcreate .... */ +void ipfw_dyn_detach(void); /* uma_zdestroy ... 
*/ +void ipfw_dyn_init(void); /* per-vnet initialization */ +void ipfw_dyn_uninit(int); /* per-vnet deinitialization */ +int ipfw_dyn_len(void); + +/* common variables */ +VNET_DECLARE(int, fw_one_pass); +#define V_fw_one_pass VNET(fw_one_pass) + +VNET_DECLARE(int, fw_verbose); +#define V_fw_verbose VNET(fw_verbose) + +VNET_DECLARE(struct ip_fw_chain, layer3_chain); +#define V_layer3_chain VNET(layer3_chain) + +VNET_DECLARE(u_int32_t, set_disable); +#define V_set_disable VNET(set_disable) + +VNET_DECLARE(int, autoinc_step); +#define V_autoinc_step VNET(autoinc_step) + +struct ip_fw_chain { + struct ip_fw *rules; /* list of rules */ + struct ip_fw *reap; /* list of rules to reap */ + struct ip_fw *default_rule; + int n_rules; /* number of static rules */ + int static_len; /* total len of static rules */ + struct ip_fw **map; /* array of rule ptrs to ease lookup */ + LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */ + struct radix_node_head *tables[IPFW_TABLES_MAX]; +#if defined( __linux__ ) || defined( _WIN32 ) + spinlock_t rwmtx; + spinlock_t uh_lock; +#else + struct rwlock rwmtx; + struct rwlock uh_lock; /* lock for upper half */ +#endif + uint32_t id; /* ruleset id */ +}; + +struct sockopt; /* used by tcp_var.h */ + +/* + * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c + * so the variable and the macros must be here. 
+ */ + +#define IPFW_LOCK_INIT(_chain) do { \ + rw_init(&(_chain)->rwmtx, "IPFW static rules"); \ + rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ + } while (0) + +#define IPFW_LOCK_DESTROY(_chain) do { \ + rw_destroy(&(_chain)->rwmtx); \ + rw_destroy(&(_chain)->uh_lock); \ + } while (0) + +#define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED) + +#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx) +#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx) +#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx) +#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx) + +#define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock) +#define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock) +#define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock) +#define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock) + +/* In ip_fw_sockopt.c */ +int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id); +int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule); +int ipfw_ctl(struct sockopt *sopt); +int ipfw_chk(struct ip_fw_args *args); +void ipfw_reap_rules(struct ip_fw *head); + +/* In ip_fw_pfil */ +int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, + struct inpcb *inp); + +/* In ip_fw_table.c */ +struct radix_node; +int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint32_t *val); +int ipfw_init_tables(struct ip_fw_chain *ch); +void ipfw_destroy_tables(struct ip_fw_chain *ch); +int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl); +int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint8_t mlen, uint32_t value); +int ipfw_dump_table_entry(struct radix_node *rn, void *arg); +int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint8_t mlen); +int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt); +int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl); + +/* In ip_fw_nat.c -- XXX to be moved to ip_var.h */ + +extern struct cfg_nat 
*(*lookup_nat_ptr)(struct nat_list *, int); + +typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *); +typedef int ipfw_nat_cfg_t(struct sockopt *); + +extern ipfw_nat_t *ipfw_nat_ptr; +#define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL) + +extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; +extern ipfw_nat_cfg_t *ipfw_nat_del_ptr; +extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; +extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; + +#endif /* _KERNEL */ +#endif /* _IPFW2_PRIVATE_H */ diff --git a/dummynet2/include/netinet/tcp.h b/dummynet2/include/netinet/tcp.h new file mode 100644 index 0000000..5af35a7 --- /dev/null +++ b/dummynet2/include/netinet/tcp.h @@ -0,0 +1,228 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD: src/sys/netinet/tcp.h,v 1.40.2.2 2008/07/31 06:10:25 kmacy Exp $ + */ + +#ifndef _NETINET_TCP_H_ +#define _NETINET_TCP_H_ + +#include + +#define __BSD_VISIBLE 1 + +#if __BSD_VISIBLE + +typedef u_int32_t tcp_seq; + +#define tcp6_seq tcp_seq /* for KAME src sync over BSD*'s */ +#define tcp6hdr tcphdr /* for KAME src sync over BSD*'s */ + +/* + * TCP header. + * Per RFC 793, September, 1981. + */ +struct tcphdr { + u_short th_sport; /* source port */ + u_short th_dport; /* destination port */ + tcp_seq th_seq; /* sequence number */ + tcp_seq th_ack; /* acknowledgement number */ +#if BYTE_ORDER == LITTLE_ENDIAN + u_char th_x2:4, /* (unused) */ + th_off:4; /* data offset */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_char th_off:4, /* data offset */ + th_x2:4; /* (unused) */ +#endif + u_char th_flags; +#define TH_FIN 0x01 +#define TH_SYN 0x02 +#define TH_RST 0x04 +#define TH_PUSH 0x08 +#define TH_ACK 0x10 +#define TH_URG 0x20 +#define TH_ECE 0x40 +#define TH_CWR 0x80 +#define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|TH_ECE|TH_CWR) +#define PRINT_TH_FLAGS "\20\1FIN\2SYN\3RST\4PUSH\5ACK\6URG\7ECE\10CWR" + + u_short th_win; /* window */ + u_short th_sum; /* checksum */ + u_short th_urp; /* urgent pointer */ +}; + +#define TCPOPT_EOL 0 +#define TCPOLEN_EOL 1 +#define TCPOPT_PAD 0 /* padding after EOL */ +#define TCPOLEN_PAD 1 +#define TCPOPT_NOP 1 +#define TCPOLEN_NOP 1 +#define 
TCPOPT_MAXSEG 2 +#define TCPOLEN_MAXSEG 4 +#define TCPOPT_WINDOW 3 +#define TCPOLEN_WINDOW 3 +#define TCPOPT_SACK_PERMITTED 4 +#define TCPOLEN_SACK_PERMITTED 2 +#define TCPOPT_SACK 5 +#define TCPOLEN_SACKHDR 2 +#define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq) */ +#define TCPOPT_TIMESTAMP 8 +#define TCPOLEN_TIMESTAMP 10 +#define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ +#define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */ +#define TCPOLEN_SIGNATURE 18 + +/* Miscellaneous constants */ +#define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at receiver side */ +#define TCP_MAX_SACK 4 /* MAX # SACKs sent in any segment */ + + +/* + * Default maximum segment size for TCP. + * With an IP MTU of 576, this is 536, + * but 512 is probably more convenient. + * This should be defined as MIN(512, IP_MSS - sizeof (struct tcpiphdr)). + */ +#define TCP_MSS 512 +/* + * TCP_MINMSS is defined to be 216 which is fine for the smallest + * link MTU (256 bytes, AX.25 packet radio) in the Internet. + * However it is very unlikely to come across such low MTU interfaces + * these days (anno dato 2003). + * See tcp_subr.c tcp_minmss SYSCTL declaration for more comments. + * Setting this to "0" disables the minmss check. + */ +#define TCP_MINMSS 216 + +/* + * Default maximum segment size for TCP6. + * With an IP6 MSS of 1280, this is 1220, + * but 1024 is probably more convenient. 
(xxx kazu in doubt) + * This should be defined as MIN(1024, IP6_MSS - sizeof (struct tcpip6hdr)) + */ +#define TCP6_MSS 1024 + +#define TCP_MAXWIN 65535 /* largest value for (unscaled) window */ +#define TTCP_CLIENT_SND_WND 4096 /* dflt send window for T/TCP client */ + +#define TCP_MAX_WINSHIFT 14 /* maximum window shift */ + +#define TCP_MAXBURST 4 /* maximum segments in a burst */ + +#define TCP_MAXHLEN (0xf<<2) /* max length of header in bytes */ +#define TCP_MAXOLEN (TCP_MAXHLEN - sizeof(struct tcphdr)) + /* max space left for options */ +#endif /* __BSD_VISIBLE */ + +/* + * User-settable options (used with setsockopt). + */ +#define TCP_NODELAY 0x01 /* don't delay send to coalesce packets */ +#if __BSD_VISIBLE +#define TCP_MAXSEG 0x02 /* set maximum segment size */ +#define TCP_NOPUSH 0x04 /* don't push last block of write */ +#define TCP_NOOPT 0x08 /* don't use TCP options */ +#define TCP_MD5SIG 0x10 /* use MD5 digests (RFC2385) */ +#define TCP_INFO 0x20 /* retrieve tcp_info structure */ +#define TCP_CONGESTION 0x40 /* get/set congestion control algorithm */ + +#define TCP_CA_NAME_MAX 16 /* max congestion control name length */ + +#define TCPI_OPT_TIMESTAMPS 0x01 +#define TCPI_OPT_SACK 0x02 +#define TCPI_OPT_WSCALE 0x04 +#define TCPI_OPT_ECN 0x08 +#define TCPI_OPT_TOE 0x10 + +/* + * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits + * the caller to query certain information about the state of a TCP + * connection. We provide an overlapping set of fields with the Linux + * implementation, but since this is a fixed size structure, room has been + * left for growth. In order to maximize potential future compatibility with + * the Linux API, the same variable names and order have been adopted, and + * padding left to make room for omitted fields in case they are added later. + * + * XXX: This is currently an unstable ABI/API, in that it is expected to + * change. + */ +struct tcp_info { + u_int8_t tcpi_state; /* TCP FSM state. 
*/ + u_int8_t __tcpi_ca_state; + u_int8_t __tcpi_retransmits; + u_int8_t __tcpi_probes; + u_int8_t __tcpi_backoff; + u_int8_t tcpi_options; /* Options enabled on conn. */ + u_int8_t tcpi_snd_wscale:4, /* RFC1323 send shift value. */ + tcpi_rcv_wscale:4; /* RFC1323 recv shift value. */ + + u_int32_t __tcpi_rto; + u_int32_t __tcpi_ato; + u_int32_t __tcpi_snd_mss; + u_int32_t __tcpi_rcv_mss; + + u_int32_t __tcpi_unacked; + u_int32_t __tcpi_sacked; + u_int32_t __tcpi_lost; + u_int32_t __tcpi_retrans; + u_int32_t __tcpi_fackets; + + /* Times; measurements in usecs. */ + u_int32_t __tcpi_last_data_sent; + u_int32_t __tcpi_last_ack_sent; /* Also unimpl. on Linux? */ + u_int32_t __tcpi_last_data_recv; + u_int32_t __tcpi_last_ack_recv; + + /* Metrics; variable units. */ + u_int32_t __tcpi_pmtu; + u_int32_t __tcpi_rcv_ssthresh; + u_int32_t tcpi_rtt; /* Smoothed RTT in usecs. */ + u_int32_t tcpi_rttvar; /* RTT variance in usecs. */ + u_int32_t tcpi_snd_ssthresh; /* Slow start threshold. */ + u_int32_t tcpi_snd_cwnd; /* Send congestion window. */ + u_int32_t __tcpi_advmss; + u_int32_t __tcpi_reordering; + + u_int32_t __tcpi_rcv_rtt; + u_int32_t tcpi_rcv_space; /* Advertised recv window. */ + + /* FreeBSD extensions to tcp_info. */ + u_int32_t tcpi_snd_wnd; /* Advertised send window. */ + u_int32_t tcpi_snd_bwnd; /* Bandwidth send window. */ + u_int32_t tcpi_snd_nxt; /* Next egress seqno */ + u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */ + u_int32_t tcpi_toe_tid; /* HWTID for TOE endpoints */ + + /* Padding to grow without breaking ABI. */ + u_int32_t __tcpi_pad[29]; /* Padding. 
*/ +}; +#endif + +#endif /* !_NETINET_TCP_H_ */ diff --git a/dummynet2/include/netinet/tcp_var.h b/dummynet2/include/netinet/tcp_var.h new file mode 100644 index 0000000..35196a2 --- /dev/null +++ b/dummynet2/include/netinet/tcp_var.h @@ -0,0 +1,4 @@ +#ifndef _NETINET_TCP_VAR_H_ +#define _NETINET_TCP_VAR_H_ +#include +#endif /* !_NETINET_TCP_VAR_H_ */ diff --git a/dummynet2/include/netinet/udp.h b/dummynet2/include/netinet/udp.h new file mode 100644 index 0000000..cd75bd1 --- /dev/null +++ b/dummynet2/include/netinet/udp.h @@ -0,0 +1,67 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)udp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD: src/sys/netinet/udp.h,v 1.10 2007/02/20 10:13:11 rwatson Exp $ + */ + +#ifndef _NETINET_UDP_H_ +#define _NETINET_UDP_H_ + +/* + * UDP protocol header. + * Per RFC 768, September, 1981. + */ +struct udphdr { + u_short uh_sport; /* source port */ + u_short uh_dport; /* destination port */ + u_short uh_ulen; /* udp length */ + u_short uh_sum; /* udp checksum */ +}; + +/* + * User-settable options (used with setsockopt). + */ +#define UDP_ENCAP 0x01 + + +/* + * UDP Encapsulation of IPsec Packets options. + */ +/* Encapsulation types. */ +#define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */ +#define UDP_ENCAP_ESPINUDP 2 /* draft-ietf-ipsec-udp-encaps-02+ */ + +/* Default ESP in UDP encapsulation port. */ +#define UDP_ENCAP_ESPINUDP_PORT 500 + +/* Maximum UDP fragment size for ESP over UDP. 
*/ +#define UDP_ENCAP_ESPINUDP_MAXFRAGLEN 552 + +#endif diff --git a/dummynet2/include/sys/cdefs.h b/dummynet2/include/sys/cdefs.h new file mode 100644 index 0000000..b95b4b7 --- /dev/null +++ b/dummynet2/include/sys/cdefs.h @@ -0,0 +1,33 @@ +#ifndef _CDEFS_H_ +#define _CDEFS_H_ + +/* + * various compiler macros and common functions + */ + +#ifndef __unused +#define __unused __attribute__ ((__unused__)) +#endif + +#ifndef __packed +#define __packed __attribute__ ((__packed__)) +#endif + +#ifndef __aligned +#define __aligned(x) __attribute__((__aligned__(x))) +#endif + +/* defined as assert */ +void panic(const char *fmt, ...); + +#define KASSERT(exp,msg) do { \ + if (__predict_false(!(exp))) \ + panic msg; \ +} while (0) + +/* don't bother to optimize */ +#ifndef __predict_false +#define __predict_false(x) (x) /* __builtin_expect((exp), 0) */ +#endif + +#endif /* !_CDEFS_H_ */ diff --git a/dummynet2/include/sys/kernel.h b/dummynet2/include/sys/kernel.h new file mode 100644 index 0000000..fbc9581 --- /dev/null +++ b/dummynet2/include/sys/kernel.h @@ -0,0 +1,26 @@ +/* + * from freebsd's kernel.h + */ +#ifndef _SYS_KERNEL_H_ +#define _SYS_KERNEL_H_ + +#define SYSINIT(a, b, c, d, e) \ + void *sysinit_ ## d = d +#define VNET_SYSINIT(a, b, c, d, e) \ + void *sysinit_ ## d = d +#define SYSUNINIT(a, b, c, d, e) \ + void *sysuninit_ ## d = d +#define VNET_SYSUNINIT(a, b, c, d, e) \ + void *sysuninit_ ## d = d + +/* + * Some enumerated orders; "ANY" sorts last. 
+ */
+enum sysinit_elem_order {
+	SI_ORDER_FIRST		= 0x0000000,	/* first*/
+	SI_ORDER_SECOND		= 0x0000001,	/* second*/
+	SI_ORDER_THIRD		= 0x0000002,	/* third*/
+	SI_ORDER_MIDDLE		= 0x1000000,	/* somewhere in the middle */
+	SI_ORDER_ANY		= 0xfffffff	/* last*/
+};
+#endif
diff --git a/dummynet2/include/sys/malloc.h b/dummynet2/include/sys/malloc.h
new file mode 100644
index 0000000..ac16aed
--- /dev/null
+++ b/dummynet2/include/sys/malloc.h
@@ -0,0 +1,59 @@
+#ifndef _SYS_MALLOC_H_
+#define _SYS_MALLOC_H_
+
+/*
+ * malloc(size, type, flags): 'type' and 'flags' are ignored; no matter
+ * what, we try to get clear memory and be non-blocking.
+ * XXX check if 2.4 has a native way to zero memory,
+ * XXX obey to the flags (M_NOWAIT <-> GFP_ATOMIC, M_WAIT <-> GFP_KERNEL)
+ */
+#ifndef _WIN32 /* this is the linux version */
+
+/*
+ * XXX On zeroshell (2.6.25.17) we get a load error
+ *	__you_cannot_kmalloc_that_much
+ * which is triggered when kmalloc() is called with a large
+ * compile-time constant argument (include/linux/slab_def.h)
+ *
+ * I think it may be a compiler (or source) bug because there is no
+ * evidence that such a large request is made.
+ * Making the _size argument to kmalloc volatile prevents the compiler
+ * from making the mistake, though it is clearly not ideal.
+ */
+#if !defined (LINUX_24) && LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22)
+#define malloc(_size, type, flags)			\
+	({ volatile int _v = (_size); kmalloc(_v, GFP_ATOMIC | __GFP_ZERO); })
+#else /* LINUX <= 2.6.22 and LINUX_24 */
+/* linux <= 2.6.22 does not zero allocated memory, so clear it by hand */
+#define malloc(_size, type, flags)			\
+	({ int _sz = (_size);				\
+	void *_ret = kmalloc(_sz, GFP_ATOMIC);		\
+	if (_ret) memset(_ret, 0, _sz);			\
+	(_ret);						\
+	})
+#endif /* LINUX <= 2.6.22 */
+/* arguments are parenthesized: calloc(a + b, c) must yield (a + b) * c */
+#define calloc(_n, _s) malloc(((_n) * (_s)), NULL, GFP_ATOMIC | __GFP_ZERO)
+#define free(_var, type) kfree(_var)
+
+#else /* _WIN32, the windows version */
+
+/*
+ * ntddk.h uses win_malloc() and MmFreeContiguousMemory().
+ * wipfw uses + * ExAllocatePoolWithTag(, pool, len, tag) + * ExFreePoolWithTag(ptr, tag) + */ +#define malloc(_size, _type, _flags) my_alloc(_size) +#define calloc(_size, _type, _flags) my_alloc(_size) + +void *my_alloc(int _size); +/* the 'tag' version does not work without -Gz in the linker */ +#define free(_var, type) ExFreePool(_var) +//#define free(_var, type) ExFreePoolWithTag(_var, 'wfpi') + +#endif /* _WIN32 */ + +#define M_NOWAIT 0x0001 /* do not block */ +#define M_ZERO 0x0100 /* bzero the allocation */ +#endif /* _SYS_MALLOC_H_ */ diff --git a/dummynet2/include/sys/mbuf.h b/dummynet2/include/sys/mbuf.h new file mode 100644 index 0000000..e65bbb6 --- /dev/null +++ b/dummynet2/include/sys/mbuf.h @@ -0,0 +1,265 @@ +/* + * Copyright (C) 2009 Luigi Rizzo, Universita` di Pisa + * + * BSD copyright. + * + * A simple compatibility interface to map mbufs onto sk_buff + */ + +#ifndef _SYS_MBUF_H_ +#define _SYS_MBUF_H_ + +#include /* we use free() */ +/* hopefully queue.h is already included by someone else */ +#include +#ifdef _KERNEL + +/* bzero not present on linux, but this should go in glue.h */ +// #define bzero(s, n) memset(s, 0, n) + +/* + * We implement a very simplified UMA allocator where the backend + * is simply malloc, and uma_zone only stores the length of the components. + */ +typedef int uma_zone_t; /* the zone size */ + +#define uma_zcreate(name, len, _3, _4, _5, _6, _7, _8) (len) + + +#define uma_zfree(zone, item) free(item, M_IPFW) +#define uma_zalloc(zone, flags) malloc(zone, M_IPFW, flags) +#define uma_zdestroy(zone) do {} while (0) + +/*- + * Macros for type conversion: + * mtod(m, t) -- Convert mbuf pointer to data pointer of correct type. + */ +#define mtod(m, t) ((t)((m)->m_data)) + +#endif /* _KERNEL */ + +/* + * Packet tag structure (see below for details). 
+ */ +struct m_tag { + SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ + u_int16_t m_tag_id; /* Tag ID */ + u_int16_t m_tag_len; /* Length of data */ + u_int32_t m_tag_cookie; /* ABI/Module ID */ + void (*m_tag_free)(struct m_tag *); +}; + +#if defined(__linux__) || defined( _WIN32 ) + +/* + * Auxiliary structure to store values from the sk_buf. + * Note that we should not alter the sk_buff, and if we do + * so make sure to keep the values in sync between the mbuf + * and the sk_buff (especially m_len and m_pkthdr.len). + */ + +struct mbuf { + struct mbuf *m_next; + struct mbuf *m_nextpkt; + void *m_data; + int m_len; /* length in this mbuf */ + int m_flags; +#ifdef __linux__ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) + struct nf_info *queue_entry; +#else + struct nf_queue_entry *queue_entry; +#endif +#else /* _WIN32 */ + int direction; /* could go in rcvif */ + NDIS_HANDLE context; /* replaces queue_entry or skb ?*/ + PNDIS_PACKET pkt; +#endif + struct sk_buff *m_skb; + struct { +#ifdef __linux__ + struct net_device *rcvif; +#else + struct ifnet *rcvif; +#endif + int len; /* total packet len */ + SLIST_HEAD (packet_tags, m_tag) tags; + } m_pkthdr; +}; + +#define M_SKIP_FIREWALL 0x01 /* skip firewall processing */ +#define M_BCAST 0x02 /* send/received as link-level broadcast */ +#define M_MCAST 0x04 /* send/received as link-level multicast */ + +#define M_DONTWAIT M_NOWAIT /* should not be here... */ + + +/* + * m_dup() is used in the TEE case, currently unsupported so we + * just return. 
+ */
+static __inline struct mbuf *
+m_dup(struct mbuf __unused *m, int __unused n)
+{
+	return NULL;
+}
+
+#define	MTAG_ABI_COMPAT		0		/* compatibility ABI */
+
+/*
+ * Lookup by type only is not implemented: always reports "not found".
+ */
+static __inline struct m_tag *
+m_tag_find(struct mbuf __unused *m, int __unused type,
+	struct m_tag __unused *start)
+{
+	return NULL;
+}
+
+/*
+ * Link tag t at the head of the tag list of mbuf m.
+ */
+static __inline void
+m_tag_prepend(struct mbuf *m, struct m_tag *t)
+{
+	SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link);
+}
+
+/*
+ * Return the next tag in the list of tags associated with an mbuf.
+ */
+static __inline struct m_tag *
+m_tag_next(struct mbuf *m, struct m_tag *t)
+{
+
+	return (SLIST_NEXT(t, m_tag_link));
+}
+
+/*
+ * Create an mtag of the given type with 'length' bytes of payload
+ * following the header. The whole allocation is zeroed.
+ * The 'wait' argument is ignored, the allocation never blocks.
+ * Returns NULL if length is negative or memory is not available.
+ */
+static __inline struct m_tag *
+m_tag_alloc(uint32_t cookie, int type, int length, int wait)
+{
+	struct m_tag *m;
+	int l;
+
+	/* a negative length would make the allocation size bogus */
+	if (length < 0)
+		return NULL;
+
+	l = length + sizeof(struct m_tag);
+	m = malloc(l, 0, M_NOWAIT);
+	if (m) {
+		memset(m, 0, l);
+		m->m_tag_id = type;
+		m->m_tag_len = length;
+		m->m_tag_cookie = cookie;
+	}
+	return m;
+}
+
+/*
+ * Same as m_tag_alloc() using the compatibility cookie.
+ */
+static __inline struct m_tag *
+m_tag_get(int type, int length, int wait)
+{
+	return m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait);
+}
+
+/*
+ * Return the first tag attached to the mbuf, NULL if there is none.
+ */
+static __inline struct m_tag *
+m_tag_first(struct mbuf *m)
+{
+	return SLIST_FIRST(&m->m_pkthdr.tags);
+}
+
+/*
+ * No-op in this compatibility layer: the tag is neither unlinked nor
+ * freed here. A tag left on the list is released by m_freem().
+ */
+static __inline void
+m_tag_delete(struct mbuf *m, struct m_tag *t)
+{
+}
+
+/*
+ * Find a tag with cookie n and id x attached to mbuf m.
+ * The search starts from the head of the list when t is NULL,
+ * otherwise from the tag following t, so the function can be
+ * called repeatedly to walk all matching tags.
+ * Returns the matching tag or NULL if there is none.
+ */
+static __inline struct m_tag *
+m_tag_locate(struct mbuf *m, u_int32_t n, int x, struct m_tag *t)
+{
+	struct m_tag *tag;
+
+	tag = (t == NULL) ? m_tag_first(m) : SLIST_NEXT(t, m_tag_link);
+	/* walk the whole chain, the tag we want may not be the first */
+	for (; tag != NULL; tag = SLIST_NEXT(tag, m_tag_link)) {
+		if (tag->m_tag_cookie == n && tag->m_tag_id == x)
+			return tag;
+	}
+	return NULL;
+}
+
+#define M_SETFIB(_m, _fib)	/* nothing on linux */
+
+/*
+ * Release an mbuf together with all the tags attached to it.
+ * Passing NULL is allowed and does nothing.
+ */
+static __inline void
+m_freem(struct mbuf *m)
+{
+	struct m_tag *t;
+
+	if (m == NULL)
+		return;
+
+	/* free the m_tag chain */
+	while ((t = SLIST_FIRST(&m->m_pkthdr.tags)) != NULL) {
+		SLIST_REMOVE_HEAD(&m->m_pkthdr.tags, m_tag_link);
+		free(t, 0);
+	}
+
+	/* free the mbuf */
+	free(m, M_IPFW);
+}
+
+/* m_pullup is not supported, there is a macro in missing.h */
+
+#define M_GETFIB(_m)	0
+
+/* macro used to create a new mbuf */
+#define MT_DATA		1	/* dynamic (data) allocation */
+#define MSIZE		256	/* size of an mbuf */
+#define MGETHDR(_m, _how, _type)	((_m) = m_gethdr((_how), (_type)))
+
+/*
+ * Allocate and init a new mbuf using the same structure of FreeBSD.
+ * MSIZE bytes are allocated: the struct mbuf header comes first and
+ * m_data points right after it. Both arguments are ignored, the
+ * allocation is always non-blocking.
+ * Returns NULL if memory is not available.
+ */
+static __inline struct mbuf *
+m_gethdr(int how, short type)
+{
+	struct mbuf *m;
+
+	m = malloc(MSIZE, M_IPFW, M_NOWAIT);
+	if (m == NULL)
+		return NULL;
+
+	/*
+	 * Clear the header ourselves, as m_tag_alloc() does for tags,
+	 * instead of trusting every malloc() backend to return zeroed
+	 * memory: m_freem() walks m_pkthdr.tags, so the list must
+	 * start out empty.
+	 */
+	memset(m, 0, sizeof(*m));
+	SLIST_INIT(&m->m_pkthdr.tags);
+
+	/* here we have MSIZE - sizeof(struct mbuf) available */
+	m->m_data = m + 1;
+
+	return m;
+}
+
+#endif /* __linux__ || _WIN32 */
+
+/*
+ * Persistent tags stay with an mbuf until the mbuf is reclaimed.  Otherwise
+ * tags are expected to ``vanish'' when they pass through a network
+ * interface.  For most interfaces this happens normally as the tags are
+ * reclaimed when the mbuf is free'd.  However in some special cases
+ * reclaiming must be done manually.  An example is packets that pass through
+ * the loopback interface.  Also, one must be careful to do this when
+ * ``turning around'' packets (e.g., icmp_reflect).
+ *
+ * To mark a tag persistent bit-or this flag in when defining the tag id.
+ * The tag will then be treated as described above.
+ */
+#define	MTAG_PERSISTENT				0x800
+
+#define	PACKET_TAG_NONE				0  /* Nadda */
+
+/* Packet tags for use with PACKET_ABI_COMPAT.
*/ +#define PACKET_TAG_IPSEC_IN_DONE 1 /* IPsec applied, in */ +#define PACKET_TAG_IPSEC_OUT_DONE 2 /* IPsec applied, out */ +#define PACKET_TAG_IPSEC_IN_CRYPTO_DONE 3 /* NIC IPsec crypto done */ +#define PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED 4 /* NIC IPsec crypto req'ed */ +#define PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO 5 /* NIC notifies IPsec */ +#define PACKET_TAG_IPSEC_PENDING_TDB 6 /* Reminder to do IPsec */ +#define PACKET_TAG_BRIDGE 7 /* Bridge processing done */ +#define PACKET_TAG_GIF 8 /* GIF processing done */ +#define PACKET_TAG_GRE 9 /* GRE processing done */ +#define PACKET_TAG_IN_PACKET_CHECKSUM 10 /* NIC checksumming done */ +#define PACKET_TAG_ENCAP 11 /* Encap. processing */ +#define PACKET_TAG_IPSEC_SOCKET 12 /* IPSEC socket ref */ +#define PACKET_TAG_IPSEC_HISTORY 13 /* IPSEC history */ +#define PACKET_TAG_IPV6_INPUT 14 /* IPV6 input processing */ +#define PACKET_TAG_DUMMYNET 15 /* dummynet info */ +#define PACKET_TAG_DIVERT 17 /* divert info */ +#define PACKET_TAG_IPFORWARD 18 /* ipforward info */ +#define PACKET_TAG_MACLABEL (19 | MTAG_PERSISTENT) /* MAC label */ +#define PACKET_TAG_PF 21 /* PF + ALTQ information */ +#define PACKET_TAG_RTSOCKFAM 25 /* rtsock sa family */ +#define PACKET_TAG_IPOPTIONS 27 /* Saved IP options */ +#define PACKET_TAG_CARP 28 /* CARP info */ + +#endif /* !_SYS_MBUF_H_ */ diff --git a/dummynet2/include/sys/module.h b/dummynet2/include/sys/module.h new file mode 100644 index 0000000..85bf220 --- /dev/null +++ b/dummynet2/include/sys/module.h @@ -0,0 +1,41 @@ +/* + * trivial module support + */ +#ifndef _SYS_MODULE_H_ +#define _SYS_MODULE_H_ +typedef struct module *module_t; +typedef int (*modeventhand_t)(module_t, int /* modeventtype_t */, void *); + +typedef enum modeventtype { + MOD_LOAD, + MOD_UNLOAD, + MOD_SHUTDOWN, + MOD_QUIESCE +} modeventtype_t; + +typedef struct moduledata { + const char *name; /* module name */ + modeventhand_t evhand; /* event handler */ + void *priv; /* extra data */ +} moduledata_t; + +/* + * 
Hook the module descriptor, md, into our list of things to do. + * We should in principle respect the order of loading. + * + * XXX use the gcc .init functions + */ +#define DECLARE_MODULE(a, md, c,d) \ + moduledata_t *moddesc_##a = &md; + +/* + * XXX MODULE_VERSION is define in linux too + */ +#define MODULE_DEPEND(a,b,c,d,e) +#if defined( __linux__ ) || defined( _WIN32 ) +#undef MODULE_VERSION +#define MODULE_VERSION(a,b) +#endif + +#endif /* _SYS_MODULE_H_ */ + diff --git a/dummynet2/include/sys/param.h b/dummynet2/include/sys/param.h new file mode 100644 index 0000000..f068998 --- /dev/null +++ b/dummynet2/include/sys/param.h @@ -0,0 +1,11 @@ +#ifndef _SYS_PARAM_H_ +#define _SYS_PARAM_H_ + +/* + * number of additional groups + */ +#ifndef LINUX_24 +#define NGROUPS 16 +#endif + +#endif /* _SYS_PARAM_H_ */ diff --git a/dummynet2/include/sys/queue.h b/dummynet2/include/sys/queue.h new file mode 100644 index 0000000..3630218 --- /dev/null +++ b/dummynet2/include/sys/queue.h @@ -0,0 +1,623 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + * $FreeBSD: src/sys/sys/queue.h,v 1.68 2006/10/24 11:20:29 ru Exp $ + */ + +#ifndef _SYS_QUEUE_H_ +#define _SYS_QUEUE_H_ + +//#include + +/* + * This file defines four types of data structures: singly-linked lists, + * singly-linked tail queues, lists and tail queues. + * + * A singly-linked list is headed by a single forward pointer. The elements + * are singly linked for minimum space and pointer manipulation overhead at + * the expense of O(n) removal for arbitrary elements. New elements can be + * added to the list after an existing element or at the head of the list. + * Elements being removed from the head of the list should use the explicit + * macro for this purpose for optimum efficiency. A singly-linked list may + * only be traversed in the forward direction. Singly-linked lists are ideal + * for applications with large datasets and few or no removals or for + * implementing a LIFO queue. + * + * A singly-linked tail queue is headed by a pair of pointers, one to the + * head of the list and the other to the tail of the list. 
The elements are + * singly linked for minimum space and pointer manipulation overhead at the + * expense of O(n) removal for arbitrary elements. New elements can be added + * to the list after an existing element, at the head of the list, or at the + * end of the list. Elements being removed from the head of the tail queue + * should use the explicit macro for this purpose for optimum efficiency. + * A singly-linked tail queue may only be traversed in the forward direction. + * Singly-linked tail queues are ideal for applications with large datasets + * and few or no removals or for implementing a FIFO queue. + * + * A list is headed by a single forward pointer (or an array of forward + * pointers for a hash table header). The elements are doubly linked + * so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before + * or after an existing element or at the head of the list. A list + * may only be traversed in the forward direction. + * + * A tail queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or + * after an existing element, at the head of the list, or at the end of + * the list. A tail queue may be traversed in either direction. + * + * For details on the use of these macros, see the queue(3) manual page. 
+ * + * + * SLIST LIST STAILQ TAILQ + * _HEAD + + + + + * _HEAD_INITIALIZER + + + + + * _ENTRY + + + + + * _INIT + + + + + * _EMPTY + + + + + * _FIRST + + + + + * _NEXT + + + + + * _PREV - - - + + * _LAST - - + + + * _FOREACH + + + + + * _FOREACH_SAFE + + + + + * _FOREACH_REVERSE - - - + + * _FOREACH_REVERSE_SAFE - - - + + * _INSERT_HEAD + + + + + * _INSERT_BEFORE - + - + + * _INSERT_AFTER + + + + + * _INSERT_TAIL - - + + + * _CONCAT - - + + + * _REMOVE_HEAD + - + - + * _REMOVE + + + + + * + */ +#ifdef QUEUE_MACRO_DEBUG +/* Store the last 2 places the queue element or head was altered */ +struct qm_trace { + char * lastfile; + int lastline; + char * prevfile; + int prevline; +}; + +#define TRACEBUF struct qm_trace trace; +#define TRASHIT(x) do {(x) = (void *)-1;} while (0) + +#define QMD_TRACE_HEAD(head) do { \ + (head)->trace.prevline = (head)->trace.lastline; \ + (head)->trace.prevfile = (head)->trace.lastfile; \ + (head)->trace.lastline = __LINE__; \ + (head)->trace.lastfile = __FILE__; \ +} while (0) + +#define QMD_TRACE_ELEM(elem) do { \ + (elem)->trace.prevline = (elem)->trace.lastline; \ + (elem)->trace.prevfile = (elem)->trace.lastfile; \ + (elem)->trace.lastline = __LINE__; \ + (elem)->trace.lastfile = __FILE__; \ +} while (0) + +#else +#define QMD_TRACE_ELEM(elem) +#define QMD_TRACE_HEAD(head) +#define TRACEBUF +#define TRASHIT(x) +#endif /* QUEUE_MACRO_DEBUG */ + +/* + * Singly-linked List declarations. + */ +#define SLIST_HEAD(name, type) \ +struct name { \ + struct type *slh_first; /* first element */ \ +} + +#define SLIST_HEAD_INITIALIZER(head) \ + { NULL } + +#if defined( _WIN32 ) && defined(SLIST_ENTRY) +#undef SLIST_ENTRY +#endif +#define SLIST_ENTRY(type) \ +struct { \ + struct type *sle_next; /* next element */ \ +} + +/* + * Singly-linked List functions. 
+ */ +#define SLIST_EMPTY(head) ((head)->slh_first == NULL) + +#define SLIST_FIRST(head) ((head)->slh_first) + +#define SLIST_FOREACH(var, head, field) \ + for ((var) = SLIST_FIRST((head)); \ + (var); \ + (var) = SLIST_NEXT((var), field)) + +#define SLIST_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = SLIST_FIRST((head)); \ + (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \ + for ((varp) = &SLIST_FIRST((head)); \ + ((var) = *(varp)) != NULL; \ + (varp) = &SLIST_NEXT((var), field)) + +#define SLIST_INIT(head) do { \ + SLIST_FIRST((head)) = NULL; \ +} while (0) + +#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ + SLIST_NEXT((slistelm), field) = (elm); \ +} while (0) + +#define SLIST_INSERT_HEAD(head, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ + SLIST_FIRST((head)) = (elm); \ +} while (0) + +#define SLIST_NEXT(elm, field) ((elm)->field.sle_next) + +#define SLIST_REMOVE(head, elm, type, field) do { \ + if (SLIST_FIRST((head)) == (elm)) { \ + SLIST_REMOVE_HEAD((head), field); \ + } \ + else { \ + struct type *curelm = SLIST_FIRST((head)); \ + while (SLIST_NEXT(curelm, field) != (elm)) \ + curelm = SLIST_NEXT(curelm, field); \ + SLIST_NEXT(curelm, field) = \ + SLIST_NEXT(SLIST_NEXT(curelm, field), field); \ + } \ + TRASHIT((elm)->field.sle_next); \ +} while (0) + +#define SLIST_REMOVE_HEAD(head, field) do { \ + SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ +} while (0) + +/* + * Singly-linked Tail queue declarations. 
+ */ +#define STAILQ_HEAD(name, type) \ +struct name { \ + struct type *stqh_first;/* first element */ \ + struct type **stqh_last;/* addr of last next element */ \ +} + +#define STAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).stqh_first } + +#define STAILQ_ENTRY(type) \ +struct { \ + struct type *stqe_next; /* next element */ \ +} + +/* + * Singly-linked Tail queue functions. + */ +#define STAILQ_CONCAT(head1, head2) do { \ + if (!STAILQ_EMPTY((head2))) { \ + *(head1)->stqh_last = (head2)->stqh_first; \ + (head1)->stqh_last = (head2)->stqh_last; \ + STAILQ_INIT((head2)); \ + } \ +} while (0) + +#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) + +#define STAILQ_FIRST(head) ((head)->stqh_first) + +#define STAILQ_FOREACH(var, head, field) \ + for((var) = STAILQ_FIRST((head)); \ + (var); \ + (var) = STAILQ_NEXT((var), field)) + + +#define STAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = STAILQ_FIRST((head)); \ + (var) && ((tvar) = STAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define STAILQ_INIT(head) do { \ + STAILQ_FIRST((head)) = NULL; \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_NEXT((tqelm), field) = (elm); \ +} while (0) + +#define STAILQ_INSERT_HEAD(head, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_FIRST((head)) = (elm); \ +} while (0) + +#define STAILQ_INSERT_TAIL(head, elm, field) do { \ + STAILQ_NEXT((elm), field) = NULL; \ + *(head)->stqh_last = (elm); \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ +} while (0) + +#define STAILQ_LAST(head, type, field) \ + (STAILQ_EMPTY((head)) ? 
\ + NULL : \ + ((struct type *)(void *) \ + ((char *)((head)->stqh_last) - __offsetof(struct type, field)))) + +#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) + +/* + * Unlink elm from the queue. The predecessor search has no end-of-queue + * check, so elm must be on the queue; stqh_last is repointed when the + * removed element was the last one. + */ +#define STAILQ_REMOVE(head, elm, type, field) do { \ + if (STAILQ_FIRST((head)) == (elm)) { \ + STAILQ_REMOVE_HEAD((head), field); \ + } \ + else { \ + struct type *curelm = STAILQ_FIRST((head)); \ + while (STAILQ_NEXT(curelm, field) != (elm)) \ + curelm = STAILQ_NEXT(curelm, field); \ + if ((STAILQ_NEXT(curelm, field) = \ + STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((curelm), field);\ + } \ + TRASHIT((elm)->field.stqe_next); \ +} while (0) + +#define STAILQ_REMOVE_HEAD(head, field) do { \ + if ((STAILQ_FIRST((head)) = \ + STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#ifndef LIST_HEAD +/* + * List declarations (this whole LIST_* section is skipped when LIST_HEAD + * is already defined). + */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#define LIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define LIST_ENTRY(type) \ +struct { \ + struct type *le_next; /* next element */ \ + struct type **le_prev; /* address of previous next element */ \ +} + +/* + * List functions.
+ */ + +/* + * Link sanity checks: they panic() when a back-pointer does not match and + * expand to nothing unless both _KERNEL and INVARIANTS are defined. + */ +#if (defined(_KERNEL) && defined(INVARIANTS)) +#define QMD_LIST_CHECK_HEAD(head, field) do { \ + if (LIST_FIRST((head)) != NULL && \ + LIST_FIRST((head))->field.le_prev != \ + &LIST_FIRST((head))) \ + panic("Bad list head %p first->prev != head", (head)); \ +} while (0) + +#define QMD_LIST_CHECK_NEXT(elm, field) do { \ + if (LIST_NEXT((elm), field) != NULL && \ + LIST_NEXT((elm), field)->field.le_prev != \ + &((elm)->field.le_next)) \ + panic("Bad link elm %p next->prev != elm", (elm)); \ +} while (0) + +#define QMD_LIST_CHECK_PREV(elm, field) do { \ + if (*(elm)->field.le_prev != (elm)) \ + panic("Bad link elm %p prev->next != elm", (elm)); \ +} while (0) +#else +#define QMD_LIST_CHECK_HEAD(head, field) +#define QMD_LIST_CHECK_NEXT(elm, field) +#define QMD_LIST_CHECK_PREV(elm, field) +#endif /* (_KERNEL && INVARIANTS) */ + +#define LIST_EMPTY(head) ((head)->lh_first == NULL) + +#define LIST_FIRST(head) ((head)->lh_first) + +#define LIST_FOREACH(var, head, field) \ + for ((var) = LIST_FIRST((head)); \ + (var); \ + (var) = LIST_NEXT((var), field)) + +#define LIST_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = LIST_FIRST((head)); \ + (var) && ((tvar) = LIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define LIST_INIT(head) do { \ + LIST_FIRST((head)) = NULL; \ +} while (0) + +#define LIST_INSERT_AFTER(listelm, elm, field) do { \ + QMD_LIST_CHECK_NEXT(listelm, field); \ + if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ + LIST_NEXT((listelm), field)->field.le_prev = \ + &LIST_NEXT((elm), field); \ + LIST_NEXT((listelm), field) = (elm); \ + (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ +} while (0) + +#define LIST_INSERT_BEFORE(listelm, elm, field) do { \ + QMD_LIST_CHECK_PREV(listelm, field); \ + (elm)->field.le_prev = (listelm)->field.le_prev; \ + LIST_NEXT((elm), field) = (listelm); \ + *(listelm)->field.le_prev = (elm); \ + (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ +} while (0) + +#define
LIST_INSERT_HEAD(head, elm, field) do { \ + QMD_LIST_CHECK_HEAD((head), field); \ + if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ + LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\ + LIST_FIRST((head)) = (elm); \ + (elm)->field.le_prev = &LIST_FIRST((head)); \ +} while (0) + +#define LIST_NEXT(elm, field) ((elm)->field.le_next) + +/* + * Unlink elm through its le_prev back-pointer; no list head argument is + * needed. Both link fields of elm are passed to TRASHIT() afterwards. + */ +#define LIST_REMOVE(elm, field) do { \ + QMD_LIST_CHECK_NEXT(elm, field); \ + QMD_LIST_CHECK_PREV(elm, field); \ + if (LIST_NEXT((elm), field) != NULL) \ + LIST_NEXT((elm), field)->field.le_prev = \ + (elm)->field.le_prev; \ + *(elm)->field.le_prev = LIST_NEXT((elm), field); \ + TRASHIT((elm)->field.le_next); \ + TRASHIT((elm)->field.le_prev); \ +} while (0) +#endif /* LIST_HEAD */ + +/* + * Tail queue declarations. + */ +#define TAILQ_HEAD(name, type) \ +struct name { \ + struct type *tqh_first; /* first element */ \ + struct type **tqh_last; /* addr of last next element */ \ + TRACEBUF \ +} + +#define TAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).tqh_first } + +#define TAILQ_ENTRY(type) \ +struct { \ + struct type *tqe_next; /* next element */ \ + struct type **tqe_prev; /* address of previous next element */ \ + TRACEBUF \ +} + +/* + * Tail queue functions.
+ */ +#if (defined(_KERNEL) && defined(INVARIANTS)) +#define QMD_TAILQ_CHECK_HEAD(head, field) do { \ + if (!TAILQ_EMPTY(head) && \ + TAILQ_FIRST((head))->field.tqe_prev != \ + &TAILQ_FIRST((head))) \ + panic("Bad tailq head %p first->prev != head", (head)); \ +} while (0) + +#define QMD_TAILQ_CHECK_TAIL(head, field) do { \ + if (*(head)->tqh_last != NULL) \ + panic("Bad tailq NEXT(%p->tqh_last) != NULL", (head)); \ +} while (0) + +#define QMD_TAILQ_CHECK_NEXT(elm, field) do { \ + if (TAILQ_NEXT((elm), field) != NULL && \ + TAILQ_NEXT((elm), field)->field.tqe_prev != \ + &((elm)->field.tqe_next)) \ + panic("Bad link elm %p next->prev != elm", (elm)); \ +} while (0) + +#define QMD_TAILQ_CHECK_PREV(elm, field) do { \ + if (*(elm)->field.tqe_prev != (elm)) \ + panic("Bad link elm %p prev->next != elm", (elm)); \ +} while (0) +#else +#define QMD_TAILQ_CHECK_HEAD(head, field) +#define QMD_TAILQ_CHECK_TAIL(head, headname) +#define QMD_TAILQ_CHECK_NEXT(elm, field) +#define QMD_TAILQ_CHECK_PREV(elm, field) +#endif /* (_KERNEL && INVARIANTS) */ + +/* + * Move every element of head2 to the tail of head1 and re-initialize head2; + * nothing is done when head2 is empty. + */ +#define TAILQ_CONCAT(head1, head2, field) do { \ + if (!TAILQ_EMPTY(head2)) { \ + *(head1)->tqh_last = (head2)->tqh_first; \ + (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \ + (head1)->tqh_last = (head2)->tqh_last; \ + TAILQ_INIT((head2)); \ + QMD_TRACE_HEAD(head1); \ + QMD_TRACE_HEAD(head2); \ + } \ +} while (0) + +#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) + +#define TAILQ_FIRST(head) ((head)->tqh_first) + +#define TAILQ_FOREACH(var, head, field) \ + for ((var) = TAILQ_FIRST((head)); \ + (var); \ + (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = TAILQ_FIRST((head)); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var); \ + (var) = TAILQ_PREV((var), headname, field)) + +#define TAILQ_FOREACH_REVERSE_SAFE(var, head,
headname, field, tvar) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ + (var) = (tvar)) + +#define TAILQ_INIT(head) do { \ + TAILQ_FIRST((head)) = NULL; \ + (head)->tqh_last = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ +} while (0) + +/* + * Insert elm right after listelm; tqh_last is updated when listelm was the + * last element. listelm is parenthesized in every expansion so that any + * pointer expression can be passed. + */ +#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + QMD_TAILQ_CHECK_NEXT(listelm, field); \ + if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else { \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + } \ + TAILQ_NEXT((listelm), field) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&(listelm)->field); \ +} while (0) + +/* + * Insert elm right before listelm using listelm's tqe_prev back-pointer; + * no head argument is needed. + */ +#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ + QMD_TAILQ_CHECK_PREV(listelm, field); \ + (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ + TAILQ_NEXT((elm), field) = (listelm); \ + *(listelm)->field.tqe_prev = (elm); \ + (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&(listelm)->field); \ +} while (0) + +#define TAILQ_INSERT_HEAD(head, elm, field) do { \ + QMD_TAILQ_CHECK_HEAD(head, field); \ + if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ + TAILQ_FIRST((head))->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + TAILQ_FIRST((head)) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_INSERT_TAIL(head, elm, field) do { \ + QMD_TAILQ_CHECK_TAIL(head, field); \ + TAILQ_NEXT((elm), field) = NULL; \ + (elm)->field.tqe_prev = (head)->tqh_last; \ + *(head)->tqh_last = (elm); \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while
(0) + +/* + * TAILQ_LAST and TAILQ_PREV reinterpret a pointer-to-link as a pointer to + * the head structure, which works because TAILQ_HEAD and TAILQ_ENTRY both + * start with an element pointer followed by a pointer-to-pointer. + */ +#define TAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) + +#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) + +#define TAILQ_PREV(elm, headname, field) \ + (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) + +#define TAILQ_REMOVE(head, elm, field) do { \ + QMD_TAILQ_CHECK_NEXT(elm, field); \ + QMD_TAILQ_CHECK_PREV(elm, field); \ + if ((TAILQ_NEXT((elm), field)) != NULL) \ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + (elm)->field.tqe_prev; \ + else { \ + (head)->tqh_last = (elm)->field.tqe_prev; \ + QMD_TRACE_HEAD(head); \ + } \ + *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ + TRASHIT((elm)->field.tqe_next); \ + TRASHIT((elm)->field.tqe_prev); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + + +#ifdef _KERNEL + +/* + * XXX insque() and remque() are an old way of handling certain queues. + * They bogusly assume that all queue heads look alike. + */ + +struct quehead { + struct quehead *qh_link; + struct quehead *qh_rlink; +}; + +#ifdef __CC_SUPPORTS___INLINE + +/* Link element a into the doubly-linked queue immediately after b. */ +static __inline void +insque(void *a, void *b) +{ + struct quehead *element = (struct quehead *)a, + *head = (struct quehead *)b; + + element->qh_link = head->qh_link; + element->qh_rlink = head; + head->qh_link = element; + element->qh_link->qh_rlink = element; +} + +/* Unlink a from its neighbours; only a's qh_rlink is cleared afterwards. */ +static __inline void +remque(void *a) +{ + struct quehead *element = (struct quehead *)a; + + element->qh_link->qh_rlink = element->qh_rlink; + element->qh_rlink->qh_link = element->qh_link; + element->qh_rlink = 0; +} + +#else /* !__CC_SUPPORTS___INLINE */ + +void insque(void *a, void *b); +void remque(void *a); + +#endif /* __CC_SUPPORTS___INLINE */ + +#endif /* _KERNEL */ + +#endif /* !_SYS_QUEUE_H_ */ diff --git a/dummynet2/include/sys/syslog.h b/dummynet2/include/sys/syslog.h new file mode 100644 index 0000000..143df1f --- /dev/null +++ b/dummynet2/include/sys/syslog.h @@ -0,0 +1,7 @@ +#ifndef _SYS_SYSLOG_H_ +#define _SYS_SYSLOG_H_ +/* XXX find linux equivalent */
+#define LOG_SECURITY 0 +#define LOG_NOTICE 0 +#define LOG_DEBUG 0 +#endif /* _SYS_SYSLOG_H_ */ diff --git a/dummynet2/include/sys/systm.h b/dummynet2/include/sys/systm.h new file mode 100644 index 0000000..e98335e --- /dev/null +++ b/dummynet2/include/sys/systm.h @@ -0,0 +1,126 @@ +#ifndef _SYS_SYSTM_H_ +#define _SYS_SYSTM_H_ + +#define CALLOUT_ACTIVE 0x0002 /* callout is currently active */ +#define CALLOUT_MPSAFE 0x0008 /* callout handler is mp safe */ + +#ifndef _WIN32 /* this is the linux version */ +/* callout support, in on FreeBSD */ +/* + * callout support on linux module is done using timers + */ +#include +#ifdef LINUX_24 +#include /* jiffies definition is here in 2.4 */ +#endif +#define callout timer_list +/* + * Arm timer co so that fn(arg) runs 'ticks' jiffies from now. + * fn is stored through a cast to the timer's (unsigned long) handler type + * and arg is carried in co->data. The cpu argument is not used by this + * implementation. Always returns 0. + */ +static __inline int +callout_reset_on(struct callout *co, int ticks, void (*fn)(void *), void *arg, int cpu) +{ + co->expires = jiffies + ticks; + co->function = (void (*)(unsigned long))fn; + co->data = (unsigned long)arg; + /* + * Linux 2.6.31 and above has add_timer_on(co, cpu), + * otherwise add_timer() always schedules a callout on the same + * CPU used the first time, so we don't need more.
+ */ + add_timer(co); + return 0; +} + +/* + * The remaining callout calls map directly onto Linux timer calls; the + * 'safe' argument of callout_init() is dropped, and callout_drain() expands + * to the same del_timer() call as callout_stop(). + * NOTE(review): confirm that no caller relies on callout_drain() waiting + * for a handler that is already running. + */ +#define callout_init(co, safe) init_timer(co) +#define callout_drain(co) del_timer(co) +#define callout_stop(co) del_timer(co) + +#else /* _WIN32 */ +#include + +/* This is the windows part for callout support */ +struct callout { + KTIMER thetimer; + KDPC timerdpc; + int dpcinitialized; /* set once the dummynet DPC has been set up */ + LARGE_INTEGER duetime; +}; + +void dummynet (void*); +VOID dummynet_dpc( + __in struct _KDPC *Dpc, + __in_opt PVOID DeferredContext, + __in_opt PVOID SystemArgument1, + __in_opt PVOID SystemArgument2 + ); + +VOID ipfw_dpc( + __in struct _KDPC *Dpc, + __in_opt PVOID DeferredContext, + __in_opt PVOID SystemArgument1, + __in_opt PVOID SystemArgument2 + ); + +/* callout_reset must handle two problems: + * - dummynet() scheduler must be run always on the same processor + * because do_gettimeofday() is based on cpu performance counter, and + * _occasionally_ can leap backward in time if we query another cpu. + * typically this won't happen that much, and the cpu will almost always + * be the same even without the affinity restriction, but better to be sure. + * - ipfw_tick() does not have the granularity requirements of dummynet() + * but we need to pass a pointer as argument. + * + * for these reasons, if we are called for dummynet() timer, + * KeInitializeDpc is called only once as it should be, and the thread + * is forced on cpu0 (which is always present), while if we're called + * for ipfw_tick(), we re-initialize the DPC each time, using + * parameter DeferredContext to pass the needed pointer. since this + * timer is called only once a sec, this won't hurt that much.
+ */ +static __inline int +callout_reset_on(struct callout *co, int ticks, void (*fn)(void *), void *arg, int cpu) +{ + if(fn == &dummynet) + { + /* dummynet timer: set the DPC up once and bind it to 'cpu' */ + if(co->dpcinitialized == 0) + { + KeInitializeDpc(&co->timerdpc, dummynet_dpc, NULL); + KeSetTargetProcessorDpc(&co->timerdpc, cpu); + co->dpcinitialized = 1; + } + } + else + { + /* any other handler: re-initialize each time so arg reaches ipfw_dpc */ + KeInitializeDpc(&co->timerdpc, ipfw_dpc, arg); + } + /* negative = relative due time; widen before scaling so that a large tick count cannot overflow int */ + co->duetime.QuadPart = -(LONGLONG)ticks * 10000; + KeSetTimer(&co->thetimer, co->duetime, &co->timerdpc); + return 0; +} + +/* Initialize the kernel timer inside co; the 'safe' argument is ignored. */ +static __inline void +callout_init(struct callout* co, int safe) +{ + printf("%s: initializing timer at %p\n",__FUNCTION__,co); + KeInitializeTimer(&co->thetimer); +} + +/* + * Cancel the timer. KeCancelTimer() is retried until it returns TRUE, so + * this loops for as long as the call keeps returning FALSE. + * NOTE(review): presumably relies on the handler re-arming the timer - + * confirm this cannot spin forever on a timer that is never set again. + */ +static __inline int +callout_drain(struct callout* co) +{ + BOOLEAN canceled = KeCancelTimer(&co->thetimer); + while (canceled != TRUE) + { + canceled = KeCancelTimer(&co->thetimer); + } + printf("%s: stopping timer at %p\n",__FUNCTION__,co); + return 0; +} + +/* Same as callout_drain() in this implementation. */ +static __inline int +callout_stop(struct callout* co) +{ + return callout_drain(co); +} + +#endif /* _WIN32 */ + +#endif /* _SYS_SYSTM_H_ */ diff --git a/dummynet2/include/sys/taskqueue.h b/dummynet2/include/sys/taskqueue.h new file mode 100644 index 0000000..43efdd5 --- /dev/null +++ b/dummynet2/include/sys/taskqueue.h @@ -0,0 +1,34 @@ +#ifndef _SYS_TASKQUEUE_H_ +#define _SYS_TASKQUEUE_H_ + +/* + * Remap taskqueue to direct calls: taskqueue_enqueue() invokes the stored + * function immediately and TASK_INIT() only records its third argument. + */ + +#ifdef _WIN32 +struct task { + void (*func)(void*, int); +}; +#define taskqueue_enqueue(tq, ta) (ta)->func(NULL,1) +#define TASK_INIT(a,b,c,d) do { \ + (a)->func = (c); } while (0) +#else +struct task { + void (*func)(void); +}; +#define taskqueue_enqueue(tq, ta) (ta)->func() +#define TASK_INIT(a,b,c,d) do { \ + (a)->func = (void (*)(void))(c); } while (0) + +#endif +#define taskqueue_create_fast(_a, _b, _c, _d) NULL +#define taskqueue_start_threads(_a, _b, _c, _d) + +#define taskqueue_drain(_a, _b) /* XXX to be completed */ +#define taskqueue_free(_a) /* XXX to be completed */ + +#define PRI_MIN (0) /* Highest priority.
*/ +#define PRI_MIN_ITHD (PRI_MIN) +#define PI_NET (PRI_MIN_ITHD + 16) + +#endif /* !_SYS_TASKQUEUE_H_ */ diff --git a/dummynet2/ip_dn_glue.c b/dummynet2/ip_dn_glue.c new file mode 100644 index 0000000..0df0829 --- /dev/null +++ b/dummynet2/ip_dn_glue.c @@ -0,0 +1,845 @@ +/*- + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * $Id: ip_dn_glue.c 6031 2010-04-09 15:25:41Z svn_panicucci $ + * + * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8 + */ + +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include +#include /* ip_output(), IP_FORWARDING */ +#include +#include +#include +#include +#include +#include + +/* + * FREEBSD7.2 ip_dummynet.h r191715 + * + * The heap structures below are embedded in both dn_pipe7 and dn_pipe8. + */ + +struct dn_heap_entry7 { + int64_t key; /* sorting key. Topmost element is smallest one */ + void *object; /* object pointer */ +}; + +struct dn_heap7 { + int size; + int elements; + int offset; /* XXX if > 0 this is the offset of direct ptr to obj */ + struct dn_heap_entry7 *p; /* really an array of "size" entries */ +}; + +/* Common to 7.2 and 8 */ +struct dn_flow_set { + SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */ + + u_short fs_nr ; /* flow_set number */ + u_short flags_fs; +#define DNOLD_HAVE_FLOW_MASK 0x0001 +#define DNOLD_IS_RED 0x0002 +#define DNOLD_IS_GENTLE_RED 0x0004 +#define DNOLD_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */ +#define DNOLD_NOERROR 0x0010 /* do not report ENOBUFS on drops */ +#define DNOLD_HAS_PROFILE 0x0020 /* the pipe has a delay profile.
*/ +#define DNOLD_IS_PIPE 0x4000 +#define DNOLD_IS_QUEUE 0x8000 + + struct dn_pipe7 *pipe ; /* pointer to parent pipe */ + u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */ + + int weight ; /* WFQ queue weight */ + int qsize ; /* queue size in slots or bytes */ + int plr ; /* pkt loss rate (2^31-1 means 100%) */ + + struct ipfw_flow_id flow_mask ; + + /* hash table of queues onto this flow_set */ + int rq_size ; /* number of slots */ + int rq_elements ; /* active elements */ + struct dn_flow_queue7 **rq; /* array of rq_size entries */ + + u_int32_t last_expired ; /* do not expire too frequently */ + int backlogged ; /* #active queues for this flowset */ + + /* RED parameters */ +#define SCALE_RED 16 +#define SCALE(x) ( (x) << SCALE_RED ) +#define SCALE_VAL(x) ( (x) >> SCALE_RED ) +#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ + u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ + u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ + u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ + u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ + u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ + u_int lookup_depth ; /* depth of lookup table */ + int lookup_step ; /* granularity inside the lookup table */ + int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ + int avg_pkt_size ; /* average packet size */ + int max_pkt_size ; /* max packet size */ +}; +SLIST_HEAD(dn_flow_set_head, dn_flow_set); + +/* + * Type tags stored in place of the 'next' link when records are exported + * to userland (see dn_c_copy_pipe() and dn_c_copy_fs()). + */ +#define DN_IS_PIPE 0x4000 +#define DN_IS_QUEUE 0x8000 +struct dn_flow_queue7 { + struct dn_flow_queue7 *next ; + struct ipfw_flow_id id ; + + struct mbuf *head, *tail ; /* queue of packets */ + u_int len ; + u_int len_bytes ; + + u_long numbytes; + + u_int64_t tot_pkts ; /* statistics counters */ + u_int64_t tot_bytes ; + u_int32_t
drops ; + + int hash_slot ; /* debugging/diagnostic */ + + /* RED parameters */ + int avg ; /* average queue length est. (scaled) */ + int count ; /* arrivals since last RED drop */ + int random ; /* random value (scaled) */ + u_int32_t q_time; /* start of queue idle time */ + + /* WF2Q+ support */ + struct dn_flow_set *fs ; /* parent flow set */ + int heap_pos ; /* position (index) of struct in heap */ + int64_t sched_time ; /* current time when queue enters ready_heap */ + + int64_t S,F ; /* start time, finish time */ +}; + +struct dn_pipe7 { /* a pipe, FreeBSD 7.2 layout */ + SLIST_ENTRY(dn_pipe7) next; /* linked list in a hash slot */ + + int pipe_nr ; /* number */ + int bandwidth; /* really, bytes/tick. */ + int delay ; /* really, ticks */ + + struct mbuf *head, *tail ; /* packets in delay line */ + + /* WF2Q+ */ + struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ + struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ + struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ + + int64_t V ; /* virtual time */ + int sum; /* sum of weights of all active sessions */ + + int numbytes; /* plain int here, int64_t in dn_pipe8 */ + + int64_t sched_time ; /* time pipe was scheduled in ready_heap */ + + /* + * When the tx clock comes from an interface (if_name[0] != '\0'), its name + * is stored below, whereas the ifp is filled when the rule is configured.
+ */ + char if_name[IFNAMSIZ]; + struct ifnet *ifp ; + int ready ; /* set if ifp != NULL and we got a signal from it */ + + struct dn_flow_set fs ; /* used with fixed-rate flows */ +}; +SLIST_HEAD(dn_pipe_head7, dn_pipe7); + + +/* FREEBSD8 ip_dummynet.h r196045: layouts used when is7 == 0 */ +struct dn_flow_queue8 { + struct dn_flow_queue8 *next ; + struct ipfw_flow_id id ; + + struct mbuf *head, *tail ; /* queue of packets */ + u_int len ; + u_int len_bytes ; + + uint64_t numbytes ; /* credit for transmission (dynamic queues) */ + int64_t extra_bits; /* extra bits simulating unavailable channel */ + + u_int64_t tot_pkts ; /* statistics counters */ + u_int64_t tot_bytes ; + u_int32_t drops ; + + int hash_slot ; /* debugging/diagnostic */ + + /* RED parameters */ + int avg ; /* average queue length est. (scaled) */ + int count ; /* arrivals since last RED drop */ + int random ; /* random value (scaled) */ + int64_t idle_time; /* start of queue idle time */ + + /* WF2Q+ support */ + struct dn_flow_set *fs ; /* parent flow set */ + int heap_pos ; /* position (index) of struct in heap */ + int64_t sched_time ; /* current time when queue enters ready_heap */ + + int64_t S,F ; /* start time, finish time */ +}; + +struct dn_pipe8 { /* a pipe, FreeBSD 8 layout */ + SLIST_ENTRY(dn_pipe8) next; /* linked list in a hash slot */ + + int pipe_nr ; /* number */ + int bandwidth; /* really, bytes/tick. */ + int delay ; /* really, ticks */ + + struct mbuf *head, *tail ; /* packets in delay line */ + + /* WF2Q+ */ + struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ + struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ + struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ + + int64_t V ; /* virtual time */ + int sum; /* sum of weights of all active sessions */ + + /* Same as in dn_flow_queue, numbytes can become large */ + int64_t numbytes; /* bits I can transmit (more or less).
*/ + uint64_t burst; /* burst size, scaled: bits * hz */ + + int64_t sched_time ; /* time pipe was scheduled in ready_heap */ + int64_t idle_time; /* start of pipe idle time */ + + char if_name[IFNAMSIZ]; + struct ifnet *ifp ; + int ready ; /* set if ifp != NULL and we got a signal from it */ + + struct dn_flow_set fs ; /* used with fixed-rate flows */ + + /* fields to simulate a delay profile */ +#define ED_MAX_NAME_LEN 32 + char name[ED_MAX_NAME_LEN]; + int loss_level; + int samples_no; + int *samples; +}; + +#define ED_MAX_SAMPLES_NO 1024 +struct dn_pipe_max8 { + struct dn_pipe8 pipe; + int samples[ED_MAX_SAMPLES_NO]; +}; +SLIST_HEAD(dn_pipe_head8, dn_pipe8); + +/* + * Changes from 7.2 to 8: + * dn_pipe: + * numbytes from int to int64_t + * add burst (uint64_t) + * add idle_time (int64_t) + * add profile + * add struct dn_pipe_max + * add flag DN_HAS_PROFILE + * + * dn_flow_queue + * numbytes from u_long to uint64_t + * add extra_bits (int64_t) + * q_time from u_int32_t to int64_t and renamed idle_time + * + * dn_flow_set unchanged + * + */ + +/* NOTE:XXX copied from dummynet.c */ +#define O_NEXT(p, len) ((void *)((char *)p + len)) +/* Fill an object header: len, type and id as given, subtype set to 0. */ +static void +oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) +{ + oid->len = len; + oid->type = type; + oid->subtype = 0; + oid->id = id; +} +/* + * Make room in the buffer and move the pointer forward: a header of the + * given len and type (id 0) is written at *o, *o is advanced by len bytes + * and the start of the object just reserved is returned. + */ +static void * +o_next(struct dn_id **o, int len, int type) +{ + struct dn_id *ret = *o; + oid_fill(ret, len, type, 0); + *o = O_NEXT(*o, len); + return ret; +} + + +static size_t pipesize7 = sizeof(struct dn_pipe7); +static size_t pipesize8 = sizeof(struct dn_pipe8); +static size_t pipesizemax8 = sizeof(struct dn_pipe_max8); + +/* Indicate 'ipfw' version + * 1: from FreeBSD 7.2 + * 0: from FreeBSD 8 + * -1: unknown (for now is unused) + * + * It is updated when an IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request arrives + * NOTE: if an IP_DUMMYNET_GET arrives and the 'ipfw' version is unknown, + * it is assumed to be the FreeBSD 8
version. + */ +static int is7 = 0; + +/* + * Translate flowset flags from the old DNOLD_* encoding to the current + * DN_* encoding. Bits that are not tested below are dropped. + */ +static int +convertflags2new(int src) +{ + int dst = 0; + + if (src & DNOLD_HAVE_FLOW_MASK) + dst |= DN_HAVE_MASK; + if (src & DNOLD_QSIZE_IS_BYTES) + dst |= DN_QSIZE_BYTES; + if (src & DNOLD_NOERROR) + dst |= DN_NOERROR; + if (src & DNOLD_IS_RED) + dst |= DN_IS_RED; + if (src & DNOLD_IS_GENTLE_RED) + dst |= DN_IS_GENTLE_RED; + if (src & DNOLD_HAS_PROFILE) + dst |= DN_HAS_PROFILE; + + return dst; +} + +/* + * Inverse of convertflags2new(): translate current DN_* flowset flags to + * the old DNOLD_* encoding. Bits that are not tested below are dropped. + */ +static int +convertflags2old(int src) +{ + int dst = 0; + + if (src & DN_HAVE_MASK) + dst |= DNOLD_HAVE_FLOW_MASK; + if (src & DN_IS_RED) + dst |= DNOLD_IS_RED; + if (src & DN_IS_GENTLE_RED) + dst |= DNOLD_IS_GENTLE_RED; + if (src & DN_NOERROR) + dst |= DNOLD_NOERROR; + if (src & DN_HAS_PROFILE) + dst |= DNOLD_HAS_PROFILE; + if (src & DN_QSIZE_BYTES) + dst |= DNOLD_QSIZE_IS_BYTES; + + return dst; +} + +/* + * Handle an old-format delete request. v is read as a dn_pipe7 or a + * dn_pipe8 depending on is7. Exactly one of pipe_nr and fs.fs_nr must be + * non-zero, otherwise EINVAL is returned; a DN_CMD_DELETE command is then + * built on the stack and handed to do_config(). + */ +static int +dn_compat_del(void *v) +{ + struct dn_pipe7 *p = (struct dn_pipe7 *) v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *) v; + struct { + struct dn_id oid; + uintptr_t a[1]; /* add more if we want a list */ + } cmd; + + /* XXX DN_API_VERSION ??? */ + oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); + + if (is7) { + if (p->pipe_nr == 0 && p->fs.fs_nr == 0) + return EINVAL; + if (p->pipe_nr != 0 && p->fs.fs_nr != 0) + return EINVAL; + } else { + if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0) + return EINVAL; + if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0) + return EINVAL; + } + + if (p->pipe_nr != 0) { /* pipe x delete */ + cmd.a[0] = p->pipe_nr; + cmd.oid.subtype = DN_LINK; + } else { /* queue x delete */ + cmd.oid.subtype = DN_FS; + cmd.a[0] = (is7) ?
p->fs.fs_nr : p8->fs.fs_nr; + } + + return do_config(&cmd, cmd.oid.len); +} + +/* + * Fill the new-style flowset fs from the old flowset embedded in v (a + * dn_pipe7 or dn_pipe8, selected by is7). The RED parameters are copied + * only when one of the RED flags is set. Always returns 0. + */ +static int +dn_compat_config_queue(struct dn_fs *fs, void* v) +{ + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + struct dn_flow_set *f; + + if (is7) + f = &p7->fs; + else + f = &p8->fs; + + fs->fs_nr = f->fs_nr; + fs->sched_nr = f->parent_nr; + fs->flow_mask = f->flow_mask; + fs->buckets = f->rq_size; + fs->qsize = f->qsize; + fs->plr = f->plr; + fs->par[0] = f->weight; + fs->flags = convertflags2new(f->flags_fs); + if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) { + fs->w_q = f->w_q; + fs->max_th = f->max_th; + fs->min_th = f->min_th; + fs->max_p = f->max_p; + } + + return 0; +} + +/* + * Split an old pipe description (v) into the three new objects: scheduler + * sch, link p and fifo flowset fs. With i the old pipe number, the + * scheduler and link get number i, the flowset gets i + 2*DN_MAX_ID and is + * attached to scheduler i + DN_MAX_ID. Always returns 0. + */ +static int +dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p, + struct dn_fs *fs, void* v) +{ + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + int i = p7->pipe_nr; + + sch->sched_nr = i; + sch->oid.subtype = 0; + p->link_nr = i; + fs->fs_nr = i + 2*DN_MAX_ID; + fs->sched_nr = i + DN_MAX_ID; + + /* Common to 7 and 8 */ + p->bandwidth = p7->bandwidth; + p->delay = p7->delay; + if (!is7) { + /* FreeBSD 8 has burst */ + p->burst = p8->burst; + } + + /* fill the fifo flowset */ + dn_compat_config_queue(fs, v); + /* dn_compat_config_queue() overwrote these two fields, set them again */ + fs->fs_nr = i + 2*DN_MAX_ID; + fs->sched_nr = i + DN_MAX_ID; + + /* Move scheduler related parameter from fs to sch */ + sch->buckets = fs->buckets; /*XXX*/ + fs->buckets = 0; + if (fs->flags & DN_HAVE_MASK) { + sch->flags |= DN_HAVE_MASK; + fs->flags &= ~DN_HAVE_MASK; + sch->sched_mask = fs->flow_mask; + bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id)); + } + + return 0; +} + +/* + * Fill the profile object pf for link p from the FreeBSD 8 pipe in v. + * v is treated as a struct dn_pipe_max8, whose trailing samples[] array + * is the source of the sample data. + */ +static int +dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p, + void *v) +{ + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + + p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]); + + pf->link_nr = p->link_nr; + pf->loss_level = p8->loss_level; +// pf->bandwidth = p->bandwidth; //XXX bandwidth
redundant? + pf->samples_no = p8->samples_no; + strncpy(pf->name, p8->name,sizeof(pf->name)); + bcopy(p8->samples, pf->samples, sizeof(pf->samples)); + + return 0; +} + +/* + * Translate an old-format configure request (v is a dn_pipe7 or dn_pipe8, + * selected by is7) into a new-style DN_CMD_CONFIG command and pass it to + * do_config(). + * If pipe_nr != 0 the command is 'pipe x config', so the three main + * objects (scheduler, link, flowset) are created, plus a profile when a + * FreeBSD 8 request carries samples; else only a flowset is created. + * Returns the first error reported by a helper, or the do_config() result. + */ +static int +dn_compat_configure(void *v) +{ + struct dn_id *buf = NULL, *base; + struct dn_sch *sch = NULL; + struct dn_link *p = NULL; + struct dn_fs *fs = NULL; + struct dn_profile *pf = NULL; + int lmax; + int error; + + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + + int i; /* old pipe number; 0 means queue (flowset) config */ + + lmax = sizeof(struct dn_id); /* command header */ + lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + + sizeof(struct dn_fs) + sizeof(struct dn_profile); + + /* base keeps the start of the allocation, buf is the write cursor */ + base = buf = malloc(lmax, M_DUMMYNET, M_WAIT|M_ZERO); + o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); + base->id = DN_API_VERSION; + + /* pipe_nr is the same in p7 and p8 */ + i = p7->pipe_nr; + if (i != 0) { /* pipe config */ + sch = o_next(&buf, sizeof(*sch), DN_SCH); + p = o_next(&buf, sizeof(*p), DN_LINK); + fs = o_next(&buf, sizeof(*fs), DN_FS); + + error = dn_compat_config_pipe(sch, p, fs, v); + if (error) { + free(base, M_DUMMYNET); + return error; + } + if (!is7 && p8->samples_no > 0) { + /* Add profiles*/ + pf = o_next(&buf, sizeof(*pf), DN_PROFILE); + error = dn_compat_config_profile(pf, p, v); + if (error) { + free(base, M_DUMMYNET); + return error; + } + } + } else { /* queue config */ + fs = o_next(&buf, sizeof(*fs), DN_FS); + error = dn_compat_config_queue(fs, v); + if (error) { + free(base, M_DUMMYNET); + return error; + } + } + error = do_config(base, (char *)buf - (char *)base); + + /* buf was advanced by o_next(); only base is the pointer malloc returned */ + free(base, M_DUMMYNET); + return error; +} + +int +dn_compat_calc_size(void) +{ + int need = 0; + /* XXX use FreeBSD 8 struct size */ + /* NOTE: + * - half scheduler: schk_count/2 + * - all flowset: fsk_count + * - all flowset queues:
queue_count + * - all pipe queue: si_count + */ + need += dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2; + need += dn_cfg.fsk_count * sizeof(struct dn_flow_set); + need += dn_cfg.si_count * sizeof(struct dn_flow_queue8); + need += dn_cfg.queue_count * sizeof(struct dn_flow_queue8); + + return need; +} + +/* + * Write one old-format queue record at *a->start from the struct dn_flow + * in _ni, using the dn_flow_queue7 or dn_flow_queue8 layout according to + * is7, then advance *a->start past it. Only len, len_bytes, id and the + * three counters are filled in. Always returns 0. + */ +int +dn_c_copy_q (void *_ni, void *arg) +{ + struct copy_args *a = arg; + struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start; + struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start; + struct dn_flow *ni = (struct dn_flow *)_ni; + int size = 0; + + /* XXX hash slot not set */ + /* No difference between 7.2/8 */ + fq7->len = ni->length; + fq7->len_bytes = ni->len_bytes; + fq7->id = ni->fid; + + if (is7) { + size = sizeof(struct dn_flow_queue7); + fq7->tot_pkts = ni->tot_pkts; + fq7->tot_bytes = ni->tot_bytes; + fq7->drops = ni->drops; + } else { + size = sizeof(struct dn_flow_queue8); + fq8->tot_pkts = ni->tot_pkts; + fq8->tot_bytes = ni->tot_bytes; + fq8->drops = ni->drops; + } + + *a->start += size; + return 0; +} + +/* + * Write one old-format pipe record at *a->start from scheduler s, using + * the dn_pipe7 or dn_pipe8 layout according to is7, then advance + * *a->start past it. nq is stored as the number of queue records. + */ +int +dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq) +{ + struct dn_link *l = &s->link; + struct dn_fsk *f = s->fs; + + struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start; + struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start; + struct dn_flow_set *fs; + int size = 0; + + if (is7) { + fs = &pipe7->fs; + size = sizeof(struct dn_pipe7); + } else { + fs = &pipe8->fs; + size = sizeof(struct dn_pipe8); + } + + /* These 4 fields are the same in pipe7 and pipe8 */ + pipe7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE; + pipe7->bandwidth = l->bandwidth; + pipe7->delay = l->delay; + pipe7->pipe_nr = l->link_nr - DN_MAX_ID; + + if (!is7) { + if (s->profile) { + struct dn_profile *pf = s->profile; + /* NOTE(review): bounded by the source size; confirm it equals sizeof(pipe8->name) */ + strncpy(pipe8->name, pf->name, sizeof(pf->name)); + pipe8->loss_level = pf->loss_level; + pipe8->samples_no = pf->samples_no; + } + pipe8->burst = div64(l->burst , 8 * hz); + } + + fs->flow_mask = s->sch.sched_mask; +
fs->rq_size = s->sch.buckets ? s->sch.buckets : 1; + + fs->parent_nr = l->link_nr - DN_MAX_ID; + fs->qsize = f->fs.qsize; + fs->plr = f->fs.plr; + fs->w_q = f->fs.w_q; + fs->max_th = f->max_th; + fs->min_th = f->min_th; + fs->max_p = f->fs.max_p; + fs->rq_elements = nq; + + fs->flags_fs = convertflags2old(f->fs.flags); + + *a->start += size; + return 0; +} + + +int +dn_compat_copy_pipe(struct copy_args *a, void *_o) +{ + int have = a->end - *a->start; + int need = 0; + int pipe_size = sizeof(struct dn_pipe8); + int queue_size = sizeof(struct dn_flow_queue8); + int n_queue = 0; /* number of queues */ + + struct dn_schk *s = (struct dn_schk *)_o; + /* calculate needed space: + * - struct dn_pipe + * - if there are instances, dn_queue * n_instances + */ + n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) : + (s->siht ? 1 : 0)); + need = pipe_size + queue_size * n_queue; + if (have < need) { + D("have %d < need %d", have, need); + return 1; + } + /* copy pipe */ + dn_c_copy_pipe(s, a, n_queue); + + /* copy queues */ + if (s->sch.flags & DN_HAVE_MASK) + dn_ht_scan(s->siht, dn_c_copy_q, a); + else if (s->siht) + dn_c_copy_q(s->siht, a); + return 0; +} + +int +dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq) +{ + struct dn_flow_set *fs = (struct dn_flow_set *)*a->start; + + fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE; + fs->fs_nr = f->fs.fs_nr; + fs->qsize = f->fs.qsize; + fs->plr = f->fs.plr; + fs->w_q = f->fs.w_q; + fs->max_th = f->max_th; + fs->min_th = f->min_th; + fs->max_p = f->fs.max_p; + fs->flow_mask = f->fs.flow_mask; + fs->rq_elements = nq; + fs->rq_size = (f->fs.buckets ? 
f->fs.buckets : 1); + fs->parent_nr = f->fs.sched_nr; + fs->weight = f->fs.par[0]; + + fs->flags_fs = convertflags2old(f->fs.flags); + *a->start += sizeof(struct dn_flow_set); + return 0; +} + +int +dn_compat_copy_queue(struct copy_args *a, void *_o) +{ + int have = a->end - *a->start; + int need = 0; + int fs_size = sizeof(struct dn_flow_set); + int queue_size = sizeof(struct dn_flow_queue8); + + struct dn_fsk *fs = (struct dn_fsk *)_o; + int n_queue = 0; /* number of queues */ + + n_queue = (fs->fs.flags & DN_HAVE_MASK ? dn_ht_entries(fs->qht) : + (fs->qht ? 1 : 0)); + + need = fs_size + queue_size * n_queue; + if (have < need) { + D("have < need"); + return 1; + } + + /* copy flowset */ + dn_c_copy_fs(fs, a, n_queue); + + /* copy queues */ + if (fs->fs.flags & DN_HAVE_MASK) + dn_ht_scan(fs->qht, dn_c_copy_q, a); + else if (fs->qht) + dn_c_copy_q(fs->qht, a); + + return 0; +} + +int +copy_data_helper_compat(void *_o, void *_arg) +{ + struct copy_args *a = _arg; + + if (a->type == DN_COMPAT_PIPE) { + struct dn_schk *s = _o; + if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) { + return 0; /* not old type */ + } + /* copy pipe parameters, and if instance exists, copy + * other parameters and eventually queues. + */ + if(dn_compat_copy_pipe(a, _o)) + return DNHT_SCAN_END; + } else if (a->type == DN_COMPAT_QUEUE) { + struct dn_fsk *fs = _o; + if (fs->fs.fs_nr >= DN_MAX_ID) + return 0; + if (dn_compat_copy_queue(a, _o)) + return DNHT_SCAN_END; + } + return 0; +} + +/* Main function to manage old requests */ +int +ip_dummynet_compat(struct sockopt *sopt) +{ + int error=0; + void *v = NULL; + struct dn_id oid; + + /* Lenght of data, used to found ipfw version... 
*/ + int len = sopt->sopt_valsize; + + /* len can be 0 if command was dummynet_flush */ + if (len == pipesize7) { + D("setting compatibility with FreeBSD 7.2"); + is7 = 1; + } + else if (len == pipesize8 || len == pipesizemax8) { + D("setting compatibility with FreeBSD 8"); + is7 = 0; + } + + switch (sopt->sopt_name) { + default: + printf("dummynet: -- unknown option %d", sopt->sopt_name); + error = EINVAL; + break; + + case IP_DUMMYNET_FLUSH: + oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); + do_config(&oid, oid.len); + break; + + case IP_DUMMYNET_DEL: + v = malloc(len, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, v, len, len); + if (error) + break; + error = dn_compat_del(v); + free(v, M_DUMMYNET); + break; + + case IP_DUMMYNET_CONFIGURE: + v = malloc(len, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, v, len, len); + if (error) + break; + error = dn_compat_configure(v); + free(v, M_DUMMYNET); + break; + + case IP_DUMMYNET_GET: { + void *buf; + int ret; + int original_size = sopt->sopt_valsize; + int size; + + ret = dummynet_get(sopt, &buf); + if (ret) + return 0;//XXX ? + size = sopt->sopt_valsize; + sopt->sopt_valsize = original_size; + D("size=%d, buf=%p", size, buf); + ret = sooptcopyout(sopt, buf, size); + if (ret) + printf(" %s ERROR sooptcopyout\n", __FUNCTION__); + if (buf) + free(buf, M_DUMMYNET); + } + } + + return error; +} + + diff --git a/dummynet2/ip_dn_io.c b/dummynet2/ip_dn_io.c new file mode 100644 index 0000000..6672424 --- /dev/null +++ b/dummynet2/ip_dn_io.c @@ -0,0 +1,963 @@ +/*- + * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Dummynet portions related to packet handling. + */ +#include +__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_dn_io.c 203321 2010-01-31 21:39:25Z luigi $"); + +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include +#include + +#include +#include /* ip_len, ip_off */ +#include /* ip_output(), IP_FORWARDING */ +#include +#include +#include +#include +#include +#include + +#include /* various ether_* routines */ + +#include /* for ip6_input, ip6_output prototypes */ +#include + +/* + * We keep a private variable for the simulation time, but we could + * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) + * instead of dn_cfg.curr_time + */ + +struct dn_parms dn_cfg; +//VNET_DEFINE(struct dn_parms, _base_dn_cfg); + +static long tick_last; /* Last tick duration (usec). 
*/ +static long tick_delta; /* Last vs standard tick diff (usec). */ +static long tick_delta_sum; /* Accumulated tick difference (usec).*/ +static long tick_adjustment; /* Tick adjustments done. */ +static long tick_lost; /* Lost(coalesced) ticks number. */ +/* Adjusted vs non-adjusted curr_time difference (ticks). */ +static long tick_diff; + +static unsigned long io_pkt; +static unsigned long io_pkt_fast; +static unsigned long io_pkt_drop; + +/* + * We use a heap to store entities for which we have pending timer events. + * The heap is checked at every tick and all entities with expired events + * are extracted. + */ + +MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); + +extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *); + +#ifdef SYSCTL_NODE + +SYSBEGIN(f4) + +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); + +/* wrapper to pass dn_cfg fields to SYSCTL_* */ +//#define DC(x) (&(VNET_NAME(_base_dn_cfg).x)) +#define DC(x) (&(dn_cfg.x)) +/* parameters */ +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size, + CTLFLAG_RW, DC(hash_size), 0, "Default hash table size"); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit, + CTLFLAG_RW, DC(slot_limit), 0, + "Upper limit in slots for pipe queue."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit, + CTLFLAG_RW, DC(byte_limit), 0, + "Upper limit in bytes for pipe queue."); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast, + CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io."); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, + CTLFLAG_RW, DC(debug), 0, "Dummynet debug level"); + +/* RED parameters */ +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, + CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, + CTLFLAG_RD, DC(red_avg_pkt_size), 0, "RED Medium packet size"); +SYSCTL_INT(_net_inet_ip_dummynet, 
OID_AUTO, red_max_pkt_size, + CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size"); + +/* time adjustment */ +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta, + CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec)."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum, + CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec)."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment, + CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff, + CTLFLAG_RD, &tick_diff, 0, + "Adjusted vs non-adjusted curr_time difference (ticks)."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost, + CTLFLAG_RD, &tick_lost, 0, + "Number of ticks coalesced by dummynet taskqueue."); + +/* Drain parameters */ +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire, + CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle, + CTLFLAG_RD, DC(expire_cycle), 0, "Expire cycle for queues/pipes"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire_object, + CTLFLAG_RW, DC(expire_object), 0, "Min # of objects before start drain routine"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, object_idle_tick, + CTLFLAG_RD, DC(object_idle_tick), 0, "Time (in ticks) to cosiderer an object as idle"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, drain_ratio, + CTLFLAG_RD, DC(drain_ratio), 0, "% of dummynet_task() to dedicate to drain routine"); + +/* statistics */ +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count, + CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count, + CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count, + CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count, + CTLFLAG_RD, DC(queue_count), 0, "Number of queues"); 
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt, + CTLFLAG_RD, &io_pkt, 0, + "Number of packets passed to dummynet."); +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast, + CTLFLAG_RD, &io_pkt_fast, 0, + "Number of packets bypassed dummynet scheduler."); +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop, + CTLFLAG_RD, &io_pkt_drop, 0, + "Number of packets dropped by dummynet."); +#undef DC +SYSEND + +#endif + +static void dummynet_send(struct mbuf *); + +/* + * Packets processed by dummynet have an mbuf tag associated with + * them that carries their dummynet state. + * Outside dummynet, only the 'rule' field is relevant, and it must + * be at the beginning of the structure. + */ +struct dn_pkt_tag { + struct ipfw_rule_ref rule; /* matching rule */ + + /* second part, dummynet specific */ + int dn_dir; /* action when packet comes out.*/ + /* see ip_fw_private.h */ + uint64_t output_time; /* when the pkt is due for delivery*/ + struct ifnet *ifp; /* interface, for ip_output */ + struct _ip6dn_args ip6opt; /* XXX ipv6 options */ +}; + +/* + * Return the mbuf tag holding the dummynet state (it should + * be the first one on the list). + */ +static struct dn_pkt_tag * +dn_tag_get(struct mbuf *m) +{ + struct m_tag *mtag = m_tag_first(m); + KASSERT(mtag != NULL && + mtag->m_tag_cookie == MTAG_ABI_COMPAT && + mtag->m_tag_id == PACKET_TAG_DUMMYNET, + ("packet on dummynet queue w/o dummynet tag!")); + return (struct dn_pkt_tag *)(mtag+1); +} + +static inline void +mq_append(struct mq *q, struct mbuf *m) +{ + if (q->head == NULL) + q->head = m; + else + q->tail->m_nextpkt = m; + q->tail = m; + m->m_nextpkt = NULL; +} + +/* + * Dispose a list of packet. Use a functions so if we need to do + * more work, this is a central point to do it. 
+ */ +void dn_free_pkts(struct mbuf *mnext) +{ + struct mbuf *m; + + while ((m = mnext) != NULL) { + mnext = m->m_nextpkt; + FREE_PKT(m); + } +} + +static int +red_drops (struct dn_queue *q, int len) +{ + /* + * RED algorithm + * + * RED calculates the average queue size (avg) using a low-pass filter + * with an exponential weighted (w_q) moving average: + * avg <- (1-w_q) * avg + w_q * q_size + * where q_size is the queue length (measured in bytes or * packets). + * + * If q_size == 0, we compute the idle time for the link, and set + * avg = (1 - w_q)^(idle/s) + * where s is the time needed for transmitting a medium-sized packet. + * + * Now, if avg < min_th the packet is enqueued. + * If avg > max_th the packet is dropped. Otherwise, the packet is + * dropped with probability P function of avg. + */ + + struct dn_fsk *fs = q->fs; + int64_t p_b = 0; + + /* Queue in bytes or packets? */ + uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ? + q->ni.len_bytes : q->ni.length; + + /* Average queue size estimation. */ + if (q_size != 0) { + /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */ + int diff = SCALE(q_size) - q->avg; + int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q); + + q->avg += (int)v; + } else { + /* + * Queue is empty, find for how long the queue has been + * empty and use a lookup table for computing + * (1 - * w_q)^(idle_time/s) where s is the time to send a + * (small) packet. + * XXX check wraps... + */ + if (q->avg) { + u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step); + + q->avg = (t < fs->lookup_depth) ? + SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; + } + } + + /* Should i drop? 
*/ + if (q->avg < fs->min_th) { + q->count = -1; + return (0); /* accept packet */ + } + if (q->avg >= fs->max_th) { /* average queue >= max threshold */ + if (fs->fs.flags & DN_IS_GENTLE_RED) { + /* + * According to Gentle-RED, if avg is greater than + * max_th the packet is dropped with a probability + * p_b = c_3 * avg - c_4 + * where c_3 = (1 - max_p) / max_th + * c_4 = 1 - 2 * max_p + */ + p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) - + fs->c_4; + } else { + q->count = -1; + return (1); + } + } else if (q->avg > fs->min_th) { + /* + * We compute p_b using the linear dropping function + * p_b = c_1 * avg - c_2 + * where c_1 = max_p / (max_th - min_th) + * c_2 = max_p * min_th / (max_th - min_th) + */ + p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2; + } + + if (fs->fs.flags & DN_QSIZE_BYTES) + p_b = div64((p_b * len) , fs->max_pkt_size); + if (++q->count == 0) + q->random = random() & 0xffff; + else { + /* + * q->count counts packets arrived since last drop, so a greater + * value of q->count means a greater packet drop probability. + */ + if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) { + q->count = 0; + /* After a drop we calculate a new random value. */ + q->random = random() & 0xffff; + return (1); /* drop */ + } + } + /* End of RED algorithm. */ + + return (0); /* accept */ + +} + +/* + * Enqueue a packet in q, subject to space and queue management policy + * (whose parameters are in q->fs). + * Update stats for the queue and the scheduler. + * Return 0 on success, 1 on drop. The packet is consumed anyways. 
+ */ +int +dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) +{ + struct dn_fs *f; + struct dn_flow *ni; /* stats for scheduler instance */ + uint64_t len; + + if (q->fs == NULL || q->_si == NULL) { + printf("%s fs %p si %p, dropping\n", + __FUNCTION__, q->fs, q->_si); + FREE_PKT(m); + return 1; + } + f = &(q->fs->fs); + ni = &q->_si->ni; + len = m->m_pkthdr.len; + /* Update statistics, then check reasons to drop pkt. */ + q->ni.tot_bytes += len; + q->ni.tot_pkts++; + ni->tot_bytes += len; + ni->tot_pkts++; + if (drop) + goto drop; + if (f->plr && random() < f->plr) + goto drop; + if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) + goto drop; + if (f->flags & DN_QSIZE_BYTES) { + if (q->ni.len_bytes > f->qsize) + goto drop; + } else if (q->ni.length >= f->qsize) { + goto drop; + } + mq_append(&q->mq, m); + if (q->ni.length == 0) { /* queue was idle */ + dn_cfg.idle_queue--; + if (ni->length == 0) /* scheduler was idle */ + dn_cfg.idle_si--; + } + q->ni.length++; + q->ni.len_bytes += len; + ni->length++; + ni->len_bytes += len; + return 0; + +drop: + io_pkt_drop++; + q->ni.drops++; + ni->drops++; + FREE_PKT(m); + return 1; +} + +/* + * Fetch packets from the delay line which are due now. If there are + * leftover packets, reinsert the delay line in the heap. + * Runs under scheduler lock. + */ +static void +transmit_event(struct mq *q, struct delay_line *dline, uint64_t now) +{ + struct mbuf *m; + struct dn_pkt_tag *pkt = NULL; + + dline->oid.subtype = 0; /* not in heap */ + while ((m = dline->mq.head) != NULL) { + pkt = dn_tag_get(m); + if (!DN_KEY_LEQ(pkt->output_time, now)) + break; + dline->mq.head = m->m_nextpkt; + mq_append(q, m); + } + if (m != NULL) { + dline->oid.subtype = 1; /* in heap */ + heap_insert(&dn_cfg.evheap, pkt->output_time, dline); + } +} + +/* + * Convert the additional MAC overheads/delays into an equivalent + * number of bits for the given data rate. The samples are + * in milliseconds so we need to divide by 1000. 
+ */ +static uint64_t +extra_bits(struct mbuf *m, struct dn_schk *s) +{ + int index; + uint64_t bits; + struct dn_profile *pf = s->profile; + + if (!pf || pf->samples_no == 0) + return 0; + index = random() % pf->samples_no; + bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000); + if (index >= pf->loss_level) { + struct dn_pkt_tag *dt = dn_tag_get(m); + if (dt) + dt->dn_dir = DIR_DROP; + } + return bits; +} + +/* + * Send traffic from a scheduler instance due by 'now'. + * Return a pointer to the head of the queue. + */ +static struct mbuf * +serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now) +{ + struct mq def_q; + struct dn_schk *s = si->sched; + struct mbuf *m = NULL; + int delay_line_idle = (si->dline.mq.head == NULL); + int done, bw; + + if (q == NULL) { + q = &def_q; + q->head = NULL; + } + + bw = s->link.bandwidth; + si->kflags &= ~DN_ACTIVE; + + if (bw > 0) + si->credit += (now - si->sched_time) * bw; + else + si->credit = 0; + si->sched_time = now; + done = 0; + while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) { + uint64_t len_scaled; + + /* + * Some schedulers might want wake up the scheduler later. + * To suppor this the caller returns an mbuf with len < 0 + * this will result in a new wake up of the scheduler + * instance between m->m_pkthdr.len ticks. + */ + if (m->m_pkthdr.len < 0) { + si->kflags |= DN_ACTIVE; + heap_insert(&dn_cfg.evheap, now - m->m_pkthdr.len, si); + if (delay_line_idle && done) + transmit_event(q, &si->dline, now); + return NULL; + } + + /* a regular mbuf received */ + done++; + if (bw == 0) printf("bw is null\n"); + len_scaled = (bw == 0) ? 0 : hz * + (m->m_pkthdr.len * 8 + extra_bits(m, s)); + si->credit -= len_scaled; + /* Move packet in the delay line */ + dn_tag_get(m)->output_time = dn_cfg.curr_time + s->link.delay; + mq_append(&si->dline.mq, m); + } + + /* + * If credit >= 0 the instance is idle, mark time. 
+ * Otherwise put back in the heap, and adjust the output + * time of the last inserted packet, m, which was too early. + */ + if (si->credit >= 0) { + si->idle_time = now; + } else { + uint64_t t; + KASSERT (bw > 0, ("bw=0 and credit<0 ?")); + t = div64(bw - 1 - si->credit, bw); + if (m) + dn_tag_get(m)->output_time += t; + si->kflags |= DN_ACTIVE; + heap_insert(&dn_cfg.evheap, now + t, si); + } + if (delay_line_idle && done) + transmit_event(q, &si->dline, now); + return q->head; +} + +/* + * Support function to read the TSC (or equivalent). We use this + * high resolution timer to adapt the amount of work done for + * expiring the clock. + * Supports Linux and FreeBSD both i386 and amd64 platform + * Supports OpenWRT mips architecture + * + * SMP no special works is needed in + * - In linux 2.6 timers will always run in the same cpu that have added it.See + * (http://book.opensourceproject.org.cn/kernel/kernel3rd/opensource/0596005652/understandlk-chp-6-sect-5.html) + * - FreeBSD8 has a new callout_reset_on() with specify the cpu on which + * the timer must be run + * - Windows runs dummynet_task() on cpu0. + * + * - Linux 2.4 doesn't assure to run a timer in the same cpu every time. + */ +#ifdef HAVE_TSC +uint64_t +readTSC (void) +{ + uint64_t a=0; + +#ifdef __linux__ + /* Linux and openwrt have a macro to read the tsc for i386 and + * amd64. + * Openwrt have patched the kernel and allow use of tsc with mips + * and other platforms + * rdtscll() is a macro defined in include/asm-xxx/msr.h, + * where xxx is the architecture (x86, mips). + */ + rdtscll(a); +#elif defined(_WIN32) + /* Microsoft recommends the use of KeQueryPerformanceCounter() + * insteead of rdtsc(). + */ + KeQueryPerformanceCounter((PLARGE_INTEGER)&a); //XXX not tested! +#elif defined(__FreeBSD__) + /* FreeBSD (i386/amd64) has macro rdtsc() defined in machine/cpufunc.h. 
+ * We could use the macro instead of explicity assembly XXX + */ + return rdtsc(); +#endif + return a; +} +#endif /* HAVE_TSC */ + +/* + * compute avg task period. + * We could do something more complex, possibly. + */ +static void +do_update_cycle(void) +{ +#ifdef HAVE_TSC + uint64_t tmp = readTSC(); +#if defined (LINUX_24) && defined(CONFIG_SMP) + /* on LINUX24 and SMP, we have no guarantees on which cpu runs + * the timer callbacks. If the difference between new and + * old value is negative, we assume that the values come from + * different cpus so we adjust 'new' accordingly. + */ + if (tmp <= dn_cfg.cycle_task_new) + dn_cfg.cycle_task_new = tmp - dn_cfg.cycle_task; +#endif /* !(linux24 && SMP) */ + dn_cfg.cycle_task_old = dn_cfg.cycle_task_new; + dn_cfg.cycle_task_new = tmp; + dn_cfg.cycle_task = dn_cfg.cycle_task_new - dn_cfg.cycle_task_old; + + /* Update the average + * avg = (2^N * avg + new - avg ) / 2^N * avg + * N==4 seems to be a good compromise between clock clock change + * and 'spurious' cycle_task value + */ +#define DN_N 4 + dn_cfg.cycle_task_avg = (dn_cfg.cycle_task_avg << DN_N) + + dn_cfg.cycle_task - dn_cfg.cycle_task_avg; + dn_cfg.cycle_task_avg = dn_cfg.cycle_task_avg >> DN_N; +#undef DN_N + +#endif /* HAVE_TSC */ +} + +static void +do_drain(void) +{ +#ifdef HAVE_TSC + uint64_t dt_max; +#endif + if (!dn_cfg.expire || ++dn_cfg.expire_cycle < dn_cfg.expire) + return; + /* It's time to check if drain routines should be called */ + dn_cfg.expire_cycle = 0; + + dn_cfg.idle_queue_wait = 0; + dn_cfg.idle_si_wait = 0; + /* Do a drain cycle even if there isn't time to do it */ +#ifdef HAVE_TSC + dt_max = dn_cfg.cycle_task_avg * dn_cfg.drain_ratio; +#endif + for (;;) { + int done = 0; + + if (dn_cfg.idle_queue > dn_cfg.expire_object && + dn_cfg.idle_queue_wait < dn_cfg.idle_queue) { + dn_drain_queue(); + done = 1; + } + if (dn_cfg.idle_si > dn_cfg.expire_object && + dn_cfg.idle_si_wait < dn_cfg.idle_si) { + dn_drain_scheduler(); + done = 1; + } + /* 
time to end ? */ +#ifndef HAVE_TSC + /* If tsc does not exist, do only one drain cycle and exit */ + break; +#else + /* Exit when nothing was done or we have consumed all time */ + if ( (done == 0) || + ((readTSC() - dn_cfg.cycle_task_new) * 100 > dt_max) ) + break; +#endif /* HAVE_TSC */ + } +} + +/* + * The timer handler for dummynet. Time is computed in ticks, but + * but the code is tolerant to the actual rate at which this is called. + * Once complete, the function reschedules itself for the next tick. + */ +void +dummynet_task(void *context, int pending) +{ + struct timeval t; + struct mq q = { NULL, NULL }; /* queue to accumulate results */ + + CURVNET_SET((struct vnet *)context); + + do_update_cycle(); /* compute avg. tick duration */ + + DN_BH_WLOCK(); + + /* Update number of lost(coalesced) ticks. */ + tick_lost += pending - 1; + + getmicrouptime(&t); + /* Last tick duration (usec). */ + tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 + + (t.tv_usec - dn_cfg.prev_t.tv_usec); + /* Last tick vs standard tick difference (usec). */ + tick_delta = (tick_last * hz - 1000000) / hz; + /* Accumulated tick difference (usec). */ + tick_delta_sum += tick_delta; + + dn_cfg.prev_t = t; + + /* + * Adjust curr_time if the accumulated tick difference is + * greater than the 'standard' tick. Since curr_time should + * be monotonically increasing, we do positive adjustments + * as required, and throttle curr_time in case of negative + * adjustment. 
+ */ + dn_cfg.curr_time++; + if (tick_delta_sum - tick >= 0) { + int diff = tick_delta_sum / tick; + + dn_cfg.curr_time += diff; + tick_diff += diff; + tick_delta_sum %= tick; + tick_adjustment++; + } else if (tick_delta_sum + tick <= 0) { + dn_cfg.curr_time--; + tick_diff--; + tick_delta_sum += tick; + tick_adjustment++; + } + + /* serve pending events, accumulate in q */ + for (;;) { + struct dn_id *p; /* generic parameter to handler */ + + if (dn_cfg.evheap.elements == 0 || + DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key)) + break; + p = HEAP_TOP(&dn_cfg.evheap)->object; + heap_extract(&dn_cfg.evheap, NULL); + + if (p->type == DN_SCH_I) { + serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time); + } else { /* extracted a delay line */ + transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time); + } + } + do_drain(); + + DN_BH_WUNLOCK(); + dn_reschedule(); + if (q.head != NULL) + dummynet_send(q.head); + CURVNET_RESTORE(); +} + +/* + * forward a chain of packets to the proper destination. + * This runs outside the dummynet lock. + */ +static void +dummynet_send(struct mbuf *m) +{ + struct mbuf *n; + + for (; m != NULL; m = n) { + struct ifnet *ifp = NULL; /* gcc 3.4.6 complains */ + struct m_tag *tag; + int dst; + + n = m->m_nextpkt; + m->m_nextpkt = NULL; + tag = m_tag_first(m); + if (tag == NULL) { /* should not happen */ + dst = DIR_DROP; + } else { + struct dn_pkt_tag *pkt = dn_tag_get(m); + /* extract the dummynet info, rename the tag + * to carry reinject info. 
+ */ + dst = pkt->dn_dir; + ifp = pkt->ifp; + tag->m_tag_cookie = MTAG_IPFW_RULE; + tag->m_tag_id = 0; + } + + switch (dst) { + case DIR_OUT: + SET_HOST_IPLEN(mtod(m, struct ip *)); + ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); + break ; + + case DIR_IN : + /* put header in network format for ip_input() */ + //SET_NET_IPLEN(mtod(m, struct ip *)); + netisr_dispatch(NETISR_IP, m); + break; + +#ifdef INET6 + case DIR_IN | PROTO_IPV6: + netisr_dispatch(NETISR_IPV6, m); + break; + + case DIR_OUT | PROTO_IPV6: + SET_HOST_IPLEN(mtod(m, struct ip *)); + ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL); + break; +#endif + + case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */ + if (bridge_dn_p != NULL) + ((*bridge_dn_p)(m, ifp)); + else + printf("dummynet: if_bridge not loaded\n"); + + break; + + case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */ + /* + * The Ethernet code assumes the Ethernet header is + * contiguous in the first mbuf header. + * Insure this is true. + */ + if (m->m_len < ETHER_HDR_LEN && + (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { + printf("dummynet/ether: pullup failed, " + "dropping packet\n"); + break; + } + ether_demux(m->m_pkthdr.rcvif, m); + break; + + case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */ + ether_output_frame(ifp, m); + break; + + case DIR_DROP: + /* drop the packet after some time */ + FREE_PKT(m); + break; + + default: + printf("dummynet: bad switch %d!\n", dst); + FREE_PKT(m); + break; + } + } +} + +static inline int +tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa) +{ + struct dn_pkt_tag *dt; + struct m_tag *mtag; + + mtag = m_tag_get(PACKET_TAG_DUMMYNET, + sizeof(*dt), M_NOWAIT | M_ZERO); + if (mtag == NULL) + return 1; /* Cannot allocate packet header. */ + m_tag_prepend(m, mtag); /* Attach to mbuf chain. 
*/ + dt = (struct dn_pkt_tag *)(mtag + 1); + dt->rule = fwa->rule; + dt->rule.info &= IPFW_ONEPASS; /* only keep this info */ + dt->dn_dir = dir; + dt->ifp = fwa->oif; + /* dt->output tame is updated as we move through */ + dt->output_time = dn_cfg.curr_time; + return 0; +} + + +/* + * dummynet hook for packets. + * We use the argument to locate the flowset fs and the sched_set sch + * associated to it. The we apply flow_mask and sched_mask to + * determine the queue and scheduler instances. + * + * dir where shall we send the packet after dummynet. + * *m0 the mbuf with the packet + * ifp the 'ifp' parameter from the caller. + * NULL in ip_input, destination interface in ip_output, + */ +int +dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa) +{ + struct mbuf *m = *m0; + struct dn_fsk *fs = NULL; + struct dn_sch_inst *si; + struct dn_queue *q = NULL; /* default */ + + int fs_id = (fwa->rule.info & IPFW_INFO_MASK) + + ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0); + DN_BH_WLOCK(); + io_pkt++; + /* we could actually tag outside the lock, but who cares... */ + if (tag_mbuf(m, dir, fwa)) + goto dropit; + if (dn_cfg.busy) { + /* if the upper half is busy doing something expensive, + * lets queue the packet and move forward + */ + mq_append(&dn_cfg.pending, m); + m = *m0 = NULL; /* consumed */ + goto done; /* already active, nothing to do */ + } + /* XXX locate_flowset could be optimised with a direct ref. */ + fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL); + if (fs == NULL) + goto dropit; /* This queue/pipe does not exist! */ + if (fs->sched == NULL) /* should not happen */ + goto dropit; + /* + * If the scheduler supports multiple queues, find the right one + * (otherwise it will be ignored by enqueue). + */ + if (fs->sched->fp->flags & DN_MULTIQUEUE) { + q = ipdn_q_find(fs, &(fwa->f_id)); + if (q == NULL) + goto dropit; + /* The scheduler instance lookup is done only for new queue. 
+ * The callback q_new() will create the scheduler instance + * if needed. + */ + si = q->_si; + } else + si = ipdn_si_find(fs->sched, &(fwa->f_id)); + + if (si == NULL) + goto dropit; + if (fs->sched->fp->enqueue(si, q, m)) { + /* packet was dropped by enqueue() */ + m = *m0 = NULL; + goto dropit; + } + + if (si->kflags & DN_ACTIVE) { + m = *m0 = NULL; /* consumed */ + goto done; /* already active, nothing to do */ + } + + /* compute the initial allowance */ + if (si->idle_time < dn_cfg.curr_time) { + /* Do this only on the first packet on an idle pipe */ + struct dn_link *p = &fs->sched->link; + + si->sched_time = dn_cfg.curr_time; + si->credit = dn_cfg.io_fast ? p->bandwidth : 0; + if (p->burst) { + uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth; + if (burst > p->burst) + burst = p->burst; + si->credit += burst; + } + } + /* pass through scheduler and delay line */ + m = serve_sched(NULL, si, dn_cfg.curr_time); + + /* optimization -- pass it back to ipfw for immediate send */ + /* XXX Don't call dummynet_send() if scheduler return the packet + * just enqueued. This avoid a lock order reversal. + * + */ + if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) { + /* fast io, rename the tag * to carry reinject info. */ + struct m_tag *tag = m_tag_first(m); + + tag->m_tag_cookie = MTAG_IPFW_RULE; + tag->m_tag_id = 0; + io_pkt_fast++; + if (m->m_nextpkt != NULL) { + printf("dummynet: fast io: pkt chain detected!\n"); + m->m_nextpkt = NULL; + } + m = NULL; + } else { + *m0 = NULL; + } +done: + DN_BH_WUNLOCK(); + if (m) + dummynet_send(m); + return 0; + +dropit: + io_pkt_drop++; + DN_BH_WUNLOCK(); + if (m) + FREE_PKT(m); + *m0 = NULL; + return (fs && (fs->fs.flags & DN_NOERROR)) ? 
0 : ENOBUFS; +} diff --git a/dummynet2/ip_dummynet.c b/dummynet2/ip_dummynet.c new file mode 100644 index 0000000..f5b6831 --- /dev/null +++ b/dummynet2/ip_dummynet.c @@ -0,0 +1,2400 @@ +/*- + * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa + * Portions Copyright (c) 2000 Akamba Corp. + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_dummynet.c 203340 2010-02-01 12:06:37Z luigi $"); + +/* + * Configuration and internal object management for dummynet. 
+ */ + +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include +#include /* ip_output(), IP_FORWARDING */ +#include +#include +#include +#include +#include +#include + +/* which objects to copy */ +#define DN_C_LINK 0x01 +#define DN_C_SCH 0x02 +#define DN_C_FLOW 0x04 +#define DN_C_FS 0x08 +#define DN_C_QUEUE 0x10 + +/* we use this argument in case of a schk_new */ +struct schk_new_arg { + struct dn_alg *fp; + struct dn_sch *sch; +}; + +/*---- callout hooks. ----*/ +static struct callout dn_timeout; +static struct task dn_task; +static struct taskqueue *dn_tq = NULL; + +/* dummynet and ipfw_tick can't be static in windows */ +void +dummynet(void * __unused unused) +{ + + taskqueue_enqueue(dn_tq, &dn_task); +} + +void +dn_reschedule(void) +{ + callout_reset_on(&dn_timeout, 1, dummynet, NULL, 0); +} +/*----- end of callout hooks -----*/ + +/* Return a scheduler descriptor given the type or name. */ +static struct dn_alg * +find_sched_type(int type, char *name) +{ + struct dn_alg *d; + + SLIST_FOREACH(d, &dn_cfg.schedlist, next) { + if (d->type == type || (name && !strcmp(d->name, name))) + return d; + } + return NULL; /* not found */ +} + +int +ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg) +{ + int oldv = *v; + const char *op = NULL; + if (oldv < lo) { + *v = dflt; + op = "Bump"; + } else if (oldv > hi) { + *v = hi; + op = "Clamp"; + } else + return *v; + if (op && msg) + printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); + return *v; +} + +/*---- flow_id mask, hash and compare functions ---*/ +/* + * The flow_id includes the 5-tuple, the queue/pipe number + * which we store in the extra area in host order, + * and for ipv6 also the flow_id6. 
+ * XXX see if we want the tos byte (can store in 'flags') + */ +static struct ipfw_flow_id * +flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id) +{ + int is_v6 = IS_IP6_FLOW_ID(id); + + id->dst_port &= mask->dst_port; + id->src_port &= mask->src_port; + id->proto &= mask->proto; + id->extra &= mask->extra; + if (is_v6) { + APPLY_MASK(&id->dst_ip6, &mask->dst_ip6); + APPLY_MASK(&id->src_ip6, &mask->src_ip6); + id->flow_id6 &= mask->flow_id6; + } else { + id->dst_ip &= mask->dst_ip; + id->src_ip &= mask->src_ip; + } + return id; +} + +/* computes an OR of two masks, result in dst and also returned */ +static struct ipfw_flow_id * +flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst) +{ + int is_v6 = IS_IP6_FLOW_ID(dst); + + dst->dst_port |= src->dst_port; + dst->src_port |= src->src_port; + dst->proto |= src->proto; + dst->extra |= src->extra; + if (is_v6) { +#define OR_MASK(_d, _s) \ + (_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \ + (_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \ + (_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \ + (_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3]; + OR_MASK(&dst->dst_ip6, &src->dst_ip6); + OR_MASK(&dst->src_ip6, &src->src_ip6); +#undef OR_MASK + dst->flow_id6 |= src->flow_id6; + } else { + dst->dst_ip |= src->dst_ip; + dst->src_ip |= src->src_ip; + } + return dst; +} + +static int +nonzero_mask(struct ipfw_flow_id *m) +{ + if (m->dst_port || m->src_port || m->proto || m->extra) + return 1; + if (IS_IP6_FLOW_ID(m)) { + return + m->dst_ip6.__u6_addr.__u6_addr32[0] || + m->dst_ip6.__u6_addr.__u6_addr32[1] || + m->dst_ip6.__u6_addr.__u6_addr32[2] || + m->dst_ip6.__u6_addr.__u6_addr32[3] || + m->src_ip6.__u6_addr.__u6_addr32[0] || + m->src_ip6.__u6_addr.__u6_addr32[1] || + m->src_ip6.__u6_addr.__u6_addr32[2] || + m->src_ip6.__u6_addr.__u6_addr32[3] || + m->flow_id6; + } else { + return m->dst_ip || m->src_ip; + } +} + +/* XXX we may want 
a better hash function */ +static uint32_t +flow_id_hash(struct ipfw_flow_id *id) +{ + uint32_t i; + + if (IS_IP6_FLOW_ID(id)) { + uint32_t *d = (uint32_t *)&id->dst_ip6; + uint32_t *s = (uint32_t *)&id->src_ip6; + i = (d[0] ) ^ (d[1]) ^ + (d[2] ) ^ (d[3]) ^ + (d[0] >> 15) ^ (d[1] >> 15) ^ + (d[2] >> 15) ^ (d[3] >> 15) ^ + (s[0] << 1) ^ (s[1] << 1) ^ + (s[2] << 1) ^ (s[3] << 1) ^ + (s[0] << 16) ^ (s[1] << 16) ^ + (s[2] << 16) ^ (s[3] << 16) ^ + (id->dst_port << 1) ^ (id->src_port) ^ + (id->extra) ^ + (id->proto ) ^ (id->flow_id6); + } else { + i = (id->dst_ip) ^ (id->dst_ip >> 15) ^ + (id->src_ip << 1) ^ (id->src_ip >> 16) ^ + (id->extra) ^ + (id->dst_port << 1) ^ (id->src_port) ^ (id->proto); + } + return i; +} + +/* Like bcmp, returns 0 if ids match, 1 otherwise. */ +static int +flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2) +{ + int is_v6 = IS_IP6_FLOW_ID(id1); + + if (!is_v6) { + if (IS_IP6_FLOW_ID(id2)) + return 1; /* different address families */ + + return (id1->dst_ip == id2->dst_ip && + id1->src_ip == id2->src_ip && + id1->dst_port == id2->dst_port && + id1->src_port == id2->src_port && + id1->proto == id2->proto && + id1->extra == id2->extra) ? 0 : 1; + } + /* the ipv6 case */ + return ( + !bcmp(&id1->dst_ip6,&id2->dst_ip6, sizeof(id1->dst_ip6)) && + !bcmp(&id1->src_ip6,&id2->src_ip6, sizeof(id1->src_ip6)) && + id1->dst_port == id2->dst_port && + id1->src_port == id2->src_port && + id1->proto == id2->proto && + id1->extra == id2->extra && + id1->flow_id6 == id2->flow_id6) ? 0 : 1; +} +/*--------- end of flow-id mask, hash and compare ---------*/ + +/*--- support functions for the qht hashtable ---- + * Entries are hashed by flow-id + */ +static uint32_t +q_hash(uintptr_t key, int flags, void *arg) +{ + /* compute the hash slot from the flow id */ + struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? 
+ &((struct dn_queue *)key)->ni.fid : + (struct ipfw_flow_id *)key; + + return flow_id_hash(id); +} + +static int +q_match(void *obj, uintptr_t key, int flags, void *arg) +{ + struct dn_queue *o = (struct dn_queue *)obj; + struct ipfw_flow_id *id2; + + if (flags & DNHT_KEY_IS_OBJ) { + /* compare pointers */ + id2 = &((struct dn_queue *)key)->ni.fid; + } else { + id2 = (struct ipfw_flow_id *)key; + } + return (0 == flow_id_cmp(&o->ni.fid, id2)); +} + +/* + * create a new queue instance for the given 'key'. + */ +static void * +q_new(uintptr_t key, int flags, void *arg) +{ + struct dn_queue *q, *template = arg; + struct dn_fsk *fs = template->fs; + int size = sizeof(*q) + fs->sched->fp->q_datalen; + + q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO); + if (q == NULL) { + D("no memory for new queue"); + return NULL; + } + + set_oid(&q->ni.oid, DN_QUEUE, size); + if (fs->fs.flags & DN_QHT_HASH) + q->ni.fid = *(struct ipfw_flow_id *)key; + q->fs = fs; + q->_si = ipdn_si_find(q->fs->sched, &(template->ni.fid)); + if (q->_si == NULL) { + D("no memory for new si"); + free (q, M_DUMMYNET); + return NULL; + } + + q->_si->q_count++; + + if (fs->sched->fp->new_queue) + fs->sched->fp->new_queue(q); + dn_cfg.queue_count++; + dn_cfg.idle_queue++; + return q; +} + +/* + * Notify schedulers that a queue is going away. + * If (flags & DN_DESTROY), also free the packets. + * The version for callbacks is called q_delete_cb(). + * Returns 1 if the queue is NOT deleted (usually when + * the drain routine try to delete a queue that a scheduler + * instance needs), 0 otherwise. 
+ * NOTE: flag DN_DEL_SAFE means that the queue should be + * deleted only if the scheduler no longer needs it + */ +static int +dn_delete_queue(struct dn_queue *q, int flags) +{ + struct dn_fsk *fs = q->fs; + + // D("fs %p si %p\n", fs, q->_si); + /* notify the parent scheduler that the queue is going away */ + if (fs && fs->sched->fp->free_queue) + if (fs->sched->fp->free_queue(q, flags & DN_DEL_SAFE) == 1) + return 1; /* queue NOT deleted */ + q->_si->q_count--; + q->_si = NULL; + if (flags & DN_DESTROY) { + if (q->mq.head) + dn_free_pkts(q->mq.head); + else + dn_cfg.idle_queue--; + bzero(q, sizeof(*q)); // safety + free(q, M_DUMMYNET); + dn_cfg.queue_count--; + } + return 0; +} + +static int +q_delete_cb(void *q, void *arg) +{ + int flags = (int)(uintptr_t)arg; + dn_delete_queue(q, flags); + return (flags & DN_DESTROY) ? DNHT_SCAN_DEL : 0; +} + +/* + * calls dn_delete_queue/q_delete_cb on all queues, + * which notifies the parent scheduler and possibly drains packets. + * flags & DN_DESTROY: drains queues and destroy qht; + */ +static void +qht_delete(struct dn_fsk *fs, int flags) +{ + ND("fs %d start flags %d qht %p", + fs->fs.fs_nr, flags, fs->qht); + if (!fs->qht) + return; + if (fs->fs.flags & DN_QHT_HASH) { + dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags); + if (flags & DN_DESTROY) { + dn_ht_free(fs->qht, 0); + fs->qht = NULL; + } + } else { + dn_delete_queue((struct dn_queue *)(fs->qht), flags); + if (flags & DN_DESTROY) + fs->qht = NULL; + } +} + +/* + * Find and possibly create the queue for a MULTIQUEUE scheduler. + * We never call it for !MULTIQUEUE (the queue is in the sch_inst). 
+ */ +struct dn_queue * +ipdn_q_find(struct dn_fsk *fs, struct ipfw_flow_id *id) +{ + struct dn_queue template; + + template.fs = fs; + + if (fs->fs.flags & DN_QHT_HASH) { + struct ipfw_flow_id masked_id; + if (fs->qht == NULL) { + fs->qht = dn_ht_init(NULL, fs->fs.buckets, + offsetof(struct dn_queue, q_next), + q_hash, q_match, q_new); + if (fs->qht == NULL) + return NULL; + } + masked_id = *id; + flow_id_mask(&fs->fsk_mask, &masked_id); + return dn_ht_find(fs->qht, (uintptr_t)&masked_id, + DNHT_INSERT, &template); + } else { + if (fs->qht == NULL) + fs->qht = q_new(0, 0, &template); + return (struct dn_queue *)fs->qht; + } +} +/*--- end of queue hash table ---*/ + +/*--- support functions for the sch_inst hashtable ---- + * + * These are hashed by flow-id + */ +static uint32_t +si_hash(uintptr_t key, int flags, void *arg) +{ + /* compute the hash slot from the flow id */ + struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? + &((struct dn_sch_inst *)key)->ni.fid : + (struct ipfw_flow_id *)key; + + return flow_id_hash(id); +} + +static int +si_match(void *obj, uintptr_t key, int flags, void *arg) +{ + struct dn_sch_inst *o = obj; + struct ipfw_flow_id *id2; + + id2 = (flags & DNHT_KEY_IS_OBJ) ? + &((struct dn_sch_inst *)key)->ni.fid : + (struct ipfw_flow_id *)key; + return flow_id_cmp(&o->ni.fid, id2) == 0; +} + +static int si_reset_credit(void *_si, void *arg); // XXX si_new use this + +/* + * create a new instance for the given 'key' + * Allocate memory for instance, delay line and scheduler private data. + */ +static void * +si_new(uintptr_t key, int flags, void *arg) +{ + struct dn_schk *s = arg; + struct dn_sch_inst *si; + int l = sizeof(*si) + s->fp->si_datalen; + + si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); + if (si == NULL) + goto error; + + /* Set length only for the part passed up to userland. 
*/ + set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow)); + set_oid(&(si->dline.oid), DN_DELAY_LINE, + sizeof(struct delay_line)); + /* mark si and dline as outside the event queue */ + si->ni.oid.id = si->dline.oid.id = -1; + + si->sched = s; + si->dline.si = si; + + if (s->fp->new_sched && s->fp->new_sched(si)) { + D("new_sched error"); + goto error; + } + if (s->sch.flags & DN_HAVE_MASK) + si->ni.fid = *(struct ipfw_flow_id *)key; + + si_reset_credit(si, NULL); + dn_cfg.si_count++; + dn_cfg.idle_si++; + return si; + +error: + if (si) { + bzero(si, sizeof(*si)); // safety + free(si, M_DUMMYNET); + } + return NULL; +} + +/* + * Callback from siht to delete all scheduler instances. Remove + * si and delay line from the system heap, destroy all queues. + * We assume that all flowset have been notified and do not + * point to us anymore. + */ +static int +si_destroy(void *_si, void *arg) +{ + struct dn_sch_inst *si = _si; + struct dn_schk *s = si->sched; + struct delay_line *dl = &si->dline; + + if (dl->oid.subtype) /* remove delay line from event heap */ + heap_extract(&dn_cfg.evheap, dl); + if (si->ni.length == 0) + dn_cfg.idle_si--; + dn_free_pkts(dl->mq.head); /* drain delay line */ + if (si->kflags & DN_ACTIVE) /* remove si from event heap */ + heap_extract(&dn_cfg.evheap, si); + if (s->fp->free_sched) + s->fp->free_sched(si); + bzero(si, sizeof(*si)); /* safety */ + free(si, M_DUMMYNET); + dn_cfg.si_count--; + return DNHT_SCAN_DEL; +} + +/* + * Find the scheduler instance for this packet. If we need to apply + * a mask, do on a local copy of the flow_id to preserve the original. + * Assume siht is always initialized if we have a mask. 
+ */ +struct dn_sch_inst * +ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id) +{ + + if (s->sch.flags & DN_HAVE_MASK) { + struct ipfw_flow_id id_t = *id; + flow_id_mask(&s->sch.sched_mask, &id_t); + return dn_ht_find(s->siht, (uintptr_t)&id_t, + DNHT_INSERT, s); + } + if (!s->siht) + s->siht = si_new(0, 0, s); + return (struct dn_sch_inst *)s->siht; +} + +/* callback to flush credit for the scheduler instance */ +static int +si_reset_credit(void *_si, void *arg) +{ + struct dn_sch_inst *si = _si; + struct dn_link *p = &si->sched->link; + + si->idle_time = dn_cfg.curr_time; + si->credit = p->burst + (dn_cfg.io_fast ? p->bandwidth : 0); + return 0; +} + +static void +schk_reset_credit(struct dn_schk *s) +{ + if (s->sch.flags & DN_HAVE_MASK) + dn_ht_scan(s->siht, si_reset_credit, NULL); + else if (s->siht) + si_reset_credit(s->siht, NULL); +} +/*---- end of sch_inst hashtable ---------------------*/ + +/*------------------------------------------------------- + * flowset hash (fshash) support. Entries are hashed by fs_nr. + * New allocations are put in the fsunlinked list, from which + * they are removed when they point to a specific scheduler. + */ +static uint32_t +fsk_hash(uintptr_t key, int flags, void *arg) +{ + uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_fsk *)key)->fs.fs_nr; + + return ( (i>>8)^(i>>4)^i ); +} + +static int +fsk_match(void *obj, uintptr_t key, int flags, void *arg) +{ + struct dn_fsk *fs = obj; + int i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_fsk *)key)->fs.fs_nr; + + return (fs->fs.fs_nr == i); +} + +static void * +fsk_new(uintptr_t key, int flags, void *arg) +{ + struct dn_fsk *fs; + + fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO); + if (fs) { + set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs)); + dn_cfg.fsk_count++; + fs->drain_bucket = 0; + SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); + } + return fs; +} + +/* + * detach flowset from its current scheduler. 
Flags as follows: + * DN_DETACH removes from the fsk_list + * DN_DESTROY deletes individual queues + * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked). + */ +static void +fsk_detach(struct dn_fsk *fs, int flags) +{ + if (flags & DN_DELETE_FS) + flags |= DN_DESTROY; + ND("fs %d from sched %d flags %s %s %s", + fs->fs.fs_nr, fs->fs.sched_nr, + (flags & DN_DELETE_FS) ? "DEL_FS":"", + (flags & DN_DESTROY) ? "DEL":"", + (flags & DN_DETACH) ? "DET":""); + if (flags & DN_DETACH) { /* detach from the list */ + struct dn_fsk_head *h; + h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu; + SLIST_REMOVE(h, fs, dn_fsk, sch_chain); + } + /* Free the RED parameters, they will be recomputed on + * subsequent attach if needed. + */ + if (fs->w_q_lookup) + free(fs->w_q_lookup, M_DUMMYNET); + fs->w_q_lookup = NULL; + qht_delete(fs, flags); + if (fs->sched && fs->sched->fp->free_fsk) + fs->sched->fp->free_fsk(fs); + fs->sched = NULL; + if (flags & DN_DELETE_FS) { + bzero(fs, sizeof(fs)); /* safety */ + free(fs, M_DUMMYNET); + dn_cfg.fsk_count--; + } else { + SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); + } +} + +/* + * Detach or destroy all flowsets in a list. + * flags specifies what to do: + * DN_DESTROY: flush all queues + * DN_DELETE_FS: DN_DESTROY + destroy flowset + * DN_DELETE_FS implies DN_DESTROY + */ +static void +fsk_detach_list(struct dn_fsk_head *h, int flags) +{ + struct dn_fsk *fs; + int n = 0; /* only for stats */ + + ND("head %p flags %x", h, flags); + while ((fs = SLIST_FIRST(h))) { + SLIST_REMOVE_HEAD(h, sch_chain); + n++; + fsk_detach(fs, flags); + } + ND("done %d flowsets", n); +} + +/* + * called on 'queue X delete' -- removes the flowset from fshash, + * deletes all queues for the flowset, and removes the flowset. 
+ */ +static int +delete_fs(int i, int locked) +{ + struct dn_fsk *fs; + int err = 0; + + if (!locked) + DN_BH_WLOCK(); + fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL); + if (dn_ht_entries(dn_cfg.fshash) == 0) { + dn_ht_free(dn_cfg.fshash, 0); + dn_cfg.fshash = NULL; + } + ND("fs %d found %p", i, fs); + if (fs) { + fsk_detach(fs, DN_DETACH | DN_DELETE_FS); + err = 0; + } else + err = EINVAL; + if (!locked) + DN_BH_WUNLOCK(); + return err; +} + +/*----- end of flowset hashtable support -------------*/ + +/*------------------------------------------------------------ + * Scheduler hash. When searching by index we pass sched_nr, + * otherwise we pass struct dn_sch * which is the first field in + * struct dn_schk so we can cast between the two. We use this trick + * because in the create phase (but it should be fixed). + */ +static uint32_t +schk_hash(uintptr_t key, int flags, void *_arg) +{ + uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_schk *)key)->sch.sched_nr; + return ( (i>>8)^(i>>4)^i ); +} + +static int +schk_match(void *obj, uintptr_t key, int flags, void *_arg) +{ + struct dn_schk *s = (struct dn_schk *)obj; + int i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_schk *)key)->sch.sched_nr; + return (s->sch.sched_nr == i); +} + +/* + * Create the entry and intialize with the sched hash if needed. + * Leave s->fp unset so we can tell whether a dn_ht_find() returns + * a new object or a previously existing one. 
+ */ +static void * +schk_new(uintptr_t key, int flags, void *arg) +{ + struct schk_new_arg *a = arg; + struct dn_schk *s; + int l = sizeof(*s) +a->fp->schk_datalen; + + s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); + if (s == NULL) + return NULL; + set_oid(&s->link.oid, DN_LINK, sizeof(s->link)); + s->sch = *a->sch; // copy initial values + s->link.link_nr = s->sch.sched_nr; + SLIST_INIT(&s->fsk_list); + /* initialize the hash table or create the single instance */ + s->fp = a->fp; /* si_new needs this */ + s->drain_bucket = 0; + if (s->sch.flags & DN_HAVE_MASK) { + s->siht = dn_ht_init(NULL, s->sch.buckets, + offsetof(struct dn_sch_inst, si_next), + si_hash, si_match, si_new); + if (s->siht == NULL) { + free(s, M_DUMMYNET); + return NULL; + } + } + s->fp = NULL; /* mark as a new scheduler */ + dn_cfg.schk_count++; + return s; +} + +/* + * Callback for sched delete. Notify all attached flowsets to + * detach from the scheduler, destroy the internal flowset, and + * all instances. The scheduler goes away too. + * arg is 0 (only detach flowsets and destroy instances) + * DN_DESTROY (detach & delete queues, delete schk) + * or DN_DELETE_FS (delete queues and flowsets, delete schk) + */ +static int +schk_delete_cb(void *obj, void *arg) +{ + struct dn_schk *s = obj; +#if 0 + int a = (int)arg; + ND("sched %d arg %s%s", + s->sch.sched_nr, + a&DN_DESTROY ? "DEL ":"", + a&DN_DELETE_FS ? "DEL_FS":""); +#endif + fsk_detach_list(&s->fsk_list, arg ? DN_DESTROY : 0); + /* no more flowset pointing to us now */ + if (s->sch.flags & DN_HAVE_MASK) { + dn_ht_scan(s->siht, si_destroy, NULL); + dn_ht_free(s->siht, 0); + } + else if (s->siht) + si_destroy(s->siht, NULL); + if (s->profile) { + free(s->profile, M_DUMMYNET); + s->profile = NULL; + } + s->siht = NULL; + if (s->fp->destroy) + s->fp->destroy(s); + bzero(s, sizeof(*s)); // safety + free(obj, M_DUMMYNET); + dn_cfg.schk_count--; + return DNHT_SCAN_DEL; +} + +/* + * called on a 'sched X delete' command. 
Deletes a single scheduler. + * This is done by removing from the schedhash, unlinking all + * flowsets and deleting their traffic. + */ +static int +delete_schk(int i) +{ + struct dn_schk *s; + + s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); + if (dn_ht_entries(dn_cfg.schedhash) == 0) { + dn_ht_free(dn_cfg.schedhash, 0); + dn_cfg.schedhash = NULL; + } + ND("%d %p", i, s); + if (!s) + return EINVAL; + delete_fs(i + DN_MAX_ID, 1); /* first delete internal fs */ + /* then detach flowsets, delete traffic */ + schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY); + return 0; +} +/*--- end of schk hashtable support ---*/ + +static int +copy_obj(char **start, char *end, void *_o, const char *msg, int i) +{ + struct dn_id *o = _o; + int have = end - *start; + + if (have < o->len || o->len == 0 || o->type == 0) { + D("(WARN) type %d %s %d have %d need %d", + o->type, msg, i, have, o->len); + return 1; + } + ND("type %d %s %d len %d", o->type, msg, i, o->len); + bcopy(_o, *start, o->len); + if (o->type == DN_LINK) { + /* Adjust burst parameter for link */ + struct dn_link *l = (struct dn_link *)*start; + l->burst = div64(l->burst, 8 * hz); + } else if (o->type == DN_SCH) { + /* Set id->id to the number of instances */ + struct dn_schk *s = _o; + struct dn_id *id = (struct dn_id *)(*start); + id->id = (s->sch.flags & DN_HAVE_MASK) ? + dn_ht_entries(s->siht) : (s->siht ? 1 : 0); + } + *start += o->len; + return 0; +} + +/* Specific function to copy a queue. + * Copies only the user-visible part of a queue (which is in + * a struct dn_flow), and sets len accordingly. 
+ */ +static int +copy_obj_q(char **start, char *end, void *_o, const char *msg, int i) +{ + struct dn_id *o = _o; + int have = end - *start; + int len = sizeof(struct dn_flow); /* see above comment */ + + if (have < len || o->len == 0 || o->type != DN_QUEUE) { + D("ERROR type %d %s %d have %d need %d", + o->type, msg, i, have, len); + return 1; + } + ND("type %d %s %d len %d", o->type, msg, i, len); + bcopy(_o, *start, len); + ((struct dn_id*)(*start))->len = len; + *start += len; + return 0; +} + +static int +copy_q_cb(void *obj, void *arg) +{ + struct dn_queue *q = obj; + struct copy_args *a = arg; + struct dn_flow *ni = (struct dn_flow *)(*a->start); + if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1)) + return DNHT_SCAN_END; + ni->oid.type = DN_FLOW; /* override the DN_QUEUE */ + ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL); + return 0; +} + +static int +copy_q(struct copy_args *a, struct dn_fsk *fs, int flags) +{ + if (!fs->qht) + return 0; + if (fs->fs.flags & DN_QHT_HASH) + dn_ht_scan(fs->qht, copy_q_cb, a); + else + copy_q_cb(fs->qht, a); + return 0; +} + +/* + * This routine only copies the initial part of a profile ? XXX + * XXX marta: I think this routine is called to print a summary + * of the pipe configuration and does not need to show the + * profile samples list. 
+ */ +static int +copy_profile(struct copy_args *a, struct dn_profile *p) +{ + int have = a->end - *a->start; + /* XXX here we check for max length */ + int profile_len = sizeof(struct dn_profile); + + if (p == NULL) + return 0; + if (have < profile_len) { + D("error have %d need %d", have, profile_len); + return 1; + } + bcopy(p, *a->start, profile_len); + ((struct dn_id *)(*a->start))->len = profile_len; + *a->start += profile_len; + return 0; +} + +static int +copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags) +{ + struct dn_fs *ufs = (struct dn_fs *)(*a->start); + if (!fs) + return 0; + ND("flowset %d", fs->fs.fs_nr); + if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr)) + return DNHT_SCAN_END; + ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ? + dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0); + if (flags) { /* copy queues */ + copy_q(a, fs, 0); + } + return 0; +} + +static int +copy_si_cb(void *obj, void *arg) +{ + struct dn_sch_inst *si = obj; + struct copy_args *a = arg; + struct dn_flow *ni = (struct dn_flow *)(*a->start); + if (copy_obj(a->start, a->end, &si->ni, "inst", + si->sched->sch.sched_nr)) + return DNHT_SCAN_END; + ni->oid.type = DN_FLOW; /* override the DN_SCH_I */ + ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL); + return 0; +} + +static int +copy_si(struct copy_args *a, struct dn_schk *s, int flags) +{ + if (s->sch.flags & DN_HAVE_MASK) + dn_ht_scan(s->siht, copy_si_cb, a); + else if (s->siht) + copy_si_cb(s->siht, a); + return 0; +} + +/* + * compute a list of children of a scheduler and copy up + */ +static int +copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags) +{ + struct dn_fsk *fs; + struct dn_id *o; + uint32_t *p; + + int n = 0, space = sizeof(*o); + SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { + if (fs->fs.fs_nr < DN_MAX_ID) + n++; + } + space += n * sizeof(uint32_t); + DX(3, "sched %d has %d flowsets", s->sch.sched_nr, n); + if (a->end - *(a->start) < space) + return DNHT_SCAN_END; + o = 
(struct dn_id *)(*(a->start)); + o->len = space; + *a->start += o->len; + o->type = DN_TEXT; + p = (uint32_t *)(o+1); + SLIST_FOREACH(fs, &s->fsk_list, sch_chain) + if (fs->fs.fs_nr < DN_MAX_ID) + *p++ = fs->fs.fs_nr; + return 0; +} + +static int +copy_data_helper(void *_o, void *_arg) +{ + struct copy_args *a = _arg; + uint32_t *r = a->extra->r; /* start of first range */ + uint32_t *lim; /* first invalid pointer */ + int n; + + lim = (uint32_t *)((char *)(a->extra) + a->extra->o.len); + + if (a->type == DN_LINK || a->type == DN_SCH) { + /* pipe|sched show, we receive a dn_schk */ + struct dn_schk *s = _o; + + n = s->sch.sched_nr; + if (a->type == DN_SCH && n >= DN_MAX_ID) + return 0; /* not a scheduler */ + if (a->type == DN_LINK && n <= DN_MAX_ID) + return 0; /* not a pipe */ + + /* see if the object is within one of our ranges */ + for (;r < lim; r += 2) { + if (n < r[0] || n > r[1]) + continue; + /* Found a valid entry, copy and we are done */ + if (a->flags & DN_C_LINK) { + if (copy_obj(a->start, a->end, + &s->link, "link", n)) + return DNHT_SCAN_END; + if (copy_profile(a, s->profile)) + return DNHT_SCAN_END; + if (copy_flowset(a, s->fs, 0)) + return DNHT_SCAN_END; + } + if (a->flags & DN_C_SCH) { + if (copy_obj(a->start, a->end, + &s->sch, "sched", n)) + return DNHT_SCAN_END; + /* list all attached flowsets */ + if (copy_fsk_list(a, s, 0)) + return DNHT_SCAN_END; + } + if (a->flags & DN_C_FLOW) + copy_si(a, s, 0); + break; + } + } else if (a->type == DN_FS) { + /* queue show, skip internal flowsets */ + struct dn_fsk *fs = _o; + + n = fs->fs.fs_nr; + if (n >= DN_MAX_ID) + return 0; + /* see if the object is within one of our ranges */ + for (;r < lim; r += 2) { + if (n < r[0] || n > r[1]) + continue; + if (copy_flowset(a, fs, 0)) + return DNHT_SCAN_END; + copy_q(a, fs, 0); + break; /* we are done */ + } + } + return 0; +} + +static inline struct dn_schk * +locate_scheduler(int i) +{ + return dn_ht_find(dn_cfg.schedhash, i, 0, NULL); +} + +/* + * red 
parameters are in fixed point arithmetic. + */ +static int +config_red(struct dn_fsk *fs) +{ + int64_t s, idle, weight, w0; + int t, i; + + fs->w_q = fs->fs.w_q; + fs->max_p = fs->fs.max_p; + D("called"); + /* Doing stuff that was in userland */ + i = fs->sched->link.bandwidth; + s = (i <= 0) ? 0 : + hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i; + + idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */ + fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth); + /* fs->lookup_step not scaled, */ + if (!fs->lookup_step) + fs->lookup_step = 1; + w0 = weight = SCALE(1) - fs->w_q; //fs->w_q scaled + + for (t = fs->lookup_step; t > 1; --t) + weight = SCALE_MUL(weight, w0); + fs->lookup_weight = (int)(weight); // scaled + + /* Now doing stuff that was in kerneland */ + fs->min_th = SCALE(fs->fs.min_th); + fs->max_th = SCALE(fs->fs.max_th); + + fs->c_1 = fs->max_p / (fs->fs.max_th - fs->fs.min_th); + fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th)); + + if (fs->fs.flags & DN_IS_GENTLE_RED) { + fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th; + fs->c_4 = SCALE(1) - 2 * fs->max_p; + } + + /* If the lookup table already exist, free and create it again. 
*/ + if (fs->w_q_lookup) { + free(fs->w_q_lookup, M_DUMMYNET); + fs->w_q_lookup = NULL; + } + if (dn_cfg.red_lookup_depth == 0) { + printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth" + "must be > 0\n"); + fs->fs.flags &= ~DN_IS_RED; + fs->fs.flags &= ~DN_IS_GENTLE_RED; + return (EINVAL); + } + fs->lookup_depth = dn_cfg.red_lookup_depth; + fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int), + M_DUMMYNET, M_NOWAIT); + if (fs->w_q_lookup == NULL) { + printf("dummynet: sorry, cannot allocate red lookup table\n"); + fs->fs.flags &= ~DN_IS_RED; + fs->fs.flags &= ~DN_IS_GENTLE_RED; + return(ENOSPC); + } + + /* Fill the lookup table with (1 - w_q)^x */ + fs->w_q_lookup[0] = SCALE(1) - fs->w_q; + + for (i = 1; i < fs->lookup_depth; i++) + fs->w_q_lookup[i] = + SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight); + + if (dn_cfg.red_avg_pkt_size < 1) + dn_cfg.red_avg_pkt_size = 512; + fs->avg_pkt_size = dn_cfg.red_avg_pkt_size; + if (dn_cfg.red_max_pkt_size < 1) + dn_cfg.red_max_pkt_size = 1500; + fs->max_pkt_size = dn_cfg.red_max_pkt_size; + D("exit"); + return 0; +} + +/* Scan all flowset attached to this scheduler and update red */ +static void +update_red(struct dn_schk *s) +{ + struct dn_fsk *fs; + SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { + if (fs && (fs->fs.flags & DN_IS_RED)) + config_red(fs); + } +} + +/* attach flowset to scheduler s, possibly requeue */ +static void +fsk_attach(struct dn_fsk *fs, struct dn_schk *s) +{ + ND("remove fs %d from fsunlinked, link to sched %d", + fs->fs.fs_nr, s->sch.sched_nr); + SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain); + fs->sched = s; + SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain); + if (s->fp->new_fsk) + s->fp->new_fsk(fs); + /* XXX compute fsk_mask */ + fs->fsk_mask = fs->fs.flow_mask; + if (fs->sched->sch.flags & DN_HAVE_MASK) + flow_id_or(&fs->sched->sch.sched_mask, &fs->fsk_mask); + if (fs->qht) { + /* + * we must drain qht according to the old + * type, and reinsert according to the new one. 
+ * The requeue is complex -- in general we need to + * reclassify every single packet. + * For the time being, let's hope qht is never set + * when we reach this point. + */ + D("XXX TODO requeue from fs %d to sch %d", + fs->fs.fs_nr, s->sch.sched_nr); + fs->qht = NULL; + } + /* set the new type for qht */ + if (nonzero_mask(&fs->fsk_mask)) + fs->fs.flags |= DN_QHT_HASH; + else + fs->fs.flags &= ~DN_QHT_HASH; + + /* XXX config_red() can fail... */ + if (fs->fs.flags & DN_IS_RED) + config_red(fs); +} + +/* update all flowsets which may refer to this scheduler */ +static void +update_fs(struct dn_schk *s) +{ + struct dn_fsk *fs, *tmp; + + SLIST_FOREACH_SAFE(fs, &dn_cfg.fsu, sch_chain, tmp) { + if (s->sch.sched_nr != fs->fs.sched_nr) { + D("fs %d for sch %d not %d still unlinked", + fs->fs.fs_nr, fs->fs.sched_nr, + s->sch.sched_nr); + continue; + } + fsk_attach(fs, s); + } +} + +/* + * Configuration -- to preserve backward compatibility we use + * the following scheme (N is 65536) + * NUMBER SCHED LINK FLOWSET + * 1 .. N-1 (1)WFQ (2)WFQ (3)queue + * N+1 .. 2N-1 (4)FIFO (5)FIFO (6)FIFO for sched 1..N-1 + * 2N+1 .. 3N-1 -- -- (7)FIFO for sched N+1..2N-1 + * + * "pipe i config" configures #1, #2 and #3 + * "sched i config" configures #1 and possibly #6 + * "queue i config" configures #3 + * #1 is configured with 'pipe i config' or 'sched i config' + * #2 is configured with 'pipe i config', and created if not + * existing with 'sched i config' + * #3 is configured with 'queue i config' + * #4 is automatically configured after #1, can only be FIFO + * #5 is automatically configured after #2 + * #6 is automatically created when #1 is !MULTIQUEUE, + * and can be updated. 
+ * #7 is automatically configured after #2 + */ + +/* + * configure a link (and its FIFO instance) + */ +static int +config_link(struct dn_link *p, struct dn_id *arg) +{ + int i; + + if (p->oid.len != sizeof(*p)) { + D("invalid pipe len %d", p->oid.len); + return EINVAL; + } + i = p->link_nr; + if (i <= 0 || i >= DN_MAX_ID) + return EINVAL; + /* + * The config program passes parameters as follows: + * bw = bits/second (0 means no limits), + * delay = ms, must be translated into ticks. + * qsize = slots/bytes + * burst ??? + */ + p->delay = (p->delay * hz) / 1000; + /* Scale burst size: bytes -> bits * hz */ + p->burst *= 8 * hz; + + DN_BH_WLOCK(); + /* do it twice, base link and FIFO link */ + for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { + struct dn_schk *s = locate_scheduler(i); + if (s == NULL) { + DN_BH_WUNLOCK(); + D("sched %d not found", i); + return EINVAL; + } + /* remove profile if exists */ + if (s->profile) { + free(s->profile, M_DUMMYNET); + s->profile = NULL; + } + /* copy all parameters */ + s->link.oid = p->oid; + s->link.link_nr = i; + s->link.delay = p->delay; + if (s->link.bandwidth != p->bandwidth) { + /* XXX bandwidth changes, need to update red params */ + s->link.bandwidth = p->bandwidth; + update_red(s); + } + s->link.burst = p->burst; + schk_reset_credit(s); + } + dn_cfg.id++; + DN_BH_WUNLOCK(); + return 0; +} + +/* + * configure a flowset. 
Can be called from inside with locked=1, + */ +static struct dn_fsk * +config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked) +{ + int i; + struct dn_fsk *fs; + + if (nfs->oid.len != sizeof(*nfs)) { + D("invalid flowset len %d", nfs->oid.len); + return NULL; + } + i = nfs->fs_nr; + if (i <= 0 || i >= 3*DN_MAX_ID) + return NULL; + ND("flowset %d", i); + /* XXX other sanity checks */ + if (nfs->flags & DN_QSIZE_BYTES) { + ipdn_bound_var(&nfs->qsize, 16384, + 1500, dn_cfg.byte_limit, NULL); // "queue byte size"); + } else { + ipdn_bound_var(&nfs->qsize, 50, + 1, dn_cfg.slot_limit, NULL); // "queue slot size"); + } + if (nfs->flags & DN_HAVE_MASK) { + /* make sure we have some buckets */ + ipdn_bound_var(&nfs->buckets, dn_cfg.hash_size, + 1, dn_cfg.max_hash_size, "flowset buckets"); + } else { + nfs->buckets = 1; /* we only need 1 */ + } + if (!locked) + DN_BH_WLOCK(); + if (dn_cfg.fshash == NULL) + dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size, + offsetof(struct dn_fsk, fsk_next), + fsk_hash, fsk_match, fsk_new); + do { /* exit with break when done */ + struct dn_schk *s; + int flags = nfs->sched_nr ? DNHT_INSERT : 0; + int j; + int oldc = dn_cfg.fsk_count; + fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL); + if (fs == NULL) { + D("missing sched for flowset %d", i); + break; + } + /* grab some defaults from the existing one */ + if (nfs->sched_nr == 0) /* reuse */ + nfs->sched_nr = fs->fs.sched_nr; + for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) { + if (nfs->par[j] == -1) /* reuse */ + nfs->par[j] = fs->fs.par[j]; + } + if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) { + ND("flowset %d unchanged", i); + break; /* no change, nothing to do */ + } + if (oldc != dn_cfg.fsk_count) /* new item */ + dn_cfg.id++; + s = locate_scheduler(nfs->sched_nr); + /* detach from old scheduler if needed, preserving + * queues if we need to reattach. Then update the + * configuration, and possibly attach to the new sched. 
+ */ + DX(2, "fs %d changed sched %d@%p to %d@%p", + fs->fs.fs_nr, + fs->fs.sched_nr, fs->sched, nfs->sched_nr, s); + if (fs->sched) { + int flags = s ? DN_DETACH : (DN_DETACH | DN_DESTROY); + flags |= DN_DESTROY; /* XXX temporary */ + fsk_detach(fs, flags); + } + fs->fs = *nfs; /* copy configuration */ + if (s != NULL) + fsk_attach(fs, s); + } while (0); + if (!locked) + DN_BH_WUNLOCK(); + return fs; +} + +/* + * config/reconfig a scheduler and its FIFO variant. + * For !MULTIQUEUE schedulers, also set up the flowset. + * + * On reconfigurations (detected because s->fp is set), + * detach existing flowsets preserving traffic, preserve link, + * and delete the old scheduler creating a new one. + */ +static int +config_sched(struct dn_sch *_nsch, struct dn_id *arg) +{ + struct dn_schk *s; + struct schk_new_arg a; /* argument for schk_new */ + int i; + struct dn_link p; /* copy of oldlink */ + struct dn_profile *pf = NULL; /* copy of old link profile */ + /* Used to preserv mask parameter */ + struct ipfw_flow_id new_mask; + int new_buckets = 0; + int new_flags = 0; + int pipe_cmd; + int err = ENOMEM; + + a.sch = _nsch; + if (a.sch->oid.len != sizeof(*a.sch)) { + D("bad sched len %d", a.sch->oid.len); + return EINVAL; + } + i = a.sch->sched_nr; + if (i <= 0 || i >= DN_MAX_ID) + return EINVAL; + /* make sure we have some buckets */ + if (a.sch->flags & DN_HAVE_MASK) + ipdn_bound_var(&a.sch->buckets, dn_cfg.hash_size, + 1, dn_cfg.max_hash_size, "sched buckets"); + /* XXX other sanity checks */ + bzero(&p, sizeof(p)); + + pipe_cmd = a.sch->flags & DN_PIPE_CMD; + a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if is not set? 
+ if (pipe_cmd) { + /* Copy mask parameter */ + new_mask = a.sch->sched_mask; + new_buckets = a.sch->buckets; + new_flags = a.sch->flags; + } + DN_BH_WLOCK(); + if (dn_cfg.schedhash == NULL) + dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size, + offsetof(struct dn_schk, schk_next), + schk_hash, schk_match, schk_new); +again: /* run twice, for wfq and fifo */ + /* + * lookup the type. If not supplied, use the previous one + * or default to WF2Q+. Otherwise, return an error. + */ + dn_cfg.id++; + a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name); + if (a.fp != NULL) { + /* found. Lookup or create entry */ + s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a); + } else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) { + /* No type. search existing s* or retry with WF2Q+ */ + s = dn_ht_find(dn_cfg.schedhash, i, 0, &a); + if (s != NULL) { + a.fp = s->fp; + /* Scheduler exists, skip to FIFO scheduler + * if command was pipe config... + */ + if (pipe_cmd) + goto next; + } else { + /* New scheduler, create a wf2q+ with no mask + * if command was pipe config... 
+ */ + if (pipe_cmd) { + /* clear mask parameter */ + bzero(&a.sch->sched_mask, sizeof(new_mask)); + a.sch->buckets = 0; + a.sch->flags &= ~DN_HAVE_MASK; + } + a.sch->oid.subtype = DN_SCHED_WF2QP; + goto again; + } + } else { + D("invalid scheduler type %d %s", + a.sch->oid.subtype, a.sch->name); + err = EINVAL; + goto error; + } + /* normalize name and subtype */ + a.sch->oid.subtype = a.fp->type; + bzero(a.sch->name, sizeof(a.sch->name)); + strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name)); + if (s == NULL) { + D("cannot allocate scheduler %d", i); + goto error; + } + /* restore existing link if any */ + if (p.link_nr) { + s->link = p; + if (!pf || pf->link_nr != p.link_nr) { /* no saved value */ + s->profile = NULL; /* XXX maybe not needed */ + } else { + size_t pf_size = sizeof(struct dn_profile) + + s->profile->samples_no * sizeof(int); + + s->profile = malloc(pf_size, + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (s->profile == NULL) { + D("cannot allocate profile"); + goto error; //XXX + } + bcopy(pf, s->profile, pf_size); + } + } + p.link_nr = 0; + if (s->fp == NULL) { + DX(2, "sched %d new type %s", i, a.fp->name); + } else if (s->fp != a.fp || + bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) { + /* already existing. */ + DX(2, "sched %d type changed from %s to %s", + i, s->fp->name, a.fp->name); + DX(4, " type/sub %d/%d -> %d/%d", + s->sch.oid.type, s->sch.oid.subtype, + a.sch->oid.type, a.sch->oid.subtype); + if (s->link.link_nr == 0) + D("XXX WARNING link 0 for sched %d", i); + p = s->link; /* preserve link */ + if (s->profile) {/* preserve profile */ + if (!pf) + pf = malloc(sizeof(*pf), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (pf) /* XXX should issue a warning otherwise */ + bcopy(s->profile, pf, sizeof(*pf)); + } + /* remove from the hash */ + dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); + /* Detach flowsets, preserve queues. 
*/ + // schk_delete_cb(s, NULL); + // XXX temporarily, kill queues + schk_delete_cb(s, (void *)DN_DESTROY); + goto again; + } else { + DX(4, "sched %d unchanged type %s", i, a.fp->name); + } + /* complete initialization */ + s->sch = *a.sch; + s->fp = a.fp; + s->cfg = arg; + // XXX schk_reset_credit(s); + /* create the internal flowset if needed, + * trying to reuse existing ones if available + */ + if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) { + s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL); + if (!s->fs) { + struct dn_fs fs; + bzero(&fs, sizeof(fs)); + set_oid(&fs.oid, DN_FS, sizeof(fs)); + fs.fs_nr = i + DN_MAX_ID; + fs.sched_nr = i; + s->fs = config_fs(&fs, NULL, 1 /* locked */); + } + if (!s->fs) { + schk_delete_cb(s, (void *)DN_DESTROY); + D("error creating internal fs for %d", i); + goto error; + } + } + /* call init function after the flowset is created */ + if (s->fp->config) + s->fp->config(s); + update_fs(s); +next: + if (i < DN_MAX_ID) { /* now configure the FIFO instance */ + i += DN_MAX_ID; + if (pipe_cmd) { + /* Restore mask parameter for FIFO */ + a.sch->sched_mask = new_mask; + a.sch->buckets = new_buckets; + a.sch->flags = new_flags; + } else { + /* sched config shouldn't modify the FIFO scheduler */ + if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) { + /* FIFO already exist, don't touch it */ + err = 0; /* and this is not an error */ + goto error; + } + } + a.sch->sched_nr = i; + a.sch->oid.subtype = DN_SCHED_FIFO; + bzero(a.sch->name, sizeof(a.sch->name)); + goto again; + } + err = 0; +error: + DN_BH_WUNLOCK(); + if (pf) + free(pf, M_DUMMYNET); + return err; +} + +/* + * attach a profile to a link + */ +static int +config_profile(struct dn_profile *pf, struct dn_id *arg) +{ + struct dn_schk *s; + int i, olen, err = 0; + + if (pf->oid.len < sizeof(*pf)) { + D("short profile len %d", pf->oid.len); + return EINVAL; + } + i = pf->link_nr; + if (i <= 0 || i >= DN_MAX_ID) + return EINVAL; + /* XXX other sanity checks */ + DN_BH_WLOCK(); + for 
(; i < 2*DN_MAX_ID; i += DN_MAX_ID) { + s = locate_scheduler(i); + + if (s == NULL) { + err = EINVAL; + break; + } + dn_cfg.id++; + /* + * If we had a profile and the new one does not fit, + * or it is deleted, then we need to free memory. + */ + if (s->profile && (pf->samples_no == 0 || + s->profile->oid.len < pf->oid.len)) { + free(s->profile, M_DUMMYNET); + s->profile = NULL; + } + if (pf->samples_no == 0) + continue; + /* + * new profile, possibly allocate memory + * and copy data. + */ + if (s->profile == NULL) + s->profile = malloc(pf->oid.len, + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (s->profile == NULL) { + D("no memory for profile %d", i); + err = ENOMEM; + break; + } + /* preserve larger length XXX double check */ + olen = s->profile->oid.len; + if (olen < pf->oid.len) + olen = pf->oid.len; + bcopy(pf, s->profile, pf->oid.len); + s->profile->oid.len = olen; + } + + DN_BH_WUNLOCK(); + return err; +} + +/* + * Delete all objects: + */ +static void +dummynet_flush(void) +{ + + /* delete all schedulers and related links/queues/flowsets */ + dn_ht_scan(dn_cfg.schedhash, schk_delete_cb, + (void *)(uintptr_t)DN_DELETE_FS); + /* delete all remaining (unlinked) flowsets */ + DX(4, "still %d unlinked fs", dn_cfg.fsk_count); + dn_ht_free(dn_cfg.fshash, DNHT_REMOVE); + fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS); + + dn_ht_free(dn_cfg.schedhash, DNHT_REMOVE); + /* Reinitialize system heap... */ + heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); +} + +/* + * Main handler for configuration. We are guaranteed to be called + * with an oid which is at least a dn_id. + * - the first object is the command (config, delete, flush, ...) + * - config_link must be issued after the corresponding config_sched + * - parameters (DN_TXT) for an object must preceed the object + * processed on a config_sched. 
+ */ +int +do_config(void *p, int l) +{ + struct dn_id *next, *o; + int err = 0, err2 = 0; + struct dn_id *arg = NULL; + uintptr_t *a; + + o = p; + if (o->id != DN_API_VERSION) { + D("invalid api version got %d need %d", + o->id, DN_API_VERSION); + return EINVAL; + } + for (; l >= sizeof(*o); o = next) { + struct dn_id *prev = arg; + if (o->len < sizeof(*o) || l < o->len) { + D("bad len o->len %d len %d", o->len, l); + err = EINVAL; + break; + } + l -= o->len; + next = (struct dn_id *)((char *)o + o->len); + err = 0; + switch (o->type) { + default: + D("cmd %d not implemented", o->type); + break; + +#ifdef EMULATE_SYSCTL + /* sysctl emulation. + * if we recognize the command, jump to the correct + * handler and return + */ + case DN_SYSCTL_SET: + err = kesysctl_emu_set(p, l); + return err; +#endif + + case DN_CMD_CONFIG: /* simply a header */ + break; + + case DN_CMD_DELETE: + /* the argument is in the first uintptr_t after o */ + a = (uintptr_t *)(o+1); + if (o->len < sizeof(*o) + sizeof(*a)) { + err = EINVAL; + break; + } + switch (o->subtype) { + case DN_LINK: + /* delete base and derived schedulers */ + DN_BH_WLOCK(); + err = delete_schk(*a); + err2 = delete_schk(*a + DN_MAX_ID); + DN_BH_WUNLOCK(); + if (!err) + err = err2; + break; + + default: + D("invalid delete type %d", + o->subtype); + err = EINVAL; + break; + + case DN_FS: + err = (*a <1 || *a >= DN_MAX_ID) ? 
+ EINVAL : delete_fs(*a, 0) ; + break; + } + break; + + case DN_CMD_FLUSH: + DN_BH_WLOCK(); + dummynet_flush(); + DN_BH_WUNLOCK(); + break; + case DN_TEXT: /* store argument the next block */ + prev = NULL; + arg = o; + break; + case DN_LINK: + err = config_link((struct dn_link *)o, arg); + break; + case DN_PROFILE: + err = config_profile((struct dn_profile *)o, arg); + break; + case DN_SCH: + err = config_sched((struct dn_sch *)o, arg); + break; + case DN_FS: + err = (NULL==config_fs((struct dn_fs *)o, arg, 0)); + break; + } + if (prev) + arg = NULL; + if (err != 0) + break; + } + return err; +} + +static int +compute_space(struct dn_id *cmd, struct copy_args *a) +{ + int x = 0, need = 0; + int profile_size = sizeof(struct dn_profile); + + /* NOTE about compute space: + * NP = dn_cfg.schk_count + * NSI = dn_cfg.si_count + * NF = dn_cfg.fsk_count + * NQ = dn_cfg.queue_count + * - ipfw pipe show + * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler + * link, scheduler template, flowset + * integrated in scheduler and header + * for flowset list + * (NSI)*(dn_flow) all scheduler instance (includes + * the queue instance) + * - ipfw sched show + * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler + * link, scheduler template, flowset + * integrated in scheduler and header + * for flowset list + * (NSI * dn_flow) all scheduler instances + * (NF * sizeof(uint_32)) space for flowset list linked to scheduler + * (NQ * dn_queue) all queue [XXXfor now not listed] + * - ipfw queue show + * (NF * dn_fs) all flowset + * (NQ * dn_queue) all queues + */ + switch (cmd->subtype) { + default: + return -1; + /* XXX where do LINK and SCH differ ? */ + /* 'ipfw sched show' could list all queues associated to + * a scheduler. 
This feature for now is disabled + */ + case DN_LINK: /* pipe show */ + x = DN_C_LINK | DN_C_SCH | DN_C_FLOW; + need += dn_cfg.schk_count * + (sizeof(struct dn_fs) + profile_size) / 2; + need += dn_cfg.fsk_count * sizeof(uint32_t); + break; + case DN_SCH: /* sched show */ + need += dn_cfg.schk_count * + (sizeof(struct dn_fs) + profile_size) / 2; + need += dn_cfg.fsk_count * sizeof(uint32_t); + x = DN_C_SCH | DN_C_LINK | DN_C_FLOW; + break; + case DN_FS: /* queue show */ + x = DN_C_FS | DN_C_QUEUE; + break; + case DN_GET_COMPAT: /* compatibility mode */ + need = dn_compat_calc_size(); + break; + } + a->flags = x; + if (x & DN_C_SCH) { + need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2; + /* NOT also, each fs might be attached to a sched */ + need += dn_cfg.schk_count * sizeof(struct dn_id) / 2; + } + if (x & DN_C_FS) + need += dn_cfg.fsk_count * sizeof(struct dn_fs); + if (x & DN_C_LINK) { + need += dn_cfg.schk_count * sizeof(struct dn_link) / 2; + } + /* + * When exporting a queue to userland, only pass up the + * struct dn_flow, which is the only visible part. + */ + + if (x & DN_C_QUEUE) + need += dn_cfg.queue_count * sizeof(struct dn_flow); + if (x & DN_C_FLOW) + need += dn_cfg.si_count * (sizeof(struct dn_flow)); + return need; +} + +/* + * If compat != NULL dummynet_get is called in compatibility mode. 
+ * *compat will be the pointer to the buffer to pass to ipfw + */ +int +dummynet_get(struct sockopt *sopt, void **compat) +{ + int have, i, need, error; + char *start = NULL, *buf; + size_t sopt_valsize; + struct dn_id *cmd; + struct copy_args a; + struct copy_range r; + int l = sizeof(struct dn_id); + + bzero(&a, sizeof(a)); + bzero(&r, sizeof(r)); + + /* save and restore original sopt_valsize around copyin */ + sopt_valsize = sopt->sopt_valsize; + + cmd = &r.o; + + if (!compat) { + /* copy at least an oid, and possibly a full object */ + error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd)); + sopt->sopt_valsize = sopt_valsize; + if (error) + goto done; + l = cmd->len; +#ifdef EMULATE_SYSCTL + /* sysctl emulation. */ + if (cmd->type == DN_SYSCTL_GET) + return kesysctl_emu_get(sopt); +#endif + if (l > sizeof(r)) { + /* request larger than default, allocate buffer */ + cmd = malloc(l, M_DUMMYNET, M_WAIT); + if (cmd == NULL) + return ENOMEM; //XXX + error = sooptcopyin(sopt, cmd, l, l); + sopt->sopt_valsize = sopt_valsize; + if (error) + goto done; + } + } else { /* compatibility */ + error = 0; + cmd->type = DN_CMD_GET; + cmd->len = sizeof(struct dn_id); + cmd->subtype = DN_GET_COMPAT; + // cmd->id = sopt_valsize; + D("compatibility mode"); + } + a.extra = (struct copy_range *)cmd; + if (cmd->len == sizeof(*cmd)) { /* no range, create a default */ + uint32_t *rp = (uint32_t *)(cmd + 1); + cmd->len += 2* sizeof(uint32_t); + rp[0] = 1; + rp[1] = DN_MAX_ID - 1; + if (cmd->subtype == DN_LINK) { + rp[0] += DN_MAX_ID; + rp[1] += DN_MAX_ID; + } + } + /* Count space (under lock) and allocate (outside lock). + * Exit with lock held if we manage to get enough buffer. + * Try a few times then give up. 
+ */ + for (have = 0, i = 0; i < 10; i++) { + DN_BH_WLOCK(); + need = compute_space(cmd, &a); + + /* if there is a range, ignore value from compute_space() */ + if (l > sizeof(*cmd)) + need = sopt_valsize - sizeof(*cmd); + + if (need < 0) { + DN_BH_WUNLOCK(); + error = EINVAL; + goto done; + } + need += sizeof(*cmd); + cmd->id = need; + if (have >= need) /* got space, hold the lock */ + break; + + DN_BH_WUNLOCK(); + if (start) + free(start, M_DUMMYNET); + start = NULL; + if (need > sopt_valsize) + break; + + have = need; + start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO); + if (start == NULL) { + error = ENOMEM; + goto done; + } + } + + if (start == NULL) { + if (compat) { + *compat = NULL; + error = 1; // XXX + } else { + error = sooptcopyout(sopt, cmd, sizeof(*cmd)); + } + /* no enough memory, release the lock and give up */ + /* XXX marta: here we hold the lock */ + goto done; + } + ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, " + "%d:%d si %d, %d:%d queues %d", + dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH, + dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK, + dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS, + dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I, + dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE); + sopt->sopt_valsize = sopt_valsize; + a.type = cmd->subtype; + + if (compat == NULL) { + bcopy(cmd, start, sizeof(*cmd)); + ((struct dn_id*)(start))->len = sizeof(struct dn_id); + buf = start + sizeof(*cmd); + } else + buf = start; + a.start = &buf; + a.end = start + have; + /* start copying other objects */ + if (compat) { + a.type = DN_COMPAT_PIPE; + dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a); + a.type = DN_COMPAT_QUEUE; + dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a); + } else if (a.type == DN_FS) { + dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a); + } else { + dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a); + } + DN_BH_WUNLOCK(); + + if (compat) { + *compat = start; + sopt->sopt_valsize = buf 
- start; + /* free() is done by ip_dummynet_compat() */ + start = NULL; //XXX hack + } else { + error = sooptcopyout(sopt, start, buf - start); + } +done: + if (cmd && cmd != &r.o) + free(cmd, M_DUMMYNET); + if (start) + free(start, M_DUMMYNET); + + return error; +} + +/* + * Functions to drain idle objects -- see dummynet_task() for some notes + */ +/* Callback called on scheduler instance to delete it if idle */ +static int +drain_scheduler_cb(void *_si, void *_arg) +{ + struct dn_sch_inst *si = _si; + int *arg = _arg; + int empty; + + if ( (*arg++) > dn_cfg.expire_object_examined) + return DNHT_SCAN_END; + + if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL) + return 0; + + /* + * if the scheduler is multiqueue, q_count also reflects empty + * queues that point to si, so we need to check si->q_count to + * tell whether we can remove the instance. + */ + if (si->ni.length == 0) { + /* si was marked as idle: + * remove it or increment idle_si_wait counter + */ + empty = (si->sched->fp->flags & DN_MULTIQUEUE) ? + (si->q_count == 0) : 1; + if (empty && + (si->idle_time < dn_cfg.curr_time - dn_cfg.object_idle_tick)) + return si_destroy(si, NULL); + else + dn_cfg.idle_si_wait++; + } + return 0; +} + +/* Callback called on scheduler to check if it has instances */ +static int +drain_scheduler_sch_cb(void *_s, void *_arg) +{ + struct dn_schk *s = _s; + int *arg = _arg; + + if (s->sch.flags & DN_HAVE_MASK) { + dn_ht_scan_bucket(s->siht, &s->drain_bucket, + drain_scheduler_cb, _arg); + } else { + if (s->siht) { + if (drain_scheduler_cb(s->siht, _arg) == DNHT_SCAN_DEL) + s->siht = NULL; + } + } + return ( (*arg++) > dn_cfg.expire_object_examined) ? 
DNHT_SCAN_END : 0; +} + +/* Called every tick, try to delete a 'bucket' of scheduler */ +void +dn_drain_scheduler(void) +{ + int arg = 0; + + dn_ht_scan_bucket(dn_cfg.schedhash, &dn_cfg.drain_sch, + drain_scheduler_sch_cb, &arg); +} + +/* Callback called on queue to delete if it is idle */ +static int +drain_queue_cb(void *_q, void *_arg) +{ + struct dn_queue *q = _q; + int *arg = _arg; + + if ( (*arg++) > dn_cfg.expire_object_examined) + return DNHT_SCAN_END; + + if (q->ni.length == 0) { + if (q->q_time < dn_cfg.curr_time - dn_cfg.object_idle_tick) { + if (dn_delete_queue(q, DN_DESTROY | DN_DEL_SAFE) == 0) + return DNHT_SCAN_DEL; /* queue is deleted */ + } else + dn_cfg.idle_queue_wait++; + } + + return 0; /* queue isn't deleted */ +} + +/* Callback called on flowset used to check if it has queues */ +static int +drain_queue_fs_cb(void *_fs, void *_arg) +{ + struct dn_fsk *fs = _fs; + int *arg = _arg; + + if (fs->fs.flags & DN_QHT_HASH) { + /* Flowset has a hash table for queues */ + dn_ht_scan_bucket(fs->qht, &fs->drain_bucket, + drain_queue_cb, _arg); + } else { + /* No hash table for this flowset, null the pointer + * if the queue is deleted + */ + if (fs->qht) { + if (drain_queue_cb(fs->qht, _arg) == DNHT_SCAN_DEL) + fs->qht = NULL; + } + } + return ( (*arg++) > dn_cfg.expire_object_examined) ? DNHT_SCAN_END : 0; +} + +/* Called every tick, try to delete a 'bucket' of queue */ +void +dn_drain_queue(void) +{ + int arg = 0; + + /* scan a bucket of flowset */ + dn_ht_scan_bucket(dn_cfg.fshash, &dn_cfg.drain_fs, + drain_queue_fs_cb, &arg); +} + +/* + * Handler for the various dummynet socket options + */ +static int +ip_dn_ctl(struct sockopt *sopt) +{ + void *p = NULL; + int error, l; + + error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET); + if (error) + return (error); + + /* Disallow sets in really-really secure mode. 
*/ + if (sopt->sopt_dir == SOPT_SET) { + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); + if (error) + return (error); + } + + switch (sopt->sopt_name) { + default : + D("dummynet: unknown option %d", sopt->sopt_name); + error = EINVAL; + break; + + case IP_DUMMYNET_FLUSH: + case IP_DUMMYNET_CONFIGURE: + case IP_DUMMYNET_DEL: /* remove a pipe or queue */ + case IP_DUMMYNET_GET: + D("dummynet: compat option %d", sopt->sopt_name); + error = ip_dummynet_compat(sopt); + break; + + case IP_DUMMYNET3 : + if (sopt->sopt_dir == SOPT_GET) { + error = dummynet_get(sopt, NULL); + break; + } + l = sopt->sopt_valsize; + if (l < sizeof(struct dn_id) || l > 12000) { + D("argument len %d invalid", l); + break; + } + p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ? + error = sooptcopyin(sopt, p, l, l); + if (error) + break ; + error = do_config(p, l); + break; + } + + if (p != NULL) + free(p, M_TEMP); + + return error ; +} + + +static void +ip_dn_init(void) +{ + if (dn_cfg.init_done) + return; + printf("DUMMYNET %p with IPv6 initialized (100409)\n", curvnet); + dn_cfg.init_done = 1; + /* Set defaults here. MSVC does not accept initializers, + * and this is also useful for vimages + */ + /* queue limits */ + dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */ + dn_cfg.byte_limit = 1024 * 1024; + dn_cfg.expire = 1; + + /* RED parameters */ + dn_cfg.red_lookup_depth = 256; /* default lookup table depth */ + dn_cfg.red_avg_pkt_size = 512; /* default medium packet size */ + dn_cfg.red_max_pkt_size = 1500; /* default max packet size */ + + /* hash tables */ + dn_cfg.max_hash_size = 1024; /* max in the hash tables */ + + if (dn_cfg.hash_size == 0) /* XXX or <= 0 ? */ + dn_cfg.hash_size = 64; /* default hash size */ + + /* hash tables for schedulers and flowsets are created + * when the first scheduler/flowset is inserted. + * This is done to allow to use the right hash_size value. 
+ * When the last object is deleted, the table is destroyed, + * so a new hash_size value can be used. + * XXX rehash is not supported for now + */ + dn_cfg.schedhash = NULL; + dn_cfg.fshash = NULL; + /* bucket index to drain object */ + dn_cfg.drain_fs = 0; + dn_cfg.drain_sch = 0; + + if (dn_cfg.expire_object == 0) + dn_cfg.expire_object = 50; + if (dn_cfg.object_idle_tick == 0) + dn_cfg.object_idle_tick = 1000; + if (dn_cfg.expire_object_examined == 0) + dn_cfg.expire_object_examined = 10; + if (dn_cfg.drain_ratio == 0) + dn_cfg.drain_ratio = 1; + + // XXX what if we don't have a tsc ? +#ifdef HAVE_TSC + dn_cfg.cycle_task_new = dn_cfg.cycle_task_old = readTSC(); +#endif + heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); + SLIST_INIT(&dn_cfg.fsu); + SLIST_INIT(&dn_cfg.schedlist); + + DN_LOCK_INIT(); + + TASK_INIT(&dn_task, 0, dummynet_task, curvnet); + dn_tq = taskqueue_create_fast("dummynet", M_NOWAIT, + taskqueue_thread_enqueue, &dn_tq); + taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet"); + + callout_init(&dn_timeout, CALLOUT_MPSAFE); + callout_reset_on(&dn_timeout, 1, dummynet, NULL, 0); + + /* Initialize curr_time adjustment mechanics. 
*/ + getmicrouptime(&dn_cfg.prev_t); +} + +#ifdef KLD_MODULE +static void +ip_dn_destroy(int last) +{ + callout_drain(&dn_timeout); + + DN_BH_WLOCK(); + if (last) { + printf("%s removing last instance\n", __FUNCTION__); + ip_dn_ctl_ptr = NULL; + ip_dn_io_ptr = NULL; + } + + dummynet_flush(); + DN_BH_WUNLOCK(); + taskqueue_drain(dn_tq, &dn_task); + taskqueue_free(dn_tq); + + dn_ht_free(dn_cfg.schedhash, 0); + dn_ht_free(dn_cfg.fshash, 0); + heap_free(&dn_cfg.evheap); + + DN_LOCK_DESTROY(); +} +#endif /* KLD_MODULE */ + +static int +dummynet_modevent(module_t mod, int type, void *data) +{ + + if (type == MOD_LOAD) { + if (ip_dn_io_ptr) { + printf("DUMMYNET already loaded\n"); + return EEXIST ; + } + ip_dn_init(); + ip_dn_ctl_ptr = ip_dn_ctl; + ip_dn_io_ptr = dummynet_io; + return 0; + } else if (type == MOD_UNLOAD) { +#if !defined(KLD_MODULE) + printf("dummynet statically compiled, cannot unload\n"); + return EINVAL ; +#else + ip_dn_destroy(1 /* last */); + return 0; +#endif + } else + return EOPNOTSUPP; +} + +/* modevent helpers for the modules */ +static int +load_dn_sched(struct dn_alg *d) +{ + struct dn_alg *s; + + if (d == NULL) + return 1; /* error */ + ip_dn_init(); /* just in case, we need the lock */ + + /* Check that mandatory funcs exists */ + if (d->enqueue == NULL || d->dequeue == NULL) { + D("missing enqueue or dequeue for %s", d->name); + return 1; + } + + /* Search if scheduler already exists */ + DN_BH_WLOCK(); + SLIST_FOREACH(s, &dn_cfg.schedlist, next) { + if (strcmp(s->name, d->name) == 0) { + D("%s already loaded", d->name); + break; /* scheduler already exists */ + } + } + if (s == NULL) + SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next); + DN_BH_WUNLOCK(); + D("dn_sched %s %sloaded", d->name, s ? "not ":""); + return s ? 
1 : 0; +} + +static int +unload_dn_sched(struct dn_alg *s) +{ + struct dn_alg *tmp, *r; + int err = EINVAL; + + D("called for %s", s->name); + + DN_BH_WLOCK(); + SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) { + if (strcmp(s->name, r->name) != 0) + continue; + D("ref_count = %d", r->ref_count); + err = (r->ref_count != 0) ? EBUSY : 0; + if (err == 0) + SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next); + break; + } + DN_BH_WUNLOCK(); + D("dn_sched %s %sunloaded", s->name, err ? "not ":""); + return err; +} + +int +dn_sched_modevent(module_t mod, int cmd, void *arg) +{ + struct dn_alg *sch = arg; + + if (cmd == MOD_LOAD) + return load_dn_sched(sch); + else if (cmd == MOD_UNLOAD) + return unload_dn_sched(sch); + else + return EINVAL; +} + +static moduledata_t dummynet_mod = { + "dummynet", dummynet_modevent, NULL +}; + +#define DN_SI_SUB SI_SUB_PROTO_IFATTACHDOMAIN +#define DN_MODEV_ORD (SI_ORDER_ANY - 128) /* after ipfw */ +DECLARE_MODULE(dummynet, dummynet_mod, DN_SI_SUB, DN_MODEV_ORD); +MODULE_DEPEND(dummynet, ipfw, 2, 2, 2); +MODULE_VERSION(dummynet, 1); + +/* + * Starting up. Done in order after dummynet_modevent() has been called. + * VNET_SYSINIT is also called for each existing vnet and each new vnet. + */ +//VNET_SYSINIT(vnet_dn_init, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_init, NULL); + +/* + * Shutdown handlers up shop. These are done in REVERSE ORDER, but still + * after dummynet_modevent() has been called. Not called on reboot. + * VNET_SYSUNINIT is also called for each exiting vnet as it exits. + * or when the module is unloaded. 
+ */ +//VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL); + +/* end of file */ diff --git a/dummynet2/ip_fw2.c b/dummynet2/ip_fw2.c new file mode 100644 index 0000000..c55bc0f --- /dev/null +++ b/dummynet2/ip_fw2.c @@ -0,0 +1,2493 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw2.c 200601 2009-12-16 10:48:40Z luigi $"); + +/* + * The FreeBSD IP packet firewall, main file + */ + +#if !defined(KLD_MODULE) +#include "opt_ipfw.h" +#include "opt_ipdivert.h" +#include "opt_ipdn.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. 
+#endif /* INET */ +#endif +#include "opt_inet6.h" +#include "opt_ipsec.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ETHERTYPE_IP */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#ifdef INET6 +#include +#include +#endif + +#include /* XXX for in_cksum */ + +#ifdef MAC +#include +#endif + +/* + * static variables followed by global ones. + * All ipfw global variables are here. + */ + +/* ipfw_vnet_ready controls when we are open for business */ +static VNET_DEFINE(int, ipfw_vnet_ready) = 0; +#define V_ipfw_vnet_ready VNET(ipfw_vnet_ready) + +static VNET_DEFINE(int, fw_deny_unknown_exthdrs); +#define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) + +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT +static int default_to_accept = 1; +#else +static int default_to_accept; +#endif + +VNET_DEFINE(int, autoinc_step); + +/* + * Each rule belongs to one of 32 different sets (0..31). + * The variable set_disable contains one bit per set. + * If the bit is set, all rules in the corresponding set + * are disabled. Set RESVD_SET(31) is reserved for the default rule + * and rules that are not deleted by the flush command, + * and CANNOT be disabled. + * Rules in set RESVD_SET can only be deleted individually. + */ +VNET_DEFINE(u_int32_t, set_disable); +#define V_set_disable VNET(set_disable) + +VNET_DEFINE(int, fw_verbose); +/* counter for ipfw_log(NULL...) 
*/ +VNET_DEFINE(u_int64_t, norule_counter); +VNET_DEFINE(int, verbose_limit); + +/* layer3_chain contains the list of rules for layer 3 */ +VNET_DEFINE(struct ip_fw_chain, layer3_chain); + +ipfw_nat_t *ipfw_nat_ptr = NULL; +struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); +ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; +ipfw_nat_cfg_t *ipfw_nat_del_ptr; +ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; +ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; + +#ifdef SYSCTL_NODE +uint32_t dummy_def = IPFW_DEFAULT_RULE; +uint32_t dummy_tables_max = IPFW_TABLES_MAX; + +SYSBEGIN(f3) + +SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass, + CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, + "Only do a single pass through ipfw when using dummynet(4)"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, + CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, + "Rule number auto-increment step"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose, + CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, + "Log matches to ipfw rules"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, + CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, + "Set upper limit of matches of ipfw rules logged"); +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD, + &dummy_def, 0, + "The default/max possible rule number."); +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD, + &dummy_tables_max, 0, + "The maximum number of tables."); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, + &default_to_accept, 0, + "Make the default rule accept all packets."); +TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count, + CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, + "Number of static rules"); + +#ifdef INET6 +SYSCTL_DECL(_net_inet6_ip6); +SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); 
+SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, + CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0, + "Deny packets with unknown IPv6 Extension Headers"); +#endif /* INET6 */ + +SYSEND + +#endif /* SYSCTL_NODE */ + + +/* + * Some macros used in the various matching options. + * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T + * Other macros just cast void * into the appropriate type + */ +#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) +#define TCP(p) ((struct tcphdr *)(p)) +#define SCTP(p) ((struct sctphdr *)(p)) +#define UDP(p) ((struct udphdr *)(p)) +#define ICMP(p) ((struct icmphdr *)(p)) +#define ICMP6(p) ((struct icmp6_hdr *)(p)) + +static __inline int +icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) +{ + int type = icmp->icmp_type; + + return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<icmp_type; + + return (type <= ICMP_MAXTYPE && (TT & (1<arg1 or cmd->d[0]. + * + * We scan options and store the bits we find set. We succeed if + * + * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear + * + * The code is sometimes optimized not to store additional variables. 
+ */ + +static int +flags_match(ipfw_insn *cmd, u_int8_t bits) +{ + u_char want_clear; + bits = ~bits; + + if ( ((cmd->arg1 & 0xff) & bits) != 0) + return 0; /* some bits we want set were clear */ + want_clear = (cmd->arg1 >> 8) & 0xff; + if ( (want_clear & bits) != want_clear) + return 0; /* some bits we want clear were set */ + return 1; +} + +static int +ipopts_match(struct ip *ip, ipfw_insn *cmd) +{ + int optlen, bits = 0; + u_char *cp = (u_char *)(ip + 1); + int x = (ip->ip_hl << 2) - sizeof (struct ip); + + for (; x > 0; x -= optlen, cp += optlen) { + int opt = cp[IPOPT_OPTVAL]; + + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + optlen = cp[IPOPT_OLEN]; + if (optlen <= 0 || optlen > x) + return 0; /* invalid or truncated */ + } + switch (opt) { + + default: + break; + + case IPOPT_LSRR: + bits |= IP_FW_IPOPT_LSRR; + break; + + case IPOPT_SSRR: + bits |= IP_FW_IPOPT_SSRR; + break; + + case IPOPT_RR: + bits |= IP_FW_IPOPT_RR; + break; + + case IPOPT_TS: + bits |= IP_FW_IPOPT_TS; + break; + } + } + return (flags_match(cmd, bits)); +} + +static int +tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) +{ + int optlen, bits = 0; + u_char *cp = (u_char *)(tcp + 1); + int x = (tcp->th_off << 2) - sizeof(struct tcphdr); + + for (; x > 0; x -= optlen, cp += optlen) { + int opt = cp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + optlen = cp[1]; + if (optlen <= 0) + break; + } + + switch (opt) { + + default: + break; + + case TCPOPT_MAXSEG: + bits |= IP_FW_TCPOPT_MSS; + break; + + case TCPOPT_WINDOW: + bits |= IP_FW_TCPOPT_WINDOW; + break; + + case TCPOPT_SACK_PERMITTED: + case TCPOPT_SACK: + bits |= IP_FW_TCPOPT_SACK; + break; + + case TCPOPT_TIMESTAMP: + bits |= IP_FW_TCPOPT_TS; + break; + + } + } + return (flags_match(cmd, bits)); +} + +static int +iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) +{ + if (ifp == NULL) /* no iface with this packet, match fails */ + return 0; + /* Check by name or 
by IP address */ + if (cmd->name[0] != '\0') { /* match by name */ + /* Check name */ + if (cmd->p.glob) { + if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) + return(1); + } else { + if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) + return(1); + } + } else { +#ifdef __FreeBSD__ /* and OSX too ? */ + struct ifaddr *ia; + + if_addr_rlock(ifp); + TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { + if (ia->ifa_addr->sa_family != AF_INET) + continue; + if (cmd->p.ip.s_addr == ((struct sockaddr_in *) + (ia->ifa_addr))->sin_addr.s_addr) { + if_addr_runlock(ifp); + return(1); /* match */ + } + } + if_addr_runlock(ifp); +#endif /* __FreeBSD__ */ + } + return(0); /* no match, fail ... */ +} + +/* + * The verify_path function checks if a route to the src exists and + * if it is reachable via ifp (when provided). + * + * The 'verrevpath' option checks that the interface that an IP packet + * arrives on is the same interface that traffic destined for the + * packet's source address would be routed out of. + * The 'versrcreach' option just checks that the source address is + * reachable via any route (except default) in the routing table. + * These two are a measure to block forged packets. This is also + * commonly known as "anti-spoofing" or Unicast Reverse Path + * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs + * is purposely reminiscent of the Cisco IOS command, + * + * ip verify unicast reverse-path + * ip verify unicast source reachable-via any + * + * which implements the same functionality. But note that the syntax + * is misleading, and the check may be performed on all IP packets + * whether unicast, multicast, or broadcast. 
+ */ +static int +verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) +{ +#ifndef __FreeBSD__ + return 0; +#else + struct route ro; + struct sockaddr_in *dst; + + bzero(&ro, sizeof(ro)); + + dst = (struct sockaddr_in *)&(ro.ro_dst); + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = src; + in_rtalloc_ign(&ro, 0, fib); + + if (ro.ro_rt == NULL) + return 0; + + /* + * If ifp is provided, check for equality with rtentry. + * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, + * in order to pass packets injected back by if_simloop(): + * if useloopback == 1 routing entry (via lo0) for our own address + * may exist, so we need to handle routing assymetry. + */ + if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { + RTFREE(ro.ro_rt); + return 0; + } + + /* if no ifp provided, check if rtentry is not default route */ + if (ifp == NULL && + satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) { + RTFREE(ro.ro_rt); + return 0; + } + + /* or if this is a blackhole/reject route */ + if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + RTFREE(ro.ro_rt); + return 0; + } + + /* found valid route */ + RTFREE(ro.ro_rt); + return 1; +#endif /* __FreeBSD__ */ +} + +#ifdef INET6 +/* + * ipv6 specific rules here... 
+ */ +static __inline int +icmp6type_match (int type, ipfw_insn_u32 *cmd) +{ + return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); +} + +static int +flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) +{ + int i; + for (i=0; i <= cmd->o.arg1; ++i ) + if (curr_flow == cmd->d[i] ) + return 1; + return 0; +} + +/* support for IP6_*_ME opcodes */ +static int +search_ip6_addr_net (struct in6_addr * ip6_addr) +{ + struct ifnet *mdc; + struct ifaddr *mdc2; + struct in6_ifaddr *fdm; + struct in6_addr copia; + + TAILQ_FOREACH(mdc, &V_ifnet, if_link) { + if_addr_rlock(mdc); + TAILQ_FOREACH(mdc2, &mdc->if_addrhead, ifa_link) { + if (mdc2->ifa_addr->sa_family == AF_INET6) { + fdm = (struct in6_ifaddr *)mdc2; + copia = fdm->ia_addr.sin6_addr; + /* need for leaving scope_id in the sock_addr */ + in6_clearscope(&copia); + if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) { + if_addr_runlock(mdc); + return 1; + } + } + } + if_addr_runlock(mdc); + } + return 0; +} + +static int +verify_path6(struct in6_addr *src, struct ifnet *ifp) +{ + struct route_in6 ro; + struct sockaddr_in6 *dst; + + bzero(&ro, sizeof(ro)); + + dst = (struct sockaddr_in6 * )&(ro.ro_dst); + dst->sin6_family = AF_INET6; + dst->sin6_len = sizeof(*dst); + dst->sin6_addr = *src; + /* XXX MRT 0 for ipv6 at this time */ + rtalloc_ign((struct route *)&ro, 0); + + if (ro.ro_rt == NULL) + return 0; + + /* + * if ifp is provided, check for equality with rtentry + * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, + * to support the case of sending packets to an address of our own. 
+ * (where the former interface is the first argument of if_simloop() + * (=ifp), the latter is lo0) + */ + if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { + RTFREE(ro.ro_rt); + return 0; + } + + /* if no ifp provided, check if rtentry is not default route */ + if (ifp == NULL && + IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) { + RTFREE(ro.ro_rt); + return 0; + } + + /* or if this is a blackhole/reject route */ + if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + RTFREE(ro.ro_rt); + return 0; + } + + /* found valid route */ + RTFREE(ro.ro_rt); + return 1; + +} + +static int +is_icmp6_query(int icmp6_type) +{ + if ((icmp6_type <= ICMP6_MAXTYPE) && + (icmp6_type == ICMP6_ECHO_REQUEST || + icmp6_type == ICMP6_MEMBERSHIP_QUERY || + icmp6_type == ICMP6_WRUREQUEST || + icmp6_type == ICMP6_FQDN_QUERY || + icmp6_type == ICMP6_NI_QUERY)) + return (1); + + return (0); +} + +static void +send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6) +{ + struct mbuf *m; + + m = args->m; + if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) { + struct tcphdr *tcp; + tcp = (struct tcphdr *)((char *)ip6 + hlen); + + if ((tcp->th_flags & TH_RST) == 0) { + struct mbuf *m0; + m0 = ipfw_send_pkt(args->m, &(args->f_id), + ntohl(tcp->th_seq), ntohl(tcp->th_ack), + tcp->th_flags | TH_RST); + if (m0 != NULL) + ip6_output(m0, NULL, NULL, 0, NULL, NULL, + NULL); + } + FREE_PKT(m); + } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */ +#if 0 + /* + * Unlike above, the mbufs need to line up with the ip6 hdr, + * as the contents are read. We need to m_adj() the + * needed amount. + * The mbuf will however be thrown away so we can adjust it. + * Remember we did an m_pullup on it already so we + * can make some assumptions about contiguousness. 
+ */ + if (args->L3offset) + m_adj(m, args->L3offset); +#endif + icmp6_error(m, ICMP6_DST_UNREACH, code, 0); + } else + FREE_PKT(m); + + args->m = NULL; +} + +#endif /* INET6 */ + + +/* + * sends a reject message, consuming the mbuf passed as an argument. + */ +static void +send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) +{ + +#if 0 + /* XXX When ip is not guaranteed to be at mtod() we will + * need to account for this */ + * The mbuf will however be thrown away so we can adjust it. + * Remember we did an m_pullup on it already so we + * can make some assumptions about contiguousness. + */ + if (args->L3offset) + m_adj(m, args->L3offset); +#endif + if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */ + /* We need the IP header in host order for icmp_error(). */ + SET_HOST_IPLEN(ip); + icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); + } else if (args->f_id.proto == IPPROTO_TCP) { + struct tcphdr *const tcp = + L3HDR(struct tcphdr, mtod(args->m, struct ip *)); + if ( (tcp->th_flags & TH_RST) == 0) { + struct mbuf *m; + m = ipfw_send_pkt(args->m, &(args->f_id), + ntohl(tcp->th_seq), ntohl(tcp->th_ack), + tcp->th_flags | TH_RST); + if (m != NULL) + ip_output(m, NULL, NULL, 0, NULL, NULL); + } + FREE_PKT(args->m); + } else + FREE_PKT(args->m); + args->m = NULL; +} + +/* + * Support for uid/gid/jail lookup. These tests are expensive + * (because we may need to look into the list of active sockets) + * so we cache the results. ugid_lookupp is 0 if we have not + * yet done a lookup, 1 if we succeeded, and -1 if we tried + * and failed. The function always returns the match value. + * We could actually spare the variable and use *uc, setting + * it to '(void *)check_uidgid if we have no info, NULL if + * we tried and failed, or any other value if successful. 
+ */ +static int +check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, + struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, + u_int16_t src_port, int *ugid_lookupp, + struct ucred **uc, struct inpcb *inp) +{ +#ifndef __FreeBSD__ + return cred_check(insn, proto, oif, + dst_ip, dst_port, src_ip, src_port, + (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); +#else /* FreeBSD */ + struct inpcbinfo *pi; + int wildcard; + struct inpcb *pcb; + int match; + + /* + * Check to see if the UDP or TCP stack supplied us with + * the PCB. If so, rather then holding a lock and looking + * up the PCB, we can use the one that was supplied. + */ + if (inp && *ugid_lookupp == 0) { + INP_LOCK_ASSERT(inp); + if (inp->inp_socket != NULL) { + *uc = crhold(inp->inp_cred); + *ugid_lookupp = 1; + } else + *ugid_lookupp = -1; + } + /* + * If we have already been here and the packet has no + * PCB entry associated with it, then we can safely + * assume that this is a no match. + */ + if (*ugid_lookupp == -1) + return (0); + if (proto == IPPROTO_TCP) { + wildcard = 0; + pi = &V_tcbinfo; + } else if (proto == IPPROTO_UDP) { + wildcard = INPLOOKUP_WILDCARD; + pi = &V_udbinfo; + } else + return 0; + match = 0; + if (*ugid_lookupp == 0) { + INP_INFO_RLOCK(pi); + pcb = (oif) ? + in_pcblookup_hash(pi, + dst_ip, htons(dst_port), + src_ip, htons(src_port), + wildcard, oif) : + in_pcblookup_hash(pi, + src_ip, htons(src_port), + dst_ip, htons(dst_port), + wildcard, NULL); + if (pcb != NULL) { + *uc = crhold(pcb->inp_cred); + *ugid_lookupp = 1; + } + INP_INFO_RUNLOCK(pi); + if (*ugid_lookupp == 0) { + /* + * We tried and failed, set the variable to -1 + * so we will not try again on this packet. 
+ */ + *ugid_lookupp = -1; + return (0); + } + } + if (insn->o.opcode == O_UID) + match = ((*uc)->cr_uid == (uid_t)insn->d[0]); + else if (insn->o.opcode == O_GID) + match = groupmember((gid_t)insn->d[0], *uc); + else if (insn->o.opcode == O_JAIL) + match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); + return match; +#endif /* __FreeBSD__ */ +} + +/* + * Helper function to set args with info on the rule after the matching + * one. slot is precise, whereas we guess rule_id as they are + * assigned sequentially. + */ +static inline void +set_match(struct ip_fw_args *args, int slot, + struct ip_fw_chain *chain) +{ + args->rule.chain_id = chain->id; + args->rule.slot = slot + 1; /* we use 0 as a marker */ + args->rule.rule_id = 1 + chain->map[slot]->id; + args->rule.rulenum = chain->map[slot]->rulenum; +} + +/* + * The main check routine for the firewall. + * + * All arguments are in args so we can modify them and return them + * back to the caller. + * + * Parameters: + * + * args->m (in/out) The packet; we set to NULL when/if we nuke it. + * Starts with the IP header. + * args->eh (in) Mac header if present, NULL for layer3 packet. + * args->L3offset Number of bytes bypassed if we came from L2. + * e.g. often sizeof(eh) ** NOTYET ** + * args->oif Outgoing interface, NULL if packet is incoming. + * The incoming interface is in the mbuf. (in) + * args->divert_rule (in/out) + * Skip up to the first rule past this rule number; + * upon return, non-zero port number for divert or tee. + * + * args->rule Pointer to the last matching rule (in/out) + * args->next_hop Socket we are forwarding to (out). 
+ * args->f_id Addresses grabbed from the packet (out) + * args->rule.info a cookie depending on rule action + * + * Return value: + * + * IP_FW_PASS the packet must be accepted + * IP_FW_DENY the packet must be dropped + * IP_FW_DIVERT divert packet, port in m_tag + * IP_FW_TEE tee packet, port in m_tag + * IP_FW_DUMMYNET to dummynet, pipe in args->cookie + * IP_FW_NETGRAPH into netgraph, cookie args->cookie + * args->rule contains the matching rule, + * args->rule.info has additional information. + * + */ +int +ipfw_chk(struct ip_fw_args *args) +{ + + /* + * Local variables holding state while processing a packet: + * + * IMPORTANT NOTE: to speed up the processing of rules, there + * are some assumption on the values of the variables, which + * are documented here. Should you change them, please check + * the implementation of the various instructions to make sure + * that they still work. + * + * args->eh The MAC header. It is non-null for a layer2 + * packet, it is NULL for a layer-3 packet. + * **notyet** + * args->L3offset Offset in the packet to the L3 (IP or equiv.) header. + * + * m | args->m Pointer to the mbuf, as received from the caller. + * It may change if ipfw_chk() does an m_pullup, or if it + * consumes the packet because it calls send_reject(). + * XXX This has to change, so that ipfw_chk() never modifies + * or consumes the buffer. + * ip is the beginning of the ip(4 or 6) header. + * Calculated by adding the L3offset to the start of data. + * (Until we start using L3offset, the packet is + * supposed to start with the ip header). + */ + struct mbuf *m = args->m; + struct ip *ip = mtod(m, struct ip *); + + /* + * For rules which contain uid/gid or jail constraints, cache + * a copy of the users credentials after the pcb lookup has been + * executed. This will speed up the processing of rules with + * these types of constraints, as well as decrease contention + * on pcb related locks. 
+ */ +#ifndef __FreeBSD__ + struct bsd_ucred ucred_cache; +#else + struct ucred *ucred_cache = NULL; +#endif + int ucred_lookup = 0; + + /* + * oif | args->oif If NULL, ipfw_chk has been called on the + * inbound path (ether_input, ip_input). + * If non-NULL, ipfw_chk has been called on the outbound path + * (ether_output, ip_output). + */ + struct ifnet *oif = args->oif; + + int f_pos = 0; /* index of current rule in the array */ + int retval = 0; + + /* + * hlen The length of the IP header. + */ + u_int hlen = 0; /* hlen >0 means we have an IP pkt */ + + /* + * offset The offset of a fragment. offset != 0 means that + * we have a fragment at this offset of an IPv4 packet. + * offset == 0 means that (if this is an IPv4 packet) + * this is the first or only fragment. + * For IPv6 offset == 0 means there is no Fragment Header. + * If offset != 0 for IPv6 always use correct mask to + * get the correct offset because we add IP6F_MORE_FRAG + * to be able to dectect the first fragment which would + * otherwise have offset = 0. + */ + u_short offset = 0; + + /* + * Local copies of addresses. They are only valid if we have + * an IP packet. + * + * proto The protocol. Set to 0 for non-ip packets, + * or to the protocol read from the packet otherwise. + * proto != 0 means that we have an IPv4 packet. + * + * src_port, dst_port port numbers, in HOST format. Only + * valid for TCP and UDP packets. + * + * src_ip, dst_ip ip addresses, in NETWORK format. + * Only valid for IPv4 packets. 
+ */ + uint8_t proto; + uint16_t src_port = 0, dst_port = 0; /* NOTE: host format */ + struct in_addr src_ip, dst_ip; /* NOTE: network format */ + uint16_t iplen=0; + int pktlen; + uint16_t etype = 0; /* Host order stored ether type */ + + /* + * dyn_dir = MATCH_UNKNOWN when rules unchecked, + * MATCH_NONE when checked and not matched (q = NULL), + * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) + */ + int dyn_dir = MATCH_UNKNOWN; + ipfw_dyn_rule *q = NULL; + struct ip_fw_chain *chain = &V_layer3_chain; + + /* + * We store in ulp a pointer to the upper layer protocol header. + * In the ipv4 case this is easy to determine from the header, + * but for ipv6 we might have some additional headers in the middle. + * ulp is NULL if not found. + */ + void *ulp = NULL; /* upper layer protocol pointer. */ + + /* XXX ipv6 variables */ + int is_ipv6 = 0; + uint8_t icmp6_type = 0; + uint16_t ext_hd = 0; /* bits vector for extension header filtering */ + /* end of ipv6 variables */ + + int is_ipv4 = 0; + + int done = 0; /* flag to exit the outer loop */ + + if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready)) + return (IP_FW_PASS); /* accept */ + + dst_ip.s_addr = 0; /* make sure it is initialized */ + src_ip.s_addr = 0; /* make sure it is initialized */ + pktlen = m->m_pkthdr.len; + args->f_id.fib = M_GETFIB(m); /* note mbuf not altered) */ + proto = args->f_id.proto = 0; /* mark f_id invalid */ + /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */ + +/* + * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, + * then it sets p to point at the offset "len" in the mbuf. WARNING: the + * pointer might become stale after other pullups (but we never use it + * this way). 
+ */ +#define PULLUP_TO(_len, p, T) \ +do { \ + int x = (_len) + sizeof(T); \ + if ((m)->m_len < x) { \ + args->m = m = m_pullup(m, x); \ + if (m == NULL) \ + goto pullup_failed; \ + } \ + p = (mtod(m, char *) + (_len)); \ +} while (0) + + /* + * if we have an ether header, + */ + if (args->eh) + etype = ntohs(args->eh->ether_type); + + /* Identify IP packets and fill up variables. */ + if (pktlen >= sizeof(struct ip6_hdr) && + (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; + is_ipv6 = 1; + args->f_id.addr_type = 6; + hlen = sizeof(struct ip6_hdr); + proto = ip6->ip6_nxt; + + /* Search extension headers to find upper layer protocols */ + while (ulp == NULL) { + switch (proto) { + case IPPROTO_ICMPV6: + PULLUP_TO(hlen, ulp, struct icmp6_hdr); + icmp6_type = ICMP6(ulp)->icmp6_type; + break; + + case IPPROTO_TCP: + PULLUP_TO(hlen, ulp, struct tcphdr); + dst_port = TCP(ulp)->th_dport; + src_port = TCP(ulp)->th_sport; + /* save flags for dynamic rules */ + args->f_id._flags = TCP(ulp)->th_flags; + break; + + case IPPROTO_SCTP: + PULLUP_TO(hlen, ulp, struct sctphdr); + src_port = SCTP(ulp)->src_port; + dst_port = SCTP(ulp)->dest_port; + break; + + case IPPROTO_UDP: + PULLUP_TO(hlen, ulp, struct udphdr); + dst_port = UDP(ulp)->uh_dport; + src_port = UDP(ulp)->uh_sport; + break; + + case IPPROTO_HOPOPTS: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_hbh); + ext_hd |= EXT_HOPOPTS; + hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; + proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; + ulp = NULL; + break; + + case IPPROTO_ROUTING: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_rthdr); + switch (((struct ip6_rthdr *)ulp)->ip6r_type) { + case 0: + ext_hd |= EXT_RTHDR0; + break; + case 2: + ext_hd |= EXT_RTHDR2; + break; + default: + printf("IPFW2: IPV6 - Unknown Routing " + "Header type(%d)\n", + ((struct ip6_rthdr *)ulp)->ip6r_type); + if (V_fw_deny_unknown_exthdrs) + return (IP_FW_DENY); + break; + } + 
ext_hd |= EXT_ROUTING; + hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; + proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; + ulp = NULL; + break; + + case IPPROTO_FRAGMENT: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_frag); + ext_hd |= EXT_FRAGMENT; + hlen += sizeof (struct ip6_frag); + proto = ((struct ip6_frag *)ulp)->ip6f_nxt; + offset = ((struct ip6_frag *)ulp)->ip6f_offlg & + IP6F_OFF_MASK; + /* Add IP6F_MORE_FRAG for offset of first + * fragment to be != 0. */ + offset |= ((struct ip6_frag *)ulp)->ip6f_offlg & + IP6F_MORE_FRAG; + if (offset == 0) { + printf("IPFW2: IPV6 - Invalid Fragment " + "Header\n"); + if (V_fw_deny_unknown_exthdrs) + return (IP_FW_DENY); + break; + } + args->f_id.extra = + ntohl(((struct ip6_frag *)ulp)->ip6f_ident); + ulp = NULL; + break; + + case IPPROTO_DSTOPTS: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_hbh); + ext_hd |= EXT_DSTOPTS; + hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; + proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; + ulp = NULL; + break; + + case IPPROTO_AH: /* RFC 2402 */ + PULLUP_TO(hlen, ulp, struct ip6_ext); + ext_hd |= EXT_AH; + hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; + proto = ((struct ip6_ext *)ulp)->ip6e_nxt; + ulp = NULL; + break; + + case IPPROTO_ESP: /* RFC 2406 */ + PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ + /* Anything past Seq# is variable length and + * data past this ext. header is encrypted. */ + ext_hd |= EXT_ESP; + break; + + case IPPROTO_NONE: /* RFC 2460 */ + /* + * Packet ends here, and IPv6 header has + * already been pulled up. If ip6e_len!=0 + * then octets must be ignored. + */ + ulp = ip; /* non-NULL to get out of loop. */ + break; + + case IPPROTO_OSPFIGP: + /* XXX OSPF header check? */ + PULLUP_TO(hlen, ulp, struct ip6_ext); + break; + + case IPPROTO_PIM: + /* XXX PIM header check? 
*/ + PULLUP_TO(hlen, ulp, struct pim); + break; + + case IPPROTO_CARP: + PULLUP_TO(hlen, ulp, struct carp_header); + if (((struct carp_header *)ulp)->carp_version != + CARP_VERSION) + return (IP_FW_DENY); + if (((struct carp_header *)ulp)->carp_type != + CARP_ADVERTISEMENT) + return (IP_FW_DENY); + break; + + case IPPROTO_IPV6: /* RFC 2893 */ + PULLUP_TO(hlen, ulp, struct ip6_hdr); + break; + + case IPPROTO_IPV4: /* RFC 2893 */ + PULLUP_TO(hlen, ulp, struct ip); + break; + + default: + printf("IPFW2: IPV6 - Unknown Extension " + "Header(%d), ext_hd=%x\n", proto, ext_hd); + if (V_fw_deny_unknown_exthdrs) + return (IP_FW_DENY); + PULLUP_TO(hlen, ulp, struct ip6_ext); + break; + } /*switch */ + } + ip = mtod(m, struct ip *); + ip6 = (struct ip6_hdr *)ip; + args->f_id.src_ip6 = ip6->ip6_src; + args->f_id.dst_ip6 = ip6->ip6_dst; + args->f_id.src_ip = 0; + args->f_id.dst_ip = 0; + args->f_id.flow_id6 = ntohl(ip6->ip6_flow); + } else if (pktlen >= sizeof(struct ip) && + (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) { + is_ipv4 = 1; + hlen = ip->ip_hl << 2; + args->f_id.addr_type = 4; + + /* + * Collect parameters into local variables for faster matching. + */ + proto = ip->ip_p; + src_ip = ip->ip_src; + dst_ip = ip->ip_dst; + offset = ntohs(ip->ip_off) & IP_OFFMASK; + iplen = ntohs(ip->ip_len); + pktlen = iplen < pktlen ? 
iplen : pktlen; + + if (offset == 0) { + switch (proto) { + case IPPROTO_TCP: + PULLUP_TO(hlen, ulp, struct tcphdr); + dst_port = TCP(ulp)->th_dport; + src_port = TCP(ulp)->th_sport; + /* save flags for dynamic rules */ + args->f_id._flags = TCP(ulp)->th_flags; + break; + + case IPPROTO_UDP: + PULLUP_TO(hlen, ulp, struct udphdr); + dst_port = UDP(ulp)->uh_dport; + src_port = UDP(ulp)->uh_sport; + break; + + case IPPROTO_ICMP: + PULLUP_TO(hlen, ulp, struct icmphdr); + //args->f_id.flags = ICMP(ulp)->icmp_type; + break; + + default: + break; + } + } + + ip = mtod(m, struct ip *); + args->f_id.src_ip = ntohl(src_ip.s_addr); + args->f_id.dst_ip = ntohl(dst_ip.s_addr); + } +#undef PULLUP_TO + if (proto) { /* we may have port numbers, store them */ + args->f_id.proto = proto; + args->f_id.src_port = src_port = ntohs(src_port); + args->f_id.dst_port = dst_port = ntohs(dst_port); + } + + IPFW_RLOCK(chain); + if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */ + IPFW_RUNLOCK(chain); + return (IP_FW_PASS); /* accept */ + } + if (args->rule.slot) { + /* + * Packet has already been tagged as a result of a previous + * match on rule args->rule aka args->rule_id (PIPE, QUEUE, + * REASS, NETGRAPH, DIVERT/TEE...) + * Validate the slot and continue from the next one + * if still present, otherwise do a lookup. + */ + f_pos = (args->rule.chain_id == chain->id) ? + args->rule.slot : + ipfw_find_rule(chain, args->rule.rulenum, + args->rule.rule_id); + } else { + f_pos = 0; + } + + /* + * Now scan the rules, and parse microinstructions for each rule. + * We have two nested loops and an inner switch. Sometimes we + * need to break out of one or both loops, or re-enter one of + * the loops with updated variables. Loop variables are: + * + * f_pos (outer loop) points to the current rule. + * On output it points to the matching rule. + * done (outer loop) is used as a flag to break the loop. + * l (inner loop) residual length of current rule. 
+ * cmd points to the current microinstruction. + * + * We break the inner loop by setting l=0 and possibly + * cmdlen=0 if we don't want to advance cmd. + * We break the outer loop by setting done=1 + * We can restart the inner loop by setting l>0 and f_pos, f, cmd + * as needed. + */ + for (; f_pos < chain->n_rules; f_pos++) { + ipfw_insn *cmd; + uint32_t tablearg = 0; + int l, cmdlen, skip_or; /* skip rest of OR block */ + struct ip_fw *f; + + f = chain->map[f_pos]; + if (V_set_disable & (1 << f->set) ) + continue; + + skip_or = 0; + for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; + l -= cmdlen, cmd += cmdlen) { + int match; + + /* + * check_body is a jump target used when we find a + * CHECK_STATE, and need to jump to the body of + * the target rule. + */ + +/* check_body: */ + cmdlen = F_LEN(cmd); + /* + * An OR block (insn_1 || .. || insn_n) has the + * F_OR bit set in all but the last instruction. + * The first match will set "skip_or", and cause + * the following instructions to be skipped until + * past the one with the F_OR bit clear. + */ + if (skip_or) { /* skip this instruction */ + if ((cmd->len & F_OR) == 0) + skip_or = 0; /* next one is good */ + continue; + } + match = 0; /* set to 1 if we succeed */ + + switch (cmd->opcode) { + /* + * The first set of opcodes compares the packet's + * fields with some pattern, setting 'match' if a + * match is found. At the end of the loop there is + * logic to deal with F_NOT and F_OR flags associated + * with the opcode. + */ + case O_NOP: + match = 1; + break; + + case O_FORWARD_MAC: + printf("ipfw: opcode %d unimplemented\n", + cmd->opcode); + break; + + case O_GID: + case O_UID: + case O_JAIL: + /* + * We only check offset == 0 && proto != 0, + * as this ensures that we have a + * packet with the ports info. 
+ */ + if (offset!=0) + break; + if (is_ipv6) /* XXX to be fixed later */ + break; + if (proto == IPPROTO_TCP || + proto == IPPROTO_UDP) + match = check_uidgid( + (ipfw_insn_u32 *)cmd, + proto, oif, + dst_ip, dst_port, + src_ip, src_port, &ucred_lookup, +#ifdef __FreeBSD__ + &ucred_cache, args->inp); +#else + (void *)&ucred_cache, + (struct inpcb *)args->m); +#endif + break; + + case O_RECV: + match = iface_match(m->m_pkthdr.rcvif, + (ipfw_insn_if *)cmd); + break; + + case O_XMIT: + match = iface_match(oif, (ipfw_insn_if *)cmd); + break; + + case O_VIA: + match = iface_match(oif ? oif : + m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd); + break; + + case O_MACADDR2: + if (args->eh != NULL) { /* have MAC header */ + u_int32_t *want = (u_int32_t *) + ((ipfw_insn_mac *)cmd)->addr; + u_int32_t *mask = (u_int32_t *) + ((ipfw_insn_mac *)cmd)->mask; + u_int32_t *hdr = (u_int32_t *)args->eh; + + match = + ( want[0] == (hdr[0] & mask[0]) && + want[1] == (hdr[1] & mask[1]) && + want[2] == (hdr[2] & mask[2]) ); + } + break; + + case O_MAC_TYPE: + if (args->eh != NULL) { + u_int16_t *p = + ((ipfw_insn_u16 *)cmd)->ports; + int i; + + for (i = cmdlen - 1; !match && i>0; + i--, p += 2) + match = (etype >= p[0] && + etype <= p[1]); + } + break; + + case O_FRAG: + match = (offset != 0); + break; + + case O_IN: /* "out" is "not in" */ + match = (oif == NULL); + break; + + case O_LAYER2: + match = (args->eh != NULL); + break; + + case O_DIVERTED: + { + /* For diverted packets, args->rule.info + * contains the divert port (in host format) + * reason and direction. + */ + uint32_t i = args->rule.info; + match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT && + cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2); + } + break; + + case O_PROTO: + /* + * We do not allow an arg of 0 so the + * check of "proto" only suffices. 
+ */ + match = (proto == cmd->arg1); + break; + + case O_IP_SRC: + match = is_ipv4 && + (((ipfw_insn_ip *)cmd)->addr.s_addr == + src_ip.s_addr); + break; + + case O_IP_SRC_LOOKUP: + case O_IP_DST_LOOKUP: + if (is_ipv4) { + uint32_t key = + (cmd->opcode == O_IP_DST_LOOKUP) ? + dst_ip.s_addr : src_ip.s_addr; + uint32_t v = 0; + + if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) { + /* generic lookup. The key must be + * in 32bit big-endian format. + */ + v = ((ipfw_insn_u32 *)cmd)->d[1]; + if (v == 0) + key = dst_ip.s_addr; + else if (v == 1) + key = src_ip.s_addr; + else if (v == 6) /* dscp */ + key = (ip->ip_tos >> 2) & 0x3f; + else if (offset != 0) + break; + else if (proto != IPPROTO_TCP && + proto != IPPROTO_UDP) + break; + else if (v == 2) + key = htonl(dst_port); + else if (v == 3) + key = htonl(src_port); + else if (v == 4 || v == 5) { + check_uidgid( + (ipfw_insn_u32 *)cmd, + proto, oif, + dst_ip, dst_port, + src_ip, src_port, &ucred_lookup, +#ifdef __FreeBSD__ + &ucred_cache, args->inp); + if (v == 4 /* O_UID */) + key = ucred_cache->cr_uid; + else if (v == 5 /* O_JAIL */) + key = ucred_cache->cr_prison->pr_id; +#else /* !__FreeBSD__ */ + (void *)&ucred_cache, + (struct inpcb *)args->m); + if (v ==4 /* O_UID */) + key = ucred_cache.uid; + else if (v == 5 /* O_JAIL */) + key = ucred_cache.xid; +#endif /* !__FreeBSD__ */ + key = htonl(key); + } else + break; + } + match = ipfw_lookup_table(chain, + cmd->arg1, key, &v); + if (!match) + break; + if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) + match = + ((ipfw_insn_u32 *)cmd)->d[0] == v; + else + tablearg = v; + } + break; + + case O_IP_SRC_MASK: + case O_IP_DST_MASK: + if (is_ipv4) { + uint32_t a = + (cmd->opcode == O_IP_DST_MASK) ? 
+ dst_ip.s_addr : src_ip.s_addr; + uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; + int i = cmdlen-1; + + for (; !match && i>0; i-= 2, p+= 2) + match = (p[0] == (a & p[1])); + } + break; + + case O_IP_SRC_ME: + if (is_ipv4) { + struct ifnet *tif; + + INADDR_TO_IFP(src_ip, tif); + match = (tif != NULL); + break; + } +#ifdef INET6 + /* FALLTHROUGH */ + case O_IP6_SRC_ME: + match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6); +#endif + break; + + case O_IP_DST_SET: + case O_IP_SRC_SET: + if (is_ipv4) { + u_int32_t *d = (u_int32_t *)(cmd+1); + u_int32_t addr = + cmd->opcode == O_IP_DST_SET ? + args->f_id.dst_ip : + args->f_id.src_ip; + + if (addr < d[0]) + break; + addr -= d[0]; /* subtract base */ + match = (addr < cmd->arg1) && + ( d[ 1 + (addr>>5)] & + (1<<(addr & 0x1f)) ); + } + break; + + case O_IP_DST: + match = is_ipv4 && + (((ipfw_insn_ip *)cmd)->addr.s_addr == + dst_ip.s_addr); + break; + + case O_IP_DST_ME: + if (is_ipv4) { + struct ifnet *tif; + + INADDR_TO_IFP(dst_ip, tif); + match = (tif != NULL); + break; + } +#ifdef INET6 + /* FALLTHROUGH */ + case O_IP6_DST_ME: + match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6); +#endif + break; + + + case O_IP_SRCPORT: + case O_IP_DSTPORT: + /* + * offset == 0 && proto != 0 is enough + * to guarantee that we have a + * packet with port info. + */ + if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP) + && offset == 0) { + u_int16_t x = + (cmd->opcode == O_IP_SRCPORT) ? 
+ src_port : dst_port ; + u_int16_t *p = + ((ipfw_insn_u16 *)cmd)->ports; + int i; + + for (i = cmdlen - 1; !match && i>0; + i--, p += 2) + match = (x>=p[0] && x<=p[1]); + } + break; + + case O_ICMPTYPE: + match = (offset == 0 && proto==IPPROTO_ICMP && + icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); + break; + +#ifdef INET6 + case O_ICMP6TYPE: + match = is_ipv6 && offset == 0 && + proto==IPPROTO_ICMPV6 && + icmp6type_match( + ICMP6(ulp)->icmp6_type, + (ipfw_insn_u32 *)cmd); + break; +#endif /* INET6 */ + + case O_IPOPT: + match = (is_ipv4 && + ipopts_match(ip, cmd) ); + break; + + case O_IPVER: + match = (is_ipv4 && + cmd->arg1 == ip->ip_v); + break; + + case O_IPID: + case O_IPLEN: + case O_IPTTL: + if (is_ipv4) { /* only for IP packets */ + uint16_t x; + uint16_t *p; + int i; + + if (cmd->opcode == O_IPLEN) + x = iplen; + else if (cmd->opcode == O_IPTTL) + x = ip->ip_ttl; + else /* must be IPID */ + x = ntohs(ip->ip_id); + if (cmdlen == 1) { + match = (cmd->arg1 == x); + break; + } + /* otherwise we have ranges */ + p = ((ipfw_insn_u16 *)cmd)->ports; + i = cmdlen - 1; + for (; !match && i>0; i--, p += 2) + match = (x >= p[0] && x <= p[1]); + } + break; + + case O_IPPRECEDENCE: + match = (is_ipv4 && + (cmd->arg1 == (ip->ip_tos & 0xe0)) ); + break; + + case O_IPTOS: + match = (is_ipv4 && + flags_match(cmd, ip->ip_tos)); + break; + + case O_TCPDATALEN: + if (proto == IPPROTO_TCP && offset == 0) { + struct tcphdr *tcp; + uint16_t x; + uint16_t *p; + int i; + + tcp = TCP(ulp); + x = iplen - + ((ip->ip_hl + tcp->th_off) << 2); + if (cmdlen == 1) { + match = (cmd->arg1 == x); + break; + } + /* otherwise we have ranges */ + p = ((ipfw_insn_u16 *)cmd)->ports; + i = cmdlen - 1; + for (; !match && i>0; i--, p += 2) + match = (x >= p[0] && x <= p[1]); + } + break; + + case O_TCPFLAGS: + match = (proto == IPPROTO_TCP && offset == 0 && + flags_match(cmd, TCP(ulp)->th_flags)); + break; + + case O_TCPOPTS: + match = (proto == IPPROTO_TCP && offset == 0 && + 
tcpopts_match(TCP(ulp), cmd)); + break; + + case O_TCPSEQ: + match = (proto == IPPROTO_TCP && offset == 0 && + ((ipfw_insn_u32 *)cmd)->d[0] == + TCP(ulp)->th_seq); + break; + + case O_TCPACK: + match = (proto == IPPROTO_TCP && offset == 0 && + ((ipfw_insn_u32 *)cmd)->d[0] == + TCP(ulp)->th_ack); + break; + + case O_TCPWIN: + match = (proto == IPPROTO_TCP && offset == 0 && + cmd->arg1 == TCP(ulp)->th_win); + break; + + case O_ESTAB: + /* reject packets which have SYN only */ + /* XXX should i also check for TH_ACK ? */ + match = (proto == IPPROTO_TCP && offset == 0 && + (TCP(ulp)->th_flags & + (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); + break; + + case O_ALTQ: { + struct pf_mtag *at; + ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; + + match = 1; + at = pf_find_mtag(m); + if (at != NULL && at->qid != 0) + break; + at = pf_get_mtag(m); + if (at == NULL) { + /* + * Let the packet fall back to the + * default ALTQ. + */ + break; + } + at->qid = altq->qid; + if (is_ipv4) + at->af = AF_INET; + else + at->af = AF_LINK; + at->hdr = ip; + break; + } + + case O_LOG: + ipfw_log(f, hlen, args, m, + oif, offset, tablearg, ip); + match = 1; + break; + + case O_PROB: + match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); + break; + + case O_VERREVPATH: + /* Outgoing packets automatically pass/match */ + match = ((oif != NULL) || + (m->m_pkthdr.rcvif == NULL) || + ( +#ifdef INET6 + is_ipv6 ? + verify_path6(&(args->f_id.src_ip6), + m->m_pkthdr.rcvif) : +#endif + verify_path(src_ip, m->m_pkthdr.rcvif, + args->f_id.fib))); + break; + + case O_VERSRCREACH: + /* Outgoing packets automatically pass/match */ + match = (hlen > 0 && ((oif != NULL) || +#ifdef INET6 + is_ipv6 ? 
+ verify_path6(&(args->f_id.src_ip6), + NULL) : +#endif + verify_path(src_ip, NULL, args->f_id.fib))); + break; + + case O_ANTISPOOF: + /* Outgoing packets automatically pass/match */ + if (oif == NULL && hlen > 0 && + ( (is_ipv4 && in_localaddr(src_ip)) +#ifdef INET6 + || (is_ipv6 && + in6_localaddr(&(args->f_id.src_ip6))) +#endif + )) + match = +#ifdef INET6 + is_ipv6 ? verify_path6( + &(args->f_id.src_ip6), + m->m_pkthdr.rcvif) : +#endif + verify_path(src_ip, + m->m_pkthdr.rcvif, + args->f_id.fib); + else + match = 1; + break; + + case O_IPSEC: +#ifdef IPSEC + match = (m_tag_find(m, + PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); +#endif + /* otherwise no match */ + break; + +#ifdef INET6 + case O_IP6_SRC: + match = is_ipv6 && + IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, + &((ipfw_insn_ip6 *)cmd)->addr6); + break; + + case O_IP6_DST: + match = is_ipv6 && + IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, + &((ipfw_insn_ip6 *)cmd)->addr6); + break; + case O_IP6_SRC_MASK: + case O_IP6_DST_MASK: + if (is_ipv6) { + int i = cmdlen - 1; + struct in6_addr p; + struct in6_addr *d = + &((ipfw_insn_ip6 *)cmd)->addr6; + + for (; !match && i > 0; d += 2, + i -= F_INSN_SIZE(struct in6_addr) + * 2) { + p = (cmd->opcode == + O_IP6_SRC_MASK) ? + args->f_id.src_ip6: + args->f_id.dst_ip6; + APPLY_MASK(&p, &d[1]); + match = + IN6_ARE_ADDR_EQUAL(&d[0], + &p); + } + } + break; + + case O_FLOW6ID: + match = is_ipv6 && + flow6id_match(args->f_id.flow_id6, + (ipfw_insn_u32 *) cmd); + break; + + case O_EXT_HDR: + match = is_ipv6 && + (ext_hd & ((ipfw_insn *) cmd)->arg1); + break; + + case O_IP6: + match = is_ipv6; + break; +#endif + + case O_IP4: + match = is_ipv4; + break; + + case O_TAG: { + struct m_tag *mtag; + uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + + /* Packet is already tagged with this tag? */ + mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); + + /* We have `untag' action when F_NOT flag is + * present. 
And we must remove this mtag from + * mbuf and reset `match' to zero (`match' will + * be inversed later). + * Otherwise we should allocate new mtag and + * push it into mbuf. + */ + if (cmd->len & F_NOT) { /* `untag' action */ + if (mtag != NULL) + m_tag_delete(m, mtag); + match = 0; + } else if (mtag == NULL) { + if ((mtag = m_tag_alloc(MTAG_IPFW, + tag, 0, M_NOWAIT)) != NULL) + m_tag_prepend(m, mtag); + match = 1; + } + break; + } + + case O_FIB: /* try match the specified fib */ + if (args->f_id.fib == cmd->arg1) + match = 1; + break; + + case O_TAGGED: { + struct m_tag *mtag; + uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + + if (cmdlen == 1) { + match = m_tag_locate(m, MTAG_IPFW, + tag, NULL) != NULL; + break; + } + + /* we have ranges */ + for (mtag = m_tag_first(m); + mtag != NULL && !match; + mtag = m_tag_next(m, mtag)) { + uint16_t *p; + int i; + + if (mtag->m_tag_cookie != MTAG_IPFW) + continue; + + p = ((ipfw_insn_u16 *)cmd)->ports; + i = cmdlen - 1; + for(; !match && i > 0; i--, p += 2) + match = + mtag->m_tag_id >= p[0] && + mtag->m_tag_id <= p[1]; + } + break; + } + + /* + * The second set of opcodes represents 'actions', + * i.e. the terminal part of a rule once the packet + * matches all previous patterns. + * Typically there is only one action for each rule, + * and the opcode is stored at the end of the rule + * (but there are exceptions -- see below). + * + * In general, here we set retval and terminate the + * outer loop (would be a 'break 3' in some language, + * but we need to set l=0, done=1) + * + * Exceptions: + * O_COUNT and O_SKIPTO actions: + * instead of terminating, we jump to the next rule + * (setting l=0), or to the SKIPTO target (setting + * f/f_len, cmd and l as needed), respectively. 
+ * + * O_TAG, O_LOG and O_ALTQ action parameters: + * perform some action and set match = 1; + * + * O_LIMIT and O_KEEP_STATE: these opcodes are + * not real 'actions', and are stored right + * before the 'action' part of the rule. + * These opcodes try to install an entry in the + * state tables; if successful, we continue with + * the next opcode (match=1; break;), otherwise + * the packet must be dropped (set retval, + * break loops with l=0, done=1) + * + * O_PROBE_STATE and O_CHECK_STATE: these opcodes + * cause a lookup of the state table, and a jump + * to the 'action' part of the parent rule + * if an entry is found, or + * (CHECK_STATE only) a jump to the next rule if + * the entry is not found. + * The result of the lookup is cached so that + * further instances of these opcodes become NOPs. + * The jump to the next rule is done by setting + * l=0, cmdlen=0. + */ + case O_LIMIT: + case O_KEEP_STATE: + if (ipfw_install_state(f, + (ipfw_insn_limit *)cmd, args, tablearg)) { + /* error or limit violation */ + retval = IP_FW_DENY; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + } + match = 1; + break; + + case O_PROBE_STATE: + case O_CHECK_STATE: + /* + * dynamic rules are checked at the first + * keep-state or check-state occurrence, + * with the result being stored in dyn_dir. + * The compiler introduces a PROBE_STATE + * instruction for us when we have a + * KEEP_STATE (because PROBE_STATE needs + * to be run first). + */ + if (dyn_dir == MATCH_UNKNOWN && + (q = ipfw_lookup_dyn_rule(&args->f_id, + &dyn_dir, proto == IPPROTO_TCP ? + TCP(ulp) : NULL)) + != NULL) { + /* + * Found dynamic entry, update stats + * and jump to the 'action' part of + * the parent rule by setting + * f, cmd, l and clearing cmdlen. + */ + q->pcnt++; + q->bcnt += pktlen; + /* XXX we would like to have f_pos + * readily accessible in the dynamic + * rule, instead of having to + * lookup q->rule. 
+ */ + f = q->rule; + f_pos = ipfw_find_rule(chain, + f->rulenum, f->id); + cmd = ACTION_PTR(f); + l = f->cmd_len - f->act_ofs; + ipfw_dyn_unlock(); + cmdlen = 0; + match = 1; + break; + } + /* + * Dynamic entry not found. If CHECK_STATE, + * skip to next rule, if PROBE_STATE just + * ignore and continue with next opcode. + */ + if (cmd->opcode == O_CHECK_STATE) + l = 0; /* exit inner loop */ + match = 1; + break; + + case O_ACCEPT: + retval = 0; /* accept */ + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_PIPE: + case O_QUEUE: + set_match(args, f_pos, chain); + args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + if (cmd->opcode == O_PIPE) + args->rule.info |= IPFW_IS_PIPE; + if (V_fw_one_pass) + args->rule.info |= IPFW_ONEPASS; + retval = IP_FW_DUMMYNET; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_DIVERT: + case O_TEE: + if (args->eh) /* not on layer 2 */ + break; + /* otherwise this is terminal */ + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + retval = (cmd->opcode == O_DIVERT) ? + IP_FW_DIVERT : IP_FW_TEE; + set_match(args, f_pos, chain); + args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + break; + + case O_COUNT: + f->pcnt++; /* update stats */ + f->bcnt += pktlen; + f->timestamp = time_uptime; + l = 0; /* exit inner loop */ + break; + + case O_SKIPTO: + f->pcnt++; /* update stats */ + f->bcnt += pktlen; + f->timestamp = time_uptime; + /* If possible use cached f_pos (in f->next_rule), + * whose version is written in f->next_rule + * (horrible hacks to avoid changing the ABI). + */ + if (cmd->arg1 != IP_FW_TABLEARG && + (uintptr_t)f->x_next == chain->id) { + f_pos = (uintptr_t)f->next_rule; + } else { + int i = (cmd->arg1 == IP_FW_TABLEARG) ? 
+ tablearg : cmd->arg1; + /* make sure we do not jump backward */ + if (i <= f->rulenum) + i = f->rulenum + 1; + f_pos = ipfw_find_rule(chain, i, 0); + /* update the cache */ + if (cmd->arg1 != IP_FW_TABLEARG) { + f->next_rule = + (void *)(uintptr_t)f_pos; + f->x_next = + (void *)(uintptr_t)chain->id; + } + } + /* + * Skip disabled rules, and re-enter + * the inner loop with the correct + * f_pos, f, l and cmd. + * Also clear cmdlen and skip_or + */ + for (; f_pos < chain->n_rules - 1 && + (V_set_disable & + (1 << chain->map[f_pos]->set)); + f_pos++) + ; + /* Re-enter the inner loop at the skipto rule. */ + f = chain->map[f_pos]; + l = f->cmd_len; + cmd = f->cmd; + match = 1; + cmdlen = 0; + skip_or = 0; + continue; + break; /* not reached */ + + case O_REJECT: + /* + * Drop the packet and send a reject notice + * if the packet is not ICMP (or is an ICMP + * query), and it is not multicast/broadcast. + */ + if (hlen > 0 && is_ipv4 && offset == 0 && + (proto != IPPROTO_ICMP || + is_icmp_query(ICMP(ulp))) && + !(m->m_flags & (M_BCAST|M_MCAST)) && + !IN_MULTICAST(ntohl(dst_ip.s_addr))) { + send_reject(args, cmd->arg1, iplen, ip); + m = args->m; + } + /* FALLTHROUGH */ +#ifdef INET6 + case O_UNREACH6: + if (hlen > 0 && is_ipv6 && + ((offset & IP6F_OFF_MASK) == 0) && + (proto != IPPROTO_ICMPV6 || + (is_icmp6_query(icmp6_type) == 1)) && + !(m->m_flags & (M_BCAST|M_MCAST)) && + !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) { + send_reject6( + args, cmd->arg1, hlen, + (struct ip6_hdr *)ip); + m = args->m; + } + /* FALLTHROUGH */ +#endif + case O_DENY: + retval = IP_FW_DENY; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_FORWARD_IP: + if (args->eh) /* not valid on layer2 pkts */ + break; + if (!q || dyn_dir == MATCH_FORWARD) { + struct sockaddr_in *sa; + sa = &(((ipfw_insn_sa *)cmd)->sa); + if (sa->sin_addr.s_addr == INADDR_ANY) { + bcopy(sa, &args->hopstore, + sizeof(*sa)); + args->hopstore.sin_addr.s_addr = + htonl(tablearg); + 
args->next_hop = &args->hopstore; + } else { + args->next_hop = sa; + } + } + retval = IP_FW_PASS; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_NETGRAPH: + case O_NGTEE: + set_match(args, f_pos, chain); + args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + if (V_fw_one_pass) + args->rule.info |= IPFW_ONEPASS; + retval = (cmd->opcode == O_NETGRAPH) ? + IP_FW_NETGRAPH : IP_FW_NGTEE; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_SETFIB: + f->pcnt++; /* update stats */ + f->bcnt += pktlen; + f->timestamp = time_uptime; + M_SETFIB(m, cmd->arg1); + args->f_id.fib = cmd->arg1; + l = 0; /* exit inner loop */ + break; + + case O_NAT: + if (!IPFW_NAT_LOADED) { + retval = IP_FW_DENY; + } else { + struct cfg_nat *t; + int nat_id; + + set_match(args, f_pos, chain); + t = ((ipfw_insn_nat *)cmd)->nat; + if (t == NULL) { + nat_id = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + t = (*lookup_nat_ptr)(&chain->nat, nat_id); + + if (t == NULL) { + retval = IP_FW_DENY; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + } + if (cmd->arg1 != IP_FW_TABLEARG) + ((ipfw_insn_nat *)cmd)->nat = t; + } + retval = ipfw_nat_ptr(args, t, m); + } + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_REASS: { + int ip_off; + + f->pcnt++; + f->bcnt += pktlen; + l = 0; /* in any case exit inner loop */ + ip_off = ntohs(ip->ip_off); + + /* if not fragmented, go to next rule */ + if ((ip_off & (IP_MF | IP_OFFMASK)) == 0) + break; + /* + * ip_reass() expects len & off in host + * byte order. + */ + SET_HOST_IPLEN(ip); + + args->m = m = ip_reass(m); + + /* + * do IP header checksum fixup. 
+			 */
+			if (m == NULL) { /* fragment got swallowed */
+				retval = IP_FW_DENY;
+			} else { /* good, packet complete */
+				int hlen;
+
+				ip = mtod(m, struct ip *);
+				hlen = ip->ip_hl << 2;
+				SET_NET_IPLEN(ip);
+				ip->ip_sum = 0;
+				if (hlen == sizeof(struct ip))
+					ip->ip_sum = in_cksum_hdr(ip);
+				else
+					ip->ip_sum = in_cksum(m, hlen);
+				retval = IP_FW_REASS;
+				set_match(args, f_pos, chain);
+			}
+			done = 1;	/* exit outer loop */
+			break;
+		}
+
+		default:
+			panic("-- unknown opcode %d\n", cmd->opcode);
+		} /* end of switch() on opcodes */
+		/*
+		 * if we get here with l=0, then match is irrelevant.
+		 */
+
+		if (cmd->len & F_NOT)
+			match = !match;
+
+		if (match) {
+			if (cmd->len & F_OR)
+				skip_or = 1;
+		} else {
+			if (!(cmd->len & F_OR)) /* not an OR block, */
+				break;		/* try next rule */
+		}
+
+	}	/* end of inner loop, scan opcodes */
+
+	if (done)
+		break;
+
+/* next_rule:; */	/* try next rule */
+
+	}		/* end of outer for, scan rules */
+
+	if (done) {
+		struct ip_fw *rule = chain->map[f_pos];
+		/* Update statistics */
+		rule->pcnt++;
+		rule->bcnt += pktlen;
+		rule->timestamp = time_uptime;
+	} else {
+		retval = IP_FW_DENY;
+		printf("ipfw: ouch!, skip past end of rules, denying packet\n");
+	}
+	IPFW_RUNLOCK(chain);
+#ifdef __FreeBSD__
+	if (ucred_cache != NULL)
+		crfree(ucred_cache);
+#endif
+	return (retval);
+
+pullup_failed:
+	if (V_fw_verbose)
+		printf("ipfw: pullup failed\n");
+	return (IP_FW_DENY);
+}
+
+/*
+ * Module and VNET glue
+ */
+
+/*
+ * Stuff that must be initialised only on boot or module load.
+ *
+ * Calls ipfw_dyn_attach(), prints a one-line status banner and then
+ * calls ipfw_log_bpf(1).  The banner is written by two printf() calls:
+ * the first ends with "logging " and no newline, the second completes
+ * the line with the logging state.
+ *
+ * Returns 'error', which is initialised to 0 and never modified here.
+ */
+static int
+ipfw_init(void)
+{
+	int error = 0;
+
+	ipfw_dyn_attach();
+	/*
+	 * Only print out this stuff the first time around,
+	 * when called from the sysinit code.
+	 *
+	 * The format string is assembled from adjacent string literals
+	 * selected at compile time (INET6, IPFIREWALL_FORWARD).  Its three
+	 * %s conversions are filled, in order, by the IPDIVERT string, the
+	 * IPFIREWALL_NAT string and the default policy string.
+	 */
+	printf("ipfw2 "
+#ifdef INET6
+		"(+ipv6) "
+#endif
+		"initialized, divert %s, nat %s, "
+		"rule-based forwarding "
+#ifdef IPFIREWALL_FORWARD
+		"enabled, "
+#else
+		"disabled, "
+#endif
+		"default to %s, logging ",
+#ifdef IPDIVERT
+		"enabled",
+#else
+		"loadable",
+#endif
+#ifdef IPFIREWALL_NAT
+		"enabled",
+#else
+		"loadable",
+#endif
+		default_to_accept ? "accept" : "deny");
+
+	/*
+	 * Note: V_xxx variables can be accessed here but the vnet specific
+	 * initializer may not have been called yet for the VIMAGE case.
+	 * Tuneables will have been processed. We will print out values for
+	 * the default vnet.
+	 * XXX This should all be rationalized AFTER 8.0
+	 */
+	/* Finish the banner line started above with the logging state. */
+	if (V_fw_verbose == 0)
+		printf("disabled\n");
+	else if (V_verbose_limit == 0)
+		printf("unlimited\n");
+	else
+		printf("limited to %d packets/entry by default\n",
+		    V_verbose_limit);
+
+	ipfw_log_bpf(1); /* init */
+	return (error);
+}
+
+/*
+ * Called for the removal of the last instance only on module unload.
+ *
+ * Calls ipfw_log_bpf(0) and ipfw_dyn_detach() -- the counterparts of the
+ * ipfw_dyn_attach() and ipfw_log_bpf(1) calls made by ipfw_init(), in
+ * the opposite order -- and then prints an "unloaded" message.
+ */
+static void
+ipfw_destroy(void)
+{
+
+	ipfw_log_bpf(0); /* uninit */
+	ipfw_dyn_detach();
+	printf("IP firewall unloaded\n");
+}
+
+/*
+ * Stuff that must be initialized for every instance
+ * (including the first of course).
+ */
+/*
+ * vnet_ipfw_init() prepares V_layer3_chain for use:
+ *  - applies the compile-time defaults (V_autoinc_step,
+ *    V_fw_deny_unknown_exthdrs and, when the matching options are
+ *    defined, V_fw_verbose, V_verbose_limit and the NAT list head);
+ *  - allocates a one-entry rule map and the default rule, both with
+ *    M_NOWAIT | M_ZERO;
+ *  - calls ipfw_init_tables(), fills in the default rule, initialises
+ *    the chain lock, calls ipfw_dyn_init(), marks the instance ready,
+ *    installs the ctl/chk pointers and attaches the hooks.
+ *
+ * Returns ENOSPC if either allocation fails, otherwise whatever
+ * ipfw_attach_hooks(1) returns.  A non-zero return from
+ * ipfw_init_tables() panics.  'unused' is not referenced.
+ */
+static int
+vnet_ipfw_init(const void *unused)
+{
+	int error;
+	struct ip_fw *rule = NULL;
+	struct ip_fw_chain *chain;
+
+	chain = &V_layer3_chain;
+
+	/* First set up some values that are compile time options */
+	V_autoinc_step = 100;	/* bounded to 1..1000 in add_rule() */
+	V_fw_deny_unknown_exthdrs = 1;
+#ifdef IPFIREWALL_VERBOSE
+	V_fw_verbose = 1;
+#endif
+#ifdef IPFIREWALL_VERBOSE_LIMIT
+	V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
+#endif
+#ifdef IPFIREWALL_NAT
+	LIST_INIT(&chain->nat);
+#endif
+
+	/* insert the default rule and create the initial map */
+	chain->n_rules = 1;
+	chain->static_len = sizeof(struct ip_fw);
+	chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_NOWAIT | M_ZERO);
+	/* The rule is only allocated if the map allocation succeeded. */
+	if (chain->map)
+		rule = malloc(chain->static_len, M_IPFW, M_NOWAIT | M_ZERO);
+	if (rule == NULL) {
+		/*
+		 * Either allocation failed.
+		 * NOTE(review): chain->map is freed but not reset to NULL
+		 * here, and chain->n_rules stays at 1 -- confirm nothing
+		 * walks the chain after this failure.
+		 */
+		if (chain->map)
+			free(chain->map, M_IPFW);
+		printf("ipfw2: ENOSPC initializing default rule "
+			"(support disabled)\n");
+		return (ENOSPC);
+	}
+	error = ipfw_init_tables(chain);
+	if (error) {
+		panic("init_tables"); /* XXX Marko fix this ! */
+	}
+
+	/*
+	 * fill and insert the default rule: a single one-word instruction
+	 * (O_ACCEPT or O_DENY, chosen by default_to_accept) numbered
+	 * IPFW_DEFAULT_RULE in set RESVD_SET.
+	 */
+	rule->act_ofs = 0;
+	rule->rulenum = IPFW_DEFAULT_RULE;
+	rule->cmd_len = 1;
+	rule->set = RESVD_SET;
+	rule->cmd[0].len = 1;
+	rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY;
+	chain->rules = chain->default_rule = chain->map[0] = rule;
+	chain->id = rule->id = 1;
+
+	IPFW_LOCK_INIT(chain);
+	ipfw_dyn_init();
+
+	/* The chain is set up: mark this instance as ready */
+	V_ipfw_vnet_ready = 1;		/* Open for business */
+
+	/*
+	 * Hook the sockopt handler, and the layer2 (V_ip_fw_chk_ptr)
+	 * and pfil hooks for ipv4 and ipv6. Even if the latter two fail
+	 * we still keep the module alive because the sockopt and
+	 * layer2 paths are still useful.
+	 * ipfw[6]_hook return 0 on success, ENOENT on failure,
+	 * so we can ignore the exact return value and just set a flag.
+	 *
+	 * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so
+	 * changes in the underlying (per-vnet) variables trigger
+	 * immediate hook()/unhook() calls.
+	 * In layer2 we have the same behaviour, except that V_ether_ipfw
+	 * is checked on each packet because there are no pfil hooks.
+	 *
+	 * NOTE(review): despite the text above, the value returned by
+	 * ipfw_attach_hooks(1) is passed straight back to our caller.
+	 */
+	V_ip_fw_ctl_ptr = ipfw_ctl;
+	V_ip_fw_chk_ptr = ipfw_chk;
+	error = ipfw_attach_hooks(1);
+	return (error);
+}
+
+/*
+ * Called for the removal of each instance.
+ *
+ * Marks the instance as not ready, detaches the hooks and clears the
+ * chk/ctl pointers, then tears down V_layer3_chain while holding both
+ * the UH write lock and the write lock.  Every rule in chain->map is
+ * pushed onto a list linked through x_next, and that list is passed to
+ * ipfw_reap_rules() after both locks have been released.
+ * Always returns 0; 'unused' is not referenced.
+ */
+static int
+vnet_ipfw_uninit(const void *unused)
+{
+	struct ip_fw *reap, *rule;
+	struct ip_fw_chain *chain = &V_layer3_chain;
+	int i;
+
+	V_ipfw_vnet_ready = 0; /* tell new callers to go away */
+	/*
+	 * disconnect from ipv4, ipv6, layer2 and sockopt.
+	 * Then grab, release and grab again the WLOCK so we make
+	 * sure the update is propagated and nobody will be in.
+	 */
+	(void)ipfw_attach_hooks(0 /* detach */);
+	V_ip_fw_chk_ptr = NULL;
+	V_ip_fw_ctl_ptr = NULL;
+	IPFW_UH_WLOCK(chain);
+	IPFW_UH_WUNLOCK(chain);
+	IPFW_UH_WLOCK(chain);
+
+	IPFW_WLOCK(chain);
+	IPFW_WUNLOCK(chain);
+	IPFW_WLOCK(chain);
+
+	ipfw_dyn_uninit(0);	/* run the callout_drain */
+	ipfw_destroy_tables(chain);
+	/*
+	 * Link all rules through x_next so they can be passed to
+	 * ipfw_reap_rules() in a single call.
+	 * NOTE(review): chain->map[i] is read here before the NULL test
+	 * on chain->map just below, so that test cannot protect this loop.
+	 */
+	reap = NULL;
+	for (i = 0; i < chain->n_rules; i++) {
+		rule = chain->map[i];
+		rule->x_next = reap;
+		reap = rule;
+	}
+	if (chain->map)
+		free(chain->map, M_IPFW);
+	IPFW_WUNLOCK(chain);
+	IPFW_UH_WUNLOCK(chain);
+	if (reap != NULL)
+		ipfw_reap_rules(reap);
+	IPFW_LOCK_DESTROY(chain);
+	ipfw_dyn_uninit(1);	/* free the remaining parts */
+	return 0;
+}
+
+/*
+ * Module event handler.
+ * In general we have the choice of handling most of these events by the
+ * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to
+ * use the SYSINIT handlers as they are more capable of expressing the
+ * flow of control during module and vnet operations, so this is just
+ * a skeleton. Note there is no SYSINIT equivalent of the module
+ * SHUTDOWN handler, but we don't have anything to do in that case anyhow.
+ */
+/*
+ * ipfw_modevent() does no work of its own: MOD_LOAD, MOD_QUIESCE,
+ * MOD_UNLOAD and MOD_SHUTDOWN all return 0, any other event type
+ * returns EOPNOTSUPP.  'mod' and 'unused' are not referenced.
+ */
+static int
+ipfw_modevent(module_t mod, int type, void *unused)
+{
+	int err = 0;
+
+	switch (type) {
+	case MOD_LOAD:
+		/* Called once at module load or
+		 * system boot if compiled in. */
+		break;
+	case MOD_QUIESCE:
+		/* Called before unload. May veto unloading. */
+		break;
+	case MOD_UNLOAD:
+		/* Called during unload. */
+		break;
+	case MOD_SHUTDOWN:
+		/* Called during system shutdown. */
+		break;
+	default:
+		err = EOPNOTSUPP;
+		break;
+	}
+	return err;
+}
+
+/* Module descriptor handed to DECLARE_MODULE() below. */
+static moduledata_t ipfwmod = {
+	"ipfw",
+	ipfw_modevent,
+	0
+};
+
+/*
+ * Define startup order.
+ * All registrations below share IPFW_SI_SUB_FIREWALL; the three order
+ * values are consecutive: module event handler, then ipfw_init(),
+ * then vnet_ipfw_init().
+ */
+#define	IPFW_SI_SUB_FIREWALL	SI_SUB_PROTO_IFATTACHDOMAIN
+#define	IPFW_MODEVENT_ORDER	(SI_ORDER_ANY - 255) /* On boot slot in here. */
+#define	IPFW_MODULE_ORDER	(IPFW_MODEVENT_ORDER + 1) /* A little later. */
+#define	IPFW_VNET_ORDER		(IPFW_MODEVENT_ORDER + 2) /* Later still. */
+
+DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER);
+MODULE_VERSION(ipfw, 2);
+/* should declare some dependencies here */
+
+/*
+ * Starting up. Done in order after ipfwmod() has been called.
+ * VNET_SYSINIT is also called for each existing vnet and each new vnet.
+ */
+SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
+	    ipfw_init, NULL);
+VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
+	    vnet_ipfw_init, NULL);
+
+/*
+ * Closing up shop. These are done in REVERSE ORDER, but still
+ * after ipfwmod() has been called. Not called on reboot.
+ * VNET_SYSUNINIT is also called for each exiting vnet as it exits,
+ * or when the module is unloaded.
+ */ +SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, + ipfw_destroy, NULL); +VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, + vnet_ipfw_uninit, NULL); +/* end of file */ diff --git a/dummynet2/ip_fw_dynamic.c b/dummynet2/ip_fw_dynamic.c new file mode 100644 index 0000000..d33849d --- /dev/null +++ b/dummynet2/ip_fw_dynamic.c @@ -0,0 +1,1241 @@ +/*- + * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_dynamic.c 200601 2009-12-16 10:48:40Z luigi $"); + +#define DEB(x) +#define DDB(x) x + +/* + * Dynamic rule support for ipfw + */ + +#if !defined(KLD_MODULE) +#include "opt_ipfw.h" +#include "opt_ipdivert.h" +#include "opt_ipdn.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif +#include "opt_inet6.h" +#include "opt_ipsec.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ETHERTYPE_IP */ +#include +#include + +#include +#include +#include /* ip_defttl */ +#include +#include +#include +#include + +#include /* IN6_ARE_ADDR_EQUAL */ +#ifdef INET6 +#include +#include +#endif + +#include /* XXX for in_cksum */ + +#ifdef MAC +#include +#endif + +/* + * Description of dynamic rules. + * + * Dynamic rules are stored in lists accessed through a hash table + * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can + * be modified through the sysctl variable dyn_buckets which is + * updated when the table becomes empty. + * + * XXX currently there is only one list, ipfw_dyn. + * + * When a packet is received, its address fields are first masked + * with the mask defined for the rule, then hashed, then matched + * against the entries in the corresponding list. + * Dynamic rules can be used for different purposes: + * + stateful rules; + * + enforcing limits on the number of sessions; + * + in-kernel NAT (not implemented yet) + * + * The lifetime of dynamic rules is regulated by dyn_*_lifetime, + * measured in seconds and depending on the flags. + * + * The total number of dynamic rules is stored in dyn_count. + * The max number of dynamic rules is dyn_max. When we reach + * the maximum number of rules we do not create anymore. 
This is
+ * done to avoid consuming too much memory, but also too much
+ * time when searching on each packet (ideally, we should try instead
+ * to put a limit on the length of the list on each bucket...).
+ *
+ * Each dynamic rule holds a pointer to the parent ipfw rule so
+ * we know what action to perform. Dynamic rules are removed when
+ * the parent rule is deleted. XXX we should make them survive.
+ *
+ * There are some limitations with dynamic rules -- we do not
+ * obey the 'randomized match', and we do not do multiple
+ * passes through the firewall. XXX check the latter!!!
+ */
+
+/*
+ * Static variables followed by global ones
+ *
+ * Each VNET_DEFINE()d variable is reached through the V_ macro of the
+ * same name defined right after the declarations.
+ *   ipfw_dyn_v        hash table of dynamic-rule lists
+ *   dyn_buckets       bucket count settable through sysctl
+ *   curr_dyn_buckets  size of the table, as stated in the description
+ *                     comment above
+ *   ipfw_timeout      a struct callout; its handler is not visible in
+ *                     this part of the file
+ */
+static VNET_DEFINE(ipfw_dyn_rule **, ipfw_dyn_v);
+static VNET_DEFINE(u_int32_t, dyn_buckets);
+static VNET_DEFINE(u_int32_t, curr_dyn_buckets);
+static VNET_DEFINE(struct callout, ipfw_timeout);
+#define	V_ipfw_dyn_v			VNET(ipfw_dyn_v)
+#define	V_dyn_buckets			VNET(dyn_buckets)
+#define	V_curr_dyn_buckets		VNET(curr_dyn_buckets)
+#define V_ipfw_timeout                  VNET(ipfw_timeout)
+
+/* Zone that removed dynamic rules are returned to (see UNLINK_DYN_RULE). */
+static uma_zone_t ipfw_dyn_rule_zone;
+/*
+ * On non-FreeBSD builds the lock is declared with DEFINE_SPINLOCK();
+ * the IPFW_DYN_* macros below still use the mtx_*() names, so those are
+ * presumably remapped by the compatibility headers -- TODO confirm.
+ */
+#ifndef __FreeBSD__
+DEFINE_SPINLOCK(ipfw_dyn_mtx);
+#else
+static struct mtx ipfw_dyn_mtx;		/* mutex guarding dynamic rules */
+#endif
+
+/* Thin wrappers so the rest of the file never names ipfw_dyn_mtx directly. */
+#define	IPFW_DYN_LOCK_INIT() \
+	mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF)
+#define	IPFW_DYN_LOCK_DESTROY()	mtx_destroy(&ipfw_dyn_mtx)
+#define	IPFW_DYN_LOCK()		mtx_lock(&ipfw_dyn_mtx)
+#define	IPFW_DYN_UNLOCK()	mtx_unlock(&ipfw_dyn_mtx)
+#define	IPFW_DYN_LOCK_ASSERT()	mtx_assert(&ipfw_dyn_mtx, MA_OWNED)
+
+/*
+ * Release the dynamic-rule lock on behalf of code that cannot see the
+ * static lock or the macros above.  The O_PROBE_STATE / O_CHECK_STATE
+ * handling in the rule-matching loop calls this once it has finished
+ * with the entry returned by ipfw_lookup_dyn_rule() -- presumably that
+ * lookup returns with the lock held; confirm against its definition.
+ */
+void
+ipfw_dyn_unlock(void)
+{
+	IPFW_DYN_UNLOCK();
+}
+
+/*
+ * Timeouts for various events in handling dynamic rules.
+ */
+/*
+ * One lifetime per kind of event; each of the six is exported
+ * read-write by the sysctl block further down, which also carries its
+ * description string.  Their initial values are not set in this part
+ * of the file.
+ */
+static VNET_DEFINE(u_int32_t, dyn_ack_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_syn_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_fin_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_rst_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_udp_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_short_lifetime);
+
+#define	V_dyn_ack_lifetime		VNET(dyn_ack_lifetime)
+#define	V_dyn_syn_lifetime		VNET(dyn_syn_lifetime)
+#define	V_dyn_fin_lifetime		VNET(dyn_fin_lifetime)
+#define	V_dyn_rst_lifetime		VNET(dyn_rst_lifetime)
+#define	V_dyn_udp_lifetime		VNET(dyn_udp_lifetime)
+#define	V_dyn_short_lifetime		VNET(dyn_short_lifetime)
+
+/*
+ * Keepalives are sent if dyn_keepalive is set. They are sent every
+ * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
+ * seconds of lifetime of a rule.
+ * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
+ * than dyn_keepalive_period.
+ *
+ * Of the three keepalive variables only dyn_keepalive has an entry in
+ * the sysctl block below.
+ */
+
+static VNET_DEFINE(u_int32_t, dyn_keepalive_interval);
+static VNET_DEFINE(u_int32_t, dyn_keepalive_period);
+static VNET_DEFINE(u_int32_t, dyn_keepalive);
+
+#define	V_dyn_keepalive_interval	VNET(dyn_keepalive_interval)
+#define	V_dyn_keepalive_period		VNET(dyn_keepalive_period)
+#define	V_dyn_keepalive			VNET(dyn_keepalive)
+
+static VNET_DEFINE(u_int32_t, dyn_count);	/* # of dynamic rules */
+static VNET_DEFINE(u_int32_t, dyn_max);		/* max # of dynamic rules */
+
+#define	V_dyn_count			VNET(dyn_count)
+#define	V_dyn_max			VNET(dyn_max)
+
+/*
+ * sysctl knobs under _net_inet_ip_fw.  curr_dyn_buckets and dyn_count
+ * are read-only (CTLFLAG_RD); every other entry is read-write.
+ * SYSBEGIN()/SYSEND are not defined in this part of the file.
+ */
+#ifdef SYSCTL_NODE
+
+SYSBEGIN(f2)
+
+SYSCTL_DECL(_net_inet_ip_fw);
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
+    CTLFLAG_RW, &VNET_NAME(dyn_buckets), 0,
+    "Number of dyn. buckets");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
+    CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0,
+    "Current Number of dyn. buckets");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_count,
+    CTLFLAG_RD, &VNET_NAME(dyn_count), 0,
+    "Number of dyn. rules");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_max,
+    CTLFLAG_RW, &VNET_NAME(dyn_max), 0,
+    "Max number of dyn. rules");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0,
+    "Lifetime of dyn. rules for acks");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0,
+    "Lifetime of dyn. rules for syn");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0,
+    "Lifetime of dyn. rules for fin");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0,
+    "Lifetime of dyn. rules for rst");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0,
+    "Lifetime of dyn. rules for UDP");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0,
+    "Lifetime of dyn. rules for other situations");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive,
+    CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
+    "Enable keepalives for dyn. rules");
+
+SYSEND
+
+#endif /* SYSCTL_NODE */
+
+
+/*
+ * Hash an IPv6 flow id: XOR of 32-bit words 2 and 3 of the destination
+ * and source addresses together with both ports.  Words 0 and 1 of the
+ * addresses do not take part.  The expression is symmetric in source
+ * and destination.  The value is returned as computed; no reduction to
+ * a bucket index is done here.
+ */
+static __inline int
+hash_packet6(struct ipfw_flow_id *id)
+{
+	u_int32_t i;
+	i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^
+	    (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^
+	    (id->src_ip6.__u6_addr.__u6_addr32[2]) ^
+	    (id->src_ip6.__u6_addr.__u6_addr32[3]) ^
+	    (id->dst_port) ^ (id->src_port);
+	return i;
+}
+
+/*
+ * IMPORTANT: the hash function for dynamic rules must be commutative
+ * in source and destination (ip,port), because rules are bidirectional
+ * and we want to find both in the same bucket.
+ *
+ * The final '& (V_curr_dyn_buckets - 1)' only yields a valid bucket
+ * index when V_curr_dyn_buckets is a non-zero power of 2.
+ */
+static __inline int
+hash_packet(struct ipfw_flow_id *id)
+{
+	u_int32_t i;
+
+#ifdef INET6
+	if (IS_IP6_FLOW_ID(id))
+		i = hash_packet6(id);
+	else
+#endif /* INET6 */
+	i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port);
+	i &= (V_curr_dyn_buckets - 1);
+	return i;
+}
+
+/*
+ * Debug helper for UNLINK_DYN_RULE: print the flow being removed and
+ * the number of dynamic rules that will remain (V_dyn_count - 1, since
+ * the macro decrements the counter only after calling this).
+ */
+static __inline void
+unlink_dyn_rule_print(struct ipfw_flow_id *id)
+{
+	struct in_addr da;
+#ifdef INET6
+	char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
+#else
+	char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+
+#ifdef INET6
+	if (IS_IP6_FLOW_ID(id)) {
+		ip6_sprintf(src, &id->src_ip6);
+		ip6_sprintf(dst, &id->dst_ip6);
+	} else
+#endif
+	{
+		/* flow ids hold IPv4 addresses in host order */
+		da.s_addr = htonl(id->src_ip);
+		inet_ntoa_r(da, src);
+		da.s_addr = htonl(id->dst_ip);
+		inet_ntoa_r(da, dst);
+	}
+	printf("ipfw: unlink entry %s %d -> %s %d, %d left\n",
+	    src, id->src_port, dst, id->dst_port, V_dyn_count - 1);
+}
+
+/**
+ * unlink a dynamic rule from a chain. prev is a pointer to
+ * the previous one, q is a pointer to the rule to delete,
+ * head is a pointer to the head of the queue.
+ * Modifies q and potentially also head.
+ *
+ * On exit q points to the element that followed the deleted one, so a
+ * caller iterating the list must NOT advance q again (callers use
+ * 'continue').  The expansion is a bare brace block, not
+ * do { } while (0), and evaluates its arguments more than once: pass
+ * plain lvalues only.
+ */
+#define UNLINK_DYN_RULE(prev, head, q) {				\
+	ipfw_dyn_rule *old_q = q;					\
+									\
+	/* remove a refcount to the parent */				\
+	if (q->dyn_type == O_LIMIT)					\
+		q->parent->count--;					\
+	DEB(unlink_dyn_rule_print(&q->id);)				\
+	if (prev != NULL)						\
+		prev->next = q = q->next;				\
+	else								\
+		head = q = q->next;					\
+	V_dyn_count--;							\
+	uma_zfree(ipfw_dyn_rule_zone, old_q); }
+
+/* True when time a is not later than b; the signed difference tolerates wrap. */
+#define TIME_LEQ(a,b)       ((int)((a)-(b)) <= 0)
+
+/**
+ * Remove dynamic rules pointing to "rule", or all of them if rule == NULL.
+ *
+ * If keep_me == NULL, rules are deleted even if not expired,
+ * otherwise only expired rules are removed.
+ *
+ * The value of the second parameter is also used to identify
+ * a rule we absolutely do not want to remove (e.g. because we are
+ * holding a reference to it -- this is the case with O_LIMIT_PARENT
+ * rules).
The pointer is only used for comparison, so any non-null
+ * value will do.
+ *
+ * Must be called with the dynamic rule lock held.
+ */
+static void
+remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me)
+{
+	static u_int32_t last_remove = 0;
+
+#define FORCE (keep_me == NULL)
+
+	ipfw_dyn_rule *prev, *q;
+	int i, pass = 0, max_pass = 0;
+
+	IPFW_DYN_LOCK_ASSERT();
+
+	if (V_ipfw_dyn_v == NULL || V_dyn_count == 0)
+		return;
+	/* do not expire more than once per second, it is useless */
+	if (!FORCE && last_remove == time_uptime)
+		return;
+	last_remove = time_uptime;
+
+	/*
+	 * because O_LIMIT refer to parent rules, during the first pass only
+	 * remove child and mark any pending LIMIT_PARENT, and remove
+	 * them in a second pass.
+	 */
+next_pass:
+	for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+		for (prev=NULL, q = V_ipfw_dyn_v[i] ; q ; ) {
+			/*
+			 * Logic can become complex here, so we split tests.
+			 */
+			if (q == keep_me)
+				goto next;
+			if (rule != NULL && rule != q->rule)
+				goto next; /* not the one we are looking for */
+			if (q->dyn_type == O_LIMIT_PARENT) {
+				/*
+				 * handle parent in the second pass,
+				 * record we need one.
+				 */
+				max_pass = 1;
+				if (pass == 0)
+					goto next;
+				if (FORCE && q->count != 0 ) {
+					/* XXX should not happen! */
+					printf("ipfw: OUCH! cannot remove rule,"
+					     " count %d\n", q->count);
+				}
+			} else {
+				if (!FORCE &&
+				    !TIME_LEQ( q->expire, time_uptime ))
+					goto next;
+			}
+			/* a parent is only freed once no child refers to it */
+			if (q->dyn_type != O_LIMIT_PARENT || !q->count) {
+				/* the macro already advanced q to the next entry */
+				UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
+				continue;
+			}
+next:
+			prev=q;
+			q=q->next;
+		}
+	}
+	if (pass++ < max_pass)
+		goto next_pass;
+}
+
+/*
+ * Locked wrapper: unconditionally drop every dynamic rule that was
+ * created by 'rule' (all dynamic rules when rule == NULL).
+ */
+void
+ipfw_remove_dyn_children(struct ip_fw *rule)
+{
+	IPFW_DYN_LOCK();
+	remove_dyn_rule(rule, NULL /* force removal */);
+	IPFW_DYN_UNLOCK();
+}
+
+/**
+ * lookup a dynamic rule, locked version
+ *
+ * pkt             flow to search for.
+ * match_direction if not NULL, receives MATCH_FORWARD, MATCH_REVERSE or
+ *                 MATCH_NONE.
+ * tcp             optional TCP header; when given, its ack number is used
+ *                 to ignore out-of-sequence segments on established flows.
+ *
+ * Expired entries met while walking the bucket are freed.  A match is
+ * moved to the front of its bucket and gets its expire time refreshed
+ * according to protocol and TCP state.  Returns the entry, or NULL.
+ */
+static ipfw_dyn_rule *
+lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction,
+    struct tcphdr *tcp)
+{
+	/*
+	 * stateful ipfw extensions.
+	 * Lookup into dynamic session queue
+	 */
+#define MATCH_REVERSE	0
+#define MATCH_FORWARD	1
+#define MATCH_NONE	2
+#define MATCH_UNKNOWN	3
+	int i, dir = MATCH_NONE;
+	ipfw_dyn_rule *prev, *q=NULL;
+
+	IPFW_DYN_LOCK_ASSERT();
+
+	if (V_ipfw_dyn_v == NULL)
+		goto done;	/* not found */
+	i = hash_packet( pkt );
+	for (prev=NULL, q = V_ipfw_dyn_v[i] ; q != NULL ; ) {
+		/* parents that still have children are never expired here */
+		if (q->dyn_type == O_LIMIT_PARENT && q->count)
+			goto next;
+		if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */
+			UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
+			continue;
+		}
+		if (pkt->proto == q->id.proto &&
+		    q->dyn_type != O_LIMIT_PARENT) {
+			if (IS_IP6_FLOW_ID(pkt)) {
+			    if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+				&(q->id.src_ip6)) &&
+			    IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+				&(q->id.dst_ip6)) &&
+			    pkt->src_port == q->id.src_port &&
+			    pkt->dst_port == q->id.dst_port ) {
+				dir = MATCH_FORWARD;
+				break;
+			    }
+			    if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+				    &(q->id.dst_ip6)) &&
+				IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+				    &(q->id.src_ip6)) &&
+				pkt->src_port == q->id.dst_port &&
+				pkt->dst_port == q->id.src_port ) {
+				    dir = MATCH_REVERSE;
+				    break;
+			    }
+			} else {
+			    if (pkt->src_ip == q->id.src_ip &&
+				pkt->dst_ip == q->id.dst_ip &&
+				pkt->src_port == q->id.src_port &&
+				pkt->dst_port == q->id.dst_port ) {
+				    dir = MATCH_FORWARD;
+				    break;
+			    }
+			    if (pkt->src_ip == q->id.dst_ip &&
+				pkt->dst_ip == q->id.src_ip &&
+				pkt->src_port == q->id.dst_port &&
+				pkt->dst_port == q->id.src_port ) {
+				    dir = MATCH_REVERSE;
+				    break;
+			    }
+			}
+		}
+next:
+		prev = q;
+		q = q->next;
+	}
+	if (q == NULL)
+		goto done; /* q = NULL, not found */
+
+	if ( prev != NULL) {	/* found and not in front */
+		prev->next = q->next;
+		q->next = V_ipfw_dyn_v[i];
+		V_ipfw_dyn_v[i] = q;
+	}
+	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
+		u_char flags = pkt->_flags & (TH_FIN|TH_SYN|TH_RST);
+
+/* q->state keeps forward-direction flags in the low byte, reverse in the high byte */
+#define BOTH_SYN	(TH_SYN | (TH_SYN << 8))
+#define BOTH_FIN	(TH_FIN | (TH_FIN << 8))
+		q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8);
+		switch (q->state) {
+		case TH_SYN:				/* opening */
+			q->expire = time_uptime + V_dyn_syn_lifetime;
+			break;
+
+		case BOTH_SYN:			/* move to established */
+		case BOTH_SYN | TH_FIN :	/* one side tries to close */
+		case BOTH_SYN | (TH_FIN << 8) :
+ 			if (tcp) {
+#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0)
+			    u_int32_t ack = ntohl(tcp->th_ack);
+			    if (dir == MATCH_FORWARD) {
+				if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd))
+				    q->ack_fwd = ack;
+				else { /* ignore out-of-sequence */
+				    break;
+				}
+			    } else {
+				if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev))
+				    q->ack_rev = ack;
+				else { /* ignore out-of-sequence */
+				    break;
+				}
+			    }
+			}
+			q->expire = time_uptime + V_dyn_ack_lifetime;
+			break;
+
+		case BOTH_SYN | BOTH_FIN:	/* both sides closed */
+			/* NB: clamps the sysctl itself, not just this entry */
+			if (V_dyn_fin_lifetime >= V_dyn_keepalive_period)
+				V_dyn_fin_lifetime = V_dyn_keepalive_period - 1;
+			q->expire = time_uptime + V_dyn_fin_lifetime;
+			break;
+
+		default:
+#if 0
+			/*
+			 * reset or some invalid combination, but can also
+			 * occur if we use keep-state the wrong way.
+			 */
+			if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
+				printf("invalid state: 0x%x\n", q->state);
+#endif
+			if (V_dyn_rst_lifetime >= V_dyn_keepalive_period)
+				V_dyn_rst_lifetime = V_dyn_keepalive_period - 1;
+			q->expire = time_uptime + V_dyn_rst_lifetime;
+			break;
+		}
+	} else if (pkt->proto == IPPROTO_UDP) {
+		q->expire = time_uptime + V_dyn_udp_lifetime;
+	} else {
+		/* other protocols */
+		q->expire = time_uptime + V_dyn_short_lifetime;
+	}
+done:
+	if (match_direction)
+		*match_direction = dir;
+	return q;
+}
+
+/*
+ * Public lookup.  On a match the entry is returned WITH THE DYNAMIC RULE
+ * LOCK STILL HELD; the caller must release it via ipfw_dyn_unlock().
+ * When nothing matches the lock is dropped here and NULL is returned.
+ */
+ipfw_dyn_rule *
+ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction,
+    struct tcphdr *tcp)
+{
+	ipfw_dyn_rule *q;
+
+	IPFW_DYN_LOCK();
+	q = lookup_dyn_rule_locked(pkt, match_direction, tcp);
+	if (q == NULL)
+		IPFW_DYN_UNLOCK();
+	/* NB: return table locked when q is not NULL */
+	return q;
+}
+
+/*
+ * (Re)allocate the bucket array using the size requested in
+ * V_dyn_buckets.  The old array is freed without looking at its
+ * contents, so this must only run while the table holds no rules
+ * (add_dyn_rule() calls it when the table is missing or V_dyn_count == 0).
+ *
+ * An unusable request (zero, or not a power of 2) is discarded:
+ * V_dyn_buckets is reset to the size in use and the current table, if
+ * any, is kept.  On return V_ipfw_dyn_v may be NULL if no memory was
+ * available even for the smallest table.
+ */
+static void
+realloc_dynamic_table(void)
+{
+	IPFW_DYN_LOCK_ASSERT();
+
+	/*
+	 * Try reallocation, make sure we have a power of 2 and do
+	 * not allow more than 64k entries. In case of overflow,
+	 * default to 1024.
+	 */
+
+	if (V_dyn_buckets > 65536)
+		V_dyn_buckets = 1024;
+	/*
+	 * Zero passes the (x & (x-1)) test but is not a power of 2: it
+	 * would turn the mask in hash_packet() into 0xffffffff and index
+	 * far outside the table, so reject it explicitly.
+	 */
+	if (V_dyn_buckets == 0 ||
+	    (V_dyn_buckets & (V_dyn_buckets-1)) != 0) { /* not a power of 2 */
+		V_dyn_buckets = V_curr_dyn_buckets; /* reset */
+		return;
+	}
+	V_curr_dyn_buckets = V_dyn_buckets;
+	if (V_ipfw_dyn_v != NULL)
+		free(V_ipfw_dyn_v, M_IPFW);
+	/* on allocation failure keep halving the size until it fits */
+	for (;;) {
+		V_ipfw_dyn_v = malloc(V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *),
+		       M_IPFW, M_NOWAIT | M_ZERO);
+		if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2)
+			break;
+		V_curr_dyn_buckets /= 2;
+	}
+}
+
+/**
+ * Install state of type 'type' for a dynamic session.
+ * The hash table contains two type of rules:
+ * - regular rules (O_KEEP_STATE)
+ * - rules for sessions with limited number of sess per user
+ *   (O_LIMIT). When they are created, the parent is
+ *   increased by 1, and decreased on delete. In this case,
+ *   the third parameter is the parent rule and not the chain.
+ * - "parent" rules for the above (O_LIMIT_PARENT).
+ *
+ * Must be called with the dynamic rule lock held.  Allocates the
+ * bucket array on first use (or resizes it while the table is empty and
+ * a different size was requested).  Returns the new entry, linked at
+ * the head of its bucket, or NULL if memory could not be obtained.
+ * The initial lifetime is V_dyn_syn_lifetime.
+ */
+static ipfw_dyn_rule *
+add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule)
+{
+	ipfw_dyn_rule *r;
+	int i;
+
+	IPFW_DYN_LOCK_ASSERT();
+
+	if (V_ipfw_dyn_v == NULL ||
+	    (V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) {
+		realloc_dynamic_table();
+		if (V_ipfw_dyn_v == NULL)
+			return NULL; /* failed ! */
+	}
+	/* hash only after a possible resize, the mask may have changed */
+	i = hash_packet(id);
+
+	r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO);
+	if (r == NULL) {
+		printf ("ipfw: sorry cannot allocate state\n");
+		return NULL;
+	}
+
+	/* increase refcount on parent, and set pointer */
+	if (dyn_type == O_LIMIT) {
+		/* for O_LIMIT the 'rule' argument really is the parent dyn rule */
+		ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule;
+		if ( parent->dyn_type != O_LIMIT_PARENT)
+			panic("invalid parent");
+		parent->count++;
+		r->parent = parent;
+		rule = parent->rule;
+	}
+
+	r->id = *id;
+	r->expire = time_uptime + V_dyn_syn_lifetime;
+	r->rule = rule;
+	r->dyn_type = dyn_type;
+	r->pcnt = r->bcnt = 0;
+	r->count = 0;
+
+	r->bucket = i;
+	r->next = V_ipfw_dyn_v[i];
+	V_ipfw_dyn_v[i] = r;
+	V_dyn_count++;
+	DEB({
+		struct in_addr da;
+#ifdef INET6
+		char src[INET6_ADDRSTRLEN];
+		char dst[INET6_ADDRSTRLEN];
+#else
+		char src[INET_ADDRSTRLEN];
+		char dst[INET_ADDRSTRLEN];
+#endif
+
+#ifdef INET6
+		if (IS_IP6_FLOW_ID(&(r->id))) {
+			ip6_sprintf(src, &r->id.src_ip6);
+			ip6_sprintf(dst, &r->id.dst_ip6);
+		} else
+#endif
+		{
+			da.s_addr = htonl(r->id.src_ip);
+			inet_ntoa_r(da, src);
+			da.s_addr = htonl(r->id.dst_ip);
+			inet_ntoa_r(da, dst);
+		}
+		printf("ipfw: add dyn entry ty %d %s %d -> %s %d, total %d\n",
+		    dyn_type, src, r->id.src_port, dst, r->id.dst_port,
+		    V_dyn_count);
+	})
+	return r;
+}
+
+/**
+ * lookup dynamic parent rule using pkt and rule as search keys.
+ * If the lookup fails, then install one.
+ *
+ * The dynamic rule lock must be held.  An existing parent gets its
+ * lifetime renewed (V_dyn_short_lifetime) before being returned.
+ * Returns NULL only when a new parent had to be created and
+ * add_dyn_rule() failed.
+ */
+static ipfw_dyn_rule *
+lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule)
+{
+	ipfw_dyn_rule *cur;
+	int is_v6, same_addr;
+
+	IPFW_DYN_LOCK_ASSERT();
+
+	/* no table yet: nothing to search, go straight to creation */
+	if (V_ipfw_dyn_v == NULL)
+		return add_dyn_rule(pkt, O_LIMIT_PARENT, rule);
+
+	is_v6 = IS_IP6_FLOW_ID(pkt);
+	for (cur = V_ipfw_dyn_v[hash_packet(pkt)]; cur != NULL;
+	    cur = cur->next) {
+		/* only parents created by this very rule qualify */
+		if (cur->dyn_type != O_LIMIT_PARENT || cur->rule != rule)
+			continue;
+		if (cur->id.proto != pkt->proto ||
+		    cur->id.src_port != pkt->src_port ||
+		    cur->id.dst_port != pkt->dst_port)
+			continue;
+		/* exact, direction-sensitive address comparison */
+		if (is_v6)
+			same_addr =
+			    IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+				&(cur->id.src_ip6)) &&
+			    IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+				&(cur->id.dst_ip6));
+		else
+			same_addr =
+			    pkt->src_ip == cur->id.src_ip &&
+			    pkt->dst_ip == cur->id.dst_ip;
+		if (!same_addr)
+			continue;
+
+		cur->expire = time_uptime + V_dyn_short_lifetime;
+		DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",cur);)
+		return cur;
+	}
+	return add_dyn_rule(pkt, O_LIMIT_PARENT, rule);
+}
+
+/**
+ * Install dynamic state for rule type cmd->o.opcode
+ *
+ * Returns 1 (failure) if state is not installed because of errors or because
+ * session limitations are enforced.
+ * Returns 0 when the state was installed, or when an entry for the
+ * flow was already present.
+ *
+ * rule      static rule that triggered the request.
+ * cmd       the O_KEEP_STATE / O_LIMIT instruction; for O_LIMIT it
+ *           carries limit_mask and conn_limit.
+ * args      packet description; args->f_id is the flow to record.
+ * tablearg  replaces conn_limit when that is IP_FW_TABLEARG.
+ *
+ * Takes and releases the dynamic rule lock internally.
+ */
+int
+ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
+    struct ip_fw_args *args, uint32_t tablearg)
+{
+	static int last_log;	/* rate-limits messages to one per second */
+	ipfw_dyn_rule *q;
+	struct in_addr da;
+#ifdef INET6
+	char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2];
+#else
+	char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+
+	src[0] = '\0';
+	dst[0] = '\0';
+
+	IPFW_DYN_LOCK();
+
+	DEB(
+#ifdef INET6
+	if (IS_IP6_FLOW_ID(&(args->f_id))) {
+		ip6_sprintf(src, &args->f_id.src_ip6);
+		ip6_sprintf(dst, &args->f_id.dst_ip6);
+	} else
+#endif
+	{
+		da.s_addr = htonl(args->f_id.src_ip);
+		inet_ntoa_r(da, src);
+		da.s_addr = htonl(args->f_id.dst_ip);
+		inet_ntoa_r(da, dst);
+	}
+	printf("ipfw: %s: type %d %s %u -> %s %u\n",
+	    __func__, cmd->o.opcode, src, args->f_id.src_port,
+	    dst, args->f_id.dst_port);
+	src[0] = '\0';
+	dst[0] = '\0';
+	)
+
+	q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
+
+	if (q != NULL) {	/* should never occur */
+		if (last_log != time_uptime) {
+			last_log = time_uptime;
+			printf("ipfw: %s: entry already present, done\n",
+			    __func__);
+		}
+		IPFW_DYN_UNLOCK();
+		return (0);
+	}
+
+	if (V_dyn_count >= V_dyn_max)
+		/* Run out of slots, try to remove any expired rule. */
+		remove_dyn_rule(NULL, (ipfw_dyn_rule *)1);
+
+	if (V_dyn_count >= V_dyn_max) {
+		if (last_log != time_uptime) {
+			last_log = time_uptime;
+			printf("ipfw: %s: Too many dynamic rules\n", __func__);
+		}
+		IPFW_DYN_UNLOCK();
+		return (1);	/* cannot install, notify caller */
+	}
+
+	switch (cmd->o.opcode) {
+	case O_KEEP_STATE:	/* bidir rule */
+		if (add_dyn_rule(&args->f_id, O_KEEP_STATE, rule) == NULL) {
+			/* out of memory: state was not installed */
+			IPFW_DYN_UNLOCK();
+			return (1);
+		}
+		break;
+
+	case O_LIMIT: {		/* limit number of sessions */
+		struct ipfw_flow_id id;
+		ipfw_dyn_rule *parent;
+		uint32_t conn_limit;
+		uint16_t limit_mask = cmd->limit_mask;
+
+		conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ?
+		    tablearg : cmd->conn_limit;
+		  
+		DEB(
+		if (cmd->conn_limit == IP_FW_TABLEARG)
+			printf("ipfw: %s: O_LIMIT rule, conn_limit: %u "
+			    "(tablearg)\n", __func__, conn_limit);
+		else
+			printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n",
+			    __func__, conn_limit);
+		)
+
+		/*
+		 * Build the key of the parent entry.  Clear the WHOLE key
+		 * first: fields left out by limit_mask (in particular the
+		 * IPv6 addresses) must be zero, because they are hashed by
+		 * hash_packet() and compared by lookup_dyn_parent().
+		 * Leaving them uninitialized makes each lookup miss and
+		 * the limit ineffective.
+		 */
+		bzero(&id, sizeof(id));
+		id.proto = args->f_id.proto;
+		id.addr_type = args->f_id.addr_type;
+		id.fib = M_GETFIB(args->m);
+
+		if (IS_IP6_FLOW_ID (&(args->f_id))) {
+			if (limit_mask & DYN_SRC_ADDR)
+				id.src_ip6 = args->f_id.src_ip6;
+			if (limit_mask & DYN_DST_ADDR)
+				id.dst_ip6 = args->f_id.dst_ip6;
+		} else {
+			if (limit_mask & DYN_SRC_ADDR)
+				id.src_ip = args->f_id.src_ip;
+			if (limit_mask & DYN_DST_ADDR)
+				id.dst_ip = args->f_id.dst_ip;
+		}
+		if (limit_mask & DYN_SRC_PORT)
+			id.src_port = args->f_id.src_port;
+		if (limit_mask & DYN_DST_PORT)
+			id.dst_port = args->f_id.dst_port;
+		if ((parent = lookup_dyn_parent(&id, rule)) == NULL) {
+			printf("ipfw: %s: add parent failed\n", __func__);
+			IPFW_DYN_UNLOCK();
+			return (1);
+		}
+
+		if (parent->count >= conn_limit) {
+			/* See if we can remove some expired rule. */
+			remove_dyn_rule(rule, parent);
+			if (parent->count >= conn_limit) {
+				if (V_fw_verbose && last_log != time_uptime) {
+					last_log = time_uptime;
+#ifdef INET6
+					/*
+					 * XXX IPv6 flows are not
+					 * supported yet.
+					 */
+					if (IS_IP6_FLOW_ID(&(args->f_id))) {
+						char ip6buf[INET6_ADDRSTRLEN];
+						snprintf(src, sizeof(src),
+						    "[%s]", ip6_sprintf(ip6buf,
+							&args->f_id.src_ip6));
+						snprintf(dst, sizeof(dst),
+						    "[%s]", ip6_sprintf(ip6buf,
+							&args->f_id.dst_ip6));
+					} else
+#endif
+					{
+						da.s_addr =
+						    htonl(args->f_id.src_ip);
+						inet_ntoa_r(da, src);
+						da.s_addr =
+						    htonl(args->f_id.dst_ip);
+						inet_ntoa_r(da, dst);
+					}
+					log(LOG_SECURITY | LOG_DEBUG,
+					    "ipfw: %d %s %s:%u -> %s:%u, %s\n",
+					    parent->rule->rulenum,
+					    "drop session",
+					    src, (args->f_id.src_port),
+					    dst, (args->f_id.dst_port),
+					    "too many entries");
+				}
+				IPFW_DYN_UNLOCK();
+				return (1);
+			}
+		}
+		/* for O_LIMIT the third argument carries the parent entry */
+		if (add_dyn_rule(&args->f_id, O_LIMIT,
+		    (struct ip_fw *)parent) == NULL) {
+			/* out of memory: state was not installed */
+			IPFW_DYN_UNLOCK();
+			return (1);
+		}
+		break;
+	}
+	default:
+		printf("ipfw: %s: unknown dynamic rule type %u\n",
+		    __func__, cmd->o.opcode);
+		IPFW_DYN_UNLOCK();
+		return (1);
+	}
+
+	/* XXX just set lifetime */
+	lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
+
+	IPFW_DYN_UNLOCK();
+	return (0);
+}
+
+/*
+ * Generate a TCP packet, containing either a RST or a keepalive.
+ * When flags & TH_RST, we are sending a RST packet, because of a
+ * "reset" action matched the packet.
+ * Otherwise we are sending a keepalive, and flags & TH_
+ * The 'replyto' mbuf is the mbuf being replied to, if any, and is required
+ * so that MAC can label the reply appropriately.
+ *
+ * Direction: when TH_SYN is set and TH_RST is not, the packet goes from
+ * id->src to id->dst; in every other case source and destination are
+ * swapped (see 'dir' below).
+ *
+ * Returns the new mbuf, or NULL if no mbuf could be obtained or
+ * id->addr_type is neither 4 nor (with INET6) 6.  The packet is only
+ * built here; sending it is left to the caller.
+ */
+struct mbuf *
+ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
+    u_int32_t ack, int flags)
+{
+	struct mbuf *m = NULL;		/* stupid compiler */
+	int len, dir;
+	struct ip *h = NULL;		/* stupid compiler */
+#ifdef INET6
+	struct ip6_hdr *h6 = NULL;
+#endif
+	struct tcphdr *th = NULL;
+
+	MGETHDR(m, M_DONTWAIT, MT_DATA);
+	if (m == NULL)
+		return (NULL);
+
+	M_SETFIB(m, id->fib);
+#ifdef MAC
+	if (replyto != NULL)
+		mac_netinet_firewall_reply(replyto, m);
+	else
+		mac_netinet_firewall_send(m);
+#else
+	(void)replyto;		/* don't warn about unused arg */
+#endif
+
+	switch (id->addr_type) {
+	case 4:
+		len = sizeof(struct ip) + sizeof(struct tcphdr);
+		break;
+#ifdef INET6
+	case 6:
+		len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+		break;
+#endif
+	default:
+		/* XXX: log me?!? */
+		FREE_PKT(m);
+		return (NULL);
+	}
+	/* 1: same direction as the flow id; 0: reversed */
+	dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN);
+
+	m->m_data += max_linkhdr;
+	m->m_flags |= M_SKIP_FIREWALL;
+	m->m_pkthdr.len = m->m_len = len;
+	m->m_pkthdr.rcvif = NULL;
+	bzero(m->m_data, len);
+
+	switch (id->addr_type) {
+	case 4:
+		h = mtod(m, struct ip *);
+
+		/* prepare for checksum */
+		h->ip_p = IPPROTO_TCP;
+		h->ip_len = htons(sizeof(struct tcphdr));
+		if (dir) {
+			h->ip_src.s_addr = htonl(id->src_ip);
+			h->ip_dst.s_addr = htonl(id->dst_ip);
+		} else {
+			h->ip_src.s_addr = htonl(id->dst_ip);
+			h->ip_dst.s_addr = htonl(id->src_ip);
+		}
+
+		th = (struct tcphdr *)(h + 1);
+		break;
+#ifdef INET6
+	case 6:
+		h6 = mtod(m, struct ip6_hdr *);
+
+		/* prepare for checksum */
+		h6->ip6_nxt = IPPROTO_TCP;
+		h6->ip6_plen = htons(sizeof(struct tcphdr));
+		if (dir) {
+			h6->ip6_src = id->src_ip6;
+			h6->ip6_dst = id->dst_ip6;
+		} else {
+			h6->ip6_src = id->dst_ip6;
+			h6->ip6_dst = id->src_ip6;
+		}
+
+		th = (struct tcphdr *)(h6 + 1);
+		break;
+#endif
+	}
+
+	if (dir) {
+		th->th_sport = htons(id->src_port);
+		th->th_dport = htons(id->dst_port);
+	} else {
+		th->th_sport = htons(id->dst_port);
+		th->th_dport = htons(id->src_port);
+	}
+	th->th_off = sizeof(struct tcphdr) >> 2;
+
+	if (flags & TH_RST) {
+		if (flags & TH_ACK) {
+			th->th_seq = htonl(ack);
+			th->th_flags = TH_RST;
+		} else {
+			if (flags & TH_SYN)
+				seq++;
+			th->th_ack = htonl(seq);
+			th->th_flags = TH_RST | TH_ACK;
+		}
+	} else {
+		/*
+		 * Keepalive - use caller provided sequence numbers
+		 */
+		th->th_seq = htonl(seq);
+		th->th_ack = htonl(ack);
+		th->th_flags = TH_ACK;
+	}
+
+	switch (id->addr_type) {
+	case 4:
+		/* the still mostly-zero IP header doubles as pseudo header */
+		th->th_sum = in_cksum(m, len);
+
+		/* finish the ip header */
+		h->ip_v = 4;
+		h->ip_hl = sizeof(*h) >> 2;
+		h->ip_tos = IPTOS_LOWDELAY;
+		h->ip_off = 0;
+		/* ip_len must be in host format for ip_output */
+		h->ip_len = len;
+		h->ip_ttl = V_ip_defttl;
+		h->ip_sum = 0;
+		break;
+#ifdef INET6
+	case 6:
+		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6),
+		    sizeof(struct tcphdr));
+
+		/* finish the ip6 header */
+		h6->ip6_vfc |= IPV6_VERSION;
+		h6->ip6_hlim = IPV6_DEFHLIM;
+		break;
+#endif
+	}
+
+	return (m);
+}
+
+/*
+ * This procedure is only used to handle keepalives. It is invoked
+ * every dyn_keepalive_period
+ *
+ * For each established TCP entry that will expire within
+ * dyn_keepalive_interval seconds, one keepalive is generated per
+ * direction.  The function always re-arms its own callout before
+ * returning.
+ */
+ /* dummynet() and ipfw_tick() can't be static in windows */
+void
+ipfw_tick(void * vnetx) 
+{
+	struct mbuf *m0, *m, *mnext, **mtailp;
+#ifdef INET6
+	struct mbuf *m6, **m6_tailp;
+#endif
+	int i;
+	ipfw_dyn_rule *q;
+#ifdef VIMAGE
+	struct vnet *vp = vnetx;
+#endif
+
+	CURVNET_SET(vp);
+	if (V_dyn_keepalive == 0 || V_ipfw_dyn_v == NULL || V_dyn_count == 0)
+		goto done;
+
+	/*
+	 * We make a chain of packets to go out here -- not deferring
+	 * until after we drop the IPFW dynamic rule lock would result
+	 * in a lock order reversal with the normal packet input -> ipfw
+	 * call stack.
+	 */
+	m0 = NULL;
+	mtailp = &m0;
+#ifdef INET6
+	m6 = NULL;
+	m6_tailp = &m6;
+#endif
+	IPFW_DYN_LOCK();
+	for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+		for (q = V_ipfw_dyn_v[i] ; q ; q = q->next ) {
+			if (q->dyn_type == O_LIMIT_PARENT)
+				continue;
+			if (q->id.proto != IPPROTO_TCP)
+				continue;
+			/* only connections that saw a SYN in both directions */
+			if ( (q->state & BOTH_SYN) != BOTH_SYN)
+				continue;
+			if (TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
+			    q->expire))
+				continue;	/* too early */
+			if (TIME_LEQ(q->expire, time_uptime))
+				continue;	/* too late, rule expired */
+
+			/* TH_SYN here only selects the forward direction */
+			m = ipfw_send_pkt(NULL, &(q->id), q->ack_rev - 1,
+				q->ack_fwd, TH_SYN);
+			mnext = ipfw_send_pkt(NULL, &(q->id), q->ack_fwd - 1,
+				q->ack_rev, 0);
+
+			switch (q->id.addr_type) {
+			case 4:
+				if (m != NULL) {
+					*mtailp = m;
+					mtailp = &(*mtailp)->m_nextpkt;
+				}
+				if (mnext != NULL) {
+					*mtailp = mnext;
+					mtailp = &(*mtailp)->m_nextpkt;
+				}
+				break;
+#ifdef INET6
+			case 6:
+				if (m != NULL) {
+					*m6_tailp = m;
+					m6_tailp = &(*m6_tailp)->m_nextpkt;
+				}
+				if (mnext != NULL) {
+					*m6_tailp = mnext;
+					m6_tailp = &(*m6_tailp)->m_nextpkt;
+				}
+				break;
+#endif
+			}
+
+			m = mnext = NULL;
+		}
+	}
+	IPFW_DYN_UNLOCK();
+	/* lock dropped: now it is safe to hand the queued packets out */
+	for (m = mnext = m0; m != NULL; m = mnext) {
+		mnext = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+		ip_output(m, NULL, NULL, 0, NULL, NULL);
+	}
+#ifdef INET6
+	for (m = mnext = m6; m != NULL; m = mnext) {
+		mnext = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+		ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
+	}
+#endif
+done:
+	callout_reset_on(&V_ipfw_timeout, V_dyn_keepalive_period * hz,
+		      ipfw_tick, vnetx, 0);
+	CURVNET_RESTORE();
+}
+
+/* Create the allocation zone for dynamic rules and the lock guarding them. */
+void
+ipfw_dyn_attach(void)
+{
+        ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule",
+            sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL,
+            UMA_ALIGN_PTR, 0);
+
+        IPFW_DYN_LOCK_INIT();
+}
+
+/* Undo ipfw_dyn_attach(). */
+void
+ipfw_dyn_detach(void)
+{
+        uma_zdestroy(ipfw_dyn_rule_zone);
+        IPFW_DYN_LOCK_DESTROY();
+}
+
+/*
+ * Set the defaults of all tunables and start the keepalive callout.
+ * The bucket array itself is allocated lazily by add_dyn_rule().
+ */
+void
+ipfw_dyn_init(void)
+{
+        V_ipfw_dyn_v = NULL;
+        V_dyn_buckets = 256;    /* must be power of 2 */
+        V_curr_dyn_buckets = 256; /* must be power of 2 */
+ 
+        V_dyn_ack_lifetime = 300;
+        V_dyn_syn_lifetime = 20;
+        V_dyn_fin_lifetime = 1;
+        V_dyn_rst_lifetime = 1;
+        V_dyn_udp_lifetime = 10;
+        V_dyn_short_lifetime = 5;
+
+        V_dyn_keepalive_interval = 20;
+        V_dyn_keepalive_period = 5;
+        V_dyn_keepalive = 1;    /* do send keepalives */
+        
+        V_dyn_max = 4096;       /* max # of dynamic rules */
+        callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE);
+        callout_reset_on(&V_ipfw_timeout, hz, ipfw_tick, curvnet, 0);
+}
+
+/*
+ * Two-step teardown: pass 0 stops the keepalive callout, any other
+ * pass frees the bucket array.  Only the array is freed here, not the
+ * entries chained from it.
+ */
+void
+ipfw_dyn_uninit(int pass)
+{
+	if (pass == 0)
+		callout_drain(&V_ipfw_timeout);
+	else {
+		if (V_ipfw_dyn_v != NULL)
+			free(V_ipfw_dyn_v, M_IPFW);
+	}
+}
+
+/*
+ * Bytes needed to export all dynamic rules with ipfw_get_dynamic().
+ * NOTE(review): reads the counters without taking the lock, so the
+ * value is only a snapshot.
+ */
+int
+ipfw_dyn_len(void)
+{
+	return (V_ipfw_dyn_v == NULL) ? 0 :
+		(V_dyn_count * sizeof(ipfw_dyn_rule));
+}
+
+/*
+ * Copy the dynamic rules into the buffer [*pbp, ep) and advance *pbp
+ * past the copied data.  Rules that do not fit are silently skipped.
+ * In each copy:
+ *  - the 'rule' pointer field is overwritten with the parent's rule
+ *    number followed by its set number;
+ *  - 'next' is non-NULL except in the last copied entry;
+ *  - 'expire' is converted to seconds remaining (0 if already expired).
+ */
+void
+ipfw_get_dynamic(char **pbp, const char *ep)
+{
+	ipfw_dyn_rule *p, *last = NULL;
+	char *bp;
+	int i;
+
+	if (V_ipfw_dyn_v == NULL)
+		return;
+	bp = *pbp;
+
+	IPFW_DYN_LOCK();
+	for (i = 0 ; i < V_curr_dyn_buckets; i++)
+		for (p = V_ipfw_dyn_v[i] ; p != NULL; p = p->next) {
+			if (bp + sizeof *p <= ep) {
+				ipfw_dyn_rule *dst =
+					(ipfw_dyn_rule *)bp;
+				bcopy(p, dst, sizeof *p);
+				bcopy(&(p->rule->rulenum), &(dst->rule),
+				    sizeof(p->rule->rulenum));
+				/*
+				 * store set number into high word of
+				 * dst->rule pointer.
+				 */
+				bcopy(&(p->rule->set),
+				    (char *)&dst->rule +
+				    sizeof(p->rule->rulenum),
+				    sizeof(p->rule->set));
+				/*
+				 * store a non-null value in "next".
+				 * The userland code will interpret a
+				 * NULL here as a marker
+				 * for the last dynamic rule.
+				 */
+				bcopy(&dst, &dst->next, sizeof(dst));
+				last = dst;
+				dst->expire =
+				    TIME_LEQ(dst->expire, time_uptime) ?
+					0 : dst->expire - time_uptime ;
+				bp += sizeof(ipfw_dyn_rule);
+			}
+		}
+	IPFW_DYN_UNLOCK();
+	if (last != NULL) /* mark last dynamic rule */
+		bzero(&last->next, sizeof(last));
+	*pbp = bp;
+}
+/* end of file */
diff --git a/dummynet2/ip_fw_log.c b/dummynet2/ip_fw_log.c
new file mode 100644
index 0000000..55b5c26
--- /dev/null
+++ b/dummynet2/ip_fw_log.c
@@ -0,0 +1,449 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_log.c 209845 2010-07-09 11:27:33Z glebius $");
+
+/*
+ * Logging support for ipfw
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include /* for ETHERTYPE_IP */
+#include
+#include
+#include /* for IFT_ETHER */
+#include /* for BPF */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#ifdef INET6
+#include /* ip6_sprintf() */
+#endif
+
+#ifdef MAC
+#include
+#endif
+
+/*
+ * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T
+ * Other macros just cast void * into the appropriate type
+ */
+#define	L3HDR(T, ip)	((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
+#define	TCP(p)		((struct tcphdr *)(p))
+#define	SCTP(p)		((struct sctphdr *)(p))
+#define	UDP(p)		((struct udphdr *)(p))
+#define	ICMP(p)		((struct icmphdr *)(p))
+#define	ICMP6(p)	((struct icmp6_hdr *)(p))
+
+/*
+ * Argument helpers for snprintf().  SNPARGS(buf, len) expands to TWO
+ * arguments: the write position buf + len and the space left, clamped
+ * to 0 once len has reached sizeof(buf).  'buf' must be a real array,
+ * not a pointer, for sizeof to give the capacity.
+ */
+#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
+#define SNP(buf) buf, sizeof(buf)
+
+#ifdef WITHOUT_BPF
+/* BPF support compiled out: enabling or disabling the tap is a no-op. */
+void
+ipfw_log_bpf(int onoff)
+{
+}
+#else /* !WITHOUT_BPF */
+static struct ifnet *log_if;	/* hook to attach to bpf */
+
+/* we use this dummy function for all ifnet callbacks */
+static int
+log_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr)
+{
+	return EINVAL;
+}
+
+/* if_output hook: the pseudo interface cannot transmit; drop and fail. */
+static int
+ipfw_log_output(struct ifnet *ifp, struct mbuf *m,
+	struct sockaddr *dst, struct route *ro)
+{
+	if (m != NULL)
+		m_freem(m);
+	return EINVAL;
+}
+
+/* if_start hook: panics if it is ever invoked. */
+static void
+ipfw_log_start(struct ifnet* ifp)
+{
+	panic("ipfw_log_start() must not be called");
+}
+
+static const u_char ipfwbroadcastaddr[6] =
+	{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+
+/*
+ * Create (onoff != 0) or destroy (onoff == 0) the "ipfw0" pseudo
+ * interface on which logged packets are exposed to BPF listeners.
+ * Creating it while it already exists is a no-op, and so is destroying
+ * it when it does not exist.
+ * NOTE(review): creation uses if_attach() + bpfattach() while teardown
+ * uses ether_ifdetach(); confirm that this pairing is intended.
+ */
+void
+ipfw_log_bpf(int onoff)
+{
+	struct ifnet *ifp;
+
+	if (onoff) {
+		if (log_if)
+			return;
+		ifp = if_alloc(IFT_ETHER);
+		if (ifp == NULL)
+			return;
+		if_initname(ifp, "ipfw", 0);
+		ifp->if_mtu = 65536;
+		ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
+		ifp->if_init = (void *)log_dummy;
+		ifp->if_ioctl = log_dummy;
+		ifp->if_start = ipfw_log_start;
+		ifp->if_output = ipfw_log_output;
+		ifp->if_addrlen = 6;
+		ifp->if_hdrlen = 14;
+		if_attach(ifp);
+		ifp->if_broadcastaddr = ipfwbroadcastaddr;
+		ifp->if_baudrate = IF_Mbps(10);
+		bpfattach(ifp, DLT_EN10MB, 14);
+		log_if = ifp;
+	} else {
+		if (log_if) {
+			ether_ifdetach(log_if);
+			if_free(log_if);
+		}
+		log_if = NULL;
+	}
+}
+#endif /* !WITHOUT_BPF */
+
+/*
+ * We enter here when we have a rule with O_LOG.
+ * XXX this function alone takes about 2Kbytes of code!
+ *
+ * With V_fw_verbose == 0 the packet is only handed to the BPF tap (when
+ * that is compiled in and has listeners) and nothing reaches syslog.
+ * Otherwise one text line "ipfw: <rule> <action> <proto/addresses> ..."
+ * is logged, subject to the per-rule (or, for f == NULL, the global)
+ * log limit.
+ *
+ * f        matching rule, or NULL for a packet refused without a rule.
+ * hlen     IP header length; 0 means a non-IP (layer 2) packet.
+ * args     packet description (flow id, optional ethernet header).
+ * m        the packet.
+ * oif      outgoing interface, NULL for inbound packets.
+ * offset   fragment offset information; port/type fields are only
+ *          printed when it is 0.
+ * tablearg substitutes the address of a "forward" action whose address
+ *          is INADDR_ANY.
+ * ip       pointer to the IP (or IPv6) header inside the packet.
+ */
+void
+ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
+    struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
+    struct ip *ip)
+{
+	char *action;
+	int limit_reached = 0;
+	char action2[40], proto[128], fragment[32];
+
+	if (V_fw_verbose == 0) {
+#ifndef WITHOUT_BPF
+
+		if (log_if == NULL || log_if->if_bpf == NULL)
+			return;
+
+		if (args->eh) /* layer2, use orig hdr */
+			BPF_MTAP2(log_if, args->eh, ETHER_HDR_LEN, m);
+		else
+			/* Add fake header. Later we will store
+			 * more info in the header.
+			 */
+			BPF_MTAP2(log_if, "DDDDDDSSSSSS\x08\x00", ETHER_HDR_LEN, m);
+#endif /* !WITHOUT_BPF */
+		return;
+	}
+	/* the old 'log' function */
+	fragment[0] = '\0';
+	proto[0] = '\0';
+
+	if (f == NULL) {	/* bogus pkt */
+		if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit)
+			return;
+		V_norule_counter++;
+		if (V_norule_counter == V_verbose_limit)
+			limit_reached = V_verbose_limit;
+		action = "Refuse";
+	} else {	/* O_LOG is the first action, find the real one */
+		ipfw_insn *cmd = ACTION_PTR(f);
+		ipfw_insn_log *l = (ipfw_insn_log *)cmd;
+
+		/* max_log == 0 means "no limit" for this rule */
+		if (l->max_log != 0 && l->log_left == 0)
+			return;
+		l->log_left--;
+		if (l->log_left == 0)
+			limit_reached = l->max_log;
+		cmd += F_LEN(cmd);	/* point to first action */
+		if (cmd->opcode == O_ALTQ) {
+			ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
+
+			snprintf(SNPARGS(action2, 0), "Altq %d",
+				altq->qid);
+			cmd += F_LEN(cmd);
+		}
+		if (cmd->opcode == O_PROB)
+			cmd += F_LEN(cmd);
+
+		if (cmd->opcode == O_TAG)
+			cmd += F_LEN(cmd);
+
+		/* default: text built in action2; fixed names override it */
+		action = action2;
+		switch (cmd->opcode) {
+		case O_DENY:
+			action = "Deny";
+			break;
+
+		case O_REJECT:
+			if (cmd->arg1==ICMP_REJECT_RST)
+				action = "Reset";
+			else if (cmd->arg1==ICMP_UNREACH_HOST)
+				action = "Reject";
+			else
+				snprintf(SNPARGS(action2, 0), "Unreach %d",
+					cmd->arg1);
+			break;
+
+		case O_UNREACH6:
+			if (cmd->arg1==ICMP6_UNREACH_RST)
+				action = "Reset";
+			else
+				snprintf(SNPARGS(action2, 0), "Unreach %d",
+					cmd->arg1);
+			break;
+
+		case O_ACCEPT:
+			action = "Accept";
+			break;
+		case O_COUNT:
+			action = "Count";
+			break;
+		case O_DIVERT:
+			snprintf(SNPARGS(action2, 0), "Divert %d",
+				cmd->arg1);
+			break;
+		case O_TEE:
+			snprintf(SNPARGS(action2, 0), "Tee %d",
+				cmd->arg1);
+			break;
+		case O_SETFIB:
+			snprintf(SNPARGS(action2, 0), "SetFib %d",
+				cmd->arg1);
+			break;
+		case O_SKIPTO:
+			snprintf(SNPARGS(action2, 0), "SkipTo %d",
+				cmd->arg1);
+			break;
+		case O_PIPE:
+			snprintf(SNPARGS(action2, 0), "Pipe %d",
+				cmd->arg1);
+			break;
+		case O_QUEUE:
+			snprintf(SNPARGS(action2, 0), "Queue %d",
+				cmd->arg1);
+			break;
+		case O_FORWARD_IP: {
+			ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
+			int len;
+			struct in_addr dummyaddr;
+			if (sa->sa.sin_addr.s_addr == INADDR_ANY)
+				dummyaddr.s_addr = htonl(tablearg);
+			else
+				dummyaddr.s_addr = sa->sa.sin_addr.s_addr;
+
+			len = snprintf(SNPARGS(action2, 0), "Forward to %s",
+				inet_ntoa(dummyaddr));
+
+			/* NOTE(review): sin_port is printed without ntohs() -- confirm its byte order */
+			if (sa->sa.sin_port)
+				snprintf(SNPARGS(action2, len), ":%d",
+				    sa->sa.sin_port);
+			}
+			break;
+		case O_NETGRAPH:
+			snprintf(SNPARGS(action2, 0), "Netgraph %d",
+				cmd->arg1);
+			break;
+		case O_NGTEE:
+			snprintf(SNPARGS(action2, 0), "Ngtee %d",
+				cmd->arg1);
+			break;
+		case O_NAT:
+			action = "Nat";
+ 			break;
+		case O_REASS:
+			action = "Reass";
+			break;
+		default:
+			action = "UNKNOWN";
+			break;
+		}
+	}
+
+	if (hlen == 0) {	/* non-ip */
+		snprintf(SNPARGS(proto, 0), "MAC");
+
+	} else {
+		int len;
+#ifdef INET6
+		char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2];
+#else
+		char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+		struct icmphdr *icmp;
+		struct tcphdr *tcp;
+		struct udphdr *udp;
+#ifdef INET6
+		struct ip6_hdr *ip6 = NULL;
+		struct icmp6_hdr *icmp6;
+#endif
+		src[0] = '\0';
+		dst[0] = '\0';
+#ifdef INET6
+		if (IS_IP6_FLOW_ID(&(args->f_id))) {
+			char ip6buf[INET6_ADDRSTRLEN];
+			snprintf(src, sizeof(src), "[%s]",
+			    ip6_sprintf(ip6buf, &args->f_id.src_ip6));
+			snprintf(dst, sizeof(dst), "[%s]",
+			    ip6_sprintf(ip6buf, &args->f_id.dst_ip6));
+
+			ip6 = (struct ip6_hdr *)ip;
+			/* for IPv6 the upper-layer header is found via hlen */
+			tcp = (struct tcphdr *)(((char *)ip) + hlen);
+			udp = (struct udphdr *)(((char *)ip) + hlen);
+		} else
+#endif
+		{
+			tcp = L3HDR(struct tcphdr, ip);
+			udp = L3HDR(struct udphdr, ip);
+
+			inet_ntoa_r(ip->ip_src, src);
+			inet_ntoa_r(ip->ip_dst, dst);
+		}
+
+		switch (args->f_id.proto) {
+		case IPPROTO_TCP:
+			len = snprintf(SNPARGS(proto, 0), "TCP %s", src);
+			if (offset == 0)
+				snprintf(SNPARGS(proto, len), ":%d %s:%d",
+				    ntohs(tcp->th_sport),
+				    dst,
+				    ntohs(tcp->th_dport));
+			else
+				snprintf(SNPARGS(proto, len), " %s", dst);
+			break;
+
+		case IPPROTO_UDP:
+			len = snprintf(SNPARGS(proto, 0), "UDP %s", src);
+			if (offset == 0)
+				snprintf(SNPARGS(proto, len), ":%d %s:%d",
+				    ntohs(udp->uh_sport),
+				    dst,
+				    ntohs(udp->uh_dport));
+			else
+				snprintf(SNPARGS(proto, len), " %s", dst);
+			break;
+
+		case IPPROTO_ICMP:
+			icmp = L3HDR(struct icmphdr, ip);
+			if (offset == 0)
+				len = snprintf(SNPARGS(proto, 0),
+				    "ICMP:%u.%u ",
+				    icmp->icmp_type, icmp->icmp_code);
+			else
+				len = snprintf(SNPARGS(proto, 0), "ICMP ");
+			len += snprintf(SNPARGS(proto, len), "%s", src);
+			snprintf(SNPARGS(proto, len), " %s", dst);
+			break;
+#ifdef INET6
+		case IPPROTO_ICMPV6:
+			icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen);
+			if (offset == 0)
+				len = snprintf(SNPARGS(proto, 0),
+				    "ICMPv6:%u.%u ",
+				    icmp6->icmp6_type, icmp6->icmp6_code);
+			else
+				len = snprintf(SNPARGS(proto, 0), "ICMPv6 ");
+			len += snprintf(SNPARGS(proto, len), "%s", src);
+			snprintf(SNPARGS(proto, len), " %s", dst);
+			break;
+#endif
+		default:
+			len = snprintf(SNPARGS(proto, 0), "P:%d %s",
+			    args->f_id.proto, src);
+			snprintf(SNPARGS(proto, len), " %s", dst);
+			break;
+		}
+
+#ifdef INET6
+		if (IS_IP6_FLOW_ID(&(args->f_id))) {
+			if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG))
+				snprintf(SNPARGS(fragment, 0),
+				    " (frag %08x:%d@%d%s)",
+				    args->f_id.extra,
+				    ntohs(ip6->ip6_plen) - hlen,
+				    ntohs(offset & IP6F_OFF_MASK) << 3,
+				    (offset & IP6F_MORE_FRAG) ? "+" : "");
+		} else
+#endif
+		{
+			int ipoff, iplen;
+			ipoff = ntohs(ip->ip_off);
+			iplen = ntohs(ip->ip_len);
+			if (ipoff & (IP_MF | IP_OFFMASK))
+				snprintf(SNPARGS(fragment, 0),
+				    " (frag %d:%d@%d%s)",
+				    ntohs(ip->ip_id), iplen - (ip->ip_hl << 2),
+				    offset << 3,
+				    (ipoff & IP_MF) ? "+" : "");
+		}
+	}
+#ifdef __FreeBSD__
+	/* the condition guarantees rcvif is non-NULL whenever oif is NULL */
+	if (oif || m->m_pkthdr.rcvif)
+		log(LOG_SECURITY | LOG_INFO,
+		    "ipfw: %d %s %s %s via %s%s\n",
+		    f ? f->rulenum : -1,
+		    action, proto, oif ? "out" : "in",
+		    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
+		    fragment);
+	else
+#endif
+		log(LOG_SECURITY | LOG_INFO,
+		    "ipfw: %d %s %s [no if info]%s\n",
+		    f ? f->rulenum : -1,
+		    action, proto, fragment);
+	if (limit_reached)
+		log(LOG_SECURITY | LOG_NOTICE,
+		    "ipfw: limit %d reached on entry %d\n",
+		    limit_reached, f ? f->rulenum : -1);
+}
+/* end of file */
diff --git a/dummynet2/ip_fw_lookup.c b/dummynet2/ip_fw_lookup.c
new file mode 100644
index 0000000..bf04cb6
--- /dev/null
+++ b/dummynet2/ip_fw_lookup.c
@@ -0,0 +1,304 @@
+/*-
+ * Copyright (c) 2009 Luigi Rizzo Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_fw_table.c 200601 2009-12-16 10:48:40Z luigi $"); + +/* + * Rule and pipe lookup support for ipfw. + * + +ipfw and dummynet need to quickly find objects (rules, pipes) +that may be dynamically created or destroyed. +To address the problem, we label each new object with a unique +32-bit identifier whose low K bits are the index in a lookup +table. All existing objects are referred by the lookup table, +and identifiers are chosen so that for each slot there is +at most one active object (whose identifier points to the slot). +This is almost a hash table, except that we can pick the +identifiers after looking at the table's occupation so +we have a trivial hash function and are collision free. + +With this structure, operations are very fast and simple: +- the table has N entries s[i] with two fields, 'id' and 'ptr', + with N <= M = 2^k (M is an upper bound to the size of the table); +- initially, all slots have s[i].id = i, and the pointers + are used to build a freelist (tailq). +- a slot is considered empty if ptr == NULL or s[0] <= ptr < s[N]. + This is easy to detect and we can use ptr to build the freelist. +- when a new object is created, we put it in the empty slot i at the + head of the freelist, and set the id to s[i].id; +- when an object is destroyed, we append its slot i to the end + of the freelist, and set s[i].id += M (note M, not N). 
+- on a lookup for id = X, we look at slot i = X & (M-1), + and consider the lookup successful only if the slot is not + empty and s[i].id == X; +- wraps occur at most every F * 2^32/M operations, where F is + the number of free slots. Because F is usually a reasonable + fraction of M, we should not worry too much. +- if the table fills up, we can extend it by increasing N +- shrinking the table is more difficult as we might create + collisions during the rehashing. + * + */ + +#include +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +MALLOC_DEFINE(M_IPFW_LUT, "ipfw_lookup", "IpFw lookup"); +#define Malloc(n) malloc(n, M_IPFW_LUT, M_WAITOK) +#define Calloc(n) calloc(n, M_IPFW_LUT, M_WAITOK | M_ZERO) +#define Free(p) free(p, M_IPFW_LUT) + +#define log(x, arg...) + +#else /* !_KERNEL */ +#include +#include +#include +#include +#define Malloc(n) malloc(n) +#define Calloc(n) calloc(1, n) +#define Free(p) free(p) +#define log(x, arg...) fprintf(stderr, "%s: " x "\n", __FUNCTION__, ##arg) +#endif /* !_KERNEL */ + +struct entry { + uint32_t id; + struct entry *ptr; +}; + +struct lookup_table { + int _size; + int used; + int mask; /* 2^k -1, used for hashing */ + struct entry *f_head, *f_tail; /* freelist */ + struct entry * s; /* slots, array of N entries */ +}; + +static __inline int empty(struct lookup_table *head, const void *p) +{ + const struct entry *ep = p; + return (ep == NULL || + (ep >= head->s && ep < &head->s[head->_size])); +} + +/* + * init or reinit a table + */ +struct lookup_table * +ipfw_lut_init(struct lookup_table *head, int new_size, int mask) +{ + int i; + struct entry *s; /* the new slots */ + struct entry *fh, *ft; /* the freelist */ + + if (head != NULL) { + mask = head->mask; + if (new_size <= head->_size) + return head; + if (new_size >= mask+1) { + log("size larger than mask"); + return NULL; + } + } else { + log("old is null, initialize"); + head = Calloc(sizeof(*head)); + if (head == NULL) + return NULL; + if 
(new_size >= mask) + mask = new_size; + if (mask & (mask -1)) { + for (i = 1; i < mask; i += i) + ; + log("mask %d not 2^k, round up to %d", mask, i); + mask = i; + } + mask = head->mask = mask - 1; + } + + s = Calloc(new_size * sizeof(*s)); + if (s == NULL) + return NULL; + if (!head->s) { + head->s = s; + head->_size = 1; + } + fh = ft = NULL; + /* remap the entries, adjust the freelist */ + for (i = 0; i < new_size; i++) { + s[i].id = (i >= head->_size) ? i : head->s[i].id; + if (i < head->_size && !empty(head, head->s[i].ptr)) { + s[i].ptr = head->s[i].ptr; + continue; + } + if (fh == NULL) + fh = &s[i]; + else + ft->ptr = &s[i]; + ft = &s[i]; + } + head->f_head = fh; + head->f_tail = ft; + + /* write lock on the structure, to protect the readers */ + fh = head->s; + head->s = s; + head->_size = new_size; + /* release write lock */ + if (fh != s) + Free(fh); + log("done"); + return head; +} + +/* insert returns the id */ +int +ipfw_lut_insert(struct lookup_table *head, void *d) +{ + struct entry *e; + + e = head->f_head; + if (e == NULL) + return -1; + head->f_head = e->ptr; + e->ptr = d; + head->used++; + return e->id; +} + +/* delete, returns the original entry */ +void * +ipfw_lut_delete(struct lookup_table *head, int id) +{ + int i = id & head->mask; + void *result; + struct entry *e; + + if (i >= head->_size) + return NULL; + e = &head->s[i]; + if (e->id != id) + return NULL; + result = e->ptr; + /* write lock to invalidate the entry to readers */ + e->id += head->mask + 1; /* prepare for next insert */ + e->ptr = NULL; + /* release write lock */ + if (head->f_head == NULL) + head->f_head = e; + else + head->f_tail->ptr = e; + head->f_tail = e; + head->used--; + return result; +} + +void * +ipfw_lut_lookup(struct lookup_table *head, int id) +{ + int i = id & head->mask; + struct entry *e; + + if (i >= head->_size) + return NULL; + e = &head->s[i]; + return (e->id == id) ? 
e->ptr : NULL; +} + +void +ipfw_lut_dump(struct lookup_table *head) +{ + int i; + + log("head %p size %d used %d freelist %d", + head, head->_size, head->used, head->f_head ? + head->f_head - head->s : -1); + for (i = 0; i < head->_size; i++) { + struct entry *e = &head->s[i]; + char ee = empty(head, e->ptr) ? 'E' : ' '; + log("%5d %5d %c %p", i, e->id, ee, + ee == 'E' && e->ptr != NULL ? + (void *)((struct entry *)e->ptr - head->s) : e->ptr); + } +} + +#ifndef _KERNEL +void dump_p(struct lookup_table *p, int *map) +{ + int i; + for (i = 0; i < p->_size; i++) { + int id = (int)ipfw_lut_lookup(p, map[i]); + log("%3d: %3d: %c", map[i] % 64, i, id); + } +} +int main(int argc, char *argv[]) +{ + int i, j, l; +#define S 1000 + int map[S]; + struct lookup_table *p; + struct lookup_table *p1; + const char *m = "nel mezzo del cammin di nostra vita mi ritrovai" + " in una selva oscura e la diritta via era smarrita!"; + + fprintf(stderr, "testing lookup\n"); + + l = strlen(m); + + p = ipfw_lut_init(NULL, 120, 33); + + ipfw_lut_dump(p); + for (i = 0; i < l; i++) { + int x = m[i]; + int id = ipfw_lut_insert(p, (void *)x); + //ipfw_lut_dump(p); + map[i] = id; + for (j=0; j < 10; j++) { + id = ipfw_lut_insert(p, (void *)'a'); + // ipfw_lut_dump(p); + ipfw_lut_delete(p, id); + // ipfw_lut_dump(p); + } + // ipfw_lut_dump(p); + } + dump_p(p, map); + p1 = ipfw_lut_init(p, 23, 0); + if (!p1) + return 1; + dump_p(p1, map); + p1 = ipfw_lut_init(p1, 120, 0); + if (!p1) + return 1; + dump_p(p1, map); + return 0; +} +#endif +/* end of file */ diff --git a/dummynet2/ip_fw_nat.c b/dummynet2/ip_fw_nat.c new file mode 100644 index 0000000..41fe919 --- /dev/null +++ b/dummynet2/ip_fw_nat.c @@ -0,0 +1,604 @@ +/*- + * Copyright (c) 2008 Paolo Pisati + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_fw_nat.c 200975 2009-12-25 01:15:39Z luigi $"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* XXX for in_cksum */ + +static VNET_DEFINE(eventhandler_tag, ifaddr_event_tag); +#define V_ifaddr_event_tag VNET(ifaddr_event_tag) + +static void +ifaddr_change(void *arg __unused, struct ifnet *ifp) +{ + struct cfg_nat *ptr; + struct ifaddr *ifa; + struct ip_fw_chain *chain; + + chain = &V_layer3_chain; + IPFW_WLOCK(chain); + /* Check every nat entry... 
*/ + LIST_FOREACH(ptr, &chain->nat, _next) { + /* ...using nic 'ifp->if_xname' as dynamic alias address. */ + if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) != 0) + continue; + if_addr_rlock(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr == NULL) + continue; + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + ptr->ip = ((struct sockaddr_in *) + (ifa->ifa_addr))->sin_addr; + LibAliasSetAddress(ptr->lib, ptr->ip); + } + if_addr_runlock(ifp); + } + IPFW_WUNLOCK(chain); +} + +/* + * delete the pointers for nat entry ix, or all of them if ix < 0 + */ +static void +flush_nat_ptrs(struct ip_fw_chain *chain, const int ix) +{ + int i; + ipfw_insn_nat *cmd; + + IPFW_WLOCK_ASSERT(chain); + for (i = 0; i < chain->n_rules; i++) { + cmd = (ipfw_insn_nat *)ACTION_PTR(chain->map[i]); + /* XXX skip log and the like ? */ + if (cmd->o.opcode == O_NAT && cmd->nat != NULL && + (ix < 0 || cmd->nat->id == ix)) + cmd->nat = NULL; + } +} + +static void +del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head) +{ + struct cfg_redir *r, *tmp_r; + struct cfg_spool *s, *tmp_s; + int i, num; + + LIST_FOREACH_SAFE(r, head, _next, tmp_r) { + num = 1; /* Number of alias_link to delete. */ + switch (r->mode) { + case REDIR_PORT: + num = r->pport_cnt; + /* FALLTHROUGH */ + case REDIR_ADDR: + case REDIR_PROTO: + /* Delete all libalias redirect entry. */ + for (i = 0; i < num; i++) + LibAliasRedirectDelete(n->lib, r->alink[i]); + /* Del spool cfg if any. */ + LIST_FOREACH_SAFE(s, &r->spool_chain, _next, tmp_s) { + LIST_REMOVE(s, _next); + free(s, M_IPFW); + } + free(r->alink, M_IPFW); + LIST_REMOVE(r, _next); + free(r, M_IPFW); + break; + default: + printf("unknown redirect mode: %u\n", r->mode); + /* XXX - panic?!?!? 
*/ + break; + } + } +} + +static int +add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) +{ + struct cfg_redir *r, *ser_r; + struct cfg_spool *s, *ser_s; + int cnt, off, i; + + for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) { + ser_r = (struct cfg_redir *)&buf[off]; + r = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO); + memcpy(r, ser_r, SOF_REDIR); + LIST_INIT(&r->spool_chain); + off += SOF_REDIR; + r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt, + M_IPFW, M_WAITOK | M_ZERO); + switch (r->mode) { + case REDIR_ADDR: + r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr, + r->paddr); + break; + case REDIR_PORT: + for (i = 0 ; i < r->pport_cnt; i++) { + /* If remotePort is all ports, set it to 0. */ + u_short remotePortCopy = r->rport + i; + if (r->rport_cnt == 1 && r->rport == 0) + remotePortCopy = 0; + r->alink[i] = LibAliasRedirectPort(ptr->lib, + r->laddr, htons(r->lport + i), r->raddr, + htons(remotePortCopy), r->paddr, + htons(r->pport + i), r->proto); + if (r->alink[i] == NULL) { + r->alink[0] = NULL; + break; + } + } + break; + case REDIR_PROTO: + r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr, + r->raddr, r->paddr, r->proto); + break; + default: + printf("unknown redirect mode: %u\n", r->mode); + break; + } + /* XXX perhaps return an error instead of panic ? */ + if (r->alink[0] == NULL) + panic("LibAliasRedirect* returned NULL"); + /* LSNAT handling. */ + for (i = 0; i < r->spool_cnt; i++) { + ser_s = (struct cfg_spool *)&buf[off]; + s = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO); + memcpy(s, ser_s, SOF_SPOOL); + LibAliasAddServer(ptr->lib, r->alink[0], + s->addr, htons(s->port)); + off += SOF_SPOOL; + /* Hook spool entry. */ + LIST_INSERT_HEAD(&r->spool_chain, s, _next); + } + /* And finally hook this redir entry. 
*/ + LIST_INSERT_HEAD(&ptr->redir_chain, r, _next); + } + return (1); +} + +static int +ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) +{ + struct mbuf *mcl; + struct ip *ip; + /* XXX - libalias duct tape */ + int ldt, retval; + char *c; + + ldt = 0; + retval = 0; + mcl = m_megapullup(m, m->m_pkthdr.len); + if (mcl == NULL) { + args->m = NULL; + return (IP_FW_DENY); + } + ip = mtod(mcl, struct ip *); + + /* + * XXX - Libalias checksum offload 'duct tape': + * + * locally generated packets have only pseudo-header checksum + * calculated and libalias will break it[1], so mark them for + * later fix. Moreover there are cases when libalias modifies + * tcp packet data[2], mark them for later fix too. + * + * [1] libalias was never meant to run in kernel, so it does + * not have any knowledge about checksum offloading, and + * expects a packet with a full internet checksum. + * Unfortunately, packets generated locally will have just the + * pseudo header calculated, and when libalias tries to adjust + * the checksum it will actually compute a wrong value. + * + * [2] when libalias modifies tcp's data content, full TCP + * checksum has to be recomputed: the problem is that + * libalias does not have any idea about checksum offloading. + * To work around this, we do not do checksumming in LibAlias, + * but only mark the packets in th_x2 field. If we receive a + * marked packet, we calculate correct checksum for it + * aware of offloading. Why such a terrible hack instead of + * recalculating checksum for each packet? + * Because the previous checksum was not checked! + * Recalculating checksums for EVERY packet will hide ALL + * transmission errors. Yes, marked packets still suffer from + * this problem. But, sigh, natd(8) has this problem, too. 
+ * + * TODO: -make libalias mbuf aware (so + * it can handle delayed checksum and tso) + */ + + if (mcl->m_pkthdr.rcvif == NULL && + mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) + ldt = 1; + + c = mtod(mcl, char *); + if (args->oif == NULL) + retval = LibAliasIn(t->lib, c, + mcl->m_len + M_TRAILINGSPACE(mcl)); + else + retval = LibAliasOut(t->lib, c, + mcl->m_len + M_TRAILINGSPACE(mcl)); + if (retval == PKT_ALIAS_RESPOND) { + m->m_flags |= M_SKIP_FIREWALL; + retval = PKT_ALIAS_OK; + } + if (retval != PKT_ALIAS_OK && + retval != PKT_ALIAS_FOUND_HEADER_FRAGMENT) { + /* XXX - should i add some logging? */ + m_free(mcl); + args->m = NULL; + return (IP_FW_DENY); + } + mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len); + + /* + * XXX - libalias checksum offload + * 'duct tape' (see above) + */ + + if ((ip->ip_off & htons(IP_OFFMASK)) == 0 && + ip->ip_p == IPPROTO_TCP) { + struct tcphdr *th; + + th = (struct tcphdr *)(ip + 1); + if (th->th_x2) + ldt = 1; + } + + if (ldt) { + struct tcphdr *th; + struct udphdr *uh; + u_short cksum; + + ip->ip_len = ntohs(ip->ip_len); + cksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(ip->ip_p + ip->ip_len - (ip->ip_hl << 2))); + + switch (ip->ip_p) { + case IPPROTO_TCP: + th = (struct tcphdr *)(ip + 1); + /* + * Maybe it was set in + * libalias... 
+ */ + th->th_x2 = 0; + th->th_sum = cksum; + mcl->m_pkthdr.csum_data = + offsetof(struct tcphdr, th_sum); + break; + case IPPROTO_UDP: + uh = (struct udphdr *)(ip + 1); + uh->uh_sum = cksum; + mcl->m_pkthdr.csum_data = + offsetof(struct udphdr, uh_sum); + break; + } + /* No hw checksum offloading: do it ourselves */ + if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) { + in_delayed_cksum(mcl); + mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } + ip->ip_len = htons(ip->ip_len); + } + args->m = mcl; + return (IP_FW_NAT); +} + +static struct cfg_nat * +lookup_nat(struct nat_list *l, int nat_id) +{ + struct cfg_nat *res; + + LIST_FOREACH(res, l, _next) { + if (res->id == nat_id) + break; + } + return res; +} + +static int +ipfw_nat_cfg(struct sockopt *sopt) +{ + struct cfg_nat *ptr, *ser_n; + char *buf; + struct ip_fw_chain *chain = &V_layer3_chain; + + buf = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO); + sooptcopyin(sopt, buf, NAT_BUF_LEN, sizeof(struct cfg_nat)); + ser_n = (struct cfg_nat *)buf; + + /* check valid parameter ser_n->id > 0 ? */ + /* + * Find/create nat rule. + */ + IPFW_WLOCK(chain); + ptr = lookup_nat(&chain->nat, ser_n->id); + if (ptr == NULL) { + /* New rule: allocate and init new instance. */ + ptr = malloc(sizeof(struct cfg_nat), + M_IPFW, M_NOWAIT | M_ZERO); + if (ptr == NULL) { + IPFW_WUNLOCK(chain); + free(buf, M_IPFW); + return (ENOSPC); + } + ptr->lib = LibAliasInit(NULL); + if (ptr->lib == NULL) { + IPFW_WUNLOCK(chain); + free(ptr, M_IPFW); + free(buf, M_IPFW); + return (EINVAL); + } + LIST_INIT(&ptr->redir_chain); + } else { + /* Entry already present: temporarly unhook it. */ + LIST_REMOVE(ptr, _next); + flush_nat_ptrs(chain, ser_n->id); + } + IPFW_WUNLOCK(chain); + + /* + * Basic nat configuration. + */ + ptr->id = ser_n->id; + /* + * XXX - what if this rule doesn't nat any ip and just + * redirect? + * do we set aliasaddress to 0.0.0.0? 
+ */ + ptr->ip = ser_n->ip; + ptr->redir_cnt = ser_n->redir_cnt; + ptr->mode = ser_n->mode; + LibAliasSetMode(ptr->lib, ser_n->mode, ser_n->mode); + LibAliasSetAddress(ptr->lib, ptr->ip); + memcpy(ptr->if_name, ser_n->if_name, IF_NAMESIZE); + + /* + * Redir and LSNAT configuration. + */ + /* Delete old cfgs. */ + del_redir_spool_cfg(ptr, &ptr->redir_chain); + /* Add new entries. */ + add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr); + free(buf, M_IPFW); + IPFW_WLOCK(chain); + LIST_INSERT_HEAD(&chain->nat, ptr, _next); + IPFW_WUNLOCK(chain); + return (0); +} + +static int +ipfw_nat_del(struct sockopt *sopt) +{ + struct cfg_nat *ptr; + struct ip_fw_chain *chain = &V_layer3_chain; + int i; + + sooptcopyin(sopt, &i, sizeof i, sizeof i); + /* XXX validate i */ + IPFW_WLOCK(chain); + ptr = lookup_nat(&chain->nat, i); + if (ptr == NULL) { + IPFW_WUNLOCK(chain); + return (EINVAL); + } + LIST_REMOVE(ptr, _next); + flush_nat_ptrs(chain, i); + IPFW_WUNLOCK(chain); + del_redir_spool_cfg(ptr, &ptr->redir_chain); + LibAliasUninit(ptr->lib); + free(ptr, M_IPFW); + return (0); +} + +static int +ipfw_nat_get_cfg(struct sockopt *sopt) +{ + uint8_t *data; + struct cfg_nat *n; + struct cfg_redir *r; + struct cfg_spool *s; + int nat_cnt, off; + struct ip_fw_chain *chain; + int err = ENOSPC; + + chain = &V_layer3_chain; + nat_cnt = 0; + off = sizeof(nat_cnt); + + data = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO); + IPFW_RLOCK(chain); + /* Serialize all the data. 
*/ + LIST_FOREACH(n, &chain->nat, _next) { + nat_cnt++; + if (off + SOF_NAT >= NAT_BUF_LEN) + goto nospace; + bcopy(n, &data[off], SOF_NAT); + off += SOF_NAT; + LIST_FOREACH(r, &n->redir_chain, _next) { + if (off + SOF_REDIR >= NAT_BUF_LEN) + goto nospace; + bcopy(r, &data[off], SOF_REDIR); + off += SOF_REDIR; + LIST_FOREACH(s, &r->spool_chain, _next) { + if (off + SOF_SPOOL >= NAT_BUF_LEN) + goto nospace; + bcopy(s, &data[off], SOF_SPOOL); + off += SOF_SPOOL; + } + } + } + err = 0; /* all good */ +nospace: + IPFW_RUNLOCK(chain); + if (err == 0) { + bcopy(&nat_cnt, data, sizeof(nat_cnt)); + sooptcopyout(sopt, data, NAT_BUF_LEN); + } else { + printf("serialized data buffer not big enough:" + "please increase NAT_BUF_LEN\n"); + } + free(data, M_IPFW); + return (err); +} + +static int +ipfw_nat_get_log(struct sockopt *sopt) +{ + uint8_t *data; + struct cfg_nat *ptr; + int i, size; + struct ip_fw_chain *chain; + + chain = &V_layer3_chain; + + IPFW_RLOCK(chain); + /* one pass to count, one to copy the data */ + i = 0; + LIST_FOREACH(ptr, &chain->nat, _next) { + if (ptr->lib->logDesc == NULL) + continue; + i++; + } + size = i * (LIBALIAS_BUF_SIZE + sizeof(int)); + data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO); + if (data == NULL) { + IPFW_RUNLOCK(chain); + return (ENOSPC); + } + i = 0; + LIST_FOREACH(ptr, &chain->nat, _next) { + if (ptr->lib->logDesc == NULL) + continue; + bcopy(&ptr->id, &data[i], sizeof(int)); + i += sizeof(int); + bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE); + i += LIBALIAS_BUF_SIZE; + } + IPFW_RUNLOCK(chain); + sooptcopyout(sopt, data, size); + free(data, M_IPFW); + return(0); +} + +static void +ipfw_nat_init(void) +{ + + IPFW_WLOCK(&V_layer3_chain); + /* init ipfw hooks */ + ipfw_nat_ptr = ipfw_nat; + lookup_nat_ptr = lookup_nat; + ipfw_nat_cfg_ptr = ipfw_nat_cfg; + ipfw_nat_del_ptr = ipfw_nat_del; + ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg; + ipfw_nat_get_log_ptr = ipfw_nat_get_log; + IPFW_WUNLOCK(&V_layer3_chain); + V_ifaddr_event_tag = 
EVENTHANDLER_REGISTER( + ifaddr_event, ifaddr_change, + NULL, EVENTHANDLER_PRI_ANY); +} + +static void +ipfw_nat_destroy(void) +{ + struct cfg_nat *ptr, *ptr_temp; + struct ip_fw_chain *chain; + + chain = &V_layer3_chain; + IPFW_WLOCK(chain); + LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) { + LIST_REMOVE(ptr, _next); + del_redir_spool_cfg(ptr, &ptr->redir_chain); + LibAliasUninit(ptr->lib); + free(ptr, M_IPFW); + } + EVENTHANDLER_DEREGISTER(ifaddr_event, V_ifaddr_event_tag); + flush_nat_ptrs(chain, -1 /* flush all */); + /* deregister ipfw_nat */ + ipfw_nat_ptr = NULL; + lookup_nat_ptr = NULL; + ipfw_nat_cfg_ptr = NULL; + ipfw_nat_del_ptr = NULL; + ipfw_nat_get_cfg_ptr = NULL; + ipfw_nat_get_log_ptr = NULL; + IPFW_WUNLOCK(chain); +} + +static int +ipfw_nat_modevent(module_t mod, int type, void *unused) +{ + int err = 0; + + switch (type) { + case MOD_LOAD: + ipfw_nat_init(); + break; + + case MOD_UNLOAD: + ipfw_nat_destroy(); + break; + + default: + return EOPNOTSUPP; + break; + } + return err; +} + +static moduledata_t ipfw_nat_mod = { + "ipfw_nat", + ipfw_nat_modevent, + 0 +}; + +DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); +MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1); +MODULE_DEPEND(ipfw_nat, ipfw, 2, 2, 2); +MODULE_VERSION(ipfw_nat, 1); +/* end of file */ diff --git a/dummynet2/ip_fw_pfil.c b/dummynet2/ip_fw_pfil.c new file mode 100644 index 0000000..a125ef2 --- /dev/null +++ b/dummynet2/ip_fw_pfil.c @@ -0,0 +1,415 @@ +/*- + * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_pfil.c 200601 2009-12-16 10:48:40Z luigi $"); + +#if !defined(KLD_MODULE) +#include "opt_ipfw.h" +#include "opt_ipdn.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif /* KLD_MODULE */ +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +static VNET_DEFINE(int, fw_enable) = 1; +#define V_fw_enable VNET(fw_enable) + +#ifdef INET6 +static VNET_DEFINE(int, fw6_enable) = 1; +#define V_fw6_enable VNET(fw6_enable) +#endif + +int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); + +/* Forward declarations. 
*/ +static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int); + +#ifdef SYSCTL_NODE + +SYSBEGIN(f1) + +SYSCTL_DECL(_net_inet_ip_fw); +SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0, + ipfw_chg_hook, "I", "Enable ipfw"); +#ifdef INET6 +SYSCTL_DECL(_net_inet6_ip6_fw); +SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0, + ipfw_chg_hook, "I", "Enable ipfw+6"); +#endif /* INET6 */ + +SYSEND + +#endif /* SYSCTL_NODE */ + +/* + * The pfilter hook to pass packets to ipfw_chk and then to + * dummynet, divert, netgraph or other modules. + * The packet may be consumed. + */ +int +ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, + struct inpcb *inp) +{ + struct ip_fw_args args; + struct m_tag *tag; + int ipfw; + int ret; + + /* all the processing now uses ip_len in net format */ + if (mtod(*m0, struct ip *)->ip_v == 4) + SET_NET_IPLEN(mtod(*m0, struct ip *)); + + /* convert dir to IPFW values */ + dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT; + bzero(&args, sizeof(args)); + +again: + /* + * extract and remove the tag if present. If we are left + * with onepass, optimize the outgoing path. + */ + tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); + if (tag != NULL) { + args.rule = *((struct ipfw_rule_ref *)(tag+1)); + m_tag_delete(*m0, tag); + if (args.rule.info & IPFW_ONEPASS) { + SET_HOST_IPLEN(mtod(*m0, struct ip *)); + return 0; + } + } + + args.m = *m0; + args.oif = dir == DIR_OUT ? 
ifp : NULL; + args.inp = inp; + + ipfw = ipfw_chk(&args); + *m0 = args.m; + + KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL", + __func__)); + + /* breaking out of the switch means drop */ + ret = 0; /* default return value for pass */ + switch (ipfw) { + case IP_FW_PASS: + /* next_hop may be set by ipfw_chk */ + if (args.next_hop == NULL) + break; /* pass */ +#ifndef IPFIREWALL_FORWARD + ret = EACCES; +#else + { + struct m_tag *fwd_tag; + + /* Incoming packets should not be tagged so we do not + * m_tag_find. Outgoing packets may be tagged, so we + * reuse the tag if present. + */ + fwd_tag = (dir == DIR_IN) ? NULL : + m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL); + if (fwd_tag != NULL) { + m_tag_unlink(*m0, fwd_tag); + } else { + fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD, + sizeof(struct sockaddr_in), M_NOWAIT); + if (fwd_tag == NULL) { + ret = EACCES; + break; /* i.e. drop */ + } + } + bcopy(args.next_hop, (fwd_tag+1), sizeof(struct sockaddr_in)); + m_tag_prepend(*m0, fwd_tag); + + if (in_localip(args.next_hop->sin_addr)) + (*m0)->m_flags |= M_FASTFWD_OURS; + } +#endif + break; + + case IP_FW_DENY: + ret = EACCES; + break; /* i.e. drop */ + + case IP_FW_DUMMYNET: + ret = EACCES; + if (ip_dn_io_ptr == NULL) + break; /* i.e. drop */ + if (mtod(*m0, struct ip *)->ip_v == 4) + ret = ip_dn_io_ptr(m0, dir, &args); + else if (mtod(*m0, struct ip *)->ip_v == 6) + ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args); + else + break; /* drop it */ + /* + * XXX should read the return value. + * dummynet normally eats the packet and sets *m0=NULL + * unless the packet can be sent immediately. In this + * case args is updated and we should re-run the + * check without clearing args. + */ + if (*m0 != NULL) + goto again; + break; + + case IP_FW_TEE: + case IP_FW_DIVERT: + if (ip_divert_ptr == NULL) { + ret = EACCES; + break; /* i.e. drop */ + } + ret = ipfw_divert(m0, dir, &args.rule, + (ipfw == IP_FW_TEE) ? 
1 : 0); + /* continue processing for the original packet (tee). */ + if (*m0) + goto again; + break; + + case IP_FW_NGTEE: + case IP_FW_NETGRAPH: + if (ng_ipfw_input_p == NULL) { + ret = EACCES; + break; /* i.e. drop */ + } + ret = ng_ipfw_input_p(m0, dir, &args, + (ipfw == IP_FW_NGTEE) ? 1 : 0); + if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */ + goto again; /* continue with packet */ + break; + + case IP_FW_NAT: + /* honor one-pass in case of successful nat */ + if (V_fw_one_pass) + break; /* ret is already 0 */ + goto again; + + case IP_FW_REASS: + goto again; /* continue with packet */ + + default: + KASSERT(0, ("%s: unknown retval", __func__)); + } + + if (ret != 0) { + if (*m0) + FREE_PKT(*m0); + *m0 = NULL; + } + if (*m0 && mtod(*m0, struct ip *)->ip_v == 4) + SET_HOST_IPLEN(mtod(*m0, struct ip *)); + return ret; +} + +/* do the divert, return 1 on error 0 on success */ +static int +ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule, + int tee) +{ + /* + * ipfw_chk() has already tagged the packet with the divert tag. + * If tee is set, copy packet and return original. + * If not tee, consume packet and send it to divert socket. + */ + struct mbuf *clone; + struct ip *ip; + struct m_tag *tag; + + /* Cloning needed for tee? */ + if (tee == 0) { + clone = *m0; /* use the original mbuf */ + *m0 = NULL; + } else { + clone = m_dup(*m0, M_DONTWAIT); + /* If we cannot duplicate the mbuf, we sacrifice the divert + * chain and continue with the tee-ed packet. + */ + if (clone == NULL) + return 1; + } + + /* + * Divert listeners can normally handle non-fragmented packets, + * but we can only reass in the non-tee case. + * This means that listeners on a tee rule may get fragments, + * and have to live with that. + * Note that we now have the 'reass' ipfw option so if we care + * we can do it before a 'tee'. 
+ */ + ip = mtod(clone, struct ip *); + if (!tee && ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) { + int hlen; + struct mbuf *reass; + + SET_HOST_IPLEN(ip); /* ip_reass wants host order */ + reass = ip_reass(clone); /* Reassemble packet. */ + if (reass == NULL) + return 0; /* not an error */ + /* if reass = NULL then it was consumed by ip_reass */ + /* + * IP header checksum fixup after reassembly and leave header + * in network byte order. + */ + ip = mtod(reass, struct ip *); + hlen = ip->ip_hl << 2; + SET_NET_IPLEN(ip); + ip->ip_sum = 0; + if (hlen == sizeof(struct ip)) + ip->ip_sum = in_cksum_hdr(ip); + else + ip->ip_sum = in_cksum(reass, hlen); + clone = reass; + } + /* attach a tag to the packet with the reinject info */ + tag = m_tag_alloc(MTAG_IPFW_RULE, 0, + sizeof(struct ipfw_rule_ref), M_NOWAIT); + if (tag == NULL) { + FREE_PKT(clone); + return 1; + } + *((struct ipfw_rule_ref *)(tag+1)) = *rule; + m_tag_prepend(clone, tag); + + /* Do the dirty job... */ + ip_divert_ptr(clone, incoming); + return 0; +} + +/* + * attach or detach hooks for a given protocol family + */ +static int +ipfw_hook(int onoff, int pf) +{ + struct pfil_head *pfh; + + pfh = pfil_head_get(PFIL_TYPE_AF, pf); + if (pfh == NULL) + return ENOENT; + + (void) (onoff ? 
pfil_add_hook : pfil_remove_hook) + (ipfw_check_hook, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh); + + return 0; +} + +int +ipfw_attach_hooks(int arg) +{ + int error = 0; + + if (arg == 0) /* detach */ + ipfw_hook(0, AF_INET); + else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) { + error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */ + printf("ipfw_hook() error\n"); + } +#ifdef INET6 + if (arg == 0) /* detach */ + ipfw_hook(0, AF_INET6); + else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) { + error = ENOENT; + printf("ipfw6_hook() error\n"); + } +#endif + return error; +} + +int +ipfw_chg_hook(SYSCTL_HANDLER_ARGS) +{ + int enable; + int oldenable; + int error; + int af; + + if (arg1 == &VNET_NAME(fw_enable)) { + enable = V_fw_enable; + af = AF_INET; + } +#ifdef INET6 + else if (arg1 == &VNET_NAME(fw6_enable)) { + enable = V_fw6_enable; + af = AF_INET6; + } +#endif + else + return (EINVAL); + + oldenable = enable; + + error = sysctl_handle_int(oidp, &enable, 0, req); + + if (error) + return (error); + + enable = (enable) ? 1 : 0; + + if (enable == oldenable) + return (0); + + error = ipfw_hook(enable, af); + if (error) + return (error); + if (af == AF_INET) + V_fw_enable = enable; +#ifdef INET6 + else if (af == AF_INET6) + V_fw6_enable = enable; +#endif + + return (0); +} +/* end of file */ diff --git a/dummynet2/ip_fw_sockopt.c b/dummynet2/ip_fw_sockopt.c new file mode 100644 index 0000000..6938aca --- /dev/null +++ b/dummynet2/ip_fw_sockopt.c @@ -0,0 +1,1343 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Supported by: Valeria Paoli + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_sockopt.c 206339 2010-04-07 08:23:58Z luigi $"); + +/* + * Sockopt support for ipfw. The routines here implement + * the upper half of the ipfw code. + */ + +#if !defined(KLD_MODULE) +#include "opt_ipfw.h" +#include "opt_ipdivert.h" +#include "opt_ipdn.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif +#include "opt_inet6.h" +#include "opt_ipsec.h" + +#include +#include +#include +#include /* struct m_tag used by nested headers */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include /* hooks */ +#include +#include + +#ifdef MAC +#include +#endif + +MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); + +/* + * static variables followed by global ones (none in this file) + */ + +/* + * Find the smallest rule >= key, id. 
+ * We could use bsearch but it is so simple that we code it directly + */ +int +ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id) +{ + int i, lo, hi; + struct ip_fw *r; + + for (lo = 0, hi = chain->n_rules - 1; lo < hi;) { + i = (lo + hi) / 2; + r = chain->map[i]; + if (r->rulenum < key) + lo = i + 1; /* continue from the next one */ + else if (r->rulenum > key) + hi = i; /* this might be good */ + else if (r->id < id) + lo = i + 1; /* continue from the next one */ + else /* r->id >= id */ + hi = i; /* this might be good */ + }; + return hi; +} + +/* + * allocate a new map, returns the chain locked. extra is the number + * of entries to add or delete. + */ +static struct ip_fw ** +get_map(struct ip_fw_chain *chain, int extra, int locked) +{ + + for (;;) { + struct ip_fw **map; + int i; + + i = chain->n_rules + extra; + map = malloc(i * sizeof(struct ip_fw *), M_IPFW, + locked ? M_NOWAIT : M_WAITOK); + if (map == NULL) { + printf("%s: cannot allocate map\n", __FUNCTION__); + return NULL; + } + if (!locked) + IPFW_UH_WLOCK(chain); + if (i >= chain->n_rules + extra) /* good */ + return map; + /* otherwise we lost the race, free and retry */ + if (!locked) + IPFW_UH_WUNLOCK(chain); + free(map, M_IPFW); + } +} + +/* + * swap the maps. It is supposed to be called with IPFW_UH_WLOCK + */ +static struct ip_fw ** +swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len) +{ + struct ip_fw **old_map; + + IPFW_WLOCK(chain); + chain->id++; + chain->n_rules = new_len; + old_map = chain->map; + chain->map = new_map; + IPFW_WUNLOCK(chain); + return old_map; +} + +/* + * Add a new rule to the list. Copy the rule into a malloc'ed area, then + * possibly create a rule number and add the rule to the list. + * Update the rule_number in the input struct so the caller knows it as well. + * XXX DO NOT USE FOR THE DEFAULT RULE. 
+ * Must be called without IPFW_UH held + */ +int +ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule) +{ + struct ip_fw *rule; + int i, l, insert_before; + struct ip_fw **map; /* the new array of pointers */ + + if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1) + return (EINVAL); + + l = RULESIZE(input_rule); + rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO); + if (rule == NULL) + return (ENOSPC); + /* get_map returns with IPFW_UH_WLOCK if successful */ + map = get_map(chain, 1, 0 /* not locked */); + if (map == NULL) { + free(rule, M_IPFW); + return ENOSPC; + } + + bcopy(input_rule, rule, l); + /* clear fields not settable from userland */ + rule->x_next = NULL; + rule->next_rule = NULL; + rule->pcnt = 0; + rule->bcnt = 0; + rule->timestamp = 0; + + if (V_autoinc_step < 1) + V_autoinc_step = 1; + else if (V_autoinc_step > 1000) + V_autoinc_step = 1000; + /* find the insertion point, we will insert before */ + insert_before = rule->rulenum ? rule->rulenum + 1 : IPFW_DEFAULT_RULE; + i = ipfw_find_rule(chain, insert_before, 0); + /* duplicate first part */ + if (i > 0) + bcopy(chain->map, map, i * sizeof(struct ip_fw *)); + map[i] = rule; + /* duplicate remaining part, we always have the default rule */ + bcopy(chain->map + i, map + i + 1, + sizeof(struct ip_fw *) *(chain->n_rules - i)); + if (rule->rulenum == 0) { + /* write back the number */ + rule->rulenum = i > 0 ? map[i-1]->rulenum : 0; + if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step) + rule->rulenum += V_autoinc_step; + input_rule->rulenum = rule->rulenum; + } + + rule->id = chain->id + 1; + map = swap_map(chain, map, chain->n_rules + 1); + chain->static_len += l; + IPFW_UH_WUNLOCK(chain); + if (map) + free(map, M_IPFW); + return (0); +} + +/* + * Reclaim storage associated with a list of rules. This is + * typically the list created using remove_rule. + * A NULL pointer on input is handled correctly. 
+ */ +void +ipfw_reap_rules(struct ip_fw *head) +{ + struct ip_fw *rule; + + while ((rule = head) != NULL) { + head = head->x_next; + free(rule, M_IPFW); + } +} + +/* + * Used by del_entry() to check if a rule should be kept. + * Returns 1 if the rule must be kept, 0 otherwise. + * + * Called with cmd = {0,1,5}. + * cmd == 0 matches on rule numbers, excludes rules in RESVD_SET if n == 0 ; + * cmd == 1 matches on set numbers only, rule numbers are ignored; + * cmd == 5 matches on rule and set numbers. + * + * n == 0 is a wildcard for rule numbers, there is no wildcard for sets. + * + * Rules to keep are + * (default || reserved || !match_set || !match_number) + * where + * default ::= (rule->rulenum == IPFW_DEFAULT_RULE) + * // the default rule is always protected + * + * reserved ::= (cmd == 0 && n == 0 && rule->set == RESVD_SET) + * // RESVD_SET is protected only if cmd == 0 and n == 0 ("ipfw flush") + * + * match_set ::= (cmd == 0 || rule->set == set) + * // set number is ignored for cmd == 0 + * + * match_number ::= (cmd == 1 || n == 0 || n == rule->rulenum) + * // number is ignored for cmd == 1 or n == 0 + * + */ +static int +keep_rule(struct ip_fw *rule, uint8_t cmd, uint8_t set, uint32_t n) +{ + return + (rule->rulenum == IPFW_DEFAULT_RULE) || + (cmd == 0 && n == 0 && rule->set == RESVD_SET) || + !(cmd == 0 || rule->set == set) || + !(cmd == 1 || n == 0 || n == rule->rulenum); +} + +/** + * Remove all rules with given number, or do set manipulation. + * Assumes chain != NULL && *chain != NULL. + * + * The argument is an uint32_t. 
The low 16 bit are the rule or set number; + * the next 8 bits are the new set; the top 8 bits indicate the command: + * + * 0 delete rules numbered "rulenum" + * 1 delete rules in set "rulenum" + * 2 move rules "rulenum" to set "new_set" + * 3 move rules from set "rulenum" to set "new_set" + * 4 swap sets "rulenum" and "new_set" + * 5 delete rules "rulenum" and set "new_set" + */ +static int +del_entry(struct ip_fw_chain *chain, uint32_t arg) +{ + struct ip_fw *rule; + uint32_t num; /* rule number or old_set */ + uint8_t cmd, new_set; + int start, end, i, ofs, n; + struct ip_fw **map = NULL; + int error = 0; + + num = arg & 0xffff; + cmd = (arg >> 24) & 0xff; + new_set = (arg >> 16) & 0xff; + + if (cmd > 5 || new_set > RESVD_SET) + return EINVAL; + if (cmd == 0 || cmd == 2 || cmd == 5) { + if (num >= IPFW_DEFAULT_RULE) + return EINVAL; + } else { + if (num > RESVD_SET) /* old_set */ + return EINVAL; + } + + IPFW_UH_WLOCK(chain); /* arbitrate writers */ + chain->reap = NULL; /* prepare for deletions */ + + switch (cmd) { + case 0: /* delete rules "num" (num == 0 matches all) */ + case 1: /* delete all rules in set N */ + case 5: /* delete rules with number N and set "new_set". */ + + /* + * Locate first rule to delete (start), the rule after + * the last one to delete (end), and count how many + * rules to delete (n). Always use keep_rule() to + * determine which rules to keep. + */ + n = 0; + if (cmd == 1) { + /* look for a specific set including RESVD_SET. + * Must scan the entire range, ignore num. 
+ */ + new_set = num; + for (start = -1, end = i = 0; i < chain->n_rules; i++) { + if (keep_rule(chain->map[i], cmd, new_set, 0)) + continue; + if (start < 0) + start = i; + end = i; + n++; + } + end++; /* first non-matching */ + } else { + /* Optimized search on rule numbers */ + start = ipfw_find_rule(chain, num, 0); + for (end = start; end < chain->n_rules; end++) { + rule = chain->map[end]; + if (num > 0 && rule->rulenum != num) + break; + if (!keep_rule(rule, cmd, new_set, num)) + n++; + } + } + + if (n == 0) { + /* A flush request (arg == 0) on empty ruleset + * returns with no error. On the contrary, + * if there is no match on a specific request, + * we return EINVAL. + */ + error = (arg == 0) ? 0 : EINVAL; + break; + } + + /* We have something to delete. Allocate the new map */ + map = get_map(chain, -n, 1 /* locked */); + if (map == NULL) { + error = EINVAL; + break; + } + + /* 1. bcopy the initial part of the map */ + if (start > 0) + bcopy(chain->map, map, start * sizeof(struct ip_fw *)); + /* 2. copy active rules between start and end */ + for (i = ofs = start; i < end; i++) { + rule = chain->map[i]; + if (keep_rule(rule, cmd, new_set, num)) + map[ofs++] = rule; + } + /* 3. copy the final part of the map */ + bcopy(chain->map + end, map + ofs, + (chain->n_rules - end) * sizeof(struct ip_fw *)); + /* 4. swap the maps (under BH_LOCK) */ + map = swap_map(chain, map, chain->n_rules - n); + /* 5. now remove the rules deleted from the old map */ + for (i = start; i < end; i++) { + int l; + rule = map[i]; + if (keep_rule(rule, cmd, new_set, num)) + continue; + l = RULESIZE(rule); + chain->static_len -= l; + ipfw_remove_dyn_children(rule); + rule->x_next = chain->reap; + chain->reap = rule; + } + break; + + /* + * In the next 3 cases the loop stops at (n_rules - 1) + * because the default rule is never eligible.. 
+ */ + + case 2: /* move rules with given RULE number to new set */ + for (i = 0; i < chain->n_rules - 1; i++) { + rule = chain->map[i]; + if (rule->rulenum == num) + rule->set = new_set; + } + break; + + case 3: /* move rules with given SET number to new set */ + for (i = 0; i < chain->n_rules - 1; i++) { + rule = chain->map[i]; + if (rule->set == num) + rule->set = new_set; + } + break; + + case 4: /* swap two sets */ + for (i = 0; i < chain->n_rules - 1; i++) { + rule = chain->map[i]; + if (rule->set == num) + rule->set = new_set; + else if (rule->set == new_set) + rule->set = num; + } + break; + } + + rule = chain->reap; + chain->reap = NULL; + IPFW_UH_WUNLOCK(chain); + ipfw_reap_rules(rule); + if (map) + free(map, M_IPFW); + return error; +} + +/* + * Clear counters for a specific rule. + * Normally run under IPFW_UH_RLOCK, but these are idempotent ops + * so we only care that rules do not disappear. + */ +static void +clear_counters(struct ip_fw *rule, int log_only) +{ + ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); + + if (log_only == 0) { + rule->bcnt = rule->pcnt = 0; + rule->timestamp = 0; + } + if (l->o.opcode == O_LOG) + l->log_left = l->max_log; +} + +/** + * Reset some or all counters on firewall rules. + * The argument `arg' is an u_int32_t. The low 16 bit are the rule number, + * the next 8 bits are the set number, the top 8 bits are the command: + * 0 work with rules from all set's; + * 1 work with rules only from specified set. + * Specified rule number is zero if we want to clear all entries. + * log_only is 1 if we only want to reset logs, zero otherwise. 
+ */ +static int +zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only) +{ + struct ip_fw *rule; + char *msg; + int i; + + uint16_t rulenum = arg & 0xffff; + uint8_t set = (arg >> 16) & 0xff; + uint8_t cmd = (arg >> 24) & 0xff; + + if (cmd > 1) + return (EINVAL); + if (cmd == 1 && set > RESVD_SET) + return (EINVAL); + + IPFW_UH_RLOCK(chain); + if (rulenum == 0) { + V_norule_counter = 0; + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + /* Skip rules not in our set. */ + if (cmd == 1 && rule->set != set) + continue; + clear_counters(rule, log_only); + } + msg = log_only ? "All logging counts reset" : + "Accounting cleared"; + } else { + int cleared = 0; + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + if (rule->rulenum == rulenum) { + if (cmd == 0 || rule->set == set) + clear_counters(rule, log_only); + cleared = 1; + } + if (rule->rulenum > rulenum) + break; + } + if (!cleared) { /* we did not find any matching rules */ + IPFW_UH_RUNLOCK(chain); + return (EINVAL); + } + msg = log_only ? "logging count reset" : "cleared"; + } + IPFW_UH_RUNLOCK(chain); + + if (V_fw_verbose) { + int lev = LOG_SECURITY | LOG_NOTICE; + + if (rulenum) + log(lev, "ipfw: Entry %d %s.\n", rulenum, msg); + else + log(lev, "ipfw: %s.\n", msg); + } + return (0); +} + +/* + * Check validity of the structure before insert. + * Rules are simple, so this mostly need to check rule sizes. + */ +static int +check_ipfw_struct(struct ip_fw *rule, int size) +{ + int l, cmdlen = 0; + int have_action=0; + ipfw_insn *cmd; + + if (size < sizeof(*rule)) { + printf("ipfw: rule too short\n"); + return (EINVAL); + } + /* first, check for valid size */ + l = RULESIZE(rule); + if (l != size) { + printf("ipfw: size mismatch (have %d want %d)\n", size, l); + return (EINVAL); + } + if (rule->act_ofs >= rule->cmd_len) { + printf("ipfw: bogus action offset (%u > %u)\n", + rule->act_ofs, rule->cmd_len - 1); + return (EINVAL); + } + /* + * Now go for the individual checks. 
Very simple ones, basically only + * instruction sizes. + */ + for (l = rule->cmd_len, cmd = rule->cmd ; + l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + if (cmdlen > l) { + printf("ipfw: opcode %d size truncated\n", + cmd->opcode); + return EINVAL; + } + switch (cmd->opcode) { + case O_PROBE_STATE: + case O_KEEP_STATE: + case O_PROTO: + case O_IP_SRC_ME: + case O_IP_DST_ME: + case O_LAYER2: + case O_IN: + case O_FRAG: + case O_DIVERTED: + case O_IPOPT: + case O_IPTOS: + case O_IPPRECEDENCE: + case O_IPVER: + case O_TCPWIN: + case O_TCPFLAGS: + case O_TCPOPTS: + case O_ESTAB: + case O_VERREVPATH: + case O_VERSRCREACH: + case O_ANTISPOOF: + case O_IPSEC: +#ifdef INET6 + case O_IP6_SRC_ME: + case O_IP6_DST_ME: + case O_EXT_HDR: + case O_IP6: +#endif + case O_IP4: + case O_TAG: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + break; + + case O_FIB: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + if (cmd->arg1 >= rt_numfibs) { + printf("ipfw: invalid fib number %d\n", + cmd->arg1); + return EINVAL; + } + break; + + case O_SETFIB: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + if (cmd->arg1 >= rt_numfibs) { + printf("ipfw: invalid fib number %d\n", + cmd->arg1); + return EINVAL; + } + goto check_action; + + case O_UID: + case O_GID: + case O_JAIL: + case O_IP_SRC: + case O_IP_DST: + case O_TCPSEQ: + case O_TCPACK: + case O_PROB: + case O_ICMPTYPE: + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) + goto bad_size; + break; + + case O_LIMIT: + if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) + goto bad_size; + break; + + case O_LOG: + if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) + goto bad_size; + + ((ipfw_insn_log *)cmd)->log_left = + ((ipfw_insn_log *)cmd)->max_log; + + break; + + case O_IP_SRC_MASK: + case O_IP_DST_MASK: + /* only odd command lengths */ + if ( !(cmdlen & 1) || cmdlen > 31) + goto bad_size; + break; + + case O_IP_SRC_SET: + case O_IP_DST_SET: + if (cmd->arg1 == 0 || cmd->arg1 > 256) { + printf("ipfw: invalid set size 
%d\n", + cmd->arg1); + return EINVAL; + } + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + + (cmd->arg1+31)/32 ) + goto bad_size; + break; + + case O_IP_SRC_LOOKUP: + case O_IP_DST_LOOKUP: + if (cmd->arg1 >= IPFW_TABLES_MAX) { + printf("ipfw: invalid table number %d\n", + cmd->arg1); + return (EINVAL); + } + if (cmdlen != F_INSN_SIZE(ipfw_insn) && + cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 && + cmdlen != F_INSN_SIZE(ipfw_insn_u32)) + goto bad_size; + break; + + case O_MACADDR2: + if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) + goto bad_size; + break; + + case O_NOP: + case O_IPID: + case O_IPTTL: + case O_IPLEN: + case O_TCPDATALEN: + case O_TAGGED: + if (cmdlen < 1 || cmdlen > 31) + goto bad_size; + break; + + case O_MAC_TYPE: + case O_IP_SRCPORT: + case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ + if (cmdlen < 2 || cmdlen > 31) + goto bad_size; + break; + + case O_RECV: + case O_XMIT: + case O_VIA: + if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) + goto bad_size; + break; + + case O_ALTQ: + if (cmdlen != F_INSN_SIZE(ipfw_insn_altq)) + goto bad_size; + break; + + case O_PIPE: + case O_QUEUE: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + goto check_action; + + case O_FORWARD_IP: +#ifdef IPFIREWALL_FORWARD + if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) + goto bad_size; + goto check_action; +#else + return EINVAL; +#endif + + case O_DIVERT: + case O_TEE: + if (ip_divert_ptr == NULL) + return EINVAL; + else + goto check_size; + case O_NETGRAPH: + case O_NGTEE: + if (ng_ipfw_input_p == NULL) + return EINVAL; + else + goto check_size; + case O_NAT: + if (!IPFW_NAT_LOADED) + return EINVAL; + if (cmdlen != F_INSN_SIZE(ipfw_insn_nat)) + goto bad_size; + goto check_action; + case O_FORWARD_MAC: /* XXX not implemented yet */ + case O_CHECK_STATE: + case O_COUNT: + case O_ACCEPT: + case O_DENY: + case O_REJECT: +#ifdef INET6 + case O_UNREACH6: +#endif + case O_SKIPTO: + case O_REASS: +check_size: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; +check_action: 
+ if (have_action) { + printf("ipfw: opcode %d, multiple actions" + " not allowed\n", + cmd->opcode); + return EINVAL; + } + have_action = 1; + if (l != cmdlen) { + printf("ipfw: opcode %d, action must be" + " last opcode\n", + cmd->opcode); + return EINVAL; + } + break; +#ifdef INET6 + case O_IP6_SRC: + case O_IP6_DST: + if (cmdlen != F_INSN_SIZE(struct in6_addr) + + F_INSN_SIZE(ipfw_insn)) + goto bad_size; + break; + + case O_FLOW6ID: + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + + ((ipfw_insn_u32 *)cmd)->o.arg1) + goto bad_size; + break; + + case O_IP6_SRC_MASK: + case O_IP6_DST_MASK: + if ( !(cmdlen & 1) || cmdlen > 127) + goto bad_size; + break; + case O_ICMP6TYPE: + if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) ) + goto bad_size; + break; +#endif + + default: + switch (cmd->opcode) { +#ifndef INET6 + case O_IP6_SRC_ME: + case O_IP6_DST_ME: + case O_EXT_HDR: + case O_IP6: + case O_UNREACH6: + case O_IP6_SRC: + case O_IP6_DST: + case O_FLOW6ID: + case O_IP6_SRC_MASK: + case O_IP6_DST_MASK: + case O_ICMP6TYPE: + printf("ipfw: no IPv6 support in kernel\n"); + return EPROTONOSUPPORT; +#endif + default: + printf("ipfw: opcode %d, unknown opcode\n", + cmd->opcode); + return EINVAL; + } + } + } + if (have_action == 0) { + printf("ipfw: missing action\n"); + return EINVAL; + } + return 0; + +bad_size: + printf("ipfw: opcode %d size %d wrong\n", + cmd->opcode, cmdlen); + return EINVAL; +} + + +/* + * Translation of requests for compatibility with FreeBSD 7.2/8. + * a static variable tells us if we have an old client from userland, + * and if necessary we translate requests and responses between the + * two formats. 
+ */ +static int is7 = 0; + +struct ip_fw7 { + struct ip_fw7 *next; /* linked list of rules */ + struct ip_fw7 *next_rule; /* ptr to next [skipto] rule */ + /* 'next_rule' is used to pass up 'set_disable' status */ + + uint16_t act_ofs; /* offset of action in 32-bit units */ + uint16_t cmd_len; /* # of 32-bit words in cmd */ + uint16_t rulenum; /* rule number */ + uint8_t set; /* rule set (0..31) */ + // #define RESVD_SET 31 /* set for default and persistent rules */ + uint8_t _pad; /* padding */ + // uint32_t id; /* rule id, only in v.8 */ + /* These fields are present in all rules. */ + uint64_t pcnt; /* Packet counter */ + uint64_t bcnt; /* Byte counter */ + uint32_t timestamp; /* tv_sec of last match */ + + ipfw_insn cmd[1]; /* storage for commands */ +}; + + int convert_rule_to_7(struct ip_fw *rule); +int convert_rule_to_8(struct ip_fw *rule); + +#ifndef RULESIZE7 +#define RULESIZE7(rule) (sizeof(struct ip_fw7) + \ + ((struct ip_fw7 *)(rule))->cmd_len * 4 - 4) +#endif + + +/* + * Copy the static and dynamic rules to the supplied buffer + * and return the amount of space actually used. + * Must be run under IPFW_UH_RLOCK + */ +static size_t +ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) +{ + char *bp = buf; + char *ep = bp + space; + struct ip_fw *rule, *dst; + int l, i; + time_t boot_seconds; + + boot_seconds = boottime.tv_sec; + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + + if (is7) { + /* Convert rule to FreeBSd 7.2 format */ + l = RULESIZE7(rule); + if (bp + l + sizeof(uint32_t) <= ep) { + int error; + bcopy(rule, bp, l + sizeof(uint32_t)); + error = convert_rule_to_7((struct ip_fw *) bp); + if (error) + return 0; /*XXX correct? */ + /* + * XXX HACK. Store the disable mask in the "next" + * pointer in a wild attempt to keep the ABI the same. + * Why do we do this on EVERY rule? 
+ */ + bcopy(&V_set_disable, + &(((struct ip_fw7 *)bp)->next_rule), + sizeof(V_set_disable)); + if (((struct ip_fw7 *)bp)->timestamp) + ((struct ip_fw7 *)bp)->timestamp += boot_seconds; + bp += l; + } + continue; /* go to next rule */ + } + + /* normal mode, don't touch rules */ + l = RULESIZE(rule); + if (bp + l > ep) { /* should not happen */ + printf("overflow dumping static rules\n"); + break; + } + dst = (struct ip_fw *)bp; + bcopy(rule, dst, l); + /* + * XXX HACK. Store the disable mask in the "next" + * pointer in a wild attempt to keep the ABI the same. + * Why do we do this on EVERY rule? + */ + bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable)); + if (dst->timestamp) + dst->timestamp += boot_seconds; + bp += l; + } + ipfw_get_dynamic(&bp, ep); /* protected by the dynamic lock */ + return (bp - (char *)buf); +} + + +/** + * {set|get}sockopt parser. + */ +int +ipfw_ctl(struct sockopt *sopt) +{ +#define RULE_MAXSIZE (256*sizeof(u_int32_t)) + int error; + size_t size; + struct ip_fw *buf, *rule; + struct ip_fw_chain *chain; + u_int32_t rulenum[2]; + + error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW); + if (error) + return (error); + + /* + * Disallow modifications in really-really secure mode, but still allow + * the logging counters to be reset. + */ + if (sopt->sopt_name == IP_FW_ADD || + (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) { + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); + if (error) + return (error); + } + + chain = &V_layer3_chain; + error = 0; + + switch (sopt->sopt_name) { + case IP_FW_GET: + /* + * pass up a copy of the current rules. Static rules + * come first (the last of which has number IPFW_DEFAULT_RULE), + * followed by a possibly empty list of dynamic rule. + * The last dynamic rule has NULL in the "next" field. + * + * Note that the calculated size is used to bound the + * amount of data returned to the user. 
The rule set may + * change between calculating the size and returning the + * data in which case we'll just return what fits. + */ + for (;;) { + int len = 0, want; + + size = chain->static_len; + size += ipfw_dyn_len(); + if (size >= sopt->sopt_valsize) + break; + buf = malloc(size, M_TEMP, M_WAITOK); + if (buf == NULL) + break; + IPFW_UH_RLOCK(chain); + /* check again how much space we need */ + want = chain->static_len + ipfw_dyn_len(); + if (size >= want) + len = ipfw_getrules(chain, buf, size); + IPFW_UH_RUNLOCK(chain); + if (size >= want) + error = sooptcopyout(sopt, buf, len); + free(buf, M_TEMP); + if (size >= want) + break; + } + break; + + case IP_FW_FLUSH: + /* locking is done within del_entry() */ + error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */ + break; + + case IP_FW_ADD: + rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, rule, RULE_MAXSIZE, + sizeof(struct ip_fw7) ); + + /* + * If the size of commands equals RULESIZE7 then we assume + * a FreeBSD7.2 binary is talking to us (set is7=1). + * is7 is persistent so the next 'ipfw list' command + * will use this format. + * NOTE: If wrong version is guessed (this can happen if + * the first ipfw command is 'ipfw [pipe] list') + * the ipfw binary may crash or loop infinitly... 
+ */ + if (sopt->sopt_valsize == RULESIZE7(rule)) { + is7 = 1; + error = convert_rule_to_8(rule); + if (error) + return error; + if (error == 0) + error = check_ipfw_struct(rule, RULESIZE(rule)); + } else { + is7 = 0; + if (error == 0) + error = check_ipfw_struct(rule, sopt->sopt_valsize); + } + if (error == 0) { + /* locking is done within ipfw_add_rule() */ + error = ipfw_add_rule(chain, rule); + size = RULESIZE(rule); + if (!error && sopt->sopt_dir == SOPT_GET) { + if (is7) { + error = convert_rule_to_7(rule); + size = RULESIZE7(rule); + if (error) + return error; + } + error = sooptcopyout(sopt, rule, size); + } + } + free(rule, M_TEMP); + break; + + case IP_FW_DEL: + /* + * IP_FW_DEL is used for deleting single rules or sets, + * and (ab)used to atomically manipulate sets. Argument size + * is used to distinguish between the two: + * sizeof(u_int32_t) + * delete single rule or set of rules, + * or reassign rules (or sets) to a different set. + * 2*sizeof(u_int32_t) + * atomic disable/enable sets. + * first u_int32_t contains sets to be disabled, + * second u_int32_t contains sets to be enabled. 
+ */ + error = sooptcopyin(sopt, rulenum, + 2*sizeof(u_int32_t), sizeof(u_int32_t)); + if (error) + break; + size = sopt->sopt_valsize; + if (size == sizeof(u_int32_t) && rulenum[0] != 0) { + /* delete or reassign, locking done in del_entry() */ + error = del_entry(chain, rulenum[0]); + } else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */ + IPFW_UH_WLOCK(chain); + V_set_disable = + (V_set_disable | rulenum[0]) & ~rulenum[1] & + ~(1<sopt_val != 0) { + error = sooptcopyin(sopt, rulenum, + sizeof(u_int32_t), sizeof(u_int32_t)); + if (error) + break; + } + error = zero_entry(chain, rulenum[0], + sopt->sopt_name == IP_FW_RESETLOG); + break; + + /*--- TABLE manipulations are protected by the IPFW_LOCK ---*/ + case IP_FW_TABLE_ADD: + { + ipfw_table_entry ent; + + error = sooptcopyin(sopt, &ent, + sizeof(ent), sizeof(ent)); + if (error) + break; + error = ipfw_add_table_entry(chain, ent.tbl, + ent.addr, ent.masklen, ent.value); + } + break; + + case IP_FW_TABLE_DEL: + { + ipfw_table_entry ent; + + error = sooptcopyin(sopt, &ent, + sizeof(ent), sizeof(ent)); + if (error) + break; + error = ipfw_del_table_entry(chain, ent.tbl, + ent.addr, ent.masklen); + } + break; + + case IP_FW_TABLE_FLUSH: + { + u_int16_t tbl; + + error = sooptcopyin(sopt, &tbl, + sizeof(tbl), sizeof(tbl)); + if (error) + break; + IPFW_WLOCK(chain); + error = ipfw_flush_table(chain, tbl); + IPFW_WUNLOCK(chain); + } + break; + + case IP_FW_TABLE_GETSIZE: + { + u_int32_t tbl, cnt; + + if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), + sizeof(tbl)))) + break; + IPFW_RLOCK(chain); + error = ipfw_count_table(chain, tbl, &cnt); + IPFW_RUNLOCK(chain); + if (error) + break; + error = sooptcopyout(sopt, &cnt, sizeof(cnt)); + } + break; + + case IP_FW_TABLE_LIST: + { + ipfw_table *tbl; + + if (sopt->sopt_valsize < sizeof(*tbl)) { + error = EINVAL; + break; + } + size = sopt->sopt_valsize; + tbl = malloc(size, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, tbl, size, sizeof(*tbl)); + if (error) { + 
free(tbl, M_TEMP); + break; + } + tbl->size = (size - sizeof(*tbl)) / + sizeof(ipfw_table_entry); + IPFW_RLOCK(chain); + error = ipfw_dump_table(chain, tbl); + IPFW_RUNLOCK(chain); + if (error) { + free(tbl, M_TEMP); + break; + } + error = sooptcopyout(sopt, tbl, size); + free(tbl, M_TEMP); + } + break; + + /*--- NAT operations are protected by the IPFW_LOCK ---*/ + case IP_FW_NAT_CFG: + if (IPFW_NAT_LOADED) + error = ipfw_nat_cfg_ptr(sopt); + else { + printf("IP_FW_NAT_CFG: %s\n", + "ipfw_nat not present, please load it"); + error = EINVAL; + } + break; + + case IP_FW_NAT_DEL: + if (IPFW_NAT_LOADED) + error = ipfw_nat_del_ptr(sopt); + else { + printf("IP_FW_NAT_DEL: %s\n", + "ipfw_nat not present, please load it"); + error = EINVAL; + } + break; + + case IP_FW_NAT_GET_CONFIG: + if (IPFW_NAT_LOADED) + error = ipfw_nat_get_cfg_ptr(sopt); + else { + printf("IP_FW_NAT_GET_CFG: %s\n", + "ipfw_nat not present, please load it"); + error = EINVAL; + } + break; + + case IP_FW_NAT_GET_LOG: + if (IPFW_NAT_LOADED) + error = ipfw_nat_get_log_ptr(sopt); + else { + printf("IP_FW_NAT_GET_LOG: %s\n", + "ipfw_nat not present, please load it"); + error = EINVAL; + } + break; + + default: + printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name); + error = EINVAL; + } + + return (error); +#undef RULE_MAXSIZE +} + + +#define RULE_MAXSIZE (256*sizeof(u_int32_t)) + +/* Functions to convert rules 7.2 <==> 8.0 */ +int +convert_rule_to_7(struct ip_fw *rule) +{ + /* Used to modify original rule */ + struct ip_fw7 *rule7 = (struct ip_fw7 *)rule; + /* copy of original rule, version 8 */ + struct ip_fw *tmp; + + /* Used to copy commands */ + ipfw_insn *ccmd, *dst; + int ll = 0, ccmdlen = 0; + + tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); + if (tmp == NULL) { + return 1; //XXX error + } + bcopy(rule, tmp, RULE_MAXSIZE); + + /* Copy fields */ + rule7->_pad = tmp->_pad; + rule7->set = tmp->set; + rule7->rulenum = tmp->rulenum; + rule7->cmd_len = tmp->cmd_len; + rule7->act_ofs = 
tmp->act_ofs; + rule7->next_rule = (struct ip_fw7 *)tmp->next_rule; + rule7->next = (struct ip_fw7 *)tmp->x_next; + rule7->cmd_len = tmp->cmd_len; + rule7->pcnt = tmp->pcnt; + rule7->bcnt = tmp->bcnt; + rule7->timestamp = tmp->timestamp; + + /* Copy commands */ + for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ; + ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { + ccmdlen = F_LEN(ccmd); + + bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); + + if (dst->opcode > O_NAT) + /* O_REASS doesn't exists in 7.2 version, so + * decrement opcode if it is after O_REASS + */ + dst->opcode--; + + if (ccmdlen > ll) { + printf("ipfw: opcode %d size truncated\n", + ccmd->opcode); + return EINVAL; + } + } + free(tmp, M_TEMP); + + return 0; +} + +int +convert_rule_to_8(struct ip_fw *rule) +{ + /* Used to modify original rule */ + struct ip_fw7 *rule7 = (struct ip_fw7 *) rule; + + /* Used to copy commands */ + ipfw_insn *ccmd, *dst; + int ll = 0, ccmdlen = 0; + + /* Copy of original rule */ + struct ip_fw7 *tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); + if (tmp == NULL) { + return 1; //XXX error + } + + bcopy(rule7, tmp, RULE_MAXSIZE); + + for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ; + ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { + ccmdlen = F_LEN(ccmd); + + bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); + + if (dst->opcode > O_NAT) + /* O_REASS doesn't exists in 7.2 version, so + * increment opcode if it is after O_REASS + */ + dst->opcode++; + + if (ccmdlen > ll) { + printf("ipfw: opcode %d size truncated\n", + ccmd->opcode); + return EINVAL; + } + } + + rule->_pad = tmp->_pad; + rule->set = tmp->set; + rule->rulenum = tmp->rulenum; + rule->cmd_len = tmp->cmd_len; + rule->act_ofs = tmp->act_ofs; + rule->next_rule = (struct ip_fw *)tmp->next_rule; + rule->x_next = (struct ip_fw *)tmp->next; + rule->cmd_len = tmp->cmd_len; + rule->id = 0; /* XXX see if is ok = 0 */ + rule->pcnt = tmp->pcnt; + rule->bcnt = tmp->bcnt; + 
rule->timestamp = tmp->timestamp; + + free (tmp, M_TEMP); + return 0; +} + +/* end of file */ diff --git a/dummynet2/ip_fw_table.c b/dummynet2/ip_fw_table.c new file mode 100644 index 0000000..d8973d5 --- /dev/null +++ b/dummynet2/ip_fw_table.c @@ -0,0 +1,286 @@ +/*- + * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_table.c 200601 2009-12-16 10:48:40Z luigi $"); + +/* + * Lookup table support for ipfw + * + * Lookup tables are implemented (at the moment) using the radix + * tree used for routing tables. Tables store key-value entries, where + * keys are network prefixes (addr/masklen), and values are integers. 
+ * As a degenerate case we can interpret keys as 32-bit integers + * (with a /32 mask). + * + * The table is protected by the IPFW lock even for manipulation coming + * from userland, because operations are typically fast. + */ + +#if !defined(KLD_MODULE) +#include "opt_ipfw.h" +#include "opt_ipdivert.h" +#include "opt_ipdn.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif +#include "opt_inet6.h" +#include "opt_ipsec.h" + +#include +#include +#include +#include +#include +#include +#include +#include /* ip_fw.h requires IFNAMSIZ */ +#include +#include +#include + +#include +#include /* struct ipfw_rule_ref */ +#include +#include /* LIST_HEAD */ +#include + +#ifdef MAC +#include +#endif + +MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); + +struct table_entry { + struct radix_node rn[2]; + struct sockaddr_in addr, mask; + u_int32_t value; +}; + +/* + * The radix code expects addr and mask to be array of bytes, + * with the first byte being the length of the array. rn_inithead + * is called with the offset in bits of the lookup key within the + * array. If we use a sockaddr_in as the underlying type, + * sin_len is conveniently located at offset 0, sin_addr is at + * offset 4 and normally aligned. + * But for portability, let's avoid assumption and make the code explicit + */ +#define KEY_LEN(v) *((uint8_t *)&(v)) +#define KEY_OFS (8*offsetof(struct sockaddr_in, sin_addr)) + +int +ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint8_t mlen, uint32_t value) +{ + struct radix_node_head *rnh; + struct table_entry *ent; + struct radix_node *rn; + + if (tbl >= IPFW_TABLES_MAX) + return (EINVAL); + rnh = ch->tables[tbl]; + ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO); + if (ent == NULL) + return (ENOMEM); + ent->value = value; + KEY_LEN(ent->addr) = KEY_LEN(ent->mask) = 8; + ent->mask.sin_addr.s_addr = htonl(mlen ? 
~((1 << (32 - mlen)) - 1) : 0); + ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr; + IPFW_WLOCK(ch); + rn = rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent); + if (rn == NULL) { + IPFW_WUNLOCK(ch); + free(ent, M_IPFW_TBL); + return (EEXIST); + } + IPFW_WUNLOCK(ch); + return (0); +} + +int +ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint8_t mlen) +{ + struct radix_node_head *rnh; + struct table_entry *ent; + struct sockaddr_in sa, mask; + + if (tbl >= IPFW_TABLES_MAX) + return (EINVAL); + rnh = ch->tables[tbl]; + KEY_LEN(sa) = KEY_LEN(mask) = 8; + mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); + sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr; + IPFW_WLOCK(ch); + ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh); + if (ent == NULL) { + IPFW_WUNLOCK(ch); + return (ESRCH); + } + IPFW_WUNLOCK(ch); + free(ent, M_IPFW_TBL); + return (0); +} + +static int +flush_table_entry(struct radix_node *rn, void *arg) +{ + struct radix_node_head * const rnh = arg; + struct table_entry *ent; + + ent = (struct table_entry *) + rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh); + if (ent != NULL) + free(ent, M_IPFW_TBL); + return (0); +} + +int +ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl) +{ + struct radix_node_head *rnh; + + IPFW_WLOCK_ASSERT(ch); + + if (tbl >= IPFW_TABLES_MAX) + return (EINVAL); + rnh = ch->tables[tbl]; + KASSERT(rnh != NULL, ("NULL IPFW table")); + rnh->rnh_walktree(rnh, flush_table_entry, rnh); + return (0); +} + +void +ipfw_destroy_tables(struct ip_fw_chain *ch) +{ + uint16_t tbl; + struct radix_node_head *rnh; + + IPFW_WLOCK_ASSERT(ch); + + for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++) { + ipfw_flush_table(ch, tbl); + rnh = ch->tables[tbl]; + rn_detachhead((void **)&rnh); + } +} + +int +ipfw_init_tables(struct ip_fw_chain *ch) +{ + int i; + uint16_t j; + + for (i = 0; i < IPFW_TABLES_MAX; i++) { + if (!rn_inithead((void **)&ch->tables[i], KEY_OFS)) { + for (j = 0; j 
< i; j++) { + (void) ipfw_flush_table(ch, j); + } + return (ENOMEM); + } + } + return (0); +} + +int +ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint32_t *val) +{ + struct radix_node_head *rnh; + struct table_entry *ent; + struct sockaddr_in sa; + + if (tbl >= IPFW_TABLES_MAX) + return (0); + rnh = ch->tables[tbl]; + KEY_LEN(sa) = 8; + sa.sin_addr.s_addr = addr; + ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh)); + if (ent != NULL) { + *val = ent->value; + return (1); + } + return (0); +} + +static int +count_table_entry(struct radix_node *rn, void *arg) +{ + u_int32_t * const cnt = arg; + + (*cnt)++; + return (0); +} + +int +ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt) +{ + struct radix_node_head *rnh; + + if (tbl >= IPFW_TABLES_MAX) + return (EINVAL); + rnh = ch->tables[tbl]; + *cnt = 0; + rnh->rnh_walktree(rnh, count_table_entry, cnt); + return (0); +} + +static int +dump_table_entry(struct radix_node *rn, void *arg) +{ + struct table_entry * const n = (struct table_entry *)rn; + ipfw_table * const tbl = arg; + ipfw_table_entry *ent; + + if (tbl->cnt == tbl->size) + return (1); + ent = &tbl->ent[tbl->cnt]; + ent->tbl = tbl->tbl; + if (in_nullhost(n->mask.sin_addr)) + ent->masklen = 0; + else + ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr)); + ent->addr = n->addr.sin_addr.s_addr; + ent->value = n->value; + tbl->cnt++; + return (0); +} + +int +ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl) +{ + struct radix_node_head *rnh; + + if (tbl->tbl >= IPFW_TABLES_MAX) + return (EINVAL); + rnh = ch->tables[tbl->tbl]; + tbl->cnt = 0; + rnh->rnh_walktree(rnh, dump_table_entry, tbl); + return (0); +} +/* end of file */ diff --git a/dummynet2/ipfw2_mod.c b/dummynet2/ipfw2_mod.c new file mode 100644 index 0000000..7ce046b --- /dev/null +++ b/dummynet2/ipfw2_mod.c @@ -0,0 +1,921 @@ +/* + * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa + * + * Redistribution and use in 
source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: ipfw2_mod.c 10302 2012-01-19 21:49:23Z marta $ + * + * The main interface to build ipfw+dummynet as a linux module. + * (and possibly as a windows module as well, though that part + * is not complete yet). + * + * The control interface uses the sockopt mechanism + * on a socket(AF_INET, SOCK_RAW, IPPROTO_RAW). + * + * The data interface uses the netfilter interface, at the moment + * hooked to the PRE_ROUTING and POST_ROUTING hooks. + * Unfortunately the netfilter interface is a moving target, + * so we need a set of macros to adapt to the various cases. 
+ * + * In the netfilter hook we just mark packet as 'QUEUE' and then + * let the queue handler to do the whole work (filtering and + * possibly emulation). + * As we receive packets, we wrap them with an mbuf descriptor + * so the existing ipfw+dummynet code runs unmodified. + */ + +#include +#include /* sizeof struct mbuf */ +#include /* NGROUPS */ + +#ifdef __linux__ +#include +#include +#include +#include /* NF_IP_PRI_FILTER */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25) +#include /* nf_queue */ +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) +#define __read_mostly +#endif + +#endif /* !__linux__ */ + +#include /* in_addr */ +#include /* ip_fw_ctl_t, ip_fw_chk_t */ +#include /* ip_fw_ctl_t, ip_fw_chk_t */ +#include /* ip_dn_ctl_t, ip_dn_io_t */ +#include /* PFIL_IN, PFIL_OUT */ + +#ifdef __linux__ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,13) +/* XXX was < 2.6.0: inet_hashtables.h is introduced in 2.6.14 */ +// #warning --- inet_hashtables not present on 2.4 +#include +#include +#include +static inline int inet_iif(const struct sk_buff *skb) +{ + return ((struct rtable *)skb->dst)->rt_iif; +} + +#else +#include /* inet_lookup */ +#endif +#endif /* __linux__ */ + +#include /* inet_iif */ + +/* + * Here we allocate some global variables used in the firewall. + */ +//ip_dn_ctl_t *ip_dn_ctl_ptr; +int (*ip_dn_ctl_ptr)(struct sockopt *); + +ip_fw_ctl_t *ip_fw_ctl_ptr; + +int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa); +ip_fw_chk_t *ip_fw_chk_ptr; + +void (*bridge_dn_p)(struct mbuf *, struct ifnet *); + +/* Divert hooks. */ +void (*ip_divert_ptr)(struct mbuf *m, int incoming); + +/* ng_ipfw hooks. */ +ng_ipfw_input_t *ng_ipfw_input_p = NULL; + +/*--- + * Glue code to implement the registration of children with the parent. + * Each child should call my_mod_register() when linking, so that + * module_init() and module_exit() can call init_children() and + * fini_children() to provide the necessary initialization. 
+ * We use the same mechanism for MODULE_ and SYSINIT_. + * The former only get a pointer to the moduledata, + * the latter have two function pointers (init/uninit) + */ +#include +struct mod_args { + const char *name; + int order; + struct moduledata *mod; + void (*init)(void), (*uninit)(void); +}; + +static unsigned int mod_idx; +static struct mod_args mods[10]; /* hard limit to 10 modules */ + +int +my_mod_register(const char *name, int order, + struct moduledata *mod, void *init, void *uninit); +/* + * my_mod_register should be called automatically as the init + * functions in the submodules. Unfortunately this compiler/linker + * trick is not supported yet so we call it manually. + */ +int +my_mod_register(const char *name, int order, + struct moduledata *mod, void *init, void *uninit) +{ + struct mod_args m; + + m.name = name; + m.order = order; + m.mod = mod; + m.init = init; + m.uninit = uninit; + + printf("%s %s called\n", __FUNCTION__, name); + if (mod_idx < sizeof(mods) / sizeof(mods[0])) + mods[mod_idx++] = m; + return 0; +} + +static void +init_children(void) +{ + unsigned int i; + + /* Call the functions registered at init time. */ + printf("%s mod_idx value %d\n", __FUNCTION__, mod_idx); + for (i = 0; i < mod_idx; i++) { + struct mod_args *m = &mods[i]; + printf("+++ start module %d %s %s at %p order 0x%x\n", + i, m->name, m->mod ? m->mod->name : "SYSINIT", + m->mod, m->order); + if (m->mod && m->mod->evhand) + m->mod->evhand(NULL, MOD_LOAD, m->mod->priv); + else if (m->init) + m->init(); + } +} + +static void +fini_children(void) +{ + int i; + + /* Call the functions registered at init time. */ + for (i = mod_idx - 1; i >= 0; i--) { + struct mod_args *m = &mods[i]; + printf("+++ end module %d %s %s at %p order 0x%x\n", + i, m->name, m->mod ? 
m->mod->name : "SYSINIT", + m->mod, m->order); + if (m->mod && m->mod->evhand) + m->mod->evhand(NULL, MOD_UNLOAD, m->mod->priv); + else if (m->uninit) + m->uninit(); + } +} +/*--- end of module binding helper functions ---*/ + +/*--- + * Control hooks: + * ipfw_ctl_h() is a wrapper for linux to FreeBSD sockopt call convention. + * then call the ipfw handler in order to manage requests. + * In turn this is called by the linux set/get handlers. + */ +static int +ipfw_ctl_h(struct sockopt *s, int cmd, int dir, int len, void __user *user) +{ + struct thread t; + int ret = EINVAL; + + memset(s, 0, sizeof(s)); + s->sopt_name = cmd; + s->sopt_dir = dir; + s->sopt_valsize = len; + s->sopt_val = user; + + /* sopt_td is not used but it is referenced */ + memset(&t, 0, sizeof(t)); + s->sopt_td = &t; + + //printf("%s called with cmd %d len %d sopt %p user %p\n", __FUNCTION__, cmd, len, s, user); + + if (ip_fw_ctl_ptr && cmd != IP_DUMMYNET3 && (cmd == IP_FW3 || + cmd < IP_DUMMYNET_CONFIGURE)) + ret = ip_fw_ctl_ptr(s); + else if (ip_dn_ctl_ptr && (cmd == IP_DUMMYNET3 || + cmd >= IP_DUMMYNET_CONFIGURE)) + ret = ip_dn_ctl_ptr(s); + + return -ret; /* errors are < 0 on linux */ +} + +/* + * Convert an mbuf into an skbuff + * At the moment this only works for ip packets fully contained + * in a single mbuf. We assume that on entry ip_len and ip_off are + * in host format, and the ip checksum is not computed. 
+ */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) /* check boundary */ +int dst_output(struct skbuff *s) +{ + return 0; +} + +struct sk_buff * +mbuf2skbuff(struct mbuf* m) +{ + return NULL; +} +#else +struct sk_buff * +mbuf2skbuff(struct mbuf* m) +{ + struct sk_buff *skb; + size_t len = m->m_pkthdr.len; + + /* used to lookup the routing table */ + struct rtable *r; + struct flowi fl; + int ret = 0; /* success for ip_route_output_key() */ + + struct ip *ip = mtod(m, struct ip *); + + /* XXX ip_output has ip_len and ip_off in network format, + * linux expects host format */ + ip->ip_len = ntohs(ip->ip_len); + ip->ip_off = ntohs(ip->ip_off); + + ip->ip_sum = 0; + ip->ip_sum = in_cksum(m, ip->ip_hl<<2); + + /* fill flowi struct, we need just the dst addr, see XXX */ + bzero(&fl, sizeof(fl)); + flow_daddr.daddr = ip->ip_dst.s_addr; + + /* + * ip_route_output_key() should increment + * r->u.dst.__use and call a dst_hold(dst) + * XXX verify how we release the resources. + */ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,38) /* check boundary */ + r = ip_route_output_key(&init_net, &fl.u.ip4); +#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26) /* check boundary */ + ret = ip_route_output_key(&init_net, &r, &fl); +#else + ret = ip_route_output_key(&r, &fl); +#endif + if (ret != 0 || r == NULL ) { + printf("NO ROUTE FOUND\n"); + return NULL; + } + + /* allocate the skbuff and the data */ + skb = alloc_skb(len + sizeof(struct ethhdr), GFP_ATOMIC); + if (skb == NULL) { + printf("%s: can not allocate SKB buffers.\n", __FUNCTION__); + return NULL; + } + + skb->protocol = htons(ETH_P_IP); // XXX 8 or 16 bit ? + /* sk_dst_set XXX take the lock (?) 
*/ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36) + skb_dst_set(skb, &r->u.dst); +#else + skb_dst_set(skb, &r->dst); +#endif + skb->dev = skb_dst(skb)->dev; + + /* reserve space for ethernet header */ + skb_reserve(skb, sizeof(struct ethhdr)); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) + skb_reset_network_header(skb); // skb->network_header = skb->data - skb->head +#else + skb->nh.raw = skb->data; +#endif + /* set skbuff tail pointers and copy content */ + skb_put(skb, len); + memcpy(skb->data, m->m_data, len); + + return skb; +} +#endif /* keepalives not supported on linux 2.4 */ + +/* + * This function is called to reinject packets to the + * kernel stack within the linux netfilter system + * or to send a new created mbuf. + * In the first case we have a valid sk_buff pointer + * encapsulated within the fake mbuf, so we can call + * the reinject function trough netisr_dispatch. + * In the last case we need to build a sk_buff from scratch, + * before sending out the packet. + */ +int +ip_output(struct mbuf *m, struct mbuf __unused *opt, + struct route __unused *ro, int __unused flags, + struct ip_moptions __unused *imo, struct inpcb __unused *inp) +{ + if ( m->m_skb != NULL ) { /* reinjected packet, just call dispatch */ + netisr_dispatch(0, m); + } else { + /* self-generated packet, wrap as appropriate and send */ +#ifdef __linux__ + struct sk_buff *skb = mbuf2skbuff(m); + + if (skb != NULL) + dst_output(skb); +#else /* Windows */ +#endif + FREE_PKT(m); + } + return 0; +} + +/* + * setsockopt hook has no return value other than the error code. + */ +int +do_ipfw_set_ctl(struct sock __unused *sk, int cmd, + void __user *user, unsigned int len) +{ + struct sockopt s; /* pass arguments */ + return ipfw_ctl_h(&s, cmd, SOPT_SET, len, user); +} + +/* + * getsockopt can can return a block of data in response. 
+ */ +int +do_ipfw_get_ctl(struct sock __unused *sk, + int cmd, void __user *user, int *len) +{ + struct sockopt s; /* pass arguments */ + int ret = ipfw_ctl_h(&s, cmd, SOPT_GET, *len, user); + + *len = s.sopt_valsize; /* return lenght back to the caller */ + return ret; +} + +#ifdef __linux__ + +/* + * declare our [get|set]sockopt hooks + */ +static struct nf_sockopt_ops ipfw_sockopts = { + .pf = PF_INET, + .set_optmin = _IPFW_SOCKOPT_BASE, + .set_optmax = _IPFW_SOCKOPT_END, + .set = do_ipfw_set_ctl, + .get_optmin = _IPFW_SOCKOPT_BASE, + .get_optmax = _IPFW_SOCKOPT_END, + .get = do_ipfw_get_ctl, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) + .owner = THIS_MODULE, +#endif +}; + +/*---- + * We need a number of macros to adapt to the various APIs in + * different linux versions. Among them: + * + * - the hook names change between macros (NF_IP*) and enum NF_INET_* + * + * - the second argument to the netfilter hook is + * struct sk_buff ** in kernels <= 2.6.22 + * struct sk_buff * in kernels > 2.6.22 + * + * - NF_STOP is not defined before 2.6 so we remap it to NF_ACCEPT + * + * - the packet descriptor passed to the queue handler is + * struct nf_info in kernels <= 2.6.24 + * struct nf_queue_entry in kernels <= 2.6.24 + * + * - the arguments to the queue handler also change; + */ + +/* + * declare hook to grab packets from the netfilter interface. + * The NF_* names change in different versions of linux, in some + * cases they are #defines, in others they are enum, so we + * need to adapt. + */ +#ifndef NF_IP_PRE_ROUTING +#define NF_IP_PRE_ROUTING NF_INET_PRE_ROUTING +#endif +#ifndef NF_IP_POST_ROUTING +#define NF_IP_POST_ROUTING NF_INET_POST_ROUTING +#endif + +/* + * ipfw hooks into the POST_ROUTING and the PRE_ROUTING chains. + * PlanetLab sets skb_tag to the slice id in the LOCAL_INPUT and + * POST_ROUTING chains, so if we want to use that information we + * need to hook the LOCAL_INPUT chain instead of the PRE_ROUTING. 
/*
 * The main netfilter hook.
 * To make life simple, we queue everything and then do all the
 * decision in the queue handler.
 *
 * Every argument is unused: the function unconditionally returns
 * NF_QUEUE.
 *
 * XXX note that in 2.4 and up to 2.6.22 the skbuf is passed as sk_buff**
 * so we have an #ifdef to set the proper argument type.
 */
static unsigned int
call_ipfw(unsigned int __unused hooknum,
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) // in 2.6.22 we have **
	struct sk_buff  __unused **skb,
#else
	struct sk_buff  __unused *skb,
#endif
	const struct net_device  __unused *in,
	const struct net_device  __unused *out,
	int __unused (*okfn)(struct sk_buff *))
{
	return NF_QUEUE;
}
2.6.24 */ +#define QH_ARGS struct sk_buff *skb, struct nf_info *info, unsigned int qnum, void *data +#endif + +#define DEFINE_SKB /* nothing, already an argument */ +#define REINJECT(_inf, _verd) nf_reinject(skb, _inf, _verd) + +#else /* 2.6.25 and above */ + +#define QH_ARGS struct nf_queue_entry *info, unsigned int queuenum +#define DEFINE_SKB struct sk_buff *skb = info->skb; +#define REINJECT(_inf, _verd) nf_reinject(_inf, _verd) +#endif + +/* + * used by dummynet when dropping packets + * XXX use dummynet_send() + */ +void +reinject_drop(struct mbuf* m) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) /* unsure on the exact boundary */ + struct sk_buff *skb = (struct sk_buff *)m; +#endif + REINJECT(m->queue_entry, NF_DROP); +} + +/* + * The real call to the firewall. nf_queue_entry points to the skbuf, + * and eventually we need to return both through nf_reinject(). + */ +static int +ipfw2_queue_handler(QH_ARGS) +{ + DEFINE_SKB /* no semicolon here, goes in the macro */ + int ret = 0; /* return value */ + struct mbuf *m; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) + if (skb->nh.iph == NULL) { + printf("null dp, len %d reinject now\n", skb->len); + REINJECT(info, NF_ACCEPT); + return 0; + } +#endif + m = malloc(sizeof(*m), 0, 0); + if (m == NULL) { + printf("malloc fail, len %d reinject now\n", skb->len); + REINJECT(info, NF_ACCEPT); + return 0; + } + + m->m_skb = skb; + m->m_len = skb->len; /* len from ip header to end */ + m->m_pkthdr.len = skb->len; /* total packet len */ + m->m_pkthdr.rcvif = info->indev; + m->queue_entry = info; +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) /* XXX was 2.6.0 */ + m->m_data = skb->nh.iph; +#else + m->m_data = skb_network_header(skb); +#endif + + /* XXX add the interface */ + if (info->hook == IPFW_HOOK_IN) { + ret = ipfw_check_hook(NULL, &m, info->indev, PFIL_IN, NULL); + } else { + ret = ipfw_check_hook(NULL, &m, info->outdev, PFIL_OUT, NULL); + } + + if (m != NULL) { /* Accept. 
reinject and free the mbuf */ + REINJECT(info, NF_ACCEPT); + m_freem(m); + } else if (ret == 0) { + /* dummynet has kept the packet, will reinject later. */ + } else { + /* + * Packet dropped by ipfw or dummynet. Nothing to do as + * FREE_PKT already did a reinject as NF_DROP + */ + } + return 0; +} + +struct route; +struct ip_moptions; +struct inpcb; + +/* XXX should include prototypes for netisr_dispatch and ip_output */ +/* + * The reinjection routine after a packet comes out from dummynet. + * We must update the skb timestamp so ping reports the right time. + * This routine is also used (with num == -1) as FREE_PKT. XXX + */ +void +netisr_dispatch(int num, struct mbuf *m) +{ + struct nf_queue_entry *info = m->queue_entry; + struct sk_buff *skb = m->m_skb; /* always used */ + + /* + * This function can be called by the FREE_PKT() + * used when ipfw generate their own mbuf packets + * or by the mbuf2skbuff() function. + */ + m_freem(m); + + /* XXX check + * info is null in the case of a real mbuf + * (one created by the ipfw code without a + * valid sk_buff pointer + */ + if (info == NULL) + return; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) // XXX above 2.6.x ? + __net_timestamp(skb); /* update timestamp */ +#endif + + /* XXX to obey one-pass, possibly call the queue handler here */ + REINJECT(info, ((num == -1)?NF_DROP:NF_STOP)); /* accept but no more firewall */ +} + +/* + * socket lookup function for linux. + * This code is used to associate uid, gid, jail/xid to packets, + * and store the info in a cache *ugp where they can be accessed quickly. + * The function returns 1 if the info is found, -1 otherwise. + * + * We do this only on selected protocols: TCP, ... 
+ * + * The chain is the following + * sk_buff* sock* socket* file* + * skb -> sk ->sk_socket->file ->f_owner ->pid + * skb -> sk ->sk_socket->file ->f_uid (direct) + * skb -> sk ->sk_socket->file ->f_cred->fsuid (2.6.29+) + * + * Related headers: + * linux/skbuff.h struct skbuff + * net/sock.h struct sock + * linux/net.h struct socket + * linux/fs.h struct file + * + * With vserver we may have sk->sk_xid and sk->sk_nid that + * which we store in fw_groups[1] (matches O_JAIL) and fw_groups[2] + * (no matches yet) + * + * Note- for locally generated, outgoing packets we should not need + * need a lookup because the sk_buff already points to the socket where + * the info is. + */ +extern struct inet_hashinfo tcp_hashinfo; +int +linux_lookup(const int proto, const __be32 saddr, const __be16 sport, + const __be32 daddr, const __be16 dport, + struct sk_buff *skb, int dir, struct bsd_ucred *u) +{ +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,13) /* XXX was 2.6.0 */ + return -1; +#else + struct sock *sk; + int ret = -1; /* default return value */ + int st = -1; /* state */ + + + if (proto != IPPROTO_TCP) /* XXX extend for UDP */ + return -1; + + if ((dir ? (void *)skb_dst(skb) : (void *)skb->dev) == NULL) { + panic(" -- this should not happen\n"); + return -1; + } + + if (skb->sk) { + sk = skb->sk; + } else { + /* + * Try a lookup. On a match, sk has a refcount that we must + * release on exit (we know it because skb->sk = NULL). + * + * inet_lookup above 2.6.24 has an additional 'net' parameter + * so we use a macro to conditionally supply it. + * swap dst and src depending on the direction. + */ +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,24) +#define _OPT_NET_ARG +#else +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) +/* there is no dev_net() on 2.6.25 */ +#define _OPT_NET_ARG (skb->dev->nd_net), +#else /* 2.6.26 and above */ +#define _OPT_NET_ARG dev_net(skb->dev), +#endif +#endif + sk = (dir) ? 
/* dir != 0 on output */ + inet_lookup(_OPT_NET_ARG &tcp_hashinfo, + daddr, dport, saddr, sport, // match outgoing + inet_iif(skb)) : + inet_lookup(_OPT_NET_ARG &tcp_hashinfo, + saddr, sport, daddr, dport, // match incoming + skb->dev->ifindex); +#undef _OPT_NET_ARG + + if (sk == NULL) /* no match, nothing to be done */ + return -1; + } + ret = 1; /* retrying won't make things better */ + st = sk->sk_state; +#ifdef CONFIG_VSERVER + u->xid = sk->sk_xid; + u->nid = sk->sk_nid; +#else + u->xid = u->nid = 0; +#endif + /* + * Exclude tcp states where sk points to a inet_timewait_sock which + * has no sk_socket field (surely TCP_TIME_WAIT, perhaps more). + * To be safe, use a whitelist and not a blacklist. + * Before dereferencing sk_socket grab a lock on sk_callback_lock. + * + * Once again we need conditional code because the UID and GID + * location changes between kernels. + */ +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,28) +/* use the current's real uid/gid */ +#define _CURR_UID f_uid +#define _CURR_GID f_gid +#else /* 2.6.29 and above */ +/* use the current's file access real uid/gid */ +#define _CURR_UID f_cred->fsuid +#define _CURR_GID f_cred->fsgid +#endif + +#define GOOD_STATES ( \ + (1<sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) { + u->uid = sk->sk_socket->file->_CURR_UID; + u->gid = sk->sk_socket->file->_CURR_GID; + } + read_unlock_bh(&sk->sk_callback_lock); + } else { + u->uid = u->gid = 0; + } + if (!skb->sk) /* return the reference that came from the lookup */ + sock_put(sk); +#undef GOOD_STATES +#undef _CURR_UID +#undef _CURR_GID + return ret; + +#endif /* LINUX > 2.4 */ +} + +/* + * Now prepare to hook the various functions. 
+ * Linux 2.4 has a different API so we need some adaptation
+ * for register and unregister hooks
+ *
+ * the unregister function changed arguments between 2.6.22 and 2.6.24
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14)
+struct nf_queue_handler ipfw2_queue_handler_desc = {
+        .outfn = ipfw2_queue_handler,
+        .name = "ipfw2 dummynet queue",
+};
+#define REG_QH_ARG(fn)	&(fn ## _desc)
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) /* XXX was 2.6.0 */
+/*
+ * Register the n hooks in ops[]. If one registration fails, the hooks
+ * registered so far are removed again, so on error nothing is left
+ * behind. Returns 0 on success or the error from nf_register_hook().
+ */
+static int
+nf_register_hooks(struct nf_hook_ops *ops, int n)
+{
+	int i, ret = 0;
+	for (i = 0; i < n; i++) {
+		ret = nf_register_hook(ops + i);
+		if (ret < 0) {
+			/* undo the registrations done so far */
+			while (--i >= 0)
+				nf_unregister_hook(ops + i);
+			break;
+		}
+	}
+	return ret;
+}
+
+/* Unregister the n hooks in ops[]. */
+static void
+nf_unregister_hooks(struct nf_hook_ops *ops, int n)
+{
+	int i;
+	for (i = 0; i < n; i++) {
+		nf_unregister_hook(ops + i);
+	}
+}
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) /* XXX was 2.6.0 */
+#define REG_QH_ARG(fn)	fn, NULL	/* argument for nf_[un]register_queue_handler */
+#endif
+#define UNREG_QH_ARG(fn) //fn	/* argument for nf_[un]register_queue_handler */
+#define SET_MOD_OWNER
+
+#else /* linux > 2.6.17 */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+#define UNREG_QH_ARG(fn) //fn	/* argument for nf_[un]register_queue_handler */
+#else
+#define UNREG_QH_ARG(fn)	, &(fn ## _desc)
+#endif /* 2.6.0 < LINUX > 2.6.24 */
+
+#define SET_MOD_OWNER .owner = THIS_MODULE,
+
+#endif	/* !LINUX < 2.6.0 */
+
+static struct nf_hook_ops ipfw_ops[] __read_mostly = {
+        {
+                .hook           = call_ipfw,
+                .pf             = PF_INET,
+                .hooknum        = IPFW_HOOK_IN,
+                .priority       = NF_IP_PRI_FILTER,
+                SET_MOD_OWNER
+        },
+        {
+                .hook           = call_ipfw,
+                .pf             = PF_INET,
+                .hooknum        = NF_IP_POST_ROUTING,
+                .priority       = NF_IP_PRI_FILTER,
+		SET_MOD_OWNER
+        },
+};
+#endif /* __linux__ */
+
+/* descriptors for the children, until i find a way for the
+ * linker to produce them
+ */
+extern moduledata_t *moddesc_ipfw;
+extern moduledata_t *moddesc_dummynet;
+extern moduledata_t *moddesc_dn_fifo;
+extern moduledata_t *moddesc_dn_wf2qp;
+extern moduledata_t *moddesc_dn_rr;
+extern moduledata_t *moddesc_dn_qfq;
+extern moduledata_t *moddesc_dn_prio;
+extern void *sysinit_ipfw_init;
+extern void *sysuninit_ipfw_destroy;
+extern void *sysinit_vnet_ipfw_init;
+extern void *sysuninit_vnet_ipfw_uninit;
+
+/*
+ * Module glue - init and exit function.
+ *
+ * ipfw_module_init() registers the child modules and starts them,
+ * then (on linux) registers the sockopt, the queue handler and the
+ * netfilter hooks. On linux a failure undoes, in reverse order, only
+ * the steps that had succeeded and returns the error code.
+ */
+int __init
+ipfw_module_init(void)
+{
+	int ret = 0;
+#ifdef _WIN32
+	unsigned long resolution;
+#endif
+
+	rn_init(64);
+	my_mod_register("ipfw",  1, moddesc_ipfw, NULL, NULL);
+	my_mod_register("sy_ipfw",  2, NULL,
+		sysinit_ipfw_init, sysuninit_ipfw_destroy);
+	my_mod_register("sy_Vnet_ipfw",  3, NULL,
+		sysinit_vnet_ipfw_init, sysuninit_vnet_ipfw_uninit);
+	my_mod_register("dummynet",  4, moddesc_dummynet, NULL, NULL);
+	my_mod_register("dn_fifo",  5, moddesc_dn_fifo, NULL, NULL);
+	my_mod_register("dn_wf2qp",  6, moddesc_dn_wf2qp, NULL, NULL);
+	my_mod_register("dn_rr",  7, moddesc_dn_rr, NULL, NULL);
+	my_mod_register("dn_qfq",  8, moddesc_dn_qfq, NULL, NULL);
+	my_mod_register("dn_prio",  9, moddesc_dn_prio, NULL, NULL);
+	init_children();
+
+#ifdef _WIN32
+	resolution = ExSetTimerResolution(1, TRUE);
+	/* resolution is an unsigned long, print it as such */
+	printf("*** ExSetTimerResolution: resolution set to %lu n-sec ***\n",resolution);
+#endif
+#ifdef EMULATE_SYSCTL
+	keinit_GST();
+#endif
+
+#ifdef __linux__
+	/* sockopt register, in order to talk with user space */
+	ret = nf_register_sockopt(&ipfw_sockopts);
+        if (ret < 0) {
+		printf("error %d in nf_register_sockopt\n", ret);
+		goto clean_modules;
+	}
+
+	/* queue handler registration, in order to get network
+	 * packet under a private queue */
+	ret = nf_register_queue_handler(PF_INET, REG_QH_ARG(ipfw2_queue_handler) );
+        if (ret < 0)	/* queue busy */
+		goto unregister_sockopt;
+
+        ret = nf_register_hooks(ipfw_ops, ARRAY_SIZE(ipfw_ops));
+        if (ret < 0)
+		goto unregister_queue_handler;
+
+	printf("%s loaded\n", __FUNCTION__);
+	return 0;
+
+
+/* handle errors on load */
+unregister_queue_handler:
+	/*
+	 * Only reached when our own handler was registered. When the
+	 * registration itself fails (queue busy) we must not get here:
+	 * on kernels where the unregister call takes just the protocol
+	 * family it would remove the handler of whoever owns the queue.
+	 */
+	nf_unregister_queue_handler(PF_INET  UNREG_QH_ARG(ipfw2_queue_handler) );
+
+unregister_sockopt:
+	nf_unregister_sockopt(&ipfw_sockopts);
+
+clean_modules:
+	fini_children();
+	printf("%s error\n", __FUNCTION__);
+
+#endif	/* __linux__ */
+	return ret;
+}
+
+/* module shutdown: undo what ipfw_module_init() did */
+void __exit
+ipfw_module_exit(void)
+{
+#ifdef EMULATE_SYSCTL
+	keexit_GST();
+#endif
+#ifdef _WIN32
+	ExSetTimerResolution(0,FALSE);
+
+#else  /* linux hook */
+        nf_unregister_hooks(ipfw_ops, ARRAY_SIZE(ipfw_ops));
+	/* maybe drain the queue before unregistering ? */
+	nf_unregister_queue_handler(PF_INET  UNREG_QH_ARG(ipfw2_queue_handler) );
+	nf_unregister_sockopt(&ipfw_sockopts);
+#endif	/* __linux__ */
+
+	fini_children();
+
+	printf("%s unloaded\n", __FUNCTION__);
+}
+
+#ifdef __linux__
+module_init(ipfw_module_init)
+module_exit(ipfw_module_exit)
+MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */
+#endif
diff --git a/dummynet2/md_win.c b/dummynet2/md_win.c
new file mode 100644
index 0000000..0b8af00
--- /dev/null
+++ b/dummynet2/md_win.c
@@ -0,0 +1,630 @@
+/*
+ * Copyright (C) 2010 Luigi Rizzo, Francesco Magno, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * kernel variables and functions that are not available in Windows.
+ */
+
+/*
+ * NOTE(review): the header names of the #include lines below are
+ * missing in this copy of the patch; restore them from the upstream
+ * md_win.c before applying.
+ */
+#include /* provides PFIL_IN and PFIL_OUT */
+#include
+#include /* in_addr */
+#include
+#include
+#include
+
+/* credentials check: not supported here, never matches (returns 0) */
+int
+cred_check(void *_insn, int proto, struct ifnet *oif,
+	struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
+	u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp,
+	struct sk_buff *skb)
+{
+	return 0;
+}
+
+/*
+ * as good as anywhere, place here the missing calls
+ */
+
+/*
+ * Allocate 'size' bytes of zero-filled non paged pool.
+ * Returns NULL if size is negative or the allocation fails.
+ */
+void *
+my_alloc(int size)
+{
+	void *_ret;
+
+	if (size < 0)	/* would be converted to a huge unsigned request */
+		return NULL;
+	_ret = ExAllocatePoolWithTag(NonPagedPool, size, 'wfpi');
+	if (_ret)
+		memset(_ret, 0, size);
+	return _ret;
+}
+
+/*
+ * Print the message and stop here forever.
+ * The format string is printed as is, the extra arguments are ignored.
+ */
+void
+panic(const char *fmt, ...)
+{
+	printf("%s", fmt);
+	for (;;);
+}
+
+int securelevel = 0;
+
+/*
+ * Return the position (starting from 1) of the least significant
+ * bit set in 'bits', or 0 if no bit is set.
+ */
+int ffs(int bits)
+{
+	int i;
+	if (bits == 0)
+		return (0);
+	for (i = 1; ; i++, bits >>= 1) {
+		if (bits & 1)
+			break;
+	}
+	return (i);
+}
+
+/*
+ * Fill tv with the system time, split in seconds and microseconds.
+ * Between two updates of the system time the value is refined using
+ * the performance counter. The state is kept in static variables
+ * which are not protected by a lock.
+ */
+void
+do_gettimeofday(struct timeval *tv)
+{
+	static LARGE_INTEGER prevtime; //system time in 100-nsec resolution
+	static LARGE_INTEGER prevcount; //RTC counter value
+	static LARGE_INTEGER freq; //frequency
+
+	LARGE_INTEGER currtime;
+	LARGE_INTEGER currcount;
+	if (prevtime.QuadPart == 0) { //first time we ask for system time
+		KeQuerySystemTime(&prevtime);
+		prevcount = KeQueryPerformanceCounter(&freq);
+		currtime.QuadPart = prevtime.QuadPart;
+	} else {
+		KeQuerySystemTime(&currtime);
+		currcount = KeQueryPerformanceCounter(&freq);
+		if (currtime.QuadPart == prevtime.QuadPart) {
+			//time has NOT changed, calculate time using ticks and DO NOT update
+			LONGLONG difftime = 0; //difference in 100-nsec
+			LONGLONG diffcount = 0; //clock count difference
+			//printf("time has NOT changed\n");
+			diffcount = currcount.QuadPart - prevcount.QuadPart;
+			diffcount *= 10000000;
+			difftime = diffcount / freq.QuadPart;
+			currtime.QuadPart += difftime;
+		} else {
+			//time has changed, update and return SystemTime
+			//printf("time has changed\n");
+			prevtime.QuadPart = currtime.QuadPart;
+			prevcount.QuadPart = currcount.QuadPart;
+		}
+	}
+	currtime.QuadPart /= 10; //convert in usec
+	tv->tv_sec = currtime.QuadPart / (LONGLONG)1000000;
+	tv->tv_usec = currtime.QuadPart % (LONGLONG)1000000;
+	//printf("sec %d usec %d\n",tv->tv_sec, tv->tv_usec);
+}
+
+/*
+ * Return the number of seconds elapsed since boot.
+ * KeQueryInterruptTime() counts in units of 100 nsec, so there are
+ * 10,000,000 of them in one second. The previous code divided the
+ * wall clock time by 1,000,000, which gives tenths of a second
+ * counted from year 1601 and does not fit in an int.
+ */
+int time_uptime_w32()
+{
+	ULONGLONG ticks;	/* 100-nsec units since boot */
+
+	ticks = KeQueryInterruptTime();
+	return (int)(ticks / 10000000);
+}
+
+
+/*
+ * Windows version of firewall hook. We receive a partial copy of
+ * the packet which points to the original buffers. In output,
+ * the refcount has been already incremented.
+ * The function reconstructs
+ * the whole packet in a contiguous memory area, builds a fake mbuf,
+ * calls the firewall, does the eventual cleaning and returns
+ * to MiniportSend or ProtocolReceive, which will silently return
+ * (dropping packet) or continue its execution (allowing packet).
+ * The memory area contains:
+ * - the fake mbuf, filled with data needed by ipfw, and information
+ *   for reinjection
+ * - the packet data
+ *
+ * Returns PASS, DUMMYNET or DROP. Frames shorter than the 14 bytes
+ * of the MAC header cannot be IP and are passed without calling
+ * the firewall.
+ */
+void hexdump(PUCHAR,int, const char *);
+static char _if_in[] = "incoming";
+static char _if_out[] = "outgoing";
+
+int
+ipfw2_qhandler_w32(PNDIS_PACKET pNdisPacket, int direction,
+	NDIS_HANDLE Context)
+{
+	unsigned int		BufferCount = 0;
+	unsigned		TotalPacketLength = 0;
+	PNDIS_BUFFER		pCurrentBuffer = NULL;
+	PNDIS_BUFFER		pNextBuffer = NULL;
+	struct mbuf*		m;
+	unsigned char*		payload = NULL;
+	unsigned int		ofs, l;
+	unsigned short		EtherType = 0;
+	unsigned int		i = 0;
+	int			ret = 0;
+	PNDIS_BUFFER		pNdisBuffer, old_head, old_tail;
+	NDIS_HANDLE		PacketPool;
+	PADAPT			pAdapt;
+	NDIS_STATUS		Status;
+
+	/* In NDIS, packets are a chain of NDIS_BUFFER. We query
+	 * the packet to get a pointer of chain's head, the length
+	 * of the chain, and the length of the packet itself.
+	 * Then allocate a buffer for the mbuf and the payload.
+	 */
+	NdisQueryPacket(pNdisPacket, NULL, &BufferCount,
+		&pCurrentBuffer, &TotalPacketLength);
+	/* no room for a MAC header: the length below would wrap around
+	 * and the EtherType would be read past the end of the data
+	 */
+	if (TotalPacketLength < 14)
+		return PASS;
+	m = malloc(sizeof(struct mbuf) + TotalPacketLength, 0, 0 );
+	if (m == NULL) //resource shortage, drop the packet
+		goto drop_pkt;
+
+	/* set mbuf fields to point past the MAC header.
+	 * Also set additional W32 info
+	 */
+	payload = (unsigned char*)(m + 1);
+	m->m_len = m->m_pkthdr.len = TotalPacketLength-14;
+	m->m_pkthdr.rcvif = (void *)((direction==INCOMING) ? _if_in : NULL);
+	m->m_data = payload + 14; /* past the MAC header */
+	m->direction = direction;
+	m->context = Context;
+	m->pkt = pNdisPacket;
+
+	/*
+	 * Now copy the data from the Windows buffers to the mbuf.
+	 */
+	for (i=0, ofs = 0; i < BufferCount; i++) {
+		unsigned char* src;
+		NdisQueryBufferSafe(pCurrentBuffer, &src, &l,
+			NormalPagePriority);
+		/* the mapping can fail when the system is low on resources */
+		if (src == NULL)
+			goto drop_pkt;
+		bcopy(src, payload + ofs, l);
+		ofs += l;
+		NdisGetNextBuffer(pCurrentBuffer, &pNextBuffer);
+		pCurrentBuffer = pNextBuffer;
+	}
+	/*
+	 * Identify EtherType. If the packet is not IP, simply allow
+	 * and don't bother the firewall. XXX should be done before.
+	 */
+	EtherType = *(unsigned short*)(payload + 12);
+	EtherType = RtlUshortByteSwap(EtherType);
+	if (EtherType != 0x0800) {
+		//DbgPrint("ethertype = %X, skipping ipfw\n",EtherType);
+		free(m, 0);
+		return PASS;
+	}
+
+	/*
+	 * Now build a buffer descriptor to replace the original chain.
+	 */
+	pAdapt = Context;
+	PacketPool = direction == OUTGOING ?
+		pAdapt->SendPacketPoolHandle : pAdapt->RecvPacketPoolHandle;
+	NdisAllocateBuffer(&Status, &pNdisBuffer,
+		PacketPool, payload, m->m_pkthdr.len+14);
+	if (Status != NDIS_STATUS_SUCCESS)
+		goto drop_pkt;
+	/*
+	 * Save the old buffer pointers, and put the new one
+	 * into the chain.
+	 */
+	pNdisBuffer->Next = NULL;
+	old_head = NDIS_PACKET_FIRST_NDIS_BUFFER(pNdisPacket);
+	old_tail = NDIS_PACKET_LAST_NDIS_BUFFER(pNdisPacket);
+	NdisReinitializePacket(pNdisPacket);
+	NdisChainBufferAtFront(pNdisPacket, pNdisBuffer);
+#if 0
+	if (direction == INCOMING) {
+		DBGPRINT(("incoming: proto %u (%s), src %08X, dst %08X, sport %u, dport %u, len %u\n", *(payload+14+9), texify_proto(*(payload+14+9)), *(unsigned int*)(payload+14+12), *(unsigned int*)(payload+14+16), ntohs((*((unsigned short int*)(payload+14+20)))), ntohs((*((unsigned short int*)(payload+14+22)))), TotalPacketLength));
+	} else {
+		DBGPRINT(("outgoing: proto %u (%s), src %08X, dst %08X, sport %u, dport %u, len %u\n", *(payload+14+9), texify_proto(*(payload+14+9)), *(unsigned int*)(payload+14+12), *(unsigned int*)(payload+14+16), ntohs((*((unsigned short int*)(payload+14+20)))), ntohs((*((unsigned short int*)(payload+14+22)))), TotalPacketLength));
+	}
+#endif
+	if (direction == INCOMING)
+		ret = ipfw_check_hook(NULL, &m, NULL, PFIL_IN, NULL);
+	else
+		ret = ipfw_check_hook(NULL, &m, (struct ifnet*)_if_out, PFIL_OUT, NULL);
+
+	if (m != NULL) {
+		/* Accept. Restore the old buffer chain, free
+		 * the mbuf and return PASS.
+		 */
+		//DBGPRINT(("accepted\n"));
+		NdisReinitializePacket(pNdisPacket);
+		NDIS_PACKET_FIRST_NDIS_BUFFER(pNdisPacket) = old_head;
+		NDIS_PACKET_LAST_NDIS_BUFFER(pNdisPacket) = old_tail;
+		NdisFreeBuffer(pNdisBuffer);
+		m_freem(m);
+		return PASS;
+	} else if (ret == 0) {
+		/* dummynet has kept the packet, will reinject later. */
+		//DBGPRINT(("kept by dummynet\n"));
+		return DUMMYNET;
+	} else {
+		/*
+		 * Packet dropped by ipfw or dummynet. Nothing to do as
+		 * FREE_PKT already freed the fake mbuf
+		 */
+		//DBGPRINT(("dropped by dummynet, ret = %i\n", ret));
+		return DROP;
+	}
+drop_pkt:
+	/* for some reason we cannot proceed. Free any resources
+	 * including those received from above, and return
+	 * faking success. XXX this must be fixed later.
+	 */
+	if (m != NULL)	/* not yet handed to the firewall, still ours */
+		free(m, 0);
+	NdisFreePacket(pNdisPacket);
+	return DROP;
+}
+
+/*
+ * Windows reinjection function.
+ * The packet is already available as m->pkt, so we only
+ * need to send it to the right place.
+ * Normally a ndis intermediate driver allocates
+ * a fresh descriptor, while the actual data's ownership is
+ * retained by the protocol, or the miniport below.
+ * Since an intermediate driver behaves as a miniport driver
+ * at the upper edge (towards the protocol), and as a protocol
+ * driver at the lower edge (towards the NIC), when we handle a
+ * packet we have a reserved area in both directions (we can use
+ * only one for each direction at our own discretion).
+ * Normally this area is used to save a pointer to the original
+ * packet, so when the driver is done with it, the original descriptor
+ * can be retrieved, and the resources freed (packet descriptor,
+ * buffer descriptor(s) and the actual data). In our driver this
+ * area is used to mark the reinjected packets as 'orphan', because
+ * the original descriptor is gone long ago. This way we can handle
+ * correctly the resource freeing when the callback function
+ * is called by NDIS.
+ *
+ * A negative 'num' means that the packet must only be freed.
+ */
+
+void
+netisr_dispatch(int num, struct mbuf *m)
+{
+	unsigned char*		payload = (unsigned char*)(m+1);
+	PADAPT				pAdapt = m->context;
+	NDIS_STATUS			Status;
+	PNDIS_PACKET		pPacket = m->pkt;
+	PNDIS_BUFFER		pNdisBuffer;
+
+	/* NOTE(review): on this path OutstandingSends is not touched,
+	 * check whether an outgoing packet needs the decrement here too.
+	 */
+	if (num < 0)
+		goto drop_pkt;
+
+	//debug print
+#if 0
+	DbgPrint("reinject %s\n", m->direction == OUTGOING ?
+		"outgoing" : "incoming");
+#endif
+	NdisAcquireSpinLock(&pAdapt->Lock);
+	if (m->direction == OUTGOING) {
+		//we must first check if the adapter is going down,
+		// in this case abort the reinjection
+		if (pAdapt->PTDeviceState > NdisDeviceStateD0) {
+			pAdapt->OutstandingSends--;
+			// XXX should we notify up ?
+			NdisReleaseSpinLock(&pAdapt->Lock);
+			goto drop_pkt;
+		}
+	} else {
+		/* if the upper miniport edge is not initialized or
+		 * the miniport edge is in low power state, abort
+		 * XXX we should notify the error.
+		 */
+		if (!pAdapt->MiniportHandle ||
+		    pAdapt->MPDeviceState > NdisDeviceStateD0) {
+			NdisReleaseSpinLock(&pAdapt->Lock);
+			goto drop_pkt;
+		}
+	}
+	NdisReleaseSpinLock(&pAdapt->Lock);
+
+	if (m->direction == OUTGOING) {
+		PSEND_RSVD	SendRsvd;
+		/* use the 8-bytes protocol reserved area, the first
+		 * field is used to mark the packet as 'orphan', the
+		 * second stores the pointer to the mbuf, so in the
+		 * SendComplete handler we know that this is a
+		 * reinjected packet and can free correctly.
+		 */
+		SendRsvd = (PSEND_RSVD)(pPacket->ProtocolReserved);
+		SendRsvd->OriginalPkt = NULL;
+		SendRsvd->pMbuf = m;
+		//do the actual send
+		NdisSend(&Status, pAdapt->BindingHandle, pPacket);
+		if (Status != NDIS_STATUS_PENDING) {
+			/* done, call the callback now */
+			PtSendComplete(m->context, m->pkt, Status);
+		}
+		return; /* unconditional return here. */
+	} else {
+		/* There's no need to check the 8-bytes miniport
+		 * reserved area since the path going up will be always
+		 * synchronous, and all the cleanup will be done inline.
+		 * If the reinjected packet comes from a PtReceivePacket,
+		 * there will be no callback.
+		 * Otherwise PtReceiveComplete will be called but will just
+		 * return since all the cleaning is already done */
+		// do the actual receive.
+		ULONG Proc = KeGetCurrentProcessorNumber();
+		pAdapt->ReceivedIndicationFlags[Proc] = TRUE;
+		NdisMEthIndicateReceive(pAdapt->MiniportHandle, NULL, payload, 14, payload+14, m->m_len, m->m_len);
+		NdisMEthIndicateReceiveComplete(pAdapt->MiniportHandle);
+		pAdapt->ReceivedIndicationFlags[Proc] = FALSE;
+	}
+drop_pkt:
+	/* NDIS_PACKET exists and must be freed only if
+	 * the packet comes from a PtReceivePacket, otherwise
+	 * m->pkt will be null.
+	 */
+	if (m->pkt != NULL)
+	{
+		NdisUnchainBufferAtFront(m->pkt, &pNdisBuffer);
+		NdisFreeBuffer(pNdisBuffer);
+		NdisFreePacket(m->pkt);
+	}
+	m_freem(m);
+}
+
+void win_freem(void *);	/* wrapper for m_freem() for protocol.c */
+void
+win_freem(void *_m)
+{
+	struct mbuf *m = _m;
+	m_freem(m);
+}
+
+/*
+ * not implemented in linux.
+ * taken from /usr/src/lib/libc/string/strlcpy.c
+ *
+ * Copy src to dst, writing at most siz bytes including the final NUL.
+ * Returns the length of src, so the result was truncated if the
+ * return value is >= siz.
+ */
+size_t
+strlcpy(char *dst, const char *src, size_t siz)
+{
+        char *d = dst;
+        const char *s = src;
+        size_t n = siz;
+
+        /* Copy as many bytes as will fit */
+        if (n != 0 && --n != 0) {
+                do {
+                        if ((*d++ = *s++) == 0)
+                                break;
+                } while (--n != 0);
+        }
+
+        /* Not enough room in dst, add NUL and traverse rest of src */
+        if (n == 0) {
+                if (siz != 0)
+                        *d = '\0';              /* NUL-terminate dst */
+                while (*s++)
+                        ;
+        }
+
+        return(s - src - 1);    /* count does not include NUL */
+}
+
+/*
+ * Release what belongs to a reinjected packet: the buffer descriptor
+ * at the head of the packet, the mbuf, the packet itself, and one
+ * pending send on the adapter.
+ */
+void CleanupReinjected(PNDIS_PACKET Packet, struct mbuf* m, PADAPT pAdapt)
+{
+	PNDIS_BUFFER pNdisBuffer;
+
+	NdisQueryPacket(Packet, NULL, NULL, &pNdisBuffer, NULL);
+	NdisUnchainBufferAtFront(Packet, &pNdisBuffer);
+	NdisFreeBuffer(pNdisBuffer);
+	win_freem(m);
+	NdisFreePacket(Packet);
+	ADAPT_DECR_PENDING_SENDS(pAdapt);
+}
+
+/*
+ * Same as ipfw2_qhandler_w32() for packets indicated with the old
+ * style calls (MAC header + lookahead buffer, no NDIS_PACKET).
+ * Returns PASS, DUMMYNET or DROP.
+ */
+int
+ipfw2_qhandler_w32_oldstyle(int direction,
+	NDIS_HANDLE         ProtocolBindingContext,
+    unsigned char*      HeaderBuffer,
+    unsigned int        HeaderBufferSize,
+    unsigned char*      LookAheadBuffer,
+    unsigned int        LookAheadBufferSize,
+    unsigned int        PacketSize)
+{
+	struct mbuf* m;
+	unsigned char*		payload = NULL;
+	unsigned short		EtherType = 0;
+	int					ret = 0;
+
+	/* We are in a special case when NIC signals an incoming
+	 * packet using old style calls. This is done passing
+	 * a pointer to the MAC header and a pointer to the
+	 * rest of the packet.
+	 * We simply allocate space for the mbuf and the
+	 * subsequent payload section.
+	 */
+	/* no room for a MAC header, cannot be IP (and the length
+	 * computed below would wrap around)
+	 */
+	if (HeaderBufferSize + LookAheadBufferSize < 14)
+		return PASS;
+	m = malloc(sizeof(struct mbuf) + HeaderBufferSize + LookAheadBufferSize, 0, 0 );
+	if (m == NULL) //resource shortage, drop the packet
+		return DROP;
+
+	/* set mbuf fields to point past the MAC header.
+	 * Also set additional W32 info.
+	 * m->pkt here is set to null because the notification
+	 * from the NIC has come with a header+lookahead buffer,
+	 * no NDIS_PACKET has been provided.
+	 */
+	payload = (unsigned char*)(m + 1);
+	m->m_len = m->m_pkthdr.len = HeaderBufferSize+LookAheadBufferSize-14;
+	/* same setting as in ipfw2_qhandler_w32(), it was left unset here */
+	m->m_pkthdr.rcvif = (void *)((direction==INCOMING) ? _if_in : NULL);
+	m->m_data = payload + 14; /* past the MAC header */
+	m->direction = direction;
+	m->context = ProtocolBindingContext;
+	m->pkt = NULL;
+
+	/*
+	 * Now copy the data from the Windows buffers to the mbuf.
+	 */
+	bcopy(HeaderBuffer, payload, HeaderBufferSize);
+	bcopy(LookAheadBuffer, payload+HeaderBufferSize, LookAheadBufferSize);
+	//hexdump(payload,HeaderBufferSize+LookAheadBufferSize,"qhandler");
+	/*
+	 * Identify EtherType. If the packet is not IP, simply allow
+	 * and don't bother the firewall. XXX should be done before.
+	 */
+	EtherType = *(unsigned short*)(payload + 12);
+	EtherType = RtlUshortByteSwap(EtherType);
+	if (EtherType != 0x0800) {
+		//DbgPrint("ethertype = %X, skipping ipfw\n",EtherType);
+		free(m, 0);
+		return PASS;
+	}
+
+	//DbgPrint("incoming_raw: proto %u (%s), src %08X, dst %08X, sport %u, dport %u, len %u\n", *(payload+14+9), texify_proto(*(payload+14+9)), *(unsigned int*)(payload+14+12), *(unsigned int*)(payload+14+16), ntohs((*((unsigned short int*)(payload+14+20)))), ntohs((*((unsigned short int*)(payload+14+22)))), HeaderBufferSize+LookAheadBufferSize);
+
+	/* Query the firewall */
+	ret = ipfw_check_hook(NULL, &m, NULL, PFIL_IN, NULL);
+
+	if (m != NULL) {
+		/* Accept. Free the mbuf and return PASS. */
+		//DbgPrint("accepted\n");
+		m_freem(m);
+		return PASS;
+	} else if (ret == 0) {
+		/* dummynet has kept the packet, will reinject later. */
+		//DbgPrint("kept by dummynet\n");
+		return DUMMYNET;
+	} else {
+		/*
+		 * Packet dropped by ipfw or dummynet. Nothing to do as
+		 * FREE_PKT already freed the fake mbuf
+		 */
+		//DbgPrint("dropped by dummynet, ret = %i\n", ret);
+		return DROP;
+	}
+}
+
+/* forward declaration because those functions are used only here,
+ * no point to make them visible in passthru/protocol/miniport */
+int do_ipfw_set_ctl(struct sock __unused *sk, int cmd,
+	void __user *user, unsigned int len);
+int do_ipfw_get_ctl(struct sock __unused *sk, int cmd,
+	void __user *user, int *len);
+
+NTSTATUS
+DevIoControl(
+    IN PDEVICE_OBJECT    pDeviceObject,
+    IN PIRP              pIrp
+    )
+/*++
+
+Routine Description:
+
+    This is the dispatch routine for handling device ioctl requests.
+
+Arguments:
+
+    pDeviceObject - Pointer to the device object.
+
+    pIrp - Pointer to the request packet.
+
+Return Value:
+
+    Status is returned. The request is always completed before
+    returning, also when it is rejected.
+
+--*/
+{
+    PIO_STACK_LOCATION  pIrpSp;
+    NTSTATUS            NtStatus = STATUS_SUCCESS;
+    unsigned long       BytesReturned = 0;
+    unsigned long       FunctionCode;
+    unsigned long       buflen;     /* size of the system buffer */
+    unsigned long       len;
+    int                 optlen;     /* do_ipfw_get_ctl() wants an int */
+    struct sockopt      *sopt;
+    int                 ret = 0;
+
+    UNREFERENCED_PARAMETER(pDeviceObject);
+
+    pIrpSp = IoGetCurrentIrpStackLocation(pIrp);
+
+    /*
+     * Using METHOD_BUFFERED as communication method, the userland
+     * side calls DeviceIoControl passing an input buffer and an output
+     * and their respective length (ipfw uses the same length for both).
+     * The system creates a single I/O buffer, with len=max(inlen,outlen).
+     * In the kernel we can read information from this buffer (which is
+     * directly accessible), overwrite it with our results, and set
+     * IoStatus.Information with the number of bytes that the system must
+     * copy back to userland.
+     * In our sockopt emulation, the initial part of the buffer contains
+     * a struct sockopt, followed by the data area.
+     */
+
+    buflen = pIrpSp->Parameters.DeviceIoControl.InputBufferLength;
+    if (buflen < sizeof(struct sockopt))
+    {
+        /* do not return from here, the request must be completed */
+        NtStatus = STATUS_NOT_SUPPORTED; // XXX find better value
+        goto done;
+    }
+    sopt = pIrp->AssociatedIrp.SystemBuffer;
+
+    FunctionCode = pIrpSp->Parameters.DeviceIoControl.IoControlCode;
+
+    /*
+     * sopt_valsize is written by userland: refuse a data area
+     * which does not fit in the buffer we have been given.
+     */
+    if (sopt->sopt_valsize > buflen - sizeof(struct sockopt))
+    {
+        NtStatus = STATUS_INVALID_PARAMETER;
+        goto done;
+    }
+    len = (unsigned long)sopt->sopt_valsize;
+
+    /* NOTE(review): ret is not reported back, neither in NtStatus
+     * nor in the buffer; check what the userland side expects.
+     */
+    switch (FunctionCode)
+    {
+        case IP_FW_SETSOCKOPT:
+            ret = do_ipfw_set_ctl(NULL, sopt->sopt_name, sopt+1, len);
+            break;
+
+        case IP_FW_GETSOCKOPT:
+            optlen = (int)len;
+            ret = do_ipfw_get_ctl(NULL, sopt->sopt_name, sopt+1, &optlen);
+            if (optlen < 0)
+                optlen = 0;
+            len = (unsigned long)optlen;
+            sopt->sopt_valsize = len;
+            //sanity check on len
+            if (len + sizeof(struct sockopt) <= buflen)
+                BytesReturned = len + sizeof(struct sockopt);
+            else
+                BytesReturned = buflen;
+            break;
+
+        default:
+            NtStatus = STATUS_NOT_SUPPORTED;
+            break;
+    }
+
+done:
+    pIrp->IoStatus.Information = BytesReturned;
+    pIrp->IoStatus.Status = NtStatus;
+    IoCompleteRequest(pIrp, IO_NO_INCREMENT);
+
+    return NtStatus;
+}
+
+void dummynet(void * __unused unused);
+void ipfw_tick(void * vnetx);
+
+/* DPC routine, only calls dummynet() */
+VOID dummynet_dpc(
+    __in struct _KDPC  *Dpc,
+    __in_opt PVOID  DeferredContext,
+    __in_opt PVOID  SystemArgument1,
+    __in_opt PVOID  SystemArgument2
+    )
+{
+	dummynet(NULL);
+}
+
+/* DPC routine, calls ipfw_tick() with the deferred context */
+VOID ipfw_dpc(
+    __in struct _KDPC  *Dpc,
+    __in_opt PVOID  DeferredContext,
+    __in_opt PVOID  SystemArgument1,
+    __in_opt PVOID  SystemArgument2
+    )
+{
+	ipfw_tick(DeferredContext);
+}
diff --git a/dummynet2/miniport.c b/dummynet2/miniport.c
new file mode 100644
index 0000000..3baff88
--- /dev/null
+++ b/dummynet2/miniport.c
@@ -0,0 +1,1481 @@
+/*++
+
+Copyright (c) 1992-2000  Microsoft Corporation
+
+Module Name:
+
+    miniport.c
+
+Abstract:
+
+    Ndis Intermediate Miniport driver sample. This is a passthru driver.
+
+Author:
+
+Environment:
+
+
+Revision History:
+
+
+--*/
+
+#include "precomp.h"
+#pragma hdrstop
+
+
+
+NDIS_STATUS
+MPInitialize(
+    OUT PNDIS_STATUS             OpenErrorStatus,
+    OUT PUINT                    SelectedMediumIndex,
+    IN  PNDIS_MEDIUM             MediumArray,
+    IN  UINT                     MediumArraySize,
+    IN  NDIS_HANDLE              MiniportAdapterHandle,
+    IN  NDIS_HANDLE              WrapperConfigurationContext
+    )
+/*++
+
+Routine Description:
+
+    This is the initialize handler which gets called as a result of
+    the BindAdapter handler calling NdisIMInitializeDeviceInstanceEx.
+    The context parameter which we pass there is the adapter structure
+    which we retrieve here.
+
+    Arguments:
+
+    OpenErrorStatus            Not used by us.
+    SelectedMediumIndex        Place-holder for what media we are using
+    MediumArray                Array of ndis media passed down to us to pick from
+    MediumArraySize            Size of the array
+    MiniportAdapterHandle    The handle NDIS uses to refer to us
+    WrapperConfigurationContext    For use by NdisOpenConfiguration
+
+Return Value:
+
+    NDIS_STATUS_SUCCESS unless something goes wrong
+
+--*/
+{
+    UINT            i;
+    PADAPT          pAdapt;
+    NDIS_STATUS     Status = NDIS_STATUS_FAILURE;
+    NDIS_MEDIUM     Medium;
+
+    UNREFERENCED_PARAMETER(WrapperConfigurationContext);
+
+    //
+    // The do/while(FALSE) is only used to leave the block with 'break'
+    // on error; the code after it runs on success and on failure.
+    //
+    do
+    {
+        //
+        // Start off by retrieving our adapter context and storing
+        // the Miniport handle in it.
+        //
+        pAdapt = NdisIMGetDeviceContext(MiniportAdapterHandle);
+        pAdapt->MiniportIsHalted = FALSE;
+
+        DBGPRINT(("==> Miniport Initialize: Adapt %p\n", pAdapt));
+
+        //
+        // Usually we export the medium type of the adapter below as our
+        // virtual miniport's medium type. However if the adapter below us
+        // is a WAN device, then we claim to be of medium type 802.3.
+        //
+        Medium = pAdapt->Medium;
+
+        if (Medium == NdisMediumWan)
+        {
+            Medium = NdisMedium802_3;
+        }
+
+        for (i = 0; i < MediumArraySize; i++)
+        {
+            if (MediumArray[i] == Medium)
+            {
+                *SelectedMediumIndex = i;
+                break;
+            }
+        }
+
+        // the loop ran to the end: our medium is not in the array
+        if (i == MediumArraySize)
+        {
+            Status = NDIS_STATUS_UNSUPPORTED_MEDIA;
+            break;
+        }
+
+
+        //
+        // Set the attributes now. NDIS_ATTRIBUTE_DESERIALIZE enables us
+        // to make up-calls to NDIS without having to call NdisIMSwitchToMiniport
+        // or NdisIMQueueCallBack. This also forces us to protect our data using
+        // spinlocks where appropriate. Also in this case NDIS does not queue
+        // packets on our behalf. Since this is a very simple pass-thru
+        // miniport, we do not have a need to protect anything. However in
+        // a general case there will be a need to use per-adapter spin-locks
+        // for the packet queues at the very least.
+        //
+        NdisMSetAttributesEx(MiniportAdapterHandle,
+                             pAdapt,
+                             0,                                        // CheckForHangTimeInSeconds
+                             NDIS_ATTRIBUTE_IGNORE_PACKET_TIMEOUT    |
+                                NDIS_ATTRIBUTE_IGNORE_REQUEST_TIMEOUT|
+                                NDIS_ATTRIBUTE_INTERMEDIATE_DRIVER |
+                                NDIS_ATTRIBUTE_DESERIALIZE |
+                                NDIS_ATTRIBUTE_NO_HALT_ON_SUSPEND,
+                             0);
+
+        pAdapt->MiniportHandle = MiniportAdapterHandle;
+        //
+        // Initialize LastIndicatedStatus to be NDIS_STATUS_MEDIA_CONNECT
+        //
+        pAdapt->LastIndicatedStatus = NDIS_STATUS_MEDIA_CONNECT;
+
+        //
+        // Initialize the power states for both the lower binding (PTDeviceState)
+        // and our miniport edge to Powered On.
+        //
+        pAdapt->MPDeviceState = NdisDeviceStateD0;
+        pAdapt->PTDeviceState = NdisDeviceStateD0;
+
+        //
+        // Add this adapter to the global pAdapt List
+        //
+        NdisAcquireSpinLock(&GlobalLock);
+
+        pAdapt->Next = pAdaptList;
+        pAdaptList = pAdapt;
+
+        NdisReleaseSpinLock(&GlobalLock);
+
+        //
+        // Create an ioctl interface
+        // (the result of the call is discarded)
+        //
+        (VOID)PtRegisterDevice();
+
+        Status = NDIS_STATUS_SUCCESS;
+    }
+    while (FALSE);
+
+    //
+    // If we had received an UnbindAdapter notification on the underlying
+    // adapter, we would have blocked that thread waiting for the IM Init
+    // process to complete. Wake up any such thread.
+    //
+    ASSERT(pAdapt->MiniportInitPending == TRUE);
+    pAdapt->MiniportInitPending = FALSE;
+    NdisSetEvent(&pAdapt->MiniportInitEvent);
+
+    if (Status == NDIS_STATUS_SUCCESS)
+    {
+        PtReferenceAdapt(pAdapt);
+    }
+
+    DBGPRINT(("<== Miniport Initialize: Adapt %p, Status %x\n", pAdapt, Status));
+
+    *OpenErrorStatus = Status;
+
+
+    return Status;
+}
+
+
+NDIS_STATUS
+MPSend(
+    IN NDIS_HANDLE             MiniportAdapterContext,
+    IN PNDIS_PACKET            Packet,
+    IN UINT                    Flags
+    )
+/*++
+
+Routine Description:
+
+    Send Packet handler. Either this or our SendPackets (array) handler is called
+    based on which one is enabled in our Miniport Characteristics.
+
+Arguments:
+
+    MiniportAdapterContext    Pointer to the adapter
+    Packet                    Packet to send
+    Flags                     Unused, passed down below
+
+Return Value:
+
+    Return code from NdisSend
+
+--*/
+{
+    PADAPT              pAdapt = (PADAPT)MiniportAdapterContext;
+    NDIS_STATUS         Status;
+    PNDIS_PACKET        MyPacket;
+    PVOID               MediaSpecificInfo = NULL;
+    ULONG               MediaSpecificInfoSize = 0;
+
+    //
+    // The driver should fail the send if the virtual miniport is in low
+    // power state
+    //
+    if (pAdapt->MPDeviceState > NdisDeviceStateD0)
+    {
+         return NDIS_STATUS_FAILURE;
+    }
+
+#ifdef NDIS51
+    //
+    // Use NDIS 5.1 packet stacking:
+    //
+    // The block below is disabled with if (0), so the firewall code
+    // further down always sees the packet.
+    //
+    if (0) // XXX IPFW - make sure we don't go in here
+    {
+        PNDIS_PACKET_STACK        pStack;
+        BOOLEAN                   Remaining;
+
+        //
+        // Packet stacks: Check if we can use the same packet for sending down.
+        //
+
+        pStack = NdisIMGetCurrentPacketStack(Packet, &Remaining);
+        if (Remaining)
+        {
+            //
+            // We can reuse "Packet".
+            //
+            // NOTE: if we needed to keep per-packet information in packets
+            // sent down, we can use pStack->IMReserved[].
+            //
+            ASSERT(pStack);
+            //
+            // If the below miniport is going to low power state, stop sending down any packet.
+            //
+            NdisAcquireSpinLock(&pAdapt->Lock);
+            if (pAdapt->PTDeviceState > NdisDeviceStateD0)
+            {
+                NdisReleaseSpinLock(&pAdapt->Lock);
+                return NDIS_STATUS_FAILURE;
+            }
+            pAdapt->OutstandingSends++;
+            NdisReleaseSpinLock(&pAdapt->Lock);
+            NdisSend(&Status,
+                     pAdapt->BindingHandle,
+                     Packet);
+
+            if (Status != NDIS_STATUS_PENDING)
+            {
+                ADAPT_DECR_PENDING_SENDS(pAdapt);
+            }
+
+            return(Status);
+        }
+    }
+#endif // NDIS51
+
+    //
+    // We are either not using packet stacks, or there isn't stack space
+    // in the original packet passed down to us. Allocate a new packet
+    // to wrap the data with.
+    //
+    //
+    // If the below miniport is going to low power state, stop sending down any packet.
+    //
+    NdisAcquireSpinLock(&pAdapt->Lock);
+    if (pAdapt->PTDeviceState > NdisDeviceStateD0)
+    {
+        NdisReleaseSpinLock(&pAdapt->Lock);
+        return NDIS_STATUS_FAILURE;
+
+    }
+    pAdapt->OutstandingSends++;
+    NdisReleaseSpinLock(&pAdapt->Lock);
+
+    NdisAllocatePacket(&Status,
+                       &MyPacket,
+                       pAdapt->SendPacketPoolHandle);
+
+    if (Status == NDIS_STATUS_SUCCESS)
+    {
+        PSEND_RSVD            SendRsvd;
+
+        //
+        // Save a pointer to the original packet in our reserved
+        // area in the new packet. This is needed so that we can
+        // get back to the original packet when the new packet's send
+        // is completed.
+        //
+        SendRsvd = (PSEND_RSVD)(MyPacket->ProtocolReserved);
+        SendRsvd->OriginalPkt = Packet;
+
+        NdisGetPacketFlags(MyPacket) = Flags;
+
+        //
+        // Set up the new packet so that it describes the same
+        // data as the original packet.
+        //
+        NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet);
+        NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet);
+#ifdef WIN9X
+        //
+        // Work around the fact that NDIS does not initialize this
+        // to FALSE on Win9x.
+        //
+        NDIS_PACKET_VALID_COUNTS(MyPacket) = FALSE;
+#endif
+
+        //
+        // Copy the OOB Offset from the original packet to the new
+        // packet.
+        //
+        NdisMoveMemory(NDIS_OOB_DATA_FROM_PACKET(MyPacket),
+                       NDIS_OOB_DATA_FROM_PACKET(Packet),
+                       sizeof(NDIS_PACKET_OOB_DATA));
+
+#ifndef WIN9X
+        //
+        // Copy the right parts of per packet info into the new packet.
+        // This API is not available on Win9x since task offload is
+        // not supported on that platform.
+        //
+        NdisIMCopySendPerPacketInfo(MyPacket, Packet);
+#endif
+
+        //
+        // Copy the Media specific information
+        //
+        NDIS_GET_PACKET_MEDIA_SPECIFIC_INFO(Packet,
+                                            &MediaSpecificInfo,
+                                            &MediaSpecificInfoSize);
+
+        if (MediaSpecificInfo || MediaSpecificInfoSize)
+        {
+            NDIS_SET_PACKET_MEDIA_SPECIFIC_INFO(MyPacket,
+                                                MediaSpecificInfo,
+                                                MediaSpecificInfoSize);
+        }
+#if 1	/* IPFW: query the firewall */
+	/* if dummynet keeps the packet, we mimic success.
+	 * otherwise continue as usual.
+	 */
+	/*
+	 * NOTE(review): OutstandingSends has been incremented above and
+	 * the DROP return below does not decrement it; nothing visible
+	 * in this function does it either - confirm that the count is
+	 * released somewhere else for dropped packets.
+	 */
+	{
+		int ret = ipfw2_qhandler_w32(MyPacket, OUTGOING,
+			MiniportAdapterContext);
+		if (ret != PASS) {
+			if (ret == DROP)
+				return NDIS_STATUS_FAILURE;
+			else {	//dummynet kept the packet
+#ifndef WIN9X
+				NdisIMCopySendCompletePerPacketInfo (Packet, MyPacket);
+#endif
+				return NDIS_STATUS_SUCCESS; //otherwise simply continue
+			}
+		}
+	}
+#endif	/* end of IPFW code */
+
+        NdisSend(&Status,
+                 pAdapt->BindingHandle,
+                 MyPacket);
+
+
+        // not pending: the send is already over, clean up here
+        if (Status != NDIS_STATUS_PENDING)
+        {
+#ifndef WIN9X
+            NdisIMCopySendCompletePerPacketInfo (Packet, MyPacket);
+#endif
+            NdisFreePacket(MyPacket);
+            ADAPT_DECR_PENDING_SENDS(pAdapt);
+        }
+    }
+    else
+    {
+        ADAPT_DECR_PENDING_SENDS(pAdapt);
+        //
+        // We are out of packets. Silently drop it. Alternatively we can deal with it:
+        //  - By keeping separate send and receive pools
+        //  - Dynamically allocate more pools as needed and free them when not needed
+        //
+    }
+
+    return(Status);
+}
+
+
+VOID
+MPSendPackets(
+    IN NDIS_HANDLE             MiniportAdapterContext,
+    IN PPNDIS_PACKET           PacketArray,
+    IN UINT                    NumberOfPackets
+    )
+/*++
+
+Routine Description:
+
+    Send Packet Array handler. Either this or our SendPacket handler is called
+    based on which one is enabled in our Miniport Characteristics.
+ +Arguments: + + MiniportAdapterContext Pointer to our adapter + PacketArray Set of packets to send + NumberOfPackets Self-explanatory + +Return Value: + + None + +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + NDIS_STATUS Status; + UINT i; + PVOID MediaSpecificInfo = NULL; + UINT MediaSpecificInfoSize = 0; + + + for (i = 0; i < NumberOfPackets; i++) + { + PNDIS_PACKET Packet, MyPacket; + + Packet = PacketArray[i]; + // + // The driver should fail the send if the virtual miniport is in low + // power state + // + if (pAdapt->MPDeviceState > NdisDeviceStateD0) + { + NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt), + Packet, + NDIS_STATUS_FAILURE); + continue; + } + +#ifdef NDIS51 + + // + // Use NDIS 5.1 packet stacking: + // + { + PNDIS_PACKET_STACK pStack; + BOOLEAN Remaining; + + // + // Packet stacks: Check if we can use the same packet for sending down. + // + pStack = NdisIMGetCurrentPacketStack(Packet, &Remaining); + if (Remaining) + { + // + // We can reuse "Packet". + // + // NOTE: if we needed to keep per-packet information in packets + // sent down, we can use pStack->IMReserved[]. + // + ASSERT(pStack); + // + // If the below miniport is going to low power state, stop sending down any packet. + // + NdisAcquireSpinLock(&pAdapt->Lock); + if (pAdapt->PTDeviceState > NdisDeviceStateD0) + { + NdisReleaseSpinLock(&pAdapt->Lock); + NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt), + Packet, + NDIS_STATUS_FAILURE); + } + else + { + pAdapt->OutstandingSends++; + NdisReleaseSpinLock(&pAdapt->Lock); + + NdisSend(&Status, + pAdapt->BindingHandle, + Packet); + + if (Status != NDIS_STATUS_PENDING) + { + NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt), + Packet, + Status); + + ADAPT_DECR_PENDING_SENDS(pAdapt); + } + } + continue; + } + } +#endif + do + { + NdisAcquireSpinLock(&pAdapt->Lock); + // + // If the below miniport is going to low power state, stop sending down any packet. 
+ // + if (pAdapt->PTDeviceState > NdisDeviceStateD0) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + pAdapt->OutstandingSends++; + NdisReleaseSpinLock(&pAdapt->Lock); + + NdisAllocatePacket(&Status, + &MyPacket, + pAdapt->SendPacketPoolHandle); + + if (Status == NDIS_STATUS_SUCCESS) + { + PSEND_RSVD SendRsvd; + + SendRsvd = (PSEND_RSVD)(MyPacket->ProtocolReserved); + SendRsvd->OriginalPkt = Packet; + + NdisGetPacketFlags(MyPacket) = NdisGetPacketFlags(Packet); + + NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet); + NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet); +#ifdef WIN9X + // + // Work around the fact that NDIS does not initialize this + // to FALSE on Win9x. + // + NDIS_PACKET_VALID_COUNTS(MyPacket) = FALSE; +#endif // WIN9X + + // + // Copy the OOB data from the original packet to the new + // packet. + // + NdisMoveMemory(NDIS_OOB_DATA_FROM_PACKET(MyPacket), + NDIS_OOB_DATA_FROM_PACKET(Packet), + sizeof(NDIS_PACKET_OOB_DATA)); + // + // Copy relevant parts of the per packet info into the new packet + // +#ifndef WIN9X + NdisIMCopySendPerPacketInfo(MyPacket, Packet); +#endif + + // + // Copy the Media specific information + // + NDIS_GET_PACKET_MEDIA_SPECIFIC_INFO(Packet, + &MediaSpecificInfo, + &MediaSpecificInfoSize); + + if (MediaSpecificInfo || MediaSpecificInfoSize) + { + NDIS_SET_PACKET_MEDIA_SPECIFIC_INFO(MyPacket, + MediaSpecificInfo, + MediaSpecificInfoSize); + } + + NdisSend(&Status, + pAdapt->BindingHandle, + MyPacket); + + if (Status != NDIS_STATUS_PENDING) + { +#ifndef WIN9X + NdisIMCopySendCompletePerPacketInfo (Packet, MyPacket); +#endif + NdisFreePacket(MyPacket); + ADAPT_DECR_PENDING_SENDS(pAdapt); + } + } + else + { + // + // The driver cannot allocate a packet. 
+ // + ADAPT_DECR_PENDING_SENDS(pAdapt); + } + } + while (FALSE); + + if (Status != NDIS_STATUS_PENDING) + { + NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt), + Packet, + Status); + } + } +} + + +NDIS_STATUS +MPQueryInformation( + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_OID Oid, + IN PVOID InformationBuffer, + IN ULONG InformationBufferLength, + OUT PULONG BytesWritten, + OUT PULONG BytesNeeded + ) +/*++ + +Routine Description: + + Entry point called by NDIS to query for the value of the specified OID. + Typical processing is to forward the query down to the underlying miniport. + + The following OIDs are filtered here: + + OID_PNP_QUERY_POWER - return success right here + + OID_GEN_SUPPORTED_GUIDS - do not forward, otherwise we will show up + multiple instances of private GUIDs supported by the underlying miniport. + + OID_PNP_CAPABILITIES - we do send this down to the lower miniport, but + the values returned are postprocessed before we complete this request; + see PtRequestComplete. + + NOTE on OID_TCP_TASK_OFFLOAD - if this IM driver modifies the contents + of data it passes through such that a lower miniport may not be able + to perform TCP task offload, then it should not forward this OID down, + but fail it here with the status NDIS_STATUS_NOT_SUPPORTED. This is to + avoid performing incorrect transformations on data. + + If our miniport edge (upper edge) is at a low-power state, fail the request. + + If our protocol edge (lower edge) has been notified of a low-power state, + we pend this request until the miniport below has been set to D0. Since + requests to miniports are serialized always, at most a single request will + be pended. 
+ +Arguments: + + MiniportAdapterContext Pointer to the adapter structure + Oid Oid for this query + InformationBuffer Buffer for information + InformationBufferLength Size of this buffer + BytesWritten Specifies how much info is written + BytesNeeded In case the buffer is smaller than what we need, tell them how much is needed + + +Return Value: + + Return code from the NdisRequest below. + +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + NDIS_STATUS Status = NDIS_STATUS_FAILURE; + + do + { + if (Oid == OID_PNP_QUERY_POWER) + { + // + // Do not forward this. + // + Status = NDIS_STATUS_SUCCESS; + break; + } + + if (Oid == OID_GEN_SUPPORTED_GUIDS) + { + // + // Do not forward this, otherwise we will end up with multiple + // instances of private GUIDs that the underlying miniport + // supports. + // + Status = NDIS_STATUS_NOT_SUPPORTED; + break; + } + + if (Oid == OID_TCP_TASK_OFFLOAD) + { + // + // Fail this -if- this driver performs data transformations + // that can interfere with a lower driver's ability to offload + // TCP tasks. 
+ // + // Status = NDIS_STATUS_NOT_SUPPORTED; + // break; + // + } + // + // If the miniport below is unbinding, just fail any request + // + NdisAcquireSpinLock(&pAdapt->Lock); + if (pAdapt->UnbindingInProcess == TRUE) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + NdisReleaseSpinLock(&pAdapt->Lock); + // + // All other queries are failed, if the miniport is not at D0, + // + if (pAdapt->MPDeviceState > NdisDeviceStateD0) + { + Status = NDIS_STATUS_FAILURE; + break; + } + + pAdapt->Request.RequestType = NdisRequestQueryInformation; + pAdapt->Request.DATA.QUERY_INFORMATION.Oid = Oid; + pAdapt->Request.DATA.QUERY_INFORMATION.InformationBuffer = InformationBuffer; + pAdapt->Request.DATA.QUERY_INFORMATION.InformationBufferLength = InformationBufferLength; + pAdapt->BytesNeeded = BytesNeeded; + pAdapt->BytesReadOrWritten = BytesWritten; + + // + // If the miniport below is unbinding, fail the request + // + NdisAcquireSpinLock(&pAdapt->Lock); + + if (pAdapt->UnbindingInProcess == TRUE) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + // + // If the Protocol device state is OFF, mark this request as being + // pended. We queue this until the device state is back to D0.
+ // + if ((pAdapt->PTDeviceState > NdisDeviceStateD0) + && (pAdapt->StandingBy == FALSE)) + { + pAdapt->QueuedRequest = TRUE; + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_PENDING; + break; + } + // + // This is in the process of powering down the system, always fail the request + // + if (pAdapt->StandingBy == TRUE) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + pAdapt->OutstandingRequests = TRUE; + + NdisReleaseSpinLock(&pAdapt->Lock); + + // + // default case, most requests will be passed to the miniport below + // + NdisRequest(&Status, + pAdapt->BindingHandle, + &pAdapt->Request); + + + if (Status != NDIS_STATUS_PENDING) + { + PtRequestComplete(pAdapt, &pAdapt->Request, Status); + Status = NDIS_STATUS_PENDING; + } + + } while (FALSE); + + return(Status); + +} + + +VOID +MPQueryPNPCapabilities( + IN OUT PADAPT pAdapt, + OUT PNDIS_STATUS pStatus + ) +/*++ + +Routine Description: + + Postprocess a request for OID_PNP_CAPABILITIES that was forwarded + down to the underlying miniport, and has been completed by it. + +Arguments: + + pAdapt - Pointer to the adapter structure + pStatus - Place to return final status + +Return Value: + + None. + +--*/ + +{ + PNDIS_PNP_CAPABILITIES pPNPCapabilities; + PNDIS_PM_WAKE_UP_CAPABILITIES pPMstruct; + + if (pAdapt->Request.DATA.QUERY_INFORMATION.InformationBufferLength >= sizeof(NDIS_PNP_CAPABILITIES)) + { + pPNPCapabilities = (PNDIS_PNP_CAPABILITIES)(pAdapt->Request.DATA.QUERY_INFORMATION.InformationBuffer); + + // + // The following fields must be overwritten by an IM driver. 
+ // + pPMstruct= & pPNPCapabilities->WakeUpCapabilities; + pPMstruct->MinMagicPacketWakeUp = NdisDeviceStateUnspecified; + pPMstruct->MinPatternWakeUp = NdisDeviceStateUnspecified; + pPMstruct->MinLinkChangeWakeUp = NdisDeviceStateUnspecified; + *pAdapt->BytesReadOrWritten = sizeof(NDIS_PNP_CAPABILITIES); + *pAdapt->BytesNeeded = 0; + + + // + // Setting our internal flags + // Default, device is ON + // + pAdapt->MPDeviceState = NdisDeviceStateD0; + pAdapt->PTDeviceState = NdisDeviceStateD0; + + *pStatus = NDIS_STATUS_SUCCESS; + } + else + { + *pAdapt->BytesNeeded= sizeof(NDIS_PNP_CAPABILITIES); + *pStatus = NDIS_STATUS_RESOURCES; + } +} + + +NDIS_STATUS +MPSetInformation( + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_OID Oid, + __in_bcount(InformationBufferLength) IN PVOID InformationBuffer, + IN ULONG InformationBufferLength, + OUT PULONG BytesRead, + OUT PULONG BytesNeeded + ) +/*++ + +Routine Description: + + Miniport SetInfo handler. + + In the case of OID_PNP_SET_POWER, record the power state and return the OID. + Do not pass below + If the device is suspended, do not block the SET_POWER_OID + as it is used to reactivate the Passthru miniport + + + PM- If the MP is not ON (DeviceState > D0) return immediately (except for 'query power' and 'set power') + If MP is ON, but the PT is not at D0, then queue the request for later processing + + Requests to miniports are always serialized + + +Arguments: + + MiniportAdapterContext Pointer to the adapter structure + Oid Oid for this query + InformationBuffer Buffer for information + InformationBufferLength Size of this buffer + BytesRead Specifies how much info is read + BytesNeeded In case the buffer is smaller than what we need, tell them how much is needed + +Return Value: + + Return code from the NdisRequest below.
+ +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + NDIS_STATUS Status; + + Status = NDIS_STATUS_FAILURE; + + do + { + // + // The Set Power should not be sent to the miniport below the Passthru, but is handled internally + // + if (Oid == OID_PNP_SET_POWER) + { + MPProcessSetPowerOid(&Status, + pAdapt, + InformationBuffer, + InformationBufferLength, + BytesRead, + BytesNeeded); + break; + + } + + // + // If the miniport below is unbinding, fail the request + // + NdisAcquireSpinLock(&pAdapt->Lock); + if (pAdapt->UnbindingInProcess == TRUE) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + NdisReleaseSpinLock(&pAdapt->Lock); + // + // All other Set Information requests are failed, if the miniport is + // not at D0 or is transitioning to a device state greater than D0. + // + if (pAdapt->MPDeviceState > NdisDeviceStateD0) + { + Status = NDIS_STATUS_FAILURE; + break; + } + + // Set up the Request and return the result + pAdapt->Request.RequestType = NdisRequestSetInformation; + pAdapt->Request.DATA.SET_INFORMATION.Oid = Oid; + pAdapt->Request.DATA.SET_INFORMATION.InformationBuffer = InformationBuffer; + pAdapt->Request.DATA.SET_INFORMATION.InformationBufferLength = InformationBufferLength; + pAdapt->BytesNeeded = BytesNeeded; + pAdapt->BytesReadOrWritten = BytesRead; + + // + // If the miniport below is unbinding, fail the request + // + NdisAcquireSpinLock(&pAdapt->Lock); + if (pAdapt->UnbindingInProcess == TRUE) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + + // + // If the device below is at a low power state, we cannot send it the + // request now, and must pend it. 
+ // + if ((pAdapt->PTDeviceState > NdisDeviceStateD0) + && (pAdapt->StandingBy == FALSE)) + { + pAdapt->QueuedRequest = TRUE; + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_PENDING; + break; + } + // + // This is in the process of powering down the system, always fail the request + // + if (pAdapt->StandingBy == TRUE) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + pAdapt->OutstandingRequests = TRUE; + + NdisReleaseSpinLock(&pAdapt->Lock); + // + // Forward the request to the device below. + // + NdisRequest(&Status, + pAdapt->BindingHandle, + &pAdapt->Request); + + if (Status != NDIS_STATUS_PENDING) + { + *BytesRead = pAdapt->Request.DATA.SET_INFORMATION.BytesRead; + *BytesNeeded = pAdapt->Request.DATA.SET_INFORMATION.BytesNeeded; + pAdapt->OutstandingRequests = FALSE; + } + + } while (FALSE); + + return(Status); +} + + +VOID +MPProcessSetPowerOid( + IN OUT PNDIS_STATUS pNdisStatus, + IN PADAPT pAdapt, + __in_bcount(InformationBufferLength) IN PVOID InformationBuffer, + IN ULONG InformationBufferLength, + OUT PULONG BytesRead, + OUT PULONG BytesNeeded + ) +/*++ + +Routine Description: + This routine does all the processing for a request with a SetPower Oid + The miniport should accept the Set Power and transition to the new state + + The Set Power should not be passed to the miniport below + + If the IM miniport is going into a low power state, then there is no guarantee if it will ever + be asked to go back to D0, before getting halted. No requests should be pended or queued. + + +Arguments: + pNdisStatus - Status of the operation + pAdapt - The Adapter structure + InformationBuffer - The New DeviceState + InformationBufferLength + BytesRead - No of bytes read + BytesNeeded - No of bytes needed + + +Return Value: + Status - NDIS_STATUS_SUCCESS if all the wait events succeed.
+ +--*/ +{ + + + NDIS_DEVICE_POWER_STATE NewDeviceState; + + DBGPRINT(("==>MPProcessSetPowerOid: Adapt %p\n", pAdapt)); + + ASSERT (InformationBuffer != NULL); + + *pNdisStatus = NDIS_STATUS_FAILURE; + + do + { + // + // Check for invalid length + // + if (InformationBufferLength < sizeof(NDIS_DEVICE_POWER_STATE)) + { + *pNdisStatus = NDIS_STATUS_INVALID_LENGTH; + break; + } + + NewDeviceState = (*(PNDIS_DEVICE_POWER_STATE)InformationBuffer); + + // + // Check for invalid device state + // + if ((pAdapt->MPDeviceState > NdisDeviceStateD0) && (NewDeviceState != NdisDeviceStateD0)) + { + // + // If the miniport is in a non-D0 state, the miniport can only receive a Set Power to D0 + // + ASSERT (!(pAdapt->MPDeviceState > NdisDeviceStateD0) && (NewDeviceState != NdisDeviceStateD0)); + + *pNdisStatus = NDIS_STATUS_FAILURE; + break; + } + + // + // Is the miniport transitioning from an On (D0) state to an Low Power State (>D0) + // If so, then set the StandingBy Flag - (Block all incoming requests) + // + if (pAdapt->MPDeviceState == NdisDeviceStateD0 && NewDeviceState > NdisDeviceStateD0) + { + pAdapt->StandingBy = TRUE; + } + + // + // If the miniport is transitioning from a low power state to ON (D0), then clear the StandingBy flag + // All incoming requests will be pended until the physical miniport turns ON. 
+ // + if (pAdapt->MPDeviceState > NdisDeviceStateD0 && NewDeviceState == NdisDeviceStateD0) + { + pAdapt->StandingBy = FALSE; + } + + // + // Now update the state in the pAdapt structure; + // + pAdapt->MPDeviceState = NewDeviceState; + + *pNdisStatus = NDIS_STATUS_SUCCESS; + + + } while (FALSE); + + if (*pNdisStatus == NDIS_STATUS_SUCCESS) + { + // + // The miniport resume from low power state + // + if (pAdapt->StandingBy == FALSE) + { + // + // If we need to indicate the media connect state + // + if (pAdapt->LastIndicatedStatus != pAdapt->LatestUnIndicateStatus) + { + if (pAdapt->MiniportHandle != NULL) + { + NdisMIndicateStatus(pAdapt->MiniportHandle, + pAdapt->LatestUnIndicateStatus, + (PVOID)NULL, + 0); + NdisMIndicateStatusComplete(pAdapt->MiniportHandle); + pAdapt->LastIndicatedStatus = pAdapt->LatestUnIndicateStatus; + } + } + } + else + { + // + // Initialize LatestUnIndicatedStatus + // + pAdapt->LatestUnIndicateStatus = pAdapt->LastIndicatedStatus; + } + *BytesRead = sizeof(NDIS_DEVICE_POWER_STATE); + *BytesNeeded = 0; + } + else + { + *BytesRead = 0; + *BytesNeeded = sizeof (NDIS_DEVICE_POWER_STATE); + } + + DBGPRINT(("<==MPProcessSetPowerOid: Adapt %p\n", pAdapt)); +} + + +VOID +MPReturnPacket( + IN NDIS_HANDLE MiniportAdapterContext, + IN PNDIS_PACKET Packet + ) +/*++ + +Routine Description: + + NDIS Miniport entry point called whenever protocols are done with + a packet that we had indicated up and they had queued up for returning + later. + +Arguments: + + MiniportAdapterContext - pointer to ADAPT structure + Packet - packet being returned. + +Return Value: + + None. + +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + +#ifdef NDIS51 + // + // Packet stacking: Check if this packet belongs to us. + // + if (NdisGetPoolFromPacket(Packet) != pAdapt->RecvPacketPoolHandle) + { + // + // We reused the original packet in a receive indication. + // Simply return it to the miniport below us. 
+ // + NdisReturnPackets(&Packet, 1); + } + else +#endif // NDIS51 + { + // + // This is a packet allocated from this IM's receive packet pool. + // Reclaim our packet, and return the original to the driver below. + // + + PNDIS_PACKET MyPacket; + PRECV_RSVD RecvRsvd; + + RecvRsvd = (PRECV_RSVD)(Packet->MiniportReserved); + MyPacket = RecvRsvd->OriginalPkt; + + NdisFreePacket(Packet); + NdisReturnPackets(&MyPacket, 1); + } +} + + +NDIS_STATUS +MPTransferData( + OUT PNDIS_PACKET Packet, + OUT PUINT BytesTransferred, + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_HANDLE MiniportReceiveContext, + IN UINT ByteOffset, + IN UINT BytesToTransfer + ) +/*++ + +Routine Description: + + Miniport's transfer data handler. + +Arguments: + + Packet Destination packet + BytesTransferred Place-holder for how much data was copied + MiniportAdapterContext Pointer to the adapter structure + MiniportReceiveContext Context + ByteOffset Offset into the packet for copying data + BytesToTransfer How much to copy. + +Return Value: + + Status of transfer + +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + NDIS_STATUS Status; + + // + // Return, if the device is OFF + // + + if (IsIMDeviceStateOn(pAdapt) == FALSE) + { + return NDIS_STATUS_FAILURE; + } + + NdisTransferData(&Status, + pAdapt->BindingHandle, + MiniportReceiveContext, + ByteOffset, + BytesToTransfer, + Packet, + BytesTransferred); + + return(Status); +} + +VOID +MPHalt( + IN NDIS_HANDLE MiniportAdapterContext + ) +/*++ + +Routine Description: + + Halt handler. All the hard-work for clean-up is done here. + +Arguments: + + MiniportAdapterContext Pointer to the Adapter + +Return Value: + + None. 
+ +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + NDIS_STATUS Status; + PADAPT *ppCursor; + + DBGPRINT(("==>MiniportHalt: Adapt %p\n", pAdapt)); + + pAdapt->MiniportHandle = NULL; + pAdapt->MiniportIsHalted = TRUE; + + // + // Remove this adapter from the global list + // + NdisAcquireSpinLock(&GlobalLock); + + for (ppCursor = &pAdaptList; *ppCursor != NULL; ppCursor = &(*ppCursor)->Next) + { + if (*ppCursor == pAdapt) + { + *ppCursor = pAdapt->Next; + break; + } + } + + NdisReleaseSpinLock(&GlobalLock); + + // + // Delete the ioctl interface that was created when the miniport + // was created. + // + (VOID)PtDeregisterDevice(); + + // + // If we have a valid bind, close the miniport below the protocol + // +#pragma prefast(suppress: __WARNING_DEREF_NULL_PTR, "pAdapt cannot be NULL") + if (pAdapt->BindingHandle != NULL) + { + // + // Close the binding below. and wait for it to complete + // + NdisResetEvent(&pAdapt->Event); + + NdisCloseAdapter(&Status, pAdapt->BindingHandle); + + if (Status == NDIS_STATUS_PENDING) + { + NdisWaitEvent(&pAdapt->Event, 0); + Status = pAdapt->Status; + } + + ASSERT (Status == NDIS_STATUS_SUCCESS); + + pAdapt->BindingHandle = NULL; + + PtDereferenceAdapt(pAdapt); + } + + if (PtDereferenceAdapt(pAdapt)) + { + pAdapt = NULL; + } + + + DBGPRINT(("<== MiniportHalt: pAdapt %p\n", pAdapt)); +} + + +#ifdef NDIS51_MINIPORT + +VOID +MPCancelSendPackets( + IN NDIS_HANDLE MiniportAdapterContext, + IN PVOID CancelId + ) +/*++ + +Routine Description: + + The miniport entry point to handle cancellation of all send packets + that match the given CancelId. If we have queued any packets that match + this, then we should dequeue them and call NdisMSendComplete for all + such packets, with a status of NDIS_STATUS_REQUEST_ABORTED. + + We should also call NdisCancelSendPackets in turn, on each lower binding + that this adapter corresponds to. This is to let miniports below cancel + any matching packets. 
+ +Arguments: + + MiniportAdapterContext - pointer to ADAPT structure + CancelId - ID of packets to be cancelled. + +Return Value: + + None + +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + + // + // If we queue packets on our adapter structure, this would be + // the place to acquire a spinlock to it, unlink any packets whose + // Id matches CancelId, release the spinlock and call NdisMSendComplete + // with NDIS_STATUS_REQUEST_ABORTED for all unlinked packets. + // + + // + // Next, pass this down so that we let the miniport(s) below cancel + // any packets that they might have queued. + // + NdisCancelSendPackets(pAdapt->BindingHandle, CancelId); + + return; +} + +VOID +MPDevicePnPEvent( + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_DEVICE_PNP_EVENT DevicePnPEvent, + IN PVOID InformationBuffer, + IN ULONG InformationBufferLength + ) +/*++ + +Routine Description: + + This handler is called to notify us of PnP events directed to + our miniport device object. + +Arguments: + + MiniportAdapterContext - pointer to ADAPT structure + DevicePnPEvent - the event + InformationBuffer - Points to additional event-specific information + InformationBufferLength - length of above + +Return Value: + + None +--*/ +{ + // TBD - add code/comments about processing this. + + UNREFERENCED_PARAMETER(MiniportAdapterContext); + UNREFERENCED_PARAMETER(DevicePnPEvent); + UNREFERENCED_PARAMETER(InformationBuffer); + UNREFERENCED_PARAMETER(InformationBufferLength); + + return; +} + +VOID +MPAdapterShutdown( + IN NDIS_HANDLE MiniportAdapterContext + ) +/*++ + +Routine Description: + + This handler is called to notify us of an impending system shutdown. + +Arguments: + + MiniportAdapterContext - pointer to ADAPT structure + +Return Value: + + None +--*/ +{ + UNREFERENCED_PARAMETER(MiniportAdapterContext); + + return; +} + +#endif + + +VOID +MPFreeAllPacketPools( + IN PADAPT pAdapt + ) +/*++ + +Routine Description: + + Free all packet pools on the specified adapter. 
+ +Arguments: + + pAdapt - pointer to ADAPT structure + +Return Value: + + None + +--*/ +{ + if (pAdapt->RecvPacketPoolHandle != NULL) + { + // + // Free the packet pool that is used to indicate receives + // + NdisFreePacketPool(pAdapt->RecvPacketPoolHandle); + + pAdapt->RecvPacketPoolHandle = NULL; + } + + if (pAdapt->SendPacketPoolHandle != NULL) + { + + // + // Free the packet pool that is used to send packets below + // + + NdisFreePacketPool(pAdapt->SendPacketPoolHandle); + + pAdapt->SendPacketPoolHandle = NULL; + + } +} + diff --git a/dummynet2/missing.h b/dummynet2/missing.h new file mode 100644 index 0000000..b48981e --- /dev/null +++ b/dummynet2/missing.h @@ -0,0 +1,639 @@ +/* + * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: missing.h 11275 2012-06-10 17:27:40Z marta $ + * + * Header for kernel variables and functions that are not available in + * userland. + */ + +#ifndef _MISSING_H_ +#define _MISSING_H_ + +/* sysctl.h and module.h are included before cdefs.h + * because of cdefs.h defines __unused */ + +#include +#include +#include +#include + +/* portability features, to be set before the rest: */ +#define HAVE_NET_IPLEN /* iplen/ipoff in net format */ +#define WITHOUT_BPF /* do not use bpf logging */ + +#ifdef _WIN32 + +#ifndef DEFINE_SPINLOCK +#define DEFINE_SPINLOCK(x) FAST_MUTEX x +#endif +/* spinlock --> Guarded Mutex KGUARDED_MUTEX */ +/* http://www.reactos.org/wiki/index.php/Guarded_Mutex */ +#define spin_lock_init(_l) +#define spin_lock_bh(_l) +#define spin_unlock_bh(_l) + +#include /* bsd-compat.c */ +#include /* bsd-compat.c */ +#include /* local version */ +#define INADDR_TO_IFP(a, b) b = NULL + +#else /* __linux__ */ + +#define MALLOC_DECLARE(x) /* nothing */ +#include /* do_gettimeofday */ +#include /* local version */ +struct inpcb; + +/* + * Kernel locking support. + * FreeBSD uses mtx in dummynet.c and struct rwlock ip_fw2.c + * + * In linux we use spinlock_bh to implement both. 
+ * For 'struct rwlock' we need an #ifdef to change it to spinlock_t + */ + +#ifndef DEFINE_SPINLOCK /* this is for linux 2.4 */ +#define DEFINE_SPINLOCK(x) spinlock_t x = SPIN_LOCK_UNLOCKED +#endif + + +#define rw_assert(a, b) +#define rw_destroy(_l) +#define rw_init(_l, msg) spin_lock_init(_l) +#define rw_rlock(_l) spin_lock_bh(_l) +#define rw_runlock(_l) spin_unlock_bh(_l) +#define rw_wlock(_l) spin_lock_bh(_l) +#define rw_wunlock(_l) spin_unlock_bh(_l) +#define rw_init_flags(_l, s, v) + +#define mtx_assert(a, b) +#define mtx_destroy(m) +#define mtx_init(m, a,b,c) spin_lock_init(m) +#define mtx_lock(_l) spin_lock_bh(_l) +#define mtx_unlock(_l) spin_unlock_bh(_l) + +#endif /* __linux__ */ +/* end of locking support */ + +/* + * Reference to an ipfw rule that can be carried outside critical sections. + * A rule is identified by rulenum:rule_id which is ordered. + * In version chain_id the rule can be found in slot 'slot', so + * we don't need a lookup if chain_id == chain->id. + * + * On exit from the firewall this structure refers to the rule after + * the matching one (slot points to the new rule; rulenum:rule_id-1 + * is the matching rule), and additional info (e.g. info often contains + * the insn argument or tablearg in the low 16 bits, in host format). + * On entry, the structure is valid if slot>0, and refers to the starting + * rules. 'info' contains the reason for reinject, e.g. divert port, + * divert direction, and so on. + */ +struct ipfw_rule_ref { + uint32_t slot; /* slot for matching rule */ + uint32_t rulenum; /* matching rule number */ + uint32_t rule_id; /* matching rule id */ + uint32_t chain_id; /* ruleset id */ + uint32_t info; /* see below */ +}; + +enum { + IPFW_INFO_MASK = 0x0000ffff, + IPFW_INFO_OUT = 0x00000000, /* outgoing, just for convenience */ + IPFW_INFO_IN = 0x80000000, /* incoming, overloads dir */ + IPFW_ONEPASS = 0x40000000, /* One-pass, do not reinject */ + IPFW_IS_MASK = 0x30000000, /* which source ? 
*/ + IPFW_IS_DIVERT = 0x20000000, + IPFW_IS_DUMMYNET =0x10000000, + IPFW_IS_PIPE = 0x08000000, /* pipe=1, queue = 0 */ +}; + +/* in netinet/in.h */ +#define in_nullhost(x) ((x).s_addr == INADDR_ANY) + +/* bzero not present on linux, but this should go in glue.h */ +#define bzero(s, n) memset(s, 0, n) +#define bcmp(p1, p2, n) memcmp(p1, p2, n) + +/* ethernet stuff */ +#define ETHERTYPE_IP 0x0800 /* IP protocol */ +//#define ETHER_ADDR_LEN 6 /* length of an Ethernet address */ +struct ether_header { + u_char ether_dhost[ETHER_ADDR_LEN]; + u_char ether_shost[ETHER_ADDR_LEN]; + u_short ether_type; +}; + +#define ETHER_TYPE_LEN 2 /* length of the Ethernet type field */ +#define ETHER_HDR_LEN (ETHER_ADDR_LEN*2+ETHER_TYPE_LEN) + +/* + * Historically, BSD keeps ip_len and ip_off in host format + * when doing layer 3 processing, and this often requires + * to translate the format back and forth. + * To make the process explicit, we define a couple of macros + * that also take into account the fact that at some point + * we may want to keep those fields always in net format. 
+ */ + +#if (BYTE_ORDER == BIG_ENDIAN) || defined(HAVE_NET_IPLEN) +#define SET_NET_IPLEN(p) do {} while (0) +#define SET_HOST_IPLEN(p) do {} while (0) +#else /* never on linux */ +#define SET_NET_IPLEN(p) do { \ + struct ip *h_ip = (p); \ + h_ip->ip_len = htons(h_ip->ip_len); \ + h_ip->ip_off = htons(h_ip->ip_off); \ + } while (0) + +#define SET_HOST_IPLEN(p) do { \ + struct ip *h_ip = (p); \ + h_ip->ip_len = ntohs(h_ip->ip_len); \ + h_ip->ip_off = ntohs(h_ip->ip_off); \ + } while (0) +#endif /* !HAVE_NET_IPLEN */ + +/* ip_dummynet.c */ +#define __FreeBSD_version 500035 + +#ifdef __linux__ +struct moduledata; +int my_mod_register(const char *name, + int order, struct moduledata *mod, void *init, void *uninit); + +/* define some macros for ip_dummynet */ + +struct malloc_type { +}; + +#define MALLOC_DEFINE(type, shortdesc, longdesc) \ + struct malloc_type type[1]; void *md_dummy_ ## type = type + +#define CTASSERT(x) + +/* log() ignores its first argument: every message goes through printk(KERN_ERR ...) */ +#define LOG_ERR 0x100 +#define LOG_INFO 0x200 +#define log(_level, fmt, arg...) 
do { \ + int __unused _qwerty=_level; printk(KERN_ERR fmt, ##arg); } while (0) + +/* + * gettimeofday would be in sys/time.h but it is not + * visible if _KERNEL is defined + */ +int gettimeofday(struct timeval *, struct timezone *); + +#else /* _WIN32 */ +#define MALLOC_DEFINE(a,b,c) +#endif /* _WIN32 */ + +extern int hz; +extern long tick; /* exists in 2.4 but not in 2.6 */ +extern int bootverbose; +extern struct timeval boottime; + +/* time_uptime is a FreeBSD variable incremented once per second; map it to the platform's clock */ +#ifdef __linux__ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,37) /* revise boundaries */ +#define time_uptime get_seconds() +#else /* OpenWRT */ +#define time_uptime CURRENT_TIME +#endif +#else /* WIN32 */ +#define time_uptime time_uptime_w32() +#endif + +extern int max_linkhdr; +extern int ip_defttl; +extern u_long in_ifaddrhmask; /* mask for hash table */ +extern struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ + +/*-------------------------------------------------*/ + +/* defines, includes and functions missing in linux */ +/* include and define */ +#include /* inet_ntoa */ + +struct mbuf; + +/* used by ip_dummynet.c */ +void reinject_drop(struct mbuf* m); + +#include /* error define */ +#include /* IFNAMSIZ */ + +void rn_init(int); +/* + * some network structures can be defined in the bsd way + * by using the _FAVOR_BSD definition. This is not true + * for the icmp structure. + * XXX struct icmp contains bsd names in + * /usr/include/netinet/ip_icmp.h + */ +#ifdef __linux__ +#define icmp_code code +#define icmp_type type + +/* linux in6_addr has no member __u6_addr + * replace the whole structure ? 
+ */ +#define __u6_addr in6_u +#define __u6_addr32 u6_addr32 +#endif /* __linux__ */ + +/* defined in linux/sctp.h with no bsd definition */ +struct sctphdr { + uint16_t src_port; /* source port */ + uint16_t dest_port; /* destination port */ + uint32_t v_tag; /* verification tag of packet */ + uint32_t checksum; /* Adler32 C-Sum */ + /* chunks follow... 
*/ +}; + +/* missing definitions */ +#define TH_FIN 0x01 +#define TH_SYN 0x02 +#define TH_RST 0x04 +#define TH_ACK 0x10 + +#define RTF_CLONING 0x100 /* generate new routes on use */ + +#define IPPROTO_OSPFIGP 89 /* OSPFIGP */ +#define IPPROTO_CARP 112 /* CARP */ +#ifndef _WIN32 +#define IPPROTO_IPV4 IPPROTO_IPIP /* for compatibility */ +#endif + +#define CARP_VERSION 2 +#define CARP_ADVERTISEMENT 0x01 + +#define PRIV_NETINET_IPFW 491 /* Administer IPFW firewall. */ + +#define IP_FORWARDING 0x1 /* most of ip header exists */ + +#define NETISR_IP 2 /* same as AF_INET */ + +#define PRIV_NETINET_DUMMYNET 494 /* Administer DUMMYNET. */ + +extern int securelevel; + +struct carp_header { /* only the version/type byte is declared here */ +#if BYTE_ORDER == LITTLE_ENDIAN + u_int8_t carp_type:4, + carp_version:4; +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_int8_t carp_version:4, + carp_type:4; +#endif +}; + +struct pim { + int dummy; /* windows compiler does not like empty definition */ +}; + +#ifndef _WIN32 +struct route { + struct rtentry *ro_rt; + struct sockaddr ro_dst; +}; +#endif + +struct ifaltq { + void *ifq_head; +}; + +/* + * ifnet->if_snd is used in ip_dummynet.c to take the transmission + * clock. + */ +#if defined( __linux__) +#define if_xname name +#define if_snd XXX +#elif defined( _WIN32 ) +/* used in ip_dummynet.c */ +struct ifnet { + char if_xname[IFNAMSIZ]; /* external name (name + unit) */ +// struct ifaltq if_snd; /* output queue (includes altq) */ +}; + +struct net_device { + char if_xname[IFNAMSIZ]; /* external name (name + unit) */ +}; +#endif + +/* involves mbufs */ +int in_cksum(struct mbuf *m, int len); +#define divert_cookie(mtag) 0 +#define divert_info(mtag) 0 +#define pf_find_mtag(a) NULL +#define pf_get_mtag(a) NULL +#ifndef _WIN32 +#define AF_LINK AF_ASH /* ? 
our sys/socket.h */ +#endif + +/* search the local ip addresses, used for the "me" keyword; stores the lookup result in b */ +#include +#define INADDR_TO_IFP(ip, b) \ + (b) = ip_dev_find((struct net *)&init_net, (ip).s_addr) + +/* we don't pullup, either success or free and fail; note that m is evaluated more than once */ +#define m_pullup(m, x) \ + ((m)->m_len >= (x) ? (m) : (FREE_PKT(m), NULL)) + +struct pf_mtag { + void *hdr; /* saved hdr pos in mbuf, for ECN */ + sa_family_t af; /* for ECN */ + u_int32_t qid; /* queue id */ +}; + +#if 0 // ndef radix +/* radix stuff in radix.h and radix.c */ +struct radix_node { + caddr_t rn_key; /* object of search */ + caddr_t rn_mask; /* netmask, if present */ +}; +#endif /* !radix */ + +/* missing kernel functions */ +char *inet_ntoa(struct in_addr ina); +int random(void); + +/* + * Return the result of a/b + * + * this is used in linux kernel space, + * since the 64bit division needs to + * be done using a macro + */ +int64_t +div64(int64_t a, int64_t b); + +char * +inet_ntoa_r(struct in_addr ina, char *buf); + +/* from bsd sys/queue.h: iteration that stays valid when var is removed inside the loop body */ +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = TAILQ_FIRST((head)); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define SLIST_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = SLIST_FIRST((head)); \ + (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +/* depending on the linux version */ +#ifndef ETHERTYPE_IPV6 +#define ETHERTYPE_IPV6 0x86dd /* IP protocol version 6 */ +#endif + +/*-------------------------------------------------*/ +#define RT_NUMFIBS 1 +extern u_int rt_numfibs; + +/* involves kernel locking function; no trailing ';' so that "if (c) RTFREE(x); else ..." 
still parses */ +#ifdef RTFREE +#undef RTFREE +#define RTFREE(a) fprintf(stderr, "RTFREE: commented out locks\n") +#endif + +void getmicrouptime(struct timeval *tv); + +/* from sys/netinet/ip_output.c */ +struct ip_moptions; +struct route; +struct ip; + +struct mbuf *ip_reass(struct mbuf *); +u_short in_cksum_hdr(struct ip *); +int ip_output(struct mbuf *m, struct mbuf *opt, struct route 
*ro, int flags, + struct ip_moptions *imo, struct inpcb *inp); + +/* from net/netisr.c */ +void netisr_dispatch(int num, struct mbuf *m); + +/* definition moved in missing.c */ +int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len); + +int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen); + +/* defined in session.c */ +int priv_check(struct thread *td, int priv); + +/* struct ucred is in linux/socket.h and has pid, uid, gid. + * We need a 'bsd_ucred' to store also the extra info + */ + +struct bsd_ucred { + uid_t uid; + gid_t gid; + uint32_t xid; + uint32_t nid; +}; + +int +cred_check(void *insn, int proto, struct ifnet *oif, + struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, + u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp, + struct sk_buff *skb); + +int securelevel_ge(struct ucred *cr, int level); + +struct sysctl_oid; +struct sysctl_req; + +#ifdef _WIN32 +#define module_param_named(_name, _var, _ty, _perm) +#else /* !_WIN32 */ + +/* Linux 2.4 is mostly for openwrt */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) +#include /* generic_ffs() used in ip_fw2.c */ +typedef uint32_t __be32; +typedef uint16_t __be16; +struct sock; +struct net; +struct inet_hashinfo; +struct sock *inet_lookup( + struct inet_hashinfo *hashinfo, + const __be32 saddr, const __be16 sport, + const __be32 daddr, const __be16 dport, + const int dif); +struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif); +#endif /* Linux < 2.6 */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) && \ + LINUX_VERSION_CODE > KERNEL_VERSION(2,6,16) /* XXX NOT sure, in 2.6.9 give an error */ +#define module_param_named(_name, _var, _ty, _perm) \ + //module_param(_name, _ty, 0644) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) +typedef unsigned long uintptr_t; + +#ifdef __i386__ /* local __fls() built on the x86 "bsr" instruction */ +static inline unsigned long __fls(unsigned long word) +{ + asm("bsr %1,%0" + : "=r" (word) 
+ : "rm" (word)); + return word; +} 
+#endif /* __i386__ */ + +#endif /* LINUX < 2.6.24 */ + +#endif /* !_WIN32 so maybe __linux__ */ + +#if defined (__linux__) && !defined (EMULATE_SYSCTL) /* sysctl variables become module parameters; the other SYSCTL_* declarations expand to nothing */ +#define SYSCTL_DECL(_1) +#define SYSCTL_OID(_1, _2, _3, _4, _5, _6, _7, _8) +#define SYSCTL_NODE(_1, _2, _3, _4, _5, _6) +#define _SYSCTL_BASE(_name, _var, _ty, _perm) \ + module_param_named(_name, *(_var), _ty, \ + ( (_perm) == CTLFLAG_RD) ? 0444: 0644 ) +#define SYSCTL_PROC(_base, _oid, _name, _mode, _var, _val, _desc, _a, _b) + +#define SYSCTL_INT(_base, _oid, _name, _mode, _var, _val, _desc) \ + _SYSCTL_BASE(_name, _var, int, _mode) + +#define SYSCTL_LONG(_base, _oid, _name, _mode, _var, _val, _desc) \ + _SYSCTL_BASE(_name, _var, long, _mode) + +#define SYSCTL_ULONG(_base, _oid, _name, _mode, _var, _val, _desc) \ + _SYSCTL_BASE(_name, _var, ulong, _mode) + +#define SYSCTL_UINT(_base, _oid, _name, _mode, _var, _val, _desc) \ + _SYSCTL_BASE(_name, _var, uint, _mode) + +#define TUNABLE_INT(_name, _ptr) + +#define SYSCTL_VNET_PROC SYSCTL_PROC +#define SYSCTL_VNET_INT SYSCTL_INT + +#endif /* __linux__ && !EMULATE_SYSCTL */ + +#define SYSCTL_HANDLER_ARGS \ + struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req +int sysctl_handle_int(SYSCTL_HANDLER_ARGS); +int sysctl_handle_long(SYSCTL_HANDLER_ARGS); + + +void ether_demux(struct ifnet *ifp, struct mbuf *m); + +int ether_output_frame(struct ifnet *ifp, struct mbuf *m); + +void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum); + +void icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu); + +void rtfree(struct rtentry *rt); + +u_short in_cksum_skip(struct mbuf *m, int len, int skip); + +#ifdef INP_LOCK_ASSERT +#undef INP_LOCK_ASSERT +#define INP_LOCK_ASSERT(a) +#endif + +int jailed(struct ucred *cred); + +/* +* Return 1 if an internet address is for a ``local'' host +* (one to which we have a connection). 
If subnetsarelocal +* is true, this includes other subnets of the local net. +* Otherwise, it includes only the directly-connected (sub)nets. 
+*/ +int in_localaddr(struct in_addr in); + +/* the prototype is already in the headers */ +//int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); + +int fnmatch(const char *pattern, const char *string, int flags); + +int +linux_lookup(const int proto, const __be32 saddr, const __be16 sport, + const __be32 daddr, const __be16 dport, + struct sk_buff *skb, int dir, struct bsd_ucred *u); + +/* vnet wrappers, in vnet.h and ip_var.h */ +//int ipfw_init(void); +//void ipfw_destroy(void); + +#define MTAG_IPFW 1148380143 /* IPFW-tagged cookie */ +#define MTAG_IPFW_RULE 1262273568 /* rule reference */ + +struct ip_fw_args; +extern int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa); + +#define curvnet NULL +#define CURVNET_SET(_v) +#define CURVNET_RESTORE() +#define VNET_ASSERT(condition) + +#define VNET_NAME(n) n +#define VNET_DECLARE(t, n) extern t n +#define VNET_DEFINE(t, n) t n +#define _VNET_PTR(b, n) &VNET_NAME(n) +/* + * Virtualized global variable accessor macros. + */ +#define VNET_VNET_PTR(vnet, n) (&(n)) +#define VNET_VNET(vnet, n) (n) + +#define VNET_PTR(n) (&(n)) +#define VNET(n) (n) + +VNET_DECLARE(int, ip_defttl); +#define V_ip_defttl VNET(ip_defttl); + +int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, + int dir, struct inpcb *inp); + +/* hooks for divert */ +extern void (*ip_divert_ptr)(struct mbuf *m, int incoming); + +extern int (*ip_dn_ctl_ptr)(struct sockopt *); +typedef int ip_fw_ctl_t(struct sockopt *); +extern ip_fw_ctl_t *ip_fw_ctl_ptr; + +/* netgraph prototypes */ +typedef int ng_ipfw_input_t(struct mbuf **, int, struct ip_fw_args *, int); +extern ng_ipfw_input_t *ng_ipfw_input_p; + +/* For kernel ipfw_ether and ipfw_bridge. 
*/ +struct ip_fw_args; +typedef int ip_fw_chk_t(struct ip_fw_args *args); +extern ip_fw_chk_t *ip_fw_chk_ptr; + +#define V_ip_fw_chk_ptr VNET(ip_fw_chk_ptr) +#define V_ip_fw_ctl_ptr VNET(ip_fw_ctl_ptr) +#define V_tcbinfo VNET(tcbinfo) +#define V_udbinfo VNET(udbinfo) + +#endif /* !_MISSING_H_ */ diff --git a/dummynet2/netipfw.inf b/dummynet2/netipfw.inf new file mode 100644 index 0000000..af0f716 --- /dev/null +++ b/dummynet2/netipfw.inf @@ -0,0 +1,79 @@ +; version section +[Version] +Signature = "$Windows NT$" +Class = NetService +ClassGUID = {4D36E974-E325-11CE-BFC1-08002BE10318} +Provider = %Unipi% +DriverVer = 26/02/2010,3.0.0.1 + +; manufacturer section +[Manufacturer] +%Unipi% = UNIPI,NTx86 + +; control flags section +; optional, unused in netipfw.inf inf, used in netipfw_m.inf +[ControlFlags] + +; models section +[UNIPI] ; Win2k +%Desc% = Ipfw.ndi, unipi_ipfw +[UNIPI.NTx86] ;For WinXP and later +%Desc% = Ipfw.ndi, unipi_ipfw + +; ddinstall section +[Ipfw.ndi] +AddReg = Ipfw.ndi.AddReg, Ipfw.AddReg +Characteristics = 0x4410 ; NCF_FILTER | NCF_NDIS_PROTOCOL !--Filter Specific--!! 
+CopyFiles = Ipfw.Files.Sys +CopyInf = netipfw_m.inf + +; remove section +[Ipfw.ndi.Remove] +DelFiles = Ipfw.Files.Sys + +;ddinstall.services section +[Ipfw.ndi.Services] +AddService = Ipfw,,Ipfw.AddService + +[Ipfw.AddService] +DisplayName = %ServiceDesc% +ServiceType = 1 ;SERVICE_KERNEL_DRIVER +StartType = 3 ;SERVICE_DEMAND_START +ErrorControl = 1 ;SERVICE_ERROR_NORMAL +ServiceBinary = %12%\ipfw.sys +AddReg = Ipfw.AddService.AddReg + +[Ipfw.AddService.AddReg] + +;file copy related sections +[SourceDisksNames] +1=%DiskDescription%,"",, + +[SourceDisksFiles] +ipfw.sys=1 + +[DestinationDirs] +DefaultDestDir = 12 +Ipfw.Files.Sys = 12 ; %windir%\System32\drivers + +; ddinstall->copyfiles points here +[Ipfw.Files.Sys] +ipfw.sys,,,2 + +; ddinstall->addreg points here +[Ipfw.ndi.AddReg] +HKR, Ndi, HelpText, , %HELP% ; this is displayed at the bottom of the General page of the Connection Properties dialog box +HKR, Ndi, FilterClass, , failover +HKR, Ndi, FilterDeviceInfId, , unipi_ipfwmp +HKR, Ndi, Service, , Ipfw +HKR, Ndi\Interfaces, UpperRange, , noupper +HKR, Ndi\Interfaces, LowerRange, , nolower +HKR, Ndi\Interfaces, FilterMediaTypes, , "ethernet, tokenring, fddi, wan" + +;strings section +[Strings] +Unipi = "Unipi" +DiskDescription = "Ipfw Driver Disk" +Desc = "ipfw+dummynet" +HELP = "This is ipfw and dummynet network emulator, developed by unipi.it" +ServiceDesc = "ipfw service" diff --git a/dummynet2/netipfw_m.inf b/dummynet2/netipfw_m.inf new file mode 100644 index 0000000..9174c0d --- /dev/null +++ b/dummynet2/netipfw_m.inf @@ -0,0 +1,54 @@ +; version section +[Version] +Signature = "$Windows NT$" +Class = Net +ClassGUID = {4D36E972-E325-11CE-BFC1-08002BE10318} +Provider = %Unipi% +DriverVer = 26/02/2010,3.0.0.1 + +; control flags section +; optional, unused in netipfw.inf inf, used in netipfw_m.inf +[ControlFlags] +ExcludeFromSelect = unipi_ipfwmp + +; destinationdirs section, optional +[DestinationDirs] +DefaultDestDir=12 +; No files to copy + +; manufacturer 
section +[Manufacturer] +%Unipi% = UNIPI,NTx86 + +; models section +[UNIPI] ; Win2k +%Desc% = IpfwMP.ndi, unipi_ipfwmp +[UNIPI.NTx86] ;For WinXP and later +%Desc% = IpfwMP.ndi, unipi_ipfwmp + +; ddinstall section +[IpfwMP.ndi] +AddReg = IpfwMP.ndi.AddReg +Characteristics = 0x29 ;NCF_NOT_USER_REMOVABLE | NCF_VIRTUAL | NCF_HIDDEN + +; ddinstall->addreg points here +[IpfwMP.ndi.AddReg] +HKR, Ndi, Service, 0, IpfwMP + +;ddinstall.services section +[IpfwMP.ndi.Services] +AddService = IpfwMP,0x2, IpfwMP.AddService + +[IpfwMP.AddService] +ServiceType = 1 ;SERVICE_KERNEL_DRIVER +StartType = 3 ;SERVICE_DEMAND_START +ErrorControl = 1 ;SERVICE_ERROR_NORMAL +ServiceBinary = %12%\ipfw.sys +AddReg = IpfwMP.AddService.AddReg + +[IpfwMP.AddService.AddReg] +; None + +[Strings] +Unipi = "Unipi" +Desc = "Ipfw Miniport" \ No newline at end of file diff --git a/dummynet2/passthru.c b/dummynet2/passthru.c new file mode 100644 index 0000000..c366173 --- /dev/null +++ b/dummynet2/passthru.c @@ -0,0 +1,469 @@ +/*++ + +Copyright (c) 1992-2000 Microsoft Corporation + +Module Name: + + passthru.c + +Abstract: + + Ndis Intermediate Miniport driver sample. This is a passthru driver. 
+ +Author: + +Environment: + + +Revision History: + + +--*/ + + +#include "precomp.h" +#pragma hdrstop + +#pragma NDIS_INIT_FUNCTION(DriverEntry) + +NDIS_HANDLE ProtHandle = NULL; +NDIS_HANDLE DriverHandle = NULL; +NDIS_MEDIUM MediumArray[4] = + { + NdisMedium802_3, // Ethernet + NdisMedium802_5, // Token-ring + NdisMediumFddi, // Fddi + NdisMediumWan // NDISWAN + }; + +NDIS_SPIN_LOCK GlobalLock; + +PADAPT pAdaptList = NULL; +LONG MiniportCount = 0; + +NDIS_HANDLE NdisWrapperHandle; + +// +// To support ioctls from user-mode: +// + +#define STR2(x) #x +#define STR(x) STR2(x) +#define DOSPREFIX "\\DosDevices\\" +#define NTPREFIX "\\Device\\" +#define WIDEN2(x) L ## x +#define WIDEN(x) WIDEN2(x) +#define LINKNAME_STRING WIDEN(DOSPREFIX) WIDEN(STR(MODULENAME)) +#define NTDEVICE_STRING WIDEN(NTPREFIX) WIDEN(STR(MODULENAME)) +#define PROTOCOLNAME_STRING WIDEN(STR(MODULENAME)) + +NDIS_HANDLE NdisDeviceHandle = NULL; +PDEVICE_OBJECT ControlDeviceObject = NULL; + +enum _DEVICE_STATE +{ + PS_DEVICE_STATE_READY = 0, // ready for create/delete + PS_DEVICE_STATE_CREATING, // create operation in progress + PS_DEVICE_STATE_DELETING // delete operation in progress +} ControlDeviceState = PS_DEVICE_STATE_READY; + + + +NTSTATUS +DriverEntry( + IN PDRIVER_OBJECT DriverObject, + IN PUNICODE_STRING RegistryPath + ) +/*++ + +Routine Description: + + First entry point to be called, when this driver is loaded. + Register with NDIS as an intermediate driver. + +Arguments: + + DriverObject - pointer to the system's driver object structure + for this driver + + RegistryPath - system's registry path for this driver + +Return Value: + + STATUS_SUCCESS if all initialization is successful, STATUS_XXX + error code if not. 
+ +--*/ +{ + NDIS_STATUS Status; + NDIS_PROTOCOL_CHARACTERISTICS PChars; + NDIS_MINIPORT_CHARACTERISTICS MChars; + NDIS_STRING Name; + + Status = NDIS_STATUS_SUCCESS; + NdisAllocateSpinLock(&GlobalLock); + + NdisMInitializeWrapper(&NdisWrapperHandle, DriverObject, RegistryPath, NULL); + + do + { + // + // Register the miniport with NDIS. Note that it is the miniport + // which was started as a driver and not the protocol. Also the miniport + // must be registered prior to the protocol since the protocol's BindAdapter + // handler can be initiated anytime and when it is, it must be ready to + // start driver instances. + // + + NdisZeroMemory(&MChars, sizeof(NDIS_MINIPORT_CHARACTERISTICS)); + + MChars.MajorNdisVersion = PASSTHRU_MAJOR_NDIS_VERSION; + MChars.MinorNdisVersion = PASSTHRU_MINOR_NDIS_VERSION; + + MChars.InitializeHandler = MPInitialize; + MChars.QueryInformationHandler = MPQueryInformation; + MChars.SetInformationHandler = MPSetInformation; + MChars.ResetHandler = NULL; + MChars.TransferDataHandler = MPTransferData; + MChars.HaltHandler = MPHalt; +#ifdef NDIS51_MINIPORT + MChars.CancelSendPacketsHandler = MPCancelSendPackets; + MChars.PnPEventNotifyHandler = MPDevicePnPEvent; + MChars.AdapterShutdownHandler = MPAdapterShutdown; +#endif // NDIS51_MINIPORT + + // + // We will disable the check for hang timeout so we do not + // need a check for hang handler! + // + MChars.CheckForHangHandler = NULL; + MChars.ReturnPacketHandler = MPReturnPacket; + + // + // Either the Send or the SendPackets handler should be specified. 
+ // If SendPackets handler is specified, SendHandler is ignored + // + MChars.SendHandler = MPSend; // IPFW: use MPSend, not SendPackets + MChars.SendPacketsHandler = NULL; + + Status = NdisIMRegisterLayeredMiniport(NdisWrapperHandle, + &MChars, + sizeof(MChars), + &DriverHandle); + if (Status != NDIS_STATUS_SUCCESS) + { + break; + } + +#ifndef WIN9X + NdisMRegisterUnloadHandler(NdisWrapperHandle, PtUnload); +#endif + + // + // Now register the protocol. + // + NdisZeroMemory(&PChars, sizeof(NDIS_PROTOCOL_CHARACTERISTICS)); + PChars.MajorNdisVersion = PASSTHRU_PROT_MAJOR_NDIS_VERSION; + PChars.MinorNdisVersion = PASSTHRU_PROT_MINOR_NDIS_VERSION; + + // + // Make sure the protocol-name matches the service-name + // (from the INF) under which this protocol is installed. + // This is needed to ensure that NDIS can correctly determine + // the binding and call us to bind to miniports below. + // + NdisInitUnicodeString(&Name, PROTOCOLNAME_STRING); // Protocol name + PChars.Name = Name; + PChars.OpenAdapterCompleteHandler = PtOpenAdapterComplete; + PChars.CloseAdapterCompleteHandler = PtCloseAdapterComplete; + PChars.SendCompleteHandler = PtSendComplete; + PChars.TransferDataCompleteHandler = PtTransferDataComplete; + + PChars.ResetCompleteHandler = PtResetComplete; + PChars.RequestCompleteHandler = PtRequestComplete; + PChars.ReceiveHandler = PtReceive; + PChars.ReceiveCompleteHandler = PtReceiveComplete; + PChars.StatusHandler = PtStatus; + PChars.StatusCompleteHandler = PtStatusComplete; + PChars.BindAdapterHandler = PtBindAdapter; + PChars.UnbindAdapterHandler = PtUnbindAdapter; + PChars.UnloadHandler = PtUnloadProtocol; + + PChars.ReceivePacketHandler = PtReceivePacket; + PChars.PnPEventHandler= PtPNPHandler; + + NdisRegisterProtocol(&Status, + &ProtHandle, + &PChars, + sizeof(NDIS_PROTOCOL_CHARACTERISTICS)); + + if (Status != NDIS_STATUS_SUCCESS) + { + NdisIMDeregisterLayeredMiniport(DriverHandle); + break; + } + + NdisIMAssociateMiniport(DriverHandle, 
ProtHandle); + } + while (FALSE); + + if (Status != NDIS_STATUS_SUCCESS) + { + NdisTerminateWrapper(NdisWrapperHandle, NULL); + } + + ipfw_module_init(); // IPFW - start the system + + return(Status); +} + + +NDIS_STATUS +PtRegisterDevice( + VOID + ) +/*++ + +Routine Description: + + Register an ioctl interface - a device object to be used for this + purpose is created by NDIS when we call NdisMRegisterDevice. + + This routine is called whenever a new miniport instance is + initialized. However, we only create one global device object, + when the first miniport instance is initialized. This routine + handles potential race conditions with PtDeregisterDevice via + the ControlDeviceState and MiniportCount variables. + + NOTE: do not call this from DriverEntry; it will prevent the driver + from being unloaded (e.g. on uninstall). + +Arguments: + + None + +Return Value: + + NDIS_STATUS_SUCCESS if we successfully register a device object. + +--*/ +{ + NDIS_STATUS Status = NDIS_STATUS_SUCCESS; + UNICODE_STRING DeviceName; + UNICODE_STRING DeviceLinkUnicodeString; + PDRIVER_DISPATCH DispatchTable[IRP_MJ_MAXIMUM_FUNCTION+1]; + + DBGPRINT(("==>PtRegisterDevice\n")); + + NdisAcquireSpinLock(&GlobalLock); + + ++MiniportCount; + + if (1 == MiniportCount) + { + ASSERT(ControlDeviceState != PS_DEVICE_STATE_CREATING); + + // + // Another thread could be running PtDeregisterDevice on + // behalf of another miniport instance. If so, wait for + // it to exit. + // + while (ControlDeviceState != PS_DEVICE_STATE_READY) + { + NdisReleaseSpinLock(&GlobalLock); + NdisMSleep(1); + NdisAcquireSpinLock(&GlobalLock); + } + + ControlDeviceState = PS_DEVICE_STATE_CREATING; + + NdisReleaseSpinLock(&GlobalLock); + + + NdisZeroMemory(DispatchTable, (IRP_MJ_MAXIMUM_FUNCTION+1) * sizeof(PDRIVER_DISPATCH)); + + DispatchTable[IRP_MJ_CREATE] = PtDispatch; + DispatchTable[IRP_MJ_CLEANUP] = PtDispatch; + DispatchTable[IRP_MJ_CLOSE] = PtDispatch; + // IPFW we use DevIoControl ? 
+ DispatchTable[IRP_MJ_DEVICE_CONTROL] = DevIoControl; + + + NdisInitUnicodeString(&DeviceName, NTDEVICE_STRING); + NdisInitUnicodeString(&DeviceLinkUnicodeString, LINKNAME_STRING); + + // + // Create a device object and register our dispatch handlers + // + + Status = NdisMRegisterDevice( + NdisWrapperHandle, + &DeviceName, + &DeviceLinkUnicodeString, + &DispatchTable[0], + &ControlDeviceObject, + &NdisDeviceHandle + ); + + NdisAcquireSpinLock(&GlobalLock); + + ControlDeviceState = PS_DEVICE_STATE_READY; + } + + NdisReleaseSpinLock(&GlobalLock); + + DBGPRINT(("<==PtRegisterDevice: %x\n", Status)); + + return (Status); +} + + +NTSTATUS +PtDispatch( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp + ) +/*++ +Routine Description: + + Process IRPs sent to this device. + +Arguments: + + DeviceObject - pointer to a device object + Irp - pointer to an I/O Request Packet + +Return Value: + + NTSTATUS - STATUS_SUCCESS always - change this when adding + real code to handle ioctls. + +--*/ +{ + PIO_STACK_LOCATION irpStack; + NTSTATUS status = STATUS_SUCCESS; + + UNREFERENCED_PARAMETER(DeviceObject); + + DBGPRINT(("==>Pt Dispatch\n")); + irpStack = IoGetCurrentIrpStackLocation(Irp); + + + switch (irpStack->MajorFunction) + { + case IRP_MJ_CREATE: + break; + + case IRP_MJ_CLEANUP: + break; + + case IRP_MJ_CLOSE: + break; + + case IRP_MJ_DEVICE_CONTROL: + // + // Add code here to handle ioctl commands sent to passthru. + // + break; + default: + break; + } + + Irp->IoStatus.Status = status; + IoCompleteRequest(Irp, IO_NO_INCREMENT); + + DBGPRINT(("<== Pt Dispatch\n")); + + return status; + +} + + +NDIS_STATUS +PtDeregisterDevice( + VOID + ) +/*++ + +Routine Description: + + Deregister the ioctl interface. This is called whenever a miniport + instance is halted. 
When the last miniport instance is halted, we + request NDIS to delete the device object + +Arguments: + + NdisDeviceHandle - Handle returned by NdisMRegisterDevice + +Return Value: + + NDIS_STATUS_SUCCESS if everything worked ok + +--*/ +{ + NDIS_STATUS Status = NDIS_STATUS_SUCCESS; + + DBGPRINT(("==>PassthruDeregisterDevice\n")); + + NdisAcquireSpinLock(&GlobalLock); + + ASSERT(MiniportCount > 0); + + --MiniportCount; + + if (0 == MiniportCount) + { + // + // All miniport instances have been halted. Deregister + // the control device. + // + + ASSERT(ControlDeviceState == PS_DEVICE_STATE_READY); + + // + // Block PtRegisterDevice() while we release the control + // device lock and deregister the device. + // + ControlDeviceState = PS_DEVICE_STATE_DELETING; + + NdisReleaseSpinLock(&GlobalLock); + + if (NdisDeviceHandle != NULL) + { + Status = NdisMDeregisterDevice(NdisDeviceHandle); + NdisDeviceHandle = NULL; + } + + NdisAcquireSpinLock(&GlobalLock); + ControlDeviceState = PS_DEVICE_STATE_READY; + } + + NdisReleaseSpinLock(&GlobalLock); + + DBGPRINT(("<== PassthruDeregisterDevice: %x\n", Status)); + return Status; + +} + +VOID +PtUnload( + IN PDRIVER_OBJECT DriverObject + ) +// +// PassThru driver unload function +// +{ + UNREFERENCED_PARAMETER(DriverObject); + + DBGPRINT(("PtUnload: entered\n")); + + PtUnloadProtocol(); + + NdisIMDeregisterLayeredMiniport(DriverHandle); + + NdisFreeSpinLock(&GlobalLock); + + ipfw_module_exit(); // IPFW unloading dummynet + + DBGPRINT(("PtUnload: done!\n")); +} diff --git a/dummynet2/passthru.h b/dummynet2/passthru.h new file mode 100644 index 0000000..6e79db7 --- /dev/null +++ b/dummynet2/passthru.h @@ -0,0 +1,500 @@ +/*++ + +Copyright (c) 1992-2000 Microsoft Corporation + +Module Name: + + passthru.h + +Abstract: + + Ndis Intermediate Miniport driver sample. This is a passthru driver. 
+ +Author: + +Environment: + + +Revision History: + + +--*/ + +#ifdef NDIS51_MINIPORT +#define PASSTHRU_MAJOR_NDIS_VERSION 5 +#define PASSTHRU_MINOR_NDIS_VERSION 1 +#else +#define PASSTHRU_MAJOR_NDIS_VERSION 4 +#define PASSTHRU_MINOR_NDIS_VERSION 0 +#endif + +#ifdef NDIS51 +#define PASSTHRU_PROT_MAJOR_NDIS_VERSION 5 +#define PASSTHRU_PROT_MINOR_NDIS_VERSION 0 +#else +#define PASSTHRU_PROT_MAJOR_NDIS_VERSION 4 +#define PASSTHRU_PROT_MINOR_NDIS_VERSION 0 +#endif + +#define MAX_BUNDLEID_LENGTH 50 + +#define TAG 'ImPa' +#define WAIT_INFINITE 0 + + + +// forward declaration of the adapter structure, which is defined later in this header +typedef struct _ADAPT ADAPT, *PADAPT; + +DRIVER_INITIALIZE DriverEntry; +extern +NTSTATUS +DriverEntry( + IN PDRIVER_OBJECT DriverObject, + IN PUNICODE_STRING RegistryPath + ); + +DRIVER_DISPATCH PtDispatch; +NTSTATUS +PtDispatch( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp + ); + +DRIVER_DISPATCH DevIoControl; +NTSTATUS +DevIoControl( + IN PDEVICE_OBJECT pDeviceObject, + IN PIRP pIrp + ); + +NDIS_STATUS +PtRegisterDevice( + VOID + ); + +NDIS_STATUS +PtDeregisterDevice( + VOID + ); + +DRIVER_UNLOAD PtUnload; +VOID +PtUnloadProtocol( + VOID + ); + +// +// Protocol prototypes +// +extern +VOID +PtOpenAdapterComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS Status, + IN NDIS_STATUS OpenErrorStatus + ); + +extern +VOID +PtCloseAdapterComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS Status + ); + +extern +VOID +PtResetComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS Status + ); + +extern +VOID +PtRequestComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_REQUEST NdisRequest, + IN NDIS_STATUS Status + ); + +extern +VOID +PtStatus( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS GeneralStatus, + IN PVOID StatusBuffer, + IN UINT StatusBufferSize + ); + +extern +VOID +PtStatusComplete( + IN NDIS_HANDLE ProtocolBindingContext + ); + +extern +VOID +PtSendComplete( + 
IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_PACKET Packet, + 
IN NDIS_STATUS Status + ); + +extern +VOID +PtTransferDataComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_PACKET Packet, + IN NDIS_STATUS Status, + IN UINT BytesTransferred + ); + +extern +NDIS_STATUS +PtReceive( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_HANDLE MacReceiveContext, + IN PVOID HeaderBuffer, + IN UINT HeaderBufferSize, + IN PVOID LookAheadBuffer, + IN UINT LookaheadBufferSize, + IN UINT PacketSize + ); + +extern +VOID +PtReceiveComplete( + IN NDIS_HANDLE ProtocolBindingContext + ); + +extern +INT +PtReceivePacket( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_PACKET Packet + ); + +extern +VOID +PtBindAdapter( + OUT PNDIS_STATUS Status, + IN NDIS_HANDLE BindContext, + IN PNDIS_STRING DeviceName, + IN PVOID SystemSpecific1, + IN PVOID SystemSpecific2 + ); + +extern +VOID +PtUnbindAdapter( + OUT PNDIS_STATUS Status, + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_HANDLE UnbindContext + ); + +VOID +PtUnload( + IN PDRIVER_OBJECT DriverObject + ); + + + +extern +NDIS_STATUS +PtPNPHandler( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNET_PNP_EVENT pNetPnPEvent + ); + + + + +NDIS_STATUS +PtPnPNetEventReconfigure( + IN PADAPT pAdapt, + IN PNET_PNP_EVENT pNetPnPEvent + ); + +NDIS_STATUS +PtPnPNetEventSetPower ( + IN PADAPT pAdapt, + IN PNET_PNP_EVENT pNetPnPEvent + ); + + +// +// Miniport prototypes +// +NDIS_STATUS +MPInitialize( + OUT PNDIS_STATUS OpenErrorStatus, + OUT PUINT SelectedMediumIndex, + IN PNDIS_MEDIUM MediumArray, + IN UINT MediumArraySize, + IN NDIS_HANDLE MiniportAdapterHandle, + IN NDIS_HANDLE WrapperConfigurationContext + ); + +VOID +MPSendPackets( + IN NDIS_HANDLE MiniportAdapterContext, + IN PPNDIS_PACKET PacketArray, + IN UINT NumberOfPackets + ); + +NDIS_STATUS +MPSend( + IN NDIS_HANDLE MiniportAdapterContext, + IN PNDIS_PACKET Packet, + IN UINT Flags + ); + +NDIS_STATUS +MPQueryInformation( + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_OID Oid, + IN PVOID InformationBuffer, + IN ULONG 
InformationBufferLength, + OUT PULONG BytesWritten, + OUT PULONG BytesNeeded + ); + +NDIS_STATUS +MPSetInformation( + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_OID Oid, + __in_bcount(InformationBufferLength) IN PVOID InformationBuffer, + IN ULONG InformationBufferLength, + OUT PULONG BytesRead, + OUT PULONG BytesNeeded + ); + +VOID +MPReturnPacket( + IN NDIS_HANDLE MiniportAdapterContext, + IN PNDIS_PACKET Packet + ); + +NDIS_STATUS +MPTransferData( + OUT PNDIS_PACKET Packet, + OUT PUINT BytesTransferred, + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_HANDLE MiniportReceiveContext, + IN UINT ByteOffset, + IN UINT BytesToTransfer + ); + +VOID +MPHalt( + IN NDIS_HANDLE MiniportAdapterContext + ); + + +VOID +MPQueryPNPCapabilities( + OUT PADAPT MiniportProtocolContext, + OUT PNDIS_STATUS Status + ); + + +#ifdef NDIS51_MINIPORT + +VOID +MPCancelSendPackets( + IN NDIS_HANDLE MiniportAdapterContext, + IN PVOID CancelId + ); + +VOID +MPAdapterShutdown( + IN NDIS_HANDLE MiniportAdapterContext + ); + +VOID +MPDevicePnPEvent( + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_DEVICE_PNP_EVENT DevicePnPEvent, + IN PVOID InformationBuffer, + IN ULONG InformationBufferLength + ); + +#endif // NDIS51_MINIPORT + +VOID +MPFreeAllPacketPools( + IN PADAPT pAdapt + ); + + +VOID +MPProcessSetPowerOid( + IN OUT PNDIS_STATUS pNdisStatus, + IN PADAPT pAdapt, + __in_bcount(InformationBufferLength) IN PVOID InformationBuffer, + IN ULONG InformationBufferLength, + OUT PULONG BytesRead, + OUT PULONG BytesNeeded + ); + +VOID +PtReferenceAdapt( + IN PADAPT pAdapt + ); + +BOOLEAN +PtDereferenceAdapt( + IN PADAPT pAdapt + ); + +// +// There should be no DbgPrint's in the Free version of the driver. Call with doubled parentheses, e.g. DBGPRINT(("x=%d\n", x)): the inner list is pasted right after DbgPrint. 
+// +#if DBG + +#define DBGPRINT(Fmt) \ + { \ + DbgPrint("Passthru: "); \ + DbgPrint Fmt; \ + } + +#else // if DBG + +#define DBGPRINT(Fmt) + +#endif // if DBG + +#define NUM_PKTS_IN_POOL 256 + + +// +// Protocol reserved part of a sent packet that is allocated by us. 
+// +typedef struct _SEND_RSVD +{ + PNDIS_PACKET OriginalPkt; + struct mbuf* pMbuf; // IPFW extension, reference to the mbuf +} SEND_RSVD, *PSEND_RSVD; + +// +// Miniport reserved part of a received packet that is allocated by +// us. Note that this should fit into the MiniportReserved space +// in an NDIS_PACKET. +// +typedef struct _RECV_RSVD +{ + PNDIS_PACKET OriginalPkt; + struct mbuf* pMbuf; // IPFW extension, reference to the mbuf +} RECV_RSVD, *PRECV_RSVD; + +C_ASSERT(sizeof(RECV_RSVD) <= sizeof(((PNDIS_PACKET)0)->MiniportReserved)); + +// +// Event Codes related to the PassthruEvent Structure +// + +typedef enum +{ + Passthru_Invalid, + Passthru_SetPower, + Passthru_Unbind + +} PASSSTHRU_EVENT_CODE, *PPASTHRU_EVENT_CODE; + +// +// Passthru Event with a code to state why they have been state +// + +typedef struct _PASSTHRU_EVENT +{ + NDIS_EVENT Event; + PASSSTHRU_EVENT_CODE Code; + +} PASSTHRU_EVENT, *PPASSTHRU_EVENT; + + +// +// Structure used by both the miniport as well as the protocol part of the intermediate driver +// to represent an adapter and its corres. lower bindings +// +typedef struct _ADAPT +{ + struct _ADAPT * Next; + + NDIS_HANDLE BindingHandle; // To the lower miniport + NDIS_HANDLE MiniportHandle; // NDIS Handle to for miniport up-calls + NDIS_HANDLE SendPacketPoolHandle; + NDIS_HANDLE RecvPacketPoolHandle; + NDIS_STATUS Status; // Open Status + NDIS_EVENT Event; // Used by bind/halt for Open/Close Adapter synch. + NDIS_MEDIUM Medium; + NDIS_REQUEST Request; // This is used to wrap a request coming down + // to us. This exploits the fact that requests + // are serialized down to us. 
+ PULONG BytesNeeded; + PULONG BytesReadOrWritten; + BOOLEAN ReceivedIndicationFlags[32]; + + BOOLEAN OutstandingRequests; // TRUE iff a request is pending + // at the miniport below + BOOLEAN QueuedRequest; // TRUE iff a request is queued at + // this IM miniport + + BOOLEAN StandingBy; // True - When the miniport or protocol is transitioning from a D0 to Standby (>D0) State + BOOLEAN UnbindingInProcess; + NDIS_SPIN_LOCK Lock; + // False - At all other times, - Flag is cleared after a transition to D0 + + NDIS_DEVICE_POWER_STATE MPDeviceState; // Miniport's Device State + NDIS_DEVICE_POWER_STATE PTDeviceState; // Protocol's Device State + NDIS_STRING DeviceName; // For initializing the miniport edge + NDIS_EVENT MiniportInitEvent; // For blocking UnbindAdapter while + // an IM Init is in progress. + BOOLEAN MiniportInitPending; // TRUE iff IMInit in progress + NDIS_STATUS LastIndicatedStatus; // The last indicated media status + NDIS_STATUS LatestUnIndicateStatus; // The latest suppressed media status + ULONG OutstandingSends; + LONG RefCount; + BOOLEAN MiniportIsHalted; +} ADAPT, *PADAPT; + +extern NDIS_HANDLE ProtHandle, DriverHandle; +extern NDIS_MEDIUM MediumArray[4]; +extern PADAPT pAdaptList; +extern NDIS_SPIN_LOCK GlobalLock; + + +#define ADAPT_MINIPORT_HANDLE(_pAdapt) ((_pAdapt)->MiniportHandle) +#define ADAPT_DECR_PENDING_SENDS(_pAdapt) \ + { \ + NdisAcquireSpinLock(&(_pAdapt)->Lock); \ + (_pAdapt)->OutstandingSends--; \ + NdisReleaseSpinLock(&(_pAdapt)->Lock); \ + } + +// +// Custom Macros to be used by the passthru driver +// +/* +BOOLEAN +IsIMDeviceStateOn( + PADAPT + ) + +*/ +#define IsIMDeviceStateOn(_pP) ((_pP)->MPDeviceState == NdisDeviceStateD0 && (_pP)->PTDeviceState == NdisDeviceStateD0 ) + +#include "winmissing.h" + +int ipfw_module_init(void); +void ipfw_module_exit(void); +int ipfw2_qhandler_w32(PNDIS_PACKET pNdisPacket, int direction, + NDIS_HANDLE Context); +int ipfw2_qhandler_w32_oldstyle(int direction, NDIS_HANDLE ProtocolBindingContext, 
+ unsigned char* HeaderBuffer, unsigned int HeaderBufferSize, + unsigned char* LookAheadBuffer, unsigned int LookAheadBufferSize, + unsigned int PacketSize); +void CleanupReinjected(PNDIS_PACKET Packet, struct mbuf* m, PADAPT pAdapt); +void hexdump(PUCHAR,int, const char *); +void my_init(); +void my_exit(); \ No newline at end of file diff --git a/dummynet2/precomp.h b/dummynet2/precomp.h new file mode 100644 index 0000000..b2870d1 --- /dev/null +++ b/dummynet2/precomp.h @@ -0,0 +1,11 @@ +#pragma warning(disable:4214) // bit field types other than int + +#pragma warning(disable:4201) // nameless struct/union +#pragma warning(disable:4115) // named type definition in parentheses +#pragma warning(disable:4127) // conditional expression is constant +#pragma warning(disable:4054) // cast of function pointer to PVOID +#pragma warning(disable:4244) // conversion from 'int' to 'BOOLEAN', possible loss of data + +#include +#include "passthru.h" + diff --git a/dummynet2/protocol.c b/dummynet2/protocol.c new file mode 100644 index 0000000..9db4c36 --- /dev/null +++ b/dummynet2/protocol.c @@ -0,0 +1,1670 @@ +/*++ + +Copyright(c) 1992-2000 Microsoft Corporation + +Module Name: + + protocol.c + +Abstract: + + Ndis Intermediate Miniport driver sample. This is a passthru driver. + +Author: + +Environment: + + +Revision History: + + +--*/ + + +#include "precomp.h" +#pragma hdrstop + +#define MAX_PACKET_POOL_SIZE 0x0000FFFF +#define MIN_PACKET_POOL_SIZE 0x000000FF + +// +// NDIS version as 0xMMMMmmmm, where M=Major/m=minor (0x00050001 = 5.1); +// initially unknown (0) +// +ULONG NdisDotSysVersion = 0x0; + + +#define NDIS_SYS_VERSION_51 0x00050001 + + +VOID +PtBindAdapter( + OUT PNDIS_STATUS Status, + IN NDIS_HANDLE BindContext, + IN PNDIS_STRING DeviceName, + IN PVOID SystemSpecific1, + IN PVOID SystemSpecific2 + ) +/*++ + +Routine Description: + + Called by NDIS to bind to a miniport below. + +Arguments: + + Status - Return status of bind here. 
+ BindContext - Can be passed to NdisCompleteBindAdapter if this call is pended. + DeviceName - Device name to bind to. This is passed to NdisOpenAdapter. + SystemSpecific1 - Can be passed to NdisOpenProtocolConfiguration to read per-binding information + SystemSpecific2 - Unused + +Return Value: + + NDIS_STATUS_PENDING if this call is pended. In this case call NdisCompleteBindAdapter + to complete. + Anything else Completes this call synchronously + +--*/ +{ + NDIS_HANDLE ConfigHandle = NULL; + PNDIS_CONFIGURATION_PARAMETER Param; + NDIS_STRING DeviceStr = NDIS_STRING_CONST("UpperBindings"); + NDIS_STRING NdisVersionStr = NDIS_STRING_CONST("NdisVersion"); + PADAPT pAdapt = NULL; + NDIS_STATUS Sts; + UINT MediumIndex; + ULONG TotalSize; + BOOLEAN NoCleanUpNeeded = FALSE; + + + UNREFERENCED_PARAMETER(BindContext); + UNREFERENCED_PARAMETER(SystemSpecific2); + + DBGPRINT(("==> Protocol BindAdapter\n")); + + do + { + // + // Access the configuration section for our binding-specific + // parameters. + // + NdisOpenProtocolConfiguration(Status, + &ConfigHandle, + SystemSpecific1); + + if (*Status != NDIS_STATUS_SUCCESS) + { + break; + } + if (NdisDotSysVersion == 0) + { + NdisReadConfiguration(Status, + &Param, + ConfigHandle, + &NdisVersionStr, // "NdisVersion" + NdisParameterInteger); + if (*Status != NDIS_STATUS_SUCCESS) + { + break; + } + + NdisDotSysVersion = Param->ParameterData.IntegerData; + } + + + // + // Read the "UpperBindings" reserved key that contains a list + // of device names representing our miniport instances corresponding + // to this lower binding. Since this is a 1:1 IM driver, this key + // contains exactly one name. + // + // If we want to implement a N:1 mux driver (N adapter instances + // over a single lower binding), then UpperBindings will be a + // MULTI_SZ containing a list of device names - we would loop through + // this list, calling NdisIMInitializeDeviceInstanceEx once for + // each name in it. 
+ // + NdisReadConfiguration(Status, + &Param, + ConfigHandle, + &DeviceStr, + NdisParameterString); + if (*Status != NDIS_STATUS_SUCCESS) + { + break; + } + + // + // Allocate memory for the Adapter structure. This represents both the + // protocol context as well as the adapter structure when the miniport + // is initialized. + // + // In addition to the base structure, allocate space for the device + // instance string. + // + TotalSize = sizeof(ADAPT) + Param->ParameterData.StringData.MaximumLength; + + NdisAllocateMemoryWithTag(&pAdapt, TotalSize, TAG); + + if (pAdapt == NULL) + { + *Status = NDIS_STATUS_RESOURCES; + break; + } + + // + // Initialize the adapter structure. We copy in the IM device + // name as well, because we may need to use it in a call to + // NdisIMCancelInitializeDeviceInstance. The string returned + // by NdisReadConfiguration is active (i.e. available) only + // for the duration of this call to our BindAdapter handler. + // + NdisZeroMemory(pAdapt, TotalSize); + pAdapt->DeviceName.MaximumLength = Param->ParameterData.StringData.MaximumLength; + pAdapt->DeviceName.Length = Param->ParameterData.StringData.Length; + pAdapt->DeviceName.Buffer = (PWCHAR)((ULONG_PTR)pAdapt + sizeof(ADAPT)); + NdisMoveMemory(pAdapt->DeviceName.Buffer, + Param->ParameterData.StringData.Buffer, + Param->ParameterData.StringData.MaximumLength); + + + + NdisInitializeEvent(&pAdapt->Event); + NdisAllocateSpinLock(&pAdapt->Lock); + + // + // Allocate a packet pool for sends. We need this to pass sends down. + // We cannot use the same packet descriptor that came down to our send + // handler (see also NDIS 5.1 packet stacking). + // + NdisAllocatePacketPoolEx(Status, + &pAdapt->SendPacketPoolHandle, + MIN_PACKET_POOL_SIZE, + MAX_PACKET_POOL_SIZE - MIN_PACKET_POOL_SIZE, + sizeof(SEND_RSVD)); + + if (*Status != NDIS_STATUS_SUCCESS) + { + break; + } + + // + // Allocate a packet pool for receives. We need this to indicate receives. 
+ // Same consideration as sends (see also NDIS 5.1 packet stacking). + // + NdisAllocatePacketPoolEx(Status, + &pAdapt->RecvPacketPoolHandle, + MIN_PACKET_POOL_SIZE, + MAX_PACKET_POOL_SIZE - MIN_PACKET_POOL_SIZE, + PROTOCOL_RESERVED_SIZE_IN_PACKET); + + if (*Status != NDIS_STATUS_SUCCESS) + { + break; + } + + // + // Now open the adapter below and complete the initialization + // + NdisOpenAdapter(Status, + &Sts, + &pAdapt->BindingHandle, + &MediumIndex, + MediumArray, + sizeof(MediumArray)/sizeof(NDIS_MEDIUM), + ProtHandle, + pAdapt, + DeviceName, + 0, + NULL); + + if (*Status == NDIS_STATUS_PENDING) + { + NdisWaitEvent(&pAdapt->Event, 0); + *Status = pAdapt->Status; + } + + if (*Status != NDIS_STATUS_SUCCESS) + { + break; + } + PtReferenceAdapt(pAdapt); + +#pragma prefast(suppress: __WARNING_POTENTIAL_BUFFER_OVERFLOW, "Ndis guarantees MediumIndex to be within bounds"); + pAdapt->Medium = MediumArray[MediumIndex]; + + // + // Now ask NDIS to initialize our miniport (upper) edge. + // Set the flag below to synchronize with a possible call + // to our protocol Unbind handler that may come in before + // our miniport initialization happens. + // + pAdapt->MiniportInitPending = TRUE; + NdisInitializeEvent(&pAdapt->MiniportInitEvent); + + PtReferenceAdapt(pAdapt); + + *Status = NdisIMInitializeDeviceInstanceEx(DriverHandle, + &pAdapt->DeviceName, + pAdapt); + + if (*Status != NDIS_STATUS_SUCCESS) + { + if (pAdapt->MiniportIsHalted == TRUE) + { + NoCleanUpNeeded = TRUE; + } + + DBGPRINT(("BindAdapter: Adapt %p, IMInitializeDeviceInstance error %x\n", + pAdapt, *Status)); + + if (PtDereferenceAdapt(pAdapt)) + { + pAdapt = NULL; + } + + break; + } + + PtDereferenceAdapt(pAdapt); + + } while(FALSE); + + // + // Close the configuration handle now - see comments above with + // the call to NdisIMInitializeDeviceInstanceEx. 
+ // + if (ConfigHandle != NULL) + { + NdisCloseConfiguration(ConfigHandle); + } + + if ((*Status != NDIS_STATUS_SUCCESS) && (NoCleanUpNeeded == FALSE)) + { + if (pAdapt != NULL) + { + if (pAdapt->BindingHandle != NULL) + { + NDIS_STATUS LocalStatus; + + // + // Close the binding we opened above. + // + + NdisResetEvent(&pAdapt->Event); + + NdisCloseAdapter(&LocalStatus, pAdapt->BindingHandle); + pAdapt->BindingHandle = NULL; + + if (LocalStatus == NDIS_STATUS_PENDING) + { + NdisWaitEvent(&pAdapt->Event, 0); + LocalStatus = pAdapt->Status; + + + } + if (PtDereferenceAdapt(pAdapt)) + { + pAdapt = NULL; + } + } + } + } + + + DBGPRINT(("<== Protocol BindAdapter: pAdapt %p, Status %x\n", pAdapt, *Status)); +} + + +VOID +PtOpenAdapterComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS Status, + IN NDIS_STATUS OpenErrorStatus + ) +/*++ + +Routine Description: + + Completion routine for NdisOpenAdapter issued from within the PtBindAdapter. Simply + unblock the caller. + +Arguments: + + ProtocolBindingContext Pointer to the adapter + Status Status of the NdisOpenAdapter call + OpenErrorStatus Secondary status(ignored by us). + +Return Value: + + None + +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + + UNREFERENCED_PARAMETER(OpenErrorStatus); + + DBGPRINT(("==> PtOpenAdapterComplete: Adapt %p, Status %x\n", pAdapt, Status)); + pAdapt->Status = Status; + NdisSetEvent(&pAdapt->Event); +} + + +VOID +PtUnbindAdapter( + OUT PNDIS_STATUS Status, + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_HANDLE UnbindContext + ) +/*++ + +Routine Description: + + Called by NDIS when we are required to unbind to the adapter below. + This functions shares functionality with the miniport's HaltHandler. 
+ The code should ensure that NdisCloseAdapter and NdisFreeMemory are called + only once between the two functions + +Arguments: + + Status Placeholder for return status + ProtocolBindingContext Pointer to the adapter structure + UnbindContext Context for NdisUnbindComplete() if this pends + +Return Value: + + Status for NdisIMDeinitializeDeviceContext + +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + NDIS_STATUS LocalStatus; + + UNREFERENCED_PARAMETER(UnbindContext); + + DBGPRINT(("==> PtUnbindAdapter: Adapt %p\n", pAdapt)); + + // + // Set the flag that the miniport below is unbinding, so the request handlers will + // fail any request coming later + // + NdisAcquireSpinLock(&pAdapt->Lock); + pAdapt->UnbindingInProcess = TRUE; + if (pAdapt->QueuedRequest == TRUE) + { + pAdapt->QueuedRequest = FALSE; + NdisReleaseSpinLock(&pAdapt->Lock); + + PtRequestComplete(pAdapt, + &pAdapt->Request, + NDIS_STATUS_FAILURE ); + + } + else + { + NdisReleaseSpinLock(&pAdapt->Lock); + } +#ifndef WIN9X + // + // Check if we had called NdisIMInitializeDeviceInstanceEx and + // we are awaiting a call to MiniportInitialize. + // + if (pAdapt->MiniportInitPending == TRUE) + { + // + // Try to cancel the pending IMInit process. + // + LocalStatus = NdisIMCancelInitializeDeviceInstance( + DriverHandle, + &pAdapt->DeviceName); + + if (LocalStatus == NDIS_STATUS_SUCCESS) + { + // + // Successfully cancelled IM Initialization; our + // Miniport Initialize routine will not be called + // for this device. + // + pAdapt->MiniportInitPending = FALSE; + ASSERT(pAdapt->MiniportHandle == NULL); + } + else + { + // + // Our Miniport Initialize routine will be called + // (may be running on another thread at this time). + // Wait for it to finish. + // + NdisWaitEvent(&pAdapt->MiniportInitEvent, 0); + ASSERT(pAdapt->MiniportInitPending == FALSE); + } + + } +#endif // !WIN9X + + // + // Call NDIS to remove our device-instance. We do most of the work + // inside the HaltHandler.
+ // + // The Handle will be NULL if our miniport Halt Handler has been called or + // if the IM device was never initialized + // + + if (pAdapt->MiniportHandle != NULL) + { + *Status = NdisIMDeInitializeDeviceInstance(pAdapt->MiniportHandle); + + if (*Status != NDIS_STATUS_SUCCESS) + { + *Status = NDIS_STATUS_FAILURE; + } + } + else + { + // + // We need to do some work here. + // Close the binding below us + // and release the memory allocated. + // + + if(pAdapt->BindingHandle != NULL) + { + NdisResetEvent(&pAdapt->Event); + + NdisCloseAdapter(Status, pAdapt->BindingHandle); + + // + // Wait for it to complete + // + if(*Status == NDIS_STATUS_PENDING) + { + NdisWaitEvent(&pAdapt->Event, 0); + *Status = pAdapt->Status; + } + pAdapt->BindingHandle = NULL; + } + else + { + // + // Both Our MiniportHandle and Binding Handle should not be NULL. + // + *Status = NDIS_STATUS_FAILURE; + ASSERT(0); + } + + // + // Free the memory here, if was not released earlier(by calling the HaltHandler) + // + MPFreeAllPacketPools(pAdapt); + NdisFreeSpinLock(&pAdapt->Lock); + NdisFreeMemory(pAdapt, 0, 0); + } + + DBGPRINT(("<== PtUnbindAdapter: Adapt %p\n", pAdapt)); +} + +VOID +PtUnloadProtocol( + VOID +) +{ + NDIS_STATUS Status; + + if (ProtHandle != NULL) + { + NdisDeregisterProtocol(&Status, ProtHandle); + ProtHandle = NULL; + } + + DBGPRINT(("PtUnloadProtocol: done!\n")); +} + + + +VOID +PtCloseAdapterComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS Status + ) +/*++ + +Routine Description: + + Completion for the CloseAdapter call. + +Arguments: + + ProtocolBindingContext Pointer to the adapter structure + Status Completion status + +Return Value: + + None. 
+ +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + + DBGPRINT(("CloseAdapterComplete: Adapt %p, Status %x\n", pAdapt, Status)); + pAdapt->Status = Status; + NdisSetEvent(&pAdapt->Event); +} + + +VOID +PtResetComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS Status + ) +/*++ + +Routine Description: + + Completion for the reset. + +Arguments: + + ProtocolBindingContext Pointer to the adapter structure + Status Completion status + +Return Value: + + None. + +--*/ +{ + + UNREFERENCED_PARAMETER(ProtocolBindingContext); + UNREFERENCED_PARAMETER(Status); + // + // We never issue a reset, so we should not be here. + // + ASSERT(0); +} + + +VOID +PtRequestComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_REQUEST NdisRequest, + IN NDIS_STATUS Status + ) +/*++ + +Routine Description: + + Completion handler for the previously posted request. All OIDS + are completed by and sent to the same miniport that they were requested for. + If Oid == OID_PNP_QUERY_POWER then the data structure needs to returned with all entries = + NdisDeviceStateUnspecified + +Arguments: + + ProtocolBindingContext Pointer to the adapter structure + NdisRequest The posted request + Status Completion status + +Return Value: + + None + +--*/ +{ + PADAPT pAdapt = (PADAPT)ProtocolBindingContext; + NDIS_OID Oid = pAdapt->Request.DATA.SET_INFORMATION.Oid ; + + // + // Since our request is not outstanding anymore + // + ASSERT(pAdapt->OutstandingRequests == TRUE); + + pAdapt->OutstandingRequests = FALSE; + + // + // Complete the Set or Query, and fill in the buffer for OID_PNP_CAPABILITIES, if need be. + // + switch (NdisRequest->RequestType) + { + case NdisRequestQueryInformation: + + // + // We never pass OID_PNP_QUERY_POWER down. 
+ // + ASSERT(Oid != OID_PNP_QUERY_POWER); + + if ((Oid == OID_PNP_CAPABILITIES) && (Status == NDIS_STATUS_SUCCESS)) + { + MPQueryPNPCapabilities(pAdapt, &Status); + } + *pAdapt->BytesReadOrWritten = NdisRequest->DATA.QUERY_INFORMATION.BytesWritten; + *pAdapt->BytesNeeded = NdisRequest->DATA.QUERY_INFORMATION.BytesNeeded; + + if (((Oid == OID_GEN_MAC_OPTIONS) + && (Status == NDIS_STATUS_SUCCESS)) + && (NdisDotSysVersion >= NDIS_SYS_VERSION_51)) + { + // + // Only do this on Windows XP or greater (NDIS.SYS v 5.1); + // do not do this on Windows 2000 (NDIS.SYS v 5.0) + // + + // + // Remove the no-loopback bit from mac-options. In essence we are + // telling NDIS that we can handle loopback. We don't, but the + // interface below us does. If we do not do this, then loopback + // processing happens both below us and above us. This is wasteful + // at best and if Netmon is running, it will see multiple copies + // of loopback packets when sniffing above us. + // + // Only the lowest miniport in a stack of layered miniports should + // ever report this bit set to NDIS. + // + *(PULONG)NdisRequest->DATA.QUERY_INFORMATION.InformationBuffer &= ~NDIS_MAC_OPTION_NO_LOOPBACK; + } + + NdisMQueryInformationComplete(pAdapt->MiniportHandle, + Status); + break; + + case NdisRequestSetInformation: + + ASSERT( Oid != OID_PNP_SET_POWER); + + *pAdapt->BytesReadOrWritten = NdisRequest->DATA.SET_INFORMATION.BytesRead; + *pAdapt->BytesNeeded = NdisRequest->DATA.SET_INFORMATION.BytesNeeded; + NdisMSetInformationComplete(pAdapt->MiniportHandle, + Status); + break; + + default: + ASSERT(0); + break; + } + +} + + +VOID +PtStatus( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS GeneralStatus, + IN PVOID StatusBuffer, + IN UINT StatusBufferSize + ) +/*++ + +Routine Description: + + Status handler for the lower-edge(protocol).
+ +Arguments: + + ProtocolBindingContext Pointer to the adapter structure + GeneralStatus Status code + StatusBuffer Status buffer + StatusBufferSize Size of the status buffer + +Return Value: + + None + +--*/ +{ + PADAPT pAdapt = (PADAPT)ProtocolBindingContext; + + // + // Pass up this indication only if the upper edge miniport is initialized + // and powered on. Also ignore indications that might be sent by the lower + // miniport when it isn't at D0. + // + if ((pAdapt->MiniportHandle != NULL) && + (pAdapt->MPDeviceState == NdisDeviceStateD0) && + (pAdapt->PTDeviceState == NdisDeviceStateD0)) + { + if ((GeneralStatus == NDIS_STATUS_MEDIA_CONNECT) || + (GeneralStatus == NDIS_STATUS_MEDIA_DISCONNECT)) + { + + pAdapt->LastIndicatedStatus = GeneralStatus; + } + NdisMIndicateStatus(pAdapt->MiniportHandle, + GeneralStatus, + StatusBuffer, + StatusBufferSize); + } + // + // Save the last indicated media status + // + else + { + if ((pAdapt->MiniportHandle != NULL) && + ((GeneralStatus == NDIS_STATUS_MEDIA_CONNECT) || + (GeneralStatus == NDIS_STATUS_MEDIA_DISCONNECT))) + { + pAdapt->LatestUnIndicateStatus = GeneralStatus; + } + } + +} + + +VOID +PtStatusComplete( + IN NDIS_HANDLE ProtocolBindingContext + ) +/*++ + +Routine Description: + + +Arguments: + + +Return Value: + + +--*/ +{ + PADAPT pAdapt = (PADAPT)ProtocolBindingContext; + + // + // Pass up this indication only if the upper edge miniport is initialized + // and powered on. Also ignore indications that might be sent by the lower + // miniport when it isn't at D0. + // + if ((pAdapt->MiniportHandle != NULL) && + (pAdapt->MPDeviceState == NdisDeviceStateD0) && + (pAdapt->PTDeviceState == NdisDeviceStateD0)) + { + NdisMIndicateStatusComplete(pAdapt->MiniportHandle); + } +} + + +VOID +PtSendComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_PACKET Packet, + IN NDIS_STATUS Status + ) +/*++ + +Routine Description: + + Called by NDIS when the miniport below had completed a send. 
We should + complete the corresponding upper-edge send this represents. + +Arguments: + + ProtocolBindingContext - Points to ADAPT structure + Packet - Low level packet being completed + Status - status of send + +Return Value: + + None + +--*/ +{ + PADAPT pAdapt = (PADAPT)ProtocolBindingContext; + PNDIS_PACKET Pkt; + NDIS_HANDLE PoolHandle; + +#ifdef NDIS51 + // + // Packet stacking: + // + // Determine if the packet we are completing is the one we allocated. If so, then + // get the original packet from the reserved area and completed it and free the + // allocated packet. If this is the packet that was sent down to us, then just + // complete it + // + PoolHandle = NdisGetPoolFromPacket(Packet); + if (PoolHandle != pAdapt->SendPacketPoolHandle) + { + // + // We had passed down a packet belonging to the protocol above us. + // + // DBGPRINT(("PtSendComp: Adapt %p, Stacked Packet %p\n", pAdapt, Packet)); + + NdisMSendComplete(pAdapt->MiniportHandle, + Packet, + Status); + } + else +#endif // NDIS51 + { + PSEND_RSVD SendRsvd; + + SendRsvd = (PSEND_RSVD)(Packet->ProtocolReserved); + Pkt = SendRsvd->OriginalPkt; + +#if 1 // IPFW - new code + //DbgPrint("SendComplete: packet %p pkt %p\n", Packet, Pkt); + if (Pkt == NULL) { //this is a reinjected packet, with no 'father' + CleanupReinjected(Packet, SendRsvd->pMbuf, pAdapt); + return; + } +#endif /* IPFW */ + +#ifndef WIN9X + NdisIMCopySendCompletePerPacketInfo (Pkt, Packet); +#endif + + NdisDprFreePacket(Packet); + + NdisMSendComplete(pAdapt->MiniportHandle, + Pkt, + Status); + } + // + // Decrease the outstanding send count + // + ADAPT_DECR_PENDING_SENDS(pAdapt); +} + + +VOID +PtTransferDataComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_PACKET Packet, + IN NDIS_STATUS Status, + IN UINT BytesTransferred + ) +/*++ + +Routine Description: + + Entry point called by NDIS to indicate completion of a call by us + to NdisTransferData. + + See notes under SendComplete. 
+ +Arguments: + +Return Value: + +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + + if(pAdapt->MiniportHandle) + { + NdisMTransferDataComplete(pAdapt->MiniportHandle, + Packet, + Status, + BytesTransferred); + } +} + + +NDIS_STATUS +PtReceive( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_HANDLE MacReceiveContext, + IN PVOID HeaderBuffer, + IN UINT HeaderBufferSize, + IN PVOID LookAheadBuffer, + IN UINT LookAheadBufferSize, + IN UINT PacketSize + ) +/*++ + +Routine Description: + + Handle receive data indicated up by the miniport below. We pass + it along to the protocol above us. + + If the miniport below indicates packets, NDIS would more + likely call us at our ReceivePacket handler. However we + might be called here in certain situations even though + the miniport below has indicated a receive packet, e.g. + if the miniport had set packet status to NDIS_STATUS_RESOURCES. + +Arguments: + + + +Return Value: + + NDIS_STATUS_SUCCESS if we processed the receive successfully, + NDIS_STATUS_XXX error code if we discarded it. + +--*/ +{ + PADAPT pAdapt = (PADAPT)ProtocolBindingContext; + PNDIS_PACKET MyPacket, Packet = NULL; + NDIS_STATUS Status = NDIS_STATUS_SUCCESS; + ULONG Proc = KeGetCurrentProcessorNumber(); + + if ((!pAdapt->MiniportHandle) || (pAdapt->MPDeviceState > NdisDeviceStateD0)) + { + Status = NDIS_STATUS_FAILURE; + } + else do + { + // + // Get at the packet, if any, indicated up by the miniport below. + // + Packet = NdisGetReceivedPacket(pAdapt->BindingHandle, MacReceiveContext); + if (Packet != NULL) + { + // + // The miniport below did indicate up a packet. Use information + // from that packet to construct a new packet to indicate up. + // + +#ifdef NDIS51 + // + // NDIS 5.1 NOTE: Do not reuse the original packet in indicating + // up a receive, even if there is sufficient packet stack space. 
+ // If we had to do so, we would have had to overwrite the + // status field in the original packet to NDIS_STATUS_RESOURCES, + // and it is not allowed for protocols to overwrite this field + // in received packets. + // +#endif // NDIS51 + + // + // Get a packet off the pool and indicate that up + // + NdisDprAllocatePacket(&Status, + &MyPacket, + pAdapt->RecvPacketPoolHandle); + + if (Status == NDIS_STATUS_SUCCESS) + { + // + // Make our packet point to data from the original + // packet. NOTE: this works only because we are + // indicating a receive directly from the context of + // our receive indication. If we need to queue this + // packet and indicate it from another thread context, + // we will also have to allocate a new buffer and copy + // over the packet contents, OOB data and per-packet + // information. This is because the packet data + // is available only for the duration of this + // receive indication call. + // + NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet); + NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet); + + // + // Get the original packet (it could be the same packet as the + // one received or a different one based on the number of layered + // miniports below) and set it on the indicated packet so the OOB + // data is visible correctly at protocols above. If the IM driver + // modifies the packet in any way it should not set the new packet's + // original packet equal to the original packet of the packet that + // was indicated to it from the underlying driver, in this case, the + // IM driver should also ensure that the related per packet info should + // be copied to the new packet. + // we can set the original packet to the original packet of the packet + // indicated from the underlying driver because the driver doesn't modify + // the data content in the packet. 
+ // + NDIS_SET_ORIGINAL_PACKET(MyPacket, NDIS_GET_ORIGINAL_PACKET(Packet)); + NDIS_SET_PACKET_HEADER_SIZE(MyPacket, HeaderBufferSize); + + // + // Copy packet flags. + // + NdisGetPacketFlags(MyPacket) = NdisGetPacketFlags(Packet); + + // + // Force protocols above to make a copy if they want to hang + // on to data in this packet. This is because we are in our + // Receive handler (not ReceivePacket) and we can't return a + // ref count from here. + // + NDIS_SET_PACKET_STATUS(MyPacket, NDIS_STATUS_RESOURCES); + + // + // By setting NDIS_STATUS_RESOURCES, we also know that we can reclaim + // this packet as soon as the call to NdisMIndicateReceivePacket + // returns. + // + + if (pAdapt->MiniportHandle != NULL) + { +#if 1 /* IPFW: query the firewall */ + int ret; + ret = ipfw2_qhandler_w32(MyPacket, INCOMING, + ProtocolBindingContext); + if (ret != PASS) + return 0; //otherwise simply continue +#endif /* end of IPFW code */ + NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &MyPacket, 1); + } + + // + // Reclaim the indicated packet. Since we had set its status + // to NDIS_STATUS_RESOURCES, we are guaranteed that protocols + // above are done with it. + // + NdisDprFreePacket(MyPacket); + + break; + } + } + else + { + // + // The miniport below us uses the old-style (not packet) + // receive indication. Fall through. 
+ // + } + + // + // Fall through if the miniport below us has either not + // indicated a packet or we could not allocate one + // + pAdapt->ReceivedIndicationFlags[Proc] = TRUE; + if (pAdapt->MiniportHandle == NULL) + { + break; + } + switch (pAdapt->Medium) + { + case NdisMedium802_3: + case NdisMediumWan: + //DbgPrint("EthIndicateReceive context %p, header at %p len %u, lookahead at %p len %u, packetsize %u\n",ProtocolBindingContext,HeaderBuffer,HeaderBufferSize,LookAheadBuffer,LookAheadBufferSize,PacketSize); + //hexdump(HeaderBuffer,HeaderBufferSize+LookAheadBufferSize,"EthIndicateReceive"); + { + int ret = ipfw2_qhandler_w32_oldstyle(INCOMING, ProtocolBindingContext, HeaderBuffer, HeaderBufferSize, LookAheadBuffer, LookAheadBufferSize, PacketSize); + if (ret != PASS) + return NDIS_STATUS_SUCCESS; + } + NdisMEthIndicateReceive(pAdapt->MiniportHandle, + MacReceiveContext, + HeaderBuffer, + HeaderBufferSize, + LookAheadBuffer, + LookAheadBufferSize, + PacketSize); + break; + + case NdisMedium802_5: + NdisMTrIndicateReceive(pAdapt->MiniportHandle, + MacReceiveContext, + HeaderBuffer, + HeaderBufferSize, + LookAheadBuffer, + LookAheadBufferSize, + PacketSize); + break; + +#if FDDI + case NdisMediumFddi: + NdisMFddiIndicateReceive(pAdapt->MiniportHandle, + MacReceiveContext, + HeaderBuffer, + HeaderBufferSize, + LookAheadBuffer, + LookAheadBufferSize, + PacketSize); + break; +#endif + default: + ASSERT(FALSE); + break; + } + + } while(FALSE); + + return Status; +} + + +VOID +PtReceiveComplete( + IN NDIS_HANDLE ProtocolBindingContext + ) +/*++ + +Routine Description: + + Called by the adapter below us when it is done indicating a batch of + received packets. + +Arguments: + + ProtocolBindingContext Pointer to our adapter structure. 
+ +Return Value: + + None + +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + ULONG Proc = KeGetCurrentProcessorNumber(); + + /* Warning: this is a poor implementation of the PtReceiveComplete + * made by MS, and it's a well known (but never fixed) issue. + * Since the ProcessorNumber here can be different from the one + * that processed the PtReceive, sometimes NdisMEthIndicateReceiveComplete + * will not be called, causing poor performance in the incoming traffic. + * In our driver, PtReceive is called for IP packets ONLY by particularly + * old NIC drivers, and the poor performance can be seen even + * in traffic not handled by ipfw or dummynet. + * Fortunately, this is quite rare, all the incoming IP packets + * will arrive through PtReceivePacket, and this callback will never + * be called. For reinjected traffic, a workaround is done + * commuting the ReceivedIndicationFlag and calling + * NdisMEthIndicateReceiveComplete manually for each packet. + */ + + if (((pAdapt->MiniportHandle != NULL) + && (pAdapt->MPDeviceState == NdisDeviceStateD0)) + && (pAdapt->ReceivedIndicationFlags[Proc])) + { + switch (pAdapt->Medium) + { + case NdisMedium802_3: + case NdisMediumWan: + NdisMEthIndicateReceiveComplete(pAdapt->MiniportHandle); + break; + + case NdisMedium802_5: + NdisMTrIndicateReceiveComplete(pAdapt->MiniportHandle); + break; +#if FDDI + case NdisMediumFddi: + NdisMFddiIndicateReceiveComplete(pAdapt->MiniportHandle); + break; +#endif + default: + ASSERT(FALSE); + break; + } + } + + pAdapt->ReceivedIndicationFlags[Proc] = FALSE; +} + + +INT +PtReceivePacket( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_PACKET Packet + ) +/*++ + +Routine Description: + + ReceivePacket handler. Called by NDIS if the miniport below supports + NDIS 4.0 style receives. Re-package the buffer chain in a new packet + and indicate the new packet to protocols above us. Any context for + packets indicated up must be kept in the MiniportReserved field.
+ + NDIS 5.1 - packet stacking - if there is sufficient "stack space" in + the packet passed to us, we can use the same packet in a receive + indication. + +Arguments: + + ProtocolBindingContext - Pointer to our adapter structure. + Packet - Pointer to the packet + +Return Value: + + == 0 -> We are done with the packet + != 0 -> We will keep the packet and call NdisReturnPackets() this + many times when done. +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + NDIS_STATUS Status; + PNDIS_PACKET MyPacket; + BOOLEAN Remaining; + + // + // Drop the packet silently if the upper miniport edge isn't initialized or + // the miniport edge is in low power state + // + if ((!pAdapt->MiniportHandle) || (pAdapt->MPDeviceState > NdisDeviceStateD0)) + { + return 0; + } + +#ifdef NDIS51 + // + // Check if we can reuse the same packet for indicating up. + // See also: PtReceive(). + // + (VOID)NdisIMGetCurrentPacketStack(Packet, &Remaining); + if (0 && Remaining) + { + // + // We can reuse "Packet". Indicate it up and be done with it. + // + Status = NDIS_GET_PACKET_STATUS(Packet); + NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &Packet, 1); + return((Status != NDIS_STATUS_RESOURCES) ? 1 : 0); + } +#endif // NDIS51 + + // + // Get a packet off the pool and indicate that up + // + NdisDprAllocatePacket(&Status, + &MyPacket, + pAdapt->RecvPacketPoolHandle); + + if (Status == NDIS_STATUS_SUCCESS) + { + PRECV_RSVD RecvRsvd; + + RecvRsvd = (PRECV_RSVD)(MyPacket->MiniportReserved); + RecvRsvd->OriginalPkt = Packet; + + NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet); + NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet); + + // + // Get the original packet (it could be the same packet as the one + // received or a different one based on the number of layered miniports + // below) and set it on the indicated packet so the OOB data is visible + // correctly to protocols above us. 
+ // + NDIS_SET_ORIGINAL_PACKET(MyPacket, NDIS_GET_ORIGINAL_PACKET(Packet)); + + // + // Set Packet Flags + // + NdisGetPacketFlags(MyPacket) = NdisGetPacketFlags(Packet); + + Status = NDIS_GET_PACKET_STATUS(Packet); + + NDIS_SET_PACKET_STATUS(MyPacket, Status); + NDIS_SET_PACKET_HEADER_SIZE(MyPacket, NDIS_GET_PACKET_HEADER_SIZE(Packet)); + + if (pAdapt->MiniportHandle != NULL) + { +#if 1 /* IPFW: query the firewall */ + int ret; + ret = ipfw2_qhandler_w32(MyPacket, INCOMING, + ProtocolBindingContext); + if (ret != PASS) + return 0; //otherwise simply continue +#endif /* end of IPFW code */ + NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &MyPacket, 1); + } + + // + // Check if we had indicated up the packet with NDIS_STATUS_RESOURCES + // NOTE -- do not use NDIS_GET_PACKET_STATUS(MyPacket) for this since + // it might have changed! Use the value saved in the local variable. + // + if (Status == NDIS_STATUS_RESOURCES) + { + // + // Our ReturnPackets handler will not be called for this packet. + // We should reclaim it right here. + // + NdisDprFreePacket(MyPacket); + } + + return((Status != NDIS_STATUS_RESOURCES) ? 1 : 0); + } + else + { + // + // We are out of packets. Silently drop it. + // + return(0); + } +} + + +NDIS_STATUS +PtPNPHandler( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNET_PNP_EVENT pNetPnPEvent + ) + +/*++ +Routine Description: + + This is called by NDIS to notify us of a PNP event related to a lower + binding. Based on the event, this dispatches to other helper routines. + + NDIS 5.1: forward this event to the upper protocol(s) by calling + NdisIMNotifyPnPEvent. + +Arguments: + + ProtocolBindingContext - Pointer to our adapter structure. Can be NULL + for "global" notifications + + pNetPnPEvent - Pointer to the PNP event to be processed. + +Return Value: + + NDIS_STATUS code indicating status of event processing. 
+ +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + NDIS_STATUS Status = NDIS_STATUS_SUCCESS; + + DBGPRINT(("PtPnPHandler: Adapt %p, Event %d\n", pAdapt, pNetPnPEvent->NetEvent)); + + switch (pNetPnPEvent->NetEvent) + { + case NetEventSetPower: + Status = PtPnPNetEventSetPower(pAdapt, pNetPnPEvent); + break; + + case NetEventReconfigure: + Status = PtPnPNetEventReconfigure(pAdapt, pNetPnPEvent); + break; + + default: +#ifdef NDIS51 + // + // Pass on this notification to protocol(s) above, before + // doing anything else with it. + // + if (pAdapt && pAdapt->MiniportHandle) + { + Status = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent); + } +#else + Status = NDIS_STATUS_SUCCESS; + +#endif // NDIS51 + + break; + } + + return Status; +} + + +NDIS_STATUS +PtPnPNetEventReconfigure( + IN PADAPT pAdapt, + IN PNET_PNP_EVENT pNetPnPEvent + ) +/*++ +Routine Description: + + This routine is called from NDIS to notify our protocol edge of a + reconfiguration of parameters for either a specific binding (pAdapt + is not NULL), or global parameters if any (pAdapt is NULL). + +Arguments: + + pAdapt - Pointer to our adapter structure. + pNetPnPEvent - the reconfigure event + +Return Value: + + NDIS_STATUS_SUCCESS + +--*/ +{ + NDIS_STATUS ReconfigStatus = NDIS_STATUS_SUCCESS; + NDIS_STATUS ReturnStatus = NDIS_STATUS_SUCCESS; + + do + { + // + // Is this is a global reconfiguration notification ? + // + if (pAdapt == NULL) + { + // + // An important event that causes this notification to us is if + // one of our upper-edge miniport instances was enabled after being + // disabled earlier, e.g. from Device Manager in Win2000. Note that + // NDIS calls this because we had set up an association between our + // miniport and protocol entities by calling NdisIMAssociateMiniport. + // + // Since we would have torn down the lower binding for that miniport, + // we need NDIS' assistance to re-bind to the lower miniport. 
The + // call to NdisReEnumerateProtocolBindings does exactly that. + // + NdisReEnumerateProtocolBindings (ProtHandle); + + break; + } + +#ifdef NDIS51 + // + // Pass on this notification to protocol(s) above before doing anything + // with it. + // + if (pAdapt->MiniportHandle) + { + ReturnStatus = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent); + } +#endif // NDIS51 + + ReconfigStatus = NDIS_STATUS_SUCCESS; + + } while(FALSE); + + DBGPRINT(("<==PtPNPNetEventReconfigure: pAdapt %p\n", pAdapt)); + +#ifdef NDIS51 + // + // Overwrite status with what upper-layer protocol(s) returned. + // + ReconfigStatus = ReturnStatus; +#endif + + return ReconfigStatus; +} + + +NDIS_STATUS +PtPnPNetEventSetPower( + IN PADAPT pAdapt, + IN PNET_PNP_EVENT pNetPnPEvent + ) +/*++ +Routine Description: + + This is a notification to our protocol edge of the power state + of the lower miniport. If it is going to a low-power state, we must + wait here for all outstanding sends and requests to complete. + + NDIS 5.1: Since we use packet stacking, it is not sufficient to + check usage of our local send packet pool to detect whether or not + all outstanding sends have completed. For this, use the new API + NdisQueryPendingIOCount. + + NDIS 5.1: Use the 5.1 API NdisIMNotifyPnPEvent to pass on PnP + notifications to upper protocol(s). + +Arguments: + + pAdapt - Pointer to the adapter structure + pNetPnPEvent - The Net Pnp Event. This contains the new device state + +Return Value: + + NDIS_STATUS_SUCCESS or the status returned by upper-layer protocols. 
+ +--*/ +{ + PNDIS_DEVICE_POWER_STATE pDeviceState =(PNDIS_DEVICE_POWER_STATE)(pNetPnPEvent->Buffer); + NDIS_DEVICE_POWER_STATE PrevDeviceState = pAdapt->PTDeviceState; + NDIS_STATUS Status; + NDIS_STATUS ReturnStatus; + + ReturnStatus = NDIS_STATUS_SUCCESS; + + // + // Set the Internal Device State, this blocks all new sends or receives + // + NdisAcquireSpinLock(&pAdapt->Lock); + pAdapt->PTDeviceState = *pDeviceState; + + // + // Check if the miniport below is going to a low power state. + // + if (pAdapt->PTDeviceState > NdisDeviceStateD0) + { + // + // If the miniport below is going to standby, fail all incoming requests + // + if (PrevDeviceState == NdisDeviceStateD0) + { + pAdapt->StandingBy = TRUE; + } + + NdisReleaseSpinLock(&pAdapt->Lock); + +#ifdef NDIS51 + // + // Notify upper layer protocol(s) first. + // + if (pAdapt->MiniportHandle != NULL) + { + ReturnStatus = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent); + } +#endif // NDIS51 + + // + // Wait for outstanding sends and requests to complete. + // + while (pAdapt->OutstandingSends != 0) + { + NdisMSleep(2); + } + + while (pAdapt->OutstandingRequests == TRUE) + { + // + // sleep till outstanding requests complete + // + NdisMSleep(2); + } + + // + // If the below miniport is going to low power state, complete the queued request + // + NdisAcquireSpinLock(&pAdapt->Lock); + if (pAdapt->QueuedRequest) + { + pAdapt->QueuedRequest = FALSE; + NdisReleaseSpinLock(&pAdapt->Lock); + PtRequestComplete(pAdapt, &pAdapt->Request, NDIS_STATUS_FAILURE); + } + else + { + NdisReleaseSpinLock(&pAdapt->Lock); + } + + + ASSERT(NdisPacketPoolUsage(pAdapt->SendPacketPoolHandle) == 0); + ASSERT(pAdapt->OutstandingRequests == FALSE); + } + else + { + // + // If the physical miniport is powering up (from Low power state to D0), + // clear the flag + // + if (PrevDeviceState > NdisDeviceStateD0) + { + pAdapt->StandingBy = FALSE; + } + // + // The device below is being turned on. 
If we had a request + // pending, send it down now. + // + if (pAdapt->QueuedRequest == TRUE) + { + pAdapt->QueuedRequest = FALSE; + + pAdapt->OutstandingRequests = TRUE; + NdisReleaseSpinLock(&pAdapt->Lock); + + NdisRequest(&Status, + pAdapt->BindingHandle, + &pAdapt->Request); + + if (Status != NDIS_STATUS_PENDING) + { + PtRequestComplete(pAdapt, + &pAdapt->Request, + Status); + + } + } + else + { + NdisReleaseSpinLock(&pAdapt->Lock); + } + + +#ifdef NDIS51 + // + // Pass on this notification to protocol(s) above + // + if (pAdapt->MiniportHandle) + { + ReturnStatus = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent); + } +#endif // NDIS51 + + } + + return ReturnStatus; +} + +VOID +PtReferenceAdapt( + IN PADAPT pAdapt + ) +{ + NdisAcquireSpinLock(&pAdapt->Lock); + + ASSERT(pAdapt->RefCount >= 0); + + pAdapt->RefCount ++; + NdisReleaseSpinLock(&pAdapt->Lock); +} + + +BOOLEAN +PtDereferenceAdapt( + IN PADAPT pAdapt + ) +{ + NdisAcquireSpinLock(&pAdapt->Lock); + + ASSERT(pAdapt->RefCount > 0); + + pAdapt->RefCount--; + + if (pAdapt->RefCount == 0) + { + NdisReleaseSpinLock(&pAdapt->Lock); + + // + // Free all resources on this adapter structure. + // + MPFreeAllPacketPools (pAdapt);; + NdisFreeSpinLock(&pAdapt->Lock); + NdisFreeMemory(pAdapt, 0 , 0); + + return TRUE; + + } + else + { + NdisReleaseSpinLock(&pAdapt->Lock); + + return FALSE; + } +} + + diff --git a/dummynet2/radix.c b/dummynet2/radix.c new file mode 100644 index 0000000..4bef996 --- /dev/null +++ b/dummynet2/radix.c @@ -0,0 +1,1204 @@ +/*- + * Copyright (c) 1988, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)radix.c 8.5 (Berkeley) 5/19/95 + * $FreeBSD: head/sys/net/radix.c 200354 2009-12-10 10:34:30Z luigi $ + */ + +/* + * Routines to build and maintain radix trees for routing lookups. + */ +#include +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#include +#include "opt_mpath.h" +#ifdef RADIX_MPATH +#include +#endif +#else /* !_KERNEL */ +#include +#include +#include +#define log(x, arg...) fprintf(stderr, ## arg) +#define panic(x) fprintf(stderr, "PANIC: %s", x), exit(1) +#define min(a, b) ((a) < (b) ? 
(a) : (b) ) +#include +#endif /* !_KERNEL */ + +static int rn_walktree_from(struct radix_node_head *h, void *a, void *m, + walktree_f_t *f, void *w); +static int rn_walktree(struct radix_node_head *, walktree_f_t *, void *); +static struct radix_node + *rn_insert(void *, struct radix_node_head *, int *, + struct radix_node [2]), + *rn_newpair(void *, int, struct radix_node[2]), + *rn_search(void *, struct radix_node *), + *rn_search_m(void *, struct radix_node *, void *); + +static int max_keylen; +static struct radix_mask *rn_mkfreelist; +static struct radix_node_head *mask_rnhead; +/* + * Work area -- the following point to 3 buffers of size max_keylen, + * allocated in this order in a block of memory malloc'ed by rn_init. + * rn_zeros, rn_ones are set in rn_init and used in readonly afterwards. + * addmask_key is used in rn_addmask in rw mode and not thread-safe. + */ +static char *rn_zeros, *rn_ones, *addmask_key; + +#define MKGet(m) { \ + if (rn_mkfreelist) { \ + m = rn_mkfreelist; \ + rn_mkfreelist = (m)->rm_mklist; \ + } else \ + R_Malloc(m, struct radix_mask *, sizeof (struct radix_mask)); } + +#define MKFree(m) { (m)->rm_mklist = rn_mkfreelist; rn_mkfreelist = (m);} + +#define rn_masktop (mask_rnhead->rnh_treetop) + +static int rn_lexobetter(void *m_arg, void *n_arg); +static struct radix_mask * + rn_new_radix_mask(struct radix_node *tt, + struct radix_mask *next); +static int rn_satisfies_leaf(char *trial, struct radix_node *leaf, + int skip); + +/* + * The data structure for the keys is a radix tree with one way + * branching removed. The index rn_bit at an internal node n represents a bit + * position to be tested. The tree is arranged so that all descendants + * of a node n have keys whose bits all agree up to position rn_bit - 1. + * (We say the index of n is rn_bit.) + * + * There is at least one descendant which has a one bit at position rn_bit, + * and at least one with a zero there. + * + * A route is determined by a pair of key and mask. 
We require the + * bit-wise logical AND of the key and mask to be the key. + * We define the index of a route associated with the mask to be + * the first bit number in the mask where 0 occurs (with bit number 0 + * representing the highest order bit). + * + * We say a mask is normal if every bit is 0, past the index of the mask. + * If a node n has a descendant (k, m) with index(m) == index(n) == rn_bit, + * and m is a normal mask, then the route applies to every descendant of n. + * If the index(m) < rn_bit, this implies the trailing last few bits of k + * before bit b are all 0, (and hence consequently true of every descendant + * of n), so the route applies to all descendants of the node as well. + * + * Similar logic shows that a non-normal mask m such that + * index(m) <= index(n) could potentially apply to many children of n. + * Thus, for each non-host route, we attach its mask to a list at an internal + * node as high in the tree as we can go. + * + * The present version of the code makes use of normal routes in short- + * circuiting an explicit mask and compare operation when testing whether + * a key satisfies a normal route, and also in remembering the unique leaf + * that governs a subtree. + */ + +/* + * Most of the functions in this code assume that the key/mask arguments + * are sockaddr-like structures, where the first byte is a u_char + * indicating the size of the entire structure. + * + * To make the assumption more explicit, we use the LEN() macro to access + * this field. It is safe to pass an expression with side effects + * to LEN() as the argument is evaluated only once. + * We cast the result to int as this is the dominant usage. 
+ */ +#define LEN(x) ( (int) (*(const u_char *)(x)) ) + +/* + * XXX THIS NEEDS TO BE FIXED + * In the code, pointers to keys and masks are passed as either + * 'void *' (because callers used to pass pointers of various kinds), or + * 'caddr_t' (which is fine for pointer arithmetic, but not very + * clean when you dereference it to access data). Furthermore, caddr_t + * is really 'char *', while the natural type to operate on keys and + * masks would be 'u_char'. This mismatch requires a lot of casts and + * intermediate variables to adapt types that clutter the code. + */ + +/* + * Search a node in the tree matching the key. + */ +static struct radix_node * +rn_search(v_arg, head) + void *v_arg; + struct radix_node *head; +{ + register struct radix_node *x; + register caddr_t v; + + for (x = head, v = v_arg; x->rn_bit >= 0;) { + if (x->rn_bmask & v[x->rn_offset]) + x = x->rn_right; + else + x = x->rn_left; + } + return (x); +} + +/* + * Same as above, but with an additional mask. + * XXX note this function is used only once. 
+ */ +static struct radix_node * +rn_search_m(v_arg, head, m_arg) + struct radix_node *head; + void *v_arg, *m_arg; +{ + register struct radix_node *x; + register caddr_t v = v_arg, m = m_arg; + + for (x = head; x->rn_bit >= 0;) { + if ((x->rn_bmask & m[x->rn_offset]) && + (x->rn_bmask & v[x->rn_offset])) + x = x->rn_right; + else + x = x->rn_left; + } + return x; +} + +int +rn_refines(m_arg, n_arg) + void *m_arg, *n_arg; +{ + register caddr_t m = m_arg, n = n_arg; + register caddr_t lim, lim2 = lim = n + LEN(n); + int longer = LEN(n++) - LEN(m++); + int masks_are_equal = 1; + + if (longer > 0) + lim -= longer; + while (n < lim) { + if (*n & ~(*m)) + return 0; + if (*n++ != *m++) + masks_are_equal = 0; + } + while (n < lim2) + if (*n++) + return 0; + if (masks_are_equal && (longer < 0)) + for (lim2 = m - longer; m < lim2; ) + if (*m++) + return 1; + return (!masks_are_equal); +} + +struct radix_node * +rn_lookup(v_arg, m_arg, head) + void *v_arg, *m_arg; + struct radix_node_head *head; +{ + register struct radix_node *x; + caddr_t netmask = 0; + + if (m_arg) { + x = rn_addmask(m_arg, 1, head->rnh_treetop->rn_offset); + if (x == 0) + return (0); + netmask = x->rn_key; + } + x = rn_match(v_arg, head); + if (x && netmask) { + while (x && x->rn_mask != netmask) + x = x->rn_dupedkey; + } + return x; +} + +static int +rn_satisfies_leaf(trial, leaf, skip) + char *trial; + register struct radix_node *leaf; + int skip; +{ + register char *cp = trial, *cp2 = leaf->rn_key, *cp3 = leaf->rn_mask; + char *cplim; + int length = min(LEN(cp), LEN(cp2)); + + if (cp3 == NULL) + cp3 = rn_ones; + else + length = min(length, LEN(cp3)); + cplim = cp + length; cp3 += skip; cp2 += skip; + for (cp += skip; cp < cplim; cp++, cp2++, cp3++) + if ((*cp ^ *cp2) & *cp3) + return 0; + return 1; +} + +struct radix_node * +rn_match(v_arg, head) + void *v_arg; + struct radix_node_head *head; +{ + caddr_t v = v_arg; + register struct radix_node *t = head->rnh_treetop, *x; + register caddr_t cp = v, 
cp2; + caddr_t cplim; + struct radix_node *saved_t, *top = t; + int off = t->rn_offset, vlen = LEN(cp), matched_off; + register int test, b, rn_bit; + + /* + * Open code rn_search(v, top) to avoid overhead of extra + * subroutine call. + */ + for (; t->rn_bit >= 0; ) { + if (t->rn_bmask & cp[t->rn_offset]) + t = t->rn_right; + else + t = t->rn_left; + } + /* + * See if we match exactly as a host destination + * or at least learn how many bits match, for normal mask finesse. + * + * It doesn't hurt us to limit how many bytes to check + * to the length of the mask, since if it matches we had a genuine + * match and the leaf we have is the most specific one anyway; + * if it didn't match with a shorter length it would fail + * with a long one. This wins big for class B&C netmasks which + * are probably the most common case... + */ + if (t->rn_mask) + vlen = *(u_char *)t->rn_mask; + cp += off; cp2 = t->rn_key + off; cplim = v + vlen; + for (; cp < cplim; cp++, cp2++) + if (*cp != *cp2) + goto on1; + /* + * This extra grot is in case we are explicitly asked + * to look up the default. Ugh! + * + * Never return the root node itself, it seems to cause a + * lot of confusion. + */ + if (t->rn_flags & RNF_ROOT) + t = t->rn_dupedkey; + return t; +on1: + test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */ + for (b = 7; (test >>= 1) > 0;) + b--; + matched_off = cp - v; + b += matched_off << 3; + rn_bit = -1 - b; + /* + * If there is a host route in a duped-key chain, it will be first. + */ + if ((saved_t = t)->rn_mask == 0) + t = t->rn_dupedkey; + for (; t; t = t->rn_dupedkey) + /* + * Even if we don't match exactly as a host, + * we may match if the leaf we wound up at is + * a route to a net. 
+ */ + if (t->rn_flags & RNF_NORMAL) { + if (rn_bit <= t->rn_bit) + return t; + } else if (rn_satisfies_leaf(v, t, matched_off)) + return t; + t = saved_t; + /* start searching up the tree */ + do { + register struct radix_mask *m; + t = t->rn_parent; + m = t->rn_mklist; + /* + * If non-contiguous masks ever become important + * we can restore the masking and open coding of + * the search and satisfaction test and put the + * calculation of "off" back before the "do". + */ + while (m) { + if (m->rm_flags & RNF_NORMAL) { + if (rn_bit <= m->rm_bit) + return (m->rm_leaf); + } else { + off = min(t->rn_offset, matched_off); + x = rn_search_m(v, t, m->rm_mask); + while (x && x->rn_mask != m->rm_mask) + x = x->rn_dupedkey; + if (x && rn_satisfies_leaf(v, x, off)) + return x; + } + m = m->rm_mklist; + } + } while (t != top); + return 0; +} + +#ifdef RN_DEBUG +int rn_nodenum; +struct radix_node *rn_clist; +int rn_saveinfo; +int rn_debug = 1; +#endif + +/* + * Whenever we add a new leaf to the tree, we also add a parent node, + * so we allocate them as an array of two elements: the first one must be + * the leaf (see RNTORT() in route.c), the second one is the parent. + * This routine initializes the relevant fields of the nodes, so that + * the leaf is the left child of the parent node, and both nodes have + * (almost) all fields filled as appropriate. + * (XXX some fields are left unset, see the '#if 0' section). + * The function returns a pointer to the parent node. + */ + +static struct radix_node * +rn_newpair(v, b, nodes) + void *v; + int b; + struct radix_node nodes[2]; +{ + register struct radix_node *tt = nodes, *t = tt + 1; + t->rn_bit = b; + t->rn_bmask = 0x80 >> (b & 7); + t->rn_left = tt; + t->rn_offset = b >> 3; + +#if 0 /* XXX perhaps we should fill these fields as well. 
*/ + t->rn_parent = t->rn_right = NULL; + + tt->rn_mask = NULL; + tt->rn_dupedkey = NULL; + tt->rn_bmask = 0; +#endif + tt->rn_bit = -1; + tt->rn_key = (caddr_t)v; + tt->rn_parent = t; + tt->rn_flags = t->rn_flags = RNF_ACTIVE; + tt->rn_mklist = t->rn_mklist = 0; +#ifdef RN_DEBUG + tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++; + tt->rn_twin = t; + tt->rn_ybro = rn_clist; + rn_clist = tt; +#endif + return t; +} + +static struct radix_node * +rn_insert(v_arg, head, dupentry, nodes) + void *v_arg; + struct radix_node_head *head; + int *dupentry; + struct radix_node nodes[2]; +{ + caddr_t v = v_arg; + struct radix_node *top = head->rnh_treetop; + int head_off = top->rn_offset, vlen = LEN(v); + register struct radix_node *t = rn_search(v_arg, top); + register caddr_t cp = v + head_off; + register int b; + struct radix_node *tt; + /* + * Find first bit at which v and t->rn_key differ + */ + { + register caddr_t cp2 = t->rn_key + head_off; + register int cmp_res; + caddr_t cplim = v + vlen; + + while (cp < cplim) + if (*cp2++ != *cp++) + goto on1; + *dupentry = 1; + return t; +on1: + *dupentry = 0; + cmp_res = (cp[-1] ^ cp2[-1]) & 0xff; + for (b = (cp - v) << 3; cmp_res; b--) + cmp_res >>= 1; + } + { + register struct radix_node *p, *x = top; + cp = v; + do { + p = x; + if (cp[x->rn_offset] & x->rn_bmask) + x = x->rn_right; + else + x = x->rn_left; + } while (b > (unsigned) x->rn_bit); + /* x->rn_bit < b && x->rn_bit >= 0 */ +#ifdef RN_DEBUG + if (rn_debug) + log(LOG_DEBUG, "rn_insert: Going In:\n"), traverse(p); +#endif + t = rn_newpair(v_arg, b, nodes); + tt = t->rn_left; + if ((cp[p->rn_offset] & p->rn_bmask) == 0) + p->rn_left = t; + else + p->rn_right = t; + x->rn_parent = t; + t->rn_parent = p; /* frees x, p as temp vars below */ + if ((cp[t->rn_offset] & t->rn_bmask) == 0) { + t->rn_right = x; + } else { + t->rn_right = tt; + t->rn_left = x; + } +#ifdef RN_DEBUG + if (rn_debug) + log(LOG_DEBUG, "rn_insert: Coming Out:\n"), traverse(p); +#endif + } + return 
(tt); +} + +struct radix_node * +rn_addmask(n_arg, search, skip) + int search, skip; + void *n_arg; +{ + caddr_t netmask = (caddr_t)n_arg; + register struct radix_node *x; + register caddr_t cp, cplim; + register int b = 0, mlen, j; + int maskduplicated, m0, isnormal; + struct radix_node *saved_x; + static int last_zeroed = 0; + + if ((mlen = LEN(netmask)) > max_keylen) + mlen = max_keylen; + if (skip == 0) + skip = 1; + if (mlen <= skip) + return (mask_rnhead->rnh_nodes); + if (skip > 1) + bcopy(rn_ones + 1, addmask_key + 1, skip - 1); + if ((m0 = mlen) > skip) + bcopy(netmask + skip, addmask_key + skip, mlen - skip); + /* + * Trim trailing zeroes. + */ + for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0;) + cp--; + mlen = cp - addmask_key; + if (mlen <= skip) { + if (m0 >= last_zeroed) + last_zeroed = mlen; + return (mask_rnhead->rnh_nodes); + } + if (m0 < last_zeroed) + bzero(addmask_key + m0, last_zeroed - m0); + *addmask_key = last_zeroed = mlen; + x = rn_search(addmask_key, rn_masktop); + if (bcmp(addmask_key, x->rn_key, mlen) != 0) + x = 0; + if (x || search) + return (x); + R_Zalloc(x, struct radix_node *, max_keylen + 2 * sizeof (*x)); + if ((saved_x = x) == 0) + return (0); + netmask = cp = (caddr_t)(x + 2); + bcopy(addmask_key, cp, mlen); + x = rn_insert(cp, mask_rnhead, &maskduplicated, x); + if (maskduplicated) { + log(LOG_ERR, "rn_addmask: mask impossibly already in tree"); + Free(saved_x); + return (x); + } + /* + * Calculate index of mask, and check for normalcy. + * First find the first byte with a 0 bit, then if there are + * more bits left (remember we already trimmed the trailing 0's), + * the pattern must be one of those in normal_chars[], or we have + * a non-contiguous mask. 
+ */ + cplim = netmask + mlen; + isnormal = 1; + for (cp = netmask + skip; (cp < cplim) && *(u_char *)cp == 0xff;) + cp++; + if (cp != cplim) { + static char normal_chars[] = { + 0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; + + for (j = 0x80; (j & *cp) != 0; j >>= 1) + b++; + if (*cp != normal_chars[b] || cp != (cplim - 1)) + isnormal = 0; + } + b += (cp - netmask) << 3; + x->rn_bit = -1 - b; + if (isnormal) + x->rn_flags |= RNF_NORMAL; + return (x); +} + +static int /* XXX: arbitrary ordering for non-contiguous masks */ +rn_lexobetter(m_arg, n_arg) + void *m_arg, *n_arg; +{ + register u_char *mp = m_arg, *np = n_arg, *lim; + + if (LEN(mp) > LEN(np)) + return 1; /* not really, but need to check longer one first */ + if (LEN(mp) == LEN(np)) + for (lim = mp + LEN(mp); mp < lim;) + if (*mp++ > *np++) + return 1; + return 0; +} + +static struct radix_mask * +rn_new_radix_mask(tt, next) + register struct radix_node *tt; + register struct radix_mask *next; +{ + register struct radix_mask *m; + + MKGet(m); + if (m == 0) { + log(LOG_ERR, "Mask for route not entered\n"); + return (0); + } + bzero(m, sizeof *m); + m->rm_bit = tt->rn_bit; + m->rm_flags = tt->rn_flags; + if (tt->rn_flags & RNF_NORMAL) + m->rm_leaf = tt; + else + m->rm_mask = tt->rn_mask; + m->rm_mklist = next; + tt->rn_mklist = m; + return m; +} + +struct radix_node * +rn_addroute(v_arg, n_arg, head, treenodes) + void *v_arg, *n_arg; + struct radix_node_head *head; + struct radix_node treenodes[2]; +{ + caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg; + register struct radix_node *t, *x = 0, *tt; + struct radix_node *saved_tt, *top = head->rnh_treetop; + short b = 0, b_leaf = 0; + int keyduplicated; + caddr_t mmask; + struct radix_mask *m, **mp; + + /* + * In dealing with non-contiguous masks, there may be + * many different routes which have the same mask. 
+ * We will find it useful to have a unique pointer to + * the mask to speed avoiding duplicate references at + * nodes and possibly save time in calculating indices. + */ + if (netmask) { + if ((x = rn_addmask(netmask, 0, top->rn_offset)) == 0) + return (0); + b_leaf = x->rn_bit; + b = -1 - x->rn_bit; + netmask = x->rn_key; + } + /* + * Deal with duplicated keys: attach node to previous instance + */ + saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes); + if (keyduplicated) { + for (t = tt; tt; t = tt, tt = tt->rn_dupedkey) { +#ifdef RADIX_MPATH + /* permit multipath, if enabled for the family */ + if (rn_mpath_capable(head) && netmask == tt->rn_mask) { + /* + * go down to the end of multipaths, so that + * new entry goes into the end of rn_dupedkey + * chain. + */ + do { + t = tt; + tt = tt->rn_dupedkey; + } while (tt && t->rn_mask == tt->rn_mask); + break; + } +#endif + if (tt->rn_mask == netmask) + return (0); + if (netmask == 0 || + (tt->rn_mask && + ((b_leaf < tt->rn_bit) /* index(netmask) > node */ + || rn_refines(netmask, tt->rn_mask) + || rn_lexobetter(netmask, tt->rn_mask)))) + break; + } + /* + * If the mask is not duplicated, we wouldn't + * find it among possible duplicate key entries + * anyway, so the above test doesn't hurt. + * + * We sort the masks for a duplicated key the same way as + * in a masklist -- most specific to least specific. + * This may require the unfortunate nuisance of relocating + * the head of the list. + * + * We also reverse, or doubly link the list through the + * parent pointer. 
+ */ + if (tt == saved_tt) { + struct radix_node *xx = x; + /* link in at head of list */ + (tt = treenodes)->rn_dupedkey = t; + tt->rn_flags = t->rn_flags; + tt->rn_parent = x = t->rn_parent; + t->rn_parent = tt; /* parent */ + if (x->rn_left == t) + x->rn_left = tt; + else + x->rn_right = tt; + saved_tt = tt; x = xx; + } else { + (tt = treenodes)->rn_dupedkey = t->rn_dupedkey; + t->rn_dupedkey = tt; + tt->rn_parent = t; /* parent */ + if (tt->rn_dupedkey) /* parent */ + tt->rn_dupedkey->rn_parent = tt; /* parent */ + } +#ifdef RN_DEBUG + t=tt+1; tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++; + tt->rn_twin = t; tt->rn_ybro = rn_clist; rn_clist = tt; +#endif + tt->rn_key = (caddr_t) v; + tt->rn_bit = -1; + tt->rn_flags = RNF_ACTIVE; + } + /* + * Put mask in tree. + */ + if (netmask) { + tt->rn_mask = netmask; + tt->rn_bit = x->rn_bit; + tt->rn_flags |= x->rn_flags & RNF_NORMAL; + } + t = saved_tt->rn_parent; + if (keyduplicated) + goto on2; + b_leaf = -1 - t->rn_bit; + if (t->rn_right == saved_tt) + x = t->rn_left; + else + x = t->rn_right; + /* Promote general routes from below */ + if (x->rn_bit < 0) { + for (mp = &t->rn_mklist; x; x = x->rn_dupedkey) + if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) { + *mp = m = rn_new_radix_mask(x, 0); + if (m) + mp = &m->rm_mklist; + } + } else if (x->rn_mklist) { + /* + * Skip over masks whose index is > that of new node + */ + for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) + if (m->rm_bit >= b_leaf) + break; + t->rn_mklist = m; *mp = 0; + } +on2: + /* Add new route to highest possible ancestor's list */ + if ((netmask == 0) || (b > t->rn_bit )) + return tt; /* can't lift at all */ + b_leaf = tt->rn_bit; + do { + x = t; + t = t->rn_parent; + } while (b <= t->rn_bit && x != top); + /* + * Search through routes associated with node to + * insert new route according to index. + * Need same criteria as when sorting dupedkeys to avoid + * double loop on deletion. 
+ */ + for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) { + if (m->rm_bit < b_leaf) + continue; + if (m->rm_bit > b_leaf) + break; + if (m->rm_flags & RNF_NORMAL) { + mmask = m->rm_leaf->rn_mask; + if (tt->rn_flags & RNF_NORMAL) { +#if !defined(RADIX_MPATH) + log(LOG_ERR, + "Non-unique normal route, mask not entered\n"); +#endif + return tt; + } + } else + mmask = m->rm_mask; + if (mmask == netmask) { + m->rm_refs++; + tt->rn_mklist = m; + return tt; + } + if (rn_refines(netmask, mmask) + || rn_lexobetter(netmask, mmask)) + break; + } + *mp = rn_new_radix_mask(tt, *mp); + return tt; +} + +struct radix_node * +rn_delete(v_arg, netmask_arg, head) + void *v_arg, *netmask_arg; + struct radix_node_head *head; +{ + register struct radix_node *t, *p, *x, *tt; + struct radix_mask *m, *saved_m, **mp; + struct radix_node *dupedkey, *saved_tt, *top; + caddr_t v, netmask; + int b, head_off, vlen; + + v = v_arg; + netmask = netmask_arg; + x = head->rnh_treetop; + tt = rn_search(v, x); + head_off = x->rn_offset; + vlen = LEN(v); + saved_tt = tt; + top = x; + if (tt == 0 || + bcmp(v + head_off, tt->rn_key + head_off, vlen - head_off)) + return (0); + /* + * Delete our route from mask lists. 
+ */ + if (netmask) { + if ((x = rn_addmask(netmask, 1, head_off)) == 0) + return (0); + netmask = x->rn_key; + while (tt->rn_mask != netmask) + if ((tt = tt->rn_dupedkey) == 0) + return (0); + } + if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0) + goto on1; + if (tt->rn_flags & RNF_NORMAL) { + if (m->rm_leaf != tt || m->rm_refs > 0) { + log(LOG_ERR, "rn_delete: inconsistent annotation\n"); + return 0; /* dangling ref could cause disaster */ + } + } else { + if (m->rm_mask != tt->rn_mask) { + log(LOG_ERR, "rn_delete: inconsistent annotation\n"); + goto on1; + } + if (--m->rm_refs >= 0) + goto on1; + } + b = -1 - tt->rn_bit; + t = saved_tt->rn_parent; + if (b > t->rn_bit) + goto on1; /* Wasn't lifted at all */ + do { + x = t; + t = t->rn_parent; + } while (b <= t->rn_bit && x != top); + for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) + if (m == saved_m) { + *mp = m->rm_mklist; + MKFree(m); + break; + } + if (m == 0) { + log(LOG_ERR, "rn_delete: couldn't find our annotation\n"); + if (tt->rn_flags & RNF_NORMAL) + return (0); /* Dangling ref to us */ + } +on1: + /* + * Eliminate us from tree + */ + if (tt->rn_flags & RNF_ROOT) + return (0); +#ifdef RN_DEBUG + /* Get us out of the creation list */ + for (t = rn_clist; t && t->rn_ybro != tt; t = t->rn_ybro) {} + if (t) t->rn_ybro = tt->rn_ybro; +#endif + t = tt->rn_parent; + dupedkey = saved_tt->rn_dupedkey; + if (dupedkey) { + /* + * Here, tt is the deletion target and + * saved_tt is the head of the dupekey chain. 
+ */ + if (tt == saved_tt) { + /* remove from head of chain */ + x = dupedkey; x->rn_parent = t; + if (t->rn_left == tt) + t->rn_left = x; + else + t->rn_right = x; + } else { + /* find node in front of tt on the chain */ + for (x = p = saved_tt; p && p->rn_dupedkey != tt;) + p = p->rn_dupedkey; + if (p) { + p->rn_dupedkey = tt->rn_dupedkey; + if (tt->rn_dupedkey) /* parent */ + tt->rn_dupedkey->rn_parent = p; + /* parent */ + } else log(LOG_ERR, "rn_delete: couldn't find us\n"); + } + t = tt + 1; + if (t->rn_flags & RNF_ACTIVE) { +#ifndef RN_DEBUG + *++x = *t; + p = t->rn_parent; +#else + b = t->rn_info; + *++x = *t; + t->rn_info = b; + p = t->rn_parent; +#endif + if (p->rn_left == t) + p->rn_left = x; + else + p->rn_right = x; + x->rn_left->rn_parent = x; + x->rn_right->rn_parent = x; + } + goto out; + } + if (t->rn_left == tt) + x = t->rn_right; + else + x = t->rn_left; + p = t->rn_parent; + if (p->rn_right == t) + p->rn_right = x; + else + p->rn_left = x; + x->rn_parent = p; + /* + * Demote routes attached to us. + */ + if (t->rn_mklist) { + if (x->rn_bit >= 0) { + for (mp = &x->rn_mklist; (m = *mp);) + mp = &m->rm_mklist; + *mp = t->rn_mklist; + } else { + /* If there are any key,mask pairs in a sibling + duped-key chain, some subset will appear sorted + in the same order attached to our mklist */ + for (m = t->rn_mklist; m && x; x = x->rn_dupedkey) + if (m == x->rn_mklist) { + struct radix_mask *mm = m->rm_mklist; + x->rn_mklist = 0; + if (--(m->rm_refs) < 0) + MKFree(m); + m = mm; + } + if (m) + log(LOG_ERR, + "rn_delete: Orphaned Mask %p at %p\n", + m, x); + } + } + /* + * We may be holding an active internal node in the tree. 
+ */ + x = tt + 1; + if (t != x) { +#ifndef RN_DEBUG + *t = *x; +#else + b = t->rn_info; + *t = *x; + t->rn_info = b; +#endif + t->rn_left->rn_parent = t; + t->rn_right->rn_parent = t; + p = x->rn_parent; + if (p->rn_left == x) + p->rn_left = t; + else + p->rn_right = t; + } +out: + tt->rn_flags &= ~RNF_ACTIVE; + tt[1].rn_flags &= ~RNF_ACTIVE; + return (tt); +} + +/* + * This is the same as rn_walktree() except for the parameters and the + * exit. + */ +static int +rn_walktree_from(h, a, m, f, w) + struct radix_node_head *h; + void *a, *m; + walktree_f_t *f; + void *w; +{ + int error; + struct radix_node *base, *next; + u_char *xa = (u_char *)a; + u_char *xm = (u_char *)m; + register struct radix_node *rn, *last = 0 /* shut up gcc */; + int stopping = 0; + int lastb; + + /* + * rn_search_m is sort-of-open-coded here. We cannot use the + * function because we need to keep track of the last node seen. + */ + /* printf("about to search\n"); */ + for (rn = h->rnh_treetop; rn->rn_bit >= 0; ) { + last = rn; + /* printf("rn_bit %d, rn_bmask %x, xm[rn_offset] %x\n", + rn->rn_bit, rn->rn_bmask, xm[rn->rn_offset]); */ + if (!(rn->rn_bmask & xm[rn->rn_offset])) { + break; + } + if (rn->rn_bmask & xa[rn->rn_offset]) { + rn = rn->rn_right; + } else { + rn = rn->rn_left; + } + } + /* printf("done searching\n"); */ + + /* + * Two cases: either we stepped off the end of our mask, + * in which case last == rn, or we reached a leaf, in which + * case we want to start from the last node we looked at. + * Either way, last is the node we want to start from. + */ + rn = last; + lastb = rn->rn_bit; + + /* printf("rn %p, lastb %d\n", rn, lastb);*/ + + /* + * This gets complicated because we may delete the node + * while applying the function f to it, so we need to calculate + * the successor node in advance. 
+ */ + while (rn->rn_bit >= 0) + rn = rn->rn_left; + + while (!stopping) { + /* printf("node %p (%d)\n", rn, rn->rn_bit); */ + base = rn; + /* If at right child go back up, otherwise, go right */ + while (rn->rn_parent->rn_right == rn + && !(rn->rn_flags & RNF_ROOT)) { + rn = rn->rn_parent; + + /* if went up beyond last, stop */ + if (rn->rn_bit <= lastb) { + stopping = 1; + /* printf("up too far\n"); */ + /* + * XXX we should jump to the 'Process leaves' + * part, because the values of 'rn' and 'next' + * we compute will not be used. Not a big deal + * because this loop will terminate, but it is + * inefficient and hard to understand! + */ + } + } + + /* + * At the top of the tree, no need to traverse the right + * half, prevent the traversal of the entire tree in the + * case of default route. + */ + if (rn->rn_parent->rn_flags & RNF_ROOT) + stopping = 1; + + /* Find the next *leaf* since next node might vanish, too */ + for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;) + rn = rn->rn_left; + next = rn; + /* Process leaves */ + while ((rn = base) != 0) { + base = rn->rn_dupedkey; + /* printf("leaf %p\n", rn); */ + if (!(rn->rn_flags & RNF_ROOT) + && (error = (*f)(rn, w))) + return (error); + } + rn = next; + + if (rn->rn_flags & RNF_ROOT) { + /* printf("root, stopping"); */ + stopping = 1; + } + + } + return 0; +} + +static int +rn_walktree(h, f, w) + struct radix_node_head *h; + walktree_f_t *f; + void *w; +{ + int error; + struct radix_node *base, *next; + register struct radix_node *rn = h->rnh_treetop; + /* + * This gets complicated because we may delete the node + * while applying the function f to it, so we need to calculate + * the successor node in advance. 
+ */ + + /* First time through node, go left */ + while (rn->rn_bit >= 0) + rn = rn->rn_left; + for (;;) { + base = rn; + /* If at right child go back up, otherwise, go right */ + while (rn->rn_parent->rn_right == rn + && (rn->rn_flags & RNF_ROOT) == 0) + rn = rn->rn_parent; + /* Find the next *leaf* since next node might vanish, too */ + for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;) + rn = rn->rn_left; + next = rn; + /* Process leaves */ + while ((rn = base)) { + base = rn->rn_dupedkey; + if (!(rn->rn_flags & RNF_ROOT) + && (error = (*f)(rn, w))) + return (error); + } + rn = next; + if (rn->rn_flags & RNF_ROOT) + return (0); + } + /* NOTREACHED */ +} + +/* + * Allocate and initialize an empty tree. This has 3 nodes, which are + * part of the radix_node_head (in the order <left,root,right>) and are + * marked RNF_ROOT so they cannot be freed. + * The leaves have all-zero and all-one keys, with significant + * bits starting at 'off'. + * Return 1 on success, 0 on error. + */ +int +rn_inithead(head, off) + void **head; + int off; +{ + register struct radix_node_head *rnh; + register struct radix_node *t, *tt, *ttt; + if (*head) + return (1); + R_Zalloc(rnh, struct radix_node_head *, sizeof (*rnh)); + if (rnh == 0) + return (0); +#ifdef _KERNEL + RADIX_NODE_HEAD_LOCK_INIT(rnh); +#endif + *head = rnh; + t = rn_newpair(rn_zeros, off, rnh->rnh_nodes); + ttt = rnh->rnh_nodes + 2; + t->rn_right = ttt; + t->rn_parent = t; + tt = t->rn_left; /* ...
which in turn is rnh->rnh_nodes */ + tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE; + tt->rn_bit = -1 - off; + *ttt = *tt; + ttt->rn_key = rn_ones; + rnh->rnh_addaddr = rn_addroute; + rnh->rnh_deladdr = rn_delete; + rnh->rnh_matchaddr = rn_match; + rnh->rnh_lookup = rn_lookup; + rnh->rnh_walktree = rn_walktree; + rnh->rnh_walktree_from = rn_walktree_from; + rnh->rnh_treetop = t; + return (1); +} + +int +rn_detachhead(void **head) +{ + struct radix_node_head *rnh; + + KASSERT((head != NULL && *head != NULL), + ("%s: head already freed", __func__)); + rnh = *head; + + /* Free nodes. */ + Free(rnh); + + *head = NULL; + return (1); +} + +void +rn_init(int maxk) +{ + char *cp, *cplim; + + max_keylen = maxk; + if (max_keylen == 0) { + log(LOG_ERR, + "rn_init: radix functions require max_keylen be set\n"); + return; + } + R_Malloc(rn_zeros, char *, 3 * max_keylen); + if (rn_zeros == NULL) + panic("rn_init"); + bzero(rn_zeros, 3 * max_keylen); + rn_ones = cp = rn_zeros + max_keylen; + addmask_key = cplim = rn_ones + max_keylen; + while (cp < cplim) + *cp++ = -1; + if (rn_inithead((void **)(void *)&mask_rnhead, 0) == 0) + panic("rn_init 2"); +} diff --git a/dummynet2/winmissing.h b/dummynet2/winmissing.h new file mode 100644 index 0000000..7c1e928 --- /dev/null +++ b/dummynet2/winmissing.h @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2010 Francesco Magno, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: winmissing.h 5563 2010-02-26 16:25:23Z svn_magno $ + * definitions and other things needed to build freebsd kernel + * modules in Windows (with the MSVC compiler) + */ + +#ifndef _WINMISSING_H_ +#define _WINMISSING_H_ + +#include +#include +#include +#include +#include +#include + +typedef UCHAR u_char; +typedef UCHAR u_int8_t; +typedef UCHAR uint8_t; +typedef USHORT u_short; +typedef USHORT u_int16_t; +typedef USHORT uint16_t; +typedef USHORT n_short; +typedef UINT u_int; +typedef INT32 int32_t; +typedef UINT32 u_int32_t; +typedef UINT32 uint32_t; +typedef ULONG u_long; +typedef ULONG n_long; +typedef UINT64 uint64_t; +typedef UINT64 u_int64_t; +typedef INT64 int64_t; + +typedef UINT32 in_addr_t; +typedef UCHAR sa_family_t; +typedef USHORT in_port_t; +typedef UINT32 __gid_t; +typedef UINT32 gid_t; +typedef UINT32 __uid_t; +typedef UINT32 uid_t; +typedef ULONG n_time; +typedef char* caddr_t; + +/* linux_lookup uses __be32 and __be16 in the prototype */ +typedef uint32_t __be32; /* XXX __u32 __bitwise __be32 */ +typedef uint16_t __be16; /* XXX */ + +//*** DEBUG STUFF *** +#define printf DbgPrint +#define log(lev, ...) 
DbgPrint(__VA_ARGS__) +const char* texify_cmd(int i); +const char* texify_proto(unsigned int p); +//*** end DEBUG STUFF *** + +#define snprintf _snprintf +#define timespec timeval +struct timeval { + long tv_sec; + long tv_usec; +}; + +struct in_addr { + in_addr_t s_addr; +}; + +struct sockaddr_in { + uint8_t sin_len; + sa_family_t sin_family; + in_port_t sin_port; + struct in_addr sin_addr; + char sin_zero[8]; +}; + +/* XXX watch out, windows names are actually longer */ +#define IFNAMSIZ 16 +#define IF_NAMESIZE 16 + +#define ETHER_ADDR_LEN 6 + +/* we do not include the windows headers for in6_addr so + * we need to provide our own definition for the kernel. + */ +struct in6_addr { + union { + uint8_t __u6_addr8[16]; + uint16_t __u6_addr16[8]; + uint32_t __u6_addr32[4]; + } __u6_addr; /* 128-bit IP6 address */ +}; + +#define htons(x) RtlUshortByteSwap(x) +#define ntohs(x) RtlUshortByteSwap(x) +#define htonl(x) RtlUlongByteSwap(x) +#define ntohl(x) RtlUlongByteSwap(x) + +#define ENOSPC 28 /* No space left on device */ +#define EOPNOTSUPP 45 /* Operation not supported */ +#define EACCES 13 /* Permission denied */ +#define ENOENT 2 /* No such file or directory */ +#define EINVAL 22 /* Invalid argument */ +#define EPROTONOSUPPORT 43 /* Protocol not supported */ +#define ENOMEM 12 /* Cannot allocate memory */ +#define EEXIST 17 /* File exists */ +#define ESRCH 3 +#define ENOBUFS 55 /* No buffer space available */ +#define EBUSY 16 /* Module busy */ + + +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#define __unused +#define __packed +#define __aligned(x); +#define __user +#define __init +#define __exit +#define __func__ __FUNCTION__ +#define inline __inline + +struct sockaddr_in6 { + int dummy; +}; + +//SPINLOCKS +#define DEFINE_SPINLOCK(x) NDIS_SPIN_LOCK x +#define mtx_init(m,a,b,c) NdisAllocateSpinLock(m) +#define mtx_lock(_l) NdisAcquireSpinLock(_l) +#define mtx_unlock(_l) NdisReleaseSpinLock(_l) +#define mtx_destroy(m) NdisFreeSpinLock(m) +#define 
mtx_assert(a, b) + +#define rw_rlock(_l) NdisAcquireSpinLock(_l) +#define rw_runlock(_l) NdisReleaseSpinLock(_l) +#define rw_assert(a, b) +#define rw_wlock(_l) NdisAcquireSpinLock(_l) +#define rw_wunlock(_l) NdisReleaseSpinLock(_l) +#define rw_destroy(_l) NdisFreeSpinLock(_l) +#define rw_init(_l, msg) NdisAllocateSpinLock(_l) +#define rw_init_flags(_l, s, v) NdisAllocateSpinLock(_l) + +#define rwlock_t NDIS_SPIN_LOCK +#define spinlock_t NDIS_SPIN_LOCK + +#define s6_addr __u6_addr.__u6_addr8 + + +struct icmphdr { + u_char icmp_type; /* type of message, see below */ + u_char icmp_code; /* type sub code */ + u_short icmp_cksum; /* ones complement cksum of struct */ +}; + +#define ICMP_ECHO 8 /* echo service */ + +#define IPOPT_OPTVAL 0 /* option ID */ +#define IPOPT_OLEN 1 /* option length */ +#define IPOPT_EOL 0 /* end of option list */ +#define IPOPT_NOP 1 /* no operation */ +#define IPOPT_LSRR 131 /* loose source route */ +#define IPOPT_SSRR 137 /* strict source route */ +#define IPOPT_RR 7 /* record packet route */ +#define IPOPT_TS 68 /* timestamp */ + +#define IPPROTO_ICMP 1 /* control message protocol */ +#define IPPROTO_TCP 6 /* tcp */ +#define IPPROTO_UDP 17 /* user datagram protocol */ +#define IPPROTO_ICMPV6 58 /* ICMP6 */ +#define IPPROTO_SCTP 132 /* SCTP */ +#define IPPROTO_HOPOPTS 0 /* IP6 hop-by-hop options */ +#define IPPROTO_ROUTING 43 /* IP6 routing header */ +#define IPPROTO_FRAGMENT 44 /* IP6 fragmentation header */ +#define IPPROTO_DSTOPTS 60 /* IP6 destination option */ +#define IPPROTO_AH 51 /* IP6 Auth Header */ +#define IPPROTO_ESP 50 /* IP6 Encap Sec. Payload */ +#define IPPROTO_NONE 59 /* IP6 no next header */ +#define IPPROTO_PIM 103 /* Protocol Independent Mcast */ + +#define IPPROTO_IPV6 41 +#define IPPROTO_IPV4 4 /* IPv4 encapsulation */ + + +#define INADDR_ANY (uint32_t)0x00000000 + +#define AF_INET 2 /* internetwork: UDP, TCP, etc. 
*/ +#define AF_LINK 18 /* Link layer interface */ + +#define IN_CLASSD(i) (((uint32_t)(i) & 0xf0000000) == 0xe0000000) +#define IN_MULTICAST(i) IN_CLASSD(i) + +#define DROP 0 +#define PASS 1 +#define DUMMYNET 2 +#define INCOMING 0 +#define OUTGOING 1 + +size_t strlcpy(char *dst, const char *src, size_t siz); +void do_gettimeofday(struct timeval *tv); +int ffs(int bits); +int time_uptime_w32(); + +#endif /* _WINMISSING_H_ */ diff --git a/glue.h b/glue.h new file mode 100644 index 0000000..622ca4b --- /dev/null +++ b/glue.h @@ -0,0 +1,580 @@ +/* + * Copyright (c) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ +/* + * $Id: glue.h 11277 2012-06-10 17:44:15Z marta $ + * + * glue code to adapt the FreeBSD version to linux and windows, + * userland and kernel. + * This is included before any other headers, so we do not have + * a chance to override any #define that should appear in other + * headers. + * First handle headers for userland and kernel. Then common code + * (including headers that require a specific order of inclusion), + * then the user- and kernel- specific parts. + */ + +#ifndef _GLUE_H +#define _GLUE_H + + +/* + * common definitions to allow portability + */ +#ifndef __FBSDID +#define __FBSDID(x) +#endif /* FBSDID */ + +#ifndef KERNEL_MODULE /* Userland headers */ + +#if defined(__CYGWIN32__) && !defined(_WIN32) +#define _WIN32 +#endif + +#if defined(TCC) && defined(_WIN32) +#include +#endif /* TCC */ + +#include /* linux needs it in addition to sys/types.h */ +#include /* for size_t */ +#include +#include +#include +#ifdef __linux__ +#include /* linux only 20111031 */ +#endif + +#else /* KERNEL_MODULE, kernel headers */ + +#ifdef __linux__ + +#include + +#define ifnet net_device /* remap */ +#define _KERNEL # make kernel structure visible +#define KLD_MODULE # add the module glue +#define INET # want inet support + +#include /* linux kernel */ +#include /* linux kernel */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) // or 2.4.x +#include /* linux/msg.h require this */ +#include /* just MAX_ADDR_LEN 8 on 2.4 32 on 2.6, also brings in byteorder */ +#endif + +/* on 2.6.22, msg.h requires spinlock_types.h */ +/* XXX spinlock_type.h was introduced in 2.6.14 */ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,13) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) +#include +#endif +/* XXX m_type define conflict with include/sys/mbuf.h, + * so early include msg.h (to be solved) +*/ +#include + +#include +#include /* struct in_addr */ +#include /* struct in6_addr */ +#include +/* + * LIST_HEAD in queue.h conflict with linux/list.h + * some previous linux 
include need list.h definition + */ +#undef LIST_HEAD + +#define IF_NAMESIZE (16) +typedef uint32_t in_addr_t; + +#define printf(fmt, arg...) printk(KERN_ERR fmt, ##arg) +#endif /* __linux__ */ + +#endif /* KERNEL_MODULE end of kernel headers */ + + +/* + * Part 2: common userland and kernel definitions + */ + +#ifndef ETHER_ADDR_LEN +#define ETHER_ADDR_LEN (6+0) /* length of an Ethernet address */ +#endif + +#define ICMP6_DST_UNREACH_NOROUTE 0 /* no route to destination */ +#define ICMP6_DST_UNREACH_ADMIN 1 /* administratively prohibited */ +#define ICMP6_DST_UNREACH_ADDR 3 /* address unreachable */ +#define ICMP6_DST_UNREACH_NOPORT 4 /* port unreachable */ + +/* + * linux: sysctl are mapped into /sys/module/ipfw_mod parameters + * windows: they are emulated via get/setsockopt + */ +#define CTLFLAG_RD 1 +#define CTLFLAG_RDTUN 1 +#define CTLFLAG_RW 2 +#define CTLFLAG_SECURE3 0 // unsupported +#define CTLFLAG_VNET 0 /* unsupported */ + +/* if needed, queue.h must be included here after list.h */ + +/* + * struct thread is used in linux and windows kernel. + * In windows, we need to emulate the sockopt interface + * so also the userland needs to have the struct sockopt defined. + * In order to achieve 64 bit compatibility, padding has been inserted. + */ +struct thread { + void *sopt_td; + void *td_ucred; +}; + +enum sopt_dir { SOPT_GET, SOPT_SET }; + +struct sockopt { + enum sopt_dir sopt_dir; /* is this a get or a set? */ + int sopt_level; /* second arg of [gs]etsockopt */ + int sopt_name; /* third arg of [gs]etsockopt */ +#ifdef _X64EMU + void* pad1; + void* pad2; +#endif + void *sopt_val; /* fourth arg of [gs]etsockopt */ + size_t sopt_valsize; /* (almost) fifth arg of [gs]etsockopt */ +#ifdef _X64EMU + void* pad3; + void* pad4; +#endif + struct thread *sopt_td; /* calling thread or null if kernel */ +}; + + +#define INET_ADDRSTRLEN (16) /* missing in netinet/in.h */ + +/* + * List of values used for set/getsockopt options. 
+ * The base value on FreeBSD is defined as a macro, + * if not available we will use our own enum. + * The TABLE_BASE value is used in the kernel. + */ +#ifndef IP_FW_TABLE_ADD +#define _IPFW_SOCKOPT_BASE 100 /* 40 on freebsd */ +enum ipfw_msg_type { + IP_FW_TABLE_ADD = _IPFW_SOCKOPT_BASE, + IP_FW_TABLE_DEL, + IP_FW_TABLE_FLUSH, + IP_FW_TABLE_GETSIZE, + IP_FW_TABLE_LIST, + IP_FW_DYN_GET, /* new addition */ + + /* IP_FW3 and IP_DUMMYNET3 are the new API */ + IP_FW3 = _IPFW_SOCKOPT_BASE + 8, + IP_DUMMYNET3, + + IP_FW_ADD = _IPFW_SOCKOPT_BASE + 10, + IP_FW_DEL, + IP_FW_FLUSH, + IP_FW_ZERO, + IP_FW_GET, + IP_FW_RESETLOG, + + IP_FW_NAT_CFG, + IP_FW_NAT_DEL, + IP_FW_NAT_GET_CONFIG, + IP_FW_NAT_GET_LOG, + + IP_DUMMYNET_CONFIGURE, + IP_DUMMYNET_DEL , + IP_DUMMYNET_FLUSH, + /* 63 is missing */ + IP_DUMMYNET_GET = _IPFW_SOCKOPT_BASE + 24, + _IPFW_SOCKOPT_END +}; +#endif /* IP_FW_TABLE_ADD */ + +/* + * Part 3: userland stuff + */ + +#ifndef KERNEL_MODULE + +/* + * internal names in struct in6_addr (netinet/in6.h) differ, + * so we remap the FreeBSD names to the platform-specific ones. + */ +#ifndef _WIN32 +#define __u6_addr in6_u +#define __u6_addr32 u6_addr32 +#define in6_u __in6_u /* missing type for ipv6 (linux 2.6.28) */ +#else /* _WIN32 uses different naming */ +#define __u6_addr __u6 +#define __u6_addr32 __s6_addr32 +#endif /* _WIN32 */ + +/* missing in linux netinet/ip.h */ +#define IPTOS_ECN_ECT0 0x02 /* ECN-capable transport (0) */ +#define IPTOS_ECN_CE 0x03 /* congestion experienced */ + +/* defined in freebsd netinet/icmp6.h */ +#define ICMP6_MAXTYPE 201 + +/* on freebsd sys/socket.h pf specific */ +#define NET_RT_IFLIST 3 /* survey interface list */ + +#if defined(__linux__) || defined(__CYGWIN32__) +/* on freebsd net/if.h XXX used */ +struct if_data { + /* ... */ + u_long ifi_mtu; /* maximum transmission unit */ +}; + +/* + * Message format for use in obtaining information about interfaces + * from getkerninfo and the routing socket. 
+ * This is used in nat.c + */ +struct if_msghdr { + u_short ifm_msglen; /* to skip over unknown messages */ + u_char ifm_version; /* future binary compatibility */ + u_char ifm_type; /* message type */ + int ifm_addrs; /* like rtm_addrs */ + int ifm_flags; /* value of if_flags */ + u_short ifm_index; /* index for associated ifp */ + struct if_data ifm_data;/* stats and other ifdata */ +}; + +/* + * Message format for use in obtaining information about interface + * addresses from getkerninfo and the routing socket + */ +struct ifa_msghdr { + u_short ifam_msglen; /* to skip over unknown messages */ + u_char ifam_version; /* future binary compatibility */ + u_char ifam_type; /* message type */ + int ifam_addrs; /* like rtm_addrs */ + int ifam_flags; /* value of ifa_flags */ + u_short ifam_index; /* index for associated ifp */ + int ifam_metric; /* value of ifa_metric */ +}; + +#ifndef NO_RTM /* conflicting with netlink */ +/* missing in net/route.h */ +#define RTM_VERSION 5 /* Up the ante and ignore older versions */ +#define RTM_IFINFO 0xe /* iface going up/down etc. */ +#define RTM_NEWADDR 0xc /* address being added to iface */ +#define RTA_IFA 0x20 /* interface addr sockaddr present */ +#endif /* NO_RTM */ + +/* SA_SIZE is used in the userland nat.c modified */ +#define SA_SIZE(sa) \ + ( (!(sa) ) ? \ + sizeof(long) : \ + 1 + ( (sizeof(struct sockaddr) - 1) | (sizeof(long) - 1) ) ) + +/* sys/time.h */ +/* + * Getkerninfo clock information structure + */ +struct clockinfo { + int hz; /* clock frequency */ + int tick; /* micro-seconds per hz tick */ + int spare; + int stathz; /* statistics clock frequency */ + int profhz; /* profiling clock frequency */ +}; + +/* no sin_len in sockaddr, we only remap in userland */ +#define sin_len sin_zero[0] + +#endif /* Linux/Win */ + +/* + * linux does not have a reentrant version of qsort, + * so we use the FreeBSD stdlib version.
+ */ +void qsort_r(void *a, size_t n, size_t es, void *thunk, + int cmp_t(void *, const void *, const void *)); + +/* prototypes from libutil */ +/* humanize_number(3) */ +#define HN_DECIMAL 0x01 +#define HN_NOSPACE 0x02 +#define HN_B 0x04 +#define HN_DIVISOR_1000 0x08 + +#define HN_GETSCALE 0x10 +#define HN_AUTOSCALE 0x20 + +int humanize_number(char *_buf, size_t _len, int64_t _number, + const char *_suffix, int _scale, int _flags); +int expand_number(const char *_buf, int64_t *_num); + +#define setprogname(x) /* not present in linux */ + +extern int optreset; /* not present in linux */ + +size_t strlcpy(char * dst, const char * src, size_t siz); +long long int strtonum(const char *nptr, long long minval, + long long maxval, const char **errstr); + +int sysctlbyname(const char *name, void *oldp, size_t *oldlenp, + void *newp, size_t newlen); + + +#else /* KERNEL_MODULE */ + +/* + * Part 4: kernel stuff + */ + +/* linux and windows kernel do not have bcopy ? */ +#define bcopy(_s, _d, _l) memcpy(_d, _s, _l) +/* definitions useful for the kernel side */ +struct route_in6 { + int dummy; +}; + +#ifdef __linux__ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) // or 2.4.x +#include +#endif + +/* skb_dst() and skb_dst_set() were introduced in linux 2.6.31 */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31) +void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst); +struct dst_entry *skb_dst(const struct sk_buff *skb); +#endif + +/* The struct flowi changed */ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,38) // check boundaries +#define flow_daddr fl.u.ip4 +#else +#define flow_daddr fl.nl_u.ip4_u +#endif + +#endif /* __linux__ */ + +/* + * Do not load prio_heap.h header because of conflicting names + * with our heap functions defined in include/netinet/ipfw/dn_heap.h + */ +#define _LINUX_PRIO_HEAP_H +/* + * The following define prevent the ipv6.h header to be loaded.
+ * Starting from the 2.6.38 kernel the ipv6.h file, which is included + * by include/net/inetpeer.h in turn included by net/route.h + * include the system tcp.h file while we want to include + * our include/net/tcp.h instead. + */ +#ifndef _NET_IPV6_H +#define _NET_IPV6_H +static inline void ipv6_addr_copy(struct in6_addr *a1, const struct in6_addr *a2) +{ + memcpy(a1, a2, sizeof(struct in6_addr)); +} +#endif /* _NET_IPV6_H */ + +#endif /* KERNEL_MODULE */ + +/* + * Part 5: windows specific stuff + */ + +#ifdef _WIN32 +#ifndef KERNEL_MODULE +#define CTL_CODE( DeviceType, Function, Method, Access ) ( \ + ((DeviceType) << 16) | ((Access) << 14) | ((Function) << 2) | (Method) \ +) + +#define METHOD_BUFFERED 0 +#define METHOD_IN_DIRECT 1 +#define METHOD_OUT_DIRECT 2 +#define METHOD_NEITHER 3 +#define FILE_ANY_ACCESS 0 +#define FILE_READ_DATA ( 0x0001 ) // file & pipe +#define FILE_WRITE_DATA ( 0x0002 ) // file & pipe +#endif /* !KERNEL_MODULE */ + +#define FILE_DEVICE_IPFW 0x00654324 +#define IP_FW_BASE_CTL 0x840 +#define IP_FW_SETSOCKOPT \ + CTL_CODE(FILE_DEVICE_IPFW, IP_FW_BASE_CTL + 1, METHOD_BUFFERED, FILE_WRITE_DATA) +#define IP_FW_GETSOCKOPT \ + CTL_CODE(FILE_DEVICE_IPFW, IP_FW_BASE_CTL + 2, METHOD_BUFFERED, FILE_ANY_ACCESS) + +/********************************* +* missing declarations in altq.c * +**********************************/ + +#define _IOWR(x,y,t) _IOW(x,y,t) + +/********************************** +* missing declarations in ipfw2.c * +***********************************/ + +#define ICMP_UNREACH_NET 0 /* bad net */ +#define ICMP_UNREACH_HOST 1 /* bad host */ +#define ICMP_UNREACH_PROTOCOL 2 /* bad protocol */ +#define ICMP_UNREACH_PORT 3 /* bad port */ +#define ICMP_UNREACH_NEEDFRAG 4 /* IP_DF caused drop */ +#define ICMP_UNREACH_SRCFAIL 5 /* src route failed */ +#define ICMP_UNREACH_NET_UNKNOWN 6 /* unknown net */ +#define ICMP_UNREACH_HOST_UNKNOWN 7 /* unknown host */ +#define ICMP_UNREACH_ISOLATED 8 /* src host isolated */ +#define 
ICMP_UNREACH_NET_PROHIB 9 /* prohibited access */ +#define ICMP_UNREACH_HOST_PROHIB 10 /* ditto */ +#define ICMP_UNREACH_TOSNET 11 /* bad tos for net */ +#define ICMP_UNREACH_TOSHOST 12 /* bad tos for host */ +#define ICMP_UNREACH_FILTER_PROHIB 13 /* admin prohib */ +#define ICMP_UNREACH_HOST_PRECEDENCE 14 /* host prec vio. */ +#define ICMP_UNREACH_PRECEDENCE_CUTOFF 15 /* prec cutoff */ + +#define __unused + + +struct ether_addr; +struct ether_addr * ether_aton(const char *a); + +/********************************* +* missing declarations in ipv6.c * +**********************************/ + +struct hostent* gethostbyname2(const char *name, int af); + + +/******************** +* windows wrappings * +*********************/ + +int my_socket(int domain, int ty, int proto); +#define socket(_a, _b, _c) my_socket(_a, _b, _c) + +#endif /* _WIN32 */ +/******************* +* SYSCTL emulation * +********************/ +#if defined (_WIN32) || defined (EMULATE_SYSCTL) +#define STRINGIFY(x) #x + +/* flag is set with the last 2 bits for access, as defined in glue.h + * and the rest for type + */ +enum { + SYSCTLTYPE_INT = 0, + SYSCTLTYPE_UINT, + SYSCTLTYPE_SHORT, + SYSCTLTYPE_USHORT, + SYSCTLTYPE_LONG, + SYSCTLTYPE_ULONG, + SYSCTLTYPE_STRING, +}; + +struct sysctlhead { + uint32_t blocklen; //total size of the entry + uint32_t namelen; //strlen(name) + '\0' + uint32_t flags; //type and access + uint32_t datalen; +}; + +#ifdef _KERNEL + +#ifdef SYSCTL_NODE +#undef SYSCTL_NODE +#endif +#define SYSCTL_NODE(a,b,c,d,e,f) +#define SYSCTL_DECL(a) +#define SYSCTL_VNET_PROC(a,b,c,d,e,f,g,h,i) + +#define GST_HARD_LIMIT 100 + +/* In the module, GST is implemented as an array of + * sysctlentry, but while passing data to the userland + * pointers are useless, the buffer is actually made of: + * - sysctlhead (fixed size, containing lengths) + * - data (typically 32 bit) + * - name (zero-terminated and padded to mod4) + */ + +struct sysctlentry { + struct sysctlhead head; + char* name; + void* 
data; +}; + +struct sysctltable { + int count; //number of valid tables + int totalsize; //total size of valid entries of al the valid tables + void* namebuffer; //a buffer for all chained names + struct sysctlentry entry[GST_HARD_LIMIT]; +}; + +#ifdef SYSBEGIN +#undef SYSBEGIN +#endif +#define SYSBEGIN(x) void sysctl_addgroup_##x() { +#ifdef SYSEND +#undef SYSEND +#endif +#define SYSEND } + +/* XXX remove duplication */ +#define SYSCTL_INT(a,b,c,d,e,f,g) \ + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ + (d) | (SYSCTLTYPE_INT << 2), sizeof(*e), e) + +#define SYSCTL_VNET_INT(a,b,c,d,e,f,g) \ + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ + (d) | (SYSCTLTYPE_INT << 2), sizeof(*e), e) + +#define SYSCTL_UINT(a,b,c,d,e,f,g) \ + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ + (d) | (SYSCTLTYPE_UINT << 2), sizeof(*e), e) + +#define SYSCTL_LONG(a,b,c,d,e,f,g) \ + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ + (d) | (SYSCTLTYPE_LONG << 2), sizeof(*e), e) + +#define SYSCTL_ULONG(a,b,c,d,e,f,g) \ + sysctl_pushback(STRINGIFY(a) "." 
STRINGIFY(c) + 1, \ + (d) | (SYSCTLTYPE_ULONG << 2), sizeof(*e), e) +#define TUNABLE_INT(a,b) + +void keinit_GST(void); +void keexit_GST(void); +int kesysctl_emu_set(void* p, int l); +int kesysctl_emu_get(struct sockopt* sopt); +void sysctl_pushback(char* name, int flags, int datalen, void* data); + +#endif /* _KERNEL */ + +int sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp, + size_t newlen); +#endif /* _WIN32" || EMULATE_SYSCTL */ +#ifdef _WIN32 +int do_cmd(int optname, void *optval, uintptr_t optlen); + +#endif /* _WIN32 */ + +#endif /* !_GLUE_H */ diff --git a/ipfw/Makefile b/ipfw/Makefile new file mode 100644 index 0000000..4800b4a --- /dev/null +++ b/ipfw/Makefile @@ -0,0 +1,120 @@ +# +# $Id: Makefile 11277 2012-06-10 17:44:15Z marta $ +# +# GNUMakefile to build the userland part of ipfw on Linux and Windows +# +# enable extra debugging information +# Do not set with = or := so we can inherit from the caller +XOSARCH := $(shell uname) +OSARCH ?= $(XOSARCH) +OSARCH := $(shell uname) +OSARCH := $(findstring $(OSARCH),FreeBSD Linux Darwin) +ifeq ($(OSARCH),) + OSARCH := Windows +endif + +$(warning Building userland ipfw for $(VER) $(OSARCH)) + +#TCC=c:/tesi/tcc + +# common flags +EXTRA_CFLAGS += -O1 +EXTRA_CFLAGS += -Wall +EXTRA_CFLAGS += -include ../glue.h +EXTRA_CFLAGS += -I ./include_e -I ./include + +TARGET := ipfw +ifneq ($(VER),openwrt) +ifeq ($(OSARCH),Linux) + EXTRA_CFLAGS += -D__BSD_VISIBLE + EXTRA_CFLAGS += -Werror + # Required by GCC 4.6 + EXTRA_CFLAGS += -Wno-unused-but-set-variable +endif +ifeq ($(OSARCH),FreeBSD) + EXTRA_CFLAGS += -D__BSD_VISIBLE + EXTRA_CFLAGS += -Werror +endif +ifeq ($(OSARCH),Darwin) + EXTRA_CFLAGS += -D__BSD_VISIBLE + EXTRA_CFLAGS += -Werror +endif +# must be Cygwin ? +ifeq ($(OSARCH),Windows) +ifeq ($(TCC),) + EXTRA_CFLAGS += -I/cygdrive/c/WinDDK/7600.16385.0/inc/ddk + EXTRA_CFLAGS += -I . 
+ EXTRA_CFLAGS += -pipe -Wall +else + # TCC points to the root of tcc tree + CC=$(TCC)/tcc.exe + EXTRA_CFLAGS += -DTCC -I.. + EXTRA_CFLAGS += -I$(TCC)/include/winapi -I$(TCC)/include + EXTRA_CFLAGS += -nostdinc + + EDIRS += arpa net netinet sys + EFILES += err.h grp.h netdb.h pwd.h sysexits.h + EFILES += arpa/inet.h + EFILES += net/if.h + EFILES += netinet/in.h netinet/in_systm.h netinet/ip.h + EFILES += netinet/ip_icmp.h + EFILES += sys/cdefs.h sys/wait.h + EFILES += sys/ioctl.h sys/socket.h + +endif + # EXTRA_CFLAGS += -D_WIN32 # see who defines it + EXTRA_CFLAGS += -Dsetsockopt=wnd_setsockopt + EXTRA_CFLAGS += -Dgetsockopt=wnd_getsockopt + EXTRA_CFLAGS += -DEMULATE_SYSCTL + EDIRS += net netinet + EFILES += net/ethernet.h net/route.h + EFILES += netinet/ether.h netinet/icmp6.h + EFILES += sys/sysctl.h + TARGET := ipfw.exe +endif +endif # !openwrt + +CFLAGS += $(EXTRA_CFLAGS) +# Location of OS headers and libraries. After our stuff. +USRDIR?= /usr +ifeq ($(TCC),) + CFLAGS += -I$(USRDIR)/include + LDFLAGS += -L$(USRDIR)/lib +else + LDFLAGS += -L. -L$(TCC)/lib -lws2_32 +endif + +OBJS = ipfw2.o dummynet.o main.o ipv6.o qsort_r.o +OBJS += expand_number.o humanize_number.o glue.o + +# we don't use ALTQ +CFLAGS += -DNO_ALTQ +#OBJS += altq.o + +all: $(TARGET) + echo "Done build for $(OSARCH) VER $(VER)" + +$(TARGET): $(OBJS) + $(CC) $(LDFLAGS) -o $@ $^ + +$(OBJS) : ipfw2.h ../glue.h include_e + +# support to create empty dirs and files in include_e/ +# EDIRS is the list of directories, EFILES is the list of files. 
+EDIRS += sys netinet +EFILES += sys/sockio.h libutil.h + +M ?= $(shell pwd) + +include_e: + echo "running in $M" + -@rm -rf $(M)/include_e opt_* + -@mkdir -p $(M)/include_e + -@(cd $(M)/include_e; mkdir -p $(EDIRS); touch $(EFILES) ) + -@(cd $(M)/include_e/netinet; \ + for i in ip_fw.h ip_dummynet.h tcp.h; do \ + cp ../../../dummynet2/include/netinet/$$i .; done; ) + +clean distclean: + -rm -f $(OBJS) $(TARGET) + -rm -rf include/netinet/ include_e diff --git a/ipfw/add_rules b/ipfw/add_rules new file mode 100755 index 0000000..f7866d7 --- /dev/null +++ b/ipfw/add_rules @@ -0,0 +1,27 @@ +#!/bin/bash +# +# A test script to add rules + +PRG=./ipfw + +myfun() { + $PRG add 10 count icmp from any to 131.114.9.128 + $PRG add 20 count icmp from 131.114.9.128 to any + $PRG add 20 count icmp from any to 131.114.9.130 + $PRG add 30 count icmp from 131.114.9.130 to any + $PRG add 40 count icmp from any to 131.114.9.129 + $PRG add 50 count icmp from 131.114.9.129 to any + $PRG add 60 count icmp from 131.114.9.236 to any + sleep 1 + $PRG del 10 + $PRG del 20 + $PRG del 20 + $PRG del 30 + $PRG del 40 + $PRG del 50 + $PRG del 60 +} + +for ((i=0;i<100;i++)) ; do + myfun +done diff --git a/ipfw/dummynet.c b/ipfw/dummynet.c new file mode 100644 index 0000000..231f52f --- /dev/null +++ b/ipfw/dummynet.c @@ -0,0 +1,1456 @@ +/* + * Copyright (c) 2002-2003,2010 Luigi Rizzo + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. 
+ * + * $FreeBSD: head/sbin/ipfw/dummynet.c 206843 2010-04-19 15:11:45Z luigi $ + * + * dummynet support + */ + +#include +#include +/* XXX there are several sysctl leftover here */ +#include + +#include "ipfw2.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include /* inet_ntoa */ + + +static struct _s_x dummynet_params[] = { + { "plr", TOK_PLR }, + { "noerror", TOK_NOERROR }, + { "buckets", TOK_BUCKETS }, + { "dst-ip", TOK_DSTIP }, + { "src-ip", TOK_SRCIP }, + { "dst-port", TOK_DSTPORT }, + { "src-port", TOK_SRCPORT }, + { "proto", TOK_PROTO }, + { "weight", TOK_WEIGHT }, + { "lmax", TOK_LMAX }, + { "maxlen", TOK_LMAX }, + { "all", TOK_ALL }, + { "mask", TOK_MASK }, /* alias for both */ + { "sched_mask", TOK_SCHED_MASK }, + { "flow_mask", TOK_FLOW_MASK }, + { "droptail", TOK_DROPTAIL }, + { "red", TOK_RED }, + { "gred", TOK_GRED }, + { "bw", TOK_BW }, + { "bandwidth", TOK_BW }, + { "delay", TOK_DELAY }, + { "link", TOK_LINK }, + { "pipe", TOK_PIPE }, + { "queue", TOK_QUEUE }, + { "flowset", TOK_FLOWSET }, + { "sched", TOK_SCHED }, + { "pri", TOK_PRI }, + { "priority", TOK_PRI }, + { "type", TOK_TYPE }, + { "flow-id", TOK_FLOWID}, + { "dst-ipv6", TOK_DSTIP6}, + { "dst-ip6", TOK_DSTIP6}, + { "src-ipv6", TOK_SRCIP6}, + { "src-ip6", TOK_SRCIP6}, + { "profile", TOK_PROFILE}, + { "burst", TOK_BURST}, + { "dummynet-params", TOK_NULL }, + { NULL, 0 } /* terminator */ +}; + +#define O_NEXT(p, len) ((void *)((char *)p + len)) + +static void +oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) +{ + oid->len = len; + oid->type = type; + oid->subtype = 0; + oid->id = id; +} + +/* make room in the buffer and move the pointer forward */ +static void * +o_next(struct dn_id **o, int len, int type) +{ + struct dn_id *ret = *o; + oid_fill(ret, len, type, 0); + *o = O_NEXT(*o, len); + return ret; +} + +/* handle variable lenght structures moving back the pointer and fixing lenght */ 
+static void * +o_compact(struct dn_id **o, int len, int real_length, int type) +{ + struct dn_id *ret = *o; + + ret = O_NEXT(*o, -len); + oid_fill(ret, real_length, type, 0); + *o = O_NEXT(ret, real_length); + return ret; +} + +#if 0 +static int +sort_q(void *arg, const void *pa, const void *pb) +{ + int rev = (co.do_sort < 0); + int field = rev ? -co.do_sort : co.do_sort; + long long res = 0; + const struct dn_flow_queue *a = pa; + const struct dn_flow_queue *b = pb; + + switch (field) { + case 1: /* pkts */ + res = a->len - b->len; + break; + case 2: /* bytes */ + res = a->len_bytes - b->len_bytes; + break; + + case 3: /* tot pkts */ + res = a->tot_pkts - b->tot_pkts; + break; + + case 4: /* tot bytes */ + res = a->tot_bytes - b->tot_bytes; + break; + } + if (res < 0) + res = -1; + if (res > 0) + res = 1; + return (int)(rev ? res : -res); +} +#endif + +/* print a mask and header for the subsequent list of flows */ +static void +print_mask(struct ipfw_flow_id *id) +{ + if (!IS_IP6_FLOW_ID(id)) { + printf(" " + "mask: %s 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n", + id->extra ? "queue," : "", + id->proto, + id->src_ip, id->src_port, + id->dst_ip, id->dst_port); + } else { + char buf[255]; + printf("\n mask: %sproto: 0x%02x, flow_id: 0x%08x, ", + id->extra ? "queue," : "", + id->proto, id->flow_id6); + inet_ntop(AF_INET6, &(id->src_ip6), buf, sizeof(buf)); + printf("%s/0x%04x -> ", buf, id->src_port); + inet_ntop(AF_INET6, &(id->dst_ip6), buf, sizeof(buf)); + printf("%s/0x%04x\n", buf, id->dst_port); + } +} + +static void +print_header(struct ipfw_flow_id *id) +{ + if (!IS_IP6_FLOW_ID(id)) + printf("BKT Prot ___Source IP/port____ " + "____Dest. IP/port____ " + "Tot_pkt/bytes Pkt/Byte Drp\n"); + else + printf("BKT ___Prot___ _flow-id_ " + "______________Source IPv6/port_______________ " + "_______________Dest. 
IPv6/port_______________ " + "Tot_pkt/bytes Pkt/Byte Drp\n"); +} + +static void +list_flow(struct dn_flow *ni, int *print) +{ + char buff[255]; + struct protoent *pe = NULL; + struct in_addr ina; + struct ipfw_flow_id *id = &ni->fid; + + if (*print) { + print_header(&ni->fid); + *print = 0; + } + pe = getprotobynumber(id->proto); + /* XXX: Should check for IPv4 flows */ + printf("%3u%c", (ni->oid.id) & 0xff, + id->extra ? '*' : ' '); + if (!IS_IP6_FLOW_ID(id)) { + if (pe) + printf("%-4s ", pe->p_name); + else + printf("%4u ", id->proto); + ina.s_addr = htonl(id->src_ip); + printf("%15s/%-5d ", + inet_ntoa(ina), id->src_port); + ina.s_addr = htonl(id->dst_ip); + printf("%15s/%-5d ", + inet_ntoa(ina), id->dst_port); + } else { + /* Print IPv6 flows */ + if (pe != NULL) + printf("%9s ", pe->p_name); + else + printf("%9u ", id->proto); + printf("%7d %39s/%-5d ", id->flow_id6, + inet_ntop(AF_INET6, &(id->src_ip6), buff, sizeof(buff)), + id->src_port); + printf(" %39s/%-5d ", + inet_ntop(AF_INET6, &(id->dst_ip6), buff, sizeof(buff)), + id->dst_port); + } + pr_u64(&ni->tot_pkts, 4); + pr_u64(&ni->tot_bytes, 8); + printf("%2u %4u %3u\n", + ni->length, ni->len_bytes, ni->drops); +} + +static void +print_flowset_parms(struct dn_fs *fs, char *prefix) +{ + int l; + char qs[30]; + char plr[30]; + char red[90]; /* Display RED parameters */ + + l = fs->qsize; + if (fs->flags & DN_QSIZE_BYTES) { + if (l >= 8192) + sprintf(qs, "%d KB", l / 1024); + else + sprintf(qs, "%d B", l); + } else + sprintf(qs, "%3d sl.", l); + if (fs->plr) + sprintf(plr, "plr %f", 1.0 * fs->plr / (double)(0x7fffffff)); + else + plr[0] = '\0'; + + if (fs->flags & DN_IS_RED) /* RED parameters */ + sprintf(red, + "\n\t %cRED w_q %f min_th %d max_th %d max_p %f", + (fs->flags & DN_IS_GENTLE_RED) ? 
'G' : ' ', + 1.0 * fs->w_q / (double)(1 << SCALE_RED), + fs->min_th, + fs->max_th, + 1.0 * fs->max_p / (double)(1 << SCALE_RED)); + else + sprintf(red, "droptail"); + + if (prefix[0]) { + printf("%s %s%s %d queues (%d buckets) %s\n", + prefix, qs, plr, fs->oid.id, fs->buckets, red); + prefix[0] = '\0'; + } else { + printf("q%05d %s%s %d flows (%d buckets) sched %d " + "weight %d lmax %d pri %d %s\n", + fs->fs_nr, qs, plr, fs->oid.id, fs->buckets, + fs->sched_nr, fs->par[0], fs->par[1], fs->par[2], red); + if (fs->flags & DN_HAVE_MASK) + print_mask(&fs->flow_mask); + } +} + +static void +print_extra_delay_parms(struct dn_profile *p) +{ + double loss; + if (p->samples_no <= 0) + return; + + loss = p->loss_level; + loss /= p->samples_no; + printf("\t profile: name \"%s\" loss %f samples %d\n", + p->name, loss, p->samples_no); +} + +static void +flush_buf(char *buf) +{ + if (buf[0]) + printf("%s\n", buf); + buf[0] = '\0'; +} + +/* + * generic list routine. We expect objects in a specific order, i.e. + * PIPES AND SCHEDULERS: + * link; scheduler; internal flowset if any; instances + * we can tell a pipe from the number. 
+ * + * FLOWSETS: + * flowset; queues; + * link i (int queue); scheduler i; si(i) { flowsets() : queues } + */ +static void +list_pipes(struct dn_id *oid, struct dn_id *end) +{ + char buf[160]; /* pending buffer */ + int toPrint = 1; /* print header */ + + buf[0] = '\0'; + for (; oid != end; oid = O_NEXT(oid, oid->len)) { + if (oid->len < sizeof(*oid)) + errx(1, "invalid oid len %d\n", oid->len); + + switch (oid->type) { + default: + flush_buf(buf); + printf("unrecognized object %d size %d\n", oid->type, oid->len); + break; + case DN_TEXT: /* list of attached flowsets */ + { + int i, l; + struct { + struct dn_id id; + uint32_t p[0]; + } *d = (void *)oid; + l = (oid->len - sizeof(*oid))/sizeof(d->p[0]); + if (l == 0) + break; + printf(" Children flowsets: "); + for (i = 0; i < l; i++) + printf("%u ", d->p[i]); + printf("\n"); + break; + } + case DN_CMD_GET: + if (co.verbose) + printf("answer for cmd %d, len %d\n", oid->type, oid->id); + break; + case DN_SCH: { + struct dn_sch *s = (struct dn_sch *)oid; + flush_buf(buf); + printf(" sched %d type %s flags 0x%x %d buckets %d active\n", + s->sched_nr, + s->name, s->flags, s->buckets, s->oid.id); + if (s->flags & DN_HAVE_MASK) + print_mask(&s->sched_mask); + } + break; + + case DN_FLOW: + list_flow((struct dn_flow *)oid, &toPrint); + break; + + case DN_LINK: { + struct dn_link *p = (struct dn_link *)oid; + double b = p->bandwidth; + char bwbuf[30]; + char burst[5 + 7]; + + /* This starts a new object so flush buffer */ + flush_buf(buf); + /* data rate */ + if (b == 0) + sprintf(bwbuf, "unlimited "); + else if (b >= 1000000) + sprintf(bwbuf, "%7.3f Mbit/s", b/1000000); + else if (b >= 1000) + sprintf(bwbuf, "%7.3f Kbit/s", b/1000); + else + sprintf(bwbuf, "%7.3f bit/s ", b); + + if (humanize_number(burst, sizeof(burst), p->burst, + "", HN_AUTOSCALE, 0) < 0 || co.verbose) + sprintf(burst, "%d", (int)p->burst); + sprintf(buf, "%05d: %s %4d ms burst %s", + p->link_nr % DN_MAX_ID, bwbuf, p->delay, burst); + } + break; + + 
case DN_FS: + print_flowset_parms((struct dn_fs *)oid, buf); + break; + case DN_PROFILE: + flush_buf(buf); + print_extra_delay_parms((struct dn_profile *)oid); + } + flush_buf(buf); // XXX does it really go here ? + } +} + +/* + * Delete pipe, queue or scheduler i + */ +int +ipfw_delete_pipe(int do_pipe, int i) +{ + struct { + struct dn_id oid; + uintptr_t a[1]; /* add more if we want a list */ + } cmd; + oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); + cmd.oid.subtype = (do_pipe == 1) ? DN_LINK : + ( (do_pipe == 2) ? DN_FS : DN_SCH); + cmd.a[0] = i; + i = do_cmd(IP_DUMMYNET3, &cmd, cmd.oid.len); + if (i) { + i = 1; + warn("rule %u: setsockopt(IP_DUMMYNET_DEL)", i); + } + return i; +} + +/* + * Code to parse delay profiles. + * + * Some link types introduce extra delays in the transmission + * of a packet, e.g. because of MAC level framing, contention on + * the use of the channel, MAC level retransmissions and so on. + * From our point of view, the channel is effectively unavailable + * for this extra time, which is constant or variable depending + * on the link type. Additionally, packets may be dropped after this + * time (e.g. on a wireless link after too many retransmissions). + * We can model the additional delay with an empirical curve + * that represents its distribution. + * + * cumulative probability + * 1.0 ^ + * | + * L +-- loss-level x + * | ****** + * | * + * | ***** + * | * + * | ** + * | * + * +-------*-------------------> + * delay + * + * The empirical curve may have both vertical and horizontal lines. + * Vertical lines represent constant delay for a range of + * probabilities; horizontal lines correspond to a discontinuty + * in the delay distribution: the link will use the largest delay + * for a given probability. + * + * To pass the curve to dummynet, we must store the parameters + * in a file as described below, and issue the command + * + * ipfw pipe config ... bw XXX profile ... 
/*
 * Delay profile file format.
 *
 * The file format is the following, with whitespace acting as
 * a separator and '#' indicating the beginning of a comment:
 *
 * samples N
 *	the number of samples used in the internal
 *	representation (2..1024; default 100);
 *
 * loss-level L
 *	The probability above which packets are lost.
 *	(0.0 <= L <= 1.0, default 1.0 i.e. no loss);
 *
 * name identifier
 *	Optional name (listed by "ipfw pipe show")
 *	to identify the distribution;
 *
 * "delay prob" | "prob delay"
 *	One of these two lines is mandatory and defines
 *	the format of the following lines with data points.
 *
 * XXX YYY
 *	2 or more lines representing points in the curve,
 *	with either delay or probability first, according
 *	to the chosen format.
 *	The unit for delay is milliseconds.
 *
 * Data points do not need to be ordered or equal to the number
 * specified in the "samples" line. ipfw will sort and interpolate
 * the curve as needed.
 *
 * Example of a profile file:

    name    bla_bla_bla
    samples 100
    loss-level    0.86
    prob    delay
    0       200     # minimum overhead is 200ms
    0.5     200
    0.5     300
    0.8     1000
    0.9     1300
    1       1300

 * Internally, we will convert the curve to a fixed number of
 * samples, and when it is time to transmit a packet we will
 * model the extra delay as extra bits in the packet.
 *
 */

/* parenthesised so the macro is safe inside larger expressions */
#define ED_MAX_LINE_LEN		(256 + ED_MAX_NAME_LEN)
#define ED_TOK_SAMPLES		"samples"
#define ED_TOK_LOSS		"loss-level"
#define ED_TOK_NAME		"name"
#define ED_TOK_DELAY		"delay"
#define ED_TOK_PROB		"prob"
#define ED_TOK_BW		"bw"
#define ED_SEPARATORS		" \t\n"
#define ED_MIN_SAMPLES_NO	2

/*
 * Returns 1 if s is a non-negative decimal number: one or more digits
 * with at most one '.' among them ("12", "0.5", ".5", "5.").
 * Returns 0 for NULL, for the empty string, for a string without any
 * digit (e.g. "."), and for anything containing a sign, an exponent
 * or any other character.
 */
static int
is_valid_number(const char *s)
{
	int digits = 0, dots_found = 0;

	if (s == NULL)
		return 0;
	for (; *s != '\0'; s++) {
		/* the cast keeps isdigit() defined for bytes >= 0x80 */
		if (isdigit((unsigned char)*s))
			digits++;
		else if (*s != '.' || ++dots_found > 1)
			return 0;
	}
	return digits > 0;
}

/*
 * Take as input a string describing a bandwidth value
 * and return the numeric bandwidth value.
 */
/*
 * Parse a bandwidth specification and set the clocking interface or
 * the bandwidth value.
 *
 * arg:       the text to parse.  If it starts with a lowercase letter
 *            it is taken as an interface name, otherwise as a number
 *            with an optional K/k (x1000) or M/m (x1000000) multiplier
 *            and an optional unit.  Values given in bytes are
 *            converted to bits.
 * bandwidth: in/out.  A value other than -1 on entry means the token
 *            was already seen and triggers a warning.  Set to 0 when
 *            an interface name is given, to the value in bit/s
 *            otherwise.
 * if_name:   buffer receiving the interface name, or NULL when
 *            interface clocking is not supported by the caller (an
 *            interface name is then a fatal error).  Cleared when a
 *            numeric bandwidth is given.
 * namelen:   size of if_name in bytes.
 *
 * Exits through errx() on unsupported interface names and on values
 * that do not fit the int bandwidth field.
 */
static void
read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen)
{
	/* largest value accepted for *bandwidth; assumes a 32-bit int */
	const unsigned long long bw_limit = 0x7fffffffULL;

	if (*bandwidth != -1)
		warnx("duplicate token, override bandwidth value!");

	if (arg[0] >= 'a' && arg[0] <= 'z') {
		/* interface name */
		if (if_name == NULL || namelen <= 0)
			errx(1, "no if support");
		/* warn only when the name really does not fit */
		if (strlen(arg) >= (size_t)namelen)
			warnx("interface name truncated");
		snprintf(if_name, (size_t)namelen, "%s", arg);
		*bandwidth = 0;
	} else {	/* read bandwidth value */
		unsigned long long bw;
		char *end = NULL;

		bw = strtoull(arg, &end, 0);
		/*
		 * Reject oversized (and negative, which wrap) inputs
		 * before scaling, so the multiplications below cannot
		 * overflow.
		 */
		if (bw > bw_limit)
			errx(EX_DATAERR, "bandwidth too large");
		if (*end == 'K' || *end == 'k') {
			end++;
			bw *= 1000;
		} else if (*end == 'M' || *end == 'm') {
			end++;
			bw *= 1000000;
		}
		/* a 'B' that is not "Bit/s", or "bytes": convert to bits */
		if ((*end == 'B' &&
		    _substrcmp2(end, "Bi", "Bit/s") != 0) ||
		    _substrcmp2(end, "by", "bytes") == 0)
			bw *= 8;

		if (bw > bw_limit)
			errx(EX_DATAERR, "bandwidth too large");

		*bandwidth = (int)bw;
		if (if_name)
			if_name[0] = '\0';
	}
}

/* one user-supplied point of the delay distribution curve */
struct point {
	double prob;	/* cumulative probability */
	double delay;	/* delay; the profile file gives it in ms */
};

/*
 * qsort() comparator for struct point: order by probability first,
 * then by delay.  Returns -1, 0 or 1.
 */
static int
compare_points(const void *vp1, const void *vp2)
{
	const struct point *p1 = vp1;
	const struct point *p2 = vp2;
	double res = 0;

	res = p1->prob - p2->prob;
	if (res == 0)
		res = p1->delay - p2->delay;
	if (res < 0)
		return -1;
	else if (res > 0)
		return 1;
	else
		return 0;
}

/*
 * Build the argument list for errx(): exit code, a format prefixed
 * with the file name and line number, and those two values.  Expects
 * 'filename' and 'lineno' to be in scope at the point of use.  Note
 * that the message is stringified, so its quotes appear in the output.
 */
#define ED_EFMT(s) EX_DATAERR,"error in %s at line %d: "#s,filename,lineno

/*
 * Interpolate a set of probability-value tuples.
 *
 * This function takes as input a tuple of values <prob, delay>
 * and samples the interpolated curve described from the tuples.
 *
 * The user defined points are stored in the points structure.
 * The number of points is stored in points_no.
 * The user defined sampling value is stored in samples_no.
 * The resulting samples are in the "samples" pointer.
 *
 * We assume that the last point for the '1' value of the
 * probability should be defined. (XXX add checks for this)
 *
 * The input data are points and points_no.
 */
+ * The output data are s (the array of s_no samples) + * and s_no (the number of samples) + * + */ +static void +interpolate_samples(struct point *p, int points_no, + int *samples, int samples_no, const char *filename) +{ + double dy; /* delta on the y axis */ + double y; /* current value of y */ + double x; /* current value of x */ + double m; /* the y slope */ + int i; /* samples index */ + int curr; /* points current index */ + + /* make sure that there are enough points. */ + /* XXX Duplicated should be removed */ + if (points_no < 3) + errx(EX_DATAERR, "%s too few samples, need at least %d", + filename, 3); + + qsort(p, points_no, sizeof(struct point), compare_points); + + dy = 1.0/samples_no; + y = 0; + + for (i=0, curr = 0; i < samples_no; i++, y+=dy) { + /* This statment move the curr pointer to the next point + * skipping the points with the same x value. We are + * guaranteed to exit from the loop because the + * last possible value of y is stricly less than 1 + * and the last possible value of the y points is 1 */ + while ( y >= p[curr+1].prob ) curr++; + + /* compute the slope of the curve */ + m = (p[curr+1].delay - p[curr].delay) / (p[curr+1].prob - p[curr].prob); + /* compute the x value starting from the current point */ + x = p[curr].delay + (y - p[curr].prob) * m; + samples[i] = x; + } + + /* add the last sample */ + samples[i] = p[curr+1].delay; +} + +/* + * p is the link (old pipe) + * pf is the profile + */ +static void +load_extra_delays(const char *filename, struct dn_profile *p, + struct dn_link *link) +{ + char line[ED_MAX_LINE_LEN]; + FILE *f; + int lineno = 0; + + int samples = -1; + double loss = -1.0; + char profile_name[ED_MAX_NAME_LEN]; + int delay_first = -1; + int do_points = 0; + struct point points[ED_MAX_SAMPLES_NO]; + int points_no = 0; + + /* XXX link never NULL? 
*/ + p->link_nr = link->link_nr; + + profile_name[0] = '\0'; + + f = fopen(filename, "r"); + if (f == NULL) + err(EX_UNAVAILABLE, "fopen: %s", filename); + + while (fgets(line, ED_MAX_LINE_LEN, f)) { /* read commands */ + char *s, *cur = line, *name = NULL, *arg = NULL; + + ++lineno; + + /* parse the line */ + while (cur) { + s = strsep(&cur, ED_SEPARATORS); + if (s == NULL || *s == '#') + break; + if (*s == '\0') + continue; + if (arg) + errx(ED_EFMT("too many arguments")); + if (name == NULL) + name = s; + else + arg = s; + } + + if ((name == NULL) || (*name == '#')) /* empty line */ + continue; + + if (!strcasecmp(name, ED_TOK_SAMPLES)) { + if (samples > 0) + errx(ED_EFMT("duplicate ``samples'' line")); + if (atoi(arg) <=0) + errx(ED_EFMT("invalid number of samples")); + samples = atoi(arg); + if (samples>=ED_MAX_SAMPLES_NO-1) + errx(ED_EFMT("too many samples, maximum is %d"), + ED_MAX_SAMPLES_NO-1); + do_points = 0; + } else if (!strcasecmp(name, ED_TOK_BW)) { + char buf[IFNAMSIZ]; + read_bandwidth(arg, &link->bandwidth, buf, sizeof(buf)); + p->bandwidth = link->bandwidth; + } else if (!strcasecmp(name, ED_TOK_LOSS)) { + if (loss != -1.0) + errx(ED_EFMT("duplicated token: %s"), name); + if (!is_valid_number(arg)) + errx(ED_EFMT("invalid %s"), arg); + loss = atof(arg); + if (loss > 1) + errx(ED_EFMT("%s greater than 1.0"), name); + do_points = 0; + } else if (!strcasecmp(name, ED_TOK_NAME)) { + if (profile_name[0] != '\0') + errx(ED_EFMT("duplicated token: %s"), name); + strncpy(profile_name, arg, sizeof(profile_name) - 1); + profile_name[sizeof(profile_name)-1] = '\0'; + do_points = 0; + } else if (!strcasecmp(name, ED_TOK_DELAY)) { + if (do_points) + errx(ED_EFMT("duplicated token: %s"), name); + delay_first = 1; + do_points = 1; + } else if (!strcasecmp(name, ED_TOK_PROB)) { + if (do_points) + errx(ED_EFMT("duplicated token: %s"), name); + delay_first = 0; + do_points = 1; + } else if (do_points) { + if (!is_valid_number(name) || !is_valid_number(arg)) + 
errx(ED_EFMT("invalid point found")); + if (delay_first) { + points[points_no].delay = atof(name); + points[points_no].prob = atof(arg); + } else { + points[points_no].delay = atof(arg); + points[points_no].prob = atof(name); + } + if (points[points_no].prob > 1.0) + errx(ED_EFMT("probability greater than 1.0")); + ++points_no; + } else { + errx(ED_EFMT("unrecognised command '%s'"), name); + } + } + + fclose (f); + + if (samples == -1) { + warnx("'%s' not found, assuming 100", ED_TOK_SAMPLES); + samples = 100; + } + + if (loss == -1.0) { + warnx("'%s' not found, assuming no loss", ED_TOK_LOSS); + loss = 1; + } + + interpolate_samples(points, points_no, p->samples, samples, filename); + + p->samples_no = samples++; + p->loss_level = loss * samples; + strncpy(p->name, profile_name, sizeof(p->name)); +} + +/* + * configuration of pipes, schedulers, flowsets. + * When we configure a new scheduler, an empty pipe is created, so: + * + * do_pipe = 1 -> "pipe N config ..." only for backward compatibility + * sched N+Delta type fifo sched_mask ... + * pipe N+Delta + * flowset N+Delta pipe N+Delta (no parameters) + * sched N type wf2q+ sched_mask ... + * pipe N + * + * do_pipe = 2 -> flowset N config + * flowset N parameters + * + * do_pipe = 3 -> sched N config + * sched N parameters (default no pipe) + * optional Pipe N config ... 
+ * pipe ==> + */ +void +ipfw_config_pipe(int ac, char **av) +{ + int i, j; + char *end; + void *par = NULL; + struct dn_id *buf, *base; + struct dn_sch *sch = NULL; + struct dn_link *p = NULL; + struct dn_fs *fs = NULL; + struct dn_profile *pf = NULL; + struct ipfw_flow_id *mask = NULL; + int lmax; + uint32_t _foo = 0, *flags = &_foo , *buckets = &_foo; + size_t max_pf_size = sizeof(struct dn_profile) + ED_MAX_SAMPLES_NO * sizeof(int); + + /* + * allocate space for 1 header, + * 1 scheduler, 1 link, 1 flowset, 1 profile + */ + lmax = sizeof(struct dn_id); /* command header */ + lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + + sizeof(struct dn_fs); + lmax += max_pf_size; + + av++; ac--; + /* Pipe number */ + if (ac && isdigit(**av)) { + i = atoi(*av); av++; ac--; + } else + i = -1; + if (i <= 0) + errx(EX_USAGE, "need a pipe/flowset/sched number"); + base = buf = safe_calloc(1, lmax); + /* all commands start with a 'CONFIGURE' and a version */ + o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); + base->id = DN_API_VERSION; + + switch (co.do_pipe) { + case 1: /* "pipe N config ..." */ + /* Allocate space for the WF2Q+ scheduler, its link + * and the FIFO flowset. Set the number, but leave + * the scheduler subtype and other parameters to 0 + * so the kernel will use appropriate defaults. + * XXX todo: add a flag to record if a parameter + * is actually configured. + * If we do a 'pipe config' mask -> sched_mask. + * The FIFO scheduler and link are derived from the + * WF2Q+ one in the kernel. 
+ */ + sch = o_next(&buf, sizeof(*sch), DN_SCH); + p = o_next(&buf, sizeof(*p), DN_LINK); + fs = o_next(&buf, sizeof(*fs), DN_FS); + + sch->sched_nr = i; + sch->oid.subtype = 0; /* defaults to WF2Q+ */ + mask = &sch->sched_mask; + flags = &sch->flags; + buckets = &sch->buckets; + *flags |= DN_PIPE_CMD; + + p->link_nr = i; + + /* This flowset is only for the FIFO scheduler */ + fs->fs_nr = i + 2*DN_MAX_ID; + fs->sched_nr = i + DN_MAX_ID; + break; + + case 2: /* "queue N config ... " */ + fs = o_next(&buf, sizeof(*fs), DN_FS); + fs->fs_nr = i; + mask = &fs->flow_mask; + flags = &fs->flags; + buckets = &fs->buckets; + break; + + case 3: /* "sched N config ..." */ + sch = o_next(&buf, sizeof(*sch), DN_SCH); + fs = o_next(&buf, sizeof(*fs), DN_FS); + sch->sched_nr = i; + mask = &sch->sched_mask; + flags = &sch->flags; + buckets = &sch->buckets; + /* fs is used only with !MULTIQUEUE schedulers */ + fs->fs_nr = i + DN_MAX_ID; + fs->sched_nr = i; + break; + } + /* set to -1 those fields for which we want to reuse existing + * values from the kernel. + * Also, *_nr and subtype = 0 mean reuse the value from the kernel. + * XXX todo: support reuse of the mask. 
+ */ + if (p) + p->bandwidth = -1; + for (j = 0; j < sizeof(fs->par)/sizeof(fs->par[0]); j++) + fs->par[j] = -1; + while (ac > 0) { + double d; + int tok = match_token(dummynet_params, *av); + ac--; av++; + + switch(tok) { + case TOK_NOERROR: + NEED(fs, "noerror is only for pipes"); + fs->flags |= DN_NOERROR; + break; + + case TOK_PLR: + NEED(fs, "plr is only for pipes"); + NEED1("plr needs argument 0..1\n"); + d = strtod(av[0], NULL); + if (d > 1) + d = 1; + else if (d < 0) + d = 0; + fs->plr = (int)(d*0x7fffffff); + ac--; av++; + break; + + case TOK_QUEUE: + NEED(fs, "queue is only for pipes or flowsets"); + NEED1("queue needs queue size\n"); + end = NULL; + fs->qsize = strtoul(av[0], &end, 0); + if (*end == 'K' || *end == 'k') { + fs->flags |= DN_QSIZE_BYTES; + fs->qsize *= 1024; + } else if (*end == 'B' || + _substrcmp2(end, "by", "bytes") == 0) { + fs->flags |= DN_QSIZE_BYTES; + } + ac--; av++; + break; + + case TOK_BUCKETS: + NEED(fs, "buckets is only for pipes or flowsets"); + NEED1("buckets needs argument\n"); + *buckets = strtoul(av[0], NULL, 0); + ac--; av++; + break; + + case TOK_FLOW_MASK: + case TOK_SCHED_MASK: + case TOK_MASK: + NEED(mask, "tok_mask"); + NEED1("mask needs mask specifier\n"); + /* + * per-flow queue, mask is dst_ip, dst_port, + * src_ip, src_port, proto measured in bits + */ + par = NULL; + + bzero(mask, sizeof(*mask)); + end = NULL; + + while (ac >= 1) { + uint32_t *p32 = NULL; + uint16_t *p16 = NULL; + uint32_t *p20 = NULL; + struct in6_addr *pa6 = NULL; + uint32_t a; + + tok = match_token(dummynet_params, *av); + ac--; av++; + switch(tok) { + case TOK_ALL: + /* + * special case, all bits significant + * except 'extra' (the queue number) + */ + mask->dst_ip = ~0; + mask->src_ip = ~0; + mask->dst_port = ~0; + mask->src_port = ~0; + mask->proto = ~0; + n2mask(&mask->dst_ip6, 128); + n2mask(&mask->src_ip6, 128); + mask->flow_id6 = ~0; + *flags |= DN_HAVE_MASK; + goto end_mask; + + case TOK_QUEUE: + mask->extra = ~0; + *flags |= 
DN_HAVE_MASK; + goto end_mask; + + case TOK_DSTIP: + mask->addr_type = 4; + p32 = &mask->dst_ip; + break; + + case TOK_SRCIP: + mask->addr_type = 4; + p32 = &mask->src_ip; + break; + + case TOK_DSTIP6: + mask->addr_type = 6; + pa6 = &mask->dst_ip6; + break; + + case TOK_SRCIP6: + mask->addr_type = 6; + pa6 = &mask->src_ip6; + break; + + case TOK_FLOWID: + mask->addr_type = 6; + p20 = &mask->flow_id6; + break; + + case TOK_DSTPORT: + p16 = &mask->dst_port; + break; + + case TOK_SRCPORT: + p16 = &mask->src_port; + break; + + case TOK_PROTO: + break; + + default: + ac++; av--; /* backtrack */ + goto end_mask; + } + if (ac < 1) + errx(EX_USAGE, "mask: value missing"); + if (*av[0] == '/') { + a = strtoul(av[0]+1, &end, 0); + if (pa6 == NULL) + a = (a == 32) ? ~0 : (1 << a) - 1; + } else + a = strtoul(av[0], &end, 0); + if (p32 != NULL) + *p32 = a; + else if (p16 != NULL) { + if (a > 0xFFFF) + errx(EX_DATAERR, + "port mask must be 16 bit"); + *p16 = (uint16_t)a; + } else if (p20 != NULL) { + if (a > 0xfffff) + errx(EX_DATAERR, + "flow_id mask must be 20 bit"); + *p20 = (uint32_t)a; + } else if (pa6 != NULL) { + if (a > 128) + errx(EX_DATAERR, + "in6addr invalid mask len"); + else + n2mask(pa6, a); + } else { + if (a > 0xFF) + errx(EX_DATAERR, + "proto mask must be 8 bit"); + mask->proto = (uint8_t)a; + } + if (a != 0) + *flags |= DN_HAVE_MASK; + ac--; av++; + } /* end while, config masks */ +end_mask: + break; + + case TOK_RED: + case TOK_GRED: + NEED1("red/gred needs w_q/min_th/max_th/max_p\n"); + fs->flags |= DN_IS_RED; + if (tok == TOK_GRED) + fs->flags |= DN_IS_GENTLE_RED; + /* + * the format for parameters is w_q/min_th/max_th/max_p + */ + if ((end = strsep(&av[0], "/"))) { + double w_q = strtod(end, NULL); + if (w_q > 1 || w_q <= 0) + errx(EX_DATAERR, "0 < w_q <= 1"); + fs->w_q = (int) (w_q * (1 << SCALE_RED)); + } + if ((end = strsep(&av[0], "/"))) { + fs->min_th = strtoul(end, &end, 0); + if (*end == 'K' || *end == 'k') + fs->min_th *= 1024; + } + if ((end = 
strsep(&av[0], "/"))) { + fs->max_th = strtoul(end, &end, 0); + if (*end == 'K' || *end == 'k') + fs->max_th *= 1024; + } + if ((end = strsep(&av[0], "/"))) { + double max_p = strtod(end, NULL); + if (max_p > 1 || max_p <= 0) + errx(EX_DATAERR, "0 < max_p <= 1"); + fs->max_p = (int)(max_p * (1 << SCALE_RED)); + } + ac--; av++; + break; + + case TOK_DROPTAIL: + NEED(fs, "droptail is only for flowsets"); + fs->flags &= ~(DN_IS_RED|DN_IS_GENTLE_RED); + break; + + case TOK_BW: + NEED(p, "bw is only for links"); + NEED1("bw needs bandwidth or interface\n"); + read_bandwidth(av[0], &p->bandwidth, NULL, 0); + ac--; av++; + break; + + case TOK_DELAY: + NEED(p, "delay is only for links"); + NEED1("delay needs argument 0..10000ms\n"); + p->delay = strtoul(av[0], NULL, 0); + ac--; av++; + break; + + case TOK_TYPE: { + int l; + NEED(sch, "type is only for schedulers"); + NEED1("type needs a string"); + l = strlen(av[0]); + if (l == 0 || l > 15) + errx(1, "type %s too long\n", av[0]); + strcpy(sch->name, av[0]); + sch->oid.subtype = 0; /* use string */ + ac--; av++; + break; + } + + case TOK_WEIGHT: + NEED(fs, "weight is only for flowsets"); + NEED1("weight needs argument\n"); + fs->par[0] = strtol(av[0], &end, 0); + ac--; av++; + break; + + case TOK_LMAX: + NEED(fs, "lmax is only for flowsets"); + NEED1("lmax needs argument\n"); + fs->par[1] = strtol(av[0], &end, 0); + ac--; av++; + break; + + case TOK_PRI: + NEED(fs, "priority is only for flowsets"); + NEED1("priority needs argument\n"); + fs->par[2] = strtol(av[0], &end, 0); + ac--; av++; + break; + + case TOK_SCHED: + case TOK_PIPE: + NEED(fs, "pipe/sched"); + NEED1("pipe/link/sched needs number\n"); + fs->sched_nr = strtoul(av[0], &end, 0); + ac--; av++; + break; + + case TOK_PROFILE: + { + size_t real_length; + + NEED((!pf), "profile already set"); + NEED(p, "profile"); + NEED1("extra delay needs the file name\n"); + + /* load the profile structure using the DN_API */ + pf = o_next(&buf, max_pf_size, DN_PROFILE); + 
load_extra_delays(av[0], pf, p); //XXX can't fail? + + /* compact the dn_id structure */ + real_length = sizeof(struct dn_profile) + + pf->samples_no * sizeof(int); + o_compact(&buf, max_pf_size, real_length, DN_PROFILE); + --ac; ++av; + } + break; + + case TOK_BURST: + NEED(p, "burst"); + NEED1("burst needs argument\n"); + errno = 0; + if (expand_number(av[0], (int64_t *)&p->burst) < 0) + if (errno != ERANGE) + errx(EX_DATAERR, + "burst: invalid argument"); + if (errno || p->burst > (1ULL << 48) - 1) + errx(EX_DATAERR, + "burst: out of range (0..2^48-1)"); + ac--; av++; + break; + + default: + errx(EX_DATAERR, "unrecognised option ``%s''", av[-1]); + } + } + + /* check validity of parameters */ + if (p) { + if (p->delay > 10000) + errx(EX_DATAERR, "delay must be < 10000"); + if (p->bandwidth == -1) + p->bandwidth = 0; + } + if (fs) { + /* XXX accept a 0 scheduler to keep the default */ + if (fs->flags & DN_QSIZE_BYTES) { + size_t len; + long limit; + + len = sizeof(limit); + if (sysctlbyname("net.inet.ip.dummynet.pipe_byte_limit", + &limit, &len, NULL, 0) == -1) + limit = 1024*1024; + if (fs->qsize > limit) + errx(EX_DATAERR, "queue size must be < %ldB", limit); + } else { + size_t len; + long limit; + + len = sizeof(limit); + if (sysctlbyname("net.inet.ip.dummynet.pipe_slot_limit", + &limit, &len, NULL, 0) == -1) + limit = 100; + if (fs->qsize > limit) + errx(EX_DATAERR, "2 <= queue size <= %ld", limit); + } + + if (fs->flags & DN_IS_RED) { + size_t len; + int lookup_depth, avg_pkt_size; + double w_q; + + if (fs->min_th >= fs->max_th) + errx(EX_DATAERR, "min_th %d must be < than max_th %d", + fs->min_th, fs->max_th); + if (fs->max_th == 0) + errx(EX_DATAERR, "max_th must be > 0"); + + len = sizeof(int); + if (sysctlbyname("net.inet.ip.dummynet.red_lookup_depth", + &lookup_depth, &len, NULL, 0) == -1) + lookup_depth = 256; + if (lookup_depth == 0) + errx(EX_DATAERR, "net.inet.ip.dummynet.red_lookup_depth" + " must be greater than zero"); + + len = sizeof(int); + 
if (sysctlbyname("net.inet.ip.dummynet.red_avg_pkt_size", + &avg_pkt_size, &len, NULL, 0) == -1) + avg_pkt_size = 512; + + if (avg_pkt_size == 0) + errx(EX_DATAERR, + "net.inet.ip.dummynet.red_avg_pkt_size must" + " be greater than zero"); + + /* + * Ticks needed for sending a medium-sized packet. + * Unfortunately, when we are configuring a WF2Q+ queue, we + * do not have bandwidth information, because that is stored + * in the parent pipe, and also we have multiple queues + * competing for it. So we set s=0, which is not very + * correct. But on the other hand, why do we want RED with + * WF2Q+ ? + */ +#if 0 + if (p.bandwidth==0) /* this is a WF2Q+ queue */ + s = 0; + else + s = (double)ck.hz * avg_pkt_size * 8 / p.bandwidth; +#endif + /* + * max idle time (in ticks) before avg queue size becomes 0. + * NOTA: (3/w_q) is approx the value x so that + * (1-w_q)^x < 10^-3. + */ + w_q = ((double)fs->w_q) / (1 << SCALE_RED); +#if 0 // go in kernel + idle = s * 3. / w_q; + fs->lookup_step = (int)idle / lookup_depth; + if (!fs->lookup_step) + fs->lookup_step = 1; + weight = 1 - w_q; + for (t = fs->lookup_step; t > 1; --t) + weight *= 1 - w_q; + fs->lookup_weight = (int)(weight * (1 << SCALE_RED)); +#endif + } + } + + i = do_cmd(IP_DUMMYNET3, base, (char *)buf - (char *)base); + + if (i) + err(1, "setsockopt(%s)", "IP_DUMMYNET_CONFIGURE"); +} + +void +dummynet_flush(void) +{ + struct dn_id oid; + oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); + do_cmd(IP_DUMMYNET3, &oid, oid.len); +} + +/* Parse input for 'ipfw [pipe|sched|queue] show [range list]' + * Returns the number of ranges, and possibly stores them + * in the array v of size len. + */ +static int +parse_range(int ac, char *av[], uint32_t *v, int len) +{ + int n = 0; + char *endptr, *s; + uint32_t base[2]; + + if (v == NULL || len < 2) { + v = base; + len = 2; + } + + for (s = *av; s != NULL; av++, ac--) { + v[0] = strtoul(s, &endptr, 10); + v[1] = (*endptr != '-') ? 
v[0] : + strtoul(endptr+1, &endptr, 10); + if (*endptr == '\0') { /* prepare for next round */ + s = (ac > 0) ? *(av+1) : NULL; + } else { + if (*endptr != ',') { + warn("invalid number: %s", s); + s = ++endptr; + continue; + } + /* continue processing from here */ + s = ++endptr; + ac++; + av--; + } + if (v[1] < v[0] || + v[1] < 0 || v[1] >= DN_MAX_ID-1 || + v[0] < 0 || v[1] >= DN_MAX_ID-1) { + continue; /* invalid entry */ + } + n++; + /* translate if 'pipe list' */ + if (co.do_pipe == 1) { + v[0] += DN_MAX_ID; + v[1] += DN_MAX_ID; + } + v = (n*2 < len) ? v + 2 : base; + } + return n; +} + +/* main entry point for dummynet list functions. co.do_pipe indicates + * which function we want to support. + * av may contain filtering arguments, either individual entries + * or ranges, or lists (space or commas are valid separators). + * Format for a range can be n1-n2 or n3 n4 n5 ... + * In a range n1 must be <= n2, otherwise the range is ignored. + * A number 'n4' is translate in a range 'n4-n4' + * All number must be > 0 and < DN_MAX_ID-1 + */ +void +dummynet_list(int ac, char *av[], int show_counters) +{ + struct dn_id *oid, *x = NULL; + int ret, i, l; + int n; /* # of ranges */ + int buflen; + int max_size; /* largest obj passed up */ + + ac--; + av++; /* skip 'list' | 'show' word */ + + n = parse_range(ac, av, NULL, 0); /* Count # of ranges. */ + + /* Allocate space to store ranges */ + l = sizeof(*oid) + sizeof(uint32_t) * n * 2; + oid = safe_calloc(1, l); + oid_fill(oid, l, DN_CMD_GET, DN_API_VERSION); + + if (n > 0) /* store ranges in idx */ + parse_range(ac, av, (uint32_t *)(oid + 1), n*2); + /* + * Compute the size of the largest object returned. If the + * response leaves at least this much spare space in the + * buffer, then surely the response is complete; otherwise + * there might be a risk of truncation and we will need to + * retry with a larger buffer. + * XXX don't bother with smaller structs. 
+ */ + max_size = sizeof(struct dn_fs); + if (max_size < sizeof(struct dn_sch)) + max_size = sizeof(struct dn_sch); + if (max_size < sizeof(struct dn_flow)) + max_size = sizeof(struct dn_flow); + + switch (co.do_pipe) { + case 1: + oid->subtype = DN_LINK; /* list pipe */ + break; + case 2: + oid->subtype = DN_FS; /* list queue */ + break; + case 3: + oid->subtype = DN_SCH; /* list sched */ + break; + } + + /* + * Ask the kernel an estimate of the required space (result + * in oid.id), unless we are requesting a subset of objects, + * in which case the kernel does not give an exact answer. + * In any case, space might grow in the meantime due to the + * creation of new queues, so we must be prepared to retry. + */ + if (n > 0) { + buflen = 4*1024; + } else { + ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); + if (ret != 0 || oid->id <= sizeof(*oid)) + goto done; + buflen = oid->id + max_size; + oid->len = sizeof(*oid); /* restore */ + } + /* Try a few times, until the buffer fits */ + for (i = 0; i < 20; i++) { + l = buflen; + x = safe_realloc(x, l); + bcopy(oid, x, oid->len); + ret = do_cmd(-IP_DUMMYNET3, x, (uintptr_t)&l); + if (ret != 0 || x->id <= sizeof(*oid)) + goto done; /* no response */ + if (l + max_size <= buflen) + break; /* ok */ + buflen *= 2; /* double for next attempt */ + } + list_pipes(x, O_NEXT(x, l)); +done: + if (x) + free(x); + free(oid); +} diff --git a/ipfw/expand_number.c b/ipfw/expand_number.c new file mode 100644 index 0000000..d557111 --- /dev/null +++ b/ipfw/expand_number.c @@ -0,0 +1,100 @@ +/*- + * Copyright (c) 2007 Eric Anderson + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +// #include +__FBSDID("$FreeBSD: src/lib/libutil/expand_number.c,v 1.2.4.2 2009/06/10 14:52:34 des Exp $"); + +#include +#include +#include +#include +//#include +#include + +/* + * Convert an expression of the following forms to a int64_t. + * 1) A positive decimal number. + * 2) A positive decimal number followed by a 'b' or 'B' (mult by 1). + * 3) A positive decimal number followed by a 'k' or 'K' (mult by 1 << 10). + * 4) A positive decimal number followed by a 'm' or 'M' (mult by 1 << 20). + * 5) A positive decimal number followed by a 'g' or 'G' (mult by 1 << 30). + * 6) A positive decimal number followed by a 't' or 'T' (mult by 1 << 40). + * 7) A positive decimal number followed by a 'p' or 'P' (mult by 1 << 50). + * 8) A positive decimal number followed by a 'e' or 'E' (mult by 1 << 60). 
/*
 * Convert a string of the form <number>[unit] into an int64_t.
 *
 * The number is parsed with strtoimax() in base 0 (decimal, 0x hex or
 * leading-0 octal).  The optional unit is one of b, k, m, g, t, p, e
 * (either case) and multiplies the number by 1024^0 .. 1024^6.
 * Characters following the unit letter are not examined.
 *
 * Returns 0 and stores the result in *num on success.
 * Returns -1 without touching *num and sets errno to
 *   EINVAL  if buf has no digits or the unit letter is not recognised,
 *   ERANGE  if the number, or the scaled result, does not fit in int64_t.
 */
int
expand_number(const char *buf, int64_t *num)
{
	char *endptr;
	intmax_t parsed;
	int64_t number;
	int saved_errno, steps;

	saved_errno = errno;
	errno = 0;
	parsed = strtoimax(buf, &endptr, 0);

	if (endptr == buf) {
		/* No valid digits. */
		errno = EINVAL;
		return (-1);
	}
	if (errno == ERANGE || parsed > INT64_MAX || parsed < INT64_MIN) {
		/* The literal itself is already out of range. */
		errno = ERANGE;
		return (-1);
	}
	errno = saved_errno;	/* parsing succeeded: leave errno as found */
	number = (int64_t)parsed;

	/* <ctype.h> functions need an unsigned char value. */
	switch (tolower((unsigned char)*endptr)) {
	case '\0':	/* No unit. */
	case 'b':
		steps = 0;
		break;
	case 'k':
		steps = 1;
		break;
	case 'm':
		steps = 2;
		break;
	case 'g':
		steps = 3;
		break;
	case 't':
		steps = 4;
		break;
	case 'p':
		steps = 5;
		break;
	case 'e':
		steps = 6;
		break;
	default:
		/* Unrecognized unit. */
		errno = EINVAL;
		return (-1);
	}

	/*
	 * Scale by 1024 per step.  The range is checked before each
	 * multiplication: comparing a shifted value with the original
	 * misses overflows that wrap to a larger positive value, and
	 * shifting a negative value is undefined.
	 */
	for (; steps > 0; steps--) {
		if (number > INT64_MAX / 1024 || number < INT64_MIN / 1024) {
			errno = ERANGE;
			return (-1);
		}
		number *= 1024;
	}

	*num = number;
	return (0);
}
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: glue.c 5881 2010-03-25 14:29:48Z svn_panicucci $ + * + * Userland functions missing in linux/Windows + */ + +#include +#include +#include + +#ifdef _WIN32 +#include +#include +#endif /* _WIN32 */ + +#ifndef HAVE_NAT +/* dummy nat functions */ +void +ipfw_show_nat(int ac, char **av) +{ + fprintf(stderr, "%s unsupported\n", __FUNCTION__); +} + +void +ipfw_config_nat(int ac, char **av) +{ + fprintf(stderr, "%s unsupported\n", __FUNCTION__); +} +#endif + +#ifdef __linux__ +int optreset; /* missing in linux */ +#endif + +/* + * not implemented in linux. 
/*
 * Minimal strtonum() replacement for platforms lacking it.
 *
 * Parses nptr with strtoll() in base 0 and requires the whole string to
 * be consumed.
 *
 * On success the parsed value is returned, *errstr (if errstr is not
 * NULL) is set to NULL and errno is left as it was on entry.
 * On failure *errstr (if errstr is not NULL) points to a static message:
 *   "invalid"    no digits, trailing characters, or minval > maxval
 *                (errno = EINVAL)
 *   "too small"  value below minval          (errno = ERANGE)
 *   "too large"  value above maxval          (errno = ERANGE)
 * The value produced by the underlying conversion is still returned on
 * failure, so callers must test *errstr rather than the return value.
 */
long long int
strtonum(const char *nptr, long long minval, long long maxval,
	 const char **errstr)
{
	const char *msg = NULL;
	char *end = NULL;	/* own end pointer: errstr may be NULL */
	long long ret;
	int saved_errno = errno;	/* restored on success */
	int conv_errno;

	errno = 0;
#ifdef TCC
	ret = strtol(nptr, &end, 0);
#else
	ret = strtoll(nptr, &end, 0);
#endif
	conv_errno = errno;

	if (minval > maxval || end == nptr || *end != '\0' ||
	    (conv_errno != 0 && conv_errno != ERANGE)) {
		msg = "invalid";
		errno = EINVAL;
	} else if (conv_errno == ERANGE) {
		/* the conversion saturated: report which side overflowed */
		msg = (ret < 0) ? "too small" : "too large";
		errno = ERANGE;
	} else if (ret < minval) {
		msg = "too small";
		errno = ERANGE;
	} else if (ret > maxval) {
		msg = "too large";
		errno = ERANGE;
	} else {
		errno = saved_errno;
	}

	if (errstr != NULL)
		*errstr = msg;
	return ret;
}
/*
 * sysctlbyname() replacement: get and/or set a module parameter by name.
 *
 * name		dotted sysctl name.
 * oldp/oldlenp	buffer (and its size) receiving the current value, or
 *		NULL if the old value is not wanted.
 * newp/newlen	new value to store, or NULL/0 for a pure read.
 *
 * Returns 0 on success, -1 on errors.
 *
 * Two implementations are selected at compile time:
 *
 *  - _WIN32 / EMULATE_SYSCTL: the request travels in a struct dn_id over
 *    the IP_DUMMYNET3 sockopt.  A set sends DN_SYSCTL_SET with one
 *    sysctlhead entry.  Otherwise the whole table is fetched with
 *    DN_SYSCTL_GET (a first call learns the size from oid->id) and is
 *    scanned in userland, either to copy one value out (a "get": name,
 *    oldp and a non-zero *oldlenp are given) or to print the entries
 *    matching name, or all of them when name is NULL (a "print").
 *
 *  - otherwise: the last component of name is read from / written to
 *    the file /sys/module/ipfw_mod/parameters/<component>.  No type
 *    information is available, so values are treated as int or long
 *    according to the buffer size; other sizes are rejected.
 */
int
sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp,
	size_t newlen)
{
#if defined (_WIN32) || defined (EMULATE_SYSCTL)
	int l;
	int rv = -1;			/* result handed back at 'out' */
	int is_get;
	struct dn_id *oid = NULL;	/* request buffer, freed at 'out' */
	struct sysctlhead *entry;
	char *pstring;
	char *pdata;
	FILE *fp;

	/*
	 * NOTE(review): *oldlenp is a size_t, so this test can never be
	 * true and fp is always stdout.  The "negative oldlen means oldp
	 * is a FILE *" convention would need a signed length; confirm
	 * with the caller before changing it.
	 */
	if ((oldlenp != NULL) && (*oldlenp < 0))
		fp = (FILE *)oldp;
	else
		fp = stdout;

	if (newp != NULL && newlen != 0) {
		/* this is a set */
		size_t namelen;

		if (name == NULL)
			return -1;
		namelen = strlen(name) + 1;	/* includes the NUL */

		l = sizeof(struct dn_id) + sizeof(struct sysctlhead) +
		    namelen + newlen;
		oid = malloc(l);
		if (oid == NULL)
			return -1;
		oid->len = l;
		oid->type = DN_SYSCTL_SET;
		oid->id = DN_API_VERSION;

		/* layout: dn_id | sysctlhead | data | name */
		entry = (struct sysctlhead *)(oid + 1);
		pdata = (char *)(entry + 1);
		pstring = pdata + newlen;

		entry->blocklen = ((sizeof(struct sysctlhead) + namelen +
		    newlen) + 3) & ~3;
		entry->namelen = namelen;
		entry->flags = 0;
		entry->datalen = newlen;

		bcopy(newp, pdata, newlen);
		bcopy(name, pstring, namelen);

		if (do_cmd(IP_DUMMYNET3, oid, (uintptr_t)l) == 0)
			rv = 0;
		goto out;
	}

	/* this is a get or a print: first probe the table size */
	l = sizeof(struct dn_id);
	oid = malloc(l);
	if (oid == NULL)
		return -1;
	oid->len = l;
	oid->type = DN_SYSCTL_GET;
	oid->id = DN_API_VERSION;

	if (do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l) != 0)
		goto out;

	/* the size needed for the table comes back in oid->id */
	l = oid->id;
	free(oid);
	oid = malloc(l);
	if (oid == NULL)
		return -1;
	oid->len = l;
	oid->type = DN_SYSCTL_GET;
	oid->id = DN_API_VERSION;

	if (do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l) != 0)
		goto out;

	is_get = (name != NULL && oldp != NULL &&
	    oldlenp != NULL && *oldlenp > 0);

	/* entries are packed back to back; blocklen == 0 ends the table */
	for (entry = (struct sysctlhead *)(oid + 1); entry->blocklen != 0;
	    entry = (struct sysctlhead *)((unsigned char *)entry +
	    entry->blocklen)) {
		pdata = (char *)(entry + 1);
		pstring = pdata + entry->datalen;

		if (is_get) {
			if (strcmp(name, pstring) != 0)
				continue;
			/* match found, sanity check on len */
			if (*oldlenp < entry->datalen) {
				printf("%s error: buffer too small\n",__FUNCTION__);
				goto out;
			}
			*oldlenp = entry->datalen;
			bcopy(pdata, oldp, *oldlenp);
			rv = 0;
			goto out;
		}

		/* print: everything, or names equal to / below 'name' */
		if (name != NULL) {
			size_t nl = strlen(name);

			if (strncmp(pstring, name, nl) != 0 ||
			    (pstring[nl] != '\0' && pstring[nl] != '.'))
				continue;
		}

		fprintf(fp, "%s: ",pstring);
		switch (entry->flags >> 2) {
		case SYSCTLTYPE_LONG:
			fprintf(fp, "%li ", *(long*)(pdata));
			break;
		case SYSCTLTYPE_UINT:
			fprintf(fp, "%u ", *(unsigned int*)(pdata));
			break;
		case SYSCTLTYPE_ULONG:
			fprintf(fp, "%lu ", *(unsigned long*)(pdata));
			break;
		case SYSCTLTYPE_INT:
		default:
			fprintf(fp, "%i ", *(int*)(pdata));
		}
		if ((entry->flags & 0x00000003) == CTLFLAG_RD)
			fprintf(fp, "\t(read only)\n");
		else
			fprintf(fp, "\n");
	}

	/* a get that matched nothing filled no buffer: report failure */
	rv = is_get ? -1 : 0;
out:
	free(oid);
	return rv;

#else /* __linux__ */
	static const char sysfs_dir[] = "/sys/module/ipfw_mod/parameters/";
	char filename[256];	/* full filename */
	const char *varp;
	FILE *fp;
	int ret = 0;		/* return value */

	if (name == NULL)	/* XXX set errno */
		return -1;

	/* the parameter file is named after the last name component */
	varp = strrchr(name, '.');
	if (varp == NULL)	/* XXX set errno */
		return -1;

	snprintf(filename, sizeof(filename), "%s%s", sysfs_dir, varp+1);

	if (oldp && oldlenp) {	/* read mode */
		fp = fopen(filename, "r");
		if (fp == NULL) {
			fprintf(stderr, "%s fopen error reading filename %s\n", __FUNCTION__, filename);
			return -1;
		}
		if (*oldlenp == sizeof(int)) {
			int d;

			if (fscanf(fp, "%d", &d) == 1)
				memcpy(oldp, &d, sizeof(d));
			else
				ret = -1;
		} else if (*oldlenp == sizeof(long)) {
			/* callers also pass 'long' buffers (8 bytes on LP64) */
			long ld;

			if (fscanf(fp, "%ld", &ld) == 1)
				memcpy(oldp, &ld, sizeof(ld));
			else
				ret = -1;
		} else {
			/* unknown width: nothing is stored, so say so */
			ret = -1;
		}
		fclose(fp);
	}

	if (newp && newlen) {	/* write */
		fp = fopen(filename, "w");
		if (fp == NULL) {
			fprintf(stderr, "%s fopen error writing filename %s\n", __FUNCTION__, filename);
			return -1;
		}
		if (newlen == sizeof(int)) {
			if (fprintf(fp, "%d", *(int*)newp) < 1)
				ret = -1;
		} else if (newlen == sizeof(long)) {
			if (fprintf(fp, "%ld", *(long*)newp) < 1)
				ret = -1;
		} else {
			ret = -1;
		}
		/* buffered output reaches the file only here */
		if (fclose(fp) != 0)
			ret = -1;
	}

	return ret;
#endif /* __linux__ */
}
0 : -1); +} + +int +wnd_getsockopt(int s, int level, int sopt_name, void *optval, + socklen_t *optlen) +{ + size_t len = sizeof (struct sockopt) + *optlen; + struct sockopt *sock; + DWORD n; + BOOL result; + HANDLE _dev_h = (HANDLE)s; + + sock = malloc(len); + if (sock == NULL) + return -1; + + sock->sopt_dir = SOPT_GET; + sock->sopt_name = sopt_name; + sock->sopt_valsize = *optlen; + sock->sopt_val = (void *)(sock+1); + + memcpy (sock->sopt_val, optval, *optlen); + + result = DeviceIoControl (_dev_h, IP_FW_GETSOCKOPT, sock, len, + sock, len, &n, NULL); + //printf("len = %i, returned = %u, valsize = %i\n",len,n,sock->sopt_valsize); + *optlen = sock->sopt_valsize; + memcpy (optval, sock->sopt_val, *optlen); + free (sock); + return (result ? 0 : -1); +} + +int +my_socket(int domain, int ty, int proto) +{ + TCHAR *pcCommPort = TEXT("\\\\.\\Ipfw"); + HANDLE _dev_h = INVALID_HANDLE_VALUE; + + /* Special Handling For Accessing Device On Windows 2000 Terminal Server + See Microsoft KB Article 259131 */ + if (_dev_h == INVALID_HANDLE_VALUE) { + _dev_h = CreateFile (pcCommPort, + GENERIC_READ | GENERIC_WRITE, + 0, NULL, + OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); + } + if (_dev_h == INVALID_HANDLE_VALUE) { + printf("%s failed %u, cannot talk to kernel module\n", + __FUNCTION__, (unsigned)GetLastError()); + return -1; + } + return (int)_dev_h; +} + +struct hostent* gethostbyname2(const char *name, int af) +{ + return gethostbyname(name); +} + +struct ether_addr* ether_aton(const char *a) +{ + fprintf(stderr, "%s empty\n", __FUNCTION__); + return NULL; +} + +#ifdef TCC +int opterr = 1, /* if error message should be printed */ + optind = 1, /* index into parent argv vector */ + optopt, /* character checked for validity */ + optreset; /* reset getopt */ +char *optarg; /* argument associated with option */ + +#define BADCH (int)'?' +#define BADARG (int)':' +#define EMSG "" + +#define PROGNAME "ipfw" +/* + * getopt -- + * Parse argc/argv argument vector. 
+ */ +int +getopt(nargc, nargv, ostr) + int nargc; + char * const nargv[]; + const char *ostr; +{ + static char *place = EMSG; /* option letter processing */ + char *oli; /* option letter list index */ + + if (optreset || *place == 0) { /* update scanning pointer */ + optreset = 0; + place = nargv[optind]; + if (optind >= nargc || *place++ != '-') { + /* Argument is absent or is not an option */ + place = EMSG; + return (-1); + } + optopt = *place++; + if (optopt == '-' && *place == 0) { + /* "--" => end of options */ + ++optind; + place = EMSG; + return (-1); + } + if (optopt == 0) { + /* Solitary '-', treat as a '-' option + if the program (eg su) is looking for it. */ + place = EMSG; + if (strchr(ostr, '-') == NULL) + return (-1); + optopt = '-'; + } + } else + optopt = *place++; + + /* See if option letter is one the caller wanted... */ + if (optopt == ':' || (oli = strchr(ostr, optopt)) == NULL) { + if (*place == 0) + ++optind; + if (opterr && *ostr != ':') + (void)fprintf(stderr, + "%s: illegal option -- %c\n", PROGNAME, + optopt); + return (BADCH); + } + + /* Does this option need an argument? */ + if (oli[1] != ':') { + /* don't need argument */ + optarg = NULL; + if (*place == 0) + ++optind; + } else { + /* Option-argument is either the rest of this argument or the + entire next argument. */ + if (*place) + optarg = place; + else if (nargc > ++optind) + optarg = nargv[optind]; + else { + /* option-argument absent */ + place = EMSG; + if (*ostr == ':') + return (BADARG); + if (opterr) + (void)fprintf(stderr, + "%s: option requires an argument -- %c\n", + PROGNAME, optopt); + return (BADCH); + } + place = EMSG; + ++optind; + } + return (optopt); /* return option letter */ +} + +//static FILE *err_file = stderr; +void +verrx(int ex, int eval, const char *fmt, va_list ap) +{ + fprintf(stderr, "%s: ", PROGNAME); + if (fmt != NULL) + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + if (ex) + exit(eval); +} +void +errx(int eval, const char *fmt, ...) 
/* size in bytes of an IPv4 address in network format */
#define NS_INADDRSZ	4

/*
 * inet_pton() replacement supporting AF_INET only.
 *
 * Converts the dotted-decimal string src ("a.b.c.d", exactly four
 * decimal octets, each 0..255 with no leading zeros) into NS_INADDRSZ
 * bytes in network order, stored at dst.
 *
 * Returns 1 on success, 0 if src is not a valid address (dst is left
 * untouched), and -1 with errno = EINVAL if af is not AF_INET.
 */
int
inet_pton(int af, const char *src, void *dst)
{
	static const char digits[] = "0123456789";
	int saw_digit, octets, ch;
	unsigned char tmp[NS_INADDRSZ], *tp;

	if (af != AF_INET) {
		errno = EINVAL;
		return -1;
	}

	saw_digit = 0;
	octets = 0;
	*(tp = tmp) = 0;
	while ((ch = *src++) != '\0') {
		const char *pch;

		if ((pch = strchr(digits, ch)) != NULL) {
			unsigned int new = *tp * 10 + (unsigned int)(pch - digits);

			/* a second digit after a leading 0 is rejected */
			if (saw_digit && *tp == 0)
				return (0);
			if (new > 255)
				return (0);
			*tp = (unsigned char)new;
			if (!saw_digit) {
				if (++octets > 4)
					return (0);
				saw_digit = 1;
			}
		} else if (ch == '.' && saw_digit) {
			/* at most three dots, so tp never leaves tmp[] */
			if (octets == 4)
				return (0);
			*++tp = 0;
			saw_digit = 0;
		} else
			return (0);
	}
	if (octets < 4)
		return (0);
	/* copy exactly one IPv4 address: dst is only that large */
	memcpy(dst, tmp, NS_INADDRSZ);
	return (1);
}
'a' : 'A')); + c = *++cp; + digit = 1; + } else + break; + } + if (c == '.') { + /* + * Internet format: + * a.b.c.d + * a.b.c (with c treated as 16 bits) + * a.b (with b treated as 24 bits) + */ + if (pp >= parts + 3 || val > 0xffU) + return (0); + *pp++ = val; + c = *++cp; + } else + break; + } + /* + * Check for trailing characters. + */ + if (c != '\0' && (!isascii(c) || !isspace((unsigned char)c))) + return (0); + /* + * Did we get a valid digit? + */ + if (!digit) + return (0); + /* + * Concoct the address according to + * the number of parts specified. + */ + n = pp - parts + 1; + switch (n) { + case 1: /*%< a -- 32 bits */ + break; + + case 2: /*%< a.b -- 8.24 bits */ + if (val > 0xffffffU) + return (0); + val |= parts[0] << 24; + break; + + case 3: /*%< a.b.c -- 8.8.16 bits */ + if (val > 0xffffU) + return (0); + val |= (parts[0] << 24) | (parts[1] << 16); + break; + + case 4: /*%< a.b.c.d -- 8.8.8.8 bits */ + if (val > 0xffU) + return (0); + val |= (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8); + break; + } + if (addr != NULL) + addr->s_addr = htonl(val); + return (1); +} + +#endif /* TCC */ + +#endif /* _WIN32 */ diff --git a/ipfw/humanize_number.c b/ipfw/humanize_number.c new file mode 100644 index 0000000..90aa18b --- /dev/null +++ b/ipfw/humanize_number.c @@ -0,0 +1,153 @@ +/* $NetBSD: humanize_number.c,v 1.13 2007/12/14 17:26:19 christos Exp $ */ + +/* + * Copyright (c) 1997, 1998, 1999, 2002 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, + * NASA Ames Research Center, by Luke Mewburn and by Tomas Svensson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +// #include +__FBSDID("$FreeBSD: src/lib/libutil/humanize_number.c,v 1.2.10.1 2008/04/20 16:29:01 antoine Exp $"); + +#include +#include +#include +#include +#include +#include +// #include +//#include + +int +humanize_number(char *buf, size_t len, int64_t bytes, + const char *suffix, int scale, int flags) +{ + const char *prefixes, *sep; + int b, i, r, maxscale, s1, s2, sign; + int64_t divisor, max; + size_t baselen; + + assert(buf != NULL); + assert(suffix != NULL); + assert(scale >= 0); + + if (flags & HN_DIVISOR_1000) { + /* SI for decimal multiplies */ + divisor = 1000; + if (flags & HN_B) + prefixes = "B\0k\0M\0G\0T\0P\0E"; + else + prefixes = "\0\0k\0M\0G\0T\0P\0E"; + } else { + /* + * binary multiplies + * XXX IEC 60027-2 recommends Ki, Mi, Gi... + */ + divisor = 1024; + if (flags & HN_B) + prefixes = "B\0K\0M\0G\0T\0P\0E"; + else + prefixes = "\0\0K\0M\0G\0T\0P\0E"; + } + +#define SCALE2PREFIX(scale) (&prefixes[(scale) << 1]) + maxscale = 7; + + if (scale >= maxscale && + (scale & (HN_AUTOSCALE | HN_GETSCALE)) == 0) + return (-1); + + if (buf == NULL || suffix == NULL) + return (-1); + + if (len > 0) + buf[0] = '\0'; + if (bytes < 0) { + sign = -1; + bytes *= -100; + baselen = 3; /* sign, digit, prefix */ + } else { + sign = 1; + bytes *= 100; + baselen = 2; /* digit, prefix */ + } + if (flags & HN_NOSPACE) + sep = ""; + else { + sep = " "; + baselen++; + } + baselen += strlen(suffix); + + /* Check if enough room for `x y' + suffix + `\0' */ + if (len < baselen + 1) + return (-1); + + if (scale & (HN_AUTOSCALE | HN_GETSCALE)) { + /* See if there is additional columns can be used. */ + for (max = 100, i = len - baselen; i-- > 0;) + max *= 10; + + /* + * Divide the number until it fits the given column. + * If there will be an overflow by the rounding below, + * divide once more. 
+ */ + for (i = 0; bytes >= max - 50 && i < maxscale; i++) + bytes /= divisor; + + if (scale & HN_GETSCALE) + return (i); + } else + for (i = 0; i < scale && i < maxscale; i++) + bytes /= divisor; + + /* If a value <= 9.9 after rounding and ... */ + if (bytes < 995 && i > 0 && flags & HN_DECIMAL) { + /* baselen + \0 + .N */ + if (len < baselen + 1 + 2) + return (-1); + b = ((int)bytes + 5) / 10; + s1 = b / 10; + s2 = b % 10; + r = snprintf(buf, len, "%d%s%d%s%s%s", + sign * s1, ".", s2, + sep, SCALE2PREFIX(i), suffix); + } else + r = snprintf(buf, len, "%" PRId64 "%s%s%s", + sign * ((bytes + 50) / 100), + sep, SCALE2PREFIX(i), suffix); + + return (r); +} diff --git a/ipfw/include/alias.h b/ipfw/include/alias.h new file mode 100644 index 0000000..888bd0d --- /dev/null +++ b/ipfw/include/alias.h @@ -0,0 +1,71 @@ +#ifndef _ALIAS_H_ +#define _ALIAS_H_ + +#define LIBALIAS_BUF_SIZE 128 + +/* + * If PKT_ALIAS_LOG is set, a message will be printed to /var/log/alias.log + * every time a link is created or deleted. This is useful for debugging. + */ +#define PKT_ALIAS_LOG 0x01 + +/* + * If PKT_ALIAS_DENY_INCOMING is set, then incoming connections (e.g. to ftp, + * telnet or web servers will be prevented by the aliasing mechanism. + */ +#define PKT_ALIAS_DENY_INCOMING 0x02 + +/* + * If PKT_ALIAS_SAME_PORTS is set, packets will be attempted sent from the + * same port as they originated on. This allows e.g. rsh to work *99% of the + * time*, but _not_ 100% (it will be slightly flakey instead of not working + * at all). This mode bit is set by PacketAliasInit(), so it is a default + * mode of operation. + */ +#define PKT_ALIAS_SAME_PORTS 0x04 + +/* + * If PKT_ALIAS_USE_SOCKETS is set, then when partially specified links (e.g. + * destination port and/or address is zero), the packet aliasing engine will + * attempt to allocate a socket for the aliasing port it chooses. This will + * avoid interference with the host machine. Fully specified links do not + * require this. 
This bit is set after a call to PacketAliasInit(), so it is + * a default mode of operation. + */ +#ifndef NO_USE_SOCKETS +#define PKT_ALIAS_USE_SOCKETS 0x08 +#endif +/*- + * If PKT_ALIAS_UNREGISTERED_ONLY is set, then only packets with + * unregistered source addresses will be aliased. Private + * addresses are those in the following ranges: + * + * 10.0.0.0 -> 10.255.255.255 + * 172.16.0.0 -> 172.31.255.255 + * 192.168.0.0 -> 192.168.255.255 + */ +#define PKT_ALIAS_UNREGISTERED_ONLY 0x10 + +/* + * If PKT_ALIAS_RESET_ON_ADDR_CHANGE is set, then the table of dynamic + * aliasing links will be reset whenever PacketAliasSetAddress() changes the + * default aliasing address. If the default aliasing address is left + * unchanged by this function call, then the table of dynamic aliasing links + * will be left intact. This bit is set after a call to PacketAliasInit(). + */ +#define PKT_ALIAS_RESET_ON_ADDR_CHANGE 0x20 + + +/* + * If PKT_ALIAS_PROXY_ONLY is set, then NAT will be disabled and only + * transparent proxying is performed. + */ +#define PKT_ALIAS_PROXY_ONLY 0x40 + +/* + * If PKT_ALIAS_REVERSE is set, the actions of PacketAliasIn() and + * PacketAliasOut() are reversed. + */ +#define PKT_ALIAS_REVERSE 0x80 + +#endif /* !_ALIAS_H_ */ diff --git a/ipfw/include/net/if_dl.h b/ipfw/include/net/if_dl.h new file mode 100644 index 0000000..4d2b4f7 --- /dev/null +++ b/ipfw/include/net/if_dl.h @@ -0,0 +1,82 @@ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_dl.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD: src/sys/net/if_dl.h,v 1.14 2005/01/07 01:45:34 imp Exp $ + */ + +#ifndef _NET_IF_DL_H_ +#define _NET_IF_DL_H_ + +/* + * A Link-Level Sockaddr may specify the interface in one of two + * ways: either by means of a system-provided index number (computed + * anew and possibly differently on every reboot), or by a human-readable + * string such as "il0" (for managerial convenience). + * + * Census taking actions, such as something akin to SIOCGCONF would return + * both the index and the human name. + * + * High volume transactions (such as giving a link-level ``from'' address + * in a recvfrom or recvmsg call) may be likely only to provide the indexed + * form, (which requires fewer copy operations and less space). 
+ * + * The form and interpretation of the link-level address is purely a matter + * of convention between the device driver and its consumers; however, it is + * expected that all drivers for an interface of a given if_type will agree. + */ + +/* + * Structure of a Link-Level sockaddr: + */ +struct sockaddr_dl { + u_char sdl_len; /* Total length of sockaddr */ + u_char sdl_family; /* AF_LINK */ + u_short sdl_index; /* if != 0, system given index for interface */ + u_char sdl_type; /* interface type */ + u_char sdl_nlen; /* interface name length, no trailing 0 reqd. */ + u_char sdl_alen; /* link level address length */ + u_char sdl_slen; /* link layer selector length */ + char sdl_data[46]; /* minimum work area, can be larger; + contains both if name and ll address */ +}; + +/* LLADDR(s): pointer to the link-level address bytes of s, which start sdl_nlen bytes into sdl_data (i.e. right after the interface name). */ #define LLADDR(s) ((caddr_t)((s)->sdl_data + (s)->sdl_nlen)) + +#ifndef _KERNEL + +#include + +__BEGIN_DECLS +void link_addr(const char *, struct sockaddr_dl *); +char *link_ntoa(const struct sockaddr_dl *); +__END_DECLS + +#endif /* !_KERNEL */ + +#endif diff --git a/ipfw/include/net/pfvar.h b/ipfw/include/net/pfvar.h new file mode 100644 index 0000000..304cb16 --- /dev/null +++ b/ipfw/include/net/pfvar.h @@ -0,0 +1,32 @@ +#ifndef _PF_VAR_H_ +#define _PF_VAR_H_ + +/* + * replacement for FreeBSD's pfqueue.h + */ +#include + +#define DIOCSTARTALTQ _IO ('D', 42) +#define DIOCSTOPALTQ _IO ('D', 43) + +struct pf_altq { + TAILQ_ENTRY(pf_altq) entries; + /* ...
*/ + u_int32_t qid; /* return value */ + +#define PF_QNAME_SIZE 64 + char qname[PF_QNAME_SIZE]; /* queue name */ + +}; + +struct pfioc_altq { + u_int32_t action; + u_int32_t ticket; + u_int32_t nr; + struct pf_altq altq; +}; + +#define DIOCGETALTQS _IOWR('D', 47, struct pfioc_altq) +#define DIOCGETALTQ _IOWR('D', 48, struct pfioc_altq) + +#endif /* !_PF_VAR_H_ */ diff --git a/ipfw/include/timeconv.h b/ipfw/include/timeconv.h new file mode 100644 index 0000000..f3b8d22 --- /dev/null +++ b/ipfw/include/timeconv.h @@ -0,0 +1,29 @@ +/* + * simple override for _long_to_time() + * + * Converts a long to time_t with plain casts. When long has the same + * size as __int32_t the value is first cast to __int32_t; otherwise it + * is cast to time_t directly. + */ +#ifndef _TIMECONV_H_ +#define _TIMECONV_H_ +static __inline time_t +_long_to_time(long tlong) +{ + if (sizeof(long) == sizeof(__int32_t)) + return((time_t)(__int32_t)(tlong)); + return((time_t)tlong); +} + +#ifdef __linux__ + +/* + * some linux headers have variables called __unused, whereas the name + * is an alias for the gcc attribute on FreeBSD. + * We have to define __unused appropriately, but this cannot be + * global because it would clash with the linux headers. + * + * __unused is defined here because there is not a better place + * and this file is included by ipfw2.c where the offending linux + * headers are not included. + */ +#define __unused __attribute__ ((__unused__)) +#endif + +#endif /* _TIMECONV_H_ */ diff --git a/ipfw/ipfw.8 b/ipfw/ipfw.8 new file mode 100644 index 0000000..b1ec24d --- /dev/null +++ b/ipfw/ipfw.8 @@ -0,0 +1,3218 @@ +.\" +.\" $FreeBSD: head/sbin/ipfw/ipfw.8 211936 2010-08-28 16:32:01Z brucec $ +.\" +.Dd July 27, 2010 +.Dt IPFW 8 +.Os +.Sh NAME +.Nm ipfw +.Nd User interface for firewall, traffic shaper, packet scheduler, +in-kernel NAT. +.Sh SYNOPSIS +.Ss FIREWALL CONFIGURATION +.Nm +.Op Fl cq +.Cm add +.Ar rule +.Nm +.Op Fl acdefnNStT +.Op Cm set Ar N +.Brq Cm list | show +.Op Ar rule | first-last ... +.Nm +.Op Fl f | q +.Op Cm set Ar N +.Cm flush +.Nm +.Op Fl q +.Op Cm set Ar N +.Brq Cm delete | zero | resetlog +.Op Ar number ...
+.Pp +.Nm +.Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ... +.Nm +.Cm set move +.Op Cm rule +.Ar number Cm to Ar number +.Nm +.Cm set swap Ar number number +.Nm +.Cm set show +.Ss SYSCTL SHORTCUTS +.Pp +.Nm +.Cm enable +.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive +.Nm +.Cm disable +.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive +.Pp +.Ss LOOKUP TABLES +.Nm +.Cm table Ar number Cm add Ar addr Ns Oo / Ns Ar masklen Oc Op Ar value +.Nm +.Cm table Ar number Cm delete Ar addr Ns Op / Ns Ar masklen +.Nm +.Cm table +.Brq Ar number | all +.Cm flush +.Nm +.Cm table +.Brq Ar number | all +.Cm list +.Pp +.Ss DUMMYNET CONFIGURATION (TRAFFIC SHAPER AND PACKET SCHEDULER) +.Nm +.Brq Cm pipe | queue | sched +.Ar number +.Cm config +.Ar config-options +.Nm +.Op Fl s Op Ar field +.Brq Cm pipe | queue | sched +.Brq Cm delete | list | show +.Op Ar number ... +.Pp +.Ss IN-KERNEL NAT +.Nm +.Op Fl q +.Cm nat +.Ar number +.Cm config +.Ar config-options +.Pp +.Nm +.Op Fl cfnNqS +.Oo +.Fl p Ar preproc +.Oo +.Ar preproc-flags +.Oc +.Oc +.Ar pathname +.Sh DESCRIPTION +The +.Nm +utility is the user interface for controlling the +.Xr ipfw 4 +firewall, the +.Xr dummynet 4 +traffic shaper/packet scheduler, and the +in-kernel NAT services. +.Pp +A firewall configuration, or +.Em ruleset , +is made of a list of +.Em rules +numbered from 1 to 65535. +Packets are passed to the firewall +from a number of different places in the protocol stack +(depending on the source and destination of the packet, +it is possible for the firewall to be +invoked multiple times on the same packet). +The packet passed to the firewall is compared +against each of the rules in the +.Em ruleset , +in rule-number order +(multiple rules with the same number are permitted, in which case +they are processed in order of insertion). +When a match is found, the action corresponding to the +matching rule is performed. 
+.Pp +Depending on the action and certain system settings, packets +can be reinjected into the firewall at some rule after the +matching one for further processing. +.Pp +A ruleset always includes a +.Em default +rule (numbered 65535) which cannot be modified or deleted, +and matches all packets. +The action associated with the +.Em default +rule can be either +.Cm deny +or +.Cm allow +depending on how the kernel is configured. +.Pp +If the ruleset includes one or more rules with the +.Cm keep-state +or +.Cm limit +option, +the firewall will have a +.Em stateful +behaviour, i.e., upon a match it will create +.Em dynamic rules , +i.e. rules that match packets with the same 5-tuple +(protocol, source and destination addresses and ports) +as the packet which caused their creation. +Dynamic rules, which have a limited lifetime, are checked +at the first occurrence of a +.Cm check-state , +.Cm keep-state +or +.Cm limit +rule, and are typically used to open the firewall on-demand to +legitimate traffic only. +See the +.Sx STATEFUL FIREWALL +and +.Sx EXAMPLES +Sections below for more information on the stateful behaviour of +.Nm . +.Pp +All rules (including dynamic ones) have a few associated counters: +a packet count, a byte count, a log count and a timestamp +indicating the time of the last match. +Counters can be displayed or reset with +.Nm +commands. +.Pp +Each rule belongs to one of 32 different +.Em sets +, and there are +.Nm +commands to atomically manipulate sets, such as enable, +disable, swap sets, move all rules in a set to another +one, delete all rules in a set. +These can be useful to +install temporary configurations, or to test them. +See Section +.Sx SETS OF RULES +for more information on +.Em sets . 
+.Pp +.Pp +Rules can be added with the +.Cm add +command; deleted individually or in groups with the +.Cm delete +command, and globally (except those in set 31) with the +.Cm flush +command; displayed, optionally with the content of the +counters, using the +.Cm show +and +.Cm list +commands. +Finally, counters can be reset with the +.Cm zero +and +.Cm resetlog +commands. +.Pp +.Ss COMMAND OPTIONS +The following general options are available when invoking +.Nm : +.Bl -tag -width indent +.It Fl a +Show counter values when listing rules. +The +.Cm show +command implies this option. +.It Fl b +Only show the action and the comment, not the body of a rule. +Implies +.Fl c . +.It Fl c +When entering or showing rules, print them in compact form, +i.e., omitting the "ip from any to any" string +when this does not carry any additional information. +.It Fl d +When listing, show dynamic rules in addition to static ones. +.It Fl e +When listing and +.Fl d +is specified, also show expired dynamic rules. +.It Fl f +Do not ask for confirmation for commands that can cause problems +if misused, +.No i.e. Cm flush . +If there is no tty associated with the process, this is implied. +.It Fl i +When listing a table (see the +.Sx LOOKUP TABLES +section below for more information on lookup tables), format values +as IP addresses. By default, values are shown as integers. +.It Fl n +Only check syntax of the command strings, without actually passing +them to the kernel. +.It Fl N +Try to resolve addresses and service names in output. +.It Fl q +Be quiet when executing the +.Cm add , +.Cm nat , +.Cm zero , +.Cm resetlog +or +.Cm flush +commands; +(implies +.Fl f ) . +This is useful when updating rulesets by executing multiple +.Nm +commands in a script +(e.g., +.Ql sh\ /etc/rc.firewall ) , +or by processing a file with many +.Nm +rules across a remote login session. +It also stops a table add or delete +from failing if the entry already exists or is not present. 
+.Pp +The reason why this option may be important is that +for some of these actions, +.Nm +may print a message; if the action results in blocking the +traffic to the remote client, +the remote login session will be closed +and the rest of the ruleset will not be processed. +Access to the console would then be required to recover. +.It Fl S +When listing rules, show the +.Em set +each rule belongs to. +If this flag is not specified, disabled rules will not be +listed. +.It Fl s Op Ar field +When listing pipes, sort according to one of the four +counters (total or current packets or bytes). +.It Fl t +When listing, show last match timestamp converted with ctime(). +.It Fl T +When listing, show last match timestamp as seconds from the epoch. +This form can be more convenient for postprocessing by scripts. +.El +.Pp +.Ss LIST OF RULES AND PREPROCESSING +To ease configuration, rules can be put into a file which is +processed using +.Nm +as shown in the last synopsis line. +An absolute +.Ar pathname +must be used. +The file will be read line by line and applied as arguments to the +.Nm +utility. +.Pp +Optionally, a preprocessor can be specified using +.Fl p Ar preproc +where +.Ar pathname +is to be piped through. +Useful preprocessors include +.Xr cpp 1 +and +.Xr m4 1 . +If +.Ar preproc +does not start with a slash +.Pq Ql / +as its first character, the usual +.Ev PATH +name search is performed. +Care should be taken with this in environments where not all +file systems are mounted (yet) by the time +.Nm +is being run (e.g.\& when they are mounted over NFS). +Once +.Fl p +has been specified, any additional arguments are passed on to the preprocessor +for interpretation. +This allows for flexible configuration files (like conditionalizing +them on the local hostname) and the use of macros to centralize +frequently required arguments like IP addresses. 
+.Pp +.Ss TRAFFIC SHAPER CONFIGURATION +The +.Nm +.Cm pipe , queue +and +.Cm sched +commands are used to configure the traffic shaper and packet scheduler. +See the +.Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION +Section below for details. +.Pp +If the world and the kernel get out of sync the +.Nm +ABI may break, preventing you from being able to add any rules. +This can +adversely affect the booting process. +You can use +.Nm +.Cm disable +.Cm firewall +to temporarily disable the firewall to regain access to the network, +allowing you to fix the problem. +.Sh PACKET FLOW +A packet is checked against the active ruleset in multiple places +in the protocol stack, under control of several sysctl variables. +These places and variables are shown below, and it is important to +have this picture in mind in order to design a correct ruleset. +.Bd -literal -offset indent + ^ to upper layers V + | | + +----------->-----------+ + ^ V + [ip(6)_input] [ip(6)_output] net.inet(6).ip(6).fw.enable=1 + | | + ^ V + [ether_demux] [ether_output_frame] net.link.ether.ipfw=1 + | | + +-->--[bdg_forward]-->--+ net.link.bridge.ipfw=1 + ^ V + | to devices | +.Ed +.Pp +The number of +times the same packet goes through the firewall can +vary between 0 and 4 depending on packet source and +destination, and system configuration. +.Pp +Note that as packets flow through the stack, headers can be +stripped from or added to them, and so they may or may not be available +for inspection. +E.g., incoming packets will include the MAC header when +.Nm +is invoked from +.Cm ether_demux() , +but the same packets will have the MAC header stripped off when +.Nm +is invoked from +.Cm ip_input() +or +.Cm ip6_input() . +.Pp +Also note that each packet is always checked against the complete ruleset, +irrespective of the place where the check occurs, or the source of the packet.
+If a rule contains some match patterns or actions which are not valid +for the place of invocation (e.g.\& trying to match a MAC header within +.Cm ip_input +or +.Cm ip6_input ), +the match pattern will not match, but a +.Cm not +operator in front of such patterns +.Em will +cause the pattern to +.Em always +match on those packets. +It is thus the responsibility of +the programmer, if necessary, to write a suitable ruleset to +differentiate among the possible places. +.Cm skipto +rules can be useful here, as an example: +.Bd -literal -offset indent +# packets from ether_demux or bdg_forward +ipfw add 10 skipto 1000 all from any to any layer2 in +# packets from ip_input +ipfw add 10 skipto 2000 all from any to any not layer2 in +# packets from ip_output +ipfw add 10 skipto 3000 all from any to any not layer2 out +# packets from ether_output_frame +ipfw add 10 skipto 4000 all from any to any layer2 out +.Ed +.Pp +(yes, at the moment there is no way to differentiate between +ether_demux and bdg_forward). +.Sh SYNTAX +In general, each keyword or argument must be provided as +a separate command line argument, with no leading or trailing +spaces. +Keywords are case-sensitive, whereas arguments may +or may not be case-sensitive depending on their nature +(e.g.\& uid's are, hostnames are not). +.Pp +Some arguments (e.g. port or address lists) are comma-separated +lists of values. +In this case, spaces after commas ',' are allowed to make +the line more readable. +You can also put the entire +command (including flags) into a single argument. 
+E.g., the following forms are equivalent: +.Bd -literal -offset indent +ipfw -q add deny src-ip 10.0.0.0/24,127.0.0.1/8 +ipfw -q add deny src-ip 10.0.0.0/24, 127.0.0.1/8 +ipfw "-q add deny src-ip 10.0.0.0/24, 127.0.0.1/8" +.Ed +.Sh RULE FORMAT +The format of firewall rules is the following: +.Bd -ragged -offset indent +.Bk -words +.Op Ar rule_number +.Op Cm set Ar set_number +.Op Cm prob Ar match_probability +.Ar action +.Op Cm log Op Cm logamount Ar number +.Op Cm altq Ar queue +.Oo +.Bro Cm tag | untag +.Brc Ar number +.Oc +.Ar body +.Ek +.Ed +.Pp +where the body of the rule specifies which information is used +for filtering packets, among the following: +.Pp +.Bl -tag -width "Source and dest. addresses and ports" -offset XXX -compact +.It Layer-2 header fields +When available +.It IPv4 and IPv6 Protocol +TCP, UDP, ICMP, etc. +.It Source and dest. addresses and ports +.It Direction +See Section +.Sx PACKET FLOW +.It Transmit and receive interface +By name or address +.It Misc. IP header fields +Version, type of service, datagram length, identification, +fragment flag (non-zero IP offset), +Time To Live +.It IP options +.It IPv6 Extension headers +Fragmentation, Hop-by-Hop options, +Routing Headers, Source routing rthdr0, Mobile IPv6 rthdr2, IPSec options. +.It IPv6 Flow-ID +.It Misc. TCP header fields +TCP flags (SYN, FIN, ACK, RST, etc.), +sequence number, acknowledgment number, +window +.It TCP options +.It ICMP types +for ICMP packets +.It ICMP6 types +for ICMP6 packets +.It User/group ID +When the packet can be associated with a local socket. +.It Divert status +Whether a packet came from a divert socket (e.g., +.Xr natd 8 ) . +.It Fib annotation state +Whether a packet has been tagged for using a specific FIB (routing table) +in future forwarding decisions. 
+.El +.Pp +Note that some of the above information, e.g.\& source MAC or IP addresses and +TCP/UDP ports, can be easily spoofed, so filtering on those fields +alone might not guarantee the desired results. +.Bl -tag -width indent +.It Ar rule_number +Each rule is associated with a +.Ar rule_number +in the range 1..65535, with the latter reserved for the +.Em default +rule. +Rules are checked sequentially by rule number. +Multiple rules can have the same number, in which case they are +checked (and listed) according to the order in which they have +been added. +If a rule is entered without specifying a number, the kernel will +assign one in such a way that the rule becomes the last one +before the +.Em default +rule. +Automatic rule numbers are assigned by incrementing the last +non-default rule number by the value of the sysctl variable +.Ar net.inet.ip.fw.autoinc_step +which defaults to 100. +If this is not possible (e.g.\& because we would go beyond the +maximum allowed rule number), the number of the last +non-default value is used instead. +.It Cm set Ar set_number +Each rule is associated with a +.Ar set_number +in the range 0..31. +Sets can be individually disabled and enabled, so this parameter +is of fundamental importance for atomic ruleset manipulation. +It can be also used to simplify deletion of groups of rules. +If a rule is entered without specifying a set number, +set 0 will be used. +.br +Set 31 is special in that it cannot be disabled, +and rules in set 31 are not deleted by the +.Nm ipfw flush +command (but you can delete them with the +.Nm ipfw delete set 31 +command). +Set 31 is also used for the +.Em default +rule. +.It Cm prob Ar match_probability +A match is only declared with the specified probability +(floating point number between 0 and 1). +This can be useful for a number of applications such as +random packet drop or +(in conjunction with +.Nm dummynet ) +to simulate the effect of multiple paths leading to out-of-order +packet delivery. 
+.Pp +Note: this condition is checked before any other condition, including +ones such as keep-state or check-state which might have side effects. +.It Cm log Op Cm logamount Ar number +When a packet matches a rule with the +.Cm log +keyword, a message will be +logged to +.Xr syslogd 8 +with a +.Dv LOG_SECURITY +facility. +The logging only occurs if the sysctl variable +.Va net.inet.ip.fw.verbose +is set to 1 +(which is the default when the kernel is compiled with +.Dv IPFIREWALL_VERBOSE ) +and the number of packets logged so far for that +particular rule does not exceed the +.Cm logamount +parameter. +If no +.Cm logamount +is specified, the limit is taken from the sysctl variable +.Va net.inet.ip.fw.verbose_limit . +In both cases, a value of 0 removes the logging limit. +.Pp +Once the limit is reached, logging can be re-enabled by +clearing the logging counter or the packet counter for that entry, see the +.Cm resetlog +command. +.Pp +Note: logging is done after all other packet matching conditions +have been successfully verified, and before performing the final +action (accept, deny, etc.) on the packet. +.It Cm tag Ar number +When a packet matches a rule with the +.Cm tag +keyword, the numeric tag for the given +.Ar number +in the range 1..65534 will be attached to the packet. +The tag acts as an internal marker (it is not sent out over +the wire) that can be used to identify these packets later on. +This can be used, for example, to provide trust between interfaces +and to start doing policy-based filtering. +A packet can have multiple tags at the same time. +Tags are "sticky", meaning once a tag is applied to a packet by a +matching rule it exists until explicit removal. +Tags are kept with the packet everywhere within the kernel, but are +lost when packet leaves the kernel, for example, on transmitting +packet out to the network or sending packet to a +.Xr divert 4 +socket. +.Pp +To check for previously applied tags, use the +.Cm tagged +rule option. 
+To delete a previously applied tag, use the +.Cm untag +keyword. +.Pp +Note: since tags are kept with the packet everywhere in kernelspace, +they can be set and unset anywhere in the kernel network subsystem +(using the +.Xr mbuf_tags 9 +facility), not only by means of the +.Xr ipfw 4 +.Cm tag +and +.Cm untag +keywords. +For example, there can be a specialized +.Xr netgraph 4 +node doing traffic analyzing and tagging for later inspection +in the firewall. +.It Cm untag Ar number +When a packet matches a rule with the +.Cm untag +keyword, the tag with the number +.Ar number +is searched among the tags attached to this packet and, +if found, removed from it. +Other tags bound to the packet, if present, are left untouched. +.It Cm altq Ar queue +When a packet matches a rule with the +.Cm altq +keyword, the ALTQ identifier for the given +.Ar queue +(see +.Xr altq 4 ) +will be attached. +Note that this ALTQ tag is only meaningful for packets going "out" of IPFW, +and not being rejected or going to divert sockets. +Note that if there is insufficient memory at the time the packet is +processed, it will not be tagged, so it is wise to make your ALTQ +"default" queue policy account for this. +If multiple +.Cm altq +rules match a single packet, only the first one adds the ALTQ classification +tag. +In doing so, traffic may be shaped by using +.Cm count Cm altq Ar queue +rules for classification early in the ruleset, then later applying +the filtering decision. +For example, +.Cm check-state +and +.Cm keep-state +rules may come later and provide the actual filtering decisions in +addition to the fallback ALTQ tag. +.Pp +You must run +.Xr pfctl 8 +to set up the queues before IPFW will be able to look them up by name, +and if the ALTQ disciplines are rearranged, the rules containing the +queue identifiers in the kernel will likely have gone stale and need +to be reloaded. +Stale queue identifiers will probably result in misclassification.
+.Pp +All system ALTQ processing can be turned on or off via +.Nm +.Cm enable Ar altq +and +.Nm +.Cm disable Ar altq . +The usage of +.Va net.inet.ip.fw.one_pass +is irrelevant to ALTQ traffic shaping, as the actual rule action is followed +always after adding an ALTQ tag. +.El +.Ss RULE ACTIONS +A rule can be associated with one of the following actions, which +will be executed when the packet matches the body of the rule. +.Bl -tag -width indent +.It Cm allow | accept | pass | permit +Allow packets that match rule. +The search terminates. +.It Cm check-state +Checks the packet against the dynamic ruleset. +If a match is found, execute the action associated with +the rule which generated this dynamic rule, otherwise +move to the next rule. +.br +.Cm Check-state +rules do not have a body. +If no +.Cm check-state +rule is found, the dynamic ruleset is checked at the first +.Cm keep-state +or +.Cm limit +rule. +.It Cm count +Update counters for all packets that match rule. +The search continues with the next rule. +.It Cm deny | drop +Discard packets that match this rule. +The search terminates. +.It Cm divert Ar port +Divert packets that match this rule to the +.Xr divert 4 +socket bound to port +.Ar port . +The search terminates. +.It Cm fwd | forward Ar ipaddr | tablearg Ns Op , Ns Ar port +Change the next-hop on matching packets to +.Ar ipaddr , +which can be an IP address or a host name. +The next hop can also be supplied by the last table +looked up for the packet by using the +.Cm tablearg +keyword instead of an explicit address. +The search terminates if this rule matches. +.Pp +If +.Ar ipaddr +is a local address, then matching packets will be forwarded to +.Ar port +(or the port number in the packet if one is not specified in the rule) +on the local machine. 
+.br +If +.Ar ipaddr +is not a local address, then the port number +(if specified) is ignored, and the packet will be +forwarded to the remote address, using the route as found in +the local routing table for that IP. +.br +A +.Ar fwd +rule will not match layer-2 packets (those received +on ether_input, ether_output, or bridged). +.br +The +.Cm fwd +action does not change the contents of the packet at all. +In particular, the destination address remains unmodified, so +packets forwarded to another system will usually be rejected by that system +unless there is a matching rule on that system to capture them. +For packets forwarded locally, +the local address of the socket will be +set to the original destination address of the packet. +This makes the +.Xr netstat 1 +entry look rather weird but is intended for +use with transparent proxy servers. +.Pp +To enable +.Cm fwd +a custom kernel needs to be compiled with the option +.Cd "options IPFIREWALL_FORWARD" . +.It Cm nat Ar nat_nr +Pass packet to a +nat instance +(for network address translation, address redirect, etc.): +see the +.Sx NETWORK ADDRESS TRANSLATION (NAT) +Section for further information. +.It Cm pipe Ar pipe_nr +Pass packet to a +.Nm dummynet +.Dq pipe +(for bandwidth limitation, delay, etc.). +See the +.Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION +Section for further information. +The search terminates; however, on exit from the pipe and if +the +.Xr sysctl 8 +variable +.Va net.inet.ip.fw.one_pass +is not set, the packet is passed again to the firewall code +starting from the next rule. +.It Cm queue Ar queue_nr +Pass packet to a +.Nm dummynet +.Dq queue +(for bandwidth limitation using WF2Q+). +.It Cm reject +(Deprecated). +Synonym for +.Cm unreach host . +.It Cm reset +Discard packets that match this rule, and if the +packet is a TCP packet, try to send a TCP reset (RST) notice. +The search terminates. 
+.It Cm reset6 +Discard packets that match this rule, and if the +packet is a TCP packet, try to send a TCP reset (RST) notice. +The search terminates. +.It Cm skipto Ar number | tablearg +Skip all subsequent rules numbered less than +.Ar number . +The search continues with the first rule numbered +.Ar number +or higher. +It is possible to use the +.Cm tablearg +keyword with a skipto for a +.Em computed +skipto, but care should be used, as no destination caching +is possible in this case so the rules are always walked to find it, +starting from the +.Cm skipto . +.It Cm tee Ar port +Send a copy of packets matching this rule to the +.Xr divert 4 +socket bound to port +.Ar port . +The search continues with the next rule. +.It Cm unreach Ar code +Discard packets that match this rule, and try to send an ICMP +unreachable notice with code +.Ar code , +where +.Ar code +is a number from 0 to 255, or one of these aliases: +.Cm net , host , protocol , port , +.Cm needfrag , srcfail , net-unknown , host-unknown , +.Cm isolated , net-prohib , host-prohib , tosnet , +.Cm toshost , filter-prohib , host-precedence +or +.Cm precedence-cutoff . +The search terminates. +.It Cm unreach6 Ar code +Discard packets that match this rule, and try to send an ICMPv6 +unreachable notice with code +.Ar code , +where +.Ar code +is a number from 0, 1, 3 or 4, or one of these aliases: +.Cm no-route, admin-prohib, address +or +.Cm port . +The search terminates. +.It Cm netgraph Ar cookie +Divert packet into netgraph with given +.Ar cookie . +The search terminates. +If packet is later returned from netgraph it is either +accepted or continues with the next rule, depending on +.Va net.inet.ip.fw.one_pass +sysctl variable. +.It Cm ngtee Ar cookie +A copy of packet is diverted into netgraph, original +packet continues with the next rule. +See +.Xr ng_ipfw 4 +for more information on +.Cm netgraph +and +.Cm ngtee +actions. 
+.It Cm setfib Ar fibnum +The packet is tagged so as to use the FIB (routing table) +.Ar fibnum +in any subsequent forwarding decisions. +Initially this is limited to the values 0 through 15, see +.Xr setfib 1 . +Processing continues at the next rule. +.It Cm reass +Queue and reassemble ip fragments. +If the packet is not fragmented, counters are updated and processing continues with the next rule. +If the packet is the last logical fragment, the packet is reassembled and, if +.Va net.inet.ip.fw.one_pass +is set to 0, processing continues with the next rule, else packet is allowed to pass and search terminates. +If the packet is a fragment in the middle, it is consumed and processing stops immediately. +.Pp +Fragments handling can be tuned via +.Va net.inet.ip.maxfragpackets +and +.Va net.inet.ip.maxfragsperpacket +which limit, respectively, the maximum number of processable fragments (default: 800) and +the maximum number of fragments per packet (default: 16). +.Pp +NOTA BENE: since fragments do not contain port numbers, they should be avoided with the +.Nm reass +rule. +Alternatively, direction-based (like +.Nm in +/ +.Nm out +) and source-based (like +.Nm via +) match patterns can be used to select fragments. +.Pp +Usually a simple rule like: +.Bd -literal -offset indent +# reassemble incoming fragments +ipfw add reass all from any to any in +.Ed +.Pp +is all you need at the beginning of your ruleset. +.El +.Ss RULE BODY +The body of a rule contains zero or more patterns (such as +specific source and destination addresses or ports, +protocol options, incoming or outgoing interfaces, etc.) +that the packet must match in order to be recognised. +In general, the patterns are connected by (implicit) +.Cm and +operators -- i.e., all must match in order for the +rule to match. 
+Individual patterns can be prefixed by the +.Cm not +operator to reverse the result of the match, as in +.Pp +.Dl "ipfw add 100 allow ip from not 1.2.3.4 to any" +.Pp +Additionally, sets of alternative match patterns +.Pq Em or-blocks +can be constructed by putting the patterns in +lists enclosed between parentheses ( ) or braces { }, and +using the +.Cm or +operator as follows: +.Pp +.Dl "ipfw add 100 allow ip from { x or not y or z } to any" +.Pp +Only one level of parentheses is allowed. +Beware that most shells have special meanings for parentheses +or braces, so it is advisable to put a backslash \\ in front of them +to prevent such interpretations. +.Pp +The body of a rule must in general include a source and destination +address specifier. +The keyword +.Ar any +can be used in various places to specify that the content of +a required field is irrelevant. +.Pp +The rule body has the following format: +.Bd -ragged -offset indent +.Op Ar proto Cm from Ar src Cm to Ar dst +.Op Ar options +.Ed +.Pp +The first part (proto from src to dst) is for backward +compatibility with earlier versions of +.Fx . +In modern +.Fx +any match pattern (including MAC headers, IP protocols, +addresses and ports) can be specified in the +.Ar options +section. +.Pp +Rule fields have the following meaning: +.Bl -tag -width indent +.It Ar proto : protocol | Cm { Ar protocol Cm or ... } +.It Ar protocol : Oo Cm not Oc Ar protocol-name | protocol-number +An IP protocol specified by number or name +(for a complete list see +.Pa /etc/protocols ) , +or one of the following keywords: +.Bl -tag -width indent +.It Cm ip4 | ipv4 +Matches IPv4 packets. +.It Cm ip6 | ipv6 +Matches IPv6 packets. +.It Cm ip | all +Matches any packet. +.El +.Pp +The +.Cm ipv6 +in +.Cm proto +option will be treated as inner protocol. +And, the +.Cm ipv4 +is not available in +.Cm proto +option. +.Pp +The +.Cm { Ar protocol Cm or ... 
} +format (an +.Em or-block ) +is provided for convenience only but its use is deprecated. +.It Ar src No and Ar dst : Bro Cm addr | Cm { Ar addr Cm or ... } Brc Op Oo Cm not Oc Ar ports +An address (or a list, see below) +optionally followed by +.Ar ports +specifiers. +.Pp +The second format +.Em ( or-block +with multiple addresses) is provided for convenience only and +its use is discouraged. +.It Ar addr : Oo Cm not Oc Bro +.Bl -tag -width indent +.Cm any | me | me6 | +.Cm table Ns Pq Ar number Ns Op , Ns Ar value +.Ar | addr-list | addr-set +.Brc +.It Cm any +matches any IP address. +.It Cm me +matches any IP address configured on an interface in the system. +.It Cm me6 +matches any IPv6 address configured on an interface in the system. +The address list is evaluated at the time the packet is +analysed. +.It Cm table Ns Pq Ar number Ns Op , Ns Ar value +Matches any IPv4 address for which an entry exists in the lookup table +.Ar number . +If an optional 32-bit unsigned +.Ar value +is also specified, an entry will match only if it has this value. +See the +.Sx LOOKUP TABLES +section below for more information on lookup tables. +.El +.It Ar addr-list : ip-addr Ns Op Ns , Ns Ar addr-list +.It Ar ip-addr : +A host or subnet address specified in one of the following ways: +.Bl -tag -width indent +.It Ar numeric-ip | hostname +Matches a single IPv4 address, specified as dotted-quad or a hostname. +Hostnames are resolved at the time the rule is added to the firewall list. +.It Ar addr Ns / Ns Ar masklen +Matches all addresses with base +.Ar addr +(specified as an IP address, a network number, or a hostname) +and mask width of +.Cm masklen +bits. +As an example, 1.2.3.4/25 or 1.2.3.0/25 will match +all IP numbers from 1.2.3.0 to 1.2.3.127 . +.It Ar addr Ns : Ns Ar mask +Matches all addresses with base +.Ar addr +(specified as an IP address, a network number, or a hostname) +and the mask of +.Ar mask , +specified as a dotted quad. 
+As an example, 1.2.3.4:255.0.255.0 or 1.0.3.0:255.0.255.0 will match +1.*.3.*. +This form is advised only for non-contiguous +masks. +It is better to resort to the +.Ar addr Ns / Ns Ar masklen +format for contiguous masks, which is more compact and less +error-prone. +.El +.It Ar addr-set : addr Ns Oo Ns / Ns Ar masklen Oc Ns Cm { Ns Ar list Ns Cm } +.It Ar list : Bro Ar num | num-num Brc Ns Op Ns , Ns Ar list +Matches all addresses with base address +.Ar addr +(specified as an IP address, a network number, or a hostname) +and whose last byte is in the list between braces { } . +Note that there must be no spaces between braces and +numbers (spaces after commas are allowed). +Elements of the list can be specified as single entries +or ranges. +The +.Ar masklen +field is used to limit the size of the set of addresses, +and can have any value between 24 and 32. +If not specified, +it will be assumed as 24. +.br +This format is particularly useful to handle sparse address sets +within a single rule. +Because the matching occurs using a +bitmask, it takes constant time and dramatically reduces +the complexity of rulesets. +.br +As an example, an address specified as 1.2.3.4/24{128,35-55,89} +or 1.2.3.0/24{128,35-55,89} +will match the following IP addresses: +.br +1.2.3.128, 1.2.3.35 to 1.2.3.55, 1.2.3.89 . +.It Ar addr6-list : ip6-addr Ns Op Ns , Ns Ar addr6-list +.It Ar ip6-addr : +A host or subnet specified in one of the following ways: +.Pp +.Bl -tag -width indent +.It Ar numeric-ip | hostname +Matches a single IPv6 address as allowed by +.Xr inet_pton 3 +or a hostname. +Hostnames are resolved at the time the rule is added to the firewall +list. +.It Ar addr Ns / Ns Ar masklen +Matches all IPv6 addresses with base +.Ar addr +(specified as allowed by +.Xr inet_pton 3 +or a hostname) +and mask width of +.Cm masklen +bits. +.El +.Pp +No support for sets of IPv6 addresses is provided because IPv6 addresses +are typically random past the initial prefix.
+.It Ar ports : Bro Ar port | port Ns \&- Ns Ar port Ns Brc Ns Op , Ns Ar ports +For protocols which support port numbers (such as TCP and UDP), optional +.Cm ports +may be specified as one or more ports or port ranges, separated +by commas but no spaces, and an optional +.Cm not +operator. +The +.Ql \&- +notation specifies a range of ports (including boundaries). +.Pp +Service names (from +.Pa /etc/services ) +may be used instead of numeric port values. +The length of the port list is limited to 30 ports or ranges, +though one can specify larger ranges by using an +.Em or-block +in the +.Cm options +section of the rule. +.Pp +A backslash +.Pq Ql \e +can be used to escape the dash +.Pq Ql - +character in a service name (from a shell, the backslash must be +typed twice to avoid the shell itself interpreting it as an escape +character). +.Pp +.Dl "ipfw add count tcp from any ftp\e\e-data-ftp to any" +.Pp +Fragmented packets which have a non-zero offset (i.e., not the first +fragment) will never match a rule which has one or more port +specifications. +See the +.Cm frag +option for details on matching fragmented packets. +.El +.Ss RULE OPTIONS (MATCH PATTERNS) +Additional match patterns can be used within +rules. +Zero or more of these so-called +.Em options +can be present in a rule, optionally prefixed by the +.Cm not +operand, and possibly grouped into +.Em or-blocks . +.Pp +The following match patterns can be used (listed in alphabetical order): +.Bl -tag -width indent +.It Cm // this is a comment. +Inserts the specified text as a comment in the rule. +Everything following // is considered as a comment and stored in the rule. +You can have comment-only rules, which are listed as having a +.Cm count +action followed by the comment. +.It Cm bridged +Alias for +.Cm layer2 . +.It Cm diverted +Matches only packets generated by a divert socket. +.It Cm diverted-loopback +Matches only packets coming from a divert socket back into the IP stack +input for delivery. 
+.It Cm diverted-output +Matches only packets going from a divert socket back outward to the IP +stack output for delivery. +.It Cm dst-ip Ar ip-address +Matches IPv4 packets whose destination IP is one of the address(es) +specified as argument. +.It Bro Cm dst-ip6 | dst-ipv6 Brc Ar ip6-address +Matches IPv6 packets whose destination IP is one of the address(es) +specified as argument. +.It Cm dst-port Ar ports +Matches IP packets whose destination port is one of the port(s) +specified as argument. +.It Cm established +Matches TCP packets that have the RST or ACK bits set. +.It Cm ext6hdr Ar header +Matches IPv6 packets containing the extended header given by +.Ar header . +Supported headers are: +.Pp +Fragment +.Pq Cm frag , +Hop-by-hop options +.Pq Cm hopopt , +any type of Routing Header +.Pq Cm route , +Source routing Routing Header Type 0 +.Pq Cm rthdr0 , +Mobile IPv6 Routing Header Type 2 +.Pq Cm rthdr2 , +Destination options +.Pq Cm dstopt , +IPsec authentication headers +.Pq Cm ah , +and IPsec encapsulated security payload headers +.Pq Cm esp . +.It Cm fib Ar fibnum +Matches a packet that has been tagged to use +the given FIB (routing table) number. +.It Cm flow-id Ar labels +Matches IPv6 packets containing any of the flow labels given in +.Ar labels . +.Ar labels +is a comma separated list of numeric flow labels. +.It Cm frag +Matches packets that are fragments and not the first +fragment of an IP datagram. +Note that these packets will not have +the next protocol header (e.g.\& TCP, UDP) so options that look into +these headers cannot match. +.It Cm gid Ar group +Matches all TCP or UDP packets sent by or received for a +.Ar group . +A +.Ar group +may be specified by name or number. +.It Cm jail Ar prisonID +Matches all TCP or UDP packets sent by or received for the +jail whose prison ID is +.Ar prisonID . +.It Cm icmptypes Ar types +Matches ICMP packets whose ICMP type is in the list +.Ar types .
+The list may be specified as any combination of +individual types (numeric) separated by commas. +.Em Ranges are not allowed . +The supported ICMP types are: +.Pp +echo reply +.Pq Cm 0 , +destination unreachable +.Pq Cm 3 , +source quench +.Pq Cm 4 , +redirect +.Pq Cm 5 , +echo request +.Pq Cm 8 , +router advertisement +.Pq Cm 9 , +router solicitation +.Pq Cm 10 , +time-to-live exceeded +.Pq Cm 11 , +IP header bad +.Pq Cm 12 , +timestamp request +.Pq Cm 13 , +timestamp reply +.Pq Cm 14 , +information request +.Pq Cm 15 , +information reply +.Pq Cm 16 , +address mask request +.Pq Cm 17 +and address mask reply +.Pq Cm 18 . +.It Cm icmp6types Ar types +Matches ICMP6 packets whose ICMP6 type is in the list of +.Ar types . +The list may be specified as any combination of +individual types (numeric) separated by commas. +.Em Ranges are not allowed . +.It Cm in | out +Matches incoming or outgoing packets, respectively. +.Cm in +and +.Cm out +are mutually exclusive (in fact, +.Cm out +is implemented as +.Cm not in Ns No ). +.It Cm ipid Ar id-list +Matches IPv4 packets whose +.Cm ip_id +field has value included in +.Ar id-list , +which is either a single value or a list of values or ranges +specified in the same way as +.Ar ports . +.It Cm iplen Ar len-list +Matches IP packets whose total length, including header and data, is +in the set +.Ar len-list , +which is either a single value or a list of values or ranges +specified in the same way as +.Ar ports . +.It Cm ipoptions Ar spec +Matches packets whose IPv4 header contains the comma separated list of +options specified in +.Ar spec . +The supported IP options are: +.Pp +.Cm ssrr +(strict source route), +.Cm lsrr +(loose source route), +.Cm rr +(record packet route) and +.Cm ts +(timestamp). +The absence of a particular option may be denoted +with a +.Ql \&! . +.It Cm ipprecedence Ar precedence +Matches IPv4 packets whose precedence field is equal to +.Ar precedence . 
+.It Cm ipsec +Matches packets that have IPSEC history associated with them +(i.e., the packet comes encapsulated in IPSEC, the kernel +has IPSEC support and IPSEC_FILTERTUNNEL option, and can correctly +decapsulate it). +.Pp +Note that specifying +.Cm ipsec +is different from specifying +.Cm proto Ar ipsec +as the latter will only look at the specific IP protocol field, +irrespective of IPSEC kernel support and the validity of the IPSEC data. +.Pp +Further note that this flag is silently ignored in kernels without +IPSEC support. +It does not affect rule processing when given and the +rules are handled as if with no +.Cm ipsec +flag. +.It Cm iptos Ar spec +Matches IPv4 packets whose +.Cm tos +field contains the comma separated list of +service types specified in +.Ar spec . +The supported IP types of service are: +.Pp +.Cm lowdelay +.Pq Dv IPTOS_LOWDELAY , +.Cm throughput +.Pq Dv IPTOS_THROUGHPUT , +.Cm reliability +.Pq Dv IPTOS_RELIABILITY , +.Cm mincost +.Pq Dv IPTOS_MINCOST , +.Cm congestion +.Pq Dv IPTOS_ECN_CE . +The absence of a particular type may be denoted +with a +.Ql \&! . +.It Cm ipttl Ar ttl-list +Matches IPv4 packets whose time to live is included in +.Ar ttl-list , +which is either a single value or a list of values or ranges +specified in the same way as +.Ar ports . +.It Cm ipversion Ar ver +Matches IP packets whose IP version field is +.Ar ver . +.It Cm keep-state +Upon a match, the firewall will create a dynamic rule, whose +default behaviour is to match bidirectional traffic between +source and destination IP/port using the same protocol. +The rule has a limited lifetime (controlled by a set of +.Xr sysctl 8 +variables), and the lifetime is refreshed every time a matching +packet is found. +.It Cm layer2 +Matches only layer2 packets, i.e., those passed to +.Nm +from ether_demux() and ether_output_frame(). 
+.It Cm limit Bro Cm src-addr | src-port | dst-addr | dst-port Brc Ar N +The firewall will only allow +.Ar N +connections with the same +set of parameters as specified in the rule. +One or more +of source and destination addresses and ports can be +specified. +Currently, +only IPv4 flows are supported. +.It Cm lookup Bro Cm dst-ip | dst-port | src-ip | src-port | uid | jail Brc Ar N +Search an entry in lookup table +.Ar N +that matches the field specified as argument. +If not found, the match fails. +Otherwise, the match succeeds and +.Cm tablearg +is set to the value extracted from the table. +.Pp +This option can be useful to quickly dispatch traffic based on +certain packet fields. +See the +.Sx LOOKUP TABLES +section below for more information on lookup tables. +.It Cm { MAC | mac } Ar dst-mac src-mac +Match packets with a given +.Ar dst-mac +and +.Ar src-mac +addresses, specified as the +.Cm any +keyword (matching any MAC address), or six groups of hex digits +separated by colons, +and optionally followed by a mask indicating the significant bits. +The mask may be specified using either of the following methods: +.Bl -enum -width indent +.It +A slash +.Pq / +followed by the number of significant bits. +For example, an address with 33 significant bits could be specified as: +.Pp +.Dl "MAC 10:20:30:40:50:60/33 any" +.Pp +.It +An ampersand +.Pq & +followed by a bitmask specified as six groups of hex digits separated +by colons. +For example, an address in which the last 16 bits are significant could +be specified as: +.Pp +.Dl "MAC 10:20:30:40:50:60&00:00:00:00:ff:ff any" +.Pp +Note that the ampersand character has a special meaning in many shells +and should generally be escaped. +.Pp +.El +Note that the order of MAC addresses (destination first, +source second) is +the same as on the wire, but the opposite of the one used for +IP addresses. 
+.It Cm mac-type Ar mac-type +Matches packets whose Ethernet Type field +corresponds to one of those specified as argument. +.Ar mac-type +is specified in the same way as +.Cm port numbers +(i.e., one or more comma-separated single values or ranges). +You can use symbolic names for known values such as +.Em vlan , ipv4, ipv6 . +Values can be entered as decimal or hexadecimal (if prefixed by 0x), +and they are always printed as hexadecimal (unless the +.Cm -N +option is used, in which case symbolic resolution will be attempted). +.It Cm proto Ar protocol +Matches packets with the corresponding IP protocol. +.It Cm recv | xmit | via Brq Ar ifX | Ar if Ns Cm * | Ar ipno | Ar any +Matches packets received, transmitted or going through, +respectively, the interface specified by exact name +.Ns No ( Ar ifX Ns No ), +by device name +.Ns No ( Ar if Ns Ar * Ns No ), +by IP address, or through some interface. +.Pp +The +.Cm via +keyword causes the interface to always be checked. +If +.Cm recv +or +.Cm xmit +is used instead of +.Cm via , +then only the receive or transmit interface (respectively) +is checked. +By specifying both, it is possible to match packets based on +both receive and transmit interface, e.g.: +.Pp +.Dl "ipfw add deny ip from any to any out recv ed0 xmit ed1" +.Pp +The +.Cm recv +interface can be tested on either incoming or outgoing packets, +while the +.Cm xmit +interface can only be tested on outgoing packets. +So +.Cm out +is required (and +.Cm in +is invalid) whenever +.Cm xmit +is used. +.Pp +A packet might not have a receive or transmit interface: packets +originating from the local host have no receive interface, +while packets destined for the local host have no transmit +interface. +.It Cm setup +Matches TCP packets that have the SYN bit set but no ACK bit. +This is the short form of +.Dq Li tcpflags\ syn,!ack . +.It Cm src-ip Ar ip-address +Matches IPv4 packets whose source IP is one of the address(es) +specified as an argument. 
+.It Cm src-ip6 Ar ip6-address +Matches IPv6 packets whose source IP is one of the address(es) +specified as an argument. +.It Cm src-port Ar ports +Matches IP packets whose source port is one of the port(s) +specified as argument. +.It Cm tagged Ar tag-list +Matches packets whose tags are included in +.Ar tag-list , +which is either a single value or a list of values or ranges +specified in the same way as +.Ar ports . +Tags can be applied to the packet using +.Cm tag +rule action parameter (see its description for details on tags). +.It Cm tcpack Ar ack +TCP packets only. +Match if the TCP header acknowledgment number field is set to +.Ar ack . +.It Cm tcpdatalen Ar tcpdatalen-list +Matches TCP packets whose length of TCP data is +.Ar tcpdatalen-list , +which is either a single value or a list of values or ranges +specified in the same way as +.Ar ports . +.It Cm tcpflags Ar spec +TCP packets only. +Match if the TCP header contains the comma separated list of +flags specified in +.Ar spec . +The supported TCP flags are: +.Pp +.Cm fin , +.Cm syn , +.Cm rst , +.Cm psh , +.Cm ack +and +.Cm urg . +The absence of a particular flag may be denoted +with a +.Ql \&! . +A rule which contains a +.Cm tcpflags +specification can never match a fragmented packet which has +a non-zero offset. +See the +.Cm frag +option for details on matching fragmented packets. +.It Cm tcpseq Ar seq +TCP packets only. +Match if the TCP header sequence number field is set to +.Ar seq . +.It Cm tcpwin Ar win +TCP packets only. +Match if the TCP header window field is set to +.Ar win . +.It Cm tcpoptions Ar spec +TCP packets only. +Match if the TCP header contains the comma separated list of +options specified in +.Ar spec . +The supported TCP options are: +.Pp +.Cm mss +(maximum segment size), +.Cm window +(tcp window advertisement), +.Cm sack +(selective ack), +.Cm ts +(rfc1323 timestamp) and +.Cm cc +(rfc1644 t/tcp connection count).
+The absence of a particular option may be denoted +with a +.Ql \&! . +.It Cm uid Ar user +Match all TCP or UDP packets sent by or received for a +.Ar user . +A +.Ar user +may be matched by name or identification number. +.It Cm verrevpath +For incoming packets, +a routing table lookup is done on the packet's source address. +If the interface on which the packet entered the system matches the +outgoing interface for the route, +the packet matches. +If the interfaces do not match up, +the packet does not match. +All outgoing packets or packets with no incoming interface match. +.Pp +The name and functionality of the option is intentionally similar to +the Cisco IOS command: +.Pp +.Dl ip verify unicast reverse-path +.Pp +This option can be used to make anti-spoofing rules to reject all +packets with source addresses not from this interface. +See also the option +.Cm antispoof . +.It Cm versrcreach +For incoming packets, +a routing table lookup is done on the packet's source address. +If a route to the source address exists, but not the default route +or a blackhole/reject route, the packet matches. +Otherwise, the packet does not match. +All outgoing packets match. +.Pp +The name and functionality of the option is intentionally similar to +the Cisco IOS command: +.Pp +.Dl ip verify unicast source reachable-via any +.Pp +This option can be used to make anti-spoofing rules to reject all +packets whose source address is unreachable. +.It Cm antispoof +For incoming packets, the packet's source address is checked if it +belongs to a directly connected network. +If the network is directly connected, then the interface the packet +came on in is compared to the interface the network is connected to. +When incoming interface and directly connected interface are not the +same, the packet does not match. +Otherwise, the packet does match. +All outgoing packets match. 
+.Pp +This option can be used to make anti-spoofing rules to reject all +packets that pretend to be from a directly connected network but do +not come in through that interface. +This option is similar to but more restricted than +.Cm verrevpath +because it engages only on packets with source addresses of directly +connected networks instead of all source addresses. +.El +.Sh LOOKUP TABLES +Lookup tables are useful to handle large sparse sets of +addresses or other search keys (e.g. ports, jail IDs). +In the rest of this section we will use the term ``address'' +to mean any unsigned value of up to 32-bit. +There may be up to 128 different lookup tables, numbered 0 to 127. +.Pp +Each entry is represented by an +.Ar addr Ns Op / Ns Ar masklen +and will match all addresses with base +.Ar addr +(specified as an IP address, a hostname or an unsigned integer) +and mask width of +.Ar masklen +bits. +If +.Ar masklen +is not specified, it defaults to 32. +When looking up an IP address in a table, the most specific +entry will match. +Associated with each entry is a 32-bit unsigned +.Ar value , +which can optionally be checked by a rule matching code. +When adding an entry, if +.Ar value +is not specified, it defaults to 0. +.Pp +An entry can be added to a table +.Pq Cm add , +or removed from a table +.Pq Cm delete . +A table can be examined +.Pq Cm list +or flushed +.Pq Cm flush . +.Pp +Internally, each table is stored in a Radix tree, the same way as +the routing table (see +.Xr route 4 ) . +.Pp +Lookup tables currently support only ports, jail IDs and IPv4 addresses. +.Pp +The +.Cm tablearg +feature provides the ability to use a value, looked up in the table, as +the argument for a rule action, action parameter or rule option. +This can significantly reduce number of rules in some configurations. +If two tables are used in a rule, the result of the second (destination) +is used. 
+The +.Cm tablearg +argument can be used with the following actions: +.Cm nat, pipe , queue, divert, tee, netgraph, ngtee, fwd, skipto +action parameters: +.Cm tag, untag, +rule options: +.Cm limit, tagged. +.Pp +When used with +.Cm fwd +it is possible to supply table entries with values +that are in the form of IP addresses or hostnames. +See the +.Sx EXAMPLES +Section for example usage of tables and the tablearg keyword. +.Pp +When used with the +.Cm skipto +action, the user should be aware that the code will walk the ruleset +up to a rule equal to, or past, the given number, and should therefore try to keep the +ruleset compact between the skipto and the target rules. +.Sh SETS OF RULES +Each rule belongs to one of 32 different +.Em sets , +numbered 0 to 31. +Set 31 is reserved for the default rule. +.Pp +By default, rules are put in set 0, unless you use the +.Cm set N +attribute when entering a new rule. +Sets can be individually and atomically enabled or disabled, +so this mechanism permits an easy way to store multiple configurations +of the firewall and quickly (and atomically) switch between them. +The command to enable/disable sets is +.Bd -ragged -offset indent +.Nm +.Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ... +.Ed +.Pp +where multiple +.Cm enable +or +.Cm disable +sections can be specified. +Command execution is atomic on all the sets specified in the command. +By default, all sets are enabled. +.Pp +When you disable a set, its rules behave as if they do not exist +in the firewall configuration, with only one exception: +.Bd -ragged -offset indent +dynamic rules created from a rule before it had been disabled +will still be active until they expire. +In order to delete +dynamic rules you have to explicitly delete the parent rule +which generated them.
+.Ed +.Pp +The set number of rules can be changed with the command +.Bd -ragged -offset indent +.Nm +.Cm set move +.Brq Cm rule Ar rule-number | old-set +.Cm to Ar new-set +.Ed +.Pp +Also, you can atomically swap two rulesets with the command +.Bd -ragged -offset indent +.Nm +.Cm set swap Ar first-set second-set +.Ed +.Pp +See the +.Sx EXAMPLES +Section on some possible uses of sets of rules. +.Sh STATEFUL FIREWALL +Stateful operation is a way for the firewall to dynamically +create rules for specific flows when packets that +match a given pattern are detected. +Support for stateful +operation comes through the +.Cm check-state , keep-state +and +.Cm limit +options of +.Nm rules . +.Pp +Dynamic rules are created when a packet matches a +.Cm keep-state +or +.Cm limit +rule, causing the creation of a +.Em dynamic +rule which will match all and only packets with +a given +.Em protocol +between a +.Em src-ip/src-port dst-ip/dst-port +pair of addresses +.Em ( src +and +.Em dst +are used here only to denote the initial match addresses, but they +are completely equivalent afterwards). +Dynamic rules will be checked at the first +.Cm check-state, keep-state +or +.Cm limit +occurrence, and the action performed upon a match will be the same +as in the parent rule. +.Pp +Note that no additional attributes other than protocol and IP addresses +and ports are checked on dynamic rules. 
+.Pp +The typical use of dynamic rules is to keep a closed firewall configuration, +but let the first TCP SYN packet from the inside network install a +dynamic rule for the flow so that packets belonging to that session +will be allowed through the firewall: +.Pp +.Dl "ipfw add check-state" +.Dl "ipfw add allow tcp from my-subnet to any setup keep-state" +.Dl "ipfw add deny tcp from any to any" +.Pp +A similar approach can be used for UDP, where an UDP packet coming +from the inside will install a dynamic rule to let the response through +the firewall: +.Pp +.Dl "ipfw add check-state" +.Dl "ipfw add allow udp from my-subnet to any keep-state" +.Dl "ipfw add deny udp from any to any" +.Pp +Dynamic rules expire after some time, which depends on the status +of the flow and the setting of some +.Cm sysctl +variables. +See Section +.Sx SYSCTL VARIABLES +for more details. +For TCP sessions, dynamic rules can be instructed to periodically +send keepalive packets to refresh the state of the rule when it is +about to expire. +.Pp +See Section +.Sx EXAMPLES +for more examples on how to use dynamic rules. +.Sh TRAFFIC SHAPER (DUMMYNET) CONFIGURATION +.Nm +is also the user interface for the +.Nm dummynet +traffic shaper, packet scheduler and network emulator, a subsystem that +can artificially queue, delay or drop packets +emulating the behaviour of certain network links +or queueing systems. +.Pp +.Nm dummynet +operates by first using the firewall to select packets +using any match pattern that can be used in +.Nm +rules. +Matching packets are then passed to either of two +different objects, which implement the traffic regulation: +.Bl -hang -offset XXXX +.It Em pipe +A +.Em pipe +emulates a +.Em link +with given bandwidth and propagation delay, +driven by a FIFO scheduler and a single queue with programmable +queue size and packet loss rate. 
+Packets are appended to the queue as they come out from +.Nm ipfw , +and then transferred in FIFO order to the link at the desired rate. +.It Em queue +A +.Em queue +is an abstraction used to implement packet scheduling +using one of several packet scheduling algorithms. +Packets sent to a +.Em queue +are first grouped into flows according to a mask on the 5-tuple. +Flows are then passed to the scheduler associated to the +.Em queue , +and each flow uses scheduling parameters (weight and others) +as configured in the +.Em queue +itself. +A scheduler in turn is connected to an emulated link, +and arbitrates the link's bandwidth among backlogged flows according to +weights and to the features of the scheduling algorithm in use. +.El +.Pp +In practice, +.Em pipes +can be used to set hard limits to the bandwidth that a flow can use, whereas +.Em queues +can be used to determine how different flows share the available bandwidth. +.Pp +A graphical representation of the binding of queues, +flows, schedulers and links is below. +.Bd -literal -offset indent + (flow_mask|sched_mask) sched_mask + +---------+ weight Wx +-------------+ + | |->-[flow]-->--| |-+ + -->--| QUEUE x | ... | | | + | |->-[flow]-->--| SCHEDuler N | | + +---------+ | | | + ... | +--[LINK N]-->-- + +---------+ weight Wy | | +--[LINK N]-->-- + | |->-[flow]-->--| | | + -->--| QUEUE y | ... | | | + | |->-[flow]-->--| | | + +---------+ +-------------+ | + +-------------+ +.Ed +It is important to understand the role of the SCHED_MASK +and FLOW_MASK, which are configured through the commands +.Dl "ipfw sched N config mask SCHED_MASK ..." +and +.Dl "ipfw queue X config mask FLOW_MASK ..." . +.Pp +The SCHED_MASK is used to assign flows to one or more +scheduler instances, one for each +value of the packet's 5-tuple after applying SCHED_MASK. +As an example, using ``src-ip 0xffffff00'' creates one instance +for each /24 source subnet.
+.Pp +The FLOW_MASK, together with the SCHED_MASK, is used to split +packets into flows. As an example, using +``src-ip 0x000000ff'' +together with the previous SCHED_MASK makes a flow for +each individual source address. In turn, flows for each /24 +subnet will be sent to the same scheduler instance. +.Pp +The above diagram holds even for the +.Em pipe +case, with the only restriction that a +.Em pipe +only supports a SCHED_MASK, and forces the use of a FIFO +scheduler (these are for backward compatibility reasons; +in fact, internally, a +.Nm dummynet's +pipe is implemented exactly as above). +.Pp +There are two modes of +.Nm dummynet +operation: +.Dq normal +and +.Dq fast . +The +.Dq normal +mode tries to emulate a real link: the +.Nm dummynet +scheduler ensures that the packet will not leave the pipe faster than it +would on the real link with a given bandwidth. +The +.Dq fast +mode allows certain packets to bypass the +.Nm dummynet +scheduler (if packet flow does not exceed pipe's bandwidth). +This is the reason why the +.Dq fast +mode requires less CPU cycles per packet (on average) and packet latency +can be significantly lower in comparison to a real link with the same +bandwidth. +The default mode is +.Dq normal . +The +.Dq fast +mode can be enabled by setting the +.Va net.inet.ip.dummynet.io_fast +.Xr sysctl 8 +variable to a non-zero value. +.Pp +.Ss PIPE, QUEUE AND SCHEDULER CONFIGURATION +The +.Em pipe , +.Em queue +and +.Em scheduler +configuration commands are the following: +.Bd -ragged -offset indent +.Cm pipe Ar number Cm config Ar pipe-configuration +.Pp +.Cm queue Ar number Cm config Ar queue-configuration +.Pp +.Cm sched Ar number Cm config Ar sched-configuration +.Ed +.Pp +The following parameters can be configured for a pipe: +.Pp +.Bl -tag -width indent -compact +.It Cm bw Ar bandwidth | device +Bandwidth, measured in +.Sm off +.Op Cm K | M +.Brq Cm bit/s | Byte/s . +.Sm on +.Pp +A value of 0 (default) means unlimited bandwidth. 
+The unit must immediately follow the number, as in +.Pp +.Dl "ipfw pipe 1 config bw 300Kbit/s" +.Pp +If a device name is specified instead of a numeric value, as in +.Pp +.Dl "ipfw pipe 1 config bw tun0" +.Pp +then the transmit clock is supplied by the specified device. +At the moment only the +.Xr tun 4 +device supports this +functionality, for use in conjunction with +.Xr ppp 8 . +.Pp +.It Cm delay Ar ms-delay +Propagation delay, measured in milliseconds. +The value is rounded to the next multiple of the clock tick +(typically 10ms, but it is a good practice to run kernels +with +.Dq "options HZ=1000" +to reduce +the granularity to 1ms or less). +The default value is 0, meaning no delay. +.Pp +.It Cm burst Ar size +If the data to be sent exceeds the pipe's bandwidth limit +(and the pipe was previously idle), up to +.Ar size +bytes of data are allowed to bypass the +.Nm dummynet +scheduler, and will be sent as fast as the physical link allows. +Any additional data will be transmitted at the rate specified +by the +.Nm pipe +bandwidth. +The burst size depends on how long the pipe has been idle; +the effective burst size is calculated as follows: +MAX( +.Ar size +, +.Nm bw +* pipe_idle_time). +.Pp +.It Cm profile Ar filename +A file specifying the additional overhead incurred in the transmission +of a packet on the link. +.Pp +Some link types introduce extra delays in the transmission +of a packet, e.g. because of MAC level framing, contention on +the use of the channel, MAC level retransmissions and so on. +From our point of view, the channel is effectively unavailable +for this extra time, which is constant or variable depending +on the link type. Additionally, packets may be dropped after this +time (e.g. on a wireless link after too many retransmissions). +We can model the additional delay with an empirical curve +that represents its distribution. 
+.Bd -literal -offset indent + cumulative probability + 1.0 ^ + | + L +-- loss-level x + | ****** + | * + | ***** + | * + | ** + | * + +-------*-------------------> + delay +.Ed +The empirical curve may have both vertical and horizontal lines. +Vertical lines represent constant delay for a range of +probabilities. +Horizontal lines correspond to a discontinuity in the delay +distribution: the pipe will use the largest delay for a +given probability. +.Pp +The file format is the following, with whitespace acting as +a separator and '#' indicating the beginning a comment: +.Bl -tag -width indent +.It Cm name Ar identifier +optional name (listed by "ipfw pipe show") +to identify the delay distribution; +.It Cm bw Ar value +the bandwidth used for the pipe. +If not specified here, it must be present +explicitly as a configuration parameter for the pipe; +.It Cm loss-level Ar L +the probability above which packets are lost. +(0.0 <= L <= 1.0, default 1.0 i.e. no loss); +.It Cm samples Ar N +the number of samples used in the internal +representation of the curve (2..1024; default 100); +.It Cm "delay prob" | "prob delay" +One of these two lines is mandatory and defines +the format of the following lines with data points. +.It Ar XXX Ar YYY +2 or more lines representing points in the curve, +with either delay or probability first, according +to the chosen format. +The unit for delay is milliseconds. +Data points do not need to be sorted. +Also, the number of actual lines can be different +from the value of the "samples" parameter: +.Nm +utility will sort and interpolate +the curve as needed. 
+.El +.Pp +Example of a profile file: +.Bd -literal -offset indent +name bla_bla_bla +samples 100 +loss-level 0.86 +prob delay +0 200 # minimum overhead is 200ms +0.5 200 +0.5 300 +0.8 1000 +0.9 1300 +1 1300 +#configuration file end +.Ed +.El +.Pp +The following parameters can be configured for a queue: +.Pp +.Bl -tag -width indent -compact +.It Cm pipe Ar pipe_nr +Connects a queue to the specified pipe. +Multiple queues (with the same or different weights) can be connected to +the same pipe, which specifies the aggregate rate for the set of queues. +.Pp +.It Cm weight Ar weight +Specifies the weight to be used for flows matching this queue. +The weight must be in the range 1..100, and defaults to 1. +.El +.Pp +The following parameters can be configured for a scheduler: +.Pp +.Bl -tag -width indent -compact +.It Cm type Ar {fifo | wf2qp | rr | qfq} +specifies the scheduling algorithm to use. +.Bl -tag -width indent -compact +.It Cm fifo +is just a FIFO scheduler (which means that all packets +are stored in the same queue as they arrive to the scheduler). +FIFO has O(1) per-packet time complexity, with very low +constants (estimate 60-80ns on a 2Ghz desktop machine) +but gives no service guarantees. +.It Cm wf2qp +implements the WF2Q+ algorithm, which is a Weighted Fair Queueing +algorithm which permits flows to share bandwidth according to +their weights. Note that weights are not priorities; even a flow +with a minuscule weight will never starve. +WF2Q+ has O(log N) per-packet processing cost, where N is the number +of flows, and is the default algorithm used by previous versions of +dummynet's queues. +.It Cm rr +implements the Deficit Round Robin algorithm, which has O(1) processing +costs (roughly, 100-150ns per packet) +and permits bandwidth allocation according to weights, but +with poor service guarantees.
+.It Cm qfq +implements the QFQ algorithm, which is a very fast variant of +WF2Q+, with similar service guarantees and O(1) processing +costs (roughly, 200-250ns per packet). +.El +.El +.Pp +In addition to the type, all parameters allowed for a pipe can also +be specified for a scheduler. +.Pp +Finally, the following parameters can be configured for both +pipes and queues: +.Pp +.Bl -tag -width XXXX -compact +.Pp +.It Cm buckets Ar hash-table-size +Specifies the size of the hash table used for storing the +various queues. +Default value is 64 controlled by the +.Xr sysctl 8 +variable +.Va net.inet.ip.dummynet.hash_size , +allowed range is 16 to 65536. +.Pp +.It Cm mask Ar mask-specifier +Packets sent to a given pipe or queue by an +.Nm +rule can be further classified into multiple flows, each of which is then +sent to a different +.Em dynamic +pipe or queue. +A flow identifier is constructed by masking the IP addresses, +ports and protocol types as specified with the +.Cm mask +options in the configuration of the pipe or queue. +For each different flow identifier, a new pipe or queue is created +with the same parameters as the original object, and matching packets +are sent to it. +.Pp +Thus, when +.Em dynamic pipes +are used, each flow will get the same bandwidth as defined by the pipe, +whereas when +.Em dynamic queues +are used, each flow will share the parent's pipe bandwidth evenly +with other flows generated by the same queue (note that other queues +with different weights might be connected to the same pipe). +.br +Available mask specifiers are a combination of one or more of the following: +.Pp +.Cm dst-ip Ar mask , +.Cm dst-ip6 Ar mask , +.Cm src-ip Ar mask , +.Cm src-ip6 Ar mask , +.Cm dst-port Ar mask , +.Cm src-port Ar mask , +.Cm flow-id Ar mask , +.Cm proto Ar mask +or +.Cm all , +.Pp +where the latter means all bits in all fields are significant. 
+.Pp +.It Cm noerror +When a packet is dropped by a +.Nm dummynet +queue or pipe, the error +is normally reported to the caller routine in the kernel, in the +same way as it happens when a device queue fills up. +Setting this +option reports the packet as successfully delivered, which can be +needed for some experimental setups where you want to simulate +loss or congestion at a remote router. +.Pp +.It Cm plr Ar packet-loss-rate +Packet loss rate. +Argument +.Ar packet-loss-rate +is a floating-point number between 0 and 1, with 0 meaning no +loss, 1 meaning 100% loss. +The loss rate is internally represented on 31 bits. +.Pp +.It Cm queue Brq Ar slots | size Ns Cm Kbytes +Queue size, in +.Ar slots +or +.Cm KBytes . +Default value is 50 slots, which +is the typical queue size for Ethernet devices. +Note that for slow speed links you should keep the queue +size short or your traffic might be affected by a significant +queueing delay. +E.g., 50 max-sized ethernet packets (1500 bytes) mean 600Kbit +or 20s of queue on a 30Kbit/s pipe. +Even worse effects can result if you get packets from an +interface with a much larger MTU, e.g.\& the loopback interface +with its 16KB packets. +The +.Xr sysctl 8 +variables +.Em net.inet.ip.dummynet.pipe_byte_limit +and +.Em net.inet.ip.dummynet.pipe_slot_limit +control the maximum lengths that can be specified. +.Pp +.It Cm red | gred Ar w_q Ns / Ns Ar min_th Ns / Ns Ar max_th Ns / Ns Ar max_p +Make use of the RED (Random Early Detection) queue management algorithm. +.Ar w_q +and +.Ar max_p +are floating +point numbers between 0 and 1 (0 not included), while +.Ar min_th +and +.Ar max_th +are integer numbers specifying thresholds for queue management +(thresholds are computed in bytes if the queue has been defined +in bytes, in slots otherwise). +The +.Nm dummynet +also supports the gentle RED variant (gred). 
+Three +.Xr sysctl 8 +variables can be used to control the RED behaviour: +.Bl -tag -width indent +.It Va net.inet.ip.dummynet.red_lookup_depth +specifies the accuracy in computing the average queue +when the link is idle (defaults to 256, must be greater than zero) +.It Va net.inet.ip.dummynet.red_avg_pkt_size +specifies the expected average packet size (defaults to 512, must be +greater than zero) +.It Va net.inet.ip.dummynet.red_max_pkt_size +specifies the expected maximum packet size, only used when queue +thresholds are in bytes (defaults to 1500, must be greater than zero). +.El +.El +.Pp +When used with IPv6 data, +.Nm dummynet +currently has several limitations. +Information necessary to route link-local packets to an +interface is not available after processing by +.Nm dummynet +so those packets are dropped in the output path. +Care should be taken to ensure that link-local packets are not passed to +.Nm dummynet . +.Sh CHECKLIST +Here are some important points to consider when designing your +rules: +.Bl -bullet +.It +Remember that you filter both packets going +.Cm in +and +.Cm out . +Most connections need packets going in both directions. +.It +Remember to test very carefully. +It is a good idea to be near the console when doing this. +If you cannot be near the console, +use an auto-recovery script such as the one in +.Pa /usr/share/examples/ipfw/change_rules.sh . +.It +Do not forget the loopback interface. +.El +.Sh FINE POINTS +.Bl -bullet +.It +There are circumstances where fragmented datagrams are unconditionally +dropped. +TCP packets are dropped if they do not contain at least 20 bytes of +TCP header, UDP packets are dropped if they do not contain a full 8 +byte UDP header, and ICMP packets are dropped if they do not contain +4 bytes of ICMP header, enough to specify the ICMP type, code, and +checksum. +These packets are simply logged as +.Dq pullup failed +since there may not be enough good data in the packet to produce a +meaningful log entry. 
+.It +Another type of packet is unconditionally dropped, a TCP packet with a +fragment offset of one. +This is a valid packet, but it only has one use, to try +to circumvent firewalls. +When logging is enabled, these packets are +reported as being dropped by rule -1. +.It +If you are logged in over a network, loading the +.Xr kld 4 +version of +.Nm +is probably not as straightforward as you would think. +The following command line is recommended: +.Bd -literal -offset indent +kldload ipfw && \e +ipfw add 32000 allow ip from any to any +.Ed +.Pp +Along the same lines, doing an +.Bd -literal -offset indent +ipfw flush +.Ed +.Pp +in similar surroundings is also a bad idea. +.It +The +.Nm +filter list may not be modified if the system security level +is set to 3 or higher +(see +.Xr init 8 +for information on system security levels). +.El +.Sh PACKET DIVERSION +A +.Xr divert 4 +socket bound to the specified port will receive all packets +diverted to that port. +If no socket is bound to the destination port, or if the divert module is +not loaded, or if the kernel was not compiled with divert socket support, +the packets are dropped. +.Sh NETWORK ADDRESS TRANSLATION (NAT) +.Pp +.Nm +supports in-kernel NAT using the kernel version of +.Xr libalias 3 . +.Pp +The nat configuration command is the following: +.Bd -ragged -offset indent +.Bk -words +.Cm nat +.Ar nat_number +.Cm config +.Ar nat-configuration +.Ek +.Ed +.Pp +The following parameters can be configured: +.Bl -tag -width indent +.It Cm ip Ar ip_address +Define an ip address to use for aliasing. +.It Cm if Ar nic +Use ip address of NIC for aliasing, dynamically changing +it if NIC's ip address changes. +.It Cm log +Enable logging on this nat instance. +.It Cm deny_in +Deny any incoming connection from outside world. +.It Cm same_ports +Try to leave the alias port numbers unchanged from +the actual local port numbers.
+.It Cm unreg_only +Traffic on the local network not originating from an +unregistered address space will be ignored. +.It Cm reset +Reset table of the packet aliasing engine on address change. +.It Cm reverse +Reverse the way libalias handles aliasing. +.It Cm proxy_only +Obey transparent proxy rules only, packet aliasing is not performed. +.El +.Pp +To let the packet continue after being (de)aliased, set the sysctl variable +.Va net.inet.ip.fw.one_pass +to 0. +For more information about aliasing modes, refer to +.Xr libalias 3 . +See Section +.Sx EXAMPLES +for some examples about nat usage. +.Ss REDIRECT AND LSNAT SUPPORT IN IPFW +Redirect and LSNAT support follow closely the syntax used in +.Xr natd 8 . +See Section +.Sx EXAMPLES +for some examples on how to do redirect and lsnat. +.Ss SCTP NAT SUPPORT +SCTP nat can be configured in a similar manner to TCP through the +.Nm +command line tool. +The main difference is that +.Nm sctp nat +does not do port translation. +Since the local and global side ports will be the same, +there is no need to specify both. +Ports are redirected as follows: +.Bd -ragged -offset indent +.Bk -words +.Cm nat +.Ar nat_number +.Cm config if +.Ar nic +.Cm redirect_port sctp +.Ar ip_address [,addr_list] {[port | port-port] [,ports]} +.Ek +.Ed +.Pp +Most +.Nm sctp nat +configuration can be done in real-time through the +.Xr sysctl 8 +interface. +All may be changed dynamically, though the hash_table size will only +change for new +.Nm nat +instances. +See +.Sx SYSCTL VARIABLES +for more info. +.Sh SYSCTL VARIABLES +A set of +.Xr sysctl 8 +variables controls the behaviour of the firewall and +associated modules +.Pq Nm dummynet , bridge , sctp nat .
+These are shown below together with their default value +(but always check with the +.Xr sysctl 8 +command what value is actually in use) and meaning: +.Bl -tag -width indent +.It Va net.inet.ip.alias.sctp.accept_global_ootb_addip: No 0 +Defines how the +.Nm nat +responds to receipt of global OOTB ASCONF-AddIP: +.Bl -tag -width indent +.It Cm 0 +No response (unless a partially matching association exists - +ports and vtags match but global address does not) +.It Cm 1 +.Nm nat +will accept and process all OOTB global AddIP messages. +.El +.Pp +Option 1 should never be selected as this forms a security risk. +An attacker can +establish multiple fake associations by sending AddIP messages. +.It Va net.inet.ip.alias.sctp.chunk_proc_limit: No 5 +Defines the maximum number of chunks in an SCTP packet that will be parsed for a +packet that matches an existing association. +This value is enforced to be greater than or equal to +.Cm net.inet.ip.alias.sctp.initialising_chunk_proc_limit . +A high value is +a DoS risk yet setting too low a value may result in important control chunks in +the packet not being located and parsed. +.It Va net.inet.ip.alias.sctp.error_on_ootb: No 1 +Defines when the +.Nm nat +responds to any Out-of-the-Blue (OOTB) packets with ErrorM packets. +An OOTB packet is a packet that arrives with no existing association +registered in the +.Nm nat +and is not an INIT or ASCONF-AddIP packet: +.Bl -tag -width indent +.It Cm 0 +ErrorM is never sent in response to OOTB packets. +.It Cm 1 +ErrorM is only sent to OOTB packets received on the local side. +.It Cm 2 +ErrorM is sent to the local side and on the global side ONLY if there is a +partial match (ports and vtags match but the source global IP does not). +This value is only useful if the +.Nm nat +is tracking global IP addresses. +.It Cm 3 +ErrorM is sent in response to all OOTB packets on both the local and global side +(DoS risk).
+.El +.Pp +At the moment the default is 0, since the ErrorM packet is not yet +supported by most SCTP stacks. +When it is supported, and if not tracking +global addresses, we recommend setting this value to 1 to allow +multi-homed local hosts to function with the +.Nm nat . +To track global addresses, we recommend setting this value to 2 to +allow global hosts to be informed when they need to (re)send an +ASCONF-AddIP. +Value 3 should never be chosen (except for debugging) as the +.Nm nat +will respond to all OOTB global packets (a DoS risk). +.It Va net.inet.ip.alias.sctp.hashtable_size: No 2003 +Size of hash tables used for +.Nm nat +lookups (100 < prime_number < 1000001). +This value sets the +.Nm hash table +size for any future created +.Nm nat +instance and therefore must be set prior to creating a +.Nm nat +instance. +The table sizes may be changed to suit specific needs. +If there will be few +concurrent associations, and memory is scarce, you may make these smaller. +If there will be many thousands (or millions) of concurrent associations, you +should make these larger. +A prime number is best for the table size. +The sysctl +update function will adjust your input value to the next highest prime number. +.It Va net.inet.ip.alias.sctp.holddown_time: No 0 +Hold association in table for this many seconds after receiving a +SHUTDOWN-COMPLETE. +This allows endpoints to correct shutdown gracefully if a +shutdown_complete is lost and retransmissions are required. +.It Va net.inet.ip.alias.sctp.init_timer: No 15 +Timeout value while waiting for (INIT-ACK|AddIP-ACK). +This value cannot be 0. +.It Va net.inet.ip.alias.sctp.initialising_chunk_proc_limit: No 2 +Defines the maximum number of chunks in an SCTP packet that will be parsed when +no existing association exists that matches that packet. +Ideally this packet +will only be an INIT or ASCONF-AddIP packet. +A higher value may become a DoS +risk as malformed packets can consume processing resources.
+.It Va net.inet.ip.alias.sctp.param_proc_limit: No 25 +Defines the maximum number of parameters within a chunk that will be parsed in a +packet. +As for other similar sysctl variables, larger values pose a DoS risk. +.It Va net.inet.ip.alias.sctp.log_level: No 0 +Level of detail in the system log messages (0 \- minimal, 1 \- event, +2 \- info, 3 \- detail, 4 \- debug, 5 \- max debug). May be a good +option in high loss environments. +.It Va net.inet.ip.alias.sctp.shutdown_time: No 15 +Timeout value while waiting for SHUTDOWN-COMPLETE. +This value cannot be 0. +.It Va net.inet.ip.alias.sctp.track_global_addresses: No 0 +Enables/disables global IP address tracking within the +.Nm nat +and places an +upper limit on the number of addresses tracked for each association: +.Bl -tag -width indent +.It Cm 0 +Global tracking is disabled +.It Cm >1 +Enables tracking, the maximum number of addresses tracked for each +association is limited to this value +.El +.Pp +This variable is fully dynamic, the new value will be adopted for all newly +arriving associations, existing associations are treated as they were previously. +Global tracking will decrease the number of collisions within the +.Nm nat +at a cost +of increased processing load, memory usage, complexity, and possible +.Nm nat +state +problems in complex networks with multiple +.Nm nats . +We recommend not tracking +global IP addresses, this will still result in a fully functional +.Nm nat . +.It Va net.inet.ip.alias.sctp.up_timer: No 300 +Timeout value to keep an association up with no traffic. +This value cannot be 0. +.It Va net.inet.ip.dummynet.expire : No 1 +Lazily delete dynamic pipes/queues once they have no pending traffic. +You can disable this by setting the variable to 0, in which case +the pipes/queues will only be deleted when the threshold is reached. +.It Va net.inet.ip.dummynet.hash_size : No 64 +Default size of the hash table used for dynamic pipes/queues.
+This value is used when no +.Cm buckets +option is specified when configuring a pipe/queue. +.It Va net.inet.ip.dummynet.io_fast : No 0 +If set to a non-zero value, +the +.Dq fast +mode of +.Nm dummynet +operation (see above) is enabled. +.It Va net.inet.ip.dummynet.io_pkt +Number of packets passed to +.Nm dummynet . +.It Va net.inet.ip.dummynet.io_pkt_drop +Number of packets dropped by +.Nm dummynet . +.It Va net.inet.ip.dummynet.io_pkt_fast +Number of packets bypassed by the +.Nm dummynet +scheduler. +.It Va net.inet.ip.dummynet.max_chain_len : No 16 +Target value for the maximum number of pipes/queues in a hash bucket. +The product +.Cm max_chain_len*hash_size +is used to determine the threshold over which empty pipes/queues +will be expired even when +.Cm net.inet.ip.dummynet.expire=0 . +.It Va net.inet.ip.dummynet.red_lookup_depth : No 256 +.It Va net.inet.ip.dummynet.red_avg_pkt_size : No 512 +.It Va net.inet.ip.dummynet.red_max_pkt_size : No 1500 +Parameters used in the computations of the drop probability +for the RED algorithm. +.It Va net.inet.ip.dummynet.pipe_byte_limit : No 1048576 +.It Va net.inet.ip.dummynet.pipe_slot_limit : No 100 +The maximum queue size that can be specified in bytes or packets. +These limits prevent accidental exhaustion of resources such as mbufs. +If you raise these limits, +you should make sure the system is configured so that sufficient resources +are available. +.It Va net.inet.ip.fw.autoinc_step : No 100 +Delta between rule numbers when auto-generating them. +The value must be in the range 1..1000. +.It Va net.inet.ip.fw.curr_dyn_buckets : Va net.inet.ip.fw.dyn_buckets +The current number of buckets in the hash table for dynamic rules +(readonly). +.It Va net.inet.ip.fw.debug : No 1 +Controls debugging messages produced by +.Nm . +.It Va net.inet.ip.fw.default_rule : No 65535 +The default rule number (read-only). 
+By the design of +.Nm , the default rule is the last one, so its number +can also serve as the highest number allowed for a rule. +.It Va net.inet.ip.fw.dyn_buckets : No 256 +The number of buckets in the hash table for dynamic rules. +Must be a power of 2, up to 65536. +It only takes effect when all dynamic rules have expired, so you +are advised to use a +.Cm flush +command to make sure that the hash table is resized. +.It Va net.inet.ip.fw.dyn_count : No 3 +Current number of dynamic rules +(read-only). +.It Va net.inet.ip.fw.dyn_keepalive : No 1 +Enables generation of keepalive packets for +.Cm keep-state +rules on TCP sessions. +A keepalive is generated to both +sides of the connection every 5 seconds for the last 20 +seconds of the lifetime of the rule. +.It Va net.inet.ip.fw.dyn_max : No 8192 +Maximum number of dynamic rules. +When you hit this limit, no more dynamic rules can be +installed until old ones expire. +.It Va net.inet.ip.fw.dyn_ack_lifetime : No 300 +.It Va net.inet.ip.fw.dyn_syn_lifetime : No 20 +.It Va net.inet.ip.fw.dyn_fin_lifetime : No 1 +.It Va net.inet.ip.fw.dyn_rst_lifetime : No 1 +.It Va net.inet.ip.fw.dyn_udp_lifetime : No 5 +.It Va net.inet.ip.fw.dyn_short_lifetime : No 30 +These variables control the lifetime, in seconds, of dynamic +rules. +Upon the initial SYN exchange the lifetime is kept short, +then increased after both SYN have been seen, then decreased +again during the final FIN exchange or when a RST is received. +Both +.Em dyn_fin_lifetime +and +.Em dyn_rst_lifetime +must be strictly lower than 5 seconds, the period of +repetition of keepalives. +The firewall enforces that. +.It Va net.inet.ip.fw.enable : No 1 +Enables the firewall. +Setting this variable to 0 lets you run your machine without +firewall even if compiled in. +.It Va net.inet6.ip6.fw.enable : No 1 +provides the same functionality as above for the IPv6 case. 
+.It Va net.inet.ip.fw.one_pass : No 1 +When set, the packet exiting from the +.Nm dummynet +pipe or from +.Xr ng_ipfw 4 +node is not passed through the firewall again. +Otherwise, after an action, the packet is +reinjected into the firewall at the next rule. +.It Va net.inet.ip.fw.tables_max : No 128 +Maximum number of tables (read-only). +.It Va net.inet.ip.fw.verbose : No 1 +Enables verbose messages. +.It Va net.inet.ip.fw.verbose_limit : No 0 +Limits the number of messages produced by a verbose firewall. +.It Va net.inet6.ip6.fw.deny_unknown_exthdrs : No 1 +If enabled, packets with unknown IPv6 Extension Headers will be denied. +.It Va net.link.ether.ipfw : No 0 +Controls whether layer-2 packets are passed to +.Nm . +Default is no. +.It Va net.link.bridge.ipfw : No 0 +Controls whether bridged packets are passed to +.Nm . +Default is no. +.El +.Pp +.Sh EXAMPLES +There are far too many possible uses of +.Nm +so this Section will only give a small set of examples. +.Pp +.Ss BASIC PACKET FILTERING +This command adds an entry which denies all tcp packets from +.Em cracker.evil.org +to the telnet port of +.Em wolf.tambov.su +from being forwarded by the host: +.Pp +.Dl "ipfw add deny tcp from cracker.evil.org to wolf.tambov.su telnet" +.Pp +This one disallows any connection from the entire cracker's +network to my host: +.Pp +.Dl "ipfw add deny ip from 123.45.67.0/24 to my.host.org" +.Pp +A first and efficient way to limit access (not using dynamic rules) +is the use of the following rules: +.Pp +.Dl "ipfw add allow tcp from any to any established" +.Dl "ipfw add allow tcp from net1 portlist1 to net2 portlist2 setup" +.Dl "ipfw add allow tcp from net3 portlist3 to net3 portlist3 setup" +.Dl "..." +.Dl "ipfw add deny tcp from any to any" +.Pp +The first rule will be a quick match for normal TCP packets, +but it will not match the initial SYN packet, which will be +matched by the +.Cm setup +rules only for selected source/destination pairs.
+All other SYN packets will be rejected by the final +.Cm deny +rule. +.Pp +If you administer one or more subnets, you can take advantage +of the address sets and or-blocks and write extremely +compact rulesets which selectively enable services to blocks +of clients, as below: +.Pp +.Dl "goodguys=\*q{ 10.1.2.0/24{20,35,66,18} or 10.2.3.0/28{6,3,11} }\*q" +.Dl "badguys=\*q10.1.2.0/24{8,38,60}\*q" +.Dl "" +.Dl "ipfw add allow ip from ${goodguys} to any" +.Dl "ipfw add deny ip from ${badguys} to any" +.Dl "... normal policies ..." +.Pp +The +.Cm verrevpath +option could be used to do automated anti-spoofing by adding the +following to the top of a ruleset: +.Pp +.Dl "ipfw add deny ip from any to any not verrevpath in" +.Pp +This rule drops all incoming packets that appear to be coming to the +system on the wrong interface. +For example, a packet with a source +address belonging to a host on a protected internal network would be +dropped if it tried to enter the system from an external interface. +.Pp +The +.Cm antispoof +option could be used to do similar but more restricted anti-spoofing +by adding the following to the top of a ruleset: +.Pp +.Dl "ipfw add deny ip from any to any not antispoof in" +.Pp +This rule drops all incoming packets that appear to be coming from another +directly connected system but on the wrong interface. +For example, a packet with a source address of +.Li 192.168.0.0/24 , +configured on +.Li fxp0 , +but coming in on +.Li fxp1 +would be dropped. +.Ss DYNAMIC RULES +In order to protect a site from flood attacks involving fake +TCP packets, it is safer to use dynamic rules: +.Pp +.Dl "ipfw add check-state" +.Dl "ipfw add deny tcp from any to any established" +.Dl "ipfw add allow tcp from my-net to any setup keep-state" +.Pp +This will let the firewall install dynamic rules only for +those connections which start with a regular SYN packet coming +from the inside of our network.
+Dynamic rules are checked when encountering the first +.Cm check-state +or +.Cm keep-state +rule. +A +.Cm check-state +rule should usually be placed near the beginning of the +ruleset to minimize the amount of work scanning the ruleset. +Your mileage may vary. +.Pp +To limit the number of connections a user can open +you can use the following type of rules: +.Pp +.Dl "ipfw add allow tcp from my-net/24 to any setup limit src-addr 10" +.Dl "ipfw add allow tcp from any to me setup limit src-addr 4" +.Pp +The former (assuming it runs on a gateway) will allow each host +on a /24 network to open at most 10 TCP connections. +The latter can be placed on a server to make sure that a single +client does not use more than 4 simultaneous connections. +.Pp +.Em BEWARE : +stateful rules can be subject to denial-of-service attacks +by a SYN-flood which opens a huge number of dynamic rules. +The effects of such attacks can be partially limited by +acting on a set of +.Xr sysctl 8 +variables which control the operation of the firewall. +.Pp +Here is a good usage of the +.Cm list +command to see accounting records and timestamp information: +.Pp +.Dl ipfw -at list +.Pp +or in short form without timestamps: +.Pp +.Dl ipfw -a list +.Pp +which is equivalent to: +.Pp +.Dl ipfw show +.Pp +Next rule diverts all incoming packets from 192.168.2.0/24 +to divert port 5000: +.Pp +.Dl ipfw divert 5000 ip from 192.168.2.0/24 to any in +.Pp +.Ss TRAFFIC SHAPING +The following rules show some of the applications of +.Nm +and +.Nm dummynet +for simulations and the like. 
+.Pp +This rule drops random incoming packets with a probability +of 5%: +.Pp +.Dl "ipfw add prob 0.05 deny ip from any to any in" +.Pp +A similar effect can be achieved making use of +.Nm dummynet +pipes: +.Pp +.Dl "ipfw add pipe 10 ip from any to any" +.Dl "ipfw pipe 10 config plr 0.05" +.Pp +We can use pipes to artificially limit bandwidth, e.g.\& on a +machine acting as a router, if we want to limit traffic from +local clients on 192.168.2.0/24 we do: +.Pp +.Dl "ipfw add pipe 1 ip from 192.168.2.0/24 to any out" +.Dl "ipfw pipe 1 config bw 300Kbit/s queue 50KBytes" +.Pp +note that we use the +.Cm out +modifier so that the rule is not used twice. +Remember in fact that +.Nm +rules are checked both on incoming and outgoing packets. +.Pp +Should we want to simulate a bidirectional link with bandwidth +limitations, the correct way is the following: +.Pp +.Dl "ipfw add pipe 1 ip from any to any out" +.Dl "ipfw add pipe 2 ip from any to any in" +.Dl "ipfw pipe 1 config bw 64Kbit/s queue 10Kbytes" +.Dl "ipfw pipe 2 config bw 64Kbit/s queue 10Kbytes" +.Pp +The above can be very useful, e.g.\& if you want to see how +your fancy Web page will look for a residential user who +is connected only through a slow link. +You should not use only one pipe for both directions, unless +you want to simulate a half-duplex medium (e.g.\& AppleTalk, +Ethernet, IRDA). +It is not necessary that both pipes have the same configuration, +so we can also simulate asymmetric links. +.Pp +Should we want to verify network performance with the RED queue +management algorithm: +.Pp +.Dl "ipfw add pipe 1 ip from any to any" +.Dl "ipfw pipe 1 config bw 500Kbit/s queue 100 red 0.002/30/80/0.1" +.Pp +Another typical application of the traffic shaper is to +introduce some delay in the communication. 
+This can significantly affect applications which do a lot of Remote +Procedure Calls, and where the round-trip-time of the +connection often becomes a limiting factor much more than +bandwidth: +.Pp +.Dl "ipfw add pipe 1 ip from any to any out" +.Dl "ipfw add pipe 2 ip from any to any in" +.Dl "ipfw pipe 1 config delay 250ms bw 1Mbit/s" +.Dl "ipfw pipe 2 config delay 250ms bw 1Mbit/s" +.Pp +Per-flow queueing can be useful for a variety of purposes. +A very simple one is counting traffic: +.Pp +.Dl "ipfw add pipe 1 tcp from any to any" +.Dl "ipfw add pipe 1 udp from any to any" +.Dl "ipfw add pipe 1 ip from any to any" +.Dl "ipfw pipe 1 config mask all" +.Pp +The above set of rules will create queues (and collect +statistics) for all traffic. +Because the pipes have no limitations, the only effect is +collecting statistics. +Note that we need 3 rules, not just the last one, because +when +.Nm +tries to match IP packets it will not consider ports, so we +would not see connections on separate ports as different +ones. +.Pp +A more sophisticated example is limiting the outbound traffic +on a net with per-host limits, rather than per-network limits: +.Pp +.Dl "ipfw add pipe 1 ip from 192.168.2.0/24 to any out" +.Dl "ipfw add pipe 2 ip from any to 192.168.2.0/24 in" +.Dl "ipfw pipe 1 config mask src-ip 0x000000ff bw 200Kbit/s queue 20Kbytes" +.Dl "ipfw pipe 2 config mask dst-ip 0x000000ff bw 200Kbit/s queue 20Kbytes" +.Ss LOOKUP TABLES +In the following example, we need to create several traffic bandwidth +classes and we need different hosts/networks to fall into different classes. +We create one pipe for each class and configure them accordingly. +Then we create a single table and fill it with IP subnets and addresses. +For each subnet/host we set the argument equal to the number of the pipe +that it should use. +Then we classify traffic using a single rule: +.Pp +.Dl "ipfw pipe 1 config bw 1000Kbyte/s" +.Dl "ipfw pipe 4 config bw 4000Kbyte/s" +.Dl "..." 
+.Dl "ipfw table 1 add 192.168.2.0/24 1" +.Dl "ipfw table 1 add 192.168.0.0/27 4" +.Dl "ipfw table 1 add 192.168.0.2 1" +.Dl "..." +.Dl "ipfw add pipe tablearg ip from table(1) to any" +.Pp +Using the +.Cm fwd +action, the table entries may include hostnames and IP addresses. +.Pp +.Dl "ipfw table 1 add 192.168.2.0/24 10.23.2.1" +.Dl "ipfw table 1 add 192.168.0.0/27 router1.dmz" +.Dl "..." +.Dl "ipfw add 100 fwd tablearg ip from any to table(1)" +.Ss SETS OF RULES +To add a set of rules atomically, e.g.\& set 18: +.Pp +.Dl "ipfw set disable 18" +.Dl "ipfw add NN set 18 ... # repeat as needed" +.Dl "ipfw set enable 18" +.Pp +To delete a set of rules atomically the command is simply: +.Pp +.Dl "ipfw delete set 18" +.Pp +To test a ruleset and disable it and regain control if something goes wrong: +.Pp +.Dl "ipfw set disable 18" +.Dl "ipfw add NN set 18 ... # repeat as needed" +.Dl "ipfw set enable 18; echo done; sleep 30 && ipfw set disable 18" +.Pp +Here if everything goes well, you press control-C before the "sleep" +terminates, and your ruleset will be left active. +Otherwise, e.g.\& if +you cannot access your box, the ruleset will be disabled after +the sleep terminates thus restoring the previous situation. 
+.Pp +To show rules of the specific set: +.Pp +.Dl "ipfw set 18 show" +.Pp +To show rules of the disabled set: +.Pp +.Dl "ipfw -S set 18 show" +.Pp +To clear the counters of a specific rule of the specific set: +.Pp +.Dl "ipfw set 18 zero NN" +.Pp +To delete a specific rule of the specific set: +.Pp +.Dl "ipfw set 18 delete NN" +.Ss NAT, REDIRECT AND LSNAT +First redirect all the traffic to nat instance 123: +.Pp +.Dl "ipfw add nat 123 all from any to any" +.Pp +Then to configure nat instance 123 to alias all the outgoing traffic with ip +192.168.0.123, blocking all incoming connections, trying to keep +same ports on both sides, clearing aliasing table on address change +and keeping a log of traffic/link statistics: +.Pp +.Dl "ipfw nat 123 config ip 192.168.0.123 log deny_in reset same_ports" +.Pp +Or to change address of instance 123, aliasing table will be cleared (see +reset option): +.Pp +.Dl "ipfw nat 123 config ip 10.0.0.1" +.Pp +To see configuration of nat instance 123: +.Pp +.Dl "ipfw nat 123 show config" +.Pp +To show logs of all the instances in range 111-999: +.Pp +.Dl "ipfw nat 111-999 show" +.Pp +To see configurations of all instances: +.Pp +.Dl "ipfw nat show config" +.Pp +Or a redirect rule with mixed modes could look like: +.Pp +.Dl "ipfw nat 123 config redirect_addr 10.0.0.1 10.0.0.66" +.Dl " redirect_port tcp 192.168.0.1:80 500" +.Dl " redirect_proto udp 192.168.1.43 192.168.1.1" +.Dl " redirect_addr 192.168.0.10,192.168.0.11" +.Dl " 10.0.0.100 # LSNAT" +.Dl " redirect_port tcp 192.168.0.1:80,192.168.0.10:22" +.Dl " 500 # LSNAT" +.Pp +or it could be split in: +.Pp +.Dl "ipfw nat 1 config redirect_addr 10.0.0.1 10.0.0.66" +.Dl "ipfw nat 2 config redirect_port tcp 192.168.0.1:80 500" +.Dl "ipfw nat 3 config redirect_proto udp 192.168.1.43 192.168.1.1" +.Dl "ipfw nat 4 config redirect_addr 192.168.0.10,192.168.0.11,192.168.0.12" +.Dl " 10.0.0.100" +.Dl "ipfw nat 5 config redirect_port tcp" +.Dl " 192.168.0.1:80,192.168.0.10:22,192.168.0.20:25 500" +.Pp
+.Sh SEE ALSO +.Xr cpp 1 , +.Xr m4 1 , +.Xr altq 4 , +.Xr divert 4 , +.Xr dummynet 4 , +.Xr if_bridge 4 , +.Xr ip 4 , +.Xr ipfirewall 4 , +.Xr ng_ipfw 4 , +.Xr protocols 5 , +.Xr services 5 , +.Xr init 8 , +.Xr kldload 8 , +.Xr reboot 8 , +.Xr sysctl 8 , +.Xr syslogd 8 +.Sh HISTORY +The +.Nm +utility first appeared in +.Fx 2.0 . +.Nm dummynet +was introduced in +.Fx 2.2.8 . +Stateful extensions were introduced in +.Fx 4.0 . +.Nm ipfw2 +was introduced in Summer 2002. +.Sh AUTHORS +.An Ugen J. S. Antsilevich , +.An Poul-Henning Kamp , +.An Alex Nash , +.An Archie Cobbs , +.An Luigi Rizzo . +.Pp +.An -nosplit +API based upon code written by +.An Daniel Boulet +for BSDI. +.Pp +Dummynet has been introduced by Luigi Rizzo in 1997-1998. +.Pp +Some early work (1999-2000) on the +.Nm dummynet +traffic shaper was supported by Akamba Corp. +.Pp +The ipfw core (ipfw2) has been completely redesigned and +reimplemented by Luigi Rizzo in summer 2002. Further +actions and +options have been added by various developers over the years. +.Pp +.An -nosplit +In-kernel NAT support written by +.An Paolo Pisati Aq piso@FreeBSD.org +as part of a Summer of Code 2005 project. +.Pp +SCTP +.Nm nat +support has been developed by +.An The Centre for Advanced Internet Architectures (CAIA) Aq http://www.caia.swin.edu.au . +The primary developers and maintainers are David Hayes and Jason But. +For further information visit: +.Aq http://www.caia.swin.edu.au/urp/SONATA +.Pp +Delay profiles have been developed by Alessandro Cerri and +Luigi Rizzo, supported by the +European Commission within Projects Onelab and Onelab2. +.Sh BUGS +The syntax has grown over the years and sometimes it might be confusing. +Unfortunately, backward compatibility prevents cleaning up mistakes +made in the definition of the syntax. +.Pp +.Em !!! WARNING !!! +.Pp +Misconfiguring the firewall can put your computer in an unusable state, +possibly shutting down network services and requiring console access to +regain control of it.
+.Pp +Incoming packet fragments diverted by +.Cm divert +are reassembled before delivery to the socket. +The action used on those packets is the one from the +rule which matches the first fragment of the packet. +.Pp +Packets diverted to userland, and then reinserted by a userland process +may lose various packet attributes. +The packet source interface name +will be preserved if it is shorter than 8 bytes and the userland process +saves and reuses the sockaddr_in +(as does +.Xr natd 8 ) ; +otherwise, it may be lost. +If a packet is reinserted in this manner, later rules may be incorrectly +applied, making the order of +.Cm divert +rules in the rule sequence very important. +.Pp +Dummynet drops all packets with IPv6 link-local addresses. +.Pp +Rules using +.Cm uid +or +.Cm gid +may not behave as expected. +In particular, incoming SYN packets may +have no uid or gid associated with them since they do not yet belong +to a TCP connection, and the uid/gid associated with a packet may not +be as expected if the associated process calls +.Xr setuid 2 +or similar system calls. +.Pp +Rule syntax is subject to the command line environment and some patterns +may need to be escaped with the backslash character +or quoted appropriately. +.Pp +Due to the architecture of +.Xr libalias 3 , +ipfw nat is not compatible with the TCP segmentation offloading (TSO). +Thus, to reliably nat your network traffic, please disable TSO +on your NICs using +.Xr ifconfig 8 . +.Pp +ICMP error messages are not implicitly matched by dynamic rules +for the respective conversations. +To avoid failures of network error detection and path MTU discovery, +ICMP error messages may need to be allowed explicitly through static +rules.
diff --git a/ipfw/ipfw2.c b/ipfw/ipfw2.c new file mode 100644 index 0000000..bf3a9b1 --- /dev/null +++ b/ipfw/ipfw2.c @@ -0,0 +1,3914 @@ +/* + * Copyright (c) 2002-2003 Luigi Rizzo + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Idea and grammar partially left from: + * Copyright (c) 1993 Daniel Boulet + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + * + * NEW command line interface for IP firewall facility + * + * $FreeBSD: head/sbin/ipfw/ipfw2.c 206843 2010-04-19 15:11:45Z luigi $ + */ + +#include +#include +#include +#include + +#include "ipfw2.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* ctime */ +#include /* _long_to_time */ +#include +/* + * FreeBSD uses __unused as a shorthand for __attribute__ ((__unused__)) + * whereas Linux sometimes uses __unused as a variable name. + * undefine the macro around problematic places. 
+ */ +#undef __unused +#include +#define __unused __attribute__ ((__unused__)) + +#include +#include /* only IFNAMSIZ */ +#include +#include /* only n_short, n_long */ +#include +#include +#include +#include +#include + +struct cmdline_opts co; /* global options */ + +int resvd_set_number = RESVD_SET; + +#define GET_UINT_ARG(arg, min, max, tok, s_x) do { \ + if (!av[0]) \ + errx(EX_USAGE, "%s: missing argument", match_value(s_x, tok)); \ + if (_substrcmp(*av, "tablearg") == 0) { \ + arg = IP_FW_TABLEARG; \ + break; \ + } \ + \ + { \ + long _xval; \ + char *end; \ + \ + _xval = strtol(*av, &end, 10); \ + \ + if (!isdigit(**av) || *end != '\0' || (_xval == 0 && errno == EINVAL)) \ + errx(EX_DATAERR, "%s: invalid argument: %s", \ + match_value(s_x, tok), *av); \ + \ + if (errno == ERANGE || _xval < min || _xval > max) \ + errx(EX_DATAERR, "%s: argument is out of range (%u..%u): %s", \ + match_value(s_x, tok), min, max, *av); \ + \ + if (_xval == IP_FW_TABLEARG) \ + errx(EX_DATAERR, "%s: illegal argument value: %s", \ + match_value(s_x, tok), *av); \ + arg = _xval; \ + } \ +} while (0) + +static void +PRINT_UINT_ARG(const char *str, uint32_t arg) +{ + if (str != NULL) + printf("%s",str); + if (arg == IP_FW_TABLEARG) + printf("tablearg"); + else + printf("%u", arg); +} + +static struct _s_x f_tcpflags[] = { + { "syn", TH_SYN }, + { "fin", TH_FIN }, + { "ack", TH_ACK }, + { "psh", TH_PUSH }, + { "rst", TH_RST }, + { "urg", TH_URG }, + { "tcp flag", 0 }, + { NULL, 0 } +}; + +static struct _s_x f_tcpopts[] = { + { "mss", IP_FW_TCPOPT_MSS }, + { "maxseg", IP_FW_TCPOPT_MSS }, + { "window", IP_FW_TCPOPT_WINDOW }, + { "sack", IP_FW_TCPOPT_SACK }, + { "ts", IP_FW_TCPOPT_TS }, + { "timestamp", IP_FW_TCPOPT_TS }, + { "cc", IP_FW_TCPOPT_CC }, + { "tcp option", 0 }, + { NULL, 0 } +}; + +/* + * IP options span the range 0 to 255 so we need to remap them + * (though in fact only the low 5 bits are significant). 
+ */ +static struct _s_x f_ipopts[] = { + { "ssrr", IP_FW_IPOPT_SSRR}, + { "lsrr", IP_FW_IPOPT_LSRR}, + { "rr", IP_FW_IPOPT_RR}, + { "ts", IP_FW_IPOPT_TS}, + { "ip option", 0 }, + { NULL, 0 } +}; + +static struct _s_x f_iptos[] = { + { "lowdelay", IPTOS_LOWDELAY}, + { "throughput", IPTOS_THROUGHPUT}, + { "reliability", IPTOS_RELIABILITY}, + { "mincost", IPTOS_MINCOST}, + { "congestion", IPTOS_ECN_CE}, + { "ecntransport", IPTOS_ECN_ECT0}, + { "ip tos option", 0}, + { NULL, 0 } +}; + +static struct _s_x limit_masks[] = { + {"all", DYN_SRC_ADDR|DYN_SRC_PORT|DYN_DST_ADDR|DYN_DST_PORT}, + {"src-addr", DYN_SRC_ADDR}, + {"src-port", DYN_SRC_PORT}, + {"dst-addr", DYN_DST_ADDR}, + {"dst-port", DYN_DST_PORT}, + {NULL, 0} +}; + +/* + * we use IPPROTO_ETHERTYPE as a fake protocol id to call the print routines + * This is only used in this code. + */ +#define IPPROTO_ETHERTYPE 0x1000 +static struct _s_x ether_types[] = { + /* + * Note, we cannot use "-:&/" in the names because they are field + * separators in the type specifications. Also, we use s = NULL as + * end-delimiter, because a type of 0 can be legal. 
+ */ + { "ip", 0x0800 }, + { "ipv4", 0x0800 }, + { "ipv6", 0x86dd }, + { "arp", 0x0806 }, + { "rarp", 0x8035 }, + { "vlan", 0x8100 }, + { "loop", 0x9000 }, + { "trail", 0x1000 }, + { "at", 0x809b }, + { "atalk", 0x809b }, + { "aarp", 0x80f3 }, + { "pppoe_disc", 0x8863 }, + { "pppoe_sess", 0x8864 }, + { "ipx_8022", 0x00E0 }, + { "ipx_8023", 0x0000 }, + { "ipx_ii", 0x8137 }, + { "ipx_snap", 0x8137 }, + { "ipx", 0x8137 }, + { "ns", 0x0600 }, + { NULL, 0 } +}; + + +static struct _s_x rule_actions[] = { + { "accept", TOK_ACCEPT }, + { "pass", TOK_ACCEPT }, + { "allow", TOK_ACCEPT }, + { "permit", TOK_ACCEPT }, + { "count", TOK_COUNT }, + { "pipe", TOK_PIPE }, + { "queue", TOK_QUEUE }, + { "divert", TOK_DIVERT }, + { "tee", TOK_TEE }, + { "netgraph", TOK_NETGRAPH }, + { "ngtee", TOK_NGTEE }, + { "fwd", TOK_FORWARD }, + { "forward", TOK_FORWARD }, + { "skipto", TOK_SKIPTO }, + { "deny", TOK_DENY }, + { "drop", TOK_DENY }, + { "reject", TOK_REJECT }, + { "reset6", TOK_RESET6 }, + { "reset", TOK_RESET }, + { "unreach6", TOK_UNREACH6 }, + { "unreach", TOK_UNREACH }, + { "check-state", TOK_CHECKSTATE }, + { "//", TOK_COMMENT }, + { "nat", TOK_NAT }, + { "reass", TOK_REASS }, + { "setfib", TOK_SETFIB }, + { NULL, 0 } /* terminator */ +}; + +static struct _s_x rule_action_params[] = { + { "altq", TOK_ALTQ }, + { "log", TOK_LOG }, + { "tag", TOK_TAG }, + { "untag", TOK_UNTAG }, + { NULL, 0 } /* terminator */ +}; + +/* + * The 'lookup' instruction accepts one of the following arguments. + * -1 is a terminator for the list. + * Arguments are passed as v[1] in O_DST_LOOKUP options. 
+ */ +static int lookup_key[] = { + TOK_DSTIP, TOK_SRCIP, TOK_DSTPORT, TOK_SRCPORT, + TOK_UID, TOK_JAIL, TOK_DSCP, -1 }; + +static struct _s_x rule_options[] = { + { "tagged", TOK_TAGGED }, + { "uid", TOK_UID }, + { "gid", TOK_GID }, + { "jail", TOK_JAIL }, + { "in", TOK_IN }, + { "limit", TOK_LIMIT }, + { "keep-state", TOK_KEEPSTATE }, + { "bridged", TOK_LAYER2 }, + { "layer2", TOK_LAYER2 }, + { "out", TOK_OUT }, + { "diverted", TOK_DIVERTED }, + { "diverted-loopback", TOK_DIVERTEDLOOPBACK }, + { "diverted-output", TOK_DIVERTEDOUTPUT }, + { "xmit", TOK_XMIT }, + { "recv", TOK_RECV }, + { "via", TOK_VIA }, + { "fragment", TOK_FRAG }, + { "frag", TOK_FRAG }, + { "fib", TOK_FIB }, + { "ipoptions", TOK_IPOPTS }, + { "ipopts", TOK_IPOPTS }, + { "iplen", TOK_IPLEN }, + { "ipid", TOK_IPID }, + { "ipprecedence", TOK_IPPRECEDENCE }, + { "dscp", TOK_DSCP }, + { "iptos", TOK_IPTOS }, + { "ipttl", TOK_IPTTL }, + { "ipversion", TOK_IPVER }, + { "ipver", TOK_IPVER }, + { "estab", TOK_ESTAB }, + { "established", TOK_ESTAB }, + { "setup", TOK_SETUP }, + { "tcpdatalen", TOK_TCPDATALEN }, + { "tcpflags", TOK_TCPFLAGS }, + { "tcpflgs", TOK_TCPFLAGS }, + { "tcpoptions", TOK_TCPOPTS }, + { "tcpopts", TOK_TCPOPTS }, + { "tcpseq", TOK_TCPSEQ }, + { "tcpack", TOK_TCPACK }, + { "tcpwin", TOK_TCPWIN }, + { "icmptype", TOK_ICMPTYPES }, + { "icmptypes", TOK_ICMPTYPES }, + { "dst-ip", TOK_DSTIP }, + { "src-ip", TOK_SRCIP }, + { "dst-port", TOK_DSTPORT }, + { "src-port", TOK_SRCPORT }, + { "proto", TOK_PROTO }, + { "MAC", TOK_MAC }, + { "mac", TOK_MAC }, + { "mac-type", TOK_MACTYPE }, + { "verrevpath", TOK_VERREVPATH }, + { "versrcreach", TOK_VERSRCREACH }, + { "antispoof", TOK_ANTISPOOF }, + { "ipsec", TOK_IPSEC }, + { "icmp6type", TOK_ICMP6TYPES }, + { "icmp6types", TOK_ICMP6TYPES }, + { "ext6hdr", TOK_EXT6HDR}, + { "flow-id", TOK_FLOWID}, + { "ipv6", TOK_IPV6}, + { "ip6", TOK_IPV6}, + { "ipv4", TOK_IPV4}, + { "ip4", TOK_IPV4}, + { "dst-ipv6", TOK_DSTIP6}, + { "dst-ip6", TOK_DSTIP6}, + { 
"src-ipv6", TOK_SRCIP6}, + { "src-ip6", TOK_SRCIP6}, + { "lookup", TOK_LOOKUP}, + { "//", TOK_COMMENT }, + + { "not", TOK_NOT }, /* pseudo option */ + { "!", /* escape ? */ TOK_NOT }, /* pseudo option */ + { "or", TOK_OR }, /* pseudo option */ + { "|", /* escape */ TOK_OR }, /* pseudo option */ + { "{", TOK_STARTBRACE }, /* pseudo option */ + { "(", TOK_STARTBRACE }, /* pseudo option */ + { "}", TOK_ENDBRACE }, /* pseudo option */ + { ")", TOK_ENDBRACE }, /* pseudo option */ + { NULL, 0 } /* terminator */ +}; + +/* + * Helper routine to print a possibly unaligned uint64_t on + * various platform. If width > 0, print the value with + * the desired width, followed by a space; + * otherwise, return the required width. + */ +int +pr_u64(uint64_t *pd, int width) +{ +#ifdef TCC +#define U64_FMT "I64" +#else +#define U64_FMT "llu" +#endif + uint64_t u; + unsigned long long d; + + bcopy (pd, &u, sizeof(u)); + d = u; + return (width > 0) ? + printf("%*" U64_FMT " ", width, d) : + snprintf(NULL, 0, "%" U64_FMT, d) ; +#undef U64_FMT +} + +void * +safe_calloc(size_t number, size_t size) +{ + void *ret = calloc(number, size); + + if (ret == NULL) + err(EX_OSERR, "calloc"); + return ret; +} + +void * +safe_realloc(void *ptr, size_t size) +{ + void *ret = realloc(ptr, size); + + if (ret == NULL) + err(EX_OSERR, "realloc"); + return ret; +} + +/* + * conditionally runs the command. 
+ * Selected options or negative -> getsockopt + */ +int +do_cmd(int optname, void *optval, uintptr_t optlen) +{ + static int s = -1; /* the socket */ + int i; + + if (co.test_only) + return 0; + + if (s == -1) + s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); + if (s < 0) + err(EX_UNAVAILABLE, "socket"); + + if (optname == IP_FW_GET || optname == IP_DUMMYNET_GET || + optname == IP_FW_ADD || optname == IP_FW_TABLE_LIST || + optname == IP_FW_TABLE_GETSIZE || + optname == IP_FW_NAT_GET_CONFIG || + optname < 0 || + optname == IP_FW_NAT_GET_LOG) { + if (optname < 0) + optname = -optname; + i = getsockopt(s, IPPROTO_IP, optname, optval, + (socklen_t *)optlen); + } else { + i = setsockopt(s, IPPROTO_IP, optname, optval, optlen); + } + return i; +} + +/** + * match_token takes a table and a string, returns the value associated + * with the string (-1 in case of failure). + */ +int +match_token(struct _s_x *table, char *string) +{ + struct _s_x *pt; + uint i = strlen(string); + + for (pt = table ; i && pt->s != NULL ; pt++) + if (strlen(pt->s) == i && !bcmp(string, pt->s, i)) + return pt->x; + return -1; +} + +/** + * match_value takes a table and a value, returns the string associated + * with the value (NULL in case of failure). + */ +char const * +match_value(struct _s_x *p, int value) +{ + for (; p->s != NULL; p++) + if (p->x == value) + return p->s; + return NULL; +} + +/* + * _substrcmp takes two strings and returns 1 if they do not match, + * and 0 if they match exactly or the first string is a sub-string + * of the second. A warning is printed to stderr in the case that the + * first string is a sub-string of the second. + * + * This function will be removed in the future through the usual + * deprecation process. 
+ */ +int +_substrcmp(const char *str1, const char* str2) +{ + + if (strncmp(str1, str2, strlen(str1)) != 0) + return 1; + + if (strlen(str1) != strlen(str2)) + warnx("DEPRECATED: '%s' matched '%s' as a sub-string", + str1, str2); + return 0; +} + +/* + * _substrcmp2 takes three strings and returns 1 if the first two do not match, + * and 0 if they match exactly or the second string is a sub-string + * of the first. A warning is printed to stderr in the case that the + * first string does not match the third. + * + * This function exists to warn about the bizarre construction + * strncmp(str, "by", 2) which is used to allow people to use a shortcut + * for "bytes". The problem is that in addition to accepting "by", + * "byt", "byte", and "bytes", it also accepts "by_rabid_dogs" and any + * other string beginning with "by". + * + * This function will be removed in the future through the usual + * deprecation process. + */ +int +_substrcmp2(const char *str1, const char* str2, const char* str3) +{ + + if (strncmp(str1, str2, strlen(str2)) != 0) + return 1; + + if (strcmp(str1, str3) != 0) + warnx("DEPRECATED: '%s' matched '%s'", + str1, str3); + return 0; +} + +/* + * prints one port, symbolic or numeric + */ +static void +print_port(int proto, uint16_t port) +{ + + if (proto == IPPROTO_ETHERTYPE) { + char const *s; + + if (co.do_resolv && (s = match_value(ether_types, port)) ) + printf("%s", s); + else + printf("0x%04x", port); + } else { + struct servent *se = NULL; + if (co.do_resolv) { + struct protoent *pe = getprotobynumber(proto); + + se = getservbyport(htons(port), pe ? 
pe->p_name : NULL); + } + if (se) + printf("%s", se->s_name); + else + printf("%d", port); + } +} + +static struct _s_x _port_name[] = { + {"dst-port", O_IP_DSTPORT}, + {"src-port", O_IP_SRCPORT}, + {"ipid", O_IPID}, + {"iplen", O_IPLEN}, + {"ipttl", O_IPTTL}, + {"mac-type", O_MAC_TYPE}, + {"tcpdatalen", O_TCPDATALEN}, + {"tagged", O_TAGGED}, + {NULL, 0} +}; + +/* + * Print the values in a list of 16-bit items of the types above. + * XXX todo: add support for mask. + */ +static void +print_newports(ipfw_insn_u16 *cmd, int proto, int opcode) +{ + uint16_t *p = cmd->ports; + int i; + char const *sep; + + if (opcode != 0) { + sep = match_value(_port_name, opcode); + if (sep == NULL) + sep = "???"; + printf (" %s", sep); + } + sep = " "; + for (i = F_LEN((ipfw_insn *)cmd) - 1; i > 0; i--, p += 2) { + printf("%s", sep); + print_port(proto, p[0]); + if (p[0] != p[1]) { + printf("-"); + print_port(proto, p[1]); + } + sep = ","; + } +} + +/* + * Like strtol, but also translates service names into port numbers + * for some protocols. + * In particular: + * proto == -1 disables the protocol check; + * proto == IPPROTO_ETHERTYPE looks up an internal table + * proto == any other protocol number matches service names for that protocol. + * Returns *end == s in case the parameter is not found. + */ +static int +strtoport(char *s, char **end, int base, int proto) +{ + char *p, *buf; + char *s1; + int i; + + *end = s; /* default - not found */ + if (*s == '\0') + return 0; /* not found */ + + if (isdigit(*s)) + return strtol(s, end, base); + + /* + * find separator. '\\' escapes the next char.
+ */ + for (s1 = s; *s1 && (isalnum(*s1) || *s1 == '\\') ; s1++) + if (*s1 == '\\' && s1[1] != '\0') + s1++; + + buf = safe_calloc(s1 - s + 1, 1); + + /* + * copy into a buffer skipping backslashes + */ + for (p = s, i = 0; p != s1 ; p++) + if (*p != '\\') + buf[i++] = *p; + buf[i++] = '\0'; + + if (proto == IPPROTO_ETHERTYPE) { + i = match_token(ether_types, buf); + free(buf); + if (i != -1) { /* found */ + *end = s1; + return i; + } + } else { + struct protoent *pe = NULL; + struct servent *se; + + if (proto != 0) + pe = getprotobynumber(proto); + setservent(1); + se = getservbyname(buf, pe ? pe->p_name : NULL); + free(buf); + if (se != NULL) { + *end = s1; + return ntohs(se->s_port); + } + } + return 0; /* not found */ +} + +/* + * Fill the body of the command with the list of port ranges. + */ +static int +fill_newports(ipfw_insn_u16 *cmd, char *av, int proto) +{ + uint16_t a, b, *p = cmd->ports; + int i = 0; + char *s = av; + + while (*s) { + a = strtoport(av, &s, 0, proto); + if (s == av) /* empty or invalid argument */ + return (0); + + switch (*s) { + case '-': /* a range */ + av = s + 1; + b = strtoport(av, &s, 0, proto); + /* Reject expressions like '1-abc' or '1-2-3'. 
*/ + if (s == av || (*s != ',' && *s != '\0')) + return (0); + p[0] = a; + p[1] = b; + break; + case ',': /* comma separated list */ + case '\0': + p[0] = p[1] = a; + break; + default: + warnx("port list: invalid separator <%c> in <%s>", + *s, av); + return (0); + } + + i++; + p += 2; + av = s + 1; + } + if (i > 0) { + if (i + 1 > F_LEN_MASK) + errx(EX_DATAERR, "too many ports/ranges\n"); + cmd->o.len |= i + 1; /* leave F_NOT and F_OR untouched */ + } + return (i); +} + +static struct _s_x icmpcodes[] = { + { "net", ICMP_UNREACH_NET }, + { "host", ICMP_UNREACH_HOST }, + { "protocol", ICMP_UNREACH_PROTOCOL }, + { "port", ICMP_UNREACH_PORT }, + { "needfrag", ICMP_UNREACH_NEEDFRAG }, + { "srcfail", ICMP_UNREACH_SRCFAIL }, + { "net-unknown", ICMP_UNREACH_NET_UNKNOWN }, + { "host-unknown", ICMP_UNREACH_HOST_UNKNOWN }, + { "isolated", ICMP_UNREACH_ISOLATED }, + { "net-prohib", ICMP_UNREACH_NET_PROHIB }, + { "host-prohib", ICMP_UNREACH_HOST_PROHIB }, + { "tosnet", ICMP_UNREACH_TOSNET }, + { "toshost", ICMP_UNREACH_TOSHOST }, + { "filter-prohib", ICMP_UNREACH_FILTER_PROHIB }, + { "host-precedence", ICMP_UNREACH_HOST_PRECEDENCE }, + { "precedence-cutoff", ICMP_UNREACH_PRECEDENCE_CUTOFF }, + { NULL, 0 } +}; + +static void +fill_reject_code(u_short *codep, char *str) +{ + int val; + char *s; + + val = strtoul(str, &s, 0); + if (s == str || *s != '\0' || val >= 0x100) + val = match_token(icmpcodes, str); + if (val < 0) + errx(EX_DATAERR, "unknown ICMP unreachable code ``%s''", str); + *codep = val; + return; +} + +static void +print_reject_code(uint16_t code) +{ + char const *s = match_value(icmpcodes, code); + + if (s != NULL) + printf("unreach %s", s); + else + printf("unreach %u", code); +} + +/* + * Returns the number of bits set (from left) in a contiguous bitmask, + * or -1 if the mask is not contiguous. + * XXX this needs a proper fix. + * This effectively works on masks in big-endian (network) format. + * when compiled on little endian architectures. 
+ * + * First bit is bit 7 of the first byte -- note, for MAC addresses, + * the first bit on the wire is bit 0 of the first byte. + * len is the max length in bits. + */ +int +contigmask(uint8_t *p, int len) +{ + int i, n; + + for (i=0; iarg1 & 0xff; + uint8_t clear = (cmd->arg1 >> 8) & 0xff; + + if (list == f_tcpflags && set == TH_SYN && clear == TH_ACK) { + printf(" setup"); + return; + } + + printf(" %s ", name); + for (i=0; list[i].x != 0; i++) { + if (set & list[i].x) { + set &= ~list[i].x; + printf("%s%s", comma, list[i].s); + comma = ","; + } + if (clear & list[i].x) { + clear &= ~list[i].x; + printf("%s!%s", comma, list[i].s); + comma = ","; + } + } +} + +/* + * Print the ip address contained in a command. + */ +static void +print_ip(ipfw_insn_ip *cmd, char const *s) +{ + struct hostent *he = NULL; + uint32_t len = F_LEN((ipfw_insn *)cmd); + uint32_t *a = ((ipfw_insn_u32 *)cmd)->d; + + if (cmd->o.opcode == O_IP_DST_LOOKUP && len > F_INSN_SIZE(ipfw_insn_u32)) { + uint32_t d = a[1]; + const char *arg = ""; + + if (d < sizeof(lookup_key)/sizeof(lookup_key[0])) + arg = match_value(rule_options, lookup_key[d]); + printf("%s lookup %s %d", cmd->o.len & F_NOT ? " not": "", + arg, cmd->o.arg1); + return; + } + printf("%s%s ", cmd->o.len & F_NOT ? 
" not": "", s); + + if (cmd->o.opcode == O_IP_SRC_ME || cmd->o.opcode == O_IP_DST_ME) { + printf("me"); + return; + } + if (cmd->o.opcode == O_IP_SRC_LOOKUP || + cmd->o.opcode == O_IP_DST_LOOKUP) { + printf("table(%u", ((ipfw_insn *)cmd)->arg1); + if (len == F_INSN_SIZE(ipfw_insn_u32)) + printf(",%u", *a); + printf(")"); + return; + } + if (cmd->o.opcode == O_IP_SRC_SET || cmd->o.opcode == O_IP_DST_SET) { + uint32_t x, *map = (uint32_t *)&(cmd->mask); + int i, j; + char comma = '{'; + + x = cmd->o.arg1 - 1; + x = htonl( ~x ); + cmd->addr.s_addr = htonl(cmd->addr.s_addr); + printf("%s/%d", inet_ntoa(cmd->addr), + contigmask((uint8_t *)&x, 32)); + x = cmd->addr.s_addr = htonl(cmd->addr.s_addr); + x &= 0xff; /* base */ + /* + * Print bits and ranges. + * Locate first bit set (i), then locate first bit unset (j). + * If we have 3+ consecutive bits set, then print them as a + * range, otherwise only print the initial bit and rescan. + */ + for (i=0; i < cmd->o.arg1; i++) + if (map[i/32] & (1<<(i & 31))) { + for (j=i+1; j < cmd->o.arg1; j++) + if (!(map[ j/32] & (1<<(j & 31)))) + break; + printf("%c%d", comma, i+x); + if (j>i+2) { /* range has at least 3 elements */ + printf("-%d", j-1+x); + i = j-1; + } + comma = ','; + } + printf("}"); + return; + } + /* + * len == 2 indicates a single IP, whereas lists of 1 or more + * addr/mask pairs have len = (2n+1). We convert len to n so we + * use that to count the number of entries. + */ + for (len = len / 2; len > 0; len--, a += 2) { + int mb = /* mask length */ + (cmd->o.opcode == O_IP_SRC || cmd->o.opcode == O_IP_DST) ? 
+ 32 : contigmask((uint8_t *)&(a[1]), 32); + if (mb == 32 && co.do_resolv) + he = gethostbyaddr((char *)&(a[0]), sizeof(u_long), AF_INET); + if (he != NULL) /* resolved to name */ + printf("%s", he->h_name); + else if (mb == 0) /* any */ + printf("any"); + else { /* numeric IP followed by some kind of mask */ + printf("%s", inet_ntoa( *((struct in_addr *)&a[0]) ) ); + if (mb < 0) + printf(":%s", inet_ntoa( *((struct in_addr *)&a[1]) ) ); + else if (mb < 32) + printf("/%d", mb); + } + if (len > 1) + printf(","); + } +} + +/* + * prints a MAC address/mask pair + */ +static void +print_mac(uint8_t *addr, uint8_t *mask) +{ + int l = contigmask(mask, 48); + + if (l == 0) + printf(" any"); + else { + printf(" %02x:%02x:%02x:%02x:%02x:%02x", + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); + if (l == -1) + printf("&%02x:%02x:%02x:%02x:%02x:%02x", + mask[0], mask[1], mask[2], + mask[3], mask[4], mask[5]); + else if (l < 48) + printf("/%d", l); + } +} + +static void +fill_icmptypes(ipfw_insn_u32 *cmd, char *av) +{ + uint8_t type; + + cmd->d[0] = 0; + while (*av) { + if (*av == ',') + av++; + + type = strtoul(av, &av, 0); + + if (*av != ',' && *av != '\0') + errx(EX_DATAERR, "invalid ICMP type"); + + if (type > 31) + errx(EX_DATAERR, "ICMP type out of range"); + + cmd->d[0] |= 1 << type; + } + cmd->o.opcode = O_ICMPTYPE; + cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); +} + +static void +print_icmptypes(ipfw_insn_u32 *cmd) +{ + int i; + char sep= ' '; + + printf(" icmptypes"); + for (i = 0; i < 32; i++) { + if ( (cmd->d[0] & (1 << (i))) == 0) + continue; + printf("%c%d", sep, i); + sep = ','; + } +} + +/* + * show_ipfw() prints the body of an ipfw rule. + * Because the standard rule has at least proto src_ip dst_ip, we use + * a helper function to produce these entries if not provided explicitly. + * The first argument is the list of fields we have, the second is + * the list of fields we want to be printed. 
+ * + * Special cases if we have provided a MAC header: + * + if the rule does not contain IP addresses/ports, do not print them; + * + if the rule does not contain an IP proto, print "all" instead of "ip"; + * + * Once we have 'have_options', IP header fields are printed as options. + */ +#define HAVE_PROTO 0x0001 +#define HAVE_SRCIP 0x0002 +#define HAVE_DSTIP 0x0004 +#define HAVE_PROTO4 0x0008 +#define HAVE_PROTO6 0x0010 +#define HAVE_IP 0x0100 +#define HAVE_OPTIONS 0x8000 + +static void +show_prerequisites(int *flags, int want, int cmd __unused) +{ + if (co.comment_only) + return; + if ( (*flags & HAVE_IP) == HAVE_IP) + *flags |= HAVE_OPTIONS; + + if ( !(*flags & HAVE_OPTIONS)) { + if ( !(*flags & HAVE_PROTO) && (want & HAVE_PROTO)) { + if ( (*flags & HAVE_PROTO4)) + printf(" ip4"); + else if ( (*flags & HAVE_PROTO6)) + printf(" ip6"); + else + printf(" ip"); + } + if ( !(*flags & HAVE_SRCIP) && (want & HAVE_SRCIP)) + printf(" from any"); + if ( !(*flags & HAVE_DSTIP) && (want & HAVE_DSTIP)) + printf(" to any"); + } + *flags |= want; +} + +static void +show_ipfw(struct ip_fw *rule, int pcwidth, int bcwidth) +{ + static int twidth = 0; + int l; + ipfw_insn *cmd, *tagptr = NULL; + const char *comment = NULL; /* ptr to comment if we have one */ + int proto = 0; /* default */ + int flags = 0; /* prerequisites */ + ipfw_insn_log *logptr = NULL; /* set if we find an O_LOG */ + ipfw_insn_altq *altqptr = NULL; /* set if we find an O_ALTQ */ + int or_block = 0; /* we are in an or block */ + uint32_t set_disable; + + bcopy(&rule->next_rule, &set_disable, sizeof(set_disable)); + + if (set_disable & (1 << rule->set)) { /* disabled */ + if (!co.show_sets) + return; + else + printf("# DISABLED "); + } + printf("%05u ", rule->rulenum); + + if (pcwidth > 0 || bcwidth > 0) { + pr_u64(&rule->pcnt, pcwidth); + pr_u64(&rule->bcnt, bcwidth); + } + + if (co.do_time == 2) + printf("%10u ", rule->timestamp); + else if (co.do_time == 1) { + char timestr[30]; + time_t t = (time_t)0; + + 
if (twidth == 0) { + strcpy(timestr, ctime(&t)); + *strchr(timestr, '\n') = '\0'; + twidth = strlen(timestr); + } + if (rule->timestamp) { + t = _long_to_time(rule->timestamp); + + strcpy(timestr, ctime(&t)); + *strchr(timestr, '\n') = '\0'; + printf("%s ", timestr); + } else { + printf("%*s", twidth, " "); + } + } + + if (co.show_sets) + printf("set %d ", rule->set); + + /* + * print the optional "match probability" + */ + if (rule->cmd_len > 0) { + cmd = rule->cmd ; + if (cmd->opcode == O_PROB) { + ipfw_insn_u32 *p = (ipfw_insn_u32 *)cmd; + double d = 1.0 * p->d[0]; + + d = (d / 0x7fffffff); + printf("prob %f ", d); + } + } + + /* + * first print actions + */ + for (l = rule->cmd_len - rule->act_ofs, cmd = ACTION_PTR(rule); + l > 0 ; l -= F_LEN(cmd), cmd += F_LEN(cmd)) { + switch(cmd->opcode) { + case O_CHECK_STATE: + printf("check-state"); + /* avoid printing anything else */ + flags = HAVE_PROTO | HAVE_SRCIP | + HAVE_DSTIP | HAVE_IP; + break; + + case O_ACCEPT: + printf("allow"); + break; + + case O_COUNT: + printf("count"); + break; + + case O_DENY: + printf("deny"); + break; + + case O_REJECT: + if (cmd->arg1 == ICMP_REJECT_RST) + printf("reset"); + else if (cmd->arg1 == ICMP_UNREACH_HOST) + printf("reject"); + else + print_reject_code(cmd->arg1); + break; + + case O_UNREACH6: + if (cmd->arg1 == ICMP6_UNREACH_RST) + printf("reset6"); + else + print_unreach6_code(cmd->arg1); + break; + + case O_SKIPTO: + PRINT_UINT_ARG("skipto ", cmd->arg1); + break; + + case O_PIPE: + PRINT_UINT_ARG("pipe ", cmd->arg1); + break; + + case O_QUEUE: + PRINT_UINT_ARG("queue ", cmd->arg1); + break; + + case O_DIVERT: + PRINT_UINT_ARG("divert ", cmd->arg1); + break; + + case O_TEE: + PRINT_UINT_ARG("tee ", cmd->arg1); + break; + + case O_NETGRAPH: + PRINT_UINT_ARG("netgraph ", cmd->arg1); + break; + + case O_NGTEE: + PRINT_UINT_ARG("ngtee ", cmd->arg1); + break; + + case O_FORWARD_IP: + { + ipfw_insn_sa *s = (ipfw_insn_sa *)cmd; + + if (s->sa.sin_addr.s_addr == INADDR_ANY) { + 
printf("fwd tablearg"); + } else { + printf("fwd %s", inet_ntoa(s->sa.sin_addr)); + } + if (s->sa.sin_port) + printf(",%d", s->sa.sin_port); + } + break; + + case O_LOG: /* O_LOG is printed last */ + logptr = (ipfw_insn_log *)cmd; + break; + + case O_ALTQ: /* O_ALTQ is printed after O_LOG */ + altqptr = (ipfw_insn_altq *)cmd; + break; + + case O_TAG: + tagptr = cmd; + break; + + case O_NAT: + PRINT_UINT_ARG("nat ", cmd->arg1); + break; + + case O_SETFIB: + PRINT_UINT_ARG("setfib ", cmd->arg1); + break; + + case O_REASS: + printf("reass"); + break; + + default: + printf("** unrecognized action %d len %d ", + cmd->opcode, cmd->len); + } + } + if (logptr) { + if (logptr->max_log > 0) + printf(" log logamount %d", logptr->max_log); + else + printf(" log"); + } +#ifndef NO_ALTQ + if (altqptr) { + print_altq_cmd(altqptr); + } +#endif + if (tagptr) { + if (tagptr->len & F_NOT) + PRINT_UINT_ARG(" untag ", tagptr->arg1); + else + PRINT_UINT_ARG(" tag ", tagptr->arg1); + } + + /* + * then print the body. 
+ */ + for (l = rule->act_ofs, cmd = rule->cmd ; + l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) { + if ((cmd->len & F_OR) || (cmd->len & F_NOT)) + continue; + if (cmd->opcode == O_IP4) { + flags |= HAVE_PROTO4; + break; + } else if (cmd->opcode == O_IP6) { + flags |= HAVE_PROTO6; + break; + } + } + if (rule->_pad & 1) { /* empty rules before options */ + if (!co.do_compact) { + show_prerequisites(&flags, HAVE_PROTO, 0); + printf(" from any to any"); + } + flags |= HAVE_IP | HAVE_OPTIONS | HAVE_PROTO | + HAVE_SRCIP | HAVE_DSTIP; + } + + if (co.comment_only) + comment = "..."; + + for (l = rule->act_ofs, cmd = rule->cmd ; + l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) { + /* useful alias */ + ipfw_insn_u32 *cmd32 = (ipfw_insn_u32 *)cmd; + + if (co.comment_only) { + if (cmd->opcode != O_NOP) + continue; + printf(" // %s\n", (char *)(cmd + 1)); + return; + } + + show_prerequisites(&flags, 0, cmd->opcode); + + switch(cmd->opcode) { + case O_PROB: + break; /* done already */ + + case O_PROBE_STATE: + break; /* no need to print anything here */ + + case O_IP_SRC: + case O_IP_SRC_LOOKUP: + case O_IP_SRC_MASK: + case O_IP_SRC_ME: + case O_IP_SRC_SET: + show_prerequisites(&flags, HAVE_PROTO, 0); + if (!(flags & HAVE_SRCIP)) + printf(" from"); + if ((cmd->len & F_OR) && !or_block) + printf(" {"); + print_ip((ipfw_insn_ip *)cmd, + (flags & HAVE_OPTIONS) ? " src-ip" : ""); + flags |= HAVE_SRCIP; + break; + + case O_IP_DST: + case O_IP_DST_LOOKUP: + case O_IP_DST_MASK: + case O_IP_DST_ME: + case O_IP_DST_SET: + show_prerequisites(&flags, HAVE_PROTO|HAVE_SRCIP, 0); + if (!(flags & HAVE_DSTIP)) + printf(" to"); + if ((cmd->len & F_OR) && !or_block) + printf(" {"); + print_ip((ipfw_insn_ip *)cmd, + (flags & HAVE_OPTIONS) ? 
" dst-ip" : ""); + flags |= HAVE_DSTIP; + break; + + case O_IP6_SRC: + case O_IP6_SRC_MASK: + case O_IP6_SRC_ME: + show_prerequisites(&flags, HAVE_PROTO, 0); + if (!(flags & HAVE_SRCIP)) + printf(" from"); + if ((cmd->len & F_OR) && !or_block) + printf(" {"); + print_ip6((ipfw_insn_ip6 *)cmd, + (flags & HAVE_OPTIONS) ? " src-ip6" : ""); + flags |= HAVE_SRCIP | HAVE_PROTO; + break; + + case O_IP6_DST: + case O_IP6_DST_MASK: + case O_IP6_DST_ME: + show_prerequisites(&flags, HAVE_PROTO|HAVE_SRCIP, 0); + if (!(flags & HAVE_DSTIP)) + printf(" to"); + if ((cmd->len & F_OR) && !or_block) + printf(" {"); + print_ip6((ipfw_insn_ip6 *)cmd, + (flags & HAVE_OPTIONS) ? " dst-ip6" : ""); + flags |= HAVE_DSTIP; + break; + + case O_FLOW6ID: + print_flow6id( (ipfw_insn_u32 *) cmd ); + flags |= HAVE_OPTIONS; + break; + + case O_IP_DSTPORT: + show_prerequisites(&flags, + HAVE_PROTO | HAVE_SRCIP | + HAVE_DSTIP | HAVE_IP, 0); + case O_IP_SRCPORT: + show_prerequisites(&flags, + HAVE_PROTO | HAVE_SRCIP, 0); + if ((cmd->len & F_OR) && !or_block) + printf(" {"); + if (cmd->len & F_NOT) + printf(" not"); + print_newports((ipfw_insn_u16 *)cmd, proto, + (flags & HAVE_OPTIONS) ? cmd->opcode : 0); + break; + + case O_PROTO: { + struct protoent *pe = NULL; + + if ((cmd->len & F_OR) && !or_block) + printf(" {"); + if (cmd->len & F_NOT) + printf(" not"); + proto = cmd->arg1; + pe = getprotobynumber(cmd->arg1); + if ((flags & (HAVE_PROTO4 | HAVE_PROTO6)) && + !(flags & HAVE_PROTO)) + show_prerequisites(&flags, + HAVE_PROTO | HAVE_IP | HAVE_SRCIP | + HAVE_DSTIP | HAVE_OPTIONS, 0); + if (flags & HAVE_OPTIONS) + printf(" proto"); + if (pe) + printf(" %s", pe->p_name); + else + printf(" %u", cmd->arg1); + } + flags |= HAVE_PROTO; + break; + + default: /*options ... 
*/ + if (!(cmd->len & (F_OR|F_NOT))) + if (((cmd->opcode == O_IP6) && + (flags & HAVE_PROTO6)) || + ((cmd->opcode == O_IP4) && + (flags & HAVE_PROTO4))) + break; + show_prerequisites(&flags, HAVE_PROTO | HAVE_SRCIP | + HAVE_DSTIP | HAVE_IP | HAVE_OPTIONS, 0); + if ((cmd->len & F_OR) && !or_block) + printf(" {"); + if (cmd->len & F_NOT && cmd->opcode != O_IN) + printf(" not"); + switch(cmd->opcode) { + case O_MACADDR2: { + ipfw_insn_mac *m = (ipfw_insn_mac *)cmd; + + printf(" MAC"); + print_mac(m->addr, m->mask); + print_mac(m->addr + 6, m->mask + 6); + } + break; + + case O_MAC_TYPE: + print_newports((ipfw_insn_u16 *)cmd, + IPPROTO_ETHERTYPE, cmd->opcode); + break; + + + case O_FRAG: + printf(" frag"); + break; + + case O_FIB: + printf(" fib %u", cmd->arg1 ); + break; + + case O_IN: + printf(cmd->len & F_NOT ? " out" : " in"); + break; + + case O_DIVERTED: + switch (cmd->arg1) { + case 3: + printf(" diverted"); + break; + case 1: + printf(" diverted-loopback"); + break; + case 2: + printf(" diverted-output"); + break; + default: + printf(" diverted-?<%u>", cmd->arg1); + break; + } + break; + + case O_LAYER2: + printf(" layer2"); + break; + case O_XMIT: + case O_RECV: + case O_VIA: + { + char const *s; + ipfw_insn_if *cmdif = (ipfw_insn_if *)cmd; + + if (cmd->opcode == O_XMIT) + s = "xmit"; + else if (cmd->opcode == O_RECV) + s = "recv"; + else /* if (cmd->opcode == O_VIA) */ + s = "via"; + if (cmdif->name[0] == '\0') + printf(" %s %s", s, + inet_ntoa(cmdif->p.ip)); + else + printf(" %s %s", s, cmdif->name); + + break; + } + case O_IPID: + if (F_LEN(cmd) == 1) + printf(" ipid %u", cmd->arg1 ); + else + print_newports((ipfw_insn_u16 *)cmd, 0, + O_IPID); + break; + + case O_IPTTL: + if (F_LEN(cmd) == 1) + printf(" ipttl %u", cmd->arg1 ); + else + print_newports((ipfw_insn_u16 *)cmd, 0, + O_IPTTL); + break; + + case O_IPVER: + printf(" ipver %u", cmd->arg1 ); + break; + + case O_IPPRECEDENCE: + printf(" ipprecedence %u", (cmd->arg1) >> 5 ); + break; + + case O_IPLEN: + 
if (F_LEN(cmd) == 1) + printf(" iplen %u", cmd->arg1 ); + else + print_newports((ipfw_insn_u16 *)cmd, 0, + O_IPLEN); + break; + + case O_IPOPT: + print_flags("ipoptions", cmd, f_ipopts); + break; + + case O_IPTOS: + print_flags("iptos", cmd, f_iptos); + break; + + case O_ICMPTYPE: + print_icmptypes((ipfw_insn_u32 *)cmd); + break; + + case O_ESTAB: + printf(" established"); + break; + + case O_TCPDATALEN: + if (F_LEN(cmd) == 1) + printf(" tcpdatalen %u", cmd->arg1 ); + else + print_newports((ipfw_insn_u16 *)cmd, 0, + O_TCPDATALEN); + break; + + case O_TCPFLAGS: + print_flags("tcpflags", cmd, f_tcpflags); + break; + + case O_TCPOPTS: + print_flags("tcpoptions", cmd, f_tcpopts); + break; + + case O_TCPWIN: + printf(" tcpwin %d", ntohs(cmd->arg1)); + break; + + case O_TCPACK: + printf(" tcpack %d", ntohl(cmd32->d[0])); + break; + + case O_TCPSEQ: + printf(" tcpseq %d", ntohl(cmd32->d[0])); + break; + + case O_UID: + { + struct passwd *pwd = getpwuid(cmd32->d[0]); + + if (pwd) + printf(" uid %s", pwd->pw_name); + else + printf(" uid %u", cmd32->d[0]); + } + break; + + case O_GID: + { + struct group *grp = getgrgid(cmd32->d[0]); + + if (grp) + printf(" gid %s", grp->gr_name); + else + printf(" gid %u", cmd32->d[0]); + } + break; + + case O_JAIL: + printf(" jail %d", cmd32->d[0]); + break; + + case O_VERREVPATH: + printf(" verrevpath"); + break; + + case O_VERSRCREACH: + printf(" versrcreach"); + break; + + case O_ANTISPOOF: + printf(" antispoof"); + break; + + case O_IPSEC: + printf(" ipsec"); + break; + + case O_NOP: + comment = (char *)(cmd + 1); + break; + + case O_KEEP_STATE: + printf(" keep-state"); + break; + + case O_LIMIT: { + struct _s_x *p = limit_masks; + ipfw_insn_limit *c = (ipfw_insn_limit *)cmd; + uint8_t x = c->limit_mask; + char const *comma = " "; + + printf(" limit"); + for (; p->x != 0 ; p++) + if ((x & p->x) == p->x) { + x &= ~p->x; + printf("%s%s", comma, p->s); + comma = ","; + } + PRINT_UINT_ARG(" ", c->conn_limit); + break; + } + + case O_IP6: + 
printf(" ip6"); + break; + + case O_IP4: + printf(" ip4"); + break; + + case O_ICMP6TYPE: + print_icmp6types((ipfw_insn_u32 *)cmd); + break; + + case O_EXT_HDR: + print_ext6hdr( (ipfw_insn *) cmd ); + break; + + case O_TAGGED: + if (F_LEN(cmd) == 1) + PRINT_UINT_ARG(" tagged ", cmd->arg1); + else + print_newports((ipfw_insn_u16 *)cmd, 0, + O_TAGGED); + break; + + default: + printf(" [opcode %d len %d]", + cmd->opcode, cmd->len); + } + } + if (cmd->len & F_OR) { + printf(" or"); + or_block = 1; + } else if (or_block) { + printf(" }"); + or_block = 0; + } + } + show_prerequisites(&flags, HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP + | HAVE_IP, 0); + if (comment) + printf(" // %s", comment); + printf("\n"); +} + +static void +show_dyn_ipfw(ipfw_dyn_rule *d, int pcwidth, int bcwidth) +{ + struct protoent *pe; + struct in_addr a; + uint16_t rulenum; + char buf[INET6_ADDRSTRLEN]; + + if (!co.do_expired) { + if (!d->expire && !(d->dyn_type == O_LIMIT_PARENT)) + return; + } + bcopy(&d->rule, &rulenum, sizeof(rulenum)); + printf("%05d", rulenum); + if (pcwidth > 0 || bcwidth > 0) { + printf(" "); + pr_u64(&d->pcnt, pcwidth); + pr_u64(&d->bcnt, bcwidth); + printf("(%ds)", d->expire); + } + switch (d->dyn_type) { + case O_LIMIT_PARENT: + printf(" PARENT %d", d->count); + break; + case O_LIMIT: + printf(" LIMIT"); + break; + case O_KEEP_STATE: /* bidir, no mask */ + printf(" STATE"); + break; + } + + if ((pe = getprotobynumber(d->id.proto)) != NULL) + printf(" %s", pe->p_name); + else + printf(" proto %u", d->id.proto); + + if (d->id.addr_type == 4) { + a.s_addr = htonl(d->id.src_ip); + printf(" %s %d", inet_ntoa(a), d->id.src_port); + + a.s_addr = htonl(d->id.dst_ip); + printf(" <-> %s %d", inet_ntoa(a), d->id.dst_port); + } else if (d->id.addr_type == 6) { + printf(" %s %d", inet_ntop(AF_INET6, &d->id.src_ip6, buf, + sizeof(buf)), d->id.src_port); + printf(" <-> %s %d", inet_ntop(AF_INET6, &d->id.dst_ip6, buf, + sizeof(buf)), d->id.dst_port); + } else + printf(" UNKNOWN <-> 
UNKNOWN\n"); + + printf("\n"); +} + +/* + * This one handles all set-related commands + * ipfw set { show | enable | disable } + * ipfw set swap X Y + * ipfw set move X to Y + * ipfw set move rule X to Y + */ +void +ipfw_sets_handler(char *av[]) +{ + uint32_t set_disable, masks[2]; + int i, nbytes; + uint16_t rulenum; + uint8_t cmd, new_set; + + av++; + + if (av[0] == NULL) + errx(EX_USAGE, "set needs command"); + if (_substrcmp(*av, "show") == 0) { + void *data = NULL; + char const *msg; + int nalloc; + + nalloc = nbytes = sizeof(struct ip_fw); + while (nbytes >= nalloc) { + if (data) + free(data); + nalloc = nalloc * 2 + 200; + nbytes = nalloc; + data = safe_calloc(1, nbytes); + if (do_cmd(IP_FW_GET, data, (uintptr_t)&nbytes) < 0) + err(EX_OSERR, "getsockopt(IP_FW_GET)"); + } + + bcopy(&((struct ip_fw *)data)->next_rule, + &set_disable, sizeof(set_disable)); + + for (i = 0, msg = "disable" ; i < RESVD_SET; i++) + if ((set_disable & (1< RESVD_SET) + errx(EX_DATAERR, "invalid set number %s\n", av[0]); + if (!isdigit(*(av[1])) || new_set > RESVD_SET) + errx(EX_DATAERR, "invalid set number %s\n", av[1]); + masks[0] = (4 << 24) | (new_set << 16) | (rulenum); + i = do_cmd(IP_FW_DEL, masks, sizeof(uint32_t)); + } else if (_substrcmp(*av, "move") == 0) { + av++; + if (av[0] && _substrcmp(*av, "rule") == 0) { + cmd = 2; + av++; + } else + cmd = 3; + if (av[0] == NULL || av[1] == NULL || av[2] == NULL || + av[3] != NULL || _substrcmp(av[1], "to") != 0) + errx(EX_USAGE, "syntax: set move [rule] X to Y\n"); + rulenum = atoi(av[0]); + new_set = atoi(av[2]); + if (!isdigit(*(av[0])) || (cmd == 3 && rulenum > RESVD_SET) || + (cmd == 2 && rulenum == IPFW_DEFAULT_RULE) ) + errx(EX_DATAERR, "invalid source number %s\n", av[0]); + if (!isdigit(*(av[2])) || new_set > RESVD_SET) + errx(EX_DATAERR, "invalid dest. 
set %s\n", av[1]); + masks[0] = (cmd << 24) | (new_set << 16) | (rulenum); + i = do_cmd(IP_FW_DEL, masks, sizeof(uint32_t)); + } else if (_substrcmp(*av, "disable") == 0 || + _substrcmp(*av, "enable") == 0 ) { + int which = _substrcmp(*av, "enable") == 0 ? 1 : 0; + + av++; + masks[0] = masks[1] = 0; + + while (av[0]) { + if (isdigit(**av)) { + i = atoi(*av); + if (i < 0 || i > RESVD_SET) + errx(EX_DATAERR, + "invalid set number %d\n", i); + masks[which] |= (1<= nalloc) { + nalloc = nalloc * 2 + 200; + nbytes = nalloc; + data = safe_realloc(data, nbytes); + if (do_cmd(ocmd, data, (uintptr_t)&nbytes) < 0) + err(EX_OSERR, "getsockopt(IP_%s_GET)", + co.do_pipe ? "DUMMYNET" : "FW"); + } + + /* + * Count static rules. They have variable size so we + * need to scan the list to count them. + */ + for (nstat = 1, r = data, lim = (char *)data + nbytes; + r->rulenum < IPFW_DEFAULT_RULE && (char *)r < lim; + ++nstat, r = NEXT(r) ) + ; /* nothing */ + + /* + * Count dynamic rules. This is easier as they have + * fixed size. 
+ */ + r = NEXT(r); + dynrules = (ipfw_dyn_rule *)r ; + n = (char *)r - (char *)data; + ndyn = (nbytes - n) / sizeof *dynrules; + + /* if showing stats, figure out column widths ahead of time */ + bcwidth = pcwidth = 0; + if (show_counters) { + for (n = 0, r = data; n < nstat; n++, r = NEXT(r)) { + /* skip rules from another set */ + if (co.use_set && r->set != co.use_set - 1) + continue; + + /* packet counter */ + width = pr_u64(&r->pcnt, 0); + if (width > pcwidth) + pcwidth = width; + + /* byte counter */ + width = pr_u64(&r->bcnt, 0); + if (width > bcwidth) + bcwidth = width; + } + } + if (co.do_dynamic && ndyn) { + for (n = 0, d = dynrules; n < ndyn; n++, d++) { + if (co.use_set) { + /* skip rules from another set */ + bcopy((char *)&d->rule + sizeof(uint16_t), + &set, sizeof(uint8_t)); + if (set != co.use_set - 1) + continue; + } + width = pr_u64(&d->pcnt, 0); + if (width > pcwidth) + pcwidth = width; + + width = pr_u64(&d->bcnt, 0); + if (width > bcwidth) + bcwidth = width; + } + } + /* if no rule numbers were specified, list all rules */ + if (ac == 0) { + for (n = 0, r = data; n < nstat; n++, r = NEXT(r)) { + if (co.use_set && r->set != co.use_set - 1) + continue; + show_ipfw(r, pcwidth, bcwidth); + } + + if (co.do_dynamic && ndyn) { + printf("## Dynamic rules (%d):\n", ndyn); + for (n = 0, d = dynrules; n < ndyn; n++, d++) { + if (co.use_set) { + bcopy((char *)&d->rule + sizeof(uint16_t), + &set, sizeof(uint8_t)); + if (set != co.use_set - 1) + continue; + } + show_dyn_ipfw(d, pcwidth, bcwidth); + } + } + goto done; + } + + /* display specific rules requested on command line */ + + for (lac = ac, lav = av; lac != 0; lac--) { + /* convert command line rule # */ + last = rnum = strtoul(*lav++, &endptr, 10); + if (*endptr == '-') + last = strtoul(endptr+1, &endptr, 10); + if (*endptr) { + exitval = EX_USAGE; + warnx("invalid rule number: %s", *(lav - 1)); + continue; + } + for (n = seen = 0, r = data; n < nstat; n++, r = NEXT(r) ) { + if (r->rulenum > last) + 
break; + if (co.use_set && r->set != co.use_set - 1) + continue; + if (r->rulenum >= rnum && r->rulenum <= last) { + show_ipfw(r, pcwidth, bcwidth); + seen = 1; + } + } + if (!seen) { + /* give precedence to other error(s) */ + if (exitval == EX_OK) + exitval = EX_UNAVAILABLE; + warnx("rule %lu does not exist", rnum); + } + } + + if (co.do_dynamic && ndyn) { + printf("## Dynamic rules:\n"); + for (lac = ac, lav = av; lac != 0; lac--) { + last = rnum = strtoul(*lav++, &endptr, 10); + if (*endptr == '-') + last = strtoul(endptr+1, &endptr, 10); + if (*endptr) + /* already warned */ + continue; + for (n = 0, d = dynrules; n < ndyn; n++, d++) { + uint16_t rulenum; + + bcopy(&d->rule, &rulenum, sizeof(rulenum)); + if (rulenum > rnum) + break; + if (co.use_set) { + bcopy((char *)&d->rule + sizeof(uint16_t), + &set, sizeof(uint8_t)); + if (set != co.use_set - 1) + continue; + } + if (r->rulenum >= rnum && r->rulenum <= last) + show_dyn_ipfw(d, pcwidth, bcwidth); + } + } + } + + ac = 0; + +done: + free(data); + + if (exitval != EX_OK) + exit(exitval); +#undef NEXT +} + +static int +lookup_host (char *host, struct in_addr *ipaddr) +{ + struct hostent *he; + + if (!inet_aton(host, ipaddr)) { + if ((he = gethostbyname(host)) == NULL) + return(-1); + *ipaddr = *(struct in_addr *)he->h_addr_list[0]; + } + return(0); +} + +/* + * fills the addr and mask fields in the instruction as appropriate from av. + * Update length as appropriate. + * The following formats are allowed: + * me returns O_IP_*_ME + * 1.2.3.4 single IP address + * 1.2.3.4:5.6.7.8 address:mask + * 1.2.3.4/24 address/mask + * 1.2.3.4/26{1,6,5,4,23} set of addresses in a subnet + * We can have multiple comma-separated address/mask entries. 
+ */
+static void
+fill_ip(ipfw_insn_ip *cmd, char *av)
+{
+	int len = 0;	/* 32-bit words used so far by addr/mask pairs */
+	uint32_t *d = ((ipfw_insn_u32 *)cmd)->d;
+
+	/*
+	 * av is modified in place while parsing: separators are
+	 * temporarily replaced by NUL and restored afterwards.
+	 */
+	cmd->o.len &= ~F_LEN_MASK;	/* zero len */
+
+	if (_substrcmp(av, "any") == 0)
+		return;
+
+	if (_substrcmp(av, "me") == 0) {
+		cmd->o.len |= F_INSN_SIZE(ipfw_insn);
+		return;
+	}
+
+	if (strncmp(av, "table(", 6) == 0) {
+		char *p = strchr(av + 6, ',');
+
+		if (p)
+			*p++ = '\0';
+		cmd->o.opcode = O_IP_DST_LOOKUP;
+		cmd->o.arg1 = strtoul(av + 6, NULL, 0);
+		if (p) {
+			cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32);
+			d[0] = strtoul(p, NULL, 0);
+		} else
+			cmd->o.len |= F_INSN_SIZE(ipfw_insn);
+		return;
+	}
+
+	while (av) {
+		/*
+		 * After the address we can have '/' or ':' indicating a mask,
+		 * ',' indicating another address follows, '{' indicating a
+		 * set of addresses of unspecified size.
+		 */
+		char *t = NULL, *p = strpbrk(av, "/:,{");
+		int masklen;
+		char md, nd = '\0';
+
+		if (p) {
+			md = *p;
+			*p++ = '\0';
+			if ((t = strpbrk(p, ",{")) != NULL) {
+				nd = *t;
+				*t = '\0';
+			}
+		} else
+			md = '\0';
+
+		if (lookup_host(av, (struct in_addr *)&d[0]) != 0)
+			errx(EX_NOHOST, "hostname ``%s'' unknown", av);
+		switch (md) {
+		case ':':
+			if (!inet_aton(p, (struct in_addr *)&d[1]))
+				errx(EX_DATAERR, "bad netmask ``%s''", p);
+			break;
+		case '/':
+			masklen = atoi(p);
+			if (masklen == 0)
+				d[1] = htonl(0);	/* mask */
+			else if (masklen < 0 || masklen > 32)
+				/*
+				 * A negative width would make the shift count
+				 * below exceed 31, which is undefined.
+				 */
+				errx(EX_DATAERR, "bad width ``%s''", p);
+			else
+				d[1] = htonl(~0U << (32 - masklen));
+			break;
+		case '{':	/* no mask, assume /24 and put back the '{' */
+			d[1] = htonl(~0U << (32 - 24));
+			*(--p) = md;
+			break;
+
+		case ',':	/* single address plus continuation */
+			*(--p) = md;
+			/* FALLTHROUGH */
+		case 0:	/* initialization value */
+		default:
+			d[1] = htonl(~0U);	/* force /32 */
+			break;
+		}
+		d[0] &= d[1];		/* mask base address with mask */
+		if (t)
+			*t = nd;
+		/* find next separator */
+		if (p)
+			p = strpbrk(p, ",{");
+		if (p && *p == '{') {
+			/*
+			 * We have a set of addresses. They are stored as follows:
+			 *   arg1	is the set size (powers of 2, 2..256)
+			 *   addr	is the base address IN HOST FORMAT
+			 *   mask..	is an array of arg1 bits (rounded up to
+			 *		the next multiple of 32) with bits set
+			 *		for each host in the map.
+			 */
+			uint32_t *map = (uint32_t *)&cmd->mask;
+			int low, high;
+			/* the prefix length is read before the map is cleared below */
+			int i = contigmask((uint8_t *)&(d[1]), 32);
+
+			if (len > 0)
+				errx(EX_DATAERR, "address set cannot be in a list");
+			if (i < 24 || i > 31)
+				errx(EX_DATAERR, "invalid set with mask %d\n", i);
+			cmd->o.arg1 = 1<<(32-i);	/* map length		*/
+			d[0] = ntohl(d[0]);		/* base addr in host format */
+			cmd->o.opcode = O_IP_DST_SET;	/* default */
+			cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32) + (cmd->o.arg1+31)/32;
+			for (i = 0; i < (cmd->o.arg1+31)/32 ; i++)
+				map[i] = 0;	/* clear map */
+
+			av = p + 1;
+			low = d[0] & 0xff;
+			high = low + cmd->o.arg1 - 1;
+			/*
+			 * Here, i stores the previous value when we specify a range
+			 * of addresses within a mask, e.g. 45-63. i = -1 means we
+			 * have no previous value.
+			 */
+			i = -1;	/* previous value in a range */
+			/* <ctype.h> functions need an unsigned char value */
+			while (isdigit((unsigned char)*av)) {
+				char *s;
+				int a = strtol(av, &s, 0);
+
+				if (s == av) { /* no parameter */
+				    if (*av != '}')
+					errx(EX_DATAERR, "set not closed\n");
+				    if (i != -1)
+					errx(EX_DATAERR, "incomplete range %d-", i);
+				    break;
+				}
+				if (a < low || a > high)
+				    errx(EX_DATAERR, "addr %d out of range [%d-%d]\n",
+					a, low, high);
+				a -= low;
+				if (i == -1)	/* no previous in range */
+				    i = a;
+				else {		/* check that range is valid */
+				    if (i > a)
+					errx(EX_DATAERR, "invalid range %d-%d",
+						i+low, a+low);
+				    if (*s == '-')
+					errx(EX_DATAERR, "double '-' in range");
+				}
+				/* unsigned 1 so that bit 31 can be set without overflow */
+				for (; i <= a; i++)
+				    map[i/32] |= 1U<<(i & 31);
+				i = -1;
+				if (*s == '-')
+				    i = a;
+				else if (*s == '}')
+				    break;
+				av = s+1;
+			}
+			return;
+		}
+		av = p;
+		if (av)			/* then *av must be a ',' */
+			av++;
+
+		/* Check this entry */
+		if (d[1] == 0) { /* "any", specified as x.x.x.x/0 */
+			/*
+			 * 'any' turns the entire list into a NOP.
+			 * 'not any' never matches, so it is removed from the
+			 * list unless it is the only item, in which case we
+			 * report an error.
+			 */
+			if (cmd->o.len & F_NOT) {	/* "not any" never matches */
+				if (av == NULL && len == 0) /* only this entry */
+					errx(EX_DATAERR, "not any never matches");
+			}
+			/* else do nothing and skip this entry */
+			return;
+		}
+		/* A single IP can be stored in an optimized format */
+		if (d[1] == (uint32_t)~0 && av == NULL && len == 0) {
+			cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32);
+			return;
+		}
+		len += 2;	/* two words... */
+		d += 2;
+	} /* end while */
+	if (len + 1 > F_LEN_MASK)
+		errx(EX_DATAERR, "address list too long");
+	cmd->o.len |= len+1;
+}
+
+
+/*
+ * n2mask sets the first n bits of the mask, starting from the most
+ * significant bit of the first byte, and clears all the others.
+ * Values of n larger than the number of bits in a struct in6_addr
+ * are treated as "all bits set"; n <= 0 yields an all-zero mask.
+ */
+void
+n2mask(struct in6_addr *mask, int n)
+{
+	static int minimask[9] =
+	    { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff };
+	u_char *p;
+
+	memset(mask, 0, sizeof(struct in6_addr));
+	/* never write beyond the end of the mask */
+	if (n > (int)(sizeof(struct in6_addr) * 8))
+		n = (int)(sizeof(struct in6_addr) * 8);
+	p = (u_char *) mask;
+	for (; n > 0; p++, n -= 8) {
+		if (n >= 8)
+			*p = 0xff;
+		else
+			*p = minimask[n];
+	}
+	return;
+}
+
+
+/*
+ * helper function to process a set of flags and set bits in the
+ * appropriate masks.
+ */
+static void
+fill_flags(ipfw_insn *cmd, enum ipfw_opcodes opcode,
+	struct _s_x *flags, char *p)
+{
+	uint8_t set=0, clear=0;
+
+	/*
+	 * p is a comma-separated list of flag names, modified in place.
+	 * A name prefixed by '!' goes into the 'clear' mask, any other
+	 * name into the 'set' mask.
+	 */
+	while (p && *p) {
+		char *q;	/* points to the separator */
+		int val;
+		uint8_t *which;	/* mask we are working on */
+
+		if (*p == '!') {
+			p++;
+			which = &clear;
+		} else
+			which = &set;
+		q = strchr(p, ',');
+		if (q)
+			*q++ = '\0';
+		val = match_token(flags, p);
+		if (val <= 0)
+			errx(EX_DATAERR, "invalid flag %s", p);
+		*which |= (uint8_t)val;
+		p = q;
+	}
+	cmd->opcode = opcode;
+	cmd->len =  (cmd->len & (F_NOT | F_OR)) | 1;
+	/* low byte: bits that must be set; high byte: bits that must be clear */
+	cmd->arg1 = (set & 0xff) | ( (clear & 0xff) << 8);
+}
+
+
+/*
+ * Handles the "delete" command: every leading numeric argument is
+ * deleted in turn, as a NAT instance (co.do_nat), through
+ * ipfw_delete_pipe() (co.do_pipe), or with IP_FW_DEL otherwise.
+ * An optional leading "set" keyword makes the numbers refer to sets.
+ * All arguments are processed even after a failure; the process then
+ * exits with a non-zero status if any of the deletions failed.
+ */
+void
+ipfw_delete(char *av[])
+{
+	uint32_t rulenum;
+	int i;
+	int exitval = EX_OK;
+	int do_set = 0;
+
+	av++;
+	NEED1("missing rule specification");
+	if ( *av && _substrcmp(*av, "set") == 0) {
+		/* Do not allow using the following syntax:
+		 *	ipfw set N delete set M
+		 */
+		if (co.use_set)
+			errx(EX_DATAERR, "invalid syntax");
+		do_set = 1;	/* delete set */
+		av++;
+	}
+
+	/* Rule number */
+	while (*av && isdigit((unsigned char)**av)) {
+		i = atoi(*av); av++;
+		if (co.do_nat) {
+			/* do not let a later success hide an earlier failure */
+			if (do_cmd(IP_FW_NAT_DEL, &i, sizeof i) != 0) {
+				exitval = EX_UNAVAILABLE;
+				warn("rule %u not available", (unsigned)i);
+			}
+		} else if (co.do_pipe) {
+			int rc = ipfw_delete_pipe(co.do_pipe, i);
+
+			if (rc != 0)
+				exitval = rc;
+		} else {
+			/*
+			 * The request word carries the number in bits 0..15,
+			 * a set number in bits 16..23 and a command code in
+			 * bits 24 and up.
+			 */
+			if (co.use_set)
+				rulenum = (i & 0xffff) | (5 << 24) |
+				    ((co.use_set - 1) << 16);
+			else
+				rulenum =  (i & 0xffff) | (do_set << 24);
+			i = do_cmd(IP_FW_DEL, &rulenum, sizeof rulenum);
+			if (i) {
+				exitval = EX_UNAVAILABLE;
+				/* report the number given by the user, not the encoded word */
+				warn("rule %u: setsockopt(IP_FW_DEL)",
+				    (unsigned)(rulenum & 0xffff));
+			}
+		}
+	}
+	if (exitval != EX_OK)
+		exit(exitval);
+}
+
+
+/*
+ * fill the interface structure. We do not check the name as we can
+ * create interfaces dynamically, so checking them at insert time
+ * makes relatively little sense.
+ * Interface names containing '*', '?', or '[' are assumed to be shell
+ * patterns which match interfaces.
+ */
+static void
+fill_iface(ipfw_insn_if *cmd, char *arg)
+{
+	cmd->name[0] = '\0';
+	cmd->o.len |= F_INSN_SIZE(ipfw_insn_if);
+
+	/* Parse the interface or address */
+	if (strcmp(arg, "any") == 0)
+		cmd->o.len = 0;		/* effectively ignore this command */
+	else if (!isdigit((unsigned char)*arg)) {
+		/* anything not starting with a digit is an interface name */
+		strlcpy(cmd->name, arg, sizeof(cmd->name));
+		cmd->p.glob = strpbrk(arg, "*?[") != NULL ? 1 : 0;
+	} else if (!inet_aton(arg, &cmd->p.ip))
+		errx(EX_DATAERR, "bad ip address ``%s''", arg);
+}
+
+/*
+ * Parses a MAC address specification into addr[] and mask[], both
+ * ETHER_ADDR_LEN bytes long. Accepted forms are:
+ *	any			address and mask all zero
+ *	xx:xx:xx:xx:xx:xx	single address, mask all ones
+ *	address/len		mask made of the first len bits (0..48)
+ *	address&mask		explicit mask in MAC notation
+ * On return addr[] has already been ANDed with mask[].
+ * Malformed input terminates the program through errx().
+ */
+static void
+get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask)
+{
+	int i;
+	size_t l;
+	char *ap, *ptr, *optr;
+	struct ether_addr *mac;
+	const char *macset = "0123456789abcdefABCDEF:";
+
+	if (strcmp(p, "any") == 0) {
+		for (i = 0; i < ETHER_ADDR_LEN; i++)
+			addr[i] = mask[i] = 0;
+		return;
+	}
+
+	/* work on a private copy, strsep() modifies its argument */
+	optr = ptr = strdup(p);
+	if (optr == NULL)
+		err(EX_OSERR, "strdup");
+	if ((ap = strsep(&ptr, "&/")) != NULL && *ap != 0) {
+		l = strlen(ap);
+		if (strspn(ap, macset) != l || (mac = ether_aton(ap)) == NULL)
+			errx(EX_DATAERR, "Incorrect MAC address");
+		bcopy(mac, addr, ETHER_ADDR_LEN);
+	} else
+		errx(EX_DATAERR, "Incorrect MAC address");
+
+	if (ptr != NULL) { /* we have mask? */
+		/* the separator is at the same offset in p as in the copy */
+		if (p[ptr - optr - 1] == '/') { /* mask len */
+			long ml = strtol(ptr, &ap, 10);
+			if (ap == ptr || *ap != 0 ||
+			    ml > ETHER_ADDR_LEN * 8 || ml < 0)
+				errx(EX_DATAERR, "Incorrect mask length");
+			/* bytes beyond the prefix must be zero, not left as found */
+			memset(mask, 0, ETHER_ADDR_LEN);
+			for (i = 0; ml > 0 && i < ETHER_ADDR_LEN; ml -= 8, i++)
+				mask[i] = (ml >= 8) ? 0xff :
+				    (uint8_t)(0xff << (8 - ml));
+		} else { /* mask */
+			l = strlen(ptr);
+			if (strspn(ptr, macset) != l ||
+			    (mac = ether_aton(ptr)) == NULL)
+				errx(EX_DATAERR, "Incorrect mask");
+			bcopy(mac, mask, ETHER_ADDR_LEN);
+		}
+	} else { /* default mask: ff:ff:ff:ff:ff:ff */
+		for (i = 0; i < ETHER_ADDR_LEN; i++)
+			mask[i] = 0xff;
+	}
+	for (i = 0; i < ETHER_ADDR_LEN; i++)
+		addr[i] &= mask[i];
+
+	free(optr);
+}
+
+/*
+ * helper function, updates the pointer to cmd with the length
+ * of the current command, and also cleans up the first word of
+ * the new command in case it has been clobbered before.
+ */
+static ipfw_insn *
+next_cmd(ipfw_insn *cmd)
+{
+	ipfw_insn *following = cmd + F_LEN(cmd);
+
+	/* only the first word of the next slot is wiped */
+	bzero(following, sizeof(*following));
+	return following;
+}
+
+/*
+ * Store the remaining words of av[] as a comment: an O_NOP
+ * instruction followed by the words joined with single blanks.
+ */
+static void
+fill_comment(ipfw_insn *cmd, char **av)
+{
+	char **word;
+	char *out = (char *)(cmd + 1);	/* text starts after the header */
+	int total = 0;
+
+	cmd->opcode = O_NOP;
+	cmd->len =  (cmd->len & (F_NOT | F_OR));
+
+	/* each word needs its length plus one separator/terminator */
+	for (word = av; *word != NULL; word++)
+		total += strlen(*word) + 1;
+	if (total == 0)
+		return;
+	if (total > 84)
+		errx(EX_DATAERR,
+		    "comment too long (max 80 chars)");
+
+	/* one header word plus the text rounded up to 32-bit words */
+	total = 1 + (total + 3) / 4;
+	cmd->len =  (cmd->len & (F_NOT | F_OR)) | total;
+
+	for (word = av; *word != NULL; word++) {
+		size_t n = strlen(*word);
+
+		memcpy(out, *word, n);
+		out[n] = ' ';
+		out += n + 1;
+	}
+	/* the blank written after the last word becomes the terminator */
+	out[-1] = '\0';
+}
+
+/*
+ * Build a one-word instruction.
+ * F_NOT/F_OR coming from the old length or from 'flags' are retained.
+ */
+static void
+fill_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, int flags, uint16_t arg)
+{
+	int kept = (cmd->len | flags) & (F_NOT | F_OR);
+
+	cmd->arg1 = arg;
+	cmd->len =  kept | 1;
+	cmd->opcode = opcode;
+}
+
+/*
+ * Fetch and add the MAC address and type, with masks. This generates one or
+ * two microinstructions, and returns the pointer to the last one.
+ */ +static ipfw_insn * +add_mac(ipfw_insn *cmd, char *av[]) +{ + ipfw_insn_mac *mac; + + if ( ( av[0] == NULL ) || ( av[1] == NULL ) ) + errx(EX_DATAERR, "MAC dst src"); + + cmd->opcode = O_MACADDR2; + cmd->len = (cmd->len & (F_NOT | F_OR)) | F_INSN_SIZE(ipfw_insn_mac); + + mac = (ipfw_insn_mac *)cmd; + get_mac_addr_mask(av[0], mac->addr, mac->mask); /* dst */ + get_mac_addr_mask(av[1], &(mac->addr[ETHER_ADDR_LEN]), + &(mac->mask[ETHER_ADDR_LEN])); /* src */ + return cmd; +} + +static ipfw_insn * +add_mactype(ipfw_insn *cmd, char *av) +{ + if (!av) + errx(EX_DATAERR, "missing MAC type"); + if (strcmp(av, "any") != 0) { /* we have a non-null type */ + fill_newports((ipfw_insn_u16 *)cmd, av, IPPROTO_ETHERTYPE); + cmd->opcode = O_MAC_TYPE; + return cmd; + } else + return NULL; +} + +static ipfw_insn * +add_proto0(ipfw_insn *cmd, char *av, u_char *protop) +{ + struct protoent *pe; + char *ep; + int proto; + + proto = strtol(av, &ep, 10); + if (*ep != '\0' || proto <= 0) { + if ((pe = getprotobyname(av)) == NULL) + return NULL; + proto = pe->p_proto; + } + + fill_cmd(cmd, O_PROTO, 0, proto); + *protop = proto; + return cmd; +} + +static ipfw_insn * +add_proto(ipfw_insn *cmd, char *av, u_char *protop) +{ + u_char proto = IPPROTO_IP; + + if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) + ; /* do not set O_IP4 nor O_IP6 */ + else if (strcmp(av, "ip4") == 0) + /* explicit "just IPv4" rule */ + fill_cmd(cmd, O_IP4, 0, 0); + else if (strcmp(av, "ip6") == 0) { + /* explicit "just IPv6" rule */ + proto = IPPROTO_IPV6; + fill_cmd(cmd, O_IP6, 0, 0); + } else + return add_proto0(cmd, av, protop); + + *protop = proto; + return cmd; +} + +static ipfw_insn * +add_proto_compat(ipfw_insn *cmd, char *av, u_char *protop) +{ + u_char proto = IPPROTO_IP; + + if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) + ; /* do not set O_IP4 nor O_IP6 */ + else if (strcmp(av, "ipv4") == 0 || strcmp(av, "ip4") == 0) + /* explicit "just IPv4" rule */ + fill_cmd(cmd, O_IP4, 0, 0); + 
else if (strcmp(av, "ipv6") == 0 || strcmp(av, "ip6") == 0) { + /* explicit "just IPv6" rule */ + proto = IPPROTO_IPV6; + fill_cmd(cmd, O_IP6, 0, 0); + } else + return add_proto0(cmd, av, protop); + + *protop = proto; + return cmd; +} + +static ipfw_insn * +add_srcip(ipfw_insn *cmd, char *av) +{ + fill_ip((ipfw_insn_ip *)cmd, av); + if (cmd->opcode == O_IP_DST_SET) /* set */ + cmd->opcode = O_IP_SRC_SET; + else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ + cmd->opcode = O_IP_SRC_LOOKUP; + else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) /* me */ + cmd->opcode = O_IP_SRC_ME; + else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) /* one IP */ + cmd->opcode = O_IP_SRC; + else /* addr/mask */ + cmd->opcode = O_IP_SRC_MASK; + return cmd; +} + +static ipfw_insn * +add_dstip(ipfw_insn *cmd, char *av) +{ + fill_ip((ipfw_insn_ip *)cmd, av); + if (cmd->opcode == O_IP_DST_SET) /* set */ + ; + else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ + ; + else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) /* me */ + cmd->opcode = O_IP_DST_ME; + else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) /* one IP */ + cmd->opcode = O_IP_DST; + else /* addr/mask */ + cmd->opcode = O_IP_DST_MASK; + return cmd; +} + +static ipfw_insn * +add_ports(ipfw_insn *cmd, char *av, u_char proto, int opcode) +{ + /* XXX "any" is trapped before. 
Perhaps "to" */ + if (_substrcmp(av, "any") == 0) { + return NULL; + } else if (fill_newports((ipfw_insn_u16 *)cmd, av, proto)) { + /* XXX todo: check that we have a protocol with ports */ + cmd->opcode = opcode; + return cmd; + } + return NULL; +} + +static ipfw_insn * +add_src(ipfw_insn *cmd, char *av, u_char proto) +{ + struct in6_addr a; + char *host, *ch; + ipfw_insn *ret = NULL; + + if ((host = strdup(av)) == NULL) + return NULL; + if ((ch = strrchr(host, '/')) != NULL) + *ch = '\0'; + + if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || + inet_pton(AF_INET6, host, &a) == 1) + ret = add_srcip6(cmd, av); + /* XXX: should check for IPv4, not !IPv6 */ + if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || + inet_pton(AF_INET6, host, &a) != 1)) + ret = add_srcip(cmd, av); + if (ret == NULL && strcmp(av, "any") != 0) + ret = cmd; + + free(host); + return ret; +} + +static ipfw_insn * +add_dst(ipfw_insn *cmd, char *av, u_char proto) +{ + struct in6_addr a; + char *host, *ch; + ipfw_insn *ret = NULL; + + if ((host = strdup(av)) == NULL) + return NULL; + if ((ch = strrchr(host, '/')) != NULL) + *ch = '\0'; + + if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || + inet_pton(AF_INET6, host, &a) == 1) + ret = add_dstip6(cmd, av); + /* XXX: should check for IPv4, not !IPv6 */ + if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || + inet_pton(AF_INET6, host, &a) != 1)) + ret = add_dstip(cmd, av); + if (ret == NULL && strcmp(av, "any") != 0) + ret = cmd; + + free(host); + return ret; +} + +/* + * Parse arguments and assemble the microinstructions which make up a rule. + * Rules are added into the 'rulebuf' and then copied in the correct order + * into the actual rule. + * + * The syntax for a rule starts with the action, followed by + * optional action parameters, and the various match patterns. 
+ * In the assembled microcode, the first opcode must be an O_PROBE_STATE + * (generated if the rule includes a keep-state option), then the + * various match patterns, log/altq actions, and the actual action. + * + */ +void +ipfw_add(char *av[]) +{ + /* + * rules are added into the 'rulebuf' and then copied in + * the correct order into the actual rule. + * Some things that need to go out of order (prob, action etc.) + * go into actbuf[]. + */ + static uint32_t rulebuf[255], actbuf[255], cmdbuf[255]; + + ipfw_insn *src, *dst, *cmd, *action, *prev=NULL; + ipfw_insn *first_cmd; /* first match pattern */ + + struct ip_fw *rule; + + /* + * various flags used to record that we entered some fields. + */ + ipfw_insn *have_state = NULL; /* check-state or keep-state */ + ipfw_insn *have_log = NULL, *have_altq = NULL, *have_tag = NULL; + size_t len; + + int i; + + int open_par = 0; /* open parenthesis ( */ + + /* proto is here because it is used to fetch ports */ + u_char proto = IPPROTO_IP; /* default protocol */ + + double match_prob = 1; /* match probability, default is always match */ + + bzero(actbuf, sizeof(actbuf)); /* actions go here */ + bzero(cmdbuf, sizeof(cmdbuf)); + bzero(rulebuf, sizeof(rulebuf)); + + rule = (struct ip_fw *)rulebuf; + cmd = (ipfw_insn *)cmdbuf; + action = (ipfw_insn *)actbuf; + + av++; + + /* [rule N] -- Rule number optional */ + if (av[0] && isdigit(**av)) { + rule->rulenum = atoi(*av); + av++; + } + + /* [set N] -- set number (0..RESVD_SET), optional */ + if (av[0] && av[1] && _substrcmp(*av, "set") == 0) { + int set = strtoul(av[1], NULL, 10); + if (set < 0 || set > RESVD_SET) + errx(EX_DATAERR, "illegal set %s", av[1]); + rule->set = set; + av += 2; + } + + /* [prob D] -- match probability, optional */ + if (av[0] && av[1] && _substrcmp(*av, "prob") == 0) { + match_prob = strtod(av[1], NULL); + + if (match_prob <= 0 || match_prob > 1) + errx(EX_DATAERR, "illegal match prob. 
%s", av[1]); + av += 2; + } + + /* action -- mandatory */ + NEED1("missing action"); + i = match_token(rule_actions, *av); + av++; + action->len = 1; /* default */ + switch(i) { + case TOK_CHECKSTATE: + have_state = action; + action->opcode = O_CHECK_STATE; + break; + + case TOK_ACCEPT: + action->opcode = O_ACCEPT; + break; + + case TOK_DENY: + action->opcode = O_DENY; + action->arg1 = 0; + break; + + case TOK_REJECT: + action->opcode = O_REJECT; + action->arg1 = ICMP_UNREACH_HOST; + break; + + case TOK_RESET: + action->opcode = O_REJECT; + action->arg1 = ICMP_REJECT_RST; + break; + + case TOK_RESET6: + action->opcode = O_UNREACH6; + action->arg1 = ICMP6_UNREACH_RST; + break; + + case TOK_UNREACH: + action->opcode = O_REJECT; + NEED1("missing reject code"); + fill_reject_code(&action->arg1, *av); + av++; + break; + + case TOK_UNREACH6: + action->opcode = O_UNREACH6; + NEED1("missing unreach code"); + fill_unreach6_code(&action->arg1, *av); + av++; + break; + + case TOK_COUNT: + action->opcode = O_COUNT; + break; + + case TOK_NAT: + action->opcode = O_NAT; + action->len = F_INSN_SIZE(ipfw_insn_nat); + goto chkarg; + + case TOK_QUEUE: + action->opcode = O_QUEUE; + goto chkarg; + case TOK_PIPE: + action->opcode = O_PIPE; + goto chkarg; + case TOK_SKIPTO: + action->opcode = O_SKIPTO; + goto chkarg; + case TOK_NETGRAPH: + action->opcode = O_NETGRAPH; + goto chkarg; + case TOK_NGTEE: + action->opcode = O_NGTEE; + goto chkarg; + case TOK_DIVERT: + action->opcode = O_DIVERT; + goto chkarg; + case TOK_TEE: + action->opcode = O_TEE; +chkarg: + if (!av[0]) + errx(EX_USAGE, "missing argument for %s", *(av - 1)); + if (isdigit(**av)) { + action->arg1 = strtoul(*av, NULL, 10); + if (action->arg1 <= 0 || action->arg1 >= IP_FW_TABLEARG) + errx(EX_DATAERR, "illegal argument for %s", + *(av - 1)); + } else if (_substrcmp(*av, "tablearg") == 0) { + action->arg1 = IP_FW_TABLEARG; + } else if (i == TOK_DIVERT || i == TOK_TEE) { + struct servent *s; + setservent(1); + s = 
getservbyname(av[0], "divert"); + if (s != NULL) + action->arg1 = ntohs(s->s_port); + else + errx(EX_DATAERR, "illegal divert/tee port"); + } else + errx(EX_DATAERR, "illegal argument for %s", *(av - 1)); + av++; + break; + + case TOK_FORWARD: { + ipfw_insn_sa *p = (ipfw_insn_sa *)action; + char *s, *end; + + NEED1("missing forward address[:port]"); + + action->opcode = O_FORWARD_IP; + action->len = F_INSN_SIZE(ipfw_insn_sa); + + /* + * In the kernel we assume AF_INET and use only + * sin_port and sin_addr. Remember to set sin_len as + * the routing code seems to use it too. + */ + p->sa.sin_family = AF_INET; + p->sa.sin_len = sizeof(struct sockaddr_in); + p->sa.sin_port = 0; + /* + * locate the address-port separator (':' or ',') + */ + s = strchr(*av, ':'); + if (s == NULL) + s = strchr(*av, ','); + if (s != NULL) { + *(s++) = '\0'; + i = strtoport(s, &end, 0 /* base */, 0 /* proto */); + if (s == end) + errx(EX_DATAERR, + "illegal forwarding port ``%s''", s); + p->sa.sin_port = (u_short)i; + } + if (_substrcmp(*av, "tablearg") == 0) + p->sa.sin_addr.s_addr = INADDR_ANY; + else + lookup_host(*av, &(p->sa.sin_addr)); + av++; + break; + } + case TOK_COMMENT: + /* pretend it is a 'count' rule followed by the comment */ + action->opcode = O_COUNT; + av--; /* go back... 
*/ + break; + + case TOK_SETFIB: + { + int numfibs; + size_t intsize = sizeof(int); + + action->opcode = O_SETFIB; + NEED1("missing fib number"); + action->arg1 = strtoul(*av, NULL, 10); + if (sysctlbyname("net.fibs", &numfibs, &intsize, NULL, 0) == -1) + errx(EX_DATAERR, "fibs not suported.\n"); + if (action->arg1 >= numfibs) /* Temporary */ + errx(EX_DATAERR, "fib too large.\n"); + av++; + break; + } + + case TOK_REASS: + action->opcode = O_REASS; + break; + + default: + errx(EX_DATAERR, "invalid action %s\n", av[-1]); + } + action = next_cmd(action); + + /* + * [altq queuename] -- altq tag, optional + * [log [logamount N]] -- log, optional + * + * If they exist, it go first in the cmdbuf, but then it is + * skipped in the copy section to the end of the buffer. + */ + while (av[0] != NULL && (i = match_token(rule_action_params, *av)) != -1) { + av++; + switch (i) { + case TOK_LOG: + { + ipfw_insn_log *c = (ipfw_insn_log *)cmd; + int l; + + if (have_log) + errx(EX_DATAERR, + "log cannot be specified more than once"); + have_log = (ipfw_insn *)c; + cmd->len = F_INSN_SIZE(ipfw_insn_log); + cmd->opcode = O_LOG; + if (av[0] && _substrcmp(*av, "logamount") == 0) { + av++; + NEED1("logamount requires argument"); + l = atoi(*av); + if (l < 0) + errx(EX_DATAERR, + "logamount must be positive"); + c->max_log = l; + av++; + } else { + len = sizeof(c->max_log); + if (sysctlbyname("net.inet.ip.fw.verbose_limit", + &c->max_log, &len, NULL, 0) == -1) + errx(1, "sysctlbyname(\"%s\")", + "net.inet.ip.fw.verbose_limit"); + } + } + break; + +#ifndef NO_ALTQ + case TOK_ALTQ: + { + ipfw_insn_altq *a = (ipfw_insn_altq *)cmd; + + NEED1("missing altq queue name"); + if (have_altq) + errx(EX_DATAERR, + "altq cannot be specified more than once"); + have_altq = (ipfw_insn *)a; + cmd->len = F_INSN_SIZE(ipfw_insn_altq); + cmd->opcode = O_ALTQ; + a->qid = altq_name_to_qid(*av); + av++; + } + break; +#endif + + case TOK_TAG: + case TOK_UNTAG: { + uint16_t tag; + + if (have_tag) + 
errx(EX_USAGE, "tag and untag cannot be " + "specified more than once"); + GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX, i, + rule_action_params); + have_tag = cmd; + fill_cmd(cmd, O_TAG, (i == TOK_TAG) ? 0: F_NOT, tag); + av++; + break; + } + + default: + abort(); + } + cmd = next_cmd(cmd); + } + + if (have_state) /* must be a check-state, we are done */ + goto done; + +#define OR_START(target) \ + if (av[0] && (*av[0] == '(' || *av[0] == '{')) { \ + if (open_par) \ + errx(EX_USAGE, "nested \"(\" not allowed\n"); \ + prev = NULL; \ + open_par = 1; \ + if ( (av[0])[1] == '\0') { \ + av++; \ + } else \ + (*av)++; \ + } \ + target: \ + + +#define CLOSE_PAR \ + if (open_par) { \ + if (av[0] && ( \ + strcmp(*av, ")") == 0 || \ + strcmp(*av, "}") == 0)) { \ + prev = NULL; \ + open_par = 0; \ + av++; \ + } else \ + errx(EX_USAGE, "missing \")\"\n"); \ + } + +#define NOT_BLOCK \ + if (av[0] && _substrcmp(*av, "not") == 0) { \ + if (cmd->len & F_NOT) \ + errx(EX_USAGE, "double \"not\" not allowed\n"); \ + cmd->len |= F_NOT; \ + av++; \ + } + +#define OR_BLOCK(target) \ + if (av[0] && _substrcmp(*av, "or") == 0) { \ + if (prev == NULL || open_par == 0) \ + errx(EX_DATAERR, "invalid OR block"); \ + prev->len |= F_OR; \ + av++; \ + goto target; \ + } \ + CLOSE_PAR; + + first_cmd = cmd; + +#if 0 + /* + * MAC addresses, optional. + * If we have this, we skip the part "proto from src to dst" + * and jump straight to the option parsing. 
+ */ + NOT_BLOCK; + NEED1("missing protocol"); + if (_substrcmp(*av, "MAC") == 0 || + _substrcmp(*av, "mac") == 0) { + av++; /* the "MAC" keyword */ + add_mac(cmd, av); /* exits in case of errors */ + cmd = next_cmd(cmd); + av += 2; /* dst-mac and src-mac */ + NOT_BLOCK; + NEED1("missing mac type"); + if (add_mactype(cmd, av[0])) + cmd = next_cmd(cmd); + av++; /* any or mac-type */ + goto read_options; + } +#endif + + /* + * protocol, mandatory + */ + OR_START(get_proto); + NOT_BLOCK; + NEED1("missing protocol"); + if (add_proto_compat(cmd, *av, &proto)) { + av++; + if (F_LEN(cmd) != 0) { + prev = cmd; + cmd = next_cmd(cmd); + } + } else if (first_cmd != cmd) { + errx(EX_DATAERR, "invalid protocol ``%s''", *av); + } else + goto read_options; + OR_BLOCK(get_proto); + + /* + * "from", mandatory + */ + if ((av[0] == NULL) || _substrcmp(*av, "from") != 0) + errx(EX_USAGE, "missing ``from''"); + av++; + + /* + * source IP, mandatory + */ + OR_START(source_ip); + NOT_BLOCK; /* optional "not" */ + NEED1("missing source address"); + if (add_src(cmd, *av, proto)) { + av++; + if (F_LEN(cmd) != 0) { /* ! any */ + prev = cmd; + cmd = next_cmd(cmd); + } + } else + errx(EX_USAGE, "bad source address %s", *av); + OR_BLOCK(source_ip); + + /* + * source ports, optional + */ + NOT_BLOCK; /* optional "not" */ + if ( av[0] != NULL ) { + if (_substrcmp(*av, "any") == 0 || + add_ports(cmd, *av, proto, O_IP_SRCPORT)) { + av++; + if (F_LEN(cmd) != 0) + cmd = next_cmd(cmd); + } + } + + /* + * "to", mandatory + */ + if ( (av[0] == NULL) || _substrcmp(*av, "to") != 0 ) + errx(EX_USAGE, "missing ``to''"); + av++; + + /* + * destination, mandatory + */ + OR_START(dest_ip); + NOT_BLOCK; /* optional "not" */ + NEED1("missing dst address"); + if (add_dst(cmd, *av, proto)) { + av++; + if (F_LEN(cmd) != 0) { /* ! any */ + prev = cmd; + cmd = next_cmd(cmd); + } + } else + errx( EX_USAGE, "bad destination address %s", *av); + OR_BLOCK(dest_ip); + + /* + * dest. 
ports, optional + */ + NOT_BLOCK; /* optional "not" */ + if (av[0]) { + if (_substrcmp(*av, "any") == 0 || + add_ports(cmd, *av, proto, O_IP_DSTPORT)) { + av++; + if (F_LEN(cmd) != 0) + cmd = next_cmd(cmd); + } + } + +read_options: + if (av[0] && first_cmd == cmd) { + /* + * nothing specified so far, store in the rule to ease + * printout later. + */ + rule->_pad = 1; + } + prev = NULL; + while ( av[0] != NULL ) { + char *s; + ipfw_insn_u32 *cmd32; /* alias for cmd */ + + s = *av; + cmd32 = (ipfw_insn_u32 *)cmd; + + if (*s == '!') { /* alternate syntax for NOT */ + if (cmd->len & F_NOT) + errx(EX_USAGE, "double \"not\" not allowed\n"); + cmd->len = F_NOT; + s++; + } + i = match_token(rule_options, s); + av++; + switch(i) { + case TOK_NOT: + if (cmd->len & F_NOT) + errx(EX_USAGE, "double \"not\" not allowed\n"); + cmd->len = F_NOT; + break; + + case TOK_OR: + if (open_par == 0 || prev == NULL) + errx(EX_USAGE, "invalid \"or\" block\n"); + prev->len |= F_OR; + break; + + case TOK_STARTBRACE: + if (open_par) + errx(EX_USAGE, "+nested \"(\" not allowed\n"); + open_par = 1; + break; + + case TOK_ENDBRACE: + if (!open_par) + errx(EX_USAGE, "+missing \")\"\n"); + open_par = 0; + prev = NULL; + break; + + case TOK_IN: + fill_cmd(cmd, O_IN, 0, 0); + break; + + case TOK_OUT: + cmd->len ^= F_NOT; /* toggle F_NOT */ + fill_cmd(cmd, O_IN, 0, 0); + break; + + case TOK_DIVERTED: + fill_cmd(cmd, O_DIVERTED, 0, 3); + break; + + case TOK_DIVERTEDLOOPBACK: + fill_cmd(cmd, O_DIVERTED, 0, 1); + break; + + case TOK_DIVERTEDOUTPUT: + fill_cmd(cmd, O_DIVERTED, 0, 2); + break; + + case TOK_FRAG: + fill_cmd(cmd, O_FRAG, 0, 0); + break; + + case TOK_LAYER2: + fill_cmd(cmd, O_LAYER2, 0, 0); + break; + + case TOK_XMIT: + case TOK_RECV: + case TOK_VIA: + NEED1("recv, xmit, via require interface name" + " or address"); + fill_iface((ipfw_insn_if *)cmd, av[0]); + av++; + if (F_LEN(cmd) == 0) /* not a valid address */ + break; + if (i == TOK_XMIT) + cmd->opcode = O_XMIT; + else if (i == TOK_RECV) 
+ cmd->opcode = O_RECV; + else if (i == TOK_VIA) + cmd->opcode = O_VIA; + break; + + case TOK_ICMPTYPES: + NEED1("icmptypes requires list of types"); + fill_icmptypes((ipfw_insn_u32 *)cmd, *av); + av++; + break; + + case TOK_ICMP6TYPES: + NEED1("icmptypes requires list of types"); + fill_icmp6types((ipfw_insn_icmp6 *)cmd, *av); + av++; + break; + + case TOK_IPTTL: + NEED1("ipttl requires TTL"); + if (strpbrk(*av, "-,")) { + if (!add_ports(cmd, *av, 0, O_IPTTL)) + errx(EX_DATAERR, "invalid ipttl %s", *av); + } else + fill_cmd(cmd, O_IPTTL, 0, strtoul(*av, NULL, 0)); + av++; + break; + + case TOK_IPID: + NEED1("ipid requires id"); + if (strpbrk(*av, "-,")) { + if (!add_ports(cmd, *av, 0, O_IPID)) + errx(EX_DATAERR, "invalid ipid %s", *av); + } else + fill_cmd(cmd, O_IPID, 0, strtoul(*av, NULL, 0)); + av++; + break; + + case TOK_IPLEN: + NEED1("iplen requires length"); + if (strpbrk(*av, "-,")) { + if (!add_ports(cmd, *av, 0, O_IPLEN)) + errx(EX_DATAERR, "invalid ip len %s", *av); + } else + fill_cmd(cmd, O_IPLEN, 0, strtoul(*av, NULL, 0)); + av++; + break; + + case TOK_IPVER: + NEED1("ipver requires version"); + fill_cmd(cmd, O_IPVER, 0, strtoul(*av, NULL, 0)); + av++; + break; + + case TOK_IPPRECEDENCE: + NEED1("ipprecedence requires value"); + fill_cmd(cmd, O_IPPRECEDENCE, 0, + (strtoul(*av, NULL, 0) & 7) << 5); + av++; + break; + + case TOK_IPOPTS: + NEED1("missing argument for ipoptions"); + fill_flags(cmd, O_IPOPT, f_ipopts, *av); + av++; + break; + + case TOK_IPTOS: + NEED1("missing argument for iptos"); + fill_flags(cmd, O_IPTOS, f_iptos, *av); + av++; + break; + + case TOK_UID: + NEED1("uid requires argument"); + { + char *end; + uid_t uid; + struct passwd *pwd; + + cmd->opcode = O_UID; + uid = strtoul(*av, &end, 0); + pwd = (*end == '\0') ? 
getpwuid(uid) : getpwnam(*av); + if (pwd == NULL) + errx(EX_DATAERR, "uid \"%s\" nonexistent", *av); + cmd32->d[0] = pwd->pw_uid; + cmd->len |= F_INSN_SIZE(ipfw_insn_u32); + av++; + } + break; + + case TOK_GID: + NEED1("gid requires argument"); + { + char *end; + gid_t gid; + struct group *grp; + + cmd->opcode = O_GID; + gid = strtoul(*av, &end, 0); + grp = (*end == '\0') ? getgrgid(gid) : getgrnam(*av); + if (grp == NULL) + errx(EX_DATAERR, "gid \"%s\" nonexistent", *av); + cmd32->d[0] = grp->gr_gid; + cmd->len |= F_INSN_SIZE(ipfw_insn_u32); + av++; + } + break; + + case TOK_JAIL: + NEED1("jail requires argument"); + { + char *end; + int jid; + + cmd->opcode = O_JAIL; + jid = (int)strtol(*av, &end, 0); + if (jid < 0 || *end != '\0') + errx(EX_DATAERR, "jail requires prison ID"); + cmd32->d[0] = (uint32_t)jid; + cmd->len |= F_INSN_SIZE(ipfw_insn_u32); + av++; + } + break; + + case TOK_ESTAB: + fill_cmd(cmd, O_ESTAB, 0, 0); + break; + + case TOK_SETUP: + fill_cmd(cmd, O_TCPFLAGS, 0, + (TH_SYN) | ( (TH_ACK) & 0xff) <<8 ); + break; + + case TOK_TCPDATALEN: + NEED1("tcpdatalen requires length"); + if (strpbrk(*av, "-,")) { + if (!add_ports(cmd, *av, 0, O_TCPDATALEN)) + errx(EX_DATAERR, "invalid tcpdata len %s", *av); + } else + fill_cmd(cmd, O_TCPDATALEN, 0, + strtoul(*av, NULL, 0)); + av++; + break; + + case TOK_TCPOPTS: + NEED1("missing argument for tcpoptions"); + fill_flags(cmd, O_TCPOPTS, f_tcpopts, *av); + av++; + break; + + case TOK_TCPSEQ: + case TOK_TCPACK: + NEED1("tcpseq/tcpack requires argument"); + cmd->len = F_INSN_SIZE(ipfw_insn_u32); + cmd->opcode = (i == TOK_TCPSEQ) ? 
O_TCPSEQ : O_TCPACK; + cmd32->d[0] = htonl(strtoul(*av, NULL, 0)); + av++; + break; + + case TOK_TCPWIN: + NEED1("tcpwin requires length"); + fill_cmd(cmd, O_TCPWIN, 0, + htons(strtoul(*av, NULL, 0))); + av++; + break; + + case TOK_TCPFLAGS: + NEED1("missing argument for tcpflags"); + cmd->opcode = O_TCPFLAGS; + fill_flags(cmd, O_TCPFLAGS, f_tcpflags, *av); + av++; + break; + + case TOK_KEEPSTATE: + if (open_par) + errx(EX_USAGE, "keep-state cannot be part " + "of an or block"); + if (have_state) + errx(EX_USAGE, "only one of keep-state " + "and limit is allowed"); + have_state = cmd; + fill_cmd(cmd, O_KEEP_STATE, 0, 0); + break; + + case TOK_LIMIT: { + ipfw_insn_limit *c = (ipfw_insn_limit *)cmd; + int val; + + if (open_par) + errx(EX_USAGE, + "limit cannot be part of an or block"); + if (have_state) + errx(EX_USAGE, "only one of keep-state and " + "limit is allowed"); + have_state = cmd; + + cmd->len = F_INSN_SIZE(ipfw_insn_limit); + cmd->opcode = O_LIMIT; + c->limit_mask = c->conn_limit = 0; + + while ( av[0] != NULL ) { + if ((val = match_token(limit_masks, *av)) <= 0) + break; + c->limit_mask |= val; + av++; + } + + if (c->limit_mask == 0) + errx(EX_USAGE, "limit: missing limit mask"); + + GET_UINT_ARG(c->conn_limit, IPFW_ARG_MIN, IPFW_ARG_MAX, + TOK_LIMIT, rule_options); + + av++; + break; + } + + case TOK_PROTO: + NEED1("missing protocol"); + if (add_proto(cmd, *av, &proto)) { + av++; + } else + errx(EX_DATAERR, "invalid protocol ``%s''", + *av); + break; + + case TOK_SRCIP: + NEED1("missing source IP"); + if (add_srcip(cmd, *av)) { + av++; + } + break; + + case TOK_DSTIP: + NEED1("missing destination IP"); + if (add_dstip(cmd, *av)) { + av++; + } + break; + + case TOK_SRCIP6: + NEED1("missing source IP6"); + if (add_srcip6(cmd, *av)) { + av++; + } + break; + + case TOK_DSTIP6: + NEED1("missing destination IP6"); + if (add_dstip6(cmd, *av)) { + av++; + } + break; + + case TOK_SRCPORT: + NEED1("missing source port"); + if (_substrcmp(*av, "any") == 0 || + 
add_ports(cmd, *av, proto, O_IP_SRCPORT)) { + av++; + } else + errx(EX_DATAERR, "invalid source port %s", *av); + break; + + case TOK_DSTPORT: + NEED1("missing destination port"); + if (_substrcmp(*av, "any") == 0 || + add_ports(cmd, *av, proto, O_IP_DSTPORT)) { + av++; + } else + errx(EX_DATAERR, "invalid destination port %s", + *av); + break; + + case TOK_MAC: + if (add_mac(cmd, av)) + av += 2; + break; + + case TOK_MACTYPE: + NEED1("missing mac type"); + if (!add_mactype(cmd, *av)) + errx(EX_DATAERR, "invalid mac type %s", *av); + av++; + break; + + case TOK_VERREVPATH: + fill_cmd(cmd, O_VERREVPATH, 0, 0); + break; + + case TOK_VERSRCREACH: + fill_cmd(cmd, O_VERSRCREACH, 0, 0); + break; + + case TOK_ANTISPOOF: + fill_cmd(cmd, O_ANTISPOOF, 0, 0); + break; + + case TOK_IPSEC: + fill_cmd(cmd, O_IPSEC, 0, 0); + break; + + case TOK_IPV6: + fill_cmd(cmd, O_IP6, 0, 0); + break; + + case TOK_IPV4: + fill_cmd(cmd, O_IP4, 0, 0); + break; + + case TOK_EXT6HDR: + fill_ext6hdr( cmd, *av ); + av++; + break; + + case TOK_FLOWID: + if (proto != IPPROTO_IPV6 ) + errx( EX_USAGE, "flow-id filter is active " + "only for ipv6 protocol\n"); + fill_flow6( (ipfw_insn_u32 *) cmd, *av ); + av++; + break; + + case TOK_COMMENT: + fill_comment(cmd, av); + av[0]=NULL; + break; + + case TOK_TAGGED: + if (av[0] && strpbrk(*av, "-,")) { + if (!add_ports(cmd, *av, 0, O_TAGGED)) + errx(EX_DATAERR, "tagged: invalid tag" + " list: %s", *av); + } + else { + uint16_t tag; + + GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX, + TOK_TAGGED, rule_options); + fill_cmd(cmd, O_TAGGED, 0, tag); + } + av++; + break; + + case TOK_FIB: + NEED1("fib requires fib number"); + fill_cmd(cmd, O_FIB, 0, strtoul(*av, NULL, 0)); + av++; + break; + + case TOK_LOOKUP: { + ipfw_insn_u32 *c = (ipfw_insn_u32 *)cmd; + char *p; + int j; + + if (!av[0] || !av[1]) + errx(EX_USAGE, "format: lookup argument tablenum"); + cmd->opcode = O_IP_DST_LOOKUP; + cmd->len |= F_INSN_SIZE(ipfw_insn) + 2; + i = match_token(rule_options, *av); + 
for (j = 0; lookup_key[j] >= 0 ; j++) { + if (i == lookup_key[j]) + break; + } + if (lookup_key[j] <= 0) + errx(EX_USAGE, "format: cannot lookup on %s", *av); + c->d[1] = j; // i converted to option + av++; + cmd->arg1 = strtoul(*av, &p, 0); + if (p && *p) + errx(EX_USAGE, "format: lookup argument tablenum"); + av++; + } + break; + + default: + errx(EX_USAGE, "unrecognised option [%d] %s\n", i, s); + } + if (F_LEN(cmd) > 0) { /* prepare to advance */ + prev = cmd; + cmd = next_cmd(cmd); + } + } + +done: + /* + * Now copy stuff into the rule. + * If we have a keep-state option, the first instruction + * must be a PROBE_STATE (which is generated here). + * If we have a LOG option, it was stored as the first command, + * and now must be moved to the top of the action part. + */ + dst = (ipfw_insn *)rule->cmd; + + /* + * First thing to write into the command stream is the match probability. + */ + if (match_prob != 1) { /* 1 means always match */ + dst->opcode = O_PROB; + dst->len = 2; + *((int32_t *)(dst+1)) = (int32_t)(match_prob * 0x7fffffff); + dst += dst->len; + } + + /* + * generate O_PROBE_STATE if necessary + */ + if (have_state && have_state->opcode != O_CHECK_STATE) { + fill_cmd(dst, O_PROBE_STATE, 0, 0); + dst = next_cmd(dst); + } + + /* copy all commands but O_LOG, O_KEEP_STATE, O_LIMIT, O_ALTQ, O_TAG */ + for (src = (ipfw_insn *)cmdbuf; src != cmd; src += i) { + i = F_LEN(src); + + switch (src->opcode) { + case O_LOG: + case O_KEEP_STATE: + case O_LIMIT: + case O_ALTQ: + case O_TAG: + break; + default: + bcopy(src, dst, i * sizeof(uint32_t)); + dst += i; + } + } + + /* + * put back the have_state command as last opcode + */ + if (have_state && have_state->opcode != O_CHECK_STATE) { + i = F_LEN(have_state); + bcopy(have_state, dst, i * sizeof(uint32_t)); + dst += i; + } + /* + * start action section + */ + rule->act_ofs = dst - rule->cmd; + + /* put back O_LOG, O_ALTQ, O_TAG if necessary */ + if (have_log) { + i = F_LEN(have_log); + bcopy(have_log, dst, i * 
sizeof(uint32_t)); + dst += i; + } + if (have_altq) { + i = F_LEN(have_altq); + bcopy(have_altq, dst, i * sizeof(uint32_t)); + dst += i; + } + if (have_tag) { + i = F_LEN(have_tag); + bcopy(have_tag, dst, i * sizeof(uint32_t)); + dst += i; + } + /* + * copy all other actions + */ + for (src = (ipfw_insn *)actbuf; src != action; src += i) { + i = F_LEN(src); + bcopy(src, dst, i * sizeof(uint32_t)); + dst += i; + } + + rule->cmd_len = (uint32_t *)dst - (uint32_t *)(rule->cmd); + i = (char *)dst - (char *)rule; + if (do_cmd(IP_FW_ADD, rule, (uintptr_t)&i) == -1) + err(EX_UNAVAILABLE, "getsockopt(%s)", "IP_FW_ADD"); + if (!co.do_quiet) + show_ipfw(rule, 0, 0); +} + +/* + * clear the counters or the log counters. + */ +void +ipfw_zero(int ac, char *av[], int optname /* 0 = IP_FW_ZERO, 1 = IP_FW_RESETLOG */) +{ + uint32_t arg, saved_arg; + int failed = EX_OK; + char const *errstr; + char const *name = optname ? "RESETLOG" : "ZERO"; + + optname = optname ? IP_FW_RESETLOG : IP_FW_ZERO; + + av++; ac--; + + if (!ac) { + /* clear all entries */ + if (do_cmd(optname, NULL, 0) < 0) + err(EX_UNAVAILABLE, "setsockopt(IP_FW_%s)", name); + if (!co.do_quiet) + printf("%s.\n", optname == IP_FW_ZERO ? + "Accounting cleared":"Logging counts reset"); + + return; + } + + while (ac) { + /* Rule number */ + if (isdigit(**av)) { + arg = strtonum(*av, 0, 0xffff, &errstr); + if (errstr) + errx(EX_DATAERR, + "invalid rule number %s\n", *av); + saved_arg = arg; + if (co.use_set) + arg |= (1 << 24) | ((co.use_set - 1) << 16); + av++; + ac--; + if (do_cmd(optname, &arg, sizeof(arg))) { + warn("rule %u: setsockopt(IP_FW_%s)", + saved_arg, name); + failed = EX_UNAVAILABLE; + } else if (!co.do_quiet) + printf("Entry %d %s.\n", saved_arg, + optname == IP_FW_ZERO ? + "cleared" : "logging count reset"); + } else { + errx(EX_USAGE, "invalid rule number ``%s''", *av); + } + } + if (failed != EX_OK) + exit(failed); +} + +void +ipfw_flush(int force) +{ + int cmd = co.do_pipe ? 
IP_DUMMYNET_FLUSH : IP_FW_FLUSH; + + if (!force && !co.do_quiet) { /* need to ask user */ + int c; + + printf("Are you sure? [yn] "); + fflush(stdout); + do { + c = toupper(getc(stdin)); + while (c != '\n' && getc(stdin) != '\n') + if (feof(stdin)) + return; /* and do not flush */ + } while (c != 'Y' && c != 'N'); + printf("\n"); + if (c == 'N') /* user said no */ + return; + } + if (co.do_pipe) { + dummynet_flush(); + return; + } + /* `ipfw set N flush` - is the same that `ipfw delete set N` */ + if (co.use_set) { + uint32_t arg = ((co.use_set - 1) & 0xffff) | (1 << 24); + if (do_cmd(IP_FW_DEL, &arg, sizeof(arg)) < 0) + err(EX_UNAVAILABLE, "setsockopt(IP_FW_DEL)"); + } else if (do_cmd(cmd, NULL, 0) < 0) + err(EX_UNAVAILABLE, "setsockopt(IP_%s_FLUSH)", + co.do_pipe ? "DUMMYNET" : "FW"); + if (!co.do_quiet) + printf("Flushed all %s.\n", co.do_pipe ? "pipes" : "rules"); +} + + +static void table_list(ipfw_table_entry ent, int need_header); + +/* + * This one handles all table-related commands + * ipfw table N add addr[/masklen] [value] + * ipfw table N delete addr[/masklen] + * ipfw table {N | all} flush + * ipfw table {N | all} list + */ +void +ipfw_table_handler(int ac, char *av[]) +{ + ipfw_table_entry ent; + int do_add; + int is_all; + size_t len; + char *p; + uint32_t a; + uint32_t tables_max; + + len = sizeof(tables_max); + if (sysctlbyname("net.inet.ip.fw.tables_max", &tables_max, &len, + NULL, 0) == -1) { +#ifdef IPFW_TABLES_MAX + warn("Warn: Failed to get the max tables number via sysctl. " + "Using the compiled in defaults. 
\nThe reason was"); + tables_max = IPFW_TABLES_MAX; +#else + errx(1, "Failed sysctlbyname(\"net.inet.ip.fw.tables_max\")"); +#endif + } + + ac--; av++; + if (ac && isdigit(**av)) { + ent.tbl = atoi(*av); + is_all = 0; + ac--; av++; + } else if (ac && _substrcmp(*av, "all") == 0) { + ent.tbl = 0; + is_all = 1; + ac--; av++; + } else + errx(EX_USAGE, "table number or 'all' keyword required"); + if (ent.tbl >= tables_max) + errx(EX_USAGE, "The table number exceeds the maximum allowed " + "value (%d)", tables_max - 1); + NEED1("table needs command"); + if (is_all && _substrcmp(*av, "list") != 0 + && _substrcmp(*av, "flush") != 0) + errx(EX_USAGE, "table number required"); + + if (_substrcmp(*av, "add") == 0 || + _substrcmp(*av, "delete") == 0) { + do_add = **av == 'a'; + ac--; av++; + if (!ac) + errx(EX_USAGE, "IP address required"); + p = strchr(*av, '/'); + if (p) { + *p++ = '\0'; + ent.masklen = atoi(p); + if (ent.masklen > 32) + errx(EX_DATAERR, "bad width ``%s''", p); + } else + ent.masklen = 32; + if (lookup_host(*av, (struct in_addr *)&ent.addr) != 0) + errx(EX_NOHOST, "hostname ``%s'' unknown", *av); + ac--; av++; + if (do_add && ac) { + unsigned int tval; + /* isdigit is a bit of a hack here.. */ + if (strchr(*av, (int)'.') == NULL && isdigit(**av)) { + ent.value = strtoul(*av, NULL, 0); + } else { + if (lookup_host(*av, (struct in_addr *)&tval) == 0) { + /* The value must be stored in host order * + * so that the values < 65k can be distinguished */ + ent.value = ntohl(tval); + } else { + errx(EX_NOHOST, "hostname ``%s'' unknown", *av); + } + } + } else + ent.value = 0; + if (do_cmd(do_add ? IP_FW_TABLE_ADD : IP_FW_TABLE_DEL, + &ent, sizeof(ent)) < 0) { + /* If running silent, don't bomb out on these errors. */ + if (!(co.do_quiet && (errno == (do_add ? EEXIST : ESRCH)))) + err(EX_OSERR, "setsockopt(IP_FW_TABLE_%s)", + do_add ? 
"ADD" : "DEL"); + /* In silent mode, react to a failed add by deleting */ + if (do_add) { + do_cmd(IP_FW_TABLE_DEL, &ent, sizeof(ent)); + if (do_cmd(IP_FW_TABLE_ADD, + &ent, sizeof(ent)) < 0) + err(EX_OSERR, + "setsockopt(IP_FW_TABLE_ADD)"); + } + } + } else if (_substrcmp(*av, "flush") == 0) { + a = is_all ? tables_max : (uint32_t)(ent.tbl + 1); + do { + if (do_cmd(IP_FW_TABLE_FLUSH, &ent.tbl, + sizeof(ent.tbl)) < 0) + err(EX_OSERR, "setsockopt(IP_FW_TABLE_FLUSH)"); + } while (++ent.tbl < a); + } else if (_substrcmp(*av, "list") == 0) { + a = is_all ? tables_max : (uint32_t)(ent.tbl + 1); + do { + table_list(ent, is_all); + } while (++ent.tbl < a); + } else + errx(EX_USAGE, "invalid table command %s", *av); +} + +static void +table_list(ipfw_table_entry ent, int need_header) +{ + ipfw_table *tbl; + socklen_t l; + uint32_t a; + + a = ent.tbl; + l = sizeof(a); + if (do_cmd(IP_FW_TABLE_GETSIZE, &a, (uintptr_t)&l) < 0) + err(EX_OSERR, "getsockopt(IP_FW_TABLE_GETSIZE)"); + + /* If a is zero we have nothing to do, the table is empty. 
*/ + if (a == 0) + return; + + l = sizeof(*tbl) + a * sizeof(ipfw_table_entry); + tbl = safe_calloc(1, l); + tbl->tbl = ent.tbl; + if (do_cmd(IP_FW_TABLE_LIST, tbl, (uintptr_t)&l) < 0) + err(EX_OSERR, "getsockopt(IP_FW_TABLE_LIST)"); + if (tbl->cnt && need_header) + printf("---table(%d)---\n", tbl->tbl); + for (a = 0; a < tbl->cnt; a++) { + unsigned int tval; + tval = tbl->ent[a].value; + if (co.do_value_as_ip) { + char tbuf[128]; + strncpy(tbuf, inet_ntoa(*(struct in_addr *) + &tbl->ent[a].addr), 127); + /* inet_ntoa expects network order */ + tval = htonl(tval); + printf("%s/%u %s\n", tbuf, tbl->ent[a].masklen, + inet_ntoa(*(struct in_addr *)&tval)); + } else { + printf("%s/%u %u\n", + inet_ntoa(*(struct in_addr *)&tbl->ent[a].addr), + tbl->ent[a].masklen, tval); + } + } + free(tbl); +} diff --git a/ipfw/ipfw2.h b/ipfw/ipfw2.h new file mode 100644 index 0000000..237f815 --- /dev/null +++ b/ipfw/ipfw2.h @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2002-2003 Luigi Rizzo + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Idea and grammar partially left from: + * Copyright (c) 1993 Daniel Boulet + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + * + * NEW command line interface for IP firewall facility + * + * $FreeBSD: head/sbin/ipfw/ipfw2.h 206843 2010-04-19 15:11:45Z luigi $ + */ + +/* + * Options that can be set on the command line. + * When reading commands from a file, a subset of the options can also + * be applied globally by specifying them before the file name. 
+ * After that, each line can contain its own option that changes + * the global value. + * XXX The context is not restored after each line. + */ + +struct cmdline_opts { + /* boolean options: */ + int do_value_as_ip; /* show table value as IP */ + int do_resolv; /* try to resolve all ip to names */ + int do_time; /* Show time stamps */ + int do_quiet; /* Be quiet in add and flush */ + int do_pipe; /* this cmd refers to a pipe/queue/sched */ + int do_nat; /* this cmd refers to a nat config */ + int do_dynamic; /* display dynamic rules */ + int do_expired; /* display expired dynamic rules */ + int do_compact; /* show rules in compact mode */ + int do_force; /* do not ask for confirmation */ + int show_sets; /* display the set each rule belongs to */ + int test_only; /* only check syntax */ + int comment_only; /* only print action and comment */ + int verbose; /* be verbose on some commands */ + + /* The options below can have multiple values. */ + + int do_sort; /* field to sort results (0 = no) */ + /* valid fields are 1 and above */ + + int use_set; /* work with specified set number */ + /* 0 means all sets, otherwise apply to set use_set - 1 */ + +}; + +extern struct cmdline_opts co; + +/* + * _s_x is a structure that stores a string <-> token pairs, used in + * various places in the parser. Entries are stored in arrays, + * with an entry with s=NULL as terminator. + * The search routines are match_token() and match_value(). + * Often, an element with x=0 contains an error string. 
+ * + */ +struct _s_x { + char const *s; + int x; +}; + +enum tokens { + TOK_NULL=0, + + TOK_OR, + TOK_NOT, + TOK_STARTBRACE, + TOK_ENDBRACE, + + TOK_ACCEPT, + TOK_COUNT, + TOK_PIPE, + TOK_LINK, + TOK_QUEUE, + TOK_FLOWSET, + TOK_SCHED, + TOK_DIVERT, + TOK_TEE, + TOK_NETGRAPH, + TOK_NGTEE, + TOK_FORWARD, + TOK_SKIPTO, + TOK_DENY, + TOK_REJECT, + TOK_RESET, + TOK_UNREACH, + TOK_CHECKSTATE, + TOK_NAT, + TOK_REASS, + + TOK_ALTQ, + TOK_LOG, + TOK_TAG, + TOK_UNTAG, + + TOK_TAGGED, + TOK_UID, + TOK_GID, + TOK_JAIL, + TOK_IN, + TOK_LIMIT, + TOK_KEEPSTATE, + TOK_LAYER2, + TOK_OUT, + TOK_DIVERTED, + TOK_DIVERTEDLOOPBACK, + TOK_DIVERTEDOUTPUT, + TOK_XMIT, + TOK_RECV, + TOK_VIA, + TOK_FRAG, + TOK_IPOPTS, + TOK_IPLEN, + TOK_IPID, + TOK_IPPRECEDENCE, + TOK_DSCP, + TOK_IPTOS, + TOK_IPTTL, + TOK_IPVER, + TOK_ESTAB, + TOK_SETUP, + TOK_TCPDATALEN, + TOK_TCPFLAGS, + TOK_TCPOPTS, + TOK_TCPSEQ, + TOK_TCPACK, + TOK_TCPWIN, + TOK_ICMPTYPES, + TOK_MAC, + TOK_MACTYPE, + TOK_VERREVPATH, + TOK_VERSRCREACH, + TOK_ANTISPOOF, + TOK_IPSEC, + TOK_COMMENT, + + TOK_PLR, + TOK_NOERROR, + TOK_BUCKETS, + TOK_DSTIP, + TOK_SRCIP, + TOK_DSTPORT, + TOK_SRCPORT, + TOK_ALL, + TOK_MASK, + TOK_FLOW_MASK, + TOK_SCHED_MASK, + TOK_BW, + TOK_DELAY, + TOK_PROFILE, + TOK_BURST, + TOK_RED, + TOK_GRED, + TOK_DROPTAIL, + TOK_PROTO, + /* dummynet tokens */ + TOK_WEIGHT, + TOK_LMAX, + TOK_PRI, + TOK_TYPE, + TOK_SLOTSIZE, + + TOK_IP, + TOK_IF, + TOK_ALOG, + TOK_DENY_INC, + TOK_SAME_PORTS, + TOK_UNREG_ONLY, + TOK_RESET_ADDR, + TOK_ALIAS_REV, + TOK_PROXY_ONLY, + TOK_REDIR_ADDR, + TOK_REDIR_PORT, + TOK_REDIR_PROTO, + + TOK_IPV6, + TOK_FLOWID, + TOK_ICMP6TYPES, + TOK_EXT6HDR, + TOK_DSTIP6, + TOK_SRCIP6, + + TOK_IPV4, + TOK_UNREACH6, + TOK_RESET6, + + TOK_FIB, + TOK_SETFIB, + TOK_LOOKUP, +}; +/* + * the following macro returns an error message if we run out of + * arguments. 
+ */ +#define NEED(_p, msg) {if (!_p) errx(EX_USAGE, msg);} +#define NEED1(msg) {if (!(*av)) errx(EX_USAGE, msg);} + +int pr_u64(uint64_t *pd, int width); + +/* memory allocation support */ +void *safe_calloc(size_t number, size_t size); +void *safe_realloc(void *ptr, size_t size); + +/* string comparison functions used for historical compatibility */ +int _substrcmp(const char *str1, const char* str2); +int _substrcmp2(const char *str1, const char* str2, const char* str3); + +/* utility functions */ +int match_token(struct _s_x *table, char *string); +char const *match_value(struct _s_x *p, int value); + +int do_cmd(int optname, void *optval, uintptr_t optlen); + +struct in6_addr; +void n2mask(struct in6_addr *mask, int n); +int contigmask(uint8_t *p, int len); + +/* + * Forward declarations to avoid include way too many headers. + * C does not allow duplicated typedefs, so we use the base struct + * that the typedef points to. + * Should the typedefs use a different type, the compiler will + * still detect the change when compiling the body of the + * functions involved, so we do not lose error checking. + */ +struct _ipfw_insn; +struct _ipfw_insn_altq; +struct _ipfw_insn_u32; +struct _ipfw_insn_ip6; +struct _ipfw_insn_icmp6; + +/* + * The reserved set numer. This is a constant in ip_fw.h + * but we store it in a variable so other files do not depend + * in that header just for one constant. 
+ */ +extern int resvd_set_number; + +/* first-level command handlers */ +void ipfw_add(char *av[]); +void ipfw_show_nat(int ac, char **av); +void ipfw_config_pipe(int ac, char **av); +void ipfw_config_nat(int ac, char **av); +void ipfw_sets_handler(char *av[]); +void ipfw_table_handler(int ac, char *av[]); +void ipfw_sysctl_handler(char *av[], int which); +void ipfw_delete(char *av[]); +void ipfw_flush(int force); +void ipfw_zero(int ac, char *av[], int optname); +void ipfw_list(int ac, char *av[], int show_counters); + +/* altq.c */ +void altq_set_enabled(int enabled); +u_int32_t altq_name_to_qid(const char *name); + +void print_altq_cmd(struct _ipfw_insn_altq *altqptr); + +/* dummynet.c */ +void dummynet_list(int ac, char *av[], int show_counters); +void dummynet_flush(void); +int ipfw_delete_pipe(int pipe_or_queue, int n); + +/* ipv6.c */ +void print_unreach6_code(uint16_t code); +void print_ip6(struct _ipfw_insn_ip6 *cmd, char const *s); +void print_flow6id(struct _ipfw_insn_u32 *cmd); +void print_icmp6types(struct _ipfw_insn_u32 *cmd); +void print_ext6hdr(struct _ipfw_insn *cmd ); + +struct _ipfw_insn *add_srcip6(struct _ipfw_insn *cmd, char *av); +struct _ipfw_insn *add_dstip6(struct _ipfw_insn *cmd, char *av); + +void fill_flow6(struct _ipfw_insn_u32 *cmd, char *av ); +void fill_unreach6_code(u_short *codep, char *str); +void fill_icmp6types(struct _ipfw_insn_icmp6 *cmd, char *av); +int fill_ext6hdr(struct _ipfw_insn *cmd, char *av); diff --git a/ipfw/ipv6.c b/ipfw/ipv6.c new file mode 100644 index 0000000..6d58c04 --- /dev/null +++ b/ipfw/ipv6.c @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2002-2003 Luigi Rizzo + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Idea and grammar partially left from: + * Copyright (c) 1993 Daniel Boulet + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. 
+ * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + * + * NEW command line interface for IP firewall facility + * + * $FreeBSD: user/luigi/ipfw3-head/sbin/ipfw/ipv6.c 187770 2009-01-27 12:01:30Z luigi $ + * + * ipv6 support + */ + +#include +#include + +#include "ipfw2.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +static struct _s_x icmp6codes[] = { + { "no-route", ICMP6_DST_UNREACH_NOROUTE }, + { "admin-prohib", ICMP6_DST_UNREACH_ADMIN }, + { "address", ICMP6_DST_UNREACH_ADDR }, + { "port", ICMP6_DST_UNREACH_NOPORT }, + { NULL, 0 } +}; + +void +fill_unreach6_code(u_short *codep, char *str) +{ + int val; + char *s; + + val = strtoul(str, &s, 0); + if (s == str || *s != '\0' || val >= 0x100) + val = match_token(icmp6codes, str); + if (val < 0) + errx(EX_DATAERR, "unknown ICMPv6 unreachable code ``%s''", str); + *codep = val; + return; +} + +void +print_unreach6_code(uint16_t code) +{ + char const *s = match_value(icmp6codes, code); + + if (s != NULL) + printf("unreach6 %s", s); + else + printf("unreach6 %u", code); +} + +/* + * Print the ip address contained in a command. + */ +void +print_ip6(ipfw_insn_ip6 *cmd, char const *s) +{ + struct hostent *he = NULL; + int len = F_LEN((ipfw_insn *) cmd) - 1; + struct in6_addr *a = &(cmd->addr6); + char trad[255]; + + printf("%s%s ", cmd->o.len & F_NOT ? " not": "", s); + + if (cmd->o.opcode == O_IP6_SRC_ME || cmd->o.opcode == O_IP6_DST_ME) { + printf("me6"); + return; + } + if (cmd->o.opcode == O_IP6) { + printf(" ip6"); + return; + } + + /* + * len == 4 indicates a single IP, whereas lists of 1 or more + * addr/mask pairs have len = (2n+1). We convert len to n so we + * use that to count the number of entries. 
+ */ + + for (len = len / 4; len > 0; len -= 2, a += 2) { + int mb = /* mask length */ + (cmd->o.opcode == O_IP6_SRC || cmd->o.opcode == O_IP6_DST) ? + 128 : contigmask((uint8_t *)&(a[1]), 128); + + if (mb == 128 && co.do_resolv) + he = gethostbyaddr((char *)a, sizeof(*a), AF_INET6); + if (he != NULL) /* resolved to name */ + printf("%s", he->h_name); + else if (mb == 0) /* any */ + printf("any"); + else { /* numeric IP followed by some kind of mask */ + if (inet_ntop(AF_INET6, a, trad, sizeof( trad ) ) == NULL) + printf("Error ntop in print_ip6\n"); + printf("%s", trad ); + if (mb < 0) /* XXX not really legal... */ + printf(":%s", + inet_ntop(AF_INET6, &a[1], trad, sizeof(trad))); + else if (mb < 128) + printf("/%d", mb); + } + if (len > 2) + printf(","); + } +} + +void +fill_icmp6types(ipfw_insn_icmp6 *cmd, char *av) +{ + uint8_t type; + + bzero(cmd, sizeof(*cmd)); + while (*av) { + if (*av == ',') + av++; + type = strtoul(av, &av, 0); + if (*av != ',' && *av != '\0') + errx(EX_DATAERR, "invalid ICMP6 type"); + /* + * XXX: shouldn't this be 0xFF? I can't see any reason why + * we shouldn't be able to filter all possiable values + * regardless of the ability of the rest of the kernel to do + * anything useful with them. 
+ */ + if (type > ICMP6_MAXTYPE) + errx(EX_DATAERR, "ICMP6 type out of range"); + cmd->d[type / 32] |= ( 1 << (type % 32)); + } + cmd->o.opcode = O_ICMP6TYPE; + cmd->o.len |= F_INSN_SIZE(ipfw_insn_icmp6); +} + + +void +print_icmp6types(ipfw_insn_u32 *cmd) +{ + int i, j; + char sep= ' '; + + printf(" ip6 icmp6types"); + for (i = 0; i < 7; i++) + for (j=0; j < 32; ++j) { + if ( (cmd->d[i] & (1 << (j))) == 0) + continue; + printf("%c%d", sep, (i*32 + j)); + sep = ','; + } +} + +void +print_flow6id( ipfw_insn_u32 *cmd) +{ + uint16_t i, limit = cmd->o.arg1; + char sep = ','; + + printf(" flow-id "); + for( i=0; i < limit; ++i) { + if (i == limit - 1) + sep = ' '; + printf("%d%c", cmd->d[i], sep); + } +} + +/* structure and define for the extension header in ipv6 */ +static struct _s_x ext6hdrcodes[] = { + { "frag", EXT_FRAGMENT }, + { "hopopt", EXT_HOPOPTS }, + { "route", EXT_ROUTING }, + { "dstopt", EXT_DSTOPTS }, + { "ah", EXT_AH }, + { "esp", EXT_ESP }, + { "rthdr0", EXT_RTHDR0 }, + { "rthdr2", EXT_RTHDR2 }, + { NULL, 0 } +}; + +/* fills command for the extension header filtering */ +int +fill_ext6hdr( ipfw_insn *cmd, char *av) +{ + int tok; + char *s = av; + + cmd->arg1 = 0; + + while(s) { + av = strsep( &s, ",") ; + tok = match_token(ext6hdrcodes, av); + switch (tok) { + case EXT_FRAGMENT: + cmd->arg1 |= EXT_FRAGMENT; + break; + + case EXT_HOPOPTS: + cmd->arg1 |= EXT_HOPOPTS; + break; + + case EXT_ROUTING: + cmd->arg1 |= EXT_ROUTING; + break; + + case EXT_DSTOPTS: + cmd->arg1 |= EXT_DSTOPTS; + break; + + case EXT_AH: + cmd->arg1 |= EXT_AH; + break; + + case EXT_ESP: + cmd->arg1 |= EXT_ESP; + break; + + case EXT_RTHDR0: + cmd->arg1 |= EXT_RTHDR0; + break; + + case EXT_RTHDR2: + cmd->arg1 |= EXT_RTHDR2; + break; + + default: + errx( EX_DATAERR, "invalid option for ipv6 exten header" ); + break; + } + } + if (cmd->arg1 == 0 ) + return 0; + cmd->opcode = O_EXT_HDR; + cmd->len |= F_INSN_SIZE( ipfw_insn ); + return 1; +} + +void +print_ext6hdr( ipfw_insn *cmd ) +{ + char 
sep = ' '; + + printf(" extension header:"); + if (cmd->arg1 & EXT_FRAGMENT ) { + printf("%cfragmentation", sep); + sep = ','; + } + if (cmd->arg1 & EXT_HOPOPTS ) { + printf("%chop options", sep); + sep = ','; + } + if (cmd->arg1 & EXT_ROUTING ) { + printf("%crouting options", sep); + sep = ','; + } + if (cmd->arg1 & EXT_RTHDR0 ) { + printf("%crthdr0", sep); + sep = ','; + } + if (cmd->arg1 & EXT_RTHDR2 ) { + printf("%crthdr2", sep); + sep = ','; + } + if (cmd->arg1 & EXT_DSTOPTS ) { + printf("%cdestination options", sep); + sep = ','; + } + if (cmd->arg1 & EXT_AH ) { + printf("%cauthentication header", sep); + sep = ','; + } + if (cmd->arg1 & EXT_ESP ) { + printf("%cencapsulated security payload", sep); + } +} + +/* Try to find ipv6 address by hostname */ +static int +lookup_host6 (char *host, struct in6_addr *ip6addr) +{ + struct hostent *he; + + if (!inet_pton(AF_INET6, host, ip6addr)) { + if ((he = gethostbyname2(host, AF_INET6)) == NULL) + return(-1); + memcpy(ip6addr, he->h_addr_list[0], sizeof( struct in6_addr)); + } + return(0); +} + + +/* + * fill the addr and mask fields in the instruction as appropriate from av. + * Update length as appropriate. + * The following formats are allowed: + * any matches any IP6. Actually returns an empty instruction. + * me returns O_IP6_*_ME + * + * 03f1::234:123:0342 single IP6 addres + * 03f1::234:123:0342/24 address/mask + * 03f1::234:123:0342/24,03f1::234:123:0343/ List of address + * + * Set of address (as in ipv6) not supported because ipv6 address + * are typically random past the initial prefix. + * Return 1 on success, 0 on failure. + */ +static int +fill_ip6(ipfw_insn_ip6 *cmd, char *av) +{ + int len = 0; + struct in6_addr *d = &(cmd->addr6); + /* + * Needed for multiple address. 
+ * Note d[1] points to struct in6_add r mask6 of cmd + */ + + cmd->o.len &= ~F_LEN_MASK; /* zero len */ + + if (strcmp(av, "any") == 0) + return (1); + + + if (strcmp(av, "me") == 0) { /* Set the data for "me" opt*/ + cmd->o.len |= F_INSN_SIZE(ipfw_insn); + return (1); + } + + if (strcmp(av, "me6") == 0) { /* Set the data for "me" opt*/ + cmd->o.len |= F_INSN_SIZE(ipfw_insn); + return (1); + } + + av = strdup(av); + while (av) { + /* + * After the address we can have '/' indicating a mask, + * or ',' indicating another address follows. + */ + + char *p; + int masklen; + char md = '\0'; + + if ((p = strpbrk(av, "/,")) ) { + md = *p; /* save the separator */ + *p = '\0'; /* terminate address string */ + p++; /* and skip past it */ + } + /* now p points to NULL, mask or next entry */ + + /* lookup stores address in *d as a side effect */ + if (lookup_host6(av, d) != 0) { + /* XXX: failed. Free memory and go */ + errx(EX_DATAERR, "bad address \"%s\"", av); + } + /* next, look at the mask, if any */ + masklen = (md == '/') ? atoi(p) : 128; + if (masklen > 128 || masklen < 0) + errx(EX_DATAERR, "bad width \"%s\''", p); + else + n2mask(&d[1], masklen); + + APPLY_MASK(d, &d[1]) /* mask base address with mask */ + + /* find next separator */ + + if (md == '/') { /* find separator past the mask */ + p = strpbrk(p, ","); + if (p != NULL) + p++; + } + av = p; + + /* Check this entry */ + if (masklen == 0) { + /* + * 'any' turns the entire list into a NOP. + * 'not any' never matches, so it is removed from the + * list unless it is the only item, in which case we + * report an error. 
+ */ + if (cmd->o.len & F_NOT && av == NULL && len == 0) + errx(EX_DATAERR, "not any never matches"); + continue; + } + + /* + * A single IP can be stored alone + */ + if (masklen == 128 && av == NULL && len == 0) { + len = F_INSN_SIZE(struct in6_addr); + break; + } + + /* Update length and pointer to arguments */ + len += F_INSN_SIZE(struct in6_addr)*2; + d += 2; + } /* end while */ + + /* + * Total length of the command, remember that 1 is the size of + * the base command. + */ + if (len + 1 > F_LEN_MASK) + errx(EX_DATAERR, "address list too long"); + cmd->o.len |= len+1; + free(av); + return (1); +} + +/* + * fills command for ipv6 flow-id filtering + * note that the 20 bit flow number is stored in a array of u_int32_t + * it's supported lists of flow-id, so in the o.arg1 we store how many + * additional flow-id we want to filter, the basic is 1 + */ +void +fill_flow6( ipfw_insn_u32 *cmd, char *av ) +{ + u_int32_t type; /* Current flow number */ + u_int16_t nflow = 0; /* Current flow index */ + char *s = av; + cmd->d[0] = 0; /* Initializing the base number*/ + + while (s) { + av = strsep( &s, ",") ; + type = strtoul(av, &av, 0); + if (*av != ',' && *av != '\0') + errx(EX_DATAERR, "invalid ipv6 flow number %s", av); + if (type > 0xfffff) + errx(EX_DATAERR, "flow number out of range %s", av); + cmd->d[nflow] |= type; + nflow++; + } + if( nflow > 0 ) { + cmd->o.opcode = O_FLOW6ID; + cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32) + nflow; + cmd->o.arg1 = nflow; + } + else { + errx(EX_DATAERR, "invalid ipv6 flow number %s", av); + } +} + +ipfw_insn * +add_srcip6(ipfw_insn *cmd, char *av) +{ + + fill_ip6((ipfw_insn_ip6 *)cmd, av); + if (F_LEN(cmd) == 0) { /* any */ + } else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) { /* "me" */ + cmd->opcode = O_IP6_SRC_ME; + } else if (F_LEN(cmd) == + (F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn))) { + /* single IP, no mask*/ + cmd->opcode = O_IP6_SRC; + } else { /* addr/mask opt */ + cmd->opcode = O_IP6_SRC_MASK; + } + return 
cmd; +} + +ipfw_insn * +add_dstip6(ipfw_insn *cmd, char *av) +{ + + fill_ip6((ipfw_insn_ip6 *)cmd, av); + if (F_LEN(cmd) == 0) { /* any */ + } else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) { /* "me" */ + cmd->opcode = O_IP6_DST_ME; + } else if (F_LEN(cmd) == + (F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn))) { + /* single IP, no mask*/ + cmd->opcode = O_IP6_DST; + } else { /* addr/mask opt */ + cmd->opcode = O_IP6_DST_MASK; + } + return cmd; +} diff --git a/ipfw/main.c b/ipfw/main.c new file mode 100644 index 0000000..b0e51e1 --- /dev/null +++ b/ipfw/main.c @@ -0,0 +1,615 @@ +/* + * Copyright (c) 2002-2003,2010 Luigi Rizzo + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Idea and grammar partially left from: + * Copyright (c) 1993 Daniel Boulet + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. 
+ * + * Command line interface for IP firewall facility + * + * $FreeBSD: head/sbin/ipfw/main.c 206494 2010-04-12 08:27:53Z luigi $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipfw2.h" + +static void +help(void) +{ + fprintf(stderr, +"ipfw syntax summary (but please do read the ipfw(8) manpage):\n\n" +"\tipfw [-abcdefhnNqStTv] \n\n" +"where is one of the following:\n\n" +"add [num] [set N] [prob x] RULE-BODY\n" +"{pipe|queue} N config PIPE-BODY\n" +"[pipe|queue] {zero|delete|show} [N{,N}]\n" +"nat N config {ip IPADDR|if IFNAME|log|deny_in|same_ports|unreg_only|reset|\n" +" reverse|proxy_only|redirect_addr linkspec|\n" +" redirect_port linkspec|redirect_proto linkspec}\n" +"set [disable N... enable N...] | move [rule] X to Y | swap X Y | show\n" +"set N {show|list|zero|resetlog|delete} [N{,N}] | flush\n" +"table N {add ip[/bits] [value] | delete ip[/bits] | flush | list}\n" +"table all {flush | list}\n" +"\n" +"RULE-BODY: check-state [PARAMS] | ACTION [PARAMS] ADDR [OPTION_LIST]\n" +"ACTION: check-state | allow | count | deny | unreach{,6} CODE |\n" +" skipto N | {divert|tee} PORT | forward ADDR |\n" +" pipe N | queue N | nat N | setfib FIB | reass\n" +"PARAMS: [log [logamount LOGLIMIT]] [altq QUEUE_NAME]\n" +"ADDR: [ MAC dst src ether_type ] \n" +" [ ip from IPADDR [ PORT ] to IPADDR [ PORTLIST ] ]\n" +" [ ipv6|ip6 from IP6ADDR [ PORT ] to IP6ADDR [ PORTLIST ] ]\n" +"IPADDR: [not] { any | me | ip/bits{x,y,z} | table(t[,v]) | IPLIST }\n" +"IP6ADDR: [not] { any | me | me6 | ip6/bits | IP6LIST }\n" +"IP6LIST: { ip6 | ip6/bits }[,IP6LIST]\n" +"IPLIST: { ip | ip/bits | ip:mask }[,IPLIST]\n" +"OPTION_LIST: OPTION [OPTION_LIST]\n" +"OPTION: bridged | diverted | diverted-loopback | diverted-output |\n" +" {dst-ip|src-ip} IPADDR | {dst-ip6|src-ip6|dst-ipv6|src-ipv6} IP6ADDR |\n" +" {dst-port|src-port} LIST |\n" +" estab | frag | {gid|uid} N | icmptypes LIST | in | out | ipid LIST |\n" +" iplen LIST 
| ipoptions SPEC | ipprecedence | ipsec | iptos SPEC |\n" +" ipttl LIST | ipversion VER | keep-state | layer2 | limit ... |\n" +" icmp6types LIST | ext6hdr LIST | flow-id N[,N] | fib FIB |\n" +" mac ... | mac-type LIST | proto LIST | {recv|xmit|via} {IF|IPADDR} |\n" +" setup | {tcpack|tcpseq|tcpwin} NN | tcpflags SPEC | tcpoptions SPEC |\n" +" tcpdatalen LIST | verrevpath | versrcreach | antispoof\n" +); + + exit(0); +} + +/* + * Called with the arguments, including program name because getopt + * wants it to be present. + * Returns 0 if successful, 1 if empty command, errx() in case of errors. + * First thing we do is process parameters creating an argv[] array + * which includes the program name and a NULL entry at the end. + * If we are called with a single string, we split it on whitespace. + * Also, arguments with a trailing ',' are joined to the next one. + * The pointers (av[]) and data are in a a single chunk of memory. + * av[0] points to the original program name, all other entries + * point into the allocated chunk. + */ +static int +ipfw_main(int oldac, char **oldav) +{ + int ch, ac; + const char *errstr; + char **av, **save_av; + int do_acct = 0; /* Show packet/byte count */ + int try_next = 0; /* set if pipe cmd not found */ + int av_size; /* compute the av size */ + char *av_p; /* used to build the av list */ + +#define WHITESP " \t\f\v\n\r" + if (oldac < 2) + return 1; /* need at least one argument */ + + if (oldac == 2) { + /* + * If we are called with one argument, try to split it into + * words for subsequent parsing. Spaces after a ',' are + * removed by copying the string in-place. + */ + char *arg = oldav[1]; /* The string is the first arg. 
*/ + int l = strlen(arg); + int copy = 0; /* 1 if we need to copy, 0 otherwise */ + int i, j; + + for (i = j = 0; i < l; i++) { + if (arg[i] == '#') /* comment marker */ + break; + if (copy) { + arg[j++] = arg[i]; + copy = !index("," WHITESP, arg[i]); + } else { + copy = !index(WHITESP, arg[i]); + if (copy) + arg[j++] = arg[i]; + } + } + if (!copy && j > 0) /* last char was a 'blank', remove it */ + j--; + l = j; /* the new argument length */ + arg[j++] = '\0'; + if (l == 0) /* empty string! */ + return 1; + + /* + * First, count number of arguments. Because of the previous + * processing, this is just the number of blanks plus 1. + */ + for (i = 0, ac = 1; i < l; i++) + if (index(WHITESP, arg[i]) != NULL) + ac++; + + /* + * Allocate the argument list structure as a single block + * of memory, containing pointers and the argument + * strings. We include one entry for the program name + * because getopt expects it, and a NULL at the end + * to simplify further parsing. + */ + ac++; /* add 1 for the program name */ + av_size = (ac+1) * sizeof(char *) + l + 1; + av = safe_calloc(av_size, 1); + + /* + * Init the argument pointer to the end of the array + * and copy arguments from arg[] to av[]. For each one, + * j is the initial character, i is the one past the end. + */ + av_p = (char *)&av[ac+1]; + for (ac = 1, i = j = 0; i < l; i++) { + if (index(WHITESP, arg[i]) != NULL || i == l-1) { + if (i == l-1) + i++; + bcopy(arg+j, av_p, i-j); + av[ac] = av_p; + av_p += i-j; /* the lenght of the string */ + *av_p++ = '\0'; + ac++; + j = i + 1; + } + } + } else { + /* + * If an argument ends with ',' join with the next one. + */ + int first, i, l=0; + + /* + * Allocate the argument list structure as a single block + * of memory, containing both pointers and the argument + * strings. We include some space for the program name + * because getopt expects it. + * We add an extra pointer to the end of the array, + * to make simpler further parsing. 
+ */ + for (i=0; i= 2 && !strcmp(av[1], "sysctl")) { + char *s; + int i; + + if (ac != 3) { + printf( "sysctl emulation usage:\n" + " ipfw sysctl name[=value]\n" + " ipfw sysctl -a\n"); + return 0; + } + s = index(av[2], '='); + if (s == NULL) { + s = !strcmp(av[2], "-a") ? NULL : av[2]; + sysctlbyname(s, NULL, NULL, NULL, 0); + } else { /* ipfw sysctl x.y.z=value */ + /* assume an INT value, will extend later */ + if (s[1] == '\0') { + printf("ipfw sysctl: missing value\n\n"); + return 0; + } + *s = '\0'; + i = strtol(s+1, NULL, 0); + sysctlbyname(av[2], NULL, NULL, &i, sizeof(int)); + } + return 0; + } +#endif + + /* Save arguments for final freeing of memory. */ + save_av = av; + + optind = optreset = 1; /* restart getopt() */ + while ((ch = getopt(ac, av, "abcdefhinNqs:STtv")) != -1) + switch (ch) { + case 'a': + do_acct = 1; + break; + + case 'b': + co.comment_only = 1; + co.do_compact = 1; + break; + + case 'c': + co.do_compact = 1; + break; + + case 'd': + co.do_dynamic = 1; + break; + + case 'e': + co.do_expired = 1; + break; + + case 'f': + co.do_force = 1; + break; + + case 'h': /* help */ + free(save_av); + help(); + break; /* NOTREACHED */ + + case 'i': + co.do_value_as_ip = 1; + break; + + case 'n': + co.test_only = 1; + break; + + case 'N': + co.do_resolv = 1; + break; + + case 'q': + co.do_quiet = 1; + break; + + case 's': /* sort */ + co.do_sort = atoi(optarg); + break; + + case 'S': + co.show_sets = 1; + break; + + case 't': + co.do_time = 1; + break; + + case 'T': + co.do_time = 2; /* numeric timestamp */ + break; + + case 'v': /* verbose */ + co.verbose = 1; + break; + + default: + free(save_av); + return 1; + } + + ac -= optind; + av += optind; + NEED1("bad arguments, for usage summary ``ipfw''"); + + /* + * An undocumented behaviour of ipfw1 was to allow rule numbers first, + * e.g. "100 add allow ..." instead of "add 100 allow ...". + * In case, swap first and second argument to get the normal form. 
+ */ + if (ac > 1 && isdigit(*av[0])) { + char *p = av[0]; + + av[0] = av[1]; + av[1] = p; + } + + /* + * Optional: pipe, queue or nat. + */ + co.do_nat = 0; + co.do_pipe = 0; + if (!strncmp(*av, "nat", strlen(*av))) + co.do_nat = 1; + else if (!strncmp(*av, "pipe", strlen(*av))) + co.do_pipe = 1; + else if (_substrcmp(*av, "queue") == 0) + co.do_pipe = 2; + else if (_substrcmp(*av, "flowset") == 0) + co.do_pipe = 2; + else if (_substrcmp(*av, "sched") == 0) + co.do_pipe = 3; + else if (!strncmp(*av, "set", strlen(*av))) { + if (ac > 1 && isdigit(av[1][0])) { + co.use_set = strtonum(av[1], 0, resvd_set_number, + &errstr); + if (errstr) + errx(EX_DATAERR, + "invalid set number %s\n", av[1]); + ac -= 2; av += 2; co.use_set++; + } + } + + if (co.do_pipe || co.do_nat) { + ac--; + av++; + } + NEED1("missing command"); + + /* + * For pipes, queues and nats we normally say 'nat|pipe NN config' + * but the code is easier to parse as 'nat|pipe config NN' + * so we swap the two arguments. + */ + if ((co.do_pipe || co.do_nat) && ac > 1 && isdigit(*av[0])) { + char *p = av[0]; + + av[0] = av[1]; + av[1] = p; + } + + if (co.use_set == 0) { + if (_substrcmp(*av, "add") == 0) + ipfw_add(av); + else if (co.do_nat && _substrcmp(*av, "show") == 0) + ipfw_show_nat(ac, av); + else if (co.do_pipe && _substrcmp(*av, "config") == 0) + ipfw_config_pipe(ac, av); + else if (co.do_nat && _substrcmp(*av, "config") == 0) + ipfw_config_nat(ac, av); + else if (_substrcmp(*av, "set") == 0) + ipfw_sets_handler(av); + else if (_substrcmp(*av, "table") == 0) + ipfw_table_handler(ac, av); + else if (_substrcmp(*av, "enable") == 0) + ipfw_sysctl_handler(av, 1); + else if (_substrcmp(*av, "disable") == 0) + ipfw_sysctl_handler(av, 0); + else + try_next = 1; + } + + if (co.use_set || try_next) { + if (_substrcmp(*av, "delete") == 0) + ipfw_delete(av); + else if (_substrcmp(*av, "flush") == 0) + ipfw_flush(co.do_force); + else if (_substrcmp(*av, "zero") == 0) + ipfw_zero(ac, av, 0 /* IP_FW_ZERO */); + 
else if (_substrcmp(*av, "resetlog") == 0)
+			ipfw_zero(ac, av, 1 /* IP_FW_RESETLOG */);
+		else if (_substrcmp(*av, "print") == 0 ||
+		    _substrcmp(*av, "list") == 0)
+			ipfw_list(ac, av, do_acct);
+		else if (_substrcmp(*av, "show") == 0)
+			ipfw_list(ac, av, 1 /* show counters */);
+		else
+			errx(EX_USAGE, "bad command `%s'", *av);
+	}
+
+	/* Free memory allocated in the argument parsing. */
+	free(save_av);
+	return 0;
+}
+
+
+/*
+ * Execute the ipfw commands contained in a file, one command per line.
+ *
+ * The file name is the last argument, av[ac-1].  The options accepted in
+ * front of it are "cfNnqS" (each one sets the matching flag in the global
+ * `co') and "-p cmd [args]", which runs the file through the preprocessor
+ * `cmd': the file becomes the child's stdin and the commands are read from
+ * the child's stdout through a pipe.
+ *
+ * Every line returned by fgets() is handed to ipfw_main() as
+ * { progname, line }; the return value of ipfw_main() is not checked here.
+ * While a line is processed the program name is set to "Line N" so that
+ * diagnostics point at the offending line.
+ *
+ * All failures are reported through err()/errx().  When a preprocessor was
+ * used its exit status is collected after the input is exhausted, and a
+ * non-zero status or a terminating signal is reported as an error.
+ */
+static void
+ipfw_readfile(int ac, char *av[])
+{
+#define MAX_ARGS	32
+	char	buf[BUFSIZ];
+	/*
+	 * Function scope on purpose: setprogname() may keep the pointer
+	 * instead of copying the string (the BSD libc version does), and the
+	 * errx() calls after the read loop still use the program name.
+	 */
+	char	linename[20];
+	char *progname = av[0];		/* original program name */
+	const char *cmd = NULL;		/* preprocessor name, if any */
+	const char *filename = av[ac-1]; /* file to read */
+	int	c, lineno=0;
+	FILE	*f = NULL;
+	pid_t	preproc = 0;
+
+	while ((c = getopt(ac, av, "cfNnp:qS")) != -1) {
+		switch(c) {
+		case 'c':
+			co.do_compact = 1;
+			break;
+
+		case 'f':
+			co.do_force = 1;
+			break;
+
+		case 'N':
+			co.do_resolv = 1;
+			break;
+
+		case 'n':
+			co.test_only = 1;
+			break;
+
+		case 'p':
+			/*
+			 * ipfw -p cmd [args] filename
+			 *
+			 * We are done with getopt(). All arguments
+			 * except the filename go to the preprocessor,
+			 * so we need to do the following:
+			 * - check that a filename is actually present;
+			 * - advance av by optind-1 to skip arguments
+			 *   already processed;
+			 * - decrease ac by optind, to remove the args
+			 *   already processed and the final filename;
+			 * - set the last entry in av[] to NULL so
+			 *   execvp() can detect the end of the array;
+			 * - set optind=ac to let getopt() terminate.
+			 */
+			if (optind == ac)
+				errx(EX_USAGE, "no filename argument");
+			cmd = optarg;
+			av[ac-1] = NULL;
+			av += optind - 1;
+			ac -= optind;
+			optind = ac;
+			break;
+
+		case 'q':
+			co.do_quiet = 1;
+			break;
+
+		case 'S':
+			co.show_sets = 1;
+			break;
+
+		default:
+			errx(EX_USAGE, "bad arguments, for usage"
+			     " summary ``ipfw''");
+		}
+
+	}
+
+	/* without -p exactly one non-option argument (the file) is allowed */
+	if (cmd == NULL && ac != optind + 1)
+		errx(EX_USAGE, "extraneous filename arguments %s", av[ac-1]);
+
+	if ((f = fopen(filename, "r")) == NULL)
+		err(EX_UNAVAILABLE, "fopen: %s", filename);
+
+	if (cmd != NULL) {			/* pipe through preprocessor */
+		int pipedes[2];
+
+		if (pipe(pipedes) == -1)
+			err(EX_OSERR, "cannot create pipe");
+
+		preproc = fork();
+		if (preproc == -1)
+			err(EX_OSERR, "cannot fork");
+
+		if (preproc == 0) {
+			/*
+			 * Child, will run the preprocessor with the
+			 * file on stdin and the pipe on stdout.
+			 */
+			if (dup2(fileno(f), 0) == -1
+			    || dup2(pipedes[1], 1) == -1)
+				err(EX_OSERR, "dup2()");
+			fclose(f);
+			close(pipedes[1]);
+			close(pipedes[0]);
+			execvp(cmd, av);
+			/* only reached when execvp() itself failed */
+			err(EX_OSERR, "execvp(%s) failed", cmd);
+		} else { /* parent, will reopen f as the pipe */
+			fclose(f);
+			close(pipedes[1]);
+			if ((f = fdopen(pipedes[0], "r")) == NULL) {
+				/* kill() may clobber errno; keep fdopen()'s */
+				int savederrno = errno;
+
+				(void)kill(preproc, SIGTERM);
+				errno = savederrno;
+				err(EX_OSERR, "fdopen()");
+			}
+		}
+	}
+
+	while (fgets(buf, BUFSIZ, f)) {		/* read commands */
+		char *args[2];
+
+		lineno++;
+		snprintf(linename, sizeof(linename), "Line %d", lineno);
+		setprogname(linename); /* XXX */
+		args[0] = progname;
+		args[1] = buf;
+		ipfw_main(2, args);
+	}
+	fclose(f);
+	if (cmd != NULL) {
+		int status;
+
+		/* err(), not errx(): waitpid() reports its failure in errno */
+		if (waitpid(preproc, &status, 0) == -1)
+			err(EX_OSERR, "waitpid()");
+		if (WIFEXITED(status) && WEXITSTATUS(status) != EX_OK)
+			errx(EX_UNAVAILABLE,
+			    "preprocessor exited with status %d",
+			    WEXITSTATUS(status));
+		else if (WIFSIGNALED(status))
+			errx(EX_UNAVAILABLE,
+			    "preprocessor exited with signal %d",
+			    WTERMSIG(status));
+	}
+}
+
+/*
+ * Program entry point.
+ *
+ * When built with TCC for Windows, Winsock is initialised first and the
+ * program returns 1 if that fails.  Then, if the last argument starts with
+ * '/' and names a readable file, the arguments are handed to
+ * ipfw_readfile(); otherwise they form a single command for ipfw_main(),
+ * and a non-zero result from it ends the program with a usage message.
+ * Returns EX_OK when no error terminated the program earlier.
+ */
+int
+main(int ac, char *av[])
+{
+#if defined(_WIN32) && defined(TCC)
+	{
+		WSADATA wsaData;
+		int ret=0;
+		unsigned short wVersionRequested = MAKEWORD(2, 2);
+		ret = WSAStartup(wVersionRequested, &wsaData);
+		if (ret != 0) {
+			/* Tell the user that we could not find a usable */
+			/* Winsock DLL.				  */
+			printf("WSAStartup failed with error: %d\n", ret);
+			return 1;
+		}
+	}
+#endif
+	/*
+	 * If the last argument is an absolute pathname, interpret it
+	 * as a file to be preprocessed.
+	 */
+
+	if (ac > 1 && av[ac - 1][0] == '/' && access(av[ac - 1], R_OK) == 0)
+		ipfw_readfile(ac, av);
+	else {
+		if (ipfw_main(ac, av)) {
+			errx(EX_USAGE,
+			    "usage: ipfw [options]\n"
+			    "do \"ipfw -h\" or \"man ipfw\" for details");
+		}
+	}
+	return EX_OK;
+}
diff --git a/ipfw/qsort.c b/ipfw/qsort.c
new file mode 100644
index 0000000..095ec8b
--- /dev/null
+++ b/ipfw/qsort.c
@@ -0,0 +1,195 @@
+/*-
+ * Copyright (c) 1992, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)qsort.c 8.1 (Berkeley) 6/4/93";
+#endif /* LIBC_SCCS and not lint */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/lib/libc/stdlib/qsort.c,v 1.15 2008/01/14 09:21:34 das Exp $");
+
+#include <stdlib.h>
+
+/*
+ * Comparison callback.  The qsort_r() flavour (I_AM_QSORT_R defined, see
+ * qsort_r.c) receives the caller's opaque `thunk' as its first argument.
+ */
+#ifdef I_AM_QSORT_R
+typedef int		 cmp_t(void *, const void *, const void *);
+#else
+typedef int		 cmp_t(const void *, const void *);
+#endif
+static inline char	*med3(char *, char *, char *, cmp_t *, void *);
+static inline void	 swapfunc(char *, char *, int, int);
+
+/*
+ * Smaller of two values.  Fully parenthesised so that the expansion is
+ * safe inside a larger expression; each argument is still evaluated twice.
+ */
+#define min(a, b)	((a) < (b) ? (a) : (b))
+
+/*
+ * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
+ */
+
+/*
+ * Exchange n bytes between parmi and parmj, one TYPE-sized unit at a time.
+ * Expands to a braced block (no trailing `;' wanted); the do/while swaps
+ * at least one unit.
+ */
+#define swapcode(TYPE, parmi, parmj, n) { 		\
+	long i = (n) / sizeof (TYPE); 			\
+	TYPE *pi = (TYPE *) (parmi); 		\
+	TYPE *pj = (TYPE *) (parmj); 		\
+	do { 						\
+		TYPE	t = *pi;		\
+		*pi++ = *pj;				\
+		*pj++ = t;				\
+        } while (--i > 0);				\
+}
+
+/*
+ * Select the swap strategy for the array at `a' with element size `es':
+ *   2 - base address or element size is not a multiple of sizeof(long),
+ *       swap byte by byte;
+ *   0 - elements are exactly one long, swap with a single assignment;
+ *   1 - elements are a multiple of sizeof(long), swap long by long.
+ * NOTE(review): subtracting (char *)0 to obtain the address as an integer
+ * is not strictly portable C; left as in the original.
+ */
+#define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \
+	es % sizeof(long) ? 2 : es == sizeof(long)? 0 : 1;
+
+/*
+ * Exchange n bytes between a and b: long-wise when swaptype <= 1,
+ * byte-wise otherwise.
+ */
+static inline void
+swapfunc(char *a, char *b, int n, int swaptype)
+{
+	if(swaptype <= 1)
+		swapcode(long, a, b, n)
+	else
+		swapcode(char, a, b, n)
+}
+
+/* Exchange the two elements a and b; uses the caller's es and swaptype. */
+#define swap(a, b)					\
+	if (swaptype == 0) {				\
+		long t = *(long *)(a);			\
+		*(long *)(a) = *(long *)(b);		\
+		*(long *)(b) = t;			\
+	} else						\
+		swapfunc(a, b, es, swaptype)
+
+/* Exchange n bytes starting at a and b; does nothing when n is 0. */
+#define vecswap(a, b, n) 	if ((n) > 0) swapfunc(a, b, n, swaptype)
+
+/* Invoke the comparison callback, passing the thunk only for qsort_r(). */
+#ifdef I_AM_QSORT_R
+#define	CMP(t, x, y) (cmp((t), (x), (y)))
+#else
+#define	CMP(t, x, y) (cmp((x), (y)))
+#endif
+
+/* Return whichever of a, b, c holds the median element according to cmp. */
+static inline char *
+med3(char *a, char *b, char *c, cmp_t *cmp, void *thunk
+#ifndef I_AM_QSORT_R
+__unused
+#endif
+)
+{
+	return CMP(thunk, a, b) < 0 ?
+	       (CMP(thunk, b, c) < 0 ? b : (CMP(thunk, a, c) < 0 ? c : a ))
+              :(CMP(thunk, b, c) > 0 ? b : (CMP(thunk, a, c) < 0 ? a : c ));
+}
+
+/*
+ * Sort the n elements of es bytes each stored at a, in the order defined
+ * by cmp (negative, zero or positive result, as for the libc qsort).
+ *
+ * Arrays of fewer than 7 elements are insertion sorted.  Otherwise the
+ * pivot is the middle element (n == 7), the median of first/middle/last
+ * (n > 7) or the median of three such medians (n > 40).  Elements equal to
+ * the pivot are parked at both ends while partitioning and moved to the
+ * middle afterwards.  If partitioning did not swap anything the array is
+ * finished with an insertion sort.  The left part is sorted recursively,
+ * the right part by jumping back to `loop'.
+ */
+#ifdef I_AM_QSORT_R
+void
+qsort_r(void *a, size_t n, size_t es, void *thunk, cmp_t *cmp)
+#else
+#define thunk NULL
+void
+qsort(void *a, size_t n, size_t es, cmp_t *cmp)
+#endif
+{
+	char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
+	size_t d, r;
+	int cmp_result;
+	int swaptype, swap_cnt;
+
+loop:	SWAPINIT(a, es);
+	swap_cnt = 0;
+	if (n < 7) {
+		for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es)
+			for (pl = pm;
+			     pl > (char *)a && CMP(thunk, pl - es, pl) > 0;
+			     pl -= es)
+				swap(pl, pl - es);
+		return;
+	}
+	pm = (char *)a + (n / 2) * es;
+	if (n > 7) {
+		pl = a;
+		pn = (char *)a + (n - 1) * es;
+		if (n > 40) {
+			d = (n / 8) * es;
+			pl = med3(pl, pl + d, pl + 2 * d, cmp, thunk);
+			pm = med3(pm - d, pm, pm + d, cmp, thunk);
+			pn = med3(pn - 2 * d, pn - d, pn, cmp, thunk);
+		}
+		pm = med3(pl, pm, pn, cmp, thunk);
+	}
+	/* the pivot now lives in the first slot */
+	swap(a, pm);
+	pa = pb = (char *)a + es;
+
+	pc = pd = (char *)a + (n - 1) * es;
+	for (;;) {
+		while (pb <= pc && (cmp_result = CMP(thunk, pb, a)) <= 0) {
+			if (cmp_result == 0) {
+				swap_cnt = 1;
+				swap(pa, pb);
+				pa += es;
+			}
+			pb += es;
+		}
+		while (pb <= pc && (cmp_result = CMP(thunk, pc, a)) >= 0) {
+			if (cmp_result == 0) {
+				swap_cnt = 1;
+				swap(pc, pd);
+				pd -= es;
+			}
+			pc -= es;
+		}
+		if (pb > pc)
+			break;
+		swap(pb, pc);
+		swap_cnt = 1;
+		pb += es;
+		pc -= es;
+	}
+	if (swap_cnt == 0) {  /* Switch to insertion sort */
+		for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es)
+			for (pl = pm;
+			     pl > (char *)a && CMP(thunk, pl - es, pl) > 0;
+			     pl -= es)
+				swap(pl, pl - es);
+		return;
+	}
+
+	/* move the runs of pivot-equal elements from the ends to the middle */
+	pn = (char *)a + n * es;
+	r = min(pa - (char *)a, pb - pa);
+	vecswap(a, pb - r, r);
+	r = min(pd - pc, pn - pd - es);
+	vecswap(pb, pn - r, r);
+	if ((r = pb - pa) > es)
+#ifdef I_AM_QSORT_R
+		qsort_r(a, r / es, es, thunk, cmp);
+#else
+		qsort(a, r / es, es, cmp);
+#endif
+	if ((r = pd - pc) > es) {
+		/* Iterate rather than recurse to save stack space */
+		a = pn - r;
+		n = r / es;
+		goto loop;
+	}
+/*		qsort(pn - r, r / es, es, cmp);*/
+}
diff --git a/ipfw/qsort_r.c b/ipfw/qsort_r.c
new file mode 100644
index 0000000..f7c0e54
--- /dev/null
+++ b/ipfw/qsort_r.c
@@ -0,0 +1,8 @@
+/*
+ * This file is in the public domain.  Originally written by Garrett
+ * A. Wollman.
+ *
+ * $FreeBSD: src/lib/libc/stdlib/qsort_r.c,v 1.1 2002/09/10 02:04:49 wollman Exp $
+ */
+#define I_AM_QSORT_R
+#include "qsort.c"
diff --git a/ipfw/rule_test.sh b/ipfw/rule_test.sh
new file mode 100755
index 0000000..d5ad6be
--- /dev/null
+++ b/ipfw/rule_test.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+#
+# Manual smoke test: runs a fixed sequence of $COMMAND invocations that
+# add, list and delete rules, fill and flush table 10, and move rules
+# between sets.  Each step is preceded by an echo banner saying what the
+# following output is expected to show; nothing is checked automatically.
+
+COMMAND=ipfw
+
+
+echo .########## Set $COMMAND mode .##########
+$COMMAND add allow ip from any to any
+$COMMAND -q flush
+
+echo .########## empty rules .##########
+$COMMAND list
+$COMMAND add allow ip from any to any
+$COMMAND add allow ip from any to { 1.2.3.4 or 2.3.4.5 }
+$COMMAND add allow { dst-ip 1.2.3.4 or dst-ip 2.3.4.5 }
+
+echo .########## listing 3 rules .##########
+$COMMAND list
+
+$COMMAND delete 200
+echo .########## listing 2 rules .##########
+$COMMAND list
+
+$COMMAND table 10 add 1.2.3.4
+$COMMAND table 10 add 1.2.3.5
+$COMMAND table 10 add 1.2.3.6
+$COMMAND table 10 add 1.2.3.7/13
+$COMMAND table 10 add 1.2.3.7/20
+$COMMAND table 10 add 1.2.3.7/28
+
+echo .########## listing table 10 with 6 elements .##########
+$COMMAND table 10 list
+$COMMAND table 10 delete 1.2.3.6
+
+echo .########## listing table 10 with 5 elements .##########
+$COMMAND table 10 list
+$COMMAND table 10 flush
+
+echo .########## table 10 empty .##########
+$COMMAND table 10 list
+
+echo .########## move rule 100 to set 1 300 to 3 .##########
+$COMMAND set move rule 100 to 1
+$COMMAND set move rule 300 to 3
+$COMMAND -S show
+
+echo .########## move rule 200 to 2 but 200 do not exist .######
+$COMMAND set move rule 200 to 2
+
+echo .########## add some rules .##########
+$COMMAND add 200 queue 2 proto ip
+$COMMAND add 300 queue 5 proto ip
+$COMMAND add 400 queue 40 proto ip
+$COMMAND add 400 queue 50 proto ip
+
+echo .########## move rule 200 to 2 .######
+$COMMAND set move rule 200 to 2
+
+echo .########## move rule 400 to 5 .######
+$COMMAND set move rule 400 to 5
+
+echo .########## set 5 show 2 rules .######
+$COMMAND set 5 show
+
+echo .########## flush set 5 .######
+$COMMAND -q set 5 flush + +echo .########## set 5 show 0 rule .###### +$COMMAND set 5 show + +echo .########## disable set 1 .###### +$COMMAND set disable 1 + +echo .########## show all rules except set 1 .###### +$COMMAND -S show + +echo .########## enable set 1 .###### +$COMMAND set enable 1 + +echo .########## show all rules .###### +$COMMAND -S show + + + diff --git a/ipfw/ws2_32.def b/ipfw/ws2_32.def new file mode 100644 index 0000000..3813911 --- /dev/null +++ b/ipfw/ws2_32.def @@ -0,0 +1,120 @@ +LIBRARY ws2_32.dll + +EXPORTS +FreeAddrInfoW +GetAddrInfoW +GetNameInfoW +WEP +WPUCompleteOverlappedRequest +WSAAccept +WSAAddressToStringA +WSAAddressToStringW +WSAAsyncGetHostByAddr +WSAAsyncGetHostByName +WSAAsyncGetProtoByName +WSAAsyncGetProtoByNumber +WSAAsyncGetServByName +WSAAsyncGetServByPort +WSAAsyncSelect +WSACancelAsyncRequest +WSACancelBlockingCall +WSACleanup +WSACloseEvent +WSAConnect +WSACreateEvent +WSADuplicateSocketA +WSADuplicateSocketW +WSAEnumNameSpaceProvidersA +WSAEnumNameSpaceProvidersW +WSAEnumNetworkEvents +WSAEnumProtocolsA +WSAEnumProtocolsW +WSAEventSelect +WSAGetLastError +WSAGetOverlappedResult +WSAGetQOSByName +WSAGetServiceClassInfoA +WSAGetServiceClassInfoW +WSAGetServiceClassNameByClassIdA +WSAGetServiceClassNameByClassIdW +WSAHtonl +WSAHtons +WSAInstallServiceClassA +WSAInstallServiceClassW +WSAIoctl +WSAIsBlocking +WSAJoinLeaf +WSALookupServiceBeginA +WSALookupServiceBeginW +WSALookupServiceEnd +WSALookupServiceNextA +WSALookupServiceNextW +WSANSPIoctl +WSANtohl +WSANtohs +WSAProviderConfigChange +WSARecv +WSARecvDisconnect +WSARecvFrom +WSARemoveServiceClass +WSAResetEvent +WSASend +WSASendDisconnect +WSASendTo +WSASetBlockingHook +WSASetEvent +WSASetLastError +WSASetServiceA +WSASetServiceW +WSASocketA +WSASocketW +WSAStartup +WSAStringToAddressA +WSAStringToAddressW +WSAUnhookBlockingHook +WSAWaitForMultipleEvents +WSApSetPostRoutine +WSCDeinstallProvider +WSCEnableNSProvider +WSCEnumProtocols +WSCGetProviderPath 
+WSCInstallNameSpace
+WSCInstallProvider
+WSCUnInstallNameSpace
+WSCUpdateProvider
+WSCWriteNameSpaceOrder
+WSCWriteProviderOrder
+__WSAFDIsSet
+accept
+bind
+closesocket
+connect
+freeaddrinfo
+getaddrinfo
+gethostbyaddr
+gethostbyname
+gethostname
+getnameinfo
+getpeername
+getprotobyname
+getprotobynumber
+getservbyname
+getservbyport
+getsockname
+getsockopt
+htonl
+htons
+inet_addr
+inet_ntoa
+ioctlsocket
+listen
+ntohl
+ntohs
+recv
+recvfrom
+select
+send
+sendto
+setsockopt
+shutdown
+socket
diff --git a/planetlab/check_planetlab_sync b/planetlab/check_planetlab_sync
new file mode 100755
index 0000000..f59853f
--- /dev/null
+++ b/planetlab/check_planetlab_sync
@@ -0,0 +1,22 @@
+#!/bin/sh
+
+#
+# This script is used to check the sync of the local repo
+# with the remote planetlab repository.
+#
+# It refuses to run (non-zero exit) when the working copy has local
+# modifications, then exports the remote trunk over the working copy
+# and prints the resulting differences.
+
+tmpfile=/tmp/chech_planetlab_sync.tmp
+
+# check for local copy sync: any output from "svn diff" means local changes
+svn diff > "$tmpfile"
+if [ -s "$tmpfile" ] ; then
+	echo "Local repo unsynced, can not continue"
+	# clean up before leaving; 255 is the status bash reports for "exit -1"
+	rm -f "$tmpfile"
+	exit 255
+fi
+rm -f "$tmpfile"
+
+# export remote copy
+svn --force export http://svn.planet-lab.org/svn/ipfw/trunk ./ >> /dev/null
+
+# check diffs again, output to the user
+svn diff
+svn status | grep -v check_planetlab_sync
diff --git a/planetlab/ipfw b/planetlab/ipfw
new file mode 100755
index 0000000..114cafb
--- /dev/null
+++ b/planetlab/ipfw
@@ -0,0 +1,84 @@
+#!/bin/sh
+#
+# ipfw	init the emulation service
+#
+# chkconfig: 2345 09 91
+# description: ipfw init and shutdown
+#
+
+# Source function library.
+. /etc/init.d/functions
+
+IPFW=ipfw
+IPFW_BACKEND=/vsys/ipfw-be
+IPFW_MOD=ipfw_mod
+
+if [ ! -x /sbin/$IPFW ] || [ !
-x ${IPFW_BACKEND} ]; then
+	echo -n "/sbin/$IPFW does not exist."; warning; echo
+	exit 0
+fi
+
+# Load the ipfw module, and initialize netconfig.
+# Returns the exit status of modprobe; success/failure (presumably provided
+# by the function library sourced at the top) report it to the user.
+start() {
+	# load the module
+	modprobe $IPFW_MOD > /dev/null 2>&1
+	ret=$?
+	[ $ret -eq 0 ] && success || failure
+
+	# init netconfig
+	echo "super dbcleanup" | ${IPFW_BACKEND} root > /dev/null 2>&1
+	echo "super init" | ${IPFW_BACKEND} root > /dev/null 2>&1
+
+	return $ret
+}
+
+# Clean the netconfig state and unload the ipfw module.
+# Returns the exit status of rmmod.
+stop() {
+	# clean netconfig stuff
+	echo "super dbcleanup" | ${IPFW_BACKEND} root > /dev/null 2>&1
+	echo "Unloading $IPFW_MOD module: "
+
+	# unload the ipfw module
+	rmmod ${IPFW_MOD}
+	ret=$?
+	[ $ret -eq 0 ] && success || failure
+
+	return $ret
+}
+
+# echo the ipfw status; always returns 0
+status() {
+	# check for module presence: each /proc/modules line starts with the
+	# module name followed by a space and further fields
+	if ! grep -q "^${IPFW_MOD} " /proc/modules 2> /dev/null; then
+		echo "ipfw not loaded"
+		return 0
+	fi
+
+	# Show active users
+	# NOTE(review): nothing in this script creates /tmp/ff - confirm
+	# which component writes it
+	USERS=$(grep BLOCK /tmp/ff | wc -l)
+	echo "ipfw is loaded and there are currently ${USERS} with active emulation."
+	return 0
+}
+
+# main
+case "$1" in
+	start)
+		start
+		RETVAL=$?
+		;;
+	stop)
+		stop
+		RETVAL=$?
+		;;
+	restart)
+		stop
+		start
+		RETVAL=$?
+		;;
+	status)
+		status
+		RETVAL=$?
+ ;; + *) + echo $"Usage: $0 {start|stop|restart|status}" + exit 1 + ;; +esac + +exit $RETVAL diff --git a/planetlab/ipfw.8.gz b/planetlab/ipfw.8.gz new file mode 100644 index 0000000000000000000000000000000000000000..c2db9233519c3f1d9716c42d886c11b8d01ca60f GIT binary patch literal 26396 zcmV(#K;*w4iwFP!000001I@i_d)wHRCi*$}6^OehkrI-oWLt5$pPuK)lAO{hTXG~h zRhg+idO#AS5R(86fRdS1{`*<)y6g)`$#SJSGnt;UNCNx1_S)(MI?`SU>98c22)oj(Y>5Cp-`2&5`rl;@Ue4oBd2IoyOxV%Zv4&R+5 zgR@17Zs?B*J&;a|I={>2^E4|bX9{J}x|*lUx|-G5f*ZN~%h~z+ z%hO9VL@h7VH}iD-DH$9s(rhwG28VT8ujYC9b2gsj)AH=Hz2a}^$tG`8dhje6yskgt zi+Mpq|B3po?)VvPg}$QkHEn;M(eH!7AZ&I@zkb?mkv>f4tL7$rbjxGf@g$$;E&BOe zURU%_ozo-pYG$J-SBnv?gzK7K$;)hn32F;&0e?!fdHad}smlC%nKcdlImt(>8U1^k z*Q2UIKa+KN{V~s%*}S;TU7M4l`2`z&zjO^_!sk`$yND;1o{v*m8JX2RQ${O`3Qfzy zvcvva`l_w`yU}~iT^6q7wkLSkgF&~Qp$Ki^3;Kt4ufdd_;$Qk|ku@LZd8sp@FK)B> zDu1Xg-Se=yWw*J9-VexY`vah6KOx=HWwFFoeOl$KaP7DaI%v~k#^1`H{c713RoR3@ z-01wKMLwV0(`=Wjx16hRdxJGEv-T@SWiRUar_=q3IKp|A4Co_wv#fJs8)Ny>f9_A` z*^C-EAIty8FIjt2W{aHJ%kjzO(Z%WetJCwd*RnaQQklbFBhZ{0I0W@(6|=aBu}epn#qb+Q~yXJE1FNzX;h9|{zN{y6lx}4Lj8yYAt2CZpZjfqD$Ej5cUd(}0cvhet$KOcU~?E*D8t`V*r@iNg>o6%13Q|XSQJ*K$b^zNphHKW0n zRg3X1Dk7|cey47QX-s+vCd`9Eqcm=vqinA&O4C&1BEv-QiuOiMR=0ayl)S)noQkjk+Wx;bQ+Ket$~|7){f?|sA{7jKP>7Cmt&ufNSj;l z^I}o7i7+ a&tO*J|Mwli!%G=E*3($!-feO1-og?@-!?riUxbx4nezc$@P0?D{O}c|Ch&CBr(y}kBx@~stFgVJP9gr2EO=voO^mJco zj5)I)ni_4)yqM-VtUYEhw4Kxlox0Yj5{zm*UeQr0$HKjY?c+gNEMjgCtsXk>@wm}J zyIvM!+SD~TG@ty6wl+QjRoS293ueVtGS6oPjSu}87p}_rdXQY^x$NBK=k)UG@ap8v zhqvjQ(~Fbu58u88dGaqOKODY$|MujP9%q_th}MXr_y{o_6eW(g@;hZ$x}mAtQ0CPn z6YCM9G0L{>2x(#UZLNvMXm50pr}lqxCxP~`~YlTbmj6f<)-WnzVmS($_gd60dzDQCk8;kgy_R0V}hAqlQ#vo*>k&> zIyP>r=-PpP;M#FoLcG3*^QP*_b;kS;^hmb^ABB?l^4Y$>$%Vt<^5~gxLPSd^DcU)_ zoVC?Lj~0DbELU@!8NAv{n-%qzH7dXN5*71%B6rI(5Z#%RyW2J69YE7`5r8?<_&f*R>Dr5GiM5+PTfCnMbx(!Xgo#bP-xa)_jKP00ub*7hhlCq^bMb|eWQ-NhmY zH^427FGkg5E#f2AeUO|=+bm6wQ`hWRM4ucZD!N9cQ5XzU$JT6EuFB}vBP ziwi_ut)yE!R_T*sDbi_Hu7xVrPl#8dljQD(nK(^yQqgoVd8yeuFBMcul# 
zr^#`}lOny(bP2kyl=9>YJBcI}^AQ+lbOIor5t%KTghsO{8eDpGU7l4eAvisec28uM znq|(C&H|}2Tf`?5_jM_1eU|o<74T4~q@Kkae>eaZ@+&QjP-)79>KK zZT8tNHak*6Y>lT+qFcFgph&3rv@mr-G3JkNXBe5vM6%uPQR0)IRt4nrk=$|m43?qHJ$L1dhdSNxS}V(~p8yn0+0%Qo4`2eSc<_D`f2-u#$8&D-(Q zdOUDq*e;EadZAXmjsckl)I}RZ4X(YaK|`-eF$8nU7Zu$PGp6;kM zY!^BWoe6iWlvI^wVr8Cay=aALV&p^~O*SL``#P5+?r%a6IQO5crm4@;$)DY{sj5>n z+u|1bR-%L&Eme;hhlzLF785i~&5Y+Xkx63L1wEPJ?y4gtS?+J`Sho?(;&6<=?4;0i z=}6@1UA3BXYbxImY1Sg7*ZG*lgW8>j%MO>(o5k)jyNQ!+Vvh1U=;U*{cXcNNuB_6@ z#OY|Xz%qpd{RPY}wsyrsg3Sv!sx-a4)f^JG8vCc8-mY@zc((Z|mwS+QR)sdx5je$A!?|Ob1M3Sv75Np6PT8I{+g-nTy<1k$BqVX~(2hhl&BQMloI}LlE7vXc9+c zB4Vq)tbAWhHasCj>lazdQd@# zI8~$%%}6u*APhY0#@T1s?l@j9(>>85=+okPy2s){Xdq3TltpAws}e$tofei;+9*l=$&YR@3#8*FCk5(#1(Tew)5Oy!tl5h>5VWT8w3Z(d$TWZEyyfOq+cq z?x1{=4S&Rrx2`8sL<+$cSJb4g@>UXDe|TP8OdTA=4c884PVJvq>`YJU`-D&m#j6;?CYgiPPI zDzow#W200fX1F=sNw~_Q!@?v=3}s$?^kWm7Ykpq zxK=T`F>XS%fP0cSGkqkv$MI-CE$%at#Y`D;W_(bjQ&Yt$;3NbrvYBR_T}`$rGJ^Ng z&Z%a>^6KL7&70Gs^zz%o_a_(W&hdwL@BVUjar+~ybvhz0@DYklPOOf(|GHWUgA{^dX#?D9GGT&H z@t${+&Ulhdh@P69Rx~HYdu$3$B;U3mkHMv8%FMP@Cpei>zFH ztHQXA+bOt=%eb6`Ir9!&rHvpw)ry8VAY*#p1t`~UsV;P2l@ zU2ORub@9K8<<7zGby3nK`d`6M`n@O~@&ggWfbO)Gz8~nZHe|0QFg3-;rHe;vSe_7!9JJ+IUM9!+M~#5eDUgr! zS;b%SwH_aBaFkovxkhxBaUBhr)}N=5A48k*zPBXsy~*iNPf~QZ2)8_gTEMmjzQ#mM zoz7YO_9N2sdqXmkH(VxAqnArD1&e`~R;*~X%StX=uvJ(B=pR)doY71#M%=M7pc5#P zkYTEYcSNY*u96K8=iT9vj)iaauJaI^A;A-#CU>BPj?@$ttiHCcydrAaPj5PXxSwJv zhj3@t)bnC_U|*%)T+75FtIZ0HXS3>F+s18SS7S@5GNxWqu)i@EE^NM4a7vY$_$@Yn zu*XI1qTW3lxno7;72OJ)86At~P;>^8Lx}^Hth=aLL51{81+-g|CMLX>ejwBZR%t9! 
zjEP{Rf~)Q%ZD-L=k}L>^c(9%utC{Zh66dS3p?SrQzwV{jW>Mnl%cLUKoVDUhpdAw< zJ{z)aYOH0Vkwg@6SsZB_F7lIw?vHp+MdX5S!TBmrTZtD~HL63y9o!Tn2_?YEm)j75 zO$*38#gupyZfiQ-JuJvwUBDPl11rR&;nnWM8S!DmOS46#O1<--%QZO=+wn(I!m1=v zveecC;$U%$zRf>pAgfmo-g5uuIRf}waGY3^^(;8y_y-6@dqW)o{WTo2O`~J$m1}Yz z!4IAG-L`LOZU4U3u=!~kXbWuV4 zj%>I@dVCi}WT00<5^f6kZm=u60}GvCZl=`k1dtKost+Qh#bnGSTX7 zF=53wfl~%fimrk2#-eUJ8i>pxNW-dikC$l}o*JuyM6hkV2O)M=9-^83cxb zxnEto72C)O1+7{p1I3LvFg^M3S!NpTe_}F4U<86e`^Az-cEJDd>HhQH-u@r>!`{=E zk9t1cc;U%cyzxX108uEsap{}$i+6`t>TF||fH=#Qm1#VdTLPVJoy}%Mrk%+SUVlvc zOm1O-=e32KMd6wBPv-|HMiQMz-2NJ$GU4QGI?2?VD9^$)@BYfzst|@I%1<0sN#APR z9D>wd*BmDahv8487tz1?ZXCTU5o>VQ(Mo-E?80C2=E*ZNtut5H6&<=>#O5y-^|Y9) zS5H}6FSu61B}V97$Fce3G7j<@q+6m&p12+P;fEj6zWN5)Q27=Nu)lBDKAT({4YP8R zAs#$^fBT$Uq(2VQ_o_=KS4Z!A>4#(b&*{-S`fuKj#RB@6aq)x4MXfI3u6sDtg7M^4 zot4d^kmw0|DF^3q(SXV8HM@CK{zYH>*1Mt^4`4cpr}$?QohANfz05`XWmi!zon*AW zbdwMREoUuBFa@ZXHd5rLUP9N*jQKNLx_8Q|?6a$g`jBNNx#$WJ99LEPmZNfc{Zbb~ zDa(0~o_ub>K@t5VvV8JJn^tY%+iKY#t^4$kz1&MKDscsUE3foyrEC4ZZf_>_u$R88 zMzGUyNBrGB?a_N$Uu$aM2l9qUtbckOtQYNTXTk6d^Tu?C?yz3^=9Kt1H~xi7(C$v5?Zy@?JTG6!qzig zv3W(s4vu1?7aHAi^#*uNleG(FTcR%qe2Vr}u5VRVpdnz%-^a%p#PO84fE;TMs+mvV zZh)PPE$vk`h1s^L0&eFdccrjw4kwQHs?Am;=g{%a)tT3@O_X|F&DpVWe~d@xiaBxq zDkyi|eRqiq_PE;1sP9>x?hOVUK-u%&`Z;SO>nb z&`73Lo!_&lhpV<)aHL4Ecp3r3ebPv2Wl5r|^Mx9Yt@WhjFhp?+2d|scf@2!&G<58s z5+kpsyFsyp(JrI0P)FCa)DQ)*p4vKFeo5={eo=F7o1SQr@kl++cjl^{!NIhyG-!R1 zeJ&O&jhR*Vsg8QLW766_O_`Gl9I)`T#MgxxZI!zoaaw|>qr);B44&;F&Z~9WZWKf4 ze83=3F?b475W}#=#=@*avze~SNyfL~+}V;f>R-a*oLcTe<18w{csxXY^LsQLelcB3 zV1h=%YdjkNMno%d3lnuAe|l}MlMp(%cqW2NK(6m1ZTZR;Wd*e*Cq zLM5GP$HFp&R-LXC2T!IuFu0iZq+xyRoANP@_M-kO-B<&=gJ9Ht0q4!dE zA3?sjj^>#ytDH3Fi4EV|na(STiCtEOlN~zd*S%fu@CmFdOeDd{VArt-aE%2{CPcIo zy1ezyozxJs>STw*=gPmXO7Uh%L~WEhlI|u#P!f*G3Z;0M;N5fXm#n>MoEnZ8g;h&} zc%K&ql^)z3l84&PxVr#!Xl#6O)`k!<`{?|!dv*2+_!ws zG$iTjL>jlR59?H#kfD=tsRMF?q9|I#%&L*2=L2r+%p~;lOp{<}sfqQ{7bG_&#&wQw z)3@h;yuLg+`fzc2^_PT1es%U=NwCdKL<&SIvz^sqjW)PA5pai#Yy1${j>gvPB|8@T 
z5JeK1+d4{9SS*XVQZtY9^t}bFU;p{!;`RCE3DJxBK0QgA9ZP2p(I~6+oMkQ1M@@Ge z5t!yfy5Il=ROgd$U1e7i$3K~il~w2GA_|feYso@9b>jbbZj;9^*|jt=fIVwewIYx} zfAu%%ki(sB?TXip;`cOA157;{P7R9~25C>b*zCSu8pH#WF`ca1UGDaz2JxExJBY{6 z{o+ah8r?h=J-H>YJ3NrqI!D5mmcheW4SWhi@|RKQscgjWDbWin_6QSqA{wRjbCC@0 zSZjcW&1s4l^+W@Rci>N6wmmlHsJdn01};ZLPAp{NpL0TC_HZXuX_baK<_689Fe7eN zhUtRaq~+)CDkM>v_tJ%8=umNJ(JalbQ6o5O@(AWwWHlYHbVrDR<6$`3AE5%YZfxkg zBVAxObz!+9AKswaI^U7rD*&M#WKnM1784G7H4{9W|JB}_wzU{w<-T#2hDHu^rC~ag zO0FonU)KFm#+Rk(I0Ns#chdY~rO9;ArL({X28SdKkH02rc+$}M|G0kABVI}J4RbeG zRCwKRH)}K$+bOjZ5)HC$wp(ifmUIS+Ibxz|r1UKx%WNL>_z{U3J}6n5L{Pw5tX$$0 z3izLTcWSWjM!BQDMu*eRygeH&Ge}qMIk@8OI^_8DC}e^|-PO3P4`Ie~Ba71gJaJ09 z1iFd8jRWKA9JxLX9XYpN*^E;~@{sU}154tXh4WukCLV2XB5px!hI)8H&Oti+*tYr+ zD@>~^tHs7{cCW2nxLPijuT?33bzgA{)~rV2Vor86uniGsqt*19$MKE3J9i}wfiF#> z9FSSVc3z)DYka}tilT`kk0v@gJRp<3P2?^9mP^nZjc-(1caF5;al*7vVYIBizIYk1!jO3|$B^8WAn&eDi$TV${_NJ-HQ#?b?r@ zKi-R{fB5$5PtNM;uFa75OCQKS;GN~JAr*Jo{va?k@p5p}%D|ZJl((78ZtQx-wDE-4 z!4n7(fU^+gNHiwqZn>u0(rfZKyX68iToZO*1gRKAODhl^$ImWkFA$9`oS0MZg2NdG zzxG_Zr|NCOI9P*~%j&LZa&t&Q39EU+$rF_ypUA@LWx=MIl}7Mlby#lmA^x^EK{BcL zNEvbPBDOxOv*h^u#_uEPT214u*9i@or8PNxNLsT4$RT&R1Ol>>smz1n05HrA8k2!> zqD<6DN{3^K6)+A?FkMbHeDF<Yu!y~kXq6q!3SWmiksY)3s$}9{P9ZG6WQ~vr2H7+>xvWrQk{!db zxWl1gi^#nZ)tufLiT6T?TvZqDcv3WDqKBeXOCkx!H49sMSe>;%<3%h3QYg)zVwYMU z80k2ybS~yUIzuJL7R|+LQ|6M(z;ZV!;%pYY zndu7ClD4rC8jRlJg_36G%?#oBKEmHkIO;G2D@x{bAyZvQ2wec%t1Z9v zEvwd0^nz}BqUV9TC!A61MAXl%t=veWqmF`k$kCigs*Wa)W6eYTeB61ygKE*4Y!n0> zWb=P9SKqHQ+X6l;Cz)lYhu-T*mZ<7BQODdvExN>zO+6tHvC!B+TTQ<-i0Rz~uvFq$!&r(X!wCH(-nXot zDWXtM5rw$?To4$0EZB^tf*f zn-gTV4!tvuWQV$T{q3?yIx9&~m^eHD8>87r!#T?6B#F@V6QmbbV0@YK5r{T^0+GAe z+rv)Z%ajgzMnva{>Iq>ir$o#3_ucyMi2GJUF(z0l34r6k!Cn-c$6>D$0M4Os)Y+AR zKTvndqJ5sZKQxN=(8Q6lV1KOtoxfOE|@S}UPVSQ z5rtQQQ>(ga0Op@U z(!R@yc!?7BW%A7c`wsA)kaJmApVu5?BiZ4akXZUG8xTP{Cc(N|c!Nehv3$G8%r28# zN~F-yg!Dv(fe1|RZ_Y2iKfE}8k~;QH^JfD=)?Sx&@*c8FXi&n>3~GVQvmG1_GpN{# z@r33U$?phj!VyerlYNzhDfwq7SKpsse3u>`A77kYULup~?D8#VOQkz!hgZ8UXDUR0 
z+Q3sVus5(a`giN_JK$!6<4=+i#xofaR}qPhMFO3|B&QRt8#?=H#6ksK= zJK1Yhu4* ze~HKg!K=vBevIg)DO-Z85hOuyV3wH~T0vQWAy$78pq!nHOCkzd?_&H#8IIpL_`hpR z8X)e$5xp#zT+(kW6~mw^aXYGQJ8U&&_*v)?Nxvj5i=`%r!JHxn?7MFEfO zD@nisW~K0no=*qu$y)UEk$i&qR{FN7$AImizv#XGsze-Tsqg!bk`mNZbJm>YS!(*G z7}=xlQ(HAW0-@ci(Se9LZsR2qDkq2}HC@(!(ZNa#cxd4uMstK_5)w>XbcU!DJj2UdM7@>q|SoCo(5PtBYB<&FD0Ih_VK zt=M1dT+MhqYuB)t$`u+?Y)N9pf9M4Eq8yV41%OWYf}j6P`HD{du)|c?yS!H+4O5-Y$| zH3*qoX<{S$oCWdUzVJMu| z7AM^@1`8eAs_u_S5&!6*e}S~r)+<|1lC)gVv70)LbMT=que7 zI|-+{{XP8$Uq?x7k0YM%Rw^@}L-XoUYw`D#vqiBaoC*H7^dHfhFj^a~qdBerTzI1& zo}r|Gyk3*qRI6`wft$R!$>;M%BMCLyRlU27`gUDQiJN5LJk2Jz2B{&5dRYqp8{^-k zKmHh{48vlQaHtN0gGjVv4C~?{Qo4tnskm^*!)3E)_nhX$bbT|~)p84}*e!`@C$>^ZdKeI@{cZdDCFaH}iQqJkb=3W$>=LYKhro;#4|4#MXSwmiqRGUo0nP>hR-H25=Rp4^-w8}PIoh^RUf zT1N7o$o*30ex-qCCSEyf=(#icTdHKbrR71?|NVQedA2*52#0pdq`RznTMD@1lnVG8 zupKyf3uf4sGdx&`b>^ahQ4-GPS;Z$o&Nq>iX1%Hl%f`kZgK?!uge6|EL4cgWdFbuQ zJ4{6&9to^vJfPhq0B|<%IHGq4t_jCf1BNYVc*@UsIds_J?2XY02hG} zPT`$s?2!{6NfPE@vK-22Lc3axqxqQ1QbIn$iPb?6WzxPMh5#}>({rBgzd&R1X^2mr zC;>r-aaiJ{)TWUqIroMC+}r;{y0H$gf_3;;E`q3iv=EiJ-1+Zb61S_Qk%BbtTbEb+ zFJ26W1OBfp&=Av(z7AGuZ}2<%`F9}*tLZ7s68TPpHdXf7@-SPGyq{on4u*)dyf&{5 zp}KCJmHGklQc1wd8%eYLtE=|`U#;N43eG!c{K6oZ*LB6l)Y42I0b_t3@G7jNwb?oD zJNkm90r`QsiG1Y*Q{^y_Ammijai;%5_+)uugXJ){`2xjw*oI$@a_%@nfx}+77DwSP zfg;H5@CtVthb+UDZ!kP%Wf(M_*C1BdY1-KaQzK3hmVI^y$;mulgaHjak`2WKK%Bx2 zajO)fGjrgl)g}lj#xzFDf{|P*zM+f1`HbeAm1np15^d@-Q*e_?-j+>pUobzO?Q6(d zi0GUYtq*DC+BrPv{pS|lp+_cD->b!FNR=3=B0>F@6NXC3YEJ-C8bh;9+@&B^Jj3U9 zmLN8tNyeWdB!DwwVK`j7<%kBh&~R8MZUI*B>x`4gSuiC5XN9~TQOUtEJ_}Ts#oaX; z9uqT_tlV-v(h@i({QU2G`!9RXUi4qQ=)L^rA4$lPyT1ieLwv7>U<s5coJLHtM(S9x-Y z(;AXnh%sLcyKa?Ozw_4w5925KhnDTf-}HkpN&M+=jxd0-k-#Wtf{+W)rWHihdGkC6 z17Mv#9KYZ7HH4h240rT;gWZOO#AN7dyH~c`bI@y(WVANk4&B?`3|bPajJphN+seED zG{^iCj|1bdqB2dTgz1$B5m>v}jh*2#P{_2)sJ_LHU|^cnMiGsgbl0XEB0!)6OYr1L z26b-~`pAWd;b+t80yoXGpEnNe z9zN%NZ>m0c9F+YNPlVoa#i__lGMX6R&07z;wX86jhS019c|;w}(IMDg=<0!K?FiHl zk7~ZW$w+tupBDGr?w72wPoJ6xR&2lmb9h?PiD(rb*)ho$-qb93FbtI@!4`?Et!$+m 
zIJGZm7C@J%O;@w`r2r5jpW~(sw2^67AMUfl6_q{YbV!+f#`bv;0A0?S#T5sZNA@ViX7JbV=w=Y<%{|7F`K#y}>+xN!H%Hn_fQ8srl0D;UQS$@z z6@8QT*i&e5jxkK4Ju0aYy`+Pg7(bj|p^c$?hXEDor4UhZ+Z+4Q8~gOeaaeXa9`pn? zvoL7EWXx8ycx7t|w1x+%4`(;2sOs`?UXu9GtR&=)GEMyf&1xlj^b(;f>6 zLDTOB!H2|MTMUfM4A=^BJMlJX(2o%Ehz%+_99-{9iBNpU+eIWzf1+b-h@l3w)m;QfSZ)J1%PANvGDrtO5dfE`prqLGzy8i}7N~NTKLmAVVD?2DqZifF1H^$4tnLOEan-6&wb&sX~H^i5N$| z)_URr@U)2?CCgo}PNq(s9Iz;OA!?EVzqyzjc8xmL>48!`_sqvPurx2{YdinoT;a_U z1hLv9&tOXzj>zb(H}=DG`oZ#B?7J8Ao%TWxH}3nFcoysx#tldhMZ;!J`~4gLdsA_= znZ(xU&wCgrOfg{c5F-@l3WB&kvB%uf&dMnBzwGO~QckTiu)SxUUxUTj!wf^mTKC-- z8^4EdAEYkKL`*W>*Z;s=yYooc!poRe4;~=au7KikaJyD*^^*MGK(RKrk_53T5Iwg> zK05eoUr_@!!8zKZbkVG6hKq)+v`i=w%6!#;WB}baC6T}99@JW@xvG^}V_F%8bk-my z!l&C5OJSJBq#yY!HYHV87-OQQcU+nZv?;-46T7suF_+LAL=Kx}Q?b%4d;%AXM!!14 zO45y3638gvsb-B2rU+J8P33G`5!04c7ykwa#;4iq`nQ zes*>%BDRHI-nh#3j9}+<9xk-k6u>MXXFE66Y&al+Jf$zv4G#Q1EI_TcZiH4hHLSZW zof%gOhhg*0 zh6_MTVKQRjliYQZ7FNR|z}0qGvC1qkTMjzX!XX+o89ye0f~=+D`OJWk*@?_jBrEMK z1{!vpg`$V?e!LBv#ffCfnqC{uTqEv_iTb@n#mB=q`ZBjSn&4T+kWN93Z)$~4g5}200(xt!O@46? 
zZ>L95#$=lPz*Qak!udK}lZs(@c;Jeeaqw`B^jaFp_ zp;84n_teoz`Y=0UdF9c}sBKwqpntw}BdZ$^p6^=~h~j(L+W6OBRT|*Tk!z6^W#h84 zSZK#ZCE|=tCfZy8^73JcpghZmh@zO7Q;a#171q26%0`IX5N$^}tfoLJlhU?gXC4(3 zE6V{Ut{6yG<>;QAm(%x8nV)zh3MuX8;%8m>`VHHF&66}QraAB4juxYwn+;Xs6}IPq zqguUjF4qEs$4-go0fLWl71iVKoD4w%7sF$CoMSfj#cly;ZMfGp6vSZ$vrBy|1PW^LI!8|eX7#0%Cl_z;Y7<(!gXBnxa0AeSN6RO9%9VFH z9og8UTcy8qjQ>x7UP=FVrf=%V!q^6DvgKxj`7N zTC~+W-@I)GG_ibb2&r^ zK4rHWj3Vsbzp{9^4UJCDp7OPghz#24o5}^~=ZMzY@ChY{LB;8(N!N4Y@vAq76@h^U z!7;oi&z^0(p>;w!Dru0&gq7YKzSJR=lJ z0tsX{Dp^ez<6;DRn#O>@Oe(4Q`krO1!b)>H&yc~7)D`~k0m&7AmdDhgY!xpMX-(5v zup1etBeRb?0!QmS-NVnjJqyxZhwu-}x+u*t8@}&hl6Y16XBHNq;fvk!Hg19(xV4Is zb)Xk5mV%4o>rjXWWpc+R71%}SH(lfGyr|+JOUAc!_k6}h{Sz=K>GtKkgkt&weZhZd zZ0zs!^RleuFI*tquFaa=^WfKc2w*G1JdCd10M^qGE6coy4DPhBV?lu|gDMY(~*c}>9QwCj>1$ePZu5vn4DF>>XQ28^Mm z>ugvCSnQ-zuEwI=umg;QReIoCs{|)?E~P@8?{+6#N=~J6eZO9Z1zk|Spr@IrA>zs+ z5-$YbowOdVc3@o#WJI@bDW0`0FPG5~i{(Z+QlOtR$>o!w5M4gmdlIcPuauvj!KLOL z0|7K8x%wZpuR;Nu`_LR&@Tf`2Dhu1~HZn*qtI*2ol2>=kNJV;=lxK!nVV4zoNaP!P zY|&f}pmZJLWj?sOuUy$T#b=1IWdtpagI2k9(+Y~#V^+FBV#MHzn>jep}z1##%Pai5gjvxLH%#oAN0L{ znf-ywIv9MYuN4s(OFh6h5b#4NW2Y9Tz<4LF_**_1E+GJ_Kqexh4{J>}B4vM}KvuW8P5wzr?tS`-o$VA>-Fht0 zaM%&){OE}O`HGIDGV9j%(O%eo*9Z3t)9Kgw;({`=F|97>CfczoicK!e1zAz*ZPQY; zJBYN_$wvVU`d9;6CT=)aZwh_XU*L_ar{le6`_EGEm2=Lz@mPL7c>X+XpQ0CUm6Isd zU(|!Q5+626R%&Ygc@aN&-v#_|W7@wYi$~Euae?G+@DD9zo!_Du9$QhIWa^~Rtb!xzOPDJHyux8H0_fu8#EuCNVmxhE$5+g@H{kKtK zrXi*(XvI$`3_(vNel2HU)oAga>^oBYBQ6>0#^aNC5DaE?YC*=(nfhxR|L&A^Zk2>=`be#I5q#Gy7{2heMp zzw8)^I=ME7WMd9ZRK)=x$DP+L z_3EaBPVzRy9pQc7$mZr`zim0)y~O8`>MGx?%8AlxgG&YO=9_8-PX^CGliyiFs$Jy3 zic{bA_Fo3!G!Y<5&to(TsKx#tgmSpLwcxa{)k;&k)KyeQxD4mZjRarD6!4ka{|xrU zBhWsAj>W@v8h_cXT*3CsumCv{_xVg?ac~r9f#N}MDjTnv97ciiQ}rq70@?sqBaXw$ zqxH=0hs=p8ngsq&Zw(e_oi7g}fS!|Fr{Jq7Y4z4(8Uyk?yc9#VGa5>dSw<|`J2N~W zO}Us%IQTiJ)7N=I%R@qH5NW|&D4(-MQTn)LHbVd(A7#4z*Zfga{Z6&?DUAKRpwb}t zh0Bf0(u-_Td~ryKt}&d?z1$Y6VfE)4`j?O;O7OV^nX`C{<}qWLK=yO1cIPR|o45 zxU79;b#QG%x3Y9q+`@NNhqFz>WKVH7gkYo!R2I5DcfDbIW5wJoyJ@K%N4;Y8Dh71c 
z_P8!WPpSy?G(E2eob+A*Qp`BNJ(SYuDWrPRT{>-23YensWiKpys=snzN>6Q`viE!f z-Dq9Kj!kz7R~%X^4Jz}N8H!yjaXB_h25Q@mU@q9_XrK)-RMyE4hwt9w9tulOsAt~Q zd2@TH@jSa|mjz8P?mn(aZ8I`4bu|C_w{fi7dfgB70p5m+)3(w)~uQ7a{cFu^AHq?Fy<$TMc-J27P|hl zD#jnF-<^0IyJi9-hSfW`U&uve{$h_3|EFsP4cpm&1- zFrP@)ISMN)`d=ON>z$$@1@%d_fRB1^wQ+-NXf{BN2Fa05N6Pc5P%I0<5QFo%n)|*G4e_qm989>*e zf{U2a=@_dbzR1eeZ=0`8q!Fd8wt6SYW_?5kkPHxkF~jaPYv9;i$!iiuUGOeGmTn<{ z0UuK`Yga;@__yyWwp8tTX4V@Hd5 zm2qQ7$)F1%2&_V)k;1;7ugfWr$jqRyyJ*abIa1{EGGJyL!3j~&m;*R3L*yYHUE)V4 zsTy8y&b#$6(_R?uZ02;SX=#>zXh~-a9l0F)|T%A-S(Q;Uo4*4mI0T%) z@$y@s*1bhbgZ=K^y9tc%R>L{sW4=wY&1*=ItXcSQc#r06@y?k`kqkInH)p4;(##{t zI6>2@hLWvFce~Bxv*zmDxEQ^8iq^a4X$bnVG#$mnrPu98lQ!}EY*qu_gq8? zwrHLQE6eU4gqSF)yNH;AQCPhF52$z`&**ya>;if7EDdG8)~X1KT5er3osgPbZVTOze92)w?yU!G%CF!m#uYp_EXiqsgMR!TU4q3G2HA(>iC;r&}=`~tS?pvcT ztd4uJyE^ef7STdsE%};)tJ83Jq_m9-=0$HSF9BnGc5G?K*hs;71PhhxIk&@Sc8JEO z*COAQmw&lDx_X=b`S9XYtdI^BS>5jvp$)QEY9!9pk+n=4aHTp>K7v$-YNEFwS9`*0 zWt>Cg>+)JoiJO}X9#W1Eqq)FYj}XEqLKRD5cIk(caTmXvgzB%nem5X^v7L1J2HpFQ zEzAhX#4xIz2D{=r5vi=DFv7j#J1t39H(w;;F)r}6h_P*Kt64S};XvCC3V`{Nm2}SFx#M9mP@LR^l(c^k4Ax+7-h6L?_ zNB7&t_qRJpuWnWi4_JEDK+{HQFOHWa(yzz#ce2dQcg{Ef8Jr2Te3u0n{T_=~j-Bl{ zwie>js?~7@*3RcXS_?L~`lU#Z)wsd7XSZFwV6dA1WM+oZ+VQ`z_Lr^Gps23-U15kj zFLzxHFs!PDB_)a=4-$L7{Z>}vuy9@}FkJeKwJ<;Eu1dTMr5eFM=0TqjtBhjI0+#M% zh0?p!;J_^b=E)CFx|M>osB9xjGJEX6rrq7G+^WcHm&s5{kna?3I3G62&5V(lAFQ;J ztT(X^EHepfAa!34_1{7QSx0mSsr+hg%XqE14s5dXeFa^Tvmo9;(EmO!W;eLH_|u=z zBn@O&Sl-%WgC{UlVY3P}$TZ{^O5HB#Fxzlm5`im9wd3S zsCBqa2*adbIsf#M?)#bqd=w$p>p?RH<@oE2+4BNXZnmOrrE6102xBU&M^oE9T~a@^ zoZ+gV7=~?2Jy&gr}59{<>VF|2##{4+t?tgiuQrBXz;z#tw% zzfSvVz;_swcNl};yq`2{CwcAppF9l~?Pm)I`n2O=oO(z1h1#pG9Ip_K=V1*zL$6E1sIf=bMTv86B{ z@7A6zHO=(daQGdMQFINCoC=bZ(oQ~|k5q7=Zt7NiRB!F797;znhQ=k6jJe1X`dev` zyk@%k{$*f;++RJ1@lisM*4bP1L{}Tn6yw3`6HU7|PCMNr?G; zOm=+0#@=wD<{&NCm(Hqc;`kKnNh=5Wk;sIOKc!F1R`>1y`p>=LkT!xPE4XF_RFHN8 zajxn#-dnKy57Hg1Nh)Hw@eg}0k(KLI)7%GPk*}{iqXaJa!M~{eek6KBV~x3s z%ux@X93<~-jfLf=J5f5mIG~Kr>ZP?C4BN$MCDAQuSH6c2XU066k&woU0{&y>98LL` 
z0Yn6*7t^7;|1P8U^k<2Va;6cvzT-OF+_q4hzGTHS3{Zfv&G!ZaqJ*A82>2L{J-?N2 z3XO+oziv;FmzVHusSY#}gXd|Ce__031Nvg4c(79-FoN8VE}cD%-g^AsjA*ND+O@U_*cR#i@pm^-7EEy>sfy2ypAExKaG zPyG>QdlJFZ+y^}`^NXo-rUuPeHJ0@8Dd$n@ZOYs#QQ32|8x=(F?I+L%*nAI#3u_ch z-t)CAkeKU*K}E3KZm6=bJ08kPUeRH2BOCmfu2pcf4nvOBmR-@Q4xMNm0Z-Ig3aEp) z7@+`kK|6dRi$~Me`d9wTgU+`;3v}bl?xP_-$0@2b!2%X(YcDbtcuEYrBeP^-Z zC^5|-(!FH(|5~Hz+{GngO;0~2LF}El>Z$j#F%P54fzti7Kw0F~lA}^uyDBY(OnPsW zw0SxigEjyFZMvF^S03?RI2AsEa+D_}!=uVY1Di(v8Rl%rrrBIPBaON);P}4FPlchIBXp=eAI5$ae>^{%JBT*8k zYZH|fKHJm0pF?I`dm3ah3}ikYcaOw-`-&uq|G|AL(y5KYaup=7G79xYk)S;VtqJBQ z`=tmE&Ae)%LGf)W5D(v>gjkdO$yua&E+Y*O8{vNkr3*UO#ufi!C{OA}CYt1Jj=;dw zg-vxhB|=V-U4u8B)$wJ{DRbpViR#eUF^d*cwVT_NTylDG4Y)5e=Q*Y(0DN|af)gkv zI4re_+b4tBpohsUvd=6QDbN+hYDS)&y%)nF*Ia8vwM-90E(;>0#X$o&t4HuPbY1V!J}+(k)PZX%aR5Ly)w zGKAruNU@TP^}Fj&QLUl`35isA9M3gV;~nN$LY?6)P6eQLT<+#?17+H z-ONQ+W;3y4X7d?LZ8r;n%Arv>3@&M&f!37^Bq0}Qx@;ib87gM4-;{7>?Xv7q=dE{;@^XnVeTVT>i=3Y zEv_f|k`}t7dLmLI8?VK@fi*L!Tm3Q0knYS5!_EM3Y9x;Agv#Lli-Vq1A7?E8s>cdW zkliniGP|8!FF&@d4s}L}o03_b4N7lbW|Pq%UhM6=K_ow85KQS`7=)egU=TgAE=h1Q zmP%HwLzIpdqalOwKD2PEW2>Is34&&*4ncc%wCPg>K<1XE!+gqx&Ib_bBpH+FlGSX@ zk>mqxa$mi(-b)yZCdwY6$m3(WSUcpFO;1uJ)GFxl(7;*(9 z+0RMN7c9*Sm$7)W^*oM9=*tC)?}C^Si$y#@+<7ESVqF-0D5~7E1tpy%kax)D2|*ys z++z}?mhvpkImAx^D+oHT7zZe2IL$FllEqRhaI0oOnp&n7qB75F(}|nP0wgAtiN2g^ zp|O})-7T*m9)KfmvQ=C4frZVT3jm2hn#ne^h%QC%R!#ks6~(6(Q1KM@;HMITdMzOX zl5|7!JC;yiB3x5qH{}7J1o!6j>?D1EetLFwxv@>&&p$5e@oIrmqsRsq;X43>utFi4 zT~fk8TbLg!Q34)BI0_ME$3e;#(~z-ipEhV;1ujHIda#9Fl=c6Z_EKyxT`62K7cW~w z9P$>{&G>R_$#XX*dDYs=O9>9M8tS5)yP6g~POIQ96ApfARuT~(R8E%_H(15gu&B&b?X+CM`O1}MV;C)c%bDT%matlU?A^m*t=BNI z3oYKVl}ST$8kY??<5&Rz+DK>uE64HDCQdPC;fCdZuVu5fufHdYpy!#ZvB^fDNc=4W zq4{g1CFGPQ`iA@(tGUD$;nkeiFNoe)Ag?lXQ`C&-RC;%}Au(CT?30 zW?qHJ$LJGrXqA<^e4uykigN;8Ymgi@K9O!IcH9(5(XqI$3^h=g@eYg)2^O*35w7Hi z()*&Cs3lfY*8~Q`)sLFf#Xe6Q;!FCBv#{86ow#H|gb5R8m~wKwzw;5gzOA`t*K^SK zhezL?T&2gSe?GamM2y;@h`>^6?s;Ox<}lAKUKHrAYzlOsdvkYDCsyIk#iS~}{FFBxCqbzT+1jk9$ei+u!Q^`I=1c}(E 
z2oz^0SKpsse3u>`A77kYULxG@?D8##^rbszhgZ9b)X=6|9qEP|_eeMT>yIS(SM{W^ zG{clNDEi}C%`t}c4nL~>w&5WE=qPsZ6&|=R)ul8LyK6&}J7(1iLIM3`>B|}^$=*EF zZlqp4^7@n={lz%>0Dr5!1m1dfdKA6W3)3MaWd$--!L|l^__s~ismK!^$Ge&(C#3?0 z2xH|qSctR?1|aglz`JwH^)=m$#|Xa#7jb|K0Wc7=7!u+ns<^7>9uXV71&JY_Ggz5#%8I4eUi? zb^JaGb26B$E0WG2ITk0bbs_?=Riw&XaB`zK(#(ybc2V6dz#tR?$8yh(C_D|d#s7*E zlQ`kL7|8+&P(iavGj#cUUdz1CM0@!x67@8&ot&+5f%Sl@89%pK-d2@hg>F->HfyN2_ch~;AA&gjz{fDSQjIF z{ZRV_dX{VkVMlSxYH`-MWoa7J&MCOU%q~ujPcKf6uF}J^j_69_#w<2 zYEUz*x*_L>50@h_JBx5&UshpD8fX#}vym%Csx=0Y)@e^`(xxW#ss?*ka;zstKVze- zb4?bt%eECrgq;y@H)HOy2pxDa!ztM1afgf3*{X#{%-NDdD=zg6R1W@GmLts|cm)I{q)~mq?UA<+Y1tc%TnG@g z0~a=)v-?*PrqT9ObxuDt3yVwUISe6Isf_555@krDOf=~eJpDvMptlCR3#=${joW

7YOp~tj!4T)<@gMo7MRn1|pTbh#?cqsiFQSc8p3zH$;XGM7p zNK-j32g#MB^2<6~M2}exed=vQ%&f~ZZ3$h7cqUqHtC0*rU%zw2>fpDdV9@vkuzDv) z2@ve-sB4!CCWn3^ak1j{KYEZq{O2%zczL=8Xj!5C7YAQu5SF)I?CpP*1y^_D#c-Is z_oA(#goynLHt)=mQ$?8|!cwhP$}idW!5SZWNoXX0&K~7V(t!-0zkKnBgD=169ZUA& zP-}R*l6t0KFdwQ?4RGx)`md z-WW(qzDR1W&0Z2XG#S=QjkQj9l{g;@|FTTQX5~WPU8igzb^*&jJkXv1a5fWNMLinrYqpuV$oi0c_{c@up-1z zAFu;^Te}YKned8XkrpKfvk9)g?Hn2X<)hWiF7P80clg_xl$U85#wSz5*@WwUlT>qW zgV6-_7+>AwKW_#IvXqskn~kIS>HYnlf`w_^+o=Wa5+<7OrbVXf}l=!>Q5V)LyCC^+K@| z8jr#+_U**J*Ee0^uOJ`up{WzYlS#ka`v$eXy4Poio*aV6O`*jO6&k6mH2ZW*ifN+h zzeq`EXb!$DOP53Y(D-Pfuiczv*NFk$|2%0#o#*!-dVE@RAK!c6>AGpZ;_207*?IcK zLjy-F%EnanO4-n|tmAN=GnCglU((wfXIz7)oR*ryZZOUz z*yKKnRkWZE7YVi9IOZ+Z#b6K-Q$ErLI3{V4Ax<@Sa->o6z79H#6^iekKlRl zY`T4IBET134~5ZT@>bK=qKki#F2pSl2YIn@R1q>~!j&zak7( zeOzeZdmGUM12)JEcpG(UPD-M4&(<0)DcRqTTmVri$slf! zX<)M#F$_K2jXX2?s&3R&3FDVDo_W{ylu>IdsU6paC}O{p;ieg$k+-GH*k6-23kB<9 zU{|%qY2eyrlfFK@JUucukrZJ*J^Q1AJE~ZfO(umxTPz@tRa2^=6&FNJNfG4Lo<5*=5rdK$KNG#g5FxMcG(7T8%@?~S79q5;! zN)`$%jOGQvRKI{1Lrb#9`wL&_9viIkcHe!oFHzE8ZseJJ<(W1TeiJq_7!1A+gGO)N zMk`-C3>|?AR4@!&O^PrO#D)m)PoZv6Uw5D@`&i6CfJ{YFYEZ|a_4tabW#Szi5@pO4 zFIc^XCjB&Wn`Q-kMvT_YY{z%!nWcbk`-8)Bk)$5b$|N3#$)X4*3mQG+Q-paDg1FIm z6t|Xvz9+XCU`%rZn+f5j1$L|MkC4wOpYCc@m89FC&KEi3Vhf0-hkuiNIxF+xFe0r) zx8PfFgXnTR$9ZUa5+gx$6Ob|@OJ=Ke^PfNd?$h7Xz2RVQu+InW@B72vvlqRCgWlfD ze_(?2+y3AgetY@%gWj{=-rhgbf6!Y>N~6rR6SnfQ_v~fwU>H2Oos#Xe|NGxg9sl;edE2)ruSPszRmm5sG1>ZTtV#St|1?z<|U^{O5i3=9-%)%IV zwUk_7-L;g2S?{5V7z<+VXc$Pq>KJ9YCJMM6u>=K6et~6qE;!ZXj)exhJLq( zY#acjr^~7m$9Nus5-x%ANVN&T_shT+YbY*B5RnDFwbsgCjFcLh&vb2Y9HgatYCHO( z2oLO~g{o|WwU2f;iFP;r#dp_N{`h4!SNmWFmux8tpo@ul{}Vg!QQ=7vd0V7=-|P?e z4qgt1eAbg*Xy}B8lOCCVUJerp1#<-(g5UNM2kR5Wm^A6}U(OESk)S0Az#&wq(iUfb zv|0yYI!An5*0$N0YX;m_^IKjO;=W;^qLt2UGI1ww0$6<|tl53n=2u+*i*>(z5G&4T zgRiOpK~Gd7gK>+EJa#cA5`i3uW_oqBS?%^#uyVqC@jl=<1pKXPUlL>silqVyD(W#? 
z2J^t$umw>u*>14ovQC{R`FNBHbA3>>`2%4LVO6Q zN6nT?!J2D_-MskISz2O(=Pli zUu7_3M|_juZ1sgp$pNf56BCt1X8HZ5T`Yj5mH~^9eNxKhY(-FljSqr@L|HQ|yaae< zUx)|s6$;IQ>D}L*pb%LV&LQv-^&ZS#uVF4E5%?F_g+69Jz?xJJU@8T@w=#5L08m(p zi3qmOIS-8>A8UVpc4k)RJMhu^@VdhiW<}Pc3InfZGoxlqdY8LgXuV-Hz>y6`65gn49op*<54}C9?w|G- zXpTm)xd_`}k<=ysVkpVPvMgl)YZetofUR7^yKCnLx0}{X$s()YF_MwX?Ez9A^j`+` zcDS06yul8TDkuxaTJ=Uba7H>6`5{Fq|&7pZPH*E=R z{?Z=AmDRhKH9x$6@6ot_lMbFod$c!{yzq^8_8;|*N&w=6(6h%*P${eBNL!Ph-NNRK z&X%y>WSRlh^80*5+c8rk5@KYmm`>gZ#E5`R!wFR-QSCRyvJ$-JOft*a*32PxDi+#3 z?@AsBl&FI41o>lDz$4j)JHR1H^J#ywB5mk%3MjGFBHhth9uiZ{ud?~aUSjF*z4Y|r z_;AKD#T!D4=^mK5t>15o=8cVXR8&R`fEOIETl1P6aF>Cokle09; z7qQ*GY5-z-Bw2G?D)d(o?Sn-4qQfcn=)U8kEQ8^2|LL>g)0e}i!@-{Nc%P)iKm)<2#oB+$UBX$w5jTuKXM!rw((9k(zfLdtVusz;RkKg3I9J@;P> z7Y(*_@0}sy-NQ%jeI7utdEIBY90%6m)4(BI3nzednoCj7<*Hs*T<7ml_zh>$X+mX_ zI9%6t}$;o{t7sU^&|>sI^+}nbN5EwrbG`@%G2P{A7~*<)c^>K1NaWH&+*Zq=2~fBY#-erf z)GA*sZ+KE{#kD$I7ARv1Kb;s&y^<9bowYO@7%`kaTR1iD5AZF%-U9t-99Mq-rKGR> z4{qo6ht9CzI|n2a3l>Kh z6UNsI+$hX4=&bZZ>p}B)Mjsx#E}~ZM4JM1f-9LGVW;Bn@`75`vWst)h7wEgH#IP7& zX1;)xQxEX_-pejg;af|N*?ZX?*x8xB#zo-lfBy!ktUTk~Yk?Sh(rHmWqTHaEYBi1L z-eEe4QIk|9Yri{MZAHf9+Ggwt=MBe7n~Dp^{|6~lz~g#0L|*y!&Yk^lDlxF(==1qp+}=w`c9ePckE~W47WWBv@g$LC zjf|m(ur{n8+S-{*ex6TTEgu&JkU3Q|3E(+2yn`DWRiAsI3ExLVX^A?CK@*T#p0ghh zy|?0+XAg?XSp$rU+7#_dH3{8zwzsLvMHAMoDA^XLpzal3dUAgYZj@VE=wEJaWQ-#V z!nV!@=2V<7Uo$jLlR7)wa%7^GAK#!hz)ce0I6J)RZArOI-biqX%W~8Ti*_?1oFzz7 zMo7>$`3ld_PlD5UPaI3BFgJRMcbnzMIl0uv;EYH%7(riKCZ5aEdp!xa;0rXg69-~z z511F~LWZwP;2A}JtrXsdBhGuWpVF4*!D^~;H<>u)vIl5Qu5RFdU~^=sPuYIuMf&cV z*?Q?UO&mX(6St;$;F9+oWOd7ep0oc&$KsvC(USJMvvB3sFiVtQ0B(b{FRuIDM+|g` z+s#Y|SC#1qF}&@Uh{oFGLnv>@2a{HyvD)+M2UXY{EQ#dE?Ctfx`R1FgP0LT61K%B} z>%cX>t*wnAo{P~;PmIQ85eY4d&yumrk?D~7U}S69l>dcwwbriHb0|w}e;gd>HS()p z{mMVeE)FQy0qMMYIfUtF>*EZ<3KiR)-y1xC7JS&-_LhvtcQEV)AHp|&h5OG2J{#lXXshp+bcpQN4EX=1+$+Euqy>%&c&<-BMm_GRmOf8}bs7xy<_{PNnnmpAwL z(L3SVcz(~w@7tH~L%5E=a*~SQscv5{?wzdP%ZE2E_`FY_U+Ai6?C|BuNqYG9@|+!E z<0UNuV1ViG#dG&(Hg7-CqEbo)D;G(xpAFZ$p{dI^ 
zE|$-}5Lg@iU_e0jxz0yb<$pF;um4D_j6n!!^533bUY%e3#bNQRT6WY~5G{unDSE-1 z&namg$c>8~hId@jqP3`Z_vrorWg%Rd*W9AhAs`CaW#^UW()!tgMb7=rhSN*xH3uB) za9=4)_}kwqD|12L_RES-cXIgtG#zpN4Hxv3;_5Yy zeaY#B^z3m~7Wq7VO=HMgcCfy_JU-ph)@j*Sh@KExqSM;@teRKpdlC)XB0c;>x@uGX zKX2+hr$#wkJmca{tpug%lF{Es7#S_y3lJD$6}0L!3{_S6aA`doi$uj9e#{o5j2f($ zvJJ04{P9xiDhuKT5WZ$L5)(M<8rS8T2uW03)^*^e2dfMr%BL%VHu-=MfOb`K#g-n| z8)6_Wj(u$laV1*p$S{3u$FG0=>-77>i?h?S zKjLrx{$0_8-~&;l6-?CtYh)cmp;p;92`6y5(t77yBR0|0gEg(>N>Ym^SQs-Ax}zViY}larV`ZX@jYRhai{={#9iOI5Y?iQ&=a%y zzSL}Uij1^184N72a03YM%FldS}rHgu~1{g zGWNU%7DoHi`-wS4sA)vCQ?qG+bYm{kMd|<<9a!WVz;7J#oN&Bzw<4z_gkth^TeCq? zZ=Oanil$(AZNY0wNsRX2s8G;i;QNUL>7W{#VbbU@Tt%Ik!Ysw3vg0aZGluF3+_%CxcGb#OwsM vF;hP7!V%@>>!nMelsY3!N)5md#Lu3c9y!`%iBCmTNId^POERIDIy(UXY7dJc literal 0 HcmV?d00001 diff --git a/planetlab/ipfw.cron b/planetlab/ipfw.cron new file mode 100644 index 0000000..1b09340 --- /dev/null +++ b/planetlab/ipfw.cron @@ -0,0 +1,3 @@ +# Runs every 5 minutes and clean ipfw expired rules +# $Id: ipfw.cron 6069 2010-04-15 09:35:33Z marta $ +*/5 * * * * root echo "super killexpired" | /vsys/ipfw-be root > /dev/null 2>&1 diff --git a/planetlab/ipfwroot.spec b/planetlab/ipfwroot.spec new file mode 100644 index 0000000..aa7038f --- /dev/null +++ b/planetlab/ipfwroot.spec @@ -0,0 +1,135 @@ +# +# $Id: ipfwroot.spec 16174 2009-12-15 13:38:15Z marta $ +# +# TODO: +# restart crond +# +%define url $URL: svn+ssh://onelab2.iet.unipi.it/home/svn/ports-luigi/dummynet-branches/ipfw3/planetlab/ipfwroot.spec $ + +# Marta Carbone +# 2009 - Universita` di Pisa +# License is BSD. + +# kernel_release, kernel_version and kernel_arch are expected to be set by the build to e.g. 
+# kernel_release : vs2.3.0.29.1.planetlab +# kernel_version : 2.6.22.14 + +%define name ipfwroot +%define version 0.9 +%define taglevel 11 + +%define release %{kernel_version}.%{taglevel}%{?pldistro:.%{pldistro}}%{?date:.%{date}} +%define kernel_id_arch %{kernel_version}-%{kernel_release}-%{kernel_arch} +%define kernel_id %{kernel_version}-%{kernel_release} + +Summary: ipfw and dummynet for Linux +Name: %{name} +Version: %{version} +Release: %{release} +License: BSD +Group: System Environment/Kernel +Source0: %{name}-%{version}.tar.bz2 +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-buildroot +Requires: vixie-cron +Requires: vsys-scripts + +Vendor: unipi +Packager: PlanetLab +# XXX ask +Distribution: PlanetLab %{plrelease} +URL: %(echo %{url} | cut -d ' ' -f 2) + +%description +ipfw is the Linux port of the FreeBSD ipfw and dummynet packages + +%prep +%setup + +%build +# clean the rpm build directory +rm -rf $RPM_BUILD_ROOT + +# with the new build, we use the kernel-devel rpm for building +%define kernelpath /usr/src/kernels/%{kernel_id_arch} + +%__make KERNELPATH=%kernelpath clean +%__make KERNELPATH=%kernelpath IPFW_PLANETLAB=1 + +%install +install -D -m 755 dummynet2/ipfw_mod.ko $RPM_BUILD_ROOT/lib/modules/%{kernel_id}/net/netfilter/ipfw_mod.ko +install -D -m 755 ipfw/ipfw $RPM_BUILD_ROOT/sbin/ipfw +install -D -m 644 planetlab/ipfw.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/ipfw.cron +install -D -m 755 planetlab/ipfw $RPM_BUILD_ROOT/etc/rc.d/init.d/ipfw + +%clean +rm -rf $RPM_BUILD_ROOT + +%post +### this script is also triggered while the node image is being created at build-time +# some parts of the script do not make sense in this context +# this is why the build exports PL_BOOTCD=1 in such cases +depmod -a +/sbin/chkconfig --add ipfw +# start the service if not building +[ -z "$PL_BOOTCD" ] && service ipfw start + +%postun +# stop the service if not building +[ -z "$PL_BOOTCD" ] && service ipfw stop + +# here there is a list of the final installation 
directories +%files +%defattr(-,root,root) +%dir /lib/modules/%{kernel_id} +/lib/modules/%{kernel_id}/net/netfilter/ipfw_mod.ko +/sbin/ipfw +%{_sysconfdir}/cron.d/ipfw.cron +/etc/rc.d/init.d/ipfw + +%changelog +* Mon Apr 12 2010 Thierry Parmentelat - ipfw-0.9-11 +- add ipfw initialization script to chkconfig + +* Wed Mar 03 2010 Talip Baris Metin - ipfw-0.9-10 +- - Load module at installation - Marta + +* Mon Jan 11 2010 Thierry Parmentelat - ipfw-0.9-9 +- consistent with vsys-scripts-0.95-13 + +* Mon Jan 11 2010 Marta Carbone +- Integrated the ipfw rules cleanup into the backend + +* Sat Jan 09 2010 Thierry Parmentelat - ipfw-0.9-8 +- builds on 2.6.22 & 2.6.27 - for 32 and 64 bits + +* Wed Jan 06 2010 Marta Carbone +- move to dummynet2, added support for table lookup +- added the vsys-script dependencies and the ipfw initialization + +* Tue Dec 15 2009 Marta Carbone +- more work on the radix code, added sysctl read/write support + +* Sun Nov 29 2009 Thierry Parmentelat - ipfw-0.9-7 +- added missing qsort.c - tag 0.9-6 was broken + +* Thu Nov 26 2009 Thierry Parmentelat - ipfw-0.9-6 +- root: removed goto into the main ipfw switch, enabled slice_id matching +- slice: completely move netconfig checks into the backend + +* Mon Nov 09 2009 Thierry Parmentelat - ipfw-0.9-5 +- additional features on matching packets, including uid match + +* Mon Sep 07 2009 Thierry Parmentelat - ipfw-0.9-4 +- on behalf of Marta Carbone, more options and features + +* Thu Jul 23 2009 Thierry Parmentelat - ipfw-0.9-3 +- fixed memory usage issue + +* Wed Jul 15 2009 Thierry Parmentelat - ipfw-0.9-2 +- patch for building on x86_64 + +* Thu Jun 25 2009 Marta Carbone +- post installation removed for deployment, moved manpages to the slice package + +* Fri Apr 17 2009 Marta Carbone +- Initial release diff --git a/planetlab/ipfwslice.spec b/planetlab/ipfwslice.spec new file mode 100644 index 0000000..3cd81eb --- /dev/null +++ b/planetlab/ipfwslice.spec @@ -0,0 +1,94 @@ +# +# $Id: ipfwslice.spec 
16174 2009-12-15 13:38:15Z marta $ +# +# TODO: +# restart crond +# modprobe ipfw_mod.ko (depmod ?) +# +%define url $URL: svn+ssh://onelab2.iet.unipi.it/home/svn/ports-luigi/dummynet-branches/ipfw3/planetlab/ipfwslice.spec $ + +# Marta Carbone +# 2009 - Universita` di Pisa +# License is BSD. + +%define name ipfwslice +%define version 0.9 +%define taglevel 11 + +%define release %{kernel_version}.%{taglevel}%{?pldistro:.%{pldistro}}%{?date:.%{date}} +%define kernel_id_arch %{kernel_version}-%{kernel_release}-%{kernel_arch} +%define kernel_id %{kernel_version}-%{kernel_release} + +Summary: ipfw and dummynet for Linux +Name: %{name} +Version: %{version} +Release: %{release} +License: BSD +Group: System Environment/Kernel +Source0: %{name}-%{version}.tar.bz2 +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-buildroot + +Vendor: unipi +Packager: PlanetLab +Distribution: PlanetLab %{plrelease} +URL: %(echo %{url} | cut -d ' ' -f 2) + +%description +the frontend part of the ipfw planetlab package + +%prep +%setup + +%build +rm -rf $RPM_BUILD_ROOT + +%install +install -D -m 755 planetlab/netconfig $RPM_BUILD_ROOT/sbin/netconfig +install -D -m 755 planetlab/ipfw.8.gz $RPM_BUILD_ROOT/%{_mandir}/man8/ipfw.8.gz + +%clean +rm -rf $RPM_BUILD_ROOT + +# here there is a list of the final installation directories +%files +%defattr(-,root,root) +/sbin/netconfig +%{_mandir}/man8/ipfw.8* + +%changelog +* Mon Apr 12 2010 Thierry Parmentelat - ipfw-0.9-11 +- add ipfw initialization script to chkconfig + +* Wed Mar 03 2010 Talip Baris Metin - ipfw-0.9-10 +- - Load module at installation - Marta + +* Mon Jan 11 2010 Thierry Parmentelat - ipfw-0.9-9 +- consistent with vsys-scripts-0.95-13 + +* Sat Jan 09 2010 Thierry Parmentelat - ipfw-0.9-8 +- builds on 2.6.22 & 2.6.27 - for 32 and 64 bits + +* Tue Dec 15 2009 Marta Carbone +- more work on the radix code, added sysctl read/write support + +* Sun Nov 29 2009 Thierry Parmentelat - ipfw-0.9-7 +- added missing qsort.c - tag 0.9-6 was broken 
+ +* Thu Nov 26 2009 Thierry Parmentelat - ipfw-0.9-6 +- root: removed goto into the main ipfw switch, enabled slice_id matching +- slice: completely move netconfig checks into the backend + +* Mon Nov 09 2009 Thierry Parmentelat - ipfw-0.9-5 +- additional features on matching packets, including uid match + +* Mon Sep 07 2009 Thierry Parmentelat - ipfw-0.9-4 +- on behalf of Marta Carbone, more options and features + +* Thu Jul 23 2009 Thierry Parmentelat - ipfw-0.9-3 +- fixed memory usage issue + +* Wed Jul 15 2009 Thierry Parmentelat - ipfw-0.9-2 +- patch for building on x86_64 + +* Thu Jun 25 2009 Marta Carbone +- Initial release diff --git a/planetlab/netconfig b/planetlab/netconfig new file mode 100755 index 0000000..7108582 --- /dev/null +++ b/planetlab/netconfig @@ -0,0 +1,14 @@ +#!/bin/sh +# +# Marta Carbone, Luigi Rizzo +# Copyright (C) 2009 Universita` di Pisa +# $Id: netconfig 4533 2009-12-16 14:39:23Z luigi $ +# +# This script is the frontend to be used with the vsys system. +# It simply passes information to the backend and gets back the reply + +PIPE_IN=/vsys/ipfw-be.in +PIPE_OUT=/vsys/ipfw-be.out + +# The arguments are data: quote them so that no root shell re-parses them +echo "$*" | sudo tee -a "${PIPE_IN}" > /dev/null +sudo sh -c "cat ${PIPE_OUT}" diff --git a/planetlab/planetlab-tags.mk b/planetlab/planetlab-tags.mk new file mode 100644 index 0000000..25eff0e --- /dev/null +++ b/planetlab/planetlab-tags.mk @@ -0,0 +1,6 @@ +# $Id: planetlab-tags.mk 7450 2010-10-18 11:17:43Z marta $ +# These are good to build the ipfw modules from svn on kernels 2.6.22 +# and are used to fetch files from the onelab2 repository. 
+linux-2.6-SVNBRANCH := 22 +linux-2.6-SVNPATH := http://svn.planet-lab.org/svn/linux-2.6/tags/linux-2.6-22-39-1 +ipfwsrc-SVNPATH := svn+ssh://luigi%40onelab2.iet.unipi.it/home/svn/ports-luigi/dummynet-branches/ipfw3 diff --git a/planetlab/planetlab.mk b/planetlab/planetlab.mk new file mode 100644 index 0000000..6d3504b --- /dev/null +++ b/planetlab/planetlab.mk @@ -0,0 +1,26 @@ +# $Id: planetlab.mk 4533 2009-12-16 14:39:23Z luigi $ +# .mk file to build a module +kernel-MODULES := linux-2.6 +kernel-SPEC := kernel-2.6.spec +kernel-BUILD-FROM-SRPM := yes +ifeq "$(HOSTARCH)" "i386" +kernel-RPMFLAGS:= --target i686 +else +kernel-RPMFLAGS:= --target $(HOSTARCH) +endif +ALL += kernel + +ipfwroot-MODULES := ipfwsrc +ipfwroot-SPEC := planetlab/ipfwroot.spec +ipfwroot-DEPEND-DEVEL-RPMS := kernel-devel +ipfwroot-SPECVARS = kernel_version=$(kernel.rpm-version) \ + kernel_release=$(kernel.rpm-release) \ + kernel_arch=$(kernel.rpm-arch) +ALL += ipfwroot + +ipfwslice-MODULES := ipfwsrc +ipfwslice-SPEC := planetlab/ipfwslice.spec +ipfwslice-SPECVARS = kernel_version=$(kernel.rpm-version) \ + kernel_release=$(kernel.rpm-release) \ + kernel_arch=$(kernel.rpm-arch) +ALL += ipfwslice diff --git a/planetlab/sample_hook b/planetlab/sample_hook new file mode 100755 index 0000000..b47c8de --- /dev/null +++ b/planetlab/sample_hook @@ -0,0 +1,34 @@ +#!/bin/sh + +# +# Marta Carbone +# 2009 - Universita` di Pisa +# +# This is a sample hook file in charge to collect +# statistical information on netconfig usage. It dumps +# on a log file slicename, port and the configuration string +# used to configure a dummynet experiment. +# +# Each time a user configure a dummynet port, this file +# will be executed. 
+# The following variables will be passed as argument: +# +# ${SLICE} ${PORT} ${CONFIG_STRING} +# ${SLICE} The slicename executing the netconfig command +# ${PORT} The port to be configured +# ${CONFIG_STRING} The configuration string +# +# Note that this script can get additional information +# by executing the ipfw command, e.g. +# ipfw list # list of installed rules +# ipfw show # list of rules and statistical information +# ipfw pipe show # list of pipes +# +# a complete list of ipfw commands is available at: +# http://www.freebsd.org/cgi/man.cgi?query=ipfw&sektion=8 + +# logfile +LOG_FILE=/tmp/ipfw_hook.log + +echo -e `date` >> ${LOG_FILE} +echo "$*" >> ${LOG_FILE} diff --git a/test/Makefile b/test/Makefile new file mode 100644 index 0000000..9ed47f8 --- /dev/null +++ b/test/Makefile @@ -0,0 +1,53 @@ +# +# $Id: Makefile 5626 2010-03-04 21:55:22Z luigi $ +# +# Makefile for building userland tests +# this is written in a form compatible with gmake + +SCHED_SRCS = test_dn_sched.c +SCHED_SRCS += dn_sched_fifo.c +SCHED_SRCS += dn_sched_wf2q.c +SCHED_SRCS += dn_sched_qfq.c +SCHED_SRCS += dn_sched_rr.c +SCHED_SRCS += dn_heap.c +SCHED_SRCS += main.c + +SCHED_OBJS=$(SCHED_SRCS:.c=.o) + +HEAP_SRCS = dn_heap.c test_dn_heap.c +HEAP_OBJS=$(HEAP_SRCS:.c=.o) + +VPATH= .:../dummynet2 + +#CFLAGS = -I../dummynet2/include -I. -Wall -Werror -O3 -DIPFW +CFLAGS = -I. 
-I../dummynet2/include/netinet/ipfw -DIPFW +CFLAGS += -Wall -Werror +CFLAGS += -g -O3 +TARGETS= test_sched # no test_heap by default + +all: $(TARGETS) + +test_heap : $(HEAP_OBJS) + $(CC) -o $@ $(HEAP_OBJS) + +test_sched : $(SCHED_OBJS) + $(CC) -o $@ $(SCHED_OBJS) + +$(SCHED_OBJS): dn_test.h +main.o: mylist.h + +clean: + - rm *.o $(TARGETS) *.core + +ALLSRCS = $(SCHED_SRCS) dn_test.h mylist.h \ + dn_sched.h dn_heap.h ip_dn_private.h Makefile +TMPBASE = /tmp/testXYZ +TMPDIR = $(TMPBASE)/test + +tgz: + -rm -rf $(TMPDIR) + mkdir -p $(TMPDIR) + -cp -p $(ALLSRCS) $(TMPDIR) + -(cd ..; cp -p $(ALLSRCS) $(TMPDIR)) + ls -la $(TMPDIR) + (cd $(TMPBASE); tar cvzf /tmp/test.tgz test) diff --git a/test/basic_ipfw.sh b/test/basic_ipfw.sh new file mode 100755 index 0000000..08b66f9 --- /dev/null +++ b/test/basic_ipfw.sh @@ -0,0 +1,72 @@ +#!/bin/sh + +IPFW=./ipfw/ipfw +PING=/bin/ping +RH=127.0.0.1 # remote host +R=10 # test rule number +P=1 # test pipe number + +abort() +{ +echo $* +} + +#insmod dummynet2/ipfw_mod.ko +#$IPFW show > /dev/null +#$IPFW pipe show +echo "Flushing rules, do you agree ?" +$IPFW flush + +# test_msg rule counter +clean() +{ + $IPFW delete $R 2> /dev/null + $IPFW pipe $P delete 2> /dev/null +} + +# simple counter/allow test +echo -n "counter/allow test..." +clean +$IPFW add $R allow icmp from any to 127.0.0.1 > /dev/null +$PING -f -c100 $RH > /dev/null +counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f3` +[ ! $counter -eq 400 ] && abort "Wrong counter $counter 400" +echo "...OK" + +# simple drop test +echo -n "deny test..." +clean +$IPFW add $R deny icmp from any to 127.0.0.1 > /dev/null +$PING -f -c10 -W 1 $RH > /dev/null +counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4` +[ ! $counter -eq 10 ] && abort "Wrong counter $counter 10" +echo "...OK" + +# pipe delay test +echo -n "pipe delay test..." 
+clean +$IPFW pipe $P config delay 2000ms >/dev/null +$IPFW add $R pipe $P icmp from any to $RH >/dev/null +$PING -f -c10 -W 1 $RH > /dev/null +counter1=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4` +sleep 2 +counter2=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4` +# report the value that was actually compared, not the stale $counter +[ ! $counter1 -eq 10 ] && abort "Wrong counter $counter1 10" +[ ! $counter2 -eq 20 ] && abort "Wrong counter $counter2 20" +echo "...OK" + +# pipe bw test +echo -n "pipe bw test..." +clean +$IPFW pipe $P config bw 2Kbit/s >/dev/null +$IPFW add $R pipe $P icmp from any to $RH >/dev/null +$PING -i 0.1 -c10 -W 1 $RH > /dev/null +counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4` +[ $counter -gt 30 ] && abort "Wrong counter $counter should be < 30" +sleep 1 +counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4` +[ $counter -gt 30 ] && abort "Wrong counter $counter should be < 30" +echo "...OK" + +# Final clean +clean diff --git a/test/dn_test.h b/test/dn_test.h new file mode 100644 index 0000000..f2a4a51 --- /dev/null +++ b/test/dn_test.h @@ -0,0 +1,157 @@ +/* + * $Id: dn_test.h 5626 2010-03-04 21:55:22Z luigi $ + * + * userspace compatibility code for dummynet schedulers + */ + +#ifndef _DN_TEST_H +#define _DN_TEST_H +#include +#include +#include +#include /* bzero, ffs, ... */ +#include /* strcmp */ +#include +#include +#include + +extern int debug; +#define ND(fmt, args...) do {} while (0) +#define D1(fmt, args...) do {} while (0) +#define D(fmt, args...) fprintf(stderr, "%-8s " fmt "\n", \ + __FUNCTION__, ## args) +#define DX(lev, fmt, args...) 
do { \ + if (debug > lev) D(fmt, ## args); } while (0) + + +#define offsetof(t,m) (int)((&((t *)0L)->m)) + +#include + +/* prevent include of other system headers */ +#define _NETINET_IP_VAR_H_ /* ip_fw_args */ +#define _IPFW2_H +#define _SYS_MBUF_H_ + +enum { + DN_QUEUE, +}; + +enum { + DN_SCHED_FIFO, + DN_SCHED_WF2QP, +}; + +struct dn_id { + int type, subtype, len, id; +}; +struct dn_fs { + int par[4]; /* flowset parameters */ + + /* simulation entries. + * 'index' is not strictly necessary + * y is used for the inverse mapping , + */ + int index; + int y; /* inverse mapping */ + int base_y; /* inverse mapping */ + int next_y; /* inverse mapping */ + int n_flows; + int first_flow; + int next_flow; /* first_flow + n_flows */ + /* + * when generating, let 'cur' go from 0 to n_flows-1, + * then point to flow first_flow + cur + */ + int cur; +}; +struct dn_sch { +}; +struct dn_flow { + struct dn_id oid; + int length; + int len_bytes; + int drops; + uint64_t tot_bytes; + uint32_t flow_id; + struct list_head h; /* used by the generator */ +}; +struct dn_link { +}; + +struct ip_fw_args { +}; + +struct mbuf { + struct { + int len; + } m_pkthdr; + struct mbuf *m_nextpkt; + int flow_id; /* for testing, index of a flow */ + //int flowset_id; /* for testing, index of a flowset */ + void *cfg; /* config args */ +}; + +#define MALLOC_DECLARE(x) +/* print the message 'y' (a parenthesized printf argument list) and exit, but only when 'x' is false */ +#define KASSERT(x, y) do { if (!(x)) { printf y ; exit(0); } } while (0) +struct ipfw_flow_id { +}; + +typedef void * module_t; +struct _md_t { + const char *name; + int (*f)(module_t, int, void *); + void *p; +}; +typedef struct _md_t moduledata_t; +#define DECLARE_MODULE(name, b, c, d) \ + moduledata_t *_g_##name = & b +#define MODULE_DEPEND(a, b, c, d, e) + +#ifdef IPFW +#include +#include +#include +#else +struct dn_queue { + struct dn_fsk *fs; /* parent flowset. */ + struct dn_sch_inst *_si; /* parent sched instance. 
*/ +}; +struct dn_schk { +}; +struct dn_fsk { + struct dn_fs fs; + struct dn_schk *sched; +}; +struct dn_sch_inst { + struct dn_schk *sched; +}; +struct dn_alg { + int type; + const char *name; + void *enqueue, *dequeue; + int q_datalen, si_datalen, schk_datalen; + int (*config)(struct dn_schk *); + int (*new_sched)(struct dn_sch_inst *); + int (*new_fsk)(struct dn_fsk *); + int (*new_queue)(struct dn_queue *q); +}; + +#endif + +#ifndef __FreeBSD__ +int fls(int); +#endif + +static inline void +mq_append(struct mq *q, struct mbuf *m) +{ + if (q->head == NULL) + q->head = m; + else + q->tail->m_nextpkt = m; + q->tail = m; + m->m_nextpkt = NULL; +} + +#endif /* _DN_TEST_H */ diff --git a/test/dynrules.sh b/test/dynrules.sh new file mode 100644 index 0000000..98f5fe6 --- /dev/null +++ b/test/dynrules.sh @@ -0,0 +1,20 @@ +#!/bin/sh +# +# 20100507 marta, quick test for dyn rules +# ./ipfw/ipfw -d show |grep \ 80 + +IPFW_MOD=dummynet2/ipfw_mod.ko +IPFW=ipfw/ipfw + +# main +# remove any previous loaded module +/sbin/rmmod ipfw_mod +/sbin/insmod ${IPFW_MOD} +echo "25" > /sys/module/ipfw_mod/parameters/dyn_ack_lifetime +${IPFW} add 1 check-state +${IPFW} add 9 allow all from any to any keep-state +${IPFW} add 10 allow all from any to onelab1.iet.unipi.it keep-state + +telnet 72.14.234.104 80 + + diff --git a/test/interpolation.c b/test/interpolation.c new file mode 100644 index 0000000..d6731f1 --- /dev/null +++ b/test/interpolation.c @@ -0,0 +1,335 @@ +#include +#include +#include + +/* gcc interpolation.c -o interpolation */ + +void +err(int eval, const char *fmt, ...) +{ +} +void +errx(int eval, const char *fmt, ...) 
+{ +} + + +#define ED_MAX_SAMPLES_NO 1000 +#define ED_MAX_LINE_LEN 128 +#define EX_DATAERR 1 +#define EX_UNAVAILABLE 3 +#define ED_TOK_DELAY "delay" +#define ED_TOK_PROB "prob" +#define ED_SEPARATORS " \t\n" +#define ED_TOK_PROFILE_NO "profile_no" + + +struct point { + double prob; /* y */ + double delay; /* x */ +}; + +struct profile { + char filename[128]; /* profile filename */ + int samples[ED_MAX_SAMPLES_NO+1]; /* may be shorter */ + int samples_no; /* actual len of samples[] */ +}; + +/* + * returns 1 if s is a non-negative number, with at least one '.' + */ +static int +is_valid_number(const char *s) +{ +#if 0 + int i, dots_found = 0; + int len = strlen(s); + + for (i = 0; i 1)) + return 0; +#endif + return 1; +} + +static int +compare_points(const void *vp1, const void *vp2) +{ + const struct point *p1 = vp1; + const struct point *p2 = vp2; + double res = 0; + + res = p1->prob - p2->prob; + if (res == 0) + res = p1->delay - p2->delay; + if (res < 0) + return -1; + else if (res > 0) + return 1; + else + return 0; +} + +#define ED_EFMT(s) 1,"error in %s at line %d: "#s,filename,lineno + +/* + * The points defined by the user are stored in the ponts structure. + * The number of user defined points is stored in points_no. + * We assume that The last point for the '1' value of the + * probability should be defined. (XXX add checks for this) + * The user defined sampling value is stored in samples_no. + * The resulting samples are in the "samples" pointer. 
+ */ +static void +interpolate_samples(struct point *p, int points_no, + int *samples, int samples_no, const char *filename) +{ + double dy; /* delta on the y axis */ + double y; /* current value of y */ + double x; /* current value of x */ + double m; /* the y slope */ + int i; /* samples index */ + int curr; /* points current index */ + + /* a segment needs two points; samples_no is used as a divisor */ + if (points_no < 2 || samples_no <= 0) + return; + + dy = 1.0/samples_no; + y = 0; + + for (i=0, curr = 0; i < samples_no; i++, y+=dy) { + /* Move curr forward to the segment that contains y, + * skipping the points whose probability is not above y. + * The bound on curr keeps p[curr+1] inside the array + * even when no point with probability 1 was supplied, + * so the loop always terminates. */ + while ( curr + 2 < points_no && y >= p[curr+1].prob ) curr++; + + /* compute the slope of the curve (0 for a zero-width segment) */ + if (p[curr+1].prob != p[curr].prob) + m = (p[curr+1].delay - p[curr].delay) / (p[curr+1].prob - p[curr].prob); + else + m = 0; + /* compute the x value starting from the current point */ + x = p[curr].delay + (y - p[curr].prob) * m; + samples[i] = x; + } + + /* add the last sample */ + samples[i] = p[curr+1].delay; +} + +#if 0 +static void +interpolate_samples_old(struct point *points, int points_no, + int *samples, int samples_no, const char *filename) +{ + int i; /* pointer to the sampled array */ + int j = 0; /* pointer to user defined samples */ + double dy; /* delta y */ + double y; /* current value of y */ + int x; /* computed value of x */ + double m; /* slope of the line */ + double y1, x1, y2, x2; /* two points of the current line */ + + /* make sure that there are enough points. 
*/ + /* XXX Duplicated shoule be removed */ + if (points_no < 3) + errx(EX_DATAERR, "%s too few samples, need at least %d", + filename, 3); + + qsort(points, points_no, sizeof(struct point), compare_points); + + samples_no--; + dy = 1.0/samples_no; + printf("\nsamples no is %d dy is %f ", samples_no, dy); + + /* start with the first two points */ + y1 = points[j].prob * samples_no; + x1 = points[j].delay; + j++; + y2 = points[j].prob * samples_no; + x2 = points[j].delay; + + m = (y2-y1)/(x2-x1); + printf("\nStart"); + printf("\n\tCurrent points x1 y1 %f %f next point x2y2 %f %f m %f\n", + x1, y1, x2, y2, m); + + y = 0; + x = x1; + + for(i=0; i < samples_no+1; i++, y+=dy) { + printf("\ni:%d j:%d y:%f real y:%f", i, j, y, y*samples_no); + if ( (y*samples_no) >= y2 ) { /* move to the next point */ + j++; + if ( j >= points_no ) { + printf("\n\tNo more points, exit with j: %d i: %d and y:%f %f\n", + j, i, y, (y*samples_no)); + break; /* no more user defined points */ + } + /* load a new point */ + y1 = y2; + x1 = x2; + y2 = points[j].prob * samples_no; + x2 = points[j].delay; + m = (y2-y1)/(x2-x1); + if (x1==x2) { /* m = infinito */ + m = -1; + x = x2; + } + /* very small m problem */ + printf ("\ndelta %f\n", (y1 - y2)); + if (abs(y1 - y2) < 0.00001) { /* m = 0 XXX Should this magic number depend on samples_no ? 
*/ + m = 0; + x = x2; + } + printf("\n\tCurrent points x1 y1 %f %f next point x2y2 %f %f (%f/%f)=m \n", + x1, y1, x2, y2, (y2-y1), (x2-x1), m); + } + printf("\n\tcompute step y %f x[%d]=%d ", + y, i, x); + if ((m != -1) && ( m != 0 )) { + x = x + (dy * samples_no)/m; + } + samples[i] = x; + printf(" dy %f x new %d\n", dy*samples_no, x); + printf(" m %f (dy * samples_no)/m %f \n", m, (dy * samples_no)/m); + } + + x = samples[i-1]; + printf("Finish i is %d samples_no is %d\n", i, samples_no); + /* The last point has a probability less than 1 */ + for (; i <= samples_no; i++) + samples[i] = x; +} +#endif + +static void +load_profile(struct profile *p) +{ + FILE *f; /* file handler */ + char line[ED_MAX_LINE_LEN]; + int lineno = 0; + int do_points = 0; + int delay_first = -1; + int i; + + struct point points[1000]; /* MAX_POINTS_NO */ + int points_no = 0; + + char *filename = p->filename; + f = fopen(filename, "r"); + if (f == NULL) { + err(EX_UNAVAILABLE, "fopen: %s", filename); + /* the local err() stub prints nothing and returns */ + perror(filename); + exit(EX_UNAVAILABLE); + } + + + while (fgets(line, ED_MAX_LINE_LEN, f)) { /* read commands */ + char *s, *cur = line, *name = NULL, *arg = NULL; + + ++lineno; + + /* parse the line */ + while (cur) { + s = strsep(&cur, ED_SEPARATORS); + if (s == NULL || *s == '#') + break; + if (*s == '\0') + continue; + if (arg) + errx(ED_EFMT("too many arguments")); + if (name == NULL) + name = s; + else + arg = s; + } + + if (name == NULL) + continue; + + if (!strcasecmp(name, ED_TOK_DELAY)) { + if (do_points) + errx(ED_EFMT("duplicated token: %s"), name); + delay_first = 1; + do_points = 1; + continue; + } else if (!strcasecmp(name, ED_TOK_PROB)) { + if (do_points) + errx(ED_EFMT("duplicated token: %s"), name); + delay_first = 0; + do_points = 1; + continue; + } + if (!strcasecmp(name, ED_TOK_PROFILE_NO)) { + /* a missing value falls through to the default below */ + int p_no = arg ? atof(arg) : 0; + if (p_no <= 0) { + p_no = 100; + printf("invalid interpolation 
samples, using %d\n", + p_no); + } + if (p_no > ED_MAX_SAMPLES_NO) { + p_no = ED_MAX_SAMPLES_NO; + printf("invalid interpolation samples, using %d\n", + p_no); + } + + p->samples_no = p_no; + continue; + + } else if (do_points) { + if (arg == NULL || !is_valid_number(name) || !is_valid_number(arg)) { + errx(ED_EFMT("invalid point found")); + continue; /* the errx() stub returns: skip the bad line */ + } + if (delay_first) { + points[points_no].delay = atof(name); + points[points_no].prob = atof(arg); + } else { + points[points_no].delay = atof(arg); + points[points_no].prob = atof(name); + } + if (points[points_no].prob > 1.0) + errx(ED_EFMT("probability greater than 1.0")); + if (++points_no >= (int)(sizeof(points) / sizeof(points[0]))) + break; /* points[] is full, stop reading */ + continue; + } else { + errx(ED_EFMT("unrecognised command '%s'"), name); + } + } + fclose(f); + + for(i=0; i < p->samples_no; i++) { + p->samples[i] = 666; + } + + /* This code assume the user define a value of X for the sampling value, + * and that: + * - the value stored in the emulator structure is X; + * - the allocated structure for the samples is X+1; + */ + interpolate_samples(points, points_no, p->samples, p->samples_no, filename); + + // User defined samples + printf("\nLoaded %d points:\n", points_no); + for(i=0; i < points_no; i++) { + printf("%f %f\n", points[i].prob, points[i].delay); + } + printf("\n"); + printf("The sample value is %d \n", p->samples_no); + +} + +int main(int argc, char **argv) +{ + if (argc < 2) { + printf("Usage: ./interpolation \n"); + return -1; + } + + char *filename; + filename = argv[1]; + + /* samples_no defaults to 100 when the profile has no profile_no line */ + struct profile p = { .samples_no = 100 }; + int i; + + /* snprintf always NUL-terminates, unlike strncpy */ + snprintf(p.filename, sizeof(p.filename), "%s", filename); + load_profile(&p); + printf("-----------\n"); + for (i=0; i<=p.samples_no; i++) + printf("%d %d\n", i, p.samples[i]); + printf("-----------\n"); + return 0; +} diff --git a/test/main.c b/test/main.c new file mode 100644 index 0000000..85fc621 --- /dev/null +++ b/test/main.c @@ -0,0 +1,636 @@ +/* + * $Id: main.c 5626 2010-03-04 21:55:22Z luigi $ + * + * Testing program for schedulers + * + * The framework include a simple controller which, at each + * iteration, decides whether we can enqueue and/or dequeue. 
+ * Then the mainloop runs the required number of tests,
+ * keeping track of statistics.
+ */
+
+#include "dn_test.h"
+
+struct q_list {
+	struct list_head h;
+};
+
+struct cfg_s {
+	int ac;
+	char * const *av;
+
+	const char *name;
+	int loops;
+	struct timeval time;
+
+	/* running counters */
+	uint32_t	_enqueue;
+	uint32_t	drop;
+	uint32_t	pending;
+	uint32_t	dequeue;
+
+	/* generator parameters */
+	int th_min, th_max;
+	int maxburst;
+	int lmin, lmax;	/* packet len */
+	int flows;	/* number of flows */
+	int flowsets;	/* number of flowsets */
+	int wsum;	/* sum of weights of all flows */
+	int max_y;	/* max random number in the generation */
+	int cur_y, cur_fs;	/* used in generation, between 0 and max_y - 1 */
+	const char *fs_config; /* flowset config */
+	int can_dequeue;
+	int burst;	/* count of packets sent in a burst */
+	struct mbuf *tosend;	/* packet to send -- also flag to enqueue */
+
+	struct mbuf *freelist;
+
+	struct mbuf *head, *tail;	/* a simple tailq */
+
+	/* scheduler hooks */
+	int (*enq)(struct dn_sch_inst *, struct dn_queue *,
+		struct mbuf *);
+	struct mbuf * (*deq)(struct dn_sch_inst *);
+	/* size of the three fields including sched-specific areas */
+	int schk_len;
+	int q_len; /* size of a queue including sched-fields */
+	int si_len; /* size of a sch_inst including sched-fields */
+	char *q;	/* array of flow queues */
+		/* use a char* because size is variable */
+	struct dn_fsk *fs;	/* array of flowsets */
+	struct dn_sch_inst *si;
+	struct dn_schk *sched;
+
+	/* generator state */
+	int state;		/* 0 = going up, 1: going down */
+
+	/*
+	 * We keep lists for each backlog level, and always serve
+	 * the one with shortest backlog. llmask contains a bitmap
+	 * of lists, and ll are the heads of the lists. The last
+	 * entry (BACKLOG) contains all entries considered 'full'
+	 * XXX to optimize things, entry i could contain queues with
+	 * 2^{i-1}+1 .. 2^i entries.
+	 */
+#define BACKLOG	30
+	uint32_t	llmask;
+	struct list_head ll[BACKLOG + 10];
+};
+
+/* FI2Q and Q2FI convert from flow_id to dn_queue and back.
+ * We cannot easily use pointer arithmetic because it is variable size.
+ */
+#define FI2Q(c, i)	((struct dn_queue *)((c)->q + (c)->q_len * (i)))
+#define Q2FI(c, q)	(((char *)(q) - (c)->q)/(c)->q_len)
+
+int debug = 0;
+
+struct dn_parms dn_cfg;
+
+static void controller(struct cfg_s *c);
+
+/*
+ * Release a packet: count it as a drop, move its queue to the backlog
+ * list matching the queue's current length (only when that length is
+ * below BACKLOG), and push the mbuf on the freelist. Always returns 0.
+ * mainloop() also calls this to recycle dequeued packets and then
+ * undoes the drop count.
+ */
+int
+drop(struct cfg_s *c, struct mbuf *m)
+{
+	struct dn_queue *q;
+	int i;
+
+	c->drop++;
+	q = FI2Q(c, m->flow_id);
+	i = q->ni.length; // XXX or ffs...
+
+	ND("q %p id %d current length %d", q, m->flow_id, i);
+	if (i < BACKLOG) {
+		struct list_head *h = &q->ni.h;
+		/*
+		 * NOTE(review): bit i+1 is cleared without checking
+		 * whether ll[i+1] still holds other queues -- confirm
+		 * this is intended.
+		 */
+		c->llmask &= ~(1<<(i+1));
+		c->llmask |= (1<<(i));
+		list_del(h);
+		list_add_tail(h, &c->ll[i]);
+	}
+	m->m_nextpkt = c->freelist;
+	c->freelist = m;
+	return 0;
+}
+
+/*
+ * enqueue returns non-zero when the packet is dropped.
+ * With a scheduler hook installed the packet goes to the hook,
+ * otherwise it is appended to the built-in head/tail list.
+ */
+static int
+enqueue(struct cfg_s *c, void *_m)
+{
+	struct mbuf *m = _m;
+	if (c->enq)
+		return c->enq(c->si, FI2Q(c, m->flow_id), m);
+	if (c->head == NULL)
+		c->head = m;
+	else
+		c->tail->m_nextpkt = m;
+	c->tail = m;
+	return 0; /* default - success */
+}
+
+/*
+ * dequeue returns NON-NULL when a packet is available.
+ * With a scheduler hook installed the hook decides, otherwise the
+ * first packet of the built-in head/tail list is unlinked.
+ */
+static void *
+dequeue(struct cfg_s *c)
+{
+	struct mbuf *m;
+	if (c->deq)
+		return c->deq(c->si);
+	if ((m = c->head)) {
+		c->head = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+	}
+	return m;
+}
+
+/*
+ * Run c->loops iterations. In each one the controller decides what
+ * to do: c->tosend, if set, is enqueued (and released through drop()
+ * when the enqueue fails); if c->can_dequeue is set one packet is
+ * dequeued and recycled through drop(), whose drop count is then
+ * compensated. Always returns 0.
+ */
+static int
+mainloop(struct cfg_s *c)
+{
+	int i;
+	struct mbuf *m;
+
+	for (i=0; i < c->loops; i++) {
+		/* implement hysteresis */
+		controller(c);
+		DX(3, "loop %d enq %d send %p rx %d",
+			i, c->_enqueue, c->tosend, c->can_dequeue);
+		if ( (m = c->tosend) ) {
+			c->_enqueue++;
+			if (enqueue(c, m)) {
+				drop(c, m);
+				ND("loop %d enqueue fail", i );
+			} else {
+				ND("enqueue ok");
+				c->pending++;
+			}
+		}
+		if (c->can_dequeue) {
+			c->dequeue++;
+			if ((m = dequeue(c))) {
+				c->pending--;
+				drop(c, m);
+				c->drop--;	/* compensate */
+			}
+		}
+	}
+	DX(1, "mainloop ends %d", i);
+	return 0;
+}
+
+/* print the byte total of each flow queue; always returns 0 */
+int
+dump(struct cfg_s *c)
+{
+	int i;
+	struct dn_queue *q;
+
+	for (i=0; i < c->flows; i++) {
+		q = FI2Q(c, i);
+		/* cast so the argument always matches %lld */
+		DX(1, "queue %4d tot %10lld", i, (long long)q->ni.tot_bytes);
+	}
+	DX(1, "done %d loops\n", c->loops);
+	return 0;
+}
+
+/*
+ * Interpret a number in human form.
+ * s is parsed with strtol (base auto-detected); key is only used in
+ * debug messages. A NULL/empty string or a negative value is invalid
+ * and yields 0. One suffix character is examined:
+ *	K, M	multiply by 1000, 1000000
+ *	k, m	multiply by 1024, 1024*1024
+ *	n	return the value negated (init() later multiplies such
+ *		thresholds by the number of flows)
+ *	w	accepted, no effect
+ * An unrecognised character is reported and left in place.
+ * If next is not NULL, *next is set to the text that follows the
+ * number and its suffix, or to NULL when nothing is left.
+ */
+static long
+getnum(const char *s, char **next, const char *key)
+{
+	char *end = NULL;
+	long l;
+
+	if (next)	/* default */
+		*next = NULL;
+	if (s && *s) {
+		DX(3, "token is <%s> %s", s, key ? key : "-");
+		l = strtol(s, &end, 0);
+	} else {
+		DX(3, "empty string");
+		l = -1;
+	}
+	if (l < 0) {
+		DX(2, "invalid %s for %s", s ? s : "NULL", (key ? key : "") );
+		return 0;	// invalid
+	}
+	if (!end || !*end)
+		return l;
+	if (*end == 'n')
+		l = -l;	/* multiply by n */
+	else if (*end == 'K')
+		l = l*1000;
+	else if (*end == 'M')
+		l = l*1000000;
+	else if (*end == 'k')
+		l = l*1024;
+	else if (*end == 'm')
+		l = l*1024*1024;
+	else if (*end == 'w')
+		;
+	else {/* not recognized */
+		/* key may be NULL: never hand NULL to %s */
+		D("suffix %s for %s, next %p", end, key ? key : "", next);
+		end--;
+	}
+	end++;
+	DX(3, "suffix now %s for %s, next %p", end, key ? key : "", next);
+	if (next && *end) {
+		DX(3, "setting next to %s for %s", end, key ? key : "");
+		*next = end;
+	}
+	return l;
+}
+
+/*
+ * flowsets are a comma-separated list of
+ *     weight:maxlen:flows
+ * indicating how many flows are hooked to that fs.
+ * Both weight and range can be min-max-steps.
+ * In a first pass we just count the number of flowsets and flows,
+ * in a second pass we complete the setup.
+ *
+ * Pass 0 records fs in c->fs_config; later passes parse the recorded
+ * string and ignore the fs argument. The working copy of the string
+ * is released before returning on every path.
+ */
+static void
+parse_flowsets(struct cfg_s *c, const char *fs, int pass)
+{
+	char *s, *cur, *next;
+	int n_flows = 0, n_fs = 0, wsum = 0;
+	int i, j;
+	struct dn_fs *prev = NULL;
+
+	DX(3, "--- pass %d flows %d flowsets %d", pass, c->flows, c->flowsets);
+	if (pass == 0)
+		c->fs_config = fs;
+	s = c->fs_config ? strdup(c->fs_config) : NULL;
+	if (s == NULL) {
+		if (pass == 0)
+			D("no fsconfig");
+		return;
+	}
+	for (next = s; (cur = strsep(&next, ","));) {
+		char *p = NULL;
+		int w, w_h, w_steps, wi;
+		int len, len_h, l_steps, li;
+		int flows;
+
+		w = getnum(strsep(&cur, ":"), &p, "weight");
+		if (w <= 0)
+			w = 1;
+		w_h = p ? getnum(p+1, &p, "weight_max") : w;
+		w_steps = p ? getnum(p+1, &p, "w_steps") : (w_h == w ?1:2);
+		len = getnum(strsep(&cur, ":"), &p, "len");
+		if (len <= 0)
+			len = 1000;
+		len_h = p ? getnum(p+1, &p, "len_max") : len;
+		l_steps = p ? getnum(p+1, &p, "l_steps") : (len_h == len ? 1 : 2);
+		flows = getnum(strsep(&cur, ":"), NULL, "flows");
+		if (flows == 0)
+			flows = 1;
+		DX(4, "weight %d..%d (%d) len %d..%d (%d) flows %d",
+			w, w_h, w_steps, len, len_h, l_steps, flows);
+		if (w == 0 || w_h < w || len == 0 || len_h < len ||
+				flows == 0) {
+			DX(4,"wrong parameters %s", c->fs_config);
+			free(s);
+			return;
+		}
+		n_flows += flows * w_steps * l_steps;
+		for (i = 0; i < w_steps; i++) {
+			wi = w + ((w_h - w)* i)/(w_steps == 1 ? 1 : (w_steps-1));
+			for (j = 0; j < l_steps; j++, n_fs++) {
+				struct dn_fs *dfs;
+				int x;
+
+				li = len + ((len_h - len)* j)/(l_steps == 1 ? 1 : (l_steps-1));
+				x = (wi*2048)/li;
+				DX(3, "----- fs %4d weight %4d lmax %4d X %4d flows %d",
+					n_fs, wi, li, x, flows);
+				if (pass == 0)
+					continue;
+				if (c->fs == NULL || c->flowsets <= n_fs) {
+					D("error in number of flowsets");
+					free(s);
+					return;
+				}
+				/* only touch the array once it is known to exist */
+				dfs = &c->fs[n_fs].fs;
+				wsum += wi * flows;
+				dfs->par[0] = wi;
+				dfs->par[1] = li;
+				dfs->index = n_fs;
+				dfs->n_flows = flows;
+				dfs->cur = dfs->first_flow = prev==NULL ? 0 : prev->next_flow;
+				dfs->next_flow = dfs->first_flow + dfs->n_flows;
+				dfs->y = x * flows;
+				dfs->base_y = (prev == NULL) ? 0 : prev->next_y;
+				dfs->next_y = dfs->base_y + dfs->y;
+				prev = dfs;
+			}
+		}
+	}
+	free(s);	/* tokens are not referenced past this point */
+	c->max_y = prev ? prev->base_y + prev->y : 0;
+	c->flows = n_flows;
+	c->flowsets = n_fs;
+	c->wsum = wsum;
+	if (pass == 0)
+		return;
+
+	/* now link all flows to their parent flowsets */
+	DX(1,"%d flows on %d flowsets max_y %d", c->flows, c->flowsets, c->max_y);
+	for (i=0; i < c->flowsets; i++) {
+		struct dn_fs *dfs = &c->fs[i].fs;
+		DX(1, "fs %3d w %5d l %4d flow %5d .. %5d y %6d .. %6d",
+			i, dfs->par[0], dfs->par[1],
+			dfs->first_flow, dfs->next_flow,
+			dfs->base_y, dfs->next_y);
+		for (j = dfs->first_flow; j < dfs->next_flow; j++) {
+			struct dn_queue *q = FI2Q(c, j);
+			q->fs = &c->fs[i];
+		}
+	}
+}
+
+/*
+ * Parse the command line stored in c->ac/c->av (options come in
+ * "-name value" pairs: -n -d -alg -len -burst -qmax -qmin -flows
+ * -flowsets), apply defaults, hook the selected scheduler, allocate
+ * queues, flowsets and the scheduler instance, and put every flow on
+ * the backlog-0 list. Exits on allocation failure, otherwise
+ * returns 0.
+ */
+static int
+init(struct cfg_s *c)
+{
+	int i;
+	int ac = c->ac;
+	char * const *av = c->av;
+
+	c->si_len = sizeof(struct dn_sch_inst);
+	c->q_len = sizeof(struct dn_queue);
+	moduledata_t *mod = NULL;
+	struct dn_alg *p = NULL;
+
+	c->th_min = 0;
+	c->th_max = -20;/* 20 packets per flow */
+	c->lmin = c->lmax = 1280;	/* packet len */
+	c->flows = 1;
+	c->flowsets = 1;
+	c->name = "null";
+	ac--; av++;
+	while (ac > 1) {
+		if (!strcmp(*av, "-n")) {
+			c->loops = getnum(av[1], NULL, av[0]);
+		} else if (!strcmp(*av, "-d")) {
+			debug = atoi(av[1]);
+		} else if (!strcmp(*av, "-alg")) {
+			extern moduledata_t *_g_dn_fifo;
+			extern moduledata_t *_g_dn_wf2qp;
+			extern moduledata_t *_g_dn_rr;
+			extern moduledata_t *_g_dn_qfq;
+#ifdef WITH_KPS
+			extern moduledata_t *_g_dn_kps;
+#endif
+			if (!strcmp(av[1], "rr"))
+				mod = _g_dn_rr;
+			else if (!strcmp(av[1], "wf2qp"))
+				mod = _g_dn_wf2qp;
+			else if (!strcmp(av[1], "fifo"))
+				mod = _g_dn_fifo;
+			else if (!strcmp(av[1], "qfq"))
+				mod = _g_dn_qfq;
+#ifdef WITH_KPS
+			else if (!strcmp(av[1], "kps"))
+				mod = _g_dn_kps;
+#endif
+			else
+				mod = NULL;
+			c->name = mod ? mod->name : "NULL";
+			DX(3, "using scheduler %s", c->name);
+		} else if (!strcmp(*av, "-len")) {
+			c->lmin = getnum(av[1], NULL, av[0]);
+			c->lmax = c->lmin;
+			DX(3, "setting len to %d", c->lmin);
+		} else if (!strcmp(*av, "-burst")) {
+			c->maxburst = getnum(av[1], NULL, av[0]);
+			DX(3, "setting burst to %d", c->maxburst);
+		} else if (!strcmp(*av, "-qmax")) {
+			c->th_max = getnum(av[1], NULL, av[0]);
+			DX(3, "setting max to %d", c->th_max);
+		} else if (!strcmp(*av, "-qmin")) {
+			c->th_min = getnum(av[1], NULL, av[0]);
+			DX(3, "setting min to %d", c->th_min);
+		} else if (!strcmp(*av, "-flows")) {
+			c->flows = getnum(av[1], NULL, av[0]);
+			DX(3, "setting flows to %d", c->flows);
+		} else if (!strcmp(*av, "-flowsets")) {
+			parse_flowsets(c, av[1], 0);
+			DX(3, "setting flowsets to %d", c->flowsets);
+		} else {
+			D("option %s not recognised, ignore", *av);
+		}
+		ac -= 2; av += 2;
+	}
+	if (c->maxburst <= 0)
+		c->maxburst = 1;
+	if (c->loops <= 0)
+		c->loops = 1;
+	if (c->flows <= 0)
+		c->flows = 1;
+	if (c->flowsets <= 0)
+		c->flowsets = 1;
+	if (c->lmin <= 0)
+		c->lmin = 1;
+	if (c->lmax <= 0)
+		c->lmax = 1;
+	/* multiply by N */
+	if (c->th_min < 0)
+		c->th_min = c->flows * -c->th_min;
+	if (c->th_max < 0)
+		c->th_max = c->flows * -c->th_max;
+	if (c->th_max <= c->th_min)
+		c->th_max = c->th_min + 1;
+	if (mod) {
+		p = mod->p;
+		DX(3, "using module %s f %p p %p", mod->name, mod->f, mod->p);
+		DX(3, "modname %s ty %d", p->name, p->type);
+		c->enq = p->enqueue;
+		c->deq = p->dequeue;
+		c->si_len += p->si_datalen;
+		c->q_len += p->q_datalen;
+		c->schk_len += p->schk_datalen;
+	}
+	/* allocate queues, flowsets and one scheduler */
+	c->q = calloc(c->flows, c->q_len);
+	c->fs = calloc(c->flowsets, sizeof(struct dn_fsk));
+	c->si = calloc(1, c->si_len);
+	c->sched = calloc(c->flows, c->schk_len);
+	/* c->si is dereferenced right below, so it must be checked too */
+	if (c->q == NULL || c->fs == NULL || c->si == NULL) {
+		D("error allocating memory for flows");
+		exit(1);
+	}
+	c->si->sched = c->sched;
+	if (p) {
+		if (p->config)
+			p->config(c->sched);
+		if (p->new_sched)
+			p->new_sched(c->si);
+	}
+	/*
+	 * parse_flowsets links queues to their flowsets.
+	 * The second pass works on the string recorded by pass 0; av has
+	 * been advanced past the options, so av[1] must not be read here.
+	 */
+	parse_flowsets(c, c->fs_config, 1);
+	/* complete the work calling new_fsk */
+	for (i = 0; i < c->flowsets; i++) {
+		if (c->fs[i].fs.par[1] == 0)
+			c->fs[i].fs.par[1] = 1000;	/* default pkt len */
+		c->fs[i].sched = c->sched;
+		if (p && p->new_fsk)
+			p->new_fsk(&c->fs[i]);
+	}
+
+	/* initialize the lists for the generator, and put
+	 * all flows in the list for backlog = 0
+	 */
+	for (i=0; i <= BACKLOG+5; i++)
+		INIT_LIST_HEAD(&c->ll[i]);
+
+	for (i = 0; i < c->flows; i++) {
+		struct dn_queue *q = FI2Q(c, i);
+		if (q->fs == NULL)
+			q->fs = &c->fs[0]; /* XXX */
+		q->_si = c->si;
+		if (p && p->new_queue)
+			p->new_queue(q);
+		INIT_LIST_HEAD(&q->ni.h);
+		list_add_tail(&q->ni.h, &c->ll[0]);
+	}
+	c->llmask = 1;
+	return 0;
+}
+
+
+/*
+ * Set up the configuration from the command line, time one run of
+ * mainloop(), and print the elapsed time, the time per enqueue
+ * attempt in nanoseconds, the queue thresholds and the drop count.
+ */
+int
+main(int ac, char *av[])
+{
+	struct cfg_s c;
+	struct timeval end;
+	double ll;
+	int i;
+	char msg[40];
+
+	bzero(&c, sizeof(c));
+	c.ac = ac;
+	c.av = av;
+	init(&c);
+	gettimeofday(&c.time, NULL);
+	mainloop(&c);
+	gettimeofday(&end, NULL);
+	end.tv_sec -= c.time.tv_sec;
+	end.tv_usec -= c.time.tv_usec;
+	if (end.tv_usec < 0) {
+		end.tv_usec += 1000000;
+		end.tv_sec--;
+	}
+	c.time = end;
+	/* do the arithmetic in double so the microsecond count cannot overflow */
+	ll = (double)end.tv_sec * 1000000 + end.tv_usec;
+	ll *= 1000;	/* convert to nanoseconds */
+	ll /= c._enqueue;
+	snprintf(msg, sizeof(msg), "1::%d", c.flows);
+	D("%-8s n %d %d time %d.%06d %8.3f qlen %d %d flows %s drops %d",
+		c.name, c._enqueue, c.loops,
+		(int)c.time.tv_sec, (int)c.time.tv_usec, ll,
+		c.th_min, c.th_max,
+		c.fs_config ? c.fs_config : msg, c.drop);
+	dump(&c);
+	DX(1, "done ac %d av %p", ac, av);
+	for (i=0; i < ac; i++)
+		DX(1, "arg %d %s", i, av[i]);
+	return 0;
+}
+
+/*
+ * The controller decides whether in this iteration we should send
+ * (the packet is in c->tosend) and/or receive (flag c->can_dequeue)
+ *
+ * While filling (state 0) it picks the first queue of the non-empty
+ * list with the lowest index in llmask, moves that queue to the next
+ * list and builds a packet for it. While draining (state 1) it only
+ * enables dequeueing.
+ */
+static void
+controller(struct cfg_s *c)
+{
+	struct mbuf *m;
+	struct dn_fs *fs;
+	int flow_id;
+
+	/* hysteresis between max and min */
+	if (c->state == 0 && c->pending >= c->th_max)
+		c->state = 1;
+	else if (c->state == 1 && c->pending <= c->th_min)
+		c->state = 0;
+	ND(1, "state %d pending %2d", c->state, c->pending);
+	c->can_dequeue = c->state;
+	c->tosend = NULL;
+	if (c->state)
+		return;
+
+	if (1) {
+		int i;
+		struct dn_queue *q;
+		struct list_head *h;
+
+		i = ffs(c->llmask) - 1;
+		if (i < 0) {
+			DX(2, "no candidate");
+			c->can_dequeue = 1;
+			return;
+		}
+		h = &c->ll[i];
+		ND(1, "backlog %d p %p prev %p next %p", i, h, h->prev, h->next);
+		q = list_first_entry(h, struct dn_queue, ni.h);
+		list_del(&q->ni.h);
+		flow_id = Q2FI(c, q);
+		DX(2, "extracted flow %p %d backlog %d", q, flow_id, i);
+		/*
+		 * NOTE(review): the text between "1<" and "ni.h, h+1)" was
+		 * lost in this copy of the patch. The four lines below
+		 * (end of the if, the "before" trace, the list_add_tail
+		 * call) are a reconstruction -- confirm against upstream.
+		 */
+		if (list_empty(h)) {
+			ND(2, "backlog %d empty", i);
+			c->llmask &= ~(1<<i);
+		}
+		ND(1, "before %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next);
+		list_add_tail(&q->ni.h, h+1);
+		ND(1, " after %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next);
+		if (i < BACKLOG) {
+			ND(2, "backlog %d full", i+1);
+			c->llmask |= 1<<(1+i);
+		}
+		fs = &q->fs->fs;
+		c->cur_fs = q->fs - c->fs;
+		fs->cur = flow_id;
+	} else {
+		/* XXX this does not work ? */
+		/* now decide whom to send the packet, and the length */
+		/* lookup in the flow table */
+		if (c->cur_y >= c->max_y) {	/* handle wraparound */
+			c->cur_y = 0;
+			c->cur_fs = 0;
+		}
+		fs = &c->fs[c->cur_fs].fs;
+		flow_id = fs->cur++;
+		if (fs->cur >= fs->next_flow)
+			fs->cur = fs->first_flow;
+		c->cur_y++;
+		if (c->cur_y >= fs->next_y)
+			c->cur_fs++;
+	}
+
+	/* construct a packet, reusing one from the freelist if possible */
+	if (c->freelist) {
+		m = c->tosend = c->freelist;
+		c->freelist = c->freelist->m_nextpkt;
+	} else {
+		m = c->tosend = calloc(1, sizeof(struct mbuf));
+	}
+	if (m == NULL)
+		return;
+
+	m->cfg = c;
+	m->m_nextpkt = NULL;
+	m->m_pkthdr.len = fs->par[1]; // XXX maxlen
+	m->flow_id = flow_id;
+
+	ND(2,"y %6d flow %5d fs %3d weight %4d len %4d",
+		c->cur_y, m->flow_id, c->cur_fs,
+		fs->par[0], m->m_pkthdr.len);
+
+}
+
+/*
+Packet allocation:
+to achieve a distribution that matches weights, for each X=w/lmax class
+we should generate a number of packets proportional to Y = X times the number
+of flows in the class.
+So we construct an array with the cumulative distribution of Y's,
+and use it to identify the flow via inverse mapping (if the Y's are
+not too many we can use an array for the lookup). In practice,
+each flow will have X entries [virtually] pointing to it.
+
+*/
diff --git a/test/memory_leak.sh b/test/memory_leak.sh
new file mode 100644
index 0000000..9bdf093
--- /dev/null
+++ b/test/memory_leak.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+# this script execute N times the command CMD
+# collecting the memory usage on a file.
+# The value of the Dirty memory should not increase
+# between tests.
+
+BASE_NAME=ipfw_r5808_
+N=10000
+CMD1="/sbin/insmod ../dummynet2/ipfw_mod.ko"
+CMD2="/sbin/rmmod ipfw_mod"
+
+# main
+# remove any previous loaded module
+/sbin/rmmod ipfw_mod
+
+# pre
+
+for n in `seq $N`; do
+	$CMD1
+	$CMD2
+	[ $n = 10 ] && cat /proc/meminfo > /tmp/${BASE_NAME}_${n}
+	[ $n = 100 ] && cat /proc/meminfo > /tmp/${BASE_NAME}_${n}
+	[ $n = 1000 ] && cat /proc/meminfo > /tmp/${BASE_NAME}_${n}
+done;
+
+# post
diff --git a/test/mylist.h b/test/mylist.h
new file mode 100644
index 0000000..b546fc2
--- /dev/null
+++ b/test/mylist.h
@@ -0,0 +1,69 @@
+/*
+ * $Id: mylist.h 5626 2010-03-04 21:55:22Z luigi $
+ *
+ * linux-like bidirectional lists
+ *
+ * Circular doubly-linked lists with a sentinel head: an empty list
+ * is a head whose prev and next both point to the head itself.
+ * list_del() calls ND(), which is not defined here; the includer is
+ * expected to provide it.
+ */
+
+#ifndef _MYLIST_H
+#define _MYLIST_H
+
+#include <stddef.h>	/* offsetof(), used by list_first_entry() */
+
+struct list_head {
+	struct list_head *prev, *next;
+};
+
+/* make l an empty list: both links point back to l itself */
+#define INIT_LIST_HEAD(l) do { (l)->prev = (l)->next = (l); } while (0)
+/* true when the next link of l is l itself, i.e. the list is empty */
+#define list_empty(l)	( (l)->next == (l) )
+
+/* link o between the two adjacent nodes prev and next */
+static inline void
+__list_add(struct list_head *o, struct list_head *prev,
+	struct list_head *next)
+{
+	next->prev = o;
+	o->next = next;
+	o->prev = prev;
+	prev->next = o;
+}
+
+/* insert o just before head, i.e. as the last element of the list */
+static inline void
+list_add_tail(struct list_head *o, struct list_head *head)
+{
+	__list_add(o, head->prev, head);
+}
+
+/*
+ * Pointer to the object of type ty that embeds, as field member, the
+ * node following pL. On an empty list pL->next is the head itself, so
+ * the result does not refer to a real element.
+ */
+#define list_first_entry(pL, ty, member)			\
+	((ty *)((char *)((pL)->next) - offsetof(ty, member)))
+
+/* make prev and next adjacent, dropping whatever sat between them */
+static inline void
+__list_del(struct list_head *prev, struct list_head *next)
+{
+	next->prev = prev;
+	prev->next = next;
+}
+
+/* unlink entry from its neighbours and set both of its links to NULL */
+static inline void
+list_del(struct list_head *entry)
+{
+	ND("called on %p", entry);
+	__list_del(entry->prev, entry->next);
+	entry->next = entry->prev = NULL;
+}
+
+#endif /* _MYLIST_H */
diff --git a/test/profile_bench1 b/test/profile_bench1
new file mode 100644
index 0000000..797650f
--- /dev/null
+++ b/test/profile_bench1
@@ -0,0 +1,26 @@
+profile_no 100
+delay prob
+207 0.000264
+255 0.034117
+270 0.072280
+279 0.106749
+288 0.148604
+298 0.184304
+302 0.202194
+353 0.384541
+423 0.588842
+510 0.782126
+516 0.800970
+545 0.845706
+553 0.861411
+573 0.889430
+586
0.912117 +620 0.920003 +661 0.938308 +695 0.944191 +740 0.949112 +765 0.952598 +848 0.957109 +1379 0.983768 +1555 0.983778 +1649 1 diff --git a/test/profile_bench2 b/test/profile_bench2 new file mode 100644 index 0000000..c733868 --- /dev/null +++ b/test/profile_bench2 @@ -0,0 +1,7 @@ +samples 10 +delay prob +0 0 +250 0 +250 0.5 +500 0.5 +500 1 diff --git a/test/profile_bench3 b/test/profile_bench3 new file mode 100644 index 0000000..5d1722e --- /dev/null +++ b/test/profile_bench3 @@ -0,0 +1,5 @@ +profile_no 100 +delay prob +0 0 +50 0.5 +100 1 diff --git a/test/test_dn_heap.c b/test/test_dn_heap.c new file mode 100644 index 0000000..7d3dc05 --- /dev/null +++ b/test/test_dn_heap.c @@ -0,0 +1,162 @@ +/*- + * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Userland code for testing binary heaps and hash tables
+ *
+ * $Id: test_dn_heap.c 6131 2010-04-22 15:37:36Z svn_panicucci $
+ */
+
+/*
+ * NOTE(review): the names inside the six <...> include directives of
+ * this file were lost in this copy of the patch. The ones below cover
+ * what the code uses (printf, malloc/atoi/random/exit, bzero,
+ * strcmp/strcpy/strlen) -- confirm against upstream.
+ */
+#include <sys/cdefs.h>
+#include <sys/param.h>
+
+#include <stdio.h>
+#include <strings.h>
+#include <stdlib.h>
+#include "dn_test.h"
+#include "dn_heap.h"
+#define log(x, arg...)	fprintf(stderr, ## arg)
+#define panic(x...)	fprintf(stderr, ## x), exit(1)
+
+#include <string.h>
+
+/* hash table element: a link field followed by a NUL-terminated string */
+struct x {
+	struct x *ht_link;
+	char buf[];	/* flexible array member, sized by newfn() */
+};
+
+/*
+ * Hash function: the first character of the key. With DNHT_KEY_IS_OBJ
+ * the key is a struct x and its buf is used, otherwise the key is the
+ * string itself.
+ */
+uint32_t hf(uintptr_t key, int flags, void *arg)
+{
+	return (flags & DNHT_KEY_IS_OBJ) ?
+		((struct x *)key)->buf[0] : *(char *)key;
+}
+
+/*
+ * Match function: non-zero when the string stored in obj equals the
+ * key string (taken from the key object when DNHT_KEY_IS_OBJ is set).
+ */
+int matchf(void *obj, uintptr_t key, int flags, void *arg)
+{
+	char *s = (flags & DNHT_KEY_IS_OBJ) ?
+		((struct x *)key)->buf : (char *)key;
+	return (strcmp(((struct x *)obj)->buf, s) == 0);
+}
+
+/*
+ * Constructor: allocate a struct x holding a copy of the string key.
+ * flags and arg are ignored; ht_link is left uninitialized here.
+ * Returns NULL when malloc fails.
+ */
+void *newfn(uintptr_t key, int flags, void *arg)
+{
+	char *s = (char *)key;
+	struct x *p = malloc(sizeof(*p) + 1 + strlen(s));
+	if (p)
+		strcpy(p->buf, s);
+	return p;
+}
+
+/* test keys; some strings appear twice on purpose */
+char *strings[] = {
+	"undici", "unico", "doppio", "devoto",
+	"uno", "due", "tre", "quattro", "cinque", "sei",
+	"uno", "due", "tre", "quattro", "cinque", "sei",
+	NULL,
+};
+
+/* scan callback: print the element and return arg converted to int */
+int doprint(void *_x, void *arg)
+{
+	struct x *x = _x;
+	printf("found element <%s>\n", x->buf);
+	/* go through intptr_t: a direct pointer-to-int cast may truncate */
+	return (int)(intptr_t)arg;
+}
+
+/*
+ * Exercise the hash table twice: first inserting by string key with
+ * newfn as constructor, then inserting pre-built objects into a table
+ * without constructor. Finally remove the first object (by object)
+ * and the second string (by key), each twice, printing the results.
+ */
+static void
+test_hash(void)
+{
+	char **p;
+	struct dn_ht *h;
+	uintptr_t x = 0;
+	uintptr_t x1 = 0;
+
+	/* first, find and allocate */
+	h = dn_ht_init(NULL, 10, 0, hf, matchf, newfn);
+
+	for (p = strings; *p; p++) {
+		dn_ht_find(h, (uintptr_t)*p, DNHT_INSERT, NULL);
+	}
+	dn_ht_scan(h, doprint, 0);
+	printf("/* second -- find without allocate */\n");
+	h = dn_ht_init(NULL, 10, 0, hf, matchf, NULL);
+	for (p = strings; *p; p++) {
+		void *y = newfn((uintptr_t)*p, 0, NULL);
+		if (x == 0)
+			x = (uintptr_t)y;	/* first object built */
+		else {
+			if (x1 == 0)
+				x1 = (uintptr_t)*p;	/* second key string */
+		}
+		dn_ht_find(h, (uintptr_t)y, DNHT_INSERT | DNHT_KEY_IS_OBJ, NULL);
+	}
+	dn_ht_scan(h, doprint, 0);
+	printf("remove %p gives %p\n", (void *)x,
+		dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL));
+	printf("remove %p gives %p\n", (void *)x,
+		dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL));
+	/* these two remove x1, so x1 is the value to report */
+	printf("remove %p gives %p\n", (void *)x1,
+		dn_ht_find(h, x1, DNHT_REMOVE, NULL));
+	printf("remove %p gives %p\n", (void *)x1,
+		dn_ht_find(h, x1, DNHT_REMOVE, NULL));
+	dn_ht_scan(h, doprint, 0);
+}
+
+/*
+ * Run the hash table test and exit. The heap test that follows the
+ * first return is currently unreachable: argv[1] = elements,
+ * argv[2] = cycles, argv[3] non-zero selects decreasing keys instead
+ * of random ones.
+ */
+int
+main(int argc, char *argv[])
+{
+	struct dn_heap h;
+	int i, n, n2, n3;
+
+	test_hash();
+	return 0;
+
+	/* n = elements, n2 = cycles */
+	n = (argc > 1) ? atoi(argv[1]) : 0;
+	if (n <= 0 || n > 1000000)
+		n = 100;
+	n2 = (argc > 2) ? atoi(argv[2]) : 0;
+	if (n2 <= 0)
+		n2 = 1000000;	/* default the cycle count, not n */
+	n3 = (argc > 3) ? atoi(argv[3]) : 0;
+	bzero(&h, sizeof(h));
+	heap_init(&h, n, -1);
+	while (n2-- > 0) {
+		uint64_t prevk = 0;
+		for (i=0; i < n; i++)
+			heap_insert(&h, n3 ? n-i: random(),
+				(void *)(uintptr_t)(100+i));
+
+		/* keys must come out in non-decreasing order */
+		for (i=0; h.elements > 0; i++) {
+			uint64_t k = h.p[0].key;
+			if (k < prevk)
+				panic("wrong sequence\n");
+			prevk = k;
+			if (0)
+				printf("%d key %llu, val %p\n",
+					i, (unsigned long long)h.p[0].key,
+					h.p[0].object);
+			heap_extract(&h, NULL);
+		}
+	}
+	return 0;
+}
diff --git a/test/test_dn_sched.c b/test/test_dn_sched.c
new file mode 100644
index 0000000..65bbf18
--- /dev/null
+++ b/test/test_dn_sched.c
@@ -0,0 +1,89 @@
+/*
+ * $Id: test_dn_sched.c 5626 2010-03-04 21:55:22Z luigi $
+ *
+ * library functions for userland testing of dummynet schedulers
+ */
+
+#include "dn_test.h"
+
+void
+m_freem(struct mbuf *m)
+{
+	printf("free %p\n", m);
+}
+
+int
+dn_sched_modevent(module_t mod, int cmd, void *arg)
+{
+	return 0;
+}
+
+void
+dn_free_pkts(struct mbuf *m)
+{
+	struct mbuf *x;
+	while ( (x = m) ) {
+		m = m->m_nextpkt;
+		m_freem(x);
+	}
+}
+
+int
+dn_delete_queue(void *_q, void *do_free)
+{
+	struct dn_queue *q = _q;
+	if (q->mq.head)
+		dn_free_pkts(q->mq.head);
+	free(q);
+	return 0;
+}
+
+/*
+ * This is a simplified function for testing purposes, which does
+ * not implement statistics or random loss.
+ * Enqueue a packet in q, subject to space and queue management policy
+ * (whose parameters are in q->fs).
+ * Update stats for the queue and the scheduler.
+ * Return 0 on success, 1 on drop. The packet is consumed anyways.
+ */
+int
+dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
+{
+	/* refuse on request, or once the queue already holds 200 packets */
+	if (drop || q->ni.length >= 200) {
+		q->ni.drops++;
+		return 1;
+	}
+	mq_append(&q->mq, m);
+	q->ni.length += 1;
+	q->ni.tot_bytes += m->m_pkthdr.len;
+	return 0;
+}
+
+/* clamp *v in place: below lo it becomes dflt, above hi it becomes hi */
+int
+ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg)
+{
+	int cur = *v;
+
+	if (cur < lo)
+		*v = dflt;
+	else if (cur > hi)
+		*v = hi;
+	return *v;
+}
+
+#ifndef __FreeBSD__
+/* 1-based position of the most significant set bit, 0 when mask is 0 */
+int
+fls(int mask)
+{
+	unsigned int bits = (unsigned int)mask;
+	int pos;
+
+	for (pos = 0; bits != 0; bits >>= 1)
+		pos++;
+	return pos;
+}
+#endif
-- 
2.43.0