From: Luigi Rizzo Date: Sun, 17 Aug 2014 10:30:20 +0000 (-0700) Subject: initial version, corresponding to ipfw3-2012 X-Git-Url: http://git.onelab.eu/?p=ipfw-google.git;a=commitdiff_plain;h=8e87ee720bdedbc75ae87923fa1032af0d62d5fe initial version, corresponding to ipfw3-2012 --- 8e87ee720bdedbc75ae87923fa1032af0d62d5fe diff --git a/020-mips-hz1000.patch b/020-mips-hz1000.patch new file mode 100644 index 0000000..eb54ca2 --- /dev/null +++ b/020-mips-hz1000.patch @@ -0,0 +1,11 @@ +--- include/asm-mips/param_orig.h 2010-02-23 12:45:58.000000000 +0100 ++++ include/asm-mips/param.h 2010-02-23 12:00:31.000000000 +0100 +@@ -41,7 +41,7 @@ + counter is increasing. This value is independent from the external value + and can be changed in order to suit the hardware and application + requirements. */ +-# define HZ 100 ++# define HZ 1000 + # define hz_to_std(a) (a) + + #endif /* Not a DECstation */ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..508f1ae --- /dev/null +++ b/Makefile @@ -0,0 +1,149 @@ +# $Id: Makefile 11689 2012-08-12 21:07:34Z luigi $ +# +# Top level makefile for building ipfw/dummynet (kernel and userspace). +# You can run it manually or also under the Planetlab build. +# Planetlab wants also the 'install' target. +# +# To build on system with non standard Kernel sources or userland files, +# you should run this with +# +# make KERNELPATH=/path/to/linux-2.x.y.z USRDIR=/path/to/usr +# +# We assume that $(USRDIR) contains include/ and lib/ used to build userland. 
+# + +include Makefile.inc + +DATE ?= $(shell date +%Y%m%d) +SNAPSHOT_NAME=$(DATE)-ipfw3.tgz +BINDIST=$(DATE)-dummynet-linux.tgz +WINDIST=$(DATE)-dummynet-windows.zip + +DISTFILES= Makefile Makefile.inc README binary* ipfw kipfw *.h sys + +.PHONY: ipfw kipfw + +########################################### +# windows x86 and x64 specific variables # +########################################### +# DRIVE must be the hard drive letter where DDK is installed +# DDKDIR must be the path to the DDK root directory, without drive letter +# TARGETOS (x64 only) must be one of the following: +# wnet -> windows server 2003 +# wlh -> windows vista and windows server 2008 +# win7 -> windows 7 +# future version must be added here +DRIVE ?= C: +DDKDIR ?= /WinDDK/7600.16385.1 +DDK = $(DRIVE)$(DDKDIR) +TARGETOS=win7 + +export WIN64 +export DDK +export DRIVE +export DDKDIR + +_all: all + +clean distclean: + -@(cd ipfw && $(MAKE) $(@) ) + -@rm -rf kipfw-mod binary64/[A-hj-z]* + +all: kipfw ipfw + @# -- windows only +ifeq ($(OSARCH),Windows) # copy files +ifeq ($(WIN64),) + -@ cp ipfw/ipfw.exe kipfw-mod/$(OBJDIR)/ipfw.sys binary/ + -@ cp kipfw/*.inf binary/ +else + -@ cp binary/* kipfw/*.inf binary64/ + -@ cp ipfw/ipfw.exe kipfw-mod/objchk_win7_amd64/amd64/ipfw.sys binary64/ +endif # WIN64 +endif # Windows + +win64: + $(MAKE) WIN64=1 + +# kipfw-src prepares the sources for the kernel part. +# The windows files (passthru etc.) are modified version of the +# examples found in the $(DDK)/src/network/ndis/passthru/driver/ +# They can be re-created using the 'ndis-glue' target +# # We need a sed trick to remove newlines from the patchfile. 
+ +ndis-glue: + -@mkdir -p kipfw-mod + cp $(DDK)/src/network/ndis/passthru/driver/*.[ch] kipfw-mod + cat kipfw/win-passthru.diff | sed "s/$$(printf '\r')//g" | (cd kipfw-mod; patch ) + +kipfw-src: + -@rm -rf kipfw-mod + -@mkdir -p kipfw-mod + -@cp -Rp kipfw/* kipfw-mod + -@cp `find sys -name \*.c` kipfw-mod + -@(cd kipfw-mod && $(MAKE) include_e) +ifeq ($(OSARCH),Windows) + make ndis-glue +endif + +snapshot: + $(MAKE) distclean + (tar cvzhf /tmp/$(SNAPSHOT_NAME) -s':^:ipfw3-2012/:' $(DISTFILES) ) + +bindist: + $(MAKE) clean + $(MAKE) all + tar cvzf /tmp/$(BINDIST) ipfw/ipfw ipfw/ipfw.8 kipfw-mod/ipfw_mod.ko + +windist: + $(MAKE) clean + -$(MAKE) all + -rm /tmp/$(WINDIST) + zip -r /tmp/$(WINDIST) binary -x \*.svn\* + + +ipfw: + @(cd ipfw && $(MAKE) $(@) ) + +kipfw: kipfw-src +ifeq ($(WIN64),) # linux or windows 32 bit + @(cd kipfw-mod && $(MAKE) $(@) ) +else #--- windows 64 bit, we use build.exe and nmake + rm -f kipfw-mod/Makefile + mkdir kipfw-mod/tmpbuild # check mysetenv.sh + bash kipfw/mysetenv.sh $(DRIVE) $(DDKDIR) $(TARGETOS) +endif + +openwrt_release: + # create a temporary directory + $(eval TMPDIR := $(shell mktemp -d -p /tmp/ ipfw3_openwrt_XXXXX)) + # create the source destination directory + $(eval IPFWDIR := ipfw3-$(DATE)) + $(eval DSTDIR := $(TMPDIR)/$(IPFWDIR)) + mkdir $(DSTDIR) + # copy the package, clean objects and svn info + cp -r ./ipfw ./kipfw-mod glue.h Makefile ./configuration README $(DSTDIR) + (cd $(DSTDIR); make -s distclean; find . 
-name .svn | xargs rm -rf) + (cd $(TMPDIR); tar czf $(IPFWDIR).tar.gz $(IPFWDIR)) + + # create the port files in /tmp/ipfw3-port + $(eval PORTDIR := $(TMPDIR)/ipfw3) + mkdir -p $(PORTDIR)/patches + # generate the Makefile, PKG_VERSION and PKG_MD5SUM + md5sum $(DSTDIR).tar.gz | cut -d ' ' -f 1 > $(TMPDIR)/md5sum + cat ./OPENWRT/Makefile | \ + sed s/PKG_VERSION:=/PKG_VERSION:=$(DATE)/ | \ + sed s/PKG_MD5SUM:=/PKG_MD5SUM:=`cat $(TMPDIR)/md5sum`/ \ + > $(PORTDIR)/Makefile + + @echo "" + @echo "The openwrt port is in $(TMPDIR)/ipfw3-port" + @echo "The source file should be copied to the public server:" + @echo "scp $(DSTDIR).tar.gz marta@info.iet.unipi.it:~marta/public_html/dummynet" + @echo "after this the temporary directory $(TMPDIR) can be removed." + +install: + +diff: + -@(diff -upr $(BSD_HEAD)/sbin/ipfw ipfw) + -@(diff -upr $(BSD_HEAD)/sys sys) + diff --git a/Makefile.inc b/Makefile.inc new file mode 100644 index 0000000..ffa14e9 --- /dev/null +++ b/Makefile.inc @@ -0,0 +1,23 @@ +# $Id$ +# GNU makefile header for ipfw/kipfw building +BSD_HEAD ?= ~/FreeBSD/head +OSARCH := $(shell uname) +OSARCH := $(findstring $(OSARCH),FreeBSD Linux Darwin) +ifeq ($(OSARCH),) + OSARCH := Windows +endif +OBJDIR=mia + +KSRC ?= /lib/modules/$(shell uname -r)/build +ifneq ($V,1) # no echo + MSG=@echo + HIDE=@ +else + MSG=@\# + HIDE= +endif + +.c.o: + $(MSG) " CC $<" + $(HIDE) $(CC) $(CFLAGS) -c $< -o $@ + diff --git a/Makefile.openwrt b/Makefile.openwrt new file mode 100644 index 0000000..3c7be80 --- /dev/null +++ b/Makefile.openwrt @@ -0,0 +1,95 @@ +# Makefile to build the package in openwrt. 
+# goes into package/ipfw3/Makefile +# +# Edit IPFW_DIR to point to the directory with the sources for ipfw + +IPFW_DIR := $(TOPDIR)/../ipfw3 + +include $(TOPDIR)/rules.mk +include $(INCLUDE_DIR)/kernel.mk + +PKG_NAME:=kmod-ipfw3 +PKG_RELEASE:=1 + +# MV is undefined +MV ?= mv + +include $(INCLUDE_DIR)/package.mk + +#Stuff depending on kernel version +ifeq ($(KERNEL),2.6) + +VERS:=2.6 +IPFW_MOD:=ipfw_mod.ko +IPFW_SRC_DIR:=M + +else + +VERS:=openwrt +CFLAGS_WRT:=-DSYSCTL_NODE -DEMULATE_SYSCTL +IPFW_MOD:=ipfw_mod.o +IPFW_SRC_DIR:=SUBDIRS + +endif + +# Description for the package. +# The names KernelPackage/ipfw3 must match the arguments to the +# call $(eval $(call KernelPackage,ipfw3)) used to build it + +define KernelPackage/ipfw3 + SUBMENU:=Other modules + TITLE:= IPFW and dummynet + # FILES is what makes up the module, both kernel and userland + # It must be in the KernelPackage section + FILES := $(PKG_BUILD_DIR)/dummynet2/$(IPFW_MOD) $(PKG_BUILD_DIR)/ipfw/ipfw + # AUTOLOAD:=$(call AutoLoad,80,ipfw_mod) +endef + +define KernelPackage/ipfw3/description + This package contains the ipfw and dummynet module +endef + +# Standard entries for the openwrt builds: Build/Prepare and Build/Compile +# Remember that commands must start with a tab + +# 'prepare' instructions for both kernel and userland +# We copy the entire subtree, then build include_e/ which +# contains empty headers used by the kernel sources. 
+define Build/Prepare + # $(warning Preparing ipfw sources) + mkdir -p $(PKG_BUILD_DIR) + $(CP) -Rp $(IPFW_DIR)/* $(PKG_BUILD_DIR)/ + (cd $(PKG_BUILD_DIR)/ipfw && $(MAKE) include_e ) + (cd $(PKG_BUILD_DIR)/dummynet2 && $(MAKE) include_e ) +endef + +define Build/Compile + # compile the kernel part for openwrt + $(MAKE) -C "$(LINUX_DIR)" \ + CROSS_COMPILE="$(TARGET_CROSS)" \ + ARCH="$(LINUX_KARCH)" \ + $(IPFW_SRC_DIR)="$(PKG_BUILD_DIR)/dummynet2" \ + VER=$(VERS) modules + # compile the userland part for openwrt + $(MAKE) -C $(PKG_BUILD_DIR)/ipfw \ + $(TARGET_CONFIGURE_OPTS) \ + CFLAGS="$(TARGET_CFLAGS) $(CFLAGS_WRT) -I./include_e -I./include -include ../glue.h -DNO_ALTQ -D__BSD_VISIBLE" \ + VER=$(VERS) all +endef + +define Package/ipfw3-userland + SECTION:=utils + CATEGORY:=Utilities + TITLE := /sbin/ipfw + DESCRIPTION := This is the control program for ipfw and dummynet +endef + +define Package/ipfw3-userland/install + $(INSTALL_DIR) $(1) /sbin +endef + +# XXX not entirely clear why the install entry for userland works, +# given that /sbin/ipfw is in KernelPackage/ipfw3 + +$(eval $(call Package,ipfw3-userland)) +$(eval $(call KernelPackage,ipfw3)) diff --git a/NOTES b/NOTES new file mode 100644 index 0000000..52bb5bf --- /dev/null +++ b/NOTES @@ -0,0 +1,220 @@ +# +# $Id: NOTES 6552 2010-06-15 11:24:59Z svn_panicucci $ +# + +--------------------------------------------------------------------- +--- DEVELOPER NOTES ------------------------------------------------ + +Both the client and the kernel code use almost unmodified sources +from FreeBSD (just a very small number of sections #ifdef'ed out +for features not relevant or not implemented). + +In both cases we provide two set of headers: + - one set is made of empty files, automatically generated, to replace + FreeBSD headers not available or conflicting on the ported platforms. 
+ - one set is made of custom files, sometimes copied verbatim + from FreeBSD, sometimes containing only the minimal set of + macros/ struct/ prototypes required by the port. + +Additionally, we have a small set of .c files providing functions not +available in the port platforms, and hooks for the sockopt/packet +data. + + +TODO 20100205: ++ use an appropriate identifier instead of LINUX24 ++ find the discharging module hook, in order to force a queue flush ++ better matching on interface names (case insensitive etc ?) ++ match by interface address ++ verify path ++ send keepalives (20100301 marta: implemented) ++ pullup of data in external buffers ++ O_TAG ++ O_DIVERT ++ O_TEE ++ O_SETFIB ++ kmem_cache_alloc + +TODO (OpenWRT) 20090622 ++ add a module compilation for 2.6 + +TODO (FreeBSD, general) ++ New features related to the forthcoming IPv6 are missing, as the IPv6 +support for lookup tables that currently support IPv4 addresses only. +One of the goal of this project is to add the tables feature to the +IPv6 protocol. + ++ The current code implements rules listing requests as a single +request returning both static and dynamic rules as a whole block. This +operation requires a lock to be held for the time needed to get the +full list of rules, regardless of the requested rules. I propose to +break up the rule request in two parts, for static and dynamic rules, in +order to avoid to lock the whole struct for a subset of rules required. + ++ At last, due to improvement and contribution to the code, the tool +significantly grown over the time with new functionalities and features, +leaving the general view aside. An example of this will be the use of +dispatching table instead some very long switch case, making the resulting +code more readable and hopefully a faster execution. + ++ XXX can't find the ipfw_* indirection... + +DETAILED PORTING INFO + +--- ipfw (userland) on linux --- + +The port is relatively trivial. 
Communication with the kernel occurs +through a raw socket using [gs]etsockopt(), and all is needed is the +availability of ip_fw.h and ip_dummynet.h headers to describe the +relevant data structures. + +--- kernel ipfw on linux --- + +Sources are mostly unmodified, except for commenting out +unsupported features (tables, in-kernel nat...). +The port requires a rather large number of empty headers. +Other porting issues are in ipfw2_mod.c + +--- build as an Openwrt package + +------ WINDOWS PORT ------ + +We started from the wipfw port available at [WIPFW] , but +most of the port is done from scratch using the most recent +version of ipfw+dummynet from HEAD/RELENG_7 as of March 2009 + +# WIPFW: wipfw.sourceforge.net +#binary: +http://downloads.sourceforge.net/wipfw/wipfw-0.3.2b.zip?use_mirror=mesh +http://downloads.sourceforge.net/wipfw/wipfw-0.2.8-source.zip + +--- DEVELOPMENT TOOLS: + +At least initially, to build the code you need a pc with +windows installed and the [WINDDK] from the microsoft site. +Other tools like the new WDK should work as well. + +The 'standard' way used by WDK/WINDDK is to run a 'build' +script which in turn calls nmake and then the microsoft +compiler [CL] and linker [LINK]. See the documentation for +command line switches for these tools, they are similar but +not the same as the equivalent gcc switches. In particular, +a / is often used to replace - though both forms are accepted. + +The steps to do in order to launch the build environment follows: + + + download winddk from microsoft.com + + install + + run the Free Build Enviroment from: + + Start -> All Program -> WINDDK -> + [NT|XP|2000] -> Free Build Environment + + + change dir to .src and type `build' in command line + +For our purposes, however, it is much more convenient to use +cygwin [CYGWIN] and invoke CL and LINK using gmake + +A debugging tools is: + http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx +it simply display the kernel-mode debug output. 
+Use the DbgPrint() function, that is something similar to printk(). +Can be launched with dbgview.exe. + +After a successful compilation and link, you can launch the program +in user space simply by executing the binary file, while for the kernel +space you need to do the following steps: + +cp ipfw.sys /cygdrive/c/WINDOWS/system32/drivers/ +ipfw install_drv System32\DRIVERS\ip_fw.sys +net start ip_fw + + +======= +--- ARCHITECTURE --- + +The main part of the userland program mostly works as the +unix equivalent, the only issue is to provide empty +header files to replace those not available in Windows, +and include the winsock2 headers to access some network +related functions and headers. + +Communication with the kernel module does not use a raw IP socket +as in the unix version. Instead, we inherit the same method +used in ipfw -- a replacement for socket() creates a handle +to access the control structure, and setsockopt/getsockopt +replacements are also used to communicate with the kernel +side. This is implemented in win32.c + +In order to load the module and activate it, we also use +the same technique suggested in wipfw -- the main() is +extended (with a wrapper) so that it can handle additional +commands to install/control/deinstall the service and +call the appropriate actions. See svcmain.c for details. + +--- PORTING ISSUES: + +Most of the unix hierarchy of headers is not available so we +have to replicate them. + +gcc attributes are also not present. + +C99 types are not present, remapped in +Also, we don't have C99 initializers which sometimes gives trouble.
+ +--- USEFUL LINKS: + +[WIPFW] + http://wipfw.sourceforge.net/ + +[WINDDK] + http://www.microsoft.com/whdc/devtools/ddk/default.mspx + +[CL] + http://msdn.microsoft.com/en-us/library/610ecb4h.aspx + command line syntax + +[CYGWIN] + http://www.cygwin.com/setup.exe +Windows Driver Kit +http://www.microsoft.com/whdc/DevTools/WDK/WDKpkg.mspx + +Debug Symbols for WinXP SP3 +http://www.microsoft.com/whdc/devtools/debugging/symbolpkg.mspx#d + +DbgView +http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx + +Cygwin +http://www.cygwin.com/ +(default package installation + devel category) + +Winrar (the WDK is distributed as an .iso file) +http://www.rarlab.com/download.htm + +puttycyg (terminal for cygwin) +http://code.google.com/p/puttycyg/ + +Tortoise SVN +http://tortoisesvn.net/downloads + +EditPlus +http://www.editplus.com/ + +--------------------------------------------------------------------- +--- OPEN ISSUES/TODO ------------------------------------------------ + +- Fix the build on OpenWRT for linux 2.6 + [Forum: https://forum.openwrt.org/viewtopic.php?id=24990] +- Compilation on 2.6 OpenWRT (target is MIPS Atheros 71xx) gives compilation + errors; [Send updates to: https://forum.openwrt.org/viewtopic.php?id=24990] +- Windows stack corruption [a tricky bug in dummynet] +- Windows ipv6 port [RE: Windows port of ipv6 in ipfw+dummynet] + +NOTE: +- To allow compilation on OpenWRT with kernel 2.6 only the Makefile.openwrt + is modified to guess the kernel version (2.4/2.6) +- ipfw3 Makefile is not modified. +- Also compiles on big-endian, but not tested yet... +- Small changes in source code. + diff --git a/README b/README new file mode 100644 index 0000000..9791ea1 --- /dev/null +++ b/README @@ -0,0 +1,275 @@ +# +# $Id: README 11691 2012-08-12 21:32:37Z luigi $ +# + +This directory contains a port of ipfw and dummynet to Linux and Windows. +This version of ipfw and dummynet is called "ipfw3" as it is the +third major rewrite of the code.
The source code here comes straight +from FreeBSD (roughly the version in HEAD as of February 2010), +plus some glue code and headers written from scratch. Unless +specified otherwise, all the code here is under a BSD license. + +Specific build instructions are below, and in general produce + + a kernel module, ipfw_mod.ko (ipfw.sys on windows) + a userland program, /sbin/ipfw (ipfw.exe on windows) + +which you need to install on your system. + +CREDITS: + Luigi Rizzo (main design and development) + Marta Carbone (Linux and Planetlab ports) + Riccardo Panicucci (modular scheduler support) + Francesco Magno (Windows port) + Fabio Checconi (the QFQ scheduler) + Funding from Universita` di Pisa (NETOS project), + European Commission (ONELAB2 project) + ACM SIGCOMM (Sigcomm Community Projects Award, April 2012) + +------ INSTALL/REMOVE INSTRUCTIONS ------ + +Linux + INSTALL: + # Do the following as root + insmod ./kipfw-mod/ipfw_mod.ko + cp ipfw/ipfw /usr/local/sbin + REMOVE: + rmmod ipfw_mod.ko + +OpenWRT + INSTALL: # use the correct name for your system + opkg install kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk #install + ls -l /lib/modules/2.4.35.4/ipfw* # check + insmod /lib/modules/2.4.35.4/ipfw_mod.o # load the module + /lib/modules/2.4.35.4/ipfw show # launch the userspace tool + REMOVE: + rmmod ipfw_mod.o # remove the module + +Windows: + A pre-built version is in binary/ and binary64/ directories. + + INSTALL THE NDIS DRIVER + - open the configuration panel for the network card in use + (right click on the icon on the SYSTRAY, or go to + Control Panel -> Network and select one card) + - click on Properties->Install->Service->Add + - click on 'Driver Disk' and select 'netipfw.inf' in this folder + - select 'ipfw+dummynet' which is the only service you should see + - click accept on the warnings for the installation of an unsigned + driver (roughly twice per existing network card) + + Now you are ready to use the emulator.
To configure it, open a 'cmd' + window (REMEMBER to run it as Administrator) + and you can use the ipfw command from the command line. + Otherwise click on the 'TESTME.bat' which is a batch program that + runs various tests. + REMEMBER: you need to run ipfw as administrator. + + REMOVE: + - select a network card as above. + - click on Properties + - select 'ipfw+dummynet' + - click on 'Remove' + + +------ BUILD INSTRUCTIONS ------ + ++ Windows 32 bit and 64 bit (XP, Windows7) + + To build your own version of the package you need: + - cygwin, http://www.cygwin.com/ with base packages, make, + c compiler, possibly an editor and subversion. + This is used to build the userspace control program, ipfw.exe + + - Microsoft Windows Driver Kit Version 7.1.0, available from + http://www.microsoft.com/en-us/download/details.aspx?id=11800 + (ISO image, GRMWDK_EN_7600_1.ISO) + This is used to build the kernel module. + + - optionally, DbgView if you want to see diagnostics coming from + the kernel module. You can find it at + + http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx + + Check the Makefile in the root directory to make sure that the WDK is + installed in the place indicated by DRIVE and DDKDIR variables + (otherwise pass the correct values to the Makefile). + Open a shell from cygwin, move to this directory, and run "make" for + the 32-bit version, "make win64" for the 64 bit version. + This will produce in the binary/ or binary64/ directory the + following files: + ipfw.exe (you also need cygwin1.dll) + ipfw.sys (an NDIS intermediate filter driver) + netipfw.inf and netipfw_m.inf (installer files) + + Cross compilation of the userland side under FreeBSD is possible with + gmake TCC=`pwd`/tcc-0.9.25-bsd/win32 CC=`pwd`/tcc-0.9.25-bsd/win32/bin/wintcc + (wintcc is a custom version of tcc which produces Windows code) + + NOTE: the 64-bit version is compiled as a 32-bit executable for userspace, + with appropriate changes to produce 64-bit pointers. 
+ The kernel module is built using the MSC 'build' utility instead + of 'make'. THE MODULE IS NOT SIGNED. + IMPORTANT: Windows 64-bit will not load unsigned kernel modules unless + you boot with 'F8' and disable checks for signed modules. + +***** Linux 2.6 and above ****** + + make [KSRC=/path/to/linux USRDIR=/path/to/usr] + + where the two variables are optional and point to the linux kernel + sources and the /usr directory. Defaults are USRDIR=/usr and + KSRC=/lib/modules/`uname -r`/build --- XXX check ? + + NOTE: make sure CONFIG_NETFILTER is enabled in the kernel + configuration file. You need the ncurses devel library, + that can be installed according to your distro with: + apt-get install ncurses-dev # for debian based distro + yum -y install ncurses-devel # for fedora based distro + You can enable CONFIG_NETFILTER by doing: + + "(cd ${KSRC}; make menuconfig)" + + and enabling the option listed below: + + Networking ---> + Networking options ---> + [*] Network packet filtering framework (Netfilter) + + If you have not yet compiled your kernel source, you need to + prepare the build environment: + + (cd ${KSRC}; make oldconfig; make prepare; make scripts) + +***** Linux 2.4.x ***** + + Almost as above, with an additional VER=2.4 + + make VER=2.4 KSRC=... + + For 2.4, if KSRC is not specified then we use + KSRC ?= /usr/src/`uname -r`/build + + You need to follow the same instructions as for the 2.6 kernel, enabling + netfilter in the kernel options: + + Networking options ---> + [*] Network packet filtering (replaces ipchains) + +***** Openwrt package ***** + + (Tested with kamikaze_8.09.1 and Linux 2.4) + + + Download and extract the OpenWrt package, e.g. + + wget http://downloads.openwrt.org/kamikaze/8.09.1/kamikaze_8.09.1_source.tar.bz2 + tar xvjf kamikaze_8.09.1_source.tar.bz2 + + + move to the directory with the OpenWrt sources (the one that + contains Config.in, rules.mk ...) + + cd kamikaze_8.09.1 + + + Optional: Add support for 1ms resolution.
+ + By default OpenWRT kernel is compiled with HZ=100; this implies + that all timeouts are rounded to 10ms, too coarse for dummynet. + The file 020-mips-hz1000.patch contains a kernel patch to build + a kernel with HZ=1000 (i.e. 1ms resolution) as in Linux/FreeBSD. + To apply this patch, go in the kernel source directory and + patch the kernel + + cd build_dir/linux-brcm-2.4/linux-2.4.35.4 + cat $IPFW3_SOURCES/020-mips-hz1000.patch | patch -p0 + + where IPFW3_SOURCES contains the ipfw3 source code. + Now, the next kernel recompilation will use the right HZ value + + + Optional: to be sure that the tools are working, make a first + build as follows: + + - run "make menuconfig" and set the correct target device, + drivers, and so on; + - run "make" to do the build + + + Add ipfw3 to the openwrt package, as follows: + + - copy the code from this directory to the place used for the build: + + cp -Rp /path_to_ipfw3 ../ipfw3; + + If you want, you can fetch a newer version from the web + (cd ..; rm -rf ipfw3; \ + wget http://info.iet.unipi.it/~luigi/dummynet/ipfw3-latest.tgz;\ + tar xvzf ipfw3-latest.tgz) + + - run the following commands: + (mkdir package/ipfw3; \ + cp ../ipfw3/Makefile.openwrt package/ipfw3/Makefile) + + to create the package/ipfw3 directory in the OpenWrt source + directory, and copy Makefile.openwrt to package/ipfw3/Makefile ; + + - if necessary, edit package/ipfw3/Makefile and set IPFW_DIR to point to + the directory ipfw3, which contains the sources; + + - run "make menuconfig" and select kmod-ipfw3 as a module in + Kernel Modules -> Other modules -> kmod-ipfw3 + + - run "make" to build the package, "make V=99" for verbose build. 
+ + - to modify the code, assuming you are in directory "kamikaze_8.09.1" + + (cd ../ipfw3 && vi ...the files you are interested in ) + rm -rf build_dir/linux-brcm-2.4/kmod-ipfw3 + make package/ipfw3/compile V=99 + + The resulting package is located in bin/packages/mipsel/kmod-ipfw3*, + upload the file and install on the target system, as follows: + + opkg install kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk #install + ls -l /lib/modules/2.4.35.4/ipfw* # check + insmod /lib/modules/2.4.35.4/ipfw_mod.o # load the module + /lib/modules/2.4.35.4/ipfw show # launch the userspace tool + rmmod ipfw_mod.o # remove the module + +***** PLANETLAB BUILD (within a slice) ***** +These instructions can be used by PlanetLab developers to compile +the dummynet module on a node. To install the module on the node +users need root access in root context. PlanetLab users that want +to use the dummynet package should ask PlanetLab support for +nodes with dummynet emulation capabilities. + + Follow the instructions below. You can just cut&paste + + # install the various tools if not available + sudo yum -y install subversion rpm-build rpm-devel m4 redhat-rpm-config make gcc + # new build installation requires the gnupg package + sudo yum -y install gnupg + # the linux kernel and the ipfw source can be fetched by git + sudo yum -y install git + + # create and move to a work directory + mkdir -p test + # extract a planetlab distribution to directory XYZ + (cd test; git clone git://git.onelab.eu/build ./XYZ) + # download the specfiles and do some patching.
+ # Results are in SPEC/ (takes 5 minutes) + (cd test/XYZ; make stage1=true PLDISTRO=onelab) + # Building the slice code is fast, the root code takes longer + # as it needs to rebuild the whole kernel + (cd test/XYZ; sudo make ipfwslice PLDISTRO=onelab) + (cd test/XYZ; sudo make ipfwroot PLDISTRO=onelab) + + The kernel dependency phase is a bit time consuming, but does not + need to be redone if we are changing the ipfw sources only. + To clean up the code do + (cd test/XYZ; sudo make ipfwroot-clean ipfwslice-clean) + then after you have updated the repository again + (cd test/XYZ; sudo make ipfwslice ipfwroot) + +--- References +[1] https://svn.planet-lab.org/wiki/VserverCentos +[2] http://wiki.linux-vserver.org/Installation_on_CentOS +[3] http://mirror.centos.org/centos/5/isos/ +[4] More information is in /build/README* files diff --git a/binary/README.txt b/binary/README.txt new file mode 100644 index 0000000..0212277 --- /dev/null +++ b/binary/README.txt @@ -0,0 +1,27 @@ +This directory contains the binaries to install and use IPFW and +DUMMYNET on a Windows Machine. The kernel part is an NDIS module, +whereas the user interface is a command line program. + +1. INSTALL THE NDIS DRIVER + +- open the configuration panel for the network card in use + (either right click on the icon on the SYSTRAY, or go to + Control Panel -> Network and select one card) + +- click on Properties->Install->Service->Add +- click on 'Driver Disk' and select 'netipfw.inf' in this folder +- select 'ipfw+dummynet' which is the only service you should see +- click accept on the warnings for the installation of an unknown + driver (roughly twice per existing network card) + +Now you are ready to use the emulator. To configure it, open a 'cmd' +window and you can use the ipfw command from the command line. +Otherwise click on the 'TESTME.bat' which is a batch program that +runs various tests. + +2. UNINSTALL THE DRIVER + +- select a network card as above.
+- click on Properties +- select 'ipfw+dummynet' +- click on 'Remove' diff --git a/binary/cygwin1.dll b/binary/cygwin1.dll new file mode 100644 index 0000000..317c51e Binary files /dev/null and b/binary/cygwin1.dll differ diff --git a/binary/ipfw.exe b/binary/ipfw.exe new file mode 100644 index 0000000..09bdc37 Binary files /dev/null and b/binary/ipfw.exe differ diff --git a/binary/ipfw.sys b/binary/ipfw.sys new file mode 100644 index 0000000..59e855c Binary files /dev/null and b/binary/ipfw.sys differ diff --git a/binary/netipfw.inf b/binary/netipfw.inf new file mode 100644 index 0000000..7159403 --- /dev/null +++ b/binary/netipfw.inf @@ -0,0 +1,81 @@ +; version section +[Version] +Signature = "$Windows NT$" +Class = NetService +ClassGUID = {4D36E974-E325-11CE-BFC1-08002BE10318} +Provider = %Unipi% +DriverVer = 26/02/2010,3.0.0.1 + +; manufacturer section +[Manufacturer] +%Unipi% = UNIPI,NTx86,NTamd64 + +; control flags section +; optional, unused in netipfw.inf inf, used in netipfw_m.inf +[ControlFlags] + +; models section +[UNIPI] ; Win2k +%Desc% = Ipfw.ndi, unipi_ipfw +[UNIPI.NTx86] ;For WinXP and later +%Desc% = Ipfw.ndi, unipi_ipfw +[UNIPI.NTamd64] ;For x64 +%Desc% = Ipfw.ndi, unipi_ipfw + +; ddinstall section +[Ipfw.ndi] +AddReg = Ipfw.ndi.AddReg, Ipfw.AddReg +Characteristics = 0x4410 ; NCF_FILTER | NCF_NDIS_PROTOCOL !--Filter Specific--!! 
+CopyFiles = Ipfw.Files.Sys +CopyInf = netipfw_m.inf + +; remove section +[Ipfw.ndi.Remove] +DelFiles = Ipfw.Files.Sys + +;ddinstall.services section +[Ipfw.ndi.Services] +AddService = Ipfw,,Ipfw.AddService + +[Ipfw.AddService] +DisplayName = %ServiceDesc% +ServiceType = 1 ;SERVICE_KERNEL_DRIVER +StartType = 3 ;SERVICE_DEMAND_START +ErrorControl = 1 ;SERVICE_ERROR_NORMAL +ServiceBinary = %12%\ipfw.sys +AddReg = Ipfw.AddService.AddReg + +[Ipfw.AddService.AddReg] + +;file copy related sections +[SourceDisksNames] +1=%DiskDescription%,"",, + +[SourceDisksFiles] +ipfw.sys=1 + +[DestinationDirs] +DefaultDestDir = 12 +Ipfw.Files.Sys = 12 ; %windir%\System32\drivers + +; ddinstall->copyfiles points here +[Ipfw.Files.Sys] +ipfw.sys,,,2 + +; ddinstall->addreg points here +[Ipfw.ndi.AddReg] +HKR, Ndi, HelpText, , %HELP% ; this is displayed at the bottom of the General page of the Connection Properties dialog box +HKR, Ndi, FilterClass, , failover +HKR, Ndi, FilterDeviceInfId, , unipi_ipfwmp +HKR, Ndi, Service, , Ipfw +HKR, Ndi\Interfaces, UpperRange, , noupper +HKR, Ndi\Interfaces, LowerRange, , nolower +HKR, Ndi\Interfaces, FilterMediaTypes, , "ethernet, tokenring, fddi, wan" + +;strings section +[Strings] +Unipi = "Unipi" +DiskDescription = "Ipfw Driver Disk" +Desc = "ipfw+dummynet" +HELP = "This is ipfw and dummynet network emulator, developed by unipi.it" +ServiceDesc = "ipfw service" diff --git a/binary/netipfw_m.inf b/binary/netipfw_m.inf new file mode 100644 index 0000000..350e4d1 --- /dev/null +++ b/binary/netipfw_m.inf @@ -0,0 +1,56 @@ +; version section +[Version] +Signature = "$Windows NT$" +Class = Net +ClassGUID = {4D36E972-E325-11CE-BFC1-08002BE10318} +Provider = %Unipi% +DriverVer = 26/02/2010,3.0.0.1 + +; control flags section +; optional, unused in netipfw.inf inf, used in netipfw_m.inf +[ControlFlags] +ExcludeFromSelect = unipi_ipfwmp + +; destinationdirs section, optional +[DestinationDirs] +DefaultDestDir=12 +; No files to copy + +; manufacturer section 
+[Manufacturer] +%Unipi% = UNIPI,NTx86,NTamd64 + +; models section +[UNIPI] ; Win2k +%Desc% = IpfwMP.ndi, unipi_ipfwmp +[UNIPI.NTx86] ;For WinXP and later +%Desc% = IpfwMP.ndi, unipi_ipfwmp +[UNIPI.NTamd64] ;For x64 +%Desc% = IpfwMP.ndi, unipi_ipfwmp + +; ddinstall section +[IpfwMP.ndi] +AddReg = IpfwMP.ndi.AddReg +Characteristics = 0x29 ;NCF_NOT_USER_REMOVABLE | NCF_VIRTUAL | NCF_HIDDEN + +; ddinstall->addreg points here +[IpfwMP.ndi.AddReg] +HKR, Ndi, Service, 0, IpfwMP + +;ddinstall.services section +[IpfwMP.ndi.Services] +AddService = IpfwMP,0x2, IpfwMP.AddService + +[IpfwMP.AddService] +ServiceType = 1 ;SERVICE_KERNEL_DRIVER +StartType = 3 ;SERVICE_DEMAND_START +ErrorControl = 1 ;SERVICE_ERROR_NORMAL +ServiceBinary = %12%\ipfw.sys +AddReg = IpfwMP.AddService.AddReg + +[IpfwMP.AddService.AddReg] +; None + +[Strings] +Unipi = "Unipi" +Desc = "Ipfw Miniport" \ No newline at end of file diff --git a/binary/testme.bat b/binary/testme.bat new file mode 100644 index 0000000..5b3de00 --- /dev/null +++ b/binary/testme.bat @@ -0,0 +1,79 @@ +@echo on +@set CYGWIN=nodosfilewarning + +@ipfw -q flush +@ipfw -q pipe flush +@echo ###################################################################### +@echo ## Setting delay to 100ms for both incoming and outgoing ip packets ## +@echo ## and sending 4 echo request to Google ## +@echo ###################################################################### +ipfw pipe 3 config delay 100ms +ipfw add pipe 3 ip from any to any +ipfw pipe show +ping -n 4 www.google.it + +@echo ############################################## +@echo ## Raising delay to 300ms and pinging again ## +@echo ############################################## +ipfw pipe 3 config delay 300ms +ipfw pipe show +ping -n 4 www.google.com + +@echo ################################## +@echo ## Shaping bandwidth to 500kbps ## +@echo ################################## +ipfw pipe 3 config bw 500Kbit/s +ipfw pipe show +wget http://info.iet.unipi.it/~luigi/1m +@del 1m + +@echo 
################################### +@echo ## Lowering bandwidth to 250kbps ## +@echo ################################### +ipfw pipe 3 config bw 250Kbit/s +ipfw pipe show +wget http://info.iet.unipi.it/~luigi/1m +@del 1m + +@echo ################################################################### +@echo ## Simulating 50 percent packet loss and sending 15 echo request ## +@echo ################################################################### +@ipfw -q flush +@ipfw -q pipe flush +ipfw add prob 0.5 deny proto icmp in +ping -n 15 -w 300 www.google.it +@ipfw -q flush + +@echo ############################## +@echo ## Showing SYSCTL variables ## +@echo ############################## +ipfw sysctl -a + +@echo ############################################# +@echo ## Inserting rules to test command parsing ## +@echo ############################################# +@echo -- dropping all packets of a specific protocol -- +ipfw add deny proto icmp +@echo -- dropping packets of all protocols except a specific one -- +ipfw add deny not proto tcp +@echo -- dropping all packets from IP x to IP y -- +ipfw add deny src-ip 1.2.3.4 dst-ip 5.6.7.8 +@echo -- dropping all ssh outgoing connections -- +ipfw add deny out dst-port 22 +@echo -- allowing already opened browser connections -- +@echo -- but preventing new ones from being opened -- +ipfw add deny out proto tcp dst-port 80 tcpflags syn +@echo -- another way to do the same thing -- +ipfw add allow out proto tcp dst-port 80 established +ipfw add deny out proto tcp dst-port 80 setup +@echo -- checking what rules have been inserted -- +ipfw -c show +@ipfw -q flush + +@echo ################# +@echo ## Cleaning up ## +@echo ################# +ipfw -q flush +ipfw -q pipe flush + +pause diff --git a/binary/wget.exe b/binary/wget.exe new file mode 100644 index 0000000..f2a11c1 Binary files /dev/null and b/binary/wget.exe differ diff --git a/binary64/ipfw.exe b/binary64/ipfw.exe new file mode 100755 index 0000000..35c86d9 Binary files 
/dev/null and b/binary64/ipfw.exe differ diff --git a/binary64/ipfw.sys b/binary64/ipfw.sys new file mode 100755 index 0000000..8e2275d Binary files /dev/null and b/binary64/ipfw.sys differ diff --git a/configuration/README b/configuration/README new file mode 100644 index 0000000..778f7aa --- /dev/null +++ b/configuration/README @@ -0,0 +1,14 @@ +This directory contains some ipfw configurations and a script +to safely change the firewall rules. + +The firewall configuration comes from the FreeBSD initial script. +The change_rules_linux.sh script allows changing the ipfw rules and, +in case of a misconfiguration that prevents reaching the remote +host, restoring the old ruleset. + +To configure the firewall behavior, edit the ipfw.conf file and +execute the ./change_rules_linux.sh script. + +The ipfw program executable should be located in /sbin (XXX) + +XXX seems we use something which is not compatible with dash diff --git a/configuration/change_rules.sh b/configuration/change_rules.sh new file mode 100755 index 0000000..8f23369 --- /dev/null +++ b/configuration/change_rules.sh @@ -0,0 +1,159 @@ +#!/bin/sh +# +# Copyright (c) 2000 Alexandre Peixoto +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# $FreeBSD: src/share/examples/ipfw/change_rules.sh,v 1.6 2003/09/07 07:52:56 jmg Exp $ + +# Change ipfw(8) rules with safety guarantees for remote operation +# +# Invoke this script to edit ${firewall_script}. It will call ${EDITOR}, +# or vi(1) if the environment variable is not set, for you to edit +# ${firewall_script}, ask for confirmation, and then run +# ${firewall_script}. You can then examine the output of ipfw list and +# confirm whether you want the new version or not. +# +# If no answer is received in 30 seconds, the previous +# ${firewall_script} is run, restoring the old rules (this assumes ipfw +# flush is present in it). +# +# If the new rules are confirmed, they'll replace ${firewall_script} and +# the previous ones will be copied to ${firewall_script}.{date}. Mail +# will also be sent to root with a unified diff of the rule change. +# +# Unapproved rules are kept in ${firewall_script}.new, and you are +# offered the option of changing them instead of the present rules when +# you call this script. +# +# This script could be improved by using version control +# software. + +# XXX on linux /etc/rc.conf defines: +# firewall_type and firewall_script + +if [ -r /etc/defaults/rc.conf ]; then + . /etc/defaults/rc.conf + source_rc_confs +elif [ -r /etc/rc.conf ]; then + . 
/etc/rc.conf +fi + +EDITOR=${EDITOR:-/usr/bin/vi} +PAGER=${PAGER:-/usr/bin/more} + +# on linux the default mktemp invocation behavior +# is different, we should change the temporary file creation +tempfoo=`basename $0` +#TMPFILE=`mktemp -t ${tempfoo}` || exit 1 +TMPFILE=`mktemp -t ${tempfoo}.XXXXX` || exit 1 + +get_yes_no() { + while true + do + echo -n "$1 (Y/N) ? " + read -t 30 a + if [ $? != 0 ]; then + a="No"; + return; + fi + case $a in + [Yy]) a="Yes"; + return;; + [Nn]) a="No"; + return;; + *);; + esac + done +} + +restore_rules() { + nohup sh ${firewall_script} /dev/null 2>&1 + rm ${TMPFILE} + exit 1 +} + +case "${firewall_type}" in +[Cc][Ll][Ii][Ee][Nn][Tt]|\ +[Cc][Ll][Oo][Ss][Ee][Dd]|\ +[Oo][Pp][Ee][Nn]|\ +[Ss][Ii][Mm][Pp][Ll][Ee]|\ +[Uu][Nn][Kk][Nn][Oo][Ww][Nn]) + edit_file="${firewall_script}" + rules_edit=no + ;; +*) + if [ -r "${firewall_type}" ]; then + edit_file="${firewall_type}" + rules_edit=yes + fi + ;; +esac + +if [ -f ${edit_file}.new ]; then + get_yes_no "A new rules file already exists, do you want to use it" + [ $a = 'No' ] && cp ${edit_file} ${edit_file}.new +else + cp ${edit_file} ${edit_file}.new +fi + +trap restore_rules SIGHUP + +${EDITOR} ${edit_file}.new + +get_yes_no "Do you want to install the new rules" + +[ $a = 'No' ] && exit 1 + +cat < ${TMPFILE} 2>&1 +else + nohup sh ${firewall_script}.new \ + < /dev/null > ${TMPFILE} 2>&1 +fi +sleep 2; +get_yes_no "Would you like to see the resulting new rules" +[ $a = 'Yes' ] && ${PAGER} ${TMPFILE} +get_yes_no "Type y to keep the new rules" +[ $a != 'Yes' ] && restore_rules + +DATE=`date "+%Y%m%d%H%M"` +cp ${edit_file} ${edit_file}.$DATE +mv ${edit_file}.new ${edit_file} +cat </dev/null + fi + ${fwcmd} add deny $log ip from any to any + ;; + +[Cc][Ll][Oo][Ss][Ee][Dd]) + ${fwcmd} add 65000 deny ip from any to any + ;; +[Uu][Nn][Kk][Nn][Oo][Ww][Nn]) + ;; +*) + if [ -r "${firewall_type}" ]; then + ${fwcmd} ${firewall_flags} ${firewall_type} + fi + ;; +esac diff --git a/glue.h b/glue.h new file 
mode 100644 index 0000000..75216cc --- /dev/null +++ b/glue.h @@ -0,0 +1,589 @@ +/* + * Copyright (c) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * $Id: glue.h 12501 2014-01-10 01:09:14Z luigi $ + * + * glue code to adapt the FreeBSD version to linux and windows, + * userland and kernel. + * This is included before any other headers, so we do not have + * a chance to override any #define that should appear in other + * headers. + * First handle headers for userland and kernel. Then common code + * (including headers that require a specific order of inclusion), + * then the user- and kernel- specific parts. 
+ */ + +#if defined __FreeBSD__ +#define _GLUE_H +#endif /* __FreeBSD__ */ +#ifndef _GLUE_H +#define _GLUE_H + + +/* + * common definitions to allow portability + */ +#ifndef __FBSDID +#define __FBSDID(x) +#endif /* FBSDID */ + +#ifndef KERNEL_MODULE /* Userland headers */ + +#if defined(__CYGWIN32__) && !defined(_WIN32) +#define _WIN32 +#endif + +#if defined(TCC) && defined(_WIN32) +#include +#endif /* TCC */ + +#include /* linux needs it in addition to sys/types.h */ +#include /* for size_t */ +#include +#include +#include +#ifdef __linux__ +#include /* linux only 20111031 */ +#endif + +#else /* KERNEL_MODULE, kernel headers */ + +#define INET # want inet support +#ifdef __linux__ + +#include + +#define ifnet net_device /* remap */ +#define _KERNEL # make kernel structure visible +#define KLD_MODULE # add the module glue + +#include /* linux kernel */ +#include /* linux kernel */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) // or 2.4.x +#include /* linux/msg.h require this */ +#include /* just MAX_ADDR_LEN 8 on 2.4 32 on 2.6, also brings in byteorder */ +#endif + +/* on 2.6.22, msg.h requires spinlock_types.h */ +/* XXX spinlock_type.h was introduced in 2.6.14 */ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,13) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) +#include +#endif +/* XXX m_type define conflict with include/sys/mbuf.h, + * so early include msg.h (to be solved) +*/ +#include + +#include +#include /* struct in_addr */ +#include /* struct in6_addr */ +#include +/* + * LIST_HEAD in queue.h conflict with linux/list.h + * some previous linux include need list.h definition + */ +#undef LIST_HEAD + +#define IF_NAMESIZE (16) +typedef uint32_t in_addr_t; + +#define printf(fmt, arg...) 
printk(KERN_ERR fmt, ##arg) +#endif /* __linux__ */ + +#endif /* KERNEL_MODULE end of kernel headers */ + + +/* + * Part 2: common userland and kernel definitions + */ + +#ifndef ETHER_ADDR_LEN +#define ETHER_ADDR_LEN (6+0) /* length of an Ethernet address */ +#endif + +#define ICMP6_DST_UNREACH_NOROUTE 0 /* no route to destination */ +#define ICMP6_DST_UNREACH_ADMIN 1 /* administratively prohibited */ +#define ICMP6_DST_UNREACH_ADDR 3 /* address unreachable */ +#define ICMP6_DST_UNREACH_NOPORT 4 /* port unreachable */ + +/* + * linux: sysctl are mapped into /sys/module/ipfw_mod parameters + * windows: they are emulated via get/setsockopt + */ +#define CTLFLAG_RD 1 +#define CTLFLAG_RDTUN 1 +#define CTLFLAG_RW 2 +#define CTLFLAG_SECURE3 0 // unsupported +#define CTLFLAG_VNET 0 /* unsupported */ + +/* if needed, queue.h must be included here after list.h */ + +/* + * struct thread is used in linux and windows kernel. + * In windows, we need to emulate the sockopt interface + * so also the userland needs to have the struct sockopt defined. + * In order to achieve 64 bit compatibility, padding has been inserted. + */ +struct thread { + void *sopt_td; + void *td_ucred; +}; + +enum sopt_dir { SOPT_GET, SOPT_SET }; + +struct sockopt { + enum sopt_dir sopt_dir; /* is this a get or a set? */ + int sopt_level; /* second arg of [gs]etsockopt */ + int sopt_name; /* third arg of [gs]etsockopt */ +#ifdef _X64EMU + void* pad1; + void* pad2; +#endif + void *sopt_val; /* fourth arg of [gs]etsockopt */ + size_t sopt_valsize; /* (almost) fifth arg of [gs]etsockopt */ +#ifdef _X64EMU + void* pad3; + void* pad4; +#endif + struct thread *sopt_td; /* calling thread or null if kernel */ +}; + + +#define INET_ADDRSTRLEN (16) /* missing in netinet/in.h */ + +/* + * List of values used for set/getsockopt options. + * The base value on FreeBSD is defined as a macro, + * if not available we will use our own enum. + * The TABLE_BASE value is used in the kernel. 
+ */ +#ifndef IP_FW_TABLE_ADD +#define _IPFW_SOCKOPT_BASE 100 /* 40 on freebsd */ +enum ipfw_msg_type { + IP_FW_TABLE_ADD = _IPFW_SOCKOPT_BASE, + IP_FW_TABLE_DEL, + IP_FW_TABLE_FLUSH, + IP_FW_TABLE_GETSIZE, + IP_FW_TABLE_LIST, + IP_FW_DYN_GET, /* new addition */ + + /* IP_FW3 and IP_DUMMYNET3 are the new API */ + IP_FW3 = _IPFW_SOCKOPT_BASE + 8, + IP_DUMMYNET3, + + IP_FW_ADD = _IPFW_SOCKOPT_BASE + 10, + IP_FW_DEL, + IP_FW_FLUSH, + IP_FW_ZERO, + IP_FW_GET, + IP_FW_RESETLOG, + + IP_FW_NAT_CFG, + IP_FW_NAT_DEL, + IP_FW_NAT_GET_CONFIG, + IP_FW_NAT_GET_LOG, + + IP_DUMMYNET_CONFIGURE, + IP_DUMMYNET_DEL , + IP_DUMMYNET_FLUSH, + /* 63 is missing */ + IP_DUMMYNET_GET = _IPFW_SOCKOPT_BASE + 24, + _IPFW_SOCKOPT_END +}; +#endif /* IP_FW_TABLE_ADD */ + +/* + * Part 3: userland stuff + */ + +#ifndef KERNEL_MODULE + +/* + * internal names in struct in6_addr (netinet/in6.h) differ, + * so we remap the FreeBSD names to the platform-specific ones. + */ +#ifndef _WIN32 +#define __u6_addr in6_u +#define __u6_addr32 u6_addr32 +#define in6_u __in6_u /* missing type for ipv6 (linux 2.6.28) */ +#else /* _WIN32 uses different naming */ +#define __u6_addr __u6 +#define __u6_addr32 __s6_addr32 +#endif /* _WIN32 */ + +/* missing in linux netinet/ip.h */ +#define IPTOS_ECN_ECT0 0x02 /* ECN-capable transport (0) */ +#define IPTOS_ECN_CE 0x03 /* congestion experienced */ + +/* defined in freebsd netinet/icmp6.h */ +#define ICMP6_MAXTYPE 201 + +/* on freebsd sys/socket.h pf specific */ +#define NET_RT_IFLIST 3 /* survey interface list */ + +#if defined(__linux__) || defined(__CYGWIN32__) +/* on freebsd net/if.h XXX used */ +struct if_data { + /* ... */ + u_long ifi_mtu; /* maximum transmission unit */ +}; + +/* + * Message format for use in obtaining information about interfaces + * from getkerninfo and the routing socket. 
+ * This is used in nat.c + */ +struct if_msghdr { + u_short ifm_msglen; /* to skip over unknown messages */ + u_char ifm_version; /* future binary compatibility */ + u_char ifm_type; /* message type */ + int ifm_addrs; /* like rtm_addrs */ + int ifm_flags; /* value of if_flags */ + u_short ifm_index; /* index for associated ifp */ + struct if_data ifm_data;/* stats and other ifdata */ +}; + +/* + * Message format for use in obtaining information about interface + * addresses from getkerninfo and the routing socket + */ +struct ifa_msghdr { + u_short ifam_msglen; /* to skip over unknown messages */ + u_char ifam_version; /* future binary compatibility */ + u_char ifam_type; /* message type */ + int ifam_addrs; /* like rtm_addrs */ + int ifam_flags; /* value of ifa_flags */ + u_short ifam_index; /* index for associated ifp */ + int ifam_metric; /* value of ifa_metric */ +}; + +#ifndef NO_RTM /* conflicting with netlink */ +/* missing in net/route.h */ +#define RTM_VERSION 5 /* Up the ante and ignore older versions */ +#define RTM_IFINFO 0xe /* iface going up/down etc. */ +#define RTM_NEWADDR 0xc /* address being added to iface */ +#define RTA_IFA 0x20 /* interface addr sockaddr present */ +#endif /* NO_RTM */ + +/* SA_SIZE is used in the userland nat.c modified */ +#define SA_SIZE(sa) \ + ( (!(sa) ) ? \ + sizeof(long) : \ + 1 + ( (sizeof(struct sockaddr) - 1) | (sizeof(long) - 1) ) ) + +/* sys/time.h */ +/* + * Getkerninfo clock information structure + */ +struct clockinfo { + int hz; /* clock frequency */ + int tick; /* micro-seconds per hz tick */ + int spare; + int stathz; /* statistics clock frequency */ + int profhz; /* profiling clock frequency */ +}; + +/* no sin_len in sockaddr, we only remap in userland */ +#define sin_len sin_zero[0] + +#endif /* Linux/Win */ + +/* + * linux does not have a reentrant version of qsort, + * so we the FreeBSD stdlib version. 
+ */ +void qsort_r(void *a, size_t n, size_t es, void *thunk, + int cmp_t(void *, const void *, const void *)); + +/* prototypes from libutil */ +/* humanize_number(3) */ +#define HN_DECIMAL 0x01 +#define HN_NOSPACE 0x02 +#define HN_B 0x04 +#define HN_DIVISOR_1000 0x08 + +#define HN_GETSCALE 0x10 +#define HN_AUTOSCALE 0x20 + +int humanize_number(char *_buf, size_t _len, int64_t _number, + const char *_suffix, int _scale, int _flags); +int expand_number(const char *_buf, int64_t *_num); + +#define setprogname(x) /* not present in linux */ + +extern int optreset; /* not present in linux */ + +size_t strlcpy(char * dst, const char * src, size_t siz); +long long int strtonum(const char *nptr, long long minval, + long long maxval, const char **errstr); + +int sysctlbyname(const char *name, void *oldp, size_t *oldlenp, + void *newp, size_t newlen); + + +#else /* KERNEL_MODULE */ + +/* + * Part 4: kernel stuff + */ + +/* linux and windows kernel do not have bcopy ? */ +#define bcopy(_s, _d, _l) memcpy(_d, _s, _l) +/* definitions useful for the kernel side */ +struct route_in6 { + int dummy; +}; + +#ifdef __linux__ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) // or 2.4.x +#include +#endif + +/* skb_dst() and skb_dst_set() was introduced from linux 2.6.31 */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31) +void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst); +struct dst_entry *skb_dst(const struct sk_buff *skb); +#endif + +/* The struct flowi changed */ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,38) // check boundaries +#define flow_daddr fl.u.ip4 +#else +#define flow_daddr fl.nl_u.ip4_u +#endif + +#endif /* __linux__ */ + +/* + * Do not load prio_heap.h header because of conflicting names + * with our heap functions defined in include/netinet/ipfw/dn_heap.h + * However do define struct ptr_heap used in linux 3.12.7 etc. + */ +#define _LINUX_PRIO_HEAP_H +struct ptr_heap; + +/* + * The following define prevent the ipv6.h header to be loaded. 
+ * Starting from the 2.6.38 kernel the ipv6.h file, which is included + * by include/net/inetpeer.h in turn included by net/route.h + * include the system tcp.h file while we want to include + * our include/net/tcp.h instead. + */ +#ifndef _NET_IPV6_H +#define _NET_IPV6_H +static inline void ipv6_addr_copy(struct in6_addr *a1, const struct in6_addr *a2) +{ + memcpy(a1, a2, sizeof(struct in6_addr)); +} +#endif /* _NET_IPV6_H */ + +#endif /* KERNEL_MODULE */ + +/* + * Part 5: windows specific stuff + */ + +#ifdef _WIN32 +#ifndef KERNEL_MODULE +#define CTL_CODE( DeviceType, Function, Method, Access ) ( \ + ((DeviceType) << 16) | ((Access) << 14) | ((Function) << 2) | (Method) \ +) + +#define METHOD_BUFFERED 0 +#define METHOD_IN_DIRECT 1 +#define METHOD_OUT_DIRECT 2 +#define METHOD_NEITHER 3 +#define FILE_ANY_ACCESS 0 +#define FILE_READ_DATA ( 0x0001 ) // file & pipe +#define FILE_WRITE_DATA ( 0x0002 ) // file & pipe +#endif /* !KERNEL_MODULE */ + +#define FILE_DEVICE_IPFW 0x00654324 +#define IP_FW_BASE_CTL 0x840 +#define IP_FW_SETSOCKOPT \ + CTL_CODE(FILE_DEVICE_IPFW, IP_FW_BASE_CTL + 1, METHOD_BUFFERED, FILE_WRITE_DATA) +#define IP_FW_GETSOCKOPT \ + CTL_CODE(FILE_DEVICE_IPFW, IP_FW_BASE_CTL + 2, METHOD_BUFFERED, FILE_ANY_ACCESS) + +/********************************* +* missing declarations in altq.c * +**********************************/ + +#define _IOWR(x,y,t) _IOW(x,y,t) + +/********************************** +* missing declarations in ipfw2.c * +***********************************/ + +#define ICMP_UNREACH_NET 0 /* bad net */ +#define ICMP_UNREACH_HOST 1 /* bad host */ +#define ICMP_UNREACH_PROTOCOL 2 /* bad protocol */ +#define ICMP_UNREACH_PORT 3 /* bad port */ +#define ICMP_UNREACH_NEEDFRAG 4 /* IP_DF caused drop */ +#define ICMP_UNREACH_SRCFAIL 5 /* src route failed */ +#define ICMP_UNREACH_NET_UNKNOWN 6 /* unknown net */ +#define ICMP_UNREACH_HOST_UNKNOWN 7 /* unknown host */ +#define ICMP_UNREACH_ISOLATED 8 /* src host isolated */ +#define 
ICMP_UNREACH_NET_PROHIB 9 /* prohibited access */ +#define ICMP_UNREACH_HOST_PROHIB 10 /* ditto */ +#define ICMP_UNREACH_TOSNET 11 /* bad tos for net */ +#define ICMP_UNREACH_TOSHOST 12 /* bad tos for host */ +#define ICMP_UNREACH_FILTER_PROHIB 13 /* admin prohib */ +#define ICMP_UNREACH_HOST_PRECEDENCE 14 /* host prec vio. */ +#define ICMP_UNREACH_PRECEDENCE_CUTOFF 15 /* prec cutoff */ + + +struct ether_addr; +struct ether_addr * ether_aton(const char *a); + +/********************************* +* missing declarations in ipv6.c * +**********************************/ + +struct hostent* gethostbyname2(const char *name, int af); + + +/******************** +* windows wrappings * +*********************/ + +int my_socket(int domain, int ty, int proto); +#define socket(_a, _b, _c) my_socket(_a, _b, _c) + +#endif /* _WIN32 */ +/******************* +* SYSCTL emulation * +********************/ +#if defined (_WIN32) || defined (EMULATE_SYSCTL) +#define STRINGIFY(x) #x + +/* flag is set with the last 2 bits for access, as defined in glue.h + * and the rest for type + */ +enum { + SYSCTLTYPE_INT = 0, + SYSCTLTYPE_UINT, + SYSCTLTYPE_SHORT, + SYSCTLTYPE_USHORT, + SYSCTLTYPE_LONG, + SYSCTLTYPE_ULONG, + SYSCTLTYPE_STRING, +}; + +struct sysctlhead { + uint32_t blocklen; //total size of the entry + uint32_t namelen; //strlen(name) + '\0' + uint32_t flags; //type and access + uint32_t datalen; +}; + +#ifdef _KERNEL + +#ifdef SYSCTL_NODE +#undef SYSCTL_NODE +#endif +#define SYSCTL_NODE(a,b,c,d,e,f) +#define SYSCTL_DECL(a) +#define SYSCTL_VNET_PROC(a,b,c,d,e,f,g,h,i) + +#define GST_HARD_LIMIT 100 + +/* In the module, GST is implemented as an array of + * sysctlentry, but while passing data to the userland + * pointers are useless, the buffer is actually made of: + * - sysctlhead (fixed size, containing lengths) + * - data (typically 32 bit) + * - name (zero-terminated and padded to mod4) + */ + +struct sysctlentry { + struct sysctlhead head; + char* name; + void* data; +}; + +struct 
sysctltable { + int count; //number of valid tables + int totalsize; //total size of valid entries of al the valid tables + void* namebuffer; //a buffer for all chained names + struct sysctlentry entry[GST_HARD_LIMIT]; +}; + +#ifdef SYSBEGIN +#undef SYSBEGIN +#endif +#define SYSBEGIN(x) void sysctl_addgroup_##x() { +#ifdef SYSEND +#undef SYSEND +#endif +#define SYSEND } + +/* XXX remove duplication */ +#define SYSCTL_INT(a,b,c,d,e,f,g) \ + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ + (d) | (SYSCTLTYPE_INT << 2), sizeof(*e), e) + +#define SYSCTL_VNET_INT(a,b,c,d,e,f,g) \ + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ + (d) | (SYSCTLTYPE_INT << 2), sizeof(*e), e) + +#define SYSCTL_UINT(a,b,c,d,e,f,g) \ + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ + (d) | (SYSCTLTYPE_UINT << 2), sizeof(*e), e) + +#define SYSCTL_VNET_UINT(a,b,c,d,e,f,g) \ + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ + (d) | (SYSCTLTYPE_UINT << 2), sizeof(*e), e) + +#define SYSCTL_LONG(a,b,c,d,e,f,g) \ + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ + (d) | (SYSCTLTYPE_LONG << 2), sizeof(*e), e) + +#define SYSCTL_ULONG(a,b,c,d,e,f,g) \ + sysctl_pushback(STRINGIFY(a) "." 
STRINGIFY(c) + 1, \ + (d) | (SYSCTLTYPE_ULONG << 2), sizeof(*e), e) +#define TUNABLE_INT(a,b) + +void keinit_GST(void); +void keexit_GST(void); +int kesysctl_emu_set(void* p, int l); +int kesysctl_emu_get(struct sockopt* sopt); +void sysctl_pushback(char* name, int flags, int datalen, void* data); + +#endif /* _KERNEL */ + +int sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp, + size_t newlen); +#endif /* _WIN32" || EMULATE_SYSCTL */ +#ifdef _WIN32 +int do_cmd(int optname, void *optval, uintptr_t optlen); + +#endif /* _WIN32 */ + +#define __PAST_END(v, idx) v[idx] +#endif /* !_GLUE_H */ diff --git a/ipfw/Makefile b/ipfw/Makefile new file mode 100644 index 0000000..a32d02a --- /dev/null +++ b/ipfw/Makefile @@ -0,0 +1,128 @@ +# +# $Id: Makefile 11688 2012-08-12 20:58:26Z luigi $ +# +# GNUMakefile to build the userland part of ipfw on Linux and Windows +# +# Do not set with = or := so we can inherit from the caller + +include ../Makefile.inc + +all: $(TARGET) + +#TCC=c:/path/to/tcc + +# common flags +EXTRA_CFLAGS += -O1 +EXTRA_CFLAGS += -Wall +EXTRA_CFLAGS += -include ../glue.h +EXTRA_CFLAGS += -I ./include_e -I ./include + +TARGET := ipfw +ifneq ($(VER),openwrt) +ifeq ($(OSARCH),Linux) + EXTRA_CFLAGS += -D__BSD_VISIBLE + EXTRA_CFLAGS += -Werror + # Required by GCC 4.6 + EXTRA_CFLAGS += -Wno-unused-but-set-variable +endif +ifeq ($(OSARCH),FreeBSD) + EXTRA_CFLAGS += -D__BSD_VISIBLE + EXTRA_CFLAGS += -Werror +endif +ifeq ($(OSARCH),Darwin) + EXTRA_CFLAGS += -D__BSD_VISIBLE + EXTRA_CFLAGS += -Werror +endif + +ifeq ($(OSARCH),Windows) +# we only support Cygwin and tcc as compilers. +ifeq ($(WIN64),1) + EXTRA_CFLAGS += -D_X64EMU +endif + +ifeq ($(TCC),) # cygwin + EXTRA_CFLAGS += -I/cygdrive/c/$(DDKDIR)/inc/ddk + EXTRA_CFLAGS += -I . + EXTRA_CFLAGS += -pipe -Wall +else #-- build with tcc + # TCC points to the root of tcc tree + CC=$(TCC)/tcc.exe + EXTRA_CFLAGS += -DTCC -I.. 
+ EXTRA_CFLAGS += -I$(TCC)/include/winapi -I$(TCC)/include + EXTRA_CFLAGS += -nostdinc + + EFILES_. += err.h grp.h netdb.h pwd.h sysexits.h + EFILES_arpa += inet.h + EFILES_net += if.h + EFILES_netinet += in.h in_systm.h ip.h ip_icmp.h + EFILES_sys += cdefs.h wait.h ioctl.h socket.h + +endif + # EXTRA_CFLAGS += -D_WIN32 # see who defines it + EXTRA_CFLAGS += -Dsetsockopt=wnd_setsockopt + EXTRA_CFLAGS += -Dgetsockopt=wnd_getsockopt + EXTRA_CFLAGS += -DEMULATE_SYSCTL + EFILES_net += ethernet.h route.h + EFILES_netinet += ether.h icmp6.h + EFILES_sys += sysctl.h + TARGET = ipfw.exe +ipfw: $(TARGET) +endif # windows +endif # !openwrt + +CFLAGS += $(EXTRA_CFLAGS) +# Location of OS headers and libraries. After our stuff. +USRDIR?= /usr +ifeq ($(TCC),) + CFLAGS += -I$(USRDIR)/include + LDFLAGS += -L$(USRDIR)/lib +else + LDFLAGS += -L. -L$(TCC)/lib -lws2_32 +endif + +OBJS = ipfw2.o dummynet.o main.o ipv6.o qsort_r.o +OBJS += expand_number.o humanize_number.o glue.o + +# we don't use ALTQ +CFLAGS += -DNO_ALTQ +#OBJS += altq.o + +all: $(TARGET) + -@echo "Done build for $(OSARCH) VER $(VER)" + +$(TARGET): $(OBJS) + $(MSG) " LD $@" + $(HIDE)$(CC) $(LDFLAGS) -o $@ $^ + +$(OBJS) : ipfw2.h ../glue.h include_e + +# support to create empty dirs and files in include_e/ +# EDIRS is the list of directories, EFILES is the list of files. +EFILES_sys += sockio.h +EFILES_. += libutil.h +EFILES_netinet += __emtpy.h + +M ?= $(shell pwd) + +# first make a list of directories from variable names +EDIRS= $(subst EFILES_,,$(filter EFILES_%,$(.VARIABLES))) +# then prepend the directory name to individual files. +# $(empty) serves to interpret the following space literally, +# and the ": = " substitution packs spaces into one. 
+EFILES = $(foreach i,$(EDIRS),$(subst $(empty) , $(i)/, $(EFILES_$(i): = ))) + +include_e: + $(MSG) "building include_e in $M" + -@rm -rf $(M)/include_e opt_* + -@mkdir -p $(M)/include_e + -@(cd $(M)/include_e; mkdir -p $(EDIRS); touch $(EFILES) ) + -@(cd $(M)/include_e/netinet; \ + for i in ip_fw.h ip_dummynet.h tcp.h; do \ + cp ../../../sys/netinet/$$i .; done; ) + +clean distclean: + -@rm -rf $(OBJS) $(TARGET) include_e + +diff: + -@(diff -upr $(BSD_HEAD)/sbin/ipfw .) + diff --git a/ipfw/add_rules b/ipfw/add_rules new file mode 100755 index 0000000..f7866d7 --- /dev/null +++ b/ipfw/add_rules @@ -0,0 +1,27 @@ +#!/bin/bash +# +# A test script to add rules + +PRG=./ipfw + +myfun() { + $PRG add 10 count icmp from any to 131.114.9.128 + $PRG add 20 count icmp from 131.114.9.128 to any + $PRG add 20 count icmp from any to 131.114.9.130 + $PRG add 30 count icmp from 131.114.9.130 to any + $PRG add 40 count icmp from any to 131.114.9.129 + $PRG add 50 count icmp from 131.114.9.129 to any + $PRG add 60 count icmp from 131.114.9.236 to any + sleep 1 + $PRG del 10 + $PRG del 20 + $PRG del 20 + $PRG del 30 + $PRG del 40 + $PRG del 50 + $PRG del 60 +} + +for ((i=0;i<100;i++)) ; do + myfun +done diff --git a/ipfw/dummynet.c b/ipfw/dummynet.c new file mode 100644 index 0000000..15f00b6 --- /dev/null +++ b/ipfw/dummynet.c @@ -0,0 +1,1459 @@ +/* + * Copyright (c) 2002-2003,2010 Luigi Rizzo + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. 
+ *
+ * $FreeBSD: head/sbin/ipfw/dummynet.c 206843 2010-04-19 15:11:45Z luigi $
+ *
+ * dummynet support
+ */
+
+/*
+ * NOTE(review): the header names in the #include lines below were missing
+ * in this copy of the patch and have been restored from the upstream
+ * FreeBSD sbin/ipfw/dummynet.c; confirm against the repository.
+ */
+#include <sys/types.h>
+#include <sys/socket.h>
+/* XXX there are several sysctl leftover here */
+#include <sys/sysctl.h>
+
+#include "ipfw2.h"
+
+#include <ctype.h>
+#include <err.h>
+#include <errno.h>
+#include <libutil.h>
+#include <netdb.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+#include <arpa/inet.h> /* inet_ntoa */
+
+
+/*
+ * Keyword table for the dummynet configuration commands: each entry
+ * pairs a keyword string with its TOK_* token. Several keywords are
+ * aliases for the same token (e.g. "bw"/"bandwidth", "pri"/"priority").
+ * The table ends with a { NULL, 0 } entry.
+ */
+static struct _s_x dummynet_params[] = {
+	{ "plr",		TOK_PLR },
+	{ "noerror",		TOK_NOERROR },
+	{ "buckets",		TOK_BUCKETS },
+	{ "dst-ip",		TOK_DSTIP },
+	{ "src-ip",		TOK_SRCIP },
+	{ "dst-port",		TOK_DSTPORT },
+	{ "src-port",		TOK_SRCPORT },
+	{ "proto",		TOK_PROTO },
+	{ "weight",		TOK_WEIGHT },
+	{ "lmax",		TOK_LMAX },
+	{ "maxlen",		TOK_LMAX },
+	{ "all",		TOK_ALL },
+	{ "mask",		TOK_MASK }, /* alias for both */
+	{ "sched_mask",		TOK_SCHED_MASK },
+	{ "flow_mask",		TOK_FLOW_MASK },
+	{ "droptail",		TOK_DROPTAIL },
+	{ "red",		TOK_RED },
+	{ "gred",		TOK_GRED },
+	{ "bw",			TOK_BW },
+	{ "bandwidth",		TOK_BW },
+	{ "delay",		TOK_DELAY },
+	{ "link",		TOK_LINK },
+	{ "pipe",		TOK_PIPE },
+	{ "queue",		TOK_QUEUE },
+	{ "flowset",		TOK_FLOWSET },
+	{ "sched",		TOK_SCHED },
+	{ "pri",		TOK_PRI },
+	{ "priority",		TOK_PRI },
+	{ "type",		TOK_TYPE },
+	{ "flow-id",		TOK_FLOWID},
+	{ "dst-ipv6",		TOK_DSTIP6},
+	{ "dst-ip6",		TOK_DSTIP6},
+	{ "src-ipv6",		TOK_SRCIP6},
+	{ "src-ip6",		TOK_SRCIP6},
+	{ "profile",		TOK_PROFILE},
+	{ "burst",		TOK_BURST},
+	{ "dummynet-params",	TOK_NULL },
+	{ NULL, 0 } /* terminator */
+};
+
+/*
+ * Address len bytes past p, as a void pointer (len may be negative).
+ * Arguments are parenthesised so that expressions such as "*o" or
+ * "-len" can be passed safely.
+ */
+#define O_NEXT(p, len) ((void *)((char *)(p) + (len)))
+
+/*
+ * Initialise the object header oid with the given length, type and id;
+ * the subtype is always reset to 0.
+ */
+static void
+oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
+{
+	oid->len = len;
+	oid->type = type;
+	oid->subtype = 0;
+	oid->id = id;
+}
+
+/*
+ * make room in the buffer and move the pointer forward
+ *
+ * The header at *o is filled with (len, type, id 0), *o is advanced by
+ * len bytes, and the address of the header just filled is returned.
+ */
+static void *
+o_next(struct dn_id **o, int len, int type)
+{
+	struct dn_id *ret = *o;
+	oid_fill(ret, len, type, 0);
+	*o = O_NEXT(*o, len);
+	return ret;
+}
+
+/* handle variable length structures moving back the pointer and fixing length */
+static void *
+o_compact(struct dn_id **o, int len, int real_length, int type)
+{
+	struct dn_id *ret;
+
+	/*
+	 * Step back over the len bytes previously reserved, rewrite the
+	 * header with the real length, then leave *o just past the
+	 * compacted object.
+	 */
+	ret = O_NEXT(*o, -len);
+	oid_fill(ret, real_length, type, 0);
+	*o = O_NEXT(ret, real_length);
+	return ret;
+}
+
+/* Disabled: comparison callback for sorting flow queues, kept for reference. */
+#if 0
+static int
+sort_q(void *arg, const void *pa, const void *pb)
+{
+	int rev = (co.do_sort < 0);
+	int field = rev ? -co.do_sort : co.do_sort;
+	long long res = 0;
+	const struct dn_flow_queue *a = pa;
+	const struct dn_flow_queue *b = pb;
+
+	switch (field) {
+	case 1: /* pkts */
+		res = a->len - b->len;
+		break;
+	case 2: /* bytes */
+		res = a->len_bytes - b->len_bytes;
+		break;
+
+	case 3: /* tot pkts */
+		res = a->tot_pkts - b->tot_pkts;
+		break;
+
+	case 4: /* tot bytes */
+		res = a->tot_bytes - b->tot_bytes;
+		break;
+	}
+	if (res < 0)
+		res = -1;
+	if (res > 0)
+		res = 1;
+	return (int)(rev ? res : -res);
+}
+#endif
+
+/*
+ * print a mask and header for the subsequent list of flows
+ *
+ * IPv4 ids are printed as hex address/port pairs; IPv6 ids also show
+ * the flow id and print the addresses through inet_ntop().
+ */
+static void
+print_mask(struct ipfw_flow_id *id)
+{
+	if (!IS_IP6_FLOW_ID(id)) {
+		printf(" "
+		    "mask: %s 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n",
+		    id->extra ? "queue," : "",
+		    id->proto,
+		    id->src_ip, id->src_port,
+		    id->dst_ip, id->dst_port);
+	} else {
+		char buf[255];
+		printf("\n mask: %sproto: 0x%02x, flow_id: 0x%08x, ",
+		    id->extra ? "queue," : "",
+		    id->proto, id->flow_id6);
+		inet_ntop(AF_INET6, &(id->src_ip6), buf, sizeof(buf));
+		printf("%s/0x%04x -> ", buf, id->src_port);
+		inet_ntop(AF_INET6, &(id->dst_ip6), buf, sizeof(buf));
+		printf("%s/0x%04x\n", buf, id->dst_port);
+	}
+}
+
+/*
+ * Print the column header for a list of flows; the layout depends on
+ * whether id describes an IPv4 or an IPv6 flow.
+ */
+static void
+print_header(struct ipfw_flow_id *id)
+{
+	if (!IS_IP6_FLOW_ID(id))
+		printf("BKT Prot ___Source IP/port____ "
+		    "____Dest. IP/port____ "
+		    "Tot_pkt/bytes Pkt/Byte Drp\n");
+	else
+		printf("BKT ___Prot___ _flow-id_ "
+		    "______________Source IPv6/port_______________ "
+		    "_______________Dest. IPv6/port_______________ "
+		    "Tot_pkt/bytes Pkt/Byte Drp\n");
+}
+
+/*
+ * Print one flow record. When *print is non-zero the column header is
+ * printed first and *print is cleared, so the header appears only once
+ * per list.
+ */
+static void
+list_flow(struct dn_flow *ni, int *print)
+{
+	char buff[255];
+	struct protoent *pe;
+	struct in_addr ina;
+	struct ipfw_flow_id *id = &ni->fid;
+
+	if (*print) {
+		print_header(&ni->fid);
+		*print = 0;
+	}
+	/* pe is NULL for unknown protocols; the number is printed instead */
+	pe = getprotobynumber(id->proto);
+	/* XXX: Should check for IPv4 flows */
+	printf("%3u%c", (ni->oid.id) & 0xff,
+	    id->extra ? '*' : ' ');
+	if (!IS_IP6_FLOW_ID(id)) {
+		if (pe)
+			printf("%-4s ", pe->p_name);
+		else
+			printf("%4u ", id->proto);
+		ina.s_addr = htonl(id->src_ip);
+		printf("%15s/%-5d ",
+		    inet_ntoa(ina), id->src_port);
+		ina.s_addr = htonl(id->dst_ip);
+		printf("%15s/%-5d ",
+		    inet_ntoa(ina), id->dst_port);
+	} else {
+		/* Print IPv6 flows */
+		if (pe != NULL)
+			printf("%9s ", pe->p_name);
+		else
+			printf("%9u ", id->proto);
+		printf("%7d %39s/%-5d ", id->flow_id6,
+		    inet_ntop(AF_INET6, &(id->src_ip6), buff, sizeof(buff)),
+		    id->src_port);
+		printf(" %39s/%-5d ",
+		    inet_ntop(AF_INET6, &(id->dst_ip6), buff, sizeof(buff)),
+		    id->dst_port);
+	}
+	pr_u64(&ni->tot_pkts, 4);
+	pr_u64(&ni->tot_bytes, 8);
+	printf("%2u %4u %3u\n",
+	    ni->length, ni->len_bytes, ni->drops);
+}
+
+/*
+ * Print the parameters of flowset fs: queue size, packet loss rate and
+ * queue management policy (droptail or RED/GRED).
+ *
+ * If prefix is a non-empty string it is printed in front of a short
+ * summary and then cleared (prefix[0] is set to '\0'); otherwise the
+ * full "qNNNNN" line is printed, followed by the flow mask when the
+ * flowset has one.
+ *
+ * All intermediate strings are built with snprintf() bounded by the
+ * size of the destination, so unexpected values are truncated instead
+ * of overflowing the stack buffers.
+ */
+static void
+print_flowset_parms(struct dn_fs *fs, char *prefix)
+{
+	int l;
+	char qs[30];
+	char plr[30];
+	char red[90]; /* Display RED parameters */
+
+	l = fs->qsize;
+	if (fs->flags & DN_QSIZE_BYTES) {
+		if (l >= 8192)
+			snprintf(qs, sizeof(qs), "%d KB", l / 1024);
+		else
+			snprintf(qs, sizeof(qs), "%d B", l);
+	} else
+		snprintf(qs, sizeof(qs), "%3d sl.", l);
+	if (fs->plr)
+		snprintf(plr, sizeof(plr), "plr %f",
+		    1.0 * fs->plr / (double)(0x7fffffff));
+	else
+		plr[0] = '\0';
+
+	if (fs->flags & DN_IS_RED) /* RED parameters */
+		snprintf(red, sizeof(red),
+		    "\n\t %cRED w_q %f min_th %d max_th %d max_p %f",
+		    (fs->flags & DN_IS_GENTLE_RED) ? 'G' : ' ',
+		    1.0 * fs->w_q / (double)(1 << SCALE_RED),
+		    fs->min_th,
+		    fs->max_th,
+		    1.0 * fs->max_p / (double)(1 << SCALE_RED));
+	else
+		snprintf(red, sizeof(red), "droptail");
+
+	if (prefix[0]) {
+		printf("%s %s%s %d queues (%d buckets) %s\n",
+		    prefix, qs, plr, fs->oid.id, fs->buckets, red);
+		prefix[0] = '\0';
+	} else {
+		printf("q%05d %s%s %d flows (%d buckets) sched %d "
+		    "weight %d lmax %d pri %d %s\n",
+		    fs->fs_nr, qs, plr, fs->oid.id, fs->buckets,
+		    fs->sched_nr, fs->par[0], fs->par[1], fs->par[2], red);
+		if (fs->flags & DN_HAVE_MASK)
+			print_mask(&fs->flow_mask);
+	}
+}
+
+/*
+ * Print the name, loss ratio (loss_level / samples_no) and number of
+ * samples of delay profile p. Nothing is printed when the profile has
+ * no samples, which also avoids a division by zero.
+ */
+static void
+print_extra_delay_parms(struct dn_profile *p)
+{
+	double loss;
+	if (p->samples_no <= 0)
+		return;
+
+	loss = p->loss_level;
+	loss /= p->samples_no;
+	printf("\t profile: name \"%s\" loss %f samples %d\n",
+	    p->name, loss, p->samples_no);
+}
+
+/* Print buf followed by a newline if it is not empty, then reset it to "". */
+static void
+flush_buf(char *buf)
+{
+	if (buf[0])
+		printf("%s\n", buf);
+	buf[0] = '\0';
+}
+
+/*
+ * generic list routine. We expect objects in a specific order, i.e.
+ * PIPES AND SCHEDULERS:
+ *	link; scheduler; internal flowset if any; instances
+ * we can tell a pipe from the number.
+ * + * FLOWSETS: + * flowset; queues; + * link i (int queue); scheduler i; si(i) { flowsets() : queues } + */ +static void +list_pipes(struct dn_id *oid, struct dn_id *end) +{ + char buf[160]; /* pending buffer */ + int toPrint = 1; /* print header */ + + buf[0] = '\0'; + for (; oid != end; oid = O_NEXT(oid, oid->len)) { + if (oid->len < sizeof(*oid)) + errx(1, "invalid oid len %d\n", oid->len); + + switch (oid->type) { + default: + flush_buf(buf); + printf("unrecognized object %d size %d\n", oid->type, oid->len); + break; + case DN_TEXT: /* list of attached flowsets */ + { + int i, l; + struct { + struct dn_id id; + uint32_t p[0]; + } *d = (void *)oid; + l = (oid->len - sizeof(*oid))/sizeof(d->p[0]); + if (l == 0) + break; + printf(" Children flowsets: "); + for (i = 0; i < l; i++) + printf("%u ", d->p[i]); + printf("\n"); + break; + } + case DN_CMD_GET: + if (co.verbose) + printf("answer for cmd %d, len %d\n", oid->type, oid->id); + break; + case DN_SCH: { + struct dn_sch *s = (struct dn_sch *)oid; + flush_buf(buf); + printf(" sched %d type %s flags 0x%x %d buckets %d active\n", + s->sched_nr, + s->name, s->flags, s->buckets, s->oid.id); + if (s->flags & DN_HAVE_MASK) + print_mask(&s->sched_mask); + } + break; + + case DN_FLOW: + list_flow((struct dn_flow *)oid, &toPrint); + break; + + case DN_LINK: { + struct dn_link *p = (struct dn_link *)oid; + double b = p->bandwidth; + char bwbuf[30]; + char burst[5 + 7]; + + /* This starts a new object so flush buffer */ + flush_buf(buf); + /* data rate */ + if (b == 0) + sprintf(bwbuf, "unlimited "); + else if (b >= 1000000) + sprintf(bwbuf, "%7.3f Mbit/s", b/1000000); + else if (b >= 1000) + sprintf(bwbuf, "%7.3f Kbit/s", b/1000); + else + sprintf(bwbuf, "%7.3f bit/s ", b); + + if (humanize_number(burst, sizeof(burst), p->burst, + "", HN_AUTOSCALE, 0) < 0 || co.verbose) + sprintf(burst, "%d", (int)p->burst); + sprintf(buf, "%05d: %s %4d ms burst %s", + p->link_nr % DN_MAX_ID, bwbuf, p->delay, burst); + } + break; + + 
case DN_FS: + print_flowset_parms((struct dn_fs *)oid, buf); + break; + case DN_PROFILE: + flush_buf(buf); + print_extra_delay_parms((struct dn_profile *)oid); + } + flush_buf(buf); // XXX does it really go here ? + } +} + +/* + * Delete pipe, queue or scheduler i + */ +int +ipfw_delete_pipe(int do_pipe, int i) +{ + struct { + struct dn_id oid; + uintptr_t a[1]; /* add more if we want a list */ + } cmd; + oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); + cmd.oid.subtype = (do_pipe == 1) ? DN_LINK : + ( (do_pipe == 2) ? DN_FS : DN_SCH); + cmd.a[0] = i; + i = do_cmd(IP_DUMMYNET3, &cmd, cmd.oid.len); + if (i) { + i = 1; + warn("rule %u: setsockopt(IP_DUMMYNET_DEL)", i); + } + return i; +} + +/* + * Code to parse delay profiles. + * + * Some link types introduce extra delays in the transmission + * of a packet, e.g. because of MAC level framing, contention on + * the use of the channel, MAC level retransmissions and so on. + * From our point of view, the channel is effectively unavailable + * for this extra time, which is constant or variable depending + * on the link type. Additionally, packets may be dropped after this + * time (e.g. on a wireless link after too many retransmissions). + * We can model the additional delay with an empirical curve + * that represents its distribution. + * + * cumulative probability + * 1.0 ^ + * | + * L +-- loss-level x + * | ****** + * | * + * | ***** + * | * + * | ** + * | * + * +-------*-------------------> + * delay + * + * The empirical curve may have both vertical and horizontal lines. + * Vertical lines represent constant delay for a range of + * probabilities; horizontal lines correspond to a discontinuity + * in the delay distribution: the link will use the largest delay + * for a given probability. + * + * To pass the curve to dummynet, we must store the parameters + * in a file as described below, and issue the command + * + * ipfw pipe <n> config ... bw XXX profile <filename> ... 
+ * + * The file format is the following, with whitespace acting as + * a separator and '#' indicating the beginning of a comment: + * + * samples N + * the number of samples used in the internal + * representation (2..1024; default 100); + * + * loss-level L + * The probability above which packets are lost. + * (0.0 <= L <= 1.0, default 1.0 i.e. no loss); + * + * name identifier + * An optional name (listed by "ipfw pipe show") + * to identify the distribution; + * + * "delay prob" | "prob delay" + * One of these two lines is mandatory and defines + * the format of the following lines with data points. + * + * XXX YYY + * 2 or more lines representing points in the curve, + * with either delay or probability first, according + * to the chosen format. + * The unit for delay is milliseconds. + * + * Data points do not need to be ordered or equal to the number + * specified in the "samples" line. ipfw will sort and interpolate + * the curve as needed. + * + * Example of a profile file: + + name bla_bla_bla + samples 100 + loss-level 0.86 + prob delay + 0 200 # minimum overhead is 200ms + 0.5 200 + 0.5 300 + 0.8 1000 + 0.9 1300 + 1 1300 + + * Internally, we will convert the curve to a fixed number of + * samples, and when it is time to transmit a packet we will + * model the extra delay as extra bits in the packet. + * + */ + +#define ED_MAX_LINE_LEN 256+ED_MAX_NAME_LEN +#define ED_TOK_SAMPLES "samples" +#define ED_TOK_LOSS "loss-level" +#define ED_TOK_NAME "name" +#define ED_TOK_DELAY "delay" +#define ED_TOK_PROB "prob" +#define ED_TOK_BW "bw" +#define ED_SEPARATORS " \t\n" +#define ED_MIN_SAMPLES_NO 2 + +/* + * returns 1 if s is a non-negative number, with at most one '.' + */ +static int +is_valid_number(const char *s) +{ + int i, dots_found = 0; + int len = strlen(s); + + for (i = 0; i < len; ++i) + if (!isdigit(s[i]) && (s[i] != '.' || ++dots_found > 1)) + return 0; + return 1; +} + +/* + * Take as input a string describing a bandwidth value + * and return the numeric bandwidth value. 
+ * set clocking interface or bandwidth value + */ +static void +read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen) +{ + if (*bandwidth != -1) + warnx("duplicate token, override bandwidth value!"); + + if (arg[0] >= 'a' && arg[0] <= 'z') { + if (!if_name) { + errx(1, "no if support"); + } + if (namelen >= IFNAMSIZ) + warn("interface name truncated"); + namelen--; + /* interface name */ + strncpy(if_name, arg, namelen); + if_name[namelen] = '\0'; + *bandwidth = 0; + } else { /* read bandwidth value */ + int bw; + char *end = NULL; + + bw = strtoul(arg, &end, 0); + if (*end == 'K' || *end == 'k') { + end++; + bw *= 1000; + } else if (*end == 'M' || *end == 'm') { + end++; + bw *= 1000000; + } + if ((*end == 'B' && + _substrcmp2(end, "Bi", "Bit/s") != 0) || + _substrcmp2(end, "by", "bytes") == 0) + bw *= 8; + + if (bw < 0) + errx(EX_DATAERR, "bandwidth too large"); + + *bandwidth = bw; + if (if_name) + if_name[0] = '\0'; + } +} + +struct point { + double prob; + double delay; +}; + +static int +compare_points(const void *vp1, const void *vp2) +{ + const struct point *p1 = vp1; + const struct point *p2 = vp2; + double res = 0; + + res = p1->prob - p2->prob; + if (res == 0) + res = p1->delay - p2->delay; + if (res < 0) + return -1; + else if (res > 0) + return 1; + else + return 0; +} + +#define ED_EFMT(s) EX_DATAERR,"error in %s at line %d: "#s,filename,lineno + +/* + * Interpolate a set of probability-value tuples. + * + * This function takes as input a tuple of values + * and samples the interpolated curve described from the tuples. + * + * The user defined points are stored in the points structure. + * The number of points is stored in points_no. + * The user defined sampling value is stored in samples_no. + * The resulting samples are in the "samples" pointer. + * + * We assume that the last point for the '1' value of the + * probability should be defined. (XXX add checks for this) + * + * The input data are points and points_no. 
+ * The output data are s (the array of s_no samples) + * and s_no (the number of samples) + * + */ +static void +interpolate_samples(struct point *p, int points_no, + int *samples, int samples_no, const char *filename) +{ + double dy; /* delta on the y axis */ + double y; /* current value of y */ + double x; /* current value of x */ + double m; /* the y slope */ + int i; /* samples index */ + int curr; /* points current index */ + + /* make sure that there are enough points. */ + /* XXX Duplicated should be removed */ + if (points_no < 3) + errx(EX_DATAERR, "%s too few samples, need at least %d", + filename, 3); + + qsort(p, points_no, sizeof(struct point), compare_points); + + dy = 1.0/samples_no; + y = 0; + + for (i=0, curr = 0; i < samples_no; i++, y+=dy) { + /* This statement moves the curr pointer to the next point + * skipping the points with the same x value. We are + * guaranteed to exit from the loop because the + * last possible value of y is strictly less than 1 + * and the last possible value of the y points is 1 */ + while ( y >= p[curr+1].prob ) curr++; + + /* compute the slope of the curve */ + m = (p[curr+1].delay - p[curr].delay) / (p[curr+1].prob - p[curr].prob); + /* compute the x value starting from the current point */ + x = p[curr].delay + (y - p[curr].prob) * m; + samples[i] = x; + } + + /* add the last sample */ + samples[i] = p[curr+1].delay; +} + +/* + * p is the profile + * link is the link (old pipe) + */ +static void +load_extra_delays(const char *filename, struct dn_profile *p, + struct dn_link *link) +{ + char line[ED_MAX_LINE_LEN]; + FILE *f; + int lineno = 0; + + int samples = -1; + double loss = -1.0; + char profile_name[ED_MAX_NAME_LEN]; + int delay_first = -1; + int do_points = 0; + struct point points[ED_MAX_SAMPLES_NO]; + int points_no = 0; + + /* XXX link never NULL? 
*/ + p->link_nr = link->link_nr; + + profile_name[0] = '\0'; + f = fopen(filename, "r"); + if (f == NULL) + err(EX_UNAVAILABLE, "fopen: %s", filename); + + while (fgets(line, ED_MAX_LINE_LEN, f)) { /* read commands */ + char *s, *cur = line, *name = NULL, *arg = NULL; + + ++lineno; + + /* parse the line */ + while (cur) { + s = strsep(&cur, ED_SEPARATORS); + if (s == NULL || *s == '#') + break; + if (*s == '\0') + continue; + if (arg) + errx(ED_EFMT("too many arguments")); + if (name == NULL) + name = s; + else + arg = s; + } + + if ((name == NULL) || (*name == '#')) /* empty line */ + continue; + if (arg == NULL) + errx(ED_EFMT("missing arg for %s"), name); + + if (!strcasecmp(name, ED_TOK_SAMPLES)) { + if (samples > 0) + errx(ED_EFMT("duplicate ``samples'' line")); + if (atoi(arg) <=0) + errx(ED_EFMT("invalid number of samples")); + samples = atoi(arg); + if (samples>=ED_MAX_SAMPLES_NO-1) + errx(ED_EFMT("too many samples, maximum is %d"), + ED_MAX_SAMPLES_NO-1); + do_points = 0; + } else if (!strcasecmp(name, ED_TOK_BW)) { + char buf[IFNAMSIZ]; + read_bandwidth(arg, &link->bandwidth, buf, sizeof(buf)); + p->bandwidth = link->bandwidth; + } else if (!strcasecmp(name, ED_TOK_LOSS)) { + if (loss != -1.0) + errx(ED_EFMT("duplicated token: %s"), name); + if (!is_valid_number(arg)) + errx(ED_EFMT("invalid %s"), arg); + loss = atof(arg); + if (loss > 1) + errx(ED_EFMT("%s greater than 1.0"), name); + do_points = 0; + } else if (!strcasecmp(name, ED_TOK_NAME)) { + if (profile_name[0] != '\0') + errx(ED_EFMT("duplicated token: %s"), name); + strncpy(profile_name, arg, sizeof(profile_name) - 1); + profile_name[sizeof(profile_name)-1] = '\0'; + do_points = 0; + } else if (!strcasecmp(name, ED_TOK_DELAY)) { + if (do_points) + errx(ED_EFMT("duplicated token: %s"), name); + delay_first = 1; + do_points = 1; + } else if (!strcasecmp(name, ED_TOK_PROB)) { + if (do_points) + errx(ED_EFMT("duplicated token: %s"), name); + delay_first = 0; + do_points = 1; + } else if (do_points) { 
+ if (!is_valid_number(name) || !is_valid_number(arg)) + errx(ED_EFMT("invalid point found")); + if (delay_first) { + points[points_no].delay = atof(name); + points[points_no].prob = atof(arg); + } else { + points[points_no].delay = atof(arg); + points[points_no].prob = atof(name); + } + if (points[points_no].prob > 1.0) + errx(ED_EFMT("probability greater than 1.0")); + ++points_no; + } else { + errx(ED_EFMT("unrecognised command '%s'"), name); + } + } + + fclose (f); + + if (samples == -1) { + warnx("'%s' not found, assuming 100", ED_TOK_SAMPLES); + samples = 100; + } + + if (loss == -1.0) { + warnx("'%s' not found, assuming no loss", ED_TOK_LOSS); + loss = 1; + } + + interpolate_samples(points, points_no, p->samples, samples, filename); + + p->samples_no = samples++; + p->loss_level = loss * samples; + strncpy(p->name, profile_name, sizeof(p->name)); +} + +/* + * configuration of pipes, schedulers, flowsets. + * When we configure a new scheduler, an empty pipe is created, so: + * + * do_pipe = 1 -> "pipe N config ..." only for backward compatibility + * sched N+Delta type fifo sched_mask ... + * pipe N+Delta + * flowset N+Delta pipe N+Delta (no parameters) + * sched N type wf2q+ sched_mask ... + * pipe N + * + * do_pipe = 2 -> flowset N config + * flowset N parameters + * + * do_pipe = 3 -> sched N config + * sched N parameters (default no pipe) + * optional Pipe N config ... 
+ * pipe ==> + */ +void +ipfw_config_pipe(int ac, char **av) +{ + int i; + u_int j; + char *end; + void *par = NULL; + struct dn_id *buf, *base; + struct dn_sch *sch = NULL; + struct dn_link *p = NULL; + struct dn_fs *fs = NULL; + struct dn_profile *pf = NULL; + struct ipfw_flow_id *mask = NULL; + int lmax; + uint32_t _foo = 0, *flags = &_foo , *buckets = &_foo; + size_t max_pf_size = sizeof(struct dn_profile) + ED_MAX_SAMPLES_NO * sizeof(int); + + /* + * allocate space for 1 header, + * 1 scheduler, 1 link, 1 flowset, 1 profile + */ + lmax = sizeof(struct dn_id); /* command header */ + lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + + sizeof(struct dn_fs); + lmax += max_pf_size; + + av++; ac--; + /* Pipe number */ + if (ac && isdigit(**av)) { + i = atoi(*av); av++; ac--; + } else + i = -1; + if (i <= 0) + errx(EX_USAGE, "need a pipe/flowset/sched number"); + base = buf = safe_calloc(1, lmax); + /* all commands start with a 'CONFIGURE' and a version */ + o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); + base->id = DN_API_VERSION; + + switch (co.do_pipe) { + case 1: /* "pipe N config ..." */ + /* Allocate space for the WF2Q+ scheduler, its link + * and the FIFO flowset. Set the number, but leave + * the scheduler subtype and other parameters to 0 + * so the kernel will use appropriate defaults. + * XXX todo: add a flag to record if a parameter + * is actually configured. + * If we do a 'pipe config' mask -> sched_mask. + * The FIFO scheduler and link are derived from the + * WF2Q+ one in the kernel. 
+ */ + sch = o_next(&buf, sizeof(*sch), DN_SCH); + p = o_next(&buf, sizeof(*p), DN_LINK); + fs = o_next(&buf, sizeof(*fs), DN_FS); + + sch->sched_nr = i; + sch->oid.subtype = 0; /* defaults to WF2Q+ */ + mask = &sch->sched_mask; + flags = &sch->flags; + buckets = &sch->buckets; + *flags |= DN_PIPE_CMD; + + p->link_nr = i; + + /* This flowset is only for the FIFO scheduler */ + fs->fs_nr = i + 2*DN_MAX_ID; + fs->sched_nr = i + DN_MAX_ID; + break; + + case 2: /* "queue N config ... " */ + fs = o_next(&buf, sizeof(*fs), DN_FS); + fs->fs_nr = i; + mask = &fs->flow_mask; + flags = &fs->flags; + buckets = &fs->buckets; + break; + + case 3: /* "sched N config ..." */ + sch = o_next(&buf, sizeof(*sch), DN_SCH); + fs = o_next(&buf, sizeof(*fs), DN_FS); + sch->sched_nr = i; + mask = &sch->sched_mask; + flags = &sch->flags; + buckets = &sch->buckets; + /* fs is used only with !MULTIQUEUE schedulers */ + fs->fs_nr = i + DN_MAX_ID; + fs->sched_nr = i; + break; + } + /* set to -1 those fields for which we want to reuse existing + * values from the kernel. + * Also, *_nr and subtype = 0 mean reuse the value from the kernel. + * XXX todo: support reuse of the mask. 
+ */ + if (p) + p->bandwidth = -1; + for (j = 0; j < sizeof(fs->par)/sizeof(fs->par[0]); j++) + fs->par[j] = -1; + while (ac > 0) { + double d; + int tok = match_token(dummynet_params, *av); + ac--; av++; + + switch(tok) { + case TOK_NOERROR: + NEED(fs, "noerror is only for pipes"); + fs->flags |= DN_NOERROR; + break; + + case TOK_PLR: + NEED(fs, "plr is only for pipes"); + NEED1("plr needs argument 0..1\n"); + d = strtod(av[0], NULL); + if (d > 1) + d = 1; + else if (d < 0) + d = 0; + fs->plr = (int)(d*0x7fffffff); + ac--; av++; + break; + + case TOK_QUEUE: + NEED(fs, "queue is only for pipes or flowsets"); + NEED1("queue needs queue size\n"); + end = NULL; + fs->qsize = strtoul(av[0], &end, 0); + if (*end == 'K' || *end == 'k') { + fs->flags |= DN_QSIZE_BYTES; + fs->qsize *= 1024; + } else if (*end == 'B' || + _substrcmp2(end, "by", "bytes") == 0) { + fs->flags |= DN_QSIZE_BYTES; + } + ac--; av++; + break; + + case TOK_BUCKETS: + NEED(fs, "buckets is only for pipes or flowsets"); + NEED1("buckets needs argument\n"); + *buckets = strtoul(av[0], NULL, 0); + ac--; av++; + break; + + case TOK_FLOW_MASK: + case TOK_SCHED_MASK: + case TOK_MASK: + NEED(mask, "tok_mask"); + NEED1("mask needs mask specifier\n"); + /* + * per-flow queue, mask is dst_ip, dst_port, + * src_ip, src_port, proto measured in bits + */ + par = NULL; + + bzero(mask, sizeof(*mask)); + end = NULL; + + while (ac >= 1) { + uint32_t *p32 = NULL; + uint16_t *p16 = NULL; + uint32_t *p20 = NULL; + struct in6_addr *pa6 = NULL; + uint32_t a; + + tok = match_token(dummynet_params, *av); + ac--; av++; + switch(tok) { + case TOK_ALL: + /* + * special case, all bits significant + * except 'extra' (the queue number) + */ + mask->dst_ip = ~0; + mask->src_ip = ~0; + mask->dst_port = ~0; + mask->src_port = ~0; + mask->proto = ~0; + n2mask(&mask->dst_ip6, 128); + n2mask(&mask->src_ip6, 128); + mask->flow_id6 = ~0; + *flags |= DN_HAVE_MASK; + goto end_mask; + + case TOK_QUEUE: + mask->extra = ~0; + *flags |= 
DN_HAVE_MASK; + goto end_mask; + + case TOK_DSTIP: + mask->addr_type = 4; + p32 = &mask->dst_ip; + break; + + case TOK_SRCIP: + mask->addr_type = 4; + p32 = &mask->src_ip; + break; + + case TOK_DSTIP6: + mask->addr_type = 6; + pa6 = &mask->dst_ip6; + break; + + case TOK_SRCIP6: + mask->addr_type = 6; + pa6 = &mask->src_ip6; + break; + + case TOK_FLOWID: + mask->addr_type = 6; + p20 = &mask->flow_id6; + break; + + case TOK_DSTPORT: + p16 = &mask->dst_port; + break; + + case TOK_SRCPORT: + p16 = &mask->src_port; + break; + + case TOK_PROTO: + break; + + default: + ac++; av--; /* backtrack */ + goto end_mask; + } + if (ac < 1) + errx(EX_USAGE, "mask: value missing"); + if (*av[0] == '/') { + a = strtoul(av[0]+1, &end, 0); + if (pa6 == NULL) + a = (a == 32) ? ~0 : (1 << a) - 1; + } else + a = strtoul(av[0], &end, 0); + if (p32 != NULL) + *p32 = a; + else if (p16 != NULL) { + if (a > 0xFFFF) + errx(EX_DATAERR, + "port mask must be 16 bit"); + *p16 = (uint16_t)a; + } else if (p20 != NULL) { + if (a > 0xfffff) + errx(EX_DATAERR, + "flow_id mask must be 20 bit"); + *p20 = (uint32_t)a; + } else if (pa6 != NULL) { + if (a > 128) + errx(EX_DATAERR, + "in6addr invalid mask len"); + else + n2mask(pa6, a); + } else { + if (a > 0xFF) + errx(EX_DATAERR, + "proto mask must be 8 bit"); + mask->proto = (uint8_t)a; + } + if (a != 0) + *flags |= DN_HAVE_MASK; + ac--; av++; + } /* end while, config masks */ +end_mask: + break; + + case TOK_RED: + case TOK_GRED: + NEED1("red/gred needs w_q/min_th/max_th/max_p\n"); + fs->flags |= DN_IS_RED; + if (tok == TOK_GRED) + fs->flags |= DN_IS_GENTLE_RED; + /* + * the format for parameters is w_q/min_th/max_th/max_p + */ + if ((end = strsep(&av[0], "/"))) { + double w_q = strtod(end, NULL); + if (w_q > 1 || w_q <= 0) + errx(EX_DATAERR, "0 < w_q <= 1"); + fs->w_q = (int) (w_q * (1 << SCALE_RED)); + } + if ((end = strsep(&av[0], "/"))) { + fs->min_th = strtoul(end, &end, 0); + if (*end == 'K' || *end == 'k') + fs->min_th *= 1024; + } + if ((end = 
strsep(&av[0], "/"))) { + fs->max_th = strtoul(end, &end, 0); + if (*end == 'K' || *end == 'k') + fs->max_th *= 1024; + } + if ((end = strsep(&av[0], "/"))) { + double max_p = strtod(end, NULL); + if (max_p > 1 || max_p <= 0) + errx(EX_DATAERR, "0 < max_p <= 1"); + fs->max_p = (int)(max_p * (1 << SCALE_RED)); + } + ac--; av++; + break; + + case TOK_DROPTAIL: + NEED(fs, "droptail is only for flowsets"); + fs->flags &= ~(DN_IS_RED|DN_IS_GENTLE_RED); + break; + + case TOK_BW: + NEED(p, "bw is only for links"); + NEED1("bw needs bandwidth or interface\n"); + read_bandwidth(av[0], &p->bandwidth, NULL, 0); + ac--; av++; + break; + + case TOK_DELAY: + NEED(p, "delay is only for links"); + NEED1("delay needs argument 0..10000ms\n"); + p->delay = strtoul(av[0], NULL, 0); + ac--; av++; + break; + + case TOK_TYPE: { + int l; + NEED(sch, "type is only for schedulers"); + NEED1("type needs a string"); + l = strlen(av[0]); + if (l == 0 || l > 15) + errx(1, "type %s too long\n", av[0]); + strcpy(sch->name, av[0]); + sch->oid.subtype = 0; /* use string */ + ac--; av++; + break; + } + + case TOK_WEIGHT: + NEED(fs, "weight is only for flowsets"); + NEED1("weight needs argument\n"); + fs->par[0] = strtol(av[0], &end, 0); + ac--; av++; + break; + + case TOK_LMAX: + NEED(fs, "lmax is only for flowsets"); + NEED1("lmax needs argument\n"); + fs->par[1] = strtol(av[0], &end, 0); + ac--; av++; + break; + + case TOK_PRI: + NEED(fs, "priority is only for flowsets"); + NEED1("priority needs argument\n"); + fs->par[2] = strtol(av[0], &end, 0); + ac--; av++; + break; + + case TOK_SCHED: + case TOK_PIPE: + NEED(fs, "pipe/sched"); + NEED1("pipe/link/sched needs number\n"); + fs->sched_nr = strtoul(av[0], &end, 0); + ac--; av++; + break; + + case TOK_PROFILE: + { + size_t real_length; + + NEED((!pf), "profile already set"); + NEED(p, "profile"); + NEED1("extra delay needs the file name\n"); + + /* load the profile structure using the DN_API */ + pf = o_next(&buf, max_pf_size, DN_PROFILE); + 
load_extra_delays(av[0], pf, p); //XXX can't fail? + + /* compact the dn_id structure */ + real_length = sizeof(struct dn_profile) + + pf->samples_no * sizeof(int); + o_compact(&buf, max_pf_size, real_length, DN_PROFILE); + --ac; ++av; + } + break; + + case TOK_BURST: + NEED(p, "burst"); + NEED1("burst needs argument\n"); + errno = 0; + if (expand_number(av[0], (int64_t *)&p->burst) < 0) + if (errno != ERANGE) + errx(EX_DATAERR, + "burst: invalid argument"); + if (errno || p->burst > (1ULL << 48) - 1) + errx(EX_DATAERR, + "burst: out of range (0..2^48-1)"); + ac--; av++; + break; + + default: + errx(EX_DATAERR, "unrecognised option ``%s''", av[-1]); + } + } + + /* check validity of parameters */ + if (p) { + if (p->delay > 10000) + errx(EX_DATAERR, "delay must be < 10000"); + if (p->bandwidth == -1) + p->bandwidth = 0; + } + if (fs) { + /* XXX accept a 0 scheduler to keep the default */ + if (fs->flags & DN_QSIZE_BYTES) { + size_t len; + long limit; + + len = sizeof(limit); + if (sysctlbyname("net.inet.ip.dummynet.pipe_byte_limit", + &limit, &len, NULL, 0) == -1) + limit = 1024*1024; + if (fs->qsize > limit) + errx(EX_DATAERR, "queue size must be < %ldB", limit); + } else { + size_t len; + long limit; + + len = sizeof(limit); + if (sysctlbyname("net.inet.ip.dummynet.pipe_slot_limit", + &limit, &len, NULL, 0) == -1) + limit = 100; + if (fs->qsize > limit) + errx(EX_DATAERR, "2 <= queue size <= %ld", limit); + } + + if (fs->flags & DN_IS_RED) { + size_t len; + int lookup_depth, avg_pkt_size; + double w_q; + + if (fs->min_th >= fs->max_th) + errx(EX_DATAERR, "min_th %d must be < than max_th %d", + fs->min_th, fs->max_th); + if (fs->max_th == 0) + errx(EX_DATAERR, "max_th must be > 0"); + + len = sizeof(int); + if (sysctlbyname("net.inet.ip.dummynet.red_lookup_depth", + &lookup_depth, &len, NULL, 0) == -1) + lookup_depth = 256; + if (lookup_depth == 0) + errx(EX_DATAERR, "net.inet.ip.dummynet.red_lookup_depth" + " must be greater than zero"); + + len = sizeof(int); + 
if (sysctlbyname("net.inet.ip.dummynet.red_avg_pkt_size", + &avg_pkt_size, &len, NULL, 0) == -1) + avg_pkt_size = 512; + + if (avg_pkt_size == 0) + errx(EX_DATAERR, + "net.inet.ip.dummynet.red_avg_pkt_size must" + " be greater than zero"); + + /* + * Ticks needed for sending a medium-sized packet. + * Unfortunately, when we are configuring a WF2Q+ queue, we + * do not have bandwidth information, because that is stored + * in the parent pipe, and also we have multiple queues + * competing for it. So we set s=0, which is not very + * correct. But on the other hand, why do we want RED with + * WF2Q+ ? + */ +#if 0 + if (p.bandwidth==0) /* this is a WF2Q+ queue */ + s = 0; + else + s = (double)ck.hz * avg_pkt_size * 8 / p.bandwidth; +#endif + /* + * max idle time (in ticks) before avg queue size becomes 0. + * NOTA: (3/w_q) is approx the value x so that + * (1-w_q)^x < 10^-3. + */ + w_q = ((double)fs->w_q) / (1 << SCALE_RED); +#if 0 // go in kernel + idle = s * 3. / w_q; + fs->lookup_step = (int)idle / lookup_depth; + if (!fs->lookup_step) + fs->lookup_step = 1; + weight = 1 - w_q; + for (t = fs->lookup_step; t > 1; --t) + weight *= 1 - w_q; + fs->lookup_weight = (int)(weight * (1 << SCALE_RED)); +#endif /* code moved in the kernel */ + } + } + + i = do_cmd(IP_DUMMYNET3, base, (char *)buf - (char *)base); + + if (i) + err(1, "setsockopt(%s)", "IP_DUMMYNET_CONFIGURE"); +} + +void +dummynet_flush(void) +{ + struct dn_id oid; + oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); + do_cmd(IP_DUMMYNET3, &oid, oid.len); +} + +/* Parse input for 'ipfw [pipe|sched|queue] show [range list]' + * Returns the number of ranges, and possibly stores them + * in the array v of size len. + */ +static int +parse_range(int ac, char *av[], uint32_t *v, int len) +{ + int n = 0; + char *endptr, *s; + uint32_t base[2]; + + if (v == NULL || len < 2) { + v = base; + len = 2; + } + + for (s = *av; s != NULL; av++, ac--) { + v[0] = strtoul(s, &endptr, 10); + v[1] = (*endptr != '-') ? 
v[0] : + strtoul(endptr+1, &endptr, 10); + if (*endptr == '\0') { /* prepare for next round */ + s = (ac > 0) ? *(av+1) : NULL; + } else { + if (*endptr != ',') { + warn("invalid number: %s", s); + s = ++endptr; + continue; + } + /* continue processing from here */ + s = ++endptr; + ac++; + av--; + } + if (v[1] < v[0] || + v[1] >= DN_MAX_ID-1 || + v[1] >= DN_MAX_ID-1) { + continue; /* invalid entry */ + } + n++; + /* translate if 'pipe list' */ + if (co.do_pipe == 1) { + v[0] += DN_MAX_ID; + v[1] += DN_MAX_ID; + } + v = (n*2 < len) ? v + 2 : base; + } + return n; +} + +/* main entry point for dummynet list functions. co.do_pipe indicates + * which function we want to support. + * av may contain filtering arguments, either individual entries + * or ranges, or lists (space or commas are valid separators). + * Format for a range can be n1-n2 or n3 n4 n5 ... + * In a range n1 must be <= n2, otherwise the range is ignored. + * A number 'n4' is translate in a range 'n4-n4' + * All number must be > 0 and < DN_MAX_ID-1 + */ +void +dummynet_list(int ac, char *av[], int show_counters) +{ + struct dn_id *oid, *x = NULL; + int ret, i; + int n; /* # of ranges */ + u_int buflen, l; + u_int max_size; /* largest obj passed up */ + + (void)show_counters; // XXX unused, but we should use it. + ac--; + av++; /* skip 'list' | 'show' word */ + + n = parse_range(ac, av, NULL, 0); /* Count # of ranges. */ + + /* Allocate space to store ranges */ + l = sizeof(*oid) + sizeof(uint32_t) * n * 2; + oid = safe_calloc(1, l); + oid_fill(oid, l, DN_CMD_GET, DN_API_VERSION); + + if (n > 0) /* store ranges in idx */ + parse_range(ac, av, (uint32_t *)(oid + 1), n*2); + /* + * Compute the size of the largest object returned. If the + * response leaves at least this much spare space in the + * buffer, then surely the response is complete; otherwise + * there might be a risk of truncation and we will need to + * retry with a larger buffer. + * XXX don't bother with smaller structs. 
+ */ + max_size = sizeof(struct dn_fs); + if (max_size < sizeof(struct dn_sch)) + max_size = sizeof(struct dn_sch); + if (max_size < sizeof(struct dn_flow)) + max_size = sizeof(struct dn_flow); + + switch (co.do_pipe) { + case 1: + oid->subtype = DN_LINK; /* list pipe */ + break; + case 2: + oid->subtype = DN_FS; /* list queue */ + break; + case 3: + oid->subtype = DN_SCH; /* list sched */ + break; + } + + /* + * Ask the kernel an estimate of the required space (result + * in oid.id), unless we are requesting a subset of objects, + * in which case the kernel does not give an exact answer. + * In any case, space might grow in the meantime due to the + * creation of new queues, so we must be prepared to retry. + */ + if (n > 0) { + buflen = 4*1024; + } else { + ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); + if (ret != 0 || oid->id <= sizeof(*oid)) + goto done; + buflen = oid->id + max_size; + oid->len = sizeof(*oid); /* restore */ + } + /* Try a few times, until the buffer fits */ + for (i = 0; i < 20; i++) { + l = buflen; + x = safe_realloc(x, l); + bcopy(oid, x, oid->len); + ret = do_cmd(-IP_DUMMYNET3, x, (uintptr_t)&l); + if (ret != 0 || x->id <= sizeof(*oid)) + goto done; /* no response */ + if (l + max_size <= buflen) + break; /* ok */ + buflen *= 2; /* double for next attempt */ + } + list_pipes(x, O_NEXT(x, l)); +done: + if (x) + free(x); + free(oid); +} diff --git a/ipfw/expand_number.c b/ipfw/expand_number.c new file mode 100644 index 0000000..d557111 --- /dev/null +++ b/ipfw/expand_number.c @@ -0,0 +1,100 @@ +/*- + * Copyright (c) 2007 Eric Anderson + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +// #include +__FBSDID("$FreeBSD: src/lib/libutil/expand_number.c,v 1.2.4.2 2009/06/10 14:52:34 des Exp $"); + +#include +#include +#include +#include +//#include +#include + +/* + * Convert an expression of the following forms to a int64_t. + * 1) A positive decimal number. + * 2) A positive decimal number followed by a 'b' or 'B' (mult by 1). + * 3) A positive decimal number followed by a 'k' or 'K' (mult by 1 << 10). + * 4) A positive decimal number followed by a 'm' or 'M' (mult by 1 << 20). + * 5) A positive decimal number followed by a 'g' or 'G' (mult by 1 << 30). + * 6) A positive decimal number followed by a 't' or 'T' (mult by 1 << 40). + * 7) A positive decimal number followed by a 'p' or 'P' (mult by 1 << 50). + * 8) A positive decimal number followed by a 'e' or 'E' (mult by 1 << 60). 
/*
 * Convert a string of one of the following forms to an int64_t:
 *   - a decimal, octal (leading 0) or hex (leading 0x) number;
 *   - such a number followed by a unit letter, in either case:
 *       b (x 1), k (x 2^10), m (x 2^20), g (x 2^30),
 *       t (x 2^40), p (x 2^50), e (x 2^60).
 * Only the first character after the number is examined.
 *
 * On success stores the result in *num and returns 0, leaving errno
 * untouched. On failure returns -1 and sets errno to EINVAL (no digits,
 * unknown unit) or ERANGE (value does not fit in an int64_t).
 */
int
expand_number(const char *buf, int64_t *num)
{
	static const char unit[] = "bkmgtpe";
	char *endptr, s;
	intmax_t parsed;
	int64_t number, mult;
	int i, saved_errno;

	saved_errno = errno;
	errno = 0;
	parsed = strtoimax(buf, &endptr, 0);

	if (endptr == buf) {
		/* No valid digits. */
		errno = EINVAL;
		return (-1);
	}
	if (errno == ERANGE || parsed > INT64_MAX || parsed < INT64_MIN) {
		/* The bare number already overflows. */
		errno = ERANGE;
		return (-1);
	}
	errno = saved_errno;
	number = (int64_t)parsed;

	if (*endptr == '\0') {
		/* No unit. */
		*num = number;
		return (0);
	}

	/* <ctype.h> functions require an unsigned char value. */
	s = (char)tolower((unsigned char)*endptr);
	for (i = 0; unit[i] != '\0' && unit[i] != s; i++)
		;
	if (unit[i] == '\0') {
		/* Unrecognized unit. */
		errno = EINVAL;
		return (-1);
	}

	/*
	 * Each step in unit[] is a factor of 2^10; the largest is 2^60,
	 * which still fits in an int64_t. Check the bounds by division
	 * before multiplying, so no signed overflow can ever occur.
	 */
	mult = (int64_t)1 << (10 * i);
	if (number > INT64_MAX / mult || number < INT64_MIN / mult) {
		errno = ERANGE;
		return (-1);
	}

	*num = number * mult;
	return (0);
}
#ifndef HAVE_NAT
/*
 * Placeholder NAT entry points, compiled when NAT support is not built
 * in. Each one only reports on stderr that it is unsupported.
 */
static void
nat_unsupported(const char *caller)
{
	fprintf(stderr, "%s unsupported\n", caller);
}

void
ipfw_show_nat(int ac, char **av)
{
	(void)ac;
	(void)av;
	nat_unsupported(__FUNCTION__);
}

void
ipfw_config_nat(int ac, char **av)
{
	(void)ac;
	(void)av;
	nat_unsupported(__FUNCTION__);
}
#endif /* !HAVE_NAT */

#ifdef __linux__
int optreset;	/* missing in linux */
#endif
/*
 * strlcpy() is not implemented in linux.
 *
 * Copy src into dst, writing at most siz bytes including the
 * terminating NUL (nothing is written when siz is 0).
 * Returns strlen(src); a return value >= siz means the copy was
 * truncated.
 */
size_t
strlcpy(char *dst, const char *src, size_t siz)
{
	size_t srclen = strlen(src);

	if (siz != 0) {
		size_t ncopy = (srclen < siz - 1) ? srclen : siz - 1;

		memcpy(dst, src, ncopy);
		dst[ncopy] = '\0';	/* always NUL-terminate dst */
	}
	return (srclen);	/* count does not include NUL */
}


/*
 * strtonum() is missing in linux and windows.
 *
 * Reduced version of the BSD function: the whole string must be a
 * number (strtoll() syntax with base 0, so 0x and 0 prefixes are
 * accepted). minval and maxval are accepted for source compatibility
 * but are NOT enforced; the only error reported is "invalid".
 *
 * On success returns the value, stores NULL in *errstr and leaves errno
 * as it was. On failure stores "invalid" in *errstr, sets errno (EINVAL,
 * or whatever the conversion reported) and returns the conversion result
 * as-is. errstr may be NULL if the caller does not want the message.
 */
long long int
strtonum(const char *nptr, long long minval, long long maxval,
	 const char **errstr)
{
	const char *err = NULL;
	char *end;
	long long ret;
	int errno_c = errno;	/* save actual errno */

	(void)minval;		/* range checking is not implemented */
	(void)maxval;

	errno = 0;
#ifdef TCC
	ret = strtol(nptr, &end, 0);
#else
	ret = strtoll(nptr, &end, 0);
#endif
	/*
	 * Accept only a string that is exactly one number: something
	 * was converted and nothing follows it.
	 */
	if (errno != 0 || end == nptr || *end != '\0') {
		err = "invalid";
		if (errno == 0)
			errno = EINVAL;
	} else {
		errno = errno_c;
	}
	if (errstr != NULL)
		*errstr = err;
	return ret;
}
/*
 * Set and/or get the sysctl variable called 'name'.
 *
 * If the old value is not wanted, oldp and oldlenp should be NULL; if
 * no new value is to be set, newp should be NULL / newlen 0.
 * Since there is no information on types, a length of sizeof(int) is
 * taken to mean an int and sizeof(long) a long.
 *
 * Returns 0 on success, -1 on errors.
 *
 * Two implementations:
 *
 * - _WIN32 / EMULATE_SYSCTL: the request is embedded in the usual
 *   sockopt mechanics, as a dn_id with the DN_SYSCTL_GET or
 *   DN_SYSCTL_SET type.
 *     newp and newlen != 0                => set
 *     else name, oldp and *oldlenp != 0   => get
 *     else                                => print
 *   For get and print the whole table is fetched (a first probe
 *   request returns the size needed) and parsed here. Print lists
 *   every variable whose name is 'name' or lies below 'name.'
 *   (all of them when name is NULL). A "negative" *oldlenp signals
 *   that oldp is really a FILE * to print to instead of stdout.
 *
 * - otherwise (linux): variables are files under
 *   /sys/module/ipfw_mod/parameters/, named after the last component
 *   of 'name'. A read is done first (if requested), then a write.
 */
int
sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp,
	size_t newlen)
{
#if defined (_WIN32) || defined (EMULATE_SYSCTL)
	struct dn_id *oid;
	struct sysctlhead *entry;
	char *pstring;
	char *pdata;
	FILE *fp = stdout;
	int is_get = 0;
	int l;
	int ret;

	/*
	 * size_t is unsigned, so "negative" has to be tested through a
	 * signed type of the same width.
	 */
	if (oldlenp != NULL && (intptr_t)*oldlenp < 0)
		fp = (FILE *)oldp;
	else if (name != NULL && oldp != NULL && oldlenp != NULL &&
	    *oldlenp > 0)
		is_get = 1;

	if (newp != NULL && newlen != 0) {
		/* this is a set */
		size_t namelen;

		if (name == NULL)
			return -1;
		namelen = strlen(name) + 1;

		l = sizeof(struct dn_id) + sizeof(struct sysctlhead) +
		    namelen + newlen;
		oid = malloc(l);
		if (oid == NULL)
			return -1;
		oid->len = l;
		oid->type = DN_SYSCTL_SET;
		oid->id = DN_API_VERSION;

		/* layout: dn_id, sysctlhead, value, name */
		entry = (struct sysctlhead *)(oid + 1);
		pdata = (char *)(entry + 1);
		pstring = pdata + newlen;

		entry->blocklen = ((sizeof(struct sysctlhead) + namelen +
		    newlen) + 3) & ~3;
		entry->namelen = namelen;
		entry->flags = 0;
		entry->datalen = newlen;

		bcopy(newp, pdata, newlen);
		bcopy(name, pstring, namelen);

		ret = do_cmd(IP_DUMMYNET3, oid, (uintptr_t)l);
		free(oid);
		return (ret != 0) ? -1 : 0;
	}

	/* this is a get or a print: first probe for the table size */
	l = sizeof(struct dn_id);
	oid = malloc(l);
	if (oid == NULL)
		return -1;
	oid->len = l;
	oid->type = DN_SYSCTL_GET;
	oid->id = DN_API_VERSION;

	ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l);
	l = oid->id;
	free(oid);
	if (ret != 0 || l < (int)sizeof(struct dn_id))
		return -1;

	/* now the real request, with a buffer of the reported size */
	oid = malloc(l);
	if (oid == NULL)
		return -1;
	oid->len = l;
	oid->type = DN_SYSCTL_GET;
	oid->id = DN_API_VERSION;

	ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l);
	if (ret != 0) {
		ret = -1;
		goto out;
	}

	/* walk the table; a zero blocklen terminates it */
	for (entry = (struct sysctlhead *)(oid + 1); entry->blocklen != 0;
	    entry = (struct sysctlhead *)((unsigned char *)entry +
	    entry->blocklen)) {
		pdata = (char *)(entry + 1);
		pstring = pdata + entry->datalen;

		if (is_get) {
			if (strcmp(name, pstring) != 0)
				continue;
			/* match found, sanity check on len */
			if (*oldlenp < entry->datalen) {
				printf("%s error: buffer too small\n",__FUNCTION__);
				ret = -1;
				goto out;
			}
			*oldlenp = entry->datalen;
			bcopy(pdata, oldp, *oldlenp);
			goto out;	/* ret is 0 here */
		}

		/* this is a print: skip entries outside the subtree */
		if (name != NULL) {
			size_t n = strlen(name);

			if (strncmp(pstring, name, n) != 0 ||
			    (pstring[n] != '\0' && pstring[n] != '.'))
				continue;
		}

		fprintf(fp, "%s: ",pstring);
		switch (entry->flags >> 2) {
		case SYSCTLTYPE_LONG:
			fprintf(fp, "%li ", *(long*)(pdata));
			break;
		case SYSCTLTYPE_UINT:
			fprintf(fp, "%u ", *(unsigned int*)(pdata));
			break;
		case SYSCTLTYPE_ULONG:
			fprintf(fp, "%lu ", *(unsigned long*)(pdata));
			break;
		case SYSCTLTYPE_INT:
		default:
			fprintf(fp, "%i ", *(int*)(pdata));
		}
		if ((entry->flags & 0x00000003) == CTLFLAG_RD)
			fprintf(fp, "\t(read only)\n");
		else
			fprintf(fp, "\n");
	}
	/* NOTE(review): a get that matches no entry still returns 0. */
out:
	free(oid);
	return ret;

#else /* __linux__ */
	static const char sysfs_dir[] = "/sys/module/ipfw_mod/parameters/";
	char filename[256];	/* full filename */
	const char *varp;
	FILE *fp;
	int ret = 0;		/* return value */
	long d;

	if (name == NULL)	/* XXX set errno */
		return -1;

	/* the file is named after the last component of the name */
	varp = strrchr(name, '.');
	if (varp == NULL)	/* XXX set errno */
		return -1;

	snprintf(filename, sizeof(filename), "%s%s", sysfs_dir, varp+1);

	if (oldp && oldlenp) {	/* read mode */
		fp = fopen(filename, "r");
		if (fp == NULL) {
			fprintf(stderr, "%s fopen error reading filename %s\n", __FUNCTION__, filename);
			return -1;
		}
		if (fscanf(fp, "%ld", &d) != 1) {
			ret = -1;
		} else if (*oldlenp == sizeof(int)) {
			int dst = d;
			memcpy(oldp, &dst, *oldlenp);
		} else if (*oldlenp == sizeof(long)) {
			memcpy(oldp, &d, *oldlenp);
		} else {
			fprintf(stderr, "unknown paramerer len %d\n",
				(int)*oldlenp);
		}
		fclose(fp);
	}

	if (newp && newlen) {	/* write */
		fp = fopen(filename, "w");
		if (fp == NULL) {
			fprintf(stderr, "%s fopen error writing filename %s\n", __FUNCTION__, filename);
			return -1;
		}
		if (newlen == sizeof(int)) {
			if (fprintf(fp, "%d", *(int *)newp) < 1)
				ret = -1;
		} else if (newlen == sizeof(long)) {
			if (fprintf(fp, "%ld", *(long *)newp) < 1)
				ret = -1;
		} else {
			fprintf(stderr, "unknown paramerer len %d\n",
				(int)newlen);
		}
		/*
		 * The stream is buffered, so a write error only shows up
		 * when the data is flushed by fclose().
		 */
		if (fclose(fp) != 0)
			ret = -1;
	}

	return ret;
#endif /* __linux__ */
}

#ifdef _WIN32
/*
 * On windows, set/getsockopt are mapped to DeviceIoControl().
 * 's' is the device handle returned by my_socket(); 'level' is unused.
 * Both return 0 on success and -1 on failure.
 */
int
wnd_setsockopt(int s, int level, int sopt_name, const void *optval,
		socklen_t optlen)
{
	size_t len = sizeof (struct sockopt) + optlen;
	struct sockopt *sock;
	DWORD n;
	BOOL result;
	HANDLE _dev_h = (HANDLE)s;

	(void)level;

	/* request: a struct sockopt immediately followed by the value */
	sock = malloc(len);
	if (sock == NULL)
		return -1;

	sock->sopt_dir = SOPT_SET;
	sock->sopt_name = sopt_name;
	sock->sopt_valsize = optlen;
	sock->sopt_val = (void *)(sock+1);

	memcpy(sock->sopt_val, optval, optlen);
	result = DeviceIoControl (_dev_h, IP_FW_SETSOCKOPT, sock, len,
		NULL, 0, &n, NULL);
	free (sock);

	return (result ? 0 : -1);
}

int
wnd_getsockopt(int s, int level, int sopt_name, void *optval,
		socklen_t *optlen)
{
	socklen_t room = *optlen;	/* size of the caller's buffer */
	size_t len = sizeof (struct sockopt) + room;
	struct sockopt *sock;
	DWORD n;
	BOOL result;
	HANDLE _dev_h = (HANDLE)s;

	(void)level;

	sock = malloc(len);
	if (sock == NULL)
		return -1;

	sock->sopt_dir = SOPT_GET;
	sock->sopt_name = sopt_name;
	sock->sopt_valsize = room;
	sock->sopt_val = (void *)(sock+1);

	memcpy (sock->sopt_val, optval, room);

	/* the same buffer carries the request and receives the reply */
	result = DeviceIoControl (_dev_h, IP_FW_GETSOCKOPT, sock, len,
		sock, len, &n, NULL);
	*optlen = sock->sopt_valsize;
	/* never copy more than either buffer can hold */
	memcpy (optval, sock->sopt_val, (*optlen < room) ? *optlen : room);
	free (sock);
	return (result ? 0 : -1);
}

/*
 * Replacement for socket(): open the \\.\Ipfw device and return its
 * handle cast to int, or -1 on failure. The arguments are ignored.
 */
int
my_socket(int domain, int ty, int proto)
{
	TCHAR *pcCommPort = TEXT("\\\\.\\Ipfw");
	HANDLE _dev_h;

	(void)domain;
	(void)ty;
	(void)proto;

	/* Special Handling For Accessing Device On Windows 2000 Terminal Server
	   See Microsoft KB Article 259131 */
	_dev_h = CreateFile (pcCommPort,
		GENERIC_READ | GENERIC_WRITE,
		0, NULL,
		OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
	if (_dev_h == INVALID_HANDLE_VALUE) {
		printf("%s failed %u, cannot talk to kernel module\n",
			__FUNCTION__, (unsigned)GetLastError());
		return -1;
	}
	return (int)_dev_h;
}

/* The address family is ignored; this just calls gethostbyname(). */
struct hostent* gethostbyname2(const char *name, int af)
{
	(void)af;
	return gethostbyname(name);
}

/* Not implemented: reports so on stderr and returns NULL. */
struct ether_addr* ether_aton(const char *a)
{
	(void)a;
	fprintf(stderr, "%s empty\n", __FUNCTION__);
	return NULL;
}
#endif /* _WIN32 */
#if defined(_WIN32) && defined(TCC)
/*
 * Replacements for the libc functions that are used by ipfw but are
 * missing when building with TCC on Windows.
 */
int	opterr = 1,		/* if error message should be printed */
	optind = 1,		/* index into parent argv vector */
	optopt,			/* character checked for validity */
	optreset;		/* reset getopt */
char	*optarg;		/* argument associated with option */

#define	BADCH	(int)'?'
#define	BADARG	(int)':'
#define	EMSG	""

#define PROGNAME	"ipfw"

/*
 * getopt --
 *	Parse argc/argv argument vector.
 *
 * Returns the next option letter, -1 when there are no more options,
 * BADCH for an unknown option or a missing argument, or BADARG for a
 * missing argument when ostr starts with ':'.
 */
int
getopt(int nargc, char * const nargv[], const char *ostr)
{
	static char *place = EMSG;	/* option letter processing */
	char *oli;			/* option letter list index */

	if (optreset || *place == 0) {	/* update scanning pointer */
		optreset = 0;
		/* check the index before looking at the argument */
		if (optind >= nargc) {
			place = EMSG;
			return (-1);
		}
		place = nargv[optind];
		if (*place++ != '-') {
			/* Argument is not an option */
			place = EMSG;
			return (-1);
		}
		optopt = *place++;
		if (optopt == '-' && *place == 0) {
			/* "--" => end of options */
			++optind;
			place = EMSG;
			return (-1);
		}
		if (optopt == 0) {
			/* Solitary '-', treat as a '-' option
			   if the program (eg su) is looking for it. */
			place = EMSG;
			if (strchr(ostr, '-') == NULL)
				return (-1);
			optopt = '-';
		}
	} else
		optopt = *place++;

	/* See if option letter is one the caller wanted... */
	if (optopt == ':' || (oli = strchr(ostr, optopt)) == NULL) {
		if (*place == 0)
			++optind;
		if (opterr && *ostr != ':')
			(void)fprintf(stderr,
			    "%s: illegal option -- %c\n", PROGNAME,
			    optopt);
		return (BADCH);
	}

	/* Does this option need an argument? */
	if (oli[1] != ':') {
		/* don't need argument */
		optarg = NULL;
		if (*place == 0)
			++optind;
	} else {
		/* Option-argument is either the rest of this argument or the
		   entire next argument. */
		if (*place)
			optarg = place;
		else if (nargc > ++optind)
			optarg = nargv[optind];
		else {
			/* option-argument absent */
			place = EMSG;
			if (*ostr == ':')
				return (BADARG);
			if (opterr)
				(void)fprintf(stderr,
				    "%s: option requires an argument -- %c\n",
				    PROGNAME, optopt);
			return (BADCH);
		}
		place = EMSG;
		++optind;
	}
	return (optopt);			/* return option letter */
}

/*
 * Print "ipfw: <message>\n" on stderr; when ex is non-zero, exit
 * with status eval afterwards.
 */
void
verrx(int ex, int eval, const char *fmt, va_list ap)
{
	fprintf(stderr, "%s: ", PROGNAME);
	if (fmt != NULL)
		vfprintf(stderr, fmt, ap);
	fprintf(stderr, "\n");
	if (ex)
		exit(eval);
}

/* Print a message and exit with status eval. */
void
errx(int eval, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	verrx(1, eval, fmt, ap);
	va_end(ap);
}

/* Print a message and return. */
void
warnx(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	verrx(0, 0, fmt, ap);
	va_end(ap);
}

/*
 * Return the next token of *stringp, terminated in place at the first
 * character found in delim, and advance *stringp past it (NULL when the
 * end of the string was reached). Returns NULL if *stringp is NULL.
 */
char *
strsep(char **stringp, const char *delim)
{
	char *s;
	const char *spanp;
	int c, sc;
	char *tok;

	if ((s = *stringp) == NULL)
		return (NULL);
	for (tok = s;;) {
		c = *s++;
		spanp = delim;
		do {
			/* the NUL of delim also matches the end of string */
			if ((sc = *spanp++) == c) {
				if (c == 0)
					s = NULL;
				else
					s[-1] = 0;
				*stringp = s;
				return (tok);
			}
		} while (sc != 0);
	}
	/* NOTREACHED */
}

/* ASCII-only character classification helpers. */
static unsigned char
tolower(unsigned char c)
{
	return (c >= 'A' && c <= 'Z') ? c + 'a' - 'A' : c;
}

static int isdigit(unsigned char c)
{
	return (c >= '0' && c <= '9');
}

static int isxdigit(unsigned char c)
{
	/* strchr() also matches the terminating NUL: exclude it */
	return (c != '\0' && strchr("0123456789ABCDEFabcdef", c) ? 1 : 0);
}

static int isspace(unsigned char c)
{
	/* strchr() also matches the terminating NUL: exclude it */
	return (c != '\0' && strchr(" \t\n\r", c) ? 1 : 0);
}

static int isascii(unsigned char c)
{
	return (c < 128);
}

static int islower(unsigned char c)
{
	return (c >= 'a' && c <= 'z');
}

/* Compare two strings ignoring the case of ASCII letters. */
int
strcasecmp(const char *s1, const char *s2)
{
	const unsigned char
			*us1 = (const unsigned char *)s1,
			*us2 = (const unsigned char *)s2;

	while (tolower(*us1) == tolower(*us2++))
		if (*us1++ == '\0')
			return (0);
	return (tolower(*us1) - tolower(*--us2));
}

/* Implemented with strtol(), so the range is that of a long. */
intmax_t
strtoimax(const char * restrict nptr, char ** restrict endptr, int base)
{
	return strtol(nptr, endptr,base);
}

/* Nothing to do. */
void
setservent(int a)
{
	(void)a;
}

#define NS_INADDRSZ	4	/* size in bytes of an IPv4 address */

/*
 * Convert a dotted-quad IPv4 address ("a.b.c.d", decimal only, no
 * leading zeros) into 4 bytes in network order stored at dst.
 * Returns 1 on success, 0 if src is not a valid address, -1 with
 * errno = EINVAL if af is not AF_INET.
 */
int
inet_pton(int af, const char *src, void *dst)
{
	static const char digits[] = "0123456789";
	int saw_digit, octets, ch;
	u_char tmp[NS_INADDRSZ], *tp;

	if (af != AF_INET) {
		errno = EINVAL;
		return -1;
	}

	saw_digit = 0;
	octets = 0;
	*(tp = tmp) = 0;
	while ((ch = *src++) != '\0') {
		const char *pch;

		if ((pch = strchr(digits, ch)) != NULL) {
			u_int val = *tp * 10 + (pch - digits);

			if (saw_digit && *tp == 0)
				return (0);	/* leading zero */
			if (val > 255)
				return (0);
			*tp = val;
			if (!saw_digit) {
				if (++octets > 4)
					return (0);
				saw_digit = 1;
			}
		} else if (ch == '.' && saw_digit) {
			if (octets == 4)
				return (0);
			*++tp = 0;
			saw_digit = 0;
		} else
			return (0);
	}
	if (octets < 4)
		return (0);
	/* copy exactly the four address bytes, dst holds no more */
	memcpy(dst, tmp, NS_INADDRSZ);
	return (1);
}

/*
 * Format the 4-byte IPv4 address at _src as a dotted quad in dst,
 * which has room for size bytes. Returns dst, or NULL with errno set
 * to EINVAL (af is not AF_INET) or ENOSPC (dst too small).
 */
const char *
inet_ntop(int af, const void *_src, char *dst, socklen_t size)
{
	static const char fmt[] = "%u.%u.%u.%u";
	char tmp[sizeof "255.255.255.255"];
	const u_char *src = _src;
	int l;

	if (af != AF_INET) {
		errno = EINVAL;
		return NULL;
	}

	l = snprintf(tmp, sizeof(tmp), fmt, src[0], src[1], src[2], src[3]);
	if (l <= 0 || (socklen_t) l >= size) {
		errno = ENOSPC;
		return (NULL);
	}
	strlcpy(dst, tmp, size);
	return (dst);
}

/*%
 * Check whether "cp" is a valid ascii representation
 * of an Internet address and convert to a binary address.
 * Returns 1 if the address is valid, 0 if not.
 * This replaces inet_addr, the return value from which
 * cannot distinguish between failure and a local broadcast address.
 */
int
inet_aton(const char *cp, struct in_addr *addr) {
	u_long val;
	int base, n;
	char c;
	u_int8_t parts[4];
	u_int8_t *pp = parts;
	int digit;

	c = *cp;
	for (;;) {
		/*
		 * Collect number up to ``.''.
		 * Values are specified as for C:
		 * 0x=hex, 0=octal, isdigit=decimal.
		 */
		if (!isdigit((unsigned char)c))
			return (0);
		val = 0; base = 10; digit = 0;
		if (c == '0') {
			c = *++cp;
			if (c == 'x' || c == 'X')
				base = 16, c = *++cp;
			else {
				base = 8;
				digit = 1 ;
			}
		}
		for (;;) {
			if (isascii(c) && isdigit((unsigned char)c)) {
				if (base == 8 && (c == '8' || c == '9'))
					return (0);
				val = (val * base) + (c - '0');
				c = *++cp;
				digit = 1;
			} else if (base == 16 && isascii(c) &&
				   isxdigit((unsigned char)c)) {
				val = (val << 4) |
					(c + 10 - (islower((unsigned char)c) ? 'a' : 'A'));
				c = *++cp;
				digit = 1;
			} else
				break;
		}
		if (c == '.') {
			/*
			 * Internet format:
			 *	a.b.c.d
			 *	a.b.c	(with c treated as 16 bits)
			 *	a.b	(with b treated as 24 bits)
			 */
			if (pp >= parts + 3 || val > 0xffU)
				return (0);
			*pp++ = val;
			c = *++cp;
		} else
			break;
	}
	/*
	 * Check for trailing characters.
	 */
	if (c != '\0' && (!isascii(c) || !isspace((unsigned char)c)))
		return (0);
	/*
	 * Did we get a valid digit?
	 */
	if (!digit)
		return (0);
	/*
	 * Concoct the address according to
	 * the number of parts specified.
	 * The parts are widened before shifting: as plain ints,
	 * a first part >= 128 shifted by 24 would overflow.
	 */
	n = pp - parts + 1;
	switch (n) {
	case 1:				/*%< a -- 32 bits */
		break;

	case 2:				/*%< a.b -- 8.24 bits */
		if (val > 0xffffffU)
			return (0);
		val |= (u_long)parts[0] << 24;
		break;

	case 3:				/*%< a.b.c -- 8.8.16 bits */
		if (val > 0xffffU)
			return (0);
		val |= ((u_long)parts[0] << 24) | ((u_long)parts[1] << 16);
		break;

	case 4:				/*%< a.b.c.d -- 8.8.8.8 bits */
		if (val > 0xffU)
			return (0);
		val |= ((u_long)parts[0] << 24) | ((u_long)parts[1] << 16) |
		    ((u_long)parts[2] << 8);
		break;
	}
	if (addr != NULL)
		addr->s_addr = htonl(val);
	return (1);
}

#endif /* _WIN32 && TCC */
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +// #include +__FBSDID("$FreeBSD: src/lib/libutil/humanize_number.c,v 1.2.10.1 2008/04/20 16:29:01 antoine Exp $"); + +#include +#include +#include +#include +#include +#include +// #include +//#include + +int +humanize_number(char *buf, size_t len, int64_t bytes, + const char *suffix, int scale, int flags) +{ + const char *prefixes, *sep; + int b, i, r, maxscale, s1, s2, sign; + int64_t divisor, max; + size_t baselen; + + assert(buf != NULL); + assert(suffix != NULL); + assert(scale >= 0); + + if (flags & HN_DIVISOR_1000) { + /* SI for decimal multiplies */ + divisor = 1000; + if (flags & HN_B) + prefixes = "B\0k\0M\0G\0T\0P\0E"; + else + prefixes = "\0\0k\0M\0G\0T\0P\0E"; + } else { + /* + * binary multiplies + * XXX IEC 60027-2 recommends Ki, Mi, Gi... + */ + divisor = 1024; + if (flags & HN_B) + prefixes = "B\0K\0M\0G\0T\0P\0E"; + else + prefixes = "\0\0K\0M\0G\0T\0P\0E"; + } + +#define SCALE2PREFIX(scale) (&prefixes[(scale) << 1]) + maxscale = 7; + + if (scale >= maxscale && + (scale & (HN_AUTOSCALE | HN_GETSCALE)) == 0) + return (-1); + + if (buf == NULL || suffix == NULL) + return (-1); + + if (len > 0) + buf[0] = '\0'; + if (bytes < 0) { + sign = -1; + bytes *= -100; + baselen = 3; /* sign, digit, prefix */ + } else { + sign = 1; + bytes *= 100; + baselen = 2; /* digit, prefix */ + } + if (flags & HN_NOSPACE) + sep = ""; + else { + sep = " "; + baselen++; + } + baselen += strlen(suffix); + + /* Check if enough room for `x y' + suffix + `\0' */ + if (len < baselen + 1) + return (-1); + + if (scale & (HN_AUTOSCALE | HN_GETSCALE)) { + /* See if there is additional columns can be used. */ + for (max = 100, i = len - baselen; i-- > 0;) + max *= 10; + + /* + * Divide the number until it fits the given column. + * If there will be an overflow by the rounding below, + * divide once more. 
+ */ + for (i = 0; bytes >= max - 50 && i < maxscale; i++) + bytes /= divisor; + + if (scale & HN_GETSCALE) + return (i); + } else + for (i = 0; i < scale && i < maxscale; i++) + bytes /= divisor; + + /* If a value <= 9.9 after rounding and ... */ + if (bytes < 995 && i > 0 && flags & HN_DECIMAL) { + /* baselen + \0 + .N */ + if (len < baselen + 1 + 2) + return (-1); + b = ((int)bytes + 5) / 10; + s1 = b / 10; + s2 = b % 10; + r = snprintf(buf, len, "%d%s%d%s%s%s", + sign * s1, ".", s2, + sep, SCALE2PREFIX(i), suffix); + } else + r = snprintf(buf, len, "%" PRId64 "%s%s%s", + sign * ((bytes + 50) / 100), + sep, SCALE2PREFIX(i), suffix); + + return (r); +} diff --git a/ipfw/include/alias.h b/ipfw/include/alias.h new file mode 100644 index 0000000..888bd0d --- /dev/null +++ b/ipfw/include/alias.h @@ -0,0 +1,71 @@ +#ifndef _ALIAS_H_ +#define _ALIAS_H_ + +#define LIBALIAS_BUF_SIZE 128 + +/* + * If PKT_ALIAS_LOG is set, a message will be printed to /var/log/alias.log + * every time a link is created or deleted. This is useful for debugging. + */ +#define PKT_ALIAS_LOG 0x01 + +/* + * If PKT_ALIAS_DENY_INCOMING is set, then incoming connections (e.g. to ftp, + * telnet or web servers will be prevented by the aliasing mechanism. + */ +#define PKT_ALIAS_DENY_INCOMING 0x02 + +/* + * If PKT_ALIAS_SAME_PORTS is set, packets will be attempted sent from the + * same port as they originated on. This allows e.g. rsh to work *99% of the + * time*, but _not_ 100% (it will be slightly flakey instead of not working + * at all). This mode bit is set by PacketAliasInit(), so it is a default + * mode of operation. + */ +#define PKT_ALIAS_SAME_PORTS 0x04 + +/* + * If PKT_ALIAS_USE_SOCKETS is set, then when partially specified links (e.g. + * destination port and/or address is zero), the packet aliasing engine will + * attempt to allocate a socket for the aliasing port it chooses. This will + * avoid interference with the host machine. Fully specified links do not + * require this. 
This bit is set after a call to PacketAliasInit(), so it is + * a default mode of operation. + */ +#ifndef NO_USE_SOCKETS +#define PKT_ALIAS_USE_SOCKETS 0x08 +#endif +/*- + * If PKT_ALIAS_UNREGISTERED_ONLY is set, then only packets with + * unregistered source addresses will be aliased. Private + * addresses are those in the following ranges: + * + * 10.0.0.0 -> 10.255.255.255 + * 172.16.0.0 -> 172.31.255.255 + * 192.168.0.0 -> 192.168.255.255 + */ +#define PKT_ALIAS_UNREGISTERED_ONLY 0x10 + +/* + * If PKT_ALIAS_RESET_ON_ADDR_CHANGE is set, then the table of dynamic + * aliasing links will be reset whenever PacketAliasSetAddress() changes the + * default aliasing address. If the default aliasing address is left + * unchanged by this function call, then the table of dynamic aliasing links + * will be left intact. This bit is set after a call to PacketAliasInit(). + */ +#define PKT_ALIAS_RESET_ON_ADDR_CHANGE 0x20 + + +/* + * If PKT_ALIAS_PROXY_ONLY is set, then NAT will be disabled and only + * transparent proxying is performed. + */ +#define PKT_ALIAS_PROXY_ONLY 0x40 + +/* + * If PKT_ALIAS_REVERSE is set, the actions of PacketAliasIn() and + * PacketAliasOut() are reversed. + */ +#define PKT_ALIAS_REVERSE 0x80 + +#endif /* !_ALIAS_H_ */ diff --git a/ipfw/include/net/if_dl.h b/ipfw/include/net/if_dl.h new file mode 100644 index 0000000..4d2b4f7 --- /dev/null +++ b/ipfw/include/net/if_dl.h @@ -0,0 +1,82 @@ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_dl.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD: src/sys/net/if_dl.h,v 1.14 2005/01/07 01:45:34 imp Exp $ + */ + +#ifndef _NET_IF_DL_H_ +#define _NET_IF_DL_H_ + +/* + * A Link-Level Sockaddr may specify the interface in one of two + * ways: either by means of a system-provided index number (computed + * anew and possibly differently on every reboot), or by a human-readable + * string such as "il0" (for managerial convenience). + * + * Census taking actions, such as something akin to SIOCGCONF would return + * both the index and the human name. + * + * High volume transactions (such as giving a link-level ``from'' address + * in a recvfrom or recvmsg call) may be likely only to provide the indexed + * form, (which requires fewer copy operations and less space). 
+ * + * The form and interpretation of the link-level address is purely a matter + * of convention between the device driver and its consumers; however, it is + * expected that all drivers for an interface of a given if_type will agree. + */ + +/* + * Structure of a Link-Level sockaddr: + */ +struct sockaddr_dl { + u_char sdl_len; /* Total length of sockaddr */ + u_char sdl_family; /* AF_LINK */ + u_short sdl_index; /* if != 0, system given index for interface */ + u_char sdl_type; /* interface type */ + u_char sdl_nlen; /* interface name length, no trailing 0 reqd. */ + u_char sdl_alen; /* link level address length */ + u_char sdl_slen; /* link layer selector length */ + char sdl_data[46]; /* minimum work area, can be larger; + contains both if name and ll address */ +}; + +#define LLADDR(s) ((caddr_t)((s)->sdl_data + (s)->sdl_nlen)) + +#ifndef _KERNEL + +#include <sys/cdefs.h> + +__BEGIN_DECLS +void link_addr(const char *, struct sockaddr_dl *); +char *link_ntoa(const struct sockaddr_dl *); +__END_DECLS + +#endif /* !_KERNEL */ + +#endif diff --git a/ipfw/include/net/pfvar.h b/ipfw/include/net/pfvar.h new file mode 100644 index 0000000..304cb16 --- /dev/null +++ b/ipfw/include/net/pfvar.h @@ -0,0 +1,32 @@ +#ifndef _PF_VAR_H_ +#define _PF_VAR_H_ + +/* + * replacement for FreeBSD's pfqueue.h + */ +#include <sys/queue.h> + +#define DIOCSTARTALTQ _IO ('D', 42) +#define DIOCSTOPALTQ _IO ('D', 43) + +struct pf_altq { + TAILQ_ENTRY(pf_altq) entries; + /* ...
*/ + u_int32_t qid; /* return value */ + +#define PF_QNAME_SIZE 64 + char qname[PF_QNAME_SIZE]; /* queue name */ + +}; + +struct pfioc_altq { + u_int32_t action; + u_int32_t ticket; + u_int32_t nr; + struct pf_altq altq; +}; + +#define DIOCGETALTQS _IOWR('D', 47, struct pfioc_altq) +#define DIOCGETALTQ _IOWR('D', 48, struct pfioc_altq) + +#endif /* !_PF_VAR_H_ */ diff --git a/ipfw/include/timeconv.h b/ipfw/include/timeconv.h new file mode 100644 index 0000000..5377ebb --- /dev/null +++ b/ipfw/include/timeconv.h @@ -0,0 +1,14 @@ +/* + * simple override for _long_to_time() + */ +#ifndef _TIMECONV_H_ +#define _TIMECONV_H_ +static __inline time_t +_long_to_time(long tlong) +{ + if (sizeof(long) == sizeof(__int32_t)) + return((time_t)(__int32_t)(tlong)); + return((time_t)tlong); +} + +#endif /* _TIMECONV_H_ */ diff --git a/ipfw/ipfw.8 b/ipfw/ipfw.8 new file mode 100644 index 0000000..bc8d819 --- /dev/null +++ b/ipfw/ipfw.8 @@ -0,0 +1,3476 @@ +.\" +.\" $FreeBSD$ +.\" +.Dd October 25, 2012 +.Dt IPFW 8 +.Os +.Sh NAME +.Nm ipfw +.Nd User interface for firewall, traffic shaper, packet scheduler, +in-kernel NAT. +.Sh SYNOPSIS +.Ss FIREWALL CONFIGURATION +.Nm +.Op Fl cq +.Cm add +.Ar rule +.Nm +.Op Fl acdefnNStT +.Op Cm set Ar N +.Brq Cm list | show +.Op Ar rule | first-last ... +.Nm +.Op Fl f | q +.Op Cm set Ar N +.Cm flush +.Nm +.Op Fl q +.Op Cm set Ar N +.Brq Cm delete | zero | resetlog +.Op Ar number ... +.Pp +.Nm +.Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ...
+.Nm +.Cm set move +.Op Cm rule +.Ar number Cm to Ar number +.Nm +.Cm set swap Ar number number +.Nm +.Cm set show +.Ss SYSCTL SHORTCUTS +.Nm +.Cm enable +.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive +.Nm +.Cm disable +.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive +.Ss LOOKUP TABLES +.Nm +.Cm table Ar number Cm add Ar addr Ns Oo / Ns Ar masklen Oc Op Ar value +.Nm +.Cm table Ar number Cm delete Ar addr Ns Op / Ns Ar masklen +.Nm +.Cm table +.Brq Ar number | all +.Cm flush +.Nm +.Cm table +.Brq Ar number | all +.Cm list +.Ss DUMMYNET CONFIGURATION (TRAFFIC SHAPER AND PACKET SCHEDULER) +.Nm +.Brq Cm pipe | queue | sched +.Ar number +.Cm config +.Ar config-options +.Nm +.Op Fl s Op Ar field +.Brq Cm pipe | queue | sched +.Brq Cm delete | list | show +.Op Ar number ... +.Ss IN-KERNEL NAT +.Nm +.Op Fl q +.Cm nat +.Ar number +.Cm config +.Ar config-options +.Pp +.Nm +.Op Fl cfnNqS +.Oo +.Fl p Ar preproc +.Oo +.Ar preproc-flags +.Oc +.Oc +.Ar pathname +.Sh DESCRIPTION +The +.Nm +utility is the user interface for controlling the +.Xr ipfw 4 +firewall, the +.Xr dummynet 4 +traffic shaper/packet scheduler, and the +in-kernel NAT services. +.Pp +A firewall configuration, or +.Em ruleset , +is made of a list of +.Em rules +numbered from 1 to 65535. +Packets are passed to the firewall +from a number of different places in the protocol stack +(depending on the source and destination of the packet, +it is possible for the firewall to be +invoked multiple times on the same packet). +The packet passed to the firewall is compared +against each of the rules in the +.Em ruleset , +in rule-number order +(multiple rules with the same number are permitted, in which case +they are processed in order of insertion). +When a match is found, the action corresponding to the +matching rule is performed. 
+.Pp +Depending on the action and certain system settings, packets +can be reinjected into the firewall at some rule after the +matching one for further processing. +.Pp +A ruleset always includes a +.Em default +rule (numbered 65535) which cannot be modified or deleted, +and matches all packets. +The action associated with the +.Em default +rule can be either +.Cm deny +or +.Cm allow +depending on how the kernel is configured. +.Pp +If the ruleset includes one or more rules with the +.Cm keep-state +or +.Cm limit +option, +the firewall will have a +.Em stateful +behaviour, i.e., upon a match it will create +.Em dynamic rules , +i.e., rules that match packets with the same 5-tuple +(protocol, source and destination addresses and ports) +as the packet which caused their creation. +Dynamic rules, which have a limited lifetime, are checked +at the first occurrence of a +.Cm check-state , +.Cm keep-state +or +.Cm limit +rule, and are typically used to open the firewall on-demand to +legitimate traffic only. +See the +.Sx STATEFUL FIREWALL +and +.Sx EXAMPLES +Sections below for more information on the stateful behaviour of +.Nm . +.Pp +All rules (including dynamic ones) have a few associated counters: +a packet count, a byte count, a log count and a timestamp +indicating the time of the last match. +Counters can be displayed or reset with +.Nm +commands. +.Pp +Each rule belongs to one of 32 different +.Em sets +, and there are +.Nm +commands to atomically manipulate sets, such as enable, +disable, swap sets, move all rules in a set to another +one, delete all rules in a set. +These can be useful to +install temporary configurations, or to test them. +See Section +.Sx SETS OF RULES +for more information on +.Em sets . 
+.Pp +Rules can be added with the +.Cm add +command; deleted individually or in groups with the +.Cm delete +command, and globally (except those in set 31) with the +.Cm flush +command; displayed, optionally with the content of the +counters, using the +.Cm show +and +.Cm list +commands. +Finally, counters can be reset with the +.Cm zero +and +.Cm resetlog +commands. +.Pp +.Ss COMMAND OPTIONS +The following general options are available when invoking +.Nm : +.Bl -tag -width indent +.It Fl a +Show counter values when listing rules. +The +.Cm show +command implies this option. +.It Fl b +Only show the action and the comment, not the body of a rule. +Implies +.Fl c . +.It Fl c +When entering or showing rules, print them in compact form, +i.e., omitting the "ip from any to any" string +when this does not carry any additional information. +.It Fl d +When listing, show dynamic rules in addition to static ones. +.It Fl e +When listing and +.Fl d +is specified, also show expired dynamic rules. +.It Fl f +Do not ask for confirmation for commands that can cause problems +if misused, i.e., +.Cm flush . +If there is no tty associated with the process, this is implied. +.It Fl i +When listing a table (see the +.Sx LOOKUP TABLES +section below for more information on lookup tables), format values +as IP addresses. +By default, values are shown as integers. +.It Fl n +Only check syntax of the command strings, without actually passing +them to the kernel. +.It Fl N +Try to resolve addresses and service names in output. +.It Fl q +Be quiet when executing the +.Cm add , +.Cm nat , +.Cm zero , +.Cm resetlog +or +.Cm flush +commands; +(implies +.Fl f ) . +This is useful when updating rulesets by executing multiple +.Nm +commands in a script +(e.g., +.Ql sh\ /etc/rc.firewall ) , +or by processing a file with many +.Nm +rules across a remote login session. +It also stops a table add or delete +from failing if the entry already exists or is not present. 
+.Pp +The reason why this option may be important is that +for some of these actions, +.Nm +may print a message; if the action results in blocking the +traffic to the remote client, +the remote login session will be closed +and the rest of the ruleset will not be processed. +Access to the console would then be required to recover. +.It Fl S +When listing rules, show the +.Em set +each rule belongs to. +If this flag is not specified, disabled rules will not be +listed. +.It Fl s Op Ar field +When listing pipes, sort according to one of the four +counters (total or current packets or bytes). +.It Fl t +When listing, show last match timestamp converted with ctime(). +.It Fl T +When listing, show last match timestamp as seconds from the epoch. +This form can be more convenient for postprocessing by scripts. +.El +.Ss LIST OF RULES AND PREPROCESSING +To ease configuration, rules can be put into a file which is +processed using +.Nm +as shown in the last synopsis line. +An absolute +.Ar pathname +must be used. +The file will be read line by line and applied as arguments to the +.Nm +utility. +.Pp +Optionally, a preprocessor can be specified using +.Fl p Ar preproc +where +.Ar pathname +is to be piped through. +Useful preprocessors include +.Xr cpp 1 +and +.Xr m4 1 . +If +.Ar preproc +does not start with a slash +.Pq Ql / +as its first character, the usual +.Ev PATH +name search is performed. +Care should be taken with this in environments where not all +file systems are mounted (yet) by the time +.Nm +is being run (e.g.\& when they are mounted over NFS). +Once +.Fl p +has been specified, any additional arguments are passed on to the preprocessor +for interpretation. +This allows for flexible configuration files (like conditionalizing +them on the local hostname) and the use of macros to centralize +frequently required arguments like IP addresses. 
+.Ss TRAFFIC SHAPER CONFIGURATION +The +.Nm +.Cm pipe , queue +and +.Cm sched +commands are used to configure the traffic shaper and packet scheduler. +See the +.Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION +Section below for details. +.Pp +If the world and the kernel get out of sync the +.Nm +ABI may break, preventing you from being able to add any rules. +This can adversely affect the booting process. +You can use +.Nm +.Cm disable +.Cm firewall +to temporarily disable the firewall to regain access to the network, +allowing you to fix the problem. +.Sh PACKET FLOW +A packet is checked against the active ruleset in multiple places +in the protocol stack, under control of several sysctl variables. +These places and variables are shown below, and it is important to +have this picture in mind in order to design a correct ruleset. +.Bd -literal -offset indent + ^ to upper layers V + | | + +----------->-----------+ + ^ V + [ip(6)_input] [ip(6)_output] net.inet(6).ip(6).fw.enable=1 + | | + ^ V + [ether_demux] [ether_output_frame] net.link.ether.ipfw=1 + | | + +-->--[bdg_forward]-->--+ net.link.bridge.ipfw=1 + ^ V + | to devices | +.Ed +.Pp +The number of +times the same packet goes through the firewall can +vary between 0 and 4 depending on packet source and +destination, and system configuration. +.Pp +Note that as packets flow through the stack, headers can be +stripped or added to it, and so they may or may not be available +for inspection. +E.g., incoming packets will include the MAC header when +.Nm +is invoked from +.Cm ether_demux() , +but the same packets will have the MAC header stripped off when +.Nm +is invoked from +.Cm ip_input() +or +.Cm ip6_input() . +.Pp +Also note that each packet is always checked against the complete ruleset, +irrespective of the place where the check occurs, or the source of the packet. 
+If a rule contains some match patterns or actions which are not valid +for the place of invocation (e.g.\& trying to match a MAC header within +.Cm ip_input +or +.Cm ip6_input ), +the match pattern will not match, but a +.Cm not +operator in front of such patterns +.Em will +cause the pattern to +.Em always +match on those packets. +It is thus the responsibility of +the programmer, if necessary, to write a suitable ruleset to +differentiate among the possible places. +.Cm skipto +rules can be useful here, as an example: +.Bd -literal -offset indent +# packets from ether_demux or bdg_forward +ipfw add 10 skipto 1000 all from any to any layer2 in +# packets from ip_input +ipfw add 10 skipto 2000 all from any to any not layer2 in +# packets from ip_output +ipfw add 10 skipto 3000 all from any to any not layer2 out +# packets from ether_output_frame +ipfw add 10 skipto 4000 all from any to any layer2 out +.Ed +.Pp +(yes, at the moment there is no way to differentiate between +ether_demux and bdg_forward). +.Sh SYNTAX +In general, each keyword or argument must be provided as +a separate command line argument, with no leading or trailing +spaces. +Keywords are case-sensitive, whereas arguments may +or may not be case-sensitive depending on their nature +(e.g.\& uid's are, hostnames are not). +.Pp +Some arguments (e.g., port or address lists) are comma-separated +lists of values. +In this case, spaces after commas ',' are allowed to make +the line more readable. +You can also put the entire +command (including flags) into a single argument. 
+E.g., the following forms are equivalent: +.Bd -literal -offset indent +ipfw -q add deny src-ip 10.0.0.0/24,127.0.0.1/8 +ipfw -q add deny src-ip 10.0.0.0/24, 127.0.0.1/8 +ipfw "-q add deny src-ip 10.0.0.0/24, 127.0.0.1/8" +.Ed +.Sh RULE FORMAT +The format of firewall rules is the following: +.Bd -ragged -offset indent +.Bk -words +.Op Ar rule_number +.Op Cm set Ar set_number +.Op Cm prob Ar match_probability +.Ar action +.Op Cm log Op Cm logamount Ar number +.Op Cm altq Ar queue +.Oo +.Bro Cm tag | untag +.Brc Ar number +.Oc +.Ar body +.Ek +.Ed +.Pp +where the body of the rule specifies which information is used +for filtering packets, among the following: +.Pp +.Bl -tag -width "Source and dest. addresses and ports" -offset XXX -compact +.It Layer-2 header fields +When available +.It IPv4 and IPv6 Protocol +TCP, UDP, ICMP, etc. +.It Source and dest. addresses and ports +.It Direction +See Section +.Sx PACKET FLOW +.It Transmit and receive interface +By name or address +.It Misc. IP header fields +Version, type of service, datagram length, identification, +fragment flag (non-zero IP offset), +Time To Live +.It IP options +.It IPv6 Extension headers +Fragmentation, Hop-by-Hop options, +Routing Headers, Source routing rthdr0, Mobile IPv6 rthdr2, IPSec options. +.It IPv6 Flow-ID +.It Misc. TCP header fields +TCP flags (SYN, FIN, ACK, RST, etc.), +sequence number, acknowledgment number, +window +.It TCP options +.It ICMP types +for ICMP packets +.It ICMP6 types +for ICMP6 packets +.It User/group ID +When the packet can be associated with a local socket. +.It Divert status +Whether a packet came from a divert socket (e.g., +.Xr natd 8 ) . +.It Fib annotation state +Whether a packet has been tagged for using a specific FIB (routing table) +in future forwarding decisions. 
+.El +.Pp +Note that some of the above information, e.g.\& source MAC or IP addresses and +TCP/UDP ports, can be easily spoofed, so filtering on those fields +alone might not guarantee the desired results. +.Bl -tag -width indent +.It Ar rule_number +Each rule is associated with a +.Ar rule_number +in the range 1..65535, with the latter reserved for the +.Em default +rule. +Rules are checked sequentially by rule number. +Multiple rules can have the same number, in which case they are +checked (and listed) according to the order in which they have +been added. +If a rule is entered without specifying a number, the kernel will +assign one in such a way that the rule becomes the last one +before the +.Em default +rule. +Automatic rule numbers are assigned by incrementing the last +non-default rule number by the value of the sysctl variable +.Ar net.inet.ip.fw.autoinc_step +which defaults to 100. +If this is not possible (e.g.\& because we would go beyond the +maximum allowed rule number), the number of the last +non-default value is used instead. +.It Cm set Ar set_number +Each rule is associated with a +.Ar set_number +in the range 0..31. +Sets can be individually disabled and enabled, so this parameter +is of fundamental importance for atomic ruleset manipulation. +It can be also used to simplify deletion of groups of rules. +If a rule is entered without specifying a set number, +set 0 will be used. +.br +Set 31 is special in that it cannot be disabled, +and rules in set 31 are not deleted by the +.Nm ipfw flush +command (but you can delete them with the +.Nm ipfw delete set 31 +command). +Set 31 is also used for the +.Em default +rule. +.It Cm prob Ar match_probability +A match is only declared with the specified probability +(floating point number between 0 and 1). +This can be useful for a number of applications such as +random packet drop or +(in conjunction with +.Nm dummynet ) +to simulate the effect of multiple paths leading to out-of-order +packet delivery. 
+.Pp +Note: this condition is checked before any other condition, including +ones such as keep-state or check-state which might have side effects. +.It Cm log Op Cm logamount Ar number +Packets matching a rule with the +.Cm log +keyword will be made available for logging in two ways: +if the sysctl variable +.Va net.inet.ip.fw.verbose +is set to 0 (default), one can use +.Xr bpf 4 +attached to the +.Li ipfw0 +pseudo interface. +This pseudo interface can be created after a boot +manually by using the following command: +.Bd -literal -offset indent +# ifconfig ipfw0 create +.Ed +.Pp +Or, automatically at boot time by adding the following +line to the +.Xr rc.conf 5 +file: +.Bd -literal -offset indent +firewall_logif="YES" +.Ed +.Pp +There is no overhead if no +.Xr bpf 4 +is attached to the pseudo interface. +.Pp +If +.Va net.inet.ip.fw.verbose +is set to 1, packets will be logged to +.Xr syslogd 8 +with a +.Dv LOG_SECURITY +facility up to a maximum of +.Cm logamount +packets. +If no +.Cm logamount +is specified, the limit is taken from the sysctl variable +.Va net.inet.ip.fw.verbose_limit . +In both cases, a value of 0 means unlimited logging. +.Pp +Once the limit is reached, logging can be re-enabled by +clearing the logging counter or the packet counter for that entry, see the +.Cm resetlog +command. +.Pp +Note: logging is done after all other packet matching conditions +have been successfully verified, and before performing the final +action (accept, deny, etc.) on the packet. +.It Cm tag Ar number +When a packet matches a rule with the +.Cm tag +keyword, the numeric tag for the given +.Ar number +in the range 1..65534 will be attached to the packet. +The tag acts as an internal marker (it is not sent out over +the wire) that can be used to identify these packets later on. +This can be used, for example, to provide trust between interfaces +and to start doing policy-based filtering. +A packet can have multiple tags at the same time. 
+Tags are "sticky", meaning once a tag is applied to a packet by a +matching rule it exists until explicit removal. +Tags are kept with the packet everywhere within the kernel, but are +lost when packet leaves the kernel, for example, on transmitting +packet out to the network or sending packet to a +.Xr divert 4 +socket. +.Pp +To check for previously applied tags, use the +.Cm tagged +rule option. +To delete previously applied tag, use the +.Cm untag +keyword. +.Pp +Note: since tags are kept with the packet everywhere in kernelspace, +they can be set and unset anywhere in the kernel network subsystem +(using the +.Xr mbuf_tags 9 +facility), not only by means of the +.Xr ipfw 4 +.Cm tag +and +.Cm untag +keywords. +For example, there can be a specialized +.Xr netgraph 4 +node doing traffic analyzing and tagging for later inspecting +in firewall. +.It Cm untag Ar number +When a packet matches a rule with the +.Cm untag +keyword, the tag with the number +.Ar number +is searched among the tags attached to this packet and, +if found, removed from it. +Other tags bound to packet, if present, are left untouched. +.It Cm altq Ar queue +When a packet matches a rule with the +.Cm altq +keyword, the ALTQ identifier for the given +.Ar queue +(see +.Xr altq 4 ) +will be attached. +Note that this ALTQ tag is only meaningful for packets going "out" of IPFW, +and not being rejected or going to divert sockets. +Note that if there is insufficient memory at the time the packet is +processed, it will not be tagged, so it is wise to make your ALTQ +"default" queue policy account for this. +If multiple +.Cm altq +rules match a single packet, only the first one adds the ALTQ classification +tag. +In doing so, traffic may be shaped by using +.Cm count Cm altq Ar queue +rules for classification early in the ruleset, then later applying +the filtering decision. 
+For example, +.Cm check-state +and +.Cm keep-state +rules may come later and provide the actual filtering decisions in +addition to the fallback ALTQ tag. +.Pp +You must run +.Xr pfctl 8 +to set up the queues before IPFW will be able to look them up by name, +and if the ALTQ disciplines are rearranged, the rules containing the +queue identifiers in the kernel will likely have gone stale and need +to be reloaded. +Stale queue identifiers will probably result in misclassification. +.Pp +All system ALTQ processing can be turned on or off via +.Nm +.Cm enable Ar altq +and +.Nm +.Cm disable Ar altq . +The usage of +.Va net.inet.ip.fw.one_pass +is irrelevant to ALTQ traffic shaping, as the actual rule action is followed +always after adding an ALTQ tag. +.El +.Ss RULE ACTIONS +A rule can be associated with one of the following actions, which +will be executed when the packet matches the body of the rule. +.Bl -tag -width indent +.It Cm allow | accept | pass | permit +Allow packets that match rule. +The search terminates. +.It Cm check-state +Checks the packet against the dynamic ruleset. +If a match is found, execute the action associated with +the rule which generated this dynamic rule, otherwise +move to the next rule. +.br +.Cm Check-state +rules do not have a body. +If no +.Cm check-state +rule is found, the dynamic ruleset is checked at the first +.Cm keep-state +or +.Cm limit +rule. +.It Cm count +Update counters for all packets that match rule. +The search continues with the next rule. +.It Cm deny | drop +Discard packets that match this rule. +The search terminates. +.It Cm divert Ar port +Divert packets that match this rule to the +.Xr divert 4 +socket bound to port +.Ar port . +The search terminates. +.It Cm fwd | forward Ar ipaddr | tablearg Ns Op , Ns Ar port +Change the next-hop on matching packets to +.Ar ipaddr , +which can be an IP address or a host name.
+For IPv4, the next hop can also be supplied by the last table +looked up for the packet by using the +.Cm tablearg +keyword instead of an explicit address. +The search terminates if this rule matches. +.Pp +If +.Ar ipaddr +is a local address, then matching packets will be forwarded to +.Ar port +(or the port number in the packet if one is not specified in the rule) +on the local machine. +.br +If +.Ar ipaddr +is not a local address, then the port number +(if specified) is ignored, and the packet will be +forwarded to the remote address, using the route as found in +the local routing table for that IP. +.br +A +.Ar fwd +rule will not match layer-2 packets (those received +on ether_input, ether_output, or bridged). +.br +The +.Cm fwd +action does not change the contents of the packet at all. +In particular, the destination address remains unmodified, so +packets forwarded to another system will usually be rejected by that system +unless there is a matching rule on that system to capture them. +For packets forwarded locally, +the local address of the socket will be +set to the original destination address of the packet. +This makes the +.Xr netstat 1 +entry look rather weird but is intended for +use with transparent proxy servers. +.It Cm nat Ar nat_nr | tablearg +Pass packet to a +nat instance +(for network address translation, address redirect, etc.): +see the +.Sx NETWORK ADDRESS TRANSLATION (NAT) +Section for further information. +.It Cm pipe Ar pipe_nr +Pass packet to a +.Nm dummynet +.Dq pipe +(for bandwidth limitation, delay, etc.). +See the +.Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION +Section for further information. +The search terminates; however, on exit from the pipe and if +the +.Xr sysctl 8 +variable +.Va net.inet.ip.fw.one_pass +is not set, the packet is passed again to the firewall code +starting from the next rule. +.It Cm queue Ar queue_nr +Pass packet to a +.Nm dummynet +.Dq queue +(for bandwidth limitation using WF2Q+). 
+.It Cm reject +(Deprecated). +Synonym for +.Cm unreach host . +.It Cm reset +Discard packets that match this rule, and if the +packet is a TCP packet, try to send a TCP reset (RST) notice. +The search terminates. +.It Cm reset6 +Discard packets that match this rule, and if the +packet is a TCP packet, try to send a TCP reset (RST) notice. +The search terminates. +.It Cm skipto Ar number | tablearg +Skip all subsequent rules numbered less than +.Ar number . +The search continues with the first rule numbered +.Ar number +or higher. +It is possible to use the +.Cm tablearg +keyword with a skipto for a +.Em computed +skipto, but care should be used, as no destination caching +is possible in this case so the rules are always walked to find it, +starting from the +.Cm skipto . +.It Cm call Ar number | tablearg +The current rule number is saved in the internal stack and +ruleset processing continues with the first rule numbered +.Ar number +or higher. +If later a rule with the +.Cm return +action is encountered, the processing returns to the first rule +with number of this +.Cm call +rule plus one or higher +(the same behaviour as with packets returning from +.Xr divert 4 +socket after a +.Cm divert +action). +This could be used to make somewhat like an assembly language +.Dq subroutine +calls to rules with common checks for different interfaces, etc. +.Pp +Rule with any number could be called, not just forward jumps as with +.Cm skipto . +So, to prevent endless loops in case of mistakes, both +.Cm call +and +.Cm return +actions don't do any jumps and simply go to the next rule if memory +cannot be allocated or stack overflowed/underflowed. +.Pp +Internally stack for rule numbers is implemented using +.Xr mbuf_tags 9 +facility and currently has size of 16 entries. +As mbuf tags are lost when packet leaves the kernel, +.Cm divert +should not be used in subroutines to avoid endless loops +and other undesired effects. 
+.It Cm return +Takes rule number saved to internal stack by the last +.Cm call +action and returns ruleset processing to the first rule +with number greater than number of corresponding +.Cm call +rule. +See description of the +.Cm call +action for more details. +.Pp +Note that +.Cm return +rules usually end a +.Dq subroutine +and thus are unconditional, but +.Nm +command-line utility currently requires every action except +.Cm check-state +to have body. +While it is sometimes useful to return only on some packets, +usually you want to print just +.Dq return +for readability. +A workaround for this is to use new syntax and +.Fl c +switch: +.Bd -literal -offset indent +# Add a rule without actual body +ipfw add 2999 return via any + +# List rules without "from any to any" part +ipfw -c list +.Ed +.Pp +This cosmetic annoyance may be fixed in future releases. +.It Cm tee Ar port +Send a copy of packets matching this rule to the +.Xr divert 4 +socket bound to port +.Ar port . +The search continues with the next rule. +.It Cm unreach Ar code +Discard packets that match this rule, and try to send an ICMP +unreachable notice with code +.Ar code , +where +.Ar code +is a number from 0 to 255, or one of these aliases: +.Cm net , host , protocol , port , +.Cm needfrag , srcfail , net-unknown , host-unknown , +.Cm isolated , net-prohib , host-prohib , tosnet , +.Cm toshost , filter-prohib , host-precedence +or +.Cm precedence-cutoff . +The search terminates. +.It Cm unreach6 Ar code +Discard packets that match this rule, and try to send an ICMPv6 +unreachable notice with code +.Ar code , +where +.Ar code +is one of the numbers 0, 1, 3 or 4, or one of these aliases: +.Cm no-route , admin-prohib , address +or +.Cm port . +The search terminates. +.It Cm netgraph Ar cookie +Divert packet into netgraph with given +.Ar cookie . +The search terminates.
+If packet is later returned from netgraph it is either +accepted or continues with the next rule, depending on +.Va net.inet.ip.fw.one_pass +sysctl variable. +.It Cm ngtee Ar cookie +A copy of packet is diverted into netgraph, original +packet continues with the next rule. +See +.Xr ng_ipfw 4 +for more information on +.Cm netgraph +and +.Cm ngtee +actions. +.It Cm setfib Ar fibnum | tablearg +The packet is tagged so as to use the FIB (routing table) +.Ar fibnum +in any subsequent forwarding decisions. +In the current implementation, this is limited to the values 0 through 15, see +.Xr setfib 2 . +Processing continues at the next rule. +It is possible to use the +.Cm tablearg +keyword with setfib. +If the tablearg value is not within the compiled range of fibs, +the packet's fib is set to 0. +.It Cm setdscp Ar DSCP | number | tablearg +Set specified DiffServ codepoint for an IPv4/IPv6 packet. +Processing continues at the next rule. +Supported values are: +.Pp +.Cm CS0 +.Pq Dv 000000 , +.Cm CS1 +.Pq Dv 001000 , +.Cm CS2 +.Pq Dv 010000 , +.Cm CS3 +.Pq Dv 011000 , +.Cm CS4 +.Pq Dv 100000 , +.Cm CS5 +.Pq Dv 101000 , +.Cm CS6 +.Pq Dv 110000 , +.Cm CS7 +.Pq Dv 111000 , +.Cm AF11 +.Pq Dv 001010 , +.Cm AF12 +.Pq Dv 001100 , +.Cm AF13 +.Pq Dv 001110 , +.Cm AF21 +.Pq Dv 010010 , +.Cm AF22 +.Pq Dv 010100 , +.Cm AF23 +.Pq Dv 010110 , +.Cm AF31 +.Pq Dv 011010 , +.Cm AF32 +.Pq Dv 011100 , +.Cm AF33 +.Pq Dv 011110 , +.Cm AF41 +.Pq Dv 100010 , +.Cm AF42 +.Pq Dv 100100 , +.Cm AF43 +.Pq Dv 100110 , +.Cm EF +.Pq Dv 101110 , +.Cm BE +.Pq Dv 000000 . +Additionally, DSCP value can be specified by number (0..63). +It is also possible to use the +.Cm tablearg +keyword with setdscp. +If the tablearg value is not within the 0..63 range, lower 6 bits of supplied +value are used. +.It Cm reass +Queue and reassemble IP fragments. +If the packet is not fragmented, counters are updated and +processing continues with the next rule.
+If the packet is the last logical fragment, the packet is reassembled and, if +.Va net.inet.ip.fw.one_pass +is set to 0, processing continues with the next rule. +Otherwise, the packet is allowed to pass and the search terminates. +If the packet is a fragment in the middle of a logical group of fragments, +it is consumed and +processing stops immediately. +.Pp +Fragment handling can be tuned via +.Va net.inet.ip.maxfragpackets +and +.Va net.inet.ip.maxfragsperpacket +which limit, respectively, the maximum number of processable +fragments (default: 800) and +the maximum number of fragments per packet (default: 16). +.Pp +NOTA BENE: since fragments do not contain port numbers, +they should be avoided with the +.Nm reass +rule. +Alternatively, direction-based (like +.Nm in +/ +.Nm out +) and source-based (like +.Nm via +) match patterns can be used to select fragments. +.Pp +Usually a simple rule like: +.Bd -literal -offset indent +# reassemble incoming fragments +ipfw add reass all from any to any in +.Ed +.Pp +is all you need at the beginning of your ruleset. +.El +.Ss RULE BODY +The body of a rule contains zero or more patterns (such as +specific source and destination addresses or ports, +protocol options, incoming or outgoing interfaces, etc.) +that the packet must match in order to be recognised. +In general, the patterns are connected by (implicit) +.Cm and +operators -- i.e., all must match in order for the +rule to match. +Individual patterns can be prefixed by the +.Cm not +operator to reverse the result of the match, as in +.Pp +.Dl "ipfw add 100 allow ip from not 1.2.3.4 to any" +.Pp +Additionally, sets of alternative match patterns +.Pq Em or-blocks +can be constructed by putting the patterns in +lists enclosed between parentheses ( ) or braces { }, and +using the +.Cm or +operator as follows: +.Pp +.Dl "ipfw add 100 allow ip from { x or not y or z } to any" +.Pp +Only one level of parentheses is allowed. 
+Beware that most shells have special meanings for parentheses +or braces, so it is advisable to put a backslash \\ in front of them +to prevent such interpretations. +.Pp +The body of a rule must in general include a source and destination +address specifier. +The keyword +.Ar any +can be used in various places to specify that the content of +a required field is irrelevant. +.Pp +The rule body has the following format: +.Bd -ragged -offset indent +.Op Ar proto Cm from Ar src Cm to Ar dst +.Op Ar options +.Ed +.Pp +The first part (proto from src to dst) is for backward +compatibility with earlier versions of +.Fx . +In modern +.Fx +any match pattern (including MAC headers, IP protocols, +addresses and ports) can be specified in the +.Ar options +section. +.Pp +Rule fields have the following meaning: +.Bl -tag -width indent +.It Ar proto : protocol | Cm { Ar protocol Cm or ... } +.It Ar protocol : Oo Cm not Oc Ar protocol-name | protocol-number +An IP protocol specified by number or name +(for a complete list see +.Pa /etc/protocols ) , +or one of the following keywords: +.Bl -tag -width indent +.It Cm ip4 | ipv4 +Matches IPv4 packets. +.It Cm ip6 | ipv6 +Matches IPv6 packets. +.It Cm ip | all +Matches any packet. +.El +.Pp +The +.Cm ipv6 +in +.Cm proto +option will be treated as inner protocol. +And, the +.Cm ipv4 +is not available in +.Cm proto +option. +.Pp +The +.Cm { Ar protocol Cm or ... } +format (an +.Em or-block ) +is provided for convenience only but its use is deprecated. +.It Ar src No and Ar dst : Bro Cm addr | Cm { Ar addr Cm or ... } Brc Op Oo Cm not Oc Ar ports +An address (or a list, see below) +optionally followed by +.Ar ports +specifiers. +.Pp +The second format +.Em ( or-block +with multiple addresses) is provided for convenience only and +its use is discouraged. 
+.It Ar addr : Oo Cm not Oc Bro +.Cm any | me | me6 | +.Cm table Ns Pq Ar number Ns Op , Ns Ar value +.Ar | addr-list | addr-set +.Brc +.Bl -tag -width indent +.It Cm any +matches any IP address. +.It Cm me +matches any IP address configured on an interface in the system. +.It Cm me6 +matches any IPv6 address configured on an interface in the system. +The address list is evaluated at the time the packet is +analysed. +.It Cm table Ns Pq Ar number Ns Op , Ns Ar value +Matches any IPv4 address for which an entry exists in the lookup table +.Ar number . +If an optional 32-bit unsigned +.Ar value +is also specified, an entry will match only if it has this value. +See the +.Sx LOOKUP TABLES +section below for more information on lookup tables. +.El +.It Ar addr-list : ip-addr Ns Op Ns , Ns Ar addr-list +.It Ar ip-addr : +A host or subnet address specified in one of the following ways: +.Bl -tag -width indent +.It Ar numeric-ip | hostname +Matches a single IPv4 address, specified as dotted-quad or a hostname. +Hostnames are resolved at the time the rule is added to the firewall list. +.It Ar addr Ns / Ns Ar masklen +Matches all addresses with base +.Ar addr +(specified as an IP address, a network number, or a hostname) +and mask width of +.Cm masklen +bits. +As an example, 1.2.3.4/25 or 1.2.3.0/25 will match +all IP numbers from 1.2.3.0 to 1.2.3.127 . +.It Ar addr Ns : Ns Ar mask +Matches all addresses with base +.Ar addr +(specified as an IP address, a network number, or a hostname) +and the mask of +.Ar mask , +specified as a dotted quad. +As an example, 1.2.3.4:255.0.255.0 or 1.0.3.0:255.0.255.0 will match +1.*.3.*. +This form is advised only for non-contiguous +masks. +It is better to resort to the +.Ar addr Ns / Ns Ar masklen +format for contiguous masks, which is more compact and less +error-prone. 
+.El +.It Ar addr-set : addr Ns Oo Ns / Ns Ar masklen Oc Ns Cm { Ns Ar list Ns Cm } +.It Ar list : Bro Ar num | num-num Brc Ns Op Ns , Ns Ar list +Matches all addresses with base address +.Ar addr +(specified as an IP address, a network number, or a hostname) +and whose last byte is in the list between braces { } . +Note that there must be no spaces between braces and +numbers (spaces after commas are allowed). +Elements of the list can be specified as single entries +or ranges. +The +.Ar masklen +field is used to limit the size of the set of addresses, +and can have any value between 24 and 32. +If not specified, +it will be assumed as 24. +.br +This format is particularly useful to handle sparse address sets +within a single rule. +Because the matching occurs using a +bitmask, it takes constant time and dramatically reduces +the complexity of rulesets. +.br +As an example, an address specified as 1.2.3.4/24{128,35-55,89} +or 1.2.3.0/24{128,35-55,89} +will match the following IP addresses: +.br +1.2.3.128, 1.2.3.35 to 1.2.3.55, 1.2.3.89 . +.It Ar addr6-list : ip6-addr Ns Op Ns , Ns Ar addr6-list +.It Ar ip6-addr : +A host or subnet specified in one of the following ways: +.Bl -tag -width indent +.It Ar numeric-ip | hostname +Matches a single IPv6 address as allowed by +.Xr inet_pton 3 +or a hostname. +Hostnames are resolved at the time the rule is added to the firewall +list. +.It Ar addr Ns / Ns Ar masklen +Matches all IPv6 addresses with base +.Ar addr +(specified as allowed by +.Xr inet_pton 3 +or a hostname) +and mask width of +.Cm masklen +bits. +.El +.Pp +No support for sets of IPv6 addresses is provided because IPv6 addresses +are typically random past the initial prefix. +.It Ar ports : Bro Ar port | port Ns \&- Ns Ar port Ns Brc Ns Op , Ns Ar ports +For protocols which support port numbers (such as TCP and UDP), optional +.Cm ports +may be specified as one or more ports or port ranges, separated +by commas but no spaces, and an optional +.Cm not +operator.
+The +.Ql \&- +notation specifies a range of ports (including boundaries). +.Pp +Service names (from +.Pa /etc/services ) +may be used instead of numeric port values. +The length of the port list is limited to 30 ports or ranges, +though one can specify larger ranges by using an +.Em or-block +in the +.Cm options +section of the rule. +.Pp +A backslash +.Pq Ql \e +can be used to escape the dash +.Pq Ql - +character in a service name (from a shell, the backslash must be +typed twice to avoid the shell itself interpreting it as an escape +character). +.Pp +.Dl "ipfw add count tcp from any ftp\e\e-data-ftp to any" +.Pp +Fragmented packets which have a non-zero offset (i.e., not the first +fragment) will never match a rule which has one or more port +specifications. +See the +.Cm frag +option for details on matching fragmented packets. +.El +.Ss RULE OPTIONS (MATCH PATTERNS) +Additional match patterns can be used within +rules. +Zero or more of these so-called +.Em options +can be present in a rule, optionally prefixed by the +.Cm not +operand, and possibly grouped into +.Em or-blocks . +.Pp +The following match patterns can be used (listed in alphabetical order): +.Bl -tag -width indent +.It Cm // this is a comment. +Inserts the specified text as a comment in the rule. +Everything following // is considered as a comment and stored in the rule. +You can have comment-only rules, which are listed as having a +.Cm count +action followed by the comment. +.It Cm bridged +Alias for +.Cm layer2 . +.It Cm diverted +Matches only packets generated by a divert socket. +.It Cm diverted-loopback +Matches only packets coming from a divert socket back into the IP stack +input for delivery. +.It Cm diverted-output +Matches only packets going from a divert socket back outward to the IP +stack output for delivery. +.It Cm dst-ip Ar ip-address +Matches IPv4 packets whose destination IP is one of the address(es) +specified as argument. 
+.It Bro Cm dst-ip6 | dst-ipv6 Brc Ar ip6-address +Matches IPv6 packets whose destination IP is one of the address(es) +specified as argument. +.It Cm dst-port Ar ports +Matches IP packets whose destination port is one of the port(s) +specified as argument. +.It Cm established +Matches TCP packets that have the RST or ACK bits set. +.It Cm ext6hdr Ar header +Matches IPv6 packets containing the extended header given by +.Ar header . +Supported headers are: +.Pp +Fragment, +.Pq Cm frag , +Hop-by-hop options +.Pq Cm hopopt , +any type of Routing Header +.Pq Cm route , +Source routing Routing Header Type 0 +.Pq Cm rthdr0 , +Mobile IPv6 Routing Header Type 2 +.Pq Cm rthdr2 , +Destination options +.Pq Cm dstopt , +IPsec authentication headers +.Pq Cm ah , +and IPsec encapsulated security payload headers +.Pq Cm esp . +.It Cm fib Ar fibnum +Matches a packet that has been tagged to use +the given FIB (routing table) number. +.It Cm flow-id Ar labels +Matches IPv6 packets containing any of the flow labels given in +.Ar labels . +.Ar labels +is a comma separated list of numeric flow labels. +.It Cm frag +Matches packets that are fragments and not the first +fragment of an IP datagram. +Note that these packets will not have +the next protocol header (e.g.\& TCP, UDP) so options that look into +these headers cannot match. +.It Cm gid Ar group +Matches all TCP or UDP packets sent by or received for a +.Ar group . +A +.Ar group +may be specified by name or number. +.It Cm jail Ar prisonID +Matches all TCP or UDP packets sent by or received for the +jail whose prison ID is +.Ar prisonID . +.It Cm icmptypes Ar types +Matches ICMP packets whose ICMP type is in the list +.Ar types . +The list may be specified as any combination of +individual types (numeric) separated by commas. +.Em Ranges are not allowed .
+The supported ICMP types are: +.Pp +echo reply +.Pq Cm 0 , +destination unreachable +.Pq Cm 3 , +source quench +.Pq Cm 4 , +redirect +.Pq Cm 5 , +echo request +.Pq Cm 8 , +router advertisement +.Pq Cm 9 , +router solicitation +.Pq Cm 10 , +time-to-live exceeded +.Pq Cm 11 , +IP header bad +.Pq Cm 12 , +timestamp request +.Pq Cm 13 , +timestamp reply +.Pq Cm 14 , +information request +.Pq Cm 15 , +information reply +.Pq Cm 16 , +address mask request +.Pq Cm 17 +and address mask reply +.Pq Cm 18 . +.It Cm icmp6types Ar types +Matches ICMP6 packets whose ICMP6 type is in the list of +.Ar types . +The list may be specified as any combination of +individual types (numeric) separated by commas. +.Em Ranges are not allowed . +.It Cm in | out +Matches incoming or outgoing packets, respectively. +.Cm in +and +.Cm out +are mutually exclusive (in fact, +.Cm out +is implemented as +.Cm not in Ns No ). +.It Cm ipid Ar id-list +Matches IPv4 packets whose +.Cm ip_id +field has value included in +.Ar id-list , +which is either a single value or a list of values or ranges +specified in the same way as +.Ar ports . +.It Cm iplen Ar len-list +Matches IP packets whose total length, including header and data, is +in the set +.Ar len-list , +which is either a single value or a list of values or ranges +specified in the same way as +.Ar ports . +.It Cm ipoptions Ar spec +Matches packets whose IPv4 header contains the comma separated list of +options specified in +.Ar spec . +The supported IP options are: +.Pp +.Cm ssrr +(strict source route), +.Cm lsrr +(loose source route), +.Cm rr +(record packet route) and +.Cm ts +(timestamp). +The absence of a particular option may be denoted +with a +.Ql \&! . +.It Cm ipprecedence Ar precedence +Matches IPv4 packets whose precedence field is equal to +.Ar precedence . 
+.It Cm ipsec +Matches packets that have IPSEC history associated with them +(i.e., the packet comes encapsulated in IPSEC, the kernel +has IPSEC support and IPSEC_FILTERTUNNEL option, and can correctly +decapsulate it). +.Pp +Note that specifying +.Cm ipsec +is different from specifying +.Cm proto Ar ipsec +as the latter will only look at the specific IP protocol field, +irrespective of IPSEC kernel support and the validity of the IPSEC data. +.Pp +Further note that this flag is silently ignored in kernels without +IPSEC support. +It does not affect rule processing when given and the +rules are handled as if with no +.Cm ipsec +flag. +.It Cm iptos Ar spec +Matches IPv4 packets whose +.Cm tos +field contains the comma separated list of +service types specified in +.Ar spec . +The supported IP types of service are: +.Pp +.Cm lowdelay +.Pq Dv IPTOS_LOWDELAY , +.Cm throughput +.Pq Dv IPTOS_THROUGHPUT , +.Cm reliability +.Pq Dv IPTOS_RELIABILITY , +.Cm mincost +.Pq Dv IPTOS_MINCOST , +.Cm congestion +.Pq Dv IPTOS_ECN_CE . +The absence of a particular type may be denoted +with a +.Ql \&! . +.It Cm dscp spec Ns Op , Ns Ar spec +Matches IPv4/IPv6 packets whose +.Cm DS +field value is contained in +.Ar spec +mask. +Multiple values can be specified via +the comma separated list. +Value can be one of keywords used in +.Cm setdscp +action or exact number. +.It Cm ipttl Ar ttl-list +Matches IPv4 packets whose time to live is included in +.Ar ttl-list , +which is either a single value or a list of values or ranges +specified in the same way as +.Ar ports . +.It Cm ipversion Ar ver +Matches IP packets whose IP version field is +.Ar ver . +.It Cm keep-state +Upon a match, the firewall will create a dynamic rule, whose +default behaviour is to match bidirectional traffic between +source and destination IP/port using the same protocol. 
+The rule has a limited lifetime (controlled by a set of +.Xr sysctl 8 +variables), and the lifetime is refreshed every time a matching +packet is found. +.It Cm layer2 +Matches only layer2 packets, i.e., those passed to +.Nm +from ether_demux() and ether_output_frame(). +.It Cm limit Bro Cm src-addr | src-port | dst-addr | dst-port Brc Ar N +The firewall will only allow +.Ar N +connections with the same +set of parameters as specified in the rule. +One or more +of source and destination addresses and ports can be +specified. +Currently, +only IPv4 flows are supported. +.It Cm lookup Bro Cm dst-ip | dst-port | src-ip | src-port | uid | jail Brc Ar N +Search an entry in lookup table +.Ar N +that matches the field specified as argument. +If not found, the match fails. +Otherwise, the match succeeds and +.Cm tablearg +is set to the value extracted from the table. +.Pp +This option can be useful to quickly dispatch traffic based on +certain packet fields. +See the +.Sx LOOKUP TABLES +section below for more information on lookup tables. +.It Cm { MAC | mac } Ar dst-mac src-mac +Match packets with a given +.Ar dst-mac +and +.Ar src-mac +addresses, specified as the +.Cm any +keyword (matching any MAC address), or six groups of hex digits +separated by colons, +and optionally followed by a mask indicating the significant bits. +The mask may be specified using either of the following methods: +.Bl -enum -width indent +.It +A slash +.Pq / +followed by the number of significant bits. +For example, an address with 33 significant bits could be specified as: +.Pp +.Dl "MAC 10:20:30:40:50:60/33 any" +.Pp +.It +An ampersand +.Pq & +followed by a bitmask specified as six groups of hex digits separated +by colons. +For example, an address in which the last 16 bits are significant could +be specified as: +.Pp +.Dl "MAC 10:20:30:40:50:60&00:00:00:00:ff:ff any" +.Pp +Note that the ampersand character has a special meaning in many shells +and should generally be escaped. 
+.Pp +.El +Note that the order of MAC addresses (destination first, +source second) is +the same as on the wire, but the opposite of the one used for +IP addresses. +.It Cm mac-type Ar mac-type +Matches packets whose Ethernet Type field +corresponds to one of those specified as argument. +.Ar mac-type +is specified in the same way as +.Cm port numbers +(i.e., one or more comma-separated single values or ranges). +You can use symbolic names for known values such as +.Em vlan , ipv4, ipv6 . +Values can be entered as decimal or hexadecimal (if prefixed by 0x), +and they are always printed as hexadecimal (unless the +.Cm -N +option is used, in which case symbolic resolution will be attempted). +.It Cm proto Ar protocol +Matches packets with the corresponding IP protocol. +.It Cm recv | xmit | via Brq Ar ifX | Ar if Ns Cm * | Ar table Ns Pq Ar number Ns Op , Ns Ar value | Ar ipno | Ar any +Matches packets received, transmitted or going through, +respectively, the interface specified by exact name +.Po Ar ifX Pc , +by device name +.Po Ar if* Pc , +by IP address, or through some interface. +.Pp +The +.Cm via +keyword causes the interface to always be checked. +If +.Cm recv +or +.Cm xmit +is used instead of +.Cm via , +then only the receive or transmit interface (respectively) +is checked. +By specifying both, it is possible to match packets based on +both receive and transmit interface, e.g.: +.Pp +.Dl "ipfw add deny ip from any to any out recv ed0 xmit ed1" +.Pp +The +.Cm recv +interface can be tested on either incoming or outgoing packets, +while the +.Cm xmit +interface can only be tested on outgoing packets. +So +.Cm out +is required (and +.Cm in +is invalid) whenever +.Cm xmit +is used. +.Pp +A packet might not have a receive or transmit interface: packets +originating from the local host have no receive interface, +while packets destined for the local host have no transmit +interface. +.It Cm setup +Matches TCP packets that have the SYN bit set but no ACK bit. 
+This is the short form of +.Dq Li tcpflags\ syn,!ack . +.It Cm sockarg +Matches packets that are associated to a local socket and +for which the SO_USER_COOKIE socket option has been set +to a non-zero value. +As a side effect, the value of the +option is made available as +.Cm tablearg +value, which in turn can be used as +.Cm skipto +or +.Cm pipe +number. +.It Cm src-ip Ar ip-address +Matches IPv4 packets whose source IP is one of the address(es) +specified as an argument. +.It Cm src-ip6 Ar ip6-address +Matches IPv6 packets whose source IP is one of the address(es) +specified as an argument. +.It Cm src-port Ar ports +Matches IP packets whose source port is one of the port(s) +specified as argument. +.It Cm tagged Ar tag-list +Matches packets whose tags are included in +.Ar tag-list , +which is either a single value or a list of values or ranges +specified in the same way as +.Ar ports . +Tags can be applied to the packet using +.Cm tag +rule action parameter (see its description for details on tags). +.It Cm tcpack Ar ack +TCP packets only. +Match if the TCP header acknowledgment number field is set to +.Ar ack . +.It Cm tcpdatalen Ar tcpdatalen-list +Matches TCP packets whose length of TCP data is +.Ar tcpdatalen-list , +which is either a single value or a list of values or ranges +specified in the same way as +.Ar ports . +.It Cm tcpflags Ar spec +TCP packets only. +Match if the TCP header contains the comma separated list of +flags specified in +.Ar spec . +The supported TCP flags are: +.Pp +.Cm fin , +.Cm syn , +.Cm rst , +.Cm psh , +.Cm ack +and +.Cm urg . +The absence of a particular flag may be denoted +with a +.Ql \&! . +A rule which contains a +.Cm tcpflags +specification can never match a fragmented packet which has +a non-zero offset. +See the +.Cm frag +option for details on matching fragmented packets. +.It Cm tcpseq Ar seq +TCP packets only. +Match if the TCP header sequence number field is set to +.Ar seq .
+.It Cm tcpwin Ar tcpwin-list +Matches TCP packets whose header window field is set to +.Ar tcpwin-list , +which is either a single value or a list of values or ranges +specified in the same way as +.Ar ports . +.It Cm tcpoptions Ar spec +TCP packets only. +Match if the TCP header contains the comma separated list of +options specified in +.Ar spec . +The supported TCP options are: +.Pp +.Cm mss +(maximum segment size), +.Cm window +(tcp window advertisement), +.Cm sack +(selective ack), +.Cm ts +(rfc1323 timestamp) and +.Cm cc +(rfc1644 t/tcp connection count). +The absence of a particular option may be denoted +with a +.Ql \&! . +.It Cm uid Ar user +Match all TCP or UDP packets sent by or received for a +.Ar user . +A +.Ar user +may be matched by name or identification number. +.It Cm verrevpath +For incoming packets, +a routing table lookup is done on the packet's source address. +If the interface on which the packet entered the system matches the +outgoing interface for the route, +the packet matches. +If the interfaces do not match up, +the packet does not match. +All outgoing packets or packets with no incoming interface match. +.Pp +The name and functionality of the option is intentionally similar to +the Cisco IOS command: +.Pp +.Dl ip verify unicast reverse-path +.Pp +This option can be used to make anti-spoofing rules to reject all +packets with source addresses not from this interface. +See also the option +.Cm antispoof . +.It Cm versrcreach +For incoming packets, +a routing table lookup is done on the packet's source address. +If a route to the source address exists, but not the default route +or a blackhole/reject route, the packet matches. +Otherwise, the packet does not match. +All outgoing packets match. 
+.Pp +The name and functionality of the option is intentionally similar to +the Cisco IOS command: +.Pp +.Dl ip verify unicast source reachable-via any +.Pp +This option can be used to make anti-spoofing rules to reject all +packets whose source address is unreachable. +.It Cm antispoof +For incoming packets, the packet's source address is checked if it +belongs to a directly connected network. +If the network is directly connected, then the interface the packet +came in on is compared to the interface the network is connected to. +When the incoming interface and the directly connected interface are not the +same, the packet does not match. +Otherwise, the packet does match. +All outgoing packets match. +.Pp +This option can be used to make anti-spoofing rules to reject all +packets that pretend to be from a directly connected network but do +not come in through that interface. +This option is similar to but more restricted than +.Cm verrevpath +because it engages only on packets with source addresses of directly +connected networks instead of all source addresses. +.El +.Sh LOOKUP TABLES +Lookup tables are useful to handle large sparse sets of +addresses or other search keys (e.g., ports, jail IDs, interface names). +In the rest of this section we will use the term ``address''. +There may be up to 65535 different lookup tables, numbered 0 to 65534. +.Pp +Each entry is represented by an +.Ar addr Ns Op / Ns Ar masklen +and will match all addresses with base +.Ar addr +(specified as an IPv4/IPv6 address, a hostname or an unsigned integer) +and mask width of +.Ar masklen +bits. +If +.Ar masklen +is not specified, it defaults to 32 for IPv4 and 128 for IPv6. +When looking up an IP address in a table, the most specific +entry will match. +Associated with each entry is a 32-bit unsigned +.Ar value , +which can optionally be checked by a rule matching code. +When adding an entry, if +.Ar value +is not specified, it defaults to 0.
+.Pp +An entry can be added to a table +.Pq Cm add , +or removed from a table +.Pq Cm delete . +A table can be examined +.Pq Cm list +or flushed +.Pq Cm flush . +.Pp +Internally, each table is stored in a Radix tree, the same way as +the routing table (see +.Xr route 4 ) . +.Pp +Lookup tables currently support only ports, jail IDs, IPv4/IPv6 addresses +and interface names. +Wildcards are not supported for interface names. +.Pp +The +.Cm tablearg +feature provides the ability to use a value, looked up in the table, as +the argument for a rule action, action parameter or rule option. +This can significantly reduce the number of rules in some configurations. +If two tables are used in a rule, the result of the second (destination) +is used. +The +.Cm tablearg +argument can be used with the following actions: +.Cm nat, pipe , queue, divert, tee, netgraph, ngtee, fwd, skipto, setfib, +action parameters: +.Cm tag, untag, +rule options: +.Cm limit, tagged. +.Pp +When used with +.Cm fwd +it is possible to supply table entries with values +that are in the form of IP addresses or hostnames. +See the +.Sx EXAMPLES +Section for example usage of tables and the tablearg keyword. +.Pp +When used with the +.Cm skipto +action, the user should be aware that the code will walk the ruleset +up to a rule equal to, or past, the given number, +and should therefore try to keep the +ruleset compact between the skipto and the target rules. +.Sh SETS OF RULES +Each rule belongs to one of 32 different +.Em sets +, numbered 0 to 31. +Set 31 is reserved for the default rule. +.Pp +By default, rules are put in set 0, unless you use the +.Cm set N +attribute when entering a new rule. +Sets can be individually and atomically enabled or disabled, +so this mechanism permits an easy way to store multiple configurations +of the firewall and quickly (and atomically) switch between them. +The command to enable/disable sets is +.Bd -ragged -offset indent +.Nm +.Cm set Oo Cm disable Ar number ...
Oc Op Cm enable Ar number ... +.Ed +.Pp +where multiple +.Cm enable +or +.Cm disable +sections can be specified. +Command execution is atomic on all the sets specified in the command. +By default, all sets are enabled. +.Pp +When you disable a set, its rules behave as if they do not exist +in the firewall configuration, with only one exception: +.Bd -ragged -offset indent +dynamic rules created from a rule before it had been disabled +will still be active until they expire. +In order to delete +dynamic rules you have to explicitly delete the parent rule +which generated them. +.Ed +.Pp +The set number of rules can be changed with the command +.Bd -ragged -offset indent +.Nm +.Cm set move +.Brq Cm rule Ar rule-number | old-set +.Cm to Ar new-set +.Ed +.Pp +Also, you can atomically swap two rulesets with the command +.Bd -ragged -offset indent +.Nm +.Cm set swap Ar first-set second-set +.Ed +.Pp +See the +.Sx EXAMPLES +Section on some possible uses of sets of rules. +.Sh STATEFUL FIREWALL +Stateful operation is a way for the firewall to dynamically +create rules for specific flows when packets that +match a given pattern are detected. +Support for stateful +operation comes through the +.Cm check-state , keep-state +and +.Cm limit +options of +.Nm rules . +.Pp +Dynamic rules are created when a packet matches a +.Cm keep-state +or +.Cm limit +rule, causing the creation of a +.Em dynamic +rule which will match all and only packets with +a given +.Em protocol +between a +.Em src-ip/src-port dst-ip/dst-port +pair of addresses +.Em ( src +and +.Em dst +are used here only to denote the initial match addresses, but they +are completely equivalent afterwards). +Dynamic rules will be checked at the first +.Cm check-state, keep-state +or +.Cm limit +occurrence, and the action performed upon a match will be the same +as in the parent rule. +.Pp +Note that no additional attributes other than protocol and IP addresses +and ports are checked on dynamic rules. 
+.Pp +The typical use of dynamic rules is to keep a closed firewall configuration, +but let the first TCP SYN packet from the inside network install a +dynamic rule for the flow so that packets belonging to that session +will be allowed through the firewall: +.Pp +.Dl "ipfw add check-state" +.Dl "ipfw add allow tcp from my-subnet to any setup keep-state" +.Dl "ipfw add deny tcp from any to any" +.Pp +A similar approach can be used for UDP, where a UDP packet coming +from the inside will install a dynamic rule to let the response through +the firewall: +.Pp +.Dl "ipfw add check-state" +.Dl "ipfw add allow udp from my-subnet to any keep-state" +.Dl "ipfw add deny udp from any to any" +.Pp +Dynamic rules expire after some time, which depends on the status +of the flow and the setting of some +.Cm sysctl +variables. +See Section +.Sx SYSCTL VARIABLES +for more details. +For TCP sessions, dynamic rules can be instructed to periodically +send keepalive packets to refresh the state of the rule when it is +about to expire. +.Pp +See Section +.Sx EXAMPLES +for more examples on how to use dynamic rules. +.Sh TRAFFIC SHAPER (DUMMYNET) CONFIGURATION +.Nm +is also the user interface for the +.Nm dummynet +traffic shaper, packet scheduler and network emulator, a subsystem that +can artificially queue, delay or drop packets +emulating the behaviour of certain network links +or queueing systems. +.Pp +.Nm dummynet +operates by first using the firewall to select packets +using any match pattern that can be used in +.Nm +rules. +Matching packets are then passed to either of two +different objects, which implement the traffic regulation: +.Bl -hang -offset XXXX +.It Em pipe +A +.Em pipe +emulates a +.Em link +with given bandwidth and propagation delay, +driven by a FIFO scheduler and a single queue with programmable +queue size and packet loss rate.
+Packets are appended to the queue as they come out from +.Nm ipfw , +and then transferred in FIFO order to the link at the desired rate. +.It Em queue +A +.Em queue +is an abstraction used to implement packet scheduling +using one of several packet scheduling algorithms. +Packets sent to a +.Em queue +are first grouped into flows according to a mask on the 5-tuple. +Flows are then passed to the scheduler associated to the +.Em queue , +and each flow uses scheduling parameters (weight and others) +as configured in the +.Em queue +itself. +A scheduler in turn is connected to an emulated link, +and arbitrates the link's bandwidth among backlogged flows according to +weights and to the features of the scheduling algorithm in use. +.El +.Pp +In practice, +.Em pipes +can be used to set hard limits to the bandwidth that a flow can use, whereas +.Em queues +can be used to determine how different flows share the available bandwidth. +.Pp +A graphical representation of the binding of queues, +flows, schedulers and links is below. +.Bd -literal -offset indent + (flow_mask|sched_mask) sched_mask + +---------+ weight Wx +-------------+ + | |->-[flow]-->--| |-+ + -->--| QUEUE x | ... | | | + | |->-[flow]-->--| SCHEDuler N | | + +---------+ | | | + ... | +--[LINK N]-->-- + +---------+ weight Wy | | +--[LINK N]-->-- + | |->-[flow]-->--| | | + -->--| QUEUE y | ... | | | + | |->-[flow]-->--| | | + +---------+ +-------------+ | + +-------------+ +.Ed +It is important to understand the role of the SCHED_MASK +and FLOW_MASK, which are configured through the commands +.Dl "ipfw sched N config mask SCHED_MASK ..." +and +.Dl "ipfw queue X config mask FLOW_MASK ..." . +.Pp +The SCHED_MASK is used to assign flows to one or more +scheduler instances, one for each +value of the packet's 5-tuple after applying SCHED_MASK. +As an example, using ``src-ip 0xffffff00'' creates one instance +for each /24 source subnet.
+.Pp +The FLOW_MASK, together with the SCHED_MASK, is used to split +packets into flows. +As an example, using +``src-ip 0x000000ff'' +together with the previous SCHED_MASK makes a flow for +each individual source address. +In turn, flows for each /24 +subnet will be sent to the same scheduler instance. +.Pp +The above diagram holds even for the +.Em pipe +case, with the only restriction that a +.Em pipe +only supports a SCHED_MASK, and forces the use of a FIFO +scheduler (these are for backward compatibility reasons; +in fact, internally, a +.Nm dummynet's +pipe is implemented exactly as above). +.Pp +There are two modes of +.Nm dummynet +operation: +.Dq normal +and +.Dq fast . +The +.Dq normal +mode tries to emulate a real link: the +.Nm dummynet +scheduler ensures that the packet will not leave the pipe faster than it +would on the real link with a given bandwidth. +The +.Dq fast +mode allows certain packets to bypass the +.Nm dummynet +scheduler (if packet flow does not exceed pipe's bandwidth). +This is the reason why the +.Dq fast +mode requires less CPU cycles per packet (on average) and packet latency +can be significantly lower in comparison to a real link with the same +bandwidth. +The default mode is +.Dq normal . +The +.Dq fast +mode can be enabled by setting the +.Va net.inet.ip.dummynet.io_fast +.Xr sysctl 8 +variable to a non-zero value. +.Pp +.Ss PIPE, QUEUE AND SCHEDULER CONFIGURATION +The +.Em pipe , +.Em queue +and +.Em scheduler +configuration commands are the following: +.Bd -ragged -offset indent +.Cm pipe Ar number Cm config Ar pipe-configuration +.Pp +.Cm queue Ar number Cm config Ar queue-configuration +.Pp +.Cm sched Ar number Cm config Ar sched-configuration +.Ed +.Pp +The following parameters can be configured for a pipe: +.Pp +.Bl -tag -width indent -compact +.It Cm bw Ar bandwidth | device +Bandwidth, measured in +.Sm off +.Op Cm K | M +.Brq Cm bit/s | Byte/s . +.Sm on +.Pp +A value of 0 (default) means unlimited bandwidth. 
+The unit must immediately follow the number, as in +.Pp +.Dl "ipfw pipe 1 config bw 300Kbit/s" +.Pp +If a device name is specified instead of a numeric value, as in +.Pp +.Dl "ipfw pipe 1 config bw tun0" +.Pp +then the transmit clock is supplied by the specified device. +At the moment only the +.Xr tun 4 +device supports this +functionality, for use in conjunction with +.Xr ppp 8 . +.Pp +.It Cm delay Ar ms-delay +Propagation delay, measured in milliseconds. +The value is rounded to the next multiple of the clock tick +(typically 10ms, but it is a good practice to run kernels +with +.Dq "options HZ=1000" +to reduce +the granularity to 1ms or less). +The default value is 0, meaning no delay. +.Pp +.It Cm burst Ar size +If the data to be sent exceeds the pipe's bandwidth limit +(and the pipe was previously idle), up to +.Ar size +bytes of data are allowed to bypass the +.Nm dummynet +scheduler, and will be sent as fast as the physical link allows. +Any additional data will be transmitted at the rate specified +by the +.Nm pipe +bandwidth. +The burst size depends on how long the pipe has been idle; +the effective burst size is calculated as follows: +MAX( +.Ar size +, +.Nm bw +* pipe_idle_time). +.Pp +.It Cm profile Ar filename +A file specifying the additional overhead incurred in the transmission +of a packet on the link. +.Pp +Some link types introduce extra delays in the transmission +of a packet, e.g., because of MAC level framing, contention on +the use of the channel, MAC level retransmissions and so on. +From our point of view, the channel is effectively unavailable +for this extra time, which is constant or variable depending +on the link type. +Additionally, packets may be dropped after this +time (e.g., on a wireless link after too many retransmissions). +We can model the additional delay with an empirical curve +that represents its distribution. 
+.Bd -literal -offset indent + cumulative probability + 1.0 ^ + | + L +-- loss-level x + | ****** + | * + | ***** + | * + | ** + | * + +-------*-------------------> + delay +.Ed +The empirical curve may have both vertical and horizontal lines. +Vertical lines represent constant delay for a range of +probabilities. +Horizontal lines correspond to a discontinuity in the delay +distribution: the pipe will use the largest delay for a +given probability. +.Pp +The file format is the following, with whitespace acting as +a separator and '#' indicating the beginning of a comment: +.Bl -tag -width indent +.It Cm name Ar identifier +optional name (listed by "ipfw pipe show") +to identify the delay distribution; +.It Cm bw Ar value +the bandwidth used for the pipe. +If not specified here, it must be present +explicitly as a configuration parameter for the pipe; +.It Cm loss-level Ar L +the probability above which packets are lost. +(0.0 <= L <= 1.0, default 1.0 i.e., no loss); +.It Cm samples Ar N +the number of samples used in the internal +representation of the curve (2..1024; default 100); +.It Cm "delay prob" | "prob delay" +One of these two lines is mandatory and defines +the format of the following lines with data points. +.It Ar XXX Ar YYY +2 or more lines representing points in the curve, +with either delay or probability first, according +to the chosen format. +The unit for delay is milliseconds. +Data points do not need to be sorted. +Also, the number of actual lines can be different +from the value of the "samples" parameter: +.Nm +utility will sort and interpolate +the curve as needed. 
+.El +.Pp +Example of a profile file: +.Bd -literal -offset indent +name bla_bla_bla +samples 100 +loss-level 0.86 +prob delay +0 200 # minimum overhead is 200ms +0.5 200 +0.5 300 +0.8 1000 +0.9 1300 +1 1300 +#configuration file end +.Ed +.El +.Pp +The following parameters can be configured for a queue: +.Pp +.Bl -tag -width indent -compact +.It Cm pipe Ar pipe_nr +Connects a queue to the specified pipe. +Multiple queues (with the same or different weights) can be connected to +the same pipe, which specifies the aggregate rate for the set of queues. +.Pp +.It Cm weight Ar weight +Specifies the weight to be used for flows matching this queue. +The weight must be in the range 1..100, and defaults to 1. +.El +.Pp +The following case-insensitive parameters can be configured for a +scheduler: +.Pp +.Bl -tag -width indent -compact +.It Cm type Ar {fifo | wf2q+ | rr | qfq} +specifies the scheduling algorithm to use. +.Bl -tag -width indent -compact +.It Cm fifo +is just a FIFO scheduler (which means that all packets +are stored in the same queue as they arrive to the scheduler). +FIFO has O(1) per-packet time complexity, with very low +constants (estimate 60-80ns on a 2GHz desktop machine) +but gives no service guarantees. +.It Cm wf2q+ +implements the WF2Q+ algorithm, which is a Weighted Fair Queueing +algorithm which permits flows to share bandwidth according to +their weights. +Note that weights are not priorities; even a flow +with a minuscule weight will never starve. +WF2Q+ has O(log N) per-packet processing cost, where N is the number +of flows, and is the default algorithm used by previous versions of +dummynet's queues. +.It Cm rr +implements the Deficit Round Robin algorithm, which has O(1) processing +costs (roughly, 100-150ns per packet) +and permits bandwidth allocation according to weights, but +with poor service guarantees. 
+.It Cm qfq +implements the QFQ algorithm, which is a very fast variant of +WF2Q+, with similar service guarantees and O(1) processing +costs (roughly, 200-250ns per packet). +.El +.El +.Pp +In addition to the type, all parameters allowed for a pipe can also +be specified for a scheduler. +.Pp +Finally, the following parameters can be configured for both +pipes and queues: +.Pp +.Bl -tag -width XXXX -compact +.It Cm buckets Ar hash-table-size +Specifies the size of the hash table used for storing the +various queues. +Default value is 64 controlled by the +.Xr sysctl 8 +variable +.Va net.inet.ip.dummynet.hash_size , +allowed range is 16 to 65536. +.Pp +.It Cm mask Ar mask-specifier +Packets sent to a given pipe or queue by an +.Nm +rule can be further classified into multiple flows, each of which is then +sent to a different +.Em dynamic +pipe or queue. +A flow identifier is constructed by masking the IP addresses, +ports and protocol types as specified with the +.Cm mask +options in the configuration of the pipe or queue. +For each different flow identifier, a new pipe or queue is created +with the same parameters as the original object, and matching packets +are sent to it. +.Pp +Thus, when +.Em dynamic pipes +are used, each flow will get the same bandwidth as defined by the pipe, +whereas when +.Em dynamic queues +are used, each flow will share the parent's pipe bandwidth evenly +with other flows generated by the same queue (note that other queues +with different weights might be connected to the same pipe). +.br +Available mask specifiers are a combination of one or more of the following: +.Pp +.Cm dst-ip Ar mask , +.Cm dst-ip6 Ar mask , +.Cm src-ip Ar mask , +.Cm src-ip6 Ar mask , +.Cm dst-port Ar mask , +.Cm src-port Ar mask , +.Cm flow-id Ar mask , +.Cm proto Ar mask +or +.Cm all , +.Pp +where the latter means all bits in all fields are significant. 
+.Pp +.It Cm noerror +When a packet is dropped by a +.Nm dummynet +queue or pipe, the error +is normally reported to the caller routine in the kernel, in the +same way as it happens when a device queue fills up. +Setting this +option reports the packet as successfully delivered, which can be +needed for some experimental setups where you want to simulate +loss or congestion at a remote router. +.Pp +.It Cm plr Ar packet-loss-rate +Packet loss rate. +Argument +.Ar packet-loss-rate +is a floating-point number between 0 and 1, with 0 meaning no +loss, 1 meaning 100% loss. +The loss rate is internally represented on 31 bits. +.Pp +.It Cm queue Brq Ar slots | size Ns Cm Kbytes +Queue size, in +.Ar slots +or +.Cm KBytes . +Default value is 50 slots, which +is the typical queue size for Ethernet devices. +Note that for slow speed links you should keep the queue +size short or your traffic might be affected by a significant +queueing delay. +E.g., 50 max-sized ethernet packets (1500 bytes) mean 600Kbit +or 20s of queue on a 30Kbit/s pipe. +Even worse effects can result if you get packets from an +interface with a much larger MTU, e.g.\& the loopback interface +with its 16KB packets. +The +.Xr sysctl 8 +variables +.Em net.inet.ip.dummynet.pipe_byte_limit +and +.Em net.inet.ip.dummynet.pipe_slot_limit +control the maximum lengths that can be specified. +.Pp +.It Cm red | gred Ar w_q Ns / Ns Ar min_th Ns / Ns Ar max_th Ns / Ns Ar max_p +Make use of the RED (Random Early Detection) queue management algorithm. +.Ar w_q +and +.Ar max_p +are floating +point numbers between 0 and 1 (0 not included), while +.Ar min_th +and +.Ar max_th +are integer numbers specifying thresholds for queue management +(thresholds are computed in bytes if the queue has been defined +in bytes, in slots otherwise). +The +.Nm dummynet +also supports the gentle RED variant (gred). 
+Three +.Xr sysctl 8 +variables can be used to control the RED behaviour: +.Bl -tag -width indent +.It Va net.inet.ip.dummynet.red_lookup_depth +specifies the accuracy in computing the average queue +when the link is idle (defaults to 256, must be greater than zero) +.It Va net.inet.ip.dummynet.red_avg_pkt_size +specifies the expected average packet size (defaults to 512, must be +greater than zero) +.It Va net.inet.ip.dummynet.red_max_pkt_size +specifies the expected maximum packet size, only used when queue +thresholds are in bytes (defaults to 1500, must be greater than zero). +.El +.El +.Pp +When used with IPv6 data, +.Nm dummynet +currently has several limitations. +Information necessary to route link-local packets to an +interface is not available after processing by +.Nm dummynet +so those packets are dropped in the output path. +Care should be taken to ensure that link-local packets are not passed to +.Nm dummynet . +.Sh CHECKLIST +Here are some important points to consider when designing your +rules: +.Bl -bullet +.It +Remember that you filter both packets going +.Cm in +and +.Cm out . +Most connections need packets going in both directions. +.It +Remember to test very carefully. +It is a good idea to be near the console when doing this. +If you cannot be near the console, +use an auto-recovery script such as the one in +.Pa /usr/share/examples/ipfw/change_rules.sh . +.It +Do not forget the loopback interface. +.El +.Sh FINE POINTS +.Bl -bullet +.It +There are circumstances where fragmented datagrams are unconditionally +dropped. +TCP packets are dropped if they do not contain at least 20 bytes of +TCP header, UDP packets are dropped if they do not contain a full 8 +byte UDP header, and ICMP packets are dropped if they do not contain +4 bytes of ICMP header, enough to specify the ICMP type, code, and +checksum. +These packets are simply logged as +.Dq pullup failed +since there may not be enough good data in the packet to produce a +meaningful log entry. 
+.It +Another type of packet is unconditionally dropped, a TCP packet with a +fragment offset of one. +This is a valid packet, but it only has one use, to try +to circumvent firewalls. +When logging is enabled, these packets are +reported as being dropped by rule -1. +.It +If you are logged in over a network, loading the +.Xr kld 4 +version of +.Nm +is probably not as straightforward as you would think. +The following command line is recommended: +.Bd -literal -offset indent +kldload ipfw && \e +ipfw add 32000 allow ip from any to any +.Ed +.Pp +Along the same lines, doing an +.Bd -literal -offset indent +ipfw flush +.Ed +.Pp +in similar surroundings is also a bad idea. +.It +The +.Nm +filter list may not be modified if the system security level +is set to 3 or higher +(see +.Xr init 8 +for information on system security levels). +.El +.Sh PACKET DIVERSION +A +.Xr divert 4 +socket bound to the specified port will receive all packets +diverted to that port. +If no socket is bound to the destination port, or if the divert module is +not loaded, or if the kernel was not compiled with divert socket support, +the packets are dropped. +.Sh NETWORK ADDRESS TRANSLATION (NAT) +.Nm +supports in-kernel NAT using the kernel version of +.Xr libalias 3 . +.Pp +The nat configuration command is the following: +.Bd -ragged -offset indent +.Bk -words +.Cm nat +.Ar nat_number +.Cm config +.Ar nat-configuration +.Ek +.Ed +.Pp +The following parameters can be configured: +.Bl -tag -width indent +.It Cm ip Ar ip_address +Define an ip address to use for aliasing. +.It Cm if Ar nic +Use ip address of NIC for aliasing, dynamically changing +it if NIC's ip address changes. +.It Cm log +Enable logging on this nat instance. +.It Cm deny_in +Deny any incoming connection from outside world. +.It Cm same_ports +Try to leave the alias port numbers unchanged from +the actual local port numbers. 
+.It Cm unreg_only +Traffic on the local network not originating from an +unregistered address space will be ignored. +.It Cm reset +Reset table of the packet aliasing engine on address change. +.It Cm reverse +Reverse the way libalias handles aliasing. +.It Cm proxy_only +Obey transparent proxy rules only, packet aliasing is not performed. +.It Cm skip_global +Skip instance in case of global state lookup (see below). +.El +.Pp +Some special values can be supplied instead of +.Va nat_number : +.Bl -tag -width indent +.It Cm global +Looks up translation state in all configured nat instances. +If an entry is found, packet is aliased according to that entry. +If no entry was found in any of the instances, packet is passed unchanged, +and no new entry will be created. +See section +.Sx MULTIPLE INSTANCES +in +.Xr natd 8 +for more information. +.It Cm tablearg +Uses argument supplied in lookup table. +See +.Sx LOOKUP TABLES +section below for more information on lookup tables. +.El +.Pp +To let the packet continue after being (de)aliased, set the sysctl variable +.Va net.inet.ip.fw.one_pass +to 0. +For more information about aliasing modes, refer to +.Xr libalias 3 . +See Section +.Sx EXAMPLES +for some examples about nat usage. +.Ss REDIRECT AND LSNAT SUPPORT IN IPFW +Redirect and LSNAT support follow closely the syntax used in +.Xr natd 8 . +See Section +.Sx EXAMPLES +for some examples on how to do redirect and lsnat. +.Ss SCTP NAT SUPPORT +SCTP nat can be configured in a similar manner to TCP through the +.Nm +command line tool. +The main difference is that +.Nm sctp nat +does not do port translation. +Since the local and global side ports will be the same, +there is no need to specify both. 
+Ports are redirected as follows: +.Bd -ragged -offset indent +.Bk -words +.Cm nat +.Ar nat_number +.Cm config if +.Ar nic +.Cm redirect_port sctp +.Ar ip_address [,addr_list] {[port | port-port] [,ports]} +.Ek +.Ed +.Pp +Most +.Nm sctp nat +configuration can be done in real-time through the +.Xr sysctl 8 +interface. +All may be changed dynamically, though the hash_table size will only +change for new +.Nm nat +instances. +See +.Sx SYSCTL VARIABLES +for more info. +.Sh LOADER TUNABLES +Tunables can be set in +.Xr loader 8 +prompt, +.Xr loader.conf 5 +or +.Xr kenv 1 +before ipfw module gets loaded. +.Bl -tag -width indent +.It Va net.inet.ip.fw.default_to_accept: No 0 +Defines ipfw last rule behavior. +This value overrides +.Cd "options IPFW_DEFAULT_TO_(ACCEPT|DENY)" +from kernel configuration file. +.It Va net.inet.ip.fw.tables_max: No 128 +Defines number of tables available in ipfw. +Number cannot exceed 65534. +.El +.Sh SYSCTL VARIABLES +A set of +.Xr sysctl 8 +variables controls the behaviour of the firewall and +associated modules +.Pq Nm dummynet , bridge , sctp nat . +These are shown below together with their default value +(but always check with the +.Xr sysctl 8 +command what value is actually in use) and meaning: +.Bl -tag -width indent +.It Va net.inet.ip.alias.sctp.accept_global_ootb_addip: No 0 +Defines how the +.Nm nat +responds to receipt of global OOTB ASCONF-AddIP: +.Bl -tag -width indent +.It Cm 0 +No response (unless a partially matching association exists - +ports and vtags match but global address does not) +.It Cm 1 +.Nm nat +will accept and process all OOTB global AddIP messages. +.El +.Pp +Option 1 should never be selected as this forms a security risk. +An attacker can +establish multiple fake associations by sending AddIP messages. +.It Va net.inet.ip.alias.sctp.chunk_proc_limit: No 5 +Defines the maximum number of chunks in an SCTP packet that will be +parsed for a +packet that matches an existing association. 
+This value is enforced to be greater than or equal to +.Cm net.inet.ip.alias.sctp.initialising_chunk_proc_limit . +A high value is +a DoS risk yet setting too low a value may result in +important control chunks in +the packet not being located and parsed. +.It Va net.inet.ip.alias.sctp.error_on_ootb: No 1 +Defines when the +.Nm nat +responds to any Out-of-the-Blue (OOTB) packets with ErrorM packets. +An OOTB packet is a packet that arrives with no existing association +registered in the +.Nm nat +and is not an INIT or ASCONF-AddIP packet: +.Bl -tag -width indent +.It Cm 0 +ErrorM is never sent in response to OOTB packets. +.It Cm 1 +ErrorM is only sent to OOTB packets received on the local side. +.It Cm 2 +ErrorM is sent to the local side and on the global side ONLY if there is a +partial match (ports and vtags match but the source global IP does not). +This value is only useful if the +.Nm nat +is tracking global IP addresses. +.It Cm 3 +ErrorM is sent in response to all OOTB packets on both +the local and global side +(DoS risk). +.El +.Pp +At the moment the default is 0, since the ErrorM packet is not yet +supported by most SCTP stacks. +When it is supported, and if not tracking +global addresses, we recommend setting this value to 1 to allow +multi-homed local hosts to function with the +.Nm nat . +To track global addresses, we recommend setting this value to 2 to +allow global hosts to be informed when they need to (re)send an +ASCONF-AddIP. +Value 3 should never be chosen (except for debugging) as the +.Nm nat +will respond to all OOTB global packets (a DoS risk). +.It Va net.inet.ip.alias.sctp.hashtable_size: No 2003 +Size of hash tables used for +.Nm nat +lookups (100 < prime_number < 1000001). +This value sets the +.Nm hash table +size for any future created +.Nm nat +instance and therefore must be set prior to creating a +.Nm nat +instance. +The table sizes may be changed to suit specific needs. 
+If there will be few +concurrent associations, and memory is scarce, you may make these smaller. +If there will be many thousands (or millions) of concurrent associations, you +should make these larger. +A prime number is best for the table size. +The sysctl +update function will adjust your input value to the next highest prime number. +.It Va net.inet.ip.alias.sctp.holddown_time: No 0 +Hold association in table for this many seconds after receiving a +SHUTDOWN-COMPLETE. +This allows endpoints to correct shutdown gracefully if a +shutdown_complete is lost and retransmissions are required. +.It Va net.inet.ip.alias.sctp.init_timer: No 15 +Timeout value while waiting for (INIT-ACK|AddIP-ACK). +This value cannot be 0. +.It Va net.inet.ip.alias.sctp.initialising_chunk_proc_limit: No 2 +Defines the maximum number of chunks in an SCTP packet that will be parsed when +no existing association exists that matches that packet. +Ideally this packet +will only be an INIT or ASCONF-AddIP packet. +A higher value may become a DoS +risk as malformed packets can consume processing resources. +.It Va net.inet.ip.alias.sctp.param_proc_limit: No 25 +Defines the maximum number of parameters within a chunk that will be +parsed in a +packet. +As for other similar sysctl variables, larger values pose a DoS risk. +.It Va net.inet.ip.alias.sctp.log_level: No 0 +Level of detail in the system log messages (0 \- minimal, 1 \- event, +2 \- info, 3 \- detail, 4 \- debug, 5 \- max debug). +May be a good +option in high loss environments. +.It Va net.inet.ip.alias.sctp.shutdown_time: No 15 +Timeout value while waiting for SHUTDOWN-COMPLETE. +This value cannot be 0. 
+.It Va net.inet.ip.alias.sctp.track_global_addresses: No 0 +Enables/disables global IP address tracking within the +.Nm nat +and places an +upper limit on the number of addresses tracked for each association: +.Bl -tag -width indent +.It Cm 0 +Global tracking is disabled +.It Cm >1 +Enables tracking, the maximum number of addresses tracked for each +association is limited to this value +.El +.Pp +This variable is fully dynamic, the new value will be adopted for all newly +arriving associations, existing associations are treated +as they were previously. +Global tracking will decrease the number of collisions within the +.Nm nat +at a cost +of increased processing load, memory usage, complexity, and possible +.Nm nat +state +problems in complex networks with multiple +.Nm nats . +We recommend not tracking +global IP addresses, this will still result in a fully functional +.Nm nat . +.It Va net.inet.ip.alias.sctp.up_timer: No 300 +Timeout value to keep an association up with no traffic. +This value cannot be 0. +.It Va net.inet.ip.dummynet.expire : No 1 +Lazily delete dynamic pipes/queue once they have no pending traffic. +You can disable this by setting the variable to 0, in which case +the pipes/queues will only be deleted when the threshold is reached. +.It Va net.inet.ip.dummynet.hash_size : No 64 +Default size of the hash table used for dynamic pipes/queues. +This value is used when no +.Cm buckets +option is specified when configuring a pipe/queue. +.It Va net.inet.ip.dummynet.io_fast : No 0 +If set to a non-zero value, +the +.Dq fast +mode of +.Nm dummynet +operation (see above) is enabled. +.It Va net.inet.ip.dummynet.io_pkt +Number of packets passed to +.Nm dummynet . +.It Va net.inet.ip.dummynet.io_pkt_drop +Number of packets dropped by +.Nm dummynet . +.It Va net.inet.ip.dummynet.io_pkt_fast +Number of packets bypassed by the +.Nm dummynet +scheduler. 
+.It Va net.inet.ip.dummynet.max_chain_len : No 16 +Target value for the maximum number of pipes/queues in a hash bucket. +The product +.Cm max_chain_len*hash_size +is used to determine the threshold over which empty pipes/queues +will be expired even when +.Cm net.inet.ip.dummynet.expire=0 . +.It Va net.inet.ip.dummynet.red_lookup_depth : No 256 +.It Va net.inet.ip.dummynet.red_avg_pkt_size : No 512 +.It Va net.inet.ip.dummynet.red_max_pkt_size : No 1500 +Parameters used in the computations of the drop probability +for the RED algorithm. +.It Va net.inet.ip.dummynet.pipe_byte_limit : No 1048576 +.It Va net.inet.ip.dummynet.pipe_slot_limit : No 100 +The maximum queue size that can be specified in bytes or packets. +These limits prevent accidental exhaustion of resources such as mbufs. +If you raise these limits, +you should make sure the system is configured so that sufficient resources +are available. +.It Va net.inet.ip.fw.autoinc_step : No 100 +Delta between rule numbers when auto-generating them. +The value must be in the range 1..1000. +.It Va net.inet.ip.fw.curr_dyn_buckets : Va net.inet.ip.fw.dyn_buckets +The current number of buckets in the hash table for dynamic rules +(readonly). +.It Va net.inet.ip.fw.debug : No 1 +Controls debugging messages produced by +.Nm . +.It Va net.inet.ip.fw.default_rule : No 65535 +The default rule number (read-only). +By the design of +.Nm , the default rule is the last one, so its number +can also serve as the highest number allowed for a rule. +.It Va net.inet.ip.fw.dyn_buckets : No 256 +The number of buckets in the hash table for dynamic rules. +Must be a power of 2, up to 65536. +It only takes effect when all dynamic rules have expired, so you +are advised to use a +.Cm flush +command to make sure that the hash table is resized. +.It Va net.inet.ip.fw.dyn_count : No 3 +Current number of dynamic rules +(read-only). 
+.It Va net.inet.ip.fw.dyn_keepalive : No 1 +Enables generation of keepalive packets for +.Cm keep-state +rules on TCP sessions. +A keepalive is generated to both +sides of the connection every 5 seconds for the last 20 +seconds of the lifetime of the rule. +.It Va net.inet.ip.fw.dyn_max : No 8192 +Maximum number of dynamic rules. +When you hit this limit, no more dynamic rules can be +installed until old ones expire. +.It Va net.inet.ip.fw.dyn_ack_lifetime : No 300 +.It Va net.inet.ip.fw.dyn_syn_lifetime : No 20 +.It Va net.inet.ip.fw.dyn_fin_lifetime : No 1 +.It Va net.inet.ip.fw.dyn_rst_lifetime : No 1 +.It Va net.inet.ip.fw.dyn_udp_lifetime : No 5 +.It Va net.inet.ip.fw.dyn_short_lifetime : No 30 +These variables control the lifetime, in seconds, of dynamic +rules. +Upon the initial SYN exchange the lifetime is kept short, +then increased after both SYN have been seen, then decreased +again during the final FIN exchange or when a RST is received. +Both +.Em dyn_fin_lifetime +and +.Em dyn_rst_lifetime +must be strictly lower than 5 seconds, the period of +repetition of keepalives. +The firewall enforces that. +.It Va net.inet.ip.fw.dyn_keep_states: No 0 +Keep dynamic states on rule/set deletion. +States are relinked to default rule (65535). +This can be handy for ruleset reload. +Turned off by default. +.It Va net.inet.ip.fw.enable : No 1 +Enables the firewall. +Setting this variable to 0 lets you run your machine without +firewall even if compiled in. +.It Va net.inet6.ip6.fw.enable : No 1 +provides the same functionality as above for the IPv6 case. +.It Va net.inet.ip.fw.one_pass : No 1 +When set, the packet exiting from the +.Nm dummynet +pipe or from +.Xr ng_ipfw 4 +node is not passed through the firewall again. +Otherwise, after an action, the packet is +reinjected into the firewall at the next rule. +.It Va net.inet.ip.fw.tables_max : No 128 +Maximum number of tables. +.It Va net.inet.ip.fw.verbose : No 1 +Enables verbose messages. 
+.It Va net.inet.ip.fw.verbose_limit : No 0 +Limits the number of messages produced by a verbose firewall. +.It Va net.inet6.ip6.fw.deny_unknown_exthdrs : No 1 +If enabled packets with unknown IPv6 Extension Headers will be denied. +.It Va net.link.ether.ipfw : No 0 +Controls whether layer-2 packets are passed to +.Nm . +Default is no. +.It Va net.link.bridge.ipfw : No 0 +Controls whether bridged packets are passed to +.Nm . +Default is no. +.El +.Sh EXAMPLES +There are far too many possible uses of +.Nm +so this Section will only give a small set of examples. +.Pp +.Ss BASIC PACKET FILTERING +This command adds an entry which denies all tcp packets from +.Em cracker.evil.org +to the telnet port of +.Em wolf.tambov.su +from being forwarded by the host: +.Pp +.Dl "ipfw add deny tcp from cracker.evil.org to wolf.tambov.su telnet" +.Pp +This one disallows any connection from the entire cracker's +network to my host: +.Pp +.Dl "ipfw add deny ip from 123.45.67.0/24 to my.host.org" +.Pp +A first and efficient way to limit access (not using dynamic rules) +is the use of the following rules: +.Pp +.Dl "ipfw add allow tcp from any to any established" +.Dl "ipfw add allow tcp from net1 portlist1 to net2 portlist2 setup" +.Dl "ipfw add allow tcp from net3 portlist3 to net3 portlist3 setup" +.Dl "..." +.Dl "ipfw add deny tcp from any to any" +.Pp +The first rule will be a quick match for normal TCP packets, +but it will not match the initial SYN packet, which will be +matched by the +.Cm setup +rules only for selected source/destination pairs. +All other SYN packets will be rejected by the final +.Cm deny +rule. 
+.Pp +If you administer one or more subnets, you can take advantage +of the address sets and or-blocks and write extremely +compact rulesets which selectively enable services to blocks +of clients, as below: +.Pp +.Dl "goodguys=\*q{ 10.1.2.0/24{20,35,66,18} or 10.2.3.0/28{6,3,11} }\*q" +.Dl "badguys=\*q10.1.2.0/24{8,38,60}\*q" +.Dl "" +.Dl "ipfw add allow ip from ${goodguys} to any" +.Dl "ipfw add deny ip from ${badguys} to any" +.Dl "... normal policies ..." +.Pp +The +.Cm verrevpath +option could be used to do automated anti-spoofing by adding the +following to the top of a ruleset: +.Pp +.Dl "ipfw add deny ip from any to any not verrevpath in" +.Pp +This rule drops all incoming packets that appear to be coming to the +system on the wrong interface. +For example, a packet with a source +address belonging to a host on a protected internal network would be +dropped if it tried to enter the system from an external interface. +.Pp +The +.Cm antispoof +option could be used to do similar but more restricted anti-spoofing +by adding the following to the top of a ruleset: +.Pp +.Dl "ipfw add deny ip from any to any not antispoof in" +.Pp +This rule drops all incoming packets that appear to be coming from another +directly connected system but on the wrong interface. +For example, a packet with a source address of +.Li 192.168.0.0/24 , +configured on +.Li fxp0 , +but coming in on +.Li fxp1 +would be dropped. 
+.Pp +The +.Cm setdscp +option could be used to (re)mark user traffic, +by adding the following to the appropriate place in the ruleset: +.Pp +.Dl "ipfw add setdscp be ip from any to any dscp af11,af21" +.Ss DYNAMIC RULES +In order to protect a site from flood attacks involving fake +TCP packets, it is safer to use dynamic rules: +.Pp +.Dl "ipfw add check-state" +.Dl "ipfw add deny tcp from any to any established" +.Dl "ipfw add allow tcp from my-net to any setup keep-state" +.Pp +This will let the firewall install dynamic rules only for +those connections which start with a regular SYN packet coming +from the inside of our network. +Dynamic rules are checked when encountering the first +occurrence of a +.Cm check-state , +.Cm keep-state +or +.Cm limit +rule. +A +.Cm check-state +rule should usually be placed near the beginning of the +ruleset to minimize the amount of work scanning the ruleset. +Your mileage may vary. +.Pp +To limit the number of connections a user can open +you can use the following type of rules: +.Pp +.Dl "ipfw add allow tcp from my-net/24 to any setup limit src-addr 10" +.Dl "ipfw add allow tcp from any to me setup limit src-addr 4" +.Pp +The former (assuming it runs on a gateway) will allow each host +on a /24 network to open at most 10 TCP connections. +The latter can be placed on a server to make sure that a single +client does not use more than 4 simultaneous connections. +.Pp +.Em BEWARE : +stateful rules can be subject to denial-of-service attacks +by a SYN-flood which opens a huge number of dynamic rules. +The effects of such attacks can be partially limited by +acting on a set of +.Xr sysctl 8 +variables which control the operation of the firewall. 
+.Pp +Here is a good usage of the +.Cm list +command to see accounting records and timestamp information: +.Pp +.Dl ipfw -at list +.Pp +or in short form without timestamps: +.Pp +.Dl ipfw -a list +.Pp +which is equivalent to: +.Pp +.Dl ipfw show +.Pp +Next rule diverts all incoming packets from 192.168.2.0/24 +to divert port 5000: +.Pp +.Dl ipfw divert 5000 ip from 192.168.2.0/24 to any in +.Ss TRAFFIC SHAPING +The following rules show some of the applications of +.Nm +and +.Nm dummynet +for simulations and the like. +.Pp +This rule drops random incoming packets with a probability +of 5%: +.Pp +.Dl "ipfw add prob 0.05 deny ip from any to any in" +.Pp +A similar effect can be achieved making use of +.Nm dummynet +pipes: +.Pp +.Dl "ipfw add pipe 10 ip from any to any" +.Dl "ipfw pipe 10 config plr 0.05" +.Pp +We can use pipes to artificially limit bandwidth, e.g.\& on a +machine acting as a router, if we want to limit traffic from +local clients on 192.168.2.0/24 we do: +.Pp +.Dl "ipfw add pipe 1 ip from 192.168.2.0/24 to any out" +.Dl "ipfw pipe 1 config bw 300Kbit/s queue 50KBytes" +.Pp +note that we use the +.Cm out +modifier so that the rule is not used twice. +Remember in fact that +.Nm +rules are checked both on incoming and outgoing packets. +.Pp +Should we want to simulate a bidirectional link with bandwidth +limitations, the correct way is the following: +.Pp +.Dl "ipfw add pipe 1 ip from any to any out" +.Dl "ipfw add pipe 2 ip from any to any in" +.Dl "ipfw pipe 1 config bw 64Kbit/s queue 10Kbytes" +.Dl "ipfw pipe 2 config bw 64Kbit/s queue 10Kbytes" +.Pp +The above can be very useful, e.g.\& if you want to see how +your fancy Web page will look for a residential user who +is connected only through a slow link. +You should not use only one pipe for both directions, unless +you want to simulate a half-duplex medium (e.g.\& AppleTalk, +Ethernet, IRDA). 
+It is not necessary that both pipes have the same configuration, +so we can also simulate asymmetric links. +.Pp +Should we want to verify network performance with the RED queue +management algorithm: +.Pp +.Dl "ipfw add pipe 1 ip from any to any" +.Dl "ipfw pipe 1 config bw 500Kbit/s queue 100 red 0.002/30/80/0.1" +.Pp +Another typical application of the traffic shaper is to +introduce some delay in the communication. +This can significantly affect applications which do a lot of Remote +Procedure Calls, and where the round-trip-time of the +connection often becomes a limiting factor much more than +bandwidth: +.Pp +.Dl "ipfw add pipe 1 ip from any to any out" +.Dl "ipfw add pipe 2 ip from any to any in" +.Dl "ipfw pipe 1 config delay 250ms bw 1Mbit/s" +.Dl "ipfw pipe 2 config delay 250ms bw 1Mbit/s" +.Pp +Per-flow queueing can be useful for a variety of purposes. +A very simple one is counting traffic: +.Pp +.Dl "ipfw add pipe 1 tcp from any to any" +.Dl "ipfw add pipe 1 udp from any to any" +.Dl "ipfw add pipe 1 ip from any to any" +.Dl "ipfw pipe 1 config mask all" +.Pp +The above set of rules will create queues (and collect +statistics) for all traffic. +Because the pipes have no limitations, the only effect is +collecting statistics. +Note that we need 3 rules, not just the last one, because +when +.Nm +tries to match IP packets it will not consider ports, so we +would not see connections on separate ports as different +ones. 
+.Pp +A more sophisticated example is limiting the outbound traffic +on a net with per-host limits, rather than per-network limits: +.Pp +.Dl "ipfw add pipe 1 ip from 192.168.2.0/24 to any out" +.Dl "ipfw add pipe 2 ip from any to 192.168.2.0/24 in" +.Dl "ipfw pipe 1 config mask src-ip 0x000000ff bw 200Kbit/s queue 20Kbytes" +.Dl "ipfw pipe 2 config mask dst-ip 0x000000ff bw 200Kbit/s queue 20Kbytes" +.Ss LOOKUP TABLES +In the following example, we need to create several traffic bandwidth +classes and we need different hosts/networks to fall into different classes. +We create one pipe for each class and configure them accordingly. +Then we create a single table and fill it with IP subnets and addresses. +For each subnet/host we set the argument equal to the number of the pipe +that it should use. +Then we classify traffic using a single rule: +.Pp +.Dl "ipfw pipe 1 config bw 1000Kbyte/s" +.Dl "ipfw pipe 4 config bw 4000Kbyte/s" +.Dl "..." +.Dl "ipfw table 1 add 192.168.2.0/24 1" +.Dl "ipfw table 1 add 192.168.0.0/27 4" +.Dl "ipfw table 1 add 192.168.0.2 1" +.Dl "..." +.Dl "ipfw add pipe tablearg ip from table(1) to any" +.Pp +Using the +.Cm fwd +action, the table entries may include hostnames and IP addresses. +.Pp +.Dl "ipfw table 1 add 192.168.2.0/24 10.23.2.1" +.Dl "ipfw table 1 add 192.168.0.0/27 router1.dmz" +.Dl "..." +.Dl "ipfw add 100 fwd tablearg ip from any to table(1)" +.Pp +In the following example a per-interface firewall is created: +.Pp +.Dl "ipfw table 10 add vlan20 12000" +.Dl "ipfw table 10 add vlan30 13000" +.Dl "ipfw table 20 add vlan20 22000" +.Dl "ipfw table 20 add vlan30 23000" +.Dl ".." +.Dl "ipfw add 100 skipto tablearg ip from any to any recv 'table(10)' in" +.Dl "ipfw add 200 skipto tablearg ip from any to any xmit 'table(10)' out" +.Ss SETS OF RULES +To add a set of rules atomically, e.g.\& set 18: +.Pp +.Dl "ipfw set disable 18" +.Dl "ipfw add NN set 18 ...
 # repeat as needed" +.Dl "ipfw set enable 18" +.Pp +To delete a set of rules atomically the command is simply: +.Pp +.Dl "ipfw delete set 18" +.Pp +To test a ruleset and disable it and regain control if something goes wrong: +.Pp +.Dl "ipfw set disable 18" +.Dl "ipfw add NN set 18 ... # repeat as needed" +.Dl "ipfw set enable 18; echo done; sleep 30 && ipfw set disable 18" +.Pp +Here if everything goes well, you press control-C before the "sleep" +terminates, and your ruleset will be left active. +Otherwise, e.g.\& if +you cannot access your box, the ruleset will be disabled after +the sleep terminates thus restoring the previous situation. +.Pp +To show rules of the specific set: +.Pp +.Dl "ipfw set 18 show" +.Pp +To show rules of the disabled set: +.Pp +.Dl "ipfw -S set 18 show" +.Pp +To clear the counters of a specific rule of the specific set: +.Pp +.Dl "ipfw set 18 zero NN" +.Pp +To delete a specific rule of the specific set: +.Pp +.Dl "ipfw set 18 delete NN" +.Ss NAT, REDIRECT AND LSNAT +First redirect all the traffic to nat instance 123: +.Pp +.Dl "ipfw add nat 123 all from any to any" +.Pp +Then to configure nat instance 123 to alias all the outgoing traffic with ip +192.168.0.123, blocking all incoming connections, trying to keep +same ports on both sides, clearing aliasing table on address change +and keeping a log of traffic/link statistics: +.Pp +.Dl "ipfw nat 123 config ip 192.168.0.123 log deny_in reset same_ports" +.Pp +Or to change address of instance 123, aliasing table will be cleared (see +reset option): +.Pp +.Dl "ipfw nat 123 config ip 10.0.0.1" +.Pp +To see configuration of nat instance 123: +.Pp +.Dl "ipfw nat 123 show config" +.Pp +To show logs of all the instances in range 111-999: +.Pp +.Dl "ipfw nat 111-999 show" +.Pp +To see configurations of all instances: +.Pp +.Dl "ipfw nat show config" +.Pp +Or a redirect rule with mixed modes could look like: +.Pp +.Dl "ipfw nat 123 config redirect_addr 10.0.0.1 10.0.0.66" +.Dl " redirect_port tcp
 192.168.0.1:80 500" +.Dl " redirect_proto udp 192.168.1.43 192.168.1.1" +.Dl " redirect_addr 192.168.0.10,192.168.0.11" +.Dl " 10.0.0.100 # LSNAT" +.Dl " redirect_port tcp 192.168.0.1:80,192.168.0.10:22" +.Dl " 500 # LSNAT" +.Pp +or it could be split in: +.Pp +.Dl "ipfw nat 1 config redirect_addr 10.0.0.1 10.0.0.66" +.Dl "ipfw nat 2 config redirect_port tcp 192.168.0.1:80 500" +.Dl "ipfw nat 3 config redirect_proto udp 192.168.1.43 192.168.1.1" +.Dl "ipfw nat 4 config redirect_addr 192.168.0.10,192.168.0.11,192.168.0.12" +.Dl " 10.0.0.100" +.Dl "ipfw nat 5 config redirect_port tcp" +.Dl " 192.168.0.1:80,192.168.0.10:22,192.168.0.20:25 500" +.Sh SEE ALSO +.Xr cpp 1 , +.Xr m4 1 , +.Xr altq 4 , +.Xr divert 4 , +.Xr dummynet 4 , +.Xr if_bridge 4 , +.Xr ip 4 , +.Xr ipfirewall 4 , +.Xr ng_ipfw 4 , +.Xr protocols 5 , +.Xr services 5 , +.Xr init 8 , +.Xr kldload 8 , +.Xr reboot 8 , +.Xr sysctl 8 , +.Xr syslogd 8 +.Sh HISTORY +The +.Nm +utility first appeared in +.Fx 2.0 . +.Nm dummynet +was introduced in +.Fx 2.2.8 . +Stateful extensions were introduced in +.Fx 4.0 . +.Nm ipfw2 +was introduced in Summer 2002. +.Sh AUTHORS +.An Ugen J. S. Antsilevich , +.An Poul-Henning Kamp , +.An Alex Nash , +.An Archie Cobbs , +.An Luigi Rizzo . +.Pp +.An -nosplit +API based upon code written by +.An Daniel Boulet +for BSDI. +.Pp +Dummynet has been introduced by Luigi Rizzo in 1997-1998. +.Pp +Some early work (1999-2000) on the +.Nm dummynet +traffic shaper supported by Akamba Corp. +.Pp +The ipfw core (ipfw2) has been completely redesigned and +reimplemented by Luigi Rizzo in summer 2002. +Further +actions and +options have been added by various developers over the years. +.Pp +.An -nosplit +In-kernel NAT support written by +.An Paolo Pisati Aq piso@FreeBSD.org +as part of a Summer of Code 2005 project. +.Pp +SCTP +.Nm nat +support has been developed by +.An The Centre for Advanced Internet Architectures (CAIA) Aq http://www.caia.swin.edu.au .
+The primary developers and maintainers are David Hayes and Jason But. +For further information visit: +.Aq http://www.caia.swin.edu.au/urp/SONATA +.Pp +Delay profiles have been developed by Alessandro Cerri and +Luigi Rizzo, supported by the +European Commission within Projects Onelab and Onelab2. +.Sh BUGS +The syntax has grown over the years and sometimes it might be confusing. +Unfortunately, backward compatibility prevents cleaning up mistakes +made in the definition of the syntax. +.Pp +.Em !!! WARNING !!! +.Pp +Misconfiguring the firewall can put your computer in an unusable state, +possibly shutting down network services and requiring console access to +regain control of it. +.Pp +Incoming packet fragments diverted by +.Cm divert +are reassembled before delivery to the socket. +The action used on those packets is the one from the +rule which matches the first fragment of the packet. +.Pp +Packets diverted to userland, and then reinserted by a userland process +may lose various packet attributes. +The packet source interface name +will be preserved if it is shorter than 8 bytes and the userland process +saves and reuses the sockaddr_in +(as does +.Xr natd 8 ) ; +otherwise, it may be lost. +If a packet is reinserted in this manner, later rules may be incorrectly +applied, making the order of +.Cm divert +rules in the rule sequence very important. +.Pp +Dummynet drops all packets with IPv6 link-local addresses. +.Pp +Rules using +.Cm uid +or +.Cm gid +may not behave as expected. +In particular, incoming SYN packets may +have no uid or gid associated with them since they do not yet belong +to a TCP connection, and the uid/gid associated with a packet may not +be as expected if the associated process calls +.Xr setuid 2 +or similar system calls. +.Pp +Rule syntax is subject to the command line environment and some patterns +may need to be escaped with the backslash character +or quoted appropriately.
+.Pp +Due to the architecture of +.Xr libalias 3 , +ipfw nat is not compatible with the TCP segmentation offloading (TSO). +Thus, to reliably nat your network traffic, please disable TSO +on your NICs using +.Xr ifconfig 8 . +.Pp +ICMP error messages are not implicitly matched by dynamic rules +for the respective conversations. +To avoid failures of network error detection and path MTU discovery, +ICMP error messages may need to be allowed explicitly through static +rules. +.Pp +Rules using +.Cm call +and +.Cm return +actions may lead to confusing behaviour if ruleset has mistakes, +and/or interaction with other subsystems (netgraph, dummynet, etc.) is used. +One possible case for this is packet leaving +.Nm +in subroutine on the input pass, while later on output encountering unpaired +.Cm return +first. +As the call stack is kept intact after input pass, packet will suddenly +return to the rule number used on input pass, not on output one. +Order of processing should be checked carefully to avoid such mistakes. diff --git a/ipfw/ipfw2.c b/ipfw/ipfw2.c new file mode 100644 index 0000000..5dbfd4a --- /dev/null +++ b/ipfw/ipfw2.c @@ -0,0 +1,3994 @@ +/* + * Copyright (c) 2002-2003 Luigi Rizzo + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Idea and grammar partially left from: + * Copyright (c) 1993 Daniel Boulet + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. 
+ * + * NEW command line interface for IP firewall facility + * + * $FreeBSD: head/sbin/ipfw/ipfw2.c 206843 2010-04-19 15:11:45Z luigi $ + */ + +#include +#include +#include +#include +#include + +#include "ipfw2.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* ctime */ +#include /* _long_to_time */ +#include +#include +#include /* offsetof */ + +#include +#include /* only IFNAMSIZ */ +#include +#include /* only n_short, n_long */ +#include +#include +#include +#include +#include + +struct cmdline_opts co; /* global options */ + +int resvd_set_number = RESVD_SET; + +int ipfw_socket = -1; + +#ifndef s6_addr32 +#define s6_addr32 __u6_addr.__u6_addr32 +#endif + +#define GET_UINT_ARG(arg, min, max, tok, s_x) do { \ + if (!av[0]) \ + errx(EX_USAGE, "%s: missing argument", match_value(s_x, tok)); \ + if (_substrcmp(*av, "tablearg") == 0) { \ + arg = IP_FW_TABLEARG; \ + break; \ + } \ + \ + { \ + long _xval; \ + char *end; \ + \ + _xval = strtol(*av, &end, 10); \ + \ + if (!isdigit(**av) || *end != '\0' || (_xval == 0 && errno == EINVAL)) \ + errx(EX_DATAERR, "%s: invalid argument: %s", \ + match_value(s_x, tok), *av); \ + \ + if (errno == ERANGE || _xval < min || _xval > max) \ + errx(EX_DATAERR, "%s: argument is out of range (%u..%u): %s", \ + match_value(s_x, tok), min, max, *av); \ + \ + if (_xval == IP_FW_TABLEARG) \ + errx(EX_DATAERR, "%s: illegal argument value: %s", \ + match_value(s_x, tok), *av); \ + arg = _xval; \ + } \ +} while (0) + +static void +PRINT_UINT_ARG(const char *str, uint32_t arg) +{ + if (str != NULL) + printf("%s",str); + if (arg == IP_FW_TABLEARG) + printf("tablearg"); + else + printf("%u", arg); +} + +static struct _s_x f_tcpflags[] = { + { "syn", TH_SYN }, + { "fin", TH_FIN }, + { "ack", TH_ACK }, + { "psh", TH_PUSH }, + { "rst", TH_RST }, + { "urg", TH_URG }, + { "tcp flag", 0 }, + { NULL, 0 } +}; + +static struct _s_x f_tcpopts[] = { + { "mss", IP_FW_TCPOPT_MSS }, + { 
"maxseg", IP_FW_TCPOPT_MSS }, + { "window", IP_FW_TCPOPT_WINDOW }, + { "sack", IP_FW_TCPOPT_SACK }, + { "ts", IP_FW_TCPOPT_TS }, + { "timestamp", IP_FW_TCPOPT_TS }, + { "cc", IP_FW_TCPOPT_CC }, + { "tcp option", 0 }, + { NULL, 0 } +}; + +/* + * IP options span the range 0 to 255 so we need to remap them + * (though in fact only the low 5 bits are significant). + */ +static struct _s_x f_ipopts[] = { + { "ssrr", IP_FW_IPOPT_SSRR}, + { "lsrr", IP_FW_IPOPT_LSRR}, + { "rr", IP_FW_IPOPT_RR}, + { "ts", IP_FW_IPOPT_TS}, + { "ip option", 0 }, + { NULL, 0 } +}; + +static struct _s_x f_iptos[] = { + { "lowdelay", IPTOS_LOWDELAY}, + { "throughput", IPTOS_THROUGHPUT}, + { "reliability", IPTOS_RELIABILITY}, + { "mincost", IPTOS_MINCOST}, + { "congestion", IPTOS_ECN_CE}, + { "ecntransport", IPTOS_ECN_ECT0}, + { "ip tos option", 0}, + { NULL, 0 } +}; + +static struct _s_x limit_masks[] = { + {"all", DYN_SRC_ADDR|DYN_SRC_PORT|DYN_DST_ADDR|DYN_DST_PORT}, + {"src-addr", DYN_SRC_ADDR}, + {"src-port", DYN_SRC_PORT}, + {"dst-addr", DYN_DST_ADDR}, + {"dst-port", DYN_DST_PORT}, + {NULL, 0} +}; + +/* + * we use IPPROTO_ETHERTYPE as a fake protocol id to call the print routines + * This is only used in this code. + */ +#define IPPROTO_ETHERTYPE 0x1000 +static struct _s_x ether_types[] = { + /* + * Note, we cannot use "-:&/" in the names because they are field + * separators in the type specifications. Also, we use s = NULL as + * end-delimiter, because a type of 0 can be legal. 
+ */ + { "ip", 0x0800 }, + { "ipv4", 0x0800 }, + { "ipv6", 0x86dd }, + { "arp", 0x0806 }, + { "rarp", 0x8035 }, + { "vlan", 0x8100 }, + { "loop", 0x9000 }, + { "trail", 0x1000 }, + { "at", 0x809b }, + { "atalk", 0x809b }, + { "aarp", 0x80f3 }, + { "pppoe_disc", 0x8863 }, + { "pppoe_sess", 0x8864 }, + { "ipx_8022", 0x00E0 }, + { "ipx_8023", 0x0000 }, + { "ipx_ii", 0x8137 }, + { "ipx_snap", 0x8137 }, + { "ipx", 0x8137 }, + { "ns", 0x0600 }, + { NULL, 0 } +}; + + +static struct _s_x rule_actions[] = { + { "accept", TOK_ACCEPT }, + { "pass", TOK_ACCEPT }, + { "allow", TOK_ACCEPT }, + { "permit", TOK_ACCEPT }, + { "count", TOK_COUNT }, + { "pipe", TOK_PIPE }, + { "queue", TOK_QUEUE }, + { "divert", TOK_DIVERT }, + { "tee", TOK_TEE }, + { "netgraph", TOK_NETGRAPH }, + { "ngtee", TOK_NGTEE }, + { "fwd", TOK_FORWARD }, + { "forward", TOK_FORWARD }, + { "skipto", TOK_SKIPTO }, + { "deny", TOK_DENY }, + { "drop", TOK_DENY }, + { "reject", TOK_REJECT }, + { "reset6", TOK_RESET6 }, + { "reset", TOK_RESET }, + { "unreach6", TOK_UNREACH6 }, + { "unreach", TOK_UNREACH }, + { "check-state", TOK_CHECKSTATE }, + { "//", TOK_COMMENT }, + { "nat", TOK_NAT }, + { "reass", TOK_REASS }, + { "setfib", TOK_SETFIB }, + { "call", TOK_CALL }, + { "return", TOK_RETURN }, + { NULL, 0 } /* terminator */ +}; + +static struct _s_x rule_action_params[] = { + { "altq", TOK_ALTQ }, + { "log", TOK_LOG }, + { "tag", TOK_TAG }, + { "untag", TOK_UNTAG }, + { NULL, 0 } /* terminator */ +}; + +/* + * The 'lookup' instruction accepts one of the following arguments. + * -1 is a terminator for the list. + * Arguments are passed as v[1] in O_DST_LOOKUP options. 
+ */ +static int lookup_key[] = { + TOK_DSTIP, TOK_SRCIP, TOK_DSTPORT, TOK_SRCPORT, + TOK_UID, TOK_JAIL, TOK_DSCP, -1 }; + +static struct _s_x rule_options[] = { + { "tagged", TOK_TAGGED }, + { "uid", TOK_UID }, + { "gid", TOK_GID }, + { "jail", TOK_JAIL }, + { "in", TOK_IN }, + { "limit", TOK_LIMIT }, + { "keep-state", TOK_KEEPSTATE }, + { "bridged", TOK_LAYER2 }, + { "layer2", TOK_LAYER2 }, + { "out", TOK_OUT }, + { "diverted", TOK_DIVERTED }, + { "diverted-loopback", TOK_DIVERTEDLOOPBACK }, + { "diverted-output", TOK_DIVERTEDOUTPUT }, + { "xmit", TOK_XMIT }, + { "recv", TOK_RECV }, + { "via", TOK_VIA }, + { "fragment", TOK_FRAG }, + { "frag", TOK_FRAG }, + { "fib", TOK_FIB }, + { "ipoptions", TOK_IPOPTS }, + { "ipopts", TOK_IPOPTS }, + { "iplen", TOK_IPLEN }, + { "ipid", TOK_IPID }, + { "ipprecedence", TOK_IPPRECEDENCE }, + { "dscp", TOK_DSCP }, + { "iptos", TOK_IPTOS }, + { "ipttl", TOK_IPTTL }, + { "ipversion", TOK_IPVER }, + { "ipver", TOK_IPVER }, + { "estab", TOK_ESTAB }, + { "established", TOK_ESTAB }, + { "setup", TOK_SETUP }, + { "sockarg", TOK_SOCKARG }, + { "tcpdatalen", TOK_TCPDATALEN }, + { "tcpflags", TOK_TCPFLAGS }, + { "tcpflgs", TOK_TCPFLAGS }, + { "tcpoptions", TOK_TCPOPTS }, + { "tcpopts", TOK_TCPOPTS }, + { "tcpseq", TOK_TCPSEQ }, + { "tcpack", TOK_TCPACK }, + { "tcpwin", TOK_TCPWIN }, + { "icmptype", TOK_ICMPTYPES }, + { "icmptypes", TOK_ICMPTYPES }, + { "dst-ip", TOK_DSTIP }, + { "src-ip", TOK_SRCIP }, + { "dst-port", TOK_DSTPORT }, + { "src-port", TOK_SRCPORT }, + { "proto", TOK_PROTO }, + { "MAC", TOK_MAC }, + { "mac", TOK_MAC }, + { "mac-type", TOK_MACTYPE }, + { "verrevpath", TOK_VERREVPATH }, + { "versrcreach", TOK_VERSRCREACH }, + { "antispoof", TOK_ANTISPOOF }, + { "ipsec", TOK_IPSEC }, + { "icmp6type", TOK_ICMP6TYPES }, + { "icmp6types", TOK_ICMP6TYPES }, + { "ext6hdr", TOK_EXT6HDR}, + { "flow-id", TOK_FLOWID}, + { "ipv6", TOK_IPV6}, + { "ip6", TOK_IPV6}, + { "ipv4", TOK_IPV4}, + { "ip4", TOK_IPV4}, + { "dst-ipv6", TOK_DSTIP6}, + { 
"dst-ip6", TOK_DSTIP6}, + { "src-ipv6", TOK_SRCIP6}, + { "src-ip6", TOK_SRCIP6}, + { "lookup", TOK_LOOKUP}, + { "//", TOK_COMMENT }, + + { "not", TOK_NOT }, /* pseudo option */ + { "!", /* escape ? */ TOK_NOT }, /* pseudo option */ + { "or", TOK_OR }, /* pseudo option */ + { "|", /* escape */ TOK_OR }, /* pseudo option */ + { "{", TOK_STARTBRACE }, /* pseudo option */ + { "(", TOK_STARTBRACE }, /* pseudo option */ + { "}", TOK_ENDBRACE }, /* pseudo option */ + { ")", TOK_ENDBRACE }, /* pseudo option */ + { NULL, 0 } /* terminator */ +}; + +/* + * Helper routine to print a possibly unaligned uint64_t on + * various platform. If width > 0, print the value with + * the desired width, followed by a space; + * otherwise, return the required width. + */ +int +pr_u64(uint64_t *pd, int width) +{ +#ifdef TCC +#define U64_FMT "I64" +#else +#define U64_FMT "llu" +#endif + uint64_t u; + unsigned long long d; + + bcopy (pd, &u, sizeof(u)); + d = u; + return (width > 0) ? + printf("%*" U64_FMT " ", width, d) : + snprintf(NULL, 0, "%" U64_FMT, d) ; +#undef U64_FMT +} + +void * +safe_calloc(size_t number, size_t size) +{ + void *ret = calloc(number, size); + + if (ret == NULL) + err(EX_OSERR, "calloc"); + return ret; +} + +void * +safe_realloc(void *ptr, size_t size) +{ + void *ret = realloc(ptr, size); + + if (ret == NULL) + err(EX_OSERR, "realloc"); + return ret; +} + +/* + * conditionally runs the command. 
+ * Selected options or negative -> getsockopt + */ +int +do_cmd(int optname, void *optval, uintptr_t optlen) +{ + int i; + + if (co.test_only) + return 0; + + if (ipfw_socket == -1) + ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); + if (ipfw_socket < 0) + err(EX_UNAVAILABLE, "socket"); + + if (optname == IP_FW_GET || optname == IP_DUMMYNET_GET || + optname == IP_FW_ADD || optname == IP_FW_TABLE_LIST || + optname == IP_FW_TABLE_GETSIZE || + optname == IP_FW_NAT_GET_CONFIG || + optname < 0 || + optname == IP_FW_NAT_GET_LOG) { + if (optname < 0) + optname = -optname; + i = getsockopt(ipfw_socket, IPPROTO_IP, optname, optval, + (socklen_t *)optlen); + } else { + i = setsockopt(ipfw_socket, IPPROTO_IP, optname, optval, optlen); + } + return i; +} + +#if 0 // XXX still unused +/* + * do_setcmd3 - pass ipfw control cmd to kernel + * @optname: option name + * @optval: pointer to option data + * @optlen: option length + * + * Function encapsulates option value in IP_FW3 socket option + * and calls setsockopt(). + * Function returns 0 on success or -1 otherwise. + */ +static int +do_setcmd3(int optname, void *optval, socklen_t optlen) +{ + socklen_t len; + ip_fw3_opheader *op3; + + if (co.test_only) + return (0); + + if (ipfw_socket == -1) + ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); + if (ipfw_socket < 0) + err(EX_UNAVAILABLE, "socket"); + + len = sizeof(ip_fw3_opheader) + optlen; + op3 = alloca(len); + /* Zero reserved fields */ + memset(op3, 0, sizeof(ip_fw3_opheader)); + memcpy(op3 + 1, optval, optlen); + op3->opcode = optname; + + return setsockopt(ipfw_socket, IPPROTO_IP, IP_FW3, op3, len); +} +#endif // XXX still unused + +/** + * match_token takes a table and a string, returns the value associated + * with the string (-1 in case of failure). 
+ */ +int +match_token(struct _s_x *table, char *string) +{ + struct _s_x *pt; + uint i = strlen(string); + + for (pt = table ; i && pt->s != NULL ; pt++) + if (strlen(pt->s) == i && !bcmp(string, pt->s, i)) + return pt->x; + return -1; +} + +/** + * match_value takes a table and a value, returns the string associated + * with the value (NULL in case of failure). + */ +char const * +match_value(struct _s_x *p, int value) +{ + for (; p->s != NULL; p++) + if (p->x == value) + return p->s; + return NULL; +} + +/* + * _substrcmp takes two strings and returns 1 if they do not match, + * and 0 if they match exactly or the first string is a sub-string + * of the second. A warning is printed to stderr in the case that the + * first string is a sub-string of the second. + * + * This function will be removed in the future through the usual + * deprecation process. + */ +int +_substrcmp(const char *str1, const char* str2) +{ + + if (strncmp(str1, str2, strlen(str1)) != 0) + return 1; + + if (strlen(str1) != strlen(str2)) + warnx("DEPRECATED: '%s' matched '%s' as a sub-string", + str1, str2); + return 0; +} + +/* + * _substrcmp2 takes three strings and returns 1 if the first two do not match, + * and 0 if they match exactly or the second string is a sub-string + * of the first. A warning is printed to stderr in the case that the + * first string does not match the third. + * + * This function exists to warn about the bizarre construction + * strncmp(str, "by", 2) which is used to allow people to use a shortcut + * for "bytes". The problem is that in addition to accepting "by", + * "byt", "byte", and "bytes", it also excepts "by_rabid_dogs" and any + * other string beginning with "by". + * + * This function will be removed in the future through the usual + * deprecation process. 
+ */ +int +_substrcmp2(const char *str1, const char* str2, const char* str3) +{ + + if (strncmp(str1, str2, strlen(str2)) != 0) + return 1; + + if (strcmp(str1, str3) != 0) + warnx("DEPRECATED: '%s' matched '%s'", + str1, str3); + return 0; +} + +/* + * prints one port, symbolic or numeric + */ +static void +print_port(int proto, uint16_t port) +{ + + if (proto == IPPROTO_ETHERTYPE) { + char const *s; + + if (co.do_resolv && (s = match_value(ether_types, port)) ) + printf("%s", s); + else + printf("0x%04x", port); + } else { + struct servent *se = NULL; + if (co.do_resolv) { + struct protoent *pe = getprotobynumber(proto); + + se = getservbyport(htons(port), pe ? pe->p_name : NULL); + } + if (se) + printf("%s", se->s_name); + else + printf("%d", port); + } +} + +static struct _s_x _port_name[] = { + {"dst-port", O_IP_DSTPORT}, + {"src-port", O_IP_SRCPORT}, + {"ipid", O_IPID}, + {"iplen", O_IPLEN}, + {"ipttl", O_IPTTL}, + {"mac-type", O_MAC_TYPE}, + {"tcpdatalen", O_TCPDATALEN}, + {"tcpwin", O_TCPWIN}, + {"tagged", O_TAGGED}, + {NULL, 0} +}; + +/* + * Print the values in a list 16-bit items of the types above. + * XXX todo: add support for mask. + */ +static void +print_newports(ipfw_insn_u16 *cmd, int proto, int opcode) +{ + uint16_t *p = cmd->ports; + int i; + char const *sep; + + if (opcode != 0) { + sep = match_value(_port_name, opcode); + if (sep == NULL) + sep = "???"; + printf (" %s", sep); + } + sep = " "; + for (i = F_LEN((ipfw_insn *)cmd) - 1; i > 0; i--, p += 2) { + printf("%s", sep); + print_port(proto, p[0]); + if (p[0] != p[1]) { + printf("-"); + print_port(proto, p[1]); + } + sep = ","; + } +} + +/* + * Like strtol, but also translates service names into port numbers + * for some protocols. + * In particular: + * proto == -1 disables the protocol check; + * proto == IPPROTO_ETHERTYPE looks up an internal table + * proto == matches the values there. + * Returns *end == s in case the parameter is not found. 
+ */ +static int +strtoport(char *s, char **end, int base, int proto) +{ + char *p, *buf; + char *s1; + int i; + + *end = s; /* default - not found */ + if (*s == '\0') + return 0; /* not found */ + + if (isdigit(*s)) + return strtol(s, end, base); + + /* + * find separator. '\\' escapes the next char. + */ + for (s1 = s; *s1 && (isalnum(*s1) || *s1 == '\\') ; s1++) + if (*s1 == '\\' && s1[1] != '\0') + s1++; + + buf = safe_calloc(s1 - s + 1, 1); + + /* + * copy into a buffer skipping backslashes + */ + for (p = s, i = 0; p != s1 ; p++) + if (*p != '\\') + buf[i++] = *p; + buf[i++] = '\0'; + + if (proto == IPPROTO_ETHERTYPE) { + i = match_token(ether_types, buf); + free(buf); + if (i != -1) { /* found */ + *end = s1; + return i; + } + } else { + struct protoent *pe = NULL; + struct servent *se; + + if (proto != 0) + pe = getprotobynumber(proto); + setservent(1); + se = getservbyname(buf, pe ? pe->p_name : NULL); + free(buf); + if (se != NULL) { + *end = s1; + return ntohs(se->s_port); + } + } + return 0; /* not found */ +} + +/* + * Fill the body of the command with the list of port ranges. + */ +static int +fill_newports(ipfw_insn_u16 *cmd, char *av, int proto) +{ + uint16_t a, b, *p = cmd->ports; + int i = 0; + char *s = av; + + while (*s) { + a = strtoport(av, &s, 0, proto); + if (s == av) /* empty or invalid argument */ + return (0); + + switch (*s) { + case '-': /* a range */ + av = s + 1; + b = strtoport(av, &s, 0, proto); + /* Reject expressions like '1-abc' or '1-2-3'. 
*/ + if (s == av || (*s != ',' && *s != '\0')) + return (0); + p[0] = a; + p[1] = b; + break; + case ',': /* comma separated list */ + case '\0': + p[0] = p[1] = a; + break; + default: + warnx("port list: invalid separator <%c> in <%s>", + *s, av); + return (0); + } + + i++; + p += 2; + av = s + 1; + } + if (i > 0) { + if (i + 1 > F_LEN_MASK) + errx(EX_DATAERR, "too many ports/ranges\n"); + cmd->o.len |= i + 1; /* leave F_NOT and F_OR untouched */ + } + return (i); +} + +static struct _s_x icmpcodes[] = { + { "net", ICMP_UNREACH_NET }, + { "host", ICMP_UNREACH_HOST }, + { "protocol", ICMP_UNREACH_PROTOCOL }, + { "port", ICMP_UNREACH_PORT }, + { "needfrag", ICMP_UNREACH_NEEDFRAG }, + { "srcfail", ICMP_UNREACH_SRCFAIL }, + { "net-unknown", ICMP_UNREACH_NET_UNKNOWN }, + { "host-unknown", ICMP_UNREACH_HOST_UNKNOWN }, + { "isolated", ICMP_UNREACH_ISOLATED }, + { "net-prohib", ICMP_UNREACH_NET_PROHIB }, + { "host-prohib", ICMP_UNREACH_HOST_PROHIB }, + { "tosnet", ICMP_UNREACH_TOSNET }, + { "toshost", ICMP_UNREACH_TOSHOST }, + { "filter-prohib", ICMP_UNREACH_FILTER_PROHIB }, + { "host-precedence", ICMP_UNREACH_HOST_PRECEDENCE }, + { "precedence-cutoff", ICMP_UNREACH_PRECEDENCE_CUTOFF }, + { NULL, 0 } +}; + +static void +fill_reject_code(u_short *codep, char *str) +{ + int val; + char *s; + + val = strtoul(str, &s, 0); + if (s == str || *s != '\0' || val >= 0x100) + val = match_token(icmpcodes, str); + if (val < 0) + errx(EX_DATAERR, "unknown ICMP unreachable code ``%s''", str); + *codep = val; + return; +} + +static void +print_reject_code(uint16_t code) +{ + char const *s = match_value(icmpcodes, code); + + if (s != NULL) + printf("unreach %s", s); + else + printf("unreach %u", code); +} + +/* + * Returns the number of bits set (from left) in a contiguous bitmask, + * or -1 if the mask is not contiguous. + * XXX this needs a proper fix. + * This effectively works on masks in big-endian (network) format. + * when compiled on little endian architectures. 
+ * + * First bit is bit 7 of the first byte -- note, for MAC addresses, + * the first bit on the wire is bit 0 of the first byte. + * len is the max length in bits. + */ +int +contigmask(uint8_t *p, int len) +{ + int i, n; + + for (i=0; iarg1 & 0xff; + uint8_t clear = (cmd->arg1 >> 8) & 0xff; + + if (list == f_tcpflags && set == TH_SYN && clear == TH_ACK) { + printf(" setup"); + return; + } + + printf(" %s ", name); + for (i=0; list[i].x != 0; i++) { + if (set & list[i].x) { + set &= ~list[i].x; + printf("%s%s", comma, list[i].s); + comma = ","; + } + if (clear & list[i].x) { + clear &= ~list[i].x; + printf("%s!%s", comma, list[i].s); + comma = ","; + } + } +} + +/* + * Print the ip address contained in a command. + */ +static void +print_ip(ipfw_insn_ip *cmd, char const *s) +{ + struct hostent *he = NULL; + uint32_t len = F_LEN((ipfw_insn *)cmd); + uint32_t *a = ((ipfw_insn_u32 *)cmd)->d; + + if (cmd->o.opcode == O_IP_DST_LOOKUP && len > F_INSN_SIZE(ipfw_insn_u32)) { + uint32_t d = a[1]; + const char *arg = ""; + + if (d < sizeof(lookup_key)/sizeof(lookup_key[0])) + arg = match_value(rule_options, lookup_key[d]); + printf("%s lookup %s %d", cmd->o.len & F_NOT ? " not": "", + arg, cmd->o.arg1); + return; + } + printf("%s%s ", cmd->o.len & F_NOT ? 
" not": "", s); + + if (cmd->o.opcode == O_IP_SRC_ME || cmd->o.opcode == O_IP_DST_ME) { + printf("me"); + return; + } + if (cmd->o.opcode == O_IP_SRC_LOOKUP || + cmd->o.opcode == O_IP_DST_LOOKUP) { + printf("table(%u", ((ipfw_insn *)cmd)->arg1); + if (len == F_INSN_SIZE(ipfw_insn_u32)) + printf(",%u", *a); + printf(")"); + return; + } + if (cmd->o.opcode == O_IP_SRC_SET || cmd->o.opcode == O_IP_DST_SET) { + uint32_t x, *map = (uint32_t *)&(cmd->mask); + int i, j; + char comma = '{'; + + x = cmd->o.arg1 - 1; + x = htonl( ~x ); + cmd->addr.s_addr = htonl(cmd->addr.s_addr); + printf("%s/%d", inet_ntoa(cmd->addr), + contigmask((uint8_t *)&x, 32)); + x = cmd->addr.s_addr = htonl(cmd->addr.s_addr); + x &= 0xff; /* base */ + /* + * Print bits and ranges. + * Locate first bit set (i), then locate first bit unset (j). + * If we have 3+ consecutive bits set, then print them as a + * range, otherwise only print the initial bit and rescan. + */ + for (i=0; i < cmd->o.arg1; i++) + if (map[i/32] & (1<<(i & 31))) { + for (j=i+1; j < cmd->o.arg1; j++) + if (!(map[ j/32] & (1<<(j & 31)))) + break; + printf("%c%d", comma, i+x); + if (j>i+2) { /* range has at least 3 elements */ + printf("-%d", j-1+x); + i = j-1; + } + comma = ','; + } + printf("}"); + return; + } + /* + * len == 2 indicates a single IP, whereas lists of 1 or more + * addr/mask pairs have len = (2n+1). We convert len to n so we + * use that to count the number of entries. + */ + for (len = len / 2; len > 0; len--, a += 2) { + int mb = /* mask length */ + (cmd->o.opcode == O_IP_SRC || cmd->o.opcode == O_IP_DST) ? 
+ 32 : contigmask((uint8_t *)&(a[1]), 32); + if (mb == 32 && co.do_resolv) + he = gethostbyaddr((char *)&(a[0]), sizeof(u_long), AF_INET); + if (he != NULL) /* resolved to name */ + printf("%s", he->h_name); + else if (mb == 0) /* any */ + printf("any"); + else { /* numeric IP followed by some kind of mask */ + printf("%s", inet_ntoa( *((struct in_addr *)&a[0]) ) ); + if (mb < 0) + printf(":%s", inet_ntoa( *((struct in_addr *)&a[1]) ) ); + else if (mb < 32) + printf("/%d", mb); + } + if (len > 1) + printf(","); + } +} + +/* + * prints a MAC address/mask pair + */ +static void +print_mac(uint8_t *addr, uint8_t *mask) +{ + int l = contigmask(mask, 48); + + if (l == 0) + printf(" any"); + else { + printf(" %02x:%02x:%02x:%02x:%02x:%02x", + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); + if (l == -1) + printf("&%02x:%02x:%02x:%02x:%02x:%02x", + mask[0], mask[1], mask[2], + mask[3], mask[4], mask[5]); + else if (l < 48) + printf("/%d", l); + } +} + +static void +fill_icmptypes(ipfw_insn_u32 *cmd, char *av) +{ + uint8_t type; + + cmd->d[0] = 0; + while (*av) { + if (*av == ',') + av++; + + type = strtoul(av, &av, 0); + + if (*av != ',' && *av != '\0') + errx(EX_DATAERR, "invalid ICMP type"); + + if (type > 31) + errx(EX_DATAERR, "ICMP type out of range"); + + cmd->d[0] |= 1 << type; + } + cmd->o.opcode = O_ICMPTYPE; + cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); +} + +static void +print_icmptypes(ipfw_insn_u32 *cmd) +{ + int i; + char sep= ' '; + + printf(" icmptypes"); + for (i = 0; i < 32; i++) { + if ( (cmd->d[0] & (1 << (i))) == 0) + continue; + printf("%c%d", sep, i); + sep = ','; + } +} + +/* + * show_ipfw() prints the body of an ipfw rule. + * Because the standard rule has at least proto src_ip dst_ip, we use + * a helper function to produce these entries if not provided explicitly. + * The first argument is the list of fields we have, the second is + * the list of fields we want to be printed. 
+ * + * Special cases if we have provided a MAC header: + * + if the rule does not contain IP addresses/ports, do not print them; + * + if the rule does not contain an IP proto, print "all" instead of "ip"; + * + * Once we have 'have_options', IP header fields are printed as options. + */ +#define HAVE_PROTO 0x0001 +#define HAVE_SRCIP 0x0002 +#define HAVE_DSTIP 0x0004 +#define HAVE_PROTO4 0x0008 +#define HAVE_PROTO6 0x0010 +#define HAVE_IP 0x0100 +#define HAVE_OPTIONS 0x8000 + +static void +show_prerequisites(int *flags, int want, int cmd) +{ + (void)cmd; /* UNUSED */ + if (co.comment_only) + return; + if ( (*flags & HAVE_IP) == HAVE_IP) + *flags |= HAVE_OPTIONS; + + if ( !(*flags & HAVE_OPTIONS)) { + if ( !(*flags & HAVE_PROTO) && (want & HAVE_PROTO)) { + if ( (*flags & HAVE_PROTO4)) + printf(" ip4"); + else if ( (*flags & HAVE_PROTO6)) + printf(" ip6"); + else + printf(" ip"); + } + if ( !(*flags & HAVE_SRCIP) && (want & HAVE_SRCIP)) + printf(" from any"); + if ( !(*flags & HAVE_DSTIP) && (want & HAVE_DSTIP)) + printf(" to any"); + } + *flags |= want; +} + +static void +show_ipfw(struct ip_fw *rule, int pcwidth, int bcwidth) +{ + static int twidth = 0; + int l; + ipfw_insn *cmd, *tagptr = NULL; + const char *comment = NULL; /* ptr to comment if we have one */ + int proto = 0; /* default */ + int flags = 0; /* prerequisites */ + ipfw_insn_log *logptr = NULL; /* set if we find an O_LOG */ + ipfw_insn_altq *altqptr = NULL; /* set if we find an O_ALTQ */ + int or_block = 0; /* we are in an or block */ + uint32_t set_disable; + + bcopy(&rule->next_rule, &set_disable, sizeof(set_disable)); + + if (set_disable & (1 << rule->set)) { /* disabled */ + if (!co.show_sets) + return; + else + printf("# DISABLED "); + } + printf("%05u ", rule->rulenum); + + if (pcwidth > 0 || bcwidth > 0) { + pr_u64(&rule->pcnt, pcwidth); + pr_u64(&rule->bcnt, bcwidth); + } + + if (co.do_time == 2) + printf("%10u ", rule->timestamp); + else if (co.do_time == 1) { + char timestr[30]; + time_t t 
= (time_t)0; + + if (twidth == 0) { + strcpy(timestr, ctime(&t)); + *strchr(timestr, '\n') = '\0'; + twidth = strlen(timestr); + } + if (rule->timestamp) { + t = _long_to_time(rule->timestamp); + + strcpy(timestr, ctime(&t)); + *strchr(timestr, '\n') = '\0'; + printf("%s ", timestr); + } else { + printf("%*s", twidth, " "); + } + } + + if (co.show_sets) + printf("set %d ", rule->set); + + /* + * print the optional "match probability" + */ + if (rule->cmd_len > 0) { + cmd = rule->cmd ; + if (cmd->opcode == O_PROB) { + ipfw_insn_u32 *p = (ipfw_insn_u32 *)cmd; + double d = 1.0 * p->d[0]; + + d = (d / 0x7fffffff); + printf("prob %f ", d); + } + } + + /* + * first print actions + */ + for (l = rule->cmd_len - rule->act_ofs, cmd = ACTION_PTR(rule); + l > 0 ; l -= F_LEN(cmd), cmd += F_LEN(cmd)) { + switch(cmd->opcode) { + case O_CHECK_STATE: + printf("check-state"); + /* avoid printing anything else */ + flags = HAVE_PROTO | HAVE_SRCIP | + HAVE_DSTIP | HAVE_IP; + break; + + case O_ACCEPT: + printf("allow"); + break; + + case O_COUNT: + printf("count"); + break; + + case O_DENY: + printf("deny"); + break; + + case O_REJECT: + if (cmd->arg1 == ICMP_REJECT_RST) + printf("reset"); + else if (cmd->arg1 == ICMP_UNREACH_HOST) + printf("reject"); + else + print_reject_code(cmd->arg1); + break; + + case O_UNREACH6: + if (cmd->arg1 == ICMP6_UNREACH_RST) + printf("reset6"); + else + print_unreach6_code(cmd->arg1); + break; + + case O_SKIPTO: + PRINT_UINT_ARG("skipto ", cmd->arg1); + break; + + case O_PIPE: + PRINT_UINT_ARG("pipe ", cmd->arg1); + break; + + case O_QUEUE: + PRINT_UINT_ARG("queue ", cmd->arg1); + break; + + case O_DIVERT: + PRINT_UINT_ARG("divert ", cmd->arg1); + break; + + case O_TEE: + PRINT_UINT_ARG("tee ", cmd->arg1); + break; + + case O_NETGRAPH: + PRINT_UINT_ARG("netgraph ", cmd->arg1); + break; + + case O_NGTEE: + PRINT_UINT_ARG("ngtee ", cmd->arg1); + break; + + case O_FORWARD_IP: + { + ipfw_insn_sa *s = (ipfw_insn_sa *)cmd; + + if (s->sa.sin_addr.s_addr == 
INADDR_ANY) { + printf("fwd tablearg"); + } else { + printf("fwd %s", inet_ntoa(s->sa.sin_addr)); + } + if (s->sa.sin_port) + printf(",%d", s->sa.sin_port); + } + break; + +#if 0 // XXX unused yet + case O_FORWARD_IP6: + { + char buf[4 + INET6_ADDRSTRLEN + 1]; + ipfw_insn_sa6 *s = (ipfw_insn_sa6 *)cmd; + + printf("fwd %s", inet_ntop(AF_INET6, &s->sa.sin6_addr, + buf, sizeof(buf))); + if (s->sa.sin6_port) + printf(",%d", s->sa.sin6_port); + } + break; +#endif // XXX unused yet + + + case O_LOG: /* O_LOG is printed last */ + logptr = (ipfw_insn_log *)cmd; + break; + + case O_ALTQ: /* O_ALTQ is printed after O_LOG */ + altqptr = (ipfw_insn_altq *)cmd; + break; + + case O_TAG: + tagptr = cmd; + break; + + case O_NAT: + if (cmd->arg1 != 0) + PRINT_UINT_ARG("nat ", cmd->arg1); + else + printf("nat global"); + break; + + case O_SETFIB: + PRINT_UINT_ARG("setfib ", cmd->arg1); + break; + + case O_REASS: + printf("reass"); + break; + + case O_CALLRETURN: + if (cmd->len & F_NOT) + printf("return"); + else + PRINT_UINT_ARG("call ", cmd->arg1); + break; + + default: + printf("** unrecognized action %d len %d ", + cmd->opcode, cmd->len); + } + } + if (logptr) { + if (logptr->max_log > 0) + printf(" log logamount %d", logptr->max_log); + else + printf(" log"); + } +#ifndef NO_ALTQ + if (altqptr) { + print_altq_cmd(altqptr); + } +#endif + if (tagptr) { + if (tagptr->len & F_NOT) + PRINT_UINT_ARG(" untag ", tagptr->arg1); + else + PRINT_UINT_ARG(" tag ", tagptr->arg1); + } + + /* + * then print the body. 
+ */ + for (l = rule->act_ofs, cmd = rule->cmd ; + l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) { + if ((cmd->len & F_OR) || (cmd->len & F_NOT)) + continue; + if (cmd->opcode == O_IP4) { + flags |= HAVE_PROTO4; + break; + } else if (cmd->opcode == O_IP6) { + flags |= HAVE_PROTO6; + break; + } + } + if (rule->_pad & 1) { /* empty rules before options */ + if (!co.do_compact) { + show_prerequisites(&flags, HAVE_PROTO, 0); + printf(" from any to any"); + } + flags |= HAVE_IP | HAVE_OPTIONS | HAVE_PROTO | + HAVE_SRCIP | HAVE_DSTIP; + } + + if (co.comment_only) + comment = "..."; + + for (l = rule->act_ofs, cmd = rule->cmd ; + l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) { + /* useful alias */ + ipfw_insn_u32 *cmd32 = (ipfw_insn_u32 *)cmd; + + if (co.comment_only) { + if (cmd->opcode != O_NOP) + continue; + printf(" // %s\n", (char *)(cmd + 1)); + return; + } + + show_prerequisites(&flags, 0, cmd->opcode); + + switch(cmd->opcode) { + case O_PROB: + break; /* done already */ + + case O_PROBE_STATE: + break; /* no need to print anything here */ + + case O_IP_SRC: + case O_IP_SRC_LOOKUP: + case O_IP_SRC_MASK: + case O_IP_SRC_ME: + case O_IP_SRC_SET: + show_prerequisites(&flags, HAVE_PROTO, 0); + if (!(flags & HAVE_SRCIP)) + printf(" from"); + if ((cmd->len & F_OR) && !or_block) + printf(" {"); + print_ip((ipfw_insn_ip *)cmd, + (flags & HAVE_OPTIONS) ? " src-ip" : ""); + flags |= HAVE_SRCIP; + break; + + case O_IP_DST: + case O_IP_DST_LOOKUP: + case O_IP_DST_MASK: + case O_IP_DST_ME: + case O_IP_DST_SET: + show_prerequisites(&flags, HAVE_PROTO|HAVE_SRCIP, 0); + if (!(flags & HAVE_DSTIP)) + printf(" to"); + if ((cmd->len & F_OR) && !or_block) + printf(" {"); + print_ip((ipfw_insn_ip *)cmd, + (flags & HAVE_OPTIONS) ? 
" dst-ip" : ""); + flags |= HAVE_DSTIP; + break; + + case O_IP6_SRC: + case O_IP6_SRC_MASK: + case O_IP6_SRC_ME: + show_prerequisites(&flags, HAVE_PROTO, 0); + if (!(flags & HAVE_SRCIP)) + printf(" from"); + if ((cmd->len & F_OR) && !or_block) + printf(" {"); + print_ip6((ipfw_insn_ip6 *)cmd, + (flags & HAVE_OPTIONS) ? " src-ip6" : ""); + flags |= HAVE_SRCIP | HAVE_PROTO; + break; + + case O_IP6_DST: + case O_IP6_DST_MASK: + case O_IP6_DST_ME: + show_prerequisites(&flags, HAVE_PROTO|HAVE_SRCIP, 0); + if (!(flags & HAVE_DSTIP)) + printf(" to"); + if ((cmd->len & F_OR) && !or_block) + printf(" {"); + print_ip6((ipfw_insn_ip6 *)cmd, + (flags & HAVE_OPTIONS) ? " dst-ip6" : ""); + flags |= HAVE_DSTIP; + break; + + case O_FLOW6ID: + print_flow6id( (ipfw_insn_u32 *) cmd ); + flags |= HAVE_OPTIONS; + break; + + case O_IP_DSTPORT: + show_prerequisites(&flags, + HAVE_PROTO | HAVE_SRCIP | + HAVE_DSTIP | HAVE_IP, 0); + case O_IP_SRCPORT: + if (flags & HAVE_DSTIP) + flags |= HAVE_IP; + show_prerequisites(&flags, + HAVE_PROTO | HAVE_SRCIP, 0); + if ((cmd->len & F_OR) && !or_block) + printf(" {"); + if (cmd->len & F_NOT) + printf(" not"); + print_newports((ipfw_insn_u16 *)cmd, proto, + (flags & HAVE_OPTIONS) ? cmd->opcode : 0); + break; + + case O_PROTO: { + struct protoent *pe = NULL; + + if ((cmd->len & F_OR) && !or_block) + printf(" {"); + if (cmd->len & F_NOT) + printf(" not"); + proto = cmd->arg1; + pe = getprotobynumber(cmd->arg1); + if ((flags & (HAVE_PROTO4 | HAVE_PROTO6)) && + !(flags & HAVE_PROTO)) + show_prerequisites(&flags, + HAVE_PROTO | HAVE_IP | HAVE_SRCIP | + HAVE_DSTIP | HAVE_OPTIONS, 0); + if (flags & HAVE_OPTIONS) + printf(" proto"); + if (pe) + printf(" %s", pe->p_name); + else + printf(" %u", cmd->arg1); + } + flags |= HAVE_PROTO; + break; + + default: /*options ... 
*/ + if (!(cmd->len & (F_OR|F_NOT))) + if (((cmd->opcode == O_IP6) && + (flags & HAVE_PROTO6)) || + ((cmd->opcode == O_IP4) && + (flags & HAVE_PROTO4))) + break; + show_prerequisites(&flags, HAVE_PROTO | HAVE_SRCIP | + HAVE_DSTIP | HAVE_IP | HAVE_OPTIONS, 0); + if ((cmd->len & F_OR) && !or_block) + printf(" {"); + if (cmd->len & F_NOT && cmd->opcode != O_IN) + printf(" not"); + switch(cmd->opcode) { + case O_MACADDR2: { + ipfw_insn_mac *m = (ipfw_insn_mac *)cmd; + + printf(" MAC"); + print_mac(m->addr, m->mask); + print_mac(m->addr + 6, m->mask + 6); + } + break; + + case O_MAC_TYPE: + print_newports((ipfw_insn_u16 *)cmd, + IPPROTO_ETHERTYPE, cmd->opcode); + break; + + + case O_FRAG: + printf(" frag"); + break; + + case O_FIB: + printf(" fib %u", cmd->arg1 ); + break; + case O_SOCKARG: + printf(" sockarg"); + break; + + case O_IN: + printf(cmd->len & F_NOT ? " out" : " in"); + break; + + case O_DIVERTED: + switch (cmd->arg1) { + case 3: + printf(" diverted"); + break; + case 1: + printf(" diverted-loopback"); + break; + case 2: + printf(" diverted-output"); + break; + default: + printf(" diverted-?<%u>", cmd->arg1); + break; + } + break; + + case O_LAYER2: + printf(" layer2"); + break; + case O_XMIT: + case O_RECV: + case O_VIA: + { + char const *s; + ipfw_insn_if *cmdif = (ipfw_insn_if *)cmd; + + if (cmd->opcode == O_XMIT) + s = "xmit"; + else if (cmd->opcode == O_RECV) + s = "recv"; + else /* if (cmd->opcode == O_VIA) */ + s = "via"; + if (cmdif->name[0] == '\0') + printf(" %s %s", s, + inet_ntoa(cmdif->p.ip)); + else + printf(" %s %s", s, cmdif->name); + + break; + } + case O_IPID: + if (F_LEN(cmd) == 1) + printf(" ipid %u", cmd->arg1 ); + else + print_newports((ipfw_insn_u16 *)cmd, 0, + O_IPID); + break; + + case O_IPTTL: + if (F_LEN(cmd) == 1) + printf(" ipttl %u", cmd->arg1 ); + else + print_newports((ipfw_insn_u16 *)cmd, 0, + O_IPTTL); + break; + + case O_IPVER: + printf(" ipver %u", cmd->arg1 ); + break; + + case O_IPPRECEDENCE: + printf(" ipprecedence %u", 
(cmd->arg1) >> 5 ); + break; + + case O_IPLEN: + if (F_LEN(cmd) == 1) + printf(" iplen %u", cmd->arg1 ); + else + print_newports((ipfw_insn_u16 *)cmd, 0, + O_IPLEN); + break; + + case O_IPOPT: + print_flags("ipoptions", cmd, f_ipopts); + break; + + case O_IPTOS: + print_flags("iptos", cmd, f_iptos); + break; + + case O_ICMPTYPE: + print_icmptypes((ipfw_insn_u32 *)cmd); + break; + + case O_ESTAB: + printf(" established"); + break; + + case O_TCPDATALEN: + if (F_LEN(cmd) == 1) + printf(" tcpdatalen %u", cmd->arg1 ); + else + print_newports((ipfw_insn_u16 *)cmd, 0, + O_TCPDATALEN); + break; + + case O_TCPFLAGS: + print_flags("tcpflags", cmd, f_tcpflags); + break; + + case O_TCPOPTS: + print_flags("tcpoptions", cmd, f_tcpopts); + break; + + case O_TCPWIN: + printf(" tcpwin %d", ntohs(cmd->arg1)); + break; + + case O_TCPACK: + printf(" tcpack %d", ntohl(cmd32->d[0])); + break; + + case O_TCPSEQ: + printf(" tcpseq %d", ntohl(cmd32->d[0])); + break; + + case O_UID: + { + struct passwd *pwd = getpwuid(cmd32->d[0]); + + if (pwd) + printf(" uid %s", pwd->pw_name); + else + printf(" uid %u", cmd32->d[0]); + } + break; + + case O_GID: + { + struct group *grp = getgrgid(cmd32->d[0]); + + if (grp) + printf(" gid %s", grp->gr_name); + else + printf(" gid %u", cmd32->d[0]); + } + break; + + case O_JAIL: + printf(" jail %d", cmd32->d[0]); + break; + + case O_VERREVPATH: + printf(" verrevpath"); + break; + + case O_VERSRCREACH: + printf(" versrcreach"); + break; + + case O_ANTISPOOF: + printf(" antispoof"); + break; + + case O_IPSEC: + printf(" ipsec"); + break; + + case O_NOP: + comment = (char *)(cmd + 1); + break; + + case O_KEEP_STATE: + printf(" keep-state"); + break; + + case O_LIMIT: { + struct _s_x *p = limit_masks; + ipfw_insn_limit *c = (ipfw_insn_limit *)cmd; + uint8_t x = c->limit_mask; + char const *comma = " "; + + printf(" limit"); + for (; p->x != 0 ; p++) + if ((x & p->x) == p->x) { + x &= ~p->x; + printf("%s%s", comma, p->s); + comma = ","; + } + PRINT_UINT_ARG(" 
", c->conn_limit); + break; + } + + case O_IP6: + printf(" ip6"); + break; + + case O_IP4: + printf(" ip4"); + break; + + case O_ICMP6TYPE: + print_icmp6types((ipfw_insn_u32 *)cmd); + break; + + case O_EXT_HDR: + print_ext6hdr( (ipfw_insn *) cmd ); + break; + + case O_TAGGED: + if (F_LEN(cmd) == 1) + PRINT_UINT_ARG(" tagged ", cmd->arg1); + else + print_newports((ipfw_insn_u16 *)cmd, 0, + O_TAGGED); + break; + + default: + printf(" [opcode %d len %d]", + cmd->opcode, cmd->len); + } + } + if (cmd->len & F_OR) { + printf(" or"); + or_block = 1; + } else if (or_block) { + printf(" }"); + or_block = 0; + } + } + show_prerequisites(&flags, HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP + | HAVE_IP, 0); + if (comment) + printf(" // %s", comment); + printf("\n"); +} + +static void +show_dyn_ipfw(ipfw_dyn_rule *d, int pcwidth, int bcwidth) +{ + struct protoent *pe; + struct in_addr a; + uint16_t rulenum; + char buf[INET6_ADDRSTRLEN]; + + if (!co.do_expired) { + if (!d->expire && !(d->dyn_type == O_LIMIT_PARENT)) + return; + } + bcopy(&d->rule, &rulenum, sizeof(rulenum)); + printf("%05d", rulenum); + if (pcwidth > 0 || bcwidth > 0) { + printf(" "); + pr_u64(&d->pcnt, pcwidth); + pr_u64(&d->bcnt, bcwidth); + printf("(%ds)", d->expire); + } + switch (d->dyn_type) { + case O_LIMIT_PARENT: + printf(" PARENT %d", d->count); + break; + case O_LIMIT: + printf(" LIMIT"); + break; + case O_KEEP_STATE: /* bidir, no mask */ + printf(" STATE"); + break; + } + + if ((pe = getprotobynumber(d->id.proto)) != NULL) + printf(" %s", pe->p_name); + else + printf(" proto %u", d->id.proto); + + if (d->id.addr_type == 4) { + a.s_addr = htonl(d->id.src_ip); + printf(" %s %d", inet_ntoa(a), d->id.src_port); + + a.s_addr = htonl(d->id.dst_ip); + printf(" <-> %s %d", inet_ntoa(a), d->id.dst_port); + } else if (d->id.addr_type == 6) { + printf(" %s %d", inet_ntop(AF_INET6, &d->id.src_ip6, buf, + sizeof(buf)), d->id.src_port); + printf(" <-> %s %d", inet_ntop(AF_INET6, &d->id.dst_ip6, buf, + sizeof(buf)), 
d->id.dst_port); + } else + printf(" UNKNOWN <-> UNKNOWN\n"); + + printf("\n"); +} + +/* + * This one handles all set-related commands + * ipfw set { show | enable | disable } + * ipfw set swap X Y + * ipfw set move X to Y + * ipfw set move rule X to Y + */ +void +ipfw_sets_handler(char *av[]) +{ + uint32_t set_disable, masks[2]; + int i, nbytes; + uint16_t rulenum; + uint8_t cmd, new_set; + + av++; + + if (av[0] == NULL) + errx(EX_USAGE, "set needs command"); + if (_substrcmp(*av, "show") == 0) { + void *data = NULL; + char const *msg; + int nalloc; + + nalloc = nbytes = sizeof(struct ip_fw); + while (nbytes >= nalloc) { + if (data) + free(data); + nalloc = nalloc * 2 + 200; + nbytes = nalloc; + data = safe_calloc(1, nbytes); + if (do_cmd(IP_FW_GET, data, (uintptr_t)&nbytes) < 0) + err(EX_OSERR, "getsockopt(IP_FW_GET)"); + } + + bcopy(&((struct ip_fw *)data)->next_rule, + &set_disable, sizeof(set_disable)); + + for (i = 0, msg = "disable" ; i < RESVD_SET; i++) + if ((set_disable & (1< RESVD_SET) + errx(EX_DATAERR, "invalid set number %s\n", av[0]); + if (!isdigit(*(av[1])) || new_set > RESVD_SET) + errx(EX_DATAERR, "invalid set number %s\n", av[1]); + masks[0] = (4 << 24) | (new_set << 16) | (rulenum); + i = do_cmd(IP_FW_DEL, masks, sizeof(uint32_t)); + } else if (_substrcmp(*av, "move") == 0) { + av++; + if (av[0] && _substrcmp(*av, "rule") == 0) { + cmd = 2; + av++; + } else + cmd = 3; + if (av[0] == NULL || av[1] == NULL || av[2] == NULL || + av[3] != NULL || _substrcmp(av[1], "to") != 0) + errx(EX_USAGE, "syntax: set move [rule] X to Y\n"); + rulenum = atoi(av[0]); + new_set = atoi(av[2]); + if (!isdigit(*(av[0])) || (cmd == 3 && rulenum > RESVD_SET) || + (cmd == 2 && rulenum == IPFW_DEFAULT_RULE) ) + errx(EX_DATAERR, "invalid source number %s\n", av[0]); + if (!isdigit(*(av[2])) || new_set > RESVD_SET) + errx(EX_DATAERR, "invalid dest. 
set %s\n", av[1]); + masks[0] = (cmd << 24) | (new_set << 16) | (rulenum); + i = do_cmd(IP_FW_DEL, masks, sizeof(uint32_t)); + } else if (_substrcmp(*av, "disable") == 0 || + _substrcmp(*av, "enable") == 0 ) { + int which = _substrcmp(*av, "enable") == 0 ? 1 : 0; + + av++; + masks[0] = masks[1] = 0; + + while (av[0]) { + if (isdigit(**av)) { + i = atoi(*av); + if (i < 0 || i > RESVD_SET) + errx(EX_DATAERR, + "invalid set number %d\n", i); + masks[which] |= (1<= nalloc) { + nalloc = nalloc * 2 + 200; + nbytes = nalloc; + data = safe_realloc(data, nbytes); + if (do_cmd(ocmd, data, (uintptr_t)&nbytes) < 0) + err(EX_OSERR, "getsockopt(IP_%s_GET)", + co.do_pipe ? "DUMMYNET" : "FW"); + } + + /* + * Count static rules. They have variable size so we + * need to scan the list to count them. + */ + for (nstat = 1, r = data, lim = (char *)data + nbytes; + r->rulenum < IPFW_DEFAULT_RULE && (char *)r < lim; + ++nstat, r = NEXT(r) ) + ; /* nothing */ + + /* + * Count dynamic rules. This is easier as they have + * fixed size. 
+ */ + r = NEXT(r); + dynrules = (ipfw_dyn_rule *)r ; + n = (char *)r - (char *)data; + ndyn = (nbytes - n) / sizeof *dynrules; + + /* if showing stats, figure out column widths ahead of time */ + bcwidth = pcwidth = 0; + if (show_counters) { + for (n = 0, r = data; n < nstat; n++, r = NEXT(r)) { + /* skip rules from another set */ + if (co.use_set && r->set != co.use_set - 1) + continue; + + /* packet counter */ + width = pr_u64(&r->pcnt, 0); + if (width > pcwidth) + pcwidth = width; + + /* byte counter */ + width = pr_u64(&r->bcnt, 0); + if (width > bcwidth) + bcwidth = width; + } + } + if (co.do_dynamic && ndyn) { + for (n = 0, d = dynrules; n < ndyn; n++, d++) { + if (co.use_set) { + /* skip rules from another set */ + bcopy((char *)&d->rule + sizeof(uint16_t), + &set, sizeof(uint8_t)); + if (set != co.use_set - 1) + continue; + } + width = pr_u64(&d->pcnt, 0); + if (width > pcwidth) + pcwidth = width; + + width = pr_u64(&d->bcnt, 0); + if (width > bcwidth) + bcwidth = width; + } + } + /* if no rule numbers were specified, list all rules */ + if (ac == 0) { + for (n = 0, r = data; n < nstat; n++, r = NEXT(r)) { + if (co.use_set && r->set != co.use_set - 1) + continue; + show_ipfw(r, pcwidth, bcwidth); + } + + if (co.do_dynamic && ndyn) { + printf("## Dynamic rules (%d):\n", ndyn); + for (n = 0, d = dynrules; n < ndyn; n++, d++) { + if (co.use_set) { + bcopy((char *)&d->rule + sizeof(uint16_t), + &set, sizeof(uint8_t)); + if (set != co.use_set - 1) + continue; + } + show_dyn_ipfw(d, pcwidth, bcwidth); + } + } + goto done; + } + + /* display specific rules requested on command line */ + + for (lac = ac, lav = av; lac != 0; lac--) { + /* convert command line rule # */ + last = rnum = strtoul(*lav++, &endptr, 10); + if (*endptr == '-') + last = strtoul(endptr+1, &endptr, 10); + if (*endptr) { + exitval = EX_USAGE; + warnx("invalid rule number: %s", *(lav - 1)); + continue; + } + for (n = seen = 0, r = data; n < nstat; n++, r = NEXT(r) ) { + if (r->rulenum > last) + 
break; + if (co.use_set && r->set != co.use_set - 1) + continue; + if (r->rulenum >= rnum && r->rulenum <= last) { + show_ipfw(r, pcwidth, bcwidth); + seen = 1; + } + } + if (!seen) { + /* give precedence to other error(s) */ + if (exitval == EX_OK) + exitval = EX_UNAVAILABLE; + warnx("rule %lu does not exist", rnum); + } + } + + if (co.do_dynamic && ndyn) { + printf("## Dynamic rules:\n"); + for (lac = ac, lav = av; lac != 0; lac--) { + last = rnum = strtoul(*lav++, &endptr, 10); + if (*endptr == '-') + last = strtoul(endptr+1, &endptr, 10); + if (*endptr) + /* already warned */ + continue; + for (n = 0, d = dynrules; n < ndyn; n++, d++) { + uint16_t rulenum; + + bcopy(&d->rule, &rulenum, sizeof(rulenum)); + if (rulenum > rnum) + break; + if (co.use_set) { + bcopy((char *)&d->rule + sizeof(uint16_t), + &set, sizeof(uint8_t)); + if (set != co.use_set - 1) + continue; + } + if (r->rulenum >= rnum && r->rulenum <= last) + show_dyn_ipfw(d, pcwidth, bcwidth); + } + } + } + + ac = 0; + +done: + free(data); + + if (exitval != EX_OK) + exit(exitval); +#undef NEXT +} + +static int +lookup_host (char *host, struct in_addr *ipaddr) +{ + struct hostent *he; + + if (!inet_aton(host, ipaddr)) { + if ((he = gethostbyname(host)) == NULL) + return(-1); + *ipaddr = *(struct in_addr *)he->h_addr_list[0]; + } + return(0); +} + +/* + * fills the addr and mask fields in the instruction as appropriate from av. + * Update length as appropriate. + * The following formats are allowed: + * me returns O_IP_*_ME + * 1.2.3.4 single IP address + * 1.2.3.4:5.6.7.8 address:mask + * 1.2.3.4/24 address/mask + * 1.2.3.4/26{1,6,5,4,23} set of addresses in a subnet + * We can have multiple comma-separated address/mask entries. 
+ */ +static void +fill_ip(ipfw_insn_ip *cmd, char *av) +{ + int len = 0; + uint32_t *d = ((ipfw_insn_u32 *)cmd)->d; + + cmd->o.len &= ~F_LEN_MASK; /* zero len */ + + if (_substrcmp(av, "any") == 0) + return; + + if (_substrcmp(av, "me") == 0) { + cmd->o.len |= F_INSN_SIZE(ipfw_insn); + return; + } + + if (strncmp(av, "table(", 6) == 0) { + char *p = strchr(av + 6, ','); + + if (p) + *p++ = '\0'; + cmd->o.opcode = O_IP_DST_LOOKUP; + cmd->o.arg1 = strtoul(av + 6, NULL, 0); + if (p) { + cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); + d[0] = strtoul(p, NULL, 0); + } else + cmd->o.len |= F_INSN_SIZE(ipfw_insn); + return; + } + + while (av) { + /* + * After the address we can have '/' or ':' indicating a mask, + * ',' indicating another address follows, '{' indicating a + * set of addresses of unspecified size. + */ + char *t = NULL, *p = strpbrk(av, "/:,{"); + int masklen; + char md, nd = '\0'; + + if (p) { + md = *p; + *p++ = '\0'; + if ((t = strpbrk(p, ",{")) != NULL) { + nd = *t; + *t = '\0'; + } + } else + md = '\0'; + + if (lookup_host(av, (struct in_addr *)&d[0]) != 0) + errx(EX_NOHOST, "hostname ``%s'' unknown", av); + switch (md) { + case ':': + if (!inet_aton(p, (struct in_addr *)&d[1])) + errx(EX_DATAERR, "bad netmask ``%s''", p); + break; + case '/': + masklen = atoi(p); + if (masklen == 0) + d[1] = htonl(0); /* mask */ + else if (masklen > 32) + errx(EX_DATAERR, "bad width ``%s''", p); + else + d[1] = htonl(~0 << (32 - masklen)); + break; + case '{': /* no mask, assume /24 and put back the '{' */ + d[1] = htonl(~0 << (32 - 24)); + *(--p) = md; + break; + + case ',': /* single address plus continuation */ + *(--p) = md; + /* FALLTHROUGH */ + case 0: /* initialization value */ + default: + d[1] = htonl(~0); /* force /32 */ + break; + } + d[0] &= d[1]; /* mask base address with mask */ + if (t) + *t = nd; + /* find next separator */ + if (p) + p = strpbrk(p, ",{"); + if (p && *p == '{') { + /* + * We have a set of addresses. 
They are stored as follows: + * arg1 is the set size (powers of 2, 2..256) + * addr is the base address IN HOST FORMAT + * mask.. is an array of arg1 bits (rounded up to + * the next multiple of 32) with bits set + * for each host in the map. + */ + uint32_t *map = (uint32_t *)&cmd->mask; + int low, high; + int i = contigmask((uint8_t *)&(d[1]), 32); + + if (len > 0) + errx(EX_DATAERR, "address set cannot be in a list"); + if (i < 24 || i > 31) + errx(EX_DATAERR, "invalid set with mask %d\n", i); + cmd->o.arg1 = 1<<(32-i); /* map length */ + d[0] = ntohl(d[0]); /* base addr in host format */ + cmd->o.opcode = O_IP_DST_SET; /* default */ + cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32) + (cmd->o.arg1+31)/32; + for (i = 0; i < (cmd->o.arg1+31)/32 ; i++) + map[i] = 0; /* clear map */ + + av = p + 1; + low = d[0] & 0xff; + high = low + cmd->o.arg1 - 1; + /* + * Here, i stores the previous value when we specify a range + * of addresses within a mask, e.g. 45-63. i = -1 means we + * have no previous value. + */ + i = -1; /* previous value in a range */ + while (isdigit(*av)) { + char *s; + int a = strtol(av, &s, 0); + + if (s == av) { /* no parameter */ + if (*av != '}') + errx(EX_DATAERR, "set not closed\n"); + if (i != -1) + errx(EX_DATAERR, "incomplete range %d-", i); + break; + } + if (a < low || a > high) + errx(EX_DATAERR, "addr %d out of range [%d-%d]\n", + a, low, high); + a -= low; + if (i == -1) /* no previous in range */ + i = a; + else { /* check that range is valid */ + if (i > a) + errx(EX_DATAERR, "invalid range %d-%d", + i+low, a+low); + if (*s == '-') + errx(EX_DATAERR, "double '-' in range"); + } + for (; i <= a; i++) + map[i/32] |= 1<<(i & 31); + i = -1; + if (*s == '-') + i = a; + else if (*s == '}') + break; + av = s+1; + } + return; + } + av = p; + if (av) /* then *av must be a ',' */ + av++; + + /* Check this entry */ + if (d[1] == 0) { /* "any", specified as x.x.x.x/0 */ + /* + * 'any' turns the entire list into a NOP. 
+ * 'not any' never matches, so it is removed from the + * list unless it is the only item, in which case we + * report an error. + */ + if (cmd->o.len & F_NOT) { /* "not any" never matches */ + if (av == NULL && len == 0) /* only this entry */ + errx(EX_DATAERR, "not any never matches"); + } + /* else do nothing and skip this entry */ + return; + } + /* A single IP can be stored in an optimized format */ + if (d[1] == (uint32_t)~0 && av == NULL && len == 0) { + cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); + return; + } + len += 2; /* two words... */ + d += 2; + } /* end while */ + if (len + 1 > F_LEN_MASK) + errx(EX_DATAERR, "address list too long"); + cmd->o.len |= len+1; +} + + +/* n2mask sets n bits of the mask */ +void +n2mask(struct in6_addr *mask, int n) +{ + static int minimask[9] = + { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff }; + u_char *p; + + memset(mask, 0, sizeof(struct in6_addr)); + p = (u_char *) mask; + for (; n > 0; p++, n -= 8) { + if (n >= 8) + *p = 0xff; + else + *p = minimask[n]; + } + return; +} + +/* + * helper function to process a set of flags and set bits in the + * appropriate masks. 
+ */ +static void +fill_flags(ipfw_insn *cmd, enum ipfw_opcodes opcode, + struct _s_x *flags, char *p) +{ + uint8_t set=0, clear=0; + + while (p && *p) { + char *q; /* points to the separator */ + int val; + uint8_t *which; /* mask we are working on */ + + if (*p == '!') { + p++; + which = &clear; + } else + which = &set; + q = strchr(p, ','); + if (q) + *q++ = '\0'; + val = match_token(flags, p); + if (val <= 0) + errx(EX_DATAERR, "invalid flag %s", p); + *which |= (uint8_t)val; + p = q; + } + cmd->opcode = opcode; + cmd->len = (cmd->len & (F_NOT | F_OR)) | 1; + cmd->arg1 = (set & 0xff) | ( (clear & 0xff) << 8); +} + + +void +ipfw_delete(char *av[]) +{ + uint32_t rulenum; + int i; + int exitval = EX_OK; + int do_set = 0; + + av++; + NEED1("missing rule specification"); + if ( *av && _substrcmp(*av, "set") == 0) { + /* Do not allow using the following syntax: + * ipfw set N delete set M + */ + if (co.use_set) + errx(EX_DATAERR, "invalid syntax"); + do_set = 1; /* delete set */ + av++; + } + + /* Rule number */ + while (*av && isdigit(**av)) { + i = atoi(*av); av++; + if (co.do_nat) { + exitval = do_cmd(IP_FW_NAT_DEL, &i, sizeof i); + if (exitval) { + exitval = EX_UNAVAILABLE; + warn("rule %u not available", i); + } + } else if (co.do_pipe) { + exitval = ipfw_delete_pipe(co.do_pipe, i); + } else { + if (co.use_set) + rulenum = (i & 0xffff) | (5 << 24) | + ((co.use_set - 1) << 16); + else + rulenum = (i & 0xffff) | (do_set << 24); + i = do_cmd(IP_FW_DEL, &rulenum, sizeof rulenum); + if (i) { + exitval = EX_UNAVAILABLE; + warn("rule %u: setsockopt(IP_FW_DEL)", + rulenum); + } + } + } + if (exitval != EX_OK) + exit(exitval); +} + + +/* + * fill the interface structure. We do not check the name as we can + * create interfaces dynamically, so checking them at insert time + * makes relatively little sense. + * Interface names containing '*', '?', or '[' are assumed to be shell + * patterns which match interfaces. 
+ */ +static void +fill_iface(ipfw_insn_if *cmd, char *arg) +{ + cmd->name[0] = '\0'; + cmd->o.len |= F_INSN_SIZE(ipfw_insn_if); + + /* Parse the interface or address */ + if (strcmp(arg, "any") == 0) + cmd->o.len = 0; /* effectively ignore this command */ + else if (!isdigit(*arg)) { + strlcpy(cmd->name, arg, sizeof(cmd->name)); + cmd->p.glob = strpbrk(arg, "*?[") != NULL ? 1 : 0; + } else if (!inet_aton(arg, &cmd->p.ip)) + errx(EX_DATAERR, "bad ip address ``%s''", arg); +} + +static void +get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask) +{ + int i; + size_t l; + char *ap, *ptr, *optr; + struct ether_addr *mac; + const char *macset = "0123456789abcdefABCDEF:"; + + if (strcmp(p, "any") == 0) { + for (i = 0; i < ETHER_ADDR_LEN; i++) + addr[i] = mask[i] = 0; + return; + } + + optr = ptr = strdup(p); + if ((ap = strsep(&ptr, "&/")) != NULL && *ap != 0) { + l = strlen(ap); + if (strspn(ap, macset) != l || (mac = ether_aton(ap)) == NULL) + errx(EX_DATAERR, "Incorrect MAC address"); + bcopy(mac, addr, ETHER_ADDR_LEN); + } else + errx(EX_DATAERR, "Incorrect MAC address"); + + if (ptr != NULL) { /* we have mask? */ + if (p[ptr - optr - 1] == '/') { /* mask len */ + long ml = strtol(ptr, &ap, 10); + if (*ap != 0 || ml > ETHER_ADDR_LEN * 8 || ml < 0) + errx(EX_DATAERR, "Incorrect mask length"); + for (i = 0; ml > 0 && i < ETHER_ADDR_LEN; ml -= 8, i++) + mask[i] = (ml >= 8) ? 0xff: (~0) << (8 - ml); + } else { /* mask */ + l = strlen(ptr); + if (strspn(ptr, macset) != l || + (mac = ether_aton(ptr)) == NULL) + errx(EX_DATAERR, "Incorrect mask"); + bcopy(mac, mask, ETHER_ADDR_LEN); + } + } else { /* default mask: ff:ff:ff:ff:ff:ff */ + for (i = 0; i < ETHER_ADDR_LEN; i++) + mask[i] = 0xff; + } + for (i = 0; i < ETHER_ADDR_LEN; i++) + addr[i] &= mask[i]; + + free(optr); +} + +/* + * helper function, updates the pointer to cmd with the length + * of the current command, and also cleans up the first word of + * the new command in case it has been clobbered before. 
+ */ +static ipfw_insn * +next_cmd(ipfw_insn *cmd) +{ + cmd += F_LEN(cmd); + bzero(cmd, sizeof(*cmd)); + return cmd; +} + +/* + * Takes arguments and copies them into a comment + */ +static void +fill_comment(ipfw_insn *cmd, char **av) +{ + int i, l; + char *p = (char *)(cmd + 1); + + cmd->opcode = O_NOP; + cmd->len = (cmd->len & (F_NOT | F_OR)); + + /* Compute length of comment string. */ + for (i = 0, l = 0; av[i] != NULL; i++) + l += strlen(av[i]) + 1; + if (l == 0) + return; + if (l > 84) + errx(EX_DATAERR, + "comment too long (max 80 chars)"); + l = 1 + (l+3)/4; + cmd->len = (cmd->len & (F_NOT | F_OR)) | l; + for (i = 0; av[i] != NULL; i++) { + strcpy(p, av[i]); + p += strlen(av[i]); + *p++ = ' '; + } + *(--p) = '\0'; +} + +/* + * A function to fill simple commands of size 1. + * Existing flags are preserved. + */ +static void +fill_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, int flags, uint16_t arg) +{ + cmd->opcode = opcode; + cmd->len = ((cmd->len | flags) & (F_NOT | F_OR)) | 1; + cmd->arg1 = arg; +} + +/* + * Fetch and add the MAC address and type, with masks. This generates one or + * two microinstructions, and returns the pointer to the last one. 
+ */ +static ipfw_insn * +add_mac(ipfw_insn *cmd, char *av[]) +{ + ipfw_insn_mac *mac; + + if ( ( av[0] == NULL ) || ( av[1] == NULL ) ) + errx(EX_DATAERR, "MAC dst src"); + + cmd->opcode = O_MACADDR2; + cmd->len = (cmd->len & (F_NOT | F_OR)) | F_INSN_SIZE(ipfw_insn_mac); + + mac = (ipfw_insn_mac *)cmd; + get_mac_addr_mask(av[0], mac->addr, mac->mask); /* dst */ + get_mac_addr_mask(av[1], &(mac->addr[ETHER_ADDR_LEN]), + &(mac->mask[ETHER_ADDR_LEN])); /* src */ + return cmd; +} + +static ipfw_insn * +add_mactype(ipfw_insn *cmd, char *av) +{ + if (!av) + errx(EX_DATAERR, "missing MAC type"); + if (strcmp(av, "any") != 0) { /* we have a non-null type */ + fill_newports((ipfw_insn_u16 *)cmd, av, IPPROTO_ETHERTYPE); + cmd->opcode = O_MAC_TYPE; + return cmd; + } else + return NULL; +} + +static ipfw_insn * +add_proto0(ipfw_insn *cmd, char *av, u_char *protop) +{ + struct protoent *pe; + char *ep; + int proto; + + proto = strtol(av, &ep, 10); + if (*ep != '\0' || proto <= 0) { + if ((pe = getprotobyname(av)) == NULL) + return NULL; + proto = pe->p_proto; + } + + fill_cmd(cmd, O_PROTO, 0, proto); + *protop = proto; + return cmd; +} + +static ipfw_insn * +add_proto(ipfw_insn *cmd, char *av, u_char *protop) +{ + u_char proto = IPPROTO_IP; + + if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) + ; /* do not set O_IP4 nor O_IP6 */ + else if (strcmp(av, "ip4") == 0) + /* explicit "just IPv4" rule */ + fill_cmd(cmd, O_IP4, 0, 0); + else if (strcmp(av, "ip6") == 0) { + /* explicit "just IPv6" rule */ + proto = IPPROTO_IPV6; + fill_cmd(cmd, O_IP6, 0, 0); + } else + return add_proto0(cmd, av, protop); + + *protop = proto; + return cmd; +} + +static ipfw_insn * +add_proto_compat(ipfw_insn *cmd, char *av, u_char *protop) +{ + u_char proto = IPPROTO_IP; + + if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) + ; /* do not set O_IP4 nor O_IP6 */ + else if (strcmp(av, "ipv4") == 0 || strcmp(av, "ip4") == 0) + /* explicit "just IPv4" rule */ + fill_cmd(cmd, O_IP4, 0, 0); + 
else if (strcmp(av, "ipv6") == 0 || strcmp(av, "ip6") == 0) { + /* explicit "just IPv6" rule */ + proto = IPPROTO_IPV6; + fill_cmd(cmd, O_IP6, 0, 0); + } else + return add_proto0(cmd, av, protop); + + *protop = proto; + return cmd; +} + +static ipfw_insn * +add_srcip(ipfw_insn *cmd, char *av) +{ + fill_ip((ipfw_insn_ip *)cmd, av); + if (cmd->opcode == O_IP_DST_SET) /* set */ + cmd->opcode = O_IP_SRC_SET; + else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ + cmd->opcode = O_IP_SRC_LOOKUP; + else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) /* me */ + cmd->opcode = O_IP_SRC_ME; + else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) /* one IP */ + cmd->opcode = O_IP_SRC; + else /* addr/mask */ + cmd->opcode = O_IP_SRC_MASK; + return cmd; +} + +static ipfw_insn * +add_dstip(ipfw_insn *cmd, char *av) +{ + fill_ip((ipfw_insn_ip *)cmd, av); + if (cmd->opcode == O_IP_DST_SET) /* set */ + ; + else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ + ; + else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) /* me */ + cmd->opcode = O_IP_DST_ME; + else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) /* one IP */ + cmd->opcode = O_IP_DST; + else /* addr/mask */ + cmd->opcode = O_IP_DST_MASK; + return cmd; +} + +static ipfw_insn * +add_ports(ipfw_insn *cmd, char *av, u_char proto, int opcode) +{ + /* XXX "any" is trapped before. 
Perhaps "to" */ + if (_substrcmp(av, "any") == 0) { + return NULL; + } else if (fill_newports((ipfw_insn_u16 *)cmd, av, proto)) { + /* XXX todo: check that we have a protocol with ports */ + cmd->opcode = opcode; + return cmd; + } + return NULL; +} + +static ipfw_insn * +add_src(ipfw_insn *cmd, char *av, u_char proto) +{ + struct in6_addr a; + char *host, *ch; + ipfw_insn *ret = NULL; + + if ((host = strdup(av)) == NULL) + return NULL; + if ((ch = strrchr(host, '/')) != NULL) + *ch = '\0'; + + if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || + inet_pton(AF_INET6, host, &a) == 1) + ret = add_srcip6(cmd, av); + /* XXX: should check for IPv4, not !IPv6 */ + if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || + inet_pton(AF_INET6, host, &a) != 1)) + ret = add_srcip(cmd, av); + if (ret == NULL && strcmp(av, "any") != 0) + ret = cmd; + + free(host); + return ret; +} + +static ipfw_insn * +add_dst(ipfw_insn *cmd, char *av, u_char proto) +{ + struct in6_addr a; + char *host, *ch; + ipfw_insn *ret = NULL; + + if ((host = strdup(av)) == NULL) + return NULL; + if ((ch = strrchr(host, '/')) != NULL) + *ch = '\0'; + + if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || + inet_pton(AF_INET6, host, &a) == 1) + ret = add_dstip6(cmd, av); + /* XXX: should check for IPv4, not !IPv6 */ + if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || + inet_pton(AF_INET6, host, &a) != 1)) + ret = add_dstip(cmd, av); + if (ret == NULL && strcmp(av, "any") != 0) + ret = cmd; + + free(host); + return ret; +} + +/* + * Parse arguments and assemble the microinstructions which make up a rule. + * Rules are added into the 'rulebuf' and then copied in the correct order + * into the actual rule. + * + * The syntax for a rule starts with the action, followed by + * optional action parameters, and the various match patterns. 
+ * In the assembled microcode, the first opcode must be an O_PROBE_STATE
+ * (generated if the rule includes a keep-state option), then the
+ * various match patterns, log/altq actions, and the actual action.
+ *
+ */
+void
+ipfw_add(char *av[])
+{
+	/*
+	 * rules are added into the 'rulebuf' and then copied in
+	 * the correct order into the actual rule.
+	 * Some things that need to go out of order (prob, action etc.)
+	 * go into actbuf[].
+	 *
+	 * av is walked as a NULL-terminated vector: av[0] is skipped and
+	 * the following words are consumed in the order of the section
+	 * comments below. Syntax errors end in errx(). The buffers are
+	 * static and are zeroed at the start of every call. The finished
+	 * rule is passed to do_cmd(IP_FW_ADD, ...) and, unless co.do_quiet
+	 * is set, printed with show_ipfw().
+	 */
+	static uint32_t rulebuf[255], actbuf[255], cmdbuf[255];
+
+	ipfw_insn *src, *dst, *cmd, *action, *prev=NULL;
+	ipfw_insn *first_cmd;	/* first match pattern */
+
+	struct ip_fw *rule;
+
+	/*
+	 * various flags used to record that we entered some fields.
+	 */
+	ipfw_insn *have_state = NULL;	/* check-state or keep-state */
+	ipfw_insn *have_log = NULL, *have_altq = NULL, *have_tag = NULL;
+	size_t len;
+
+	int i;
+
+	int open_par = 0;	/* open parenthesis ( */
+
+	/* proto is here because it is used to fetch ports */
+	u_char proto = IPPROTO_IP;	/* default protocol */
+
+	double match_prob = 1; /* match probability, default is always match */
+
+	bzero(actbuf, sizeof(actbuf));		/* actions go here */
+	bzero(cmdbuf, sizeof(cmdbuf));
+	bzero(rulebuf, sizeof(rulebuf));
+
+	rule = (struct ip_fw *)rulebuf;
+	cmd = (ipfw_insn *)cmdbuf;
+	action = (ipfw_insn *)actbuf;
+
+	av++;	/* skip av[0] (presumably the command word -- confirm in caller) */
+
+	/* [rule N]	-- Rule number optional */
+	if (av[0] && isdigit(**av)) {
+		rule->rulenum = atoi(*av);
+		av++;
+	}
+
+	/* [set N]	-- set number (0..RESVD_SET), optional */
+	if (av[0] && av[1] && _substrcmp(*av, "set") == 0) {
+		int set = strtoul(av[1], NULL, 10);
+		if (set < 0 || set > RESVD_SET)
+			errx(EX_DATAERR, "illegal set %s", av[1]);
+		rule->set = set;
+		av += 2;
+	}
+
+	/* [prob D]	-- match probability, optional */
+	if (av[0] && av[1] && _substrcmp(*av, "prob") == 0) {
+		match_prob = strtod(av[1], NULL);
+
+		if (match_prob <= 0 || match_prob > 1)
+			errx(EX_DATAERR, "illegal match prob. %s", av[1]);
+		av += 2;
+	}
+
+	/* action	-- mandatory */
+	NEED1("missing action");
+	i = match_token(rule_actions, *av);
+	av++;
+	action->len = 1;	/* default */
+	switch(i) {
+	case TOK_CHECKSTATE:
+		have_state = action;
+		action->opcode = O_CHECK_STATE;
+		break;
+
+	case TOK_ACCEPT:
+		action->opcode = O_ACCEPT;
+		break;
+
+	case TOK_DENY:
+		action->opcode = O_DENY;
+		action->arg1 = 0;
+		break;
+
+	case TOK_REJECT:
+		action->opcode = O_REJECT;
+		action->arg1 = ICMP_UNREACH_HOST;
+		break;
+
+	case TOK_RESET:
+		action->opcode = O_REJECT;
+		action->arg1 = ICMP_REJECT_RST;
+		break;
+
+	case TOK_RESET6:
+		action->opcode = O_UNREACH6;
+		action->arg1 = ICMP6_UNREACH_RST;
+		break;
+
+	case TOK_UNREACH:
+		action->opcode = O_REJECT;
+		NEED1("missing reject code");
+		fill_reject_code(&action->arg1, *av);
+		av++;
+		break;
+
+	case TOK_UNREACH6:
+		action->opcode = O_UNREACH6;
+		NEED1("missing unreach code");
+		fill_unreach6_code(&action->arg1, *av);
+		av++;
+		break;
+
+	case TOK_COUNT:
+		action->opcode = O_COUNT;
+		break;
+
+	case TOK_NAT:
+		action->opcode = O_NAT;
+		action->len = F_INSN_SIZE(ipfw_insn_nat);
+		goto chkarg;
+
+	case TOK_QUEUE:
+		action->opcode = O_QUEUE;
+		goto chkarg;
+	case TOK_PIPE:
+		action->opcode = O_PIPE;
+		goto chkarg;
+	case TOK_SKIPTO:
+		action->opcode = O_SKIPTO;
+		goto chkarg;
+	case TOK_NETGRAPH:
+		action->opcode = O_NETGRAPH;
+		goto chkarg;
+	case TOK_NGTEE:
+		action->opcode = O_NGTEE;
+		goto chkarg;
+	case TOK_DIVERT:
+		action->opcode = O_DIVERT;
+		goto chkarg;
+	case TOK_TEE:
+		action->opcode = O_TEE;
+		goto chkarg;
+	case TOK_CALL:
+		action->opcode = O_CALLRETURN;
+chkarg:	/* common argument check for the actions that jump or fall in here */
+		if (!av[0])
+			errx(EX_USAGE, "missing argument for %s", *(av - 1));
+		if (isdigit(**av)) {
+			action->arg1 = strtoul(*av, NULL, 10);
+			if (action->arg1 <= 0 || action->arg1 >= IP_FW_TABLEARG)
+				errx(EX_DATAERR, "illegal argument for %s",
+				    *(av - 1));
+		} else if (_substrcmp(*av, "tablearg") == 0) {
+			action->arg1 = IP_FW_TABLEARG;
+		} else if (i == TOK_DIVERT || i == TOK_TEE) {
+			struct servent *s;
+			setservent(1);
+			s = getservbyname(av[0], "divert");
+			if (s != NULL)
+				action->arg1 = ntohs(s->s_port);
+			else
+				errx(EX_DATAERR, "illegal divert/tee port");
+		} else
+			errx(EX_DATAERR, "illegal argument for %s", *(av - 1));
+		av++;
+		break;
+
+	case TOK_FORWARD: {
+		ipfw_insn_sa *p = (ipfw_insn_sa *)action;
+		char *s, *end;
+
+		NEED1("missing forward address[:port]");
+
+		action->opcode = O_FORWARD_IP;
+		action->len = F_INSN_SIZE(ipfw_insn_sa);
+
+		/*
+		 * In the kernel we assume AF_INET and use only
+		 * sin_port and sin_addr. Remember to set sin_len as
+		 * the routing code seems to use it too.
+		 */
+		p->sa.sin_family = AF_INET;
+		p->sa.sin_len = sizeof(struct sockaddr_in);
+		p->sa.sin_port = 0;
+		/*
+		 * locate the address-port separator (':' or ',')
+		 */
+		s = strchr(*av, ':');
+		if (s == NULL)
+			s = strchr(*av, ',');
+		if (s != NULL) {
+			*(s++) = '\0';
+			i = strtoport(s, &end, 0 /* base */, 0 /* proto */);
+			if (s == end)
+				errx(EX_DATAERR,
+				    "illegal forwarding port ``%s''", s);
+			p->sa.sin_port = (u_short)i;
+		}
+		if (_substrcmp(*av, "tablearg") == 0)
+			p->sa.sin_addr.s_addr = INADDR_ANY;
+		else
+			lookup_host(*av, &(p->sa.sin_addr));
+		av++;
+		break;
+	    }
+	case TOK_COMMENT:
+		/* pretend it is a 'count' rule followed by the comment */
+		action->opcode = O_COUNT;
+		av--;		/* go back... so the same word is parsed again below */
+		break;
+
+	case TOK_SETFIB:
+	    {
+		int numfibs;
+		size_t intsize = sizeof(int);
+
+		action->opcode = O_SETFIB;
+		NEED1("missing fib number");
+		action->arg1 = strtoul(*av, NULL, 10);
+		if (sysctlbyname("net.fibs", &numfibs, &intsize, NULL, 0) == -1)
+			errx(EX_DATAERR, "fibs not suported.\n");
+		if (action->arg1 >= numfibs)  /* Temporary */
+			errx(EX_DATAERR, "fib too large.\n");
+		av++;
+		break;
+	    }
+
+	case TOK_REASS:
+		action->opcode = O_REASS;
+		break;
+
+	case TOK_RETURN:
+		fill_cmd(action, O_CALLRETURN, F_NOT, 0);
+		break;
+
+	default:
+		errx(EX_DATAERR, "invalid action %s\n", av[-1]);
+	}
+	action = next_cmd(action);
+
+	/*
+	 * [altq queuename] -- altq tag, optional
+	 * [log [logamount N]]	-- log, optional
+	 *
+	 * If they exist, it go first in the cmdbuf, but then it is
+	 * skipped in the copy section to the end of the buffer.
+	 */
+	while (av[0] != NULL && (i = match_token(rule_action_params, *av)) != -1) {
+		av++;
+		switch (i) {
+		case TOK_LOG:
+		    {
+			ipfw_insn_log *c = (ipfw_insn_log *)cmd;
+			int l;
+
+			if (have_log)
+				errx(EX_DATAERR,
+				    "log cannot be specified more than once");
+			have_log = (ipfw_insn *)c;
+			cmd->len = F_INSN_SIZE(ipfw_insn_log);
+			cmd->opcode = O_LOG;
+			if (av[0] && _substrcmp(*av, "logamount") == 0) {
+				av++;
+				NEED1("logamount requires argument");
+				l = atoi(*av);
+				if (l < 0)
+					errx(EX_DATAERR,
+					    "logamount must be positive");
+				c->max_log = l;
+				av++;
+			} else {
+				len = sizeof(c->max_log);
+				if (sysctlbyname("net.inet.ip.fw.verbose_limit",
+				    &c->max_log, &len, NULL, 0) == -1)
+					errx(1, "sysctlbyname(\"%s\")",
+					    "net.inet.ip.fw.verbose_limit");
+			}
+		    }
+			break;
+
+#ifndef NO_ALTQ
+		case TOK_ALTQ:
+		    {
+			ipfw_insn_altq *a = (ipfw_insn_altq *)cmd;
+
+			NEED1("missing altq queue name");
+			if (have_altq)
+				errx(EX_DATAERR,
+				    "altq cannot be specified more than once");
+			have_altq = (ipfw_insn *)a;
+			cmd->len = F_INSN_SIZE(ipfw_insn_altq);
+			cmd->opcode = O_ALTQ;
+			a->qid = altq_name_to_qid(*av);
+			av++;
+		    }
+			break;
+#endif
+
+		case TOK_TAG:
+		case TOK_UNTAG: {
+			uint16_t tag;
+
+			if (have_tag)
+				errx(EX_USAGE, "tag and untag cannot be "
+				    "specified more than once");
+			GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX, i,
+			   rule_action_params);
+			have_tag = cmd;
+			fill_cmd(cmd, O_TAG, (i == TOK_TAG) ? 0: F_NOT, tag);
+			av++;
+			break;
+		}
+
+		default:
+			abort();
+		}
+		cmd = next_cmd(cmd);
+	}
+
+	if (have_state)	/* must be a check-state, we are done */
+		goto done;
+	/*
+	 * Helper macros for the "proto from src to dst" part. They expand
+	 * in place and use av, cmd, prev and open_par of this function:
+	 *   OR_START(target)  consume a leading "(" or "{" (nesting is an
+	 *                     error) and define the label 'target';
+	 *   CLOSE_PAR         if a block is open, require and consume the
+	 *                     closing ")" or "}";
+	 *   NOT_BLOCK         consume "not" and set F_NOT in cmd->len;
+	 *   OR_BLOCK(target)  on "or" set F_OR in prev->len and jump back
+	 *                     to 'target', otherwise expand CLOSE_PAR.
+	 */
+#define OR_START(target)					\
+	if (av[0] && (*av[0] == '(' || *av[0] == '{')) { 	\
+		if (open_par)					\
+			errx(EX_USAGE, "nested \"(\" not allowed\n"); \
+		prev = NULL;					\
+		open_par = 1;					\
+		if ( (av[0])[1] == '\0') {			\
+			av++;					\
+		} else						\
+			(*av)++;				\
+	}							\
+	target:							\
+
+
+#define	CLOSE_PAR						\
+	if (open_par) {						\
+		if (av[0] && (					\
+		    strcmp(*av, ")") == 0 ||			\
+		    strcmp(*av, "}") == 0)) {			\
+			prev = NULL;				\
+			open_par = 0;				\
+			av++;					\
+		} else						\
+			errx(EX_USAGE, "missing \")\"\n");	\
+	}
+
+#define NOT_BLOCK						\
+	if (av[0] && _substrcmp(*av, "not") == 0) {		\
+		if (cmd->len & F_NOT)				\
+			errx(EX_USAGE, "double \"not\" not allowed\n"); \
+		cmd->len |= F_NOT;				\
+		av++;						\
+	}
+
+#define OR_BLOCK(target)					\
+	if (av[0] && _substrcmp(*av, "or") == 0) {		\
+		if (prev == NULL || open_par == 0)		\
+			errx(EX_DATAERR, "invalid OR block");	\
+		prev->len |= F_OR;				\
+		av++;					\
+		goto target;				\
+	}						\
+	CLOSE_PAR;
+
+	first_cmd = cmd;
+
+#if 0
+	/*
+	 * MAC addresses, optional.
+	 * If we have this, we skip the part "proto from src to dst"
+	 * and jump straight to the option parsing.
+	 */
+	NOT_BLOCK;
+	NEED1("missing protocol");
+	if (_substrcmp(*av, "MAC") == 0 ||
+	    _substrcmp(*av, "mac") == 0) {
+		av++;			/* the "MAC" keyword */
+		add_mac(cmd, av);	/* exits in case of errors */
+		cmd = next_cmd(cmd);
+		av += 2;		/* dst-mac and src-mac */
+		NOT_BLOCK;
+		NEED1("missing mac type");
+		if (add_mactype(cmd, av[0]))
+			cmd = next_cmd(cmd);
+		av++;			/* any or mac-type */
+		goto read_options;
+	}
+#endif
+
+	/*
+	 * protocol, mandatory
+	 */
+    OR_START(get_proto);
+	NOT_BLOCK;
+	NEED1("missing protocol");
+	if (add_proto_compat(cmd, *av, &proto)) {
+		av++;
+		if (F_LEN(cmd) != 0) {
+			prev = cmd;
+			cmd = next_cmd(cmd);
+		}
+	} else if (first_cmd != cmd) {
+		errx(EX_DATAERR, "invalid protocol ``%s''", *av);
+	} else
+		goto read_options;
+    OR_BLOCK(get_proto);
+
+	/*
+	 * "from", mandatory
+	 */
+	if ((av[0] == NULL) || _substrcmp(*av, "from") != 0)
+		errx(EX_USAGE, "missing ``from''");
+	av++;
+
+	/*
+	 * source IP, mandatory
+	 */
+    OR_START(source_ip);
+	NOT_BLOCK;	/* optional "not" */
+	NEED1("missing source address");
+	if (add_src(cmd, *av, proto)) {
+		av++;
+		if (F_LEN(cmd) != 0) {	/* ! any */
+			prev = cmd;
+			cmd = next_cmd(cmd);
+		}
+	} else
+		errx(EX_USAGE, "bad source address %s", *av);
+    OR_BLOCK(source_ip);
+
+	/*
+	 * source ports, optional
+	 */
+	NOT_BLOCK;	/* optional "not" */
+	if ( av[0] != NULL ) {
+		if (_substrcmp(*av, "any") == 0 ||
+		    add_ports(cmd, *av, proto, O_IP_SRCPORT)) {
+			av++;
+			if (F_LEN(cmd) != 0)
+				cmd = next_cmd(cmd);
+		}
+	}
+
+	/*
+	 * "to", mandatory
+	 */
+	if ( (av[0] == NULL) || _substrcmp(*av, "to") != 0 )
+		errx(EX_USAGE, "missing ``to''");
+	av++;
+
+	/*
+	 * destination, mandatory
+	 */
+    OR_START(dest_ip);
+	NOT_BLOCK;	/* optional "not" */
+	NEED1("missing dst address");
+	if (add_dst(cmd, *av, proto)) {
+		av++;
+		if (F_LEN(cmd) != 0) {	/* ! any */
+			prev = cmd;
+			cmd = next_cmd(cmd);
+		}
+	} else
+		errx( EX_USAGE, "bad destination address %s", *av);
+    OR_BLOCK(dest_ip);
+
+	/*
+	 * dest. ports, optional
+	 */
+	NOT_BLOCK;	/* optional "not" */
+	if (av[0]) {
+		if (_substrcmp(*av, "any") == 0 ||
+		    add_ports(cmd, *av, proto, O_IP_DSTPORT)) {
+			av++;
+			if (F_LEN(cmd) != 0)
+				cmd = next_cmd(cmd);
+		}
+	}
+
+read_options:
+	if (av[0] && first_cmd == cmd) {
+		/*
+		 * nothing specified so far, store in the rule to ease
+		 * printout later.
+		 */
+		 rule->_pad = 1;
+	}
+	prev = NULL;
+	while ( av[0] != NULL ) {
+		char *s;
+		ipfw_insn_u32 *cmd32;	/* alias for cmd */
+
+		s = *av;
+		cmd32 = (ipfw_insn_u32 *)cmd;
+
+		if (*s == '!') {	/* alternate syntax for NOT */
+			if (cmd->len & F_NOT)
+				errx(EX_USAGE, "double \"not\" not allowed\n");
+			cmd->len = F_NOT;
+			s++;
+		}
+		i = match_token(rule_options, s);
+		av++;
+		switch(i) {
+		case TOK_NOT:
+			if (cmd->len & F_NOT)
+				errx(EX_USAGE, "double \"not\" not allowed\n");
+			cmd->len = F_NOT;
+			break;
+
+		case TOK_OR:
+			if (open_par == 0 || prev == NULL)
+				errx(EX_USAGE, "invalid \"or\" block\n");
+			prev->len |= F_OR;
+			break;
+
+		case TOK_STARTBRACE:
+			if (open_par)
+				errx(EX_USAGE, "+nested \"(\" not allowed\n");
+			open_par = 1;
+			break;
+
+		case TOK_ENDBRACE:
+			if (!open_par)
+				errx(EX_USAGE, "+missing \")\"\n");
+			open_par = 0;
+			prev = NULL;
+			break;
+
+		case TOK_IN:
+			fill_cmd(cmd, O_IN, 0, 0);
+			break;
+
+		case TOK_OUT:
+			cmd->len ^= F_NOT; /* toggle F_NOT */
+			fill_cmd(cmd, O_IN, 0, 0);
+			break;
+
+		case TOK_DIVERTED:
+			fill_cmd(cmd, O_DIVERTED, 0, 3);
+			break;
+
+		case TOK_DIVERTEDLOOPBACK:
+			fill_cmd(cmd, O_DIVERTED, 0, 1);
+			break;
+
+		case TOK_DIVERTEDOUTPUT:
+			fill_cmd(cmd, O_DIVERTED, 0, 2);
+			break;
+
+		case TOK_FRAG:
+			fill_cmd(cmd, O_FRAG, 0, 0);
+			break;
+
+		case TOK_LAYER2:
+			fill_cmd(cmd, O_LAYER2, 0, 0);
+			break;
+
+		case TOK_XMIT:
+		case TOK_RECV:
+		case TOK_VIA:
+			NEED1("recv, xmit, via require interface name"
+				" or address");
+			fill_iface((ipfw_insn_if *)cmd, av[0]);
+			av++;
+			if (F_LEN(cmd) == 0)	/* not a valid address */
+				break;
+			if (i == TOK_XMIT)
+				cmd->opcode = O_XMIT;
+			else if (i == TOK_RECV)
+				cmd->opcode = O_RECV;
+			else if (i == TOK_VIA)
+				cmd->opcode = O_VIA;
+			break;
+
+		case TOK_ICMPTYPES:
+			NEED1("icmptypes requires list of types");
+			fill_icmptypes((ipfw_insn_u32 *)cmd, *av);
+			av++;
+			break;
+
+		case TOK_ICMP6TYPES:
+			NEED1("icmptypes requires list of types");
+			fill_icmp6types((ipfw_insn_icmp6 *)cmd, *av);
+			av++;
+			break;
+
+		case TOK_IPTTL:
+			NEED1("ipttl requires TTL");
+			if (strpbrk(*av, "-,")) {
+			    if (!add_ports(cmd, *av, 0, O_IPTTL))
+				errx(EX_DATAERR, "invalid ipttl %s", *av);
+			} else
+			    fill_cmd(cmd, O_IPTTL, 0, strtoul(*av, NULL, 0));
+			av++;
+			break;
+
+		case TOK_IPID:
+			NEED1("ipid requires id");
+			if (strpbrk(*av, "-,")) {
+			    if (!add_ports(cmd, *av, 0, O_IPID))
+				errx(EX_DATAERR, "invalid ipid %s", *av);
+			} else
+			    fill_cmd(cmd, O_IPID, 0, strtoul(*av, NULL, 0));
+			av++;
+			break;
+
+		case TOK_IPLEN:
+			NEED1("iplen requires length");
+			if (strpbrk(*av, "-,")) {
+			    if (!add_ports(cmd, *av, 0, O_IPLEN))
+				errx(EX_DATAERR, "invalid ip len %s", *av);
+			} else
+			    fill_cmd(cmd, O_IPLEN, 0, strtoul(*av, NULL, 0));
+			av++;
+			break;
+
+		case TOK_IPVER:
+			NEED1("ipver requires version");
+			fill_cmd(cmd, O_IPVER, 0, strtoul(*av, NULL, 0));
+			av++;
+			break;
+
+		case TOK_IPPRECEDENCE:
+			NEED1("ipprecedence requires value");
+			fill_cmd(cmd, O_IPPRECEDENCE, 0,
+			    (strtoul(*av, NULL, 0) & 7) << 5);
+			av++;
+			break;
+
+		case TOK_IPOPTS:
+			NEED1("missing argument for ipoptions");
+			fill_flags(cmd, O_IPOPT, f_ipopts, *av);
+			av++;
+			break;
+
+		case TOK_IPTOS:
+			NEED1("missing argument for iptos");
+			fill_flags(cmd, O_IPTOS, f_iptos, *av);
+			av++;
+			break;
+
+		case TOK_UID:
+			NEED1("uid requires argument");
+		    {
+			char *end;
+			uid_t uid;
+			struct passwd *pwd;
+
+			cmd->opcode = O_UID;
+			uid = strtoul(*av, &end, 0);
+			pwd = (*end == '\0') ? getpwuid(uid) : getpwnam(*av);
+			if (pwd == NULL)
+				errx(EX_DATAERR, "uid \"%s\" nonexistent", *av);
+			cmd32->d[0] = pwd->pw_uid;
+			cmd->len |= F_INSN_SIZE(ipfw_insn_u32);
+			av++;
+		    }
+			break;
+
+		case TOK_GID:
+			NEED1("gid requires argument");
+		    {
+			char *end;
+			gid_t gid;
+			struct group *grp;
+
+			cmd->opcode = O_GID;
+			gid = strtoul(*av, &end, 0);
+			grp = (*end == '\0') ? getgrgid(gid) : getgrnam(*av);
+			if (grp == NULL)
+				errx(EX_DATAERR, "gid \"%s\" nonexistent", *av);
+			cmd32->d[0] = grp->gr_gid;
+			cmd->len |= F_INSN_SIZE(ipfw_insn_u32);
+			av++;
+		    }
+			break;
+
+		case TOK_JAIL:
+			NEED1("jail requires argument");
+		    {
+			char *end;
+			int jid;
+
+			cmd->opcode = O_JAIL;
+			jid = (int)strtol(*av, &end, 0);
+			if (jid < 0 || *end != '\0')
+				errx(EX_DATAERR, "jail requires prison ID");
+			cmd32->d[0] = (uint32_t)jid;
+			cmd->len |= F_INSN_SIZE(ipfw_insn_u32);
+			av++;
+		    }
+			break;
+
+		case TOK_ESTAB:
+			fill_cmd(cmd, O_ESTAB, 0, 0);
+			break;
+
+		case TOK_SETUP:
+			fill_cmd(cmd, O_TCPFLAGS, 0,
+				(TH_SYN) | ( (TH_ACK) & 0xff) <<8 );	/* TH_SYN in the low byte, TH_ACK in the high byte; presumably "SYN set, ACK clear" -- confirm */
+			break;
+
+		case TOK_TCPDATALEN:
+			NEED1("tcpdatalen requires length");
+			if (strpbrk(*av, "-,")) {
+			    if (!add_ports(cmd, *av, 0, O_TCPDATALEN))
+				errx(EX_DATAERR, "invalid tcpdata len %s", *av);
+			} else
+			    fill_cmd(cmd, O_TCPDATALEN, 0,
+				    strtoul(*av, NULL, 0));
+			av++;
+			break;
+
+		case TOK_TCPOPTS:
+			NEED1("missing argument for tcpoptions");
+			fill_flags(cmd, O_TCPOPTS, f_tcpopts, *av);
+			av++;
+			break;
+
+		case TOK_TCPSEQ:
+		case TOK_TCPACK:
+			NEED1("tcpseq/tcpack requires argument");
+			cmd->len = F_INSN_SIZE(ipfw_insn_u32);
+			cmd->opcode = (i == TOK_TCPSEQ) ? O_TCPSEQ : O_TCPACK;
+			cmd32->d[0] = htonl(strtoul(*av, NULL, 0));
+			av++;
+			break;
+
+		case TOK_TCPWIN:
+			NEED1("tcpwin requires length");
+			fill_cmd(cmd, O_TCPWIN, 0,
+			    htons(strtoul(*av, NULL, 0)));
+			av++;
+			break;
+
+		case TOK_TCPFLAGS:
+			NEED1("missing argument for tcpflags");
+			cmd->opcode = O_TCPFLAGS;
+			fill_flags(cmd, O_TCPFLAGS, f_tcpflags, *av);
+			av++;
+			break;
+
+		case TOK_KEEPSTATE:
+			if (open_par)
+				errx(EX_USAGE, "keep-state cannot be part "
+				    "of an or block");
+			if (have_state)
+				errx(EX_USAGE, "only one of keep-state "
+					"and limit is allowed");
+			have_state = cmd;
+			fill_cmd(cmd, O_KEEP_STATE, 0, 0);
+			break;
+
+		case TOK_LIMIT: {
+			ipfw_insn_limit *c = (ipfw_insn_limit *)cmd;
+			int val;
+
+			if (open_par)
+				errx(EX_USAGE,
+				    "limit cannot be part of an or block");
+			if (have_state)
+				errx(EX_USAGE, "only one of keep-state and "
+				    "limit is allowed");
+			have_state = cmd;
+
+			cmd->len = F_INSN_SIZE(ipfw_insn_limit);
+			cmd->opcode = O_LIMIT;
+			c->limit_mask = c->conn_limit = 0;
+
+			while ( av[0] != NULL ) {
+				if ((val = match_token(limit_masks, *av)) <= 0)
+					break;
+				c->limit_mask |= val;
+				av++;
+			}
+
+			if (c->limit_mask == 0)
+				errx(EX_USAGE, "limit: missing limit mask");
+
+			GET_UINT_ARG(c->conn_limit, IPFW_ARG_MIN, IPFW_ARG_MAX,
+			    TOK_LIMIT, rule_options);
+
+			av++;
+			break;
+		}
+
+		case TOK_PROTO:
+			NEED1("missing protocol");
+			if (add_proto(cmd, *av, &proto)) {
+				av++;
+			} else
+				errx(EX_DATAERR, "invalid protocol ``%s''",
+				    *av);
+			break;
+
+		case TOK_SRCIP:
+			NEED1("missing source IP");
+			if (add_srcip(cmd, *av)) {
+				av++;
+			}
+			break;
+
+		case TOK_DSTIP:
+			NEED1("missing destination IP");
+			if (add_dstip(cmd, *av)) {
+				av++;
+			}
+			break;
+
+		case TOK_SRCIP6:
+			NEED1("missing source IP6");
+			if (add_srcip6(cmd, *av)) {
+				av++;
+			}
+			break;
+
+		case TOK_DSTIP6:
+			NEED1("missing destination IP6");
+			if (add_dstip6(cmd, *av)) {
+				av++;
+			}
+			break;
+
+		case TOK_SRCPORT:
+			NEED1("missing source port");
+			if (_substrcmp(*av, "any") == 0 ||
+			    add_ports(cmd, *av, proto, O_IP_SRCPORT)) {
+				av++;
+			} else
+				errx(EX_DATAERR, "invalid source port %s", *av);
+			break;
+
+		case TOK_DSTPORT:
+			NEED1("missing destination port");
+			if (_substrcmp(*av, "any") == 0 ||
+			    add_ports(cmd, *av, proto, O_IP_DSTPORT)) {
+				av++;
+			} else
+				errx(EX_DATAERR, "invalid destination port %s",
+				    *av);
+			break;
+
+		case TOK_MAC:
+			if (add_mac(cmd, av))
+				av += 2;
+			break;
+
+		case TOK_MACTYPE:
+			NEED1("missing mac type");
+			if (!add_mactype(cmd, *av))
+				errx(EX_DATAERR, "invalid mac type %s", *av);
+			av++;
+			break;
+
+		case TOK_VERREVPATH:
+			fill_cmd(cmd, O_VERREVPATH, 0, 0);
+			break;
+
+		case TOK_VERSRCREACH:
+			fill_cmd(cmd, O_VERSRCREACH, 0, 0);
+			break;
+
+		case TOK_ANTISPOOF:
+			fill_cmd(cmd, O_ANTISPOOF, 0, 0);
+			break;
+
+		case TOK_IPSEC:
+			fill_cmd(cmd, O_IPSEC, 0, 0);
+			break;
+
+		case TOK_IPV6:
+			fill_cmd(cmd, O_IP6, 0, 0);
+			break;
+
+		case TOK_IPV4:
+			fill_cmd(cmd, O_IP4, 0, 0);
+			break;
+
+		case TOK_EXT6HDR:
+			fill_ext6hdr( cmd, *av );	/* NOTE(review): no NEED1() here, *av may be NULL if this is the last word -- confirm fill_ext6hdr copes */
+			av++;
+			break;
+
+		case TOK_FLOWID:
+			if (proto != IPPROTO_IPV6 )
+				errx( EX_USAGE, "flow-id filter is active "
+				    "only for ipv6 protocol\n");
+			fill_flow6( (ipfw_insn_u32 *) cmd, *av );	/* NOTE(review): no NEED1() here either -- confirm */
+			av++;
+			break;
+
+		case TOK_COMMENT:
+			fill_comment(cmd, av);
+			av[0]=NULL;	/* the comment takes all remaining words: end the loop */
+			break;
+
+		case TOK_TAGGED:
+			if (av[0] && strpbrk(*av, "-,")) {
+				if (!add_ports(cmd, *av, 0, O_TAGGED))
+					errx(EX_DATAERR, "tagged: invalid tag"
+					    " list: %s", *av);
+			}
+			else {
+				uint16_t tag;
+
+				GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX,
+				    TOK_TAGGED, rule_options);
+				fill_cmd(cmd, O_TAGGED, 0, tag);
+			}
+			av++;
+			break;
+
+		case TOK_FIB:
+			NEED1("fib requires fib number");
+			fill_cmd(cmd, O_FIB, 0, strtoul(*av, NULL, 0));
+			av++;
+			break;
+		case TOK_SOCKARG:
+			fill_cmd(cmd, O_SOCKARG, 0, 0);
+			break;
+
+		case TOK_LOOKUP: {
+			ipfw_insn_u32 *c = (ipfw_insn_u32 *)cmd;
+			char *p;
+			int j;
+
+			if (!av[0] || !av[1])
+				errx(EX_USAGE, "format: lookup argument tablenum");
+			cmd->opcode = O_IP_DST_LOOKUP;
+			cmd->len |= F_INSN_SIZE(ipfw_insn) + 2;
+			i = match_token(rule_options, *av);
+			for (j = 0; lookup_key[j] >= 0 ; j++) {
+				if (i == lookup_key[j])
+					break;
+			}
+			if (lookup_key[j] <= 0)
+				errx(EX_USAGE, "format: cannot lookup on %s", *av);
+			__PAST_END(c->d, 1) = j; // i converted to option
+			av++;
+			cmd->arg1 = strtoul(*av, &p, 0);
+			if (p && *p)
+				errx(EX_USAGE, "format: lookup argument tablenum");
+			av++;
+		    }
+			break;
+
+		default:
+			errx(EX_USAGE, "unrecognised option [%d] %s\n", i, s);
+		}
+		if (F_LEN(cmd) > 0) {	/* prepare to advance */
+			prev = cmd;
+			cmd = next_cmd(cmd);
+		}
+	}
+
+done:
+	/*
+	 * Now copy stuff into the rule.
+	 * If we have a keep-state option, the first instruction
+	 * must be a PROBE_STATE (which is generated here).
+	 * If we have a LOG option, it was stored as the first command,
+	 * and now must be moved to the top of the action part.
+	 */
+	dst = (ipfw_insn *)rule->cmd;
+
+	/*
+	 * First thing to write into the command stream is the match probability.
+	 */
+	if (match_prob != 1) { /* 1 means always match */
+		dst->opcode = O_PROB;
+		dst->len = 2;
+		*((int32_t *)(dst+1)) = (int32_t)(match_prob * 0x7fffffff);	/* probability scaled to 0..0x7fffffff */
+		dst += dst->len;
+	}
+
+	/*
+	 * generate O_PROBE_STATE if necessary
+	 */
+	if (have_state && have_state->opcode != O_CHECK_STATE) {
+		fill_cmd(dst, O_PROBE_STATE, 0, 0);
+		dst = next_cmd(dst);
+	}
+
+	/* copy all commands but O_LOG, O_KEEP_STATE, O_LIMIT, O_ALTQ, O_TAG */
+	for (src = (ipfw_insn *)cmdbuf; src != cmd; src += i) {
+		i = F_LEN(src);
+
+		switch (src->opcode) {
+		case O_LOG:
+		case O_KEEP_STATE:
+		case O_LIMIT:
+		case O_ALTQ:
+		case O_TAG:
+			break;
+		default:
+			bcopy(src, dst, i * sizeof(uint32_t));
+			dst += i;
+		}
+	}
+
+	/*
+	 * put back the have_state command as last opcode
+	 */
+	if (have_state && have_state->opcode != O_CHECK_STATE) {
+		i = F_LEN(have_state);
+		bcopy(have_state, dst, i * sizeof(uint32_t));
+		dst += i;
+	}
+	/*
+	 * start action section
+	 */
+	rule->act_ofs = dst - rule->cmd;
+
+	/* put back O_LOG, O_ALTQ, O_TAG if necessary */
+	if (have_log) {
+		i = F_LEN(have_log);
+		bcopy(have_log, dst, i * sizeof(uint32_t));
+		dst += i;
+	}
+	if (have_altq) {
+		i = F_LEN(have_altq);
+		bcopy(have_altq, dst, i * sizeof(uint32_t));
+		dst += i;
+	}
+	if (have_tag) {
+		i = F_LEN(have_tag);
+		bcopy(have_tag, dst, i * sizeof(uint32_t));
+		dst += i;
+	}
+	/*
+	 * copy all other actions
+	 */
+	for (src = (ipfw_insn *)actbuf; src != action; src += i) {
+		i = F_LEN(src);
+		bcopy(src, dst, i * sizeof(uint32_t));
+		dst += i;
+	}
+
+	rule->cmd_len = (uint32_t *)dst - (uint32_t *)(rule->cmd);
+	i = (char *)dst - (char *)rule;	/* size of the rule in bytes */
+	if (do_cmd(IP_FW_ADD, rule, (uintptr_t)&i) == -1)
+		err(EX_UNAVAILABLE, "getsockopt(%s)", "IP_FW_ADD");
+	if (!co.do_quiet)
+		show_ipfw(rule, 0, 0);
+}
+
+/*
+ * clear the counters or the log counters.
+ *
+ * av[0] is skipped. With no further arguments the request applies to all
+ * rules; otherwise every remaining argument must start with a digit and
+ * is parsed as a rule number in 0..0xffff. When co.use_set is non-zero,
+ * set (co.use_set - 1) and a flag bit (1 << 24) are packed into the
+ * argument above the rule number. Failures on single rules are reported
+ * with warn() and turn into exit(EX_UNAVAILABLE) after the loop.
+ */
+void
+ipfw_zero(int ac, char *av[], int optname /* 0 = IP_FW_ZERO, 1 = IP_FW_RESETLOG */)
+{
+	uint32_t arg, saved_arg;
+	int failed = EX_OK;
+	char const *errstr;
+	char const *name = optname ? "RESETLOG" : "ZERO";
+
+	optname = optname ? IP_FW_RESETLOG : IP_FW_ZERO;
+
+	av++; ac--;
+
+	if (!ac) {
+		/* clear all entries */
+		if (do_cmd(optname, NULL, 0) < 0)
+			err(EX_UNAVAILABLE, "setsockopt(IP_FW_%s)", name);
+		if (!co.do_quiet)
+			printf("%s.\n", optname == IP_FW_ZERO ?
+			    "Accounting cleared":"Logging counts reset");
+
+		return;
+	}
+
+	while (ac) {
+		/* Rule number */
+		if (isdigit(**av)) {
+			arg = strtonum(*av, 0, 0xffff, &errstr);
+			if (errstr)
+				errx(EX_DATAERR,
+				    "invalid rule number %s\n", *av);
+			saved_arg = arg;	/* rule number without the set bits, for messages */
+			if (co.use_set)
+				arg |= (1 << 24) | ((co.use_set - 1) << 16);
+			av++;
+			ac--;
+			if (do_cmd(optname, &arg, sizeof(arg))) {
+				warn("rule %u: setsockopt(IP_FW_%s)",
+				    saved_arg, name);
+				failed = EX_UNAVAILABLE;
+			} else if (!co.do_quiet)
+				printf("Entry %d %s.\n", saved_arg,	/* NOTE(review): saved_arg is uint32_t but printed with %d */
+				    optname == IP_FW_ZERO ?
+					 "cleared" : "logging count reset");
+		} else {
+			errx(EX_USAGE, "invalid rule number ``%s''", *av);
+		}
+	}
+	if (failed != EX_OK)
+		exit(failed);
+}
+/*
+ * Flush all rules; when co.do_pipe is set, call dummynet_flush() instead.
+ * Unless 'force' or co.do_quiet is set, the user is asked on stdin first:
+ * the first character of a line decides, 'n'/'N' or end of input cancels.
+ * When co.use_set is non-zero only set (co.use_set - 1) is removed, using
+ * IP_FW_DEL.
+ * NOTE(review): the co.do_pipe tests that follow the early return for
+ * co.do_pipe cannot be true at that point.
+ */
+void
+ipfw_flush(int force)
+{
+	int cmd = co.do_pipe ? IP_DUMMYNET_FLUSH : IP_FW_FLUSH;
+
+	if (!force && !co.do_quiet) { /* need to ask user */
+		int c;
+
+		printf("Are you sure? [yn] ");
+		fflush(stdout);
+		do {
+			c = toupper(getc(stdin));
+			while (c != '\n' && getc(stdin) != '\n')	/* discard the rest of the line */
+				if (feof(stdin))
+					return; /* and do not flush */
+		} while (c != 'Y' && c != 'N');
+		printf("\n");
+		if (c == 'N')	/* user said no */
+			return;
+	}
+	if (co.do_pipe) {
+		dummynet_flush();
+		return;
+	}
+	/* `ipfw set N flush` - is the same that `ipfw delete set N` */
+	if (co.use_set) {
+		uint32_t arg = ((co.use_set - 1) & 0xffff) | (1 << 24);
+		if (do_cmd(IP_FW_DEL, &arg, sizeof(arg)) < 0)
+			err(EX_UNAVAILABLE, "setsockopt(IP_FW_DEL)");
+	} else if (do_cmd(cmd, NULL, 0) < 0)
+		err(EX_UNAVAILABLE, "setsockopt(IP_%s_FLUSH)",
+		    co.do_pipe ? "DUMMYNET" : "FW");
+	if (!co.do_quiet)
+		printf("Flushed all %s.\n", co.do_pipe ? "pipes" : "rules");
+}
+
+
+static void table_list(ipfw_table_entry ent, int need_header);
+
+/*
+ * This one handles all table-related commands
+ * 	ipfw table N add addr[/masklen] [value]
+ * 	ipfw table N delete addr[/masklen]
+ * 	ipfw table {N | all} flush
+ * 	ipfw table {N | all} list
+ *
+ * av[0] is skipped. The number of tables comes from the
+ * net.inet.ip.fw.tables_max sysctl (or IPFW_TABLES_MAX, if defined, when
+ * the sysctl fails). For "add" the optional value is either a number or,
+ * when it contains a '.' or does not start with a digit, a host/address
+ * resolved with lookup_host() and stored in host byte order.
+ */
+void
+ipfw_table_handler(int ac, char *av[])
+{
+	ipfw_table_entry ent;
+	int do_add;
+	int is_all;
+	size_t len;
+	char *p;
+	uint32_t a;
+	uint32_t tables_max;
+
+	len = sizeof(tables_max);
+	if (sysctlbyname("net.inet.ip.fw.tables_max", &tables_max, &len,
+		NULL, 0) == -1) {
+#ifdef IPFW_TABLES_MAX
+		warn("Warn: Failed to get the max tables number via sysctl. "
+		     "Using the compiled in defaults. \nThe reason was");
+		tables_max = IPFW_TABLES_MAX;
+#else
+		errx(1, "Failed sysctlbyname(\"net.inet.ip.fw.tables_max\")");
+#endif
+	}
+
+	ac--; av++;
+	if (ac && isdigit(**av)) {
+		ent.tbl = atoi(*av);
+		is_all = 0;
+		ac--; av++;
+	} else if (ac && _substrcmp(*av, "all") == 0) {
+		ent.tbl = 0;
+		is_all = 1;
+		ac--; av++;
+	} else
+		errx(EX_USAGE, "table number or 'all' keyword required");
+	if (ent.tbl >= tables_max)
+		errx(EX_USAGE, "The table number exceeds the maximum allowed "
+			"value (%d)", tables_max - 1);	/* NOTE(review): tables_max is uint32_t, printed with %d */
+	NEED1("table needs command");
+	if (is_all && _substrcmp(*av, "list") != 0
+		   && _substrcmp(*av, "flush") != 0)
+		errx(EX_USAGE, "table number required");
+
+	if (_substrcmp(*av, "add") == 0 ||
+	    _substrcmp(*av, "delete") == 0) {
+		do_add = **av == 'a';
+		ac--; av++;
+		if (!ac)
+			errx(EX_USAGE, "IP address required");
+		p = strchr(*av, '/');
+		if (p) {
+			*p++ = '\0';
+			ent.masklen = atoi(p);
+			if (ent.masklen > 32)
+				errx(EX_DATAERR, "bad width ``%s''", p);
+		} else
+			ent.masklen = 32;
+		if (lookup_host(*av, (struct in_addr *)&ent.addr) != 0)
+			errx(EX_NOHOST, "hostname ``%s'' unknown", *av);
+		ac--; av++;
+		if (do_add && ac) {
+			unsigned int tval;
+			/* isdigit is a bit of a hack here.. */
+			if (strchr(*av, (int)'.') == NULL && isdigit(**av))  {
+				ent.value = strtoul(*av, NULL, 0);
+			} else {
+				if (lookup_host(*av, (struct in_addr *)&tval) == 0) {
+					/* The value must be stored in host order	 *
+					 * so that the values < 65k can be distinguished */
+		       			ent.value = ntohl(tval);
+				} else {
+					errx(EX_NOHOST, "hostname ``%s'' unknown", *av);
+				}
+			}
+		} else
+			ent.value = 0;
+		if (do_cmd(do_add ? IP_FW_TABLE_ADD : IP_FW_TABLE_DEL,
+		    &ent, sizeof(ent)) < 0) {
+			/* If running silent, don't bomb out on these errors. */
+			if (!(co.do_quiet && (errno == (do_add ? EEXIST : ESRCH))))
+				err(EX_OSERR, "setsockopt(IP_FW_TABLE_%s)",
+				    do_add ? "ADD" : "DEL");
+			/* In silent mode, react to a failed add by deleting */
+			if (do_add) {
+				do_cmd(IP_FW_TABLE_DEL, &ent, sizeof(ent));
+				if (do_cmd(IP_FW_TABLE_ADD,
+				    &ent, sizeof(ent)) < 0)
+					err(EX_OSERR,
+					    "setsockopt(IP_FW_TABLE_ADD)");
+			}
+		}
+	} else if (_substrcmp(*av, "flush") == 0) {
+		a = is_all ? tables_max : (uint32_t)(ent.tbl + 1);	/* one past the last table to process */
+		do {
+			if (do_cmd(IP_FW_TABLE_FLUSH, &ent.tbl,
+			    sizeof(ent.tbl)) < 0)
+				err(EX_OSERR, "setsockopt(IP_FW_TABLE_FLUSH)");
+		} while (++ent.tbl < a);
+	} else if (_substrcmp(*av, "list") == 0) {
+		a = is_all ? tables_max : (uint32_t)(ent.tbl + 1);	/* one past the last table to process */
+		do {
+			table_list(ent, is_all);
+		} while (++ent.tbl < a);
+	} else
+		errx(EX_USAGE, "invalid table command %s", *av);
+}
+/*
+ * Print the entries of table ent.tbl (no other field of ent is read).
+ * The entry count is fetched with IP_FW_TABLE_GETSIZE, then the entries
+ * with IP_FW_TABLE_LIST; an empty table prints nothing. When need_header
+ * is non-zero a "---table(N)---" line precedes a non-empty table. Values
+ * are printed as numbers, or as dotted addresses when
+ * co.do_value_as_ip is set.
+ */
+static void
+table_list(ipfw_table_entry ent, int need_header)
+{
+	ipfw_table *tbl;
+	socklen_t l;
+	uint32_t a;
+
+	a = ent.tbl;
+	l = sizeof(a);
+	if (do_cmd(IP_FW_TABLE_GETSIZE, &a, (uintptr_t)&l) < 0)
+		err(EX_OSERR, "getsockopt(IP_FW_TABLE_GETSIZE)");
+
+	/* If a is zero we have nothing to do, the table is empty. */
+	if (a == 0)
+		return;
+
+	l = sizeof(*tbl) + a * sizeof(ipfw_table_entry);
+	tbl = safe_calloc(1, l);
+	tbl->tbl = ent.tbl;
+	if (do_cmd(IP_FW_TABLE_LIST, tbl, (uintptr_t)&l) < 0)
+		err(EX_OSERR, "getsockopt(IP_FW_TABLE_LIST)");
+	if (tbl->cnt && need_header)
+		printf("---table(%d)---\n", tbl->tbl);
+	for (a = 0; a < tbl->cnt; a++) {
+		unsigned int tval;
+		tval = tbl->ent[a].value;
+		if (co.do_value_as_ip) {
+			char tbuf[128];	/* copy of the first inet_ntoa() result, which the second call below may overwrite */
+			strncpy(tbuf, inet_ntoa(*(struct in_addr *)
+				&tbl->ent[a].addr), 127);	/* NOTE(review): strncpy() does not terminate tbuf for a source of 127+ bytes; inet_ntoa() output is presumably always shorter */
+			/* inet_ntoa expects network order */
+			tval = htonl(tval);
+			printf("%s/%u %s\n", tbuf, tbl->ent[a].masklen,
+				inet_ntoa(*(struct in_addr *)&tval));
+		} else {
+			printf("%s/%u %u\n",
+				inet_ntoa(*(struct in_addr *)&tbl->ent[a].addr),
+				tbl->ent[a].masklen, tval);
+		}
+	}
+	free(tbl);
+}
diff --git a/ipfw/ipfw2.h b/ipfw/ipfw2.h
new file mode 100644
index 0000000..1f280f5
--- /dev/null
+++ b/ipfw/ipfw2.h
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2002-2003 Luigi Rizzo
+ * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp
+ * Copyright (c) 1994 Ugen J.S.Antsilevich
+ *
+ * Idea and grammar partially left from:
+ * Copyright (c) 1993 Daniel Boulet
+ *
+ * Redistribution and use in source forms, with and without modification,
+ * are permitted provided that this entire comment appears intact.
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ *
+ * NEW command line interface for IP firewall facility
+ *
+ * $FreeBSD: head/sbin/ipfw/ipfw2.h 206843 2010-04-19 15:11:45Z luigi $
+ */
+
+/*
+ * Options that can be set on the command line.
+ * When reading commands from a file, a subset of the options can also
+ * be applied globally by specifying them before the file name.
+ * After that, each line can contain its own option that changes + * the global value. + * XXX The context is not restored after each line. + */ + +struct cmdline_opts { + /* boolean options: */ + int do_value_as_ip; /* show table value as IP */ + int do_resolv; /* try to resolve all ip to names */ + int do_time; /* Show time stamps */ + int do_quiet; /* Be quiet in add and flush */ + int do_pipe; /* this cmd refers to a pipe/queue/sched */ + int do_nat; /* this cmd refers to a nat config */ + int do_dynamic; /* display dynamic rules */ + int do_expired; /* display expired dynamic rules */ + int do_compact; /* show rules in compact mode */ + int do_force; /* do not ask for confirmation */ + int show_sets; /* display the set each rule belongs to */ + int test_only; /* only check syntax */ + int comment_only; /* only print action and comment */ + int verbose; /* be verbose on some commands */ + + /* The options below can have multiple values. */ + + int do_sort; /* field to sort results (0 = no) */ + /* valid fields are 1 and above */ + + int use_set; /* work with specified set number */ + /* 0 means all sets, otherwise apply to set use_set - 1 */ + +}; + +extern struct cmdline_opts co; + +/* + * _s_x is a structure that stores a string <-> token pairs, used in + * various places in the parser. Entries are stored in arrays, + * with an entry with s=NULL as terminator. + * The search routines are match_token() and match_value(). + * Often, an element with x=0 contains an error string. 
+ * + */ +struct _s_x { + char const *s; + int x; +}; + +enum tokens { + TOK_NULL=0, + + TOK_OR, + TOK_NOT, + TOK_STARTBRACE, + TOK_ENDBRACE, + + TOK_ACCEPT, + TOK_COUNT, + TOK_PIPE, + TOK_LINK, + TOK_QUEUE, + TOK_FLOWSET, + TOK_SCHED, + TOK_DIVERT, + TOK_TEE, + TOK_NETGRAPH, + TOK_NGTEE, + TOK_FORWARD, + TOK_SKIPTO, + TOK_DENY, + TOK_REJECT, + TOK_RESET, + TOK_UNREACH, + TOK_CHECKSTATE, + TOK_NAT, + TOK_REASS, + TOK_CALL, + TOK_RETURN, + + TOK_ALTQ, + TOK_LOG, + TOK_TAG, + TOK_UNTAG, + + TOK_TAGGED, + TOK_UID, + TOK_GID, + TOK_JAIL, + TOK_IN, + TOK_LIMIT, + TOK_KEEPSTATE, + TOK_LAYER2, + TOK_OUT, + TOK_DIVERTED, + TOK_DIVERTEDLOOPBACK, + TOK_DIVERTEDOUTPUT, + TOK_XMIT, + TOK_RECV, + TOK_VIA, + TOK_FRAG, + TOK_IPOPTS, + TOK_IPLEN, + TOK_IPID, + TOK_IPPRECEDENCE, + TOK_DSCP, + TOK_IPTOS, + TOK_IPTTL, + TOK_IPVER, + TOK_ESTAB, + TOK_SETUP, + TOK_TCPDATALEN, + TOK_TCPFLAGS, + TOK_TCPOPTS, + TOK_TCPSEQ, + TOK_TCPACK, + TOK_TCPWIN, + TOK_ICMPTYPES, + TOK_MAC, + TOK_MACTYPE, + TOK_VERREVPATH, + TOK_VERSRCREACH, + TOK_ANTISPOOF, + TOK_IPSEC, + TOK_COMMENT, + + TOK_PLR, + TOK_NOERROR, + TOK_BUCKETS, + TOK_DSTIP, + TOK_SRCIP, + TOK_DSTPORT, + TOK_SRCPORT, + TOK_ALL, + TOK_MASK, + TOK_FLOW_MASK, + TOK_SCHED_MASK, + TOK_BW, + TOK_DELAY, + TOK_PROFILE, + TOK_BURST, + TOK_RED, + TOK_GRED, + TOK_DROPTAIL, + TOK_PROTO, + /* dummynet tokens */ + TOK_WEIGHT, + TOK_LMAX, + TOK_PRI, + TOK_TYPE, + TOK_SLOTSIZE, + + TOK_IP, + TOK_IF, + TOK_ALOG, + TOK_DENY_INC, + TOK_SAME_PORTS, + TOK_UNREG_ONLY, + TOK_SKIP_GLOBAL, + TOK_RESET_ADDR, + TOK_ALIAS_REV, + TOK_PROXY_ONLY, + TOK_REDIR_ADDR, + TOK_REDIR_PORT, + TOK_REDIR_PROTO, + + TOK_IPV6, + TOK_FLOWID, + TOK_ICMP6TYPES, + TOK_EXT6HDR, + TOK_DSTIP6, + TOK_SRCIP6, + + TOK_IPV4, + TOK_UNREACH6, + TOK_RESET6, + + TOK_FIB, + TOK_SETFIB, + TOK_LOOKUP, + TOK_SOCKARG, +}; +/* + * the following macro returns an error message if we run out of + * arguments. 
+ */
+/* _p is parenthesized so that a compound expression is negated as a whole. */
+#define NEED(_p, msg)	{if (!(_p)) errx(EX_USAGE, msg);}
+/* NEED1 tests the 'av' variable that is in scope where the macro is used. */
+#define NEED1(msg)	{if (!(*av)) errx(EX_USAGE, msg);}
+
+int pr_u64(uint64_t *pd, int width);
+
+/* memory allocation support */
+void *safe_calloc(size_t number, size_t size);
+void *safe_realloc(void *ptr, size_t size);
+
+/* string comparison functions used for historical compatibility */
+int _substrcmp(const char *str1, const char* str2);
+int _substrcmp2(const char *str1, const char* str2, const char* str3);
+
+/* utility functions */
+int match_token(struct _s_x *table, char *string);
+char const *match_value(struct _s_x *p, int value);
+
+int do_cmd(int optname, void *optval, uintptr_t optlen);
+
+struct in6_addr;
+void n2mask(struct in6_addr *mask, int n);
+int contigmask(uint8_t *p, int len);
+
+/*
+ * Forward declarations to avoid including way too many headers.
+ * C does not allow duplicated typedefs, so we use the base struct
+ * that the typedef points to.
+ * Should the typedefs use a different type, the compiler will
+ * still detect the change when compiling the body of the
+ * functions involved, so we do not lose error checking.
+ */
+struct _ipfw_insn;
+struct _ipfw_insn_altq;
+struct _ipfw_insn_u32;
+struct _ipfw_insn_ip6;
+struct _ipfw_insn_icmp6;
+
+/*
+ * The reserved set number. This is a constant in ip_fw.h
+ * but we store it in a variable so other files do not depend
+ * on that header just for one constant.
+ */ +extern int resvd_set_number; + +/* first-level command handlers */ +void ipfw_add(char *av[]); +void ipfw_show_nat(int ac, char **av); +void ipfw_config_pipe(int ac, char **av); +void ipfw_config_nat(int ac, char **av); +void ipfw_sets_handler(char *av[]); +void ipfw_table_handler(int ac, char *av[]); +void ipfw_sysctl_handler(char *av[], int which); +void ipfw_delete(char *av[]); +void ipfw_flush(int force); +void ipfw_zero(int ac, char *av[], int optname); +void ipfw_list(int ac, char *av[], int show_counters); + +/* altq.c */ +void altq_set_enabled(int enabled); +u_int32_t altq_name_to_qid(const char *name); + +void print_altq_cmd(struct _ipfw_insn_altq *altqptr); + +/* dummynet.c */ +void dummynet_list(int ac, char *av[], int show_counters); +void dummynet_flush(void); +int ipfw_delete_pipe(int pipe_or_queue, int n); + +/* ipv6.c */ +void print_unreach6_code(uint16_t code); +void print_ip6(struct _ipfw_insn_ip6 *cmd, char const *s); +void print_flow6id(struct _ipfw_insn_u32 *cmd); +void print_icmp6types(struct _ipfw_insn_u32 *cmd); +void print_ext6hdr(struct _ipfw_insn *cmd ); + +struct _ipfw_insn *add_srcip6(struct _ipfw_insn *cmd, char *av); +struct _ipfw_insn *add_dstip6(struct _ipfw_insn *cmd, char *av); + +void fill_flow6(struct _ipfw_insn_u32 *cmd, char *av ); +void fill_unreach6_code(u_short *codep, char *str); +void fill_icmp6types(struct _ipfw_insn_icmp6 *cmd, char *av); +int fill_ext6hdr(struct _ipfw_insn *cmd, char *av); diff --git a/ipfw/ipv6.c b/ipfw/ipv6.c new file mode 100644 index 0000000..3cfc4df --- /dev/null +++ b/ipfw/ipv6.c @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2002-2003 Luigi Rizzo + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Idea and grammar partially left from: + * Copyright (c) 1993 Daniel Boulet + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. 
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ *
+ * NEW command line interface for IP firewall facility
+ *
+ * $FreeBSD: user/luigi/ipfw3-head/sbin/ipfw/ipv6.c 187770 2009-01-27 12:01:30Z luigi $
+ *
+ * ipv6 support
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include "ipfw2.h"
+
+#include <err.h>
+#include <netdb.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/icmp6.h>
+#include <netinet/ip_fw.h>
+#include <arpa/inet.h>
+
+/* Symbolic names accepted (and printed) for ICMPv6 unreachable codes. */
+static struct _s_x icmp6codes[] = {
+	{ "no-route",		ICMP6_DST_UNREACH_NOROUTE },
+	{ "admin-prohib",	ICMP6_DST_UNREACH_ADMIN },
+	{ "address",		ICMP6_DST_UNREACH_ADDR },
+	{ "port",		ICMP6_DST_UNREACH_NOPORT },
+	{ NULL, 0 }
+};
+
+/*
+ * Convert str into an ICMPv6 unreachable code and store it in *codep.
+ * str is either a number below 0x100 or one of the names in icmp6codes[].
+ * Anything else terminates the program through errx(EX_DATAERR).
+ */
+void
+fill_unreach6_code(u_short *codep, char *str)
+{
+	unsigned long num;
+	int val;
+	char *s;
+
+	/*
+	 * Range-check the full unsigned long: assigning strtoul() straight
+	 * to an int could turn an out-of-range number into a valid code.
+	 */
+	num = strtoul(str, &s, 0);
+	if (s == str || *s != '\0' || num >= 0x100)
+		val = match_token(icmp6codes, str);
+	else
+		val = (int)num;
+	/* the test below relies on match_token() being negative on no match */
+	if (val < 0)
+		errx(EX_DATAERR, "unknown ICMPv6 unreachable code ``%s''", str);
+	*codep = val;
+}
+
+/*
+ * Print an unreach6 action, by name when the code is listed in
+ * icmp6codes[] and numerically otherwise.
+ */
+void
+print_unreach6_code(uint16_t code)
+{
+	char const *s = match_value(icmp6codes, code);
+
+	if (s != NULL)
+		printf("unreach6 %s", s);
+	else
+		printf("unreach6 %u", code);
+}
+
+/*
+ * Print the ip address contained in a command.
+ * The string s is printed first, preceded by " not" when F_NOT is set
+ * in cmd->o.len. The "me" opcodes print "me6" and O_IP6 prints " ip6";
+ * every other opcode is printed as a comma separated list of addresses,
+ * each followed by its mask when the mask is shorter than 128 bits.
+ */
+void
+print_ip6(ipfw_insn_ip6 *cmd, char const *s)
+{
+	struct hostent *he;
+	int len = F_LEN((ipfw_insn *) cmd) - 1;
+	struct in6_addr *a = &(cmd->addr6);
+	char trad[255];
+
+	printf("%s%s ", cmd->o.len & F_NOT ? " not": "", s);
+
+	if (cmd->o.opcode == O_IP6_SRC_ME || cmd->o.opcode == O_IP6_DST_ME) {
+		printf("me6");
+		return;
+	}
+	if (cmd->o.opcode == O_IP6) {
+		printf(" ip6");
+		return;
+	}
+
+	/*
+	 * len == 4 indicates a single IP, whereas lists of 1 or more
+	 * addr/mask pairs have len = (2n+1). We convert len to n so we
+	 * use that to count the number of entries.
+	 */
+
+	for (len = len / 4; len > 0; len -= 2, a += 2) {
+		int mb =	/* mask length */
+		    (cmd->o.opcode == O_IP6_SRC || cmd->o.opcode == O_IP6_DST) ?
+		    128 : contigmask((uint8_t *)&(a[1]), 128);
+
+		/*
+		 * Forget the lookup done for the previous entry, otherwise
+		 * its host name would be printed again for this address.
+		 */
+		he = NULL;
+		if (mb == 128 && co.do_resolv)
+			he = gethostbyaddr((char *)a, sizeof(*a), AF_INET6);
+		if (he != NULL)	     /* resolved to name */
+			printf("%s", he->h_name);
+		else if (mb == 0)	   /* any */
+			printf("any");
+		else {	  /* numeric IP followed by some kind of mask */
+			if (inet_ntop(AF_INET6, a, trad, sizeof(trad)) == NULL) {
+				printf("Error ntop in print_ip6\n");
+				trad[0] = '\0';	/* nothing valid to print */
+			}
+			printf("%s", trad);
+			if (mb < 0)     /* XXX not really legal... */
+				printf(":%s",
+				    inet_ntop(AF_INET6, &a[1], trad, sizeof(trad)));
+			else if (mb < 128)
+				printf("/%d", mb);
+		}
+		if (len > 2)
+			printf(",");
+	}
+}
+
+/*
+ * Build an O_ICMP6TYPE instruction in cmd from av, a comma separated
+ * list of ICMPv6 type numbers: one bit of cmd->d[] is set for each type.
+ * A malformed or out of range element terminates the program through
+ * errx(EX_DATAERR).
+ */
+void
+fill_icmp6types(ipfw_insn_icmp6 *cmd, char *av)
+{
+	/* wide enough that a large number cannot wrap into a valid type */
+	unsigned long type;
+	char *end;
+
+	bzero(cmd, sizeof(*cmd));
+	while (*av) {
+		if (*av == ',')
+			av++;
+		type = strtoul(av, &end, 0);
+		/* end == av: no digits, e.g. a trailing or doubled comma */
+		if (end == av || (*end != ',' && *end != '\0'))
+			errx(EX_DATAERR, "invalid ICMP6 type");
+		av = end;
+		/*
+		 * XXX: shouldn't this be 0xFF?  I can't see any reason why
+		 * we shouldn't be able to filter all possible values
+		 * regardless of the ability of the rest of the kernel to do
+		 * anything useful with them.
+		 */
+		if (type > ICMP6_MAXTYPE)
+			errx(EX_DATAERR, "ICMP6 type out of range");
+		/* unsigned constant: shifting a signed 1 by 31 is undefined */
+		cmd->d[type / 32] |= (1U << (type % 32));
+	}
+	cmd->o.opcode = O_ICMP6TYPE;
+	cmd->o.len |= F_INSN_SIZE(ipfw_insn_icmp6);
+}
+
+
+/*
+ * Print the ICMPv6 types selected in cmd->d[] as a comma separated
+ * list, scanning seven 32-bit words.
+ * NOTE(review): 7 presumably matches the size of the d[] array of
+ * ipfw_insn_icmp6 -- confirm against ip_fw.h.
+ */
+void
+print_icmp6types(ipfw_insn_u32 *cmd)
+{
+	int i, j;
+	char sep= ' ';
+
+	printf(" ip6 icmp6types");
+	for (i = 0; i < 7; i++)
+		for (j=0; j < 32; ++j) {
+			/* unsigned constant: 1 << 31 on an int is undefined */
+			if ( (cmd->d[i] & (1U << j)) == 0)
+				continue;
+			printf("%c%d", sep, (i*32 + j));
+			sep = ',';
+		}
+}
+
+/*
+ * Print the cmd->o.arg1 flow ids stored in cmd->d[], separated by
+ * commas and followed by a blank.
+ */
+void
+print_flow6id( ipfw_insn_u32 *cmd)
+{
+	uint16_t i, limit = cmd->o.arg1;
+	char sep = ',';
+
+	printf(" flow-id ");
+	for( i=0; i < limit; ++i) {
+		if (i == limit - 1)
+			sep = ' ';
+		/* d[] holds unsigned 32 bit values, hence %u */
+		printf("%u%c", cmd->d[i], sep);
+	}
+}
+
+/* structure and define for the extension header in ipv6 */
+static struct _s_x ext6hdrcodes[] = {
+	{ "frag",	EXT_FRAGMENT },
+	{ "hopopt",	EXT_HOPOPTS },
+	{ "route",	EXT_ROUTING },
+	{ "dstopt",	EXT_DSTOPTS },
+	{ "ah",		EXT_AH },
+	{ "esp",	EXT_ESP },
+	{ "rthdr0",	EXT_RTHDR0 },
+	{ "rthdr2",	EXT_RTHDR2 },
+	{ NULL,		0 }
+};
+
+/*
+ * fills command for the extension header filtering
+ * av is a comma separated list of names from ext6hdrcodes[]; the flag
+ * of each name is or-ed into cmd->arg1. An unknown name terminates the
+ * program through errx(EX_DATAERR).
+ * Returns 0 without completing the instruction when no flag was set,
+ * 1 otherwise.
+ */
+int
+fill_ext6hdr( ipfw_insn *cmd, char *av)
+{
+	int tok;
+	char *s = av;
+
+	cmd->arg1 = 0;
+
+	while(s) {
+		av = strsep( &s, ",") ;
+		tok = match_token(ext6hdrcodes, av);
+		switch (tok) {
+		case EXT_FRAGMENT:
+		case EXT_HOPOPTS:
+		case EXT_ROUTING:
+		case EXT_DSTOPTS:
+		case EXT_AH:
+		case EXT_ESP:
+		case EXT_RTHDR0:
+		case EXT_RTHDR2:
+			/* every known token is its own flag value */
+			cmd->arg1 |= tok;
+			break;
+
+		default:
+			errx( EX_DATAERR, "invalid option for ipv6 exten header" );
+			break;
+		}
+	}
+	if (cmd->arg1 == 0 )
+		return 0;
+	cmd->opcode = O_EXT_HDR;
+	cmd->len |= F_INSN_SIZE( ipfw_insn );
+	return 1;
+}
+
+void
+print_ext6hdr( ipfw_insn *cmd )
+{
+	char
sep = ' ';
+
+	printf(" extension header:");
+	if (cmd->arg1 & EXT_FRAGMENT ) {
+		printf("%cfragmentation", sep);
+		sep = ',';
+	}
+	if (cmd->arg1 & EXT_HOPOPTS ) {
+		printf("%chop options", sep);
+		sep = ',';
+	}
+	if (cmd->arg1 & EXT_ROUTING ) {
+		printf("%crouting options", sep);
+		sep = ',';
+	}
+	if (cmd->arg1 & EXT_RTHDR0 ) {
+		printf("%crthdr0", sep);
+		sep = ',';
+	}
+	if (cmd->arg1 & EXT_RTHDR2 ) {
+		printf("%crthdr2", sep);
+		sep = ',';
+	}
+	if (cmd->arg1 & EXT_DSTOPTS ) {
+		printf("%cdestination options", sep);
+		sep = ',';
+	}
+	if (cmd->arg1 & EXT_AH ) {
+		printf("%cauthentication header", sep);
+		sep = ',';
+	}
+	if (cmd->arg1 & EXT_ESP ) {
+		printf("%cencapsulated security payload", sep);
+	}
+}
+
+/*
+ * Try to find ipv6 address by hostname.
+ * host is first parsed as a numeric IPv6 address and, when that does
+ * not succeed, looked up with gethostbyname2().
+ * Returns 0 with the address stored in *ip6addr, -1 if the lookup fails.
+ */
+static int
+lookup_host6 (char *host, struct in6_addr *ip6addr)
+{
+	struct hostent *he;
+
+	/* inet_pton() returns 1 on success; 0 and -1 are both failures */
+	if (inet_pton(AF_INET6, host, ip6addr) != 1) {
+		if ((he = gethostbyname2(host, AF_INET6)) == NULL)
+			return(-1);
+		memcpy(ip6addr, he->h_addr_list[0], sizeof( struct in6_addr));
+	}
+	return(0);
+}
+
+
+/*
+ * fill the addr and mask fields in the instruction as appropriate from av.
+ * Update length as appropriate.
+ * The following formats are allowed:
+ *     any     matches any IP6. Actually returns an empty instruction.
+ *     me      returns O_IP6_*_ME
+ *
+ *     03f1::234:123:0342			single IP6 address
+ *     03f1::234:123:0342/24			address/mask
+ *     03f1::234:123:0342/24,03f1::234:123:0343/	List of address
+ *
+ * Set of address (as in ipv6) not supported because ipv6 address
+ * are typically random past the initial prefix.
+ * Always returns 1: every error is reported through errx()/err(),
+ * which do not return.
+ *
+ * NOTE(review): a list made only of /0 entries leaves len == 0, so the
+ * stored length is 1, which add_srcip6()/add_dstip6() treat as "me".
+ * Confirm whether that is intended.
+ */
+static int
+fill_ip6(ipfw_insn_ip6 *cmd, char *av)
+{
+	int len = 0;
+	struct in6_addr *d = &(cmd->addr6);
+	char *buf;	/* writable copy of av, released before returning */
+	/*
+	 * Needed for multiple address.
+	 * Note d[1] points to struct in6_addr mask6 of cmd
+	 */
+
+	cmd->o.len &= ~F_LEN_MASK;	/* zero len */
+
+	if (strcmp(av, "any") == 0)
+		return (1);
+
+
+	if (strcmp(av, "me") == 0) {	/* Set the data for "me" opt*/
+		cmd->o.len |= F_INSN_SIZE(ipfw_insn);
+		return (1);
+	}
+
+	if (strcmp(av, "me6") == 0) {	/* Set the data for "me" opt*/
+		cmd->o.len |= F_INSN_SIZE(ipfw_insn);
+		return (1);
+	}
+
+	/*
+	 * The list is split in place, so work on a copy. av is used as a
+	 * cursor and is NULL when the loop ends, hence the copy has to be
+	 * remembered separately to be freed.
+	 */
+	buf = strdup(av);
+	if (buf == NULL)
+		err(EX_OSERR, "strdup");
+	av = buf;
+	while (av) {
+		/*
+		 * After the address we can have '/' indicating a mask,
+		 * or ',' indicating another address follows.
+		 */
+
+		char *p;
+		int masklen;
+		char md = '\0';
+
+		if ((p = strpbrk(av, "/,")) ) {
+			md = *p;	/* save the separator */
+			*p = '\0';	/* terminate address string */
+			p++;		/* and skip past it */
+		}
+		/* now p points to NULL, mask or next entry */
+
+		/* lookup stores address in *d as a side effect */
+		if (lookup_host6(av, d) != 0) {
+			/* errx() exits, the copy is released with the process */
+			errx(EX_DATAERR, "bad address \"%s\"", av);
+		}
+		/* next, look at the mask, if any */
+		masklen = (md == '/') ? atoi(p) : 128;
+		if (masklen > 128 || masklen < 0)
+			errx(EX_DATAERR, "bad width \"%s\"", p);
+		else
+			n2mask(&d[1], masklen);
+
+		APPLY_MASK(d, &d[1])   /* mask base address with mask */
+
+		/* find next separator */
+
+		if (md == '/') {	/* find separator past the mask */
+			p = strpbrk(p, ",");
+			if (p != NULL)
+				p++;
+		}
+		av = p;
+
+		/* Check this entry */
+		if (masklen == 0) {
+			/*
+			 * 'any' turns the entire list into a NOP.
+			 * 'not any' never matches, so it is removed from the
+			 * list unless it is the only item, in which case we
+			 * report an error.
+			 */
+			if (cmd->o.len & F_NOT && av == NULL && len == 0)
+				errx(EX_DATAERR, "not any never matches");
+			continue;
+		}
+
+		/*
+		 * A single IP can be stored alone
+		 */
+		if (masklen == 128 && av == NULL && len == 0) {
+			len = F_INSN_SIZE(struct in6_addr);
+			break;
+		}
+
+		/* Update length and pointer to arguments */
+		len += F_INSN_SIZE(struct in6_addr)*2;
+		d += 2;
+	} /* end while */
+
+	/*
+	 * Total length of the command, remember that 1 is the size of
+	 * the base command.
+	 */
+	if (len + 1 > F_LEN_MASK)
+		errx(EX_DATAERR, "address list too long");
+	cmd->o.len |= len+1;
+	free(buf);
+	return (1);
+}
+
+/*
+ * fills command for ipv6 flow-id filtering
+ * note that the 20 bit flow number is stored in a array of u_int32_t
+ * it's supported lists of flow-id, so in the o.arg1 we store how many
+ * flow-id we want to filter.
+ * av is a comma separated list of numbers not larger than 0xfffff;
+ * a malformed, empty or out of range element terminates the program
+ * through errx(EX_DATAERR).
+ */
+void
+fill_flow6( ipfw_insn_u32 *cmd, char *av )
+{
+	unsigned long flow;	/* Current flow number, before range check */
+	u_int16_t nflow = 0;	/* Current flow index */
+	char *s = av;		/* part of the list still to be parsed */
+	char *tok, *end;
+
+	while (s) {
+		tok = strsep(&s, ",");
+		flow = strtoul(tok, &end, 0);
+		/* end == tok: no digits at all, e.g. an empty element */
+		if (end == tok || *end != '\0')
+			errx(EX_DATAERR, "invalid ipv6 flow number %s", tok);
+		if (flow > 0xfffff)
+			errx(EX_DATAERR, "flow number out of range %s", tok);
+		/* plain store: only d[0] used to be cleared before or-ing */
+		cmd->d[nflow] = flow;
+		nflow++;
+	}
+	/* the loop is skipped only when av is NULL */
+	if (nflow == 0)
+		errx(EX_DATAERR, "missing ipv6 flow number");
+	cmd->o.opcode = O_FLOW6ID;
+	cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32) + nflow;
+	cmd->o.arg1 = nflow;
+}
+
+ipfw_insn *
+add_srcip6(ipfw_insn *cmd, char *av)
+{
+
+	fill_ip6((ipfw_insn_ip6 *)cmd, av);
+	if (F_LEN(cmd) == 0) {				/* any */
+	} else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) {	/* "me" */
+		cmd->opcode = O_IP6_SRC_ME;
+	} else if (F_LEN(cmd) ==
+	    (F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn))) {
+		/* single IP, no mask*/
+		cmd->opcode = O_IP6_SRC;
+	} else {					/* addr/mask opt */
+		cmd->opcode = O_IP6_SRC_MASK;
+	}
+	return
cmd; +} + +ipfw_insn * +add_dstip6(ipfw_insn *cmd, char *av) +{ + + fill_ip6((ipfw_insn_ip6 *)cmd, av); + if (F_LEN(cmd) == 0) { /* any */ + } else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) { /* "me" */ + cmd->opcode = O_IP6_DST_ME; + } else if (F_LEN(cmd) == + (F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn))) { + /* single IP, no mask*/ + cmd->opcode = O_IP6_DST; + } else { /* addr/mask opt */ + cmd->opcode = O_IP6_DST_MASK; + } + return cmd; +} diff --git a/ipfw/main.c b/ipfw/main.c new file mode 100644 index 0000000..7bd9105 --- /dev/null +++ b/ipfw/main.c @@ -0,0 +1,624 @@ +/* + * Copyright (c) 2002-2003,2010 Luigi Rizzo + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Idea and grammar partially left from: + * Copyright (c) 1993 Daniel Boulet + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. 
+ *
+ * Command line interface for IP firewall facility
+ *
+ * $FreeBSD: head/sbin/ipfw/main.c 206494 2010-04-12 08:27:53Z luigi $
+ */
+
+#include <sys/wait.h>
+#include <ctype.h>
+#include <err.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include "ipfw2.h"
+
+/*
+ * Print the syntax summary on stderr and terminate the program
+ * with exit status 0. Does not return.
+ */
+static void
+help(void)
+{
+	fprintf(stderr,
+"ipfw syntax summary (but please do read the ipfw(8) manpage):\n\n"
+"\tipfw [-abcdefhnNqStTv] <command>\n\n"
+"where <command> is one of the following:\n\n"
+"add [num] [set N] [prob x] RULE-BODY\n"
+"{pipe|queue} N config PIPE-BODY\n"
+"[pipe|queue] {zero|delete|show} [N{,N}]\n"
+"nat N config {ip IPADDR|if IFNAME|log|deny_in|same_ports|unreg_only|reset|\n"
+" reverse|proxy_only|redirect_addr linkspec|\n"
+" redirect_port linkspec|redirect_proto linkspec}\n"
+"set [disable N... enable N...] | move [rule] X to Y | swap X Y | show\n"
+"set N {show|list|zero|resetlog|delete} [N{,N}] | flush\n"
+"table N {add ip[/bits] [value] | delete ip[/bits] | flush | list}\n"
+"table all {flush | list}\n"
+"\n"
+"RULE-BODY: check-state [PARAMS] | ACTION [PARAMS] ADDR [OPTION_LIST]\n"
+"ACTION: check-state | allow | count | deny | unreach{,6} CODE |\n"
+" skipto N | {divert|tee} PORT | forward ADDR |\n"
+" pipe N | queue N | nat N | setfib FIB | reass\n"
+"PARAMS: [log [logamount LOGLIMIT]] [altq QUEUE_NAME]\n"
+"ADDR: [ MAC dst src ether_type ] \n"
+" [ ip from IPADDR [ PORT ] to IPADDR [ PORTLIST ] ]\n"
+" [ ipv6|ip6 from IP6ADDR [ PORT ] to IP6ADDR [ PORTLIST ] ]\n"
+"IPADDR: [not] { any | me | ip/bits{x,y,z} | table(t[,v]) | IPLIST }\n"
+"IP6ADDR: [not] { any | me | me6 | ip6/bits | IP6LIST }\n"
+"IP6LIST: { ip6 | ip6/bits }[,IP6LIST]\n"
+"IPLIST: { ip | ip/bits | ip:mask }[,IPLIST]\n"
+"OPTION_LIST: OPTION [OPTION_LIST]\n"
+"OPTION: bridged | diverted | diverted-loopback | diverted-output |\n"
+" {dst-ip|src-ip} IPADDR | {dst-ip6|src-ip6|dst-ipv6|src-ipv6} IP6ADDR |\n"
+" {dst-port|src-port} LIST |\n"
+" estab | frag | {gid|uid} N | icmptypes LIST | in | out | ipid LIST |\n"
+" iplen LIST | ipoptions SPEC | ipprecedence | ipsec | iptos SPEC |\n"
+" ipttl LIST | ipversion VER | keep-state | layer2 | limit ... |\n"
+" icmp6types LIST | ext6hdr LIST | flow-id N[,N] | fib FIB |\n"
+" mac ... | mac-type LIST | proto LIST | {recv|xmit|via} {IF|IPADDR} |\n"
+" setup | {tcpack|tcpseq|tcpwin} NN | tcpflags SPEC | tcpoptions SPEC |\n"
+" tcpdatalen LIST | verrevpath | versrcreach | antispoof\n"
+);
+
+	exit(0);
+}
+
+/*
+ * Called with the arguments, including program name because getopt
+ * wants it to be present.
+ * Returns 0 if successful, 1 if empty command, errx() in case of errors.
+ * First thing we do is process parameters creating an argv[] array
+ * which includes the program name and a NULL entry at the end.
+ * If we are called with a single string, we split it on whitespace.
+ * Also, arguments with a trailing ',' are joined to the next one.
+ * The pointers (av[]) and data are in a single chunk of memory.
+ * av[0] points to the original program name, all other entries
+ * point into the allocated chunk.
+ */
+static int
+ipfw_main(int oldac, char **oldav)
+{
+	int ch, ac;
+	const char *errstr;
+	char **av, **save_av;
+	int do_acct = 0;		/* Show packet/byte count */
+	int try_next = 0;		/* set if pipe cmd not found */
+	int av_size;			/* compute the av size */
+	char *av_p;			/* used to build the av list */
+
+#define WHITESP		" \t\f\v\n\r"
+	if (oldac < 2)
+		return 1;	/* need at least one argument */
+
+	if (oldac == 2) {
+		/*
+		 * If we are called with one argument, try to split it into
+		 * words for subsequent parsing. Spaces after a ',' are
+		 * removed by copying the string in-place.
+		 */
+		char *arg = oldav[1];	/* The string is the first arg.
*/ + int l = strlen(arg); + int copy = 0; /* 1 if we need to copy, 0 otherwise */ + int i, j; + + for (i = j = 0; i < l; i++) { + if (arg[i] == '#') /* comment marker */ + break; + if (copy) { + arg[j++] = arg[i]; + copy = !strchr("," WHITESP, arg[i]); + } else { + copy = !strchr(WHITESP, arg[i]); + if (copy) + arg[j++] = arg[i]; + } + } + if (!copy && j > 0) /* last char was a 'blank', remove it */ + j--; + l = j; /* the new argument length */ + arg[j++] = '\0'; + if (l == 0) /* empty string! */ + return 1; + + /* + * First, count number of arguments. Because of the previous + * processing, this is just the number of blanks plus 1. + */ + for (i = 0, ac = 1; i < l; i++) + if (strchr(WHITESP, arg[i]) != NULL) + ac++; + + /* + * Allocate the argument list structure as a single block + * of memory, containing pointers and the argument + * strings. We include one entry for the program name + * because getopt expects it, and a NULL at the end + * to simplify further parsing. + */ + ac++; /* add 1 for the program name */ + av_size = (ac+1) * sizeof(char *) + l + 1; + av = safe_calloc(av_size, 1); + + /* + * Init the argument pointer to the end of the array + * and copy arguments from arg[] to av[]. For each one, + * j is the initial character, i is the one past the end. + */ + av_p = (char *)&av[ac+1]; + for (ac = 1, i = j = 0; i < l; i++) { + if (strchr(WHITESP, arg[i]) != NULL || i == l-1) { + if (i == l-1) + i++; + bcopy(arg+j, av_p, i-j); + av[ac] = av_p; + av_p += i-j; /* the length of the string */ + *av_p++ = '\0'; + ac++; + j = i + 1; + } + } + } else { + /* + * If an argument ends with ',' join with the next one. + */ + int first, i, l=0; + + /* + * Allocate the argument list structure as a single block + * of memory, containing both pointers and the argument + * strings. We include some space for the program name + * because getopt expects it. + * We add an extra pointer to the end of the array, + * to make simpler further parsing. 
+ */ + for (i=0; i= 2 && !strcmp(av[1], "sysctl")) { + char *s; + int i; + + if (ac != 3) { + printf( "sysctl emulation usage:\n" + " ipfw sysctl name[=value]\n" + " ipfw sysctl -a\n"); + return 0; + } + s = strchr(av[2], '='); + if (s == NULL) { + s = !strcmp(av[2], "-a") ? NULL : av[2]; + sysctlbyname(s, NULL, NULL, NULL, 0); + } else { /* ipfw sysctl x.y.z=value */ + /* assume an INT value, will extend later */ + if (s[1] == '\0') { + printf("ipfw sysctl: missing value\n\n"); + return 0; + } + *s = '\0'; + i = strtol(s+1, NULL, 0); + sysctlbyname(av[2], NULL, NULL, &i, sizeof(int)); + } + return 0; + } +#endif + + /* Save arguments for final freeing of memory. */ + save_av = av; + + optind = optreset = 1; /* restart getopt() */ + while ((ch = getopt(ac, av, "abcdefhinNp:qs:STtv")) != -1) + switch (ch) { + case 'a': + do_acct = 1; + break; + + case 'b': + co.comment_only = 1; + co.do_compact = 1; + break; + + case 'c': + co.do_compact = 1; + break; + + case 'd': + co.do_dynamic = 1; + break; + + case 'e': + co.do_expired = 1; + break; + + case 'f': + co.do_force = 1; + break; + + case 'h': /* help */ + free(save_av); + help(); + break; /* NOTREACHED */ + + case 'i': + co.do_value_as_ip = 1; + break; + + case 'n': + co.test_only = 1; + break; + + case 'N': + co.do_resolv = 1; + break; + + case 'q': + co.do_quiet = 1; + break; + + case 'p': + errx(EX_USAGE, "An absolute pathname must be used " + "with -p option."); + /* NOTREACHED */ + + case 's': /* sort */ + co.do_sort = atoi(optarg); + break; + + case 'S': + co.show_sets = 1; + break; + + case 't': + co.do_time = 1; + break; + + case 'T': + co.do_time = 2; /* numeric timestamp */ + break; + + case 'v': /* verbose */ + co.verbose = 1; + break; + + default: + free(save_av); + return 1; + } + + ac -= optind; + av += optind; + NEED1("bad arguments, for usage summary ``ipfw''"); + + /* + * An undocumented behaviour of ipfw1 was to allow rule numbers first, + * e.g. "100 add allow ..." instead of "add 100 allow ...". 
+ * In case, swap first and second argument to get the normal form. + */ + if (ac > 1 && isdigit(*av[0])) { + char *p = av[0]; + + av[0] = av[1]; + av[1] = p; + } + + /* + * Optional: pipe, queue or nat. + */ + co.do_nat = 0; + co.do_pipe = 0; + co.use_set = 0; + if (!strncmp(*av, "nat", strlen(*av))) + co.do_nat = 1; + else if (!strncmp(*av, "pipe", strlen(*av))) + co.do_pipe = 1; + else if (_substrcmp(*av, "queue") == 0) + co.do_pipe = 2; + else if (_substrcmp(*av, "flowset") == 0) + co.do_pipe = 2; + else if (_substrcmp(*av, "sched") == 0) + co.do_pipe = 3; + else if (!strncmp(*av, "set", strlen(*av))) { + if (ac > 1 && isdigit(av[1][0])) { + co.use_set = strtonum(av[1], 0, resvd_set_number, + &errstr); + if (errstr) + errx(EX_DATAERR, + "invalid set number %s\n", av[1]); + ac -= 2; av += 2; co.use_set++; + } + } + + if (co.do_pipe || co.do_nat) { + ac--; + av++; + } + NEED1("missing command"); + + /* + * For pipes, queues and nats we normally say 'nat|pipe NN config' + * but the code is easier to parse as 'nat|pipe config NN' + * so we swap the two arguments. 
+ */ + if ((co.do_pipe || co.do_nat) && ac > 1 && isdigit(*av[0])) { + char *p = av[0]; + + av[0] = av[1]; + av[1] = p; + } + + if (co.use_set == 0) { + if (_substrcmp(*av, "add") == 0) + ipfw_add(av); + else if (co.do_nat && _substrcmp(*av, "show") == 0) + ipfw_show_nat(ac, av); + else if (co.do_pipe && _substrcmp(*av, "config") == 0) + ipfw_config_pipe(ac, av); + else if (co.do_nat && _substrcmp(*av, "config") == 0) + ipfw_config_nat(ac, av); + else if (_substrcmp(*av, "set") == 0) + ipfw_sets_handler(av); + else if (_substrcmp(*av, "table") == 0) + ipfw_table_handler(ac, av); + else if (_substrcmp(*av, "enable") == 0) + ipfw_sysctl_handler(av, 1); + else if (_substrcmp(*av, "disable") == 0) + ipfw_sysctl_handler(av, 0); + else + try_next = 1; + } + + if (co.use_set || try_next) { + if (_substrcmp(*av, "delete") == 0) + ipfw_delete(av); + else if (_substrcmp(*av, "flush") == 0) + ipfw_flush(co.do_force); + else if (_substrcmp(*av, "zero") == 0) + ipfw_zero(ac, av, 0 /* IP_FW_ZERO */); + else if (_substrcmp(*av, "resetlog") == 0) + ipfw_zero(ac, av, 1 /* IP_FW_RESETLOG */); + else if (_substrcmp(*av, "print") == 0 || + _substrcmp(*av, "list") == 0) + ipfw_list(ac, av, do_acct); + else if (_substrcmp(*av, "show") == 0) + ipfw_list(ac, av, 1 /* show counters */); + else + errx(EX_USAGE, "bad command `%s'", *av); + } + + /* Free memory allocated in the argument parsing. 
*/ + free(save_av); + return 0; +} + + +static void +ipfw_readfile(int ac, char *av[]) +{ +#define MAX_ARGS 32 + char buf[4096]; + char *progname = av[0]; /* original program name */ + const char *cmd = NULL; /* preprocessor name, if any */ + const char *filename = av[ac-1]; /* file to read */ + int c, lineno=0; + FILE *f = NULL; + pid_t preproc = 0; + + while ((c = getopt(ac, av, "cfNnp:qS")) != -1) { + switch(c) { + case 'c': + co.do_compact = 1; + break; + + case 'f': + co.do_force = 1; + break; + + case 'N': + co.do_resolv = 1; + break; + + case 'n': + co.test_only = 1; + break; + + case 'p': + /* + * ipfw -p cmd [args] filename + * + * We are done with getopt(). All arguments + * except the filename go to the preprocessor, + * so we need to do the following: + * - check that a filename is actually present; + * - advance av by optind-1 to skip arguments + * already processed; + * - decrease ac by optind, to remove the args + * already processed and the final filename; + * - set the last entry in av[] to NULL so + * popen() can detect the end of the array; + * - set optind=ac to let getopt() terminate. + */ + if (optind == ac) + errx(EX_USAGE, "no filename argument"); + cmd = optarg; + av[ac-1] = NULL; + av += optind - 1; + ac -= optind; + optind = ac; + break; + + case 'q': + co.do_quiet = 1; + break; + + case 'S': + co.show_sets = 1; + break; + + default: + errx(EX_USAGE, "bad arguments, for usage" + " summary ``ipfw''"); + } + + } + + if (cmd == NULL && ac != optind + 1) + errx(EX_USAGE, "extraneous filename arguments %s", av[ac-1]); + + if ((f = fopen(filename, "r")) == NULL) + err(EX_UNAVAILABLE, "fopen: %s", filename); + + if (cmd != NULL) { /* pipe through preprocessor */ + int pipedes[2]; + + if (pipe(pipedes) == -1) + err(EX_OSERR, "cannot create pipe"); + + preproc = fork(); + if (preproc == -1) + err(EX_OSERR, "cannot fork"); + + if (preproc == 0) { + /* + * Child, will run the preprocessor with the + * file on stdin and the pipe on stdout. 
+ */ + if (dup2(fileno(f), 0) == -1 + || dup2(pipedes[1], 1) == -1) + err(EX_OSERR, "dup2()"); + fclose(f); + close(pipedes[1]); + close(pipedes[0]); + execvp(cmd, av); + err(EX_OSERR, "execvp(%s) failed", cmd); + } else { /* parent, will reopen f as the pipe */ + fclose(f); + close(pipedes[1]); + if ((f = fdopen(pipedes[0], "r")) == NULL) { + int savederrno = errno; + + (void)kill(preproc, SIGTERM); + errno = savederrno; + err(EX_OSERR, "fdopen()"); + } + } + } + + while (fgets(buf, sizeof(buf), f)) { /* read commands */ + char linename[20]; + char *args[2]; + + lineno++; + snprintf(linename, sizeof(linename), "Line %d", lineno); + setprogname(linename); /* XXX */ + args[0] = progname; + args[1] = buf; + ipfw_main(2, args); + } + fclose(f); + if (cmd != NULL) { + int status; + + if (waitpid(preproc, &status, 0) == -1) + errx(EX_OSERR, "waitpid()"); + if (WIFEXITED(status) && WEXITSTATUS(status) != EX_OK) + errx(EX_UNAVAILABLE, + "preprocessor exited with status %d", + WEXITSTATUS(status)); + else if (WIFSIGNALED(status)) + errx(EX_UNAVAILABLE, + "preprocessor exited with signal %d", + WTERMSIG(status)); + } +} + +int +main(int ac, char *av[]) +{ +#if defined(_WIN32) && defined(TCC) + { + WSADATA wsaData; + int ret=0; + unsigned short wVersionRequested = MAKEWORD(2, 2); + ret = WSAStartup(wVersionRequested, &wsaData); + if (ret != 0) { + /* Tell the user that we could not find a usable */ + /* Winsock DLL. */ + printf("WSAStartup failed with error: %d\n", ret); + return 1; + } + } +#endif + /* + * If the last argument is an absolute pathname, interpret it + * as a file to be preprocessed. 
+ */ + + if (ac > 1 && av[ac - 1][0] == '/') { + if (access(av[ac - 1], R_OK) == 0) + ipfw_readfile(ac, av); + else + err(EX_USAGE, "pathname: %s", av[ac - 1]); + } else { + if (ipfw_main(ac, av)) { + errx(EX_USAGE, + "usage: ipfw [options]\n" + "do \"ipfw -h\" or \"man ipfw\" for details"); + } + } + return EX_OK; +} diff --git a/ipfw/qsort.c b/ipfw/qsort.c new file mode 100644 index 0000000..4258b8c --- /dev/null +++ b/ipfw/qsort.c @@ -0,0 +1,195 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)qsort.c 8.1 (Berkeley) 6/4/93"; +#endif /* LIBC_SCCS and not lint */ +#include +__FBSDID("$FreeBSD: src/lib/libc/stdlib/qsort.c,v 1.15 2008/01/14 09:21:34 das Exp $"); + +#include + +#ifdef I_AM_QSORT_R +typedef int cmp_t(void *, const void *, const void *); +#else +typedef int cmp_t(const void *, const void *); +#endif +static inline char *med3(char *, char *, char *, cmp_t *, void *); +static inline void swapfunc(char *, char *, int, int); + +#define min(a, b) (a) < (b) ? a : b + +/* + * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function". + */ +#define swapcode(TYPE, parmi, parmj, n) { \ + long i = (n) / sizeof (TYPE); \ + TYPE *pi = (TYPE *) (parmi); \ + TYPE *pj = (TYPE *) (parmj); \ + do { \ + TYPE t = *pi; \ + *pi++ = *pj; \ + *pj++ = t; \ + } while (--i > 0); \ +} + +#define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \ + es % sizeof(long) ? 2 : es == sizeof(long)? 
0 : 1; + +static inline void +swapfunc(a, b, n, swaptype) + char *a, *b; + int n, swaptype; +{ + if(swaptype <= 1) + swapcode(long, a, b, n) + else + swapcode(char, a, b, n) +} + +#define swap(a, b) \ + if (swaptype == 0) { \ + long t = *(long *)(a); \ + *(long *)(a) = *(long *)(b); \ + *(long *)(b) = t; \ + } else \ + swapfunc(a, b, es, swaptype) + +#define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n, swaptype) + +#ifdef I_AM_QSORT_R +#define CMP(t, x, y) (cmp((t), (x), (y))) +#else +#define CMP(t, x, y) (cmp((x), (y))) +#endif + +static inline char * +med3(char *a, char *b, char *c, cmp_t *cmp, void *thunk +#ifndef I_AM_QSORT_R +__unused // XXX what ? +#endif +) +{ + return CMP(thunk, a, b) < 0 ? + (CMP(thunk, b, c) < 0 ? b : (CMP(thunk, a, c) < 0 ? c : a )) + :(CMP(thunk, b, c) > 0 ? b : (CMP(thunk, a, c) < 0 ? a : c )); +} + +#ifdef I_AM_QSORT_R +void +qsort_r(void *a, size_t n, size_t es, void *thunk, cmp_t *cmp) +#else +#define thunk NULL +void +qsort(void *a, size_t n, size_t es, cmp_t *cmp) +#endif +{ + char *pa, *pb, *pc, *pd, *pl, *pm, *pn; + size_t d, r; + int cmp_result; + int swaptype, swap_cnt; + +loop: SWAPINIT(a, es); + swap_cnt = 0; + if (n < 7) { + for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es) + for (pl = pm; + pl > (char *)a && CMP(thunk, pl - es, pl) > 0; + pl -= es) + swap(pl, pl - es); + return; + } + pm = (char *)a + (n / 2) * es; + if (n > 7) { + pl = a; + pn = (char *)a + (n - 1) * es; + if (n > 40) { + d = (n / 8) * es; + pl = med3(pl, pl + d, pl + 2 * d, cmp, thunk); + pm = med3(pm - d, pm, pm + d, cmp, thunk); + pn = med3(pn - 2 * d, pn - d, pn, cmp, thunk); + } + pm = med3(pl, pm, pn, cmp, thunk); + } + swap(a, pm); + pa = pb = (char *)a + es; + + pc = pd = (char *)a + (n - 1) * es; + for (;;) { + while (pb <= pc && (cmp_result = CMP(thunk, pb, a)) <= 0) { + if (cmp_result == 0) { + swap_cnt = 1; + swap(pa, pb); + pa += es; + } + pb += es; + } + while (pb <= pc && (cmp_result = CMP(thunk, pc, a)) >= 0) { + if 
(cmp_result == 0) { + swap_cnt = 1; + swap(pc, pd); + pd -= es; + } + pc -= es; + } + if (pb > pc) + break; + swap(pb, pc); + swap_cnt = 1; + pb += es; + pc -= es; + } + if (swap_cnt == 0) { /* Switch to insertion sort */ + for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es) + for (pl = pm; + pl > (char *)a && CMP(thunk, pl - es, pl) > 0; + pl -= es) + swap(pl, pl - es); + return; + } + + pn = (char *)a + n * es; + r = min(pa - (char *)a, pb - pa); + vecswap(a, pb - r, r); + r = min(pd - pc, pn - pd - es); + vecswap(pb, pn - r, r); + if ((r = pb - pa) > es) +#ifdef I_AM_QSORT_R + qsort_r(a, r / es, es, thunk, cmp); +#else + qsort(a, r / es, es, cmp); +#endif + if ((r = pd - pc) > es) { + /* Iterate rather than recurse to save stack space */ + a = pn - r; + n = r / es; + goto loop; + } +/* qsort(pn - r, r / es, es, cmp);*/ +} diff --git a/ipfw/qsort_r.c b/ipfw/qsort_r.c new file mode 100644 index 0000000..f7c0e54 --- /dev/null +++ b/ipfw/qsort_r.c @@ -0,0 +1,8 @@ +/* + * This file is in the public domain. Originally written by Garrett + * A. Wollman. 
+ * + * $FreeBSD: src/lib/libc/stdlib/qsort_r.c,v 1.1 2002/09/10 02:04:49 wollman Exp $ + */ +#define I_AM_QSORT_R +#include "qsort.c" diff --git a/ipfw/rule_test.sh b/ipfw/rule_test.sh new file mode 100755 index 0000000..d5ad6be --- /dev/null +++ b/ipfw/rule_test.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +COMMAND=ipfw + + +echo .########## Set $COMMAND mode .########## +$COMMAND add allow ip from any to any +$COMMAND -q flush + +echo .########## empty rules .########## +$COMMAND list +$COMMAND add allow ip from any to any +$COMMAND add allow ip from any to { 1.2.3.4 or 2.3.4.5 } +$COMMAND add allow { dst-ip 1.2.3.4 or dst-ip 2.3.4.5 } + +echo .########## listing 3 rules .########## +$COMMAND list + +$COMMAND delete 200 +echo .########## listing 2 rules .########## +$COMMAND list + +$COMMAND table 10 add 1.2.3.4 +$COMMAND table 10 add 1.2.3.5 +$COMMAND table 10 add 1.2.3.6 +$COMMAND table 10 add 1.2.3.7/13 +$COMMAND table 10 add 1.2.3.7/20 +$COMMAND table 10 add 1.2.3.7/28 + +echo .########## listing table 10 with 6 elements .########## +$COMMAND table 10 list +$COMMAND table 10 delete 1.2.3.6 + +echo .########## listing table 10 with 5 elements .########## +$COMMAND table 10 list +$COMMAND table 10 flush + +echo .########## table 10 empty .########## +$COMMAND table 10 list + +echo .########## move rule 100 to set 1 300 to 3 .########## +$COMMAND set move rule 100 to 1 +$COMMAND set move rule 300 to 3 +$COMMAND -S show + +echo .########## move rule 200 to 2 but 200 do not exist .###### +$COMMAND set move rule 200 to 2 + +echo .########## add some rules .########## +$COMMAND add 200 queue 2 proto ip +$COMMAND add 300 queue 5 proto ip +$COMMAND add 400 queue 40 proto ip +$COMMAND add 400 queue 50 proto ip + +echo .########## move rule 200 to 2 .###### +$COMMAND set move rule 200 to 2 + +echo .########## move rule 400 to 5 .###### +$COMMAND set move rule 400 to 5 + +echo .########## set 5 show 2 rules .###### +$COMMAND set 5 show + +echo .########## flush set 5 .######
+$COMMAND -q set 5 flush + +echo .########## set 5 show 0 rule .###### +$COMMAND set 5 show + +echo .########## disable set 1 .###### +$COMMAND set disable 1 + +echo .########## show all rules except set 1 .###### +$COMMAND -S show + +echo .########## enable set 1 .###### +$COMMAND set enable 1 + +echo .########## show all rules .###### +$COMMAND -S show + + + diff --git a/ipfw/ws2_32.def b/ipfw/ws2_32.def new file mode 100644 index 0000000..3813911 --- /dev/null +++ b/ipfw/ws2_32.def @@ -0,0 +1,120 @@ +LIBRARY ws2_32.dll + +EXPORTS +FreeAddrInfoW +GetAddrInfoW +GetNameInfoW +WEP +WPUCompleteOverlappedRequest +WSAAccept +WSAAddressToStringA +WSAAddressToStringW +WSAAsyncGetHostByAddr +WSAAsyncGetHostByName +WSAAsyncGetProtoByName +WSAAsyncGetProtoByNumber +WSAAsyncGetServByName +WSAAsyncGetServByPort +WSAAsyncSelect +WSACancelAsyncRequest +WSACancelBlockingCall +WSACleanup +WSACloseEvent +WSAConnect +WSACreateEvent +WSADuplicateSocketA +WSADuplicateSocketW +WSAEnumNameSpaceProvidersA +WSAEnumNameSpaceProvidersW +WSAEnumNetworkEvents +WSAEnumProtocolsA +WSAEnumProtocolsW +WSAEventSelect +WSAGetLastError +WSAGetOverlappedResult +WSAGetQOSByName +WSAGetServiceClassInfoA +WSAGetServiceClassInfoW +WSAGetServiceClassNameByClassIdA +WSAGetServiceClassNameByClassIdW +WSAHtonl +WSAHtons +WSAInstallServiceClassA +WSAInstallServiceClassW +WSAIoctl +WSAIsBlocking +WSAJoinLeaf +WSALookupServiceBeginA +WSALookupServiceBeginW +WSALookupServiceEnd +WSALookupServiceNextA +WSALookupServiceNextW +WSANSPIoctl +WSANtohl +WSANtohs +WSAProviderConfigChange +WSARecv +WSARecvDisconnect +WSARecvFrom +WSARemoveServiceClass +WSAResetEvent +WSASend +WSASendDisconnect +WSASendTo +WSASetBlockingHook +WSASetEvent +WSASetLastError +WSASetServiceA +WSASetServiceW +WSASocketA +WSASocketW +WSAStartup +WSAStringToAddressA +WSAStringToAddressW +WSAUnhookBlockingHook +WSAWaitForMultipleEvents +WSApSetPostRoutine +WSCDeinstallProvider +WSCEnableNSProvider +WSCEnumProtocols +WSCGetProviderPath 
+WSCInstallNameSpace +WSCInstallProvider +WSCUnInstallNameSpace +WSCUpdateProvider +WSCWriteNameSpaceOrder +WSCWriteProviderOrder +__WSAFDIsSet +accept +bind +closesocket +connect +freeaddrinfo +getaddrinfo +gethostbyaddr +gethostbyname +gethostname +getnameinfo +getpeername +getprotobyname +getprotobynumber +getservbyname +getservbyport +getsockname +getsockopt +htonl +htons +inet_addr +inet_ntoa +ioctlsocket +listen +ntohl +ntohs +recv +recvfrom +select +send +sendto +setsockopt +shutdown +socket diff --git a/kipfw/Makefile b/kipfw/Makefile new file mode 100644 index 0000000..6ca0562 --- /dev/null +++ b/kipfw/Makefile @@ -0,0 +1,367 @@ +# $Id: Makefile 12257 2013-04-26 21:13:24Z luigi $ +# gnu Makefile to build linux/Windows module for ipfw+dummynet. +# +# The defaults are set to build without modifications on PlanetLab +# and possibly 2.6 versions. +# On Windows, we use gnu-make and MSC + +# Some variables need to have specific names, because they are used +# by the build infrastructure on Linux and OpenWrt. They are: +# +# ccflags-y additional $(CC) flags +# M used by Kbuild, we must set it to `pwd` +# obj-m list of .o modules to build +# $(MOD)-y for each $MOD in obj-m, the list of objects +# obj-y same as above, for openwrt +# O_TARGET the link target, for openwrt +# EXTRA_CFLAGS as the name says... in openwrt +# EXTRA_CFLAGS is used in 2.6.22 module kernel compilation too +# KERNELPATH the path to the kernel sources or headers +# (on planetlab it is set already by the build system, +# for other systems we take KSRC which is either guessed +# or taken from the command line. +# +# Not sure about this (the name might be reserved) +# ipfw-cflags our flags for building the module +# +# Other variables are only private and can be renamed. They include: +# +# VER linux version we are building for (2.4 2.6 or openwrt) +# +#--- +# +# The windows files (passthru etc.) 
are modified version of the +# examples found in the $(DDK)/src/network/ndis/passthru/driver/ +# They can be re-created using the 'ndis-glue' target in the + +include $(PWD)/../Makefile.inc + +TARGET = kipfw + +# lets default for 2.6 for planetlab builds +VER ?= 2.6 + +#--- General values for all types of build --- +# obj-m is the target module +obj-m := ipfw_mod.o + +#-- the list of source files. IPFW_SRCS is our own name. +# Original ipfw and dummynet sources + FreeBSD stuff, +IPFW_SRCS := ip_fw2.c ip_fw_pfil.c ip_fw_sockopt.c +IPFW_SRCS += ip_fw_dynamic.c ip_fw_table.c ip_fw_log.c +IPFW_SRCS += radix.c in_cksum.c +IPFW_SRCS += ip_dummynet.c ip_dn_io.c ip_dn_glue.c +IPFW_SRCS += dn_heap.c +IPFW_SRCS += dn_sched_fifo.c dn_sched_wf2q.c +IPFW_SRCS += dn_sched_rr.c dn_sched_qfq.c +IPFW_SRCS += dn_sched_prio.c +# Module glue and functions missing in linux +IPFW_SRCS += ipfw2_mod.c bsd_compat.c + +# generic cflags used on all systems +#ipfw-cflags += -DIPFW_HASHTABLES +ipfw-cflags += -DIPFIREWALL_DEFAULT_TO_ACCEPT +# _BSD_SOURCE enables __FAVOR_BSD (udp/tcp bsd structs instead of posix) +ipfw-cflags += -D_BSD_SOURCE +ipfw-cflags += -DKERNEL_MODULE # build linux kernel module +# the two header trees for empty and override files +ipfw-cflags += -I $(M)/include_e +ipfw-cflags += -I $(M)/../sys +ipfw-cflags += -include $(M)/../glue.h # headers +ipfw-cflags += -include $(M)/missing.h # headers + +ifeq ($(OSARCH),Windows) #--- { Windows block +ifeq ($(VER),win64) + $(warning ---- building for 64-bit windows ---) + win_arch= -DAMD64=1 +else + win_arch= -Di386=1 +endif + M ?= $(shell pwd) + WIN_SRCS += md_win.c + WIN_SRCS += miniport.c protocol.c passthru.c debug.c + #compiler, linker, target, sources and objects + #DDK is exported from the root makefile + #DDK = C:/WinDDK/7600.16385.1 + + CSOURCES = $(IPFW_SRCS) $(WIN_SRCS) + + COBJS := $(CSOURCES:.c=.obj) + COBJS := $(addprefix $(OBJDIR)/,$(COBJS)) + + #include paths + INCLUDE_PATHS = -Ii386 -I../sys -Iinclude_e -I. 
+ # INCLUDE_PATHS += -I$(OBJDIR) + INCLUDE_PATHS += -I$(DDK)/inc/api + INCLUDE_PATHS += -I$(DDK)/inc/ddk + INCLUDE_PATHS += -I$(DDK)/inc/crt + + # #preprocessor MS defines + PREPROC = -D_X86_=1 -Di386=1 -DSTD_CALL -DCONDITION_HANDLING=1 + PREPROC += -DNT_UP=0 -DNT_INST=0 -DWIN32=100 -D_NT1X_=100 -DWINNT=1 + PREPROC += -D_WIN32_WINNT=0x0501 -DWINVER=0x0501 -D_WIN32_IE=0x0603 + PREPROC += -DWIN32_LEAN_AND_MEAN=1 + PREPROC += -D__BUILDMACHINE__=WinDDK -DFPO=0 -D_DLL=1 + PREPROC += -DNDIS_MINIPORT_DRIVER -DNDIS_WDM=1 + PREPROC += -DNDIS51_MINIPORT=1 -DNDIS51=1 + PREPROC += -DMSC_NOOPT -DNTDDI_VERSION=0x05010200 + PREPROC += -DKMDF_MAJOR_VERSION_STRING=01 -DKMDF_MINOR_VERSION_STRING=009 + #PREPROC += -DDBG=1 #debug + PREPROC += -DNDEBUG #always up, seems no effect, possibly no debug? + PREPROC += -DDEVL=1 #always up, seems no effect + #macroing module name, WARNING: must match the one in .inf files + PREPROC += -DMODULENAME=Ipfw + + #our defines + OUR_PREPROC = -D_KERNEL -DKERNEL_MODULE -DKLD_MODULE + OUR_PREPROC += -D__BSD_VISIBLE -DIPFIREWALL_DEFAULT_TO_ACCEPT + OUR_PREPROC += -D__LITTLE_ENDIAN -DSYSCTL_NODE -DEMULATE_SYSCTL + +ifeq ($(TCC),) # Microsoft C compiler + CC = $(DDK)/bin/x86/x86/cl.exe + LD = $(DDK)/bin/x86/x86/link.exe + # #compiler options + CFLAGS = -Fo$(OBJDIR)/ -c -FC -Zc:wchar_t- + CFLAGS += -Zl -Zp8 -Gy -Gm- -GF -cbstring -Gz -hotpatch -EHs-c- + CFLAGS += -W2 # -W3 gives too many conversion errors + CFLAGS += -GR- -GF -GS -Zi # XXX do we need this ?
+ CFLAGS += -Fd$(OBJDIR)/ + CFLAGS += -wd4603 -wd4627 -typedil- + CFLAGS += -FI $(DDK)/inc/api/warning.h + CFLAGS += -FI winmissing.h + CFLAGS += -FI missing.h # headers + CFLAGS += -FI ../glue.h # headers + + #optimization options + OPTIMIZE = -Od -Oi -Oy- + + #linker options + LDFLAGS = /MERGE:_PAGE=PAGE /MERGE:_TEXT=.text + LDFLAGS += /SECTION:INIT,d /OPT:REF /OPT:ICF + LDFLAGS += /IGNORE:4198,4010,4037,4039,4065,4070,4078,4087,4089,4221 + LDFLAGS += /INCREMENTAL:NO /release /NODEFAULTLIB /WX + LDFLAGS += /debug /debugtype:cv,fixup,pdata + LDFLAGS += /version:6.1 /osversion:6.1 /functionpadmin:5 + LDFLAGS += /safeseh /pdbcompress + LDFLAGS += /STACK:0x40000,0x1000 /driver /base:0x10000 /align:0x80 + LDFLAGS += /stub:$(DDK)\\lib\\wxp\\stub512.com + LDFLAGS += /subsystem:native,5.01 /entry:GsDriverEntry@8 + LDFLAGS += /out:$(OBJDIR)/ipfw.sys + + #libraries to build against + LIBS = $(DDK)/lib/wxp/i386/BufferOverflowK.lib + LIBS += $(DDK)/lib/wxp/i386/ntoskrnl.lib + LIBS += $(DDK)/lib/wxp/i386/hal.lib + LIBS += $(DDK)/lib/wxp/i386/wmilib.lib + LIBS += $(DDK)/lib/wxp/i386/ndis.lib + LIBS += $(DDK)/lib/wxp/i386/sehupd.lib +else # use tcc. not working yet for the kernel module. + # TCC points to the root of tcc tree + CC=$(TCC)/bin/wintcc + EXTRA_CFLAGS += -DTCC -I.. 
+ EXTRA_CFLAGS += -I$(TCC)/include/winapi -I$(TCC)/include + EXTRA_CFLAGS += -nostdinc + + CFLAGS += -include winmissing.h -include missing.h -include ../glue.h + CFLAGS += -I../../inc/api -I../../inc/ddk -I../../inc/crt + CFLAGS += -DRC_INVOKED +endif # use tcc + + #empty include directory to be built + M ?= $(shell pwd) + EFILES_asm += div64.h + EFILES_linux += if.h random.h errno.h + EFILES_net += if_types.h inet_hashtables.h route.h + + #targets +all: $(TARGET) + +$(TARGET): include_e + # XXX dangerous rm -rf $(OBJDIR) + mkdir -p $(OBJDIR) + $(MSG) " CC [$(CC)] $(CSOURCES)" + $(HIDE) $(CC) $(INCLUDE_PATHS) $(PREPROC) $(OUR_PREPROC) $(CFLAGS) $(OPTIMIZE) $(CSOURCES) + $(MSG) " LD [$(LD)] $(COBJS)" + $(HIDE) $(LD) $(LDFLAGS) $(COBJS) $(LIBS) + +else # } { linux variables and targets + +# We have three sections: OpenWrt, Linux 2.4 and Linux 2.6 + +ifeq ($(VER),openwrt) #--- { The Makefile section for openwrt --- + # We do not include a dependency on include_e as it is called + # by Makefile.openwrt in Build/Prepare + M=. + obj-y := $(IPFW_SRCS:%.c=%.o) + O_TARGET := $(obj-m) + + # xcflags-y is a temporary variable where we store build options + xcflags-y += -O1 -DLINUX_24 + xcflags-y += -g + + EXTRA_CFLAGS := $(xcflags-y) $(ipfw-cflags) -DSYSCTL_NODE -DEMULATE_SYSCTL + + # we should not export anything + #export-objs := ipfw2_mod.o +-include $(TOPDIR)/Rules.make +endif # ---- } end openwrt version + + +ifneq ($(shell echo $(VER)|grep '2.4'),) #--- { + # Makefile section for the linux 2.4 version + # tested on linux-2.4.35.4, does not work with 2.4.37 + # + # guess the kernel path -- or is it under /lib/modules ? + KERNELPATH ?= $(KSRC) + + # We need to figure out the gcc include directory, if not + # set by the user through MYGCC_INCLUDE + # Find compiler version (3rd field in last line returned by gcc -v) + # e.g. 
gcc version 4.3.2 (Debian 4.3.2-1.1) + MYGCC_VER ?= $(shell $(CC) -v 2>&1 |tail -n 1 | cut -d " " -f 3) + # We don't know the exact directory under /usr/lib/gcc so we guess + MYGCC_INCLUDE ?= $(shell echo /usr/lib/gcc/*/$(MYGCC_VER) | cut -d " " -f 1)/include + $(warning "---- gcc includes guessed to $(MYGCC_INCLUDE)") + + # additional warning + WARN += -Wall -Wundef + WARN += -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing + WARN += -fno-common -Werror-implicit-function-declaration + # WARN += -O2 -fno-stack-protector -m32 -msoft-float -mregparm=3 + # -mregparm=3 gives a printk error + WARN += -m32 -msoft-float # -mregparm=3 + #WARN += -freg-struct-return -mpreferred-stack-boundary=2 + WARN += -Wno-sign-compare + WARN += -Wdeclaration-after-statement + ifneq ($(MYGCC_VER),3.4.6) + WARN += -Wno-pointer-sign + endif + + ccflags-y += -O1 -DLINUX_24 + CFLAGS = -DMODULE -D__KERNEL__ -nostdinc \ + -isystem ${KERNELPATH}/include -isystem $(MYGCC_INCLUDE) \ + ${ccflags-y} + # The Main target +all: mod24 + +else # --- } { linux 2.6 and newer + + # This is the Makefile section for Linux 2.6.x including planetlab + +ifeq ($(IPFW_PLANETLAB),1) + $(warning "---- Building for PlanetLab") + ipfw-cflags += -DIPFW_PLANETLAB # PlanetLab compilation +endif + # if not set, use the version from the installed system + KERNELPATH ?= $(KSRC) +# $(warning "---- Building Version 2.6 $(VER) in $(KERNELPATH)") + WARN := -O1 -Wall -Werror -DDEBUG_SPINLOCK -DDEBUG_MUTEXES + # The main target + + # Required by GCC 4.6 + ccflags-y += -Wno-unused-but-set-variable + + # extract version number (decimal). Newer linuxes have a different dir + LINUX_VERSION_CODE := $(shell V=linux/version.h; G=. 
; \ + [ -f $(KERNELPATH)/include/$${V} ] || G=generated/uapi ;\ + grep LINUX_VERSION_CODE $(KERNELPATH)/include/$${G}/linux/version.h | \ + awk '{printf "%d", $$3} ') + + # awk '{printf "%d %03x%02d", $$3, $$3/256, $$3%256} ') + # $(warning version $(LINUX_VERSION_CODE)) + + ifeq ($(shell if [ -z $(LINUX_VERSION_CODE) ] ; then echo "true"; fi),true) + $(warning "---- Perhaps you miss a (cd $(KERNELPATH); make oldconfig; make prepare; make scripts)"); + endif + + # Required by kernel <= 2.6.22, ccflags-y is used on newer version + ifeq ($(shell if [ "$(LINUX_VERSION_CODE)" -le 132630 ] ; then echo "true"; fi),true) + EXTRA_CFLAGS += $(ccflags-y) + endif + + $(warning $(shell [ "$(LINUX_VERSION_CODE)" -le 132635 ] && \ + [ `$(MAKE) -version | head -1 | cut -d " " -f 3` != '3.81' ] && \ + echo "**** need make 3.81 *****") ) + # $(warning make is $(MAKE) version is $(shell $(MAKE) -version | head -1) ) + +all: $(TARGET) +$(TARGET): include_e + $(MAKE) -C $(KERNELPATH) V=$(V) M=`pwd` modules + +endif # } --- linux 2.6 and newer + +#-- back to the common section for linux + +# the list of objects used to build the module +ipfw_mod-y = $(IPFW_SRCS:%.c=%.o) + +# additional $(CC) flags +ccflags-y += $(WARN) +ccflags-y += $(ipfw-cflags) +# if we really want debug symbols... +ccflags-y += -g + +mod24: include_e $(obj-m) + +$(obj-m): $(ipfw_mod-y) + $(LD) $(LDFLAGS) -m elf_i386 -r -o $@ $^ + +# M is the current directory, used in recursive builds +# so we allow it to be overridden +M ?= $(shell pwd) + +endif # } ----- end of the non-Windows block + +ifneq ($(OBJDIR),mia) + $(error objdir set to $(OBJDIR)) +endif + +#--- various common targets +clean: + -@rm -f *.o *.ko Module.symvers *.mod.c + -@# rm -rf $(OBJDIR) + -@rm -rf include_e + +distclean: clean + -@rm -f .*cmd modules.order opt_* + -@rm -rf .tmp_versions .*.o.d _CL_* + +# support to create empty dirs and files in include_e/ +# EFILES_foo/bar is the list of files to be created in foo/bar +# (/ and . 
are allowed in gmake variable names) + +EFILES_. += opt_inet.h opt_inet6.h opt_ipfw.h opt_ipsec.h opt_mpath.h +EFILES_. += opt_mbuf_stress_test.h opt_param.h opt_ipdivert.h + +EFILES_altq += if_altq.h +EFILES_arpa += inet.h +EFILES_machine += in_cksum.h +EFILES_net += ethernet.h netisr.h pf_mtag.h bpf.h if_types.h vnet.h + +EFILES_netinet += ether.h icmp6.h if_ether.h in.h in_pcb.h in_var.h +EFILES_netinet += in_systm.h ip_carp.h ip_var.h pim.h +EFILES_netinet += sctp.h tcp_timer.h tcpip.h udp_var.h +EFILES_netinet6 += ip6_var.h + +EFILES_sys += _lock.h _rwlock.h rmlock.h _mutex.h jail.h +EFILES_sys += condvar.h eventhandler.h domain.h +EFILES_sys += limits.h lock.h mutex.h priv.h +EFILES_sys += proc.h rwlock.h socket.h socketvar.h +EFILES_sys += sysctl.h time.h ucred.h + +# first make a list of directories from variable names +EDIRS= $(subst EFILES_,,$(filter EFILES_%,$(.VARIABLES))) +# then prepend the directory name to individual files. +# $(empty) serves to interpret the following space literally, +# and the ": = " substitution packs spaces into one. +EFILES = $(foreach i,$(EDIRS),$(subst $(empty) , $(i)/, $(EFILES_$(i): = ))) + +include_e: + -@rm -rf $(M)/include_e opt_* + -@mkdir -p $(M)/include_e + -@(cd $(M)/include_e; mkdir -p $(EDIRS); touch $(EFILES) ) + +#--- some other targets for testing purposes +test_radix: test_radix.o radix.o +test_lookup: ip_fw_lookup.o +test_radix test_lookup: CFLAGS=-Wall -Werror -O1 diff --git a/kipfw/bsd_compat.c b/kipfw/bsd_compat.c new file mode 100644 index 0000000..ed2ce5d --- /dev/null +++ b/kipfw/bsd_compat.c @@ -0,0 +1,568 @@ +/* + * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: bsd_compat.c 11530 2012-08-01 10:29:32Z luigi $ + * + * kernel variables and functions that are not available in linux. + */ + +#include +#include /* do_div on 2.4 */ +#include /* get_random_bytes on 2.4 */ +#include +#include +#include + +/* + * gettimeofday would be in sys/time.h but it is not + * visible if _KERNEL is defined + */ +int gettimeofday(struct timeval *, struct timezone *); + +int ticks; /* kernel ticks counter */ +int hz = 1000; /* default clock time */ +long tick = 1000; /* XXX is this 1000000/hz ? */ +int bootverbose = 0; +struct timeval boottime; + +int ip_defttl = 64; /* XXX set default value */ +int max_linkhdr = 16; +int fw_one_pass = 1; +u_long in_ifaddrhmask; /* mask for hash table */ +struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ + +u_int rt_numfibs = RT_NUMFIBS; + +/* + * pfil hook support. + * We make pfil_head_get return a non-null pointer, which is then ignored + * in our 'add-hook' routines.
+ */ +struct pfil_head; +typedef int (pfil_hook_t) + (void *, struct mbuf **, struct ifnet *, int, struct inpcb *); + +struct pfil_head * +pfil_head_get(int proto, u_long flags) +{ + static int dummy; + return (struct pfil_head *)&dummy; +} + +int +pfil_add_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h) +{ + return 0; +} + +int +pfil_remove_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h) +{ + return 0; +} + +/* define empty body for kernel function */ +int +priv_check(struct thread *td, int priv) +{ + return 0; +} + +int +securelevel_ge(struct ucred *cr, int level) +{ + return 0; +} + +int +sysctl_handle_int(SYSCTL_HANDLER_ARGS) +{ + return 0; +} + +int +sysctl_handle_long(SYSCTL_HANDLER_ARGS) +{ + return 0; +} + +void +ether_demux(struct ifnet *ifp, struct mbuf *m) +{ + return; +} + +int +ether_output_frame(struct ifnet *ifp, struct mbuf *m) +{ + return 0; +} + +void +in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum) +{ + return; +} + +void +icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu) +{ + return; +} + +u_short +in_cksum_skip(struct mbuf *m, int len, int skip) +{ + return 0; +} + +u_short +in_cksum_hdr(struct ip *ip) +{ + return 0; +} + +/* + * we don't really reassemble, just return whatever we had. + */ +struct mbuf * +ip_reass(struct mbuf *clone) +{ + return clone; +} +#ifdef INP_LOCK_ASSERT +#undef INP_LOCK_ASSERT +#define INP_LOCK_ASSERT(a) +#endif + +/* credentials check */ +#include +#ifdef __linux__ +int +cred_check(void *_insn, int proto, struct ifnet *oif, + struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, + u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp, + struct sk_buff *skb) +{ + int match = 0; + ipfw_insn_u32 *insn = (ipfw_insn_u32 *)_insn; + + if (*ugid_lookupp == 0) { /* actively lookup and copy in cache */ + /* returns null if any element of the chain up to file is null. 
+ * if sk != NULL then we also have a reference + */ + *ugid_lookupp = linux_lookup(proto, + src_ip.s_addr, htons(src_port), + dst_ip.s_addr, htons(dst_port), + skb, oif ? 1 : 0, u); + } + if (*ugid_lookupp < 0) + return 0; + + if (insn->o.opcode == O_UID) + match = (u->uid == (uid_t)insn->d[0]); + else if (insn->o.opcode == O_JAIL) + match = (u->xid == (uid_t)insn->d[0]); + else if (insn->o.opcode == O_GID) + match = (u->gid == (uid_t)insn->d[0]); + return match; +} +#endif /* __linux__ */ + +int +jailed(struct ucred *cred) +{ + return 0; +} + +/* +* Return 1 if an internet address is for a ``local'' host +* (one to which we have a connection). If subnetsarelocal +* is true, this includes other subnets of the local net. +* Otherwise, it includes only the directly-connected (sub)nets. +*/ +int +in_localaddr(struct in_addr in) +{ + return 1; +} + +int +sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) +{ + size_t valsize = sopt->sopt_valsize; + + if (len < valsize) + sopt->sopt_valsize = valsize = len; + //printf("copyout buf = %p, sopt = %p, soptval = %p, len = %d \n", buf, sopt, sopt->sopt_val, len); + bcopy(buf, sopt->sopt_val, valsize); + return 0; +} + +/* + * copy data from userland to kernel + */ +int +sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) +{ + size_t valsize = sopt->sopt_valsize; + + if (valsize < minlen) + return EINVAL; + if (valsize > len) + sopt->sopt_valsize = valsize = len; + //printf("copyin buf = %p, sopt = %p, soptval = %p, len = %d \n", buf, sopt, sopt->sopt_val, len); + bcopy(sopt->sopt_val, buf, valsize); + return 0; +} + +void +getmicrouptime(struct timeval *tv) +{ + do_gettimeofday(tv); +} + + +#include + +char * +inet_ntoa_r(struct in_addr ina, char *buf) +{ +#ifdef _WIN32 +#else + unsigned char *ucp = (unsigned char *)&ina; + + sprintf(buf, "%d.%d.%d.%d", + ucp[0] & 0xff, + ucp[1] & 0xff, + ucp[2] & 0xff, + ucp[3] & 0xff); +#endif + return buf; +} + +char * +inet_ntoa(struct in_addr ina) +{ + 
static char buf[16]; + return inet_ntoa_r(ina, buf); +} + +int +random(void) +{ +#ifdef _WIN32 + static unsigned long seed; + if (seed == 0) { + LARGE_INTEGER tm; + KeQuerySystemTime(&tm); + seed = tm.LowPart; + } + return RtlRandomEx(&seed) & 0x7fffffff; +#else + int r; + get_random_bytes(&r, sizeof(r)); + return r & 0x7fffffff; +#endif +} + + +/* + * do_div really does a u64 / u32 bit division. + * we save the sign and convert to uint befor calling. + * We are safe just because we always call it with small operands. + */ +int64_t +div64(int64_t a, int64_t b) +{ +#ifdef _WIN32 + int a1 = a, b1 = b; + return a1/b1; +#else + uint64_t ua, ub; + int sign = ((a>0)?1:-1) * ((b>0)?1:-1); + + ua = ((a>0)?a:-a); + ub = ((b>0)?b:-b); + do_div(ua, ub); + return sign*ua; +#endif +} + +#ifdef __MIPSEL__ +size_t +strlcpy(char *dst, const char *src, size_t siz) +{ + char *d = dst; + const char *s = src; + size_t n = siz; + + /* Copy as many bytes as will fit */ + if (n != 0 && --n != 0) { + do { + if ((*d++ = *s++) == 0) + break; + } while (--n != 0); + } + + /* Not enough room in dst, add NUL and traverse rest of src */ + if (n == 0) { + if (siz != 0) + *d = '\0'; /* NUL-terminate dst */ + while (*s++) + ; + } + + return(s - src - 1); /* count does not include NUL */ +} +#endif // __MIPSEL__ + +/* + * compact version of fnmatch. + */ +int +fnmatch(const char *pattern, const char *string, int flags) +{ + char s; + + if (!string || !pattern) + return 1; /* no match */ + while ( (s = *string++) ) { + char p = *pattern++; + if (p == '\0') /* pattern is over, no match */ + return 1; + if (p == '*') /* wildcard, match */ + return 0; + if (p == '.' || p == s) /* char match, continue */ + continue; + return 1; /* no match */ + } + /* end of string, make sure the pattern is over too */ + if (*pattern == '\0' || *pattern == '*') + return 0; + return 1; /* no match */ +} + + +/* + * linux 2.6.33 defines these functions to access to + * skbuff internal structures. 
Define the missing + * function for the previous versions too. + */ +#ifdef linux +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31) +inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) +{ + skb->dst = dst; +} + +inline struct dst_entry *skb_dst(const struct sk_buff *skb) +{ + return (struct dst_entry *)skb->dst; +} +#endif /* < 2.6.31 */ +#endif /* linux */ + + +/* support for sysctl emulation. + * XXX this is actually MI code that should be enabled also on openwrt + */ +#ifdef EMULATE_SYSCTL +static struct sysctltable GST; + +int +kesysctl_emu_get(struct sockopt* sopt) +{ + struct dn_id* oid = sopt->sopt_val; + struct sysctlhead* entry; + int sizeneeded = sizeof(struct dn_id) + GST.totalsize + + sizeof(struct sysctlhead); + unsigned char* pstring; + unsigned char* pdata; + int i; + + if (sopt->sopt_valsize < sizeneeded) { + // this is a probe to retrieve the space needed for + // a dump of the sysctl table + oid->id = sizeneeded; + sopt->sopt_valsize = sizeof(struct dn_id); + return 0; + } + + entry = (struct sysctlhead*)(oid+1); + for( i=0; iblocklen = GST.entry[i].head.blocklen; + entry->namelen = GST.entry[i].head.namelen; + entry->flags = GST.entry[i].head.flags; + entry->datalen = GST.entry[i].head.datalen; + pdata = (unsigned char*)(entry+1); + pstring = pdata+GST.entry[i].head.datalen; + bcopy(GST.entry[i].data, pdata, GST.entry[i].head.datalen); + bcopy(GST.entry[i].name, pstring, GST.entry[i].head.namelen); + entry = (struct sysctlhead*) + ((unsigned char*)(entry) + GST.entry[i].head.blocklen); + } + sopt->sopt_valsize = sizeneeded; + return 0; +} + +int +kesysctl_emu_set(void* p, int l) +{ + struct sysctlhead* entry; + unsigned char* pdata; + unsigned char* pstring; + int i = 0; + + entry = (struct sysctlhead*)(((struct dn_id*)p)+1); + pdata = (unsigned char*)(entry+1); + pstring = pdata + entry->datalen; + + for (i=0; idatalen != GST.entry[i].head.datalen) { + printf("%s: len mismatch, user %d vs kernel %d\n", + __FUNCTION__, 
entry->datalen, + GST.entry[i].head.datalen); + return -1; + } + // check access (at the moment flags handles only the R/W rights + //later on will be type + access + if( (GST.entry[i].head.flags & 3) == CTLFLAG_RD) { + printf("%s: the entry %s is read only\n", + __FUNCTION__,GST.entry[i].name); + return -1; + } + bcopy(pdata, GST.entry[i].data, GST.entry[i].head.datalen); + return 0; + } + printf("%s: match not found\n",__FUNCTION__); + return 0; +} + +/* convert all _ to . until the first . */ +static void +underscoretopoint(char* s) +{ + for (; *s && *s != '.'; s++) + if (*s == '_') + *s = '.'; +} + +static int +formatnames() +{ + int i; + int size=0; + char* name; + + for (i=0; i> 2, + GST.entry[i].head.flags & 0x00000003); + printf("data %i\n", *(int*)(GST.entry[i].data)); + printf("datalen %i\n", GST.entry[i].head.datalen); + printf("blocklen %i\n", GST.entry[i].head.blocklen); + } +} + +void sysctl_addgroup_f1(); +void sysctl_addgroup_f2(); +void sysctl_addgroup_f3(); +void sysctl_addgroup_f4(); + +void +keinit_GST() +{ + int ret; + + sysctl_addgroup_f1(); + sysctl_addgroup_f2(); + sysctl_addgroup_f3(); + sysctl_addgroup_f4(); + ret = formatnames(); + if (ret != 0) + printf("conversion of names failed for some reason\n"); + //dumpGST(); + printf("*** Global Sysctl Table entries = %i, total size = %i ***\n", + GST.count, GST.totalsize); +} + +void +keexit_GST() +{ + if (GST.namebuffer != NULL) + free(GST.namebuffer,0); + bzero(&GST, sizeof(GST)); +} + +void +sysctl_pushback(char* name, int flags, int datalen, void* data) +{ + if (GST.count >= GST_HARD_LIMIT) { + printf("WARNING: global sysctl table full, this entry will not be added," + "please recompile the module increasing the table size\n"); + return; + } + GST.entry[GST.count].head.namelen = strlen(name)+1; //add space for '\0' + GST.entry[GST.count].name = name; + GST.entry[GST.count].head.flags = flags; + GST.entry[GST.count].data = data; + GST.entry[GST.count].head.datalen = datalen; + 
GST.entry[GST.count].head.blocklen = + ((sizeof(struct sysctlhead) + GST.entry[GST.count].head.namelen + + GST.entry[GST.count].head.datalen)+3) & ~3; + GST.totalsize += GST.entry[GST.count].head.blocklen; + GST.count++; +} +#endif /* EMULATE_SYSCTL */ diff --git a/kipfw/debug.c b/kipfw/debug.c new file mode 100644 index 0000000..67a4f23 --- /dev/null +++ b/kipfw/debug.c @@ -0,0 +1,67 @@ +#include + +const char* texify_cmd(int i) +{ + if (i==110) + return("IP_FW_ADD"); + if (i==111) + return("IP_FW_DEL"); + if (i==112) + return("IP_FW_FLUSH"); + if (i==113) + return("IP_FW_ZERO"); + if (i==114) + return("IP_FW_GET"); + if (i==115) + return("IP_FW_RESETLOG"); + if (i==116) + return("IP_FW_NAT_CFG"); + if (i==117) + return("IP_FW_NAT_DEL"); + if (i==118) + return("IP_FW_NAT_GET_CONFIG"); + if (i==119) + return("IP_FW_NAT_GET_LOG"); + if (i==120) + return("IP_DUMMYNET_CONFIGURE"); + if (i==121) + return("IP_DUMMYNET_DEL"); + if (i==122) + return("IP_DUMMYNET_FLUSH"); + if (i==124) + return("IP_DUMMYNET_GET"); + if (i==108) + return("IP_FW3"); + if (i==109) + return("IP_DUMMYNET3"); + return ("BOH"); +} + +const char* texify_proto(unsigned int p) +{ + if (p==1) + return("ICMP"); + if (p==6) + return("TCP"); + if (p==17) + return("UDP"); + return("OTHER"); +} + +void hexdump(unsigned char* addr, int len, const char *msg) +{ + int i; + const int cicli = len/8; + const int resto = len%8; + unsigned char d[8]; + + DbgPrint("%s at %p len %d\n", msg, addr, len); + for (i=0; i<=cicli; i++) { + bzero(d, 8); + bcopy(addr+i*8, d, i < cicli ? 
8 : resto); + DbgPrint("%04X %02X %02X %02X %02X %02X %02X %02X %02X\n", + i*8, d[0], d[1], d[2], d[3], d[4], + d[5], d[6], d[7]); + } + DbgPrint("\n"); +} diff --git a/kipfw/ipfw2_mod.c b/kipfw/ipfw2_mod.c new file mode 100644 index 0000000..d0824ce --- /dev/null +++ b/kipfw/ipfw2_mod.c @@ -0,0 +1,955 @@ +/* + * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: ipfw2_mod.c 12501 2014-01-10 01:09:14Z luigi $ + * + * The main interface to build ipfw+dummynet as a linux module. + * (and possibly as a windows module as well, though that part + * is not complete yet). 
+ * + * The control interface uses the sockopt mechanism + * on a socket(AF_INET, SOCK_RAW, IPPROTO_RAW). + * + * The data interface uses the netfilter interface, at the moment + * hooked to the PRE_ROUTING and POST_ROUTING hooks. + * Unfortunately the netfilter interface is a moving target, + * so we need a set of macros to adapt to the various cases. + * + * In the netfilter hook we just mark packet as 'QUEUE' and then + * let the queue handler to do the whole work (filtering and + * possibly emulation). + * As we receive packets, we wrap them with an mbuf descriptor + * so the existing ipfw+dummynet code runs unmodified. + */ + +#include +#include /* sizeof struct mbuf */ +#include /* NGROUPS */ + +#ifndef D +#define ND(fmt, ...) do {} while (0) +#define D1(fmt, ...) do {} while (0) +#define D(fmt, ...) printf("%-10s " fmt "\n", \ + __FUNCTION__, ## __VA_ARGS__) +#endif + +#ifdef __linux__ +#include +#include + +#ifndef CONFIG_NETFILTER +#error should configure netfilter (broken on 2.6.26 and below ?) +#endif + +#include +#include /* NF_IP_PRI_FILTER */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25) +#include /* nf_queue */ +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) +#define __read_mostly +#endif + +#endif /* !__linux__ */ + +#include /* in_addr */ +#include /* ip_fw_ctl_t, ip_fw_chk_t */ +#include /* ip_fw_ctl_t, ip_fw_chk_t */ +#include /* ip_dn_ctl_t, ip_dn_io_t */ +#include /* PFIL_IN, PFIL_OUT */ + +#ifdef __linux__ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,13) +/* XXX was < 2.6.0: inet_hashtables.h is introduced in 2.6.14 */ +// #warning --- inet_hashtables not present on 2.4 +#include +#include +#include +static inline int inet_iif(const struct sk_buff *skb) +{ + return ((struct rtable *)skb->dst)->rt_iif; +} + +#else +#include /* inet_lookup */ +#endif +#endif /* __linux__ */ + +#include /* inet_iif */ + +/* + * Here we allocate some global variables used in the firewall. 
+ */ +//ip_dn_ctl_t *ip_dn_ctl_ptr; +int (*ip_dn_ctl_ptr)(struct sockopt *); + +ip_fw_ctl_t *ip_fw_ctl_ptr; + +int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa); +ip_fw_chk_t *ip_fw_chk_ptr; + +void (*bridge_dn_p)(struct mbuf *, struct ifnet *); + +/* Divert hooks. */ +void (*ip_divert_ptr)(struct mbuf *m, int incoming); + +/* ng_ipfw hooks. */ +ng_ipfw_input_t *ng_ipfw_input_p = NULL; + +/*--- + * Glue code to implement the registration of children with the parent. + * Each child should call my_mod_register() when linking, so that + * module_init() and module_exit() can call init_children() and + * fini_children() to provide the necessary initialization. + * We use the same mechanism for MODULE_ and SYSINIT_. + * The former only get a pointer to the moduledata, + * the latter have two function pointers (init/uninit) + */ +#include +struct mod_args { + const char *name; + int order; + struct moduledata *mod; + void (*init)(void), (*uninit)(void); +}; + +static unsigned int mod_idx; +static struct mod_args mods[10]; /* hard limit to 10 modules */ + +int +my_mod_register(const char *name, int order, + struct moduledata *mod, void *init, void *uninit); +/* + * my_mod_register should be called automatically as the init + * functions in the submodules. Unfortunately this compiler/linker + * trick is not supported yet so we call it manually. + */ +int +my_mod_register(const char *name, int order, + struct moduledata *mod, void *init, void *uninit) +{ + struct mod_args m; + + m.name = name; + m.order = order; + m.mod = mod; + m.init = init; + m.uninit = uninit; + + printf("%s %s called\n", __FUNCTION__, name); + if (mod_idx < sizeof(mods) / sizeof(mods[0])) + mods[mod_idx++] = m; + return 0; +} + +static void +init_children(void) +{ + unsigned int i; + + /* Call the functions registered at init time. 
*/ + printf("%s mod_idx value %d\n", __FUNCTION__, mod_idx); + for (i = 0; i < mod_idx; i++) { + struct mod_args *m = &mods[i]; + printf("+++ start module %d %s %s at %p order 0x%x\n", + i, m->name, m->mod ? m->mod->name : "SYSINIT", + m->mod, m->order); + if (m->mod && m->mod->evhand) + m->mod->evhand(NULL, MOD_LOAD, m->mod->priv); + else if (m->init) + m->init(); + } +} + +static void +fini_children(void) +{ + int i; + + /* Call the functions registered at init time. */ + for (i = mod_idx - 1; i >= 0; i--) { + struct mod_args *m = &mods[i]; + printf("+++ end module %d %s %s at %p order 0x%x\n", + i, m->name, m->mod ? m->mod->name : "SYSINIT", + m->mod, m->order); + if (m->mod && m->mod->evhand) + m->mod->evhand(NULL, MOD_UNLOAD, m->mod->priv); + else if (m->uninit) + m->uninit(); + } +} +/*--- end of module binding helper functions ---*/ + +/*--- + * Control hooks: + * ipfw_ctl_h() is a wrapper for linux to FreeBSD sockopt call convention. + * then call the ipfw handler in order to manage requests. + * In turn this is called by the linux set/get handlers. + */ +static int +ipfw_ctl_h(struct sockopt *s, int cmd, int dir, int len, void __user *user) +{ + struct thread t; + int ret = EINVAL; + + memset(s, 0, sizeof(*s)); + s->sopt_name = cmd; + s->sopt_dir = dir; + s->sopt_valsize = len; + s->sopt_val = user; + + /* sopt_td is not used but it is referenced */ + memset(&t, 0, sizeof(t)); + s->sopt_td = &t; + + //printf("%s called with cmd %d len %d sopt %p user %p\n", __FUNCTION__, cmd, len, s, user); + + if (ip_fw_ctl_ptr && cmd != IP_DUMMYNET3 && (cmd == IP_FW3 || + cmd < IP_DUMMYNET_CONFIGURE)) + ret = ip_fw_ctl_ptr(s); + else if (ip_dn_ctl_ptr && (cmd == IP_DUMMYNET3 || + cmd >= IP_DUMMYNET_CONFIGURE)) + ret = ip_dn_ctl_ptr(s); + + return -ret; /* errors are < 0 on linux */ +} + +#ifdef linux +/* + * Convert an mbuf into an skbuff + * At the moment this only works for ip packets fully contained + * in a single mbuf. 
We assume that on entry ip_len and ip_off are + * in host format, and the ip checksum is not computed. + */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) /* check boundary */ +int dst_output(struct skbuff *s) +{ + return 0; +} + +struct sk_buff * +mbuf2skbuff(struct mbuf* m) +{ + return NULL; +} +#else +struct sk_buff * +mbuf2skbuff(struct mbuf* m) +{ + struct sk_buff *skb; + size_t len = m->m_pkthdr.len; + + /* used to lookup the routing table */ + struct rtable *r; + struct flowi fl; + int ret = 0; /* success for ip_route_output_key() */ + + struct ip *ip = mtod(m, struct ip *); + + /* XXX ip_output has ip_len and ip_off in network format, + * linux expects host format */ + ip->ip_len = ntohs(ip->ip_len); + ip->ip_off = ntohs(ip->ip_off); + + ip->ip_sum = 0; + ip->ip_sum = in_cksum(m, ip->ip_hl<<2); + + /* fill flowi struct, we need just the dst addr, see XXX */ + bzero(&fl, sizeof(fl)); + flow_daddr.daddr = ip->ip_dst.s_addr; + + /* + * ip_route_output_key() should increment + * r->u.dst.__use and call a dst_hold(dst) + * XXX verify how we release the resources. + */ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,38) /* check boundary */ + r = ip_route_output_key(&init_net, &fl.u.ip4); +#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26) /* check boundary */ + ret = ip_route_output_key(&init_net, &r, &fl); +#else + ret = ip_route_output_key(&r, &fl); +#endif + if (ret != 0 || r == NULL ) { + printf("NO ROUTE FOUND\n"); + return NULL; + } + + /* allocate the skbuff and the data */ + skb = alloc_skb(len + sizeof(struct ethhdr), GFP_ATOMIC); + if (skb == NULL) { + printf("%s: can not allocate SKB buffers.\n", __FUNCTION__); + return NULL; + } + + skb->protocol = htons(ETH_P_IP); // XXX 8 or 16 bit ? + /* sk_dst_set XXX take the lock (?) 
*/ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36) + skb_dst_set(skb, &r->u.dst); +#else + skb_dst_set(skb, &r->dst); +#endif + skb->dev = skb_dst(skb)->dev; + + /* reserve space for ethernet header */ + skb_reserve(skb, sizeof(struct ethhdr)); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) + skb_reset_network_header(skb); // skb->network_header = skb->data - skb->head +#else + skb->nh.raw = skb->data; +#endif + /* set skbuff tail pointers and copy content */ + skb_put(skb, len); + memcpy(skb->data, m->m_data, len); + + return skb; +} +#endif /* linux 2.6+ */ +#endif /* linux */ + + +/* + * This function is called to reinject packets to the + * kernel stack within the linux netfilter system + * or to send a new created mbuf. + * In the first case we have a valid sk_buff pointer + * encapsulated within the fake mbuf, so we can call + * the reinject function trough netisr_dispatch. + * In the last case we need to build a sk_buff from scratch, + * before sending out the packet. + */ +int +ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, + struct ip_moptions *imo, struct inpcb *inp) +{ + (void)opt; (void)ro; (void)flags; (void)imo; (void)inp; /* UNUSED */ + if ( m->m_skb != NULL ) { /* reinjected packet, just call dispatch */ + ND("sending... "); + netisr_dispatch(0, m); + } else { + /* self-generated packet, wrap as appropriate and send */ +#ifdef __linux__ + struct sk_buff *skb = mbuf2skbuff(m); + + if (skb != NULL) + dst_output(skb); +#else /* Windows */ + D("unimplemented."); +#endif + FREE_PKT(m); + } + return 0; +} + +/* + * setsockopt hook has no return value other than the error code. + */ +int +do_ipfw_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + struct sockopt s; /* pass arguments */ + (void)sk; /* UNUSED */ + return ipfw_ctl_h(&s, cmd, SOPT_SET, len, user); +} + +/* + * getsockopt can can return a block of data in response. 
+ */ +int +do_ipfw_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + struct sockopt s; /* pass arguments */ + int ret = ipfw_ctl_h(&s, cmd, SOPT_GET, *len, user); + + (void)sk; /* UNUSED */ + *len = s.sopt_valsize; /* return length back to the caller */ + return ret; +} + +#ifdef __linux__ + +/* + * declare our [get|set]sockopt hooks + */ +static struct nf_sockopt_ops ipfw_sockopts = { + .pf = PF_INET, + .set_optmin = _IPFW_SOCKOPT_BASE, + .set_optmax = _IPFW_SOCKOPT_END, + .set = do_ipfw_set_ctl, + .get_optmin = _IPFW_SOCKOPT_BASE, + .get_optmax = _IPFW_SOCKOPT_END, + .get = do_ipfw_get_ctl, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) + .owner = THIS_MODULE, +#endif +}; + +/*---- + * We need a number of macros to adapt to the various APIs in + * different linux versions. Among them: + * + * - the hook names change between macros (NF_IP*) and enum NF_INET_* + * + * - the second argument to the netfilter hook is + * struct sk_buff ** in kernels <= 2.6.22 + * struct sk_buff * in kernels > 2.6.22 + * + * - NF_STOP is not defined before 2.6 so we remap it to NF_ACCEPT + * + * - the packet descriptor passed to the queue handler is + * struct nf_info in kernels <= 2.6.24 + * struct nf_queue_entry in kernels <= 2.6.24 + * + * - the arguments to the queue handler also change; + */ + +/* + * declare hook to grab packets from the netfilter interface. + * The NF_* names change in different versions of linux, in some + * cases they are #defines, in others they are enum, so we + * need to adapt. + */ +#ifndef NF_IP_PRE_ROUTING +#define NF_IP_PRE_ROUTING NF_INET_PRE_ROUTING +#endif +#ifndef NF_IP_POST_ROUTING +#define NF_IP_POST_ROUTING NF_INET_POST_ROUTING +#endif + +/* + * ipfw hooks into the POST_ROUTING and the PRE_ROUTING chains. + * PlanetLab sets skb_tag to the slice id in the LOCAL_INPUT and + * POST_ROUTING chains, so if we want to use that information we + * need to hook the LOCAL_INPUT chain instead of the PRE_ROUTING. 
+ * However at the moment the skb_tag info is not reliable so + * we stay with the standard hooks. + */ +#if 0 // defined(IPFW_PLANETLAB) +#define IPFW_HOOK_IN NF_IP_LOCAL_IN +#else +#define IPFW_HOOK_IN NF_IP_PRE_ROUTING +#endif + +/* + * The main netfilter hook. + * To make life simple, we queue everything and then do all the + * decision in the queue handler. + * + * XXX note that in 2.4 and up to 2.6.22 the skbuf is passed as sk_buff** + * so we have an #ifdef to set the proper argument type. + */ +static unsigned int +call_ipfw( +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) + unsigned int hooknum, +#else + const struct nf_hook_ops *hooknum, +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) // in 2.6.22 we have ** + struct sk_buff **skb, +#else + struct sk_buff *skb, +#endif + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + (void)hooknum; (void)skb; (void)in; (void)out; (void)okfn; /* UNUSED */ + return NF_QUEUE; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12) /* XXX was 2.6.0 */ +#define NF_STOP NF_ACCEPT +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) + +/* + * nf_queue_entry is a recent addition, in previous versions + * of the code the struct is called nf_info. + */ +#define nf_queue_entry nf_info /* for simplicity */ + +/* also, 2.4 and perhaps something else have different arguments */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) /* XXX unsure */ +/* on 2.4 we use nf_info */ +#define QH_ARGS struct sk_buff *skb, struct nf_info *info, void *data +#else /* 2.6.14. 
2.6.24 */ +#define QH_ARGS struct sk_buff *skb, struct nf_info *info, unsigned int qnum, void *data +#endif + +#define DEFINE_SKB /* nothing, already an argument */ +#define REINJECT(_inf, _verd) nf_reinject(skb, _inf, _verd) + +#else /* 2.6.25 and above */ + +#define QH_ARGS struct nf_queue_entry *info, unsigned int queuenum +#define DEFINE_SKB struct sk_buff *skb = info->skb; +#define REINJECT(_inf, _verd) nf_reinject(_inf, _verd) +#endif + +/* + * used by dummynet when dropping packets + * XXX use dummynet_send() + */ +void +reinject_drop(struct mbuf* m) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) /* unsure on the exact boundary */ + struct sk_buff *skb = (struct sk_buff *)m; +#endif + REINJECT(m->queue_entry, NF_DROP); +} + +/* + * The real call to the firewall. nf_queue_entry points to the skbuf, + * and eventually we need to return both through nf_reinject(). + */ +static int +ipfw2_queue_handler(QH_ARGS) +{ + DEFINE_SKB /* no semicolon here, goes in the macro */ + int ret = 0; /* return value */ + struct mbuf *m; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) + if (skb->nh.iph == NULL) { + printf("null dp, len %d reinject now\n", skb->len); + REINJECT(info, NF_ACCEPT); + return 0; + } +#endif + m = malloc(sizeof(*m), 0, 0); + if (m == NULL) { + printf("malloc fail, len %d reinject now\n", skb->len); + REINJECT(info, NF_ACCEPT); + return 0; + } + + m->m_skb = skb; + m->m_len = skb->len; /* len from ip header to end */ + m->m_pkthdr.len = skb->len; /* total packet len */ + m->m_pkthdr.rcvif = info->indev; + m->queue_entry = info; +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) /* XXX was 2.6.0 */ + m->m_data = (char *)skb->nh.iph; +#else + m->m_data = (char *)skb_network_header(skb); // XXX unsigned ? 
*/ +#endif + + /* XXX add the interface */ + if (info->hook == IPFW_HOOK_IN) { + ret = ipfw_check_hook(NULL, &m, info->indev, PFIL_IN, NULL); + } else { + ret = ipfw_check_hook(NULL, &m, info->outdev, PFIL_OUT, NULL); + } + + if (m != NULL) { /* Accept. reinject and free the mbuf */ + REINJECT(info, NF_ACCEPT); + m_freem(m); + } else if (ret == 0) { + /* dummynet has kept the packet, will reinject later. */ + } else { + /* + * Packet dropped by ipfw or dummynet. Nothing to do as + * FREE_PKT already did a reinject as NF_DROP + */ + } + return 0; +} + +struct route; +struct ip_moptions; +struct inpcb; + +/* XXX should include prototypes for netisr_dispatch and ip_output */ +/* + * The reinjection routine after a packet comes out from dummynet. + * We must update the skb timestamp so ping reports the right time. + * This routine is also used (with num == -1) as FREE_PKT. XXX + */ +void +netisr_dispatch(int num, struct mbuf *m) +{ + struct nf_queue_entry *info = m->queue_entry; + struct sk_buff *skb = m->m_skb; /* always used */ + + /* + * This function can be called by the FREE_PKT() + * used when ipfw generate their own mbuf packets + * or by the mbuf2skbuff() function. + */ + m_freem(m); + + /* XXX check + * info is null in the case of a real mbuf + * (one created by the ipfw code without a + * valid sk_buff pointer + */ + if (info == NULL) + return; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) // XXX above 2.6.x ? + __net_timestamp(skb); /* update timestamp */ +#endif + + /* XXX to obey one-pass, possibly call the queue handler here */ + REINJECT(info, ((num == -1)?NF_DROP:NF_STOP)); /* accept but no more firewall */ +} + +/* + * socket lookup function for linux. + * This code is used to associate uid, gid, jail/xid to packets, + * and store the info in a cache *ugp where they can be accessed quickly. + * The function returns 1 if the info is found, -1 otherwise. + * + * We do this only on selected protocols: TCP, ... 
+ * + * The chain is the following + * sk_buff* sock* socket* file* + * skb -> sk ->sk_socket->file ->f_owner ->pid + * skb -> sk ->sk_socket->file ->f_uid (direct) + * skb -> sk ->sk_socket->file ->f_cred->fsuid (2.6.29+) + * + * Related headers: + * linux/skbuff.h struct skbuff + * net/sock.h struct sock + * linux/net.h struct socket + * linux/fs.h struct file + * + * With vserver we may have sk->sk_xid and sk->sk_nid that + * which we store in fw_groups[1] (matches O_JAIL) and fw_groups[2] + * (no matches yet) + * + * Note- for locally generated, outgoing packets we should not need + * need a lookup because the sk_buff already points to the socket where + * the info is. + */ +extern struct inet_hashinfo tcp_hashinfo; +int +linux_lookup(const int proto, const __be32 saddr, const __be16 sport, + const __be32 daddr, const __be16 dport, + struct sk_buff *skb, int dir, struct bsd_ucred *u) +{ +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,13) /* XXX was 2.6.0 */ + return -1; +#else + struct sock *sk; + int ret = -1; /* default return value */ + int st = -1; /* state */ + + + if (proto != IPPROTO_TCP) /* XXX extend for UDP */ + return -1; + + if ((dir ? (void *)skb_dst(skb) : (void *)skb->dev) == NULL) { + panic(" -- this should not happen\n"); + return -1; + } + + if (skb->sk) { + sk = skb->sk; + } else { + /* + * Try a lookup. On a match, sk has a refcount that we must + * release on exit (we know it because skb->sk = NULL). + * + * inet_lookup above 2.6.24 has an additional 'net' parameter + * so we use a macro to conditionally supply it. + * swap dst and src depending on the direction. + */ +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,24) +#define _OPT_NET_ARG +#else +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) +/* there is no dev_net() on 2.6.25 */ +#define _OPT_NET_ARG (skb->dev->nd_net), +#else /* 2.6.26 and above */ +#define _OPT_NET_ARG dev_net(skb->dev), +#endif +#endif + sk = (dir) ? 
/* dir != 0 on output */ + inet_lookup(_OPT_NET_ARG &tcp_hashinfo, + daddr, dport, saddr, sport, // match outgoing + inet_iif(skb)) : + inet_lookup(_OPT_NET_ARG &tcp_hashinfo, + saddr, sport, daddr, dport, // match incoming + skb->dev->ifindex); +#undef _OPT_NET_ARG + + if (sk == NULL) /* no match, nothing to be done */ + return -1; + } + ret = 1; /* retrying won't make things better */ + st = sk->sk_state; +#ifdef CONFIG_VSERVER + u->xid = sk->sk_xid; + u->nid = sk->sk_nid; +#else + u->xid = u->nid = 0; +#endif + /* + * Exclude tcp states where sk points to a inet_timewait_sock which + * has no sk_socket field (surely TCP_TIME_WAIT, perhaps more). + * To be safe, use a whitelist and not a blacklist. + * Before dereferencing sk_socket grab a lock on sk_callback_lock. + * + * Once again we need conditional code because the UID and GID + * location changes between kernels. + */ +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,28) +/* use the current's real uid/gid */ +#define _CURR_UID f_uid +#define _CURR_GID f_gid +#else /* 2.6.29 and above */ +/* use the current's file access real uid/gid */ +#define _CURR_UID f_cred->fsuid +#define _CURR_GID f_cred->fsgid +#endif + +#define GOOD_STATES ( \ + (1<sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) { + u->uid = sk->sk_socket->file->_CURR_UID; + u->gid = sk->sk_socket->file->_CURR_GID; + } + read_unlock_bh(&sk->sk_callback_lock); + } else { + u->uid = u->gid = 0; + } + if (!skb->sk) /* return the reference that came from the lookup */ + sock_put(sk); +#undef GOOD_STATES +#undef _CURR_UID +#undef _CURR_GID + return ret; + +#endif /* LINUX > 2.4 */ +} + +/* + * Now prepare to hook the various functions. 
+ * Linux 2.4 has a different API so we need some adaptation + * for register and unregister hooks + * + * the unregister function changed arguments between 2.6.22 and 2.6.24 + */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14) +struct nf_queue_handler ipfw2_queue_handler_desc = { + .outfn = ipfw2_queue_handler, +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,2) + .name = "ipfw2 dummynet queue", +#endif +}; +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,2) +#define REG_QH_ARG(pf, fn) pf, &(fn ## _desc) +#else +#define REG_QH_ARG(pf, fn) &(fn ## _desc) +#endif +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) /* XXX was 2.6.0 */ +static int +nf_register_hooks(struct nf_hook_ops *ops, int n) +{ + int i, ret = 0; + for (i = 0; i < n; i++) { + ret = nf_register_hook(ops + i); + if (ret < 0) + break; + } + return ret; +} + +static void +nf_unregister_hooks(struct nf_hook_ops *ops, int n) +{ + int i; + for (i = 0; i < n; i++) { + nf_unregister_hook(ops + i); + } +} +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) /* XXX was 2.6.0 */ +#define REG_QH_ARG(pf, fn) pf, fn, NULL +#endif +#define UNREG_QH_ARG(pf, fn) //fn /* argument for nf_[un]register_queue_handler */ +#define SET_MOD_OWNER + +#else /* linux > 2.6.17 */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) +#define UNREG_QH_ARG(pf, fn) //fn +#elif LINUX_VERSION_CODE < KERNEL_VERSION(3,8,2) +#define UNREG_QH_ARG(pf, fn) pf, &(fn ## _desc) +#else +#define UNREG_QH_ARG(pf, fn) +#endif /* 2.6.0 < LINUX > 2.6.24 */ + +#define SET_MOD_OWNER .owner = THIS_MODULE, + +#endif /* !LINUX < 2.6.0 */ + +static struct nf_hook_ops ipfw_ops[] __read_mostly = { + { + .hook = call_ipfw, + .pf = PF_INET, + .hooknum = IPFW_HOOK_IN, + .priority = NF_IP_PRI_FILTER, + SET_MOD_OWNER + }, + { + .hook = call_ipfw, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_FILTER, + SET_MOD_OWNER + }, +}; +#endif /* __linux__ */ + +/* descriptors for the children, until i find a way for the + * linker to produce them + 
*/ +extern moduledata_t *moddesc_ipfw; +extern moduledata_t *moddesc_dummynet; +extern moduledata_t *moddesc_dn_fifo; +extern moduledata_t *moddesc_dn_wf2qp; +extern moduledata_t *moddesc_dn_rr; +extern moduledata_t *moddesc_dn_qfq; +extern moduledata_t *moddesc_dn_prio; +extern void *sysinit_ipfw_init; +extern void *sysuninit_ipfw_destroy; +extern void *sysinit_vnet_ipfw_init; +extern void *sysuninit_vnet_ipfw_uninit; + +/* + * Module glue - init and exit function. + */ +int __init +ipfw_module_init(void) +{ + int ret = 0; +#ifdef _WIN32 + unsigned long resolution; +#endif + + rn_init(64); + my_mod_register("ipfw", 1, moddesc_ipfw, NULL, NULL); + my_mod_register("sy_ipfw", 2, NULL, + sysinit_ipfw_init, sysuninit_ipfw_destroy); + my_mod_register("sy_Vnet_ipfw", 3, NULL, + sysinit_vnet_ipfw_init, sysuninit_vnet_ipfw_uninit); + my_mod_register("dummynet", 4, moddesc_dummynet, NULL, NULL); + my_mod_register("dn_fifo", 5, moddesc_dn_fifo, NULL, NULL); + my_mod_register("dn_wf2qp", 6, moddesc_dn_wf2qp, NULL, NULL); + my_mod_register("dn_rr", 7, moddesc_dn_rr, NULL, NULL); + my_mod_register("dn_qfq", 8, moddesc_dn_qfq, NULL, NULL); + my_mod_register("dn_prio", 9, moddesc_dn_prio, NULL, NULL); + init_children(); + +#ifdef _WIN32 + resolution = ExSetTimerResolution(1, TRUE); + printf("*** ExSetTimerResolution: resolution set to %d n-sec ***\n",resolution); +#endif +#ifdef EMULATE_SYSCTL + keinit_GST(); +#endif + +#ifdef __linux__ + /* sockopt register, in order to talk with user space */ + ret = nf_register_sockopt(&ipfw_sockopts); + if (ret < 0) { + printf("error %d in nf_register_sockopt\n", ret); + goto clean_modules; + } + + /* queue handler registration, in order to get network + * packet under a private queue */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,2) + ret = +#endif + nf_register_queue_handler(REG_QH_ARG(PF_INET, ipfw2_queue_handler) ); + if (ret < 0) /* queue busy */ + goto unregister_sockopt; + + ret = nf_register_hooks(ipfw_ops, ARRAY_SIZE(ipfw_ops)); + 
if (ret < 0) + goto unregister_sockopt; + + printf("%s loaded\n", __FUNCTION__); + return 0; + + +/* handle errors on load */ +unregister_sockopt: + nf_unregister_queue_handler(UNREG_QH_ARG(PF_INET, ipfw2_queue_handler) ); + nf_unregister_sockopt(&ipfw_sockopts); + +clean_modules: + fini_children(); + printf("%s error\n", __FUNCTION__); + +#endif /* __linux__ */ + return ret; +} + +/* module shutdown */ +void __exit +ipfw_module_exit(void) +{ +#ifdef EMULATE_SYSCTL + keexit_GST(); +#endif +#ifdef _WIN32 + ExSetTimerResolution(0,FALSE); + +#else /* linux hook */ + nf_unregister_hooks(ipfw_ops, ARRAY_SIZE(ipfw_ops)); + /* maybe drain the queue before unregistering ? */ + nf_unregister_queue_handler(UNREG_QH_ARG(PF_INET, ipfw2_queue_handler) ); + nf_unregister_sockopt(&ipfw_sockopts); +#endif /* __linux__ */ + + fini_children(); + + printf("%s unloaded\n", __FUNCTION__); +} + +#ifdef __linux__ +module_init(ipfw_module_init) +module_exit(ipfw_module_exit) +MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */ +#endif diff --git a/kipfw/md_win.c b/kipfw/md_win.c new file mode 100644 index 0000000..9e66889 --- /dev/null +++ b/kipfw/md_win.c @@ -0,0 +1,636 @@ +/* + * Copyright (C) 2010 Luigi Rizzo, Francesco Magno, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * kernel variables and functions that are not available in Windows. + */ + +#include /* provides PFIL_IN and PFIL_OUT */ +#include +#include /* in_addr */ +#include +#include +#include + +/* credentials check */ +int +cred_check(void *_insn, int proto, struct ifnet *oif, + struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, + u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp, + struct sk_buff *skb) +{ + return 0; +} + +/* + * as good as anywhere, place here the missing calls + */ + +void * +my_alloc(int size) +{ + void *_ret = ExAllocatePoolWithTag(NonPagedPool, size, 'wfpi'); + if (_ret) + memset(_ret, 0, size); + return _ret; +} + +void +panic(const char *fmt, ...) 
+{ + printf("%s", fmt); + for (;;); +} + +int securelevel = 0; + +int ffs(int bits) +{ + int i; + if (bits == 0) + return (0); + for (i = 1; ; i++, bits >>= 1) { + if (bits & 1) + break; + } + return (i); +} + +void +do_gettimeofday(struct timeval *tv) +{ + static LARGE_INTEGER prevtime; //system time in 100-nsec resolution + static LARGE_INTEGER prevcount; //RTC counter value + static LARGE_INTEGER freq; //frequency + + LARGE_INTEGER currtime; + LARGE_INTEGER currcount; + if (prevtime.QuadPart == 0) { //first time we ask for system time + KeQuerySystemTime(&prevtime); + prevcount = KeQueryPerformanceCounter(&freq); + currtime.QuadPart = prevtime.QuadPart; + } else { + KeQuerySystemTime(&currtime); + currcount = KeQueryPerformanceCounter(&freq); + if (currtime.QuadPart == prevtime.QuadPart) { + //time has NOT changed, calculate time using ticks and DO NOT update + LONGLONG difftime = 0; //difference in 100-nsec + LONGLONG diffcount = 0; //clock count difference + //printf("time has NOT changed\n"); + diffcount = currcount.QuadPart - prevcount.QuadPart; + diffcount *= 10000000; + difftime = diffcount / freq.QuadPart; + currtime.QuadPart += difftime; + } else { + //time has changed, update and return SystemTime + //printf("time has changed\n"); + prevtime.QuadPart = currtime.QuadPart; + prevcount.QuadPart = currcount.QuadPart; + } + } + currtime.QuadPart /= 10; //convert in usec + tv->tv_sec = currtime.QuadPart / (LONGLONG)1000000; + tv->tv_usec = currtime.QuadPart % (LONGLONG)1000000; + //printf("sec %d usec %d\n",tv->tv_sec, tv->tv_usec); +} + +int time_uptime_w32() +{ + int ret; + LARGE_INTEGER tm; + KeQuerySystemTime(&tm); + ret = (int)(tm.QuadPart / (LONGLONG)1000000); + return ret; +} + + +/* + * Windows version of firewall hook. We receive a partial copy of + * the packet which points to the original buffers. In output, + * the refcount has been already incremented. 
+ * The function reconstructs + * the whole packet in a contiguous memory area, builds a fake mbuf, + * calls the firewall, does the eventual cleaning and returns + * to MiniportSend or ProtocolReceive, which will silently return + * (dropping packet) or continue its execution (allowing packet). + * The memory area contains: + * - the fake mbuf, filled with data needed by ipfw, and information + * for reinjection + * - the packet data + */ +void hexdump(PUCHAR,int, const char *); +static char _if_in[] = "incoming"; +static char _if_out[] = "outgoing"; + +int +ipfw2_qhandler_w32(PNDIS_PACKET pNdisPacket, int direction, + NDIS_HANDLE Context) +{ + unsigned int BufferCount = 0; + unsigned TotalPacketLength = 0; + PNDIS_BUFFER pCurrentBuffer = NULL; + PNDIS_BUFFER pNextBuffer = NULL; + struct mbuf* m; + unsigned char* payload = NULL; + unsigned int ofs, l; + unsigned short EtherType = 0; + unsigned int i = 0; + int ret = 0; + PNDIS_BUFFER pNdisBuffer, old_head, old_tail; + NDIS_HANDLE PacketPool; + PADAPT pAdapt; + NDIS_STATUS Status; + + /* In NDIS, packets are a chain of NDIS_BUFFER. We query + * the packet to get a pointer of chain's head, the length + * of the chain, and the length of the packet itself. + * Then allocate a buffer for the mbuf and the payload. + */ + NdisQueryPacket(pNdisPacket, NULL, &BufferCount, + &pCurrentBuffer, &TotalPacketLength); + m = malloc(sizeof(struct mbuf) + TotalPacketLength, 0, 0 ); + if (m == NULL) //resource shortage, drop the packet + goto drop_pkt; + + /* set mbuf fields to point past the MAC header. + * Also set additional W32 info + */ + payload = (unsigned char*)(m + 1); + m->m_len = m->m_pkthdr.len = TotalPacketLength-14; + m->m_pkthdr.rcvif = (void *)((direction==INCOMING) ? 
_if_in : NULL); + m->m_data = payload + 14; /* past the MAC header */ + m->direction = direction; + m->context = Context; + m->pkt = pNdisPacket; + + /* m_skb != NULL is used in the ip_output routine to check + * for packets that come from the stack and differentiate + * from those internally generated by ipfw. + * The pointer is not used, just needs to be non-null. + */ + m->m_skb = (void *)pNdisPacket; + /* + * Now copy the data from the Windows buffers to the mbuf. + */ + for (i=0, ofs = 0; i < BufferCount; i++) { + unsigned char* src; + NdisQueryBufferSafe(pCurrentBuffer, &src, &l, + NormalPagePriority); + bcopy(src, payload + ofs, l); + ofs += l; + NdisGetNextBuffer(pCurrentBuffer, &pNextBuffer); + pCurrentBuffer = pNextBuffer; + } + /* + * Identify EtherType. If the packet is not IP, simply allow + * and don't bother the firewall. XXX should be done before. + */ + EtherType = *(unsigned short*)(payload + 12); + EtherType = RtlUshortByteSwap(EtherType); + if (EtherType != 0x0800) { + //DbgPrint("ethertype = %X, skipping ipfw\n",EtherType); + free(m, 0); + return PASS; + } + + /* + * Now build a buffer descriptor to replace the original chain. + */ + pAdapt = Context; + PacketPool = direction == OUTGOING ? + pAdapt->SendPacketPoolHandle : pAdapt->RecvPacketPoolHandle; + NdisAllocateBuffer(&Status, &pNdisBuffer, + PacketPool, payload, m->m_pkthdr.len+14); + if (Status != NDIS_STATUS_SUCCESS) + goto drop_pkt; + /* + * Save the old buffer pointers, and put the new one + * into the chain. 
+ */ + pNdisBuffer->Next = NULL; + old_head = NDIS_PACKET_FIRST_NDIS_BUFFER(pNdisPacket); + old_tail = NDIS_PACKET_LAST_NDIS_BUFFER(pNdisPacket); + NdisReinitializePacket(pNdisPacket); + NdisChainBufferAtFront(pNdisPacket, pNdisBuffer); +#if 0 + if (direction == INCOMING) { + DBGPRINT(("incoming: proto %u (%s), src %08X, dst %08X, sport %u, dport %u, len %u\n", *(payload+14+9), texify_proto(*(payload+14+9)), *(unsigned int*)(payload+14+12), *(unsigned int*)(payload+14+16), ntohs((*((unsigned short int*)(payload+14+20)))), ntohs((*((unsigned short int*)(payload+14+22)))), TotalPacketLength)); + } else { + DBGPRINT(("outgoing: proto %u (%s), src %08X, dst %08X, sport %u, dport %u, len %u\n", *(payload+14+9), texify_proto(*(payload+14+9)), *(unsigned int*)(payload+14+12), *(unsigned int*)(payload+14+16), ntohs((*((unsigned short int*)(payload+14+20)))), ntohs((*((unsigned short int*)(payload+14+22)))), TotalPacketLength)); + } +#endif + if (direction == INCOMING) + ret = ipfw_check_hook(NULL, &m, NULL, PFIL_IN, NULL); + else + ret = ipfw_check_hook(NULL, &m, (struct ifnet*)_if_out, PFIL_OUT, NULL); + + if (m != NULL) { + /* Accept. Restore the old buffer chain, free + * the mbuf and return PASS. + */ + //DBGPRINT(("accepted\n")); + NdisReinitializePacket(pNdisPacket); + NDIS_PACKET_FIRST_NDIS_BUFFER(pNdisPacket) = old_head; + NDIS_PACKET_LAST_NDIS_BUFFER(pNdisPacket) = old_tail; + NdisFreeBuffer(pNdisBuffer); + m_freem(m); + return PASS; + } else if (ret == 0) { + /* dummynet has kept the packet, will reinject later. */ + //DBGPRINT(("kept by dummynet\n")); + return DUMMYNET; + } else { + /* + * Packet dropped by ipfw or dummynet. Nothing to do as + * FREE_PKT already freed the fake mbuf + */ + //DBGPRINT(("dropped by dummynet, ret = %i\n", ret)); + return DROP; + } +drop_pkt: + /* for some reason we cannot proceed. Free any resources + * including those received from above, and return + * faking success. XXX this must be fixed later. 
+ */ + NdisFreePacket(pNdisPacket); + return DROP; +} + +/* + * Windows reinjection function. + * The packet is already available as m->pkt, so we only + * need to send it to the right place. + * Normally a ndis intermediate driver allocates + * a fresh descriptor, while the actual data's ownership is + * retained by the protocol, or the miniport below. + * Since an intermediate driver behaves as a miniport driver + * at the upper edge (towards the protocol), and as a protocol + * driver at the lower edge (towards the NIC), when we handle a + * packet we have a reserved area in both directions (we can use + * only one for each direction at our own discretion). + * Normally this area is used to save a pointer to the original + * packet, so when the driver is done with it, the original descriptor + * can be retrieved, and the resources freed (packet descriptor, + * buffer descriptor(s) and the actual data). In our driver this + * area is used to mark the reinjected packets as 'orphan', because + * the original descriptor is gone long ago. This way we can handle + * correctly the resource freeing when the callback function + * is called by NDIS. + */ + +void +netisr_dispatch(int num, struct mbuf *m) +{ + unsigned char* payload = (unsigned char*)(m+1); + PADAPT pAdapt = m->context; + NDIS_STATUS Status; + PNDIS_PACKET pPacket = m->pkt; + PNDIS_BUFFER pNdisBuffer; + NDIS_HANDLE PacketPool; + + if (num < 0) + goto drop_pkt; + + //debug print +#if 0 + DbgPrint("reinject %s\n", m->direction == OUTGOING ? + "outgoing" : "incoming"); +#endif + NdisAcquireSpinLock(&pAdapt->Lock); + if (m->direction == OUTGOING) { + //we must first check if the adapter is going down, + // in this case abort the reinjection + if (pAdapt->PTDeviceState > NdisDeviceStateD0) { + pAdapt->OutstandingSends--; + // XXX should we notify up ? 
+ NdisReleaseSpinLock(&pAdapt->Lock); + goto drop_pkt; + } + } else { + /* if the upper miniport edge is not initialized or + * the miniport edge is in low power state, abort + * XXX we should notify the error. + */ + if (!pAdapt->MiniportHandle || + pAdapt->MPDeviceState > NdisDeviceStateD0) { + NdisReleaseSpinLock(&pAdapt->Lock); + goto drop_pkt; + } + } + NdisReleaseSpinLock(&pAdapt->Lock); + + if (m->direction == OUTGOING) { + PSEND_RSVD SendRsvd; + /* use the 8-bytes protocol reserved area, the first + * field is used to mark/the packet as 'orphan', the + * second stores the pointer to the mbuf, so in the + * the SendComplete handler we know that this is a + * reinjected packet and can free correctly. + */ + SendRsvd = (PSEND_RSVD)(pPacket->ProtocolReserved); + SendRsvd->OriginalPkt = NULL; + SendRsvd->pMbuf = m; + //do the actual send + NdisSend(&Status, pAdapt->BindingHandle, pPacket); + if (Status != NDIS_STATUS_PENDING) { + /* done, call the callback now */ + PtSendComplete(m->context, m->pkt, Status); + } + return; /* unconditional return here. */ + } else { + /* There's no need to check the 8-bytes miniport + * reserved area since the path going up will be always + * syncronous, and all the cleanup will be done inline. + * If the reinjected packed comes from a PtReceivePacket, + * there will be no callback. + * Otherwise PtReceiveComplete will be called but will just + * return since all the cleaning is alreqady done */ + // do the actual receive. + ULONG Proc = KeGetCurrentProcessorNumber(); + pAdapt->ReceivedIndicationFlags[Proc] = TRUE; + NdisMEthIndicateReceive(pAdapt->MiniportHandle, NULL, payload, 14, payload+14, m->m_len, m->m_len); + NdisMEthIndicateReceiveComplete(pAdapt->MiniportHandle); + pAdapt->ReceivedIndicationFlags[Proc] = FALSE; + } +drop_pkt: + /* NDIS_PACKET exists and must be freed only if + * the packet come from a PtReceivePacket, oherwise + * m->pkt will ne null. 
+ */ + if (m->pkt != NULL) + { + NdisUnchainBufferAtFront(m->pkt, &pNdisBuffer); + NdisFreeBuffer(pNdisBuffer); + NdisFreePacket(m->pkt); + } + m_freem(m); +} + +void win_freem(void *); /* wrapper for m_freem() for protocol.c */ +void +win_freem(void *_m) +{ + struct mbuf *m = _m; + m_freem(m); +} + +/* + * not implemented in linux. + * taken from /usr/src/lib/libc/string/strlcpy.c + */ +size_t +strlcpy(char *dst, const char *src, size_t siz) +{ + char *d = dst; + const char *s = src; + size_t n = siz; + + /* Copy as many bytes as will fit */ + if (n != 0 && --n != 0) { + do { + if ((*d++ = *s++) == 0) + break; + } while (--n != 0); + } + + /* Not enough room in dst, add NUL and traverse rest of src */ + if (n == 0) { + if (siz != 0) + *d = '\0'; /* NUL-terminate dst */ + while (*s++) + ; + } + + return(s - src - 1); /* count does not include NUL */ +} + +void CleanupReinjected(PNDIS_PACKET Packet, struct mbuf* m, PADAPT pAdapt) +{ + PNDIS_BUFFER pNdisBuffer; + + NdisQueryPacket(Packet, NULL, NULL, &pNdisBuffer, NULL); + NdisUnchainBufferAtFront(Packet, &pNdisBuffer); + NdisFreeBuffer(pNdisBuffer); + win_freem(m); + NdisFreePacket(Packet); + ADAPT_DECR_PENDING_SENDS(pAdapt); +} + +int +ipfw2_qhandler_w32_oldstyle(int direction, + NDIS_HANDLE ProtocolBindingContext, + unsigned char* HeaderBuffer, + unsigned int HeaderBufferSize, + unsigned char* LookAheadBuffer, + unsigned int LookAheadBufferSize, + unsigned int PacketSize) +{ + struct mbuf* m; + unsigned char* payload = NULL; + unsigned short EtherType = 0; + int ret = 0; + + /* We are in a special case when NIC signals an incoming + * packet using old style calls. This is done passing + * a pointer to the MAC header and a pointer to the + * rest of the packet. + * We simply allocate space for the mbuf and the + * subsequent payload section. 
+ */ + m = malloc(sizeof(struct mbuf) + HeaderBufferSize + LookAheadBufferSize, 0, 0 ); + if (m == NULL) //resource shortage, drop the packet + return DROP; + + /* set mbuf fields to point past the MAC header. + * Also set additional W32 info. + * m->pkt here is set to null because the notification + * from the NIC has come with a header+loolahead buffer, + * no NDIS_PACKET has been provided. + */ + payload = (unsigned char*)(m + 1); + m->m_len = m->m_pkthdr.len = HeaderBufferSize+LookAheadBufferSize-14; + m->m_data = payload + 14; /* past the MAC header */ + m->direction = direction; + m->context = ProtocolBindingContext; + m->pkt = NULL; + + /* + * Now copy the data from the Windows buffers to the mbuf. + */ + bcopy(HeaderBuffer, payload, HeaderBufferSize); + bcopy(LookAheadBuffer, payload+HeaderBufferSize, LookAheadBufferSize); + //hexdump(payload,HeaderBufferSize+LookAheadBufferSize,"qhandler"); + /* + * Identify EtherType. If the packet is not IP, simply allow + * and don't bother the firewall. XXX should be done before. + */ + EtherType = *(unsigned short*)(payload + 12); + EtherType = RtlUshortByteSwap(EtherType); + if (EtherType != 0x0800) { + //DbgPrint("ethertype = %X, skipping ipfw\n",EtherType); + free(m, 0); + return PASS; + } + + //DbgPrint("incoming_raw: proto %u (%s), src %08X, dst %08X, sport %u, dport %u, len %u\n", *(payload+14+9), texify_proto(*(payload+14+9)), *(unsigned int*)(payload+14+12), *(unsigned int*)(payload+14+16), ntohs((*((unsigned short int*)(payload+14+20)))), ntohs((*((unsigned short int*)(payload+14+22)))), HeaderBufferSize+LookAheadBufferSize); + + /* Query the firewall */ + ret = ipfw_check_hook(NULL, &m, NULL, PFIL_IN, NULL); + + if (m != NULL) { + /* Accept. Free the mbuf and return PASS. */ + //DbgPrint("accepted\n"); + m_freem(m); + return PASS; + } else if (ret == 0) { + /* dummynet has kept the packet, will reinject later. 
*/ + //DbgPrint("kept by dummynet\n"); + return DUMMYNET; + } else { + /* + * Packet dropped by ipfw or dummynet. Nothing to do as + * FREE_PKT already freed the fake mbuf + */ + //DbgPrint("dropped by dummynet, ret = %i\n", ret); + return DROP; + } +} + +/* forward declaration because those functions are used only here, + * no point to make them visible in passthru/protocol/miniport */ +int do_ipfw_set_ctl(struct sock *sk, int cmd, + void __user *user, unsigned int len); +int do_ipfw_get_ctl(struct sock *sk, int cmd, + void __user *user, int *len); + +NTSTATUS +DevIoControl( + IN PDEVICE_OBJECT pDeviceObject, + IN PIRP pIrp + ) +/*++ + +Routine Description: + + This is the dispatch routine for handling device ioctl requests. + +Arguments: + + pDeviceObject - Pointer to the device object. + + pIrp - Pointer to the request packet. + +Return Value: + + Status is returned. + +--*/ +{ + PIO_STACK_LOCATION pIrpSp; + NTSTATUS NtStatus = STATUS_SUCCESS; + unsigned long BytesReturned = 0; + unsigned long FunctionCode; + unsigned long len; + struct sockopt *sopt; + int ret = 0; + + UNREFERENCED_PARAMETER(pDeviceObject); + + pIrpSp = IoGetCurrentIrpStackLocation(pIrp); + + /* + * Using METHOD_BUFFERED as communication method, the userland + * side calls DeviceIoControl passing an input buffer and an output + * and their respective length (ipfw uses the same length for both). + * The system creates a single I/O buffer, with len=max(inlen,outlen). + * In the kernel we can read information from this buffer (which is + * directly accessible), overwrite it with our results, and set + * IoStatus.Information with the number of bytes that the system must + * copy back to userland. + * In our sockopt emulation, the initial part of the buffer contains + * a struct sockopt, followed by the data area. 
+ */ + + len = pIrpSp->Parameters.DeviceIoControl.InputBufferLength; + if (len < sizeof(struct sockopt)) + { + return STATUS_NOT_SUPPORTED; // XXX find better value + } + sopt = pIrp->AssociatedIrp.SystemBuffer; + + FunctionCode = pIrpSp->Parameters.DeviceIoControl.IoControlCode; + + len = sopt->sopt_valsize; + + switch (FunctionCode) + { + case IP_FW_SETSOCKOPT: + ret = do_ipfw_set_ctl(NULL, sopt->sopt_name, sopt+1, len); + break; + + case IP_FW_GETSOCKOPT: + ret = do_ipfw_get_ctl(NULL, sopt->sopt_name, sopt+1, &len); + sopt->sopt_valsize = len; + //sanity check on len + if (len + sizeof(struct sockopt) <= pIrpSp->Parameters.DeviceIoControl.InputBufferLength) + BytesReturned = len + sizeof(struct sockopt); + else + BytesReturned = pIrpSp->Parameters.DeviceIoControl.InputBufferLength; + break; + + default: + NtStatus = STATUS_NOT_SUPPORTED; + break; + } + + pIrp->IoStatus.Information = BytesReturned; + pIrp->IoStatus.Status = NtStatus; + IoCompleteRequest(pIrp, IO_NO_INCREMENT); + + return NtStatus; +} + +void dummynet(void * unused); +void ipfw_tick(void * vnetx); + +VOID dummynet_dpc( + __in struct _KDPC *Dpc, + __in_opt PVOID DeferredContext, + __in_opt PVOID SystemArgument1, + __in_opt PVOID SystemArgument2 + ) +{ + dummynet(NULL); +} + +VOID ipfw_dpc( + __in struct _KDPC *Dpc, + __in_opt PVOID DeferredContext, + __in_opt PVOID SystemArgument1, + __in_opt PVOID SystemArgument2 + ) +{ + ipfw_tick(DeferredContext); +} diff --git a/kipfw/missing.h b/kipfw/missing.h new file mode 100644 index 0000000..237c1dc --- /dev/null +++ b/kipfw/missing.h @@ -0,0 +1,645 @@ +/* + * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: missing.h 12256 2013-04-26 21:12:44Z luigi $ + * + * Header for kernel variables and functions that are not available in + * userland. 
+ */ + +#ifndef _MISSING_H_ +#define _MISSING_H_ + +#include +#ifdef linux +#include +#include +#include +#endif /* linux */ + +/* portability features, to be set before the rest: */ +#define HAVE_NET_IPLEN /* iplen/ipoff in net format */ +#define WITHOUT_BPF /* do not use bpf logging */ + +#ifdef _WIN32 + +#ifndef DEFINE_SPINLOCK +#define DEFINE_SPINLOCK(x) FAST_MUTEX x +#endif +/* spinlock --> Guarded Mutex KGUARDED_MUTEX */ +/* http://www.reactos.org/wiki/index.php/Guarded_Mutex */ +#define spin_lock_init(_l) +#define spin_lock_bh(_l) +#define spin_unlock_bh(_l) + +#include /* bsd-compat.c */ +#include /* bsd-compat.c */ +#include /* local version */ +#define INADDR_TO_IFP(a, b) b = NULL + +#else /* __linux__ */ + +#define MALLOC_DECLARE(x) /* nothing */ +#include /* do_gettimeofday */ +#include /* local version */ +struct inpcb; + +/* + * Kernel locking support. + * FreeBSD uses mtx in dummynet.c and struct rwlock ip_fw2.c + * + * In linux we use spinlock_bh to implement both. + * For 'struct rwlock' we need an #ifdef to change it to spinlock_t + */ + +#ifndef DEFINE_SPINLOCK /* this is for linux 2.4 */ +#define DEFINE_SPINLOCK(x) spinlock_t x = SPIN_LOCK_UNLOCKED +#endif + + +#define rw_assert(a, b) +#define rw_destroy(_l) +#define rw_init(_l, msg) spin_lock_init(_l) +#define rw_rlock(_l) spin_lock_bh(_l) +#define rw_runlock(_l) spin_unlock_bh(_l) +#define rw_wlock(_l) spin_lock_bh(_l) +#define rw_wunlock(_l) spin_unlock_bh(_l) +#define rw_init_flags(_l, s, v) + +#define mtx_assert(a, b) +#define mtx_destroy(m) +#define mtx_init(m, a,b,c) spin_lock_init(m) +#define mtx_lock(_l) spin_lock_bh(_l) +#define mtx_unlock(_l) spin_unlock_bh(_l) + +#endif /* __linux__ */ +/* end of locking support */ + +/* + * Reference to an ipfw rule that can be carried outside critical sections. + * A rule is identified by rulenum:rule_id which is ordered. + * In version chain_id the rule can be found in slot 'slot', so + * we don't need a lookup if chain_id == chain->id. 
+ * + * On exit from the firewall this structure refers to the rule after + * the matching one (slot points to the new rule; rulenum:rule_id-1 + * is the matching rule), and additional info (e.g. info often contains + * the insn argument or tablearg in the low 16 bits, in host format). + * On entry, the structure is valid if slot>0, and refers to the starting + * rules. 'info' contains the reason for reinject, e.g. divert port, + * divert direction, and so on. + */ +struct ipfw_rule_ref { + uint32_t slot; /* slot for matching rule */ + uint32_t rulenum; /* matching rule number */ + uint32_t rule_id; /* matching rule id */ + uint32_t chain_id; /* ruleset id */ + uint32_t info; /* see below */ +}; + +enum { + IPFW_INFO_MASK = 0x0000ffff, + IPFW_INFO_OUT = 0x00000000, /* outgoing, just for convenience */ + IPFW_INFO_IN = 0x80000000, /* incoming, overloads dir */ + IPFW_ONEPASS = 0x40000000, /* One-pass, do not reinject */ + IPFW_IS_MASK = 0x30000000, /* which source ? */ + IPFW_IS_DIVERT = 0x20000000, + IPFW_IS_DUMMYNET =0x10000000, + IPFW_IS_PIPE = 0x08000000, /* pipe=1, queue = 0 */ +}; + +/* in netinet/in.h */ +#define in_nullhost(x) ((x).s_addr == INADDR_ANY) + +/* bzero not present on linux, but this should go in glue.h */ +#define bzero(s, n) memset(s, 0, n) +#define bcmp(p1, p2, n) memcmp(p1, p2, n) + +/* ethernet stuff */ +#define ETHERTYPE_IP 0x0800 /* IP protocol */ +//#define ETHER_ADDR_LEN 6 /* length of an Ethernet address */ +struct ether_header { + u_char ether_dhost[ETHER_ADDR_LEN]; + u_char ether_shost[ETHER_ADDR_LEN]; + u_short ether_type; +}; + +#define ETHER_TYPE_LEN 2 /* length of the Ethernet type field */ +#define ETHER_HDR_LEN (ETHER_ADDR_LEN*2+ETHER_TYPE_LEN) + +/* + * Historically, BSD keeps ip_len and ip_off in host format + * when doing layer 3 processing, and this often requires + * to translate the format back and forth. 
+ * To make the process explicit, we define a couple of macros + * that also take into account the fact that at some point + * we may want to keep those fields always in net format. + */ + +#if (BYTE_ORDER == BIG_ENDIAN) || defined(HAVE_NET_IPLEN) +#define SET_NET_IPLEN(p) do {} while (0) +#define SET_HOST_IPLEN(p) do {} while (0) +#else /* never on linux */ +#define SET_NET_IPLEN(p) do { \ + struct ip *h_ip = (p); \ + h_ip->ip_len = htons(h_ip->ip_len); \ + h_ip->ip_off = htons(h_ip->ip_off); \ + } while (0) + +#define SET_HOST_IPLEN(p) do { \ + struct ip *h_ip = (p); \ + h_ip->ip_len = ntohs(h_ip->ip_len); \ + h_ip->ip_off = ntohs(h_ip->ip_off); \ + } while (0) +#endif /* !HAVE_NET_IPLEN */ + +/* ip_dummynet.c */ +#define __FreeBSD_version 500035 + +#ifdef __linux__ +struct moduledata; +int my_mod_register(const char *name, + int order, struct moduledata *mod, void *init, void *uninit); + +/* define some macro for ip_dummynet */ + +struct malloc_type { +}; + +#define MALLOC_DEFINE(type, shortdesc, longdesc) \ + struct malloc_type type[1]; void *md_dummy_ ## type = type + +#define CTASSERT(x) + +/* log... does not use the first argument */ +#define LOG_ERR 0x100 +#define LOG_INFO 0x200 +#define log(_level, fmt, arg...) 
do { \ + int _qwerty=_level;(void)_qwerty; printk(KERN_ERR fmt, ##arg); } while (0) + +/* + * gettimeofday would be in sys/time.h but it is not + * visible if _KERNEL is defined + */ +int gettimeofday(struct timeval *, struct timezone *); + +#else /* _WIN32 */ +#define MALLOC_DEFINE(a,b,c) +#endif /* _WIN32 */ + +extern int hz; +extern long tick; /* exists in 2.4 but not in 2.6 */ +extern int bootverbose; +extern struct timeval boottime; + +/* The time_uptime a FreeBSD variable increased each second */ +#ifdef __linux__ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,37) /* revise boundaries */ +#define time_uptime get_seconds() +#else /* OpenWRT */ +#define time_uptime CURRENT_TIME +#endif +#else /* WIN32 */ +#define time_uptime time_uptime_w32() +#endif + +extern int max_linkhdr; +extern int ip_defttl; +extern u_long in_ifaddrhmask; /* mask for hash table */ +extern struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ + +/*-------------------------------------------------*/ + +/* define, includes and functions missing in linux */ +/* include and define */ +#include /* inet_ntoa */ + +struct mbuf; + +/* used by ip_dummynet.c */ +void reinject_drop(struct mbuf* m); + +#include /* error define */ +#include /* IFNAMESIZ */ + +void rn_init(int); +/* + * some network structure can be defined in the bsd way + * by using the _FAVOR_BSD definition. This is not true + * for icmp structure. + * XXX struct icmp contains bsd names in + * /usr/include/netinet/ip_icmp.h + */ +#ifdef __linux__ +#define icmp_code code +#define icmp_type type + +/* linux in6_addr has no member __u6_addr + * replace the whole structure ? 
+ */ +#define __u6_addr in6_u +#define __u6_addr32 u6_addr32 +#endif /* __linux__ */ + +/* defined in linux/sctp.h with no bsd definition */ +struct sctphdr { + uint16_t src_port; /* source port */ + uint16_t dest_port; /* destination port */ + uint32_t v_tag; /* verification tag of packet */ + uint32_t checksum; /* Adler32 C-Sum */ + /* chunks follow... */ +}; + +/* missing definition */ +#define TH_FIN 0x01 +#define TH_SYN 0x02 +#define TH_RST 0x04 +#define TH_ACK 0x10 + +#define RTF_CLONING 0x100 /* generate new routes on use */ + +#define IPPROTO_OSPFIGP 89 /* OSPFIGP */ +#define IPPROTO_CARP 112 /* CARP */ +#ifndef _WIN32 +#define IPPROTO_IPV4 IPPROTO_IPIP /* for compatibility */ +#endif + +#define CARP_VERSION 2 +#define CARP_ADVERTISEMENT 0x01 + +#define PRIV_NETINET_IPFW 491 /* Administer IPFW firewall. */ + +#define IP_FORWARDING 0x1 /* most of ip header exists */ + +#define NETISR_IP 2 /* same as AF_INET */ + +#define PRIV_NETINET_DUMMYNET 494 /* Administer DUMMYNET. */ + +extern int securelevel; + +struct carp_header { +#if BYTE_ORDER == LITTLE_ENDIAN + u_int8_t carp_type:4, + carp_version:4; +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_int8_t carp_version:4, + carp_type:4; +#endif +}; + +struct pim { + int dummy; /* windows compiler does not like empty definition */ +}; + +#ifndef _WIN32 +struct route { + struct rtentry *ro_rt; + struct sockaddr ro_dst; +}; +#endif + +struct ifaltq { + void *ifq_head; +}; + +/* + * ifnet->if_snd is used in ip_dummynet.c to take the transmission + * clock. 
+ */ +#if defined( __linux__) +#define if_xname name +#define if_snd XXX +/* search local the ip addresses, used for the "me" keyword */ +#include + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) +#define INADDR_TO_IFP(ip, b) \ + b = ip_dev_find(ip.s_addr) +#else +#define INADDR_TO_IFP(ip, b) \ + b = ip_dev_find((struct net *)&init_net, ip.s_addr) +#endif + +#elif defined( _WIN32 ) +/* used in ip_dummynet.c */ +struct ifnet { + char if_xname[IFNAMSIZ]; /* external name (name + unit) */ +// struct ifaltq if_snd; /* output queue (includes altq) */ +}; + +struct net_device { + char if_xname[IFNAMSIZ]; /* external name (name + unit) */ +}; +#endif + +/* involves mbufs */ +int in_cksum(struct mbuf *m, int len); +#define divert_cookie(mtag) 0 +#define divert_info(mtag) 0 +#define pf_find_mtag(a) NULL +#define pf_get_mtag(a) NULL +#ifndef _WIN32 +#define AF_LINK AF_ASH /* ? our sys/socket.h */ +#endif + +/* we don't pullup, either success or free and fail */ +#define m_pullup(m, x) \ + ((m)->m_len >= x ? 
(m) : (FREE_PKT(m), NULL)) + +struct pf_mtag { + void *hdr; /* saved hdr pos in mbuf, for ECN */ + sa_family_t af; /* for ECN */ + u_int32_t qid; /* queue id */ +}; + +#if 0 // ndef radix +/* radix stuff in radix.h and radix.c */ +struct radix_node { + caddr_t rn_key; /* object of search */ + caddr_t rn_mask; /* netmask, if present */ +}; +#endif /* !radix */ + +/* missing kernel functions */ +char *inet_ntoa(struct in_addr ina); +int random(void); + +/* + * Return the result of a/b + * + * this is used in linux kernel space, + * since the 64bit division needs to + * be done using a macro + */ +int64_t +div64(int64_t a, int64_t b); + +char * +inet_ntoa_r(struct in_addr ina, char *buf); + +/* from bsd sys/queue.h */ +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = TAILQ_FIRST((head)); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define SLIST_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = SLIST_FIRST((head)); \ + (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +/* depending on linux version */ +#ifndef ETHERTYPE_IPV6 +#define ETHERTYPE_IPV6 0x86dd /* IP protocol version 6 */ +#endif + +/*-------------------------------------------------*/ +#define RT_NUMFIBS 1 +extern u_int rt_numfibs; + +/* involves kernel locking function; the replacement only logs, and has no trailing ';' so that "RTFREE(x);" stays a single statement */ +#ifdef RTFREE +#undef RTFREE +#define RTFREE(a) fprintf(stderr, "RTFREE: commented out locks\n") +#endif + +void getmicrouptime(struct timeval *tv); + +/* from sys/netinet/ip_output.c */ +struct ip_moptions; +struct route; +struct ip; + +struct mbuf *ip_reass(struct mbuf *); +u_short in_cksum_hdr(struct ip *); +int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, + struct ip_moptions *imo, struct inpcb *inp); + +/* from net/netisr.c */ +void netisr_dispatch(int num, struct mbuf *m); + +/* definition moved in missing.c */ +int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len); + +int sooptcopyin(struct sockopt *sopt, void
*buf, size_t len, size_t minlen); + +/* defined in session.c */ +int priv_check(struct thread *td, int priv); + +/* struct ucred is in linux/socket.h and has pid, uid, gid. + * We need a 'bsd_ucred' to also store the extra info + */ + +struct bsd_ucred { + uid_t uid; + gid_t gid; + uint32_t xid; + uint32_t nid; +}; + +int +cred_check(void *insn, int proto, struct ifnet *oif, + struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, + u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp, + struct sk_buff *skb); + +int securelevel_ge(struct ucred *cr, int level); + +struct sysctl_oid; +struct sysctl_req; + +#ifdef _WIN32 +#define module_param_named(_name, _var, _ty, _perm) +#else /* !_WIN32 */ + +/* Linux 2.4 is mostly for openwrt */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) +#include /* generic_ffs() used in ip_fw2.c */ +typedef uint32_t __be32; +typedef uint16_t __be16; +struct sock; +struct net; +struct inet_hashinfo; +struct sock *inet_lookup( + struct inet_hashinfo *hashinfo, + const __be32 saddr, const __be16 sport, + const __be32 daddr, const __be16 dport, + const int dif); +struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif); +#endif /* Linux < 2.6 */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) && \ + LINUX_VERSION_CODE > KERNEL_VERSION(2,6,16) /* XXX NOT sure, in 2.6.9 it gives an error */ +#define module_param_named(_name, _var, _ty, _perm) \ + //module_param(_name, _ty, 0644) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) +typedef unsigned long uintptr_t; + +#ifdef __i386__ +static inline unsigned long __fls(unsigned long word) +{ + asm("bsr %1,%0" + : "=r" (word) + : "rm" (word)); + return word; +} +#endif + +#endif /* LINUX < 2.6.24 */ + +#endif /* !_WIN32 so maybe __linux__ */ + +#if defined (__linux__) && !defined (EMULATE_SYSCTL) +#define SYSCTL_DECL(_1) +#define SYSCTL_OID(_1, _2, _3, _4, _5, _6, _7, _8) +#define SYSCTL_NODE(_1, _2, _3, _4, _5, _6) +#define _SYSCTL_BASE(_name, _var,
_ty, _perm) \ + module_param_named(_name, *(_var), _ty, \ + ( (_perm) == CTLFLAG_RD) ? 0444: 0644 ) +#define SYSCTL_PROC(_base, _oid, _name, _mode, _var, _val, _desc, _a, _b) + +#define SYSCTL_INT(_base, _oid, _name, _mode, _var, _val, _desc) \ + _SYSCTL_BASE(_name, _var, int, _mode) + +#define SYSCTL_LONG(_base, _oid, _name, _mode, _var, _val, _desc) \ + _SYSCTL_BASE(_name, _var, long, _mode) + +#define SYSCTL_ULONG(_base, _oid, _name, _mode, _var, _val, _desc) \ + _SYSCTL_BASE(_name, _var, ulong, _mode) + +#define SYSCTL_UINT(_base, _oid, _name, _mode, _var, _val, _desc) \ + _SYSCTL_BASE(_name, _var, uint, _mode) + +#define TUNABLE_INT(_name, _ptr) + +#define SYSCTL_VNET_PROC SYSCTL_PROC +#define SYSCTL_VNET_INT SYSCTL_INT +#define SYSCTL_VNET_UINT SYSCTL_UINT + +#endif + +#define SYSCTL_HANDLER_ARGS \ + struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req +int sysctl_handle_int(SYSCTL_HANDLER_ARGS); +int sysctl_handle_long(SYSCTL_HANDLER_ARGS); + + +void ether_demux(struct ifnet *ifp, struct mbuf *m); + +int ether_output_frame(struct ifnet *ifp, struct mbuf *m); + +void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum); + +void icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu); + +void rtfree(struct rtentry *rt); + +u_short in_cksum_skip(struct mbuf *m, int len, int skip); + +#ifdef INP_LOCK_ASSERT +#undef INP_LOCK_ASSERT +#define INP_LOCK_ASSERT(a) +#endif + +int jailed(struct ucred *cred); + +/* +* Return 1 if an internet address is for a ``local'' host +* (one to which we have a connection). If subnetsarelocal +* is true, this includes other subnets of the local net. +* Otherwise, it includes only the directly-connected (sub)nets. 
+*/ +int in_localaddr(struct in_addr in); + +/* the prototype is already in the headers */ +//int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); + +int fnmatch(const char *pattern, const char *string, int flags); + +int +linux_lookup(const int proto, const __be32 saddr, const __be16 sport, + const __be32 daddr, const __be16 dport, + struct sk_buff *skb, int dir, struct bsd_ucred *u); + +/* vnet wrappers, in vnet.h and ip_var.h */ +//int ipfw_init(void); +//void ipfw_destroy(void); + +#define MTAG_IPFW 1148380143 /* IPFW-tagged cookie */ +#define MTAG_IPFW_RULE 1262273568 /* rule reference */ + +struct ip_fw_args; +extern int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa); + +#define curvnet NULL +#define CURVNET_SET(_v) +#define CURVNET_RESTORE() +#define VNET_ASSERT(condition) + +#define VNET_NAME(n) n +#define VNET_DECLARE(t, n) extern t n +#define VNET_DEFINE(t, n) t n +#define _VNET_PTR(b, n) &VNET_NAME(n) +/* + * Virtualized global variable accessor macros. + */ +#define VNET_VNET_PTR(vnet, n) (&(n)) +#define VNET_VNET(vnet, n) (n) + +#define VNET_PTR(n) (&(n)) +#define VNET(n) (n) + +VNET_DECLARE(int, ip_defttl); +#define V_ip_defttl VNET(ip_defttl) /* no trailing ';': must be usable inside expressions */ + +int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, + int dir, struct inpcb *inp); + +/* hooks for divert */ +extern void (*ip_divert_ptr)(struct mbuf *m, int incoming); + +extern int (*ip_dn_ctl_ptr)(struct sockopt *); +typedef int ip_fw_ctl_t(struct sockopt *); +extern ip_fw_ctl_t *ip_fw_ctl_ptr; + +/* netgraph prototypes */ +typedef int ng_ipfw_input_t(struct mbuf **, int, struct ip_fw_args *, int); +extern ng_ipfw_input_t *ng_ipfw_input_p; + +/* For kernel ipfw_ether and ipfw_bridge.
*/ +struct ip_fw_args; +typedef int ip_fw_chk_t(struct ip_fw_args *args); +extern ip_fw_chk_t *ip_fw_chk_ptr; + +#define V_ip_fw_chk_ptr VNET(ip_fw_chk_ptr) +#define V_ip_fw_ctl_ptr VNET(ip_fw_ctl_ptr) +#define V_tcbinfo VNET(tcbinfo) +#define V_udbinfo VNET(udbinfo) + +#endif /* !_MISSING_H_ */ diff --git a/kipfw/mysetenv.sh b/kipfw/mysetenv.sh new file mode 100644 index 0000000..baea772 --- /dev/null +++ b/kipfw/mysetenv.sh @@ -0,0 +1,129 @@ +#!/bin/bash + +# bash script to set a suitable environment to call MSVC's build +# to build a 64-bit version of the kernel. +# +# inspired by C:/winddk/7600.16385.1/bin/setenv.bat +# see http://www.osronline.com/ddkx/ddtools/build_ref_0kqb.htm + +############################################################# +# edit these variables to meet your configuration # +# - DRIVE is the hard drive letter where DDK is installed # +# - DDK is the path to the DDK's root directory # +# - CYGDDK is the complete cygwin path to DDK # +############################################################# +if [ $# -ne 3 ]; then +echo "invalid params" && exit 1 +fi +DRIVE=$1 +DDK=$2 +CYGDDK=/cygdrive/$(echo "${DRIVE%:}" | tr 'A-Z' 'a-z')/${DDK} +TARGETOS=$3 +MYDIR=`pwd` # XXX luigi + +if [ "$TARGETOS" = "wnet" ]; then +export DDK_TARGET_OS=WinNET +export _NT_TARGET_VERSION=0x502 +fi + +if [ "$TARGETOS" = "wlh" ]; then +export DDK_TARGET_OS=WinLH +export _NT_TARGET_VERSION=0x600 +fi + +if [ "$TARGETOS" = "win7" ]; then +export DDK_TARGET_OS=Win7 +export _NT_TARGET_VERSION=0x601 +fi + + +############################################################# +# don't edit anything else below this point # +############################################################# + +D=${DRIVE}${DDK} +DB=${D}/bin +DI=${D}/inc +DL=${D}/lib + + +export AMD64=1 +export ATL_INC_PATH=$DI # defaults to DDKROOT/inc +export ATL_INC_ROOT=$DI # XXX redundant ?
+export ATL_LIB_PATH=${DL}/atl/* +export BASEDIR=$D # default +export BUFFER_OVERFLOW_CHECKS=1 +export BUILD_ALLOW_COMPILER_WARNINGS=1 +export BUILD_ALT_DIR=chk_${TARGETOS}_AMD64 +export BUILD_DEFAULT="-ei -nmake -i -nosqm" # can go on the command line +export BUILD_DEFAULT_TARGETS="-amd64" # can also go on the command line +export BUILD_MAKE_PROGRAM=nmake.exe # default to nmake +export BUILD_MULTIPROCESSOR=1 # parallel make, same as -M +export BUILD_OPTIONS=" ~imca ~toastpkg" +export COFFBASE_TXT_FILE=${DB}/coffbase.txt +export CPU=AMD64 +export CRT_INC_PATH=${DI}/crt # default +export CRT_LIB_PATH=${DL}/crt/* # not default, it seems uses lib/{wnet,win7}/* +export DDKBUILDENV=chk # checked or free +export DDK_INC_PATH=${DI}/ddk +export DDK_LIB_DEST=${DL}/${TARGETOS} +export DDK_LIB_PATH=${DL}/${TARGETOS}/* +export DEPRECATE_DDK_FUNCTIONS=1 +export DRIVER_INC_PATH=${DI}/ddk +export HALKIT_INC_PATH=${DI}/ddk +export HALKIT_LIB_PATH=${DL}/${TARGETOS}/* +export IFSKIT_INC_PATH=${DI}/ddk +export IFSKIT_LIB_DEST=${DL}/${TARGETOS} +export IFSKIT_LIB_PATH=${DL}/${TARGETOS}/* +export Include=${DI}/api +export KMDF_INC_PATH=${DI}/wdf/kmdf +export KMDF_LIB_PATH=${DL}/wdf/kmdf/* +export LANGUAGE_NEUTRAL=0 +export Lib=${DL} +export LINK_LIB_IGNORE=4198 +export MFC_INC_PATH=${DI}/mfc42 +export MFC_LIB_PATH=${DL}/mfc/* +export MSC_OPTIMIZATION="/Od /Oi" +export NEW_CRTS=1 +export NO_BINPLACE=TRUE +export NO_BROWSER_FILE=TRUE +export NTDBGFILES=1 +export NTDEBUG=ntsd +export NTDEBUGTYPE=both +# need NTMAKEENV to point to the binary dir +export NTMAKEENV=${DB} +export OAK_INC_PATH=${DI}/api + +export PATH="${CYGDDK}/bin/amd64:${CYGDDK}/tools/sdv/bin:${CYGDDK}/tools/pfd/bin/bin/x86_AMD64\ +:${CYGDDK}/bin/SelfSign:${CYGDDK}/bin/x86/amd64:${CYGDDK}/bin/x86\ +:${CYGDDK}/tools/pfd/bin/bin/AMD64:${CYGDDK}/tools/tracing/amd64:$PATH" + +export PATHEXT=".COM;.EXE;.BAT;.CMD;.VBS;.VBE;.JS;.JSE;.WSF;.WSH;.MSC" +export PROJECT_ROOT=${D}/src +export PUBLIC_ROOT=${D} +export RAZZLETOOLPATH=${DB} 
+export RCNOFONTMAP=1 +export SDK_INC_PATH=${DI}/api +export SDK_LIB_DEST=${DL}/${TARGETOS} +export SDK_LIB_PATH=${DL}/${TARGETOS}/* +export SDV=${D}/tools/sdv +export separate_object_root=FALSE +export TEMP=tmpbuild +export TMP=tmpbuild +export UMDF_INC_PATH=${DI}/wdf/umdf +export USE_OBJECT_ROOT=1 +export WDM_INC_PATH=${DI}/ddk +export WPP_CONFIG_PATH=${DB}/wppconfig +export _AMD64bit=true +export _BUILDARCH=AMD64 +export _BuildType=chk +export _NTDRIVE=${DRIVE} +export _NTROOT=${DDK} +# +# --- XXX note, it spams C:/winddk/7600.16385.1/build.dat +# -c: delete objs, -e: generate build.* logfiles, -f rescan sources, -g color errors +unset MAKEFLAGS +echo "emv ${MAKE} flags ${MAKEFLAGS}" +cd kipfw-mod && build -cefg +echo "done" +#cp objchk_${TARGETOS}_amd64/amd64/ipfw.sys ../binary/ipfw.sys diff --git a/kipfw/netipfw.inf b/kipfw/netipfw.inf new file mode 100644 index 0000000..5dee2c8 --- /dev/null +++ b/kipfw/netipfw.inf @@ -0,0 +1,81 @@ +; version section +[Version] +Signature = "$Windows NT$" +Class = NetService +ClassGUID = {4D36E974-E325-11CE-BFC1-08002BE10318} +Provider = %Unipi% +DriverVer = 08/12/2012,3.0.1.1 + +; manufacturer section +[Manufacturer] +%Unipi% = UNIPI,NTx86,NTamd64 + +; control flags section +; optional, unused in netipfw.inf, used in netipfw_m.inf +[ControlFlags] + +; models section +[UNIPI] ; Win2k +%Desc% = Ipfw.ndi, unipi_ipfw +[UNIPI.NTx86] ;For WinXP and later +%Desc% = Ipfw.ndi, unipi_ipfw +[UNIPI.NTamd64] ;For x64 +%Desc% = Ipfw.ndi, unipi_ipfw + +; ddinstall section +[Ipfw.ndi] +AddReg = Ipfw.ndi.AddReg, Ipfw.AddReg +Characteristics = 0x4410 ; NCF_FILTER | NCF_NDIS_PROTOCOL !--Filter Specific--!!
+CopyFiles = Ipfw.Files.Sys +CopyInf = netipfw_m.inf + +; remove section +[Ipfw.ndi.Remove] +DelFiles = Ipfw.Files.Sys + +;ddinstall.services section +[Ipfw.ndi.Services] +AddService = Ipfw,,Ipfw.AddService + +[Ipfw.AddService] +DisplayName = %ServiceDesc% +ServiceType = 1 ;SERVICE_KERNEL_DRIVER +StartType = 3 ;SERVICE_DEMAND_START +ErrorControl = 1 ;SERVICE_ERROR_NORMAL +ServiceBinary = %12%\ipfw.sys +AddReg = Ipfw.AddService.AddReg + +[Ipfw.AddService.AddReg] + +;file copy related sections +[SourceDisksNames] +1=%DiskDescription%,"",, + +[SourceDisksFiles] +ipfw.sys=1 + +[DestinationDirs] +DefaultDestDir = 12 +Ipfw.Files.Sys = 12 ; %windir%\System32\drivers + +; ddinstall->copyfiles points here +[Ipfw.Files.Sys] +ipfw.sys,,,2 + +; ddinstall->addreg points here +[Ipfw.ndi.AddReg] +HKR, Ndi, HelpText, , %HELP% ; this is displayed at the bottom of the General page of the Connection Properties dialog box +HKR, Ndi, FilterClass, , failover +HKR, Ndi, FilterDeviceInfId, , unipi_ipfwmp +HKR, Ndi, Service, , Ipfw +HKR, Ndi\Interfaces, UpperRange, , noupper +HKR, Ndi\Interfaces, LowerRange, , nolower +HKR, Ndi\Interfaces, FilterMediaTypes, , "ethernet, tokenring, fddi, wan" + +;strings section +[Strings] +Unipi = "Unipi" +DiskDescription = "Ipfw Driver Disk" +Desc = "ipfw+dummynet" +HELP = "This is ipfw and dummynet network emulator, developed by unipi.it" +ServiceDesc = "ipfw service" diff --git a/kipfw/netipfw_m.inf b/kipfw/netipfw_m.inf new file mode 100644 index 0000000..a299b12 --- /dev/null +++ b/kipfw/netipfw_m.inf @@ -0,0 +1,56 @@ +; version section +[Version] +Signature = "$Windows NT$" +Class = Net +ClassGUID = {4D36E972-E325-11CE-BFC1-08002BE10318} +Provider = %Unipi% +DriverVer = 08/12/2012,3.0.1.1 + +; control flags section +; optional, unused in netipfw.inf inf, used in netipfw_m.inf +[ControlFlags] +ExcludeFromSelect = unipi_ipfwmp + +; destinationdirs section, optional +[DestinationDirs] +DefaultDestDir=12 +; No files to copy + +; manufacturer section 
+[Manufacturer] +%Unipi% = UNIPI,NTx86,NTamd64 + +; models section +[UNIPI] ; Win2k +%Desc% = IpfwMP.ndi, unipi_ipfwmp +[UNIPI.NTx86] ;For WinXP and later +%Desc% = IpfwMP.ndi, unipi_ipfwmp +[UNIPI.NTamd64] ;For x64 +%Desc% = IpfwMP.ndi, unipi_ipfwmp + +; ddinstall section +[IpfwMP.ndi] +AddReg = IpfwMP.ndi.AddReg +Characteristics = 0x29 ;NCF_NOT_USER_REMOVABLE | NCF_VIRTUAL | NCF_HIDDEN + +; ddinstall->addreg points here +[IpfwMP.ndi.AddReg] +HKR, Ndi, Service, 0, IpfwMP + +;ddinstall.services section +[IpfwMP.ndi.Services] +AddService = IpfwMP,0x2, IpfwMP.AddService + +[IpfwMP.AddService] +ServiceType = 1 ;SERVICE_KERNEL_DRIVER +StartType = 3 ;SERVICE_DEMAND_START +ErrorControl = 1 ;SERVICE_ERROR_NORMAL +ServiceBinary = %12%\ipfw.sys +AddReg = IpfwMP.AddService.AddReg + +[IpfwMP.AddService.AddReg] +; None + +[Strings] +Unipi = "Unipi" +Desc = "Ipfw Miniport" diff --git a/kipfw/sources b/kipfw/sources new file mode 100644 index 0000000..9481e75 --- /dev/null +++ b/kipfw/sources @@ -0,0 +1,20 @@ +TARGETNAME=ipfw +TARGETTYPE=DRIVER + +C_DEFINES=$(C_DEFINES) -DNDIS_MINIPORT_DRIVER -DNDIS_WDM=1 + +MSC_WARNING_LEVEL=/W2 + +# The driver is built in the XP or .NET build environment +# So let us build NDIS 5.1 version. 
+C_DEFINES=$(C_DEFINES) -DNDIS51_MINIPORT=1 +C_DEFINES=$(C_DEFINES) -DNDIS51=1 + +# Enable dummynet preprocessing macros +C_DEFINES=$(C_DEFINES) /D_WIN32 /DMODULENAME=Ipfw /D_BSD_SOURCE /DKERNEL_MODULE /D_KERNEL /DKLD_MODULE /D__BSD_VISIBLE /DIPFIREWALL_DEFAULT_TO_ACCEPT /D__LITTLE_ENDIAN /DSYSCTL_NODE /DEMULATE_SYSCTL -FIwinmissing.h -FImissing.h -FI../glue.h /DWIN32_LEAN_AND_MEAN=1 + +TARGETLIBS=$(DDK_LIB_PATH)\ndis.lib + +INCLUDES= include_e ; ../sys + +SOURCES= ip_fw2.c ip_fw_pfil.c ip_fw_sockopt.c ip_fw_dynamic.c ip_fw_table.c ip_fw_log.c radix.c in_cksum.c ip_dummynet.c ip_dn_io.c ip_dn_glue.c dn_heap.c dn_sched_fifo.c dn_sched_wf2q.c dn_sched_rr.c dn_sched_qfq.c dn_sched_prio.c ipfw2_mod.c bsd_compat.c md_win.c miniport.c protocol.c passthru.c debug.c diff --git a/kipfw/win-passthru.diff b/kipfw/win-passthru.diff new file mode 100644 index 0000000..eeb211b --- /dev/null +++ b/kipfw/win-passthru.diff @@ -0,0 +1,251 @@ +diff -ubwrp original_passthru/miniport.c kipfw/miniport.c +--- original_passthru/miniport.c 2012-08-01 14:34:15.096679600 +0200 ++++ kipfw/miniport.c 2012-08-01 14:34:11.377929600 +0200 +@@ -223,6 +223,7 @@ Return Value: + // + // Use NDIS 5.1 packet stacking: + // ++ if (0) // XXX IPFW - make sure we don't go in here + { + PNDIS_PACKET_STACK pStack; + BOOLEAN Remaining; +@@ -347,6 +348,25 @@ Return Value: + MediaSpecificInfo, + MediaSpecificInfoSize); + } ++#if 1 /* IPFW: query the firewall */ ++ /* if dummynet keeps the packet, we mimic success. ++ * otherwise continue as usual. 
++ */ ++ { ++ int ret = ipfw2_qhandler_w32(MyPacket, OUTGOING, ++ MiniportAdapterContext); ++ if (ret != PASS) { ++ if (ret == DROP) ++ return NDIS_STATUS_FAILURE; ++ else { //dummynet kept the packet ++#ifndef WIN9X ++ NdisIMCopySendCompletePerPacketInfo (Packet, MyPacket); ++#endif ++ return NDIS_STATUS_SUCCESS; //otherwise simply continue ++ } ++ } ++ } ++#endif /* end of IPFW code */ + + NdisSend(&Status, + pAdapt->BindingHandle, +diff -ubwrp original_passthru/passthru.c kipfw/passthru.c +--- original_passthru/passthru.c 2012-08-01 14:34:15.268554600 +0200 ++++ kipfw/passthru.c 2012-08-01 14:34:11.534179600 +0200 +@@ -47,8 +47,15 @@ NDIS_HANDLE NdisWrapperHandle; + // To support ioctls from user-mode: + // + +-#define LINKNAME_STRING L"\\DosDevices\\Passthru" +-#define NTDEVICE_STRING L"\\Device\\Passthru" ++#define STR2(x) #x ++#define STR(x) STR2(x) ++#define DOSPREFIX "\\DosDevices\\" ++#define NTPREFIX "\\Device\\" ++#define WIDEN2(x) L ## x ++#define WIDEN(x) WIDEN2(x) ++#define LINKNAME_STRING WIDEN(DOSPREFIX) WIDEN(STR(MODULENAME)) ++#define NTDEVICE_STRING WIDEN(NTPREFIX) WIDEN(STR(MODULENAME)) ++#define PROTOCOLNAME_STRING WIDEN(STR(MODULENAME)) + + NDIS_HANDLE NdisDeviceHandle = NULL; + PDEVICE_OBJECT ControlDeviceObject = NULL; +@@ -136,8 +143,8 @@ Return Value: + // Either the Send or the SendPackets handler should be specified. + // If SendPackets handler is specified, SendHandler is ignored + // +- MChars.SendHandler = NULL; // MPSend; +- MChars.SendPacketsHandler = MPSendPackets; ++ MChars.SendHandler = MPSend; // IPFW: use MPSend, not SendPackets ++ MChars.SendPacketsHandler = NULL; + + Status = NdisIMRegisterLayeredMiniport(NdisWrapperHandle, + &MChars, +@@ -165,7 +172,7 @@ Return Value: + // This is needed to ensure that NDIS can correctly determine + // the binding and call us to bind to miniports below. 
+ // +- NdisInitUnicodeString(&Name, L"Passthru"); // Protocol name ++ NdisInitUnicodeString(&Name, PROTOCOLNAME_STRING); // Protocol name + PChars.Name = Name; + PChars.OpenAdapterCompleteHandler = PtOpenAdapterComplete; + PChars.CloseAdapterCompleteHandler = PtCloseAdapterComplete; +@@ -205,6 +212,8 @@ Return Value: + NdisTerminateWrapper(NdisWrapperHandle, NULL); + } + ++ ipfw_module_init(); // IPFW - start the system ++ + return(Status); + } + +@@ -276,7 +285,8 @@ Return Value: + DispatchTable[IRP_MJ_CREATE] = PtDispatch; + DispatchTable[IRP_MJ_CLEANUP] = PtDispatch; + DispatchTable[IRP_MJ_CLOSE] = PtDispatch; +- DispatchTable[IRP_MJ_DEVICE_CONTROL] = PtDispatch; ++ // IPFW we use DevIoControl ? ++ DispatchTable[IRP_MJ_DEVICE_CONTROL] = DevIoControl; + + + NdisInitUnicodeString(&DeviceName, NTDEVICE_STRING); +@@ -453,6 +463,7 @@ PtUnload( + + NdisFreeSpinLock(&GlobalLock); + ++ ipfw_module_exit(); // IPFW unloading dummynet ++ + DBGPRINT(("PtUnload: done!\n")); + } +- +diff -ubwrp original_passthru/passthru.h kipfw/passthru.h +--- original_passthru/passthru.h 2012-08-01 14:34:15.049804600 +0200 ++++ kipfw/passthru.h 2012-08-01 14:34:11.362304600 +0200 +@@ -61,6 +61,13 @@ PtDispatch( + IN PIRP Irp + ); + ++DRIVER_DISPATCH DevIoControl; ++NTSTATUS ++DevIoControl( ++ IN PDEVICE_OBJECT pDeviceObject, ++ IN PIRP pIrp ++ ); ++ + NDIS_STATUS + PtRegisterDevice( + VOID +@@ -366,6 +373,7 @@ PtDereferenceAdapt( + typedef struct _SEND_RSVD + { + PNDIS_PACKET OriginalPkt; ++ struct mbuf* pMbuf; // IPFW extension, reference to the mbuf + } SEND_RSVD, *PSEND_RSVD; + + // +@@ -376,6 +384,7 @@ typedef struct _SEND_RSVD + typedef struct _RECV_RSVD + { + PNDIS_PACKET OriginalPkt; ++ struct mbuf* pMbuf; // IPFW extension, reference to the mbuf + } RECV_RSVD, *PRECV_RSVD; + + C_ASSERT(sizeof(RECV_RSVD) <= sizeof(((PNDIS_PACKET)0)->MiniportReserved)); +@@ -475,3 +484,17 @@ IsIMDeviceStateOn( + */ + #define IsIMDeviceStateOn(_pP) ((_pP)->MPDeviceState == NdisDeviceStateD0 && 
(_pP)->PTDeviceState == NdisDeviceStateD0 ) + ++#include "winmissing.h" ++ ++int ipfw_module_init(void); ++void ipfw_module_exit(void); ++int ipfw2_qhandler_w32(PNDIS_PACKET pNdisPacket, int direction, ++ NDIS_HANDLE Context); ++int ipfw2_qhandler_w32_oldstyle(int direction, NDIS_HANDLE ProtocolBindingContext, ++ unsigned char* HeaderBuffer, unsigned int HeaderBufferSize, ++ unsigned char* LookAheadBuffer, unsigned int LookAheadBufferSize, ++ unsigned int PacketSize); ++void CleanupReinjected(PNDIS_PACKET Packet, struct mbuf* m, PADAPT pAdapt); ++void hexdump(PUCHAR,int, const char *); ++void my_init(); ++void my_exit(); +\ Manca newline alla fine del file +Solo in original_passthru: passthru.htm +Solo in original_passthru: passthru.rc +diff -ubwrp original_passthru/protocol.c kipfw/protocol.c +--- original_passthru/protocol.c 2012-08-01 14:34:15.112304600 +0200 ++++ kipfw/protocol.c 2012-08-01 14:34:11.409179600 +0200 +@@ -841,6 +841,14 @@ Return Value: + SendRsvd = (PSEND_RSVD)(Packet->ProtocolReserved); + Pkt = SendRsvd->OriginalPkt; + ++#if 1 // IPFW - new code ++ //DbgPrint("SendComplete: packet %p pkt %p\n", Packet, Pkt); ++ if (Pkt == NULL) { //this is a reinjected packet, with no 'father' ++ CleanupReinjected(Packet, SendRsvd->pMbuf, pAdapt); ++ return; ++ } ++#endif /* IPFW */ ++ + #ifndef WIN9X + NdisIMCopySendCompletePerPacketInfo (Pkt, Packet); + #endif +@@ -1021,6 +1029,13 @@ Return Value: + + if (pAdapt->MiniportHandle != NULL) + { ++#if 1 /* IPFW: query the firewall */ ++ int ret; ++ ret = ipfw2_qhandler_w32(MyPacket, INCOMING, ++ ProtocolBindingContext); ++ if (ret != PASS) ++ return 0; //otherwise simply continue ++#endif /* end of IPFW code */ + NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &MyPacket, 1); + } + +@@ -1055,6 +1070,13 @@ Return Value: + { + case NdisMedium802_3: + case NdisMediumWan: ++ //DbgPrint("EthIndicateReceive context %p, header at %p len %u, lookahead at %p len %u, packetsize 
%u\n",ProtocolBindingContext,HeaderBuffer,HeaderBufferSize,LookAheadBuffer,LookAheadBufferSize,PacketSize); ++ //hexdump(HeaderBuffer,HeaderBufferSize+LookAheadBufferSize,"EthIndicateReceive"); ++ { ++ int ret = ipfw2_qhandler_w32_oldstyle(INCOMING, ProtocolBindingContext, HeaderBuffer, HeaderBufferSize, LookAheadBuffer, LookAheadBufferSize, PacketSize); ++ if (ret != PASS) ++ return NDIS_STATUS_SUCCESS; ++ } + NdisMEthIndicateReceive(pAdapt->MiniportHandle, + MacReceiveContext, + HeaderBuffer, +@@ -1120,6 +1142,21 @@ Return Value: + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + ULONG Proc = KeGetCurrentProcessorNumber(); + ++ /* Warning: this is a poor implementation of the PtReceiveComplete ++ * made by MS, and it's a well known (but never fixed) issue. ++ * Since the ProcessorNumber here can be different from the one ++ * that processed the PtReceive, sometimes NdisMEthIndicateReceiveComplete ++ * will not be called, causing poor performance in the incoming traffic. ++ * In our driver, PtReceive is called for IP packets ONLY by particularly ++ * old NIC drivers, and the poor performance can be seen even ++ * in traffic not handled by ipfw or dummynet. ++ * Fortunately, this is quite rare, all the incoming IP packets ++ * will arrive through PtReceivePacket, and this callback will never ++ * be called. For reinjected traffic, a workaround is done by ++ * toggling the ReceivedIndicationFlag and calling ++ * NdisMEthIndicateReceiveComplete manually for each packet. ++ */ ++ + if (((pAdapt->MiniportHandle != NULL) + && (pAdapt->MPDeviceState == NdisDeviceStateD0)) + && (pAdapt->ReceivedIndicationFlags[Proc])) +@@ -1199,7 +1236,7 @@ Return Value: + // See also: PtReceive(). + // + (VOID)NdisIMGetCurrentPacketStack(Packet, &Remaining); +- if (Remaining) ++ if (0 && Remaining) + { + // + // We can reuse "Packet". Indicate it up and be done with it.
+@@ -1247,6 +1284,13 @@ Return Value: + + if (pAdapt->MiniportHandle != NULL) + { ++#if 1 /* IPFW: query the firewall */ ++ int ret; ++ ret = ipfw2_qhandler_w32(MyPacket, INCOMING, ++ ProtocolBindingContext); ++ if (ret != PASS) ++ return 0; //otherwise simply continue ++#endif /* end of IPFW code */ + NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &MyPacket, 1); + } + diff --git a/kipfw/winmissing.h b/kipfw/winmissing.h new file mode 100644 index 0000000..5870264 --- /dev/null +++ b/kipfw/winmissing.h @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2010 Francesco Magno, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * $Id: winmissing.h 11647 2012-08-06 23:20:21Z luigi $ + * definitions and other things needed to build freebsd kernel + * modules in Windows (with the MSVC compiler) + */ + +#ifndef _WINMISSING_H_ +#define _WINMISSING_H_ + +#include +#include +#include +#include +#include +#include + +typedef UCHAR u_char; +typedef UCHAR u_int8_t; +typedef UCHAR uint8_t; +typedef USHORT u_short; +typedef USHORT u_int16_t; +typedef USHORT uint16_t; +typedef USHORT n_short; +typedef UINT u_int; +typedef INT32 int32_t; +typedef UINT32 u_int32_t; +typedef UINT32 uint32_t; +typedef ULONG u_long; +typedef ULONG n_long; +typedef UINT64 uint64_t; +typedef UINT64 u_int64_t; +typedef INT64 int64_t; + +typedef UINT32 in_addr_t; +typedef UCHAR sa_family_t; +typedef USHORT in_port_t; +typedef UINT32 __gid_t; +typedef UINT32 gid_t; +typedef UINT32 __uid_t; +typedef UINT32 uid_t; +typedef ULONG n_time; +typedef char* caddr_t; + +/* linux_lookup uses __be32 and __be16 in the prototype */ +typedef uint32_t __be32; /* XXX __u32 __bitwise __be32 */ +typedef uint16_t __be16; /* XXX */ + +//*** DEBUG STUFF *** +/* + * To see the debugging messages you need DbgView +http://technet.microsoft.com/en-us/sysinternals/bb896647.aspx + */ +#define printf DbgPrint +#define log(lev, ...) DbgPrint(__VA_ARGS__) +const char* texify_cmd(int i); +const char* texify_proto(unsigned int p); +//*** end DEBUG STUFF *** + +#define snprintf _snprintf +#define timespec timeval +struct timeval { + long tv_sec; + long tv_usec; +}; + +struct in_addr { + in_addr_t s_addr; +}; + +struct sockaddr_in { + uint8_t sin_len; + sa_family_t sin_family; + in_port_t sin_port; + struct in_addr sin_addr; + char sin_zero[8]; +}; + +/* XXX watch out, windows names are actually longer */ +#define IFNAMSIZ 16 +#define IF_NAMESIZE 16 + +#define ETHER_ADDR_LEN 6 + +/* we do not include the windows headers for in6_addr so + * we need to provide our own definition for the kernel. 
+ */ +struct in6_addr { + union { + uint8_t __u6_addr8[16]; + uint16_t __u6_addr16[8]; + uint32_t __u6_addr32[4]; + } __u6_addr; /* 128-bit IP6 address */ +}; + +#define htons(x) RtlUshortByteSwap(x) +#define ntohs(x) RtlUshortByteSwap(x) +#define htonl(x) RtlUlongByteSwap(x) +#define ntohl(x) RtlUlongByteSwap(x) + +#define ENOSPC 28 /* No space left on device */ +#define EOPNOTSUPP 45 /* Operation not supported */ +#define EACCES 13 /* Permission denied */ +#define ENOENT 2 /* No such file or directory */ +#define EINVAL 22 /* Invalid argument */ +#define EPROTONOSUPPORT 43 /* Protocol not supported */ +#define ENOMEM 12 /* Cannot allocate memory */ +#define EEXIST 17 /* File exists */ +#define ESRCH 3 /* No such process */ +#define ENOBUFS 55 /* No buffer space available */ +#define EBUSY 16 /* Module busy */ + + +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#define __packed +#define __aligned(x) /* expands to nothing, like __packed: no stray ';' injected into declarations */ +#define __user +#define __init +#define __exit +#define __func__ __FUNCTION__ +#define inline __inline + +struct sockaddr_in6 { + int dummy; +}; + +//SPINLOCKS +#define DEFINE_SPINLOCK(x) NDIS_SPIN_LOCK x +#define mtx_init(m,a,b,c) NdisAllocateSpinLock(m) +#define mtx_lock(_l) NdisAcquireSpinLock(_l) +#define mtx_unlock(_l) NdisReleaseSpinLock(_l) +#define mtx_destroy(m) NdisFreeSpinLock(m) +#define mtx_assert(a, b) + +#define rw_rlock(_l) NdisAcquireSpinLock(_l) +#define rw_runlock(_l) NdisReleaseSpinLock(_l) +#define rw_assert(a, b) +#define rw_wlock(_l) NdisAcquireSpinLock(_l) +#define rw_wunlock(_l) NdisReleaseSpinLock(_l) +#define rw_destroy(_l) NdisFreeSpinLock(_l) +#define rw_init(_l, msg) NdisAllocateSpinLock(_l) +#define rw_init_flags(_l, s, v) NdisAllocateSpinLock(_l) + +#define rwlock_t NDIS_SPIN_LOCK +#define spinlock_t NDIS_SPIN_LOCK + +#define s6_addr __u6_addr.__u6_addr8 + + +struct icmphdr { + u_char icmp_type; /* type of message, see below */ + u_char icmp_code; /* type sub code */ + u_short icmp_cksum; /* ones complement cksum of struct */ +}; + +#define
ICMP_ECHO 8 /* echo service */ + +#define IPOPT_OPTVAL 0 /* option ID */ +#define IPOPT_OLEN 1 /* option length */ +#define IPOPT_EOL 0 /* end of option list */ +#define IPOPT_NOP 1 /* no operation */ +#define IPOPT_LSRR 131 /* loose source route */ +#define IPOPT_SSRR 137 /* strict source route */ +#define IPOPT_RR 7 /* record packet route */ +#define IPOPT_TS 68 /* timestamp */ + +#define IPPROTO_ICMP 1 /* control message protocol */ +#define IPPROTO_TCP 6 /* tcp */ +#define IPPROTO_UDP 17 /* user datagram protocol */ +#define IPPROTO_ICMPV6 58 /* ICMP6 */ +#define IPPROTO_SCTP 132 /* SCTP */ +#define IPPROTO_HOPOPTS 0 /* IP6 hop-by-hop options */ +#define IPPROTO_ROUTING 43 /* IP6 routing header */ +#define IPPROTO_FRAGMENT 44 /* IP6 fragmentation header */ +#define IPPROTO_DSTOPTS 60 /* IP6 destination option */ +#define IPPROTO_AH 51 /* IP6 Auth Header */ +#define IPPROTO_ESP 50 /* IP6 Encap Sec. Payload */ +#define IPPROTO_NONE 59 /* IP6 no next header */ +#define IPPROTO_PIM 103 /* Protocol Independent Mcast */ + +#define IPPROTO_IPV6 41 +#define IPPROTO_IPV4 4 /* IPv4 encapsulation */ + + +#define INADDR_ANY (uint32_t)0x00000000 + +#define AF_INET 2 /* internetwork: UDP, TCP, etc. 
*/ +#define AF_LINK 18 /* Link layer interface */ + +#define IN_CLASSD(i) (((uint32_t)(i) & 0xf0000000) == 0xe0000000) +#define IN_MULTICAST(i) IN_CLASSD(i) + +#define DROP 0 +#define PASS 1 +#define DUMMYNET 2 +#define INCOMING 0 +#define OUTGOING 1 + +size_t strlcpy(char *dst, const char *src, size_t siz); +void do_gettimeofday(struct timeval *tv); +int ffs(int bits); +int time_uptime_w32(); + +#endif /* _WINMISSING_H_ */ diff --git a/kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk b/kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk new file mode 100644 index 0000000..de31ced Binary files /dev/null and b/kmod-ipfw3_2.4.35.4-brcm-2.4-1_mipsel.ipk differ diff --git a/modified_passthru/miniport.c b/modified_passthru/miniport.c new file mode 100644 index 0000000..3baff88 --- /dev/null +++ b/modified_passthru/miniport.c @@ -0,0 +1,1481 @@ +/*++ + +Copyright (c) 1992-2000 Microsoft Corporation + +Module Name: + + miniport.c + +Abstract: + + Ndis Intermediate Miniport driver sample. This is a passthru driver. + +Author: + +Environment: + + +Revision History: + + +--*/ + +#include "precomp.h" +#pragma hdrstop + + + +NDIS_STATUS +MPInitialize( + OUT PNDIS_STATUS OpenErrorStatus, + OUT PUINT SelectedMediumIndex, + IN PNDIS_MEDIUM MediumArray, + IN UINT MediumArraySize, + IN NDIS_HANDLE MiniportAdapterHandle, + IN NDIS_HANDLE WrapperConfigurationContext + ) +/*++ + +Routine Description: + + This is the initialize handler which gets called as a result of + the BindAdapter handler calling NdisIMInitializeDeviceInstanceEx. + The context parameter which we pass there is the adapter structure + which we retrieve here. + + Arguments: + + OpenErrorStatus Not used by us. 
+ SelectedMediumIndex Place-holder for what media we are using + MediumArray Array of ndis media passed down to us to pick from + MediumArraySize Size of the array + MiniportAdapterHandle The handle NDIS uses to refer to us + WrapperConfigurationContext For use by NdisOpenConfiguration + +Return Value: + + NDIS_STATUS_SUCCESS unless something goes wrong + +--*/ +{ + UINT i; + PADAPT pAdapt; + NDIS_STATUS Status = NDIS_STATUS_FAILURE; + NDIS_MEDIUM Medium; + + UNREFERENCED_PARAMETER(WrapperConfigurationContext); + + do + { + // + // Start off by retrieving our adapter context and storing + // the Miniport handle in it. + // + pAdapt = NdisIMGetDeviceContext(MiniportAdapterHandle); + pAdapt->MiniportIsHalted = FALSE; + + DBGPRINT(("==> Miniport Initialize: Adapt %p\n", pAdapt)); + + // + // Usually we export the medium type of the adapter below as our + // virtual miniport's medium type. However if the adapter below us + // is a WAN device, then we claim to be of medium type 802.3. + // + Medium = pAdapt->Medium; + + if (Medium == NdisMediumWan) + { + Medium = NdisMedium802_3; + } + + for (i = 0; i < MediumArraySize; i++) + { + if (MediumArray[i] == Medium) + { + *SelectedMediumIndex = i; + break; + } + } + + if (i == MediumArraySize) + { + Status = NDIS_STATUS_UNSUPPORTED_MEDIA; + break; + } + + + // + // Set the attributes now. NDIS_ATTRIBUTE_DESERIALIZE enables us + // to make up-calls to NDIS without having to call NdisIMSwitchToMiniport + // or NdisIMQueueCallBack. This also forces us to protect our data using + // spinlocks where appropriate. Also in this case NDIS does not queue + // packets on our behalf. Since this is a very simple pass-thru + // miniport, we do not have a need to protect anything. However in + // a general case there will be a need to use per-adapter spin-locks + // for the packet queues at the very least. 
+ // + NdisMSetAttributesEx(MiniportAdapterHandle, + pAdapt, + 0, // CheckForHangTimeInSeconds + NDIS_ATTRIBUTE_IGNORE_PACKET_TIMEOUT | + NDIS_ATTRIBUTE_IGNORE_REQUEST_TIMEOUT| + NDIS_ATTRIBUTE_INTERMEDIATE_DRIVER | + NDIS_ATTRIBUTE_DESERIALIZE | + NDIS_ATTRIBUTE_NO_HALT_ON_SUSPEND, + 0); + + pAdapt->MiniportHandle = MiniportAdapterHandle; + // + // Initialize LastIndicatedStatus to be NDIS_STATUS_MEDIA_CONNECT + // + pAdapt->LastIndicatedStatus = NDIS_STATUS_MEDIA_CONNECT; + + // + // Initialize the power states for both the lower binding (PTDeviceState) + // and our miniport edge to Powered On. + // + pAdapt->MPDeviceState = NdisDeviceStateD0; + pAdapt->PTDeviceState = NdisDeviceStateD0; + + // + // Add this adapter to the global pAdapt List + // + NdisAcquireSpinLock(&GlobalLock); + + pAdapt->Next = pAdaptList; + pAdaptList = pAdapt; + + NdisReleaseSpinLock(&GlobalLock); + + // + // Create an ioctl interface + // + (VOID)PtRegisterDevice(); + + Status = NDIS_STATUS_SUCCESS; + } + while (FALSE); + + // + // If we had received an UnbindAdapter notification on the underlying + // adapter, we would have blocked that thread waiting for the IM Init + // process to complete. Wake up any such thread. + // + ASSERT(pAdapt->MiniportInitPending == TRUE); + pAdapt->MiniportInitPending = FALSE; + NdisSetEvent(&pAdapt->MiniportInitEvent); + + if (Status == NDIS_STATUS_SUCCESS) + { + PtReferenceAdapt(pAdapt); + } + + DBGPRINT(("<== Miniport Initialize: Adapt %p, Status %x\n", pAdapt, Status)); + + *OpenErrorStatus = Status; + + + return Status; +} + + +NDIS_STATUS +MPSend( + IN NDIS_HANDLE MiniportAdapterContext, + IN PNDIS_PACKET Packet, + IN UINT Flags + ) +/*++ + +Routine Description: + + Send Packet handler. Either this or our SendPackets (array) handler is called + based on which one is enabled in our Miniport Characteristics. 
+ +Arguments: + + MiniportAdapterContext Pointer to the adapter + Packet Packet to send + Flags Unused, passed down below + +Return Value: + + Return code from NdisSend + +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + NDIS_STATUS Status; + PNDIS_PACKET MyPacket; + PVOID MediaSpecificInfo = NULL; + ULONG MediaSpecificInfoSize = 0; + + // + // The driver should fail the send if the virtual miniport is in low + // power state + // + if (pAdapt->MPDeviceState > NdisDeviceStateD0) + { + return NDIS_STATUS_FAILURE; + } + +#ifdef NDIS51 + // + // Use NDIS 5.1 packet stacking: + // + if (0) // XXX IPFW - make sure we don't go in here + { + PNDIS_PACKET_STACK pStack; + BOOLEAN Remaining; + + // + // Packet stacks: Check if we can use the same packet for sending down. + // + + pStack = NdisIMGetCurrentPacketStack(Packet, &Remaining); + if (Remaining) + { + // + // We can reuse "Packet". + // + // NOTE: if we needed to keep per-packet information in packets + // sent down, we can use pStack->IMReserved[]. + // + ASSERT(pStack); + // + // If the below miniport is going to low power state, stop sending down any packet. + // + NdisAcquireSpinLock(&pAdapt->Lock); + if (pAdapt->PTDeviceState > NdisDeviceStateD0) + { + NdisReleaseSpinLock(&pAdapt->Lock); + return NDIS_STATUS_FAILURE; + } + pAdapt->OutstandingSends++; + NdisReleaseSpinLock(&pAdapt->Lock); + NdisSend(&Status, + pAdapt->BindingHandle, + Packet); + + if (Status != NDIS_STATUS_PENDING) + { + ADAPT_DECR_PENDING_SENDS(pAdapt); + } + + return(Status); + } + } +#endif // NDIS51 + + // + // We are either not using packet stacks, or there isn't stack space + // in the original packet passed down to us. Allocate a new packet + // to wrap the data with. + // + // + // If the below miniport is going to low power state, stop sending down any packet. 
+ // + NdisAcquireSpinLock(&pAdapt->Lock); + if (pAdapt->PTDeviceState > NdisDeviceStateD0) + { + NdisReleaseSpinLock(&pAdapt->Lock); + return NDIS_STATUS_FAILURE; + + } + pAdapt->OutstandingSends++; + NdisReleaseSpinLock(&pAdapt->Lock); + + NdisAllocatePacket(&Status, + &MyPacket, + pAdapt->SendPacketPoolHandle); + + if (Status == NDIS_STATUS_SUCCESS) + { + PSEND_RSVD SendRsvd; + + // + // Save a pointer to the original packet in our reserved + // area in the new packet. This is needed so that we can + // get back to the original packet when the new packet's send + // is completed. + // + SendRsvd = (PSEND_RSVD)(MyPacket->ProtocolReserved); + SendRsvd->OriginalPkt = Packet; + + NdisGetPacketFlags(MyPacket) = Flags; + + // + // Set up the new packet so that it describes the same + // data as the original packet. + // + NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet); + NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet); +#ifdef WIN9X + // + // Work around the fact that NDIS does not initialize this + // to FALSE on Win9x. + // + NDIS_PACKET_VALID_COUNTS(MyPacket) = FALSE; +#endif + + // + // Copy the OOB Offset from the original packet to the new + // packet. + // + NdisMoveMemory(NDIS_OOB_DATA_FROM_PACKET(MyPacket), + NDIS_OOB_DATA_FROM_PACKET(Packet), + sizeof(NDIS_PACKET_OOB_DATA)); + +#ifndef WIN9X + // + // Copy the right parts of per packet info into the new packet. + // This API is not available on Win9x since task offload is + // not supported on that platform. 
+ // + NdisIMCopySendPerPacketInfo(MyPacket, Packet); +#endif + + // + // Copy the Media specific information + // + NDIS_GET_PACKET_MEDIA_SPECIFIC_INFO(Packet, + &MediaSpecificInfo, + &MediaSpecificInfoSize); + + if (MediaSpecificInfo || MediaSpecificInfoSize) + { + NDIS_SET_PACKET_MEDIA_SPECIFIC_INFO(MyPacket, + MediaSpecificInfo, + MediaSpecificInfoSize); + } +#if 1 /* IPFW: query the firewall */ + /* if dummynet keeps the packet, we mimic success. + * otherwise continue as usual. + */ + { + int ret = ipfw2_qhandler_w32(MyPacket, OUTGOING, + MiniportAdapterContext); + if (ret != PASS) { + if (ret == DROP) + return NDIS_STATUS_FAILURE; + else { //dummynet kept the packet +#ifndef WIN9X + NdisIMCopySendCompletePerPacketInfo (Packet, MyPacket); +#endif + return NDIS_STATUS_SUCCESS; //otherwise simply continue + } + } + } +#endif /* end of IPFW code */ + + NdisSend(&Status, + pAdapt->BindingHandle, + MyPacket); + + + if (Status != NDIS_STATUS_PENDING) + { +#ifndef WIN9X + NdisIMCopySendCompletePerPacketInfo (Packet, MyPacket); +#endif + NdisFreePacket(MyPacket); + ADAPT_DECR_PENDING_SENDS(pAdapt); + } + } + else + { + ADAPT_DECR_PENDING_SENDS(pAdapt); + // + // We are out of packets. Silently drop it. Alternatively we can deal with it: + // - By keeping separate send and receive pools + // - Dynamically allocate more pools as needed and free them when not needed + // + } + + return(Status); +} + + +VOID +MPSendPackets( + IN NDIS_HANDLE MiniportAdapterContext, + IN PPNDIS_PACKET PacketArray, + IN UINT NumberOfPackets + ) +/*++ + +Routine Description: + + Send Packet Array handler. Either this or our SendPacket handler is called + based on which one is enabled in our Miniport Characteristics. 
+ +Arguments: + + MiniportAdapterContext Pointer to our adapter + PacketArray Set of packets to send + NumberOfPackets Self-explanatory + +Return Value: + + None + +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + NDIS_STATUS Status; + UINT i; + PVOID MediaSpecificInfo = NULL; + UINT MediaSpecificInfoSize = 0; + + + for (i = 0; i < NumberOfPackets; i++) + { + PNDIS_PACKET Packet, MyPacket; + + Packet = PacketArray[i]; + // + // The driver should fail the send if the virtual miniport is in low + // power state + // + if (pAdapt->MPDeviceState > NdisDeviceStateD0) + { + NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt), + Packet, + NDIS_STATUS_FAILURE); + continue; + } + +#ifdef NDIS51 + + // + // Use NDIS 5.1 packet stacking: + // + { + PNDIS_PACKET_STACK pStack; + BOOLEAN Remaining; + + // + // Packet stacks: Check if we can use the same packet for sending down. + // + pStack = NdisIMGetCurrentPacketStack(Packet, &Remaining); + if (Remaining) + { + // + // We can reuse "Packet". + // + // NOTE: if we needed to keep per-packet information in packets + // sent down, we can use pStack->IMReserved[]. + // + ASSERT(pStack); + // + // If the below miniport is going to low power state, stop sending down any packet. + // + NdisAcquireSpinLock(&pAdapt->Lock); + if (pAdapt->PTDeviceState > NdisDeviceStateD0) + { + NdisReleaseSpinLock(&pAdapt->Lock); + NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt), + Packet, + NDIS_STATUS_FAILURE); + } + else + { + pAdapt->OutstandingSends++; + NdisReleaseSpinLock(&pAdapt->Lock); + + NdisSend(&Status, + pAdapt->BindingHandle, + Packet); + + if (Status != NDIS_STATUS_PENDING) + { + NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt), + Packet, + Status); + + ADAPT_DECR_PENDING_SENDS(pAdapt); + } + } + continue; + } + } +#endif + do + { + NdisAcquireSpinLock(&pAdapt->Lock); + // + // If the below miniport is going to low power state, stop sending down any packet. 
+ // + if (pAdapt->PTDeviceState > NdisDeviceStateD0) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + pAdapt->OutstandingSends++; + NdisReleaseSpinLock(&pAdapt->Lock); + + NdisAllocatePacket(&Status, + &MyPacket, + pAdapt->SendPacketPoolHandle); + + if (Status == NDIS_STATUS_SUCCESS) + { + PSEND_RSVD SendRsvd; + + SendRsvd = (PSEND_RSVD)(MyPacket->ProtocolReserved); + SendRsvd->OriginalPkt = Packet; + + NdisGetPacketFlags(MyPacket) = NdisGetPacketFlags(Packet); + + NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet); + NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet); +#ifdef WIN9X + // + // Work around the fact that NDIS does not initialize this + // to FALSE on Win9x. + // + NDIS_PACKET_VALID_COUNTS(MyPacket) = FALSE; +#endif // WIN9X + + // + // Copy the OOB data from the original packet to the new + // packet. + // + NdisMoveMemory(NDIS_OOB_DATA_FROM_PACKET(MyPacket), + NDIS_OOB_DATA_FROM_PACKET(Packet), + sizeof(NDIS_PACKET_OOB_DATA)); + // + // Copy relevant parts of the per packet info into the new packet + // +#ifndef WIN9X + NdisIMCopySendPerPacketInfo(MyPacket, Packet); +#endif + + // + // Copy the Media specific information + // + NDIS_GET_PACKET_MEDIA_SPECIFIC_INFO(Packet, + &MediaSpecificInfo, + &MediaSpecificInfoSize); + + if (MediaSpecificInfo || MediaSpecificInfoSize) + { + NDIS_SET_PACKET_MEDIA_SPECIFIC_INFO(MyPacket, + MediaSpecificInfo, + MediaSpecificInfoSize); + } + + NdisSend(&Status, + pAdapt->BindingHandle, + MyPacket); + + if (Status != NDIS_STATUS_PENDING) + { +#ifndef WIN9X + NdisIMCopySendCompletePerPacketInfo (Packet, MyPacket); +#endif + NdisFreePacket(MyPacket); + ADAPT_DECR_PENDING_SENDS(pAdapt); + } + } + else + { + // + // The driver cannot allocate a packet. 
+ // + ADAPT_DECR_PENDING_SENDS(pAdapt); + } + } + while (FALSE); + + if (Status != NDIS_STATUS_PENDING) + { + NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt), + Packet, + Status); + } + } +} + + +NDIS_STATUS +MPQueryInformation( + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_OID Oid, + IN PVOID InformationBuffer, + IN ULONG InformationBufferLength, + OUT PULONG BytesWritten, + OUT PULONG BytesNeeded + ) +/*++ + +Routine Description: + + Entry point called by NDIS to query for the value of the specified OID. + Typical processing is to forward the query down to the underlying miniport. + + The following OIDs are filtered here: + + OID_PNP_QUERY_POWER - return success right here + + OID_GEN_SUPPORTED_GUIDS - do not forward, otherwise we will show up + multiple instances of private GUIDs supported by the underlying miniport. + + OID_PNP_CAPABILITIES - we do send this down to the lower miniport, but + the values returned are postprocessed before we complete this request; + see PtRequestComplete. + + NOTE on OID_TCP_TASK_OFFLOAD - if this IM driver modifies the contents + of data it passes through such that a lower miniport may not be able + to perform TCP task offload, then it should not forward this OID down, + but fail it here with the status NDIS_STATUS_NOT_SUPPORTED. This is to + avoid performing incorrect transformations on data. + + If our miniport edge (upper edge) is at a low-power state, fail the request. + + If our protocol edge (lower edge) has been notified of a low-power state, + we pend this request until the miniport below has been set to D0. Since + requests to miniports are serialized always, at most a single request will + be pended. 
+ +Arguments: + + MiniportAdapterContext Pointer to the adapter structure + Oid Oid for this query + InformationBuffer Buffer for information + InformationBufferLength Size of this buffer + BytesWritten Specifies how much info is written + BytesNeeded In case the buffer is smaller than what we need, tell them how much is needed + + +Return Value: + + Return code from the NdisRequest below. + +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + NDIS_STATUS Status = NDIS_STATUS_FAILURE; + + do + { + if (Oid == OID_PNP_QUERY_POWER) + { + // + // Do not forward this. + // + Status = NDIS_STATUS_SUCCESS; + break; + } + + if (Oid == OID_GEN_SUPPORTED_GUIDS) + { + // + // Do not forward this, otherwise we will end up with multiple + // instances of private GUIDs that the underlying miniport + // supports. + // + Status = NDIS_STATUS_NOT_SUPPORTED; + break; + } + + if (Oid == OID_TCP_TASK_OFFLOAD) + { + // + // Fail this -if- this driver performs data transformations + // that can interfere with a lower driver's ability to offload + // TCP tasks. 
+ // + // Status = NDIS_STATUS_NOT_SUPPORTED; + // break; + // + } + // + // If the miniport below is unbinding, just fail any request + // + NdisAcquireSpinLock(&pAdapt->Lock); + if (pAdapt->UnbindingInProcess == TRUE) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + NdisReleaseSpinLock(&pAdapt->Lock); + // + // All other queries are failed, if the miniport is not at D0. + // + if (pAdapt->MPDeviceState > NdisDeviceStateD0) + { + Status = NDIS_STATUS_FAILURE; + break; + } + + pAdapt->Request.RequestType = NdisRequestQueryInformation; + pAdapt->Request.DATA.QUERY_INFORMATION.Oid = Oid; + pAdapt->Request.DATA.QUERY_INFORMATION.InformationBuffer = InformationBuffer; + pAdapt->Request.DATA.QUERY_INFORMATION.InformationBufferLength = InformationBufferLength; + pAdapt->BytesNeeded = BytesNeeded; + pAdapt->BytesReadOrWritten = BytesWritten; + + // + // If the miniport below is unbinding, fail the request + // + NdisAcquireSpinLock(&pAdapt->Lock); + + if (pAdapt->UnbindingInProcess == TRUE) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + // + // If the Protocol device state is OFF, mark this request as being + // pended. We queue this until the device state is back to D0. 
+ // + if ((pAdapt->PTDeviceState > NdisDeviceStateD0) + && (pAdapt->StandingBy == FALSE)) + { + pAdapt->QueuedRequest = TRUE; + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_PENDING; + break; + } + // + // This is in the process of powering down the system, always fail the request + // + if (pAdapt->StandingBy == TRUE) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + pAdapt->OutstandingRequests = TRUE; + + NdisReleaseSpinLock(&pAdapt->Lock); + + // + // default case, most requests will be passed to the miniport below + // + NdisRequest(&Status, + pAdapt->BindingHandle, + &pAdapt->Request); + + + if (Status != NDIS_STATUS_PENDING) + { + PtRequestComplete(pAdapt, &pAdapt->Request, Status); + Status = NDIS_STATUS_PENDING; + } + + } while (FALSE); + + return(Status); + +} + + +VOID +MPQueryPNPCapabilities( + IN OUT PADAPT pAdapt, + OUT PNDIS_STATUS pStatus + ) +/*++ + +Routine Description: + + Postprocess a request for OID_PNP_CAPABILITIES that was forwarded + down to the underlying miniport, and has been completed by it. + +Arguments: + + pAdapt - Pointer to the adapter structure + pStatus - Place to return final status + +Return Value: + + None. + +--*/ + +{ + PNDIS_PNP_CAPABILITIES pPNPCapabilities; + PNDIS_PM_WAKE_UP_CAPABILITIES pPMstruct; + + if (pAdapt->Request.DATA.QUERY_INFORMATION.InformationBufferLength >= sizeof(NDIS_PNP_CAPABILITIES)) + { + pPNPCapabilities = (PNDIS_PNP_CAPABILITIES)(pAdapt->Request.DATA.QUERY_INFORMATION.InformationBuffer); + + // + // The following fields must be overwritten by an IM driver. 
+ // + pPMstruct= & pPNPCapabilities->WakeUpCapabilities; + pPMstruct->MinMagicPacketWakeUp = NdisDeviceStateUnspecified; + pPMstruct->MinPatternWakeUp = NdisDeviceStateUnspecified; + pPMstruct->MinLinkChangeWakeUp = NdisDeviceStateUnspecified; + *pAdapt->BytesReadOrWritten = sizeof(NDIS_PNP_CAPABILITIES); + *pAdapt->BytesNeeded = 0; + + + // + // Setting our internal flags + // Default, device is ON + // + pAdapt->MPDeviceState = NdisDeviceStateD0; + pAdapt->PTDeviceState = NdisDeviceStateD0; + + *pStatus = NDIS_STATUS_SUCCESS; + } + else + { + *pAdapt->BytesNeeded= sizeof(NDIS_PNP_CAPABILITIES); + *pStatus = NDIS_STATUS_RESOURCES; + } +} + + +NDIS_STATUS +MPSetInformation( + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_OID Oid, + __in_bcount(InformationBufferLength) IN PVOID InformationBuffer, + IN ULONG InformationBufferLength, + OUT PULONG BytesRead, + OUT PULONG BytesNeeded + ) +/*++ + +Routine Description: + + Miniport SetInfo handler. + + In the case of OID_PNP_SET_POWER, record the power state and return the OID. + Do not pass below + If the device is suspended, do not block the SET_POWER_OID + as it is used to reactivate the Passthru miniport + + + PM- If the MP is not ON (DeviceState > D0) return immediately (except for 'query power' and 'set power') + If MP is ON, but the PT is not at D0, then queue the request for later processing + + Requests to miniports are always serialized + + +Arguments: + + MiniportAdapterContext Pointer to the adapter structure + Oid Oid for this set request + InformationBuffer Buffer for information + InformationBufferLength Size of this buffer + BytesRead Specifies how much info is read + BytesNeeded In case the buffer is smaller than what we need, tell them how much is needed + +Return Value: + + Return code from the NdisRequest below. 
+ +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + NDIS_STATUS Status; + + Status = NDIS_STATUS_FAILURE; + + do + { + // + // The Set Power should not be sent to the miniport below the Passthru, but is handled internally + // + if (Oid == OID_PNP_SET_POWER) + { + MPProcessSetPowerOid(&Status, + pAdapt, + InformationBuffer, + InformationBufferLength, + BytesRead, + BytesNeeded); + break; + + } + + // + // If the miniport below is unbinding, fail the request + // + NdisAcquireSpinLock(&pAdapt->Lock); + if (pAdapt->UnbindingInProcess == TRUE) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + NdisReleaseSpinLock(&pAdapt->Lock); + // + // All other Set Information requests are failed, if the miniport is + // not at D0 or is transitioning to a device state greater than D0. + // + if (pAdapt->MPDeviceState > NdisDeviceStateD0) + { + Status = NDIS_STATUS_FAILURE; + break; + } + + // Set up the Request and return the result + pAdapt->Request.RequestType = NdisRequestSetInformation; + pAdapt->Request.DATA.SET_INFORMATION.Oid = Oid; + pAdapt->Request.DATA.SET_INFORMATION.InformationBuffer = InformationBuffer; + pAdapt->Request.DATA.SET_INFORMATION.InformationBufferLength = InformationBufferLength; + pAdapt->BytesNeeded = BytesNeeded; + pAdapt->BytesReadOrWritten = BytesRead; + + // + // If the miniport below is unbinding, fail the request + // + NdisAcquireSpinLock(&pAdapt->Lock); + if (pAdapt->UnbindingInProcess == TRUE) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + + // + // If the device below is at a low power state, we cannot send it the + // request now, and must pend it. 
+ // + if ((pAdapt->PTDeviceState > NdisDeviceStateD0) + && (pAdapt->StandingBy == FALSE)) + { + pAdapt->QueuedRequest = TRUE; + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_PENDING; + break; + } + // + // This is in the process of powering down the system, always fail the request + // + if (pAdapt->StandingBy == TRUE) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + pAdapt->OutstandingRequests = TRUE; + + NdisReleaseSpinLock(&pAdapt->Lock); + // + // Forward the request to the device below. + // + NdisRequest(&Status, + pAdapt->BindingHandle, + &pAdapt->Request); + + if (Status != NDIS_STATUS_PENDING) + { + *BytesRead = pAdapt->Request.DATA.SET_INFORMATION.BytesRead; + *BytesNeeded = pAdapt->Request.DATA.SET_INFORMATION.BytesNeeded; + pAdapt->OutstandingRequests = FALSE; + } + + } while (FALSE); + + return(Status); +} + + +VOID +MPProcessSetPowerOid( + IN OUT PNDIS_STATUS pNdisStatus, + IN PADAPT pAdapt, + __in_bcount(InformationBufferLength) IN PVOID InformationBuffer, + IN ULONG InformationBufferLength, + OUT PULONG BytesRead, + OUT PULONG BytesNeeded + ) +/*++ + +Routine Description: + This routine does all the processing for a request with a SetPower Oid + The miniport should accept the Set Power and transition to the new state + + The Set Power should not be passed to the miniport below + + If the IM miniport is going into a low power state, then there is no guarantee if it will ever + be asked to go back to D0, before getting halted. No requests should be pended or queued. + + +Arguments: + pNdisStatus - Status of the operation + pAdapt - The Adapter structure + InformationBuffer - The New DeviceState + InformationBufferLength + BytesRead - No of bytes read + BytesNeeded - No of bytes needed + + +Return Value: + Status - NDIS_STATUS_SUCCESS if all the wait events succeed. 
+ +--*/ +{ + + + NDIS_DEVICE_POWER_STATE NewDeviceState; + + DBGPRINT(("==>MPProcessSetPowerOid: Adapt %p\n", pAdapt)); + + ASSERT (InformationBuffer != NULL); + + *pNdisStatus = NDIS_STATUS_FAILURE; + + do + { + // + // Check for invalid length + // + if (InformationBufferLength < sizeof(NDIS_DEVICE_POWER_STATE)) + { + *pNdisStatus = NDIS_STATUS_INVALID_LENGTH; + break; + } + + NewDeviceState = (*(PNDIS_DEVICE_POWER_STATE)InformationBuffer); + + // + // Check for invalid device state + // + if ((pAdapt->MPDeviceState > NdisDeviceStateD0) && (NewDeviceState != NdisDeviceStateD0)) + { + // + // If the miniport is in a non-D0 state, the miniport can only receive a Set Power to D0 + // + ASSERT (!(pAdapt->MPDeviceState > NdisDeviceStateD0) && (NewDeviceState != NdisDeviceStateD0)); + + *pNdisStatus = NDIS_STATUS_FAILURE; + break; + } + + // + // Is the miniport transitioning from an On (D0) state to an Low Power State (>D0) + // If so, then set the StandingBy Flag - (Block all incoming requests) + // + if (pAdapt->MPDeviceState == NdisDeviceStateD0 && NewDeviceState > NdisDeviceStateD0) + { + pAdapt->StandingBy = TRUE; + } + + // + // If the miniport is transitioning from a low power state to ON (D0), then clear the StandingBy flag + // All incoming requests will be pended until the physical miniport turns ON. 
+ // + if (pAdapt->MPDeviceState > NdisDeviceStateD0 && NewDeviceState == NdisDeviceStateD0) + { + pAdapt->StandingBy = FALSE; + } + + // + // Now update the state in the pAdapt structure; + // + pAdapt->MPDeviceState = NewDeviceState; + + *pNdisStatus = NDIS_STATUS_SUCCESS; + + + } while (FALSE); + + if (*pNdisStatus == NDIS_STATUS_SUCCESS) + { + // + // The miniport resume from low power state + // + if (pAdapt->StandingBy == FALSE) + { + // + // If we need to indicate the media connect state + // + if (pAdapt->LastIndicatedStatus != pAdapt->LatestUnIndicateStatus) + { + if (pAdapt->MiniportHandle != NULL) + { + NdisMIndicateStatus(pAdapt->MiniportHandle, + pAdapt->LatestUnIndicateStatus, + (PVOID)NULL, + 0); + NdisMIndicateStatusComplete(pAdapt->MiniportHandle); + pAdapt->LastIndicatedStatus = pAdapt->LatestUnIndicateStatus; + } + } + } + else + { + // + // Initialize LatestUnIndicatedStatus + // + pAdapt->LatestUnIndicateStatus = pAdapt->LastIndicatedStatus; + } + *BytesRead = sizeof(NDIS_DEVICE_POWER_STATE); + *BytesNeeded = 0; + } + else + { + *BytesRead = 0; + *BytesNeeded = sizeof (NDIS_DEVICE_POWER_STATE); + } + + DBGPRINT(("<==MPProcessSetPowerOid: Adapt %p\n", pAdapt)); +} + + +VOID +MPReturnPacket( + IN NDIS_HANDLE MiniportAdapterContext, + IN PNDIS_PACKET Packet + ) +/*++ + +Routine Description: + + NDIS Miniport entry point called whenever protocols are done with + a packet that we had indicated up and they had queued up for returning + later. + +Arguments: + + MiniportAdapterContext - pointer to ADAPT structure + Packet - packet being returned. + +Return Value: + + None. + +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + +#ifdef NDIS51 + // + // Packet stacking: Check if this packet belongs to us. + // + if (NdisGetPoolFromPacket(Packet) != pAdapt->RecvPacketPoolHandle) + { + // + // We reused the original packet in a receive indication. + // Simply return it to the miniport below us. 
+ // + NdisReturnPackets(&Packet, 1); + } + else +#endif // NDIS51 + { + // + // This is a packet allocated from this IM's receive packet pool. + // Reclaim our packet, and return the original to the driver below. + // + + PNDIS_PACKET MyPacket; + PRECV_RSVD RecvRsvd; + + RecvRsvd = (PRECV_RSVD)(Packet->MiniportReserved); + MyPacket = RecvRsvd->OriginalPkt; + + NdisFreePacket(Packet); + NdisReturnPackets(&MyPacket, 1); + } +} + + +NDIS_STATUS +MPTransferData( + OUT PNDIS_PACKET Packet, + OUT PUINT BytesTransferred, + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_HANDLE MiniportReceiveContext, + IN UINT ByteOffset, + IN UINT BytesToTransfer + ) +/*++ + +Routine Description: + + Miniport's transfer data handler. + +Arguments: + + Packet Destination packet + BytesTransferred Place-holder for how much data was copied + MiniportAdapterContext Pointer to the adapter structure + MiniportReceiveContext Context + ByteOffset Offset into the packet for copying data + BytesToTransfer How much to copy. + +Return Value: + + Status of transfer + +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + NDIS_STATUS Status; + + // + // Return, if the device is OFF + // + + if (IsIMDeviceStateOn(pAdapt) == FALSE) + { + return NDIS_STATUS_FAILURE; + } + + NdisTransferData(&Status, + pAdapt->BindingHandle, + MiniportReceiveContext, + ByteOffset, + BytesToTransfer, + Packet, + BytesTransferred); + + return(Status); +} + +VOID +MPHalt( + IN NDIS_HANDLE MiniportAdapterContext + ) +/*++ + +Routine Description: + + Halt handler. All the hard-work for clean-up is done here. + +Arguments: + + MiniportAdapterContext Pointer to the Adapter + +Return Value: + + None. 
+ +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + NDIS_STATUS Status; + PADAPT *ppCursor; + + DBGPRINT(("==>MiniportHalt: Adapt %p\n", pAdapt)); + + pAdapt->MiniportHandle = NULL; + pAdapt->MiniportIsHalted = TRUE; + + // + // Remove this adapter from the global list + // + NdisAcquireSpinLock(&GlobalLock); + + for (ppCursor = &pAdaptList; *ppCursor != NULL; ppCursor = &(*ppCursor)->Next) + { + if (*ppCursor == pAdapt) + { + *ppCursor = pAdapt->Next; + break; + } + } + + NdisReleaseSpinLock(&GlobalLock); + + // + // Delete the ioctl interface that was created when the miniport + // was created. + // + (VOID)PtDeregisterDevice(); + + // + // If we have a valid bind, close the miniport below the protocol + // +#pragma prefast(suppress: __WARNING_DEREF_NULL_PTR, "pAdapt cannot be NULL") + if (pAdapt->BindingHandle != NULL) + { + // + // Close the binding below. and wait for it to complete + // + NdisResetEvent(&pAdapt->Event); + + NdisCloseAdapter(&Status, pAdapt->BindingHandle); + + if (Status == NDIS_STATUS_PENDING) + { + NdisWaitEvent(&pAdapt->Event, 0); + Status = pAdapt->Status; + } + + ASSERT (Status == NDIS_STATUS_SUCCESS); + + pAdapt->BindingHandle = NULL; + + PtDereferenceAdapt(pAdapt); + } + + if (PtDereferenceAdapt(pAdapt)) + { + pAdapt = NULL; + } + + + DBGPRINT(("<== MiniportHalt: pAdapt %p\n", pAdapt)); +} + + +#ifdef NDIS51_MINIPORT + +VOID +MPCancelSendPackets( + IN NDIS_HANDLE MiniportAdapterContext, + IN PVOID CancelId + ) +/*++ + +Routine Description: + + The miniport entry point to handle cancellation of all send packets + that match the given CancelId. If we have queued any packets that match + this, then we should dequeue them and call NdisMSendComplete for all + such packets, with a status of NDIS_STATUS_REQUEST_ABORTED. + + We should also call NdisCancelSendPackets in turn, on each lower binding + that this adapter corresponds to. This is to let miniports below cancel + any matching packets. 
+ +Arguments: + + MiniportAdapterContext - pointer to ADAPT structure + CancelId - ID of packets to be cancelled. + +Return Value: + + None + +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + + // + // If we queue packets on our adapter structure, this would be + // the place to acquire a spinlock to it, unlink any packets whose + // Id matches CancelId, release the spinlock and call NdisMSendComplete + // with NDIS_STATUS_REQUEST_ABORTED for all unlinked packets. + // + + // + // Next, pass this down so that we let the miniport(s) below cancel + // any packets that they might have queued. + // + NdisCancelSendPackets(pAdapt->BindingHandle, CancelId); + + return; +} + +VOID +MPDevicePnPEvent( + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_DEVICE_PNP_EVENT DevicePnPEvent, + IN PVOID InformationBuffer, + IN ULONG InformationBufferLength + ) +/*++ + +Routine Description: + + This handler is called to notify us of PnP events directed to + our miniport device object. + +Arguments: + + MiniportAdapterContext - pointer to ADAPT structure + DevicePnPEvent - the event + InformationBuffer - Points to additional event-specific information + InformationBufferLength - length of above + +Return Value: + + None +--*/ +{ + // TBD - add code/comments about processing this. + + UNREFERENCED_PARAMETER(MiniportAdapterContext); + UNREFERENCED_PARAMETER(DevicePnPEvent); + UNREFERENCED_PARAMETER(InformationBuffer); + UNREFERENCED_PARAMETER(InformationBufferLength); + + return; +} + +VOID +MPAdapterShutdown( + IN NDIS_HANDLE MiniportAdapterContext + ) +/*++ + +Routine Description: + + This handler is called to notify us of an impending system shutdown. + +Arguments: + + MiniportAdapterContext - pointer to ADAPT structure + +Return Value: + + None +--*/ +{ + UNREFERENCED_PARAMETER(MiniportAdapterContext); + + return; +} + +#endif + + +VOID +MPFreeAllPacketPools( + IN PADAPT pAdapt + ) +/*++ + +Routine Description: + + Free all packet pools on the specified adapter. 
+ +Arguments: + + pAdapt - pointer to ADAPT structure + +Return Value: + + None + +--*/ +{ + if (pAdapt->RecvPacketPoolHandle != NULL) + { + // + // Free the packet pool that is used to indicate receives + // + NdisFreePacketPool(pAdapt->RecvPacketPoolHandle); + + pAdapt->RecvPacketPoolHandle = NULL; + } + + if (pAdapt->SendPacketPoolHandle != NULL) + { + + // + // Free the packet pool that is used to send packets below + // + + NdisFreePacketPool(pAdapt->SendPacketPoolHandle); + + pAdapt->SendPacketPoolHandle = NULL; + + } +} + diff --git a/modified_passthru/passthru.c b/modified_passthru/passthru.c new file mode 100644 index 0000000..c366173 --- /dev/null +++ b/modified_passthru/passthru.c @@ -0,0 +1,469 @@ +/*++ + +Copyright (c) 1992-2000 Microsoft Corporation + +Module Name: + + passthru.c + +Abstract: + + Ndis Intermediate Miniport driver sample. This is a passthru driver. + +Author: + +Environment: + + +Revision History: + + +--*/ + + +#include "precomp.h" +#pragma hdrstop + +#pragma NDIS_INIT_FUNCTION(DriverEntry) + +NDIS_HANDLE ProtHandle = NULL; +NDIS_HANDLE DriverHandle = NULL; +NDIS_MEDIUM MediumArray[4] = + { + NdisMedium802_3, // Ethernet + NdisMedium802_5, // Token-ring + NdisMediumFddi, // Fddi + NdisMediumWan // NDISWAN + }; + +NDIS_SPIN_LOCK GlobalLock; + +PADAPT pAdaptList = NULL; +LONG MiniportCount = 0; + +NDIS_HANDLE NdisWrapperHandle; + +// +// To support ioctls from user-mode: +// + +#define STR2(x) #x +#define STR(x) STR2(x) +#define DOSPREFIX "\\DosDevices\\" +#define NTPREFIX "\\Device\\" +#define WIDEN2(x) L ## x +#define WIDEN(x) WIDEN2(x) +#define LINKNAME_STRING WIDEN(DOSPREFIX) WIDEN(STR(MODULENAME)) +#define NTDEVICE_STRING WIDEN(NTPREFIX) WIDEN(STR(MODULENAME)) +#define PROTOCOLNAME_STRING WIDEN(STR(MODULENAME)) + +NDIS_HANDLE NdisDeviceHandle = NULL; +PDEVICE_OBJECT ControlDeviceObject = NULL; + +enum _DEVICE_STATE +{ + PS_DEVICE_STATE_READY = 0, // ready for create/delete + PS_DEVICE_STATE_CREATING, // create operation in progress 
+ PS_DEVICE_STATE_DELETING // delete operation in progress +} ControlDeviceState = PS_DEVICE_STATE_READY; + + + +NTSTATUS +DriverEntry( + IN PDRIVER_OBJECT DriverObject, + IN PUNICODE_STRING RegistryPath + ) +/*++ + +Routine Description: + + First entry point to be called, when this driver is loaded. + Register with NDIS as an intermediate driver. + +Arguments: + + DriverObject - pointer to the system's driver object structure + for this driver + + RegistryPath - system's registry path for this driver + +Return Value: + + STATUS_SUCCESS if all initialization is successful, STATUS_XXX + error code if not. + +--*/ +{ + NDIS_STATUS Status; + NDIS_PROTOCOL_CHARACTERISTICS PChars; + NDIS_MINIPORT_CHARACTERISTICS MChars; + NDIS_STRING Name; + + Status = NDIS_STATUS_SUCCESS; + NdisAllocateSpinLock(&GlobalLock); + + NdisMInitializeWrapper(&NdisWrapperHandle, DriverObject, RegistryPath, NULL); + + do + { + // + // Register the miniport with NDIS. Note that it is the miniport + // which was started as a driver and not the protocol. Also the miniport + // must be registered prior to the protocol since the protocol's BindAdapter + // handler can be initiated anytime and when it is, it must be ready to + // start driver instances. 
+ // + + NdisZeroMemory(&MChars, sizeof(NDIS_MINIPORT_CHARACTERISTICS)); + + MChars.MajorNdisVersion = PASSTHRU_MAJOR_NDIS_VERSION; + MChars.MinorNdisVersion = PASSTHRU_MINOR_NDIS_VERSION; + + MChars.InitializeHandler = MPInitialize; + MChars.QueryInformationHandler = MPQueryInformation; + MChars.SetInformationHandler = MPSetInformation; + MChars.ResetHandler = NULL; + MChars.TransferDataHandler = MPTransferData; + MChars.HaltHandler = MPHalt; +#ifdef NDIS51_MINIPORT + MChars.CancelSendPacketsHandler = MPCancelSendPackets; + MChars.PnPEventNotifyHandler = MPDevicePnPEvent; + MChars.AdapterShutdownHandler = MPAdapterShutdown; +#endif // NDIS51_MINIPORT + + // + // We will disable the check for hang timeout so we do not + // need a check for hang handler! + // + MChars.CheckForHangHandler = NULL; + MChars.ReturnPacketHandler = MPReturnPacket; + + // + // Either the Send or the SendPackets handler should be specified. + // If SendPackets handler is specified, SendHandler is ignored + // + MChars.SendHandler = MPSend; // IPFW: use MPSend, not SendPackets + MChars.SendPacketsHandler = NULL; + + Status = NdisIMRegisterLayeredMiniport(NdisWrapperHandle, + &MChars, + sizeof(MChars), + &DriverHandle); + if (Status != NDIS_STATUS_SUCCESS) + { + break; + } + +#ifndef WIN9X + NdisMRegisterUnloadHandler(NdisWrapperHandle, PtUnload); +#endif + + // + // Now register the protocol. + // + NdisZeroMemory(&PChars, sizeof(NDIS_PROTOCOL_CHARACTERISTICS)); + PChars.MajorNdisVersion = PASSTHRU_PROT_MAJOR_NDIS_VERSION; + PChars.MinorNdisVersion = PASSTHRU_PROT_MINOR_NDIS_VERSION; + + // + // Make sure the protocol-name matches the service-name + // (from the INF) under which this protocol is installed. + // This is needed to ensure that NDIS can correctly determine + // the binding and call us to bind to miniports below. 
+ // + NdisInitUnicodeString(&Name, PROTOCOLNAME_STRING); // Protocol name + PChars.Name = Name; + PChars.OpenAdapterCompleteHandler = PtOpenAdapterComplete; + PChars.CloseAdapterCompleteHandler = PtCloseAdapterComplete; + PChars.SendCompleteHandler = PtSendComplete; + PChars.TransferDataCompleteHandler = PtTransferDataComplete; + + PChars.ResetCompleteHandler = PtResetComplete; + PChars.RequestCompleteHandler = PtRequestComplete; + PChars.ReceiveHandler = PtReceive; + PChars.ReceiveCompleteHandler = PtReceiveComplete; + PChars.StatusHandler = PtStatus; + PChars.StatusCompleteHandler = PtStatusComplete; + PChars.BindAdapterHandler = PtBindAdapter; + PChars.UnbindAdapterHandler = PtUnbindAdapter; + PChars.UnloadHandler = PtUnloadProtocol; + + PChars.ReceivePacketHandler = PtReceivePacket; + PChars.PnPEventHandler= PtPNPHandler; + + NdisRegisterProtocol(&Status, + &ProtHandle, + &PChars, + sizeof(NDIS_PROTOCOL_CHARACTERISTICS)); + + if (Status != NDIS_STATUS_SUCCESS) + { + NdisIMDeregisterLayeredMiniport(DriverHandle); + break; + } + + NdisIMAssociateMiniport(DriverHandle, ProtHandle); + } + while (FALSE); + + if (Status != NDIS_STATUS_SUCCESS) + { + NdisTerminateWrapper(NdisWrapperHandle, NULL); + return(Status); // IPFW - registration failed: do not start ipfw on a driver that is not going to load + } + ipfw_module_init(); // IPFW - start the system + + return(Status); +} + + +NDIS_STATUS +PtRegisterDevice( + VOID + ) +/*++ + +Routine Description: + + Register an ioctl interface - a device object to be used for this + purpose is created by NDIS when we call NdisMRegisterDevice. + + This routine is called whenever a new miniport instance is + initialized. However, we only create one global device object, + when the first miniport instance is initialized. This routine + handles potential race conditions with PtDeregisterDevice via + the ControlDeviceState and MiniportCount variables. + + NOTE: do not call this from DriverEntry; it will prevent the driver + from being unloaded (e.g. on uninstall).
+ +Arguments: + + None + +Return Value: + + NDIS_STATUS_SUCCESS if we successfully register a device object. + +--*/ +{ + NDIS_STATUS Status = NDIS_STATUS_SUCCESS; + UNICODE_STRING DeviceName; + UNICODE_STRING DeviceLinkUnicodeString; + PDRIVER_DISPATCH DispatchTable[IRP_MJ_MAXIMUM_FUNCTION+1]; + + DBGPRINT(("==>PtRegisterDevice\n")); + + NdisAcquireSpinLock(&GlobalLock); + + ++MiniportCount; + + if (1 == MiniportCount) + { + ASSERT(ControlDeviceState != PS_DEVICE_STATE_CREATING); + + // + // Another thread could be running PtDeregisterDevice on + // behalf of another miniport instance. If so, wait for + // it to exit. + // + while (ControlDeviceState != PS_DEVICE_STATE_READY) + { + NdisReleaseSpinLock(&GlobalLock); + NdisMSleep(1); + NdisAcquireSpinLock(&GlobalLock); + } + + ControlDeviceState = PS_DEVICE_STATE_CREATING; + + NdisReleaseSpinLock(&GlobalLock); + + + NdisZeroMemory(DispatchTable, (IRP_MJ_MAXIMUM_FUNCTION+1) * sizeof(PDRIVER_DISPATCH)); + + DispatchTable[IRP_MJ_CREATE] = PtDispatch; + DispatchTable[IRP_MJ_CLEANUP] = PtDispatch; + DispatchTable[IRP_MJ_CLOSE] = PtDispatch; + // IPFW we use DevIoControl ? + DispatchTable[IRP_MJ_DEVICE_CONTROL] = DevIoControl; + + + NdisInitUnicodeString(&DeviceName, NTDEVICE_STRING); + NdisInitUnicodeString(&DeviceLinkUnicodeString, LINKNAME_STRING); + + // + // Create a device object and register our dispatch handlers + // + + Status = NdisMRegisterDevice( + NdisWrapperHandle, + &DeviceName, + &DeviceLinkUnicodeString, + &DispatchTable[0], + &ControlDeviceObject, + &NdisDeviceHandle + ); + + NdisAcquireSpinLock(&GlobalLock); + + ControlDeviceState = PS_DEVICE_STATE_READY; + } + + NdisReleaseSpinLock(&GlobalLock); + + DBGPRINT(("<==PtRegisterDevice: %x\n", Status)); + + return (Status); +} + + +NTSTATUS +PtDispatch( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp + ) +/*++ +Routine Description: + + Process IRPs sent to this device. 
+ +Arguments: + + DeviceObject - pointer to a device object + Irp - pointer to an I/O Request Packet + +Return Value: + + NTSTATUS - STATUS_SUCCESS always - change this when adding + real code to handle ioctls. + +--*/ +{ + PIO_STACK_LOCATION irpStack; + NTSTATUS status = STATUS_SUCCESS; + + UNREFERENCED_PARAMETER(DeviceObject); + + DBGPRINT(("==>Pt Dispatch\n")); + irpStack = IoGetCurrentIrpStackLocation(Irp); + + + switch (irpStack->MajorFunction) + { + case IRP_MJ_CREATE: + break; + + case IRP_MJ_CLEANUP: + break; + + case IRP_MJ_CLOSE: + break; + + case IRP_MJ_DEVICE_CONTROL: + // + // Add code here to handle ioctl commands sent to passthru. + // + break; + default: + break; + } + + Irp->IoStatus.Status = status; + IoCompleteRequest(Irp, IO_NO_INCREMENT); + + DBGPRINT(("<== Pt Dispatch\n")); + + return status; + +} + + +NDIS_STATUS +PtDeregisterDevice( + VOID + ) +/*++ + +Routine Description: + + Deregister the ioctl interface. This is called whenever a miniport + instance is halted. When the last miniport instance is halted, we + request NDIS to delete the device object + +Arguments: + + NdisDeviceHandle - Handle returned by NdisMRegisterDevice + +Return Value: + + NDIS_STATUS_SUCCESS if everything worked ok + +--*/ +{ + NDIS_STATUS Status = NDIS_STATUS_SUCCESS; + + DBGPRINT(("==>PassthruDeregisterDevice\n")); + + NdisAcquireSpinLock(&GlobalLock); + + ASSERT(MiniportCount > 0); + + --MiniportCount; + + if (0 == MiniportCount) + { + // + // All miniport instances have been halted. Deregister + // the control device. + // + + ASSERT(ControlDeviceState == PS_DEVICE_STATE_READY); + + // + // Block PtRegisterDevice() while we release the control + // device lock and deregister the device. 
+ // + ControlDeviceState = PS_DEVICE_STATE_DELETING; + + NdisReleaseSpinLock(&GlobalLock); + + if (NdisDeviceHandle != NULL) + { + Status = NdisMDeregisterDevice(NdisDeviceHandle); + NdisDeviceHandle = NULL; + } + + NdisAcquireSpinLock(&GlobalLock); + ControlDeviceState = PS_DEVICE_STATE_READY; + } + + NdisReleaseSpinLock(&GlobalLock); + + DBGPRINT(("<== PassthruDeregisterDevice: %x\n", Status)); + return Status; + +} + +VOID +PtUnload( + IN PDRIVER_OBJECT DriverObject + ) +// +// PassThru driver unload function +// +{ + UNREFERENCED_PARAMETER(DriverObject); + + DBGPRINT(("PtUnload: entered\n")); + + PtUnloadProtocol(); + + NdisIMDeregisterLayeredMiniport(DriverHandle); + + NdisFreeSpinLock(&GlobalLock); + + ipfw_module_exit(); // IPFW unloading dummynet + + DBGPRINT(("PtUnload: done!\n")); +} diff --git a/modified_passthru/passthru.h b/modified_passthru/passthru.h new file mode 100644 index 0000000..6e79db7 --- /dev/null +++ b/modified_passthru/passthru.h @@ -0,0 +1,500 @@ +/*++ + +Copyright (c) 1992-2000 Microsoft Corporation + +Module Name: + + passthru.h + +Abstract: + + Ndis Intermediate Miniport driver sample. This is a passthru driver. 
+ +Author: + +Environment: + + +Revision History: + + +--*/ + +#ifdef NDIS51_MINIPORT +#define PASSTHRU_MAJOR_NDIS_VERSION 5 +#define PASSTHRU_MINOR_NDIS_VERSION 1 +#else +#define PASSTHRU_MAJOR_NDIS_VERSION 4 +#define PASSTHRU_MINOR_NDIS_VERSION 0 +#endif + +#ifdef NDIS51 +#define PASSTHRU_PROT_MAJOR_NDIS_VERSION 5 +#define PASSTHRU_PROT_MINOR_NDIS_VERSION 0 +#else +#define PASSTHRU_PROT_MAJOR_NDIS_VERSION 4 +#define PASSTHRU_PROT_MINOR_NDIS_VERSION 0 +#endif + +#define MAX_BUNDLEID_LENGTH 50 + +#define TAG 'ImPa' +#define WAIT_INFINITE 0 + + + +//advance declaration +typedef struct _ADAPT ADAPT, *PADAPT; + +DRIVER_INITIALIZE DriverEntry; +extern +NTSTATUS +DriverEntry( + IN PDRIVER_OBJECT DriverObject, + IN PUNICODE_STRING RegistryPath + ); + +DRIVER_DISPATCH PtDispatch; +NTSTATUS +PtDispatch( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp + ); + +DRIVER_DISPATCH DevIoControl; +NTSTATUS +DevIoControl( + IN PDEVICE_OBJECT pDeviceObject, + IN PIRP pIrp + ); + +NDIS_STATUS +PtRegisterDevice( + VOID + ); + +NDIS_STATUS +PtDeregisterDevice( + VOID + ); + +DRIVER_UNLOAD PtUnload; +VOID +PtUnloadProtocol( + VOID + ); + +// +// Protocol proto-types +// +extern +VOID +PtOpenAdapterComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS Status, + IN NDIS_STATUS OpenErrorStatus + ); + +extern +VOID +PtCloseAdapterComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS Status + ); + +extern +VOID +PtResetComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS Status + ); + +extern +VOID +PtRequestComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_REQUEST NdisRequest, + IN NDIS_STATUS Status + ); + +extern +VOID +PtStatus( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS GeneralStatus, + IN PVOID StatusBuffer, + IN UINT StatusBufferSize + ); + +extern +VOID +PtStatusComplete( + IN NDIS_HANDLE ProtocolBindingContext + ); + +extern +VOID +PtSendComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_PACKET Packet, + 
IN NDIS_STATUS Status + ); + +extern +VOID +PtTransferDataComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_PACKET Packet, + IN NDIS_STATUS Status, + IN UINT BytesTransferred + ); + +extern +NDIS_STATUS +PtReceive( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_HANDLE MacReceiveContext, + IN PVOID HeaderBuffer, + IN UINT HeaderBufferSize, + IN PVOID LookAheadBuffer, + IN UINT LookaheadBufferSize, + IN UINT PacketSize + ); + +extern +VOID +PtReceiveComplete( + IN NDIS_HANDLE ProtocolBindingContext + ); + +extern +INT +PtReceivePacket( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_PACKET Packet + ); + +extern +VOID +PtBindAdapter( + OUT PNDIS_STATUS Status, + IN NDIS_HANDLE BindContext, + IN PNDIS_STRING DeviceName, + IN PVOID SystemSpecific1, + IN PVOID SystemSpecific2 + ); + +extern +VOID +PtUnbindAdapter( + OUT PNDIS_STATUS Status, + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_HANDLE UnbindContext + ); + +VOID +PtUnload( + IN PDRIVER_OBJECT DriverObject + ); + + + +extern +NDIS_STATUS +PtPNPHandler( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNET_PNP_EVENT pNetPnPEvent + ); + + + + +NDIS_STATUS +PtPnPNetEventReconfigure( + IN PADAPT pAdapt, + IN PNET_PNP_EVENT pNetPnPEvent + ); + +NDIS_STATUS +PtPnPNetEventSetPower ( + IN PADAPT pAdapt, + IN PNET_PNP_EVENT pNetPnPEvent + ); + + +// +// Miniport proto-types +// +NDIS_STATUS +MPInitialize( + OUT PNDIS_STATUS OpenErrorStatus, + OUT PUINT SelectedMediumIndex, + IN PNDIS_MEDIUM MediumArray, + IN UINT MediumArraySize, + IN NDIS_HANDLE MiniportAdapterHandle, + IN NDIS_HANDLE WrapperConfigurationContext + ); + +VOID +MPSendPackets( + IN NDIS_HANDLE MiniportAdapterContext, + IN PPNDIS_PACKET PacketArray, + IN UINT NumberOfPackets + ); + +NDIS_STATUS +MPSend( + IN NDIS_HANDLE MiniportAdapterContext, + IN PNDIS_PACKET Packet, + IN UINT Flags + ); + +NDIS_STATUS +MPQueryInformation( + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_OID Oid, + IN PVOID InformationBuffer, + IN ULONG 
InformationBufferLength, + OUT PULONG BytesWritten, + OUT PULONG BytesNeeded + ); + +NDIS_STATUS +MPSetInformation( + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_OID Oid, + __in_bcount(InformationBufferLength) IN PVOID InformationBuffer, + IN ULONG InformationBufferLength, + OUT PULONG BytesRead, + OUT PULONG BytesNeeded + ); + +VOID +MPReturnPacket( + IN NDIS_HANDLE MiniportAdapterContext, + IN PNDIS_PACKET Packet + ); + +NDIS_STATUS +MPTransferData( + OUT PNDIS_PACKET Packet, + OUT PUINT BytesTransferred, + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_HANDLE MiniportReceiveContext, + IN UINT ByteOffset, + IN UINT BytesToTransfer + ); + +VOID +MPHalt( + IN NDIS_HANDLE MiniportAdapterContext + ); + + +VOID +MPQueryPNPCapabilities( + OUT PADAPT MiniportProtocolContext, + OUT PNDIS_STATUS Status + ); + + +#ifdef NDIS51_MINIPORT + +VOID +MPCancelSendPackets( + IN NDIS_HANDLE MiniportAdapterContext, + IN PVOID CancelId + ); + +VOID +MPAdapterShutdown( + IN NDIS_HANDLE MiniportAdapterContext + ); + +VOID +MPDevicePnPEvent( + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_DEVICE_PNP_EVENT DevicePnPEvent, + IN PVOID InformationBuffer, + IN ULONG InformationBufferLength + ); + +#endif // NDIS51_MINIPORT + +VOID +MPFreeAllPacketPools( + IN PADAPT pAdapt + ); + + +VOID +MPProcessSetPowerOid( + IN OUT PNDIS_STATUS pNdisStatus, + IN PADAPT pAdapt, + __in_bcount(InformationBufferLength) IN PVOID InformationBuffer, + IN ULONG InformationBufferLength, + OUT PULONG BytesRead, + OUT PULONG BytesNeeded + ); + +VOID +PtReferenceAdapt( + IN PADAPT pAdapt + ); + +BOOLEAN +PtDereferenceAdapt( + IN PADAPT pAdapt + ); + +// +// There should be no DbgPrint's in the Free version of the driver +// +#if DBG + +#define DBGPRINT(Fmt) \ + { \ + DbgPrint("Passthru: "); \ + DbgPrint Fmt; \ + } + +#else // if DBG + +#define DBGPRINT(Fmt) + +#endif // if DBG + +#define NUM_PKTS_IN_POOL 256 + + +// +// Protocol reserved part of a sent packet that is allocated by us. 
+// +typedef struct _SEND_RSVD +{ + PNDIS_PACKET OriginalPkt; + struct mbuf* pMbuf; // IPFW extension, reference to the mbuf +} SEND_RSVD, *PSEND_RSVD; + +// +// Miniport reserved part of a received packet that is allocated by +// us. Note that this should fit into the MiniportReserved space +// in an NDIS_PACKET. +// +typedef struct _RECV_RSVD +{ + PNDIS_PACKET OriginalPkt; + struct mbuf* pMbuf; // IPFW extension, reference to the mbuf +} RECV_RSVD, *PRECV_RSVD; + +C_ASSERT(sizeof(RECV_RSVD) <= sizeof(((PNDIS_PACKET)0)->MiniportReserved)); + +// +// Event Codes related to the PassthruEvent Structure +// + +typedef enum +{ + Passthru_Invalid, + Passthru_SetPower, + Passthru_Unbind + +} PASSSTHRU_EVENT_CODE, *PPASTHRU_EVENT_CODE; + +// +// Passthru event paired with a code stating why the event was set +// + +typedef struct _PASSTHRU_EVENT +{ + NDIS_EVENT Event; + PASSSTHRU_EVENT_CODE Code; + +} PASSTHRU_EVENT, *PPASSTHRU_EVENT; + + +// +// Structure used by both the miniport as well as the protocol part of the intermediate driver +// to represent an adapter and its corresponding lower binding +// +typedef struct _ADAPT +{ + struct _ADAPT * Next; + + NDIS_HANDLE BindingHandle; // To the lower miniport + NDIS_HANDLE MiniportHandle; // NDIS handle for miniport up-calls + NDIS_HANDLE SendPacketPoolHandle; + NDIS_HANDLE RecvPacketPoolHandle; + NDIS_STATUS Status; // Open Status + NDIS_EVENT Event; // Used by bind/halt for Open/Close Adapter synch. + NDIS_MEDIUM Medium; + NDIS_REQUEST Request; // This is used to wrap a request coming down + // to us. This exploits the fact that requests + // are serialized down to us.
+ PULONG BytesNeeded; + PULONG BytesReadOrWritten; + BOOLEAN ReceivedIndicationFlags[32]; + + BOOLEAN OutstandingRequests; // TRUE iff a request is pending + // at the miniport below + BOOLEAN QueuedRequest; // TRUE iff a request is queued at + // this IM miniport + + BOOLEAN StandingBy; // True - When the miniport or protocol is transitioning from a D0 to Standby (>D0) State + BOOLEAN UnbindingInProcess; + NDIS_SPIN_LOCK Lock; + // False - At all other times, - Flag is cleared after a transition to D0 + + NDIS_DEVICE_POWER_STATE MPDeviceState; // Miniport's Device State + NDIS_DEVICE_POWER_STATE PTDeviceState; // Protocol's Device State + NDIS_STRING DeviceName; // For initializing the miniport edge + NDIS_EVENT MiniportInitEvent; // For blocking UnbindAdapter while + // an IM Init is in progress. + BOOLEAN MiniportInitPending; // TRUE iff IMInit in progress + NDIS_STATUS LastIndicatedStatus; // The last indicated media status + NDIS_STATUS LatestUnIndicateStatus; // The latest suppressed media status + ULONG OutstandingSends; + LONG RefCount; + BOOLEAN MiniportIsHalted; +} ADAPT, *PADAPT; + +extern NDIS_HANDLE ProtHandle, DriverHandle; +extern NDIS_MEDIUM MediumArray[4]; +extern PADAPT pAdaptList; +extern NDIS_SPIN_LOCK GlobalLock; + + +#define ADAPT_MINIPORT_HANDLE(_pAdapt) ((_pAdapt)->MiniportHandle) +#define ADAPT_DECR_PENDING_SENDS(_pAdapt) \ + { \ + NdisAcquireSpinLock(&(_pAdapt)->Lock); \ + (_pAdapt)->OutstandingSends--; \ + NdisReleaseSpinLock(&(_pAdapt)->Lock); \ + } + +// +// Custom Macros to be used by the passthru driver +// +/* +BOOLEAN +IsIMDeviceStateOn( + PADAPT + ) + +*/ +#define IsIMDeviceStateOn(_pP) ((_pP)->MPDeviceState == NdisDeviceStateD0 && (_pP)->PTDeviceState == NdisDeviceStateD0 ) + +#include "winmissing.h" + +int ipfw_module_init(void); +void ipfw_module_exit(void); +int ipfw2_qhandler_w32(PNDIS_PACKET pNdisPacket, int direction, + NDIS_HANDLE Context); +int ipfw2_qhandler_w32_oldstyle(int direction, NDIS_HANDLE ProtocolBindingContext, 
+ unsigned char* HeaderBuffer, unsigned int HeaderBufferSize, + unsigned char* LookAheadBuffer, unsigned int LookAheadBufferSize, + unsigned int PacketSize); +void CleanupReinjected(PNDIS_PACKET Packet, struct mbuf* m, PADAPT pAdapt); +void hexdump(PUCHAR,int, const char *); +void my_init(); +void my_exit(); \ No newline at end of file diff --git a/modified_passthru/precomp.h b/modified_passthru/precomp.h new file mode 100644 index 0000000..b2870d1 --- /dev/null +++ b/modified_passthru/precomp.h @@ -0,0 +1,11 @@ +#pragma warning(disable:4214) // bit field types other than int + +#pragma warning(disable:4201) // nameless struct/union +#pragma warning(disable:4115) // named type definition in parentheses +#pragma warning(disable:4127) // conditional expression is constant +#pragma warning(disable:4054) // cast of function pointer to PVOID +#pragma warning(disable:4244) // conversion from 'int' to 'BOOLEAN', possible loss of data + +#include <ndis.h> +#include "passthru.h" + diff --git a/modified_passthru/protocol.c b/modified_passthru/protocol.c new file mode 100644 index 0000000..9db4c36 --- /dev/null +++ b/modified_passthru/protocol.c @@ -0,0 +1,1670 @@ +/*++ + +Copyright(c) 1992-2000 Microsoft Corporation + +Module Name: + + protocol.c + +Abstract: + + Ndis Intermediate Miniport driver sample. This is a passthru driver. + +Author: + +Environment: + + +Revision History: + + +--*/ + + +#include "precomp.h" +#pragma hdrstop + +#define MAX_PACKET_POOL_SIZE 0x0000FFFF +#define MIN_PACKET_POOL_SIZE 0x000000FF + +// +// NDIS version as 0xMMMMmmmm, where M=Major/m=minor (0x00050001 = 5.1); +// initially unknown (0) +// +ULONG NdisDotSysVersion = 0x0; + + +#define NDIS_SYS_VERSION_51 0x00050001 + + +VOID +PtBindAdapter( + OUT PNDIS_STATUS Status, + IN NDIS_HANDLE BindContext, + IN PNDIS_STRING DeviceName, + IN PVOID SystemSpecific1, + IN PVOID SystemSpecific2 + ) +/*++ + +Routine Description: + + Called by NDIS to bind to a miniport below.
+ +Arguments: + + Status - Return status of bind here. + BindContext - Can be passed to NdisCompleteBindAdapter if this call is pended. + DeviceName - Device name to bind to. This is passed to NdisOpenAdapter. + SystemSpecific1 - Can be passed to NdisOpenProtocolConfiguration to read per-binding information + SystemSpecific2 - Unused + +Return Value: + + NDIS_STATUS_PENDING if this call is pended. In this case call NdisCompleteBindAdapter + to complete. + Anything else Completes this call synchronously + +--*/ +{ + NDIS_HANDLE ConfigHandle = NULL; + PNDIS_CONFIGURATION_PARAMETER Param; + NDIS_STRING DeviceStr = NDIS_STRING_CONST("UpperBindings"); + NDIS_STRING NdisVersionStr = NDIS_STRING_CONST("NdisVersion"); + PADAPT pAdapt = NULL; + NDIS_STATUS Sts; + UINT MediumIndex; + ULONG TotalSize; + BOOLEAN NoCleanUpNeeded = FALSE; + + + UNREFERENCED_PARAMETER(BindContext); + UNREFERENCED_PARAMETER(SystemSpecific2); + + DBGPRINT(("==> Protocol BindAdapter\n")); + + do + { + // + // Access the configuration section for our binding-specific + // parameters. + // + NdisOpenProtocolConfiguration(Status, + &ConfigHandle, + SystemSpecific1); + + if (*Status != NDIS_STATUS_SUCCESS) + { + break; + } + if (NdisDotSysVersion == 0) + { + NdisReadConfiguration(Status, + &Param, + ConfigHandle, + &NdisVersionStr, // "NdisVersion" + NdisParameterInteger); + if (*Status != NDIS_STATUS_SUCCESS) + { + break; + } + + NdisDotSysVersion = Param->ParameterData.IntegerData; + } + + + // + // Read the "UpperBindings" reserved key that contains a list + // of device names representing our miniport instances corresponding + // to this lower binding. Since this is a 1:1 IM driver, this key + // contains exactly one name. 
+ // + // If we want to implement a N:1 mux driver (N adapter instances + // over a single lower binding), then UpperBindings will be a + // MULTI_SZ containing a list of device names - we would loop through + // this list, calling NdisIMInitializeDeviceInstanceEx once for + // each name in it. + // + NdisReadConfiguration(Status, + &Param, + ConfigHandle, + &DeviceStr, + NdisParameterString); + if (*Status != NDIS_STATUS_SUCCESS) + { + break; + } + + // + // Allocate memory for the Adapter structure. This represents both the + // protocol context as well as the adapter structure when the miniport + // is initialized. + // + // In addition to the base structure, allocate space for the device + // instance string. + // + TotalSize = sizeof(ADAPT) + Param->ParameterData.StringData.MaximumLength; + + NdisAllocateMemoryWithTag(&pAdapt, TotalSize, TAG); + + if (pAdapt == NULL) + { + *Status = NDIS_STATUS_RESOURCES; + break; + } + + // + // Initialize the adapter structure. We copy in the IM device + // name as well, because we may need to use it in a call to + // NdisIMCancelInitializeDeviceInstance. The string returned + // by NdisReadConfiguration is active (i.e. available) only + // for the duration of this call to our BindAdapter handler. + // + NdisZeroMemory(pAdapt, TotalSize); + pAdapt->DeviceName.MaximumLength = Param->ParameterData.StringData.MaximumLength; + pAdapt->DeviceName.Length = Param->ParameterData.StringData.Length; + pAdapt->DeviceName.Buffer = (PWCHAR)((ULONG_PTR)pAdapt + sizeof(ADAPT)); + NdisMoveMemory(pAdapt->DeviceName.Buffer, + Param->ParameterData.StringData.Buffer, + Param->ParameterData.StringData.MaximumLength); + + + + NdisInitializeEvent(&pAdapt->Event); + NdisAllocateSpinLock(&pAdapt->Lock); + + // + // Allocate a packet pool for sends. We need this to pass sends down. + // We cannot use the same packet descriptor that came down to our send + // handler (see also NDIS 5.1 packet stacking). 
+ // + NdisAllocatePacketPoolEx(Status, + &pAdapt->SendPacketPoolHandle, + MIN_PACKET_POOL_SIZE, + MAX_PACKET_POOL_SIZE - MIN_PACKET_POOL_SIZE, + sizeof(SEND_RSVD)); + + if (*Status != NDIS_STATUS_SUCCESS) + { + break; + } + + // + // Allocate a packet pool for receives. We need this to indicate receives. + // Same consideration as sends (see also NDIS 5.1 packet stacking). + // + NdisAllocatePacketPoolEx(Status, + &pAdapt->RecvPacketPoolHandle, + MIN_PACKET_POOL_SIZE, + MAX_PACKET_POOL_SIZE - MIN_PACKET_POOL_SIZE, + PROTOCOL_RESERVED_SIZE_IN_PACKET); + + if (*Status != NDIS_STATUS_SUCCESS) + { + break; + } + + // + // Now open the adapter below and complete the initialization + // + NdisOpenAdapter(Status, + &Sts, + &pAdapt->BindingHandle, + &MediumIndex, + MediumArray, + sizeof(MediumArray)/sizeof(NDIS_MEDIUM), + ProtHandle, + pAdapt, + DeviceName, + 0, + NULL); + + if (*Status == NDIS_STATUS_PENDING) + { + NdisWaitEvent(&pAdapt->Event, 0); + *Status = pAdapt->Status; + } + + if (*Status != NDIS_STATUS_SUCCESS) + { + break; + } + PtReferenceAdapt(pAdapt); + +#pragma prefast(suppress: __WARNING_POTENTIAL_BUFFER_OVERFLOW, "Ndis guarantees MediumIndex to be within bounds"); + pAdapt->Medium = MediumArray[MediumIndex]; + + // + // Now ask NDIS to initialize our miniport (upper) edge. + // Set the flag below to synchronize with a possible call + // to our protocol Unbind handler that may come in before + // our miniport initialization happens. 
+ // + pAdapt->MiniportInitPending = TRUE; + NdisInitializeEvent(&pAdapt->MiniportInitEvent); + + PtReferenceAdapt(pAdapt); + + *Status = NdisIMInitializeDeviceInstanceEx(DriverHandle, + &pAdapt->DeviceName, + pAdapt); + + if (*Status != NDIS_STATUS_SUCCESS) + { + if (pAdapt->MiniportIsHalted == TRUE) + { + NoCleanUpNeeded = TRUE; + } + + DBGPRINT(("BindAdapter: Adapt %p, IMInitializeDeviceInstance error %x\n", + pAdapt, *Status)); + + if (PtDereferenceAdapt(pAdapt)) + { + pAdapt = NULL; + } + + break; + } + + PtDereferenceAdapt(pAdapt); + + } while(FALSE); + + // + // Close the configuration handle now - see comments above with + // the call to NdisIMInitializeDeviceInstanceEx. + // + if (ConfigHandle != NULL) + { + NdisCloseConfiguration(ConfigHandle); + } + + if ((*Status != NDIS_STATUS_SUCCESS) && (NoCleanUpNeeded == FALSE)) + { + if (pAdapt != NULL) + { + if (pAdapt->BindingHandle != NULL) + { + NDIS_STATUS LocalStatus; + + // + // Close the binding we opened above. + // + + NdisResetEvent(&pAdapt->Event); + + NdisCloseAdapter(&LocalStatus, pAdapt->BindingHandle); + pAdapt->BindingHandle = NULL; + + if (LocalStatus == NDIS_STATUS_PENDING) + { + NdisWaitEvent(&pAdapt->Event, 0); + LocalStatus = pAdapt->Status; + + + } + if (PtDereferenceAdapt(pAdapt)) + { + pAdapt = NULL; + } + } + } + } + + + DBGPRINT(("<== Protocol BindAdapter: pAdapt %p, Status %x\n", pAdapt, *Status)); +} + + +VOID +PtOpenAdapterComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS Status, + IN NDIS_STATUS OpenErrorStatus + ) +/*++ + +Routine Description: + + Completion routine for NdisOpenAdapter issued from within the PtBindAdapter. Simply + unblock the caller. + +Arguments: + + ProtocolBindingContext Pointer to the adapter + Status Status of the NdisOpenAdapter call + OpenErrorStatus Secondary status(ignored by us). 
+ +Return Value: + + None + +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + + UNREFERENCED_PARAMETER(OpenErrorStatus); + + DBGPRINT(("==> PtOpenAdapterComplete: Adapt %p, Status %x\n", pAdapt, Status)); + pAdapt->Status = Status; + NdisSetEvent(&pAdapt->Event); +} + + +VOID +PtUnbindAdapter( + OUT PNDIS_STATUS Status, + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_HANDLE UnbindContext + ) +/*++ + +Routine Description: + + Called by NDIS when we are required to unbind to the adapter below. + This functions shares functionality with the miniport's HaltHandler. + The code should ensure that NdisCloseAdapter and NdisFreeMemory is called + only once between the two functions + +Arguments: + + Status Placeholder for return status + ProtocolBindingContext Pointer to the adapter structure + UnbindContext Context for NdisUnbindComplete() if this pends + +Return Value: + + Status for NdisIMDeinitializeDeviceContext + +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + NDIS_STATUS LocalStatus; + + UNREFERENCED_PARAMETER(UnbindContext); + + DBGPRINT(("==> PtUnbindAdapter: Adapt %p\n", pAdapt)); + + // + // Set the flag that the miniport below is unbinding, so the request handlers will + // fail any request comming later + // + NdisAcquireSpinLock(&pAdapt->Lock); + pAdapt->UnbindingInProcess = TRUE; + if (pAdapt->QueuedRequest == TRUE) + { + pAdapt->QueuedRequest = FALSE; + NdisReleaseSpinLock(&pAdapt->Lock); + + PtRequestComplete(pAdapt, + &pAdapt->Request, + NDIS_STATUS_FAILURE ); + + } + else + { + NdisReleaseSpinLock(&pAdapt->Lock); + } +#ifndef WIN9X + // + // Check if we had called NdisIMInitializeDeviceInstanceEx and + // we are awaiting a call to MiniportInitialize. + // + if (pAdapt->MiniportInitPending == TRUE) + { + // + // Try to cancel the pending IMInit process. 
+ // + LocalStatus = NdisIMCancelInitializeDeviceInstance( + DriverHandle, + &pAdapt->DeviceName); + + if (LocalStatus == NDIS_STATUS_SUCCESS) + { + // + // Successfully cancelled IM Initialization; our + // Miniport Initialize routine will not be called + // for this device. + // + pAdapt->MiniportInitPending = FALSE; + ASSERT(pAdapt->MiniportHandle == NULL); + } + else + { + // + // Our Miniport Initialize routine will be called + // (may be running on another thread at this time). + // Wait for it to finish. + // + NdisWaitEvent(&pAdapt->MiniportInitEvent, 0); + ASSERT(pAdapt->MiniportInitPending == FALSE); + } + + } +#endif // !WIN9X + + // + // Call NDIS to remove our device-instance. We do most of the work + // inside the HaltHandler. + // + // The Handle will be NULL if our miniport Halt Handler has been called or + // if the IM device was never initialized + // + + if (pAdapt->MiniportHandle != NULL) + { + *Status = NdisIMDeInitializeDeviceInstance(pAdapt->MiniportHandle); + + if (*Status != NDIS_STATUS_SUCCESS) + { + *Status = NDIS_STATUS_FAILURE; + } + } + else + { + // + // We need to do some work here. + // Close the binding below us + // and release the memory allocated. + // + + if(pAdapt->BindingHandle != NULL) + { + NdisResetEvent(&pAdapt->Event); + + NdisCloseAdapter(Status, pAdapt->BindingHandle); + + // + // Wait for it to complete + // + if(*Status == NDIS_STATUS_PENDING) + { + NdisWaitEvent(&pAdapt->Event, 0); + *Status = pAdapt->Status; + } + pAdapt->BindingHandle = NULL; + } + else + { + // + // Both Our MiniportHandle and Binding Handle should not be NULL. 
+ // + *Status = NDIS_STATUS_FAILURE; + ASSERT(0); + } + + // + // Free the memory here, if was not released earlier(by calling the HaltHandler) + // + MPFreeAllPacketPools(pAdapt); + NdisFreeSpinLock(&pAdapt->Lock); + NdisFreeMemory(pAdapt, 0, 0); + } + + DBGPRINT(("<== PtUnbindAdapter: Adapt %p\n", pAdapt)); +} + +VOID +PtUnloadProtocol( + VOID +) +{ + NDIS_STATUS Status; + + if (ProtHandle != NULL) + { + NdisDeregisterProtocol(&Status, ProtHandle); + ProtHandle = NULL; + } + + DBGPRINT(("PtUnloadProtocol: done!\n")); +} + + + +VOID +PtCloseAdapterComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS Status + ) +/*++ + +Routine Description: + + Completion for the CloseAdapter call. + +Arguments: + + ProtocolBindingContext Pointer to the adapter structure + Status Completion status + +Return Value: + + None. + +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + + DBGPRINT(("CloseAdapterComplete: Adapt %p, Status %x\n", pAdapt, Status)); + pAdapt->Status = Status; + NdisSetEvent(&pAdapt->Event); +} + + +VOID +PtResetComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS Status + ) +/*++ + +Routine Description: + + Completion for the reset. + +Arguments: + + ProtocolBindingContext Pointer to the adapter structure + Status Completion status + +Return Value: + + None. + +--*/ +{ + + UNREFERENCED_PARAMETER(ProtocolBindingContext); + UNREFERENCED_PARAMETER(Status); + // + // We never issue a reset, so we should not be here. + // + ASSERT(0); +} + + +VOID +PtRequestComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_REQUEST NdisRequest, + IN NDIS_STATUS Status + ) +/*++ + +Routine Description: + + Completion handler for the previously posted request. All OIDS + are completed by and sent to the same miniport that they were requested for. 
+ If Oid == OID_PNP_QUERY_POWER then the data structure needs to returned with all entries = + NdisDeviceStateUnspecified + +Arguments: + + ProtocolBindingContext Pointer to the adapter structure + NdisRequest The posted request + Status Completion status + +Return Value: + + None + +--*/ +{ + PADAPT pAdapt = (PADAPT)ProtocolBindingContext; + NDIS_OID Oid = pAdapt->Request.DATA.SET_INFORMATION.Oid ; + + // + // Since our request is not outstanding anymore + // + ASSERT(pAdapt->OutstandingRequests == TRUE); + + pAdapt->OutstandingRequests = FALSE; + + // + // Complete the Set or Query, and fill in the buffer for OID_PNP_CAPABILITIES, if need be. + // + switch (NdisRequest->RequestType) + { + case NdisRequestQueryInformation: + + // + // We never pass OID_PNP_QUERY_POWER down. + // + ASSERT(Oid != OID_PNP_QUERY_POWER); + + if ((Oid == OID_PNP_CAPABILITIES) && (Status == NDIS_STATUS_SUCCESS)) + { + MPQueryPNPCapabilities(pAdapt, &Status); + } + *pAdapt->BytesReadOrWritten = NdisRequest->DATA.QUERY_INFORMATION.BytesWritten; + *pAdapt->BytesNeeded = NdisRequest->DATA.QUERY_INFORMATION.BytesNeeded; + + if (((Oid == OID_GEN_MAC_OPTIONS) + && (Status == NDIS_STATUS_SUCCESS)) + && (NdisDotSysVersion >= NDIS_SYS_VERSION_51)) + { + // + // Only do this on Windows XP or greater (NDIS.SYS v 5.1); + // do not do in Windows 2000 (NDIS.SYS v 5.0)) + // + + // + // Remove the no-loopback bit from mac-options. In essence we are + // telling NDIS that we can handle loopback. We don't, but the + // interface below us does. If we do not do this, then loopback + // processing happens both below us and above us. This is wasteful + // at best and if Netmon is running, it will see multiple copies + // of loopback packets when sniffing above us. + // + // Only the lowest miniport is a stack of layered miniports should + // ever report this bit set to NDIS. 
+ // + *(PULONG)NdisRequest->DATA.QUERY_INFORMATION.InformationBuffer &= ~NDIS_MAC_OPTION_NO_LOOPBACK; + } + + NdisMQueryInformationComplete(pAdapt->MiniportHandle, + Status); + break; + + case NdisRequestSetInformation: + + ASSERT( Oid != OID_PNP_SET_POWER); + + *pAdapt->BytesReadOrWritten = NdisRequest->DATA.SET_INFORMATION.BytesRead; + *pAdapt->BytesNeeded = NdisRequest->DATA.SET_INFORMATION.BytesNeeded; + NdisMSetInformationComplete(pAdapt->MiniportHandle, + Status); + break; + + default: + ASSERT(0); + break; + } + +} + + +VOID +PtStatus( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS GeneralStatus, + IN PVOID StatusBuffer, + IN UINT StatusBufferSize + ) +/*++ + +Routine Description: + + Status handler for the lower-edge(protocol). + +Arguments: + + ProtocolBindingContext Pointer to the adapter structure + GeneralStatus Status code + StatusBuffer Status buffer + StatusBufferSize Size of the status buffer + +Return Value: + + None + +--*/ +{ + PADAPT pAdapt = (PADAPT)ProtocolBindingContext; + + // + // Pass up this indication only if the upper edge miniport is initialized + // and powered on. Also ignore indications that might be sent by the lower + // miniport when it isn't at D0. 
+ // + if ((pAdapt->MiniportHandle != NULL) && + (pAdapt->MPDeviceState == NdisDeviceStateD0) && + (pAdapt->PTDeviceState == NdisDeviceStateD0)) + { + if ((GeneralStatus == NDIS_STATUS_MEDIA_CONNECT) || + (GeneralStatus == NDIS_STATUS_MEDIA_DISCONNECT)) + { + + pAdapt->LastIndicatedStatus = GeneralStatus; + } + NdisMIndicateStatus(pAdapt->MiniportHandle, + GeneralStatus, + StatusBuffer, + StatusBufferSize); + } + // + // Save the last indicated media status + // + else + { + if ((pAdapt->MiniportHandle != NULL) && + ((GeneralStatus == NDIS_STATUS_MEDIA_CONNECT) || + (GeneralStatus == NDIS_STATUS_MEDIA_DISCONNECT))) + { + pAdapt->LatestUnIndicateStatus = GeneralStatus; + } + } + +} + + +VOID +PtStatusComplete( + IN NDIS_HANDLE ProtocolBindingContext + ) +/*++ + +Routine Description: + + +Arguments: + + +Return Value: + + +--*/ +{ + PADAPT pAdapt = (PADAPT)ProtocolBindingContext; + + // + // Pass up this indication only if the upper edge miniport is initialized + // and powered on. Also ignore indications that might be sent by the lower + // miniport when it isn't at D0. + // + if ((pAdapt->MiniportHandle != NULL) && + (pAdapt->MPDeviceState == NdisDeviceStateD0) && + (pAdapt->PTDeviceState == NdisDeviceStateD0)) + { + NdisMIndicateStatusComplete(pAdapt->MiniportHandle); + } +} + + +VOID +PtSendComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_PACKET Packet, + IN NDIS_STATUS Status + ) +/*++ + +Routine Description: + + Called by NDIS when the miniport below had completed a send. We should + complete the corresponding upper-edge send this represents. + +Arguments: + + ProtocolBindingContext - Points to ADAPT structure + Packet - Low level packet being completed + Status - status of send + +Return Value: + + None + +--*/ +{ + PADAPT pAdapt = (PADAPT)ProtocolBindingContext; + PNDIS_PACKET Pkt; + NDIS_HANDLE PoolHandle; + +#ifdef NDIS51 + // + // Packet stacking: + // + // Determine if the packet we are completing is the one we allocated. 
If so, then + // get the original packet from the reserved area and completed it and free the + // allocated packet. If this is the packet that was sent down to us, then just + // complete it + // + PoolHandle = NdisGetPoolFromPacket(Packet); + if (PoolHandle != pAdapt->SendPacketPoolHandle) + { + // + // We had passed down a packet belonging to the protocol above us. + // + // DBGPRINT(("PtSendComp: Adapt %p, Stacked Packet %p\n", pAdapt, Packet)); + + NdisMSendComplete(pAdapt->MiniportHandle, + Packet, + Status); + } + else +#endif // NDIS51 + { + PSEND_RSVD SendRsvd; + + SendRsvd = (PSEND_RSVD)(Packet->ProtocolReserved); + Pkt = SendRsvd->OriginalPkt; + +#if 1 // IPFW - new code + //DbgPrint("SendComplete: packet %p pkt %p\n", Packet, Pkt); + if (Pkt == NULL) { //this is a reinjected packet, with no 'father' + CleanupReinjected(Packet, SendRsvd->pMbuf, pAdapt); + return; + } +#endif /* IPFW */ + +#ifndef WIN9X + NdisIMCopySendCompletePerPacketInfo (Pkt, Packet); +#endif + + NdisDprFreePacket(Packet); + + NdisMSendComplete(pAdapt->MiniportHandle, + Pkt, + Status); + } + // + // Decrease the outstanding send count + // + ADAPT_DECR_PENDING_SENDS(pAdapt); +} + + +VOID +PtTransferDataComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_PACKET Packet, + IN NDIS_STATUS Status, + IN UINT BytesTransferred + ) +/*++ + +Routine Description: + + Entry point called by NDIS to indicate completion of a call by us + to NdisTransferData. + + See notes under SendComplete. 
+ +Arguments: + +Return Value: + +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + + if(pAdapt->MiniportHandle) + { + NdisMTransferDataComplete(pAdapt->MiniportHandle, + Packet, + Status, + BytesTransferred); + } +} + + +NDIS_STATUS +PtReceive( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_HANDLE MacReceiveContext, + IN PVOID HeaderBuffer, + IN UINT HeaderBufferSize, + IN PVOID LookAheadBuffer, + IN UINT LookAheadBufferSize, + IN UINT PacketSize + ) +/*++ + +Routine Description: + + Handle receive data indicated up by the miniport below. We pass + it along to the protocol above us. + + If the miniport below indicates packets, NDIS would more + likely call us at our ReceivePacket handler. However we + might be called here in certain situations even though + the miniport below has indicated a receive packet, e.g. + if the miniport had set packet status to NDIS_STATUS_RESOURCES. + +Arguments: + + + +Return Value: + + NDIS_STATUS_SUCCESS if we processed the receive successfully, + NDIS_STATUS_XXX error code if we discarded it. + +--*/ +{ + PADAPT pAdapt = (PADAPT)ProtocolBindingContext; + PNDIS_PACKET MyPacket, Packet = NULL; + NDIS_STATUS Status = NDIS_STATUS_SUCCESS; + ULONG Proc = KeGetCurrentProcessorNumber(); + + if ((!pAdapt->MiniportHandle) || (pAdapt->MPDeviceState > NdisDeviceStateD0)) + { + Status = NDIS_STATUS_FAILURE; + } + else do + { + // + // Get at the packet, if any, indicated up by the miniport below. + // + Packet = NdisGetReceivedPacket(pAdapt->BindingHandle, MacReceiveContext); + if (Packet != NULL) + { + // + // The miniport below did indicate up a packet. Use information + // from that packet to construct a new packet to indicate up. + // + +#ifdef NDIS51 + // + // NDIS 5.1 NOTE: Do not reuse the original packet in indicating + // up a receive, even if there is sufficient packet stack space. 
+ // If we had to do so, we would have had to overwrite the + // status field in the original packet to NDIS_STATUS_RESOURCES, + // and it is not allowed for protocols to overwrite this field + // in received packets. + // +#endif // NDIS51 + + // + // Get a packet off the pool and indicate that up + // + NdisDprAllocatePacket(&Status, + &MyPacket, + pAdapt->RecvPacketPoolHandle); + + if (Status == NDIS_STATUS_SUCCESS) + { + // + // Make our packet point to data from the original + // packet. NOTE: this works only because we are + // indicating a receive directly from the context of + // our receive indication. If we need to queue this + // packet and indicate it from another thread context, + // we will also have to allocate a new buffer and copy + // over the packet contents, OOB data and per-packet + // information. This is because the packet data + // is available only for the duration of this + // receive indication call. + // + NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet); + NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet); + + // + // Get the original packet (it could be the same packet as the + // one received or a different one based on the number of layered + // miniports below) and set it on the indicated packet so the OOB + // data is visible correctly at protocols above. If the IM driver + // modifies the packet in any way it should not set the new packet's + // original packet equal to the original packet of the packet that + // was indicated to it from the underlying driver, in this case, the + // IM driver should also ensure that the related per packet info should + // be copied to the new packet. + // we can set the original packet to the original packet of the packet + // indicated from the underlying driver because the driver doesn't modify + // the data content in the packet. 
+ // + NDIS_SET_ORIGINAL_PACKET(MyPacket, NDIS_GET_ORIGINAL_PACKET(Packet)); + NDIS_SET_PACKET_HEADER_SIZE(MyPacket, HeaderBufferSize); + + // + // Copy packet flags. + // + NdisGetPacketFlags(MyPacket) = NdisGetPacketFlags(Packet); + + // + // Force protocols above to make a copy if they want to hang + // on to data in this packet. This is because we are in our + // Receive handler (not ReceivePacket) and we can't return a + // ref count from here. + // + NDIS_SET_PACKET_STATUS(MyPacket, NDIS_STATUS_RESOURCES); + + // + // By setting NDIS_STATUS_RESOURCES, we also know that we can reclaim + // this packet as soon as the call to NdisMIndicateReceivePacket + // returns. + // + + if (pAdapt->MiniportHandle != NULL) + { +#if 1 /* IPFW: query the firewall */ + int ret; + ret = ipfw2_qhandler_w32(MyPacket, INCOMING, + ProtocolBindingContext); + if (ret != PASS) + return 0; //otherwise simply continue +#endif /* end of IPFW code */ + NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &MyPacket, 1); + } + + // + // Reclaim the indicated packet. Since we had set its status + // to NDIS_STATUS_RESOURCES, we are guaranteed that protocols + // above are done with it. + // + NdisDprFreePacket(MyPacket); + + break; + } + } + else + { + // + // The miniport below us uses the old-style (not packet) + // receive indication. Fall through. 
+ // + } + + // + // Fall through if the miniport below us has either not + // indicated a packet or we could not allocate one + // + pAdapt->ReceivedIndicationFlags[Proc] = TRUE; + if (pAdapt->MiniportHandle == NULL) + { + break; + } + switch (pAdapt->Medium) + { + case NdisMedium802_3: + case NdisMediumWan: + //DbgPrint("EthIndicateReceive context %p, header at %p len %u, lookahead at %p len %u, packetsize %u\n",ProtocolBindingContext,HeaderBuffer,HeaderBufferSize,LookAheadBuffer,LookAheadBufferSize,PacketSize); + //hexdump(HeaderBuffer,HeaderBufferSize+LookAheadBufferSize,"EthIndicateReceive"); + { + int ret = ipfw2_qhandler_w32_oldstyle(INCOMING, ProtocolBindingContext, HeaderBuffer, HeaderBufferSize, LookAheadBuffer, LookAheadBufferSize, PacketSize); + if (ret != PASS) + return NDIS_STATUS_SUCCESS; + } + NdisMEthIndicateReceive(pAdapt->MiniportHandle, + MacReceiveContext, + HeaderBuffer, + HeaderBufferSize, + LookAheadBuffer, + LookAheadBufferSize, + PacketSize); + break; + + case NdisMedium802_5: + NdisMTrIndicateReceive(pAdapt->MiniportHandle, + MacReceiveContext, + HeaderBuffer, + HeaderBufferSize, + LookAheadBuffer, + LookAheadBufferSize, + PacketSize); + break; + +#if FDDI + case NdisMediumFddi: + NdisMFddiIndicateReceive(pAdapt->MiniportHandle, + MacReceiveContext, + HeaderBuffer, + HeaderBufferSize, + LookAheadBuffer, + LookAheadBufferSize, + PacketSize); + break; +#endif + default: + ASSERT(FALSE); + break; + } + + } while(FALSE); + + return Status; +} + + +VOID +PtReceiveComplete( + IN NDIS_HANDLE ProtocolBindingContext + ) +/*++ + +Routine Description: + + Called by the adapter below us when it is done indicating a batch of + received packets. + +Arguments: + + ProtocolBindingContext Pointer to our adapter structure. 
+ +Return Value: + + None + +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + ULONG Proc = KeGetCurrentProcessorNumber(); + + /* Warning: this is a poor implementation of the PtReceiveComplete + * made by MS, and it's a well known (but never fixed) issue. + * Since the ProcessorNumber here can be different from the one + * that processed the PtReceive, sometimes NdisMEthIndicateReceiveComplete + * will not be called, causing poor performance in the incoming traffic. + * In our driver, PtReceive is called for IP packets ONLY by particulary + * old NIC drivers, and the poor performance can be seen even + * in traffic not handled by ipfw or dummynet. + * Fortunately, this is quite rare, all the incoming IP packets + * will arrive through PtReceivePacket, and this callback will never + * be called. For reinjected traffic, a workaround is done + * commuting the ReceivedIndicationFlag and calling + * NdisMEthIndicateReceiveComplete manually for each packet. + */ + + if (((pAdapt->MiniportHandle != NULL) + && (pAdapt->MPDeviceState == NdisDeviceStateD0)) + && (pAdapt->ReceivedIndicationFlags[Proc])) + { + switch (pAdapt->Medium) + { + case NdisMedium802_3: + case NdisMediumWan: + NdisMEthIndicateReceiveComplete(pAdapt->MiniportHandle); + break; + + case NdisMedium802_5: + NdisMTrIndicateReceiveComplete(pAdapt->MiniportHandle); + break; +#if FDDI + case NdisMediumFddi: + NdisMFddiIndicateReceiveComplete(pAdapt->MiniportHandle); + break; +#endif + default: + ASSERT(FALSE); + break; + } + } + + pAdapt->ReceivedIndicationFlags[Proc] = FALSE; +} + + +INT +PtReceivePacket( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_PACKET Packet + ) +/*++ + +Routine Description: + + ReceivePacket handler. Called by NDIS if the miniport below supports + NDIS 4.0 style receives. Re-package the buffer chain in a new packet + and indicate the new packet to protocols above us. Any context for + packets indicated up must be kept in the MiniportReserved field. 
+ + NDIS 5.1 - packet stacking - if there is sufficient "stack space" in + the packet passed to us, we can use the same packet in a receive + indication. + +Arguments: + + ProtocolBindingContext - Pointer to our adapter structure. + Packet - Pointer to the packet + +Return Value: + + == 0 -> We are done with the packet + != 0 -> We will keep the packet and call NdisReturnPackets() this + many times when done. +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + NDIS_STATUS Status; + PNDIS_PACKET MyPacket; + BOOLEAN Remaining; + + // + // Drop the packet silently if the upper miniport edge isn't initialized or + // the miniport edge is in low power state + // + if ((!pAdapt->MiniportHandle) || (pAdapt->MPDeviceState > NdisDeviceStateD0)) + { + return 0; + } + +#ifdef NDIS51 + // + // Check if we can reuse the same packet for indicating up. + // See also: PtReceive(). + // + (VOID)NdisIMGetCurrentPacketStack(Packet, &Remaining); + if (0 && Remaining) + { + // + // We can reuse "Packet". Indicate it up and be done with it. + // + Status = NDIS_GET_PACKET_STATUS(Packet); + NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &Packet, 1); + return((Status != NDIS_STATUS_RESOURCES) ? 1 : 0); + } +#endif // NDIS51 + + // + // Get a packet off the pool and indicate that up + // + NdisDprAllocatePacket(&Status, + &MyPacket, + pAdapt->RecvPacketPoolHandle); + + if (Status == NDIS_STATUS_SUCCESS) + { + PRECV_RSVD RecvRsvd; + + RecvRsvd = (PRECV_RSVD)(MyPacket->MiniportReserved); + RecvRsvd->OriginalPkt = Packet; + + NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet); + NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet); + + // + // Get the original packet (it could be the same packet as the one + // received or a different one based on the number of layered miniports + // below) and set it on the indicated packet so the OOB data is visible + // correctly to protocols above us. 
+ // + NDIS_SET_ORIGINAL_PACKET(MyPacket, NDIS_GET_ORIGINAL_PACKET(Packet)); + + // + // Set Packet Flags + // + NdisGetPacketFlags(MyPacket) = NdisGetPacketFlags(Packet); + + Status = NDIS_GET_PACKET_STATUS(Packet); + + NDIS_SET_PACKET_STATUS(MyPacket, Status); + NDIS_SET_PACKET_HEADER_SIZE(MyPacket, NDIS_GET_PACKET_HEADER_SIZE(Packet)); + + if (pAdapt->MiniportHandle != NULL) + { +#if 1 /* IPFW: query the firewall */ + int ret; + ret = ipfw2_qhandler_w32(MyPacket, INCOMING, + ProtocolBindingContext); + if (ret != PASS) + return 0; //otherwise simply continue +#endif /* end of IPFW code */ + NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &MyPacket, 1); + } + + // + // Check if we had indicated up the packet with NDIS_STATUS_RESOURCES + // NOTE -- do not use NDIS_GET_PACKET_STATUS(MyPacket) for this since + // it might have changed! Use the value saved in the local variable. + // + if (Status == NDIS_STATUS_RESOURCES) + { + // + // Our ReturnPackets handler will not be called for this packet. + // We should reclaim it right here. + // + NdisDprFreePacket(MyPacket); + } + + return((Status != NDIS_STATUS_RESOURCES) ? 1 : 0); + } + else + { + // + // We are out of packets. Silently drop it. + // + return(0); + } +} + + +NDIS_STATUS +PtPNPHandler( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNET_PNP_EVENT pNetPnPEvent + ) + +/*++ +Routine Description: + + This is called by NDIS to notify us of a PNP event related to a lower + binding. Based on the event, this dispatches to other helper routines. + + NDIS 5.1: forward this event to the upper protocol(s) by calling + NdisIMNotifyPnPEvent. + +Arguments: + + ProtocolBindingContext - Pointer to our adapter structure. Can be NULL + for "global" notifications + + pNetPnPEvent - Pointer to the PNP event to be processed. + +Return Value: + + NDIS_STATUS code indicating status of event processing. 
+ +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + NDIS_STATUS Status = NDIS_STATUS_SUCCESS; + + DBGPRINT(("PtPnPHandler: Adapt %p, Event %d\n", pAdapt, pNetPnPEvent->NetEvent)); + + switch (pNetPnPEvent->NetEvent) + { + case NetEventSetPower: + Status = PtPnPNetEventSetPower(pAdapt, pNetPnPEvent); + break; + + case NetEventReconfigure: + Status = PtPnPNetEventReconfigure(pAdapt, pNetPnPEvent); + break; + + default: +#ifdef NDIS51 + // + // Pass on this notification to protocol(s) above, before + // doing anything else with it. + // + if (pAdapt && pAdapt->MiniportHandle) + { + Status = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent); + } +#else + Status = NDIS_STATUS_SUCCESS; + +#endif // NDIS51 + + break; + } + + return Status; +} + + +NDIS_STATUS +PtPnPNetEventReconfigure( + IN PADAPT pAdapt, + IN PNET_PNP_EVENT pNetPnPEvent + ) +/*++ +Routine Description: + + This routine is called from NDIS to notify our protocol edge of a + reconfiguration of parameters for either a specific binding (pAdapt + is not NULL), or global parameters if any (pAdapt is NULL). + +Arguments: + + pAdapt - Pointer to our adapter structure. + pNetPnPEvent - the reconfigure event + +Return Value: + + NDIS_STATUS_SUCCESS + +--*/ +{ + NDIS_STATUS ReconfigStatus = NDIS_STATUS_SUCCESS; + NDIS_STATUS ReturnStatus = NDIS_STATUS_SUCCESS; + + do + { + // + // Is this is a global reconfiguration notification ? + // + if (pAdapt == NULL) + { + // + // An important event that causes this notification to us is if + // one of our upper-edge miniport instances was enabled after being + // disabled earlier, e.g. from Device Manager in Win2000. Note that + // NDIS calls this because we had set up an association between our + // miniport and protocol entities by calling NdisIMAssociateMiniport. + // + // Since we would have torn down the lower binding for that miniport, + // we need NDIS' assistance to re-bind to the lower miniport. 
The + // call to NdisReEnumerateProtocolBindings does exactly that. + // + NdisReEnumerateProtocolBindings (ProtHandle); + + break; + } + +#ifdef NDIS51 + // + // Pass on this notification to protocol(s) above before doing anything + // with it. + // + if (pAdapt->MiniportHandle) + { + ReturnStatus = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent); + } +#endif // NDIS51 + + ReconfigStatus = NDIS_STATUS_SUCCESS; + + } while(FALSE); + + DBGPRINT(("<==PtPNPNetEventReconfigure: pAdapt %p\n", pAdapt)); + +#ifdef NDIS51 + // + // Overwrite status with what upper-layer protocol(s) returned. + // + ReconfigStatus = ReturnStatus; +#endif + + return ReconfigStatus; +} + + +NDIS_STATUS +PtPnPNetEventSetPower( + IN PADAPT pAdapt, + IN PNET_PNP_EVENT pNetPnPEvent + ) +/*++ +Routine Description: + + This is a notification to our protocol edge of the power state + of the lower miniport. If it is going to a low-power state, we must + wait here for all outstanding sends and requests to complete. + + NDIS 5.1: Since we use packet stacking, it is not sufficient to + check usage of our local send packet pool to detect whether or not + all outstanding sends have completed. For this, use the new API + NdisQueryPendingIOCount. + + NDIS 5.1: Use the 5.1 API NdisIMNotifyPnPEvent to pass on PnP + notifications to upper protocol(s). + +Arguments: + + pAdapt - Pointer to the adpater structure + pNetPnPEvent - The Net Pnp Event. this contains the new device state + +Return Value: + + NDIS_STATUS_SUCCESS or the status returned by upper-layer protocols. 
+ +--*/ +{ + PNDIS_DEVICE_POWER_STATE pDeviceState =(PNDIS_DEVICE_POWER_STATE)(pNetPnPEvent->Buffer); + NDIS_DEVICE_POWER_STATE PrevDeviceState = pAdapt->PTDeviceState; + NDIS_STATUS Status; + NDIS_STATUS ReturnStatus; + + ReturnStatus = NDIS_STATUS_SUCCESS; + + // + // Set the Internal Device State, this blocks all new sends or receives + // + NdisAcquireSpinLock(&pAdapt->Lock); + pAdapt->PTDeviceState = *pDeviceState; + + // + // Check if the miniport below is going to a low power state. + // + if (pAdapt->PTDeviceState > NdisDeviceStateD0) + { + // + // If the miniport below is going to standby, fail all incoming requests + // + if (PrevDeviceState == NdisDeviceStateD0) + { + pAdapt->StandingBy = TRUE; + } + + NdisReleaseSpinLock(&pAdapt->Lock); + +#ifdef NDIS51 + // + // Notify upper layer protocol(s) first. + // + if (pAdapt->MiniportHandle != NULL) + { + ReturnStatus = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent); + } +#endif // NDIS51 + + // + // Wait for outstanding sends and requests to complete. + // + while (pAdapt->OutstandingSends != 0) + { + NdisMSleep(2); + } + + while (pAdapt->OutstandingRequests == TRUE) + { + // + // sleep till outstanding requests complete + // + NdisMSleep(2); + } + + // + // If the below miniport is going to low power state, complete the queued request + // + NdisAcquireSpinLock(&pAdapt->Lock); + if (pAdapt->QueuedRequest) + { + pAdapt->QueuedRequest = FALSE; + NdisReleaseSpinLock(&pAdapt->Lock); + PtRequestComplete(pAdapt, &pAdapt->Request, NDIS_STATUS_FAILURE); + } + else + { + NdisReleaseSpinLock(&pAdapt->Lock); + } + + + ASSERT(NdisPacketPoolUsage(pAdapt->SendPacketPoolHandle) == 0); + ASSERT(pAdapt->OutstandingRequests == FALSE); + } + else + { + // + // If the physical miniport is powering up (from Low power state to D0), + // clear the flag + // + if (PrevDeviceState > NdisDeviceStateD0) + { + pAdapt->StandingBy = FALSE; + } + // + // The device below is being turned on. 
If we had a request + // pending, send it down now. + // + if (pAdapt->QueuedRequest == TRUE) + { + pAdapt->QueuedRequest = FALSE; + + pAdapt->OutstandingRequests = TRUE; + NdisReleaseSpinLock(&pAdapt->Lock); + + NdisRequest(&Status, + pAdapt->BindingHandle, + &pAdapt->Request); + + if (Status != NDIS_STATUS_PENDING) + { + PtRequestComplete(pAdapt, + &pAdapt->Request, + Status); + + } + } + else + { + NdisReleaseSpinLock(&pAdapt->Lock); + } + + +#ifdef NDIS51 + // + // Pass on this notification to protocol(s) above + // + if (pAdapt->MiniportHandle) + { + ReturnStatus = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent); + } +#endif // NDIS51 + + } + + return ReturnStatus; +} + +VOID +PtReferenceAdapt( + IN PADAPT pAdapt + ) +{ + NdisAcquireSpinLock(&pAdapt->Lock); + + ASSERT(pAdapt->RefCount >= 0); + + pAdapt->RefCount ++; + NdisReleaseSpinLock(&pAdapt->Lock); +} + + +BOOLEAN +PtDereferenceAdapt( + IN PADAPT pAdapt + ) +{ + NdisAcquireSpinLock(&pAdapt->Lock); + + ASSERT(pAdapt->RefCount > 0); + + pAdapt->RefCount--; + + if (pAdapt->RefCount == 0) + { + NdisReleaseSpinLock(&pAdapt->Lock); + + // + // Free all resources on this adapter structure. + // + MPFreeAllPacketPools (pAdapt);; + NdisFreeSpinLock(&pAdapt->Lock); + NdisFreeMemory(pAdapt, 0 , 0); + + return TRUE; + + } + else + { + NdisReleaseSpinLock(&pAdapt->Lock); + + return FALSE; + } +} + + diff --git a/original_passthru/makefile b/original_passthru/makefile new file mode 100644 index 0000000..c6c9e94 --- /dev/null +++ b/original_passthru/makefile @@ -0,0 +1,22 @@ +# +# DO NOT EDIT THIS FILE!!! Edit .\sources. if you want to add a new source +# file to this component. This file merely indirects to the real make file +# that is shared by all the components of NT +# + +#!INCLUDE $(NTMAKEENV)\makefile.def + + +!IF DEFINED(_NT_TARGET_VERSION) +! IF $(_NT_TARGET_VERSION)>=0x501 +! INCLUDE $(NTMAKEENV)\makefile.def +! ELSE +# Only warn once per directory +! INCLUDE $(NTMAKEENV)\makefile.plt +! 
IF "$(BUILD_PASS)"=="PASS1" +! message BUILDMSG: Warning : The sample "$(MAKEDIR)" is not valid for the current OS target. +! ENDIF +! ENDIF +!ELSE +! INCLUDE $(NTMAKEENV)\makefile.def +!ENDIF diff --git a/original_passthru/miniport.c b/original_passthru/miniport.c new file mode 100644 index 0000000..a7f3bbc --- /dev/null +++ b/original_passthru/miniport.c @@ -0,0 +1,1461 @@ +/*++ + +Copyright (c) 1992-2000 Microsoft Corporation + +Module Name: + + miniport.c + +Abstract: + + Ndis Intermediate Miniport driver sample. This is a passthru driver. + +Author: + +Environment: + + +Revision History: + + +--*/ + +#include "precomp.h" +#pragma hdrstop + + + +NDIS_STATUS +MPInitialize( + OUT PNDIS_STATUS OpenErrorStatus, + OUT PUINT SelectedMediumIndex, + IN PNDIS_MEDIUM MediumArray, + IN UINT MediumArraySize, + IN NDIS_HANDLE MiniportAdapterHandle, + IN NDIS_HANDLE WrapperConfigurationContext + ) +/*++ + +Routine Description: + + This is the initialize handler which gets called as a result of + the BindAdapter handler calling NdisIMInitializeDeviceInstanceEx. + The context parameter which we pass there is the adapter structure + which we retrieve here. + + Arguments: + + OpenErrorStatus Not used by us. + SelectedMediumIndex Place-holder for what media we are using + MediumArray Array of ndis media passed down to us to pick from + MediumArraySize Size of the array + MiniportAdapterHandle The handle NDIS uses to refer to us + WrapperConfigurationContext For use by NdisOpenConfiguration + +Return Value: + + NDIS_STATUS_SUCCESS unless something goes wrong + +--*/ +{ + UINT i; + PADAPT pAdapt; + NDIS_STATUS Status = NDIS_STATUS_FAILURE; + NDIS_MEDIUM Medium; + + UNREFERENCED_PARAMETER(WrapperConfigurationContext); + + do + { + // + // Start off by retrieving our adapter context and storing + // the Miniport handle in it. 
+ // + pAdapt = NdisIMGetDeviceContext(MiniportAdapterHandle); + pAdapt->MiniportIsHalted = FALSE; + + DBGPRINT(("==> Miniport Initialize: Adapt %p\n", pAdapt)); + + // + // Usually we export the medium type of the adapter below as our + // virtual miniport's medium type. However if the adapter below us + // is a WAN device, then we claim to be of medium type 802.3. + // + Medium = pAdapt->Medium; + + if (Medium == NdisMediumWan) + { + Medium = NdisMedium802_3; + } + + for (i = 0; i < MediumArraySize; i++) + { + if (MediumArray[i] == Medium) + { + *SelectedMediumIndex = i; + break; + } + } + + if (i == MediumArraySize) + { + Status = NDIS_STATUS_UNSUPPORTED_MEDIA; + break; + } + + + // + // Set the attributes now. NDIS_ATTRIBUTE_DESERIALIZE enables us + // to make up-calls to NDIS without having to call NdisIMSwitchToMiniport + // or NdisIMQueueCallBack. This also forces us to protect our data using + // spinlocks where appropriate. Also in this case NDIS does not queue + // packets on our behalf. Since this is a very simple pass-thru + // miniport, we do not have a need to protect anything. However in + // a general case there will be a need to use per-adapter spin-locks + // for the packet queues at the very least. + // + NdisMSetAttributesEx(MiniportAdapterHandle, + pAdapt, + 0, // CheckForHangTimeInSeconds + NDIS_ATTRIBUTE_IGNORE_PACKET_TIMEOUT | + NDIS_ATTRIBUTE_IGNORE_REQUEST_TIMEOUT| + NDIS_ATTRIBUTE_INTERMEDIATE_DRIVER | + NDIS_ATTRIBUTE_DESERIALIZE | + NDIS_ATTRIBUTE_NO_HALT_ON_SUSPEND, + 0); + + pAdapt->MiniportHandle = MiniportAdapterHandle; + // + // Initialize LastIndicatedStatus to be NDIS_STATUS_MEDIA_CONNECT + // + pAdapt->LastIndicatedStatus = NDIS_STATUS_MEDIA_CONNECT; + + // + // Initialize the power states for both the lower binding (PTDeviceState) + // and our miniport edge to Powered On. 
+ // + pAdapt->MPDeviceState = NdisDeviceStateD0; + pAdapt->PTDeviceState = NdisDeviceStateD0; + + // + // Add this adapter to the global pAdapt List + // + NdisAcquireSpinLock(&GlobalLock); + + pAdapt->Next = pAdaptList; + pAdaptList = pAdapt; + + NdisReleaseSpinLock(&GlobalLock); + + // + // Create an ioctl interface + // + (VOID)PtRegisterDevice(); + + Status = NDIS_STATUS_SUCCESS; + } + while (FALSE); + + // + // If we had received an UnbindAdapter notification on the underlying + // adapter, we would have blocked that thread waiting for the IM Init + // process to complete. Wake up any such thread. + // + ASSERT(pAdapt->MiniportInitPending == TRUE); + pAdapt->MiniportInitPending = FALSE; + NdisSetEvent(&pAdapt->MiniportInitEvent); + + if (Status == NDIS_STATUS_SUCCESS) + { + PtReferenceAdapt(pAdapt); + } + + DBGPRINT(("<== Miniport Initialize: Adapt %p, Status %x\n", pAdapt, Status)); + + *OpenErrorStatus = Status; + + + return Status; +} + + +NDIS_STATUS +MPSend( + IN NDIS_HANDLE MiniportAdapterContext, + IN PNDIS_PACKET Packet, + IN UINT Flags + ) +/*++ + +Routine Description: + + Send Packet handler. Either this or our SendPackets (array) handler is called + based on which one is enabled in our Miniport Characteristics. + +Arguments: + + MiniportAdapterContext Pointer to the adapter + Packet Packet to send + Flags Unused, passed down below + +Return Value: + + Return code from NdisSend + +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + NDIS_STATUS Status; + PNDIS_PACKET MyPacket; + PVOID MediaSpecificInfo = NULL; + ULONG MediaSpecificInfoSize = 0; + + // + // The driver should fail the send if the virtual miniport is in low + // power state + // + if (pAdapt->MPDeviceState > NdisDeviceStateD0) + { + return NDIS_STATUS_FAILURE; + } + +#ifdef NDIS51 + // + // Use NDIS 5.1 packet stacking: + // + { + PNDIS_PACKET_STACK pStack; + BOOLEAN Remaining; + + // + // Packet stacks: Check if we can use the same packet for sending down. 
+ // + + pStack = NdisIMGetCurrentPacketStack(Packet, &Remaining); + if (Remaining) + { + // + // We can reuse "Packet". + // + // NOTE: if we needed to keep per-packet information in packets + // sent down, we can use pStack->IMReserved[]. + // + ASSERT(pStack); + // + // If the below miniport is going to low power state, stop sending down any packet. + // + NdisAcquireSpinLock(&pAdapt->Lock); + if (pAdapt->PTDeviceState > NdisDeviceStateD0) + { + NdisReleaseSpinLock(&pAdapt->Lock); + return NDIS_STATUS_FAILURE; + } + pAdapt->OutstandingSends++; + NdisReleaseSpinLock(&pAdapt->Lock); + NdisSend(&Status, + pAdapt->BindingHandle, + Packet); + + if (Status != NDIS_STATUS_PENDING) + { + ADAPT_DECR_PENDING_SENDS(pAdapt); + } + + return(Status); + } + } +#endif // NDIS51 + + // + // We are either not using packet stacks, or there isn't stack space + // in the original packet passed down to us. Allocate a new packet + // to wrap the data with. + // + // + // If the below miniport is going to low power state, stop sending down any packet. + // + NdisAcquireSpinLock(&pAdapt->Lock); + if (pAdapt->PTDeviceState > NdisDeviceStateD0) + { + NdisReleaseSpinLock(&pAdapt->Lock); + return NDIS_STATUS_FAILURE; + + } + pAdapt->OutstandingSends++; + NdisReleaseSpinLock(&pAdapt->Lock); + + NdisAllocatePacket(&Status, + &MyPacket, + pAdapt->SendPacketPoolHandle); + + if (Status == NDIS_STATUS_SUCCESS) + { + PSEND_RSVD SendRsvd; + + // + // Save a pointer to the original packet in our reserved + // area in the new packet. This is needed so that we can + // get back to the original packet when the new packet's send + // is completed. + // + SendRsvd = (PSEND_RSVD)(MyPacket->ProtocolReserved); + SendRsvd->OriginalPkt = Packet; + + NdisGetPacketFlags(MyPacket) = Flags; + + // + // Set up the new packet so that it describes the same + // data as the original packet. 
VOID
MPSendPackets(
    IN NDIS_HANDLE             MiniportAdapterContext,
    IN PPNDIS_PACKET           PacketArray,
    IN UINT                    NumberOfPackets
    )
/*++

Routine Description:

    Send Packet Array handler. Either this or our SendPacket handler is called
    based on which one is enabled in our Miniport Characteristics.

    Each packet in the array is handled independently and takes one of
    three paths:

      1. The miniport edge is not at D0: the packet is completed at once
         with NDIS_STATUS_FAILURE.
      2. (NDIS51 builds) The packet still has packet-stack space: the
         original packet is sent down unchanged.
      3. Otherwise a packet from SendPacketPoolHandle is set up to describe
         the same buffers and is sent down in its place.

    On paths 2 and 3, OutstandingSends is incremented under pAdapt->Lock
    before the send. Whenever the send does not pend, the packet is
    completed to the caller here with NdisMSendComplete and
    ADAPT_DECR_PENDING_SENDS is invoked (presumably the matching decrement;
    the macro is defined outside this file).

Arguments:

    MiniportAdapterContext     Pointer to our adapter
    PacketArray                Set of packets to send
    NumberOfPackets            Number of entries in PacketArray

Return Value:

    None. Per-packet status is reported through NdisMSendComplete unless
    the lower send returned NDIS_STATUS_PENDING.

--*/
{
    PADAPT              pAdapt = (PADAPT)MiniportAdapterContext;
    NDIS_STATUS         Status;
    UINT                i;
    PVOID               MediaSpecificInfo = NULL;
    UINT                MediaSpecificInfoSize = 0;


    for (i = 0; i < NumberOfPackets; i++)
    {
        PNDIS_PACKET    Packet, MyPacket;

        Packet = PacketArray[i];
        //
        // The driver should fail the send if the virtual miniport is in low
        // power state
        //
        if (pAdapt->MPDeviceState > NdisDeviceStateD0)
        {
            NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt),
                            Packet,
                            NDIS_STATUS_FAILURE);
            continue;
        }

#ifdef NDIS51

        //
        // Use NDIS 5.1 packet stacking:
        //
        {
            PNDIS_PACKET_STACK        pStack;
            BOOLEAN                   Remaining;

            //
            // Packet stacks: Check if we can use the same packet for sending down.
            //
            pStack = NdisIMGetCurrentPacketStack(Packet, &Remaining);
            if (Remaining)
            {
                //
                // We can reuse "Packet".
                //
                // NOTE: if we needed to keep per-packet information in packets
                // sent down, we can use pStack->IMReserved[].
                //
                ASSERT(pStack);
                //
                // If the below miniport is going to low power state, stop sending down any packet.
                // PTDeviceState is tested and OutstandingSends is bumped
                // inside the same lock section.
                //
                NdisAcquireSpinLock(&pAdapt->Lock);
                if (pAdapt->PTDeviceState > NdisDeviceStateD0)
                {
                    NdisReleaseSpinLock(&pAdapt->Lock);
                    NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt),
                                        Packet,
                                        NDIS_STATUS_FAILURE);
                }
                else
                {
                    pAdapt->OutstandingSends++;
                    NdisReleaseSpinLock(&pAdapt->Lock);

                    NdisSend(&Status,
                              pAdapt->BindingHandle,
                              Packet);

                    //
                    // The send finished without pending, so complete the
                    // packet to the caller here and undo the accounting.
                    //
                    if (Status != NDIS_STATUS_PENDING)
                    {
                        NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt),
                                            Packet,
                                            Status);

                        ADAPT_DECR_PENDING_SENDS(pAdapt);
                    }
                }
                //
                // This packet has been fully dealt with on the stacking path.
                //
                continue;
            }
        }
#endif
        //
        // No packet-stack space (or not an NDIS51 build): wrap the data in
        // a packet of our own. The do/while(FALSE) only exists so that
        // "break" can jump to the common completion code below.
        //
        do
        {
            NdisAcquireSpinLock(&pAdapt->Lock);
            //
            // If the below miniport is going to low power state, stop sending down any packet.
            //
            if (pAdapt->PTDeviceState > NdisDeviceStateD0)
            {
                NdisReleaseSpinLock(&pAdapt->Lock);
                Status = NDIS_STATUS_FAILURE;
                break;
            }
            pAdapt->OutstandingSends++;
            NdisReleaseSpinLock(&pAdapt->Lock);

            NdisAllocatePacket(&Status,
                               &MyPacket,
                               pAdapt->SendPacketPoolHandle);

            if (Status == NDIS_STATUS_SUCCESS)
            {
                PSEND_RSVD        SendRsvd;

                //
                // Remember the original packet in the new packet's
                // ProtocolReserved area.
                //
                SendRsvd = (PSEND_RSVD)(MyPacket->ProtocolReserved);
                SendRsvd->OriginalPkt = Packet;

                NdisGetPacketFlags(MyPacket) = NdisGetPacketFlags(Packet);

                //
                // Point the new packet at the original's buffer chain;
                // no data is copied.
                //
                NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet);
                NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet);
#ifdef WIN9X
                //
                // Work around the fact that NDIS does not initialize this
                // to FALSE on Win9x.
                //
                NDIS_PACKET_VALID_COUNTS(MyPacket) = FALSE;
#endif // WIN9X

                //
                // Copy the OOB data from the original packet to the new
                // packet.
                //
                NdisMoveMemory(NDIS_OOB_DATA_FROM_PACKET(MyPacket),
                               NDIS_OOB_DATA_FROM_PACKET(Packet),
                               sizeof(NDIS_PACKET_OOB_DATA));
                //
                // Copy relevant parts of the per packet info into the new packet
                //
#ifndef WIN9X
                NdisIMCopySendPerPacketInfo(MyPacket, Packet);
#endif

                //
                // Copy the Media specific information
                //
                NDIS_GET_PACKET_MEDIA_SPECIFIC_INFO(Packet,
                                                    &MediaSpecificInfo,
                                                    &MediaSpecificInfoSize);

                if (MediaSpecificInfo || MediaSpecificInfoSize)
                {
                    NDIS_SET_PACKET_MEDIA_SPECIFIC_INFO(MyPacket,
                                                        MediaSpecificInfo,
                                                        MediaSpecificInfoSize);
                }

                NdisSend(&Status,
                         pAdapt->BindingHandle,
                         MyPacket);

                //
                // The send finished without pending: copy the completion
                // per-packet info back to the original, release our
                // wrapper packet and undo the accounting. The original is
                // completed by the common code after the loop body.
                //
                if (Status != NDIS_STATUS_PENDING)
                {
#ifndef WIN9X
                    NdisIMCopySendCompletePerPacketInfo (Packet, MyPacket);
#endif
                    NdisFreePacket(MyPacket);
                    ADAPT_DECR_PENDING_SENDS(pAdapt);
                }
            }
            else
            {
                //
                // The driver cannot allocate a packet.
                // Undo the accounting; Status still holds the allocation
                // failure and is reported to the caller below.
                //
                ADAPT_DECR_PENDING_SENDS(pAdapt);
            }
        }
        while (FALSE);

        //
        // Common completion for the wrapped-packet path: anything that is
        // not pending is completed to the caller with its final status.
        //
        if (Status != NDIS_STATUS_PENDING)
        {
            NdisMSendComplete(ADAPT_MINIPORT_HANDLE(pAdapt),
                              Packet,
                              Status);
        }
    }
}
+ +Arguments: + + MiniportAdapterContext Pointer to the adapter structure + Oid Oid for this query + InformationBuffer Buffer for information + InformationBufferLength Size of this buffer + BytesWritten Specifies how much info is written + BytesNeeded In case the buffer is smaller than what we need, tell them how much is needed + + +Return Value: + + Return code from the NdisRequest below. + +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + NDIS_STATUS Status = NDIS_STATUS_FAILURE; + + do + { + if (Oid == OID_PNP_QUERY_POWER) + { + // + // Do not forward this. + // + Status = NDIS_STATUS_SUCCESS; + break; + } + + if (Oid == OID_GEN_SUPPORTED_GUIDS) + { + // + // Do not forward this, otherwise we will end up with multiple + // instances of private GUIDs that the underlying miniport + // supports. + // + Status = NDIS_STATUS_NOT_SUPPORTED; + break; + } + + if (Oid == OID_TCP_TASK_OFFLOAD) + { + // + // Fail this -if- this driver performs data transformations + // that can interfere with a lower driver's ability to offload + // TCP tasks. 
+ // + // Status = NDIS_STATUS_NOT_SUPPORTED; + // break; + // + } + // + // If the miniport below is unbinding, just fail any request + // + NdisAcquireSpinLock(&pAdapt->Lock); + if (pAdapt->UnbindingInProcess == TRUE) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + NdisReleaseSpinLock(&pAdapt->Lock); + // + // All other queries are failed, if the miniport is not at D0, + // + if (pAdapt->MPDeviceState > NdisDeviceStateD0) + { + Status = NDIS_STATUS_FAILURE; + break; + } + + pAdapt->Request.RequestType = NdisRequestQueryInformation; + pAdapt->Request.DATA.QUERY_INFORMATION.Oid = Oid; + pAdapt->Request.DATA.QUERY_INFORMATION.InformationBuffer = InformationBuffer; + pAdapt->Request.DATA.QUERY_INFORMATION.InformationBufferLength = InformationBufferLength; + pAdapt->BytesNeeded = BytesNeeded; + pAdapt->BytesReadOrWritten = BytesWritten; + + // + // If the miniport below is binding, fail the request + // + NdisAcquireSpinLock(&pAdapt->Lock); + + if (pAdapt->UnbindingInProcess == TRUE) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + // + // If the Protocol device state is OFF, mark this request as being + // pended. We queue this until the device state is back to D0. 
+ // + if ((pAdapt->PTDeviceState > NdisDeviceStateD0) + && (pAdapt->StandingBy == FALSE)) + { + pAdapt->QueuedRequest = TRUE; + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_PENDING; + break; + } + // + // This is in the process of powering down the system, always fail the request + // + if (pAdapt->StandingBy == TRUE) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + pAdapt->OutstandingRequests = TRUE; + + NdisReleaseSpinLock(&pAdapt->Lock); + + // + // default case, most requests will be passed to the miniport below + // + NdisRequest(&Status, + pAdapt->BindingHandle, + &pAdapt->Request); + + + if (Status != NDIS_STATUS_PENDING) + { + PtRequestComplete(pAdapt, &pAdapt->Request, Status); + Status = NDIS_STATUS_PENDING; + } + + } while (FALSE); + + return(Status); + +} + + +VOID +MPQueryPNPCapabilities( + IN OUT PADAPT pAdapt, + OUT PNDIS_STATUS pStatus + ) +/*++ + +Routine Description: + + Postprocess a request for OID_PNP_CAPABILITIES that was forwarded + down to the underlying miniport, and has been completed by it. + +Arguments: + + pAdapt - Pointer to the adapter structure + pStatus - Place to return final status + +Return Value: + + None. + +--*/ + +{ + PNDIS_PNP_CAPABILITIES pPNPCapabilities; + PNDIS_PM_WAKE_UP_CAPABILITIES pPMstruct; + + if (pAdapt->Request.DATA.QUERY_INFORMATION.InformationBufferLength >= sizeof(NDIS_PNP_CAPABILITIES)) + { + pPNPCapabilities = (PNDIS_PNP_CAPABILITIES)(pAdapt->Request.DATA.QUERY_INFORMATION.InformationBuffer); + + // + // The following fields must be overwritten by an IM driver. 
+ // + pPMstruct= & pPNPCapabilities->WakeUpCapabilities; + pPMstruct->MinMagicPacketWakeUp = NdisDeviceStateUnspecified; + pPMstruct->MinPatternWakeUp = NdisDeviceStateUnspecified; + pPMstruct->MinLinkChangeWakeUp = NdisDeviceStateUnspecified; + *pAdapt->BytesReadOrWritten = sizeof(NDIS_PNP_CAPABILITIES); + *pAdapt->BytesNeeded = 0; + + + // + // Setting our internal flags + // Default, device is ON + // + pAdapt->MPDeviceState = NdisDeviceStateD0; + pAdapt->PTDeviceState = NdisDeviceStateD0; + + *pStatus = NDIS_STATUS_SUCCESS; + } + else + { + *pAdapt->BytesNeeded= sizeof(NDIS_PNP_CAPABILITIES); + *pStatus = NDIS_STATUS_RESOURCES; + } +} + + +NDIS_STATUS +MPSetInformation( + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_OID Oid, + __in_bcount(InformationBufferLength) IN PVOID InformationBuffer, + IN ULONG InformationBufferLength, + OUT PULONG BytesRead, + OUT PULONG BytesNeeded + ) +/*++ + +Routine Description: + + Miniport SetInfo handler. + + In the case of OID_PNP_SET_POWER, record the power state and return the OID. + Do not pass below + If the device is suspended, do not block the SET_POWER_OID + as it is used to reactivate the Passthru miniport + + + PM- If the MP is not ON (DeviceState > D0) return immediately (except for 'query power' and 'set power') + If MP is ON, but the PT is not at D0, then queue the queue the request for later processing + + Requests to miniports are always serialized + + +Arguments: + + MiniportAdapterContext Pointer to the adapter structure + Oid Oid for this query + InformationBuffer Buffer for information + InformationBufferLength Size of this buffer + BytesRead Specifies how much info is read + BytesNeeded In case the buffer is smaller than what we need, tell them how much is needed + +Return Value: + + Return code from the NdisRequest below. 
+ +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + NDIS_STATUS Status; + + Status = NDIS_STATUS_FAILURE; + + do + { + // + // The Set Power should not be sent to the miniport below the Passthru, but is handled internally + // + if (Oid == OID_PNP_SET_POWER) + { + MPProcessSetPowerOid(&Status, + pAdapt, + InformationBuffer, + InformationBufferLength, + BytesRead, + BytesNeeded); + break; + + } + + // + // If the miniport below is unbinding, fail the request + // + NdisAcquireSpinLock(&pAdapt->Lock); + if (pAdapt->UnbindingInProcess == TRUE) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + NdisReleaseSpinLock(&pAdapt->Lock); + // + // All other Set Information requests are failed, if the miniport is + // not at D0 or is transitioning to a device state greater than D0. + // + if (pAdapt->MPDeviceState > NdisDeviceStateD0) + { + Status = NDIS_STATUS_FAILURE; + break; + } + + // Set up the Request and return the result + pAdapt->Request.RequestType = NdisRequestSetInformation; + pAdapt->Request.DATA.SET_INFORMATION.Oid = Oid; + pAdapt->Request.DATA.SET_INFORMATION.InformationBuffer = InformationBuffer; + pAdapt->Request.DATA.SET_INFORMATION.InformationBufferLength = InformationBufferLength; + pAdapt->BytesNeeded = BytesNeeded; + pAdapt->BytesReadOrWritten = BytesRead; + + // + // If the miniport below is unbinding, fail the request + // + NdisAcquireSpinLock(&pAdapt->Lock); + if (pAdapt->UnbindingInProcess == TRUE) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + + // + // If the device below is at a low power state, we cannot send it the + // request now, and must pend it. 
+ // + if ((pAdapt->PTDeviceState > NdisDeviceStateD0) + && (pAdapt->StandingBy == FALSE)) + { + pAdapt->QueuedRequest = TRUE; + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_PENDING; + break; + } + // + // This is in the process of powering down the system, always fail the request + // + if (pAdapt->StandingBy == TRUE) + { + NdisReleaseSpinLock(&pAdapt->Lock); + Status = NDIS_STATUS_FAILURE; + break; + } + pAdapt->OutstandingRequests = TRUE; + + NdisReleaseSpinLock(&pAdapt->Lock); + // + // Forward the request to the device below. + // + NdisRequest(&Status, + pAdapt->BindingHandle, + &pAdapt->Request); + + if (Status != NDIS_STATUS_PENDING) + { + *BytesRead = pAdapt->Request.DATA.SET_INFORMATION.BytesRead; + *BytesNeeded = pAdapt->Request.DATA.SET_INFORMATION.BytesNeeded; + pAdapt->OutstandingRequests = FALSE; + } + + } while (FALSE); + + return(Status); +} + + +VOID +MPProcessSetPowerOid( + IN OUT PNDIS_STATUS pNdisStatus, + IN PADAPT pAdapt, + __in_bcount(InformationBufferLength) IN PVOID InformationBuffer, + IN ULONG InformationBufferLength, + OUT PULONG BytesRead, + OUT PULONG BytesNeeded + ) +/*++ + +Routine Description: + This routine does all the procssing for a request with a SetPower Oid + The miniport shoud accept the Set Power and transition to the new state + + The Set Power should not be passed to the miniport below + + If the IM miniport is going into a low power state, then there is no guarantee if it will ever + be asked go back to D0, before getting halted. No requests should be pended or queued. + + +Arguments: + pNdisStatus - Status of the operation + pAdapt - The Adapter structure + InformationBuffer - The New DeviceState + InformationBufferLength + BytesRead - No of bytes read + BytesNeeded - No of bytes needed + + +Return Value: + Status - NDIS_STATUS_SUCCESS if all the wait events succeed. 
+ +--*/ +{ + + + NDIS_DEVICE_POWER_STATE NewDeviceState; + + DBGPRINT(("==>MPProcessSetPowerOid: Adapt %p\n", pAdapt)); + + ASSERT (InformationBuffer != NULL); + + *pNdisStatus = NDIS_STATUS_FAILURE; + + do + { + // + // Check for invalid length + // + if (InformationBufferLength < sizeof(NDIS_DEVICE_POWER_STATE)) + { + *pNdisStatus = NDIS_STATUS_INVALID_LENGTH; + break; + } + + NewDeviceState = (*(PNDIS_DEVICE_POWER_STATE)InformationBuffer); + + // + // Check for invalid device state + // + if ((pAdapt->MPDeviceState > NdisDeviceStateD0) && (NewDeviceState != NdisDeviceStateD0)) + { + // + // If the miniport is in a non-D0 state, the miniport can only receive a Set Power to D0 + // + ASSERT (!(pAdapt->MPDeviceState > NdisDeviceStateD0) && (NewDeviceState != NdisDeviceStateD0)); + + *pNdisStatus = NDIS_STATUS_FAILURE; + break; + } + + // + // Is the miniport transitioning from an On (D0) state to an Low Power State (>D0) + // If so, then set the StandingBy Flag - (Block all incoming requests) + // + if (pAdapt->MPDeviceState == NdisDeviceStateD0 && NewDeviceState > NdisDeviceStateD0) + { + pAdapt->StandingBy = TRUE; + } + + // + // If the miniport is transitioning from a low power state to ON (D0), then clear the StandingBy flag + // All incoming requests will be pended until the physical miniport turns ON. 
+ // + if (pAdapt->MPDeviceState > NdisDeviceStateD0 && NewDeviceState == NdisDeviceStateD0) + { + pAdapt->StandingBy = FALSE; + } + + // + // Now update the state in the pAdapt structure; + // + pAdapt->MPDeviceState = NewDeviceState; + + *pNdisStatus = NDIS_STATUS_SUCCESS; + + + } while (FALSE); + + if (*pNdisStatus == NDIS_STATUS_SUCCESS) + { + // + // The miniport resume from low power state + // + if (pAdapt->StandingBy == FALSE) + { + // + // If we need to indicate the media connect state + // + if (pAdapt->LastIndicatedStatus != pAdapt->LatestUnIndicateStatus) + { + if (pAdapt->MiniportHandle != NULL) + { + NdisMIndicateStatus(pAdapt->MiniportHandle, + pAdapt->LatestUnIndicateStatus, + (PVOID)NULL, + 0); + NdisMIndicateStatusComplete(pAdapt->MiniportHandle); + pAdapt->LastIndicatedStatus = pAdapt->LatestUnIndicateStatus; + } + } + } + else + { + // + // Initialize LatestUnIndicatedStatus + // + pAdapt->LatestUnIndicateStatus = pAdapt->LastIndicatedStatus; + } + *BytesRead = sizeof(NDIS_DEVICE_POWER_STATE); + *BytesNeeded = 0; + } + else + { + *BytesRead = 0; + *BytesNeeded = sizeof (NDIS_DEVICE_POWER_STATE); + } + + DBGPRINT(("<==MPProcessSetPowerOid: Adapt %p\n", pAdapt)); +} + + +VOID +MPReturnPacket( + IN NDIS_HANDLE MiniportAdapterContext, + IN PNDIS_PACKET Packet + ) +/*++ + +Routine Description: + + NDIS Miniport entry point called whenever protocols are done with + a packet that we had indicated up and they had queued up for returning + later. + +Arguments: + + MiniportAdapterContext - pointer to ADAPT structure + Packet - packet being returned. + +Return Value: + + None. + +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + +#ifdef NDIS51 + // + // Packet stacking: Check if this packet belongs to us. + // + if (NdisGetPoolFromPacket(Packet) != pAdapt->RecvPacketPoolHandle) + { + // + // We reused the original packet in a receive indication. + // Simply return it to the miniport below us. 
+ // + NdisReturnPackets(&Packet, 1); + } + else +#endif // NDIS51 + { + // + // This is a packet allocated from this IM's receive packet pool. + // Reclaim our packet, and return the original to the driver below. + // + + PNDIS_PACKET MyPacket; + PRECV_RSVD RecvRsvd; + + RecvRsvd = (PRECV_RSVD)(Packet->MiniportReserved); + MyPacket = RecvRsvd->OriginalPkt; + + NdisFreePacket(Packet); + NdisReturnPackets(&MyPacket, 1); + } +} + + +NDIS_STATUS +MPTransferData( + OUT PNDIS_PACKET Packet, + OUT PUINT BytesTransferred, + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_HANDLE MiniportReceiveContext, + IN UINT ByteOffset, + IN UINT BytesToTransfer + ) +/*++ + +Routine Description: + + Miniport's transfer data handler. + +Arguments: + + Packet Destination packet + BytesTransferred Place-holder for how much data was copied + MiniportAdapterContext Pointer to the adapter structure + MiniportReceiveContext Context + ByteOffset Offset into the packet for copying data + BytesToTransfer How much to copy. + +Return Value: + + Status of transfer + +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + NDIS_STATUS Status; + + // + // Return, if the device is OFF + // + + if (IsIMDeviceStateOn(pAdapt) == FALSE) + { + return NDIS_STATUS_FAILURE; + } + + NdisTransferData(&Status, + pAdapt->BindingHandle, + MiniportReceiveContext, + ByteOffset, + BytesToTransfer, + Packet, + BytesTransferred); + + return(Status); +} + +VOID +MPHalt( + IN NDIS_HANDLE MiniportAdapterContext + ) +/*++ + +Routine Description: + + Halt handler. All the hard-work for clean-up is done here. + +Arguments: + + MiniportAdapterContext Pointer to the Adapter + +Return Value: + + None. 
+ +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + NDIS_STATUS Status; + PADAPT *ppCursor; + + DBGPRINT(("==>MiniportHalt: Adapt %p\n", pAdapt)); + + pAdapt->MiniportHandle = NULL; + pAdapt->MiniportIsHalted = TRUE; + + // + // Remove this adapter from the global list + // + NdisAcquireSpinLock(&GlobalLock); + + for (ppCursor = &pAdaptList; *ppCursor != NULL; ppCursor = &(*ppCursor)->Next) + { + if (*ppCursor == pAdapt) + { + *ppCursor = pAdapt->Next; + break; + } + } + + NdisReleaseSpinLock(&GlobalLock); + + // + // Delete the ioctl interface that was created when the miniport + // was created. + // + (VOID)PtDeregisterDevice(); + + // + // If we have a valid bind, close the miniport below the protocol + // +#pragma prefast(suppress: __WARNING_DEREF_NULL_PTR, "pAdapt cannot be NULL") + if (pAdapt->BindingHandle != NULL) + { + // + // Close the binding below. and wait for it to complete + // + NdisResetEvent(&pAdapt->Event); + + NdisCloseAdapter(&Status, pAdapt->BindingHandle); + + if (Status == NDIS_STATUS_PENDING) + { + NdisWaitEvent(&pAdapt->Event, 0); + Status = pAdapt->Status; + } + + ASSERT (Status == NDIS_STATUS_SUCCESS); + + pAdapt->BindingHandle = NULL; + + PtDereferenceAdapt(pAdapt); + } + + if (PtDereferenceAdapt(pAdapt)) + { + pAdapt = NULL; + } + + + DBGPRINT(("<== MiniportHalt: pAdapt %p\n", pAdapt)); +} + + +#ifdef NDIS51_MINIPORT + +VOID +MPCancelSendPackets( + IN NDIS_HANDLE MiniportAdapterContext, + IN PVOID CancelId + ) +/*++ + +Routine Description: + + The miniport entry point to handle cancellation of all send packets + that match the given CancelId. If we have queued any packets that match + this, then we should dequeue them and call NdisMSendComplete for all + such packets, with a status of NDIS_STATUS_REQUEST_ABORTED. + + We should also call NdisCancelSendPackets in turn, on each lower binding + that this adapter corresponds to. This is to let miniports below cancel + any matching packets. 
+ +Arguments: + + MiniportAdapterContext - pointer to ADAPT structure + CancelId - ID of packets to be cancelled. + +Return Value: + + None + +--*/ +{ + PADAPT pAdapt = (PADAPT)MiniportAdapterContext; + + // + // If we queue packets on our adapter structure, this would be + // the place to acquire a spinlock to it, unlink any packets whose + // Id matches CancelId, release the spinlock and call NdisMSendComplete + // with NDIS_STATUS_REQUEST_ABORTED for all unlinked packets. + // + + // + // Next, pass this down so that we let the miniport(s) below cancel + // any packets that they might have queued. + // + NdisCancelSendPackets(pAdapt->BindingHandle, CancelId); + + return; +} + +VOID +MPDevicePnPEvent( + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_DEVICE_PNP_EVENT DevicePnPEvent, + IN PVOID InformationBuffer, + IN ULONG InformationBufferLength + ) +/*++ + +Routine Description: + + This handler is called to notify us of PnP events directed to + our miniport device object. + +Arguments: + + MiniportAdapterContext - pointer to ADAPT structure + DevicePnPEvent - the event + InformationBuffer - Points to additional event-specific information + InformationBufferLength - length of above + +Return Value: + + None +--*/ +{ + // TBD - add code/comments about processing this. + + UNREFERENCED_PARAMETER(MiniportAdapterContext); + UNREFERENCED_PARAMETER(DevicePnPEvent); + UNREFERENCED_PARAMETER(InformationBuffer); + UNREFERENCED_PARAMETER(InformationBufferLength); + + return; +} + +VOID +MPAdapterShutdown( + IN NDIS_HANDLE MiniportAdapterContext + ) +/*++ + +Routine Description: + + This handler is called to notify us of an impending system shutdown. + +Arguments: + + MiniportAdapterContext - pointer to ADAPT structure + +Return Value: + + None +--*/ +{ + UNREFERENCED_PARAMETER(MiniportAdapterContext); + + return; +} + +#endif + + +VOID +MPFreeAllPacketPools( + IN PADAPT pAdapt + ) +/*++ + +Routine Description: + + Free all packet pools on the specified adapter. 
+ +Arguments: + + pAdapt - pointer to ADAPT structure + +Return Value: + + None + +--*/ +{ + if (pAdapt->RecvPacketPoolHandle != NULL) + { + // + // Free the packet pool that is used to indicate receives + // + NdisFreePacketPool(pAdapt->RecvPacketPoolHandle); + + pAdapt->RecvPacketPoolHandle = NULL; + } + + if (pAdapt->SendPacketPoolHandle != NULL) + { + + // + // Free the packet pool that is used to send packets below + // + + NdisFreePacketPool(pAdapt->SendPacketPoolHandle); + + pAdapt->SendPacketPoolHandle = NULL; + + } +} + diff --git a/original_passthru/netsf.inf b/original_passthru/netsf.inf new file mode 100644 index 0000000..5e03a01 --- /dev/null +++ b/original_passthru/netsf.inf @@ -0,0 +1,165 @@ +; -- NETSF.INF -- +; +; Passthru driver INF file - this is the INF for the service (protocol) +; part. +; +; Copyright (c) 1993-2001, Microsoft Corporation +; +; ---------------------------------------------------------------------- +; Notes: +; 0. The term "filter" is used in this INF to refer to an NDIS IM driver that +; implements a 1:1 relationship between upper and lower bindings. +; +; 1. Items specifically required for a filter have been marked with +; "!!--Filter Specific--!!" keyword +; 2. In general a filter DOES NOT require a notify object for proper installation. +; A notify object is only required if one wants to have better control +; over binding operations or if one wants to receive notifications +; when other components get installed/removed/bound/unbound. +; Since Windows 2000 systems do not have support for CopyINF directive, +; a notify object is required to programmatically copy the miniport INF +; file to the system INF directory. Previous versions of this INF file +; erroneously used to copy the INF files directly by using the CopyFiles +; directive. +; On Windows XP, you can install a filter IM without a notify object. +; by following the instructions in (4). +; +; 3. 
If you want to use this INF file with your own IM driver, please +; make the following modifications: +; File netsf.inf +; -------------- +; a. In section [SourceDiskFiles] and [Passthru.Files.Sys] +; change passthru.sys to the name of your own driver binary. +; b. In section [Passthru.ndi.AddReg], change values of +; BindForm and MiniportId to appropriate values. +; File netsf_m.inf +; ---------------- +; a. Replace MS_PassthruMP with InfId of your miniport. +; b. In section [PassthruMP.AddService], +; change ServiceBinary appropriately. +; c. In section [PassthruMP.ndi.AddReg], +; change "Passthru" in the line having "Service" +; to reflect the appropriate name +; +; +; ---------------------------------------------------------------------- + +[Version] +Signature = "$Windows NT$" +Class = NetService +ClassGUID = {4D36E974-E325-11CE-BFC1-08002BE10318} +Provider = %Msft% +DriverVer =10/01/2002,6.0.5019.0 + +[Manufacturer] +%Msft% = MSFT,NTx86,NTia64,NTamd64 + +[ControlFlags] + +;========================================================================= +; +;========================================================================= +;For Win2K + +[MSFT] +%Passthru_Desc% = Passthru.ndi, ms_passthru + +;For WinXP and later + +[MSFT.NTx86] +%Passthru_Desc% = Passthru.ndi, ms_passthru + +[MSFT.NTia64] +%Passthru_Desc% = Passthru.ndi, ms_passthru + +[MSFT.NTamd64] +%Passthru_Desc% = Passthru.ndi, ms_passthru + + +[Passthru.ndi] +AddReg = Passthru.ndi.AddReg, Passthru.AddReg +Characteristics = 0x4410 ; NCF_FILTER | NCF_NDIS_PROTOCOL !--Filter Specific--!! 
+CopyFiles = Passthru.Files.Sys +CopyInf = netsf_m.inf + +[Passthru.ndi.Remove] +DelFiles = Passthru.Files.Sys + +[Passthru.ndi.Services] +AddService = Passthru,, Passthru.AddService + +[Passthru.AddService] +DisplayName = %PassthruService_Desc% +ServiceType = 1 ;SERVICE_KERNEL_DRIVER +StartType = 3 ;SERVICE_DEMAND_START +ErrorControl = 1 ;SERVICE_ERROR_NORMAL +ServiceBinary = %12%\passthru.sys +AddReg = Passthru.AddService.AddReg + + +[Passthru.AddService.AddReg] +; ---------------------------------------------------------------------- +; Add any miniport-specific parameters here. These are params that your +; filter device is going to use. +; +;HKR, Parameters, ParameterName, 0x10000, "MultiSz", "Parameter", "Value" +;HKR, Parameters, ParameterName2, 0x10001, 4 + + +; ---------------------------------------------------------------------- +; File copy +; +[SourceDisksNames] +1=%DiskDescription%,"",, + +[SourceDisksFiles] +passthru.sys=1 + +[DestinationDirs] +DefaultDestDir = 12 +Passthru.Files.Sys = 12 ; %windir%\System32\drivers + +[Passthru.Files.Sys] +passthru.sys,,,2 + +; ---------------------------------------------------------------------- +; Filter Install +; + +[Passthru.ndi.AddReg] +HKR, Ndi, HelpText, , %Passthru_HELP% + +; ---------------------------------------------------------------------- +; !!--Filter Specific--!! +; +; Note: +; 1. Other components may also have UpperRange/LowerRange but for filters +; the value of both of them must be noupper/nolower +; 2. The value FilterClass is required. +; 3. The value Service is required +; 4. FilterDeviceInfId is the InfId of the filter device (miniport) that will +; be installed for each filtered adapter. 
+; In this case this is ms_passthrump (refer to netsf_m.inf) +; +HKR, Ndi, FilterClass, , failover +HKR, Ndi, FilterDeviceInfId, , ms_passthrump +HKR, Ndi, Service, , Passthru +HKR, Ndi\Interfaces, UpperRange, , noupper +HKR, Ndi\Interfaces, LowerRange, , nolower +HKR, Ndi\Interfaces, FilterMediaTypes, , "ethernet, tokenring, fddi, wan" + +[Passthru.AddReg] +; The following key is Required +; The following key is Passthru specific +HKR, Parameters, Param1, 0, 4 + +; ---------------------------------------------------------------------- +[Strings] +Msft = "Microsoft" +DiskDescription = "Microsoft Passthru Driver Disk" + +Passthru_Desc = "Passthru Driver" +Passthru_HELP = "Passthru Driver" +PassthruService_Desc = "Passthru Service" + + diff --git a/original_passthru/netsf_m.inf b/original_passthru/netsf_m.inf new file mode 100644 index 0000000..6605a02 --- /dev/null +++ b/original_passthru/netsf_m.inf @@ -0,0 +1,93 @@ +; -- NETSF_M.INF -- +; +; Passsthru Miniport INF file +; +; Copyright (c) 1993-1999, Microsoft Corporation + +; ---------------------------------------------------------------------- +; Notes: +; 0. The term "filter" is used here to refer to an NDIS IM driver that +; implements a 1:1 relationship between upper and lower bindings. +; 1. Items specifically required for a filter have been marked with +; "!!--Filter Specific--!!" keyword +; 2. A filter DOES NOT require a notify object for proper installation. +; A notify object is only required if one wants to have better control +; over binding operations or if one wants to receive notifications +; when other components get installed/removed/bound/unbound. +; This sample uses a notify object as an example only. If you do not +; want to use a notify object, please comment out the lines that add +; ClsId and ComponentDll registry keys. 
+; ---------------------------------------------------------------------- + +[Version] +signature = "$Windows NT$" +Class = Net +ClassGUID = {4d36e972-e325-11ce-bfc1-08002be10318} +Provider = %Msft% +DriverVer =10/01/2002,6.0.5019.0 + +[ControlFlags] +ExcludeFromSelect = ms_passthrump + +[DestinationDirs] +DefaultDestDir=12 +; No files to copy + +[Manufacturer] +%Msft% = MSFT,NTx86,NTia64,NTamd64 + +;For Win2K + +[MSFT] +%PassthruMP_Desc% = PassthruMP.ndi, ms_passthrump + +;For WinXP and later + +[MSFT.NTx86] +%PassthruMP_Desc% = PassthruMP.ndi, ms_passthrump + +[MSFT.NTia64] +%PassthruMP_Desc% = PassthruMP.ndi, ms_passthrump + +[MSFT.NTamd64] +%PassthruMP_Desc% = PassthruMP.ndi, ms_passthrump + + +[PassthruMP.ndi] +AddReg = PassthruMP.ndi.AddReg +Characteristics = 0x29 ;NCF_NOT_USER_REMOVABLE | NCF_VIRTUAL | NCF_HIDDEN + +[PassthruMP.ndi.AddReg] +HKR, Ndi, Service, 0, PassthruMP + +[PassthruMP.ndi.Services] +AddService = PassthruMP,0x2, PassthruMP.AddService + + +[PassthruMP.AddService] +ServiceType = 1 ;SERVICE_KERNEL_DRIVER +StartType = 3 ;SERVICE_DEMAND_START +ErrorControl = 1 ;SERVICE_ERROR_NORMAL +ServiceBinary = %12%\passthru.sys +AddReg = PassthruMP.AddService.AddReg + + +[PassthruMP.AddService.AddReg] +; ---------------------------------------------------------------------- +; Add any miniport-specific parameters here. These are params that your +; filter device is going to use. +; +;HKR, Parameters, ParameterName, 0x10000, "MultiSz", "Parameter", "Value" +;HKR, Parameters, ParameterName2, 0x10001, 4 + +[Strings] +Msft = "Microsoft" +PassthruMP_Desc = "Passthru Miniport" + +[SourceDisksNames] +;None + +[SourceDisksFiles] +;None + + diff --git a/original_passthru/passthru.c b/original_passthru/passthru.c new file mode 100644 index 0000000..f614f2a --- /dev/null +++ b/original_passthru/passthru.c @@ -0,0 +1,458 @@ +/*++ + +Copyright (c) 1992-2000 Microsoft Corporation + +Module Name: + + passthru.c + +Abstract: + + Ndis Intermediate Miniport driver sample. 
This is a passthru driver. + +Author: + +Environment: + + +Revision History: + + +--*/ + + +#include "precomp.h" +#pragma hdrstop + +#pragma NDIS_INIT_FUNCTION(DriverEntry) + +NDIS_HANDLE ProtHandle = NULL; +NDIS_HANDLE DriverHandle = NULL; +NDIS_MEDIUM MediumArray[4] = + { + NdisMedium802_3, // Ethernet + NdisMedium802_5, // Token-ring + NdisMediumFddi, // Fddi + NdisMediumWan // NDISWAN + }; + +NDIS_SPIN_LOCK GlobalLock; + +PADAPT pAdaptList = NULL; +LONG MiniportCount = 0; + +NDIS_HANDLE NdisWrapperHandle; + +// +// To support ioctls from user-mode: +// + +#define LINKNAME_STRING L"\\DosDevices\\Passthru" +#define NTDEVICE_STRING L"\\Device\\Passthru" + +NDIS_HANDLE NdisDeviceHandle = NULL; +PDEVICE_OBJECT ControlDeviceObject = NULL; + +enum _DEVICE_STATE +{ + PS_DEVICE_STATE_READY = 0, // ready for create/delete + PS_DEVICE_STATE_CREATING, // create operation in progress + PS_DEVICE_STATE_DELETING // delete operation in progress +} ControlDeviceState = PS_DEVICE_STATE_READY; + + + +NTSTATUS +DriverEntry( + IN PDRIVER_OBJECT DriverObject, + IN PUNICODE_STRING RegistryPath + ) +/*++ + +Routine Description: + + First entry point to be called, when this driver is loaded. + Register with NDIS as an intermediate driver. + +Arguments: + + DriverObject - pointer to the system's driver object structure + for this driver + + RegistryPath - system's registry path for this driver + +Return Value: + + STATUS_SUCCESS if all initialization is successful, STATUS_XXX + error code if not. + +--*/ +{ + NDIS_STATUS Status; + NDIS_PROTOCOL_CHARACTERISTICS PChars; + NDIS_MINIPORT_CHARACTERISTICS MChars; + NDIS_STRING Name; + + Status = NDIS_STATUS_SUCCESS; + NdisAllocateSpinLock(&GlobalLock); + + NdisMInitializeWrapper(&NdisWrapperHandle, DriverObject, RegistryPath, NULL); + + do + { + // + // Register the miniport with NDIS. Note that it is the miniport + // which was started as a driver and not the protocol. 
Also the miniport + // must be registered prior to the protocol since the protocol's BindAdapter + // handler can be initiated anytime and when it is, it must be ready to + // start driver instances. + // + + NdisZeroMemory(&MChars, sizeof(NDIS_MINIPORT_CHARACTERISTICS)); + + MChars.MajorNdisVersion = PASSTHRU_MAJOR_NDIS_VERSION; + MChars.MinorNdisVersion = PASSTHRU_MINOR_NDIS_VERSION; + + MChars.InitializeHandler = MPInitialize; + MChars.QueryInformationHandler = MPQueryInformation; + MChars.SetInformationHandler = MPSetInformation; + MChars.ResetHandler = NULL; + MChars.TransferDataHandler = MPTransferData; + MChars.HaltHandler = MPHalt; +#ifdef NDIS51_MINIPORT + MChars.CancelSendPacketsHandler = MPCancelSendPackets; + MChars.PnPEventNotifyHandler = MPDevicePnPEvent; + MChars.AdapterShutdownHandler = MPAdapterShutdown; +#endif // NDIS51_MINIPORT + + // + // We will disable the check for hang timeout so we do not + // need a check for hang handler! + // + MChars.CheckForHangHandler = NULL; + MChars.ReturnPacketHandler = MPReturnPacket; + + // + // Either the Send or the SendPackets handler should be specified. + // If SendPackets handler is specified, SendHandler is ignored + // + MChars.SendHandler = NULL; // MPSend; + MChars.SendPacketsHandler = MPSendPackets; + + Status = NdisIMRegisterLayeredMiniport(NdisWrapperHandle, + &MChars, + sizeof(MChars), + &DriverHandle); + if (Status != NDIS_STATUS_SUCCESS) + { + break; + } + +#ifndef WIN9X + NdisMRegisterUnloadHandler(NdisWrapperHandle, PtUnload); +#endif + + // + // Now register the protocol. + // + NdisZeroMemory(&PChars, sizeof(NDIS_PROTOCOL_CHARACTERISTICS)); + PChars.MajorNdisVersion = PASSTHRU_PROT_MAJOR_NDIS_VERSION; + PChars.MinorNdisVersion = PASSTHRU_PROT_MINOR_NDIS_VERSION; + + // + // Make sure the protocol-name matches the service-name + // (from the INF) under which this protocol is installed. 
+ // This is needed to ensure that NDIS can correctly determine + // the binding and call us to bind to miniports below. + // + NdisInitUnicodeString(&Name, L"Passthru"); // Protocol name + PChars.Name = Name; + PChars.OpenAdapterCompleteHandler = PtOpenAdapterComplete; + PChars.CloseAdapterCompleteHandler = PtCloseAdapterComplete; + PChars.SendCompleteHandler = PtSendComplete; + PChars.TransferDataCompleteHandler = PtTransferDataComplete; + + PChars.ResetCompleteHandler = PtResetComplete; + PChars.RequestCompleteHandler = PtRequestComplete; + PChars.ReceiveHandler = PtReceive; + PChars.ReceiveCompleteHandler = PtReceiveComplete; + PChars.StatusHandler = PtStatus; + PChars.StatusCompleteHandler = PtStatusComplete; + PChars.BindAdapterHandler = PtBindAdapter; + PChars.UnbindAdapterHandler = PtUnbindAdapter; + PChars.UnloadHandler = PtUnloadProtocol; + + PChars.ReceivePacketHandler = PtReceivePacket; + PChars.PnPEventHandler= PtPNPHandler; + + NdisRegisterProtocol(&Status, + &ProtHandle, + &PChars, + sizeof(NDIS_PROTOCOL_CHARACTERISTICS)); + + if (Status != NDIS_STATUS_SUCCESS) + { + NdisIMDeregisterLayeredMiniport(DriverHandle); + break; + } + + NdisIMAssociateMiniport(DriverHandle, ProtHandle); + } + while (FALSE); + + if (Status != NDIS_STATUS_SUCCESS) + { + NdisTerminateWrapper(NdisWrapperHandle, NULL); + } + + return(Status); +} + + +NDIS_STATUS +PtRegisterDevice( + VOID + ) +/*++ + +Routine Description: + + Register an ioctl interface - a device object to be used for this + purpose is created by NDIS when we call NdisMRegisterDevice. + + This routine is called whenever a new miniport instance is + initialized. However, we only create one global device object, + when the first miniport instance is initialized. This routine + handles potential race conditions with PtDeregisterDevice via + the ControlDeviceState and MiniportCount variables. + + NOTE: do not call this from DriverEntry; it will prevent the driver + from being unloaded (e.g. on uninstall). 
+ +Arguments: + + None + +Return Value: + + NDIS_STATUS_SUCCESS if we successfully register a device object. + +--*/ +{ + NDIS_STATUS Status = NDIS_STATUS_SUCCESS; + UNICODE_STRING DeviceName; + UNICODE_STRING DeviceLinkUnicodeString; + PDRIVER_DISPATCH DispatchTable[IRP_MJ_MAXIMUM_FUNCTION+1]; + + DBGPRINT(("==>PtRegisterDevice\n")); + + NdisAcquireSpinLock(&GlobalLock); + + ++MiniportCount; + + if (1 == MiniportCount) + { + ASSERT(ControlDeviceState != PS_DEVICE_STATE_CREATING); + + // + // Another thread could be running PtDeregisterDevice on + // behalf of another miniport instance. If so, wait for + // it to exit. + // + while (ControlDeviceState != PS_DEVICE_STATE_READY) + { + NdisReleaseSpinLock(&GlobalLock); + NdisMSleep(1); + NdisAcquireSpinLock(&GlobalLock); + } + + ControlDeviceState = PS_DEVICE_STATE_CREATING; + + NdisReleaseSpinLock(&GlobalLock); + + + NdisZeroMemory(DispatchTable, (IRP_MJ_MAXIMUM_FUNCTION+1) * sizeof(PDRIVER_DISPATCH)); + + DispatchTable[IRP_MJ_CREATE] = PtDispatch; + DispatchTable[IRP_MJ_CLEANUP] = PtDispatch; + DispatchTable[IRP_MJ_CLOSE] = PtDispatch; + DispatchTable[IRP_MJ_DEVICE_CONTROL] = PtDispatch; + + + NdisInitUnicodeString(&DeviceName, NTDEVICE_STRING); + NdisInitUnicodeString(&DeviceLinkUnicodeString, LINKNAME_STRING); + + // + // Create a device object and register our dispatch handlers + // + + Status = NdisMRegisterDevice( + NdisWrapperHandle, + &DeviceName, + &DeviceLinkUnicodeString, + &DispatchTable[0], + &ControlDeviceObject, + &NdisDeviceHandle + ); + + NdisAcquireSpinLock(&GlobalLock); + + ControlDeviceState = PS_DEVICE_STATE_READY; + } + + NdisReleaseSpinLock(&GlobalLock); + + DBGPRINT(("<==PtRegisterDevice: %x\n", Status)); + + return (Status); +} + + +NTSTATUS +PtDispatch( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp + ) +/*++ +Routine Description: + + Process IRPs sent to this device. 
+ +Arguments: + + DeviceObject - pointer to a device object + Irp - pointer to an I/O Request Packet + +Return Value: + + NTSTATUS - STATUS_SUCCESS always - change this when adding + real code to handle ioctls. + +--*/ +{ + PIO_STACK_LOCATION irpStack; + NTSTATUS status = STATUS_SUCCESS; + + UNREFERENCED_PARAMETER(DeviceObject); + + DBGPRINT(("==>Pt Dispatch\n")); + irpStack = IoGetCurrentIrpStackLocation(Irp); + + + switch (irpStack->MajorFunction) + { + case IRP_MJ_CREATE: + break; + + case IRP_MJ_CLEANUP: + break; + + case IRP_MJ_CLOSE: + break; + + case IRP_MJ_DEVICE_CONTROL: + // + // Add code here to handle ioctl commands sent to passthru. + // + break; + default: + break; + } + + Irp->IoStatus.Status = status; + IoCompleteRequest(Irp, IO_NO_INCREMENT); + + DBGPRINT(("<== Pt Dispatch\n")); + + return status; + +} + + +NDIS_STATUS +PtDeregisterDevice( + VOID + ) +/*++ + +Routine Description: + + Deregister the ioctl interface. This is called whenever a miniport + instance is halted. When the last miniport instance is halted, we + request NDIS to delete the device object + +Arguments: + + NdisDeviceHandle - Handle returned by NdisMRegisterDevice + +Return Value: + + NDIS_STATUS_SUCCESS if everything worked ok + +--*/ +{ + NDIS_STATUS Status = NDIS_STATUS_SUCCESS; + + DBGPRINT(("==>PassthruDeregisterDevice\n")); + + NdisAcquireSpinLock(&GlobalLock); + + ASSERT(MiniportCount > 0); + + --MiniportCount; + + if (0 == MiniportCount) + { + // + // All miniport instances have been halted. Deregister + // the control device. + // + + ASSERT(ControlDeviceState == PS_DEVICE_STATE_READY); + + // + // Block PtRegisterDevice() while we release the control + // device lock and deregister the device. 
+ // + ControlDeviceState = PS_DEVICE_STATE_DELETING; + + NdisReleaseSpinLock(&GlobalLock); + + if (NdisDeviceHandle != NULL) + { + Status = NdisMDeregisterDevice(NdisDeviceHandle); + NdisDeviceHandle = NULL; + } + + NdisAcquireSpinLock(&GlobalLock); + ControlDeviceState = PS_DEVICE_STATE_READY; + } + + NdisReleaseSpinLock(&GlobalLock); + + DBGPRINT(("<== PassthruDeregisterDevice: %x\n", Status)); + return Status; + +} + +VOID +PtUnload( + IN PDRIVER_OBJECT DriverObject + ) +// +// PassThru driver unload function +// +{ + UNREFERENCED_PARAMETER(DriverObject); + + DBGPRINT(("PtUnload: entered\n")); + + PtUnloadProtocol(); + + NdisIMDeregisterLayeredMiniport(DriverHandle); + + NdisFreeSpinLock(&GlobalLock); + + DBGPRINT(("PtUnload: done!\n")); +} + diff --git a/original_passthru/passthru.h b/original_passthru/passthru.h new file mode 100644 index 0000000..badde8a --- /dev/null +++ b/original_passthru/passthru.h @@ -0,0 +1,477 @@ +/*++ + +Copyright (c) 1992-2000 Microsoft Corporation + +Module Name: + + passthru.h + +Abstract: + + Ndis Intermediate Miniport driver sample. This is a passthru driver. 
+ +Author: + +Environment: + + +Revision History: + + +--*/ + +#ifdef NDIS51_MINIPORT +#define PASSTHRU_MAJOR_NDIS_VERSION 5 +#define PASSTHRU_MINOR_NDIS_VERSION 1 +#else +#define PASSTHRU_MAJOR_NDIS_VERSION 4 +#define PASSTHRU_MINOR_NDIS_VERSION 0 +#endif + +#ifdef NDIS51 +#define PASSTHRU_PROT_MAJOR_NDIS_VERSION 5 +#define PASSTHRU_PROT_MINOR_NDIS_VERSION 0 +#else +#define PASSTHRU_PROT_MAJOR_NDIS_VERSION 4 +#define PASSTHRU_PROT_MINOR_NDIS_VERSION 0 +#endif + +#define MAX_BUNDLEID_LENGTH 50 + +#define TAG 'ImPa' +#define WAIT_INFINITE 0 + + + +//advance declaration +typedef struct _ADAPT ADAPT, *PADAPT; + +DRIVER_INITIALIZE DriverEntry; +extern +NTSTATUS +DriverEntry( + IN PDRIVER_OBJECT DriverObject, + IN PUNICODE_STRING RegistryPath + ); + +DRIVER_DISPATCH PtDispatch; +NTSTATUS +PtDispatch( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp + ); + +NDIS_STATUS +PtRegisterDevice( + VOID + ); + +NDIS_STATUS +PtDeregisterDevice( + VOID + ); + +DRIVER_UNLOAD PtUnload; +VOID +PtUnloadProtocol( + VOID + ); + +// +// Protocol proto-types +// +extern +VOID +PtOpenAdapterComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS Status, + IN NDIS_STATUS OpenErrorStatus + ); + +extern +VOID +PtCloseAdapterComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS Status + ); + +extern +VOID +PtResetComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS Status + ); + +extern +VOID +PtRequestComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_REQUEST NdisRequest, + IN NDIS_STATUS Status + ); + +extern +VOID +PtStatus( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS GeneralStatus, + IN PVOID StatusBuffer, + IN UINT StatusBufferSize + ); + +extern +VOID +PtStatusComplete( + IN NDIS_HANDLE ProtocolBindingContext + ); + +extern +VOID +PtSendComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_PACKET Packet, + IN NDIS_STATUS Status + ); + +extern +VOID +PtTransferDataComplete( + IN NDIS_HANDLE ProtocolBindingContext, + 
IN PNDIS_PACKET Packet, + IN NDIS_STATUS Status, + IN UINT BytesTransferred + ); + +extern +NDIS_STATUS +PtReceive( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_HANDLE MacReceiveContext, + IN PVOID HeaderBuffer, + IN UINT HeaderBufferSize, + IN PVOID LookAheadBuffer, + IN UINT LookaheadBufferSize, + IN UINT PacketSize + ); + +extern +VOID +PtReceiveComplete( + IN NDIS_HANDLE ProtocolBindingContext + ); + +extern +INT +PtReceivePacket( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_PACKET Packet + ); + +extern +VOID +PtBindAdapter( + OUT PNDIS_STATUS Status, + IN NDIS_HANDLE BindContext, + IN PNDIS_STRING DeviceName, + IN PVOID SystemSpecific1, + IN PVOID SystemSpecific2 + ); + +extern +VOID +PtUnbindAdapter( + OUT PNDIS_STATUS Status, + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_HANDLE UnbindContext + ); + +VOID +PtUnload( + IN PDRIVER_OBJECT DriverObject + ); + + + +extern +NDIS_STATUS +PtPNPHandler( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNET_PNP_EVENT pNetPnPEvent + ); + + + + +NDIS_STATUS +PtPnPNetEventReconfigure( + IN PADAPT pAdapt, + IN PNET_PNP_EVENT pNetPnPEvent + ); + +NDIS_STATUS +PtPnPNetEventSetPower ( + IN PADAPT pAdapt, + IN PNET_PNP_EVENT pNetPnPEvent + ); + + +// +// Miniport proto-types +// +NDIS_STATUS +MPInitialize( + OUT PNDIS_STATUS OpenErrorStatus, + OUT PUINT SelectedMediumIndex, + IN PNDIS_MEDIUM MediumArray, + IN UINT MediumArraySize, + IN NDIS_HANDLE MiniportAdapterHandle, + IN NDIS_HANDLE WrapperConfigurationContext + ); + +VOID +MPSendPackets( + IN NDIS_HANDLE MiniportAdapterContext, + IN PPNDIS_PACKET PacketArray, + IN UINT NumberOfPackets + ); + +NDIS_STATUS +MPSend( + IN NDIS_HANDLE MiniportAdapterContext, + IN PNDIS_PACKET Packet, + IN UINT Flags + ); + +NDIS_STATUS +MPQueryInformation( + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_OID Oid, + IN PVOID InformationBuffer, + IN ULONG InformationBufferLength, + OUT PULONG BytesWritten, + OUT PULONG BytesNeeded + ); + +NDIS_STATUS +MPSetInformation( + IN 
NDIS_HANDLE MiniportAdapterContext, + IN NDIS_OID Oid, + __in_bcount(InformationBufferLength) IN PVOID InformationBuffer, + IN ULONG InformationBufferLength, + OUT PULONG BytesRead, + OUT PULONG BytesNeeded + ); + +VOID +MPReturnPacket( + IN NDIS_HANDLE MiniportAdapterContext, + IN PNDIS_PACKET Packet + ); + +NDIS_STATUS +MPTransferData( + OUT PNDIS_PACKET Packet, + OUT PUINT BytesTransferred, + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_HANDLE MiniportReceiveContext, + IN UINT ByteOffset, + IN UINT BytesToTransfer + ); + +VOID +MPHalt( + IN NDIS_HANDLE MiniportAdapterContext + ); + + +VOID +MPQueryPNPCapabilities( + OUT PADAPT MiniportProtocolContext, + OUT PNDIS_STATUS Status + ); + + +#ifdef NDIS51_MINIPORT + +VOID +MPCancelSendPackets( + IN NDIS_HANDLE MiniportAdapterContext, + IN PVOID CancelId + ); + +VOID +MPAdapterShutdown( + IN NDIS_HANDLE MiniportAdapterContext + ); + +VOID +MPDevicePnPEvent( + IN NDIS_HANDLE MiniportAdapterContext, + IN NDIS_DEVICE_PNP_EVENT DevicePnPEvent, + IN PVOID InformationBuffer, + IN ULONG InformationBufferLength + ); + +#endif // NDIS51_MINIPORT + +VOID +MPFreeAllPacketPools( + IN PADAPT pAdapt + ); + + +VOID +MPProcessSetPowerOid( + IN OUT PNDIS_STATUS pNdisStatus, + IN PADAPT pAdapt, + __in_bcount(InformationBufferLength) IN PVOID InformationBuffer, + IN ULONG InformationBufferLength, + OUT PULONG BytesRead, + OUT PULONG BytesNeeded + ); + +VOID +PtReferenceAdapt( + IN PADAPT pAdapt + ); + +BOOLEAN +PtDereferenceAdapt( + IN PADAPT pAdapt + ); + +// +// There should be no DbgPrint's in the Free version of the driver +// +#if DBG + +#define DBGPRINT(Fmt) \ + { \ + DbgPrint("Passthru: "); \ + DbgPrint Fmt; \ + } + +#else // if DBG + +#define DBGPRINT(Fmt) + +#endif // if DBG + +#define NUM_PKTS_IN_POOL 256 + + +// +// Protocol reserved part of a sent packet that is allocated by us. 
+// +typedef struct _SEND_RSVD +{ + PNDIS_PACKET OriginalPkt; +} SEND_RSVD, *PSEND_RSVD; + +// +// Miniport reserved part of a received packet that is allocated by +// us. Note that this should fit into the MiniportReserved space +// in an NDIS_PACKET. +// +typedef struct _RECV_RSVD +{ + PNDIS_PACKET OriginalPkt; +} RECV_RSVD, *PRECV_RSVD; + +C_ASSERT(sizeof(RECV_RSVD) <= sizeof(((PNDIS_PACKET)0)->MiniportReserved)); + +// +// Event Codes related to the PassthruEvent Structure +// + +typedef enum +{ + Passthru_Invalid, + Passthru_SetPower, + Passthru_Unbind + +} PASSSTHRU_EVENT_CODE, *PPASTHRU_EVENT_CODE; + +// +// Passthru Event with a code to state why they have been state +// + +typedef struct _PASSTHRU_EVENT +{ + NDIS_EVENT Event; + PASSSTHRU_EVENT_CODE Code; + +} PASSTHRU_EVENT, *PPASSTHRU_EVENT; + + +// +// Structure used by both the miniport as well as the protocol part of the intermediate driver +// to represent an adapter and its corres. lower bindings +// +typedef struct _ADAPT +{ + struct _ADAPT * Next; + + NDIS_HANDLE BindingHandle; // To the lower miniport + NDIS_HANDLE MiniportHandle; // NDIS Handle to for miniport up-calls + NDIS_HANDLE SendPacketPoolHandle; + NDIS_HANDLE RecvPacketPoolHandle; + NDIS_STATUS Status; // Open Status + NDIS_EVENT Event; // Used by bind/halt for Open/Close Adapter synch. + NDIS_MEDIUM Medium; + NDIS_REQUEST Request; // This is used to wrap a request coming down + // to us. This exploits the fact that requests + // are serialized down to us. 
+ PULONG BytesNeeded; + PULONG BytesReadOrWritten; + BOOLEAN ReceivedIndicationFlags[32]; + + BOOLEAN OutstandingRequests; // TRUE iff a request is pending + // at the miniport below + BOOLEAN QueuedRequest; // TRUE iff a request is queued at + // this IM miniport + + BOOLEAN StandingBy; // True - When the miniport or protocol is transitioning from a D0 to Standby (>D0) State + BOOLEAN UnbindingInProcess; + NDIS_SPIN_LOCK Lock; + // False - At all other times, - Flag is cleared after a transition to D0 + + NDIS_DEVICE_POWER_STATE MPDeviceState; // Miniport's Device State + NDIS_DEVICE_POWER_STATE PTDeviceState; // Protocol's Device State + NDIS_STRING DeviceName; // For initializing the miniport edge + NDIS_EVENT MiniportInitEvent; // For blocking UnbindAdapter while + // an IM Init is in progress. + BOOLEAN MiniportInitPending; // TRUE iff IMInit in progress + NDIS_STATUS LastIndicatedStatus; // The last indicated media status + NDIS_STATUS LatestUnIndicateStatus; // The latest suppressed media status + ULONG OutstandingSends; + LONG RefCount; + BOOLEAN MiniportIsHalted; +} ADAPT, *PADAPT; + +extern NDIS_HANDLE ProtHandle, DriverHandle; +extern NDIS_MEDIUM MediumArray[4]; +extern PADAPT pAdaptList; +extern NDIS_SPIN_LOCK GlobalLock; + + +#define ADAPT_MINIPORT_HANDLE(_pAdapt) ((_pAdapt)->MiniportHandle) +#define ADAPT_DECR_PENDING_SENDS(_pAdapt) \ + { \ + NdisAcquireSpinLock(&(_pAdapt)->Lock); \ + (_pAdapt)->OutstandingSends--; \ + NdisReleaseSpinLock(&(_pAdapt)->Lock); \ + } + +// +// Custom Macros to be used by the passthru driver +// +/* +BOOLEAN +IsIMDeviceStateOn( + PADAPT + ) + +*/ +#define IsIMDeviceStateOn(_pP) ((_pP)->MPDeviceState == NdisDeviceStateD0 && (_pP)->PTDeviceState == NdisDeviceStateD0 ) + diff --git a/original_passthru/passthru.htm b/original_passthru/passthru.htm new file mode 100644 index 0000000..ee23278 --- /dev/null +++ b/original_passthru/passthru.htm @@ -0,0 +1,486 @@ + + + + + + + + +passthru + + + + + + + + + + + + +
+ +

+ + + +PASSTHRU.SYS +- Sample NDIS Intermediate Driver

+ +

SUMMARY

+ +

Passthru Intermediate Miniport Driver

+ +

The Passthru +sample is a do-nothing pass-through NDIS 5 driver that demonstrates the basic +principles underlying an NDIS Intermediate Miniport (IM) driver. This driver +exposes a virtual adapter for each binding to a real or virtual NDIS adapter. +Protocols bind to these virtual adapters as if they are real adapters.

+ +

The Passthru +driver re-packages and sends down all requests and sends submitted to this +virtual adapter. The Passthru driver can be modified +to change the data before passing it along. For example, it could +encrypt/compress outgoing and decrypt/decompress incoming data.

+ +

Passthru also re-packages and indicates up +all received data and status indications that it receives at its lower +(protocol) edge.

+ +

BUILDING THE SAMPLE

+ +

Run the build +command from this directory to build the sample—it creates the binary Passthru.sys.

+ +

To install this driver on +Windows® 2000, use the PASSTHRU sample notification object and INFs, also found in this DDK.

+ +

INSTALLING THE SAMPLE

+ +

Passthru is installed as a service (called +“Passthru Driver” in the supplied INFs/notification +object). To install, follow the steps below.

+ +

Prepare a floppy disk (or +installation directory) that contains these files: netsf.inf, +netsf_m.inf and passthru.sys.

+ +

On the desktop, +right-click the My Network Places icon and choose Properties.

+ +

Right-click on the +relevant Local Area Connection icon and choose Properties.

+ +

Click Install, +then Service, then Add, then Have Disk. +

+ +

Browse to the +drive/directory containing the files listed above. Click OK. This should +show “Passthru Driver” in a list of Network Services. +Highlight this and click OK. This should install the Passthru +driver.

+ +

Click OK or Yes each time the system prompts with a warning +regarding installation of unsigned files. This is necessary because binaries +generated via the DDK build environment are not signed.

+ +

Two .INF files are needed +rather than one because Passthru is installed both as +a protocol and a miniport.

+ +

CODE TOUR

+ +

File Manifest

+ +
File           Description
 
Makefile       Used during compilation to create the object and sys files
Miniport.c     Miniport related functions of the passthru driver
Netsf.inf      Installation INF for the service (protocol side installation)
Netsf_m.inf    Installation INF for the miniport (virtual device installation)
Passthru.c     DriverEntry routine and any routines common to the passthru miniport and protocol 
Passthru.h     Prototypes of all functions and data structures used by the Passthru driver
Passthru.htm   Documentation for the Passthru driver (this file)
Passthru.rc    Resource file for the Passthru driver
Precomp.h      Precompile header file
Protocol.c     Protocol related functions of the Passthru driver
Sources        List of source files that are compiled and linked to create the passthru driver. This can be modified to create binaries that operate on previous Windows versions (e.g. Windows 2000).
+ +

Programming Tour

+ +

Basic steps in initializing and +halting the Passthru driver:

+ +

1) During DriverEntry, +the Passthru driver registers as a protocol and an +Intermediate miniport driver.

+ +

2) Later on, NDIS calls Passthru’s BindAdapterHandler, PtBindAdapter, for each underlying NDIS adapter to which it +is configured to bind.

+ +

3) In the context of BindAdapterHandler and after successfully opening a binding +to the underlying adapter, the Passthru driver +queries the reserved keyword "UpperBindings" +to get a list of device names for the virtual adapters that this particular +binding is to expose. Since this driver implements a 1:1 relationship between +lower bindings and virtual adapters, this list contains a single name. “Mux” IM drivers that expose multiple virtual adapters over +a single underlying adapter will process multiple entries in UpperBindings.

+ +

4) For each device name, the Passthru driver calls NdisIMInitializeDeviceInstanceEx.

+ +

5) In response, NDIS will +eventually call back the Passthru miniport’s MiniportInitialize entry point, MPInitialize.

+ +

6) After MPInitialize +successfully returns, NDIS takes care of getting upper-layer protocols to bind +to the newly created virtual adapter(s).

+ +

7) All requests and sends coming +from upper-layer protocols for the Passthru miniport +driver are repackaged and sent down to NDIS, to be passed to the underlying +NDIS adapter.

+ +

8) All indications arriving from +bindings to an underlying NDIS adapter are forwarded up as if they were generated +from Passthru’s virtual adapters.

+ +

9) NDIS calls the Passthru driver’s ProtocolUnbind +entry point to request it to close the binding between an underlying adapter +and the Passthru protocol. In processing this, the Passthru driver first calls NdisIMDeInitializeDeviceInstance +for the virtual adapter(s) representing that particular binding.

+ +

10) NDIS in turn will close all +the bindings between upper-layer protocols and the virtual Passthru +adapter.

+ +

11) After all the bindings are +closed, NDIS calls the Passthru driver’s MiniportHalt entry point (MPHalt) +for the virtual adapter.

+ +

12) The Passthru +protocol then closes the binding to the underlying adapter by calling NdisCloseAdapter, and completes the unbind request issued +in step 9.

+ +

13) Handling Power Management

+ +

13.1 During initialization, the Passthru miniport should set the Attribute 'NDIS_ATTRIBUTE_NO_HALT_ON_SUSPEND' +in its call to NdisMSetAttributesEx.

+ +

13.2 When the Passthru +miniport is requested to report its Plug and Play capabilities +(OID_PNP_CAPABILITIES), the Passthru miniport must +pass the request to the underlying miniport. If this request succeeds, then the +Passthru miniport should overwrite the following +fields before successfully completing the original request:

+ +

NDIS_DEVICE_POWER_STATE          MinMagicPacketWakeUp += NdisDeviceStateUnspecified;

+ +

NDIS_DEVICE_POWER_STATE          MinPatternWakeUp= +NdisDeviceStateUnspecified;

+ +

NDIS_DEVICE_POWER_STATE          MinLinkChangeWakeUp=NdisDeviceStateUnspecified

+ +

If the miniport below the Passthru protocol fails this request, then the status that +was returned should be used to respond to the original request that was made to +the Passthru miniport.

+ +

13.3 OID_PNP_SET_POWER and OID_PNP_QUERY_POWER +should not be passed to the miniport below the Passthru +protocol, as those miniports will receive independent +requests from NDIS.

+ +

13.4 NDIS calls the Passthru driver’s ProtocolPnPEvent +entry point (PtPnPHandler) whenever the underlying adapter +is transitioned to a different power state. If the underlying adapter is +transitioning to a low power state, the IM driver should wait for all +outstanding sends and requests to complete.

+ +

14) NDIS 5.1 Features

+ +

14.1 All NDIS 5.1 features in Passthru are identified by #ifdef +NDIS51 compiler directives. The following major features are illustrated (refer +to the DDK documentation for more information on these):

+ +

Packet stacking: this allows an IM driver to +reuse a packet submitted to its protocol or miniport edge to forward data down +(or up) to the adjacent layer.

+ +

Canceling Sends: Passthru +propagates send cancellations from protocols above it to lower miniports.

+ +

PnP Event Propagation: Passthru +propagates PnP events arriving at its protocol (lower) edge to higher layer +protocols that are bound to its virtual adapter.

+ +

NdisQueryPendingIOCount: Passthru +uses this new API to determine if any I/O operations are in progress on its +lower binding.

+ +

15) For Win2K SP2 and WinXP, the Passthru sample no +longer requires a Notify Object. The Notify Object has been removed.

+ +

 

+ +

Top of page

+ + + + + +
+

 

+
+ +

© 1999 Microsoft +Corporation

+ +
+ + + + + diff --git a/original_passthru/passthru.rc b/original_passthru/passthru.rc new file mode 100644 index 0000000..6ae427c --- /dev/null +++ b/original_passthru/passthru.rc @@ -0,0 +1,41 @@ +#include +#include + +/*-----------------------------------------------*/ +/* the following lines are specific to this file */ +/*-----------------------------------------------*/ + +/* VER_FILETYPE, VER_FILESUBTYPE, VER_FILEDESCRIPTION_STR + * and VER_INTERNALNAME_STR must be defined before including COMMON.VER + * The strings don't need a '\0', since common.ver has them. + */ +#define VER_FILETYPE VFT_DRV +/* possible values: VFT_UNKNOWN + VFT_APP + VFT_DLL + VFT_DRV + VFT_FONT + VFT_VXD + VFT_STATIC_LIB +*/ +#define VER_FILESUBTYPE VFT2_DRV_NETWORK +/* possible values VFT2_UNKNOWN + VFT2_DRV_PRINTER + VFT2_DRV_KEYBOARD + VFT2_DRV_LANGUAGE + VFT2_DRV_DISPLAY + VFT2_DRV_MOUSE + VFT2_DRV_NETWORK + VFT2_DRV_SYSTEM + VFT2_DRV_INSTALLABLE + VFT2_DRV_SOUND + VFT2_DRV_COMM +*/ +#define VER_FILEDESCRIPTION_STR "Sample NDIS 4.0 Intermediate Miniport Driver" +#define VER_INTERNALNAME_STR "PASSTHRU.SYS" +#define VER_ORIGINALFILENAME_STR "PASSTHRU.SYS" +#define VER_LANGNEUTRAL + +#include "common.ver" + + diff --git a/original_passthru/precomp.h b/original_passthru/precomp.h new file mode 100644 index 0000000..b2870d1 --- /dev/null +++ b/original_passthru/precomp.h @@ -0,0 +1,11 @@ +#pragma warning(disable:4214) // bit field types other than int + +#pragma warning(disable:4201) // nameless struct/union +#pragma warning(disable:4115) // named type definition in parentheses +#pragma warning(disable:4127) // conditional expression is constant +#pragma warning(disable:4054) // cast of function pointer to PVOID +#pragma warning(disable:4244) // conversion from 'int' to 'BOOLEAN', possible loss of data + +#include +#include "passthru.h" + diff --git a/original_passthru/protocol.c b/original_passthru/protocol.c new file mode 100644 index 0000000..213924c --- /dev/null +++ 
b/original_passthru/protocol.c @@ -0,0 +1,1626 @@ +/*++ + +Copyright(c) 1992-2000 Microsoft Corporation + +Module Name: + + protocol.c + +Abstract: + + Ndis Intermediate Miniport driver sample. This is a passthru driver. + +Author: + +Environment: + + +Revision History: + + +--*/ + + +#include "precomp.h" +#pragma hdrstop + +#define MAX_PACKET_POOL_SIZE 0x0000FFFF +#define MIN_PACKET_POOL_SIZE 0x000000FF + +// +// NDIS version as 0xMMMMmmmm, where M=Major/m=minor (0x00050001 = 5.1); +// initially unknown (0) +// +ULONG NdisDotSysVersion = 0x0; + + +#define NDIS_SYS_VERSION_51 0x00050001 + + +VOID +PtBindAdapter( + OUT PNDIS_STATUS Status, + IN NDIS_HANDLE BindContext, + IN PNDIS_STRING DeviceName, + IN PVOID SystemSpecific1, + IN PVOID SystemSpecific2 + ) +/*++ + +Routine Description: + + Called by NDIS to bind to a miniport below. + +Arguments: + + Status - Return status of bind here. + BindContext - Can be passed to NdisCompleteBindAdapter if this call is pended. + DeviceName - Device name to bind to. This is passed to NdisOpenAdapter. + SystemSpecific1 - Can be passed to NdisOpenProtocolConfiguration to read per-binding information + SystemSpecific2 - Unused + +Return Value: + + NDIS_STATUS_PENDING if this call is pended. In this case call NdisCompleteBindAdapter + to complete. + Anything else Completes this call synchronously + +--*/ +{ + NDIS_HANDLE ConfigHandle = NULL; + PNDIS_CONFIGURATION_PARAMETER Param; + NDIS_STRING DeviceStr = NDIS_STRING_CONST("UpperBindings"); + NDIS_STRING NdisVersionStr = NDIS_STRING_CONST("NdisVersion"); + PADAPT pAdapt = NULL; + NDIS_STATUS Sts; + UINT MediumIndex; + ULONG TotalSize; + BOOLEAN NoCleanUpNeeded = FALSE; + + + UNREFERENCED_PARAMETER(BindContext); + UNREFERENCED_PARAMETER(SystemSpecific2); + + DBGPRINT(("==> Protocol BindAdapter\n")); + + do + { + // + // Access the configuration section for our binding-specific + // parameters. 
+ // + NdisOpenProtocolConfiguration(Status, + &ConfigHandle, + SystemSpecific1); + + if (*Status != NDIS_STATUS_SUCCESS) + { + break; + } + if (NdisDotSysVersion == 0) + { + NdisReadConfiguration(Status, + &Param, + ConfigHandle, + &NdisVersionStr, // "NdisVersion" + NdisParameterInteger); + if (*Status != NDIS_STATUS_SUCCESS) + { + break; + } + + NdisDotSysVersion = Param->ParameterData.IntegerData; + } + + + // + // Read the "UpperBindings" reserved key that contains a list + // of device names representing our miniport instances corresponding + // to this lower binding. Since this is a 1:1 IM driver, this key + // contains exactly one name. + // + // If we want to implement a N:1 mux driver (N adapter instances + // over a single lower binding), then UpperBindings will be a + // MULTI_SZ containing a list of device names - we would loop through + // this list, calling NdisIMInitializeDeviceInstanceEx once for + // each name in it. + // + NdisReadConfiguration(Status, + &Param, + ConfigHandle, + &DeviceStr, + NdisParameterString); + if (*Status != NDIS_STATUS_SUCCESS) + { + break; + } + + // + // Allocate memory for the Adapter structure. This represents both the + // protocol context as well as the adapter structure when the miniport + // is initialized. + // + // In addition to the base structure, allocate space for the device + // instance string. + // + TotalSize = sizeof(ADAPT) + Param->ParameterData.StringData.MaximumLength; + + NdisAllocateMemoryWithTag(&pAdapt, TotalSize, TAG); + + if (pAdapt == NULL) + { + *Status = NDIS_STATUS_RESOURCES; + break; + } + + // + // Initialize the adapter structure. We copy in the IM device + // name as well, because we may need to use it in a call to + // NdisIMCancelInitializeDeviceInstance. The string returned + // by NdisReadConfiguration is active (i.e. available) only + // for the duration of this call to our BindAdapter handler. 
+ // + NdisZeroMemory(pAdapt, TotalSize); + pAdapt->DeviceName.MaximumLength = Param->ParameterData.StringData.MaximumLength; + pAdapt->DeviceName.Length = Param->ParameterData.StringData.Length; + pAdapt->DeviceName.Buffer = (PWCHAR)((ULONG_PTR)pAdapt + sizeof(ADAPT)); + NdisMoveMemory(pAdapt->DeviceName.Buffer, + Param->ParameterData.StringData.Buffer, + Param->ParameterData.StringData.MaximumLength); + + + + NdisInitializeEvent(&pAdapt->Event); + NdisAllocateSpinLock(&pAdapt->Lock); + + // + // Allocate a packet pool for sends. We need this to pass sends down. + // We cannot use the same packet descriptor that came down to our send + // handler (see also NDIS 5.1 packet stacking). + // + NdisAllocatePacketPoolEx(Status, + &pAdapt->SendPacketPoolHandle, + MIN_PACKET_POOL_SIZE, + MAX_PACKET_POOL_SIZE - MIN_PACKET_POOL_SIZE, + sizeof(SEND_RSVD)); + + if (*Status != NDIS_STATUS_SUCCESS) + { + break; + } + + // + // Allocate a packet pool for receives. We need this to indicate receives. + // Same consideration as sends (see also NDIS 5.1 packet stacking). + // + NdisAllocatePacketPoolEx(Status, + &pAdapt->RecvPacketPoolHandle, + MIN_PACKET_POOL_SIZE, + MAX_PACKET_POOL_SIZE - MIN_PACKET_POOL_SIZE, + PROTOCOL_RESERVED_SIZE_IN_PACKET); + + if (*Status != NDIS_STATUS_SUCCESS) + { + break; + } + + // + // Now open the adapter below and complete the initialization + // + NdisOpenAdapter(Status, + &Sts, + &pAdapt->BindingHandle, + &MediumIndex, + MediumArray, + sizeof(MediumArray)/sizeof(NDIS_MEDIUM), + ProtHandle, + pAdapt, + DeviceName, + 0, + NULL); + + if (*Status == NDIS_STATUS_PENDING) + { + NdisWaitEvent(&pAdapt->Event, 0); + *Status = pAdapt->Status; + } + + if (*Status != NDIS_STATUS_SUCCESS) + { + break; + } + PtReferenceAdapt(pAdapt); + +#pragma prefast(suppress: __WARNING_POTENTIAL_BUFFER_OVERFLOW, "Ndis guarantees MediumIndex to be within bounds"); + pAdapt->Medium = MediumArray[MediumIndex]; + + // + // Now ask NDIS to initialize our miniport (upper) edge. 
+ // Set the flag below to synchronize with a possible call + // to our protocol Unbind handler that may come in before + // our miniport initialization happens. + // + pAdapt->MiniportInitPending = TRUE; + NdisInitializeEvent(&pAdapt->MiniportInitEvent); + + PtReferenceAdapt(pAdapt); + + *Status = NdisIMInitializeDeviceInstanceEx(DriverHandle, + &pAdapt->DeviceName, + pAdapt); + + if (*Status != NDIS_STATUS_SUCCESS) + { + if (pAdapt->MiniportIsHalted == TRUE) + { + NoCleanUpNeeded = TRUE; + } + + DBGPRINT(("BindAdapter: Adapt %p, IMInitializeDeviceInstance error %x\n", + pAdapt, *Status)); + + if (PtDereferenceAdapt(pAdapt)) + { + pAdapt = NULL; + } + + break; + } + + PtDereferenceAdapt(pAdapt); + + } while(FALSE); + + // + // Close the configuration handle now - see comments above with + // the call to NdisIMInitializeDeviceInstanceEx. + // + if (ConfigHandle != NULL) + { + NdisCloseConfiguration(ConfigHandle); + } + + if ((*Status != NDIS_STATUS_SUCCESS) && (NoCleanUpNeeded == FALSE)) + { + if (pAdapt != NULL) + { + if (pAdapt->BindingHandle != NULL) + { + NDIS_STATUS LocalStatus; + + // + // Close the binding we opened above. + // + + NdisResetEvent(&pAdapt->Event); + + NdisCloseAdapter(&LocalStatus, pAdapt->BindingHandle); + pAdapt->BindingHandle = NULL; + + if (LocalStatus == NDIS_STATUS_PENDING) + { + NdisWaitEvent(&pAdapt->Event, 0); + LocalStatus = pAdapt->Status; + + + } + if (PtDereferenceAdapt(pAdapt)) + { + pAdapt = NULL; + } + } + } + } + + + DBGPRINT(("<== Protocol BindAdapter: pAdapt %p, Status %x\n", pAdapt, *Status)); +} + + +VOID +PtOpenAdapterComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS Status, + IN NDIS_STATUS OpenErrorStatus + ) +/*++ + +Routine Description: + + Completion routine for NdisOpenAdapter issued from within the PtBindAdapter. Simply + unblock the caller. 
+ +Arguments: + + ProtocolBindingContext Pointer to the adapter + Status Status of the NdisOpenAdapter call + OpenErrorStatus Secondary status(ignored by us). + +Return Value: + + None + +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + + UNREFERENCED_PARAMETER(OpenErrorStatus); + + DBGPRINT(("==> PtOpenAdapterComplete: Adapt %p, Status %x\n", pAdapt, Status)); + pAdapt->Status = Status; + NdisSetEvent(&pAdapt->Event); +} + + +VOID +PtUnbindAdapter( + OUT PNDIS_STATUS Status, + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_HANDLE UnbindContext + ) +/*++ + +Routine Description: + + Called by NDIS when we are required to unbind to the adapter below. + This functions shares functionality with the miniport's HaltHandler. + The code should ensure that NdisCloseAdapter and NdisFreeMemory is called + only once between the two functions + +Arguments: + + Status Placeholder for return status + ProtocolBindingContext Pointer to the adapter structure + UnbindContext Context for NdisUnbindComplete() if this pends + +Return Value: + + Status for NdisIMDeinitializeDeviceContext + +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + NDIS_STATUS LocalStatus; + + UNREFERENCED_PARAMETER(UnbindContext); + + DBGPRINT(("==> PtUnbindAdapter: Adapt %p\n", pAdapt)); + + // + // Set the flag that the miniport below is unbinding, so the request handlers will + // fail any request comming later + // + NdisAcquireSpinLock(&pAdapt->Lock); + pAdapt->UnbindingInProcess = TRUE; + if (pAdapt->QueuedRequest == TRUE) + { + pAdapt->QueuedRequest = FALSE; + NdisReleaseSpinLock(&pAdapt->Lock); + + PtRequestComplete(pAdapt, + &pAdapt->Request, + NDIS_STATUS_FAILURE ); + + } + else + { + NdisReleaseSpinLock(&pAdapt->Lock); + } +#ifndef WIN9X + // + // Check if we had called NdisIMInitializeDeviceInstanceEx and + // we are awaiting a call to MiniportInitialize. + // + if (pAdapt->MiniportInitPending == TRUE) + { + // + // Try to cancel the pending IMInit process. 
+ // + LocalStatus = NdisIMCancelInitializeDeviceInstance( + DriverHandle, + &pAdapt->DeviceName); + + if (LocalStatus == NDIS_STATUS_SUCCESS) + { + // + // Successfully cancelled IM Initialization; our + // Miniport Initialize routine will not be called + // for this device. + // + pAdapt->MiniportInitPending = FALSE; + ASSERT(pAdapt->MiniportHandle == NULL); + } + else + { + // + // Our Miniport Initialize routine will be called + // (may be running on another thread at this time). + // Wait for it to finish. + // + NdisWaitEvent(&pAdapt->MiniportInitEvent, 0); + ASSERT(pAdapt->MiniportInitPending == FALSE); + } + + } +#endif // !WIN9X + + // + // Call NDIS to remove our device-instance. We do most of the work + // inside the HaltHandler. + // + // The Handle will be NULL if our miniport Halt Handler has been called or + // if the IM device was never initialized + // + + if (pAdapt->MiniportHandle != NULL) + { + *Status = NdisIMDeInitializeDeviceInstance(pAdapt->MiniportHandle); + + if (*Status != NDIS_STATUS_SUCCESS) + { + *Status = NDIS_STATUS_FAILURE; + } + } + else + { + // + // We need to do some work here. + // Close the binding below us + // and release the memory allocated. + // + + if(pAdapt->BindingHandle != NULL) + { + NdisResetEvent(&pAdapt->Event); + + NdisCloseAdapter(Status, pAdapt->BindingHandle); + + // + // Wait for it to complete + // + if(*Status == NDIS_STATUS_PENDING) + { + NdisWaitEvent(&pAdapt->Event, 0); + *Status = pAdapt->Status; + } + pAdapt->BindingHandle = NULL; + } + else + { + // + // Both Our MiniportHandle and Binding Handle should not be NULL. 
+ // + *Status = NDIS_STATUS_FAILURE; + ASSERT(0); + } + + // + // Free the memory here, if was not released earlier(by calling the HaltHandler) + // + MPFreeAllPacketPools(pAdapt); + NdisFreeSpinLock(&pAdapt->Lock); + NdisFreeMemory(pAdapt, 0, 0); + } + + DBGPRINT(("<== PtUnbindAdapter: Adapt %p\n", pAdapt)); +} + +VOID +PtUnloadProtocol( + VOID +) +{ + NDIS_STATUS Status; + + if (ProtHandle != NULL) + { + NdisDeregisterProtocol(&Status, ProtHandle); + ProtHandle = NULL; + } + + DBGPRINT(("PtUnloadProtocol: done!\n")); +} + + + +VOID +PtCloseAdapterComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS Status + ) +/*++ + +Routine Description: + + Completion for the CloseAdapter call. + +Arguments: + + ProtocolBindingContext Pointer to the adapter structure + Status Completion status + +Return Value: + + None. + +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + + DBGPRINT(("CloseAdapterComplete: Adapt %p, Status %x\n", pAdapt, Status)); + pAdapt->Status = Status; + NdisSetEvent(&pAdapt->Event); +} + + +VOID +PtResetComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS Status + ) +/*++ + +Routine Description: + + Completion for the reset. + +Arguments: + + ProtocolBindingContext Pointer to the adapter structure + Status Completion status + +Return Value: + + None. + +--*/ +{ + + UNREFERENCED_PARAMETER(ProtocolBindingContext); + UNREFERENCED_PARAMETER(Status); + // + // We never issue a reset, so we should not be here. + // + ASSERT(0); +} + + +VOID +PtRequestComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_REQUEST NdisRequest, + IN NDIS_STATUS Status + ) +/*++ + +Routine Description: + + Completion handler for the previously posted request. All OIDS + are completed by and sent to the same miniport that they were requested for. 
+ If Oid == OID_PNP_QUERY_POWER then the data structure needs to returned with all entries = + NdisDeviceStateUnspecified + +Arguments: + + ProtocolBindingContext Pointer to the adapter structure + NdisRequest The posted request + Status Completion status + +Return Value: + + None + +--*/ +{ + PADAPT pAdapt = (PADAPT)ProtocolBindingContext; + NDIS_OID Oid = pAdapt->Request.DATA.SET_INFORMATION.Oid ; + + // + // Since our request is not outstanding anymore + // + ASSERT(pAdapt->OutstandingRequests == TRUE); + + pAdapt->OutstandingRequests = FALSE; + + // + // Complete the Set or Query, and fill in the buffer for OID_PNP_CAPABILITIES, if need be. + // + switch (NdisRequest->RequestType) + { + case NdisRequestQueryInformation: + + // + // We never pass OID_PNP_QUERY_POWER down. + // + ASSERT(Oid != OID_PNP_QUERY_POWER); + + if ((Oid == OID_PNP_CAPABILITIES) && (Status == NDIS_STATUS_SUCCESS)) + { + MPQueryPNPCapabilities(pAdapt, &Status); + } + *pAdapt->BytesReadOrWritten = NdisRequest->DATA.QUERY_INFORMATION.BytesWritten; + *pAdapt->BytesNeeded = NdisRequest->DATA.QUERY_INFORMATION.BytesNeeded; + + if (((Oid == OID_GEN_MAC_OPTIONS) + && (Status == NDIS_STATUS_SUCCESS)) + && (NdisDotSysVersion >= NDIS_SYS_VERSION_51)) + { + // + // Only do this on Windows XP or greater (NDIS.SYS v 5.1); + // do not do in Windows 2000 (NDIS.SYS v 5.0)) + // + + // + // Remove the no-loopback bit from mac-options. In essence we are + // telling NDIS that we can handle loopback. We don't, but the + // interface below us does. If we do not do this, then loopback + // processing happens both below us and above us. This is wasteful + // at best and if Netmon is running, it will see multiple copies + // of loopback packets when sniffing above us. + // + // Only the lowest miniport is a stack of layered miniports should + // ever report this bit set to NDIS. 
+ // + *(PULONG)NdisRequest->DATA.QUERY_INFORMATION.InformationBuffer &= ~NDIS_MAC_OPTION_NO_LOOPBACK; + } + + NdisMQueryInformationComplete(pAdapt->MiniportHandle, + Status); + break; + + case NdisRequestSetInformation: + + ASSERT( Oid != OID_PNP_SET_POWER); + + *pAdapt->BytesReadOrWritten = NdisRequest->DATA.SET_INFORMATION.BytesRead; + *pAdapt->BytesNeeded = NdisRequest->DATA.SET_INFORMATION.BytesNeeded; + NdisMSetInformationComplete(pAdapt->MiniportHandle, + Status); + break; + + default: + ASSERT(0); + break; + } + +} + + +VOID +PtStatus( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_STATUS GeneralStatus, + IN PVOID StatusBuffer, + IN UINT StatusBufferSize + ) +/*++ + +Routine Description: + + Status handler for the lower-edge(protocol). + +Arguments: + + ProtocolBindingContext Pointer to the adapter structure + GeneralStatus Status code + StatusBuffer Status buffer + StatusBufferSize Size of the status buffer + +Return Value: + + None + +--*/ +{ + PADAPT pAdapt = (PADAPT)ProtocolBindingContext; + + // + // Pass up this indication only if the upper edge miniport is initialized + // and powered on. Also ignore indications that might be sent by the lower + // miniport when it isn't at D0. 
+ // + if ((pAdapt->MiniportHandle != NULL) && + (pAdapt->MPDeviceState == NdisDeviceStateD0) && + (pAdapt->PTDeviceState == NdisDeviceStateD0)) + { + if ((GeneralStatus == NDIS_STATUS_MEDIA_CONNECT) || + (GeneralStatus == NDIS_STATUS_MEDIA_DISCONNECT)) + { + + pAdapt->LastIndicatedStatus = GeneralStatus; + } + NdisMIndicateStatus(pAdapt->MiniportHandle, + GeneralStatus, + StatusBuffer, + StatusBufferSize); + } + // + // Save the last indicated media status + // + else + { + if ((pAdapt->MiniportHandle != NULL) && + ((GeneralStatus == NDIS_STATUS_MEDIA_CONNECT) || + (GeneralStatus == NDIS_STATUS_MEDIA_DISCONNECT))) + { + pAdapt->LatestUnIndicateStatus = GeneralStatus; + } + } + +} + + +VOID +PtStatusComplete( + IN NDIS_HANDLE ProtocolBindingContext + ) +/*++ + +Routine Description: + + +Arguments: + + +Return Value: + + +--*/ +{ + PADAPT pAdapt = (PADAPT)ProtocolBindingContext; + + // + // Pass up this indication only if the upper edge miniport is initialized + // and powered on. Also ignore indications that might be sent by the lower + // miniport when it isn't at D0. + // + if ((pAdapt->MiniportHandle != NULL) && + (pAdapt->MPDeviceState == NdisDeviceStateD0) && + (pAdapt->PTDeviceState == NdisDeviceStateD0)) + { + NdisMIndicateStatusComplete(pAdapt->MiniportHandle); + } +} + + +VOID +PtSendComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_PACKET Packet, + IN NDIS_STATUS Status + ) +/*++ + +Routine Description: + + Called by NDIS when the miniport below had completed a send. We should + complete the corresponding upper-edge send this represents. + +Arguments: + + ProtocolBindingContext - Points to ADAPT structure + Packet - Low level packet being completed + Status - status of send + +Return Value: + + None + +--*/ +{ + PADAPT pAdapt = (PADAPT)ProtocolBindingContext; + PNDIS_PACKET Pkt; + NDIS_HANDLE PoolHandle; + +#ifdef NDIS51 + // + // Packet stacking: + // + // Determine if the packet we are completing is the one we allocated. 
If so, then + // get the original packet from the reserved area and completed it and free the + // allocated packet. If this is the packet that was sent down to us, then just + // complete it + // + PoolHandle = NdisGetPoolFromPacket(Packet); + if (PoolHandle != pAdapt->SendPacketPoolHandle) + { + // + // We had passed down a packet belonging to the protocol above us. + // + // DBGPRINT(("PtSendComp: Adapt %p, Stacked Packet %p\n", pAdapt, Packet)); + + NdisMSendComplete(pAdapt->MiniportHandle, + Packet, + Status); + } + else +#endif // NDIS51 + { + PSEND_RSVD SendRsvd; + + SendRsvd = (PSEND_RSVD)(Packet->ProtocolReserved); + Pkt = SendRsvd->OriginalPkt; + +#ifndef WIN9X + NdisIMCopySendCompletePerPacketInfo (Pkt, Packet); +#endif + + NdisDprFreePacket(Packet); + + NdisMSendComplete(pAdapt->MiniportHandle, + Pkt, + Status); + } + // + // Decrease the outstanding send count + // + ADAPT_DECR_PENDING_SENDS(pAdapt); +} + + +VOID +PtTransferDataComplete( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_PACKET Packet, + IN NDIS_STATUS Status, + IN UINT BytesTransferred + ) +/*++ + +Routine Description: + + Entry point called by NDIS to indicate completion of a call by us + to NdisTransferData. + + See notes under SendComplete. + +Arguments: + +Return Value: + +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + + if(pAdapt->MiniportHandle) + { + NdisMTransferDataComplete(pAdapt->MiniportHandle, + Packet, + Status, + BytesTransferred); + } +} + + +NDIS_STATUS +PtReceive( + IN NDIS_HANDLE ProtocolBindingContext, + IN NDIS_HANDLE MacReceiveContext, + IN PVOID HeaderBuffer, + IN UINT HeaderBufferSize, + IN PVOID LookAheadBuffer, + IN UINT LookAheadBufferSize, + IN UINT PacketSize + ) +/*++ + +Routine Description: + + Handle receive data indicated up by the miniport below. We pass + it along to the protocol above us. + + If the miniport below indicates packets, NDIS would more + likely call us at our ReceivePacket handler. 
However we + might be called here in certain situations even though + the miniport below has indicated a receive packet, e.g. + if the miniport had set packet status to NDIS_STATUS_RESOURCES. + +Arguments: + + + +Return Value: + + NDIS_STATUS_SUCCESS if we processed the receive successfully, + NDIS_STATUS_XXX error code if we discarded it. + +--*/ +{ + PADAPT pAdapt = (PADAPT)ProtocolBindingContext; + PNDIS_PACKET MyPacket, Packet = NULL; + NDIS_STATUS Status = NDIS_STATUS_SUCCESS; + ULONG Proc = KeGetCurrentProcessorNumber(); + + if ((!pAdapt->MiniportHandle) || (pAdapt->MPDeviceState > NdisDeviceStateD0)) + { + Status = NDIS_STATUS_FAILURE; + } + else do + { + // + // Get at the packet, if any, indicated up by the miniport below. + // + Packet = NdisGetReceivedPacket(pAdapt->BindingHandle, MacReceiveContext); + if (Packet != NULL) + { + // + // The miniport below did indicate up a packet. Use information + // from that packet to construct a new packet to indicate up. + // + +#ifdef NDIS51 + // + // NDIS 5.1 NOTE: Do not reuse the original packet in indicating + // up a receive, even if there is sufficient packet stack space. + // If we had to do so, we would have had to overwrite the + // status field in the original packet to NDIS_STATUS_RESOURCES, + // and it is not allowed for protocols to overwrite this field + // in received packets. + // +#endif // NDIS51 + + // + // Get a packet off the pool and indicate that up + // + NdisDprAllocatePacket(&Status, + &MyPacket, + pAdapt->RecvPacketPoolHandle); + + if (Status == NDIS_STATUS_SUCCESS) + { + // + // Make our packet point to data from the original + // packet. NOTE: this works only because we are + // indicating a receive directly from the context of + // our receive indication. If we need to queue this + // packet and indicate it from another thread context, + // we will also have to allocate a new buffer and copy + // over the packet contents, OOB data and per-packet + // information. 
This is because the packet data + // is available only for the duration of this + // receive indication call. + // + NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet); + NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet); + + // + // Get the original packet (it could be the same packet as the + // one received or a different one based on the number of layered + // miniports below) and set it on the indicated packet so the OOB + // data is visible correctly at protocols above. If the IM driver + // modifies the packet in any way it should not set the new packet's + // original packet equal to the original packet of the packet that + // was indicated to it from the underlying driver, in this case, the + // IM driver should also ensure that the related per packet info should + // be copied to the new packet. + // we can set the original packet to the original packet of the packet + // indicated from the underlying driver because the driver doesn't modify + // the data content in the packet. + // + NDIS_SET_ORIGINAL_PACKET(MyPacket, NDIS_GET_ORIGINAL_PACKET(Packet)); + NDIS_SET_PACKET_HEADER_SIZE(MyPacket, HeaderBufferSize); + + // + // Copy packet flags. + // + NdisGetPacketFlags(MyPacket) = NdisGetPacketFlags(Packet); + + // + // Force protocols above to make a copy if they want to hang + // on to data in this packet. This is because we are in our + // Receive handler (not ReceivePacket) and we can't return a + // ref count from here. + // + NDIS_SET_PACKET_STATUS(MyPacket, NDIS_STATUS_RESOURCES); + + // + // By setting NDIS_STATUS_RESOURCES, we also know that we can reclaim + // this packet as soon as the call to NdisMIndicateReceivePacket + // returns. + // + + if (pAdapt->MiniportHandle != NULL) + { + NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &MyPacket, 1); + } + + // + // Reclaim the indicated packet. 
Since we had set its status + // to NDIS_STATUS_RESOURCES, we are guaranteed that protocols + // above are done with it. + // + NdisDprFreePacket(MyPacket); + + break; + } + } + else + { + // + // The miniport below us uses the old-style (not packet) + // receive indication. Fall through. + // + } + + // + // Fall through if the miniport below us has either not + // indicated a packet or we could not allocate one + // + pAdapt->ReceivedIndicationFlags[Proc] = TRUE; + if (pAdapt->MiniportHandle == NULL) + { + break; + } + switch (pAdapt->Medium) + { + case NdisMedium802_3: + case NdisMediumWan: + NdisMEthIndicateReceive(pAdapt->MiniportHandle, + MacReceiveContext, + HeaderBuffer, + HeaderBufferSize, + LookAheadBuffer, + LookAheadBufferSize, + PacketSize); + break; + + case NdisMedium802_5: + NdisMTrIndicateReceive(pAdapt->MiniportHandle, + MacReceiveContext, + HeaderBuffer, + HeaderBufferSize, + LookAheadBuffer, + LookAheadBufferSize, + PacketSize); + break; + +#if FDDI + case NdisMediumFddi: + NdisMFddiIndicateReceive(pAdapt->MiniportHandle, + MacReceiveContext, + HeaderBuffer, + HeaderBufferSize, + LookAheadBuffer, + LookAheadBufferSize, + PacketSize); + break; +#endif + default: + ASSERT(FALSE); + break; + } + + } while(FALSE); + + return Status; +} + + +VOID +PtReceiveComplete( + IN NDIS_HANDLE ProtocolBindingContext + ) +/*++ + +Routine Description: + + Called by the adapter below us when it is done indicating a batch of + received packets. + +Arguments: + + ProtocolBindingContext Pointer to our adapter structure. 
+ +Return Value: + + None + +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + ULONG Proc = KeGetCurrentProcessorNumber(); + + if (((pAdapt->MiniportHandle != NULL) + && (pAdapt->MPDeviceState == NdisDeviceStateD0)) + && (pAdapt->ReceivedIndicationFlags[Proc])) + { + switch (pAdapt->Medium) + { + case NdisMedium802_3: + case NdisMediumWan: + NdisMEthIndicateReceiveComplete(pAdapt->MiniportHandle); + break; + + case NdisMedium802_5: + NdisMTrIndicateReceiveComplete(pAdapt->MiniportHandle); + break; +#if FDDI + case NdisMediumFddi: + NdisMFddiIndicateReceiveComplete(pAdapt->MiniportHandle); + break; +#endif + default: + ASSERT(FALSE); + break; + } + } + + pAdapt->ReceivedIndicationFlags[Proc] = FALSE; +} + + +INT +PtReceivePacket( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNDIS_PACKET Packet + ) +/*++ + +Routine Description: + + ReceivePacket handler. Called by NDIS if the miniport below supports + NDIS 4.0 style receives. Re-package the buffer chain in a new packet + and indicate the new packet to protocols above us. Any context for + packets indicated up must be kept in the MiniportReserved field. + + NDIS 5.1 - packet stacking - if there is sufficient "stack space" in + the packet passed to us, we can use the same packet in a receive + indication. + +Arguments: + + ProtocolBindingContext - Pointer to our adapter structure. + Packet - Pointer to the packet + +Return Value: + + == 0 -> We are done with the packet + != 0 -> We will keep the packet and call NdisReturnPackets() this + many times when done. +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + NDIS_STATUS Status; + PNDIS_PACKET MyPacket; + BOOLEAN Remaining; + + // + // Drop the packet silently if the upper miniport edge isn't initialized or + // the miniport edge is in low power state + // + if ((!pAdapt->MiniportHandle) || (pAdapt->MPDeviceState > NdisDeviceStateD0)) + { + return 0; + } + +#ifdef NDIS51 + // + // Check if we can reuse the same packet for indicating up. 
+ // See also: PtReceive(). + // + (VOID)NdisIMGetCurrentPacketStack(Packet, &Remaining); + if (Remaining) + { + // + // We can reuse "Packet". Indicate it up and be done with it. + // + Status = NDIS_GET_PACKET_STATUS(Packet); + NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &Packet, 1); + return((Status != NDIS_STATUS_RESOURCES) ? 1 : 0); + } +#endif // NDIS51 + + // + // Get a packet off the pool and indicate that up + // + NdisDprAllocatePacket(&Status, + &MyPacket, + pAdapt->RecvPacketPoolHandle); + + if (Status == NDIS_STATUS_SUCCESS) + { + PRECV_RSVD RecvRsvd; + + RecvRsvd = (PRECV_RSVD)(MyPacket->MiniportReserved); + RecvRsvd->OriginalPkt = Packet; + + NDIS_PACKET_FIRST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_FIRST_NDIS_BUFFER(Packet); + NDIS_PACKET_LAST_NDIS_BUFFER(MyPacket) = NDIS_PACKET_LAST_NDIS_BUFFER(Packet); + + // + // Get the original packet (it could be the same packet as the one + // received or a different one based on the number of layered miniports + // below) and set it on the indicated packet so the OOB data is visible + // correctly to protocols above us. + // + NDIS_SET_ORIGINAL_PACKET(MyPacket, NDIS_GET_ORIGINAL_PACKET(Packet)); + + // + // Set Packet Flags + // + NdisGetPacketFlags(MyPacket) = NdisGetPacketFlags(Packet); + + Status = NDIS_GET_PACKET_STATUS(Packet); + + NDIS_SET_PACKET_STATUS(MyPacket, Status); + NDIS_SET_PACKET_HEADER_SIZE(MyPacket, NDIS_GET_PACKET_HEADER_SIZE(Packet)); + + if (pAdapt->MiniportHandle != NULL) + { + NdisMIndicateReceivePacket(pAdapt->MiniportHandle, &MyPacket, 1); + } + + // + // Check if we had indicated up the packet with NDIS_STATUS_RESOURCES + // NOTE -- do not use NDIS_GET_PACKET_STATUS(MyPacket) for this since + // it might have changed! Use the value saved in the local variable. + // + if (Status == NDIS_STATUS_RESOURCES) + { + // + // Our ReturnPackets handler will not be called for this packet. + // We should reclaim it right here. 
+ // + NdisDprFreePacket(MyPacket); + } + + return((Status != NDIS_STATUS_RESOURCES) ? 1 : 0); + } + else + { + // + // We are out of packets. Silently drop it. + // + return(0); + } +} + + +NDIS_STATUS +PtPNPHandler( + IN NDIS_HANDLE ProtocolBindingContext, + IN PNET_PNP_EVENT pNetPnPEvent + ) + +/*++ +Routine Description: + + This is called by NDIS to notify us of a PNP event related to a lower + binding. Based on the event, this dispatches to other helper routines. + + NDIS 5.1: forward this event to the upper protocol(s) by calling + NdisIMNotifyPnPEvent. + +Arguments: + + ProtocolBindingContext - Pointer to our adapter structure. Can be NULL + for "global" notifications + + pNetPnPEvent - Pointer to the PNP event to be processed. + +Return Value: + + NDIS_STATUS code indicating status of event processing. + +--*/ +{ + PADAPT pAdapt =(PADAPT)ProtocolBindingContext; + NDIS_STATUS Status = NDIS_STATUS_SUCCESS; + + DBGPRINT(("PtPnPHandler: Adapt %p, Event %d\n", pAdapt, pNetPnPEvent->NetEvent)); + + switch (pNetPnPEvent->NetEvent) + { + case NetEventSetPower: + Status = PtPnPNetEventSetPower(pAdapt, pNetPnPEvent); + break; + + case NetEventReconfigure: + Status = PtPnPNetEventReconfigure(pAdapt, pNetPnPEvent); + break; + + default: +#ifdef NDIS51 + // + // Pass on this notification to protocol(s) above, before + // doing anything else with it. + // + if (pAdapt && pAdapt->MiniportHandle) + { + Status = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent); + } +#else + Status = NDIS_STATUS_SUCCESS; + +#endif // NDIS51 + + break; + } + + return Status; +} + + +NDIS_STATUS +PtPnPNetEventReconfigure( + IN PADAPT pAdapt, + IN PNET_PNP_EVENT pNetPnPEvent + ) +/*++ +Routine Description: + + This routine is called from NDIS to notify our protocol edge of a + reconfiguration of parameters for either a specific binding (pAdapt + is not NULL), or global parameters if any (pAdapt is NULL). + +Arguments: + + pAdapt - Pointer to our adapter structure. 
+ pNetPnPEvent - the reconfigure event + +Return Value: + + NDIS_STATUS_SUCCESS + +--*/ +{ + NDIS_STATUS ReconfigStatus = NDIS_STATUS_SUCCESS; + NDIS_STATUS ReturnStatus = NDIS_STATUS_SUCCESS; + + do + { + // + // Is this is a global reconfiguration notification ? + // + if (pAdapt == NULL) + { + // + // An important event that causes this notification to us is if + // one of our upper-edge miniport instances was enabled after being + // disabled earlier, e.g. from Device Manager in Win2000. Note that + // NDIS calls this because we had set up an association between our + // miniport and protocol entities by calling NdisIMAssociateMiniport. + // + // Since we would have torn down the lower binding for that miniport, + // we need NDIS' assistance to re-bind to the lower miniport. The + // call to NdisReEnumerateProtocolBindings does exactly that. + // + NdisReEnumerateProtocolBindings (ProtHandle); + + break; + } + +#ifdef NDIS51 + // + // Pass on this notification to protocol(s) above before doing anything + // with it. + // + if (pAdapt->MiniportHandle) + { + ReturnStatus = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent); + } +#endif // NDIS51 + + ReconfigStatus = NDIS_STATUS_SUCCESS; + + } while(FALSE); + + DBGPRINT(("<==PtPNPNetEventReconfigure: pAdapt %p\n", pAdapt)); + +#ifdef NDIS51 + // + // Overwrite status with what upper-layer protocol(s) returned. + // + ReconfigStatus = ReturnStatus; +#endif + + return ReconfigStatus; +} + + +NDIS_STATUS +PtPnPNetEventSetPower( + IN PADAPT pAdapt, + IN PNET_PNP_EVENT pNetPnPEvent + ) +/*++ +Routine Description: + + This is a notification to our protocol edge of the power state + of the lower miniport. If it is going to a low-power state, we must + wait here for all outstanding sends and requests to complete. + + NDIS 5.1: Since we use packet stacking, it is not sufficient to + check usage of our local send packet pool to detect whether or not + all outstanding sends have completed. 
For this, use the new API + NdisQueryPendingIOCount. + + NDIS 5.1: Use the 5.1 API NdisIMNotifyPnPEvent to pass on PnP + notifications to upper protocol(s). + +Arguments: + + pAdapt - Pointer to the adpater structure + pNetPnPEvent - The Net Pnp Event. this contains the new device state + +Return Value: + + NDIS_STATUS_SUCCESS or the status returned by upper-layer protocols. + +--*/ +{ + PNDIS_DEVICE_POWER_STATE pDeviceState =(PNDIS_DEVICE_POWER_STATE)(pNetPnPEvent->Buffer); + NDIS_DEVICE_POWER_STATE PrevDeviceState = pAdapt->PTDeviceState; + NDIS_STATUS Status; + NDIS_STATUS ReturnStatus; + + ReturnStatus = NDIS_STATUS_SUCCESS; + + // + // Set the Internal Device State, this blocks all new sends or receives + // + NdisAcquireSpinLock(&pAdapt->Lock); + pAdapt->PTDeviceState = *pDeviceState; + + // + // Check if the miniport below is going to a low power state. + // + if (pAdapt->PTDeviceState > NdisDeviceStateD0) + { + // + // If the miniport below is going to standby, fail all incoming requests + // + if (PrevDeviceState == NdisDeviceStateD0) + { + pAdapt->StandingBy = TRUE; + } + + NdisReleaseSpinLock(&pAdapt->Lock); + +#ifdef NDIS51 + // + // Notify upper layer protocol(s) first. + // + if (pAdapt->MiniportHandle != NULL) + { + ReturnStatus = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent); + } +#endif // NDIS51 + + // + // Wait for outstanding sends and requests to complete. 
+ // + while (pAdapt->OutstandingSends != 0) + { + NdisMSleep(2); + } + + while (pAdapt->OutstandingRequests == TRUE) + { + // + // sleep till outstanding requests complete + // + NdisMSleep(2); + } + + // + // If the below miniport is going to low power state, complete the queued request + // + NdisAcquireSpinLock(&pAdapt->Lock); + if (pAdapt->QueuedRequest) + { + pAdapt->QueuedRequest = FALSE; + NdisReleaseSpinLock(&pAdapt->Lock); + PtRequestComplete(pAdapt, &pAdapt->Request, NDIS_STATUS_FAILURE); + } + else + { + NdisReleaseSpinLock(&pAdapt->Lock); + } + + + ASSERT(NdisPacketPoolUsage(pAdapt->SendPacketPoolHandle) == 0); + ASSERT(pAdapt->OutstandingRequests == FALSE); + } + else + { + // + // If the physical miniport is powering up (from Low power state to D0), + // clear the flag + // + if (PrevDeviceState > NdisDeviceStateD0) + { + pAdapt->StandingBy = FALSE; + } + // + // The device below is being turned on. If we had a request + // pending, send it down now. + // + if (pAdapt->QueuedRequest == TRUE) + { + pAdapt->QueuedRequest = FALSE; + + pAdapt->OutstandingRequests = TRUE; + NdisReleaseSpinLock(&pAdapt->Lock); + + NdisRequest(&Status, + pAdapt->BindingHandle, + &pAdapt->Request); + + if (Status != NDIS_STATUS_PENDING) + { + PtRequestComplete(pAdapt, + &pAdapt->Request, + Status); + + } + } + else + { + NdisReleaseSpinLock(&pAdapt->Lock); + } + + +#ifdef NDIS51 + // + // Pass on this notification to protocol(s) above + // + if (pAdapt->MiniportHandle) + { + ReturnStatus = NdisIMNotifyPnPEvent(pAdapt->MiniportHandle, pNetPnPEvent); + } +#endif // NDIS51 + + } + + return ReturnStatus; +} + +VOID +PtReferenceAdapt( + IN PADAPT pAdapt + ) +{ + NdisAcquireSpinLock(&pAdapt->Lock); + + ASSERT(pAdapt->RefCount >= 0); + + pAdapt->RefCount ++; + NdisReleaseSpinLock(&pAdapt->Lock); +} + + +BOOLEAN +PtDereferenceAdapt( + IN PADAPT pAdapt + ) +{ + NdisAcquireSpinLock(&pAdapt->Lock); + + ASSERT(pAdapt->RefCount > 0); + + pAdapt->RefCount--; + + if (pAdapt->RefCount == 0) 
+ { + NdisReleaseSpinLock(&pAdapt->Lock); + + // + // Free all resources on this adapter structure. + // + MPFreeAllPacketPools (pAdapt);; + NdisFreeSpinLock(&pAdapt->Lock); + NdisFreeMemory(pAdapt, 0 , 0); + + return TRUE; + + } + else + { + NdisReleaseSpinLock(&pAdapt->Lock); + + return FALSE; + } +} + + diff --git a/original_passthru/sources b/original_passthru/sources new file mode 100644 index 0000000..d52d78f --- /dev/null +++ b/original_passthru/sources @@ -0,0 +1,39 @@ +TARGETNAME=passthru +TARGETTYPE=DRIVER + +C_DEFINES=$(C_DEFINES) -DNDIS_MINIPORT_DRIVER -DNDIS_WDM=1 + +MSC_WARNING_LEVEL=/WX /W4 + +!if "$(DDK_TARGET_OS)"=="Win2K" +# +# The driver is built in the Win2K build environment +# +C_DEFINES=$(C_DEFINES) -DNDIS40_MINIPORT=1 +C_DEFINES=$(C_DEFINES) -DNDIS40=1 +!else +# +# The driver is built in the XP or .NET build environment +# So let us build NDIS 5.1 version. +# +C_DEFINES=$(C_DEFINES) -DNDIS51_MINIPORT=1 +C_DEFINES=$(C_DEFINES) -DNDIS51=1 +!endif + +# Uncomment the following to build for Win98/SE/WinMe +# This causes several APIs that are not present in Win9X to be +# ifdef'ed out. +# C_DEFINES=$(C_DEFINES) -DWIN9X=1 + +PRECOMPILED_INCLUDE=precomp.h + +TARGETLIBS=$(DDK_LIB_PATH)\ndis.lib + +INCLUDES= + +SOURCES=\ + miniport.c \ + passthru.c \ + passthru.rc \ + protocol.c + diff --git a/planetlab/Makefile.planetlab b/planetlab/Makefile.planetlab new file mode 100644 index 0000000..f341262 --- /dev/null +++ b/planetlab/Makefile.planetlab @@ -0,0 +1,181 @@ +# $Id: Makefile 11687 2012-08-12 20:51:25Z luigi $ +# +# Top level makefile for building ipfw/dummynet (kernel and userspace). +# You can run it manually or also under the Planetlab build. +# Planetlab wants also the 'install' target. +# +# To build on system with non standard Kernel sources or userland files, +# you should run this with +# +# make KERNELPATH=/path/to/linux-2.x.y.z USRDIR=/path/to/usr +# +# We assume that $(USRDIR) contains include/ and lib/ used to build userland. 
+# + +include Makefile.inc + +DATE ?= $(shell date +%Y%m%d) +SNAPSHOT_NAME=$(DATE)-ipfw3.tgz +BINDIST=$(DATE)-dummynet-linux.tgz +WINDIST=$(DATE)-dummynet-windows.zip + +.PHONY: ipfw kipfw + +########################################### +# windows x86 and x64 specific variables # +########################################### +# DRIVE must be the hard drive letter where DDK is installed +# DDKDIR must be the path to the DDK root directory, without drive letter +# TARGETOS (x64 only) must be one of the following: +# wnet -> windows server 2003 +# wlh -> windows vista and windows server 2008 +# win7 -> windows 7 +# future version must be added here +export WIN64 +export DDK +export DRIVE +export DDKDIR +DRIVE ?= C: +DDKDIR ?= /WinDDK/7600.16385.1 +DDK = $(DRIVE)$(DDKDIR) + +TARGETOS=win7 + +_all: all + +clean distclean: + -@(cd ipfw && $(MAKE) $(@) ) + -@rm -rf kipfw-mod binary64/[A-hj-z]* + +all: kipfw ipfw + @# -- windows only +ifeq ($(OSARCH),Windows) # copy files +ifeq ($(WIN64),) + -@ cp ipfw/ipfw.exe kipfw-mod/$(OBJDIR)/ipfw.sys binary/ + -@ cp kipfw/*.inf binary/ +else + -@ cp binary/* binary64/ + -@ cp ipfw/ipfw.exe kipfw-mod/objchk_win7_amd64/amd64/ipfw.sys binary64/ +endif # WIN64 +endif # Windows + +win64: + $(MAKE) WIN64=1 + +# kipfw-src prepares the sources for the kernel part. +# The windows files (passthru etc.) are modified version of the +# examples found in the $(DDK)/src/network/ndis/passthru/driver/ +# They can be re-created using the 'ndis-glue' target +# # We need a sed trick to remove newlines from the patchfile. 
+ +ndis-glue: + -@mkdir -p kipfw-mod + cp $(DDK)/src/network/ndis/passthru/driver/*.[ch] kipfw-mod + cat kipfw/win-passthru.diff | sed "s/$$(printf '\r')//g" | (cd kipfw-mod; patch ) + +kipfw-src: + -@rm -rf kipfw-mod + -@mkdir -p kipfw-mod + -@cp -Rp kipfw/* kipfw-mod + -@cp `find sys -name \*.c` kipfw-mod + -@(cd kipfw-mod && $(MAKE) include_e) +ifeq ($(OSARCH),Windows) + $(MAKE) ndis-glue +endif + +snapshot: + $(MAKE) distclean + (cd ..; tar cvzhf /tmp/$(SNAPSHOT_NAME) --exclude .svn \ + --exclude README.openwrt --exclude tags --exclude NOTES \ + --exclude tcc-0.9.25-bsd \ + --exclude original_passthru \ + --exclude ipfw3.diff --exclude add_rules \ + --exclude test --exclude test_ \ + ipfw3-2012 ) + +bindist: + $(MAKE) clean + $(MAKE) all + tar cvzf /tmp/$(BINDIST) ipfw/ipfw ipfw/ipfw.8 kipfw-mod/ipfw_mod.ko + +windist: + $(MAKE) clean + -$(MAKE) all + -rm /tmp/$(WINDIST) + zip -r /tmp/$(WINDIST) binary -x \*.svn\* + + +ipfw: + @(cd ipfw && $(MAKE) $(@) ) + +kipfw: kipfw-src +ifeq ($(WIN64),) # linux or windows 32 bit + @(cd kipfw-mod && $(MAKE) $(@) ) +else #--- windows 64 bit, we use build.exe and nmake + rm -f kipfw-mod/Makefile + mkdir kipfw-mod/tmpbuild # check mysetenv.sh + bash kipfw/mysetenv.sh $(DRIVE) $(DDKDIR) $(TARGETOS) +endif + +IPFW3_REPO ?= svn+ssh://some.host/some/path/ipfw3-2012 + +planetlab_update: + # clean and create a local working directory + rm -rf /tmp/pl-tmp + mkdir -p /tmp/pl-tmp/pl + mkdir -p /tmp/pl-tmp/ol2 + # get the trunk version of the PlanetLab repository + # to specify the sshkey use the .ssh/config file + (cd /tmp/pl-tmp/pl; \ + svn co svn+ssh://svn.planet-lab.org/svn/ipfw/trunk) + # get an updated copy of the main ipfw repository + (cd /tmp/pl-tmp/ol2; svn export $(IPFW3_REPO) ) + # copy the new version over the old one + (cd /tmp/pl-tmp; cp -rP ol2/ipfw3/* pl/trunk) + # files cleanup in the old version + (cd /tmp/pl-tmp; diff -r ol2/ipfw3 pl/trunk | \ + grep -v "svn" | awk '{print $$3 $$4}' | \ + sed 's/:/\//' | xargs rm -rf) +
# local adjustments here + rm -rf /tmp/pl-tmp/pl/trunk/planetlab/check_planetlab_sync + # commit to the remote repo + @echo "Please, revise the update with the commands:" + @echo "(cd /tmp/pl-tmp/pl/trunk; svn diff)" + @echo "(cd /tmp/pl-tmp/pl/trunk; svn status)" + @echo "and commit with:" + @echo "(cd /tmp/pl-tmp/pl/trunk; svn ci -m 'Update from the mail ipfw repo.')" + +openwrt_release: + # create a temporary directory + $(eval TMPDIR := $(shell mktemp -d -p /tmp/ ipfw3_openwrt_XXXXX)) + # create the source destination directory + $(eval IPFWDIR := ipfw3-$(DATE)) + $(eval DSTDIR := $(TMPDIR)/$(IPFWDIR)) + mkdir $(DSTDIR) + # copy the package, clean objects and svn info + cp -r ./ipfw ./kipfw-mod glue.h Makefile ./configuration README $(DSTDIR) + (cd $(DSTDIR); make -s distclean; find . -name .svn | xargs rm -rf) + (cd $(TMPDIR); tar czf $(IPFWDIR).tar.gz $(IPFWDIR)) + + # create the port files in /tmp/ipfw3-port + $(eval PORTDIR := $(TMPDIR)/ipfw3) + mkdir -p $(PORTDIR)/patches + # generate the Makefile, PKG_VERSION and PKG_MD5SUM + md5sum $(DSTDIR).tar.gz | cut -d ' ' -f 1 > $(TMPDIR)/md5sum + cat ./OPENWRT/Makefile | \ + sed s/PKG_VERSION:=/PKG_VERSION:=$(DATE)/ | \ + sed s/PKG_MD5SUM:=/PKG_MD5SUM:=`cat $(TMPDIR)/md5sum`/ \ + > $(PORTDIR)/Makefile + + @echo "" + @echo "The openwrt port is in $(TMPDIR)/ipfw3-port" + @echo "The source file should be copied to the public server:" + @echo "scp $(DSTDIR).tar.gz marta@info.iet.unipi.it:~marta/public_html/dummynet" + @echo "after this the temporary directory $(TMPDIR) can be removed." 
+ +install: + +diff: + -@(diff -upr $(BSD_HEAD)/sbin/ipfw ipfw) + -@(diff -upr $(BSD_HEAD)/sys sys) + diff --git a/planetlab/check_planetlab_sync b/planetlab/check_planetlab_sync new file mode 100755 index 0000000..f59853f --- /dev/null +++ b/planetlab/check_planetlab_sync @@ -0,0 +1,22 @@ +#!/bin/sh + +# +# This script is used to check the sync of the local repo +# with the remote planetlab repository + +tmpfile=/tmp/chech_planetlab_sync.tmp + +# check for local copy sync +svn diff > $tmpfile +if [ -s $tmpfile ] ; then + echo "Local repo unsynced, can not continue" + rm $tmpfile + exit 1 +fi + +# export remote copy +svn --force export http://svn.planet-lab.org/svn/ipfw/trunk ./ >> /dev/null + +# check diffs again, output to the user +svn diff +svn status | grep -v check_planetlab_sync diff --git a/planetlab/ipfw b/planetlab/ipfw new file mode 100755 index 0000000..114cafb --- /dev/null +++ b/planetlab/ipfw @@ -0,0 +1,84 @@ +#!/bin/sh +# +# ipfw init the emulation service +# +# chkconfig: 2345 09 91 +# description: ipfw init and shutdown +# + +# Source function library. +. /etc/init.d/functions + +IPFW=ipfw +IPFW_BACKEND=/vsys/ipfw-be +IPFW_MOD=ipfw_mod + +if [ ! -x /sbin/$IPFW ] || [ ! 
-x ${IPFW_BACKEND} ]; then + echo -n "/sbin/$IPFW does not exist."; warning; echo + exit 0 +fi + +# Load the ipfw module, and initialize netconfig +start() { + # load the module + modprobe $IPFW_MOD >& /dev/null + let ret=$?; + [ $ret -eq 0 ] && success || failure + + # init netconfig + echo "super dbcleanup" | ${IPFW_BACKEND} root >& /dev/null + echo "super init" | ${IPFW_BACKEND} root >& /dev/null + + return $ret +} + +stop() { + # clean netconfig stuff + echo "super dbcleanup" | ${IPFW_BACKEND} root >& /dev/null + echo "Unloading $IPFW_MOD module: " + + # unload the ipfw module + rmmod ${IPFW_MOD} + let ret=$?; + [ $ret -eq 0 ] && success || failure + + return $ret +} + +# echo the ipfw status +status() { + # check for module presence + grep -q "^${IPFW_MOD} " /proc/modules || { echo "ipfw not loaded"; return 0; } + + # Show active users + USERS=$(grep BLOCK /tmp/ff | wc -l) + echo "ipfw is loaded and there are currently ${USERS} with active emulation." + return 0 +} + +# main +case "$1" in + start) + start + RETVAL=$? + ;; + stop) + stop + RETVAL=$? + ;; + restart) + stop + start + RETVAL=$? + ;; + status) + status + RETVAL=$? 
+ ;; + *) + echo $"Usage: $0 {start|stop|restart|status}" + exit 1 + ;; +esac + +exit $RETVAL diff --git a/planetlab/ipfw.8.gz b/planetlab/ipfw.8.gz new file mode 100644 index 0000000..c2db923 Binary files /dev/null and b/planetlab/ipfw.8.gz differ diff --git a/planetlab/ipfw.cron b/planetlab/ipfw.cron new file mode 100644 index 0000000..1b09340 --- /dev/null +++ b/planetlab/ipfw.cron @@ -0,0 +1,3 @@ +# Runs every 5 minutes and clean ipfw expired rules +# $Id: ipfw.cron 6069 2010-04-15 09:35:33Z marta $ +*/5 * * * * root echo "super killexpired" | /vsys/ipfw-be root > /dev/null 2>&1 diff --git a/planetlab/ipfwroot.spec b/planetlab/ipfwroot.spec new file mode 100644 index 0000000..1170ed7 --- /dev/null +++ b/planetlab/ipfwroot.spec @@ -0,0 +1,135 @@ +# +# $Id: ipfwroot.spec 16174 2009-12-15 13:38:15Z marta $ +# +# TODO: +# restart crond +# +%define url $URL: svn+ssh://onelab2/home/svn/ports-luigi/ipfw3-2012/planetlab/ipfwroot.spec $ + +# Marta Carbone +# 2009 - Universita` di Pisa +# License is BSD. + +# kernel_release, kernel_version and kernel_arch are expected to be set by the build to e.g. 
+# kernel_release : vs2.3.0.29.1.planetlab +# kernel_version : 2.6.22.14 + +%define name ipfwroot +%define version 0.9 +%define taglevel 11 + +%define release %{kernel_version}.%{taglevel}%{?pldistro:.%{pldistro}}%{?date:.%{date}} +%define kernel_id_arch %{kernel_version}-%{kernel_release}-%{kernel_arch} +%define kernel_id %{kernel_version}-%{kernel_release} + +Summary: ipfw and dummynet for Linux +Name: %{name} +Version: %{version} +Release: %{release} +License: BSD +Group: System Environment/Kernel +Source0: %{name}-%{version}.tar.bz2 +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-buildroot +Requires: vixie-cron +Requires: vsys-scripts + +Vendor: unipi +Packager: PlanetLab +# XXX ask +Distribution: PlanetLab %{plrelease} +URL: %(echo %{url} | cut -d ' ' -f 2) + +%description +ipfw is the Linux port of the FreeBSD ipfw and dummynet packages + +%prep +%setup + +%build +# clean the rpm build directory +rm -rf $RPM_BUILD_ROOT + +# with the new build, we use the kernel-devel rpm for building +%define kernelpath /usr/src/kernels/%{kernel_id_arch} + +%__make KERNELPATH=%kernelpath clean +%__make KERNELPATH=%kernelpath IPFW_PLANETLAB=1 + +%install +install -D -m 755 dummynet2/ipfw_mod.ko $RPM_BUILD_ROOT/lib/modules/%{kernel_id}/net/netfilter/ipfw_mod.ko +install -D -m 755 ipfw/ipfw $RPM_BUILD_ROOT/sbin/ipfw +install -D -m 644 planetlab/ipfw.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/ipfw.cron +install -D -m 755 planetlab/ipfw $RPM_BUILD_ROOT/etc/rc.d/init.d/ipfw + +%clean +rm -rf $RPM_BUILD_ROOT + +%post +### this script is also triggered while the node image is being created at build-time +# some parts of the script do not make sense in this context +# this is why the build exports PL_BOOTCD=1 in such cases +depmod -a +/sbin/chkconfig --add ipfw +# start the service if not building +[ -n "$PL_BOOTCD" ] || service ipfw start + +%postun +# stop the service if not building +[ -n "$PL_BOOTCD" ] || service ipfw stop + +# here there is a list of the final installation 
directories +%files +%defattr(-,root,root) +%dir /lib/modules/%{kernel_id} +/lib/modules/%{kernel_id}/net/netfilter/ipfw_mod.ko +/sbin/ipfw +%{_sysconfdir}/cron.d/ipfw.cron +/etc/rc.d/init.d/ipfw + +%changelog +* Mon Apr 12 2010 Thierry Parmentelat - ipfw-0.9-11 +- add ipfw initialization script to chkconfig + +* Wed Mar 03 2010 Talip Baris Metin - ipfw-0.9-10 +- - Load module at installation - Marta + +* Mon Jan 11 2010 Thierry Parmentelat - ipfw-0.9-9 +- consistent with vsys-scripts-0.95-13 + +* Mon Jan 11 2010 Marta Carbone +- Integrated the ipfw rules cleanup into the backend + +* Sat Jan 09 2010 Thierry Parmentelat - ipfw-0.9-8 +- builds on 2.6.22 & 2.6.27 - for 32 and 64 bits + +* Wed Jan 06 2010 Marta Carbone +- move to dummynet2, added support for table lookup +- added the vsys-script dependencies and the ipfw initialization + +* Tue Dec 15 2009 Marta Carbone +- more work on the radix code, added sysctl read/write support + +* Sun Nov 29 2009 Thierry Parmentelat - ipfw-0.9-7 +- added missing qsort.c - tag 0.9-6 was broken + +* Thu Nov 26 2009 Thierry Parmentelat - ipfw-0.9-6 +- root: removed goto into the main ipfw switch, enabled slice_id matching +- slice: completely move netconfig checks into the backend + +* Mon Nov 09 2009 Thierry Parmentelat - ipfw-0.9-5 +- additional features on matching packets, including uid match + +* Mon Sep 07 2009 Thierry Parmentelat - ipfw-0.9-4 +- on behalf of Marta Carbone, more options and features + +* Thu Jul 23 2009 Thierry Parmentelat - ipfw-0.9-3 +- fixed memory usage issue + +* Wed Jul 15 2009 Thierry Parmentelat - ipfw-0.9-2 +- patch for building on x86_64 + +* Thu Jun 25 2009 Marta Carbone +- post installation removed for deployment, moved manpages to the slice package + +* Fri Apr 17 2009 Marta Carbone +- Initial release diff --git a/planetlab/ipfwslice.spec b/planetlab/ipfwslice.spec new file mode 100644 index 0000000..cd98b89 --- /dev/null +++ b/planetlab/ipfwslice.spec @@ -0,0 +1,94 @@ +# +# $Id: ipfwslice.spec 
16174 2009-12-15 13:38:15Z marta $ +# +# TODO: +# restart crond +# modprobe ipfw_mod.ko (depmod ?) +# +%define url $URL: svn+ssh://onelab2/home/svn/ports-luigi/ipfw3-2012/planetlab/ipfwslice.spec $ + +# Marta Carbone +# 2009 - Universita` di Pisa +# License is BSD. + +%define name ipfwslice +%define version 0.9 +%define taglevel 11 + +%define release %{kernel_version}.%{taglevel}%{?pldistro:.%{pldistro}}%{?date:.%{date}} +%define kernel_id_arch %{kernel_version}-%{kernel_release}-%{kernel_arch} +%define kernel_id %{kernel_version}-%{kernel_release} + +Summary: ipfw and dummynet for Linux +Name: %{name} +Version: %{version} +Release: %{release} +License: BSD +Group: System Environment/Kernel +Source0: %{name}-%{version}.tar.bz2 +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-buildroot + +Vendor: unipi +Packager: PlanetLab +Distribution: PlanetLab %{plrelease} +URL: %(echo %{url} | cut -d ' ' -f 2) + +%description +the frontend part of the ipfw planetlab package + +%prep +%setup + +%build +rm -rf $RPM_BUILD_ROOT + +%install +install -D -m 755 planetlab/netconfig $RPM_BUILD_ROOT/sbin/netconfig +install -D -m 755 planetlab/ipfw.8.gz $RPM_BUILD_ROOT/%{_mandir}/man8/ipfw.8.gz + +%clean +rm -rf $RPM_BUILD_ROOT + +# here there is a list of the final installation directories +%files +%defattr(-,root,root) +/sbin/netconfig +%{_mandir}/man8/ipfw.8* + +%changelog +* Mon Apr 12 2010 Thierry Parmentelat - ipfw-0.9-11 +- add ipfw initialization script to chkconfig + +* Wed Mar 03 2010 Talip Baris Metin - ipfw-0.9-10 +- - Load module at installation - Marta + +* Mon Jan 11 2010 Thierry Parmentelat - ipfw-0.9-9 +- consistent with vsys-scripts-0.95-13 + +* Sat Jan 09 2010 Thierry Parmentelat - ipfw-0.9-8 +- builds on 2.6.22 & 2.6.27 - for 32 and 64 bits + +* Tue Dec 15 2009 Marta Carbone +- more work on the radix code, added sysctl read/write support + +* Sun Nov 29 2009 Thierry Parmentelat - ipfw-0.9-7 +- added missing qsort.c - tag 0.9-6 was broken + +* Thu Nov 26 2009 
Thierry Parmentelat - ipfw-0.9-6 +- root: removed goto into the main ipfw switch, enabled slice_id matching +- slice: completely move netconfig checks into the backend + +* Mon Nov 09 2009 Thierry Parmentelat - ipfw-0.9-5 +- additional features on matching packets, including uid match + +* Mon Sep 07 2009 Thierry Parmentelat - ipfw-0.9-4 +- on behalf of Marta Carbone, more options and features + +* Thu Jul 23 2009 Thierry Parmentelat - ipfw-0.9-3 +- fixed memory usage issue + +* Wed Jul 15 2009 Thierry Parmentelat - ipfw-0.9-2 +- patch for building on x86_64 + +* Thu Jun 25 2009 Marta Carbone +- Initial release diff --git a/planetlab/netconfig b/planetlab/netconfig new file mode 100755 index 0000000..7108582 --- /dev/null +++ b/planetlab/netconfig @@ -0,0 +1,14 @@ +#!/bin/sh +# +# Marta Carbone, Luigi Rizzo +# Copyright (C) 2009 Universita` di Pisa +# $Id: netconfig 4533 2009-12-16 14:39:23Z luigi $ +# +# This script is the frontend to be used with the vsys system. +# It simply passes information to the backend and gets back the reply + +PIPE_IN=/vsys/ipfw-be.in +PIPE_OUT=/vsys/ipfw-be.out + +sudo sh -c "echo $* >> ${PIPE_IN}" +sudo sh -c "cat ${PIPE_OUT}" diff --git a/planetlab/planetlab-tags.mk b/planetlab/planetlab-tags.mk new file mode 100644 index 0000000..25eff0e --- /dev/null +++ b/planetlab/planetlab-tags.mk @@ -0,0 +1,6 @@ +# $Id: planetlab-tags.mk 7450 2010-10-18 11:17:43Z marta $ +# These are good to build the ipfw modules from svn on kernels 2.6.22 +# and are used to fetch files from the onelab2 repository. 
+linux-2.6-SVNBRANCH := 22 +linux-2.6-SVNPATH := http://svn.planet-lab.org/svn/linux-2.6/tags/linux-2.6-22-39-1 +ipfwsrc-SVNPATH := svn+ssh://luigi%40onelab2.iet.unipi.it/home/svn/ports-luigi/dummynet-branches/ipfw3 diff --git a/planetlab/planetlab.mk b/planetlab/planetlab.mk new file mode 100644 index 0000000..6d3504b --- /dev/null +++ b/planetlab/planetlab.mk @@ -0,0 +1,26 @@ +# $Id: planetlab.mk 4533 2009-12-16 14:39:23Z luigi $ +# .mk file to build a module +kernel-MODULES := linux-2.6 +kernel-SPEC := kernel-2.6.spec +kernel-BUILD-FROM-SRPM := yes +ifeq "$(HOSTARCH)" "i386" +kernel-RPMFLAGS:= --target i686 +else +kernel-RPMFLAGS:= --target $(HOSTARCH) +endif +ALL += kernel + +ipfwroot-MODULES := ipfwsrc +ipfwroot-SPEC := planetlab/ipfwroot.spec +ipfwroot-DEPEND-DEVEL-RPMS := kernel-devel +ipfwroot-SPECVARS = kernel_version=$(kernel.rpm-version) \ + kernel_release=$(kernel.rpm-release) \ + kernel_arch=$(kernel.rpm-arch) +ALL += ipfwroot + +ipfwslice-MODULES := ipfwsrc +ipfwslice-SPEC := planetlab/ipfwslice.spec +ipfwslice-SPECVARS = kernel_version=$(kernel.rpm-version) \ + kernel_release=$(kernel.rpm-release) \ + kernel_arch=$(kernel.rpm-arch) +ALL += ipfwslice diff --git a/planetlab/sample_hook b/planetlab/sample_hook new file mode 100755 index 0000000..b47c8de --- /dev/null +++ b/planetlab/sample_hook @@ -0,0 +1,34 @@ +#!/bin/sh + +# +# Marta Carbone +# 2009 - Universita` di Pisa +# +# This is a sample hook file in charge to collect +# statistical information on netconfig usage. It dumps +# on a log file slicename, port and the configuration string +# used to configure a dummynet experiment. +# +# Each time a user configure a dummynet port, this file +# will be executed. 
+# The following variables will be passed as argument: +# +# ${SLICE} ${PORT} ${CONFIG_STRING} +# ${SLICE} The slicename executing the netconfig command +# ${PORT} The port to be configured +# ${CONFIG_STRING} The configuration string +# +# Note that this script can get additional information +# by executing the ipfw command, e.g. +# ipfw list # list of installed rules +# ipfw show # list of rules and statistical information +# ipfw pipe show # list of pipes +# +# a complete list of ipfw commands is available at: +# http://www.freebsd.org/cgi/man.cgi?query=ipfw&sektion=8 + +# logfile +LOG_FILE=/tmp/ipfw_hook.log + +echo -e `date` >> ${LOG_FILE} +echo "$*" >> ${LOG_FILE} diff --git a/sys/net/if.h b/sys/net/if.h new file mode 100644 index 0000000..1aa8e7b --- /dev/null +++ b/sys/net/if.h @@ -0,0 +1 @@ +#include diff --git a/sys/net/pfil.h b/sys/net/pfil.h new file mode 100644 index 0000000..af26a79 --- /dev/null +++ b/sys/net/pfil.h @@ -0,0 +1,121 @@ +/* $FreeBSD: src/sys/net/pfil.h,v 1.16 2007/06/08 12:43:25 gallatin Exp $ */ +/* $NetBSD: pfil.h,v 1.22 2003/06/23 12:57:08 martin Exp $ */ + +/*- + * Copyright (c) 1996 Matthew R. Green + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _NET_PFIL_H_ +#define _NET_PFIL_H_ + +#include +#include +#include +#include +#include +#include + +struct mbuf; +struct ifnet; +struct inpcb; + +/* + * The packet filter hooks are designed for anything to call them to + * possibly intercept the packet. 
+ */ +struct packet_filter_hook { + TAILQ_ENTRY(packet_filter_hook) pfil_link; + int (*pfil_func)(void *, struct mbuf **, struct ifnet *, int, + struct inpcb *); + void *pfil_arg; +}; + +#define PFIL_IN 0x00000001 +#define PFIL_OUT 0x00000002 +#define PFIL_WAITOK 0x00000004 +#define PFIL_ALL (PFIL_IN|PFIL_OUT) + +typedef TAILQ_HEAD(pfil_list, packet_filter_hook) pfil_list_t; + +#define PFIL_TYPE_AF 1 /* key is AF_* type */ +#define PFIL_TYPE_IFNET 2 /* key is ifnet pointer */ + +struct pfil_head { + pfil_list_t ph_in; + pfil_list_t ph_out; + int ph_type; + int ph_nhooks; +#if defined( __linux__ ) || defined( _WIN32 ) + rwlock_t ph_mtx; +#else + struct rmlock ph_lock; +#endif + union { + u_long phu_val; + void *phu_ptr; + } ph_un; +#define ph_af ph_un.phu_val +#define ph_ifnet ph_un.phu_ptr + LIST_ENTRY(pfil_head) ph_list; +}; + +int pfil_add_hook(int (*func)(void *, struct mbuf **, struct ifnet *, + int, struct inpcb *), void *, int, struct pfil_head *); +int pfil_remove_hook(int (*func)(void *, struct mbuf **, struct ifnet *, + int, struct inpcb *), void *, int, struct pfil_head *); +int pfil_run_hooks(struct pfil_head *, struct mbuf **, struct ifnet *, + int, struct inpcb *inp); + +int pfil_head_register(struct pfil_head *); +int pfil_head_unregister(struct pfil_head *); + +struct pfil_head *pfil_head_get(int, u_long); + +#define PFIL_HOOKED(p) ((p)->ph_nhooks > 0) +#define PFIL_LOCK_INIT(p) \ + rm_init_flags(&(p)->ph_lock, "PFil hook read/write mutex", RM_RECURSE) +#define PFIL_LOCK_DESTROY(p) rm_destroy(&(p)->ph_lock) +#define PFIL_RLOCK(p, t) rm_rlock(&(p)->ph_lock, (t)) +#define PFIL_WLOCK(p) rm_wlock(&(p)->ph_lock) +#define PFIL_RUNLOCK(p, t) rm_runlock(&(p)->ph_lock, (t)) +#define PFIL_WUNLOCK(p) rm_wunlock(&(p)->ph_lock) +#define PFIL_LIST_LOCK() mtx_lock(&pfil_global_lock) +#define PFIL_LIST_UNLOCK() mtx_unlock(&pfil_global_lock) + +static __inline struct packet_filter_hook * +pfil_hook_get(int dir, struct pfil_head *ph) +{ + + if (dir == PFIL_IN) + 
return (TAILQ_FIRST(&ph->ph_in)); + else if (dir == PFIL_OUT) + return (TAILQ_FIRST(&ph->ph_out)); + else + return (NULL); +} + +#endif /* _NET_PFIL_H_ */ diff --git a/sys/net/radix.c b/sys/net/radix.c new file mode 100644 index 0000000..22bac2b --- /dev/null +++ b/sys/net/radix.c @@ -0,0 +1,1203 @@ +/*- + * Copyright (c) 1988, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)radix.c 8.5 (Berkeley) 5/19/95 + * $FreeBSD: head/sys/net/radix.c 200354 2009-12-10 10:34:30Z luigi $ + */ + +/* + * Routines to build and maintain radix trees for routing lookups. + */ +#include +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#include "opt_mpath.h" +#ifdef RADIX_MPATH +#include +#endif +#else /* !_KERNEL */ +#include +#include +#include +#define log(x, arg...) fprintf(stderr, ## arg) +#define panic(x) fprintf(stderr, "PANIC: %s", x), exit(1) +#define min(a, b) ((a) < (b) ? (a) : (b) ) +#include +#endif /* !_KERNEL */ + +static int rn_walktree_from(struct radix_node_head *h, void *a, void *m, + walktree_f_t *f, void *w); +static int rn_walktree(struct radix_node_head *, walktree_f_t *, void *); +static struct radix_node + *rn_insert(void *, struct radix_node_head *, int *, + struct radix_node [2]), + *rn_newpair(void *, int, struct radix_node[2]), + *rn_search(void *, struct radix_node *), + *rn_search_m(void *, struct radix_node *, void *); + +static int max_keylen; +static struct radix_mask *rn_mkfreelist; +static struct radix_node_head *mask_rnhead; +/* + * Work area -- the following point to 3 buffers of size max_keylen, + * allocated in this order in a block of memory malloc'ed by rn_init. + * rn_zeros, rn_ones are set in rn_init and used in readonly afterwards. + * addmask_key is used in rn_addmask in rw mode and not thread-safe. 
+ */ +static char *rn_zeros, *rn_ones, *addmask_key; + +#define MKGet(m) { \ + if (rn_mkfreelist) { \ + m = rn_mkfreelist; \ + rn_mkfreelist = (m)->rm_mklist; \ + } else \ + R_Malloc(m, struct radix_mask *, sizeof (struct radix_mask)); } + +#define MKFree(m) { (m)->rm_mklist = rn_mkfreelist; rn_mkfreelist = (m);} + +#define rn_masktop (mask_rnhead->rnh_treetop) + +static int rn_lexobetter(void *m_arg, void *n_arg); +static struct radix_mask * + rn_new_radix_mask(struct radix_node *tt, + struct radix_mask *next); +static int rn_satisfies_leaf(char *trial, struct radix_node *leaf, + int skip); + +/* + * The data structure for the keys is a radix tree with one way + * branching removed. The index rn_bit at an internal node n represents a bit + * position to be tested. The tree is arranged so that all descendants + * of a node n have keys whose bits all agree up to position rn_bit - 1. + * (We say the index of n is rn_bit.) + * + * There is at least one descendant which has a one bit at position rn_bit, + * and at least one with a zero there. + * + * A route is determined by a pair of key and mask. We require that the + * bit-wise logical AND of the key and mask be the key. + * We define the index of a route associated with the mask to be + * the first bit number in the mask where 0 occurs (with bit number 0 + * representing the highest order bit). + * + * We say a mask is normal if every bit is 0, past the index of the mask. + * If a node n has a descendant (k, m) with index(m) == index(n) == rn_bit, + * and m is a normal mask, then the route applies to every descendant of n. + * If the index(m) < rn_bit, this implies the trailing last few bits of k + * before bit b are all 0, (and hence consequently true of every descendant + * of n), so the route applies to all descendants of the node as well. + * + * Similar logic shows that a non-normal mask m such that + * index(m) <= index(n) could potentially apply to many children of n.
+ * Thus, for each non-host route, we attach its mask to a list at an internal + * node as high in the tree as we can go. + * + * The present version of the code makes use of normal routes in short- + * circuiting an explicit mask and compare operation when testing whether + * a key satisfies a normal route, and also in remembering the unique leaf + * that governs a subtree. + */ + +/* + * Most of the functions in this code assume that the key/mask arguments + * are sockaddr-like structures, where the first byte is a u_char + * indicating the size of the entire structure. + * + * To make the assumption more explicit, we use the LEN() macro to access + * this field. It is safe to pass an expression with side effects + * to LEN() as the argument is evaluated only once. + * We cast the result to int as this is the dominant usage. + */ +#define LEN(x) ( (int) (*(const u_char *)(x)) ) + +/* + * XXX THIS NEEDS TO BE FIXED + * In the code, pointers to keys and masks are passed as either + * 'void *' (because callers used to pass pointers of various kinds), or + * 'caddr_t' (which is fine for pointer arithmetic, but not very + * clean when you dereference it to access data). Furthermore, caddr_t + * is really 'char *', while the natural type to operate on keys and + * masks would be 'u_char'. This mismatch requires a lot of casts and + * intermediate variables to adapt types that clutter the code. + */ + +/* + * Search for a node in the tree matching the key. + */ +static struct radix_node * +rn_search(v_arg, head) + void *v_arg; + struct radix_node *head; +{ + register struct radix_node *x; + register caddr_t v; + + for (x = head, v = v_arg; x->rn_bit >= 0;) { + if (x->rn_bmask & v[x->rn_offset]) + x = x->rn_right; + else + x = x->rn_left; + } + return (x); +} + +/* + * Same as above, but with an additional mask. + * XXX note this function is used only once.
+ */ +static struct radix_node * +rn_search_m(v_arg, head, m_arg) + struct radix_node *head; + void *v_arg, *m_arg; +{ + register struct radix_node *x; + register caddr_t v = v_arg, m = m_arg; + + for (x = head; x->rn_bit >= 0;) { + if ((x->rn_bmask & m[x->rn_offset]) && + (x->rn_bmask & v[x->rn_offset])) + x = x->rn_right; + else + x = x->rn_left; + } + return x; +} + +int +rn_refines(m_arg, n_arg) + void *m_arg, *n_arg; +{ + register caddr_t m = m_arg, n = n_arg; + register caddr_t lim, lim2 = lim = n + LEN(n); + int longer = LEN(n++) - LEN(m++); + int masks_are_equal = 1; + + if (longer > 0) + lim -= longer; + while (n < lim) { + if (*n & ~(*m)) + return 0; + if (*n++ != *m++) + masks_are_equal = 0; + } + while (n < lim2) + if (*n++) + return 0; + if (masks_are_equal && (longer < 0)) + for (lim2 = m - longer; m < lim2; ) + if (*m++) + return 1; + return (!masks_are_equal); +} + +struct radix_node * +rn_lookup(v_arg, m_arg, head) + void *v_arg, *m_arg; + struct radix_node_head *head; +{ + register struct radix_node *x; + caddr_t netmask = 0; + + if (m_arg) { + x = rn_addmask(m_arg, 1, head->rnh_treetop->rn_offset); + if (x == 0) + return (0); + netmask = x->rn_key; + } + x = rn_match(v_arg, head); + if (x && netmask) { + while (x && x->rn_mask != netmask) + x = x->rn_dupedkey; + } + return x; +} + +static int +rn_satisfies_leaf(trial, leaf, skip) + char *trial; + register struct radix_node *leaf; + int skip; +{ + register char *cp = trial, *cp2 = leaf->rn_key, *cp3 = leaf->rn_mask; + char *cplim; + int length = min(LEN(cp), LEN(cp2)); + + if (cp3 == NULL) + cp3 = rn_ones; + else + length = min(length, LEN(cp3)); + cplim = cp + length; cp3 += skip; cp2 += skip; + for (cp += skip; cp < cplim; cp++, cp2++, cp3++) + if ((*cp ^ *cp2) & *cp3) + return 0; + return 1; +} + +struct radix_node * +rn_match(v_arg, head) + void *v_arg; + struct radix_node_head *head; +{ + caddr_t v = v_arg; + register struct radix_node *t = head->rnh_treetop, *x; + register caddr_t cp = v, 
cp2; + caddr_t cplim; + struct radix_node *saved_t, *top = t; + int off = t->rn_offset, vlen = LEN(cp), matched_off; + register int test, b, rn_bit; + + /* + * Open code rn_search(v, top) to avoid overhead of extra + * subroutine call. + */ + for (; t->rn_bit >= 0; ) { + if (t->rn_bmask & cp[t->rn_offset]) + t = t->rn_right; + else + t = t->rn_left; + } + /* + * See if we match exactly as a host destination + * or at least learn how many bits match, for normal mask finesse. + * + * It doesn't hurt us to limit how many bytes to check + * to the length of the mask, since if it matches we had a genuine + * match and the leaf we have is the most specific one anyway; + * if it didn't match with a shorter length it would fail + * with a long one. This wins big for class B&C netmasks which + * are probably the most common case... + */ + if (t->rn_mask) + vlen = *(u_char *)t->rn_mask; + cp += off; cp2 = t->rn_key + off; cplim = v + vlen; + for (; cp < cplim; cp++, cp2++) + if (*cp != *cp2) + goto on1; + /* + * This extra grot is in case we are explicitly asked + * to look up the default. Ugh! + * + * Never return the root node itself, it seems to cause a + * lot of confusion. + */ + if (t->rn_flags & RNF_ROOT) + t = t->rn_dupedkey; + return t; +on1: + test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */ + for (b = 7; (test >>= 1) > 0;) + b--; + matched_off = cp - v; + b += matched_off << 3; + rn_bit = -1 - b; + /* + * If there is a host route in a duped-key chain, it will be first. + */ + if ((saved_t = t)->rn_mask == 0) + t = t->rn_dupedkey; + for (; t; t = t->rn_dupedkey) + /* + * Even if we don't match exactly as a host, + * we may match if the leaf we wound up at is + * a route to a net. 
+ */ + if (t->rn_flags & RNF_NORMAL) { + if (rn_bit <= t->rn_bit) + return t; + } else if (rn_satisfies_leaf(v, t, matched_off)) + return t; + t = saved_t; + /* start searching up the tree */ + do { + register struct radix_mask *m; + t = t->rn_parent; + m = t->rn_mklist; + /* + * If non-contiguous masks ever become important + * we can restore the masking and open coding of + * the search and satisfaction test and put the + * calculation of "off" back before the "do". + */ + while (m) { + if (m->rm_flags & RNF_NORMAL) { + if (rn_bit <= m->rm_bit) + return (m->rm_leaf); + } else { + off = min(t->rn_offset, matched_off); + x = rn_search_m(v, t, m->rm_mask); + while (x && x->rn_mask != m->rm_mask) + x = x->rn_dupedkey; + if (x && rn_satisfies_leaf(v, x, off)) + return x; + } + m = m->rm_mklist; + } + } while (t != top); + return 0; +} + +#ifdef RN_DEBUG +int rn_nodenum; +struct radix_node *rn_clist; +int rn_saveinfo; +int rn_debug = 1; +#endif + +/* + * Whenever we add a new leaf to the tree, we also add a parent node, + * so we allocate them as an array of two elements: the first one must be + * the leaf (see RNTORT() in route.c), the second one is the parent. + * This routine initializes the relevant fields of the nodes, so that + * the leaf is the left child of the parent node, and both nodes have + * (almost) all fields filled as appropriate. + * (XXX some fields are left unset, see the '#if 0' section). + * The function returns a pointer to the parent node. + */ + +static struct radix_node * +rn_newpair(v, b, nodes) + void *v; + int b; + struct radix_node nodes[2]; +{ + register struct radix_node *tt = nodes, *t = tt + 1; + t->rn_bit = b; + t->rn_bmask = 0x80 >> (b & 7); + t->rn_left = tt; + t->rn_offset = b >> 3; + +#if 0 /* XXX perhaps we should fill these fields as well.
*/ + t->rn_parent = t->rn_right = NULL; + + tt->rn_mask = NULL; + tt->rn_dupedkey = NULL; + tt->rn_bmask = 0; +#endif + tt->rn_bit = -1; + tt->rn_key = (caddr_t)v; + tt->rn_parent = t; + tt->rn_flags = t->rn_flags = RNF_ACTIVE; + tt->rn_mklist = t->rn_mklist = 0; +#ifdef RN_DEBUG + tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++; + tt->rn_twin = t; + tt->rn_ybro = rn_clist; + rn_clist = tt; +#endif + return t; +} + +static struct radix_node * +rn_insert(v_arg, head, dupentry, nodes) + void *v_arg; + struct radix_node_head *head; + int *dupentry; + struct radix_node nodes[2]; +{ + caddr_t v = v_arg; + struct radix_node *top = head->rnh_treetop; + int head_off = top->rn_offset, vlen = LEN(v); + register struct radix_node *t = rn_search(v_arg, top); + register caddr_t cp = v + head_off; + register int b; + struct radix_node *tt; + /* + * Find first bit at which v and t->rn_key differ + */ + { + register caddr_t cp2 = t->rn_key + head_off; + register int cmp_res; + caddr_t cplim = v + vlen; + + while (cp < cplim) + if (*cp2++ != *cp++) + goto on1; + *dupentry = 1; + return t; +on1: + *dupentry = 0; + cmp_res = (cp[-1] ^ cp2[-1]) & 0xff; + for (b = (cp - v) << 3; cmp_res; b--) + cmp_res >>= 1; + } + { + register struct radix_node *p, *x = top; + cp = v; + do { + p = x; + if (cp[x->rn_offset] & x->rn_bmask) + x = x->rn_right; + else + x = x->rn_left; + } while (b > (unsigned) x->rn_bit); + /* x->rn_bit < b && x->rn_bit >= 0 */ +#ifdef RN_DEBUG + if (rn_debug) + log(LOG_DEBUG, "rn_insert: Going In:\n"), traverse(p); +#endif + t = rn_newpair(v_arg, b, nodes); + tt = t->rn_left; + if ((cp[p->rn_offset] & p->rn_bmask) == 0) + p->rn_left = t; + else + p->rn_right = t; + x->rn_parent = t; + t->rn_parent = p; /* frees x, p as temp vars below */ + if ((cp[t->rn_offset] & t->rn_bmask) == 0) { + t->rn_right = x; + } else { + t->rn_right = tt; + t->rn_left = x; + } +#ifdef RN_DEBUG + if (rn_debug) + log(LOG_DEBUG, "rn_insert: Coming Out:\n"), traverse(p); +#endif + } + return 
(tt); +} + +struct radix_node * +rn_addmask(n_arg, search, skip) + int search, skip; + void *n_arg; +{ + caddr_t netmask = (caddr_t)n_arg; + register struct radix_node *x; + register caddr_t cp, cplim; + register int b = 0, mlen, j; + int maskduplicated, m0, isnormal; + struct radix_node *saved_x; + static int last_zeroed = 0; + + if ((mlen = LEN(netmask)) > max_keylen) + mlen = max_keylen; + if (skip == 0) + skip = 1; + if (mlen <= skip) + return (mask_rnhead->rnh_nodes); + if (skip > 1) + bcopy(rn_ones + 1, addmask_key + 1, skip - 1); + if ((m0 = mlen) > skip) + bcopy(netmask + skip, addmask_key + skip, mlen - skip); + /* + * Trim trailing zeroes. + */ + for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0;) + cp--; + mlen = cp - addmask_key; + if (mlen <= skip) { + if (m0 >= last_zeroed) + last_zeroed = mlen; + return (mask_rnhead->rnh_nodes); + } + if (m0 < last_zeroed) + bzero(addmask_key + m0, last_zeroed - m0); + *addmask_key = last_zeroed = mlen; + x = rn_search(addmask_key, rn_masktop); + if (bcmp(addmask_key, x->rn_key, mlen) != 0) + x = 0; + if (x || search) + return (x); + R_Zalloc(x, struct radix_node *, max_keylen + 2 * sizeof (*x)); + if ((saved_x = x) == 0) + return (0); + netmask = cp = (caddr_t)(x + 2); + bcopy(addmask_key, cp, mlen); + x = rn_insert(cp, mask_rnhead, &maskduplicated, x); + if (maskduplicated) { + log(LOG_ERR, "rn_addmask: mask impossibly already in tree"); + Free(saved_x); + return (x); + } + /* + * Calculate index of mask, and check for normalcy. + * First find the first byte with a 0 bit, then if there are + * more bits left (remember we already trimmed the trailing 0's), + * the pattern must be one of those in normal_chars[], or we have + * a non-contiguous mask. 
+ */ + cplim = netmask + mlen; + isnormal = 1; + for (cp = netmask + skip; (cp < cplim) && *(u_char *)cp == 0xff;) + cp++; + if (cp != cplim) { + static char normal_chars[] = { + 0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; + + for (j = 0x80; (j & *cp) != 0; j >>= 1) + b++; + if (*cp != normal_chars[b] || cp != (cplim - 1)) + isnormal = 0; + } + b += (cp - netmask) << 3; + x->rn_bit = -1 - b; + if (isnormal) + x->rn_flags |= RNF_NORMAL; + return (x); +} + +static int /* XXX: arbitrary ordering for non-contiguous masks */ +rn_lexobetter(m_arg, n_arg) + void *m_arg, *n_arg; +{ + register u_char *mp = m_arg, *np = n_arg, *lim; + + if (LEN(mp) > LEN(np)) + return 1; /* not really, but need to check longer one first */ + if (LEN(mp) == LEN(np)) + for (lim = mp + LEN(mp); mp < lim;) + if (*mp++ > *np++) + return 1; + return 0; +} + +static struct radix_mask * +rn_new_radix_mask(tt, next) + register struct radix_node *tt; + register struct radix_mask *next; +{ + register struct radix_mask *m; + + MKGet(m); + if (m == 0) { + log(LOG_ERR, "Mask for route not entered\n"); + return (0); + } + bzero(m, sizeof *m); + m->rm_bit = tt->rn_bit; + m->rm_flags = tt->rn_flags; + if (tt->rn_flags & RNF_NORMAL) + m->rm_leaf = tt; + else + m->rm_mask = tt->rn_mask; + m->rm_mklist = next; + tt->rn_mklist = m; + return m; +} + +struct radix_node * +rn_addroute(v_arg, n_arg, head, treenodes) + void *v_arg, *n_arg; + struct radix_node_head *head; + struct radix_node treenodes[2]; +{ + caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg; + register struct radix_node *t, *x = 0, *tt; + struct radix_node *saved_tt, *top = head->rnh_treetop; + short b = 0, b_leaf = 0; + int keyduplicated; + caddr_t mmask; + struct radix_mask *m, **mp; + + /* + * In dealing with non-contiguous masks, there may be + * many different routes which have the same mask. 
+ * We will find it useful to have a unique pointer to + * the mask to speed avoiding duplicate references at + * nodes and possibly save time in calculating indices. + */ + if (netmask) { + if ((x = rn_addmask(netmask, 0, top->rn_offset)) == 0) + return (0); + b_leaf = x->rn_bit; + b = -1 - x->rn_bit; + netmask = x->rn_key; + } + /* + * Deal with duplicated keys: attach node to previous instance + */ + saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes); + if (keyduplicated) { + for (t = tt; tt; t = tt, tt = tt->rn_dupedkey) { +#ifdef RADIX_MPATH + /* permit multipath, if enabled for the family */ + if (rn_mpath_capable(head) && netmask == tt->rn_mask) { + /* + * go down to the end of multipaths, so that + * new entry goes into the end of rn_dupedkey + * chain. + */ + do { + t = tt; + tt = tt->rn_dupedkey; + } while (tt && t->rn_mask == tt->rn_mask); + break; + } +#endif + if (tt->rn_mask == netmask) + return (0); + if (netmask == 0 || + (tt->rn_mask && + ((b_leaf < tt->rn_bit) /* index(netmask) > node */ + || rn_refines(netmask, tt->rn_mask) + || rn_lexobetter(netmask, tt->rn_mask)))) + break; + } + /* + * If the mask is not duplicated, we wouldn't + * find it among possible duplicate key entries + * anyway, so the above test doesn't hurt. + * + * We sort the masks for a duplicated key the same way as + * in a masklist -- most specific to least specific. + * This may require the unfortunate nuisance of relocating + * the head of the list. + * + * We also reverse, or doubly link the list through the + * parent pointer. 
+ */ + if (tt == saved_tt) { + struct radix_node *xx = x; + /* link in at head of list */ + (tt = treenodes)->rn_dupedkey = t; + tt->rn_flags = t->rn_flags; + tt->rn_parent = x = t->rn_parent; + t->rn_parent = tt; /* parent */ + if (x->rn_left == t) + x->rn_left = tt; + else + x->rn_right = tt; + saved_tt = tt; x = xx; + } else { + (tt = treenodes)->rn_dupedkey = t->rn_dupedkey; + t->rn_dupedkey = tt; + tt->rn_parent = t; /* parent */ + if (tt->rn_dupedkey) /* parent */ + tt->rn_dupedkey->rn_parent = tt; /* parent */ + } +#ifdef RN_DEBUG + t=tt+1; tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++; + tt->rn_twin = t; tt->rn_ybro = rn_clist; rn_clist = tt; +#endif + tt->rn_key = (caddr_t) v; + tt->rn_bit = -1; + tt->rn_flags = RNF_ACTIVE; + } + /* + * Put mask in tree. + */ + if (netmask) { + tt->rn_mask = netmask; + tt->rn_bit = x->rn_bit; + tt->rn_flags |= x->rn_flags & RNF_NORMAL; + } + t = saved_tt->rn_parent; + if (keyduplicated) + goto on2; + b_leaf = -1 - t->rn_bit; + if (t->rn_right == saved_tt) + x = t->rn_left; + else + x = t->rn_right; + /* Promote general routes from below */ + if (x->rn_bit < 0) { + for (mp = &t->rn_mklist; x; x = x->rn_dupedkey) + if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) { + *mp = m = rn_new_radix_mask(x, 0); + if (m) + mp = &m->rm_mklist; + } + } else if (x->rn_mklist) { + /* + * Skip over masks whose index is > that of new node + */ + for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) + if (m->rm_bit >= b_leaf) + break; + t->rn_mklist = m; *mp = 0; + } +on2: + /* Add new route to highest possible ancestor's list */ + if ((netmask == 0) || (b > t->rn_bit )) + return tt; /* can't lift at all */ + b_leaf = tt->rn_bit; + do { + x = t; + t = t->rn_parent; + } while (b <= t->rn_bit && x != top); + /* + * Search through routes associated with node to + * insert new route according to index. + * Need same criteria as when sorting dupedkeys to avoid + * double loop on deletion. 
+ */ + for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) { + if (m->rm_bit < b_leaf) + continue; + if (m->rm_bit > b_leaf) + break; + if (m->rm_flags & RNF_NORMAL) { + mmask = m->rm_leaf->rn_mask; + if (tt->rn_flags & RNF_NORMAL) { +#if !defined(RADIX_MPATH) + log(LOG_ERR, + "Non-unique normal route, mask not entered\n"); +#endif + return tt; + } + } else + mmask = m->rm_mask; + if (mmask == netmask) { + m->rm_refs++; + tt->rn_mklist = m; + return tt; + } + if (rn_refines(netmask, mmask) + || rn_lexobetter(netmask, mmask)) + break; + } + *mp = rn_new_radix_mask(tt, *mp); + return tt; +} + +struct radix_node * +rn_delete(v_arg, netmask_arg, head) + void *v_arg, *netmask_arg; + struct radix_node_head *head; +{ + register struct radix_node *t, *p, *x, *tt; + struct radix_mask *m, *saved_m, **mp; + struct radix_node *dupedkey, *saved_tt, *top; + caddr_t v, netmask; + int b, head_off, vlen; + + v = v_arg; + netmask = netmask_arg; + x = head->rnh_treetop; + tt = rn_search(v, x); + head_off = x->rn_offset; + vlen = LEN(v); + saved_tt = tt; + top = x; + if (tt == 0 || + bcmp(v + head_off, tt->rn_key + head_off, vlen - head_off)) + return (0); + /* + * Delete our route from mask lists. 
+ */ + if (netmask) { + if ((x = rn_addmask(netmask, 1, head_off)) == 0) + return (0); + netmask = x->rn_key; + while (tt->rn_mask != netmask) + if ((tt = tt->rn_dupedkey) == 0) + return (0); + } + if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0) + goto on1; + if (tt->rn_flags & RNF_NORMAL) { + if (m->rm_leaf != tt || m->rm_refs > 0) { + log(LOG_ERR, "rn_delete: inconsistent annotation\n"); + return 0; /* dangling ref could cause disaster */ + } + } else { + if (m->rm_mask != tt->rn_mask) { + log(LOG_ERR, "rn_delete: inconsistent annotation\n"); + goto on1; + } + if (--m->rm_refs >= 0) + goto on1; + } + b = -1 - tt->rn_bit; + t = saved_tt->rn_parent; + if (b > t->rn_bit) + goto on1; /* Wasn't lifted at all */ + do { + x = t; + t = t->rn_parent; + } while (b <= t->rn_bit && x != top); + for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) + if (m == saved_m) { + *mp = m->rm_mklist; + MKFree(m); + break; + } + if (m == 0) { + log(LOG_ERR, "rn_delete: couldn't find our annotation\n"); + if (tt->rn_flags & RNF_NORMAL) + return (0); /* Dangling ref to us */ + } +on1: + /* + * Eliminate us from tree + */ + if (tt->rn_flags & RNF_ROOT) + return (0); +#ifdef RN_DEBUG + /* Get us out of the creation list */ + for (t = rn_clist; t && t->rn_ybro != tt; t = t->rn_ybro) {} + if (t) t->rn_ybro = tt->rn_ybro; +#endif + t = tt->rn_parent; + dupedkey = saved_tt->rn_dupedkey; + if (dupedkey) { + /* + * Here, tt is the deletion target and + * saved_tt is the head of the dupekey chain. 
+ */ + if (tt == saved_tt) { + /* remove from head of chain */ + x = dupedkey; x->rn_parent = t; + if (t->rn_left == tt) + t->rn_left = x; + else + t->rn_right = x; + } else { + /* find node in front of tt on the chain */ + for (x = p = saved_tt; p && p->rn_dupedkey != tt;) + p = p->rn_dupedkey; + if (p) { + p->rn_dupedkey = tt->rn_dupedkey; + if (tt->rn_dupedkey) /* parent */ + tt->rn_dupedkey->rn_parent = p; + /* parent */ + } else log(LOG_ERR, "rn_delete: couldn't find us\n"); + } + t = tt + 1; + if (t->rn_flags & RNF_ACTIVE) { +#ifndef RN_DEBUG + *++x = *t; + p = t->rn_parent; +#else + b = t->rn_info; + *++x = *t; + t->rn_info = b; + p = t->rn_parent; +#endif + if (p->rn_left == t) + p->rn_left = x; + else + p->rn_right = x; + x->rn_left->rn_parent = x; + x->rn_right->rn_parent = x; + } + goto out; + } + if (t->rn_left == tt) + x = t->rn_right; + else + x = t->rn_left; + p = t->rn_parent; + if (p->rn_right == t) + p->rn_right = x; + else + p->rn_left = x; + x->rn_parent = p; + /* + * Demote routes attached to us. + */ + if (t->rn_mklist) { + if (x->rn_bit >= 0) { + for (mp = &x->rn_mklist; (m = *mp);) + mp = &m->rm_mklist; + *mp = t->rn_mklist; + } else { + /* If there are any key,mask pairs in a sibling + duped-key chain, some subset will appear sorted + in the same order attached to our mklist */ + for (m = t->rn_mklist; m && x; x = x->rn_dupedkey) + if (m == x->rn_mklist) { + struct radix_mask *mm = m->rm_mklist; + x->rn_mklist = 0; + if (--(m->rm_refs) < 0) + MKFree(m); + m = mm; + } + if (m) + log(LOG_ERR, + "rn_delete: Orphaned Mask %p at %p\n", + m, x); + } + } + /* + * We may be holding an active internal node in the tree. 
+ */ + x = tt + 1; + if (t != x) { +#ifndef RN_DEBUG + *t = *x; +#else + b = t->rn_info; + *t = *x; + t->rn_info = b; +#endif + t->rn_left->rn_parent = t; + t->rn_right->rn_parent = t; + p = x->rn_parent; + if (p->rn_left == x) + p->rn_left = t; + else + p->rn_right = t; + } +out: + tt->rn_flags &= ~RNF_ACTIVE; + tt[1].rn_flags &= ~RNF_ACTIVE; + return (tt); +} + +/* + * This is the same as rn_walktree() except for the parameters and the + * exit. + */ +static int +rn_walktree_from(h, a, m, f, w) + struct radix_node_head *h; + void *a, *m; + walktree_f_t *f; + void *w; +{ + int error; + struct radix_node *base, *next; + u_char *xa = (u_char *)a; + u_char *xm = (u_char *)m; + register struct radix_node *rn, *last = 0 /* shut up gcc */; + int stopping = 0; + int lastb; + + /* + * rn_search_m is sort-of-open-coded here. We cannot use the + * function because we need to keep track of the last node seen. + */ + /* printf("about to search\n"); */ + for (rn = h->rnh_treetop; rn->rn_bit >= 0; ) { + last = rn; + /* printf("rn_bit %d, rn_bmask %x, xm[rn_offset] %x\n", + rn->rn_bit, rn->rn_bmask, xm[rn->rn_offset]); */ + if (!(rn->rn_bmask & xm[rn->rn_offset])) { + break; + } + if (rn->rn_bmask & xa[rn->rn_offset]) { + rn = rn->rn_right; + } else { + rn = rn->rn_left; + } + } + /* printf("done searching\n"); */ + + /* + * Two cases: either we stepped off the end of our mask, + * in which case last == rn, or we reached a leaf, in which + * case we want to start from the last node we looked at. + * Either way, last is the node we want to start from. + */ + rn = last; + lastb = rn->rn_bit; + + /* printf("rn %p, lastb %d\n", rn, lastb);*/ + + /* + * This gets complicated because we may delete the node + * while applying the function f to it, so we need to calculate + * the successor node in advance. 
+ */ + while (rn->rn_bit >= 0) + rn = rn->rn_left; + + while (!stopping) { + /* printf("node %p (%d)\n", rn, rn->rn_bit); */ + base = rn; + /* If at right child go back up, otherwise, go right */ + while (rn->rn_parent->rn_right == rn + && !(rn->rn_flags & RNF_ROOT)) { + rn = rn->rn_parent; + + /* if went up beyond last, stop */ + if (rn->rn_bit <= lastb) { + stopping = 1; + /* printf("up too far\n"); */ + /* + * XXX we should jump to the 'Process leaves' + * part, because the values of 'rn' and 'next' + * we compute will not be used. Not a big deal + * because this loop will terminate, but it is + * inefficient and hard to understand! + */ + } + } + + /* + * At the top of the tree, no need to traverse the right + * half, prevent the traversal of the entire tree in the + * case of default route. + */ + if (rn->rn_parent->rn_flags & RNF_ROOT) + stopping = 1; + + /* Find the next *leaf* since next node might vanish, too */ + for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;) + rn = rn->rn_left; + next = rn; + /* Process leaves */ + while ((rn = base) != 0) { + base = rn->rn_dupedkey; + /* printf("leaf %p\n", rn); */ + if (!(rn->rn_flags & RNF_ROOT) + && (error = (*f)(rn, w))) + return (error); + } + rn = next; + + if (rn->rn_flags & RNF_ROOT) { + /* printf("root, stopping"); */ + stopping = 1; + } + + } + return 0; +} + +static int +rn_walktree(h, f, w) + struct radix_node_head *h; + walktree_f_t *f; + void *w; +{ + int error; + struct radix_node *base, *next; + register struct radix_node *rn = h->rnh_treetop; + /* + * This gets complicated because we may delete the node + * while applying the function f to it, so we need to calculate + * the successor node in advance. 
+ */ + + /* First time through node, go left */ + while (rn->rn_bit >= 0) + rn = rn->rn_left; + for (;;) { + base = rn; + /* If at right child go back up, otherwise, go right */ + while (rn->rn_parent->rn_right == rn + && (rn->rn_flags & RNF_ROOT) == 0) + rn = rn->rn_parent; + /* Find the next *leaf* since next node might vanish, too */ + for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;) + rn = rn->rn_left; + next = rn; + /* Process leaves */ + while ((rn = base)) { + base = rn->rn_dupedkey; + if (!(rn->rn_flags & RNF_ROOT) + && (error = (*f)(rn, w))) + return (error); + } + rn = next; + if (rn->rn_flags & RNF_ROOT) + return (0); + } + /* NOTREACHED */ +} + +/* + * Allocate and initialize an empty tree. This has 3 nodes, which are + * part of the radix_node_head (in the order <left,root,right>) and are + * marked RNF_ROOT so they cannot be freed. + * The leaves have all-zero and all-one keys, with significant + * bits starting at 'off'. + * Return 1 on success, 0 on error. + */ +int +rn_inithead(head, off) + void **head; + int off; +{ + register struct radix_node_head *rnh; + register struct radix_node *t, *tt, *ttt; + if (*head) + return (1); + R_Zalloc(rnh, struct radix_node_head *, sizeof (*rnh)); + if (rnh == 0) + return (0); +#ifdef _KERNEL + RADIX_NODE_HEAD_LOCK_INIT(rnh); +#endif + *head = rnh; + t = rn_newpair(rn_zeros, off, rnh->rnh_nodes); + ttt = rnh->rnh_nodes + 2; + t->rn_right = ttt; + t->rn_parent = t; + tt = t->rn_left; /* ...
which in turn is rnh->rnh_nodes */ + tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE; + tt->rn_bit = -1 - off; + *ttt = *tt; + ttt->rn_key = rn_ones; + rnh->rnh_addaddr = rn_addroute; + rnh->rnh_deladdr = rn_delete; + rnh->rnh_matchaddr = rn_match; + rnh->rnh_lookup = rn_lookup; + rnh->rnh_walktree = rn_walktree; + rnh->rnh_walktree_from = rn_walktree_from; + rnh->rnh_treetop = t; + return (1); +} + +int +rn_detachhead(void **head) +{ + struct radix_node_head *rnh; + + KASSERT((head != NULL && *head != NULL), + ("%s: head already freed", __func__)); + rnh = *head; + + /* Free nodes. */ + Free(rnh); + + *head = NULL; + return (1); +} + +void +rn_init(int maxk) +{ + char *cp, *cplim; + + max_keylen = maxk; + if (max_keylen == 0) { + log(LOG_ERR, + "rn_init: radix functions require max_keylen be set\n"); + return; + } + R_Malloc(rn_zeros, char *, 3 * max_keylen); + if (rn_zeros == NULL) + panic("rn_init"); + bzero(rn_zeros, 3 * max_keylen); + rn_ones = cp = rn_zeros + max_keylen; + addmask_key = cplim = rn_ones + max_keylen; + while (cp < cplim) + *cp++ = -1; + if (rn_inithead((void **)(void *)&mask_rnhead, 0) == 0) + panic("rn_init 2"); +} diff --git a/sys/net/radix.h b/sys/net/radix.h new file mode 100644 index 0000000..4102c99 --- /dev/null +++ b/sys/net/radix.h @@ -0,0 +1,181 @@ +/*- + * Copyright (c) 1988, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)radix.h 8.2 (Berkeley) 10/31/94 + * $FreeBSD: head/sys/net/radix.h 185747 2008-12-07 21:15:43Z kmacy $ + */ + +#ifndef _RADIX_H_ +#define _RADIX_H_ + +#ifdef _KERNEL +#include +#include +#include +#endif + +#ifdef MALLOC_DECLARE +MALLOC_DECLARE(M_RTABLE); +#endif + +/* + * Radix search tree node layout. 
+ */ + +struct radix_node { + struct radix_mask *rn_mklist; /* list of masks contained in subtree */ + struct radix_node *rn_parent; /* parent */ + short rn_bit; /* bit offset; -1-index(netmask) */ + char rn_bmask; /* node: mask for bit test*/ + u_char rn_flags; /* enumerated next */ +#define RNF_NORMAL 1 /* leaf contains normal route */ +#define RNF_ROOT 2 /* leaf is root leaf for tree */ +#define RNF_ACTIVE 4 /* This node is alive (for rtfree) */ + union { + struct { /* leaf only data: */ + caddr_t rn_Key; /* object of search */ + caddr_t rn_Mask; /* netmask, if present */ + struct radix_node *rn_Dupedkey; + } rn_leaf; + struct { /* node only data: */ + int rn_Off; /* where to start compare */ + struct radix_node *rn_L;/* progeny */ + struct radix_node *rn_R;/* progeny */ + } rn_node; + } rn_u; +#ifdef RN_DEBUG + int rn_info; + struct radix_node *rn_twin; + struct radix_node *rn_ybro; +#endif +}; + +#define rn_dupedkey rn_u.rn_leaf.rn_Dupedkey +#define rn_key rn_u.rn_leaf.rn_Key +#define rn_mask rn_u.rn_leaf.rn_Mask +#define rn_offset rn_u.rn_node.rn_Off +#define rn_left rn_u.rn_node.rn_L +#define rn_right rn_u.rn_node.rn_R + +/* + * Annotations to tree concerning potential routes applying to subtrees. + */ + +struct radix_mask { + short rm_bit; /* bit offset; -1-index(netmask) */ + char rm_unused; /* cf. rn_bmask */ + u_char rm_flags; /* cf. rn_flags */ + struct radix_mask *rm_mklist; /* more masks to try */ + union { + caddr_t rmu_mask; /* the mask */ + struct radix_node *rmu_leaf; /* for normal routes */ + } rm_rmu; + int rm_refs; /* # of references to this struct */ +}; + +#define rm_mask rm_rmu.rmu_mask +#define rm_leaf rm_rmu.rmu_leaf /* extra field would make 32 bytes */ + +typedef int walktree_f_t(struct radix_node *, void *); + +struct radix_node_head { + struct radix_node *rnh_treetop; + u_int rnh_gen; /* generation counter */ + int rnh_multipath; /* multipath capable ? 
*/ + int rnh_addrsize; /* permit, but not require fixed keys */ + int rnh_pktsize; /* permit, but not require fixed keys */ + struct radix_node *(*rnh_addaddr) /* add based on sockaddr */ + (void *v, void *mask, + struct radix_node_head *head, struct radix_node nodes[]); + struct radix_node *(*rnh_addpkt) /* add based on packet hdr */ + (void *v, void *mask, + struct radix_node_head *head, struct radix_node nodes[]); + struct radix_node *(*rnh_deladdr) /* remove based on sockaddr */ + (void *v, void *mask, struct radix_node_head *head); + struct radix_node *(*rnh_delpkt) /* remove based on packet hdr */ + (void *v, void *mask, struct radix_node_head *head); + struct radix_node *(*rnh_matchaddr) /* locate based on sockaddr */ + (void *v, struct radix_node_head *head); + struct radix_node *(*rnh_lookup) /* locate based on sockaddr */ + (void *v, void *mask, struct radix_node_head *head); + struct radix_node *(*rnh_matchpkt) /* locate based on packet hdr */ + (void *v, struct radix_node_head *head); + int (*rnh_walktree) /* traverse tree */ + (struct radix_node_head *head, walktree_f_t *f, void *w); + int (*rnh_walktree_from) /* traverse tree below a */ + (struct radix_node_head *head, void *a, void *m, + walktree_f_t *f, void *w); + void (*rnh_close) /* do something when the last ref drops */ + (struct radix_node *rn, struct radix_node_head *head); + struct radix_node rnh_nodes[3]; /* empty tree for common case */ +#ifdef _KERNEL +#if defined( __linux__ ) || defined( _WIN32 ) + spinlock_t rnh_lock; +#else + struct rwlock rnh_lock; /* locks entire radix tree */ +#endif /* !__linux__ */ +#endif +}; + +#ifndef _KERNEL +#define R_Malloc(p, t, n) (p = (t) malloc((unsigned int)(n))) +#define R_Zalloc(p, t, n) (p = (t) calloc(1,(unsigned int)(n))) +#define Free(p) free((char *)p); +#else +#define R_Malloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT)) +#define R_Zalloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT | M_ZERO)) +#define 
Free(p) free((caddr_t)p, M_RTABLE); + +#define RADIX_NODE_HEAD_LOCK_INIT(rnh) \ + rw_init_flags(&(rnh)->rnh_lock, "radix node head", 0) +#define RADIX_NODE_HEAD_LOCK(rnh) rw_wlock(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_UNLOCK(rnh) rw_wunlock(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_RLOCK(rnh) rw_rlock(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_RUNLOCK(rnh) rw_runlock(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_LOCK_TRY_UPGRADE(rnh) rw_try_upgrade(&(rnh)->rnh_lock) + + +#define RADIX_NODE_HEAD_DESTROY(rnh) rw_destroy(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_LOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_LOCKED) +#define RADIX_NODE_HEAD_WLOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_WLOCKED) +#endif /* _KERNEL */ + +void rn_init(int); +int rn_inithead(void **, int); +int rn_detachhead(void **); +int rn_refines(void *, void *); +struct radix_node + *rn_addmask(void *, int, int), + *rn_addroute (void *, void *, struct radix_node_head *, + struct radix_node [2]), + *rn_delete(void *, void *, struct radix_node_head *), + *rn_lookup (void *v_arg, void *m_arg, + struct radix_node_head *head), + *rn_match(void *, struct radix_node_head *); + +#endif /* _RADIX_H_ */ diff --git a/sys/netgraph/ng_ipfw.h b/sys/netgraph/ng_ipfw.h new file mode 100644 index 0000000..de74d4e --- /dev/null +++ b/sys/netgraph/ng_ipfw.h @@ -0,0 +1,33 @@ +/*- + * Copyright 2005, Gleb Smirnoff + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/netgraph/ng_ipfw.h,v 1.2 2006/02/17 09:42:49 glebius Exp $ + */ + +#ifndef _NG_IPFW_H +#define _NG_IPFW_H +#define NG_IPFW_NODE_TYPE "ipfw" +#define NGM_IPFW_COOKIE 1105988990 +#endif /* _NG_IPFW_H */ diff --git a/sys/netinet/in_cksum.c b/sys/netinet/in_cksum.c new file mode 100644 index 0000000..d6acf87 --- /dev/null +++ b/sys/netinet/in_cksum.c @@ -0,0 +1,146 @@ +/*- + * Copyright (c) 1988, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93 + */ + +#include +__FBSDID("$FreeBSD: src/sys/netinet/in_cksum.c,v 1.10 2007/10/07 20:44:22 silby Exp $"); + +#include +#include + +/* + * Checksum routine for Internet Protocol family headers (Portable Version). + * + * This routine is very heavily used in the network + * code and should be modified for each CPU to be as fast as possible. + */ + +#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x) +#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);} + +int +in_cksum(struct mbuf *m, int len) +{ + register u_short *w; + register int sum = 0; + register int mlen = 0; + int byte_swapped = 0; + + union { + char c[2]; + u_short s; + } s_util; + union { + u_short s[2]; + long l; + } l_util; + + for (;m && len; m = m->m_next) { + if (m->m_len == 0) + continue; + w = mtod(m, u_short *); + if (mlen == -1) { + /* + * The first byte of this mbuf is the continuation + * of a word spanning between this mbuf and the + * last mbuf. + * + * s_util.c[0] is already saved when scanning previous + * mbuf. 
+ */ + s_util.c[1] = *(char *)w; + sum += s_util.s; + w = (u_short *)((char *)w + 1); + mlen = m->m_len - 1; + len--; + } else + mlen = m->m_len; + if (len < mlen) + mlen = len; + len -= mlen; + /* + * Force to even boundary. + */ + if ((1 & (uintptr_t) w) && (mlen > 0)) { + REDUCE; + sum <<= 8; + s_util.c[0] = *(u_char *)w; + w = (u_short *)((char *)w + 1); + mlen--; + byte_swapped = 1; + } + /* + * Unroll the loop to make overhead from + * branches &c small. + */ + while ((mlen -= 32) >= 0) { + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; + sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; + sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11]; + sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15]; + w += 16; + } + mlen += 32; + while ((mlen -= 8) >= 0) { + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; + w += 4; + } + mlen += 8; + if (mlen == 0 && byte_swapped == 0) + continue; + REDUCE; + while ((mlen -= 2) >= 0) { + sum += *w++; + } + if (byte_swapped) { + REDUCE; + sum <<= 8; + byte_swapped = 0; + if (mlen == -1) { + s_util.c[1] = *(char *)w; + sum += s_util.s; + mlen = 0; + } else + mlen = -1; + } else if (mlen == -1) + s_util.c[0] = *(char *)w; + } + if (len) + printf("cksum: out of data\n"); + if (mlen == -1) { + /* The last mbuf has odd # of bytes. 
Follow the + standard (the odd byte may be shifted left by 8 bits + or not as determined by endian-ness of the machine) */ + s_util.c[1] = 0; + sum += s_util.s; + } + REDUCE; + return (~sum & 0xffff); +} diff --git a/sys/netinet/ip.h b/sys/netinet/ip.h new file mode 100644 index 0000000..c9da4d8 --- /dev/null +++ b/sys/netinet/ip.h @@ -0,0 +1,49 @@ +#ifndef _NETINET_IP_H_ +#define _NETINET_IP_H_ + +#define LITTLE_ENDIAN 1234 +#define BIG_ENDIAN 4321 +#if defined(__BIG_ENDIAN) +#define BYTE_ORDER BIG_ENDIAN +//#warning we are in bigendian +#elif defined(__LITTLE_ENDIAN) +//#warning we are in littleendian +#define BYTE_ORDER LITTLE_ENDIAN +#else +#error no platform +#endif + +/* XXX endianness doesn't belong here */ +// #define LITTLE_ENDIAN 1234 +// #define BIG_ENDIAN 4321 +// #define BYTE_ORDER LITTLE_ENDIAN + +/* + * Structure of an internet header, naked of options. + */ +struct ip { +#if BYTE_ORDER == LITTLE_ENDIAN + u_char ip_hl:4, /* header length */ + ip_v:4; /* version */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_char ip_v:4, /* version */ + ip_hl:4; /* header length */ +#endif + u_char ip_tos; /* type of service */ + u_short ip_len; /* total length */ + u_short ip_id; /* identification */ + u_short ip_off; /* fragment offset field */ +#define IP_RF 0x8000 /* reserved fragment flag */ +#define IP_DF 0x4000 /* dont fragment flag */ +#define IP_MF 0x2000 /* more fragments flag */ +#define IP_OFFMASK 0x1fff /* mask for fragmenting bits */ + u_char ip_ttl; /* time to live */ + u_char ip_p; /* protocol */ + u_short ip_sum; /* checksum */ + struct in_addr ip_src,ip_dst; /* source and dest address */ +} __packed __aligned(4); + +#define IPTOS_LOWDELAY 0x10 + +#endif /* _NETINET_IP_H_ */ diff --git a/sys/netinet/ip6.h b/sys/netinet/ip6.h new file mode 100644 index 0000000..88b42a4 --- /dev/null +++ b/sys/netinet/ip6.h @@ -0,0 +1,58 @@ +#ifndef _NETINET_IP6_H_ +#define _NETINET_IP6_H_ +#define IN6_ARE_ADDR_EQUAL(a, b) \ +(memcmp(&(a)->s6_addr[0], &(b)->s6_addr[0], 
sizeof(struct in6_addr)) == 0) + +struct ip6_hdr { + union { + struct ip6_hdrctl { + u_int32_t ip6_un1_flow; /* 20 bits of flow-ID */ + u_int16_t ip6_un1_plen; /* payload length */ + u_int8_t ip6_un1_nxt; /* next header */ + u_int8_t ip6_un1_hlim; /* hop limit */ + } ip6_un1; + u_int8_t ip6_un2_vfc; /* 4 bits version, top 4 bits class */ + } ip6_ctlun; + struct in6_addr ip6_src; /* source address */ + struct in6_addr ip6_dst; /* destination address */ +}; +#define ip6_nxt ip6_ctlun.ip6_un1.ip6_un1_nxt +#define ip6_flow ip6_ctlun.ip6_un1.ip6_un1_flow + + +struct icmp6_hdr { + u_int8_t icmp6_type; /* type field */ + u_int8_t icmp6_code; /* code field */ + u_int16_t icmp6_cksum; /* checksum field */ + union { + u_int32_t icmp6_un_data32[1]; /* type-specific field */ + u_int16_t icmp6_un_data16[2]; /* type-specific field */ + u_int8_t icmp6_un_data8[4]; /* type-specific field */ + } icmp6_dataun; +}; + +struct ip6_hbh { + u_int8_t ip6h_nxt; /* next header */ + u_int8_t ip6h_len; /* length in units of 8 octets */ + /* followed by options */ +}; +struct ip6_rthdr { + u_int8_t ip6r_nxt; /* next header */ + u_int8_t ip6r_len; /* length in units of 8 octets */ + u_int8_t ip6r_type; /* routing type */ + u_int8_t ip6r_segleft; /* segments left */ + /* followed by routing type specific data */ +}; +struct ip6_frag { + u_int8_t ip6f_nxt; /* next header */ + u_int8_t ip6f_reserved; /* reserved field */ + u_int16_t ip6f_offlg; /* offset, reserved, and flag */ + u_int32_t ip6f_ident; /* identification */ +}; +#define IP6F_OFF_MASK 0xfff8 /* mask out offset from _offlg */ +#define IP6F_MORE_FRAG 0x0001 /* more-fragments flag */ +struct ip6_ext { + u_int8_t ip6e_nxt; + u_int8_t ip6e_len; +}; +#endif /* _NETINET_IP6_H_ */ diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h new file mode 100644 index 0000000..eab28f6 --- /dev/null +++ b/sys/netinet/ip_dummynet.h @@ -0,0 +1,263 @@ +/*- + * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa + * Portions Copyright 
(c) 2000 Akamba Corp. + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: user/luigi/ipfw3-head/sys/netinet/ip_dummynet.h 203321 2010-01-31 21:39:25Z luigi $ + */ + +#ifndef _IP_DUMMYNET_H +#define _IP_DUMMYNET_H + +/* + * Definition of the kernel-userland API for dummynet. + * + * Setsockopt() and getsockopt() pass a batch of objects, each + * of them starting with a "struct dn_id" which should fully identify + * the object and its relation with others in the sequence. + * The first object in each request should have + * type= DN_CMD_*, id = DN_API_VERSION. 
+ * For other objects, type and subtype specify the object, len indicates + * the total length including the header, and 'id' identifies the specific + * object. + * + * Most objects are numbered with an identifier in the range 1..65535. + * DN_MAX_ID indicates the first value outside the range. + */ + +#define DN_API_VERSION 12500000 +#define DN_MAX_ID 0x10000 + +struct dn_id { + uint16_t len; /* total obj len including this header */ + uint8_t type; + uint8_t subtype; + uint32_t id; /* generic id */ +}; + +/* + * These values are in the type field of struct dn_id. + * To preserve the ABI, never rearrange the list or delete + * entries with the exception of DN_LAST + */ +enum { + DN_NONE = 0, + DN_LINK = 1, + DN_FS, + DN_SCH, + DN_SCH_I, + DN_QUEUE, + DN_DELAY_LINE, + DN_PROFILE, + DN_FLOW, /* struct dn_flow */ + DN_TEXT, /* opaque text is the object */ + + DN_CMD_CONFIG = 0x80, /* objects follow */ + DN_CMD_DELETE, /* subtype + list of entries */ + DN_CMD_GET, /* subtype + list of entries */ + DN_CMD_FLUSH, + /* for compatibility with FreeBSD 7.2/8 */ + DN_COMPAT_PIPE, + DN_COMPAT_QUEUE, + DN_GET_COMPAT, + + /* special commands for emulation of sysctl variables */ + DN_SYSCTL_GET, + DN_SYSCTL_SET, + + DN_LAST, +}; + +enum { /* subtype for schedulers, flowset and the like */ + DN_SCHED_UNKNOWN = 0, + DN_SCHED_FIFO = 1, + DN_SCHED_WF2QP = 2, + /* others are in individual modules */ +}; + +enum { /* user flags */ + DN_HAVE_MASK = 0x0001, /* fs or sched has a mask */ + DN_NOERROR = 0x0002, /* do not report errors */ + DN_QHT_HASH = 0x0004, /* qht is a hash table */ + DN_QSIZE_BYTES = 0x0008, /* queue size is in bytes */ + DN_HAS_PROFILE = 0x0010, /* a link has a profile */ + DN_IS_RED = 0x0020, + DN_IS_GENTLE_RED= 0x0040, + DN_PIPE_CMD = 0x1000, /* pipe config... */ +}; + +/* + * link template. + */ +struct dn_link { + struct dn_id oid; + + /* + * Userland sets bw and delay in bits/s and milliseconds. 
+ * The kernel converts this back and forth to bits/tick and ticks. + * XXX what about burst ? + */ + int32_t link_nr; + int bandwidth; /* bit/s or bits/tick. */ + int delay; /* ms and ticks */ + uint64_t burst; /* scaled. bits*Hz XXX */ +}; + +/* + * A flowset, which is a template for flows. Contains parameters + * from the command line: id, target scheduler, queue sizes, plr, + * flow masks, buckets for the flow hash, and possibly scheduler- + * specific parameters (weight, quantum and so on). + */ +struct dn_fs { + struct dn_id oid; + uint32_t fs_nr; /* the flowset number */ + uint32_t flags; /* userland flags */ + int qsize; /* queue size in slots or bytes */ + int32_t plr; /* PLR, pkt loss rate (2^31-1 means 100%) */ + uint32_t buckets; /* buckets used for the queue hash table */ + + struct ipfw_flow_id flow_mask; + uint32_t sched_nr; /* the scheduler we attach to */ + /* generic scheduler parameters. Leave them at -1 if unset. + * Now we use 0: weight, 1: lmax, 2: priority + */ + int par[4]; + + /* RED/GRED parameters. + * weight and probabilities are in the range 0..1 represented + * in fixed point arithmetic with SCALE_RED decimal bits. + */ +#define SCALE_RED 16 +#define SCALE(x) ( (x) << SCALE_RED ) +#define SCALE_VAL(x) ( (x) >> SCALE_RED ) +#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ + +}; + +/* + * dn_flow collects flow_id and stats for queues and scheduler + * instances, and is used to pass these info to userland. + * oid.type/oid.subtype describe the object, oid.id is number + * of the parent object. 
+ */ +struct dn_flow { + struct dn_id oid; + struct ipfw_flow_id fid; + uint64_t tot_pkts; /* statistics counters */ + uint64_t tot_bytes; + uint32_t length; /* Queue length, in packets */ + uint32_t len_bytes; /* Queue length, in bytes */ + uint32_t drops; +}; + + +/* + * Scheduler template, mostly indicating the name, number, + * sched_mask and buckets. + */ +struct dn_sch { + struct dn_id oid; + uint32_t sched_nr; /* N, scheduler number */ + uint32_t buckets; /* number of buckets for the instances */ + uint32_t flags; /* have_mask, ... */ + + char name[16]; /* null terminated */ + /* mask to select the appropriate scheduler instance */ + struct ipfw_flow_id sched_mask; /* M */ +}; + + +/* A delay profile is attached to a link. + * Note that a profile, as any other object, cannot be longer than 2^16 + */ +#define ED_MAX_SAMPLES_NO 1024 +struct dn_profile { + struct dn_id oid; + /* fields to simulate a delay profile */ +#define ED_MAX_NAME_LEN 32 + char name[ED_MAX_NAME_LEN]; + int link_nr; + int loss_level; + int bandwidth; // XXX use link bandwidth? + int samples_no; /* actual len of samples[] */ + int samples[0]; /* may be shorter */ +}; + + + +/* + * Overall structure of dummynet + +In dummynet, packets are selected with the firewall rules, and passed +to two different objects: PIPE or QUEUE (bad name). + +A QUEUE defines a classifier, which groups packets into flows +according to a 'mask', puts them into independent queues (one +per flow) with configurable size and queue management policy, +and passes flows to a scheduler: + + (flow_mask|sched_mask) sched_mask + +---------+ weight Wx +-------------+ + | |->-[flow]-->--| |-+ + -->--| QUEUE x | ... | | | + | |->-[flow]-->--| SCHEDuler N | | + +---------+ | | | + ... | +--[LINK N]-->-- + +---------+ weight Wy | | +--[LINK N]-->-- + | |->-[flow]-->--| | | + -->--| QUEUE y | ... 
| | | + | |->-[flow]-->--| | | + +---------+ +-------------+ | + +-------------+ + +Many QUEUE objects can connect to the same scheduler, each +QUEUE object can have its own set of parameters. + +In turn, the SCHEDuler 'forks' multiple instances according +to a 'sched_mask', each instance manages its own set of queues +and transmits on a private instance of a configurable LINK. + +A PIPE is a simplified version of the above, where there +is no flow_mask, and each scheduler instance handles a single queue. + +The following data structures (visible from userland) describe +the objects used by dummynet: + + + dn_link, contains the main configuration parameters related + to delay and bandwidth; + + dn_profile describes a delay profile; + + dn_flow describes the flow status (flow id, statistics) + + + dn_sch describes a scheduler + + dn_fs describes a flowset (msk, weight, queue parameters) + + * + */ + +#endif /* _IP_DUMMYNET_H */ diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h new file mode 100644 index 0000000..9bfe775 --- /dev/null +++ b/sys/netinet/ip_fw.h @@ -0,0 +1,646 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: user/luigi/ipfw3-head/sys/netinet/ip_fw.h 202072 2010-01-11 10:12:35Z luigi $ + */ + +#ifndef _IPFW2_H +#define _IPFW2_H + +/* + * The default rule number. By the design of ip_fw, the default rule + * is the last one, so its number can also serve as the highest number + * allowed for a rule. The ip_fw code relies on both meanings of this + * constant. + */ +#define IPFW_DEFAULT_RULE 65535 + +/* + * The number of ipfw tables. The maximum allowed table number is the + * (IPFW_TABLES_MAX - 1). + */ +#define IPFW_TABLES_MAX 128 + +/* + * Most commands (queue, pipe, tag, untag, limit...) can have a 16-bit + * argument between 1 and 65534. The value 0 is unused, the value + * 65535 (IP_FW_TABLEARG) is used to represent 'tablearg', i.e. the + * result of the most recent table() lookup. So the argument + * can be 1..65534, or 65535 to indicate the use of 'tablearg'. + * Note that 16bit is only a historical limit, resulting from + * the use of a 16-bit field for that value. In reality, we can have + * 2^32 pipes, queues, tag values and so on, and use 0 as a tablearg. + */ +#define IPFW_ARG_MIN 1 +#define IPFW_ARG_MAX 65534 +#define IP_FW_TABLEARG 65535 /* XXX should use 0 */ + + /* + * Number of entries in the call stack of the call/return commands. + * Call stack currently is an uint16_t array with rule numbers. 
+ */ +#define IPFW_CALLSTACK_SIZE 16 + +/* IP_FW3 header/opcodes */ +typedef struct _ip_fw3_opheader { + uint16_t opcode; /* Operation opcode */ + uint16_t reserved[3]; /* Align to 64-bit boundary */ +} ip_fw3_opheader; + + +/* IPFW extended tables support XXX what namespace ? */ +#define IP_FW_TABLE_XADD 86 /* add entry */ +#define IP_FW_TABLE_XDEL 87 /* delete entry */ +#define IP_FW_TABLE_XGETSIZE 88 /* get table size */ +#define IP_FW_TABLE_XLIST 89 /* list table contents */ + +/* + * The kernel representation of ipfw rules is made of a list of + * 'instructions' (for all practical purposes equivalent to BPF + * instructions), which specify which fields of the packet + * (or its metadata) should be analysed. + * + * Each instruction is stored in a structure which begins with + * "ipfw_insn", and can contain extra fields depending on the + * instruction type (listed below). + * Note that the code is written so that individual instructions + * have a size which is a multiple of 32 bits. This means that, if + * such structures contain pointers or other 64-bit entities, + * (there is just one instance now) they may end up unaligned on + * 64-bit architectures, so they must be handled with care. + * + * "enum ipfw_opcodes" are the opcodes supported. We can have up + * to 256 different opcodes. When adding new opcodes, they should + * be appended to the end of the opcode list before O_LAST_OPCODE, + * this will prevent the ABI from being broken, otherwise users + * will have to recompile ipfw(8) when they update the kernel. 
+ */ + +enum ipfw_opcodes { /* arguments (4 byte each) */ + O_NOP, + + O_IP_SRC, /* u32 = IP */ + O_IP_SRC_MASK, /* ip = IP/mask */ + O_IP_SRC_ME, /* none */ + O_IP_SRC_SET, /* u32=base, arg1=len, bitmap */ + + O_IP_DST, /* u32 = IP */ + O_IP_DST_MASK, /* ip = IP/mask */ + O_IP_DST_ME, /* none */ + O_IP_DST_SET, /* u32=base, arg1=len, bitmap */ + + O_IP_SRCPORT, /* (n)port list:mask 4 byte ea */ + O_IP_DSTPORT, /* (n)port list:mask 4 byte ea */ + O_PROTO, /* arg1=protocol */ + + O_MACADDR2, /* 2 mac addr:mask */ + O_MAC_TYPE, /* same as srcport */ + + O_LAYER2, /* none */ + O_IN, /* none */ + O_FRAG, /* none */ + + O_RECV, /* none */ + O_XMIT, /* none */ + O_VIA, /* none */ + + O_IPOPT, /* arg1 = 2*u8 bitmap */ + O_IPLEN, /* arg1 = len */ + O_IPID, /* arg1 = id */ + + O_IPTOS, /* arg1 = id */ + O_IPPRECEDENCE, /* arg1 = precedence << 5 */ + O_IPTTL, /* arg1 = TTL */ + + O_IPVER, /* arg1 = version */ + O_UID, /* u32 = id */ + O_GID, /* u32 = id */ + O_ESTAB, /* none (tcp established) */ + O_TCPFLAGS, /* arg1 = 2*u8 bitmap */ + O_TCPWIN, /* arg1 = desired win */ + O_TCPSEQ, /* u32 = desired seq. */ + O_TCPACK, /* u32 = desired seq. */ + O_ICMPTYPE, /* u32 = icmp bitmap */ + O_TCPOPTS, /* arg1 = 2*u8 bitmap */ + + O_VERREVPATH, /* none */ + O_VERSRCREACH, /* none */ + + O_PROBE_STATE, /* none */ + O_KEEP_STATE, /* none */ + O_LIMIT, /* ipfw_insn_limit */ + O_LIMIT_PARENT, /* dyn_type, not an opcode. */ + + /* + * These are really 'actions'. + */ + + O_LOG, /* ipfw_insn_log */ + O_PROB, /* u32 = match probability */ + + O_CHECK_STATE, /* none */ + O_ACCEPT, /* none */ + O_DENY, /* none */ + O_REJECT, /* arg1=icmp arg (same as deny) */ + O_COUNT, /* none */ + O_SKIPTO, /* arg1=next rule number */ + O_PIPE, /* arg1=pipe number */ + O_QUEUE, /* arg1=queue number */ + O_DIVERT, /* arg1=port number */ + O_TEE, /* arg1=port number */ + O_FORWARD_IP, /* fwd sockaddr */ + O_FORWARD_MAC, /* fwd mac */ + O_NAT, /* nope */ + O_REASS, /* none */ + + /* + * More opcodes. 
+ */ + O_IPSEC, /* has ipsec history */ + O_IP_SRC_LOOKUP, /* arg1=table number, u32=value */ + O_IP_DST_LOOKUP, /* arg1=table number, u32=value */ + O_ANTISPOOF, /* none */ + O_JAIL, /* u32 = id */ + O_ALTQ, /* u32 = altq classif. qid */ + O_DIVERTED, /* arg1=bitmap (1:loop, 2:out) */ + O_TCPDATALEN, /* arg1 = tcp data len */ + O_IP6_SRC, /* address without mask */ + O_IP6_SRC_ME, /* my addresses */ + O_IP6_SRC_MASK, /* address with the mask */ + O_IP6_DST, + O_IP6_DST_ME, + O_IP6_DST_MASK, + O_FLOW6ID, /* for flow id tag in the ipv6 pkt */ + O_ICMP6TYPE, /* icmp6 packet type filtering */ + O_EXT_HDR, /* filtering for ipv6 extension header */ + O_IP6, + + /* + * actions for ng_ipfw + */ + O_NETGRAPH, /* send to ng_ipfw */ + O_NGTEE, /* copy to ng_ipfw */ + + O_IP4, + + O_UNREACH6, /* arg1=icmpv6 code arg (deny) */ + + O_TAG, /* arg1=tag number */ + O_TAGGED, /* arg1=tag number */ + + O_SETFIB, /* arg1=FIB number */ + O_FIB, /* arg1=FIB desired fib number */ + + O_SOCKARG, /* socket argument */ + + O_CALLRETURN, /* arg1=called rule number */ + + O_FORWARD_IP6, /* fwd sockaddr_in6 */ + + O_LAST_OPCODE /* not an opcode! */ +}; + + +/* + * The extension header are filtered only for presence using a bit + * vector with a flag for each header. + */ +#define EXT_FRAGMENT 0x1 +#define EXT_HOPOPTS 0x2 +#define EXT_ROUTING 0x4 +#define EXT_AH 0x8 +#define EXT_ESP 0x10 +#define EXT_DSTOPTS 0x20 +#define EXT_RTHDR0 0x40 +#define EXT_RTHDR2 0x80 + +/* + * Template for instructions. + * + * ipfw_insn is used for all instructions which require no operands, + * a single 16-bit value (arg1), or a couple of 8-bit values. + * + * For other instructions which require different/larger arguments + * we have derived structures, ipfw_insn_*. + * + * The size of the instruction (in 32-bit words) is in the low + * 6 bits of "len". The 2 remaining bits are used to implement + * NOT and OR on individual instructions. 
Given a type, you can + * compute the length to be put in "len" using F_INSN_SIZE(t) + * + * F_NOT negates the match result of the instruction. + * + * F_OR is used to build or blocks. By default, instructions + * are evaluated as part of a logical AND. An "or" block + * { X or Y or Z } contains F_OR set in all but the last + * instruction of the block. A match will cause the code + * to skip past the last instruction of the block. + * + * NOTA BENE: in a couple of places we assume that + * sizeof(ipfw_insn) == sizeof(u_int32_t) + * this needs to be fixed. + * + */ +typedef struct _ipfw_insn { /* template for instructions */ + u_int8_t opcode; + u_int8_t len; /* number of 32-bit words */ +#define F_NOT 0x80 +#define F_OR 0x40 +#define F_LEN_MASK 0x3f +#define F_LEN(cmd) ((cmd)->len & F_LEN_MASK) + + u_int16_t arg1; +} ipfw_insn; + +/* + * The F_INSN_SIZE(type) computes the size, in 4-byte words, of + * a given type. + */ +#define F_INSN_SIZE(t) ((sizeof (t))/sizeof(u_int32_t)) + +/* + * This is used to store an array of 16-bit entries (ports etc.) + */ +typedef struct _ipfw_insn_u16 { + ipfw_insn o; + u_int16_t ports[2]; /* there may be more */ +} ipfw_insn_u16; + +/* + * This is used to store an array of 32-bit entries + * (uid, single IPv4 addresses etc.) + */ +typedef struct _ipfw_insn_u32 { + ipfw_insn o; + u_int32_t d[1]; /* one or more */ +} ipfw_insn_u32; + +/* + * This is used to store IP addr-mask pairs. + */ +typedef struct _ipfw_insn_ip { + ipfw_insn o; + struct in_addr addr; + struct in_addr mask; +} ipfw_insn_ip; + +/* + * This is used to forward to a given address (ip). + */ +typedef struct _ipfw_insn_sa { + ipfw_insn o; + struct sockaddr_in sa; +} ipfw_insn_sa; + +/* +* This is used to forward to a given address (ipv6). +*/ +typedef struct _ipfw_insn_sa6 { + ipfw_insn o; + struct sockaddr_in6 sa; +} ipfw_insn_sa6; + +/* + * This is used for MAC addr-mask pairs. 
+ */ +typedef struct _ipfw_insn_mac { + ipfw_insn o; + u_char addr[12]; /* dst[6] + src[6] */ + u_char mask[12]; /* dst[6] + src[6] */ +} ipfw_insn_mac; + +/* + * This is used for interface match rules (recv xx, xmit xx). + */ +typedef struct _ipfw_insn_if { + ipfw_insn o; + union { + struct in_addr ip; + int glob; + } p; + char name[IFNAMSIZ]; +} ipfw_insn_if; + +/* + * This is used for storing an altq queue id number. + */ +typedef struct _ipfw_insn_altq { + ipfw_insn o; + u_int32_t qid; +} ipfw_insn_altq; + +/* + * This is used for limit rules. + */ +typedef struct _ipfw_insn_limit { + ipfw_insn o; + u_int8_t _pad; + u_int8_t limit_mask; /* combination of DYN_* below */ +#define DYN_SRC_ADDR 0x1 +#define DYN_SRC_PORT 0x2 +#define DYN_DST_ADDR 0x4 +#define DYN_DST_PORT 0x8 + + u_int16_t conn_limit; +} ipfw_insn_limit; + +/* + * This is used for log instructions. + */ +typedef struct _ipfw_insn_log { + ipfw_insn o; + u_int32_t max_log; /* how many do we log -- 0 = all */ + u_int32_t log_left; /* how many left to log */ +} ipfw_insn_log; + +/* + * Data structures required by both ipfw(8) and ipfw(4) but not part of the + * management API are protected by IPFW_INTERNAL. + */ +#ifdef IPFW_INTERNAL +/* Server pool support (LSNAT). */ +struct cfg_spool { + LIST_ENTRY(cfg_spool) _next; /* chain of spool instances */ + struct in_addr addr; + u_short port; +}; +#endif + +/* Redirect modes id. */ +#define REDIR_ADDR 0x01 +#define REDIR_PORT 0x02 +#define REDIR_PROTO 0x04 + +#ifdef IPFW_INTERNAL +/* Nat redirect configuration. 
*/ +struct cfg_redir { + LIST_ENTRY(cfg_redir) _next; /* chain of redir instances */ + u_int16_t mode; /* type of redirect mode */ + struct in_addr laddr; /* local ip address */ + struct in_addr paddr; /* public ip address */ + struct in_addr raddr; /* remote ip address */ + u_short lport; /* local port */ + u_short pport; /* public port */ + u_short rport; /* remote port */ + u_short pport_cnt; /* number of public ports */ + u_short rport_cnt; /* number of remote ports */ + int proto; /* protocol: tcp/udp */ + struct alias_link **alink; + /* number of entries in spool chain */ + u_int16_t spool_cnt; + /* chain of spool instances */ + LIST_HEAD(spool_chain, cfg_spool) spool_chain; +}; +#endif + +#define NAT_BUF_LEN 1024 + +#ifdef IPFW_INTERNAL +/* Nat configuration data struct. */ +struct cfg_nat { + /* chain of nat instances */ + LIST_ENTRY(cfg_nat) _next; + int id; /* nat id */ + struct in_addr ip; /* nat ip address */ + char if_name[IF_NAMESIZE]; /* interface name */ + int mode; /* aliasing mode */ + struct libalias *lib; /* libalias instance */ + /* number of entries in the redir chain (was documented as spool chain; presumably a copy-paste slip, confirm) */ + int redir_cnt; + /* chain of redir instances */ + LIST_HEAD(redir_chain, cfg_redir) redir_chain; +}; +#endif + +#define SOF_NAT sizeof(struct cfg_nat) +#define SOF_REDIR sizeof(struct cfg_redir) +#define SOF_SPOOL sizeof(struct cfg_spool) + +/* Nat command. 
*/ +typedef struct _ipfw_insn_nat { + ipfw_insn o; + struct cfg_nat *nat; +} ipfw_insn_nat; + +/* Apply ipv6 mask on ipv6 addr. NOTE: expands to four separate statements (no do/while wrapper) and evaluates each argument four times, so brace it when used under if/else and pass side-effect-free pointers. */ +#define APPLY_MASK(addr,mask) \ + (addr)->__u6_addr.__u6_addr32[0] &= (mask)->__u6_addr.__u6_addr32[0]; \ + (addr)->__u6_addr.__u6_addr32[1] &= (mask)->__u6_addr.__u6_addr32[1]; \ + (addr)->__u6_addr.__u6_addr32[2] &= (mask)->__u6_addr.__u6_addr32[2]; \ + (addr)->__u6_addr.__u6_addr32[3] &= (mask)->__u6_addr.__u6_addr32[3]; + +/* Structure for ipv6 */ +typedef struct _ipfw_insn_ip6 { + ipfw_insn o; + struct in6_addr addr6; + struct in6_addr mask6; +} ipfw_insn_ip6; + +/* Used to support icmp6 types */ +typedef struct _ipfw_insn_icmp6 { + ipfw_insn o; + uint32_t d[7]; /* XXX This number is related to the netinet/icmp6.h + * define ICMP6_MAXTYPE + * as follows: n = ICMP6_MAXTYPE/32 + 1 + * Actually is 203 + */ +} ipfw_insn_icmp6; + +/* + * Here we have the structure representing an ipfw rule. + * + * It starts with a general area (with link fields and counters) + * followed by an array of one or more instructions, which the code + * accesses as an array of 32-bit values. + * + * Given a rule pointer r: + * + * r->cmd is the start of the first instruction. + * ACTION_PTR(r) is the start of the first action (things to do + * once a rule matched). + * + * When assembling instruction, remember the following: + * + * + if a rule has a "keep-state" (or "limit") option, then the + * first instruction (at r->cmd) MUST BE an O_PROBE_STATE + * + if a rule has a "log" option, then the first action + * (at ACTION_PTR(r)) MUST be O_LOG + * + if a rule has an "altq" option, it comes after "log" + * + if a rule has an O_TAG option, it comes after "log" and "altq" + * + * NOTE: we use a simple linked list of rules because we never need + * to delete a rule without scanning the list. We do not use + * queue(3) macros for portability and readability. 
+ */ + +struct ip_fw { +#ifdef _X64EMU + int32_t pad1; +#endif + struct ip_fw *x_next; /* linked list of rules */ +#ifdef _X64EMU + int32_t pad2; +#endif + struct ip_fw *next_rule; /* ptr to next [skipto] rule */ + /* 'next_rule' is used to pass up 'set_disable' status */ + + uint16_t act_ofs; /* offset of action in 32-bit units */ + uint16_t cmd_len; /* # of 32-bit words in cmd */ + uint16_t rulenum; /* rule number */ + uint8_t set; /* rule set (0..31) */ +#define RESVD_SET 31 /* set for default and persistent rules */ + uint8_t _pad; /* padding */ + uint32_t id; /* rule id */ + + /* These fields are present in all rules. */ + uint64_t pcnt; /* Packet counter */ + uint64_t bcnt; /* Byte counter */ + uint32_t timestamp; /* tv_sec of last match */ + + ipfw_insn cmd[1]; /* storage for commands */ +}; + +#define ACTION_PTR(rule) \ + (ipfw_insn *)( (u_int32_t *)((rule)->cmd) + ((rule)->act_ofs) ) + +#define RULESIZE(rule) (sizeof(struct ip_fw) + \ + ((struct ip_fw *)(rule))->cmd_len * 4 - 4) + +#if 1 // should be moved to in.h +/* + * This structure is used as a flow mask and a flow id for various + * parts of the code. + * addr_type is used in userland and kernel to mark the address type. + * fib is used in the kernel to record the fib in use. + * _flags is used in the kernel to store tcp flags for dynamic rules. + */ +struct ipfw_flow_id { + uint32_t dst_ip; + uint32_t src_ip; + uint16_t dst_port; + uint16_t src_port; + uint8_t fib; + uint8_t proto; + uint8_t _flags; /* protocol-specific flags */ + uint8_t addr_type; /* 4=ip4, 6=ip6, 1=ether ? */ + struct in6_addr dst_ip6; + struct in6_addr src_ip6; + uint32_t flow_id6; + uint32_t extra; /* queue/pipe or frag_id */ +}; +#endif + +#define IS_IP6_FLOW_ID(id) ((id)->addr_type == 6) + +/* + * Dynamic ipfw rule. + */ +typedef struct _ipfw_dyn_rule ipfw_dyn_rule; + +struct _ipfw_dyn_rule { + ipfw_dyn_rule *next; /* linked list of rules. 
*/ + struct ip_fw *rule; /* pointer to rule */ + /* 'rule' is used to pass up the rule number (from the parent) */ + + ipfw_dyn_rule *parent; /* pointer to parent rule */ + u_int64_t pcnt; /* packet match counter */ + u_int64_t bcnt; /* byte match counter */ + struct ipfw_flow_id id; /* (masked) flow id */ + u_int32_t expire; /* expire time */ + u_int32_t bucket; /* which bucket in hash table */ + u_int32_t state; /* state of this rule (typically a + * combination of TCP flags) + */ + u_int32_t ack_fwd; /* most recent ACKs in forward */ + u_int32_t ack_rev; /* and reverse directions (used */ + /* to generate keepalives) */ + u_int16_t dyn_type; /* rule type */ + u_int16_t count; /* refcount */ +}; + +/* + * Definitions for IP option names. + */ +#define IP_FW_IPOPT_LSRR 0x01 +#define IP_FW_IPOPT_SSRR 0x02 +#define IP_FW_IPOPT_RR 0x04 +#define IP_FW_IPOPT_TS 0x08 + +/* + * Definitions for TCP option names. + */ +#define IP_FW_TCPOPT_MSS 0x01 +#define IP_FW_TCPOPT_WINDOW 0x02 +#define IP_FW_TCPOPT_SACK 0x04 +#define IP_FW_TCPOPT_TS 0x08 +#define IP_FW_TCPOPT_CC 0x10 + +#define ICMP_REJECT_RST 0x100 /* fake ICMP code (send a TCP RST) */ +#define ICMP6_UNREACH_RST 0x100 /* fake ICMPv6 code (send a TCP RST) */ + +/* + * These are used for lookup tables. 
+ */ + +#define IPFW_TABLE_CIDR 1 /* Table for holding IPv4/IPv6 prefixes */ +#define IPFW_TABLE_INTERFACE 2 /* Table for holding interface names */ +#define IPFW_TABLE_MAXTYPE 2 /* Maximum valid number */ + +typedef struct _ipfw_table_entry { + in_addr_t addr; /* network address */ + u_int32_t value; /* value */ + u_int16_t tbl; /* table number */ + u_int8_t masklen; /* mask length */ +} ipfw_table_entry; + +typedef struct _ipfw_table_xentry { + uint16_t len; /* Total entry length */ + uint8_t type; /* entry type */ + uint8_t masklen; /* mask length */ + uint16_t tbl; /* table number */ + uint32_t value; /* value */ + union { + /* Longest field needs to be aligned by 4-byte boundary */ + struct in6_addr addr6; /* IPv6 address */ + char iface[IF_NAMESIZE]; /* interface name */ + } k; +} ipfw_table_xentry; + +typedef struct _ipfw_table { + u_int32_t size; /* size of entries in bytes */ + u_int32_t cnt; /* # of entries */ + u_int16_t tbl; /* table number */ + ipfw_table_entry ent[0]; /* entries */ +} ipfw_table; + +typedef struct _ipfw_xtable { + ip_fw3_opheader opheader; /* eXtended tables are controlled via IP_FW3 */ + uint32_t size; /* size of entries in bytes */ + uint32_t cnt; /* # of entries */ + uint16_t tbl; /* table number */ + uint8_t type; /* table type */ + ipfw_table_xentry xent[0]; /* entries */ +} ipfw_xtable; + +#endif /* _IPFW2_H */ diff --git a/sys/netinet/ip_icmp.h b/sys/netinet/ip_icmp.h new file mode 100644 index 0000000..5c7b851 --- /dev/null +++ b/sys/netinet/ip_icmp.h @@ -0,0 +1,17 @@ +/* + * additional define not present in linux + * should go in glue.h + */ +#ifndef _NETINET_IP_ICMP_H_ +#define _NETINET_IP_ICMP_H_ + +#define ICMP_MAXTYPE 40 /* defined as 18 in compat.h */ +#define ICMP_ROUTERSOLICIT 10 /* router solicitation */ +#define ICMP_TSTAMP 13 /* timestamp request */ +#define ICMP_IREQ 15 /* information request */ +#define ICMP_MASKREQ 17 /* address mask request */ +#define ICMP_UNREACH_HOST 1 /* bad host */ + +#define ICMP_UNREACH 3 
/* dest unreachable, codes: */ + +#endif /* _NETINET_IP_ICMP_H_ */ diff --git a/sys/netinet/ipfw/dn_heap.c b/sys/netinet/ipfw/dn_heap.c new file mode 100644 index 0000000..fe2e971 --- /dev/null +++ b/sys/netinet/ipfw/dn_heap.c @@ -0,0 +1,588 @@ +/*- + * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Binary heap and hash tables, used in dummynet + * + * $Id: dn_heap.c 11480 2012-07-31 08:02:00Z luigi $ + */ + +#include +#include +#ifdef _KERNEL +__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/dn_heap.c 203279 2010-01-31 12:20:29Z luigi $"); +#include +#include +#include +#include +#ifndef log +#define log(x, arg...) 
+#endif + +#else /* !_KERNEL */ + +#include +#include +#include +#include + +#include "dn_heap.h" +#define log(x, arg...) fprintf(stderr, ## arg) +#define panic(x...) fprintf(stderr, ## x), exit(1) +#define MALLOC_DEFINE(a, b, c) +static void *my_malloc(int s) { return malloc(s); } +static void my_free(void *p) { free(p); } +#define malloc(s, t, w) my_malloc(s) +#define free(p, t) my_free(p) +#endif /* !_KERNEL */ + +MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap"); + +/* + * Heap management functions. + * + * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. + * Some macros help finding parent/children so we can optimize them. + * + * heap_resize() is called to expand the heap when needed (heap_init() + * calls it too). The requested size is rounded up to 2^k - 1 entries. + * Returns 1 on error, 0 on success + */ +#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) +#define HEAP_LEFT(x) ( (x)+(x) + 1 ) +#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } +#define HEAP_INCREMENT 15 + +static int +heap_resize(struct dn_heap *h, unsigned int new_size) +{ + struct dn_heap_entry *p; + + if (h->size >= new_size ) /* have enough room */ + return 0; +#if 1 /* round up to the next (power of 2) - 1 */ + new_size |= new_size >> 1; + new_size |= new_size >> 2; + new_size |= new_size >> 4; + new_size |= new_size >> 8; + new_size |= new_size >> 16; +#else + new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT; +#endif + p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT); + if (p == NULL) { + printf("--- %s, resize %d failed\n", __func__, new_size ); + return 1; /* error */ + } + if (h->size > 0) { + bcopy(h->p, p, h->size * sizeof(*p) ); + free(h->p, M_DN_HEAP); + } + h->p = p; + h->size = new_size; + return 0; +} + +int +heap_init(struct dn_heap *h, int size, int ofs) +{ + if (heap_resize(h, size)) + return 1; + h->elements = 0; + h->ofs = ofs; + return 0; +} + +/* + * Insert element in heap. Normally, p != NULL, we insert p in + * a new position and bubble up. 
If p == NULL, then the element is + * already in place, and key is the position where to start the + * bubble-up. + * Returns 1 on failure (cannot allocate new heap entry) + * + * If ofs > 0 the position (index, int) of the element in the heap is + * also stored in the element itself at the given offset in bytes. + */ +#define SET_OFFSET(h, i) do { \ + if (h->ofs > 0) \ + *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = i; \ + } while (0) +/* + * RESET_OFFSET is used for sanity checks. It sets ofs + * to an invalid value. + */ +#define RESET_OFFSET(h, i) do { \ + if (h->ofs > 0) \ + *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16; \ + } while (0) + +int +heap_insert(struct dn_heap *h, uint64_t key1, void *p) +{ + int son = h->elements; + + //log("%s key %llu p %p\n", __FUNCTION__, key1, p); + if (p == NULL) { /* data already there, set starting point */ + son = key1; + } else { /* insert new element at the end, possibly resize */ + son = h->elements; + if (son == h->size) /* need resize... */ + // XXX expand by 16 or so + if (heap_resize(h, h->elements+16) ) + return 1; /* failure... 
*/ + h->p[son].object = p; + h->p[son].key = key1; + h->elements++; + } + /* make sure that son >= father along the path */ + while (son > 0) { + int father = HEAP_FATHER(son); + struct dn_heap_entry tmp; + + if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) + break; /* found right position */ + /* son smaller than father, swap and repeat */ + HEAP_SWAP(h->p[son], h->p[father], tmp); + SET_OFFSET(h, son); + son = father; + } + SET_OFFSET(h, son); + return 0; +} + +/* + * remove top element from heap, or obj if obj != NULL + */ +void +heap_extract(struct dn_heap *h, void *obj) +{ + int child, father, max = h->elements - 1; + + if (max < 0) { + printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h); + return; + } + if (obj == NULL) + father = 0; /* default: move up smallest child */ + else { /* extract specific element, index is at offset */ + if (h->ofs <= 0) + panic("%s: extract from middle not set on %p\n", + __FUNCTION__, h); + father = *((int *)((char *)obj + h->ofs)); + if (father < 0 || father >= h->elements) { + panic("%s: father %d out of bound 0..%d\n", + __FUNCTION__, father, h->elements); + } + } + /* + * below, father is the index of the empty element, which + * we replace at each step with the smallest child until we + * reach the bottom level. + */ + // XXX why removing RESET_OFFSET increases runtime by 10% ? + RESET_OFFSET(h, father); + while ( (child = HEAP_LEFT(father)) <= max ) { + if (child != max && + DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) + child++; /* take right child, otherwise left */ + h->p[father] = h->p[child]; + SET_OFFSET(h, father); + father = child; + } + h->elements--; + if (father != max) { + /* + * Fill hole with last entry and bubble up, + * reusing the insert code + */ + h->p[father] = h->p[max]; + heap_insert(h, father, NULL); + } +} + +#if 0 +/* + * change object position and update references + * XXX this one is never used! 
+ */ +static void +heap_move(struct dn_heap *h, uint64_t new_key, void *object) +{ + int temp, i, max = h->elements-1; + struct dn_heap_entry *p, buf; + + if (h->ofs <= 0) + panic("cannot move items on this heap"); + p = h->p; /* shortcut */ + + i = *((int *)((char *)object + h->ofs)); + if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */ + p[i].key = new_key; + for (; i>0 && + DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key); + i = temp ) { /* bubble up */ + HEAP_SWAP(p[i], p[temp], buf); + SET_OFFSET(h, i); + } + } else { /* must move down */ + p[i].key = new_key; + while ( (temp = HEAP_LEFT(i)) <= max ) { + /* found left child */ + if (temp != max && + DN_KEY_LT(p[temp+1].key, p[temp].key)) + temp++; /* select child with min key */ + if (DN_KEY_LT(>p[temp].key, new_key)) { + /* go down */ + HEAP_SWAP(p[i], p[temp], buf); + SET_OFFSET(h, i); + } else + break; + i = temp; + } + } + SET_OFFSET(h, i); +} +#endif /* heap_move, unused */ + +/* + * heapify() will reorganize data inside an array to maintain the + * heap property. It is needed when we delete a bunch of entries. + */ +static void +heapify(struct dn_heap *h) +{ + int i; + + for (i = 0; i < h->elements; i++ ) + heap_insert(h, i , NULL); +} + +int +heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t), + uintptr_t arg) +{ + int i, ret, found; + + for (i = found = 0 ; i < h->elements ;) { + ret = fn(h->p[i].object, arg); + if (ret & HEAP_SCAN_DEL) { + h->elements-- ; + h->p[i] = h->p[h->elements] ; + found++ ; + } else + i++ ; + if (ret & HEAP_SCAN_END) + break; + } + if (found) + heapify(h); + return found; +} + +/* + * cleanup the heap and free data structure + */ +void +heap_free(struct dn_heap *h) +{ + if (h->size >0 ) + free(h->p, M_DN_HEAP); + bzero(h, sizeof(*h) ); +} + +/* + * hash table support. 
+ */ + +struct dn_ht { + int buckets; /* how many buckets, really buckets - 1*/ + int entries; /* how many entries */ + int ofs; /* offset of link field */ + uint32_t (*hash)(uintptr_t, int, void *arg); + int (*match)(void *_el, uintptr_t key, int, void *); + void *(*newh)(uintptr_t, int, void *); + void **ht; /* bucket heads */ +}; +/* + * Initialize, allocating bucket pointers inline. + * Recycle previous record if possible. + * If the 'newh' function is not supplied, we assume that the + * key passed to ht_find is the same object to be stored in. + */ +struct dn_ht * +dn_ht_init(struct dn_ht *ht, int buckets, int ofs, + uint32_t (*h)(uintptr_t, int, void *), + int (*match)(void *, uintptr_t, int, void *), + void *(*newh)(uintptr_t, int, void *)) +{ + int l; + + /* + * Notes about rounding bucket size to a power of two. + * Given the original bucket size, we compute the nearest lower and + * higher power of two, minus 1 (respectively b_min and b_max) because + * this value will be used to do an AND with the index returned + * by hash function. + * To choose between these two values, the original bucket size is + * compared with b_min. If the original size is greater than 4/3 b_min, + * we round the bucket size to b_max, else to b_min. + * This ratio tries to round to the nearest power of two, advantaging + * the greater size if the difference between the two powers is relatively + * big. + * Rounding the bucket size to a power of two avoids the use of + * modulo when calculating the correct bucket. + * The ht->buckets variable stores the bucket size - 1 to simply + * do an AND between the index returned by hash function and ht->buckets + * instead of a modulo. 
+ */ + int b_min; /* min buckets */ + int b_max; /* max buckets */ + int b_ori; /* original buckets */ + + if (h == NULL || match == NULL) { + printf("--- missing hash or match function"); + return NULL; + } + if (buckets < 1 || buckets > 65536) + return NULL; + + b_ori = buckets; + /* calculate next power of 2, - 1*/ + buckets |= buckets >> 1; + buckets |= buckets >> 2; + buckets |= buckets >> 4; + buckets |= buckets >> 8; + buckets |= buckets >> 16; + + b_max = buckets; /* Next power */ + b_min = buckets >> 1; /* Previous power */ + + /* Calculate the 'nearest' bucket size */ + if (b_min * 4000 / 3000 < b_ori) + buckets = b_max; + else + buckets = b_min; + + if (ht) { /* see if we can reuse */ + if (buckets <= ht->buckets) { + ht->buckets = buckets; + } else { + /* free pointers if not allocated inline */ + if (ht->ht != (void *)(ht + 1)) + free(ht->ht, M_DN_HEAP); + free(ht, M_DN_HEAP); + ht = NULL; + } + } + if (ht == NULL) { + /* Allocate buckets + 1 entries because buckets is used to + * do the AND with the index returned by hash function + */ + l = sizeof(*ht) + (buckets + 1) * sizeof(void **); + ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO); + } + if (ht) { + ht->ht = (void **)(ht + 1); + ht->buckets = buckets; + ht->ofs = ofs; + ht->hash = h; + ht->match = match; + ht->newh = newh; + } + return ht; +} + +/* dummy callback for dn_ht_free to unlink all */ +static int +do_del(void *obj, void *arg) +{ + return DNHT_SCAN_DEL; +} + +void +dn_ht_free(struct dn_ht *ht, int flags) +{ + if (ht == NULL) + return; + if (flags & DNHT_REMOVE) { + (void)dn_ht_scan(ht, do_del, NULL); + } else { + if (ht->ht && ht->ht != (void *)(ht + 1)) + free(ht->ht, M_DN_HEAP); + free(ht, M_DN_HEAP); + } +} + +int +dn_ht_entries(struct dn_ht *ht) +{ + return ht ? ht->entries : 0; +} + +/* + * Helper function to scan a bucket in the hash table, it + * can only be called on a non-empty bucket for a valid table. 
+ * + * In lookup and scan, consider ht->ht[i] as pointing to the tail + * of the queue (head is NEXTP(tail). The 'empty' value is irrelevant. + * While searching, start analysing p = head, end when p == tail. + * Note that 'tail' is a cache of the _original_ ht->ht[i] + * and is used to check for loop termination. If you remove + * it, you must also adjust 'p' when deleting the 'tail' element. + */ +#define NEXT(_h, _p) *((void **)((char *)(_p) + (_h)->ofs)) +static int +dn_ht_scan_body(struct dn_ht *ht, int *bucket, + int (*fn)(void *, void *), void *arg) +{ + int ret, found = 0, i = *bucket; + void *tail, *pp, *p, *nextp; + + pp = tail = ht->ht[i]; + do { + p = NEXT(ht, pp); + nextp = NEXT(ht, p); + ret = fn(p, arg); + if ((ret & DNHT_SCAN_DEL) == 0) { + pp = p; /* prepare for next loop */ + } else { + found++; + ht->entries--; + /* skip current element */ + if (pp != p) + /* pp == p implies p == tail */ + NEXT(ht, pp) = nextp; + if (p == tail) + ht->ht[i] = (pp != p) ? pp : NULL; + } + if (ret & DNHT_SCAN_END) { + /* Update ht->ht[i] before returning */ + ht->ht[i] = (ht->ht[i] == NULL) ? NULL : pp; + return found; + } + } while (p != tail); + + (*bucket)++; + return found; +} + +/* + * lookup and optionally create or delete element. + * This is an optimized version of the scan so it is coded + * inline. + */ +void * +dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg) +{ + int i, found; + void *tail, *pp, *p; /* pp is the prev element, p is current */ + + if (ht == NULL) /* easy on an empty hash */ + return NULL; + i = (ht->buckets == 1) ? 0 : + (ht->hash(key, flags, arg) & ht->buckets); + + pp = tail = ht->ht[i]; + if (tail) { /* non empty, try a lookup */ + do { + p = NEXT(ht, pp); + found = (flags & DNHT_MATCH_PTR) ? 
key == (uintptr_t)p : + ht->match(p, key, flags, arg); + if (!found) + continue; + if (flags & DNHT_REMOVE) { + ht->entries--; + if (p != pp) /* skip current element */ + NEXT(ht, pp) = NEXT(ht, p); + if (p == tail) + ht->ht[i] = (pp != p) ? pp : NULL; + } + return p; + } while ( (pp = p) != tail); + } + /* not found */ + if ((flags & DNHT_INSERT) == 0) + return NULL; + p = ht->newh ? ht->newh(key, flags, arg) : (void *)key; + if (p) { + ht->entries++; + if (tail == NULL) { + ht->ht[i] = NEXT(ht, p) = p; + } else { + NEXT(ht, p) = NEXT(ht, tail); + NEXT(ht, tail) = p; + } + } + + return p; +} + +/* + * do a scan with the option to delete the object. + * Similar to the lookup, but the match function is different, + * and we extract 'next' before running the callback because + * the element may be destroyed there. + */ +int +dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg) +{ + int i, bucket, found = 0; + + if (ht == NULL || fn == NULL) + return 0; + for (i = 0; i <= ht->buckets; i++) { + if (ht->ht[i] == NULL) + continue; /* empty bucket */ + bucket = i; + found += dn_ht_scan_body(ht, &bucket, fn, arg); + if (bucket == i) /* early exit */ + return found; + } + return found; +} + +/* + * Similar to dn_ht_scan(), except that the scan is performed only + * in the bucket 'bucket'. The function returns a correct bucket number if + * the original is invalid. + * If the callback returns DNHT_SCAN_END, the function move the ht->ht[i] + * pointer to the last entry processed. Moreover, the bucket number passed + * by caller is decremented, because usually the caller increment it. 
+ */ +int +dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *), + void *arg) +{ + if (ht == NULL || fn == NULL) + return 0; + if (*bucket > ht->buckets || *bucket < 0) + *bucket = 0; + if (ht->ht[*bucket] == NULL) { + (*bucket)++; + return 0; + } else + return dn_ht_scan_body(ht, bucket, fn, arg); +} diff --git a/sys/netinet/ipfw/dn_heap.h b/sys/netinet/ipfw/dn_heap.h new file mode 100644 index 0000000..09b2ac7 --- /dev/null +++ b/sys/netinet/ipfw/dn_heap.h @@ -0,0 +1,191 @@ +/*- + * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * Binary heap and hash tables, header file + * + * $FreeBSD: head/sys/netinet/ipfw/dn_heap.h 204865 2010-03-08 11:27:08Z luigi $ + */ + +#ifndef _IP_DN_HEAP_H +#define _IP_DN_HEAP_H + +#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0) +#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0) + +/* + * This module implements a binary heap supporting random extraction. + * + * A heap entry contains an uint64_t key and a pointer to object. + * DN_KEY_LT(a,b) returns true if key 'a' is smaller than 'b' + * + * The heap is a struct dn_heap plus a dynamically allocated + * array of dn_heap_entry entries. 'size' represents the size of + * the array, 'elements' count entries in use. The topmost + * element has the smallest key. + * The heap supports ordered insert, and extract from the top. + * To extract an object from the middle of the heap, we the object + * must reserve an 'int32_t' to store the position of the object + * in the heap itself, and the location of this field must be + * passed as an argument to heap_init() -- use -1 if the feature + * is not used. + */ +struct dn_heap_entry { + uint64_t key; /* sorting key, smallest comes first */ + void *object; /* object pointer */ +}; + +struct dn_heap { + int size; /* the size of the array */ + int elements; /* elements in use */ + int ofs; /* offset in the object of heap index */ + struct dn_heap_entry *p; /* array of "size" entries */ +}; + +enum { + HEAP_SCAN_DEL = 1, + HEAP_SCAN_END = 2, +}; + +/* + * heap_init() reinitializes the heap setting the size and the offset + * of the index for random extraction (use -1 if not used). + * The 'elements' counter is set to 0. + * + * SET_HEAP_OFS() indicates where, in the object, is stored the index + * for random extractions from the heap. + * + * heap_free() frees the memory associated to a heap. 
+ * + * heap_insert() adds a key-pointer pair to the heap + * + * HEAP_TOP() returns a pointer to the top element of the heap, + * but makes no checks on its existence (XXX should we change ?) + * + * heap_extract() removes the entry at the top, returning the pointer. + * (the key should have been read before). + * + * heap_scan() invokes a callback on each entry of the heap. + * The callback can return a combination of HEAP_SCAN_DEL and + * HEAP_SCAN_END. HEAP_SCAN_DEL means the current element must + * be removed, and HEAP_SCAN_END means to terminate the scan. + * heap_scan() returns the number of elements removed. + * Because the order is not guaranteed, we should use heap_scan() + * only as a last resort mechanism. + */ +#define HEAP_TOP(h) ((h)->p) +#define SET_HEAP_OFS(h, n) do { (h)->ofs = n; } while (0) +int heap_init(struct dn_heap *h, int size, int ofs); +int heap_insert(struct dn_heap *h, uint64_t key1, void *p); +void heap_extract(struct dn_heap *h, void *obj); +void heap_free(struct dn_heap *h); +int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t); + +/*------------------------------------------------------ + * This module implements a generic hash table with support for + * running callbacks on the entire table. To avoid allocating + * memory during hash table operations, objects must reserve + * space for a link field. XXX if the heap is moderately full, + * an SLIST suffices, and we can tolerate the cost of a hash + * computation on each removal. + * + * dn_ht_init() initializes the table, setting the number of + * buckets, the offset of the link field, the main callbacks. + * Callbacks are: + * + * hash(key, flags, arg) called to return a bucket index. + * match(obj, key, flags, arg) called to determine if key + * matches the current 'obj' in the heap + * newh(key, flags, arg) optional, used to allocate a new + * object during insertions. + * + * dn_ht_free() frees the heap or unlink elements.
+ * DNHT_REMOVE unlinks elements, 0 frees the heap. + * You need two calls to do both. + * + * dn_ht_find() is the main lookup function, which can also be + * used to insert or delete elements in the hash table. + * The final 'arg' is passed to all callbacks. + * + * dn_ht_scan() is used to invoke a callback on all entries of + * the heap, or possibly on just one bucket. The callback + * is invoked with a pointer to the object, and must return + * one of DNHT_SCAN_DEL or DNHT_SCAN_END to request the + * removal of the object from the heap and the end of the + * scan, respectively. + * + * dn_ht_scan_bucket() is similar to dn_ht_scan(), except that it scans + * only the specific bucket of the table. The bucket is an in-out + * parameter and is set to a valid bucket number if the original + * is invalid. + * + * A combination of flags can be used to modify the operation + * of the dn_ht_find(), and of the callbacks: + * + * DNHT_KEY_IS_OBJ means the key is the object pointer. + * It is usually of interest for the hash and match functions. + * + * DNHT_MATCH_PTR during a lookup, match pointers instead + * of calling match(). Normally used when removing specific + * entries. Does not imply KEY_IS_OBJ as the latter _is_ used + * by the match function. + * + * DNHT_INSERT insert the element if not found. + * Calls newh() to allocate a new object unless + * DNHT_KEY_IS_OBJ is set. + * + * DNHT_UNIQUE only insert if object not found. + * XXX should it imply DNHT_INSERT ? + * + * DNHT_REMOVE remove objects if we find them.
+ */ +struct dn_ht; /* should be opaque */ + +struct dn_ht *dn_ht_init(struct dn_ht *, int buckets, int ofs, + uint32_t (*hash)(uintptr_t, int, void *), + int (*match)(void *, uintptr_t, int, void *), + void *(*newh)(uintptr_t, int, void *)); +void dn_ht_free(struct dn_ht *, int flags); + +void *dn_ht_find(struct dn_ht *, uintptr_t, int, void *); +int dn_ht_scan(struct dn_ht *, int (*)(void *, void *), void *); +int dn_ht_scan_bucket(struct dn_ht *, int * , int (*)(void *, void *), void *); +int dn_ht_entries(struct dn_ht *); + +enum { /* flags values. + * first two are returned by the scan callback to indicate + * to delete the matching element or to end the scan + */ + DNHT_SCAN_DEL = 0x0001, + DNHT_SCAN_END = 0x0002, + DNHT_KEY_IS_OBJ = 0x0004, /* key is the obj pointer */ + DNHT_MATCH_PTR = 0x0008, /* match by pointer, not match() */ + DNHT_INSERT = 0x0010, /* insert if not found */ + DNHT_UNIQUE = 0x0020, /* report error if already there */ + DNHT_REMOVE = 0x0040, /* remove on find or dn_ht_free */ +}; + +#endif /* _IP_DN_HEAP_H */ diff --git a/sys/netinet/ipfw/dn_sched.h b/sys/netinet/ipfw/dn_sched.h new file mode 100644 index 0000000..a755e86 --- /dev/null +++ b/sys/netinet/ipfw/dn_sched.h @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * The API to write a packet scheduling algorithm for dummynet. + * + * $FreeBSD: head/sys/netinet/ipfw/dn_sched.h 204591 2010-03-02 17:40:48Z luigi $ + */ + +#ifndef _DN_SCHED_H +#define _DN_SCHED_H + +#define DN_MULTIQUEUE 0x01 +/* + * Descriptor for a scheduling algorithm. + * Contains all function pointers for a given scheduler + * This is typically created when a module is loaded, and stored + * in a global list of schedulers. + */ +struct dn_alg { + uint32_t type; /* the scheduler type */ + const char *name; /* scheduler name */ + uint32_t flags; /* DN_MULTIQUEUE if supports multiple queues */ + + /* + * The following define the size of 3 optional data structures + * that may need to be allocated at runtime, and are appended + * to each of the base data structures: scheduler, sched.inst, + * and queue. We don't have a per-flowset structure. + */ + /* + parameters attached to the template, e.g. + * default queue sizes, weights, quantum size, and so on; + */ + size_t schk_datalen; + + /* + per-instance parameters, such as timestamps, + * containers for queues, etc; + */ + size_t si_datalen; + + size_t q_datalen; /* per-queue parameters (e.g. 
S,F) */ + + /* + * Methods implemented by the scheduler: + * enqueue enqueue packet 'm' on scheduler 's', queue 'q'. + * q is NULL for !MULTIQUEUE. + * Return 0 on success, 1 on drop (packet consumed anyways). + * Note that q should be interpreted only as a hint + * on the flow that the mbuf belongs to: while a + * scheduler will normally enqueue m into q, it is ok + * to leave q alone and put the mbuf elsewhere. + * This function is called in two cases: + * - when a new packet arrives to the scheduler; + * - when a scheduler is reconfigured. In this case the + * call is issued by the new_queue callback, with a + * non empty queue (q) and m pointing to the first + * mbuf in the queue. For this reason, the function + * should internally check for (m != q->mq.head) + * before calling dn_enqueue(). + * + * dequeue Called when scheduler instance 's' can + * dequeue a packet. Return NULL if none are available. + * XXX what about non work-conserving ? + * + * config called on 'sched X config ...', normally writes + * in the area of size sch_arg + * + * destroy called on 'sched delete', frees everything + * in sch_arg (other parts are handled by more specific + * functions) + * + * new_sched called when a new instance is created, e.g. + * to create the local queue for !MULTIQUEUE, set V or + * copy parameters for WFQ, and so on. + * + * free_sched called when deleting an instance, cleans + * extra data in the per-instance area. + * + * new_fsk called when a flowset is linked to a scheduler, + * e.g. to validate parameters such as weights etc. + * free_fsk when a flowset is unlinked from a scheduler. + * (probably unnecessary) + * + * new_queue called to set the per-queue parameters, + * e.g. S and F, adjust sum of weights in the parent, etc. + * + * The new_queue callback is normally called from when + * creating a new queue. In some cases (such as a + * scheduler change or reconfiguration) it can be called + * with a non empty queue. 
In this case, + * i.e. with a non empty queue, the new_queue callback could + * need to call the enqueue function. If so, + * the callback should eventually call enqueue() passing + * as m the first element in the queue. + * + * free_queue actions related to a queue removal, e.g. undo + * all the above. If the queue has data in it, also remove + * from the scheduler. This can e.g. happen during a reconfigure. + * If safe == 1 remove the queue only if the scheduler no longer + * needs it, otherwise delete it even if the scheduler is using + * it. Usually, the flag safe is set when the drain routine is + * running to delete idle queues. + */ + int (*enqueue)(struct dn_sch_inst *, struct dn_queue *, + struct mbuf *); + struct mbuf * (*dequeue)(struct dn_sch_inst *); + + int (*config)(struct dn_schk *); + int (*destroy)(struct dn_schk*); + int (*new_sched)(struct dn_sch_inst *); + int (*free_sched)(struct dn_sch_inst *); + int (*new_fsk)(struct dn_fsk *f); + int (*free_fsk)(struct dn_fsk *f); + int (*new_queue)(struct dn_queue *q); + int (*free_queue)(struct dn_queue *q, int safe); + + /* run-time fields */ + int ref_count; /* XXX number of instances in the system */ + SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */ +}; + +/* MSVC does not support designated initializers so we need this ugly macro */ +#ifdef _WIN32 +#define _SI(fld) +#else +#define _SI(fld) fld +#endif + +/* + * Additionally, dummynet exports some functions and macros + * to be used by schedulers: + */ + +void dn_free_pkts(struct mbuf *mnext); +int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop); +/* bound a variable between min and max */ +int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg); + +/* + * Extract the head of a queue, update stats. Must be the very last + * thing done on a dequeue as the queue itself may go away.
+ */ +static __inline struct mbuf* +dn_dequeue(struct dn_queue *q) +{ + struct mbuf *m = q->mq.head; + if (m == NULL) + return NULL; + q->mq.head = m->m_nextpkt; + + /* Update stats for the queue */ + q->ni.length--; + q->ni.len_bytes -= m->m_pkthdr.len; + /* When the queue becomes idle, update idle_time (used by RED) + * and also update the count of idle queues (for garbage collection). + */ + if (q->ni.length == 0) { + dn_cfg.idle_queue++; + q->q_time = dn_cfg.curr_time; + } + if (q->_si) { + struct dn_flow *ni = &(q->_si->ni); + /* update stats for the scheduler instance, and keep track + * of idle scheduler instances if needed + */ + ni->length--; + ni->len_bytes -= m->m_pkthdr.len; + if (ni->length == 0) + dn_cfg.idle_si++; + } + return m; +} + +int dn_sched_modevent(module_t mod, int cmd, void *arg); + +#define DECLARE_DNSCHED_MODULE(name, dnsched) \ + static moduledata_t name##_mod = { \ + #name, dn_sched_modevent, dnsched \ + }; \ + DECLARE_MODULE(name, name##_mod, \ + SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \ + MODULE_DEPEND(name, dummynet, 3, 3, 3); +#endif /* _DN_SCHED_H */ diff --git a/sys/netinet/ipfw/dn_sched_fifo.c b/sys/netinet/ipfw/dn_sched_fifo.c new file mode 100644 index 0000000..d8733c9 --- /dev/null +++ b/sys/netinet/ipfw/dn_sched_fifo.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: dn_sched_fifo.c 11480 2012-07-31 08:02:00Z luigi $ + */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include +#include +#include +#include +#else +#include +#endif + +/* + * This file implements a FIFO scheduler for a single queue. + * The queue is allocated as part of the scheduler instance, + * and there is a single flowset in the template which stores + * queue size and policy. + * Enqueue and dequeue use the default library functions. + */ +static int +fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m) +{ + /* XXX if called with q != NULL and m=NULL, this is a + * re-enqueue from an existing scheduler, which we should + * handle.
+ */ + return dn_enqueue((struct dn_queue *)(si+1), m, 0); +} + +static struct mbuf * +fifo_dequeue(struct dn_sch_inst *si) +{ + return dn_dequeue((struct dn_queue *)(si + 1)); +} + +static int +fifo_new_sched(struct dn_sch_inst *si) +{ + /* This scheduler instance contains the queue */ + struct dn_queue *q = (struct dn_queue *)(si + 1); + + set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q)); + q->_si = si; + q->fs = si->sched->fs; + return 0; +} + +static int +fifo_free_sched(struct dn_sch_inst *si) +{ + struct dn_queue *q = (struct dn_queue *)(si + 1); + dn_free_pkts(q->mq.head); + bzero(q, sizeof(*q)); + return 0; +} + +/* + * FIFO scheduler descriptor + * contains the type of the scheduler, the name, the size of extra + * data structures, and function pointers. + */ +static struct dn_alg fifo_desc = { + _SI( .type = ) DN_SCHED_FIFO, + _SI( .name = ) "FIFO", + _SI( .flags = ) 0, + + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct dn_queue), + _SI( .q_datalen = ) 0, + + _SI( .enqueue = ) fifo_enqueue, + _SI( .dequeue = ) fifo_dequeue, + _SI( .config = ) NULL, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) fifo_new_sched, + _SI( .free_sched = ) fifo_free_sched, + _SI( .new_fsk = ) NULL, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) NULL, + _SI( .free_queue = ) NULL, +}; + +DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc); diff --git a/sys/netinet/ipfw/dn_sched_prio.c b/sys/netinet/ipfw/dn_sched_prio.c new file mode 100644 index 0000000..7bc67ea --- /dev/null +++ b/sys/netinet/ipfw/dn_sched_prio.c @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: dn_sched_prio.c 11480 2012-07-31 08:02:00Z luigi $ + */ +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include +#include +#include +#include +#else +#include +#endif + +#define DN_SCHED_PRIO 5 //XXX + +#if !defined(_KERNEL) || !defined(__linux__) +#define test_bit(ix, pData) ((*pData) & (1UL<<(ix))) +#define __set_bit(ix, pData) (*pData) |= (1UL<<(ix)) +#define __clear_bit(ix, pData) (*pData) &= ~(1UL<<(ix)) +#endif + +#ifdef __MIPSEL__ +#define __clear_bit(ix, pData) (*pData) &= ~(1UL<<(ix)) +#endif + +/* Number of priorities, i.e. size of the array of queue pointers. Capped at 32 because prio_dequeue() scans the bitmap with ffs(), which takes an int, so only the low 32 bits of the bitmap are usable even where BITMAP_T is 64 bits wide; the bit macros above shift 1UL so that bit 31 is set without overflowing an int. */ +#define BITMAP_T unsigned long +#define MAXPRIO 32 + +/* + * The scheduler instance contains an array of pointers to queues, + * one for each priority, and a bitmap listing backlogged queues.
+ */ +struct prio_si { + BITMAP_T bitmap; /* array bitmap */ + struct dn_queue *q_array[MAXPRIO]; /* Array of queues pointers */ +}; + +/* + * If a queue with the same priority is already backlogged, use + * that one instead of the queue passed as argument. + */ +static int +prio_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) +{ + struct prio_si *si = (struct prio_si *)(_si + 1); + int prio = q->fs->fs.par[0]; + + if (test_bit(prio, &si->bitmap) == 0) { + /* No queue with this priority, insert */ + __set_bit(prio, &si->bitmap); + si->q_array[prio] = q; + } else { /* use the existing queue */ + q = si->q_array[prio]; + } + if (dn_enqueue(q, m, 0)) + return 1; + return 0; +} + +/* + * Packets are dequeued only from the highest priority queue. + * The function ffs() returns the position of the lowest bit set in the bitmap; + * that position, minus 1, is the array index which contains the pointer to the highest priority + * queue. + * After the dequeue, if this queue becomes empty, its index is removed + * from the bitmap.
+ * Scheduler is idle if the bitmap is empty + * + * NOTE: highest priority is 0, lowest is MAXPRIO - 1 + */ +static struct mbuf * +prio_dequeue(struct dn_sch_inst *_si) +{ + struct prio_si *si = (struct prio_si *)(_si + 1); + struct mbuf *m; + struct dn_queue *q; + int prio; + + if (si->bitmap == 0) /* scheduler idle */ + return NULL; + + prio = ffs(si->bitmap) - 1; + + /* Take the highest priority queue in the scheduler */ + q = si->q_array[prio]; + // assert(q) + + m = dn_dequeue(q); + if (q->mq.head == NULL) { + /* Queue is now empty, remove from scheduler + * and mark it + */ + si->q_array[prio] = NULL; + __clear_bit(prio, &si->bitmap); + } + return m; +} + +static int +prio_new_sched(struct dn_sch_inst *_si) +{ + struct prio_si *si = (struct prio_si *)(_si + 1); + + bzero(si->q_array, sizeof(si->q_array)); + si->bitmap = 0; + + return 0; +} + +static int +prio_new_fsk(struct dn_fsk *fs) +{ + /* Check if the priority is between 0 and MAXPRIO-1 */ + ipdn_bound_var(&fs->fs.par[0], 0, 0, MAXPRIO - 1, "PRIO priority"); + return 0; +} + +static int +prio_new_queue(struct dn_queue *q) +{ + struct prio_si *si = (struct prio_si *)(q->_si + 1); + int prio = q->fs->fs.par[0]; + struct dn_queue *oldq; + + q->ni.oid.subtype = DN_SCHED_PRIO; + + if (q->mq.head == NULL) + return 0; + + /* Queue is not empty, must insert in the scheduler or append + * mbufs to existing queue. This partly duplicates prio_enqueue + */ + if (test_bit(prio, &si->bitmap) == 0) { + /* No queue with this priority, insert */ + __set_bit(prio, &si->bitmap); + si->q_array[prio] = q; + } else if ( (oldq = si->q_array[prio]) != q) { + /* must append to the existing queue. + * can simply append q->mq.head to q2->...
+ * and add the counters to those of q2 + */ + oldq->mq.tail->m_nextpkt = q->mq.head; + oldq->mq.tail = q->mq.tail; + oldq->ni.length += q->ni.length; + q->ni.length = 0; + oldq->ni.len_bytes += q->ni.len_bytes; + q->ni.len_bytes = 0; + q->mq.tail = q->mq.head = NULL; + } + return 0; +} + +static int +prio_free_queue(struct dn_queue *q, int safe) +{ + int prio = q->fs->fs.par[0]; + struct prio_si *si = (struct prio_si *)(q->_si + 1); + + if (si->q_array[prio] == q) { + si->q_array[prio] = NULL; + __clear_bit(prio, &si->bitmap); + } + return 0; +} + + +static struct dn_alg prio_desc = { + _SI( .type = ) DN_SCHED_PRIO, + _SI( .name = ) "PRIO", + _SI( .flags = ) DN_MULTIQUEUE, + + /* we need extra space in the si and the queue */ + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct prio_si), + _SI( .q_datalen = ) 0, + + _SI( .enqueue = ) prio_enqueue, + _SI( .dequeue = ) prio_dequeue, + + _SI( .config = ) NULL, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) prio_new_sched, + _SI( .free_sched = ) NULL, + + _SI( .new_fsk = ) prio_new_fsk, + _SI( .free_fsk = ) NULL, + + _SI( .new_queue = ) prio_new_queue, + _SI( .free_queue = ) prio_free_queue, +}; + + +DECLARE_DNSCHED_MODULE(dn_prio, &prio_desc); diff --git a/sys/netinet/ipfw/dn_sched_qfq.c b/sys/netinet/ipfw/dn_sched_qfq.c new file mode 100644 index 0000000..eaf0478 --- /dev/null +++ b/sys/netinet/ipfw/dn_sched_qfq.c @@ -0,0 +1,864 @@ +/* + * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: dn_sched_qfq.c 11656 2012-08-07 08:39:06Z luigi $ + */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include +#include +#include +#include +#else +#include +#endif + +#ifdef QFQ_DEBUG +struct qfq_sched; +static void dump_sched(struct qfq_sched *q, const char *msg); +#define NO(x) x +#else +#define NO(x) +#endif +#define DN_SCHED_QFQ 4 // XXX Where? +typedef unsigned long bitmap; + +/* + * bitmaps ops are critical. Some linux versions have __fls + * and the bitmap ops. 
Some machines have ffs + */ +#if defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24)) +int fls(unsigned int n) +{ + int i = 0; + for (i = 0; n > 0; n >>= 1, i++) + ; + return i; +} +#endif + +#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24)) +static inline unsigned long __fls(unsigned long word) +{ + return fls(word) - 1; +} +#endif + +#if !defined(_KERNEL) || !defined(__linux__) +#ifdef QFQ_DEBUG +int test_bit(int ix, bitmap *p) +{ + if (ix < 0 || ix > 31) + D("bad index %d", ix); + return *p & (1< 31) + D("bad index %d", ix); + *p |= (1< 31) + D("bad index %d", ix); + *p &= ~(1<index = 0 + *.__grp->slot_shift + + where MIN_SLOT_SHIFT is derived by difference from the others. + +The max group index corresponds to Lmax/w_min, where +Lmax=1<group mapping. Class weights are + * in the range [1, QFQ_MAX_WEIGHT], we to map each class i to the + * group with the smallest index that can support the L_i / r_i + * configured for the class. + * + * grp->index is the index of the group; and grp->slot_shift + * is the shift for the corresponding (scaled) sigma_i. + * + * When computing the group index, we do (len<i_wsum) +#define IWSUM ((1< 0; +} + +/* Round a precise timestamp to its slotted value. */ +static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift) +{ + return ts & ~((1ULL << shift) - 1); +} + +/* return the pointer to the group with lowest index in the bitmap */ +static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, + unsigned long bitmap) +{ + int index = ffs(bitmap) - 1; // zero-based + return &q->groups[index]; +} + +/* + * Calculate a flow index, given its weight and maximum packet length. + * index = log_2(maxlen/weight) but we need to apply the scaling. + * This is used only once at flow creation. 
+ */ +static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen) +{ + uint64_t slot_size = (uint64_t)maxlen *inv_w; + unsigned long size_map; + int index = 0; + + size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT); + if (!size_map) + goto out; + + index = __fls(size_map) + 1; // basically a log_2() + index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); + + if (index < 0) + index = 0; + +out: + ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index); + return index; +} +/*---- end support functions ----*/ + +/*-------- API calls --------------------------------*/ +/* + * Validate and copy parameters from flowset. + */ +static int +qfq_new_queue(struct dn_queue *_q) +{ + struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); + struct qfq_class *cl = (struct qfq_class *)_q; + int i; + uint32_t w; /* approximated weight */ + + /* import parameters from the flowset. They should be correct + * already. + */ + w = _q->fs->fs.par[0]; + cl->lmax = _q->fs->fs.par[1]; + if (!w || w > QFQ_MAX_WEIGHT) { + w = 1; + D("rounding weight to 1"); + } + cl->inv_w = ONE_FP/w; + w = ONE_FP/cl->inv_w; + if (q->wsum + w > QFQ_MAX_WSUM) + return EINVAL; + + i = qfq_calc_index(cl->inv_w, cl->lmax); + cl->grp = &q->groups[i]; + q->wsum += w; + // XXX cl->S = q->V; ? + // XXX compute q->i_wsum + return 0; +} + +/* remove an empty queue */ +static int +qfq_free_queue(struct dn_queue *_q, int safe) +{ + struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); + struct qfq_class *cl = (struct qfq_class *)_q; + if (cl->inv_w) { + q->wsum -= ONE_FP/cl->inv_w; + cl->inv_w = 0; /* reset weight to avoid run twice */ + } + return 0; +} + +/* Calculate a mask to mimic what would be ffs_from(). 
*/ +static inline unsigned long +mask_from(unsigned long bitmap, int from) +{ + return bitmap & ~((1UL << from) - 1); +} + +/* + * The state computation relies on ER=0, IR=1, EB=2, IB=3 + * First compute eligibility comparing grp->S, q->V, + * then check if someone is blocking us and possibly add EB + */ +static inline unsigned int +qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp) +{ + /* if S > V we are not eligible */ + unsigned int state = qfq_gt(grp->S, q->V); + unsigned long mask = mask_from(q->bitmaps[ER], grp->index); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(q, mask); + if (qfq_gt(grp->F, next->F)) + state |= EB; + } + + return state; +} + +/* + * In principle + * q->bitmaps[dst] |= q->bitmaps[src] & mask; + * q->bitmaps[src] &= ~mask; + * but we should make sure that src != dst + */ +static inline void +qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst) +{ + q->bitmaps[dst] |= q->bitmaps[src] & mask; + q->bitmaps[src] &= ~mask; +} + +static inline void +qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish) +{ + unsigned long mask = mask_from(q->bitmaps[ER], index + 1); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(q, mask); + if (!qfq_gt(next->F, old_finish)) + return; + } + + mask = (1UL << index) - 1; + qfq_move_groups(q, mask, EB, ER); + qfq_move_groups(q, mask, IB, IR); +} + +/* + * perhaps + * + old_V ^= q->V; + old_V >>= QFQ_MIN_SLOT_SHIFT; + if (old_V) { + ... + } + * + */ +static inline void +qfq_make_eligible(struct qfq_sched *q, uint64_t old_V) +{ + unsigned long mask, vslot, old_vslot; + + vslot = q->V >> QFQ_MIN_SLOT_SHIFT; + old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; + + if (vslot != old_vslot) { + mask = (2UL << (__fls(vslot ^ old_vslot))) - 1; + qfq_move_groups(q, mask, IR, ER); + qfq_move_groups(q, mask, IB, EB); + } +} + +/* + * XXX we should make sure that slot becomes less than 32. + * This is guaranteed by the input values. 
+ * roundedS is always cl->S rounded on grp->slot_shift bits. + */ +static inline void +qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS) +{ + uint64_t slot = (roundedS - grp->S) >> grp->slot_shift; + unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS; + + cl->next = grp->slots[i]; + grp->slots[i] = cl; + __set_bit(slot, &grp->full_slots); +} + +/* + * remove the entry from the slot + */ +static inline void +qfq_front_slot_remove(struct qfq_group *grp) +{ + struct qfq_class **h = &grp->slots[grp->front]; + + *h = (*h)->next; + if (!*h) + __clear_bit(0, &grp->full_slots); +} + +/* + * Returns the first full queue in a group. As a side effect, + * adjust the bucket list so the first non-empty bucket is at + * position 0 in full_slots. + */ +static inline struct qfq_class * +qfq_slot_scan(struct qfq_group *grp) +{ + int i; + + ND("grp %d full %x", grp->index, grp->full_slots); + if (!grp->full_slots) + return NULL; + + i = ffs(grp->full_slots) - 1; // zero-based + if (i > 0) { + grp->front = (grp->front + i) % QFQ_MAX_SLOTS; + grp->full_slots >>= i; + } + + return grp->slots[grp->front]; +} + +/* + * adjust the bucket list. When the start time of a group decreases, + * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to + * move the objects. The mask of occupied slots must be shifted + * because we use ffs() to find the first non-empty slot. + * This covers decreases in the group's start time, but what about + * increases of the start time ? 
 * Here too we should make sure that i is less than 32
 */
static inline void
qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS)
{
	/* distance, in slots, between the current group start and roundedS */
	unsigned int i = (grp->S - roundedS) >> grp->slot_shift;

	/* shift the occupancy bitmap and move the front index back by i */
	grp->full_slots <<= i;
	grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
}


/*
 * Re-evaluate eligibility after the virtual time moved on from old_V.
 * Nothing is done unless at least one group is in IR or IB.
 * If ER is empty, q->V is first raised to the start time of the group
 * returned by qfq_ffs() on the ineligible set, when that start time is
 * ahead of q->V. Then qfq_make_eligible() is called with old_V.
 */
static inline void
qfq_update_eligible(struct qfq_sched *q, uint64_t old_V)
{
	bitmap ineligible;

	ineligible = q->bitmaps[IR] | q->bitmaps[IB];
	if (ineligible) {
		if (!q->bitmaps[ER]) {
			struct qfq_group *grp;
			grp = qfq_ffs(q, ineligible);
			if (qfq_gt(grp->S, q->V))
				q->V = grp->S;
		}
		qfq_make_eligible(q, old_V);
	}
}

/*
 * Updates the class, returns true if also the group needs to be updated.
 *
 * The class start time becomes its previous finish time. A class with
 * no packet left is removed from the front slot. Otherwise F is
 * recomputed from the length of the head packet and inv_w; when the
 * rounded start still equals grp->S the class is left in place and 0 is
 * returned, else it is moved to the slot matching the new rounded start.
 */
static inline int
qfq_update_class(struct qfq_sched *q, struct qfq_group *grp,
	struct qfq_class *cl)
{

	cl->S = cl->F;
	if (cl->_q.mq.head == NULL) {
		qfq_front_slot_remove(grp);
	} else {
		unsigned int len;
		uint64_t roundedS;

		len = cl->_q.mq.head->m_pkthdr.len;
		cl->F = cl->S + (uint64_t)len * cl->inv_w;
		roundedS = qfq_round_down(cl->S, grp->slot_shift);
		if (roundedS == grp->S)
			return 0;

		qfq_front_slot_remove(grp);
		qfq_slot_insert(grp, cl, roundedS);
	}
	return 1;
}

/*
 * Extract one packet from the scheduler instance si.
 * The private qfq_sched state is located right after *si.
 * Returns NULL when no group is in ER, or when the selected class
 * yields no packet (reported through D()).
 * On success V advances by the packet length times IWSUM, then the
 * class, its group and the eligibility sets are updated.
 */
static struct mbuf *
qfq_dequeue(struct dn_sch_inst *si)
{
	struct qfq_sched *q = (struct qfq_sched *)(si + 1);
	struct qfq_group *grp;
	struct qfq_class *cl;
	struct mbuf *m;
	uint64_t old_V;

	NO(q->loops++;)
	if (!q->bitmaps[ER]) {
		NO(if (q->queued)
			dump_sched(q, "start dequeue");)
		return NULL;
	}

	grp = qfq_ffs(q, q->bitmaps[ER]);

	cl = grp->slots[grp->front];
	/* extract from the first bucket in the bucket list */
	m = dn_dequeue(&cl->_q);

	if (!m) {
		D("BUG/* non-workconserving leaf */");
		return NULL;
	}
	NO(q->queued--;)
	old_V = q->V;	/* remembered for qfq_update_eligible() below */
	q->V += (uint64_t)m->m_pkthdr.len * IWSUM;
	ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V);

	if (qfq_update_class(q, grp, cl)) {
		/* the front slot changed: recompute the group timestamps */
		uint64_t old_F = grp->F;
		cl = qfq_slot_scan(grp);
		if (!cl) { /* group gone, remove from ER */
			__clear_bit(grp->index, &q->bitmaps[ER]);
			// grp->S = grp->F + 1; // XXX debugging only
		} else {
			uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift);
			unsigned int s;

			if (grp->S == roundedS)
				goto skip_unblock;
			grp->S = roundedS;
			grp->F = roundedS + (2ULL << grp->slot_shift);
			/* remove from ER and put in the new set */
			__clear_bit(grp->index, &q->bitmaps[ER]);
			s = qfq_calc_state(q, grp);
			__set_bit(grp->index, &q->bitmaps[s]);
		}
		/* we need to unblock even if the group has gone away */
		qfq_unblock_groups(q, grp->index, old_F);
	}

skip_unblock:
	qfq_update_eligible(q, old_V);
	NO(if (!q->bitmaps[ER] && q->queued)
		dump_sched(q, "end dequeue");)

	return m;
}

/*
 * Assign a reasonable start time for a new flow k in group i.
 * Admissible values for \hat(F) are multiples of \sigma_i
 * no greater than V+\sigma_i . Larger values mean that
 * we had a wraparound so we consider the timestamp to be stale.
 *
 * If F is not stale and F >= V then we set S = F.
 * Otherwise we should assign S = V, but this may violate
 * the ordering in ER. So, if we have groups in ER, set S to
 * the F_j of the first group j which would be blocking us.
 * We are guaranteed not to move S backward because
 * otherwise our group i would still be blocked.
+ */ +static inline void +qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) +{ + unsigned long mask; + uint64_t limit, roundedF; + int slot_shift = cl->grp->slot_shift; + + roundedF = qfq_round_down(cl->F, slot_shift); + limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift); + + if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { + /* timestamp was stale */ + mask = mask_from(q->bitmaps[ER], cl->grp->index); + if (mask) { + struct qfq_group *next = qfq_ffs(q, mask); + if (qfq_gt(roundedF, next->F)) { + cl->S = next->F; + return; + } + } + cl->S = q->V; + } else { /* timestamp is not stale */ + cl->S = cl->F; + } +} + +static int +qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m) +{ + struct qfq_sched *q = (struct qfq_sched *)(si + 1); + struct qfq_group *grp; + struct qfq_class *cl = (struct qfq_class *)_q; + uint64_t roundedS; + int s; + + NO(q->loops++;) + DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len, + _q, cl->inv_w, cl->grp->index); + /* XXX verify that the packet obeys the parameters */ + if (m != _q->mq.head) { + if (dn_enqueue(_q, m, 0)) /* packet was dropped */ + return 1; + NO(q->queued++;) + if (m != _q->mq.head) + return 0; + } + /* If reach this point, queue q was idle */ + grp = cl->grp; + qfq_update_start(q, cl); /* adjust start time */ + /* compute new finish time and rounded start. */ + cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w; + roundedS = qfq_round_down(cl->S, grp->slot_shift); + + /* + * insert cl in the correct bucket. + * If cl->S >= grp->S we don't need to adjust the + * bucket list and simply go to the insertion phase. + * Otherwise grp->S is decreasing, we must make room + * in the bucket list, and also recompute the group state. + * Finally, if there were no flows in this group and nobody + * was in ER make sure to adjust V. 
+ */ + if (grp->full_slots) { + if (!qfq_gt(grp->S, cl->S)) + goto skip_update; + /* create a slot for this cl->S */ + qfq_slot_rotate(q, grp, roundedS); + /* group was surely ineligible, remove */ + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[IB]); + } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V)) + q->V = roundedS; + + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + ND("new state %d 0x%x", s, q->bitmaps[s]); + ND("S %llx F %llx V %llx", cl->S, cl->F, q->V); +skip_update: + qfq_slot_insert(grp, cl, roundedS); + + return 0; +} + + +#if 0 +static inline void +qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, + struct qfq_class *cl, struct qfq_class **pprev) +{ + unsigned int i, offset; + uint64_t roundedS; + + roundedS = qfq_round_down(cl->S, grp->slot_shift); + offset = (roundedS - grp->S) >> grp->slot_shift; + i = (grp->front + offset) % QFQ_MAX_SLOTS; + +#ifdef notyet + if (!pprev) { + pprev = &grp->slots[i]; + while (*pprev && *pprev != cl) + pprev = &(*pprev)->next; + } +#endif + + *pprev = cl->next; + if (!grp->slots[i]) + __clear_bit(offset, &grp->full_slots); +} + +/* + * called to forcibly destroy a queue. + * If the queue is not in the front bucket, or if it has + * other queues in the front bucket, we can simply remove + * the queue with no other side effects. + * Otherwise we must propagate the event up. + * XXX description to be completed. + */ +static void +qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl, + struct qfq_class **pprev) +{ + struct qfq_group *grp = &q->groups[cl->index]; + unsigned long mask; + uint64_t roundedS; + int s; + + cl->F = cl->S; // not needed if the class goes away. + qfq_slot_remove(q, grp, cl, pprev); + + if (!grp->full_slots) { + /* nothing left in the group, remove from all sets. 
+ * Do ER last because if we were blocking other groups + * we must unblock them. + */ + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[EB]); + __clear_bit(grp->index, &q->bitmaps[IB]); + + if (test_bit(grp->index, &q->bitmaps[ER]) && + !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) { + mask = q->bitmaps[ER] & ((1UL << grp->index) - 1); + if (mask) + mask = ~((1UL << __fls(mask)) - 1); + else + mask = ~0UL; + qfq_move_groups(q, mask, EB, ER); + qfq_move_groups(q, mask, IB, IR); + } + __clear_bit(grp->index, &q->bitmaps[ER]); + } else if (!grp->slots[grp->front]) { + cl = qfq_slot_scan(grp); + roundedS = qfq_round_down(cl->S, grp->slot_shift); + if (grp->S != roundedS) { + __clear_bit(grp->index, &q->bitmaps[ER]); + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[EB]); + __clear_bit(grp->index, &q->bitmaps[IB]); + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + } + } + qfq_update_eligible(q, q->V); +} +#endif + +static int +qfq_new_fsk(struct dn_fsk *f) +{ + ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight"); + ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen"); + ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]); + return 0; +} + +/* + * initialize a new scheduler instance + */ +static int +qfq_new_sched(struct dn_sch_inst *si) +{ + struct qfq_sched *q = (struct qfq_sched *)(si + 1); + struct qfq_group *grp; + int i; + + for (i = 0; i <= QFQ_MAX_INDEX; i++) { + grp = &q->groups[i]; + grp->index = i; + grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS - + (QFQ_MAX_INDEX - i); + } + return 0; +} + +/* + * QFQ scheduler descriptor + */ +static struct dn_alg qfq_desc = { + _SI( .type = ) DN_SCHED_QFQ, + _SI( .name = ) "QFQ", + _SI( .flags = ) DN_MULTIQUEUE, + + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct qfq_sched), + _SI( .q_datalen = ) sizeof(struct qfq_class) - 
sizeof(struct dn_queue), + + _SI( .enqueue = ) qfq_enqueue, + _SI( .dequeue = ) qfq_dequeue, + + _SI( .config = ) NULL, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) qfq_new_sched, + _SI( .free_sched = ) NULL, + _SI( .new_fsk = ) qfq_new_fsk, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) qfq_new_queue, + _SI( .free_queue = ) qfq_free_queue, +}; + +DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc); + +#ifdef QFQ_DEBUG +static void +dump_groups(struct qfq_sched *q, uint32_t mask) +{ + int i, j; + + for (i = 0; i < QFQ_MAX_INDEX + 1; i++) { + struct qfq_group *g = &q->groups[i]; + + if (0 == (mask & (1<slots[j]) + D(" bucket %d %p", j, g->slots[j]); + } + D("full_slots 0x%x", g->full_slots); + D(" %2d S 0x%20llx F 0x%llx %c", i, + g->S, g->F, + mask & (1<loops, q->queued, q->V); + D(" ER 0x%08x", q->bitmaps[ER]); + D(" EB 0x%08x", q->bitmaps[EB]); + D(" IR 0x%08x", q->bitmaps[IR]); + D(" IB 0x%08x", q->bitmaps[IB]); + dump_groups(q, 0xffffffff); +}; +#endif /* QFQ_DEBUG */ diff --git a/sys/netinet/ipfw/dn_sched_rr.c b/sys/netinet/ipfw/dn_sched_rr.c new file mode 100644 index 0000000..2a93746 --- /dev/null +++ b/sys/netinet/ipfw/dn_sched_rr.c @@ -0,0 +1,310 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: dn_sched_rr.c 11480 2012-07-31 08:02:00Z luigi $ + */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include +#include +#include +#include +#else +#include +#endif + +#define DN_SCHED_RR 3 // XXX Where? 
+ +struct rr_queue { + struct dn_queue q; /* Standard queue */ + int status; /* 1: queue is in the list */ + int credit; /* Number of bytes to transmit */ + int quantum; /* quantum * C */ + struct rr_queue *qnext; /* */ +}; + +/* struct rr_schk contains global config parameters + * and is right after dn_schk + */ +struct rr_schk { + int min_q; /* Min quantum */ + int max_q; /* Max quantum */ + int q_bytes; /* Bytes per quantum */ +}; + +/* per-instance round robin list, right after dn_sch_inst */ +struct rr_si { + struct rr_queue *head, *tail; /* Pointer to current queue */ +}; + +/* Append a queue to the rr list */ +static inline void +rr_append(struct rr_queue *q, struct rr_si *si) +{ + q->status = 1; /* mark as in-rr_list */ + q->credit = q->quantum; /* initialize credit */ + + /* append to the tail */ + if (si->head == NULL) + si->head = q; + else + si->tail->qnext = q; + si->tail = q; /* advance the tail pointer */ + q->qnext = si->head; /* make it circular */ +} + +/* Remove the head queue from circular list. */ +static inline void +rr_remove_head(struct rr_si *si) +{ + if (si->head == NULL) + return; /* empty queue */ + si->head->status = 0; + + if (si->head == si->tail) { + si->head = si->tail = NULL; + return; + } + + si->head = si->head->qnext; + si->tail->qnext = si->head; +} + +/* Remove a queue from circular list. 
+ * XXX see if ti can be merge with remove_queue() + */ +static inline void +remove_queue_q(struct rr_queue *q, struct rr_si *si) +{ + struct rr_queue *prev; + + if (q->status != 1) + return; + if (q == si->head) { + rr_remove_head(si); + return; + } + + for (prev = si->head; prev; prev = prev->qnext) { + if (prev->qnext != q) + continue; + prev->qnext = q->qnext; + if (q == si->tail) + si->tail = prev; + q->status = 0; + break; + } +} + + +static inline void +next_pointer(struct rr_si *si) +{ + if (si->head == NULL) + return; /* empty queue */ + + si->head = si->head->qnext; + si->tail = si->tail->qnext; +} + +static int +rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) +{ + struct rr_si *si; + struct rr_queue *rrq; + + if (m != q->mq.head) { + if (dn_enqueue(q, m, 0)) /* packet was dropped */ + return 1; + if (m != q->mq.head) + return 0; + } + + /* If reach this point, queue q was idle */ + si = (struct rr_si *)(_si + 1); + rrq = (struct rr_queue *)q; + + if (rrq->status == 1) /* Queue is already in the queue list */ + return 0; + + /* Insert the queue in the queue list */ + rr_append(rrq, si); + + return 0; +} + +static struct mbuf * +rr_dequeue(struct dn_sch_inst *_si) +{ + /* Access scheduler instance private data */ + struct rr_si *si = (struct rr_si *)(_si + 1); + struct rr_queue *rrq; + uint64_t len; + + while ( (rrq = si->head) ) { + struct mbuf *m = rrq->q.mq.head; + if ( m == NULL) { + /* empty queue, remove from list */ + rr_remove_head(si); + continue; + } + len = m->m_pkthdr.len; + + if (len > rrq->credit) { + /* Packet too big */ + rrq->credit += rrq->quantum; + /* Try next queue */ + next_pointer(si); + } else { + rrq->credit -= len; + return dn_dequeue(&rrq->q); + } + } + + /* no packet to dequeue*/ + return NULL; +} + +static int +rr_config(struct dn_schk *_schk) +{ + struct rr_schk *schk = (struct rr_schk *)(_schk + 1); + ND("called"); + + /* use reasonable quantums (64..2k bytes, default 1500) */ + schk->min_q = 64; + 
schk->max_q = 2048; + schk->q_bytes = 1500; /* quantum */ + + return 0; +} + +static int +rr_new_sched(struct dn_sch_inst *_si) +{ + struct rr_si *si = (struct rr_si *)(_si + 1); + + ND("called"); + si->head = si->tail = NULL; + + return 0; +} + +static int +rr_free_sched(struct dn_sch_inst *_si) +{ + ND("called"); + /* Nothing to do? */ + return 0; +} + +static int +rr_new_fsk(struct dn_fsk *fs) +{ + struct rr_schk *schk = (struct rr_schk *)(fs->sched + 1); + /* par[0] is the weight, par[1] is the quantum step */ + ipdn_bound_var(&fs->fs.par[0], 1, + 1, 65536, "RR weight"); + ipdn_bound_var(&fs->fs.par[1], schk->q_bytes, + schk->min_q, schk->max_q, "RR quantum"); + return 0; +} + +static int +rr_new_queue(struct dn_queue *_q) +{ + struct rr_queue *q = (struct rr_queue *)_q; + + _q->ni.oid.subtype = DN_SCHED_RR; + + q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1]; + ND("called, q->quantum %d", q->quantum); + q->credit = q->quantum; + q->status = 0; + + if (_q->mq.head != NULL) { + /* Queue NOT empty, insert in the queue list */ + rr_append(q, (struct rr_si *)(_q->_si + 1)); + } + return 0; +} + +static int +rr_free_queue(struct dn_queue *_q, int safe) +{ + struct rr_queue *q = (struct rr_queue *)_q; + + ND("called"); + if (safe) /* Delete only if status == 0 */ + return q->status; + + if (q->status == 1) { + struct rr_si *si = (struct rr_si *)(_q->_si + 1); + remove_queue_q(q, si); + } + return 0; +} + +/* + * RR scheduler descriptor + * contains the type of the scheduler, the name, the size of the + * structures and function pointers. 
+ */ +static struct dn_alg rr_desc = { + _SI( .type = ) DN_SCHED_RR, + _SI( .name = ) "RR", + _SI( .flags = ) DN_MULTIQUEUE, + + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct rr_si), + _SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue), + + _SI( .enqueue = ) rr_enqueue, + _SI( .dequeue = ) rr_dequeue, + + _SI( .config = ) rr_config, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) rr_new_sched, + _SI( .free_sched = ) rr_free_sched, + _SI( .new_fsk = ) rr_new_fsk, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) rr_new_queue, + _SI( .free_queue = ) rr_free_queue, +}; + + +DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc); diff --git a/sys/netinet/ipfw/dn_sched_wf2q.c b/sys/netinet/ipfw/dn_sched_wf2q.c new file mode 100644 index 0000000..86d0d57 --- /dev/null +++ b/sys/netinet/ipfw/dn_sched_wf2q.c @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: dn_sched_wf2q.c 11480 2012-07-31 08:02:00Z luigi $ + */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include +#include +#include +#include +#else +#include +#endif + +#ifndef MAX64 +#define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x) +#endif + +/* + * timestamps are computed on 64 bit using fixed point arithmetic. + * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len + * and sum of weights, respectively. FRAC_BITS is the number of + * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large + * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w + * using an unsigned 32-bit division, and to avoid wraparounds we need + * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64 + * As an example + * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19 + */ +#ifndef FRAC_BITS +#define FRAC_BITS 28 /* shift for fixed point arithmetic */ +#define ONE_FP (1UL << FRAC_BITS) +#endif + +/* + * Private information for the scheduler instance: + * sch_heap (key is Finish time) returns the next queue to serve + * ne_heap (key is Start time) stores not-eligible queues + * idle_heap (key=start/finish time) stores idle flows. It must + * support extract-from-middle. + * A flow is only in 1 of the three heaps. + * XXX todo: use a more efficient data structure, e.g. 
 * a tree sorted
 * by F with min_subtree(S) in each node
 */
struct wf2qp_si {
	struct dn_heap sch_heap;	/* top extract - key Finish time */
	struct dn_heap ne_heap;	/* top extract - key Start time */
	struct dn_heap idle_heap;	/* random extract - key Start=Finish time */
	uint64_t V;		/* virtual time */
	uint32_t inv_wsum;	/* inverse of sum of weights */
	uint32_t wsum;		/* sum of weights */
};

/*
 * Per-flow state: the generic dummynet queue followed by the WF2Q+
 * timestamps. A queue with S == F + 1 is treated as not known to the
 * scheduler (see idle_check() and wf2qp_new_queue()).
 */
struct wf2qp_queue {
	struct dn_queue _q;
	uint64_t S, F;		/* start time, finish time */
	uint32_t inv_w;		/* ONE_FP / weight */
	int32_t heap_pos;	/* position (index) of struct in heap */
};

/*
 * This file implements a WF2Q+ scheduler as it has been in dummynet
 * since 2000.
 * The scheduler supports per-flow queues and has O(log N) complexity.
 *
 * WF2Q+ needs to drain entries from the idle heap so that we
 * can keep the sum of weights up to date. We can do it whenever
 * we get a chance, or periodically, or following some other
 * strategy. The function idle_check() drains at most N elements
 * from the idle heap.
 *
 * Unless 'force' is set, only entries whose key is before si->V are
 * drained. Each drained queue gets its timestamp invalidated and its
 * weight removed from wsum; inv_wsum is refreshed while wsum > 0.
 */
static void
idle_check(struct wf2qp_si *si, int n, int force)
{
	struct dn_heap *h = &si->idle_heap;
	while (n-- > 0 && h->elements > 0 &&
	    (force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) {
		struct dn_queue *q = HEAP_TOP(h)->object;
		struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;

		heap_extract(h, NULL);
		/* XXX to let the flowset delete the queue we should
		 * mark it as 'unused' by the scheduler.
		 */
		alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */
		si->wsum -= q->fs->fs.par[0];	/* adjust sum of weights */
		if (si->wsum > 0)
			si->inv_wsum = ONE_FP/si->wsum;
	}
}

/*
 * Enqueue packet m on queue q.
 * Returns 1 when dn_enqueue() dropped the packet, 0 otherwise.
 * When m is already the head of the queue the dn_enqueue() step is
 * skipped. The heaps are only touched when m ends up at the head of
 * the queue, i.e. when the flow was idle.
 */
static int
wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
{
	struct dn_fsk *fs = q->fs;
	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
	struct wf2qp_queue *alg_fq;
	uint64_t len = m->m_pkthdr.len;

	if (m != q->mq.head) {
		if (dn_enqueue(q, m, 0)) /* packet was dropped */
			return 1;
		if (m != q->mq.head)	/* queue was already busy */
			return 0;
	}

	/* If reach this point, queue q was idle */
	alg_fq = (struct wf2qp_queue *)q;

	if (DN_KEY_LT(alg_fq->F, alg_fq->S)) {
		/* Brand new queue. */
		alg_fq->S = si->V;		/* init start time */
		si->wsum += fs->fs.par[0];	/* add weight of new queue. */
		si->inv_wsum = ONE_FP/si->wsum;
	} else { /* if it was idle then it was in the idle heap */
		heap_extract(&si->idle_heap, q);
		alg_fq->S = MAX64(alg_fq->F, si->V);	/* compute new S */
	}
	alg_fq->F = alg_fq->S + len * alg_fq->inv_w;

	/* if nothing is backlogged, make sure this flow is eligible */
	if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0)
		si->V = MAX64(alg_fq->S, si->V);

	/*
	 * Look at eligibility. A flow is not eligible if S>V (when
	 * this happens, it means that there is some other flow already
	 * scheduled for the same pipe, so the sch_heap cannot be
	 * empty). If the flow is not eligible we just store it in the
	 * ne_heap. Otherwise, we store in the sch_heap.
	 * Note that for all flows in sch_heap (SCH), S_i <= V,
	 * and for all flows in ne_heap (NEH), S_i > V.
	 * So when we need to compute max(V, min(S_i)) forall i in
	 * SCH+NEH, we only need to look into NEH.
	 */
	if (DN_KEY_LT(si->V, alg_fq->S)) {
		/* S>V means flow Not eligible. */
		if (si->sch_heap.elements == 0)
			D("++ ouch! not eligible but empty scheduler!");
		heap_insert(&si->ne_heap, alg_fq->S, q);
	} else {
		heap_insert(&si->sch_heap, alg_fq->F, q);
	}
	return 0;
}

/*
 * Return the next packet to send, or NULL when no flow is backlogged
 * (both sch_heap and ne_heap empty).
 */
/* XXX invariant: sch > 0 || V >= min(S in neh) */
static struct mbuf *
wf2qp_dequeue(struct dn_sch_inst *_si)
{
	/* Access scheduler instance private data */
	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
	struct mbuf *m;
	struct dn_queue *q;
	struct dn_heap *sch = &si->sch_heap;
	struct dn_heap *neh = &si->ne_heap;
	struct wf2qp_queue *alg_fq;

	if (sch->elements == 0 && neh->elements == 0) {
		/* we have nothing to do. We could kill the idle heap
		 * altogether and reset V
		 */
		idle_check(si, 0x7fffffff, 1);
		si->V = 0;
		si->wsum = 0;	/* should be set already */
		return NULL; /* quick return if nothing to do */
	}
	idle_check(si, 1, 0);	/* drain something from the idle heap */

	/* make sure at least one element is eligible, bumping V
	 * and moving entries that have become eligible.
	 * We need to repeat the first part twice, before and
	 * after extracting the candidate, or enqueue() will
	 * find the data structure in a wrong state.
	 */
	m = NULL;
	for(;;) {
		/*
		 * Compute V = max(V, min(S_i)). Remember that all elements
		 * in sch have by definition S_i <= V so if sch is not empty,
		 * V is surely the max and we must not update it. Conversely,
		 * if sch is empty we only need to look at neh.
		 * We don't need to move the queues, as it will be done at the
		 * next enqueue
		 */
		if (sch->elements == 0 && neh->elements > 0) {
			si->V = MAX64(si->V, HEAP_TOP(neh)->key);
		}
		/* move every queue that became eligible from neh to sch */
		while (neh->elements > 0 &&
		    DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) {
			q = HEAP_TOP(neh)->object;
			alg_fq = (struct wf2qp_queue *)q;
			heap_extract(neh, NULL);
			heap_insert(sch, alg_fq->F, q);
		}
		if (m) /* pkt found in previous iteration */
			break;
		/* ok we have at least one eligible pkt */
		q = HEAP_TOP(sch)->object;
		alg_fq = (struct wf2qp_queue *)q;
		m = dn_dequeue(q);
		heap_extract(sch, NULL); /* Remove queue from heap. */
		si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum;
		alg_fq->S = alg_fq->F;	/* Update start time. */
		if (q->mq.head == 0) {	/* not backlogged any more. */
			heap_insert(&si->idle_heap, alg_fq->F, q);
		} else {			/* Still backlogged. */
			/* Update F, store in neh or sch */
			uint64_t len = q->mq.head->m_pkthdr.len;
			alg_fq->F += len * alg_fq->inv_w;
			if (DN_KEY_LEQ(alg_fq->S, si->V)) {
				heap_insert(sch, alg_fq->F, q);
			} else {
				heap_insert(neh, alg_fq->S, q);
			}
		}
	}
	return m;
}

/*
 * Instance hook: set up the three heaps, each with 16 initial entries
 * and the offset of heap_pos inside struct wf2qp_queue.
 * Returns ENOMEM, after releasing all three, if any heap_init() fails.
 */
static int
wf2qp_new_sched(struct dn_sch_inst *_si)
{
	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
	int ofs = offsetof(struct wf2qp_queue, heap_pos);

	/* all heaps support extract from middle */
	if (heap_init(&si->idle_heap, 16, ofs) ||
	    heap_init(&si->sch_heap, 16, ofs) ||
	    heap_init(&si->ne_heap, 16, ofs)) {
		heap_free(&si->ne_heap);
		heap_free(&si->sch_heap);
		heap_free(&si->idle_heap);
		return ENOMEM;
	}
	return 0;
}

/* Instance hook: release the three heaps. */
static int
wf2qp_free_sched(struct dn_sch_inst *_si)
{
	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);

	heap_free(&si->sch_heap);
	heap_free(&si->ne_heap);
	heap_free(&si->idle_heap);

	return 0;
}

/* Flowset hook: bound the weight in par[0] (default 1, range 1..100). */
static int
wf2qp_new_fsk(struct dn_fsk *fs)
{
	ipdn_bound_var(&fs->fs.par[0], 1,
		1, 100, "WF2Q+ weight");
	return 0;
}

/*
 * Queue hook: start with an invalid timestamp (S = F + 1) and compute
 * the inverse weight. A queue that already holds packets is handed to
 * wf2qp_enqueue() with its current head packet.
 */
static int
wf2qp_new_queue(struct dn_queue *_q)
{
	struct wf2qp_queue *q = (struct wf2qp_queue *)_q;

	_q->ni.oid.subtype = DN_SCHED_WF2QP;
	q->F = 0;	/* not strictly necessary */
	q->S = q->F + 1;    /* mark timestamp as invalid. */
	q->inv_w = ONE_FP / _q->fs->fs.par[0];
	if (_q->mq.head != NULL) {
		wf2qp_enqueue(_q->_si, _q, _q->mq.head);
	}
	return 0;
}

/*
 * Called when the infrastructure removes a queue (e.g. flowset
 * is reconfigured). Nothing to do if we did not 'own' the queue,
 * otherwise remove it from the right heap and adjust the sum
 * of weights.
 *
 * Returns 1 only in safe mode, when the queue is still in a heap and
 * is therefore left alone; 0 in every other case.
 */
static int
wf2qp_free_queue(struct dn_queue *q, int safe)
{
	struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
	struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1);

	if (alg_fq->S >= alg_fq->F + 1)
		return 0;	/* nothing to do, not in any heap */

	/* queue is in a scheduler heap */
	if (safe)	/* do not delete in safe mode */
		return 1;

	si->wsum -= q->fs->fs.par[0];
	if (si->wsum > 0)
		si->inv_wsum = ONE_FP/si->wsum;

	/* extract from the heap. XXX TODO we may need to adjust V
	 * to make sure the invariants hold.
	 */
	if (q->mq.head == NULL) {
		heap_extract(&si->idle_heap, q);
	} else if (DN_KEY_LT(si->V, alg_fq->S)) {
		heap_extract(&si->ne_heap, q);
	} else {
		heap_extract(&si->sch_heap, q);
	}
	return 0;
}

/*
 * WF2Q+ scheduler descriptor
 * contains the type of the scheduler, the name, the size of the
 * structures and function pointers.
 */
static struct dn_alg wf2qp_desc = {
	/* keep the entries in this order: _SI() may expand to nothing */
	_SI( .type = ) DN_SCHED_WF2QP,
	_SI( .name = ) "WF2Q+",
	_SI( .flags = ) DN_MULTIQUEUE,

	/* we need extra space in the si and the queue */
	_SI( .schk_datalen = ) 0,
	_SI( .si_datalen = ) sizeof(struct wf2qp_si),
	_SI( .q_datalen = ) sizeof(struct wf2qp_queue) -
				sizeof(struct dn_queue),

	_SI( .enqueue = ) wf2qp_enqueue,
	_SI( .dequeue = ) wf2qp_dequeue,

	_SI( .config = )  NULL,
	_SI( .destroy = )  NULL,
	_SI( .new_sched = ) wf2qp_new_sched,
	_SI( .free_sched = ) wf2qp_free_sched,

	_SI( .new_fsk = ) wf2qp_new_fsk,
	_SI( .free_fsk = )  NULL,

	_SI( .new_queue = ) wf2qp_new_queue,
	_SI( .free_queue = ) wf2qp_free_queue,
};


DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc);
diff --git a/sys/netinet/ipfw/ip_dn_glue.c b/sys/netinet/ipfw/ip_dn_glue.c
new file mode 100644
index 0000000..aa0ac90
--- /dev/null
+++ b/sys/netinet/ipfw/ip_dn_glue.c
@@ -0,0 +1,846 @@
/*-
 * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id: ip_dn_glue.c 12500 2013-12-11 23:07:58Z luigi $
 *
 * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8
 */

#include "opt_inet6.h"

/*
 * NOTE(review): the header names of the #include lines below are
 * missing in this copy; restore them from the original file.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include		/* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
#include
#include	/* ip_output(), IP_FORWARDING */
#include
#include

#include
#include
#include
#include

/* FREEBSD7.2 ip_dummynet.h r191715*/

/* One heap entry in the old layout; the field order must not change. */
struct dn_heap_entry7 {
	int64_t key;        /* sorting key. Topmost element is smallest one */
	void *object;      /* object pointer */
};

/* Heap header in the old layout; the field order must not change. */
struct dn_heap7 {
	int size;
	int elements;
	int offset; /* XXX if > 0 this is the offset of direct ptr to obj */
	struct dn_heap_entry7 *p;   /* really an array of "size" entries */
};

/* Common to 7.2 and 8 */
struct dn_flow_set {
	SLIST_ENTRY(dn_flow_set)    next;   /* linked list in a hash slot */

	u_short fs_nr ;             /* flow_set number       */
	u_short flags_fs;
#define DNOLD_HAVE_FLOW_MASK   0x0001
#define DNOLD_IS_RED       0x0002
#define DNOLD_IS_GENTLE_RED    0x0004
#define DNOLD_QSIZE_IS_BYTES   0x0008  /* queue size is measured in bytes */
#define DNOLD_NOERROR      0x0010  /* do not report ENOBUFS on drops  */
#define DNOLD_HAS_PROFILE      0x0020  /* the pipe has a delay profile. */
#define DNOLD_IS_PIPE      0x4000
#define DNOLD_IS_QUEUE     0x8000

	struct dn_pipe7 *pipe ;  /* pointer to parent pipe */
	u_short parent_nr ;     /* parent pipe#, 0 if local to a pipe */

	int weight ;        /* WFQ queue weight */
	int qsize ;         /* queue size in slots or bytes */
	int plr ;           /* pkt loss rate (2^31-1 means 100%) */

	struct ipfw_flow_id flow_mask ;

	/* hash table of queues onto this flow_set */
	int rq_size ;       /* number of slots */
	int rq_elements ;       /* active elements */
	struct dn_flow_queue7 **rq;  /* array of rq_size entries */

	u_int32_t last_expired ;    /* do not expire too frequently */
	int backlogged ;        /* #active queues for this flowset */

	/* RED parameters */
#define SCALE_RED               16
#define SCALE(x)                ( (x) << SCALE_RED )
#define SCALE_VAL(x)            ( (x) >> SCALE_RED )
#define SCALE_MUL(x,y)          ( ( (x) * (y) ) >> SCALE_RED )
	int w_q ;           /* queue weight (scaled) */
	int max_th ;        /* maximum threshold for queue (scaled) */
	int min_th ;        /* minimum threshold for queue (scaled) */
	int max_p ;         /* maximum value for p_b (scaled) */
	u_int c_1 ;         /* max_p/(max_th-min_th) (scaled) */
	u_int c_2 ;         /* max_p*min_th/(max_th-min_th) (scaled) */
	u_int c_3 ;         /* for GRED, (1-max_p)/max_th (scaled) */
	u_int c_4 ;         /* for GRED, 1 - 2*max_p (scaled) */
	u_int * w_q_lookup ;    /* lookup table for computing (1-w_q)^t */
	u_int lookup_depth ;    /* depth of lookup table */
	int lookup_step ;       /* granularity inside the lookup table */
	int lookup_weight ;     /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
	int avg_pkt_size ;      /* medium packet size */
	int max_pkt_size ;      /* max packet size */
};
SLIST_HEAD(dn_flow_set_head, dn_flow_set);

#define DN_IS_PIPE		0x4000
#define DN_IS_QUEUE		0x8000
/* Per-flow queue, listed under the FREEBSD7.2 heading above. */
struct dn_flow_queue7 {
	struct dn_flow_queue7 *next ;
	struct ipfw_flow_id id ;

	struct mbuf *head, *tail ;  /* queue of packets */
	u_int len ;
	u_int len_bytes ;

	u_long numbytes;

	u_int64_t tot_pkts ;    /* statistics counters  */
	u_int64_t tot_bytes ;
	u_int32_t drops ;

	int hash_slot ;     /* debugging/diagnostic */

	/* RED parameters */
	int avg ;                   /* average queue length est. (scaled) */
	int count ;                 /* arrivals since last RED drop */
	int random ;                /* random value (scaled) */
	u_int32_t q_time;      /* start of queue idle time */

	/* WF2Q+ support */
	struct dn_flow_set *fs ;    /* parent flow set */
	int heap_pos ;      /* position (index) of struct in heap */
	int64_t sched_time ;     /* current time when queue enters ready_heap */

	int64_t S,F ;        /* start time, finish time */
};

/* Pipe descriptor, listed under the FREEBSD7.2 heading above. */
struct dn_pipe7 {        /* a pipe */
	SLIST_ENTRY(dn_pipe7)    next;   /* linked list in a hash slot */

	int pipe_nr ;       /* number   */
	int bandwidth;      /* really, bytes/tick.  */
	int delay ;         /* really, ticks    */

	struct  mbuf *head, *tail ; /* packets in delay line */

	/* WF2Q+ */
	struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
	struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
	struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */

	int64_t V ;          /* virtual time */
	int sum;            /* sum of weights of all active sessions */

	int numbytes;

	int64_t sched_time ;     /* time pipe was scheduled in ready_heap */

	/*
	 * When the tx clock come from an interface (if_name[0] != '\0'), its name
	 * is stored below, whereas the ifp is filled when the rule is configured.
	 */
	char if_name[IFNAMSIZ];
	struct ifnet *ifp ;
	int ready ; /* set if ifp != NULL and we got a signal from it */

	struct dn_flow_set fs ; /* used with fixed-rate flows */
};
SLIST_HEAD(dn_pipe_head7, dn_pipe7);


/* FREEBSD8 ip_dummynet.h r196045 */
struct dn_flow_queue8 {
	struct dn_flow_queue8 *next ;
	struct ipfw_flow_id id ;

	struct mbuf *head, *tail ;  /* queue of packets */
	u_int len ;
	u_int len_bytes ;

	uint64_t numbytes ;     /* credit for transmission (dynamic queues) */
	int64_t extra_bits;     /* extra bits simulating unavailable channel */

	u_int64_t tot_pkts ;    /* statistics counters  */
	u_int64_t tot_bytes ;
	u_int32_t drops ;

	int hash_slot ;     /* debugging/diagnostic */

	/* RED parameters */
	int avg ;                   /* average queue length est. (scaled) */
	int count ;                 /* arrivals since last RED drop */
	int random ;                /* random value (scaled) */
	int64_t idle_time;       /* start of queue idle time */

	/* WF2Q+ support */
	struct dn_flow_set *fs ;    /* parent flow set */
	int heap_pos ;      /* position (index) of struct in heap */
	int64_t sched_time ;     /* current time when queue enters ready_heap */

	int64_t S,F ;        /* start time, finish time */
};

struct dn_pipe8 {        /* a pipe */
	SLIST_ENTRY(dn_pipe8)    next;   /* linked list in a hash slot */

	int pipe_nr ;       /* number   */
	int bandwidth;      /* really, bytes/tick.  */
	int delay ;         /* really, ticks    */

	struct  mbuf *head, *tail ; /* packets in delay line */

	/* WF2Q+ */
	struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
	struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
	struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */

	int64_t V ;          /* virtual time */
	int sum;            /* sum of weights of all active sessions */

	/* Same as in dn_flow_queue, numbytes can become large */
	int64_t numbytes;       /* bits I can transmit (more or less).
*/ + uint64_t burst; /* burst size, scaled: bits * hz */ + + int64_t sched_time ; /* time pipe was scheduled in ready_heap */ + int64_t idle_time; /* start of pipe idle time */ + + char if_name[IFNAMSIZ]; + struct ifnet *ifp ; + int ready ; /* set if ifp != NULL and we got a signal from it */ + + struct dn_flow_set fs ; /* used with fixed-rate flows */ + + /* fields to simulate a delay profile */ +#define ED_MAX_NAME_LEN 32 + char name[ED_MAX_NAME_LEN]; + int loss_level; + int samples_no; + int *samples; +}; + +#define ED_MAX_SAMPLES_NO 1024 +struct dn_pipe_max8 { + struct dn_pipe8 pipe; + int samples[ED_MAX_SAMPLES_NO]; +}; +SLIST_HEAD(dn_pipe_head8, dn_pipe8); + +/* + * Changes from 7.2 to 8: + * dn_pipe: + * numbytes from int to int64_t + * add burst (int64_t) + * add idle_time (int64_t) + * add profile + * add struct dn_pipe_max + * add flag DN_HAS_PROFILE + * + * dn_flow_queue + * numbytes from u_long to int64_t + * add extra_bits (int64_t) + * q_time from u_int32_t to int64_t and name idle_time + * + * dn_flow_set unchanged + * + */ + +/* NOTE:XXX copied from dummynet.c */ +#define O_NEXT(p, len) ((void *)((char *)p + len)) +static void +oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) +{ + oid->len = len; + oid->type = type; + oid->subtype = 0; + oid->id = id; +} +/* make room in the buffer and move the pointer forward */ +static void * +o_next(struct dn_id **o, int len, int type) +{ + struct dn_id *ret = *o; + oid_fill(ret, len, type, 0); + *o = O_NEXT(*o, len); + return ret; +} + + +static size_t pipesize7 = sizeof(struct dn_pipe7); +static size_t pipesize8 = sizeof(struct dn_pipe8); +static size_t pipesizemax8 = sizeof(struct dn_pipe_max8); + +/* Indicate 'ipfw' version + * 1: from FreeBSD 7.2 + * 0: from FreeBSD 8 + * -1: unknown (for now is unused) + * + * It is update when a IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request arrives + * NOTE: if a IP_DUMMYNET_GET arrives and the 'ipfw' version is unknown, + * it is suppose to be the FreeBSD 8 
version. + */ +static int is7 = 0; + +static int +convertflags2new(int src) +{ + int dst = 0; + + if (src & DNOLD_HAVE_FLOW_MASK) + dst |= DN_HAVE_MASK; + if (src & DNOLD_QSIZE_IS_BYTES) + dst |= DN_QSIZE_BYTES; + if (src & DNOLD_NOERROR) + dst |= DN_NOERROR; + if (src & DNOLD_IS_RED) + dst |= DN_IS_RED; + if (src & DNOLD_IS_GENTLE_RED) + dst |= DN_IS_GENTLE_RED; + if (src & DNOLD_HAS_PROFILE) + dst |= DN_HAS_PROFILE; + + return dst; +} + +static int +convertflags2old(int src) +{ + int dst = 0; + + if (src & DN_HAVE_MASK) + dst |= DNOLD_HAVE_FLOW_MASK; + if (src & DN_IS_RED) + dst |= DNOLD_IS_RED; + if (src & DN_IS_GENTLE_RED) + dst |= DNOLD_IS_GENTLE_RED; + if (src & DN_NOERROR) + dst |= DNOLD_NOERROR; + if (src & DN_HAS_PROFILE) + dst |= DNOLD_HAS_PROFILE; + if (src & DN_QSIZE_BYTES) + dst |= DNOLD_QSIZE_IS_BYTES; + + return dst; +} + +static int +dn_compat_del(void *v) +{ + struct dn_pipe7 *p = (struct dn_pipe7 *) v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *) v; + struct { + struct dn_id oid; + uintptr_t a[1]; /* add more if we want a list */ + } cmd; + + /* XXX DN_API_VERSION ??? */ + oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); + + if (is7) { + if (p->pipe_nr == 0 && p->fs.fs_nr == 0) + return EINVAL; + if (p->pipe_nr != 0 && p->fs.fs_nr != 0) + return EINVAL; + } else { + if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0) + return EINVAL; + if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0) + return EINVAL; + } + + if (p->pipe_nr != 0) { /* pipe x delete */ + cmd.a[0] = p->pipe_nr; + cmd.oid.subtype = DN_LINK; + } else { /* queue x delete */ + cmd.oid.subtype = DN_FS; + cmd.a[0] = (is7) ? 
p->fs.fs_nr : p8->fs.fs_nr; + } + + return do_config(&cmd, cmd.oid.len); +} + +static int +dn_compat_config_queue(struct dn_fs *fs, void* v) +{ + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + struct dn_flow_set *f; + + if (is7) + f = &p7->fs; + else + f = &p8->fs; + + fs->fs_nr = f->fs_nr; + fs->sched_nr = f->parent_nr; + fs->flow_mask = f->flow_mask; + fs->buckets = f->rq_size; + fs->qsize = f->qsize; + fs->plr = f->plr; + fs->par[0] = f->weight; + fs->flags = convertflags2new(f->flags_fs); + if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) { + fs->w_q = f->w_q; + fs->max_th = f->max_th; + fs->min_th = f->min_th; + fs->max_p = f->max_p; + } + + return 0; +} + +static int +dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p, + struct dn_fs *fs, void* v) +{ + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + int i = p7->pipe_nr; + + sch->sched_nr = i; + sch->oid.subtype = 0; + p->link_nr = i; + fs->fs_nr = i + 2*DN_MAX_ID; + fs->sched_nr = i + DN_MAX_ID; + + /* Common to 7 and 8 */ + p->bandwidth = p7->bandwidth; + p->delay = p7->delay; + if (!is7) { + /* FreeBSD 8 has burst */ + p->burst = p8->burst; + } + + /* fill the fifo flowset */ + dn_compat_config_queue(fs, v); + fs->fs_nr = i + 2*DN_MAX_ID; + fs->sched_nr = i + DN_MAX_ID; + + /* Move scheduler related parameter from fs to sch */ + sch->buckets = fs->buckets; /*XXX*/ + fs->buckets = 0; + if (fs->flags & DN_HAVE_MASK) { + sch->flags |= DN_HAVE_MASK; + fs->flags &= ~DN_HAVE_MASK; + sch->sched_mask = fs->flow_mask; + bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id)); + } + + return 0; +} + +static int +dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p, + void *v) +{ + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + + p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]); + + pf->link_nr = p->link_nr; + pf->loss_level = p8->loss_level; +// pf->bandwidth = p->bandwidth; //XXX bandwidth 
redundant? + pf->samples_no = p8->samples_no; + strncpy(pf->name, p8->name,sizeof(pf->name)); + bcopy(p8->samples, pf->samples, sizeof(pf->samples)); + + return 0; +} + +/* + * If p->pipe_nr != 0 the command is 'pipe x config', so need to create + * the three main struct, else only a flowset is created + */ +static int +dn_compat_configure(void *v) +{ + struct dn_id *buf = NULL, *base; + struct dn_sch *sch = NULL; + struct dn_link *p = NULL; + struct dn_fs *fs = NULL; + struct dn_profile *pf = NULL; + int lmax; + int error; + + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + + int i; /* number of object to configure */ + + lmax = sizeof(struct dn_id); /* command header */ + lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + + sizeof(struct dn_fs) + sizeof(struct dn_profile); + + base = buf = malloc(lmax, M_DUMMYNET, M_WAITOK|M_ZERO); + o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); + base->id = DN_API_VERSION; + + /* pipe_nr is the same in p7 and p8 */ + i = p7->pipe_nr; + if (i != 0) { /* pipe config */ + sch = o_next(&buf, sizeof(*sch), DN_SCH); + p = o_next(&buf, sizeof(*p), DN_LINK); + fs = o_next(&buf, sizeof(*fs), DN_FS); + + error = dn_compat_config_pipe(sch, p, fs, v); + if (error) { + free(buf, M_DUMMYNET); + return error; + } + if (!is7 && p8->samples_no > 0) { + /* Add profiles*/ + pf = o_next(&buf, sizeof(*pf), DN_PROFILE); + error = dn_compat_config_profile(pf, p, v); + if (error) { + free(buf, M_DUMMYNET); + return error; + } + } + } else { /* queue config */ + fs = o_next(&buf, sizeof(*fs), DN_FS); + error = dn_compat_config_queue(fs, v); + if (error) { + free(buf, M_DUMMYNET); + return error; + } + } + error = do_config(base, (char *)buf - (char *)base); + + if (buf) + free(buf, M_DUMMYNET); + return error; +} + +int +dn_compat_calc_size(void) +{ + int need = 0; + /* XXX use FreeBSD 8 struct size */ + /* NOTE: + * - half scheduler: schk_count/2 + * - all flowset: fsk_count + * - all flowset 
queues: queue_count + * - all pipe queue: si_count + */ + need += dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2; + need += dn_cfg.fsk_count * sizeof(struct dn_flow_set); + need += dn_cfg.si_count * sizeof(struct dn_flow_queue8); + need += dn_cfg.queue_count * sizeof(struct dn_flow_queue8); + + return need; +} + +int +dn_c_copy_q (void *_ni, void *arg) +{ + struct copy_args *a = arg; + struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start; + struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start; + struct dn_flow *ni = (struct dn_flow *)_ni; + int size = 0; + + /* XXX hash slot not set */ + /* No difference between 7.2/8 */ + fq7->len = ni->length; + fq7->len_bytes = ni->len_bytes; + fq7->id = ni->fid; + + if (is7) { + size = sizeof(struct dn_flow_queue7); + fq7->tot_pkts = ni->tot_pkts; + fq7->tot_bytes = ni->tot_bytes; + fq7->drops = ni->drops; + } else { + size = sizeof(struct dn_flow_queue8); + fq8->tot_pkts = ni->tot_pkts; + fq8->tot_bytes = ni->tot_bytes; + fq8->drops = ni->drops; + } + + *a->start += size; + return 0; +} + +int +dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq) +{ + struct dn_link *l = &s->link; + struct dn_fsk *f = s->fs; + + struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start; + struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start; + struct dn_flow_set *fs; + int size = 0; + + if (is7) { + fs = &pipe7->fs; + size = sizeof(struct dn_pipe7); + } else { + fs = &pipe8->fs; + size = sizeof(struct dn_pipe8); + } + + /* These 4 field are the same in pipe7 and pipe8 */ + pipe7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE; + pipe7->bandwidth = l->bandwidth; + pipe7->delay = l->delay * 1000 / hz; + pipe7->pipe_nr = l->link_nr - DN_MAX_ID; + + if (!is7) { + if (s->profile) { + struct dn_profile *pf = s->profile; + strncpy(pipe8->name, pf->name, sizeof(pf->name)); + pipe8->loss_level = pf->loss_level; + pipe8->samples_no = pf->samples_no; + } + pipe8->burst = div64(l->burst , 8 * hz); + } + + fs->flow_mask = 
s->sch.sched_mask; + fs->rq_size = s->sch.buckets ? s->sch.buckets : 1; + + fs->parent_nr = l->link_nr - DN_MAX_ID; + fs->qsize = f->fs.qsize; + fs->plr = f->fs.plr; + fs->w_q = f->fs.w_q; + fs->max_th = f->max_th; + fs->min_th = f->min_th; + fs->max_p = f->fs.max_p; + fs->rq_elements = nq; + + fs->flags_fs = convertflags2old(f->fs.flags); + + *a->start += size; + return 0; +} + + +int +dn_compat_copy_pipe(struct copy_args *a, void *_o) +{ + int have = a->end - *a->start; + int need = 0; + int pipe_size = sizeof(struct dn_pipe8); + int queue_size = sizeof(struct dn_flow_queue8); + int n_queue = 0; /* number of queues */ + + struct dn_schk *s = (struct dn_schk *)_o; + /* calculate needed space: + * - struct dn_pipe + * - if there are instances, dn_queue * n_instances + */ + n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) : + (s->siht ? 1 : 0)); + need = pipe_size + queue_size * n_queue; + if (have < need) { + D("have %d < need %d", have, need); + return 1; + } + /* copy pipe */ + dn_c_copy_pipe(s, a, n_queue); + + /* copy queues */ + if (s->sch.flags & DN_HAVE_MASK) + dn_ht_scan(s->siht, dn_c_copy_q, a); + else if (s->siht) + dn_c_copy_q(s->siht, a); + return 0; +} + +int +dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq) +{ + struct dn_flow_set *fs = (struct dn_flow_set *)*a->start; + + fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE; + fs->fs_nr = f->fs.fs_nr; + fs->qsize = f->fs.qsize; + fs->plr = f->fs.plr; + fs->w_q = f->fs.w_q; + fs->max_th = f->max_th; + fs->min_th = f->min_th; + fs->max_p = f->fs.max_p; + fs->flow_mask = f->fs.flow_mask; + fs->rq_elements = nq; + fs->rq_size = (f->fs.buckets ? 
f->fs.buckets : 1); + fs->parent_nr = f->fs.sched_nr; + fs->weight = f->fs.par[0]; + + fs->flags_fs = convertflags2old(f->fs.flags); + *a->start += sizeof(struct dn_flow_set); + return 0; +} + +int +dn_compat_copy_queue(struct copy_args *a, void *_o) +{ + int have = a->end - *a->start; + int need = 0; + int fs_size = sizeof(struct dn_flow_set); + int queue_size = sizeof(struct dn_flow_queue8); + + struct dn_fsk *fs = (struct dn_fsk *)_o; + int n_queue = 0; /* number of queues */ + + n_queue = (fs->fs.flags & DN_HAVE_MASK ? dn_ht_entries(fs->qht) : + (fs->qht ? 1 : 0)); + + need = fs_size + queue_size * n_queue; + if (have < need) { + D("have < need"); + return 1; + } + + /* copy flowset */ + dn_c_copy_fs(fs, a, n_queue); + + /* copy queues */ + if (fs->fs.flags & DN_HAVE_MASK) + dn_ht_scan(fs->qht, dn_c_copy_q, a); + else if (fs->qht) + dn_c_copy_q(fs->qht, a); + + return 0; +} + +int +copy_data_helper_compat(void *_o, void *_arg) +{ + struct copy_args *a = _arg; + + if (a->type == DN_COMPAT_PIPE) { + struct dn_schk *s = _o; + if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) { + return 0; /* not old type */ + } + /* copy pipe parameters, and if instance exists, copy + * other parameters and eventually queues. + */ + if(dn_compat_copy_pipe(a, _o)) + return DNHT_SCAN_END; + } else if (a->type == DN_COMPAT_QUEUE) { + struct dn_fsk *fs = _o; + if (fs->fs.fs_nr >= DN_MAX_ID) + return 0; + if (dn_compat_copy_queue(a, _o)) + return DNHT_SCAN_END; + } + return 0; +} + +/* Main function to manage old requests */ +int +ip_dummynet_compat(struct sockopt *sopt) +{ + int error=0; + void *v = NULL; + struct dn_id oid; + + /* Lenght of data, used to found ipfw version... 
*/ + int len = sopt->sopt_valsize; + + /* len can be 0 if command was dummynet_flush */ + if (len == pipesize7) { + D("setting compatibility with FreeBSD 7.2"); + is7 = 1; + } + else if (len == pipesize8 || len == pipesizemax8) { + D("setting compatibility with FreeBSD 8"); + is7 = 0; + } + + switch (sopt->sopt_name) { + default: + printf("dummynet: -- unknown option %d", sopt->sopt_name); + error = EINVAL; + break; + + case IP_DUMMYNET_FLUSH: + oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); + do_config(&oid, oid.len); + break; + + case IP_DUMMYNET_DEL: + v = malloc(len, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, v, len, len); + if (error) + break; + error = dn_compat_del(v); + free(v, M_TEMP); + break; + + case IP_DUMMYNET_CONFIGURE: + v = malloc(len, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, v, len, len); + if (error) + break; + error = dn_compat_configure(v); + free(v, M_TEMP); + break; + + case IP_DUMMYNET_GET: { + void *buf; + int ret; + int original_size = sopt->sopt_valsize; + int size; + + ret = dummynet_get(sopt, &buf); + if (ret) + return 0;//XXX ? + size = sopt->sopt_valsize; + sopt->sopt_valsize = original_size; + D("size=%d, buf=%p", size, buf); + ret = sooptcopyout(sopt, buf, size); + if (ret) + printf(" %s ERROR sooptcopyout\n", __FUNCTION__); + if (buf) + free(buf, M_DUMMYNET); + } + } + + return error; +} + + diff --git a/sys/netinet/ipfw/ip_dn_io.c b/sys/netinet/ipfw/ip_dn_io.c new file mode 100644 index 0000000..fd0dbeb --- /dev/null +++ b/sys/netinet/ipfw/ip_dn_io.c @@ -0,0 +1,962 @@ +/*- + * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Dummynet portions related to packet handling. + */ +#include +__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_dn_io.c 203321 2010-01-31 21:39:25Z luigi $"); + +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include +#include + +#include +#include /* ip_len, ip_off */ +#include /* ip_output(), IP_FORWARDING */ +#include +#include +#include +#include +#include +#include + +#include /* various ether_* routines */ + +#include /* for ip6_input, ip6_output prototypes */ +#include + +/* + * We keep a private variable for the simulation time, but we could + * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) + * instead of dn_cfg.curr_time + */ + +struct dn_parms dn_cfg; +//VNET_DEFINE(struct dn_parms, _base_dn_cfg); + +static long tick_last; /* Last tick duration (usec). 
*/ +static long tick_delta; /* Last vs standard tick diff (usec). */ +static long tick_delta_sum; /* Accumulated tick difference (usec).*/ +static long tick_adjustment; /* Tick adjustments done. */ +static long tick_lost; /* Lost(coalesced) ticks number. */ +/* Adjusted vs non-adjusted curr_time difference (ticks). */ +static long tick_diff; + +static unsigned long io_pkt; +static unsigned long io_pkt_fast; +static unsigned long io_pkt_drop; + +/* + * We use a heap to store entities for which we have pending timer events. + * The heap is checked at every tick and all entities with expired events + * are extracted. + */ + +MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); + +extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *); + +#ifdef SYSCTL_NODE + +SYSBEGIN(f4) + +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); + +/* wrapper to pass dn_cfg fields to SYSCTL_* */ +//#define DC(x) (&(VNET_NAME(_base_dn_cfg).x)) +#define DC(x) (&(dn_cfg.x)) +/* parameters */ +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size, + CTLFLAG_RW, DC(hash_size), 0, "Default hash table size"); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit, + CTLFLAG_RW, DC(slot_limit), 0, + "Upper limit in slots for pipe queue."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit, + CTLFLAG_RW, DC(byte_limit), 0, + "Upper limit in bytes for pipe queue."); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast, + CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io."); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, + CTLFLAG_RW, DC(debug), 0, "Dummynet debug level"); + +/* RED parameters */ +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, + CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, + CTLFLAG_RD, DC(red_avg_pkt_size), 0, "RED Medium packet size"); +SYSCTL_INT(_net_inet_ip_dummynet, 
OID_AUTO, red_max_pkt_size, + CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size"); + +/* time adjustment */ +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta, + CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec)."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum, + CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec)."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment, + CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff, + CTLFLAG_RD, &tick_diff, 0, + "Adjusted vs non-adjusted curr_time difference (ticks)."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost, + CTLFLAG_RD, &tick_lost, 0, + "Number of ticks coalesced by dummynet taskqueue."); + +/* Drain parameters */ +SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire, + CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes"); +SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle, + CTLFLAG_RD, DC(expire_cycle), 0, "Expire cycle for queues/pipes"); +SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_object, + CTLFLAG_RW, DC(expire_object), 0, "Min # of objects before start drain routine"); +SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, object_idle_tick, + CTLFLAG_RD, DC(object_idle_tick), 0, "Time (in ticks) to cosiderer an object as idle"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, drain_ratio, + CTLFLAG_RD, DC(drain_ratio), 0, "% of dummynet_task() to dedicate to drain routine"); + +/* statistics */ +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count, + CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count, + CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count, + CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count, + CTLFLAG_RD, DC(queue_count), 0, "Number of queues"); 
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt, + CTLFLAG_RD, &io_pkt, 0, + "Number of packets passed to dummynet."); +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast, + CTLFLAG_RD, &io_pkt_fast, 0, + "Number of packets bypassed dummynet scheduler."); +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop, + CTLFLAG_RD, &io_pkt_drop, 0, + "Number of packets dropped by dummynet."); +#undef DC +SYSEND + +#endif + +static void dummynet_send(struct mbuf *); + +/* + * Packets processed by dummynet have an mbuf tag associated with + * them that carries their dummynet state. + * Outside dummynet, only the 'rule' field is relevant, and it must + * be at the beginning of the structure. + */ +struct dn_pkt_tag { + struct ipfw_rule_ref rule; /* matching rule */ + + /* second part, dummynet specific */ + int dn_dir; /* action when packet comes out.*/ + /* see ip_fw_private.h */ + uint64_t output_time; /* when the pkt is due for delivery*/ + struct ifnet *ifp; /* interface, for ip_output */ + struct _ip6dn_args ip6opt; /* XXX ipv6 options */ +}; + +/* + * Return the mbuf tag holding the dummynet state (it should + * be the first one on the list). + */ +static struct dn_pkt_tag * +dn_tag_get(struct mbuf *m) +{ + struct m_tag *mtag = m_tag_first(m); + KASSERT(mtag != NULL && + mtag->m_tag_cookie == MTAG_ABI_COMPAT && + mtag->m_tag_id == PACKET_TAG_DUMMYNET, + ("packet on dummynet queue w/o dummynet tag!")); + return (struct dn_pkt_tag *)(mtag+1); +} + +static inline void +mq_append(struct mq *q, struct mbuf *m) +{ + if (q->head == NULL) + q->head = m; + else + q->tail->m_nextpkt = m; + q->tail = m; + m->m_nextpkt = NULL; +} + +/* + * Dispose a list of packet. Use a functions so if we need to do + * more work, this is a central point to do it. 
+ */
+void dn_free_pkts(struct mbuf *mnext)	/* mnext: first packet of an m_nextpkt-linked list, may be NULL */
+{
+	struct mbuf *m;
+
+	while ((m = mnext) != NULL) {
+		mnext = m->m_nextpkt;	/* fetch the link before m is released */
+		FREE_PKT(m);
+	}
+}
+
+static int
+red_drops (struct dn_queue *q, int len)
+{
+	/*
+	 * RED algorithm
+	 *
+	 * Decide whether the packet being enqueued on q must be dropped.
+	 * q:   the queue; the RED parameters are read from q->fs and the
+	 *      state kept in q->avg, q->count and q->random is updated.
+	 * len: length of the packet; it is used only when the queue size
+	 *      is measured in bytes (DN_QSIZE_BYTES).
+	 * Returns 1 if the packet must be dropped, 0 if it is accepted.
+	 *
+	 * RED calculates the average queue size (avg) using a low-pass filter
+	 * with an exponential weighted (w_q) moving average:
+	 * 	avg  <-  (1-w_q) * avg + w_q * q_size
+	 * where q_size is the queue length (measured in bytes or packets).
+	 *
+	 * If q_size == 0, we compute the idle time for the link, and set
+	 *	avg = (1 - w_q)^(idle/s)
+	 * where s is the time needed for transmitting a medium-sized packet.
+	 *
+	 * Now, if avg < min_th the packet is enqueued.
+	 * If avg > max_th the packet is dropped. Otherwise, the packet is
+	 * dropped with probability P function of avg.
+	 */
+
+	struct dn_fsk *fs = q->fs;
+	int64_t p_b = 0;	/* drop probability, same scaling as avg */
+
+	/* Queue in bytes or packets? */
+	uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ?
+	    q->ni.len_bytes : q->ni.length;
+
+	/* Average queue size estimation. */
+	if (q_size != 0) {
+		/* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
+		int diff = SCALE(q_size) - q->avg;
+		int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
+
+		q->avg += (int)v;
+	} else {
+		/*
+		 * Queue is empty, find for how long the queue has been
+		 * empty and use a lookup table for computing
+		 * (1 - w_q)^(idle_time/s) where s is the time to send a
+		 * (small) packet.
+		 * An idle time beyond the depth of the table resets
+		 * the average to 0.
+		 * XXX check wraps...
+		 */
+		if (q->avg) {
+			u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step);
+
+			q->avg = (t < fs->lookup_depth) ?
+			    SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
+		}
+	}
+
+	/* Should I drop? */
+	if (q->avg < fs->min_th) {
+		q->count = -1;	/* next call draws a fresh random threshold */
+		return (0);	/* accept packet */
+	}
+	if (q->avg >= fs->max_th) {	/* average queue >=  max threshold */
+		if (fs->fs.flags & DN_IS_GENTLE_RED) {
+			/*
+			 * According to Gentle-RED, if avg is greater than
+			 * max_th the packet is dropped with a probability
+			 *	 p_b = c_3 * avg - c_4
+			 * where c_3 = (1 - max_p) / max_th
+			 *       c_4 = 1 - 2 * max_p
+			 */
+			p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
+			    fs->c_4;
+		} else {
+			q->count = -1;
+			return (1);	/* plain RED: always drop above max_th */
+		}
+	} else if (q->avg > fs->min_th) {
+		/*
+		 * We compute p_b using the linear dropping function
+		 *	 p_b = c_1 * avg - c_2
+		 * where c_1 = max_p / (max_th - min_th)
+		 * 	 c_2 = max_p * min_th / (max_th - min_th)
+		 */
+		p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
+	}
+
+	if (fs->fs.flags & DN_QSIZE_BYTES)	/* weight p_b by the packet size */
+		p_b = div64((p_b * len) , fs->max_pkt_size);
+	if (++q->count == 0)	/* first packet after count was reset to -1 */
+		q->random = random() & 0xffff;
+	else {
+		/*
+		 * q->count counts packets arrived since last drop, so a greater
+		 * value of q->count means a greater packet drop probability.
+		 */
+		if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
+			q->count = 0;
+			/* After a drop we calculate a new random value. */
+			q->random = random() & 0xffff;
+			return (1);	/* drop */
+		}
+	}
+	/* End of RED algorithm. */
+
+	return (0);	/* accept */
+
+}
+
+/*
+ * Enqueue a packet in q, subject to space and queue management policy
+ * (whose parameters are in q->fs).
+ * Update stats for the queue and the scheduler.
+ * Return 0 on success, 1 on drop. The packet is consumed anyways.
+ */ +int +dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) +{ + struct dn_fs *f; + struct dn_flow *ni; /* stats for scheduler instance */ + uint64_t len; + + if (q->fs == NULL || q->_si == NULL) { + printf("%s fs %p si %p, dropping\n", + __FUNCTION__, q->fs, q->_si); + FREE_PKT(m); + return 1; + } + f = &(q->fs->fs); + ni = &q->_si->ni; + len = m->m_pkthdr.len; + /* Update statistics, then check reasons to drop pkt. */ + q->ni.tot_bytes += len; + q->ni.tot_pkts++; + ni->tot_bytes += len; + ni->tot_pkts++; + if (drop) + goto drop; + if (f->plr && random() < f->plr) + goto drop; + if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) + goto drop; + if (f->flags & DN_QSIZE_BYTES) { + if (q->ni.len_bytes > f->qsize) + goto drop; + } else if (q->ni.length >= f->qsize) { + goto drop; + } + mq_append(&q->mq, m); + if (q->ni.length == 0) { /* queue was idle */ + dn_cfg.idle_queue--; + if (ni->length == 0) /* scheduler was idle */ + dn_cfg.idle_si--; + } + q->ni.length++; + q->ni.len_bytes += len; + ni->length++; + ni->len_bytes += len; + return 0; + +drop: + io_pkt_drop++; + q->ni.drops++; + ni->drops++; + FREE_PKT(m); + return 1; +} + +/* + * Fetch packets from the delay line which are due now. If there are + * leftover packets, reinsert the delay line in the heap. + * Runs under scheduler lock. + */ +static void +transmit_event(struct mq *q, struct delay_line *dline, uint64_t now) +{ + struct mbuf *m; + struct dn_pkt_tag *pkt = NULL; + + dline->oid.subtype = 0; /* not in heap */ + while ((m = dline->mq.head) != NULL) { + pkt = dn_tag_get(m); + if (!DN_KEY_LEQ(pkt->output_time, now)) + break; + dline->mq.head = m->m_nextpkt; + mq_append(q, m); + } + if (m != NULL) { + dline->oid.subtype = 1; /* in heap */ + heap_insert(&dn_cfg.evheap, pkt->output_time, dline); + } +} + +/* + * Convert the additional MAC overheads/delays into an equivalent + * number of bits for the given data rate. The samples are + * in milliseconds so we need to divide by 1000. 
+ */ +static uint64_t +extra_bits(struct mbuf *m, struct dn_schk *s) +{ + int index; + uint64_t bits; + struct dn_profile *pf = s->profile; + + if (!pf || pf->samples_no == 0) + return 0; + index = random() % pf->samples_no; + bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000); + if (index >= pf->loss_level) { + struct dn_pkt_tag *dt = dn_tag_get(m); + if (dt) + dt->dn_dir = DIR_DROP; + } + return bits; +} + +/* + * Send traffic from a scheduler instance due by 'now'. + * Return a pointer to the head of the queue. + */ +static struct mbuf * +serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now) +{ + struct mq def_q; + struct dn_schk *s = si->sched; + struct mbuf *m = NULL; + int delay_line_idle = (si->dline.mq.head == NULL); + int done, bw; + + if (q == NULL) { + q = &def_q; + q->head = NULL; + } + + bw = s->link.bandwidth; + si->kflags &= ~DN_ACTIVE; + + if (bw > 0) + si->credit += (now - si->sched_time) * bw; + else + si->credit = 0; + si->sched_time = now; + done = 0; + while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) { + uint64_t len_scaled; + + /* + * Some schedulers might want wake up the scheduler later. + * To suppor this the caller returns an mbuf with len < 0 + * this will result in a new wake up of the scheduler + * instance between m->m_pkthdr.len ticks. + */ + if (m->m_pkthdr.len < 0) { + si->kflags |= DN_ACTIVE; + heap_insert(&dn_cfg.evheap, now - m->m_pkthdr.len, si); + if (delay_line_idle && done) + transmit_event(q, &si->dline, now); + return NULL; + } + + /* a regular mbuf received */ + done++; + len_scaled = (bw == 0) ? 0 : hz * + (m->m_pkthdr.len * 8 + extra_bits(m, s)); + si->credit -= len_scaled; + /* Move packet in the delay line */ + dn_tag_get(m)->output_time = dn_cfg.curr_time + s->link.delay; + mq_append(&si->dline.mq, m); + } + + /* + * If credit >= 0 the instance is idle, mark time. 
+ * Otherwise put back in the heap, and adjust the output + * time of the last inserted packet, m, which was too early. + */ + if (si->credit >= 0) { + si->idle_time = now; + } else { + uint64_t t; + KASSERT (bw > 0, ("bw=0 and credit<0 ?")); + t = div64(bw - 1 - si->credit, bw); + if (m) + dn_tag_get(m)->output_time += t; + si->kflags |= DN_ACTIVE; + heap_insert(&dn_cfg.evheap, now + t, si); + } + if (delay_line_idle && done) + transmit_event(q, &si->dline, now); + return q->head; +} + +/* + * Support function to read the TSC (or equivalent). We use this + * high resolution timer to adapt the amount of work done for + * expiring the clock. + * Supports Linux and FreeBSD both i386 and amd64 platform + * Supports OpenWRT mips architecture + * + * SMP no special works is needed in + * - In linux 2.6 timers will always run in the same cpu that have added it.See + * (http://book.opensourceproject.org.cn/kernel/kernel3rd/opensource/0596005652/understandlk-chp-6-sect-5.html) + * - FreeBSD8 has a new callout_reset_on() with specify the cpu on which + * the timer must be run + * - Windows runs dummynet_task() on cpu0. + * + * - Linux 2.4 doesn't assure to run a timer in the same cpu every time. + */ +#ifdef HAVE_TSC +uint64_t +readTSC (void) +{ + uint64_t a=0; + +#ifdef __linux__ + /* Linux and openwrt have a macro to read the tsc for i386 and + * amd64. + * Openwrt have patched the kernel and allow use of tsc with mips + * and other platforms + * rdtscll() is a macro defined in include/asm-xxx/msr.h, + * where xxx is the architecture (x86, mips). + */ + rdtscll(a); +#elif defined(_WIN32) + /* Microsoft recommends the use of KeQueryPerformanceCounter() + * insteead of rdtsc(). + */ + KeQueryPerformanceCounter((PLARGE_INTEGER)&a); //XXX not tested! +#elif defined(__FreeBSD__) + /* FreeBSD (i386/amd64) has macro rdtsc() defined in machine/cpufunc.h. 
+ * We could use the macro instead of explicity assembly XXX + */ + return rdtsc(); +#endif + return a; +} +#endif /* HAVE_TSC */ + +/* + * compute avg task period. + * We could do something more complex, possibly. + */ +static void +do_update_cycle(void) +{ +#ifdef HAVE_TSC + uint64_t tmp = readTSC(); +#if defined (LINUX_24) && defined(CONFIG_SMP) + /* on LINUX24 and SMP, we have no guarantees on which cpu runs + * the timer callbacks. If the difference between new and + * old value is negative, we assume that the values come from + * different cpus so we adjust 'new' accordingly. + */ + if (tmp <= dn_cfg.cycle_task_new) + dn_cfg.cycle_task_new = tmp - dn_cfg.cycle_task; +#endif /* !(linux24 && SMP) */ + dn_cfg.cycle_task_old = dn_cfg.cycle_task_new; + dn_cfg.cycle_task_new = tmp; + dn_cfg.cycle_task = dn_cfg.cycle_task_new - dn_cfg.cycle_task_old; + + /* Update the average + * avg = (2^N * avg + new - avg ) / 2^N * avg + * N==4 seems to be a good compromise between clock clock change + * and 'spurious' cycle_task value + */ +#define DN_N 4 + dn_cfg.cycle_task_avg = (dn_cfg.cycle_task_avg << DN_N) + + dn_cfg.cycle_task - dn_cfg.cycle_task_avg; + dn_cfg.cycle_task_avg = dn_cfg.cycle_task_avg >> DN_N; +#undef DN_N + +#endif /* HAVE_TSC */ +} + +static void +do_drain(void) +{ +#ifdef HAVE_TSC + uint64_t dt_max; +#endif + if (!dn_cfg.expire || ++dn_cfg.expire_cycle < dn_cfg.expire) + return; + /* It's time to check if drain routines should be called */ + dn_cfg.expire_cycle = 0; + + dn_cfg.idle_queue_wait = 0; + dn_cfg.idle_si_wait = 0; + /* Do a drain cycle even if there isn't time to do it */ +#ifdef HAVE_TSC + dt_max = dn_cfg.cycle_task_avg * dn_cfg.drain_ratio; +#endif + for (;;) { + int done = 0; + + if (dn_cfg.idle_queue > dn_cfg.expire_object && + dn_cfg.idle_queue_wait < dn_cfg.idle_queue) { + dn_drain_queue(); + done = 1; + } + if (dn_cfg.idle_si > dn_cfg.expire_object && + dn_cfg.idle_si_wait < dn_cfg.idle_si) { + dn_drain_scheduler(); + done = 1; + } + /* 
time to end ? */ +#ifndef HAVE_TSC + /* If tsc does not exist, do only one drain cycle and exit */ + break; +#else + /* Exit when nothing was done or we have consumed all time */ + if ( (done == 0) || + ((readTSC() - dn_cfg.cycle_task_new) * 100 > dt_max) ) + break; +#endif /* HAVE_TSC */ + } +} + +/* + * The timer handler for dummynet. Time is computed in ticks, but + * but the code is tolerant to the actual rate at which this is called. + * Once complete, the function reschedules itself for the next tick. + */ +void +dummynet_task(void *context, int pending) +{ + struct timeval t; + struct mq q = { NULL, NULL }; /* queue to accumulate results */ + + CURVNET_SET((struct vnet *)context); + + do_update_cycle(); /* compute avg. tick duration */ + + DN_BH_WLOCK(); + + /* Update number of lost(coalesced) ticks. */ + tick_lost += pending - 1; + + getmicrouptime(&t); + /* Last tick duration (usec). */ + tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 + + (t.tv_usec - dn_cfg.prev_t.tv_usec); + /* Last tick vs standard tick difference (usec). */ + tick_delta = (tick_last * hz - 1000000) / hz; + /* Accumulated tick difference (usec). */ + tick_delta_sum += tick_delta; + + dn_cfg.prev_t = t; + + /* + * Adjust curr_time if the accumulated tick difference is + * greater than the 'standard' tick. Since curr_time should + * be monotonically increasing, we do positive adjustments + * as required, and throttle curr_time in case of negative + * adjustment. 
+ */ + dn_cfg.curr_time++; + if (tick_delta_sum - tick >= 0) { + int diff = tick_delta_sum / tick; + + dn_cfg.curr_time += diff; + tick_diff += diff; + tick_delta_sum %= tick; + tick_adjustment++; + } else if (tick_delta_sum + tick <= 0) { + dn_cfg.curr_time--; + tick_diff--; + tick_delta_sum += tick; + tick_adjustment++; + } + + /* serve pending events, accumulate in q */ + for (;;) { + struct dn_id *p; /* generic parameter to handler */ + + if (dn_cfg.evheap.elements == 0 || + DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key)) + break; + p = HEAP_TOP(&dn_cfg.evheap)->object; + heap_extract(&dn_cfg.evheap, NULL); + + if (p->type == DN_SCH_I) { + serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time); + } else { /* extracted a delay line */ + transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time); + } + } + do_drain(); + + DN_BH_WUNLOCK(); + dn_reschedule(); + if (q.head != NULL) + dummynet_send(q.head); + CURVNET_RESTORE(); +} + +/* + * forward a chain of packets to the proper destination. + * This runs outside the dummynet lock. + */ +static void +dummynet_send(struct mbuf *m) +{ + struct mbuf *n; + + for (; m != NULL; m = n) { + struct ifnet *ifp = NULL; /* gcc 3.4.6 complains */ + struct m_tag *tag; + int dst; + + n = m->m_nextpkt; + m->m_nextpkt = NULL; + tag = m_tag_first(m); + if (tag == NULL) { /* should not happen */ + dst = DIR_DROP; + } else { + struct dn_pkt_tag *pkt = dn_tag_get(m); + /* extract the dummynet info, rename the tag + * to carry reinject info. 
+ */ + dst = pkt->dn_dir; + ifp = pkt->ifp; + tag->m_tag_cookie = MTAG_IPFW_RULE; + tag->m_tag_id = 0; + } + + switch (dst) { + case DIR_OUT: + SET_HOST_IPLEN(mtod(m, struct ip *)); + ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); + break ; + + case DIR_IN : + /* put header in network format for ip_input() */ + //SET_NET_IPLEN(mtod(m, struct ip *)); + netisr_dispatch(NETISR_IP, m); + break; + +#ifdef INET6 + case DIR_IN | PROTO_IPV6: + netisr_dispatch(NETISR_IPV6, m); + break; + + case DIR_OUT | PROTO_IPV6: + SET_HOST_IPLEN(mtod(m, struct ip *)); + ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL); + break; +#endif + + case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */ + if (bridge_dn_p != NULL) + ((*bridge_dn_p)(m, ifp)); + else + printf("dummynet: if_bridge not loaded\n"); + + break; + + case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */ + /* + * The Ethernet code assumes the Ethernet header is + * contiguous in the first mbuf header. + * Insure this is true. + */ + if (m->m_len < ETHER_HDR_LEN && + (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { + printf("dummynet/ether: pullup failed, " + "dropping packet\n"); + break; + } + ether_demux(m->m_pkthdr.rcvif, m); + break; + + case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */ + ether_output_frame(ifp, m); + break; + + case DIR_DROP: + /* drop the packet after some time */ + FREE_PKT(m); + break; + + default: + printf("dummynet: bad switch %d!\n", dst); + FREE_PKT(m); + break; + } + } +} + +static inline int +tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa) +{ + struct dn_pkt_tag *dt; + struct m_tag *mtag; + + mtag = m_tag_get(PACKET_TAG_DUMMYNET, + sizeof(*dt), M_NOWAIT | M_ZERO); + if (mtag == NULL) + return 1; /* Cannot allocate packet header. */ + m_tag_prepend(m, mtag); /* Attach to mbuf chain. 
*/ + dt = (struct dn_pkt_tag *)(mtag + 1); + dt->rule = fwa->rule; + dt->rule.info &= IPFW_ONEPASS; /* only keep this info */ + dt->dn_dir = dir; + dt->ifp = fwa->oif; + /* dt->output tame is updated as we move through */ + dt->output_time = dn_cfg.curr_time; + return 0; +} + + +/* + * dummynet hook for packets. + * We use the argument to locate the flowset fs and the sched_set sch + * associated to it. The we apply flow_mask and sched_mask to + * determine the queue and scheduler instances. + * + * dir where shall we send the packet after dummynet. + * *m0 the mbuf with the packet + * ifp the 'ifp' parameter from the caller. + * NULL in ip_input, destination interface in ip_output, + */ +int +dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa) +{ + struct mbuf *m = *m0; + struct dn_fsk *fs = NULL; + struct dn_sch_inst *si; + struct dn_queue *q = NULL; /* default */ + + int fs_id = (fwa->rule.info & IPFW_INFO_MASK) + + ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0); + DN_BH_WLOCK(); + io_pkt++; + /* we could actually tag outside the lock, but who cares... */ + if (tag_mbuf(m, dir, fwa)) + goto dropit; + if (dn_cfg.busy) { + /* if the upper half is busy doing something expensive, + * lets queue the packet and move forward + */ + mq_append(&dn_cfg.pending, m); + m = *m0 = NULL; /* consumed */ + goto done; /* already active, nothing to do */ + } + /* XXX locate_flowset could be optimised with a direct ref. */ + fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL); + if (fs == NULL) + goto dropit; /* This queue/pipe does not exist! */ + if (fs->sched == NULL) /* should not happen */ + goto dropit; + /* + * If the scheduler supports multiple queues, find the right one + * (otherwise it will be ignored by enqueue). + */ + if (fs->sched->fp->flags & DN_MULTIQUEUE) { + q = ipdn_q_find(fs, &(fwa->f_id)); + if (q == NULL) + goto dropit; + /* The scheduler instance lookup is done only for new queue. 
+ * The callback q_new() will create the scheduler instance + * if needed. + */ + si = q->_si; + } else + si = ipdn_si_find(fs->sched, &(fwa->f_id)); + + if (si == NULL) + goto dropit; + if (fs->sched->fp->enqueue(si, q, m)) { + /* packet was dropped by enqueue() */ + m = *m0 = NULL; + goto dropit; + } + + if (si->kflags & DN_ACTIVE) { + m = *m0 = NULL; /* consumed */ + goto done; /* already active, nothing to do */ + } + + /* compute the initial allowance */ + if (si->idle_time < dn_cfg.curr_time) { + /* Do this only on the first packet on an idle pipe */ + struct dn_link *p = &fs->sched->link; + + si->sched_time = dn_cfg.curr_time; + si->credit = dn_cfg.io_fast ? p->bandwidth : 0; + if (p->burst) { + uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth; + if (burst > p->burst) + burst = p->burst; + si->credit += burst; + } + } + /* pass through scheduler and delay line */ + m = serve_sched(NULL, si, dn_cfg.curr_time); + + /* optimization -- pass it back to ipfw for immediate send */ + /* XXX Don't call dummynet_send() if scheduler return the packet + * just enqueued. This avoid a lock order reversal. + * + */ + if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) { + /* fast io, rename the tag * to carry reinject info. */ + struct m_tag *tag = m_tag_first(m); + + tag->m_tag_cookie = MTAG_IPFW_RULE; + tag->m_tag_id = 0; + io_pkt_fast++; + if (m->m_nextpkt != NULL) { + printf("dummynet: fast io: pkt chain detected!\n"); + m->m_nextpkt = NULL; + } + m = NULL; + } else { + *m0 = NULL; + } +done: + DN_BH_WUNLOCK(); + if (m) + dummynet_send(m); + return 0; + +dropit: + io_pkt_drop++; + DN_BH_WUNLOCK(); + if (m) + FREE_PKT(m); + *m0 = NULL; + return (fs && (fs->fs.flags & DN_NOERROR)) ? 
0 : ENOBUFS; +} diff --git a/sys/netinet/ipfw/ip_dn_private.h b/sys/netinet/ipfw/ip_dn_private.h new file mode 100644 index 0000000..ecb4fe2 --- /dev/null +++ b/sys/netinet/ipfw/ip_dn_private.h @@ -0,0 +1,419 @@ +/*- + * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * internal dummynet APIs. + * + * $FreeBSD: head/sys/netinet/ipfw/ip_dn_private.h 204591 2010-03-02 17:40:48Z luigi $ + */ + +#ifndef _IP_DN_PRIVATE_H +#define _IP_DN_PRIVATE_H + +/* debugging support + * use ND() to remove debugging, D() to print a line, + * DX(level, ...) to print above a certain level + * If you redefine D() you are expected to redefine all. 
+ */
+#ifndef D
+#define ND(fmt, ...) do {} while (0)
+#define D1(fmt, ...) do {} while (0)
+#define D(fmt, ...) printf("%-10s " fmt "\n", \
+	__FUNCTION__, ## __VA_ARGS__)
+#define DX(lev, fmt, ...) do { \
+	if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0)
+#endif
+
+MALLOC_DECLARE(M_DUMMYNET);
+
+/* Unless building for Linux, div64() is a plain signed 64-bit division. */
+#ifndef __linux__
+#define div64(a, b) ((int64_t)(a) / (int64_t)(b))
+#endif
+
+/* Both mutexes live in dn_cfg and are created/destroyed together. */
+#define DN_LOCK_INIT() do { \
+	mtx_init(&dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF); \
+	mtx_init(&dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF); \
+	} while (0)
+#define DN_LOCK_DESTROY() do { \
+	mtx_destroy(&dn_cfg.uh_mtx); \
+	mtx_destroy(&dn_cfg.bh_mtx); \
+	} while (0)
+#if 0 /* not used yet */
+#define DN_UH_RLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_UH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_UH_WLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_UH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_UH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED)
+#endif
+
+/*
+ * The read and write variants below expand to the same mtx_lock() /
+ * mtx_unlock() calls.
+ * NOTE(review): every DN_BH_* macro operates on uh_mtx; bh_mtx is
+ * initialised and destroyed above but is not locked by any macro in
+ * this header. Confirm that this is intended.
+ */
+#define DN_BH_RLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_BH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_BH_WLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_BH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_BH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED)
+
+SLIST_HEAD(dn_schk_head, dn_schk);
+SLIST_HEAD(dn_sch_inst_head, dn_sch_inst);
+SLIST_HEAD(dn_fsk_head, dn_fsk);
+SLIST_HEAD(dn_queue_head, dn_queue);
+SLIST_HEAD(dn_alg_head, dn_alg);
+
+struct mq { /* a basic queue of packets*/
+	struct mbuf *head, *tail;
+};
+
+/*
+ * Fill in an object header: store 'type' and 'len' and reset the
+ * subtype to 0.
+ */
+static inline void
+set_oid(struct dn_id *o, int type, int len)
+{
+	o->type = type;
+	o->len = len;
+	o->subtype = 0;
+}
+
+uint64_t readTSC (void);
+/*
+ * see if tsc (or other timer) is supported.
+ * - FreeBSD has rdtsc macro for i386 and amd64 + * - Linux has rdtscll and/or rdtsc (also for openWRT patched kernel source) + * - Windows has KeQueryPerformanceCounter() function that use tsc or other + * timer + */ +#if defined(rdtscll) || defined(rdtsc) || defined(_WIN32) +#define HAVE_TSC +#endif +/* + * configuration and global data for a dummynet instance + * + * When a configuration is modified from userland, 'id' is incremented + * so we can use the value to check for stale pointers. + */ +struct dn_parms { + uint32_t id; /* configuration version */ + + /* defaults (sysctl-accessible) */ + int red_lookup_depth; + int red_avg_pkt_size; + int red_max_pkt_size; + int hash_size; + int max_hash_size; + long byte_limit; /* max queue sizes */ + long slot_limit; + + int io_fast; + int debug; + + /* timekeeping */ + struct timeval prev_t; /* last time dummynet_tick ran */ + struct dn_heap evheap; /* scheduled events */ + + /* counters of objects -- used for reporting space */ + int schk_count; + int si_count; + int fsk_count; + int queue_count; + + /* ticks and other stuff */ + uint64_t curr_time; /* in ticks */ + + /* + * Variables to manage the time spent in the drain routines. + * max_drain is max the fraction of a tick (0..100) to be used + * for draining. + * We also need some variables to store the average number of + * timecounter ticks between calls to the periodic task, etc. + */ + int drain_ratio; + uint64_t cycle_task_new; /* TSC when dummynet_task() starts */ + uint64_t cycle_task_old; /* TSC when prev. dummynet_task() starts */ + uint64_t cycle_task; + uint64_t cycle_task_avg; /* Moving average of cicle_task */ + + /* flowsets and schedulers are in hash tables, with 'hash_size' + * buckets. fshash is looked up at every packet arrival + * so better be generous if we expect many entries. 
+ */ + struct dn_ht *fshash; + struct dn_ht *schedhash; + /* list of flowsets without a scheduler -- use sch_chain */ + struct dn_fsk_head fsu; /* list of unlinked flowsets */ + struct dn_alg_head schedlist; /* list of algorithms */ + + /* Counter of idle objects -- used by drain routine + * We scan when idle_queue (or idle_si) > expire_object. + * The drain routine is called every 'expire' cycles (the counter + * used is expire_cycle). + * We can disable the expire routine by setting expire to 0. + * An object is kept alive for at least object_idle_tick after it + * becomes idle. During the scan, we count the number of objects + * that are idle but not ready in 'idle_si_wait' and 'idle_queue_wait' + */ + int idle_queue; + int idle_queue_wait; /* idle but not expired yet */ + int idle_si; + int idle_si_wait; /* idle but not expired yet */ + uint32_t expire_object; /* threshold for expires */ + uint32_t expire; /* how often to expire */ + uint32_t expire_cycle; + uint32_t object_idle_tick; /* lifetime of objs */ + uint32_t expire_object_examined; /* Burst of object examined */ + + /* drain_fs and drain_sch point to the next bucket to scan when + * draining. + */ + uint32_t drain_fs; + uint32_t drain_sch; + + int init_done; + + /* if the upper half is busy doing something long, + * can set the busy flag and we will enqueue packets in + * a queue for later processing. + */ + int busy; + struct mq pending; + +#ifdef _KERNEL + /* + * This file is normally used in the kernel, unless we do + * some userland tests, in which case we do not need a mtx. + * uh_mtx arbitrates between system calls and also + * protects fshash, schedhash and fsunlinked. + * These structures are readonly for the lower half. 
+ * bh_mtx protects all other structures which may be + * modified upon packet arrivals + */ +#if defined( __linux__ ) || defined( _WIN32 ) + spinlock_t uh_mtx; + spinlock_t bh_mtx; +#else + struct mtx uh_mtx; + struct mtx bh_mtx; +#endif + +#endif /* _KERNEL */ +}; + +/* + * Delay line, contains all packets on output from a link. + * Every scheduler instance has one. + */ +struct delay_line { + struct dn_id oid; + struct dn_sch_inst *si; + struct mq mq; +}; + +/* + * The kernel side of a flowset. It is linked in a hash table + * of flowsets, and in a list of children of their parent scheduler. + * qht is either the queue or (if HAVE_MASK) a hash table queues. + * Note that the mask to use is the (flow_mask|sched_mask), which + * changes as we attach/detach schedulers. So we store it here. + * + * XXX If we want to add scheduler-specific parameters, we need to + * put them in external storage because the scheduler may not be + * available when the fsk is created. + */ +struct dn_fsk { /* kernel side of a flowset */ + struct dn_fs fs; + SLIST_ENTRY(dn_fsk) fsk_next; /* hash chain for fshash */ + + struct ipfw_flow_id fsk_mask; + + /* qht is a hash table of queues, or just a single queue + * a bit in fs.flags tells us which one + */ + struct dn_ht *qht; + struct dn_schk *sched; /* Sched we are linked to */ + SLIST_ENTRY(dn_fsk) sch_chain; /* list of fsk attached to sched */ + + /* bucket index used by drain routine to drain queues for this + * flowset + */ + int drain_bucket; + /* Parameter realted to RED / GRED */ + /* original values are in dn_fs*/ + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ + + u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ + u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ + u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ + u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ 
+ u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ + u_int lookup_depth ; /* depth of lookup table */ + int lookup_step ; /* granularity inside the lookup table */ + int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ + int avg_pkt_size ; /* medium packet size */ + int max_pkt_size ; /* max packet size */ +}; + +/* + * A queue is created as a child of a flowset unless it belongs to + * a !MULTIQUEUE scheduler. It is normally in a hash table in the + * flowset. fs always points to the parent flowset. + * si normally points to the sch_inst, unless the flowset has been + * detached from the scheduler -- in this case si == NULL and we + * should not enqueue. + */ +struct dn_queue { + struct dn_flow ni; /* oid, flow_id, stats */ + struct mq mq; /* packets queue */ + struct dn_sch_inst *_si; /* owner scheduler instance */ + SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */ + struct dn_fsk *fs; /* parent flowset. */ + + /* RED parameters */ + int avg; /* average queue length est. (scaled) */ + int count; /* arrivals since last RED drop */ + int random; /* random value (scaled) */ + uint64_t q_time; /* start of queue idle time */ + +}; + +/* + * The kernel side of a scheduler. Contains the userland config, + * a link, pointer to extra config arguments from command line, + * kernel flags, and a pointer to the scheduler methods. + * It is stored in a hash table, and holds a list of all + * flowsets and scheduler instances. + * XXX sch must be at the beginning, see schk_hash(). 
+ */
+struct dn_schk {
+	struct dn_sch sch;
+	struct dn_alg *fp;	/* Pointer to scheduler functions */
+	struct dn_link link;	/* The link, embedded */
+	struct dn_profile *profile; /* delay profile, if any */
+	struct dn_id *cfg;	/* extra config arguments */
+
+	SLIST_ENTRY(dn_schk) schk_next;  /* hash chain for schedhash */
+
+	struct dn_fsk_head fsk_list;  /* all fsk linked to me */
+	struct dn_fsk *fs;	/* Flowset for !MULTIQUEUE */
+
+	/* bucket index used by the drain routine to drain the scheduler
+	 * instance for this flowset.
+	 */
+	int drain_bucket;
+
+	/* Hash table of all instances (through sch.sched_mask)
+	 * or single instance if no mask. Always valid.
+	 */
+	struct dn_ht *siht;
+};
+
+
+/*
+ * Scheduler instance.
+ * Contains variables and all queues relative to this instance.
+ * This struct is created at runtime.
+ */
+struct dn_sch_inst {
+	struct dn_flow ni;	/* oid, flowid and stats */
+	SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */
+	struct delay_line dline;
+	struct dn_schk *sched;	/* the template */
+	int kflags;		/* DN_ACTIVE */
+
+	int64_t credit;		/* bits I can transmit (more or less). */
+	uint64_t sched_time;	/* time link was scheduled in ready_heap */
+	uint64_t idle_time;	/* start of scheduler instance idle time */
+
+	/* q_count is the number of queues that this instance is using.
+	 * The counter is incremented or decremented when
+	 * a reference from the queue is created or deleted.
+	 * It is used to make sure that a scheduler instance can be safely
+	 * deleted by the drain routine.
+	 */
+	int q_count;
+
+};
+
+
+/* kernel-side flags. Linux has DN_DELETE in fcntl.h
+ * Each value is a distinct bit, so the flags can be combined with '|'
+ * and tested with '&'.
+ */
+enum {
+	/* 1 and 2 are reserved for the SCAN flags */
+	DN_DESTROY	= 0x0004, /* destroy */
+	DN_DELETE_FS	= 0x0008, /* destroy flowset */
+	DN_DETACH	= 0x0010,
+	DN_ACTIVE	= 0x0020, /* object is in evheap */
+	DN_F_DLINE	= 0x0040, /* object is a delay line */
+	DN_DEL_SAFE	= 0x0080, /* delete a queue only if no longer needed
+				   * by scheduler */
+	DN_QHT_IS_Q	= 0x0100, /* in flowset, qht is a single queue */
+};
+
+extern struct dn_parms dn_cfg;
+//VNET_DECLARE(struct dn_parms, _base_dn_cfg);
+//#define dn_cfg VNET(_base_dn_cfg)
+
+int dummynet_io(struct mbuf **, int , struct ip_fw_args *);
+void dummynet_task(void *context, int pending);
+void dn_reschedule(void);
+
+struct dn_queue *ipdn_q_find(struct dn_fsk *, struct ipfw_flow_id *);
+struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *);
+
+/*
+ * copy_range is a template for requests for ranges of pipes/queues/scheds.
+ * The number of ranges is variable and can be derived by o.len.
+ * As a default, we use a small number of entries so that the struct
+ * fits easily on the stack and is sufficient for most common requests.
+ */ +#define DEFAULT_RANGES 5 +struct copy_range { + struct dn_id o; + uint32_t r[ 2 * DEFAULT_RANGES ]; +}; + +struct copy_args { + char **start; + char *end; + int flags; + int type; + struct copy_range *extra; /* extra filtering */ +}; + +struct sockopt; +int ip_dummynet_compat(struct sockopt *sopt); +int dummynet_get(struct sockopt *sopt, void **compat); +int dn_c_copy_q (void *_ni, void *arg); +int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq); +int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq); +int dn_compat_copy_queue(struct copy_args *a, void *_o); +int dn_compat_copy_pipe(struct copy_args *a, void *_o); +int copy_data_helper_compat(void *_o, void *_arg); +int dn_compat_calc_size(void); +int do_config(void *p, int l); + +/* function to drain idle object */ +void dn_drain_scheduler(void); +void dn_drain_queue(void); + +#endif /* _IP_DN_PRIVATE_H */ diff --git a/sys/netinet/ipfw/ip_dummynet.c b/sys/netinet/ipfw/ip_dummynet.c new file mode 100644 index 0000000..7c63a2d --- /dev/null +++ b/sys/netinet/ipfw/ip_dummynet.c @@ -0,0 +1,2396 @@ +/*- + * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa + * Portions Copyright (c) 2000 Akamba Corp. + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_dummynet.c 203340 2010-02-01 12:06:37Z luigi $"); + +/* + * Configuration and internal object management for dummynet. + */ + +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include +#include /* ip_output(), IP_FORWARDING */ +#include +#include + +#include +#include +#include +#include + +/* which objects to copy */ +#define DN_C_LINK 0x01 +#define DN_C_SCH 0x02 +#define DN_C_FLOW 0x04 +#define DN_C_FS 0x08 +#define DN_C_QUEUE 0x10 + +/* we use this argument in case of a schk_new */ +struct schk_new_arg { + struct dn_alg *fp; + struct dn_sch *sch; +}; + +/*---- callout hooks. ----*/ +static struct callout dn_timeout; +static struct task dn_task; +static struct taskqueue *dn_tq = NULL; + +/* dummynet and ipfw_tick can't be static in windows */ +void +dummynet(void * arg) +{ + + (void)arg; /* UNUSED */ + taskqueue_enqueue(dn_tq, &dn_task); +} + +void +dn_reschedule(void) +{ + callout_reset_on(&dn_timeout, 1, dummynet, NULL, 0); +} +/*----- end of callout hooks -----*/ + +/* Return a scheduler descriptor given the type or name. 
*/
+/*
+ * Walks dn_cfg.schedlist and returns the first entry whose type equals
+ * 'type' or, when 'name' is not NULL, whose name matches 'name'
+ * ignoring case. Returns NULL when no entry matches.
+ */
+static struct dn_alg *
+find_sched_type(int type, char *name)
+{
+	struct dn_alg *d;
+
+	SLIST_FOREACH(d, &dn_cfg.schedlist, next) {
+		if (d->type == type || (name && !strcasecmp(d->name, name)))
+			return d;
+	}
+	return NULL; /* not found */
+}
+
+/*
+ * Force *v into the range [lo, hi] and return the resulting value.
+ * A value below lo is replaced by dflt (dflt itself is first clamped
+ * into [lo, hi]); a value above hi is replaced by hi. When the value
+ * is changed and msg is not NULL, one line reporting the old and the
+ * new value is printed.
+ */
+int
+ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg)
+{
+	int oldv = *v;
+	const char *op = NULL;
+	if (dflt < lo)
+		dflt = lo;
+	if (dflt > hi)
+		dflt = hi;
+	if (oldv < lo) {
+		*v = dflt;
+		op = "Bump";
+	} else if (oldv > hi) {
+		*v = hi;
+		op = "Clamp";
+	} else
+		return *v;	/* already in range, nothing to report */
+	if (op && msg)
+		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
+	return *v;
+}
+
+/*---- flow_id mask, hash and compare functions ---*/
+/*
+ * The flow_id includes the 5-tuple, the queue/pipe number
+ * which we store in the extra area in host order,
+ * and for ipv6 also the flow_id6.
+ * XXX see if we want the tos byte (can store in 'flags')
+ *
+ * flow_id_mask() ANDs every field of 'id' with the matching field of
+ * 'mask', in place, and returns 'id'. The address family is taken
+ * from 'id'.
+ */
+static struct ipfw_flow_id *
+flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id)
+{
+	int is_v6 = IS_IP6_FLOW_ID(id);
+
+	id->dst_port &= mask->dst_port;
+	id->src_port &= mask->src_port;
+	id->proto &= mask->proto;
+	id->extra &= mask->extra;
+	if (is_v6) {
+		APPLY_MASK(&id->dst_ip6, &mask->dst_ip6);
+		APPLY_MASK(&id->src_ip6, &mask->src_ip6);
+		id->flow_id6 &= mask->flow_id6;
+	} else {
+		id->dst_ip &= mask->dst_ip;
+		id->src_ip &= mask->src_ip;
+	}
+	return id;
+}
+
+/* computes an OR of two masks, result in dst and also returned.
+ * Note the argument order (src first, dst second); the address family
+ * is taken from dst.
+ */
+static struct ipfw_flow_id *
+flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst)
+{
+	int is_v6 = IS_IP6_FLOW_ID(dst);
+
+	dst->dst_port |= src->dst_port;
+	dst->src_port |= src->src_port;
+	dst->proto |= src->proto;
+	dst->extra |= src->extra;
+	if (is_v6) {
+#define OR_MASK(_d, _s) \
+	(_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \
+	(_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \
+	(_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \
+	(_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3];
+		OR_MASK(&dst->dst_ip6, &src->dst_ip6);
+		OR_MASK(&dst->src_ip6, &src->src_ip6);
+#undef OR_MASK
+		dst->flow_id6 |= src->flow_id6;
+	} else {
+		dst->dst_ip |= src->dst_ip;
+		dst->src_ip |= src->src_ip;
+	}
+	return dst;
+}
+
+/*
+ * Returns non-zero when at least one field of the mask is non-zero:
+ * ports, proto and extra first, then the addresses of the family the
+ * mask belongs to (plus flow_id6 for IPv6).
+ */
+static int
+nonzero_mask(struct ipfw_flow_id *m)
+{
+	if (m->dst_port || m->src_port || m->proto || m->extra)
+		return 1;
+	if (IS_IP6_FLOW_ID(m)) {
+		return
+			m->dst_ip6.__u6_addr.__u6_addr32[0] ||
+			m->dst_ip6.__u6_addr.__u6_addr32[1] ||
+			m->dst_ip6.__u6_addr.__u6_addr32[2] ||
+			m->dst_ip6.__u6_addr.__u6_addr32[3] ||
+			m->src_ip6.__u6_addr.__u6_addr32[0] ||
+			m->src_ip6.__u6_addr.__u6_addr32[1] ||
+			m->src_ip6.__u6_addr.__u6_addr32[2] ||
+			m->src_ip6.__u6_addr.__u6_addr32[3] ||
+			m->flow_id6;
+	} else {
+		return m->dst_ip || m->src_ip;
+	}
+}
+
+/* XXX we may want a better hash function */
+/*
+ * XORs shifted copies of the addresses with the ports, proto and extra
+ * (and flow_id6 for IPv6) into one 32-bit value. The result is not
+ * reduced to a bucket index here.
+ */
+static uint32_t
+flow_id_hash(struct ipfw_flow_id *id)
+{
+	uint32_t i;
+
+	if (IS_IP6_FLOW_ID(id)) {
+		/* view each 128-bit address as four 32-bit words */
+		uint32_t *d = (uint32_t *)&id->dst_ip6;
+		uint32_t *s = (uint32_t *)&id->src_ip6;
+		i = (d[0] ) ^ (d[1]) ^
+		    (d[2] ) ^ (d[3]) ^
+		    (d[0] >> 15) ^ (d[1] >> 15) ^
+		    (d[2] >> 15) ^ (d[3] >> 15) ^
+		    (s[0] << 1) ^ (s[1] << 1) ^
+		    (s[2] << 1) ^ (s[3] << 1) ^
+		    (s[0] << 16) ^ (s[1] << 16) ^
+		    (s[2] << 16) ^ (s[3] << 16) ^
+		    (id->dst_port << 1) ^ (id->src_port) ^
+		    (id->extra) ^
+		    (id->proto ) ^ (id->flow_id6);
+	} else {
+		i = (id->dst_ip) ^ (id->dst_ip >> 15) ^
+		    (id->src_ip << 1) ^ (id->src_ip >> 16) ^
+		    (id->extra) ^
+		    (id->dst_port << 1) ^ (id->src_port) ^ (id->proto);
+	}
+	return i;
+}
+
+/* Like bcmp, returns 0 if ids match, 1 otherwise.
*/
+static int
+flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2)
+{
+	int is_v6 = IS_IP6_FLOW_ID(id1);
+
+	if (!is_v6) {
+		if (IS_IP6_FLOW_ID(id2))
+			return 1; /* different address families */
+
+		return (id1->dst_ip == id2->dst_ip &&
+			id1->src_ip == id2->src_ip &&
+			id1->dst_port == id2->dst_port &&
+			id1->src_port == id2->src_port &&
+			id1->proto == id2->proto &&
+			id1->extra == id2->extra) ? 0 : 1;
+	}
+	/*
+	 * the ipv6 case.
+	 * NOTE(review): the address family of id2 is only checked in the
+	 * IPv4 branch above; here id2 is compared as IPv6 without a check.
+	 */
+	return (
+	    !bcmp(&id1->dst_ip6,&id2->dst_ip6, sizeof(id1->dst_ip6)) &&
+	    !bcmp(&id1->src_ip6,&id2->src_ip6, sizeof(id1->src_ip6)) &&
+	    id1->dst_port == id2->dst_port &&
+	    id1->src_port == id2->src_port &&
+	    id1->proto == id2->proto &&
+	    id1->extra == id2->extra &&
+	    id1->flow_id6 == id2->flow_id6) ? 0 : 1;
+}
+/*--------- end of flow-id mask, hash and compare ---------*/
+
+/*--- support functions for the qht hashtable ----
+ * Entries are hashed by flow-id
+ *
+ * q_hash() and q_match() receive as 'key' either a pointer to a
+ * struct ipfw_flow_id or, when DNHT_KEY_IS_OBJ is set in 'flags',
+ * a pointer to a struct dn_queue whose ni.fid is used instead.
+ */
+static uint32_t
+q_hash(uintptr_t key, int flags, void *arg)
+{
+	/* compute the hash slot from the flow id */
+	struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ?
+		&((struct dn_queue *)key)->ni.fid :
+		(struct ipfw_flow_id *)key;
+
+	return flow_id_hash(id);
+}
+/*
+ * Return non-zero when the flow id of queue 'obj' is equal, according
+ * to flow_id_cmp(), to the flow id designated by 'key'.
+ */
+static int
+q_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+	struct dn_queue *o = (struct dn_queue *)obj;
+	struct ipfw_flow_id *id2;
+
+	if (flags & DNHT_KEY_IS_OBJ) {
+		/* key is a queue: take the flow id stored in it */
+		id2 = &((struct dn_queue *)key)->ni.fid;
+	} else {
+		id2 = (struct ipfw_flow_id *)key;
+	}
+	return (0 == flow_id_cmp(&o->ni.fid, id2));
+}
+
+/*
+ * create a new queue instance for the given 'key'.
+ * 'arg' is the template queue prepared by ipdn_q_find(): template->fs
+ * is the flowset, template->ni.fid the flow id used to locate the
+ * scheduler instance.  When the flowset has DN_QHT_HASH set, 'key'
+ * points to the flow id that is stored in the new queue.
+ * Returns the new queue, or NULL if memory is not available.
+ */
+static void *
+q_new(uintptr_t key, int flags, void *arg)
+{
+	struct dn_queue *q, *template = arg;
+	struct dn_fsk *fs = template->fs;
+	int size = sizeof(*q) + fs->sched->fp->q_datalen;
+
+	q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO);
+	if (q == NULL) {
+		D("no memory for new queue");
+		return NULL;
+	}
+
+	set_oid(&q->ni.oid, DN_QUEUE, size);
+	if (fs->fs.flags & DN_QHT_HASH)
+		q->ni.fid = *(struct ipfw_flow_id *)key;
+	q->fs = fs;
+	q->_si = ipdn_si_find(q->fs->sched, &(template->ni.fid));
+	if (q->_si == NULL) {
+		D("no memory for new si");
+		free (q, M_DUMMYNET);
+		return NULL;
+	}
+
+	q->_si->q_count++;
+
+	if (fs->sched->fp->new_queue)
+		fs->sched->fp->new_queue(q);
+	dn_cfg.queue_count++;
+	dn_cfg.idle_queue++;
+	return q;
+}
+
+/*
+ * Notify schedulers that a queue is going away.
+ * If (flags & DN_DESTROY), also free the packets.
+ * The version for callbacks is called q_delete_cb().
+ * Returns 1 if the queue is NOT deleted (usually when
+ * the drain routine try to delete a queue that a scheduler
+ * instance needs), 0 otherwise.
+ * NOTE: flag DN_DEL_SAFE means that the queue should be
+ * deleted only if the scheduler no longer needs it
+ */
+static int
+dn_delete_queue(struct dn_queue *q, int flags)
+{
+	struct dn_fsk *fs = q->fs;
+
+	// D("fs %p si %p\n", fs, q->_si);
+	/* notify the parent scheduler that the queue is going away */
+	if (fs && fs->sched->fp->free_queue)
+		if (fs->sched->fp->free_queue(q, flags & DN_DEL_SAFE) == 1)
+			return 1;	/* queue NOT deleted */
+	q->_si->q_count--;
+	q->_si = NULL;
+	if (flags & DN_DESTROY) {
+		if (q->mq.head)
+			dn_free_pkts(q->mq.head);
+		else
+			dn_cfg.idle_queue--;
+		bzero(q, sizeof(*q));	// safety
+		free(q, M_DUMMYNET);
+		dn_cfg.queue_count--;
+	}
+	return 0;
+}
+
+/*
+ * Table-scan wrapper around dn_delete_queue(); 'arg' carries the flags.
+ * Returns DNHT_SCAN_DEL when DN_DESTROY is set, 0 otherwise.
+ * NOTE(review): the result of dn_delete_queue() is ignored, so
+ * DNHT_SCAN_DEL is returned even when the queue was NOT deleted --
+ * confirm this is intended when DN_DEL_SAFE is passed with DN_DESTROY.
+ */
+static int
+q_delete_cb(void *q, void *arg)
+{
+	int flags = (int)(uintptr_t)arg;
+	dn_delete_queue(q, flags);
+	return (flags & DN_DESTROY) ? DNHT_SCAN_DEL : 0;
+}
+
+/*
+ * calls dn_delete_queue/q_delete_cb on all queues,
+ * which notifies the parent scheduler and possibly drains packets.
+ * flags & DN_DESTROY: drains queues and destroy qht;
+ */
+static void
+qht_delete(struct dn_fsk *fs, int flags)
+{
+	ND("fs %d start flags %d qht %p",
+		fs->fs.fs_nr, flags, fs->qht);
+	if (!fs->qht)
+		return;
+	if (fs->fs.flags & DN_QHT_HASH) {
+		dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags);
+		if (flags & DN_DESTROY) {
+			dn_ht_free(fs->qht, 0);
+			fs->qht = NULL;
+		}
+	} else {
+		/* without DN_QHT_HASH, qht is the single queue itself */
+		dn_delete_queue((struct dn_queue *)(fs->qht), flags);
+		if (flags & DN_DESTROY)
+			fs->qht = NULL;
+	}
+}
+
+/*
+ * Find and possibly create the queue for a MULTIQUEUE scheduler.
+ * We never call it for !MULTIQUEUE (the queue is in the sch_inst).
+ *
+ * With DN_QHT_HASH the queue is looked up (and inserted if missing) in
+ * fs->qht using 'id' masked with fs->fsk_mask; otherwise fs->qht points
+ * directly to the only queue of the flowset, created on first use.
+ * Returns NULL if the table or the queue cannot be allocated.
+ */
+struct dn_queue *
+ipdn_q_find(struct dn_fsk *fs, struct ipfw_flow_id *id)
+{
+	struct dn_queue template;
+
+	/*
+	 * q_new() reads template.fs and also template.ni.fid (to find the
+	 * scheduler instance), so the template must not carry stack
+	 * garbage: clear it and store the flow id of the packet.
+	 */
+	bzero(&template, sizeof(template));
+	template.fs = fs;
+	if (id != NULL)
+		template.ni.fid = *id;
+
+	if (fs->fs.flags & DN_QHT_HASH) {
+		struct ipfw_flow_id masked_id;
+		if (fs->qht == NULL) {
+			fs->qht = dn_ht_init(NULL, fs->fs.buckets,
+				offsetof(struct dn_queue, q_next),
+				q_hash, q_match, q_new);
+			if (fs->qht == NULL)
+				return NULL;
+		}
+		/* mask a local copy, the caller's id is left untouched */
+		masked_id = *id;
+		flow_id_mask(&fs->fsk_mask, &masked_id);
+		return dn_ht_find(fs->qht, (uintptr_t)&masked_id,
+			DNHT_INSERT, &template);
+	} else {
+		if (fs->qht == NULL)
+			fs->qht = q_new(0, 0, &template);
+		return (struct dn_queue *)fs->qht;
+	}
+}
+/*--- end of queue hash table ---*/
+
+/*--- support functions for the sch_inst hashtable ----
+ *
+ * These are hashed by flow-id
+ *
+ * As for the queue table, 'key' is a pointer to a flow id or, with
+ * DNHT_KEY_IS_OBJ, a pointer to a struct dn_sch_inst.
+ */
+static uint32_t
+si_hash(uintptr_t key, int flags, void *arg)
+{
+	/* compute the hash slot from the flow id */
+	struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ?
+		&((struct dn_sch_inst *)key)->ni.fid :
+		(struct ipfw_flow_id *)key;
+
+	return flow_id_hash(id);
+}
+
+/* non-zero when the flow id of instance 'obj' equals the one of 'key' */
+static int
+si_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+	struct dn_sch_inst *o = obj;
+	struct ipfw_flow_id *id2;
+
+	id2 = (flags & DNHT_KEY_IS_OBJ) ?
+		&((struct dn_sch_inst *)key)->ni.fid :
+		(struct ipfw_flow_id *)key;
+	return flow_id_cmp(&o->ni.fid, id2) == 0;
+}
+
+static int si_reset_credit(void *_si, void *arg); // XXX si_new use this
+
+/*
+ * create a new instance for the given 'key'
+ * Allocate memory for instance, delay line and scheduler private data.
+ * 'arg' is the scheduler (struct dn_schk) owning the instance; 'key'
+ * is only dereferenced when the scheduler has DN_HAVE_MASK set.
+ * Returns NULL on allocation failure or if new_sched() reports an error.
+ */
+static void *
+si_new(uintptr_t key, int flags, void *arg)
+{
+	struct dn_schk *s = arg;
+	struct dn_sch_inst *si;
+	int l = sizeof(*si) + s->fp->si_datalen;
+
+	si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO);
+	if (si == NULL)
+		goto error;
+
+	/* Set length only for the part passed up to userland. */
+	set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow));
+	set_oid(&(si->dline.oid), DN_DELAY_LINE,
+		sizeof(struct delay_line));
+	/* mark si and dline as outside the event queue */
+	si->ni.oid.id = si->dline.oid.id = -1;
+
+	si->sched = s;
+	si->dline.si = si;
+
+	if (s->fp->new_sched && s->fp->new_sched(si)) {
+		D("new_sched error");
+		goto error;
+	}
+	if (s->sch.flags & DN_HAVE_MASK)
+		si->ni.fid = *(struct ipfw_flow_id *)key;
+
+	si_reset_credit(si, NULL);
+	dn_cfg.si_count++;
+	dn_cfg.idle_si++;
+	return si;
+
+error:
+	if (si) {
+		bzero(si, sizeof(*si)); // safety
+		free(si, M_DUMMYNET);
+	}
+	return NULL;
+}
+
+/*
+ * Callback from siht to delete all scheduler instances. Remove
+ * si and delay line from the system heap, destroy all queues.
+ * We assume that all flowset have been notified and do not
+ * point to us anymore.
+ */
+static int
+si_destroy(void *_si, void *arg)
+{
+	struct dn_sch_inst *si = _si;
+	struct dn_schk *s = si->sched;
+	struct delay_line *dl = &si->dline;
+
+	if (dl->oid.subtype) /* remove delay line from event heap */
+		heap_extract(&dn_cfg.evheap, dl);
+	if (si->ni.length == 0)	/* an empty instance was counted as idle */
+		dn_cfg.idle_si--;
+	dn_free_pkts(dl->mq.head);	/* drain delay line */
+	if (si->kflags & DN_ACTIVE) /* remove si from event heap */
+		heap_extract(&dn_cfg.evheap, si);
+	if (s->fp->free_sched)
+		s->fp->free_sched(si);
+	bzero(si, sizeof(*si));	/* safety */
+	free(si, M_DUMMYNET);
+	dn_cfg.si_count--;
+	return DNHT_SCAN_DEL;
+}
+
+/*
+ * Find the scheduler instance for this packet. If we need to apply
+ * a mask, do on a local copy of the flow_id to preserve the original.
+ * Assume siht is always initialized if we have a mask.
+ * Without a mask, s->siht points directly to the single instance,
+ * which is created on first use; the result may be NULL if si_new()
+ * fails.
+ */
+struct dn_sch_inst *
+ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id)
+{
+
+	if (s->sch.flags & DN_HAVE_MASK) {
+		struct ipfw_flow_id id_t = *id;
+		flow_id_mask(&s->sch.sched_mask, &id_t);
+		return dn_ht_find(s->siht, (uintptr_t)&id_t,
+			DNHT_INSERT, s);
+	}
+	if (!s->siht)
+		s->siht = si_new(0, 0, s);
+	return (struct dn_sch_inst *)s->siht;
+}
+
+/*
+ * callback to flush credit for the scheduler instance:
+ * idle_time is set to the current time and credit to the link burst,
+ * plus the link bandwidth when dn_cfg.io_fast is set.  Always returns 0.
+ */
+static int
+si_reset_credit(void *_si, void *arg)
+{
+	struct dn_sch_inst *si = _si;
+	struct dn_link *p = &si->sched->link;
+
+	si->idle_time = dn_cfg.curr_time;
+	si->credit = p->burst + (dn_cfg.io_fast ? p->bandwidth : 0);
+	return 0;
+}
+/*
+ * Reset the credit of every instance of scheduler 's': scan the table
+ * when the scheduler has a mask, otherwise act on the single instance
+ * (if it exists).
+ */
+static void
+schk_reset_credit(struct dn_schk *s)
+{
+	if (s->sch.flags & DN_HAVE_MASK)
+		dn_ht_scan(s->siht, si_reset_credit, NULL);
+	else if (s->siht)
+		si_reset_credit(s->siht, NULL);
+}
+/*---- end of sch_inst hashtable ---------------------*/
+
+/*-------------------------------------------------------
+ * flowset hash (fshash) support. Entries are hashed by fs_nr.
+ * New allocations are put in the fsunlinked list, from which
+ * they are removed when they point to a specific scheduler.
+ *
+ * fsk_hash() and fsk_match() take as 'key' either the flowset number
+ * itself or, with DNHT_KEY_IS_OBJ, a pointer to a struct dn_fsk whose
+ * fs.fs_nr is used.
+ */
+static uint32_t
+fsk_hash(uintptr_t key, int flags, void *arg)
+{
+	uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+		((struct dn_fsk *)key)->fs.fs_nr;
+
+	return ( (i>>8)^(i>>4)^i );
+}
+/* non-zero when flowset 'obj' has the number designated by 'key' */
+static int
+fsk_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+	struct dn_fsk *fs = obj;
+	int i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+		((struct dn_fsk *)key)->fs.fs_nr;
+
+	return (fs->fs.fs_nr == i);
+}
+/*
+ * Allocate a zeroed flowset, set its object id and put it at the head
+ * of the unlinked list dn_cfg.fsu.  None of the three arguments is
+ * used.  Returns NULL if memory is not available.
+ */
+static void *
+fsk_new(uintptr_t key, int flags, void *arg)
+{
+	struct dn_fsk *fs;
+
+	fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO);
+	if (fs) {
+		set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs));
+		dn_cfg.fsk_count++;
+		fs->drain_bucket = 0;
+		SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain);
+	}
+	return fs;
+}
+
+/*
+ * detach flowset from its current scheduler. Flags as follows:
+ * DN_DETACH removes from the fsk_list
+ * DN_DESTROY deletes individual queues
+ * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked).
+ * DN_DELETE_FS also turns on DN_DESTROY.  Without DN_DETACH the caller
+ * must already have removed 'fs' from the list it was on, because the
+ * flowset is either freed or inserted in dn_cfg.fsu at the end.
+ */
+static void
+fsk_detach(struct dn_fsk *fs, int flags)
+{
+	if (flags & DN_DELETE_FS)
+		flags |= DN_DESTROY;
+	ND("fs %d from sched %d flags %s %s %s",
+		fs->fs.fs_nr, fs->fs.sched_nr,
+		(flags & DN_DELETE_FS) ? "DEL_FS":"",
+		(flags & DN_DESTROY) ? "DEL":"",
+		(flags & DN_DETACH) ? "DET":"");
+	if (flags & DN_DETACH) { /* detach from the list */
+		struct dn_fsk_head *h;
+		h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu;
+		SLIST_REMOVE(h, fs, dn_fsk, sch_chain);
+	}
+	/* Free the RED parameters, they will be recomputed on
+	 * subsequent attach if needed.
+	 */
+	if (fs->w_q_lookup)
+		free(fs->w_q_lookup, M_DUMMYNET);
+	fs->w_q_lookup = NULL;
+	qht_delete(fs, flags);
+	if (fs->sched && fs->sched->fp->free_fsk)
+		fs->sched->fp->free_fsk(fs);
+	fs->sched = NULL;
+	if (flags & DN_DELETE_FS) {
+		bzero(fs, sizeof(*fs));	/* safety */
+		free(fs, M_DUMMYNET);
+		dn_cfg.fsk_count--;
+	} else {
+		SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain);
+	}
+}
+
+/*
+ * Detach or destroy all flowsets in a list.
+ * flags specifies what to do:
+ * DN_DESTROY:	flush all queues
+ * DN_DELETE_FS:	DN_DESTROY + destroy flowset
+ *	DN_DELETE_FS implies DN_DESTROY
+ * Each flowset is unlinked from the list here, before fsk_detach()
+ * is called on it.
+ */
+static void
+fsk_detach_list(struct dn_fsk_head *h, int flags)
+{
+	struct dn_fsk *fs;
+	int n = 0; /* only for stats */
+
+	ND("head %p flags %x", h, flags);
+	while ((fs = SLIST_FIRST(h))) {
+		SLIST_REMOVE_HEAD(h, sch_chain);
+		n++;
+		fsk_detach(fs, flags);
+	}
+	ND("done %d flowsets", n);
+}
+
+/*
+ * called on 'queue X delete' -- removes the flowset from fshash,
+ * deletes all queues for the flowset, and removes the flowset.
+ * The table itself is released when it becomes empty.
+ * 'locked' is non-zero when the caller already holds the write lock.
+ * Returns 0 on success, EINVAL if flowset 'i' is not in the table.
+ */
+static int
+delete_fs(int i, int locked)
+{
+	struct dn_fsk *fs;
+	int err = 0;
+
+	if (!locked)
+		DN_BH_WLOCK();
+	fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL);
+	if (dn_ht_entries(dn_cfg.fshash) == 0) {
+		dn_ht_free(dn_cfg.fshash, 0);
+		dn_cfg.fshash = NULL;
+	}
+	ND("fs %d found %p", i, fs);
+	if (fs) {
+		fsk_detach(fs, DN_DETACH | DN_DELETE_FS);
+		err = 0;
+	} else
+		err = EINVAL;
+	if (!locked)
+		DN_BH_WUNLOCK();
+	return err;
+}
+
+/*----- end of flowset hashtable support -------------*/
+
+/*------------------------------------------------------------
+ * Scheduler hash. When searching by index we pass sched_nr,
+ * otherwise we pass struct dn_sch * which is the first field in
+ * struct dn_schk so we can cast between the two. We use this trick
+ * because in the create phase (but it should be fixed).
+ * NOTE(review): the sentence above is truncated in the original text;
+ * the reason for the trick is not stated.
+ */
+static uint32_t
+schk_hash(uintptr_t key, int flags, void *_arg)
+{
+	uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+		((struct dn_schk *)key)->sch.sched_nr;
+	return ( (i>>8)^(i>>4)^i );
+}
+/* non-zero when scheduler 'obj' has the number designated by 'key' */
+static int
+schk_match(void *obj, uintptr_t key, int flags, void *_arg)
+{
+	struct dn_schk *s = (struct dn_schk *)obj;
+	int i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+		((struct dn_schk *)key)->sch.sched_nr;
+	return (s->sch.sched_nr == i);
+}
+
+/*
+ * Create the entry and initialize with the sched hash if needed.
+ * Leave s->fp unset so we can tell whether a dn_ht_find() returns
+ * a new object or a previously existing one.
+ * 'arg' is a struct schk_new_arg carrying the scheduler type (fp) and
+ * the initial configuration (sch).  Returns NULL if the scheduler or
+ * its instance table cannot be allocated.
+ */
+static void *
+schk_new(uintptr_t key, int flags, void *arg)
+{
+	struct schk_new_arg *a = arg;
+	struct dn_schk *s;
+	int l = sizeof(*s) +a->fp->schk_datalen;
+
+	s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO);
+	if (s == NULL)
+		return NULL;
+	set_oid(&s->link.oid, DN_LINK, sizeof(s->link));
+	s->sch = *a->sch; // copy initial values
+	s->link.link_nr = s->sch.sched_nr;
+	SLIST_INIT(&s->fsk_list);
+	/* initialize the hash table or create the single instance */
+	s->fp = a->fp;	/* si_new needs this */
+	s->drain_bucket = 0;
+	if (s->sch.flags & DN_HAVE_MASK) {
+		s->siht = dn_ht_init(NULL, s->sch.buckets,
+			offsetof(struct dn_sch_inst, si_next),
+			si_hash, si_match, si_new);
+		if (s->siht == NULL) {
+			free(s, M_DUMMYNET);
+			return NULL;
+		}
+	}
+	s->fp = NULL;	/* mark as a new scheduler */
+	dn_cfg.schk_count++;
+	return s;
+}
+
+/*
+ * Callback for sched delete. Notify all attached flowsets to
+ * detach from the scheduler, destroy the internal flowset, and
+ * all instances. The scheduler goes away too.
+ * arg is 0 (only detach flowsets and destroy instances)
+ * DN_DESTROY (detach & delete queues, delete schk)
+ * or DN_DELETE_FS (delete queues and flowsets, delete schk)
+ * NOTE(review): the code below only tests whether arg is non-zero and
+ * then passes DN_DESTROY to fsk_detach_list(); DN_DELETE_FS is not
+ * forwarded.
+ */
+static int
+schk_delete_cb(void *obj, void *arg)
+{
+	struct dn_schk *s = obj;
+#if 0
+	int a = (int)arg;
+	ND("sched %d arg %s%s",
+		s->sch.sched_nr,
+		a&DN_DESTROY ? "DEL ":"",
+		a&DN_DELETE_FS ? "DEL_FS":"");
+#endif
+	fsk_detach_list(&s->fsk_list, arg ? DN_DESTROY : 0);
+	/* no more flowset pointing to us now */
+	if (s->sch.flags & DN_HAVE_MASK) {
+		dn_ht_scan(s->siht, si_destroy, NULL);
+		dn_ht_free(s->siht, 0);
+	}
+	else if (s->siht)
+		si_destroy(s->siht, NULL);
+	if (s->profile) {
+		free(s->profile, M_DUMMYNET);
+		s->profile = NULL;
+	}
+	s->siht = NULL;
+	if (s->fp->destroy)
+		s->fp->destroy(s);
+	bzero(s, sizeof(*s));	// safety
+	free(obj, M_DUMMYNET);
+	dn_cfg.schk_count--;
+	return DNHT_SCAN_DEL;
+}
+
+/*
+ * called on a 'sched X delete' command. Deletes a single scheduler.
+ * This is done by removing from the schedhash, unlinking all
+ * flowsets and deleting their traffic.
+ * The table itself is released when it becomes empty.
+ * Returns 0 on success, EINVAL if scheduler 'i' is not in the table.
+ */
+static int
+delete_schk(int i)
+{
+	struct dn_schk *s;
+
+	s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL);
+	if (dn_ht_entries(dn_cfg.schedhash) == 0) {
+		dn_ht_free(dn_cfg.schedhash, 0);
+		dn_cfg.schedhash = NULL;
+	}
+	ND("%d %p", i, s);
+	if (!s)
+		return EINVAL;
+	delete_fs(i + DN_MAX_ID, 1);	/* first delete internal fs */
+	/* then detach flowsets, delete traffic */
+	schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY);
+	return 0;
+}
+/*--- end of schk hashtable support ---*/
+/*
+ * Copy the object '_o' (which begins with a struct dn_id giving its
+ * type and length) to the buffer at *start, limited by 'end', and move
+ * *start past the copy.  'msg' and 'i' are only used in log messages.
+ * Returns 1 if the object does not fit or has a zero length or type,
+ * 0 on success.
+ * Two types are adjusted in the copy: for DN_LINK the burst is divided
+ * by 8 * hz, for DN_SCH the id is set to the number of instances.
+ */
+static int
+copy_obj(char **start, char *end, void *_o, const char *msg, int i)
+{
+	struct dn_id *o = _o;
+	int have = end - *start;
+
+	if (have < o->len || o->len == 0 || o->type == 0) {
+		D("(WARN) type %d %s %d have %d need %d",
+			o->type, msg, i, have, o->len);
+		return 1;
+	}
+	ND("type %d %s %d len %d", o->type, msg, i, o->len);
+	bcopy(_o, *start, o->len);
+	if (o->type == DN_LINK) {
+		/* Adjust burst parameter for link */
+		struct dn_link *l = (struct dn_link *)*start;
+		l->burst =  div64(l->burst, 8 * hz);
+	} else if (o->type == DN_SCH) {
+		/* Set id->id to the number of instances */
+		struct dn_schk *s = _o;
+		struct dn_id *id = (struct dn_id *)(*start);
+		id->id = (s->sch.flags & DN_HAVE_MASK) ?
+			dn_ht_entries(s->siht) : (s->siht ? 1 : 0);
+	}
+	*start += o->len;
+	return 0;
+}
+
+/* Specific function to copy a queue.
+ * Copies only the user-visible part of a queue (which is in
+ * a struct dn_flow), and sets len accordingly.
+ * Returns 1 if the buffer is too small, the object has a zero length
+ * or is not a DN_QUEUE; 0 on success, with *start moved past the copy.
+ */
+static int
+copy_obj_q(char **start, char *end, void *_o, const char *msg, int i)
+{
+	struct dn_id *o = _o;
+	int have = end - *start;
+	int len = sizeof(struct dn_flow); /* see above comment */
+
+	if (have < len || o->len == 0 || o->type != DN_QUEUE) {
+		D("ERROR type %d %s %d have %d need %d",
+			o->type, msg, i, have, len);
+		return 1;
+	}
+	ND("type %d %s %d len %d", o->type, msg, i, len);
+	bcopy(_o, *start, len);
+	((struct dn_id*)(*start))->len = len;
+	*start += len;
+	return 0;
+}
+/*
+ * Copy one queue ('obj') to the buffer described by 'arg' (a struct
+ * copy_args).  The copy is retyped as DN_FLOW and its id is set to the
+ * hash of its flow id.  Returns DNHT_SCAN_END if the copy fails,
+ * 0 otherwise.
+ */
+static int
+copy_q_cb(void *obj, void *arg)
+{
+	struct dn_queue *q = obj;
+	struct copy_args *a = arg;
+	/* 'ni' is where the copy will land; valid only if the copy succeeds */
+	struct dn_flow *ni = (struct dn_flow *)(*a->start);
+	if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1))
+		return DNHT_SCAN_END;
+	ni->oid.type = DN_FLOW; /* override the DN_QUEUE */
+	ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL);
+	return 0;
+}
+/*
+ * Copy all the queues of flowset 'fs': scan the table with
+ * DN_QHT_HASH, otherwise copy the single queue.  'flags' is not used
+ * and the result of the copy is not reported: it always returns 0.
+ */
+static int
+copy_q(struct copy_args *a, struct dn_fsk *fs, int flags)
+{
+	if (!fs->qht)
+		return 0;
+	if (fs->fs.flags & DN_QHT_HASH)
+		dn_ht_scan(fs->qht, copy_q_cb, a);
+	else
+		copy_q_cb(fs->qht, a);
+	return 0;
+}
+
+/*
+ * This routine only copies the initial part of a profile ? XXX
+ * XXX marta: I think this routine is called to print a summary
+ * of the pipe configuration and does not need to show the
+ * profile samples list.
+ * A NULL profile is not an error (nothing is copied, 0 is returned);
+ * 1 is returned when sizeof(struct dn_profile) bytes do not fit.
+ */
+static int
+copy_profile(struct copy_args *a, struct dn_profile *p)
+{
+	int have = a->end - *a->start;
+	/* XXX here we check for max length */
+	int profile_len = sizeof(struct dn_profile);
+
+	if (p == NULL)
+		return 0;
+	if (have < profile_len) {
+		D("error have %d need %d", have, profile_len);
+		return 1;
+	}
+	bcopy(p, *a->start, profile_len);
+	((struct dn_id *)(*a->start))->len = profile_len;
+	*a->start += profile_len;
+	return 0;
+}
+/*
+ * Copy the configuration of flowset 'fs' (a NULL 'fs' is silently
+ * skipped) and set the id of the copy to the number of queues of the
+ * flowset.  When 'flags' is non-zero the queues are copied as well.
+ * Returns DNHT_SCAN_END if the flowset does not fit, 0 otherwise.
+ */
+static int
+copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags)
+{
+	struct dn_fs *ufs = (struct dn_fs *)(*a->start);
+	if (!fs)
+		return 0;
+	ND("flowset %d", fs->fs.fs_nr);
+	if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr))
+		return DNHT_SCAN_END;
+	ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ?
+		dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0);
+	if (flags) {	/* copy queues */
+		copy_q(a, fs, 0);
+	}
+	return 0;
+}
+/*
+ * Copy one scheduler instance ('obj') to the buffer described by
+ * 'arg'.  The copy is retyped as DN_FLOW and its id is set to the hash
+ * of the instance.  Returns DNHT_SCAN_END if the copy fails.
+ */
+static int
+copy_si_cb(void *obj, void *arg)
+{
+	struct dn_sch_inst *si = obj;
+	struct copy_args *a = arg;
+	struct dn_flow *ni = (struct dn_flow *)(*a->start);
+	if (copy_obj(a->start, a->end, &si->ni, "inst",
+			si->sched->sch.sched_nr))
+		return DNHT_SCAN_END;
+	ni->oid.type = DN_FLOW; /* override the DN_SCH_I */
+	ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL);
+	return 0;
+}
+/*
+ * Copy all the instances of scheduler 's' (table scan with a mask,
+ * single instance otherwise).  'flags' is not used; always returns 0.
+ */
+static int
+copy_si(struct copy_args *a, struct dn_schk *s, int flags)
+{
+	if (s->sch.flags & DN_HAVE_MASK)
+		dn_ht_scan(s->siht, copy_si_cb, a);
+	else if (s->siht)
+		copy_si_cb(s->siht, a);
+	return 0;
+}
+
+/*
+ * compute a list of children of a scheduler and copy up
+ * The list is a DN_TEXT object: a struct dn_id followed by the numbers
+ * (uint32_t) of the attached flowsets whose fs_nr is below DN_MAX_ID.
+ * Returns DNHT_SCAN_END if the list does not fit, 0 otherwise.
+ */
+static int
+copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags)
+{
+	struct dn_fsk *fs;
+	struct dn_id *o;
+	uint32_t *p;
+
+	int n = 0, space = sizeof(*o);
+	SLIST_FOREACH(fs, &s->fsk_list, sch_chain) {
+		if (fs->fs.fs_nr < DN_MAX_ID)
+			n++;
+	}
+	space += n * sizeof(uint32_t);
+	DX(3, "sched %d has %d flowsets", s->sch.sched_nr, n);
+	if (a->end - *(a->start) < space)
+		return DNHT_SCAN_END;
+	o = (struct dn_id *)(*(a->start));
+	o->len = space;
+	*a->start += o->len;
+	o->type = DN_TEXT;
+	p = (uint32_t *)(o+1);
+	SLIST_FOREACH(fs, &s->fsk_list, sch_chain)
+		if (fs->fs.fs_nr < DN_MAX_ID)
+			*p++ = fs->fs.fs_nr;
+	return 0;
+}
+/*
+ * Table-scan callback used to export objects to userland.  '_arg' is a
+ * struct copy_args; a->extra holds a list of [low, high] number ranges
+ * and the object '_o' is copied only if its number falls in one of
+ * them.  Depending on a->type, '_o' is a scheduler (DN_LINK, DN_SCH)
+ * or a flowset (DN_FS); a->flags selects which parts of a scheduler
+ * are copied.  Returns DNHT_SCAN_END when the buffer is full,
+ * 0 otherwise.
+ */
+static int
+copy_data_helper(void *_o, void *_arg)
+{
+	struct copy_args *a = _arg;
+	uint32_t *r = a->extra->r; /* start of first range */
+	uint32_t *lim;	/* first invalid pointer */
+	int n;
+
+	lim = (uint32_t *)((char *)(a->extra) + a->extra->o.len);
+
+	if (a->type == DN_LINK || a->type == DN_SCH) {
+		/* pipe|sched show, we receive a dn_schk */
+		struct dn_schk *s = _o;
+
+		n = s->sch.sched_nr;
+		if (a->type == DN_SCH && n >= DN_MAX_ID)
+			return 0;	/* not a scheduler */
+		if (a->type == DN_LINK && n <= DN_MAX_ID)
+			return 0;	/* not a pipe */
+
+		/* see if the object is within one of our ranges */
+		for (;r < lim; r += 2) {
+			if (n < r[0] || n > r[1])
+				continue;
+			/* Found a valid entry, copy and we are done */
+			if (a->flags & DN_C_LINK) {
+				if (copy_obj(a->start, a->end,
+					&s->link, "link", n))
+					return DNHT_SCAN_END;
+				if (copy_profile(a, s->profile))
+					return DNHT_SCAN_END;
+				if (copy_flowset(a, s->fs, 0))
+					return DNHT_SCAN_END;
+			}
+			if (a->flags & DN_C_SCH) {
+				if (copy_obj(a->start, a->end,
+					&s->sch, "sched", n))
+					return DNHT_SCAN_END;
+				/* list all attached flowsets */
+				if (copy_fsk_list(a, s, 0))
+					return DNHT_SCAN_END;
+			}
+			if (a->flags & DN_C_FLOW)
+				copy_si(a, s, 0);
+			break;
+		}
+	} else if (a->type == DN_FS) {
+		/* queue show, skip internal flowsets */
+		struct dn_fsk *fs = _o;
+
+		n = fs->fs.fs_nr;
+		if (n >= DN_MAX_ID)
+			return 0;
+		/* see if the object is within one of our ranges */
+		for (;r < lim; r += 2) {
+			if (n < r[0] || n > r[1])
+				continue;
+			if (copy_flowset(a, fs, 0))
+				return DNHT_SCAN_END;
+			copy_q(a, fs, 0);
+			break; /* we are done */
+		}
+	}
+	return 0;
+}
+/* look up scheduler number 'i' in schedhash, without creating it */
+static inline struct dn_schk *
+locate_scheduler(int i)
+{
+	return dn_ht_find(dn_cfg.schedhash, i, 0, NULL);
+}
+
+/*
+ * red
parameters are in fixed point arithmetic.
+ *
+ * Recompute the RED state of flowset 'fs' from its configuration, the
+ * bandwidth of the link of its scheduler and the red_* values in
+ * dn_cfg, and rebuild the w_q_lookup table.
+ * Returns 0 on success, EINVAL if red_lookup_depth is 0, ENOSPC if the
+ * table cannot be allocated; in both error cases the RED flags of the
+ * flowset are cleared and no lookup table is left attached.
+ * NOTE(review): fs->w_q (in div64) and, for gentle RED, fs->fs.max_th
+ * are still used as divisors without a check; they presumably are
+ * validated by the configuration program -- confirm.
+ */
+static int
+config_red(struct dn_fsk *fs)
+{
+	int64_t s, idle, weight, w0;
+	int t, i;
+
+	/*
+	 * red_lookup_depth is a divisor in the computation of lookup_step,
+	 * so it must be rejected before any arithmetic is done.  The old
+	 * table, if any, is released on this path as it always was.
+	 */
+	if (dn_cfg.red_lookup_depth == 0) {
+		if (fs->w_q_lookup) {
+			free(fs->w_q_lookup, M_DUMMYNET);
+			fs->w_q_lookup = NULL;
+		}
+		printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth"
+		    " must be > 0\n");
+		fs->fs.flags &= ~DN_IS_RED;
+		fs->fs.flags &= ~DN_IS_GENTLE_RED;
+		return (EINVAL);
+	}
+
+	fs->w_q = fs->fs.w_q;
+	fs->max_p = fs->fs.max_p;
+	ND("called");
+	/* Doing stuff that was in userland */
+	i = fs->sched->link.bandwidth;
+	s = (i <= 0) ? 0 :
+		hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i;
+
+	idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */
+	fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth);
+	/* fs->lookup_step not scaled, */
+	if (!fs->lookup_step)
+		fs->lookup_step = 1;
+	w0 = weight = SCALE(1) - fs->w_q; //fs->w_q scaled
+
+	for (t = fs->lookup_step; t > 1; --t)
+		weight = SCALE_MUL(weight, w0);
+	fs->lookup_weight = (int)(weight); // scaled
+
+	/* Now doing stuff that was in kernel land */
+	fs->min_th = SCALE(fs->fs.min_th);
+	fs->max_th = SCALE(fs->fs.max_th);
+
+	/* equal thresholds would make the slope a division by zero */
+	if (fs->fs.max_th == fs->fs.min_th)
+		fs->c_1 = fs->max_p;
+	else
+		fs->c_1 = fs->max_p / (fs->fs.max_th - fs->fs.min_th);
+	fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th));
+
+	if (fs->fs.flags & DN_IS_GENTLE_RED) {
+		fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th;
+		fs->c_4 = SCALE(1) - 2 * fs->max_p;
+	}
+
+	/* If the lookup table already exist, free and create it again. */
+	if (fs->w_q_lookup) {
+		free(fs->w_q_lookup, M_DUMMYNET);
+		fs->w_q_lookup = NULL;
+	}
+	fs->lookup_depth = dn_cfg.red_lookup_depth;
+	fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int),
+	    M_DUMMYNET, M_NOWAIT);
+	if (fs->w_q_lookup == NULL) {
+		printf("dummynet: sorry, cannot allocate red lookup table\n");
+		fs->fs.flags &= ~DN_IS_RED;
+		fs->fs.flags &= ~DN_IS_GENTLE_RED;
+		return(ENOSPC);
+	}
+
+	/* Fill the lookup table with (1 - w_q)^x */
+	fs->w_q_lookup[0] = SCALE(1) - fs->w_q;
+
+	for (i = 1; i < fs->lookup_depth; i++)
+		fs->w_q_lookup[i] =
+		    SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight);
+
+	if (dn_cfg.red_avg_pkt_size < 1)
+		dn_cfg.red_avg_pkt_size = 512;
+	fs->avg_pkt_size = dn_cfg.red_avg_pkt_size;
+	if (dn_cfg.red_max_pkt_size < 1)
+		dn_cfg.red_max_pkt_size = 1500;
+	fs->max_pkt_size = dn_cfg.red_max_pkt_size;
+	ND("exit");
+	return 0;
+}
+
+/* Scan all flowset attached to this scheduler and update red */
+static void
+update_red(struct dn_schk *s)
+{
+	struct dn_fsk *fs;
+	SLIST_FOREACH(fs, &s->fsk_list, sch_chain) {
+		if (fs && (fs->fs.flags & DN_IS_RED))
+			config_red(fs);
+	}
+}
+
+/*
+ * attach flowset to scheduler s, possibly requeue
+ * The flowset is moved from the unlinked list to the list of 's', its
+ * fsk_mask is computed as the flowset mask ORed with the scheduler
+ * mask (if any), DN_QHT_HASH is set according to that mask and the RED
+ * parameters are recomputed when the flowset uses RED.
+ */
+static void
+fsk_attach(struct dn_fsk *fs, struct dn_schk *s)
+{
+	ND("remove fs %d from fsunlinked, link to sched %d",
+		fs->fs.fs_nr, s->sch.sched_nr);
+	SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain);
+	fs->sched = s;
+	SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain);
+	if (s->fp->new_fsk)
+		s->fp->new_fsk(fs);
+	/* XXX compute fsk_mask */
+	fs->fsk_mask = fs->fs.flow_mask;
+	if (fs->sched->sch.flags & DN_HAVE_MASK)
+		flow_id_or(&fs->sched->sch.sched_mask, &fs->fsk_mask);
+	if (fs->qht) {
+		/*
+		 * we must drain qht according to the old
+		 * type, and reinsert according to the new one.
+		 * The requeue is complex -- in general we need to
+		 * reclassify every single packet.
+		 * For the time being, let's hope qht is never set
+		 * when we reach this point.
+		 */
+		D("XXX TODO requeue from fs %d to sch %d",
+			fs->fs.fs_nr, s->sch.sched_nr);
+		fs->qht = NULL;
+	}
+	/* set the new type for qht */
+	if (nonzero_mask(&fs->fsk_mask))
+		fs->fs.flags |= DN_QHT_HASH;
+	else
+		fs->fs.flags &= ~DN_QHT_HASH;
+
+	/* XXX config_red() can fail... */
+	if (fs->fs.flags & DN_IS_RED)
+		config_red(fs);
+}
+
+/*
+ * update all flowsets which may refer to this scheduler
+ * i.e. attach to 's' every flowset of the unlinked list whose
+ * sched_nr is the number of 's'.
+ */
+static void
+update_fs(struct dn_schk *s)
+{
+	struct dn_fsk *fs, *tmp;
+
+	SLIST_FOREACH_SAFE(fs, &dn_cfg.fsu, sch_chain, tmp) {
+		if (s->sch.sched_nr != fs->fs.sched_nr) {
+			D("fs %d for sch %d not %d still unlinked",
+				fs->fs.fs_nr, fs->fs.sched_nr,
+				s->sch.sched_nr);
+			continue;
+		}
+		fsk_attach(fs, s);
+	}
+}
+
+/*
+ * Configuration -- to preserve backward compatibility we use
+ * the following scheme (N is 65536)
+ *	NUMBER		SCHED	LINK	FLOWSET
+ *	   1 ..  N-1	(1)WFQ	(2)WFQ	(3)queue
+ *	 N+1 .. 2N-1	(4)FIFO (5)FIFO	(6)FIFO for sched 1..N-1
+ *	2N+1 .. 3N-1	--	--	(7)FIFO for sched N+1..2N-1
+ *
+ * "pipe i config" configures #1, #2 and #3
+ * "sched i config" configures #1 and possibly #6
+ * "queue i config" configures #3
+ * #1 is configured with 'pipe i config' or 'sched i config'
+ * #2 is configured with 'pipe i config', and created if not
+ *	existing with 'sched i config'
+ * #3 is configured with 'queue i config'
+ * #4 is automatically configured after #1, can only be FIFO
+ * #5 is automatically configured after #2
+ * #6 is automatically created when #1 is !MULTIQUEUE,
+ *	and can be updated.
+ * #7 is automatically configured after #2
+ */
+
+/*
+ * configure a link (and its FIFO instance)
+ * 'p' comes from the config program; its delay and burst are rescaled
+ * in place before use.  The parameters are applied to scheduler
+ * p->link_nr and to scheduler p->link_nr + DN_MAX_ID, dropping any
+ * profile attached to them; 'arg' is not used.
+ * Returns 0 on success, EINVAL on a bad length or number, or if one of
+ * the two schedulers does not exist (in which case the first one may
+ * already have been updated).
+ */
+static int
+config_link(struct dn_link *p, struct dn_id *arg)
+{
+	int i;
+
+	if (p->oid.len != sizeof(*p)) {
+		D("invalid pipe len %d", p->oid.len);
+		return EINVAL;
+	}
+	i = p->link_nr;
+	if (i <= 0 || i >= DN_MAX_ID)
+		return EINVAL;
+	/*
+	 * The config program passes parameters as follows:
+	 * bw = bits/second (0 means no limits),
+	 * delay = ms, must be translated into ticks.
+	 * qsize = slots/bytes
+	 * burst ???
+	 */
+	p->delay = (p->delay * hz) / 1000;
+	/* Scale burst size: bytes -> bits * hz */
+	p->burst *= 8 * hz;
+
+	DN_BH_WLOCK();
+	/* do it twice, base link and FIFO link */
+	for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) {
+		struct dn_schk *s = locate_scheduler(i);
+		if (s == NULL) {
+			DN_BH_WUNLOCK();
+			D("sched %d not found", i);
+			return EINVAL;
+		}
+		/* remove profile if exists */
+		if (s->profile) {
+			free(s->profile, M_DUMMYNET);
+			s->profile = NULL;
+		}
+		/* copy all parameters */
+		s->link.oid = p->oid;
+		s->link.link_nr = i;
+		s->link.delay = p->delay;
+		if (s->link.bandwidth != p->bandwidth) {
+			/* XXX bandwidth changes, need to update red params */
+			s->link.bandwidth = p->bandwidth;
+			update_red(s);
+		}
+		s->link.burst = p->burst;
+		schk_reset_credit(s);
+	}
+	dn_cfg.id++;
+	DN_BH_WUNLOCK();
+	return 0;
+}
+
+/*
+ * configure a flowset.
Can be called from inside with locked=1,
+ * in which case the write lock is not taken here.
+ * 'nfs' is the requested configuration (qsize and buckets are bounded
+ * in place); 'arg' is not used.  A flowset that does not exist yet is
+ * created only when nfs->sched_nr is not 0.
+ * Returns the kernel flowset, or NULL on a bad length or number, or
+ * when the flowset is missing and cannot be created.
+ */
+static struct dn_fsk *
+config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked)
+{
+	int i;
+	struct dn_fsk *fs;
+
+	if (nfs->oid.len != sizeof(*nfs)) {
+		D("invalid flowset len %d", nfs->oid.len);
+		return NULL;
+	}
+	i = nfs->fs_nr;
+	if (i <= 0 || i >= 3*DN_MAX_ID)
+		return NULL;
+	ND("flowset %d", i);
+	/* XXX other sanity checks */
+	if (nfs->flags & DN_QSIZE_BYTES) {
+		ipdn_bound_var(&nfs->qsize, 16384,
+		    1500, dn_cfg.byte_limit, NULL); // "queue byte size");
+	} else {
+		ipdn_bound_var(&nfs->qsize, 50,
+		    1, dn_cfg.slot_limit, NULL); // "queue slot size");
+	}
+	if (nfs->flags & DN_HAVE_MASK) {
+		/* make sure we have some buckets */
+		ipdn_bound_var((int *)&nfs->buckets, dn_cfg.hash_size,
+			1, dn_cfg.max_hash_size, "flowset buckets");
+	} else {
+		nfs->buckets = 1;	/* we only need 1 */
+	}
+	if (!locked)
+		DN_BH_WLOCK();
+	if (dn_cfg.fshash == NULL)
+		dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size,
+					offsetof(struct dn_fsk, fsk_next),
+					fsk_hash, fsk_match, fsk_new);
+	do { /* exit with break when done */
+	    struct dn_schk *s;
+	    int flags = nfs->sched_nr ? DNHT_INSERT : 0;
+	    int j;
+	    int oldc = dn_cfg.fsk_count;
+	    fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL);
+	    if (fs == NULL) {
+		D("missing sched for flowset %d", i);
+		break;
+	    }
+	    /* grab some defaults from the existing one */
+	    if (nfs->sched_nr == 0) /* reuse */
+		nfs->sched_nr = fs->fs.sched_nr;
+	    for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) {
+		if (nfs->par[j] == -1) /* reuse */
+		    nfs->par[j] = fs->fs.par[j];
+	    }
+	    if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) {
+		ND("flowset %d unchanged", i);
+		break; /* no change, nothing to do */
+	    }
+	    if (oldc != dn_cfg.fsk_count)	/* new item */
+		dn_cfg.id++;
+	    s = locate_scheduler(nfs->sched_nr);
+	    /* detach from old scheduler if needed, preserving
+	     * queues if we need to reattach. Then update the
+	     * configuration, and possibly attach to the new sched.
+	     */
+	    DX(2, "fs %d changed sched %d@%p to %d@%p",
+		fs->fs.fs_nr,
+		fs->fs.sched_nr, fs->sched, nfs->sched_nr, s);
+	    if (fs->sched) {
+		int flags = s ? DN_DETACH : (DN_DETACH | DN_DESTROY);
+		flags |= DN_DESTROY; /* XXX temporary */
+		fsk_detach(fs, flags);
+	    }
+	    fs->fs = *nfs; /* copy configuration */
+	    if (s != NULL)
+		fsk_attach(fs, s);
+	} while (0);
+	if (!locked)
+		DN_BH_WUNLOCK();
+	return fs;
+}
+
+/*
+ * config/reconfig a scheduler and its FIFO variant.
+ * For !MULTIQUEUE schedulers, also set up the flowset.
+ *
+ * On reconfigurations (detected because s->fp is set),
+ * detach existing flowsets preserving traffic, preserve link,
+ * and delete the old scheduler creating a new one.
+ *
+ * '_nsch' is the requested configuration and is modified in place;
+ * 'arg' is stored in s->cfg.  The body runs once for scheduler number
+ * sched_nr and once for its FIFO twin (sched_nr + DN_MAX_ID).
+ * Returns 0 on success, EINVAL on a bad length, number or scheduler
+ * type, ENOMEM when an allocation fails.
+ */
+static int
+config_sched(struct dn_sch *_nsch, struct dn_id *arg)
+{
+	struct dn_schk *s;
+	struct schk_new_arg a; /* argument for schk_new */
+	int i;
+	struct dn_link p;	/* copy of oldlink */
+	struct dn_profile *pf = NULL;	/* copy of old link profile */
+	/* Used to preserve mask parameter */
+	struct ipfw_flow_id new_mask;
+	int new_buckets = 0;
+	int new_flags = 0;
+	int pipe_cmd;
+	int err = ENOMEM;
+
+	a.sch = _nsch;
+	if (a.sch->oid.len != sizeof(*a.sch)) {
+		D("bad sched len %d", a.sch->oid.len);
+		return EINVAL;
+	}
+	i = a.sch->sched_nr;
+	if (i <= 0 || i >= DN_MAX_ID)
+		return EINVAL;
+	/* make sure we have some buckets */
+	if (a.sch->flags & DN_HAVE_MASK)
+		ipdn_bound_var((int *)&a.sch->buckets, dn_cfg.hash_size,
+			1, dn_cfg.max_hash_size, "sched buckets");
+	/* XXX other sanity checks */
+	bzero(&p, sizeof(p));
+
+	pipe_cmd = a.sch->flags & DN_PIPE_CMD;
+	a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if is not set?
+	if (pipe_cmd) {
+		/* Copy mask parameter */
+		new_mask = a.sch->sched_mask;
+		new_buckets = a.sch->buckets;
+		new_flags = a.sch->flags;
+	}
+	DN_BH_WLOCK();
+	if (dn_cfg.schedhash == NULL)
+		dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size,
+					offsetof(struct dn_schk, schk_next),
+					schk_hash, schk_match, schk_new);
+again: /* run twice, for wfq and fifo */
+	/*
+	 * lookup the type. If not supplied, use the previous one
+	 * or default to WF2Q+. Otherwise, return an error.
+	 */
+	dn_cfg.id++;
+	a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name);
+	if (a.fp != NULL) {
+		/* found. Lookup or create entry */
+		s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a);
+	} else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) {
+		/* No type. search existing s* or retry with WF2Q+ */
+		s = dn_ht_find(dn_cfg.schedhash, i, 0, &a);
+		if (s != NULL) {
+			a.fp = s->fp;
+			/* Scheduler exists, skip to FIFO scheduler
+			 * if command was pipe config...
+			 */
+			if (pipe_cmd)
+				goto next;
+		} else {
+			/* New scheduler, create a wf2q+ with no mask
+			 * if command was pipe config...
+			 */
+			if (pipe_cmd) {
+				/* clear mask parameter */
+				bzero(&a.sch->sched_mask, sizeof(new_mask));
+				a.sch->buckets = 0;
+				a.sch->flags &= ~DN_HAVE_MASK;
+			}
+			a.sch->oid.subtype = DN_SCHED_WF2QP;
+			goto again;
+		}
+	} else {
+		D("invalid scheduler type %d %s",
+			a.sch->oid.subtype, a.sch->name);
+		err = EINVAL;
+		goto error;
+	}
+	/* normalize name and subtype */
+	a.sch->oid.subtype = a.fp->type;
+	bzero(a.sch->name, sizeof(a.sch->name));
+	strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name));
+	if (s == NULL) {
+		D("cannot allocate scheduler %d", i);
+		goto error;
+	}
+	/* restore existing link if any */
+	if (p.link_nr) {
+		s->link = p;
+		if (!pf || pf->link_nr != p.link_nr) { /* no saved value */
+			s->profile = NULL; /* XXX maybe not needed */
+		} else {
+			/*
+			 * The scheduler reached here replaces the one
+			 * deleted below, and schk_new() leaves its profile
+			 * pointer NULL: the size must come from the saved
+			 * copy 'pf', never from s->profile.
+			 */
+			size_t pf_size = sizeof(struct dn_profile) +
+				pf->samples_no * sizeof(int);
+
+			s->profile = malloc(pf_size,
+					     M_DUMMYNET, M_NOWAIT | M_ZERO);
+			if (s->profile == NULL) {
+				D("cannot allocate profile");
+				goto error; //XXX
+			}
+			/*
+			 * Only sizeof(*pf) bytes were saved in 'pf', so
+			 * copy no more than that.
+			 * NOTE(review): samples stored past the end of
+			 * struct dn_profile, if any, were not saved and
+			 * are left zeroed here -- confirm against the
+			 * definition of struct dn_profile.
+			 */
+			bcopy(pf, s->profile, sizeof(*pf));
+		}
+	}
+	p.link_nr = 0;
+	if (s->fp == NULL) {
+		DX(2, "sched %d new type %s", i, a.fp->name);
+	} else if (s->fp != a.fp ||
+			bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) {
+		/* already existing. */
+		DX(2, "sched %d type changed from %s to %s",
+			i, s->fp->name, a.fp->name);
+		DX(4, "   type/sub %d/%d -> %d/%d",
+			s->sch.oid.type, s->sch.oid.subtype,
+			a.sch->oid.type, a.sch->oid.subtype);
+		if (s->link.link_nr == 0)
+			D("XXX WARNING link 0 for sched %d", i);
+		p = s->link;	/* preserve link */
+		if (s->profile) {/* preserve profile */
+			if (!pf)
+				pf = malloc(sizeof(*pf),
+				    M_DUMMYNET, M_NOWAIT | M_ZERO);
+			if (pf)	/* XXX should issue a warning otherwise */
+				bcopy(s->profile, pf, sizeof(*pf));
+		}
+		/* remove from the hash */
+		dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL);
+		/* Detach flowsets, preserve queues. */
+		// schk_delete_cb(s, NULL);
+		// XXX temporarily, kill queues
+		schk_delete_cb(s, (void *)DN_DESTROY);
+		goto again;
+	} else {
+		DX(4, "sched %d unchanged type %s", i, a.fp->name);
+	}
+	/* complete initialization */
+	s->sch = *a.sch;
+	s->fp = a.fp;
+	s->cfg = arg;
+	// XXX schk_reset_credit(s);
+	/* create the internal flowset if needed,
+	 * trying to reuse existing ones if available
+	 */
+	if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) {
+		s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL);
+		if (!s->fs) {
+			struct dn_fs fs;
+			bzero(&fs, sizeof(fs));
+			set_oid(&fs.oid, DN_FS, sizeof(fs));
+			fs.fs_nr = i + DN_MAX_ID;
+			fs.sched_nr = i;
+			s->fs = config_fs(&fs, NULL, 1 /* locked */);
+		}
+		if (!s->fs) {
+			/*
+			 * NOTE(review): 's' is freed here without being
+			 * removed from schedhash first -- confirm.
+			 */
+			schk_delete_cb(s, (void *)DN_DESTROY);
+			D("error creating internal fs for %d", i);
+			goto error;
+		}
+	}
+	/* call init function after the flowset is created */
+	if (s->fp->config)
+		s->fp->config(s);
+	update_fs(s);
+next:
+	if (i < DN_MAX_ID) { /* now configure the FIFO instance */
+		i += DN_MAX_ID;
+		if (pipe_cmd) {
+			/* Restore mask parameter for FIFO */
+			a.sch->sched_mask = new_mask;
+			a.sch->buckets = new_buckets;
+			a.sch->flags = new_flags;
+		} else {
+			/* sched config shouldn't modify the FIFO scheduler */
+			if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) {
+				/* FIFO already exist, don't touch it */
+				err = 0; /* and this is not an error */
+				goto error;
+			}
+		}
+		a.sch->sched_nr = i;
+		a.sch->oid.subtype = DN_SCHED_FIFO;
+		bzero(a.sch->name, sizeof(a.sch->name));
+		goto again;
+	}
+	err = 0;
+error:
+	DN_BH_WUNLOCK();
+	if (pf)
+		free(pf, M_DUMMYNET);
+	return err;
+}
+
+/*
+ * attach a profile to a link
+ */
+static int
+config_profile(struct dn_profile *pf, struct dn_id *arg)
+{
+	struct dn_schk *s;
+	int i, olen, err = 0;
+
+	if (pf->oid.len < sizeof(*pf)) {
+		D("short profile len %d", pf->oid.len);
+		return EINVAL;
+	}
+	i = pf->link_nr;
+	if (i <= 0 || i >= DN_MAX_ID)
+		return EINVAL;
+	/* XXX other sanity checks */
+	DN_BH_WLOCK();
+	for
(; i < 2*DN_MAX_ID; i += DN_MAX_ID) { + s = locate_scheduler(i); + + if (s == NULL) { + err = EINVAL; + break; + } + dn_cfg.id++; + /* + * If we had a profile and the new one does not fit, + * or it is deleted, then we need to free memory. + */ + if (s->profile && (pf->samples_no == 0 || + s->profile->oid.len < pf->oid.len)) { + free(s->profile, M_DUMMYNET); + s->profile = NULL; + } + if (pf->samples_no == 0) + continue; + /* + * new profile, possibly allocate memory + * and copy data. + */ + if (s->profile == NULL) + s->profile = malloc(pf->oid.len, + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (s->profile == NULL) { + D("no memory for profile %d", i); + err = ENOMEM; + break; + } + /* preserve larger length XXX double check */ + olen = s->profile->oid.len; + if (olen < pf->oid.len) + olen = pf->oid.len; + bcopy(pf, s->profile, pf->oid.len); + s->profile->oid.len = olen; + } + DN_BH_WUNLOCK(); + return err; +} + +/* + * Delete all objects: + */ +static void +dummynet_flush(void) +{ + + /* delete all schedulers and related links/queues/flowsets */ + dn_ht_scan(dn_cfg.schedhash, schk_delete_cb, + (void *)(uintptr_t)DN_DELETE_FS); + /* delete all remaining (unlinked) flowsets */ + DX(4, "still %d unlinked fs", dn_cfg.fsk_count); + dn_ht_free(dn_cfg.fshash, DNHT_REMOVE); + fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS); + + dn_ht_free(dn_cfg.schedhash, DNHT_REMOVE); + /* Reinitialize system heap... */ + heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); +} + +/* + * Main handler for configuration. We are guaranteed to be called + * with an oid which is at least a dn_id. + * - the first object is the command (config, delete, flush, ...) + * - config_link must be issued after the corresponding config_sched + * - parameters (DN_TXT) for an object must preceed the object + * processed on a config_sched. 
+ */ +int +do_config(void *p, int l) +{ + struct dn_id *next, *o; + int err = 0, err2 = 0; + struct dn_id *arg = NULL; + uintptr_t *a; + + o = p; + if (o->id != DN_API_VERSION) { + D("invalid api version got %d need %d", + o->id, DN_API_VERSION); + return EINVAL; + } + for (; l >= sizeof(*o); o = next) { + struct dn_id *prev = arg; + if (o->len < sizeof(*o) || l < o->len) { + D("bad len o->len %d len %d", o->len, l); + err = EINVAL; + break; + } + l -= o->len; + next = (struct dn_id *)((char *)o + o->len); + err = 0; + switch (o->type) { + default: + D("cmd %d not implemented", o->type); + break; + +#ifdef EMULATE_SYSCTL + /* sysctl emulation. + * if we recognize the command, jump to the correct + * handler and return + */ + case DN_SYSCTL_SET: + err = kesysctl_emu_set(p, l); + return err; +#endif + + case DN_CMD_CONFIG: /* simply a header */ + break; + + case DN_CMD_DELETE: + /* the argument is in the first uintptr_t after o */ + a = (uintptr_t *)(o+1); + if (o->len < sizeof(*o) + sizeof(*a)) { + err = EINVAL; + break; + } + switch (o->subtype) { + case DN_LINK: + /* delete base and derived schedulers */ + DN_BH_WLOCK(); + err = delete_schk(*a); + err2 = delete_schk(*a + DN_MAX_ID); + DN_BH_WUNLOCK(); + if (!err) + err = err2; + break; + + default: + D("invalid delete type %d", + o->subtype); + err = EINVAL; + break; + + case DN_FS: + err = (*a <1 || *a >= DN_MAX_ID) ? 
+ EINVAL : delete_fs(*a, 0) ; + break; + } + break; + + case DN_CMD_FLUSH: + DN_BH_WLOCK(); + dummynet_flush(); + DN_BH_WUNLOCK(); + break; + case DN_TEXT: /* store argument the next block */ + prev = NULL; + arg = o; + break; + case DN_LINK: + err = config_link((struct dn_link *)o, arg); + break; + case DN_PROFILE: + err = config_profile((struct dn_profile *)o, arg); + break; + case DN_SCH: + err = config_sched((struct dn_sch *)o, arg); + break; + case DN_FS: + err = (NULL==config_fs((struct dn_fs *)o, arg, 0)); + break; + } + if (prev) + arg = NULL; + if (err != 0) + break; + } + return err; +} + +static int +compute_space(struct dn_id *cmd, struct copy_args *a) +{ + int x = 0, need = 0; + int profile_size = sizeof(struct dn_profile); + + /* NOTE about compute space: + * NP = dn_cfg.schk_count + * NSI = dn_cfg.si_count + * NF = dn_cfg.fsk_count + * NQ = dn_cfg.queue_count + * - ipfw pipe show + * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler + * link, scheduler template, flowset + * integrated in scheduler and header + * for flowset list + * (NSI)*(dn_flow) all scheduler instance (includes + * the queue instance) + * - ipfw sched show + * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler + * link, scheduler template, flowset + * integrated in scheduler and header + * for flowset list + * (NSI * dn_flow) all scheduler instances + * (NF * sizeof(uint_32)) space for flowset list linked to scheduler + * (NQ * dn_queue) all queue [XXXfor now not listed] + * - ipfw queue show + * (NF * dn_fs) all flowset + * (NQ * dn_queue) all queues + */ + switch (cmd->subtype) { + default: + return -1; + /* XXX where do LINK and SCH differ ? */ + /* 'ipfw sched show' could list all queues associated to + * a scheduler. 
This feature for now is disabled + */ + case DN_LINK: /* pipe show */ + x = DN_C_LINK | DN_C_SCH | DN_C_FLOW; + need += dn_cfg.schk_count * + (sizeof(struct dn_fs) + profile_size) / 2; + need += dn_cfg.fsk_count * sizeof(uint32_t); + break; + case DN_SCH: /* sched show */ + need += dn_cfg.schk_count * + (sizeof(struct dn_fs) + profile_size) / 2; + need += dn_cfg.fsk_count * sizeof(uint32_t); + x = DN_C_SCH | DN_C_LINK | DN_C_FLOW; + break; + case DN_FS: /* queue show */ + x = DN_C_FS | DN_C_QUEUE; + break; + case DN_GET_COMPAT: /* compatibility mode */ + need = dn_compat_calc_size(); + break; + } + a->flags = x; + if (x & DN_C_SCH) { + need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2; + /* NOT also, each fs might be attached to a sched */ + need += dn_cfg.schk_count * sizeof(struct dn_id) / 2; + } + if (x & DN_C_FS) + need += dn_cfg.fsk_count * sizeof(struct dn_fs); + if (x & DN_C_LINK) { + need += dn_cfg.schk_count * sizeof(struct dn_link) / 2; + } + /* + * When exporting a queue to userland, only pass up the + * struct dn_flow, which is the only visible part. + */ + + if (x & DN_C_QUEUE) + need += dn_cfg.queue_count * sizeof(struct dn_flow); + if (x & DN_C_FLOW) + need += dn_cfg.si_count * (sizeof(struct dn_flow)); + return need; +} + +/* + * If compat != NULL dummynet_get is called in compatibility mode. 
+ * *compat will be the pointer to the buffer to pass to ipfw + */ +int +dummynet_get(struct sockopt *sopt, void **compat) +{ + int have, i, need, error; + char *start = NULL, *buf; + size_t sopt_valsize; + struct dn_id *cmd; + struct copy_args a; + struct copy_range r; + int l = sizeof(struct dn_id); + + bzero(&a, sizeof(a)); + bzero(&r, sizeof(r)); + + /* save and restore original sopt_valsize around copyin */ + sopt_valsize = sopt->sopt_valsize; + + cmd = &r.o; + + if (!compat) { + /* copy at least an oid, and possibly a full object */ + error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd)); + sopt->sopt_valsize = sopt_valsize; + if (error) + goto done; + l = cmd->len; +#ifdef EMULATE_SYSCTL + /* sysctl emulation. */ + if (cmd->type == DN_SYSCTL_GET) + return kesysctl_emu_get(sopt); +#endif + if (l > sizeof(r)) { + /* request larger than default, allocate buffer */ + cmd = malloc(l, M_DUMMYNET, M_WAITOK); + error = sooptcopyin(sopt, cmd, l, l); + sopt->sopt_valsize = sopt_valsize; + if (error) + goto done; + } + } else { /* compatibility */ + error = 0; + cmd->type = DN_CMD_GET; + cmd->len = sizeof(struct dn_id); + cmd->subtype = DN_GET_COMPAT; + // cmd->id = sopt_valsize; + D("compatibility mode"); + } + a.extra = (struct copy_range *)cmd; + if (cmd->len == sizeof(*cmd)) { /* no range, create a default */ + uint32_t *rp = (uint32_t *)(cmd + 1); + cmd->len += 2* sizeof(uint32_t); + rp[0] = 1; + rp[1] = DN_MAX_ID - 1; + if (cmd->subtype == DN_LINK) { + rp[0] += DN_MAX_ID; + rp[1] += DN_MAX_ID; + } + } + /* Count space (under lock) and allocate (outside lock). + * Exit with lock held if we manage to get enough buffer. + * Try a few times then give up. 
+ */ + for (have = 0, i = 0; i < 10; i++) { + DN_BH_WLOCK(); + need = compute_space(cmd, &a); + + /* if there is a range, ignore value from compute_space() */ + if (l > sizeof(*cmd)) + need = sopt_valsize - sizeof(*cmd); + + if (need < 0) { + DN_BH_WUNLOCK(); + error = EINVAL; + goto done; + } + need += sizeof(*cmd); + cmd->id = need; + if (have >= need) + break; + + DN_BH_WUNLOCK(); + if (start) + free(start, M_DUMMYNET); + start = NULL; + if (need > sopt_valsize) + break; + + have = need; + start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO); + } + + if (start == NULL) { + if (compat) { + *compat = NULL; + error = 1; // XXX + } else { + error = sooptcopyout(sopt, cmd, sizeof(*cmd)); + } + goto done; + } + ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, " + "%d:%d si %d, %d:%d queues %d", + dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH, + dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK, + dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS, + dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I, + dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE); + sopt->sopt_valsize = sopt_valsize; + a.type = cmd->subtype; + + if (compat == NULL) { + bcopy(cmd, start, sizeof(*cmd)); + ((struct dn_id*)(start))->len = sizeof(struct dn_id); + buf = start + sizeof(*cmd); + } else + buf = start; + a.start = &buf; + a.end = start + have; + /* start copying other objects */ + if (compat) { + a.type = DN_COMPAT_PIPE; + dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a); + a.type = DN_COMPAT_QUEUE; + dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a); + } else if (a.type == DN_FS) { + dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a); + } else { + dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a); + } + DN_BH_WUNLOCK(); + + if (compat) { + *compat = start; + sopt->sopt_valsize = buf - start; + /* free() is done by ip_dummynet_compat() */ + start = NULL; //XXX hack + } else { + error = sooptcopyout(sopt, start, buf - start); + } +done: + if (cmd && cmd != &r.o) + 
free(cmd, M_DUMMYNET); + if (start) + free(start, M_DUMMYNET); + return error; +} + +/* + * Functions to drain idle objects -- see dummynet_task() for some notes + */ +/* Callback called on scheduler instance to delete it if idle */ +static int +drain_scheduler_cb(void *_si, void *_arg) +{ + struct dn_sch_inst *si = _si; + int *arg = _arg; + int empty; + + if ( (*arg++) > dn_cfg.expire_object_examined) + return DNHT_SCAN_END; + + if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL) + return 0; + + /* + * if the scheduler is multiqueue, q_count also reflects empty + * queues that point to si, so we need to check si->q_count to + * tell whether we can remove the instance. + */ + if (si->ni.length == 0) { + /* si was marked as idle: + * remove it or increment idle_si_wait counter + */ + empty = (si->sched->fp->flags & DN_MULTIQUEUE) ? + (si->q_count == 0) : 1; + if (empty && + (si->idle_time < dn_cfg.curr_time - dn_cfg.object_idle_tick)) + return si_destroy(si, NULL); + else + dn_cfg.idle_si_wait++; + } + return 0; +} + +/* Callback called on scheduler to check if it has instances */ +static int +drain_scheduler_sch_cb(void *_s, void *_arg) +{ + struct dn_schk *s = _s; + int *arg = _arg; + + if (s->sch.flags & DN_HAVE_MASK) { + dn_ht_scan_bucket(s->siht, &s->drain_bucket, + drain_scheduler_cb, _arg); + } else { + if (s->siht) { + if (drain_scheduler_cb(s->siht, _arg) == DNHT_SCAN_DEL) + s->siht = NULL; + } + } + return ( (*arg++) > dn_cfg.expire_object_examined) ? 
DNHT_SCAN_END : 0; +} + +/* Called every tick, try to delete a 'bucket' of scheduler */ +void +dn_drain_scheduler(void) +{ + int arg = 0; + + dn_ht_scan_bucket(dn_cfg.schedhash, (int *)&dn_cfg.drain_sch, + drain_scheduler_sch_cb, &arg); +} + +/* Callback called on queue to delete if it is idle */ +static int +drain_queue_cb(void *_q, void *_arg) +{ + struct dn_queue *q = _q; + int *arg = _arg; + + if ( (*arg++) > dn_cfg.expire_object_examined) + return DNHT_SCAN_END; + + if (q->ni.length == 0) { + if (q->q_time < dn_cfg.curr_time - dn_cfg.object_idle_tick) { + if (dn_delete_queue(q, DN_DESTROY | DN_DEL_SAFE) == 0) + return DNHT_SCAN_DEL; /* queue is deleted */ + } else + dn_cfg.idle_queue_wait++; + } + + return 0; /* queue isn't deleted */ +} + +/* Callback called on flowset used to check if it has queues */ +static int +drain_queue_fs_cb(void *_fs, void *_arg) +{ + struct dn_fsk *fs = _fs; + int *arg = _arg; + + if (fs->fs.flags & DN_QHT_HASH) { + /* Flowset has a hash table for queues */ + dn_ht_scan_bucket(fs->qht, &fs->drain_bucket, + drain_queue_cb, _arg); + } else { + /* No hash table for this flowset, null the pointer + * if the queue is deleted + */ + if (fs->qht) { + if (drain_queue_cb(fs->qht, _arg) == DNHT_SCAN_DEL) + fs->qht = NULL; + } + } + return ( (*arg++) > dn_cfg.expire_object_examined) ? DNHT_SCAN_END : 0; +} + +/* Called every tick, try to delete a 'bucket' of queue */ +void +dn_drain_queue(void) +{ + int arg = 0; + + /* scan a bucket of flowset */ + dn_ht_scan_bucket(dn_cfg.fshash, (int *)&dn_cfg.drain_fs, + drain_queue_fs_cb, &arg); +} + +/* + * Handler for the various dummynet socket options + */ +static int +ip_dn_ctl(struct sockopt *sopt) +{ + void *p = NULL; + int error, l; + + error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET); + if (error) + return (error); + + /* Disallow sets in really-really secure mode. 
*/ + if (sopt->sopt_dir == SOPT_SET) { + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); + if (error) + return (error); + } + + switch (sopt->sopt_name) { + default : + D("dummynet: unknown option %d", sopt->sopt_name); + error = EINVAL; + break; + + case IP_DUMMYNET_FLUSH: + case IP_DUMMYNET_CONFIGURE: + case IP_DUMMYNET_DEL: /* remove a pipe or queue */ + case IP_DUMMYNET_GET: + D("dummynet: compat option %d", sopt->sopt_name); + error = ip_dummynet_compat(sopt); + break; + + case IP_DUMMYNET3 : + if (sopt->sopt_dir == SOPT_GET) { + error = dummynet_get(sopt, NULL); + break; + } + l = sopt->sopt_valsize; + if (l < sizeof(struct dn_id) || l > 12000) { + D("argument len %d invalid", l); + break; + } + p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ? + error = sooptcopyin(sopt, p, l, l); + if (error) + break ; + error = do_config(p, l); + break; + } + + if (p != NULL) + free(p, M_TEMP); + + return error ; +} + + +static void +ip_dn_init(void) +{ + if (dn_cfg.init_done) + return; + printf("DUMMYNET %p with IPv6 initialized (100409)\n", curvnet); + dn_cfg.init_done = 1; + /* Set defaults here. MSVC does not accept initializers, + * and this is also useful for vimages + */ + /* queue limits */ + dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */ + dn_cfg.byte_limit = 1024 * 1024; + dn_cfg.expire = 1; + + /* RED parameters */ + dn_cfg.red_lookup_depth = 256; /* default lookup table depth */ + dn_cfg.red_avg_pkt_size = 512; /* default medium packet size */ + dn_cfg.red_max_pkt_size = 1500; /* default max packet size */ + + /* hash tables */ + dn_cfg.max_hash_size = 1024; /* max in the hash tables */ + + if (dn_cfg.hash_size == 0) /* XXX or <= 0 ? */ + dn_cfg.hash_size = 64; /* default hash size */ + + /* hash tables for schedulers and flowsets are created + * when the first scheduler/flowset is inserted. + * This is done to allow to use the right hash_size value. 
+ * When the last object is deleted, the table is destroyed, + * so a new hash_size value can be used. + * XXX rehash is not supported for now + */ + dn_cfg.schedhash = NULL; + dn_cfg.fshash = NULL; + /* bucket index to drain object */ + dn_cfg.drain_fs = 0; + dn_cfg.drain_sch = 0; + + if (dn_cfg.expire_object == 0) + dn_cfg.expire_object = 50; + if (dn_cfg.object_idle_tick == 0) + dn_cfg.object_idle_tick = 1000; + if (dn_cfg.expire_object_examined == 0) + dn_cfg.expire_object_examined = 10; + if (dn_cfg.drain_ratio == 0) + dn_cfg.drain_ratio = 1; + + // XXX what if we don't have a tsc ? +#ifdef HAVE_TSC + dn_cfg.cycle_task_new = dn_cfg.cycle_task_old = readTSC(); +#endif + heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); + SLIST_INIT(&dn_cfg.fsu); + SLIST_INIT(&dn_cfg.schedlist); + + DN_LOCK_INIT(); + + TASK_INIT(&dn_task, 0, dummynet_task, curvnet); + dn_tq = taskqueue_create_fast("dummynet", M_NOWAIT, + taskqueue_thread_enqueue, &dn_tq); + taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet"); + + callout_init(&dn_timeout, CALLOUT_MPSAFE); + callout_reset_on(&dn_timeout, 1, dummynet, NULL, 0); + + /* Initialize curr_time adjustment mechanics. 
*/ + getmicrouptime(&dn_cfg.prev_t); +} + +#ifdef KLD_MODULE +static void +ip_dn_destroy(int last) +{ + callout_drain(&dn_timeout); + + DN_BH_WLOCK(); + if (last) { + ND("removing last instance\n"); + ip_dn_ctl_ptr = NULL; + ip_dn_io_ptr = NULL; + } + + dummynet_flush(); + DN_BH_WUNLOCK(); + taskqueue_drain(dn_tq, &dn_task); + taskqueue_free(dn_tq); + + dn_ht_free(dn_cfg.schedhash, 0); + dn_ht_free(dn_cfg.fshash, 0); + heap_free(&dn_cfg.evheap); + + DN_LOCK_DESTROY(); +} +#endif /* KLD_MODULE */ + +static int +dummynet_modevent(module_t mod, int type, void *data) +{ + + if (type == MOD_LOAD) { + if (ip_dn_io_ptr) { + printf("DUMMYNET already loaded\n"); + return EEXIST ; + } + ip_dn_init(); + ip_dn_ctl_ptr = ip_dn_ctl; + ip_dn_io_ptr = dummynet_io; + return 0; + } else if (type == MOD_UNLOAD) { +#if !defined(KLD_MODULE) + printf("dummynet statically compiled, cannot unload\n"); + return EINVAL ; +#else + ip_dn_destroy(1 /* last */); + return 0; +#endif + } else + return EOPNOTSUPP; +} + +/* modevent helpers for the modules */ +static int +load_dn_sched(struct dn_alg *d) +{ + struct dn_alg *s; + + if (d == NULL) + return 1; /* error */ + ip_dn_init(); /* just in case, we need the lock */ + + /* Check that mandatory funcs exists */ + if (d->enqueue == NULL || d->dequeue == NULL) { + D("missing enqueue or dequeue for %s", d->name); + return 1; + } + + /* Search if scheduler already exists */ + DN_BH_WLOCK(); + SLIST_FOREACH(s, &dn_cfg.schedlist, next) { + if (strcmp(s->name, d->name) == 0) { + D("%s already loaded", d->name); + break; /* scheduler already exists */ + } + } + if (s == NULL) + SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next); + DN_BH_WUNLOCK(); + D("dn_sched %s %sloaded", d->name, s ? "not ":""); + return s ? 
1 : 0; +} + +static int +unload_dn_sched(struct dn_alg *s) +{ + struct dn_alg *tmp, *r; + int err = EINVAL; + + ND("called for %s", s->name); + + DN_BH_WLOCK(); + SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) { + if (strcmp(s->name, r->name) != 0) + continue; + ND("ref_count = %d", r->ref_count); + err = (r->ref_count != 0) ? EBUSY : 0; + if (err == 0) + SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next); + break; + } + DN_BH_WUNLOCK(); + D("dn_sched %s %sunloaded", s->name, err ? "not ":""); + return err; +} + +int +dn_sched_modevent(module_t mod, int cmd, void *arg) +{ + struct dn_alg *sch = arg; + + if (cmd == MOD_LOAD) + return load_dn_sched(sch); + else if (cmd == MOD_UNLOAD) + return unload_dn_sched(sch); + else + return EINVAL; +} + +static moduledata_t dummynet_mod = { + "dummynet", dummynet_modevent, NULL +}; + +#define DN_SI_SUB SI_SUB_PROTO_IFATTACHDOMAIN +#define DN_MODEV_ORD (SI_ORDER_ANY - 128) /* after ipfw */ +DECLARE_MODULE(dummynet, dummynet_mod, DN_SI_SUB, DN_MODEV_ORD); +MODULE_DEPEND(dummynet, ipfw, 2, 2, 2); +MODULE_VERSION(dummynet, 3); + +/* + * Starting up. Done in order after dummynet_modevent() has been called. + * VNET_SYSINIT is also called for each existing vnet and each new vnet. + */ +//VNET_SYSINIT(vnet_dn_init, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_init, NULL); + +/* + * Shutdown handlers up shop. These are done in REVERSE ORDER, but still + * after dummynet_modevent() has been called. Not called on reboot. + * VNET_SYSUNINIT is also called for each exiting vnet as it exits. + * or when the module is unloaded. 
+ */ +//VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL); + +/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c new file mode 100644 index 0000000..c95f896 --- /dev/null +++ b/sys/netinet/ipfw/ip_fw2.c @@ -0,0 +1,2491 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw2.c 200601 2009-12-16 10:48:40Z luigi $"); + +/* + * The FreeBSD IP packet firewall, main file + */ + +#include "opt_ipfw.h" +#include "opt_ipdivert.h" +#include "opt_inet.h" +#ifndef INET +#error "IPFIREWALL requires INET" +#endif /* INET */ +#include "opt_inet6.h" +#include "opt_ipsec.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ETHERTYPE_IP */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#ifdef INET6 +#include +#include +#include +#endif + +#include /* XXX for in_cksum */ + +#ifdef MAC +#include +#endif + +/* + * static variables followed by global ones. + * All ipfw global variables are here. + */ + +/* ipfw_vnet_ready controls when we are open for business */ +static VNET_DEFINE(int, ipfw_vnet_ready) = 0; +#define V_ipfw_vnet_ready VNET(ipfw_vnet_ready) + +static VNET_DEFINE(int, fw_deny_unknown_exthdrs); +#define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) + +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT +static int default_to_accept = 1; +#else +static int default_to_accept; +#endif + +VNET_DEFINE(int, autoinc_step); + +/* + * Each rule belongs to one of 32 different sets (0..31). + * The variable set_disable contains one bit per set. + * If the bit is set, all rules in the corresponding set + * are disabled. Set RESVD_SET(31) is reserved for the default rule + * and rules that are not deleted by the flush command, + * and CANNOT be disabled. + * Rules in set RESVD_SET can only be deleted individually. + */ +VNET_DEFINE(u_int32_t, set_disable); +#define V_set_disable VNET(set_disable) + +VNET_DEFINE(int, fw_verbose); +/* counter for ipfw_log(NULL...) 
*/ +VNET_DEFINE(u_int64_t, norule_counter); +VNET_DEFINE(int, verbose_limit); + +/* layer3_chain contains the list of rules for layer 3 */ +VNET_DEFINE(struct ip_fw_chain, layer3_chain); + +ipfw_nat_t *ipfw_nat_ptr = NULL; +struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); +ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; +ipfw_nat_cfg_t *ipfw_nat_del_ptr; +ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; +ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; + +#ifdef SYSCTL_NODE +uint32_t dummy_def = IPFW_DEFAULT_RULE; +uint32_t dummy_tables_max = IPFW_TABLES_MAX; + +SYSBEGIN(f3) + +SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass, + CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, + "Only do a single pass through ipfw when using dummynet(4)"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, + CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, + "Rule number auto-increment step"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose, + CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, + "Log matches to ipfw rules"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, + CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, + "Set upper limit of matches of ipfw rules logged"); +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD, + &dummy_def, 0, + "The default/max possible rule number."); +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD, + &dummy_tables_max, 0, + "The maximum number of tables."); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, + &default_to_accept, 0, + "Make the default rule accept all packets."); +TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count, + CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, + "Number of static rules"); + +#ifdef INET6 +SYSCTL_DECL(_net_inet6_ip6); +SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); 
+SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, + CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0, + "Deny packets with unknown IPv6 Extension Headers"); +#endif /* INET6 */ + +SYSEND + +#endif /* SYSCTL_NODE */ + + +/* + * Some macros used in the various matching options. + * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T + * Other macros just cast void * into the appropriate type + */ +#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) +#define TCP(p) ((struct tcphdr *)(p)) +#define SCTP(p) ((struct sctphdr *)(p)) +#define UDP(p) ((struct udphdr *)(p)) +#define ICMP(p) ((struct icmphdr *)(p)) +#define ICMP6(p) ((struct icmp6_hdr *)(p)) + +static __inline int +icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) +{ + int type = icmp->icmp_type; + + return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<icmp_type; + + return (type <= ICMP_MAXTYPE && (TT & (1<arg1 or cmd->d[0]. + * + * We scan options and store the bits we find set. We succeed if + * + * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear + * + * The code is sometimes optimized not to store additional variables. 
+ */ + +static int +flags_match(ipfw_insn *cmd, u_int8_t bits) +{ + u_char want_clear; + bits = ~bits; + + if ( ((cmd->arg1 & 0xff) & bits) != 0) + return 0; /* some bits we want set were clear */ + want_clear = (cmd->arg1 >> 8) & 0xff; + if ( (want_clear & bits) != want_clear) + return 0; /* some bits we want clear were set */ + return 1; +} + +static int +ipopts_match(struct ip *ip, ipfw_insn *cmd) +{ + int optlen, bits = 0; + u_char *cp = (u_char *)(ip + 1); + int x = (ip->ip_hl << 2) - sizeof (struct ip); + + for (; x > 0; x -= optlen, cp += optlen) { + int opt = cp[IPOPT_OPTVAL]; + + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + optlen = cp[IPOPT_OLEN]; + if (optlen <= 0 || optlen > x) + return 0; /* invalid or truncated */ + } + switch (opt) { + + default: + break; + + case IPOPT_LSRR: + bits |= IP_FW_IPOPT_LSRR; + break; + + case IPOPT_SSRR: + bits |= IP_FW_IPOPT_SSRR; + break; + + case IPOPT_RR: + bits |= IP_FW_IPOPT_RR; + break; + + case IPOPT_TS: + bits |= IP_FW_IPOPT_TS; + break; + } + } + return (flags_match(cmd, bits)); +} + +static int +tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) +{ + int optlen, bits = 0; + u_char *cp = (u_char *)(tcp + 1); + int x = (tcp->th_off << 2) - sizeof(struct tcphdr); + + for (; x > 0; x -= optlen, cp += optlen) { + int opt = cp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + optlen = cp[1]; + if (optlen <= 0) + break; + } + + switch (opt) { + + default: + break; + + case TCPOPT_MAXSEG: + bits |= IP_FW_TCPOPT_MSS; + break; + + case TCPOPT_WINDOW: + bits |= IP_FW_TCPOPT_WINDOW; + break; + + case TCPOPT_SACK_PERMITTED: + case TCPOPT_SACK: + bits |= IP_FW_TCPOPT_SACK; + break; + + case TCPOPT_TIMESTAMP: + bits |= IP_FW_TCPOPT_TS; + break; + + } + } + return (flags_match(cmd, bits)); +} + +static int +iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) +{ + if (ifp == NULL) /* no iface with this packet, match fails */ + return 0; + /* Check by name or 
by IP address */ + if (cmd->name[0] != '\0') { /* match by name */ + /* Check name */ + if (cmd->p.glob) { + if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) + return(1); + } else { + if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) + return(1); + } + } else { +#ifdef __FreeBSD__ /* and OSX too ? */ + struct ifaddr *ia; + + if_addr_rlock(ifp); + TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { + if (ia->ifa_addr->sa_family != AF_INET) + continue; + if (cmd->p.ip.s_addr == ((struct sockaddr_in *) + (ia->ifa_addr))->sin_addr.s_addr) { + if_addr_runlock(ifp); + return(1); /* match */ + } + } + if_addr_runlock(ifp); +#endif /* __FreeBSD__ */ + } + return(0); /* no match, fail ... */ +} + +/* + * The verify_path function checks if a route to the src exists and + * if it is reachable via ifp (when provided). + * + * The 'verrevpath' option checks that the interface that an IP packet + * arrives on is the same interface that traffic destined for the + * packet's source address would be routed out of. + * The 'versrcreach' option just checks that the source address is + * reachable via any route (except default) in the routing table. + * These two are a measure to block forged packets. This is also + * commonly known as "anti-spoofing" or Unicast Reverse Path + * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs + * is purposely reminiscent of the Cisco IOS command, + * + * ip verify unicast reverse-path + * ip verify unicast source reachable-via any + * + * which implements the same functionality. But note that the syntax + * is misleading, and the check may be performed on all IP packets + * whether unicast, multicast, or broadcast. 
+ */ +static int +verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) +{ +#ifndef __FreeBSD__ + return 0; +#else + struct route ro; + struct sockaddr_in *dst; + + bzero(&ro, sizeof(ro)); + + dst = (struct sockaddr_in *)&(ro.ro_dst); + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = src; + in_rtalloc_ign(&ro, 0, fib); + + if (ro.ro_rt == NULL) + return 0; + + /* + * If ifp is provided, check for equality with rtentry. + * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, + * in order to pass packets injected back by if_simloop(): + * if useloopback == 1 routing entry (via lo0) for our own address + * may exist, so we need to handle routing assymetry. + */ + if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { + RTFREE(ro.ro_rt); + return 0; + } + + /* if no ifp provided, check if rtentry is not default route */ + if (ifp == NULL && + satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) { + RTFREE(ro.ro_rt); + return 0; + } + + /* or if this is a blackhole/reject route */ + if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + RTFREE(ro.ro_rt); + return 0; + } + + /* found valid route */ + RTFREE(ro.ro_rt); + return 1; +#endif /* __FreeBSD__ */ +} + +#ifdef INET6 +/* + * ipv6 specific rules here... 
+ */ +static __inline int +icmp6type_match (int type, ipfw_insn_u32 *cmd) +{ + return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); +} + +static int +flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) +{ + int i; + for (i=0; i <= cmd->o.arg1; ++i ) + if (curr_flow == cmd->d[i] ) + return 1; + return 0; +} + +/* support for IP6_*_ME opcodes */ +static int +search_ip6_addr_net (struct in6_addr * ip6_addr) +{ + struct ifnet *mdc; + struct ifaddr *mdc2; + struct in6_ifaddr *fdm; + struct in6_addr copia; + + TAILQ_FOREACH(mdc, &V_ifnet, if_link) { + if_addr_rlock(mdc); + TAILQ_FOREACH(mdc2, &mdc->if_addrhead, ifa_link) { + if (mdc2->ifa_addr->sa_family == AF_INET6) { + fdm = (struct in6_ifaddr *)mdc2; + copia = fdm->ia_addr.sin6_addr; + /* need for leaving scope_id in the sock_addr */ + in6_clearscope(&copia); + if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) { + if_addr_runlock(mdc); + return 1; + } + } + } + if_addr_runlock(mdc); + } + return 0; +} + +static int +verify_path6(struct in6_addr *src, struct ifnet *ifp) +{ + struct route_in6 ro; + struct sockaddr_in6 *dst; + + bzero(&ro, sizeof(ro)); + + dst = (struct sockaddr_in6 * )&(ro.ro_dst); + dst->sin6_family = AF_INET6; + dst->sin6_len = sizeof(*dst); + dst->sin6_addr = *src; + /* XXX MRT 0 for ipv6 at this time */ + rtalloc_ign((struct route *)&ro, 0); + + if (ro.ro_rt == NULL) + return 0; + + /* + * if ifp is provided, check for equality with rtentry + * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, + * to support the case of sending packets to an address of our own. 
+ * (where the former interface is the first argument of if_simloop() + * (=ifp), the latter is lo0) + */ + if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { + RTFREE(ro.ro_rt); + return 0; + } + + /* if no ifp provided, check if rtentry is not default route */ + if (ifp == NULL && + IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) { + RTFREE(ro.ro_rt); + return 0; + } + + /* or if this is a blackhole/reject route */ + if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + RTFREE(ro.ro_rt); + return 0; + } + + /* found valid route */ + RTFREE(ro.ro_rt); + return 1; + +} + +static int +is_icmp6_query(int icmp6_type) +{ + if ((icmp6_type <= ICMP6_MAXTYPE) && + (icmp6_type == ICMP6_ECHO_REQUEST || + icmp6_type == ICMP6_MEMBERSHIP_QUERY || + icmp6_type == ICMP6_WRUREQUEST || + icmp6_type == ICMP6_FQDN_QUERY || + icmp6_type == ICMP6_NI_QUERY)) + return (1); + + return (0); +} + +static void +send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6) +{ + struct mbuf *m; + + m = args->m; + if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) { + struct tcphdr *tcp; + tcp = (struct tcphdr *)((char *)ip6 + hlen); + + if ((tcp->th_flags & TH_RST) == 0) { + struct mbuf *m0; + m0 = ipfw_send_pkt(args->m, &(args->f_id), + ntohl(tcp->th_seq), ntohl(tcp->th_ack), + tcp->th_flags | TH_RST); + if (m0 != NULL) + ip6_output(m0, NULL, NULL, 0, NULL, NULL, + NULL); + } + FREE_PKT(m); + } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */ +#if 0 + /* + * Unlike above, the mbufs need to line up with the ip6 hdr, + * as the contents are read. We need to m_adj() the + * needed amount. + * The mbuf will however be thrown away so we can adjust it. + * Remember we did an m_pullup on it already so we + * can make some assumptions about contiguousness. 
+ */ + if (args->L3offset) + m_adj(m, args->L3offset); +#endif + icmp6_error(m, ICMP6_DST_UNREACH, code, 0); + } else + FREE_PKT(m); + + args->m = NULL; +} + +#endif /* INET6 */ + + +/* + * sends a reject message, consuming the mbuf passed as an argument. + */ +static void +send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) +{ + +#if 0 + /* XXX When ip is not guaranteed to be at mtod() we will + * need to account for this */ + * The mbuf will however be thrown away so we can adjust it. + * Remember we did an m_pullup on it already so we + * can make some assumptions about contiguousness. + */ + if (args->L3offset) + m_adj(m, args->L3offset); +#endif + if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */ + /* We need the IP header in host order for icmp_error(). */ + SET_HOST_IPLEN(ip); + icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); + } else if (args->f_id.proto == IPPROTO_TCP) { + struct tcphdr *const tcp = + L3HDR(struct tcphdr, mtod(args->m, struct ip *)); + if ( (tcp->th_flags & TH_RST) == 0) { + struct mbuf *m; + m = ipfw_send_pkt(args->m, &(args->f_id), + ntohl(tcp->th_seq), ntohl(tcp->th_ack), + tcp->th_flags | TH_RST); + if (m != NULL) + ip_output(m, NULL, NULL, 0, NULL, NULL); + } + FREE_PKT(args->m); + } else + FREE_PKT(args->m); + args->m = NULL; +} + +/* + * Support for uid/gid/jail lookup. These tests are expensive + * (because we may need to look into the list of active sockets) + * so we cache the results. ugid_lookupp is 0 if we have not + * yet done a lookup, 1 if we succeeded, and -1 if we tried + * and failed. The function always returns the match value. + * We could actually spare the variable and use *uc, setting + * it to '(void *)check_uidgid if we have no info, NULL if + * we tried and failed, or any other value if successful. 
+ */ +static int +check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, + struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, + u_int16_t src_port, int *ugid_lookupp, + struct ucred **uc, struct inpcb *inp) +{ +#ifndef __FreeBSD__ + return cred_check(insn, proto, oif, + dst_ip, dst_port, src_ip, src_port, + (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); +#else /* FreeBSD */ + struct inpcbinfo *pi; + int wildcard; + struct inpcb *pcb; + int match; + + /* + * Check to see if the UDP or TCP stack supplied us with + * the PCB. If so, rather then holding a lock and looking + * up the PCB, we can use the one that was supplied. + */ + if (inp && *ugid_lookupp == 0) { + INP_LOCK_ASSERT(inp); + if (inp->inp_socket != NULL) { + *uc = crhold(inp->inp_cred); + *ugid_lookupp = 1; + } else + *ugid_lookupp = -1; + } + /* + * If we have already been here and the packet has no + * PCB entry associated with it, then we can safely + * assume that this is a no match. + */ + if (*ugid_lookupp == -1) + return (0); + if (proto == IPPROTO_TCP) { + wildcard = 0; + pi = &V_tcbinfo; + } else if (proto == IPPROTO_UDP) { + wildcard = INPLOOKUP_WILDCARD; + pi = &V_udbinfo; + } else + return 0; + match = 0; + if (*ugid_lookupp == 0) { + INP_INFO_RLOCK(pi); + pcb = (oif) ? + in_pcblookup_hash(pi, + dst_ip, htons(dst_port), + src_ip, htons(src_port), + wildcard, oif) : + in_pcblookup_hash(pi, + src_ip, htons(src_port), + dst_ip, htons(dst_port), + wildcard, NULL); + if (pcb != NULL) { + *uc = crhold(pcb->inp_cred); + *ugid_lookupp = 1; + } + INP_INFO_RUNLOCK(pi); + if (*ugid_lookupp == 0) { + /* + * We tried and failed, set the variable to -1 + * so we will not try again on this packet. 
+ */ + *ugid_lookupp = -1; + return (0); + } + } + if (insn->o.opcode == O_UID) + match = ((*uc)->cr_uid == (uid_t)insn->d[0]); + else if (insn->o.opcode == O_GID) + match = groupmember((gid_t)insn->d[0], *uc); + else if (insn->o.opcode == O_JAIL) + match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); + return match; +#endif /* __FreeBSD__ */ +} + +/* + * Helper function to set args with info on the rule after the matching + * one. slot is precise, whereas we guess rule_id as they are + * assigned sequentially. + */ +static inline void +set_match(struct ip_fw_args *args, int slot, + struct ip_fw_chain *chain) +{ + args->rule.chain_id = chain->id; + args->rule.slot = slot + 1; /* we use 0 as a marker */ + args->rule.rule_id = 1 + chain->map[slot]->id; + args->rule.rulenum = chain->map[slot]->rulenum; +} + +/* + * The main check routine for the firewall. + * + * All arguments are in args so we can modify them and return them + * back to the caller. + * + * Parameters: + * + * args->m (in/out) The packet; we set to NULL when/if we nuke it. + * Starts with the IP header. + * args->eh (in) Mac header if present, NULL for layer3 packet. + * args->L3offset Number of bytes bypassed if we came from L2. + * e.g. often sizeof(eh) ** NOTYET ** + * args->oif Outgoing interface, NULL if packet is incoming. + * The incoming interface is in the mbuf. (in) + * args->divert_rule (in/out) + * Skip up to the first rule past this rule number; + * upon return, non-zero port number for divert or tee. + * + * args->rule Pointer to the last matching rule (in/out) + * args->next_hop Socket we are forwarding to (out). 
+ * args->f_id Addresses grabbed from the packet (out) + * args->rule.info a cookie depending on rule action + * + * Return value: + * + * IP_FW_PASS the packet must be accepted + * IP_FW_DENY the packet must be dropped + * IP_FW_DIVERT divert packet, port in m_tag + * IP_FW_TEE tee packet, port in m_tag + * IP_FW_DUMMYNET to dummynet, pipe in args->cookie + * IP_FW_NETGRAPH into netgraph, cookie args->cookie + * args->rule contains the matching rule, + * args->rule.info has additional information. + * + */ +int +ipfw_chk(struct ip_fw_args *args) +{ + + /* + * Local variables holding state while processing a packet: + * + * IMPORTANT NOTE: to speed up the processing of rules, there + * are some assumption on the values of the variables, which + * are documented here. Should you change them, please check + * the implementation of the various instructions to make sure + * that they still work. + * + * args->eh The MAC header. It is non-null for a layer2 + * packet, it is NULL for a layer-3 packet. + * **notyet** + * args->L3offset Offset in the packet to the L3 (IP or equiv.) header. + * + * m | args->m Pointer to the mbuf, as received from the caller. + * It may change if ipfw_chk() does an m_pullup, or if it + * consumes the packet because it calls send_reject(). + * XXX This has to change, so that ipfw_chk() never modifies + * or consumes the buffer. + * ip is the beginning of the ip(4 or 6) header. + * Calculated by adding the L3offset to the start of data. + * (Until we start using L3offset, the packet is + * supposed to start with the ip header). + */ + struct mbuf *m = args->m; + struct ip *ip = mtod(m, struct ip *); + + /* + * For rules which contain uid/gid or jail constraints, cache + * a copy of the users credentials after the pcb lookup has been + * executed. This will speed up the processing of rules with + * these types of constraints, as well as decrease contention + * on pcb related locks. 
+ */ +#ifndef __FreeBSD__ + struct bsd_ucred ucred_cache; +#else + struct ucred *ucred_cache = NULL; +#endif + int ucred_lookup = 0; + + /* + * oif | args->oif If NULL, ipfw_chk has been called on the + * inbound path (ether_input, ip_input). + * If non-NULL, ipfw_chk has been called on the outbound path + * (ether_output, ip_output). + */ + struct ifnet *oif = args->oif; + + int f_pos = 0; /* index of current rule in the array */ + int retval = 0; + + /* + * hlen The length of the IP header. + */ + u_int hlen = 0; /* hlen >0 means we have an IP pkt */ + + /* + * offset The offset of a fragment. offset != 0 means that + * we have a fragment at this offset of an IPv4 packet. + * offset == 0 means that (if this is an IPv4 packet) + * this is the first or only fragment. + * For IPv6 offset == 0 means there is no Fragment Header. + * If offset != 0 for IPv6 always use correct mask to + * get the correct offset because we add IP6F_MORE_FRAG + * to be able to dectect the first fragment which would + * otherwise have offset = 0. + */ + u_short offset = 0; + + /* + * Local copies of addresses. They are only valid if we have + * an IP packet. + * + * proto The protocol. Set to 0 for non-ip packets, + * or to the protocol read from the packet otherwise. + * proto != 0 means that we have an IPv4 packet. + * + * src_port, dst_port port numbers, in HOST format. Only + * valid for TCP and UDP packets. + * + * src_ip, dst_ip ip addresses, in NETWORK format. + * Only valid for IPv4 packets. 
+ */ + uint8_t proto; + uint16_t src_port = 0, dst_port = 0; /* NOTE: host format */ + struct in_addr src_ip, dst_ip; /* NOTE: network format */ + uint16_t iplen=0; + int pktlen; + uint16_t etype = 0; /* Host order stored ether type */ + + /* + * dyn_dir = MATCH_UNKNOWN when rules unchecked, + * MATCH_NONE when checked and not matched (q = NULL), + * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) + */ + int dyn_dir = MATCH_UNKNOWN; + ipfw_dyn_rule *q = NULL; + struct ip_fw_chain *chain = &V_layer3_chain; + + /* + * We store in ulp a pointer to the upper layer protocol header. + * In the ipv4 case this is easy to determine from the header, + * but for ipv6 we might have some additional headers in the middle. + * ulp is NULL if not found. + */ + void *ulp = NULL; /* upper layer protocol pointer. */ + + /* XXX ipv6 variables */ + int is_ipv6 = 0; + uint8_t icmp6_type = 0; + uint16_t ext_hd = 0; /* bits vector for extension header filtering */ + /* end of ipv6 variables */ + + int is_ipv4 = 0; + + int done = 0; /* flag to exit the outer loop */ + + if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready)) + return (IP_FW_PASS); /* accept */ + + dst_ip.s_addr = 0; /* make sure it is initialized */ + src_ip.s_addr = 0; /* make sure it is initialized */ + pktlen = m->m_pkthdr.len; + args->f_id.fib = M_GETFIB(m); /* note mbuf not altered) */ + proto = args->f_id.proto = 0; /* mark f_id invalid */ + /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */ + +/* + * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, + * then it sets p to point at the offset "len" in the mbuf. WARNING: the + * pointer might become stale after other pullups (but we never use it + * this way). 
+ */ +#define PULLUP_TO(_len, p, T) \ +do { \ + int x = (_len) + sizeof(T); \ + if ((m)->m_len < x) { \ + args->m = m = m_pullup(m, x); \ + if (m == NULL) \ + goto pullup_failed; \ + } \ + p = (mtod(m, char *) + (_len)); \ +} while (0) + + /* + * if we have an ether header, + */ + if (args->eh) + etype = ntohs(args->eh->ether_type); + + /* Identify IP packets and fill up variables. */ + if (pktlen >= sizeof(struct ip6_hdr) && + (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; + is_ipv6 = 1; + args->f_id.addr_type = 6; + hlen = sizeof(struct ip6_hdr); + proto = ip6->ip6_nxt; + + /* Search extension headers to find upper layer protocols */ + while (ulp == NULL) { + switch (proto) { + case IPPROTO_ICMPV6: + PULLUP_TO(hlen, ulp, struct icmp6_hdr); + icmp6_type = ICMP6(ulp)->icmp6_type; + break; + + case IPPROTO_TCP: + PULLUP_TO(hlen, ulp, struct tcphdr); + dst_port = TCP(ulp)->th_dport; + src_port = TCP(ulp)->th_sport; + /* save flags for dynamic rules */ + args->f_id._flags = TCP(ulp)->th_flags; + break; + + case IPPROTO_SCTP: + PULLUP_TO(hlen, ulp, struct sctphdr); + src_port = SCTP(ulp)->src_port; + dst_port = SCTP(ulp)->dest_port; + break; + + case IPPROTO_UDP: + PULLUP_TO(hlen, ulp, struct udphdr); + dst_port = UDP(ulp)->uh_dport; + src_port = UDP(ulp)->uh_sport; + break; + + case IPPROTO_HOPOPTS: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_hbh); + ext_hd |= EXT_HOPOPTS; + hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; + proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; + ulp = NULL; + break; + + case IPPROTO_ROUTING: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_rthdr); + switch (((struct ip6_rthdr *)ulp)->ip6r_type) { + case 0: + ext_hd |= EXT_RTHDR0; + break; + case 2: + ext_hd |= EXT_RTHDR2; + break; + default: + printf("IPFW2: IPV6 - Unknown Routing " + "Header type(%d)\n", + ((struct ip6_rthdr *)ulp)->ip6r_type); + if (V_fw_deny_unknown_exthdrs) + return (IP_FW_DENY); + break; + } + 
ext_hd |= EXT_ROUTING; + hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; + proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; + ulp = NULL; + break; + + case IPPROTO_FRAGMENT: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_frag); + ext_hd |= EXT_FRAGMENT; + hlen += sizeof (struct ip6_frag); + proto = ((struct ip6_frag *)ulp)->ip6f_nxt; + offset = ((struct ip6_frag *)ulp)->ip6f_offlg & + IP6F_OFF_MASK; + /* Add IP6F_MORE_FRAG for offset of first + * fragment to be != 0. */ + offset |= ((struct ip6_frag *)ulp)->ip6f_offlg & + IP6F_MORE_FRAG; + if (offset == 0) { + printf("IPFW2: IPV6 - Invalid Fragment " + "Header\n"); + if (V_fw_deny_unknown_exthdrs) + return (IP_FW_DENY); + break; + } + args->f_id.extra = + ntohl(((struct ip6_frag *)ulp)->ip6f_ident); + ulp = NULL; + break; + + case IPPROTO_DSTOPTS: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_hbh); + ext_hd |= EXT_DSTOPTS; + hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; + proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; + ulp = NULL; + break; + + case IPPROTO_AH: /* RFC 2402 */ + PULLUP_TO(hlen, ulp, struct ip6_ext); + ext_hd |= EXT_AH; + hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; + proto = ((struct ip6_ext *)ulp)->ip6e_nxt; + ulp = NULL; + break; + + case IPPROTO_ESP: /* RFC 2406 */ + PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ + /* Anything past Seq# is variable length and + * data past this ext. header is encrypted. */ + ext_hd |= EXT_ESP; + break; + + case IPPROTO_NONE: /* RFC 2460 */ + /* + * Packet ends here, and IPv6 header has + * already been pulled up. If ip6e_len!=0 + * then octets must be ignored. + */ + ulp = ip; /* non-NULL to get out of loop. */ + break; + + case IPPROTO_OSPFIGP: + /* XXX OSPF header check? */ + PULLUP_TO(hlen, ulp, struct ip6_ext); + break; + + case IPPROTO_PIM: + /* XXX PIM header check? 
*/ + PULLUP_TO(hlen, ulp, struct pim); + break; + + case IPPROTO_CARP: + PULLUP_TO(hlen, ulp, struct carp_header); + if (((struct carp_header *)ulp)->carp_version != + CARP_VERSION) + return (IP_FW_DENY); + if (((struct carp_header *)ulp)->carp_type != + CARP_ADVERTISEMENT) + return (IP_FW_DENY); + break; + + case IPPROTO_IPV6: /* RFC 2893 */ + PULLUP_TO(hlen, ulp, struct ip6_hdr); + break; + + case IPPROTO_IPV4: /* RFC 2893 */ + PULLUP_TO(hlen, ulp, struct ip); + break; + + default: + printf("IPFW2: IPV6 - Unknown Extension " + "Header(%d), ext_hd=%x\n", proto, ext_hd); + if (V_fw_deny_unknown_exthdrs) + return (IP_FW_DENY); + PULLUP_TO(hlen, ulp, struct ip6_ext); + break; + } /*switch */ + } + ip = mtod(m, struct ip *); + ip6 = (struct ip6_hdr *)ip; + args->f_id.src_ip6 = ip6->ip6_src; + args->f_id.dst_ip6 = ip6->ip6_dst; + args->f_id.src_ip = 0; + args->f_id.dst_ip = 0; + args->f_id.flow_id6 = ntohl(ip6->ip6_flow); + } else if (pktlen >= sizeof(struct ip) && + (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) { + is_ipv4 = 1; + hlen = ip->ip_hl << 2; + args->f_id.addr_type = 4; + + /* + * Collect parameters into local variables for faster matching. + */ + proto = ip->ip_p; + src_ip = ip->ip_src; + dst_ip = ip->ip_dst; + offset = ntohs(ip->ip_off) & IP_OFFMASK; + iplen = ntohs(ip->ip_len); + pktlen = iplen < pktlen ? 
iplen : pktlen; + + if (offset == 0) { + switch (proto) { + case IPPROTO_TCP: + PULLUP_TO(hlen, ulp, struct tcphdr); + dst_port = TCP(ulp)->th_dport; + src_port = TCP(ulp)->th_sport; + /* save flags for dynamic rules */ + args->f_id._flags = TCP(ulp)->th_flags; + break; + + case IPPROTO_UDP: + PULLUP_TO(hlen, ulp, struct udphdr); + dst_port = UDP(ulp)->uh_dport; + src_port = UDP(ulp)->uh_sport; + break; + + case IPPROTO_ICMP: + PULLUP_TO(hlen, ulp, struct icmphdr); + //args->f_id.flags = ICMP(ulp)->icmp_type; + break; + + default: + break; + } + } + + ip = mtod(m, struct ip *); + args->f_id.src_ip = ntohl(src_ip.s_addr); + args->f_id.dst_ip = ntohl(dst_ip.s_addr); + } +#undef PULLUP_TO + if (proto) { /* we may have port numbers, store them */ + args->f_id.proto = proto; + args->f_id.src_port = src_port = ntohs(src_port); + args->f_id.dst_port = dst_port = ntohs(dst_port); + } + + IPFW_RLOCK(chain); + if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */ + IPFW_RUNLOCK(chain); + return (IP_FW_PASS); /* accept */ + } + if (args->rule.slot) { + /* + * Packet has already been tagged as a result of a previous + * match on rule args->rule aka args->rule_id (PIPE, QUEUE, + * REASS, NETGRAPH, DIVERT/TEE...) + * Validate the slot and continue from the next one + * if still present, otherwise do a lookup. + */ + f_pos = (args->rule.chain_id == chain->id) ? + args->rule.slot : + ipfw_find_rule(chain, args->rule.rulenum, + args->rule.rule_id); + } else { + f_pos = 0; + } + + /* + * Now scan the rules, and parse microinstructions for each rule. + * We have two nested loops and an inner switch. Sometimes we + * need to break out of one or both loops, or re-enter one of + * the loops with updated variables. Loop variables are: + * + * f_pos (outer loop) points to the current rule. + * On output it points to the matching rule. + * done (outer loop) is used as a flag to break the loop. + * l (inner loop) residual length of current rule. 
+ * cmd points to the current microinstruction. + * + * We break the inner loop by setting l=0 and possibly + * cmdlen=0 if we don't want to advance cmd. + * We break the outer loop by setting done=1 + * We can restart the inner loop by setting l>0 and f_pos, f, cmd + * as needed. + */ + for (; f_pos < chain->n_rules; f_pos++) { + ipfw_insn *cmd; + uint32_t tablearg = 0; + int l, cmdlen, skip_or; /* skip rest of OR block */ + struct ip_fw *f; + + f = chain->map[f_pos]; + if (V_set_disable & (1 << f->set) ) + continue; + + skip_or = 0; + for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; + l -= cmdlen, cmd += cmdlen) { + int match; + + /* + * check_body is a jump target used when we find a + * CHECK_STATE, and need to jump to the body of + * the target rule. + */ + +/* check_body: */ + cmdlen = F_LEN(cmd); + /* + * An OR block (insn_1 || .. || insn_n) has the + * F_OR bit set in all but the last instruction. + * The first match will set "skip_or", and cause + * the following instructions to be skipped until + * past the one with the F_OR bit clear. + */ + if (skip_or) { /* skip this instruction */ + if ((cmd->len & F_OR) == 0) + skip_or = 0; /* next one is good */ + continue; + } + match = 0; /* set to 1 if we succeed */ + + switch (cmd->opcode) { + /* + * The first set of opcodes compares the packet's + * fields with some pattern, setting 'match' if a + * match is found. At the end of the loop there is + * logic to deal with F_NOT and F_OR flags associated + * with the opcode. + */ + case O_NOP: + match = 1; + break; + + case O_FORWARD_MAC: + printf("ipfw: opcode %d unimplemented\n", + cmd->opcode); + break; + + case O_GID: + case O_UID: + case O_JAIL: + /* + * We only check offset == 0 && proto != 0, + * as this ensures that we have a + * packet with the ports info. 
+ */ + if (offset!=0) + break; + if (is_ipv6) /* XXX to be fixed later */ + break; + if (proto == IPPROTO_TCP || + proto == IPPROTO_UDP) + match = check_uidgid( + (ipfw_insn_u32 *)cmd, + proto, oif, + dst_ip, dst_port, + src_ip, src_port, &ucred_lookup, +#ifdef __FreeBSD__ + &ucred_cache, args->inp); +#else + (void *)&ucred_cache, + (struct inpcb *)args->m); +#endif + break; + + case O_RECV: + match = iface_match(m->m_pkthdr.rcvif, + (ipfw_insn_if *)cmd); + break; + + case O_XMIT: + match = iface_match(oif, (ipfw_insn_if *)cmd); + break; + + case O_VIA: + match = iface_match(oif ? oif : + m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd); + break; + + case O_MACADDR2: + if (args->eh != NULL) { /* have MAC header */ + u_int32_t *want = (u_int32_t *) + ((ipfw_insn_mac *)cmd)->addr; + u_int32_t *mask = (u_int32_t *) + ((ipfw_insn_mac *)cmd)->mask; + u_int32_t *hdr = (u_int32_t *)args->eh; + + match = + ( want[0] == (hdr[0] & mask[0]) && + want[1] == (hdr[1] & mask[1]) && + want[2] == (hdr[2] & mask[2]) ); + } + break; + + case O_MAC_TYPE: + if (args->eh != NULL) { + u_int16_t *p = + ((ipfw_insn_u16 *)cmd)->ports; + int i; + + for (i = cmdlen - 1; !match && i>0; + i--, p += 2) + match = (etype >= p[0] && + etype <= p[1]); + } + break; + + case O_FRAG: + match = (offset != 0); + break; + + case O_IN: /* "out" is "not in" */ + match = (oif == NULL); + break; + + case O_LAYER2: + match = (args->eh != NULL); + break; + + case O_DIVERTED: + { + /* For diverted packets, args->rule.info + * contains the divert port (in host format) + * reason and direction. + */ + uint32_t i = args->rule.info; + match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT && + cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2); + } + break; + + case O_PROTO: + /* + * We do not allow an arg of 0 so the + * check of "proto" only suffices. 
+ */ + match = (proto == cmd->arg1); + break; + + case O_IP_SRC: + match = is_ipv4 && + (((ipfw_insn_ip *)cmd)->addr.s_addr == + src_ip.s_addr); + break; + + case O_IP_SRC_LOOKUP: + case O_IP_DST_LOOKUP: + if (is_ipv4) { + uint32_t key = + (cmd->opcode == O_IP_DST_LOOKUP) ? + dst_ip.s_addr : src_ip.s_addr; + uint32_t v = 0; + + if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) { + /* generic lookup. The key must be + * in 32bit big-endian format. + */ + v = ((ipfw_insn_u32 *)cmd)->d[1]; + if (v == 0) + key = dst_ip.s_addr; + else if (v == 1) + key = src_ip.s_addr; + else if (v == 6) /* dscp */ + key = (ip->ip_tos >> 2) & 0x3f; + else if (offset != 0) + break; + else if (proto != IPPROTO_TCP && + proto != IPPROTO_UDP) + break; + else if (v == 2) + key = htonl(dst_port); + else if (v == 3) + key = htonl(src_port); + else if (v == 4 || v == 5) { + check_uidgid( + (ipfw_insn_u32 *)cmd, + proto, oif, + dst_ip, dst_port, + src_ip, src_port, &ucred_lookup, +#ifdef __FreeBSD__ + &ucred_cache, args->inp); + if (v == 4 /* O_UID */) + key = ucred_cache->cr_uid; + else if (v == 5 /* O_JAIL */) + key = ucred_cache->cr_prison->pr_id; +#else /* !__FreeBSD__ */ + (void *)&ucred_cache, + (struct inpcb *)args->m); + if (v ==4 /* O_UID */) + key = ucred_cache.uid; + else if (v == 5 /* O_JAIL */) + key = ucred_cache.xid; +#endif /* !__FreeBSD__ */ + key = htonl(key); + } else + break; + } + match = ipfw_lookup_table(chain, + cmd->arg1, key, &v); + if (!match) + break; + if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) + match = + ((ipfw_insn_u32 *)cmd)->d[0] == v; + else + tablearg = v; + } + break; + + case O_IP_SRC_MASK: + case O_IP_DST_MASK: + if (is_ipv4) { + uint32_t a = + (cmd->opcode == O_IP_DST_MASK) ? 
+ dst_ip.s_addr : src_ip.s_addr; + uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; + int i = cmdlen-1; + + for (; !match && i>0; i-= 2, p+= 2) + match = (p[0] == (a & p[1])); + } + break; + + case O_IP_SRC_ME: + if (is_ipv4) { + struct ifnet *tif; + + INADDR_TO_IFP(src_ip, tif); + match = (tif != NULL); + break; + } +#ifdef INET6 + /* FALLTHROUGH */ + case O_IP6_SRC_ME: + match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6); +#endif + break; + + case O_IP_DST_SET: + case O_IP_SRC_SET: + if (is_ipv4) { + u_int32_t *d = (u_int32_t *)(cmd+1); + u_int32_t addr = + cmd->opcode == O_IP_DST_SET ? + args->f_id.dst_ip : + args->f_id.src_ip; + + if (addr < d[0]) + break; + addr -= d[0]; /* subtract base */ + match = (addr < cmd->arg1) && + ( d[ 1 + (addr>>5)] & + (1<<(addr & 0x1f)) ); + } + break; + + case O_IP_DST: + match = is_ipv4 && + (((ipfw_insn_ip *)cmd)->addr.s_addr == + dst_ip.s_addr); + break; + + case O_IP_DST_ME: + if (is_ipv4) { + struct ifnet *tif; + + INADDR_TO_IFP(dst_ip, tif); + match = (tif != NULL); + break; + } +#ifdef INET6 + /* FALLTHROUGH */ + case O_IP6_DST_ME: + match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6); +#endif + break; + + + case O_IP_SRCPORT: + case O_IP_DSTPORT: + /* + * offset == 0 && proto != 0 is enough + * to guarantee that we have a + * packet with port info. + */ + if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP) + && offset == 0) { + u_int16_t x = + (cmd->opcode == O_IP_SRCPORT) ? 
+ src_port : dst_port ; + u_int16_t *p = + ((ipfw_insn_u16 *)cmd)->ports; + int i; + + for (i = cmdlen - 1; !match && i>0; + i--, p += 2) + match = (x>=p[0] && x<=p[1]); + } + break; + + case O_ICMPTYPE: + match = (offset == 0 && proto==IPPROTO_ICMP && + icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); + break; + +#ifdef INET6 + case O_ICMP6TYPE: + match = is_ipv6 && offset == 0 && + proto==IPPROTO_ICMPV6 && + icmp6type_match( + ICMP6(ulp)->icmp6_type, + (ipfw_insn_u32 *)cmd); + break; +#endif /* INET6 */ + + case O_IPOPT: + match = (is_ipv4 && + ipopts_match(ip, cmd) ); + break; + + case O_IPVER: + match = (is_ipv4 && + cmd->arg1 == ip->ip_v); + break; + + case O_IPID: + case O_IPLEN: + case O_IPTTL: + if (is_ipv4) { /* only for IP packets */ + uint16_t x; + uint16_t *p; + int i; + + if (cmd->opcode == O_IPLEN) + x = iplen; + else if (cmd->opcode == O_IPTTL) + x = ip->ip_ttl; + else /* must be IPID */ + x = ntohs(ip->ip_id); + if (cmdlen == 1) { + match = (cmd->arg1 == x); + break; + } + /* otherwise we have ranges */ + p = ((ipfw_insn_u16 *)cmd)->ports; + i = cmdlen - 1; + for (; !match && i>0; i--, p += 2) + match = (x >= p[0] && x <= p[1]); + } + break; + + case O_IPPRECEDENCE: + match = (is_ipv4 && + (cmd->arg1 == (ip->ip_tos & 0xe0)) ); + break; + + case O_IPTOS: + match = (is_ipv4 && + flags_match(cmd, ip->ip_tos)); + break; + + case O_TCPDATALEN: + if (proto == IPPROTO_TCP && offset == 0) { + struct tcphdr *tcp; + uint16_t x; + uint16_t *p; + int i; + + tcp = TCP(ulp); + x = iplen - + ((ip->ip_hl + tcp->th_off) << 2); + if (cmdlen == 1) { + match = (cmd->arg1 == x); + break; + } + /* otherwise we have ranges */ + p = ((ipfw_insn_u16 *)cmd)->ports; + i = cmdlen - 1; + for (; !match && i>0; i--, p += 2) + match = (x >= p[0] && x <= p[1]); + } + break; + + case O_TCPFLAGS: + match = (proto == IPPROTO_TCP && offset == 0 && + flags_match(cmd, TCP(ulp)->th_flags)); + break; + + case O_TCPOPTS: + match = (proto == IPPROTO_TCP && offset == 0 && + 
tcpopts_match(TCP(ulp), cmd)); + break; + + case O_TCPSEQ: + match = (proto == IPPROTO_TCP && offset == 0 && + ((ipfw_insn_u32 *)cmd)->d[0] == + TCP(ulp)->th_seq); + break; + + case O_TCPACK: + match = (proto == IPPROTO_TCP && offset == 0 && + ((ipfw_insn_u32 *)cmd)->d[0] == + TCP(ulp)->th_ack); + break; + + case O_TCPWIN: + match = (proto == IPPROTO_TCP && offset == 0 && + cmd->arg1 == TCP(ulp)->th_win); + break; + + case O_ESTAB: + /* reject packets which have SYN only */ + /* XXX should i also check for TH_ACK ? */ + match = (proto == IPPROTO_TCP && offset == 0 && + (TCP(ulp)->th_flags & + (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); + break; + + case O_ALTQ: { + struct pf_mtag *at; + ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; + + match = 1; + at = pf_find_mtag(m); + if (at != NULL && at->qid != 0) + break; + at = pf_get_mtag(m); + if (at == NULL) { + /* + * Let the packet fall back to the + * default ALTQ. + */ + break; + } + at->qid = altq->qid; + if (is_ipv4) + at->af = AF_INET; + else + at->af = AF_LINK; + at->hdr = ip; + break; + } + + case O_LOG: + ipfw_log(f, hlen, args, m, + oif, offset, tablearg, ip); + match = 1; + break; + + case O_PROB: + match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); + break; + + case O_VERREVPATH: + /* Outgoing packets automatically pass/match */ + match = ((oif != NULL) || + (m->m_pkthdr.rcvif == NULL) || + ( +#ifdef INET6 + is_ipv6 ? + verify_path6(&(args->f_id.src_ip6), + m->m_pkthdr.rcvif) : +#endif + verify_path(src_ip, m->m_pkthdr.rcvif, + args->f_id.fib))); + break; + + case O_VERSRCREACH: + /* Outgoing packets automatically pass/match */ + match = (hlen > 0 && ((oif != NULL) || +#ifdef INET6 + is_ipv6 ? 
+ verify_path6(&(args->f_id.src_ip6), + NULL) : +#endif + verify_path(src_ip, NULL, args->f_id.fib))); + break; + + case O_ANTISPOOF: + /* Outgoing packets automatically pass/match */ + if (oif == NULL && hlen > 0 && + ( (is_ipv4 && in_localaddr(src_ip)) +#ifdef INET6 + || (is_ipv6 && + in6_localaddr(&(args->f_id.src_ip6))) +#endif + )) + match = +#ifdef INET6 + is_ipv6 ? verify_path6( + &(args->f_id.src_ip6), + m->m_pkthdr.rcvif) : +#endif + verify_path(src_ip, + m->m_pkthdr.rcvif, + args->f_id.fib); + else + match = 1; + break; + + case O_IPSEC: +#ifdef IPSEC + match = (m_tag_find(m, + PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); +#endif + /* otherwise no match */ + break; + +#ifdef INET6 + case O_IP6_SRC: + match = is_ipv6 && + IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, + &((ipfw_insn_ip6 *)cmd)->addr6); + break; + + case O_IP6_DST: + match = is_ipv6 && + IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, + &((ipfw_insn_ip6 *)cmd)->addr6); + break; + case O_IP6_SRC_MASK: + case O_IP6_DST_MASK: + if (is_ipv6) { + int i = cmdlen - 1; + struct in6_addr p; + struct in6_addr *d = + &((ipfw_insn_ip6 *)cmd)->addr6; + + for (; !match && i > 0; d += 2, + i -= F_INSN_SIZE(struct in6_addr) + * 2) { + p = (cmd->opcode == + O_IP6_SRC_MASK) ? + args->f_id.src_ip6: + args->f_id.dst_ip6; + APPLY_MASK(&p, &d[1]); + match = + IN6_ARE_ADDR_EQUAL(&d[0], + &p); + } + } + break; + + case O_FLOW6ID: + match = is_ipv6 && + flow6id_match(args->f_id.flow_id6, + (ipfw_insn_u32 *) cmd); + break; + + case O_EXT_HDR: + match = is_ipv6 && + (ext_hd & ((ipfw_insn *) cmd)->arg1); + break; + + case O_IP6: + match = is_ipv6; + break; +#endif + + case O_IP4: + match = is_ipv4; + break; + + case O_TAG: { + struct m_tag *mtag; + uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + + /* Packet is already tagged with this tag? */ + mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); + + /* We have `untag' action when F_NOT flag is + * present. 
And we must remove this mtag from + * mbuf and reset `match' to zero (`match' will + * be inversed later). + * Otherwise we should allocate new mtag and + * push it into mbuf. + */ + if (cmd->len & F_NOT) { /* `untag' action */ + if (mtag != NULL) + m_tag_delete(m, mtag); + match = 0; + } else if (mtag == NULL) { + if ((mtag = m_tag_alloc(MTAG_IPFW, + tag, 0, M_NOWAIT)) != NULL) + m_tag_prepend(m, mtag); + match = 1; + } + break; + } + + case O_FIB: /* try match the specified fib */ + if (args->f_id.fib == cmd->arg1) + match = 1; + break; + + case O_TAGGED: { + struct m_tag *mtag; + uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + + if (cmdlen == 1) { + match = m_tag_locate(m, MTAG_IPFW, + tag, NULL) != NULL; + break; + } + + /* we have ranges */ + for (mtag = m_tag_first(m); + mtag != NULL && !match; + mtag = m_tag_next(m, mtag)) { + uint16_t *p; + int i; + + if (mtag->m_tag_cookie != MTAG_IPFW) + continue; + + p = ((ipfw_insn_u16 *)cmd)->ports; + i = cmdlen - 1; + for(; !match && i > 0; i--, p += 2) + match = + mtag->m_tag_id >= p[0] && + mtag->m_tag_id <= p[1]; + } + break; + } + + /* + * The second set of opcodes represents 'actions', + * i.e. the terminal part of a rule once the packet + * matches all previous patterns. + * Typically there is only one action for each rule, + * and the opcode is stored at the end of the rule + * (but there are exceptions -- see below). + * + * In general, here we set retval and terminate the + * outer loop (would be a 'break 3' in some language, + * but we need to set l=0, done=1) + * + * Exceptions: + * O_COUNT and O_SKIPTO actions: + * instead of terminating, we jump to the next rule + * (setting l=0), or to the SKIPTO target (setting + * f/f_len, cmd and l as needed), respectively. 
+ * + * O_TAG, O_LOG and O_ALTQ action parameters: + * perform some action and set match = 1; + * + * O_LIMIT and O_KEEP_STATE: these opcodes are + * not real 'actions', and are stored right + * before the 'action' part of the rule. + * These opcodes try to install an entry in the + * state tables; if successful, we continue with + * the next opcode (match=1; break;), otherwise + * the packet must be dropped (set retval, + * break loops with l=0, done=1) + * + * O_PROBE_STATE and O_CHECK_STATE: these opcodes + * cause a lookup of the state table, and a jump + * to the 'action' part of the parent rule + * if an entry is found, or + * (CHECK_STATE only) a jump to the next rule if + * the entry is not found. + * The result of the lookup is cached so that + * further instances of these opcodes become NOPs. + * The jump to the next rule is done by setting + * l=0, cmdlen=0. + */ + case O_LIMIT: + case O_KEEP_STATE: + if (ipfw_install_state(f, + (ipfw_insn_limit *)cmd, args, tablearg)) { + /* error or limit violation */ + retval = IP_FW_DENY; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + } + match = 1; + break; + + case O_PROBE_STATE: + case O_CHECK_STATE: + /* + * dynamic rules are checked at the first + * keep-state or check-state occurrence, + * with the result being stored in dyn_dir. + * The compiler introduces a PROBE_STATE + * instruction for us when we have a + * KEEP_STATE (because PROBE_STATE needs + * to be run first). + */ + if (dyn_dir == MATCH_UNKNOWN && + (q = ipfw_lookup_dyn_rule(&args->f_id, + &dyn_dir, proto == IPPROTO_TCP ? + TCP(ulp) : NULL)) + != NULL) { + /* + * Found dynamic entry, update stats + * and jump to the 'action' part of + * the parent rule by setting + * f, cmd, l and clearing cmdlen. + */ + q->pcnt++; + q->bcnt += pktlen; + /* XXX we would like to have f_pos + * readily accessible in the dynamic + * rule, instead of having to + * lookup q->rule. 
+ */ + f = q->rule; + f_pos = ipfw_find_rule(chain, + f->rulenum, f->id); + cmd = ACTION_PTR(f); + l = f->cmd_len - f->act_ofs; + ipfw_dyn_unlock(); + cmdlen = 0; + match = 1; + break; + } + /* + * Dynamic entry not found. If CHECK_STATE, + * skip to next rule, if PROBE_STATE just + * ignore and continue with next opcode. + */ + if (cmd->opcode == O_CHECK_STATE) + l = 0; /* exit inner loop */ + match = 1; + break; + + case O_ACCEPT: + retval = 0; /* accept */ + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_PIPE: + case O_QUEUE: + set_match(args, f_pos, chain); + args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + if (cmd->opcode == O_PIPE) + args->rule.info |= IPFW_IS_PIPE; + if (V_fw_one_pass) + args->rule.info |= IPFW_ONEPASS; + retval = IP_FW_DUMMYNET; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_DIVERT: + case O_TEE: + if (args->eh) /* not on layer 2 */ + break; + /* otherwise this is terminal */ + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + retval = (cmd->opcode == O_DIVERT) ? + IP_FW_DIVERT : IP_FW_TEE; + set_match(args, f_pos, chain); + args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + break; + + case O_COUNT: + f->pcnt++; /* update stats */ + f->bcnt += pktlen; + f->timestamp = time_uptime; + l = 0; /* exit inner loop */ + break; + + case O_SKIPTO: + f->pcnt++; /* update stats */ + f->bcnt += pktlen; + f->timestamp = time_uptime; + /* If possible use cached f_pos (in f->next_rule), + * whose version is written in f->next_rule + * (horrible hacks to avoid changing the ABI). + */ + if (cmd->arg1 != IP_FW_TABLEARG && + (uintptr_t)f->x_next == chain->id) { + f_pos = (uintptr_t)f->next_rule; + } else { + int i = (cmd->arg1 == IP_FW_TABLEARG) ? 
+ tablearg : cmd->arg1; + /* make sure we do not jump backward */ + if (i <= f->rulenum) + i = f->rulenum + 1; + f_pos = ipfw_find_rule(chain, i, 0); + /* update the cache */ + if (cmd->arg1 != IP_FW_TABLEARG) { + f->next_rule = + (void *)(uintptr_t)f_pos; + f->x_next = + (void *)(uintptr_t)chain->id; + } + } + /* + * Skip disabled rules, and re-enter + * the inner loop with the correct + * f_pos, f, l and cmd. + * Also clear cmdlen and skip_or + */ + for (; f_pos < chain->n_rules - 1 && + (V_set_disable & + (1 << chain->map[f_pos]->set)); + f_pos++) + ; + /* Re-enter the inner loop at the skipto rule. */ + f = chain->map[f_pos]; + l = f->cmd_len; + cmd = f->cmd; + match = 1; + cmdlen = 0; + skip_or = 0; + continue; + break; /* not reached */ + + case O_REJECT: + /* + * Drop the packet and send a reject notice + * if the packet is not ICMP (or is an ICMP + * query), and it is not multicast/broadcast. + */ + if (hlen > 0 && is_ipv4 && offset == 0 && + (proto != IPPROTO_ICMP || + is_icmp_query(ICMP(ulp))) && + !(m->m_flags & (M_BCAST|M_MCAST)) && + !IN_MULTICAST(ntohl(dst_ip.s_addr))) { + send_reject(args, cmd->arg1, iplen, ip); + m = args->m; + } + /* FALLTHROUGH */ +#ifdef INET6 + case O_UNREACH6: + if (hlen > 0 && is_ipv6 && + ((offset & IP6F_OFF_MASK) == 0) && + (proto != IPPROTO_ICMPV6 || + (is_icmp6_query(icmp6_type) == 1)) && + !(m->m_flags & (M_BCAST|M_MCAST)) && + !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) { + send_reject6( + args, cmd->arg1, hlen, + (struct ip6_hdr *)ip); + m = args->m; + } + /* FALLTHROUGH */ +#endif + case O_DENY: + retval = IP_FW_DENY; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_FORWARD_IP: + if (args->eh) /* not valid on layer2 pkts */ + break; + if (!q || dyn_dir == MATCH_FORWARD) { + struct sockaddr_in *sa; + sa = &(((ipfw_insn_sa *)cmd)->sa); + if (sa->sin_addr.s_addr == INADDR_ANY) { + bcopy(sa, &args->hopstore, + sizeof(*sa)); + args->hopstore.sin_addr.s_addr = + htonl(tablearg); + 
args->next_hop = &args->hopstore; + } else { + args->next_hop = sa; + } + } + retval = IP_FW_PASS; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_NETGRAPH: + case O_NGTEE: + set_match(args, f_pos, chain); + args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + if (V_fw_one_pass) + args->rule.info |= IPFW_ONEPASS; + retval = (cmd->opcode == O_NETGRAPH) ? + IP_FW_NETGRAPH : IP_FW_NGTEE; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_SETFIB: + f->pcnt++; /* update stats */ + f->bcnt += pktlen; + f->timestamp = time_uptime; + M_SETFIB(m, cmd->arg1); + args->f_id.fib = cmd->arg1; + l = 0; /* exit inner loop */ + break; + + case O_NAT: + if (!IPFW_NAT_LOADED) { + retval = IP_FW_DENY; + } else { + struct cfg_nat *t; + int nat_id; + + set_match(args, f_pos, chain); + t = ((ipfw_insn_nat *)cmd)->nat; + if (t == NULL) { + nat_id = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + t = (*lookup_nat_ptr)(&chain->nat, nat_id); + + if (t == NULL) { + retval = IP_FW_DENY; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + } + if (cmd->arg1 != IP_FW_TABLEARG) + ((ipfw_insn_nat *)cmd)->nat = t; + } + retval = ipfw_nat_ptr(args, t, m); + } + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_REASS: { + int ip_off; + + f->pcnt++; + f->bcnt += pktlen; + l = 0; /* in any case exit inner loop */ + ip_off = ntohs(ip->ip_off); + + /* if not fragmented, go to next rule */ + if ((ip_off & (IP_MF | IP_OFFMASK)) == 0) + break; + /* + * ip_reass() expects len & off in host + * byte order. + */ + SET_HOST_IPLEN(ip); + + args->m = m = ip_reass(m); + + /* + * do IP header checksum fixup. 
+				 */
+				if (m == NULL) { /* fragment got swallowed */
+					retval = IP_FW_DENY;
+				} else { /* good, packet complete */
+					int hlen;
+
+					ip = mtod(m, struct ip *);
+					hlen = ip->ip_hl << 2;
+					SET_NET_IPLEN(ip);
+					ip->ip_sum = 0;
+					if (hlen == sizeof(struct ip))
+						ip->ip_sum = in_cksum_hdr(ip);
+					else
+						ip->ip_sum = in_cksum(m, hlen);
+					retval = IP_FW_REASS;
+					set_match(args, f_pos, chain);
+				}
+				done = 1;	/* exit outer loop */
+				break;
+			}
+
+			default:
+				panic("-- unknown opcode %d\n", cmd->opcode);
+			} /* end of switch() on opcodes */
+			/*
+			 * if we get here with l=0, then match is irrelevant.
+			 */
+
+			if (cmd->len & F_NOT)
+				match = !match;
+
+			if (match) {
+				if (cmd->len & F_OR)
+					skip_or = 1;
+			} else {
+				if (!(cmd->len & F_OR)) /* not an OR block, */
+					break;		/* try next rule */
+			}
+
+		}	/* end of inner loop, scan opcodes */
+
+		if (done)
+			break;
+
+/* next_rule:; */	/* try next rule */
+
+	}		/* end of outer for, scan rules */
+
+	if (done) {
+		struct ip_fw *rule = chain->map[f_pos];
+		/* Update statistics */
+		rule->pcnt++;
+		rule->bcnt += pktlen;
+		rule->timestamp = time_uptime;
+	} else {
+		retval = IP_FW_DENY;
+		printf("ipfw: ouch!, skip past end of rules, denying packet\n");
+	}
+	IPFW_RUNLOCK(chain);
+#ifdef __FreeBSD__
+	if (ucred_cache != NULL)
+		crfree(ucred_cache);
+#endif
+	return (retval);
+
+pullup_failed:
+	if (V_fw_verbose)
+		printf("ipfw: pullup failed\n");
+	return (IP_FW_DENY);
+}
+
+/*
+ * Module and VNET glue
+ */
+
+/*
+ * Stuff that must be initialised only on boot or module load
+ */
+static int
+ipfw_init(void)
+{
+	int error = 0;		/* never reassigned below, so this always returns 0 */
+
+	ipfw_dyn_attach();	/* counterpart: ipfw_dyn_detach() in ipfw_destroy() */
+	/*
+	 * Only print out this stuff the first time around,
+	 * when called from the sysinit code.
+	 */
+	printf("ipfw2 "		/* banner ends with "logging " and no newline; a printf below finishes it */
+#ifdef INET6
+		"(+ipv6) "
+#endif
+		"initialized, divert %s, nat %s, "
+		"rule-based forwarding "
+#ifdef IPFIREWALL_FORWARD
+		"enabled, "
+#else
+		"disabled, "
+#endif
+		"default to %s, logging ",
+#ifdef IPDIVERT
+		"enabled",	/* argument for "divert %s" */
+#else
+		"loadable",
+#endif
+#ifdef IPFIREWALL_NAT
+		"enabled",	/* argument for "nat %s" */
+#else
+		"loadable",
+#endif
+		default_to_accept ? "accept" : "deny");	/* argument for "default to %s" */
+
+	/*
+	 * Note: V_xxx variables can be accessed here but the vnet specific
+	 * initializer may not have been called yet for the VIMAGE case.
+	 * Tunables will have been processed. We will print out values for
+	 * the default vnet.
+	 * XXX This should all be rationalized AFTER 8.0
+	 */
+	if (V_fw_verbose == 0)
+		printf("disabled\n");
+	else if (V_verbose_limit == 0)
+		printf("unlimited\n");
+	else
+		printf("limited to %d packets/entry by default\n",
+		    V_verbose_limit);
+
+	ipfw_log_bpf(1); /* init */
+	return (error);
+}
+
+/*
+ * Called for the removal of the last instance only on module unload.
+ */
+static void
+ipfw_destroy(void)
+{
+
+	ipfw_log_bpf(0); /* uninit */
+	ipfw_dyn_detach();	/* reverse order of the two setup calls in ipfw_init() */
+	printf("IP firewall unloaded\n");
+}
+
+/*
+ * Stuff that must be initialized for every instance
+ * (including the first of course).
+ */ +static int +vnet_ipfw_init(const void *unused) +{ + int error; + struct ip_fw *rule = NULL; + struct ip_fw_chain *chain; + + chain = &V_layer3_chain; + + /* First set up some values that are compile time options */ + V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ + V_fw_deny_unknown_exthdrs = 1; +#ifdef IPFIREWALL_VERBOSE + V_fw_verbose = 1; +#endif +#ifdef IPFIREWALL_VERBOSE_LIMIT + V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; +#endif +#ifdef IPFIREWALL_NAT + LIST_INIT(&chain->nat); +#endif + + /* insert the default rule and create the initial map */ + chain->n_rules = 1; + chain->static_len = sizeof(struct ip_fw); + chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_NOWAIT | M_ZERO); + if (chain->map) + rule = malloc(chain->static_len, M_IPFW, M_NOWAIT | M_ZERO); + if (rule == NULL) { + if (chain->map) + free(chain->map, M_IPFW); + printf("ipfw2: ENOSPC initializing default rule " + "(support disabled)\n"); + return (ENOSPC); + } + error = ipfw_init_tables(chain); + if (error) { + panic("init_tables"); /* XXX Marko fix this ! */ + } + + /* fill and insert the default rule */ + rule->act_ofs = 0; + rule->rulenum = IPFW_DEFAULT_RULE; + rule->cmd_len = 1; + rule->set = RESVD_SET; + rule->cmd[0].len = 1; + rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY; + chain->rules = chain->default_rule = chain->map[0] = rule; + chain->id = rule->id = 1; + + IPFW_LOCK_INIT(chain); + ipfw_dyn_init(); + + /* First set up some values that are compile time options */ + V_ipfw_vnet_ready = 1; /* Open for business */ + + /* + * Hook the sockopt handler, and the layer2 (V_ip_fw_chk_ptr) + * and pfil hooks for ipv4 and ipv6. Even if the latter two fail + * we still keep the module alive because the sockopt and + * layer2 paths are still useful. + * ipfw[6]_hook return 0 on success, ENOENT on failure, + * so we can ignore the exact return value and just set a flag. 
+ * + * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so + * changes in the underlying (per-vnet) variables trigger + * immediate hook()/unhook() calls. + * In layer2 we have the same behaviour, except that V_ether_ipfw + * is checked on each packet because there are no pfil hooks. + */ + V_ip_fw_ctl_ptr = ipfw_ctl; + V_ip_fw_chk_ptr = ipfw_chk; + error = ipfw_attach_hooks(1); + return (error); +} + +/* + * Called for the removal of each instance. + */ +static int +vnet_ipfw_uninit(const void *unused) +{ + struct ip_fw *reap, *rule; + struct ip_fw_chain *chain = &V_layer3_chain; + int i; + + V_ipfw_vnet_ready = 0; /* tell new callers to go away */ + /* + * disconnect from ipv4, ipv6, layer2 and sockopt. + * Then grab, release and grab again the WLOCK so we make + * sure the update is propagated and nobody will be in. + */ + (void)ipfw_attach_hooks(0 /* detach */); + V_ip_fw_chk_ptr = NULL; + V_ip_fw_ctl_ptr = NULL; + IPFW_UH_WLOCK(chain); + IPFW_UH_WUNLOCK(chain); + IPFW_UH_WLOCK(chain); + + IPFW_WLOCK(chain); + IPFW_WUNLOCK(chain); + IPFW_WLOCK(chain); + + ipfw_dyn_uninit(0); /* run the callout_drain */ + ipfw_destroy_tables(chain); + reap = NULL; + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + rule->x_next = reap; + reap = rule; + } + if (chain->map) + free(chain->map, M_IPFW); + IPFW_WUNLOCK(chain); + IPFW_UH_WUNLOCK(chain); + if (reap != NULL) + ipfw_reap_rules(reap); + IPFW_LOCK_DESTROY(chain); + ipfw_dyn_uninit(1); /* free the remaining parts */ + return 0; +} + +/* + * Module event handler. + * In general we have the choice of handling most of these events by the + * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to + * use the SYSINIT handlers as they are more capable of expressing the + * flow of control during module and vnet operations, so this is just + * a skeleton. Note there is no SYSINIT equivalent of the module + * SHUTDOWN handler, but we don't have anything to do in that case anyhow. 
+ */ +static int +ipfw_modevent(module_t mod, int type, void *unused) +{ + int err = 0; + + switch (type) { + case MOD_LOAD: + /* Called once at module load or + * system boot if compiled in. */ + break; + case MOD_QUIESCE: + /* Called before unload. May veto unloading. */ + break; + case MOD_UNLOAD: + /* Called during unload. */ + break; + case MOD_SHUTDOWN: + /* Called during system shutdown. */ + break; + default: + err = EOPNOTSUPP; + break; + } + return err; +} + +static moduledata_t ipfwmod = { + "ipfw", + ipfw_modevent, + 0 +}; + +/* Define startup order. */ +#define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN +#define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* On boot slot in here. */ +#define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later. */ +#define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still. */ + +DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER); +MODULE_VERSION(ipfw, 2); +/* should declare some dependencies here */ + +/* + * Starting up. Done in order after ipfwmod() has been called. + * VNET_SYSINIT is also called for each existing vnet and each new vnet. + */ +SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, + ipfw_init, NULL); +VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, + vnet_ipfw_init, NULL); + +/* + * Closing up shop. These are done in REVERSE ORDER, but still + * after ipfwmod() has been called. Not called on reboot. + * VNET_SYSUNINIT is also called for each exiting vnet as it exits. + * or when the module is unloaded. 
+ */ +SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, + ipfw_destroy, NULL); +VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, + vnet_ipfw_uninit, NULL); +/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_dynamic.c b/sys/netinet/ipfw/ip_fw_dynamic.c new file mode 100644 index 0000000..2bdd299 --- /dev/null +++ b/sys/netinet/ipfw/ip_fw_dynamic.c @@ -0,0 +1,1241 @@ +/*- + * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_dynamic.c 200601 2009-12-16 10:48:40Z luigi $"); + +#define DEB(x) +#define DDB(x) x + +/* + * Dynamic rule support for ipfw + */ + +#if !defined(KLD_MODULE) +#include "opt_ipfw.h" +#include "opt_ipdivert.h" +#include "opt_ipdn.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif +#include "opt_inet6.h" +#include "opt_ipsec.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ETHERTYPE_IP */ +#include +#include + +#include +#include +#include /* ip_defttl */ +#include +#include +#include +#include + +#include /* IN6_ARE_ADDR_EQUAL */ +#ifdef INET6 +#include +#include +#endif + +#include /* XXX for in_cksum */ + +#ifdef MAC +#include +#endif + +/* + * Description of dynamic rules. + * + * Dynamic rules are stored in lists accessed through a hash table + * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can + * be modified through the sysctl variable dyn_buckets which is + * updated when the table becomes empty. + * + * XXX currently there is only one list, ipfw_dyn. + * + * When a packet is received, its address fields are first masked + * with the mask defined for the rule, then hashed, then matched + * against the entries in the corresponding list. + * Dynamic rules can be used for different purposes: + * + stateful rules; + * + enforcing limits on the number of sessions; + * + in-kernel NAT (not implemented yet) + * + * The lifetime of dynamic rules is regulated by dyn_*_lifetime, + * measured in seconds and depending on the flags. + * + * The total number of dynamic rules is stored in dyn_count. + * The max number of dynamic rules is dyn_max. When we reach + * the maximum number of rules we do not create anymore. 
This is
+ * done to avoid consuming too much memory, but also too much
+ * time when searching on each packet (ideally, we should try instead
+ * to put a limit on the length of the list on each bucket...).
+ *
+ * Each dynamic rule holds a pointer to the parent ipfw rule so
+ * we know what action to perform. Dynamic rules are removed when
+ * the parent rule is deleted. XXX we should make them survive.
+ *
+ * There are some limitations with dynamic rules -- we do not
+ * obey the 'randomized match', and we do not do multiple
+ * passes through the firewall. XXX check the latter!!!
+ */
+
+/*
+ * Per-vnet state (VNET_DEFINE) with its V_ accessors, then file-wide statics
+ */
+static VNET_DEFINE(ipfw_dyn_rule **, ipfw_dyn_v);	/* hash table: one rule list per bucket */
+static VNET_DEFINE(u_int32_t, dyn_buckets);		/* requested size, writable via sysctl */
+static VNET_DEFINE(u_int32_t, curr_dyn_buckets);	/* size currently in use, read-only sysctl */
+static VNET_DEFINE(struct callout, ipfw_timeout);
+#define	V_ipfw_dyn_v			VNET(ipfw_dyn_v)
+#define	V_dyn_buckets			VNET(dyn_buckets)
+#define	V_curr_dyn_buckets		VNET(curr_dyn_buckets)
+#define	V_ipfw_timeout			VNET(ipfw_timeout)
+
+static uma_zone_t ipfw_dyn_rule_zone;	/* zone that UNLINK_DYN_RULE frees entries to */
+#ifndef __FreeBSD__
+DEFINE_SPINLOCK(ipfw_dyn_mtx);	/* NOTE(review): mtx_*() below presumably remapped by the port glue -- confirm */
+#else
+static struct mtx ipfw_dyn_mtx;		/* mutex guarding dynamic rules */
+#endif
+
+#define	IPFW_DYN_LOCK_INIT() \
+	mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF)
+#define	IPFW_DYN_LOCK_DESTROY()	mtx_destroy(&ipfw_dyn_mtx)
+#define	IPFW_DYN_LOCK()		mtx_lock(&ipfw_dyn_mtx)
+#define	IPFW_DYN_UNLOCK()	mtx_unlock(&ipfw_dyn_mtx)
+#define	IPFW_DYN_LOCK_ASSERT()	mtx_assert(&ipfw_dyn_mtx, MA_OWNED)
+
+void
+ipfw_dyn_unlock(void)
+{
+	IPFW_DYN_UNLOCK();	/* non-static wrapper: lets code outside this file (ipfw_chk) drop ipfw_dyn_mtx */
+}
+
+/*
+ * Timeouts for various events in handling dynamic rules.
+ */
+static VNET_DEFINE(u_int32_t, dyn_ack_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_syn_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_fin_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_rst_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_udp_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_short_lifetime);
+
+#define	V_dyn_ack_lifetime		VNET(dyn_ack_lifetime)
+#define	V_dyn_syn_lifetime		VNET(dyn_syn_lifetime)
+#define	V_dyn_fin_lifetime		VNET(dyn_fin_lifetime)
+#define	V_dyn_rst_lifetime		VNET(dyn_rst_lifetime)
+#define	V_dyn_udp_lifetime		VNET(dyn_udp_lifetime)
+#define	V_dyn_short_lifetime		VNET(dyn_short_lifetime)
+
+/*
+ * Keepalives are sent if dyn_keepalive is set. They are sent every
+ * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
+ * seconds of lifetime of a rule.
+ * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
+ * than dyn_keepalive_period.
+ */
+
+static VNET_DEFINE(u_int32_t, dyn_keepalive_interval);	/* no sysctl in the block below */
+static VNET_DEFINE(u_int32_t, dyn_keepalive_period);	/* no sysctl in the block below */
+static VNET_DEFINE(u_int32_t, dyn_keepalive);
+
+#define	V_dyn_keepalive_interval	VNET(dyn_keepalive_interval)
+#define	V_dyn_keepalive_period		VNET(dyn_keepalive_period)
+#define	V_dyn_keepalive			VNET(dyn_keepalive)
+
+static VNET_DEFINE(u_int32_t, dyn_count);		/* # of dynamic rules */
+static VNET_DEFINE(u_int32_t, dyn_max);			/* max # of dynamic rules */
+
+#define	V_dyn_count			VNET(dyn_count)
+#define	V_dyn_max			VNET(dyn_max)
+
+#ifdef SYSCTL_NODE
+
+SYSBEGIN(f2)
+
+SYSCTL_DECL(_net_inet_ip_fw);
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
+    CTLFLAG_RW, &VNET_NAME(dyn_buckets), 0,
+    "Number of dyn. buckets");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
+    CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0,
+    "Current Number of dyn. buckets");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_count,
+    CTLFLAG_RD, &VNET_NAME(dyn_count), 0,
+    "Number of dyn. rules");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_max,
+    CTLFLAG_RW, &VNET_NAME(dyn_max), 0,
+    "Max number of dyn. rules");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0,
+    "Lifetime of dyn. rules for acks");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0,
+    "Lifetime of dyn. rules for syn");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0,
+    "Lifetime of dyn. rules for fin");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0,
+    "Lifetime of dyn. rules for rst");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0,
+    "Lifetime of dyn. rules for UDP");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
+    CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0,
+    "Lifetime of dyn. rules for other situations");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive,
+    CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
+    "Enable keepalives for dyn. rules");
+
+SYSEND
+
+#endif /* SYSCTL_NODE */
+
+/* IPv6 flow hash: XOR of 32-bit words 2 and 3 of both addresses and of both ports; hash_packet() masks it. */
+static __inline int
+hash_packet6(struct ipfw_flow_id *id)
+{
+	u_int32_t i;	/* accumulated as unsigned, returned as int */
+	i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^
+	    (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^
+	    (id->src_ip6.__u6_addr.__u6_addr32[2]) ^
+	    (id->src_ip6.__u6_addr.__u6_addr32[3]) ^
+	    (id->dst_port) ^ (id->src_port);	/* XOR: swapping src and dst gives the same value */
+	return i;
+}
+
+/*
+ * IMPORTANT: the hash function for dynamic rules must be commutative
+ * in source and destination (ip,port), because rules are bidirectional
+ * and we want to find both in the same bucket.
+ */
+static __inline int
+hash_packet(struct ipfw_flow_id *id)
+{
+	u_int32_t i;
+
+#ifdef INET6
+	if (IS_IP6_FLOW_ID(id))
+		i = hash_packet6(id);
+	else
+#endif /* INET6 */
+	i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port);
+	i &= (V_curr_dyn_buckets - 1);	/* reduce to a bucket index; NOTE(review): presumably the bucket count is a power of 2 -- confirm */
+	return i;
+}
+
+static __inline void		/* debug print, referenced through DEB() in UNLINK_DYN_RULE */
+unlink_dyn_rule_print(struct ipfw_flow_id *id)
+{
+	struct in_addr da;	/* scratch address handed to inet_ntoa_r() */
+#ifdef INET6
+	char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
+#else
+	char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+
+#ifdef INET6
+	if (IS_IP6_FLOW_ID(id)) {
+		ip6_sprintf(src, &id->src_ip6);
+		ip6_sprintf(dst, &id->dst_ip6);
+	} else
+#endif
+	{
+		da.s_addr = htonl(id->src_ip);
+		inet_ntoa_r(da, src);
+		da.s_addr = htonl(id->dst_ip);
+		inet_ntoa_r(da, dst);
+	}
+	printf("ipfw: unlink entry %s %d -> %s %d, %d left\n",
+	    src, id->src_port, dst, id->dst_port, V_dyn_count - 1);	/* "- 1": the macro decrements V_dyn_count after this call */
+}
+
+/**
+ * Unlink dynamic rule q from its list and free it to ipfw_dyn_rule_zone.
+ * prev is the preceding entry (NULL when q is first), head is the list head.
+ * On exit q is the successor of the freed entry; head changes if prev is NULL.
+ * Expands to a bare { } block and uses q and prev more than once.
+ */
+#define UNLINK_DYN_RULE(prev, head, q) {				\
+	ipfw_dyn_rule *old_q = q;					\
+									\
+	/* remove a refcount to the parent */				\
+	if (q->dyn_type == O_LIMIT)					\
+		q->parent->count--;					\
+	DEB(unlink_dyn_rule_print(&q->id);)				\
+	if (prev != NULL)						\
+		prev->next = q = q->next;				\
+	else								\
+		head = q = q->next;					\
+	V_dyn_count--;							\
+	uma_zfree(ipfw_dyn_rule_zone, old_q); }
+
+#define TIME_LEQ(a,b)       ((int)((a)-(b)) <= 0)	/* "a <= b" judged by the sign of the difference */
+
+/**
+ * Remove dynamic rules pointing to "rule", or all of them if rule == NULL.
+ * Asserts that the dynamic-rule lock is held (IPFW_DYN_LOCK_ASSERT()).
+ * If keep_me == NULL, rules are deleted even if not expired,
+ * otherwise only expired rules are removed.
+ *
+ * The second parameter is also used to identify
+ * a rule we absolutely do not want to remove (e.g. because we are
+ * holding a reference to it -- this is the case with O_LIMIT_PARENT
+ * rules). The pointer is only used for comparison, so any non-null
+ * value will do.
+ */
+static void
+remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me)
+{
+	static u_int32_t last_remove = 0;	/* time_uptime of the last run; a plain static, not a VNET variable */
+
+#define FORCE (keep_me == NULL)
+
+	ipfw_dyn_rule *prev, *q;
+	int i, pass = 0, max_pass = 0;	/* max_pass becomes 1 once a LIMIT_PARENT entry is seen */
+
+	IPFW_DYN_LOCK_ASSERT();
+
+	if (V_ipfw_dyn_v == NULL || V_dyn_count == 0)
+		return;
+	/* do not expire more than once per second, it is useless */
+	if (!FORCE && last_remove == time_uptime)
+		return;
+	last_remove = time_uptime;
+
+	/*
+	 * because O_LIMIT entries refer to parent rules, the first pass only
+	 * removes children and notes any pending LIMIT_PARENT; parents are
+	 * removed in a second pass.
+	 */
+next_pass:
+	for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+		for (prev=NULL, q = V_ipfw_dyn_v[i] ; q ; ) {
+			/*
+			 * Logic can become complex here, so we split tests.
+			 */
+			if (q == keep_me)
+				goto next;
+			if (rule != NULL && rule != q->rule)
+				goto next; /* not the one we are looking for */
+			if (q->dyn_type == O_LIMIT_PARENT) {
+				/*
+				 * handle parent in the second pass,
+				 * record we need one.
+				 */
+				max_pass = 1;
+				if (pass == 0)
+					goto next;
+				if (FORCE && q->count != 0 ) {
+					/* XXX should not happen! */
+					printf("ipfw: OUCH! cannot remove rule,"
+					     " count %d\n", q->count);
+				}
+			} else {	/* only non-parent entries get the expiry test */
+				if (!FORCE &&
+				    !TIME_LEQ( q->expire, time_uptime ))
+					goto next;
+			}
+			if (q->dyn_type != O_LIMIT_PARENT || !q->count) {	/* a parent with children left stays linked */
+				UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
+				continue;	/* the macro already advanced q; prev is unchanged */
+			}
+next:
+			prev=q;
+			q=q->next;
+		}
+	}
+	if (pass++ < max_pass)	/* rescan once, and only if a LIMIT_PARENT was seen */
+		goto next_pass;
+}
+
+void
+ipfw_remove_dyn_children(struct ip_fw *rule)
+{
+	IPFW_DYN_LOCK();	/* remove_dyn_rule() asserts that this lock is held */
+	remove_dyn_rule(rule, NULL /* force removal */);
+	IPFW_DYN_UNLOCK();
+}
+
+/**
+ * lookup a dynamic rule, locked version
+ */
+static ipfw_dyn_rule *
+lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction,
+    struct tcphdr *tcp)
+{
+	/*
+	 * stateful ipfw extensions.
+ * Lookup into dynamic session queue + */ +#define MATCH_REVERSE 0 +#define MATCH_FORWARD 1 +#define MATCH_NONE 2 +#define MATCH_UNKNOWN 3 + int i, dir = MATCH_NONE; + ipfw_dyn_rule *prev, *q=NULL; + + IPFW_DYN_LOCK_ASSERT(); + + if (V_ipfw_dyn_v == NULL) + goto done; /* not found */ + i = hash_packet( pkt ); + for (prev=NULL, q = V_ipfw_dyn_v[i] ; q != NULL ; ) { + if (q->dyn_type == O_LIMIT_PARENT && q->count) + goto next; + if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */ + UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q); + continue; + } + if (pkt->proto == q->id.proto && + q->dyn_type != O_LIMIT_PARENT) { + if (IS_IP6_FLOW_ID(pkt)) { + if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), + &(q->id.src_ip6)) && + IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), + &(q->id.dst_ip6)) && + pkt->src_port == q->id.src_port && + pkt->dst_port == q->id.dst_port ) { + dir = MATCH_FORWARD; + break; + } + if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), + &(q->id.dst_ip6)) && + IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), + &(q->id.src_ip6)) && + pkt->src_port == q->id.dst_port && + pkt->dst_port == q->id.src_port ) { + dir = MATCH_REVERSE; + break; + } + } else { + if (pkt->src_ip == q->id.src_ip && + pkt->dst_ip == q->id.dst_ip && + pkt->src_port == q->id.src_port && + pkt->dst_port == q->id.dst_port ) { + dir = MATCH_FORWARD; + break; + } + if (pkt->src_ip == q->id.dst_ip && + pkt->dst_ip == q->id.src_ip && + pkt->src_port == q->id.dst_port && + pkt->dst_port == q->id.src_port ) { + dir = MATCH_REVERSE; + break; + } + } + } +next: + prev = q; + q = q->next; + } + if (q == NULL) + goto done; /* q = NULL, not found */ + + if ( prev != NULL) { /* found and not in front */ + prev->next = q->next; + q->next = V_ipfw_dyn_v[i]; + V_ipfw_dyn_v[i] = q; + } + if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */ + u_char flags = pkt->_flags & (TH_FIN|TH_SYN|TH_RST); + +#define BOTH_SYN (TH_SYN | (TH_SYN << 8)) +#define BOTH_FIN (TH_FIN | (TH_FIN << 8)) + q->state |= (dir == MATCH_FORWARD ) ? 
flags : (flags << 8); + switch (q->state) { + case TH_SYN: /* opening */ + q->expire = time_uptime + V_dyn_syn_lifetime; + break; + + case BOTH_SYN: /* move to established */ + case BOTH_SYN | TH_FIN : /* one side tries to close */ + case BOTH_SYN | (TH_FIN << 8) : + if (tcp) { +#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0) + u_int32_t ack = ntohl(tcp->th_ack); + if (dir == MATCH_FORWARD) { + if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd)) + q->ack_fwd = ack; + else { /* ignore out-of-sequence */ + break; + } + } else { + if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev)) + q->ack_rev = ack; + else { /* ignore out-of-sequence */ + break; + } + } + } + q->expire = time_uptime + V_dyn_ack_lifetime; + break; + + case BOTH_SYN | BOTH_FIN: /* both sides closed */ + if (V_dyn_fin_lifetime >= V_dyn_keepalive_period) + V_dyn_fin_lifetime = V_dyn_keepalive_period - 1; + q->expire = time_uptime + V_dyn_fin_lifetime; + break; + + default: +#if 0 + /* + * reset or some invalid combination, but can also + * occur if we use keep-state the wrong way. 
+ */ + if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) + printf("invalid state: 0x%x\n", q->state); +#endif + if (V_dyn_rst_lifetime >= V_dyn_keepalive_period) + V_dyn_rst_lifetime = V_dyn_keepalive_period - 1; + q->expire = time_uptime + V_dyn_rst_lifetime; + break; + } + } else if (pkt->proto == IPPROTO_UDP) { + q->expire = time_uptime + V_dyn_udp_lifetime; + } else { + /* other protocols */ + q->expire = time_uptime + V_dyn_short_lifetime; + } +done: + if (match_direction) + *match_direction = dir; + return q; +} + +ipfw_dyn_rule * +ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, + struct tcphdr *tcp) +{ + ipfw_dyn_rule *q; + + IPFW_DYN_LOCK(); + q = lookup_dyn_rule_locked(pkt, match_direction, tcp); + if (q == NULL) + IPFW_DYN_UNLOCK(); + /* NB: return table locked when q is not NULL */ + return q; +} + +static void +realloc_dynamic_table(void) +{ + IPFW_DYN_LOCK_ASSERT(); + + /* + * Try reallocation, make sure we have a power of 2 and do + * not allow more than 64k entries. In case of overflow, + * default to 1024. + */ + + if (V_dyn_buckets > 65536) + V_dyn_buckets = 1024; + if ((V_dyn_buckets & (V_dyn_buckets-1)) != 0) { /* not a power of 2 */ + V_dyn_buckets = V_curr_dyn_buckets; /* reset */ + return; + } + V_curr_dyn_buckets = V_dyn_buckets; + if (V_ipfw_dyn_v != NULL) + free(V_ipfw_dyn_v, M_IPFW); + for (;;) { + V_ipfw_dyn_v = malloc(V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *), + M_IPFW, M_NOWAIT | M_ZERO); + if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2) + break; + V_curr_dyn_buckets /= 2; + } +} + +/** + * Install state of type 'type' for a dynamic session. + * The hash table contains two type of rules: + * - regular rules (O_KEEP_STATE) + * - rules for sessions with limited number of sess per user + * (O_LIMIT). When they are created, the parent is + * increased by 1, and decreased on delete. In this case, + * the third parameter is the parent rule and not the chain. + * - "parent" rules for the above (O_LIMIT_PARENT). 
+ */ +static ipfw_dyn_rule * +add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) +{ + ipfw_dyn_rule *r; + int i; + + IPFW_DYN_LOCK_ASSERT(); + + if (V_ipfw_dyn_v == NULL || + (V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) { + realloc_dynamic_table(); + if (V_ipfw_dyn_v == NULL) + return NULL; /* failed ! */ + } + i = hash_packet(id); + + r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO); + if (r == NULL) { + printf ("ipfw: sorry cannot allocate state\n"); + return NULL; + } + + /* increase refcount on parent, and set pointer */ + if (dyn_type == O_LIMIT) { + ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule; + if ( parent->dyn_type != O_LIMIT_PARENT) + panic("invalid parent"); + parent->count++; + r->parent = parent; + rule = parent->rule; + } + + r->id = *id; + r->expire = time_uptime + V_dyn_syn_lifetime; + r->rule = rule; + r->dyn_type = dyn_type; + r->pcnt = r->bcnt = 0; + r->count = 0; + + r->bucket = i; + r->next = V_ipfw_dyn_v[i]; + V_ipfw_dyn_v[i] = r; + V_dyn_count++; + DEB({ + struct in_addr da; +#ifdef INET6 + char src[INET6_ADDRSTRLEN]; + char dst[INET6_ADDRSTRLEN]; +#else + char src[INET_ADDRSTRLEN]; + char dst[INET_ADDRSTRLEN]; +#endif + +#ifdef INET6 + if (IS_IP6_FLOW_ID(&(r->id))) { + ip6_sprintf(src, &r->id.src_ip6); + ip6_sprintf(dst, &r->id.dst_ip6); + } else +#endif + { + da.s_addr = htonl(r->id.src_ip); + inet_ntoa_r(da, src); + da.s_addr = htonl(r->id.dst_ip); + inet_ntoa_r(da, dst); + } + printf("ipfw: add dyn entry ty %d %s %d -> %s %d, total %d\n", + dyn_type, src, r->id.src_port, dst, r->id.dst_port, + V_dyn_count); + }) + return r; +} + +/** + * lookup dynamic parent rule using pkt and rule as search keys. + * If the lookup fails, then install one. 
+ */ +static ipfw_dyn_rule * +lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule) +{ + ipfw_dyn_rule *q; + int i; + + IPFW_DYN_LOCK_ASSERT(); + + if (V_ipfw_dyn_v) { + int is_v6 = IS_IP6_FLOW_ID(pkt); + i = hash_packet( pkt ); + for (q = V_ipfw_dyn_v[i] ; q != NULL ; q=q->next) + if (q->dyn_type == O_LIMIT_PARENT && + rule== q->rule && + pkt->proto == q->id.proto && + pkt->src_port == q->id.src_port && + pkt->dst_port == q->id.dst_port && + ( + (is_v6 && + IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), + &(q->id.src_ip6)) && + IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), + &(q->id.dst_ip6))) || + (!is_v6 && + pkt->src_ip == q->id.src_ip && + pkt->dst_ip == q->id.dst_ip) + ) + ) { + q->expire = time_uptime + V_dyn_short_lifetime; + DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);) + return q; + } + } + return add_dyn_rule(pkt, O_LIMIT_PARENT, rule); +} + +/** + * Install dynamic state for rule type cmd->o.opcode + * + * Returns 1 (failure) if state is not installed because of errors or because + * session limitations are enforced. 
+ */ +int +ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, + struct ip_fw_args *args, uint32_t tablearg) +{ + static int last_log; + ipfw_dyn_rule *q; + struct in_addr da; +#ifdef INET6 + char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2]; +#else + char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; +#endif + + src[0] = '\0'; + dst[0] = '\0'; + + IPFW_DYN_LOCK(); + + DEB( +#ifdef INET6 + if (IS_IP6_FLOW_ID(&(args->f_id))) { + ip6_sprintf(src, &args->f_id.src_ip6); + ip6_sprintf(dst, &args->f_id.dst_ip6); + } else +#endif + { + da.s_addr = htonl(args->f_id.src_ip); + inet_ntoa_r(da, src); + da.s_addr = htonl(args->f_id.dst_ip); + inet_ntoa_r(da, dst); + } + printf("ipfw: %s: type %d %s %u -> %s %u\n", + __func__, cmd->o.opcode, src, args->f_id.src_port, + dst, args->f_id.dst_port); + src[0] = '\0'; + dst[0] = '\0'; + ) + + q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL); + + if (q != NULL) { /* should never occur */ + if (last_log != time_uptime) { + last_log = time_uptime; + printf("ipfw: %s: entry already present, done\n", + __func__); + } + IPFW_DYN_UNLOCK(); + return (0); + } + + if (V_dyn_count >= V_dyn_max) + /* Run out of slots, try to remove any expired rule. */ + remove_dyn_rule(NULL, (ipfw_dyn_rule *)1); + + if (V_dyn_count >= V_dyn_max) { + if (last_log != time_uptime) { + last_log = time_uptime; + printf("ipfw: %s: Too many dynamic rules\n", __func__); + } + IPFW_DYN_UNLOCK(); + return (1); /* cannot install, notify caller */ + } + + switch (cmd->o.opcode) { + case O_KEEP_STATE: /* bidir rule */ + add_dyn_rule(&args->f_id, O_KEEP_STATE, rule); + break; + + case O_LIMIT: { /* limit number of sessions */ + struct ipfw_flow_id id; + ipfw_dyn_rule *parent; + uint32_t conn_limit; + uint16_t limit_mask = cmd->limit_mask; + + conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ? 
+ tablearg : cmd->conn_limit; + + DEB( + if (cmd->conn_limit == IP_FW_TABLEARG) + printf("ipfw: %s: O_LIMIT rule, conn_limit: %u " + "(tablearg)\n", __func__, conn_limit); + else + printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n", + __func__, conn_limit); + ) + + id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0; + id.proto = args->f_id.proto; + id.addr_type = args->f_id.addr_type; + id.fib = M_GETFIB(args->m); + + if (IS_IP6_FLOW_ID (&(args->f_id))) { + if (limit_mask & DYN_SRC_ADDR) + id.src_ip6 = args->f_id.src_ip6; + if (limit_mask & DYN_DST_ADDR) + id.dst_ip6 = args->f_id.dst_ip6; + } else { + if (limit_mask & DYN_SRC_ADDR) + id.src_ip = args->f_id.src_ip; + if (limit_mask & DYN_DST_ADDR) + id.dst_ip = args->f_id.dst_ip; + } + if (limit_mask & DYN_SRC_PORT) + id.src_port = args->f_id.src_port; + if (limit_mask & DYN_DST_PORT) + id.dst_port = args->f_id.dst_port; + if ((parent = lookup_dyn_parent(&id, rule)) == NULL) { + printf("ipfw: %s: add parent failed\n", __func__); + IPFW_DYN_UNLOCK(); + return (1); + } + + if (parent->count >= conn_limit) { + /* See if we can remove some expired rule. */ + remove_dyn_rule(rule, parent); + if (parent->count >= conn_limit) { + if (V_fw_verbose && last_log != time_uptime) { + last_log = time_uptime; +#ifdef INET6 + /* + * XXX IPv6 flows are not + * supported yet. 
+ */ + if (IS_IP6_FLOW_ID(&(args->f_id))) { + char ip6buf[INET6_ADDRSTRLEN]; + snprintf(src, sizeof(src), + "[%s]", ip6_sprintf(ip6buf, + &args->f_id.src_ip6)); + snprintf(dst, sizeof(dst), + "[%s]", ip6_sprintf(ip6buf, + &args->f_id.dst_ip6)); + } else +#endif + { + da.s_addr = + htonl(args->f_id.src_ip); + inet_ntoa_r(da, src); + da.s_addr = + htonl(args->f_id.dst_ip); + inet_ntoa_r(da, dst); + } + log(LOG_SECURITY | LOG_DEBUG, + "ipfw: %d %s %s:%u -> %s:%u, %s\n", + parent->rule->rulenum, + "drop session", + src, (args->f_id.src_port), + dst, (args->f_id.dst_port), + "too many entries"); + } + IPFW_DYN_UNLOCK(); + return (1); + } + } + add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent); + break; + } + default: + printf("ipfw: %s: unknown dynamic rule type %u\n", + __func__, cmd->o.opcode); + IPFW_DYN_UNLOCK(); + return (1); + } + + /* XXX just set lifetime */ + lookup_dyn_rule_locked(&args->f_id, NULL, NULL); + + IPFW_DYN_UNLOCK(); + return (0); +} + +/* + * Generate a TCP packet, containing either a RST or a keepalive. + * When flags & TH_RST, we are sending a RST packet, because of a + * "reset" action matched the packet. + * Otherwise we are sending a keepalive, and flags & TH_ + * The 'replyto' mbuf is the mbuf being replied to, if any, and is required + * so that MAC can label the reply appropriately. 
+ */ +struct mbuf * +ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, + u_int32_t ack, int flags) +{ + struct mbuf *m = NULL; /* stupid compiler */ + int len, dir; + struct ip *h = NULL; /* stupid compiler */ +#ifdef INET6 + struct ip6_hdr *h6 = NULL; +#endif + struct tcphdr *th = NULL; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) + return (NULL); + + M_SETFIB(m, id->fib); +#ifdef MAC + if (replyto != NULL) + mac_netinet_firewall_reply(replyto, m); + else + mac_netinet_firewall_send(m); +#else + (void)replyto; /* don't warn about unused arg */ +#endif + + switch (id->addr_type) { + case 4: + len = sizeof(struct ip) + sizeof(struct tcphdr); + break; +#ifdef INET6 + case 6: + len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + break; +#endif + default: + /* XXX: log me?!? */ + FREE_PKT(m); + return (NULL); + } + dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN); + + m->m_data += max_linkhdr; + m->m_flags |= M_SKIP_FIREWALL; + m->m_pkthdr.len = m->m_len = len; + m->m_pkthdr.rcvif = NULL; + bzero(m->m_data, len); + + switch (id->addr_type) { + case 4: + h = mtod(m, struct ip *); + + /* prepare for checksum */ + h->ip_p = IPPROTO_TCP; + h->ip_len = htons(sizeof(struct tcphdr)); + if (dir) { + h->ip_src.s_addr = htonl(id->src_ip); + h->ip_dst.s_addr = htonl(id->dst_ip); + } else { + h->ip_src.s_addr = htonl(id->dst_ip); + h->ip_dst.s_addr = htonl(id->src_ip); + } + + th = (struct tcphdr *)(h + 1); + break; +#ifdef INET6 + case 6: + h6 = mtod(m, struct ip6_hdr *); + + /* prepare for checksum */ + h6->ip6_nxt = IPPROTO_TCP; + h6->ip6_plen = htons(sizeof(struct tcphdr)); + if (dir) { + h6->ip6_src = id->src_ip6; + h6->ip6_dst = id->dst_ip6; + } else { + h6->ip6_src = id->dst_ip6; + h6->ip6_dst = id->src_ip6; + } + + th = (struct tcphdr *)(h6 + 1); + break; +#endif + } + + if (dir) { + th->th_sport = htons(id->src_port); + th->th_dport = htons(id->dst_port); + } else { + th->th_sport = htons(id->dst_port); + th->th_dport = 
htons(id->src_port); + } + th->th_off = sizeof(struct tcphdr) >> 2; + + if (flags & TH_RST) { + if (flags & TH_ACK) { + th->th_seq = htonl(ack); + th->th_flags = TH_RST; + } else { + if (flags & TH_SYN) + seq++; + th->th_ack = htonl(seq); + th->th_flags = TH_RST | TH_ACK; + } + } else { + /* + * Keepalive - use caller provided sequence numbers + */ + th->th_seq = htonl(seq); + th->th_ack = htonl(ack); + th->th_flags = TH_ACK; + } + + switch (id->addr_type) { + case 4: + th->th_sum = in_cksum(m, len); + + /* finish the ip header */ + h->ip_v = 4; + h->ip_hl = sizeof(*h) >> 2; + h->ip_tos = IPTOS_LOWDELAY; + h->ip_off = 0; + /* ip_len must be in host format for ip_output */ + h->ip_len = len; + h->ip_ttl = V_ip_defttl; + h->ip_sum = 0; + break; +#ifdef INET6 + case 6: + th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6), + sizeof(struct tcphdr)); + + /* finish the ip6 header */ + h6->ip6_vfc |= IPV6_VERSION; + h6->ip6_hlim = IPV6_DEFHLIM; + break; +#endif + } + + return (m); +} + +/* + * This procedure is only used to handle keepalives. It is invoked + * every dyn_keepalive_period + */ + /* dummynet() and ipfw_tick() can't be static in windows */ +void +ipfw_tick(void * vnetx) +{ + struct mbuf *m0, *m, *mnext, **mtailp; +#ifdef INET6 + struct mbuf *m6, **m6_tailp; +#endif + int i; + ipfw_dyn_rule *q; +#ifdef VIMAGE + struct vnet *vp = vnetx; +#endif + + CURVNET_SET(vp); + if (V_dyn_keepalive == 0 || V_ipfw_dyn_v == NULL || V_dyn_count == 0) + goto done; + + /* + * We make a chain of packets to go out here -- not deferring + * until after we drop the IPFW dynamic rule lock would result + * in a lock order reversal with the normal packet input -> ipfw + * call stack. 
+ */ + m0 = NULL; + mtailp = &m0; +#ifdef INET6 + m6 = NULL; + m6_tailp = &m6; +#endif + IPFW_DYN_LOCK(); + for (i = 0 ; i < V_curr_dyn_buckets ; i++) { + for (q = V_ipfw_dyn_v[i] ; q ; q = q->next ) { + if (q->dyn_type == O_LIMIT_PARENT) + continue; + if (q->id.proto != IPPROTO_TCP) + continue; + if ( (q->state & BOTH_SYN) != BOTH_SYN) + continue; + if (TIME_LEQ(time_uptime + V_dyn_keepalive_interval, + q->expire)) + continue; /* too early */ + if (TIME_LEQ(q->expire, time_uptime)) + continue; /* too late, rule expired */ + + m = ipfw_send_pkt(NULL, &(q->id), q->ack_rev - 1, + q->ack_fwd, TH_SYN); + mnext = ipfw_send_pkt(NULL, &(q->id), q->ack_fwd - 1, + q->ack_rev, 0); + + switch (q->id.addr_type) { + case 4: + if (m != NULL) { + *mtailp = m; + mtailp = &(*mtailp)->m_nextpkt; + } + if (mnext != NULL) { + *mtailp = mnext; + mtailp = &(*mtailp)->m_nextpkt; + } + break; +#ifdef INET6 + case 6: + if (m != NULL) { + *m6_tailp = m; + m6_tailp = &(*m6_tailp)->m_nextpkt; + } + if (mnext != NULL) { + *m6_tailp = mnext; + m6_tailp = &(*m6_tailp)->m_nextpkt; + } + break; +#endif + } + + m = mnext = NULL; + } + } + IPFW_DYN_UNLOCK(); + for (m = mnext = m0; m != NULL; m = mnext) { + mnext = m->m_nextpkt; + m->m_nextpkt = NULL; + ip_output(m, NULL, NULL, 0, NULL, NULL); + } +#ifdef INET6 + for (m = mnext = m6; m != NULL; m = mnext) { + mnext = m->m_nextpkt; + m->m_nextpkt = NULL; + ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); + } +#endif +done: + callout_reset_on(&V_ipfw_timeout, V_dyn_keepalive_period * hz, + ipfw_tick, vnetx, 0); + CURVNET_RESTORE(); +} + +void +ipfw_dyn_attach(void) +{ + ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", + sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + + IPFW_DYN_LOCK_INIT(); +} + +void +ipfw_dyn_detach(void) +{ + uma_zdestroy(ipfw_dyn_rule_zone); + IPFW_DYN_LOCK_DESTROY(); +} + +void +ipfw_dyn_init(void) +{ + V_ipfw_dyn_v = NULL; + V_dyn_buckets = 256; /* must be power of 2 */ + V_curr_dyn_buckets = 256; /* 
must be power of 2 */ + + V_dyn_ack_lifetime = 300; + V_dyn_syn_lifetime = 20; + V_dyn_fin_lifetime = 1; + V_dyn_rst_lifetime = 1; + V_dyn_udp_lifetime = 10; + V_dyn_short_lifetime = 5; + + V_dyn_keepalive_interval = 20; + V_dyn_keepalive_period = 5; + V_dyn_keepalive = 1; /* do send keepalives */ + + V_dyn_max = 4096; /* max # of dynamic rules */ + callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE); + callout_reset_on(&V_ipfw_timeout, hz, ipfw_tick, curvnet, 0); +} + +void +ipfw_dyn_uninit(int pass) +{ + if (pass == 0) + callout_drain(&V_ipfw_timeout); + else { + if (V_ipfw_dyn_v != NULL) + free(V_ipfw_dyn_v, M_IPFW); + } +} + +int +ipfw_dyn_len(void) +{ + return (V_ipfw_dyn_v == NULL) ? 0 : + (V_dyn_count * sizeof(ipfw_dyn_rule)); +} + +void +ipfw_get_dynamic(char **pbp, const char *ep) +{ + ipfw_dyn_rule *p, *last = NULL; + char *bp; + int i; + + if (V_ipfw_dyn_v == NULL) + return; + bp = *pbp; + + IPFW_DYN_LOCK(); + for (i = 0 ; i < V_curr_dyn_buckets; i++) + for (p = V_ipfw_dyn_v[i] ; p != NULL; p = p->next) { + if (bp + sizeof *p <= ep) { + ipfw_dyn_rule *dst = + (ipfw_dyn_rule *)bp; + bcopy(p, dst, sizeof *p); + bcopy(&(p->rule->rulenum), &(dst->rule), + sizeof(p->rule->rulenum)); + /* + * store set number into high word of + * dst->rule pointer. + */ + bcopy(&(p->rule->set), + (char *)&dst->rule + + sizeof(p->rule->rulenum), + sizeof(p->rule->set)); + /* + * store a non-null value in "next". + * The userland code will interpret a + * NULL here as a marker + * for the last dynamic rule. + */ + bcopy(&dst, &dst->next, sizeof(dst)); + last = dst; + dst->expire = + TIME_LEQ(dst->expire, time_uptime) ? 
+ 0 : dst->expire - time_uptime ; + bp += sizeof(ipfw_dyn_rule); + } + } + IPFW_DYN_UNLOCK(); + if (last != NULL) /* mark last dynamic rule */ + bzero(&last->next, sizeof(last)); + *pbp = bp; +} +/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_log.c b/sys/netinet/ipfw/ip_fw_log.c new file mode 100644 index 0000000..55b5c26 --- /dev/null +++ b/sys/netinet/ipfw/ip_fw_log.c @@ -0,0 +1,449 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_log.c 209845 2010-07-09 11:27:33Z glebius $"); + +/* + * Logging support for ipfw + */ + +#if !defined(KLD_MODULE) +#include "opt_ipfw.h" +#include "opt_ipdivert.h" +#include "opt_ipdn.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif +#include "opt_inet6.h" +#include "opt_ipsec.h" + +#include +#include +#include +#include +#include +#include +#include +#include /* for ETHERTYPE_IP */ +#include +#include +#include /* for IFT_ETHER */ +#include /* for BPF */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#ifdef INET6 +#include /* ip6_sprintf() */ +#endif + +#ifdef MAC +#include +#endif + +/* + * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T + * Other macros just cast void * into the appropriate type + */ +#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) +#define TCP(p) ((struct tcphdr *)(p)) +#define SCTP(p) ((struct sctphdr *)(p)) +#define UDP(p) ((struct udphdr *)(p)) +#define ICMP(p) ((struct icmphdr *)(p)) +#define ICMP6(p) ((struct icmp6_hdr *)(p)) + +#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? 
sizeof(buf) - len : 0 +#define SNP(buf) buf, sizeof(buf) + +#ifdef WITHOUT_BPF +void +ipfw_log_bpf(int onoff) +{ +} +#else /* !WITHOUT_BPF */ +static struct ifnet *log_if; /* hook to attach to bpf */ + +/* we use this dummy function for all ifnet callbacks */ +static int +log_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr) +{ + return EINVAL; +} + +static int +ipfw_log_output(struct ifnet *ifp, struct mbuf *m, + struct sockaddr *dst, struct route *ro) +{ + if (m != NULL) + m_freem(m); + return EINVAL; +} + +static void +ipfw_log_start(struct ifnet* ifp) +{ + panic("ipfw_log_start() must not be called"); +} + +static const u_char ipfwbroadcastaddr[6] = + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + +void +ipfw_log_bpf(int onoff) +{ + struct ifnet *ifp; + + if (onoff) { + if (log_if) + return; + ifp = if_alloc(IFT_ETHER); + if (ifp == NULL) + return; + if_initname(ifp, "ipfw", 0); + ifp->if_mtu = 65536; + ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_init = (void *)log_dummy; + ifp->if_ioctl = log_dummy; + ifp->if_start = ipfw_log_start; + ifp->if_output = ipfw_log_output; + ifp->if_addrlen = 6; + ifp->if_hdrlen = 14; + if_attach(ifp); + ifp->if_broadcastaddr = ipfwbroadcastaddr; + ifp->if_baudrate = IF_Mbps(10); + bpfattach(ifp, DLT_EN10MB, 14); + log_if = ifp; + } else { + if (log_if) { + ether_ifdetach(log_if); + if_free(log_if); + } + log_if = NULL; + } +} +#endif /* !WITHOUT_BPF */ + +/* + * We enter here when we have a rule with O_LOG. + * XXX this function alone takes about 2Kbytes of code! 
+ */ +void +ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, + struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, + struct ip *ip) +{ + char *action; + int limit_reached = 0; + char action2[40], proto[128], fragment[32]; + + if (V_fw_verbose == 0) { +#ifndef WITHOUT_BPF + + if (log_if == NULL || log_if->if_bpf == NULL) + return; + + if (args->eh) /* layer2, use orig hdr */ + BPF_MTAP2(log_if, args->eh, ETHER_HDR_LEN, m); + else + /* Add fake header. Later we will store + * more info in the header. + */ + BPF_MTAP2(log_if, "DDDDDDSSSSSS\x08\x00", ETHER_HDR_LEN, m); +#endif /* !WITHOUT_BPF */ + return; + } + /* the old 'log' function */ + fragment[0] = '\0'; + proto[0] = '\0'; + + if (f == NULL) { /* bogus pkt */ + if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit) + return; + V_norule_counter++; + if (V_norule_counter == V_verbose_limit) + limit_reached = V_verbose_limit; + action = "Refuse"; + } else { /* O_LOG is the first action, find the real one */ + ipfw_insn *cmd = ACTION_PTR(f); + ipfw_insn_log *l = (ipfw_insn_log *)cmd; + + if (l->max_log != 0 && l->log_left == 0) + return; + l->log_left--; + if (l->log_left == 0) + limit_reached = l->max_log; + cmd += F_LEN(cmd); /* point to first action */ + if (cmd->opcode == O_ALTQ) { + ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; + + snprintf(SNPARGS(action2, 0), "Altq %d", + altq->qid); + cmd += F_LEN(cmd); + } + if (cmd->opcode == O_PROB) + cmd += F_LEN(cmd); + + if (cmd->opcode == O_TAG) + cmd += F_LEN(cmd); + + action = action2; + switch (cmd->opcode) { + case O_DENY: + action = "Deny"; + break; + + case O_REJECT: + if (cmd->arg1==ICMP_REJECT_RST) + action = "Reset"; + else if (cmd->arg1==ICMP_UNREACH_HOST) + action = "Reject"; + else + snprintf(SNPARGS(action2, 0), "Unreach %d", + cmd->arg1); + break; + + case O_UNREACH6: + if (cmd->arg1==ICMP6_UNREACH_RST) + action = "Reset"; + else + snprintf(SNPARGS(action2, 0), "Unreach %d", + cmd->arg1); + break; + + case 
O_ACCEPT: + action = "Accept"; + break; + case O_COUNT: + action = "Count"; + break; + case O_DIVERT: + snprintf(SNPARGS(action2, 0), "Divert %d", + cmd->arg1); + break; + case O_TEE: + snprintf(SNPARGS(action2, 0), "Tee %d", + cmd->arg1); + break; + case O_SETFIB: + snprintf(SNPARGS(action2, 0), "SetFib %d", + cmd->arg1); + break; + case O_SKIPTO: + snprintf(SNPARGS(action2, 0), "SkipTo %d", + cmd->arg1); + break; + case O_PIPE: + snprintf(SNPARGS(action2, 0), "Pipe %d", + cmd->arg1); + break; + case O_QUEUE: + snprintf(SNPARGS(action2, 0), "Queue %d", + cmd->arg1); + break; + case O_FORWARD_IP: { + ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; + int len; + struct in_addr dummyaddr; + if (sa->sa.sin_addr.s_addr == INADDR_ANY) + dummyaddr.s_addr = htonl(tablearg); + else + dummyaddr.s_addr = sa->sa.sin_addr.s_addr; + + len = snprintf(SNPARGS(action2, 0), "Forward to %s", + inet_ntoa(dummyaddr)); + + if (sa->sa.sin_port) + snprintf(SNPARGS(action2, len), ":%d", + sa->sa.sin_port); + } + break; + case O_NETGRAPH: + snprintf(SNPARGS(action2, 0), "Netgraph %d", + cmd->arg1); + break; + case O_NGTEE: + snprintf(SNPARGS(action2, 0), "Ngtee %d", + cmd->arg1); + break; + case O_NAT: + action = "Nat"; + break; + case O_REASS: + action = "Reass"; + break; + default: + action = "UNKNOWN"; + break; + } + } + + if (hlen == 0) { /* non-ip */ + snprintf(SNPARGS(proto, 0), "MAC"); + + } else { + int len; +#ifdef INET6 + char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2]; +#else + char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; +#endif + struct icmphdr *icmp; + struct tcphdr *tcp; + struct udphdr *udp; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + struct icmp6_hdr *icmp6; +#endif + src[0] = '\0'; + dst[0] = '\0'; +#ifdef INET6 + if (IS_IP6_FLOW_ID(&(args->f_id))) { + char ip6buf[INET6_ADDRSTRLEN]; + snprintf(src, sizeof(src), "[%s]", + ip6_sprintf(ip6buf, &args->f_id.src_ip6)); + snprintf(dst, sizeof(dst), "[%s]", + ip6_sprintf(ip6buf, &args->f_id.dst_ip6)); + + ip6 = (struct 
ip6_hdr *)ip; + tcp = (struct tcphdr *)(((char *)ip) + hlen); + udp = (struct udphdr *)(((char *)ip) + hlen); + } else +#endif + { + tcp = L3HDR(struct tcphdr, ip); + udp = L3HDR(struct udphdr, ip); + + inet_ntoa_r(ip->ip_src, src); + inet_ntoa_r(ip->ip_dst, dst); + } + + switch (args->f_id.proto) { + case IPPROTO_TCP: + len = snprintf(SNPARGS(proto, 0), "TCP %s", src); + if (offset == 0) + snprintf(SNPARGS(proto, len), ":%d %s:%d", + ntohs(tcp->th_sport), + dst, + ntohs(tcp->th_dport)); + else + snprintf(SNPARGS(proto, len), " %s", dst); + break; + + case IPPROTO_UDP: + len = snprintf(SNPARGS(proto, 0), "UDP %s", src); + if (offset == 0) + snprintf(SNPARGS(proto, len), ":%d %s:%d", + ntohs(udp->uh_sport), + dst, + ntohs(udp->uh_dport)); + else + snprintf(SNPARGS(proto, len), " %s", dst); + break; + + case IPPROTO_ICMP: + icmp = L3HDR(struct icmphdr, ip); + if (offset == 0) + len = snprintf(SNPARGS(proto, 0), + "ICMP:%u.%u ", + icmp->icmp_type, icmp->icmp_code); + else + len = snprintf(SNPARGS(proto, 0), "ICMP "); + len += snprintf(SNPARGS(proto, len), "%s", src); + snprintf(SNPARGS(proto, len), " %s", dst); + break; +#ifdef INET6 + case IPPROTO_ICMPV6: + icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen); + if (offset == 0) + len = snprintf(SNPARGS(proto, 0), + "ICMPv6:%u.%u ", + icmp6->icmp6_type, icmp6->icmp6_code); + else + len = snprintf(SNPARGS(proto, 0), "ICMPv6 "); + len += snprintf(SNPARGS(proto, len), "%s", src); + snprintf(SNPARGS(proto, len), " %s", dst); + break; +#endif + default: + len = snprintf(SNPARGS(proto, 0), "P:%d %s", + args->f_id.proto, src); + snprintf(SNPARGS(proto, len), " %s", dst); + break; + } + +#ifdef INET6 + if (IS_IP6_FLOW_ID(&(args->f_id))) { + if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) + snprintf(SNPARGS(fragment, 0), + " (frag %08x:%d@%d%s)", + args->f_id.extra, + ntohs(ip6->ip6_plen) - hlen, + ntohs(offset & IP6F_OFF_MASK) << 3, + (offset & IP6F_MORE_FRAG) ? 
"+" : ""); + } else +#endif + { + int ipoff, iplen; + ipoff = ntohs(ip->ip_off); + iplen = ntohs(ip->ip_len); + if (ipoff & (IP_MF | IP_OFFMASK)) + snprintf(SNPARGS(fragment, 0), + " (frag %d:%d@%d%s)", + ntohs(ip->ip_id), iplen - (ip->ip_hl << 2), + offset << 3, + (ipoff & IP_MF) ? "+" : ""); + } + } +#ifdef __FreeBSD__ + if (oif || m->m_pkthdr.rcvif) + log(LOG_SECURITY | LOG_INFO, + "ipfw: %d %s %s %s via %s%s\n", + f ? f->rulenum : -1, + action, proto, oif ? "out" : "in", + oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname, + fragment); + else +#endif + log(LOG_SECURITY | LOG_INFO, + "ipfw: %d %s %s [no if info]%s\n", + f ? f->rulenum : -1, + action, proto, fragment); + if (limit_reached) + log(LOG_SECURITY | LOG_NOTICE, + "ipfw: limit %d reached on entry %d\n", + limit_reached, f ? f->rulenum : -1); +} +/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_lookup.c b/sys/netinet/ipfw/ip_fw_lookup.c new file mode 100644 index 0000000..bf04cb6 --- /dev/null +++ b/sys/netinet/ipfw/ip_fw_lookup.c @@ -0,0 +1,304 @@ +/*- + * Copyright (c) 2009 Luigi Rizzo Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_fw_table.c 200601 2009-12-16 10:48:40Z luigi $"); + +/* + * Rule and pipe lookup support for ipfw. + * + +ipfw and dummynet need to quickly find objects (rules, pipes) +that may be dynamically created or destroyed. +To address the problem, we label each new object with a unique +32-bit identifier whose low K bits are the index in a lookup +table. All existing objects are referred by the lookup table, +and identifiers are chosen so that for each slot there is +at most one active object (whose identifier points to the slot). +This is almost a hash table, except that we can pick the +identifiers after looking at the table's occupation so +we have a trivial hash function and are collision free. + +With this structure, operations are very fast and simple: +- the table has N entries s[i] with two fields, 'id' and 'ptr', + with N <= M = 2^k (M is an upper bound to the size of the table); +- initially, all slots have s[i].id = i, and the pointers + are used to build a freelist (tailq). +- a slot is considered empty if ptr == NULL or s[0] <= ptr < s[N]. + This is easy to detect and we can use ptr to build the freelist. +- when a new object is created, we put it in the empty slot i at the + head of the freelist, and set the id to s[i].id; +- when an object is destroyed, we append its slot i to the end + of the freelist, and set s[i].id += M (note M, not N). 
+- on a lookup for id = X, we look at slot i = X & (M-1), + and consider the lookup successful only if the slot is not + empty and s[i].id == X; +- wraps occur at most every F * 2^32/M operations, where F is + the number of free slots. Because F is usually a reasonable + fraction of M, we should not worry too much. +- if the table fills up, we can extend it by increasing N +- shrinking the table is more difficult as we might create + collisions during the rehashing. + * + */ + +#include +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +MALLOC_DEFINE(M_IPFW_LUT, "ipfw_lookup", "IpFw lookup"); +#define Malloc(n) malloc(n, M_IPFW_LUT, M_WAITOK) +#define Calloc(n) calloc(n, M_IPFW_LUT, M_WAITOK | M_ZERO) +#define Free(p) free(p, M_IPFW_LUT) + +#define log(x, arg...) + +#else /* !_KERNEL */ +#include +#include +#include +#include +#define Malloc(n) malloc(n) +#define Calloc(n) calloc(1, n) +#define Free(p) free(p) +#define log(x, arg...) fprintf(stderr, "%s: " x "\n", __FUNCTION__, ##arg) +#endif /* !_KERNEL */ + +struct entry { + uint32_t id; + struct entry *ptr; +}; + +struct lookup_table { + int _size; + int used; + int mask; /* 2^k -1, used for hashing */ + struct entry *f_head, *f_tail; /* freelist */ + struct entry * s; /* slots, array of N entries */ +}; + +static __inline int empty(struct lookup_table *head, const void *p) +{ + const struct entry *ep = p; + return (ep == NULL || + (ep >= head->s && ep < &head->s[head->_size])); +} + +/* + * init or reinit a table + */ +struct lookup_table * +ipfw_lut_init(struct lookup_table *head, int new_size, int mask) +{ + int i; + struct entry *s; /* the new slots */ + struct entry *fh, *ft; /* the freelist */ + + if (head != NULL) { + mask = head->mask; + if (new_size <= head->_size) + return head; + if (new_size >= mask+1) { + log("size larger than mask"); + return NULL; + } + } else { + log("old is null, initialize"); + head = Calloc(sizeof(*head)); + if (head == NULL) + return NULL; + if 
(new_size >= mask) + mask = new_size; + if (mask & (mask -1)) { + for (i = 1; i < mask; i += i) + ; + log("mask %d not 2^k, round up to %d", mask, i); + mask = i; + } + mask = head->mask = mask - 1; + } + + s = Calloc(new_size * sizeof(*s)); + if (s == NULL) + return NULL; + if (!head->s) { + head->s = s; + head->_size = 1; + } + fh = ft = NULL; + /* remap the entries, adjust the freelist */ + for (i = 0; i < new_size; i++) { + s[i].id = (i >= head->_size) ? i : head->s[i].id; + if (i < head->_size && !empty(head, head->s[i].ptr)) { + s[i].ptr = head->s[i].ptr; + continue; + } + if (fh == NULL) + fh = &s[i]; + else + ft->ptr = &s[i]; + ft = &s[i]; + } + head->f_head = fh; + head->f_tail = ft; + + /* write lock on the structure, to protect the readers */ + fh = head->s; + head->s = s; + head->_size = new_size; + /* release write lock */ + if (fh != s) + Free(fh); + log("done"); + return head; +} + +/* insert returns the id */ +int +ipfw_lut_insert(struct lookup_table *head, void *d) +{ + struct entry *e; + + e = head->f_head; + if (e == NULL) + return -1; + head->f_head = e->ptr; + e->ptr = d; + head->used++; + return e->id; +} + +/* delete, returns the original entry */ +void * +ipfw_lut_delete(struct lookup_table *head, int id) +{ + int i = id & head->mask; + void *result; + struct entry *e; + + if (i >= head->_size) + return NULL; + e = &head->s[i]; + if (e->id != id) + return NULL; + result = e->ptr; + /* write lock to invalidate the entry to readers */ + e->id += head->mask + 1; /* prepare for next insert */ + e->ptr = NULL; + /* release write lock */ + if (head->f_head == NULL) + head->f_head = e; + else + head->f_tail->ptr = e; + head->f_tail = e; + head->used--; + return result; +} + +void * +ipfw_lut_lookup(struct lookup_table *head, int id) +{ + int i = id & head->mask; + struct entry *e; + + if (i >= head->_size) + return NULL; + e = &head->s[i]; + return (e->id == id) ? 
e->ptr : NULL; +} + +void +ipfw_lut_dump(struct lookup_table *head) +{ + int i; + + log("head %p size %d used %d freelist %d", + head, head->_size, head->used, head->f_head ? + head->f_head - head->s : -1); + for (i = 0; i < head->_size; i++) { + struct entry *e = &head->s[i]; + char ee = empty(head, e->ptr) ? 'E' : ' '; + log("%5d %5d %c %p", i, e->id, ee, + ee == 'E' && e->ptr != NULL ? + (void *)((struct entry *)e->ptr - head->s) : e->ptr); + } +} + +#ifndef _KERNEL +void dump_p(struct lookup_table *p, int *map) +{ + int i; + for (i = 0; i < p->_size; i++) { + int id = (int)ipfw_lut_lookup(p, map[i]); + log("%3d: %3d: %c", map[i] % 64, i, id); + } +} +int main(int argc, char *argv[]) +{ + int i, j, l; +#define S 1000 + int map[S]; + struct lookup_table *p; + struct lookup_table *p1; + const char *m = "nel mezzo del cammin di nostra vita mi ritrovai" + " in una selva oscura e la diritta via era smarrita!"; + + fprintf(stderr, "testing lookup\n"); + + l = strlen(m); + + p = ipfw_lut_init(NULL, 120, 33); + + ipfw_lut_dump(p); + for (i = 0; i < l; i++) { + int x = m[i]; + int id = ipfw_lut_insert(p, (void *)x); + //ipfw_lut_dump(p); + map[i] = id; + for (j=0; j < 10; j++) { + id = ipfw_lut_insert(p, (void *)'a'); + // ipfw_lut_dump(p); + ipfw_lut_delete(p, id); + // ipfw_lut_dump(p); + } + // ipfw_lut_dump(p); + } + dump_p(p, map); + p1 = ipfw_lut_init(p, 23, 0); + if (!p1) + return 1; + dump_p(p1, map); + p1 = ipfw_lut_init(p1, 120, 0); + if (!p1) + return 1; + dump_p(p1, map); + return 0; +} +#endif +/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_nat.c b/sys/netinet/ipfw/ip_fw_nat.c new file mode 100644 index 0000000..d093924 --- /dev/null +++ b/sys/netinet/ipfw/ip_fw_nat.c @@ -0,0 +1,605 @@ +/*- + * Copyright (c) 2008 Paolo Pisati + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: user/luigi/ipfw3-head/sys/netinet/ipfw/ip_fw_nat.c 200975 2009-12-25 01:15:39Z luigi $"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* XXX for in_cksum */ + +static VNET_DEFINE(eventhandler_tag, ifaddr_event_tag); +#define V_ifaddr_event_tag VNET(ifaddr_event_tag) + +static void +ifaddr_change(void *arg, struct ifnet *ifp) +{ + struct cfg_nat *ptr; + struct ifaddr *ifa; + struct ip_fw_chain *chain; + + (void)arg; + chain = &V_layer3_chain; + IPFW_WLOCK(chain); + /* Check every nat entry... 
*/ + LIST_FOREACH(ptr, &chain->nat, _next) { + /* ...using nic 'ifp->if_xname' as dynamic alias address. */ + if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) != 0) + continue; + if_addr_rlock(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr == NULL) + continue; + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + ptr->ip = ((struct sockaddr_in *) + (ifa->ifa_addr))->sin_addr; + LibAliasSetAddress(ptr->lib, ptr->ip); + } + if_addr_runlock(ifp); + } + IPFW_WUNLOCK(chain); +} + +/* + * delete the pointers for nat entry ix, or all of them if ix < 0 + */ +static void +flush_nat_ptrs(struct ip_fw_chain *chain, const int ix) +{ + int i; + ipfw_insn_nat *cmd; + + IPFW_WLOCK_ASSERT(chain); + for (i = 0; i < chain->n_rules; i++) { + cmd = (ipfw_insn_nat *)ACTION_PTR(chain->map[i]); + /* XXX skip log and the like ? */ + if (cmd->o.opcode == O_NAT && cmd->nat != NULL && + (ix < 0 || cmd->nat->id == ix)) + cmd->nat = NULL; + } +} + +static void +del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head) +{ + struct cfg_redir *r, *tmp_r; + struct cfg_spool *s, *tmp_s; + int i, num; + + LIST_FOREACH_SAFE(r, head, _next, tmp_r) { + num = 1; /* Number of alias_link to delete. */ + switch (r->mode) { + case REDIR_PORT: + num = r->pport_cnt; + /* FALLTHROUGH */ + case REDIR_ADDR: + case REDIR_PROTO: + /* Delete all libalias redirect entry. */ + for (i = 0; i < num; i++) + LibAliasRedirectDelete(n->lib, r->alink[i]); + /* Del spool cfg if any. */ + LIST_FOREACH_SAFE(s, &r->spool_chain, _next, tmp_s) { + LIST_REMOVE(s, _next); + free(s, M_IPFW); + } + free(r->alink, M_IPFW); + LIST_REMOVE(r, _next); + free(r, M_IPFW); + break; + default: + printf("unknown redirect mode: %u\n", r->mode); + /* XXX - panic?!?!? 
*/ + break; + } + } +} + +static int +add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) +{ + struct cfg_redir *r, *ser_r; + struct cfg_spool *s, *ser_s; + int cnt, off, i; + + for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) { + ser_r = (struct cfg_redir *)&buf[off]; + r = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO); + memcpy(r, ser_r, SOF_REDIR); + LIST_INIT(&r->spool_chain); + off += SOF_REDIR; + r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt, + M_IPFW, M_WAITOK | M_ZERO); + switch (r->mode) { + case REDIR_ADDR: + r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr, + r->paddr); + break; + case REDIR_PORT: + for (i = 0 ; i < r->pport_cnt; i++) { + /* If remotePort is all ports, set it to 0. */ + u_short remotePortCopy = r->rport + i; + if (r->rport_cnt == 1 && r->rport == 0) + remotePortCopy = 0; + r->alink[i] = LibAliasRedirectPort(ptr->lib, + r->laddr, htons(r->lport + i), r->raddr, + htons(remotePortCopy), r->paddr, + htons(r->pport + i), r->proto); + if (r->alink[i] == NULL) { + r->alink[0] = NULL; + break; + } + } + break; + case REDIR_PROTO: + r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr, + r->raddr, r->paddr, r->proto); + break; + default: + printf("unknown redirect mode: %u\n", r->mode); + break; + } + /* XXX perhaps return an error instead of panic ? */ + if (r->alink[0] == NULL) + panic("LibAliasRedirect* returned NULL"); + /* LSNAT handling. */ + for (i = 0; i < r->spool_cnt; i++) { + ser_s = (struct cfg_spool *)&buf[off]; + s = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO); + memcpy(s, ser_s, SOF_SPOOL); + LibAliasAddServer(ptr->lib, r->alink[0], + s->addr, htons(s->port)); + off += SOF_SPOOL; + /* Hook spool entry. */ + LIST_INSERT_HEAD(&r->spool_chain, s, _next); + } + /* And finally hook this redir entry. 
*/ + LIST_INSERT_HEAD(&ptr->redir_chain, r, _next); + } + return (1); +} + +static int +ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) +{ + struct mbuf *mcl; + struct ip *ip; + /* XXX - libalias duct tape */ + int ldt, retval; + char *c; + + ldt = 0; + retval = 0; + mcl = m_megapullup(m, m->m_pkthdr.len); + if (mcl == NULL) { + args->m = NULL; + return (IP_FW_DENY); + } + ip = mtod(mcl, struct ip *); + + /* + * XXX - Libalias checksum offload 'duct tape': + * + * locally generated packets have only pseudo-header checksum + * calculated and libalias will break it[1], so mark them for + * later fix. Moreover there are cases when libalias modifies + * tcp packet data[2], mark them for later fix too. + * + * [1] libalias was never meant to run in kernel, so it does + * not have any knowledge about checksum offloading, and + * expects a packet with a full internet checksum. + * Unfortunately, packets generated locally will have just the + * pseudo header calculated, and when libalias tries to adjust + * the checksum it will actually compute a wrong value. + * + * [2] when libalias modifies tcp's data content, full TCP + * checksum has to be recomputed: the problem is that + * libalias does not have any idea about checksum offloading. + * To work around this, we do not do checksumming in LibAlias, + * but only mark the packets in th_x2 field. If we receive a + * marked packet, we calculate correct checksum for it + * aware of offloading. Why such a terrible hack instead of + * recalculating checksum for each packet? + * Because the previous checksum was not checked! + * Recalculating checksums for EVERY packet will hide ALL + * transmission errors. Yes, marked packets still suffer from + * this problem. But, sigh, natd(8) has this problem, too. 
+ * + * TODO: -make libalias mbuf aware (so + * it can handle delayed checksum and tso) + */ + + if (mcl->m_pkthdr.rcvif == NULL && + mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) + ldt = 1; + + c = mtod(mcl, char *); + if (args->oif == NULL) + retval = LibAliasIn(t->lib, c, + mcl->m_len + M_TRAILINGSPACE(mcl)); + else + retval = LibAliasOut(t->lib, c, + mcl->m_len + M_TRAILINGSPACE(mcl)); + if (retval == PKT_ALIAS_RESPOND) { + m->m_flags |= M_SKIP_FIREWALL; + retval = PKT_ALIAS_OK; + } + if (retval != PKT_ALIAS_OK && + retval != PKT_ALIAS_FOUND_HEADER_FRAGMENT) { + /* XXX - should i add some logging? */ + m_free(mcl); + args->m = NULL; + return (IP_FW_DENY); + } + mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len); + + /* + * XXX - libalias checksum offload + * 'duct tape' (see above) + */ + + if ((ip->ip_off & htons(IP_OFFMASK)) == 0 && + ip->ip_p == IPPROTO_TCP) { + struct tcphdr *th; + + th = (struct tcphdr *)(ip + 1); + if (th->th_x2) + ldt = 1; + } + + if (ldt) { + struct tcphdr *th; + struct udphdr *uh; + u_short cksum; + + ip->ip_len = ntohs(ip->ip_len); + cksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(ip->ip_p + ip->ip_len - (ip->ip_hl << 2))); + + switch (ip->ip_p) { + case IPPROTO_TCP: + th = (struct tcphdr *)(ip + 1); + /* + * Maybe it was set in + * libalias... 
+ */ + th->th_x2 = 0; + th->th_sum = cksum; + mcl->m_pkthdr.csum_data = + offsetof(struct tcphdr, th_sum); + break; + case IPPROTO_UDP: + uh = (struct udphdr *)(ip + 1); + uh->uh_sum = cksum; + mcl->m_pkthdr.csum_data = + offsetof(struct udphdr, uh_sum); + break; + } + /* No hw checksum offloading: do it ourselves */ + if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) { + in_delayed_cksum(mcl); + mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } + ip->ip_len = htons(ip->ip_len); + } + args->m = mcl; + return (IP_FW_NAT); +} + +static struct cfg_nat * +lookup_nat(struct nat_list *l, int nat_id) +{ + struct cfg_nat *res; + + LIST_FOREACH(res, l, _next) { + if (res->id == nat_id) + break; + } + return res; +} + +static int +ipfw_nat_cfg(struct sockopt *sopt) +{ + struct cfg_nat *ptr, *ser_n; + char *buf; + struct ip_fw_chain *chain = &V_layer3_chain; + + buf = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO); + sooptcopyin(sopt, buf, NAT_BUF_LEN, sizeof(struct cfg_nat)); + ser_n = (struct cfg_nat *)buf; + + /* check valid parameter ser_n->id > 0 ? */ + /* + * Find/create nat rule. + */ + IPFW_WLOCK(chain); + ptr = lookup_nat(&chain->nat, ser_n->id); + if (ptr == NULL) { + /* New rule: allocate and init new instance. */ + ptr = malloc(sizeof(struct cfg_nat), + M_IPFW, M_NOWAIT | M_ZERO); + if (ptr == NULL) { + IPFW_WUNLOCK(chain); + free(buf, M_IPFW); + return (ENOSPC); + } + ptr->lib = LibAliasInit(NULL); + if (ptr->lib == NULL) { + IPFW_WUNLOCK(chain); + free(ptr, M_IPFW); + free(buf, M_IPFW); + return (EINVAL); + } + LIST_INIT(&ptr->redir_chain); + } else { + /* Entry already present: temporarly unhook it. */ + LIST_REMOVE(ptr, _next); + flush_nat_ptrs(chain, ser_n->id); + } + IPFW_WUNLOCK(chain); + + /* + * Basic nat configuration. + */ + ptr->id = ser_n->id; + /* + * XXX - what if this rule doesn't nat any ip and just + * redirect? + * do we set aliasaddress to 0.0.0.0? 
+ */ + ptr->ip = ser_n->ip; + ptr->redir_cnt = ser_n->redir_cnt; + ptr->mode = ser_n->mode; + LibAliasSetMode(ptr->lib, ser_n->mode, ser_n->mode); + LibAliasSetAddress(ptr->lib, ptr->ip); + memcpy(ptr->if_name, ser_n->if_name, IF_NAMESIZE); + + /* + * Redir and LSNAT configuration. + */ + /* Delete old cfgs. */ + del_redir_spool_cfg(ptr, &ptr->redir_chain); + /* Add new entries. */ + add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr); + free(buf, M_IPFW); + IPFW_WLOCK(chain); + LIST_INSERT_HEAD(&chain->nat, ptr, _next); + IPFW_WUNLOCK(chain); + return (0); +} + +static int +ipfw_nat_del(struct sockopt *sopt) +{ + struct cfg_nat *ptr; + struct ip_fw_chain *chain = &V_layer3_chain; + int i; + + sooptcopyin(sopt, &i, sizeof i, sizeof i); + /* XXX validate i */ + IPFW_WLOCK(chain); + ptr = lookup_nat(&chain->nat, i); + if (ptr == NULL) { + IPFW_WUNLOCK(chain); + return (EINVAL); + } + LIST_REMOVE(ptr, _next); + flush_nat_ptrs(chain, i); + IPFW_WUNLOCK(chain); + del_redir_spool_cfg(ptr, &ptr->redir_chain); + LibAliasUninit(ptr->lib); + free(ptr, M_IPFW); + return (0); +} + +static int +ipfw_nat_get_cfg(struct sockopt *sopt) +{ + uint8_t *data; + struct cfg_nat *n; + struct cfg_redir *r; + struct cfg_spool *s; + int nat_cnt, off; + struct ip_fw_chain *chain; + int err = ENOSPC; + + chain = &V_layer3_chain; + nat_cnt = 0; + off = sizeof(nat_cnt); + + data = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO); + IPFW_RLOCK(chain); + /* Serialize all the data. 
*/ + LIST_FOREACH(n, &chain->nat, _next) { + nat_cnt++; + if (off + SOF_NAT >= NAT_BUF_LEN) + goto nospace; + bcopy(n, &data[off], SOF_NAT); + off += SOF_NAT; + LIST_FOREACH(r, &n->redir_chain, _next) { + if (off + SOF_REDIR >= NAT_BUF_LEN) + goto nospace; + bcopy(r, &data[off], SOF_REDIR); + off += SOF_REDIR; + LIST_FOREACH(s, &r->spool_chain, _next) { + if (off + SOF_SPOOL >= NAT_BUF_LEN) + goto nospace; + bcopy(s, &data[off], SOF_SPOOL); + off += SOF_SPOOL; + } + } + } + err = 0; /* all good */ +nospace: + IPFW_RUNLOCK(chain); + if (err == 0) { + bcopy(&nat_cnt, data, sizeof(nat_cnt)); + sooptcopyout(sopt, data, NAT_BUF_LEN); + } else { + printf("serialized data buffer not big enough:" + "please increase NAT_BUF_LEN\n"); + } + free(data, M_IPFW); + return (err); +} + +static int +ipfw_nat_get_log(struct sockopt *sopt) +{ + uint8_t *data; + struct cfg_nat *ptr; + int i, size; + struct ip_fw_chain *chain; + + chain = &V_layer3_chain; + + IPFW_RLOCK(chain); + /* one pass to count, one to copy the data */ + i = 0; + LIST_FOREACH(ptr, &chain->nat, _next) { + if (ptr->lib->logDesc == NULL) + continue; + i++; + } + size = i * (LIBALIAS_BUF_SIZE + sizeof(int)); + data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO); + if (data == NULL) { + IPFW_RUNLOCK(chain); + return (ENOSPC); + } + i = 0; + LIST_FOREACH(ptr, &chain->nat, _next) { + if (ptr->lib->logDesc == NULL) + continue; + bcopy(&ptr->id, &data[i], sizeof(int)); + i += sizeof(int); + bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE); + i += LIBALIAS_BUF_SIZE; + } + IPFW_RUNLOCK(chain); + sooptcopyout(sopt, data, size); + free(data, M_IPFW); + return(0); +} + +static void +ipfw_nat_init(void) +{ + + IPFW_WLOCK(&V_layer3_chain); + /* init ipfw hooks */ + ipfw_nat_ptr = ipfw_nat; + lookup_nat_ptr = lookup_nat; + ipfw_nat_cfg_ptr = ipfw_nat_cfg; + ipfw_nat_del_ptr = ipfw_nat_del; + ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg; + ipfw_nat_get_log_ptr = ipfw_nat_get_log; + IPFW_WUNLOCK(&V_layer3_chain); + V_ifaddr_event_tag = 
EVENTHANDLER_REGISTER( + ifaddr_event, ifaddr_change, + NULL, EVENTHANDLER_PRI_ANY); +} + +static void +ipfw_nat_destroy(void) +{ + struct cfg_nat *ptr, *ptr_temp; + struct ip_fw_chain *chain; + + chain = &V_layer3_chain; + IPFW_WLOCK(chain); + LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) { + LIST_REMOVE(ptr, _next); + del_redir_spool_cfg(ptr, &ptr->redir_chain); + LibAliasUninit(ptr->lib); + free(ptr, M_IPFW); + } + EVENTHANDLER_DEREGISTER(ifaddr_event, V_ifaddr_event_tag); + flush_nat_ptrs(chain, -1 /* flush all */); + /* deregister ipfw_nat */ + ipfw_nat_ptr = NULL; + lookup_nat_ptr = NULL; + ipfw_nat_cfg_ptr = NULL; + ipfw_nat_del_ptr = NULL; + ipfw_nat_get_cfg_ptr = NULL; + ipfw_nat_get_log_ptr = NULL; + IPFW_WUNLOCK(chain); +} + +static int +ipfw_nat_modevent(module_t mod, int type, void *unused) +{ + int err = 0; + + switch (type) { + case MOD_LOAD: + ipfw_nat_init(); + break; + + case MOD_UNLOAD: + ipfw_nat_destroy(); + break; + + default: + return EOPNOTSUPP; + break; + } + return err; +} + +static moduledata_t ipfw_nat_mod = { + "ipfw_nat", + ipfw_nat_modevent, + 0 +}; + +DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); +MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1); +MODULE_DEPEND(ipfw_nat, ipfw, 2, 2, 2); +MODULE_VERSION(ipfw_nat, 1); +/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_pfil.c b/sys/netinet/ipfw/ip_fw_pfil.c new file mode 100644 index 0000000..a125ef2 --- /dev/null +++ b/sys/netinet/ipfw/ip_fw_pfil.c @@ -0,0 +1,415 @@ +/*- + * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_pfil.c 200601 2009-12-16 10:48:40Z luigi $"); + +#if !defined(KLD_MODULE) +#include "opt_ipfw.h" +#include "opt_ipdn.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif /* KLD_MODULE */ +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +static VNET_DEFINE(int, fw_enable) = 1; +#define V_fw_enable VNET(fw_enable) + +#ifdef INET6 +static VNET_DEFINE(int, fw6_enable) = 1; +#define V_fw6_enable VNET(fw6_enable) +#endif + +int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); + +/* Forward declarations. 
*/ +static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int); + +#ifdef SYSCTL_NODE + +SYSBEGIN(f1) + +SYSCTL_DECL(_net_inet_ip_fw); +SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0, + ipfw_chg_hook, "I", "Enable ipfw"); +#ifdef INET6 +SYSCTL_DECL(_net_inet6_ip6_fw); +SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0, + ipfw_chg_hook, "I", "Enable ipfw+6"); +#endif /* INET6 */ + +SYSEND + +#endif /* SYSCTL_NODE */ + +/* + * The pfilter hook to pass packets to ipfw_chk and then to + * dummynet, divert, netgraph or other modules. + * The packet may be consumed. + */ +int +ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, + struct inpcb *inp) +{ + struct ip_fw_args args; + struct m_tag *tag; + int ipfw; + int ret; + + /* all the processing now uses ip_len in net format */ + if (mtod(*m0, struct ip *)->ip_v == 4) + SET_NET_IPLEN(mtod(*m0, struct ip *)); + + /* convert dir to IPFW values */ + dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT; + bzero(&args, sizeof(args)); + +again: + /* + * extract and remove the tag if present. If we are left + * with onepass, optimize the outgoing path. + */ + tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); + if (tag != NULL) { + args.rule = *((struct ipfw_rule_ref *)(tag+1)); + m_tag_delete(*m0, tag); + if (args.rule.info & IPFW_ONEPASS) { + SET_HOST_IPLEN(mtod(*m0, struct ip *)); + return 0; + } + } + + args.m = *m0; + args.oif = dir == DIR_OUT ? 
ifp : NULL; + args.inp = inp; + + ipfw = ipfw_chk(&args); + *m0 = args.m; + + KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL", + __func__)); + + /* breaking out of the switch means drop */ + ret = 0; /* default return value for pass */ + switch (ipfw) { + case IP_FW_PASS: + /* next_hop may be set by ipfw_chk */ + if (args.next_hop == NULL) + break; /* pass */ +#ifndef IPFIREWALL_FORWARD + ret = EACCES; +#else + { + struct m_tag *fwd_tag; + + /* Incoming packets should not be tagged so we do not + * m_tag_find. Outgoing packets may be tagged, so we + * reuse the tag if present. + */ + fwd_tag = (dir == DIR_IN) ? NULL : + m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL); + if (fwd_tag != NULL) { + m_tag_unlink(*m0, fwd_tag); + } else { + fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD, + sizeof(struct sockaddr_in), M_NOWAIT); + if (fwd_tag == NULL) { + ret = EACCES; + break; /* i.e. drop */ + } + } + bcopy(args.next_hop, (fwd_tag+1), sizeof(struct sockaddr_in)); + m_tag_prepend(*m0, fwd_tag); + + if (in_localip(args.next_hop->sin_addr)) + (*m0)->m_flags |= M_FASTFWD_OURS; + } +#endif + break; + + case IP_FW_DENY: + ret = EACCES; + break; /* i.e. drop */ + + case IP_FW_DUMMYNET: + ret = EACCES; + if (ip_dn_io_ptr == NULL) + break; /* i.e. drop */ + if (mtod(*m0, struct ip *)->ip_v == 4) + ret = ip_dn_io_ptr(m0, dir, &args); + else if (mtod(*m0, struct ip *)->ip_v == 6) + ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args); + else + break; /* drop it */ + /* + * XXX should read the return value. + * dummynet normally eats the packet and sets *m0=NULL + * unless the packet can be sent immediately. In this + * case args is updated and we should re-run the + * check without clearing args. + */ + if (*m0 != NULL) + goto again; + break; + + case IP_FW_TEE: + case IP_FW_DIVERT: + if (ip_divert_ptr == NULL) { + ret = EACCES; + break; /* i.e. drop */ + } + ret = ipfw_divert(m0, dir, &args.rule, + (ipfw == IP_FW_TEE) ? 
1 : 0); + /* continue processing for the original packet (tee). */ + if (*m0) + goto again; + break; + + case IP_FW_NGTEE: + case IP_FW_NETGRAPH: + if (ng_ipfw_input_p == NULL) { + ret = EACCES; + break; /* i.e. drop */ + } + ret = ng_ipfw_input_p(m0, dir, &args, + (ipfw == IP_FW_NGTEE) ? 1 : 0); + if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */ + goto again; /* continue with packet */ + break; + + case IP_FW_NAT: + /* honor one-pass in case of successful nat */ + if (V_fw_one_pass) + break; /* ret is already 0 */ + goto again; + + case IP_FW_REASS: + goto again; /* continue with packet */ + + default: + KASSERT(0, ("%s: unknown retval", __func__)); + } + + if (ret != 0) { + if (*m0) + FREE_PKT(*m0); + *m0 = NULL; + } + if (*m0 && mtod(*m0, struct ip *)->ip_v == 4) + SET_HOST_IPLEN(mtod(*m0, struct ip *)); + return ret; +} + +/* do the divert, return 1 on error 0 on success */ +static int +ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule, + int tee) +{ + /* + * ipfw_chk() has already tagged the packet with the divert tag. + * If tee is set, copy packet and return original. + * If not tee, consume packet and send it to divert socket. + */ + struct mbuf *clone; + struct ip *ip; + struct m_tag *tag; + + /* Cloning needed for tee? */ + if (tee == 0) { + clone = *m0; /* use the original mbuf */ + *m0 = NULL; + } else { + clone = m_dup(*m0, M_DONTWAIT); + /* If we cannot duplicate the mbuf, we sacrifice the divert + * chain and continue with the tee-ed packet. + */ + if (clone == NULL) + return 1; + } + + /* + * Divert listeners can normally handle non-fragmented packets, + * but we can only reass in the non-tee case. + * This means that listeners on a tee rule may get fragments, + * and have to live with that. + * Note that we now have the 'reass' ipfw option so if we care + * we can do it before a 'tee'. 
+ */ + ip = mtod(clone, struct ip *); + if (!tee && ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) { + int hlen; + struct mbuf *reass; + + SET_HOST_IPLEN(ip); /* ip_reass wants host order */ + reass = ip_reass(clone); /* Reassemble packet. */ + if (reass == NULL) + return 0; /* not an error */ + /* if reass = NULL then it was consumed by ip_reass */ + /* + * IP header checksum fixup after reassembly and leave header + * in network byte order. + */ + ip = mtod(reass, struct ip *); + hlen = ip->ip_hl << 2; + SET_NET_IPLEN(ip); + ip->ip_sum = 0; + if (hlen == sizeof(struct ip)) + ip->ip_sum = in_cksum_hdr(ip); + else + ip->ip_sum = in_cksum(reass, hlen); + clone = reass; + } + /* attach a tag to the packet with the reinject info */ + tag = m_tag_alloc(MTAG_IPFW_RULE, 0, + sizeof(struct ipfw_rule_ref), M_NOWAIT); + if (tag == NULL) { + FREE_PKT(clone); + return 1; + } + *((struct ipfw_rule_ref *)(tag+1)) = *rule; + m_tag_prepend(clone, tag); + + /* Do the dirty job... */ + ip_divert_ptr(clone, incoming); + return 0; +} + +/* + * attach or detach hooks for a given protocol family + */ +static int +ipfw_hook(int onoff, int pf) +{ + struct pfil_head *pfh; + + pfh = pfil_head_get(PFIL_TYPE_AF, pf); + if (pfh == NULL) + return ENOENT; + + (void) (onoff ? 
pfil_add_hook : pfil_remove_hook) + (ipfw_check_hook, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh); + + return 0; +} + +int +ipfw_attach_hooks(int arg) +{ + int error = 0; + + if (arg == 0) /* detach */ + ipfw_hook(0, AF_INET); + else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) { + error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */ + printf("ipfw_hook() error\n"); + } +#ifdef INET6 + if (arg == 0) /* detach */ + ipfw_hook(0, AF_INET6); + else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) { + error = ENOENT; + printf("ipfw6_hook() error\n"); + } +#endif + return error; +} + +int +ipfw_chg_hook(SYSCTL_HANDLER_ARGS) +{ + int enable; + int oldenable; + int error; + int af; + + if (arg1 == &VNET_NAME(fw_enable)) { + enable = V_fw_enable; + af = AF_INET; + } +#ifdef INET6 + else if (arg1 == &VNET_NAME(fw6_enable)) { + enable = V_fw6_enable; + af = AF_INET6; + } +#endif + else + return (EINVAL); + + oldenable = enable; + + error = sysctl_handle_int(oidp, &enable, 0, req); + + if (error) + return (error); + + enable = (enable) ? 1 : 0; + + if (enable == oldenable) + return (0); + + error = ipfw_hook(enable, af); + if (error) + return (error); + if (af == AF_INET) + V_fw_enable = enable; +#ifdef INET6 + else if (af == AF_INET6) + V_fw6_enable = enable; +#endif + + return (0); +} +/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_private.h b/sys/netinet/ipfw/ip_fw_private.h new file mode 100644 index 0000000..334face --- /dev/null +++ b/sys/netinet/ipfw/ip_fw_private.h @@ -0,0 +1,301 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/netinet/ipfw/ip_fw_private.h 200601 2009-12-16 10:48:40Z luigi $ + */ + +#ifndef _IPFW2_PRIVATE_H +#define _IPFW2_PRIVATE_H + +/* + * Internal constants and data structures used by ipfw components + * and not meant to be exported outside the kernel. + */ + +#ifdef _KERNEL + +/* + * For platforms that do not have SYSCTL support, we wrap the + * SYSCTL_* into a function (one per file) to collect the values + * into an array at module initialization. The wrapping macros, + * SYSBEGIN() and SYSEND, are empty in the default case. 
+ */ +#ifndef SYSBEGIN +#define SYSBEGIN(x) +#endif +#ifndef SYSEND +#define SYSEND +#endif + +/* Return values from ipfw_chk() */ +enum { + IP_FW_PASS = 0, + IP_FW_DENY, + IP_FW_DIVERT, + IP_FW_TEE, + IP_FW_DUMMYNET, + IP_FW_NETGRAPH, + IP_FW_NGTEE, + IP_FW_NAT, + IP_FW_REASS, +}; + +/* + * Structure for collecting parameters to dummynet for ip6_output forwarding + */ +struct _ip6dn_args { + struct ip6_pktopts *opt_or; + struct route_in6 ro_or; + int flags_or; + struct ip6_moptions *im6o_or; + struct ifnet *origifp_or; + struct ifnet *ifp_or; + struct sockaddr_in6 dst_or; + u_long mtu_or; + struct route_in6 ro_pmtu_or; +}; + + +/* + * Arguments for calling ipfw_chk() and dummynet_io(). We put them + * all into a structure because this way it is easier and more + * efficient to pass variables around and extend the interface. + */ +struct ip_fw_args { + struct mbuf *m; /* the mbuf chain */ + struct ifnet *oif; /* output interface */ + struct sockaddr_in *next_hop; /* forward address */ + + /* + * On return, it points to the matching rule. + * On entry, rule.slot > 0 means the info is valid and + * contains the starting rule for an ipfw search. + * If chain_id == chain->id && slot >0 then jump to that slot. + * Otherwise, we locate the first rule >= rulenum:rule_id + */ + struct ipfw_rule_ref rule; /* match/restart info */ + + struct ether_header *eh; /* for bridged packets */ + + struct ipfw_flow_id f_id; /* grabbed from IP header */ + //uint32_t cookie; /* a cookie depending on rule action */ + struct inpcb *inp; + + struct _ip6dn_args dummypar; /* dummynet->ip6_output */ + struct sockaddr_in hopstore; /* store here if cannot use a pointer */ +}; + +MALLOC_DECLARE(M_IPFW); + +/* + * Hooks sometimes need to know the direction of the packet + * (divert, dummynet, netgraph, ...) 
+ * We use a generic definition here, with bits 0-1 indicating the + * direction, bit 2 indicating layer2 or 3, bits 3-4 indicating the + * specific protocol + * (if necessary) + */ +enum { + DIR_MASK = 0x3, + DIR_OUT = 0, + DIR_IN = 1, + DIR_FWD = 2, + DIR_DROP = 3, + PROTO_LAYER2 = 0x4, /* set for layer 2 */ + /* PROTO_DEFAULT = 0, */ + PROTO_IPV4 = 0x08, + PROTO_IPV6 = 0x10, + PROTO_IFB = 0x0c, /* layer2 + ifbridge */ + /* PROTO_OLDBDG = 0x14, unused, old bridge */ +}; + +/* wrapper for freeing a packet, in case we need to do more work */ +#ifndef FREE_PKT +#if defined(__linux__) || defined(_WIN32) +#define FREE_PKT(m) netisr_dispatch(-1, m) +#else +#define FREE_PKT(m) m_freem(m) +#endif +#endif /* !FREE_PKT */ + +/* + * Function declarations. + */ + +/* attach (arg = 1) or detach (arg = 0) hooks */ +int ipfw_attach_hooks(int); +#ifdef NOTYET +void ipfw_nat_destroy(void); +#endif + +/* In ip_fw_log.c */ +struct ip; +void ipfw_log_bpf(int); +void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, + struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, + struct ip *ip); +VNET_DECLARE(u_int64_t, norule_counter); +#define V_norule_counter VNET(norule_counter) +VNET_DECLARE(int, verbose_limit); +#define V_verbose_limit VNET(verbose_limit) + +/* In ip_fw_dynamic.c */ + +enum { /* result for matching dynamic rules */ + MATCH_REVERSE = 0, + MATCH_FORWARD, + MATCH_NONE, + MATCH_UNKNOWN, +}; + +/* + * The lock for dynamic rules is only used once outside the file, + * and only to release the result of lookup_dyn_rule(). + * Eventually we may implement it with a callback on the function. 
+ */ +void ipfw_dyn_unlock(void); + +struct tcphdr; +struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *, + u_int32_t, u_int32_t, int); +int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, + struct ip_fw_args *args, uint32_t tablearg); +ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, + int *match_direction, struct tcphdr *tcp); +void ipfw_remove_dyn_children(struct ip_fw *rule); +void ipfw_get_dynamic(char **bp, const char *ep); + +void ipfw_dyn_attach(void); /* uma_zcreate .... */ +void ipfw_dyn_detach(void); /* uma_zdestroy ... */ +void ipfw_dyn_init(void); /* per-vnet initialization */ +void ipfw_dyn_uninit(int); /* per-vnet deinitialization */ +int ipfw_dyn_len(void); + +/* common variables */ +VNET_DECLARE(int, fw_one_pass); +#define V_fw_one_pass VNET(fw_one_pass) + +VNET_DECLARE(int, fw_verbose); +#define V_fw_verbose VNET(fw_verbose) + +VNET_DECLARE(struct ip_fw_chain, layer3_chain); +#define V_layer3_chain VNET(layer3_chain) + +VNET_DECLARE(u_int32_t, set_disable); +#define V_set_disable VNET(set_disable) + +VNET_DECLARE(int, autoinc_step); +#define V_autoinc_step VNET(autoinc_step) + +struct ip_fw_chain { + struct ip_fw *rules; /* list of rules */ + struct ip_fw *reap; /* list of rules to reap */ + struct ip_fw *default_rule; + int n_rules; /* number of static rules */ + int static_len; /* total len of static rules */ + struct ip_fw **map; /* array of rule ptrs to ease lookup */ + LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */ + struct radix_node_head *tables[IPFW_TABLES_MAX]; +#if defined( __linux__ ) || defined( _WIN32 ) + spinlock_t rwmtx; + spinlock_t uh_lock; +#else + struct rwlock rwmtx; + struct rwlock uh_lock; /* lock for upper half */ +#endif + uint32_t id; /* ruleset id */ +}; + +struct sockopt; /* used by tcp_var.h */ + +/* + * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c + * so the variable and the macros must be here. 
+ */ + +#define IPFW_LOCK_INIT(_chain) do { \ + rw_init(&(_chain)->rwmtx, "IPFW static rules"); \ + rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ + } while (0) + +#define IPFW_LOCK_DESTROY(_chain) do { \ + rw_destroy(&(_chain)->rwmtx); \ + rw_destroy(&(_chain)->uh_lock); \ + } while (0) + +#define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED) + +#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx) +#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx) +#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx) +#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx) + +#define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock) +#define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock) +#define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock) +#define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock) + +/* In ip_fw_sockopt.c */ +int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id); +int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule); +int ipfw_ctl(struct sockopt *sopt); +int ipfw_chk(struct ip_fw_args *args); +void ipfw_reap_rules(struct ip_fw *head); + +/* In ip_fw_pfil */ +int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, + struct inpcb *inp); + +/* In ip_fw_table.c */ +struct radix_node; +int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint32_t *val); +int ipfw_init_tables(struct ip_fw_chain *ch); +void ipfw_destroy_tables(struct ip_fw_chain *ch); +int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl); +int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint8_t mlen, uint32_t value); +int ipfw_dump_table_entry(struct radix_node *rn, void *arg); +int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint8_t mlen); +int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt); +int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl); + +/* In ip_fw_nat.c -- XXX to be moved to ip_var.h */ + +extern struct cfg_nat 
*(*lookup_nat_ptr)(struct nat_list *, int); + +typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *); +typedef int ipfw_nat_cfg_t(struct sockopt *); + +extern ipfw_nat_t *ipfw_nat_ptr; +#define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL) + +extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; +extern ipfw_nat_cfg_t *ipfw_nat_del_ptr; +extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; +extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; + +#endif /* _KERNEL */ +#endif /* _IPFW2_PRIVATE_H */ diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c new file mode 100644 index 0000000..6938aca --- /dev/null +++ b/sys/netinet/ipfw/ip_fw_sockopt.c @@ -0,0 +1,1343 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Supported by: Valeria Paoli + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_sockopt.c 206339 2010-04-07 08:23:58Z luigi $"); + +/* + * Sockopt support for ipfw. The routines here implement + * the upper half of the ipfw code. + */ + +#if !defined(KLD_MODULE) +#include "opt_ipfw.h" +#include "opt_ipdivert.h" +#include "opt_ipdn.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif +#include "opt_inet6.h" +#include "opt_ipsec.h" + +#include +#include +#include +#include /* struct m_tag used by nested headers */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include /* hooks */ +#include +#include + +#ifdef MAC +#include +#endif + +MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); + +/* + * static variables followed by global ones (none in this file) + */ + +/* + * Find the smallest rule >= key, id. 
+ * We could use bsearch but it is so simple that we code it directly + */ +int +ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id) +{ + int i, lo, hi; + struct ip_fw *r; + + for (lo = 0, hi = chain->n_rules - 1; lo < hi;) { + i = (lo + hi) / 2; + r = chain->map[i]; + if (r->rulenum < key) + lo = i + 1; /* continue from the next one */ + else if (r->rulenum > key) + hi = i; /* this might be good */ + else if (r->id < id) + lo = i + 1; /* continue from the next one */ + else /* r->id >= id */ + hi = i; /* this might be good */ + }; + return hi; /* first slot >= (key, id), or the last slot if there is none */ +} + +/* + * Allocate a new map with room for n_rules + extra entries; extra is the number + * of entries to add (positive) or delete (negative). If !locked, IPFW_UH_WLOCK is taken here and is held on successful return; on malloc failure NULL is returned and the lock is not taken. + */ +static struct ip_fw ** +get_map(struct ip_fw_chain *chain, int extra, int locked) +{ + + for (;;) { + struct ip_fw **map; + int i; + + i = chain->n_rules + extra; + map = malloc(i * sizeof(struct ip_fw *), M_IPFW, + locked ? M_NOWAIT : M_WAITOK); + if (map == NULL) { + printf("%s: cannot allocate map\n", __FUNCTION__); + return NULL; + } + if (!locked) + IPFW_UH_WLOCK(chain); + if (i >= chain->n_rules + extra) /* good, the map is still large enough */ + return map; + /* otherwise we lost the race, free and retry */ + if (!locked) + IPFW_UH_WUNLOCK(chain); + free(map, M_IPFW); + } +} + +/* + * Swap the maps under IPFW_WLOCK, bump chain->id, record the new length and return the old map. It is supposed to be called with IPFW_UH_WLOCK held. + */ +static struct ip_fw ** +swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len) +{ + struct ip_fw **old_map; + + IPFW_WLOCK(chain); + chain->id++; + chain->n_rules = new_len; + old_map = chain->map; + chain->map = new_map; + IPFW_WUNLOCK(chain); + return old_map; +} + +/* + * Add a new rule to the list. Copy the rule into a malloc'ed area, then + * possibly create a rule number and add the rule to the list. + * Update the rule_number in the input struct so the caller knows it as well. + * XXX DO NOT USE FOR THE DEFAULT RULE. 
+ * Must be called without IPFW_UH held + */ +int +ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule) +{ + struct ip_fw *rule; + int i, l, insert_before; + struct ip_fw **map; /* the new array of pointers */ + + if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1) + return (EINVAL); + + l = RULESIZE(input_rule); + rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO); + if (rule == NULL) + return (ENOSPC); + /* get_map returns with IPFW_UH_WLOCK held if successful */ + map = get_map(chain, 1, 0 /* not locked */); + if (map == NULL) { + free(rule, M_IPFW); + return ENOSPC; + } + + bcopy(input_rule, rule, l); + /* clear fields not settable from userland */ + rule->x_next = NULL; + rule->next_rule = NULL; + rule->pcnt = 0; + rule->bcnt = 0; + rule->timestamp = 0; + + if (V_autoinc_step < 1) + V_autoinc_step = 1; + else if (V_autoinc_step > 1000) + V_autoinc_step = 1000; + /* find the insertion point, we will insert before it: after every rule with the same number, or right before the default rule when rulenum is 0 */ + insert_before = rule->rulenum ? rule->rulenum + 1 : IPFW_DEFAULT_RULE; + i = ipfw_find_rule(chain, insert_before, 0); + /* duplicate first part */ + if (i > 0) + bcopy(chain->map, map, i * sizeof(struct ip_fw *)); + map[i] = rule; + /* duplicate remaining part, we always have the default rule */ + bcopy(chain->map + i, map + i + 1, + sizeof(struct ip_fw *) *(chain->n_rules - i)); + if (rule->rulenum == 0) { + /* auto-number from the preceding rule, then write back the number */ + rule->rulenum = i > 0 ? map[i-1]->rulenum : 0; + if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step) + rule->rulenum += V_autoinc_step; + input_rule->rulenum = rule->rulenum; + } + + rule->id = chain->id + 1; /* the value chain->id gets in swap_map */ + map = swap_map(chain, map, chain->n_rules + 1); /* map is now the old array */ + chain->static_len += l; + IPFW_UH_WUNLOCK(chain); + if (map) + free(map, M_IPFW); + return (0); +} + +/* + * Reclaim storage associated with a list of rules. This is + * typically the list created using remove_rule. + * A NULL pointer on input is handled correctly. 
+ */ +void +ipfw_reap_rules(struct ip_fw *head) +{ + struct ip_fw *rule; + + while ((rule = head) != NULL) { + head = head->x_next; /* advance before freeing rule */ + free(rule, M_IPFW); + } +} + +/* + * Used by del_entry() to check if a rule should be kept. + * Returns 1 if the rule must be kept, 0 otherwise. + * + * Called with cmd = {0,1,5}. + * cmd == 0 matches on rule numbers, excludes rules in RESVD_SET if n == 0 ; + * cmd == 1 matches on set numbers only, rule numbers are ignored; + * cmd == 5 matches on rule and set numbers. + * + * n == 0 is a wildcard for rule numbers, there is no wildcard for sets. + * + * Rules to keep are + * (default || reserved || !match_set || !match_number) + * where + * default ::= (rule->rulenum == IPFW_DEFAULT_RULE) + * // the default rule is always protected + * + * reserved ::= (cmd == 0 && n == 0 && rule->set == RESVD_SET) + * // RESVD_SET is protected only if cmd == 0 and n == 0 ("ipfw flush") + * + * match_set ::= (cmd == 0 || rule->set == set) + * // set number is ignored for cmd == 0 + * + * match_number ::= (cmd == 1 || n == 0 || n == rule->rulenum) + * // number is ignored for cmd == 1 or n == 0 + * + */ +static int +keep_rule(struct ip_fw *rule, uint8_t cmd, uint8_t set, uint32_t n) +{ + return + (rule->rulenum == IPFW_DEFAULT_RULE) || + (cmd == 0 && n == 0 && rule->set == RESVD_SET) || + !(cmd == 0 || rule->set == set) || + !(cmd == 1 || n == 0 || n == rule->rulenum); +} + +/** + * Remove all rules with given number, or do set manipulation. + * Assumes chain != NULL && *chain != NULL. + * + * The argument is a uint32_t. 
The low 16 bits are the rule or set number; + * the next 8 bits are the new set; the top 8 bits indicate the command: + * + * 0 delete rules numbered "rulenum" + * 1 delete rules in set "rulenum" + * 2 move rules "rulenum" to set "new_set" + * 3 move rules from set "rulenum" to set "new_set" + * 4 swap sets "rulenum" and "new_set" + * 5 delete rules numbered "rulenum" in set "new_set" + */ +static int +del_entry(struct ip_fw_chain *chain, uint32_t arg) +{ + struct ip_fw *rule; + uint32_t num; /* rule number or old_set */ + uint8_t cmd, new_set; + int start, end, i, ofs, n; + struct ip_fw **map = NULL; + int error = 0; + + num = arg & 0xffff; + cmd = (arg >> 24) & 0xff; + new_set = (arg >> 16) & 0xff; + + if (cmd > 5 || new_set > RESVD_SET) + return EINVAL; + if (cmd == 0 || cmd == 2 || cmd == 5) { + if (num >= IPFW_DEFAULT_RULE) + return EINVAL; + } else { + if (num > RESVD_SET) /* old_set */ + return EINVAL; + } + + IPFW_UH_WLOCK(chain); /* arbitrate writers */ + chain->reap = NULL; /* prepare for deletions */ + + switch (cmd) { + case 0: /* delete rules "num" (num == 0 matches all) */ + case 1: /* delete all rules in set N */ + case 5: /* delete rules with number N and set "new_set". */ + + /* + * Locate first rule to delete (start), the rule after + * the last one to delete (end), and count how many + * rules to delete (n). Always use keep_rule() to + * determine which rules to keep. + */ + n = 0; + if (cmd == 1) { + /* look for a specific set including RESVD_SET. + * Must scan the entire range, ignore num. 
+ */ + new_set = num; + for (start = -1, end = i = 0; i < chain->n_rules; i++) { + if (keep_rule(chain->map[i], cmd, new_set, 0)) + continue; + if (start < 0) + start = i; + end = i; + n++; + } + end++; /* one past the last match */ + } else { + /* Optimized search on rule numbers */ + start = ipfw_find_rule(chain, num, 0); + for (end = start; end < chain->n_rules; end++) { + rule = chain->map[end]; + if (num > 0 && rule->rulenum != num) + break; + if (!keep_rule(rule, cmd, new_set, num)) + n++; + } + } + + if (n == 0) { + /* A flush request (arg == 0) on empty ruleset + * returns with no error. On the contrary, + * if there is no match on a specific request, + * we return EINVAL. + */ + error = (arg == 0) ? 0 : EINVAL; + break; + } + + /* We have something to delete. Allocate the new map */ + map = get_map(chain, -n, 1 /* locked */); + if (map == NULL) { + error = EINVAL; + break; + } + + /* 1. bcopy the initial part of the map */ + if (start > 0) + bcopy(chain->map, map, start * sizeof(struct ip_fw *)); + /* 2. copy active rules between start and end */ + for (i = ofs = start; i < end; i++) { + rule = chain->map[i]; + if (keep_rule(rule, cmd, new_set, num)) + map[ofs++] = rule; + } + /* 3. copy the final part of the map */ + bcopy(chain->map + end, map + ofs, + (chain->n_rules - end) * sizeof(struct ip_fw *)); + /* 4. swap the maps (swap_map takes IPFW_WLOCK) */ + map = swap_map(chain, map, chain->n_rules - n); + /* 5. map is now the old map: move the rules deleted from it to chain->reap */ + for (i = start; i < end; i++) { + int l; + rule = map[i]; + if (keep_rule(rule, cmd, new_set, num)) + continue; + l = RULESIZE(rule); + chain->static_len -= l; + ipfw_remove_dyn_children(rule); + rule->x_next = chain->reap; + chain->reap = rule; + } + break; + + /* + * In the next 3 cases the loop stops at (n_rules - 1) + * because the default rule is never eligible. 
+ */ + + case 2: /* move rules with given RULE number to new set */ + for (i = 0; i < chain->n_rules - 1; i++) { + rule = chain->map[i]; + if (rule->rulenum == num) + rule->set = new_set; + } + break; + + case 3: /* move rules with given SET number to new set */ + for (i = 0; i < chain->n_rules - 1; i++) { + rule = chain->map[i]; + if (rule->set == num) + rule->set = new_set; + } + break; + + case 4: /* swap two sets */ + for (i = 0; i < chain->n_rules - 1; i++) { + rule = chain->map[i]; + if (rule->set == num) + rule->set = new_set; + else if (rule->set == new_set) + rule->set = num; + } + break; + } + + rule = chain->reap; /* detach the reap list, it is freed below after unlocking */ + chain->reap = NULL; + IPFW_UH_WUNLOCK(chain); + ipfw_reap_rules(rule); + if (map) + free(map, M_IPFW); + return error; +} + +/* + * Clear counters for a specific rule. + * Normally run under IPFW_UH_RLOCK, but these are idempotent ops + * so we only care that rules do not disappear. + */ +static void +clear_counters(struct ip_fw *rule, int log_only) +{ + ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); + + if (log_only == 0) { + rule->bcnt = rule->pcnt = 0; + rule->timestamp = 0; + } + if (l->o.opcode == O_LOG) + l->log_left = l->max_log; +} + +/** + * Reset some or all counters on firewall rules. + * The argument `arg' is an u_int32_t. The low 16 bit are the rule number, + * the next 8 bits are the set number, the top 8 bits are the command: + * 0 work with rules from all set's; + * 1 work with rules only from specified set. + * Specified rule number is zero if we want to clear all entries. + * log_only is 1 if we only want to reset logs, zero otherwise. 
+ */ +static int +zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only) +{ + struct ip_fw *rule; + char *msg; + int i; + + uint16_t rulenum = arg & 0xffff; + uint8_t set = (arg >> 16) & 0xff; + uint8_t cmd = (arg >> 24) & 0xff; + + if (cmd > 1) + return (EINVAL); + if (cmd == 1 && set > RESVD_SET) + return (EINVAL); + + IPFW_UH_RLOCK(chain); + if (rulenum == 0) { + V_norule_counter = 0; + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + /* Skip rules not in our set. */ + if (cmd == 1 && rule->set != set) + continue; + clear_counters(rule, log_only); + } + msg = log_only ? "All logging counts reset" : + "Accounting cleared"; + } else { + int cleared = 0; + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + if (rule->rulenum == rulenum) { + if (cmd == 0 || rule->set == set) + clear_counters(rule, log_only); + cleared = 1; + } + if (rule->rulenum > rulenum) + break; + } + if (!cleared) { /* we did not find any matching rules */ + IPFW_UH_RUNLOCK(chain); + return (EINVAL); + } + msg = log_only ? "logging count reset" : "cleared"; + } + IPFW_UH_RUNLOCK(chain); + + if (V_fw_verbose) { + int lev = LOG_SECURITY | LOG_NOTICE; + + if (rulenum) + log(lev, "ipfw: Entry %d %s.\n", rulenum, msg); + else + log(lev, "ipfw: %s.\n", msg); + } + return (0); +} + +/* + * Check validity of the structure before insert. + * Rules are simple, so this mostly need to check rule sizes. + */ +static int +check_ipfw_struct(struct ip_fw *rule, int size) +{ + int l, cmdlen = 0; + int have_action=0; + ipfw_insn *cmd; + + if (size < sizeof(*rule)) { + printf("ipfw: rule too short\n"); + return (EINVAL); + } + /* first, check for valid size */ + l = RULESIZE(rule); + if (l != size) { + printf("ipfw: size mismatch (have %d want %d)\n", size, l); + return (EINVAL); + } + if (rule->act_ofs >= rule->cmd_len) { + printf("ipfw: bogus action offset (%u > %u)\n", + rule->act_ofs, rule->cmd_len - 1); + return (EINVAL); + } + /* + * Now go for the individual checks. 
Very simple ones, basically only + * instruction sizes. + */ + for (l = rule->cmd_len, cmd = rule->cmd ; + l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + if (cmdlen > l) { + printf("ipfw: opcode %d size truncated\n", + cmd->opcode); + return EINVAL; + } + switch (cmd->opcode) { + case O_PROBE_STATE: + case O_KEEP_STATE: + case O_PROTO: + case O_IP_SRC_ME: + case O_IP_DST_ME: + case O_LAYER2: + case O_IN: + case O_FRAG: + case O_DIVERTED: + case O_IPOPT: + case O_IPTOS: + case O_IPPRECEDENCE: + case O_IPVER: + case O_TCPWIN: + case O_TCPFLAGS: + case O_TCPOPTS: + case O_ESTAB: + case O_VERREVPATH: + case O_VERSRCREACH: + case O_ANTISPOOF: + case O_IPSEC: +#ifdef INET6 + case O_IP6_SRC_ME: + case O_IP6_DST_ME: + case O_EXT_HDR: + case O_IP6: +#endif + case O_IP4: + case O_TAG: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + break; + + case O_FIB: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + if (cmd->arg1 >= rt_numfibs) { + printf("ipfw: invalid fib number %d\n", + cmd->arg1); + return EINVAL; + } + break; + + case O_SETFIB: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + if (cmd->arg1 >= rt_numfibs) { + printf("ipfw: invalid fib number %d\n", + cmd->arg1); + return EINVAL; + } + goto check_action; + + case O_UID: + case O_GID: + case O_JAIL: + case O_IP_SRC: + case O_IP_DST: + case O_TCPSEQ: + case O_TCPACK: + case O_PROB: + case O_ICMPTYPE: + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) + goto bad_size; + break; + + case O_LIMIT: + if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) + goto bad_size; + break; + + case O_LOG: + if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) + goto bad_size; + + ((ipfw_insn_log *)cmd)->log_left = + ((ipfw_insn_log *)cmd)->max_log; + + break; + + case O_IP_SRC_MASK: + case O_IP_DST_MASK: + /* only odd command lengths */ + if ( !(cmdlen & 1) || cmdlen > 31) + goto bad_size; + break; + + case O_IP_SRC_SET: + case O_IP_DST_SET: + if (cmd->arg1 == 0 || cmd->arg1 > 256) { + printf("ipfw: invalid set size 
%d\n", + cmd->arg1); + return EINVAL; + } + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + + (cmd->arg1+31)/32 ) + goto bad_size; + break; + + case O_IP_SRC_LOOKUP: + case O_IP_DST_LOOKUP: + if (cmd->arg1 >= IPFW_TABLES_MAX) { + printf("ipfw: invalid table number %d\n", + cmd->arg1); + return (EINVAL); + } + if (cmdlen != F_INSN_SIZE(ipfw_insn) && + cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 && + cmdlen != F_INSN_SIZE(ipfw_insn_u32)) + goto bad_size; + break; + + case O_MACADDR2: + if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) + goto bad_size; + break; + + case O_NOP: + case O_IPID: + case O_IPTTL: + case O_IPLEN: + case O_TCPDATALEN: + case O_TAGGED: + if (cmdlen < 1 || cmdlen > 31) + goto bad_size; + break; + + case O_MAC_TYPE: + case O_IP_SRCPORT: + case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ + if (cmdlen < 2 || cmdlen > 31) + goto bad_size; + break; + + case O_RECV: + case O_XMIT: + case O_VIA: + if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) + goto bad_size; + break; + + case O_ALTQ: + if (cmdlen != F_INSN_SIZE(ipfw_insn_altq)) + goto bad_size; + break; + + case O_PIPE: + case O_QUEUE: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + goto check_action; + + case O_FORWARD_IP: +#ifdef IPFIREWALL_FORWARD + if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) + goto bad_size; + goto check_action; +#else + return EINVAL; +#endif + + case O_DIVERT: + case O_TEE: + if (ip_divert_ptr == NULL) + return EINVAL; + else + goto check_size; + case O_NETGRAPH: + case O_NGTEE: + if (ng_ipfw_input_p == NULL) + return EINVAL; + else + goto check_size; + case O_NAT: + if (!IPFW_NAT_LOADED) + return EINVAL; + if (cmdlen != F_INSN_SIZE(ipfw_insn_nat)) + goto bad_size; + goto check_action; + case O_FORWARD_MAC: /* XXX not implemented yet */ + case O_CHECK_STATE: + case O_COUNT: + case O_ACCEPT: + case O_DENY: + case O_REJECT: +#ifdef INET6 + case O_UNREACH6: +#endif + case O_SKIPTO: + case O_REASS: +check_size: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; +check_action: 
+ if (have_action) { + printf("ipfw: opcode %d, multiple actions" + " not allowed\n", + cmd->opcode); + return EINVAL; + } + have_action = 1; + if (l != cmdlen) { + printf("ipfw: opcode %d, action must be" + " last opcode\n", + cmd->opcode); + return EINVAL; + } + break; +#ifdef INET6 + case O_IP6_SRC: + case O_IP6_DST: + if (cmdlen != F_INSN_SIZE(struct in6_addr) + + F_INSN_SIZE(ipfw_insn)) + goto bad_size; + break; + + case O_FLOW6ID: + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + + ((ipfw_insn_u32 *)cmd)->o.arg1) + goto bad_size; + break; + + case O_IP6_SRC_MASK: + case O_IP6_DST_MASK: + if ( !(cmdlen & 1) || cmdlen > 127) + goto bad_size; + break; + case O_ICMP6TYPE: + if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) ) + goto bad_size; + break; +#endif + + default: + switch (cmd->opcode) { +#ifndef INET6 + case O_IP6_SRC_ME: + case O_IP6_DST_ME: + case O_EXT_HDR: + case O_IP6: + case O_UNREACH6: + case O_IP6_SRC: + case O_IP6_DST: + case O_FLOW6ID: + case O_IP6_SRC_MASK: + case O_IP6_DST_MASK: + case O_ICMP6TYPE: + printf("ipfw: no IPv6 support in kernel\n"); + return EPROTONOSUPPORT; +#endif + default: + printf("ipfw: opcode %d, unknown opcode\n", + cmd->opcode); + return EINVAL; + } + } + } + if (have_action == 0) { + printf("ipfw: missing action\n"); + return EINVAL; + } + return 0; + +bad_size: + printf("ipfw: opcode %d size %d wrong\n", + cmd->opcode, cmdlen); + return EINVAL; +} + + +/* + * Translation of requests for compatibility with FreeBSD 7.2/8. + * a static variable tells us if we have an old client from userland, + * and if necessary we translate requests and responses between the + * two formats. 
+ */ +static int is7 = 0; + +struct ip_fw7 { + struct ip_fw7 *next; /* linked list of rules */ + struct ip_fw7 *next_rule; /* ptr to next [skipto] rule */ + /* 'next_rule' is used to pass up 'set_disable' status */ + + uint16_t act_ofs; /* offset of action in 32-bit units */ + uint16_t cmd_len; /* # of 32-bit words in cmd */ + uint16_t rulenum; /* rule number */ + uint8_t set; /* rule set (0..31) */ + // #define RESVD_SET 31 /* set for default and persistent rules */ + uint8_t _pad; /* padding */ + // uint32_t id; /* rule id, only in v.8 */ + /* These fields are present in all rules. */ + uint64_t pcnt; /* Packet counter */ + uint64_t bcnt; /* Byte counter */ + uint32_t timestamp; /* tv_sec of last match */ + + ipfw_insn cmd[1]; /* storage for commands */ +}; + + int convert_rule_to_7(struct ip_fw *rule); +int convert_rule_to_8(struct ip_fw *rule); + +#ifndef RULESIZE7 +#define RULESIZE7(rule) (sizeof(struct ip_fw7) + \ + ((struct ip_fw7 *)(rule))->cmd_len * 4 - 4) +#endif + + +/* + * Copy the static and dynamic rules to the supplied buffer + * and return the amount of space actually used. + * Must be run under IPFW_UH_RLOCK + */ +static size_t +ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) +{ + char *bp = buf; + char *ep = bp + space; + struct ip_fw *rule, *dst; + int l, i; + time_t boot_seconds; + + boot_seconds = boottime.tv_sec; + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + + if (is7) { + /* Convert rule to FreeBSd 7.2 format */ + l = RULESIZE7(rule); + if (bp + l + sizeof(uint32_t) <= ep) { + int error; + bcopy(rule, bp, l + sizeof(uint32_t)); + error = convert_rule_to_7((struct ip_fw *) bp); + if (error) + return 0; /*XXX correct? */ + /* + * XXX HACK. Store the disable mask in the "next" + * pointer in a wild attempt to keep the ABI the same. + * Why do we do this on EVERY rule? 
+ */ + bcopy(&V_set_disable, + &(((struct ip_fw7 *)bp)->next_rule), + sizeof(V_set_disable)); + if (((struct ip_fw7 *)bp)->timestamp) + ((struct ip_fw7 *)bp)->timestamp += boot_seconds; + bp += l; + } + continue; /* go to next rule */ + } + + /* normal mode, don't touch rules */ + l = RULESIZE(rule); + if (bp + l > ep) { /* should not happen */ + printf("overflow dumping static rules\n"); + break; + } + dst = (struct ip_fw *)bp; + bcopy(rule, dst, l); + /* + * XXX HACK. Store the disable mask in the "next" + * pointer in a wild attempt to keep the ABI the same. + * Why do we do this on EVERY rule? + */ + bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable)); + if (dst->timestamp) + dst->timestamp += boot_seconds; + bp += l; + } + ipfw_get_dynamic(&bp, ep); /* protected by the dynamic lock */ + return (bp - (char *)buf); +} + + +/** + * {set|get}sockopt parser. + */ +int +ipfw_ctl(struct sockopt *sopt) +{ +#define RULE_MAXSIZE (256*sizeof(u_int32_t)) + int error; + size_t size; + struct ip_fw *buf, *rule; + struct ip_fw_chain *chain; + u_int32_t rulenum[2]; + + error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW); + if (error) + return (error); + + /* + * Disallow modifications in really-really secure mode, but still allow + * the logging counters to be reset. + */ + if (sopt->sopt_name == IP_FW_ADD || + (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) { + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); + if (error) + return (error); + } + + chain = &V_layer3_chain; + error = 0; + + switch (sopt->sopt_name) { + case IP_FW_GET: + /* + * pass up a copy of the current rules. Static rules + * come first (the last of which has number IPFW_DEFAULT_RULE), + * followed by a possibly empty list of dynamic rule. + * The last dynamic rule has NULL in the "next" field. + * + * Note that the calculated size is used to bound the + * amount of data returned to the user. 
The rule set may + * change between calculating the size and returning the + * data in which case we'll just return what fits. + */ + for (;;) { + int len = 0, want; + + size = chain->static_len; + size += ipfw_dyn_len(); + if (size >= sopt->sopt_valsize) + break; + buf = malloc(size, M_TEMP, M_WAITOK); + if (buf == NULL) + break; + IPFW_UH_RLOCK(chain); + /* check again how much space we need */ + want = chain->static_len + ipfw_dyn_len(); + if (size >= want) + len = ipfw_getrules(chain, buf, size); + IPFW_UH_RUNLOCK(chain); + if (size >= want) + error = sooptcopyout(sopt, buf, len); + free(buf, M_TEMP); + if (size >= want) + break; + } + break; + + case IP_FW_FLUSH: + /* locking is done within del_entry() */ + error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */ + break; + + case IP_FW_ADD: + rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, rule, RULE_MAXSIZE, + sizeof(struct ip_fw7) ); + + /* + * If the size of commands equals RULESIZE7 then we assume + * a FreeBSD7.2 binary is talking to us (set is7=1). + * is7 is persistent so the next 'ipfw list' command + * will use this format. + * NOTE: If wrong version is guessed (this can happen if + * the first ipfw command is 'ipfw [pipe] list') + * the ipfw binary may crash or loop infinitly... 
+ */ + if (sopt->sopt_valsize == RULESIZE7(rule)) { + is7 = 1; + error = convert_rule_to_8(rule); + if (error) + return error; + if (error == 0) + error = check_ipfw_struct(rule, RULESIZE(rule)); + } else { + is7 = 0; + if (error == 0) + error = check_ipfw_struct(rule, sopt->sopt_valsize); + } + if (error == 0) { + /* locking is done within ipfw_add_rule() */ + error = ipfw_add_rule(chain, rule); + size = RULESIZE(rule); + if (!error && sopt->sopt_dir == SOPT_GET) { + if (is7) { + error = convert_rule_to_7(rule); + size = RULESIZE7(rule); + if (error) + return error; + } + error = sooptcopyout(sopt, rule, size); + } + } + free(rule, M_TEMP); + break; + + case IP_FW_DEL: + /* + * IP_FW_DEL is used for deleting single rules or sets, + * and (ab)used to atomically manipulate sets. Argument size + * is used to distinguish between the two: + * sizeof(u_int32_t) + * delete single rule or set of rules, + * or reassign rules (or sets) to a different set. + * 2*sizeof(u_int32_t) + * atomic disable/enable sets. + * first u_int32_t contains sets to be disabled, + * second u_int32_t contains sets to be enabled. 
+ */ + error = sooptcopyin(sopt, rulenum, + 2*sizeof(u_int32_t), sizeof(u_int32_t)); + if (error) + break; + size = sopt->sopt_valsize; + if (size == sizeof(u_int32_t) && rulenum[0] != 0) { + /* delete or reassign, locking done in del_entry() */ + error = del_entry(chain, rulenum[0]); + } else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */ + IPFW_UH_WLOCK(chain); + V_set_disable = + (V_set_disable | rulenum[0]) & ~rulenum[1] & + ~(1<sopt_val != 0) { + error = sooptcopyin(sopt, rulenum, + sizeof(u_int32_t), sizeof(u_int32_t)); + if (error) + break; + } + error = zero_entry(chain, rulenum[0], + sopt->sopt_name == IP_FW_RESETLOG); + break; + + /*--- TABLE manipulations are protected by the IPFW_LOCK ---*/ + case IP_FW_TABLE_ADD: + { + ipfw_table_entry ent; + + error = sooptcopyin(sopt, &ent, + sizeof(ent), sizeof(ent)); + if (error) + break; + error = ipfw_add_table_entry(chain, ent.tbl, + ent.addr, ent.masklen, ent.value); + } + break; + + case IP_FW_TABLE_DEL: + { + ipfw_table_entry ent; + + error = sooptcopyin(sopt, &ent, + sizeof(ent), sizeof(ent)); + if (error) + break; + error = ipfw_del_table_entry(chain, ent.tbl, + ent.addr, ent.masklen); + } + break; + + case IP_FW_TABLE_FLUSH: + { + u_int16_t tbl; + + error = sooptcopyin(sopt, &tbl, + sizeof(tbl), sizeof(tbl)); + if (error) + break; + IPFW_WLOCK(chain); + error = ipfw_flush_table(chain, tbl); + IPFW_WUNLOCK(chain); + } + break; + + case IP_FW_TABLE_GETSIZE: + { + u_int32_t tbl, cnt; + + if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), + sizeof(tbl)))) + break; + IPFW_RLOCK(chain); + error = ipfw_count_table(chain, tbl, &cnt); + IPFW_RUNLOCK(chain); + if (error) + break; + error = sooptcopyout(sopt, &cnt, sizeof(cnt)); + } + break; + + case IP_FW_TABLE_LIST: + { + ipfw_table *tbl; + + if (sopt->sopt_valsize < sizeof(*tbl)) { + error = EINVAL; + break; + } + size = sopt->sopt_valsize; + tbl = malloc(size, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, tbl, size, sizeof(*tbl)); + if (error) { + 
free(tbl, M_TEMP);
+			break;
+		}
+		tbl->size = (size - sizeof(*tbl)) /
+		    sizeof(ipfw_table_entry);
+		IPFW_RLOCK(chain);
+		error = ipfw_dump_table(chain, tbl);
+		IPFW_RUNLOCK(chain);
+		if (error) {
+			free(tbl, M_TEMP);
+			break;
+		}
+		error = sooptcopyout(sopt, tbl, size);
+		free(tbl, M_TEMP);
+	    }
+		break;
+
+	/*--- NAT operations are protected by the IPFW_LOCK ---*/
+	case IP_FW_NAT_CFG:
+		if (IPFW_NAT_LOADED)
+			error = ipfw_nat_cfg_ptr(sopt);
+		else {
+			printf("IP_FW_NAT_CFG: %s\n",
+			    "ipfw_nat not present, please load it");
+			error = EINVAL;
+		}
+		break;
+
+	case IP_FW_NAT_DEL:
+		if (IPFW_NAT_LOADED)
+			error = ipfw_nat_del_ptr(sopt);
+		else {
+			printf("IP_FW_NAT_DEL: %s\n",
+			    "ipfw_nat not present, please load it");
+			error = EINVAL;
+		}
+		break;
+
+	case IP_FW_NAT_GET_CONFIG:
+		if (IPFW_NAT_LOADED)
+			error = ipfw_nat_get_cfg_ptr(sopt);
+		else {
+			printf("IP_FW_NAT_GET_CFG: %s\n",
+			    "ipfw_nat not present, please load it");
+			error = EINVAL;
+		}
+		break;
+
+	case IP_FW_NAT_GET_LOG:
+		if (IPFW_NAT_LOADED)
+			error = ipfw_nat_get_log_ptr(sopt);
+		else {
+			printf("IP_FW_NAT_GET_LOG: %s\n",
+			    "ipfw_nat not present, please load it");
+			error = EINVAL;
+		}
+		break;
+
+	default:
+		printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name);
+		error = EINVAL;
+	}
+
+	return (error);
+#undef RULE_MAXSIZE
+}
+
+
+#define RULE_MAXSIZE	(256*sizeof(u_int32_t))
+
+/* Functions to convert rules 7.2 <==> 8.0 */
+
+/*
+ * Rewrite, in place, a rule laid out as struct ip_fw (version 8) into
+ * the FreeBSD 7.2 layout, struct ip_fw7.
+ *
+ * 'rule' must point to at least RULE_MAXSIZE readable bytes: that much
+ * is copied into a scratch buffer, and the rule is then rebuilt from
+ * the copy. Opcodes above O_NAT are decremented because O_REASS does
+ * not exist in 7.2.
+ *
+ * Returns 0 on success, ENOMEM if the scratch buffer cannot be
+ * allocated, EINVAL if an instruction has zero length or runs past
+ * cmd_len. On EINVAL the header and the instructions preceding the bad
+ * one have already been rewritten.
+ */
+int
+convert_rule_to_7(struct ip_fw *rule)
+{
+	/* Used to modify original rule */
+	struct ip_fw7 *rule7 = (struct ip_fw7 *)rule;
+	/* copy of original rule, version 8 */
+	struct ip_fw *tmp;
+	/* Used to copy commands */
+	ipfw_insn *ccmd, *dst;
+	int ll, ccmdlen;
+	int error = 0;
+
+	tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
+	if (tmp == NULL)
+		return ENOMEM;
+	bcopy(rule, tmp, RULE_MAXSIZE);
+
+	/* Copy fields */
+	rule7->_pad = tmp->_pad;
+	rule7->set = tmp->set;
+	rule7->rulenum = tmp->rulenum;
+	rule7->cmd_len = tmp->cmd_len;
+	rule7->act_ofs = tmp->act_ofs;
+	rule7->next_rule = (struct ip_fw7 *)tmp->next_rule;
+	rule7->next = (struct ip_fw7 *)tmp->x_next;
+	rule7->pcnt = tmp->pcnt;
+	rule7->bcnt = tmp->bcnt;
+	rule7->timestamp = tmp->timestamp;
+
+	/* Copy commands */
+	for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ;
+	    ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
+		ccmdlen = F_LEN(ccmd);
+
+		/*
+		 * Validate before copying: a zero length would never
+		 * advance the loop, an oversized one would copy past
+		 * the end of the instruction list.
+		 */
+		if (ccmdlen <= 0 || ccmdlen > ll) {
+			printf("ipfw: opcode %d size truncated\n",
+			    ccmd->opcode);
+			error = EINVAL;
+			break;
+		}
+
+		bcopy(ccmd, dst, ccmdlen * sizeof(uint32_t));
+
+		if (dst->opcode > O_NAT)
+			/* O_REASS doesn't exist in the 7.2 version, so
+			 * decrement opcode if it is after O_REASS
+			 */
+			dst->opcode--;
+	}
+
+	/* the scratch copy is released on every path, errors included */
+	free(tmp, M_TEMP);
+
+	return error;
+}
+
+/*
+ * Rewrite, in place, a rule laid out as struct ip_fw7 (FreeBSD 7.2)
+ * into the current struct ip_fw layout. The id field, which 7.2 does
+ * not have, is set to 0.
+ *
+ * 'rule' must point to at least RULE_MAXSIZE readable bytes. Opcodes
+ * above O_NAT are incremented to make room for O_REASS.
+ *
+ * ipfw_ctl() calls this on a rule copied from userland before
+ * check_ipfw_struct() has looked at it, so instruction lengths are
+ * checked here before anything is copied.
+ *
+ * Returns 0 on success, ENOMEM if the scratch buffer cannot be
+ * allocated, EINVAL if an instruction has zero length or runs past
+ * cmd_len. On EINVAL the header fields are left unconverted.
+ */
+int
+convert_rule_to_8(struct ip_fw *rule)
+{
+	/* Used to modify original rule */
+	struct ip_fw7 *rule7 = (struct ip_fw7 *) rule;
+	/* Copy of original rule */
+	struct ip_fw7 *tmp;
+	/* Used to copy commands */
+	ipfw_insn *ccmd, *dst;
+	int ll, ccmdlen;
+	int error = 0;
+
+	tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
+	if (tmp == NULL)
+		return ENOMEM;
+
+	bcopy(rule7, tmp, RULE_MAXSIZE);
+
+	for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ;
+	    ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
+		ccmdlen = F_LEN(ccmd);
+
+		/* same checks as in convert_rule_to_7() */
+		if (ccmdlen <= 0 || ccmdlen > ll) {
+			printf("ipfw: opcode %d size truncated\n",
+			    ccmd->opcode);
+			error = EINVAL;
+			goto done;
+		}
+
+		bcopy(ccmd, dst, ccmdlen * sizeof(uint32_t));
+
+		if (dst->opcode > O_NAT)
+			/* O_REASS doesn't exist in the 7.2 version, so
+			 * increment opcode if it is after O_REASS
+			 */
+			dst->opcode++;
+	}
+
+	rule->_pad = tmp->_pad;
+	rule->set = tmp->set;
+	rule->rulenum = tmp->rulenum;
+	rule->cmd_len = tmp->cmd_len;
+	rule->act_ofs = tmp->act_ofs;
+	rule->next_rule = (struct ip_fw *)tmp->next_rule;
+	rule->x_next = (struct ip_fw *)tmp->next;
+	rule->id = 0; /* XXX see if is ok = 0 */
+	rule->pcnt = tmp->pcnt;
+	rule->bcnt = tmp->bcnt;
+	rule->timestamp = tmp->timestamp;
+
+done:
+	free(tmp, M_TEMP);
+	return error;
+}
+
+/* end of file */
diff --git a/sys/netinet/ipfw/ip_fw_table.c b/sys/netinet/ipfw/ip_fw_table.c
new file mode 100644
index 0000000..d8973d5
--- /dev/null
+++ b/sys/netinet/ipfw/ip_fw_table.c
@@ -0,0 +1,286 @@
+/*-
+ * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_table.c 200601 2009-12-16 10:48:40Z luigi $");
+
+/*
+ * Lookup table support for ipfw
+ *
+ * Lookup tables are implemented (at the moment) using the radix
+ * tree used for routing tables.
Tables store key-value entries, where + * keys are network prefixes (addr/masklen), and values are integers. + * As a degenerate case we can interpret keys as 32-bit integers + * (with a /32 mask). + * + * The table is protected by the IPFW lock even for manipulation coming + * from userland, because operations are typically fast. + */ + +#if !defined(KLD_MODULE) +#include "opt_ipfw.h" +#include "opt_ipdivert.h" +#include "opt_ipdn.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif +#include "opt_inet6.h" +#include "opt_ipsec.h" + +#include +#include +#include +#include +#include +#include +#include +#include /* ip_fw.h requires IFNAMSIZ */ +#include +#include +#include + +#include +#include /* struct ipfw_rule_ref */ +#include +#include /* LIST_HEAD */ +#include + +#ifdef MAC +#include +#endif + +MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); + +struct table_entry { + struct radix_node rn[2]; + struct sockaddr_in addr, mask; + u_int32_t value; +}; + +/* + * The radix code expects addr and mask to be array of bytes, + * with the first byte being the length of the array. rn_inithead + * is called with the offset in bits of the lookup key within the + * array. If we use a sockaddr_in as the underlying type, + * sin_len is conveniently located at offset 0, sin_addr is at + * offset 4 and normally aligned. 
+ * But for portability, let's avoid assumption and make the code explicit
+ */
+#define KEY_LEN(v)	(*((uint8_t *)&(v)))
+#define KEY_OFS		(8*offsetof(struct sockaddr_in, sin_addr))
+
+/*
+ * Netmask, in network byte order, for a prefix length of 0..32 bits.
+ * The shift is done on an unsigned value: with a signed 1, mlen == 1
+ * shifts into the sign bit and the following "- 1" overflows.
+ * Callers must reject mlen > 32 before calling.
+ */
+static in_addr_t
+table_netmask(uint8_t mlen)
+{
+	return (htonl(mlen ? ~((1U << (32 - mlen)) - 1) : 0));
+}
+
+/*
+ * Add addr/mlen -> value to table 'tbl'. The address is masked with
+ * the prefix before insertion. Takes IPFW_WLOCK itself.
+ *
+ * Returns 0 on success, EINVAL for a table number or prefix length out
+ * of range, ENOMEM if the entry cannot be allocated, EEXIST if the
+ * radix tree refuses the insertion.
+ */
+int
+ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen, uint32_t value)
+{
+	struct radix_node_head *rnh;
+	struct table_entry *ent;
+	struct radix_node *rn;
+
+	if (tbl >= IPFW_TABLES_MAX)
+		return (EINVAL);
+	/* mlen comes straight from userland; > 32 is not a valid prefix */
+	if (mlen > 32)
+		return (EINVAL);
+	rnh = ch->tables[tbl];
+	ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO);
+	if (ent == NULL)
+		return (ENOMEM);
+	ent->value = value;
+	KEY_LEN(ent->addr) = KEY_LEN(ent->mask) = 8;
+	ent->mask.sin_addr.s_addr = table_netmask(mlen);
+	ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr;
+	IPFW_WLOCK(ch);
+	rn = rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent);
+	if (rn == NULL) {
+		IPFW_WUNLOCK(ch);
+		free(ent, M_IPFW_TBL);
+		return (EEXIST);
+	}
+	IPFW_WUNLOCK(ch);
+	return (0);
+}
+
+/*
+ * Remove addr/mlen from table 'tbl' and free the entry.
+ * Takes IPFW_WLOCK itself.
+ *
+ * Returns 0 on success, EINVAL for a table number or prefix length out
+ * of range, ESRCH if no such entry exists.
+ */
+int
+ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen)
+{
+	struct radix_node_head *rnh;
+	struct table_entry *ent;
+	struct sockaddr_in sa, mask;
+
+	if (tbl >= IPFW_TABLES_MAX)
+		return (EINVAL);
+	if (mlen > 32)
+		return (EINVAL);
+	rnh = ch->tables[tbl];
+	KEY_LEN(sa) = KEY_LEN(mask) = 8;
+	mask.sin_addr.s_addr = table_netmask(mlen);
+	sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
+	IPFW_WLOCK(ch);
+	ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh);
+	if (ent == NULL) {
+		IPFW_WUNLOCK(ch);
+		return (ESRCH);
+	}
+	IPFW_WUNLOCK(ch);
+	free(ent, M_IPFW_TBL);
+	return (0);
+}
+
+/* rnh_walktree callback: unlink one node from the tree and free it. */
+static int
+flush_table_entry(struct radix_node *rn, void *arg)
+{
+	struct radix_node_head * const rnh = arg;
+	struct table_entry *ent;
+
+	ent = (struct table_entry *)
+	    rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
+	if (ent != NULL)
+		free(ent, M_IPFW_TBL);
+	return (0);
+}
+
+/*
+ * Remove and free every entry of table 'tbl'.
+ * The caller must hold IPFW_WLOCK.
+ * Returns 0, or EINVAL if the table number is out of range.
+ */
+int
+ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl)
+{
+	struct radix_node_head *rnh;
+
+	IPFW_WLOCK_ASSERT(ch);
+
+	if (tbl >= IPFW_TABLES_MAX)
+		return (EINVAL);
+	rnh = ch->tables[tbl];
+	KASSERT(rnh != NULL, ("NULL IPFW table"));
+	rnh->rnh_walktree(rnh, flush_table_entry, rnh);
+	return (0);
+}
+
+/*
+ * Flush every table and release its radix head.
+ * The caller must hold IPFW_WLOCK.
+ */
+void
+ipfw_destroy_tables(struct ip_fw_chain *ch)
+{
+	uint16_t tbl;
+
+	IPFW_WLOCK_ASSERT(ch);
+
+	for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++) {
+		ipfw_flush_table(ch, tbl);
+		/*
+		 * rn_detachhead() takes the address of the head pointer;
+		 * hand it the table slot itself rather than a local copy,
+		 * so the chain is not left holding a stale pointer.
+		 */
+		rn_detachhead((void **)&ch->tables[tbl]);
+	}
+}
+
+/*
+ * Create the radix head for every table.
+ * Returns 0 on success, ENOMEM if a head cannot be created.
+ *
+ * NOTE(review): the failure path flushes the heads created so far but
+ * does not detach them, and ipfw_flush_table() asserts IPFW_WLOCK --
+ * confirm both against the callers.
+ */
+int
+ipfw_init_tables(struct ip_fw_chain *ch)
+{
+	int i;
+	uint16_t j;
+
+	for (i = 0; i < IPFW_TABLES_MAX; i++) {
+		if (!rn_inithead((void **)&ch->tables[i], KEY_OFS)) {
+			for (j = 0; j < i; j++) {
+				(void) ipfw_flush_table(ch, j);
+			}
+			return (ENOMEM);
+		}
+	}
+	return (0);
+}
+
+/*
+ * Look up 'addr' in table 'tbl'.
+ * Returns 1 and stores the entry's value in *val on a match;
+ * returns 0 on no match or if the table number is out of range.
+ * Takes no lock itself.
+ */
+int
+ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint32_t *val)
+{
+	struct radix_node_head *rnh;
+	struct table_entry *ent;
+	struct sockaddr_in sa;
+
+	if (tbl >= IPFW_TABLES_MAX)
+		return (0);
+	rnh = ch->tables[tbl];
+	KEY_LEN(sa) = 8;
+	sa.sin_addr.s_addr = addr;
+	ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh));
+	if (ent != NULL) {
+		*val = ent->value;
+		return (1);
+	}
+	return (0);
+}
+
+/* rnh_walktree callback: count one entry. */
+static int
+count_table_entry(struct radix_node *rn, void *arg)
+{
+	u_int32_t * const cnt = arg;
+
+	(*cnt)++;
+	return (0);
+}
+
+/*
+ * Store the number of entries of table 'tbl' in *cnt.
+ * Returns 0, or EINVAL if the table number is out of range.
+ * Takes no lock itself.
+ */
+int
+ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt)
+{
+	struct radix_node_head *rnh;
+
+	if (tbl >= IPFW_TABLES_MAX)
+		return (EINVAL);
+	rnh = ch->tables[tbl];
+	*cnt = 0;
+	rnh->rnh_walktree(rnh, count_table_entry, cnt);
+	return (0);
+}
+
+/*
+ * rnh_walktree callback: append one entry to the ipfw_table passed in
+ * 'arg'. Returns 1, without storing anything, once tbl->size entries
+ * have been written.
+ */
+static int
+dump_table_entry(struct radix_node *rn, void *arg)
+{
+	struct table_entry * const n = (struct table_entry *)rn;
+	ipfw_table * const tbl = arg;
+	ipfw_table_entry *ent;
+
+	if (tbl->cnt == tbl->size)
+		return (1);
+	ent = &tbl->ent[tbl->cnt];
+	ent->tbl = tbl->tbl;
+	if (in_nullhost(n->mask.sin_addr))
+		ent->masklen = 0;
+	else
+		ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr));
+	ent->addr = n->addr.sin_addr.s_addr;
+	ent->value = n->value;
+	tbl->cnt++;
+	return (0);
+}
+
+/*
+ * Copy up to tbl->size entries of table tbl->tbl into tbl->ent[] and
+ * set tbl->cnt to the number stored.
+ * Returns 0, or EINVAL if the table number is out of range.
+ * Takes no lock itself.
+ */
+int
+ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl)
+{
+	struct radix_node_head *rnh;
+
+	if (tbl->tbl >= IPFW_TABLES_MAX)
+		return (EINVAL);
+	rnh = ch->tables[tbl->tbl];
+	tbl->cnt = 0;
+	rnh->rnh_walktree(rnh, dump_table_entry, tbl);
+	return (0);
+}
+/* end of file */
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
new file mode 100644
index 0000000..5af35a7
--- /dev/null
+++ b/sys/netinet/tcp.h
@@ -0,0 +1,228 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4.
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD: src/sys/netinet/tcp.h,v 1.40.2.2 2008/07/31 06:10:25 kmacy Exp $ + */ + +#ifndef _NETINET_TCP_H_ +#define _NETINET_TCP_H_ + +#include + +#define __BSD_VISIBLE 1 + +#if __BSD_VISIBLE + +typedef u_int32_t tcp_seq; + +#define tcp6_seq tcp_seq /* for KAME src sync over BSD*'s */ +#define tcp6hdr tcphdr /* for KAME src sync over BSD*'s */ + +/* + * TCP header. + * Per RFC 793, September, 1981. 
+ */ +struct tcphdr { + u_short th_sport; /* source port */ + u_short th_dport; /* destination port */ + tcp_seq th_seq; /* sequence number */ + tcp_seq th_ack; /* acknowledgement number */ +#if BYTE_ORDER == LITTLE_ENDIAN + u_char th_x2:4, /* (unused) */ + th_off:4; /* data offset */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_char th_off:4, /* data offset */ + th_x2:4; /* (unused) */ +#endif + u_char th_flags; +#define TH_FIN 0x01 +#define TH_SYN 0x02 +#define TH_RST 0x04 +#define TH_PUSH 0x08 +#define TH_ACK 0x10 +#define TH_URG 0x20 +#define TH_ECE 0x40 +#define TH_CWR 0x80 +#define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|TH_ECE|TH_CWR) +#define PRINT_TH_FLAGS "\20\1FIN\2SYN\3RST\4PUSH\5ACK\6URG\7ECE\10CWR" + + u_short th_win; /* window */ + u_short th_sum; /* checksum */ + u_short th_urp; /* urgent pointer */ +}; + +#define TCPOPT_EOL 0 +#define TCPOLEN_EOL 1 +#define TCPOPT_PAD 0 /* padding after EOL */ +#define TCPOLEN_PAD 1 +#define TCPOPT_NOP 1 +#define TCPOLEN_NOP 1 +#define TCPOPT_MAXSEG 2 +#define TCPOLEN_MAXSEG 4 +#define TCPOPT_WINDOW 3 +#define TCPOLEN_WINDOW 3 +#define TCPOPT_SACK_PERMITTED 4 +#define TCPOLEN_SACK_PERMITTED 2 +#define TCPOPT_SACK 5 +#define TCPOLEN_SACKHDR 2 +#define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq) */ +#define TCPOPT_TIMESTAMP 8 +#define TCPOLEN_TIMESTAMP 10 +#define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ +#define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */ +#define TCPOLEN_SIGNATURE 18 + +/* Miscellaneous constants */ +#define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at receiver side */ +#define TCP_MAX_SACK 4 /* MAX # SACKs sent in any segment */ + + +/* + * Default maximum segment size for TCP. + * With an IP MTU of 576, this is 536, + * but 512 is probably more convenient. + * This should be defined as MIN(512, IP_MSS - sizeof (struct tcpiphdr)). 
+ */ +#define TCP_MSS 512 +/* + * TCP_MINMSS is defined to be 216 which is fine for the smallest + * link MTU (256 bytes, AX.25 packet radio) in the Internet. + * However it is very unlikely to come across such low MTU interfaces + * these days (anno dato 2003). + * See tcp_subr.c tcp_minmss SYSCTL declaration for more comments. + * Setting this to "0" disables the minmss check. + */ +#define TCP_MINMSS 216 + +/* + * Default maximum segment size for TCP6. + * With an IP6 MSS of 1280, this is 1220, + * but 1024 is probably more convenient. (xxx kazu in doubt) + * This should be defined as MIN(1024, IP6_MSS - sizeof (struct tcpip6hdr)) + */ +#define TCP6_MSS 1024 + +#define TCP_MAXWIN 65535 /* largest value for (unscaled) window */ +#define TTCP_CLIENT_SND_WND 4096 /* dflt send window for T/TCP client */ + +#define TCP_MAX_WINSHIFT 14 /* maximum window shift */ + +#define TCP_MAXBURST 4 /* maximum segments in a burst */ + +#define TCP_MAXHLEN (0xf<<2) /* max length of header in bytes */ +#define TCP_MAXOLEN (TCP_MAXHLEN - sizeof(struct tcphdr)) + /* max space left for options */ +#endif /* __BSD_VISIBLE */ + +/* + * User-settable options (used with setsockopt). 
+ */ +#define TCP_NODELAY 0x01 /* don't delay send to coalesce packets */ +#if __BSD_VISIBLE +#define TCP_MAXSEG 0x02 /* set maximum segment size */ +#define TCP_NOPUSH 0x04 /* don't push last block of write */ +#define TCP_NOOPT 0x08 /* don't use TCP options */ +#define TCP_MD5SIG 0x10 /* use MD5 digests (RFC2385) */ +#define TCP_INFO 0x20 /* retrieve tcp_info structure */ +#define TCP_CONGESTION 0x40 /* get/set congestion control algorithm */ + +#define TCP_CA_NAME_MAX 16 /* max congestion control name length */ + +#define TCPI_OPT_TIMESTAMPS 0x01 +#define TCPI_OPT_SACK 0x02 +#define TCPI_OPT_WSCALE 0x04 +#define TCPI_OPT_ECN 0x08 +#define TCPI_OPT_TOE 0x10 + +/* + * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits + * the caller to query certain information about the state of a TCP + * connection. We provide an overlapping set of fields with the Linux + * implementation, but since this is a fixed size structure, room has been + * left for growth. In order to maximize potential future compatibility with + * the Linux API, the same variable names and order have been adopted, and + * padding left to make room for omitted fields in case they are added later. + * + * XXX: This is currently an unstable ABI/API, in that it is expected to + * change. + */ +struct tcp_info { + u_int8_t tcpi_state; /* TCP FSM state. */ + u_int8_t __tcpi_ca_state; + u_int8_t __tcpi_retransmits; + u_int8_t __tcpi_probes; + u_int8_t __tcpi_backoff; + u_int8_t tcpi_options; /* Options enabled on conn. */ + u_int8_t tcpi_snd_wscale:4, /* RFC1323 send shift value. */ + tcpi_rcv_wscale:4; /* RFC1323 recv shift value. */ + + u_int32_t __tcpi_rto; + u_int32_t __tcpi_ato; + u_int32_t __tcpi_snd_mss; + u_int32_t __tcpi_rcv_mss; + + u_int32_t __tcpi_unacked; + u_int32_t __tcpi_sacked; + u_int32_t __tcpi_lost; + u_int32_t __tcpi_retrans; + u_int32_t __tcpi_fackets; + + /* Times; measurements in usecs. 
*/ + u_int32_t __tcpi_last_data_sent; + u_int32_t __tcpi_last_ack_sent; /* Also unimpl. on Linux? */ + u_int32_t __tcpi_last_data_recv; + u_int32_t __tcpi_last_ack_recv; + + /* Metrics; variable units. */ + u_int32_t __tcpi_pmtu; + u_int32_t __tcpi_rcv_ssthresh; + u_int32_t tcpi_rtt; /* Smoothed RTT in usecs. */ + u_int32_t tcpi_rttvar; /* RTT variance in usecs. */ + u_int32_t tcpi_snd_ssthresh; /* Slow start threshold. */ + u_int32_t tcpi_snd_cwnd; /* Send congestion window. */ + u_int32_t __tcpi_advmss; + u_int32_t __tcpi_reordering; + + u_int32_t __tcpi_rcv_rtt; + u_int32_t tcpi_rcv_space; /* Advertised recv window. */ + + /* FreeBSD extensions to tcp_info. */ + u_int32_t tcpi_snd_wnd; /* Advertised send window. */ + u_int32_t tcpi_snd_bwnd; /* Bandwidth send window. */ + u_int32_t tcpi_snd_nxt; /* Next egress seqno */ + u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */ + u_int32_t tcpi_toe_tid; /* HWTID for TOE endpoints */ + + /* Padding to grow without breaking ABI. */ + u_int32_t __tcpi_pad[29]; /* Padding. */ +}; +#endif + +#endif /* !_NETINET_TCP_H_ */ diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h new file mode 100644 index 0000000..35196a2 --- /dev/null +++ b/sys/netinet/tcp_var.h @@ -0,0 +1,4 @@ +#ifndef _NETINET_TCP_VAR_H_ +#define _NETINET_TCP_VAR_H_ +#include +#endif /* !_NETINET_TCP_VAR_H_ */ diff --git a/sys/netinet/udp.h b/sys/netinet/udp.h new file mode 100644 index 0000000..cd75bd1 --- /dev/null +++ b/sys/netinet/udp.h @@ -0,0 +1,67 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)udp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD: src/sys/netinet/udp.h,v 1.10 2007/02/20 10:13:11 rwatson Exp $ + */ + +#ifndef _NETINET_UDP_H_ +#define _NETINET_UDP_H_ + +/* + * UDP protocol header. + * Per RFC 768, September, 1981. + */ +struct udphdr { + u_short uh_sport; /* source port */ + u_short uh_dport; /* destination port */ + u_short uh_ulen; /* udp length */ + u_short uh_sum; /* udp checksum */ +}; + +/* + * User-settable options (used with setsockopt). + */ +#define UDP_ENCAP 0x01 + + +/* + * UDP Encapsulation of IPsec Packets options. + */ +/* Encapsulation types. */ +#define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */ +#define UDP_ENCAP_ESPINUDP 2 /* draft-ietf-ipsec-udp-encaps-02+ */ + +/* Default ESP in UDP encapsulation port. 
*/ +#define UDP_ENCAP_ESPINUDP_PORT 500 + +/* Maximum UDP fragment size for ESP over UDP. */ +#define UDP_ENCAP_ESPINUDP_MAXFRAGLEN 552 + +#endif diff --git a/sys/sys/cdefs.h b/sys/sys/cdefs.h new file mode 100644 index 0000000..82c9851 --- /dev/null +++ b/sys/sys/cdefs.h @@ -0,0 +1,29 @@ +#ifndef _CDEFS_H_ +#define _CDEFS_H_ + +/* + * various compiler macros and common functions + */ + +#ifndef __packed +#define __packed __attribute__ ((__packed__)) +#endif + +#ifndef __aligned +#define __aligned(x) __attribute__((__aligned__(x))) +#endif + +/* defined as assert */ +void panic(const char *fmt, ...); + +#define KASSERT(exp,msg) do { \ + if (__predict_false(!(exp))) \ + panic msg; \ +} while (0) + +/* don't bother to optimize */ +#ifndef __predict_false +#define __predict_false(x) (x) /* __builtin_expect((exp), 0) */ +#endif + +#endif /* !_CDEFS_H_ */ diff --git a/sys/sys/kernel.h b/sys/sys/kernel.h new file mode 100644 index 0000000..fbc9581 --- /dev/null +++ b/sys/sys/kernel.h @@ -0,0 +1,26 @@ +/* + * from freebsd's kernel.h + */ +#ifndef _SYS_KERNEL_H_ +#define _SYS_KERNEL_H_ + +#define SYSINIT(a, b, c, d, e) \ + void *sysinit_ ## d = d +#define VNET_SYSINIT(a, b, c, d, e) \ + void *sysinit_ ## d = d +#define SYSUNINIT(a, b, c, d, e) \ + void *sysuninit_ ## d = d +#define VNET_SYSUNINIT(a, b, c, d, e) \ + void *sysuninit_ ## d = d + +/* + * Some enumerated orders; "ANY" sorts last. + */ +enum sysinit_elem_order { + SI_ORDER_FIRST = 0x0000000, /* first*/ + SI_ORDER_SECOND = 0x0000001, /* second*/ + SI_ORDER_THIRD = 0x0000002, /* third*/ + SI_ORDER_MIDDLE = 0x1000000, /* somewhere in the middle */ + SI_ORDER_ANY = 0xfffffff /* last*/ +}; +#endif diff --git a/sys/sys/malloc.h b/sys/sys/malloc.h new file mode 100644 index 0000000..ac16aed --- /dev/null +++ b/sys/sys/malloc.h @@ -0,0 +1,59 @@ +#ifndef _SYS_MALLOC_H_ +#define _SYS_MALLOC_H_ + +/* + * No matter what, try to get clear memory and be non-blocking. 
+ * XXX check if 2.4 has a native way to zero memory, + * XXX obey the flags (M_NOWAIT <-> GFP_ATOMIC, M_WAIT <-> GFP_KERNEL) + */ +#ifndef _WIN32 /* this is the linux version */ + +/* + * XXX On zeroshell (2.6.25.17) we get a load error + * __you_cannot_kmalloc_that_much + * which is triggered when kmalloc() is called with a large + * compile-time constant argument (include/linux/slab_def.h) + * + * I think it may be a compiler (or source) bug because there is no + * evidence that such a large request is made. + * Making the _size argument to kmalloc volatile prevents the compiler + * from making the mistake, though it is clearly not ideal. + */ + +#if !defined (LINUX_24) && LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22) +#define malloc(_size, type, flags) \ + ({ volatile int _v = (_size); kmalloc(_v, GFP_ATOMIC | __GFP_ZERO); }) +#else /* LINUX <= 2.6.22 and LINUX_24 */ +/* linux 2.6.22 does not zero allocated memory */ +#define malloc(_size, type, flags) \ + ({ int _s = (_size); \ + void *_ret = kmalloc(_s, GFP_ATOMIC); \ + if (_ret) memset(_ret, 0, _s); \ + (_ret); \ + }) +#endif /* LINUX <= 2.6.22 */ + +#define calloc(_n, _s) malloc(((_n) * (_s)), NULL, GFP_ATOMIC | __GFP_ZERO) /* operands are parenthesized so calloc(a + b, c) multiplies the whole sum; malloc() above ignores its type and flags arguments */ +#define free(_var, type) kfree(_var) + +#else /* _WIN32, the windows version */ + +/* + * ntddk.h uses win_malloc() and MmFreeContiguousMemory().
+ * wipfw uses + * ExAllocatePoolWithTag(, pool, len, tag) + * ExFreePoolWithTag(ptr, tag) + */ +#define malloc(_size, _type, _flags) my_alloc(_size) +#define calloc(_size, _type, _flags) my_alloc(_size) /* NOTE(review): three-argument form, unlike the (_n, _s) calloc of the linux branch above - confirm against callers */ + +void *my_alloc(int _size); +/* the 'tag' version does not work without -Gz in the linker */ +#define free(_var, type) ExFreePool(_var) +//#define free(_var, type) ExFreePoolWithTag(_var, 'wfpi') + +#endif /* _WIN32 */ + +#define M_NOWAIT 0x0001 /* do not block */ +#define M_ZERO 0x0100 /* bzero the allocation */ +#endif /* _SYS_MALLOC_H_ */ diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h new file mode 100644 index 0000000..894b221 --- /dev/null +++ b/sys/sys/mbuf.h @@ -0,0 +1,267 @@ +/* + * Copyright (C) 2009 Luigi Rizzo, Universita` di Pisa + * + * BSD copyright. + * + * A simple compatibility interface to map mbufs onto sk_buff + */ + +#ifndef _SYS_MBUF_H_ +#define _SYS_MBUF_H_ + +#include <sys/malloc.h> /* we use free() */ +/* hopefully queue.h is already included by someone else */ +#include <sys/queue.h> +#ifdef _KERNEL + +/* bzero not present on linux, but this should go in glue.h */ +// #define bzero(s, n) memset(s, 0, n) + +/* + * We implement a very simplified UMA allocator where the backend + * is simply malloc, and uma_zone only stores the length of the components. + */ +typedef int uma_zone_t; /* the zone size */ + +#define uma_zcreate(name, len, _3, _4, _5, _6, _7, _8) (len) + + +#define uma_zfree(zone, item) free(item, M_IPFW) +#define uma_zalloc(zone, flags) malloc(zone, M_IPFW, flags) +#define uma_zdestroy(zone) do {} while (0) + +/*- + * Macros for type conversion: + * mtod(m, t) -- Convert mbuf pointer to data pointer of correct type. + */ +#define mtod(m, t) ((t)((m)->m_data)) + +#endif /* _KERNEL */ + +/* + * Packet tag structure (see below for details).
+ */ +struct m_tag { + SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ + u_int16_t m_tag_id; /* Tag ID */ + u_int16_t m_tag_len; /* Length of data */ + u_int32_t m_tag_cookie; /* ABI/Module ID */ + void (*m_tag_free)(struct m_tag *); /* never called by the helpers in this header; m_tag_alloc() leaves it NULL */ +}; + +#if defined(__linux__) || defined( _WIN32 ) + +/* + * Auxiliary structure to store values from the sk_buff. + * Note that we should not alter the sk_buff, and if we do + * so make sure to keep the values in sync between the mbuf + * and the sk_buff (especially m_len and m_pkthdr.len). + */ + +struct mbuf { + struct mbuf *m_next; + struct mbuf *m_nextpkt; + char *m_data; // XXX was void * + int m_len; /* length in this mbuf */ + int m_flags; +#ifdef __linux__ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) + struct nf_info *queue_entry; +#else + struct nf_queue_entry *queue_entry; +#endif +#else /* _WIN32 */ + int direction; /* could go in rcvif */ + NDIS_HANDLE context; /* replaces queue_entry or skb ?*/ + PNDIS_PACKET pkt; +#endif + struct sk_buff *m_skb; + struct { +#ifdef __linux__ + struct net_device *rcvif; +#else + struct ifnet *rcvif; +#endif + int len; /* total packet len */ + SLIST_HEAD (packet_tags, m_tag) tags; + } m_pkthdr; +}; + +#define M_SKIP_FIREWALL 0x01 /* skip firewall processing */ +#define M_BCAST 0x02 /* send/received as link-level broadcast */ +#define M_MCAST 0x04 /* send/received as link-level multicast */ + +#define M_DONTWAIT M_NOWAIT /* should not be here... */ + + +/* + * m_dup() is used in the TEE case, currently unsupported so we + * just return.
+ */ +static __inline struct mbuf *m_dup(struct mbuf *m, int n) +{ + (void)m; (void)n; + return NULL; +} + +#define MTAG_ABI_COMPAT 0 /* compatibility ABI */ +static __inline struct m_tag * +m_tag_find(struct mbuf *m, int type, struct m_tag *start) +{ + (void)m; (void)type; (void)start; + return NULL; /* stub: never finds a tag */ +} + + +static __inline void +m_tag_prepend(struct mbuf *m, struct m_tag *t) +{ + SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); +} + +/* + * Return the next tag in the list of tags associated with an mbuf. + */ +static __inline struct m_tag * +m_tag_next(struct mbuf *m, struct m_tag *t) +{ + + return (SLIST_NEXT(t, m_tag_link)); +} + +/* + * Create an mtag of the given type + */ +static __inline struct m_tag * +m_tag_alloc(uint32_t cookie, int type, int length, int wait) +{ + int l = length + sizeof(struct m_tag); + struct m_tag *m = malloc(l, 0, M_NOWAIT); /* 'wait' is ignored, the request is always M_NOWAIT */ + if (m) { + memset(m, 0, l); + m->m_tag_id = type; + m->m_tag_len = length; + m->m_tag_cookie = cookie; + } + return m; +} + +static __inline struct m_tag * +m_tag_get(int type, int length, int wait) +{ + return m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait); +} + +static __inline struct m_tag * +m_tag_first(struct mbuf *m) +{ + return SLIST_FIRST(&m->m_pkthdr.tags); +} + +static __inline void +m_tag_delete(struct mbuf *m, struct m_tag *t) +{ /* no-op: the tag stays linked until m_freem() releases the whole chain */ +} + +static __inline struct m_tag * +m_tag_locate(struct mbuf *m, u_int32_t n, int x, struct m_tag *t) +{ + struct m_tag *tag; + + /* First tag with cookie n and id x; the whole chain is scanned, + * resuming right after t when t is not NULL. */ + tag = (t == NULL) ? m_tag_first(m) : SLIST_NEXT(t, m_tag_link); + while (tag != NULL && + (tag->m_tag_cookie != n || tag->m_tag_id != x)) + tag = SLIST_NEXT(tag, m_tag_link); + + return tag; +} + +#define M_SETFIB(_m, _fib) /* nothing on linux */ + +static __inline void +m_freem(struct mbuf *m) +{ + struct m_tag *t; + + /* free the m_tag chain */ + while ( (t = SLIST_FIRST(&m->m_pkthdr.tags) ) ) { + SLIST_REMOVE_HEAD(&m->m_pkthdr.tags, m_tag_link); + free(t, 0); + } + + /* free the mbuf */ + free(m, M_IPFW); +} + +/* m_pullup is not supported, there is a macro in missing.h
*/ + +#define M_GETFIB(_m) 0 + +/* macro used to create a new mbuf */ +#define MT_DATA 1 /* dynamic (data) allocation */ +#define MSIZE 256 /* size of an mbuf */ +#define MGETHDR(_m, _how, _type) ((_m) = m_gethdr((_how), (_type))) + +/* allocate and init a new mbuf using the same structure of FreeBSD; how and type are ignored (always MSIZE bytes, M_NOWAIT) and NULL is returned when the allocation fails */ +static __inline struct mbuf * +m_gethdr(int how, short type) +{ + struct mbuf *m; + + m = malloc(MSIZE, M_IPFW, M_NOWAIT); + + if (m == NULL) { + return m; + } + + /* here we have MSIZE - sizeof(struct mbuf) available: m_data points just past the header, inside the same allocation */ + m->m_data = (char *)(m + 1); + + return m; +} + +#endif /* __linux__ || _WIN32 */ + +/* + * Persistent tags stay with an mbuf until the mbuf is reclaimed. Otherwise + * tags are expected to ``vanish'' when they pass through a network + * interface. For most interfaces this happens normally as the tags are + * reclaimed when the mbuf is free'd. However in some special cases + * reclaiming must be done manually. An example is packets that pass through + * the loopback interface. Also, one must be careful to do this when + * ``turning around'' packets (e.g., icmp_reflect). + * + * To mark a tag persistent bit-or this flag in when defining the tag id. + * The tag will then be treated as described above. + */ +#define MTAG_PERSISTENT 0x800 + +#define PACKET_TAG_NONE 0 /* Nadda */ + +/* Packet tags for use with PACKET_ABI_COMPAT.
*/ +#define PACKET_TAG_IPSEC_IN_DONE 1 /* IPsec applied, in */ +#define PACKET_TAG_IPSEC_OUT_DONE 2 /* IPsec applied, out */ +#define PACKET_TAG_IPSEC_IN_CRYPTO_DONE 3 /* NIC IPsec crypto done */ +#define PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED 4 /* NIC IPsec crypto req'ed */ +#define PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO 5 /* NIC notifies IPsec */ +#define PACKET_TAG_IPSEC_PENDING_TDB 6 /* Reminder to do IPsec */ +#define PACKET_TAG_BRIDGE 7 /* Bridge processing done */ +#define PACKET_TAG_GIF 8 /* GIF processing done */ +#define PACKET_TAG_GRE 9 /* GRE processing done */ +#define PACKET_TAG_IN_PACKET_CHECKSUM 10 /* NIC checksumming done */ +#define PACKET_TAG_ENCAP 11 /* Encap. processing */ +#define PACKET_TAG_IPSEC_SOCKET 12 /* IPSEC socket ref */ +#define PACKET_TAG_IPSEC_HISTORY 13 /* IPSEC history */ +#define PACKET_TAG_IPV6_INPUT 14 /* IPV6 input processing */ +#define PACKET_TAG_DUMMYNET 15 /* dummynet info */ +#define PACKET_TAG_DIVERT 17 /* divert info */ +#define PACKET_TAG_IPFORWARD 18 /* ipforward info */ +#define PACKET_TAG_MACLABEL (19 | MTAG_PERSISTENT) /* MAC label */ +#define PACKET_TAG_PF 21 /* PF + ALTQ information */ +#define PACKET_TAG_RTSOCKFAM 25 /* rtsock sa family */ +#define PACKET_TAG_IPOPTIONS 27 /* Saved IP options */ +#define PACKET_TAG_CARP 28 /* CARP info */ + +#endif /* !_SYS_MBUF_H_ */ diff --git a/sys/sys/module.h b/sys/sys/module.h new file mode 100644 index 0000000..85bf220 --- /dev/null +++ b/sys/sys/module.h @@ -0,0 +1,41 @@ +/* + * trivial module support + */ +#ifndef _SYS_MODULE_H_ +#define _SYS_MODULE_H_ +typedef struct module *module_t; +typedef int (*modeventhand_t)(module_t, int /* modeventtype_t */, void *); + +typedef enum modeventtype { + MOD_LOAD, + MOD_UNLOAD, + MOD_SHUTDOWN, + MOD_QUIESCE +} modeventtype_t; + +typedef struct moduledata { + const char *name; /* module name */ + modeventhand_t evhand; /* event handler */ + void *priv; /* extra data */ +} moduledata_t; + +/* + * Hook the module descriptor, md, into our 
list of things to do. + * We should in principle respect the order of loading. + * + * XXX use the gcc .init functions + */ +#define DECLARE_MODULE(a, md, c,d) \ + moduledata_t *moddesc_##a = &md; + +/* + * XXX MODULE_VERSION is define in linux too + */ +#define MODULE_DEPEND(a,b,c,d,e) +#if defined( __linux__ ) || defined( _WIN32 ) +#undef MODULE_VERSION +#define MODULE_VERSION(a,b) +#endif + +#endif /* _SYS_MODULE_H_ */ + diff --git a/sys/sys/param.h b/sys/sys/param.h new file mode 100644 index 0000000..f068998 --- /dev/null +++ b/sys/sys/param.h @@ -0,0 +1,11 @@ +#ifndef _SYS_PARAM_H_ +#define _SYS_PARAM_H_ + +/* + * number of additional groups + */ +#ifndef LINUX_24 +#define NGROUPS 16 +#endif + +#endif /* _SYS_PARAM_H_ */ diff --git a/sys/sys/queue.h b/sys/sys/queue.h new file mode 100644 index 0000000..3630218 --- /dev/null +++ b/sys/sys/queue.h @@ -0,0 +1,623 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + * $FreeBSD: src/sys/sys/queue.h,v 1.68 2006/10/24 11:20:29 ru Exp $ + */ + +#ifndef _SYS_QUEUE_H_ +#define _SYS_QUEUE_H_ + +//#include + +/* + * This file defines four types of data structures: singly-linked lists, + * singly-linked tail queues, lists and tail queues. + * + * A singly-linked list is headed by a single forward pointer. The elements + * are singly linked for minimum space and pointer manipulation overhead at + * the expense of O(n) removal for arbitrary elements. New elements can be + * added to the list after an existing element or at the head of the list. + * Elements being removed from the head of the list should use the explicit + * macro for this purpose for optimum efficiency. A singly-linked list may + * only be traversed in the forward direction. Singly-linked lists are ideal + * for applications with large datasets and few or no removals or for + * implementing a LIFO queue. + * + * A singly-linked tail queue is headed by a pair of pointers, one to the + * head of the list and the other to the tail of the list. The elements are + * singly linked for minimum space and pointer manipulation overhead at the + * expense of O(n) removal for arbitrary elements. New elements can be added + * to the list after an existing element, at the head of the list, or at the + * end of the list. 
Elements being removed from the head of the tail queue + * should use the explicit macro for this purpose for optimum efficiency. + * A singly-linked tail queue may only be traversed in the forward direction. + * Singly-linked tail queues are ideal for applications with large datasets + * and few or no removals or for implementing a FIFO queue. + * + * A list is headed by a single forward pointer (or an array of forward + * pointers for a hash table header). The elements are doubly linked + * so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before + * or after an existing element or at the head of the list. A list + * may only be traversed in the forward direction. + * + * A tail queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or + * after an existing element, at the head of the list, or at the end of + * the list. A tail queue may be traversed in either direction. + * + * For details on the use of these macros, see the queue(3) manual page. 
+ * + * + * SLIST LIST STAILQ TAILQ + * _HEAD + + + + + * _HEAD_INITIALIZER + + + + + * _ENTRY + + + + + * _INIT + + + + + * _EMPTY + + + + + * _FIRST + + + + + * _NEXT + + + + + * _PREV - - - + + * _LAST - - + + + * _FOREACH + + + + + * _FOREACH_SAFE + + + + + * _FOREACH_REVERSE - - - + + * _FOREACH_REVERSE_SAFE - - - + + * _INSERT_HEAD + + + + + * _INSERT_BEFORE - + - + + * _INSERT_AFTER + + + + + * _INSERT_TAIL - - + + + * _CONCAT - - + + + * _REMOVE_HEAD + - + - + * _REMOVE + + + + + * + */ +#ifdef QUEUE_MACRO_DEBUG +/* Store the last 2 places the queue element or head was altered */ +struct qm_trace { + char * lastfile; + int lastline; + char * prevfile; + int prevline; +}; + +#define TRACEBUF struct qm_trace trace; +#define TRASHIT(x) do {(x) = (void *)-1;} while (0) + +#define QMD_TRACE_HEAD(head) do { \ + (head)->trace.prevline = (head)->trace.lastline; \ + (head)->trace.prevfile = (head)->trace.lastfile; \ + (head)->trace.lastline = __LINE__; \ + (head)->trace.lastfile = __FILE__; \ +} while (0) + +#define QMD_TRACE_ELEM(elem) do { \ + (elem)->trace.prevline = (elem)->trace.lastline; \ + (elem)->trace.prevfile = (elem)->trace.lastfile; \ + (elem)->trace.lastline = __LINE__; \ + (elem)->trace.lastfile = __FILE__; \ +} while (0) + +#else +#define QMD_TRACE_ELEM(elem) +#define QMD_TRACE_HEAD(head) +#define TRACEBUF +#define TRASHIT(x) +#endif /* QUEUE_MACRO_DEBUG */ + +/* + * Singly-linked List declarations. + */ +#define SLIST_HEAD(name, type) \ +struct name { \ + struct type *slh_first; /* first element */ \ +} + +#define SLIST_HEAD_INITIALIZER(head) \ + { NULL } + +#if defined( _WIN32 ) && defined(SLIST_ENTRY) +#undef SLIST_ENTRY +#endif +#define SLIST_ENTRY(type) \ +struct { \ + struct type *sle_next; /* next element */ \ +} + +/* + * Singly-linked List functions. 
+ */ +#define SLIST_EMPTY(head) ((head)->slh_first == NULL) + +#define SLIST_FIRST(head) ((head)->slh_first) + +#define SLIST_FOREACH(var, head, field) \ + for ((var) = SLIST_FIRST((head)); \ + (var); \ + (var) = SLIST_NEXT((var), field)) + +#define SLIST_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = SLIST_FIRST((head)); \ + (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \ + for ((varp) = &SLIST_FIRST((head)); \ + ((var) = *(varp)) != NULL; \ + (varp) = &SLIST_NEXT((var), field)) + +#define SLIST_INIT(head) do { \ + SLIST_FIRST((head)) = NULL; \ +} while (0) + +#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ + SLIST_NEXT((slistelm), field) = (elm); \ +} while (0) + +#define SLIST_INSERT_HEAD(head, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ + SLIST_FIRST((head)) = (elm); \ +} while (0) + +#define SLIST_NEXT(elm, field) ((elm)->field.sle_next) + +#define SLIST_REMOVE(head, elm, type, field) do { \ + if (SLIST_FIRST((head)) == (elm)) { \ + SLIST_REMOVE_HEAD((head), field); \ + } \ + else { \ + struct type *curelm = SLIST_FIRST((head)); \ + while (SLIST_NEXT(curelm, field) != (elm)) \ + curelm = SLIST_NEXT(curelm, field); \ + SLIST_NEXT(curelm, field) = \ + SLIST_NEXT(SLIST_NEXT(curelm, field), field); \ + } \ + TRASHIT((elm)->field.sle_next); \ +} while (0) + +#define SLIST_REMOVE_HEAD(head, field) do { \ + SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ +} while (0) + +/* + * Singly-linked Tail queue declarations. 
+ */ +#define STAILQ_HEAD(name, type) \ +struct name { \ + struct type *stqh_first;/* first element */ \ + struct type **stqh_last;/* addr of last next element */ \ +} + +#define STAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).stqh_first } + +#define STAILQ_ENTRY(type) \ +struct { \ + struct type *stqe_next; /* next element */ \ +} + +/* + * Singly-linked Tail queue functions. + */ +#define STAILQ_CONCAT(head1, head2) do { \ + if (!STAILQ_EMPTY((head2))) { \ + *(head1)->stqh_last = (head2)->stqh_first; \ + (head1)->stqh_last = (head2)->stqh_last; \ + STAILQ_INIT((head2)); \ + } \ +} while (0) + +#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) + +#define STAILQ_FIRST(head) ((head)->stqh_first) + +#define STAILQ_FOREACH(var, head, field) \ + for((var) = STAILQ_FIRST((head)); \ + (var); \ + (var) = STAILQ_NEXT((var), field)) + + +#define STAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = STAILQ_FIRST((head)); \ + (var) && ((tvar) = STAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define STAILQ_INIT(head) do { \ + STAILQ_FIRST((head)) = NULL; \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_NEXT((tqelm), field) = (elm); \ +} while (0) + +#define STAILQ_INSERT_HEAD(head, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_FIRST((head)) = (elm); \ +} while (0) + +#define STAILQ_INSERT_TAIL(head, elm, field) do { \ + STAILQ_NEXT((elm), field) = NULL; \ + *(head)->stqh_last = (elm); \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ +} while (0) + +#define STAILQ_LAST(head, type, field) \ + (STAILQ_EMPTY((head)) ? 
\ + NULL : \ + ((struct type *)(void *) \ + ((char *)((head)->stqh_last) - __offsetof(struct type, field)))) + +#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) + +#define STAILQ_REMOVE(head, elm, type, field) do { \ + if (STAILQ_FIRST((head)) == (elm)) { \ + STAILQ_REMOVE_HEAD((head), field); \ + } \ + else { \ + struct type *curelm = STAILQ_FIRST((head)); \ + while (STAILQ_NEXT(curelm, field) != (elm)) \ + curelm = STAILQ_NEXT(curelm, field); \ + if ((STAILQ_NEXT(curelm, field) = \ + STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((curelm), field);\ + } \ + TRASHIT((elm)->field.stqe_next); \ +} while (0) + +#define STAILQ_REMOVE_HEAD(head, field) do { \ + if ((STAILQ_FIRST((head)) = \ + STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#ifndef LIST_HEAD +/* + * List declarations. + */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#define LIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define LIST_ENTRY(type) \ +struct { \ + struct type *le_next; /* next element */ \ + struct type **le_prev; /* address of previous next element */ \ +} + +/* + * List functions. 
+ */ + +#if (defined(_KERNEL) && defined(INVARIANTS)) +#define QMD_LIST_CHECK_HEAD(head, field) do { \ + if (LIST_FIRST((head)) != NULL && \ + LIST_FIRST((head))->field.le_prev != \ + &LIST_FIRST((head))) \ + panic("Bad list head %p first->prev != head", (head)); \ +} while (0) + +#define QMD_LIST_CHECK_NEXT(elm, field) do { \ + if (LIST_NEXT((elm), field) != NULL && \ + LIST_NEXT((elm), field)->field.le_prev != \ + &((elm)->field.le_next)) \ + panic("Bad link elm %p next->prev != elm", (elm)); \ +} while (0) + +#define QMD_LIST_CHECK_PREV(elm, field) do { \ + if (*(elm)->field.le_prev != (elm)) \ + panic("Bad link elm %p prev->next != elm", (elm)); \ +} while (0) +#else +#define QMD_LIST_CHECK_HEAD(head, field) +#define QMD_LIST_CHECK_NEXT(elm, field) +#define QMD_LIST_CHECK_PREV(elm, field) +#endif /* (_KERNEL && INVARIANTS) */ + +#define LIST_EMPTY(head) ((head)->lh_first == NULL) + +#define LIST_FIRST(head) ((head)->lh_first) + +#define LIST_FOREACH(var, head, field) \ + for ((var) = LIST_FIRST((head)); \ + (var); \ + (var) = LIST_NEXT((var), field)) + +#define LIST_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = LIST_FIRST((head)); \ + (var) && ((tvar) = LIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define LIST_INIT(head) do { \ + LIST_FIRST((head)) = NULL; \ +} while (0) + +#define LIST_INSERT_AFTER(listelm, elm, field) do { \ + QMD_LIST_CHECK_NEXT(listelm, field); \ + if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ + LIST_NEXT((listelm), field)->field.le_prev = \ + &LIST_NEXT((elm), field); \ + LIST_NEXT((listelm), field) = (elm); \ + (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ +} while (0) + +#define LIST_INSERT_BEFORE(listelm, elm, field) do { \ + QMD_LIST_CHECK_PREV(listelm, field); \ + (elm)->field.le_prev = (listelm)->field.le_prev; \ + LIST_NEXT((elm), field) = (listelm); \ + *(listelm)->field.le_prev = (elm); \ + (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ +} while (0) + +#define 
LIST_INSERT_HEAD(head, elm, field) do { \ + QMD_LIST_CHECK_HEAD((head), field); \ + if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ + LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\ + LIST_FIRST((head)) = (elm); \ + (elm)->field.le_prev = &LIST_FIRST((head)); \ +} while (0) + +#define LIST_NEXT(elm, field) ((elm)->field.le_next) + +#define LIST_REMOVE(elm, field) do { \ + QMD_LIST_CHECK_NEXT(elm, field); \ + QMD_LIST_CHECK_PREV(elm, field); \ + if (LIST_NEXT((elm), field) != NULL) \ + LIST_NEXT((elm), field)->field.le_prev = \ + (elm)->field.le_prev; \ + *(elm)->field.le_prev = LIST_NEXT((elm), field); \ + TRASHIT((elm)->field.le_next); \ + TRASHIT((elm)->field.le_prev); \ +} while (0) +#endif /* LIST_HEAD */ + +/* + * Tail queue declarations. + */ +#define TAILQ_HEAD(name, type) \ +struct name { \ + struct type *tqh_first; /* first element */ \ + struct type **tqh_last; /* addr of last next element */ \ + TRACEBUF \ +} + +#define TAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).tqh_first } + +#define TAILQ_ENTRY(type) \ +struct { \ + struct type *tqe_next; /* next element */ \ + struct type **tqe_prev; /* address of previous next element */ \ + TRACEBUF \ +} + +/* + * Tail queue functions. 
+ */ +#if (defined(_KERNEL) && defined(INVARIANTS)) +#define QMD_TAILQ_CHECK_HEAD(head, field) do { \ + if (!TAILQ_EMPTY(head) && \ + TAILQ_FIRST((head))->field.tqe_prev != \ + &TAILQ_FIRST((head))) \ + panic("Bad tailq head %p first->prev != head", (head)); \ +} while (0) + +#define QMD_TAILQ_CHECK_TAIL(head, field) do { \ + if (*(head)->tqh_last != NULL) \ + panic("Bad tailq NEXT(%p->tqh_last) != NULL", (head)); \ +} while (0) + +#define QMD_TAILQ_CHECK_NEXT(elm, field) do { \ + if (TAILQ_NEXT((elm), field) != NULL && \ + TAILQ_NEXT((elm), field)->field.tqe_prev != \ + &((elm)->field.tqe_next)) \ + panic("Bad link elm %p next->prev != elm", (elm)); \ +} while (0) + +#define QMD_TAILQ_CHECK_PREV(elm, field) do { \ + if (*(elm)->field.tqe_prev != (elm)) \ + panic("Bad link elm %p prev->next != elm", (elm)); \ +} while (0) +#else +#define QMD_TAILQ_CHECK_HEAD(head, field) +#define QMD_TAILQ_CHECK_TAIL(head, headname) +#define QMD_TAILQ_CHECK_NEXT(elm, field) +#define QMD_TAILQ_CHECK_PREV(elm, field) +#endif /* (_KERNEL && INVARIANTS) */ + +#define TAILQ_CONCAT(head1, head2, field) do { \ + if (!TAILQ_EMPTY(head2)) { \ + *(head1)->tqh_last = (head2)->tqh_first; \ + (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \ + (head1)->tqh_last = (head2)->tqh_last; \ + TAILQ_INIT((head2)); \ + QMD_TRACE_HEAD(head1); \ + QMD_TRACE_HEAD(head2); \ + } \ +} while (0) + +#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) + +#define TAILQ_FIRST(head) ((head)->tqh_first) + +#define TAILQ_FOREACH(var, head, field) \ + for ((var) = TAILQ_FIRST((head)); \ + (var); \ + (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = TAILQ_FIRST((head)); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var); \ + (var) = TAILQ_PREV((var), headname, field)) + +#define TAILQ_FOREACH_REVERSE_SAFE(var, head, 
headname, field, tvar) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ + (var) = (tvar)) + +#define TAILQ_INIT(head) do { \ + TAILQ_FIRST((head)) = NULL; \ + (head)->tqh_last = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ +} while (0) + +#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + QMD_TAILQ_CHECK_NEXT(listelm, field); \ + if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else { \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + } \ + TAILQ_NEXT((listelm), field) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&listelm->field); \ +} while (0) + +#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ + QMD_TAILQ_CHECK_PREV(listelm, field); \ + (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ + TAILQ_NEXT((elm), field) = (listelm); \ + *(listelm)->field.tqe_prev = (elm); \ + (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&listelm->field); \ +} while (0) + +#define TAILQ_INSERT_HEAD(head, elm, field) do { \ + QMD_TAILQ_CHECK_HEAD(head, field); \ + if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ + TAILQ_FIRST((head))->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + TAILQ_FIRST((head)) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_INSERT_TAIL(head, elm, field) do { \ + QMD_TAILQ_CHECK_TAIL(head, field); \ + TAILQ_NEXT((elm), field) = NULL; \ + (elm)->field.tqe_prev = (head)->tqh_last; \ + *(head)->tqh_last = (elm); \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while 
(0) + +#define TAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) + +#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) + +#define TAILQ_PREV(elm, headname, field) \ + (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) + +#define TAILQ_REMOVE(head, elm, field) do { \ + QMD_TAILQ_CHECK_NEXT(elm, field); \ + QMD_TAILQ_CHECK_PREV(elm, field); \ + if ((TAILQ_NEXT((elm), field)) != NULL) \ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + (elm)->field.tqe_prev; \ + else { \ + (head)->tqh_last = (elm)->field.tqe_prev; \ + QMD_TRACE_HEAD(head); \ + } \ + *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ + TRASHIT((elm)->field.tqe_next); \ + TRASHIT((elm)->field.tqe_prev); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + + +#ifdef _KERNEL + +/* + * XXX insque() and remque() are an old way of handling certain queues. + * They bogusly assumes that all queue heads look alike. + */ + +struct quehead { + struct quehead *qh_link; + struct quehead *qh_rlink; +}; + +#ifdef __CC_SUPPORTS___INLINE + +static __inline void +insque(void *a, void *b) +{ + struct quehead *element = (struct quehead *)a, + *head = (struct quehead *)b; + + element->qh_link = head->qh_link; + element->qh_rlink = head; + head->qh_link = element; + element->qh_link->qh_rlink = element; +} + +static __inline void +remque(void *a) +{ + struct quehead *element = (struct quehead *)a; + + element->qh_link->qh_rlink = element->qh_rlink; + element->qh_rlink->qh_link = element->qh_link; + element->qh_rlink = 0; +} + +#else /* !__CC_SUPPORTS___INLINE */ + +void insque(void *a, void *b); +void remque(void *a); + +#endif /* __CC_SUPPORTS___INLINE */ + +#endif /* _KERNEL */ + +#endif /* !_SYS_QUEUE_H_ */ diff --git a/sys/sys/syslog.h b/sys/sys/syslog.h new file mode 100644 index 0000000..143df1f --- /dev/null +++ b/sys/sys/syslog.h @@ -0,0 +1,7 @@ +#ifndef _SYS_SYSLOG_H_ +#define _SYS_SYSLOG_H_ +/* XXX find linux equivalent */ +#define LOG_SECURITY 0 +#define LOG_NOTICE 
0 +#define LOG_DEBUG 0 +#endif /* _SYS_SYSLOG_H_ */ diff --git a/sys/sys/systm.h b/sys/sys/systm.h new file mode 100644 index 0000000..e98335e --- /dev/null +++ b/sys/sys/systm.h @@ -0,0 +1,126 @@ +#ifndef _SYS_SYSTM_H_ +#define _SYS_SYSTM_H_ + +#define CALLOUT_ACTIVE 0x0002 /* callout is currently active */ +#define CALLOUT_MPSAFE 0x0008 /* callout handler is mp safe */ + +#ifndef _WIN32 /* this is the linux version */ +/* callout support, in <sys/callout.h> on FreeBSD */ +/* + * callout support on linux module is done using timers + */ +#include <linux/timer.h> +#ifdef LINUX_24 +#include <linux/sched.h> /* jiffies definition is here in 2.4 */ +#endif +#define callout timer_list +static __inline int +callout_reset_on(struct callout *co, int ticks, void (*fn)(void *), void *arg, int cpu) +{ + co->function = (void (*)(unsigned long))fn; + co->data = (unsigned long)arg; + /* + * Linux 2.6.31 and above has add_timer_on(co, cpu), + * otherwise add_timer() always schedules a callout on the same + * CPU used the first time, so we don't need more. + * mod_timer() is used so that a still-pending callout is re-armed. + */ + mod_timer(co, jiffies + ticks); + return 0; +} + +#define callout_init(co, safe) init_timer(co) +#define callout_drain(co) del_timer(co) /* NOTE(review): del_timer() does not wait for a running handler; del_timer_sync() may be closer to a drain - confirm */ +#define callout_stop(co) del_timer(co) + +#else /* _WIN32 */ +#include <ndis.h> /* NOTE(review): the header name was stripped in this copy; ndis.h is assumed - confirm */ + +/* This is the windows part for callout support */ +struct callout { + KTIMER thetimer; + KDPC timerdpc; + int dpcinitialized; + LARGE_INTEGER duetime; +}; + +void dummynet (void*); +VOID dummynet_dpc( + __in struct _KDPC *Dpc, + __in_opt PVOID DeferredContext, + __in_opt PVOID SystemArgument1, + __in_opt PVOID SystemArgument2 + ); + +VOID ipfw_dpc( + __in struct _KDPC *Dpc, + __in_opt PVOID DeferredContext, + __in_opt PVOID SystemArgument1, + __in_opt PVOID SystemArgument2 + ); + +/* callout_reset must handle two problems: + * - dummynet() scheduler must be run always on the same processor + * because do_gettimeofday() is based on cpu performance counter, and + * _occasionally_ can leap backward in time if we query another cpu.
+ * typically this won't happen that much, and the cpu will almost always + * be the same even without the affinity restriction, but better to be sure. + * - ipfw_tick() does not have the granularity requirements of dummynet() + * but we need to pass a pointer as argument. + * + * for these reasons, if we are called for dummynet() timer, + * KeInitializeDpc is called only once as it should be, and the thread + * is forced on cpu0 (which is always present), while if we're called + * for ipfw_tick(), we re-initialize the DPC each time, using + * parameter DeferredContext to pass the needed pointer. since this + * timer is called only once a sec, this won't hurt that much. + */ +static __inline int +callout_reset_on(struct callout *co, int ticks, void (*fn)(void *), void *arg, int cpu) +{ + if(fn == &dummynet) + { + if(co->dpcinitialized == 0) + { + KeInitializeDpc(&co->timerdpc, dummynet_dpc, NULL); + KeSetTargetProcessorDpc(&co->timerdpc, cpu); + co->dpcinitialized = 1; + } + } + else + { + KeInitializeDpc(&co->timerdpc, ipfw_dpc, arg); + } + co->duetime.QuadPart = (-ticks)*10000; + KeSetTimer(&co->thetimer, co->duetime, &co->timerdpc); + return 0; +} + +static __inline void +callout_init(struct callout* co, int safe) +{ + printf("%s: initializing timer at %p\n",__FUNCTION__,co); + KeInitializeTimer(&co->thetimer); +} + +static __inline int +callout_drain(struct callout* co) +{ + BOOLEAN canceled = KeCancelTimer(&co->thetimer); + while (canceled != TRUE) + { + canceled = KeCancelTimer(&co->thetimer); + } + printf("%s: stopping timer at %p\n",__FUNCTION__,co); + return 0; +} + +static __inline int +callout_stop(struct callout* co) +{ + return callout_drain(co); +} + +#endif /* _WIN32 */ + +#endif /* _SYS_SYSTM_H_ */ diff --git a/sys/sys/taskqueue.h b/sys/sys/taskqueue.h new file mode 100644 index 0000000..43efdd5 --- /dev/null +++ b/sys/sys/taskqueue.h @@ -0,0 +1,34 @@ +#ifndef _SYS_TASKQUEUE_H_ +#define _SYS_TASKQUEUE_H_ + +/* + * Remap taskqueue to direct calls + 
*/ + +#ifdef _WIN32 +struct task { + void (*func)(void*, int); +}; +#define taskqueue_enqueue(tq, ta) (ta)->func(NULL,1) +#define TASK_INIT(a,b,c,d) do { \ + (a)->func = (c); } while (0) +#else +struct task { + void (*func)(void); +}; +#define taskqueue_enqueue(tq, ta) (ta)->func() +#define TASK_INIT(a,b,c,d) do { \ + (a)->func = (void (*)(void))c; } while (0) + +#endif +#define taskqueue_create_fast(_a, _b, _c, _d) NULL +#define taskqueue_start_threads(_a, _b, _c, _d) + +#define taskqueue_drain(_a, _b) /* XXX to be completed */ +#define taskqueue_free(_a) /* XXX to be completed */ + +#define PRI_MIN (0) /* Highest priority. */ +#define PRI_MIN_ITHD (PRI_MIN) +#define PI_NET (PRI_MIN_ITHD + 16) + +#endif /* !_SYS_TASKQUEUE_H_ */ diff --git a/tcc-0.9.25-bsd.zip b/tcc-0.9.25-bsd.zip new file mode 100644 index 0000000..06d7c37 Binary files /dev/null and b/tcc-0.9.25-bsd.zip differ diff --git a/tcc_glue.h b/tcc_glue.h new file mode 100644 index 0000000..db757ed --- /dev/null +++ b/tcc_glue.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2010 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * headers to build userland ipfw under tcc. + */ + +#ifndef _TCC_GLUE_H +#define _TCC_GLUE_H + +//#define __restrict +#define NULL ((void *)0) +typedef int size_t; +typedef unsigned char u_char; +typedef unsigned char uint8_t; +typedef unsigned char u_int8_t; +typedef unsigned short u_short; +typedef unsigned short uint16_t; +typedef unsigned short u_int16_t; +typedef int __int32_t; +typedef int int32_t; +typedef int socklen_t; +typedef int pid_t; +typedef unsigned int time_t; +typedef unsigned int uint; +typedef unsigned int u_int; +typedef unsigned int uint32_t; +typedef unsigned int u_int32_t; +typedef unsigned int gid_t; +typedef unsigned int uid_t; +typedef unsigned long u_long; +typedef unsigned long uintptr_t; +typedef long long int int64_t; +typedef unsigned long long int uint64_t; +typedef unsigned long long int u_int64_t; + +typedef uint32_t in_addr_t; +struct in_addr { + uint32_t s_addr; +}; +struct sockaddr_in { + uint8_t _sin_len; + uint8_t sin_family; + uint16_t sin_port; + struct in_addr sin_addr; + char sin_zero[8]; +}; +#define IFNAMSIZ 16 +#define INET6_ADDRSTRLEN 64 + +struct in6_addr { + union { + uint8_t __s6_addr8[16]; + uint16_t __s6_addr16[8]; + uint32_t __s6_addr32[4]; + } __u6; // _addr; /* 128-bit IP6 address */ +}; + + +#define LITTLE_ENDIAN 1234 +#define BYTE_ORDER LITTLE_ENDIAN + +/* to be revised */ +#define EX_OK 0 +#define EX_DATAERR 1 +#define EX_OSERR 2 +#define EX_UNAVAILABLE 3 +#define EX_USAGE 
4 +#define EX_NOHOST 5 + +#define EEXIST 1 +#define EINVAL 2 +#define ERANGE 3 +#define ESRCH 4 + +#define IPPROTO_IP 1 +#define IPPROTO_IPV6 2 +#define IPPROTO_RAW 100 + +#define IPTOS_LOWDELAY 100 +#define IPTOS_MINCOST 101 +#define IPTOS_RELIABILITY 102 +#define IPTOS_THROUGHPUT 103 +#define SOCK_RAW 12 +#define AF_INET 2 +#define AF_INET6 28 + +#define INADDR_ANY 0 + + +#define bcmp(src, dst, len) memcmp(src, dst, len) +#define bcopy(src, dst, len) memcpy(dst, src, len) +#define bzero(p, len) memset(p, 0, len) +#define index(s, c) strchr(s, c) + +char *strsep(char **stringp, const char *delim); + +void warn(const char *, ...); +//void warnx(const char *, ...); +#define warnx warn +void err(int, const char *, ...); +#define errx err + +uint16_t htons(uint16_t)__attribute__ ((stdcall)); +uint16_t ntohs(uint16_t)__attribute__ ((stdcall)); +uint32_t htonl(uint32_t)__attribute__ ((stdcall)); +uint32_t ntohl(uint32_t)__attribute__ ((stdcall)); +int inet_aton(const char *cp, struct in_addr *pin)__attribute__ ((stdcall));; +char * inet_ntoa(struct in_addr)__attribute__ ((stdcall));; +const char * inet_ntop(int af, const void * src, char * dst, + socklen_t size)__attribute__ ((stdcall));; +int inet_pton(int af, const char * src, void * dst)__attribute__ ((stdcall));; + +struct group { + gid_t gr_gid; + char gr_name[16]; +}; +struct passwd { + uid_t pw_uid; + char pw_name[16]; +}; + +#define getpwnam(s) (NULL) +#define getpwuid(s) (NULL) + +#define getgrnam(x) (NULL) +#define getgrgid(x) (NULL) + +int getopt(int argc, char * const argv[], const char *optstring); + +int getsockopt(int s, int level, int optname, void * optval, + socklen_t * optlen); + +int setsockopt(int s, int level, int optname, const void *optval, + socklen_t optlen); + +struct protoent { + char *p_name; /* official protocol name */ + char **p_aliases; /* alias list */ + short p_proto; /* protocol # */ +}; + +struct servent { + char *s_name; /* official service name */ + char **s_aliases; /* alias list 
*/ + short s_port; /* port # */ + char *s_proto; /* protocol to use */ +}; + +struct hostent { + char *h_name; /* official name of host */ + char **h_aliases; /* alias list */ + short h_addrtype; /* host address type */ + short h_length; /* length of address */ + char **h_addr_list; /* list of addresses */ +#define h_addr h_addr_list[0] /* address, for backward compat */ +}; + +struct hostent* gethostbyaddr(const char* addr, int len, int type)__attribute__ ((stdcall)); +struct hostent* gethostbyname(const char *name)__attribute__ ((stdcall)); + +struct protoent* getprotobynumber(int number)__attribute__ ((stdcall)); +struct protoent* getprotobyname(const char* name)__attribute__ ((stdcall)); + +struct servent* getservbyport(int port, const char* proto)__attribute__ ((stdcall)); +struct servent* getservbyname(const char* name, const char* proto) __attribute__ ((stdcall)); + +extern int optind; +extern char *optarg; + +#include + +#define WSADESCRIPTION_LEN 256 +#define WSASYS_STATUS_LEN 128 + +typedef struct WSAData { + WORD wVersion; + WORD wHighVersion; + char szDescription[WSADESCRIPTION_LEN+1]; + char szSystemStatus[WSASYS_STATUS_LEN+1]; + unsigned short iMaxSockets; + unsigned short iMaxUdpDg; + char FAR * lpVendorInfo; +} WSADATA, * LPWSADATA; + +int WSAStartup( + WORD wVersionRequested, + LPWSADATA lpWSAData + ); + +int +WSACleanup(void); + +int WSAGetLastError(); + +/* return error on process handling */ +#define pipe(f) (-1) +#define kill(p, s) (-1) +#define waitpid(w,s,o) (-1) +#define fork(x) (-1) +#define execvp(f, a) (-1) + +#define _W_INT(i) (i) +#define _WSTATUS(x) (_W_INT(x) & 0177) +#define WIFEXITED(x) (_WSTATUS(x) == 0) +#define WEXITSTATUS(x) (_W_INT(x) >> 8) +#define _WSTOPPED 0177 /* _WSTATUS if process is stopped */ +#define WIFSIGNALED(x) (_WSTATUS(x) != _WSTOPPED && _WSTATUS(x) != 0) +#define WTERMSIG(x) (_WSTATUS(x)) + +#endif /* _TCC_GLUE_H */ diff --git a/test/Makefile b/test/Makefile new file mode 100644 index 0000000..9ed47f8 --- 
/dev/null +++ b/test/Makefile @@ -0,0 +1,53 @@ +# +# $Id: Makefile 5626 2010-03-04 21:55:22Z luigi $ +# +# Makefile for building userland tests +# this is written in a form compatible with gmake + +SCHED_SRCS = test_dn_sched.c +SCHED_SRCS += dn_sched_fifo.c +SCHED_SRCS += dn_sched_wf2q.c +SCHED_SRCS += dn_sched_qfq.c +SCHED_SRCS += dn_sched_rr.c +SCHED_SRCS += dn_heap.c +SCHED_SRCS += main.c + +SCHED_OBJS=$(SCHED_SRCS:.c=.o) + +HEAP_SRCS = dn_heap.c test_dn_heap.c +HEAP_OBJS=$(HEAP_SRCS:.c=.o) + +VPATH= .:../dummynet2 + +#CFLAGS = -I../dummynet2/include -I. -Wall -Werror -O3 -DIPFW +CFLAGS = -I. -I../dummynet2/include/netinet/ipfw -DIPFW +CFLAGS += -Wall -Werror +CFLAGS += -g -O3 +TARGETS= test_sched # no test_heap by default + +all: $(TARGETS) + +test_heap : $(HEAP_OBJS) + $(CC) -o $@ $(HEAP_OBJS) + +test_sched : $(SCHED_OBJS) + $(CC) -o $@ $(SCHED_OBJS) + +$(SCHED_OBJS): dn_test.h +main.o: mylist.h + +clean: + - rm *.o $(TARGETS) *.core + +ALLSRCS = $(SCHED_SRCS) dn_test.h mylist.h \ + dn_sched.h dn_heap.h ip_dn_private.h Makefile +TMPBASE = /tmp/testXYZ +TMPDIR = $(TMPBASE)/test + +tgz: + -rm -rf $(TMPDIR) + mkdir -p $(TMPDIR) + -cp -p $(ALLSRCS) $(TMPDIR) + -(cd ..; cp -p $(ALLSRCS) $(TMPDIR)) + ls -la $(TMPDIR) + (cd $(TMPBASE); tar cvzf /tmp/test.tgz test) diff --git a/test/basic_ipfw.sh b/test/basic_ipfw.sh new file mode 100755 index 0000000..08b66f9 --- /dev/null +++ b/test/basic_ipfw.sh @@ -0,0 +1,72 @@ +#!/bin/sh + +IPFW=./ipfw/ipfw +PING=/bin/ping +RH=127.0.0.1 # remote host +R=10 # test rule number +P=1 # test pipe number + +abort() +{ +echo $* +} + +#insmod dummynet2/ipfw_mod.ko +#$IPFW show > /dev/null +#$IPFW pipe show +echo "Flushing rules, do you agree ?" +$IPFW flush + +# test_msg rule counter +clean() +{ + $IPFW delete $R 2> /dev/null + $IPFW pipe $P delete 2> /dev/null +} + +# simple counter/allow test +echo -n "counter/allow test..." 
+clean +$IPFW add $R allow icmp from any to 127.0.0.1 > /dev/null +$PING -f -c100 $RH > /dev/null +counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f3` +[ ! $counter -eq 400 ] && abort "Wrong counter $counter 400" +echo "...OK" + +# simple drop test +echo -n "deny test..." +clean +$IPFW add $R deny icmp from any to 127.0.0.1 > /dev/null +$PING -f -c10 -W 1 $RH > /dev/null +counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4` +[ ! $counter -eq 10 ] && abort "Wrong counter $counter 10" +echo "...OK" + +# pipe delay test +echo -n "pipe delay test..." +clean +$IPFW pipe $P config delay 2000ms >/dev/null +$IPFW add $R pipe $P icmp from any to $RH >/dev/null +$PING -f -c10 -W 1 $RH > /dev/null +counter1=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4` +sleep 2 +counter2=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4` +[ ! $counter1 -eq 10 ] && abort "Wrong counter $counter1 10" +[ ! $counter2 -eq 20 ] && abort "Wrong counter $counter2 20" +echo "...OK" + +# pipe bw test +echo -n "pipe bw test..." +clean +$IPFW pipe $P config bw 2Kbit/s >/dev/null +$IPFW add $R pipe $P icmp from any to $RH >/dev/null +$PING -i 0.1 -c10 -W 1 $RH > /dev/null +counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4` +[ $counter -gt 30 ] && abort "Wrong counter $counter should be < 30" +sleep 1 +counter=`$IPFW show | grep $R | head -n 1 | cut -d " " -f4` +[ $counter -gt 30 ] && abort "Wrong counter $counter should be < 30" +echo "...OK" + +# Final clean +clean diff --git a/test/dn_test.h b/test/dn_test.h new file mode 100644 index 0000000..f2a4a51 --- /dev/null +++ b/test/dn_test.h @@ -0,0 +1,157 @@ +/* + * $Id: dn_test.h 5626 2010-03-04 21:55:22Z luigi $ + * + * userspace compatibility code for dummynet schedulers + */ + +#ifndef _DN_TEST_H +#define _DN_TEST_H +#include +#include +#include +#include /* bzero, ffs, ... */ +#include /* strcmp */ +#include +#include +#include + +extern int debug; +#define ND(fmt, args...) do {} while (0) +#define D1(fmt, args...)
do {} while (0) +#define D(fmt, args...) fprintf(stderr, "%-8s " fmt "\n", \ + __FUNCTION__, ## args) +#define DX(lev, fmt, args...) do { \ + if (debug > lev) D(fmt, ## args); } while (0) + + +#define offsetof(t,m) (int)((&((t *)0L)->m)) + +#include + +/* prevent include of other system headers */ +#define _NETINET_IP_VAR_H_ /* ip_fw_args */ +#define _IPFW2_H +#define _SYS_MBUF_H_ + +enum { + DN_QUEUE, +}; + +enum { + DN_SCHED_FIFO, + DN_SCHED_WF2QP, +}; + +struct dn_id { + int type, subtype, len, id; +}; +struct dn_fs { + int par[4]; /* flowset parameters */ + + /* simulation entries. + * 'index' is not strictly necessary + * y is used for the inverse mapping , + */ + int index; + int y; /* inverse mapping */ + int base_y; /* inverse mapping */ + int next_y; /* inverse mapping */ + int n_flows; + int first_flow; + int next_flow; /* first_flow + n_flows */ + /* + * when generating, let 'cur' go from 0 to n_flows-1, + * then point to flow first_flow + cur + */ + int cur; +}; +struct dn_sch { +}; +struct dn_flow { + struct dn_id oid; + int length; + int len_bytes; + int drops; + uint64_t tot_bytes; + uint32_t flow_id; + struct list_head h; /* used by the generator */ +}; +struct dn_link { +}; + +struct ip_fw_args { +}; + +struct mbuf { + struct { + int len; + } m_pkthdr; + struct mbuf *m_nextpkt; + int flow_id; /* for testing, index of a flow */ + //int flowset_id; /* for testing, index of a flowset */ + void *cfg; /* config args */ +}; + +#define MALLOC_DECLARE(x) +#define KASSERT(x, y) do { if (!(x)) { printf y ; exit(1); } } while (0) /* print and exit only when the assertion fails */ +struct ipfw_flow_id { +}; + +typedef void * module_t; +struct _md_t { + const char *name; + int (*f)(module_t, int, void *); + void *p; +}; +typedef struct _md_t moduledata_t; +#define DECLARE_MODULE(name, b, c, d) \ + moduledata_t *_g_##name = & b +#define MODULE_DEPEND(a, b, c, d, e) + +#ifdef IPFW +#include +#include +#include +#else +struct dn_queue { + struct dn_fsk *fs; /* parent flowset.
*/ + struct dn_sch_inst *_si; /* parent sched instance. */ +}; +struct dn_schk { +}; +struct dn_fsk { + struct dn_fs fs; + struct dn_schk *sched; +}; +struct dn_sch_inst { + struct dn_schk *sched; +}; +struct dn_alg { + int type; + const char *name; + void *enqueue, *dequeue; + int q_datalen, si_datalen, schk_datalen; + int (*config)(struct dn_schk *); + int (*new_sched)(struct dn_sch_inst *); + int (*new_fsk)(struct dn_fsk *); + int (*new_queue)(struct dn_queue *q); +}; + +#endif + +#ifndef __FreeBSD__ +int fls(int); +#endif + +static inline void +mq_append(struct mq *q, struct mbuf *m) +{ + if (q->head == NULL) + q->head = m; + else + q->tail->m_nextpkt = m; + q->tail = m; + m->m_nextpkt = NULL; +} + +#endif /* _DN_TEST_H */ diff --git a/test/dynrules.sh b/test/dynrules.sh new file mode 100644 index 0000000..98f5fe6 --- /dev/null +++ b/test/dynrules.sh @@ -0,0 +1,20 @@ +#!/bin/sh +# +# 20100507 marta, quick test for dyn rules +# ./ipfw/ipfw -d show |grep \ 80 + +IPFW_MOD=dummynet2/ipfw_mod.ko +IPFW=ipfw/ipfw + +# main +# remove any previous loaded module +/sbin/rmmod ipfw_mod +/sbin/insmod ${IPFW_MOD} +echo "25" > /sys/module/ipfw_mod/parameters/dyn_ack_lifetime +${IPFW} add 1 check-state +${IPFW} add 9 allow all from any to any keep-state +${IPFW} add 10 allow all from any to onelab1.iet.unipi.it keep-state + +telnet 72.14.234.104 80 + + diff --git a/test/interpolation.c b/test/interpolation.c new file mode 100644 index 0000000..d6731f1 --- /dev/null +++ b/test/interpolation.c @@ -0,0 +1,335 @@ +#include +#include +#include + +/* gcc interpolation.c -o interpolation */ + +void +err(int eval, const char *fmt, ...) +{ +} +void +errx(int eval, const char *fmt, ...) 
+{ +} + + +#define ED_MAX_SAMPLES_NO 1000 +#define ED_MAX_LINE_LEN 128 +#define EX_DATAERR 1 +#define EX_UNAVAILABLE 3 +#define ED_TOK_DELAY "delay" +#define ED_TOK_PROB "prob" +#define ED_SEPARATORS " \t\n" +#define ED_TOK_PROFILE_NO "profile_no" + + +struct point { + double prob; /* y */ + double delay; /* x */ +}; + +struct profile { + char filename[128]; /* profile filename */ + int samples[ED_MAX_SAMPLES_NO+1]; /* may be shorter */ + int samples_no; /* actual len of samples[] */ +}; + +/* + * returns 1 if s is a non-negative number, with at least one '.' + */ +static int +is_valid_number(const char *s) +{ +#if 0 + int i, dots_found = 0; + int len = strlen(s); + + for (i = 0; i 1)) + return 0; +#endif + return 1; +} + +static int +compare_points(const void *vp1, const void *vp2) +{ + const struct point *p1 = vp1; + const struct point *p2 = vp2; + double res = 0; + + res = p1->prob - p2->prob; + if (res == 0) + res = p1->delay - p2->delay; + if (res < 0) + return -1; + else if (res > 0) + return 1; + else + return 0; +} + +#define ED_EFMT(s) 1,"error in %s at line %d: "#s,filename,lineno + +/* + * The points defined by the user are stored in the ponts structure. + * The number of user defined points is stored in points_no. + * We assume that The last point for the '1' value of the + * probability should be defined. (XXX add checks for this) + * The user defined sampling value is stored in samples_no. + * The resulting samples are in the "samples" pointer. 
+ */ +static void +interpolate_samples(struct point *p, int points_no, + int *samples, int samples_no, const char *filename) +{ + double dy; /* delta on the y axis */ + double y; /* current value of y */ + double x; /* current value of x */ + double m; /* the y slope */ + int i; /* samples index */ + int curr; /* points current index */ + + dy = 1.0/samples_no; + y = 0; + + for (i=0, curr = 0; i < samples_no; i++, y+=dy) { + /* This statment move the curr pointer to the next point + * skipping the points with the same x value. We are + * guaranteed to exit from the loop because the + * last possible value of y is stricly less than 1 + * and the last possible value of the y points is 1 */ + while ( y >= p[curr+1].prob ) curr++; + + /* compute the slope of the curve */ + m = (p[curr+1].delay - p[curr].delay) / (p[curr+1].prob - p[curr].prob); + /* compute the x value starting from the current point */ + x = p[curr].delay + (y - p[curr].prob) * m; + samples[i] = x; + } + + /* add the last sample */ + samples[i] = p[curr+1].delay; +} + +#if 0 +static void +interpolate_samples_old(struct point *points, int points_no, + int *samples, int samples_no, const char *filename) +{ + int i; /* pointer to the sampled array */ + int j = 0; /* pointer to user defined samples */ + double dy; /* delta y */ + double y; /* current value of y */ + int x; /* computed value of x */ + double m; /* slope of the line */ + double y1, x1, y2, x2; /* two points of the current line */ + + /* make sure that there are enough points. 
*/ + /* XXX Duplicated shoule be removed */ + if (points_no < 3) + errx(EX_DATAERR, "%s too few samples, need at least %d", + filename, 3); + + qsort(points, points_no, sizeof(struct point), compare_points); + + samples_no--; + dy = 1.0/samples_no; + printf("\nsamples no is %d dy is %f ", samples_no, dy); + + /* start with the first two points */ + y1 = points[j].prob * samples_no; + x1 = points[j].delay; + j++; + y2 = points[j].prob * samples_no; + x2 = points[j].delay; + + m = (y2-y1)/(x2-x1); + printf("\nStart"); + printf("\n\tCurrent points x1 y1 %f %f next point x2y2 %f %f m %f\n", + x1, y1, x2, y2, m); + + y = 0; + x = x1; + + for(i=0; i < samples_no+1; i++, y+=dy) { + printf("\ni:%d j:%d y:%f real y:%f", i, j, y, y*samples_no); + if ( (y*samples_no) >= y2 ) { /* move to the next point */ + j++; + if ( j >= points_no ) { + printf("\n\tNo more points, exit with j: %d i: %d and y:%f %f\n", + j, i, y, (y*samples_no)); + break; /* no more user defined points */ + } + /* load a new point */ + y1 = y2; + x1 = x2; + y2 = points[j].prob * samples_no; + x2 = points[j].delay; + m = (y2-y1)/(x2-x1); + if (x1==x2) { /* m = infinito */ + m = -1; + x = x2; + } + /* very small m problem */ + printf ("\ndelta %f\n", (y1 - y2)); + if (abs(y1 - y2) < 0.00001) { /* m = 0 XXX Should this magic number depend on samples_no ? 
*/ + m = 0; + x = x2; + } + printf("\n\tCurrent points x1 y1 %f %f next point x2y2 %f %f (%f/%f)=m \n", + x1, y1, x2, y2, (y2-y1), (x2-x1), m); + } + printf("\n\tcompute step y %f x[%d]=%d ", + y, i, x); + if ((m != -1) && ( m != 0 )) { + x = x + (dy * samples_no)/m; + } + samples[i] = x; + printf(" dy %f x new %d\n", dy*samples_no, x); + printf(" m %f (dy * samples_no)/m %f \n", m, (dy * samples_no)/m); + } + + x = samples[i-1]; + printf("Finish i is %d samples_no is %d\n", i, samples_no); + /* The last point has a probability less than 1 */ + for (; i <= samples_no; i++) + samples[i] = x; +} +#endif + +static void +load_profile(struct profile *p) +{ + FILE *f; /* file handler */ + char line[ED_MAX_LINE_LEN]; + int lineno = 0; + int do_points = 0; + int delay_first = -1; + int i; + + struct point points[1000]; /* MAX_POINTS_NO */ + int points_no = 0; + + char *filename = p->filename; + f = fopen(filename, "r"); + if (f == NULL) { + err(EX_UNAVAILABLE, "fopen: %s", filename); + } + + + while (fgets(line, ED_MAX_LINE_LEN, f)) { /* read commands */ + char *s, *cur = line, *name = NULL, *arg = NULL; + + ++lineno; + + /* parse the line */ + while (cur) { + s = strsep(&cur, ED_SEPARATORS); + if (s == NULL || *s == '#') + break; + if (*s == '\0') + continue; + if (arg) + errx(ED_EFMT("too many arguments")); + if (name == NULL) + name = s; + else + arg = s; + } + + if (name == NULL) + continue; + + if (!strcasecmp(name, ED_TOK_DELAY)) { + if (do_points) + errx(ED_EFMT("duplicated token: %s"), name); + delay_first = 1; + do_points = 1; + continue; + } else if (!strcasecmp(name, ED_TOK_PROB)) { + if (do_points) + errx(ED_EFMT("duplicated token: %s"), name); + delay_first = 0; + do_points = 1; + continue; + } + if (!strcasecmp(name, ED_TOK_PROFILE_NO)) { + int p_no = atof(arg); + if (p_no <= 0) { + p_no = 100; + printf("invalid interpolation samples, using %d\n", + p_no); + } + if (p_no > ED_MAX_SAMPLES_NO) { + p_no = ED_MAX_SAMPLES_NO; + printf("invalid interpolation 
samples, using %d\n", + p_no); + } + + p->samples_no = p_no; + continue; + + } else if (do_points) { + if (!is_valid_number(name) || !is_valid_number(arg)) + errx(ED_EFMT("invalid point found")); + if (delay_first) { + points[points_no].delay = atof(name); + points[points_no].prob = atof(arg); + } else { + points[points_no].delay = atof(arg); + points[points_no].prob = atof(name); + } + if (points[points_no].prob > 1.0) + errx(ED_EFMT("probability greater than 1.0")); + ++points_no; + if (points_no >= 1000) break; /* points[] is full: stop reading rather than overflow it */ + continue; + } else { + errx(ED_EFMT("unrecognised command '%s'"), name); + } + } + + for(i=0; i < p->samples_no; i++) { + p->samples[i] = 666; + } + + /* This code assume the user define a value of X for the sampling value, + * and that: + * - the value stored in the emulator structure is X; + * - the allocated structure for the samples is X+1; + */ + interpolate_samples(points, points_no, p->samples, p->samples_no, filename); + + // User defined samples + printf("\nLoaded %d points:\n", points_no); + for(i=0; i < points_no; i++) { + printf("%f %f\n", points[i].prob, points[i].delay); + } + printf("\n"); + printf("The sample value is %d \n", p->samples_no); + +} + +int main(int argc, char **argv) +{ + if (argc < 2) { + printf("Usage: ./interpolation \n"); + return -1; + } + + char *filename; + filename = argv[1]; + + struct profile p = { .samples_no = 100 }; /* zero the rest; 100 is load_profile's fallback sample count */ + int i; + + strncpy(p.filename, filename, sizeof(p.filename) - 1); /* last byte stays '\0' from the initializer */ + load_profile(&p); + printf("-----------\n"); + for (i=0; i<=p.samples_no; i++) + printf("%d %d\n", i, p.samples[i]); + printf("-----------\n"); + return 0; +} diff --git a/test/main.c b/test/main.c new file mode 100644 index 0000000..85fc621 --- /dev/null +++ b/test/main.c @@ -0,0 +1,636 @@ +/* + * $Id: main.c 5626 2010-03-04 21:55:22Z luigi $ + * + * Testing program for schedulers + * + * The framework include a simple controller which, at each + * iteration, decides whether we can enqueue and/or dequeue.
+ * Then the mainloop runs the required number of tests, + * keeping track of statistics. + */ + +#include "dn_test.h" + +struct q_list { + struct list_head h; +}; + +struct cfg_s { + int ac; + char * const *av; + + const char *name; + int loops; + struct timeval time; + + /* running counters */ + uint32_t _enqueue; + uint32_t drop; + uint32_t pending; + uint32_t dequeue; + + /* generator parameters */ + int th_min, th_max; + int maxburst; + int lmin, lmax; /* packet len */ + int flows; /* number of flows */ + int flowsets; /* number of flowsets */ + int wsum; /* sum of weights of all flows */ + int max_y; /* max random number in the generation */ + int cur_y, cur_fs; /* used in generation, between 0 and max_y - 1 */ + const char *fs_config; /* flowset config */ + int can_dequeue; + int burst; /* count of packets sent in a burst */ + struct mbuf *tosend; /* packet to send -- also flag to enqueue */ + + struct mbuf *freelist; + + struct mbuf *head, *tail; /* a simple tailq */ + + /* scheduler hooks */ + int (*enq)(struct dn_sch_inst *, struct dn_queue *, + struct mbuf *); + struct mbuf * (*deq)(struct dn_sch_inst *); + /* size of the three fields including sched-specific areas */ + int schk_len; + int q_len; /* size of a queue including sched-fields */ + int si_len; /* size of a sch_inst including sched-fields */ + char *q; /* array of flow queues */ + /* use a char* because size is variable */ + struct dn_fsk *fs; /* array of flowsets */ + struct dn_sch_inst *si; + struct dn_schk *sched; + + /* generator state */ + int state; /* 0 = going up, 1: going down */ + + /* + * We keep lists for each backlog level, and always serve + * the one with shortest backlog. llmask contains a bitmap + * of lists, and ll are the heads of the lists. The last + * entry (BACKLOG) contains all entries considered 'full' + * XXX to optimize things, entry i could contain queues with + * 2^{i-1}+1 .. 2^i entries. 
+ */ +#define BACKLOG 30 + uint32_t llmask; + struct list_head ll[BACKLOG + 10]; +}; + +/* FI2Q and Q2FI convert from flow_id to dn_queue and back. + * We cannot easily use pointer arithmetic because it is variable size. + */ +#define FI2Q(c, i) ((struct dn_queue *)((c)->q + (c)->q_len * (i))) +#define Q2FI(c, q) (((char *)(q) - (c)->q)/(c)->q_len) + +int debug = 0; + +struct dn_parms dn_cfg; + +static void controller(struct cfg_s *c); + +/* release a packet: put the mbuf in the freelist, and the queue in + * the bucket. + */ +int +drop(struct cfg_s *c, struct mbuf *m) +{ + struct dn_queue *q; + int i; + + c->drop++; + q = FI2Q(c, m->flow_id); + i = q->ni.length; // XXX or ffs... + + ND("q %p id %d current length %d", q, m->flow_id, i); + if (i < BACKLOG) { + struct list_head *h = &q->ni.h; + c->llmask &= ~(1<<(i+1)); + c->llmask |= (1<<(i)); + list_del(h); + list_add_tail(h, &c->ll[i]); + } + m->m_nextpkt = c->freelist; + c->freelist = m; + return 0; +} + +/* enqueue returns non-zero when the packet is rejected; mainloop() then drops it */ +static int +enqueue(struct cfg_s *c, void *_m) +{ + struct mbuf *m = _m; + if (c->enq) + return c->enq(c->si, FI2Q(c, m->flow_id), m); + if (c->head == NULL) + c->head = m; + else + c->tail->m_nextpkt = m; + c->tail = m; + return 0; /* default - success */ +} + +/* dequeue returns NON-NULL when a packet is available */ +static void * +dequeue(struct cfg_s *c) +{ + struct mbuf *m; + if (c->deq) + return c->deq(c->si); + if ((m = c->head)) { + m = c->head; + c->head = m->m_nextpkt; + m->m_nextpkt = NULL; + } + return m; +} + +static int +mainloop(struct cfg_s *c) +{ + int i; + struct mbuf *m; + + for (i=0; i < c->loops; i++) { + /* implement hysteresis */ + controller(c); + DX(3, "loop %d enq %d send %p rx %d", + i, c->_enqueue, c->tosend, c->can_dequeue); + if ( (m = c->tosend) ) { + c->_enqueue++; + if (enqueue(c, m)) { + drop(c, m); + ND("loop %d enqueue fail", i ); + } else { + ND("enqueue ok"); + c->pending++; + } + } + if (c->can_dequeue) { +
c->dequeue++; + if ((m = dequeue(c))) { + c->pending--; + drop(c, m); + c->drop--; /* compensate */ + } + } + } + DX(1, "mainloop ends %d", i); + return 0; +} + +int +dump(struct cfg_s *c) +{ + int i; + struct dn_queue *q; + + for (i=0; i < c->flows; i++) { + q = FI2Q(c, i); + DX(1, "queue %4d tot %10lld", i, q->ni.tot_bytes); + } + DX(1, "done %d loops\n", c->loops); + return 0; +} + +/* interpret a number in human form */ +static long +getnum(const char *s, char **next, const char *key) +{ + char *end = NULL; + long l; + + if (next) /* default */ + *next = NULL; + if (s && *s) { + DX(3, "token is <%s> %s", s, key ? key : "-"); + l = strtol(s, &end, 0); + } else { + DX(3, "empty string"); + l = -1; + } + if (l < 0) { + DX(2, "invalid %s for %s", s ? s : "NULL", (key ? key : "") ); + return 0; // invalid + } + if (!end || !*end) + return l; + if (*end == 'n') + l = -l; /* multiply by n */ + else if (*end == 'K') + l = l*1000; + else if (*end == 'M') + l = l*1000000; + else if (*end == 'k') + l = l*1024; + else if (*end == 'm') + l = l*1024*1024; + else if (*end == 'w') + ; + else {/* not recognized */ + D("suffix %s for %s, next %p", end, key, next); + end--; + } + end++; + DX(3, "suffix now %s for %s, next %p", end, key, next); + if (next && *end) { + DX(3, "setting next to %s for %s", end, key); + *next = end; + } + return l; +} + +/* + * flowsets are a comma-separated list of + * weight:maxlen:flows + * indicating how many flows are hooked to that fs. + * Both weight and range can be min-max-steps. + * In a first pass we just count the number of flowsets and flows, + * in a second pass we complete the setup. + */ +static void +parse_flowsets(struct cfg_s *c, const char *fs, int pass) +{ + char *s, *cur, *next; + int n_flows = 0, n_fs = 0, wsum = 0; + int i, j; + struct dn_fs *prev = NULL; + + DX(3, "--- pass %d flows %d flowsets %d", pass, c->flows, c->flowsets); + if (pass == 0) + c->fs_config = fs; + s = c->fs_config ? 
strdup(c->fs_config) : NULL; + if (s == NULL) { + if (pass == 0) + D("no fsconfig"); + return; + } + for (next = s; (cur = strsep(&next, ","));) { + char *p = NULL; + int w, w_h, w_steps, wi; + int len, len_h, l_steps, li; + int flows; + + w = getnum(strsep(&cur, ":"), &p, "weight"); + if (w <= 0) + w = 1; + w_h = p ? getnum(p+1, &p, "weight_max") : w; + w_steps = p ? getnum(p+1, &p, "w_steps") : (w_h == w ?1:2); + len = getnum(strsep(&cur, ":"), &p, "len"); + if (len <= 0) + len = 1000; + len_h = p ? getnum(p+1, &p, "len_max") : len; + l_steps = p ? getnum(p+1, &p, "l_steps") : (len_h == len ? 1 : 2); + flows = getnum(strsep(&cur, ":"), NULL, "flows"); + if (flows == 0) + flows = 1; + DX(4, "weight %d..%d (%d) len %d..%d (%d) flows %d", + w, w_h, w_steps, len, len_h, l_steps, flows); + if (w == 0 || w_h < w || len == 0 || len_h < len || + flows == 0) { + DX(4,"wrong parameters %s", fs); + return; + } + n_flows += flows * w_steps * l_steps; + for (i = 0; i < w_steps; i++) { + wi = w + ((w_h - w)* i)/(w_steps == 1 ? 1 : (w_steps-1)); + for (j = 0; j < l_steps; j++, n_fs++) { + struct dn_fs *fs = &c->fs[n_fs].fs; // tentative + int x; + + li = len + ((len_h - len)* j)/(l_steps == 1 ? 1 : (l_steps-1)); + x = (wi*2048)/li; + DX(3, "----- fs %4d weight %4d lmax %4d X %4d flows %d", + n_fs, wi, li, x, flows); + if (pass == 0) + continue; + if (c->fs == NULL || c->flowsets <= n_fs) { + D("error in number of flowsets"); + return; + } + wsum += wi * flows; + fs->par[0] = wi; + fs->par[1] = li; + fs->index = n_fs; + fs->n_flows = flows; + fs->cur = fs->first_flow = prev==NULL ? 0 : prev->next_flow; + fs->next_flow = fs->first_flow + fs->n_flows; + fs->y = x * flows; + fs->base_y = (prev == NULL) ? 0 : prev->next_y; + fs->next_y = fs->base_y + fs->y; + prev = fs; + } + } + } + c->max_y = prev ? 
prev->base_y + prev->y : 0; + c->flows = n_flows; + c->flowsets = n_fs; + c->wsum = wsum; + if (pass == 0) + return; + + /* now link all flows to their parent flowsets */ + DX(1,"%d flows on %d flowsets max_y %d", c->flows, c->flowsets, c->max_y); + for (i=0; i < c->flowsets; i++) { + struct dn_fs *fs = &c->fs[i].fs; + DX(1, "fs %3d w %5d l %4d flow %5d .. %5d y %6d .. %6d", + i, fs->par[0], fs->par[1], + fs->first_flow, fs->next_flow, + fs->base_y, fs->next_y); + for (j = fs->first_flow; j < fs->next_flow; j++) { + struct dn_queue *q = FI2Q(c, j); + q->fs = &c->fs[i]; + } + } +} + +static int +init(struct cfg_s *c) +{ + int i; + int ac = c->ac; + char * const *av = c->av; + + c->si_len = sizeof(struct dn_sch_inst); + c->q_len = sizeof(struct dn_queue); + moduledata_t *mod = NULL; + struct dn_alg *p = NULL; + + c->th_min = 0; + c->th_max = -20;/* 20 packets per flow */ + c->lmin = c->lmax = 1280; /* packet len */ + c->flows = 1; + c->flowsets = 1; + c->name = "null"; + ac--; av++; + while (ac > 1) { + if (!strcmp(*av, "-n")) { + c->loops = getnum(av[1], NULL, av[0]); + } else if (!strcmp(*av, "-d")) { + debug = atoi(av[1]); + } else if (!strcmp(*av, "-alg")) { + extern moduledata_t *_g_dn_fifo; + extern moduledata_t *_g_dn_wf2qp; + extern moduledata_t *_g_dn_rr; + extern moduledata_t *_g_dn_qfq; +#ifdef WITH_KPS + extern moduledata_t *_g_dn_kps; +#endif + if (!strcmp(av[1], "rr")) + mod = _g_dn_rr; + else if (!strcmp(av[1], "wf2qp")) + mod = _g_dn_wf2qp; + else if (!strcmp(av[1], "fifo")) + mod = _g_dn_fifo; + else if (!strcmp(av[1], "qfq")) + mod = _g_dn_qfq; +#ifdef WITH_KPS + else if (!strcmp(av[1], "kps")) + mod = _g_dn_kps; +#endif + else + mod = NULL; + c->name = mod ? 
mod->name : "NULL"; + DX(3, "using scheduler %s", c->name); + } else if (!strcmp(*av, "-len")) { + c->lmin = getnum(av[1], NULL, av[0]); + c->lmax = c->lmin; + DX(3, "setting max to %d", c->th_max); + } else if (!strcmp(*av, "-burst")) { + c->maxburst = getnum(av[1], NULL, av[0]); + DX(3, "setting max to %d", c->th_max); + } else if (!strcmp(*av, "-qmax")) { + c->th_max = getnum(av[1], NULL, av[0]); + DX(3, "setting max to %d", c->th_max); + } else if (!strcmp(*av, "-qmin")) { + c->th_min = getnum(av[1], NULL, av[0]); + DX(3, "setting min to %d", c->th_min); + } else if (!strcmp(*av, "-flows")) { + c->flows = getnum(av[1], NULL, av[0]); + DX(3, "setting flows to %d", c->flows); + } else if (!strcmp(*av, "-flowsets")) { + parse_flowsets(c, av[1], 0); + DX(3, "setting flowsets to %d", c->flowsets); + } else { + D("option %s not recognised, ignore", *av); + } + ac -= 2; av += 2; + } + if (c->maxburst <= 0) + c->maxburst = 1; + if (c->loops <= 0) + c->loops = 1; + if (c->flows <= 0) + c->flows = 1; + if (c->flowsets <= 0) + c->flowsets = 1; + if (c->lmin <= 0) + c->lmin = 1; + if (c->lmax <= 0) + c->lmax = 1; + /* multiply by N */ + if (c->th_min < 0) + c->th_min = c->flows * -c->th_min; + if (c->th_max < 0) + c->th_max = c->flows * -c->th_max; + if (c->th_max <= c->th_min) + c->th_max = c->th_min + 1; + if (mod) { + p = mod->p; + DX(3, "using module %s f %p p %p", mod->name, mod->f, mod->p); + DX(3, "modname %s ty %d", p->name, p->type); + c->enq = p->enqueue; + c->deq = p->dequeue; + c->si_len += p->si_datalen; + c->q_len += p->q_datalen; + c->schk_len += p->schk_datalen; + } + /* allocate queues, flowsets and one scheduler */ + c->q = calloc(c->flows, c->q_len); + c->fs = calloc(c->flowsets, sizeof(struct dn_fsk)); + c->si = calloc(1, c->si_len); + c->sched = calloc(c->flows, c->schk_len); + if (c->q == NULL || c->fs == NULL) { + D("error allocating memory for flows"); + exit(1); + } + c->si->sched = c->sched; + if (p) { + if (p->config) + p->config(c->sched); + if 
(p->new_sched) + p->new_sched(c->si); + } + /* parse_flowsets links queues to their flowsets */ + parse_flowsets(c, av[1], 1); + /* complete the work calling new_fsk */ + for (i = 0; i < c->flowsets; i++) { + if (c->fs[i].fs.par[1] == 0) + c->fs[i].fs.par[1] = 1000; /* default pkt len */ + c->fs[i].sched = c->sched; + if (p && p->new_fsk) + p->new_fsk(&c->fs[i]); + } + + /* initialize the lists for the generator, and put + * all flows in the list for backlog = 0 + */ + for (i=0; i <= BACKLOG+5; i++) + INIT_LIST_HEAD(&c->ll[i]); + + for (i = 0; i < c->flows; i++) { + struct dn_queue *q = FI2Q(c, i); + if (q->fs == NULL) + q->fs = &c->fs[0]; /* XXX */ + q->_si = c->si; + if (p && p->new_queue) + p->new_queue(q); + INIT_LIST_HEAD(&q->ni.h); + list_add_tail(&q->ni.h, &c->ll[0]); + } + c->llmask = 1; + return 0; +} + + +int +main(int ac, char *av[]) +{ + struct cfg_s c; + struct timeval end; + double ll; + int i; + char msg[40]; + + bzero(&c, sizeof(c)); + c.ac = ac; + c.av = av; + init(&c); + gettimeofday(&c.time, NULL); + mainloop(&c); + gettimeofday(&end, NULL); + end.tv_sec -= c.time.tv_sec; + end.tv_usec -= c.time.tv_usec; + if (end.tv_usec < 0) { + end.tv_usec += 1000000; + end.tv_sec--; + } + c.time = end; + ll = end.tv_sec*1000000 + end.tv_usec; + ll *= 1000; /* convert to nanoseconds */ + ll /= c._enqueue; + sprintf(msg, "1::%d", c.flows); + D("%-8s n %d %d time %d.%06d %8.3f qlen %d %d flows %s drops %d", + c.name, c._enqueue, c.loops, + (int)c.time.tv_sec, (int)c.time.tv_usec, ll, + c.th_min, c.th_max, + c.fs_config ? 
c.fs_config : msg, c.drop); + dump(&c); + DX(1, "done ac %d av %p", ac, av); + for (i=0; i < ac; i++) + DX(1, "arg %d %s", i, av[i]); + return 0; +} + +/* + * The controller decides whether in this iteration we should send + * (the packet is in c->tosend) and/or receive (flag c->can_dequeue) + */ +static void +controller(struct cfg_s *c) +{ + struct mbuf *m; + struct dn_fs *fs; + int flow_id; + + /* histeresis between max and min */ + if (c->state == 0 && c->pending >= c->th_max) + c->state = 1; + else if (c->state == 1 && c->pending <= c->th_min) + c->state = 0; + ND(1, "state %d pending %2d", c->state, c->pending); + c->can_dequeue = c->state; + c->tosend = NULL; + if (c->state) + return; + + if (1) { + int i; + struct dn_queue *q; + struct list_head *h; + + i = ffs(c->llmask) - 1; + if (i < 0) { + DX(2, "no candidate"); + c->can_dequeue = 1; + return; + } + h = &c->ll[i]; + ND(1, "backlog %d p %p prev %p next %p", i, h, h->prev, h->next); + q = list_first_entry(h, struct dn_queue, ni.h); + list_del(&q->ni.h); + flow_id = Q2FI(c, q); + DX(2, "extracted flow %p %d backlog %d", q, flow_id, i); + if (list_empty(h)) { + ND(2, "backlog %d empty", i); + c->llmask &= ~(1<<i); + } + list_add_tail(&q->ni.h, h+1); + ND(1, " after %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next); + if (i < BACKLOG) { + ND(2, "backlog %d full", i+1); + c->llmask |= 1<<(1+i); + } + fs = &q->fs->fs; + c->cur_fs = q->fs - c->fs; + fs->cur = flow_id; + } else { + /* XXX this does not work ? 
*/ + /* now decide whom to send the packet, and the length */ + /* lookup in the flow table */ + if (c->cur_y >= c->max_y) { /* handle wraparound */ + c->cur_y = 0; + c->cur_fs = 0; + } + fs = &c->fs[c->cur_fs].fs; + flow_id = fs->cur++; + if (fs->cur >= fs->next_flow) + fs->cur = fs->first_flow; + c->cur_y++; + if (c->cur_y >= fs->next_y) + c->cur_fs++; + } + + /* construct a packet */ + if (c->freelist) { + m = c->tosend = c->freelist; + c->freelist = c->freelist->m_nextpkt; + } else { + m = c->tosend = calloc(1, sizeof(struct mbuf)); + } + if (m == NULL) + return; + + m->cfg = c; + m->m_nextpkt = NULL; + m->m_pkthdr.len = fs->par[1]; // XXX maxlen + m->flow_id = flow_id; + + ND(2,"y %6d flow %5d fs %3d weight %4d len %4d", + c->cur_y, m->flow_id, c->cur_fs, + fs->par[0], m->m_pkthdr.len); + +} + +/* +Packet allocation: +to achieve a distribution that matches weights, for each X=w/lmax class +we should generate a number of packets proportional to Y = X times the number +of flows in the class. +So we construct an array with the cumulative distribution of Y's, +and use it to identify the flow via inverse mapping (if the Y's are +not too many we can use an array for the lookup). In practice, +each flow will have X entries [virtually] pointing to it. + +*/ diff --git a/test/memory_leak.sh b/test/memory_leak.sh new file mode 100644 index 0000000..9bdf093 --- /dev/null +++ b/test/memory_leak.sh @@ -0,0 +1,26 @@ +#!/bin/sh +# this script execute N times the command CMD +# collecting the memory usage on a file. +# The value of the Dirty memory should not increase +# between tests. 
+ +BASE_NAME=ipfw_r5808_ +N=10000 +CMD1="/sbin/insmod ../dummynet2/ipfw_mod.ko" +CMD2="/sbin/rmmod ipfw_mod" + +# main +# remove any previous loaded module +/sbin/rmmod ipfw_mod + +# pre + +for n in `seq $N`; do + $CMD1 + $CMD2 + [ $n = 10 ] && cat /proc/meminfo > /tmp/${BASE_NAME}_${n} + [ $n = 100 ] && cat /proc/meminfo > /tmp/${BASE_NAME}_${n} + [ $n = 1000 ] && cat /proc/meminfo > /tmp/${BASE_NAME}_${n} +done; + +# post diff --git a/test/mylist.h b/test/mylist.h new file mode 100644 index 0000000..b546fc2 --- /dev/null +++ b/test/mylist.h @@ -0,0 +1,49 @@ +/* + * $Id: mylist.h 5626 2010-03-04 21:55:22Z luigi $ + * + * linux-like bidirectional lists + */ + +#ifndef _MYLIST_H +#define _MYLIST_H +struct list_head { + struct list_head *prev, *next; +}; + +#define INIT_LIST_HEAD(l) do { (l)->prev = (l)->next = (l); } while (0) +#define list_empty(l) ( (l)->next == l ) +static inline void +__list_add(struct list_head *o, struct list_head *prev, + struct list_head *next) +{ + next->prev = o; + o->next = next; + o->prev = prev; + prev->next = o; +} + +static inline void +list_add_tail(struct list_head *o, struct list_head *head) +{ + __list_add(o, head->prev, head); +} + +#define list_first_entry(pL, ty, member) \ + (ty *)((char *)((pL)->next) - offsetof(ty, member)) + +static inline void +__list_del(struct list_head *prev, struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +static inline void +list_del(struct list_head *entry) +{ + ND("called on %p", entry); + __list_del(entry->prev, entry->next); + entry->next = entry->prev = NULL; +} + +#endif /* _MYLIST_H */ diff --git a/test/profile_bench1 b/test/profile_bench1 new file mode 100644 index 0000000..797650f --- /dev/null +++ b/test/profile_bench1 @@ -0,0 +1,26 @@ +profile_no 100 +delay prob +207 0.000264 +255 0.034117 +270 0.072280 +279 0.106749 +288 0.148604 +298 0.184304 +302 0.202194 +353 0.384541 +423 0.588842 +510 0.782126 +516 0.800970 +545 0.845706 +553 0.861411 +573 0.889430 +586 
0.912117 +620 0.920003 +661 0.938308 +695 0.944191 +740 0.949112 +765 0.952598 +848 0.957109 +1379 0.983768 +1555 0.983778 +1649 1 diff --git a/test/profile_bench2 b/test/profile_bench2 new file mode 100644 index 0000000..c733868 --- /dev/null +++ b/test/profile_bench2 @@ -0,0 +1,7 @@ +samples 10 +delay prob +0 0 +250 0 +250 0.5 +500 0.5 +500 1 diff --git a/test/profile_bench3 b/test/profile_bench3 new file mode 100644 index 0000000..5d1722e --- /dev/null +++ b/test/profile_bench3 @@ -0,0 +1,5 @@ +profile_no 100 +delay prob +0 0 +50 0.5 +100 1 diff --git a/test/test_dn_heap.c b/test/test_dn_heap.c new file mode 100644 index 0000000..7d3dc05 --- /dev/null +++ b/test/test_dn_heap.c @@ -0,0 +1,162 @@ +/*- + * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Userland code for testing binary heaps and hash tables + * + * $Id: test_dn_heap.c 6131 2010-04-22 15:37:36Z svn_panicucci $ + */ + +#include <sys/cdefs.h> +#include <sys/param.h> + +#include <stdio.h> +#include <strings.h> +#include <stdlib.h> +#include "dn_test.h" +#include "dn_heap.h" +#define log(x, arg...) fprintf(stderr, ## arg) +#define panic(x...) fprintf(stderr, ## x), exit(1) + +#include <string.h> + +struct x { + struct x *ht_link; + char buf[0]; +}; + +uint32_t hf(uintptr_t key, int flags, void *arg) +{ + return (flags & DNHT_KEY_IS_OBJ) ? + ((struct x *)key)->buf[0] : *(char *)key; +} + +int matchf(void *obj, uintptr_t key, int flags, void *arg) +{ + char *s = (flags & DNHT_KEY_IS_OBJ) ? 
+ ((struct x *)key)->buf : (char *)key; + return (strcmp(((struct x *)obj)->buf, s) == 0); +} + +void *newfn(uintptr_t key, int flags, void *arg) +{ + char *s = (char *)key; + struct x *p = malloc(sizeof(*p) + 1 + strlen(s)); + if (p) + strcpy(p->buf, s); + return p; +} + +char *strings[] = { + "undici", "unico", "doppio", "devoto", + "uno", "due", "tre", "quattro", "cinque", "sei", + "uno", "due", "tre", "quattro", "cinque", "sei", + NULL, +}; + +int doprint(void *_x, void *arg) +{ + struct x *x = _x; + printf("found element <%s>\n", x->buf); + return (int)arg; +} + +static void +test_hash() +{ + char **p; + struct dn_ht *h; + uintptr_t x = 0; + uintptr_t x1 = 0; + + /* first, find and allocate */ + h = dn_ht_init(NULL, 10, 0, hf, matchf, newfn); + + for (p = strings; *p; p++) { + dn_ht_find(h, (uintptr_t)*p, DNHT_INSERT, NULL); + } + dn_ht_scan(h, doprint, 0); + printf("/* second -- find without allocate */\n"); + h = dn_ht_init(NULL, 10, 0, hf, matchf, NULL); + for (p = strings; *p; p++) { + void **y = newfn((uintptr_t)*p, 0, NULL); + if (x == 0) + x = (uintptr_t)y; + else { + if (x1 == 0) + x1 = (uintptr_t)*p; + } + dn_ht_find(h, (uintptr_t)y, DNHT_INSERT | DNHT_KEY_IS_OBJ, NULL); + } + dn_ht_scan(h, doprint, 0); + printf("remove %p gives %p\n", (void *)x, + dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL)); + printf("remove %p gives %p\n", (void *)x, + dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL)); + printf("remove %p gives %p\n", (void *)x, + dn_ht_find(h, x1, DNHT_REMOVE, NULL)); + printf("remove %p gives %p\n", (void *)x, + dn_ht_find(h, x1, DNHT_REMOVE, NULL)); + dn_ht_scan(h, doprint, 0); +} + +int +main(int argc, char *argv[]) +{ + struct dn_heap h; + int i, n, n2, n3; + + test_hash(); + return 0; + + /* n = elements, n2 = cycles */ + n = (argc > 1) ? atoi(argv[1]) : 0; + if (n <= 0 || n > 1000000) + n = 100; + n2 = (argc > 2) ? atoi(argv[2]) : 0; + if (n2 <= 0) + n = 1000000; + n3 = (argc > 3) ? 
atoi(argv[3]) : 0; + bzero(&h, sizeof(h)); + heap_init(&h, n, -1); + while (n2-- > 0) { + uint64_t prevk = 0; + for (i=0; i < n; i++) + heap_insert(&h, n3 ? n-i: random(), (void *)(100+i)); + + for (i=0; h.elements > 0; i++) { + uint64_t k = h.p[0].key; + if (k < prevk) + panic("wrong sequence\n"); + prevk = k; + if (0) + printf("%d key %llu, val %p\n", + i, h.p[0].key, h.p[0].object); + heap_extract(&h, NULL); + } + } + return 0; +} diff --git a/test/test_dn_sched.c b/test/test_dn_sched.c new file mode 100644 index 0000000..65bbf18 --- /dev/null +++ b/test/test_dn_sched.c @@ -0,0 +1,89 @@ +/* + * $Id: test_dn_sched.c 5626 2010-03-04 21:55:22Z luigi $ + * + * library functions for userland testing of dummynet schedulers + */ + +#include "dn_test.h" + +void +m_freem(struct mbuf *m) +{ + printf("free %p\n", m); +} + +int +dn_sched_modevent(module_t mod, int cmd, void *arg) +{ + return 0; +} + +void +dn_free_pkts(struct mbuf *m) +{ + struct mbuf *x; + while ( (x = m) ) { + m = m->m_nextpkt; + m_freem(x); + } +} + +int +dn_delete_queue(void *_q, void *do_free) +{ + struct dn_queue *q = _q; + if (q->mq.head) + dn_free_pkts(q->mq.head); + free(q); + return 0; +} + +/* + * This is a simplified function for testing purposes, which does + * not implement statistics or random loss. + * Enqueue a packet in q, subject to space and queue management policy + * (whose parameters are in q->fs). + * Update stats for the queue and the scheduler. + * Return 0 on success, 1 on drop. The packet is consumed anyways. 
+ */ +int +dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) +{ + if (drop) + goto drop; + if (q->ni.length >= 200) + goto drop; + mq_append(&q->mq, m); + q->ni.length++; + q->ni.tot_bytes += m->m_pkthdr.len; + return 0; + +drop: + q->ni.drops++; + return 1; +} + +int +ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg) +{ + if (*v < lo) { + *v = dflt; + } else if (*v > hi) { + *v = hi; + } + return *v; +} + +#ifndef __FreeBSD__ +int +fls(int mask) +{ + int bit; + + if (mask == 0) + return (0); + for (bit = 1; mask != 1; bit++) + mask = (unsigned int)mask >> 1; + return (bit); +} +#endif