X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=datapath%2Fdatapath.c;h=c2ffc4a16f25bb4bee2a4131a1811d7ea182f523;hb=176265ed070cb38314857d0e7c08269d73967e10;hp=f2aa8625a90d16069fbf21782435d24083bc630d;hpb=de6e0a76a3f14047a8bf84654de428ac0f33878b;p=sliver-openvswitch.git

diff --git a/datapath/datapath.c b/datapath/datapath.c
index f2aa8625a..c2ffc4a16 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -6,6 +6,7 @@
 
 /* Functions for managing the dp interface/device. */
 
+#include <linux/init.h>
 #include <linux/module.h>
 #include <linux/if_arp.h>
 #include <linux/if_bridge.h>
@@ -13,11 +14,12 @@
 #include <linux/in.h>
 #include <net/genetlink.h>
 #include <linux/ip.h>
+#include <linux/delay.h>
 #include <linux/etherdevice.h>
 #include <linux/kernel.h>
+#include <linux/kthread.h>
 #include <linux/mutex.h>
 #include <linux/rtnetlink.h>
-#include <linux/timer.h>
 #include <linux/rcupdate.h>
 #include <linux/version.h>
 #include <linux/ethtool.h>
@@ -26,71 +28,196 @@
 #include <linux/netfilter_bridge.h>
 #include <linux/inetdevice.h>
 #include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/workqueue.h>
 
 #include "openflow-netlink.h"
 #include "datapath.h"
 #include "table.h"
 #include "chain.h"
+#include "dp_dev.h"
 #include "forward.h"
 #include "flow.h"
-#include "datapath_t.h"
 
 #include "compat.h"
 
 
-/* Number of seconds between runs of the flow expiration code. */
-#define EXPIRE_SECS 1
+/* Strings to describe the manufacturer, hardware, and software.  This data
+ * is queryable through the switch description stats message. */
+static char mfr_desc[DESC_STR_LEN] = "Nicira Networks";
+static char hw_desc[DESC_STR_LEN] = "Reference Linux Kernel Module";
+static char sw_desc[DESC_STR_LEN] = VERSION;
+static char serial_num[SERIAL_NUM_LEN] = "None";
 
-#define BRIDGE_PORT_NO_FLOOD	0x00000001 
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+module_param_string(mfr_desc, mfr_desc, sizeof mfr_desc, 0444);
+module_param_string(hw_desc, hw_desc, sizeof hw_desc, 0444);
+module_param_string(sw_desc, sw_desc, sizeof sw_desc, 0444);
+module_param_string(serial_num, serial_num, sizeof serial_num, 0444);
+#else
+MODULE_PARM(mfr_desc, "s");
+MODULE_PARM(hw_desc, "s");
+MODULE_PARM(sw_desc, "s");
+MODULE_PARM(serial_num, "s");
+#endif
 
-#define UINT32_MAX			  4294967295U
 
-struct net_bridge_port {
-	u16	port_no;
-	u32 flags;
-	struct datapath	*dp;
-	struct net_device *dev;
-	struct list_head node; /* Element in datapath.ports. */
-};
+/* Number of milliseconds between runs of the maintenance thread. */
+#define MAINT_SLEEP_MSECS 1000
+
+#define UINT32_MAX			  4294967295U
+#define UINT16_MAX			  65535
+#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
 
 static struct genl_family dp_genl_family;
 static struct genl_multicast_group mc_group;
 
-int dp_dev_setup(struct net_device *dev);  
-
 /* It's hard to imagine wanting more than one datapath, but... */
 #define DP_MAX 32
 
-/* datapaths.  Protected on the read side by rcu_read_lock, on the write side
- * by dp_mutex.
+/* Datapaths.  Protected on the read side by rcu_read_lock, on the write side
+ * by dp_mutex.  dp_mutex is almost completely redundant with genl_mutex
+ * maintained by the Generic Netlink code, but the timeout path needs mutual
+ * exclusion too.
  *
  * It is safe to access the datapath and net_bridge_port structures with just
- * the dp_mutex, but to access the chain you need to take the rcu_read_lock
- * also (because dp_mutex doesn't prevent flows from being destroyed).
+ * dp_mutex.
  */
 static struct datapath *dps[DP_MAX];
-static DEFINE_MUTEX(dp_mutex);
+DEFINE_MUTEX(dp_mutex);
+EXPORT_SYMBOL(dp_mutex);
 
-static void dp_timer_handler(unsigned long arg);
+static int dp_maint_func(void *data);
+static int update_port_status(struct net_bridge_port *p);
 static int send_port_status(struct net_bridge_port *p, uint8_t status);
+static int dp_genl_openflow_done(struct netlink_callback *);
+static struct net_bridge_port *new_nbp(struct datapath *,
+				       struct net_device *, int port_no);
+static int del_switch_port(struct net_bridge_port *);
 
-
-/* nla_unreserve - reduce amount of space reserved by nla_reserve  
+/* nla_shrink - reduce amount of space reserved by nla_reserve
  * @skb: socket buffer from which to recover room
  * @nla: netlink attribute to adjust
- * @len: amount by which to reduce attribute payload
+ * @len: new length of attribute payload
  *
  * Reduces amount of space reserved by a call to nla_reserve.
  *
  * No other attributes may be added between calling nla_reserve and this
  * function, since it will create a hole in the message.
  */
-void nla_unreserve(struct sk_buff *skb, struct nlattr *nla, int len)
+void nla_shrink(struct sk_buff *skb, struct nlattr *nla, int len)
+{
+	int delta = nla_total_size(len) - nla_total_size(nla_len(nla));
+	BUG_ON(delta > 0);
+	skb->tail += delta;
+	skb->len  += delta;
+	nla->nla_len = nla_attr_size(len);
+}
+
+/* Puts a set of openflow headers for a message of the given 'type' into 'skb'.
+ * If 'sender' is nonnull, then it is used as the message's destination.  'dp'
+ * must specify the datapath to use.
+ *
+ * '*max_openflow_len' receives the maximum number of bytes that are available
+ * for the embedded OpenFlow message.  The caller must call
+ * resize_openflow_skb() to set the actual size of the message to this number
+ * of bytes or less.
+ *
+ * Returns the openflow header if successful, otherwise (if 'skb' is too small)
+ * ERR_PTR(-ENOBUFS); callers must test the result with IS_ERR(). */
+static void *
+put_openflow_headers(struct datapath *dp, struct sk_buff *skb, uint8_t type,
+		     const struct sender *sender, int *max_openflow_len)
+{
+	struct ofp_header *oh;
+	struct nlattr *attr;
+	int openflow_len;
+
+	/* Assemble the Generic Netlink wrapper. */
+	if (!genlmsg_put(skb,
+			 sender ? sender->pid : 0,
+			 sender ? sender->seq : 0,
+			 &dp_genl_family, 0, DP_GENL_C_OPENFLOW))
+		return ERR_PTR(-ENOBUFS);
+	if (nla_put_u32(skb, DP_GENL_A_DP_IDX, dp->dp_idx) < 0)
+		return ERR_PTR(-ENOBUFS);
+	openflow_len = (skb_tailroom(skb) - NLA_HDRLEN) & ~(NLA_ALIGNTO - 1);
+	if (openflow_len < (int) sizeof *oh)	/* Signed: may be negative. */
+		return ERR_PTR(-ENOBUFS);
+	*max_openflow_len = openflow_len;
+	attr = nla_reserve(skb, DP_GENL_A_OPENFLOW, openflow_len);
+	BUG_ON(!attr);
+
+	/* Fill in the header.  The caller is responsible for the length. */
+	oh = nla_data(attr);
+	oh->version = OFP_VERSION;
+	oh->type = type;
+	oh->xid = sender ? sender->xid : 0;
+
+	return oh;
+}
+
+/* Shrinks the OpenFlow message 'oh' (the payload of the last attribute in
+ * 'skb', as set up by put_openflow_headers()) to 'new_length' bytes and fixes
+ * up oh->length and the Netlink lengths.  Growing the message is a BUG. */
+static void
+resize_openflow_skb(struct sk_buff *skb,
+		    struct ofp_header *oh, size_t new_length)
+{
+	struct nlattr *attr = ((void *) oh) - NLA_HDRLEN;
+	nla_shrink(skb, attr, new_length);
+	oh->length = htons(new_length);
+	nlmsg_end(skb, (struct nlmsghdr *) skb->data);
+}
+
+/* Allocates a new skb to contain an OpenFlow message 'openflow_len' bytes in
+ * length.  Returns a null pointer if memory is unavailable, otherwise returns
+ * the OpenFlow header and stores a pointer to the skb in '*pskb'. 
+ *
+ * 'type' is the OpenFlow message type.  If 'sender' is nonnull, then it is
+ * used as the message's destination.  'dp' must specify the datapath to
+ * use.  */
+static void *
+alloc_openflow_skb(struct datapath *dp, size_t openflow_len, uint8_t type,
+		   const struct sender *sender, struct sk_buff **pskb) 
 {
-	skb->tail -= len;
-	skb->len  -= len;
+	struct ofp_header *oh;
+	size_t genl_len;
+	struct sk_buff *skb;
+	int max_openflow_len;
+
+	if ((openflow_len + sizeof(struct ofp_header)) > UINT16_MAX) {
+		if (net_ratelimit())
+			printk("alloc_openflow_skb: openflow message too large: %zu\n", 
+					openflow_len);
+		return NULL;
+	}
+
+	genl_len = nlmsg_total_size(GENL_HDRLEN + dp_genl_family.hdrsize);
+	genl_len += nla_total_size(sizeof(uint32_t)); /* DP_GENL_A_DP_IDX */
+	genl_len += nla_total_size(openflow_len);    /* DP_GENL_A_OPENFLOW */
+	skb = *pskb = genlmsg_new(genl_len, GFP_ATOMIC);
+	if (!skb) {
+		if (net_ratelimit())
+			printk("alloc_openflow_skb: genlmsg_new failed\n");
+		return NULL;
+	}
 
-	nla->nla_len -= len;
+	oh = put_openflow_headers(dp, skb, type, sender, &max_openflow_len);
+	BUG_ON(!oh || IS_ERR(oh));
+	resize_openflow_skb(skb, oh, openflow_len);
+
+	return oh;
+}
+
+/* Sends 'skb' to 'sender' if it is nonnull, otherwise multicasts 'skb' to all
+ * listeners. */
+static int
+send_openflow_skb(struct sk_buff *skb, const struct sender *sender) 
+{
+	return (sender
+		? genlmsg_unicast(skb, sender->pid)
+		: genlmsg_multicast(skb, 0, mc_group.id, GFP_ATOMIC));
 }
 
 /* Generates a unique datapath id.  It incorporates the datapath index
@@ -129,9 +256,7 @@ uint64_t gen_datapath_id(uint16_t dp_idx)
 }
 
 /* Creates a new datapath numbered 'dp_idx'.  Returns 0 for success or a
- * negative error code.
- *
- * Not called with any locks. */
+ * negative error code. */
 static int new_dp(int dp_idx)
 {
 	struct datapath *dp;
@@ -143,9 +268,8 @@ static int new_dp(int dp_idx)
 	if (!try_module_get(THIS_MODULE))
 		return -ENODEV;
 
-	mutex_lock(&dp_mutex);
-	dp = rcu_dereference(dps[dp_idx]);
-	if (dp != NULL) {
+	/* Exit early if a datapath with that number already exists. */
+	if (dps[dp_idx]) {
 		err = -EEXIST;
 		goto err_unlock;
 	}
@@ -155,42 +279,49 @@ static int new_dp(int dp_idx)
 	if (dp == NULL)
 		goto err_unlock;
 
+	/* Setup our "of" device */
+	err = dp_dev_setup(dp);
+	if (err)
+		goto err_free_dp;
+
 	dp->dp_idx = dp_idx;
 	dp->id = gen_datapath_id(dp_idx);
 	dp->chain = chain_create(dp);
 	if (dp->chain == NULL)
-		goto err_free_dp;
+		goto err_destroy_dp_dev;
 	INIT_LIST_HEAD(&dp->port_list);
 
-#if 0
-	/* Setup our "of" device */
-	dp->dev.priv = dp;
-	rtnl_lock();
-	err = dp_dev_setup(&dp->dev);
-	rtnl_unlock();
-	if (err != 0) 
-		printk("datapath: problem setting up 'of' device\n");
-#endif
+	dp->local_port = new_nbp(dp, dp->netdev, OFPP_LOCAL);
+	err = IS_ERR(dp->local_port) ? PTR_ERR(dp->local_port) : 0;
+	if (err)	/* No port was created, so there is none to delete. */
+		goto err_destroy_chain;
 
+	dp->flags = 0;
 	dp->miss_send_len = OFP_DEFAULT_MISS_SEND_LEN;
 
-	setup_timer(&dp->timer, dp_timer_handler, (unsigned long) dp);
-	mod_timer(&dp->timer, round_jiffies(jiffies + (EXPIRE_SECS * HZ)));
+	dp->dp_task = kthread_run(dp_maint_func, dp, "dp%d", dp_idx);
+	err = IS_ERR(dp->dp_task) ? PTR_ERR(dp->dp_task) : 0;
+	if (err)	/* Undo everything, including the local port. */
+		goto err_destroy_local_port;
 
-	rcu_assign_pointer(dps[dp_idx], dp);
-	mutex_unlock(&dp_mutex);
+	dps[dp_idx] = dp;
 
 	return 0;
 
+err_destroy_local_port:
+	del_switch_port(dp->local_port);
+err_destroy_chain:
+	chain_destroy(dp->chain);
+err_destroy_dp_dev:
+	dp_dev_destroy(dp);
 err_free_dp:
 	kfree(dp);
 err_unlock:
-	mutex_unlock(&dp_mutex);
 	module_put(THIS_MODULE);
 		return err;
 }
 
-/* Find and return a free port number under 'dp'.  Called under dp_mutex. */
+/* Find and return a free port number under 'dp'. */
 static int find_portno(struct datapath *dp)
 {
 	int i;
@@ -201,50 +332,53 @@ static int find_portno(struct datapath *dp)
 }
 
 static struct net_bridge_port *new_nbp(struct datapath *dp,
-									   struct net_device *dev)
+				       struct net_device *dev, int port_no)
 {
 	struct net_bridge_port *p;
-	int port_no;
 
-	port_no = find_portno(dp);
-	if (port_no < 0)
-		return ERR_PTR(port_no);
+	if (dev->br_port != NULL)
+		return ERR_PTR(-EBUSY);
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (p == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	p->dp = dp;
+	rtnl_lock();
+	dev_set_promiscuity(dev, 1);
+	rtnl_unlock();
 	dev_hold(dev);
+	p->dp = dp;
 	p->dev = dev;
 	p->port_no = port_no;
+	spin_lock_init(&p->lock);
+	INIT_WORK(&p->port_task, NULL);
+	if (port_no != OFPP_LOCAL)
+		rcu_assign_pointer(dev->br_port, p);
+	if (port_no < OFPP_MAX)
+		rcu_assign_pointer(dp->ports[port_no], p); 
+	list_add_rcu(&p->node, &dp->port_list);
 
 	return p;
 }
 
-/* Called with dp_mutex. */
 int add_switch_port(struct datapath *dp, struct net_device *dev)
 {
 	struct net_bridge_port *p;
+	int port_no;
 
-	if (dev->flags & IFF_LOOPBACK || dev->type != ARPHRD_ETHER)
+	if (dev->flags & IFF_LOOPBACK || dev->type != ARPHRD_ETHER
+	    || is_dp_dev(dev))
 		return -EINVAL;
 
-	if (dev->br_port != NULL)
-		return -EBUSY;
+	port_no = find_portno(dp);
+	if (port_no < 0)
+		return port_no;
 
-	p = new_nbp(dp, dev);
+	p = new_nbp(dp, dev, port_no);
 	if (IS_ERR(p))
 		return PTR_ERR(p);
 
-	dev_hold(dev);
-	rcu_assign_pointer(dev->br_port, p);
-	rtnl_lock();
-	dev_set_promiscuity(dev, 1);
-	rtnl_unlock();
-
-	rcu_assign_pointer(dp->ports[p->port_no], p);
-	list_add_rcu(&p->node, &dp->port_list);
+	update_port_status(p);
 
 	/* Notify the ctlpath that this port has been added */
 	send_port_status(p, OFPPR_ADD);
@@ -252,16 +386,17 @@ int add_switch_port(struct datapath *dp, struct net_device *dev)
 	return 0;
 }
 
-/* Delete 'p' from switch.
- * Called with dp_mutex. */
+/* Delete 'p' from switch. */
 static int del_switch_port(struct net_bridge_port *p)
 {
 	/* First drop references to device. */
+	cancel_work_sync(&p->port_task);
 	rtnl_lock();
 	dev_set_promiscuity(p->dev, -1);
 	rtnl_unlock();
 	list_del_rcu(&p->node);
-	rcu_assign_pointer(p->dp->ports[p->port_no], NULL);
+	if (p->port_no != OFPP_LOCAL)
+		rcu_assign_pointer(p->dp->ports[p->port_no], NULL);
 	rcu_assign_pointer(p->dev->br_port, NULL);
 
 	/* Then wait until no one is still using it, and destroy it. */
@@ -276,24 +411,26 @@ static int del_switch_port(struct net_bridge_port *p)
 	return 0;
 }
 
-/* Called with dp_mutex. */
 static void del_dp(struct datapath *dp)
 {
 	struct net_bridge_port *p, *n;
 
-#if 0
-	/* Unregister the "of" device of this dp */
-	rtnl_lock();
-	unregister_netdevice(&dp->dev);
-	rtnl_unlock();
-#endif
+	kthread_stop(dp->dp_task);
 
 	/* Drop references to DP. */
 	list_for_each_entry_safe (p, n, &dp->port_list, node)
 		del_switch_port(p);
-	del_timer_sync(&dp->timer);
 	rcu_assign_pointer(dps[dp->dp_idx], NULL);
 
+	/* Kill off local_port dev references from buffered packets that have
+	 * associated dst entries. */
+	synchronize_rcu();
+	fwd_discard_all();
+
+	/* Destroy dp->netdev.  (Must follow deleting switch ports since
+	 * dp->local_port has a reference to it.) */
+	dp_dev_destroy(dp);
+
 	/* Wait until no longer in use, then destroy it. */
 	synchronize_rcu();
 	chain_destroy(dp->chain);
@@ -301,80 +438,62 @@ static void del_dp(struct datapath *dp)
 	module_put(THIS_MODULE);
 }
 
-static void dp_timer_handler(unsigned long arg)
+static int dp_maint_func(void *data)
 {
-	struct datapath *dp = (struct datapath *) arg;
-#if 1
-	chain_timeout(dp->chain);
-#else
-	int count = chain_timeout(dp->chain);
-	chain_print_stats(dp->chain);
-	if (count)
-		printk("%d flows timed out\n", count);
-#endif
-	mod_timer(&dp->timer, round_jiffies(jiffies + (EXPIRE_SECS * HZ)));
+	struct datapath *dp = (struct datapath *) data;
+
+	while (!kthread_should_stop()) {
+		struct net_bridge_port *p;
+
+		/* Check if port status has changed */
+		rcu_read_lock();
+		list_for_each_entry_rcu (p, &dp->port_list, node) 
+			if (update_port_status(p)) 
+				send_port_status(p, OFPPR_MOD);
+		rcu_read_unlock();
+
+		/* Timeout old entries */
+		chain_timeout(dp->chain);
+		msleep_interruptible(MAINT_SLEEP_MSECS);
+	}
+		
+	return 0;
+}
+
+static void
+do_port_input(struct net_bridge_port *p, struct sk_buff *skb) 
+{
+	/* Push the Ethernet header back on. */
+	skb_push(skb, ETH_HLEN);
+	fwd_port_input(p->dp->chain, skb, p);
 }
 
 /*
  * Used as br_handle_frame_hook.  (Cannot run bridge at the same time, even on
- * different set of devices!)  Returns 0 if *pskb should be processed further,
- * 1 if *pskb is handled. */
+ * different set of devices!)
+ */
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
 /* Called with rcu_read_lock. */
 static struct sk_buff *dp_frame_hook(struct net_bridge_port *p,
 					 struct sk_buff *skb)
 {
-	struct ethhdr *eh = eth_hdr(skb);
-	struct sk_buff *skb_local = NULL;
-
-
-	if (compare_ether_addr(eh->h_dest, skb->dev->dev_addr) == 0) 
-		return skb;
-
-	if (is_broadcast_ether_addr(eh->h_dest)
-				|| is_multicast_ether_addr(eh->h_dest)
-				|| is_local_ether_addr(eh->h_dest)) 
-		skb_local = skb_clone(skb, GFP_ATOMIC);
-
-	/* Push the Ethernet header back on. */
-	if (skb->protocol == htons(ETH_P_8021Q))
-		skb_push(skb, VLAN_ETH_HLEN);
-	else
-		skb_push(skb, ETH_HLEN);
-
-	fwd_port_input(p->dp->chain, skb, p->port_no);
-
-	return skb_local;
+	do_port_input(p, skb);
+	return NULL;
 }
 #elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
 static int dp_frame_hook(struct net_bridge_port *p, struct sk_buff **pskb)
 {
-	/* Push the Ethernet header back on. */
-	if ((*pskb)->protocol == htons(ETH_P_8021Q))
-		skb_push(*pskb, VLAN_ETH_HLEN);
-	else
-		skb_push(*pskb, ETH_HLEN);
-
-	fwd_port_input(p->dp->chain, *pskb, p->port_no);
+	do_port_input(p, *pskb);
 	return 1;
 }
-#else 
+#else
 /* NB: This has only been tested on 2.4.35 */
-
-/* Called without any locks (?) */
 static void dp_frame_hook(struct sk_buff *skb)
 {
 	struct net_bridge_port *p = skb->dev->br_port;
-
-	/* Push the Ethernet header back on. */
-	if (skb->protocol == htons(ETH_P_8021Q))
-		skb_push(skb, VLAN_ETH_HLEN);
-	else
-		skb_push(skb, ETH_HLEN);
-
 	if (p) {
 		rcu_read_lock();
-		fwd_port_input(p->dp->chain, skb, p->port_no);
+		do_port_input(p, skb);
 		rcu_read_unlock();
 	} else
 		kfree_skb(skb);
@@ -384,17 +503,6 @@ static void dp_frame_hook(struct sk_buff *skb)
 /* Forwarding output path.
  * Based on net/bridge/br_forward.c. */
 
-/* Don't forward packets to originating port or with flooding disabled */
-static inline int should_deliver(const struct net_bridge_port *p,
-			const struct sk_buff *skb)
-{
-	if ((skb->dev == p->dev) || (p->flags & BRIDGE_PORT_NO_FLOOD)) {
-		return 0;
-	} 
-
-	return 1;
-}
-
 static inline unsigned packet_length(const struct sk_buff *skb)
 {
 	int length = skb->len - ETH_HLEN;
@@ -403,15 +511,18 @@ static inline unsigned packet_length(const struct sk_buff *skb)
 	return length;
 }
 
+/* Send packets out all the ports except the originating one.  If the
+ * "flood" argument is set, only send along the minimum spanning tree.
+ */
 static int
-flood(struct datapath *dp, struct sk_buff *skb)
+output_all(struct datapath *dp, struct sk_buff *skb, int flood)
 {
+	u32 disable = flood ? OFPPFL_NO_FLOOD : 0;
 	struct net_bridge_port *p;
-	int prev_port;
+	int prev_port = -1;
 
-	prev_port = -1;
 	list_for_each_entry_rcu (p, &dp->port_list, node) {
-		if (!should_deliver(p, skb))
+		if (skb->dev == p->dev || p->flags & disable)
 			continue;
 		if (prev_port != -1) {
 			struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
@@ -419,12 +530,12 @@ flood(struct datapath *dp, struct sk_buff *skb)
 				kfree_skb(skb);
 				return -ENOMEM;
 			}
-			dp_output_port(dp, clone, prev_port); 
+			dp_output_port(dp, clone, prev_port, 0); 
 		}
 		prev_port = p->port_no;
 	}
 	if (prev_port != -1)
-		dp_output_port(dp, skb, prev_port);
+		dp_output_port(dp, skb, prev_port, 0);
 	else
 		kfree_skb(skb);
 
@@ -436,37 +547,22 @@ flood(struct datapath *dp, struct sk_buff *skb)
 int dp_set_origin(struct datapath *dp, uint16_t in_port,
 			   struct sk_buff *skb)
 {
-	if (in_port < OFPP_MAX && dp->ports[in_port]) {
-		skb->dev = dp->ports[in_port]->dev;
+	struct net_bridge_port *p = (in_port < OFPP_MAX ? dp->ports[in_port]
+				     : in_port == OFPP_LOCAL ? dp->local_port
+				     : NULL);
+	if (p) {
+		skb->dev = p->dev;
 		return 0;
 	}
 	return -ENOENT;
 }
 
-/* Takes ownership of 'skb' and transmits it to 'out_port' on 'dp'.
- */
-int dp_output_port(struct datapath *dp, struct sk_buff *skb, int out_port)
+static int xmit_skb(struct sk_buff *skb)
 {
-	struct net_bridge_port *p;
 	int len = skb->len;
-
-	BUG_ON(!skb);
-	if (out_port == OFPP_FLOOD)
-		return flood(dp, skb);
-	else if (out_port == OFPP_CONTROLLER)
-		return dp_output_control(dp, skb, fwd_save_skb(skb), 0,
-						  OFPR_ACTION);
-	else if (out_port >= OFPP_MAX)
-		goto bad_port;
-
-	p = dp->ports[out_port];
-	if (p == NULL)
-		goto bad_port;
-
-	skb->dev = p->dev;
 	if (packet_length(skb) > skb->dev->mtu) {
 		printk("dropped over-mtu packet: %d > %d\n",
-					packet_length(skb), skb->dev->mtu);
+			   packet_length(skb), skb->dev->mtu);
 		kfree_skb(skb);
 		return -E2BIG;
 	}
@@ -474,6 +570,71 @@ int dp_output_port(struct datapath *dp, struct sk_buff *skb, int out_port)
 	dev_queue_xmit(skb);
 
 	return len;
+}
+
+/* Takes ownership of 'skb' and transmits it to 'out_port' on 'dp'.  Error
+ * paths in this function free 'skb' themselves; callers must not reuse it. */
+int dp_output_port(struct datapath *dp, struct sk_buff *skb, int out_port,
+		   int ignore_no_fwd)
+{
+	BUG_ON(!skb);
+	switch (out_port){
+	case OFPP_IN_PORT:
+		/* Send it out the port it came in on, which is already set in
+		 * the skb. */
+		if (!skb->dev) {
+			if (net_ratelimit())
+				printk("skb device not set forwarding to in_port\n");
+			kfree_skb(skb);	/* An sk_buff must not go to kfree(). */
+			return -ESRCH;
+		}
+		return xmit_skb(skb);
+
+	case OFPP_TABLE: {
+		int retval = run_flow_through_tables(dp->chain, skb,
+						     skb->dev->br_port);
+		if (retval)
+			kfree_skb(skb);
+		return retval;
+	}
+
+	case OFPP_FLOOD:
+		return output_all(dp, skb, 1);
+
+	case OFPP_ALL:
+		return output_all(dp, skb, 0);
+
+	case OFPP_CONTROLLER:
+		return dp_output_control(dp, skb, fwd_save_skb(skb), 0,
+						  OFPR_ACTION);
+
+	case OFPP_LOCAL:
+		if (dp->netdev)
+			return dp_dev_recv(dp->netdev, skb);
+		kfree_skb(skb);	/* We own 'skb', so do not leak it. */
+		return -ESRCH;
+
+	case 0 ... OFPP_MAX-1: {
+		struct net_bridge_port *p = dp->ports[out_port];
+		if (p == NULL)
+			goto bad_port;
+		if (p->dev == skb->dev) {
+			/* To send to the input port, must use OFPP_IN_PORT */
+			kfree_skb(skb);
+			if (net_ratelimit())
+				printk("can't directly forward to input port\n");
+			return -EINVAL;
+		}
+		if (p->flags & OFPPFL_NO_FWD && !ignore_no_fwd) {
+			kfree_skb(skb);
+			return 0;
+		}
+		skb->dev = p->dev;
+		return xmit_skb(skb);
+	}
+	default:
+		goto bad_port;
+	}
 
 bad_port:
 	kfree_skb(skb);
@@ -492,86 +653,60 @@ int
 dp_output_control(struct datapath *dp, struct sk_buff *skb,
 			   uint32_t buffer_id, size_t max_len, int reason)
 {
-	/* FIXME? packet_rcv_spkt in net/packet/af_packet.c does some stuff
-	   that we should possibly be doing here too. */
 	/* FIXME?  Can we avoid creating a new skbuff in the case where we
 	 * forward the whole packet? */
 	struct sk_buff *f_skb;
-	struct nlattr *attr;
 	struct ofp_packet_in *opi;
-	size_t opi_len;
-	size_t len, fwd_len;
-	void *data;
-	int err = -ENOMEM;
+	struct net_bridge_port *p;
+	size_t fwd_len, opi_len;
+	int err;
 
 	fwd_len = skb->len;
 	if ((buffer_id != (uint32_t) -1) && max_len)
 		fwd_len = min(fwd_len, max_len);
 
-	len = nla_total_size(offsetof(struct ofp_packet_in, data) + fwd_len) 
-				+ nla_total_size(sizeof(uint32_t));
-
-	f_skb = genlmsg_new(len, GFP_ATOMIC); 
-	if (!f_skb)
-		goto error_free_skb;
-
-	data = genlmsg_put(f_skb, 0, 0, &dp_genl_family, 0,
-				DP_GENL_C_OPENFLOW);
-	if (data == NULL)
-		goto error_free_f_skb;
-
-	NLA_PUT_U32(f_skb, DP_GENL_A_DP_IDX, dp->dp_idx);
-
 	opi_len = offsetof(struct ofp_packet_in, data) + fwd_len;
-	attr = nla_reserve(f_skb, DP_GENL_A_OPENFLOW, opi_len);
-	if (!attr)
-		goto error_free_f_skb;
-	opi = nla_data(attr);
-	opi->header.version = OFP_VERSION;
-	opi->header.type    = OFPT_PACKET_IN;
-	opi->header.length  = htons(opi_len);
-	opi->header.xid     = htonl(0);
-
+	opi = alloc_openflow_skb(dp, opi_len, OFPT_PACKET_IN, NULL, &f_skb);
+	if (!opi) {
+		err = -ENOMEM;
+		goto out;
+	}
 	opi->buffer_id      = htonl(buffer_id);
 	opi->total_len      = htons(skb->len);
-	opi->in_port        = htons(skb->dev->br_port->port_no);
+	p = skb->dev->br_port;
+	opi->in_port        = htons(p ? p->port_no : OFPP_LOCAL);
 	opi->reason         = reason;
 	opi->pad            = 0;
-	SKB_LINEAR_ASSERT(skb);
 	memcpy(opi->data, skb_mac_header(skb), fwd_len);
+	err = send_openflow_skb(f_skb, NULL);
 
-	err = genlmsg_end(f_skb, data);
-	if (err < 0)
-		goto error_free_f_skb;
-
-	err = genlmsg_multicast(f_skb, 0, mc_group.id, GFP_ATOMIC);
-	if (err && net_ratelimit())
-		printk(KERN_WARNING "dp_output_control: genlmsg_multicast failed: %d\n", err);
-
-	kfree_skb(skb);  
-
-	return err;
-
-nla_put_failure:
-error_free_f_skb:
-	nlmsg_free(f_skb);
-error_free_skb:
+out:
 	kfree_skb(skb);
-	if (net_ratelimit())
-		printk(KERN_ERR "dp_output_control: failed to send: %d\n", err);
 	return err;
 }
 
 static void fill_port_desc(struct net_bridge_port *p, struct ofp_phy_port *desc)
 {
+	unsigned long flags;
 	desc->port_no = htons(p->port_no);
 	strncpy(desc->name, p->dev->name, OFP_MAX_PORT_NAME_LEN);
 	desc->name[OFP_MAX_PORT_NAME_LEN-1] = '\0';
 	memcpy(desc->hw_addr, p->dev->dev_addr, ETH_ALEN);
-	desc->flags = htonl(p->flags);
+	desc->flags = 0;
 	desc->features = 0;
 	desc->speed = 0;
 
+	if (p->port_no < 255) {
+		/* FIXME: this is a layering violation and should really be
+		 * done in the secchan, as with OFPC_STP in
+		 * OFP_SUPPORTED_CAPABILITIES. */
+		desc->features |= OFPPF_STP;
+	}
+
+	spin_lock_irqsave(&p->lock, flags);
+	desc->flags = htonl(p->flags | p->status);
+	spin_unlock_irqrestore(&p->lock, flags);
+
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,24)
 	if (p->dev->ethtool_ops && p->dev->ethtool_ops->get_settings) {
 		struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET };
@@ -593,36 +728,29 @@ static void fill_port_desc(struct net_bridge_port *p, struct ofp_phy_port *desc)
 			if (ecmd.supported & SUPPORTED_10000baseT_Full)
 				desc->features |= OFPPF_10GB_FD;
 
-			desc->features = htonl(desc->features);
 			desc->speed = htonl(ecmd.speed);
 		}
 	}
 #endif
+	desc->features = htonl(desc->features);
 }
 
 static int 
-fill_data_hello(struct datapath *dp, struct ofp_data_hello *odh)
+fill_features_reply(struct datapath *dp, struct ofp_switch_features *ofr)
 {
 	struct net_bridge_port *p;
 	int port_count = 0;
 
-	odh->header.version = OFP_VERSION;
-	odh->header.type    = OFPT_DATA_HELLO;
-	odh->header.xid     = htonl(0);
-	odh->datapath_id    = cpu_to_be64(dp->id); 
-
-	odh->n_exact        = htonl(2 * TABLE_HASH_MAX_FLOWS);
-	odh->n_mac_only     = htonl(TABLE_MAC_MAX_FLOWS);
-	odh->n_compression  = 0;					   /* Not supported */
-	odh->n_general      = htonl(TABLE_LINEAR_MAX_FLOWS);
-	odh->buffer_mb      = htonl(UINT32_MAX);
-	odh->n_buffers      = htonl(N_PKT_BUFFERS);
-	odh->capabilities   = htonl(OFP_SUPPORTED_CAPABILITIES);
-	odh->actions        = htonl(OFP_SUPPORTED_ACTIONS);
-	odh->miss_send_len  = htons(dp->miss_send_len); 
+	ofr->datapath_id  = cpu_to_be64(dp->id); 
+
+	ofr->n_buffers    = htonl(N_PKT_BUFFERS);
+	ofr->n_tables     = dp->chain->n_tables;
+	ofr->capabilities = htonl(OFP_SUPPORTED_CAPABILITIES);
+	ofr->actions      = htonl(OFP_SUPPORTED_ACTIONS);
+	memset(ofr->pad, 0, sizeof ofr->pad);
 
 	list_for_each_entry_rcu (p, &dp->port_list, node) {
-		fill_port_desc(p, &odh->ports[port_count]);
+		fill_port_desc(p, &ofr->ports[port_count]);
 		port_count++;
 	}
 
@@ -630,227 +758,227 @@ fill_data_hello(struct datapath *dp, struct ofp_data_hello *odh)
 }
 
 int
-dp_send_hello(struct datapath *dp)
+dp_send_features_reply(struct datapath *dp, const struct sender *sender)
 {
 	struct sk_buff *skb;
-	struct nlattr *attr;
-	struct ofp_data_hello *odh;
-	size_t odh_max_len, odh_len, port_max_len, len;
-	void *data;
-	int err = -ENOMEM;
+	struct ofp_switch_features *ofr;
+	size_t ofr_len, port_max_len;
 	int port_count;
 
-
-	/* Overallocate, since we can't reliably determine the number of
-	 * ports a priori. */
+	/* Overallocate: reserve room for OFPP_MAX port descriptions. */
 	port_max_len = sizeof(struct ofp_phy_port) * OFPP_MAX;
+	ofr = alloc_openflow_skb(dp, sizeof(*ofr) + port_max_len,
+				 OFPT_FEATURES_REPLY, sender, &skb);
+	if (!ofr)
+		return -ENOMEM;
 
-	len = nla_total_size(sizeof(*odh) + port_max_len) 
-				+ nla_total_size(sizeof(uint32_t));
+	/* Fill in the reply; the result is used below as the port count. */
+	port_count = fill_features_reply(dp, ofr);
 
-	skb = genlmsg_new(len, GFP_ATOMIC);
-	if (!skb) {
-		if (net_ratelimit())
-			printk("dp_send_hello: genlmsg_new failed\n");
-		goto error;
-	}
+	/* Shrink the message to the fixed part plus the ports filled in. */
+	ofr_len = sizeof(*ofr) + (sizeof(struct ofp_phy_port) * port_count);
+	resize_openflow_skb(skb, &ofr->header, ofr_len);
+	return send_openflow_skb(skb, sender);
+}
 
-	data = genlmsg_put(skb, 0, 0, &dp_genl_family, 0,
-			   DP_GENL_C_OPENFLOW);
-	if (data == NULL) {
-		if (net_ratelimit())
-			printk("dp_send_hello: genlmsg_put failed\n");
-		goto error;
-	}
+int
+dp_send_config_reply(struct datapath *dp, const struct sender *sender)
+{
+	struct sk_buff *skb;
+	struct ofp_switch_config *osc;	/* OFPT_GET_CONFIG_REPLY message being built */
 
-	NLA_PUT_U32(skb, DP_GENL_A_DP_IDX, dp->dp_idx);
+	osc = alloc_openflow_skb(dp, sizeof *osc, OFPT_GET_CONFIG_REPLY, sender,
+				 &skb);
+	if (!osc)
+		return -ENOMEM;
 
-	odh_max_len = sizeof(*odh) + port_max_len;
-	attr = nla_reserve(skb, DP_GENL_A_OPENFLOW, odh_max_len);
-	if (!attr) {
-		if (net_ratelimit())
-			printk("dp_send_hello: nla_reserve failed\n");
-		goto error;
-	}
-	odh = nla_data(attr);
-	port_count = fill_data_hello(dp, odh);
+	osc->flags = htons(dp->flags);	/* both fields are stored in network byte order */
+	osc->miss_send_len = htons(dp->miss_send_len);
 
-	/* Only now that we know how many ports we've added can we say
-	 * say something about the length. */
-	odh_len = sizeof(*odh) + (sizeof(struct ofp_phy_port) * port_count);
-	odh->header.length = htons(odh_len);
+	return send_openflow_skb(skb, sender);	/* result of the send is the return value */
+}
 
-	/* Take back the unused part that was reserved */
-	nla_unreserve(skb, attr, (odh_max_len - odh_len));
+/* Workqueue callback: clear IFF_UP on the port's device under the RTNL lock. */
+static void
+down_port_cb(struct work_struct *work)
+{
+	struct net_bridge_port *p = container_of(work, struct net_bridge_port,
+			port_task);
 
-	err = genlmsg_end(skb, data);
-	if (err < 0) {
+	rtnl_lock();
+	if (dev_change_flags(p->dev, p->dev->flags & ~IFF_UP) < 0)
 		if (net_ratelimit())
-			printk("dp_send_hello: genlmsg_end failed\n");
-		goto error;
-	}
-
-	err = genlmsg_multicast(skb, 0, mc_group.id, GFP_ATOMIC);
-	if (err && net_ratelimit())
-		printk(KERN_WARNING "dp_send_hello: genlmsg_multicast failed: %d\n", err);
+			printk("problem bringing down port %s\n", p->dev->name);
+	rtnl_unlock();
+	p->status |= OFPPFL_PORT_DOWN;	/* NOTE(review): set even on failure, and without p->lock */
+}
 
-	return err;
+/* Workqueue callback: set IFF_UP on the port's device under the RTNL lock. */
+static void
+up_port_cb(struct work_struct *work)
+{
+	struct net_bridge_port *p = container_of(work, struct net_bridge_port,
+			port_task);
 
-nla_put_failure:
-error:
-	kfree_skb(skb);
-	if (net_ratelimit())
-		printk(KERN_ERR "dp_send_hello: failed to send: %d\n", err);
-	return err;
+	rtnl_lock();
+	if (dev_change_flags(p->dev, p->dev->flags | IFF_UP) < 0)
+		if (net_ratelimit())
+			printk("problem bringing up port %s\n", p->dev->name);
+	rtnl_unlock();
+	p->status &= ~OFPPFL_PORT_DOWN;	/* NOTE(review): cleared even on failure, and without p->lock */
 }
 
 int
-dp_update_port_flags(struct datapath *dp, const struct ofp_phy_port *opp)
+dp_update_port_flags(struct datapath *dp, const struct ofp_port_mod *opm)
 {
-	struct net_bridge_port *p;
-
-	p = dp->ports[htons(opp->port_no)];
+	unsigned long int flags;	/* saved interrupt state for p->lock */
+	const struct ofp_phy_port *opp = &opm->desc;
+	int port_no = ntohs(opp->port_no);
+	struct net_bridge_port *p = (port_no < OFPP_MAX ? dp->ports[port_no]
+				     : port_no == OFPP_LOCAL ? dp->local_port
+				     : NULL);	/* any other port number is rejected below */
+	uint32_t flag_mask;	/* host-order flag bits this request may change */
 
 	/* Make sure the port id hasn't changed since this was sent */
-	if (!p || memcmp(opp->hw_addr, p->dev->dev_addr, ETH_ALEN) != 0) 
+	if (!p || memcmp(opp->hw_addr, p->dev->dev_addr, ETH_ALEN))
 		return -1;
-	
-	p->flags = htonl(opp->flags);
+
+	spin_lock_irqsave(&p->lock, flags);
+	flag_mask = ntohl(opm->mask) & PORT_FLAG_BITS;
+	if (flag_mask) {
+		p->flags &= ~flag_mask;	/* clear the selected bits ... */
+		p->flags |= ntohl(opp->flags) & flag_mask;	/* ... then copy them from the request */
+	}
+
+	/* Bringing the interface up or down is left to down_port_cb() and
+	 * up_port_cb(), which take the RTNL lock.  That cannot be done here
+	 * while p->lock is held with interrupts disabled, so the change is
+	 * handed to the shared workqueue and runs later from its context. */
+	if (opm->mask & htonl(OFPPFL_PORT_DOWN)) {	/* tested in network byte order */
+		if ((opp->flags & htonl(OFPPFL_PORT_DOWN))
+		    && (p->status & OFPPFL_PORT_DOWN) == 0) {
+			PREPARE_WORK(&p->port_task, down_port_cb);
+			schedule_work(&p->port_task);
+		} else if ((opp->flags & htonl(OFPPFL_PORT_DOWN)) == 0
+			   && (p->status & OFPPFL_PORT_DOWN)) {
+			PREPARE_WORK(&p->port_task, up_port_cb);
+			schedule_work(&p->port_task);
+		}
+	}
+	spin_unlock_irqrestore(&p->lock, flags);
 
 	return 0;
 }
 
+/* Update the port status field of the bridge port.  A non-zero return
+ * value indicates some field has changed.  The old and new status are both
+ * sampled while holding p->lock, so the result describes this update only.
+ * NB: Callers of this function may hold the RCU read lock, so any
+ * additional checks must not sleep.
+ */
+static int
+update_port_status(struct net_bridge_port *p)
+{
+	unsigned long int flags;
+	uint32_t orig_status, new_status;
+
+	spin_lock_irqsave(&p->lock, flags);
+	orig_status = p->status;
+
+	if (p->dev->flags & IFF_UP)
+		p->status &= ~OFPPFL_PORT_DOWN;
+	else
+		p->status |= OFPPFL_PORT_DOWN;
+
+	if (netif_carrier_ok(p->dev))
+		p->status &= ~OFPPFL_LINK_DOWN;
+	else
+		p->status |= OFPPFL_LINK_DOWN;
+	new_status = p->status;	/* snapshot before dropping the lock */
+	spin_unlock_irqrestore(&p->lock, flags);
+	return (orig_status != new_status);
+}
 
 static int
 send_port_status(struct net_bridge_port *p, uint8_t status)
 {
 	struct sk_buff *skb;
-	struct nlattr *attr;
 	struct ofp_port_status *ops;
-	void *data;
-	int err = -ENOMEM;
 
+	ops = alloc_openflow_skb(p->dp, sizeof *ops, OFPT_PORT_STATUS, NULL,
+				 &skb);	/* NULL sender: presumably an unsolicited message -- confirm */
+	if (!ops)
+		return -ENOMEM;
+	ops->reason = status;	/* caller's 'status' byte is copied as-is */
+	memset(ops->pad, 0, sizeof ops->pad);
+	fill_port_desc(p, &ops->desc);	/* describe port 'p' in the message */
 
-	skb = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
-	if (!skb) {
-		if (net_ratelimit())
-			printk("send_port_status: genlmsg_new failed\n");
-		goto error;
-	}
+	return send_openflow_skb(skb, NULL);
+}
 
-	data = genlmsg_put(skb, 0, 0, &dp_genl_family, 0,
-			   DP_GENL_C_OPENFLOW);
-	if (data == NULL) {
-		if (net_ratelimit())
-			printk("send_port_status: genlmsg_put failed\n");
-		goto error;
-	}
+int
+dp_send_flow_expired(struct datapath *dp, struct sw_flow *flow,
+		     enum ofp_flow_expired_reason reason)
+{
+	struct sk_buff *skb;
+	struct ofp_flow_expired *ofe;
 
-	NLA_PUT_U32(skb, DP_GENL_A_DP_IDX, p->dp->dp_idx);
+	if (!(dp->flags & OFPC_SEND_FLOW_EXP))	/* notifications not enabled on this datapath */
+		return 0;
 
-	attr = nla_reserve(skb, DP_GENL_A_OPENFLOW, sizeof(*ops));
-	if (!attr) {
-		if (net_ratelimit())
-			printk("send_port_status: nla_reserve failed\n");
-		goto error;
-	}
+	ofe = alloc_openflow_skb(dp, sizeof *ofe, OFPT_FLOW_EXPIRED, NULL, &skb);
+	if (!ofe)
+		return -ENOMEM;
 
-	ops = nla_data(attr);
-	ops->header.version = OFP_VERSION;
-	ops->header.type    = OFPT_PORT_STATUS;
-	ops->header.length  = htons(sizeof(*ops));
-	ops->header.xid     = htonl(0);
+	flow_fill_match(&ofe->match, &flow->key);
 
-	ops->reason         = status;
-	fill_port_desc(p, &ops->desc);
-
-	err = genlmsg_end(skb, data);
-	if (err < 0) {
-		if (net_ratelimit())
-			printk("send_port_status: genlmsg_end failed\n");
-		goto error;
-	}
+	ofe->priority = htons(flow->priority);
+	ofe->reason = reason;
+	memset(ofe->pad, 0, sizeof ofe->pad);
 
-	err = genlmsg_multicast(skb, 0, mc_group.id, GFP_ATOMIC);
-	if (err && net_ratelimit())
-		printk(KERN_WARNING "send_port_status: genlmsg_multicast failed: %d\n", err);
+	ofe->duration     = htonl((jiffies - flow->init_time) / HZ);	/* whole seconds since init_time */
+	memset(ofe->pad2, 0, sizeof ofe->pad2);
+	ofe->packet_count = cpu_to_be64(flow->packet_count);
+	ofe->byte_count   = cpu_to_be64(flow->byte_count);
 
-	return err;
-
-nla_put_failure:
-error:
-	kfree_skb(skb);
-	if (net_ratelimit())
-		printk(KERN_ERR "send_port_status: failed to send: %d\n", err);
-	return err;
+	return send_openflow_skb(skb, NULL);	/* same NULL sender as the allocation above */
 }
+EXPORT_SYMBOL(dp_send_flow_expired);
 
-int 
-dp_send_flow_expired(struct datapath *dp, struct sw_flow *flow)
+int
+dp_send_error_msg(struct datapath *dp, const struct sender *sender, 
+		uint16_t type, uint16_t code, const uint8_t *data, size_t len)
 {
 	struct sk_buff *skb;
-	struct nlattr *attr;
-	struct ofp_flow_expired *ofe;
-	void *data;
-	unsigned long duration_j;
-	int err = -ENOMEM;
-
-
-	skb = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
-	if (!skb) {
-		if (net_ratelimit())
-			printk("dp_send_flow_expired: genlmsg_new failed\n");
-		goto error;
-	}
-
-	data = genlmsg_put(skb, 0, 0, &dp_genl_family, 0,
-			   DP_GENL_C_OPENFLOW);
-	if (data == NULL) {
-		if (net_ratelimit())
-			printk("dp_send_flow_expired: genlmsg_put failed\n");
-		goto error;
-	}
+	struct ofp_error_msg *oem;	/* error message: fixed part followed by 'len' data bytes */
 
-	NLA_PUT_U32(skb, DP_GENL_A_DP_IDX, dp->dp_idx);
 
-	attr = nla_reserve(skb, DP_GENL_A_OPENFLOW, sizeof(*ofe));
-	if (!attr) {
-		if (net_ratelimit())
-			printk("dp_send_flow_expired: nla_reserve failed\n");
-		goto error;
-	}
-
-	ofe = nla_data(attr);
-	ofe->header.version = OFP_VERSION;
-	ofe->header.type    = OFPT_FLOW_EXPIRED;
-	ofe->header.length  = htons(sizeof(*ofe));
-	ofe->header.xid     = htonl(0);
+	oem = alloc_openflow_skb(dp, sizeof(*oem)+len, OFPT_ERROR_MSG, 
+			sender, &skb);
+	if (!oem)
+		return -ENOMEM;
 
-	flow_fill_match(&ofe->match, &flow->key);
-	duration_j = (flow->timeout - HZ * flow->max_idle) - flow->init_time;
-	ofe->duration   = htonl(duration_j / HZ);
-	ofe->packet_count   = cpu_to_be64(flow->packet_count);
-	ofe->byte_count     = cpu_to_be64(flow->byte_count);
+	oem->type = htons(type);	/* type and code are stored in network byte order */
+	oem->code = htons(code);
+	memcpy(oem->data, data, len);	/* caller's 'len' bytes are appended unchanged */
 
-	err = genlmsg_end(skb, data);
-	if (err < 0) {
-		if (net_ratelimit())
-			printk("dp_send_flow_expired: genlmsg_end failed\n");
-		goto error;
-	}
+	return send_openflow_skb(skb, sender);
+}
 
-	err = genlmsg_multicast(skb, 0, mc_group.id, GFP_ATOMIC);
-	if (err && net_ratelimit())
-		printk(KERN_WARNING "send_flow_expired: genlmsg_multicast failed: %d\n", err);
+int
+dp_send_echo_reply(struct datapath *dp, const struct sender *sender,
+		   const struct ofp_header *rq)
+{
+	struct sk_buff *skb;
+	struct ofp_header *reply;
 
-	return err;
+	reply = alloc_openflow_skb(dp, ntohs(rq->length), OFPT_ECHO_REPLY,
+				   sender, &skb);	/* reply is as long as the request claims to be */
+	if (!reply)
+		return -ENOMEM;
 
-nla_put_failure:
-error:
-	kfree_skb(skb);
-	if (net_ratelimit())
-		printk(KERN_ERR "send_flow_expired: failed to send: %d\n", err);
-	return err;
+	memcpy(reply + 1, rq + 1, ntohs(rq->length) - sizeof *rq);	/* NOTE(review): assumes rq->length >= sizeof *rq, else the size wraps -- confirm callers validate */
+	return send_openflow_skb(skb, sender);
 }
 
 /* Generic Netlink interface.
@@ -906,7 +1034,6 @@ static int dp_genl_del(struct sk_buff *skb, struct genl_info *info)
 	if (!info->attrs[DP_GENL_A_DP_IDX])
 		return -EINVAL;
 
-	mutex_lock(&dp_mutex);
 	dp = dp_get(nla_get_u32((info->attrs[DP_GENL_A_DP_IDX])));
 	if (!dp)
 		err = -ENOENT;
@@ -914,7 +1041,6 @@ static int dp_genl_del(struct sk_buff *skb, struct genl_info *info)
 		del_dp(dp);
 		err = 0;
 	}
-	mutex_unlock(&dp_mutex);
 	return err;
 }
 
@@ -953,7 +1079,7 @@ static int dp_genl_query(struct sk_buff *skb, struct genl_info *info)
 		err = -ENOENT;
 	else {
 		void *data;
-		ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+		ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
 		if (!ans_skb) {
 			err = -ENOMEM;
 			goto err;
@@ -980,431 +1106,6 @@ nla_put_failure:
 	return err;
 }
 
-/*
- * Fill flow entry for nl flow query.  Called with rcu_lock  
- *
- */
-static
-int
-dp_fill_flow(struct ofp_flow_mod* ofm, struct swt_iterator* iter)
-{
-	ofm->header.version  = OFP_VERSION;
-	ofm->header.type     = OFPT_FLOW_MOD;
-	ofm->header.length   = htons(sizeof(struct ofp_flow_mod) 
-				+ sizeof(ofm->actions[0]));
-	ofm->header.xid      = htonl(0);
-
-	ofm->match.wildcards = htons(iter->flow->key.wildcards);
-	ofm->match.in_port   = iter->flow->key.in_port;
-	ofm->match.dl_vlan   = iter->flow->key.dl_vlan;
-	memcpy(ofm->match.dl_src, iter->flow->key.dl_src, ETH_ALEN);
-	memcpy(ofm->match.dl_dst, iter->flow->key.dl_dst, ETH_ALEN);
-	ofm->match.dl_type   = iter->flow->key.dl_type;
-	ofm->match.nw_src    = iter->flow->key.nw_src;
-	ofm->match.nw_dst    = iter->flow->key.nw_dst;
-	ofm->match.nw_proto  = iter->flow->key.nw_proto;
-	ofm->match.tp_src    = iter->flow->key.tp_src;
-	ofm->match.tp_dst    = iter->flow->key.tp_dst;
-	ofm->group_id        = iter->flow->group_id;
-	ofm->max_idle        = iter->flow->max_idle;
-	/* TODO support multiple actions  */
-	ofm->actions[0]      = iter->flow->actions[0];
-
-	return 0;
-}
-
-static int dp_genl_show(struct sk_buff *skb, struct genl_info *info)
-{
-	struct datapath *dp;
-	int err = -ENOMEM;
-	struct sk_buff *ans_skb = NULL;
-	void *data;
-	struct nlattr *attr;
-	struct ofp_data_hello *odh;
-	size_t odh_max_len, odh_len, port_max_len, len;
-	int port_count;
-
-	if (!info->attrs[DP_GENL_A_DP_IDX])
-		return -EINVAL;
-
-	mutex_lock(&dp_mutex);
-	dp = dp_get(nla_get_u32((info->attrs[DP_GENL_A_DP_IDX])));
-	if (!dp)
-		goto error;
-
-	/* Overallocate, since we can't reliably determine the number of
-	 * ports a priori. */
-	port_max_len = sizeof(struct ofp_phy_port) * OFPP_MAX;
-
-	len = nla_total_size(sizeof(*odh) + port_max_len)
-			+ nla_total_size(sizeof(uint32_t));
-
-	ans_skb = nlmsg_new(len, GFP_KERNEL);
-	if (!ans_skb)
-		goto error;
-
-	data = genlmsg_put_reply(ans_skb, info, &dp_genl_family,
-				 0, DP_GENL_C_SHOW_DP);
-	if (data == NULL) 
-		goto error;
-
-	NLA_PUT_U32(ans_skb, DP_GENL_A_DP_IDX, dp->dp_idx);
-
-	odh_max_len = sizeof(*odh) + port_max_len;
-	attr = nla_reserve(ans_skb, DP_GENL_A_DP_INFO, odh_max_len);
-	if (!attr)
-		goto error;
-	odh = nla_data(attr);
-	port_count = fill_data_hello(dp, odh);
-
-	/* Only now that we know how many ports we've added can we say
-	 * say something about the length. */
-	odh_len = sizeof(*odh) + (sizeof(struct ofp_phy_port) * port_count);
-	odh->header.length = htons(odh_len);
-
-	/* Take back the unused part that was reserved */
-	nla_unreserve(ans_skb, attr, (odh_max_len - odh_len));
-
-	genlmsg_end(ans_skb, data);
-	err = genlmsg_reply(ans_skb, info);
-	if (!err)
-		ans_skb = NULL;
-
-error:
-nla_put_failure:
-	if (ans_skb)
-		kfree_skb(ans_skb);
-	mutex_unlock(&dp_mutex);
-	return err;
-}
-
-static struct genl_ops dp_genl_ops_show_dp = {
-	.cmd = DP_GENL_C_SHOW_DP,
-	.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
-	.policy = dp_genl_policy,
-	.doit = dp_genl_show,
-	.dumpit = NULL,
-};
-
-/* Convenience function */
-static
-void* 
-dp_init_nl_flow_msg(uint32_t dp_idx, uint16_t table_idx, 
-		struct genl_info *info, struct sk_buff* skb)
-{
-	void* data;
-
-	data = genlmsg_put_reply(skb, info, &dp_genl_family, 0, 
-				DP_GENL_C_QUERY_FLOW);
-	if (data == NULL)
-		return NULL;
-	NLA_PUT_U32(skb, DP_GENL_A_DP_IDX,   dp_idx);
-	NLA_PUT_U16(skb, DP_GENL_A_TABLEIDX, table_idx);
-
-	return data;
-
-nla_put_failure:
-	return NULL;
-}
-
-/*  Iterate through the specified table and send all flow entries over
- *  netlink to userspace.  Each flow message has the following format:
- *
- *  32bit dpix
- *  16bit tabletype
- *  32bit number of flows
- *  openflow-flow-entries
- *
- *  The full table may require multiple messages.  A message with 0 flows
- *  signifies end-of message.
- */
-
-static 
-int 
-dp_dump_table(struct datapath *dp, uint16_t table_idx, struct genl_info *info, struct ofp_flow_mod* matchme) 
-{ 
-	struct sk_buff  *skb = 0; 
-	struct sw_table *table = 0;
-	struct swt_iterator iter;
-	struct sw_flow_key in_flow; 
-	struct nlattr   *attr;
-	int count = 0, sum_count = 0;
-	void *data; 
-	uint8_t* ofm_ptr = 0;
-	struct nlattr   *num_attr; 
-	int err = -ENOMEM;
-
-	table = dp->chain->tables[table_idx]; 
-	if ( table == NULL ) {
-		dprintk("dp::dp_dump_table error, non-existant table at position %d\n", table_idx);
-		return -EINVAL;
-	}
-
-	if (!table->iterator(table, &iter)) {
-		dprintk("dp::dp_dump_table couldn't initialize empty table iterator\n");
-		return -ENOMEM;
-	}
-
-	while (iter.flow) {
-
-		/* verify that we can fit all NL_FLOWS_PER_MESSAGE in a single
-		 * sk_buf */
-		if( (sizeof(dp_genl_family) + sizeof(uint32_t) + sizeof(uint16_t) + sizeof(uint32_t) + 
-					(NL_FLOWS_PER_MESSAGE * sizeof(struct ofp_flow_mod))) > (8192 - 64)){
-			dprintk("dp::dp_dump_table NL_FLOWS_PER_MESSAGE may cause overrun in skbuf\n");
-			return -ENOMEM;
-		}
-
-		skb = nlmsg_new(8192 - 64, GFP_ATOMIC);
-		if (skb == NULL) {
-			return -ENOMEM;
-		}
-
-		data = dp_init_nl_flow_msg(dp->dp_idx, table_idx, info, skb);
-		if (data == NULL){
-			err= -ENOMEM;	
-			goto error_free_skb;
-		} 
-
-		/* reserve space to put the number of flows for this message, to
-		 * be filled after the loop*/
-		num_attr = nla_reserve(skb, DP_GENL_A_NUMFLOWS, sizeof(uint32_t));
-		if(!num_attr){
-			err = -ENOMEM;
-			goto error_free_skb;
-		}
-
-		/* Only load NL_FLOWS_PER_MESSAGE flows at a time */
-		attr = nla_reserve(skb, DP_GENL_A_FLOW, 
-				(sizeof(struct ofp_flow_mod) + sizeof(struct ofp_action)) * NL_FLOWS_PER_MESSAGE);
-		if (!attr){
-			err = -ENOMEM;
-			goto error_free_skb;
-		}
-
-		/* internal loop to fill NL_FLOWS_PER_MESSAGE flows */
-		ofm_ptr = nla_data(attr);
-		flow_extract_match(&in_flow, &matchme->match);
-		while (iter.flow && count < NL_FLOWS_PER_MESSAGE) {
-			if(flow_matches(&in_flow, &iter.flow->key)){
-				if((err = dp_fill_flow((struct ofp_flow_mod*)ofm_ptr, &iter))) 
-					goto error_free_skb;
-				count++; 
-				/* TODO support multiple actions  */
-				ofm_ptr += sizeof(struct ofp_flow_mod) + sizeof(struct ofp_action);
-			}
-			table->iterator_next(&iter);
-		}
-
-		*((uint32_t*)nla_data(num_attr)) = count;
-		genlmsg_end(skb, data); 
-
-		sum_count += count; 
-		count = 0;
-
-		err = genlmsg_unicast(skb, info->snd_pid); 
-		skb = 0;
-	}
-
-	/* send a sentinal message saying we're done */
-	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
-	if (skb == NULL) {
-		return -ENOMEM;
-	}
-	data = dp_init_nl_flow_msg(dp->dp_idx, table_idx, info, skb);
-	if (data == NULL){
-		err= -ENOMEM;	
-		goto error_free_skb;
-	} 
-
-	NLA_PUT_U32(skb, DP_GENL_A_NUMFLOWS,   0);
-	/* dummy flow so nl doesn't complain */
-	attr = nla_reserve(skb, DP_GENL_A_FLOW, sizeof(struct ofp_flow_mod));
-	if (!attr){
-		err = -ENOMEM;
-		goto error_free_skb;
-	}
-	genlmsg_end(skb, data); 
-	err = genlmsg_reply(skb, info); skb = 0;
-
-nla_put_failure:
-error_free_skb:
-	if(skb)
-		kfree_skb(skb);
-	return err;
-}
-
-/* Helper function to query_table which creates and sends a message packed with
- * table stats.  Message form is:
- *
- * u32 DP_IDX
- * u32 NUM_TABLES
- * OFP_TABLE (list of OFP_TABLES)
- *
- */
-
-static 
-int 
-dp_dump_table_stats(struct datapath *dp, int dp_idx, struct genl_info *info) 
-{ 
-	struct sk_buff   *skb = 0; 
-	struct ofp_table *ot = 0;
-	struct nlattr   *attr;
-	struct sw_table_stats stats; 
-	void *data; 
-	int err = -ENOMEM;
-	int i = 0;
-	int nt = dp->chain->n_tables;
-
-	/* u32 IDX, u32 NUMTABLES, list-of-tables */
-	skb = nlmsg_new(4 + 4 + (sizeof(struct ofp_table) * nt), GFP_ATOMIC);
-	if (skb == NULL) {
-		return -ENOMEM;
-	}
-	
-	data = genlmsg_put_reply(skb, info, &dp_genl_family, 0, 
-				DP_GENL_C_QUERY_TABLE);
-	if (data == NULL){
-		return -ENOMEM;
-	} 
-
-	NLA_PUT_U32(skb, DP_GENL_A_DP_IDX,	dp_idx);
-	NLA_PUT_U32(skb, DP_GENL_A_NUMTABLES, nt);
-
-	/* ... we assume that all tables can fit in a single message.
-	 * Probably a reasonable assumption seeing that we only have
-	 * 3 atm */
-	attr = nla_reserve(skb, DP_GENL_A_TABLE, (sizeof(struct ofp_table) * nt));
-	if (!attr){
-		err = -ENOMEM;
-		goto error_free_skb;
-	}
-
-	ot = nla_data(attr);
-
-	for (i = 0; i < nt; ++i) {
-		dp->chain->tables[i]->stats(dp->chain->tables[i], &stats);
-		ot->header.version = OFP_VERSION;
-		ot->header.type    = OFPT_TABLE;
-		ot->header.length  = htons(sizeof(struct ofp_table));
-		ot->header.xid     = htonl(0);
-
-		strncpy(ot->name, stats.name, OFP_MAX_TABLE_NAME_LEN); 
-		ot->table_id  = htons(i);
-		ot->n_flows   = htonl(stats.n_flows);
-		ot->max_flows = htonl(stats.max_flows);
-		ot++;
-	}
-
-
-	genlmsg_end(skb, data); 
-	err = genlmsg_reply(skb, info); skb = 0;
-
-nla_put_failure:
-error_free_skb:
-	if(skb)
-		kfree_skb(skb);
-	return err;
-}
-
-/* 
- * Queries a datapath for flow-table statistics 
- */
-
-
-static int dp_genl_table_query(struct sk_buff *skb, struct genl_info *info)
-{
-	struct   datapath* dp;
-	int	  err = 0;
-
-	if (!info->attrs[DP_GENL_A_DP_IDX]) {
-		dprintk("dp::dp_genl_table_query received message with missing attributes\n");
-		return -EINVAL;
-	}
-
-	rcu_read_lock();
-	dp = dp_get(nla_get_u32(info->attrs[DP_GENL_A_DP_IDX]));
-	if (!dp) {
-		err = -ENOENT;
-		goto err_out;
-	}
-
-	err = dp_dump_table_stats(dp, nla_get_u32(info->attrs[DP_GENL_A_DP_IDX]), info); 
-
-err_out:
-	rcu_read_unlock();
-	return err;
-}
-
-/* 
- * Queries a datapath for flow-table entries.
- */
-
-static int dp_genl_flow_query(struct sk_buff *skb, struct genl_info *info)
-{
-	struct datapath* dp;
-	struct ofp_flow_mod*  ofm;
-	u16	table_idx;
-	int	err = 0;
-
-	if (!info->attrs[DP_GENL_A_DP_IDX]
-				|| !info->attrs[DP_GENL_A_TABLEIDX]
-				|| !info->attrs[DP_GENL_A_FLOW]) {
-		dprintk("dp::dp_genl_flow_query received message with missing attributes\n");
-		return -EINVAL;
-	}
-
-	rcu_read_lock();
-	dp = dp_get(nla_get_u32(info->attrs[DP_GENL_A_DP_IDX]));
-	if (!dp) {
-		err = -ENOENT;
-		goto err_out;
-	}
-
-	table_idx = nla_get_u16(info->attrs[DP_GENL_A_TABLEIDX]);
-
-	if (dp->chain->n_tables <= table_idx){
-		printk("table index %d invalid (dp has %d tables)\n",
-				table_idx, dp->chain->n_tables);
-	err = -EINVAL;
-		goto err_out;
-	}
-
-	ofm = nla_data(info->attrs[DP_GENL_A_FLOW]);
-	err = dp_dump_table(dp, table_idx, info, ofm); 
-
-err_out:
-	rcu_read_unlock();
-	return err;
-}
-
-static struct nla_policy dp_genl_flow_policy[DP_GENL_A_MAX + 1] = {
-	[DP_GENL_A_DP_IDX]	= { .type = NLA_U32 },
-	[DP_GENL_A_TABLEIDX] = { .type = NLA_U16 },
-	[DP_GENL_A_NUMFLOWS]  = { .type = NLA_U32 },
-};
-
-static struct genl_ops dp_genl_ops_query_flow = {
-	.cmd	= DP_GENL_C_QUERY_FLOW,
-	.flags  = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
-	.policy = dp_genl_flow_policy,
-	.doit   = dp_genl_flow_query,
-	.dumpit = NULL,
-};
-
-static struct nla_policy dp_genl_table_policy[DP_GENL_A_MAX + 1] = {
-	[DP_GENL_A_DP_IDX]	= { .type = NLA_U32 },
-};
-
-static struct genl_ops dp_genl_ops_query_table = {
-	.cmd	= DP_GENL_C_QUERY_TABLE,
-	.flags  = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
-	.policy = dp_genl_table_policy,
-	.doit   = dp_genl_table_query,
-	.dumpit = NULL,
-};
-
-
 static struct genl_ops dp_genl_ops_query_dp = {
 	.cmd = DP_GENL_C_QUERY_DP,
 	.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
@@ -1423,7 +1124,6 @@ static int dp_genl_add_del_port(struct sk_buff *skb, struct genl_info *info)
 		return -EINVAL;
 
 	/* Get datapath. */
-	mutex_lock(&dp_mutex);
 	dp = dp_get(nla_get_u32(info->attrs[DP_GENL_A_DP_IDX]));
 	if (!dp) {
 		err = -ENOENT;
@@ -1452,7 +1152,6 @@ static int dp_genl_add_del_port(struct sk_buff *skb, struct genl_info *info)
 out_put:
 	dev_put(port);
 out:
-	mutex_unlock(&dp_mutex);
 	return err;
 }
 
@@ -1476,24 +1175,29 @@ static int dp_genl_openflow(struct sk_buff *skb, struct genl_info *info)
 {
 	struct nlattr *va = info->attrs[DP_GENL_A_OPENFLOW];
 	struct datapath *dp;
+	struct ofp_header *oh;
+	struct sender sender;
 	int err;
 
 	if (!info->attrs[DP_GENL_A_DP_IDX] || !va)
 		return -EINVAL;
 
-	rcu_read_lock();
 	dp = dp_get(nla_get_u32(info->attrs[DP_GENL_A_DP_IDX]));
-	if (!dp) {
-		err = -ENOENT;
-		goto out;
-	}
+	if (!dp)
+		return -ENOENT;
 
-	va = info->attrs[DP_GENL_A_OPENFLOW];
+	if (nla_len(va) < sizeof(struct ofp_header))
+		return -EINVAL;
+	oh = nla_data(va);
 
-	err = fwd_control_input(dp->chain, nla_data(va), nla_len(va));
+	sender.xid = oh->xid;
+	sender.pid = info->snd_pid;
+	sender.seq = info->snd_seq;
 
-out:
-	rcu_read_unlock();
+	mutex_lock(&dp_mutex);
+	err = fwd_control_input(dp->chain, &sender,
+				nla_data(va), nla_len(va));
+	mutex_unlock(&dp_mutex);
 	return err;
 }
 
@@ -1501,26 +1205,457 @@ static struct nla_policy dp_genl_openflow_policy[DP_GENL_A_MAX + 1] = {
 	[DP_GENL_A_DP_IDX] = { .type = NLA_U32 },
 };
 
-static struct genl_ops dp_genl_ops_openflow = {
-	.cmd = DP_GENL_C_OPENFLOW,
-	.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
-	.policy = dp_genl_openflow_policy,
-	.doit = dp_genl_openflow,
-	.dumpit = NULL,
+static int desc_stats_dump(struct datapath *dp, void *state,
+			    void *body, int *body_len)
+{
+	struct ofp_desc_stats *ods = body;
+	int n_bytes = sizeof *ods;	/* the reply body is exactly one ofp_desc_stats */
+
+	if (n_bytes > *body_len) {	/* caller's buffer is too small */
+		return -ENOBUFS;
+	}
+	*body_len = n_bytes;	/* report how much of 'body' is used */
+
+	strncpy(ods->mfr_desc, mfr_desc, sizeof ods->mfr_desc);	/* sources are the file-level description strings */
+	strncpy(ods->hw_desc, hw_desc, sizeof ods->hw_desc);
+	strncpy(ods->sw_desc, sw_desc, sizeof ods->sw_desc);
+	strncpy(ods->serial_num, serial_num, sizeof ods->serial_num);
+
+	return 0;
+}
+
+struct flow_stats_state {
+	int table_idx;	/* index into dp->chain->tables of the table being dumped */
+	struct sw_table_position position;	/* passed to table->iterate(); zeroed for each new table */
+	const struct ofp_flow_stats_request *rq;	/* request body given to flow_stats_init() */
+
+	void *body;	/* reply buffer for the current flow_stats_dump() call */
+	int bytes_used, bytes_allocated;	/* bytes written to 'body' so far, and its capacity */
 };
 
-static struct nla_policy dp_genl_benchmark_policy[DP_GENL_A_MAX + 1] = {
-	[DP_GENL_A_DP_IDX] = { .type = NLA_U32 },
-	[DP_GENL_A_NPACKETS] = { .type = NLA_U32 },
-	[DP_GENL_A_PSIZE] = { .type = NLA_U32 },
+static int flow_stats_init(struct datapath *dp, const void *body, int body_len,
+			   void **state)
+{
+	const struct ofp_flow_stats_request *fsr = body;
+	struct flow_stats_state *s = kmalloc(sizeof *s, GFP_ATOMIC);
+	if (!s)
+		return -ENOMEM;
+	s->table_idx = fsr->table_id == 0xff ? 0 : fsr->table_id;	/* 0xff matches every table, so start at table 0 */
+	memset(&s->position, 0, sizeof s->position);
+	s->rq = fsr;	/* NOTE(review): keeps a pointer into the request; assumes it outlives the dump -- confirm */
+	*state = s;	/* presumably released later through flow_stats_done() */
+	return 0;
+}
+
+static int flow_stats_dump_callback(struct sw_flow *flow, void *private)
+{
+	struct flow_stats_state *s = private;
+	struct ofp_flow_stats *ofs;
+	int actions_length;	/* bytes taken by this flow's action array */
+	int length;	/* whole entry: fixed part plus actions */
+
+	actions_length = sizeof *ofs->actions * flow->n_actions;
+	length = sizeof *ofs + actions_length;
+	if (length + s->bytes_used > s->bytes_allocated)
+		return 1;	/* entry does not fit in the space left in the reply */
+
+	ofs = s->body + s->bytes_used;	/* append after the entries already written */
+	ofs->length          = htons(length);
+	ofs->table_id        = s->table_idx;
+	ofs->pad             = 0;
+	ofs->match.wildcards = htonl(flow->key.wildcards);
+	ofs->match.in_port   = flow->key.in_port;	/* key fields below are copied without conversion */
+	memcpy(ofs->match.dl_src, flow->key.dl_src, ETH_ALEN);
+	memcpy(ofs->match.dl_dst, flow->key.dl_dst, ETH_ALEN);
+	ofs->match.dl_vlan   = flow->key.dl_vlan;
+	ofs->match.dl_type   = flow->key.dl_type;
+	ofs->match.nw_src    = flow->key.nw_src;
+	ofs->match.nw_dst    = flow->key.nw_dst;
+	ofs->match.nw_proto  = flow->key.nw_proto;
+	ofs->match.pad       = 0;
+	ofs->match.tp_src    = flow->key.tp_src;
+	ofs->match.tp_dst    = flow->key.tp_dst;
+	ofs->duration        = htonl((jiffies - flow->init_time) / HZ);	/* whole seconds since init_time */
+	ofs->priority        = htons(flow->priority);
+	ofs->idle_timeout    = htons(flow->idle_timeout);
+	ofs->hard_timeout    = htons(flow->hard_timeout);
+	memset(ofs->pad2, 0, sizeof ofs->pad2);
+	ofs->packet_count    = cpu_to_be64(flow->packet_count);
+	ofs->byte_count      = cpu_to_be64(flow->byte_count);
+	memcpy(ofs->actions, flow->actions, actions_length);
+
+	s->bytes_used += length;
+	return 0;
+}
+
+static int flow_stats_dump(struct datapath *dp, void *state,
+			   void *body, int *body_len)
+{
+	struct flow_stats_state *s = state;
+	struct sw_flow_key match_key;
+	int error = 0;
+
+	s->bytes_used = 0;	/* each call starts with an empty reply buffer */
+	s->bytes_allocated = *body_len;
+	s->body = body;
+
+	flow_extract_match(&match_key, &s->rq->match);
+	while (s->table_idx < dp->chain->n_tables
+	       && (s->rq->table_id == 0xff || s->rq->table_id == s->table_idx))
+	{
+		struct sw_table *table = dp->chain->tables[s->table_idx];
+
+		error = table->iterate(table, &match_key, &s->position,
+				       flow_stats_dump_callback, s);
+		if (error)
+			break;	/* table_idx and position are kept for the next call */
+
+		s->table_idx++;
+		memset(&s->position, 0, sizeof s->position);	/* next table is read from its start */
+	}
+	*body_len = s->bytes_used;
+
+	/* If error is 0, we're done.
+	 * Otherwise, if some bytes were used, there are more flows to come.
+	 * Otherwise, we were not able to fit even a single flow in the body,
+	 * which indicates that we have a single flow with too many actions to
+	 * fit.  We won't ever make any progress at that rate, so give up. */
+	return !error ? 0 : s->bytes_used ? 1 : -ENOMEM;
+}
+
+static void flow_stats_done(void *state)
+{
+	kfree(state);	/* presumably the flow_stats_state allocated by flow_stats_init() */
+}
+
+static int aggregate_stats_init(struct datapath *dp,
+				const void *body, int body_len,
+				void **state)
+{
+	*state = (void *)body;	/* nothing is allocated: the state is the request body, const cast away */
+	return 0;
+}
+
+static int aggregate_stats_dump_callback(struct sw_flow *flow, void *private)
+{
+	struct ofp_aggregate_stats_reply *rpy = private;
+	rpy->packet_count += flow->packet_count;	/* plain additions; aggregate_stats_dump() byte-swaps the totals afterwards */
+	rpy->byte_count += flow->byte_count;
+	rpy->flow_count++;
+	return 0;
+}
+
+static int aggregate_stats_dump(struct datapath *dp, void *state,
+				void *body, int *body_len)
+{
+	struct ofp_aggregate_stats_request *rq = state;	/* the request body stored by aggregate_stats_init() */
+	struct ofp_aggregate_stats_reply *rpy;
+	struct sw_table_position position;
+	struct sw_flow_key match_key;
+	int table_idx;
+
+	if (*body_len < sizeof *rpy)
+		return -ENOBUFS;
+	rpy = body;
+	*body_len = sizeof *rpy;	/* the reply is always one fixed-size record */
+
+	memset(rpy, 0, sizeof *rpy);	/* counters start from zero */
+
+	flow_extract_match(&match_key, &rq->match);
+	table_idx = rq->table_id == 0xff ? 0 : rq->table_id;	/* 0xff matches every table */
+	memset(&position, 0, sizeof position);
+	while (table_idx < dp->chain->n_tables
+	       && (rq->table_id == 0xff || rq->table_id == table_idx))
+	{
+		struct sw_table *table = dp->chain->tables[table_idx];
+		int error;
+
+		error = table->iterate(table, &match_key, &position,
+				       aggregate_stats_dump_callback, rpy);
+		if (error)
+			return error;
+
+		table_idx++;
+		memset(&position, 0, sizeof position);
+	}
+
+	rpy->packet_count = cpu_to_be64(rpy->packet_count);	/* convert the accumulated totals in place */
+	rpy->byte_count = cpu_to_be64(rpy->byte_count);
+	rpy->flow_count = htonl(rpy->flow_count);
+	return 0;
+}
+
+static int table_stats_dump(struct datapath *dp, void *state,
+			    void *body, int *body_len)
+{
+	struct ofp_table_stats *ots;
+	int n_bytes = dp->chain->n_tables * sizeof *ots;	/* one record per table, all in a single reply */
+	int i;
+	if (n_bytes > *body_len)
+		return -ENOBUFS;
+	*body_len = n_bytes;
+	for (i = 0, ots = body; i < dp->chain->n_tables; i++, ots++) {
+		struct sw_table_stats stats;
+		dp->chain->tables[i]->stats(dp->chain->tables[i], &stats);
+		strncpy(ots->name, stats.name, sizeof ots->name);	/* strncpy leaves no NUL if the name fills the field */
+		ots->table_id = i;
+		ots->wildcards = htonl(stats.wildcards);
+		memset(ots->pad, 0, sizeof ots->pad);
+		ots->max_entries = htonl(stats.max_flows);
+		ots->active_count = htonl(stats.n_flows);
+		ots->matched_count = cpu_to_be64(stats.n_matched);
+	}
+	return 0;
+}
+
+struct port_stats_state {
+	int port;	/* index into dp->ports[] where the next port_stats_dump() call resumes */
+};
+
+static int port_stats_init(struct datapath *dp, const void *body, int body_len,
+			   void **state)
+{
+	struct port_stats_state *s = kmalloc(sizeof *s, GFP_ATOMIC);
+	if (!s)
+		return -ENOMEM;
+	s->port = 0;	/* the dump starts at dp->ports[0] */
+	*state = s;	/* presumably released later through port_stats_done() */
+	return 0;
+}
+
+static int port_stats_dump(struct datapath *dp, void *state,
+			   void *body, int *body_len)
+{
+	struct port_stats_state *s = state;
+	struct ofp_port_stats *ops;
+	int n_ports, max_ports;
+	int i;
+
+	max_ports = *body_len / sizeof *ops;	/* whole records that fit in 'body' */
+	if (!max_ports)
+		return -ENOMEM;
+	ops = body;
+
+	n_ports = 0;
+	for (i = s->port; i < OFPP_MAX && n_ports < max_ports; i++) {
+		struct net_bridge_port *p = dp->ports[i];
+		struct net_device_stats *stats;
+		if (!p)
+			continue;	/* empty slot in dp->ports[] */
+		stats = p->dev->get_stats(p->dev);
+		ops->port_no = htons(p->port_no);
+		memset(ops->pad, 0, sizeof ops->pad);
+		ops->rx_packets   = cpu_to_be64(stats->rx_packets);
+		ops->tx_packets   = cpu_to_be64(stats->tx_packets);
+		ops->rx_bytes     = cpu_to_be64(stats->rx_bytes);
+		ops->tx_bytes     = cpu_to_be64(stats->tx_bytes);
+		ops->rx_dropped   = cpu_to_be64(stats->rx_dropped);
+		ops->tx_dropped   = cpu_to_be64(stats->tx_dropped);
+		ops->rx_errors    = cpu_to_be64(stats->rx_errors);
+		ops->tx_errors    = cpu_to_be64(stats->tx_errors);
+		ops->rx_frame_err = cpu_to_be64(stats->rx_frame_errors);
+		ops->rx_over_err  = cpu_to_be64(stats->rx_over_errors);
+		ops->rx_crc_err   = cpu_to_be64(stats->rx_crc_errors);
+		ops->collisions   = cpu_to_be64(stats->collisions);
+		n_ports++;
+		ops++;
+	}
+	s->port = i;	/* resume point for the next call */
+	*body_len = n_ports * sizeof *ops;
+	return n_ports >= max_ports;	/* 1 when the buffer was filled, 0 otherwise */
+}
+
+static void port_stats_done(void *state)
+{
+	kfree(state);	/* Frees the cursor that port_stats_init() allocated. */
+}
+
+struct stats_type {
+	/* Minimum and maximum acceptable number of bytes in the 'body' member
+	 * of struct ofp_stats_request. */
+	size_t min_body, max_body;
+
+	/* Prepares to dump some kind of statistics on 'dp'.  'body' and
+	 * 'body_len' give the 'body' member of the struct ofp_stats_request.
+	 * Returns zero if successful, otherwise a negative error code.
+	 * May store state for later dump and done calls in '*state'.  May be
+	 * null if no initialization is required. */
+	int (*init)(struct datapath *dp, const void *body, int body_len,
+		    void **state);
+
+	/* Dumps statistics for 'dp' into the '*body_len' bytes at 'body' (sent
+	 * as the 'body' of struct ofp_stats_reply) and sets '*body_len' to the
+	 * number of bytes used.  Returns 0 when done, a positive value if
+	 * more replies must follow, or a negative error code. */
+	int (*dump)(struct datapath *dp, void *state,
+		    void *body, int *body_len);
+
+	/* Cleans up any state created by the init or dump functions.  May be
+	 * null if no cleanup is required. */
+	void (*done)(void *state);
+};
+
+static const struct stats_type stats[] = {	/* Indexed by OFPST_* type. */
+	[OFPST_DESC] = {
+		.min_body = 0,
+		.max_body = 0,
+		.init = NULL,
+		.dump = desc_stats_dump,
+		.done = NULL
+	},
+	[OFPST_FLOW] = {
+		.min_body = sizeof(struct ofp_flow_stats_request),
+		.max_body = sizeof(struct ofp_flow_stats_request),
+		.init = flow_stats_init,
+		.dump = flow_stats_dump,
+		.done = flow_stats_done
+	},
+	[OFPST_AGGREGATE] = {
+		.min_body = sizeof(struct ofp_aggregate_stats_request),
+		.max_body = sizeof(struct ofp_aggregate_stats_request),
+		.init = aggregate_stats_init,
+		.dump = aggregate_stats_dump,
+		.done = NULL
+	},
+	[OFPST_TABLE] = {
+		.min_body = 0,
+		.max_body = 0,
+		.init = NULL,
+		.dump = table_stats_dump,
+		.done = NULL
+	},
+	[OFPST_PORT] = {
+		.min_body = 0,
+		.max_body = 0,
+		.init = port_stats_init,
+		.dump = port_stats_dump,
+		.done = port_stats_done
+	},
 };
 
-static struct genl_ops dp_genl_ops_benchmark_nl = {
-	.cmd = DP_GENL_C_BENCHMARK_NL,
+static int
+dp_genl_openflow_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct datapath *dp;
+	struct sender sender;
+	const struct stats_type *s;
+	struct ofp_stats_reply *osr;
+	int max_openflow_len, body_len;
+	void *body;
+	int dp_idx, err;
+
+	/* Set up the cleanup function for this dump.  Linux 2.6.20 and later
+	 * support setting up cleanup functions via the .doneit member of
+	 * struct genl_ops.  This kluge supports earlier versions also. */
+	cb->done = dp_genl_openflow_done;
+
+	/* Each call builds one OFPT_STATS_REPLY in 'skb'.  Progress is kept in
+	 * cb->args[]: [0] phase, [1] dp_idx, [2] type, [3] xid, [4] state. */
+	if (!cb->args[0]) {	/* First call: parse and validate. */
+		struct nlattr *attrs[DP_GENL_A_MAX + 1];
+		struct ofp_stats_request *rq;
+		struct nlattr *va;
+		size_t len, rq_body_len;
+		int type;
+
+		err = nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, DP_GENL_A_MAX,
+				  dp_genl_openflow_policy);
+		if (err < 0)
+			return err;
+
+		if (!attrs[DP_GENL_A_DP_IDX])
+			return -EINVAL;
+		dp_idx = nla_get_u16(attrs[DP_GENL_A_DP_IDX]);
+		dp = dp_get(dp_idx);
+		if (!dp)
+			return -ENOENT;
+
+		va = attrs[DP_GENL_A_OPENFLOW];
+		if (!va || nla_len(va) < sizeof *rq)	/* NULL-check first. */
+			return -EINVAL;
+		len = nla_len(va);
+
+		rq = nla_data(va);
+		type = ntohs(rq->type);
+		if (rq->header.version != OFP_VERSION
+		    || rq->header.type != OFPT_STATS_REQUEST
+		    || ntohs(rq->header.length) != len
+		    || type >= ARRAY_SIZE(stats)
+		    || !stats[type].dump)
+			return -EINVAL;
+
+		s = &stats[type];
+		rq_body_len = len - offsetof(struct ofp_stats_request, body);
+		if (rq_body_len < s->min_body || rq_body_len > s->max_body)
+			return -EINVAL;
+
+		cb->args[0] = 1;
+		cb->args[1] = dp_idx;
+		cb->args[2] = type;
+		cb->args[3] = rq->header.xid;
+		if (s->init) {
+			void *state;
+			err = s->init(dp, rq->body, rq_body_len, &state);
+			if (err)
+				return err;
+			cb->args[4] = (long) state;
+		}
+	} else if (cb->args[0] == 1) {	/* Dump already in progress. */
+		dp_idx = cb->args[1];
+		s = &stats[cb->args[2]];
+
+		dp = dp_get(dp_idx);
+		if (!dp)
+			return -ENOENT;
+	} else {			/* Final reply was already sent. */
+		return 0;
+	}
+
+	sender.xid = cb->args[3];
+	sender.pid = NETLINK_CB(cb->skb).pid;
+	sender.seq = cb->nlh->nlmsg_seq;
+
+	osr = put_openflow_headers(dp, skb, OFPT_STATS_REPLY, &sender,
+				   &max_openflow_len);
+	if (IS_ERR(osr))
+		return PTR_ERR(osr);
+	osr->type = htons(s - stats);
+	osr->flags = 0;
+	resize_openflow_skb(skb, &osr->header, max_openflow_len);
+	body = osr->body;
+	body_len = max_openflow_len - offsetof(struct ofp_stats_reply, body);
+
+	err = s->dump(dp, (void *) cb->args[4], body, &body_len);
+	if (err >= 0) {
+		if (!err)
+			cb->args[0] = 2;	/* Nothing more to dump. */
+		else
+			osr->flags = htons(OFPSF_REPLY_MORE);
+		resize_openflow_skb(skb, &osr->header,
+				    (offsetof(struct ofp_stats_reply, body)
+				     + body_len));
+		err = skb->len;
+	}
+	return err;
+}
+
+static int
+dp_genl_openflow_done(struct netlink_callback *cb)
+{
+	/* args[0] is nonzero once the dump request has been validated. */
+	void (*cleanup)(void *) = cb->args[0] ? stats[cb->args[2]].done : NULL;
+
+	if (cleanup)
+		cleanup((void *) cb->args[4]);
+	return 0;
+}
+
+static struct genl_ops dp_genl_ops_openflow = {
+	.cmd = DP_GENL_C_OPENFLOW,
 	.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
-	.policy = dp_genl_benchmark_policy,
-	.doit = dp_genl_benchmark_nl,
-	.dumpit = NULL,
+	.policy = dp_genl_openflow_policy,
+	.doit = dp_genl_openflow,
+	.dumpit = dp_genl_openflow_dumpit, /* Answers OFPT_STATS_REQUEST. */
 };
 
 static struct genl_ops *dp_genl_all_ops[] = {
@@ -1529,15 +1664,11 @@ static struct genl_ops *dp_genl_all_ops[] = {
 	 * front. */
 	&dp_genl_ops_openflow,
 
-	&dp_genl_ops_query_flow,
-	&dp_genl_ops_query_table,
-	&dp_genl_ops_show_dp,
 	&dp_genl_ops_add_dp,
 	&dp_genl_ops_del_dp,
 	&dp_genl_ops_query_dp,
 	&dp_genl_ops_add_port,
 	&dp_genl_ops_del_port,
-	&dp_genl_ops_benchmark_nl,
 };
 
 static int dp_init_netlink(void)
@@ -1572,19 +1703,12 @@ static void dp_uninit_netlink(void)
 	genl_unregister_family(&dp_genl_family);
 }
 
-#define DRV_NAME		"openflow"
-#define DRV_VERSION	 VERSION
-#define DRV_DESCRIPTION "OpenFlow switching datapath implementation"
-#define DRV_COPYRIGHT   "Copyright (c) 2007, 2008 The Board of Trustees of The Leland Stanford Junior University"
-
-
 static int __init dp_init(void)
 {
 	int err;
 
-	printk(KERN_INFO DRV_NAME ": " DRV_DESCRIPTION "\n");
-	printk(KERN_INFO DRV_NAME ": " VERSION" built on "__DATE__" "__TIME__"\n");
-	printk(KERN_INFO DRV_NAME ": " DRV_COPYRIGHT "\n");
+	printk("OpenFlow "VERSION", built "__DATE__" "__TIME__", "
+	       "protocol 0x%02x\n", OFP_VERSION);
 
 	err = flow_init();
 	if (err)
@@ -1620,6 +1744,6 @@ static void dp_cleanup(void)
 module_init(dp_init);
 module_exit(dp_cleanup);
 
-MODULE_DESCRIPTION(DRV_DESCRIPTION);
-MODULE_AUTHOR(DRV_COPYRIGHT);
+MODULE_DESCRIPTION("OpenFlow switching datapath");
+MODULE_AUTHOR("Copyright (c) 2007, 2008 The Board of Trustees of The Leland Stanford Junior University");
 MODULE_LICENSE("GPL");