Merge branch 'mainstream'

author Giuseppe Lettieri <g.lettieri@iet.unipi.it>

Thu, 15 Aug 2013 18:43:14 +0000 (20:43 +0200)

committer Giuseppe Lettieri <g.lettieri@iet.unipi.it>

Thu, 15 Aug 2013 18:43:14 +0000 (20:43 +0200)
author Giuseppe Lettieri <g.lettieri@iet.unipi.it>
Thu, 15 Aug 2013 18:43:14 +0000 (20:43 +0200)
committer Giuseppe Lettieri <g.lettieri@iet.unipi.it>
Thu, 15 Aug 2013 18:43:14 +0000 (20:43 +0200)
diff --git a/FAQ b/FAQ

index 810803e..75d9007 100644 (file)
--- a/FAQ
+++ b/FAQ
@@ -148,7 +148,7 @@ A: The following table lists the Linux kernel versions against which the
         1.9.x      2.6.18 to 3.8
         1.10.x     2.6.18 to 3.8
         1.11.x     2.6.18 to 3.8
-       1.12.x     2.6.18 to 3.9
+       1.12.x     2.6.18 to 3.10
  
     Open vSwitch userspace should also work with the Linux kernel module
     built into Linux 3.3 and later.
diff --git a/NEWS b/NEWS

index f9953ab..1246383 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -10,6 +10,9 @@ v1.12.0 - xx xxx xxxx
        * New support for matching outer source and destination IP address
          of tunneled packets, for tunnel ports configured with the newly
          added "remote_ip=flow" and "local_ip=flow" options.
+      * Support for matching on metadata 'pkt_mark' for interacting with
+        other system components. On Linux this corresponds to the skb
+        mark.
      - The Interface table in the database has a new "ifindex" column to
        report the interface's OS-assigned ifindex.
      - New "check-oftest" Makefile target for running OFTest against Open
@@ -19,7 +22,7 @@ v1.12.0 - xx xxx xxxx
        through database paths (e.g. Private key option with the database name
        should look like "--private-key=db:Open_vSwitch,SSL,private_key").
      - Added ovs-dev.py, a utility script helpful for Open vSwitch developers.
-    - Support for Linux kernels up to 3.9
+    - Support for Linux kernels up to 3.10
      - ovs-ofctl:
        * New "ofp-parse" for printing OpenFlow messages read from a file.
  
diff --git a/acinclude.m4 b/acinclude.m4

index 6033bfa..73ee5ce 100644 (file)
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -271,6 +271,7 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [
    OVS_GREP_IFELSE([$KSRC/include/net/checksum.h], [csum_replace4])
    OVS_GREP_IFELSE([$KSRC/include/net/checksum.h], [csum_unfold])
  
+  OVS_GREP_IFELSE([$KSRC/include/net/genetlink.h], [parallel_ops])
    OVS_GREP_IFELSE([$KSRC/include/net/netlink.h], [NLA_NUL_STRING])
    OVS_GREP_IFELSE([$KSRC/include/net/netlink.h], [nla_get_be16])
    OVS_GREP_IFELSE([$KSRC/include/net/netlink.h], [nla_put_be16])
diff --git a/datapath/Modules.mk b/datapath/Modules.mk

index 2ce8888..ccf4dfa 100644 (file)
--- a/datapath/Modules.mk
+++ b/datapath/Modules.mk
@@ -12,7 +12,6 @@ openvswitch_sources = \
         datapath.c \
         dp_notify.c \
         flow.c \
-       tunnel.c \
         vlan.c \
         vport.c \
         vport-gre.c \
@@ -26,7 +25,6 @@ openvswitch_headers = \
         compat.h \
         datapath.h \
         flow.h \
-       tunnel.h \
         vlan.h \
         vport.h \
         vport-internal_dev.h \
diff --git a/datapath/actions.c b/datapath/actions.c

index 0a2def6..2c09d57 100644 (file)
--- a/datapath/actions.c
+++ b/datapath/actions.c
@@ -100,7 +100,7 @@ static int pop_vlan(struct sk_buff *skb)
         if (unlikely(err))
                 return err;
  
-       __vlan_hwaccel_put_tag(skb, ntohs(tci));
+       __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(tci));
         return 0;
  }
  
@@ -112,7 +112,7 @@ static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vla
                 /* push down current VLAN tag */
                 current_tag = vlan_tx_tag_get(skb);
  
-               if (!__vlan_put_tag(skb, current_tag))
+               if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag))
                         return -ENOMEM;
  
                 if (get_ip_summed(skb) == OVS_CSUM_COMPLETE)
@@ -120,7 +120,7 @@ static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vla
                                         + (2 * ETH_ALEN), VLAN_HLEN, 0));
  
         }
-       __vlan_hwaccel_put_tag(skb, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
+       __vlan_hwaccel_put_tag(skb, vlan->vlan_tpid, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
         return 0;
  }
  
diff --git a/datapath/compat.h b/datapath/compat.h

index 4dfd192..8457dbf 100644 (file)
--- a/datapath/compat.h
+++ b/datapath/compat.h
@@ -19,7 +19,12 @@
  #ifndef COMPAT_H
  #define COMPAT_H 1
  
+#include <linux/in.h>
+#include <linux/in_route.h>
  #include <linux/netlink.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
  
  #ifndef HAVE_NLA_NUL_STRING
  static inline int CHECK_NUL_STRING(struct nlattr *attr, int maxlen)
@@ -61,6 +66,13 @@ static inline void skb_clear_rxhash(struct sk_buff *skb)
  #define SET_NETNSOK    .netnsok = true,
  #endif
  
+#ifdef HAVE_PARALLEL_OPS
+#define SET_PARALLEL_OPS       .parallel_ops = true,
+#else
+#define SET_PARALLEL_OPS
+#endif
+
+
  #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
  #ifdef CONFIG_NETFILTER
  static inline u32 skb_get_mark(struct sk_buff *skb)
@@ -106,4 +118,41 @@ static inline void skb_set_mark(struct sk_buff *skb, u32 mark)
  #define inet_sport(sk) (inet_sk(sk)->inet_sport)
  #endif
  
+static inline struct rtable *find_route(struct net *net,
+                                       __be32 *saddr, __be32 daddr,
+                                       u8 ipproto, u8 tos, u32 skb_mark)
+{
+       struct rtable *rt;
+       /* Tunnel configuration keeps the DSCP part of the TOS bits, but the
+        * Linux router expects RT_TOS bits only. */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
+       struct flowi fl = { .nl_u = { .ip4_u = {
+                                       .daddr = daddr,
+                                       .saddr = *saddr,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
+                                       .fwmark = skb_mark,
+#endif
+                                       .tos   = RT_TOS(tos) } },
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+                                       .mark = skb_mark,
+#endif
+                                       .proto = ipproto };
+
+       if (unlikely(ip_route_output_key(net, &rt, &fl)))
+               return ERR_PTR(-EADDRNOTAVAIL);
+       *saddr = fl.nl_u.ip4_u.saddr;
+       return rt;
+#else
+       struct flowi4 fl = { .daddr = daddr,
+                            .saddr = *saddr,
+                            .flowi4_tos = RT_TOS(tos),
+                            .flowi4_mark = skb_mark,
+                            .flowi4_proto = ipproto };
+
+       rt = ip_route_output_key(net, &fl);
+       *saddr = fl.saddr;
+       return rt;
+#endif
+}
  #endif /* compat.h */
diff --git a/datapath/datapath.c b/datapath/datapath.c

index 190b61b..48f17c0 100644 (file)
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -58,12 +58,11 @@
  #include "datapath.h"
  #include "flow.h"
  #include "vlan.h"
-#include "tunnel.h"
  #include "vport-internal_dev.h"
  #include "vport-netdev.h"
  
  #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) || \
-    LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
+    LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
  #error Kernels before 2.6.18 or after 3.9 are not supported by this version of Open vSwitch.
  #endif
  
@@ -280,6 +279,7 @@ static struct genl_family dp_packet_genl_family = {
         .version = OVS_PACKET_VERSION,
         .maxattr = OVS_PACKET_ATTR_MAX,
          SET_NETNSOK
+        SET_PARALLEL_OPS
  };
  
  int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
@@ -1010,6 +1010,7 @@ static struct genl_family dp_flow_genl_family = {
         .version = OVS_FLOW_VERSION,
         .maxattr = OVS_FLOW_ATTR_MAX,
          SET_NETNSOK
+        SET_PARALLEL_OPS
  };
  
  static struct genl_multicast_group ovs_dp_flow_multicast_group = {
@@ -1589,6 +1590,7 @@ static struct genl_family dp_datapath_genl_family = {
         .version = OVS_DATAPATH_VERSION,
         .maxattr = OVS_DP_ATTR_MAX,
          SET_NETNSOK
+        SET_PARALLEL_OPS
  };
  
  static struct genl_multicast_group ovs_dp_datapath_multicast_group = {
@@ -1968,6 +1970,7 @@ static struct genl_family dp_vport_genl_family = {
         .version = OVS_VPORT_VERSION,
         .maxattr = OVS_VPORT_ATTR_MAX,
          SET_NETNSOK
+        SET_PARALLEL_OPS
  };
  
  struct genl_multicast_group ovs_dp_vport_multicast_group = {
diff --git a/datapath/datapath.h b/datapath/datapath.h

index eda87fd..064211d 100644 (file)
--- a/datapath/datapath.h
+++ b/datapath/datapath.h
@@ -29,7 +29,6 @@
  #include "checksum.h"
  #include "compat.h"
  #include "flow.h"
-#include "tunnel.h"
  #include "vlan.h"
  #include "vport.h"
  
diff --git a/datapath/dp_notify.c b/datapath/dp_notify.c

index ec573a5..d530893 100644 (file)
--- a/datapath/dp_notify.c
+++ b/datapath/dp_notify.c
@@ -18,6 +18,8 @@
  
  #include <linux/netdevice.h>
  #include <net/genetlink.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
  
  #include "datapath.h"
  #include "vport-internal_dev.h"
diff --git a/datapath/flow.h b/datapath/flow.h

index 5d15783..d8277b5 100644 (file)
--- a/datapath/flow.h
+++ b/datapath/flow.h
@@ -58,6 +58,22 @@ struct ovs_key_ipv4_tunnel {
         u8   ipv4_ttl;
  };
  
+static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key,
+                                        const struct iphdr *iph, __be64 tun_id,
+                                        __be16 tun_flags)
+{
+       tun_key->tun_id = tun_id;
+       tun_key->ipv4_src = iph->saddr;
+       tun_key->ipv4_dst = iph->daddr;
+       tun_key->ipv4_tos = iph->tos;
+       tun_key->ipv4_ttl = iph->ttl;
+       tun_key->tun_flags = tun_flags;
+
+       /* clear struct padding. */
+       memset((unsigned char *) tun_key + OVS_TUNNEL_KEY_SIZE, 0,
+              sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE);
+}
+
  struct sw_flow_key {
         struct ovs_key_ipv4_tunnel tun_key;  /* Encapsulating tunnel key. */
         struct {
diff --git a/datapath/linux/Modules.mk b/datapath/linux/Modules.mk

index edaeabb..5f9c792 100644 (file)
--- a/datapath/linux/Modules.mk
+++ b/datapath/linux/Modules.mk
@@ -48,6 +48,7 @@ openvswitch_headers += \
         linux/compat/include/linux/mutex.h \
         linux/compat/include/linux/net.h \
         linux/compat/include/linux/netdevice.h \
+       linux/compat/include/linux/netdev_features.h \
         linux/compat/include/linux/netfilter_bridge.h \
         linux/compat/include/linux/netfilter_ipv4.h \
         linux/compat/include/linux/netlink.h \
diff --git a/datapath/linux/compat/include/linux/if_vlan.h b/datapath/linux/compat/include/linux/if_vlan.h

index b8b1961..730175b 100644 (file)
--- a/datapath/linux/compat/include/linux/if_vlan.h
+++ b/datapath/linux/compat/include/linux/if_vlan.h
@@ -5,6 +5,7 @@
  #include <linux/version.h>
  #include_next <linux/if_vlan.h>
  
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
  /*
   * The behavior of __vlan_put_tag() has changed over time:
   *
@@ -19,8 +20,9 @@
   * to avoid the need to guess whether the version in the kernel tree is
   * acceptable.
   */
-#define __vlan_put_tag rpl_vlan_put_tag
-static inline struct sk_buff *__vlan_put_tag(struct sk_buff *skb, u16 vlan_tci)
+#define __vlan_put_tag(skb, proto, tag)  rpl__vlan_put_tag(skb, tag)
+
+static inline struct sk_buff *rpl__vlan_put_tag(struct sk_buff *skb, u16 vlan_tci)
  {
         struct vlan_ethhdr *veth;
  
@@ -45,6 +47,16 @@ static inline struct sk_buff *__vlan_put_tag(struct sk_buff *skb, u16 vlan_tci)
         return skb;
  }
  
+static inline struct sk_buff *rpl___vlan_hwaccel_put_tag(struct sk_buff *skb,
+                                                    __be16 vlan_proto,
+                                                    u16 vlan_tci)
+{
+       return __vlan_hwaccel_put_tag(skb, vlan_tci);
+}
+
+#define __vlan_hwaccel_put_tag rpl___vlan_hwaccel_put_tag
+
+#endif
  
  /* All of these were introduced in a single commit preceding 2.6.33, so
   * presumably all of them or none of them are present. */
diff --git a/datapath/linux/compat/include/linux/netdev_features.h b/datapath/linux/compat/include/linux/netdev_features.h

new file mode 100644 (file)

index 0000000..0259413
--- /dev/null
+++ b/datapath/linux/compat/include/linux/netdev_features.h
@@ -0,0 +1,12 @@
+#ifndef __LINUX_NETDEV_FEATURES_WRAPPER_H
+#define __LINUX_NETDEV_FEATURES_WRAPPER_H
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0)
+#include_next <linux/netdev_features.h>
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
+#define NETIF_F_HW_VLAN_CTAG_TX NETIF_F_HW_VLAN_TX
+#endif
+
+#endif
diff --git a/datapath/linux/compat/include/linux/netdevice.h b/datapath/linux/compat/include/linux/netdevice.h

index 3f66d3a..f62bd6d 100644 (file)
--- a/datapath/linux/compat/include/linux/netdevice.h
+++ b/datapath/linux/compat/include/linux/netdevice.h
@@ -190,16 +190,21 @@ static inline struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
  #endif
  
  #if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0)
+
+/* Xen dom0 networking assumes that dev->master is a bonding device and
+ * tries to access the bond private structure through the dev->master
+ * pointer on the receive path, which causes a panic.  Therefore it is
+ * better not to backport this API.
+ */
  static inline int netdev_master_upper_dev_link(struct net_device *dev,
                                                struct net_device *upper_dev)
  {
-       return netdev_set_master(dev, upper_dev);
+       return 0;
  }
  
  static inline void netdev_upper_dev_unlink(struct net_device *dev,
                                            struct net_device *upper_dev)
  {
-       netdev_set_master(dev, NULL);
  }
  #endif
  
diff --git a/datapath/linux/compat/include/net/gre.h b/datapath/linux/compat/include/net/gre.h

index bd0c3d4..5f46aed 100644 (file)
--- a/datapath/linux/compat/include/net/gre.h
+++ b/datapath/linux/compat/include/net/gre.h
@@ -21,41 +21,13 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version);
  
  #endif
  
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
  struct gre_base_hdr {
         __be16 flags;
         __be16 protocol;
  };
  #define GRE_HEADER_SECTION 4
  
-#define MAX_GRE_PROTO_PRIORITY 255
-struct gre_cisco_protocol {
-       int (*handler)(struct sk_buff *skb, const struct tnl_ptk_info *tpi);
-       u8 priority;
-};
-
-#define gre_build_header rpl_gre_build_header
-void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
-                     int hdr_len);
-
-#define gre_handle_offloads rpl_gre_handle_offloads
-struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum);
-
-int gre_cisco_register(struct gre_cisco_protocol *proto);
-int gre_cisco_unregister(struct gre_cisco_protocol *proto);
-
-static inline int ip_gre_calc_hlen(__be16 o_flags)
-{
-       int addend = 4;
-
-       if (o_flags & TUNNEL_CSUM)
-               addend += 4;
-       if (o_flags & TUNNEL_KEY)
-               addend += 4;
-       if (o_flags & TUNNEL_SEQ)
-               addend += 4;
-       return addend;
-}
-
  static inline __be16 gre_flags_to_tnl_flags(__be16 flags)
  {
         __be16 tflags = 0;
@@ -99,4 +71,36 @@ static inline __be16 tnl_flags_to_gre_flags(__be16 tflags)
  
         return flags;
  }
+#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) */
+
+#define MAX_GRE_PROTO_PRIORITY 255
+struct gre_cisco_protocol {
+       int (*handler)(struct sk_buff *skb, const struct tnl_ptk_info *tpi);
+       u8 priority;
+};
+
+int gre_cisco_register(struct gre_cisco_protocol *proto);
+int gre_cisco_unregister(struct gre_cisco_protocol *proto);
+
+#define gre_build_header rpl_gre_build_header
+void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
+                     int hdr_len);
+
+#define gre_handle_offloads rpl_gre_handle_offloads
+struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum);
+
+static inline int ip_gre_calc_hlen(__be16 o_flags)
+{
+       int addend = 4;
+
+       if (o_flags & TUNNEL_CSUM)
+               addend += 4;
+       if (o_flags & TUNNEL_KEY)
+               addend += 4;
+       if (o_flags & TUNNEL_SEQ)
+               addend += 4;
+       return addend;
+}
+
+
  #endif
diff --git a/datapath/linux/compat/include/net/ip_tunnels.h b/datapath/linux/compat/include/net/ip_tunnels.h

index ad17c9d..14f55d0 100644 (file)
--- a/datapath/linux/compat/include/net/ip_tunnels.h
+++ b/datapath/linux/compat/include/net/ip_tunnels.h
@@ -31,20 +31,6 @@ struct tnl_ptk_info {
  #define PACKET_RCVD    0
  #define PACKET_REJECT  1
  
-static inline void tunnel_ip_select_ident(struct sk_buff *skb,
-                                         const struct iphdr  *old_iph,
-                                         struct dst_entry *dst)
-{
-       struct iphdr *iph = ip_hdr(skb);
-
-       /* Use inner packet iph-id if possible. */
-       if (skb->protocol == htons(ETH_P_IP) && old_iph->id)
-               iph->id = old_iph->id;
-       else
-               __ip_select_ident(iph, dst,
-                               (skb_shinfo(skb)->gso_segs ?: 1) - 1);
-}
-
  int iptunnel_xmit(struct net *net, struct rtable *rt,
                   struct sk_buff *skb,
                   __be32 src, __be32 dst, __u8 proto,
diff --git a/datapath/linux/compat/ip_tunnels_core.c b/datapath/linux/compat/ip_tunnels_core.c

index 03c47a2..01cc2fb 100644 (file)
--- a/datapath/linux/compat/ip_tunnels_core.c
+++ b/datapath/linux/compat/ip_tunnels_core.c
@@ -68,9 +68,7 @@ int iptunnel_xmit(struct net *net, struct rtable *rt,
         iph->daddr      =       dst;
         iph->saddr      =       src;
         iph->ttl        =       ttl;
-       tunnel_ip_select_ident(skb,
-                              (const struct iphdr *)skb_inner_network_header(skb),
-                              &rt_dst(rt));
+       __ip_select_ident(iph, &rt_dst(rt), (skb_shinfo(skb)->gso_segs ?: 1) - 1);
  
         err = ip_local_out(skb);
         if (unlikely(net_xmit_eval(err)))
diff --git a/datapath/tunnel.c b/datapath/tunnel.c

deleted file mode 100644 (file)

index bd63da5..0000000
--- a/datapath/tunnel.c
+++ /dev/null
@@ -1,324 +0,0 @@
-/*
- * Copyright (c) 2007-2012 Nicira, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/in.h>
-#include <linux/in_route.h>
-#include <linux/inetdevice.h>
-#include <linux/jhash.h>
-#include <linux/list.h>
-#include <linux/kernel.h>
-#include <linux/version.h>
-#include <linux/workqueue.h>
-#include <linux/rculist.h>
-#include <net/route.h>
-#include <net/xfrm.h>
-
-#include "checksum.h"
-#include "compat.h"
-#include "datapath.h"
-#include "tunnel.h"
-#include "vlan.h"
-#include "vport.h"
-
-/**
- *     ovs_tnl_rcv - ingress point for generic tunnel code
- *
- * @vport: port this packet was received on
- * @skb: received packet
- * @tos: ToS from encapsulating IP packet, used to copy ECN bits
- *
- * Must be called with rcu_read_lock.
- *
- * Packets received by this function are in the following state:
- * - skb->data points to the inner Ethernet header.
- * - The inner Ethernet header is in the linear data area.
- * - skb->csum does not include the inner Ethernet header.
- * - The layer pointers are undefined.
- */
-void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb,
-                struct ovs_key_ipv4_tunnel *tun_key)
-{
-       struct ethhdr *eh;
-
-       skb_reset_mac_header(skb);
-       eh = eth_hdr(skb);
-
-       if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
-               skb->protocol = eh->h_proto;
-       else
-               skb->protocol = htons(ETH_P_802_2);
-
-       skb_dst_drop(skb);
-       nf_reset(skb);
-       skb_clear_rxhash(skb);
-       secpath_reset(skb);
-       vlan_set_tci(skb, 0);
-
-       if (unlikely(compute_ip_summed(skb, false))) {
-               kfree_skb(skb);
-               return;
-       }
-
-       ovs_vport_receive(vport, skb, tun_key);
-}
-
-struct rtable *find_route(struct net *net,
-                         __be32 *saddr, __be32 daddr, u8 ipproto,
-                         u8 tos, u32 skb_mark)
-{
-       struct rtable *rt;
-       /* Tunnel configuration keeps DSCP part of TOS bits, But Linux
-        * router expect RT_TOS bits only. */
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
-       struct flowi fl = { .nl_u = { .ip4_u = {
-                                       .daddr = daddr,
-                                       .saddr = *saddr,
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
-                                       .fwmark = skb_mark,
-#endif
-                                       .tos   = RT_TOS(tos) } },
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
-                                       .mark = skb_mark,
-#endif
-                                       .proto = ipproto };
-
-       if (unlikely(ip_route_output_key(net, &rt, &fl)))
-               return ERR_PTR(-EADDRNOTAVAIL);
-       *saddr = fl.nl_u.ip4_u.saddr;
-       return rt;
-#else
-       struct flowi4 fl = { .daddr = daddr,
-                            .saddr = *saddr,
-                            .flowi4_tos = RT_TOS(tos),
-                            .flowi4_mark = skb_mark,
-                            .flowi4_proto = ipproto };
-
-       rt = ip_route_output_key(net, &fl);
-       *saddr = fl.saddr;
-       return rt;
-#endif
-}
-
-static bool need_linearize(const struct sk_buff *skb)
-{
-       int i;
-
-       if (unlikely(skb_shinfo(skb)->frag_list))
-               return true;
-
-       /*
-        * Generally speaking we should linearize if there are paged frags.
-        * However, if all of the refcounts are 1 we know nobody else can
-        * change them from underneath us and we can skip the linearization.
-        */
-       for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-               if (unlikely(page_count(skb_frag_page(&skb_shinfo(skb)->frags[i])) > 1))
-                       return true;
-
-       return false;
-}
-
-static struct sk_buff *handle_offloads(struct sk_buff *skb)
-{
-       int err;
-
-       forward_ip_summed(skb, true);
-
-       if (skb_is_gso(skb)) {
-               struct sk_buff *nskb;
-               char cb[sizeof(skb->cb)];
-
-               memcpy(cb, skb->cb, sizeof(cb));
-
-               nskb = __skb_gso_segment(skb, 0, false);
-               if (IS_ERR(nskb)) {
-                       err = PTR_ERR(nskb);
-                       goto error;
-               }
-
-               consume_skb(skb);
-               skb = nskb;
-               while (nskb) {
-                       memcpy(nskb->cb, cb, sizeof(cb));
-                       nskb = nskb->next;
-               }
-       } else if (get_ip_summed(skb) == OVS_CSUM_PARTIAL) {
-               /* Pages aren't locked and could change at any time.
-                * If this happens after we compute the checksum, the
-                * checksum will be wrong.  We linearize now to avoid
-                * this problem.
-                */
-               if (unlikely(need_linearize(skb))) {
-                       err = __skb_linearize(skb);
-                       if (unlikely(err))
-                               goto error;
-               }
-
-               err = skb_checksum_help(skb);
-               if (unlikely(err))
-                       goto error;
-       }
-
-       set_ip_summed(skb, OVS_CSUM_NONE);
-
-       return skb;
-
-error:
-       return ERR_PTR(err);
-}
-
-/* Compute source UDP port for outgoing packet.
- * Currently we use the flow hash.
- */
-u16 ovs_tnl_get_src_port(struct sk_buff *skb)
-{
-       int low;
-       int high;
-       unsigned int range;
-       struct sw_flow_key *pkt_key = OVS_CB(skb)->pkt_key;
-       u32 hash = jhash2((const u32 *)pkt_key,
-                         sizeof(*pkt_key) / sizeof(u32), 0);
-
-       inet_get_local_port_range(&low, &high);
-       range = (high - low) + 1;
-       return (((u64) hash * range) >> 32) + low;
-}
-
-int ovs_tnl_send(struct vport *vport, struct sk_buff *skb,
-                u8 ipproto, int tunnel_hlen,
-                void (*build_header)(const struct vport *,
-                                     struct sk_buff *,
-                                     int tunnel_hlen))
-{
-       int min_headroom;
-       struct rtable *rt;
-       __be32 saddr;
-       int sent_len = 0;
-       int err;
-       struct sk_buff *nskb;
-
-       /* Route lookup */
-       saddr = OVS_CB(skb)->tun_key->ipv4_src;
-       rt = find_route(ovs_dp_get_net(vport->dp),
-                       &saddr,
-                       OVS_CB(skb)->tun_key->ipv4_dst,
-                       ipproto,
-                       OVS_CB(skb)->tun_key->ipv4_tos,
-                       skb_get_mark(skb));
-       if (IS_ERR(rt)) {
-               err = PTR_ERR(rt);
-               goto error;
-       }
-
-       tunnel_hlen += sizeof(struct iphdr);
-
-       min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
-                       + tunnel_hlen
-                       + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
-
-       if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
-               int head_delta = SKB_DATA_ALIGN(min_headroom -
-                                               skb_headroom(skb) +
-                                               16);
-
-               err = pskb_expand_head(skb, max_t(int, head_delta, 0),
-                                       0, GFP_ATOMIC);
-               if (unlikely(err))
-                       goto err_free_rt;
-       }
-
-       /* Offloading */
-       nskb = handle_offloads(skb);
-       if (IS_ERR(nskb)) {
-               err = PTR_ERR(nskb);
-               goto err_free_rt;
-       }
-       skb = nskb;
-
-       /* Reset SKB */
-       nf_reset(skb);
-       secpath_reset(skb);
-       skb_dst_drop(skb);
-       skb_clear_rxhash(skb);
-
-       while (skb) {
-               struct sk_buff *next_skb = skb->next;
-               struct iphdr *iph;
-               int frag_len;
-
-               skb->next = NULL;
-
-               if (unlikely(vlan_deaccel_tag(skb)))
-                       goto next;
-
-               frag_len = skb->len;
-               skb_push(skb, tunnel_hlen);
-               skb_reset_network_header(skb);
-               skb_set_transport_header(skb, sizeof(struct iphdr));
-
-               if (next_skb)
-                       skb_dst_set(skb, dst_clone(&rt_dst(rt)));
-               else
-                       skb_dst_set(skb, &rt_dst(rt));
-
-               /* Push Tunnel header. */
-               build_header(vport, skb, tunnel_hlen);
-
-               /* Push IP header. */
-               iph = ip_hdr(skb);
-               iph->version    = 4;
-               iph->ihl        = sizeof(struct iphdr) >> 2;
-               iph->protocol   = ipproto;
-               iph->daddr      = OVS_CB(skb)->tun_key->ipv4_dst;
-               iph->saddr      = saddr;
-               iph->tos        = OVS_CB(skb)->tun_key->ipv4_tos;
-               iph->ttl        = OVS_CB(skb)->tun_key->ipv4_ttl;
-               iph->frag_off   = OVS_CB(skb)->tun_key->tun_flags &
-                                 TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;
-               /*
-                * Allow our local IP stack to fragment the outer packet even
-                * if the DF bit is set as a last resort.  We also need to
-                * force selection of an IP ID here with __ip_select_ident(),
-                * as ip_select_ident() assumes a proper ID is not needed when
-                * when the DF bit is set.
-                */
-               skb->local_df = 1;
-               __ip_select_ident(iph, skb_dst(skb), 0);
-
-               memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
-
-               err = ip_local_out(skb);
-               if (unlikely(net_xmit_eval(err)))
-                       goto next;
-
-               sent_len += frag_len;
-
-next:
-               skb = next_skb;
-       }
-
-       return sent_len;
-
-err_free_rt:
-       ip_rt_put(rt);
-error:
-       return err;
-}
diff --git a/datapath/tunnel.h b/datapath/tunnel.h

deleted file mode 100644 (file)

index 17de7c4..0000000
--- a/datapath/tunnel.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2007-2012 Nicira, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA
- */
-
-#ifndef TUNNEL_H
-#define TUNNEL_H 1
-
-#include <linux/version.h>
-#include <net/net_namespace.h>
-#include <net/netns/generic.h>
-
-#include "flow.h"
-#include "vport.h"
-
-struct rtable *find_route(struct net *net,
-                         __be32 *saddr, __be32 daddr, u8 ipproto,
-                         u8 tos, u32 skb_mark);
-
-u16 ovs_tnl_get_src_port(struct sk_buff *skb);
-
-int ovs_tnl_send(struct vport *vport, struct sk_buff *skb,
-                u8 ipproto, int tunnel_hlen,
-                void (*build_header)(const struct vport *,
-                                     struct sk_buff *,
-                                     int tunnel_hlen));
-
-void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb,
-                struct ovs_key_ipv4_tunnel *tun_key);
-
-static inline void tnl_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key,
-                                        const struct iphdr *iph, __be64 tun_id,
-                                        __be16 tun_flags)
-{
-       tun_key->tun_id = tun_id;
-       tun_key->ipv4_src = iph->saddr;
-       tun_key->ipv4_dst = iph->daddr;
-       tun_key->ipv4_tos = iph->tos;
-       tun_key->ipv4_ttl = iph->ttl;
-       tun_key->tun_flags = tun_flags;
-
-       /* clear struct padding. */
-       memset((unsigned char*) tun_key + OVS_TUNNEL_KEY_SIZE, 0,
-              sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE);
-}
-
-#endif /* TUNNEL_H */
diff --git a/datapath/vlan.h b/datapath/vlan.h

index 46d0db3..aee5551 100644 (file)
--- a/datapath/vlan.h
+++ b/datapath/vlan.h
@@ -89,7 +89,7 @@ static inline int vlan_deaccel_tag(struct sk_buff *skb)
         if (!vlan_tx_tag_present(skb))
                 return 0;
  
-       skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
+       skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb));
         if (unlikely(!skb))
                 return -ENOMEM;
  
diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c

index c74f5fc..5af6dbe 100644 (file)
--- a/datapath/vport-gre.c
+++ b/datapath/vport-gre.c
@@ -35,10 +35,11 @@
  #include <linux/kernel.h>
  #include <linux/workqueue.h>
  #include <linux/rculist.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
  #include <net/route.h>
  #include <net/xfrm.h>
  
-
  #include <net/icmp.h>
  #include <net/ip.h>
  #include <net/ip_tunnels.h>
@@ -46,7 +47,6 @@
  #include <net/protocol.h>
  
  #include "datapath.h"
-#include "tunnel.h"
  #include "vport.h"
  
  /* Returns the least-significant 32 bits of a __be64. */
@@ -112,7 +112,7 @@ static int gre_rcv(struct sk_buff *skb,
                 return PACKET_REJECT;
  
         key = key_to_tunnel_id(tpi->key, tpi->seq);
-       tnl_tun_key_init(&tun_key, ip_hdr(skb), key, filter_tnl_flags(tpi->flags));
+       ovs_flow_tun_key_init(&tun_key, ip_hdr(skb), key, filter_tnl_flags(tpi->flags));
  
         ovs_vport_receive(vport, skb, &tun_key);
         return PACKET_RCVD;
@@ -335,17 +335,19 @@ static __be32 be64_get_high32(__be64 x)
  
  static int gre64_send(struct vport *vport, struct sk_buff *skb)
  {
-       int hlen;
+       int hlen = GRE_HEADER_SECTION +         /* GRE Hdr */
+                  GRE_HEADER_SECTION +         /* GRE Key */
+                  GRE_HEADER_SECTION;          /* GRE SEQ */
         __be32 seq;
  
         if (unlikely(!OVS_CB(skb)->tun_key))
                 return -EINVAL;
  
-       hlen = ip_gre_calc_hlen(OVS_CB(skb)->tun_key->tun_flags)
-              + GRE_HEADER_SECTION;
+       if (OVS_CB(skb)->tun_key->tun_flags & TUNNEL_CSUM)
+               hlen += GRE_HEADER_SECTION;
  
         seq = be64_get_high32(OVS_CB(skb)->tun_key->tun_id);
-       return __send(vport, skb, hlen, seq, TUNNEL_SEQ);
+       return __send(vport, skb, hlen, seq, (TUNNEL_KEY|TUNNEL_SEQ));
  }
  
  const struct vport_ops ovs_gre64_vport_ops = {
diff --git a/datapath/vport-internal_dev.c b/datapath/vport-internal_dev.c

index 9ee1c42..db55ee0 100644 (file)
--- a/datapath/vport-internal_dev.c
+++ b/datapath/vport-internal_dev.c
@@ -22,6 +22,7 @@
  #include <linux/netdevice.h>
  #include <linux/etherdevice.h>
  #include <linux/ethtool.h>
+#include <linux/netdev_features.h>
  #include <linux/skbuff.h>
  #include <linux/version.h>
  
@@ -188,7 +189,7 @@ static void do_setup(struct net_device *netdev)
  
  #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27)
         netdev->vlan_features = netdev->features;
-       netdev->features |= NETIF_F_HW_VLAN_TX;
+       netdev->features |= NETIF_F_HW_VLAN_CTAG_TX;
  #endif
  
  #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
diff --git a/datapath/vport-lisp.c b/datapath/vport-lisp.c

index 54c10ae..2f62d11 100644 (file)
--- a/datapath/vport-lisp.c
+++ b/datapath/vport-lisp.c
@@ -30,13 +30,13 @@
  
  #include <net/icmp.h>
  #include <net/ip.h>
+#include <net/route.h>
  #include <net/udp.h>
+#include <net/xfrm.h>
  
  #include "datapath.h"
-#include "tunnel.h"
  #include "vport.h"
  
-
  /*
   *  LISP encapsulation header:
   *
@@ -160,6 +160,23 @@ static __be64 instance_id_to_tunnel_id(__u8 *iid)
  #endif
  }
  
+/* Compute source UDP port for outgoing packet: jhash2() of the packet's
+ * sw_flow_key, scaled into [low, high] from inet_get_local_port_range().
+ */
+static u16 ovs_tnl_get_src_port(struct sk_buff *skb)
+{
+       int low;
+       int high;
+       unsigned int range;
+       struct sw_flow_key *pkt_key = OVS_CB(skb)->pkt_key;
+       u32 hash = jhash2((const u32 *)pkt_key,
+                         sizeof(*pkt_key) / sizeof(u32), 0);
+
+       inet_get_local_port_range(&low, &high);
+       range = (high - low) + 1;       /* port count, both ends included */
+       return (((u64) hash * range) >> 32) + low;      /* hash * range / 2^32 + low */
+}
+
  static void lisp_build_header(const struct vport *vport,
                               struct sk_buff *skb,
                               int tunnel_hlen)
@@ -189,6 +206,48 @@ static void lisp_build_header(const struct vport *vport,
         lisph->u2.word2.locator_status_bits = 1;
  }
  
+/**
+ *     ovs_tnl_rcv - ingress point for generic tunnel code
+ *
+ * @vport: port this packet was received on
+ * @skb: received packet
+ * @tun_key: tunnel key for this packet, passed on to ovs_vport_receive()
+ *
+ * Must be called with rcu_read_lock.
+ *
+ * Packets received by this function are in the following state:
+ * - skb->data points to the inner Ethernet header.
+ * - The inner Ethernet header is in the linear data area.
+ * - skb->csum does not include the inner Ethernet header.
+ * - The layer pointers are undefined.
+ */
+static void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb,
+                       struct ovs_key_ipv4_tunnel *tun_key)
+{
+       struct ethhdr *eh;
+
+       skb_reset_mac_header(skb);      /* skb->data is the inner Ethernet header */
+       eh = eth_hdr(skb);
+
+       if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
+               skb->protocol = eh->h_proto;
+       else                            /* h_proto is below ETH_P_802_3_MIN */
+               skb->protocol = htons(ETH_P_802_2);
+
+       skb_dst_drop(skb);
+       nf_reset(skb);
+       skb_clear_rxhash(skb);
+       secpath_reset(skb);
+       vlan_set_tci(skb, 0);
+
+       if (unlikely(compute_ip_summed(skb, false))) {
+               kfree_skb(skb);         /* nonzero result: drop the packet */
+               return;
+       }
+
+       ovs_vport_receive(vport, skb, tun_key);
+}
+
  /* Called with rcu_read_lock and BH disabled. */
  static int lisp_rcv(struct sock *sk, struct sk_buff *skb)
  {
@@ -218,7 +277,7 @@ static int lisp_rcv(struct sock *sk, struct sk_buff *skb)
  
         /* Save outer tunnel values */
         iph = ip_hdr(skb);
-       tnl_tun_key_init(&tun_key, iph, key, TUNNEL_KEY);
+       ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY);
  
         /* Drop non-IP inner packets */
         inner_iph = (struct iphdr *)(lisph + 1);
@@ -361,6 +420,196 @@ error:
         return ERR_PTR(err);
  }
  
+static bool need_linearize(const struct sk_buff *skb)
+{
+       int i;
+
+       if (unlikely(skb_shinfo(skb)->frag_list))
+               return true;    /* skb carries a frag_list */
+
+       /*
+        * Generally speaking we should linearize if there are paged frags.
+        * However, if all of the refcounts are 1 we know nobody else can
+        * change them from underneath us and we can skip the linearization.
+        */
+       for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+               if (unlikely(page_count(skb_frag_page(&skb_shinfo(skb)->frags[i])) > 1))
+                       return true;    /* this frag's page_count() exceeds 1 */
+
+       return false;   /* no frag_list, no frag page with count > 1 */
+}
+
+static struct sk_buff *handle_offloads(struct sk_buff *skb)
+{
+       int err;
+
+       forward_ip_summed(skb, true);
+
+       /* Segment GSO skbs with __skb_gso_segment(); else resolve OVS_CSUM_PARTIAL. */
+       if (skb_is_gso(skb)) {
+               struct sk_buff *nskb;
+               char cb[sizeof(skb->cb)];       /* saved copy of skb->cb */
+
+               memcpy(cb, skb->cb, sizeof(cb));
+
+               nskb = __skb_gso_segment(skb, 0, false);
+               if (IS_ERR(nskb)) {
+                       err = PTR_ERR(nskb);
+                       goto error;
+               }
+
+               consume_skb(skb);       /* the segment list replaces the original */
+               skb = nskb;
+               while (nskb) {
+                       memcpy(nskb->cb, cb, sizeof(cb));       /* saved cb to each segment */
+                       nskb = nskb->next;
+               }
+       } else if (get_ip_summed(skb) == OVS_CSUM_PARTIAL) {
+               /* Pages aren't locked and could change at any time.
+                * If this happens after we compute the checksum, the
+                * checksum will be wrong.  We linearize now to avoid
+                * this problem.
+                */
+               if (unlikely(need_linearize(skb))) {
+                       err = __skb_linearize(skb);
+                       if (unlikely(err))
+                               goto error;
+               }
+
+               err = skb_checksum_help(skb);
+               if (unlikely(err))
+                       goto error;
+       }
+
+       set_ip_summed(skb, OVS_CSUM_NONE);
+
+       return skb;
+
+error:
+       return ERR_PTR(err);    /* skb is not freed by this function here */
+}
+
+static int ovs_tnl_send(struct vport *vport, struct sk_buff *skb,
+                       u8 ipproto, int tunnel_hlen,
+                       void (*build_header)(const struct vport *,
+                                            struct sk_buff *,
+                                            int tunnel_hlen))
+{
+       int min_headroom;
+       struct rtable *rt;
+       __be32 saddr;
+       int sent_len = 0;       /* sum of frag_len over segments sent */
+       int err;
+       struct sk_buff *nskb;
+
+       /* Route lookup; saddr (passed by pointer) is the outer source below. */
+       saddr = OVS_CB(skb)->tun_key->ipv4_src;
+       rt = find_route(ovs_dp_get_net(vport->dp),
+                       &saddr,
+                       OVS_CB(skb)->tun_key->ipv4_dst,
+                       ipproto,
+                       OVS_CB(skb)->tun_key->ipv4_tos,
+                       skb_get_mark(skb));
+       if (IS_ERR(rt)) {
+               err = PTR_ERR(rt);
+               goto error;
+       }
+
+       tunnel_hlen += sizeof(struct iphdr);    /* now counts the outer IP header */
+
+       min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
+                       + tunnel_hlen
+                       + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
+
+       if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
+               int head_delta = SKB_DATA_ALIGN(min_headroom -
+                                               skb_headroom(skb) +
+                                               16);
+
+               err = pskb_expand_head(skb, max_t(int, head_delta, 0),
+                                       0, GFP_ATOMIC);
+               if (unlikely(err))
+                       goto err_free_rt;
+       }
+
+       /* Offloading; may return a list of GSO segments linked by ->next. */
+       nskb = handle_offloads(skb);
+       if (IS_ERR(nskb)) {
+               err = PTR_ERR(nskb);
+               goto err_free_rt;
+       }
+       skb = nskb;
+
+       /* Reset SKB */
+       nf_reset(skb);
+       secpath_reset(skb);
+       skb_dst_drop(skb);
+       skb_clear_rxhash(skb);
+
+       while (skb) {
+               struct sk_buff *next_skb = skb->next;
+               struct iphdr *iph;
+               int frag_len;
+
+               skb->next = NULL;       /* unlink from the list */
+
+               if (unlikely(vlan_deaccel_tag(skb)))
+                       goto next;      /* NOTE(review): rt unreleased if last skb? */
+
+               frag_len = skb->len;    /* length before the tunnel headers */
+               skb_push(skb, tunnel_hlen);
+               skb_reset_network_header(skb);
+               skb_set_transport_header(skb, sizeof(struct iphdr));
+
+               if (next_skb)
+                       skb_dst_set(skb, dst_clone(&rt_dst(rt)));
+               else
+                       skb_dst_set(skb, &rt_dst(rt));  /* last skb: no dst_clone() */
+
+               /* Push Tunnel header. */
+               build_header(vport, skb, tunnel_hlen);
+
+               /* Push IP header. */
+               iph = ip_hdr(skb);
+               iph->version    = 4;
+               iph->ihl        = sizeof(struct iphdr) >> 2;
+               iph->protocol   = ipproto;
+               iph->daddr      = OVS_CB(skb)->tun_key->ipv4_dst;
+               iph->saddr      = saddr;
+               iph->tos        = OVS_CB(skb)->tun_key->ipv4_tos;
+               iph->ttl        = OVS_CB(skb)->tun_key->ipv4_ttl;
+               iph->frag_off   = OVS_CB(skb)->tun_key->tun_flags &
+                                 TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;
+               /*
+                * Allow our local IP stack to fragment the outer packet even
+                * if the DF bit is set as a last resort.  We also need to
+                * force selection of an IP ID here with __ip_select_ident(),
+                * as ip_select_ident() assumes a proper ID is not needed when
+                * the DF bit is set.
+                */
+               skb->local_df = 1;
+               __ip_select_ident(iph, skb_dst(skb), 0);
+
+               memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+
+               err = ip_local_out(skb);
+               if (unlikely(net_xmit_eval(err)))
+                       goto next;      /* failed segment is not counted */
+
+               sent_len += frag_len;
+
+next:
+               skb = next_skb;
+       }
+
+       return sent_len;
+
+err_free_rt:
+       ip_rt_put(rt);
+error:
+       return err;
+}
+
  static int lisp_tnl_send(struct vport *vport, struct sk_buff *skb)
  {
         int tnl_len;
diff --git a/datapath/vport-netdev.c b/datapath/vport-netdev.c

index 4bc1617..50373b1 100644 (file)
--- a/datapath/vport-netdev.c
+++ b/datapath/vport-netdev.c
@@ -340,7 +340,7 @@ static int netdev_send(struct vport *vport, struct sk_buff *skb)
                                 nskb = skb->next;
                                 skb->next = NULL;
  
-                               skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
+                               skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb));
                                 if (likely(skb)) {
                                         len += skb->len;
                                         vlan_set_tci(skb, 0);
@@ -354,7 +354,7 @@ static int netdev_send(struct vport *vport, struct sk_buff *skb)
                 }
  
  tag:
-               skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
+               skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb));
                 if (unlikely(!skb))
                         return 0;
                 vlan_set_tci(skb, 0);
diff --git a/datapath/vport-vxlan.c b/datapath/vport-vxlan.c

index 5546820..f3ef947 100644 (file)
--- a/datapath/vport-vxlan.c
+++ b/datapath/vport-vxlan.c
@@ -42,7 +42,6 @@
  #include <net/vxlan.h>
  
  #include "datapath.h"
-#include "tunnel.h"
  #include "vport.h"
  
  #define OVS_VXLAN_RCV_PRIORITY         8
@@ -73,7 +72,7 @@ static int vxlan_rcv(struct vxlan_handler *vh, struct sk_buff *skb, __be32 vx_vn
         /* Save outer tunnel values */
         iph = ip_hdr(skb);
         key = cpu_to_be64(ntohl(vx_vni) >> 8);
-       tnl_tun_key_init(&tun_key, iph, key, TUNNEL_KEY);
+       ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY);
  
         ovs_vport_receive(vport, skb, &tun_key);
         return PACKET_RCVD;
diff --git a/debian/rules b/debian/rules

index d34bdb3..b21c8db 100755 (executable)
--- a/debian/rules
+++ b/debian/rules
@@ -29,6 +29,12 @@ else
  CFLAGS += -O2
  endif
  
+# Old versions of dpkg-buildflags do not understand --export=configure.
+# When dpkg-buildflags does not understand an option, it prints its full
+# --help output on stdout, so we have to avoid that here.
+buildflags := $(shell if dpkg-buildflags --export=configure >/dev/null 2>&1; \
+                     then dpkg-buildflags --export=configure; fi)
+
  configure: configure-stamp
  configure-stamp:
         dh_testdir
@@ -40,7 +46,7 @@ configure-stamp:
                 test -e Makefile || \
                 ../configure --prefix=/usr --localstatedir=/var --enable-ssl \
                         --sysconfdir=/etc CFLAGS="$(CFLAGS)" \
-                       $(DATAPATH_CONFIGURE_OPTS))
+                       $(buildflags) $(DATAPATH_CONFIGURE_OPTS))
         touch configure-stamp
  
  #Architecture 
diff --git a/include/openflow/nicira-ext.h b/include/openflow/nicira-ext.h

index 6319f4e..de5ff6a 100644 (file)
--- a/include/openflow/nicira-ext.h
+++ b/include/openflow/nicira-ext.h
@@ -477,6 +477,7 @@ OFP_ASSERT(sizeof(struct nx_action_pop_queue) == 16);
   *   - NXM_NX_ND_SLL
   *   - NXM_NX_ND_TLL
   *   - NXM_NX_REG(idx) for idx in the switch's accepted range.
+ *   - NXM_NX_PKT_MARK
   *   - NXM_NX_TUN_IPV4_SRC
   *   - NXM_NX_TUN_IPV4_DST
   *
@@ -498,6 +499,8 @@ OFP_ASSERT(sizeof(struct nx_action_pop_queue) == 16);
   *
   *   - NXM_NX_REG(idx) for idx in the switch's accepted range.
   *
+ *   - NXM_NX_PKT_MARK
+ *
   *   - NXM_OF_VLAN_TCI.  Modifying this field's value has side effects on the
   *     packet's 802.1Q header.  Setting a value with CFI=0 removes the 802.1Q
   *     header (if any), ignoring the other bits.  Setting a value with CFI=1
@@ -1766,6 +1769,20 @@ OFP_ASSERT(sizeof(struct nx_action_output_reg) == 24);
  #define NXM_NX_TUN_IPV4_DST   NXM_HEADER  (0x0001, 32, 4)
  #define NXM_NX_TUN_IPV4_DST_W NXM_HEADER_W(0x0001, 32, 4)
  
+/* Metadata marked onto the packet in a system-dependent manner.
+ *
+ * The packet mark may be used to carry contextual information
+ * to other parts of the system outside of Open vSwitch. As a
+ * result, the semantics depend on the system in use.
+ *
+ * Prereqs: None.
+ *
+ * Format: 32-bit integer in network byte order.
+ *
+ * Masking: Fully maskable. */
+#define NXM_NX_PKT_MARK   NXM_HEADER  (0x0001, 33, 4)
+#define NXM_NX_PKT_MARK_W NXM_HEADER_W(0x0001, 33, 4)
+
  /* ## --------------------- ## */
  /* ## Requests and replies. ## */
  /* ## --------------------- ## */
diff --git a/include/sparse/pthread.h b/include/sparse/pthread.h

index aa4652e..40c5ca3 100644 (file)
--- a/include/sparse/pthread.h
+++ b/include/sparse/pthread.h
@@ -21,18 +21,6 @@
  /* Get actual <pthread.h> definitions for us to annotate and build on. */
  #include_next <pthread.h>
  
-#include "compiler.h"
-
-int pthread_mutex_lock(pthread_mutex_t *mutex) OVS_ACQUIRES(mutex);
-int pthread_mutex_unlock(pthread_mutex_t *mutex) OVS_RELEASES(mutex);
-
-int pthread_rwlock_rdlock(pthread_rwlock_t *rwlock) OVS_ACQ_RDLOCK(rwlock);
-int pthread_rwlock_wrlock(pthread_rwlock_t *rwlock) OVS_ACQ_WRLOCK(rwlock);
-int pthread_rwlock_unlock(pthread_rwlock_t *rwlock) OVS_RELEASES(rwlock);
-
-int pthread_cond_wait(pthread_cond_t *, pthread_mutex_t *mutex)
-    OVS_REQUIRES(mutex);
-
  /* Sparse complains about the proper PTHREAD_*_INITIALIZER definitions.
   * Luckily, it's not a real compiler so we can overwrite it with something
   * simple. */
@@ -47,29 +35,3 @@ int pthread_cond_wait(pthread_cond_t *, pthread_mutex_t *mutex)
  
  #undef PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP
  #define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP {}
-
-#define pthread_mutex_trylock(MUTEX)                    \
-    ({                                                  \
-        int retval = pthread_mutex_trylock(mutex);      \
-        if (!retval) {                                  \
-            OVS_MACRO_LOCK(MUTEX);                      \
-        }                                               \
-        retval;                                         \
-    })
-
-#define pthread_rwlock_tryrdlock(RWLOCK)                \
-    ({                                                  \
-        int retval = pthread_rwlock_tryrdlock(rwlock);  \
-        if (!retval) {                                  \
-            OVS_MACRO_LOCK(RWLOCK);                     \
-        }                                               \
-        retval;                                         \
-    })
-#define pthread_rwlock_trywrlock(RWLOCK)                \
-    ({                                                  \
-        int retval = pthread_rwlock_trywrlock(rwlock);  \
-        if (!retval) {                                  \
-            OVS_MACRO_LOCK(RWLOCK);                     \
-        }                                               \
-        retval;                                         \
-    })
diff --git a/lib/automake.mk b/lib/automake.mk

index cd50e91..fa7f173 100644 (file)
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -161,6 +161,8 @@ lib_libopenvswitch_a_SOURCES = \
         lib/reconnect.c \
         lib/reconnect.h \
         lib/sat-math.h \
+       lib/seq.c \
+       lib/seq.h \
         lib/sha1.c \
         lib/sha1.h \
         lib/shash.c \
diff --git a/lib/bfd.c b/lib/bfd.c

index 81fd178..74b27c4 100644 (file)
--- a/lib/bfd.c
+++ b/lib/bfd.c
@@ -261,6 +261,7 @@ bfd_configure(struct bfd *bfd, const char *name, const struct smap *cfg)
      static atomic_uint16_t udp_src = ATOMIC_VAR_INIT(0);
  
      long long int min_tx, min_rx;
+    bool need_poll = false;
      bool cpath_down;
      const char *hwaddr;
      uint8_t ea[ETH_ADDR_LEN];
@@ -315,7 +316,7 @@ bfd_configure(struct bfd *bfd, const char *name, const struct smap *cfg)
              || (!bfd_in_poll(bfd) && bfd->cfg_min_tx < bfd->min_tx)) {
              bfd->min_tx = bfd->cfg_min_tx;
          }
-        bfd_poll(bfd);
+        need_poll = true;
      }
  
      min_rx = smap_get_int(cfg, "min_rx", 1000);
@@ -326,7 +327,7 @@ bfd_configure(struct bfd *bfd, const char *name, const struct smap *cfg)
              || (!bfd_in_poll(bfd) && bfd->cfg_min_rx > bfd->min_rx)) {
              bfd->min_rx = bfd->cfg_min_rx;
          }
-        bfd_poll(bfd);
+        need_poll = true;
      }
  
      cpath_down = smap_get_bool(cfg, "cpath_down", false);
@@ -335,7 +336,7 @@ bfd_configure(struct bfd *bfd, const char *name, const struct smap *cfg)
          if (bfd->diag == DIAG_NONE || bfd->diag == DIAG_CPATH_DOWN) {
              bfd_set_state(bfd, bfd->state, DIAG_NONE);
          }
-        bfd_poll(bfd);
+        need_poll = true;
      }
  
      hwaddr = smap_get(cfg, "bfd_dst_mac");
@@ -347,6 +348,9 @@ bfd_configure(struct bfd *bfd, const char *name, const struct smap *cfg)
          bfd->eth_dst_set = false;
      }
  
+    if (need_poll) {
+        bfd_poll(bfd);
+    }
      ovs_mutex_unlock(&mutex);
      return bfd;
  }
@@ -516,7 +520,7 @@ bfd_should_process_flow(const struct bfd *bfd, const struct flow *flow,
      return (flow->dl_type == htons(ETH_TYPE_IP)
              && flow->nw_proto == IPPROTO_UDP
              && flow->tp_dst == htons(BFD_DEST_PORT)
-            && (check_tnl_key || flow->tunnel.tun_id == htonll(0)));
+            && (!check_tnl_key || flow->tunnel.tun_id == htonll(0)));
  }
  
  void
@@ -782,6 +786,8 @@ bfd_flag_str(enum flags flags)
          ds_put_cstr(&ds, "poll ");
      }
  
+    /* Do not copy the trailing whitespace. */
+    ds_chomp(&ds, ' ');
      ovs_strlcpy(flag_str, ds_cstr(&ds), sizeof flag_str);
      ds_destroy(&ds);
      return flag_str;
diff --git a/lib/bond.c b/lib/bond.c

index 06dd362..3834774 100644 (file)
--- a/lib/bond.c
+++ b/lib/bond.c
@@ -475,7 +475,7 @@ bond_wait(struct bond *bond)
          poll_timer_wait_until(bond->next_fake_iface_update);
      }
  
-    if (!bond->bond_revalidate) {
+    if (bond->bond_revalidate) {
          poll_immediate_wake();
      }
      ovs_rwlock_unlock(&rwlock);
@@ -661,11 +661,14 @@ bond_choose_output_slave(struct bond *bond, const struct flow *flow,
                           struct flow_wildcards *wc, uint16_t vlan)
  {
      struct bond_slave *slave;
+    void *aux;
  
      ovs_rwlock_rdlock(&rwlock);
      slave = choose_output_slave(bond, flow, wc, vlan);
+    aux = slave ? slave->aux : NULL;
      ovs_rwlock_unlock(&rwlock);
-    return slave;
+
+    return aux;
  }
  \f
  /* Rebalancing. */
diff --git a/lib/compiler.h b/lib/compiler.h

index 2ca81bd..519b832 100644 (file)
--- a/lib/compiler.h
+++ b/lib/compiler.h
@@ -128,32 +128,7 @@
  #define OVS_EXCLUDED(...) __attribute__((locks_excluded(__VA_ARGS__)))
  #define OVS_ACQ_BEFORE(...) __attribute__((acquired_before(__VA_ARGS__)))
  #define OVS_ACQ_AFTER(...) __attribute__((acquired_after(__VA_ARGS__)))
-#elif __CHECKER__
-/* "sparse" annotations for mutexes and mutex-like constructs.
- *
- * Change the thread-safety check annotations to use "context" attribute.
- *
- * OVS_MACRO_LOCK and OVS_MACRO_RELEASE are suitable for use within macros,
- * where there is no function prototype to annotate. */
-#define OVS_LOCKABLE
-#define OVS_REQ_RDLOCK(...) __attribute__((context(MUTEX, 1, 1)))
-#define OVS_ACQ_RDLOCK(...) __attribute__((context(MUTEX, 0, 1)))
-#define OVS_REQ_WRLOCK(...) __attribute__((context(MUTEX, 1, 1)))
-#define OVS_ACQ_WRLOCK(...) __attribute__((context(MUTEX, 0, 1)))
-#define OVS_REQUIRES(...)   __attribute__((context(MUTEX, 1, 1)))
-#define OVS_ACQUIRES(...)   __attribute__((context(MUTEX, 0, 1)))
-#define OVS_TRY_WRLOCK(RETVAL, ...)
-#define OVS_TRY_RDLOCK(RETVAL, ...)
-#define OVS_TRY_LOCK(REVAL, ...)
-#define OVS_GUARDED
-#define OVS_GUARDED_BY(...)
-#define OVS_EXCLUDED(...)
-#define OVS_RELEASES(...)   __attribute__((context(MUTEX, 1, 0)))
-#define OVS_ACQ_BEFORE(...)
-#define OVS_ACQ_AFTER(...)
-#define OVS_MACRO_LOCK(...) __context__(MUTEX, 0, 1)
-#define OVS_MACRO_RELEASE(...) __context__(MUTEX, 1, 0)
-#else
+#else  /* not Clang */
  #define OVS_LOCKABLE
  #define OVS_REQ_RDLOCK(...)
  #define OVS_ACQ_RDLOCK(...)
@@ -170,8 +145,6 @@
  #define OVS_RELEASES(...)
  #define OVS_ACQ_BEFORE(...)
  #define OVS_ACQ_AFTER(...)
-#define OVS_MACRO_LOCK(...)
-#define OVS_MACRO_RELEASE(...)
  #endif
  
  /* ISO C says that a C implementation may choose any integer type for an enum
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c

index a8a54a1..07c1467 100644 (file)
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -49,6 +49,7 @@
  #include "packets.h"
  #include "poll-loop.h"
  #include "random.h"
+#include "seq.h"
  #include "shash.h"
  #include "sset.h"
  #include "timeval.h"
@@ -92,6 +93,7 @@ struct dp_netdev {
  
      struct dp_netdev_queue queues[N_QUEUES];
      struct hmap flow_table;     /* Flow table. */
+    struct seq *queue_seq;      /* Incremented whenever a packet is queued. */
  
      /* Statistics. */
      long long int n_hit;        /* Number of flow table matches. */
@@ -101,7 +103,7 @@ struct dp_netdev {
      /* Ports. */
      struct dp_netdev_port *ports[MAX_PORTS];
      struct list port_list;
-    unsigned int serial;
+    struct seq *port_seq;       /* Incremented whenever a port changes. */
  };
  
  /* A port in a netdev-based datapath. */
@@ -134,7 +136,7 @@ struct dp_netdev_flow {
  struct dpif_netdev {
      struct dpif dpif;
      struct dp_netdev *dp;
-    unsigned int dp_serial;
+    uint64_t last_port_seq;
  };
  
  /* All netdev-based datapaths. */
@@ -164,7 +166,7 @@ static void dp_netdev_execute_actions(struct dp_netdev *,
  static void dp_netdev_port_input(struct dp_netdev *dp,
                                   struct dp_netdev_port *port,
                                   struct ofpbuf *packet, uint32_t skb_priority,
-                                 uint32_t skb_mark, const struct flow_tnl *tnl);
+                                 uint32_t pkt_mark, const struct flow_tnl *tnl);
  
  static struct dpif_netdev *
  dpif_netdev_cast(const struct dpif *dpif)
@@ -225,7 +227,7 @@ create_dpif_netdev(struct dp_netdev *dp)
      dpif = xmalloc(sizeof *dpif);
      dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
      dpif->dp = dp;
-    dpif->dp_serial = dp->serial;
+    dpif->last_port_seq = seq_read(dp->port_seq);
  
      return &dpif->dpif;
  }
@@ -288,8 +290,10 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
      for (i = 0; i < N_QUEUES; i++) {
          dp->queues[i].head = dp->queues[i].tail = 0;
      }
+    dp->queue_seq = seq_create();
      hmap_init(&dp->flow_table);
      list_init(&dp->port_list);
+    dp->port_seq = seq_create();
  
      error = do_add_port(dp, name, "internal", ODPP_LOCAL);
      if (error) {
@@ -352,7 +356,9 @@ dp_netdev_free(struct dp_netdev *dp)
          do_del_port(dp, port->port_no);
      }
      dp_netdev_purge_queues(dp);
+    seq_destroy(dp->queue_seq);
      hmap_destroy(&dp->flow_table);
+    seq_destroy(dp->port_seq);
      free(dp->name);
      free(dp);
  }
@@ -454,7 +460,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
  
      list_push_back(&dp->port_list, &port->node);
      dp->ports[odp_to_u32(port_no)] = port;
-    dp->serial++;
+    seq_change(dp->port_seq);
  
      return 0;
  }
@@ -554,7 +560,7 @@ do_del_port(struct dp_netdev *dp, odp_port_t port_no)
  
      list_remove(&port->node);
      dp->ports[odp_to_u32(port_no)] = NULL;
-    dp->serial++;
+    seq_change(dp->port_seq);
  
      netdev_close(port->netdev);
      netdev_restore_flags(port->sf);
@@ -700,11 +706,13 @@ static int
  dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
  {
      struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
+    uint64_t new_port_seq;
      int error;
  
      ovs_mutex_lock(&dp_netdev_mutex);
-    if (dpif->dp_serial != dpif->dp->serial) {
-        dpif->dp_serial = dpif->dp->serial;
+    new_port_seq = seq_read(dpif->dp->port_seq);
+    if (dpif->last_port_seq != new_port_seq) {
+        dpif->last_port_seq = new_port_seq;
          error = ENOBUFS;
      } else {
          error = EAGAIN;
@@ -719,14 +727,8 @@ dpif_netdev_port_poll_wait(const struct dpif *dpif_)
  {
      struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
  
-    /* XXX In a multithreaded process, there is a race window between this
-     * function and the poll_block() in one thread and a change in
-     * dpif->dp->serial in another thread. */
-
      ovs_mutex_lock(&dp_netdev_mutex);
-    if (dpif->dp_serial != dpif->dp->serial) {
-        poll_immediate_wake();
-    }
+    seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
      ovs_mutex_unlock(&dp_netdev_mutex);
  }
  
@@ -1107,13 +1109,15 @@ dpif_netdev_recv(struct dpif *dpif, struct dpif_upcall *upcall,
  static void
  dpif_netdev_recv_wait(struct dpif *dpif)
  {
-    /* XXX In a multithreaded process, there is a race window between this
-     * function and the poll_block() in one thread and a packet being queued in
-     * another thread. */
+    struct dp_netdev *dp = get_dp_netdev(dpif);
+    uint64_t seq;
  
      ovs_mutex_lock(&dp_netdev_mutex);
+    seq = seq_read(dp->queue_seq);
      if (find_nonempty_queue(dpif)) {
          poll_immediate_wake();
+    } else {
+        seq_wait(dp->queue_seq, seq);
      }
      ovs_mutex_unlock(&dp_netdev_mutex);
  }
@@ -1139,7 +1143,7 @@ dp_netdev_flow_used(struct dp_netdev_flow *flow, const struct ofpbuf *packet)
  static void
  dp_netdev_port_input(struct dp_netdev *dp, struct dp_netdev_port *port,
                       struct ofpbuf *packet, uint32_t skb_priority,
-                     uint32_t skb_mark, const struct flow_tnl *tnl)
+                     uint32_t pkt_mark, const struct flow_tnl *tnl)
  {
      struct dp_netdev_flow *flow;
      struct flow key;
@@ -1149,7 +1153,7 @@ dp_netdev_port_input(struct dp_netdev *dp, struct dp_netdev_port *port,
          return;
      }
      in_port_.odp_port = port->port_no;
-    flow_extract(packet, skb_priority, skb_mark, tnl, &in_port_, &key);
+    flow_extract(packet, skb_priority, pkt_mark, tnl, &in_port_, &key);
      flow = dp_netdev_lookup_flow(dp, &key);
      if (flow) {
          dp_netdev_flow_used(flow, packet);
@@ -1274,6 +1278,8 @@ dp_netdev_output_userspace(struct dp_netdev *dp, const struct ofpbuf *packet,
          buf->size = packet->size;
          upcall->packet = buf;
  
+        seq_change(dp->queue_seq);
+
          return 0;
      } else {
          dp->n_lost++;
@@ -1375,7 +1381,7 @@ dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
      dp->ports[odp_to_u32(port->port_no)] = NULL;
      dp->ports[port_no] = port;
      port->port_no = u32_to_odp(port_no);
-    dp->serial++;
+    seq_change(dp->port_seq);
      unixctl_command_reply(conn, NULL);
  }
  
diff --git a/lib/flow.c b/lib/flow.c

index d899d26..3e29aa1 100644 (file)
--- a/lib/flow.c
+++ b/lib/flow.c
@@ -356,7 +356,7 @@ invalid:
   *      present and has a correct length, and otherwise NULL.
   */
  void
-flow_extract(struct ofpbuf *packet, uint32_t skb_priority, uint32_t skb_mark,
+flow_extract(struct ofpbuf *packet, uint32_t skb_priority, uint32_t pkt_mark,
               const struct flow_tnl *tnl, const union flow_in_port *in_port,
               struct flow *flow)
  {
@@ -375,7 +375,7 @@ flow_extract(struct ofpbuf *packet, uint32_t skb_priority, uint32_t skb_mark,
          flow->in_port = *in_port;
      }
      flow->skb_priority = skb_priority;
-    flow->skb_mark = skb_mark;
+    flow->pkt_mark = pkt_mark;
  
      packet->l2   = b.data;
      packet->l2_5 = NULL;
@@ -500,6 +500,7 @@ flow_get_metadata(const struct flow *flow, struct flow_metadata *fmd)
      fmd->tun_dst = flow->tunnel.ip_dst;
      fmd->metadata = flow->metadata;
      memcpy(fmd->regs, flow->regs, sizeof fmd->regs);
+    fmd->pkt_mark = flow->pkt_mark;
      fmd->in_port = flow->in_port.ofp_port;
  }
  
diff --git a/lib/flow.h b/lib/flow.h

index 7c3654b..8164d9c 100644 (file)
--- a/lib/flow.h
+++ b/lib/flow.h
@@ -96,7 +96,7 @@ struct flow {
      ovs_be32 nw_dst;            /* IPv4 destination address. */
      ovs_be32 ipv6_label;        /* IPv6 flow label. */
      union flow_in_port in_port; /* Input port.*/
-    uint32_t skb_mark;          /* Packet mark. */
+    uint32_t pkt_mark;          /* Packet mark. */
      ovs_be32 mpls_lse;          /* MPLS label stack entry. */
      uint16_t mpls_depth;        /* Depth of MPLS stack. */
      ovs_be16 vlan_tci;          /* If 802.1Q, TCI | VLAN_CFI; otherwise 0. */
@@ -128,6 +128,7 @@ struct flow_metadata {
      ovs_be32 tun_dst;                /* Tunnel outer IPv4 dst addr */
      ovs_be64 metadata;               /* OpenFlow 1.1+ metadata field. */
      uint32_t regs[FLOW_N_REGS];      /* Registers. */
+    uint32_t pkt_mark;               /* Packet mark. */
      ofp_port_t in_port;              /* OpenFlow port or zero. */
  };
  
diff --git a/lib/match.c b/lib/match.c

index 91c05a7..e97b0b1 100644 (file)
--- a/lib/match.c
+++ b/lib/match.c
@@ -60,8 +60,8 @@ match_wc_init(struct match *match, const struct flow *flow)
          memset(&wc->masks.skb_priority, 0xff, sizeof wc->masks.skb_priority);
      }
  
-    if (flow->skb_mark) {
-        memset(&wc->masks.skb_mark, 0xff, sizeof wc->masks.skb_mark);
+    if (flow->pkt_mark) {
+        memset(&wc->masks.pkt_mark, 0xff, sizeof wc->masks.pkt_mark);
      }
  
      for (i = 0; i < FLOW_N_REGS; i++) {
@@ -138,7 +138,6 @@ match_init_exact(struct match *match, const struct flow *flow)
  {
      match->flow = *flow;
      match->flow.skb_priority = 0;
-    match->flow.skb_mark = 0;
      flow_wildcards_init_exact(&match->wc);
  }
  
@@ -286,10 +285,16 @@ match_set_skb_priority(struct match *match, uint32_t skb_priority)
  }
  
  void
-match_set_skb_mark(struct match *match, uint32_t skb_mark)
+match_set_pkt_mark(struct match *match, uint32_t pkt_mark)
  {
-    match->wc.masks.skb_mark = UINT32_MAX;
-    match->flow.skb_mark = skb_mark;
+    match_set_pkt_mark_masked(match, pkt_mark, UINT32_MAX);
+}
+
+void
+match_set_pkt_mark_masked(struct match *match, uint32_t pkt_mark, uint32_t mask)
+{
+    match->flow.pkt_mark = pkt_mark & mask;
+    match->wc.masks.pkt_mark = mask;
  }
  
  void
@@ -836,8 +841,16 @@ match_format(const struct match *match, struct ds *s, unsigned int priority)
          ds_put_format(s, "priority=%u,", priority);
      }
  
-    if (wc->masks.skb_mark) {
-        ds_put_format(s, "skb_mark=%#"PRIx32",", f->skb_mark);
+    switch (wc->masks.pkt_mark) {
+    case 0:
+        break;
+    case UINT32_MAX:
+        ds_put_format(s, "pkt_mark=%#"PRIx32",", f->pkt_mark);
+        break;
+    default:
+        ds_put_format(s, "pkt_mark=%#"PRIx32"/%#"PRIx32",",
+                      f->pkt_mark, wc->masks.pkt_mark);
+        break;
      }
  
      if (wc->masks.skb_priority) {
diff --git a/lib/match.h b/lib/match.h

index 0ea1f2d..5788721 100644 (file)
--- a/lib/match.h
+++ b/lib/match.h
@@ -61,7 +61,8 @@ void match_set_tun_tos_masked(struct match *match, uint8_t tos, uint8_t mask);
  void match_set_tun_flags(struct match *match, uint16_t flags);
  void match_set_tun_flags_masked(struct match *match, uint16_t flags, uint16_t mask);
  void match_set_in_port(struct match *, ofp_port_t ofp_port);
-void match_set_skb_mark(struct match *, uint32_t skb_mark);
+void match_set_pkt_mark(struct match *, uint32_t pkt_mark);
+void match_set_pkt_mark_masked(struct match *, uint32_t pkt_mark, uint32_t mask);
  void match_set_skb_priority(struct match *, uint32_t skb_priority);
  void match_set_dl_type(struct match *, ovs_be16);
  void match_set_dl_src(struct match *, const uint8_t[6]);
diff --git a/lib/meta-flow.c b/lib/meta-flow.c

index 11fdfaa..ce061a3 100644 (file)
--- a/lib/meta-flow.c
+++ b/lib/meta-flow.c
@@ -137,14 +137,14 @@ static const struct mf_field mf_fields[MFF_N_IDS] = {
          0, NULL,
          0, NULL,
      }, {
-        MFF_SKB_MARK, "skb_mark", NULL,
+        MFF_PKT_MARK, "pkt_mark", NULL,
          MF_FIELD_SIZES(be32),
-        MFM_NONE,
+        MFM_FULLY,
          MFS_HEXADECIMAL,
          MFP_NONE,
-        false,
-        0, NULL,
-        0, NULL,
+        true,
+        NXM_NX_PKT_MARK, "NXM_NX_PKT_MARK",
+        NXM_NX_PKT_MARK, "NXM_NX_PKT_MARK",
      },
  
  #define REGISTER(IDX)                           \
@@ -706,8 +706,8 @@ mf_is_all_wild(const struct mf_field *mf, const struct flow_wildcards *wc)
          return !wc->masks.in_port.ofp_port;
      case MFF_SKB_PRIORITY:
          return !wc->masks.skb_priority;
-    case MFF_SKB_MARK:
-        return !wc->masks.skb_mark;
+    case MFF_PKT_MARK:
+        return !wc->masks.pkt_mark;
      CASE_MFF_REGS:
          return !wc->masks.regs[mf->id - MFF_REG0];
  
@@ -912,7 +912,7 @@ mf_is_value_valid(const struct mf_field *mf, const union mf_value *value)
      case MFF_METADATA:
      case MFF_IN_PORT:
      case MFF_SKB_PRIORITY:
-    case MFF_SKB_MARK:
+    case MFF_PKT_MARK:
      CASE_MFF_REGS:
      case MFF_ETH_SRC:
      case MFF_ETH_DST:
@@ -1026,8 +1026,8 @@ mf_get_value(const struct mf_field *mf, const struct flow *flow,
          value->be32 = htonl(flow->skb_priority);
          break;
  
-    case MFF_SKB_MARK:
-        value->be32 = htonl(flow->skb_mark);
+    case MFF_PKT_MARK:
+        value->be32 = htonl(flow->pkt_mark);
          break;
  
      CASE_MFF_REGS:
@@ -1216,8 +1216,8 @@ mf_set_value(const struct mf_field *mf,
          match_set_skb_priority(match, ntohl(value->be32));
          break;
  
-    case MFF_SKB_MARK:
-        match_set_skb_mark(match, ntohl(value->be32));
+    case MFF_PKT_MARK:
+        match_set_pkt_mark(match, ntohl(value->be32));
          break;
  
      CASE_MFF_REGS:
@@ -1405,8 +1405,8 @@ mf_set_flow_value(const struct mf_field *mf,
          flow->skb_priority = ntohl(value->be32);
          break;
  
-    case MFF_SKB_MARK:
-        flow->skb_mark = ntohl(value->be32);
+    case MFF_PKT_MARK:
+        flow->pkt_mark = ntohl(value->be32);
          break;
  
      CASE_MFF_REGS:
@@ -1607,9 +1607,9 @@ mf_set_wild(const struct mf_field *mf, struct match *match)
          match->wc.masks.skb_priority = 0;
          break;
  
-    case MFF_SKB_MARK:
-        match->flow.skb_mark = 0;
-        match->wc.masks.skb_mark = 0;
+    case MFF_PKT_MARK:
+        match->flow.pkt_mark = 0;
+        match->wc.masks.pkt_mark = 0;
          break;
  
      CASE_MFF_REGS:
@@ -1780,7 +1780,6 @@ mf_set(const struct mf_field *mf,
      switch (mf->id) {
      case MFF_IN_PORT:
      case MFF_IN_PORT_OXM:
-    case MFF_SKB_MARK:
      case MFF_SKB_PRIORITY:
      case MFF_ETH_TYPE:
      case MFF_DL_VLAN:
@@ -1829,6 +1828,11 @@ mf_set(const struct mf_field *mf,
                               ntohl(value->be32), ntohl(mask->be32));
          break;
  
+    case MFF_PKT_MARK:
+        match_set_pkt_mark_masked(match, ntohl(value->be32),
+                                  ntohl(mask->be32));
+        break;
+
      case MFF_ETH_DST:
          match_set_dl_dst_masked(match, value->mac, mask->mac);
          break;
@@ -1985,7 +1989,7 @@ mf_random_value(const struct mf_field *mf, union mf_value *value)
      case MFF_TUN_FLAGS:
      case MFF_METADATA:
      case MFF_IN_PORT:
-    case MFF_SKB_MARK:
+    case MFF_PKT_MARK:
      case MFF_SKB_PRIORITY:
      CASE_MFF_REGS:
      case MFF_ETH_SRC:
diff --git a/lib/meta-flow.h b/lib/meta-flow.h

index bc402dc..93b894d 100644 (file)
--- a/lib/meta-flow.h
+++ b/lib/meta-flow.h
@@ -41,7 +41,7 @@ enum mf_field_id {
      MFF_IN_PORT,                /* be16 */
      MFF_IN_PORT_OXM,            /* be32 */
      MFF_SKB_PRIORITY,           /* be32 */
-    MFF_SKB_MARK,               /* be32 */
+    MFF_PKT_MARK,               /* be32 */
  
  #if FLOW_N_REGS > 0
      MFF_REG0,                   /* be32 */
diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c

index 0f625af..180ce7f 100644 (file)
--- a/lib/netdev-bsd.c
+++ b/lib/netdev-bsd.c
@@ -42,6 +42,8 @@
  #include <sys/sysctl.h>
  #if defined(__NetBSD__)
  #include <net/route.h>
+#include <netinet/in.h>
+#include <netinet/if_inarp.h>
  #endif
  
  #include "rtbsd.h"
@@ -76,6 +78,13 @@ struct netdev_rx_bsd {
  
  struct netdev_bsd {
      struct netdev up;
+
+    /* Never changes after initialization. */
+    char *kernel_name;
+
+    /* Protects all members below. */
+    struct ovs_mutex mutex;
+
      unsigned int cache_valid;
      unsigned int change_seq;
  
@@ -92,8 +101,6 @@ struct netdev_bsd {
      /* Used for sending packets on non-tap devices. */
      pcap_t *pcap;
      int fd;
-
-    char *kernel_name;
  };
  
  
@@ -128,7 +135,7 @@ static void destroy_tap(int fd, const char *name);
  static int get_flags(const struct netdev *, int *flagsp);
  static int set_flags(const char *, int flags);
  static int do_set_addr(struct netdev *netdev,
-                       int ioctl_nr, const char *ioctl_name,
+                       unsigned long ioctl_nr, const char *ioctl_name,
                         struct in_addr addr);
  static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
  static int set_etheraddr(const char *netdev_name, int hwaddr_family,
@@ -139,7 +146,7 @@ static int ifr_get_flags(const struct ifreq *);
  static void ifr_set_flags(struct ifreq *, int flags);
  
  #ifdef __NetBSD__
-static int af_link_ioctl(int command, const void *arg);
+static int af_link_ioctl(unsigned long command, const void *arg);
  #endif
  
  static void netdev_bsd_run(void);
@@ -286,6 +293,7 @@ netdev_bsd_construct_system(struct netdev *netdev_)
          return error;
      }
  
+    ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL);
      netdev->change_seq = 1;
      netdev->tap_fd = -1;
      netdev->kernel_name = xstrdup(netdev_->name);
@@ -319,6 +327,7 @@ netdev_bsd_construct_tap(struct netdev *netdev_)
  
      /* Create a tap device by opening /dev/tap.  The TAPGIFNAME ioctl is used
       * to retrieve the name of the tap device. */
+    ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL);
      netdev->tap_fd = open("/dev/tap", O_RDWR);
      netdev->change_seq = 1;
      if (netdev->tap_fd < 0) {
@@ -373,6 +382,7 @@ netdev_bsd_construct_tap(struct netdev *netdev_)
      return 0;
  
  error_unref_notifier:
+    ovs_mutex_destroy(&netdev->mutex);
      cache_notifier_unref();
  error:
      free(kernel_name);
@@ -393,6 +403,7 @@ netdev_bsd_destruct(struct netdev *netdev_)
          pcap_close(netdev->pcap);
      }
      free(netdev->kernel_name);
+    ovs_mutex_destroy(&netdev->mutex);
  }
  
  static void
@@ -485,21 +496,23 @@ netdev_bsd_rx_construct(struct netdev_rx *rx_)
      struct netdev_rx_bsd *rx = netdev_rx_bsd_cast(rx_);
      struct netdev *netdev_ = rx->up.netdev;
      struct netdev_bsd *netdev = netdev_bsd_cast(netdev_);
+    int error;
  
      if (!strcmp(netdev_get_type(netdev_), "tap")) {
          rx->pcap_handle = NULL;
          rx->fd = netdev->tap_fd;
+        error = 0;
      } else {
-        int error = netdev_bsd_open_pcap(netdev_get_kernel_name(netdev_),
-                                         &rx->pcap_handle, &rx->fd);
-        if (error) {
-            return error;
+        ovs_mutex_lock(&netdev->mutex);
+        error = netdev_bsd_open_pcap(netdev_get_kernel_name(netdev_),
+                                     &rx->pcap_handle, &rx->fd);
+        if (!error) {
+            netdev_bsd_changed(netdev);
          }
-
-        netdev_bsd_changed(netdev);
+        ovs_mutex_unlock(&netdev->mutex);
      }
  
-    return 0;
+    return error;
  }
  
  static void
@@ -662,15 +675,16 @@ netdev_bsd_send(struct netdev *netdev_, const void *data, size_t size)
  {
      struct netdev_bsd *dev = netdev_bsd_cast(netdev_);
      const char *name = netdev_get_name(netdev_);
+    int error;
  
+    ovs_mutex_lock(&dev->mutex);
      if (dev->tap_fd < 0 && !dev->pcap) {
-        int error = netdev_bsd_open_pcap(name, &dev->pcap, &dev->fd);
-        if (error) {
-            return error;
-        }
+        error = netdev_bsd_open_pcap(name, &dev->pcap, &dev->fd);
+    } else {
+        error = 0;
      }
  
-    for (;;) {
+    while (!error) {
          ssize_t retval;
          if (dev->tap_fd >= 0) {
              retval = write(dev->tap_fd, data, size);
@@ -680,19 +694,24 @@ netdev_bsd_send(struct netdev *netdev_, const void *data, size_t size)
          if (retval < 0) {
              if (errno == EINTR) {
                  continue;
-            } else if (errno != EAGAIN) {
-                VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
-                             name, ovs_strerror(errno));
+            } else {
+                error = errno;
+                if (error != EAGAIN) {
+                    VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: "
+                                 "%s", name, ovs_strerror(error));
+                }
              }
-            return errno;
          } else if (retval != size) {
              VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
                           "%zu) on %s", retval, size, name);
-           return EMSGSIZE;
+            error = EMSGSIZE;
          } else {
-            return 0;
+            break;
          }
      }
+
+    ovs_mutex_unlock(&dev->mutex);
+    return error;
  }
  
  /*
@@ -705,6 +724,7 @@ netdev_bsd_send_wait(struct netdev *netdev_)
  {
      struct netdev_bsd *dev = netdev_bsd_cast(netdev_);
  
+    ovs_mutex_lock(&dev->mutex);
      if (dev->tap_fd >= 0) {
          /* TAP device always accepts packets. */
          poll_immediate_wake();
@@ -714,6 +734,7 @@ netdev_bsd_send_wait(struct netdev *netdev_)
          /* We haven't even tried to send a packet yet. */
          poll_immediate_wake();
      }
+    ovs_mutex_unlock(&dev->mutex);
  }
  
  /*
@@ -725,8 +746,9 @@ netdev_bsd_set_etheraddr(struct netdev *netdev_,
                           const uint8_t mac[ETH_ADDR_LEN])
  {
      struct netdev_bsd *netdev = netdev_bsd_cast(netdev_);
-    int error;
+    int error = 0;
  
+    ovs_mutex_lock(&netdev->mutex);
      if (!(netdev->cache_valid & VALID_ETHERADDR)
          || !eth_addr_equals(netdev->etheraddr, mac)) {
          error = set_etheraddr(netdev_get_kernel_name(netdev_), AF_LINK,
@@ -736,9 +758,9 @@ netdev_bsd_set_etheraddr(struct netdev *netdev_,
              memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
              netdev_bsd_changed(netdev);
          }
-    } else {
-        error = 0;
      }
+    ovs_mutex_unlock(&netdev->mutex);
+
      return error;
  }
  
@@ -751,18 +773,22 @@ netdev_bsd_get_etheraddr(const struct netdev *netdev_,
                           uint8_t mac[ETH_ADDR_LEN])
  {
      struct netdev_bsd *netdev = netdev_bsd_cast(netdev_);
+    int error = 0;
  
+    ovs_mutex_lock(&netdev->mutex);
      if (!(netdev->cache_valid & VALID_ETHERADDR)) {
-        int error = get_etheraddr(netdev_get_kernel_name(netdev_),
-                                  netdev->etheraddr);
-        if (error) {
-            return error;
+        error = get_etheraddr(netdev_get_kernel_name(netdev_),
+                              netdev->etheraddr);
+        if (!error) {
+            netdev->cache_valid |= VALID_ETHERADDR;
          }
-        netdev->cache_valid |= VALID_ETHERADDR;
      }
-    memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
+    if (!error) {
+        memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
+    }
+    ovs_mutex_unlock(&netdev->mutex);
  
-    return 0;
+    return error;
  }
  
  /*
@@ -774,30 +800,37 @@ static int
  netdev_bsd_get_mtu(const struct netdev *netdev_, int *mtup)
  {
      struct netdev_bsd *netdev = netdev_bsd_cast(netdev_);
+    int error = 0;
  
+    ovs_mutex_lock(&netdev->mutex);
      if (!(netdev->cache_valid & VALID_MTU)) {
          struct ifreq ifr;
-        int error;
  
          error = af_inet_ifreq_ioctl(netdev_get_kernel_name(netdev_), &ifr,
                                      SIOCGIFMTU, "SIOCGIFMTU");
-        if (error) {
-            return error;
+        if (!error) {
+            netdev->mtu = ifr.ifr_mtu;
+            netdev->cache_valid |= VALID_MTU;
          }
-        netdev->mtu = ifr.ifr_mtu;
-        netdev->cache_valid |= VALID_MTU;
      }
+    if (!error) {
+        *mtup = netdev->mtu;
+    }
+    ovs_mutex_unlock(&netdev->mutex);
  
-    *mtup = netdev->mtu;
      return 0;
  }
  
  static int
-netdev_bsd_get_ifindex(const struct netdev *netdev)
+netdev_bsd_get_ifindex(const struct netdev *netdev_)
  {
+    struct netdev_bsd *netdev = netdev_bsd_cast(netdev_);
      int ifindex, error;
  
-    error = get_ifindex(netdev, &ifindex);
+    ovs_mutex_lock(&netdev->mutex);
+    error = get_ifindex(netdev_, &ifindex);
+    ovs_mutex_unlock(&netdev->mutex);
+
      return error ? -error : ifindex;
  }
  
@@ -805,34 +838,37 @@ static int
  netdev_bsd_get_carrier(const struct netdev *netdev_, bool *carrier)
  {
      struct netdev_bsd *netdev = netdev_bsd_cast(netdev_);
+    int error = 0;
  
+    ovs_mutex_lock(&netdev->mutex);
      if (!(netdev->cache_valid & VALID_CARRIER)) {
          struct ifmediareq ifmr;
-        int error;
  
          memset(&ifmr, 0, sizeof(ifmr));
          strncpy(ifmr.ifm_name, netdev_get_kernel_name(netdev_),
                  sizeof ifmr.ifm_name);
  
          error = af_inet_ioctl(SIOCGIFMEDIA, &ifmr);
-        if (error) {
+        if (!error) {
+            netdev->carrier = (ifmr.ifm_status & IFM_ACTIVE) == IFM_ACTIVE;
+            netdev->cache_valid |= VALID_CARRIER;
+
+            /* If the interface doesn't report whether the media is active,
+             * just assume it is active. */
+            if ((ifmr.ifm_status & IFM_AVALID) == 0) {
+                netdev->carrier = true;
+            }
+        } else {
              VLOG_DBG_RL(&rl, "%s: ioctl(SIOCGIFMEDIA) failed: %s",
                          netdev_get_name(netdev_), ovs_strerror(error));
-            return error;
-        }
-
-        netdev->carrier = (ifmr.ifm_status & IFM_ACTIVE) == IFM_ACTIVE;
-        netdev->cache_valid |= VALID_CARRIER;
-
-        /* If the interface doesn't report whether the media is active,
-         * just assume it is active. */
-        if ((ifmr.ifm_status & IFM_AVALID) == 0) {
-            netdev->carrier = true;
          }
      }
-    *carrier = netdev->carrier;
+    if (!error) {
+        *carrier = netdev->carrier;
+    }
+    ovs_mutex_unlock(&netdev->mutex);
  
-    return 0;
+    return error;
  }
  
  static void
@@ -1074,33 +1110,35 @@ netdev_bsd_get_in4(const struct netdev *netdev_, struct in_addr *in4,
                     struct in_addr *netmask)
  {
      struct netdev_bsd *netdev = netdev_bsd_cast(netdev_);
+    int error = 0;
  
+    ovs_mutex_lock(&netdev->mutex);
      if (!(netdev->cache_valid & VALID_IN4)) {
-        const struct sockaddr_in *sin;
          struct ifreq ifr;
-        int error;
  
          ifr.ifr_addr.sa_family = AF_INET;
          error = af_inet_ifreq_ioctl(netdev_get_kernel_name(netdev_), &ifr,
                                      SIOCGIFADDR, "SIOCGIFADDR");
-        if (error) {
-            return error;
-        }
+        if (!error) {
+            const struct sockaddr_in *sin;
  
-        sin = (struct sockaddr_in *) &ifr.ifr_addr;
-        netdev->in4 = sin->sin_addr;
-        error = af_inet_ifreq_ioctl(netdev_get_kernel_name(netdev_), &ifr,
-                                    SIOCGIFNETMASK, "SIOCGIFNETMASK");
-        if (error) {
-            return error;
+            sin = (struct sockaddr_in *) &ifr.ifr_addr;
+            netdev->in4 = sin->sin_addr;
+            netdev->cache_valid |= VALID_IN4;
+            error = af_inet_ifreq_ioctl(netdev_get_kernel_name(netdev_), &ifr,
+                                        SIOCGIFNETMASK, "SIOCGIFNETMASK");
+            if (!error) {
+                *netmask = sin->sin_addr;
+            }
          }
-        netdev->netmask = sin->sin_addr;
-        netdev->cache_valid |= VALID_IN4;
      }
-    *in4 = netdev->in4;
-    *netmask = netdev->netmask;
+    if (!error) {
+        *in4 = netdev->in4;
+        *netmask = netdev->netmask;
+    }
+    ovs_mutex_unlock(&netdev->mutex);
  
-    return in4->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
+    return error ? error : in4->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
  }
  
  /*
@@ -1115,6 +1153,7 @@ netdev_bsd_set_in4(struct netdev *netdev_, struct in_addr addr,
      struct netdev_bsd *netdev = netdev_bsd_cast(netdev_);
      int error;
  
+    ovs_mutex_lock(&netdev->mutex);
      error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", addr);
      if (!error) {
          if (addr.s_addr != INADDR_ANY) {
@@ -1128,6 +1167,8 @@ netdev_bsd_set_in4(struct netdev *netdev_, struct in_addr addr,
          }
          netdev_bsd_changed(netdev);
      }
+    ovs_mutex_unlock(&netdev->mutex);
+
      return error;
  }
  
@@ -1293,6 +1334,63 @@ netdev_bsd_get_next_hop(const struct in_addr *host OVS_UNUSED,
  #endif
  }
  
+static int
+netdev_bsd_arp_lookup(const struct netdev *netdev OVS_UNUSED,
+                      ovs_be32 ip OVS_UNUSED,
+                     uint8_t mac[ETH_ADDR_LEN] OVS_UNUSED)
+{
+#if defined(__NetBSD__)
+    const struct rt_msghdr *rtm;
+    size_t needed;
+    char *buf;
+    const char *cp;
+    const char *ep;
+    int mib[6];
+    int error;
+
+    buf = NULL;
+    mib[0] = CTL_NET;
+    mib[1] = PF_ROUTE;
+    mib[2] = 0;
+    mib[3] = AF_INET;
+    mib[4] = NET_RT_FLAGS;
+    mib[5] = RTF_LLINFO;
+    if (sysctl(mib, 6, NULL, &needed, NULL, 0) == -1) {
+        error = errno;
+        goto error;
+    }
+    buf = xmalloc(needed);
+    if (sysctl(mib, 6, buf, &needed, NULL, 0) == -1) {
+        error = errno;
+        goto error;
+    }
+    ep = buf + needed;
+    for (cp = buf; cp < ep; cp += rtm->rtm_msglen) {
+        const struct sockaddr_inarp *sina;
+        const struct sockaddr_dl *sdl;
+
+        rtm = (const void *)cp;
+        sina = (const void *)(rtm + 1);
+        if (ip != sina->sin_addr.s_addr) {
+            continue;
+        }
+        sdl = (const void *)
+           ((const char *)(const void *)sina + RT_ROUNDUP(sina->sin_len));
+        if (sdl->sdl_alen == ETH_ADDR_LEN) {
+            memcpy(mac, &sdl->sdl_data[sdl->sdl_nlen], ETH_ADDR_LEN);
+            error = 0;
+            goto error;
+        }
+    }
+    error = ENXIO;
+error:
+    free(buf);
+    return error;
+#else
+    return EOPNOTSUPP;
+#endif
+}
+
  static void
  make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
  {
@@ -1308,7 +1406,8 @@ make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
  
  static int
  do_set_addr(struct netdev *netdev,
-            int ioctl_nr, const char *ioctl_name, struct in_addr addr)
+            unsigned long ioctl_nr, const char *ioctl_name,
+            struct in_addr addr)
  {
      struct ifreq ifr;
      make_in4_sockaddr(&ifr.ifr_addr, addr);
@@ -1420,7 +1519,7 @@ const struct netdev_class netdev_bsd_class = {
      NULL, /* add_router */
      netdev_bsd_get_next_hop,
      NULL, /* get_status */
-    NULL, /* arp_lookup */
+    netdev_bsd_arp_lookup, /* arp_lookup */
  
      netdev_bsd_update_flags,
  
@@ -1483,7 +1582,7 @@ const struct netdev_class netdev_tap_class = {
      NULL, /* add_router */
      netdev_bsd_get_next_hop,
      NULL, /* get_status */
-    NULL, /* arp_lookup */
+    netdev_bsd_arp_lookup, /* arp_lookup */
  
      netdev_bsd_update_flags,
  
@@ -1677,7 +1776,7 @@ ifr_set_flags(struct ifreq *ifr, int flags)
  /* Calls ioctl() on an AF_LINK sock, passing the specified 'command' and
   * 'arg'.  Returns 0 if successful, otherwise a positive errno value. */
  int
-af_link_ioctl(int command, const void *arg)
+af_link_ioctl(unsigned long command, const void *arg)
  {
      static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
      static int sock;
diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c

index 0560ade..5c31210 100644 (file)
--- a/lib/netdev-dummy.c
+++ b/lib/netdev-dummy.c
@@ -44,8 +44,22 @@ struct dummy_stream {
      struct list txq;
  };
  
+/* Protects 'dummy_list'. */
+static struct ovs_mutex dummy_list_mutex = OVS_MUTEX_INITIALIZER;
+
+/* Contains all 'struct dummy_dev's. */
+static struct list dummy_list OVS_GUARDED_BY(dummy_list_mutex)
+    = LIST_INITIALIZER(&dummy_list);
+
  struct netdev_dummy {
      struct netdev up;
+
+    /* In dummy_list. */
+    struct list list_node OVS_GUARDED_BY(dummy_list_mutex);
+
+    /* Protects all members below. */
+    struct ovs_mutex mutex OVS_ACQ_AFTER(dummy_list_mutex);
+
      uint8_t hwaddr[ETH_ADDR_LEN];
      int mtu;
      struct netdev_stats stats;
@@ -60,8 +74,6 @@ struct netdev_dummy {
      struct list rxes;           /* List of child "netdev_rx_dummy"s. */
  };
  
-static const struct netdev_class dummy_class;
-
  /* Max 'recv_queue_len' in struct netdev_dummy. */
  #define NETDEV_DUMMY_MAX_QUEUE 100
  
@@ -75,7 +87,8 @@ struct netdev_rx_dummy {
  
  static unixctl_cb_func netdev_dummy_set_admin_state;
  static int netdev_dummy_construct(struct netdev *);
-static void netdev_dummy_poll_notify(struct netdev_dummy *);
+static void netdev_dummy_poll_notify(struct netdev_dummy *netdev)
+    OVS_REQUIRES(netdev->mutex);
  static void netdev_dummy_queue_packet(struct netdev_dummy *, struct ofpbuf *);
  
  static void dummy_stream_close(struct dummy_stream *);
@@ -103,15 +116,14 @@ netdev_rx_dummy_cast(const struct netdev_rx *rx)
  static void
  netdev_dummy_run(void)
  {
-    struct shash dummy_netdevs;
-    struct shash_node *node;
+    struct netdev_dummy *dev;
  
-    shash_init(&dummy_netdevs);
-    netdev_get_devices(&dummy_class, &dummy_netdevs);
-    SHASH_FOR_EACH (node, &dummy_netdevs) {
-        struct netdev_dummy *dev = node->data;
+    ovs_mutex_lock(&dummy_list_mutex);
+    LIST_FOR_EACH (dev, list_node, &dummy_list) {
          size_t i;
  
+        ovs_mutex_lock(&dev->mutex);
+
          if (dev->pstream) {
              struct stream *new_stream;
              int error;
@@ -203,9 +215,9 @@ netdev_dummy_run(void)
              }
          }
  
-        netdev_close(&dev->up);
+        ovs_mutex_unlock(&dev->mutex);
      }
-    shash_destroy(&dummy_netdevs);
+    ovs_mutex_unlock(&dummy_list_mutex);
  }
  
  static void
@@ -219,15 +231,13 @@ dummy_stream_close(struct dummy_stream *s)
  static void
  netdev_dummy_wait(void)
  {
-    struct shash dummy_netdevs;
-    struct shash_node *node;
+    struct netdev_dummy *dev;
  
-    shash_init(&dummy_netdevs);
-    netdev_get_devices(&dummy_class, &dummy_netdevs);
-    SHASH_FOR_EACH (node, &dummy_netdevs) {
-        struct netdev_dummy *dev = node->data;
+    ovs_mutex_lock(&dummy_list_mutex);
+    LIST_FOR_EACH (dev, list_node, &dummy_list) {
          size_t i;
  
+        ovs_mutex_lock(&dev->mutex);
          if (dev->pstream) {
              pstream_wait(dev->pstream);
          }
@@ -240,9 +250,9 @@ netdev_dummy_wait(void)
              }
              stream_recv_wait(s->stream);
          }
-        netdev_close(&dev->up);
+        ovs_mutex_unlock(&dev->mutex);
      }
-    shash_destroy(&dummy_netdevs);
+    ovs_mutex_unlock(&dummy_list_mutex);
  }
  
  static struct netdev *
@@ -260,6 +270,8 @@ netdev_dummy_construct(struct netdev *netdev_)
      unsigned int n;
  
      atomic_add(&next_n, 1, &n);
+
+    ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL);
      netdev->hwaddr[0] = 0xaa;
      netdev->hwaddr[1] = 0x55;
      netdev->hwaddr[2] = n >> 24;
@@ -277,6 +289,10 @@ netdev_dummy_construct(struct netdev *netdev_)
  
      list_init(&netdev->rxes);
  
+    ovs_mutex_lock(&dummy_list_mutex);
+    list_push_back(&dummy_list, &netdev->list_node);
+    ovs_mutex_unlock(&dummy_list_mutex);
+
      return 0;
  }
  
@@ -286,11 +302,16 @@ netdev_dummy_destruct(struct netdev *netdev_)
      struct netdev_dummy *netdev = netdev_dummy_cast(netdev_);
      size_t i;
  
+    ovs_mutex_lock(&dummy_list_mutex);
+    list_remove(&netdev->list_node);
+    ovs_mutex_unlock(&dummy_list_mutex);
+
      pstream_close(netdev->pstream);
      for (i = 0; i < netdev->n_streams; i++) {
          dummy_stream_close(&netdev->streams[i]);
      }
      free(netdev->streams);
+    ovs_mutex_destroy(&netdev->mutex);
  }
  
  static void
@@ -321,6 +342,7 @@ netdev_dummy_set_config(struct netdev *netdev_, const struct smap *args)
      struct netdev_dummy *netdev = netdev_dummy_cast(netdev_);
      const char *pstream;
  
+    ovs_mutex_lock(&netdev->mutex);
      netdev->ifindex = smap_get_int(args, "ifindex", -EOPNOTSUPP);
  
      pstream = smap_get(args, "pstream");
@@ -340,6 +362,8 @@ netdev_dummy_set_config(struct netdev *netdev_, const struct smap *args)
              }
          }
      }
+    ovs_mutex_unlock(&netdev->mutex);
+
      return 0;
  }
  
@@ -356,9 +380,11 @@ netdev_dummy_rx_construct(struct netdev_rx *rx_)
      struct netdev_rx_dummy *rx = netdev_rx_dummy_cast(rx_);
      struct netdev_dummy *netdev = netdev_dummy_cast(rx->up.netdev);
  
+    ovs_mutex_lock(&netdev->mutex);
      list_push_back(&netdev->rxes, &rx->node);
      list_init(&rx->recv_queue);
      rx->recv_queue_len = 0;
+    ovs_mutex_unlock(&netdev->mutex);
  
      return 0;
  }
@@ -367,9 +393,12 @@ static void
  netdev_dummy_rx_destruct(struct netdev_rx *rx_)
  {
      struct netdev_rx_dummy *rx = netdev_rx_dummy_cast(rx_);
+    struct netdev_dummy *netdev = netdev_dummy_cast(rx->up.netdev);
  
+    ovs_mutex_lock(&netdev->mutex);
      list_remove(&rx->node);
      ofpbuf_list_delete(&rx->recv_queue);
+    ovs_mutex_unlock(&netdev->mutex);
  }
  
  static void
@@ -384,15 +413,23 @@ static int
  netdev_dummy_rx_recv(struct netdev_rx *rx_, void *buffer, size_t size)
  {
      struct netdev_rx_dummy *rx = netdev_rx_dummy_cast(rx_);
+    struct netdev_dummy *netdev = netdev_dummy_cast(rx->up.netdev);
      struct ofpbuf *packet;
      int retval;
  
-    if (list_is_empty(&rx->recv_queue)) {
+    ovs_mutex_lock(&netdev->mutex);
+    if (!list_is_empty(&rx->recv_queue)) {
+        packet = ofpbuf_from_list(list_pop_front(&rx->recv_queue));
+        rx->recv_queue_len--;
+    } else {
+        packet = NULL;
+    }
+    ovs_mutex_unlock(&netdev->mutex);
+
+    if (!packet) {
          return -EAGAIN;
      }
  
-    packet = ofpbuf_from_list(list_pop_front(&rx->recv_queue));
-    rx->recv_queue_len--;
      if (packet->size <= size) {
          memcpy(buffer, packet->data, packet->size);
          retval = packet->size;
@@ -408,17 +445,26 @@ static void
  netdev_dummy_rx_wait(struct netdev_rx *rx_)
  {
      struct netdev_rx_dummy *rx = netdev_rx_dummy_cast(rx_);
+    struct netdev_dummy *netdev = netdev_dummy_cast(rx->up.netdev);
+
+    ovs_mutex_lock(&netdev->mutex);
      if (!list_is_empty(&rx->recv_queue)) {
          poll_immediate_wake();
      }
+    ovs_mutex_unlock(&netdev->mutex);
  }
  
  static int
  netdev_dummy_rx_drain(struct netdev_rx *rx_)
  {
      struct netdev_rx_dummy *rx = netdev_rx_dummy_cast(rx_);
+    struct netdev_dummy *netdev = netdev_dummy_cast(rx->up.netdev);
+
+    ovs_mutex_lock(&netdev->mutex);
      ofpbuf_list_delete(&rx->recv_queue);
      rx->recv_queue_len = 0;
+    ovs_mutex_unlock(&netdev->mutex);
+
      return 0;
  }
  
@@ -443,6 +489,7 @@ netdev_dummy_send(struct netdev *netdev, const void *buffer, size_t size)
          }
      }
  
+    ovs_mutex_lock(&dev->mutex);
      dev->stats.tx_packets++;
      dev->stats.tx_bytes += size;
  
@@ -457,6 +504,7 @@ netdev_dummy_send(struct netdev *netdev, const void *buffer, size_t size)
              list_push_back(&s->txq, &b->list_node);
          }
      }
+    ovs_mutex_unlock(&dev->mutex);
  
      return 0;
  }
@@ -467,10 +515,12 @@ netdev_dummy_set_etheraddr(struct netdev *netdev,
  {
      struct netdev_dummy *dev = netdev_dummy_cast(netdev);
  
+    ovs_mutex_lock(&dev->mutex);
      if (!eth_addr_equals(dev->hwaddr, mac)) {
          memcpy(dev->hwaddr, mac, ETH_ADDR_LEN);
          netdev_dummy_poll_notify(dev);
      }
+    ovs_mutex_unlock(&dev->mutex);
  
      return 0;
  }
@@ -479,18 +529,24 @@ static int
  netdev_dummy_get_etheraddr(const struct netdev *netdev,
                             uint8_t mac[ETH_ADDR_LEN])
  {
-    const struct netdev_dummy *dev = netdev_dummy_cast(netdev);
+    struct netdev_dummy *dev = netdev_dummy_cast(netdev);
  
+    ovs_mutex_lock(&dev->mutex);
      memcpy(mac, dev->hwaddr, ETH_ADDR_LEN);
+    ovs_mutex_unlock(&dev->mutex);
+
      return 0;
  }
  
  static int
  netdev_dummy_get_mtu(const struct netdev *netdev, int *mtup)
  {
-    const struct netdev_dummy *dev = netdev_dummy_cast(netdev);
+    struct netdev_dummy *dev = netdev_dummy_cast(netdev);
  
+    ovs_mutex_lock(&dev->mutex);
      *mtup = dev->mtu;
+    ovs_mutex_unlock(&dev->mutex);
+
      return 0;
  }
  
@@ -499,16 +555,22 @@ netdev_dummy_set_mtu(const struct netdev *netdev, int mtu)
  {
      struct netdev_dummy *dev = netdev_dummy_cast(netdev);
  
+    ovs_mutex_lock(&dev->mutex);
      dev->mtu = mtu;
+    ovs_mutex_unlock(&dev->mutex);
+
      return 0;
  }
  
  static int
  netdev_dummy_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
  {
-    const struct netdev_dummy *dev = netdev_dummy_cast(netdev);
+    struct netdev_dummy *dev = netdev_dummy_cast(netdev);
  
+    ovs_mutex_lock(&dev->mutex);
      *stats = dev->stats;
+    ovs_mutex_unlock(&dev->mutex);
+
      return 0;
  }
  
@@ -517,7 +579,10 @@ netdev_dummy_set_stats(struct netdev *netdev, const struct netdev_stats *stats)
  {
      struct netdev_dummy *dev = netdev_dummy_cast(netdev);
  
+    ovs_mutex_lock(&dev->mutex);
      dev->stats = *stats;
+    ovs_mutex_unlock(&dev->mutex);
+
      return 0;
  }
  
@@ -525,17 +590,21 @@ static int
  netdev_dummy_get_ifindex(const struct netdev *netdev)
  {
      struct netdev_dummy *dev = netdev_dummy_cast(netdev);
+    int ifindex;
  
-    return dev->ifindex;
+    ovs_mutex_lock(&dev->mutex);
+    ifindex = dev->ifindex;
+    ovs_mutex_unlock(&dev->mutex);
+
+    return ifindex;
  }
  
  static int
-netdev_dummy_update_flags(struct netdev *netdev_,
-                          enum netdev_flags off, enum netdev_flags on,
-                          enum netdev_flags *old_flagsp)
+netdev_dummy_update_flags__(struct netdev_dummy *netdev,
+                            enum netdev_flags off, enum netdev_flags on,
+                            enum netdev_flags *old_flagsp)
+    OVS_REQUIRES(netdev->mutex)
  {
-    struct netdev_dummy *netdev = netdev_dummy_cast(netdev_);
-
      if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
          return EINVAL;
      }
@@ -546,13 +615,36 @@ netdev_dummy_update_flags(struct netdev *netdev_,
      if (*old_flagsp != netdev->flags) {
          netdev_dummy_poll_notify(netdev);
      }
+
      return 0;
  }
  
+static int
+netdev_dummy_update_flags(struct netdev *netdev_,
+                          enum netdev_flags off, enum netdev_flags on,
+                          enum netdev_flags *old_flagsp)
+{
+    struct netdev_dummy *netdev = netdev_dummy_cast(netdev_);
+    int error;
+
+    ovs_mutex_lock(&netdev->mutex);
+    error = netdev_dummy_update_flags__(netdev, off, on, old_flagsp);
+    ovs_mutex_unlock(&netdev->mutex);
+
+    return error;
+}
+
  static unsigned int
-netdev_dummy_change_seq(const struct netdev *netdev)
+netdev_dummy_change_seq(const struct netdev *netdev_)
  {
-    return netdev_dummy_cast(netdev)->change_seq;
+    struct netdev_dummy *netdev = netdev_dummy_cast(netdev_);
+    unsigned int change_seq;
+
+    ovs_mutex_lock(&netdev->mutex);
+    change_seq = netdev->change_seq;
+    ovs_mutex_unlock(&netdev->mutex);
+
+    return change_seq;
  }
  \f
  /* Helper functions. */
@@ -722,10 +814,11 @@ netdev_dummy_receive(struct unixctl_conn *conn,
              goto exit;
          }
  
+        ovs_mutex_lock(&dummy_dev->mutex);
          dummy_dev->stats.rx_packets++;
          dummy_dev->stats.rx_bytes += packet->size;
-
          netdev_dummy_queue_packet(dummy_dev, packet);
+        ovs_mutex_unlock(&dummy_dev->mutex);
      }
  
      unixctl_command_reply(conn, NULL);
@@ -736,13 +829,14 @@ exit:
  
  static void
  netdev_dummy_set_admin_state__(struct netdev_dummy *dev, bool admin_state)
+    OVS_REQUIRES(dev->mutex)
  {
      enum netdev_flags old_flags;
  
      if (admin_state) {
-        netdev_dummy_update_flags(&dev->up, 0, NETDEV_UP, &old_flags);
+        netdev_dummy_update_flags__(dev, 0, NETDEV_UP, &old_flags);
      } else {
-        netdev_dummy_update_flags(&dev->up, NETDEV_UP, 0, &old_flags);
+        netdev_dummy_update_flags__(dev, NETDEV_UP, 0, &old_flags);
      }
  }
  
@@ -766,7 +860,10 @@ netdev_dummy_set_admin_state(struct unixctl_conn *conn, int argc,
          if (netdev && is_dummy_class(netdev->netdev_class)) {
              struct netdev_dummy *dummy_dev = netdev_dummy_cast(netdev);
  
+            ovs_mutex_lock(&dummy_dev->mutex);
              netdev_dummy_set_admin_state__(dummy_dev, up);
+            ovs_mutex_unlock(&dummy_dev->mutex);
+
              netdev_close(netdev);
          } else {
              unixctl_command_reply_error(conn, "Unknown Dummy Interface");
@@ -774,17 +871,15 @@ netdev_dummy_set_admin_state(struct unixctl_conn *conn, int argc,
              return;
          }
      } else {
-        struct shash dummy_netdevs;
-        struct shash_node *node;
-
-        shash_init(&dummy_netdevs);
-        netdev_get_devices(&dummy_class, &dummy_netdevs);
-        SHASH_FOR_EACH (node, &dummy_netdevs) {
-            struct netdev *netdev = node->data;
-            netdev_dummy_set_admin_state__(netdev_dummy_cast(netdev), up);
-            netdev_close(netdev);
+        struct netdev_dummy *netdev;
+
+        ovs_mutex_lock(&dummy_list_mutex);
+        LIST_FOR_EACH (netdev, list_node, &dummy_list) {
+            ovs_mutex_lock(&netdev->mutex);
+            netdev_dummy_set_admin_state__(netdev, up);
+            ovs_mutex_unlock(&netdev->mutex);
          }
-        shash_destroy(&dummy_netdevs);
+        ovs_mutex_unlock(&dummy_list_mutex);
      }
      unixctl_command_reply(conn, "OK");
  }
@@ -807,11 +902,17 @@ netdev_dummy_register(bool override)
          SSET_FOR_EACH (type, &types) {
              if (!netdev_unregister_provider(type)) {
                  struct netdev_class *class;
+                int error;
  
-                class = xmalloc(sizeof *class);
-                *class = dummy_class;
+                class = xmemdup(&dummy_class, sizeof dummy_class);
                  class->type = xstrdup(type);
-                netdev_register_provider(class);
+                error = netdev_register_provider(class);
+                if (error) {
+                    VLOG_ERR("%s: failed to register netdev provider (%s)",
+                             type, ovs_strerror(error));
+                    free(CONST_CAST(char *, class->type));
+                    free(class);
+                }
              }
          }
          sset_destroy(&types);
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c

index cf45905..9a80b67 100644 (file)
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -106,9 +106,6 @@ COVERAGE_DEFINE(netdev_set_ethtool);
  #define TC_RTAB_SIZE 1024
  #endif
  
-static struct nln_notifier *netdev_linux_cache_notifier = NULL;
-static int cache_notifier_refcount;
-
  enum {
      VALID_IFINDEX           = 1 << 0,
      VALID_ETHERADDR         = 1 << 1,
@@ -355,6 +352,9 @@ static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
  struct netdev_linux {
      struct netdev up;
  
+    /* Protects all members below. */
+    struct ovs_mutex mutex;
+
      unsigned int cache_valid;
      unsigned int change_seq;
  
@@ -410,6 +410,9 @@ static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
                                   int cmd, const char *cmd_name);
  static int get_flags(const struct netdev *, unsigned int *flags);
  static int set_flags(const char *, unsigned int flags);
+static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
+                        enum netdev_flags on, enum netdev_flags *old_flagsp)
+    OVS_REQUIRES(netdev->mutex);
  static int do_get_ifindex(const char *netdev_name);
  static int get_ifindex(const struct netdev *, int *ifindexp);
  static int do_set_addr(struct netdev *netdev,
@@ -450,23 +453,117 @@ netdev_rx_linux_cast(const struct netdev_rx *rx)
      return CONTAINER_OF(rx, struct netdev_rx_linux, up);
  }
  \f
+static void netdev_linux_update(struct netdev_linux *netdev,
+                                const struct rtnetlink_link_change *)
+    OVS_REQUIRES(netdev->mutex);
+static void netdev_linux_changed(struct netdev_linux *netdev,
+                                 unsigned int ifi_flags, unsigned int mask)
+    OVS_REQUIRES(netdev->mutex);
+
+/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
+ * if no such socket could be created. */
+static struct nl_sock *
+netdev_linux_notify_sock(void)
+{
+    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
+    static struct nl_sock *sock;
+
+    if (ovsthread_once_start(&once)) {
+        int error;
+
+        error = nl_sock_create(NETLINK_ROUTE, &sock);
+        if (!error) {
+            error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
+            if (error) {
+                nl_sock_destroy(sock);
+                sock = NULL;
+            }
+        }
+        ovsthread_once_done(&once);
+    }
+
+    return sock;
+}
+
  static void
  netdev_linux_run(void)
  {
-    rtnetlink_link_run();
+    struct nl_sock *sock;
+    int error;
+
      netdev_linux_miimon_run();
+
+    sock = netdev_linux_notify_sock();
+    if (!sock) {
+        return;
+    }
+
+    do {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+        uint64_t buf_stub[4096 / 8];
+        struct ofpbuf buf;
+
+        ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
+        error = nl_sock_recv(sock, &buf, false);
+        if (!error) {
+            struct rtnetlink_link_change change;
+
+            if (rtnetlink_link_parse(&buf, &change)) {
+                struct netdev *netdev_ = netdev_from_name(change.ifname);
+                if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
+                    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+
+                    ovs_mutex_lock(&netdev->mutex);
+                    netdev_linux_update(netdev, &change);
+                    ovs_mutex_unlock(&netdev->mutex);
+                }
+                netdev_close(netdev_);
+            }
+        } else if (error == ENOBUFS) {
+            struct shash device_shash;
+            struct shash_node *node;
+
+            nl_sock_drain(sock);
+
+            shash_init(&device_shash);
+            netdev_get_devices(&netdev_linux_class, &device_shash);
+            SHASH_FOR_EACH (node, &device_shash) {
+                struct netdev *netdev_ = node->data;
+                struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+                unsigned int flags;
+
+                ovs_mutex_lock(&netdev->mutex);
+                get_flags(netdev_, &flags);
+                netdev_linux_changed(netdev, flags, 0);
+                ovs_mutex_unlock(&netdev->mutex);
+
+                netdev_close(netdev_);
+            }
+            shash_destroy(&device_shash);
+        } else if (error != EAGAIN) {
+            VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
+                         ovs_strerror(error));
+        }
+        ofpbuf_uninit(&buf);
+    } while (!error);
  }
  
  static void
  netdev_linux_wait(void)
  {
-    rtnetlink_link_wait();
+    struct nl_sock *sock;
+
      netdev_linux_miimon_wait();
+    sock = netdev_linux_notify_sock();
+    if (sock) {
+        nl_sock_wait(sock, POLLIN);
+    }
  }
  
  static void
  netdev_linux_changed(struct netdev_linux *dev,
                       unsigned int ifi_flags, unsigned int mask)
+    OVS_REQUIRES(dev->mutex)
  {
      dev->change_seq++;
      if (!dev->change_seq) {
@@ -484,6 +581,7 @@ netdev_linux_changed(struct netdev_linux *dev,
  static void
  netdev_linux_update(struct netdev_linux *dev,
                      const struct rtnetlink_link_change *change)
+    OVS_REQUIRES(dev->mutex)
  {
      if (change->nlmsg_type == RTM_NEWLINK) {
          /* Keep drv-info */
@@ -511,64 +609,6 @@ netdev_linux_update(struct netdev_linux *dev,
      }
  }
  
-static void
-netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
-                      void *aux OVS_UNUSED)
-{
-    if (change) {
-        struct netdev *base_dev = netdev_from_name(change->ifname);
-        if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
-            netdev_linux_update(netdev_linux_cast(base_dev), change);
-            netdev_close(base_dev);
-        }
-    } else {
-        struct shash device_shash;
-        struct shash_node *node;
-
-        shash_init(&device_shash);
-        netdev_get_devices(&netdev_linux_class, &device_shash);
-        SHASH_FOR_EACH (node, &device_shash) {
-            struct netdev *netdev = node->data;
-            struct netdev_linux *dev = netdev_linux_cast(netdev);
-            unsigned int flags;
-
-            get_flags(&dev->up, &flags);
-            netdev_linux_changed(dev, flags, 0);
-            netdev_close(netdev);
-        }
-        shash_destroy(&device_shash);
-    }
-}
-
-static int
-cache_notifier_ref(void)
-{
-    if (!cache_notifier_refcount) {
-        ovs_assert(!netdev_linux_cache_notifier);
-
-        netdev_linux_cache_notifier =
-            rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
-
-        if (!netdev_linux_cache_notifier) {
-            return EINVAL;
-        }
-    }
-    cache_notifier_refcount++;
-
-    return 0;
-}
-
-static void
-cache_notifier_unref(void)
-{
-    ovs_assert(cache_notifier_refcount > 0);
-    if (!--cache_notifier_refcount) {
-        ovs_assert(netdev_linux_cache_notifier);
-        rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
-        netdev_linux_cache_notifier = NULL;
-    }
-}
-
  static struct netdev *
  netdev_linux_alloc(void)
  {
@@ -576,12 +616,11 @@ netdev_linux_alloc(void)
      return &netdev->up;
  }
  
-static int
+static void
  netdev_linux_common_construct(struct netdev_linux *netdev)
  {
+    ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL);
      netdev->change_seq = 1;
-
-    return cache_notifier_ref();
  }
  
  /* Creates system and internal devices. */
@@ -591,16 +630,12 @@ netdev_linux_construct(struct netdev *netdev_)
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
      int error;
  
-    error = netdev_linux_common_construct(netdev);
-    if (error) {
-        return error;
-    }
+    netdev_linux_common_construct(netdev);
  
      error = get_flags(&netdev->up, &netdev->ifi_flags);
      if (error == ENODEV) {
          if (netdev->up.netdev_class != &netdev_internal_class) {
              /* The device does not exist, so don't allow it to be opened. */
-            cache_notifier_unref();
              return ENODEV;
          } else {
              /* "Internal" netdevs have to be created as netdev objects before
@@ -628,17 +663,14 @@ netdev_linux_construct_tap(struct netdev *netdev_)
      struct ifreq ifr;
      int error;
  
-    error = netdev_linux_common_construct(netdev);
-    if (error) {
-        goto error;
-    }
+    netdev_linux_common_construct(netdev);
  
      /* Open tap device. */
      netdev->tap_fd = open(tap_dev, O_RDWR);
      if (netdev->tap_fd < 0) {
          error = errno;
          VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
-        goto error_unref_notifier;
+        return error;
      }
  
      /* Create tap device. */
@@ -661,9 +693,6 @@ netdev_linux_construct_tap(struct netdev *netdev_)
  
  error_close:
      close(netdev->tap_fd);
-error_unref_notifier:
-    cache_notifier_unref();
-error:
      return error;
  }
  
@@ -682,7 +711,7 @@ netdev_linux_destruct(struct netdev *netdev_)
          close(netdev->tap_fd);
      }
  
-    cache_notifier_unref();
+    ovs_mutex_destroy(&netdev->mutex);
  }
  
  static void
@@ -707,6 +736,7 @@ netdev_linux_rx_construct(struct netdev_rx *rx_)
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
      int error;
  
+    ovs_mutex_lock(&netdev->mutex);
      rx->is_tap = is_tap_netdev(netdev_);
      if (rx->is_tap) {
          rx->fd = netdev->tap_fd;
@@ -766,6 +796,7 @@ netdev_linux_rx_construct(struct netdev_rx *rx_)
              goto error;
          }
      }
+    ovs_mutex_unlock(&netdev->mutex);
  
      return 0;
  
@@ -773,6 +804,7 @@ error:
      if (rx->fd >= 0) {
          close(rx->fd);
      }
+    ovs_mutex_unlock(&netdev->mutex);
      return error;
  }
  
@@ -863,7 +895,6 @@ netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
              struct msghdr msg;
              struct iovec iov;
              int ifindex;
-            int error;
              int sock;
  
              sock = af_packet_sock();
@@ -871,9 +902,9 @@ netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
                  return -sock;
              }
  
-            error = get_ifindex(netdev_, &ifindex);
-            if (error) {
-                return error;
+            ifindex = netdev_get_ifindex(netdev_);
+            if (ifindex < 0) {
+                return -ifindex;
              }
  
              /* We don't bother setting most fields in sockaddr_ll because the
@@ -951,22 +982,22 @@ netdev_linux_set_etheraddr(struct netdev *netdev_,
                             const uint8_t mac[ETH_ADDR_LEN])
  {
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
-    struct netdev_saved_flags *sf = NULL;
+    enum netdev_flags old_flags = 0;
      int error;
  
+    ovs_mutex_lock(&netdev->mutex);
+
      if (netdev->cache_valid & VALID_ETHERADDR) {
-        if (netdev->ether_addr_error) {
-            return netdev->ether_addr_error;
-        }
-        if (eth_addr_equals(netdev->etheraddr, mac)) {
-            return 0;
+        error = netdev->ether_addr_error;
+        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
+            goto exit;
          }
          netdev->cache_valid &= ~VALID_ETHERADDR;
      }
  
      /* Tap devices must be brought down before setting the address. */
      if (is_tap_netdev(netdev_)) {
-        netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
+        update_flags(netdev, NETDEV_UP, 0, &old_flags);
      }
      error = set_etheraddr(netdev_get_name(netdev_), mac);
      if (!error || error == ENODEV) {
@@ -977,8 +1008,12 @@ netdev_linux_set_etheraddr(struct netdev *netdev_,
          }
      }
  
-    netdev_restore_flags(sf);
+    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
+        update_flags(netdev, 0, NETDEV_UP, &old_flags);
+    }
  
+exit:
+    ovs_mutex_unlock(&netdev->mutex);
      return error;
  }
  
@@ -988,20 +1023,22 @@ netdev_linux_get_etheraddr(const struct netdev *netdev_,
                             uint8_t mac[ETH_ADDR_LEN])
  {
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+    int error;
  
+    ovs_mutex_lock(&netdev->mutex);
      if (!(netdev->cache_valid & VALID_ETHERADDR)) {
-        int error = get_etheraddr(netdev_get_name(netdev_),
-                                  netdev->etheraddr);
-
-        netdev->ether_addr_error = error;
+        netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
+                                                 netdev->etheraddr);
          netdev->cache_valid |= VALID_ETHERADDR;
      }
  
-    if (!netdev->ether_addr_error) {
+    error = netdev->ether_addr_error;
+    if (!error) {
          memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
      }
+    ovs_mutex_unlock(&netdev->mutex);
  
-    return netdev->ether_addr_error;
+    return error;
  }
  
  /* Returns the maximum size of transmitted (and received) packets on 'netdev',
@@ -1011,22 +1048,25 @@ static int
  netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
  {
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+    int error;
+
+    ovs_mutex_lock(&netdev->mutex);
      if (!(netdev->cache_valid & VALID_MTU)) {
          struct ifreq ifr;
-        int error;
-
-        error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
-                                    SIOCGIFMTU, "SIOCGIFMTU");
  
-        netdev->netdev_mtu_error = error;
+        netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
+            netdev_get_name(netdev_), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
          netdev->mtu = ifr.ifr_mtu;
          netdev->cache_valid |= VALID_MTU;
      }
  
-    if (!netdev->netdev_mtu_error) {
+    error = netdev->netdev_mtu_error;
+    if (!error) {
          *mtup = netdev->mtu;
      }
-    return netdev->netdev_mtu_error;
+    ovs_mutex_unlock(&netdev->mutex);
+
+    return error;
  }
  
  /* Sets the maximum size of transmitted (MTU) for given device using linux
@@ -1039,12 +1079,11 @@ netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
      struct ifreq ifr;
      int error;
  
+    ovs_mutex_lock(&netdev->mutex);
      if (netdev->cache_valid & VALID_MTU) {
-        if (netdev->netdev_mtu_error) {
-            return netdev->netdev_mtu_error;
-        }
-        if (netdev->mtu == mtu) {
-            return 0;
+        error = netdev->netdev_mtu_error;
+        if (error || netdev->mtu == mtu) {
+            goto exit;
          }
          netdev->cache_valid &= ~VALID_MTU;
      }
@@ -1056,17 +1095,23 @@ netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
          netdev->mtu = ifr.ifr_mtu;
          netdev->cache_valid |= VALID_MTU;
      }
+exit:
+    ovs_mutex_unlock(&netdev->mutex);
      return error;
  }
  
  /* Returns the ifindex of 'netdev', if successful, as a positive number.
   * On failure, returns a negative errno value. */
  static int
-netdev_linux_get_ifindex(const struct netdev *netdev)
+netdev_linux_get_ifindex(const struct netdev *netdev_)
  {
+    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
      int ifindex, error;
  
-    error = get_ifindex(netdev, &ifindex);
+    ovs_mutex_lock(&netdev->mutex);
+    error = get_ifindex(netdev_, &ifindex);
+    ovs_mutex_unlock(&netdev->mutex);
+
      return error ? -error : ifindex;
  }
  
@@ -1075,19 +1120,28 @@ netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
  {
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
  
+    ovs_mutex_lock(&netdev->mutex);
      if (netdev->miimon_interval > 0) {
          *carrier = netdev->miimon;
      } else {
          *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
      }
+    ovs_mutex_unlock(&netdev->mutex);
  
      return 0;
  }
  
  static long long int
-netdev_linux_get_carrier_resets(const struct netdev *netdev)
+netdev_linux_get_carrier_resets(const struct netdev *netdev_)
  {
-    return netdev_linux_cast(netdev)->carrier_resets;
+    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+    long long int carrier_resets;
+
+    ovs_mutex_lock(&netdev->mutex);
+    carrier_resets = netdev->carrier_resets;
+    ovs_mutex_unlock(&netdev->mutex);
+
+    return carrier_resets;
  }
  
  static int
@@ -1155,11 +1209,13 @@ netdev_linux_set_miimon_interval(struct netdev *netdev_,
  {
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
  
+    ovs_mutex_lock(&netdev->mutex);
      interval = interval > 0 ? MAX(interval, 100) : 0;
      if (netdev->miimon_interval != interval) {
          netdev->miimon_interval = interval;
          timer_set_expired(&netdev->miimon_timer);
      }
+    ovs_mutex_unlock(&netdev->mutex);
  
      return 0;
  }
@@ -1177,18 +1233,17 @@ netdev_linux_miimon_run(void)
          struct netdev_linux *dev = netdev_linux_cast(netdev);
          bool miimon;
  
-        if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
-            netdev_close(netdev);
-            continue;
-        }
+        ovs_mutex_lock(&dev->mutex);
+        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
+            netdev_linux_get_miimon(dev->up.name, &miimon);
+            if (miimon != dev->miimon) {
+                dev->miimon = miimon;
+                netdev_linux_changed(dev, dev->ifi_flags, 0);
+            }
  
-        netdev_linux_get_miimon(dev->up.name, &miimon);
-        if (miimon != dev->miimon) {
-            dev->miimon = miimon;
-            netdev_linux_changed(dev, dev->ifi_flags, 0);
+            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
          }
-
-        timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
+        ovs_mutex_unlock(&dev->mutex);
          netdev_close(netdev);
      }
  
@@ -1207,9 +1262,11 @@ netdev_linux_miimon_wait(void)
          struct netdev *netdev = node->data;
          struct netdev_linux *dev = netdev_linux_cast(netdev);
  
+        ovs_mutex_lock(&dev->mutex);
          if (dev->miimon_interval > 0) {
              timer_wait(&dev->miimon_timer);
          }
+        ovs_mutex_unlock(&dev->mutex);
          netdev_close(netdev);
      }
      shash_destroy(&device_shash);
@@ -1326,7 +1383,7 @@ get_stats_via_vport(const struct netdev *netdev_,
  
  static int
  netdev_linux_sys_get_stats(const struct netdev *netdev_,
-                         struct netdev_stats *stats)
+                           struct netdev_stats *stats)
  {
      static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
      static int use_netlink_stats;
@@ -1365,19 +1422,14 @@ netdev_linux_get_stats(const struct netdev *netdev_,
      struct netdev_stats dev_stats;
      int error;
  
+    ovs_mutex_lock(&netdev->mutex);
      get_stats_via_vport(netdev_, stats);
-
      error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
-
      if (error) {
-        if (netdev->vport_stats_error) {
-            return error;
-        } else {
-            return 0;
+        if (!netdev->vport_stats_error) {
+            error = 0;
          }
-    }
-
-    if (netdev->vport_stats_error) {
+    } else if (netdev->vport_stats_error) {
          /* stats not available from OVS then use ioctl stats. */
          *stats = dev_stats;
      } else {
@@ -1399,7 +1451,9 @@ netdev_linux_get_stats(const struct netdev *netdev_,
          stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
          stats->tx_window_errors    += dev_stats.tx_window_errors;
      }
-    return 0;
+    ovs_mutex_unlock(&netdev->mutex);
+
+    return error;
  }
  
  /* Retrieves current device stats for 'netdev-tap' netdev or
@@ -1411,24 +1465,20 @@ netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
      struct netdev_stats dev_stats;
      int error;
  
+    ovs_mutex_lock(&netdev->mutex);
      get_stats_via_vport(netdev_, stats);
-
      error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
      if (error) {
-        if (netdev->vport_stats_error) {
-            return error;
-        } else {
-            return 0;
+        if (!netdev->vport_stats_error) {
+            error = 0;
          }
-    }
+    } else if (netdev->vport_stats_error) {
+        /* Transmit and receive stats will appear to be swapped relative to the
+         * other ports since we are the one sending the data, not a remote
+         * computer.  For consistency, we swap them back here. This does not
+         * apply if we are getting stats from the vport layer because it always
+         * tracks stats from the perspective of the switch. */
  
-    /* If this port is an internal port then the transmit and receive stats
-     * will appear to be swapped relative to the other ports since we are the
-     * one sending the data, not a remote computer.  For consistency, we swap
-     * them back here. This does not apply if we are getting stats from the
-     * vport layer because it always tracks stats from the perspective of the
-     * switch. */
-    if (netdev->vport_stats_error) {
          *stats = dev_stats;
          swap_uint64(&stats->rx_packets, &stats->tx_packets);
          swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
@@ -1455,7 +1505,9 @@ netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
          stats->multicast           += dev_stats.multicast;
          stats->collisions          += dev_stats.collisions;
      }
-    return 0;
+    ovs_mutex_unlock(&netdev->mutex);
+
+    return error;
  }
  
  static int
@@ -1463,9 +1515,14 @@ netdev_internal_get_stats(const struct netdev *netdev_,
                            struct netdev_stats *stats)
  {
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+    int error;
  
+    ovs_mutex_lock(&netdev->mutex);
      get_stats_via_vport(netdev_, stats);
-    return netdev->vport_stats_error;
+    error = netdev->vport_stats_error;
+    ovs_mutex_unlock(&netdev->mutex);
+
+    return error;
  }
  
  static int
@@ -1505,6 +1562,7 @@ netdev_internal_set_stats(struct netdev *netdev,
  
  static void
  netdev_linux_read_features(struct netdev_linux *netdev)
+    OVS_REQUIRES(netdev->mutex)
  {
      struct ethtool_cmd ecmd;
      uint32_t speed;
@@ -1646,32 +1704,39 @@ netdev_linux_get_features(const struct netdev *netdev_,
                            enum netdev_features *peer)
  {
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+    int error;
  
+    ovs_mutex_lock(&netdev->mutex);
      netdev_linux_read_features(netdev);
-
      if (!netdev->get_features_error) {
          *current = netdev->current;
          *advertised = netdev->advertised;
          *supported = netdev->supported;
          *peer = 0;              /* XXX */
      }
-    return netdev->get_features_error;
+    error = netdev->get_features_error;
+    ovs_mutex_unlock(&netdev->mutex);
+
+    return error;
  }
  
  /* Set the features advertised by 'netdev' to 'advertise'. */
  static int
-netdev_linux_set_advertisements(struct netdev *netdev,
+netdev_linux_set_advertisements(struct netdev *netdev_,
                                  enum netdev_features advertise)
  {
+    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
      struct ethtool_cmd ecmd;
      int error;
  
+    ovs_mutex_lock(&netdev->mutex);
+
      COVERAGE_INC(netdev_get_ethtool);
      memset(&ecmd, 0, sizeof ecmd);
-    error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
+    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                      ETHTOOL_GSET, "ETHTOOL_GSET");
      if (error) {
-        return error;
+        goto exit;
      }
  
      ecmd.advertising = 0;
@@ -1712,8 +1777,12 @@ netdev_linux_set_advertisements(struct netdev *netdev,
          ecmd.advertising |= ADVERTISED_Asym_Pause;
      }
      COVERAGE_INC(netdev_set_ethtool);
-    return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
-                                   ETHTOOL_SSET, "ETHTOOL_SSET");
+    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
+                                    ETHTOOL_SSET, "ETHTOOL_SSET");
+
+exit:
+    ovs_mutex_unlock(&netdev->mutex);
+    return error;
  }
  
  /* Attempts to set input rate limiting (policing) policy.  Returns 0 if
@@ -1726,20 +1795,17 @@ netdev_linux_set_policing(struct netdev *netdev_,
      const char *netdev_name = netdev_get_name(netdev_);
      int error;
  
-
      kbits_burst = (!kbits_rate ? 0       /* Force to 0 if no rate specified. */
                     : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
                     : kbits_burst);       /* Stick with user-specified value. */
  
+    ovs_mutex_lock(&netdev->mutex);
      if (netdev->cache_valid & VALID_POLICING) {
-        if (netdev->netdev_policing_error) {
-            return netdev->netdev_policing_error;
-        }
-
-        if (netdev->kbits_rate == kbits_rate &&
-            netdev->kbits_burst == kbits_burst) {
+        error = netdev->netdev_policing_error;
+        if (error || (netdev->kbits_rate == kbits_rate &&
+                      netdev->kbits_burst == kbits_burst)) {
              /* Assume that settings haven't changed since we last set them. */
-            return 0;
+            goto out;
          }
          netdev->cache_valid &= ~VALID_POLICING;
      }
@@ -1777,6 +1843,7 @@ out:
          netdev->netdev_policing_error = error;
          netdev->cache_valid |= VALID_POLICING;
      }
+    ovs_mutex_unlock(&netdev->mutex);
      return error;
  }
  
@@ -1864,15 +1931,17 @@ netdev_linux_get_qos(const struct netdev *netdev_,
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
      int error;
  
+    ovs_mutex_lock(&netdev->mutex);
      error = tc_query_qdisc(netdev_);
-    if (error) {
-        return error;
+    if (!error) {
+        *typep = netdev->tc->ops->ovs_name;
+        error = (netdev->tc->ops->qdisc_get
+                 ? netdev->tc->ops->qdisc_get(netdev_, details)
+                 : 0);
      }
+    ovs_mutex_unlock(&netdev->mutex);
  
-    *typep = netdev->tc->ops->ovs_name;
-    return (netdev->tc->ops->qdisc_get
-            ? netdev->tc->ops->qdisc_get(netdev_, details)
-            : 0);
+    return error;
  }
  
  static int
@@ -1888,27 +1957,30 @@ netdev_linux_set_qos(struct netdev *netdev_,
          return EOPNOTSUPP;
      }
  
+    ovs_mutex_lock(&netdev->mutex);
      error = tc_query_qdisc(netdev_);
      if (error) {
-        return error;
+        goto exit;
      }
  
      if (new_ops == netdev->tc->ops) {
-        return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
+        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
      } else {
          /* Delete existing qdisc. */
          error = tc_del_qdisc(netdev_);
          if (error) {
-            return error;
+            goto exit;
          }
          ovs_assert(netdev->tc == NULL);
  
          /* Install new qdisc. */
          error = new_ops->tc_install(netdev_, details);
          ovs_assert((error == 0) == (netdev->tc != NULL));
-
-        return error;
      }
+
+exit:
+    ovs_mutex_unlock(&netdev->mutex);
+    return error;
  }
  
  static int
@@ -1918,15 +1990,17 @@ netdev_linux_get_queue(const struct netdev *netdev_,
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
      int error;
  
+    ovs_mutex_lock(&netdev->mutex);
      error = tc_query_qdisc(netdev_);
-    if (error) {
-        return error;
-    } else {
+    if (!error) {
          struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
-        return (queue
+        error = (queue
                  ? netdev->tc->ops->class_get(netdev_, queue, details)
                  : ENOENT);
      }
+    ovs_mutex_unlock(&netdev->mutex);
+
+    return error;
  }
  
  static int
@@ -1936,15 +2010,17 @@ netdev_linux_set_queue(struct netdev *netdev_,
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
      int error;
  
+    ovs_mutex_lock(&netdev->mutex);
      error = tc_query_qdisc(netdev_);
-    if (error) {
-        return error;
-    } else if (queue_id >= netdev->tc->ops->n_queues
-               || !netdev->tc->ops->class_set) {
-        return EINVAL;
+    if (!error) {
+        error = (queue_id < netdev->tc->ops->n_queues
+                 && netdev->tc->ops->class_set
+                 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
+                 : EINVAL);
      }
+    ovs_mutex_unlock(&netdev->mutex);
  
-    return netdev->tc->ops->class_set(netdev_, queue_id, details);
+    return error;
  }
  
  static int
@@ -1953,17 +2029,21 @@ netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
      int error;
  
+    ovs_mutex_lock(&netdev->mutex);
      error = tc_query_qdisc(netdev_);
-    if (error) {
-        return error;
-    } else if (!netdev->tc->ops->class_delete) {
-        return EINVAL;
-    } else {
-        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
-        return (queue
-                ? netdev->tc->ops->class_delete(netdev_, queue)
-                : ENOENT);
+    if (!error) {
+        if (netdev->tc->ops->class_delete) {
+            struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
+            error = (queue
+                     ? netdev->tc->ops->class_delete(netdev_, queue)
+                     : ENOENT);
+        } else {
+            error = EINVAL;
+        }
      }
+    ovs_mutex_unlock(&netdev->mutex);
+
+    return error;
  }
  
  static int
@@ -1974,19 +2054,25 @@ netdev_linux_get_queue_stats(const struct netdev *netdev_,
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
      int error;
  
+    ovs_mutex_lock(&netdev->mutex);
      error = tc_query_qdisc(netdev_);
-    if (error) {
-        return error;
-    } else if (!netdev->tc->ops->class_get_stats) {
-        return EOPNOTSUPP;
-    } else {
-        const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
-        if (!queue) {
-            return ENOENT;
+    if (!error) {
+        if (netdev->tc->ops->class_get_stats) {
+            const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
+            if (queue) {
+                stats->created = queue->created;
+                error = netdev->tc->ops->class_get_stats(netdev_, queue,
+                                                         stats);
+            } else {
+                error = ENOENT;
+            }
+        } else {
+            error = EOPNOTSUPP;
          }
-        stats->created = queue->created;
-        return netdev->tc->ops->class_get_stats(netdev_, queue, stats);
      }
+    ovs_mutex_unlock(&netdev->mutex);
+
+    return error;
  }
  
  static bool
@@ -2010,34 +2096,37 @@ netdev_linux_dump_queues(const struct netdev *netdev_,
                           netdev_dump_queues_cb *cb, void *aux)
  {
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
-    struct tc_queue *queue, *next_queue;
-    struct smap details;
-    int last_error;
      int error;
  
+    ovs_mutex_lock(&netdev->mutex);
      error = tc_query_qdisc(netdev_);
-    if (error) {
-        return error;
-    } else if (!netdev->tc->ops->class_get) {
-        return EOPNOTSUPP;
-    }
+    if (!error) {
+        if (netdev->tc->ops->class_get) {
+            struct tc_queue *queue, *next_queue;
+            struct smap details;
  
-    last_error = 0;
-    smap_init(&details);
-    HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
-                        &netdev->tc->queues) {
-        smap_clear(&details);
+            smap_init(&details);
+            HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
+                                &netdev->tc->queues) {
+                int retval;
  
-        error = netdev->tc->ops->class_get(netdev_, queue, &details);
-        if (!error) {
-            (*cb)(queue->queue_id, &details, aux);
+                smap_clear(&details);
+
+                retval = netdev->tc->ops->class_get(netdev_, queue, &details);
+                if (!retval) {
+                    (*cb)(queue->queue_id, &details, aux);
+                } else {
+                    error = retval;
+                }
+            }
+            smap_destroy(&details);
          } else {
-            last_error = error;
+            error = EOPNOTSUPP;
          }
      }
-    smap_destroy(&details);
+    ovs_mutex_unlock(&netdev->mutex);
  
-    return last_error;
+    return error;
  }
  
  static int
@@ -2045,31 +2134,38 @@ netdev_linux_dump_queue_stats(const struct netdev *netdev_,
                                netdev_dump_queue_stats_cb *cb, void *aux)
  {
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
-    struct nl_dump dump;
-    struct ofpbuf msg;
-    int last_error;
      int error;
  
+    ovs_mutex_lock(&netdev->mutex);
      error = tc_query_qdisc(netdev_);
-    if (error) {
-        return error;
-    } else if (!netdev->tc->ops->class_dump_stats) {
-        return EOPNOTSUPP;
-    }
+    if (!error) {
+        struct nl_dump dump;
  
-    last_error = 0;
-    if (!start_queue_dump(netdev_, &dump)) {
-        return ENODEV;
-    }
-    while (nl_dump_next(&dump, &msg)) {
-        error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
-        if (error) {
-            last_error = error;
+        if (!netdev->tc->ops->class_dump_stats) {
+            error = EOPNOTSUPP;
+        } else if (!start_queue_dump(netdev_, &dump)) {
+            error = ENODEV;
+        } else {
+            struct ofpbuf msg;
+            int retval;
+
+            while (nl_dump_next(&dump, &msg)) {
+                retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
+                                                           cb, aux);
+                if (retval) {
+                    error = retval;
+                }
+            }
+
+            retval = nl_dump_done(&dump);
+            if (retval) {
+                error = retval;
+            }
          }
      }
+    ovs_mutex_unlock(&netdev->mutex);
  
-    error = nl_dump_done(&dump);
-    return error ? error : last_error;
+    return error;
  }
  
  static int
@@ -2077,27 +2173,34 @@ netdev_linux_get_in4(const struct netdev *netdev_,
                       struct in_addr *address, struct in_addr *netmask)
  {
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+    int error;
  
+    ovs_mutex_lock(&netdev->mutex);
      if (!(netdev->cache_valid & VALID_IN4)) {
-        int error;
-
          error = netdev_linux_get_ipv4(netdev_, &netdev->address,
                                        SIOCGIFADDR, "SIOCGIFADDR");
-        if (error) {
-            return error;
+        if (!error) {
+            error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
+                                          SIOCGIFNETMASK, "SIOCGIFNETMASK");
+            if (!error) {
+                netdev->cache_valid |= VALID_IN4;
+            }
          }
+    } else {
+        error = 0;
+    }
  
-        error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
-                                      SIOCGIFNETMASK, "SIOCGIFNETMASK");
-        if (error) {
-            return error;
+    if (!error) {
+        if (netdev->address.s_addr != INADDR_ANY) {
+            *address = netdev->address;
+            *netmask = netdev->netmask;
+        } else {
+            error = EADDRNOTAVAIL;
          }
-
-        netdev->cache_valid |= VALID_IN4;
      }
-    *address = netdev->address;
-    *netmask = netdev->netmask;
-    return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
+    ovs_mutex_unlock(&netdev->mutex);
+
+    return error;
  }
  
  static int
@@ -2107,6 +2210,7 @@ netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
      int error;
  
+    ovs_mutex_lock(&netdev->mutex);
      error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
      if (!error) {
          netdev->cache_valid |= VALID_IN4;
@@ -2117,6 +2221,8 @@ netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
                                  "SIOCSIFNETMASK", netmask);
          }
      }
+    ovs_mutex_unlock(&netdev->mutex);
+
      return error;
  }
  
@@ -2142,6 +2248,8 @@ static int
  netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
  {
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+
+    ovs_mutex_lock(&netdev->mutex);
      if (!(netdev->cache_valid & VALID_IN6)) {
          FILE *file;
          char line[128];
@@ -2166,6 +2274,8 @@ netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
          netdev->cache_valid |= VALID_IN6;
      }
      *in6 = netdev->in6;
+    ovs_mutex_unlock(&netdev->mutex);
+
      return 0;
  }
  
@@ -2280,6 +2390,7 @@ netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
      int error = 0;
  
+    ovs_mutex_lock(&netdev->mutex);
      if (!(netdev->cache_valid & VALID_DRVINFO)) {
          struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
  
@@ -2299,6 +2410,8 @@ netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
          smap_add(smap, "driver_version", netdev->drvinfo.version);
          smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
      }
+    ovs_mutex_unlock(&netdev->mutex);
+
      return error;
  }
  
@@ -2370,10 +2483,10 @@ iff_to_nd_flags(int iff)
  }
  
  static int
-netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
-                          enum netdev_flags on, enum netdev_flags *old_flagsp)
+update_flags(struct netdev_linux *netdev, enum netdev_flags off,
+             enum netdev_flags on, enum netdev_flags *old_flagsp)
+    OVS_REQUIRES(netdev->mutex)
  {
-    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
      int old_flags, new_flags;
      int error = 0;
  
@@ -2381,16 +2494,38 @@ netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
      *old_flagsp = iff_to_nd_flags(old_flags);
      new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
      if (new_flags != old_flags) {
-        error = set_flags(netdev_get_name(netdev_), new_flags);
-        get_flags(netdev_, &netdev->ifi_flags);
+        error = set_flags(netdev_get_name(&netdev->up), new_flags);
+        get_flags(&netdev->up, &netdev->ifi_flags);
      }
+
+    return error;
+}
+
+static int
+netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
+                          enum netdev_flags on, enum netdev_flags *old_flagsp)
+{
+    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+    int error;
+
+    ovs_mutex_lock(&netdev->mutex);
+    error = update_flags(netdev, off, on, old_flagsp);
+    ovs_mutex_unlock(&netdev->mutex);
+
      return error;
  }
  
  static unsigned int
-netdev_linux_change_seq(const struct netdev *netdev)
+netdev_linux_change_seq(const struct netdev *netdev_)
  {
-    return netdev_linux_cast(netdev)->change_seq;
+    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+    unsigned int change_seq;
+
+    ovs_mutex_lock(&netdev->mutex);
+    change_seq = netdev->change_seq;
+    ovs_mutex_unlock(&netdev->mutex);
+
+    return change_seq;
  }
  
  #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS,  \
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h

index 9457c17..23905d4 100644 (file)
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -33,9 +33,12 @@ extern "C" {
   * Network device implementations may read these members but should not modify
   * them. */
  struct netdev {
+    /* The following do not change during the lifetime of a struct netdev. */
      char *name;                         /* Name of network device. */
      const struct netdev_class *netdev_class; /* Functions to control
                                                  this device. */
+
+    /* The following are protected by 'netdev_mutex' (internal to netdev.c). */
      int ref_cnt;                        /* Times this devices was opened. */
      struct shash_node *node;            /* Pointer to element in global map. */
      struct list saved_flags_list; /* Contains "struct netdev_saved_flags". */
@@ -636,7 +639,6 @@ struct netdev_class {
  
  int netdev_register_provider(const struct netdev_class *);
  int netdev_unregister_provider(const char *type);
-const struct netdev_class *netdev_lookup_provider(const char *type);
  
  extern const struct netdev_class netdev_linux_class;
  extern const struct netdev_class netdev_internal_class;
diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c

index ac3da63..76aa148 100644 (file)
--- a/lib/netdev-vport.c
+++ b/lib/netdev-vport.c
@@ -48,6 +48,10 @@ VLOG_DEFINE_THIS_MODULE(netdev_vport);
  
  struct netdev_vport {
      struct netdev up;
+
+    /* Protects all members below. */
+    struct ovs_mutex mutex;
+
      unsigned int change_seq;
      uint8_t etheraddr[ETH_ADDR_LEN];
      struct netdev_stats stats;
@@ -65,9 +69,10 @@ struct vport_class {
  };
  
  static int netdev_vport_construct(struct netdev *);
-static int get_patch_config(const struct netdev *, struct smap *args);
+static int get_patch_config(const struct netdev *netdev, struct smap *args);
  static int get_tunnel_config(const struct netdev *, struct smap *args);
-static void netdev_vport_poll_notify(struct netdev_vport *);
+static void netdev_vport_poll_notify(struct netdev_vport *netdev)
+    OVS_REQUIRES(netdev->mutex);
  
  static bool
  is_vport_class(const struct netdev_class *class)
@@ -166,6 +171,7 @@ netdev_vport_construct(struct netdev *netdev_)
  {
      struct netdev_vport *netdev = netdev_vport_cast(netdev_);
  
+    ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL);
      netdev->change_seq = 1;
      eth_addr_random(netdev->etheraddr);
  
@@ -181,6 +187,7 @@ netdev_vport_destruct(struct netdev *netdev_)
  
      route_table_unregister();
      free(netdev->peer);
+    ovs_mutex_destroy(&netdev->mutex);
  }
  
  static void
@@ -195,26 +202,39 @@ netdev_vport_set_etheraddr(struct netdev *netdev_,
                             const uint8_t mac[ETH_ADDR_LEN])
  {
      struct netdev_vport *netdev = netdev_vport_cast(netdev_);
+
+    ovs_mutex_lock(&netdev->mutex);
      memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
      netdev_vport_poll_notify(netdev);
+    ovs_mutex_unlock(&netdev->mutex);
+
      return 0;
  }
  
  static int
-netdev_vport_get_etheraddr(const struct netdev *netdev,
+netdev_vport_get_etheraddr(const struct netdev *netdev_,
                             uint8_t mac[ETH_ADDR_LEN])
  {
-    memcpy(mac, netdev_vport_cast(netdev)->etheraddr, ETH_ADDR_LEN);
+    struct netdev_vport *netdev = netdev_vport_cast(netdev_);
+
+    ovs_mutex_lock(&netdev->mutex);
+    memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
+    ovs_mutex_unlock(&netdev->mutex);
+
      return 0;
  }
  
  static int
-tunnel_get_status(const struct netdev *netdev, struct smap *smap)
+tunnel_get_status(const struct netdev *netdev_, struct smap *smap)
  {
+    struct netdev_vport *netdev = netdev_vport_cast(netdev_);
      char iface[IFNAMSIZ];
      ovs_be32 route;
  
-    route = netdev_vport_cast(netdev)->tnl_cfg.ip_dst;
+    ovs_mutex_lock(&netdev->mutex);
+    route = netdev->tnl_cfg.ip_dst;
+    ovs_mutex_unlock(&netdev->mutex);
+
      if (route_table_get_name(route, iface)) {
          struct netdev *egress_netdev;
  
@@ -473,8 +493,10 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args)
                                 &tnl_cfg.out_key_present,
                                 &tnl_cfg.out_key_flow);
  
+    ovs_mutex_lock(&dev->mutex);
      dev->tnl_cfg = tnl_cfg;
      netdev_vport_poll_notify(dev);
+    ovs_mutex_unlock(&dev->mutex);
  
      return 0;
  }
@@ -482,56 +504,60 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args)
  static int
  get_tunnel_config(const struct netdev *dev, struct smap *args)
  {
-    const struct netdev_tunnel_config *tnl_cfg =
-        &netdev_vport_cast(dev)->tnl_cfg;
+    struct netdev_vport *netdev = netdev_vport_cast(dev);
+    struct netdev_tunnel_config tnl_cfg;
  
-    if (tnl_cfg->ip_dst) {
-        smap_add_format(args, "remote_ip", IP_FMT, IP_ARGS(tnl_cfg->ip_dst));
-    } else if (tnl_cfg->ip_dst_flow) {
+    ovs_mutex_lock(&netdev->mutex);
+    tnl_cfg = netdev->tnl_cfg;
+    ovs_mutex_unlock(&netdev->mutex);
+
+    if (tnl_cfg.ip_dst) {
+        smap_add_format(args, "remote_ip", IP_FMT, IP_ARGS(tnl_cfg.ip_dst));
+    } else if (tnl_cfg.ip_dst_flow) {
          smap_add(args, "remote_ip", "flow");
      }
  
-    if (tnl_cfg->ip_src) {
-        smap_add_format(args, "local_ip", IP_FMT, IP_ARGS(tnl_cfg->ip_src));
-    } else if (tnl_cfg->ip_src_flow) {
+    if (tnl_cfg.ip_src) {
+        smap_add_format(args, "local_ip", IP_FMT, IP_ARGS(tnl_cfg.ip_src));
+    } else if (tnl_cfg.ip_src_flow) {
          smap_add(args, "local_ip", "flow");
      }
  
-    if (tnl_cfg->in_key_flow && tnl_cfg->out_key_flow) {
+    if (tnl_cfg.in_key_flow && tnl_cfg.out_key_flow) {
          smap_add(args, "key", "flow");
-    } else if (tnl_cfg->in_key_present && tnl_cfg->out_key_present
-               && tnl_cfg->in_key == tnl_cfg->out_key) {
-        smap_add_format(args, "key", "%"PRIu64, ntohll(tnl_cfg->in_key));
+    } else if (tnl_cfg.in_key_present && tnl_cfg.out_key_present
+               && tnl_cfg.in_key == tnl_cfg.out_key) {
+        smap_add_format(args, "key", "%"PRIu64, ntohll(tnl_cfg.in_key));
      } else {
-        if (tnl_cfg->in_key_flow) {
+        if (tnl_cfg.in_key_flow) {
              smap_add(args, "in_key", "flow");
-        } else if (tnl_cfg->in_key_present) {
+        } else if (tnl_cfg.in_key_present) {
              smap_add_format(args, "in_key", "%"PRIu64,
-                            ntohll(tnl_cfg->in_key));
+                            ntohll(tnl_cfg.in_key));
          }
  
-        if (tnl_cfg->out_key_flow) {
+        if (tnl_cfg.out_key_flow) {
              smap_add(args, "out_key", "flow");
-        } else if (tnl_cfg->out_key_present) {
+        } else if (tnl_cfg.out_key_present) {
              smap_add_format(args, "out_key", "%"PRIu64,
-                            ntohll(tnl_cfg->out_key));
+                            ntohll(tnl_cfg.out_key));
          }
      }
  
-    if (tnl_cfg->ttl_inherit) {
+    if (tnl_cfg.ttl_inherit) {
          smap_add(args, "ttl", "inherit");
-    } else if (tnl_cfg->ttl != DEFAULT_TTL) {
-        smap_add_format(args, "ttl", "%"PRIu8, tnl_cfg->ttl);
+    } else if (tnl_cfg.ttl != DEFAULT_TTL) {
+        smap_add_format(args, "ttl", "%"PRIu8, tnl_cfg.ttl);
      }
  
-    if (tnl_cfg->tos_inherit) {
+    if (tnl_cfg.tos_inherit) {
          smap_add(args, "tos", "inherit");
-    } else if (tnl_cfg->tos) {
-        smap_add_format(args, "tos", "0x%x", tnl_cfg->tos);
+    } else if (tnl_cfg.tos) {
+        smap_add_format(args, "tos", "0x%x", tnl_cfg.tos);
      }
  
-    if (tnl_cfg->dst_port) {
-        uint16_t dst_port = ntohs(tnl_cfg->dst_port);
+    if (tnl_cfg.dst_port) {
+        uint16_t dst_port = ntohs(tnl_cfg.dst_port);
          const char *type = netdev_get_type(dev);
  
          if ((!strcmp("vxlan", type) && dst_port != VXLAN_DST_PORT) ||
@@ -540,11 +566,11 @@ get_tunnel_config(const struct netdev *dev, struct smap *args)
          }
      }
  
-    if (tnl_cfg->csum) {
+    if (tnl_cfg.csum) {
          smap_add(args, "csum", "true");
      }
  
-    if (!tnl_cfg->dont_fragment) {
+    if (!tnl_cfg.dont_fragment) {
          smap_add(args, "df_default", "false");
      }
  
@@ -553,12 +579,26 @@ get_tunnel_config(const struct netdev *dev, struct smap *args)
  \f
  /* Code specific to patch ports. */
  
-const char *
-netdev_vport_patch_peer(const struct netdev *netdev)
+/* If 'netdev' is a patch port, returns the name of its peer as a malloc()'d
+ * string that the caller must free.
+ *
+ * If 'netdev' is not a patch port, returns NULL. */
+char *
+netdev_vport_patch_peer(const struct netdev *netdev_)
  {
-    return (netdev_vport_is_patch(netdev)
-            ? netdev_vport_cast(netdev)->peer
-            : NULL);
+    char *peer = NULL;
+
+    if (netdev_vport_is_patch(netdev_)) {
+        struct netdev_vport *netdev = netdev_vport_cast(netdev_);
+
+        ovs_mutex_lock(&netdev->mutex);
+        if (netdev->peer) {
+            peer = xstrdup(netdev->peer);
+        }
+        ovs_mutex_unlock(&netdev->mutex);
+    }
+
+    return peer;
  }
  
  void
@@ -567,8 +607,11 @@ netdev_vport_inc_rx(const struct netdev *netdev,
  {
      if (is_vport_class(netdev_get_class(netdev))) {
          struct netdev_vport *dev = netdev_vport_cast(netdev);
+
+        ovs_mutex_lock(&dev->mutex);
          dev->stats.rx_packets += stats->n_packets;
          dev->stats.rx_bytes += stats->n_bytes;
+        ovs_mutex_unlock(&dev->mutex);
      }
  }
  
@@ -578,8 +621,11 @@ netdev_vport_inc_tx(const struct netdev *netdev,
  {
      if (is_vport_class(netdev_get_class(netdev))) {
          struct netdev_vport *dev = netdev_vport_cast(netdev);
+
+        ovs_mutex_lock(&dev->mutex);
          dev->stats.tx_packets += stats->n_packets;
          dev->stats.tx_bytes += stats->n_bytes;
+        ovs_mutex_unlock(&dev->mutex);
      }
  }
  
@@ -588,9 +634,12 @@ get_patch_config(const struct netdev *dev_, struct smap *args)
  {
      struct netdev_vport *dev = netdev_vport_cast(dev_);
  
+    ovs_mutex_lock(&dev->mutex);
      if (dev->peer) {
          smap_add(args, "peer", dev->peer);
      }
+    ovs_mutex_unlock(&dev->mutex);
+
      return 0;
  }
  
@@ -617,9 +666,12 @@ set_patch_config(struct netdev *dev_, const struct smap *args)
          return EINVAL;
      }
  
+    ovs_mutex_lock(&dev->mutex);
      free(dev->peer);
      dev->peer = xstrdup(peer);
      netdev_vport_poll_notify(dev);
+    ovs_mutex_unlock(&dev->mutex);
+
      return 0;
  }
  
@@ -627,7 +679,11 @@ static int
  get_stats(const struct netdev *netdev, struct netdev_stats *stats)
  {
      struct netdev_vport *dev = netdev_vport_cast(netdev);
-    memcpy(stats, &dev->stats, sizeof *stats);
+
+    ovs_mutex_lock(&dev->mutex);
+    *stats = dev->stats;
+    ovs_mutex_unlock(&dev->mutex);
+
      return 0;
  }
  \f
@@ -712,15 +768,15 @@ netdev_vport_tunnel_register(void)
          TUNNEL_CLASS("vxlan", "vxlan_system"),
          TUNNEL_CLASS("lisp", "lisp_system")
      };
-    static bool inited;
+    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
  
-    int i;
+    if (ovsthread_once_start(&once)) {
+        int i;
  
-    if (!inited) {
-        inited = true;
          for (i = 0; i < ARRAY_SIZE(vport_classes); i++) {
              netdev_register_provider(&vport_classes[i].netdev_class);
          }
+        ovsthread_once_done(&once);
      }
  }
  
diff --git a/lib/netdev-vport.h b/lib/netdev-vport.h

index 5394966..dc49097 100644 (file)
--- a/lib/netdev-vport.h
+++ b/lib/netdev-vport.h
@@ -31,7 +31,7 @@ void netdev_vport_patch_register(void);
  
  bool netdev_vport_is_patch(const struct netdev *);
  
-const char *netdev_vport_patch_peer(const struct netdev *netdev);
+char *netdev_vport_patch_peer(const struct netdev *netdev);
  
  void netdev_vport_inc_rx(const struct netdev *,
                           const struct dpif_flow_stats *);
diff --git a/lib/netdev.c b/lib/netdev.c

index 5f4345a..c70105b 100644 (file)
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -56,10 +56,31 @@ struct netdev_saved_flags {
      enum netdev_flags saved_values;
  };
  
-static struct shash netdev_classes = SHASH_INITIALIZER(&netdev_classes);
+/* Protects 'netdev_shash' and the mutable members of struct netdev. */
+static struct ovs_mutex netdev_mutex = OVS_MUTEX_INITIALIZER;
  
  /* All created network devices. */
-static struct shash netdev_shash = SHASH_INITIALIZER(&netdev_shash);
+static struct shash netdev_shash OVS_GUARDED_BY(netdev_mutex)
+    = SHASH_INITIALIZER(&netdev_shash);
+
+/* Protects 'netdev_classes' against insertions or deletions.
+ *
+ * This is an rwlock not for performance reasons but to allow recursive
+ * acquisition when calling into providers.  For example, netdev_run() calls
+ * into provider 'run' functions, which might reasonably want to call one of
+ * the netdev functions that takes netdev_class_rwlock read-only. */
+static struct ovs_rwlock netdev_class_rwlock OVS_ACQ_BEFORE(netdev_mutex)
+    = OVS_RWLOCK_INITIALIZER;
+
+/* Contains 'struct netdev_registered_class'es. */
+static struct hmap netdev_classes OVS_GUARDED_BY(netdev_class_rwlock)
+    = HMAP_INITIALIZER(&netdev_classes);
+
+struct netdev_registered_class {
+    struct hmap_node hmap_node; /* In 'netdev_classes', by class->type. */
+    const struct netdev_class *class;
+    atomic_int ref_cnt;         /* Number of 'struct netdev's of this class. */
+};
  
  /* This is set pretty low because we probably won't learn anything from the
   * additional log messages. */
@@ -70,12 +91,11 @@ void update_device_args(struct netdev *, const struct shash *args);
  
  static void
  netdev_initialize(void)
+    OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex)
  {
-    static bool inited;
-
-    if (!inited) {
-        inited = true;
+    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
  
+    if (ovsthread_once_start(&once)) {
          fatal_signal_add_hook(restore_all_flags, NULL, NULL, true);
          netdev_vport_patch_register();
  
@@ -89,8 +109,10 @@ netdev_initialize(void)
          netdev_register_provider(&netdev_tap_class);
          netdev_register_provider(&netdev_bsd_class);
  #endif
-       netdev_register_provider(&netdev_tunnel_class);
-       netdev_register_provider(&netdev_pltap_class);
+        netdev_register_provider(&netdev_tunnel_class);
+        netdev_register_provider(&netdev_pltap_class);
+
+        ovsthread_once_done(&once);
      }
  }
  
@@ -100,14 +122,18 @@ netdev_initialize(void)
   * main poll loop. */
  void
  netdev_run(void)
+    OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex)
  {
-    struct shash_node *node;
-    SHASH_FOR_EACH(node, &netdev_classes) {
-        const struct netdev_class *netdev_class = node->data;
-        if (netdev_class->run) {
-            netdev_class->run();
-        }
+    struct netdev_registered_class *rc;
+
+    ovs_rwlock_rdlock(&netdev_class_rwlock);
+    HMAP_FOR_EACH (rc, hmap_node, &netdev_classes) {
+        /* Providers may leave 'run' null, as the old loop allowed. */
+        if (rc->class->run) {
+            rc->class->run();
+        }
      }
+    ovs_rwlock_unlock(&netdev_class_rwlock);
  }
  
  /* Arranges for poll_block() to wake up when netdev_run() needs to be called.
@@ -116,39 +139,66 @@ netdev_run(void)
   * main poll loop. */
  void
  netdev_wait(void)
+    OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex)
  {
-    struct shash_node *node;
-    SHASH_FOR_EACH(node, &netdev_classes) {
-        const struct netdev_class *netdev_class = node->data;
-        if (netdev_class->wait) {
-            netdev_class->wait();
+    struct netdev_registered_class *rc;
+
+    ovs_rwlock_rdlock(&netdev_class_rwlock);
+    HMAP_FOR_EACH (rc, hmap_node, &netdev_classes) {
+        /* Providers may leave 'wait' null, as the old loop allowed. */
+        if (rc->class->wait) {
+            rc->class->wait();
+        }
+    }
+    ovs_rwlock_unlock(&netdev_class_rwlock);
+}
+
+static struct netdev_registered_class *
+netdev_lookup_class(const char *type)
+    OVS_REQ_RDLOCK(netdev_class_rwlock)
+{
+    struct netdev_registered_class *rc;
+
+    HMAP_FOR_EACH_WITH_HASH (rc, hmap_node, hash_string(type, 0),
+                             &netdev_classes) {
+        if (!strcmp(type, rc->class->type)) {
+            return rc;
          }
      }
+    return NULL;
  }
  
  /* Initializes and registers a new netdev provider.  After successful
   * registration, new netdevs of that type can be opened using netdev_open(). */
  int
  netdev_register_provider(const struct netdev_class *new_class)
+    OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex)
  {
-    if (shash_find(&netdev_classes, new_class->type)) {
+    int error;
+
+    ovs_rwlock_wrlock(&netdev_class_rwlock);
+    if (netdev_lookup_class(new_class->type)) {
          VLOG_WARN("attempted to register duplicate netdev provider: %s",
                     new_class->type);
-        return EEXIST;
-    }
-
-    if (new_class->init) {
-        int error = new_class->init();
-        if (error) {
+        error = EEXIST;
+    } else {
+        error = new_class->init ? new_class->init() : 0;
+        if (!error) {
+            struct netdev_registered_class *rc;
+
+            rc = xmalloc(sizeof *rc);
+            hmap_insert(&netdev_classes, &rc->hmap_node,
+                        hash_string(new_class->type, 0));
+            rc->class = new_class;
+            atomic_init(&rc->ref_cnt, 0);
+        } else {
              VLOG_ERR("failed to initialize %s network device class: %s",
                       new_class->type, ovs_strerror(error));
-            return error;
          }
      }
+    ovs_rwlock_unlock(&netdev_class_rwlock);
  
-    shash_add(&netdev_classes, new_class->type, new_class);
-
-    return 0;
+    return error;
  }
  
  /* Unregisters a netdev provider.  'type' must have been previously
@@ -156,51 +203,52 @@ netdev_register_provider(const struct netdev_class *new_class)
   * new netdevs of that type cannot be opened using netdev_open(). */
  int
  netdev_unregister_provider(const char *type)
+    OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex)
  {
-    struct shash_node *del_node, *netdev_node;
+    struct netdev_registered_class *rc;
+    int error;
  
-    del_node = shash_find(&netdev_classes, type);
-    if (!del_node) {
+    ovs_rwlock_wrlock(&netdev_class_rwlock);
+    rc = netdev_lookup_class(type);
+    if (!rc) {
          VLOG_WARN("attempted to unregister a netdev provider that is not "
                    "registered: %s", type);
-        return EAFNOSUPPORT;
-    }
+        error = EAFNOSUPPORT;
+    } else {
+        int ref_cnt;
  
-    SHASH_FOR_EACH (netdev_node, &netdev_shash) {
-        struct netdev *netdev = netdev_node->data;
-        if (!strcmp(netdev->netdev_class->type, type)) {
+        atomic_read(&rc->ref_cnt, &ref_cnt);
+        if (!ref_cnt) {
+            hmap_remove(&netdev_classes, &rc->hmap_node);
+            free(rc);
+            error = 0;
+        } else {
              VLOG_WARN("attempted to unregister in use netdev provider: %s",
                        type);
-            return EBUSY;
+            error = EBUSY;
          }
      }
+    ovs_rwlock_unlock(&netdev_class_rwlock);
  
-    shash_delete(&netdev_classes, del_node);
-
-    return 0;
-}
-
-const struct netdev_class *
-netdev_lookup_provider(const char *type)
-{
-    netdev_initialize();
-    return shash_find_data(&netdev_classes, type && type[0] ? type : "system");
+    return error;
  }
  
  /* Clears 'types' and enumerates the types of all currently registered netdev
   * providers into it.  The caller must first initialize the sset. */
  void
  netdev_enumerate_types(struct sset *types)
+    OVS_EXCLUDED(netdev_mutex)
  {
-    struct shash_node *node;
+    struct netdev_registered_class *rc;
  
      netdev_initialize();
      sset_clear(types);
  
-    SHASH_FOR_EACH(node, &netdev_classes) {
-        const struct netdev_class *netdev_class = node->data;
-        sset_add(types, netdev_class->type);
+    ovs_rwlock_rdlock(&netdev_class_rwlock);
+    HMAP_FOR_EACH (rc, hmap_node, &netdev_classes) {
+        sset_add(types, rc->class->type);
      }
+    ovs_rwlock_unlock(&netdev_class_rwlock);
  }
  
  /* Check that the network device name is not the same as any of the registered
@@ -210,17 +258,21 @@ netdev_enumerate_types(struct sset *types)
   * Returns true if there is a name conflict, false otherwise. */
  bool
  netdev_is_reserved_name(const char *name)
+    OVS_EXCLUDED(netdev_mutex)
  {
-    struct shash_node *node;
+    struct netdev_registered_class *rc;
  
      netdev_initialize();
-    SHASH_FOR_EACH (node, &netdev_classes) {
-        const char *dpif_port;
-        dpif_port = netdev_vport_class_get_dpif_port(node->data);
+
+    ovs_rwlock_rdlock(&netdev_class_rwlock);
+    HMAP_FOR_EACH (rc, hmap_node, &netdev_classes) {
+        const char *dpif_port = netdev_vport_class_get_dpif_port(rc->class);
          if (dpif_port && !strcmp(dpif_port, name)) {
+            ovs_rwlock_unlock(&netdev_class_rwlock);
              return true;
          }
      }
+    ovs_rwlock_unlock(&netdev_class_rwlock);
  
      if (!strncmp(name, "ovs-", 4)) {
          struct sset types;
@@ -249,29 +301,39 @@ netdev_is_reserved_name(const char *name)
   * before they can be used. */
  int
  netdev_open(const char *name, const char *type, struct netdev **netdevp)
+    OVS_EXCLUDED(netdev_mutex)
  {
      struct netdev *netdev;
      int error;
  
      netdev_initialize();
  
+    ovs_rwlock_rdlock(&netdev_class_rwlock);
+    ovs_mutex_lock(&netdev_mutex);
      netdev = shash_find_data(&netdev_shash, name);
      if (!netdev) {
-        const struct netdev_class *class;
+        struct netdev_registered_class *rc;
  
-        class = netdev_lookup_provider(type);
-        if (class) {
-            netdev = class->alloc();
+        rc = netdev_lookup_class(type && type[0] ? type : "system");
+        if (rc) {
+            netdev = rc->class->alloc();
              if (netdev) {
                  memset(netdev, 0, sizeof *netdev);
-                netdev->netdev_class = class;
+                netdev->netdev_class = rc->class;
                  netdev->name = xstrdup(name);
                  netdev->node = shash_add(&netdev_shash, name, netdev);
                  list_init(&netdev->saved_flags_list);
  
-                error = class->construct(netdev);
-                if (error) {
-                    class->dealloc(netdev);
+                error = rc->class->construct(netdev);
+                if (!error) {
+                    int old_ref_cnt;
+
+                    atomic_add(&rc->ref_cnt, 1, &old_ref_cnt);
+                } else {
+                    free(netdev->name);
+                    ovs_assert(list_is_empty(&netdev->saved_flags_list));
+                    shash_delete(&netdev_shash, netdev->node);
+                    rc->class->dealloc(netdev);
                  }
              } else {
                  error = ENOMEM;
@@ -285,6 +347,9 @@ netdev_open(const char *name, const char *type, struct netdev **netdevp)
          error = 0;
      }
  
+    ovs_mutex_unlock(&netdev_mutex);
+    ovs_rwlock_unlock(&netdev_class_rwlock);
+
      if (!error) {
          netdev->ref_cnt++;
          *netdevp = netdev;
@@ -298,12 +363,15 @@ netdev_open(const char *name, const char *type, struct netdev **netdevp)
   * 'netdev_' is null. */
  struct netdev *
  netdev_ref(const struct netdev *netdev_)
+    OVS_EXCLUDED(netdev_mutex)
  {
      struct netdev *netdev = CONST_CAST(struct netdev *, netdev_);
  
      if (netdev) {
+        ovs_mutex_lock(&netdev_mutex);
          ovs_assert(netdev->ref_cnt > 0);
          netdev->ref_cnt++;
+        ovs_mutex_unlock(&netdev_mutex);
      }
      return netdev;
  }
@@ -312,9 +380,10 @@ netdev_ref(const struct netdev *netdev_)
   * or NULL if none are needed. */
  int
  netdev_set_config(struct netdev *netdev, const struct smap *args)
+    OVS_EXCLUDED(netdev_mutex)
  {
      if (netdev->netdev_class->set_config) {
-        struct smap no_args = SMAP_INITIALIZER(&no_args);
+        const struct smap no_args = SMAP_INITIALIZER(&no_args);
          return netdev->netdev_class->set_config(netdev,
                                                  args ? args : &no_args);
      } else if (args && !smap_is_empty(args)) {
@@ -334,6 +403,7 @@ netdev_set_config(struct netdev *netdev, const struct smap *args)
   * smap_destroy(). */
  int
  netdev_get_config(const struct netdev *netdev, struct smap *args)
+    OVS_EXCLUDED(netdev_mutex)
  {
      int error;
  
@@ -352,6 +422,7 @@ netdev_get_config(const struct netdev *netdev, struct smap *args)
  
  const struct netdev_tunnel_config *
  netdev_get_tunnel_config(const struct netdev *netdev)
+    OVS_EXCLUDED(netdev_mutex)
  {
      if (netdev->netdev_class->get_tunnel_config) {
          return netdev->netdev_class->get_tunnel_config(netdev);
@@ -362,22 +433,38 @@ netdev_get_tunnel_config(const struct netdev *netdev)
  
  static void
  netdev_unref(struct netdev *dev)
+    OVS_RELEASES(netdev_mutex)
  {
      ovs_assert(dev->ref_cnt);
      if (!--dev->ref_cnt) {
+        const struct netdev_class *class = dev->netdev_class;
+        struct netdev_registered_class *rc;
+        int old_ref_cnt;
+
          dev->netdev_class->destruct(dev);
  
          shash_delete(&netdev_shash, dev->node);
          free(dev->name);
          dev->netdev_class->dealloc(dev);
+        ovs_mutex_unlock(&netdev_mutex);
+
+        ovs_rwlock_rdlock(&netdev_class_rwlock);
+        rc = netdev_lookup_class(class->type);
+        atomic_sub(&rc->ref_cnt, 1, &old_ref_cnt);
+        ovs_assert(old_ref_cnt > 0);
+        ovs_rwlock_unlock(&netdev_class_rwlock);
+    } else {
+        ovs_mutex_unlock(&netdev_mutex);
      }
  }
  
  /* Closes and destroys 'netdev'. */
  void
  netdev_close(struct netdev *netdev)
+    OVS_EXCLUDED(netdev_mutex)
  {
      if (netdev) {
+        ovs_mutex_lock(&netdev_mutex);
          netdev_unref(netdev);
      }
  }
@@ -403,6 +490,7 @@ netdev_parse_name(const char *netdev_name_, char **name, char **type)
  
  int
  netdev_rx_open(struct netdev *netdev, struct netdev_rx **rxp)
+    OVS_EXCLUDED(netdev_mutex)
  {
      int error;
  
@@ -412,7 +500,10 @@ netdev_rx_open(struct netdev *netdev, struct netdev_rx **rxp)
              rx->netdev = netdev;
              error = netdev->netdev_class->rx_construct(rx);
              if (!error) {
+                ovs_mutex_lock(&netdev_mutex);
                  netdev->ref_cnt++;
+                ovs_mutex_unlock(&netdev_mutex);
+
                  *rxp = rx;
                  return 0;
              }
@@ -430,6 +521,7 @@ netdev_rx_open(struct netdev *netdev, struct netdev_rx **rxp)
  
  void
  netdev_rx_close(struct netdev_rx *rx)
+    OVS_EXCLUDED(netdev_mutex)
  {
      if (rx) {
          struct netdev *netdev = rx->netdev;
@@ -849,6 +941,7 @@ static int
  do_update_flags(struct netdev *netdev, enum netdev_flags off,
                  enum netdev_flags on, enum netdev_flags *old_flagsp,
                  struct netdev_saved_flags **sfp)
+    OVS_EXCLUDED(netdev_mutex)
  {
      struct netdev_saved_flags *sf = NULL;
      enum netdev_flags old_flags;
@@ -865,6 +958,7 @@ do_update_flags(struct netdev *netdev, enum netdev_flags off,
          enum netdev_flags new_flags = (old_flags & ~off) | on;
          enum netdev_flags changed_flags = old_flags ^ new_flags;
          if (changed_flags) {
+            ovs_mutex_lock(&netdev_mutex);
              *sfp = sf = xmalloc(sizeof *sf);
              sf->netdev = netdev;
              list_push_front(&netdev->saved_flags_list, &sf->node);
@@ -872,6 +966,7 @@ do_update_flags(struct netdev *netdev, enum netdev_flags off,
              sf->saved_values = changed_flags & new_flags;
  
              netdev->ref_cnt++;
+            ovs_mutex_unlock(&netdev_mutex);
          }
      }
  
@@ -935,6 +1030,7 @@ netdev_turn_flags_off(struct netdev *netdev, enum netdev_flags flags,
   * Does nothing if 'sf' is NULL. */
  void
  netdev_restore_flags(struct netdev_saved_flags *sf)
+    OVS_EXCLUDED(netdev_mutex)
  {
      if (sf) {
          struct netdev *netdev = sf->netdev;
@@ -944,9 +1040,10 @@ netdev_restore_flags(struct netdev_saved_flags *sf)
                                             sf->saved_flags & sf->saved_values,
                                             sf->saved_flags & ~sf->saved_values,
                                             &old_flags);
+
+        ovs_mutex_lock(&netdev_mutex);
          list_remove(&sf->node);
          free(sf);
-
          netdev_unref(netdev);
      }
  }
@@ -1381,13 +1478,16 @@ netdev_get_class(const struct netdev *netdev)
   * The caller must free the returned netdev with netdev_close(). */
  struct netdev *
  netdev_from_name(const char *name)
+    OVS_EXCLUDED(netdev_mutex)
  {
      struct netdev *netdev;
  
+    ovs_mutex_lock(&netdev_mutex);
      netdev = shash_find_data(&netdev_shash, name);
      if (netdev) {
-        netdev_ref(netdev);
+        netdev->ref_cnt++;
      }
+    ovs_mutex_unlock(&netdev_mutex);
  
      return netdev;
  }
@@ -1399,8 +1499,11 @@ netdev_from_name(const char *name)
  void
  netdev_get_devices(const struct netdev_class *netdev_class,
                     struct shash *device_list)
+    OVS_EXCLUDED(netdev_mutex)
  {
      struct shash_node *node;
+
+    ovs_mutex_lock(&netdev_mutex);
      SHASH_FOR_EACH (node, &netdev_shash) {
          struct netdev *dev = node->data;
  
@@ -1409,6 +1512,7 @@ netdev_get_devices(const struct netdev_class *netdev_class,
              shash_add(device_list, node->name, node->data);
          }
      }
+    ovs_mutex_unlock(&netdev_mutex);
  }
  
  const char *
diff --git a/lib/netdev.h b/lib/netdev.h

index eb1870b..287f6cc 100644 (file)
--- a/lib/netdev.h
+++ b/lib/netdev.h
@@ -31,7 +31,23 @@ extern "C" {
   * Every port on a switch must have a corresponding netdev that must minimally
   * support a few operations, such as the ability to read the netdev's MTU.
   * The PORTING file at the top of the source tree has more information in the
- * "Writing a netdev Provider" section. */
+ * "Writing a netdev Provider" section.
+ *
+ * Thread-safety
+ * =============
+ *
+ * Most of the netdev functions are fully thread-safe: they may be called from
+ * any number of threads on the same or different netdev objects.  The
+ * exceptions are:
+ *
+ *    netdev_rx_recv()
+ *    netdev_rx_wait()
+ *    netdev_rx_drain()
+ *
+ *      These functions are conditionally thread-safe: they may be called from
+ *      different threads only on different netdev_rx objects.  (The client may
+ *      create multiple netdev_rx objects for a single netdev and access each
+ *      of those from a different thread.) */
  
  struct netdev;
  struct netdev_class;
diff --git a/lib/nx-match.c b/lib/nx-match.c

index 940dd9a..09f7f54 100644 (file)
--- a/lib/nx-match.c
+++ b/lib/nx-match.c
@@ -693,6 +693,10 @@ nx_put_raw(struct ofpbuf *b, bool oxm, const struct match *match,
                      htonl(flow->regs[i]), htonl(match->wc.masks.regs[i]));
      }
  
+    /* Mark. */
+    nxm_put_32m(b, NXM_NX_PKT_MARK, htonl(flow->pkt_mark),
+                htonl(match->wc.masks.pkt_mark));
+
      /* OpenFlow 1.1+ Metadata. */
      nxm_put_64m(b, OXM_OF_METADATA, flow->metadata, match->wc.masks.metadata);
  
diff --git a/lib/odp-execute.c b/lib/odp-execute.c

index e6e8c91..d505c60 100644 (file)
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -65,7 +65,7 @@ odp_execute_set_action(struct ofpbuf *packet, const struct nlattr *a,
          break;
  
      case OVS_KEY_ATTR_SKB_MARK:
-        flow->skb_mark = nl_attr_get_u32(a);
+        flow->pkt_mark = nl_attr_get_u32(a);
          break;
  
      case OVS_KEY_ATTR_ETHERNET:
diff --git a/lib/odp-util.c b/lib/odp-util.c

index 78d5a1b..a09042e 100644 (file)
--- a/lib/odp-util.c
+++ b/lib/odp-util.c
@@ -2361,7 +2361,7 @@ odp_flow_key_from_flow__(struct ofpbuf *buf, const struct flow *data,
          tun_key_to_attr(buf, &data->tunnel);
      }
  
-    nl_msg_put_u32(buf, OVS_KEY_ATTR_SKB_MARK, data->skb_mark);
+    nl_msg_put_u32(buf, OVS_KEY_ATTR_SKB_MARK, data->pkt_mark);
  
      /* Add an ingress port attribute if this is a mask or 'odp_in_port'
       * is not the magical value "ODPP_NONE". */
@@ -2932,7 +2932,7 @@ odp_flow_key_to_flow(const struct nlattr *key, size_t key_len,
      }
  
      if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_SKB_MARK)) {
-        flow->skb_mark = nl_attr_get_u32(attrs[OVS_KEY_ATTR_SKB_MARK]);
+        flow->pkt_mark = nl_attr_get_u32(attrs[OVS_KEY_ATTR_SKB_MARK]);
          expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_SKB_MARK;
      }
  
@@ -3044,11 +3044,11 @@ commit_set_action(struct ofpbuf *odp_actions, enum ovs_key_attr key_type,
  }
  
  void
-odp_put_skb_mark_action(const uint32_t skb_mark,
+odp_put_pkt_mark_action(const uint32_t pkt_mark,
                          struct ofpbuf *odp_actions)
  {
-    commit_set_action(odp_actions, OVS_KEY_ATTR_SKB_MARK, &skb_mark,
-                      sizeof(skb_mark));
+    commit_set_action(odp_actions, OVS_KEY_ATTR_SKB_MARK, &pkt_mark,
+                      sizeof(pkt_mark));
  }
  
  /* If any of the flow key data that ODP actions can modify are different in
@@ -3306,18 +3306,18 @@ commit_set_priority_action(const struct flow *flow, struct flow *base,
  }
  
  static void
-commit_set_skb_mark_action(const struct flow *flow, struct flow *base,
+commit_set_pkt_mark_action(const struct flow *flow, struct flow *base,
                             struct ofpbuf *odp_actions,
                             struct flow_wildcards *wc)
  {
-    if (base->skb_mark == flow->skb_mark) {
+    if (base->pkt_mark == flow->pkt_mark) {
          return;
      }
  
-    memset(&wc->masks.skb_mark, 0xff, sizeof wc->masks.skb_mark);
-    base->skb_mark = flow->skb_mark;
+    memset(&wc->masks.pkt_mark, 0xff, sizeof wc->masks.pkt_mark);
+    base->pkt_mark = flow->pkt_mark;
  
-    odp_put_skb_mark_action(base->skb_mark, odp_actions);
+    odp_put_pkt_mark_action(base->pkt_mark, odp_actions);
  }
  /* If any of the flow key data that ODP actions can modify are different in
   * 'base' and 'flow', appends ODP actions to 'odp_actions' that change the flow
@@ -3339,5 +3339,5 @@ commit_odp_actions(const struct flow *flow, struct flow *base,
       */
      commit_mpls_action(flow, base, odp_actions, wc);
      commit_set_priority_action(flow, base, odp_actions, wc);
-    commit_set_skb_mark_action(flow, base, odp_actions, wc);
+    commit_set_pkt_mark_action(flow, base, odp_actions, wc);
  }
diff --git a/lib/odp-util.h b/lib/odp-util.h

index 7e27888..0c40f38 100644 (file)
--- a/lib/odp-util.h
+++ b/lib/odp-util.h
@@ -181,7 +181,7 @@ size_t odp_put_userspace_action(uint32_t pid,
                                  struct ofpbuf *odp_actions);
  void odp_put_tunnel_action(const struct flow_tnl *tunnel,
                             struct ofpbuf *odp_actions);
-void odp_put_skb_mark_action(const uint32_t skb_mark,
+void odp_put_pkt_mark_action(const uint32_t pkt_mark,
                               struct ofpbuf *odp_actions);
  
  /* Reasons why a subfacet might not be fast-pathable. */
diff --git a/lib/ofp-print.c b/lib/ofp-print.c

index 1a4dd9c..21989a9 100644 (file)
--- a/lib/ofp-print.c
+++ b/lib/ofp-print.c
@@ -130,6 +130,10 @@ ofp_print_packet_in(struct ds *string, const struct ofp_header *oh,
          }
      }
  
+    if (pin.fmd.pkt_mark != 0) {
+        ds_put_format(string, " pkt_mark=0x%"PRIx32, pin.fmd.pkt_mark);
+    }
+
      ds_put_format(string, " (via %s)",
                    ofputil_packet_in_reason_to_string(pin.reason, reasonbuf,
                                                       sizeof reasonbuf));
diff --git a/lib/ofp-util.c b/lib/ofp-util.c

index d1bcf9c..45ff0a1 100644 (file)
--- a/lib/ofp-util.c
+++ b/lib/ofp-util.c
@@ -1134,11 +1134,17 @@ ofputil_usable_protocols(const struct match *match)
          return OFPUTIL_P_NONE;
      }
  
-    /* skb_mark and skb_priority can't be sent in a flow_mod */
-    if (wc->masks.skb_mark || wc->masks.skb_priority) {
+    /* skb_priority can't be sent in a flow_mod */
+    if (wc->masks.skb_priority) {
          return OFPUTIL_P_NONE;
      }
  
+    /* NXM and OXM support pkt_mark */
+    if (wc->masks.pkt_mark) {
+        return OFPUTIL_P_OF10_NXM_ANY | OFPUTIL_P_OF12_OXM
+            | OFPUTIL_P_OF13_OXM;
+    }
+
      /* NXM, OXM, and OF1.1 support bitwise matching on ethernet addresses. */
      if (!eth_mask_is_exact(wc->masks.dl_src)
          && !eth_addr_is_zero(wc->masks.dl_src)) {
@@ -2917,6 +2923,7 @@ ofputil_decode_packet_in_finish(struct ofputil_packet_in *pin,
      pin->fmd.tun_dst = match->flow.tunnel.ip_dst;
      pin->fmd.metadata = match->flow.metadata;
      memcpy(pin->fmd.regs, match->flow.regs, sizeof pin->fmd.regs);
+    pin->fmd.pkt_mark = match->flow.pkt_mark;
  }
  
  enum ofperr
@@ -3031,6 +3038,10 @@ ofputil_packet_in_to_match(const struct ofputil_packet_in *pin,
          }
      }
  
+    if (pin->fmd.pkt_mark != 0) {
+        match_set_pkt_mark(match, pin->fmd.pkt_mark);
+    }
+
      match_set_in_port(match, pin->fmd.in_port);
  }
  
diff --git a/lib/ovs-thread.h b/lib/ovs-thread.h

index 3547686..b7bc5d1 100644 (file)
--- a/lib/ovs-thread.h
+++ b/lib/ovs-thread.h
@@ -467,12 +467,12 @@ struct ovsthread_once {
      }
  
  static inline bool ovsthread_once_start(struct ovsthread_once *once)
-    OVS_TRY_LOCK(true, &once->mutex);
+    OVS_TRY_LOCK(true, once->mutex);
  void ovsthread_once_done(struct ovsthread_once *once)
-    OVS_RELEASES(&once->mutex);
+    OVS_RELEASES(once->mutex);
  
  bool ovsthread_once_start__(struct ovsthread_once *once)
-    OVS_TRY_LOCK(false, &once->mutex);
+    OVS_TRY_LOCK(false, once->mutex);
  
  static inline bool
  ovsthread_once_is_done__(const struct ovsthread_once *once)
@@ -496,11 +496,6 @@ ovsthread_once_start(struct ovsthread_once *once)
      return OVS_UNLIKELY(!ovsthread_once_is_done__(once)
                          && !ovsthread_once_start__(once));
  }
-
-#ifdef __CHECKER__
-#define ovsthread_once_start(ONCE) \
-    ((ONCE)->done ? false : ({ OVS_MACRO_LOCK((&ONCE->mutex)); true; }))
-#endif
  \f
  /* Thread ID.
   *
diff --git a/lib/poll-loop.c b/lib/poll-loop.c

index 5f9b9cd..4eb1187 100644 (file)
--- a/lib/poll-loop.c
+++ b/lib/poll-loop.c
@@ -26,6 +26,7 @@
  #include "fatal-signal.h"
  #include "list.h"
  #include "ovs-thread.h"
+#include "seq.h"
  #include "socket-util.h"
  #include "timeval.h"
  #include "vlog.h"
@@ -248,6 +249,8 @@ poll_block(void)
  
      /* Handle any pending signals before doing anything else. */
      fatal_signal_run();
+
+    seq_woke();
  }
  \f
  static void
diff --git a/lib/seq.c b/lib/seq.c

new file mode 100644 (file)

index 0000000..36e5065
--- /dev/null
+++ b/lib/seq.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "seq.h"
+
+#include <stdbool.h>
+
+#include "hash.h"
+#include "hmap.h"
+#include "latch.h"
+#include "list.h"
+#include "ovs-thread.h"
+#include "poll-loop.h"
+
+/* A sequence number object. */
+struct seq {
+    uint64_t value OVS_GUARDED;
+    struct hmap waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. */
+};
+
+/* A thread waiting on a particular seq. */
+struct seq_waiter {
+    struct seq *seq OVS_GUARDED;            /* Seq being waited for. */
+    struct hmap_node hmap_node OVS_GUARDED; /* In 'seq->waiters'. */
+    unsigned int ovsthread_id OVS_GUARDED;  /* Key in 'waiters' hmap. */
+
+    struct seq_thread *thread OVS_GUARDED; /* Thread preparing to wait. */
+    struct list list_node OVS_GUARDED;     /* In 'thread->waiters'. */
+
+    uint64_t value OVS_GUARDED; /* seq->value we're waiting to change. */
+};
+
+/* A thread that might be waiting on one or more seqs. */
+struct seq_thread {
+    struct list waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. */
+    struct latch latch OVS_GUARDED;  /* Wakeup latch for this thread. */
+    bool waiting OVS_GUARDED;        /* True if latch_wait() already called. */
+};
+
+static struct ovs_mutex seq_mutex = OVS_ADAPTIVE_MUTEX_INITIALIZER;
+
+static uint64_t seq_next OVS_GUARDED_BY(seq_mutex) = 1;
+
+static pthread_key_t seq_thread_key;
+
+static void seq_init(void);
+static struct seq_thread *seq_thread_get(void) OVS_REQUIRES(seq_mutex);
+static void seq_thread_exit(void *thread_) OVS_EXCLUDED(seq_mutex);
+static void seq_thread_woke(struct seq_thread *) OVS_REQUIRES(seq_mutex);
+static void seq_waiter_destroy(struct seq_waiter *) OVS_REQUIRES(seq_mutex);
+static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
+
+/* Creates and returns a new 'seq' object. */
+struct seq * OVS_EXCLUDED(seq_mutex)
+seq_create(void)
+{
+    struct seq *seq;
+
+    seq_init();
+
+    seq = xmalloc(sizeof *seq);
+    ovs_mutex_lock(&seq_mutex);
+    seq->value = seq_next++;
+    hmap_init(&seq->waiters);
+    ovs_mutex_unlock(&seq_mutex);
+
+    return seq;
+}
+
+/* Destroys 'seq', waking up threads that were waiting on it, if any. */
+void
+seq_destroy(struct seq *seq)
+     OVS_EXCLUDED(seq_mutex)
+{
+    ovs_mutex_lock(&seq_mutex);
+    seq_wake_waiters(seq);
+    hmap_destroy(&seq->waiters);
+    free(seq);
+    ovs_mutex_unlock(&seq_mutex);
+}
+
+/* Increments 'seq''s sequence number, waking up any threads that are waiting
+ * on 'seq'. */
+void
+seq_change(struct seq *seq)
+    OVS_EXCLUDED(seq_mutex)
+{
+    ovs_mutex_lock(&seq_mutex);
+    seq->value = seq_next++;
+    seq_wake_waiters(seq);
+    ovs_mutex_unlock(&seq_mutex);
+}
+
+/* Returns 'seq''s current sequence number (which could change immediately).
+ *
+ * seq_read() and seq_wait() can be used together to yield a race-free wakeup
+ * when an object changes, even without an ability to lock the object.  See
+ * Usage in seq.h for details. */
+uint64_t
+seq_read(const struct seq *seq)
+    OVS_EXCLUDED(seq_mutex)
+{
+    uint64_t value;
+
+    ovs_mutex_lock(&seq_mutex);
+    value = seq->value;
+    ovs_mutex_unlock(&seq_mutex);
+
+    return value;
+}
+
+/* Makes the calling thread a waiter for 'seq' to change from 'value'. */
+static void
+seq_wait__(struct seq *seq, uint64_t value)
+    OVS_REQUIRES(seq_mutex)
+{
+    unsigned int id = ovsthread_id_self();
+    uint32_t hash = hash_int(id, 0);
+    struct seq_waiter *waiter;
+
+    HMAP_FOR_EACH_IN_BUCKET (waiter, hmap_node, hash, &seq->waiters) {
+        if (waiter->ovsthread_id == id) {
+            if (waiter->value != value) {
+                /* The current value is different from the value we've already
+                 * waited for.  (If it is the same, nothing more to do.) */
+                poll_immediate_wake();
+            }
+            return;
+        }
+    }
+
+    waiter = xmalloc(sizeof *waiter);
+    waiter->seq = seq;
+    hmap_insert(&seq->waiters, &waiter->hmap_node, hash);
+    waiter->ovsthread_id = id;  /* Without this the lookup above reads junk. */
+    waiter->value = value;
+    waiter->thread = seq_thread_get();
+    list_push_back(&waiter->thread->waiters, &waiter->list_node);
+
+    if (!waiter->thread->waiting) {
+        latch_wait(&waiter->thread->latch);
+        waiter->thread->waiting = true;
+    }
+}
+
+/* Causes the following poll_block() to wake up when 'seq''s sequence number
+ * changes from 'value'.  (If 'seq''s sequence number isn't 'value', then
+ * poll_block() won't block at all.)
+ *
+ * seq_read() and seq_wait() can be used together to yield a race-free wakeup
+ * when an object changes, even without an ability to lock the object.  See
+ * Usage in seq.h for details. */
+void
+seq_wait(const struct seq *seq_, uint64_t value)
+    OVS_EXCLUDED(seq_mutex)
+{
+    struct seq *seq = CONST_CAST(struct seq *, seq_);
+
+    ovs_mutex_lock(&seq_mutex);
+    if (value == seq->value) {
+        seq_wait__(seq, value);
+    } else {
+        poll_immediate_wake();
+    }
+    ovs_mutex_unlock(&seq_mutex);
+}
+
+/* Called by poll_block() just before it returns, this function destroys any
+ * seq_waiter objects associated with the current thread. */
+void
+seq_woke(void)
+    OVS_EXCLUDED(seq_mutex)
+{
+    struct seq_thread *thread;
+
+    seq_init();
+
+    thread = pthread_getspecific(seq_thread_key);
+    if (thread) {
+        ovs_mutex_lock(&seq_mutex);
+        seq_thread_woke(thread);
+        thread->waiting = false;
+        ovs_mutex_unlock(&seq_mutex);
+    }
+}
+\f
+static void
+seq_init(void)
+{
+    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
+
+    if (ovsthread_once_start(&once)) {
+        xpthread_key_create(&seq_thread_key, seq_thread_exit);
+        ovsthread_once_done(&once);
+    }
+}
+
+static struct seq_thread *
+seq_thread_get(void)
+    OVS_REQUIRES(seq_mutex)
+{
+    struct seq_thread *thread = pthread_getspecific(seq_thread_key);
+    if (!thread) {
+        thread = xmalloc(sizeof *thread);
+        list_init(&thread->waiters);
+        latch_init(&thread->latch);
+        thread->waiting = false;
+
+        xpthread_setspecific(seq_thread_key, thread);
+    }
+    return thread;
+}
+
+static void
+seq_thread_exit(void *thread_)
+    OVS_EXCLUDED(seq_mutex)
+{
+    struct seq_thread *thread = thread_;
+
+    ovs_mutex_lock(&seq_mutex);
+    seq_thread_woke(thread);
+    latch_destroy(&thread->latch);
+    free(thread);
+    ovs_mutex_unlock(&seq_mutex);
+}
+
+static void
+seq_thread_woke(struct seq_thread *thread)
+    OVS_REQUIRES(seq_mutex)
+{
+    struct seq_waiter *waiter, *next_waiter;
+
+    LIST_FOR_EACH_SAFE (waiter, next_waiter, list_node, &thread->waiters) {
+        ovs_assert(waiter->thread == thread);
+        seq_waiter_destroy(waiter);
+    }
+    latch_poll(&thread->latch);
+}
+
+static void
+seq_waiter_destroy(struct seq_waiter *waiter)
+    OVS_REQUIRES(seq_mutex)
+{
+    hmap_remove(&waiter->seq->waiters, &waiter->hmap_node);
+    list_remove(&waiter->list_node);
+    free(waiter);
+}
+
+static void
+seq_wake_waiters(struct seq *seq)
+    OVS_REQUIRES(seq_mutex)
+{
+    struct seq_waiter *waiter, *next_waiter;
+
+    HMAP_FOR_EACH_SAFE (waiter, next_waiter, hmap_node, &seq->waiters) {
+        latch_set(&waiter->thread->latch);
+        seq_waiter_destroy(waiter);
+    }
+}
diff --git a/lib/seq.h b/lib/seq.h

new file mode 100644 (file)

index 0000000..c764809
--- /dev/null
+++ b/lib/seq.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SEQ_H
+#define SEQ_H 1
+
+/* Thread-safe, pollable sequence number.
+ *
+ *
+ * Motivation
+ * ==========
+ *
+ * It is sometimes desirable to take an action whenever an object changes.
+ * Suppose we associate a sequence number with an object and increment the
+ * sequence number whenever we change the object.  An observer can then record
+ * the sequence number it sees.  Later on, if the current sequence number
+ * differs from the one it saw last, then the observer knows to examine the
+ * object for changes.
+ *
+ * Code that wants to run when a sequence number changes is challenging to
+ * implement in a multithreaded environment.  A naive implementation, that
+ * simply checks whether the sequence number changed and, if so, calls
+ * poll_immediate_wake(), will fail when another thread increments the sequence
+ * number after the check (including during poll_block()).
+ *
+ * struct seq is a solution.  It implements a sequence number along with enough
+ * internal infrastructure so that a thread waiting on a particular value will
+ * wake up if the sequence number changes, or even if the "struct seq" is
+ * destroyed.
+ *
+ *
+ * Usage
+ * =====
+ *
+ * The object that includes a sequence number should use seq_create() and
+ * seq_destroy() at creation and destruction, and seq_change() whenever the
+ * object's observable state changes.
+ *
+ * An observer may seq_read() to read the current sequence number and
+ * seq_wait() to cause poll_block() to wake up when the sequence number changes
+ * from a specified value.
+ *
+ * To avoid races, observers should use seq_read() to check for changes,
+ * process any changes, and then use seq_wait() to wait for a change from the
+ * previously read value.  That is, a correct usage looks something like this:
+ *
+ *    new_seq = seq_read(seq);
+ *    if (new_seq != last_seq) {
+ *        ...process changes...
+ *        last_seq = new_seq;
+ *    }
+ *    seq_wait(seq, new_seq);
+ *    poll_block();
+ *
+ *
+ * Alternate Usage
+ * ===============
+ *
+ * struct seq can also be used as a sort of pollable condition variable.
+ * Suppose that we want a thread to process items in a queue, and thus to be
+ * able to wake up whenever the queue is nonempty.  This requires a lock to
+ * protect the queue and a seq to signal that the queue has become nonempty,
+ * e.g.:
+ *
+ *    struct ovs_mutex mutex;
+ *    struct list queue OVS_GUARDED_BY(mutex);
+ *    struct seq *nonempty_seq;
+ *
+ * To add an element to the queue:
+ *
+ *    ovs_mutex_lock(&mutex);
+ *    list_push_back(&queue, ...element...);
+ *    if (list_is_singleton(&queue)) {   // The 'if' test here is optional.
+ *        seq_change(nonempty_seq);
+ *    }
+ *    ovs_mutex_unlock(&mutex);
+ *
+ * To wait for the queue to become nonempty:
+ *
+ *    ovs_mutex_lock(&mutex);
+ *    if (list_is_empty(&queue)) {
+ *        seq_wait(nonempty_seq, seq_read(nonempty_seq));
+ *    } else {
+ *        poll_immediate_wake();
+ *    }
+ *    ovs_mutex_unlock(&mutex);
+ *
+ * (In the above code 'mutex' prevents the queue from changing between
+ * seq_read() and seq_wait().  Otherwise, it would be necessary to seq_read(),
+ * check for a nonempty queue, and then seq_wait() on the previously read
+ * sequence number, as under Usage above.)
+ *
+ *
+ * Thread-safety
+ * =============
+ *
+ * Fully thread safe.
+ */
+
+#include <stdint.h>
+
+/* For implementation of an object with a sequence number attached. */
+struct seq *seq_create(void);
+void seq_destroy(struct seq *);
+void seq_change(struct seq *);
+
+/* For observers. */
+uint64_t seq_read(const struct seq *);
+void seq_wait(const struct seq *, uint64_t value);
+
+/* For poll_block() internal use. */
+void seq_woke(void);
+
+#endif /* seq.h */
diff --git a/ofproto/automake.mk b/ofproto/automake.mk

index af9a12a..47ca1b8 100644 (file)
--- a/ofproto/automake.mk
+++ b/ofproto/automake.mk
@@ -30,6 +30,8 @@ ofproto_libofproto_a_SOURCES = \
         ofproto/ofproto-dpif-mirror.h \
         ofproto/ofproto-dpif-sflow.c \
         ofproto/ofproto-dpif-sflow.h \
+       ofproto/ofproto-dpif-upcall.c \
+       ofproto/ofproto-dpif-upcall.h \
         ofproto/ofproto-dpif-xlate.c \
         ofproto/ofproto-dpif-xlate.h \
         ofproto/ofproto-provider.h \
diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c

new file mode 100644 (file)

index 0000000..ff9b2d5
--- /dev/null
+++ b/ofproto/ofproto-dpif-upcall.c
@@ -0,0 +1,831 @@
+/* Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.  */
+
+#include <config.h>
+#include "ofproto-dpif-upcall.h"
+
+#include <errno.h>
+#include <stdbool.h>
+#include <inttypes.h>
+
+#include "coverage.h"
+#include "dynamic-string.h"
+#include "dpif.h"
+#include "fail-open.h"
+#include "latch.h"
+#include "seq.h"
+#include "list.h"
+#include "netlink.h"
+#include "ofpbuf.h"
+#include "ofproto-dpif.h"
+#include "packets.h"
+#include "poll-loop.h"
+#include "vlog.h"
+
+#define MAX_QUEUE_LENGTH 512
+
+VLOG_DEFINE_THIS_MODULE(ofproto_dpif_upcall);
+
+COVERAGE_DEFINE(upcall_queue_overflow);
+COVERAGE_DEFINE(drop_queue_overflow);
+COVERAGE_DEFINE(miss_queue_overflow);
+COVERAGE_DEFINE(fmb_queue_overflow);
+
+/* A thread that processes each upcall handed to it by the dispatcher thread,
+ * forwards the upcall's packet, and then queues it to the main ofproto_dpif
+ * to possibly set up a kernel flow as a cache.
+ *
+ * The dispatcher (recv_upcalls()) appends miss upcalls to 'upcalls' and
+ * signals 'wake_cond'; the handler thread (udpif_miss_handler()) removes them
+ * in batches of at most FLOW_MISS_MAX_BATCH. */
+struct handler {
+    struct udpif *udpif;               /* Parent udpif. */
+    pthread_t thread;                  /* Thread ID. */
+
+    struct ovs_mutex mutex;            /* Mutex guarding the following. */
+
+    /* Queue of unprocessed miss upcalls (struct upcall, linked through its
+     * 'list_node').  The dispatcher drops new upcalls instead of queuing them
+     * once 'n_upcalls' reaches MAX_QUEUE_LENGTH. */
+    struct list upcalls OVS_GUARDED;
+    size_t n_upcalls OVS_GUARDED;      /* Number of elements in 'upcalls'. */
+
+    pthread_cond_t wake_cond;          /* Wakes 'thread' while holding
+                                          'mutex'. */
+};
+
+/* An upcall handler for ofproto_dpif.
+ *
+ * udpif is implemented as a "dispatcher" thread that reads upcalls from the
+ * kernel.  It processes each upcall just enough to figure out its next
+ * destination.  For a "miss" upcall (MISS_UPCALL), this is one of several
+ * "handler" threads (see struct handler).  Other upcalls are queued to the
+ * main ofproto_dpif.
+ *
+ * Three queues carry results back to the main thread, each protected by its
+ * own mutex and each capped at MAX_QUEUE_LENGTH entries by its producer.
+ * Whenever a producer adds to one of them it also calls seq_change() on
+ * 'wait_seq' so that udpif_wait() wakes the main thread. */
+struct udpif {
+    struct dpif *dpif;                 /* Datapath handle. */
+    struct dpif_backer *backer;        /* Opaque dpif_backer pointer. */
+
+    uint32_t secret;                   /* Random seed for upcall hash. */
+
+    pthread_t dispatcher;              /* Dispatcher thread ID. */
+
+    struct handler *handlers;          /* Miss handlers. */
+    size_t n_handlers;                 /* Number of elements in 'handlers'. */
+
+    /* Queue of unprocessed drop keys.  Filled by handle_miss_upcalls(),
+     * drained by drop_key_next() and udpif_drop_key_clear(). */
+    struct ovs_mutex drop_key_mutex;
+    struct list drop_keys OVS_GUARDED;
+    size_t n_drop_keys OVS_GUARDED;
+
+    /* Queue of special (non-miss) upcalls for ofproto-dpif to process.
+     * Filled by recv_upcalls(), drained by upcall_next(). */
+    struct ovs_mutex upcall_mutex;
+    struct list upcalls OVS_GUARDED;
+    size_t n_upcalls OVS_GUARDED;
+
+    /* Queue of flow_miss_batches.  Filled by handle_miss_upcalls(), drained
+     * by flow_miss_batch_next() and flushed by udpif_revalidate(). */
+    struct ovs_mutex fmb_mutex;
+    struct list fmbs OVS_GUARDED;
+    size_t n_fmbs OVS_GUARDED;
+
+    /* Number of times udpif_revalidate() has been called.  Handlers sample
+     * it before and after translating a batch and discard the batch if it
+     * changed in between. */
+    atomic_uint reval_seq;
+
+    struct seq *wait_seq;    /* Changed each time something is queued above. */
+    uint64_t last_seq;       /* 'wait_seq' value sampled by udpif_run(). */
+
+    struct latch exit_latch; /* Tells child threads to exit. */
+};
+
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+
+static void recv_upcalls(struct udpif *);
+static void handle_miss_upcalls(struct udpif *, struct list *upcalls);
+static void miss_destroy(struct flow_miss *);
+static void *udpif_dispatcher(void *);
+static void *udpif_miss_handler(void *);
+
+/* Allocates and returns a new udpif that will read upcalls from 'dpif' on
+ * behalf of 'backer'.  No threads are started here; udpif_recv_set() does
+ * that.  The caller releases the result with udpif_destroy(). */
+struct udpif *
+udpif_create(struct dpif_backer *backer, struct dpif *dpif)
+{
+    struct udpif *new_udpif = xzalloc(sizeof *new_udpif);
+
+    new_udpif->backer = backer;
+    new_udpif->dpif = dpif;
+    new_udpif->secret = random_uint32();
+
+    /* Wakeup and shutdown machinery. */
+    new_udpif->wait_seq = seq_create();
+    latch_init(&new_udpif->exit_latch);
+    atomic_init(&new_udpif->reval_seq, 0);
+
+    /* Queues handed to the main thread, each paired with its mutex. */
+    list_init(&new_udpif->drop_keys);
+    ovs_mutex_init(&new_udpif->drop_key_mutex, PTHREAD_MUTEX_NORMAL);
+
+    list_init(&new_udpif->upcalls);
+    ovs_mutex_init(&new_udpif->upcall_mutex, PTHREAD_MUTEX_NORMAL);
+
+    list_init(&new_udpif->fmbs);
+    ovs_mutex_init(&new_udpif->fmb_mutex, PTHREAD_MUTEX_NORMAL);
+
+    return new_udpif;
+}
+
+/* Stops all of 'udpif''s threads, discards everything still waiting in its
+ * three output queues, and frees 'udpif' itself. */
+void
+udpif_destroy(struct udpif *udpif)
+{
+    /* Shut the dispatcher and handlers down first so that nothing refills
+     * the queues while they are being emptied. */
+    udpif_recv_set(udpif, 0, false);
+
+    for (;;) {
+        struct drop_key *drop_key = drop_key_next(udpif);
+
+        if (!drop_key) {
+            break;
+        }
+        drop_key_destroy(drop_key);
+    }
+
+    for (;;) {
+        struct upcall *upcall = upcall_next(udpif);
+
+        if (!upcall) {
+            break;
+        }
+        upcall_destroy(upcall);
+    }
+
+    for (;;) {
+        struct flow_miss_batch *fmb = flow_miss_batch_next(udpif);
+
+        if (!fmb) {
+            break;
+        }
+        flow_miss_batch_destroy(fmb);
+    }
+
+    ovs_mutex_destroy(&udpif->drop_key_mutex);
+    ovs_mutex_destroy(&udpif->upcall_mutex);
+    ovs_mutex_destroy(&udpif->fmb_mutex);
+    latch_destroy(&udpif->exit_latch);
+    seq_destroy(udpif->wait_seq);
+    free(udpif);
+}
+
+/* Tells 'udpif' to begin or stop handling flow misses depending on the value
+ * of 'enable'.  'n_handlers' is the number of miss_handler threads to create.
+ * Passing 'n_handlers' as zero is equivalent to passing 'enable' as false.
+ *
+ * At most 64 handler threads are created; larger requests are clamped.  If
+ * threads are already running and the (clamped) count differs from the
+ * current one, the dispatcher and every handler are stopped and joined, and
+ * any miss upcalls still queued to the handlers are destroyed, before the new
+ * set is started.  If the count is unchanged, nothing happens. */
+void
+udpif_recv_set(struct udpif *udpif, size_t n_handlers, bool enable)
+{
+    n_handlers = enable ? n_handlers : 0;
+    n_handlers = MIN(n_handlers, 64);
+
+    /* Stop the old threads (if any). */
+    if (udpif->handlers && udpif->n_handlers != n_handlers) {
+        size_t i;
+
+        latch_set(&udpif->exit_latch);
+
+        /* Wake the handlers so they can exit.  The dispatcher needs no
+         * explicit wakeup: it waits on the latch itself (see
+         * udpif_dispatcher()). */
+        for (i = 0; i < udpif->n_handlers; i++) {
+            struct handler *handler = &udpif->handlers[i];
+
+            ovs_mutex_lock(&handler->mutex);
+            xpthread_cond_signal(&handler->wake_cond);
+            ovs_mutex_unlock(&handler->mutex);
+        }
+
+        /* Join the dispatcher first so that nothing can add to the handler
+         * queues while they are being torn down below. */
+        xpthread_join(udpif->dispatcher, NULL);
+        for (i = 0; i < udpif->n_handlers; i++) {
+            struct handler *handler = &udpif->handlers[i];
+            struct upcall *miss, *next;
+
+            xpthread_join(handler->thread, NULL);
+
+            /* Discard whatever the handler did not get around to. */
+            ovs_mutex_lock(&handler->mutex);
+            LIST_FOR_EACH_SAFE (miss, next, list_node, &handler->upcalls) {
+                list_remove(&miss->list_node);
+                upcall_destroy(miss);
+            }
+            ovs_mutex_unlock(&handler->mutex);
+            ovs_mutex_destroy(&handler->mutex);
+
+            xpthread_cond_destroy(&handler->wake_cond);
+        }
+        /* NOTE(review): presumably this clears the latch so that threads
+         * started below do not exit immediately -- confirm against latch.h. */
+        latch_poll(&udpif->exit_latch);
+
+        free(udpif->handlers);
+        udpif->handlers = NULL;
+        udpif->n_handlers = 0;
+    }
+
+    /* Start new threads (if necessary). */
+    if (!udpif->handlers && n_handlers) {
+        size_t i;
+
+        udpif->n_handlers = n_handlers;
+        udpif->handlers = xzalloc(udpif->n_handlers * sizeof *udpif->handlers);
+        for (i = 0; i < udpif->n_handlers; i++) {
+            struct handler *handler = &udpif->handlers[i];
+
+            /* Fully initialize the handler before its thread can see it. */
+            handler->udpif = udpif;
+            list_init(&handler->upcalls);
+            xpthread_cond_init(&handler->wake_cond, NULL);
+            ovs_mutex_init(&handler->mutex, PTHREAD_MUTEX_NORMAL);
+            xpthread_create(&handler->thread, NULL, udpif_miss_handler, handler);
+        }
+        /* The dispatcher goes last: it indexes 'handlers' as soon as it
+         * receives a miss upcall. */
+        xpthread_create(&udpif->dispatcher, NULL, udpif_dispatcher, udpif);
+    }
+}
+
+/* Samples 'udpif''s wait sequence number.  The following udpif_wait() uses
+ * the sampled value, so it wakes up for any change made after this call. */
+void
+udpif_run(struct udpif *udpif)
+{
+    uint64_t current_seq = seq_read(udpif->wait_seq);
+
+    udpif->last_seq = current_seq;
+}
+
+/* Arranges for the next poll_block() to wake up immediately if any of
+ * 'udpif''s output queues is nonempty, and otherwise as soon as the wait
+ * sequence number moves past the value sampled by udpif_run(). */
+void
+udpif_wait(struct udpif *udpif)
+{
+    bool pending;
+
+    ovs_mutex_lock(&udpif->drop_key_mutex);
+    pending = udpif->n_drop_keys != 0;
+    ovs_mutex_unlock(&udpif->drop_key_mutex);
+    if (pending) {
+        poll_immediate_wake();
+    }
+
+    ovs_mutex_lock(&udpif->upcall_mutex);
+    pending = udpif->n_upcalls != 0;
+    ovs_mutex_unlock(&udpif->upcall_mutex);
+    if (pending) {
+        poll_immediate_wake();
+    }
+
+    ovs_mutex_lock(&udpif->fmb_mutex);
+    pending = udpif->n_fmbs != 0;
+    ovs_mutex_unlock(&udpif->fmb_mutex);
+    if (pending) {
+        poll_immediate_wake();
+    }
+
+    seq_wait(udpif->wait_seq, udpif->last_seq);
+}
+
+/* Tells 'udpif' that earlier xlate_actions() results may no longer be valid.
+ *
+ * Bumps the revalidation sequence number, throws away every queued
+ * flow_miss_batch, and clears the pending drop keys.  Handlers that are in
+ * the middle of a batch notice the sequence change and discard their own
+ * result (see handle_miss_upcalls()). */
+void
+udpif_revalidate(struct udpif *udpif)
+{
+    unsigned int unused;
+
+    /* Because each discarded miss never reaches the upper layer, its
+     * statistics are not credited to the corresponding 'facet'.  That is
+     * usually fine since the stats were already pushed to the relevant
+     * rules, but NetFlow wants absolute per-'facet' packet counts, which
+     * may now be off. */
+    ovs_mutex_lock(&udpif->fmb_mutex);
+    atomic_add(&udpif->reval_seq, 1, &unused);
+    while (!list_is_empty(&udpif->fmbs)) {
+        struct list *node = list_pop_front(&udpif->fmbs);
+
+        flow_miss_batch_destroy(CONTAINER_OF(node, struct flow_miss_batch,
+                                             list_node));
+        udpif->n_fmbs--;
+    }
+    ovs_mutex_unlock(&udpif->fmb_mutex);
+
+    udpif_drop_key_clear(udpif);
+}
+
+/* Retrieves the next upcall that ofproto-dpif is responsible for handling, or
+ * NULL if none is queued.  Ownership passes to the caller, who must dispose
+ * of the upcall with upcall_destroy(). */
+struct upcall *
+upcall_next(struct udpif *udpif)
+{
+    struct upcall *upcall;
+
+    ovs_mutex_lock(&udpif->upcall_mutex);
+    if (!udpif->n_upcalls) {
+        upcall = NULL;
+    } else {
+        struct list *node;
+
+        udpif->n_upcalls--;
+        node = list_pop_front(&udpif->upcalls);
+        upcall = CONTAINER_OF(node, struct upcall, list_node);
+    }
+    ovs_mutex_unlock(&udpif->upcall_mutex);
+
+    return upcall;
+}
+
+/* Releases the buffer backing 'upcall' and then 'upcall' itself.  Passing
+ * NULL is a no-op. */
+void
+upcall_destroy(struct upcall *upcall)
+{
+    if (!upcall) {
+        return;
+    }
+
+    ofpbuf_uninit(&upcall->upcall_buf);
+    free(upcall);
+}
+
+/* Retrieves the next batch of processed flow misses queued on 'udpif', or
+ * NULL if there is none.  Ownership passes to the caller, who must dispose
+ * of the batch with flow_miss_batch_destroy(). */
+struct flow_miss_batch *
+flow_miss_batch_next(struct udpif *udpif)
+{
+    struct flow_miss_batch *fmb;
+
+    ovs_mutex_lock(&udpif->fmb_mutex);
+    if (!udpif->n_fmbs) {
+        fmb = NULL;
+    } else {
+        struct list *node;
+
+        udpif->n_fmbs--;
+        node = list_pop_front(&udpif->fmbs);
+        fmb = CONTAINER_OF(node, struct flow_miss_batch, list_node);
+    }
+    ovs_mutex_unlock(&udpif->fmb_mutex);
+
+    return fmb;
+}
+
+/* Destroys every flow_miss in 'fmb', then frees 'fmb' itself.  Passing NULL
+ * is a no-op. */
+void
+flow_miss_batch_destroy(struct flow_miss_batch *fmb)
+{
+    if (fmb) {
+        struct flow_miss *cur, *following;
+
+        HMAP_FOR_EACH_SAFE (cur, following, hmap_node, &fmb->misses) {
+            hmap_remove(&fmb->misses, &cur->hmap_node);
+            miss_destroy(cur);
+        }
+        hmap_destroy(&fmb->misses);
+
+        free(fmb);
+    }
+}
+
+/* Retrieves the next drop key that ofproto-dpif needs to process, or NULL if
+ * none is queued.  Ownership passes to the caller, who must dispose of it
+ * with drop_key_destroy(). */
+struct drop_key *
+drop_key_next(struct udpif *udpif)
+{
+    struct drop_key *drop_key;
+
+    ovs_mutex_lock(&udpif->drop_key_mutex);
+    if (!udpif->n_drop_keys) {
+        drop_key = NULL;
+    } else {
+        struct list *node;
+
+        udpif->n_drop_keys--;
+        node = list_pop_front(&udpif->drop_keys);
+        drop_key = CONTAINER_OF(node, struct drop_key, list_node);
+    }
+    ovs_mutex_unlock(&udpif->drop_key_mutex);
+
+    return drop_key;
+}
+
+/* Frees 'drop_key' together with the flow key it owns.  Passing NULL is a
+ * no-op. */
+void
+drop_key_destroy(struct drop_key *drop_key)
+{
+    if (!drop_key) {
+        return;
+    }
+
+    free(drop_key->key);
+    free(drop_key);
+}
+
+/* Discards every drop key that is still waiting to be returned by
+ * drop_key_next(). */
+void
+udpif_drop_key_clear(struct udpif *udpif)
+{
+    ovs_mutex_lock(&udpif->drop_key_mutex);
+    while (!list_is_empty(&udpif->drop_keys)) {
+        struct list *node = list_pop_front(&udpif->drop_keys);
+
+        drop_key_destroy(CONTAINER_OF(node, struct drop_key, list_node));
+        udpif->n_drop_keys--;
+    }
+    ovs_mutex_unlock(&udpif->drop_key_mutex);
+}
+\f
+/* Thread body for the dispatcher.  It receives upcalls from the kernel, hands
+ * miss upcalls to a miss_handler thread, and passes the more complex ones to
+ * ofproto-dpif directly.  'arg' is the owning struct udpif.  The loop ends
+ * once the udpif's exit latch is set. */
+static void *
+udpif_dispatcher(void *arg)
+{
+    struct udpif *udpif = arg;
+
+    set_subprogram_name("dispatcher");
+    for (;;) {
+        if (latch_is_set(&udpif->exit_latch)) {
+            break;
+        }
+
+        recv_upcalls(udpif);
+
+        /* Sleep until there is more to receive or we are asked to exit. */
+        dpif_recv_wait(udpif->dpif);
+        latch_wait(&udpif->exit_latch);
+        poll_block();
+    }
+
+    return NULL;
+}
+
+/* The miss handler thread is responsible for processing miss upcalls retrieved
+ * by the dispatcher thread.  Once finished it passes the processed miss
+ * upcalls to ofproto-dpif where they're installed in the datapath.
+ *
+ * 'arg' is the struct handler this thread serves.  The thread sleeps on the
+ * handler's condition variable until upcalls are queued or the udpif's exit
+ * latch is set, then handles up to FLOW_MISS_MAX_BATCH upcalls at a time.
+ * Upcalls that are still queued when the thread exits are left for
+ * udpif_recv_set() to destroy. */
+static void *
+udpif_miss_handler(void *arg)
+{
+    struct list misses = LIST_INITIALIZER(&misses);
+    struct handler *handler = arg;
+
+    set_subprogram_name("miss_handler");
+    for (;;) {
+        size_t i;
+
+        ovs_mutex_lock(&handler->mutex);
+
+        /* Wait in a predicate loop: a condition variable may wake up
+         * spuriously, and the shutdown path signals it with nothing queued.
+         * Without the loop we would hand an empty list to
+         * handle_miss_upcalls(), which would allocate and queue an empty
+         * batch to the main thread. */
+        while (!handler->n_upcalls
+               && !latch_is_set(&handler->udpif->exit_latch)) {
+            ovs_mutex_cond_wait(&handler->wake_cond, &handler->mutex);
+        }
+
+        if (latch_is_set(&handler->udpif->exit_latch)) {
+            ovs_mutex_unlock(&handler->mutex);
+            return NULL;
+        }
+
+        /* Move one batch onto the private 'misses' list so that the mutex is
+         * not held while the upcalls are translated. */
+        for (i = 0; i < FLOW_MISS_MAX_BATCH; i++) {
+            if (handler->n_upcalls) {
+                handler->n_upcalls--;
+                list_push_back(&misses, list_pop_front(&handler->upcalls));
+            } else {
+                break;
+            }
+        }
+        ovs_mutex_unlock(&handler->mutex);
+
+        /* Leaves 'misses' empty again: every upcall is either attached to a
+         * flow_miss or destroyed. */
+        handle_miss_upcalls(handler->udpif, &misses);
+    }
+}
+\f
+/* Destroys the upcalls owned by 'miss' and releases its translation output.
+ * 'miss' itself is not freed; it lives inside its batch's 'miss_buf'. */
+static void
+miss_destroy(struct flow_miss *miss)
+{
+    while (!list_is_empty(&miss->upcalls)) {
+        struct list *node = list_pop_front(&miss->upcalls);
+
+        upcall_destroy(CONTAINER_OF(node, struct upcall, list_node));
+    }
+
+    xlate_out_uninit(&miss->xout);
+}
+
+/* Decides what kind of upcall 'upcall' is.
+ *
+ * A datapath "miss" is always MISS_UPCALL.  A datapath "action" upcall is
+ * classified by the user action cookie in its userdata, where both the
+ * cookie's type and its exact length must match one of the known cookie
+ * layouts.  Anything else is logged (rate-limited) and reported as
+ * BAD_UPCALL. */
+static enum upcall_type
+classify_upcall(const struct upcall *upcall)
+{
+    const struct dpif_upcall *dupcall = &upcall->dpif_upcall;
+    union user_action_cookie cookie;
+    size_t len;
+
+    /* The datapath-level type settles everything except "action". */
+    if (dupcall->type == DPIF_UC_MISS) {
+        return MISS_UPCALL;
+    }
+    if (dupcall->type != DPIF_UC_ACTION) {
+        VLOG_WARN_RL(&rl, "upcall has unexpected type %"PRIu32,
+                     dupcall->type);
+        return BAD_UPCALL;
+    }
+
+    /* "action" upcalls carry a cookie that says what they are for. */
+    if (!dupcall->userdata) {
+        VLOG_WARN_RL(&rl, "action upcall missing cookie");
+        return BAD_UPCALL;
+    }
+
+    len = nl_attr_get_size(dupcall->userdata);
+    if (len > sizeof cookie || len < sizeof cookie.type) {
+        VLOG_WARN_RL(&rl, "action upcall cookie has unexpected size %zu",
+                     len);
+        return BAD_UPCALL;
+    }
+
+    /* Copy into a zeroed cookie so that a short payload leaves the unused
+     * tail of the union cleared. */
+    memset(&cookie, 0, sizeof cookie);
+    memcpy(&cookie, nl_attr_get(dupcall->userdata), len);
+
+    if (cookie.type == USER_ACTION_COOKIE_SFLOW
+        && len == sizeof cookie.sflow) {
+        return SFLOW_UPCALL;
+    }
+    if (cookie.type == USER_ACTION_COOKIE_SLOW_PATH
+        && len == sizeof cookie.slow_path) {
+        return MISS_UPCALL;
+    }
+    if (cookie.type == USER_ACTION_COOKIE_FLOW_SAMPLE
+        && len == sizeof cookie.flow_sample) {
+        return FLOW_SAMPLE_UPCALL;
+    }
+    if (cookie.type == USER_ACTION_COOKIE_IPFIX
+        && len == sizeof cookie.ipfix) {
+        return IPFIX_UPCALL;
+    }
+
+    VLOG_WARN_RL(&rl, "invalid user cookie of type %"PRIu16
+                 " and size %zu", cookie.type, len);
+    return BAD_UPCALL;
+}
+
+/* Receives upcalls from 'udpif''s datapath until dpif_recv() returns an
+ * error, classifies each one, and passes it on:
+ *
+ *    - BAD_UPCALL: destroyed.
+ *
+ *    - MISS_UPCALL: queued to one of the handler threads.  The handler is
+ *      chosen by hashing the in_port, TCP and UDP key attributes, seeded
+ *      with 'udpif->secret'.
+ *
+ *    - Anything else: queued to 'udpif->upcalls' for the main thread, which
+ *      is woken through 'udpif->wait_seq'.
+ *
+ * Either queue drops (and counts in a coverage counter) new upcalls once it
+ * holds MAX_QUEUE_LENGTH entries. */
+static void
+recv_upcalls(struct udpif *udpif)
+{
+    /* Named so that it does not shadow the file-level 'rl'. */
+    static struct vlog_rate_limit enqueue_rl = VLOG_RATE_LIMIT_INIT(60, 60);
+
+    for (;;) {
+        struct upcall *upcall;
+        int error;
+
+        upcall = xmalloc(sizeof *upcall);
+        ofpbuf_use_stub(&upcall->upcall_buf, upcall->upcall_stub,
+                        sizeof upcall->upcall_stub);
+        error = dpif_recv(udpif->dpif, &upcall->dpif_upcall,
+                          &upcall->upcall_buf);
+        if (error) {
+            upcall_destroy(upcall);
+            break;
+        }
+
+        upcall->type = classify_upcall(upcall);
+        if (upcall->type == BAD_UPCALL) {
+            upcall_destroy(upcall);
+        } else if (upcall->type == MISS_UPCALL) {
+            struct dpif_upcall *dupcall = &upcall->dpif_upcall;
+            uint32_t hash = udpif->secret;
+            struct handler *handler;
+            struct nlattr *nla;
+            size_t n_bytes, left;
+
+            n_bytes = 0;
+            NL_ATTR_FOR_EACH (nla, left, dupcall->key, dupcall->key_len) {
+                enum ovs_key_attr type = nl_attr_type(nla);
+                if (type == OVS_KEY_ATTR_IN_PORT
+                    || type == OVS_KEY_ATTR_TCP
+                    || type == OVS_KEY_ATTR_UDP) {
+                    if (nl_attr_get_size(nla) == 4) {
+                        ovs_be32 attr = nl_attr_get_be32(nla);
+                        hash = mhash_add(hash, (OVS_FORCE uint32_t) attr);
+                        n_bytes += 4;
+                    } else {
+                        /* Rate-limited: this runs once per attribute of
+                         * every miss upcall, so a misbehaving datapath
+                         * could otherwise flood the log. */
+                        VLOG_WARN_RL(&rl,
+                                     "Netlink attribute with incorrect size.");
+                    }
+                }
+            }
+            hash = mhash_finish(hash, n_bytes);
+
+            handler = &udpif->handlers[hash % udpif->n_handlers];
+
+            ovs_mutex_lock(&handler->mutex);
+            if (handler->n_upcalls < MAX_QUEUE_LENGTH) {
+                /* Log while we still own 'upcall'.  Once it is on the queue
+                 * and 'mutex' is released, the handler thread may dequeue
+                 * and free it at any moment, so it must not be touched
+                 * afterward. */
+                if (!VLOG_DROP_DBG(&enqueue_rl)) {
+                    struct ds ds = DS_EMPTY_INITIALIZER;
+
+                    odp_flow_key_format(upcall->dpif_upcall.key,
+                                        upcall->dpif_upcall.key_len,
+                                        &ds);
+                    VLOG_DBG("dispatcher: miss enqueue (%s)", ds_cstr(&ds));
+                    ds_destroy(&ds);
+                }
+
+                list_push_back(&handler->upcalls, &upcall->list_node);
+                handler->n_upcalls++;
+                xpthread_cond_signal(&handler->wake_cond);
+                ovs_mutex_unlock(&handler->mutex);
+            } else {
+                ovs_mutex_unlock(&handler->mutex);
+                COVERAGE_INC(miss_queue_overflow);
+                upcall_destroy(upcall);
+            }
+        } else {
+            ovs_mutex_lock(&udpif->upcall_mutex);
+            if (udpif->n_upcalls < MAX_QUEUE_LENGTH) {
+                udpif->n_upcalls++;
+                list_push_back(&udpif->upcalls, &upcall->list_node);
+                ovs_mutex_unlock(&udpif->upcall_mutex);
+                seq_change(udpif->wait_seq);
+            } else {
+                ovs_mutex_unlock(&udpif->upcall_mutex);
+                COVERAGE_INC(upcall_queue_overflow);
+                upcall_destroy(upcall);
+            }
+        }
+    }
+}
+
+/* Searches 'todo' for a flow_miss that belongs to 'ofproto' and whose flow
+ * equals 'flow'.  'hash' must be the hash under which such a miss would have
+ * been inserted.  Returns the match, or NULL if there is none. */
+static struct flow_miss *
+flow_miss_find(struct hmap *todo, const struct ofproto_dpif *ofproto,
+               const struct flow *flow, uint32_t hash)
+{
+    struct flow_miss *candidate;
+
+    HMAP_FOR_EACH_WITH_HASH (candidate, hmap_node, hash, todo) {
+        if (candidate->ofproto != ofproto) {
+            continue;
+        }
+        if (flow_equal(&candidate->flow, flow)) {
+            return candidate;
+        }
+    }
+
+    return NULL;
+}
+
+/* Executes flow miss 'miss'.  May add any required datapath operations
+ * to 'ops', incrementing '*n_ops' for each new op.
+ *
+ * In order, this:
+ *
+ *    - Accumulates packet/byte/TCP-flag statistics over 'miss->packets' into
+ *      'miss->stats' and credits them to the rule the flow matches.
+ *
+ *    - Translates the rule's actions into 'miss->xout'.
+ *
+ *    - If the rule is the fail-open rule, sends each packet to the
+ *      controller as a packet-in.
+ *
+ *    - If the translation is marked slow, re-translates once per packet for
+ *      the side effects.
+ *
+ *    - If there are datapath actions, appends one DPIF_OP_EXECUTE per packet
+ *      to 'ops'.  The caller must make sure 'ops' has room for them. */
+static void
+execute_flow_miss(struct flow_miss *miss, struct dpif_op *ops, size_t *n_ops)
+{
+    struct ofproto_dpif *ofproto = miss->ofproto;
+    struct flow_wildcards wc;
+    struct rule_dpif *rule;
+    struct ofpbuf *packet;
+    struct xlate_in xin;
+
+    memset(&miss->stats, 0, sizeof miss->stats);
+    miss->stats.used = time_msec();
+    LIST_FOR_EACH (packet, list_node, &miss->packets) {
+        miss->stats.tcp_flags |= packet_get_tcp_flags(packet, &miss->flow);
+        miss->stats.n_bytes += packet->size;
+        miss->stats.n_packets++;
+    }
+
+    flow_wildcards_init_catchall(&wc);
+    rule_dpif_lookup(ofproto, &miss->flow, &wc, &rule);
+    rule_credit_stats(rule, &miss->stats);
+    xlate_in_init(&xin, ofproto, &miss->flow, rule, miss->stats.tcp_flags,
+                  NULL);
+    xin.may_learn = true;
+    xin.resubmit_stats = &miss->stats;
+    xlate_actions(&xin, &miss->xout);
+    flow_wildcards_or(&miss->xout.wc, &miss->xout.wc, &wc);
+
+    if (rule->up.cr.priority == FAIL_OPEN_PRIORITY) {
+        /* Extra-special case for fail-open mode.
+         *
+         * We are in fail-open mode and the packet matched the fail-open
+         * rule, but we are connected to a controller too.  We should send
+         * the packet up to the controller in the hope that it will try to
+         * set up a flow and thereby allow us to exit fail-open.
+         *
+         * See the top-level comment in fail-open.c for more information.
+         *
+         * This has to iterate over the packets explicitly.  After the
+         * statistics loop above has run to completion, 'packet' no longer
+         * points to a real ofpbuf (it is derived from the list head), so it
+         * must not be dereferenced outside a loop. */
+        LIST_FOR_EACH (packet, list_node, &miss->packets) {
+            struct ofputil_packet_in pin;
+
+            pin.packet = packet->data;
+            pin.packet_len = packet->size;
+            pin.reason = OFPR_NO_MATCH;
+            pin.controller_id = 0;
+            pin.table_id = 0;
+            pin.cookie = 0;
+            pin.send_len = 0; /* Not used for flow table misses. */
+            flow_get_metadata(&miss->flow, &pin.fmd);
+            ofproto_dpif_send_packet_in(ofproto, &pin);
+        }
+    }
+
+    if (miss->xout.slow) {
+        LIST_FOR_EACH (packet, list_node, &miss->packets) {
+            struct xlate_in side_xin;
+
+            xlate_in_init(&side_xin, miss->ofproto, &miss->flow, rule, 0,
+                          packet);
+            xlate_actions_for_side_effects(&side_xin);
+        }
+    }
+    rule_release(rule);
+
+    if (miss->xout.odp_actions.size) {
+        LIST_FOR_EACH (packet, list_node, &miss->packets) {
+            struct dpif_op *op = &ops[*n_ops];
+            struct dpif_execute *execute = &op->u.execute;
+
+            if (miss->flow.in_port.ofp_port
+                != vsp_realdev_to_vlandev(miss->ofproto,
+                                          miss->flow.in_port.ofp_port,
+                                          miss->flow.vlan_tci)) {
+                /* This packet was received on a VLAN splinter port.  We
+                 * added a VLAN to the packet to make the packet resemble
+                 * the flow, but the actions were composed assuming that
+                 * the packet contained no VLAN.  So, we must remove the
+                 * VLAN header from the packet before trying to execute the
+                 * actions. */
+                eth_pop_vlan(packet);
+            }
+
+            op->type = DPIF_OP_EXECUTE;
+            execute->key = miss->key;
+            execute->key_len = miss->key_len;
+            execute->packet = packet;
+            execute->actions = miss->xout.odp_actions.data;
+            execute->actions_len = miss->xout.odp_actions.size;
+
+            (*n_ops)++;
+        }
+    }
+}
+
+/* Processes the miss upcalls on 'upcalls' as one batch and leaves 'upcalls'
+ * empty.
+ *
+ * Upcalls with the same ofproto and flow are grouped into one flow_miss
+ * inside a newly allocated flow_miss_batch.  Each flow_miss is translated by
+ * execute_flow_miss(), the resulting packet executions are sent to the
+ * datapath in a single dpif_operate() call, and the batch is then queued on
+ * 'udpif->fmbs' for the main thread.  The batch is destroyed instead if
+ * udpif_revalidate() ran in the meantime or if the queue is full.
+ *
+ * Upcalls for which xlate_receive() fails are destroyed; for ENODEV a drop
+ * key is also queued on 'udpif->drop_keys'. */
+static void
+handle_miss_upcalls(struct udpif *udpif, struct list *upcalls)
+{
+    struct dpif_op *opsp[FLOW_MISS_MAX_BATCH];
+    struct dpif_op ops[FLOW_MISS_MAX_BATCH];
+    unsigned int old_reval_seq, new_reval_seq;
+    struct upcall *upcall, *next;
+    struct flow_miss_batch *fmb;
+    size_t n_upcalls, n_ops, i;
+    struct flow_miss *miss;
+
+    /* Sampled before any translation so that a concurrent
+     * udpif_revalidate() can be detected at the end. */
+    atomic_read(&udpif->reval_seq, &old_reval_seq);
+
+    /* Construct the to-do list.
+     *
+     * This just amounts to extracting the flow from each packet and sticking
+     * the packets that have the same flow in the same "flow_miss" structure so
+     * that we can process them together. */
+    fmb = xmalloc(sizeof *fmb);
+    hmap_init(&fmb->misses);
+    n_upcalls = 0;   /* Number of 'miss_buf' slots in use, not of upcalls. */
+    LIST_FOR_EACH_SAFE (upcall, next, list_node, upcalls) {
+        struct dpif_upcall *dupcall = &upcall->dpif_upcall;
+        /* Tentative slot (this declaration shadows the function-scope
+         * 'miss').  It is only committed, by incrementing 'n_upcalls', if no
+         * existing flow_miss matches; otherwise the next upcall reuses it. */
+        struct flow_miss *miss = &fmb->miss_buf[n_upcalls];
+        struct flow_miss *existing_miss;
+        struct ofproto_dpif *ofproto;
+        odp_port_t odp_in_port;
+        struct flow flow;
+        uint32_t hash;
+        int error;
+
+        error = xlate_receive(udpif->backer, dupcall->packet, dupcall->key,
+                              dupcall->key_len, &flow, &miss->key_fitness,
+                              &ofproto, &odp_in_port);
+
+        if (error == ENODEV) {
+            struct drop_key *drop_key;
+
+            /* Received packet on datapath port for which we couldn't
+             * associate an ofproto.  This can happen if a port is removed
+             * while traffic is being received.  Print a rate-limited message
+             * in case it happens frequently.  Install a drop flow so
+             * that future packets of the flow are inexpensively dropped
+             * in the kernel. */
+            VLOG_INFO_RL(&rl, "received packet on unassociated datapath port "
+                              "%"PRIu32, odp_in_port);
+
+            /* The key is copied because 'upcall', which owns the original,
+             * is destroyed below. */
+            drop_key = xmalloc(sizeof *drop_key);
+            drop_key->key = xmemdup(dupcall->key, dupcall->key_len);
+            drop_key->key_len = dupcall->key_len;
+
+            ovs_mutex_lock(&udpif->drop_key_mutex);
+            if (udpif->n_drop_keys < MAX_QUEUE_LENGTH) {
+                udpif->n_drop_keys++;
+                list_push_back(&udpif->drop_keys, &drop_key->list_node);
+                ovs_mutex_unlock(&udpif->drop_key_mutex);
+                seq_change(udpif->wait_seq);
+            } else {
+                ovs_mutex_unlock(&udpif->drop_key_mutex);
+                COVERAGE_INC(drop_queue_overflow);
+                drop_key_destroy(drop_key);
+            }
+            /* 'upcall' stays on 'upcalls' and is destroyed after the loop. */
+            continue;
+        } else if (error) {
+            /* Likewise left on 'upcalls' for the cleanup loop. */
+            continue;
+        }
+
+        flow_extract(dupcall->packet, flow.skb_priority, flow.pkt_mark,
+                     &flow.tunnel, &flow.in_port, &miss->flow);
+
+        /* Add other packets to a to-do list. */
+        hash = flow_hash(&miss->flow, 0);
+        existing_miss = flow_miss_find(&fmb->misses, ofproto, &miss->flow, hash);
+        if (!existing_miss) {
+            hmap_insert(&fmb->misses, &miss->hmap_node, hash);
+            miss->ofproto = ofproto;
+            /* Not copied: the key stays valid because 'upcall' is moved onto
+             * 'miss->upcalls' below and lives as long as the miss. */
+            miss->key = dupcall->key;
+            miss->key_len = dupcall->key_len;
+            miss->upcall_type = dupcall->type;
+            list_init(&miss->packets);
+            list_init(&miss->upcalls);
+
+            n_upcalls++;
+        } else {
+            miss = existing_miss;
+        }
+        list_push_back(&miss->packets, &dupcall->packet->list_node);
+
+        /* Transfer ownership of 'upcall' from the caller's list to 'miss'. */
+        list_remove(&upcall->list_node);
+        list_push_back(&miss->upcalls, &upcall->list_node);
+    }
+
+    /* Whatever is still on 'upcalls' failed xlate_receive() above. */
+    LIST_FOR_EACH_SAFE (upcall, next, list_node, upcalls) {
+        list_remove(&upcall->list_node);
+        upcall_destroy(upcall);
+    }
+
+    /* Process each element in the to-do list, constructing the set of
+     * operations to batch. */
+    n_ops = 0;
+    HMAP_FOR_EACH (miss, hmap_node, &fmb->misses) {
+        execute_flow_miss(miss, ops, &n_ops);
+    }
+    /* NOTE(review): this check runs only after 'ops' has been written, so it
+     * documents the expected bound rather than preventing an overrun. */
+    ovs_assert(n_ops <= ARRAY_SIZE(ops));
+
+    /* Execute batch. */
+    for (i = 0; i < n_ops; i++) {
+        opsp[i] = &ops[i];
+    }
+    dpif_operate(udpif->dpif, opsp, n_ops);
+
+    /* 'reval_seq' is re-read under 'fmb_mutex', the same mutex under which
+     * udpif_revalidate() increments it and flushes the queue. */
+    ovs_mutex_lock(&udpif->fmb_mutex);
+    atomic_read(&udpif->reval_seq, &new_reval_seq);
+    if (old_reval_seq != new_reval_seq) {
+        /* udpif_revalidate() was called as we were calculating the actions.
+         * To be safe, we need to assume all the misses need revalidation. */
+        ovs_mutex_unlock(&udpif->fmb_mutex);
+        flow_miss_batch_destroy(fmb);
+    } else if (udpif->n_fmbs < MAX_QUEUE_LENGTH) {
+        udpif->n_fmbs++;
+        list_push_back(&udpif->fmbs, &fmb->list_node);
+        ovs_mutex_unlock(&udpif->fmb_mutex);
+        seq_change(udpif->wait_seq);
+    } else {
+        COVERAGE_INC(fmb_queue_overflow);
+        ovs_mutex_unlock(&udpif->fmb_mutex);
+        flow_miss_batch_destroy(fmb);
+    }
+}
diff --git a/ofproto/ofproto-dpif-upcall.h b/ofproto/ofproto-dpif-upcall.h

new file mode 100644 (file)

index 0000000..f742060
--- /dev/null
+++ b/ofproto/ofproto-dpif-upcall.h
@@ -0,0 +1,130 @@
+/* Copyright (c) 2013 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#ifndef OFPROTO_DPIF_UPCALL_H
+#define OFPROTO_DPIF_UPCALL_H
+
+#define FLOW_MISS_MAX_BATCH 50
+
+#include "dpif.h"
+#include "flow.h"
+#include "hmap.h"
+#include "list.h"
+#include "odp-util.h"
+#include "ofpbuf.h"
+#include "ofproto-dpif-xlate.h"
+
+struct dpif;
+struct dpif_backer;
+
+/* udpif is responsible for retrieving upcalls from the kernel, processing
+ * miss upcalls, and handing more complex ones up to the main ofproto-dpif
+ * module. */
+
+struct udpif *udpif_create(struct dpif_backer *, struct dpif *);
+void udpif_recv_set(struct udpif *, size_t n_workers, bool enable);
+void udpif_destroy(struct udpif *);
+
+void udpif_run(struct udpif *);
+void udpif_wait(struct udpif *);
+
+void udpif_revalidate(struct udpif *);
+\f
+/* udpif can handle some upcalls on its own.  Others need the main ofproto_dpif
+ * code to handle them.  This interface passes upcalls not handled by udpif up
+ * to the ofproto_dpif main thread. */
+
+/* Type of an upcall, as assigned by udpif's dispatcher when it classifies a
+ * newly received upcall. */
+enum upcall_type {
+    /* Handled internally by udpif code.  Not returned by upcall_next().*/
+    BAD_UPCALL,                 /* Some kind of bug somewhere. */
+    MISS_UPCALL,                /* A flow miss (also used for slow-path
+                                 * action upcalls). */
+
+    /* Require main thread's involvement.  May be returned by upcall_next(). */
+    SFLOW_UPCALL,               /* sFlow sample. */
+    FLOW_SAMPLE_UPCALL,         /* Per-flow sampling. */
+    IPFIX_UPCALL                /* Per-bridge sampling. */
+};
+
+/* An upcall.
+ *
+ * Allocated by udpif when it receives an upcall from the datapath and
+ * released, together with 'upcall_buf', by upcall_destroy(). */
+struct upcall {
+    struct list list_node;          /* For queuing upcalls. */
+
+    enum upcall_type type;          /* Classification. */
+
+    /* Raw upcall plus data for keeping track of the memory backing it. */
+    struct dpif_upcall dpif_upcall; /* As returned by dpif_recv() */
+    struct ofpbuf upcall_buf;       /* Owns some data in 'dpif_upcall'. */
+    uint64_t upcall_stub[256 / 8];  /* Buffer to reduce need for malloc(). */
+};
+
+struct upcall *upcall_next(struct udpif *);
+void upcall_destroy(struct upcall *);
+\f
+/* udpif figures out how to forward packets, and does forward them, but it
+ * can't set up datapath flows on its own.  This interface passes packet
+ * forwarding data from udpif to the higher level ofproto_dpif to allow the
+ * latter to set up datapath flows. */
+
+/* Flow miss batching.
+ *
+ * Some dpifs implement operations faster when you hand them off in a batch.
+ * To allow batching, "struct flow_miss" queues the dpif-related work needed
+ * for a given flow.  Each "struct flow_miss" corresponds to sending one or
+ * more packets, plus possibly installing the flow in the dpif.
+ *
+ * A flow_miss is not allocated on its own: it occupies a slot in its batch's
+ * 'miss_buf' (see struct flow_miss_batch). */
+struct flow_miss {
+    struct hmap_node hmap_node;     /* In the batch's 'misses' map. */
+    struct ofproto_dpif *ofproto;   /* Bridge that received the packets. */
+
+    struct flow flow;               /* Flow extracted from the packets. */
+    enum odp_key_fitness key_fitness; /* As reported by xlate_receive(). */
+    const struct nlattr *key;       /* Datapath flow key.  Not a copy: points
+                                     * into an upcall on 'upcalls'. */
+    size_t key_len;                 /* Length of 'key', in bytes. */
+    struct list packets;            /* The packets (struct ofpbuf), one per
+                                     * upcall on 'upcalls'. */
+    enum dpif_upcall_type upcall_type; /* Datapath type of the first upcall. */
+    struct dpif_flow_stats stats;   /* Totals over 'packets'. */
+
+    struct xlate_out xout;          /* Result of translating the flow. */
+
+    struct list upcalls;            /* Upcalls (struct upcall) owned by this
+                                     * miss; destroyed along with it. */
+};
+
+/* A set of flow misses that were processed together, handed from udpif to
+ * the main thread through flow_miss_batch_next(). */
+struct flow_miss_batch {
+    struct list list_node;          /* In udpif's queue of batches. */
+
+    /* Storage for the batch's flow misses.  Only the slots that are linked
+     * into 'misses' are in use. */
+    struct flow_miss miss_buf[FLOW_MISS_MAX_BATCH];
+    struct hmap misses;             /* The flow misses, indexed by flow. */
+};
+
+struct flow_miss_batch *flow_miss_batch_next(struct udpif *);
+void flow_miss_batch_destroy(struct flow_miss_batch *);
+\f
+/* Drop keys are odp flow keys which have drop flows installed in the kernel.
+ * These are datapath flows which have no associated ofproto; if they did, we
+ * would use facets.
+ *
+ * udpif can't install drop flows by itself.  This interface allows udpif to
+ * pass the drop flows up to ofproto_dpif to get it to install them. */
+struct drop_key {
+    struct hmap_node hmap_node;     /* Not used by udpif itself; presumably
+                                     * for the consumer's own index. */
+    struct list list_node;          /* In udpif's queue of drop keys. */
+    struct nlattr *key;             /* Heap-allocated flow key, freed by
+                                     * drop_key_destroy(). */
+    size_t key_len;                 /* Length of 'key', in bytes. */
+};
+
+struct drop_key *drop_key_next(struct udpif *);
+void drop_key_destroy(struct drop_key *);
+void udpif_drop_key_clear(struct udpif *);
+
+#endif /* ofproto-dpif-upcall.h */
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c

index 30697ac..8be8088 100644 (file)
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -201,8 +201,6 @@ static void output_normal(struct xlate_ctx *, const struct xbundle *,
                            uint16_t vlan);
  static void compose_output_action(struct xlate_ctx *, ofp_port_t ofp_port);
  
-static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
-
  static struct xbridge *xbridge_lookup(const struct ofproto_dpif *);
  static struct xbundle *xbundle_lookup(const struct ofbundle *);
  static struct xport *xport_lookup(const struct ofport_dpif *);
@@ -1519,7 +1517,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port,
      struct flow_wildcards *wc = &ctx->xout->wc;
      struct flow *flow = &ctx->xin->flow;
      ovs_be16 flow_vlan_tci;
-    uint32_t flow_skb_mark;
+    uint32_t flow_pkt_mark;
      uint8_t flow_nw_tos;
      odp_port_t out_port, odp_port;
      uint8_t dscp;
@@ -1587,7 +1585,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port,
      }
  
      flow_vlan_tci = flow->vlan_tci;
-    flow_skb_mark = flow->skb_mark;
+    flow_pkt_mark = flow->pkt_mark;
      flow_nw_tos = flow->nw_tos;
  
      if (dscp_from_skb_priority(xport, flow->skb_priority, &dscp)) {
@@ -1633,7 +1631,6 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port,
              out_port = ofp_port_to_odp_port(ctx->xbridge, vlandev_port);
              flow->vlan_tci = htons(0);
          }
-        flow->skb_mark &= ~IPSEC_MARK;
      }
  
      if (out_port != ODPP_NONE) {
@@ -1650,7 +1647,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port,
   out:
      /* Restore flow */
      flow->vlan_tci = flow_vlan_tci;
-    flow->skb_mark = flow_skb_mark;
+    flow->pkt_mark = flow_pkt_mark;
      flow->nw_tos = flow_nw_tos;
  }
  
@@ -1660,34 +1657,6 @@ compose_output_action(struct xlate_ctx *ctx, ofp_port_t ofp_port)
      compose_output_action__(ctx, ofp_port, true);
  }
  
-/* Common rule processing in one place to avoid duplicating code. */
-static struct rule_dpif *
-ctx_rule_hooks(struct xlate_ctx *ctx, struct rule_dpif *rule,
-               bool may_packet_in)
-{
-    if (ctx->xin->resubmit_hook) {
-        ctx->xin->resubmit_hook(ctx->xin, rule, ctx->recurse);
-    }
-    if (rule == NULL && may_packet_in) {
-        struct xport *xport;
-
-        /* XXX
-         * check if table configuration flags
-         * OFPTC_TABLE_MISS_CONTROLLER, default.
-         * OFPTC_TABLE_MISS_CONTINUE,
-         * OFPTC_TABLE_MISS_DROP
-         * When OF1.0, OFPTC_TABLE_MISS_CONTINUE is used. What to do? */
-        xport = get_ofp_port(ctx->xbridge, ctx->xin->flow.in_port.ofp_port);
-        rule = choose_miss_rule(xport ? xport->config : 0,
-                                ctx->xbridge->miss_rule,
-                                ctx->xbridge->no_packet_in_rule);
-    }
-    if (rule && ctx->xin->resubmit_stats) {
-        rule_credit_stats(rule, ctx->xin->resubmit_stats);
-    }
-    return rule;
-}
-
  static void
  xlate_table_action(struct xlate_ctx *ctx,
                     ofp_port_t in_port, uint8_t table_id, bool may_packet_in)
@@ -1701,15 +1670,39 @@ xlate_table_action(struct xlate_ctx *ctx,
  
          /* Look up a flow with 'in_port' as the input port. */
          ctx->xin->flow.in_port.ofp_port = in_port;
-        rule = rule_dpif_lookup_in_table(ctx->xbridge->ofproto,
-                                         &ctx->xin->flow, &ctx->xout->wc,
-                                         table_id);
+        rule_dpif_lookup_in_table(ctx->xbridge->ofproto, &ctx->xin->flow,
+                                  &ctx->xout->wc, table_id, &rule);
  
          /* Restore the original input port.  Otherwise OFPP_NORMAL and
           * OFPP_IN_PORT will have surprising behavior. */
          ctx->xin->flow.in_port.ofp_port = old_in_port;
  
-        rule = ctx_rule_hooks(ctx, rule, may_packet_in);
+        if (ctx->xin->resubmit_hook) {
+            ctx->xin->resubmit_hook(ctx->xin, rule, ctx->recurse);
+        }
+
+        if (rule == NULL && may_packet_in) {
+            struct xport *xport;
+
+            /* Makes clang's thread safety analysis happy. */
+            rule_release(rule);
+
+            /* XXX
+             * check if table configuration flags
+             * OFPTC_TABLE_MISS_CONTROLLER, default.
+             * OFPTC_TABLE_MISS_CONTINUE,
+             * OFPTC_TABLE_MISS_DROP
+             * When OF1.0, OFPTC_TABLE_MISS_CONTINUE is used. What to do? */
+            xport = get_ofp_port(ctx->xbridge, ctx->xin->flow.in_port.ofp_port);
+            rule = choose_miss_rule(xport ? xport->config : 0,
+                                    ctx->xbridge->miss_rule,
+                                    ctx->xbridge->no_packet_in_rule);
+            ovs_rwlock_rdlock(&rule->up.evict);
+        }
+
+        if (rule && ctx->xin->resubmit_stats) {
+            rule_credit_stats(rule, ctx->xin->resubmit_stats);
+        }
  
          if (rule) {
              struct rule_dpif *old_rule = ctx->rule;
@@ -1720,6 +1713,7 @@ xlate_table_action(struct xlate_ctx *ctx,
              ctx->rule = old_rule;
              ctx->recurse--;
          }
+        rule_release(rule);
  
          ctx->table_id = old_table_id;
      } else {
@@ -1788,7 +1782,7 @@ execute_controller_action(struct xlate_ctx *ctx, int len,
      packet = ofpbuf_clone(ctx->xin->packet);
  
      key.skb_priority = 0;
-    key.skb_mark = 0;
+    key.pkt_mark = 0;
      memset(&key.tunnel, 0, sizeof key.tunnel);
  
      commit_odp_actions(&ctx->xin->flow, &ctx->base_flow,
@@ -2174,39 +2168,14 @@ may_receive(const struct xport *xport, struct xlate_ctx *ctx)
      return true;
  }
  
-static bool
-tunnel_ecn_ok(struct xlate_ctx *ctx)
-{
-    if (is_ip_any(&ctx->base_flow)
-        && (ctx->xin->flow.tunnel.ip_tos & IP_ECN_MASK) == IP_ECN_CE) {
-        if ((ctx->base_flow.nw_tos & IP_ECN_MASK) == IP_ECN_NOT_ECT) {
-            VLOG_WARN_RL(&rl, "dropping tunnel packet marked ECN CE"
-                         " but is not ECN capable");
-            return false;
-        } else {
-            /* Set the ECN CE value in the tunneled packet. */
-            ctx->xin->flow.nw_tos |= IP_ECN_CE;
-        }
-    }
-
-    return true;
-}
-
  static void
  do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len,
                   struct xlate_ctx *ctx)
  {
      struct flow_wildcards *wc = &ctx->xout->wc;
      struct flow *flow = &ctx->xin->flow;
-    bool was_evictable = true;
      const struct ofpact *a;
  
-    if (ctx->rule) {
-        /* Don't let the rule we're working on get evicted underneath us. */
-        was_evictable = ctx->rule->up.evictable;
-        ctx->rule->up.evictable = false;
-    }
-
      OFPACT_FOR_EACH (a, ofpacts, ofpacts_len) {
          struct ofpact_controller *controller;
          const struct ofpact_metadata *metadata;
@@ -2352,20 +2321,20 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len,
          case OFPACT_SET_MPLS_TTL:
              if (compose_set_mpls_ttl_action(ctx,
                                              ofpact_get_SET_MPLS_TTL(a)->ttl)) {
-                goto out;
+                return;
              }
              break;
  
          case OFPACT_DEC_MPLS_TTL:
              if (compose_dec_mpls_ttl_action(ctx)) {
-                goto out;
+                return;
              }
              break;
  
          case OFPACT_DEC_TTL:
              wc->masks.nw_ttl = 0xff;
              if (compose_dec_ttl(ctx, ofpact_get_DEC_TTL(a))) {
-                goto out;
+                return;
              }
              break;
  
@@ -2432,11 +2401,6 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len,
              break;
          }
      }
-
-out:
-    if (ctx->rule) {
-        ctx->rule->up.evictable = was_evictable;
-    }
  }
  
  void
@@ -2567,6 +2531,7 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout)
      struct flow orig_flow;
      struct xlate_ctx ctx;
      size_t ofpacts_len;
+    bool tnl_may_send;
  
      COVERAGE_INC(xlate_actions);
  
@@ -2622,12 +2587,7 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout)
      memset(&wc->masks.dl_type, 0xff, sizeof wc->masks.dl_type);
      wc->masks.nw_frag |= FLOW_NW_FRAG_MASK;
  
-    if (tnl_port_should_receive(&ctx.xin->flow)) {
-        memset(&wc->masks.tunnel, 0xff, sizeof wc->masks.tunnel);
-        /* skb_mark is currently used only by tunnels but that will likely
-         * change in the future. */
-        memset(&wc->masks.skb_mark, 0xff, sizeof wc->masks.skb_mark);
-    }
+    tnl_may_send = tnl_xlate_init(&ctx.base_flow, flow, wc);
      if (ctx.xbridge->has_netflow) {
          netflow_mask_wc(flow, wc);
      }
@@ -2696,7 +2656,7 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout)
          add_ipfix_action(&ctx);
          sample_actions_len = ctx.xout->odp_actions.size;
  
-        if (tunnel_ecn_ok(&ctx) && (!in_port || may_receive(in_port, &ctx))) {
+        if (tnl_may_send && (!in_port || may_receive(in_port, &ctx))) {
              do_xlate_actions(ofpacts, ofpacts_len, &ctx);
  
              /* We've let OFPP_NORMAL and the learning action look at the
diff --git a/ofproto/ofproto-dpif-xlate.h b/ofproto/ofproto-dpif-xlate.h

index 1c37bc3..ba24e92 100644 (file)
--- a/ofproto/ofproto-dpif-xlate.h
+++ b/ofproto/ofproto-dpif-xlate.h
@@ -12,8 +12,8 @@
   * See the License for the specific language governing permissions and
   * limitations under the License. */
  
-#ifndef OFPROT_DPIF_XLATE_H
-#define OFPROT_DPIF_XLATE_H 1
+#ifndef OFPROTO_DPIF_XLATE_H
+#define OFPROTO_DPIF_XLATE_H 1
  
  #include "flow.h"
  #include "meta-flow.h"
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c

index 1582619..229b16c 100644 (file)
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -52,6 +52,7 @@
  #include "ofproto-dpif-ipfix.h"
  #include "ofproto-dpif-mirror.h"
  #include "ofproto-dpif-sflow.h"
+#include "ofproto-dpif-upcall.h"
  #include "ofproto-dpif-xlate.h"
  #include "poll-loop.h"
  #include "simap.h"
@@ -74,6 +75,8 @@ COVERAGE_DEFINE(subfacet_install_fail);
  COVERAGE_DEFINE(packet_in_overflow);
  COVERAGE_DEFINE(flow_mod_overflow);
  
+#define N_THREADS 16
+
  /* Number of implemented OpenFlow tables. */
  enum { N_TABLES = 255 };
  enum { TBL_INTERNAL = N_TABLES - 1 };    /* Used for internal hidden rules. */
@@ -82,10 +85,6 @@ BUILD_ASSERT_DECL(N_TABLES >= 2 && N_TABLES <= 255);
  struct flow_miss;
  struct facet;
  
-static struct rule_dpif *rule_dpif_lookup(struct ofproto_dpif *,
-                                          const struct flow *,
-                                          struct flow_wildcards *wc);
-
  static void rule_get_stats(struct rule *, uint64_t *packets, uint64_t *bytes);
  
  struct ofbundle {
@@ -170,8 +169,7 @@ struct subfacet {
  
  #define SUBFACET_DESTROY_MAX_BATCH 50
  
-static struct subfacet *subfacet_create(struct facet *, struct flow_miss *miss,
-                                        long long int now);
+static struct subfacet *subfacet_create(struct facet *, struct flow_miss *);
  static struct subfacet *subfacet_find(struct dpif_backer *,
                                        const struct nlattr *key, size_t key_len,
                                        uint32_t key_hash);
@@ -247,7 +245,6 @@ struct facet {
      uint8_t tcp_flags;           /* TCP flags seen for this 'rule'. */
  
      struct xlate_out xout;
-    bool fail_open;              /* Facet matched the fail open rule. */
  
      /* Storage for a single subfacet, to reduce malloc() time and space
       * overhead.  (A facet always has at least one subfacet and in the common
@@ -259,9 +256,7 @@ struct facet {
      long long int learn_rl;      /* Rate limiter for facet_learn(). */
  };
  
-static struct facet *facet_create(const struct flow_miss *, struct rule_dpif *,
-                                  struct xlate_out *,
-                                  struct dpif_flow_stats *);
+static struct facet *facet_create(const struct flow_miss *);
  static void facet_remove(struct facet *);
  static void facet_free(struct facet *);
  
@@ -274,6 +269,8 @@ static bool facet_check_consistency(struct facet *);
  static void facet_flush_stats(struct facet *);
  
  static void facet_reset_counters(struct facet *);
+static void flow_push_stats(struct ofproto_dpif *, struct flow *,
+                            struct dpif_flow_stats *, bool may_learn);
  static void facet_push_stats(struct facet *, bool may_learn);
  static void facet_learn(struct facet *);
  static void facet_account(struct facet *);
@@ -382,15 +379,6 @@ COVERAGE_DEFINE(rev_flow_table);
  COVERAGE_DEFINE(rev_mac_learning);
  COVERAGE_DEFINE(rev_inconsistency);
  
-/* Drop keys are odp flow keys which have drop flows installed in the kernel.
- * These are datapath flows which have no associated ofproto, if they did we
- * would use facets. */
-struct drop_key {
-    struct hmap_node hmap_node;
-    struct nlattr *key;
-    size_t key_len;
-};
-
  struct avg_subfacet_rates {
      double add_rate;   /* Moving average of new flows created per minute. */
      double del_rate;   /* Moving average of flows deleted per minute. */
@@ -401,6 +389,7 @@ struct dpif_backer {
      char *type;
      int refcount;
      struct dpif *dpif;
+    struct udpif *udpif;
      struct timer next_expiration;
  
      struct ovs_rwlock odp_to_ofport_lock;
@@ -534,8 +523,7 @@ static void ofproto_trace(struct ofproto_dpif *, const struct flow *,
                            const struct ofpbuf *packet, struct ds *);
  
  /* Upcalls. */
-#define FLOW_MISS_MAX_BATCH 50
-static int handle_upcalls(struct dpif_backer *, unsigned int max_batch);
+static void handle_upcalls(struct dpif_backer *);
  
  /* Flow expiration. */
  static int expire(struct dpif_backer *);
@@ -708,9 +696,11 @@ type_run(const char *type)
  
          error = dpif_recv_set(backer->dpif, backer->recv_set_enable);
          if (error) {
+            udpif_recv_set(backer->udpif, 0, false);
              VLOG_ERR("Failed to enable receiving packets in dpif.");
              return error;
          }
+        udpif_recv_set(backer->udpif, N_THREADS, backer->recv_set_enable);
          dpif_flow_flush(backer->dpif);
          backer->need_revalidate = REV_RECONFIGURE;
      }
@@ -841,6 +831,8 @@ type_run(const char *type)
                  run_fast_rl();
              }
          }
+
+        udpif_revalidate(backer->udpif);
      }
  
      if (!backer->recv_set_enable) {
@@ -1004,32 +996,10 @@ process_dpif_port_error(struct dpif_backer *backer, int error)
  }
  
  static int
-dpif_backer_run_fast(struct dpif_backer *backer, int max_batch)
+dpif_backer_run_fast(struct dpif_backer *backer)
  {
-    unsigned int work;
-
-    /* If recv_set_enable is false, we should not handle upcalls. */
-    if (!backer->recv_set_enable) {
-        return 0;
-    }
-
-    /* Handle one or more batches of upcalls, until there's nothing left to do
-     * or until we do a fixed total amount of work.
-     *
-     * We do work in batches because it can be much cheaper to set up a number
-     * of flows and fire off their patches all at once.  We do multiple batches
-     * because in some cases handling a packet can cause another packet to be
-     * queued almost immediately as part of the return flow.  Both
-     * optimizations can make major improvements on some benchmarks and
-     * presumably for real traffic as well. */
-    work = 0;
-    while (work < max_batch) {
-        int retval = handle_upcalls(backer, max_batch - work);
-        if (retval <= 0) {
-            return -retval;
-        }
-        work += retval;
-    }
+    udpif_run(backer->udpif);
+    handle_upcalls(backer);
  
      return 0;
  }
@@ -1046,14 +1016,13 @@ type_run_fast(const char *type)
          return 0;
      }
  
-    return dpif_backer_run_fast(backer, FLOW_MISS_MAX_BATCH);
+    return dpif_backer_run_fast(backer);
  }
  
  static void
  run_fast_rl(void)
  {
      static long long int port_rl = LLONG_MIN;
-    static unsigned int backer_rl = 0;
  
      if (time_msec() >= port_rl) {
          struct ofproto_dpif *ofproto;
@@ -1063,23 +1032,6 @@ run_fast_rl(void)
          }
          port_rl = time_msec() + 200;
      }
-
-    /* XXX: We have to be careful not to do too much work in this function.  If
-     * we call dpif_backer_run_fast() too often, or with too large a batch,
-     * performance improves signifcantly, but at a cost.  It's possible for the
-     * number of flows in the datapath to increase without bound, and for poll
-     * loops to take 10s of seconds.   The correct solution to this problem,
-     * long term, is to separate flow miss handling into it's own thread so it
-     * isn't affected by revalidations, and expirations.  Until then, this is
-     * the best we can do. */
-    if (++backer_rl >= 10) {
-        struct shash_node *node;
-
-        backer_rl = 0;
-        SHASH_FOR_EACH (node, &all_dpif_backers) {
-            dpif_backer_run_fast(node->data, 1);
-        }
-    }
  }
  
  static void
@@ -1139,6 +1091,7 @@ close_dpif_backer(struct dpif_backer *backer)
      node = shash_find(&all_dpif_backers, backer->type);
      free(backer->type);
      shash_delete(&all_dpif_backers, node);
+    udpif_destroy(backer->udpif);
      dpif_close(backer->dpif);
  
      ovs_assert(hmap_is_empty(&backer->subfacets));
@@ -1208,6 +1161,7 @@ open_dpif_backer(const char *type, struct dpif_backer **backerp)
          free(backer);
          return error;
      }
+    backer->udpif = udpif_create(backer, backer->dpif);
  
      backer->type = xstrdup(type);
      backer->governor = NULL;
@@ -1255,6 +1209,7 @@ open_dpif_backer(const char *type, struct dpif_backer **backerp)
          close_dpif_backer(backer);
          return error;
      }
+    udpif_recv_set(backer->udpif, N_THREADS, backer->recv_set_enable);
  
      backer->max_n_subfacet = 0;
      backer->created = time_msec();
@@ -1387,9 +1342,12 @@ add_internal_flow(struct ofproto_dpif *ofproto, int id,
          return error;
      }
  
-    *rulep = rule_dpif_lookup_in_table(ofproto, &fm.match.flow, NULL,
-                                       TBL_INTERNAL);
-    ovs_assert(*rulep != NULL);
+    if (rule_dpif_lookup_in_table(ofproto, &fm.match.flow, NULL, TBL_INTERNAL,
+                                  rulep)) {
+        ovs_rwlock_unlock(&(*rulep)->up.evict);
+    } else {
+        NOT_REACHED();
+    }
  
      return 0;
  }
@@ -1668,7 +1626,7 @@ wait(struct ofproto *ofproto_)
      }
  
      dpif_wait(ofproto->backer->dpif);
-    dpif_recv_wait(ofproto->backer->dpif);
+    udpif_wait(ofproto->backer->udpif);
      if (ofproto->sflow) {
          dpif_sflow_wait(ofproto->sflow);
      }
@@ -2442,7 +2400,7 @@ bundle_add_port(struct ofbundle *bundle, ofp_port_t ofp_port,
      if (port->bundle != bundle) {
          bundle->ofproto->backer->need_revalidate = REV_RECONFIGURE;
          if (port->bundle) {
-            bundle_del_port(port);
+            bundle_remove(&port->up);
          }
  
          port->bundle = bundle;
@@ -2905,7 +2863,7 @@ ofport_update_peer(struct ofport_dpif *ofport)
  {
      const struct ofproto_dpif *ofproto;
      struct dpif_backer *backer;
-    const char *peer_name;
+    char *peer_name;
  
      if (!netdev_vport_is_patch(ofport->up.netdev)) {
          return;
@@ -2927,7 +2885,7 @@ ofport_update_peer(struct ofport_dpif *ofport)
      HMAP_FOR_EACH (ofproto, all_ofproto_dpifs_node, &all_ofproto_dpifs) {
          struct ofport *peer_ofport;
          struct ofport_dpif *peer;
-        const char *peer_peer;
+        char *peer_peer;
  
          if (ofproto->backer != backer) {
              continue;
@@ -2945,9 +2903,11 @@ ofport_update_peer(struct ofport_dpif *ofport)
              ofport->peer = peer;
              ofport->peer->peer = ofport;
          }
+        free(peer_peer);
  
-        return;
+        break;
      }
+    free(peer_name);
  }
  
  static void
@@ -3270,26 +3230,6 @@ port_is_lacp_current(const struct ofport *ofport_)
  \f
  /* Upcall handling. */
  
-/* Flow miss batching.
- *
- * Some dpifs implement operations faster when you hand them off in a batch.
- * To allow batching, "struct flow_miss" queues the dpif-related work needed
- * for a given flow.  Each "struct flow_miss" corresponds to sending one or
- * more packets, plus possibly installing the flow in the dpif.
- *
- * So far we only batch the operations that affect flow setup time the most.
- * It's possible to batch more than that, but the benefit might be minimal. */
-struct flow_miss {
-    struct hmap_node hmap_node;
-    struct ofproto_dpif *ofproto;
-    struct flow flow;
-    enum odp_key_fitness key_fitness;
-    const struct nlattr *key;
-    size_t key_len;
-    struct list packets;
-    enum dpif_upcall_type upcall_type;
-};
-
  struct flow_miss_op {
      struct dpif_op dpif_op;
  
@@ -3305,96 +3245,6 @@ struct flow_miss_op {
      struct subfacet *subfacet;
  };
  
-/* Sends an OFPT_PACKET_IN message for 'packet' of type OFPR_NO_MATCH to each
- * OpenFlow controller as necessary according to their individual
- * configurations. */
-static void
-send_packet_in_miss(struct ofproto_dpif *ofproto, const struct ofpbuf *packet,
-                    const struct flow *flow)
-{
-    struct ofputil_packet_in pin;
-
-    pin.packet = packet->data;
-    pin.packet_len = packet->size;
-    pin.reason = OFPR_NO_MATCH;
-    pin.controller_id = 0;
-
-    pin.table_id = 0;
-    pin.cookie = 0;
-
-    pin.send_len = 0;           /* not used for flow table misses */
-
-    flow_get_metadata(flow, &pin.fmd);
-
-    connmgr_send_packet_in(ofproto->up.connmgr, &pin);
-}
-
-static struct flow_miss *
-flow_miss_find(struct hmap *todo, const struct ofproto_dpif *ofproto,
-               const struct flow *flow, uint32_t hash)
-{
-    struct flow_miss *miss;
-
-    HMAP_FOR_EACH_WITH_HASH (miss, hmap_node, hash, todo) {
-        if (miss->ofproto == ofproto && flow_equal(&miss->flow, flow)) {
-            return miss;
-        }
-    }
-
-    return NULL;
-}
-
-/* Partially Initializes 'op' as an "execute" operation for 'miss' and
- * 'packet'.  The caller must initialize op->actions and op->actions_len.  If
- * 'miss' is associated with a subfacet the caller must also initialize the
- * returned op->subfacet, and if anything needs to be freed after processing
- * the op, the caller must initialize op->garbage also. */
-static void
-init_flow_miss_execute_op(struct flow_miss *miss, struct ofpbuf *packet,
-                          struct flow_miss_op *op)
-{
-    if (miss->flow.in_port.ofp_port
-        != vsp_realdev_to_vlandev(miss->ofproto, miss->flow.in_port.ofp_port,
-                                  miss->flow.vlan_tci)) {
-        /* This packet was received on a VLAN splinter port.  We
-         * added a VLAN to the packet to make the packet resemble
-         * the flow, but the actions were composed assuming that
-         * the packet contained no VLAN.  So, we must remove the
-         * VLAN header from the packet before trying to execute the
-         * actions. */
-        eth_pop_vlan(packet);
-    }
-
-    op->subfacet = NULL;
-    op->xout_garbage = false;
-    op->dpif_op.type = DPIF_OP_EXECUTE;
-    op->dpif_op.u.execute.key = miss->key;
-    op->dpif_op.u.execute.key_len = miss->key_len;
-    op->dpif_op.u.execute.packet = packet;
-    ofpbuf_use_stack(&op->mask, &op->maskbuf, sizeof op->maskbuf);
-}
-
-/* Helper for handle_flow_miss_without_facet() and
- * handle_flow_miss_with_facet(). */
-static void
-handle_flow_miss_common(struct ofproto_dpif *ofproto, struct ofpbuf *packet,
-                        const struct flow *flow, bool fail_open)
-{
-    if (fail_open) {
-        /*
-         * Extra-special case for fail-open mode.
-         *
-         * We are in fail-open mode and the packet matched the fail-open
-         * rule, but we are connected to a controller too.  We should send
-         * the packet up to the controller in the hope that it will try to
-         * set up a flow and thereby allow us to exit fail-open.
-         *
-         * See the top-level comment in fail-open.c for more information.
-         */
-        send_packet_in_miss(ofproto, packet, flow);
-    }
-}
-
  /* Figures out whether a flow that missed in 'ofproto', whose details are in
   * 'miss' masked by 'wc', is likely to be worth tracking in detail in userspace
   * and (usually) installing a datapath flow.  The answer is usually "yes" (a
@@ -3403,7 +3253,7 @@ handle_flow_miss_common(struct ofproto_dpif *ofproto, struct ofpbuf *packet,
   * flows we impose some heuristics to decide which flows are likely to be worth
   * tracking. */
  static bool
-flow_miss_should_make_facet(struct flow_miss *miss, struct flow_wildcards *wc)
+flow_miss_should_make_facet(struct flow_miss *miss)
  {
      struct dpif_backer *backer = miss->ofproto->backer;
      uint32_t hash;
@@ -3428,96 +3278,34 @@ flow_miss_should_make_facet(struct flow_miss *miss, struct flow_wildcards *wc)
          backer->governor = governor_create();
      }
  
-    hash = flow_hash_in_wildcards(&miss->flow, wc, 0);
+    hash = flow_hash_in_wildcards(&miss->flow, &miss->xout.wc, 0);
      return governor_should_install_flow(backer->governor, hash,
                                          list_size(&miss->packets));
  }
  
-/* Handles 'miss' without creating a facet or subfacet or creating any datapath
- * flow.  'miss->flow' must have matched 'rule' and been xlated into 'xout'.
- * May add an "execute" operation to 'ops' and increment '*n_ops'. */
-static void
-handle_flow_miss_without_facet(struct rule_dpif *rule, struct xlate_out *xout,
-                               struct flow_miss *miss,
-                               struct flow_miss_op *ops, size_t *n_ops)
-{
-    struct ofpbuf *packet;
-
-    LIST_FOR_EACH (packet, list_node, &miss->packets) {
-
-        COVERAGE_INC(facet_suppress);
-
-        handle_flow_miss_common(miss->ofproto, packet, &miss->flow,
-                                rule->up.cr.priority == FAIL_OPEN_PRIORITY);
-
-        if (xout->slow) {
-            struct xlate_in xin;
-
-            xlate_in_init(&xin, miss->ofproto, &miss->flow, rule, 0, packet);
-            xlate_actions_for_side_effects(&xin);
-        }
-
-        if (xout->odp_actions.size) {
-            struct flow_miss_op *op = &ops[*n_ops];
-            struct dpif_execute *execute = &op->dpif_op.u.execute;
-
-            init_flow_miss_execute_op(miss, packet, op);
-            xlate_out_copy(&op->xout, xout);
-            execute->actions = op->xout.odp_actions.data;
-            execute->actions_len = op->xout.odp_actions.size;
-            op->xout_garbage = true;
-
-            (*n_ops)++;
-        }
-    }
-}
-
  /* Handles 'miss', which matches 'facet'.  May add any required datapath
   * operations to 'ops', incrementing '*n_ops' for each new op.
   *
- * All of the packets in 'miss' are considered to have arrived at time 'now'.
- * This is really important only for new facets: if we just called time_msec()
- * here, then the new subfacet or its packets could look (occasionally) as
- * though it was used some time after the facet was used.  That can make a
- * one-packet flow look like it has a nonzero duration, which looks odd in
- * e.g. NetFlow statistics.
- *
- * If non-null, 'stats' will be folded into 'facet'. */
+ * All of the packets in 'miss' are considered to have arrived at time
+ * 'miss->stats.used'.  This is really important only for new facets: if we
+ * just called time_msec() here, then the new subfacet or its packets could
+ * look (occasionally) as though it was used some time after the facet was
+ * used.  That can make a one-packet flow look like it has a nonzero duration,
+ * which looks odd in e.g. NetFlow statistics. */
  static void
  handle_flow_miss_with_facet(struct flow_miss *miss, struct facet *facet,
-                            long long int now, struct dpif_flow_stats *stats,
                              struct flow_miss_op *ops, size_t *n_ops)
  {
      enum subfacet_path want_path;
      struct subfacet *subfacet;
-    struct ofpbuf *packet;
  
-    want_path = facet->xout.slow ? SF_SLOW_PATH : SF_FAST_PATH;
-
-    LIST_FOR_EACH (packet, list_node, &miss->packets) {
-        struct flow_miss_op *op = &ops[*n_ops];
-
-        handle_flow_miss_common(miss->ofproto, packet, &miss->flow,
-                                facet->fail_open);
-
-        if (want_path != SF_FAST_PATH) {
-            struct rule_dpif *rule;
-            struct xlate_in xin;
-
-            rule = rule_dpif_lookup(facet->ofproto, &facet->flow, NULL);
-            xlate_in_init(&xin, facet->ofproto, &miss->flow, rule, 0, packet);
-            xlate_actions_for_side_effects(&xin);
-        }
-
-        if (facet->xout.odp_actions.size) {
-            struct dpif_execute *execute = &op->dpif_op.u.execute;
+    facet->packet_count += miss->stats.n_packets;
+    facet->prev_packet_count += miss->stats.n_packets;
+    facet->byte_count += miss->stats.n_bytes;
+    facet->prev_byte_count += miss->stats.n_bytes;
  
-            init_flow_miss_execute_op(miss, packet, op);
-            execute->actions = facet->xout.odp_actions.data,
-            execute->actions_len = facet->xout.odp_actions.size;
-            (*n_ops)++;
-        }
-    }
+    subfacet = subfacet_create(facet, miss);
+    want_path = facet->xout.slow ? SF_SLOW_PATH : SF_FAST_PATH;
  
      /* Don't install the flow if it's the result of the "userspace"
       * action for an already installed facet.  This can occur when a
@@ -3526,20 +3314,10 @@ handle_flow_miss_with_facet(struct flow_miss *miss, struct facet *facet,
       * be rejected as overlapping by the datapath. */
      if (miss->upcall_type == DPIF_UC_ACTION
          && !list_is_empty(&facet->subfacets)) {
-        if (stats) {
-            facet->used = MAX(facet->used, stats->used);
-            facet->packet_count += stats->n_packets;
-            facet->byte_count += stats->n_bytes;
-            facet->tcp_flags |= stats->tcp_flags;
-        }
          return;
      }
  
-    subfacet = subfacet_create(facet, miss, now);
-    if (stats) {
-        subfacet_update_stats(subfacet, stats);
-    }
-
+    subfacet = subfacet_create(facet, miss);
      if (subfacet->path != want_path) {
          struct flow_miss_op *op = &ops[(*n_ops)++];
          struct dpif_flow_put *put = &op->dpif_op.u.flow_put;
@@ -3579,55 +3357,25 @@ static void
  handle_flow_miss(struct flow_miss *miss, struct flow_miss_op *ops,
                   size_t *n_ops)
  {
-    struct ofproto_dpif *ofproto = miss->ofproto;
-    struct dpif_flow_stats stats__;
-    struct dpif_flow_stats *stats = &stats__;
-    struct ofpbuf *packet;
      struct facet *facet;
-    long long int now;
  
-    now = time_msec();
-    memset(stats, 0, sizeof *stats);
-    stats->used = now;
-    LIST_FOR_EACH (packet, list_node, &miss->packets) {
-        stats->tcp_flags |= packet_get_tcp_flags(packet, &miss->flow);
-        stats->n_bytes += packet->size;
-        stats->n_packets++;
-    }
+    miss->ofproto->n_missed += list_size(&miss->packets);
  
-    facet = facet_lookup_valid(ofproto, &miss->flow);
+    facet = facet_lookup_valid(miss->ofproto, &miss->flow);
      if (!facet) {
-        struct flow_wildcards wc;
-        struct rule_dpif *rule;
-        struct xlate_out xout;
-        struct xlate_in xin;
-
-        flow_wildcards_init_catchall(&wc);
-        rule = rule_dpif_lookup(ofproto, &miss->flow, &wc);
-        rule_credit_stats(rule, stats);
-
-        xlate_in_init(&xin, ofproto, &miss->flow, rule, stats->tcp_flags,
-                      NULL);
-        xin.resubmit_stats = stats;
-        xin.may_learn = true;
-        xlate_actions(&xin, &xout);
-        flow_wildcards_or(&xout.wc, &xout.wc, &wc);
-
          /* There does not exist a bijection between 'struct flow' and datapath
           * flow keys with fitness ODP_FIT_TO_LITTLE.  This breaks a fundamental
           * assumption used throughout the facet and subfacet handling code.
           * Since we have to handle these misses in userspace anyway, we simply
           * skip facet creation, avoiding the problem altogether. */
          if (miss->key_fitness == ODP_FIT_TOO_LITTLE
-            || !flow_miss_should_make_facet(miss, &xout.wc)) {
-            handle_flow_miss_without_facet(rule, &xout, miss, ops, n_ops);
+            || !flow_miss_should_make_facet(miss)) {
              return;
          }
  
-        facet = facet_create(miss, rule, &xout, stats);
-        stats = NULL;
+        facet = facet_create(miss);
      }
-    handle_flow_miss_with_facet(miss, facet, now, stats, ops, n_ops);
+    handle_flow_miss_with_facet(miss, facet, ops, n_ops);
  }
  
  static struct drop_key *
@@ -3666,109 +3414,24 @@ drop_key_clear(struct dpif_backer *backer)
          }
  
          hmap_remove(&backer->drop_keys, &drop_key->hmap_node);
-        free(drop_key->key);
-        free(drop_key);
+        drop_key_destroy(drop_key);
      }
+
+    udpif_drop_key_clear(backer->udpif);
  }
  
  static void
-handle_miss_upcalls(struct dpif_backer *backer, struct dpif_upcall *upcalls,
-                    size_t n_upcalls)
+handle_flow_misses(struct dpif_backer *backer, struct flow_miss_batch *fmb)
  {
-    struct dpif_upcall *upcall;
+    struct flow_miss_op flow_miss_ops[FLOW_MISS_MAX_BATCH];
+    struct dpif_op *dpif_ops[FLOW_MISS_MAX_BATCH];
      struct flow_miss *miss;
-    struct flow_miss misses[FLOW_MISS_MAX_BATCH];
-    struct flow_miss_op flow_miss_ops[FLOW_MISS_MAX_BATCH * 2];
-    struct dpif_op *dpif_ops[FLOW_MISS_MAX_BATCH * 2];
-    struct hmap todo;
-    int n_misses;
-    size_t n_ops;
-    size_t i;
-
-    if (!n_upcalls) {
-        return;
-    }
-
-    /* Construct the to-do list.
-     *
-     * This just amounts to extracting the flow from each packet and sticking
-     * the packets that have the same flow in the same "flow_miss" structure so
-     * that we can process them together. */
-    hmap_init(&todo);
-    n_misses = 0;
-    for (upcall = upcalls; upcall < &upcalls[n_upcalls]; upcall++) {
-        struct flow_miss *miss = &misses[n_misses];
-        struct flow_miss *existing_miss;
-        struct ofproto_dpif *ofproto;
-        odp_port_t odp_in_port;
-        struct flow flow;
-        uint32_t hash;
-        int error;
-
-        error = xlate_receive(backer, upcall->packet, upcall->key,
-                              upcall->key_len, &flow, &miss->key_fitness,
-                              &ofproto, &odp_in_port);
-        if (error == ENODEV) {
-            struct drop_key *drop_key;
-
-            /* Received packet on datapath port for which we couldn't
-             * associate an ofproto.  This can happen if a port is removed
-             * while traffic is being received.  Print a rate-limited message
-             * in case it happens frequently.  Install a drop flow so
-             * that future packets of the flow are inexpensively dropped
-             * in the kernel. */
-            VLOG_INFO_RL(&rl, "received packet on unassociated datapath port "
-                              "%"PRIu32, odp_in_port);
-
-            drop_key = drop_key_lookup(backer, upcall->key, upcall->key_len);
-            if (!drop_key) {
-                int ret;
-                ret = dpif_flow_put(backer->dpif,
-                                    DPIF_FP_CREATE | DPIF_FP_MODIFY,
-                                    upcall->key, upcall->key_len,
-                                    NULL, 0, NULL, 0, NULL);
-
-                if (!ret) {
-                    drop_key = xmalloc(sizeof *drop_key);
-                    drop_key->key = xmemdup(upcall->key, upcall->key_len);
-                    drop_key->key_len = upcall->key_len;
-
-                    hmap_insert(&backer->drop_keys, &drop_key->hmap_node,
-                                hash_bytes(drop_key->key, drop_key->key_len, 0));
-                }
-            }
-            continue;
-        }
-        if (error) {
-            continue;
-        }
-
-        ofproto->n_missed++;
-        flow_extract(upcall->packet, flow.skb_priority, flow.skb_mark,
-                     &flow.tunnel, &flow.in_port, &miss->flow);
-
-        /* Add other packets to a to-do list. */
-        hash = flow_hash(&miss->flow, 0);
-        existing_miss = flow_miss_find(&todo, ofproto, &miss->flow, hash);
-        if (!existing_miss) {
-            hmap_insert(&todo, &miss->hmap_node, hash);
-            miss->ofproto = ofproto;
-            miss->key = upcall->key;
-            miss->key_len = upcall->key_len;
-            miss->upcall_type = upcall->type;
-            list_init(&miss->packets);
-
-            n_misses++;
-        } else {
-            miss = existing_miss;
-        }
-        list_push_back(&miss->packets, &upcall->packet->list_node);
-    }
+    size_t n_ops, i;
  
      /* Process each element in the to-do list, constructing the set of
       * operations to batch. */
      n_ops = 0;
-    HMAP_FOR_EACH (miss, hmap_node, &todo) {
+    HMAP_FOR_EACH (miss, hmap_node, &fmb->misses) {
          handle_flow_miss(miss, flow_miss_ops, &n_ops);
      }
      ovs_assert(n_ops <= ARRAY_SIZE(flow_miss_ops));
@@ -3801,66 +3464,6 @@ handle_miss_upcalls(struct dpif_backer *backer, struct dpif_upcall *upcalls,
  
              subfacet->path = SF_NOT_INSTALLED;
          }
-
-        /* Free memory. */
-        if (flow_miss_ops[i].xout_garbage) {
-            xlate_out_uninit(&flow_miss_ops[i].xout);
-        }
-    }
-    hmap_destroy(&todo);
-}
-
-static enum { SFLOW_UPCALL, MISS_UPCALL, BAD_UPCALL, FLOW_SAMPLE_UPCALL,
-              IPFIX_UPCALL }
-classify_upcall(const struct dpif_upcall *upcall)
-{
-    size_t userdata_len;
-    union user_action_cookie cookie;
-
-    /* First look at the upcall type. */
-    switch (upcall->type) {
-    case DPIF_UC_ACTION:
-        break;
-
-    case DPIF_UC_MISS:
-        return MISS_UPCALL;
-
-    case DPIF_N_UC_TYPES:
-    default:
-        VLOG_WARN_RL(&rl, "upcall has unexpected type %"PRIu32, upcall->type);
-        return BAD_UPCALL;
-    }
-
-    /* "action" upcalls need a closer look. */
-    if (!upcall->userdata) {
-        VLOG_WARN_RL(&rl, "action upcall missing cookie");
-        return BAD_UPCALL;
-    }
-    userdata_len = nl_attr_get_size(upcall->userdata);
-    if (userdata_len < sizeof cookie.type
-        || userdata_len > sizeof cookie) {
-        VLOG_WARN_RL(&rl, "action upcall cookie has unexpected size %zu",
-                     userdata_len);
-        return BAD_UPCALL;
-    }
-    memset(&cookie, 0, sizeof cookie);
-    memcpy(&cookie, nl_attr_get(upcall->userdata), userdata_len);
-    if (userdata_len == sizeof cookie.sflow
-        && cookie.type == USER_ACTION_COOKIE_SFLOW) {
-        return SFLOW_UPCALL;
-    } else if (userdata_len == sizeof cookie.slow_path
-               && cookie.type == USER_ACTION_COOKIE_SLOW_PATH) {
-        return MISS_UPCALL;
-    } else if (userdata_len == sizeof cookie.flow_sample
-               && cookie.type == USER_ACTION_COOKIE_FLOW_SAMPLE) {
-        return FLOW_SAMPLE_UPCALL;
-    } else if (userdata_len == sizeof cookie.ipfix
-               && cookie.type == USER_ACTION_COOKIE_IPFIX) {
-        return IPFIX_UPCALL;
-    } else {
-        VLOG_WARN_RL(&rl, "invalid user cookie of type %"PRIu16
-                     " and size %zu", cookie.type, userdata_len);
-        return BAD_UPCALL;
      }
  }
  
@@ -3929,66 +3532,64 @@ handle_ipfix_upcall(struct dpif_backer *backer,
      dpif_ipfix_bridge_sample(ofproto->ipfix, upcall->packet, &flow);
  }
  
-static int
-handle_upcalls(struct dpif_backer *backer, unsigned int max_batch)
+static void
+handle_upcalls(struct dpif_backer *backer)
  {
-    struct dpif_upcall misses[FLOW_MISS_MAX_BATCH];
-    struct ofpbuf miss_bufs[FLOW_MISS_MAX_BATCH];
-    uint64_t miss_buf_stubs[FLOW_MISS_MAX_BATCH][4096 / 8];
+    struct flow_miss_batch *fmb;
      int n_processed;
-    int n_misses;
-    int i;
-
-    ovs_assert(max_batch <= FLOW_MISS_MAX_BATCH);
  
-    n_misses = 0;
-    for (n_processed = 0; n_processed < max_batch; n_processed++) {
-        struct dpif_upcall *upcall = &misses[n_misses];
-        struct ofpbuf *buf = &miss_bufs[n_misses];
-        int error;
+    for (n_processed = 0; n_processed < FLOW_MISS_MAX_BATCH; n_processed++) {
+        struct upcall *upcall = upcall_next(backer->udpif);
  
-        ofpbuf_use_stub(buf, miss_buf_stubs[n_misses],
-                        sizeof miss_buf_stubs[n_misses]);
-        error = dpif_recv(backer->dpif, upcall, buf);
-        if (error) {
-            ofpbuf_uninit(buf);
+        if (!upcall) {
              break;
          }
  
-        switch (classify_upcall(upcall)) {
-        case MISS_UPCALL:
-            /* Handle it later. */
-            n_misses++;
-            break;
-
+        switch (upcall->type) {
          case SFLOW_UPCALL:
-            handle_sflow_upcall(backer, upcall);
-            ofpbuf_uninit(buf);
+            handle_sflow_upcall(backer, &upcall->dpif_upcall);
              break;
  
          case FLOW_SAMPLE_UPCALL:
-            handle_flow_sample_upcall(backer, upcall);
-            ofpbuf_uninit(buf);
+            handle_flow_sample_upcall(backer, &upcall->dpif_upcall);
              break;
  
          case IPFIX_UPCALL:
-            handle_ipfix_upcall(backer, upcall);
-            ofpbuf_uninit(buf);
+            handle_ipfix_upcall(backer, &upcall->dpif_upcall);
              break;
  
          case BAD_UPCALL:
-            ofpbuf_uninit(buf);
              break;
+
+        case MISS_UPCALL:
+            NOT_REACHED();
          }
+
+        upcall_destroy(upcall);
      }
  
-    /* Handle deferred MISS_UPCALL processing. */
-    handle_miss_upcalls(backer, misses, n_misses);
-    for (i = 0; i < n_misses; i++) {
-        ofpbuf_uninit(&miss_bufs[i]);
+    for (n_processed = 0; n_processed < FLOW_MISS_MAX_BATCH; n_processed++) {
+        struct drop_key *drop_key = drop_key_next(backer->udpif);
+        if (!drop_key) {
+            break;
+        }
+
+        if (!drop_key_lookup(backer, drop_key->key, drop_key->key_len)) {
+            hmap_insert(&backer->drop_keys, &drop_key->hmap_node,
+                        hash_bytes(drop_key->key, drop_key->key_len, 0));
+            dpif_flow_put(backer->dpif, DPIF_FP_CREATE | DPIF_FP_MODIFY,
+                          drop_key->key, drop_key->key_len,
+                          NULL, 0, NULL, 0, NULL);
+        } else {
+            drop_key_destroy(drop_key);
+        }
      }
  
-    return n_processed;
+    fmb = flow_miss_batch_next(backer->udpif);
+    if (fmb) {
+        handle_flow_misses(backer, fmb);
+        flow_miss_batch_destroy(fmb);
+    }
  }
  \f
  /* Flow expiration. */
@@ -4340,10 +3941,12 @@ rule_expire(struct rule_dpif *rule)
          return;
      }
  
-    COVERAGE_INC(ofproto_dpif_expired);
+    if (!ovs_rwlock_trywrlock(&rule->up.evict)) {
+        COVERAGE_INC(ofproto_dpif_expired);
  
-    /* Get rid of the rule. */
-    ofproto_rule_expire(&rule->up, reason);
+        /* Get rid of the rule. */
+        ofproto_rule_expire(&rule->up, reason);
+    }
  }
  \f
  /* Facets. */
@@ -4360,8 +3963,7 @@ rule_expire(struct rule_dpif *rule)
   * The facet will initially have no subfacets.  The caller should create (at
   * least) one subfacet with subfacet_create(). */
  static struct facet *
-facet_create(const struct flow_miss *miss, struct rule_dpif *rule,
-             struct xlate_out *xout, struct dpif_flow_stats *stats)
+facet_create(const struct flow_miss *miss)
  {
      struct ofproto_dpif *ofproto = miss->ofproto;
      struct facet *facet;
@@ -4369,10 +3971,7 @@ facet_create(const struct flow_miss *miss, struct rule_dpif *rule,
  
      facet = xzalloc(sizeof *facet);
      facet->ofproto = miss->ofproto;
-    facet->packet_count = facet->prev_packet_count = stats->n_packets;
-    facet->byte_count = facet->prev_byte_count = stats->n_bytes;
-    facet->tcp_flags = stats->tcp_flags;
-    facet->used = stats->used;
+    facet->used = miss->stats.used;
      facet->flow = miss->flow;
      facet->learn_rl = time_msec() + 500;
  
@@ -4380,7 +3979,7 @@ facet_create(const struct flow_miss *miss, struct rule_dpif *rule,
      netflow_flow_init(&facet->nf_flow);
      netflow_flow_update_time(ofproto->netflow, &facet->nf_flow, facet->used);
  
-    xlate_out_copy(&facet->xout, xout);
+    xlate_out_copy(&facet->xout, &miss->xout);
  
      match_init(&match, &facet->flow, &facet->xout.wc);
      cls_rule_init(&facet->cr, &match, OFP_DEFAULT_PRIORITY);
@@ -4389,8 +3988,6 @@ facet_create(const struct flow_miss *miss, struct rule_dpif *rule,
      ovs_rwlock_unlock(&ofproto->facets.rwlock);
  
      facet->nf_flow.output_iface = facet->xout.nf_output_iface;
-    facet->fail_open = rule->up.cr.priority == FAIL_OPEN_PRIORITY;
-
      return facet;
  }
  
@@ -4540,16 +4137,19 @@ facet_is_controller_flow(struct facet *facet)
  {
      if (facet) {
          struct ofproto_dpif *ofproto = facet->ofproto;
-        const struct rule_dpif *rule = rule_dpif_lookup(ofproto, &facet->flow,
-                                                        NULL);
-        const struct ofpact *ofpacts = rule->up.ofpacts;
-        size_t ofpacts_len = rule->up.ofpacts_len;
-
-        if (ofpacts_len > 0 &&
-            ofpacts->type == OFPACT_CONTROLLER &&
-            ofpact_next(ofpacts) >= ofpact_end(ofpacts, ofpacts_len)) {
-            return true;
-        }
+        const struct ofpact *ofpacts;
+        struct rule_dpif *rule;
+        size_t ofpacts_len;
+        bool is_controller;
+
+        rule_dpif_lookup(ofproto, &facet->flow, NULL, &rule);
+        ofpacts_len = rule->up.ofpacts_len;
+        ofpacts = rule->up.ofpacts;
+        is_controller = ofpacts_len > 0
+            && ofpacts->type == OFPACT_CONTROLLER
+            && ofpact_next(ofpacts) >= ofpact_end(ofpacts, ofpacts_len);
+        rule_release(rule);
+        return is_controller;
      }
      return false;
  }
@@ -4636,17 +4236,16 @@ facet_check_consistency(struct facet *facet)
      struct xlate_in xin;
  
      struct rule_dpif *rule;
-    bool ok, fail_open;
+    bool ok;
  
      /* Check the datapath actions for consistency. */
-    rule = rule_dpif_lookup(facet->ofproto, &facet->flow, NULL);
+    rule_dpif_lookup(facet->ofproto, &facet->flow, NULL, &rule);
      xlate_in_init(&xin, facet->ofproto, &facet->flow, rule, 0, NULL);
      xlate_actions(&xin, &xout);
+    rule_release(rule);
  
-    fail_open = rule->up.cr.priority == FAIL_OPEN_PRIORITY;
      ok = ofpbuf_equal(&facet->xout.odp_actions, &xout.odp_actions)
-        && facet->xout.slow == xout.slow
-        && facet->fail_open == fail_open;
+        && facet->xout.slow == xout.slow;
      if (!ok && !VLOG_DROP_WARN(&rl)) {
          struct ds s = DS_EMPTY_INITIALIZER;
  
@@ -4667,10 +4266,6 @@ facet_check_consistency(struct facet *facet)
              ds_put_format(&s, " slow path incorrect. should be %d", xout.slow);
          }
  
-        if (facet->fail_open != fail_open) {
-            ds_put_format(&s, " fail open incorrect. should be %s",
-                          fail_open ? "true" : "false");
-        }
          ds_destroy(&s);
      }
      xlate_out_uninit(&xout);
@@ -4722,7 +4317,7 @@ facet_revalidate(struct facet *facet)
      }
  
      flow_wildcards_init_catchall(&wc);
-    new_rule = rule_dpif_lookup(ofproto, &facet->flow, &wc);
+    rule_dpif_lookup(ofproto, &facet->flow, &wc, &new_rule);
  
      /* Calculate new datapath actions.
       *
@@ -4745,6 +4340,7 @@ facet_revalidate(struct facet *facet)
          || memcmp(&facet->xout.wc, &xout.wc, sizeof xout.wc)) {
          facet_remove(facet);
          xlate_out_uninit(&xout);
+        rule_release(new_rule);
          return false;
      }
  
@@ -4774,9 +4370,9 @@ facet_revalidate(struct facet *facet)
      facet->xout.mirrors = xout.mirrors;
      facet->nf_flow.output_iface = facet->xout.nf_output_iface;
      facet->used = MAX(facet->used, new_rule->up.created);
-    facet->fail_open = new_rule->up.cr.priority == FAIL_OPEN_PRIORITY;
  
      xlate_out_uninit(&xout);
+    rule_release(new_rule);
      return true;
  }
  
@@ -4790,6 +4386,28 @@ facet_reset_counters(struct facet *facet)
      facet->accounted_bytes = 0;
  }
  
+static void
+flow_push_stats(struct ofproto_dpif *ofproto, struct flow *flow,
+                struct dpif_flow_stats *stats, bool may_learn)
+{
+    struct ofport_dpif *in_port;
+    struct rule_dpif *rule;
+    struct xlate_in xin;
+
+    in_port = get_ofp_port(ofproto, flow->in_port.ofp_port);
+    if (in_port && in_port->is_tunnel) {
+        netdev_vport_inc_rx(in_port->up.netdev, stats);
+    }
+
+    rule_dpif_lookup(ofproto, flow, NULL, &rule);
+    rule_credit_stats(rule, stats);
+    xlate_in_init(&xin, ofproto, flow, rule, stats->tcp_flags, NULL);
+    xin.resubmit_stats = stats;
+    xin.may_learn = may_learn;
+    xlate_actions_for_side_effects(&xin);
+    rule_release(rule);
+}
+
  static void
  facet_push_stats(struct facet *facet, bool may_learn)
  {
@@ -4805,33 +4423,16 @@ facet_push_stats(struct facet *facet, bool may_learn)
      stats.tcp_flags = facet->tcp_flags;
  
      if (may_learn || stats.n_packets || facet->used > facet->prev_used) {
-        struct ofproto_dpif *ofproto = facet->ofproto;
-        struct ofport_dpif *in_port;
-        struct rule_dpif *rule;
-        struct xlate_in xin;
-
          facet->prev_packet_count = facet->packet_count;
          facet->prev_byte_count = facet->byte_count;
          facet->prev_used = facet->used;
  
-        in_port = get_ofp_port(ofproto, facet->flow.in_port.ofp_port);
-        if (in_port && in_port->is_tunnel) {
-            netdev_vport_inc_rx(in_port->up.netdev, &stats);
-        }
-
-        rule = rule_dpif_lookup(ofproto, &facet->flow, NULL);
-        rule_credit_stats(rule, &stats);
-        netflow_flow_update_time(ofproto->netflow, &facet->nf_flow,
+        netflow_flow_update_time(facet->ofproto->netflow, &facet->nf_flow,
                                   facet->used);
          netflow_flow_update_flags(&facet->nf_flow, facet->tcp_flags);
-        mirror_update_stats(ofproto->mbridge, facet->xout.mirrors,
+        mirror_update_stats(facet->ofproto->mbridge, facet->xout.mirrors,
                              stats.n_packets, stats.n_bytes);
-
-        xlate_in_init(&xin, ofproto, &facet->flow, rule, stats.tcp_flags,
-                      NULL);
-        xin.resubmit_stats = &stats;
-        xin.may_learn = may_learn;
-        xlate_actions_for_side_effects(&xin);
+        flow_push_stats(facet->ofproto, &facet->flow, &stats, may_learn);
      }
  }
  
@@ -4903,8 +4504,7 @@ subfacet_find(struct dpif_backer *backer, const struct nlattr *key,
   * existing subfacet if there is one, otherwise creates and returns a
   * new subfacet. */
  static struct subfacet *
-subfacet_create(struct facet *facet, struct flow_miss *miss,
-                long long int now)
+subfacet_create(struct facet *facet, struct flow_miss *miss)
  {
      struct dpif_backer *backer = miss->ofproto->backer;
      enum odp_key_fitness key_fitness = miss->key_fitness;
@@ -4938,8 +4538,8 @@ subfacet_create(struct facet *facet, struct flow_miss *miss,
      subfacet->key_fitness = key_fitness;
      subfacet->key = xmemdup(key, key_len);
      subfacet->key_len = key_len;
-    subfacet->used = now;
-    subfacet->created = now;
+    subfacet->used = miss->stats.used;
+    subfacet->created = subfacet->used;
      subfacet->dp_packet_count = 0;
      subfacet->dp_byte_count = 0;
      subfacet->path = SF_NOT_INSTALLED;
@@ -5132,16 +4732,14 @@ subfacet_update_stats(struct subfacet *subfacet,
  
  /* Lookup 'flow' in 'ofproto''s classifier.  If 'wc' is non-null, sets
   * the fields that were relevant as part of the lookup. */
-static struct rule_dpif *
+void
  rule_dpif_lookup(struct ofproto_dpif *ofproto, const struct flow *flow,
-                 struct flow_wildcards *wc)
+                 struct flow_wildcards *wc, struct rule_dpif **rule)
  {
      struct ofport_dpif *port;
-    struct rule_dpif *rule;
  
-    rule = rule_dpif_lookup_in_table(ofproto, flow, wc, 0);
-    if (rule) {
-        return rule;
+    if (rule_dpif_lookup_in_table(ofproto, flow, wc, 0, rule)) {
+        return;
      }
      port = get_ofp_port(ofproto, flow->in_port.ofp_port);
      if (!port) {
@@ -5149,21 +4747,24 @@ rule_dpif_lookup(struct ofproto_dpif *ofproto, const struct flow *flow,
                       flow->in_port.ofp_port);
      }
  
-    return choose_miss_rule(port ? port->up.pp.config : 0, ofproto->miss_rule,
-                            ofproto->no_packet_in_rule);
+    *rule = choose_miss_rule(port ? port->up.pp.config : 0, ofproto->miss_rule,
+                             ofproto->no_packet_in_rule);
+    ovs_rwlock_rdlock(&(*rule)->up.evict);
  }
  
-struct rule_dpif *
+bool
  rule_dpif_lookup_in_table(struct ofproto_dpif *ofproto,
                            const struct flow *flow, struct flow_wildcards *wc,
-                          uint8_t table_id)
+                          uint8_t table_id, struct rule_dpif **rule)
+    OVS_ACQ_RDLOCK((*rule)->up.evict)
  {
      struct cls_rule *cls_rule;
      struct classifier *cls;
      bool frag;
  
+    *rule = NULL;
      if (table_id >= N_TABLES) {
-        return NULL;
+        return false;
      }
  
      if (wc) {
@@ -5172,26 +4773,32 @@ rule_dpif_lookup_in_table(struct ofproto_dpif *ofproto,
      }
  
      cls = &ofproto->up.tables[table_id].cls;
+    ovs_rwlock_rdlock(&cls->rwlock);
      frag = (flow->nw_frag & FLOW_NW_FRAG_ANY) != 0;
      if (frag && ofproto->up.frag_handling == OFPC_FRAG_NORMAL) {
          /* We must pretend that transport ports are unavailable. */
          struct flow ofpc_normal_flow = *flow;
          ofpc_normal_flow.tp_src = htons(0);
          ofpc_normal_flow.tp_dst = htons(0);
-        ovs_rwlock_rdlock(&cls->rwlock);
          cls_rule = classifier_lookup(cls, &ofpc_normal_flow, wc);
-        ovs_rwlock_unlock(&cls->rwlock);
      } else if (frag && ofproto->up.frag_handling == OFPC_FRAG_DROP) {
          cls_rule = &ofproto->drop_frags_rule->up.cr;
          if (wc) {
              flow_wildcards_init_exact(wc);
          }
      } else {
-        ovs_rwlock_rdlock(&cls->rwlock);
          cls_rule = classifier_lookup(cls, flow, wc);
-        ovs_rwlock_unlock(&cls->rwlock);
      }
-    return rule_dpif_cast(rule_from_cls_rule(cls_rule));
+
+    *rule = rule_dpif_cast(rule_from_cls_rule(cls_rule));
+    if (*rule && ovs_rwlock_tryrdlock(&(*rule)->up.evict)) {
+        /* The rule is in the process of being removed.  Best we can do is
+         * pretend it isn't there. */
+        *rule = NULL;
+    }
+    ovs_rwlock_unlock(&cls->rwlock);
+
+    return *rule != NULL;
  }
  
  /* Given a port configuration (specified as zero if there's no port), chooses
@@ -5204,6 +4811,14 @@ choose_miss_rule(enum ofputil_port_config config, struct rule_dpif *miss_rule,
      return config & OFPUTIL_PC_NO_PACKET_IN ? no_packet_in_rule : miss_rule;
  }
  
+void
+rule_release(struct rule_dpif *rule)
+{
+    if (rule) {
+        ovs_rwlock_unlock(&rule->up.evict);
+    }
+}
+
  static void
  complete_operation(struct rule_dpif *rule)
  {
@@ -5798,7 +5413,7 @@ ofproto_unixctl_trace(struct unixctl_conn *conn, int argc, const char *argv[],
  
              /* Use the metadata from the flow and the packet argument
               * to reconstruct the flow. */
-            flow_extract(packet, flow.skb_priority, flow.skb_mark, NULL,
+            flow_extract(packet, flow.skb_priority, flow.pkt_mark, NULL,
                           &in_port_, &flow);
          }
      }
@@ -5818,12 +5433,14 @@ ofproto_trace(struct ofproto_dpif *ofproto, const struct flow *flow,
                const struct ofpbuf *packet, struct ds *ds)
  {
      struct rule_dpif *rule;
+    struct flow_wildcards wc;
  
      ds_put_cstr(ds, "Flow: ");
      flow_format(ds, flow);
      ds_put_char(ds, '\n');
  
-    rule = rule_dpif_lookup(ofproto, flow, NULL);
+    flow_wildcards_init_catchall(&wc);
+    rule_dpif_lookup(ofproto, flow, &wc, &rule);
  
      trace_format_rule(ds, 0, rule);
      if (rule == ofproto->miss_rule) {
@@ -5853,6 +5470,7 @@ ofproto_trace(struct ofproto_dpif *ofproto, const struct flow *flow,
          trace.xin.report_hook = trace_report;
  
          xlate_actions(&trace.xin, &trace.xout);
+        flow_wildcards_or(&trace.xout.wc, &trace.xout.wc, &wc);
  
          ds_put_char(ds, '\n');
          trace_format_flow(ds, 0, "Final flow", &trace);
@@ -5893,6 +5511,8 @@ ofproto_trace(struct ofproto_dpif *ofproto, const struct flow *flow,
  
          xlate_out_uninit(&trace.xout);
      }
+
+    rule_release(rule);
  }
  
  static void
diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h

index a74146b..6a4ae07 100644 (file)
--- a/ofproto/ofproto-dpif.h
+++ b/ofproto/ofproto-dpif.h
@@ -18,6 +18,7 @@
  #include <stdint.h>
  
  #include "hmapx.h"
+#include "odp-util.h"
  #include "ofproto/ofproto-provider.h"
  #include "ovs-thread.h"
  #include "timer.h"
@@ -29,6 +30,34 @@ struct ofproto_dpif;
  struct ofport_dpif;
  struct dpif_backer;
  
+/* Ofproto-dpif -- DPIF based ofproto implementation.
+ *
+ * Ofproto-dpif provides an ofproto implementation for those platforms which
+ * implement the netdev and dpif interface defined in netdev.h and dpif.h.  The
+ * most important of these is the Linux Kernel Module (dpif-linux), but
+ * alternatives are supported such as a userspace only implementation
+ * (dpif-netdev), and a dummy implementation used for unit testing.
+ *
+ * Ofproto-dpif is divided into three major chunks.
+ *
+ * - ofproto-dpif.c
+ *   The main ofproto-dpif module is responsible for implementing the
+ *   provider interface, installing and removing datapath flows, maintaining
+ *   packet statistics, running protocols (BFD, LACP, STP, etc), and
+ *   configuring relevant submodules.
+ *
+ * - ofproto-dpif-upcall.c
+ *   Ofproto-dpif-upcall is responsible for retrieving upcalls from the kernel,
+ *   processing miss upcalls, and handing more complex ones up to the main
+ *   ofproto-dpif module.  Miss upcall processing boils down to figuring out
+ *   what each packet's actions are, executing them (i.e. asking the kernel to
+ *   forward it), and handing it up to ofproto-dpif to decide whether or not
+ *   to install a kernel flow.
+ *
+ * - ofproto-dpif-xlate.c
+ *   Ofproto-dpif-xlate is responsible for translating OpenFlow
+ *   actions into datapath actions. */
+
  struct rule_dpif {
      struct rule up;
  
@@ -55,10 +84,16 @@ static inline struct rule_dpif *rule_dpif_cast(const struct rule *rule)
      return rule ? CONTAINER_OF(rule, struct rule_dpif, up) : NULL;
  }
  
-struct rule_dpif *rule_dpif_lookup_in_table(struct ofproto_dpif *,
-                                            const struct flow *,
-                                            struct flow_wildcards *,
-                                            uint8_t table_id);
+void rule_dpif_lookup(struct ofproto_dpif *, const struct flow *,
+                      struct flow_wildcards *, struct rule_dpif **rule)
+    OVS_ACQ_RDLOCK((*rule)->up.evict);
+
+bool rule_dpif_lookup_in_table(struct ofproto_dpif *, const struct flow *,
+                               struct flow_wildcards *, uint8_t table_id,
+                               struct rule_dpif **rule)
+    OVS_ACQ_RDLOCK((*rule)->up.evict);
+
+void rule_release(struct rule_dpif *rule) OVS_RELEASES(rule->up.evict);
  
  void rule_credit_stats(struct rule_dpif *, const struct dpif_flow_stats *);
  
diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h

index f081482..aa262bc 100644 (file)
--- a/ofproto/ofproto-provider.h
+++ b/ofproto/ofproto-provider.h
@@ -230,10 +230,18 @@ struct rule {
      uint16_t idle_timeout OVS_GUARDED; /* In seconds from ->used. */
  
      /* Eviction groups. */
-    bool evictable;              /* If false, prevents eviction. */
      struct heap_node evg_node;   /* In eviction_group's "rules" heap. */
      struct eviction_group *eviction_group; /* NULL if not in any group. */
  
+    /* The evict lock is used to prevent rules from being evicted while child
+     * threads are using them to xlate flows.  A read lock means the rule is
+     * currently being used.  A write lock means the rule is in the process of
+     * being evicted and should be considered gone.  A rule will not be evicted
+     * unless both its own and its classifier's write locks are held.
+     * Therefore, while holding a classifier readlock, one can be assured that
+     * even write locked rules are safe. */
+    struct ovs_rwlock evict;
+
      struct ofpact *ofpacts;      /* Sequence of "struct ofpacts". */
      unsigned int ofpacts_len;    /* Size of 'ofpacts', in bytes. */
  
@@ -265,7 +273,8 @@ rule_from_cls_rule(const struct cls_rule *cls_rule)
  }
  
  void ofproto_rule_update_used(struct rule *, long long int used);
-void ofproto_rule_expire(struct rule *, uint8_t reason);
+void ofproto_rule_expire(struct rule *rule, uint8_t reason)
+    OVS_RELEASES(rule->evict);
  void ofproto_rule_destroy(struct ofproto *, struct classifier *cls,
                            struct rule *) OVS_REQ_WRLOCK(cls->rwlock);
  
diff --git a/ofproto/ofproto-unixctl.man b/ofproto/ofproto-unixctl.man

index 8141de9..dd8e8d8 100644 (file)
--- a/ofproto/ofproto-unixctl.man
+++ b/ofproto/ofproto-unixctl.man
@@ -86,8 +86,8 @@ only metadata. The metadata can be:
  .RS
  .IP \fIskb_priority\fR
  Packet QoS priority.
-.IP \fIskb_mark\fR
-SKB mark of the packet.
+.IP \fIpkt_mark\fR
+Mark of the packet.
  .IP \fItun_id\fR
  The tunnel ID on which the packet arrived.
  .IP \fIin_port\fR
diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c

index 3cdc72c..bbdb2d2 100644 (file)
--- a/ofproto/ofproto.c
+++ b/ofproto/ofproto.c
@@ -152,10 +152,10 @@ static void oftable_enable_eviction(struct oftable *,
                                      const struct mf_subfield *fields,
                                      size_t n_fields);
  
-static void oftable_remove_rule(struct rule *);
+static void oftable_remove_rule(struct rule *rule) OVS_RELEASES(rule->evict);
  static void oftable_remove_rule__(struct ofproto *ofproto,
                                    struct classifier *cls, struct rule *rule)
-    OVS_REQ_WRLOCK(cls->rwlock);
+    OVS_REQ_WRLOCK(cls->rwlock) OVS_RELEASES(rule->evict);
  static struct rule *oftable_replace_rule(struct rule *);
  static void oftable_substitute_rule(struct rule *old, struct rule *new);
  
@@ -181,7 +181,8 @@ struct eviction_group {
      struct heap rules;          /* Contains "struct rule"s. */
  };
  
-static struct rule *choose_rule_to_evict(struct oftable *);
+static bool choose_rule_to_evict(struct oftable *table, struct rule **rulep)
+    OVS_TRY_WRLOCK(true, (*rulep)->evict);
  static void ofproto_evict(struct ofproto *);
  static uint32_t rule_eviction_priority(struct rule *);
  
@@ -202,8 +203,9 @@ static bool rule_is_modifiable(const struct rule *);
  static enum ofperr add_flow(struct ofproto *, struct ofconn *,
                              struct ofputil_flow_mod *,
                              const struct ofp_header *);
-static void delete_flow__(struct rule *, struct ofopgroup *,
-                          enum ofp_flow_removed_reason);
+static void delete_flow__(struct rule *rule, struct ofopgroup *,
+                          enum ofp_flow_removed_reason)
+    OVS_RELEASES(rule->evict);
  static bool handle_openflow(struct ofconn *, const struct ofpbuf *);
  static enum ofperr handle_flow_mod__(struct ofproto *, struct ofconn *,
                                       struct ofputil_flow_mod *,
@@ -1077,6 +1079,7 @@ ofproto_flush__(struct ofproto *ofproto)
              if (!rule->pending) {
                  ofoperation_create(group, rule, OFOPERATION_DELETE,
                                     OFPRR_DELETE);
+                ovs_rwlock_wrlock(&rule->evict);
                  oftable_remove_rule__(ofproto, &table->cls, rule);
                  ofproto->ofproto_class->rule_destruct(rule);
              }
@@ -1678,6 +1681,7 @@ ofproto_delete_flow(struct ofproto *ofproto,
          /* Initiate deletion -> success. */
          struct ofopgroup *group = ofopgroup_create_unattached(ofproto);
          ofoperation_create(group, rule, OFOPERATION_DELETE, OFPRR_DELETE);
+        ovs_rwlock_wrlock(&rule->evict);
          oftable_remove_rule(rule);
          ofproto->ofproto_class->rule_destruct(rule);
          ofopgroup_submit(group);
@@ -2181,6 +2185,7 @@ ofproto_rule_destroy__(struct rule *rule)
          cls_rule_destroy(&rule->cr);
          free(rule->ofpacts);
          ovs_mutex_destroy(&rule->timeout_mutex);
+        ovs_rwlock_destroy(&rule->evict);
          rule->ofproto->ofproto_class->rule_dealloc(rule);
      }
  }
@@ -2198,7 +2203,11 @@ ofproto_rule_destroy(struct ofproto *ofproto, struct classifier *cls,
                       struct rule *rule) OVS_REQ_WRLOCK(cls->rwlock)
  {
      ovs_assert(!rule->pending);
-    oftable_remove_rule__(ofproto, cls, rule);
+    if (!ovs_rwlock_trywrlock(&rule->evict)) {
+        oftable_remove_rule__(ofproto, cls, rule);
+    } else {
+        NOT_REACHED();
+    }
      ofproto_rule_destroy__(rule);
  }
  
@@ -3423,12 +3432,12 @@ add_flow(struct ofproto *ofproto, struct ofconn *ofconn,
      rule->ofpacts_len = fm->ofpacts_len;
      rule->meter_id = find_meter(rule->ofpacts, rule->ofpacts_len);
      list_init(&rule->meter_list_node);
-    rule->evictable = true;
      rule->eviction_group = NULL;
      list_init(&rule->expirable);
      rule->monitor_flags = 0;
      rule->add_seqno = 0;
      rule->modify_seqno = 0;
+    ovs_rwlock_init(&rule->evict);
  
      /* Insert new rule. */
      victim = oftable_replace_rule(rule);
@@ -3445,19 +3454,18 @@ add_flow(struct ofproto *ofproto, struct ofconn *ofconn,
          n_rules = classifier_count(&table->cls);
          ovs_rwlock_unlock(&table->cls.rwlock);
          if (n_rules > table->max_flows) {
-            bool was_evictable;
-
-            was_evictable = rule->evictable;
-            rule->evictable = false;
-            evict = choose_rule_to_evict(table);
-            rule->evictable = was_evictable;
-
-            if (!evict) {
+            ovs_rwlock_rdlock(&rule->evict);
+            if (choose_rule_to_evict(table, &evict)) {
+                ovs_rwlock_unlock(&rule->evict);
+                ovs_rwlock_unlock(&evict->evict);
+                if (evict->pending) {
+                    error = OFPROTO_POSTPONE;
+                    goto exit;
+                }
+            } else {
+                ovs_rwlock_unlock(&rule->evict);
                  error = OFPERR_OFPFMFC_TABLE_FULL;
                  goto exit;
-            } else if (evict->pending) {
-                error = OFPROTO_POSTPONE;
-                goto exit;
              }
          } else {
              evict = NULL;
@@ -3472,6 +3480,13 @@ add_flow(struct ofproto *ofproto, struct ofconn *ofconn,
              op->group->n_running--;
              ofoperation_destroy(rule->pending);
          } else if (evict) {
+            /* It would be better if we maintained the lock we took in
+             * choose_rule_to_evict() earlier, but that confuses the thread
+             * safety analysis, and this code is fragile enough that we really
+             * need it.  In the worst case, we'll have to block a little while
+             * before we perform the eviction, which doesn't seem like a big
+             * problem. */
+            ovs_rwlock_wrlock(&evict->evict);
              delete_flow__(evict, group, OFPRR_EVICTION);
          }
          ofopgroup_submit(group);
@@ -3642,6 +3657,7 @@ delete_flows__(struct ofproto *ofproto, struct ofconn *ofconn,
  
      group = ofopgroup_create(ofproto, ofconn, request, UINT32_MAX);
      LIST_FOR_EACH_SAFE (rule, next, ofproto_node, rules) {
+        ovs_rwlock_wrlock(&rule->evict);
          delete_flow__(rule, group, reason);
      }
      ofopgroup_submit(group);
@@ -5065,17 +5081,18 @@ pick_fallback_dpid(void)
  \f
  /* Table overflow policy. */
  
-/* Chooses and returns a rule to evict from 'table'.  Returns NULL if the table
- * is not configured to evict rules or if the table contains no evictable
- * rules.  (Rules with 'evictable' set to false or with no timeouts are not
- * evictable.) */
-static struct rule *
-choose_rule_to_evict(struct oftable *table)
+/* Chooses a rule to evict from 'table', stores it in '*rulep' with its 'evict'
+ * rwlock write-locked, and returns true.  Otherwise sets '*rulep' to NULL and
+ * returns false: eviction is disabled or no rule is evictable.  (Rules whose
+ * 'evict' rwlock is already held, or with no timeouts, are not evictable.) */
+static bool
+choose_rule_to_evict(struct oftable *table, struct rule **rulep)
  {
      struct eviction_group *evg;
  
+    *rulep = NULL;
      if (!table->eviction_fields) {
-        return NULL;
+        return false;
      }
  
      /* In the common case, the outer and inner loops here will each be entered
@@ -5094,13 +5111,14 @@ choose_rule_to_evict(struct oftable *table)
          struct rule *rule;
  
          HEAP_FOR_EACH (rule, evg_node, &evg->rules) {
-            if (rule->evictable) {
-                return rule;
+            if (!ovs_rwlock_trywrlock(&rule->evict)) {
+                *rulep = rule;
+                return true;
              }
          }
      }
  
-    return NULL;
+    return false;
  }
  
  /* Searches 'ofproto' for tables that have more flows than their configured
@@ -5129,8 +5147,12 @@ ofproto_evict(struct ofproto *ofproto)
                  break;
              }
  
-            rule = choose_rule_to_evict(table);
-            if (!rule || rule->pending) {
+            if (!choose_rule_to_evict(table, &rule)) {
+                break;
+            }
+
+            if (rule->pending) {
+                ovs_rwlock_unlock(&rule->evict);
                  break;
              }
  
@@ -5437,7 +5459,8 @@ oftable_enable_eviction(struct oftable *table,
  /* Removes 'rule' from the oftable that contains it. */
  static void
  oftable_remove_rule__(struct ofproto *ofproto, struct classifier *cls,
-                      struct rule *rule) OVS_REQ_WRLOCK(cls->rwlock)
+                      struct rule *rule)
+    OVS_REQ_WRLOCK(cls->rwlock) OVS_RELEASES(rule->evict)
  {
      classifier_remove(cls, &rule->cr);
      if (rule->meter_id) {
@@ -5453,6 +5476,7 @@ oftable_remove_rule__(struct ofproto *ofproto, struct classifier *cls,
      if (!list_is_empty(&rule->meter_list_node)) {
          list_remove(&rule->meter_list_node);
      }
+    ovs_rwlock_unlock(&rule->evict);
  }
  
  static void
@@ -5518,6 +5542,7 @@ oftable_substitute_rule(struct rule *old, struct rule *new)
      if (new) {
          oftable_replace_rule(new);
      } else {
+        ovs_rwlock_wrlock(&old->evict);
          oftable_remove_rule(old);
      }
  }
diff --git a/ofproto/tunnel.c b/ofproto/tunnel.c

index c23e2d7..0ba0066 100644 (file)
--- a/ofproto/tunnel.c
+++ b/ofproto/tunnel.c
@@ -31,12 +31,15 @@
  
  VLOG_DEFINE_THIS_MODULE(tunnel);
  
+/* skb mark used for IPsec tunnel packets */
+#define IPSEC_MARK 1
+
  struct tnl_match {
      ovs_be64 in_key;
      ovs_be32 ip_src;
      ovs_be32 ip_dst;
      odp_port_t odp_port;
-    uint32_t skb_mark;
+    uint32_t pkt_mark;
      bool in_key_flow;
      bool ip_src_flow;
      bool ip_dst_flow;
@@ -101,7 +104,7 @@ tnl_port_add__(const struct ofport_dpif *ofport, const struct netdev *netdev,
      tnl_port->match.ip_dst = cfg->ip_dst;
      tnl_port->match.ip_src_flow = cfg->ip_src_flow;
      tnl_port->match.ip_dst_flow = cfg->ip_dst_flow;
-    tnl_port->match.skb_mark = cfg->ipsec ? IPSEC_MARK : 0;
+    tnl_port->match.pkt_mark = cfg->ipsec ? IPSEC_MARK : 0;
      tnl_port->match.in_key_flow = cfg->in_key_flow;
      tnl_port->match.odp_port = odp_port;
  
@@ -213,7 +216,7 @@ tnl_port_receive(const struct flow *flow) OVS_EXCLUDED(rwlock)
      match.ip_src = flow->tunnel.ip_dst;
      match.ip_dst = flow->tunnel.ip_src;
      match.in_key = flow->tunnel.tun_id;
-    match.skb_mark = flow->skb_mark;
+    match.pkt_mark = flow->pkt_mark;
  
      ovs_rwlock_rdlock(&rwlock);
      tnl_port = tnl_find(&match);
@@ -249,6 +252,46 @@ out:
      return ofport;
  }
  
+static bool
+tnl_ecn_ok(const struct flow *base_flow, struct flow *flow)
+{
+    if (is_ip_any(base_flow)
+        && (flow->tunnel.ip_tos & IP_ECN_MASK) == IP_ECN_CE) {
+        if ((base_flow->nw_tos & IP_ECN_MASK) == IP_ECN_NOT_ECT) {
+            VLOG_WARN_RL(&rl, "dropping tunnel packet marked ECN CE"
+                         " but is not ECN capable");
+            return false;
+        } else {
+            /* Set the ECN CE value in the tunneled packet. */
+            flow->nw_tos |= IP_ECN_CE;
+        }
+    }
+
+    return true;
+}
+
+/* Should be called at the beginning of action translation.  If 'flow' was
+ * received on a tunnel port, un-wildcards the tunnel and pkt_mark fields in
+ * 'wc', applies tunnel ECN handling, and clears IPSEC_MARK in 'flow'.
+ * Returns false if the packet must be dropped. */
+bool
+tnl_xlate_init(const struct flow *base_flow, struct flow *flow,
+               struct flow_wildcards *wc)
+{
+    if (tnl_port_should_receive(flow)) {
+        memset(&wc->masks.tunnel, 0xff, sizeof wc->masks.tunnel);
+        memset(&wc->masks.pkt_mark, 0xff, sizeof wc->masks.pkt_mark);
+
+        if (!tnl_ecn_ok(base_flow, flow)) {
+            return false;
+        }
+
+        flow->pkt_mark &= ~IPSEC_MARK;
+    }
+
+    return true;
+}
+
  /* Given that 'flow' should be output to the ofport corresponding to
   * 'tnl_port', updates 'flow''s tunnel headers and returns the actual datapath
   * port that the output should happen on.  May return ODPP_NONE if the output
@@ -282,7 +325,7 @@ tnl_port_send(const struct ofport_dpif *ofport, struct flow *flow,
      if (!cfg->ip_dst_flow) {
          flow->tunnel.ip_dst = tnl_port->match.ip_dst;
      }
-    flow->skb_mark = tnl_port->match.skb_mark;
+    flow->pkt_mark = tnl_port->match.pkt_mark;
  
      if (!cfg->out_key_flow) {
          flow->tunnel.tun_id = cfg->out_key;
@@ -444,7 +487,7 @@ tnl_match_fmt(const struct tnl_match *match, struct ds *ds)
      }
  
      ds_put_format(ds, ", dp port=%"PRIu32, match->odp_port);
-    ds_put_format(ds, ", skb mark=%"PRIu32, match->skb_mark);
+    ds_put_format(ds, ", pkt mark=%"PRIu32, match->pkt_mark);
  }
  
  static void
diff --git a/ofproto/tunnel.h b/ofproto/tunnel.h

index f175f1a..27a2f7d 100644 (file)
--- a/ofproto/tunnel.h
+++ b/ofproto/tunnel.h
@@ -20,9 +20,6 @@
  #include <stdint.h>
  #include "flow.h"
  
-/* skb mark used for IPsec tunnel packets */
-#define IPSEC_MARK 1
-
  /* Tunnel port emulation layer.
   *
   * These functions emulate tunnel virtual ports based on the outer
@@ -39,6 +36,8 @@ void tnl_port_add(const struct ofport_dpif *, const struct netdev *,
  void tnl_port_del(const struct ofport_dpif *);
  
  const struct ofport_dpif *tnl_port_receive(const struct flow *);
+bool tnl_xlate_init(const struct flow *base_flow, struct flow *flow,
+                    struct flow_wildcards *);
  odp_port_t tnl_port_send(const struct ofport_dpif *, struct flow *,
                           struct flow_wildcards *wc);
  
diff --git a/ovsdb/ovsdbmonitor/.gitignore b/ovsdb/ovsdbmonitor/.gitignore

index d6f433b..e02ced0 100644 (file)
--- a/ovsdb/ovsdbmonitor/.gitignore
+++ b/ovsdb/ovsdbmonitor/.gitignore
@@ -1 +1 @@
-/ovsdbmonitor.py
+/ovsdbmonitor
diff --git a/tests/bfd.at b/tests/bfd.at

index c54fff0..fb8b1d3 100644 (file)
--- a/tests/bfd.at
+++ b/tests/bfd.at
@@ -213,3 +213,37 @@ BFD_CHECK_RX([p0], [1000ms], [300ms])
  
  OVS_VSWITCHD_STOP
  AT_CLEANUP
+
+AT_SETUP([bfd - check_tnl_key])
+OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=gre \
+                    options:remote_ip=2.2.2.2 options:key=1 ofport_request=1 -- \
+                    set interface p1 bfd:enable=true -- \
+                    set bridge br0 fail-mode=standalone])
+
+# By default check_tnl_key is false, so we should process a BFD packet with tun_id=1.
+AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'tunnel(tun_id=0x1,src=2.2.2.2,dst=2.2.2.1,tos=0x0,ttl=64,flags(key)),in_port(1),skb_mark(0/0),eth(src=00:11:22:33:44:55,dst=00:23:20:00:00:01),eth_type(0x0800),ipv4(src=169.254.1.0/0.0.0.0,dst=169.254.1.1/0.0.0.0,proto=17/0xff,tos=0/0,ttl=255/0,frag=no/0xff),udp(src=49152/0,dst=3784/0xffff)' -generate], [0], [stdout])
+# Check that the packet is handled as a BFD packet.
+AT_CHECK([tail -2 stdout], [0], [dnl
+This flow is handled by the userspace slow path because it:
+       - Consists of BFD packets.
+], [])
+
+# turn on the check_tnl_key.
+AT_CHECK([ovs-vsctl set interface p1 bfd:check_tnl_key=true])
+AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'tunnel(tun_id=0x1,src=2.2.2.2,dst=2.2.2.1,tos=0x0,ttl=64,flags(key)),in_port(1),skb_mark(0/0),eth(src=00:11:22:33:44:55,dst=00:23:20:00:00:01),eth_type(0x0800),ipv4(src=169.254.1.0/0.0.0.0,dst=169.254.1.1/0.0.0.0,proto=17/0xff,tos=0/0,ttl=255/0,frag=no/0xff),udp(src=49152/0,dst=3784/0xffff)' -generate], [0], [stdout])
+# Check that the packet is handled as a normal packet.
+AT_CHECK([tail -1 stdout], [0],[dnl
+Datapath actions: 100
+], [])
+
+# set the tunnel key to 0.
+AT_CHECK([ovs-vsctl set interface p1 options:key=0])
+AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'tunnel(tun_id=0x0,src=2.2.2.2,dst=2.2.2.1,tos=0x0,ttl=64,flags(key)),in_port(1),skb_mark(0/0),eth(src=00:11:22:33:44:55,dst=00:23:20:00:00:01),eth_type(0x0800),ipv4(src=169.254.1.0/0.0.0.0,dst=169.254.1.1/0.0.0.0,proto=17/0xff,tos=0/0,ttl=255/0,frag=no/0xff),udp(src=49152/0,dst=3784/0xffff)' -generate], [0], [stdout])
+# Check that the packet is handled as a BFD packet.
+AT_CHECK([tail -2 stdout], [0], [dnl
+This flow is handled by the userspace slow path because it:
+       - Consists of BFD packets.
+], [])
+
+OVS_VSWITCHD_STOP
+AT_CLEANUP
diff --git a/tests/learn.at b/tests/learn.at

index 7e781c3..fc8d071 100644 (file)
--- a/tests/learn.at
+++ b/tests/learn.at
@@ -291,12 +291,14 @@ AT_CHECK([[ovs-ofctl add-flow br0 'actions=load:3->NXM_NX_REG0[0..15],learn(tabl
  # Trace some packets arriving.  The particular packets don't matter.
  for i in 1 2 3 4 5 6 7 8 9 10; do
      ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=9)'
+    ovs-appctl time/warp 10
  done
  
  # Check for the learning entry.
+ovs-appctl time/warp 1000
  AT_CHECK([ovs-ofctl dump-flows br0 | ofctl_strip | sort], [0],
  [[ n_packets=1, n_bytes=60, actions=load:0x3->NXM_NX_REG0[0..15],learn(table=0,priority=65535,NXM_OF_ETH_SRC[],NXM_OF_VLAN_TCI[0..11],output:NXM_NX_REG0[0..15]),output:2
- priority=65535,vlan_tci=0x0000/0x0fff,dl_src=50:54:00:00:00:05 actions=output:3
+ n_packets=9, n_bytes=540, priority=65535,vlan_tci=0x0000/0x0fff,dl_src=50:54:00:00:00:05 actions=output:3
  NXST_FLOW reply:
  ]])
  
diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at

index 46e1dea..b093998 100644 (file)
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -1196,7 +1196,7 @@ Datapath actions: 2
  AT_CHECK([head -n 3 stdout], [0], [dnl
  Bridge: br0
  Packet: arp,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=50:54:00:00:00:01,dl_dst=50:54:00:00:00:02,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
-Flow: skb_mark=0x2,skb_priority=0x1,arp,metadata=0,in_port=1,vlan_tci=0x0000,dl_src=50:54:00:00:00:01,dl_dst=50:54:00:00:00:02,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
+Flow: pkt_mark=0x2,skb_priority=0x1,arp,metadata=0,in_port=1,vlan_tci=0x0000,dl_src=50:54:00:00:00:01,dl_dst=50:54:00:00:00:02,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
  ])
  
  # Test command: ofproto/trace dp_name odp_flow packet
@@ -1208,18 +1208,18 @@ Datapath actions: 2
  AT_CHECK([head -n 3 stdout], [0], [dnl
  Bridge: br0
  Packet: arp,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=50:54:00:00:00:01,dl_dst=50:54:00:00:00:02,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
-Flow: skb_mark=0x2,skb_priority=0x1,arp,metadata=0,in_port=1,vlan_tci=0x0000,dl_src=50:54:00:00:00:01,dl_dst=50:54:00:00:00:02,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
+Flow: pkt_mark=0x2,skb_priority=0x1,arp,metadata=0,in_port=1,vlan_tci=0x0000,dl_src=50:54:00:00:00:01,dl_dst=50:54:00:00:00:02,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
  ])
  
  # Test command: ofproto/trace br_name br_flow packet
  AT_CHECK([ovs-appctl ofproto/trace br0 \
-  "in_port=2,skb_priority=2,skb_mark=1" "$pkt2to1"], [0], [stdout],[stderr])
+  "in_port=2,skb_priority=2,pkt_mark=1" "$pkt2to1"], [0], [stdout],[stderr])
  AT_CHECK([tail -1 stdout], [0], [dnl
-Datapath actions: set(skb_mark(0)),1
+Datapath actions: 1
  ])
  AT_CHECK([head -n 2 stdout], [0], [dnl
  Packet: arp,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=50:54:00:00:00:02,dl_dst=50:54:00:00:00:01,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
-Flow: skb_mark=0x1,skb_priority=0x2,arp,metadata=0,in_port=2,vlan_tci=0x0000,dl_src=50:54:00:00:00:02,dl_dst=50:54:00:00:00:01,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
+Flow: pkt_mark=0x1,skb_priority=0x2,arp,metadata=0,in_port=2,vlan_tci=0x0000,dl_src=50:54:00:00:00:02,dl_dst=50:54:00:00:00:01,arp_spa=0.0.0.0,arp_tpa=0.0.0.0,arp_sha=00:00:00:00:00:00,arp_tha=00:00:00:00:00:00
  ])
  
  OVS_VSWITCHD_STOP
@@ -2581,8 +2581,15 @@ AT_DATA([flows.txt], [dnl
  table=0 in_port=1 actions=load:2->NXM_NX_REG0[[0..15]],learn(table=1,priority=65535,NXM_OF_ETH_SRC[[]],NXM_OF_VLAN_TCI[[0..11]],output:NXM_NX_REG0[[0..15]]),output:2
  ])
  AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
-AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
-AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
+# We send each packet twice because the first packet in each flow causes the
+# flow table to change and thus triggers revalidations, which (depending on
+# timing) can keep a megaflow from being installed.  The revalidations are
+# done by the second iteration, allowing the flows to be installed.
+for i in 1 2; do
+    AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
+    AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
+    ovs-appctl time/warp 100
+done
  dnl The original flow is missing due to a revalidation.
  AT_CHECK([ovs-appctl dpif/dump-megaflows br0 | STRIP_XOUT], [0], [dnl
  skb_priority=0,ip,in_port=1,vlan_tci=0x0000/0x0fff,dl_src=50:54:00:00:00:09,nw_frag=no, n_subfacets:1, used:0.0s, Datapath actions: <del>
diff --git a/tests/ofproto.at b/tests/ofproto.at

index e2e6f1b..38bfb02 100644 (file)
--- a/tests/ofproto.at
+++ b/tests/ofproto.at
@@ -1563,14 +1563,14 @@ ovs-appctl -t ovs-ofctl ofctl/set-output-file monitor.log
  AT_CAPTURE_FILE([monitor.log])
  
  # Send a packet-out with a load action to set some metadata, and forward to controller
-AT_CHECK([ovs-ofctl packet-out br0 controller 'load(0xfafafafa5a5a5a5a->OXM_OF_METADATA[[0..63]]), controller' '0001020304050010203040501234'])
+AT_CHECK([ovs-ofctl packet-out br0 controller 'load(0xfafafafa5a5a5a5a->OXM_OF_METADATA[[0..63]]), load(0xaa->NXM_NX_PKT_MARK[[]]), controller' '0001020304050010203040501234'])
  
  # Stop the monitor and check its output.
  ovs-appctl -t ovs-ofctl ofctl/barrier
  ovs-appctl -t ovs-ofctl exit
  
  AT_CHECK([sed 's/ (xid=0x[[0-9a-fA-F]]*)//' monitor.log], [0], [dnl
-NXT_PACKET_IN: total_len=14 in_port=CONTROLLER metadata=0xfafafafa5a5a5a5a (via action) data_len=14 (unbuffered)
+NXT_PACKET_IN: total_len=14 in_port=CONTROLLER metadata=0xfafafafa5a5a5a5a pkt_mark=0xaa (via action) data_len=14 (unbuffered)
  metadata=0,in_port=0,vlan_tci=0x0000,dl_src=00:10:20:30:40:50,dl_dst=00:01:02:03:04:05,dl_type=0x1234
  OFPT_BARRIER_REPLY:
  ])
diff --git a/tests/ovs-ofctl.at b/tests/ovs-ofctl.at

index 18f9152..996ea06 100644 (file)
--- a/tests/ovs-ofctl.at
+++ b/tests/ovs-ofctl.at
@@ -12,7 +12,7 @@ for test_case in \
      'metadata=0                                  NXM,OXM' \
      'in_port=1                                   any' \
      'skb_priority=0                              none' \
-    'skb_mark=1                                  none' \
+    'pkt_mark=1                                  NXM,OXM' \
      'reg0=0                                      NXM,OXM' \
      'reg1=1                                      NXM,OXM' \
      'reg2=2                                      NXM,OXM' \
@@ -180,9 +180,9 @@ AT_CHECK([ovs-ofctl parse-flows flows.txt
  AT_CLEANUP
  
  
-AT_SETUP([ovs-ofctl parse-flows (skb_mark and skb_priority)])
+AT_SETUP([ovs-ofctl parse-flows (skb_priority)])
  AT_DATA([flows.txt], [[
-skb_mark=0x12345678,skb_priority=0x12341234,tcp,tp_src=123,actions=flood
+skb_priority=0x12341234,tcp,tp_src=123,actions=flood
  ]])
  
  AT_CHECK([ovs-ofctl parse-flows flows.txt
@@ -197,6 +197,7 @@ AT_DATA([flows.txt], [[
  # comment
  tcp,tp_src=123,actions=flood
  in_port=LOCAL dl_vlan=9 dl_src=00:0A:E4:25:6B:B0 actions=drop
+pkt_mark=0xbb,actions=set_field:0xaa->pkt_mark
  udp dl_vlan_pcp=7 idle_timeout=5 actions=strip_vlan output:0
  tcp,nw_src=192.168.0.3,tp_dst=80 actions=set_queue:37,output:1
  udp,nw_src=192.168.0.3,tp_dst=53 actions=pop_queue,output:1
@@ -232,6 +233,7 @@ AT_CHECK([[sed 's/ (xid=0x[0-9a-fA-F]*)//' stdout]], [0],
  chosen protocol: NXM+table_id
  NXT_FLOW_MOD: ADD table:255 tcp,tp_src=123 actions=FLOOD
  NXT_FLOW_MOD: ADD table:255 in_port=LOCAL,dl_vlan=9,dl_src=00:0a:e4:25:6b:b0 actions=drop
+NXT_FLOW_MOD: ADD table:255 pkt_mark=0xbb actions=load:0xaa->NXM_NX_PKT_MARK[]
  NXT_FLOW_MOD: ADD table:255 udp,dl_vlan_pcp=7 idle:5 actions=strip_vlan,output:0
  NXT_FLOW_MOD: ADD table:255 tcp,nw_src=192.168.0.3,tp_dst=80 actions=set_queue:37,output:1
  NXT_FLOW_MOD: ADD table:255 udp,nw_src=192.168.0.3,tp_dst=53 actions=pop_queue,output:1
diff --git a/utilities/ovs-ofctl.8.in b/utilities/ovs-ofctl.8.in

index 3e6c7fe..47b591a 100644 (file)
--- a/utilities/ovs-ofctl.8.in
+++ b/utilities/ovs-ofctl.8.in
@@ -806,6 +806,13 @@ exactly, and a 0-bit wildcards that bit.
  When a packet enters an OpenFlow switch, all of the registers are set
  to 0.  Only explicit Nicira extension actions change register values.
  .
+.IP \fBpkt_mark=\fIvalue\fR[\fB/\fImask\fR]
+Matches packet metadata mark \fIvalue\fR either exactly or with optional
+\fImask\fR. The mark is associated data that may be passed into other
+system components in order to facilitate interaction between subsystems.
+On Linux this corresponds to the skb mark but the exact implementation is
+platform-dependent.
+.
  .PP
  Defining IPv6 flows (those with \fBdl_type\fR equal to 0x86dd) requires
  support for NXM.  The following shorthand notations are available for
@@ -1107,7 +1114,7 @@ be specified as a name used for matching.  (This is similar to
  Open Flow 1.2 and above.)
  .
  .IP
-Example: \fBset_field:fe80:0123:4567:890a:a6ba:dbff:fefe:59fa\->ipv6_src\fR
+Example: \fBset_field:00:11:22:33:44:55->eth_src\fR.
  .
  .IP "\fBmultipath(\fIfields\fB, \fIbasis\fB, \fIalgorithm\fB, \fIn_links\fB, \fIarg\fB, \fIdst\fB[\fIstart\fB..\fIend\fB])\fR"
  Hashes \fIfields\fR using \fIbasis\fR as a universal hash parameter,
author	Giuseppe Lettieri <g.lettieri@iet.unipi.it>
	Thu, 15 Aug 2013 18:43:14 +0000 (20:43 +0200)
committer	Giuseppe Lettieri <g.lettieri@iet.unipi.it>
	Thu, 15 Aug 2013 18:43:14 +0000 (20:43 +0200)
FAQ		patch \| blob \| history
NEWS		patch \| blob \| history
acinclude.m4		patch \| blob \| history
datapath/Modules.mk		patch \| blob \| history
datapath/actions.c		patch \| blob \| history
datapath/compat.h		patch \| blob \| history
datapath/datapath.c		patch \| blob \| history
datapath/datapath.h		patch \| blob \| history
datapath/dp_notify.c		patch \| blob \| history
datapath/flow.h		patch \| blob \| history
datapath/linux/Modules.mk		patch \| blob \| history
datapath/linux/compat/include/linux/if_vlan.h		patch \| blob \| history
datapath/linux/compat/include/linux/netdev_features.h	[new file with mode: 0644]	patch \| blob
datapath/linux/compat/include/linux/netdevice.h		patch \| blob \| history
datapath/linux/compat/include/net/gre.h		patch \| blob \| history
datapath/linux/compat/include/net/ip_tunnels.h		patch \| blob \| history
datapath/linux/compat/ip_tunnels_core.c		patch \| blob \| history
datapath/tunnel.c	[deleted file]	patch \| blob \| history
datapath/tunnel.h	[deleted file]	patch \| blob \| history
datapath/vlan.h		patch \| blob \| history
datapath/vport-gre.c		patch \| blob \| history
datapath/vport-internal_dev.c		patch \| blob \| history
datapath/vport-lisp.c		patch \| blob \| history
datapath/vport-netdev.c		patch \| blob \| history
datapath/vport-vxlan.c		patch \| blob \| history
debian/rules		patch \| blob \| history
include/openflow/nicira-ext.h		patch \| blob \| history
include/sparse/pthread.h		patch \| blob \| history
lib/automake.mk		patch \| blob \| history
lib/bfd.c		patch \| blob \| history
lib/bond.c		patch \| blob \| history
lib/compiler.h		patch \| blob \| history
lib/dpif-netdev.c		patch \| blob \| history
lib/flow.c		patch \| blob \| history
lib/flow.h		patch \| blob \| history
lib/match.c		patch \| blob \| history
lib/match.h		patch \| blob \| history
lib/meta-flow.c		patch \| blob \| history
lib/meta-flow.h		patch \| blob \| history
lib/netdev-bsd.c		patch \| blob \| history
lib/netdev-dummy.c		patch \| blob \| history
lib/netdev-linux.c		patch \| blob \| history
lib/netdev-provider.h		patch \| blob \| history
lib/netdev-vport.c		patch \| blob \| history
lib/netdev-vport.h		patch \| blob \| history
lib/netdev.c		patch \| blob \| history
lib/netdev.h		patch \| blob \| history
lib/nx-match.c		patch \| blob \| history
lib/odp-execute.c		patch \| blob \| history
lib/odp-util.c		patch \| blob \| history
lib/odp-util.h		patch \| blob \| history
lib/ofp-print.c		patch \| blob \| history
lib/ofp-util.c		patch \| blob \| history
lib/ovs-thread.h		patch \| blob \| history
lib/poll-loop.c		patch \| blob \| history
lib/seq.c	[new file with mode: 0644]	patch \| blob
lib/seq.h	[new file with mode: 0644]	patch \| blob
ofproto/automake.mk		patch \| blob \| history
ofproto/ofproto-dpif-upcall.c	[new file with mode: 0644]	patch \| blob
ofproto/ofproto-dpif-upcall.h	[new file with mode: 0644]	patch \| blob
ofproto/ofproto-dpif-xlate.c		patch \| blob \| history
ofproto/ofproto-dpif-xlate.h		patch \| blob \| history
ofproto/ofproto-dpif.c		patch \| blob \| history
ofproto/ofproto-dpif.h		patch \| blob \| history
ofproto/ofproto-provider.h		patch \| blob \| history
ofproto/ofproto-unixctl.man		patch \| blob \| history
ofproto/ofproto.c		patch \| blob \| history
ofproto/tunnel.c		patch \| blob \| history
ofproto/tunnel.h		patch \| blob \| history
ovsdb/ovsdbmonitor/.gitignore		patch \| blob \| history
tests/bfd.at		patch \| blob \| history
tests/learn.at		patch \| blob \| history
tests/ofproto-dpif.at		patch \| blob \| history
tests/ofproto.at		patch \| blob \| history
tests/ovs-ofctl.at		patch \| blob \| history
utilities/ovs-ofctl.8.in		patch \| blob \| history