Merge branch 'mainstream'

author Giuseppe Lettieri <g.lettieri@iet.unipi.it>

Tue, 25 Feb 2014 15:42:09 +0000 (16:42 +0100)

committer Giuseppe Lettieri <g.lettieri@iet.unipi.it>

Tue, 25 Feb 2014 15:42:09 +0000 (16:42 +0100)
author Giuseppe Lettieri <g.lettieri@iet.unipi.it>
Tue, 25 Feb 2014 15:42:09 +0000 (16:42 +0100)
committer Giuseppe Lettieri <g.lettieri@iet.unipi.it>
Tue, 25 Feb 2014 15:42:09 +0000 (16:42 +0100)
diff --git a/AUTHORS b/AUTHORS

index 5817aa1..d8f13ba 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -69,6 +69,7 @@ Jun Nakajima            jun.nakajima@intel.com
  Justin Pettit           jpettit@nicira.com
  Keith Amidon            keith@nicira.com
  Ken Ajiro               ajiro@mxw.nes.nec.co.jp
+Kmindg G                kmindg@gmail.com
  Krishna Kondaka         kkondaka@vmware.com
  Kyle Mestery            kmestery@cisco.com
  Leo Alterman            lalterman@nicira.com
@@ -141,6 +142,7 @@ Amey Bhide              abhide@nicira.com
  Amre Shakimov           ashakimov@vmware.com
  André Ruß               andre.russ@hybris.com
  Andreas Beckmann        debian@abeckmann.de
+Andrei Andone           andrei.andone@softvision.ro
  Anton Matsiuk           anton.matsiuk@gmail.com
  Atzm Watanabe           atzm@stratosphere.co.jp
  Bastian Blank           waldi@debian.org
@@ -200,6 +202,7 @@ Koichi Yagishita        yagishita.koichi@jrc.co.jp
  Konstantin Khorenko     khorenko@openvz.org
  Kris zhang              zhang.kris@gmail.com
  Krishna Miriyala        krishna@nicira.com
+Len Gao                 leng@vmware.com
  Logan Rosen             logatronico@gmail.com
  Luca Falavigna          dktrkranz@debian.org
  Luiz Henrique Ozaki     luiz.ozaki@gmail.com
@@ -236,16 +239,19 @@ Sean Brady              sbrady@gtfservices.com
  Sebastian Andrzej Siewior  sebastian@breakpoint.cc
  Sébastien RICCIO        sr@swisscenter.com
  Spiro Kourtessis        spiro@vmware.com
+Sridhar Samudrala       samudrala.sridhar@gmail.com
  Srini Seetharaman       seethara@stanford.edu
  Stephen Hemminger       shemminger@vyatta.com
  Stephen Finucane        stephen.finucane@intel.com
  Suganya Ramachandran    suganyar@vmware.com
  Takayuki HAMA           t-hama@cb.jp.nec.com
  Teemu Koponen           koponen@nicira.com
+Thomas Morin            thomas.morin@orange.com
  Timothy Chen            tchen@nicira.com
  Torbjorn Tornkvist      kruskakli@gmail.com
  Valentin Bud            valentin@hackaserver.com
  Vasiliy Tolstov         v.tolstov@selfip.ru
+Vasu Dasari             vdasari@gmail.com
  Vishal Swarankar        vishal.swarnkar@gmail.com
  Vjekoslav Brajkovic     balkan@cs.washington.edu
  Voravit T.              voravit@kth.se
diff --git a/FAQ b/FAQ

index a08d65c..c6ccbd1 100644 (file)
--- a/FAQ
+++ b/FAQ
@@ -1403,9 +1403,9 @@ A: Yes, OpenFlow requires a switch to ignore attempts to send a packet
     even be convenient, e.g. it is often the desired behavior in a flow
     that forwards a packet to several ports ("floods" the packet).
  
-   Sometimes one really needs to send a packet out its ingress port.
-   In this case, output to OFPP_IN_PORT, which in ovs-ofctl syntax is
-   expressed as just "in_port", e.g.:
+   Sometimes one really needs to send a packet out its ingress port
+   ("hairpin"). In this case, output to OFPP_IN_PORT, which in
+   ovs-ofctl syntax is expressed as just "in_port", e.g.:
  
         ovs-ofctl add-flow br0 in_port=2,actions=in_port
  
@@ -1475,6 +1475,23 @@ A: These flows drop the ARP packets that IP hosts use to establish IP
        priority=5,in_port=1,dl_dst=01:00:00:00:00:00/01:00:00:00:00:00,actions=2
        priority=5,in_port=2,dl_dst=01:00:00:00:00:00/01:00:00:00:00:00,actions=1
  
+
+Development
+-----------
+
+Q: How do I implement a new OpenFlow message?
+
+A: Add your new message to "enum ofpraw" and "enum ofptype" in
+   lib/ofp-msgs.h, following the existing pattern.  Then recompile and
+   fix all of the new warnings, implementing new functionality for the
+   new message as needed.  (If you configure with --enable-Werror, as
+   described in INSTALL, then it is impossible to miss any warnings.)
+
+   If you need to add an OpenFlow vendor extension message for a
+   vendor that doesn't yet have any extension messages, then you will
+   also need to edit build-aux/extract-ofp-msgs.
+
+
  Contact 
  -------
  
diff --git a/INSTALL b/INSTALL

index 001d3cb..9f9491f 100644 (file)
--- a/INSTALL
+++ b/INSTALL
@@ -185,6 +185,11 @@ Prerequisites section, follow the procedure below to build.
     command line, turning warnings into errors.  That makes it
     impossible to miss warnings generated by the build.
  
+   To build with gcov code coverage support, add --enable-coverage,
+   e.g.:
+
+      % ./configure --enable-coverage
+
     The configure script accepts a number of other options and honors
     additional environment variables.  For a full list, invoke
     configure with the --help option.
@@ -379,20 +384,27 @@ above, but also replaces the old kernel module with the new one. Open vSwitch
  startup scripts for Debian, XenServer and RHEL use ovs-ctl's functions and it
  is recommended that these functions be used for other software platforms too.
  
-Running the Testsuite
-=====================
+Testsuites
+==========
+
+This section describes Open vSwitch's built-in support for various test
+suites.  You must configure and build Open vSwitch (steps 1 through 3
+in "Building and Installing Open vSwitch for Linux, FreeBSD or NetBSD"
+above) before you run the tests described here.  You do not need to
+install Open vSwitch or to build or load the kernel module to run
+these test suites.  You do not need supervisor privilege to run these
+test suites.
  
-Open vSwitch includes a testsuite.  Before you submit patches
+Self-Tests
+----------
+
+Open vSwitch includes a suite of self-tests.  Before you submit patches
  upstream, we advise that you run the tests and ensure that they pass.
  If you add new features to Open vSwitch, then adding tests for those
  features will ensure your features don't break as developers modify
  other areas of Open vSwitch.
  
-You must configure and build Open vSwitch (steps 1 through 3 in
-"Building and Installing Open vSwitch for Linux, FreeBSD or NetBSD" above)
-before you run the testsuite.  You do not need to install Open vSwitch
-or to build or load the kernel module to run the testsuite.  You do
-not need supervisor privilege to run the testsuite.
+Refer to "Testsuites" above for prerequisites.
  
  To run all the unit tests in Open vSwitch, one at a time:
        make check
@@ -432,7 +444,87 @@ usually a bug in the testsuite, not a bug in Open vSwitch itself.  If
  you find that a test fails intermittently, please report it, since the
  developers may not have noticed.
  
+OFTest
+------
+
+OFTest is an OpenFlow protocol testing suite.  Open vSwitch includes a
+Makefile target to run OFTest with Open vSwitch in "dummy mode".  In
+this mode of testing, no packets travel across physical or virtual
+networks.  Instead, Unix domain sockets stand in as simulated
+networks.  This simulation is imperfect, but it is much easier to set
+up, does not require extra physical or virtual hardware, and does not
+require supervisor privileges.
+
+To run OFTest with Open vSwitch, first read and follow the
+instructions under "Testsuites" above.  Second, obtain a copy of
+OFTest and install its prerequisites.  You need a copy of OFTest that
+includes commit 406614846c5 (make ovs-dummy platform work again).
+This commit was merged into the OFTest repository on Feb 1, 2013, so
+any copy of OFTest more recent than that should work.  Testing OVS in
+dummy mode does not require root privilege, so you may ignore that
+requirement.
+
+Optionally, add the top-level OFTest directory (containing the "oft"
+program) to your $PATH.  This slightly simplifies running OFTest later.
+
+To run OFTest in dummy mode, run the following command from your Open
+vSwitch build directory:
+    make check-oftest OFT=<oft-binary>
+where <oft-binary> is the absolute path to the "oft" program in
+OFTest.
+
+If you added "oft" to your $PATH, you may omit the OFT variable
+assignment:
+    make check-oftest
+By default, "check-oftest" passes "oft" just enough options to enable
+dummy mode.  You can use OFTFLAGS to pass additional options.  For
+example, to run just the basic.Echo test instead of all tests (the
+default) and enable verbose logging:
+    make check-oftest OFT=<oft-binary> OFTFLAGS='--verbose -T basic.Echo'
+
+If you use OFTest that does not include commit 4d1f3eb2c792 (oft:
+change default port to 6653), merged into the OFTest repository in
+October 2013, then you need to add an option to use the IETF-assigned
+controller port:
+    make check-oftest OFT=<oft-binary> OFTFLAGS='--port=6653'
+
+Please interpret OFTest results cautiously.  Open vSwitch can fail a
+given test in OFTest for many reasons, including bugs in Open vSwitch,
+bugs in OFTest, bugs in the "dummy mode" integration, and differing
+interpretations of the OpenFlow standard and other standards.
+
+Open vSwitch has not been validated against OFTest.  Please do report
+test failures that you believe to represent bugs in Open vSwitch.
+Include the precise versions of Open vSwitch and OFTest in your bug
+report, plus any other information needed to reproduce the problem.
+
+Ryu
+---
+
+Ryu is an OpenFlow controller written in Python that includes an
+extensive OpenFlow testsuite.  Open vSwitch includes a Makefile target
+to run Ryu in "dummy mode".  See "OFTest" above for an explanation of
+dummy mode.
+
+To run Ryu tests with Open vSwitch, first read and follow the
+instructions under "Testsuites" above.  Second, obtain a copy of Ryu,
+install its prerequisites, and build it.  You do not need to install
+Ryu (some of the tests do not get installed, so it does not help).
+
+To run Ryu tests, run the following command from your Open vSwitch
+build directory:
+    make check-ryu RYUDIR=<ryu-source-dir>
+where <ryu-source-dir> is the absolute path to the root of the Ryu
+source distribution.  The default <ryu-source-dir> is $srcdir/../ryu
+where $srcdir is your Open vSwitch source directory, so if this
+default is correct then you may simply run "make check-ryu".
+
+Open vSwitch has not been validated against Ryu.  Please do report
+test failures that you believe to represent bugs in Open vSwitch.
+Include the precise versions of Open vSwitch and Ryu in your bug
+report, plus any other information needed to reproduce the problem.
+
  Bug Reporting
--------------
+=============
  
  Please report problems to bugs@openvswitch.org.
diff --git a/Makefile.am b/Makefile.am

index 8e87557..da371d8 100644 (file)
--- a/Makefile.am
+++ b/Makefile.am
@@ -73,8 +73,6 @@ EXTRA_DIST = \
         NOTICE \
         OPENFLOW-1.1+ \
         PORTING \
-       README-OFTest \
-       README-gcov \
         README-lisp \
         REPORTING-BUGS \
         SubmittingPatches \
diff --git a/NEWS b/NEWS

index fe4b8cf..f0d5a10 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,13 @@ Post-v2.1.0
  ---------------------
     - The "ovsdbmonitor" graphical tool has been removed, because it was
       poorly maintained and not widely used.
+   - New "check-ryu" Makefile target for running Ryu tests for OpenFlow
+     controllers against Open vSwitch.  See INSTALL for details.
+   - Added IPFIX support for SCTP flows and templates for ICMPv4/v6 flows.
+   - Upon the receipt of a SIGHUP signal, ovs-vswitchd no longer reopens its
+     log file (it will terminate instead). Please use 'ovs-appctl vlog/reopen'
+     instead.
+
  
  v2.1.0 - xx xxx xxxx
  ---------------------
diff --git a/OPENFLOW-1.1+ b/OPENFLOW-1.1+

index eaf2ee9..1b8a0ee 100644 (file)
--- a/OPENFLOW-1.1+
+++ b/OPENFLOW-1.1+
@@ -56,7 +56,7 @@ probably incomplete.
  
      * OFPT_TABLE_MOD message.  This is new in OF1.1, so we need to
        implement it.  It should be implemented so that the default OVS
-      behavior does not change.
+      behavior does not change.  Simon Horman has posted a patch.
        [required for OF1.1 and OF1.2]
  
      * MPLS.  Simon Horman maintains a patch series that adds this
@@ -91,8 +91,8 @@ didn't compare the specs carefully yet.)
        Currently we always report OFPBRC_MULTIPART_BUFFER_OVERFLOW.
        [optional for OF1.3+]
  
-    * Add OFPMP_TABLE_FEATURES statistics.
-      [optional for OF1.3+]
+    * Add OFPMP_TABLE_FEATURES statistics.  Alexander Wu has posted a
+      patch series.  [optional for OF1.3+]
  
      * More flexible table miss support.
        This requires the following.
@@ -121,18 +121,10 @@ didn't compare the specs carefully yet.)
        some kind of "hardware" support, if we judged it useful enough.)
        [optional for OF1.3+]
  
-    * MPLS BoS matching.
-      Part of MPLS patchset by Simon Horman.
-      [optional for OF1.3+]
-
      * Provider Backbone Bridge tagging.  I don't plan to implement
        this (but we'd accept an implementation).
        [optional for OF1.3+]
  
-    * Rework tag order.
-      Part of MPLS patchset by Simon Horman.
-      [required for v1.3+]
-
      * On-demand flow counters.  I think this might be a real
        optimization in some cases for the software switch.
        [optional for OF1.3+]
diff --git a/README-OFTest b/README-OFTest

deleted file mode 100644 (file)

index cdca06d..0000000
--- a/README-OFTest
+++ /dev/null
@@ -1,77 +0,0 @@
-                How to Use OFTest With Open vSwitch
-                ===================================
-
-This document describes how to use the OFTest OpenFlow protocol
-testing suite with Open vSwitch in "dummy mode".  In this mode of
-testing, no packets travel across physical or virtual networks.
-Instead, Unix domain sockets stand in as simulated networks.  This
-simulation is imperfect, but it is much easier to set up, does not
-require extra physical or virtual hardware, and does not require
-supervisor privileges.
-
-Prerequisites
--------------
-
-First, build Open vSwitch according to the instructions in INSTALL.
-You need not install it.
-
-Second, obtain a copy of OFTest and install its prerequisites.  You
-need a copy of OFTest that includes commit 406614846c5 (make ovs-dummy
-platform work again).  This commit was merged into the OFTest
-repository on Feb 1, 2013, so any copy of OFTest more recent than that
-should work.
-
-Testing OVS in dummy mode does not require root privilege, so you may
-ignore that requirement.
-
-Optionally, add the top-level OFTest directory (containing the "oft"
-program) to your $PATH.  This slightly simplifies running OFTest later.
-
-Running OFTest
---------------
-
-To run OFTest in dummy mode, run the following command from your Open
-vSwitch build directory:
-
-    make check-oftest OFT=<oft-binary>
-
-where <oft-binary> is the absolute path to the "oft" program in
-OFTest.
-
-If you added "oft" to your $PATH, you may omit the OFT variable
-assignment:
-
-    make check-oftest
-
-By default, "check-oftest" passes "oft" just enough options to enable
-dummy mode.  You can use OFTFLAGS to pass additional options.  For
-example, to run just the basic.Echo test instead of all tests (the
-default) and enable verbose logging:
-
-    make check-oftest OFT=<oft-binary> OFTFLAGS='--verbose -T basic.Echo'
-
-If you use OFTest that does not include commit 4d1f3eb2c792 (oft:
-change default port to 6653), merged into the OFTest repository in
-October 2013, then you need to add an option to use the IETF-assigned
-controller port:
-
-    make check-oftest OFT=<oft-binary> OFTFLAGS='--port=6653'
-
-Interpreting OFTest Results
----------------------------
-
-Please interpret OFTest results cautiously.  Open vSwitch can fail a
-given test in OFTest for many reasons, including bugs in Open vSwitch,
-bugs in OFTest, bugs in the "dummy mode" integration, and differing
-interpretations of the OpenFlow standard and other standards.
-
-Open vSwitch has not been validated against OFTest.  Please do report
-test failures that you believe to represent bugs in Open vSwitch.
-Include the precise versions of Open vSwitch and OFTest in your bug
-report, plus any other information needed to reproduce the problem.
-
-Contact 
--------
-
-bugs@openvswitch.org
-http://openvswitch.org/
diff --git a/README-gcov b/README-gcov

deleted file mode 100644 (file)

index 2fe9f3a..0000000
--- a/README-gcov
+++ /dev/null
@@ -1,18 +0,0 @@
-Building with gcov support
-==========================
-
-The Open vSwitch "configure" script supports the following
-code-coverage related options:
-
-  --disable-coverage
-  --enable-coverage=no
-
-    Do not build with gcov code coverage support.
-
-    This is the default if no coverage option is passed to
-    "configure".
-
-  --enable-coverage
-  --enable-coverage=yes
-
-    Build with gcov code coverage support.
diff --git a/configure.ac b/configure.ac

index 19c095e..2fccc64 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -13,7 +13,7 @@
  # limitations under the License.
  
  AC_PREREQ(2.64)
-AC_INIT(openvswitch, 2.1.90, ovs-bugs@openvswitch.org)
+AC_INIT(openvswitch, 2.1.90, bugs@openvswitch.org)
  AC_CONFIG_SRCDIR([datapath/datapath.c])
  AC_CONFIG_MACRO_DIR([m4])
  AC_CONFIG_AUX_DIR([build-aux])
diff --git a/datapath/actions.c b/datapath/actions.c

index 30ea1d2..0b66e7c 100644 (file)
--- a/datapath/actions.c
+++ b/datapath/actions.c
@@ -135,8 +135,8 @@ static int set_eth_addr(struct sk_buff *skb,
  
         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);
  
-       memcpy(eth_hdr(skb)->h_source, eth_key->eth_src, ETH_ALEN);
-       memcpy(eth_hdr(skb)->h_dest, eth_key->eth_dst, ETH_ALEN);
+       ether_addr_copy(eth_hdr(skb)->h_source, eth_key->eth_src);
+       ether_addr_copy(eth_hdr(skb)->h_dest, eth_key->eth_dst);
  
         ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);
  
diff --git a/datapath/datapath.c b/datapath/datapath.c

index d528ba0..f7c3391 100644 (file)
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -525,7 +525,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
                 packet->protocol = htons(ETH_P_802_2);
  
         /* Build an sw_flow for sending this packet. */
-       flow = ovs_flow_alloc(false);
+       flow = ovs_flow_alloc();
         err = PTR_ERR(flow);
         if (IS_ERR(flow))
                 goto err_kfree_skb;
@@ -783,7 +783,6 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
         struct datapath *dp;
         struct sw_flow_actions *acts = NULL;
         struct sw_flow_match match;
-       bool exact_5tuple;
         int error;
  
         /* Extract key. */
@@ -792,7 +791,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
                 goto error;
  
         ovs_match_init(&match, &key, &mask);
-       error = ovs_nla_get_match(&match, &exact_5tuple,
+       error = ovs_nla_get_match(&match,
                                   a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]);
         if (error)
                 goto error;
@@ -831,7 +830,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
                         goto err_unlock_ovs;
  
                 /* Allocate flow. */
-               flow = ovs_flow_alloc(!exact_5tuple);
+               flow = ovs_flow_alloc();
                 if (IS_ERR(flow)) {
                         error = PTR_ERR(flow);
                         goto err_unlock_ovs;
@@ -915,7 +914,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
         }
  
         ovs_match_init(&match, &key, NULL);
-       err = ovs_nla_get_match(&match, NULL, a[OVS_FLOW_ATTR_KEY], NULL);
+       err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL);
         if (err)
                 return err;
  
@@ -969,7 +968,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
         }
  
         ovs_match_init(&match, &key, NULL);
-       err = ovs_nla_get_match(&match, NULL, a[OVS_FLOW_ATTR_KEY], NULL);
+       err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL);
         if (err)
                 goto unlock;
  
@@ -1179,7 +1178,7 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *in
         struct datapath *dp;
  
         dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
-       if (!dp)
+       if (IS_ERR(dp))
                 return;
  
         WARN(dp->user_features, "Dropping previously announced user features\n");
@@ -1767,11 +1766,12 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
         int bucket = cb->args[0], skip = cb->args[1];
         int i, j = 0;
  
+       rcu_read_lock();
         dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
-       if (!dp)
+       if (!dp) {
+               rcu_read_unlock();
                 return -ENODEV;
-
-       rcu_read_lock();
+       }
         for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) {
                 struct vport *vport;
  
diff --git a/datapath/flow.c b/datapath/flow.c

index 8be3801..c3e3fcb 100644 (file)
--- a/datapath/flow.c
+++ b/datapath/flow.c
@@ -66,108 +66,113 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
  {
         struct flow_stats *stats;
         __be16 tcp_flags = 0;
+       int node = numa_node_id();
  
-       if (!flow->stats.is_percpu)
-               stats = flow->stats.stat;
-       else
-               stats = this_cpu_ptr(flow->stats.cpu_stats);
+       stats = rcu_dereference(flow->stats[node]);
  
         if ((flow->key.eth.type == htons(ETH_P_IP) ||
              flow->key.eth.type == htons(ETH_P_IPV6)) &&
+           flow->key.ip.frag != OVS_FRAG_TYPE_LATER &&
             flow->key.ip.proto == IPPROTO_TCP &&
             likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) {
                 tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
         }
  
-       spin_lock(&stats->lock);
+       /* Check if already have node-specific stats. */
+       if (likely(stats)) {
+               spin_lock(&stats->lock);
+               /* Mark if we write on the pre-allocated stats. */
+               if (node == 0 && unlikely(flow->stats_last_writer != node))
+                       flow->stats_last_writer = node;
+       } else {
+               stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */
+               spin_lock(&stats->lock);
+
+               /* If the current NUMA-node is the only writer on the
+                * pre-allocated stats keep using them.
+                */
+               if (unlikely(flow->stats_last_writer != node)) {
+                       /* A previous locker may have already allocated the
+                        * stats, so we need to check again.  If node-specific
+                        * stats were already allocated, we update the pre-
+                        * allocated stats as we have already locked them.
+                        */
+                       if (likely(flow->stats_last_writer != NUMA_NO_NODE)
+                           && likely(!rcu_dereference(flow->stats[node]))) {
+                               /* Try to allocate node-specific stats. */
+                               struct flow_stats *new_stats;
+
+                               new_stats =
+                                       kmem_cache_alloc_node(flow_stats_cache,
+                                                             GFP_THISNODE |
+                                                             __GFP_NOMEMALLOC,
+                                                             node);
+                               if (likely(new_stats)) {
+                                       new_stats->used = jiffies;
+                                       new_stats->packet_count = 1;
+                                       new_stats->byte_count = skb->len;
+                                       new_stats->tcp_flags = tcp_flags;
+                                       spin_lock_init(&new_stats->lock);
+
+                                       rcu_assign_pointer(flow->stats[node],
+                                                          new_stats);
+                                       goto unlock;
+                               }
+                       }
+                       flow->stats_last_writer = node;
+               }
+       }
+
         stats->used = jiffies;
         stats->packet_count++;
         stats->byte_count += skb->len;
         stats->tcp_flags |= tcp_flags;
+unlock:
         spin_unlock(&stats->lock);
  }
  
-static void stats_read(struct flow_stats *stats, bool lock_bh,
-                      struct ovs_flow_stats *ovs_stats,
-                      unsigned long *used, __be16 *tcp_flags)
-{
-       if (lock_bh)
-               spin_lock_bh(&stats->lock);
-       else
-               spin_lock(&stats->lock);
-
-       if (time_after(stats->used, *used))
-               *used = stats->used;
-       *tcp_flags |= stats->tcp_flags;
-       ovs_stats->n_packets += stats->packet_count;
-       ovs_stats->n_bytes += stats->byte_count;
-
-       if (lock_bh)
-               spin_unlock_bh(&stats->lock);
-       else
-               spin_unlock(&stats->lock);
-}
-
  void ovs_flow_stats_get(struct sw_flow *flow, struct ovs_flow_stats *ovs_stats,
                         unsigned long *used, __be16 *tcp_flags)
  {
-       int cpu, cur_cpu;
+       int node;
  
         *used = 0;
         *tcp_flags = 0;
         memset(ovs_stats, 0, sizeof(*ovs_stats));
  
-       if (!flow->stats.is_percpu) {
-               stats_read(flow->stats.stat, true, ovs_stats, used, tcp_flags);
-       } else {
-               cur_cpu = get_cpu();
-
-               for_each_possible_cpu(cpu) {
-                       struct flow_stats *stats;
-                       bool lock_bh;
+       for_each_node(node) {
+               struct flow_stats *stats = rcu_dereference(flow->stats[node]);
  
-                       stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
-                       lock_bh = (cpu == cur_cpu);
-                       stats_read(stats, lock_bh, ovs_stats, used, tcp_flags);
+               if (stats) {
+                       /* Local CPU may write on non-local stats, so we must
+                        * block bottom-halves here.
+                        */
+                       spin_lock_bh(&stats->lock);
+                       if (time_after(stats->used, *used))
+                               *used = stats->used;
+                       *tcp_flags |= stats->tcp_flags;
+                       ovs_stats->n_packets += stats->packet_count;
+                       ovs_stats->n_bytes += stats->byte_count;
+                       spin_unlock_bh(&stats->lock);
                 }
-               put_cpu();
         }
  }
  
-static void stats_reset(struct flow_stats *stats, bool lock_bh)
-{
-       if (lock_bh)
-               spin_lock_bh(&stats->lock);
-       else
-               spin_lock(&stats->lock);
-
-       stats->used = 0;
-       stats->packet_count = 0;
-       stats->byte_count = 0;
-       stats->tcp_flags = 0;
-
-       if (lock_bh)
-               spin_unlock_bh(&stats->lock);
-       else
-               spin_unlock(&stats->lock);
-}
-
  void ovs_flow_stats_clear(struct sw_flow *flow)
  {
-       int cpu, cur_cpu;
-
-       if (!flow->stats.is_percpu) {
-               stats_reset(flow->stats.stat, true);
-       } else {
-               cur_cpu = get_cpu();
-
-               for_each_possible_cpu(cpu) {
-                       bool lock_bh;
-
-                       lock_bh = (cpu == cur_cpu);
-                       stats_reset(per_cpu_ptr(flow->stats.cpu_stats, cpu), lock_bh);
+       int node;
+
+       for_each_node(node) {
+               struct flow_stats *stats = rcu_dereference(flow->stats[node]);
+
+               if (stats) {
+                       spin_lock_bh(&stats->lock);
+                       stats->used = 0;
+                       stats->packet_count = 0;
+                       stats->byte_count = 0;
+                       stats->tcp_flags = 0;
+                       spin_unlock_bh(&stats->lock);
                 }
-               put_cpu();
         }
  }
  
@@ -395,14 +400,14 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
                             && opt_len == 8) {
                                 if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll)))
                                         goto invalid;
-                               memcpy(key->ipv6.nd.sll,
-                                   &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
+                               ether_addr_copy(key->ipv6.nd.sll,
+                                   &nd->opt[offset+sizeof(*nd_opt)]);
                         } else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR
                                    && opt_len == 8) {
                                 if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll)))
                                         goto invalid;
-                               memcpy(key->ipv6.nd.tll,
-                                   &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
+                               ether_addr_copy(key->ipv6.nd.tll,
+                                   &nd->opt[offset+sizeof(*nd_opt)]);
                         }
  
                         icmp_len -= opt_len;
@@ -462,8 +467,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
          * header in the linear data area.
          */
         eth = eth_hdr(skb);
-       memcpy(key->eth.src, eth->h_source, ETH_ALEN);
-       memcpy(key->eth.dst, eth->h_dest, ETH_ALEN);
+       ether_addr_copy(key->eth.src, eth->h_source);
+       ether_addr_copy(key->eth.dst, eth->h_dest);
  
         __skb_pull(skb, 2 * ETH_ALEN);
         /* We are going to push all headers that we pull, so no need to
@@ -560,8 +565,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
                                 key->ip.proto = ntohs(arp->ar_op);
                         memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src));
                         memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
-                       memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN);
-                       memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN);
+                       ether_addr_copy(key->ipv4.arp.sha, arp->ar_sha);
+                       ether_addr_copy(key->ipv4.arp.tha, arp->ar_tha);
                 }
         } else if (key->eth.type == htons(ETH_P_IPV6)) {
                 int nh_len;             /* IPv6 Header + Extensions */
diff --git a/datapath/flow.h b/datapath/flow.h

index eafcfd8..270a324 100644 (file)
--- a/datapath/flow.h
+++ b/datapath/flow.h
@@ -157,24 +157,22 @@ struct flow_stats {
         __be16 tcp_flags;               /* Union of seen TCP flags. */
  };
  
-struct sw_flow_stats {
-       bool is_percpu;
-       union {
-               struct flow_stats *stat;
-               struct flow_stats __percpu *cpu_stats;
-       };
-};
-
  struct sw_flow {
         struct rcu_head rcu;
         struct hlist_node hash_node[2];
         u32 hash;
-
+       int stats_last_writer;          /* NUMA-node id of the last writer on
+                                        * 'stats[0]'.
+                                        */
         struct sw_flow_key key;
         struct sw_flow_key unmasked_key;
         struct sw_flow_mask *mask;
         struct sw_flow_actions __rcu *sf_acts;
-       struct sw_flow_stats stats;
+       struct flow_stats __rcu *stats[]; /* One for each NUMA node.  First one
+                                          * is allocated at flow creation time,
+                                          * the rest are allocated on demand
+                                          * while holding the 'stats[0].lock'.
+                                          */
  };
  
  struct arp_eth_header {
diff --git a/datapath/flow_netlink.c b/datapath/flow_netlink.c

index 39fe4bf..40751cb 100644 (file)
--- a/datapath/flow_netlink.c
+++ b/datapath/flow_netlink.c
@@ -268,20 +268,6 @@ static bool is_all_zero(const u8 *fp, size_t size)
         return true;
  }
  
-static bool is_all_set(const u8 *fp, size_t size)
-{
-       int i;
-
-       if (!fp)
-               return false;
-
-       for (i = 0; i < size; i++)
-               if (fp[i] != 0xff)
-                       return false;
-
-       return true;
-}
-
  static int __parse_flow_nlattrs(const struct nlattr *attr,
                                 const struct nlattr *a[],
                                 u64 *attrsp, bool nz)
@@ -503,9 +489,8 @@ static int metadata_from_nlattrs(struct sw_flow_match *match,  u64 *attrs,
         return 0;
  }
  
-static int ovs_key_from_nlattrs(struct sw_flow_match *match,  bool *exact_5tuple,
-                               u64 attrs, const struct nlattr **a,
-                               bool is_mask)
+static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
+                               const struct nlattr **a, bool is_mask)
  {
         int err;
         u64 orig_attrs = attrs;
@@ -562,11 +547,6 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match,  bool *exact_5tuple
                 SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
         }
  
-       if (is_mask && exact_5tuple) {
-               if (match->mask->key.eth.type != htons(0xffff))
-                       *exact_5tuple = false;
-       }
-
         if (attrs & (1ULL << OVS_KEY_ATTR_IPV4)) {
                 const struct ovs_key_ipv4 *ipv4_key;
  
@@ -589,13 +569,6 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match,  bool *exact_5tuple
                 SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
                                 ipv4_key->ipv4_dst, is_mask);
                 attrs &= ~(1ULL << OVS_KEY_ATTR_IPV4);
-
-               if (is_mask && exact_5tuple && *exact_5tuple) {
-                       if (ipv4_key->ipv4_proto != 0xff ||
-                           ipv4_key->ipv4_src != htonl(0xffffffff) ||
-                           ipv4_key->ipv4_dst != htonl(0xffffffff))
-                               *exact_5tuple = false;
-               }
         }
  
         if (attrs & (1ULL << OVS_KEY_ATTR_IPV6)) {
@@ -627,15 +600,6 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match,  bool *exact_5tuple
                                 is_mask);
  
                 attrs &= ~(1ULL << OVS_KEY_ATTR_IPV6);
-
-               if (is_mask && exact_5tuple && *exact_5tuple) {
-                       if (ipv6_key->ipv6_proto != 0xff ||
-                           !is_all_set((const u8 *)ipv6_key->ipv6_src,
-                                       sizeof(match->key->ipv6.addr.src)) ||
-                           !is_all_set((const u8 *)ipv6_key->ipv6_dst,
-                                       sizeof(match->key->ipv6.addr.dst)))
-                               *exact_5tuple = false;
-               }
         }
  
         if (attrs & (1ULL << OVS_KEY_ATTR_ARP)) {
@@ -678,11 +642,6 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match,  bool *exact_5tuple
                                         tcp_key->tcp_dst, is_mask);
                 }
                 attrs &= ~(1ULL << OVS_KEY_ATTR_TCP);
-
-               if (is_mask && exact_5tuple && *exact_5tuple &&
-                   (tcp_key->tcp_src != htons(0xffff) ||
-                    tcp_key->tcp_dst != htons(0xffff)))
-                       *exact_5tuple = false;
         }
  
         if (attrs & (1ULL << OVS_KEY_ATTR_TCP_FLAGS)) {
@@ -714,11 +673,6 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match,  bool *exact_5tuple
                                         udp_key->udp_dst, is_mask);
                 }
                 attrs &= ~(1ULL << OVS_KEY_ATTR_UDP);
-
-               if (is_mask && exact_5tuple && *exact_5tuple &&
-                   (udp_key->udp_src != htons(0xffff) ||
-                    udp_key->udp_dst != htons(0xffff)))
-                       *exact_5tuple = false;
         }
  
         if (attrs & (1ULL << OVS_KEY_ATTR_SCTP)) {
@@ -804,7 +758,6 @@ static void sw_flow_mask_set(struct sw_flow_mask *mask,
   * attribute specifies the mask field of the wildcarded flow.
   */
  int ovs_nla_get_match(struct sw_flow_match *match,
-                     bool *exact_5tuple,
                       const struct nlattr *key,
                       const struct nlattr *mask)
  {
@@ -852,13 +805,10 @@ int ovs_nla_get_match(struct sw_flow_match *match,
                 }
         }
  
-       err = ovs_key_from_nlattrs(match, NULL, key_attrs, a, false);
+       err = ovs_key_from_nlattrs(match, key_attrs, a, false);
         if (err)
                 return err;
  
-       if (exact_5tuple)
-               *exact_5tuple = true;
-
         if (mask) {
                 err = parse_flow_mask_nlattrs(mask, a, &mask_attrs);
                 if (err)
@@ -896,7 +846,7 @@ int ovs_nla_get_match(struct sw_flow_match *match,
                         }
                 }
  
-               err = ovs_key_from_nlattrs(match, exact_5tuple, mask_attrs, a, true);
+               err = ovs_key_from_nlattrs(match, mask_attrs, a, true);
                 if (err)
                         return err;
         } else {
@@ -986,8 +936,8 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey,
                 goto nla_put_failure;
  
         eth_key = nla_data(nla);
-       memcpy(eth_key->eth_src, output->eth.src, ETH_ALEN);
-       memcpy(eth_key->eth_dst, output->eth.dst, ETH_ALEN);
+       ether_addr_copy(eth_key->eth_src, output->eth.src);
+       ether_addr_copy(eth_key->eth_dst, output->eth.dst);
  
         if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) {
                 __be16 eth_type;
@@ -1059,8 +1009,8 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey,
                 arp_key->arp_sip = output->ipv4.addr.src;
                 arp_key->arp_tip = output->ipv4.addr.dst;
                 arp_key->arp_op = htons(output->ip.proto);
-               memcpy(arp_key->arp_sha, output->ipv4.arp.sha, ETH_ALEN);
-               memcpy(arp_key->arp_tha, output->ipv4.arp.tha, ETH_ALEN);
+               ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha);
+               ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha);
         }
  
         if ((swkey->eth.type == htons(ETH_P_IP) ||
@@ -1147,8 +1097,8 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey,
                                 nd_key = nla_data(nla);
                                 memcpy(nd_key->nd_target, &output->ipv6.nd.target,
                                                         sizeof(nd_key->nd_target));
-                               memcpy(nd_key->nd_sll, output->ipv6.nd.sll, ETH_ALEN);
-                               memcpy(nd_key->nd_tll, output->ipv6.nd.tll, ETH_ALEN);
+                               ether_addr_copy(nd_key->nd_sll, output->ipv6.nd.sll);
+                               ether_addr_copy(nd_key->nd_tll, output->ipv6.nd.tll);
                         }
                 }
         }
diff --git a/datapath/flow_netlink.h b/datapath/flow_netlink.h

index b31fbe2..4401510 100644 (file)
--- a/datapath/flow_netlink.h
+++ b/datapath/flow_netlink.h
@@ -45,7 +45,6 @@ int ovs_nla_put_flow(const struct sw_flow_key *,
  int ovs_nla_get_flow_metadata(struct sw_flow *flow,
                               const struct nlattr *attr);
  int ovs_nla_get_match(struct sw_flow_match *match,
-                     bool *exact_5tuple,
                       const struct nlattr *,
                       const struct nlattr *);
  
diff --git a/datapath/flow_table.c b/datapath/flow_table.c

index 4e6b1c0..54a2e25 100644 (file)
--- a/datapath/flow_table.c
+++ b/datapath/flow_table.c
@@ -50,6 +50,7 @@
  #define REHASH_INTERVAL                (10 * 60 * HZ)
  
  static struct kmem_cache *flow_cache;
+struct kmem_cache *flow_stats_cache __read_mostly;
  
  static u16 range_n_bytes(const struct sw_flow_key_range *range)
  {
@@ -74,10 +75,11 @@ void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
                 *d++ = *s++ & *m++;
  }
  
-struct sw_flow *ovs_flow_alloc(bool percpu_stats)
+struct sw_flow *ovs_flow_alloc(void)
  {
         struct sw_flow *flow;
-       int cpu;
+       struct flow_stats *stats;
+       int node;
  
         flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
         if (!flow)
@@ -85,27 +87,22 @@ struct sw_flow *ovs_flow_alloc(bool percpu_stats)
  
         flow->sf_acts = NULL;
         flow->mask = NULL;
+       flow->stats_last_writer = NUMA_NO_NODE;
  
-       flow->stats.is_percpu = percpu_stats;
+       /* Initialize the default stat node. */
+       stats = kmem_cache_alloc_node(flow_stats_cache,
+                                     GFP_KERNEL | __GFP_ZERO, 0);
+       if (!stats)
+               goto err;
  
-       if (!percpu_stats) {
-               flow->stats.stat = kzalloc(sizeof(*flow->stats.stat), GFP_KERNEL);
-               if (!flow->stats.stat)
-                       goto err;
+       spin_lock_init(&stats->lock);
  
-               spin_lock_init(&flow->stats.stat->lock);
-       } else {
-               flow->stats.cpu_stats = alloc_percpu(struct flow_stats);
-               if (!flow->stats.cpu_stats)
-                       goto err;
+       RCU_INIT_POINTER(flow->stats[0], stats);
  
-               for_each_possible_cpu(cpu) {
-                       struct flow_stats *cpu_stats;
+       for_each_node(node)
+               if (node != 0)
+                       RCU_INIT_POINTER(flow->stats[node], NULL);
  
-                       cpu_stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
-                       spin_lock_init(&cpu_stats->lock);
-               }
-       }
         return flow;
  err:
         kmem_cache_free(flow_cache, flow);
@@ -142,11 +139,13 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
  
  static void flow_free(struct sw_flow *flow)
  {
+       int node;
+
         kfree((struct sf_flow_acts __force *)flow->sf_acts);
-       if (flow->stats.is_percpu)
-               free_percpu(flow->stats.cpu_stats);
-       else
-               kfree(flow->stats.stat);
+       for_each_node(node)
+               if (flow->stats[node])
+                       kmem_cache_free(flow_stats_cache,
+                                       (struct flow_stats __force *)flow->stats[node]);
         kmem_cache_free(flow_cache, flow);
  }
  
@@ -608,16 +607,28 @@ int ovs_flow_init(void)
         BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
         BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
  
-       flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
-                                       0, NULL);
+       flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
+                                      + (num_possible_nodes()
+                                         * sizeof(struct flow_stats *)),
+                                      0, 0, NULL);
         if (flow_cache == NULL)
                 return -ENOMEM;
  
+       flow_stats_cache
+               = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
+                                   0, SLAB_HWCACHE_ALIGN, NULL);
+       if (flow_stats_cache == NULL) {
+               kmem_cache_destroy(flow_cache);
+               flow_cache = NULL;
+               return -ENOMEM;
+       }
+
         return 0;
  }
  
  /* Uninitializes the flow module. */
  void ovs_flow_exit(void)
  {
+       kmem_cache_destroy(flow_stats_cache);
         kmem_cache_destroy(flow_cache);
  }
diff --git a/datapath/flow_table.h b/datapath/flow_table.h

index baaeb10..ca8a582 100644 (file)
--- a/datapath/flow_table.h
+++ b/datapath/flow_table.h
@@ -52,10 +52,12 @@ struct flow_table {
         unsigned int count;
  };
  
+extern struct kmem_cache *flow_stats_cache;
+
  int ovs_flow_init(void);
  void ovs_flow_exit(void);
  
-struct sw_flow *ovs_flow_alloc(bool percpu_stats);
+struct sw_flow *ovs_flow_alloc(void);
  void ovs_flow_free(struct sw_flow *, bool deferred);
  
  int ovs_flow_tbl_init(struct flow_table *);
diff --git a/datapath/linux/compat/include/linux/etherdevice.h b/datapath/linux/compat/include/linux/etherdevice.h

index eb7123e..556729d 100644 (file)
--- a/datapath/linux/compat/include/linux/etherdevice.h
+++ b/datapath/linux/compat/include/linux/etherdevice.h
@@ -34,4 +34,19 @@ static inline int eth_mac_addr(struct net_device *dev, void *p)
  }
  #endif
  
+static inline void ether_addr_copy(u8 *dst, const u8 *src)
+{
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+       *(u32 *)dst = *(const u32 *)src;
+       *(u16 *)(dst + 4) = *(const u16 *)(src + 4);
+#else
+       u16 *a = (u16 *)dst;
+       const u16 *b = (const u16 *)src;
+
+       a[0] = b[0];
+       a[1] = b[1];
+       a[2] = b[2];
+#endif
+}
+
  #endif
diff --git a/datapath/linux/compat/include/linux/rcupdate.h b/datapath/linux/compat/include/linux/rcupdate.h

index 20cdedf..a497f7b 100644 (file)
--- a/datapath/linux/compat/include/linux/rcupdate.h
+++ b/datapath/linux/compat/include/linux/rcupdate.h
@@ -22,4 +22,16 @@ static inline int rcu_read_lock_held(void)
  }
  #endif
  
+#ifndef RCU_INITIALIZER
+#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
+#endif
+
+#ifndef RCU_INIT_POINTER
+#define RCU_INIT_POINTER(p, v) \
+        do { \
+                p = RCU_INITIALIZER(v); \
+        } while (0)
+
+#endif
+
  #endif /* linux/rcupdate.h wrapper */
diff --git a/lib/automake.mk b/lib/automake.mk

index ebf3f9d..0ec18a4 100644 (file)
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -195,7 +195,6 @@ lib_libopenvswitch_la_SOURCES = \
         lib/stream-provider.h \
         lib/stream-ssl.h \
         lib/stream-tcp.c \
-       lib/stream-unix.c \
         lib/stream.c \
         lib/stream.h \
         lib/stdio.c \
@@ -248,7 +247,8 @@ lib_libopenvswitch_la_SOURCES += \
  else
  lib_libopenvswitch_la_SOURCES += \
         lib/daemon.c \
-       lib/latch.c
+       lib/latch.c \
+       lib/stream-unix.c
  endif
  
  EXTRA_DIST += \
diff --git a/lib/bfd.c b/lib/bfd.c

index a8c2294..5413105 100644 (file)
--- a/lib/bfd.c
+++ b/lib/bfd.c
@@ -878,7 +878,7 @@ bfd_forwarding__(struct bfd *bfd) OVS_REQUIRES(mutex)
  static bool
  bfd_lookup_ip(const char *host_name, struct in_addr *addr)
  {
-    if (!inet_aton(host_name, addr)) {
+    if (!inet_pton(AF_INET, host_name, addr)) {
          VLOG_ERR_RL(&rl, "\"%s\" is not a valid IP address", host_name);
          return false;
      }
diff --git a/lib/daemon.c b/lib/daemon.c

index f9290ef..9d96cba 100644 (file)
--- a/lib/daemon.c
+++ b/lib/daemon.c
@@ -441,6 +441,26 @@ monitor_daemon(pid_t daemon_pid)
      set_subprogram_name("");
  }
  
+/* Returns a readable and writable fd for /dev/null, if successful, otherwise
+ * a negative errno value.  The caller must not close the returned fd (because
+ * the same fd will be handed out to subsequent callers). */
+static int
+get_null_fd(void)
+{
+    static int null_fd;
+
+    if (!null_fd) {
+        null_fd = open("/dev/null", O_RDWR);
+        if (null_fd < 0) {
+            int error = errno;
+            VLOG_ERR("could not open /dev/null: %s", ovs_strerror(error));
+            null_fd = -error;
+        }
+    }
+
+    return null_fd;
+}
+
  /* Close standard file descriptors (except any that the client has requested we
   * leave open by calling daemon_save_fd()).  If we're started from e.g. an SSH
   * session, then this keeps us from holding that session open artificially. */
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c

index b3a3742..b1c25c8 100644 (file)
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -879,6 +879,7 @@ dp_netdev_flow_unref(struct dp_netdev_flow *flow)
          cls_rule_destroy(CONST_CAST(struct cls_rule *, &flow->cr));
          ovs_mutex_lock(&flow->mutex);
          dp_netdev_actions_unref(flow->actions);
+        ovs_refcount_destroy(&flow->ref_cnt);
          ovs_mutex_unlock(&flow->mutex);
          ovs_mutex_destroy(&flow->mutex);
          free(flow);
@@ -1302,6 +1303,7 @@ dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
              ovs_mutex_unlock(&netdev_flow->mutex);
          }
          dp_netdev_remove_flow(dp, netdev_flow);
+        dp_netdev_flow_unref(netdev_flow);
      } else {
          error = ENOENT;
      }
@@ -1551,6 +1553,7 @@ void
  dp_netdev_actions_unref(struct dp_netdev_actions *actions)
  {
      if (actions && ovs_refcount_unref(&actions->ref_cnt) == 1) {
+        ovs_refcount_destroy(&actions->ref_cnt);
          free(actions->actions);
          free(actions);
      }
@@ -1712,6 +1715,7 @@ dp_netdev_port_input(struct dp_netdev *dp, struct ofpbuf *packet,
          dp_netdev_execute_actions(dp, &key, packet, md,
                                    actions->actions, actions->size);
          dp_netdev_actions_unref(actions);
+        dp_netdev_flow_unref(netdev_flow);
          ovsthread_counter_inc(dp->n_hit, 1);
      } else {
          ovsthread_counter_inc(dp->n_missed, 1);
diff --git a/lib/entropy.c b/lib/entropy.c

index 53f7e72..f980855 100644 (file)
--- a/lib/entropy.c
+++ b/lib/entropy.c
@@ -60,10 +60,8 @@ get_entropy(void *buffer, size_t n)
      CryptAcquireContext(&crypt_prov, NULL, NULL,
                          PROV_RSA_FULL, CRYPT_VERIFYCONTEXT);
      if (!CryptGenRandom(crypt_prov, n, buffer)) {
-        char *msg_buf = ovs_lasterror_to_string();
+        VLOG_ERR("CryptGenRandom: read error (%s)", ovs_lasterror_to_string());
          error = EINVAL;
-        VLOG_ERR("CryptGenRandom: read error (%s)", msg_buf);
-        LocalFree(msg_buf);
      }
  
      CryptReleaseContext(crypt_prov, 0);
diff --git a/lib/flow.c b/lib/flow.c

index 06ba036..e7fe4d3 100644 (file)
--- a/lib/flow.c
+++ b/lib/flow.c
@@ -1130,7 +1130,7 @@ flow_count_common_mpls_labels(const struct flow *a, int an,
   * If the new label is the second or label MPLS label in 'flow', it is
   * generated as;
   *
- *     - label: 0.
+ *     - label: Copied from outer label.
   *
   *     - TTL: Copied from outer label.
   *
@@ -1156,7 +1156,7 @@ flow_push_mpls(struct flow *flow, int n, ovs_be16 mpls_eth_type,
              flow->mpls_lse[i] = flow->mpls_lse[i - 1];
          }
          flow->mpls_lse[0] = (flow->mpls_lse[1]
-                             & htonl(MPLS_TTL_MASK | MPLS_TC_MASK));
+                             & htonl(~MPLS_BOS_MASK));
      } else {
          int label = 0;          /* IPv4 Explicit Null. */
          int tc = 0;
diff --git a/lib/hmap.h b/lib/hmap.h

index 76a73ac..445e74f 100644 (file)
--- a/lib/hmap.h
+++ b/lib/hmap.h
@@ -19,6 +19,7 @@
  
  #include <stdbool.h>
  #include <stdlib.h>
+#include "ovs-atomic.h"
  #include "util.h"
  
  #ifdef  __cplusplus
@@ -189,10 +190,13 @@ hmap_capacity(const struct hmap *hmap)
  }
  
  /* Returns true if 'hmap' currently contains no nodes,
- * false otherwise. */
+ * false otherwise.
+ * Note: While hmap in general is not thread-safe without additional locking,
+ * hmap_is_empty() is. */
  static inline bool
  hmap_is_empty(const struct hmap *hmap)
  {
+    atomic_thread_fence(memory_order_acquire);
      return hmap->n == 0;
  }
  
diff --git a/lib/jsonrpc.c b/lib/jsonrpc.c

index cef96a9..643a3c5 100644 (file)
--- a/lib/jsonrpc.c
+++ b/lib/jsonrpc.c
@@ -1090,6 +1090,14 @@ jsonrpc_session_get_reconnect_stats(const struct jsonrpc_session *s,
      reconnect_get_stats(s->reconnect, time_msec(), stats);
  }
  
+void
+jsonrpc_session_enable_reconnect(struct jsonrpc_session *s)
+{
+    reconnect_set_max_tries(s->reconnect, UINT_MAX);
+    reconnect_set_backoff(s->reconnect, RECONNECT_DEFAULT_MIN_BACKOFF,
+                          RECONNECT_DEFAULT_MAX_BACKOFF);
+}
+
  void
  jsonrpc_session_force_reconnect(struct jsonrpc_session *s)
  {
diff --git a/lib/jsonrpc.h b/lib/jsonrpc.h

index 5397200..b711d1a 100644 (file)
--- a/lib/jsonrpc.h
+++ b/lib/jsonrpc.h
@@ -123,6 +123,7 @@ int jsonrpc_session_get_last_error(const struct jsonrpc_session *);
  void jsonrpc_session_get_reconnect_stats(const struct jsonrpc_session *,
                                           struct reconnect_stats *);
  
+void jsonrpc_session_enable_reconnect(struct jsonrpc_session *);
  void jsonrpc_session_force_reconnect(struct jsonrpc_session *);
  
  void jsonrpc_session_set_max_backoff(struct jsonrpc_session *,
diff --git a/lib/lockfile.c b/lib/lockfile.c

index d8f3952..779b74e 100644 (file)
--- a/lib/lockfile.c
+++ b/lib/lockfile.c
@@ -44,6 +44,7 @@ struct lockfile {
      dev_t device;
      ino_t inode;
      int fd;
+    HANDLE lock_handle;
  };
  
  /* Lock table.
@@ -58,8 +59,14 @@ static struct hmap *const lock_table OVS_GUARDED_BY(lock_table_mutex)
      = &lock_table__;
  
  static void lockfile_unhash(struct lockfile *);
-static int lockfile_try_lock(const char *name, pid_t *pidp,
-                             struct lockfile **lockfilep);
+#ifdef _WIN32
+static int lockfile_try_lock_windows(const char *name, pid_t *pidp,
+                                     struct lockfile **lockfilep);
+static void lockfile_unlock_windows(struct lockfile * lockfile);
+#else
+static int lockfile_try_lock_posix(const char *name, pid_t *pidp,
+                                   struct lockfile **lockfilep);
+#endif
  
  /* Returns the name of the lockfile that would be created for locking a file
   * named 'filename_'.  The caller is responsible for freeing the returned name,
@@ -110,7 +117,11 @@ lockfile_lock(const char *file, struct lockfile **lockfilep)
      lock_name = lockfile_name(file);
  
      ovs_mutex_lock(&lock_table_mutex);
-    error = lockfile_try_lock(lock_name, &pid, lockfilep);
+#ifdef _WIN32
+    error = lockfile_try_lock_windows(lock_name, &pid, lockfilep);
+#else
+    error = lockfile_try_lock_posix(lock_name, &pid, lockfilep);
+#endif
      ovs_mutex_unlock(&lock_table_mutex);
  
      if (error) {
@@ -138,7 +149,11 @@ lockfile_unlock(struct lockfile *lockfile)
  {
      if (lockfile) {
          ovs_mutex_lock(&lock_table_mutex);
+#ifdef _WIN32
+        lockfile_unlock_windows(lockfile);
+#else
          lockfile_unhash(lockfile);
+#endif
          ovs_mutex_unlock(&lock_table_mutex);
  
          COVERAGE_INC(lockfile_unlock);
@@ -218,8 +233,76 @@ lockfile_register(const char *name, dev_t device, ino_t inode, int fd)
      return lockfile;
  }
  
+#ifdef _WIN32
+static void
+lockfile_unlock_windows(struct lockfile *lockfile)
+    OVS_REQUIRES(&lock_table_mutex)
+{
+    if (lockfile->fd >= 0) {
+        /* Unused OVERLAPPED members must be zero, so clear all of them; the
+         * zero offset selects byte 0, the single byte that
+         * lockfile_try_lock_windows() locks. */
+        OVERLAPPED overl = { 0 };
+        UnlockFileEx(lockfile->lock_handle, 0, 1, 0, &overl);
+
+        close(lockfile->fd);
+        lockfile->fd = -1;
+    }
+}
+
+static int
+lockfile_try_lock_windows(const char *name, pid_t *pidp,
+                          struct lockfile **lockfilep)
+    OVS_REQUIRES(&lock_table_mutex)
+{
+    OVERLAPPED overl = { 0 };   /* All zero: the lock starts at byte 0. */
+    struct lockfile *lockfile;
+    HANDLE lock_handle;
+    int error;
+    int fd;
+
+    *pidp = 0;                  /* This path never reports a holder pid. */
+
+    fd = open(name, O_RDWR | O_CREAT, 0600);
+    if (fd < 0) {
+        error = errno;          /* Save it: logging may clobber errno. */
+        VLOG_WARN("%s: failed to open lock file: %s",
+                   name, ovs_strerror(error));
+        return error;
+    }
+
+    lock_handle = (HANDLE)_get_osfhandle(fd);
+    if (lock_handle == INVALID_HANDLE_VALUE) {
+        error = errno;
+        VLOG_WARN("%s: failed to get the file handle: %s",
+                   name, ovs_strerror(error));
+        close(fd);
+        return error;
+    }
+
+    /* Take an exclusive lock, failing immediately rather than blocking, on
+     * the region of file 'name' that includes just the first byte. */
+    if (!LockFileEx(lock_handle, LOCKFILE_EXCLUSIVE_LOCK
+                    | LOCKFILE_FAIL_IMMEDIATELY, 0, 1, 0, &overl)) {
+        VLOG_WARN("Failed to lock file : %s", ovs_lasterror_to_string());
+        close(fd);              /* Do not leak the descriptor on failure. */
+        return EEXIST;
+    }
+
+    lockfile = xmalloc(sizeof *lockfile);
+    lockfile->name = xstrdup(name);
+    lockfile->fd = fd;
+    lockfile->lock_handle = lock_handle;
+
+    *lockfilep = lockfile;
+    return 0;
+}
+#endif
+
+#ifndef _WIN32
  static int
-lockfile_try_lock(const char *name, pid_t *pidp, struct lockfile **lockfilep)
+lockfile_try_lock_posix(const char *name, pid_t *pidp,
+                        struct lockfile **lockfilep)
      OVS_REQUIRES(&lock_table_mutex)
  {
      struct flock l;
@@ -276,4 +359,4 @@ lockfile_try_lock(const char *name, pid_t *pidp, struct lockfile **lockfilep)
      }
      return error;
  }
-
+#endif
diff --git a/lib/meta-flow.c b/lib/meta-flow.c

index a168222..3afcd4c 100644 (file)
--- a/lib/meta-flow.c
+++ b/lib/meta-flow.c
@@ -1019,20 +1019,6 @@ mf_is_mask_valid(const struct mf_field *mf, const union mf_value *mask)
      OVS_NOT_REACHED();
  }
  
-static bool
-is_icmpv4(const struct flow *flow)
-{
-    return (flow->dl_type == htons(ETH_TYPE_IP)
-            && flow->nw_proto == IPPROTO_ICMP);
-}
-
-static bool
-is_icmpv6(const struct flow *flow)
-{
-    return (flow->dl_type == htons(ETH_TYPE_IPV6)
-            && flow->nw_proto == IPPROTO_ICMPV6);
-}
-
  /* Returns true if 'flow' meets the prerequisites for 'mf', false otherwise. */
  bool
  mf_are_prereqs_ok(const struct mf_field *mf, const struct flow *flow)
@@ -1769,7 +1755,7 @@ mf_set_flow_value(const struct mf_field *mf,
          break;
  
      case MFF_IP_FRAG:
-        flow->nw_frag &= value->u8;
+        flow->nw_frag = value->u8 & FLOW_NW_FRAG_MASK;
          break;
  
      case MFF_ARP_OP:
diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c

index b8e7ab7..f23fc9f 100644 (file)
--- a/lib/netdev-dummy.c
+++ b/lib/netdev-dummy.c
@@ -198,6 +198,7 @@ dummy_packet_stream_run(struct netdev_dummy *dev, struct dummy_packet_stream *s)
  
          txbuf = ofpbuf_from_list(list_front(&s->txq));
          retval = stream_send(s->stream, txbuf->data, txbuf->size);
+
          if (retval > 0) {
              ofpbuf_pull(txbuf, retval);
              if (!txbuf->size) {
@@ -229,6 +230,7 @@ dummy_packet_stream_run(struct netdev_dummy *dev, struct dummy_packet_stream *s)
  
          ofpbuf_prealloc_tailroom(&s->rxbuf, n);
          retval = stream_recv(s->stream, ofpbuf_tail(&s->rxbuf), n);
+
          if (retval > 0) {
              s->rxbuf.size += retval;
              if (retval == n && s->rxbuf.size > 2) {
@@ -367,26 +369,27 @@ dummy_packet_conn_set_config(struct dummy_packet_conn *conn,
          reconnect_set_name(reconnect, stream);
          reconnect_set_passive(reconnect, false, time_msec());
          reconnect_enable(reconnect, time_msec());
-        reconnect_set_backoff(reconnect, 1000, INT_MAX);
+        reconnect_set_backoff(reconnect, 100, INT_MAX);
          reconnect_set_probe_interval(reconnect, 0);
          conn->u.rconn.reconnect = reconnect;
+        conn->type = ACTIVE;
  
          error = stream_open(stream, &active_stream, DSCP_DEFAULT);
          conn->u.rconn.rstream = dummy_packet_stream_create(active_stream);
  
          switch (error) {
          case 0:
-            reconnect_connected(conn->u.rconn.reconnect, time_msec());
-            conn->type = ACTIVE;
+            reconnect_connected(reconnect, time_msec());
              break;
  
          case EAGAIN:
-            reconnect_connecting(conn->u.rconn.reconnect, time_msec());
+            reconnect_connecting(reconnect, time_msec());
              break;
  
          default:
-            reconnect_connecting(conn->u.rconn.reconnect, time_msec());
+            reconnect_connect_failed(reconnect, time_msec(), error);
              stream_close(active_stream);
+            conn->u.rconn.rstream->stream = NULL;
              break;
          }
      }
@@ -441,22 +444,29 @@ OVS_REQUIRES(dev->mutex)
      switch (reconnect_run(rconn->reconnect, time_msec())) {
      case RECONNECT_CONNECT:
          {
-            int err = stream_connect(rconn->rstream->stream);
+            int error;
+
+            if (rconn->rstream->stream) {
+                error = stream_connect(rconn->rstream->stream);
+            } else {
+                error = stream_open(reconnect_get_name(rconn->reconnect),
+                                    &rconn->rstream->stream, DSCP_DEFAULT);
+            }
  
-            switch (err) {
-            case 0: /* Connected. */
+            switch (error) {
+            case 0:
                  reconnect_connected(rconn->reconnect, time_msec());
-                dev->conn.type = ACTIVE;
                  break;
  
              case EAGAIN:
                  reconnect_connecting(rconn->reconnect, time_msec());
-                return;
+                break;
  
              default:
-                reconnect_connect_failed(rconn->reconnect, time_msec(), err);
+                reconnect_connect_failed(rconn->reconnect, time_msec(), error);
                  stream_close(rconn->rstream->stream);
-                return;
+                rconn->rstream->stream = NULL;
+                break;
              }
          }
          break;
@@ -475,6 +485,7 @@ OVS_REQUIRES(dev->mutex)
          if (err) {
              reconnect_disconnected(rconn->reconnect, time_msec(), err);
              stream_close(rconn->rstream->stream);
+            rconn->rstream->stream = NULL;
          }
      }
  }
@@ -511,7 +522,9 @@ dummy_packet_conn_wait(struct dummy_packet_conn *conn)
          }
          break;
      case ACTIVE:
-        dummy_packet_stream_wait(conn->u.rconn.rstream);
+        if (reconnect_is_connected(conn->u.rconn.reconnect)) {
+            dummy_packet_stream_wait(conn->u.rconn.rstream);
+        }
          break;
  
      case NONE:
@@ -537,8 +550,10 @@ dummy_packet_conn_send(struct dummy_packet_conn *conn,
          break;
  
      case ACTIVE:
-        dummy_packet_stream_send(conn->u.rconn.rstream, buffer, size);
-        dummy_packet_stream_wait(conn->u.rconn.rstream);
+        if (reconnect_is_connected(conn->u.rconn.reconnect)) {
+            dummy_packet_stream_send(conn->u.rconn.rstream, buffer, size);
+            dummy_packet_stream_wait(conn->u.rconn.rstream);
+        }
          break;
  
      case NONE:
diff --git a/lib/netdev.c b/lib/netdev.c

index bcdc5ee..f5f9233 100644 (file)
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
+ * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
@@ -67,15 +67,14 @@ static struct shash netdev_shash OVS_GUARDED_BY(netdev_mutex)
  
  /* Protects 'netdev_classes' against insertions or deletions.
   *
- * This is not an rwlock for performance reasons but to allow recursive
- * acquisition when calling into providers.  For example, netdev_run() calls
- * into provider 'run' functions, which might reasonably want to call one of
- * the netdev functions that takes netdev_class_rwlock read-only. */
-static struct ovs_rwlock netdev_class_rwlock OVS_ACQ_BEFORE(netdev_mutex)
-    = OVS_RWLOCK_INITIALIZER;
+ * This is a recursive mutex to allow recursive acquisition when calling into
+ * providers.  For example, netdev_run() calls into provider 'run' functions,
+ * which might reasonably want to call one of the netdev functions that takes
+ * netdev_class_mutex. */
+static struct ovs_mutex netdev_class_mutex OVS_ACQ_BEFORE(netdev_mutex);
  
  /* Contains 'struct netdev_registered_class'es. */
-static struct hmap netdev_classes OVS_GUARDED_BY(netdev_class_rwlock)
+static struct hmap netdev_classes OVS_GUARDED_BY(netdev_class_mutex)
      = HMAP_INITIALIZER(&netdev_classes);
  
  struct netdev_registered_class {
@@ -93,11 +92,13 @@ void update_device_args(struct netdev *, const struct shash *args);
  
  static void
  netdev_initialize(void)
-    OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex)
+    OVS_EXCLUDED(netdev_class_mutex, netdev_mutex)
  {
      static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
  
      if (ovsthread_once_start(&once)) {
+        ovs_mutex_init_recursive(&netdev_class_mutex);
+
          fatal_signal_add_hook(restore_all_flags, NULL, NULL, true);
          netdev_vport_patch_register();
  
@@ -124,17 +125,17 @@ netdev_initialize(void)
   * main poll loop. */
  void
  netdev_run(void)
-    OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex)
+    OVS_EXCLUDED(netdev_class_mutex, netdev_mutex)
  {
      struct netdev_registered_class *rc;
  
-    ovs_rwlock_rdlock(&netdev_class_rwlock);
+    ovs_mutex_lock(&netdev_class_mutex);
      HMAP_FOR_EACH (rc, hmap_node, &netdev_classes) {
          if (rc->class->run) {
              rc->class->run();
          }
      }
-    ovs_rwlock_unlock(&netdev_class_rwlock);
+    ovs_mutex_unlock(&netdev_class_mutex);
  }
  
  /* Arranges for poll_block() to wake up when netdev_run() needs to be called.
@@ -143,22 +144,22 @@ netdev_run(void)
   * main poll loop. */
  void
  netdev_wait(void)
-    OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex)
+    OVS_EXCLUDED(netdev_class_mutex, netdev_mutex)
  {
      struct netdev_registered_class *rc;
  
-    ovs_rwlock_rdlock(&netdev_class_rwlock);
+    ovs_mutex_lock(&netdev_class_mutex);
      HMAP_FOR_EACH (rc, hmap_node, &netdev_classes) {
          if (rc->class->wait) {
              rc->class->wait();
          }
      }
-    ovs_rwlock_unlock(&netdev_class_rwlock);
+    ovs_mutex_unlock(&netdev_class_mutex);
  }
  
  static struct netdev_registered_class *
  netdev_lookup_class(const char *type)
-    OVS_REQ_RDLOCK(netdev_class_rwlock)
+    OVS_REQ_RDLOCK(netdev_class_mutex)
  {
      struct netdev_registered_class *rc;
  
@@ -175,11 +176,11 @@ netdev_lookup_class(const char *type)
   * registration, new netdevs of that type can be opened using netdev_open(). */
  int
  netdev_register_provider(const struct netdev_class *new_class)
-    OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex)
+    OVS_EXCLUDED(netdev_class_mutex, netdev_mutex)
  {
      int error;
  
-    ovs_rwlock_wrlock(&netdev_class_rwlock);
+    ovs_mutex_lock(&netdev_class_mutex);
      if (netdev_lookup_class(new_class->type)) {
          VLOG_WARN("attempted to register duplicate netdev provider: %s",
                     new_class->type);
@@ -199,7 +200,7 @@ netdev_register_provider(const struct netdev_class *new_class)
                       new_class->type, ovs_strerror(error));
          }
      }
-    ovs_rwlock_unlock(&netdev_class_rwlock);
+    ovs_mutex_unlock(&netdev_class_mutex);
  
      return error;
  }
@@ -209,12 +210,12 @@ netdev_register_provider(const struct netdev_class *new_class)
   * new netdevs of that type cannot be opened using netdev_open(). */
  int
  netdev_unregister_provider(const char *type)
-    OVS_EXCLUDED(netdev_class_rwlock, netdev_mutex)
+    OVS_EXCLUDED(netdev_class_mutex, netdev_mutex)
  {
      struct netdev_registered_class *rc;
      int error;
  
-    ovs_rwlock_wrlock(&netdev_class_rwlock);
+    ovs_mutex_lock(&netdev_class_mutex);
      rc = netdev_lookup_class(type);
      if (!rc) {
          VLOG_WARN("attempted to unregister a netdev provider that is not "
@@ -235,7 +236,7 @@ netdev_unregister_provider(const char *type)
              error = EBUSY;
          }
      }
-    ovs_rwlock_unlock(&netdev_class_rwlock);
+    ovs_mutex_unlock(&netdev_class_mutex);
  
      return error;
  }
@@ -251,11 +252,11 @@ netdev_enumerate_types(struct sset *types)
      netdev_initialize();
      sset_clear(types);
  
-    ovs_rwlock_rdlock(&netdev_class_rwlock);
+    ovs_mutex_lock(&netdev_class_mutex);
      HMAP_FOR_EACH (rc, hmap_node, &netdev_classes) {
          sset_add(types, rc->class->type);
      }
-    ovs_rwlock_unlock(&netdev_class_rwlock);
+    ovs_mutex_unlock(&netdev_class_mutex);
  }
  
  /* Check that the network device name is not the same as any of the registered
@@ -271,15 +272,15 @@ netdev_is_reserved_name(const char *name)
  
      netdev_initialize();
  
-    ovs_rwlock_rdlock(&netdev_class_rwlock);
+    ovs_mutex_lock(&netdev_class_mutex);
      HMAP_FOR_EACH (rc, hmap_node, &netdev_classes) {
          const char *dpif_port = netdev_vport_class_get_dpif_port(rc->class);
          if (dpif_port && !strcmp(dpif_port, name)) {
-            ovs_rwlock_unlock(&netdev_class_rwlock);
+            ovs_mutex_unlock(&netdev_class_mutex);
              return true;
          }
      }
-    ovs_rwlock_unlock(&netdev_class_rwlock);
+    ovs_mutex_unlock(&netdev_class_mutex);
  
      if (!strncmp(name, "ovs-", 4)) {
          struct sset types;
@@ -315,7 +316,7 @@ netdev_open(const char *name, const char *type, struct netdev **netdevp)
  
      netdev_initialize();
  
-    ovs_rwlock_rdlock(&netdev_class_rwlock);
+    ovs_mutex_lock(&netdev_class_mutex);
      ovs_mutex_lock(&netdev_mutex);
      netdev = shash_find_data(&netdev_shash, name);
      if (!netdev) {
@@ -356,7 +357,7 @@ netdev_open(const char *name, const char *type, struct netdev **netdevp)
      }
  
      ovs_mutex_unlock(&netdev_mutex);
-    ovs_rwlock_unlock(&netdev_class_rwlock);
+    ovs_mutex_unlock(&netdev_class_mutex);
  
      if (!error) {
          netdev->ref_cnt++;
@@ -462,11 +463,11 @@ netdev_unref(struct netdev *dev)
          dev->netdev_class->dealloc(dev);
          ovs_mutex_unlock(&netdev_mutex);
  
-        ovs_rwlock_rdlock(&netdev_class_rwlock);
+        ovs_mutex_lock(&netdev_class_mutex);
          rc = netdev_lookup_class(class->type);
          atomic_sub(&rc->ref_cnt, 1, &old_ref_cnt);
          ovs_assert(old_ref_cnt > 0);
-        ovs_rwlock_unlock(&netdev_class_rwlock);
+        ovs_mutex_unlock(&netdev_class_mutex);
      } else {
          ovs_mutex_unlock(&netdev_mutex);
      }
diff --git a/lib/ofp-actions.c b/lib/ofp-actions.c

index 781c3a1..3cd1e8b 100644 (file)
--- a/lib/ofp-actions.c
+++ b/lib/ofp-actions.c
@@ -2067,10 +2067,10 @@ ofpact_check__(enum ofputil_protocol *usable_protocols, struct ofpact *a,
          return 0;
  
      case OFPACT_POP_MPLS:
-        flow->dl_type = ofpact_get_POP_MPLS(a)->ethertype;
          if (!eth_type_mpls(flow->dl_type)) {
              inconsistent_match(usable_protocols);
          }
+        flow->dl_type = ofpact_get_POP_MPLS(a)->ethertype;
          return 0;
  
      case OFPACT_SAMPLE:
diff --git a/lib/ofp-util.c b/lib/ofp-util.c

index 7de82c6..fca18de 100644 (file)
--- a/lib/ofp-util.c
+++ b/lib/ofp-util.c
@@ -169,9 +169,11 @@ ofputil_match_from_ofp10_match(const struct ofp10_match *ofmatch,
          match->wc.masks.vlan_tci = htons(0xffff);
      } else {
          ovs_be16 vid, pcp, tci;
+        uint16_t hpcp;
  
          vid = ofmatch->dl_vlan & htons(VLAN_VID_MASK);
-        pcp = htons((ofmatch->dl_vlan_pcp << VLAN_PCP_SHIFT) & VLAN_PCP_MASK);
+        hpcp = (ofmatch->dl_vlan_pcp << VLAN_PCP_SHIFT) & VLAN_PCP_MASK;
+        pcp = htons(hpcp);
          tci = vid | pcp | htons(VLAN_CFI);
          match->flow.vlan_tci = tci & match->wc.masks.vlan_tci;
      }
@@ -3640,7 +3642,7 @@ ofputil_decode_ofp11_port(struct ofputil_phy_port *pp,
      ovs_strlcpy(pp->name, op->name, OFP_MAX_PORT_NAME_LEN);
  
      pp->config = ntohl(op->config) & OFPPC11_ALL;
-    pp->state = ntohl(op->state) & OFPPC11_ALL;
+    pp->state = ntohl(op->state) & OFPPS11_ALL;
  
      pp->curr = netdev_port_features_from_ofp11(op->curr);
      pp->advertised = netdev_port_features_from_ofp11(op->advertised);
diff --git a/lib/ovs-thread.c b/lib/ovs-thread.c

index a20b2fd..4dfccaf 100644 (file)
--- a/lib/ovs-thread.c
+++ b/lib/ovs-thread.c
@@ -125,6 +125,12 @@ XPTHREAD_FUNC1(pthread_mutexattr_destroy, pthread_mutexattr_t *);
  XPTHREAD_FUNC2(pthread_mutexattr_settype, pthread_mutexattr_t *, int);
  XPTHREAD_FUNC2(pthread_mutexattr_gettype, pthread_mutexattr_t *, int *);
  
+XPTHREAD_FUNC1(pthread_rwlockattr_init, pthread_rwlockattr_t *);
+XPTHREAD_FUNC1(pthread_rwlockattr_destroy, pthread_rwlockattr_t *);
+#ifdef PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP
+XPTHREAD_FUNC2(pthread_rwlockattr_setkind_np, pthread_rwlockattr_t *, int);
+#endif
+
  XPTHREAD_FUNC2(pthread_cond_init, pthread_cond_t *, pthread_condattr_t *);
  XPTHREAD_FUNC1(pthread_cond_destroy, pthread_cond_t *);
  XPTHREAD_FUNC1(pthread_cond_signal, pthread_cond_t *);
@@ -168,17 +174,42 @@ ovs_mutex_init_recursive(const struct ovs_mutex *mutex)
      ovs_mutex_init__(mutex, PTHREAD_MUTEX_RECURSIVE);
  }
  
+/* Initializes 'mutex' as an adaptive mutex on platforms that define
+ * PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP.  Elsewhere, falls back to
+ * ovs_mutex_init(). */
+void
+ovs_mutex_init_adaptive(const struct ovs_mutex *mutex)
+{
+#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+    ovs_mutex_init__(mutex, PTHREAD_MUTEX_ADAPTIVE_NP);
+#else
+    ovs_mutex_init(mutex);
+#endif
+}
+
+/* Initializes read-write lock 'l_'.  Where the platform provides
+ * PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP, the lock is created with that
+ * kind.  Calls ovs_abort() if pthread_rwlock_init() fails. */
  void
  ovs_rwlock_init(const struct ovs_rwlock *l_)
  {
      struct ovs_rwlock *l = CONST_CAST(struct ovs_rwlock *, l_);
+    pthread_rwlockattr_t attr;
      int error;
  
      l->where = NULL;
+
+    xpthread_rwlockattr_init(&attr);
+#ifdef PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP
+    xpthread_rwlockattr_setkind_np(
+        &attr, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
+#endif
-    error = pthread_rwlock_init(&l->lock, NULL);
+    /* Pass 'attr' so that the kind selected above actually takes effect. */
+    error = pthread_rwlock_init(&l->lock, &attr);
      if (OVS_UNLIKELY(error)) {
          ovs_abort(error, "pthread_rwlock_init failed");
      }
+    xpthread_rwlockattr_destroy(&attr);
  }
  
  void
diff --git a/lib/ovs-thread.h b/lib/ovs-thread.h

index f031894..2e9a937 100644 (file)
--- a/lib/ovs-thread.h
+++ b/lib/ovs-thread.h
@@ -37,6 +37,13 @@ struct OVS_LOCKABLE ovs_mutex {
  #define OVS_MUTEX_INITIALIZER { PTHREAD_MUTEX_INITIALIZER, NULL }
  #endif
  
+#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+#define OVS_ADAPTIVE_MUTEX_INITIALIZER                  \
+    { PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP, NULL }
+#else
+#define OVS_ADAPTIVE_MUTEX_INITIALIZER OVS_MUTEX_INITIALIZER
+#endif
+
  /* ovs_mutex functions analogous to pthread_mutex_*() functions.
   *
   * Most of these functions abort the process with an error message on any
@@ -44,6 +51,7 @@ struct OVS_LOCKABLE ovs_mutex {
   * return value to the caller and aborts on any other error. */
  void ovs_mutex_init(const struct ovs_mutex *);
  void ovs_mutex_init_recursive(const struct ovs_mutex *);
+void ovs_mutex_init_adaptive(const struct ovs_mutex *);
  void ovs_mutex_destroy(const struct ovs_mutex *);
  void ovs_mutex_unlock(const struct ovs_mutex *mutex) OVS_RELEASES(mutex);
  void ovs_mutex_lock_at(const struct ovs_mutex *mutex, const char *where)
@@ -69,14 +77,30 @@ void xpthread_mutexattr_destroy(pthread_mutexattr_t *);
  void xpthread_mutexattr_settype(pthread_mutexattr_t *, int type);
  void xpthread_mutexattr_gettype(pthread_mutexattr_t *, int *typep);
  
-/* Read-write lock. */
+/* Read-write lock.
+ *
+ * An ovs_rwlock does not support recursive readers, because POSIX allows
+ * taking the reader lock recursively to deadlock when a thread is waiting on
+ * the write-lock.  (NetBSD does deadlock.)  glibc rwlocks in their default
+ * configuration do not deadlock, but ovs_rwlock_init() initializes rwlocks as
+ * non-recursive (which will deadlock) for two reasons:
+ *
+ *     - glibc only provides fairness to writers in this mode.
+ *
+ *     - It's better to find bugs in the primary Open vSwitch target rather
+ *       than exposing them only to porters. */
  struct OVS_LOCKABLE ovs_rwlock {
      pthread_rwlock_t lock;
      const char *where;
  };
  
  /* Initializer. */
+#ifdef PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP
+#define OVS_RWLOCK_INITIALIZER \
+        { PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP, NULL }
+#else
  #define OVS_RWLOCK_INITIALIZER { PTHREAD_RWLOCK_INITIALIZER, NULL }
+#endif
  
  /* ovs_rwlock functions analogous to pthread_rwlock_*() functions.
   *
@@ -87,6 +111,13 @@ void ovs_rwlock_init(const struct ovs_rwlock *);
  void ovs_rwlock_destroy(const struct ovs_rwlock *);
  void ovs_rwlock_unlock(const struct ovs_rwlock *rwlock) OVS_RELEASES(rwlock);
  
+/* Wrappers for pthread_rwlockattr_*() that abort the process on any error. */
+void xpthread_rwlockattr_init(pthread_rwlockattr_t *);
+void xpthread_rwlockattr_destroy(pthread_rwlockattr_t *);
+#ifdef PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP
+void xpthread_rwlockattr_setkind_np(pthread_rwlockattr_t *, int kind);
+#endif
+
  void ovs_rwlock_wrlock_at(const struct ovs_rwlock *rwlock, const char *where)
      OVS_ACQ_WRLOCK(rwlock);
  #define ovs_rwlock_wrlock(rwlock) \
@@ -115,17 +146,6 @@ void xpthread_cond_destroy(pthread_cond_t *);
  void xpthread_cond_signal(pthread_cond_t *);
  void xpthread_cond_broadcast(pthread_cond_t *);
  
-#ifdef __CHECKER__
-/* Replace these functions by the macros already defined in the <pthread.h>
- * annotations, because the macro definitions have correct semantics for the
- * conditional acquisition that can't be captured in a function annotation.
- * The difference in semantics from pthread_*() to xpthread_*() does not matter
- * because sparse is not a compiler. */
-#define xpthread_mutex_trylock pthread_mutex_trylock
-#define xpthread_rwlock_tryrdlock pthread_rwlock_tryrdlock
-#define xpthread_rwlock_trywrlock pthread_rwlock_trywrlock
-#endif
-
  void xpthread_key_create(pthread_key_t *, void (*destructor)(void *));
  void xpthread_key_delete(pthread_key_t);
  void xpthread_setspecific(pthread_key_t, const void *);
diff --git a/lib/ovsdb-idl.c b/lib/ovsdb-idl.c

index 02bee75..7556b7f 100644 (file)
--- a/lib/ovsdb-idl.c
+++ b/lib/ovsdb-idl.c
@@ -394,6 +394,14 @@ ovsdb_idl_has_ever_connected(const struct ovsdb_idl *idl)
      return ovsdb_idl_get_seqno(idl) != 0;
  }
  
+/* Reconfigures 'idl' so that it will reconnect to the database if the
+ * connection is dropped. */
+void
+ovsdb_idl_enable_reconnect(struct ovsdb_idl *idl)
+{
+    jsonrpc_session_enable_reconnect(idl->session);
+}
+
  /* Forces 'idl' to drop its connection to the database and reconnect.  In the
   * meantime, the contents of 'idl' will not change. */
  void
diff --git a/lib/ovsdb-idl.h b/lib/ovsdb-idl.h

index 6b5e198..cb0ad11 100644 (file)
--- a/lib/ovsdb-idl.h
+++ b/lib/ovsdb-idl.h
@@ -57,6 +57,7 @@ bool ovsdb_idl_is_lock_contended(const struct ovsdb_idl *);
  
  unsigned int ovsdb_idl_get_seqno(const struct ovsdb_idl *);
  bool ovsdb_idl_has_ever_connected(const struct ovsdb_idl *);
+void ovsdb_idl_enable_reconnect(struct ovsdb_idl *);
  void ovsdb_idl_force_reconnect(struct ovsdb_idl *);
  void ovsdb_idl_verify_write_only(struct ovsdb_idl *);
  
diff --git a/lib/packets.c b/lib/packets.c

index 003e554..7238f42 100644 (file)
--- a/lib/packets.c
+++ b/lib/packets.c
@@ -218,7 +218,7 @@ eth_pop_vlan(struct ofpbuf *packet)
  }
  
  /* Set ethertype of the packet. */
-void
+static void
  set_ethertype(struct ofpbuf *packet, ovs_be16 eth_type)
  {
      struct eth_header *eh = packet->data;
@@ -329,9 +329,10 @@ push_mpls(struct ofpbuf *packet, ovs_be16 ethtype, ovs_be32 lse)
          return;
      }
  
+    set_ethertype(packet, ethtype);
+
      if (!is_mpls(packet)) {
-        /* Set ethtype and MPLS label stack entry. */
-        set_ethertype(packet, ethtype);
+        /* Set MPLS label stack entry. */
          packet->l2_5 = packet->l3;
      }
  
diff --git a/lib/packets.h b/lib/packets.h

index 8e21fa8..1855a1c 100644 (file)
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
+ * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
@@ -154,8 +154,6 @@ void compose_rarp(struct ofpbuf *, const uint8_t eth_src[ETH_ADDR_LEN]);
  void eth_push_vlan(struct ofpbuf *, ovs_be16 tpid, ovs_be16 tci);
  void eth_pop_vlan(struct ofpbuf *);
  
-void set_ethertype(struct ofpbuf *packet, ovs_be16 eth_type);
-
  const char *eth_from_hex(const char *hex, struct ofpbuf **packetp);
  void eth_format_masked(const uint8_t eth[ETH_ADDR_LEN],
                         const uint8_t mask[ETH_ADDR_LEN], struct ds *s);
@@ -627,6 +625,18 @@ static inline bool is_ip_any(const struct flow *flow)
      return dl_type_is_ip_any(flow->dl_type);
  }
  
+static inline bool is_icmpv4(const struct flow *flow)
+{
+    return (flow->dl_type == htons(ETH_TYPE_IP)
+            && flow->nw_proto == IPPROTO_ICMP);
+}
+
+static inline bool is_icmpv6(const struct flow *flow)
+{
+    return (flow->dl_type == htons(ETH_TYPE_IPV6)
+            && flow->nw_proto == IPPROTO_ICMPV6);
+}
+
  void format_ipv6_addr(char *addr_str, const struct in6_addr *addr);
  void print_ipv6_addr(struct ds *string, const struct in6_addr *addr);
  void print_ipv6_masked(struct ds *string, const struct in6_addr *addr,
diff --git a/lib/pcap-file.c b/lib/pcap-file.c

index fdff33c..2d3f9fe 100644 (file)
--- a/lib/pcap-file.c
+++ b/lib/pcap-file.c
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2009, 2010, 2012, 2013 Nicira, Inc.
+ * Copyright (c) 2009, 2010, 2012, 2013, 2014 Nicira, Inc.
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
@@ -254,28 +254,27 @@ tcp_reader_close(struct tcp_reader *r)
  }
  
  static struct tcp_stream *
-tcp_stream_lookup(struct tcp_reader *r, const struct flow *flow)
+tcp_stream_lookup(struct tcp_reader *r,
+                  const struct tcp_key *key, uint32_t hash)
  {
      struct tcp_stream *stream;
-    struct tcp_key key;
-    uint32_t hash;
-
-    memset(&key, 0, sizeof key);
-    key.nw_src = flow->nw_src;
-    key.nw_dst = flow->nw_dst;
-    key.tp_src = flow->tp_src;
-    key.tp_dst = flow->tp_dst;
-    hash = hash_bytes(&key, sizeof key, 0);
  
      HMAP_FOR_EACH_WITH_HASH (stream, hmap_node, hash, &r->streams) {
-        if (!memcmp(&stream->key, &key, sizeof key)) {
+        if (!memcmp(&stream->key, key, sizeof *key)) {
              return stream;
          }
      }
+    return NULL;
+}
+
+static struct tcp_stream *
+tcp_stream_new(struct tcp_reader *r, const struct tcp_key *key, uint32_t hash)
+{
+    struct tcp_stream *stream;
  
      stream = xmalloc(sizeof *stream);
      hmap_insert(&r->streams, &stream->hmap_node, hash);
-    memcpy(&stream->key, &key, sizeof key);
+    memcpy(&stream->key, key, sizeof *key);
      stream->seq_no = 0;
      ofpbuf_init(&stream->payload, 2048);
      return stream;
@@ -299,6 +298,9 @@ tcp_reader_run(struct tcp_reader *r, const struct flow *flow,
      struct tcp_stream *stream;
      struct tcp_header *tcp;
      struct ofpbuf *payload;
+    unsigned int l7_length;
+    struct tcp_key key;
+    uint32_t hash;
      uint32_t seq;
      uint8_t flags;
  
@@ -307,14 +309,32 @@ tcp_reader_run(struct tcp_reader *r, const struct flow *flow,
          || !packet->l7) {
          return NULL;
      }
-
-    stream = tcp_stream_lookup(r, flow);
-    payload = &stream->payload;
-
      tcp = packet->l4;
      flags = TCP_FLAGS(tcp->tcp_ctl);
+    l7_length = (char *) ofpbuf_end(packet) - (char *) packet->l7;
      seq = ntohl(get_16aligned_be32(&tcp->tcp_seq));
-    if (flags & TCP_SYN) {
+
+    /* Construct key. */
+    memset(&key, 0, sizeof key);
+    key.nw_src = flow->nw_src;
+    key.nw_dst = flow->nw_dst;
+    key.tp_src = flow->tp_src;
+    key.tp_dst = flow->tp_dst;
+    hash = hash_bytes(&key, sizeof key, 0);
+
+    /* Find existing stream or start a new one for a SYN or if there's data. */
+    stream = tcp_stream_lookup(r, &key, hash);
+    if (!stream) {
+        if (flags & TCP_SYN || l7_length) {
+            stream = tcp_stream_new(r, &key, hash);
+            stream->seq_no = flags & TCP_SYN ? seq + 1 : seq;
+        } else {
+            return NULL;
+        }
+    }
+
+    payload = &stream->payload;
+    if (flags & TCP_SYN || !stream->seq_no) {
          ofpbuf_clear(payload);
          stream->seq_no = seq + 1;
          return NULL;
@@ -322,16 +342,13 @@ tcp_reader_run(struct tcp_reader *r, const struct flow *flow,
          tcp_stream_destroy(r, stream);
          return NULL;
      } else if (seq == stream->seq_no) {
-        size_t length;
-
          /* Shift all of the existing payload to the very beginning of the
           * allocated space, so that we reuse allocated space instead of
           * continually expanding it. */
          ofpbuf_shift(payload, (char *) payload->base - (char *) payload->data);
  
-        length = (char *) ofpbuf_end(packet) - (char *) packet->l7;
-        ofpbuf_put(payload, packet->l7, length);
-        stream->seq_no += length;
+        ofpbuf_put(payload, packet->l7, l7_length);
+        stream->seq_no += l7_length;
          return payload;
      } else {
          return NULL;
diff --git a/lib/poll-loop.c b/lib/poll-loop.c

index abd44d1..510903e 100644 (file)
--- a/lib/poll-loop.c
+++ b/lib/poll-loop.c
@@ -81,11 +81,13 @@ find_poll_node(struct poll_loop *loop, int fd, uint32_t wevent)
   *
   * On Windows system:
   *
- *     Register 'wevent' handle for the specified 'events'.  These wevents are
- *     given to the handleMultipleObjects() to be polled.  The event
- *     registration is one-shot: only the following call to poll_block() is
- *     affected.  The event will need to be re-registered after poll_block() is
- *     called if it is to persist.
+ *     If both a 'wevent' handle and an 'fd' are specified, associate the 'fd'
+ *     with that 'wevent' for 'events' (implemented in poll_block()).
+ *     If no 'fd' is specified, wake up on any event on that 'wevent'.
+ *     These wevents are given to the WaitForMultipleObjects() to be polled.
+ *     The event registration is one-shot: only the following call to
+ *     poll_block() is affected.  The event will need to be re-registered after
+ *     poll_block() is called if it is to persist.
   *
   * ('where' is used in debug logging.  Commonly one would use poll_fd_wait() to
   * automatically provide the caller's source file and line number for
@@ -293,6 +295,16 @@ poll_block(void)
          pollfds[i] = node->pollfd;
  #ifdef _WIN32
          wevents[i] = node->wevent;
+        if (node->pollfd.fd && node->wevent) {
+            short int wsa_events = 0;
+            if (node->pollfd.events & POLLIN) {
+                wsa_events |= FD_READ | FD_ACCEPT | FD_CLOSE;
+            }
+            if (node->pollfd.events & POLLOUT) {
+                wsa_events |= FD_WRITE | FD_CONNECT | FD_CLOSE;
+            }
+            WSAEventSelect(node->pollfd.fd, node->wevent, wsa_events);
+        }
  #endif
          i++;
      }
diff --git a/lib/poll-loop.h b/lib/poll-loop.h

index ae4c0c0..412bd09 100644 (file)
--- a/lib/poll-loop.h
+++ b/lib/poll-loop.h
@@ -53,9 +53,9 @@ extern "C" {
  void poll_fd_wait_at(int fd, HANDLE wevent, short int events, const char *where);
  #ifndef _WIN32
  #define poll_fd_wait(fd, events) poll_fd_wait_at(fd, 0, events, SOURCE_LOCATOR)
-#else
-#define poll_fd_wait_event(fd, wevent, events) poll_fd_wait_at(fd, wevent, events, SOURCE_LOCATOR)
  #endif
+#define poll_fd_wait_event(fd, wevent, events)  \
+    poll_fd_wait_at(fd, wevent, events, SOURCE_LOCATOR)
  
  void poll_timer_wait_at(long long int msec, const char *where);
  #define poll_timer_wait(msec) poll_timer_wait_at(msec, SOURCE_LOCATOR)
diff --git a/lib/process.c b/lib/process.c

index 5dd34b3..313f11f 100644 (file)
--- a/lib/process.c
+++ b/lib/process.c
@@ -21,6 +21,7 @@
  #include <signal.h>
  #include <stdlib.h>
  #include <string.h>
+#include <sys/resource.h>
  #include <sys/stat.h>
  #include <sys/wait.h>
  #include <unistd.h>
@@ -68,6 +69,7 @@ static void sigchld_handler(int signr OVS_UNUSED);
  void
  process_init(void)
  {
+#ifndef _WIN32
      static bool inited;
      struct sigaction sa;
  
@@ -86,6 +88,7 @@ process_init(void)
      sigemptyset(&sa.sa_mask);
      sa.sa_flags = SA_NOCLDSTOP | SA_RESTART;
      xsigaction(SIGCHLD, &sa, NULL);
+#endif
  }
  
  char *
@@ -163,6 +166,49 @@ process_register(const char *name, pid_t pid)
      return p;
  }
  
+#ifndef _WIN32
+static bool
+rlim_is_finite(rlim_t limit)
+{
+    if (limit == RLIM_INFINITY) {
+        return false;
+    }
+
+#ifdef RLIM_SAVED_CUR           /* FreeBSD 8.0 lacks RLIM_SAVED_CUR. */
+    if (limit == RLIM_SAVED_CUR) {
+        return false;
+    }
+#endif
+
+#ifdef RLIM_SAVED_MAX           /* FreeBSD 8.0 lacks RLIM_SAVED_MAX. */
+    if (limit == RLIM_SAVED_MAX) {
+        return false;
+    }
+#endif
+
+    return true;
+}
+
+/* Returns the maximum valid FD value, plus 1. */
+static int
+get_max_fds(void)
+{
+    static int max_fds;
+
+    if (!max_fds) {
+        struct rlimit r;
+        if (!getrlimit(RLIMIT_NOFILE, &r) && rlim_is_finite(r.rlim_cur)) {
+            max_fds = r.rlim_cur;
+        } else {
+            VLOG_WARN("failed to obtain fd limit, defaulting to 1024");
+            max_fds = 1024;
+        }
+    }
+
+    return max_fds;
+}
+#endif /* _WIN32 */
+
  /* Starts a subprocess with the arguments in the null-terminated argv[] array.
   * argv[0] is used as the name of the process.  Searches the PATH environment
   * variable to find the program to execute.
@@ -178,6 +224,7 @@ process_register(const char *name, pid_t pid)
  int
  process_start(char **argv, struct process **pp)
  {
+#ifndef _WIN32
      pid_t pid;
      int error;
  
@@ -212,6 +259,10 @@ process_start(char **argv, struct process **pp)
                  argv[0], ovs_strerror(errno));
          _exit(1);
      }
+#else
+    *pp = NULL;
+    return ENOSYS;
+#endif
  }
  
  /* Destroys process 'p'. */
@@ -230,9 +281,13 @@ process_destroy(struct process *p)
  int
  process_kill(const struct process *p, int signr)
  {
+#ifndef _WIN32
      return (p->exited ? ESRCH
              : !kill(p->pid, signr) ? 0
              : errno);
+#else
+    return ENOSYS;
+#endif
  }
  
  /* Returns the pid of process 'p'. */
@@ -275,6 +330,7 @@ char *
  process_status_msg(int status)
  {
      struct ds ds = DS_EMPTY_INITIALIZER;
+#ifndef _WIN32
      if (WIFEXITED(status)) {
          ds_put_format(&ds, "exit status %d", WEXITSTATUS(status));
      } else if (WIFSIGNALED(status)) {
@@ -293,6 +349,9 @@ process_status_msg(int status)
      if (WCOREDUMP(status)) {
          ds_put_cstr(&ds, ", core dumped");
      }
+#else
+    ds_put_cstr(&ds, "function not supported.");
+#endif
      return ds_cstr(&ds);
  }
  
@@ -300,6 +359,7 @@ process_status_msg(int status)
  void
  process_run(void)
  {
+#ifndef _WIN32
      char buf[_POSIX_PIPE_BUF];
  
      if (!list_is_empty(&all_processes) && read(fds[0], buf, sizeof buf) > 0) {
@@ -322,6 +382,7 @@ process_run(void)
              }
          }
      }
+#endif
  }
  
  
@@ -330,11 +391,15 @@ process_run(void)
  void
  process_wait(struct process *p)
  {
+#ifndef _WIN32
      if (p->exited) {
          poll_immediate_wake();
      } else {
          poll_fd_wait(fds[0], POLLIN);
      }
+#else
+    OVS_NOT_REACHED();
+#endif
  }
  
  char *
diff --git a/lib/reconnect.c b/lib/reconnect.c

index 0a773bc..5296c5c 100644 (file)
--- a/lib/reconnect.c
+++ b/lib/reconnect.c
@@ -348,7 +348,7 @@ reconnect_disconnected(struct reconnect *fsm, long long int now, int error)
          } else {
              const char *type = fsm->passive ? "listen" : "connection";
              if (error > 0) {
-                VLOG_WARN("%s: %s attempt failed (%s)",
+                VLOG_INFO("%s: %s attempt failed (%s)",
                            fsm->name, type, ovs_strerror(error));
              } else {
                  VLOG(fsm->info, "%s: %s attempt timed out", fsm->name, type);
diff --git a/lib/signals.c b/lib/signals.c

index 27da5d6..85e5c79 100644 (file)
--- a/lib/signals.c
+++ b/lib/signals.c
@@ -39,66 +39,6 @@ VLOG_DEFINE_THIS_MODULE(signals);
  #define N_SIGNALS 32
  #endif
  
-struct signal {
-    int fds[2];
-};
-
-static struct signal signals[N_SIGNALS];
-
-static void signal_handler(int signr);
-
-/* Sets up a handler for 'signr' and returns a structure that represents it.
- *
- * Only one handler for a given signal may be registered. */
-struct signal *
-signal_register(int signr)
-{
-    struct sigaction sa;
-    struct signal *s;
-
-    ovs_assert(signr >= 1 && signr < N_SIGNALS);
-
-    /* Create a pipe. */
-    s = &signals[signr];
-    ovs_assert(!s->fds[0] && !s->fds[1]);
-    xpipe_nonblocking(s->fds);
-
-    /* Install signal handler. */
-    memset(&sa, 0, sizeof sa);
-    sa.sa_handler = signal_handler;
-    sigemptyset(&sa.sa_mask);
-    sa.sa_flags = SA_RESTART;
-    xsigaction(signr, &sa, NULL);
-
-    return s;
-}
-
-/* Returns true if signal 's' has been received since the last call to this
- * function with argument 's'. */
-bool
-signal_poll(struct signal *s)
-{
-    char buf[_POSIX_PIPE_BUF];
-
-    return read(s->fds[0], buf, sizeof buf) > 0;
-}
-
-/* Causes the next call to poll_block() to wake up when signal_poll(s) would
- * return true. */
-void
-signal_wait(struct signal *s)
-{
-    poll_fd_wait(s->fds[0], POLLIN);
-}
-\f
-static void
-signal_handler(int signr)
-{
-    if (signr >= 1 && signr < N_SIGNALS) {
-        ignore(write(signals[signr].fds[1], "", 1));
-    }
-}
-
  /* Returns the name of signal 'signum' as a string.  The return value is either
   * a statically allocated constant string or the 'bufsize'-byte buffer
   * 'namebuf'.  'bufsize' should be at least SIGNAL_NAME_BUFSIZE.
@@ -133,12 +73,3 @@ xsigaction(int signum, const struct sigaction *new, struct sigaction *old)
                     ovs_strerror(errno));
      }
  }
-
-void
-xpthread_sigmask(int how, const sigset_t *new, sigset_t *old)
-{
-    int error = pthread_sigmask(how, new, old);
-    if (error) {
-        VLOG_FATAL("pthread_sigmask failed (%s)", ovs_strerror(error));
-    }
-}
diff --git a/lib/signals.h b/lib/signals.h

index 3294293..3ef1b5b 100644 (file)
--- a/lib/signals.h
+++ b/lib/signals.h
@@ -18,18 +18,11 @@
  #define SIGNALS_H 1
  
  #include <signal.h>
-#include <stdbool.h>
-#include <stddef.h>
  #include "type-props.h"
  
-struct signal *signal_register(int signr);
-bool signal_poll(struct signal *);
-void signal_wait(struct signal *);
-
  enum { SIGNAL_NAME_BUFSIZE = 7 + INT_STRLEN(int) + 1 };
  const char *signal_name(int signum, char *namebuf, size_t bufsize);
  
  void xsigaction(int signum, const struct sigaction *, struct sigaction *old);
-void xpthread_sigmask(int how, const sigset_t *, sigset_t *old);
  
  #endif /* signals.h */
diff --git a/lib/socket-util.c b/lib/socket-util.c

index f5d3137..728c76e 100644 (file)
--- a/lib/socket-util.c
+++ b/lib/socket-util.c
@@ -27,7 +27,6 @@
  #include <stdlib.h>
  #include <string.h>
  #include <sys/ioctl.h>
-#include <sys/resource.h>
  #include <sys/socket.h>
  #include <sys/stat.h>
  #include <sys/uio.h>
@@ -35,6 +34,7 @@
  #include <unistd.h>
  #include "dynamic-string.h"
  #include "fatal-signal.h"
+#include "ovs-thread.h"
  #include "packets.h"
  #include "poll-loop.h"
  #include "util.h"
@@ -72,6 +72,7 @@ static int getsockopt_int(int fd, int level, int option, const char *optname,
  int
  set_nonblocking(int fd)
  {
+#ifndef _WIN32
      int flags = fcntl(fd, F_GETFL, 0);
      if (flags != -1) {
          if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) != -1) {
@@ -84,6 +85,15 @@ set_nonblocking(int fd)
          VLOG_ERR("fcntl(F_GETFL) failed: %s", ovs_strerror(errno));
          return errno;
      }
+#else
+    unsigned long arg = 1;
+    if (ioctlsocket(fd, FIONBIO, &arg)) {
+        int error = sock_errno();
+        VLOG_ERR("set_nonblocking failed: %s", sock_strerror(error));
+        return error;
+    }
+    return 0;
+#endif
  }
  
  void
@@ -105,62 +115,19 @@ set_dscp(int fd, uint8_t dscp)
  
      val = dscp << 2;
      if (setsockopt(fd, IPPROTO_IP, IP_TOS, &val, sizeof val)) {
-        return errno;
+        return sock_errno();
      }
  
      return 0;
  }
  
-static bool
-rlim_is_finite(rlim_t limit)
-{
-    if (limit == RLIM_INFINITY) {
-        return false;
-    }
-
-#ifdef RLIM_SAVED_CUR           /* FreeBSD 8.0 lacks RLIM_SAVED_CUR. */
-    if (limit == RLIM_SAVED_CUR) {
-        return false;
-    }
-#endif
-
-#ifdef RLIM_SAVED_MAX           /* FreeBSD 8.0 lacks RLIM_SAVED_MAX. */
-    if (limit == RLIM_SAVED_MAX) {
-        return false;
-    }
-#endif
-
-    return true;
-}
-
-/* Returns the maximum valid FD value, plus 1. */
-int
-get_max_fds(void)
-{
-    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
-    static int max_fds;
-
-    if (ovsthread_once_start(&once)) {
-        struct rlimit r;
-        if (!getrlimit(RLIMIT_NOFILE, &r) && rlim_is_finite(r.rlim_cur)) {
-            max_fds = r.rlim_cur;
-        } else {
-            VLOG_WARN("failed to obtain fd limit, defaulting to 1024");
-            max_fds = 1024;
-        }
-        ovsthread_once_done(&once);
-    }
-
-    return max_fds;
-}
-
  /* Translates 'host_name', which must be a string representation of an IP
   * address, into a numeric IP address in '*addr'.  Returns 0 if successful,
   * otherwise a positive errno value. */
  int
  lookup_ip(const char *host_name, struct in_addr *addr)
  {
-    if (!inet_aton(host_name, addr)) {
+    if (!inet_pton(AF_INET, host_name, addr)) {
          static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
          VLOG_ERR_RL(&rl, "\"%s\" is not a valid IP address", host_name);
          return ENOENT;
@@ -197,7 +164,7 @@ lookup_hostname(const char *host_name, struct in_addr *addr)
      struct addrinfo *result;
      struct addrinfo hints;
  
-    if (inet_aton(host_name, addr)) {
+    if (inet_pton(AF_INET, host_name, addr)) {
          return 0;
      }
  
@@ -232,13 +199,15 @@ lookup_hostname(const char *host_name, struct in_addr *addr)
      case EAI_MEMORY:
          return ENOMEM;
  
-#ifdef EAI_NODATA
+#if defined (EAI_NODATA) && EAI_NODATA != EAI_NONAME
      case EAI_NODATA:
          return ENXIO;
  #endif
  
+#ifdef EAI_SYSTEM
      case EAI_SYSTEM:
-        return errno;
+        return sock_errno();
+#endif
  
      default:
          return EPROTO;
@@ -254,14 +223,19 @@ check_connection_completion(int fd)
  
      pfd.fd = fd;
      pfd.events = POLLOUT;
+
+#ifndef _WIN32
      do {
          retval = poll(&pfd, 1, 0);
      } while (retval < 0 && errno == EINTR);
+#else
+    retval = WSAPoll(&pfd, 1, 0);
+#endif
      if (retval == 1) {
          if (pfd.revents & POLLERR) {
-            ssize_t n = send(fd, "", 1, MSG_DONTWAIT);
+            ssize_t n = send(fd, "", 1, 0);
              if (n < 0) {
-                return errno;
+                return sock_errno();
              } else {
                  VLOG_ERR_RL(&rl, "poll return POLLERR but send succeeded");
                  return EPROTO;
@@ -269,13 +243,14 @@ check_connection_completion(int fd)
          }
          return 0;
      } else if (retval < 0) {
-        VLOG_ERR_RL(&rl, "poll: %s", ovs_strerror(errno));
+        VLOG_ERR_RL(&rl, "poll: %s", sock_strerror(sock_errno()));
          return errno;
      } else {
          return EAGAIN;
      }
  }
  
+#ifndef _WIN32
  /* Drain all the data currently in the receive queue of a datagram socket (and
   * possibly additional data).  There is no way to know how many packets are in
   * the receive queue, but we do know that the total number of bytes queued does
@@ -310,6 +285,7 @@ drain_rcvbuf(int fd)
      }
      return 0;
  }
+#endif
  
  /* Returns the size of socket 'sock''s receive buffer (SO_RCVBUF), or a
   * negative errno value if an error occurs. */
@@ -341,6 +317,7 @@ drain_fd(int fd, size_t n_packets)
      }
  }
  
+#ifndef _WIN32
  /* Attempts to shorten 'name' by opening a file descriptor for the directory
   * part of the name and indirecting through /proc/self/fd/<dirfd>/<basename>.
   * On systems with Linux-like /proc, this works as long as <basename> isn't too
@@ -601,6 +578,7 @@ get_unix_name_len(socklen_t sun_len)
              ? sun_len - offsetof(struct sockaddr_un, sun_path)
              : 0);
  }
+#endif /* _WIN32 */
  
  ovs_be32
  guess_netmask(ovs_be32 ip_)
@@ -761,8 +739,8 @@ inet_open_active(int style, const char *target, uint16_t default_port,
      /* Create non-blocking socket. */
      fd = socket(ss.ss_family, style, 0);
      if (fd < 0) {
-        VLOG_ERR("%s: socket: %s", target, ovs_strerror(errno));
-        error = errno;
+        error = sock_errno();
+        VLOG_ERR("%s: socket: %s", target, sock_strerror(error));
          goto exit;
      }
      error = set_nonblocking(fd);
@@ -775,15 +753,19 @@ inet_open_active(int style, const char *target, uint16_t default_port,
       * connect(), the handshake SYN frames will be sent with a TOS of 0. */
      error = set_dscp(fd, dscp);
      if (error) {
-        VLOG_ERR("%s: socket: %s", target, ovs_strerror(error));
+        VLOG_ERR("%s: socket: %s", target, sock_strerror(error));
          goto exit;
      }
  
      /* Connect. */
      error = connect(fd, (struct sockaddr *) &ss, ss_length(&ss)) == 0
                      ? 0
-                    : errno;
-    if (error == EINPROGRESS) {
+                    : sock_errno();
+    if (error == EINPROGRESS
+#ifdef _WIN32
+        || error == WSAEALREADY || error == WSAEWOULDBLOCK
+#endif
+        ) {
          error = EAGAIN;
      }
  
@@ -793,7 +775,7 @@ exit:
              memset(ssp, 0, sizeof *ssp);
          }
          if (fd >= 0) {
-            close(fd);
+            closesocket(fd);
              fd = -1;
          }
      } else {
@@ -880,8 +862,8 @@ inet_open_passive(int style, const char *target, int default_port,
      /* Create non-blocking socket, set SO_REUSEADDR. */
      fd = socket(ss.ss_family, style, 0);
      if (fd < 0) {
-        error = errno;
-        VLOG_ERR("%s: socket: %s", target, ovs_strerror(error));
+        error = sock_errno();
+        VLOG_ERR("%s: socket: %s", target, sock_strerror(error));
          return -error;
      }
      error = set_nonblocking(fd);
@@ -890,16 +872,16 @@ inet_open_passive(int style, const char *target, int default_port,
      }
      if (style == SOCK_STREAM
          && setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof yes) < 0) {
-        error = errno;
+        error = sock_errno();
          VLOG_ERR("%s: setsockopt(SO_REUSEADDR): %s",
-                 target, ovs_strerror(error));
+                 target, sock_strerror(error));
          goto error;
      }
  
      /* Bind. */
      if (bind(fd, (struct sockaddr *) &ss, ss_length(&ss)) < 0) {
-        error = errno;
-        VLOG_ERR("%s: bind: %s", target, ovs_strerror(error));
+        error = sock_errno();
+        VLOG_ERR("%s: bind: %s", target, sock_strerror(error));
          goto error;
      }
  
@@ -908,22 +890,22 @@ inet_open_passive(int style, const char *target, int default_port,
       * connect(), the handshake SYN frames will be sent with a TOS of 0. */
      error = set_dscp(fd, dscp);
      if (error) {
-        VLOG_ERR("%s: socket: %s", target, ovs_strerror(error));
+        VLOG_ERR("%s: socket: %s", target, sock_strerror(error));
          goto error;
      }
  
      /* Listen. */
      if (style == SOCK_STREAM && listen(fd, 10) < 0) {
-        error = errno;
-        VLOG_ERR("%s: listen: %s", target, ovs_strerror(error));
+        error = sock_errno();
+        VLOG_ERR("%s: listen: %s", target, sock_strerror(error));
          goto error;
      }
  
      if (ssp || kernel_chooses_port) {
          socklen_t ss_len = sizeof ss;
          if (getsockname(fd, (struct sockaddr *) &ss, &ss_len) < 0) {
-            error = errno;
-            VLOG_ERR("%s: getsockname: %s", target, ovs_strerror(error));
+            error = sock_errno();
+            VLOG_ERR("%s: getsockname: %s", target, sock_strerror(error));
              goto error;
          }
          if (kernel_chooses_port) {
@@ -941,32 +923,10 @@ error:
      if (ssp) {
          memset(ssp, 0, sizeof *ssp);
      }
-    close(fd);
+    closesocket(fd);
      return -error;
  }
  
-/* Returns a readable and writable fd for /dev/null, if successful, otherwise
- * a negative errno value.  The caller must not close the returned fd (because
- * the same fd will be handed out to subsequent callers). */
-int
-get_null_fd(void)
-{
-    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
-    static int null_fd;
-
-    if (ovsthread_once_start(&once)) {
-        null_fd = open("/dev/null", O_RDWR);
-        if (null_fd < 0) {
-            int error = errno;
-            VLOG_ERR("could not open /dev/null: %s", ovs_strerror(error));
-            null_fd = -error;
-        }
-        ovsthread_once_done(&once);
-    }
-
-    return null_fd;
-}
-
  int
  read_fully(int fd, void *p_, size_t size, size_t *bytes_read)
  {
@@ -1016,6 +976,7 @@ int
  fsync_parent_dir(const char *file_name)
  {
      int error = 0;
+#ifndef _WIN32
      char *dir;
      int fd;
  
@@ -1037,6 +998,7 @@ fsync_parent_dir(const char *file_name)
          VLOG_ERR("%s: open failed (%s)", dir, ovs_strerror(error));
      }
      free(dir);
+#endif
  
      return error;
  }
@@ -1094,8 +1056,8 @@ getsockopt_int(int fd, int level, int option, const char *optname, int *valuep)
  
      len = sizeof value;
      if (getsockopt(fd, level, option, &value, &len)) {
-        error = errno;
-        VLOG_ERR_RL(&rl, "getsockopt(%s): %s", optname, ovs_strerror(error));
+        error = sock_errno();
+        VLOG_ERR_RL(&rl, "getsockopt(%s): %s", optname, sock_strerror(error));
      } else if (len != sizeof value) {
          error = EINVAL;
          VLOG_ERR_RL(&rl, "getsockopt(%s): value is %u bytes (expected %"PRIuSIZE")",
@@ -1122,6 +1084,7 @@ describe_sockaddr(struct ds *string, int fd,
              ds_put_format(string, "%s:%"PRIu16,
                            ss_format_address(&ss, addrbuf, sizeof addrbuf),
                            ss_get_port(&ss));
+#ifndef _WIN32
          } else if (ss.ss_family == AF_UNIX) {
              struct sockaddr_un sun;
              const char *null;
@@ -1132,6 +1095,7 @@ describe_sockaddr(struct ds *string, int fd,
              null = memchr(sun.sun_path, '\0', maxlen);
              ds_put_buffer(string, sun.sun_path,
                            null ? null - sun.sun_path : maxlen);
+#endif
          }
  #ifdef HAVE_NETLINK
          else if (ss.ss_family == AF_NETLINK) {
@@ -1222,6 +1186,7 @@ describe_fd(int fd)
      struct stat s;
  
      ds_init(&string);
+#ifndef _WIN32
      if (fstat(fd, &s)) {
          ds_put_format(&string, "fstat failed (%s)", ovs_strerror(errno));
      } else if (S_ISSOCK(s.st_mode)) {
@@ -1241,9 +1206,13 @@ describe_fd(int fd)
          put_fd_filename(&string, fd);
  #endif
      }
+#else
+    ds_put_format(&string,"file descriptor");
+#endif /* _WIN32 */
      return ds_steal_cstr(&string);
  }
  
+#ifndef _WIN32
  /* Calls ioctl() on an AF_INET sock, passing the specified 'command' and
   * 'arg'.  Returns 0 if successful, otherwise a positive errno value. */
  int
@@ -1255,8 +1224,9 @@ af_inet_ioctl(unsigned long int command, const void *arg)
      if (ovsthread_once_start(&once)) {
          sock = socket(AF_INET, SOCK_DGRAM, 0);
          if (sock < 0) {
-            sock = -errno;
-            VLOG_ERR("failed to create inet socket: %s", ovs_strerror(errno));
+            int error = sock_errno();
+            VLOG_ERR("failed to create inet socket: %s", sock_strerror(error));
+            sock = -error;
          }
          ovsthread_once_done(&once);
      }
@@ -1281,6 +1251,7 @@ af_inet_ifreq_ioctl(const char *name, struct ifreq *ifr, unsigned long int cmd,
      }
      return error;
  }
+#endif
  \f
  /* sockaddr_storage helpers. */
  
@@ -1344,3 +1315,19 @@ ss_length(const struct sockaddr_storage *ss)
          OVS_NOT_REACHED();
      }
  }
+
+/* Returns a string that describes socket error number 'error'.  Windows
+ * socket calls do not set 'errno'; the caller must obtain the error number
+ * from WSAGetLastError() (see sock_errno()) and pass it to this function.
+ *
+ * ovs_strerror() calls strerror_r(), which does not yield the correct
+ * string for Windows socket errors, so it is used only on POSIX. */
+const char *
+sock_strerror(int error)
+{
+#ifdef _WIN32
+    return ovs_format_message(error);
+#else
+    return ovs_strerror(error);
+#endif
+}
diff --git a/lib/socket-util.h b/lib/socket-util.h

index b54d1a1..92f0c6f 100644 (file)
--- a/lib/socket-util.h
+++ b/lib/socket-util.h
@@ -17,6 +17,7 @@
  #ifndef SOCKET_UTIL_H
  #define SOCKET_UTIL_H 1
  
+#include <errno.h>
  #include <sys/types.h>
  #include <sys/socket.h>
  #include <sys/time.h>
@@ -30,8 +31,6 @@ int set_nonblocking(int fd);
  void xset_nonblocking(int fd);
  int set_dscp(int fd, uint8_t dscp);
  
-int get_max_fds(void);
-
  int lookup_ip(const char *host_name, struct in_addr *address);
  int lookup_ipv6(const char *host_name, struct in6_addr *address);
  
@@ -39,13 +38,16 @@ int lookup_hostname(const char *host_name, struct in_addr *);
  
  int get_socket_rcvbuf(int sock);
  int check_connection_completion(int fd);
+#ifndef _WIN32
  int drain_rcvbuf(int fd);
+#endif
  void drain_fd(int fd, size_t n_packets);
+#ifndef _WIN32
  int make_unix_socket(int style, bool nonblock,
                       const char *bind_path, const char *connect_path);
  int get_unix_name_len(socklen_t sun_len);
+#endif
  ovs_be32 guess_netmask(ovs_be32 ip);
-int get_null_fd(void);
  
  bool inet_parse_active(const char *target, uint16_t default_port,
                         struct sockaddr_storage *ssp);
@@ -73,11 +75,13 @@ char *describe_fd(int fd);
   * in <netinet/ip.h> is used. */
  #define DSCP_DEFAULT (IPTOS_PREC_INTERNETCONTROL >> 2)
  
+#ifndef _WIN32
  /* Helpers for calling ioctl() on an AF_INET socket. */
  struct ifreq;
  int af_inet_ioctl(unsigned long int command, const void *arg);
  int af_inet_ifreq_ioctl(const char *name, struct ifreq *,
                          unsigned long int cmd, const char *cmd_name);
+#endif
  
  /* Functions for working with sockaddr_storage that might contain an IPv4 or
   * IPv6 address. */
@@ -86,5 +90,40 @@ uint16_t ss_get_port(const struct sockaddr_storage *);
  char *ss_format_address(const struct sockaddr_storage *,
                          char *buf, size_t bufsize);
  size_t ss_length(const struct sockaddr_storage *);
+const char *sock_strerror(int error);
+
+#ifdef _WIN32
+/* Windows declares 'optval' as char *; wrap the calls to accept void *. */
+#define setsockopt(sock, level, optname, optval, optlen) \
+    rpl_setsockopt(sock, level, optname, optval, optlen)
+static inline int rpl_setsockopt(int sock, int level, int optname,
+                                 const void *optval, socklen_t optlen)
+{
+    return (setsockopt)(sock, level, optname, optval, optlen);
+}
+
+#define getsockopt(sock, level, optname, optval, optlen) \
+    rpl_getsockopt(sock, level, optname, optval, optlen)
+static inline int rpl_getsockopt(int sock, int level, int optname,
+                                 void *optval, socklen_t *optlen)
+{
+    return (getsockopt)(sock, level, optname, optval, optlen);
+}
+#endif
+
+/* Returns the last socket error number.  On Windows, socket calls do not
+ * set errno, so the value comes from WSAGetLastError() instead. */
+static inline int sock_errno(void)
+{
+#ifdef _WIN32
+    return WSAGetLastError();
+#else
+    return errno;
+#endif
+}
+
+#ifndef _WIN32
+#define closesocket close
+#endif
  
  #endif /* socket-util.h */
diff --git a/lib/stream-ssl.c b/lib/stream-ssl.c

index a6f1362..14d62c4 100644 (file)
--- a/lib/stream-ssl.c
+++ b/lib/stream-ssl.c
@@ -29,7 +29,7 @@
  #include <openssl/ssl.h>
  #include <openssl/x509v3.h>
  #include <poll.h>
-#include <sys/fcntl.h>
+#include <fcntl.h>
  #include <sys/stat.h>
  #include <unistd.h>
  #include "coverage.h"
@@ -47,6 +47,21 @@
  #include "timeval.h"
  #include "vlog.h"
  
+#ifdef _WIN32
+/* Ref: https://www.openssl.org/support/faq.html#PROG2
+ * Your application must link against the same version of the Win32 C-Runtime
+ * against which your openssl libraries were linked.  The default version for
+ * OpenSSL is /MD - "Multithreaded DLL". If we compile Open vSwitch with
+ * something other than /MD, instead of re-compiling the OpenSSL
+ * toolkit, openssl/applink.c can be #included.  Also, it is important
+ * to call CRYPTO_malloc_init() prior to the first call into OpenSSL.
+ *
+ * XXX: The behavior of the following #include when Open vSwitch is
+ * compiled with /MD is not tested. */
+#include <openssl/applink.c>
+#define SHUT_RDWR SD_BOTH
+#endif
+
  VLOG_DEFINE_THIS_MODULE(stream_ssl);
  
  /* Active SSL. */
@@ -67,6 +82,7 @@ struct ssl_stream
      enum ssl_state state;
      enum session_type type;
      int fd;
+    HANDLE wevent;
      SSL *ssl;
      struct ofpbuf *txbuf;
      unsigned int session_nr;
@@ -183,6 +199,8 @@ static void stream_ssl_set_ca_cert_file__(const char *file_name,
  static void ssl_protocol_cb(int write_p, int version, int content_type,
                              const void *, size_t, SSL *, void *sslv_);
  static bool update_ssl_config(struct ssl_config_file *, const char *file_name);
+static int sock_errno(void);
+static void clear_handle(int fd, HANDLE wevent);
  
  static short int
  want_to_poll_events(int want)
@@ -245,8 +263,9 @@ new_ssl_stream(const char *name, int fd, enum session_type type,
      /* Disable Nagle. */
      retval = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof on);
      if (retval) {
-        VLOG_ERR("%s: setsockopt(TCP_NODELAY): %s", name, ovs_strerror(errno));
-        retval = errno;
+        retval = sock_errno();
+        VLOG_ERR("%s: setsockopt(TCP_NODELAY): %s", name,
+                 sock_strerror(retval));
          goto error;
      }
  
@@ -272,6 +291,9 @@ new_ssl_stream(const char *name, int fd, enum session_type type,
      sslv->state = state;
      sslv->type = type;
      sslv->fd = fd;
+#ifdef _WIN32
+    sslv->wevent = CreateEvent(NULL, FALSE, FALSE, NULL);
+#endif
      sslv->ssl = ssl;
      sslv->txbuf = NULL;
      sslv->rx_want = sslv->tx_want = SSL_NOTHING;
@@ -290,7 +312,7 @@ error:
      if (ssl) {
          SSL_free(ssl);
      }
-    close(fd);
+    closesocket(fd);
      return retval;
  }
  
@@ -500,7 +522,8 @@ ssl_close(struct stream *stream)
      ERR_clear_error();
  
      SSL_free(sslv->ssl);
-    close(sslv->fd);
+    clear_handle(sslv->fd, sslv->wevent);
+    closesocket(sslv->fd);
      free(sslv);
  }
  
@@ -691,7 +714,8 @@ ssl_run_wait(struct stream *stream)
      struct ssl_stream *sslv = ssl_stream_cast(stream);
  
      if (sslv->tx_want != SSL_NOTHING) {
-        poll_fd_wait(sslv->fd, want_to_poll_events(sslv->tx_want));
+        poll_fd_wait_event(sslv->fd, sslv->wevent,
+                           want_to_poll_events(sslv->tx_want));
      }
  }
  
@@ -707,14 +731,14 @@ ssl_wait(struct stream *stream, enum stream_wait_type wait)
          } else {
              switch (sslv->state) {
              case STATE_TCP_CONNECTING:
-                poll_fd_wait(sslv->fd, POLLOUT);
+                poll_fd_wait_event(sslv->fd, sslv->wevent, POLLOUT);
                  break;
  
              case STATE_SSL_CONNECTING:
                  /* ssl_connect() called SSL_accept() or SSL_connect(), which
                   * set up the status that we test here. */
-                poll_fd_wait(sslv->fd,
-                             want_to_poll_events(SSL_want(sslv->ssl)));
+                poll_fd_wait_event(sslv->fd, sslv->wevent,
+                                   want_to_poll_events(SSL_want(sslv->ssl)));
                  break;
  
              default:
@@ -725,7 +749,8 @@ ssl_wait(struct stream *stream, enum stream_wait_type wait)
  
      case STREAM_RECV:
          if (sslv->rx_want != SSL_NOTHING) {
-            poll_fd_wait(sslv->fd, want_to_poll_events(sslv->rx_want));
+            poll_fd_wait_event(sslv->fd, sslv->wevent,
+                               want_to_poll_events(sslv->rx_want));
          } else {
              poll_immediate_wake();
          }
@@ -765,6 +790,7 @@ struct pssl_pstream
  {
      struct pstream pstream;
      int fd;
+    HANDLE wevent;
  };
  
  const struct pstream_class pssl_pstream_class;
@@ -806,6 +832,9 @@ pssl_open(const char *name OVS_UNUSED, char *suffix, struct pstream **pstreamp,
      pstream_init(&pssl->pstream, &pssl_pstream_class, bound_name);
      pstream_set_bound_port(&pssl->pstream, htons(port));
      pssl->fd = fd;
+#ifdef _WIN32
+    pssl->wevent = CreateEvent(NULL, FALSE, FALSE, NULL);
+#endif
      *pstreamp = &pssl->pstream;
      return 0;
  }
@@ -814,7 +843,8 @@ static void
  pssl_close(struct pstream *pstream)
  {
      struct pssl_pstream *pssl = pssl_pstream_cast(pstream);
-    close(pssl->fd);
+    clear_handle(pssl->fd, pssl->wevent);
+    closesocket(pssl->fd);
      free(pssl);
  }
  
@@ -831,16 +861,21 @@ pssl_accept(struct pstream *pstream, struct stream **new_streamp)
  
      new_fd = accept(pssl->fd, (struct sockaddr *) &ss, &ss_len);
      if (new_fd < 0) {
-        error = errno;
+        error = sock_errno();
+#ifdef _WIN32
+        if (error == WSAEWOULDBLOCK) {
+            error = EAGAIN;
+        }
+#endif
          if (error != EAGAIN) {
-            VLOG_DBG_RL(&rl, "accept: %s", ovs_strerror(error));
+            VLOG_DBG_RL(&rl, "accept: %s", sock_strerror(error));
          }
          return error;
      }
  
      error = set_nonblocking(new_fd);
      if (error) {
-        close(new_fd);
+        closesocket(new_fd);
          return error;
      }
  
@@ -855,7 +890,7 @@ static void
  pssl_wait(struct pstream *pstream)
  {
      struct pssl_pstream *pssl = pssl_pstream_cast(pstream);
-    poll_fd_wait(pssl->fd, POLLIN);
+    poll_fd_wait_event(pssl->fd, pssl->wevent, POLLIN);
  }
  
  static int
@@ -903,6 +938,10 @@ do_ssl_init(void)
  {
      SSL_METHOD *method;
  
+#ifdef _WIN32
+    /* The following call is needed if we "#include <openssl/applink.c>". */
+    CRYPTO_malloc_init();
+#endif
      SSL_library_init();
      SSL_load_error_strings();
  
@@ -1374,3 +1413,16 @@ ssl_protocol_cb(int write_p, int version OVS_UNUSED, int content_type,
  
      ds_destroy(&details);
  }
+
+static void
+clear_handle(int fd OVS_UNUSED, HANDLE wevent OVS_UNUSED)
+{
+#ifdef _WIN32
+    if (fd) {
+        WSAEventSelect(fd, NULL, 0);
+    }
+    if (wevent) {
+        CloseHandle(wevent);
+    }
+#endif
+}
diff --git a/lib/stream.c b/lib/stream.c

index b69f03c..2e7accb 100644 (file)
--- a/lib/stream.c
+++ b/lib/stream.c
@@ -51,7 +51,9 @@ enum stream_state {
  
  static const struct stream_class *stream_classes[] = {
      &tcp_stream_class,
+#ifdef AF_UNIX
      &unix_stream_class,
+#endif
  #ifdef HAVE_OPENSSL
      &ssl_stream_class,
  #endif
@@ -59,7 +61,9 @@ static const struct stream_class *stream_classes[] = {
  
  static const struct pstream_class *pstream_classes[] = {
      &ptcp_pstream_class,
+#ifdef AF_UNIX
      &punix_pstream_class,
+#endif
  #ifdef HAVE_OPENSSL
      &pssl_pstream_class,
  #endif
diff --git a/lib/string.h.in b/lib/string.h.in

index 9e8fba9..bbdaeb4 100644 (file)
--- a/lib/string.h.in
+++ b/lib/string.h.in
@@ -35,6 +35,7 @@
  #define strtok_r strtok_s
  #define strcasecmp _stricmp
  #define strncasecmp _strnicmp
+#define strerror_r(errnum, buf, buflen) strerror_s(buf, buflen, errnum)
  #endif
  
  #ifndef HAVE_STRNLEN
diff --git a/lib/util.c b/lib/util.c

index 845f86c..911cc3e 100644 (file)
--- a/lib/util.c
+++ b/lib/util.c
@@ -50,11 +50,13 @@ DEFINE_PER_THREAD_MALLOCED_DATA(char *, subprogram_name);
  /* --version option output. */
  static char *program_version;
  
-/* Buffer used by ovs_strerror(). */
+/* Buffer used by ovs_strerror() and ovs_format_message(). */
  DEFINE_STATIC_PER_THREAD_DATA(struct { char s[128]; },
                                strerror_buffer,
                                { "" });
  
+static char *xreadlink(const char *filename);
+
  void
  ovs_assert_failure(const char *where, const char *function,
                     const char *condition)
@@ -323,6 +325,10 @@ ovs_retval_to_string(int retval)
              : ovs_strerror(retval));
  }
  
+/* This function returns the string describing the error number in 'error'
+ * for POSIX platforms.  For Windows, this function can be used for C library
+ * calls.  For socket calls that are also used in Windows, use sock_strerror()
+ * instead.  For WINAPI calls, look at ovs_lasterror_to_string(). */
  const char *
  ovs_strerror(int error)
  {
@@ -741,7 +747,7 @@ abs_file_name(const char *dir, const char *file_name)
  /* Like readlink(), but returns the link name as a null-terminated string in
   * allocated memory that the caller must eventually free (with free()).
   * Returns NULL on error, in which case errno is set appropriately. */
-char *
+static char *
  xreadlink(const char *filename)
  {
      size_t size;
@@ -773,10 +779,14 @@ xreadlink(const char *filename)
   *
   *     - Only symlinks in the final component of 'filename' are dereferenced.
   *
+ * On Windows, symlinks are not followed: this function simply returns a
+ * copy of 'filename'.
+ *
   * The caller must eventually free the returned string (with free()). */
  char *
  follow_symlinks(const char *filename)
  {
+#ifndef _WIN32
      struct stat s;
      char *fn;
      int i;
@@ -821,6 +831,7 @@ follow_symlinks(const char *filename)
  
      VLOG_WARN("%s: too many levels of symlinks", filename);
      free(fn);
+#endif
      return xstrdup(filename);
  }
  
@@ -1655,17 +1666,22 @@ exit:
  
  #ifdef _WIN32
  \f
-/* Calls FormatMessage() with GetLastError() as an argument. Returns
- * pointer to a buffer that receives the null-terminated string that specifies
- * the formatted message and that has to be freed by the caller with
- * LocalFree(). */
  char *
-ovs_lasterror_to_string(void)
+ovs_format_message(int error)
  {
-    char *buffer;
-    FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM
-                  | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, GetLastError(), 0,
-                  (char *)&buffer, 0, NULL);
+    enum { BUFSIZE = sizeof strerror_buffer_get()->s };
+    char *buffer = strerror_buffer_get()->s;
+
+    FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                  NULL, error, 0, buffer, BUFSIZE, NULL);
      return buffer;
  }
+
+/* Returns a null-terminated string that explains the last error.
+ * Use this function to get the error string for WINAPI calls. */
+char *
+ovs_lasterror_to_string(void)
+{
+    return ovs_format_message(GetLastError());
+}
  #endif
diff --git a/lib/util.h b/lib/util.h

index 2e3d1da..9afe10e 100644 (file)
--- a/lib/util.h
+++ b/lib/util.h
@@ -310,7 +310,6 @@ char *dir_name(const char *file_name);
  char *base_name(const char *file_name);
  char *abs_file_name(const char *dir, const char *file_name);
  
-char *xreadlink(const char *filename);
  char *follow_symlinks(const char *filename);
  
  void ignore(bool x OVS_UNUSED);
@@ -492,6 +491,7 @@ uint64_t bitwise_get(const void *src, unsigned int src_len,
  
  #ifdef _WIN32
  \f
+char *ovs_format_message(int error);
  char *ovs_lasterror_to_string(void);
  #endif
  
diff --git a/ofproto/bond.c b/ofproto/bond.c

index b4d9487..c4cfa45 100644 (file)
--- a/ofproto/bond.c
+++ b/ofproto/bond.c
@@ -43,6 +43,10 @@
  
  VLOG_DEFINE_THIS_MODULE(bond);
  
+static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
+static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
+static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
+
  /* Bit-mask for hashing a flow down to a bucket.
   * There are (BOND_MASK + 1) buckets. */
  #define BOND_MASK 0xff
@@ -58,6 +62,7 @@ struct bond_entry {
  /* A bond slave, that is, one of the links comprising a bond. */
  struct bond_slave {
      struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
+    struct list list_node;      /* In struct bond's enabled_slaves list. */
      struct bond *bond;          /* The bond that contains this slave. */
      void *aux;                  /* Client-provided handle for this slave. */
  
@@ -85,6 +90,14 @@ struct bond {
      /* Slaves. */
      struct hmap slaves;
  
+    /* Enabled slaves.
+     *
+     * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
+     * (To prevent the bond_slave from disappearing they must also hold
+     * 'rwlock'.) */
+    struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
+    struct list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
+
      /* Bonding info. */
      enum bond_mode balance;     /* Balancing mode, one of BM_*. */
      struct bond_slave *active_slave;
@@ -106,10 +119,6 @@ struct bond {
      struct ovs_refcount ref_cnt;
  };
  
-static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
-static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
-static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
-
  static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
  static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
      OVS_REQ_RDLOCK(rwlock);
@@ -127,6 +136,8 @@ static struct bond_entry *lookup_bond_entry(const struct bond *,
                                              const struct flow *,
                                              uint16_t vlan)
      OVS_REQ_RDLOCK(rwlock);
+static struct bond_slave *get_enabled_slave(struct bond *)
+    OVS_REQ_RDLOCK(rwlock);
  static struct bond_slave *choose_output_slave(const struct bond *,
                                                const struct flow *,
                                                struct flow_wildcards *,
@@ -180,6 +191,8 @@ bond_create(const struct bond_settings *s)
  
      bond = xzalloc(sizeof *bond);
      hmap_init(&bond->slaves);
+    list_init(&bond->enabled_slaves);
+    ovs_mutex_init(&bond->mutex);
      bond->next_fake_iface_update = LLONG_MAX;
      ovs_refcount_init(&bond->ref_cnt);
  
@@ -220,6 +233,7 @@ bond_unref(struct bond *bond)
      }
      hmap_destroy(&bond->slaves);
  
+    ovs_mutex_destroy(&bond->mutex);
      free(bond->hash);
      free(bond->name);
      ovs_refcount_destroy(&bond->ref_cnt);
@@ -1339,6 +1353,15 @@ bond_enable_slave(struct bond_slave *slave, bool enable)
      if (enable != slave->enabled) {
          slave->bond->bond_revalidate = true;
          slave->enabled = enable;
+
+        ovs_mutex_lock(&slave->bond->mutex);
+        if (enable) {
+            list_insert(&slave->bond->enabled_slaves, &slave->list_node);
+        } else {
+            list_remove(&slave->list_node);
+        }
+        ovs_mutex_unlock(&slave->bond->mutex);
+
          VLOG_INFO("interface %s: %s", slave->name,
                    slave->enabled ? "enabled" : "disabled");
      }
@@ -1414,6 +1437,27 @@ lookup_bond_entry(const struct bond *bond, const struct flow *flow,
      return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
  }
  
+/* Returns the slave at the front of 'bond''s 'enabled_slaves' list and moves
+ * it to the back, so that successive calls rotate round-robin through the
+ * enabled slaves, under 'bond->mutex'.  Returns NULL if the list is empty. */
+static struct bond_slave *
+get_enabled_slave(struct bond *bond)
+{
+    struct list *node;
+
+    ovs_mutex_lock(&bond->mutex);
+    if (list_is_empty(&bond->enabled_slaves)) {
+        ovs_mutex_unlock(&bond->mutex);
+        return NULL;
+    }
+
+    node = list_pop_front(&bond->enabled_slaves);
+    list_push_back(&bond->enabled_slaves, node);
+    ovs_mutex_unlock(&bond->mutex);
+
+    return CONTAINER_OF(node, struct bond_slave, list_node);
+}
+
  static struct bond_slave *
  choose_output_slave(const struct bond *bond, const struct flow *flow,
                      struct flow_wildcards *wc, uint16_t vlan)
@@ -1451,11 +1495,7 @@ choose_output_slave(const struct bond *bond, const struct flow *flow,
          }
          e = lookup_bond_entry(bond, flow, vlan);
          if (!e->slave || !e->slave->enabled) {
-            e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
-                                    struct bond_slave, hmap_node);
-            if (!e->slave->enabled) {
-                e->slave = bond->active_slave;
-            }
+            e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
          }
          return e->slave;
  
diff --git a/ofproto/ofproto-dpif-ipfix.c b/ofproto/ofproto-dpif-ipfix.c

index 2500efd..a529884 100644 (file)
--- a/ofproto/ofproto-dpif-ipfix.c
+++ b/ofproto/ofproto-dpif-ipfix.c
@@ -117,7 +117,8 @@ enum ipfix_proto_l3 {
  };
  enum ipfix_proto_l4 {
      IPFIX_PROTO_L4_UNKNOWN = 0,
-    IPFIX_PROTO_L4_TCP_UDP,
+    IPFIX_PROTO_L4_TCP_UDP_SCTP,
+    IPFIX_PROTO_L4_ICMP,
      NUM_IPFIX_PROTO_L4
  };
  
@@ -200,13 +201,21 @@ struct ipfix_data_record_flow_key_ipv6 {
  });
  BUILD_ASSERT_DECL(sizeof(struct ipfix_data_record_flow_key_ipv6) == 36);
  
-/* Part of data record flow key for TCP/UDP entities. */
+/* Part of data record flow key for TCP/UDP/SCTP entities. */
  OVS_PACKED(
-struct ipfix_data_record_flow_key_tcpudp {
+struct ipfix_data_record_flow_key_transport {
      ovs_be16 source_transport_port;  /* SOURCE_TRANSPORT_PORT */
      ovs_be16 destination_transport_port;  /* DESTINATION_TRANSPORT_PORT */
  });
-BUILD_ASSERT_DECL(sizeof(struct ipfix_data_record_flow_key_tcpudp) == 4);
+BUILD_ASSERT_DECL(sizeof(struct ipfix_data_record_flow_key_transport) == 4);
+
+/* Part of data record flow key for ICMP entities. */
+OVS_PACKED(
+struct ipfix_data_record_flow_key_icmp {
+    uint8_t icmp_type;  /* ICMP_TYPE_IPV4 / ICMP_TYPE_IPV6 */
+    uint8_t icmp_code;  /* ICMP_CODE_IPV4 / ICMP_CODE_IPV6 */
+});
+BUILD_ASSERT_DECL(sizeof(struct ipfix_data_record_flow_key_icmp) == 2);
  
  /* Cf. IETF RFC 5102 Section 5.11.3. */
  enum ipfix_flow_end_reason {
@@ -231,18 +240,21 @@ BUILD_ASSERT_DECL(sizeof(struct ipfix_data_record_aggregated_common) == 25);
  /* Part of data record for IP aggregated elements. */
  OVS_PACKED(
  struct ipfix_data_record_aggregated_ip {
+    ovs_be64 octet_delta_count;  /* OCTET_DELTA_COUNT */
      ovs_be64 octet_delta_sum_of_squares;  /* OCTET_DELTA_SUM_OF_SQUARES */
      ovs_be64 minimum_ip_total_length;  /* MINIMUM_IP_TOTAL_LENGTH */
      ovs_be64 maximum_ip_total_length;  /* MAXIMUM_IP_TOTAL_LENGTH */
  });
-BUILD_ASSERT_DECL(sizeof(struct ipfix_data_record_aggregated_ip) == 24);
+BUILD_ASSERT_DECL(sizeof(struct ipfix_data_record_aggregated_ip) == 32);
  
-#define MAX_FLOW_KEY_LEN                                 \
-    (sizeof(struct ipfix_data_record_flow_key_common)    \
-     + sizeof(struct ipfix_data_record_flow_key_vlan)    \
-     + sizeof(struct ipfix_data_record_flow_key_ip)      \
-     + sizeof(struct ipfix_data_record_flow_key_ipv6)    \
-     + sizeof(struct ipfix_data_record_flow_key_tcpudp))
+#define MAX_FLOW_KEY_LEN                                        \
+    (sizeof(struct ipfix_data_record_flow_key_common)           \
+     + sizeof(struct ipfix_data_record_flow_key_vlan)           \
+     + sizeof(struct ipfix_data_record_flow_key_ip)             \
+     + MAX(sizeof(struct ipfix_data_record_flow_key_ipv4),      \
+           sizeof(struct ipfix_data_record_flow_key_ipv6))      \
+     + MAX(sizeof(struct ipfix_data_record_flow_key_icmp),      \
+           sizeof(struct ipfix_data_record_flow_key_transport)))
  
  #define MAX_DATA_RECORD_LEN                                 \
      (MAX_FLOW_KEY_LEN                                       \
@@ -280,6 +292,7 @@ struct ipfix_flow_cache_entry {
      uint64_t flow_end_timestamp_usec;
      uint64_t packet_delta_count;
      uint64_t layer2_octet_delta_count;
+    uint64_t octet_delta_count;
      uint64_t octet_delta_sum_of_squares;  /* 0 if not IP. */
      uint16_t minimum_ip_total_length;  /* 0 if not IP. */
      uint16_t maximum_ip_total_length;  /* 0 if not IP. */
@@ -781,18 +794,27 @@ ipfix_define_template_fields(enum ipfix_proto_l2 l2, enum ipfix_proto_l3 l3,
          if (l3 == IPFIX_PROTO_L3_IPV4) {
              DEF(SOURCE_IPV4_ADDRESS);
              DEF(DESTINATION_IPV4_ADDRESS);
+            if (l4 == IPFIX_PROTO_L4_TCP_UDP_SCTP) {
+                DEF(SOURCE_TRANSPORT_PORT);
+                DEF(DESTINATION_TRANSPORT_PORT);
+            } else if (l4 == IPFIX_PROTO_L4_ICMP) {
+                DEF(ICMP_TYPE_IPV4);
+                DEF(ICMP_CODE_IPV4);
+            }
          } else {  /* l3 == IPFIX_PROTO_L3_IPV6 */
              DEF(SOURCE_IPV6_ADDRESS);
              DEF(DESTINATION_IPV6_ADDRESS);
              DEF(FLOW_LABEL_IPV6);
+            if (l4 == IPFIX_PROTO_L4_TCP_UDP_SCTP) {
+                DEF(SOURCE_TRANSPORT_PORT);
+                DEF(DESTINATION_TRANSPORT_PORT);
+            } else if (l4 == IPFIX_PROTO_L4_ICMP) {
+                DEF(ICMP_TYPE_IPV6);
+                DEF(ICMP_CODE_IPV6);
+            }
          }
      }
  
-    if (l4 != IPFIX_PROTO_L4_UNKNOWN) {
-        DEF(SOURCE_TRANSPORT_PORT);
-        DEF(DESTINATION_TRANSPORT_PORT);
-    }
-
      /* 2. Flow aggregated data. */
  
      DEF(FLOW_START_DELTA_MICROSECONDS);
@@ -802,6 +824,7 @@ ipfix_define_template_fields(enum ipfix_proto_l2 l2, enum ipfix_proto_l3 l3,
      DEF(FLOW_END_REASON);
  
      if (l3 != IPFIX_PROTO_L3_UNKNOWN) {
+        DEF(OCTET_DELTA_COUNT);
          DEF(OCTET_DELTA_SUM_OF_SQUARES);
          DEF(MINIMUM_IP_TOTAL_LENGTH);
          DEF(MAXIMUM_IP_TOTAL_LENGTH);
@@ -946,6 +969,7 @@ ipfix_cache_aggregate_entries(struct ipfix_flow_cache_entry *from_entry,
      to_entry->packet_delta_count += from_entry->packet_delta_count;
      to_entry->layer2_octet_delta_count += from_entry->layer2_octet_delta_count;
  
+    to_entry->octet_delta_count += from_entry->octet_delta_count;
      to_entry->octet_delta_sum_of_squares +=
          from_entry->octet_delta_sum_of_squares;
  
@@ -1019,22 +1043,37 @@ ipfix_cache_entry_init(struct ipfix_flow_cache_entry *entry,
      switch(ntohs(flow->dl_type)) {
      case ETH_TYPE_IP:
          l3 = IPFIX_PROTO_L3_IPV4;
+        switch(flow->nw_proto) {
+        case IPPROTO_TCP:
+        case IPPROTO_UDP:
+        case IPPROTO_SCTP:
+            l4 = IPFIX_PROTO_L4_TCP_UDP_SCTP;
+            break;
+        case IPPROTO_ICMP:
+            l4 = IPFIX_PROTO_L4_ICMP;
+            break;
+        default:
+            l4 = IPFIX_PROTO_L4_UNKNOWN;
+        }
          break;
      case ETH_TYPE_IPV6:
          l3 = IPFIX_PROTO_L3_IPV6;
-        break;
-    default:
-        l3 = IPFIX_PROTO_L3_UNKNOWN;
-    }
-
-    l4 = IPFIX_PROTO_L4_UNKNOWN;
-    if (l3 != IPFIX_PROTO_L3_UNKNOWN) {
          switch(flow->nw_proto) {
-        case IPPROTO_TCP:  /* TCP */
-        case IPPROTO_UDP:  /* UDP */
-            l4 = IPFIX_PROTO_L4_TCP_UDP;
+        case IPPROTO_TCP:
+        case IPPROTO_UDP:
+        case IPPROTO_SCTP:
+            l4 = IPFIX_PROTO_L4_TCP_UDP_SCTP;
+            break;
+        case IPPROTO_ICMPV6:
+            l4 = IPFIX_PROTO_L4_ICMP;
              break;
+        default:
+            l4 = IPFIX_PROTO_L4_UNKNOWN;
          }
+        break;
+    default:
+        l3 = IPFIX_PROTO_L3_UNKNOWN;
+        l4 = IPFIX_PROTO_L4_UNKNOWN;
      }
  
      flow_key->obs_domain_id = obs_domain_id;
@@ -1086,6 +1125,7 @@ ipfix_cache_entry_init(struct ipfix_flow_cache_entry *entry,
  
          if (l3 == IPFIX_PROTO_L3_IPV4) {
              struct ipfix_data_record_flow_key_ipv4 *data_ipv4;
+
              data_ipv4 = ofpbuf_put_zeros(&msg, sizeof *data_ipv4);
              data_ipv4->source_ipv4_address = flow->nw_src;
              data_ipv4->destination_ipv4_address = flow->nw_dst;
@@ -1101,12 +1141,18 @@ ipfix_cache_entry_init(struct ipfix_flow_cache_entry *entry,
          }
      }
  
-    if (l4 != IPFIX_PROTO_L4_UNKNOWN) {
-        struct ipfix_data_record_flow_key_tcpudp *data_tcpudp;
+    if (l4 == IPFIX_PROTO_L4_TCP_UDP_SCTP) {
+        struct ipfix_data_record_flow_key_transport *data_transport;
+
+        data_transport = ofpbuf_put_zeros(&msg, sizeof *data_transport);
+        data_transport->source_transport_port = flow->tp_src;
+        data_transport->destination_transport_port = flow->tp_dst;
+    } else if (l4 == IPFIX_PROTO_L4_ICMP) {
+        struct ipfix_data_record_flow_key_icmp *data_icmp;
  
-        data_tcpudp = ofpbuf_put_zeros(&msg, sizeof *data_tcpudp);
-        data_tcpudp->source_transport_port = flow->tp_src;
-        data_tcpudp->destination_transport_port = flow->tp_dst;
+        data_icmp = ofpbuf_put_zeros(&msg, sizeof *data_icmp);
+        data_icmp->icmp_type = ntohs(flow->tp_src) & 0xff;
+        data_icmp->icmp_code = ntohs(flow->tp_dst) & 0xff;
      }
  
      flow_key->flow_key_msg_part_size = msg.size;
@@ -1130,9 +1176,15 @@ ipfix_cache_entry_init(struct ipfix_flow_cache_entry *entry,
      if (l3 != IPFIX_PROTO_L3_UNKNOWN) {
          uint16_t ip_total_length =
              ethernet_total_length - ethernet_header_length;
+        uint64_t octet_delta_count;
+
+        /* Calculate the total matched octet count, assuming as an
+         * approximation that all matched packets have the same
+         * length. */
+        octet_delta_count = packet_delta_count * ip_total_length;
  
-        entry->octet_delta_sum_of_squares =
-            packet_delta_count * ip_total_length * ip_total_length;
+        entry->octet_delta_count = octet_delta_count;
+        entry->octet_delta_sum_of_squares = octet_delta_count * ip_total_length;
          entry->minimum_ip_total_length = ip_total_length;
          entry->maximum_ip_total_length = ip_total_length;
      } else {
@@ -1198,6 +1250,8 @@ ipfix_put_data_set(uint32_t export_time_sec,
  
          data_aggregated_ip = ofpbuf_put_zeros(
              msg, sizeof *data_aggregated_ip);
+        data_aggregated_ip->octet_delta_count = htonll(
+            entry->octet_delta_count);
          data_aggregated_ip->octet_delta_sum_of_squares = htonll(
              entry->octet_delta_sum_of_squares);
          data_aggregated_ip->minimum_ip_total_length = htonll(
diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c

index 489012a..cad1310 100644 (file)
--- a/ofproto/ofproto-dpif-upcall.c
+++ b/ofproto/ofproto-dpif-upcall.c
@@ -236,6 +236,8 @@ static void upcall_unixctl_disable_megaflows(struct unixctl_conn *, int argc,
                                               const char *argv[], void *aux);
  static void upcall_unixctl_enable_megaflows(struct unixctl_conn *, int argc,
                                              const char *argv[], void *aux);
+static void upcall_unixctl_set_flow_limit(struct unixctl_conn *conn, int argc,
+                                            const char *argv[], void *aux);
  static void ukey_delete(struct revalidator *, struct udpif_key *);
  
  static atomic_bool enable_megaflows = ATOMIC_VAR_INIT(true);
@@ -253,6 +255,8 @@ udpif_create(struct dpif_backer *backer, struct dpif *dpif)
                                   upcall_unixctl_disable_megaflows, NULL);
          unixctl_command_register("upcall/enable-megaflows", "", 0, 0,
                                   upcall_unixctl_enable_megaflows, NULL);
+        unixctl_command_register("upcall/set-flow-limit", "", 1, 1,
+                                 upcall_unixctl_set_flow_limit, NULL);
          ovsthread_once_done(&once);
      }
  
@@ -746,16 +750,16 @@ classify_upcall(const struct upcall *upcall)
      }
      memset(&cookie, 0, sizeof cookie);
      memcpy(&cookie, nl_attr_get(dpif_upcall->userdata), userdata_len);
-    if (userdata_len == sizeof cookie.sflow
+    if (userdata_len == MAX(8, sizeof cookie.sflow)
          && cookie.type == USER_ACTION_COOKIE_SFLOW) {
          return SFLOW_UPCALL;
-    } else if (userdata_len == sizeof cookie.slow_path
+    } else if (userdata_len == MAX(8, sizeof cookie.slow_path)
                 && cookie.type == USER_ACTION_COOKIE_SLOW_PATH) {
          return MISS_UPCALL;
-    } else if (userdata_len == sizeof cookie.flow_sample
+    } else if (userdata_len == MAX(8, sizeof cookie.flow_sample)
                 && cookie.type == USER_ACTION_COOKIE_FLOW_SAMPLE) {
          return FLOW_SAMPLE_UPCALL;
-    } else if (userdata_len == sizeof cookie.ipfix
+    } else if (userdata_len == MAX(8, sizeof cookie.ipfix)
                 && cookie.type == USER_ACTION_COOKIE_IPFIX) {
          return IPFIX_UPCALL;
      } else {
@@ -1581,3 +1585,25 @@ upcall_unixctl_enable_megaflows(struct unixctl_conn *conn,
      udpif_flush();
      unixctl_command_reply(conn, "megaflows enabled");
  }
+
+/* Sets the flow limit of every udpif in 'all_udpifs' to 'argv[1]', converted
+ * with atoi(), and replies on 'conn' with the value that was set.
+ * This command is only needed for advanced debugging, so it's not
+ * documented in the man page. */
+static void
+upcall_unixctl_set_flow_limit(struct unixctl_conn *conn,
+                              int argc OVS_UNUSED,
+                              const char *argv[],
+                              void *aux OVS_UNUSED)
+{
+    struct ds ds = DS_EMPTY_INITIALIZER;
+    struct udpif *udpif;
+    unsigned int flow_limit = atoi(argv[1]);
+
+    LIST_FOR_EACH (udpif, list_node, &all_udpifs) {
+        atomic_store(&udpif->flow_limit, flow_limit);
+    }
+    ds_put_format(&ds, "set flow_limit to %u\n", flow_limit);
+    unixctl_command_reply(conn, ds_cstr(&ds));
+    ds_destroy(&ds);
+}
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c

index a880376..89d92af 100644 (file)
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -178,6 +178,7 @@ struct xlate_ctx {
      /* Resubmit statistics, via xlate_table_action(). */
      int recurse;                /* Current resubmit nesting depth. */
      int resubmits;              /* Total number of resubmits. */
+    bool in_group;              /* Currently translating ofgroup, if true. */
  
      uint32_t orig_skb_priority; /* Priority when packet arrived. */
      uint8_t table_id;           /* OpenFlow table ID where flow was found. */
@@ -1780,19 +1781,18 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port,
                                   &ctx->xout->odp_actions);
          flow->tunnel = flow_tnl; /* Restore tunnel metadata */
      } else {
-        ofp_port_t vlandev_port;
-
          odp_port = xport->odp_port;
+        out_port = odp_port;
          if (ofproto_has_vlan_splinters(ctx->xbridge->ofproto)) {
+            ofp_port_t vlandev_port;
+
              wc->masks.vlan_tci |= htons(VLAN_VID_MASK | VLAN_CFI);
-        }
-        vlandev_port = vsp_realdev_to_vlandev(ctx->xbridge->ofproto, ofp_port,
-                                              flow->vlan_tci);
-        if (vlandev_port == ofp_port) {
-            out_port = odp_port;
-        } else {
-            out_port = ofp_port_to_odp_port(ctx->xbridge, vlandev_port);
-            flow->vlan_tci = htons(0);
+            vlandev_port = vsp_realdev_to_vlandev(ctx->xbridge->ofproto,
+                                                  ofp_port, flow->vlan_tci);
+            if (vlandev_port != ofp_port) {
+                out_port = ofp_port_to_odp_port(ctx->xbridge, vlandev_port);
+                flow->vlan_tci = htons(0);
+            }
          }
      }
  
@@ -1981,6 +1981,8 @@ xlate_select_group(struct xlate_ctx *ctx, struct group_dpif *group)
  static void
  xlate_group_action__(struct xlate_ctx *ctx, struct group_dpif *group)
  {
+    ctx->in_group = true;
+
      switch (group_dpif_get_type(group)) {
      case OFPGT11_ALL:
      case OFPGT11_INDIRECT:
@@ -1996,12 +1998,38 @@ xlate_group_action__(struct xlate_ctx *ctx, struct group_dpif *group)
          OVS_NOT_REACHED();
      }
      group_dpif_release(group);
+
+    ctx->in_group = false;
+}
+
+/* Returns true if 'ctx' may translate a group now, false otherwise. */
+static bool
+xlate_group_resource_check(struct xlate_ctx *ctx)
+{
+    if (!xlate_resubmit_resource_check(ctx)) {
+        return false;
+    } else if (ctx->in_group) {
+        /* Prevent nested translation of OpenFlow groups.  OpenFlow allows
+         * this restriction.  We enforce it only because, with the current
+         * architecture, we would otherwise have to take a possibly recursive
+         * read lock on the ofgroup rwlock, which is unsafe given that POSIX
+         * allows taking a read lock to block if there is a thread blocked on
+         * taking the write lock.  Other solutions without this restriction
+         * are also possible, but seem unwarranted given the current limited
+         * use of groups. */
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
+
+        VLOG_ERR_RL(&rl, "cannot recursively translate OpenFlow group");
+        return false;
+    } else {
+        return true;
+    }
  }
  
  static bool
  xlate_group_action(struct xlate_ctx *ctx, uint32_t group_id)
  {
-    if (xlate_resubmit_resource_check(ctx)) {
+    if (xlate_group_resource_check(ctx)) {
          struct group_dpif *group;
          bool got_group;
  
@@ -2911,6 +2939,7 @@ xlate_actions__(struct xlate_in *xin, struct xlate_out *xout)
      struct xlate_ctx ctx;
      size_t ofpacts_len;
      bool tnl_may_send;
+    bool is_icmp;
  
      COVERAGE_INC(xlate_actions);
  
@@ -2965,6 +2994,7 @@ xlate_actions__(struct xlate_in *xin, struct xlate_out *xout)
      if (is_ip_any(flow)) {
          wc->masks.nw_frag |= FLOW_NW_FRAG_MASK;
      }
+    is_icmp = is_icmpv4(flow) || is_icmpv6(flow);
  
      tnl_may_send = tnl_xlate_init(&ctx.base_flow, flow, wc);
      if (ctx.xbridge->netflow) {
@@ -2973,6 +3003,7 @@ xlate_actions__(struct xlate_in *xin, struct xlate_out *xout)
  
      ctx.recurse = 0;
      ctx.resubmits = 0;
+    ctx.in_group = false;
      ctx.orig_skb_priority = flow->skb_priority;
      ctx.table_id = 0;
      ctx.exit = false;
@@ -3124,6 +3155,21 @@ xlate_actions__(struct xlate_in *xin, struct xlate_out *xout)
       * use non-header fields as part of the cache. */
      flow_wildcards_clear_non_packet_fields(wc);
  
+    /* ICMPv4 and ICMPv6 have 8-bit "type" and "code" fields.  struct flow uses
+     * the low 8 bits of the 16-bit tp_src and tp_dst members to represent
+     * these fields.  The datapath interface, on the other hand, represents
+     * them with just 8 bits each.  This means that if the high 8 bits of the
+     * masks for these fields somehow become set, then they will get chopped
+     * off by a round trip through the datapath, and revalidation will spot
+     * that as an inconsistency and delete the flow.  Avoid the problem here by
+     * making sure that only the low 8 bits of either field can be unwildcarded
+     * for ICMP.
+     */
+    if (is_icmp) {
+        wc->masks.tp_src &= htons(UINT8_MAX);
+        wc->masks.tp_dst &= htons(UINT8_MAX);
+    }
+
  out:
      rule_actions_unref(actions);
      rule_dpif_unref(rule);
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c

index 7b3e1eb..64e2747 100644 (file)
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -89,9 +89,11 @@ struct rule_dpif {
      struct ovs_mutex stats_mutex;
      uint64_t packet_count OVS_GUARDED;  /* Number of packets received. */
      uint64_t byte_count OVS_GUARDED;    /* Number of bytes received. */
+    long long int used;                 /* Last used time (msec). */
  };
  
-static void rule_get_stats(struct rule *, uint64_t *packets, uint64_t *bytes);
+static void rule_get_stats(struct rule *, uint64_t *packets, uint64_t *bytes,
+                           long long int *used);
  static struct rule_dpif *rule_dpif_cast(const struct rule *);
  static void rule_expire(struct rule_dpif *);
  
@@ -1047,7 +1049,7 @@ construct(struct ofproto *ofproto_)
      ofproto->mbridge = mbridge_create();
      ofproto->has_bonded_bundles = false;
      ofproto->lacp_enabled = false;
-    ovs_mutex_init(&ofproto->stats_mutex);
+    ovs_mutex_init_adaptive(&ofproto->stats_mutex);
      ovs_mutex_init(&ofproto->vsp_mutex);
  
      guarded_list_init(&ofproto->pins);
@@ -1402,14 +1404,16 @@ get_tables(struct ofproto *ofproto_, struct ofp12_table_stats *ots)
      struct dpif_dp_stats s;
      uint64_t n_miss, n_no_pkt_in, n_bytes, n_dropped_frags;
      uint64_t n_lookup;
+    long long int used;
  
      strcpy(ots->name, "classifier");
  
      dpif_get_dp_stats(ofproto->backer->dpif, &s);
-    rule_get_stats(&ofproto->miss_rule->up, &n_miss, &n_bytes);
-    rule_get_stats(&ofproto->no_packet_in_rule->up, &n_no_pkt_in, &n_bytes);
-    rule_get_stats(&ofproto->drop_frags_rule->up, &n_dropped_frags, &n_bytes);
-
+    rule_get_stats(&ofproto->miss_rule->up, &n_miss, &n_bytes, &used);
+    rule_get_stats(&ofproto->no_packet_in_rule->up, &n_no_pkt_in, &n_bytes,
+                   &used);
+    rule_get_stats(&ofproto->drop_frags_rule->up, &n_dropped_frags, &n_bytes,
+                   &used);
      n_lookup = s.n_hit + s.n_missed - n_dropped_frags;
      ots->lookup_count = htonll(n_lookup);
      ots->matched_count = htonll(n_lookup - n_miss - n_no_pkt_in);
@@ -1546,6 +1550,9 @@ port_destruct(struct ofport *port_)
      bundle_remove(port_);
      set_cfm(port_, NULL);
      set_bfd(port_, NULL);
+    if (port->stp_port) {
+        stp_port_disable(port->stp_port);
+    }
      if (ofproto->sflow) {
          dpif_sflow_del_port(ofproto->sflow, port->odp_port);
      }
@@ -2911,24 +2918,39 @@ static void
  rule_expire(struct rule_dpif *rule)
      OVS_REQUIRES(ofproto_mutex)
  {
-    uint16_t idle_timeout, hard_timeout;
+    uint16_t hard_timeout, idle_timeout;
      long long int now = time_msec();
-    int reason;
+    int reason = -1;
  
      ovs_assert(!rule->up.pending);
  
-    /* Has 'rule' expired? */
-    ovs_mutex_lock(&rule->up.mutex);
      hard_timeout = rule->up.hard_timeout;
      idle_timeout = rule->up.idle_timeout;
-    if (hard_timeout && now > rule->up.modified + hard_timeout * 1000) {
-        reason = OFPRR_HARD_TIMEOUT;
-    } else if (idle_timeout && now > rule->up.used + idle_timeout * 1000) {
-        reason = OFPRR_IDLE_TIMEOUT;
-    } else {
-        reason = -1;
+
+    /* Has 'rule' expired? */
+    if (hard_timeout) {
+        long long int modified;
+
+        ovs_mutex_lock(&rule->up.mutex);
+        modified = rule->up.modified;
+        ovs_mutex_unlock(&rule->up.mutex);
+
+        if (now > modified + hard_timeout * 1000) {
+            reason = OFPRR_HARD_TIMEOUT;
+        }
+    }
+
+    if (reason < 0 && idle_timeout) {
+        long long int used;
+
+        ovs_mutex_lock(&rule->stats_mutex);
+        used = rule->used;
+        ovs_mutex_unlock(&rule->stats_mutex);
+
+        if (now > used + idle_timeout * 1000) {
+            reason = OFPRR_IDLE_TIMEOUT;
+        }
      }
-    ovs_mutex_unlock(&rule->up.mutex);
  
      if (reason >= 0) {
          COVERAGE_INC(ofproto_dpif_expired);
@@ -2992,7 +3014,7 @@ rule_dpif_credit_stats(struct rule_dpif *rule,
      ovs_mutex_lock(&rule->stats_mutex);
      rule->packet_count += stats->n_packets;
      rule->byte_count += stats->n_bytes;
-    rule->up.used = MAX(rule->up.used, stats->used);
+    rule->used = MAX(rule->used, stats->used);
      ovs_mutex_unlock(&rule->stats_mutex);
  }
  
@@ -3154,13 +3176,13 @@ rule_dealloc(struct rule *rule_)
  
  static enum ofperr
  rule_construct(struct rule *rule_)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
  {
      struct rule_dpif *rule = rule_dpif_cast(rule_);
-    ovs_mutex_init(&rule->stats_mutex);
-    ovs_mutex_lock(&rule->stats_mutex);
+    ovs_mutex_init_adaptive(&rule->stats_mutex);
      rule->packet_count = 0;
      rule->byte_count = 0;
-    ovs_mutex_unlock(&rule->stats_mutex);
+    rule->used = rule->up.modified;
      return 0;
  }
  
@@ -3188,13 +3210,15 @@ rule_destruct(struct rule *rule_)
  }
  
  static void
-rule_get_stats(struct rule *rule_, uint64_t *packets, uint64_t *bytes)
+rule_get_stats(struct rule *rule_, uint64_t *packets, uint64_t *bytes,
+               long long int *used)
  {
      struct rule_dpif *rule = rule_dpif_cast(rule_);
  
      ovs_mutex_lock(&rule->stats_mutex);
      *packets = rule->packet_count;
      *bytes = rule->byte_count;
+    *used = rule->used;
      ovs_mutex_unlock(&rule->stats_mutex);
  }
  
@@ -3270,7 +3294,21 @@ static enum ofperr
  group_construct(struct ofgroup *group_)
  {
      struct group_dpif *group = group_dpif_cast(group_);
-    ovs_mutex_init(&group->stats_mutex);
+    const struct ofputil_bucket *bucket;
+
+    /* Prevent group chaining because our locking structure makes it hard to
+     * implement without deadlock.  (See xlate_group_resource_check().) */
+    LIST_FOR_EACH (bucket, list_node, &group->up.buckets) {
+        const struct ofpact *a;
+
+        OFPACT_FOR_EACH (a, bucket->ofpacts, bucket->ofpacts_len) {
+            if (a->type == OFPACT_GROUP) {
+                return OFPERR_OFPGMFC_CHAINING_UNSUPPORTED;
+            }
+        }
+    }
+
+    ovs_mutex_init_adaptive(&group->stats_mutex);
      ovs_mutex_lock(&group->stats_mutex);
      group_construct_stats(group);
      ovs_mutex_unlock(&group->stats_mutex);
@@ -4250,12 +4288,8 @@ bool
  ofproto_has_vlan_splinters(const struct ofproto_dpif *ofproto)
      OVS_EXCLUDED(ofproto->vsp_mutex)
  {
-    bool ret;
-
-    ovs_mutex_lock(&ofproto->vsp_mutex);
-    ret = !hmap_is_empty(&ofproto->realdev_vid_map);
-    ovs_mutex_unlock(&ofproto->vsp_mutex);
-    return ret;
+    /* hmap_is_empty is thread safe. */
+    return !hmap_is_empty(&ofproto->realdev_vid_map);
  }
  
  static ofp_port_t
@@ -4293,6 +4327,10 @@ vsp_realdev_to_vlandev(const struct ofproto_dpif *ofproto,
  {
      ofp_port_t ret;
  
+    /* hmap_is_empty is thread safe, see if we can return immediately. */
+    if (hmap_is_empty(&ofproto->realdev_vid_map)) {
+        return realdev_ofp_port;
+    }
      ovs_mutex_lock(&ofproto->vsp_mutex);
      ret = vsp_realdev_to_vlandev__(ofproto, realdev_ofp_port, vlan_tci);
      ovs_mutex_unlock(&ofproto->vsp_mutex);
@@ -4356,6 +4394,11 @@ vsp_adjust_flow(const struct ofproto_dpif *ofproto, struct flow *flow)
      ofp_port_t realdev;
      int vid;
  
+    /* hmap_is_empty is thread safe. */
+    if (hmap_is_empty(&ofproto->vlandev_map)) {
+        return false;
+    }
+
      ovs_mutex_lock(&ofproto->vsp_mutex);
      realdev = vsp_vlandev_to_realdev(ofproto, flow->in_port.ofp_port, &vid);
      ovs_mutex_unlock(&ofproto->vsp_mutex);
diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h

index 19d1551..2c72fbc 100644 (file)
--- a/ofproto/ofproto-provider.h
+++ b/ofproto/ofproto-provider.h
@@ -339,7 +339,9 @@ struct rule {
  
      /* Protects members marked OVS_GUARDED.
       * Readers only need to hold this mutex.
-     * Writers must hold both this mutex AND ofproto_mutex. */
+     * Writers must hold both this mutex AND ofproto_mutex.
+     * By implication writers can read *without* taking this mutex while they
+     * hold ofproto_mutex. */
      struct ovs_mutex mutex OVS_ACQ_AFTER(ofproto_mutex);
  
      /* Number of references.
@@ -356,10 +358,6 @@ struct rule {
      ovs_be64 flow_cookie OVS_GUARDED;
      struct hindex_node cookie_node OVS_GUARDED_BY(ofproto_mutex);
  
-    /* Times. */
-    long long int created OVS_GUARDED; /* Creation time. */
-    long long int modified OVS_GUARDED; /* Time of last modification. */
-    long long int used OVS_GUARDED; /* Last use; time created if never used. */
      enum ofputil_flow_mod_flags flags OVS_GUARDED;
  
      /* Timeouts. */
@@ -393,6 +391,13 @@ struct rule {
      /* Optimisation for flow expiry.  In ofproto's 'expirable' list if this
       * rule is expirable, otherwise empty. */
      struct list expirable OVS_GUARDED_BY(ofproto_mutex);
+
+    /* Times.  Placed last so that they are more likely to be close to the
+     * stats managed by the provider. */
+    long long int created OVS_GUARDED; /* Creation time. */
+
+    /* Must hold 'mutex' for both read/write, 'ofproto_mutex' not needed. */
+    long long int modified OVS_GUARDED; /* Time of last modification. */
  };
  
  void ofproto_rule_ref(struct rule *);
@@ -463,10 +468,6 @@ extern unsigned ofproto_flow_limit;
   * ofproto-dpif implementation. */
  extern size_t n_handlers, n_revalidators;
  
-/* Determines which model to use for handling misses in the ofproto-dpif
- * implementation */
-extern enum ofproto_flow_miss_model flow_miss_model;
-
  static inline struct rule *
  rule_from_cls_rule(const struct cls_rule *cls_rule)
  {
@@ -936,8 +937,9 @@ struct ofproto_class {
      void (*port_reconfigured)(struct ofport *ofport,
                                enum ofputil_port_config old_config);
  
-    /* Looks up a port named 'devname' in 'ofproto'.  On success, initializes
-     * '*port' appropriately.
+    /* Looks up a port named 'devname' in 'ofproto'.  On success, returns 0 and
+     * initializes '*port' appropriately. Otherwise, returns a positive errno
+     * value.
       *
       * The caller owns the data in 'port' and must free it with
       * ofproto_port_destroy() when it is no longer needed. */
@@ -1279,7 +1281,7 @@ struct ofproto_class {
       * in '*byte_count'.  UINT64_MAX indicates that the packet count or byte
       * count is unknown. */
      void (*rule_get_stats)(struct rule *rule, uint64_t *packet_count,
-                           uint64_t *byte_count)
+                           uint64_t *byte_count, long long int *used)
          /* OVS_EXCLUDED(ofproto_mutex) */;
  
      /* Applies the actions in 'rule' to 'packet'.  (This implements sending
diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c

index a9cf221..02e628a 100644 (file)
--- a/ofproto/ofproto.c
+++ b/ofproto/ofproto.c
@@ -182,7 +182,7 @@ struct eviction_group {
  
  static bool choose_rule_to_evict(struct oftable *table, struct rule **rulep);
  static void ofproto_evict(struct ofproto *) OVS_EXCLUDED(ofproto_mutex);
-static uint32_t rule_eviction_priority(struct rule *);
+static uint32_t rule_eviction_priority(struct ofproto *ofproto, struct rule *);
  static void eviction_group_add_rule(struct rule *);
  static void eviction_group_remove_rule(struct rule *);
  
@@ -307,7 +307,6 @@ static size_t allocated_ofproto_classes;
  struct ovs_mutex ofproto_mutex = OVS_MUTEX_INITIALIZER;
  
  unsigned ofproto_flow_limit = OFPROTO_FLOW_LIMIT_DEFAULT;
-enum ofproto_flow_miss_model flow_miss_model = OFPROTO_HANDLE_MISS_AUTO;
  
  size_t n_handlers, n_revalidators;
  
@@ -698,13 +697,6 @@ ofproto_set_flow_limit(unsigned limit)
      ofproto_flow_limit = limit;
  }
  
-/* Sets the path for handling flow misses. */
-void
-ofproto_set_flow_miss_model(unsigned model)
-{
-    flow_miss_model = model;
-}
-
  /* If forward_bpdu is true, the NORMAL action will forward frames with
   * reserved (e.g. STP) destination Ethernet addresses. if forward_bpdu is false,
   * the NORMAL action will drop these frames. */
@@ -1935,6 +1927,46 @@ int
  ofproto_flow_mod(struct ofproto *ofproto, struct ofputil_flow_mod *fm)
      OVS_EXCLUDED(ofproto_mutex)
  {
+    /* Optimize for the most common case of a repeated learn action.
+     * If an identical flow already exists we only need to update its
+     * 'modified' time. */
+    if (fm->command == OFPFC_MODIFY_STRICT && fm->table_id != OFPTT_ALL
+        && !(fm->flags & OFPUTIL_FF_RESET_COUNTS)) {
+        struct oftable *table = &ofproto->tables[fm->table_id];
+        struct cls_rule match_rule;
+        struct rule *rule;
+        bool done = false;
+
+        cls_rule_init(&match_rule, &fm->match, fm->priority);
+        fat_rwlock_rdlock(&table->cls.rwlock);
+        rule = rule_from_cls_rule(classifier_find_rule_exactly(&table->cls,
+                                                               &match_rule));
+        if (rule) {
+            /* Reading many of the rule fields and writing to 'modified'
+             * requires holding rule->mutex.  Also, rule->actions may change
+             * if rule->mutex is not held. */
+            ovs_mutex_lock(&rule->mutex);
+            if (rule->idle_timeout == fm->idle_timeout
+                && rule->hard_timeout == fm->hard_timeout
+                && rule->flags == (fm->flags & OFPUTIL_FF_STATE)
+                && (!fm->modify_cookie || (fm->new_cookie == rule->flow_cookie))
+                && ofpacts_equal(fm->ofpacts, fm->ofpacts_len,
+                                 rule->actions->ofpacts,
+                                 rule->actions->ofpacts_len)) {
+                /* Rule already exists and need not change; only update the
+                 * 'modified' timestamp. */
+                rule->modified = time_msec();
+                done = true;
+            }
+            ovs_mutex_unlock(&rule->mutex);
+        }
+        fat_rwlock_unlock(&table->cls.rwlock);
+
+        if (done) {
+            return 0;
+        }
+    }
+
      return handle_flow_mod__(ofproto, NULL, fm, NULL);
  }
  
@@ -2204,7 +2236,8 @@ ofport_modified(struct ofport *port, struct ofputil_phy_port *pp)
      memcpy(port->pp.hw_addr, pp->hw_addr, ETH_ADDR_LEN);
      port->pp.config = ((port->pp.config & ~OFPUTIL_PC_PORT_DOWN)
                          | (pp->config & OFPUTIL_PC_PORT_DOWN));
-    port->pp.state = pp->state;
+    port->pp.state = ((port->pp.state & ~OFPUTIL_PS_LINK_DOWN)
+                      | (pp->state & OFPUTIL_PS_LINK_DOWN));
      port->pp.curr = pp->curr;
      port->pp.advertised = pp->advertised;
      port->pp.supported = pp->supported;
@@ -3561,20 +3594,20 @@ handle_flow_stats_request(struct ofconn *ofconn,
          fs.idle_timeout = rule->idle_timeout;
          fs.hard_timeout = rule->hard_timeout;
          created = rule->created;
-        used = rule->used;
          modified = rule->modified;
          actions = rule_get_actions__(rule);
          flags = rule->flags;
          ovs_mutex_unlock(&rule->mutex);
  
+        ofproto->ofproto_class->rule_get_stats(rule, &fs.packet_count,
+                                               &fs.byte_count, &used);
+
          minimatch_expand(&rule->cr.match, &fs.match);
          fs.table_id = rule->table_id;
          calc_duration(created, now, &fs.duration_sec, &fs.duration_nsec);
          fs.priority = rule->cr.priority;
          fs.idle_age = age_secs(now - used);
          fs.hard_age = age_secs(now - modified);
-        ofproto->ofproto_class->rule_get_stats(rule, &fs.packet_count,
-                                               &fs.byte_count);
          fs.ofpacts = actions->ofpacts;
          fs.ofpacts_len = actions->ofpacts_len;
  
@@ -3597,10 +3630,10 @@ flow_stats_ds(struct rule *rule, struct ds *results)
  {
      uint64_t packet_count, byte_count;
      struct rule_actions *actions;
-    long long int created;
+    long long int created, used;
  
-    rule->ofproto->ofproto_class->rule_get_stats(rule,
-                                                 &packet_count, &byte_count);
+    rule->ofproto->ofproto_class->rule_get_stats(rule, &packet_count,
+                                                 &byte_count, &used);
  
      ovs_mutex_lock(&rule->mutex);
      actions = rule_get_actions__(rule);
@@ -3711,9 +3744,10 @@ handle_aggregate_stats_request(struct ofconn *ofconn,
          struct rule *rule = rules.rules[i];
          uint64_t packet_count;
          uint64_t byte_count;
+        long long int used;
  
          ofproto->ofproto_class->rule_get_stats(rule, &packet_count,
-                                               &byte_count);
+                                               &byte_count, &used);
  
          if (packet_count == UINT64_MAX) {
              unknown_packets = true;
@@ -4015,7 +4049,7 @@ add_flow(struct ofproto *ofproto, struct ofconn *ofconn,
      ovs_refcount_init(&rule->ref_count);
      rule->pending = NULL;
      rule->flow_cookie = fm->new_cookie;
-    rule->created = rule->modified = rule->used = time_msec();
+    rule->created = rule->modified = time_msec();
  
      ovs_mutex_init(&rule->mutex);
      ovs_mutex_lock(&rule->mutex);
@@ -4305,6 +4339,7 @@ ofproto_rule_send_removed(struct rule *rule, uint8_t reason)
      OVS_REQUIRES(ofproto_mutex)
  {
      struct ofputil_flow_removed fr;
+    long long int used;
  
      if (ofproto_rule_is_hidden(rule) ||
          !(rule->flags & OFPUTIL_FF_SEND_FLOW_REM)) {
@@ -4323,7 +4358,7 @@ ofproto_rule_send_removed(struct rule *rule, uint8_t reason)
      fr.hard_timeout = rule->hard_timeout;
      ovs_mutex_unlock(&rule->mutex);
      rule->ofproto->ofproto_class->rule_get_stats(rule, &fr.packet_count,
-                                                 &fr.byte_count);
+                                                 &fr.byte_count, &used);
  
      connmgr_send_flow_removed(rule->ofproto->connmgr, &fr);
  }
@@ -6192,10 +6227,12 @@ ofopgroup_complete(struct ofopgroup *group)
              if (!op->error) {
                  long long int now = time_msec();
  
+                ovs_mutex_lock(&rule->mutex);
                  rule->modified = now;
                  if (op->type == OFOPERATION_REPLACE) {
-                    rule->created = rule->used = now;
+                    rule->created = now;
                  }
+                ovs_mutex_unlock(&rule->mutex);
              } else {
                  ofproto_rule_change_cookie(ofproto, rule, op->flow_cookie);
                  ovs_mutex_lock(&rule->mutex);
@@ -6558,26 +6595,34 @@ eviction_group_find(struct oftable *table, uint32_t id)
  
  /* Returns an eviction priority for 'rule'.  The return value should be
   * interpreted so that higher priorities make a rule more attractive candidates
- * for eviction. */
+ * for eviction.
+ * Called only if 'rule' has a timeout. */
  static uint32_t
-rule_eviction_priority(struct rule *rule)
+rule_eviction_priority(struct ofproto *ofproto, struct rule *rule)
      OVS_REQUIRES(ofproto_mutex)
  {
-    long long int hard_expiration;
-    long long int idle_expiration;
-    long long int expiration;
+    long long int expiration = LLONG_MAX;
+    long long int modified;
      uint32_t expiration_offset;
  
-    /* Calculate time of expiration. */
+    /* 'modified' needs protection even when we hold 'ofproto_mutex'. */
      ovs_mutex_lock(&rule->mutex);
-    hard_expiration = (rule->hard_timeout
-                       ? rule->modified + rule->hard_timeout * 1000
-                       : LLONG_MAX);
-    idle_expiration = (rule->idle_timeout
-                       ? rule->used + rule->idle_timeout * 1000
-                       : LLONG_MAX);
-    expiration = MIN(hard_expiration, idle_expiration);
+    modified = rule->modified;
      ovs_mutex_unlock(&rule->mutex);
+
+    if (rule->hard_timeout) {
+        expiration = modified + rule->hard_timeout * 1000;
+    }
+    if (rule->idle_timeout) {
+        uint64_t packets, bytes;
+        long long int used;
+        long long int idle_expiration;
+
+        ofproto->ofproto_class->rule_get_stats(rule, &packets, &bytes, &used);
+        idle_expiration = used + rule->idle_timeout * 1000;
+        expiration = MIN(expiration, idle_expiration);
+    }
+
      if (expiration == LLONG_MAX) {
          return 0;
      }
@@ -6607,9 +6652,9 @@ eviction_group_add_rule(struct rule *rule)
      struct oftable *table = &ofproto->tables[rule->table_id];
      bool has_timeout;
  
-    ovs_mutex_lock(&rule->mutex);
+    /* Timeouts may be modified only when holding 'ofproto_mutex'.  We have it
+     * so no additional protection is needed. */
      has_timeout = rule->hard_timeout || rule->idle_timeout;
-    ovs_mutex_unlock(&rule->mutex);
  
      if (table->eviction_fields && has_timeout) {
          struct eviction_group *evg;
@@ -6618,7 +6663,7 @@ eviction_group_add_rule(struct rule *rule)
  
          rule->eviction_group = evg;
          heap_insert(&evg->rules, &rule->evg_node,
-                    rule_eviction_priority(rule));
+                    rule_eviction_priority(ofproto, rule));
          eviction_group_resized(table, evg);
      }
  }
diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h

index 0ac4454..1f9cb15 100644 (file)
--- a/ofproto/ofproto.h
+++ b/ofproto/ofproto.h
@@ -215,13 +215,6 @@ int ofproto_port_dump_done(struct ofproto_port_dump *);
  
  #define OFPROTO_FLOW_LIMIT_DEFAULT 200000
  
-/* How flow misses should be handled in ofproto-dpif */
-enum ofproto_flow_miss_model {
-    OFPROTO_HANDLE_MISS_AUTO,           /* Based on flow eviction threshold. */
-    OFPROTO_HANDLE_MISS_WITH_FACETS,    /* Always create facets. */
-    OFPROTO_HANDLE_MISS_WITHOUT_FACETS  /* Always handle without facets.*/
-};
-
  const char *ofproto_port_open_type(const char *datapath_type,
                                     const char *port_type);
  int ofproto_port_add(struct ofproto *, struct netdev *, ofp_port_t *ofp_portp);
@@ -243,7 +236,6 @@ void ofproto_set_extra_in_band_remotes(struct ofproto *,
                                         const struct sockaddr_in *, size_t n);
  void ofproto_set_in_band_queue(struct ofproto *, int queue_id);
  void ofproto_set_flow_limit(unsigned limit);
-void ofproto_set_flow_miss_model(unsigned model);
  void ofproto_set_forward_bpdu(struct ofproto *, bool forward_bpdu);
  void ofproto_set_mac_table_config(struct ofproto *, unsigned idle_time,
                                    size_t max_entries);
diff --git a/ofproto/tunnel.c b/ofproto/tunnel.c

index 09497a3..38b782f 100644 (file)
--- a/ofproto/tunnel.c
+++ b/ofproto/tunnel.c
@@ -60,8 +60,46 @@ struct tnl_port {
  
  static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
  
-static struct hmap tnl_match_map__ = HMAP_INITIALIZER(&tnl_match_map__);
-static struct hmap *tnl_match_map OVS_GUARDED_BY(rwlock) = &tnl_match_map__;
+/* Tunnel matches.
+ *
+ * This module maps packets received over tunnel protocols to vports.  The
+ * tunnel protocol and, for some protocols, tunnel-specific information (e.g.,
+ * for VXLAN, the UDP destination port number) are always used as part of the
+ * mapping.  Which other fields are used for the mapping depends on the vports
+ * themselves (the parenthesized notations refer to "struct tnl_match" fields):
+ *
+ *     - in_key: A vport may match a specific tunnel ID (in_key_flow == false)
+ *       or arrange for the tunnel ID to be matched as tunnel.tun_id in the
+ *       OpenFlow flow (in_key_flow == true).
+ *
+ *     - ip_dst: A vport may match a specific destination IP address
+ *       (ip_dst_flow == false) or arrange for the destination IP to be matched
+ *       as tunnel.ip_dst in the OpenFlow flow (ip_dst_flow == true).
+ *
+ *     - ip_src: A vport may match a specific IP source address (ip_src_flow ==
+ *       false, ip_src != 0), wildcard all source addresses (ip_src_flow ==
+ *       false, ip_src == 0), or arrange for the IP source address to be
+ *       handled in the OpenFlow flow table (ip_src_flow == true).
+ *
+ * Thus, there are 2 * 2 * 3 == 12 possible ways a vport can match against a
+ * tunnel packet.  We number the possibilities for each field in increasing
+ * order as listed in each bullet above.  We order the 12 overall combinations
+ * in lexicographic order considering in_key first, then ip_dst, then
+ * ip_src. */
+#define N_MATCH_TYPES (2 * 2 * 3)
+
+/* The three possibilities (see above) for vport ip_src matches. */
+enum ip_src_type {
+    IP_SRC_CFG,             /* ip_src must equal configured address. */
+    IP_SRC_ANY,             /* Any ip_src is acceptable. */
+    IP_SRC_FLOW             /* ip_src is handled in flow table. */
+};
+
+/* Each hmap contains "struct tnl_port"s.
+ * The index is a combination of how each of the fields listed under "Tunnel
+ * matches" above matches, see the final paragraph for ordering. */
+static struct hmap *tnl_match_maps[N_MATCH_TYPES] OVS_GUARDED_BY(rwlock);
+static struct hmap **tnl_match_map(const struct tnl_match *);
  
  static struct hmap ofport_map__ = HMAP_INITIALIZER(&ofport_map__);
  static struct hmap *ofport_map OVS_GUARDED_BY(rwlock) = &ofport_map__;
@@ -70,7 +108,7 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
  static struct vlog_rate_limit dbg_rl = VLOG_RATE_LIMIT_INIT(60, 60);
  
  static struct tnl_port *tnl_find(const struct flow *) OVS_REQ_RDLOCK(rwlock);
-static struct tnl_port *tnl_find_exact(struct tnl_match *)
+static struct tnl_port *tnl_find_exact(struct tnl_match *, struct hmap *)
      OVS_REQ_RDLOCK(rwlock);
  static struct tnl_port *tnl_find_ofport(const struct ofport_dpif *)
      OVS_REQ_RDLOCK(rwlock);
@@ -92,6 +130,7 @@ tnl_port_add__(const struct ofport_dpif *ofport, const struct netdev *netdev,
      const struct netdev_tunnel_config *cfg;
      struct tnl_port *existing_port;
      struct tnl_port *tnl_port;
+    struct hmap **map;
  
      cfg = netdev_get_tunnel_config(netdev);
      ovs_assert(cfg);
@@ -110,7 +149,8 @@ tnl_port_add__(const struct ofport_dpif *ofport, const struct netdev *netdev,
      tnl_port->match.in_key_flow = cfg->in_key_flow;
      tnl_port->match.odp_port = odp_port;
  
-    existing_port = tnl_find_exact(&tnl_port->match);
+    map = tnl_match_map(&tnl_port->match);
+    existing_port = tnl_find_exact(&tnl_port->match, *map);
      if (existing_port) {
          if (warn) {
              struct ds ds = DS_EMPTY_INITIALIZER;
@@ -125,8 +165,12 @@ tnl_port_add__(const struct ofport_dpif *ofport, const struct netdev *netdev,
      }
  
      hmap_insert(ofport_map, &tnl_port->ofport_node, hash_pointer(ofport, 0));
-    hmap_insert(tnl_match_map, &tnl_port->match_node,
-                tnl_hash(&tnl_port->match));
+
+    if (!*map) {
+        *map = xmalloc(sizeof **map);
+        hmap_init(*map);
+    }
+    hmap_insert(*map, &tnl_port->match_node, tnl_hash(&tnl_port->match));
      tnl_port_mod_log(tnl_port, "adding");
      return true;
  }
@@ -182,8 +226,16 @@ tnl_port_del__(const struct ofport_dpif *ofport) OVS_REQ_WRLOCK(rwlock)
  
      tnl_port = tnl_find_ofport(ofport);
      if (tnl_port) {
+        struct hmap **map;
+
          tnl_port_mod_log(tnl_port, "removing");
-        hmap_remove(tnl_match_map, &tnl_port->match_node);
+        map = tnl_match_map(&tnl_port->match);
+        hmap_remove(*map, &tnl_port->match_node);
+        if (hmap_is_empty(*map)) {
+            hmap_destroy(*map);
+            free(*map);
+            *map = NULL;
+        }
          hmap_remove(ofport_map, &tnl_port->ofport_node);
          netdev_close(tnl_port->netdev);
          free(tnl_port);
@@ -396,14 +448,16 @@ tnl_find_ofport(const struct ofport_dpif *ofport) OVS_REQ_RDLOCK(rwlock)
  }
  
  static struct tnl_port *
-tnl_find_exact(struct tnl_match *match) OVS_REQ_RDLOCK(rwlock)
+tnl_find_exact(struct tnl_match *match, struct hmap *map)
+    OVS_REQ_RDLOCK(rwlock)
  {
-    struct tnl_port *tnl_port;
+    if (map) {
+        struct tnl_port *tnl_port;
  
-    HMAP_FOR_EACH_WITH_HASH (tnl_port, match_node, tnl_hash(match),
-                             tnl_match_map) {
-        if (!memcmp(match, &tnl_port->match, sizeof *match)) {
-            return tnl_port;
+        HMAP_FOR_EACH_WITH_HASH (tnl_port, match_node, tnl_hash(match), map) {
+            if (!memcmp(match, &tnl_port->match, sizeof *match)) {
+                return tnl_port;
+            }
          }
      }
      return NULL;
@@ -414,55 +468,67 @@ tnl_find_exact(struct tnl_match *match) OVS_REQ_RDLOCK(rwlock)
  static struct tnl_port *
  tnl_find(const struct flow *flow) OVS_REQ_RDLOCK(rwlock)
  {
-    enum ip_src_type {
-        IP_SRC_CFG,             /* ip_src must equal configured address. */
-        IP_SRC_ANY,             /* Any ip_src is acceptable. */
-        IP_SRC_FLOW             /* ip_src is handled in flow table. */
-    };
-
-    struct tnl_match_pattern {
-        bool in_key_flow;
-        bool ip_dst_flow;
-        enum ip_src_type ip_src;
-    };
-
-    static const struct tnl_match_pattern patterns[] = {
-        { false, false, IP_SRC_CFG },  /* remote_ip, local_ip, in_key. */
-        { false, false, IP_SRC_ANY },  /* remote_ip, in_key. */
-        { true,  false, IP_SRC_CFG },  /* remote_ip, local_ip. */
-        { true,  false, IP_SRC_ANY },  /* remote_ip. */
-        { true,  true,  IP_SRC_ANY },  /* Flow-based remote. */
-        { true,  true,  IP_SRC_FLOW }, /* Flow-based everything. */
-    };
-
-    const struct tnl_match_pattern *p;
-    struct tnl_match match;
-
-    memset(&match, 0, sizeof match);
-    match.odp_port = flow->in_port.odp_port;
-    match.pkt_mark = flow->pkt_mark;
-
-    for (p = patterns; p < &patterns[ARRAY_SIZE(patterns)]; p++) {
-        struct tnl_port *tnl_port;
-
-        match.in_key_flow = p->in_key_flow;
-        match.in_key = p->in_key_flow ? 0 : flow->tunnel.tun_id;
-
-        match.ip_dst_flow = p->ip_dst_flow;
-        match.ip_dst = p->ip_dst_flow ? 0 : flow->tunnel.ip_src;
-
-        match.ip_src_flow = p->ip_src == IP_SRC_FLOW;
-        match.ip_src = p->ip_src == IP_SRC_CFG ? flow->tunnel.ip_dst : 0;
-
-        tnl_port = tnl_find_exact(&match);
-        if (tnl_port) {
-            return tnl_port;
+    enum ip_src_type ip_src;
+    int in_key_flow;
+    int ip_dst_flow;
+    int i;
+
+    i = 0;
+    for (in_key_flow = 0; in_key_flow < 2; in_key_flow++) {
+        for (ip_dst_flow = 0; ip_dst_flow < 2; ip_dst_flow++) {
+            for (ip_src = 0; ip_src < 3; ip_src++) {
+                struct hmap *map = tnl_match_maps[i];
+
+                if (map) {
+                    struct tnl_port *tnl_port;
+                    struct tnl_match match;
+
+                    memset(&match, 0, sizeof match);
+
+                    /* The apparent mix-up of 'ip_dst' and 'ip_src' below is
+                     * correct, because "struct tnl_match" is expressed in
+                     * terms of packets being sent out, but we are using it
+                     * here as a description of how to treat received
+                     * packets. */
+                    match.in_key = in_key_flow ? 0 : flow->tunnel.tun_id;
+                    match.ip_src = (ip_src == IP_SRC_CFG
+                                    ? flow->tunnel.ip_dst
+                                    : 0);
+                    match.ip_dst = ip_dst_flow ? 0 : flow->tunnel.ip_src;
+                    match.odp_port = flow->in_port.odp_port;
+                    match.pkt_mark = flow->pkt_mark;
+                    match.in_key_flow = in_key_flow;
+                    match.ip_dst_flow = ip_dst_flow;
+                    match.ip_src_flow = ip_src == IP_SRC_FLOW;
+
+                    tnl_port = tnl_find_exact(&match, map);
+                    if (tnl_port) {
+                        return tnl_port;
+                    }
+                }
+
+                i++;
+            }
          }
      }
  
      return NULL;
  }
  
+/* Returns a pointer to the 'tnl_match_maps' element corresponding to 'm''s
+ * matching criteria. */
+static struct hmap **
+tnl_match_map(const struct tnl_match *m)
+{
+    enum ip_src_type ip_src;
+
+    ip_src = (m->ip_src_flow ? IP_SRC_FLOW
+              : m->ip_src ? IP_SRC_CFG
+              : IP_SRC_ANY);
+
+    return &tnl_match_maps[6 * m->in_key_flow + 3 * m->ip_dst_flow + ip_src];
+}
+
  static void
  tnl_match_fmt(const struct tnl_match *match, struct ds *ds)
      OVS_REQ_RDLOCK(rwlock)
diff --git a/ovsdb/ovsdb-server.1.in b/ovsdb/ovsdb-server.1.in

index 6924d42..a4cf344 100644 (file)
--- a/ovsdb/ovsdb-server.1.in
+++ b/ovsdb/ovsdb-server.1.in
@@ -88,6 +88,8 @@ otherwise, it exits with exit code 1.
  This option can be useful where a database server is needed only to
  run a single command, e.g.:
  .B "ovsdb\-server \-\-remote=punix:socket \-\-run='ovsdb\-client dump unix:socket Open_vSwitch'"
+.IP
+This option is not supported on the Windows platform.
  .SS "Daemon Options"
  .ds DD \
  \fBovsdb\-server\fR detaches only after it starts listening on all \
diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c

index c24d355..bdcdad9 100644 (file)
--- a/ovsdb/ovsdb-server.c
+++ b/ovsdb/ovsdb-server.c
@@ -1172,7 +1172,9 @@ parse_options(int *argcp, char **argvp[],
      static const struct option long_options[] = {
          {"remote",      required_argument, NULL, OPT_REMOTE},
          {"unixctl",     required_argument, NULL, OPT_UNIXCTL},
+#ifndef _WIN32
          {"run",         required_argument, NULL, OPT_RUN},
+#endif
          {"help",        no_argument, NULL, 'h'},
          {"version",     no_argument, NULL, 'V'},
          DAEMON_LONG_OPTIONS,
diff --git a/tests/automake.mk b/tests/automake.mk

index f78547b..0cf45a4 100644 (file)
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -154,6 +154,11 @@ check-valgrind: all tests/atconfig tests/atlocal $(TESTSUITE) \
  check-oftest: all
         srcdir='$(srcdir)' $(SHELL) $(srcdir)/tests/run-oftest
  EXTRA_DIST += tests/run-oftest
+
+# Ryu support.
+check-ryu: all
+       srcdir='$(srcdir)' $(SHELL) $(srcdir)/tests/run-ryu
+EXTRA_DIST += tests/run-ryu
  \f
  clean-local:
         test ! -f '$(TESTSUITE)' || $(SHELL) '$(TESTSUITE)' -C tests --clean
diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at

index 46a6b67..6d48e5a 100644 (file)
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -112,6 +112,17 @@ AT_CHECK([tail -1 stdout], [0],
  OVS_VSWITCHD_STOP
  AT_CLEANUP
  
+AT_SETUP([ofproto-dpif - group chaining not supported])
+OVS_VSWITCHD_START
+ADD_OF_PORTS([br0], [1], [10], [11])
+AT_CHECK([ovs-ofctl -O OpenFlow12 add-group br0 'group_id=1234,type=all,bucket=output:10,set_field:192.168.3.90->ip_src,group:123,bucket=output:11'],
+  [1], [], [stderr])
+AT_CHECK([STRIP_XIDS stderr | sed 1q], [0],
+  [OFPT_ERROR (OF1.2): OFPGMFC_CHAINING_UNSUPPORTED
+])
+OVS_VSWITCHD_STOP
+AT_CLEANUP
+
  AT_SETUP([ofproto-dpif - all group in action list])
  OVS_VSWITCHD_START
  ADD_OF_PORTS([br0], [1], [10], [11])
@@ -1112,6 +1123,120 @@ done
  OVS_VSWITCHD_STOP
  AT_CLEANUP
  
+AT_SETUP([ofproto-dpif - MPLS handling])
+OVS_VSWITCHD_START([dnl
+   add-port br0 p1 -- set Interface p1 type=dummy
+])
+ON_EXIT([kill `cat ovs-ofctl.pid`])
+
+AT_CAPTURE_FILE([ofctl_monitor.log])
+AT_DATA([flows.txt], [dnl
+dl_src=40:44:44:44:00:00 actions=push_mpls:0x8847,controller
+dl_src=40:44:44:44:00:01,mpls actions=push_mpls:0x8847,controller
+dl_src=40:44:44:44:00:02,mpls actions=push_mpls:0x8848,controller
+])
+AT_CHECK([ovs-ofctl --protocols=OpenFlow12 add-flows br0 flows.txt])
+
+dnl In this test, we push an MPLS tag to an ethernet packet.
+AT_CHECK([ovs-ofctl --protocols=OpenFlow12 monitor br0 65534 -m -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:00:00,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no)'
+done
+OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
+
+AT_CHECK([cat ofctl_monitor.log | ofctl_strip], [0], [dnl
+OFPT_PACKET_IN (OF1.2): total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:00:00,dl_dst=50:54:00:00:00:07,mpls_label=0,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+00000000  50 54 00 00 00 07 40 44-44 44 00 00 88 47 00 00
+00000010  01 40 45 00 00 28 00 00-00 00 40 06 f9 7c c0 a8
+00000020  00 01 c0 a8 00 02 00 00-00 00 00 00 00 00 00 00
+00000030  00 00 50 00 00 00 00 00-00 00 00 00 00 00 00 00
+dnl
+OFPT_PACKET_IN (OF1.2): total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:00:00,dl_dst=50:54:00:00:00:07,mpls_label=0,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+00000000  50 54 00 00 00 07 40 44-44 44 00 00 88 47 00 00
+00000010  01 40 45 00 00 28 00 00-00 00 40 06 f9 7c c0 a8
+00000020  00 01 c0 a8 00 02 00 00-00 00 00 00 00 00 00 00
+00000030  00 00 50 00 00 00 00 00-00 00 00 00 00 00 00 00
+dnl
+OFPT_PACKET_IN (OF1.2): total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:00:00,dl_dst=50:54:00:00:00:07,mpls_label=0,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+00000000  50 54 00 00 00 07 40 44-44 44 00 00 88 47 00 00
+00000010  01 40 45 00 00 28 00 00-00 00 40 06 f9 7c c0 a8
+00000020  00 01 c0 a8 00 02 00 00-00 00 00 00 00 00 00 00
+00000030  00 00 50 00 00 00 00 00-00 00 00 00 00 00 00 00
+])
+
+dnl In this test, we push an MPLS tag to an MPLS packet. The LSE should be
+dnl copied exactly, except for the BOS bit.
+AT_CHECK([ovs-ofctl --protocols=OpenFlow12 monitor br0 65534 -m -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:00:01,dst=50:54:00:00:00:07),eth_type(0x8847),mpls(label=10,tc=0,ttl=64,bos=1)'
+done
+OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
+
+AT_CHECK([cat ofctl_monitor.log | ofctl_strip], [0], [dnl
+OFPT_PACKET_IN (OF1.2): total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:00:01,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=0,mpls_lse1=41280
+00000000  50 54 00 00 00 07 40 44-44 44 00 01 88 47 00 00
+00000010  a0 40 00 00 a1 40 00 00-00 00 00 00 00 00 00 00
+00000020  00 00 00 00 00 00 00 00-00 00 00 00 00 00 00 00
+00000030  00 00 00 00 00 00 00 00-00 00 00 00 00 00 00 00
+dnl
+OFPT_PACKET_IN (OF1.2): total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:00:01,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=0,mpls_lse1=41280
+00000000  50 54 00 00 00 07 40 44-44 44 00 01 88 47 00 00
+00000010  a0 40 00 00 a1 40 00 00-00 00 00 00 00 00 00 00
+00000020  00 00 00 00 00 00 00 00-00 00 00 00 00 00 00 00
+00000030  00 00 00 00 00 00 00 00-00 00 00 00 00 00 00 00
+dnl
+OFPT_PACKET_IN (OF1.2): total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:00:01,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=0,mpls_lse1=41280
+00000000  50 54 00 00 00 07 40 44-44 44 00 01 88 47 00 00
+00000010  a0 40 00 00 a1 40 00 00-00 00 00 00 00 00 00 00
+00000020  00 00 00 00 00 00 00 00-00 00 00 00 00 00 00 00
+00000030  00 00 00 00 00 00 00 00-00 00 00 00 00 00 00 00
+])
+
+dnl In this test, we push an MPLS tag to an MPLS packet. The LSE should be
+dnl copied exactly, except for the BOS bit. The ethertype should be updated
+dnl to the MPLS ethertype of the MPLS push action which differs to that
+dnl of the input packet.
+AT_CHECK([ovs-ofctl --protocols=OpenFlow12 monitor br0 65534 -m -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:00:02,dst=50:54:00:00:00:07),eth_type(0x8847),mpls(label=10,tc=0,ttl=64,bos=1)'
+done
+OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
+
+AT_CHECK([cat ofctl_monitor.log | ofctl_strip], [0], [dnl
+OFPT_PACKET_IN (OF1.2): total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mplsm,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:00:02,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=0,mpls_lse1=41280
+00000000  50 54 00 00 00 07 40 44-44 44 00 02 88 48 00 00
+00000010  a0 40 00 00 a1 40 00 00-00 00 00 00 00 00 00 00
+00000020  00 00 00 00 00 00 00 00-00 00 00 00 00 00 00 00
+00000030  00 00 00 00 00 00 00 00-00 00 00 00 00 00 00 00
+dnl
+OFPT_PACKET_IN (OF1.2): total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mplsm,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:00:02,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=0,mpls_lse1=41280
+00000000  50 54 00 00 00 07 40 44-44 44 00 02 88 48 00 00
+00000010  a0 40 00 00 a1 40 00 00-00 00 00 00 00 00 00 00
+00000020  00 00 00 00 00 00 00 00-00 00 00 00 00 00 00 00
+00000030  00 00 00 00 00 00 00 00-00 00 00 00 00 00 00 00
+dnl
+OFPT_PACKET_IN (OF1.2): total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mplsm,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:00:02,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=0,mpls_lse1=41280
+00000000  50 54 00 00 00 07 40 44-44 44 00 02 88 48 00 00
+00000010  a0 40 00 00 a1 40 00 00-00 00 00 00 00 00 00 00
+00000020  00 00 00 00 00 00 00 00-00 00 00 00 00 00 00 00
+00000030  00 00 00 00 00 00 00 00-00 00 00 00 00 00 00 00
+])
+
+OVS_VSWITCHD_STOP
+AT_CLEANUP
+
  AT_SETUP([ofproto-dpif - VLAN+MPLS handling])
  OVS_VSWITCHD_START([dnl
     add-port br0 p1 -- set Interface p1 type=dummy
@@ -2857,22 +2982,19 @@ dl_src=60:66:66:66:66:01 actions=pop_mpls:0x8849,controller
  AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
  
  dnl Packet is sent to userspace because a MPLS push or pop action is applied to
-dnl a packet with 4 MPLS LSEs but userspace and the datapath can only handle up
-dnl to 3 labels.
+dnl a packet with 2 MPLS LSEs but dpif-netdev can't handle any labels.
  dnl
-dnl The input is a frame with four MPLS labels which tcpdump -vve shows as:
+dnl The input is a frame with two MPLS labels which tcpdump -vve shows as:
  dnl 60:66:66:66:66:00 > 50:54:00:00:00:07, ethertype MPLS unicast (0x8847), length 74: MPLS (label 20, exp 0, ttl 32)
-dnl         (label 20, exp 0, ttl 32)
-dnl         (label 20, exp 0, ttl 32)
  dnl         (label 20, exp 0, [S], ttl 32)
  dnl         (tos 0x0, ttl 64, id 0, offset 0, flags [none], proto TCP (6), length 44, bad cksum 3b78 (->f978)!)
  dnl     192.168.0.1.80 > 192.168.0.2.0: Flags [none], cksum 0x7744 (correct), seq 42:46, win 10000, length 4
  
  for dl_src in 00 01; do
-    AT_CHECK([ovs-appctl netdev-dummy/receive p1 "505400000007 6066666666$dl_src 8847 00014020 00014020 00014020 00014120 45 00 00 2c 00 00 00 00 40 06 3b 78 c0 a8 00 01 c0 a8 00 02 00 50 00 00 00 00 00 2a 00 00 00 2a 50 00 27 10 77 44 00 00 48 4f 47 45"])
+    AT_CHECK([ovs-appctl netdev-dummy/receive p1 "505400000007 6066666666$dl_src 8847 00014020 00014120 45 00 00 2c 00 00 00 00 40 06 3b 78 c0 a8 00 01 c0 a8 00 02 00 50 00 00 00 00 00 2a 00 00 00 2a 50 00 27 10 77 44 00 00 48 4f 47 45"])
  
      AT_CHECK_UNQUOTED([ovs-appctl dpif/dump-flows br0 | grep ":$dl_src/" | STRIP_USED], [0], [dnl
-skb_priority(0),in_port(1),eth(src=60:66:66:66:66:$dl_src/ff:ff:ff:ff:ff:ff,dst=50:54:00:00:00:07/00:00:00:00:00:00),eth_type(0x8847),mpls(lse0=0x14020/0x100,lse1=0x14020/0x100,lse2=0x14020/0x100), packets:0, bytes:0, used:never, actions:drop
+skb_priority(0),in_port(1),eth(src=60:66:66:66:66:$dl_src/ff:ff:ff:ff:ff:ff,dst=50:54:00:00:00:07/00:00:00:00:00:00),eth_type(0x8847),mpls(lse0=0x14020,lse1=0x14120), packets:0, bytes:0, used:never, actions:userspace(pid=0,slow_path(controller))
  ])
  done
  
@@ -2895,7 +3017,7 @@ AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
  
  dnl Packet is dropped because an MPLS PUSH action is applied to a packet with
  dnl 4 MPLS LSEs but ovs-vswtichd can only handle up to 3 MPLS LSEs and thus
-dnl can't determine the resulting MPLS label after an MPLS PUSH action.
+dnl can't determine the resulting MPLS label after MPLS push/pop actions.
  dnl
  dnl The input is a frame with two MPLS headers which tcpdump -vve shows as:
  dnl 60:66:66:66:66:01 > 50:54:00:00:00:07, ethertype MPLS unicast (0x8847), length 74: MPLS (label 20, exp 0, ttl 32)
@@ -2904,11 +3026,14 @@ dnl         (label 20, exp 0, ttl 32)
  dnl         (label 20, exp 0, [S], ttl 32)
  dnl         (tos 0x0, ttl 64, id 0, offset 0, flags [none], proto TCP (6), length 44, bad cksum 3b78 (->f978)!)
  dnl     192.168.0.1.80 > 192.168.0.2.0: Flags [none], cksum 0x7744 (correct), seq 42:46, win 10000, length 4
-AT_CHECK([ovs-appctl netdev-dummy/receive p1 '50 54 00 00 00 07 60 66 66 66 66 01 88 47 00 01 40 20 00 01 40 20 00 01 40 20 00 01 41 20 45 00 00 2c 00 00 00 00 40 06 3b 78 c0 a8 00 01 c0 a8 00 02 00 50 00 00 00 00 00 2a 00 00 00 2a 50 00 27 10 77 44 00 00 48 4f 47 45'])
  
-AT_CHECK([ovs-appctl dpif/dump-flows br0 | sort | STRIP_USED], [0], [dnl
-skb_priority(0),in_port(1),eth(src=60:66:66:66:66:01/ff:ff:ff:ff:ff:ff,dst=50:54:00:00:00:07/00:00:00:00:00:00),eth_type(0x8847),mpls(lse0=0x14020/0x100,lse1=0x14020/0x100,lse2=0x14020/0x100), packets:0, bytes:0, used:never, actions:drop
+for dl_src in 00 01; do
+    AT_CHECK([ovs-appctl netdev-dummy/receive p1 "505400000007 6066666666$dl_src 8847 00014020 00014120 45 00 00 2c 00 00 00 00 40 06 3b 78 c0 a8 00 01 c0 a8 00 02 00 50 00 00 00 00 00 2a 00 00 00 2a 50 00 27 10 77 44 00 00 48 4f 47 45"])
+
+    AT_CHECK_UNQUOTED([ovs-appctl dpif/dump-flows br0 | grep ":$dl_src/" | STRIP_USED], [0], [dnl
+skb_priority(0),in_port(1),eth(src=60:66:66:66:66:$dl_src/ff:ff:ff:ff:ff:ff,dst=50:54:00:00:00:07/00:00:00:00:00:00),eth_type(0x8847),mpls(lse0=0x14020,lse1=0x14120), packets:0, bytes:0, used:never, actions:userspace(pid=0,slow_path(controller))
  ])
+done
  
  OVS_VSWITCHD_STOP
  AT_CLEANUP
@@ -3470,8 +3595,8 @@ for i in 1 2 3 4; do
  done
  sleep 1
  AT_CHECK([ovs-appctl dpif/dump-flows br0 | STRIP_USED], [0], [dnl
-skb_priority(0),skb_mark(0),in_port(1/0xffff),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2/255.255.255.255,dst=10.0.0.1/255.255.255.255,proto=1/0xff,tos=0/0xff,ttl=64/0xff,frag=no/0xfc),icmp(type=8,code=0), packets:3, bytes:180, used:0.0s, actions:2
-skb_priority(0),skb_mark(0),in_port(1/0xffff),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4/255.255.255.255,dst=10.0.0.3/255.255.255.255,proto=1/0xff,tos=0/0xff,ttl=64/0xff,frag=no/0xfc),icmp(type=8,code=0), packets:3, bytes:180, used:0.0s, actions:drop
+skb_priority(0),skb_mark(0),in_port(1/0xffff),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), packets:3, bytes:180, used:0.0s, actions:2
+skb_priority(0),skb_mark(0),in_port(1/0xffff),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), packets:3, bytes:180, used:0.0s, actions:drop
  ])
  OVS_VSWITCHD_STOP
  AT_CLEANUP
diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at

index b01743d..0cc4375 100644 (file)
--- a/tests/ovsdb-server.at
+++ b/tests/ovsdb-server.at
@@ -876,6 +876,8 @@ cat stdout >> output
  
  EXECUTION_EXAMPLES
  
+AT_BANNER([OVSDB -- ovsdb-server transactions (TCP IPv6 sockets)])
+
  # OVSDB_CHECK_EXECUTION(TITLE, SCHEMA, TRANSACTIONS, OUTPUT, [KEYWORDS])
  #
  # Creates a database with the given SCHEMA, starts an ovsdb-server on
diff --git a/tests/run-ryu b/tests/run-ryu

new file mode 100755 (executable)

index 0000000..94fe6bb
--- /dev/null
+++ b/tests/run-ryu
@@ -0,0 +1,148 @@
+#! /bin/sh
+
+run () {
+    echo "$@"
+    "$@" || exit 1
+}
+
+# Put built tools early in $PATH.
+builddir=`pwd`
+if test ! -e vswitchd/ovs-vswitchd; then
+    echo >&2 'not in build directory, please change directory or run via \"make check-ryu'
+    exit 1
+fi
+PATH=$builddir/ovsdb:$builddir/vswitchd:$builddir/utilities:$PATH; export PATH
+
+# Find srcdir.
+case $srcdir in
+    '') srcdir=$builddir ;;
+    /*) ;;
+    *) srcdir=`pwd`/$srcdir ;;
+esac
+if test ! -e "$srcdir"/WHY-OVS; then
+    echo >&2 'source directory not found, please set $srcdir or run via \"make check-ryu'
+    exit 1
+fi
+
+# Make sure ryu is available.
+if test X"$RYUDIR" = X; then
+    RYUDIR=$srcdir/../ryu
+fi
+PYTHONPATH=$RYUDIR:$PYTHONPATH; export PYTHONPATH
+PATH=$RYUDIR/bin:$PATH; export PATH
+if (ryu-manager --version) >/dev/null 2>&1; then
+    :
+else
+    echo >&2 '"ryu-manager" binary not found or cannot be run, please set $RYUDIR'
+    exit 1
+fi
+
+# Create sandbox.
+rm -rf sandbox
+mkdir sandbox
+cd sandbox
+sandbox=`pwd`
+
+# Set up environment for OVS programs to sandbox themselves.
+OVS_RUNDIR=$sandbox; export OVS_RUNDIR
+OVS_LOGDIR=$sandbox; export OVS_LOGDIR
+OVS_DBDIR=$sandbox; export OVS_DBDIR
+OVS_SYSCONFDIR=$sandbox; export OVS_SYSCONFDIR
+
+for signal in 0 1 2 3 13 14 15; do
+    trap 'kill `cat $sandbox/*.pid`; trap - $signal; kill -$signal $$' $signal
+done
+
+# Create database and start ovsdb-server.
+touch .conf.db.~lock~
+rm -f conf.db
+run ovsdb-tool create conf.db "$srcdir"/vswitchd/vswitch.ovsschema
+run ovsdb-server --detach --no-chdir --pidfile -vconsole:off --log-file \
+    --remote=punix:"$sandbox"/db.sock
+
+# Start ovs-vswitchd.
+run ovs-vswitchd --detach --no-chdir --pidfile -vconsole:off --log-file \
+    --enable-dummy --disable-system -vvconn -vnetdev_dummy
+
+# Add bridges for Ryu to use, and configure them to connect to Ryu.
+for config in \
+    'br0 0000000000000001 a c b d' \
+    'br1 0000000000000002 c a d b'
+do
+    set $config
+    bridge=$1 dpid=$2 port1=$3 peer1=$4 port2=$5 peer2=$6
+    run ovs-vsctl --no-wait \
+        -- add-br $bridge \
+        -- set bridge $bridge \
+               datapath-type=dummy fail-mode=secure \
+               protocols='[OpenFlow10,OpenFlow11,OpenFlow12,OpenFlow13]' \
+               other-config:datapath-id=$dpid \
+        -- set-controller $bridge tcp:127.0.0.1:6633 \
+        -- set controller $bridge connection-mode=out-of-band \
+                                  max-backoff=1000 \
+        -- add-port $bridge $port1 \
+        -- set interface $port1 ofport_request=1 type=patch options:peer=$peer1 \
+        -- add-port $bridge $port2 \
+        -- set interface $port2 ofport_request=2 type=patch options:peer=$peer2
+done
+
+logs=
+
+run_app() {
+    app=$1
+    cat <<EOF
+
+--- Running $app...
+
+EOF
+    logfile=$sandbox/`echo $app | sed 's,/,.,g'`.log
+    logs="$logs
+        $logfile"
+    ryu-manager "$app" --log-file="$logfile" & pid=$!
+    echo $pid > "$sandbox/ryu.pid"
+    i=0
+    while sleep 1; do
+        if grep -q -E 'TEST_FINISHED|Test end|uncaught exception' "$logfile" \
+                >/dev/null
+        then
+            break
+        fi
+
+        i=`expr $i + 1`
+        if test $i -ge 600; then
+            echo "--- TIMEOUT after $i seconds"
+            break
+        fi
+    done
+    kill $pid
+    wait
+}
+
+# Run Ryu.
+cd $RYUDIR
+for app in \
+    ryu/tests/switch/tester.py
+do
+    run_app $app
+done
+
+# Tweak the OVS setup because the following tests assume a single bridge.
+run ovs-vsctl -- del-br br1
+
+for app in \
+    ryu/tests/integrated/test_add_flow_v10.py \
+    ryu/tests/integrated/test_request_reply_v12.py \
+    ryu/tests/integrated/test_add_flow_v12_actions.py \
+    ryu/tests/integrated/test_add_flow_v12_matches.py
+do
+    run_app $app
+done
+
+cat <<EOF
+
+----------------------------------------------------------------------
+Logs may be found under $sandbox, e.g.:$logs
+       $sandbox/ovs-vswitchd.log
+       $sandbox/ovsdb-server.log
+----------------------------------------------------------------------
+EOF
diff --git a/utilities/ovs-vsctl.c b/utilities/ovs-vsctl.c

index af6ae5f..c563eee 100644 (file)
--- a/utilities/ovs-vsctl.c
+++ b/utilities/ovs-vsctl.c
@@ -4123,6 +4123,11 @@ do_vsctl(const char *args, struct vsctl_command *commands, size_t n_commands,
      free(commands);
  
      if (wait_for_reload && status != TXN_UNCHANGED) {
+        /* Even if the --retry flag was not specified, ovs-vsctl still
+         * has to retry establishing the OVSDB connection if wait_for_reload
+         * was set.  Otherwise, ovs-vsctl would wait forever for cur_cfg
+         * to be updated. */
+        ovsdb_idl_enable_reconnect(idl);
          for (;;) {
              ovsdb_idl_run(idl);
              OVSREC_OPEN_VSWITCH_FOR_EACH (ovs, idl) {
diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c

index cde4bd0..aa4ab31 100644 (file)
--- a/vswitchd/bridge.c
+++ b/vswitchd/bridge.c
@@ -194,7 +194,6 @@ static void bridge_del_ports(struct bridge *,
  static void bridge_add_ports(struct bridge *,
                               const struct shash *wanted_ports);
  
-static void bridge_configure_flow_miss_model(const char *opt);
  static void bridge_configure_datapath_id(struct bridge *);
  static void bridge_configure_netflow(struct bridge *);
  static void bridge_configure_forward_bpdu(struct bridge *);
@@ -499,9 +498,6 @@ bridge_reconfigure(const struct ovsrec_open_vswitch *ovs_cfg)
          smap_get_int(&ovs_cfg->other_config, "n-handler-threads", 0),
          smap_get_int(&ovs_cfg->other_config, "n-revalidator-threads", 0));
  
-    bridge_configure_flow_miss_model(smap_get(&ovs_cfg->other_config,
-                                              "force-miss-model"));
-
      /* Destroy "struct bridge"s, "struct port"s, and "struct iface"s according
       * to 'ovs_cfg', with only very minimal configuration otherwise.
       *
@@ -878,22 +874,6 @@ port_configure(struct port *port)
      free(s.lacp_slaves);
  }
  
-static void
-bridge_configure_flow_miss_model(const char *opt)
-{
-    enum ofproto_flow_miss_model model = OFPROTO_HANDLE_MISS_AUTO;
-
-    if (opt) {
-        if (!strcmp(opt, "with-facets")) {
-            model = OFPROTO_HANDLE_MISS_WITH_FACETS;
-        } else if (!strcmp(opt, "without-facets")) {
-            model = OFPROTO_HANDLE_MISS_WITHOUT_FACETS;
-        }
-    }
-
-    ofproto_set_flow_miss_model(model);
-}
-
  /* Pick local port hardware address and datapath ID for 'br'. */
  static void
  bridge_configure_datapath_id(struct bridge *br)
@@ -2843,7 +2823,8 @@ bridge_configure_local_iface_netdev(struct bridge *br,
  
      /* If there's no local interface or no IP address, give up. */
      local_iface = iface_from_ofp_port(br, OFPP_LOCAL);
-    if (!local_iface || !c->local_ip || !inet_aton(c->local_ip, &ip)) {
+    if (!local_iface || !c->local_ip
+        || !inet_pton(AF_INET, c->local_ip, &ip)) {
          return;
      }
  
@@ -2853,7 +2834,7 @@ bridge_configure_local_iface_netdev(struct bridge *br,
  
      /* Configure the IP address and netmask. */
      if (!c->local_netmask
-        || !inet_aton(c->local_netmask, &mask)
+        || !inet_pton(AF_INET, c->local_netmask, &mask)
          || !mask.s_addr) {
          mask.s_addr = guess_netmask(ip.s_addr);
      }
@@ -2864,7 +2845,7 @@ bridge_configure_local_iface_netdev(struct bridge *br,
  
      /* Configure the default gateway. */
      if (c->local_gateway
-        && inet_aton(c->local_gateway, &gateway)
+        && inet_pton(AF_INET, c->local_gateway, &gateway)
          && gateway.s_addr) {
          if (!netdev_add_router(netdev, gateway)) {
              VLOG_INFO("bridge %s: configured gateway "IP_FMT,
diff --git a/vswitchd/ovs-vswitchd.8.in b/vswitchd/ovs-vswitchd.8.in

index d2544f7..e9dc483 100644 (file)
--- a/vswitchd/ovs-vswitchd.8.in
+++ b/vswitchd/ovs-vswitchd.8.in
@@ -30,9 +30,6 @@ switching across each bridge described in its configuration files.  As
  the database changes, \fBovs\-vswitchd\fR automatically updates its
  configuration to match.
  .PP
-Upon receipt of a SIGHUP signal, \fBovs\-vswitchd\fR reopens its log
-file, if one was specified on the command line.
-.PP
  \fBovs\-vswitchd\fR switches may be configured with any of the following
  features:
  .
diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c

index 9da2f49..da18a0a 100644 (file)
--- a/vswitchd/ovs-vswitchd.c
+++ b/vswitchd/ovs-vswitchd.c
@@ -37,8 +37,6 @@
  #include "openflow/openflow.h"
  #include "ovsdb-idl.h"
  #include "poll-loop.h"
-#include "process.h"
-#include "signals.h"
  #include "simap.h"
  #include "stream-ssl.h"
  #include "stream.h"
@@ -66,7 +64,6 @@ main(int argc, char *argv[])
  {
      char *unixctl_path = NULL;
      struct unixctl_server *unixctl;
-    struct signal *sighup;
      char *remote;
      bool exiting;
      int retval;
@@ -76,8 +73,6 @@ main(int argc, char *argv[])
      service_start(&argc, &argv);
      remote = parse_options(argc, argv, &unixctl_path);
      signal(SIGPIPE, SIG_IGN);
-    sighup = signal_register(SIGHUP);
-    process_init();
      ovsrec_init();
  
      daemonize_start();
@@ -103,9 +98,6 @@ main(int argc, char *argv[])
  
      exiting = false;
      while (!exiting) {
-        if (signal_poll(sighup)) {
-            vlog_reopen_log_file();
-        }
          memory_run();
          if (memory_should_report()) {
              struct simap usage;
@@ -119,7 +111,6 @@ main(int argc, char *argv[])
          unixctl_server_run(unixctl);
          netdev_run();
  
-        signal_wait(sighup);
          memory_wait();
          bridge_wait();
          unixctl_server_wait(unixctl);
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml

index e915caf..b640a0f 100644 (file)
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -136,28 +136,6 @@
          </p>
        </column>
  
-      <column name="other_config" key="force-miss-model">
-        <p>
-          Specifies userspace behaviour for handling flow misses. This takes
-          precedence over flow-eviction-threshold.
-        </p>
-        <p>
-          <dl>
-            <dt><code>auto</code></dt>
-            <dd>Handle automatically based on the flow-eviction-threshold and
-            the flow setup governer (default, recommended).</dd>
-            <dt><code>with-facets</code></dt>
-            <dd>Always create facets. Expensive kernel flow creation and
-            statistics tracking is always performed, even on flows with only
-            a small number of packets.</dd>
-            <dt><code>without-facets</code></dt>
-            <dd>Always handle without facets. Forces flow misses to be handled
-            in userspace. May cause an increase in CPU usage and packet loss
-            on high throughput.</dd>
-          </dl>
-        </p>
-      </column>
-
        <column name="other_config" key="n-handler-threads"
                type='{"type": "integer", "minInteger": 1}'>
          <p>
author	Giuseppe Lettieri <g.lettieri@iet.unipi.it>
	Tue, 25 Feb 2014 15:42:09 +0000 (16:42 +0100)
committer	Giuseppe Lettieri <g.lettieri@iet.unipi.it>
	Tue, 25 Feb 2014 15:42:09 +0000 (16:42 +0100)
AUTHORS		patch \| blob \| history
FAQ		patch \| blob \| history
INSTALL		patch \| blob \| history
Makefile.am		patch \| blob \| history
NEWS		patch \| blob \| history
OPENFLOW-1.1+		patch \| blob \| history
README-OFTest	[deleted file]	patch \| blob \| history
README-gcov	[deleted file]	patch \| blob \| history
configure.ac		patch \| blob \| history
datapath/actions.c		patch \| blob \| history
datapath/datapath.c		patch \| blob \| history
datapath/flow.c		patch \| blob \| history
datapath/flow.h		patch \| blob \| history
datapath/flow_netlink.c		patch \| blob \| history
datapath/flow_netlink.h		patch \| blob \| history
datapath/flow_table.c		patch \| blob \| history
datapath/flow_table.h		patch \| blob \| history
datapath/linux/compat/include/linux/etherdevice.h		patch \| blob \| history
datapath/linux/compat/include/linux/rcupdate.h		patch \| blob \| history
lib/automake.mk		patch \| blob \| history
lib/bfd.c		patch \| blob \| history
lib/daemon.c		patch \| blob \| history
lib/dpif-netdev.c		patch \| blob \| history
lib/entropy.c		patch \| blob \| history
lib/flow.c		patch \| blob \| history
lib/hmap.h		patch \| blob \| history
lib/jsonrpc.c		patch \| blob \| history
lib/jsonrpc.h		patch \| blob \| history
lib/lockfile.c		patch \| blob \| history
lib/meta-flow.c		patch \| blob \| history
lib/netdev-dummy.c		patch \| blob \| history
lib/netdev.c		patch \| blob \| history
lib/ofp-actions.c		patch \| blob \| history
lib/ofp-util.c		patch \| blob \| history
lib/ovs-thread.c		patch \| blob \| history
lib/ovs-thread.h		patch \| blob \| history
lib/ovsdb-idl.c		patch \| blob \| history
lib/ovsdb-idl.h		patch \| blob \| history
lib/packets.c		patch \| blob \| history
lib/packets.h		patch \| blob \| history
lib/pcap-file.c		patch \| blob \| history
lib/poll-loop.c		patch \| blob \| history
lib/poll-loop.h		patch \| blob \| history
lib/process.c		patch \| blob \| history
lib/reconnect.c		patch \| blob \| history
lib/signals.c		patch \| blob \| history
lib/signals.h		patch \| blob \| history
lib/socket-util.c		patch \| blob \| history
lib/socket-util.h		patch \| blob \| history
lib/stream-ssl.c		patch \| blob \| history
lib/stream.c		patch \| blob \| history
lib/string.h.in		patch \| blob \| history
lib/util.c		patch \| blob \| history
lib/util.h		patch \| blob \| history
ofproto/bond.c		patch \| blob \| history
ofproto/ofproto-dpif-ipfix.c		patch \| blob \| history
ofproto/ofproto-dpif-upcall.c		patch \| blob \| history
ofproto/ofproto-dpif-xlate.c		patch \| blob \| history
ofproto/ofproto-dpif.c		patch \| blob \| history
ofproto/ofproto-provider.h		patch \| blob \| history
ofproto/ofproto.c		patch \| blob \| history
ofproto/ofproto.h		patch \| blob \| history
ofproto/tunnel.c		patch \| blob \| history
ovsdb/ovsdb-server.1.in		patch \| blob \| history
ovsdb/ovsdb-server.c		patch \| blob \| history
tests/automake.mk		patch \| blob \| history
tests/ofproto-dpif.at		patch \| blob \| history
tests/ovsdb-server.at		patch \| blob \| history
tests/run-ryu	[new file with mode: 0755]	patch \| blob
utilities/ovs-vsctl.c		patch \| blob \| history
vswitchd/bridge.c		patch \| blob \| history
vswitchd/ovs-vswitchd.8.in		patch \| blob \| history
vswitchd/ovs-vswitchd.c		patch \| blob \| history
vswitchd/vswitch.xml		patch \| blob \| history