From: Ben Pfaff Date: Thu, 19 Sep 2013 18:03:47 +0000 (-0700) Subject: ofproto-dpif-upcall: Forward packets in order of arrival. X-Git-Tag: sliver-openvswitch-2.0.90-1~13^2~21 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=04a19fb8f4b8ba19a9805906aac7b30b65b57206;p=sliver-openvswitch.git ofproto-dpif-upcall: Forward packets in order of arrival. Until now, the code in ofproto-dpif-upcall (and the code that preceded it in ofproto-dpif) obtained a batch of incoming packets, inserted them into a hash table based on hashes of their flows, processed them, and then forwarded them in hash order. Usually this maintains order within a single network connection, but because OVS's notion of a flow is so fine-grained, it can reorder packets within (e.g.) a TCP connection if two packets handled in a single batch have (e.g.) different ECN values. This commit fixes the problem by making ofproto-dpif-upcall always forward packets in the same order they were received. This is far from the minimal change necessary to avoid reordering packets, but I think that the code is easier to understand afterward. 
Reported-by: Dmitry Fleytman Signed-off-by: Ben Pfaff Acked-by: Jarno Rajahalme --- diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index b9d91b291..d75c61b61 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -381,11 +381,11 @@ udpif_dispatcher(void *arg) static void * udpif_miss_handler(void *arg) { - struct list misses = LIST_INITIALIZER(&misses); struct handler *handler = arg; set_subprogram_name("miss_handler"); for (;;) { + struct list misses = LIST_INITIALIZER(&misses); size_t i; ovs_mutex_lock(&handler->mutex); @@ -416,12 +416,6 @@ udpif_miss_handler(void *arg) static void miss_destroy(struct flow_miss *miss) { - struct upcall *upcall, *next; - - LIST_FOR_EACH_SAFE (upcall, next, list_node, &miss->upcalls) { - list_remove(&upcall->list_node); - upcall_destroy(upcall); - } xlate_out_uninit(&miss->xout); } @@ -598,105 +592,6 @@ flow_miss_find(struct hmap *todo, const struct ofproto_dpif *ofproto, return NULL; } -/* Executes flow miss 'miss'. May add any required datapath operations - * to 'ops', incrementing '*n_ops' for each new op. 
*/ -static void -execute_flow_miss(struct flow_miss *miss, struct dpif_op *ops, size_t *n_ops) -{ - struct ofproto_dpif *ofproto = miss->ofproto; - struct upcall *upcall; - struct flow_wildcards wc; - struct rule_dpif *rule; - struct xlate_in xin; - - memset(&miss->stats, 0, sizeof miss->stats); - miss->stats.used = time_msec(); - LIST_FOR_EACH (upcall, list_node, &miss->upcalls) { - struct ofpbuf *packet = upcall->dpif_upcall.packet; - - miss->stats.tcp_flags |= packet_get_tcp_flags(packet, &miss->flow); - miss->stats.n_bytes += packet->size; - miss->stats.n_packets++; - } - - flow_wildcards_init_catchall(&wc); - rule_dpif_lookup(ofproto, &miss->flow, &wc, &rule); - rule_dpif_credit_stats(rule, &miss->stats); - xlate_in_init(&xin, ofproto, &miss->flow, rule, miss->stats.tcp_flags, - NULL); - xin.may_learn = true; - xin.resubmit_stats = &miss->stats; - xlate_actions(&xin, &miss->xout); - flow_wildcards_or(&miss->xout.wc, &miss->xout.wc, &wc); - - if (rule_dpif_fail_open(rule)) { - LIST_FOR_EACH (upcall, list_node, &miss->upcalls) { - struct ofpbuf *packet = upcall->dpif_upcall.packet; - struct ofputil_packet_in *pin; - - /* Extra-special case for fail-open mode. - * - * We are in fail-open mode and the packet matched the fail-open - * rule, but we are connected to a controller too. We should send - * the packet up to the controller in the hope that it will try to - * set up a flow and thereby allow us to exit fail-open. - * - * See the top-level comment in fail-open.c for more information. */ - pin = xmalloc(sizeof(*pin)); - pin->packet = xmemdup(packet->data, packet->size); - pin->packet_len = packet->size; - pin->reason = OFPR_NO_MATCH; - pin->controller_id = 0; - pin->table_id = 0; - pin->cookie = 0; - pin->send_len = 0; /* Not used for flow table misses. 
*/ - flow_get_metadata(&miss->flow, &pin->fmd); - ofproto_dpif_send_packet_in(ofproto, pin); - } - } - - if (miss->xout.slow) { - LIST_FOR_EACH (upcall, list_node, &miss->upcalls) { - struct ofpbuf *packet = upcall->dpif_upcall.packet; - struct xlate_in xin; - - xlate_in_init(&xin, miss->ofproto, &miss->flow, rule, 0, packet); - xlate_actions_for_side_effects(&xin); - } - } - rule_dpif_unref(rule); - - if (miss->xout.odp_actions.size) { - LIST_FOR_EACH (upcall, list_node, &miss->upcalls) { - struct ofpbuf *packet = upcall->dpif_upcall.packet; - struct dpif_op *op = &ops[*n_ops]; - struct dpif_execute *execute = &op->u.execute; - - if (miss->flow.in_port.ofp_port - != vsp_realdev_to_vlandev(miss->ofproto, - miss->flow.in_port.ofp_port, - miss->flow.vlan_tci)) { - /* This packet was received on a VLAN splinter port. We - * added a VLAN to the packet to make the packet resemble - * the flow, but the actions were composed assuming that - * the packet contained no VLAN. So, we must remove the - * VLAN header from the packet before trying to execute the - * actions. */ - eth_pop_vlan(packet); - } - - op->type = DPIF_OP_EXECUTE; - execute->key = miss->key; - execute->key_len = miss->key_len; - execute->packet = packet; - execute->actions = miss->xout.odp_actions.data; - execute->actions_len = miss->xout.odp_actions.size; - - (*n_ops)++; - } - } -} - static void handle_miss_upcalls(struct udpif *udpif, struct list *upcalls) { @@ -707,92 +602,185 @@ handle_miss_upcalls(struct udpif *udpif, struct list *upcalls) size_t n_upcalls, n_ops, i; struct flow_miss *miss; unsigned int reval_seq; + bool fail_open; - /* Construct the to-do list. + /* Extract the flow from each upcall. Construct in fmb->misses a hash + * table that maps each unique flow to a 'struct flow_miss'. 
+ * + * Most commonly there is a single packet per flow_miss, but there are + * several reasons why there might be more than one, e.g.: + * + * - The dpif packet interface does not support TSO (or UFO, etc.), so a + * large packet sent to userspace is split into a sequence of smaller + * ones. * - * This just amounts to extracting the flow from each packet and sticking - * the packets that have the same flow in the same "flow_miss" structure so - * that we can process them together. */ + * - A stream of quickly arriving packets in an established "slow-pathed" + * flow. + * + * - Rarely, a stream of quickly arriving packets in a flow not yet + * established. (This is rare because most protocols do not send + * multiple back-to-back packets before receiving a reply from the + * other end of the connection, which gives OVS a chance to set up a + * datapath flow.) + */ fmb = xmalloc(sizeof *fmb); atomic_read(&udpif->reval_seq, &fmb->reval_seq); hmap_init(&fmb->misses); n_upcalls = 0; LIST_FOR_EACH_SAFE (upcall, next, list_node, upcalls) { struct dpif_upcall *dupcall = &upcall->dpif_upcall; + struct ofpbuf *packet = dupcall->packet; struct flow_miss *miss = &fmb->miss_buf[n_upcalls]; struct flow_miss *existing_miss; struct ofproto_dpif *ofproto; odp_port_t odp_in_port; struct flow flow; - uint32_t hash; int error; - error = xlate_receive(udpif->backer, dupcall->packet, dupcall->key, + error = xlate_receive(udpif->backer, packet, dupcall->key, dupcall->key_len, &flow, &miss->key_fitness, &ofproto, &odp_in_port); - if (error == ENODEV) { - struct drop_key *drop_key; - - /* Received packet on datapath port for which we couldn't - * associate an ofproto. This can happen if a port is removed - * while traffic is being received. Print a rate-limited message - * in case it happens frequently. Install a drop flow so - * that future packets of the flow are inexpensively dropped - * in the kernel. 
*/ - VLOG_INFO_RL(&rl, "received packet on unassociated datapath port " - "%"PRIu32, odp_in_port); - - drop_key = xmalloc(sizeof *drop_key); - drop_key->key = xmemdup(dupcall->key, dupcall->key_len); - drop_key->key_len = dupcall->key_len; - - if (guarded_list_push_back(&udpif->drop_keys, &drop_key->list_node, - MAX_QUEUE_LENGTH)) { - seq_change(udpif->wait_seq); + if (!error) { + uint32_t hash; + + flow_extract(packet, flow.skb_priority, flow.pkt_mark, + &flow.tunnel, &flow.in_port, &miss->flow); + + hash = flow_hash(&miss->flow, 0); + existing_miss = flow_miss_find(&fmb->misses, ofproto, &miss->flow, + hash); + if (!existing_miss) { + hmap_insert(&fmb->misses, &miss->hmap_node, hash); + miss->ofproto = ofproto; + miss->key = dupcall->key; + miss->key_len = dupcall->key_len; + miss->upcall_type = dupcall->type; + miss->stats.n_packets = 0; + miss->stats.n_bytes = 0; + miss->stats.used = time_msec(); + miss->stats.tcp_flags = 0; + + n_upcalls++; } else { - COVERAGE_INC(drop_queue_overflow); - drop_key_destroy(drop_key); + miss = existing_miss; } - continue; - } else if (error) { - continue; - } + miss->stats.tcp_flags |= packet_get_tcp_flags(packet, &miss->flow); + miss->stats.n_bytes += packet->size; + miss->stats.n_packets++; - flow_extract(dupcall->packet, flow.skb_priority, flow.pkt_mark, - &flow.tunnel, &flow.in_port, &miss->flow); - - /* Add other packets to a to-do list. */ - hash = flow_hash(&miss->flow, 0); - existing_miss = flow_miss_find(&fmb->misses, ofproto, &miss->flow, hash); - if (!existing_miss) { - hmap_insert(&fmb->misses, &miss->hmap_node, hash); - miss->ofproto = ofproto; - miss->key = dupcall->key; - miss->key_len = dupcall->key_len; - miss->upcall_type = dupcall->type; - list_init(&miss->upcalls); - - n_upcalls++; + upcall->flow_miss = miss; } else { - miss = existing_miss; + if (error == ENODEV) { + struct drop_key *drop_key; + + /* Received packet on datapath port for which we couldn't + * associate an ofproto. 
This can happen if a port is removed + * while traffic is being received. Print a rate-limited + * message in case it happens frequently. Install a drop flow + * so that future packets of the flow are inexpensively dropped + * in the kernel. */ + VLOG_INFO_RL(&rl, "received packet on unassociated datapath " + "port %"PRIu32, odp_in_port); + + drop_key = xmalloc(sizeof *drop_key); + drop_key->key = xmemdup(dupcall->key, dupcall->key_len); + drop_key->key_len = dupcall->key_len; + + if (guarded_list_push_back(&udpif->drop_keys, + &drop_key->list_node, + MAX_QUEUE_LENGTH)) { + seq_change(udpif->wait_seq); + } else { + COVERAGE_INC(drop_queue_overflow); + drop_key_destroy(drop_key); + } + } + list_remove(&upcall->list_node); + upcall_destroy(upcall); } - list_remove(&upcall->list_node); - list_push_back(&miss->upcalls, &upcall->list_node); } - LIST_FOR_EACH_SAFE (upcall, next, list_node, upcalls) { - list_remove(&upcall->list_node); - upcall_destroy(upcall); - } - - /* Process each element in the to-do list, constructing the set of - * operations to batch. */ - n_ops = 0; + /* Initialize each 'struct flow_miss's ->xout. + * + * We do this per-flow_miss rather than per-packet because, most commonly, + * all the packets in a flow can use the same translation. + * + * We can't do this in the previous loop because we need the TCP flags for + * all the packets in each miss. 
*/ + fail_open = false; HMAP_FOR_EACH (miss, hmap_node, &fmb->misses) { - execute_flow_miss(miss, ops, &n_ops); + struct flow_wildcards wc; + struct rule_dpif *rule; + struct xlate_in xin; + + flow_wildcards_init_catchall(&wc); + rule_dpif_lookup(miss->ofproto, &miss->flow, &wc, &rule); + if (rule_dpif_fail_open(rule)) { + fail_open = true; + } + rule_dpif_credit_stats(rule, &miss->stats); + xlate_in_init(&xin, miss->ofproto, &miss->flow, rule, + miss->stats.tcp_flags, NULL); + xin.may_learn = true; + xin.resubmit_stats = &miss->stats; + xlate_actions(&xin, &miss->xout); + flow_wildcards_or(&miss->xout.wc, &miss->xout.wc, &wc); + rule_dpif_unref(rule); + } + + /* Now handle the packets individually in order of arrival. In the common + * case each packet of a miss can share the same actions, but slow-pathed + * packets need to be translated individually: + * + * - For SLOW_CFM, SLOW_LACP, SLOW_STP, and SLOW_BFD, translation is what + * processes received packets for these protocols. + * + * - For SLOW_CONTROLLER, translation sends the packet to the OpenFlow + * controller. + * + * The loop fills 'ops' with an array of operations to execute in the + * datapath. */ + n_ops = 0; + LIST_FOR_EACH (upcall, list_node, upcalls) { + struct flow_miss *miss = upcall->flow_miss; + struct ofpbuf *packet = upcall->dpif_upcall.packet; + + if (miss->xout.slow) { + struct rule_dpif *rule; + struct xlate_in xin; + + rule_dpif_lookup(miss->ofproto, &miss->flow, NULL, &rule); + xlate_in_init(&xin, miss->ofproto, &miss->flow, rule, 0, packet); + xlate_actions_for_side_effects(&xin); + rule_dpif_unref(rule); + } + + if (miss->xout.odp_actions.size) { + struct dpif_op *op; + + if (miss->flow.in_port.ofp_port + != vsp_realdev_to_vlandev(miss->ofproto, + miss->flow.in_port.ofp_port, + miss->flow.vlan_tci)) { + /* This packet was received on a VLAN splinter port. 
We + * added a VLAN to the packet to make the packet resemble + * the flow, but the actions were composed assuming that + * the packet contained no VLAN. So, we must remove the + * VLAN header from the packet before trying to execute the + * actions. */ + eth_pop_vlan(packet); + } + + op = &ops[n_ops++]; + op->type = DPIF_OP_EXECUTE; + op->u.execute.key = miss->key; + op->u.execute.key_len = miss->key_len; + op->u.execute.packet = packet; + op->u.execute.actions = miss->xout.odp_actions.data; + op->u.execute.actions_len = miss->xout.odp_actions.size; + } } - ovs_assert(n_ops <= ARRAY_SIZE(ops)); /* Execute batch. */ for (i = 0; i < n_ops; i++) { @@ -800,6 +788,32 @@ handle_miss_upcalls(struct udpif *udpif, struct list *upcalls) } dpif_operate(udpif->dpif, opsp, n_ops); + /* Special case for fail-open mode. + * + * If we are in fail-open mode, but we are connected to a controller too, + * then we should send the packet up to the controller in the hope that it + * will try to set up a flow and thereby allow us to exit fail-open. + * + * See the top-level comment in fail-open.c for more information. */ + if (fail_open) { + LIST_FOR_EACH (upcall, list_node, upcalls) { + struct flow_miss *miss = upcall->flow_miss; + struct ofpbuf *packet = upcall->dpif_upcall.packet; + struct ofputil_packet_in *pin; + + pin = xmalloc(sizeof *pin); + pin->packet = xmemdup(packet->data, packet->size); + pin->packet_len = packet->size; + pin->reason = OFPR_NO_MATCH; + pin->controller_id = 0; + pin->table_id = 0; + pin->cookie = 0; + pin->send_len = 0; /* Not used for flow table misses. 
*/ + flow_get_metadata(&miss->flow, &pin->fmd); + ofproto_dpif_send_packet_in(miss->ofproto, pin); + } + } + atomic_read(&udpif->reval_seq, &reval_seq); if (reval_seq != fmb->reval_seq) { COVERAGE_INC(fmb_queue_revalidated); diff --git a/ofproto/ofproto-dpif-upcall.h b/ofproto/ofproto-dpif-upcall.h index a23f7a0e2..9bd19ad52 100644 --- a/ofproto/ofproto-dpif-upcall.h +++ b/ofproto/ofproto-dpif-upcall.h @@ -59,6 +59,7 @@ enum upcall_type { /* An upcall. */ struct upcall { struct list list_node; /* For queuing upcalls. */ + struct flow_miss *flow_miss; /* This upcall's flow_miss. */ enum upcall_type type; /* Classification. */ @@ -94,8 +95,6 @@ struct flow_miss { struct dpif_flow_stats stats; struct xlate_out xout; - - struct list upcalls; /* Contains "struct upcall"s. */ }; struct flow_miss_batch {