diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index d6121f1f8..33b09c633 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -515,6 +515,7 @@ static void facet_reset_counters(struct facet *);
 static void facet_push_stats(struct facet *);
 static void facet_learn(struct facet *);
 static void facet_account(struct facet *);
+static void push_all_stats(void);
 
 static struct subfacet *facet_get_subfacet(struct facet *);
 
@@ -597,6 +598,7 @@ static void port_run_fast(struct ofport_dpif *);
 static void port_wait(struct ofport_dpif *);
 static int set_cfm(struct ofport *, const struct cfm_settings *);
 static void ofport_clear_priorities(struct ofport_dpif *);
+static void run_fast_rl(void);
 
 struct dpif_completion {
     struct list list_node;
@@ -907,6 +909,7 @@ lookup_ofproto_dpif_by_port_name(const char *name)
 static int
 type_run(const char *type)
 {
+    static long long int push_timer = LLONG_MIN;
     struct dpif_backer *backer;
     char *devname;
     int error;
@@ -920,6 +923,16 @@ type_run(const char *type)
 
     dpif_run(backer->dpif);
 
+    /* The most natural place to push facet statistics is when they're pulled
+     * from the datapath. However, when there are many flows in the datapath,
+     * this expensive operation can occur so frequently that it reduces our
+     * ability to quickly set up flows. To reduce the cost, we push statistics
+     * here instead. */
+    if (time_msec() > push_timer) {
+        push_timer = time_msec() + 2000;
+        push_all_stats();
+    }
+
     if (backer->need_revalidate
         || !tag_set_is_empty(&backer->revalidate_set)) {
         struct tag_set revalidate_set = backer->revalidate_set;
@@ -1007,6 +1020,7 @@ type_run(const char *type)
             if (need_revalidate
                 || tag_set_intersects(&revalidate_set, facet->tags)) {
                 facet_revalidate(facet);
+                run_fast_rl();
             }
         }
     }
@@ -1074,18 +1088,10 @@ type_run(const char *type)
 }
 
 static int
-type_run_fast(const char *type)
+dpif_backer_run_fast(struct dpif_backer *backer, int max_batch)
 {
-    struct dpif_backer *backer;
     unsigned int work;
 
-    backer = shash_find_data(&all_dpif_backers, type);
-    if (!backer) {
-        /* This is not necessarily a problem, since backers are only
-         * created on demand. */
-        return 0;
-    }
-
     /* Handle one or more batches of upcalls, until there's nothing left to do
      * or until we do a fixed total amount of work.
      *
@@ -1096,8 +1102,8 @@ type_run_fast(const char *type)
      * optimizations can make major improvements on some benchmarks and
      * presumably for real traffic as well. */
     work = 0;
-    while (work < FLOW_MISS_MAX_BATCH) {
-        int retval = handle_upcalls(backer, FLOW_MISS_MAX_BATCH - work);
+    while (work < max_batch) {
+        int retval = handle_upcalls(backer, max_batch - work);
         if (retval <= 0) {
             return -retval;
         }
@@ -1107,6 +1113,58 @@ type_run_fast(const char *type)
     return 0;
 }
 
+static int
+type_run_fast(const char *type)
+{
+    struct dpif_backer *backer;
+
+    backer = shash_find_data(&all_dpif_backers, type);
+    if (!backer) {
+        /* This is not necessarily a problem, since backers are only
+         * created on demand. */
+        return 0;
+    }
+
+    return dpif_backer_run_fast(backer, FLOW_MISS_MAX_BATCH);
+}
+
+static void
+run_fast_rl(void)
+{
+    static long long int port_rl = LLONG_MIN;
+    static unsigned int backer_rl = 0;
+
+    if (time_msec() >= port_rl) {
+        struct ofproto_dpif *ofproto;
+        struct ofport_dpif *ofport;
+
+        HMAP_FOR_EACH (ofproto, all_ofproto_dpifs_node, &all_ofproto_dpifs) {
+
+            HMAP_FOR_EACH (ofport, up.hmap_node, &ofproto->up.ports) {
+                port_run_fast(ofport);
+            }
+        }
+        port_rl = time_msec() + 200;
+    }
+
+    /* XXX: We have to be careful not to do too much work in this function. If
+     * we call dpif_backer_run_fast() too often, or with too large a batch,
+     * performance improves significantly, but at a cost. It's possible for
+     * the number of flows in the datapath to increase without bound, and for
+     * poll loops to take tens of seconds. The correct solution to this
+     * problem, long term, is to separate flow miss handling into its own
+     * thread so it isn't affected by revalidations and expirations. Until
+     * then, this is the best we can do. */
+    if (++backer_rl >= 10) {
+        struct shash_node *node;
+
+        backer_rl = 0;
+        SHASH_FOR_EACH (node, &all_dpif_backers) {
+            dpif_backer_run_fast(node->data, 1);
+        }
+    }
+}
+
 static void
 type_wait(const char *type)
 {
@@ -2937,6 +2995,8 @@ mirror_get_stats(struct ofproto *ofproto_, void *aux,
         return 0;
     }
 
+    push_all_stats();
+
     *packets = mirror->packet_count;
     *bytes = mirror->byte_count;
 
@@ -3195,6 +3255,8 @@ port_get_stats(const struct ofport *ofport_, struct netdev_stats *stats)
     struct ofport_dpif *ofport = ofport_dpif_cast(ofport_);
     int error;
 
+    push_all_stats();
+
     error = netdev_get_stats(ofport->up.netdev, stats);
 
     if (!error && ofport_->ofp_port == OFPP_LOCAL) {
@@ -4159,7 +4221,6 @@ update_subfacet_stats(struct subfacet *subfacet,
         facet_account(facet);
         facet->accounted_bytes = facet->byte_count;
     }
-    facet_push_stats(facet);
 }
 
 /* 'key' with length 'key_len' bytes is a flow in 'dpif' that we know nothing
@@ -4203,13 +4264,13 @@ update_stats(struct dpif_backer *backer)
     const struct dpif_flow_stats *stats;
     struct dpif_flow_dump dump;
     const struct nlattr *key;
+    struct ofproto_dpif *ofproto;
     size_t key_len;
 
     dpif_flow_dump_start(&dump, backer->dpif);
     while (dpif_flow_dump_next(&dump, &key, &key_len, NULL, NULL, &stats)) {
         struct flow flow;
         struct subfacet *subfacet;
-        struct ofproto_dpif *ofproto;
         struct ofport_dpif *ofport;
         uint32_t key_hash;
 
@@ -4220,7 +4281,6 @@ update_stats(struct dpif_backer *backer)
 
         ofproto->total_subfacet_count += hmap_count(&ofproto->subfacets);
         ofproto->n_update_stats++;
-        update_moving_averages(ofproto);
 
         ofport = get_ofp_port(ofproto, flow.in_port);
         if (ofport && ofport->tnl_port) {
@@ -4249,8 +4309,14 @@ update_stats(struct dpif_backer *backer)
             delete_unexpected_flow(ofproto, key, key_len);
             break;
         }
+        run_fast_rl();
     }
     dpif_flow_dump_done(&dump);
+
+    HMAP_FOR_EACH (ofproto, all_ofproto_dpifs_node, &all_ofproto_dpifs) {
+        update_moving_averages(ofproto);
+    }
+
 }
 
 /* Calculates and returns the number of milliseconds of idle time after which
@@ -4523,13 +4589,14 @@ facet_learn(struct facet *facet)
     struct ofproto_dpif *ofproto = ofproto_dpif_cast(facet->rule->up.ofproto);
     struct subfacet *subfacet= CONTAINER_OF(list_front(&facet->subfacets),
                                             struct subfacet, list_node);
+    long long int now = time_msec();
     struct action_xlate_ctx ctx;
 
-    if (time_msec() < facet->learn_rl) {
+    if (!facet->has_fin_timeout && now < facet->learn_rl) {
         return;
     }
 
-    facet->learn_rl = time_msec() + 500;
+    facet->learn_rl = now + 500;
 
     if (!facet->has_learn
         && !facet->has_normal
@@ -5037,6 +5104,36 @@ facet_push_stats(struct facet *facet)
     }
 }
 
+static void
+push_all_stats__(bool run_fast)
+{
+    static long long int rl = LLONG_MIN;
+    struct ofproto_dpif *ofproto;
+
+    if (time_msec() < rl) {
+        return;
+    }
+
+    HMAP_FOR_EACH (ofproto, all_ofproto_dpifs_node, &all_ofproto_dpifs) {
+        struct facet *facet;
+
+        HMAP_FOR_EACH (facet, hmap_node, &ofproto->facets) {
+            facet_push_stats(facet);
+            if (run_fast) {
+                run_fast_rl();
+            }
+        }
+    }
+
+    rl = time_msec() + 100;
+}
+
+static void
+push_all_stats(void)
+{
+    push_all_stats__(true);
+}
+
 static void
 rule_credit_stats(struct rule_dpif *rule, const struct dpif_flow_stats *stats)
 {
@@ -5203,6 +5300,7 @@ subfacet_destroy_batch(struct ofproto_dpif *ofproto,
         subfacet_reset_dp_stats(subfacets[i], &stats[i]);
         subfacets[i]->path = SF_NOT_INSTALLED;
         subfacet_destroy(subfacets[i]);
+        run_fast_rl();
     }
 }
 
@@ -5515,13 +5613,14 @@ rule_destruct(struct rule *rule_)
 static void
 rule_get_stats(struct rule *rule_, uint64_t *packets, uint64_t *bytes)
 {
-    struct ofproto_dpif *ofproto = ofproto_dpif_cast(rule_->ofproto);
     struct rule_dpif *rule = rule_dpif_cast(rule_);
     struct facet *facet;
 
-    HMAP_FOR_EACH (facet, hmap_node, &ofproto->facets) {
-        facet_push_stats(facet);
-    }
+    /* push_all_stats() can handle flow misses, which, when using the learn
+     * action, can cause rules to be added and deleted. This can corrupt our
+     * caller's data structures, which assume that rule_get_stats() has no
+     * impact on the flow table. To be safe, we disable miss handling. */
+    push_all_stats__(false);
 
     /* Start from historical data for 'rule' itself that are no longer tracked
      * in facets. This counts, for example, facets that have expired. */
@@ -5822,9 +5921,9 @@ compose_output_action__(struct action_xlate_ctx *ctx, uint16_t ofp_port,
                         bool check_stp)
 {
     const struct ofport_dpif *ofport = get_ofp_port(ctx->ofproto, ofp_port);
-    ovs_be16 flow_vlan_tci = ctx->flow.vlan_tci;
-    ovs_be64 flow_tun_id = ctx->flow.tunnel.tun_id;
-    uint8_t flow_nw_tos = ctx->flow.nw_tos;
+    ovs_be16 flow_vlan_tci;
+    uint32_t flow_skb_mark;
+    uint8_t flow_nw_tos;
     struct priority_to_dscp *pdscp;
     uint32_t out_port, odp_port;
 
@@ -5897,6 +5996,10 @@ compose_output_action__(struct action_xlate_ctx *ctx, uint16_t ofp_port,
         return;
     }
 
+    flow_vlan_tci = ctx->flow.vlan_tci;
+    flow_skb_mark = ctx->flow.skb_mark;
+    flow_nw_tos = ctx->flow.nw_tos;
+
     pdscp = get_priority(ofport, ctx->flow.skb_priority);
     if (pdscp) {
         ctx->flow.nw_tos &= ~IP_DSCP_MASK;
@@ -5904,10 +6007,15 @@ compose_output_action__(struct action_xlate_ctx *ctx, uint16_t ofp_port,
     }
 
     if (ofport->tnl_port) {
+        /* Save tunnel metadata so that changes made due to
+         * the Logical (tunnel) Port are not visible for any further
+         * matches, while explicit set actions on tunnel metadata are.
+         */
+        struct flow_tnl flow_tnl = ctx->flow.tunnel;
         odp_port = tnl_port_send(ofport->tnl_port, &ctx->flow);
         if (odp_port == OVSP_NONE) {
             xlate_report(ctx, "Tunneling decided against output");
-            return;
+            goto out; /* restore flow_nw_tos */
         }
 
         if (ctx->resubmit_stats) {
@@ -5916,6 +6024,7 @@ compose_output_action__(struct action_xlate_ctx *ctx, uint16_t ofp_port,
 
         out_port = odp_port;
         commit_odp_tunnel_action(&ctx->flow, &ctx->base_flow, ctx->odp_actions);
+        ctx->flow.tunnel = flow_tnl; /* Restore tunnel metadata */
     } else {
         odp_port = ofport->odp_port;
         out_port = vsp_realdev_to_vlandev(ctx->ofproto, odp_port,
@@ -5931,8 +6040,11 @@ compose_output_action__(struct action_xlate_ctx *ctx, uint16_t ofp_port,
     ctx->sflow_odp_port = odp_port;
     ctx->sflow_n_outputs++;
     ctx->nf_output_iface = ofp_port;
-    ctx->flow.tunnel.tun_id = flow_tun_id;
+
+    /* Restore flow */
     ctx->flow.vlan_tci = flow_vlan_tci;
+    ctx->flow.skb_mark = flow_skb_mark;
+ out:
     ctx->flow.nw_tos = flow_nw_tos;
 }
 
@@ -6348,11 +6460,6 @@ xlate_set_queue_action(struct action_xlate_ctx *ctx, uint32_t queue_id)
     }
 }
 
-struct xlate_reg_state {
-    ovs_be16 vlan_tci;
-    ovs_be64 tun_id;
-};
-
 static bool
 slave_enabled_cb(uint16_t ofp_port, void *ofproto_)
 {
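
The patch leans on two rate-limiting idioms throughout: a time-based one (push_timer in type_run(), port_rl in run_fast_rl(), rl in push_all_stats__(), learn_rl in facet_learn()) and a call-count one (backer_rl in run_fast_rl()). Below is a minimal standalone sketch of both, assuming only the C standard library; time_msec() here is an illustrative stand-in for the OVS timeval helper of the same name, and the function names and intervals mirror the patch rather than any real OVS API.

#define _POSIX_C_SOURCE 199309L
#include <limits.h>
#include <stdio.h>
#include <time.h>

/* Stand-in for OVS's time_msec(): a monotonic millisecond clock. */
static long long int
time_msec(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (long long int) ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
}

/* Time-based limiter, as with push_timer: the work runs at most once every
 * 2000 ms however often the function is called.  Starting the timer at
 * LLONG_MIN guarantees that the first call always fires. */
static void
push_stats_rate_limited(void)
{
    static long long int timer = LLONG_MIN;

    if (time_msec() > timer) {
        timer = time_msec() + 2000;
        puts("pushing all facet stats");
    }
}

/* Count-based limiter, as with backer_rl: the work runs on every 10th call,
 * bounding the overhead added to a hot loop no matter how fast it spins. */
static void
handle_misses_rate_limited(void)
{
    static unsigned int calls;

    if (++calls >= 10) {
        calls = 0;
        puts("handling one small batch of upcalls");
    }
}

int
main(void)
{
    for (int i = 0; i < 25; i++) {
        push_stats_rate_limited();
        handle_misses_rate_limited();
    }
    return 0;
}

The two idioms compose naturally, which is exactly what run_fast_rl() does: a cheap time check for the per-port work and a call counter for the upcall batches.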
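
The compose_output_action__() hunks all serve one discipline: ctx->flow is scratch space, so fields an output port may rewrite (vlan_tci, skb_mark, nw_tos, and now the whole tunnel header) are saved up front and restored on every exit path. The early "goto out" after a tunnel refusal restores only nw_tos because that is the only saved field modified by that point, via the per-port DSCP rewrite. A toy sketch of the pattern under illustrative names (toy_flow and its fields are not the real struct flow):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_flow {
    uint16_t vlan_tci;
    uint8_t nw_tos;
};

/* Rewrites flow fields on behalf of the output port, then puts them back so
 * the caller's view of the flow is unchanged on every exit path. */
static void
output_action(struct toy_flow *flow, bool tunnel_refuses)
{
    uint16_t saved_vlan_tci = flow->vlan_tci;
    uint8_t saved_nw_tos = flow->nw_tos;

    flow->nw_tos = 0x2e << 2;       /* per-port DSCP rewrite happens first */
    if (tunnel_refuses) {
        goto out;                   /* only nw_tos was modified so far */
    }

    flow->vlan_tci = 0;             /* output-port VLAN rewrite */
    printf("output: tci=%u tos=%u\n", flow->vlan_tci, flow->nw_tos);

    flow->vlan_tci = saved_vlan_tci;
out:
    flow->nw_tos = saved_nw_tos;
}

int
main(void)
{
    struct toy_flow flow = { .vlan_tci = 99, .nw_tos = 0 };

    output_action(&flow, true);
    output_action(&flow, false);
    printf("caller still sees: tci=%u tos=%u\n", flow.vlan_tci, flow.nw_tos);
    return 0;
}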
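
The rule_get_stats() hunk also illustrates why the patch splits push_all_stats() into a public wrapper and a trailing-underscore variant: the flag disables a side effect (flow miss handling) that most callers want but that can add and delete rules while this particular caller is walking rule data. A small sketch of that internal-variant convention, with hypothetical behavior standing in for the real stats push:

#include <stdbool.h>
#include <stdio.h>

/* Internal variant: 'run_fast' gates a side effect that is unsafe while a
 * caller is iterating over structures the side effect can modify. */
static void
push_all_stats__(bool run_fast)
{
    puts("pushing facet stats");
    if (run_fast) {
        puts("...and handling flow misses along the way");
    }
}

/* Public entry point keeps the common case a one-liner. */
static void
push_all_stats(void)
{
    push_all_stats__(true);
}

int
main(void)
{
    push_all_stats();        /* normal callers */
    push_all_stats__(false); /* e.g. rule_get_stats(), mid-iteration */
    return 0;
}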