-/* Upcall handling. */
-
-struct flow_miss_op { /* One datapath operation queued while handling flow
- * misses.  handle_flow_misses() keeps an array of these on its stack and
- * passes a pointer to each 'dpif_op' member to dpif_operate(). */
- struct dpif_op dpif_op; /* The operation itself, e.g. DPIF_OP_FLOW_PUT. */
-
- uint64_t slow_stub[128 / 8]; /* Scratch buffer handed to compose_slow_path()
- * when the subfacet is installed on the
- * slow path. */
- struct xlate_out xout; /* NOTE(review): not touched by the flow-put path
- * visible in this section -- confirm users. */
- bool xout_garbage; /* 'xout' needs to be uninitialized?  Set to false for
- * the flow-put ops built in this section. */
-
- struct ofpbuf mask; /* Flow mask for "put" ops; left empty unless
- * enable_megaflows is set. */
- struct odputil_keybuf maskbuf; /* Backing storage that 'mask' is set up to
- * use via ofpbuf_use_stack(). */
-
- /* If this is a "put" op, then a pointer to the subfacet that should
- * be marked as uninstalled (path reset to SF_NOT_INSTALLED) if the
- * operation fails. */
- struct subfacet *subfacet;
-};
-
-/* Decides whether the flow described by 'miss' deserves a facet in userspace
- * (and, usually, a datapath flow).  Returns true if so.
- *
- * The global 'flow_miss_model' can force the answer either way.  In automatic
- * mode the answer is always "yes" while the backer has no governor and holds
- * at most half of 'flow_eviction_threshold' subfacets.  Once that limit is
- * exceeded a governor is created (and kept on the backer), and from then on
- * the governor decides, based on a hash of the flow under the wildcards in
- * 'miss->xout.wc' and on the number of packets in 'miss'.  The point of the
- * heuristic is that bookkeeping for short flows costs more than it saves when
- * the datapath already holds many flows. */
-static bool
-flow_miss_should_make_facet(struct flow_miss *miss)
-{
-    struct dpif_backer *backer = miss->ofproto->backer;
-    uint32_t wc_hash;
-
-    /* Explicit configuration overrides the heuristic. */
-    switch (flow_miss_model) {
-    case OFPROTO_HANDLE_MISS_WITH_FACETS:
-        return true;
-    case OFPROTO_HANDLE_MISS_WITHOUT_FACETS:
-        return false;
-    case OFPROTO_HANDLE_MISS_AUTO:
-        break;
-    }
-
-    if (!backer->governor) {
-        /* Few enough subfacets: no governor is needed yet. */
-        if (hmap_count(&backer->subfacets) * 2 <= flow_eviction_threshold) {
-            return true;
-        }
-
-        backer->governor = governor_create();
-    }
-
-    wc_hash = flow_hash_in_wildcards(&miss->flow, &miss->xout.wc, 0);
-    return governor_should_install_flow(backer->governor, wc_hash,
-                                        miss->stats.n_packets);
-}
-
-/* Processes 'miss', whose flow matches 'facet'.  If a datapath flow has to be
- * installed, appends one "flow put" operation to 'ops' and increments
- * '*n_ops'.
- *
- * Every packet in 'miss' is treated as having arrived at time
- * 'miss->stats.used'.  This matters mostly for new facets: taking a fresh
- * timestamp here could make the new subfacet (or its packets) appear to have
- * been used slightly later than the facet itself, so that a one-packet flow
- * would seem to have a nonzero duration, which looks odd in e.g. NetFlow
- * statistics. */
-static void
-handle_flow_miss_with_facet(struct flow_miss *miss, struct facet *facet,
-                            struct flow_miss_op *ops, size_t *n_ops)
-{
-    struct subfacet *sf = NULL;
-    struct dpif_flow_put *put;
-    struct flow_miss_op *op;
-    enum subfacet_path path;
-    uint32_t hash;
-
-    /* Credit the missed traffic to the facet. */
-    facet->packet_count += miss->stats.n_packets;
-    facet->prev_packet_count += miss->stats.n_packets;
-    facet->byte_count += miss->stats.n_bytes;
-    facet->prev_byte_count += miss->stats.n_bytes;
-
-    /* Find the subfacet for this datapath key, if the facet has any, and
-     * refresh its used time. */
-    hash = odp_flow_key_hash(miss->key, miss->key_len);
-    if (!list_is_empty(&facet->subfacets)) {
-        sf = subfacet_find(miss->ofproto->backer, miss->key, miss->key_len,
-                           hash);
-        if (sf && sf->facet != facet) {
-            /* This shouldn't happen. */
-            VLOG_ERR_RL(&rl, "subfacet with wrong facet");
-            subfacet_destroy(sf);
-            sf = NULL;
-        } else if (sf) {
-            sf->used = MAX(sf->used, miss->stats.used);
-        }
-    }
-
-    /* A "userspace" action on an already installed, wildcarded datapath flow
-     * can send us packets that map to a different subfacet.  Installing that
-     * subfacet would be rejected by the datapath as overlapping, so stop
-     * here. */
-    if (miss->upcall_type == DPIF_UC_ACTION
-        && !list_is_empty(&facet->subfacets)) {
-        return;
-    }
-
-    if (!sf) {
-        sf = subfacet_create(facet, miss, hash);
-    }
-
-    /* Nothing to install if the subfacet is already on the desired path. */
-    path = facet->xout.slow ? SF_SLOW_PATH : SF_FAST_PATH;
-    if (sf->path == path) {
-        return;
-    }
-    sf->path = path;
-
-    /* Claim the next slot in 'ops' and fill in a flow put. */
-    op = &ops[*n_ops];
-    *n_ops += 1;
-    put = &op->dpif_op.u.flow_put;
-
-    ofpbuf_use_stack(&op->mask, &op->maskbuf, sizeof op->maskbuf);
-    if (enable_megaflows) {
-        odp_flow_key_from_mask(&op->mask, &facet->xout.wc.masks,
-                               &miss->flow, UINT32_MAX);
-    }
-
-    op->subfacet = sf;
-    op->xout_garbage = false;
-    op->dpif_op.type = DPIF_OP_FLOW_PUT;
-
-    put->flags = DPIF_FP_CREATE;
-    put->key = miss->key;
-    put->key_len = miss->key_len;
-    put->mask = op->mask.data;
-    put->mask_len = op->mask.size;
-    put->stats = NULL;
-
-    if (path == SF_SLOW_PATH) {
-        compose_slow_path(facet->ofproto, &miss->flow, facet->xout.slow,
-                          op->slow_stub, sizeof op->slow_stub,
-                          &put->actions, &put->actions_len);
-    } else {
-        put->actions = facet->xout.odp_actions.data;
-        put->actions_len = facet->xout.odp_actions.size;
-    }
-}
-
-/* Processes flow miss 'miss'.  Any datapath operation this requires is
- * appended to 'ops', with '*n_ops' incremented for each one. */
-static void
-handle_flow_miss(struct flow_miss *miss, struct flow_miss_op *ops,
-                 size_t *n_ops)
-{
-    struct facet *facet;
-
-    miss->ofproto->n_missed += miss->stats.n_packets;
-
-    facet = facet_lookup_valid(miss->ofproto, &miss->flow);
-    if (!facet) {
-        /* Datapath flow keys with fitness ODP_FIT_TOO_LITTLE do not map
-         * one-to-one onto 'struct flow', which the facet and subfacet code
-         * assumes throughout.  Such misses have to be handled in userspace
-         * anyway, so no facet is created for them. */
-        if (miss->key_fitness == ODP_FIT_TOO_LITTLE) {
-            return;
-        }
-
-        if (!flow_miss_should_make_facet(miss)) {
-            return;
-        }
-
-        facet = facet_create(miss);
-    }
-
-    handle_flow_miss_with_facet(miss, facet, ops, n_ops);
-}
-
-/* Searches 'backer''s drop keys for one whose key is byte-for-byte equal to
- * the 'key_len' bytes at 'key'.  Returns the match, or NULL if there is
- * none. */
-static struct drop_key *
-drop_key_lookup(const struct dpif_backer *backer, const struct nlattr *key,
-                size_t key_len)
-{
-    uint32_t hash = hash_bytes(key, key_len, 0);
-    struct drop_key *candidate;
-
-    HMAP_FOR_EACH_WITH_HASH (candidate, hmap_node, hash, &backer->drop_keys) {
-        if (candidate->key_len == key_len
-            && memcmp(candidate->key, key, key_len) == 0) {
-            return candidate;
-        }
-    }
-
-    return NULL;
-}
-
-/* Empties 'backer''s set of drop keys: deletes the datapath flow for each key
- * (logging a rate-limited warning if that fails), removes the key from the
- * map and destroys it, then calls udpif_drop_key_clear() on the backer's
- * udpif. */
-static void
-drop_key_clear(struct dpif_backer *backer)
-{
-    static struct vlog_rate_limit del_rl = VLOG_RATE_LIMIT_INIT(1, 15);
-    struct drop_key *dk, *next_dk;
-
-    HMAP_FOR_EACH_SAFE (dk, next_dk, hmap_node, &backer->drop_keys) {
-        int err = dpif_flow_del(backer->dpif, dk->key, dk->key_len, NULL);
-
-        if (err && !VLOG_DROP_WARN(&del_rl)) {
-            struct ds key_str;
-
-            ds_init(&key_str);
-            odp_flow_key_format(dk->key, dk->key_len, &key_str);
-            VLOG_WARN("Failed to delete drop key (%s) (%s)",
-                      ovs_strerror(err), ds_cstr(&key_str));
-            ds_destroy(&key_str);
-        }
-
-        /* The key is dropped from our map whether or not the datapath
-         * deletion succeeded. */
-        hmap_remove(&backer->drop_keys, &dk->hmap_node);
-        drop_key_destroy(dk);
-    }
-
-    udpif_drop_key_clear(backer->udpif);
-}
-
-/* Handles every miss in 'fmb': collects the datapath operations they require
- * into one batch, executes the batch on 'backer''s datapath, and marks as
- * not installed each subfacet whose "flow put" failed. */
-static void
-handle_flow_misses(struct dpif_backer *backer, struct flow_miss_batch *fmb)
-{
-    struct flow_miss_op flow_miss_ops[FLOW_MISS_MAX_BATCH];
-    struct dpif_op *dpif_ops[FLOW_MISS_MAX_BATCH];
-    struct flow_miss *miss;
-    size_t n_ops = 0;
-    size_t i;
-
-    /* Build the batch, one miss at a time. */
-    HMAP_FOR_EACH (miss, hmap_node, &fmb->misses) {
-        handle_flow_miss(miss, flow_miss_ops, &n_ops);
-    }
-    /* NOTE(review): this check runs only after every op has been written, so
-     * it detects an overflow of 'flow_miss_ops' but cannot prevent one.  It
-     * presumably relies on a batch never holding more than
-     * FLOW_MISS_MAX_BATCH misses -- confirm against the batch producer. */
-    ovs_assert(n_ops <= ARRAY_SIZE(flow_miss_ops));
-
-    /* Execute the batch. */
-    for (i = 0; i < n_ops; i++) {
-        dpif_ops[i] = &flow_miss_ops[i].dpif_op;
-    }
-    dpif_operate(backer->dpif, dpif_ops, n_ops);
-
-    /* Undo the bookkeeping for flow puts that the datapath rejected. */
-    for (i = 0; i < n_ops; i++) {
-        struct flow_miss_op *op = &flow_miss_ops[i];
-        struct subfacet *sf;
-
-        if (dpif_ops[i]->error == 0
-            || op->dpif_op.type != DPIF_OP_FLOW_PUT
-            || !op->subfacet) {
-            continue;
-        }
-        sf = op->subfacet;
-
-        COVERAGE_INC(subfacet_install_fail);
-
-        /* A subfacet that failed to install should have no datapath hits:
-         * if the datapath flow already existed we should not have been
-         * creating a new subfacet for it.  A buggy datapath could still
-         * cause this, so log an error and reset the counters. */
-        if (sf->dp_packet_count || sf->dp_byte_count) {
-            VLOG_ERR_RL(&rl, "failed to install subfacet for which "
-                        "datapath reported hits");
-            sf->dp_byte_count = 0;
-            sf->dp_packet_count = 0;
-        }
-
-        sf->path = SF_NOT_INSTALLED;
-    }
-}
-
-/* Drains pending work from 'backer''s udpif: first up to FLOW_MISS_MAX_BATCH
- * drop keys, then at most one batch of flow misses.
- *
- * A drop key not yet known to the backer is recorded in 'backer->drop_keys'
- * and a datapath flow with no actions is put for it; a duplicate is simply
- * destroyed. */
-static void
-handle_upcalls(struct dpif_backer *backer)
-{
-    struct flow_miss_batch *fmb;
-    int budget;
-
-    for (budget = FLOW_MISS_MAX_BATCH; budget > 0; budget--) {
-        struct drop_key *dk = drop_key_next(backer->udpif);
-
-        if (!dk) {
-            break;
-        }
-
-        if (drop_key_lookup(backer, dk->key, dk->key_len)) {
-            /* Already tracked. */
-            drop_key_destroy(dk);
-            continue;
-        }
-
-        hmap_insert(&backer->drop_keys, &dk->hmap_node,
-                    hash_bytes(dk->key, dk->key_len, 0));
-        dpif_flow_put(backer->dpif, DPIF_FP_CREATE | DPIF_FP_MODIFY,
-                      dk->key, dk->key_len, NULL, 0, NULL, 0, NULL);
-    }
-
-    fmb = flow_miss_batch_next(backer->udpif);
-    if (!fmb) {
-        return;
-    }
-
-    handle_flow_misses(backer, fmb);
-    flow_miss_batch_destroy(fmb);
-}
-\f
-/* Flow expiration.
- *
- * Forward declarations of the helpers that expire() calls.  update_stats(),
- * subfacet_max_idle() and expire_subfacets() are defined after expire() in
- * this section; rule_expire() is defined outside the code visible here and,
- * per its annotation, must be called with 'ofproto_mutex' held. */
-
-static int subfacet_max_idle(const struct dpif_backer *);
-static void update_stats(struct dpif_backer *);
-static void rule_expire(struct rule_dpif *) OVS_REQUIRES(ofproto_mutex);
-static void expire_subfacets(struct dpif_backer *, int dp_max_idle);
-
-/* Periodic expiration pass for 'backer'.  Refreshes statistics for the flows
- * installed in the datapath (most importantly their last-used times), then
- * uses them to expire subfacets that have been idle too long and OpenFlow
- * rules whose timeouts have passed.  Also clears the drop keys, updates the
- * backer's running subfacet averages, and rebalances bonds.
- *
- * Returns the number of milliseconds after which it should be called again,
- * which is never more than 1000. */
-static int
-expire(struct dpif_backer *backer)
-{
-    struct ofproto_dpif *ofproto;
-    size_t n_subfacets;
-    int max_idle;
-
-    /* Clearing the drop keys on every pass keeps their number small. */
-    drop_key_clear(backer);
-
-    /* Pull fresh statistics for every flow in the backer. */
-    update_stats(backer);
-
-    /* Fold the current mean subfacet age and the current subfacet count into
-     * the running averages, each of which is halved on every pass. */
-    n_subfacets = hmap_count(&backer->subfacets);
-    if (n_subfacets) {
-        long long int now = time_msec();
-        long long int age_sum = 0;
-        struct subfacet *sf;
-
-        HMAP_FOR_EACH (sf, hmap_node, &backer->subfacets) {
-            age_sum += now - sf->created;
-        }
-        backer->avg_subfacet_life += age_sum / n_subfacets;
-    }
-    backer->avg_subfacet_life /= 2;
-
-    backer->avg_n_subfacet += n_subfacets;
-    backer->avg_n_subfacet /= 2;
-
-    backer->max_n_subfacet = MAX(backer->max_n_subfacet, n_subfacets);
-
-    max_idle = subfacet_max_idle(backer);
-    expire_subfacets(backer, max_idle);
-
-    HMAP_FOR_EACH (ofproto, all_ofproto_dpifs_node, &all_ofproto_dpifs) {
-        struct rule *rule, *next_rule;
-        struct ofbundle *bundle;
-
-        if (ofproto->backer != backer) {
-            continue;
-        }
-
-        /* Expire OpenFlow flows whose idle_timeout or hard_timeout has
-         * passed. */
-        ovs_mutex_lock(&ofproto_mutex);
-        LIST_FOR_EACH_SAFE (rule, next_rule, expirable,
-                            &ofproto->up.expirable) {
-            rule_expire(rule_dpif_cast(rule));
-        }
-        ovs_mutex_unlock(&ofproto_mutex);
-
-        /* Everything outstanding in existing flows has been accounted by
-         * now, which makes this a good moment to rebalance bonds. */
-        if (!ofproto->has_bonded_bundles) {
-            continue;
-        }
-        HMAP_FOR_EACH (bundle, hmap_node, &ofproto->bundles) {
-            if (bundle->bond) {
-                bond_rebalance(bundle->bond);
-            }
-        }
-    }
-
-    return MIN(max_idle, 1000);
-}
-
-/* Accounts for 'stats', the totals that the datapath just reported for
- * 'subfacet'.  The increase since the previously recorded totals is added to
- * the owning ofproto's hit count and pushed through subfacet_update_stats();
- * if any new packets were seen, facet_learn() is called on the facet.
- *
- * A reported total smaller than the recorded one is logged and counted as no
- * increase. */
-static void
-update_subfacet_stats(struct subfacet *subfacet,
-                      const struct dpif_flow_stats *stats)
-{
-    struct facet *facet = subfacet->facet;
-    struct dpif_flow_stats delta;
-
-    delta.used = stats->used;
-    delta.tcp_flags = stats->tcp_flags;
-
-    if (stats->n_packets < subfacet->dp_packet_count) {
-        VLOG_WARN_RL(&rl, "unexpected packet count from the datapath");
-        delta.n_packets = 0;
-    } else {
-        delta.n_packets = stats->n_packets - subfacet->dp_packet_count;
-    }
-
-    if (stats->n_bytes < subfacet->dp_byte_count) {
-        VLOG_WARN_RL(&rl, "unexpected byte count from datapath");
-        delta.n_bytes = 0;
-    } else {
-        delta.n_bytes = stats->n_bytes - subfacet->dp_byte_count;
-    }
-
-    facet->ofproto->n_hit += delta.n_packets;
-
-    /* Remember the datapath's totals for the next comparison. */
-    subfacet->dp_packet_count = stats->n_packets;
-    subfacet->dp_byte_count = stats->n_bytes;
-    subfacet_update_stats(subfacet, &delta);
-
-    if (delta.n_packets) {
-        facet_learn(facet);
-    }
-}
-
-/* Deletes from 'backer''s datapath the flow whose key is the 'key_len' bytes
- * at 'key'.  Used for datapath flows that we know nothing about or that
- * should not have been installed.  Logs a rate-limited warning and bumps the
- * 'facet_unexpected' coverage counter. */
-static void
-delete_unexpected_flow(struct dpif_backer *backer,
-                       const struct nlattr *key, size_t key_len)
-{
-    if (!VLOG_DROP_WARN(&rl)) {
-        struct ds key_str = DS_EMPTY_INITIALIZER;
-
-        odp_flow_key_format(key, key_len, &key_str);
-        VLOG_WARN("unexpected flow: %s", ds_cstr(&key_str));
-        ds_destroy(&key_str);
-    }
-
-    COVERAGE_INC(facet_unexpected);
-    dpif_flow_del(backer->dpif, key, key_len, NULL);
-}
-
-/* Dumps every flow in 'backer''s datapath and brings the 'packet_count',
- * 'byte_count', and 'used' members of installed facets up to date.
- *
- * Statistics are also pushed to the rules that each facet resubmits into.
- * They are usually accurate, but if a facet switches to a different rule
- * between two runs of this function, traffic that belonged to the old rule
- * can end up attributed to the new one.  Running this function on every rule
- * creation or deletion would avoid that, but the cost of so many datapath
- * calls does not justify perfectly accurate statistics.
- *
- * Per-ofproto hit counts are maintained here too.  Patch ports get no
- * special treatment: a packet that enters on br0 and is patched into br1
- * adds 1 to br0's hit count and changes neither the hit nor the miss count
- * of br1.
- *
- * Dumped flows with no subfacet, or whose subfacet is not marked as
- * installed, are deleted from the datapath. */
-static void
-update_stats(struct dpif_backer *backer)
-{
-    const struct dpif_flow_stats *stats;
-    const struct nlattr *key, *mask;
-    struct dpif_flow_dump dump;
-    size_t key_len, mask_len;
-
-    dpif_flow_dump_start(&dump, backer->dpif);
-    while (dpif_flow_dump_next(&dump, &key, &key_len,
-                               &mask, &mask_len, NULL, NULL, &stats)) {
-        uint32_t hash = odp_flow_key_hash(key, key_len);
-        struct subfacet *sf = subfacet_find(backer, key, key_len, hash);
-        enum subfacet_path path = sf ? sf->path : SF_NOT_INSTALLED;
-
-        if (path == SF_FAST_PATH) {
-            update_subfacet_stats(sf, stats);
-        } else if (path != SF_SLOW_PATH) {
-            /* SF_NOT_INSTALLED, or anything unrecognized. */
-            delete_unexpected_flow(backer, key, key_len);
-        }
-        /* Slow-path subfacets need nothing here: their stats are updated
-         * per-packet. */
-    }
-    dpif_flow_dump_done(&dump);
-}
-
-/* Returns the idle time, in milliseconds, after which subfacets of 'backer'
- * should expire from the datapath.  When a subfacet expires its statistics
- * are folded into its facet, and when a facet's last subfacet expires its
- * statistics are folded into its rule. */
-static int
-subfacet_max_idle(const struct dpif_backer *backer)
-{
-    /*
-     * Idle time histogram.
-     *
-     * A switch usually has few subfacets, and then it is cheapest to keep
-     * statistics for all of them in userspace and to cache all of them in
-     * the kernel datapath.
-     *
-     * With many subfacets, the memory needed to track them in userspace and
-     * in the kernel becomes significant, while probably only a few "heavy
-     * hitters" carry most of the bandwidth.  Only those are then worth
-     * keeping; the rest can be discarded.
-     *
-     * To find the cutoff we build a histogram of N_BUCKETS buckets, each
-     * BUCKET_WIDTH msecs wide, and drop every subfacet into the bucket for
-     * its idle time.  The cutoff is then chosen so that only the
-     * most-recently-used 1% of subfacets (but at least
-     * flow_eviction_threshold of them) stay cached.  The most-recently-used
-     * bucket is always kept whole, so any one run may keep an arbitrary
-     * number of subfacets (the next run will delete most of them unless they
-     * see more traffic).
-     *
-     * This takes a pass over the subfacets separate from the one made by
-     * update_stats(), because update_stats() never looks at uninstallable
-     * subfacets.
-     */
-    enum { BUCKET_WIDTH = 100, N_BUCKETS = 5000 / BUCKET_WIDTH };
-    int histogram[N_BUCKETS] = { 0 };
-    struct subfacet *sf;
-    long long int now;
-    int total, kept, cutoff;
-
-    total = hmap_count(&backer->subfacets);
-    if (total <= flow_eviction_threshold) {
-        /* Few enough subfacets: use the longest idle time there is. */
-        return N_BUCKETS * BUCKET_WIDTH;
-    }
-
-    /* Build histogram. */
-    now = time_msec();
-    HMAP_FOR_EACH (sf, hmap_node, &backer->subfacets) {
-        long long int idle = now - sf->used;
-        int slot;
-
-        if (idle <= 0) {
-            slot = 0;
-        } else if (idle >= BUCKET_WIDTH * N_BUCKETS) {
-            slot = N_BUCKETS - 1;
-        } else {
-            slot = (unsigned int) idle / BUCKET_WIDTH;
-        }
-        histogram[slot]++;
-    }
-
-    /* Find the first bucket whose flows should be expired.  The first bucket
-     * is always counted as kept. */
-    kept = 0;
-    cutoff = 0;
-    do {
-        kept += histogram[cutoff];
-        cutoff++;
-    } while (cutoff < N_BUCKETS &&
-             kept < MAX(flow_eviction_threshold, total / 100));
-
-    /* NOTE(review): the dump is gated on debug logging being enabled but is
-     * emitted at INFO level -- confirm that this is intended. */
-    if (VLOG_IS_DBG_ENABLED()) {
-        struct ds s = DS_EMPTY_INITIALIZER;
-        int b;
-
-        ds_put_cstr(&s, "keep");
-        for (b = 0; b < N_BUCKETS; b++) {
-            if (b == cutoff) {
-                ds_put_cstr(&s, ", drop");
-            }
-            if (histogram[b]) {
-                ds_put_format(&s, " %d:%d", b * BUCKET_WIDTH, histogram[b]);
-            }
-        }
-        VLOG_INFO("%s (msec:count)", ds_cstr(&s));
-        ds_destroy(&s);
-    }
-
-    return cutoff * BUCKET_WIDTH;
-}
-
-/* Destroys every subfacet of 'backer' that was last used before its cutoff
- * time.  The cutoff is 'dp_max_idle' milliseconds ago for ordinary
- * subfacets and 10 seconds ago for subfacets whose facet takes the slow path
- * for CFM, BFD, LACP or STP.
- *
- * Subfacets that are installed in the datapath are destroyed in batches of
- * up to SUBFACET_DESTROY_MAX_BATCH; the others are destroyed one by one. */
-static void
-expire_subfacets(struct dpif_backer *backer, int dp_max_idle)
-{
-    /* Cutoff time for most flows. */
-    long long int normal_cutoff = time_msec() - dp_max_idle;
-
-    /* Flows for special protocols are worth keeping around longer, so they
-     * get a more conservative cutoff. */
-    long long int special_cutoff = time_msec() - 10000;
-
-    struct subfacet *pending[SUBFACET_DESTROY_MAX_BATCH];
-    struct subfacet *sf, *next_sf;
-    int n_pending = 0;
-
-    HMAP_FOR_EACH_SAFE (sf, next_sf, hmap_node, &backer->subfacets) {
-        long long int cutoff = normal_cutoff;
-
-        if (sf->facet->xout.slow & (SLOW_CFM | SLOW_BFD | SLOW_LACP
-                                    | SLOW_STP)) {
-            cutoff = special_cutoff;
-        }
-
-        if (sf->used >= cutoff) {
-            /* Used recently enough to keep. */
-            continue;
-        }
-
-        if (sf->path == SF_NOT_INSTALLED) {
-            subfacet_destroy(sf);
-            continue;
-        }
-
-        pending[n_pending++] = sf;
-        if (n_pending >= SUBFACET_DESTROY_MAX_BATCH) {
-            subfacet_destroy_batch(backer, pending, n_pending);
-            n_pending = 0;
-        }
-    }
-
-    /* Flush whatever is left of the last batch. */
-    if (n_pending > 0) {
-        subfacet_destroy_batch(backer, pending, n_pending);
-    }
-}
-