From: Ethan Jackson Date: Tue, 24 Sep 2013 20:39:56 +0000 (-0700) Subject: ofproto: Handle flow installation and eviction in upcall. X-Git-Tag: sliver-openvswitch-2.1.90-1~10^2~108 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;ds=sidebyside;h=e79a6c833e0d72370951d6f8841098103cbb0b2d;p=sliver-openvswitch.git ofproto: Handle flow installation and eviction in upcall. This patch moves flow installation and eviction from ofproto-dpif and the main thread, into ofproto-dpif-upcall. This performs significantly better (approximately 2x TCP_CRR improvement), and allows ovs-vswitchd to maintain significantly larger datapath flow tables. On top of that, it significantly simplifies the code, retiring "struct facet" and friends. Signed-off-by: Ethan Jackson Acked-by: Ben Pfaff --- diff --git a/NEWS b/NEWS index 45b8239bf..b279572cc 100644 --- a/NEWS +++ b/NEWS @@ -54,6 +54,10 @@ Post-v2.0.0 - ovsdb-client: * The "monitor" command can now monitor all tables in a database, instead of being limited to a single table. + - The flow-eviction-threshold has been replaced by the flow-limit which is a + hard limit on the number of flows in the datapath. It defaults to 200,000 + flows. OVS automatically adjusts this number depending on network + conditions. v2.0.0 - 15 Oct 2013 diff --git a/ofproto/automake.mk b/ofproto/automake.mk index 25be10587..068a74234 100644 --- a/ofproto/automake.mk +++ b/ofproto/automake.mk @@ -25,8 +25,6 @@ ofproto_libofproto_la_SOURCES = \ ofproto/ofproto.h \ ofproto/ofproto-dpif.c \ ofproto/ofproto-dpif.h \ - ofproto/ofproto-dpif-governor.c \ - ofproto/ofproto-dpif-governor.h \ ofproto/ofproto-dpif-ipfix.c \ ofproto/ofproto-dpif-ipfix.h \ ofproto/ofproto-dpif-mirror.c \ diff --git a/ofproto/ofproto-dpif-governor.c b/ofproto/ofproto-dpif-governor.c deleted file mode 100644 index 459f8715e..000000000 --- a/ofproto/ofproto-dpif-governor.c +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2012 Nicira, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include "ofproto-dpif-governor.h" - -#include - -#include "coverage.h" -#include "poll-loop.h" -#include "random.h" -#include "timeval.h" -#include "util.h" -#include "valgrind.h" -#include "vlog.h" - -VLOG_DEFINE_THIS_MODULE(ofproto_dpif_governor); - -/* Minimum number of observed packets before setting up a flow. - * - * This value seems OK empirically. */ -#define FLOW_SETUP_THRESHOLD 5 -BUILD_ASSERT_DECL(FLOW_SETUP_THRESHOLD > 1); -BUILD_ASSERT_DECL(FLOW_SETUP_THRESHOLD < 16); - -/* Minimum and maximum size of a governor, in bytes. */ -enum { MIN_SIZE = 16 * 1024 }; -enum { MAX_SIZE = 256 * 1024 }; -BUILD_ASSERT_DECL(IS_POW2(MIN_SIZE)); -BUILD_ASSERT_DECL(IS_POW2(MAX_SIZE)); - -/* Minimum and maximum time to process the number of packets that make up a - * given generation. If a generation completes faster than the minimum time, - * we double the table size (but no more than MAX_SIZE). 
If a generation take - * more than the maximum time to complete, we halve the table size (but no - * smaller than MIN_SIZE). */ -enum { MIN_ELAPSED = 1000 }; /* In milliseconds. */ -enum { MAX_ELAPSED = 5000 }; /* In milliseconds. */ - -static void governor_new_generation(struct governor *, unsigned int size); - -/* Creates and returns a new governor. */ -struct governor * -governor_create(void) -{ - struct governor *g = xzalloc(sizeof *g); - governor_new_generation(g, MIN_SIZE); - return g; -} - -/* Destroys 'g'. */ -void -governor_destroy(struct governor *g) -{ - if (g) { - VLOG_INFO("disengaging"); - free(g->table); - free(g); - } -} - -/* Performs periodic maintenance work on 'g'. */ -void -governor_run(struct governor *g) -{ - if (time_msec() - g->start > MAX_ELAPSED) { - if (g->size > MIN_SIZE) { - governor_new_generation(g, g->size / 2); - } else { - /* Don't start a new generation (we'd never go idle). */ - } - } -} - -/* Arranges for the poll loop to wake up when 'g' needs to do some work. */ -void -governor_wait(struct governor *g) -{ - if (g->size > MIN_SIZE) { - poll_timer_wait_until(g->start + MAX_ELAPSED); - } -} - -/* Returns true if 'g' has been doing only a minimal amount of work and thus - * the client should consider getting rid of it entirely. */ -bool -governor_is_idle(struct governor *g) -{ - return g->size == MIN_SIZE && time_msec() - g->start > MAX_ELAPSED; -} - -/* Tests whether a flow whose hash is 'hash' and for which 'n' packets have - * just arrived should be set up in the datapath or just processed on a - * packet-by-packet basis. Returns true to set up a datapath flow, false to - * process the packets individually. - * - * One would expect 'n' to ordinarily be 1, if batching leads multiple packets - * to be processed at a time then it could be greater. */ -bool -governor_should_install_flow(struct governor *g, uint32_t hash, int n) -{ - int old_count, new_count; - bool install_flow; - uint8_t *e; - - ovs_assert(n > 0); - - /* Count these packets and begin a new generation if necessary. */ - g->n_packets += n; - if (g->n_packets >= g->size / 4) { - unsigned int new_size; - long long elapsed; - - elapsed = time_msec() - g->start; - new_size = (elapsed < MIN_ELAPSED && g->size < MAX_SIZE ? g->size * 2 - : elapsed > MAX_ELAPSED && g->size > MIN_SIZE ? g->size / 2 - : g->size); - governor_new_generation(g, new_size); - } - - /* If we've set up most of the flows we've seen, then we're wasting time - * handling most packets one at a time, so in this case instead set up most - * flows directly and use the remaining flows as a sample set to adjust our - * criteria later. - * - * The definition of "most" is conservative, but the sample size is tuned - * based on a few experiments with TCP_CRR mode in netperf. */ - if (g->n_setups >= g->n_flows - g->n_flows / 16 - && g->n_flows >= 64 - && hash & 0x3f) { - g->n_shortcuts++; - return true; - } - - /* Do hash table processing. - * - * Even-numbered hash values use high-order nibbles. - * Odd-numbered hash values use low-order nibbles. */ - e = &g->table[(hash >> 1) & (g->size - 1)]; - old_count = (hash & 1 ? *e >> 4 : *e & 0x0f); - if (!old_count) { - g->n_flows++; - } - new_count = n + old_count; - if (new_count >= FLOW_SETUP_THRESHOLD) { - g->n_setups++; - install_flow = true; - new_count = 0; - } else { - install_flow = false; - } - *e = hash & 1 ? (new_count << 4) | (*e & 0x0f) : (*e & 0xf0) | new_count; - - return install_flow; -} - -/* Starts a new generation in 'g' with a table size of 'size' bytes. 
'size' - * must be a power of two between MIN_SIZE and MAX_SIZE, inclusive. */ -static void -governor_new_generation(struct governor *g, unsigned int size) -{ - ovs_assert(size >= MIN_SIZE && size <= MAX_SIZE); - ovs_assert(is_pow2(size)); - - /* Allocate new table, if necessary. */ - if (g->size != size) { - if (!g->size) { - VLOG_INFO("engaging governor with %u kB hash table", size / 1024); - } else { - VLOG_INFO("processed %u packets in %.2f s, " - "%s hash table to %u kB " - "(%u hashes, %u setups, %u shortcuts)", - g->n_packets, - (time_msec() - g->start) / 1000.0, - size > g->size ? "enlarging" : "shrinking", - size / 1024, - g->n_flows, g->n_setups, g->n_shortcuts); - } - - free(g->table); - g->table = xmalloc(size * sizeof *g->table); - g->size = size; - } else { - VLOG_DBG("processed %u packets in %.2f s with %u kB hash table " - "(%u hashes, %u setups, %u shortcuts)", - g->n_packets, (time_msec() - g->start) / 1000.0, - size / 1024, g->n_flows, g->n_setups, g->n_shortcuts); - } - - /* Clear data for next generation. */ - memset(g->table, 0, size * sizeof *g->table); - g->start = time_msec(); - g->n_packets = 0; - g->n_flows /= 2; - g->n_setups /= 2; - g->n_shortcuts = 0; -} diff --git a/ofproto/ofproto-dpif-governor.h b/ofproto/ofproto-dpif-governor.h deleted file mode 100644 index 7e6ec9279..000000000 --- a/ofproto/ofproto-dpif-governor.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2012 Nicira, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef OFPROTO_DPIF_GOVERNOR_H -#define OFPROTO_DPIF_GOVERNOR_H 1 - -/* Flow setup rate limiter. - * - * A governor in an engine limits a vehicle's speed. This governor limits the - * rate at which flows are set up in the datapath. The client provides as - * input the hashes of observed packets. The governor keeps track of hashes - * seen multiple times. When a given hash is seen often enough, the governor - * indicates to its client that it should set up a facet and a subfacet and a - * datapath flow for that flow. - * - * The same tracking could be done in terms of facets and subfacets directly, - * but the governor code uses much less time and space to do the same job. */ - -#include -#include - -struct governor { - char *name; /* Name, for log messages. */ - uint8_t *table; /* Table of counters, two per byte. */ - unsigned int size; /* Table size in bytes. */ - long long int start; /* Time when the table was last cleared. */ - unsigned int n_packets; /* Number of packets processed. */ - - /* Statistics for skipping counters when most flows get set up. */ - unsigned int n_flows; /* Number of unique flows seen. */ - unsigned int n_setups; /* Number of flows set up based on counters. */ - unsigned int n_shortcuts; /* Number of flows set up based on history. 
*/ -}; - -struct governor *governor_create(void); -void governor_destroy(struct governor *); - -void governor_run(struct governor *); -void governor_wait(struct governor *); - -bool governor_is_idle(struct governor *); - -bool governor_should_install_flow(struct governor *, uint32_t hash, int n); - -#endif /* ofproto/ofproto-dpif-governor.h */ diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index f0088f0ea..78424fd42 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -31,6 +31,7 @@ #include "ofpbuf.h" #include "ofproto-dpif-ipfix.h" #include "ofproto-dpif-sflow.h" +#include "ofproto-dpif-xlate.h" #include "packets.h" #include "poll-loop.h" #include "seq.h" @@ -38,17 +39,16 @@ #include "vlog.h" #define MAX_QUEUE_LENGTH 512 +#define FLOW_MISS_MAX_BATCH 50 +#define REVALIDATE_MAX_BATCH 50 VLOG_DEFINE_THIS_MODULE(ofproto_dpif_upcall); -COVERAGE_DEFINE(drop_queue_overflow); COVERAGE_DEFINE(upcall_queue_overflow); -COVERAGE_DEFINE(fmb_queue_overflow); -COVERAGE_DEFINE(fmb_queue_revalidated); /* A thread that processes each upcall handed to it by the dispatcher thread, - * forwards the upcall's packet, and then queues it to the main ofproto_dpif - * to possibly set up a kernel flow as a cache. */ + * forwards the upcall's packet, and possibly sets up a kernel flow as a + * cache. */ struct handler { struct udpif *udpif; /* Parent udpif. */ pthread_t thread; /* Thread ID. */ @@ -66,13 +66,34 @@ struct handler { 'mutex'. */ }; +/* A thread that processes each kernel flow handed to it by the flow_dumper + * thread, updates OpenFlow statistics, and updates or removes the kernel flow + * as necessary. */ +struct revalidator { + struct udpif *udpif; /* Parent udpif. */ + char *name; /* Thread name. */ + + pthread_t thread; /* Thread ID. */ + struct hmap ukeys; /* Datapath flow keys. */ + + uint64_t dump_seq; + + struct ovs_mutex mutex; /* Mutex guarding the following. */ + pthread_cond_t wake_cond; + struct list udumps OVS_GUARDED; /* Unprocessed udumps. */ + size_t n_udumps OVS_GUARDED; /* Number of unprocessed udumps. */ +}; + /* An upcall handler for ofproto_dpif. * - * udpif is implemented as a "dispatcher" thread that reads upcalls from the - * kernel. It processes each upcall just enough to figure out its next - * destination. For a "miss" upcall (MISS_UPCALL), this is one of several - * "handler" threads (see struct handler). Other upcalls are queued to the - * main ofproto_dpif. */ + * udpif has two logically separate pieces: + * + * - A "dispatcher" thread that reads upcalls from the kernel and dispatches + * them to one of several "handler" threads (see struct handler). + * + * - A "flow_dumper" thread that reads the kernel flow table and dispatches + * flows to one of several "revalidator" threads (see struct + * revalidator). */ struct udpif { struct list list_node; /* In all_udpifs list. */ @@ -82,18 +103,30 @@ struct udpif { uint32_t secret; /* Random seed for upcall hash. */ pthread_t dispatcher; /* Dispatcher thread ID. */ + pthread_t flow_dumper; /* Flow dumper thread ID. */ struct handler *handlers; /* Upcall handlers. */ size_t n_handlers; - /* Queues to pass up to ofproto-dpif. */ - struct guarded_list drop_keys; /* "struct drop key"s. */ - struct guarded_list fmbs; /* "struct flow_miss_batch"es. */ + struct revalidator *revalidators; /* Flow revalidators. */ + size_t n_revalidators; + + uint64_t last_reval_seq; /* 'reval_seq' at last revalidation. */ + struct seq *reval_seq; /* Incremented to force revalidation. 
*/ + + struct seq *dump_seq; /* Increments each dump iteration. */ + + struct latch exit_latch; /* Tells child threads to exit. */ + + long long int dump_duration; /* Duration of the last flow dump. */ - struct seq *wait_seq; - struct seq *reval_seq; + /* Datapath flow statistics. */ + unsigned int max_n_flows; + unsigned int avg_n_flows; - struct latch exit_latch; /* Tells child threads to exit. */ + /* Following fields are accessed and modified by different threads. */ + atomic_llong max_idle; /* Maximum datapath flow idle time. */ + atomic_uint flow_limit; /* Datapath flow hard limit. */ }; enum upcall_type { @@ -114,18 +147,92 @@ struct upcall { uint64_t upcall_stub[512 / 8]; /* Buffer to reduce need for malloc(). */ }; +/* 'udpif_key's are responsible for tracking the little bit of state udpif + * needs to do flow expiration which can't be pulled directly from the + * datapath. They are owned, created by, maintained, and destroyed by a single + * revalidator making them easy to efficiently handle with multiple threads. */ +struct udpif_key { + struct hmap_node hmap_node; /* In parent revalidator 'ukeys' map. */ + + struct nlattr *key; /* Datapath flow key. */ + size_t key_len; /* Length of 'key'. */ + + struct dpif_flow_stats stats; /* Stats at most recent flow dump. */ + long long int created; /* Estimation of creation time. */ + + bool mark; /* Used by mark and sweep GC algorithm. */ + + struct odputil_keybuf key_buf; /* Memory for 'key'. */ +}; + +/* 'udpif_flow_dump's hold the state associated with one iteration in a flow + * dump operation. This is created by the flow_dumper thread and handed to the + * appropriate revalidator thread to be processed. */ +struct udpif_flow_dump { + struct list list_node; + + struct nlattr *key; /* Datapath flow key. */ + size_t key_len; /* Length of 'key'. */ + uint32_t key_hash; /* Hash of 'key'. */ + + struct odputil_keybuf mask_buf; + struct nlattr *mask; /* Datapath mask for 'key'. */ + size_t mask_len; /* Length of 'mask'. */ + + struct dpif_flow_stats stats; /* Stats pulled from the datapath. */ + + bool need_revalidate; /* Key needs revalidation? */ + + struct odputil_keybuf key_buf; +}; + +/* Flow miss batching. + * + * Some dpifs implement operations faster when you hand them off in a batch. + * To allow batching, "struct flow_miss" queues the dpif-related work needed + * for a given flow. Each "struct flow_miss" corresponds to sending one or + * more packets, plus possibly installing the flow in the dpif. 
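+ *
+ * Concretely, handle_upcalls() below gathers up to FLOW_MISS_MAX_BATCH
+ * upcalls at a time and accumulates as many as two dpif operations per
+ * upcall (a DPIF_OP_FLOW_PUT to install the flow, and an execute to
+ * forward the packet), then hands the whole array to the dpif in a single
+ * dpif_operate() call.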
*/ +struct flow_miss { + struct hmap_node hmap_node; + struct ofproto_dpif *ofproto; + + struct flow flow; + enum odp_key_fitness key_fitness; + const struct nlattr *key; + size_t key_len; + enum dpif_upcall_type upcall_type; + struct dpif_flow_stats stats; + odp_port_t odp_in_port; + + uint64_t slow_path_buf[128 / 8]; + struct odputil_keybuf mask_buf; + + struct xlate_out xout; +}; + static void upcall_destroy(struct upcall *); static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); static struct list all_udpifs = LIST_INITIALIZER(&all_udpifs); static void recv_upcalls(struct udpif *); -static void handle_upcalls(struct udpif *, struct list *upcalls); -static void miss_destroy(struct flow_miss *); +static void handle_upcalls(struct handler *handler, struct list *upcalls); +static void *udpif_flow_dumper(void *); static void *udpif_dispatcher(void *); static void *udpif_upcall_handler(void *); +static void *udpif_revalidator(void *); +static uint64_t udpif_get_n_flows(const struct udpif *); +static void revalidate_udumps(struct revalidator *, struct list *udumps); +static void revalidator_sweep(struct revalidator *); static void upcall_unixctl_show(struct unixctl_conn *conn, int argc, const char *argv[], void *aux); +static void upcall_unixctl_disable_megaflows(struct unixctl_conn *, int argc, + const char *argv[], void *aux); +static void upcall_unixctl_enable_megaflows(struct unixctl_conn *, int argc, + const char *argv[], void *aux); +static void ukey_delete(struct revalidator *, struct udpif_key *); + +static atomic_bool enable_megaflows = ATOMIC_VAR_INIT(true); struct udpif * udpif_create(struct dpif_backer *backer, struct dpif *dpif) @@ -136,17 +243,21 @@ udpif_create(struct dpif_backer *backer, struct dpif *dpif) if (ovsthread_once_start(&once)) { unixctl_command_register("upcall/show", "", 0, 0, upcall_unixctl_show, NULL); + unixctl_command_register("upcall/disable-megaflows", "", 0, 0, + upcall_unixctl_disable_megaflows, NULL); + unixctl_command_register("upcall/enable-megaflows", "", 0, 0, + upcall_unixctl_enable_megaflows, NULL); ovsthread_once_done(&once); } udpif->dpif = dpif; udpif->backer = backer; + atomic_init(&udpif->max_idle, 5000); + atomic_init(&udpif->flow_limit, MIN(ofproto_flow_limit, 10000)); udpif->secret = random_uint32(); - udpif->wait_seq = seq_create(); udpif->reval_seq = seq_create(); + udpif->dump_seq = seq_create(); latch_init(&udpif->exit_latch); - guarded_list_init(&udpif->drop_keys); - guarded_list_init(&udpif->fmbs); list_push_back(&all_udpifs, &udpif->list_node); return udpif; @@ -155,62 +266,82 @@ udpif_create(struct dpif_backer *backer, struct dpif *dpif) void udpif_destroy(struct udpif *udpif) { - struct flow_miss_batch *fmb; - struct drop_key *drop_key; + udpif_set_threads(udpif, 0, 0); + udpif_flush(); - udpif_set_threads(udpif, 0); list_remove(&udpif->list_node); - - while ((drop_key = drop_key_next(udpif))) { - drop_key_destroy(drop_key); - } - - while ((fmb = flow_miss_batch_next(udpif))) { - flow_miss_batch_destroy(fmb); - } - - guarded_list_destroy(&udpif->drop_keys); - guarded_list_destroy(&udpif->fmbs); latch_destroy(&udpif->exit_latch); - seq_destroy(udpif->wait_seq); seq_destroy(udpif->reval_seq); + seq_destroy(udpif->dump_seq); free(udpif); } /* Tells 'udpif' how many threads it should use to handle upcalls. Disables - * all threads if 'n_handlers' is zero. 'udpif''s datapath handle must have - * packet reception enabled before starting threads. */ + * all threads if 'n_handlers' and 'n_revalidators' is zero. 
'udpif''s + * datapath handle must have packet reception enabled before starting threads. + */ void -udpif_set_threads(struct udpif *udpif, size_t n_handlers) +udpif_set_threads(struct udpif *udpif, size_t n_handlers, + size_t n_revalidators) { /* Stop the old threads (if any). */ - if (udpif->handlers && udpif->n_handlers != n_handlers) { + if (udpif->handlers && + (udpif->n_handlers != n_handlers + || udpif->n_revalidators != n_revalidators)) { size_t i; latch_set(&udpif->exit_latch); - /* Wake the handlers so they can exit. */ for (i = 0; i < udpif->n_handlers; i++) { struct handler *handler = &udpif->handlers[i]; ovs_mutex_lock(&handler->mutex); xpthread_cond_signal(&handler->wake_cond); ovs_mutex_unlock(&handler->mutex); + xpthread_join(handler->thread, NULL); + } + + for (i = 0; i < udpif->n_revalidators; i++) { + struct revalidator *revalidator = &udpif->revalidators[i]; + + ovs_mutex_lock(&revalidator->mutex); + xpthread_cond_signal(&revalidator->wake_cond); + ovs_mutex_unlock(&revalidator->mutex); + xpthread_join(revalidator->thread, NULL); } + xpthread_join(udpif->flow_dumper, NULL); xpthread_join(udpif->dispatcher, NULL); + + for (i = 0; i < udpif->n_revalidators; i++) { + struct revalidator *revalidator = &udpif->revalidators[i]; + struct udpif_flow_dump *udump, *next_udump; + struct udpif_key *ukey, *next_ukey; + + LIST_FOR_EACH_SAFE (udump, next_udump, list_node, + &revalidator->udumps) { + list_remove(&udump->list_node); + free(udump); + } + + HMAP_FOR_EACH_SAFE (ukey, next_ukey, hmap_node, + &revalidator->ukeys) { + ukey_delete(revalidator, ukey); + } + hmap_destroy(&revalidator->ukeys); + ovs_mutex_destroy(&revalidator->mutex); + + free(revalidator->name); + } + for (i = 0; i < udpif->n_handlers; i++) { struct handler *handler = &udpif->handlers[i]; struct upcall *miss, *next; - xpthread_join(handler->thread, NULL); - - ovs_mutex_lock(&handler->mutex); LIST_FOR_EACH_SAFE (miss, next, list_node, &handler->upcalls) { list_remove(&miss->list_node); upcall_destroy(miss); } - ovs_mutex_unlock(&handler->mutex); ovs_mutex_destroy(&handler->mutex); xpthread_cond_destroy(&handler->wake_cond); @@ -218,6 +349,10 @@ udpif_set_threads(struct udpif *udpif, size_t n_handlers) } latch_poll(&udpif->exit_latch); + free(udpif->revalidators); + udpif->revalidators = NULL; + udpif->n_revalidators = 0; + free(udpif->handlers); udpif->handlers = NULL; udpif->n_handlers = 0; @@ -228,6 +363,8 @@ udpif_set_threads(struct udpif *udpif, size_t n_handlers) size_t i; udpif->n_handlers = n_handlers; + udpif->n_revalidators = n_revalidators; + udpif->handlers = xzalloc(udpif->n_handlers * sizeof *udpif->handlers); for (i = 0; i < udpif->n_handlers; i++) { struct handler *handler = &udpif->handlers[i]; @@ -240,19 +377,22 @@ udpif_set_threads(struct udpif *udpif, size_t n_handlers) xpthread_create(&handler->thread, NULL, udpif_upcall_handler, handler); } - xpthread_create(&udpif->dispatcher, NULL, udpif_dispatcher, udpif); - } -} -void -udpif_wait(struct udpif *udpif) -{ - uint64_t seq = seq_read(udpif->wait_seq); - if (!guarded_list_is_empty(&udpif->drop_keys) || - !guarded_list_is_empty(&udpif->fmbs)) { - poll_immediate_wake(); - } else { - seq_wait(udpif->wait_seq, seq); + udpif->revalidators = xzalloc(udpif->n_revalidators + * sizeof *udpif->revalidators); + for (i = 0; i < udpif->n_revalidators; i++) { + struct revalidator *revalidator = &udpif->revalidators[i]; + + revalidator->udpif = udpif; + list_init(&revalidator->udumps); + hmap_init(&revalidator->ukeys); + ovs_mutex_init(&revalidator->mutex); + 
        xpthread_cond_init(&revalidator->wake_cond, NULL);
+            xpthread_create(&revalidator->thread, NULL, udpif_revalidator,
+                            revalidator);
+        }
+        xpthread_create(&udpif->dispatcher, NULL, udpif_dispatcher, udpif);
+        xpthread_create(&udpif->flow_dumper, NULL, udpif_flow_dumper, udpif);
     }
 }
@@ -261,22 +401,16 @@ udpif_wait(struct udpif *udpif)
 void
 udpif_revalidate(struct udpif *udpif)
 {
-    struct flow_miss_batch *fmb, *next_fmb;
-    struct list fmbs;
-
-    /* Since we remove each miss on revalidation, their statistics won't be
-     * accounted to the appropriate 'facet's in the upper layer.  In most
-     * cases, this is alright because we've already pushed the stats to the
-     * relevant rules. */
     seq_change(udpif->reval_seq);
+}
 
-    guarded_list_pop_all(&udpif->fmbs, &fmbs);
-    LIST_FOR_EACH_SAFE (fmb, next_fmb, list_node, &fmbs) {
-        list_remove(&fmb->list_node);
-        flow_miss_batch_destroy(fmb);
-    }
-
-    udpif_drop_key_clear(udpif);
+/* Returns a seq which increments every time 'udpif' pulls stats from the
+ * datapath.  Callers can use this to get a sense of when might be a good time
+ * to do periodic work which relies on relatively up-to-date statistics. */
+struct seq *
+udpif_dump_seq(struct udpif *udpif)
+{
+    return udpif->dump_seq;
 }
 
 void
@@ -294,8 +428,32 @@ udpif_get_memory_usage(struct udpif *udpif, struct simap *usage)
         simap_increase(usage, "handler upcalls", handler->n_upcalls);
         ovs_mutex_unlock(&handler->mutex);
     }
+
+    simap_increase(usage, "revalidators", udpif->n_revalidators);
+    for (i = 0; i < udpif->n_revalidators; i++) {
+        struct revalidator *revalidator = &udpif->revalidators[i];
+        ovs_mutex_lock(&revalidator->mutex);
+        simap_increase(usage, "revalidator dumps", revalidator->n_udumps);
+
+        /* XXX: This isn't technically thread safe because the revalidator
+         * ukeys map isn't protected by a mutex since it's per thread. */
+        simap_increase(usage, "revalidator keys",
+                       hmap_count(&revalidator->ukeys));
+        ovs_mutex_unlock(&revalidator->mutex);
+    }
 }
 
+/* Removes all flows from all datapaths. */
+void
+udpif_flush(void)
+{
+    struct udpif *udpif;
+
+    LIST_FOR_EACH (udpif, list_node, &all_udpifs) {
+        dpif_flow_flush(udpif->dpif);
+    }
+}
+
 /* Destroys and deallocates 'upcall'. */
 static void
 upcall_destroy(struct upcall *upcall)
@@ -307,103 +465,148 @@ upcall_destroy(struct upcall *upcall)
     }
 }
 
-/* Retrieves the next batch of processed flow misses for 'udpif' to install.
- * The caller is responsible for destroying it with flow_miss_batch_destroy().
- */
-struct flow_miss_batch *
-flow_miss_batch_next(struct udpif *udpif)
+static uint64_t
+udpif_get_n_flows(const struct udpif *udpif)
 {
-    int i;
+    struct dpif_dp_stats stats;
 
-    for (i = 0; i < 50; i++) {
-        struct flow_miss_batch *next;
-        struct list *next_node;
-
-        next_node = guarded_list_pop_front(&udpif->fmbs);
-        if (!next_node) {
-            break;
-        }
+    dpif_get_dp_stats(udpif->dpif, &stats);
+    return stats.n_flows;
+}
 
-        next = CONTAINER_OF(next_node, struct flow_miss_batch, list_node);
-        if (next->reval_seq == seq_read(udpif->reval_seq)) {
-            return next;
-        }
+/* The dispatcher thread is responsible for receiving upcalls from the kernel,
+ * assigning them to an upcall_handler thread. */
+static void *
+udpif_dispatcher(void *arg)
+{
+    struct udpif *udpif = arg;
 
-        flow_miss_batch_destroy(next);
+    set_subprogram_name("dispatcher");
+    while (!latch_is_set(&udpif->exit_latch)) {
+        recv_upcalls(udpif);
+        dpif_recv_wait(udpif->dpif);
+        latch_wait(&udpif->exit_latch);
+        poll_block();
     }
 
     return NULL;
 }
 
-/* Destroys and deallocates 'fmb'.
*/ -void -flow_miss_batch_destroy(struct flow_miss_batch *fmb) +static void * +udpif_flow_dumper(void *arg) { - struct flow_miss *miss, *next; - struct upcall *upcall, *next_upcall; - - if (!fmb) { - return; - } - - HMAP_FOR_EACH_SAFE (miss, next, hmap_node, &fmb->misses) { - hmap_remove(&fmb->misses, &miss->hmap_node); - miss_destroy(miss); - } - - LIST_FOR_EACH_SAFE (upcall, next_upcall, list_node, &fmb->upcalls) { - list_remove(&upcall->list_node); - upcall_destroy(upcall); - } + struct udpif *udpif = arg; - hmap_destroy(&fmb->misses); - free(fmb); -} + set_subprogram_name("flow_dumper"); + while (!latch_is_set(&udpif->exit_latch)) { + const struct dpif_flow_stats *stats; + long long int start_time, duration; + const struct nlattr *key, *mask; + struct dpif_flow_dump dump; + size_t key_len, mask_len; + unsigned int flow_limit; + long long int max_idle; + bool need_revalidate; + uint64_t reval_seq; + size_t n_flows, i; + + reval_seq = seq_read(udpif->reval_seq); + need_revalidate = udpif->last_reval_seq != reval_seq; + udpif->last_reval_seq = reval_seq; + + n_flows = udpif_get_n_flows(udpif); + udpif->max_n_flows = MAX(n_flows, udpif->max_n_flows); + udpif->avg_n_flows = (udpif->avg_n_flows + n_flows) / 2; + + atomic_read(&udpif->flow_limit, &flow_limit); + if (n_flows < flow_limit / 8) { + max_idle = 5000; + } else if (n_flows < flow_limit / 4) { + max_idle = 2000; + } else if (n_flows < flow_limit / 2) { + max_idle = 1000; + } else { + max_idle = 500; + } + atomic_store(&udpif->max_idle, max_idle); + + start_time = time_msec(); + dpif_flow_dump_start(&dump, udpif->dpif); + while (dpif_flow_dump_next(&dump, &key, &key_len, &mask, &mask_len, + NULL, NULL, &stats) + && !latch_is_set(&udpif->exit_latch)) { + struct udpif_flow_dump *udump = xmalloc(sizeof *udump); + struct revalidator *revalidator; + + udump->key_hash = hash_bytes(key, key_len, udpif->secret); + memcpy(&udump->key_buf, key, key_len); + udump->key = (struct nlattr *) &udump->key_buf; + udump->key_len = key_len; + + memcpy(&udump->mask_buf, mask, mask_len); + udump->mask = (struct nlattr *) &udump->mask_buf; + udump->mask_len = mask_len; + + udump->stats = *stats; + udump->need_revalidate = need_revalidate; + + revalidator = &udpif->revalidators[udump->key_hash + % udpif->n_revalidators]; + + ovs_mutex_lock(&revalidator->mutex); + while (revalidator->n_udumps >= REVALIDATE_MAX_BATCH * 3 + && !latch_is_set(&udpif->exit_latch)) { + ovs_mutex_cond_wait(&revalidator->wake_cond, + &revalidator->mutex); + } + list_push_back(&revalidator->udumps, &udump->list_node); + revalidator->n_udumps++; + xpthread_cond_signal(&revalidator->wake_cond); + ovs_mutex_unlock(&revalidator->mutex); + } + dpif_flow_dump_done(&dump); + + /* Let all the revalidators finish and garbage collect. */ + seq_change(udpif->dump_seq); + for (i = 0; i < udpif->n_revalidators; i++) { + struct revalidator *revalidator = &udpif->revalidators[i]; + ovs_mutex_lock(&revalidator->mutex); + xpthread_cond_signal(&revalidator->wake_cond); + ovs_mutex_unlock(&revalidator->mutex); + } -/* Retrieves the next drop key which ofproto-dpif needs to process. The caller - * is responsible for destroying it with drop_key_destroy(). */ -struct drop_key * -drop_key_next(struct udpif *udpif) -{ - struct list *next = guarded_list_pop_front(&udpif->drop_keys); - return next ? CONTAINER_OF(next, struct drop_key, list_node) : NULL; -} + for (i = 0; i < udpif->n_revalidators; i++) { + struct revalidator *revalidator = &udpif->revalidators[i]; -/* Destroys and deallocates 'drop_key'. 
*/ -void -drop_key_destroy(struct drop_key *drop_key) -{ - if (drop_key) { - free(drop_key->key); - free(drop_key); - } -} + ovs_mutex_lock(&revalidator->mutex); + while (revalidator->dump_seq != seq_read(udpif->dump_seq) + && !latch_is_set(&udpif->exit_latch)) { + ovs_mutex_cond_wait(&revalidator->wake_cond, + &revalidator->mutex); + } + ovs_mutex_unlock(&revalidator->mutex); + } -/* Clears all drop keys waiting to be processed by drop_key_next(). */ -void -udpif_drop_key_clear(struct udpif *udpif) -{ - struct drop_key *drop_key, *next; - struct list list; + duration = time_msec() - start_time; + udpif->dump_duration = duration; + if (duration > 2000) { + flow_limit /= duration / 1000; + } else if (duration > 1300) { + flow_limit = flow_limit * 3 / 4; + } else if (duration < 1000 && n_flows > 2000 + && flow_limit < n_flows * 1000 / duration) { + flow_limit += 1000; + } + flow_limit = MIN(ofproto_flow_limit, MAX(flow_limit, 1000)); + atomic_store(&udpif->flow_limit, flow_limit); - guarded_list_pop_all(&udpif->drop_keys, &list); - LIST_FOR_EACH_SAFE (drop_key, next, list_node, &list) { - list_remove(&drop_key->list_node); - drop_key_destroy(drop_key); - } -} - -/* The dispatcher thread is responsible for receiving upcalls from the kernel, - * assigning them to a upcall_handler thread. */ -static void * -udpif_dispatcher(void *arg) -{ - struct udpif *udpif = arg; + if (duration > 2000) { + VLOG_WARN("Spent an unreasonably long %lldms dumping flows", + duration); + } - set_subprogram_name("dispatcher"); - while (!latch_is_set(&udpif->exit_latch)) { - recv_upcalls(udpif); - dpif_recv_wait(udpif->dpif); + poll_timer_wait_until(start_time + MIN(max_idle, 500)); + seq_wait(udpif->reval_seq, udpif->last_reval_seq); latch_wait(&udpif->exit_latch); poll_block(); } @@ -447,18 +650,57 @@ udpif_upcall_handler(void *arg) } ovs_mutex_unlock(&handler->mutex); - handle_upcalls(handler->udpif, &misses); + handle_upcalls(handler, &misses); coverage_clear(); } } - -static void -miss_destroy(struct flow_miss *miss) + +static void * +udpif_revalidator(void *arg) { - xlate_out_uninit(&miss->xout); -} + struct revalidator *revalidator = arg; + revalidator->name = xasprintf("revalidator_%u", ovsthread_id_self()); + set_subprogram_name("%s", revalidator->name); + for (;;) { + struct list udumps = LIST_INITIALIZER(&udumps); + struct udpif *udpif = revalidator->udpif; + size_t i; + + ovs_mutex_lock(&revalidator->mutex); + if (latch_is_set(&udpif->exit_latch)) { + ovs_mutex_unlock(&revalidator->mutex); + return NULL; + } + + if (!revalidator->n_udumps) { + if (revalidator->dump_seq != seq_read(udpif->dump_seq)) { + revalidator->dump_seq = seq_read(udpif->dump_seq); + revalidator_sweep(revalidator); + } else { + ovs_mutex_cond_wait(&revalidator->wake_cond, + &revalidator->mutex); + } + } + + for (i = 0; i < REVALIDATE_MAX_BATCH && revalidator->n_udumps; i++) { + list_push_back(&udumps, list_pop_front(&revalidator->udumps)); + revalidator->n_udumps--; + } + + /* Wake up the flow dumper. */ + xpthread_cond_signal(&revalidator->wake_cond); + ovs_mutex_unlock(&revalidator->mutex); + + if (!list_is_empty(&udumps)) { + revalidate_udumps(revalidator, &udumps); + } + } + + return NULL; +} + static enum upcall_type classify_upcall(const struct upcall *upcall) { @@ -601,6 +843,27 @@ recv_upcalls(struct udpif *udpif) } } +/* Calculates slow path actions for 'xout'. 'buf' must statically be + * initialized with at least 128 bytes of space. 
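+ *
+ * The result is a single userspace action, composed via
+ * odp_put_userspace_action(), whose cookie records the slow-path reason;
+ * packets matching the datapath flow are sent back up to ovs-vswitchd
+ * (using the netlink PID from dpif_port_get_pid()) instead of being
+ * forwarded in the datapath.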
*/ +static void +compose_slow_path(struct udpif *udpif, struct xlate_out *xout, + odp_port_t odp_in_port, struct ofpbuf *buf) +{ + union user_action_cookie cookie; + odp_port_t port; + uint32_t pid; + + cookie.type = USER_ACTION_COOKIE_SLOW_PATH; + cookie.slow_path.unused = 0; + cookie.slow_path.reason = xout->slow; + + port = xout->slow & (SLOW_CFM | SLOW_BFD | SLOW_LACP | SLOW_STP) + ? ODPP_NONE + : odp_in_port; + pid = dpif_port_get_pid(udpif->dpif, port); + odp_put_userspace_action(pid, &cookie, sizeof cookie.slow_path, buf); +} + static struct flow_miss * flow_miss_find(struct hmap *todo, const struct ofproto_dpif *ofproto, const struct flow *flow, uint32_t hash) @@ -617,19 +880,26 @@ flow_miss_find(struct hmap *todo, const struct ofproto_dpif *ofproto, } static void -handle_upcalls(struct udpif *udpif, struct list *upcalls) +handle_upcalls(struct handler *handler, struct list *upcalls) { - struct dpif_op *opsp[FLOW_MISS_MAX_BATCH]; - struct dpif_op ops[FLOW_MISS_MAX_BATCH]; + struct hmap misses = HMAP_INITIALIZER(&misses); + struct udpif *udpif = handler->udpif; + + struct flow_miss miss_buf[FLOW_MISS_MAX_BATCH]; + struct dpif_op *opsp[FLOW_MISS_MAX_BATCH * 2]; + struct dpif_op ops[FLOW_MISS_MAX_BATCH * 2]; + struct flow_miss *miss, *next_miss; struct upcall *upcall, *next; - struct flow_miss_batch *fmb; size_t n_misses, n_ops, i; - struct flow_miss *miss; + unsigned int flow_limit; + bool fail_open, may_put; enum upcall_type type; - bool fail_open; - /* Extract the flow from each upcall. Construct in fmb->misses a hash - * table that maps each unique flow to a 'struct flow_miss'. + atomic_read(&udpif->flow_limit, &flow_limit); + may_put = udpif_get_n_flows(udpif) < flow_limit; + + /* Extract the flow from each upcall. Construct in 'misses' a hash table + * that maps each unique flow to a 'struct flow_miss'. * * Most commonly there is a single packet per flow_miss, but there are * several reasons why there might be more than one, e.g.: @@ -647,15 +917,11 @@ handle_upcalls(struct udpif *udpif, struct list *upcalls) * other end of the connection, which gives OVS a chance to set up a * datapath flow.) */ - fmb = xmalloc(sizeof *fmb); - fmb->reval_seq = seq_read(udpif->reval_seq); - hmap_init(&fmb->misses); - list_init(&fmb->upcalls); n_misses = 0; LIST_FOR_EACH_SAFE (upcall, next, list_node, upcalls) { struct dpif_upcall *dupcall = &upcall->dpif_upcall; + struct flow_miss *miss = &miss_buf[n_misses]; struct ofpbuf *packet = &dupcall->packet; - struct flow_miss *miss = &fmb->miss_buf[n_misses]; struct flow_miss *existing_miss; struct ofproto_dpif *ofproto; struct dpif_sflow *sflow; @@ -669,8 +935,6 @@ handle_upcalls(struct udpif *udpif, struct list *upcalls) &ofproto, &ipfix, &sflow, NULL, &odp_in_port); if (error) { if (error == ENODEV) { - struct drop_key *drop_key; - /* Received packet on datapath port for which we couldn't * associate an ofproto. This can happen if a port is removed * while traffic is being received. Print a rate-limited @@ -679,19 +943,9 @@ handle_upcalls(struct udpif *udpif, struct list *upcalls) * in the kernel. 
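                 * (The dpif_flow_put() below installs just such a flow: it
                 * has no actions, so subsequent packets of the same flow are
                 * dropped in the datapath itself instead of being sent up
                 * again.)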
*/ VLOG_INFO_RL(&rl, "received packet on unassociated datapath " "port %"PRIu32, odp_in_port); - - drop_key = xmalloc(sizeof *drop_key); - drop_key->key = xmemdup(dupcall->key, dupcall->key_len); - drop_key->key_len = dupcall->key_len; - - if (guarded_list_push_back(&udpif->drop_keys, - &drop_key->list_node, - MAX_QUEUE_LENGTH)) { - seq_change(udpif->wait_seq); - } else { - COVERAGE_INC(drop_queue_overflow); - drop_key_destroy(drop_key); - } + dpif_flow_put(udpif->dpif, DPIF_FP_CREATE | DPIF_FP_MODIFY, + dupcall->key, dupcall->key_len, NULL, 0, NULL, 0, + NULL); } list_remove(&upcall->list_node); upcall_destroy(upcall); @@ -706,10 +960,10 @@ handle_upcalls(struct udpif *udpif, struct list *upcalls) &flow.tunnel, &flow.in_port, &miss->flow); hash = flow_hash(&miss->flow, 0); - existing_miss = flow_miss_find(&fmb->misses, ofproto, &miss->flow, + existing_miss = flow_miss_find(&misses, ofproto, &miss->flow, hash); if (!existing_miss) { - hmap_insert(&fmb->misses, &miss->hmap_node, hash); + hmap_insert(&misses, &miss->hmap_node, hash); miss->ofproto = ofproto; miss->key = dupcall->key; miss->key_len = dupcall->key_len; @@ -718,6 +972,7 @@ handle_upcalls(struct udpif *udpif, struct list *upcalls) miss->stats.n_bytes = 0; miss->stats.used = time_msec(); miss->stats.tcp_flags = 0; + miss->odp_in_port = odp_in_port; n_misses++; } else { @@ -786,13 +1041,21 @@ handle_upcalls(struct udpif *udpif, struct list *upcalls) * We can't do this in the previous loop because we need the TCP flags for * all the packets in each miss. */ fail_open = false; - HMAP_FOR_EACH (miss, hmap_node, &fmb->misses) { + HMAP_FOR_EACH (miss, hmap_node, &misses) { struct xlate_in xin; xlate_in_init(&xin, miss->ofproto, &miss->flow, NULL, miss->stats.tcp_flags, NULL); xin.may_learn = true; - xin.resubmit_stats = &miss->stats; + + if (miss->upcall_type == DPIF_UC_MISS) { + xin.resubmit_stats = &miss->stats; + } else { + /* For non-miss upcalls, there's a flow in the datapath which this + * packet was accounted to. Presumably the revalidators will deal + * with pushing its stats eventually. 
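+             * (See revalidate_ukey(): it computes the delta between the
+             * stats reported by the datapath and those cached in the
+             * udpif_key, and credits it via xin.resubmit_stats.)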
*/ + } + xlate_actions(&xin, &miss->xout); fail_open = fail_open || miss->xout.fail_open; } @@ -813,6 +1076,9 @@ handle_upcalls(struct udpif *udpif, struct list *upcalls) LIST_FOR_EACH (upcall, list_node, upcalls) { struct flow_miss *miss = upcall->flow_miss; struct ofpbuf *packet = &upcall->dpif_upcall.packet; + struct ofpbuf mask; + struct dpif_op *op; + bool megaflow; if (miss->xout.slow) { struct xlate_in xin; @@ -821,9 +1087,38 @@ handle_upcalls(struct udpif *udpif, struct list *upcalls) xlate_actions_for_side_effects(&xin); } - if (miss->xout.odp_actions.size) { - struct dpif_op *op; + atomic_read(&enable_megaflows, &megaflow); + ofpbuf_use_stack(&mask, &miss->mask_buf, sizeof miss->mask_buf); + if (megaflow) { + odp_flow_key_from_mask(&mask, &miss->xout.wc.masks, &miss->flow, + UINT32_MAX); + } + if (may_put) { + op = &ops[n_ops++]; + op->type = DPIF_OP_FLOW_PUT; + op->u.flow_put.flags = DPIF_FP_CREATE | DPIF_FP_MODIFY; + op->u.flow_put.key = miss->key; + op->u.flow_put.key_len = miss->key_len; + op->u.flow_put.mask = mask.data; + op->u.flow_put.mask_len = mask.size; + op->u.flow_put.stats = NULL; + + if (!miss->xout.slow) { + op->u.flow_put.actions = miss->xout.odp_actions.data; + op->u.flow_put.actions_len = miss->xout.odp_actions.size; + } else { + struct ofpbuf buf; + + ofpbuf_use_stack(&buf, miss->slow_path_buf, + sizeof miss->slow_path_buf); + compose_slow_path(udpif, &miss->xout, miss->odp_in_port, &buf); + op->u.flow_put.actions = buf.data; + op->u.flow_put.actions_len = buf.size; + } + } + + if (miss->xout.odp_actions.size) { if (miss->flow.in_port.ofp_port != vsp_realdev_to_vlandev(miss->ofproto, miss->flow.in_port.ofp_port, @@ -882,17 +1177,287 @@ handle_upcalls(struct udpif *udpif, struct list *upcalls) } dpif_operate(udpif->dpif, opsp, n_ops); - list_move(&fmb->upcalls, upcalls); + HMAP_FOR_EACH_SAFE (miss, next_miss, hmap_node, &misses) { + hmap_remove(&misses, &miss->hmap_node); + xlate_out_uninit(&miss->xout); + } + hmap_destroy(&misses); + + LIST_FOR_EACH_SAFE (upcall, next, list_node, upcalls) { + list_remove(&upcall->list_node); + upcall_destroy(upcall); + } +} + +static struct udpif_key * +ukey_lookup(struct revalidator *revalidator, struct udpif_flow_dump *udump) +{ + struct udpif_key *ukey; + + HMAP_FOR_EACH_WITH_HASH (ukey, hmap_node, udump->key_hash, + &revalidator->ukeys) { + if (ukey->key_len == udump->key_len + && !memcmp(ukey->key, udump->key, udump->key_len)) { + return ukey; + } + } + return NULL; +} + +static void +ukey_delete(struct revalidator *revalidator, struct udpif_key *ukey) +{ + hmap_remove(&revalidator->ukeys, &ukey->hmap_node); + free(ukey); +} + +static bool +revalidate_ukey(struct udpif *udpif, struct udpif_flow_dump *udump, + struct udpif_key *ukey) +{ + struct ofpbuf xout_actions, *actions; + uint64_t slow_path_buf[128 / 8]; + struct xlate_out xout, *xoutp; + struct flow flow, udump_mask; + struct ofproto_dpif *ofproto; + struct dpif_flow_stats push; + uint32_t *udump32, *xout32; + odp_port_t odp_in_port; + struct xlate_in xin; + int error; + size_t i; + bool ok; + + ok = false; + xoutp = NULL; + actions = NULL; + + /* If we don't need to revalidate, we can simply push the stats contained + * in the udump, otherwise we'll have to get the actions so we can check + * them. 
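+     *
+     * In outline: (1) compute the stats delta since the last dump and
+     * credit it to the OpenFlow tables via xlate_actions(); (2) if
+     * revalidation is needed, re-translate the flow and compare the new
+     * actions against those installed in the datapath; (3) check that the
+     * installed mask is at least as specific as the newly computed one:
+     * for each 32-bit word, (dp_mask | xout_mask) == dp_mask must hold,
+     * so the megaflow cannot match packets it shouldn't.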
*/ + if (udump->need_revalidate) { + if (dpif_flow_get(udpif->dpif, ukey->key, ukey->key_len, &actions, + &udump->stats)) { + goto exit; + } + } + + push.used = udump->stats.used; + push.tcp_flags = udump->stats.tcp_flags; + push.n_packets = udump->stats.n_packets > ukey->stats.n_packets + ? udump->stats.n_packets - ukey->stats.n_packets + : 0; + push.n_bytes = udump->stats.n_bytes > ukey->stats.n_bytes + ? udump->stats.n_bytes - ukey->stats.n_bytes + : 0; + ukey->stats = udump->stats; + + if (!push.n_packets && !udump->need_revalidate) { + ok = true; + goto exit; + } + + error = xlate_receive(udpif->backer, NULL, ukey->key, ukey->key_len, &flow, + NULL, &ofproto, NULL, NULL, NULL, &odp_in_port); + if (error) { + goto exit; + } + + xlate_in_init(&xin, ofproto, &flow, NULL, push.tcp_flags, NULL); + xin.resubmit_stats = push.n_packets ? &push : NULL; + xin.may_learn = push.n_packets > 0; + xin.skip_wildcards = !udump->need_revalidate; + xlate_actions(&xin, &xout); + xoutp = &xout; - if (fmb->reval_seq != seq_read(udpif->reval_seq)) { - COVERAGE_INC(fmb_queue_revalidated); - flow_miss_batch_destroy(fmb); - } else if (!guarded_list_push_back(&udpif->fmbs, &fmb->list_node, - MAX_QUEUE_LENGTH)) { - COVERAGE_INC(fmb_queue_overflow); - flow_miss_batch_destroy(fmb); + if (!udump->need_revalidate) { + ok = true; + goto exit; + } + + if (!xout.slow) { + ofpbuf_use_const(&xout_actions, xout.odp_actions.data, + xout.odp_actions.size); } else { - seq_change(udpif->wait_seq); + ofpbuf_use_stack(&xout_actions, slow_path_buf, sizeof slow_path_buf); + compose_slow_path(udpif, &xout, odp_in_port, &xout_actions); + } + + if (!ofpbuf_equal(&xout_actions, actions)) { + goto exit; + } + + if (odp_flow_key_to_mask(udump->mask, udump->mask_len, &udump_mask, &flow) + == ODP_FIT_ERROR) { + goto exit; + } + + /* Since the kernel is free to ignore wildcarded bits in the mask, we can't + * directly check that the masks are the same. Instead we check that the + * mask in the kernel is more specific i.e. less wildcarded, than what + * we've calculated here. This guarantees we don't catch any packets we + * shouldn't with the megaflow. */ + udump32 = (uint32_t *) &udump_mask; + xout32 = (uint32_t *) &xout.wc.masks; + for (i = 0; i < FLOW_U32S; i++) { + if ((udump32[i] | xout32[i]) != udump32[i]) { + goto exit; + } + } + ok = true; + +exit: + ofpbuf_delete(actions); + xlate_out_uninit(xoutp); + return ok; +} + +static void +revalidate_udumps(struct revalidator *revalidator, struct list *udumps) +{ + struct udpif *udpif = revalidator->udpif; + + struct { + struct dpif_flow_stats ukey_stats; /* Stats stored in the ukey. */ + struct dpif_flow_stats stats; /* Stats for 'op'. */ + struct dpif_op op; /* Flow del operation. 
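+                                         * Batched and executed below via
+                                         * dpif_operate().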
*/ + } ops[REVALIDATE_MAX_BATCH]; + + struct dpif_op *opsp[REVALIDATE_MAX_BATCH]; + struct udpif_flow_dump *udump, *next_udump; + size_t n_ops, i, n_flows; + unsigned int flow_limit; + long long int max_idle; + bool must_del; + + atomic_read(&udpif->max_idle, &max_idle); + atomic_read(&udpif->flow_limit, &flow_limit); + + n_flows = udpif_get_n_flows(udpif); + + must_del = false; + if (n_flows > flow_limit) { + must_del = n_flows > 2 * flow_limit; + max_idle = 100; + } + + n_ops = 0; + LIST_FOR_EACH_SAFE (udump, next_udump, list_node, udumps) { + long long int used, now; + struct udpif_key *ukey; + + now = time_msec(); + ukey = ukey_lookup(revalidator, udump); + + used = udump->stats.used; + if (!used && ukey) { + used = ukey->created; + } + + if (must_del || (used && used < now - max_idle)) { + struct dpif_flow_stats *ukey_stats = &ops[n_ops].ukey_stats; + struct dpif_op *op = &ops[n_ops].op; + + op->type = DPIF_OP_FLOW_DEL; + op->u.flow_del.key = udump->key; + op->u.flow_del.key_len = udump->key_len; + op->u.flow_del.stats = &ops[n_ops].stats; + n_ops++; + + if (ukey) { + *ukey_stats = ukey->stats; + ukey_delete(revalidator, ukey); + } else { + memset(ukey_stats, 0, sizeof *ukey_stats); + } + + continue; + } + + if (!ukey) { + ukey = xmalloc(sizeof *ukey); + + ukey->key = (struct nlattr *) &ukey->key_buf; + memcpy(ukey->key, udump->key, udump->key_len); + ukey->key_len = udump->key_len; + + ukey->created = used ? used : now; + memset(&ukey->stats, 0, sizeof ukey->stats); + + ukey->mark = false; + + hmap_insert(&revalidator->ukeys, &ukey->hmap_node, + udump->key_hash); + } + ukey->mark = true; + + if (!revalidate_ukey(udpif, udump, ukey)) { + dpif_flow_del(udpif->dpif, udump->key, udump->key_len, NULL); + ukey_delete(revalidator, ukey); + } + + list_remove(&udump->list_node); + free(udump); + } + + for (i = 0; i < n_ops; i++) { + opsp[i] = &ops[i].op; + } + dpif_operate(udpif->dpif, opsp, n_ops); + + for (i = 0; i < n_ops; i++) { + struct dpif_flow_stats push, *stats, *ukey_stats; + + ukey_stats = &ops[i].ukey_stats; + stats = ops[i].op.u.flow_del.stats; + push.used = MAX(stats->used, ukey_stats->used); + push.tcp_flags = stats->tcp_flags | ukey_stats->tcp_flags; + push.n_packets = stats->n_packets - ukey_stats->n_packets; + push.n_bytes = stats->n_bytes - ukey_stats->n_bytes; + + if (push.n_packets || netflow_exists()) { + struct ofproto_dpif *ofproto; + struct netflow *netflow; + struct flow flow; + + if (!xlate_receive(udpif->backer, NULL, ops[i].op.u.flow_del.key, + ops[i].op.u.flow_del.key_len, &flow, NULL, + &ofproto, NULL, NULL, &netflow, NULL)) { + struct xlate_in xin; + + xlate_in_init(&xin, ofproto, &flow, NULL, push.tcp_flags, + NULL); + xin.resubmit_stats = push.n_packets ? 
                                      &push : NULL;
+                xin.may_learn = push.n_packets > 0;
+                xin.skip_wildcards = true;
+                xlate_actions_for_side_effects(&xin);
+
+                if (netflow) {
+                    netflow_expire(netflow, &flow);
+                    netflow_flow_clear(netflow, &flow);
+                    netflow_unref(netflow);
+                }
+            }
+        }
+    }
+
+    LIST_FOR_EACH_SAFE (udump, next_udump, list_node, udumps) {
+        list_remove(&udump->list_node);
+        free(udump);
+    }
+}
+
+static void
+revalidator_sweep(struct revalidator *revalidator)
+{
+    struct udpif_key *ukey, *next;
+
+    HMAP_FOR_EACH_SAFE (ukey, next, hmap_node, &revalidator->ukeys) {
+        if (ukey->mark) {
+            ukey->mark = false;
+        } else {
+            ukey_delete(revalidator, ukey);
+        }
+    }
+}
@@ -904,9 +1469,21 @@ upcall_unixctl_show(struct unixctl_conn *conn, int argc OVS_UNUSED,
     struct udpif *udpif;
 
     LIST_FOR_EACH (udpif, list_node, &all_udpifs) {
+        unsigned int flow_limit;
+        long long int max_idle;
         size_t i;
 
+        atomic_read(&udpif->flow_limit, &flow_limit);
+        atomic_read(&udpif->max_idle, &max_idle);
+
         ds_put_format(&ds, "%s:\n", dpif_name(udpif->dpif));
+        ds_put_format(&ds, "\tflows : (current %"PRIu64")"
+                      " (avg %u) (max %u) (limit %u)\n", udpif_get_n_flows(udpif),
+                      udpif->avg_n_flows, udpif->max_n_flows, flow_limit);
+        ds_put_format(&ds, "\tmax idle : %lldms\n", max_idle);
+        ds_put_format(&ds, "\tdump duration : %lldms\n", udpif->dump_duration);
+
+        ds_put_char(&ds, '\n');
         for (i = 0; i < udpif->n_handlers; i++) {
             struct handler *handler = &udpif->handlers[i];
 
@@ -915,8 +1492,51 @@ upcall_unixctl_show(struct unixctl_conn *conn, int argc OVS_UNUSED,
                           handler->name, handler->n_upcalls);
             ovs_mutex_unlock(&handler->mutex);
         }
+
+        ds_put_char(&ds, '\n');
+        for (i = 0; i < udpif->n_revalidators; i++) {
+            struct revalidator *revalidator = &udpif->revalidators[i];
+
+            /* XXX: The result of hmap_count(&revalidator->ukeys) may not be
+             * accurate because it's not protected by the revalidator mutex. */
+            ovs_mutex_lock(&revalidator->mutex);
+            ds_put_format(&ds, "\t%s: (dump queue %"PRIuSIZE") (keys %"PRIuSIZE
+                          ")\n", revalidator->name, revalidator->n_udumps,
+                          hmap_count(&revalidator->ukeys));
+            ovs_mutex_unlock(&revalidator->mutex);
+        }
     }
 
     unixctl_command_reply(conn, ds_cstr(&ds));
     ds_destroy(&ds);
 }
+
+/* Disable using megaflows.
+ *
+ * This command is only needed for advanced debugging, so it's not
+ * documented in the man page. */
+static void
+upcall_unixctl_disable_megaflows(struct unixctl_conn *conn,
+                                 int argc OVS_UNUSED,
+                                 const char *argv[] OVS_UNUSED,
+                                 void *aux OVS_UNUSED)
+{
+    atomic_store(&enable_megaflows, false);
+    udpif_flush();
+    unixctl_command_reply(conn, "megaflows disabled");
+}
+
+/* Re-enable using megaflows.
+ *
+ * This command is only needed for advanced debugging, so it's not
+ * documented in the man page.
 */
+static void
+upcall_unixctl_enable_megaflows(struct unixctl_conn *conn,
+                                int argc OVS_UNUSED,
+                                const char *argv[] OVS_UNUSED,
+                                void *aux OVS_UNUSED)
+{
+    atomic_store(&enable_megaflows, true);
+    udpif_flush();
+    unixctl_command_reply(conn, "megaflows enabled");
+}
diff --git a/ofproto/ofproto-dpif-upcall.h b/ofproto/ofproto-dpif-upcall.h
index a4d8228aa..d73ae4c9a 100644
--- a/ofproto/ofproto-dpif-upcall.h
+++ b/ofproto/ofproto-dpif-upcall.h
@@ -15,90 +15,24 @@
 #ifndef OFPROTO_DPIF_UPCALL_H
 #define OFPROTO_DPIF_UPCALL_H
 
-#define FLOW_MISS_MAX_BATCH 50
-
-#include "dpif.h"
-#include "flow.h"
-#include "hmap.h"
-#include "list.h"
-#include "odp-util.h"
-#include "ofpbuf.h"
-#include "ofproto-dpif-xlate.h"
+#include <stddef.h>
 
 struct dpif;
 struct dpif_backer;
+struct seq;
+struct simap;
 
-/* udif is responsible for retrieving upcalls from the kernel, processing miss
- * upcalls, and handing more complex ones up to the main ofproto-dpif
- * module. */
+/* udpif is responsible for retrieving upcalls from the kernel and processing
+ * them.  Additionally, it's responsible for maintaining the datapath flow
+ * table. */
 struct udpif *udpif_create(struct dpif_backer *, struct dpif *);
-void udpif_set_threads(struct udpif *, size_t n_handlers);
+void udpif_set_threads(struct udpif *, size_t n_handlers,
+                       size_t n_revalidators);
 void udpif_destroy(struct udpif *);
-
-void udpif_wait(struct udpif *);
-
 void udpif_revalidate(struct udpif *);
-
 void udpif_get_memory_usage(struct udpif *, struct simap *usage);
-
-/* udpif figures out how to forward packets, and does forward them, but it
- * can't set up datapath flows on its own.  This interface passes packet
- * forwarding data from udpif to the higher level ofproto_dpif to allow the
- * latter to set up datapath flows. */
-
-/* Flow miss batching.
- *
- * Some dpifs implement operations faster when you hand them off in a batch.
- * To allow batching, "struct flow_miss" queues the dpif-related work needed
- * for a given flow.  Each "struct flow_miss" corresponds to sending one or
- * more packets, plus possibly installing the flow in the dpif. */
-struct flow_miss {
-    struct hmap_node hmap_node;
-    struct ofproto_dpif *ofproto;
-
-    struct flow flow;
-    enum odp_key_fitness key_fitness;
-    const struct nlattr *key;
-    size_t key_len;
-    enum dpif_upcall_type upcall_type;
-    struct dpif_flow_stats stats;
-
-    struct xlate_out xout;
-};
-
-struct flow_miss_batch {
-    struct list list_node;
-
-    struct flow_miss miss_buf[FLOW_MISS_MAX_BATCH];
-    struct hmap misses;
-
-    unsigned int reval_seq;
-
-    /* Flow misses refer to the memory held by "struct upcall"s,
-     * so we need to keep track of the upcalls to be able to
-     * free them when done. */
-    struct list upcalls;        /* Contains "struct upcall"s. */
-};
-
-struct flow_miss_batch *flow_miss_batch_next(struct udpif *);
-void flow_miss_batch_destroy(struct flow_miss_batch *);
-
-/* Drop keys are odp flow keys which have drop flows installed in the kernel.
- * These are datapath flows which have no associated ofproto, if they did we
- * would use facets.
- *
- * udpif can't install drop flows by itself. This interfaces allows udpif to
- * pass the drop flows up to ofproto_dpif to get it to install them.
*/ -struct drop_key { - struct hmap_node hmap_node; - struct list list_node; - struct nlattr *key; - size_t key_len; -}; - -struct drop_key *drop_key_next(struct udpif *); -void drop_key_destroy(struct drop_key *); -void udpif_drop_key_clear(struct udpif *); +struct seq *udpif_dump_seq(struct udpif *); +void udpif_flush(void); #endif /* ofproto-dpif-upcall.h */ diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 254453413..befa9f710 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -50,7 +50,6 @@ #include "ofp-actions.h" #include "ofp-parse.h" #include "ofp-print.h" -#include "ofproto-dpif-governor.h" #include "ofproto-dpif-ipfix.h" #include "ofproto-dpif-mirror.h" #include "ofproto-dpif-monitor.h" @@ -71,13 +70,6 @@ VLOG_DEFINE_THIS_MODULE(ofproto_dpif); COVERAGE_DEFINE(ofproto_dpif_expired); -COVERAGE_DEFINE(facet_revalidate); -COVERAGE_DEFINE(facet_unexpected); -COVERAGE_DEFINE(facet_create); -COVERAGE_DEFINE(facet_remove); -COVERAGE_DEFINE(subfacet_create); -COVERAGE_DEFINE(subfacet_destroy); -COVERAGE_DEFINE(subfacet_install_fail); COVERAGE_DEFINE(packet_in_overflow); /* Number of implemented OpenFlow tables. */ @@ -86,24 +78,14 @@ enum { TBL_INTERNAL = N_TABLES - 1 }; /* Used for internal hidden rules. */ BUILD_ASSERT_DECL(N_TABLES >= 2 && N_TABLES <= 255); struct flow_miss; -struct facet; struct rule_dpif { struct rule up; /* These statistics: * - * - Do include packets and bytes from facets that have been deleted or - * whose own statistics have been folded into the rule. - * - * - Do include packets and bytes sent "by hand" that were accounted to - * the rule without any facet being involved (this is a rare corner - * case in rule_execute()). - * - * - Do not include packet or bytes that can be obtained from any facet's - * packet_count or byte_count member or that can be obtained from the - * datapath by, e.g., dpif_flow_get() for any subfacet. - */ + * - Do include packets and bytes from datapath flows which have not + * recently been processed by a revalidator. */ struct ovs_mutex stats_mutex; uint64_t packet_count OVS_GUARDED; /* Number of packets received. */ uint64_t byte_count OVS_GUARDED; /* Number of bytes received. */ @@ -111,23 +93,15 @@ struct rule_dpif { static void rule_get_stats(struct rule *, uint64_t *packets, uint64_t *bytes); static struct rule_dpif *rule_dpif_cast(const struct rule *); +static void rule_expire(struct rule_dpif *); struct group_dpif { struct ofgroup up; /* These statistics: * - * - Do include packets and bytes from facets that have been deleted or - * whose own statistics have been folded into the rule. - * - * - Do include packets and bytes sent "by hand" that were accounted to - * the rule without any facet being involved (this is a rare corner - * case in rule_execute()). - * - * - Do not include packet or bytes that can be obtained from any facet's - * packet_count or byte_count member or that can be obtained from the - * datapath by, e.g., dpif_flow_get() for any subfacet. - */ + * - Do include packets and bytes from datapath flows which have not + * recently been processed by a revalidator. */ struct ovs_mutex stats_mutex; uint64_t packet_count OVS_GUARDED; /* Number of packets received. */ uint64_t byte_count OVS_GUARDED; /* Number of bytes received. 
*/ @@ -166,161 +140,6 @@ static void stp_wait(struct ofproto_dpif *ofproto); static int set_stp_port(struct ofport *, const struct ofproto_port_stp_settings *); -static void compose_slow_path(const struct ofproto_dpif *, const struct flow *, - enum slow_path_reason, - uint64_t *stub, size_t stub_size, - const struct nlattr **actionsp, - size_t *actions_lenp); - -/* A subfacet (see "struct subfacet" below) has three possible installation - * states: - * - * - SF_NOT_INSTALLED: Not installed in the datapath. This will only be the - * case just after the subfacet is created, just before the subfacet is - * destroyed, or if the datapath returns an error when we try to install a - * subfacet. - * - * - SF_FAST_PATH: The subfacet's actions are installed in the datapath. - * - * - SF_SLOW_PATH: An action that sends every packet for the subfacet through - * ofproto_dpif is installed in the datapath. - */ -enum subfacet_path { - SF_NOT_INSTALLED, /* No datapath flow for this subfacet. */ - SF_FAST_PATH, /* Full actions are installed. */ - SF_SLOW_PATH, /* Send-to-userspace action is installed. */ -}; - -/* A dpif flow and actions associated with a facet. - * - * See also the large comment on struct facet. */ -struct subfacet { - /* Owners. */ - struct hmap_node hmap_node; /* In struct ofproto_dpif 'subfacets' list. */ - struct list list_node; /* In struct facet's 'facets' list. */ - struct facet *facet; /* Owning facet. */ - struct dpif_backer *backer; /* Owning backer. */ - - struct nlattr *key; - int key_len; - - long long int used; /* Time last used; time created if not used. */ - long long int created; /* Time created. */ - - uint64_t dp_packet_count; /* Last known packet count in the datapath. */ - uint64_t dp_byte_count; /* Last known byte count in the datapath. */ - - enum subfacet_path path; /* Installed in datapath? */ -}; - -#define SUBFACET_DESTROY_MAX_BATCH 50 - -static struct subfacet *subfacet_create(struct facet *, struct flow_miss *, - uint32_t key_hash); -static struct subfacet *subfacet_find(struct dpif_backer *, - const struct nlattr *key, size_t key_len, - uint32_t key_hash); -static void subfacet_destroy(struct subfacet *); -static void subfacet_destroy__(struct subfacet *); -static void subfacet_destroy_batch(struct dpif_backer *, - struct subfacet **, int n); -static void subfacet_reset_dp_stats(struct subfacet *, - struct dpif_flow_stats *); -static void subfacet_update_stats(struct subfacet *, - const struct dpif_flow_stats *); -static int subfacet_install(struct subfacet *, - const struct ofpbuf *odp_actions, - struct dpif_flow_stats *); -static void subfacet_uninstall(struct subfacet *); - -/* A unique, non-overlapping instantiation of an OpenFlow flow. - * - * A facet associates a "struct flow", which represents the Open vSwitch - * userspace idea of an exact-match flow, with one or more subfacets. - * While the facet is created based on an exact-match flow, it is stored - * within the ofproto based on the wildcards that could be expressed - * based on the flow table and other configuration. (See the 'wc' - * description in "struct xlate_out" for more details.) - * - * Each subfacet tracks the datapath's idea of the flow equivalent to - * the facet. When the kernel module (or other dpif implementation) and - * Open vSwitch userspace agree on the definition of a flow key, there - * is exactly one subfacet per facet. If the dpif implementation - * supports more-specific flow matching than userspace, however, a facet - * can have more than one subfacet. 
Examples include the dpif - * implementation not supporting the same wildcards as userspace or some - * distinction in flow that userspace simply doesn't understand. - * - * Flow expiration works in terms of subfacets, so a facet must have at - * least one subfacet or it will never expire, leaking memory. */ -struct facet { - /* Owner. */ - struct ofproto_dpif *ofproto; - - /* Owned data. */ - struct list subfacets; - long long int used; /* Time last used; time created if not used. */ - - /* Key. */ - struct flow flow; /* Flow of the creating subfacet. */ - struct cls_rule cr; /* In 'ofproto_dpif's facets classifier. */ - - /* These statistics: - * - * - Do include packets and bytes sent "by hand", e.g. with - * dpif_execute(). - * - * - Do include packets and bytes that were obtained from the datapath - * when a subfacet's statistics were reset (e.g. dpif_flow_put() with - * DPIF_FP_ZERO_STATS). - * - * - Do not include packets or bytes that can be obtained from the - * datapath for any existing subfacet. - */ - uint64_t packet_count; /* Number of packets received. */ - uint64_t byte_count; /* Number of bytes received. */ - - /* Resubmit statistics. */ - uint64_t prev_packet_count; /* Number of packets from last stats push. */ - uint64_t prev_byte_count; /* Number of bytes from last stats push. */ - long long int prev_used; /* Used time from last stats push. */ - - /* Accounting. */ - uint16_t tcp_flags; /* TCP flags seen for this 'rule'. */ - - struct xlate_out xout; - - /* Storage for a single subfacet, to reduce malloc() time and space - * overhead. (A facet always has at least one subfacet and in the common - * case has exactly one subfacet. However, 'one_subfacet' may not - * always be valid, since it could have been removed after newer - * subfacets were pushed onto the 'subfacets' list.) */ - struct subfacet one_subfacet; - - long long int learn_rl; /* Rate limiter for facet_learn(). */ -}; - -static struct facet *facet_create(const struct flow_miss *); -static void facet_remove(struct facet *); -static void facet_free(struct facet *); - -static struct facet *facet_find(struct ofproto_dpif *, const struct flow *); -static struct facet *facet_lookup_valid(struct ofproto_dpif *, - const struct flow *); -static bool facet_revalidate(struct facet *); -static bool facet_check_consistency(struct facet *); - -static void facet_flush_stats(struct facet *); - -static void facet_reset_counters(struct facet *); -static void flow_push_stats(struct ofproto_dpif *, struct flow *, - struct dpif_flow_stats *, bool may_learn); -static void facet_push_stats(struct facet *, bool may_learn); -static void facet_learn(struct facet *); -static void push_all_stats(void); - -static bool facet_is_controller_flow(struct facet *); - struct ofport_dpif { struct hmap_node odp_port_node; /* In dpif_backer's "odp_to_ofport_map". */ struct ofport up; @@ -394,8 +213,8 @@ struct dpif_completion { struct ofoperation *op; }; -/* Reasons that we might need to revalidate every facet, and corresponding - * coverage counters. +/* Reasons that we might need to revalidate every datapath flow, and + * corresponding coverage counters. * * A value of 0 means that there is no need to revalidate. * @@ -409,7 +228,6 @@ enum revalidate_reason { REV_PORT_TOGGLED, /* Port enabled or disabled by CFM, LACP, ...*/ REV_FLOW_TABLE, /* Flow table changed. */ REV_MAC_LEARNING, /* Mac learning changed. */ - REV_INCONSISTENCY /* Facet self-check failed. 
*/ }; COVERAGE_DEFINE(rev_reconfigure); COVERAGE_DEFINE(rev_stp); @@ -417,7 +235,6 @@ COVERAGE_DEFINE(rev_bond); COVERAGE_DEFINE(rev_port_toggled); COVERAGE_DEFINE(rev_flow_table); COVERAGE_DEFINE(rev_mac_learning); -COVERAGE_DEFINE(rev_inconsistency); /* All datapaths of a given type share a single dpif backer instance. */ struct dpif_backer { @@ -425,43 +242,27 @@ struct dpif_backer { int refcount; struct dpif *dpif; struct udpif *udpif; - struct timer next_expiration; struct ovs_rwlock odp_to_ofport_lock; struct hmap odp_to_ofport_map OVS_GUARDED; /* Contains "struct ofport"s. */ struct simap tnl_backers; /* Set of dpif ports backing tunnels. */ - /* Facet revalidation flags applying to facets which use this backer. */ - enum revalidate_reason need_revalidate; /* Revalidate every facet. */ + enum revalidate_reason need_revalidate; /* Revalidate all flows. */ - struct hmap drop_keys; /* Set of dropped odp keys. */ bool recv_set_enable; /* Enables or disables receiving packets. */ - - struct hmap subfacets; - struct governor *governor; - - /* Subfacet statistics. - * - * These keep track of the total number of subfacets added and deleted and - * flow life span. They are useful for computing the flow rates stats - * exposed via "ovs-appctl dpif/show". The goal is to learn about - * traffic patterns in ways that we can use later to improve Open vSwitch - * performance in new situations. */ - unsigned max_n_subfacet; /* Maximum number of flows */ - unsigned avg_n_subfacet; /* Average number of flows. */ }; /* All existing ofproto_backer instances, indexed by ofproto->up.type. */ static struct shash all_dpif_backers = SHASH_INITIALIZER(&all_dpif_backers); -static void drop_key_clear(struct dpif_backer *); - struct ofproto_dpif { struct hmap_node all_ofproto_dpifs_node; /* In 'all_ofproto_dpifs'. */ struct ofproto up; struct dpif_backer *backer; + uint64_t dump_seq; /* Last read of udpif_dump_seq(). */ + /* Special OpenFlow rules. */ struct rule_dpif *miss_rule; /* Sends flow table misses to controller. */ struct rule_dpif *no_packet_in_rule; /* Drops flow table misses. */ @@ -477,10 +278,6 @@ struct ofproto_dpif { bool lacp_enabled; struct mbridge *mbridge; - /* Facets. */ - struct classifier facets; /* Contains 'struct facet's. */ - long long int consistency_rl; - struct ovs_mutex stats_mutex; struct netdev_stats stats OVS_GUARDED; /* To account packets generated and * consumed in userspace. */ @@ -501,18 +298,10 @@ struct ofproto_dpif { int port_poll_errno; /* Last errno for port_poll() reply. */ uint64_t change_seq; /* Connectivity status changes. */ - /* Per ofproto's dpif stats. */ - uint64_t n_hit; - uint64_t n_missed; - /* Work queues. */ struct guarded_list pins; /* Contains "struct ofputil_packet_in"s. */ }; -/* By default, flows in the datapath are wildcarded (megaflows). They - * may be disabled with the "ovs-appctl dpif/disable-megaflows" command. */ -static bool enable_megaflows = true; - /* All existing ofproto_dpif instances, indexed by ->up.name. */ static struct hmap all_ofproto_dpifs = HMAP_INITIALIZER(&all_ofproto_dpifs); @@ -532,12 +321,6 @@ static void ofproto_trace(struct ofproto_dpif *, const struct flow *, const struct ofpact[], size_t ofpacts_len, struct ds *); -/* Upcalls. */ -static void handle_upcalls(struct dpif_backer *); - -/* Flow expiration. */ -static int expire(struct dpif_backer *); - /* Global variables. 
*/ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); @@ -653,7 +436,6 @@ lookup_ofproto_dpif_by_port_name(const char *name) static int type_run(const char *type) { - static long long int push_timer = LLONG_MIN; struct dpif_backer *backer; backer = shash_find_data(&all_dpif_backers, type); @@ -665,18 +447,6 @@ type_run(const char *type) dpif_run(backer->dpif); - handle_upcalls(backer); - - /* The most natural place to push facet statistics is when they're pulled - * from the datapath. However, when there are many flows in the datapath, - * this expensive operation can occur so frequently, that it reduces our - * ability to quickly set up flows. To reduce the cost, we push statistics - * here instead. */ - if (time_msec() > push_timer) { - push_timer = time_msec() + 2000; - push_all_stats(); - } - /* If vswitchd started with other_config:flow_restore_wait set as "true", * and the configuration has now changed to "false", enable receiving * packets from the datapath. */ @@ -695,7 +465,7 @@ type_run(const char *type) } if (backer->recv_set_enable) { - udpif_set_threads(backer->udpif, n_handlers); + udpif_set_threads(backer->udpif, n_handlers, n_revalidators); } if (backer->need_revalidate) { @@ -763,18 +533,11 @@ type_run(const char *type) case REV_PORT_TOGGLED: COVERAGE_INC(rev_port_toggled); break; case REV_FLOW_TABLE: COVERAGE_INC(rev_flow_table); break; case REV_MAC_LEARNING: COVERAGE_INC(rev_mac_learning); break; - case REV_INCONSISTENCY: COVERAGE_INC(rev_inconsistency); break; } backer->need_revalidate = 0; - /* Clear the drop_keys in case we should now be accepting some - * formerly dropped flows. */ - drop_key_clear(backer); - HMAP_FOR_EACH (ofproto, all_ofproto_dpifs_node, &all_ofproto_dpifs) { - struct facet *facet, *next; struct ofport_dpif *ofport; - struct cls_cursor cursor; struct ofbundle *bundle; if (ofproto->backer != backer) { @@ -812,48 +575,13 @@ type_run(const char *type) ofport->is_tunnel, ofport->may_enable); } ovs_rwlock_unlock(&xlate_rwlock); - - /* Only ofproto-dpif cares about the facet classifier so we just - * lock cls_cursor_init() to appease the thread safety analysis. */ - ovs_rwlock_rdlock(&ofproto->facets.rwlock); - cls_cursor_init(&cursor, &ofproto->facets, NULL); - ovs_rwlock_unlock(&ofproto->facets.rwlock); - CLS_CURSOR_FOR_EACH_SAFE (facet, next, cr, &cursor) { - facet_revalidate(facet); - } } udpif_revalidate(backer->udpif); } - if (!backer->recv_set_enable) { - /* Wake up before a max of 1000ms. */ - timer_set_duration(&backer->next_expiration, 1000); - } else if (timer_expired(&backer->next_expiration)) { - int delay = expire(backer); - timer_set_duration(&backer->next_expiration, delay); - } - process_dpif_port_changes(backer); - if (backer->governor) { - size_t n_subfacets; - - governor_run(backer->governor); - - /* If the governor has shrunk to its minimum size and the number of - * subfacets has dwindled, then drop the governor entirely. - * - * For hysteresis, the number of subfacets to drop the governor is - * smaller than the number needed to trigger its creation. */ - n_subfacets = hmap_count(&backer->subfacets); - if (n_subfacets * 4 < flow_eviction_threshold - && governor_is_idle(backer->governor)) { - governor_destroy(backer->governor); - backer->governor = NULL; - } - } - return 0; } @@ -998,13 +726,7 @@ type_wait(const char *type) return; } - if (backer->governor) { - governor_wait(backer->governor); - } - - timer_wait(&backer->next_expiration); dpif_wait(backer->dpif); - udpif_wait(backer->udpif); } /* Basic life-cycle. 
*/ @@ -1034,9 +756,6 @@ close_dpif_backer(struct dpif_backer *backer) return; } - drop_key_clear(backer); - hmap_destroy(&backer->drop_keys); - udpif_destroy(backer->udpif); simap_destroy(&backer->tnl_backers); @@ -1046,10 +765,6 @@ close_dpif_backer(struct dpif_backer *backer) free(backer->type); dpif_close(backer->dpif); - ovs_assert(hmap_is_empty(&backer->subfacets)); - hmap_destroy(&backer->subfacets); - governor_destroy(backer->governor); - free(backer); } @@ -1116,13 +831,9 @@ open_dpif_backer(const char *type, struct dpif_backer **backerp) backer->udpif = udpif_create(backer, backer->dpif); backer->type = xstrdup(type); - backer->governor = NULL; backer->refcount = 1; hmap_init(&backer->odp_to_ofport_map); ovs_rwlock_init(&backer->odp_to_ofport_lock); - hmap_init(&backer->drop_keys); - hmap_init(&backer->subfacets); - timer_set_duration(&backer->next_expiration, 1000); backer->need_revalidate = 0; simap_init(&backer->tnl_backers); backer->recv_set_enable = !ofproto_get_flow_restore_wait(); @@ -1163,12 +874,9 @@ open_dpif_backer(const char *type, struct dpif_backer **backerp) } if (backer->recv_set_enable) { - udpif_set_threads(backer->udpif, n_handlers); + udpif_set_threads(backer->udpif, n_handlers, n_revalidators); } - backer->max_n_subfacet = 0; - backer->avg_n_subfacet = 0; - return error; } @@ -1188,6 +896,7 @@ construct(struct ofproto *ofproto_) ofproto->sflow = NULL; ofproto->ipfix = NULL; ofproto->stp = NULL; + ofproto->dump_seq = 0; hmap_init(&ofproto->bundles); ofproto->ml = mac_learning_create(MAC_ENTRY_DEFAULT_IDLE_TIME); ofproto->mbridge = mbridge_create(); @@ -1196,9 +905,6 @@ construct(struct ofproto *ofproto_) ovs_mutex_init(&ofproto->stats_mutex); ovs_mutex_init(&ofproto->vsp_mutex); - classifier_init(&ofproto->facets, NULL); - ofproto->consistency_rl = LLONG_MIN; - guarded_list_init(&ofproto->pins); ofproto_dpif_unixctl_init(); @@ -1236,9 +942,6 @@ construct(struct ofproto *ofproto_) error = add_internal_flows(ofproto); ofproto->up.tables[TBL_INTERNAL].flags = OFTABLE_HIDDEN | OFTABLE_READONLY; - ofproto->n_hit = 0; - ofproto->n_missed = 0; - return error; } @@ -1324,18 +1027,9 @@ destruct(struct ofproto *ofproto_) struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_); struct rule_dpif *rule, *next_rule; struct ofproto_packet_in *pin, *next_pin; - struct facet *facet, *next_facet; - struct cls_cursor cursor; struct oftable *table; struct list pins; - ovs_rwlock_rdlock(&ofproto->facets.rwlock); - cls_cursor_init(&cursor, &ofproto->facets, NULL); - ovs_rwlock_unlock(&ofproto->facets.rwlock); - CLS_CURSOR_FOR_EACH_SAFE (facet, next_facet, cr, &cursor) { - facet_remove(facet); - } - ofproto->backer->need_revalidate = REV_RECONFIGURE; ovs_rwlock_wrlock(&xlate_rwlock); xlate_remove_ofproto(ofproto); @@ -1373,8 +1067,6 @@ destruct(struct ofproto *ofproto_) hmap_destroy(&ofproto->bundles); mac_learning_unref(ofproto->ml); - classifier_destroy(&ofproto->facets); - hmap_destroy(&ofproto->vlandev_map); hmap_destroy(&ofproto->realdev_vid_map); @@ -1392,7 +1084,7 @@ static int run(struct ofproto *ofproto_) { struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_); - uint64_t new_seq; + uint64_t new_seq, new_dump_seq; if (mbridge_need_revalidate(ofproto->mbridge)) { ofproto->backer->need_revalidate = REV_RECONFIGURE; @@ -1451,28 +1143,35 @@ run(struct ofproto *ofproto_) } ovs_rwlock_unlock(&ofproto->ml->rwlock); - /* Check the consistency of a random facet, to aid debugging. 
*/ - ovs_rwlock_rdlock(&ofproto->facets.rwlock); - if (time_msec() >= ofproto->consistency_rl - && !classifier_is_empty(&ofproto->facets) - && !ofproto->backer->need_revalidate) { - struct cls_subtable *table; - struct cls_rule *cr; - struct facet *facet; - - ofproto->consistency_rl = time_msec() + 250; - - table = CONTAINER_OF(hmap_random_node(&ofproto->facets.subtables), - struct cls_subtable, hmap_node); - cr = CONTAINER_OF(hmap_random_node(&table->rules), struct cls_rule, - hmap_node); - facet = CONTAINER_OF(cr, struct facet, cr); - - if (!facet_check_consistency(facet)) { - ofproto->backer->need_revalidate = REV_INCONSISTENCY; + new_dump_seq = seq_read(udpif_dump_seq(ofproto->backer->udpif)); + if (ofproto->dump_seq != new_dump_seq) { + struct rule *rule, *next_rule; + + /* We know stats are relatively fresh, so now is a good time to do some + * periodic work. */ + ofproto->dump_seq = new_dump_seq; + + /* Expire OpenFlow flows whose idle_timeout or hard_timeout + * has passed. */ + ovs_mutex_lock(&ofproto_mutex); + LIST_FOR_EACH_SAFE (rule, next_rule, expirable, + &ofproto->up.expirable) { + rule_expire(rule_dpif_cast(rule)); + } + ovs_mutex_unlock(&ofproto_mutex); + + /* All outstanding data in existing flows has been accounted, so it's a + * good time to do bond rebalancing. */ + if (ofproto->has_bonded_bundles) { + struct ofbundle *bundle; + + HMAP_FOR_EACH (bundle, hmap_node, &ofproto->bundles) { + if (bundle->bond) { + bond_rebalance(bundle->bond); + } + } } } - ovs_rwlock_unlock(&ofproto->facets.rwlock); return 0; } @@ -1511,27 +1210,8 @@ wait(struct ofproto *ofproto_) VLOG_DBG_RL(&rl, "need revalidate in ofproto_wait_cb()"); poll_immediate_wake(); } -} -static void -get_memory_usage(const struct ofproto *ofproto_, struct simap *usage) -{ - const struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_); - struct cls_cursor cursor; - size_t n_subfacets = 0; - struct facet *facet; - - ovs_rwlock_rdlock(&ofproto->facets.rwlock); - simap_increase(usage, "facets", classifier_count(&ofproto->facets)); - ovs_rwlock_unlock(&ofproto->facets.rwlock); - - ovs_rwlock_rdlock(&ofproto->facets.rwlock); - cls_cursor_init(&cursor, &ofproto->facets, NULL); - CLS_CURSOR_FOR_EACH (facet, cr, &cursor) { - n_subfacets += list_size(&facet->subfacets); - } - ovs_rwlock_unlock(&ofproto->facets.rwlock); - simap_increase(usage, "subfacets", n_subfacets); + seq_wait(udpif_dump_seq(ofproto->backer->udpif), ofproto->dump_seq); } static void @@ -1546,34 +1226,9 @@ type_get_memory_usage(const char *type, struct simap *usage) } static void -flush(struct ofproto *ofproto_) +flush(struct ofproto *ofproto OVS_UNUSED) { - struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_); - struct subfacet *subfacet, *next_subfacet; - struct subfacet *batch[SUBFACET_DESTROY_MAX_BATCH]; - int n_batch; - - n_batch = 0; - HMAP_FOR_EACH_SAFE (subfacet, next_subfacet, hmap_node, - &ofproto->backer->subfacets) { - if (subfacet->facet->ofproto != ofproto) { - continue; - } - - if (subfacet->path != SF_NOT_INSTALLED) { - batch[n_batch++] = subfacet; - if (n_batch >= SUBFACET_DESTROY_MAX_BATCH) { - subfacet_destroy_batch(ofproto->backer, batch, n_batch); - n_batch = 0; - } - } else { - subfacet_destroy(subfacet); - } - } - - if (n_batch > 0) { - subfacet_destroy_batch(ofproto->backer, batch, n_batch); - } + udpif_flush(); } static void @@ -2706,7 +2361,6 @@ static int mirror_get_stats__(struct ofproto *ofproto, void *aux, uint64_t *packets, uint64_t *bytes) { - push_all_stats(); return 
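The new run() hunk above keys all periodic work off a 'struct seq' that the revalidators bump after each datapath dump: run() acts only when seq_read() returns a value different from the cached one, and wait() hands that cached value to seq_wait() so the main loop sleeps until the next bump. The same idiom in isolation (a sketch; 'struct poller' is illustrative, while seq_read()/seq_wait() are the lib/seq.h API):

    #include <stdbool.h>
    #include <stdint.h>

    #include "seq.h"

    struct poller {
        uint64_t last_seq;          /* Last sequence value acted upon. */
    };

    /* Returns true, and caches the new value, if 'seq' has changed since
     * the previous call; the caller then runs its periodic work, just as
     * run() does when dump_seq changes. */
    static bool
    poller_run(struct poller *p, const struct seq *seq)
    {
        uint64_t new_seq = seq_read(seq);

        if (p->last_seq != new_seq) {
            p->last_seq = new_seq;
            return true;
        }
        return false;
    }

    /* Makes the next poll_block() wake when 'seq' changes, as wait() does. */
    static void
    poller_wait(const struct poller *p, const struct seq *seq)
    {
        seq_wait(seq, p->last_seq);
    }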
mirror_get_stats(ofproto_dpif_cast(ofproto)->mbridge, aux, packets, bytes); } @@ -2970,8 +2624,6 @@ port_get_stats(const struct ofport *ofport_, struct netdev_stats *stats) struct ofport_dpif *ofport = ofport_dpif_cast(ofport_); int error; - push_all_stats(); - error = netdev_get_stats(ofport->up.netdev, stats); if (!error && ofport_->ofp_port == OFPP_LOCAL) { @@ -3108,1372 +2760,127 @@ port_is_lacp_current(const struct ofport *ofport_) : -1); } -/* Upcall handling. */ - -struct flow_miss_op { - struct dpif_op dpif_op; - - uint64_t slow_stub[128 / 8]; /* Buffer for compose_slow_path() */ - struct xlate_out xout; - bool xout_garbage; /* 'xout' needs to be uninitialized? */ - - struct ofpbuf mask; /* Flow mask for "put" ops. */ - struct odputil_keybuf maskbuf; - - /* If this is a "put" op, then a pointer to the subfacet that should - * be marked as uninstalled if the operation fails. */ - struct subfacet *subfacet; -}; - -/* Figures out whether a flow that missed in 'ofproto', whose details are in - * 'miss' masked by 'wc', is likely to be worth tracking in detail in userspace - * and (usually) installing a datapath flow. The answer is usually "yes" (a - * return value of true). However, for short flows the cost of bookkeeping is - * much higher than the benefits, so when the datapath holds a large number of - * flows we impose some heuristics to decide which flows are likely to be worth - * tracking. */ -static bool -flow_miss_should_make_facet(struct flow_miss *miss) +/* If 'rule' is an OpenFlow rule that has expired according to OpenFlow rules, + * then delete it entirely. */ +static void +rule_expire(struct rule_dpif *rule) + OVS_REQUIRES(ofproto_mutex) { - struct dpif_backer *backer = miss->ofproto->backer; - uint32_t hash; - - switch (flow_miss_model) { - case OFPROTO_HANDLE_MISS_AUTO: - break; - case OFPROTO_HANDLE_MISS_WITH_FACETS: - return true; - case OFPROTO_HANDLE_MISS_WITHOUT_FACETS: - return false; - } - - if (!backer->governor) { - size_t n_subfacets; - - n_subfacets = hmap_count(&backer->subfacets); - if (n_subfacets * 2 <= flow_eviction_threshold) { - return true; - } - - backer->governor = governor_create(); - } + uint16_t idle_timeout, hard_timeout; + long long int now = time_msec(); + int reason; - hash = flow_hash_in_wildcards(&miss->flow, &miss->xout.wc, 0); - return governor_should_install_flow(backer->governor, hash, - miss->stats.n_packets); -} + ovs_assert(!rule->up.pending); -/* Handles 'miss', which matches 'facet'. May add any required datapath - * operations to 'ops', incrementing '*n_ops' for each new op. - * - * All of the packets in 'miss' are considered to have arrived at time - * 'miss->stats.used'. This is really important only for new facets: if we - * just called time_msec() here, then the new subfacet or its packets could - * look (occasionally) as though it was used some time after the facet was - * used. That can make a one-packet flow look like it has a nonzero duration, - * which looks odd in e.g. NetFlow statistics. */ -static void -handle_flow_miss_with_facet(struct flow_miss *miss, struct facet *facet, - struct flow_miss_op *ops, size_t *n_ops) -{ - enum subfacet_path want_path; - struct subfacet *subfacet; - uint32_t key_hash; - - /* Update facet stats. */ - facet->packet_count += miss->stats.n_packets; - facet->prev_packet_count += miss->stats.n_packets; - facet->byte_count += miss->stats.n_bytes; - facet->prev_byte_count += miss->stats.n_bytes; - - /* Look for an existing subfacet. If we find one, update its used time.
*/ - key_hash = odp_flow_key_hash(miss->key, miss->key_len); - if (!list_is_empty(&facet->subfacets)) { - subfacet = subfacet_find(miss->ofproto->backer, - miss->key, miss->key_len, key_hash); - if (subfacet) { - if (subfacet->facet == facet) { - subfacet->used = MAX(subfacet->used, miss->stats.used); - } else { - /* This shouldn't happen. */ - VLOG_ERR_RL(&rl, "subfacet with wrong facet"); - subfacet_destroy(subfacet); - subfacet = NULL; - } - } + /* Has 'rule' expired? */ + ovs_mutex_lock(&rule->up.mutex); + hard_timeout = rule->up.hard_timeout; + idle_timeout = rule->up.idle_timeout; + if (hard_timeout && now > rule->up.modified + hard_timeout * 1000) { + reason = OFPRR_HARD_TIMEOUT; + } else if (idle_timeout && now > rule->up.used + idle_timeout * 1000) { + reason = OFPRR_IDLE_TIMEOUT; } else { - subfacet = NULL; + reason = -1; } + ovs_mutex_unlock(&rule->up.mutex); - /* Don't install the flow if it's the result of the "userspace" - * action for an already installed facet. This can occur when a - * datapath flow with wildcards has a "userspace" action and flows - * sent to userspace result in a different subfacet, which will then - * be rejected as overlapping by the datapath. */ - if (miss->upcall_type == DPIF_UC_ACTION - && !list_is_empty(&facet->subfacets)) { - return; + if (reason >= 0) { + COVERAGE_INC(ofproto_dpif_expired); + ofproto_rule_expire(&rule->up, reason); } +} - /* Create a subfacet, if we don't already have one. */ - if (!subfacet) { - subfacet = subfacet_create(facet, miss, key_hash); - } +/* Executes, within 'ofproto', the actions in 'rule' or 'ofpacts' on 'packet'. + * 'flow' must reflect the data in 'packet'. */ +int +ofproto_dpif_execute_actions(struct ofproto_dpif *ofproto, + const struct flow *flow, + struct rule_dpif *rule, + const struct ofpact *ofpacts, size_t ofpacts_len, + struct ofpbuf *packet) +{ + struct odputil_keybuf keybuf; + struct dpif_flow_stats stats; + struct xlate_out xout; + struct xlate_in xin; + ofp_port_t in_port; + struct ofpbuf key; + int error; - /* Install the subfacet, if it's not already installed. */ - want_path = facet->xout.slow ? 
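rule_expire() above mixes units: 'modified' and 'used' are millisecond timestamps while the OpenFlow timeouts are in seconds, hence the '* 1000'. For example, a rule with a hard_timeout of 10 last modified at T ms expires once time_msec() exceeds T + 10000. The same check in isolation (a sketch with illustrative names):

    #include <stdbool.h>
    #include <stdint.h>

    /* True if a flow last modified at 'modified_ms' with a hard timeout of
     * 'timeout_s' seconds (0 means no timeout) has expired at 'now_ms'. */
    static bool
    hard_timeout_expired(long long int now_ms, long long int modified_ms,
                         uint16_t timeout_s)
    {
        return timeout_s && now_ms > modified_ms + timeout_s * 1000LL;
    }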
SF_SLOW_PATH : SF_FAST_PATH; - if (subfacet->path != want_path) { - struct flow_miss_op *op = &ops[(*n_ops)++]; - struct dpif_flow_put *put = &op->dpif_op.u.flow_put; + ovs_assert((rule != NULL) != (ofpacts != NULL)); - subfacet->path = want_path; + dpif_flow_stats_extract(flow, packet, time_msec(), &stats); + if (rule) { + rule_dpif_credit_stats(rule, &stats); + } - ofpbuf_use_stack(&op->mask, &op->maskbuf, sizeof op->maskbuf); - if (enable_megaflows) { - odp_flow_key_from_mask(&op->mask, &facet->xout.wc.masks, - &miss->flow, UINT32_MAX); - } + xlate_in_init(&xin, ofproto, flow, rule, stats.tcp_flags, packet); + xin.ofpacts = ofpacts; + xin.ofpacts_len = ofpacts_len; + xin.resubmit_stats = &stats; + xlate_actions(&xin, &xout); - op->xout_garbage = false; - op->dpif_op.type = DPIF_OP_FLOW_PUT; - op->subfacet = subfacet; - put->flags = DPIF_FP_CREATE; - put->key = miss->key; - put->key_len = miss->key_len; - put->mask = op->mask.data; - put->mask_len = op->mask.size; - - if (want_path == SF_FAST_PATH) { - put->actions = facet->xout.odp_actions.data; - put->actions_len = facet->xout.odp_actions.size; - } else { - compose_slow_path(facet->ofproto, &miss->flow, facet->xout.slow, - op->slow_stub, sizeof op->slow_stub, - &put->actions, &put->actions_len); - } - put->stats = NULL; + ofpbuf_use_stack(&key, &keybuf, sizeof keybuf); + in_port = flow->in_port.ofp_port; + if (in_port == OFPP_NONE) { + in_port = OFPP_LOCAL; } -} + odp_flow_key_from_flow(&key, flow, ofp_port_to_odp_port(ofproto, in_port)); -/* Handles flow miss 'miss'. May add any required datapath operations - * to 'ops', incrementing '*n_ops' for each new op. */ -static void -handle_flow_miss(struct flow_miss *miss, struct flow_miss_op *ops, - size_t *n_ops) -{ - struct facet *facet; - - miss->ofproto->n_missed += miss->stats.n_packets; - - facet = facet_lookup_valid(miss->ofproto, &miss->flow); - if (!facet) { - /* There does not exist a bijection between 'struct flow' and datapath - * flow keys with fitness ODP_FIT_TO_LITTLE. This breaks a fundamental - * assumption used throughout the facet and subfacet handling code. - * Since we have to handle these misses in userspace anyway, we simply - * skip facet creation, avoiding the problem altogether. 
*/ - if (miss->key_fitness == ODP_FIT_TOO_LITTLE - || !flow_miss_should_make_facet(miss)) { - return; - } + error = dpif_execute(ofproto->backer->dpif, key.data, key.size, + xout.odp_actions.data, xout.odp_actions.size, packet, + (xout.slow & SLOW_ACTION) != 0); + xlate_out_uninit(&xout); - facet = facet_create(miss); - } - handle_flow_miss_with_facet(miss, facet, ops, n_ops); + return error; } -static struct drop_key * -drop_key_lookup(const struct dpif_backer *backer, const struct nlattr *key, - size_t key_len) +void +rule_dpif_credit_stats(struct rule_dpif *rule, + const struct dpif_flow_stats *stats) { - struct drop_key *drop_key; - - HMAP_FOR_EACH_WITH_HASH (drop_key, hmap_node, hash_bytes(key, key_len, 0), - &backer->drop_keys) { - if (drop_key->key_len == key_len - && !memcmp(drop_key->key, key, key_len)) { - return drop_key; - } - } - return NULL; + ovs_mutex_lock(&rule->stats_mutex); + rule->packet_count += stats->n_packets; + rule->byte_count += stats->n_bytes; + rule->up.used = MAX(rule->up.used, stats->used); + ovs_mutex_unlock(&rule->stats_mutex); } -static void -drop_key_clear(struct dpif_backer *backer) +bool +rule_dpif_is_fail_open(const struct rule_dpif *rule) { - static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 15); - struct drop_key *drop_key, *next; - - HMAP_FOR_EACH_SAFE (drop_key, next, hmap_node, &backer->drop_keys) { - int error; - - error = dpif_flow_del(backer->dpif, drop_key->key, drop_key->key_len, - NULL); - if (error && !VLOG_DROP_WARN(&rl)) { - struct ds ds = DS_EMPTY_INITIALIZER; - odp_flow_key_format(drop_key->key, drop_key->key_len, &ds); - VLOG_WARN("Failed to delete drop key (%s) (%s)", - ovs_strerror(error), ds_cstr(&ds)); - ds_destroy(&ds); - } - - hmap_remove(&backer->drop_keys, &drop_key->hmap_node); - drop_key_destroy(drop_key); - } + return is_fail_open_rule(&rule->up); +} - udpif_drop_key_clear(backer->udpif); +bool +rule_dpif_is_table_miss(const struct rule_dpif *rule) +{ + return rule_is_table_miss(&rule->up); } -static void -handle_flow_misses(struct dpif_backer *backer, struct flow_miss_batch *fmb) -{ - struct flow_miss_op flow_miss_ops[FLOW_MISS_MAX_BATCH]; - struct dpif_op *dpif_ops[FLOW_MISS_MAX_BATCH]; - struct flow_miss *miss; - size_t n_ops, i; - - /* Process each element in the to-do list, constructing the set of - * operations to batch. */ - n_ops = 0; - HMAP_FOR_EACH (miss, hmap_node, &fmb->misses) { - handle_flow_miss(miss, flow_miss_ops, &n_ops); - } - ovs_assert(n_ops <= ARRAY_SIZE(flow_miss_ops)); - - /* Execute batch. */ - for (i = 0; i < n_ops; i++) { - dpif_ops[i] = &flow_miss_ops[i].dpif_op; - } - dpif_operate(backer->dpif, dpif_ops, n_ops); - - for (i = 0; i < n_ops; i++) { - if (dpif_ops[i]->error != 0 - && flow_miss_ops[i].dpif_op.type == DPIF_OP_FLOW_PUT - && flow_miss_ops[i].subfacet) { - struct subfacet *subfacet = flow_miss_ops[i].subfacet; - - COVERAGE_INC(subfacet_install_fail); - - /* Zero-out subfacet counters when installation failed, but - * datapath reported hits. This should not happen and - * indicates a bug, since if the datapath flow exists, we - * should not be attempting to create a new subfacet. A - * buggy datapath could trigger this, so just zero out the - * counters and log an error. 
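With facets gone, per-rule byte and packet counts are fed exclusively through rule_dpif_credit_stats(), typically by a revalidator that has just dumped a datapath flow. A hypothetical helper showing the delta-style accounting involved (the 'prev'/'cur' snapshots and the helper itself are illustrative; the real bookkeeping lives in ofproto-dpif-upcall.c):

    /* Sketch: fold the traffic seen since the previous dump into 'rule'.
     * 'prev' and 'cur' are successive stats snapshots for one datapath
     * flow, whose counters are monotonic between dumps. */
    static void
    credit_flow_delta(struct rule_dpif *rule,
                      const struct dpif_flow_stats *prev,
                      const struct dpif_flow_stats *cur)
    {
        struct dpif_flow_stats delta = *cur;

        delta.n_packets -= prev->n_packets;
        delta.n_bytes -= prev->n_bytes;
        rule_dpif_credit_stats(rule, &delta);  /* Takes stats_mutex. */
    }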
*/ - if (subfacet->dp_packet_count || subfacet->dp_byte_count) { - VLOG_ERR_RL(&rl, "failed to install subfacet for which " - "datapath reported hits"); - subfacet->dp_packet_count = subfacet->dp_byte_count = 0; - } +ovs_be64 +rule_dpif_get_flow_cookie(const struct rule_dpif *rule) + OVS_REQUIRES(rule->up.mutex) +{ + return rule->up.flow_cookie; +} - subfacet->path = SF_NOT_INSTALLED; - } - } +void +rule_dpif_reduce_timeouts(struct rule_dpif *rule, uint16_t idle_timeout, + uint16_t hard_timeout) +{ + ofproto_rule_reduce_timeouts(&rule->up, idle_timeout, hard_timeout); } -static void -handle_upcalls(struct dpif_backer *backer) +/* Returns 'rule''s actions. The caller owns a reference on the returned + * actions and must eventually release it (with rule_actions_unref()) to avoid + * a memory leak. */ +struct rule_actions * +rule_dpif_get_actions(const struct rule_dpif *rule) { - struct flow_miss_batch *fmb; - int n_processed; - - for (n_processed = 0; n_processed < FLOW_MISS_MAX_BATCH; n_processed++) { - struct drop_key *drop_key = drop_key_next(backer->udpif); - if (!drop_key) { - break; - } - - if (!drop_key_lookup(backer, drop_key->key, drop_key->key_len)) { - hmap_insert(&backer->drop_keys, &drop_key->hmap_node, - hash_bytes(drop_key->key, drop_key->key_len, 0)); - dpif_flow_put(backer->dpif, DPIF_FP_CREATE | DPIF_FP_MODIFY, - drop_key->key, drop_key->key_len, - NULL, 0, NULL, 0, NULL); - } else { - drop_key_destroy(drop_key); - } - } - - fmb = flow_miss_batch_next(backer->udpif); - if (fmb) { - handle_flow_misses(backer, fmb); - flow_miss_batch_destroy(fmb); - } -} - -/* Flow expiration. */ - -static int subfacet_max_idle(const struct dpif_backer *); -static void update_stats(struct dpif_backer *); -static void rule_expire(struct rule_dpif *) OVS_REQUIRES(ofproto_mutex); -static void expire_subfacets(struct dpif_backer *, int dp_max_idle); - -/* This function is called periodically by run(). Its job is to collect - * updates for the flows that have been installed into the datapath, most - * importantly when they last were used, and then use that information to - * expire flows that have not been used recently. - * - * Returns the number of milliseconds after which it should be called again. */ -static int -expire(struct dpif_backer *backer) -{ - struct ofproto_dpif *ofproto; - size_t n_subfacets; - int max_idle; - - /* Periodically clear out the drop keys in an effort to keep them - * relatively few. */ - drop_key_clear(backer); - - /* Update stats for each flow in the backer. */ - update_stats(backer); - - n_subfacets = hmap_count(&backer->subfacets); - backer->avg_n_subfacet += n_subfacets; - backer->avg_n_subfacet /= 2; - - backer->max_n_subfacet = MAX(backer->max_n_subfacet, n_subfacets); - - max_idle = subfacet_max_idle(backer); - expire_subfacets(backer, max_idle); - - HMAP_FOR_EACH (ofproto, all_ofproto_dpifs_node, &all_ofproto_dpifs) { - struct rule *rule, *next_rule; - - if (ofproto->backer != backer) { - continue; - } - - /* Expire OpenFlow flows whose idle_timeout or hard_timeout - * has passed. */ - ovs_mutex_lock(&ofproto_mutex); - LIST_FOR_EACH_SAFE (rule, next_rule, expirable, - &ofproto->up.expirable) { - rule_expire(rule_dpif_cast(rule)); - } - ovs_mutex_unlock(&ofproto_mutex); - - /* All outstanding data in existing flows has been accounted, so it's a - * good time to do bond rebalancing. 
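rule_dpif_get_actions() hands back a counted reference, so every lookup must be paired with rule_actions_unref(); the facet_is_controller_flow() code being deleted further below follows exactly this pattern. A minimal sketch:

    /* Sketch: inspect a rule's actions without leaking the reference. */
    static bool
    rule_has_any_actions(const struct rule_dpif *rule)
    {
        struct rule_actions *actions = rule_dpif_get_actions(rule);
        bool any = actions->ofpacts_len > 0;

        rule_actions_unref(actions);
        return any;
    }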
*/ - if (ofproto->has_bonded_bundles) { - struct ofbundle *bundle; - - HMAP_FOR_EACH (bundle, hmap_node, &ofproto->bundles) { - if (bundle->bond) { - bond_rebalance(bundle->bond); - } - } - } - } - - return MIN(max_idle, 1000); -} - -/* Updates flow table statistics given that the datapath just reported 'stats' - * as 'subfacet''s statistics. */ -static void -update_subfacet_stats(struct subfacet *subfacet, - const struct dpif_flow_stats *stats) -{ - struct facet *facet = subfacet->facet; - struct dpif_flow_stats diff; - - diff.tcp_flags = stats->tcp_flags; - diff.used = stats->used; - - if (stats->n_packets >= subfacet->dp_packet_count) { - diff.n_packets = stats->n_packets - subfacet->dp_packet_count; - } else { - VLOG_WARN_RL(&rl, "unexpected packet count from the datapath"); - diff.n_packets = 0; - } - - if (stats->n_bytes >= subfacet->dp_byte_count) { - diff.n_bytes = stats->n_bytes - subfacet->dp_byte_count; - } else { - VLOG_WARN_RL(&rl, "unexpected byte count from datapath"); - diff.n_bytes = 0; - } - - facet->ofproto->n_hit += diff.n_packets; - subfacet->dp_packet_count = stats->n_packets; - subfacet->dp_byte_count = stats->n_bytes; - subfacet_update_stats(subfacet, &diff); - - if (diff.n_packets) { - facet_learn(facet); - } -} - -/* 'key' with length 'key_len' bytes is a flow in 'dpif' that we know nothing - * about, or a flow that shouldn't be installed but was anyway. Delete it. */ -static void -delete_unexpected_flow(struct dpif_backer *backer, - const struct nlattr *key, size_t key_len) -{ - if (!VLOG_DROP_WARN(&rl)) { - struct ds s; - - ds_init(&s); - odp_flow_key_format(key, key_len, &s); - VLOG_WARN("unexpected flow: %s", ds_cstr(&s)); - ds_destroy(&s); - } - - COVERAGE_INC(facet_unexpected); - dpif_flow_del(backer->dpif, key, key_len, NULL); -} - -/* Update 'packet_count', 'byte_count', and 'used' members of installed facets. - * - * This function also pushes statistics updates to rules which each facet - * resubmits into. Generally these statistics will be accurate. However, if a - * facet changes the rule it resubmits into at some time in between - * update_stats() runs, it is possible that statistics accrued to the - * old rule will be incorrectly attributed to the new rule. This could be - * avoided by calling update_stats() whenever rules are created or - * deleted. However, the performance impact of making so many calls to the - * datapath do not justify the benefit of having perfectly accurate statistics. - * - * In addition, this function maintains per ofproto flow hit counts. The patch - * port is not treated specially. e.g. A packet ingress from br0 patched into - * br1 will increase the hit count of br0 by 1, however, does not affect - * the hit or miss counts of br1. - */ -static void -update_stats(struct dpif_backer *backer) -{ - const struct dpif_flow_stats *stats; - struct dpif_flow_dump dump; - const struct nlattr *key, *mask; - size_t key_len, mask_len; - - dpif_flow_dump_start(&dump, backer->dpif); - while (dpif_flow_dump_next(&dump, &key, &key_len, - &mask, &mask_len, NULL, NULL, &stats)) { - struct subfacet *subfacet; - uint32_t key_hash; - - key_hash = odp_flow_key_hash(key, key_len); - subfacet = subfacet_find(backer, key, key_len, key_hash); - switch (subfacet ? subfacet->path : SF_NOT_INSTALLED) { - case SF_FAST_PATH: - update_subfacet_stats(subfacet, stats); - break; - - case SF_SLOW_PATH: - /* Stats are updated per-packet. 
*/ - break; - - case SF_NOT_INSTALLED: - default: - delete_unexpected_flow(backer, key, key_len); - break; - } - } - dpif_flow_dump_done(&dump); -} - -/* Calculates and returns the number of milliseconds of idle time after which - * subfacets should expire from the datapath. When a subfacet expires, we fold - * its statistics into its facet, and when a facet's last subfacet expires, we - * fold its statistic into its rule. */ -static int -subfacet_max_idle(const struct dpif_backer *backer) -{ - /* - * Idle time histogram. - * - * Most of the time a switch has a relatively small number of subfacets. - * When this is the case we might as well keep statistics for all of them - * in userspace and to cache them in the kernel datapath for performance as - * well. - * - * As the number of subfacets increases, the memory required to maintain - * statistics about them in userspace and in the kernel becomes - * significant. However, with a large number of subfacets it is likely - * that only a few of them are "heavy hitters" that consume a large amount - * of bandwidth. At this point, only heavy hitters are worth caching in - * the kernel and maintaining in userspaces; other subfacets we can - * discard. - * - * The technique used to compute the idle time is to build a histogram with - * N_BUCKETS buckets whose width is BUCKET_WIDTH msecs each. Each subfacet - * that is installed in the kernel gets dropped in the appropriate bucket. - * After the histogram has been built, we compute the cutoff so that only - * the most-recently-used 1% of subfacets (but at least - * flow_eviction_threshold flows) are kept cached. At least - * the most-recently-used bucket of subfacets is kept, so actually an - * arbitrary number of subfacets can be kept in any given expiration run - * (though the next run will delete most of those unless they receive - * additional data). - * - * This requires a second pass through the subfacets, in addition to the - * pass made by update_stats(), because the former function never looks at - * uninstallable subfacets. - */ - enum { BUCKET_WIDTH = 100 }; - enum { N_BUCKETS = 5000 / BUCKET_WIDTH }; - int buckets[N_BUCKETS] = { 0 }; - int total, subtotal, bucket; - struct subfacet *subfacet; - long long int now; - int i; - - total = hmap_count(&backer->subfacets); - if (total <= flow_eviction_threshold) { - return N_BUCKETS * BUCKET_WIDTH; - } - - /* Build histogram. */ - now = time_msec(); - HMAP_FOR_EACH (subfacet, hmap_node, &backer->subfacets) { - long long int idle = now - subfacet->used; - int bucket = (idle <= 0 ? 0 - : idle >= BUCKET_WIDTH * N_BUCKETS ? N_BUCKETS - 1 - : (unsigned int) idle / BUCKET_WIDTH); - buckets[bucket]++; - } - - /* Find the first bucket whose flows should be expired. */ - subtotal = bucket = 0; - do { - subtotal += buckets[bucket++]; - } while (bucket < N_BUCKETS && - subtotal < MAX(flow_eviction_threshold, total / 100)); - - if (VLOG_IS_DBG_ENABLED()) { - struct ds s; - - ds_init(&s); - ds_put_cstr(&s, "keep"); - for (i = 0; i < N_BUCKETS; i++) { - if (i == bucket) { - ds_put_cstr(&s, ", drop"); - } - if (buckets[i]) { - ds_put_format(&s, " %d:%d", i * BUCKET_WIDTH, buckets[i]); - } - } - VLOG_INFO("%s (msec:count)", ds_cstr(&s)); - ds_destroy(&s); - } - - return bucket * BUCKET_WIDTH; -} - -static void -expire_subfacets(struct dpif_backer *backer, int dp_max_idle) -{ - /* Cutoff time for most flows. 
*/ - long long int normal_cutoff = time_msec() - dp_max_idle; - - /* We really want to keep flows for special protocols around, so use a more - * conservative cutoff. */ - long long int special_cutoff = time_msec() - 10000; - - struct subfacet *subfacet, *next_subfacet; - struct subfacet *batch[SUBFACET_DESTROY_MAX_BATCH]; - int n_batch; - - n_batch = 0; - HMAP_FOR_EACH_SAFE (subfacet, next_subfacet, hmap_node, - &backer->subfacets) { - long long int cutoff; - - cutoff = (subfacet->facet->xout.slow & (SLOW_CFM | SLOW_BFD | SLOW_LACP - | SLOW_STP) - ? special_cutoff - : normal_cutoff); - if (subfacet->used < cutoff) { - if (subfacet->path != SF_NOT_INSTALLED) { - batch[n_batch++] = subfacet; - if (n_batch >= SUBFACET_DESTROY_MAX_BATCH) { - subfacet_destroy_batch(backer, batch, n_batch); - n_batch = 0; - } - } else { - subfacet_destroy(subfacet); - } - } - } - - if (n_batch > 0) { - subfacet_destroy_batch(backer, batch, n_batch); - } -} - -/* If 'rule' is an OpenFlow rule, that has expired according to OpenFlow rules, - * then delete it entirely. */ -static void -rule_expire(struct rule_dpif *rule) - OVS_REQUIRES(ofproto_mutex) -{ - uint16_t idle_timeout, hard_timeout; - long long int now = time_msec(); - int reason; - - ovs_assert(!rule->up.pending); - - /* Has 'rule' expired? */ - ovs_mutex_lock(&rule->up.mutex); - hard_timeout = rule->up.hard_timeout; - idle_timeout = rule->up.idle_timeout; - if (hard_timeout && now > rule->up.modified + hard_timeout * 1000) { - reason = OFPRR_HARD_TIMEOUT; - } else if (idle_timeout && now > rule->up.used + idle_timeout * 1000) { - reason = OFPRR_IDLE_TIMEOUT; - } else { - reason = -1; - } - ovs_mutex_unlock(&rule->up.mutex); - - if (reason >= 0) { - COVERAGE_INC(ofproto_dpif_expired); - ofproto_rule_expire(&rule->up, reason); - } -} - -/* Facets. */ - -/* Creates and returns a new facet based on 'miss'. - * - * The caller must already have determined that no facet with an identical - * 'miss->flow' exists in 'miss->ofproto'. - * - * 'rule' and 'xout' must have been created based on 'miss'. - * - * 'facet'' statistics are initialized based on 'stats'. - * - * The facet will initially have no subfacets. The caller should create (at - * least) one subfacet with subfacet_create(). */ -static struct facet * -facet_create(const struct flow_miss *miss) -{ - struct ofproto_dpif *ofproto = miss->ofproto; - struct facet *facet; - struct match match; - - COVERAGE_INC(facet_create); - facet = xzalloc(sizeof *facet); - facet->ofproto = miss->ofproto; - facet->used = miss->stats.used; - facet->flow = miss->flow; - facet->learn_rl = time_msec() + 500; - - list_init(&facet->subfacets); - - xlate_out_copy(&facet->xout, &miss->xout); - - match_init(&match, &facet->flow, &facet->xout.wc); - cls_rule_init(&facet->cr, &match, OFP_DEFAULT_PRIORITY); - ovs_rwlock_wrlock(&ofproto->facets.rwlock); - classifier_insert(&ofproto->facets, &facet->cr); - ovs_rwlock_unlock(&ofproto->facets.rwlock); - - return facet; -} - -static void -facet_free(struct facet *facet) -{ - if (facet) { - xlate_out_uninit(&facet->xout); - free(facet); - } -} - -/* Executes, within 'ofproto', the actions in 'rule' or 'ofpacts' on 'packet'. - * 'flow' must reflect the data in 'packet'. 
*/ -int -ofproto_dpif_execute_actions(struct ofproto_dpif *ofproto, - const struct flow *flow, - struct rule_dpif *rule, - const struct ofpact *ofpacts, size_t ofpacts_len, - struct ofpbuf *packet) -{ - struct odputil_keybuf keybuf; - struct dpif_flow_stats stats; - struct xlate_out xout; - struct xlate_in xin; - ofp_port_t in_port; - struct ofpbuf key; - int error; - - ovs_assert((rule != NULL) != (ofpacts != NULL)); - - dpif_flow_stats_extract(flow, packet, time_msec(), &stats); - if (rule) { - rule_dpif_credit_stats(rule, &stats); - } - - xlate_in_init(&xin, ofproto, flow, rule, stats.tcp_flags, packet); - xin.ofpacts = ofpacts; - xin.ofpacts_len = ofpacts_len; - xin.resubmit_stats = &stats; - xlate_actions(&xin, &xout); - - ofpbuf_use_stack(&key, &keybuf, sizeof keybuf); - in_port = flow->in_port.ofp_port; - if (in_port == OFPP_NONE) { - in_port = OFPP_LOCAL; - } - odp_flow_key_from_flow(&key, flow, ofp_port_to_odp_port(ofproto, in_port)); - - error = dpif_execute(ofproto->backer->dpif, key.data, key.size, - xout.odp_actions.data, xout.odp_actions.size, packet, - (xout.slow & SLOW_ACTION) != 0); - xlate_out_uninit(&xout); - - return error; -} - -/* Remove 'facet' from its ofproto and free up the associated memory: - * - * - If 'facet' was installed in the datapath, uninstalls it and updates its - * rule's statistics, via subfacet_uninstall(). - * - * - Removes 'facet' from its rule and from ofproto->facets. - */ -static void -facet_remove(struct facet *facet) -{ - struct subfacet *subfacet, *next_subfacet; - - COVERAGE_INC(facet_remove); - ovs_assert(!list_is_empty(&facet->subfacets)); - - /* First uninstall all of the subfacets to get final statistics. */ - LIST_FOR_EACH (subfacet, list_node, &facet->subfacets) { - subfacet_uninstall(subfacet); - } - - /* Flush the final stats to the rule. - * - * This might require us to have at least one subfacet around so that we - * can use its actions for accounting in facet_account(), which is why we - * have uninstalled but not yet destroyed the subfacets. */ - facet_flush_stats(facet); - - /* Now we're really all done so destroy everything. */ - LIST_FOR_EACH_SAFE (subfacet, next_subfacet, list_node, - &facet->subfacets) { - subfacet_destroy__(subfacet); - } - ovs_rwlock_wrlock(&facet->ofproto->facets.rwlock); - classifier_remove(&facet->ofproto->facets, &facet->cr); - ovs_rwlock_unlock(&facet->ofproto->facets.rwlock); - cls_rule_destroy(&facet->cr); - facet_free(facet); -} - -/* Feed information from 'facet' back into the learning table to keep it in - * sync with what is actually flowing through the datapath. */ -static void -facet_learn(struct facet *facet) -{ - long long int now = time_msec(); - - if (!facet->xout.has_fin_timeout && now < facet->learn_rl) { - return; - } - - facet->learn_rl = now + 500; - - if (!facet->xout.has_learn - && !facet->xout.has_normal - && (!facet->xout.has_fin_timeout - || !(facet->tcp_flags & (TCP_FIN | TCP_RST)))) { - return; - } - - facet_push_stats(facet, true); -} - -/* Returns true if the only action for 'facet' is to send to the controller. - * (We don't report NetFlow expiration messages for such facets because they - * are just part of the control logic for the network, not real traffic). 
*/ -static bool -facet_is_controller_flow(struct facet *facet) -{ - if (facet) { - struct ofproto_dpif *ofproto = facet->ofproto; - const struct ofpact *ofpacts; - struct rule_actions *actions; - struct rule_dpif *rule; - size_t ofpacts_len; - bool is_controller; - - rule_dpif_lookup(ofproto, &facet->flow, NULL, &rule); - actions = rule_dpif_get_actions(rule); - rule_dpif_unref(rule); - - ofpacts_len = actions->ofpacts_len; - ofpacts = actions->ofpacts; - is_controller = ofpacts_len > 0 - && ofpacts->type == OFPACT_CONTROLLER - && ofpact_next(ofpacts) >= ofpact_end(ofpacts, ofpacts_len); - rule_actions_unref(actions); - - return is_controller; - } - return false; -} - -/* Folds all of 'facet''s statistics into its rule. Also updates the - * accounting ofhook and emits a NetFlow expiration if appropriate. All of - * 'facet''s statistics in the datapath should have been zeroed and folded into - * its packet and byte counts before this function is called. */ -static void -facet_flush_stats(struct facet *facet) -{ - struct ofproto_dpif *ofproto = facet->ofproto; - struct subfacet *subfacet; - - LIST_FOR_EACH (subfacet, list_node, &facet->subfacets) { - ovs_assert(!subfacet->dp_byte_count); - ovs_assert(!subfacet->dp_packet_count); - } - - facet_push_stats(facet, false); - - if (ofproto->netflow && !facet_is_controller_flow(facet)) { - netflow_expire(ofproto->netflow, &facet->flow); - netflow_flow_clear(ofproto->netflow, &facet->flow); - } - - /* Reset counters to prevent double counting if 'facet' ever gets - * reinstalled. */ - facet_reset_counters(facet); - facet->tcp_flags = 0; -} - -/* Searches 'ofproto''s table of facets for one which would be responsible for - * 'flow'. Returns it if found, otherwise a null pointer. - * - * The returned facet might need revalidation; use facet_lookup_valid() - * instead if that is important. */ -static struct facet * -facet_find(struct ofproto_dpif *ofproto, const struct flow *flow) -{ - struct cls_rule *cr; - - ovs_rwlock_rdlock(&ofproto->facets.rwlock); - cr = classifier_lookup(&ofproto->facets, flow, NULL); - ovs_rwlock_unlock(&ofproto->facets.rwlock); - return cr ? CONTAINER_OF(cr, struct facet, cr) : NULL; -} - -/* Searches 'ofproto''s table of facets for one capable that covers - * 'flow'. Returns it if found, otherwise a null pointer. - * - * The returned facet is guaranteed to be valid. */ -static struct facet * -facet_lookup_valid(struct ofproto_dpif *ofproto, const struct flow *flow) -{ - struct facet *facet; - - facet = facet_find(ofproto, flow); - if (facet - && ofproto->backer->need_revalidate - && !facet_revalidate(facet)) { - return NULL; - } - - return facet; -} - -static bool -facet_check_consistency(struct facet *facet) -{ - static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 15); - - struct xlate_out xout; - struct xlate_in xin; - bool ok; - - /* Check the datapath actions for consistency. 
*/ - xlate_in_init(&xin, facet->ofproto, &facet->flow, NULL, 0, NULL); - xlate_actions(&xin, &xout); - - ok = ofpbuf_equal(&facet->xout.odp_actions, &xout.odp_actions) - && facet->xout.slow == xout.slow; - if (!ok && !VLOG_DROP_WARN(&rl)) { - struct ds s = DS_EMPTY_INITIALIZER; - - flow_format(&s, &facet->flow); - ds_put_cstr(&s, ": inconsistency in facet"); - - if (!ofpbuf_equal(&facet->xout.odp_actions, &xout.odp_actions)) { - ds_put_cstr(&s, " (actions were: "); - format_odp_actions(&s, facet->xout.odp_actions.data, - facet->xout.odp_actions.size); - ds_put_cstr(&s, ") (correct actions: "); - format_odp_actions(&s, xout.odp_actions.data, - xout.odp_actions.size); - ds_put_char(&s, ')'); - } - - if (facet->xout.slow != xout.slow) { - ds_put_format(&s, " slow path incorrect. should be %d", xout.slow); - } - - ds_destroy(&s); - } - xlate_out_uninit(&xout); - - return ok; -} - -/* Re-searches the classifier for 'facet': - * - * - If the rule found is different from 'facet''s current rule, moves - * 'facet' to the new rule and recompiles its actions. - * - * - If the rule found is the same as 'facet''s current rule, leaves 'facet' - * where it is and recompiles its actions anyway. - * - * - If any of 'facet''s subfacets correspond to a new flow according to - * xlate_receive(), 'facet' is removed. - * - * Returns true if 'facet' is still valid. False if 'facet' was removed. */ -static bool -facet_revalidate(struct facet *facet) -{ - struct ofproto_dpif *ofproto = facet->ofproto; - struct rule_dpif *new_rule; - struct subfacet *subfacet; - struct flow_wildcards wc; - struct xlate_out xout; - struct xlate_in xin; - - COVERAGE_INC(facet_revalidate); - - /* Check that child subfacets still correspond to this facet. Tunnel - * configuration changes could cause a subfacet's OpenFlow in_port to - * change. */ - LIST_FOR_EACH (subfacet, list_node, &facet->subfacets) { - struct ofproto_dpif *recv_ofproto; - struct flow recv_flow; - int error; - - error = xlate_receive(ofproto->backer, NULL, subfacet->key, - subfacet->key_len, &recv_flow, NULL, - &recv_ofproto, NULL, NULL, NULL, NULL); - if (error - || recv_ofproto != ofproto - || facet != facet_find(ofproto, &recv_flow)) { - facet_remove(facet); - return false; - } - } - - flow_wildcards_init_catchall(&wc); - rule_dpif_lookup(ofproto, &facet->flow, &wc, &new_rule); - - /* Calculate new datapath actions. - * - * We do not modify any 'facet' state yet, because we might need to, e.g., - * emit a NetFlow expiration and, if so, we need to have the old state - * around to properly compose it. */ - xlate_in_init(&xin, ofproto, &facet->flow, new_rule, 0, NULL); - xlate_actions(&xin, &xout); - flow_wildcards_or(&xout.wc, &xout.wc, &wc); - /* Make sure non -packet fields are not masked. If not cleared, - * the memcmp() below may fail, causing an otherwise valid facet - * to be removed. */ - flow_wildcards_clear_non_packet_fields(&xout.wc); - - /* A facet's slow path reason should only change under dramatic - * circumstances. Rather than try to update everything, it's simpler to - * remove the facet and start over. - * - * More importantly, if a facet's wildcards change, it will be relatively - * difficult to figure out if its subfacets still belong to it, and if not - * which facet they may belong to. Again, to avoid the complexity, we - * simply give up instead. 
*/ - if (facet->xout.slow != xout.slow - || memcmp(&facet->xout.wc, &xout.wc, sizeof xout.wc)) { - facet_remove(facet); - xlate_out_uninit(&xout); - rule_dpif_unref(new_rule); - return false; - } - - if (!ofpbuf_equal(&facet->xout.odp_actions, &xout.odp_actions)) { - LIST_FOR_EACH(subfacet, list_node, &facet->subfacets) { - if (subfacet->path == SF_FAST_PATH) { - struct dpif_flow_stats stats; - - subfacet_install(subfacet, &xout.odp_actions, &stats); - subfacet_update_stats(subfacet, &stats); - } - } - - facet_flush_stats(facet); - - ofpbuf_clear(&facet->xout.odp_actions); - ofpbuf_put(&facet->xout.odp_actions, xout.odp_actions.data, - xout.odp_actions.size); - } - - /* Update 'facet' now that we've taken care of all the old state. */ - facet->xout.slow = xout.slow; - facet->xout.has_learn = xout.has_learn; - facet->xout.has_normal = xout.has_normal; - facet->xout.has_fin_timeout = xout.has_fin_timeout; - facet->xout.nf_output_iface = xout.nf_output_iface; - facet->xout.mirrors = xout.mirrors; - - ovs_mutex_lock(&new_rule->up.mutex); - facet->used = MAX(facet->used, new_rule->up.created); - ovs_mutex_unlock(&new_rule->up.mutex); - - xlate_out_uninit(&xout); - rule_dpif_unref(new_rule); - return true; -} - -static void -facet_reset_counters(struct facet *facet) -{ - facet->packet_count = 0; - facet->byte_count = 0; - facet->prev_packet_count = 0; - facet->prev_byte_count = 0; -} - -static void -flow_push_stats(struct ofproto_dpif *ofproto, struct flow *flow, - struct dpif_flow_stats *stats, bool may_learn) -{ - struct xlate_in xin; - - xlate_in_init(&xin, ofproto, flow, NULL, stats->tcp_flags, NULL); - xin.resubmit_stats = stats; - xin.may_learn = may_learn; - xlate_actions_for_side_effects(&xin); -} - -static void -facet_push_stats(struct facet *facet, bool may_learn) -{ - struct dpif_flow_stats stats; - - ovs_assert(facet->packet_count >= facet->prev_packet_count); - ovs_assert(facet->byte_count >= facet->prev_byte_count); - ovs_assert(facet->used >= facet->prev_used); - - stats.n_packets = facet->packet_count - facet->prev_packet_count; - stats.n_bytes = facet->byte_count - facet->prev_byte_count; - stats.used = facet->used; - stats.tcp_flags = facet->tcp_flags; - - if (may_learn || stats.n_packets || facet->used > facet->prev_used) { - facet->prev_packet_count = facet->packet_count; - facet->prev_byte_count = facet->byte_count; - facet->prev_used = facet->used; - flow_push_stats(facet->ofproto, &facet->flow, &stats, may_learn); - } -} - -static void -push_all_stats(void) -{ - static long long int rl = LLONG_MIN; - struct ofproto_dpif *ofproto; - - if (time_msec() < rl) { - return; - } - - HMAP_FOR_EACH (ofproto, all_ofproto_dpifs_node, &all_ofproto_dpifs) { - struct cls_cursor cursor; - struct facet *facet; - - ovs_rwlock_rdlock(&ofproto->facets.rwlock); - cls_cursor_init(&cursor, &ofproto->facets, NULL); - CLS_CURSOR_FOR_EACH (facet, cr, &cursor) { - facet_push_stats(facet, false); - } - ovs_rwlock_unlock(&ofproto->facets.rwlock); - } - - rl = time_msec() + 100; -} - -void -rule_dpif_credit_stats(struct rule_dpif *rule, - const struct dpif_flow_stats *stats) -{ - ovs_mutex_lock(&rule->stats_mutex); - rule->packet_count += stats->n_packets; - rule->byte_count += stats->n_bytes; - rule->up.used = MAX(rule->up.used, stats->used); - ovs_mutex_unlock(&rule->stats_mutex); -} - -bool -rule_dpif_is_fail_open(const struct rule_dpif *rule) -{ - return is_fail_open_rule(&rule->up); -} - -bool -rule_dpif_is_table_miss(const struct rule_dpif *rule) -{ - return rule_is_table_miss(&rule->up); -} - 
-ovs_be64 -rule_dpif_get_flow_cookie(const struct rule_dpif *rule) - OVS_REQUIRES(rule->up.mutex) -{ - return rule->up.flow_cookie; -} - -void -rule_dpif_reduce_timeouts(struct rule_dpif *rule, uint16_t idle_timeout, - uint16_t hard_timeout) -{ - ofproto_rule_reduce_timeouts(&rule->up, idle_timeout, hard_timeout); -} - -/* Returns 'rule''s actions. The caller owns a reference on the returned - * actions and must eventually release it (with rule_actions_unref()) to avoid - * a memory leak. */ -struct rule_actions * -rule_dpif_get_actions(const struct rule_dpif *rule) -{ - return rule_get_actions(&rule->up); -} - -/* Subfacets. */ - -static struct subfacet * -subfacet_find(struct dpif_backer *backer, const struct nlattr *key, - size_t key_len, uint32_t key_hash) -{ - struct subfacet *subfacet; - - HMAP_FOR_EACH_WITH_HASH (subfacet, hmap_node, key_hash, - &backer->subfacets) { - if (subfacet->key_len == key_len - && !memcmp(key, subfacet->key, key_len)) { - return subfacet; - } - } - - return NULL; -} - -/* Creates and returns a new subfacet within 'facet' for the flow in 'miss'. - * 'key_hash' must be a hash over miss->key. The caller must have already - * ensured that no subfacet subfacet already exists. */ -static struct subfacet * -subfacet_create(struct facet *facet, struct flow_miss *miss, uint32_t key_hash) -{ - struct dpif_backer *backer = miss->ofproto->backer; - const struct nlattr *key = miss->key; - size_t key_len = miss->key_len; - struct subfacet *subfacet; - - subfacet = (list_is_empty(&facet->subfacets) - ? &facet->one_subfacet - : xmalloc(sizeof *subfacet)); - - COVERAGE_INC(subfacet_create); - hmap_insert(&backer->subfacets, &subfacet->hmap_node, key_hash); - list_push_back(&facet->subfacets, &subfacet->list_node); - subfacet->facet = facet; - subfacet->key = xmemdup(key, key_len); - subfacet->key_len = key_len; - subfacet->used = miss->stats.used; - subfacet->created = subfacet->used; - subfacet->dp_packet_count = 0; - subfacet->dp_byte_count = 0; - subfacet->path = SF_NOT_INSTALLED; - subfacet->backer = backer; - - return subfacet; -} - -/* Uninstalls 'subfacet' from the datapath, if it is installed, removes it from - * its facet within 'ofproto', and frees it. */ -static void -subfacet_destroy__(struct subfacet *subfacet) -{ - struct facet *facet = subfacet->facet; - - COVERAGE_INC(subfacet_destroy); - subfacet_uninstall(subfacet); - hmap_remove(&subfacet->backer->subfacets, &subfacet->hmap_node); - list_remove(&subfacet->list_node); - free(subfacet->key); - if (subfacet != &facet->one_subfacet) { - free(subfacet); - } -} - -/* Destroys 'subfacet', as with subfacet_destroy__(), and then if this was the - * last remaining subfacet in its facet destroys the facet too. */ -static void -subfacet_destroy(struct subfacet *subfacet) -{ - struct facet *facet = subfacet->facet; - - if (list_is_singleton(&facet->subfacets)) { - /* facet_remove() needs at least one subfacet (it will remove it). 
*/ - facet_remove(facet); - } else { - subfacet_destroy__(subfacet); - } -} - -static void -subfacet_destroy_batch(struct dpif_backer *backer, - struct subfacet **subfacets, int n) -{ - struct dpif_op ops[SUBFACET_DESTROY_MAX_BATCH]; - struct dpif_op *opsp[SUBFACET_DESTROY_MAX_BATCH]; - struct dpif_flow_stats stats[SUBFACET_DESTROY_MAX_BATCH]; - int i; - - for (i = 0; i < n; i++) { - ops[i].type = DPIF_OP_FLOW_DEL; - ops[i].u.flow_del.key = subfacets[i]->key; - ops[i].u.flow_del.key_len = subfacets[i]->key_len; - ops[i].u.flow_del.stats = &stats[i]; - opsp[i] = &ops[i]; - } - - dpif_operate(backer->dpif, opsp, n); - for (i = 0; i < n; i++) { - subfacet_reset_dp_stats(subfacets[i], &stats[i]); - subfacets[i]->path = SF_NOT_INSTALLED; - subfacet_destroy(subfacets[i]); - } -} - -/* Updates 'subfacet''s datapath flow, setting its actions to 'actions_len' - * bytes of actions in 'actions'. If 'stats' is non-null, statistics counters - * in the datapath will be zeroed and 'stats' will be updated with traffic new - * since 'subfacet' was last updated. - * - * Returns 0 if successful, otherwise a positive errno value. */ -static int -subfacet_install(struct subfacet *subfacet, const struct ofpbuf *odp_actions, - struct dpif_flow_stats *stats) -{ - struct facet *facet = subfacet->facet; - enum subfacet_path path = facet->xout.slow ? SF_SLOW_PATH : SF_FAST_PATH; - const struct nlattr *actions = odp_actions->data; - size_t actions_len = odp_actions->size; - struct odputil_keybuf maskbuf; - struct ofpbuf mask; - - uint64_t slow_path_stub[128 / 8]; - enum dpif_flow_put_flags flags; - int ret; - - flags = subfacet->path == SF_NOT_INSTALLED ? DPIF_FP_CREATE - : DPIF_FP_MODIFY; - if (stats) { - flags |= DPIF_FP_ZERO_STATS; - } - - if (path == SF_SLOW_PATH) { - compose_slow_path(facet->ofproto, &facet->flow, facet->xout.slow, - slow_path_stub, sizeof slow_path_stub, - &actions, &actions_len); - } - - ofpbuf_use_stack(&mask, &maskbuf, sizeof maskbuf); - if (enable_megaflows) { - odp_flow_key_from_mask(&mask, &facet->xout.wc.masks, - &facet->flow, UINT32_MAX); - } - - ret = dpif_flow_put(subfacet->backer->dpif, flags, subfacet->key, - subfacet->key_len, mask.data, mask.size, - actions, actions_len, stats); - - if (stats) { - subfacet_reset_dp_stats(subfacet, stats); - } - - if (ret) { - COVERAGE_INC(subfacet_install_fail); - } else { - subfacet->path = path; - } - return ret; -} - -/* If 'subfacet' is installed in the datapath, uninstalls it. */ -static void -subfacet_uninstall(struct subfacet *subfacet) -{ - if (subfacet->path != SF_NOT_INSTALLED) { - struct ofproto_dpif *ofproto = subfacet->facet->ofproto; - struct dpif_flow_stats stats; - int error; - - error = dpif_flow_del(ofproto->backer->dpif, subfacet->key, - subfacet->key_len, &stats); - subfacet_reset_dp_stats(subfacet, &stats); - if (!error) { - subfacet_update_stats(subfacet, &stats); - } - subfacet->path = SF_NOT_INSTALLED; - } else { - ovs_assert(subfacet->dp_packet_count == 0); - ovs_assert(subfacet->dp_byte_count == 0); - } -} - -/* Resets 'subfacet''s datapath statistics counters. This should be called - * when 'subfacet''s statistics are cleared in the datapath. If 'stats' is - * non-null, it should contain the statistics returned by dpif when 'subfacet' - * was reset in the datapath. 'stats' will be modified to include only - * statistics new since 'subfacet' was last updated. 
*/ -static void -subfacet_reset_dp_stats(struct subfacet *subfacet, - struct dpif_flow_stats *stats) -{ - if (stats - && subfacet->dp_packet_count <= stats->n_packets - && subfacet->dp_byte_count <= stats->n_bytes) { - stats->n_packets -= subfacet->dp_packet_count; - stats->n_bytes -= subfacet->dp_byte_count; - } - - subfacet->dp_packet_count = 0; - subfacet->dp_byte_count = 0; -} - -/* Folds the statistics from 'stats' into the counters in 'subfacet'. - * - * Because of the meaning of a subfacet's counters, it only makes sense to do - * this if 'stats' are not tracked in the datapath, that is, if 'stats' - * represents a packet that was sent by hand or if it represents statistics - * that have been cleared out of the datapath. */ -static void -subfacet_update_stats(struct subfacet *subfacet, - const struct dpif_flow_stats *stats) -{ - if (stats->n_packets || stats->used > subfacet->used) { - struct facet *facet = subfacet->facet; - - subfacet->used = MAX(subfacet->used, stats->used); - facet->used = MAX(facet->used, stats->used); - facet->packet_count += stats->n_packets; - facet->byte_count += stats->n_bytes; - facet->tcp_flags |= stats->tcp_flags; - } -} - -/* Rules. */ + return rule_get_actions(&rule->up); +} /* Lookup 'flow' in 'ofproto''s classifier. If 'wc' is non-null, sets * the fields that were relevant as part of the lookup. */ @@ -4634,10 +3041,6 @@ rule_get_stats(struct rule *rule_, uint64_t *packets, uint64_t *bytes) { struct rule_dpif *rule = rule_dpif_cast(rule_); - push_all_stats(); - - /* Start from historical data for 'rule' itself that are no longer tracked - * in facets. This counts, for example, facets that have expired. */ ovs_mutex_lock(&rule->stats_mutex); *packets = rule->packet_count; *bytes = rule->byte_count; @@ -4762,8 +3165,6 @@ group_get_stats(const struct ofgroup *group_, struct ofputil_group_stats *ogs) { struct group_dpif *group = group_dpif_cast(group_); - /* Start from historical data for 'group' itself that are no longer tracked - * in facets. This counts, for example, facets that have expired. */ ovs_mutex_lock(&group->stats_mutex); ogs->packet_count = group->packet_count; ogs->byte_count = group->byte_count; @@ -4826,46 +3227,6 @@ ofproto_dpif_send_packet(const struct ofport_dpif *ofport, struct ofpbuf *packet ovs_mutex_unlock(&ofproto->stats_mutex); return error; } - -/* Composes an ODP action for a "slow path" action for 'flow' within 'ofproto'. - * The action will state 'slow' as the reason that the action is in the slow - * path. (This is purely informational: it allows a human viewing "ovs-dpctl - * dump-flows" output to see why a flow is in the slow path.) - * - * The 'stub_size' bytes in 'stub' will be used to store the action. - * 'stub_size' must be large enough for the action. - * - * The action and its size will be stored in '*actionsp' and '*actions_lenp', - * respectively. 
*/ -static void -compose_slow_path(const struct ofproto_dpif *ofproto, const struct flow *flow, - enum slow_path_reason slow, - uint64_t *stub, size_t stub_size, - const struct nlattr **actionsp, size_t *actions_lenp) -{ - union user_action_cookie cookie; - struct ofpbuf buf; - - cookie.type = USER_ACTION_COOKIE_SLOW_PATH; - cookie.slow_path.unused = 0; - cookie.slow_path.reason = slow; - - ofpbuf_use_stack(&buf, stub, stub_size); - if (slow & (SLOW_CFM | SLOW_BFD | SLOW_LACP | SLOW_STP)) { - uint32_t pid = dpif_port_get_pid(ofproto->backer->dpif, - ODPP_NONE); - odp_put_userspace_action(pid, &cookie, sizeof cookie.slow_path, &buf); - } else { - odp_port_t odp_port; - uint32_t pid; - - odp_port = ofp_port_to_odp_port(ofproto, flow->in_port.ofp_port); - pid = dpif_port_get_pid(ofproto->backer->dpif, odp_port); - odp_put_userspace_action(pid, &cookie, sizeof cookie.slow_path, &buf); - } - *actionsp = buf.data; - *actions_lenp = buf.size; -} static bool set_frag_handling(struct ofproto *ofproto_, @@ -5452,61 +3813,6 @@ ofproto_trace(struct ofproto_dpif *ofproto, const struct flow *flow, rule_dpif_unref(rule); } -/* Runs a self-check of flow translations in 'ofproto'. Appends a message to - * 'reply' describing the results. */ -static void -ofproto_dpif_self_check__(struct ofproto_dpif *ofproto, struct ds *reply) -{ - struct cls_cursor cursor; - struct facet *facet; - int errors; - - errors = 0; - ovs_rwlock_rdlock(&ofproto->facets.rwlock); - cls_cursor_init(&cursor, &ofproto->facets, NULL); - CLS_CURSOR_FOR_EACH (facet, cr, &cursor) { - if (!facet_check_consistency(facet)) { - errors++; - } - } - ovs_rwlock_unlock(&ofproto->facets.rwlock); - if (errors) { - ofproto->backer->need_revalidate = REV_INCONSISTENCY; - } - - if (errors) { - ds_put_format(reply, "%s: self-check failed (%d errors)\n", - ofproto->up.name, errors); - } else { - ds_put_format(reply, "%s: self-check passed\n", ofproto->up.name); - } -} - -static void -ofproto_dpif_self_check(struct unixctl_conn *conn, - int argc, const char *argv[], void *aux OVS_UNUSED) -{ - struct ds reply = DS_EMPTY_INITIALIZER; - struct ofproto_dpif *ofproto; - - if (argc > 1) { - ofproto = ofproto_dpif_lookup(argv[1]); - if (!ofproto) { - unixctl_command_reply_error(conn, "Unknown ofproto (use " - "ofproto/list for help)"); - return; - } - ofproto_dpif_self_check__(ofproto, &reply); - } else { - HMAP_FOR_EACH (ofproto, all_ofproto_dpifs_node, &all_ofproto_dpifs) { - ofproto_dpif_self_check__(ofproto, &reply); - } - } - - unixctl_command_reply(conn, ds_cstr(&reply)); - ds_destroy(&reply); -} - /* Store the current ofprotos in 'ofproto_shash'. Returns a sorted list * of the 'ofproto_shash' nodes. It is the responsibility of the caller * to destroy 'ofproto_shash' and free the returned value. 
*/ @@ -5551,25 +3857,14 @@ static void dpif_show_backer(const struct dpif_backer *backer, struct ds *ds) { const struct shash_node **ofprotos; - struct ofproto_dpif *ofproto; + struct dpif_dp_stats dp_stats; struct shash ofproto_shash; - uint64_t n_hit, n_missed; size_t i; - n_hit = n_missed = 0; - HMAP_FOR_EACH (ofproto, all_ofproto_dpifs_node, &all_ofproto_dpifs) { - if (ofproto->backer == backer) { - n_missed += ofproto->n_missed; - n_hit += ofproto->n_hit; - } - } + dpif_get_dp_stats(backer->dpif, &dp_stats); ds_put_format(ds, "%s: hit:%"PRIu64" missed:%"PRIu64"\n", - dpif_name(backer->dpif), n_hit, n_missed); - - ds_put_format(ds, "\tflows: cur: %"PRIuSIZE", avg: %u, max: %u\n", - hmap_count(&backer->subfacets), backer->avg_n_subfacet, - backer->max_n_subfacet); + dpif_name(backer->dpif), dp_stats.n_hit, dp_stats.n_missed); shash_init(&ofproto_shash); ofprotos = get_ofprotos(&ofproto_shash); @@ -5582,8 +3877,7 @@ dpif_show_backer(const struct dpif_backer *backer, struct ds *ds) continue; } - ds_put_format(ds, "\t%s: hit:%"PRIu64" missed:%"PRIu64"\n", - ofproto->up.name, ofproto->n_hit, ofproto->n_missed); + ds_put_format(ds, "\t%s:\n", ofproto->up.name); ports = shash_sort(&ofproto->up.port_by_name); for (j = 0; j < shash_count(&ofproto->up.port_by_name); j++) { @@ -5646,104 +3940,6 @@ ofproto_unixctl_dpif_show(struct unixctl_conn *conn, int argc OVS_UNUSED, ds_destroy(&ds); } -/* Dump the megaflow (facet) cache. This is useful to check the - * correctness of flow wildcarding, since the same mechanism is used for - * both xlate caching and kernel wildcarding. - * - * It's important to note that in the output the flow description uses - * OpenFlow (OFP) ports, but the actions use datapath (ODP) ports. - * - * This command is only needed for advanced debugging, so it's not - * documented in the man page. */ -static void -ofproto_unixctl_dpif_dump_megaflows(struct unixctl_conn *conn, - int argc OVS_UNUSED, const char *argv[], - void *aux OVS_UNUSED) -{ - struct ds ds = DS_EMPTY_INITIALIZER; - const struct ofproto_dpif *ofproto; - long long int now = time_msec(); - struct cls_cursor cursor; - struct facet *facet; - - ofproto = ofproto_dpif_lookup(argv[1]); - if (!ofproto) { - unixctl_command_reply_error(conn, "no such bridge"); - return; - } - - ovs_rwlock_rdlock(&ofproto->facets.rwlock); - cls_cursor_init(&cursor, &ofproto->facets, NULL); - CLS_CURSOR_FOR_EACH (facet, cr, &cursor) { - cls_rule_format(&facet->cr, &ds); - ds_put_cstr(&ds, ", "); - ds_put_format(&ds, "n_subfacets:%"PRIuSIZE", ", list_size(&facet->subfacets)); - ds_put_format(&ds, "used:%.3fs, ", (now - facet->used) / 1000.0); - ds_put_cstr(&ds, "Datapath actions: "); - if (facet->xout.slow) { - uint64_t slow_path_stub[128 / 8]; - const struct nlattr *actions; - size_t actions_len; - - compose_slow_path(ofproto, &facet->flow, facet->xout.slow, - slow_path_stub, sizeof slow_path_stub, - &actions, &actions_len); - format_odp_actions(&ds, actions, actions_len); - } else { - format_odp_actions(&ds, facet->xout.odp_actions.data, - facet->xout.odp_actions.size); - } - ds_put_cstr(&ds, "\n"); - } - ovs_rwlock_unlock(&ofproto->facets.rwlock); - - ds_chomp(&ds, '\n'); - unixctl_command_reply(conn, ds_cstr(&ds)); - ds_destroy(&ds); -} - -/* Disable using the megaflows. - * - * This command is only needed for advanced debugging, so it's not - * documented in the man page. 
*/ -static void -ofproto_unixctl_dpif_disable_megaflows(struct unixctl_conn *conn, - int argc OVS_UNUSED, - const char *argv[] OVS_UNUSED, - void *aux OVS_UNUSED) -{ - struct ofproto_dpif *ofproto; - - enable_megaflows = false; - - HMAP_FOR_EACH (ofproto, all_ofproto_dpifs_node, &all_ofproto_dpifs) { - flush(&ofproto->up); - } - - unixctl_command_reply(conn, "megaflows disabled"); -} - -/* Re-enable using megaflows. - * - * This command is only needed for advanced debugging, so it's not - * documented in the man page. */ -static void -ofproto_unixctl_dpif_enable_megaflows(struct unixctl_conn *conn, - int argc OVS_UNUSED, - const char *argv[] OVS_UNUSED, - void *aux OVS_UNUSED) -{ - struct ofproto_dpif *ofproto; - - enable_megaflows = true; - - HMAP_FOR_EACH (ofproto, all_ofproto_dpifs_node, &all_ofproto_dpifs) { - flush(&ofproto->up); - } - - unixctl_command_reply(conn, "megaflows enabled"); -} - static bool ofproto_dpif_contains_flow(const struct ofproto_dpif *ofproto, const struct nlattr *key, size_t key_len) @@ -5826,20 +4022,12 @@ ofproto_dpif_unixctl_init(void) ofproto_unixctl_fdb_flush, NULL); unixctl_command_register("fdb/show", "bridge", 1, 1, ofproto_unixctl_fdb_show, NULL); - unixctl_command_register("ofproto/self-check", "[bridge]", 0, 1, - ofproto_dpif_self_check, NULL); unixctl_command_register("dpif/dump-dps", "", 0, 0, ofproto_unixctl_dpif_dump_dps, NULL); unixctl_command_register("dpif/show", "", 0, 0, ofproto_unixctl_dpif_show, NULL); unixctl_command_register("dpif/dump-flows", "bridge", 1, 1, ofproto_unixctl_dpif_dump_flows, NULL); - unixctl_command_register("dpif/dump-megaflows", "bridge", 1, 1, - ofproto_unixctl_dpif_dump_megaflows, NULL); - unixctl_command_register("dpif/disable-megaflows", "", 0, 0, - ofproto_unixctl_dpif_disable_megaflows, NULL); - unixctl_command_register("dpif/enable-megaflows", "", 0, 0, - ofproto_unixctl_dpif_enable_megaflows, NULL); } /* Linux VLAN device support (e.g. "eth0.10" for VLAN 10.) @@ -6111,7 +4299,7 @@ const struct ofproto_class ofproto_dpif_class = { dealloc, run, wait, - get_memory_usage, + NULL, /* get_memory_usage. */ type_get_memory_usage, flush, get_features, diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h index bc08189ae..cc318eed1 100644 --- a/ofproto/ofproto-provider.h +++ b/ofproto/ofproto-provider.h @@ -455,13 +455,13 @@ void rule_collection_ref(struct rule_collection *) OVS_REQUIRES(ofproto_mutex); void rule_collection_unref(struct rule_collection *); void rule_collection_destroy(struct rule_collection *); -/* Threshold at which to begin flow table eviction. Only affects the - * ofproto-dpif implementation */ -extern unsigned flow_eviction_threshold; +/* Limits the number of flows allowed in the datapath. Only affects the + * ofproto-dpif implementation. */ +extern unsigned ofproto_flow_limit; -/* Number of upcall handler threads. Only affects the ofproto-dpif - * implementation. */ -extern size_t n_handlers; +/* Number of upcall handler and revalidator threads. Only affects the + * ofproto-dpif implementation. */ +extern size_t n_handlers, n_revalidators; /* Determines which model to use for handling misses in the ofproto-dpif * implementation */ diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c index ef444a24a..75461e2dd 100644 --- a/ofproto/ofproto.c +++ b/ofproto/ofproto.c @@ -306,10 +306,10 @@ static size_t allocated_ofproto_classes; /* Global lock that protects all flow table operations. 
*/ struct ovs_mutex ofproto_mutex = OVS_MUTEX_INITIALIZER; -unsigned flow_eviction_threshold = OFPROTO_FLOW_EVICTION_THRESHOLD_DEFAULT; +unsigned ofproto_flow_limit = OFPROTO_FLOW_LIMIT_DEFAULT; enum ofproto_flow_miss_model flow_miss_model = OFPROTO_HANDLE_MISS_AUTO; -size_t n_handlers; +size_t n_handlers, n_revalidators; /* Map from datapath name to struct ofproto, for use by unixctl commands. */ static struct hmap all_ofprotos = HMAP_INITIALIZER(&all_ofprotos); @@ -693,10 +693,9 @@ ofproto_set_in_band_queue(struct ofproto *ofproto, int queue_id) /* Sets the number of flows at which eviction from the kernel flow table * will occur. */ void -ofproto_set_flow_eviction_threshold(unsigned threshold) +ofproto_set_flow_limit(unsigned limit) { - flow_eviction_threshold = MAX(OFPROTO_FLOW_EVICTION_THRESHOLD_MIN, - threshold); + ofproto_flow_limit = limit; } /* Sets the path for handling flow misses. */ @@ -734,13 +733,23 @@ ofproto_set_mac_table_config(struct ofproto *ofproto, unsigned idle_time, } } -/* Sets number of upcall handler threads. The default is - * (number of online cores - 2). */ void -ofproto_set_threads(size_t n_handlers_) +ofproto_set_threads(size_t n_handlers_, size_t n_revalidators_) { - int threads = MAX(count_cpu_cores() - 2, 1); - n_handlers = n_handlers_ ? n_handlers_ : threads; + int threads = MAX(count_cpu_cores(), 2); + + n_revalidators = n_revalidators_; + n_handlers = n_handlers_; + + if (!n_revalidators) { + n_revalidators = n_handlers + ? MAX(threads - (int) n_handlers, 1) + : threads / 4 + 1; + } + + if (!n_handlers) { + n_handlers = MAX(threads - (int) n_revalidators, 1); + } } void diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h index 482212878..3034d32b3 100644 --- a/ofproto/ofproto.h +++ b/ofproto/ofproto.h @@ -213,8 +213,7 @@ int ofproto_port_dump_done(struct ofproto_port_dump *); : (ofproto_port_dump_done(DUMP), false)); \ ) -#define OFPROTO_FLOW_EVICTION_THRESHOLD_DEFAULT 2500 -#define OFPROTO_FLOW_EVICTION_THRESHOLD_MIN 100 +#define OFPROTO_FLOW_LIMIT_DEFAULT 200000 /* How flow misses should be handled in ofproto-dpif */ enum ofproto_flow_miss_model { @@ -243,12 +242,12 @@ void ofproto_reconnect_controllers(struct ofproto *); void ofproto_set_extra_in_band_remotes(struct ofproto *, const struct sockaddr_in *, size_t n); void ofproto_set_in_band_queue(struct ofproto *, int queue_id); -void ofproto_set_flow_eviction_threshold(unsigned threshold); +void ofproto_set_flow_limit(unsigned limit); void ofproto_set_flow_miss_model(unsigned model); void ofproto_set_forward_bpdu(struct ofproto *, bool forward_bpdu); void ofproto_set_mac_table_config(struct ofproto *, unsigned idle_time, size_t max_entries); -void ofproto_set_threads(size_t n_handlers); +void ofproto_set_threads(size_t n_handlers, size_t n_revalidators); void ofproto_set_dp_desc(struct ofproto *, const char *dp_desc); int ofproto_set_snoops(struct ofproto *, const struct sset *snoops); int ofproto_set_netflow(struct ofproto *, diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index 90faea1ca..9cd23938d 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -2100,22 +2100,23 @@ for delay in 1000 30000; do ovs-appctl time/warp $delay done +ovs-appctl time/warp 6000 sleep 1 OVS_VSWITCHD_STOP ovs-appctl -t test-netflow exit -AT_CHECK([[sed -e 's/, uptime [0-9]*// -s/, now [0-9.]*// -s/time \([0-9]*\)\.\.\.\1$/time / -s/time [0-9]*\.\.\.[0-9]*/time / -' netflow.log | sort]], [0], - [ -header: v5, seq 0, engine 2,1 -header: v5, seq 1, engine 2,1 -seq 0: 192.168.0.1 > 192.168.0.2, if 1 > 
65535, 1 pkts, 60 bytes, ICMP 8:0, time -seq 1: 192.168.0.1 > 192.168.0.2, if 1 > 2, 1 pkts, 60 bytes, ICMP 8:0, time -seq 1: 192.168.0.2 > 192.168.0.1, if 2 > 1, 2 pkts, 120 bytes, ICMP 0:0, time -]) +AT_CHECK([grep "192.168.0.1 > 192.168.0.2, if 1 > 65535, 1 pkts, 60 bytes, ICMP 8:0" netflow.log | wc -l], [0], [dnl +1 +], [ignore]) + +AT_CHECK([grep "192.168.0.1 > 192.168.0.2, if 1 > 2, 1 pkts, 60 bytes, ICMP 8:0" netflow.log | wc -l], [0], [dnl +1 +], [ignore]) + +combined=`grep "192.168.0.2 > 192.168.0.1, if 2 > 1, 2 pkts, 120 bytes, ICMP 0:0" netflow.log | wc -l` +separate=`grep "192.168.0.2 > 192.168.0.1, if 2 > 1, 1 pkts, 60 bytes, ICMP 0:0" netflow.log | wc -l` +AT_CHECK([test $separate = 2 || test $combined = 1], [0]) + AT_CLEANUP dnl Test that basic NetFlow reports active expirations correctly. @@ -2192,12 +2193,6 @@ done < netflow.log AT_CHECK([echo $n_learn $n_in $n_out $n_other], [0], [1 59 60 0 ]) -# There should be 1 expiration for MAC learning, -# at least 5 active and a final expiration in one direction, -# and at least 5 active and a final expiration in the other direction. -echo $n_recs -AT_CHECK([test $n_recs -ge 13]) - AT_CLEANUP AT_SETUP([idle_age and hard_age increase over time]) @@ -2347,12 +2342,11 @@ ADD_OF_PORTS([br1], [3]) AT_CHECK([ovs-appctl dpif/show], [0], [dnl dummy@ovs-dummy: hit:0 missed:0 - flows: cur: 0, avg: 0, max: 0 - br0: hit:0 missed:0 + br0: br0 65534/100: (dummy) p1 1/1: (dummy) p2 2/2: (dummy) - br1: hit:0 missed:0 + br1: br1 65534/101: (dummy) p3 3/3: (dummy) ]) @@ -2409,12 +2403,11 @@ warped AT_CHECK([ovs-appctl dpif/show], [0], [dnl dummy@ovs-dummy: hit:13 missed:2 - flows: cur: 2, avg: 1, max: 2 - br0: hit:9 missed:1 + br0: br0 65534/100: (dummy) p2 2/2: (dummy) pbr0 1/none: (patch: peer=pbr1) - br1: hit:4 missed:1 + br1: br1 65534/101: (dummy) p3 3/3: (dummy) pbr1 1/none: (patch: peer=pbr0) @@ -2442,34 +2435,6 @@ OFPST_PORT reply (xid=0x4): 1 ports OVS_VSWITCHD_STOP AT_CLEANUP -AT_SETUP([ofproto-dpif - ovs-appctl dpif/show rates]) -OVS_VSWITCHD_START([set Bridge br0 fail-mode=secure]) -ADD_OF_PORTS([br0], 1, 2) - -AT_CHECK([ovs-appctl time/stop]) -AT_CHECK([ovs-ofctl add-flow br0 actions=LOCAL,output:1,output:2]) - -for i in $(seq 1 61); do - ovs-appctl netdev-dummy/receive br0 'in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)' - ovs-appctl time/warp 10000 - ovs-appctl time/warp 50000 -done - -AT_CHECK([ovs-appctl time/warp 10000], [0], [warped -]) - -AT_CHECK([ovs-appctl dpif/show | sed 's/ 10[[0-9]]\{3\}(ms)$/ 10000(ms)/'], [0], [dnl -dummy@ovs-dummy: hit:0 missed:61 - flows: cur: 0, avg: 0, max: 1 - br0: hit:0 missed:61 - br0 65534/100: (dummy) - p1 1/1: (dummy) - p2 2/2: (dummy) -]) - -OVS_VSWITCHD_STOP -AT_CLEANUP - AT_SETUP([ofproto-dpif - port duration]) OVS_VSWITCHD_START([set Bridge br0 protocols=OpenFlow13]) ADD_OF_PORTS([br0], 1, 2) @@ -2949,7 +2914,7 @@ AT_DATA([flows.txt], [dnl table=0 in_port=1,ip,nw_dst=10.0.0.1 actions=output(2) table=0 in_port=1,ip,nw_dst=10.0.0.3 actions=drop ]) -AT_CHECK([ovs-appctl dpif/disable-megaflows], [0], [megaflows disabled +AT_CHECK([ovs-appctl upcall/disable-megaflows], [0], [megaflows disabled ], []) AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg], [0], [], []) AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) diff --git a/tests/ovs-ofctl.at b/tests/ovs-ofctl.at index 6e6461b86..bda866615 100644 --- a/tests/ovs-ofctl.at +++ b/tests/ovs-ofctl.at @@ -2485,7 +2485,7 @@ AT_CHECK([ovs-ofctl 
add-flow br0 "tcp,tcp_flags=+ack-ack,action="], [1], [], [ovs-ofctl: ack: Each TCP flag can be specified only once ]) -AT_CHECK([ovs-appctl dpif/show | tail -n +5], [0], [dnl +AT_CHECK([ovs-appctl dpif/show | tail -n +4], [0], [dnl p1 1/1: (dummy) p2 2/2: (dummy) ]) diff --git a/tests/tunnel.at b/tests/tunnel.at index 982d22a5d..4f22b3fd7 100644 --- a/tests/tunnel.at +++ b/tests/tunnel.at @@ -14,7 +14,7 @@ actions=IN_PORT AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) -AT_CHECK([ovs-appctl dpif/show | tail -n +4], [0], [dnl +AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl br0 65534/100: (dummy) p1 1/1: (gre: remote_ip=1.1.1.1) p2 2/1: (gre: local_ip=2.2.2.2, remote_ip=1.1.1.1) @@ -37,7 +37,7 @@ dnl reconfigure, local_ip, remote_ip AT_CHECK([ovs-vsctl set Interface p2 type=gre options:local_ip=2.2.2.3 \ options:df_default=false options:ttl=1 options:csum=true \ -- set Interface p3 type=gre64]) -AT_CHECK([ovs-appctl dpif/show | tail -n +4], [0], [dnl +AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl br0 65534/100: (dummy) p1 1/1: (gre: remote_ip=1.1.1.1) p2 2/1: (gre: csum=true, df_default=false, local_ip=2.2.2.3, remote_ip=1.1.1.1, ttl=1) @@ -72,7 +72,7 @@ actions=2 AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) -AT_CHECK([ovs-appctl dpif/show | tail -n +4], [0], [dnl +AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl br0 65534/100: (dummy) p1 1/1: (gre: remote_ip=1.1.1.1) p2 2/2: (dummy) @@ -116,7 +116,7 @@ actions=output:1 AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) -AT_CHECK([ovs-appctl dpif/show | tail -n +4], [0], [dnl +AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl br0 65534/100: (dummy) p1 1/1: (gre: key=5, local_ip=2.2.2.2, remote_ip=1.1.1.1) p2 2/2: (dummy) @@ -148,7 +148,7 @@ actions=output:1 AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) -AT_CHECK([ovs-appctl dpif/show | tail -n +4], [0], [dnl +AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl br0 65534/100: (dummy) p1 1/1: (gre: remote_ip=1.1.1.1, tos=inherit, ttl=inherit) p2 2/2: (dummy) @@ -190,7 +190,7 @@ actions=set_tunnel:1,output:1,set_tunnel:2,output:2,set_tunnel:3,output:3,set_tu AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) -AT_CHECK([ovs-appctl dpif/show | tail -n +4], [0], [dnl +AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl br0 65534/100: (dummy) p1 1/1: (gre: key=flow, remote_ip=1.1.1.1) p2 2/1: (gre: key=flow, remote_ip=2.2.2.2) @@ -222,7 +222,7 @@ actions=IN_PORT,output:1,output:2,output:3 AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) -AT_CHECK([ovs-appctl dpif/show | tail -n +4], [0], [dnl +AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl br0 65534/100: (dummy) p1 1/1: (gre: key=1, remote_ip=1.1.1.1) p2 2/1: (gre: in_key=2, out_key=3, remote_ip=1.1.1.1) @@ -274,7 +274,7 @@ tun_id=4,actions=output:5 AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) -AT_CHECK([ovs-appctl dpif/show | tail -n +4], [0], [dnl +AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl br0 65534/100: (dummy) p1 1/1: (gre: key=flow, remote_ip=1.1.1.1) p2 2/1: (gre: key=3, remote_ip=3.3.3.3) @@ -310,7 +310,7 @@ AT_SETUP([tunnel - VXLAN]) OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=vxlan \ options:remote_ip=1.1.1.1 ofport_request=1]) -AT_CHECK([ovs-appctl dpif/show | tail -n +4], [0], [dnl +AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl br0 65534/100: (dummy) p1 1/1: (vxlan: remote_ip=1.1.1.1) ]) @@ -322,7 +322,7 @@ AT_SETUP([tunnel - LISP]) OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=lisp \ options:remote_ip=1.1.1.1 ofport_request=1]) 
-AT_CHECK([ovs-appctl dpif/show | tail -n +4], [0], [dnl +AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl br0 65534/100: (dummy) p1 1/1: (lisp: remote_ip=1.1.1.1) ]) @@ -334,7 +334,7 @@ AT_SETUP([tunnel - different VXLAN UDP port]) OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=vxlan \ options:remote_ip=1.1.1.1 ofport_request=1 options:dst_port=4341]) -AT_CHECK([ovs-appctl dpif/show | tail -n +4], [0], [dnl +AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl br0 65534/100: (dummy) p1 1/1: (vxlan: dst_port=4341, remote_ip=1.1.1.1) ]) @@ -343,7 +343,7 @@ dnl change UDP port AT_CHECK([ovs-vsctl -- set Interface p1 options:dst_port=5000]) -AT_CHECK([ovs-appctl dpif/show | tail -n +4], [0], [dnl +AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl br0 65534/100: (dummy) p1 1/2: (vxlan: dst_port=5000, remote_ip=1.1.1.1) ]) @@ -352,7 +352,7 @@ dnl change UDP port to default AT_CHECK([ovs-vsctl -- set Interface p1 options:dst_port=4789]) -AT_CHECK([ovs-appctl dpif/show | tail -n +4], [0], [dnl +AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl br0 65534/100: (dummy) p1 1/1: (vxlan: remote_ip=1.1.1.1) ]) diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 9282c59d2..6311ff3a4 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -493,12 +493,12 @@ bridge_reconfigure(const struct ovsrec_open_vswitch *ovs_cfg) COVERAGE_INC(bridge_reconfigure); - ofproto_set_flow_eviction_threshold( - smap_get_int(&ovs_cfg->other_config, "flow-eviction-threshold", - OFPROTO_FLOW_EVICTION_THRESHOLD_DEFAULT)); + ofproto_set_flow_limit(smap_get_int(&ovs_cfg->other_config, "flow-limit", + OFPROTO_FLOW_LIMIT_DEFAULT)); ofproto_set_threads( - smap_get_int(&ovs_cfg->other_config, "n-handler-threads", 0)); + smap_get_int(&ovs_cfg->other_config, "n-handler-threads", 0), + smap_get_int(&ovs_cfg->other_config, "n-revalidator-threads", 0)); bridge_configure_flow_miss_model(smap_get(&ovs_cfg->other_config, "force-miss-model")); diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index ee1c0586c..5fd82fcb8 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -123,17 +123,16 @@

-

- A number of flows as a nonnegative integer. This sets number of - flows at which eviction from the datapath flow table will be - triggered. If there are a large number of flows then increasing this - value to around the number of flows present can result in reduced CPU - usage and packet loss. + The maximum + number of flows allowed in the datapath flow table. Internally OVS + will choose a flow limit which will likely be lower than this number, + based on real-time network conditions.

- The default is 2500. Values below 100 will be rounded up to 100. + The default is 200000.
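For reference, the key follows the usual other_config convention; a minimal usage sketch (the value shown is illustrative, not a recommendation):

    # Cap the datapath flow table at 100,000 flows instead of the default
    # 200,000; OVS may still choose a lower effective limit on its own.
    ovs-vsctl set Open_vSwitch . other_config:flow-limit=100000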

@@ -163,8 +162,28 @@ type='{"type": "integer", "minInteger": 1}'>

Specifies the number of threads for software datapaths to use for - handling new flows. The default is two less than the number of - online CPU cores (but at least 1). + handling new flows. The default is the number of online CPU cores minus + the number of revalidators.

+

+ This configuration is per datapath. If you have more than one + software datapath (e.g. some system bridges and some + netdev bridges), then the total number of threads is + n-handler-threads times the number of software + datapaths. +
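A minimal sketch of overriding the default (the value 4 is arbitrary):

    # Pin the number of handler threads; with n-revalidator-threads unset,
    # revalidators then default to MAX(cores - 4, 1) per the new
    # ofproto_set_threads() logic.
    ovs-vsctl set Open_vSwitch . other_config:n-handler-threads=4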

+ + + +

+ Specifies the number of threads for software datapaths to use for + revalidating flows in the datapath. Typically, there is a direct + correlation between the number of revalidator threads and the number + of flows allowed in the datapath. The default is the number of CPU + cores divided by four plus one. If n-handler-threads is + set, the default changes to the number of CPU cores minus the number + of handler threads.
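As a worked example of those defaults (a sketch, assuming an 8-core machine with neither key set): OVS starts 8/4 + 1 = 3 revalidators and 8 - 3 = 5 handlers. Setting this key overrides only the revalidator count:

    # Force 2 revalidator threads; with n-handler-threads unset, handlers
    # then default to MAX(cores - 2, 1).
    ovs-vsctl set Open_vSwitch . other_config:n-revalidator-threads=2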

This configuration is per datapath. If you have more than one