From 0f2ea84841e1689ebf18e2260819e6493dbf2356 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Tue, 18 Mar 2014 16:34:28 -0700 Subject: [PATCH] ovs-rcu: New library. RCU allows multiple threads to read objects in parallel without any performance penalty. The following commit will introduce the first use. Signed-off-by: Ben Pfaff Acked-by: Andy Zhou --- lib/automake.mk | 2 + lib/ovs-rcu.c | 293 ++++++++++++++++++++++++++++++++++ lib/ovs-rcu.h | 182 +++++++++++++++++++++ lib/ovs-thread.c | 16 +- lib/ovs-thread.h | 2 + lib/timeval.c | 11 ++ ofproto/ofproto-dpif-upcall.c | 5 + 7 files changed, 510 insertions(+), 1 deletion(-) create mode 100644 lib/ovs-rcu.c create mode 100644 lib/ovs-rcu.h diff --git a/lib/automake.mk b/lib/automake.mk index e22165ca3..1460ee29d 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -145,6 +145,8 @@ lib_libopenvswitch_la_SOURCES = \ lib/ovs-atomic-locked.h \ lib/ovs-atomic-pthreads.h \ lib/ovs-atomic.h \ + lib/ovs-rcu.c \ + lib/ovs-rcu.h \ lib/ovs-thread.c \ lib/ovs-thread.h \ lib/ovsdb-data.c \ diff --git a/lib/ovs-rcu.c b/lib/ovs-rcu.c new file mode 100644 index 000000000..ac4513b33 --- /dev/null +++ b/lib/ovs-rcu.c @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2014 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "ovs-rcu.h" +#include "guarded-list.h" +#include "list.h" +#include "ovs-thread.h" +#include "poll-loop.h" +#include "seq.h" + +struct ovsrcu_cb { + void (*function)(void *aux); + void *aux; +}; + +struct ovsrcu_cbset { + struct list list_node; + struct ovsrcu_cb cbs[16]; + int n_cbs; +}; + +struct ovsrcu_perthread { + struct list list_node; /* In global list. */ + + struct ovs_mutex mutex; + uint64_t seqno; + struct ovsrcu_cbset *cbset; +}; + +static struct seq *global_seqno; + +static pthread_key_t perthread_key; +static struct list ovsrcu_threads; +static struct ovs_mutex ovsrcu_threads_mutex; + +static struct guarded_list flushed_cbsets; +static struct seq *flushed_cbsets_seq; + +static void ovsrcu_init(void); +static void ovsrcu_flush_cbset(struct ovsrcu_perthread *); +static void ovsrcu_unregister__(struct ovsrcu_perthread *); +static bool ovsrcu_call_postponed(void); +static void *ovsrcu_postpone_thread(void *arg OVS_UNUSED); +static void ovsrcu_synchronize(void); + +static struct ovsrcu_perthread * +ovsrcu_perthread_get(void) +{ + struct ovsrcu_perthread *perthread; + + ovsrcu_init(); + + perthread = pthread_getspecific(perthread_key); + if (!perthread) { + perthread = xmalloc(sizeof *perthread); + ovs_mutex_init(&perthread->mutex); + perthread->seqno = seq_read(global_seqno); + perthread->cbset = NULL; + + ovs_mutex_lock(&ovsrcu_threads_mutex); + list_push_back(&ovsrcu_threads, &perthread->list_node); + ovs_mutex_unlock(&ovsrcu_threads_mutex); + + pthread_setspecific(perthread_key, perthread); + } + return perthread; +} + +/* Indicates the end of a quiescent state. See "Details" near the top of + * ovs-rcu.h. + * + * Quiescent states don't stack or nest, so this always ends a quiescent state + * even if ovsrcu_quiesce_start() was called multiple times in a row. */ +void +ovsrcu_quiesce_end(void) +{ + ovsrcu_perthread_get(); +} + +static void +ovsrcu_quiesced(void) +{ + if (single_threaded()) { + ovsrcu_call_postponed(); + } else { + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; + if (ovsthread_once_start(&once)) { + xpthread_create(NULL, NULL, ovsrcu_postpone_thread, NULL); + ovsthread_once_done(&once); + } + } +} + +/* Indicates the beginning of a quiescent state. See "Details" near the top of + * ovs-rcu.h. */ +void +ovsrcu_quiesce_start(void) +{ + struct ovsrcu_perthread *perthread; + + ovsrcu_init(); + perthread = pthread_getspecific(perthread_key); + if (perthread) { + pthread_setspecific(perthread_key, NULL); + ovsrcu_unregister__(perthread); + } + + ovsrcu_quiesced(); +} + +/* Indicates a momentary quiescent state. See "Details" near the top of + * ovs-rcu.h. */ +void +ovsrcu_quiesce(void) +{ + ovsrcu_perthread_get()->seqno = seq_read(global_seqno); + seq_change(global_seqno); + + ovsrcu_quiesced(); +} + +static void +ovsrcu_synchronize(void) +{ + uint64_t target_seqno; + + if (single_threaded()) { + return; + } + + target_seqno = seq_read(global_seqno); + ovsrcu_quiesce_start(); + + for (;;) { + uint64_t cur_seqno = seq_read(global_seqno); + struct ovsrcu_perthread *perthread; + bool done = true; + + ovs_mutex_lock(&ovsrcu_threads_mutex); + LIST_FOR_EACH (perthread, list_node, &ovsrcu_threads) { + if (perthread->seqno <= target_seqno) { + done = false; + break; + } + } + ovs_mutex_unlock(&ovsrcu_threads_mutex); + + if (done) { + break; + } + + seq_wait(global_seqno, cur_seqno); + poll_block(); + } + ovsrcu_quiesce_end(); +} + +/* Registers 'function' to be called, passing 'aux' as argument, after the + * next grace period. + * + * This function is more conveniently called through the ovsrcu_postpone() + * macro, which provides a type-safe way to allow 'function''s parameter to be + * any pointer type. */ +void +ovsrcu_postpone__(void (*function)(void *aux), void *aux) +{ + struct ovsrcu_perthread *perthread = ovsrcu_perthread_get(); + struct ovsrcu_cbset *cbset; + struct ovsrcu_cb *cb; + + cbset = perthread->cbset; + if (!cbset) { + cbset = perthread->cbset = xmalloc(sizeof *perthread->cbset); + cbset->n_cbs = 0; + } + + cb = &cbset->cbs[cbset->n_cbs++]; + cb->function = function; + cb->aux = aux; + + if (cbset->n_cbs >= ARRAY_SIZE(cbset->cbs)) { + ovsrcu_flush_cbset(perthread); + } +} + +static bool +ovsrcu_call_postponed(void) +{ + struct ovsrcu_cbset *cbset, *next_cbset; + struct list cbsets; + + guarded_list_pop_all(&flushed_cbsets, &cbsets); + if (list_is_empty(&cbsets)) { + return false; + } + + ovsrcu_synchronize(); + + LIST_FOR_EACH_SAFE (cbset, next_cbset, list_node, &cbsets) { + struct ovsrcu_cb *cb; + + for (cb = cbset->cbs; cb < &cbset->cbs[cbset->n_cbs]; cb++) { + cb->function(cb->aux); + } + list_remove(&cbset->list_node); + free(cbset); + } + + return true; +} + +static void * +ovsrcu_postpone_thread(void *arg OVS_UNUSED) +{ + pthread_detach(pthread_self()); + + for (;;) { + uint64_t seqno = seq_read(flushed_cbsets_seq); + if (!ovsrcu_call_postponed()) { + seq_wait(flushed_cbsets_seq, seqno); + poll_block(); + } + } + + OVS_NOT_REACHED(); +} + +static void +ovsrcu_flush_cbset(struct ovsrcu_perthread *perthread) +{ + struct ovsrcu_cbset *cbset = perthread->cbset; + + if (cbset) { + guarded_list_push_back(&flushed_cbsets, &cbset->list_node, SIZE_MAX); + perthread->cbset = NULL; + + seq_change(flushed_cbsets_seq); + } +} + +static void +ovsrcu_unregister__(struct ovsrcu_perthread *perthread) +{ + if (perthread->cbset) { + ovsrcu_flush_cbset(perthread); + } + + ovs_mutex_lock(&ovsrcu_threads_mutex); + list_remove(&perthread->list_node); + ovs_mutex_unlock(&ovsrcu_threads_mutex); + + ovs_mutex_destroy(&perthread->mutex); + free(perthread); + + seq_change(global_seqno); +} + +static void +ovsrcu_thread_exit_cb(void *perthread) +{ + ovsrcu_unregister__(perthread); +} + +static void +ovsrcu_init(void) +{ + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; + if (ovsthread_once_start(&once)) { + global_seqno = seq_create(); + xpthread_key_create(&perthread_key, ovsrcu_thread_exit_cb); + list_init(&ovsrcu_threads); + ovs_mutex_init(&ovsrcu_threads_mutex); + + guarded_list_init(&flushed_cbsets); + flushed_cbsets_seq = seq_create(); + + ovsthread_once_done(&once); + } +} diff --git a/lib/ovs-rcu.h b/lib/ovs-rcu.h new file mode 100644 index 000000000..710870a8b --- /dev/null +++ b/lib/ovs-rcu.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2014 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef OVS_RCU_H +#define OVS_RCU_H 1 + +/* Read-Copy-Update (RCU) + * ====================== + * + * Introduction + * ------------ + * + * Atomic pointer access makes it pretty easy to implement lock-free + * algorithms. There is one big problem, though: when a writer updates a + * pointer to point to a new data structure, some thread might be reading the + * old version, and there's no convenient way to free the old version when all + * threads are done with the old version. + * + * The function ovsrcu_postpone() solves that problem. The function pointer + * passed in as its argument is called only after all threads are done with old + * versions of data structures. The function callback frees an old version of + * data no longer in use. This technique is called "read-copy-update", or RCU + * for short. + * + * + * Details + * ------- + * + * A "quiescent state" is a time at which a thread holds no pointers to memory + * that is managed by RCU; that is, when the thread is known not to reference + * memory that might be an old version of some object freed via RCU. For + * example, poll_block() includes a quiescent state, as does + * ovs_mutex_cond_wait(). + * + * The following functions manage the recognition of quiescent states: + * + * void ovsrcu_quiesce(void) + * + * Recognizes a momentary quiescent state in the current thread. + * + * void ovsrcu_quiesce_start(void) + * void ovsrcu_quiesce_end(void) + * + * Brackets a time period during which the current thread is quiescent. + * + * A newly created thread is initially active, not quiescent. + * + * When a quiescient state has occurred in every thread, we say that a "grace + * period" has occurred. Following a grace period, all of the callbacks + * postponed before the start of the grace period may be invoked. OVS takes + * care of this automatically through the RCU mechanism: while a process still + * has only a single thread, it invokes the postponed callbacks directly from + * ovsrcu_quiesce() and ovsrcu_quiesce_start(); after additional threads have + * been created, it creates an extra helper thread to invoke callbacks. + * + * + * Use + * --- + * + * Use OVSRCU_TYPE(TYPE) to declare a pointer to RCU-protected data, e.g. the + * following declares an RCU-protected "struct flow *" named flowp: + * + * OVSRCU_TYPE(struct flow *) flowp; + * + * Use ovsrcu_get(TYPE, VAR) to read an RCU-protected pointer, e.g. to read the + * pointer variable declared above: + * + * struct flow *flow = ovsrcu_get(struct flow *, flowp); + * + * Use ovsrcu_set() to write an RCU-protected pointer and ovsrcu_postpone() to + * free the previous data. If more than one thread can write the pointer, then + * some form of external synchronization, e.g. a mutex, is needed to prevent + * writers from interfering with one another. For example, to write the + * pointer variable declared above while safely freeing the old value: + * + * static struct ovs_mutex mutex = OVS_MUTEX_INITIALIZER; + * + * static void + * free_flow(struct flow *flow) + * { + * free(flow); + * } + * + * void + * change_flow(struct flow *new_flow) + * { + * ovs_mutex_lock(&mutex); + * ovsrcu_postpone(free_flow, + * ovsrcu_get_protected(struct flow *, &flowp)); + * ovsrcu_set(&flowp, new_flow); + * ovs_mutex_unlock(&mutex); + * } + * + */ + +#include "compiler.h" +#include "ovs-atomic.h" + +/* Use OVSRCU_TYPE(TYPE) to declare a pointer to RCU-protected data, e.g. the + * following declares an RCU-protected "struct flow *" named flowp: + * + * OVSRCU_TYPE(struct flow *) flowp; + * + * Use ovsrcu_get(TYPE, VAR) to read an RCU-protected pointer, e.g. to read the + * pointer variable declared above: + * + * struct flow *flow = ovsrcu_get(struct flow *, flowp); + * + * If the pointer variable is currently protected against change (because + * the current thread holds a mutex that protects it), ovsrcu_get_protected() + * may be used instead. Only on the Alpha architecture is this likely to + * generate different code, but it may be useful documentation. + * + * (With GNU C or Clang, you get a compiler error if TYPE is wrong; other + * compilers will merrily carry along accepting the wrong type.) + */ +#if __GNUC__ +#define OVSRCU_TYPE(TYPE) struct { ATOMIC(TYPE) p; } +#define ovsrcu_get__(TYPE, VAR, ORDER) \ + ({ \ + TYPE value__; \ + \ + atomic_read_explicit(CONST_CAST(ATOMIC(TYPE) *, &(VAR)->p), \ + &value__, ORDER); \ + \ + value__; \ + }) +#define ovsrcu_get(TYPE, VAR) \ + CONST_CAST(TYPE, ovsrcu_get__(TYPE, VAR, memory_order_consume)) +#define ovsrcu_get_protected(TYPE, VAR) \ + CONST_CAST(TYPE, ovsrcu_get__(TYPE, VAR, memory_order_relaxed)) +#else /* not GNU C */ +typedef struct ovsrcu_pointer { ATOMIC(void *) p; }; +#define OVSRCU_TYPE(TYPE) struct ovsrcu_pointer +static inline void * +ovsrcu_get__(const struct ovsrcu_pointer *pointer, memory_order order) +{ + void *value; + atomic_read_explicit(&CONST_CAST(struct ovsrcu_pointer *, pointer)->p, + &value, order); + return value; +} +#define ovsrcu_get(TYPE, VAR) \ + CONST_CAST(TYPE, ovsrcu_get__(VAR, memory_order_consume)) +#define ovsrcu_get_protected(TYPE, VAR) \ + CONST_CAST(TYPE, ovsrcu_get__(VAR, memory_order_relaxed)) +#endif + +/* Writes VALUE to the RCU-protected pointer whose address is VAR. + * + * Users require external synchronization (e.g. a mutex). See "Usage" above + * for an example. */ +#define ovsrcu_set(VAR, VALUE) \ + atomic_store_explicit(&(VAR)->p, VALUE, memory_order_release) + +/* Calls FUNCTION passing ARG as its pointer-type argument following the next + * grace period. See "Usage" above for example. */ +void ovsrcu_postpone__(void (*function)(void *aux), void *aux); +#define ovsrcu_postpone(FUNCTION, ARG) \ + ((void) sizeof((FUNCTION)(ARG), 1), \ + (void) sizeof(*(ARG)), \ + ovsrcu_postpone__((void (*)(void *))(FUNCTION), ARG)) + +/* Quiescent states. */ +void ovsrcu_quiesce_start(void); +void ovsrcu_quiesce_end(void); +void ovsrcu_quiesce(void); + +#endif /* ovs-rcu.h */ diff --git a/lib/ovs-thread.c b/lib/ovs-thread.c index 76cbbd325..0dc558547 100644 --- a/lib/ovs-thread.c +++ b/lib/ovs-thread.c @@ -22,6 +22,7 @@ #include #include "compiler.h" #include "hash.h" +#include "ovs-rcu.h" #include "poll-loop.h" #include "socket-util.h" #include "util.h" @@ -223,7 +224,12 @@ void ovs_mutex_cond_wait(pthread_cond_t *cond, const struct ovs_mutex *mutex_) { struct ovs_mutex *mutex = CONST_CAST(struct ovs_mutex *, mutex_); - int error = pthread_cond_wait(cond, &mutex->lock); + int error; + + ovsrcu_quiesce_start(); + error = pthread_cond_wait(cond, &mutex->lock); + ovsrcu_quiesce_end(); + if (OVS_UNLIKELY(error)) { ovs_abort(error, "pthread_cond_wait failed"); } @@ -264,6 +270,7 @@ ovsthread_wrapper(void *aux_) aux = *auxp; free(auxp); + ovsrcu_quiesce_end(); return aux.start(aux.arg); } @@ -277,6 +284,7 @@ xpthread_create(pthread_t *threadp, pthread_attr_t *attr, forbid_forking("multiple threads exist"); multithreaded = true; + ovsrcu_quiesce_end(); aux = xmalloc(sizeof *aux); aux->start = start; @@ -307,6 +315,12 @@ ovsthread_once_done(struct ovsthread_once *once) ovs_mutex_unlock(&once->mutex); } +bool +single_threaded(void) +{ + return !multithreaded; +} + /* Asserts that the process has not yet created any threads (beyond the initial * thread). * diff --git a/lib/ovs-thread.h b/lib/ovs-thread.h index d3bd538a4..ca269b09d 100644 --- a/lib/ovs-thread.h +++ b/lib/ovs-thread.h @@ -598,6 +598,8 @@ void ovsthread_counter_inc(struct ovsthread_counter *, unsigned long long int); unsigned long long int ovsthread_counter_read( const struct ovsthread_counter *); +bool single_threaded(void); + void assert_single_threaded_at(const char *where); #define assert_single_threaded() assert_single_threaded_at(SOURCE_LOCATOR) diff --git a/lib/timeval.c b/lib/timeval.c index ef806e762..ebbdb9843 100644 --- a/lib/timeval.c +++ b/lib/timeval.c @@ -31,6 +31,7 @@ #include "fatal-signal.h" #include "hash.h" #include "hmap.h" +#include "ovs-rcu.h" #include "ovs-thread.h" #include "signals.h" #include "seq.h" @@ -286,6 +287,12 @@ time_poll(struct pollfd *pollfds, int n_pollfds, HANDLE *handles OVS_UNUSED, time_left = timeout_when - now; } + if (!time_left) { + ovsrcu_quiesce(); + } else { + ovsrcu_quiesce_start(); + } + #ifndef _WIN32 retval = poll(pollfds, n_pollfds, time_left); if (retval < 0) { @@ -306,6 +313,10 @@ time_poll(struct pollfd *pollfds, int n_pollfds, HANDLE *handles OVS_UNUSED, } #endif + if (time_left) { + ovsrcu_quiesce_end(); + } + if (deadline <= time_msec()) { #ifndef _WIN32 fatal_signal_handler(SIGALRM); diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index cc4982fe8..839cbff79 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -32,6 +32,7 @@ #include "ofproto-dpif-ipfix.h" #include "ofproto-dpif-sflow.h" #include "ofproto-dpif-xlate.h" +#include "ovs-rcu.h" #include "packets.h" #include "poll-loop.h" #include "seq.h" @@ -297,6 +298,8 @@ void udpif_set_threads(struct udpif *udpif, size_t n_handlers, size_t n_revalidators) { + ovsrcu_quiesce_start(); + /* Stop the old threads (if any). */ if (udpif->handlers && (udpif->n_handlers != n_handlers @@ -405,6 +408,8 @@ udpif_set_threads(struct udpif *udpif, size_t n_handlers, xpthread_create(&udpif->dispatcher, NULL, udpif_dispatcher, udpif); xpthread_create(&udpif->flow_dumper, NULL, udpif_flow_dumper, udpif); } + + ovsrcu_quiesce_end(); } /* Waits for all ongoing upcall translations to complete. This ensures that -- 2.43.0