#define TBL_MIN_BUCKETS 1024
#define REHASH_INTERVAL (10 * 60 * HZ)
+#define MC_HASH_SHIFT 8
+#define MC_HASH_ENTRIES (1u << MC_HASH_SHIFT)
+#define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT)
+
static struct kmem_cache *flow_cache;
+struct kmem_cache *flow_stats_cache __read_mostly;
static u16 range_n_bytes(const struct sw_flow_key_range *range)
{
void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
const struct sw_flow_mask *mask)
{
- const long *m = (long *)((u8 *)&mask->key + mask->range.start);
- const long *s = (long *)((u8 *)src + mask->range.start);
+ const long *m = (const long *)((const u8 *)&mask->key +
+ mask->range.start);
+ const long *s = (const long *)((const u8 *)src +
+ mask->range.start);
long *d = (long *)((u8 *)dst + mask->range.start);
int i;
*d++ = *s++ & *m++;
}
-struct sw_flow *ovs_flow_alloc(bool percpu_stats)
+struct sw_flow *ovs_flow_alloc(void)
{
struct sw_flow *flow;
- int cpu;
+ struct flow_stats *stats;
+ int node;
flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
if (!flow)
flow->sf_acts = NULL;
flow->mask = NULL;
+ flow->stats_last_writer = NUMA_NO_NODE;
- flow->stats.is_percpu = percpu_stats;
+ /* Initialize the default stat node. */
+ stats = kmem_cache_alloc_node(flow_stats_cache,
+ GFP_KERNEL | __GFP_ZERO, 0);
+ if (!stats)
+ goto err;
- if (!percpu_stats) {
- flow->stats.stat = kzalloc(sizeof(*flow->stats.stat), GFP_KERNEL);
- if (!flow->stats.stat)
- goto err;
+ spin_lock_init(&stats->lock);
- spin_lock_init(&flow->stats.stat->lock);
- } else {
- flow->stats.cpu_stats = alloc_percpu(struct flow_stats);
- if (!flow->stats.cpu_stats)
- goto err;
+ RCU_INIT_POINTER(flow->stats[0], stats);
- for_each_possible_cpu(cpu) {
- struct flow_stats *cpu_stats;
+ for_each_node(node)
+ if (node != 0)
+ RCU_INIT_POINTER(flow->stats[node], NULL);
- cpu_stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
- spin_lock_init(&cpu_stats->lock);
- }
- }
return flow;
err:
kmem_cache_free(flow_cache, flow);
static void flow_free(struct sw_flow *flow)
{
- kfree((struct sf_flow_acts __force *)flow->sf_acts);
- if (flow->stats.is_percpu)
- free_percpu(flow->stats.cpu_stats);
- else
- kfree(flow->stats.stat);
+ int node;
+
+ kfree((struct sw_flow_actions __force *)flow->sf_acts);
+ for_each_node(node)
+ if (flow->stats[node])
+ kmem_cache_free(flow_stats_cache,
+ (struct flow_stats __force *)flow->stats[node]);
kmem_cache_free(flow_cache, flow);
}
if (!flow)
return;
- ASSERT_OVSL();
-
- if (flow->mask) {
- struct sw_flow_mask *mask = flow->mask;
-
- BUG_ON(!mask->ref_count);
- mask->ref_count--;
-
- if (!mask->ref_count) {
- list_del_rcu(&mask->list);
- if (deferred)
- call_rcu(&mask->rcu, rcu_free_sw_flow_mask_cb);
- else
- kfree(mask);
- }
- }
-
if (deferred)
call_rcu(&flow->rcu, rcu_free_flow_callback);
else
{
struct table_instance *ti;
- ti = table_instance_alloc(TBL_MIN_BUCKETS);
+ table->mask_cache = __alloc_percpu(sizeof(struct mask_cache_entry) *
+ MC_HASH_ENTRIES, __alignof__(struct mask_cache_entry));
+ if (!table->mask_cache)
+ return -ENOMEM;
- if (!ti)
+ ti = table_instance_alloc(TBL_MIN_BUCKETS);
+ if (!ti) {
+ free_percpu(table->mask_cache);
return -ENOMEM;
+ }
rcu_assign_pointer(table->ti, ti);
INIT_LIST_HEAD(&table->mask_list);
__table_instance_destroy(ti);
}
-void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred)
+/* No need for locking this function is called from RCU callback or
+ * error path. */
+void ovs_flow_tbl_destroy(struct flow_table *table)
{
- struct table_instance *ti = ovsl_dereference(table->ti);
+ struct table_instance *ti = (struct table_instance __force *)table->ti;
- table_instance_destroy(ti, deferred);
+ free_percpu(table->mask_cache);
+ table_instance_destroy(ti, false);
}
struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
static u32 flow_hash(const struct sw_flow_key *key, int key_start,
int key_end)
{
- u32 *hash_key = (u32 *)((u8 *)key + key_start);
+ const u32 *hash_key = (const u32 *)((const u8 *)key + key_start);
int hash_u32s = (key_end - key_start) >> 2;
/* Make sure number of hash bytes are multiple of u32. */
const struct sw_flow_key *key2,
int key_start, int key_end)
{
- const long *cp1 = (long *)((u8 *)key1 + key_start);
- const long *cp2 = (long *)((u8 *)key2 + key_start);
+ const long *cp1 = (const long *)((const u8 *)key1 + key_start);
+ const long *cp2 = (const long *)((const u8 *)key2 + key_start);
long diffs = 0;
int i;
static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
const struct sw_flow_key *unmasked,
- struct sw_flow_mask *mask)
+ struct sw_flow_mask *mask,
+ u32 *n_mask_hit)
{
struct sw_flow *flow;
struct hlist_head *head;
ovs_flow_mask_key(&masked_key, unmasked, mask);
hash = flow_hash(&masked_key, key_start, key_end);
head = find_bucket(ti, hash);
+ (*n_mask_hit)++;
hlist_for_each_entry_rcu(flow, head, hash_node[ti->node_ver]) {
if (flow->mask == mask && flow->hash == hash &&
flow_cmp_masked_key(flow, &masked_key,
return NULL;
}
-struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
- const struct sw_flow_key *key,
- u32 *n_mask_hit)
+
+static struct sw_flow *flow_lookup(struct flow_table *tbl,
+ struct table_instance *ti,
+ const struct sw_flow_key *key,
+ u32 *n_mask_hit)
{
- struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
struct sw_flow_mask *mask;
struct sw_flow *flow;
- *n_mask_hit = 0;
list_for_each_entry_rcu(mask, &tbl->mask_list, list) {
- (*n_mask_hit)++;
- flow = masked_flow_lookup(ti, key, mask);
+ flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
if (flow) /* Found */
return flow;
}
return NULL;
}
+/*
+ * mask_cache maps flow to probable mask. This cache is not tightly
+ * coupled cache, It means updates to mask list can result in inconsistent
+ * cache entry in mask cache.
+ * This is per cpu cache and is divided in MC_HASH_SEGS segments.
+ * In case of a hash collision the entry is hashed in next segment.
+ * */
+struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
+ const struct sw_flow_key *key,
+ u32 skb_hash,
+ u32 *n_mask_hit)
+{
+ struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
+ struct mask_cache_entry *entries, *ce, *del;
+ struct sw_flow *flow;
+ u32 hash = skb_hash;
+ int seg;
+
+ *n_mask_hit = 0;
+ if (unlikely(!skb_hash))
+ return flow_lookup(tbl, ti, key, n_mask_hit);
+
+ del = NULL;
+ entries = this_cpu_ptr(tbl->mask_cache);
+
+ for (seg = 0; seg < MC_HASH_SEGS; seg++) {
+ int index;
+
+ index = hash & (MC_HASH_ENTRIES - 1);
+ ce = &entries[index];
+
+ if (ce->skb_hash == skb_hash) {
+ struct sw_flow_mask *mask;
+ int i;
+
+ i = 0;
+ list_for_each_entry_rcu(mask, &tbl->mask_list, list) {
+ if (ce->mask_index == i++) {
+ flow = masked_flow_lookup(ti, key, mask,
+ n_mask_hit);
+ if (flow) /* Found */
+ return flow;
+
+ break;
+ }
+ }
+ del = ce;
+ break;
+ }
+
+ if (!del || (del->skb_hash && !ce->skb_hash)) {
+ del = ce;
+ }
+
+ hash >>= MC_HASH_SHIFT;
+ }
+
+ flow = flow_lookup(tbl, ti, key, n_mask_hit);
+
+ if (flow) {
+ del->skb_hash = skb_hash;
+ del->mask_index = (*n_mask_hit - 1);
+ }
+ return flow;
+}
+
struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
const struct sw_flow_key *key)
{
+ struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
u32 __always_unused n_mask_hit;
- return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit);
+ n_mask_hit = 0;
+ return flow_lookup(tbl, ti, key, &n_mask_hit);
}
int ovs_flow_tbl_num_masks(const struct flow_table *table)
return table_instance_rehash(ti, ti->n_buckets * 2);
}
+/* Remove 'mask' from the mask list, if it is not needed any more. */
+static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
+{
+ if (mask) {
+ /* ovs-lock is required to protect mask-refcount and
+ * mask list.
+ */
+ ASSERT_OVSL();
+ BUG_ON(!mask->ref_count);
+ mask->ref_count--;
+
+ if (!mask->ref_count) {
+ list_del_rcu(&mask->list);
+ call_rcu(&mask->rcu, rcu_free_sw_flow_mask_cb);
+ }
+ }
+}
+
+/* Must be called with OVS mutex held. */
void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
{
struct table_instance *ti = ovsl_dereference(table->ti);
BUG_ON(table->count == 0);
hlist_del_rcu(&flow->hash_node[ti->node_ver]);
table->count--;
+
+ /* RCU delete the mask. 'flow->mask' is not NULLed, as it should be
+ * accessible as long as the RCU read lock is held. */
+ flow_mask_remove(table, flow->mask);
}
static struct sw_flow_mask *mask_alloc(void)
static bool mask_equal(const struct sw_flow_mask *a,
const struct sw_flow_mask *b)
{
- u8 *a_ = (u8 *)&a->key + a->range.start;
- u8 *b_ = (u8 *)&b->key + b->range.start;
+ const u8 *a_ = (const u8 *)&a->key + a->range.start;
+ const u8 *b_ = (const u8 *)&b->key + b->range.start;
return (a->range.end == b->range.end)
&& (a->range.start == b->range.start)
return -ENOMEM;
mask->key = new->key;
mask->range = new->range;
- list_add_rcu(&mask->list, &tbl->mask_list);
+ list_add_tail_rcu(&mask->list, &tbl->mask_list);
} else {
BUG_ON(!mask->ref_count);
mask->ref_count++;
return 0;
}
+/* Must be called with OVS mutex held. */
int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
struct sw_flow_mask *mask)
{
BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
- flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
- 0, NULL);
+ flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
+ + (num_possible_nodes()
+ * sizeof(struct flow_stats *)),
+ 0, 0, NULL);
if (flow_cache == NULL)
return -ENOMEM;
+ flow_stats_cache
+ = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (flow_stats_cache == NULL) {
+ kmem_cache_destroy(flow_cache);
+ flow_cache = NULL;
+ return -ENOMEM;
+ }
+
return 0;
}
/* Uninitializes the flow module. */
void ovs_flow_exit(void)
{
+ kmem_cache_destroy(flow_stats_cache);
kmem_cache_destroy(flow_cache);
}