datapath: Use hash table more tolerant of collisions for flow table.

[sliver-openvswitch.git] / datapath / table.c
diff --git a/datapath/table.c b/datapath/table.c

index 11aeb88..23ae8ab 100644 (file)
--- a/datapath/table.c
+++ b/datapath/table.c
@@ -11,50 +11,76 @@
  
  #include <linux/gfp.h>
  #include <linux/jhash.h>
+#include <linux/random.h>
  #include <linux/slab.h>
  #include <linux/vmalloc.h>
  #include <linux/mm.h>
  #include <linux/highmem.h>
  #include <asm/pgtable.h>
  
-static void free_table(struct sw_flow ***flows, unsigned int n_buckets,
-                      int free_flows)
+static inline int bucket_size(int n_flows)
+{
+       return sizeof(struct dp_bucket) + sizeof(struct sw_flow*) * n_flows;
+}
+
+static struct dp_bucket *dp_bucket_alloc(int n_flows)
+{
+       return kmalloc(bucket_size(n_flows), GFP_KERNEL);
+}
+
+static void free_buckets(struct dp_bucket ***l1, unsigned int n_buckets,
+                        int free_flows)
  {
         unsigned int i;
  
         for (i = 0; i < n_buckets >> DP_L1_BITS; i++) {
-               struct sw_flow **l2 = flows[i];
-               if (free_flows) {
-                       unsigned int j;
-                       for (j = 0; j < DP_L1_SIZE; j++) {
-                               if (l2[j])
-                                       flow_free(l2[j]);
+               struct dp_bucket **l2 = l1[i];
+               unsigned int j;
+
+               for (j = 0; j < DP_L1_SIZE; j++) {
+                       struct dp_bucket *bucket = l2[j];
+                       if (!bucket)
+                               continue;
+
+                       if (free_flows) {
+                               unsigned int k;
+                               for (k = 0; k < bucket->n_flows; k++)
+                                       flow_free(bucket->flows[k]);
                         }
+                       kfree(bucket);
                 }
                 free_page((unsigned long)l2);
         }
-       kfree(flows);
+       kfree(l1);
  }
  
-static struct sw_flow ***alloc_table(unsigned int n_buckets)
+static struct dp_bucket ***alloc_buckets(unsigned int n_buckets)
  {
-       struct sw_flow ***flows;
+       struct dp_bucket ***l1;
         unsigned int i;
  
-       flows = kmalloc((n_buckets >> DP_L1_BITS) * sizeof(struct sw_flow**),
-                       GFP_KERNEL);
-       if (!flows)
+       l1 = kmalloc((n_buckets >> DP_L1_BITS) * sizeof(struct dp_bucket**),
+                    GFP_KERNEL);
+       if (!l1)
                 return NULL;
         for (i = 0; i < n_buckets >> DP_L1_BITS; i++) {
-               flows[i] = (struct sw_flow **)get_zeroed_page(GFP_KERNEL);
-               if (!flows[i]) {
-                       free_table(flows, i << DP_L1_BITS, 0);
+               l1[i] = (struct dp_bucket **)get_zeroed_page(GFP_KERNEL);
+               if (!l1[i]) {
+                       free_buckets(l1, i << DP_L1_BITS, 0);
                         return NULL;
                 }
         }
-       return flows;
+       return l1;
  }
  
+/**
+ * dp_table_create - create and return a new flow table
+ * @n_buckets: number of buckets in the new table
+ *
+ * Creates and returns a new flow table, or %NULL if memory cannot be
+ * allocated.  @n_buckets must be a power of 2 in the range %DP_L1_SIZE to
+ * %DP_MAX_BUCKETS.
+ */
  struct dp_table *dp_table_create(unsigned int n_buckets)
  {
         struct dp_table *table;
@@ -64,95 +90,124 @@ struct dp_table *dp_table_create(unsigned int n_buckets)
                 goto err;
  
         table->n_buckets = n_buckets;
-       table->flows[0] = alloc_table(n_buckets);
-       if (!table[0].flows)
-               goto err_free_tables;
-
-       table->flows[1] = alloc_table(n_buckets);
-       if (!table->flows[1])
-               goto err_free_flows0;
+       table->buckets = alloc_buckets(n_buckets);
+       if (!table->buckets)
+               goto err_free_table;
+       get_random_bytes(&table->hash_seed, sizeof table->hash_seed);
  
         return table;
  
-err_free_flows0:
-       free_table(table->flows[0], table->n_buckets, 0);
-err_free_tables:
+err_free_table:
         kfree(table);
  err:
         return NULL;
  }
  
+/**
+ * dp_table_destroy - destroy flow table and optionally the flows it contains
+ * @table: table to destroy (must not be %NULL)
+ * @free_flows: whether to destroy the flows
+ *
+ * If @free_flows is zero, then the buckets in @table are destroyed but not the
+ * flows within those buckets.  This behavior is useful when a table is being
+ * replaced by a larger or smaller one without destroying the flows.
+ *
+ * If @free_flows is nonzero, then the flows in @table are destroyed as well as
+ * the buckets.
+ */
  void dp_table_destroy(struct dp_table *table, int free_flows)
  {
-       int i;
-       for (i = 0; i < 2; i++)
-               free_table(table->flows[i], table->n_buckets, free_flows);
+       free_buckets(table->buckets, table->n_buckets, free_flows);
         kfree(table);
  }
  
-static struct sw_flow **find_bucket(struct dp_table *table,
-                                   struct sw_flow ***flows, u32 hash)
+static struct dp_bucket **find_bucket(struct dp_table *table, u32 hash)
  {
         unsigned int l1 = (hash & (table->n_buckets - 1)) >> DP_L1_SHIFT;
         unsigned int l2 = hash & ((1 << DP_L2_BITS) - 1);
-       return &flows[l1][l2];
+       return &table->buckets[l1][l2];
  }
  
-static struct sw_flow *lookup_table(struct dp_table *table,
-                                   struct sw_flow ***flows, u32 hash,
-                                   const struct odp_flow_key *key)
+static int search_bucket(const struct dp_bucket *bucket, const struct odp_flow_key *key)
  {
-       struct sw_flow **bucket = find_bucket(table, flows, hash);
-       struct sw_flow *flow = rcu_dereference(*bucket);
-       if (flow && !memcmp(&flow->key, key, sizeof(struct odp_flow_key)))
-               return flow;
-       return NULL;
-}
+       int i;
  
-static u32 flow_hash0(const struct odp_flow_key *key)
-{
-       return jhash2((u32*)key, sizeof *key / sizeof(u32), 0xaaaaaaaa);
+       for (i = 0; i < bucket->n_flows; i++) {
+               struct sw_flow *flow = rcu_dereference(bucket->flows[i]);
+               if (!memcmp(&flow->key, key, sizeof(struct odp_flow_key)))
+                       return i;
+       }
+
+       return -1;
  }
  
-static u32 flow_hash1(const struct odp_flow_key *key)
+static struct sw_flow *lookup_flow(struct dp_table *table, u32 hash,
+                                  const struct odp_flow_key *key)
  {
-       return jhash2((u32*)key, sizeof *key / sizeof(u32), 0x55555555);
+       struct dp_bucket **bucketp = find_bucket(table, hash);
+       struct dp_bucket *bucket = rcu_dereference(*bucketp);
+       int index;
+
+       if (!bucket)
+               return NULL;
+
+       index = search_bucket(bucket, key);
+       if (index < 0)
+               return NULL;
+
+       return bucket->flows[index];
  }
  
-static void find_buckets(struct dp_table *table,
-                        const struct odp_flow_key *key,
-                        struct sw_flow **buckets[2])
+static u32 flow_hash(const struct dp_table *table,
+                    const struct odp_flow_key *key)
  {
-       buckets[0] = find_bucket(table, table->flows[0], flow_hash0(key));
-       buckets[1] = find_bucket(table, table->flows[1], flow_hash1(key));
+       return jhash2((u32*)key, sizeof *key / sizeof(u32), table->hash_seed);
  }
  
+/**
+ * dp_table_lookup - searches flow table for a matching flow
+ * @table: flow table to search
+ * @key: flow key for which to search
+ *
+ * Searches @table for a flow whose key is equal to @key.  Returns the flow if
+ * successful, otherwise %NULL.
+ */
  struct sw_flow *dp_table_lookup(struct dp_table *table,
                                 const struct odp_flow_key *key)
  {
-       struct sw_flow *flow;
-       flow = lookup_table(table, table->flows[0], flow_hash0(key), key);
-       if (!flow)
-               flow = lookup_table(table, table->flows[1],
-                                   flow_hash1(key), key);
-       return flow;
+       return lookup_flow(table, flow_hash(table, key), key);
  }
  
+/**
+ * dp_table_foreach - iterate through flow table
+ * @table: table to iterate
+ * @callback: function to call for each flow entry
+ * @aux: Extra data to pass to @callback
+ *
+ * Iterates through all of the flows in @table in hash order, passing each of
+ * them in turn to @callback.  If @callback returns nonzero, this terminates
+ * the iteration and dp_table_foreach() returns the same value.  Returns 0 if
+ * @callback never returns nonzero.
+ *
+ * This function does not try to intelligently handle the case where @callback
+ * adds or removes flows in @table.
+ */
  int dp_table_foreach(struct dp_table *table,
                      int (*callback)(struct sw_flow *flow, void *aux),
                      void *aux)
  {
         unsigned int i, j, k;
-       for (i = 0; i < 2; i++) {
-               for (j = 0; j < table->n_buckets >> DP_L1_BITS; j++) {
-                       struct sw_flow **l2 = table->flows[i][j];
-                       for (k = 0; k < DP_L1_SIZE; k++) {
-                               struct sw_flow *flow = rcu_dereference(l2[k]);
-                               if (flow) {
-                                       int error = callback(flow, aux);
-                                       if (error)
-                                               return error;
-                               }
+       for (i = 0; i < table->n_buckets >> DP_L1_BITS; i++) {
+               struct dp_bucket **l2 = table->buckets[i];
+               for (j = 0; j < DP_L1_SIZE; j++) {
+                       struct dp_bucket *bucket = rcu_dereference(l2[j]);
+                       if (!bucket)
+                               continue;
+
+                       for (k = 0; k < bucket->n_flows; k++) {
+                               int error = (*callback)(bucket->flows[k], aux);
+                               if (error)
+                                       return error;
                         }
                 }
         }
@@ -162,18 +217,7 @@ int dp_table_foreach(struct dp_table *table,
  static int insert_flow(struct sw_flow *flow, void *new_table_)
  {
         struct dp_table *new_table = new_table_;
-       struct sw_flow **buckets[2];
-       int i;
-
-       find_buckets(new_table, &flow->key, buckets);
-       for (i = 0; i < 2; i++) {
-               if (!*buckets[i]) {
-                       rcu_assign_pointer(*buckets[i], flow);
-                       return 0;
-               }
-       }
-       WARN_ON_ONCE(1);
-       return 0;
+       return dp_table_insert(new_table, flow);
  }
  
  static void dp_free_table_rcu(struct rcu_head *rcu)
@@ -182,16 +226,34 @@ static void dp_free_table_rcu(struct rcu_head *rcu)
         dp_table_destroy(table, 0);
  }
  
+/**
+ * dp_table_expand - replace datapath's flow table by one with more buckets
+ * @dp: datapath to expand
+ *
+ * Replaces @dp's flow table by one that has twice as many buckets.  All of the
+ * flows in @dp's flow table are moved to the new flow table.  Returns 0 if
+ * successful, otherwise a negative error.
+ */
  int dp_table_expand(struct datapath *dp)
  {
         struct dp_table *old_table = rcu_dereference(dp->table);
-       struct dp_table *new_table = dp_table_create(old_table->n_buckets * 2);
+       struct dp_table *new_table;
+
+       new_table = dp_table_create(old_table->n_buckets * 2);
         if (!new_table)
-               return -ENOMEM;
-       dp_table_foreach(old_table, insert_flow, new_table);
+               goto error;
+
+       if (dp_table_foreach(old_table, insert_flow, new_table))
+               goto error_free_new_table;
+
         rcu_assign_pointer(dp->table, new_table);
         call_rcu(&old_table->rcu, dp_free_table_rcu);
         return 0;
+
+error_free_new_table:
+       dp_table_destroy(new_table, 0);
+error:
+       return -ENOMEM;
  }
  
  static void dp_free_table_and_flows_rcu(struct rcu_head *rcu)
@@ -200,6 +262,13 @@ static void dp_free_table_and_flows_rcu(struct rcu_head *rcu)
         dp_table_destroy(table, 1);
  }
  
+/**
+ * dp_table_flush - clear datapath's flow table
+ * @dp: datapath to clear
+ *
+ * Replaces @dp's flow table by an empty flow table, destroying all the flows
+ * in the old table (after a suitable RCU grace period).
+ */
  int dp_table_flush(struct datapath *dp)
  {
         struct dp_table *old_table = rcu_dereference(dp->table);
@@ -211,38 +280,88 @@ int dp_table_flush(struct datapath *dp)
         return 0;
  }
  
-struct sw_flow **
-dp_table_lookup_for_insert(struct dp_table *table,
-                          const struct odp_flow_key *target)
+static void dp_free_bucket_rcu(struct rcu_head *rcu)
  {
-       struct sw_flow **buckets[2];
-       struct sw_flow **empty_bucket = NULL;
-       int i;
+       struct dp_bucket *bucket = container_of(rcu, struct dp_bucket, rcu);
+       kfree(bucket);
+}
  
-       find_buckets(table, target, buckets);
-       for (i = 0; i < 2; i++) {
-               struct sw_flow *f = rcu_dereference(*buckets[i]);
-               if (f) {
-                       if (!memcmp(&f->key, target, sizeof(struct odp_flow_key)))
-                               return buckets[i];
-               } else if (!empty_bucket)
-                       empty_bucket = buckets[i];
-       }
-       return empty_bucket;
+/**
+ * dp_table_insert - insert flow into table
+ * @table: table in which to insert flow
+ * @target: flow to insert
+ *
+ * The caller must ensure that no flow with key identical to @target->key
+ * already exists in @table.  Returns 0 or a negative error (currently just
+ * -ENOMEM).
+ *
+ * The caller is responsible for updating &struct datapath's n_flows member.
+ */
+int dp_table_insert(struct dp_table *table, struct sw_flow *target)
+{
+       u32 hash = flow_hash(table, &target->key);
+       struct dp_bucket **oldp = find_bucket(table, hash);
+       struct dp_bucket *old = *rcu_dereference(oldp);
+       unsigned int n = old ? old->n_flows : 0;
+       struct dp_bucket *new = dp_bucket_alloc(n + 1);
+
+       if (!new)
+               return -ENOMEM;
+
+       new->n_flows = n + 1;
+       if (old)
+               memcpy(new->flows, old->flows, n * sizeof(struct sw_flow*));
+       new->flows[n] = target;
+
+       rcu_assign_pointer(*oldp, new);
+       if (old)
+               call_rcu(&old->rcu, dp_free_bucket_rcu);
+
+       return 0;
  }
  
+/**
+ * dp_table_delete - remove flow from table
+ * @table: table from which to remove flow
+ * @target: flow to remove
+ *
+ * The caller must ensure that @target itself is in @table.  (It is not
+ * good enough for @table to contain a different flow with a key equal to
+ * @target's key.)
+ *
+ * Returns 0 or a negative error (currently just -ENOMEM).  Yes, it *is*
+ * possible for a flow deletion to fail due to lack of memory.
+ *
+ * The caller is responsible for updating &struct datapath's n_flows member.
+ */
  int dp_table_delete(struct dp_table *table, struct sw_flow *target)
  {
-       struct sw_flow **buckets[2];
-       int i;
+       u32 hash = flow_hash(table, &target->key);
+       struct dp_bucket **oldp = find_bucket(table, hash);
+       struct dp_bucket *old = *rcu_dereference(oldp);
+       unsigned int n = old->n_flows;
+       struct dp_bucket *new;
+
+       if (n > 1) {
+               unsigned int i;
  
-       find_buckets(table, &target->key, buckets);
-       for (i = 0; i < 2; i++) {
-               struct sw_flow *flow = rcu_dereference(*buckets[i]);
-               if (flow == target) {
-                       rcu_assign_pointer(*buckets[i], NULL);
-                       return 0;
+               new = dp_bucket_alloc(n - 1);
+               if (!new)
+                       return -ENOMEM;
+
+               new->n_flows = 0;
+               for (i = 0; i < n; i++) {
+                       struct sw_flow *flow = old->flows[i];
+                       if (flow != target)
+                               new->flows[new->n_flows++] = flow;
                 }
+               WARN_ON_ONCE(new->n_flows != n - 1);
+       } else {
+               new = NULL;
         }
-       return -ENOENT;
+
+       rcu_assign_pointer(*oldp, new);
+       call_rcu(&old->rcu, dp_free_bucket_rcu);
+
+       return 0;
  }