Merge citrix branch into master.

author Ben Pfaff <blp@nicira.com>

Wed, 2 Sep 2009 17:14:53 +0000 (10:14 -0700)

committer Ben Pfaff <blp@nicira.com>

Wed, 2 Sep 2009 17:14:53 +0000 (10:14 -0700)
author Ben Pfaff <blp@nicira.com>
Wed, 2 Sep 2009 17:14:53 +0000 (10:14 -0700)
committer Ben Pfaff <blp@nicira.com>
Wed, 2 Sep 2009 17:14:53 +0000 (10:14 -0700)
diff --git a/datapath/datapath.c b/datapath/datapath.c

index d822b73..6f96ee4 100644 (file)
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -839,7 +839,7 @@ static void clear_stats(struct sw_flow *flow)
  static int put_flow(struct datapath *dp, struct odp_flow_put __user *ufp)
  {
         struct odp_flow_put uf;
-       struct sw_flow *flow, **bucket;
+       struct sw_flow *flow;
         struct dp_table *table;
         struct odp_flow_stats stats;
         int error;
@@ -849,15 +849,10 @@ static int put_flow(struct datapath *dp, struct odp_flow_put __user *ufp)
                 goto error;
         uf.flow.key.reserved = 0;
  
-retry:
         table = rcu_dereference(dp->table);
-       bucket = dp_table_lookup_for_insert(table, &uf.flow.key);
-       if (!bucket) {
-               /* No such flow, and the slots where it could go are full. */
-               error = uf.flags & ODPPF_CREATE ? -EFBIG : -ENOENT;
-               goto error;
-       } else if (!*bucket) {
-               /* No such flow, but we found an available slot for it. */
+       flow = dp_table_lookup(table, &uf.flow.key);
+       if (!flow) {
+               /* No such flow. */
                 struct sw_flow_actions *acts;
  
                 error = -ENOENT;
@@ -865,14 +860,15 @@ retry:
                         goto error;
  
                 /* Expand table, if necessary, to make room. */
-               if (dp->n_flows * 4 >= table->n_buckets &&
-                   table->n_buckets < DP_MAX_BUCKETS) {
+               if (dp->n_flows >= table->n_buckets) {
+                       error = -ENOSPC;
+                       if (table->n_buckets >= DP_MAX_BUCKETS)
+                               goto error;
+
                         error = dp_table_expand(dp);
                         if (error)
                                 goto error;
-
-                       /* The bucket's location has changed.  Try again. */
-                       goto retry;
+                       table = rcu_dereference(dp->table);
                 }
  
                 /* Allocate flow. */
@@ -892,12 +888,13 @@ retry:
                 rcu_assign_pointer(flow->sf_acts, acts);
  
                 /* Put flow in bucket. */
-               rcu_assign_pointer(*bucket, flow);
+               error = dp_table_insert(table, flow);
+               if (error)
+                       goto error_free_flow_acts;
                 dp->n_flows++;
                 memset(&stats, 0, sizeof(struct odp_flow_stats));
         } else {
                 /* We found a matching flow. */
-               struct sw_flow *flow = *rcu_dereference(bucket);
                 struct sw_flow_actions *old_acts, *new_acts;
                 unsigned long int flags;
  
@@ -935,6 +932,8 @@ retry:
                 return -EFAULT;
         return 0;
  
+error_free_flow_acts:
+       kfree(flow->sf_acts);
  error_free_flow:
         kmem_cache_free(flow_cache, flow);
  error:
@@ -1167,8 +1166,8 @@ static int get_dp_stats(struct datapath *dp, struct odp_stats __user *statsp)
         int i;
  
         stats.n_flows = dp->n_flows;
-       stats.cur_capacity = rcu_dereference(dp->table)->n_buckets * 2;
-       stats.max_capacity = DP_MAX_BUCKETS * 2;
+       stats.cur_capacity = rcu_dereference(dp->table)->n_buckets;
+       stats.max_capacity = DP_MAX_BUCKETS;
         stats.n_ports = dp->n_ports;
         stats.max_ports = DP_MAX_PORTS;
         stats.max_groups = DP_MAX_GROUPS;
diff --git a/datapath/datapath.h b/datapath/datapath.h

index b520084..122706a 100644 (file)
--- a/datapath/datapath.h
+++ b/datapath/datapath.h
@@ -29,20 +29,54 @@
  #define DP_MAX_PORTS 256
  #define DP_MAX_GROUPS 16
  
-#define DP_L2_BITS (PAGE_SHIFT - ilog2(sizeof(struct sw_flow*)))
+#define DP_L2_BITS (PAGE_SHIFT - ilog2(sizeof(struct dp_bucket*)))
  #define DP_L2_SIZE (1 << DP_L2_BITS)
  #define DP_L2_SHIFT 0
  
-#define DP_L1_BITS (PAGE_SHIFT - ilog2(sizeof(struct sw_flow**)))
+#define DP_L1_BITS (PAGE_SHIFT - ilog2(sizeof(struct dp_bucket**)))
  #define DP_L1_SIZE (1 << DP_L1_BITS)
  #define DP_L1_SHIFT DP_L2_BITS
  
+/* For 4 kB pages, this is 1,048,576 on 32-bit or 262,144 on 64-bit. */
  #define DP_MAX_BUCKETS (DP_L1_SIZE * DP_L2_SIZE)
  
+/**
+ * struct dp_table - flow table
+ * @n_buckets: number of buckets (a power of 2 between %DP_L1_SIZE and
+ * %DP_MAX_BUCKETS)
+ * @buckets: pointer to @n_buckets/%DP_L1_SIZE pointers to %DP_L1_SIZE pointers
+ * to buckets
+ * @hash_seed: random number used for flow hashing, to make the hash
+ * distribution harder to predict
+ * @rcu: RCU callback structure
+ *
+ * The @buckets array is logically an array of pointers to buckets.  It is
+ * broken into two levels to avoid the need to kmalloc() any object larger than
+ * a single page or to use vmalloc().  @buckets is always nonnull, as is each
+ * @buckets[i], but each @buckets[i][j] is nonnull only if the specified hash
+ * bucket is nonempty (for 0 <= i < @n_buckets/%DP_L1_SIZE, 0 <= j <
+ * %DP_L1_SIZE).
+ */
  struct dp_table {
         unsigned int n_buckets;
-       struct sw_flow ***flows[2];
+       struct dp_bucket ***buckets;
+       unsigned int hash_seed;
+       struct rcu_head rcu;
+};
+
+/**
+ * struct dp_bucket - single bucket within datapath flow table
+ * @rcu: RCU callback structure
+ * @n_flows: number of flows in @flows[] array
+ * @flows: array of @n_flows pointers to flows
+ *
+ * The expected number of flows per bucket is 1, but this allows for an
+ * arbitrary number of collisions.
+ */
+struct dp_bucket {
         struct rcu_head rcu;
+       unsigned int n_flows;
+       struct sw_flow *flows[];
  };
  
  #define DP_N_QUEUES 2
@@ -105,7 +139,7 @@ extern int (*dp_ioctl_hook)(struct net_device *dev, struct ifreq *rq, int cmd);
  struct dp_table *dp_table_create(unsigned int n_buckets);
  void dp_table_destroy(struct dp_table *, int free_flows);
  struct sw_flow *dp_table_lookup(struct dp_table *, const struct odp_flow_key *);
-struct sw_flow **dp_table_lookup_for_insert(struct dp_table *, const struct odp_flow_key *);
+int dp_table_insert(struct dp_table *, struct sw_flow *);
  int dp_table_delete(struct dp_table *, struct sw_flow *);
  int dp_table_expand(struct datapath *);
  int dp_table_flush(struct datapath *);
diff --git a/datapath/flow.c b/datapath/flow.c

index 2ac79e7..ae60617 100644 (file)
--- a/datapath/flow.c
+++ b/datapath/flow.c
@@ -18,6 +18,7 @@
  #include <linux/module.h>
  #include <linux/in.h>
  #include <linux/rcupdate.h>
+#include <linux/if_arp.h>
  #include <linux/if_ether.h>
  #include <linux/ip.h>
  #include <linux/tcp.h>
@@ -29,6 +30,27 @@
  
  struct kmem_cache *flow_cache;
  
+struct arp_eth_header
+{
+       __be16      ar_hrd;     /* format of hardware address   */
+       __be16      ar_pro;     /* format of protocol address   */
+       unsigned char   ar_hln; /* length of hardware address   */
+       unsigned char   ar_pln; /* length of protocol address   */
+       __be16      ar_op;      /* ARP opcode (command)     */
+
+       /* Ethernet+IPv4 specific members. */
+       unsigned char       ar_sha[ETH_ALEN];   /* sender hardware address  */
+       unsigned char       ar_sip[4];          /* sender IP address        */
+       unsigned char       ar_tha[ETH_ALEN];   /* target hardware address  */
+       unsigned char       ar_tip[4];          /* target IP address        */
+} __attribute__((packed));
+
+static inline int arphdr_ok(struct sk_buff *skb)
+{
+       int nh_ofs = skb_network_offset(skb);
+       return pskb_may_pull(skb, nh_ofs + sizeof(struct arp_eth_header));
+}
+
  static inline int iphdr_ok(struct sk_buff *skb)
  {
         int nh_ofs = skb_network_offset(skb);
@@ -266,6 +288,27 @@ int flow_extract(struct sk_buff *skb, u16 in_port, struct odp_flow_key *key)
                 } else {
                         retval = 1;
                 }
+       } else if (key->dl_type == htons(ETH_P_ARP) && arphdr_ok(skb)) {
+               struct arp_eth_header *arp;
+
+               arp = (struct arp_eth_header *)skb_network_header(skb);
+
+        if (arp->ar_hrd == htons(1)
+                && arp->ar_pro == htons(ETH_P_IP)
+                && arp->ar_hln == ETH_ALEN
+                && arp->ar_pln == 4) {
+
+            /* We only match on the lower 8 bits of the opcode. */
+            if (ntohs(arp->ar_op) <= 0xff) {
+                key->nw_proto = ntohs(arp->ar_op);
+            }
+
+            if (key->nw_proto == ARPOP_REQUEST 
+                    || key->nw_proto == ARPOP_REPLY) {
+                memcpy(&key->nw_src, arp->ar_sip, sizeof(key->nw_src));
+                memcpy(&key->nw_dst, arp->ar_tip, sizeof(key->nw_dst));
+            }
+        }
         } else {
                 skb_reset_transport_header(skb);
         }
diff --git a/datapath/table.c b/datapath/table.c

index 11aeb88..23ae8ab 100644 (file)
--- a/datapath/table.c
+++ b/datapath/table.c
@@ -11,50 +11,76 @@
  
  #include <linux/gfp.h>
  #include <linux/jhash.h>
+#include <linux/random.h>
  #include <linux/slab.h>
  #include <linux/vmalloc.h>
  #include <linux/mm.h>
  #include <linux/highmem.h>
  #include <asm/pgtable.h>
  
-static void free_table(struct sw_flow ***flows, unsigned int n_buckets,
-                      int free_flows)
+static inline int bucket_size(int n_flows)
+{
+       return sizeof(struct dp_bucket) + sizeof(struct sw_flow*) * n_flows;
+}
+
+static struct dp_bucket *dp_bucket_alloc(int n_flows)
+{
+       return kmalloc(bucket_size(n_flows), GFP_KERNEL);
+}
+
+static void free_buckets(struct dp_bucket ***l1, unsigned int n_buckets,
+                        int free_flows)
  {
         unsigned int i;
  
         for (i = 0; i < n_buckets >> DP_L1_BITS; i++) {
-               struct sw_flow **l2 = flows[i];
-               if (free_flows) {
-                       unsigned int j;
-                       for (j = 0; j < DP_L1_SIZE; j++) {
-                               if (l2[j])
-                                       flow_free(l2[j]);
+               struct dp_bucket **l2 = l1[i];
+               unsigned int j;
+
+               for (j = 0; j < DP_L1_SIZE; j++) {
+                       struct dp_bucket *bucket = l2[j];
+                       if (!bucket)
+                               continue;
+
+                       if (free_flows) {
+                               unsigned int k;
+                               for (k = 0; k < bucket->n_flows; k++)
+                                       flow_free(bucket->flows[k]);
                         }
+                       kfree(bucket);
                 }
                 free_page((unsigned long)l2);
         }
-       kfree(flows);
+       kfree(l1);
  }
  
-static struct sw_flow ***alloc_table(unsigned int n_buckets)
+static struct dp_bucket ***alloc_buckets(unsigned int n_buckets)
  {
-       struct sw_flow ***flows;
+       struct dp_bucket ***l1;
         unsigned int i;
  
-       flows = kmalloc((n_buckets >> DP_L1_BITS) * sizeof(struct sw_flow**),
-                       GFP_KERNEL);
-       if (!flows)
+       l1 = kmalloc((n_buckets >> DP_L1_BITS) * sizeof(struct dp_bucket**),
+                    GFP_KERNEL);
+       if (!l1)
                 return NULL;
         for (i = 0; i < n_buckets >> DP_L1_BITS; i++) {
-               flows[i] = (struct sw_flow **)get_zeroed_page(GFP_KERNEL);
-               if (!flows[i]) {
-                       free_table(flows, i << DP_L1_BITS, 0);
+               l1[i] = (struct dp_bucket **)get_zeroed_page(GFP_KERNEL);
+               if (!l1[i]) {
+                       free_buckets(l1, i << DP_L1_BITS, 0);
                         return NULL;
                 }
         }
-       return flows;
+       return l1;
  }
  
+/**
+ * dp_table_create - create and return a new flow table
+ * @n_buckets: number of buckets in the new table
+ *
+ * Creates and returns a new flow table, or %NULL if memory cannot be
+ * allocated.  @n_buckets must be a power of 2 in the range %DP_L1_SIZE to
+ * %DP_MAX_BUCKETS.
+ */
  struct dp_table *dp_table_create(unsigned int n_buckets)
  {
         struct dp_table *table;
@@ -64,95 +90,124 @@ struct dp_table *dp_table_create(unsigned int n_buckets)
                 goto err;
  
         table->n_buckets = n_buckets;
-       table->flows[0] = alloc_table(n_buckets);
-       if (!table[0].flows)
-               goto err_free_tables;
-
-       table->flows[1] = alloc_table(n_buckets);
-       if (!table->flows[1])
-               goto err_free_flows0;
+       table->buckets = alloc_buckets(n_buckets);
+       if (!table->buckets)
+               goto err_free_table;
+       get_random_bytes(&table->hash_seed, sizeof table->hash_seed);
  
         return table;
  
-err_free_flows0:
-       free_table(table->flows[0], table->n_buckets, 0);
-err_free_tables:
+err_free_table:
         kfree(table);
  err:
         return NULL;
  }
  
+/**
+ * dp_table_destroy - destroy flow table and optionally the flows it contains
+ * @table: table to destroy (must not be %NULL)
+ * @free_flows: whether to destroy the flows
+ *
+ * If @free_flows is zero, then the buckets in @table are destroyed but not the
+ * flows within those buckets.  This behavior is useful when a table is being
+ * replaced by a larger or smaller one without destroying the flows.
+ *
+ * If @free_flows is nonzero, then the flows in @table are destroyed as well as
+ * the buckets.
+ */
  void dp_table_destroy(struct dp_table *table, int free_flows)
  {
-       int i;
-       for (i = 0; i < 2; i++)
-               free_table(table->flows[i], table->n_buckets, free_flows);
+       free_buckets(table->buckets, table->n_buckets, free_flows);
         kfree(table);
  }
  
-static struct sw_flow **find_bucket(struct dp_table *table,
-                                   struct sw_flow ***flows, u32 hash)
+static struct dp_bucket **find_bucket(struct dp_table *table, u32 hash)
  {
         unsigned int l1 = (hash & (table->n_buckets - 1)) >> DP_L1_SHIFT;
         unsigned int l2 = hash & ((1 << DP_L2_BITS) - 1);
-       return &flows[l1][l2];
+       return &table->buckets[l1][l2];
  }
  
-static struct sw_flow *lookup_table(struct dp_table *table,
-                                   struct sw_flow ***flows, u32 hash,
-                                   const struct odp_flow_key *key)
+static int search_bucket(const struct dp_bucket *bucket, const struct odp_flow_key *key)
  {
-       struct sw_flow **bucket = find_bucket(table, flows, hash);
-       struct sw_flow *flow = rcu_dereference(*bucket);
-       if (flow && !memcmp(&flow->key, key, sizeof(struct odp_flow_key)))
-               return flow;
-       return NULL;
-}
+       int i;
  
-static u32 flow_hash0(const struct odp_flow_key *key)
-{
-       return jhash2((u32*)key, sizeof *key / sizeof(u32), 0xaaaaaaaa);
+       for (i = 0; i < bucket->n_flows; i++) {
+               struct sw_flow *flow = rcu_dereference(bucket->flows[i]);
+               if (!memcmp(&flow->key, key, sizeof(struct odp_flow_key)))
+                       return i;
+       }
+
+       return -1;
  }
  
-static u32 flow_hash1(const struct odp_flow_key *key)
+static struct sw_flow *lookup_flow(struct dp_table *table, u32 hash,
+                                  const struct odp_flow_key *key)
  {
-       return jhash2((u32*)key, sizeof *key / sizeof(u32), 0x55555555);
+       struct dp_bucket **bucketp = find_bucket(table, hash);
+       struct dp_bucket *bucket = rcu_dereference(*bucketp);
+       int index;
+
+       if (!bucket)
+               return NULL;
+
+       index = search_bucket(bucket, key);
+       if (index < 0)
+               return NULL;
+
+       return bucket->flows[index];
  }
  
-static void find_buckets(struct dp_table *table,
-                        const struct odp_flow_key *key,
-                        struct sw_flow **buckets[2])
+static u32 flow_hash(const struct dp_table *table,
+                    const struct odp_flow_key *key)
  {
-       buckets[0] = find_bucket(table, table->flows[0], flow_hash0(key));
-       buckets[1] = find_bucket(table, table->flows[1], flow_hash1(key));
+       return jhash2((u32*)key, sizeof *key / sizeof(u32), table->hash_seed);
  }
  
+/**
+ * dp_table_lookup - searches flow table for a matching flow
+ * @table: flow table to search
+ * @key: flow key for which to search
+ *
+ * Searches @table for a flow whose key is equal to @key.  Returns the flow if
+ * successful, otherwise %NULL.
+ */
  struct sw_flow *dp_table_lookup(struct dp_table *table,
                                 const struct odp_flow_key *key)
  {
-       struct sw_flow *flow;
-       flow = lookup_table(table, table->flows[0], flow_hash0(key), key);
-       if (!flow)
-               flow = lookup_table(table, table->flows[1],
-                                   flow_hash1(key), key);
-       return flow;
+       return lookup_flow(table, flow_hash(table, key), key);
  }
  
+/**
+ * dp_table_foreach - iterate through flow table
+ * @table: table to iterate
+ * @callback: function to call for each flow entry
+ * @aux: Extra data to pass to @callback
+ *
+ * Iterates through all of the flows in @table in hash order, passing each of
+ * them in turn to @callback.  If @callback returns nonzero, this terminates
+ * the iteration and dp_table_foreach() returns the same value.  Returns 0 if
+ * @callback never returns nonzero.
+ *
+ * This function does not try to intelligently handle the case where @callback
+ * adds or removes flows in @table.
+ */
  int dp_table_foreach(struct dp_table *table,
                      int (*callback)(struct sw_flow *flow, void *aux),
                      void *aux)
  {
         unsigned int i, j, k;
-       for (i = 0; i < 2; i++) {
-               for (j = 0; j < table->n_buckets >> DP_L1_BITS; j++) {
-                       struct sw_flow **l2 = table->flows[i][j];
-                       for (k = 0; k < DP_L1_SIZE; k++) {
-                               struct sw_flow *flow = rcu_dereference(l2[k]);
-                               if (flow) {
-                                       int error = callback(flow, aux);
-                                       if (error)
-                                               return error;
-                               }
+       for (i = 0; i < table->n_buckets >> DP_L1_BITS; i++) {
+               struct dp_bucket **l2 = table->buckets[i];
+               for (j = 0; j < DP_L1_SIZE; j++) {
+                       struct dp_bucket *bucket = rcu_dereference(l2[j]);
+                       if (!bucket)
+                               continue;
+
+                       for (k = 0; k < bucket->n_flows; k++) {
+                               int error = (*callback)(bucket->flows[k], aux);
+                               if (error)
+                                       return error;
                         }
                 }
         }
@@ -162,18 +217,7 @@ int dp_table_foreach(struct dp_table *table,
  static int insert_flow(struct sw_flow *flow, void *new_table_)
  {
         struct dp_table *new_table = new_table_;
-       struct sw_flow **buckets[2];
-       int i;
-
-       find_buckets(new_table, &flow->key, buckets);
-       for (i = 0; i < 2; i++) {
-               if (!*buckets[i]) {
-                       rcu_assign_pointer(*buckets[i], flow);
-                       return 0;
-               }
-       }
-       WARN_ON_ONCE(1);
-       return 0;
+       return dp_table_insert(new_table, flow);
  }
  
  static void dp_free_table_rcu(struct rcu_head *rcu)
@@ -182,16 +226,34 @@ static void dp_free_table_rcu(struct rcu_head *rcu)
         dp_table_destroy(table, 0);
  }
  
+/**
+ * dp_table_expand - replace datapath's flow table by one with more buckets
+ * @dp: datapath to expand
+ *
+ * Replaces @dp's flow table by one that has twice as many buckets.  All of the
+ * flows in @dp's flow table are moved to the new flow table.  Returns 0 if
+ * successful, otherwise a negative error.
+ */
  int dp_table_expand(struct datapath *dp)
  {
         struct dp_table *old_table = rcu_dereference(dp->table);
-       struct dp_table *new_table = dp_table_create(old_table->n_buckets * 2);
+       struct dp_table *new_table;
+
+       new_table = dp_table_create(old_table->n_buckets * 2);
         if (!new_table)
-               return -ENOMEM;
-       dp_table_foreach(old_table, insert_flow, new_table);
+               goto error;
+
+       if (dp_table_foreach(old_table, insert_flow, new_table))
+               goto error_free_new_table;
+
         rcu_assign_pointer(dp->table, new_table);
         call_rcu(&old_table->rcu, dp_free_table_rcu);
         return 0;
+
+error_free_new_table:
+       dp_table_destroy(new_table, 0);
+error:
+       return -ENOMEM;
  }
  
  static void dp_free_table_and_flows_rcu(struct rcu_head *rcu)
@@ -200,6 +262,13 @@ static void dp_free_table_and_flows_rcu(struct rcu_head *rcu)
         dp_table_destroy(table, 1);
  }
  
+/**
+ * dp_table_flush - clear datapath's flow table
+ * @dp: datapath to clear
+ *
+ * Replaces @dp's flow table by an empty flow table, destroying all the flows
+ * in the old table (after a suitable RCU grace period).
+ */
  int dp_table_flush(struct datapath *dp)
  {
         struct dp_table *old_table = rcu_dereference(dp->table);
@@ -211,38 +280,88 @@ int dp_table_flush(struct datapath *dp)
         return 0;
  }
  
-struct sw_flow **
-dp_table_lookup_for_insert(struct dp_table *table,
-                          const struct odp_flow_key *target)
+static void dp_free_bucket_rcu(struct rcu_head *rcu)
  {
-       struct sw_flow **buckets[2];
-       struct sw_flow **empty_bucket = NULL;
-       int i;
+       struct dp_bucket *bucket = container_of(rcu, struct dp_bucket, rcu);
+       kfree(bucket);
+}
  
-       find_buckets(table, target, buckets);
-       for (i = 0; i < 2; i++) {
-               struct sw_flow *f = rcu_dereference(*buckets[i]);
-               if (f) {
-                       if (!memcmp(&f->key, target, sizeof(struct odp_flow_key)))
-                               return buckets[i];
-               } else if (!empty_bucket)
-                       empty_bucket = buckets[i];
-       }
-       return empty_bucket;
+/**
+ * dp_table_insert - insert flow into table
+ * @table: table in which to insert flow
+ * @target: flow to insert
+ *
+ * The caller must ensure that no flow with key identical to @target->key
+ * already exists in @table.  Returns 0 or a negative error (currently just
+ * -ENOMEM).
+ *
+ * The caller is responsible for updating &struct datapath's n_flows member.
+ */
+int dp_table_insert(struct dp_table *table, struct sw_flow *target)
+{
+       u32 hash = flow_hash(table, &target->key);
+       struct dp_bucket **oldp = find_bucket(table, hash);
+       struct dp_bucket *old = rcu_dereference(*oldp);
+       unsigned int n = old ? old->n_flows : 0;
+       struct dp_bucket *new = dp_bucket_alloc(n + 1);
+
+       if (!new)
+               return -ENOMEM;
+
+       new->n_flows = n + 1;
+       if (old)
+               memcpy(new->flows, old->flows, n * sizeof(struct sw_flow*));
+       new->flows[n] = target;
+       /* Publish the new bucket; free the old one after a grace period. */
+       rcu_assign_pointer(*oldp, new);
+       if (old)
+               call_rcu(&old->rcu, dp_free_bucket_rcu);
+
+       return 0;
  }
  
+/**
+ * dp_table_delete - remove flow from table
+ * @table: table from which to remove flow
+ * @target: flow to remove
+ *
+ * The caller must ensure that @target itself is in @table.  (It is not
+ * good enough for @table to contain a different flow with a key equal to
+ * @target's key.)
+ *
+ * Returns 0 or a negative error (currently just -ENOMEM).  Yes, it *is*
+ * possible for a flow deletion to fail due to lack of memory.
+ *
+ * The caller is responsible for updating &struct datapath's n_flows member.
+ */
  int dp_table_delete(struct dp_table *table, struct sw_flow *target)
  {
-       struct sw_flow **buckets[2];
-       int i;
+       u32 hash = flow_hash(table, &target->key);
+       struct dp_bucket **oldp = find_bucket(table, hash);
+       struct dp_bucket *old = rcu_dereference(*oldp);
+       unsigned int n = old->n_flows;
+       struct dp_bucket *new;
+
+       if (n > 1) {
+               unsigned int i;
  
-       find_buckets(table, &target->key, buckets);
-       for (i = 0; i < 2; i++) {
-               struct sw_flow *flow = rcu_dereference(*buckets[i]);
-               if (flow == target) {
-                       rcu_assign_pointer(*buckets[i], NULL);
-                       return 0;
+               new = dp_bucket_alloc(n - 1);
+               if (!new)
+                       return -ENOMEM;
+
+               new->n_flows = 0;
+               for (i = 0; i < n; i++) {
+                       struct sw_flow *flow = old->flows[i];
+                       if (flow != target)
+                               new->flows[new->n_flows++] = flow;
                 }
+               WARN_ON_ONCE(new->n_flows != n - 1);
+       } else {
+               new = NULL;
         }
-       return -ENOENT;
+
+       rcu_assign_pointer(*oldp, new);
+       call_rcu(&old->rcu, dp_free_bucket_rcu);
+
+       return 0;
  }
diff --git a/debian/corekeeper.init b/debian/corekeeper.init

index 27d62a1..d820b02 100755 (executable)
--- a/debian/corekeeper.init
+++ b/debian/corekeeper.init
@@ -6,6 +6,7 @@
  # adjust it to the program you want to run.
  #
  # Copyright (c) 2007 Javier Fernandez-Sanguino <jfs@debian.org>
+# Copyright (c) 2009 Nicira Networks, Inc.
  #
  # This is free software; you may redistribute it and/or modify
  # it under the terms of the GNU General Public License as
@@ -42,7 +43,7 @@ set -e
  case "$1" in
    start)
         log_daemon_msg "Initializing core dump location..."
-        if echo "/var/log/core/core.%e.%t" > /proc/sys/kernel/core_pattern
+        if echo "/var/log/core/core.%e.%t.%p" > /proc/sys/kernel/core_pattern
          then
              log_progress_msg "success"
              log_end_msg 0
diff --git a/extras/ezio/ovs-switchui.c b/extras/ezio/ovs-switchui.c

index 721717e..0f6640e 100644 (file)
--- a/extras/ezio/ovs-switchui.c
+++ b/extras/ezio/ovs-switchui.c
@@ -2480,7 +2480,7 @@ choose_netdevs(struct svec *choices)
  
          retval = netdev_open(name, NETDEV_ETH_TYPE_NONE, &netdev);
          if (!retval) {
-            bool exclude = netdev_get_in4(netdev, NULL) == 0;
+            bool exclude = netdev_get_in4(netdev, NULL, NULL) == 0;
              netdev_close(netdev);
              if (exclude) {
                  continue;
diff --git a/include/openflow/openflow-mgmt.h b/include/openflow/openflow-mgmt.h

index c3b62c9..04017d4 100644 (file)
--- a/include/openflow/openflow-mgmt.h
+++ b/include/openflow/openflow-mgmt.h
@@ -243,7 +243,8 @@ enum ofmp_extended_data_flags {
  
  /* Body of extended data message.  May be sent by either the switch or the
   * controller to send messages that are greater than 65535 bytes in
- * length.
+ * length.  The OpenFlow transaction id (xid) must be the same for all
+ * the individual OpenFlow messages that make up an extended message.
   *
   * OFMPT_EXTENDED_DATA (switch <-> controller) */
  struct ofmp_extended_data {
diff --git a/include/openvswitch/datapath-protocol.h b/include/openvswitch/datapath-protocol.h

index 868c854..04423d9 100644 (file)
--- a/include/openvswitch/datapath-protocol.h
+++ b/include/openvswitch/datapath-protocol.h
@@ -160,7 +160,8 @@ struct odp_flow_key {
      __be16 tp_dst;               /* TCP/UDP destination port. */
      __u8   dl_src[ETH_ALEN];     /* Ethernet source address. */
      __u8   dl_dst[ETH_ALEN];     /* Ethernet destination address. */
-    __u8   nw_proto;             /* IP protocol. */
+    __u8   nw_proto;             /* IP protocol or lower 8 bits of 
+                                    ARP opcode. */
      __u8   reserved;             /* Pad to 64 bits. */
  };
  
diff --git a/lib/flow.c b/lib/flow.c

index 1801d4d..c1f6240 100644 (file)
--- a/lib/flow.c
+++ b/lib/flow.c
@@ -31,6 +31,12 @@
  #include "vlog.h"
  #define THIS_MODULE VLM_flow
  
+static struct arp_eth_header *
+pull_arp(struct ofpbuf *packet)
+{
+    return ofpbuf_try_pull(packet, ARP_ETH_HEADER_LEN);
+}
+
  static struct ip_header *
  pull_ip(struct ofpbuf *packet)
  {
@@ -185,6 +191,23 @@ flow_extract(struct ofpbuf *packet, uint16_t in_port, flow_t *flow)
                      retval = 1;
                  }
              }
+        } else if (flow->dl_type == htons(ETH_TYPE_ARP)) {
+            const struct arp_eth_header *arp = pull_arp(&b);
+            if (arp && arp->ar_hrd == htons(1)
+                    && arp->ar_pro == htons(ETH_TYPE_IP) 
+                    && arp->ar_hln == ETH_ADDR_LEN
+                    && arp->ar_pln == 4) {
+                /* We only match on the lower 8 bits of the opcode. */
+                if (ntohs(arp->ar_op) <= 0xff) {
+                    flow->nw_proto = ntohs(arp->ar_op);
+                }
+
+                if ((flow->nw_proto == ARP_OP_REQUEST) 
+                        || (flow->nw_proto == ARP_OP_REPLY)) {
+                    flow->nw_src = arp->ar_spa;
+                    flow->nw_dst = arp->ar_tpa;
+                }
+            }
          }
      }
      return retval;
@@ -212,8 +235,12 @@ flow_extract_stats(const flow_t *flow, struct ofpbuf *packet,
      stats->n_packets = 1;
  }
  
+/* The Open vSwitch datapath supports matching on ARP payloads, which 
+ * OpenFlow does not.  This function is identical to 'flow_to_match',
+ * but does not hide the datapath's ability to match on ARP. */
  void
-flow_to_match(const flow_t *flow, uint32_t wildcards, struct ofp_match *match)
+flow_to_ovs_match(const flow_t *flow, uint32_t wildcards, 
+                  struct ofp_match *match)
  {
      match->wildcards = htonl(wildcards);
      match->in_port = htons(flow->in_port == ODPP_LOCAL ? OFPP_LOCAL
@@ -230,6 +257,26 @@ flow_to_match(const flow_t *flow, uint32_t wildcards, struct ofp_match *match)
      match->pad = 0;
  }
  
+/* Extract 'flow' with 'wildcards' into the OpenFlow match structure
+ * 'match'. */
+void
+flow_to_match(const flow_t *flow, uint32_t wildcards, struct ofp_match *match)
+{
+    flow_to_ovs_match(flow, wildcards, match);
+
+    /* The datapath supports matching on an ARP's opcode and IP addresses, 
+     * but OpenFlow does not.  We wildcard and zero out the appropriate
+     * fields so that OpenFlow is unaware of our trickery. */
+    if (flow->dl_type == htons(ETH_TYPE_ARP)) {
+        wildcards |= (OFPFW_NW_PROTO | OFPFW_NW_SRC_ALL | OFPFW_NW_DST_ALL);
+        match->nw_src = 0;
+        match->nw_dst = 0;
+        match->nw_proto = 0;
+    }
+    match->wildcards = htonl(wildcards);
+}
+
+
  void
  flow_from_match(flow_t *flow, uint32_t *wildcards,
                  const struct ofp_match *match)
@@ -237,6 +284,14 @@ flow_from_match(flow_t *flow, uint32_t *wildcards,
      if (wildcards) {
          *wildcards = ntohl(match->wildcards);
      }
+    /* The datapath supports matching on an ARP's opcode and IP addresses, 
+     * but OpenFlow does not.  In case the controller hasn't, we need to 
+     * set the appropriate wildcard bits so that we're externally 
+     * OpenFlow-compliant. */
+    if (match->dl_type == htons(ETH_TYPE_ARP)) {
+        *wildcards |= (OFPFW_NW_PROTO | OFPFW_NW_SRC_ALL | OFPFW_NW_DST_ALL);
+    }
+
      flow->nw_src = match->nw_src;
      flow->nw_dst = match->nw_dst;
      flow->in_port = (match->in_port == htons(OFPP_LOCAL) ? ODPP_LOCAL
diff --git a/lib/flow.h b/lib/flow.h

index 3541505..cb20109 100644 (file)
--- a/lib/flow.h
+++ b/lib/flow.h
@@ -36,6 +36,7 @@ int flow_extract(struct ofpbuf *, uint16_t in_port, flow_t *);
  void flow_extract_stats(const flow_t *flow, struct ofpbuf *packet, 
          struct odp_flow_stats *stats);
  void flow_to_match(const flow_t *, uint32_t wildcards, struct ofp_match *);
+void flow_to_ovs_match(const flow_t *, uint32_t wildcards, struct ofp_match *);
  void flow_from_match(flow_t *, uint32_t *wildcards, const struct ofp_match *);
  char *flow_to_string(const flow_t *);
  void flow_format(struct ds *, const flow_t *);
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c

index 3e34044..11d83e9 100644 (file)
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -96,7 +96,7 @@ struct netdev_linux_cache {
  
      int ifindex;
      uint8_t etheraddr[ETH_ADDR_LEN];
-    struct in_addr in4;
+    struct in_addr address, netmask;
      struct in6_addr in6;
      int mtu;
      int carrier;
@@ -125,6 +125,8 @@ static int netdev_linux_do_ethtool(struct netdev *, struct ethtool_cmd *,
                                     int cmd, const char *cmd_name);
  static int netdev_linux_do_ioctl(const struct netdev *, struct ifreq *,
                                   int cmd, const char *cmd_name);
+static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
+                                 int cmd, const char *cmd_name);
  static int get_flags(const struct netdev *, int *flagsp);
  static int set_flags(struct netdev *, int flags);
  static int do_get_ifindex(const char *netdev_name);
@@ -935,49 +937,48 @@ netdev_linux_set_policing(struct netdev *netdev,
      return 0;
  }
  
-/* If 'netdev' has an assigned IPv4 address, sets '*in4' to that address (if
- * 'in4' is non-null) and returns true.  Otherwise, returns false. */
  static int
-netdev_linux_get_in4(const struct netdev *netdev_, struct in_addr *in4)
+netdev_linux_get_in4(const struct netdev *netdev_,
+                     struct in_addr *address, struct in_addr *netmask)
  {
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
      if (!(netdev->cache->valid & VALID_IN4)) {
-        const struct sockaddr_in *sin;
-        struct ifreq ifr;
          int error;
  
-        ifr.ifr_addr.sa_family = AF_INET;
-        error = netdev_linux_do_ioctl(netdev_, &ifr,
+        error = netdev_linux_get_ipv4(netdev_, &netdev->cache->address,
                                        SIOCGIFADDR, "SIOCGIFADDR");
          if (error) {
              return error;
          }
  
-        sin = (struct sockaddr_in *) &ifr.ifr_addr;
-        netdev->cache->in4 = sin->sin_addr;
+        error = netdev_linux_get_ipv4(netdev_, &netdev->cache->netmask,
+                                      SIOCGIFNETMASK, "SIOCGIFNETMASK");
+        if (error) {
+            return error;
+        }
+
          netdev->cache->valid |= VALID_IN4;
      }
-    *in4 = netdev->cache->in4;
-    return in4->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
+    *address = netdev->cache->address;
+    *netmask = netdev->cache->netmask;
+    return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
  }
  
-/* Assigns 'addr' as 'netdev''s IPv4 address and 'mask' as its netmask.  If
- * 'addr' is INADDR_ANY, 'netdev''s IPv4 address is cleared.  Returns a
- * positive errno value. */
  static int
-netdev_linux_set_in4(struct netdev *netdev_, struct in_addr addr,
-                     struct in_addr mask)
+netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
+                     struct in_addr netmask)
  {
      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
      int error;
  
-    error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", addr);
+    error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
      if (!error) {
          netdev->cache->valid |= VALID_IN4;
-        netdev->cache->in4 = addr;
-        if (addr.s_addr != INADDR_ANY) {
+        netdev->cache->address = address;
+        netdev->cache->netmask = netmask;
+        if (address.s_addr != INADDR_ANY) {
              error = do_set_addr(netdev_, SIOCSIFNETMASK,
-                                "SIOCSIFNETMASK", mask);
+                                "SIOCSIFNETMASK", netmask);
          }
      }
      return error;
@@ -1076,6 +1077,67 @@ netdev_linux_add_router(struct netdev *netdev UNUSED, struct in_addr router)
      return error;
  }
  
+/* Looks up the next hop for 'host' in /proc/net/route.  On success, stores
+ * the gateway's address (0 if 'host' is directly reachable) in '*next_hop'
+ * and a malloc()'d copy of the device name in '*netdev_name', and returns 0.
+ * Otherwise leaves '*netdev_name' null and returns a positive errno value. */
+static int
+netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
+                          char **netdev_name)
+{
+    static const char fn[] = "/proc/net/route";
+    FILE *stream;
+    char line[256];
+    int ln;
+
+    *netdev_name = NULL;
+    stream = fopen(fn, "r");
+    if (stream == NULL) {
+        int error = errno;      /* Logging below may clobber errno. */
+        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
+        return error;
+    }
+
+    ln = 0;
+    while (fgets(line, sizeof line, stream)) {
+        if (++ln >= 2) {
+            char iface[17];
+            uint32_t dest, gateway, mask;
+            int refcnt, metric, mtu;
+            unsigned int flags, use, window, irtt;
+
+            if (sscanf(line,
+                       "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
+                       " %d %u %u\n",
+                       iface, &dest, &gateway, &flags, &refcnt,
+                       &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
+                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
+                        fn, ln, line);
+                continue;
+            }
+            if (!(flags & RTF_UP)) {
+                /* Skip routes that aren't up. */
+                continue;
+            }
+
+            /* The output of 'dest', 'mask', and 'gateway' were given in
+             * network byte order, so we don't need any endian
+             * conversions here. */
+            if ((dest & mask) == (host->s_addr & mask)) {
+                /* A zero gateway means the host is directly reachable;
+                 * otherwise we must go through the gateway to reach it. */
+                next_hop->s_addr = gateway;
+                *netdev_name = xstrdup(iface);
+                fclose(stream);
+                return 0;
+            }
+        }
+    }
+
+    fclose(stream);
+    return ENXIO;
+}
+
  /* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can be
   * successfully retrieved, it stores the corresponding MAC address in 'mac' and
   * returns 0.  Otherwise, it returns a positive errno value; in particular,
@@ -1269,6 +1331,7 @@ const struct netdev_class netdev_linux_class = {
      netdev_linux_set_in4,
      netdev_linux_get_in6,
      netdev_linux_add_router,
+    netdev_linux_get_next_hop,
      netdev_linux_arp_lookup,
  
      netdev_linux_update_flags,
@@ -1312,6 +1375,7 @@ const struct netdev_class netdev_tap_class = {
      netdev_linux_set_in4,
      netdev_linux_get_in6,
      netdev_linux_add_router,
+    netdev_linux_get_next_hop,
      netdev_linux_arp_lookup,
  
      netdev_linux_update_flags,
@@ -1591,3 +1655,19 @@ netdev_linux_do_ioctl(const struct netdev *netdev, struct ifreq *ifr,
      }
      return 0;
  }
+
+/* If ioctl 'cmd' on 'netdev' succeeds, stores the IPv4 address in '*ip'. */
+static int
+netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
+                      int cmd, const char *cmd_name)
+{
+    struct ifreq req;
+    int retval;
+
+    req.ifr_addr.sa_family = AF_INET;
+    retval = netdev_linux_do_ioctl(netdev, &req, cmd, cmd_name);
+    if (!retval) {
+        *ip = ((const struct sockaddr_in *) &req.ifr_addr)->sin_addr;
+    }
+    return retval;
+}
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h

index 9c880b1..a573e24 100644 (file)
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -208,7 +208,8 @@ struct netdev_class {
      int (*set_policing)(struct netdev *netdev, unsigned int kbits_rate,
                          unsigned int kbits_burst);
  
-    /* If 'netdev' has an assigned IPv4 address, sets '*in4' to that address.
+    /* If 'netdev' has an assigned IPv4 address, sets '*address' to that
+     * address and '*netmask' to the associated netmask.
       *
       * The following error values have well-defined meanings:
       *
@@ -218,7 +219,8 @@ struct netdev_class {
       *
       * This function may be set to null if it would always return EOPNOTSUPP
       * anyhow. */
-    int (*get_in4)(const struct netdev *netdev, struct in_addr *in4);
+    int (*get_in4)(const struct netdev *netdev, struct in_addr *address,
+                   struct in_addr *netmask);
  
      /* Assigns 'addr' as 'netdev''s IPv4 address and 'mask' as its netmask.  If
       * 'addr' is INADDR_ANY, 'netdev''s IPv4 address is cleared.
@@ -246,6 +248,17 @@ struct netdev_class {
       * anyhow. */
      int (*add_router)(struct netdev *netdev, struct in_addr router);
  
+    /* Looks up the next hop for 'host'.  If successful, stores the next hop
+     * gateway's address (0 if 'host' is on a directly connected network) in
+     * '*next_hop' and a copy of the name of the device to reach 'host' in
+     * '*netdev_name', and returns 0.  The caller is responsible for freeing
+     * '*netdev_name' (by calling free()).
+     *
+     * This function may be set to null if it would always return EOPNOTSUPP
+     * anyhow. */
+    int (*get_next_hop)(const struct in_addr *host, struct in_addr *next_hop,
+                        char **netdev_name);
+
      /* Looks up the ARP table entry for 'ip' on 'netdev' and stores the
       * corresponding MAC address in 'mac'.  A return value of ENXIO, in
       * particular, indicates that there is no ARP table entry for 'ip' on
diff --git a/lib/netdev.c b/lib/netdev.c

index dcb63fa..38610e1 100644 (file)
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -391,9 +391,9 @@ netdev_set_advertisements(struct netdev *netdev, uint32_t advertise)
              : EOPNOTSUPP);
  }
  
-/* If 'netdev' has an assigned IPv4 address, sets '*in4' to that address and
- * returns 0.  Otherwise, returns a positive errno value and sets '*in4' to 0
- * (INADDR_ANY).
+/* If 'netdev' has an assigned IPv4 address, sets '*address' to that address
+ * and '*netmask' to its netmask and returns 0.  Otherwise, returns a positive
+ * errno value and sets '*address' and '*netmask' to 0 (INADDR_ANY).
   *
   * The following error values have well-defined meanings:
   *
@@ -401,18 +401,24 @@ netdev_set_advertisements(struct netdev *netdev, uint32_t advertise)
   *
   *   - EOPNOTSUPP: No IPv4 network stack attached to 'netdev'.
   *
- * 'in4' may be null, in which case the address itself is not reported. */
+ * 'address' or 'netmask' or both may be null, in which case the address or netmask
+ * is not reported. */
  int
-netdev_get_in4(const struct netdev *netdev, struct in_addr *in4)
+netdev_get_in4(const struct netdev *netdev,
+               struct in_addr *address_, struct in_addr *netmask_)
  {
-    struct in_addr dummy;
+    struct in_addr address;
+    struct in_addr netmask;
      int error;
  
      error = (netdev->class->get_in4
-             ? netdev->class->get_in4(netdev, in4 ? in4 : &dummy)
+             ? netdev->class->get_in4(netdev, &address, &netmask)
               : EOPNOTSUPP);
-    if (error && in4) {
-        in4->s_addr = 0;
+    if (address_) {
+        address_->s_addr = error ? 0 : address.s_addr;
+    }
+    if (netmask_) {
+        netmask_->s_addr = error ? 0 : netmask.s_addr;
      }
      return error;
  }
@@ -439,6 +445,28 @@ netdev_add_router(struct netdev *netdev, struct in_addr router)
              : EOPNOTSUPP);
  }
  
+/* Asks 'netdev''s class for the next hop toward 'host'.  On success, returns
+ * 0 with the gateway's address (0 if 'host' is on a directly connected
+ * network) in '*next_hop' and a copy of the outgoing device's name, which
+ * the caller must free(), in '*netdev_name'.  On failure, returns a positive
+ * errno value with '*next_hop' set to 0 and '*netdev_name' set to null. */
+int
+netdev_get_next_hop(const struct netdev *netdev,
+                    const struct in_addr *host, struct in_addr *next_hop,
+                    char **netdev_name)
+{
+    int retval = EOPNOTSUPP;
+
+    if (netdev->class->get_next_hop) {
+        retval = netdev->class->get_next_hop(host, next_hop, netdev_name);
+    }
+    if (retval) {
+        next_hop->s_addr = 0;
+        *netdev_name = NULL;
+    }
+    return retval;
+}
+
  /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address and
   * returns 0.  Otherwise, returns a positive errno value and sets '*in6' to
   * all-zero-bits (in6addr_any).
@@ -633,7 +661,7 @@ netdev_find_dev_by_in4(const struct in_addr *in4)
          struct in_addr dev_in4;
  
          if (!netdev_open(name, NETDEV_ETH_TYPE_NONE, &netdev)
-            && !netdev_get_in4(netdev, &dev_in4)
+            && !netdev_get_in4(netdev, &dev_in4, NULL)
              && dev_in4.s_addr == in4->s_addr) {
              goto exit;
          }
diff --git a/lib/netdev.h b/lib/netdev.h

index b66d7bc..4a29cf3 100644 (file)
--- a/lib/netdev.h
+++ b/lib/netdev.h
@@ -107,10 +107,13 @@ int netdev_get_features(struct netdev *,
                          uint32_t *supported, uint32_t *peer);
  int netdev_set_advertisements(struct netdev *, uint32_t advertise);
  
-int netdev_get_in4(const struct netdev *, struct in_addr *);
+int netdev_get_in4(const struct netdev *, struct in_addr *address,
+                   struct in_addr *netmask);
  int netdev_set_in4(struct netdev *, struct in_addr addr, struct in_addr mask);
  int netdev_get_in6(const struct netdev *, struct in6_addr *);
  int netdev_add_router(struct netdev *, struct in_addr router);
+int netdev_get_next_hop(const struct netdev *, const struct in_addr *host,
+                        struct in_addr *next_hop, char **);
  int netdev_arp_lookup(const struct netdev *, uint32_t ip, uint8_t mac[6]);
  
  int netdev_get_flags(const struct netdev *, enum netdev_flags *);
diff --git a/ofproto/in-band.c b/ofproto/in-band.c

index a08af07..18415f4 100644 (file)
--- a/ofproto/in-band.c
+++ b/ofproto/in-band.c
@@ -22,6 +22,8 @@
  #include <net/if.h>
  #include <string.h>
  #include <stdlib.h>
+#include "dhcp.h"
+#include "dpif.h"
  #include "flow.h"
  #include "mac-learning.h"
  #include "netdev.h"
@@ -30,6 +32,7 @@
  #include "ofproto.h"
  #include "ofpbuf.h"
  #include "openflow/openflow.h"
+#include "openvswitch/datapath-protocol.h"
  #include "packets.h"
  #include "poll-loop.h"
  #include "rconn.h"
@@ -43,14 +46,15 @@
  #define IB_BASE_PRIORITY 18181800
  
  enum {
-    IBR_FROM_LOCAL_PORT,        /* Sent by the local port. */
-    IBR_OFP_TO_LOCAL,           /* Sent to secure channel on local port. */
-    IBR_ARP_FROM_LOCAL,         /* ARP from the local port. */
-    IBR_ARP_FROM_CTL,           /* ARP from the controller. */
-    IBR_TO_CTL_OFP_SRC,         /* To controller, OpenFlow source port. */
-    IBR_TO_CTL_OFP_DST,         /* To controller, OpenFlow dest port. */
-    IBR_FROM_CTL_OFP_SRC,       /* From controller, OpenFlow source port. */
-    IBR_FROM_CTL_OFP_DST,       /* From controller, OpenFlow dest port. */
+    IBR_FROM_LOCAL_DHCP,          /* From local port, DHCP. */
+    IBR_TO_LOCAL_ARP,             /* To local port, ARP. */
+    IBR_FROM_LOCAL_ARP,           /* From local port, ARP. */
+    IBR_TO_REMOTE_ARP,            /* To remote MAC, ARP. */
+    IBR_FROM_REMOTE_ARP,          /* From remote MAC, ARP. */
+    IBR_TO_CTL_ARP,               /* To controller IP, ARP. */
+    IBR_FROM_CTL_ARP,             /* From controller IP, ARP. */
+    IBR_TO_CTL_OFP,               /* To controller, OpenFlow port. */
+    IBR_FROM_CTL_OFP,             /* From controller, OpenFlow port. */
  #if OFP_TCP_PORT != OFP_SSL_PORT
  #error Need to support separate TCP and SSL flows.
  #endif
@@ -69,17 +73,17 @@ struct in_band {
      struct rconn *controller;
      struct status_category *ss_cat;
  
-    /* Keeping track of controller's MAC address. */
-    uint32_t ip;                /* Current IP, 0 if unknown. */
-    uint32_t last_ip;           /* Last known IP, 0 if never known. */
-    uint8_t mac[ETH_ADDR_LEN];  /* Current MAC, 0 if unknown. */
-    uint8_t last_mac[ETH_ADDR_LEN]; /* Last known MAC, 0 if never known */
-    struct netdev *netdev;
-    time_t next_refresh;        /* Next time to refresh MAC address. */
+    /* Keep track of local port's information. */
+    uint8_t local_mac[ETH_ADDR_LEN];       /* Current MAC. */
+    struct netdev *local_netdev;           /* Local port's network device. */
+    time_t next_local_refresh;
  
-    /* Keeping track of the local port's MAC address. */
-    uint8_t local_mac[ETH_ADDR_LEN]; /* Current MAC. */
-    time_t next_local_refresh;  /* Next time to refresh MAC address. */
+    /* Keep track of controller and next hop's information. */
+    uint32_t controller_ip;                /* Controller IP, 0 if unknown. */
+    uint8_t remote_mac[ETH_ADDR_LEN];      /* Remote MAC. */
+    struct netdev *remote_netdev;
+    uint8_t last_remote_mac[ETH_ADDR_LEN]; /* Previous remote MAC. */
+    time_t next_remote_refresh;
  
      /* Rules that we set up. */
      struct ib_rule rules[N_IB_RULES];
@@ -88,58 +92,64 @@ struct in_band {
  static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60);
  
  static const uint8_t *
-get_controller_mac(struct in_band *ib)
+get_remote_mac(struct in_band *ib)
  {
+    int retval;
+    bool have_mac;
+    struct in_addr c_in4;   /* Controller's IP address. */
+    struct in_addr r_in4;   /* Next hop IP address. */
+    char *next_hop_dev;
      time_t now = time_now();
-    uint32_t controller_ip;
  
-    controller_ip = rconn_get_remote_ip(ib->controller);
-    if (controller_ip != ib->ip || now >= ib->next_refresh) {
-        bool have_mac;
-
-        ib->ip = controller_ip;
-
-        /* Look up MAC address. */
-        memset(ib->mac, 0, sizeof ib->mac);
-        if (ib->ip) {
-            struct in_addr local_in4 = { rconn_get_local_ip(ib->controller) };
-            struct in_addr in4;
-            int retval;
-
-            /* Refresh device with IP address 'in4'. */
-            if (!ib->netdev
-                || netdev_get_in4(ib->netdev, &in4)
-                || in4.s_addr != local_in4.s_addr)
-            {
-                netdev_close(ib->netdev);
-                ib->netdev = netdev_find_dev_by_in4(&local_in4);
-            }
+    if (now >= ib->next_remote_refresh) {
+        /* Find the next-hop IP address. */
+        c_in4.s_addr = ib->controller_ip;
+        memset(ib->remote_mac, 0, sizeof ib->remote_mac);
+        retval = netdev_get_next_hop(ib->local_netdev,
+                                     &c_in4, &r_in4, &next_hop_dev);
+        if (retval) {
+            VLOG_WARN("cannot find route for controller ("IP_FMT"): %s",
+                    IP_ARGS(&ib->controller_ip), strerror(retval));
+            ib->next_remote_refresh = now + 1;
+            return NULL;
+        }
+        if (!r_in4.s_addr) {
+            r_in4.s_addr = c_in4.s_addr;
+        }
  
-            if (ib->netdev) {
-                retval = netdev_arp_lookup(ib->netdev, ib->ip, ib->mac);
-                if (retval) {
-                    VLOG_DBG_RL(&rl, "cannot look up controller MAC address "
-                                "("IP_FMT"): %s",
-                                IP_ARGS(&ib->ip), strerror(retval));
-                }
-            } else {
-                VLOG_DBG_RL(&rl, "cannot find device with IP address "IP_FMT,
-                    IP_ARGS(&local_in4.s_addr));
+        /* (Re)open the next-hop device if it differs from the cached one. */
+        if (!ib->remote_netdev
+            || strcmp(netdev_get_name(ib->remote_netdev), next_hop_dev))
+        {
+            netdev_close(ib->remote_netdev);
+            retval = netdev_open(next_hop_dev, NETDEV_ETH_TYPE_NONE,
+                                 &ib->remote_netdev);
+            if (retval) {
+                VLOG_WARN_RL(&rl, "cannot open netdev %s (next hop to "
+                             "controller "IP_FMT"): %s", next_hop_dev,
+                             IP_ARGS(&ib->controller_ip), strerror(retval));
+                free(next_hop_dev);     /* Don't leak the name on error. */
+                ib->next_remote_refresh = now + 1;
+                return NULL;
              }
          }
-        have_mac = !eth_addr_is_zero(ib->mac);
  
-        /* Log changes in IP, MAC addresses. */
-        if (ib->ip && ib->ip != ib->last_ip) {
-            VLOG_DBG("controller IP address changed from "IP_FMT
-                     " to "IP_FMT, IP_ARGS(&ib->last_ip), IP_ARGS(&ib->ip));
-            ib->last_ip = ib->ip;
+        /* Look up the MAC address of the next-hop IP address. */
+        retval = netdev_arp_lookup(ib->remote_netdev, r_in4.s_addr,
+                                   ib->remote_mac);
+        if (retval) {
+            VLOG_DBG_RL(&rl, "cannot look up remote MAC address ("IP_FMT"): %s",
+                        IP_ARGS(&r_in4.s_addr), strerror(retval));
          }
-        if (have_mac && memcmp(ib->last_mac, ib->mac, ETH_ADDR_LEN)) {
-            VLOG_DBG("controller MAC address changed from "ETH_ADDR_FMT" to "
+        have_mac = !eth_addr_is_zero(ib->remote_mac);
+        free(next_hop_dev);
+        if (have_mac
+            && !eth_addr_equals(ib->last_remote_mac, ib->remote_mac)) {
+            VLOG_DBG("remote MAC address changed from "ETH_ADDR_FMT" to "
                       ETH_ADDR_FMT,
-                     ETH_ADDR_ARGS(ib->last_mac), ETH_ADDR_ARGS(ib->mac));
-            memcpy(ib->last_mac, ib->mac, ETH_ADDR_LEN);
+                     ETH_ADDR_ARGS(ib->last_remote_mac),
+                     ETH_ADDR_ARGS(ib->remote_mac));
+            memcpy(ib->last_remote_mac, ib->remote_mac, ETH_ADDR_LEN);
          }
  
          /* Schedule next refresh.
@@ -147,9 +157,11 @@ get_controller_mac(struct in_band *ib)
           * If we have an IP address but not a MAC address, then refresh
           * quickly, since we probably will get a MAC address soon (via ARP).
           * Otherwise, we can afford to wait a little while. */
-        ib->next_refresh = now + (!ib->ip || have_mac ? 10 : 1);
+        ib->next_remote_refresh 
+                = now + (!ib->controller_ip || have_mac ? 10 : 1);
      }
-    return !eth_addr_is_zero(ib->mac) ? ib->mac : NULL;
+
+    return !eth_addr_is_zero(ib->remote_mac) ? ib->remote_mac : NULL;
  }
  
  static const uint8_t *
@@ -158,7 +170,7 @@ get_local_mac(struct in_band *ib)
      time_t now = time_now();
      if (now >= ib->next_local_refresh) {
          uint8_t ea[ETH_ADDR_LEN];
-        if (ib->netdev && !netdev_get_etheraddr(ib->netdev, ea)) {
+        if (ib->local_netdev && !netdev_get_etheraddr(ib->local_netdev, ea)) {
              memcpy(ib->local_mac, ea, ETH_ADDR_LEN);
          }
          ib->next_local_refresh = now + 1;
@@ -170,19 +182,15 @@ static void
  in_band_status_cb(struct status_reply *sr, void *in_band_)
  {
      struct in_band *in_band = in_band_;
-    const uint8_t *local_mac;
-    const uint8_t *controller_mac;
  
-    local_mac = get_local_mac(in_band);
-    if (local_mac) {
+    if (!eth_addr_is_zero(in_band->local_mac)) {
          status_reply_put(sr, "local-mac="ETH_ADDR_FMT,
-                         ETH_ADDR_ARGS(local_mac));
+                         ETH_ADDR_ARGS(in_band->local_mac));
      }
  
-    controller_mac = get_controller_mac(in_band);
-    if (controller_mac) {
-        status_reply_put(sr, "controller-mac="ETH_ADDR_FMT,
-                         ETH_ADDR_ARGS(controller_mac));
+    if (!eth_addr_is_zero(in_band->remote_mac)) {
+        status_reply_put(sr, "remote-mac="ETH_ADDR_FMT,
+                         ETH_ADDR_ARGS(in_band->remote_mac));
      }
  }
  
@@ -224,54 +232,174 @@ setup_flow(struct in_band *in_band, int rule_idx, const flow_t *flow,
      }
  }
  
+/* Returns true if 'packet' should be sent to the local port regardless
+ * of the flow table.  Always false when 'in_band' is null. */
+bool
+in_band_msg_in_hook(struct in_band *in_band, const flow_t *flow,
+                    const struct ofpbuf *packet)
+{
+    if (!in_band) {
+        return false;
+    }
+
+    /* Regardless of how the flow table is configured, we want to be
+     * able to see replies to our DHCP requests. */
+    if (flow->dl_type == htons(ETH_TYPE_IP)
+            && flow->nw_proto == IP_TYPE_UDP
+            && flow->tp_src == htons(DHCP_SERVER_PORT)
+            && flow->tp_dst == htons(DHCP_CLIENT_PORT)
+            && packet->l7) {
+        struct dhcp_header *dhcp;
+        const uint8_t *local_mac;
+
+        dhcp = ofpbuf_at(packet, (char *)packet->l7 - (char *)packet->data,
+                         sizeof *dhcp);
+        if (!dhcp) {
+            return false;
+        }
+        /* get_local_mac() may return null (in_band_run() checks for it). */
+        local_mac = get_local_mac(in_band);
+        if (local_mac && eth_addr_equals(dhcp->chaddr, local_mac)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/* Returns true if a datapath rule matching 'flow' with 'actions' may be
+ * installed; a null 'in_band' permits every rule. */
+bool
+in_band_rule_check(struct in_band *in_band, const flow_t *flow,
+                   const struct odp_actions *actions)
+{
+    int idx;
+
+    if (!in_band) {
+        return true;
+    }
+
+    /* Only flows that could carry DHCP replies are restricted. */
+    if (flow->dl_type != htons(ETH_TYPE_IP)
+            || flow->nw_proto != IP_TYPE_UDP
+            || flow->tp_src != htons(DHCP_SERVER_PORT)
+            || flow->tp_dst != htons(DHCP_CLIENT_PORT)) {
+        return true;
+    }
+
+    /* Such a flow must output to the local port so replies still reach it. */
+    for (idx = 0; idx < actions->n_actions; idx++) {
+        if (actions->actions[idx].output.type == ODPAT_OUTPUT
+                && actions->actions[idx].output.port == ODPP_LOCAL) {
+            return true;
+        }
+    }
+    return false;
+}
+
  void
  in_band_run(struct in_band *in_band)
  {
-    const uint8_t *controller_mac;
+    time_t now = time_now();
+    uint32_t controller_ip;
+    const uint8_t *remote_mac;
      const uint8_t *local_mac;
      flow_t flow;
  
-    if (time_now() < MIN(in_band->next_refresh, in_band->next_local_refresh)) {
+    if (now < in_band->next_remote_refresh 
+            && now < in_band->next_local_refresh) {
          return;
      }
-    controller_mac = get_controller_mac(in_band);
-    local_mac = get_local_mac(in_band);
  
-    /* Switch traffic sent by the local port. */
-    memset(&flow, 0, sizeof flow);
-    flow.in_port = ODPP_LOCAL;
-    setup_flow(in_band, IBR_FROM_LOCAL_PORT, &flow, OFPFW_IN_PORT,
-               OFPP_NORMAL);
+    controller_ip = rconn_get_remote_ip(in_band->controller);
+    if (in_band->controller_ip && controller_ip != in_band->controller_ip) {
+        VLOG_DBG("controller IP address changed from "IP_FMT" to "IP_FMT, 
+                 IP_ARGS(&in_band->controller_ip),
+                 IP_ARGS(&controller_ip));
+    }
+    in_band->controller_ip = controller_ip;
+
+    remote_mac = get_remote_mac(in_band);
+    local_mac = get_local_mac(in_band);
  
      if (local_mac) {
-        /* Deliver traffic sent to the connection's interface. */
+        /* Allow DHCP requests to be sent from the local port. */
+        memset(&flow, 0, sizeof flow);
+        flow.in_port = ODPP_LOCAL;
+        flow.dl_type = htons(ETH_TYPE_IP);
+        memcpy(flow.dl_src, local_mac, ETH_ADDR_LEN);
+        flow.nw_proto = IP_TYPE_UDP;
+        flow.tp_src = htons(DHCP_CLIENT_PORT);
+        flow.tp_dst = htons(DHCP_SERVER_PORT);
+        setup_flow(in_band, IBR_FROM_LOCAL_DHCP, &flow,
+                   (OFPFW_IN_PORT | OFPFW_DL_TYPE | OFPFW_DL_SRC
+                    | OFPFW_NW_PROTO | OFPFW_TP_SRC | OFPFW_TP_DST), 
+                   OFPP_NORMAL);
+
+        /* Allow the connection's interface to receive directed ARP traffic. */
          memset(&flow, 0, sizeof flow);
+        flow.dl_type = htons(ETH_TYPE_ARP);
          memcpy(flow.dl_dst, local_mac, ETH_ADDR_LEN);
-        setup_flow(in_band, IBR_OFP_TO_LOCAL, &flow, OFPFW_DL_DST,
-                    OFPP_NORMAL);
+        flow.nw_proto = ARP_OP_REPLY;
+        setup_flow(in_band, IBR_TO_LOCAL_ARP, &flow,
+                   (OFPFW_DL_TYPE | OFPFW_DL_DST | OFPFW_NW_PROTO), 
+                   OFPP_NORMAL);
  
          /* Allow the connection's interface to be the source of ARP traffic. */
          memset(&flow, 0, sizeof flow);
          flow.dl_type = htons(ETH_TYPE_ARP);
          memcpy(flow.dl_src, local_mac, ETH_ADDR_LEN);
-        setup_flow(in_band, IBR_ARP_FROM_LOCAL, &flow,
-                   OFPFW_DL_TYPE | OFPFW_DL_SRC, OFPP_NORMAL);
+        flow.nw_proto = ARP_OP_REQUEST;
+        setup_flow(in_band, IBR_FROM_LOCAL_ARP, &flow,
+                   (OFPFW_DL_TYPE | OFPFW_DL_SRC | OFPFW_NW_PROTO),
+                   OFPP_NORMAL);
+    } else {
+        drop_flow(in_band, IBR_TO_LOCAL_ARP);
+        drop_flow(in_band, IBR_FROM_LOCAL_ARP);
+    }
+
+    if (remote_mac) {
+        /* Allow ARP replies to the remote side's MAC. */
+        memset(&flow, 0, sizeof flow);
+        flow.dl_type = htons(ETH_TYPE_ARP);
+        memcpy(flow.dl_dst, remote_mac, ETH_ADDR_LEN);
+        flow.nw_proto = ARP_OP_REPLY;
+        setup_flow(in_band, IBR_TO_REMOTE_ARP, &flow,
+                   (OFPFW_DL_TYPE | OFPFW_DL_DST | OFPFW_NW_PROTO), 
+                   OFPP_NORMAL);
+
+       /* Allow ARP requests from the remote side's MAC. */
+        memset(&flow, 0, sizeof flow);
+        flow.dl_type = htons(ETH_TYPE_ARP);
+        memcpy(flow.dl_src, remote_mac, ETH_ADDR_LEN);
+        flow.nw_proto = ARP_OP_REQUEST;
+        setup_flow(in_band, IBR_FROM_REMOTE_ARP, &flow,
+                   (OFPFW_DL_TYPE | OFPFW_DL_SRC | OFPFW_NW_PROTO), 
+                   OFPP_NORMAL);
      } else {
-        drop_flow(in_band, IBR_OFP_TO_LOCAL);
-        drop_flow(in_band, IBR_ARP_FROM_LOCAL);
+        drop_flow(in_band, IBR_TO_REMOTE_ARP);
+        drop_flow(in_band, IBR_FROM_REMOTE_ARP);
      }
  
-    if (controller_mac) {
-        /* Switch ARP requests sent by the controller.  (OFPP_NORMAL will "do
-         * the right thing" regarding VLANs here.) */
+    if (controller_ip) {
+        /* Allow ARP replies to the controller's IP. */
          memset(&flow, 0, sizeof flow);
          flow.dl_type = htons(ETH_TYPE_ARP);
-        memcpy(flow.dl_dst, eth_addr_broadcast, ETH_ADDR_LEN);
-        memcpy(flow.dl_src, controller_mac, ETH_ADDR_LEN);
-        setup_flow(in_band, IBR_ARP_FROM_CTL, &flow,
-                   OFPFW_DL_TYPE | OFPFW_DL_DST | OFPFW_DL_SRC,
+        flow.nw_proto = ARP_OP_REPLY;
+        flow.nw_dst = controller_ip;
+        setup_flow(in_band, IBR_TO_CTL_ARP, &flow,
+                   (OFPFW_DL_TYPE | OFPFW_NW_PROTO | OFPFW_NW_DST_MASK),
                     OFPP_NORMAL);
  
+       /* Allow ARP requests from the controller's IP. */
+        memset(&flow, 0, sizeof flow);
+        flow.dl_type = htons(ETH_TYPE_ARP);
+        flow.nw_proto = ARP_OP_REQUEST;
+        flow.nw_src = controller_ip;
+        setup_flow(in_band, IBR_FROM_CTL_ARP, &flow,
+                   (OFPFW_DL_TYPE | OFPFW_NW_PROTO | OFPFW_NW_SRC_MASK),
+                   OFPP_NORMAL);
+     
          /* OpenFlow traffic to or from the controller.
           *
           * (A given field's value is completely ignored if it is wildcarded,
@@ -279,29 +407,22 @@ in_band_run(struct in_band *in_band)
           * case here.) */
          memset(&flow, 0, sizeof flow);
          flow.dl_type = htons(ETH_TYPE_IP);
-        memcpy(flow.dl_src, controller_mac, ETH_ADDR_LEN);
-        memcpy(flow.dl_dst, controller_mac, ETH_ADDR_LEN);
          flow.nw_proto = IP_TYPE_TCP;
+        flow.nw_src = controller_ip;
+        flow.nw_dst = controller_ip;
          flow.tp_src = htons(OFP_TCP_PORT);
          flow.tp_dst = htons(OFP_TCP_PORT);
-        setup_flow(in_band, IBR_TO_CTL_OFP_SRC, &flow,
-                   (OFPFW_DL_TYPE | OFPFW_DL_DST | OFPFW_NW_PROTO
-                    | OFPFW_TP_SRC), OFPP_NORMAL);
-        setup_flow(in_band, IBR_TO_CTL_OFP_DST, &flow,
-                   (OFPFW_DL_TYPE | OFPFW_DL_DST | OFPFW_NW_PROTO
+        setup_flow(in_band, IBR_TO_CTL_OFP, &flow,
+                   (OFPFW_DL_TYPE | OFPFW_NW_PROTO | OFPFW_NW_DST_MASK 
                      | OFPFW_TP_DST), OFPP_NORMAL);
-        setup_flow(in_band, IBR_FROM_CTL_OFP_SRC, &flow,
-                   (OFPFW_DL_TYPE | OFPFW_DL_SRC | OFPFW_NW_PROTO
+        setup_flow(in_band, IBR_FROM_CTL_OFP, &flow,
+                   (OFPFW_DL_TYPE | OFPFW_NW_PROTO | OFPFW_NW_SRC_MASK
                      | OFPFW_TP_SRC), OFPP_NORMAL);
-        setup_flow(in_band, IBR_FROM_CTL_OFP_DST, &flow,
-                   (OFPFW_DL_TYPE | OFPFW_DL_SRC | OFPFW_NW_PROTO
-                    | OFPFW_TP_DST), OFPP_NORMAL);
      } else {
-        drop_flow(in_band, IBR_ARP_FROM_CTL);
-        drop_flow(in_band, IBR_TO_CTL_OFP_DST);
-        drop_flow(in_band, IBR_TO_CTL_OFP_SRC);
-        drop_flow(in_band, IBR_FROM_CTL_OFP_DST);
-        drop_flow(in_band, IBR_FROM_CTL_OFP_SRC);
+        drop_flow(in_band, IBR_TO_CTL_ARP);
+        drop_flow(in_band, IBR_FROM_CTL_ARP);
+        drop_flow(in_band, IBR_TO_CTL_OFP);
+        drop_flow(in_band, IBR_FROM_CTL_OFP);
      }
  }
  
@@ -309,7 +430,8 @@ void
  in_band_wait(struct in_band *in_band)
  {
      time_t now = time_now();
-    time_t wakeup = MIN(in_band->next_refresh, in_band->next_local_refresh);
+    time_t wakeup 
+            = MIN(in_band->next_remote_refresh, in_band->next_local_refresh);
      if (wakeup > now) {
          poll_timer_wait((wakeup - now) * 1000);
      } else {
@@ -327,22 +449,44 @@ in_band_flushed(struct in_band *in_band)
      }
  }
  
-void
-in_band_create(struct ofproto *ofproto, struct switch_status *ss,
-               struct rconn *controller, struct in_band **in_bandp)
+int
+in_band_create(struct ofproto *ofproto, struct dpif *dpif,
+               struct switch_status *ss, struct rconn *controller, 
+               struct in_band **in_bandp)
  {
      struct in_band *in_band;
+    char local_name[IF_NAMESIZE];
+    struct netdev *local_netdev;
+    int error;
+
+    error = dpif_port_get_name(dpif, ODPP_LOCAL,
+                               local_name, sizeof local_name);
+    if (error) {
+        VLOG_ERR("failed to initialize in-band control: cannot get name "
+                 "of datapath local port (%s)", strerror(error));
+        return error;
+    }
+
+    error = netdev_open(local_name, NETDEV_ETH_TYPE_NONE, &local_netdev);
+    if (error) {
+        VLOG_ERR("failed to initialize in-band control: cannot open "
+                 "datapath local port %s (%s)", local_name, strerror(error));
+        return error;
+    }
  
      in_band = xcalloc(1, sizeof *in_band);
      in_band->ofproto = ofproto;
      in_band->controller = controller;
      in_band->ss_cat = switch_status_register(ss, "in-band",
                                               in_band_status_cb, in_band);
-    in_band->next_refresh = TIME_MIN;
+    in_band->local_netdev = local_netdev;
      in_band->next_local_refresh = TIME_MIN;
-    in_band->netdev = NULL;
+    in_band->remote_netdev = NULL;
+    in_band->next_remote_refresh = TIME_MIN;
  
      *in_bandp = in_band;
+
+    return 0;
  }
  
  void
@@ -350,7 +494,8 @@ in_band_destroy(struct in_band *in_band)
  {
      if (in_band) {
          switch_status_unregister(in_band->ss_cat);
-        netdev_close(in_band->netdev);
+        netdev_close(in_band->local_netdev);
+        netdev_close(in_band->remote_netdev);
          /* We don't own the rconn. */
      }
  }
diff --git a/ofproto/in-band.h b/ofproto/in-band.h

index 624bee9..ddbc5e5 100644 (file)
--- a/ofproto/in-band.h
+++ b/ofproto/in-band.h
@@ -21,15 +21,20 @@
  
  struct dpif;
  struct in_band;
+struct odp_actions;
  struct ofproto;
  struct rconn;
  struct settings;
  struct switch_status;
  
-void in_band_create(struct ofproto *, struct switch_status *,
-                    struct rconn *controller, struct in_band **);
+int in_band_create(struct ofproto *, struct dpif *, struct switch_status *,
+                   struct rconn *controller, struct in_band **);
  void in_band_destroy(struct in_band *);
  void in_band_run(struct in_band *);
+bool in_band_msg_in_hook(struct in_band *, const flow_t *, 
+                         const struct ofpbuf *packet);
+bool in_band_rule_check(struct in_band *, const flow_t *,
+                        const struct odp_actions *);
  void in_band_wait(struct in_band *);
  void in_band_flushed(struct in_band *);
  
diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c

index dbaa75b..7650068 100644 (file)
--- a/ofproto/ofproto.c
+++ b/ofproto/ofproto.c
@@ -424,9 +424,8 @@ ofproto_set_in_band(struct ofproto *p, bool in_band)
  {
      if (in_band != (p->in_band != NULL)) {
          if (in_band) {
-            in_band_create(p, p->switch_status, p->controller->rconn, 
-                           &p->in_band);
-            return 0;
+            return in_band_create(p, p->dpif, p->switch_status,
+                                  p->controller->rconn, &p->in_band);
          } else {
              ofproto_set_discovery(p, false, NULL, true);
              in_band_destroy(p->in_band);
@@ -1700,7 +1699,7 @@ rule_post_uninstall(struct ofproto *ofproto, struct rule *rule)
      struct rule *super = rule->super;
  
      rule_account(ofproto, rule, 0);
-    if (ofproto->netflow) {
+    if (ofproto->netflow && rule->byte_count) {
          struct ofexpired expired;
          expired.flow = rule->cr.flow;
          expired.packet_count = rule->packet_count;
@@ -2127,6 +2126,13 @@ xlate_actions(const union ofp_action *in, size_t n_in,
      ctx.tags = tags ? tags : &no_tags;
      ctx.may_setup_flow = true;
      do_xlate_actions(in, n_in, &ctx);
+
+    /* Check with in-band control to see if we're allowed to set up this
+     * flow. */
+    if (!in_band_rule_check(ofproto->in_band, flow, out)) {
+        ctx.may_setup_flow = false;
+    }
+
      if (may_setup_flow) {
          *may_setup_flow = ctx.may_setup_flow;
      }
@@ -2516,11 +2522,11 @@ flow_stats_ds_cb(struct cls_rule *rule_, void *cbdata_)
      }
  
      query_stats(cbdata->ofproto, rule, &packet_count, &byte_count);
-    flow_to_match(&rule->cr.flow, rule->cr.wc.wildcards, &match);
+    flow_to_ovs_match(&rule->cr.flow, rule->cr.wc.wildcards, &match);
  
      ds_put_format(results, "duration=%llds, ",
                    (time_msec() - rule->created) / 1000);
-    ds_put_format(results, "priority=%u", rule->cr.priority);
+    ds_put_format(results, "priority=%u, ", rule->cr.priority);
      ds_put_format(results, "n_packets=%"PRIu64", ", packet_count);
      ds_put_format(results, "n_bytes=%"PRIu64", ", byte_count);
      ofp_print_match(results, &match, true);
@@ -3028,6 +3034,17 @@ handle_odp_msg(struct ofproto *p, struct ofpbuf *packet)
      payload.size = msg->length - sizeof *msg;
      flow_extract(&payload, msg->port, &flow);
  
+    /* Check with in-band control to see if this packet should be sent
+     * to the local port regardless of the flow table. */
+    if (in_band_msg_in_hook(p->in_band, &flow, &payload)) {
+        union odp_action action;
+
+        memset(&action, 0, sizeof(action));
+        action.output.type = ODPAT_OUTPUT;
+        action.output.port = ODPP_LOCAL;
+        dpif_execute(p->dpif, flow.in_port, &action, 1, &payload);
+    }
+
      rule = lookup_valid_rule(p, &flow);
      if (!rule) {
          /* Don't send a packet-in if OFPPC_NO_PACKET_IN asserted. */
diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c

index 869d717..7081512 100644 (file)
--- a/vswitchd/bridge.c
+++ b/vswitchd/bridge.c
@@ -1774,12 +1774,14 @@ compose_dsts(const struct bridge *br, const flow_t *flow, uint16_t vlan,
                  for (i = 0; i < br->n_ports; i++) {
                      struct port *port = br->ports[i];
                      if (port_includes_vlan(port, m->out_vlan)
-                        && set_dst(dst, flow, in_port, port, tags)
-                        && !dst_is_duplicate(dsts, dst - dsts, dst))
+                        && set_dst(dst, flow, in_port, port, tags))
                      {
                          if (port->vlan < 0) {
                              dst->vlan = m->out_vlan;
                          }
+                        if (dst_is_duplicate(dsts, dst - dsts, dst)) {
+                            continue;
+                        }
                          if (dst->dp_ifidx == flow->in_port
                              && dst->vlan == vlan) {
                              /* Don't send out input port on same VLAN. */
@@ -3369,6 +3371,7 @@ mirror_reconfigure_one(struct mirror *m)
      int *vlans;
      size_t i;
      bool mirror_all_ports;
+    bool any_ports_specified;
  
      /* Get output port. */
      out_port_name = cfg_get_key(0, "mirror.%s.%s.output.port",
@@ -3407,11 +3410,18 @@ mirror_reconfigure_one(struct mirror *m)
      cfg_get_all_keys(&src_ports, "%s.select.src-port", pfx);
      cfg_get_all_keys(&dst_ports, "%s.select.dst-port", pfx);
      cfg_get_all_keys(&ports, "%s.select.port", pfx);
+    any_ports_specified = src_ports.n || dst_ports.n || ports.n;
      svec_append(&src_ports, &ports);
      svec_append(&dst_ports, &ports);
      svec_destroy(&ports);
      prune_ports(m, &src_ports);
      prune_ports(m, &dst_ports);
+    if (any_ports_specified && !src_ports.n && !dst_ports.n) {
+        VLOG_ERR("%s: none of the specified ports exist; "
+                 "disabling port mirror %s", pfx, pfx);
+        mirror_destroy(m);
+        goto exit;
+    }
  
      /* Get all the vlans, and drop duplicate and invalid vlans. */
      svec_init(&vlan_strings);
@@ -3463,6 +3473,7 @@ mirror_reconfigure_one(struct mirror *m)
      }
  
      /* Clean up. */
+exit:
      svec_destroy(&src_ports);
      svec_destroy(&dst_ports);
      free(pfx);
diff --git a/vswitchd/mgmt.c b/vswitchd/mgmt.c

index e6e7d4e..d15b4ba 100644 (file)
--- a/vswitchd/mgmt.c
+++ b/vswitchd/mgmt.c
@@ -54,6 +54,7 @@ static struct rconn *mgmt_rconn;
  static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60);
  static struct svec capabilities;
  static struct ofpbuf ext_data_buffer;
+static uint32_t ext_data_xid = UINT32_MAX;
  uint64_t mgmt_id;
  
  
@@ -222,6 +223,10 @@ mgmt_reconfigure(void)
      if (retval == EAFNOSUPPORT) {
          VLOG_ERR("no support for %s vconn", controller_name);
      }
+
+    /* Reset the extended message buffer when we create a new
+     * management connection. */
+    ofpbuf_clear(&ext_data_buffer);
  }
  
  static void *
@@ -261,12 +266,18 @@ send_openflow_buffer(struct ofpbuf *buffer)
          return EINVAL;
      }
  
+    /* Make sure there's room to transmit the data.  We don't want to
+     * fail part way through a send. */
+    if (rconn_packet_counter_read(txqlen) >= TXQ_LIMIT) {
+        return EAGAIN;
+    }
+
      /* OpenFlow messages use a 16-bit length field, so messages over 64K
       * must be broken into multiple pieces. 
       */
      if (buffer->size <= 65535) {
          update_openflow_length(buffer);
-        retval = rconn_send_with_limit(mgmt_rconn, buffer, txqlen, TXQ_LIMIT);
+        retval = rconn_send(mgmt_rconn, buffer, txqlen);
          if (retval) {
              VLOG_WARN_RL(&rl, "send to %s failed: %s",
                           rconn_get_name(mgmt_rconn), strerror(retval));
@@ -292,12 +303,10 @@ send_openflow_buffer(struct ofpbuf *buffer)
                      &new_buffer);
              oed->type = header->type;
  
-            if (remain > 65535) {
+            if (remain > new_len) {
                  oed->flags |= OFMPEDF_MORE_DATA;
              }
  
-            printf("xxx SENDING LEN: %d\n", new_len);
-
              /* Copy the entire original message, including the OpenFlow
               * header, since management protocol structure definitions
               * include these headers.
@@ -305,8 +314,7 @@ send_openflow_buffer(struct ofpbuf *buffer)
              ofpbuf_put(new_buffer, ptr, new_len);
  
              update_openflow_length(new_buffer);
-            retval = rconn_send_with_limit(mgmt_rconn, new_buffer, txqlen, 
-                    TXQ_LIMIT);
+            retval = rconn_send(mgmt_rconn, new_buffer, txqlen);
              if (retval) {
                  VLOG_WARN_RL(&rl, "send to %s failed: %s",
                               rconn_get_name(mgmt_rconn), strerror(retval));
@@ -670,23 +678,48 @@ static int
  recv_ofmp_extended_data(uint32_t xid, const struct ofmp_header *ofmph,
          size_t len)
  {
-    size_t data_len;
+    int data_len;
      struct ofmp_extended_data *ofmped;
-    uint8_t *ptr;
  
-    data_len = len - sizeof(*ofmped);
-    if (data_len <= sizeof(*ofmped)) {
+    if (len <= sizeof(*ofmped)) {
          /* xxx Send error. */
          return -EINVAL;
      }
  
+    ext_data_xid = xid;
      ofmped = (struct ofmp_extended_data *)ofmph;
  
-    ptr = ofpbuf_put(&ext_data_buffer, ofmped->data, data_len);
+    data_len = len - sizeof(*ofmped);
+    ofpbuf_put(&ext_data_buffer, ofmped->data, data_len);
+
+    if (!(ofmped->flags & OFMPEDF_MORE_DATA)) {
+        struct ofmp_header *new_oh;
+        int error;
+
+        /* An embedded message must be larger than the maximum size of a
+         * single OpenFlow message (65535 bytes). */
+        new_oh = ofpbuf_at(&ext_data_buffer, 0, 65536);
+        if (!new_oh) {
+            VLOG_WARN_RL(&rl, "received short embedded message: %d\n",
+                    ext_data_buffer.size);
+            return -EINVAL;
+        }
+
+        /* Make sure that this is a management message and that there's
+         * not an embedded extended data message. */
+        if ((new_oh->header.vendor != htonl(NX_VENDOR_ID))
+                || (new_oh->header.subtype != htonl(NXT_MGMT))
+                || (new_oh->type == htonl(OFMPT_EXTENDED_DATA))) {
+            VLOG_WARN_RL(&rl, "received bad embedded message\n");
+            return -EINVAL;
+        }
+        new_oh->header.header.xid = ext_data_xid;
+        new_oh->header.header.length = 0;
  
-    if (!ofmped->flags & OFMPEDF_MORE_DATA) {
-        recv_ofmp(xid, ext_data_buffer.data, ext_data_buffer.size);
+        error = recv_ofmp(xid, ext_data_buffer.data, ext_data_buffer.size);
          ofpbuf_clear(&ext_data_buffer);
+
+        return error;
      }
  
      return 0;
@@ -707,6 +740,12 @@ int recv_ofmp(uint32_t xid, struct ofmp_header *ofmph, size_t len)
          len = ntohs(ofmph->header.header.length);
      }
  
+    /* Reset the extended data buffer if this isn't a continuation of an 
+     * existing extended data message. */
+    if (ext_data_xid != xid) {
+        ofpbuf_clear(&ext_data_buffer);
+    }
+
      /* xxx Should sanity-check for min/max length */
      switch (ntohs(ofmph->type)) 
      {
author	Ben Pfaff <blp@nicira.com>
	Wed, 2 Sep 2009 17:14:53 +0000 (10:14 -0700)
committer	Ben Pfaff <blp@nicira.com>
	Wed, 2 Sep 2009 17:14:53 +0000 (10:14 -0700)
datapath/datapath.c		patch \| blob \| history
datapath/datapath.h		patch \| blob \| history
datapath/flow.c		patch \| blob \| history
datapath/table.c		patch \| blob \| history
debian/corekeeper.init		patch \| blob \| history
extras/ezio/ovs-switchui.c		patch \| blob \| history
include/openflow/openflow-mgmt.h		patch \| blob \| history
include/openvswitch/datapath-protocol.h		patch \| blob \| history
lib/flow.c		patch \| blob \| history
lib/flow.h		patch \| blob \| history
lib/netdev-linux.c		patch \| blob \| history
lib/netdev-provider.h		patch \| blob \| history
lib/netdev.c		patch \| blob \| history
lib/netdev.h		patch \| blob \| history
ofproto/in-band.c		patch \| blob \| history
ofproto/in-band.h		patch \| blob \| history
ofproto/ofproto.c		patch \| blob \| history
vswitchd/bridge.c		patch \| blob \| history
vswitchd/mgmt.c		patch \| blob \| history