Fix off-by-one error in looking up datapaths by index.
[sliver-openvswitch.git] / datapath / datapath.c
index d2fcfa9..9d0cea7 100644 (file)
@@ -15,6 +15,7 @@
 #include <net/genetlink.h>
 #include <linux/ip.h>
 #include <linux/delay.h>
+#include <linux/time.h>
 #include <linux/etherdevice.h>
 #include <linux/kernel.h>
 #include <linux/kthread.h>
 #include <linux/ethtool.h>
 #include <linux/random.h>
 #include <asm/system.h>
+#include <asm/div64.h>
 #include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv4.h>
 #include <linux/inetdevice.h>
 #include <linux/list.h>
 #include <linux/rculist.h>
 #include <linux/workqueue.h>
+#include <linux/dmi.h>
 
-#include "openflow-netlink.h"
+#include "openflow/nicira-ext.h"
+#include "openflow/openflow-netlink.h"
 #include "datapath.h"
+#include "nx_act_snat.h"
 #include "table.h"
 #include "chain.h"
 #include "dp_dev.h"
@@ -44,9 +50,9 @@
 
 /* Strings to describe the manufacturer, hardware, and software.  This data 
  * is queriable through the switch description stats message. */
-static char mfr_desc[DESC_STR_LEN] = "Nicira Networks";
+static char mfr_desc[DESC_STR_LEN] = "Nicira Networks, Inc.";
 static char hw_desc[DESC_STR_LEN] = "Reference Linux Kernel Module";
-static char sw_desc[DESC_STR_LEN] = VERSION;
+static char sw_desc[DESC_STR_LEN] = VERSION BUILDNR;
 static char serial_num[SERIAL_NUM_LEN] = "None";
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
@@ -218,37 +224,15 @@ send_openflow_skb(struct sk_buff *skb, const struct sender *sender)
                : genlmsg_multicast(skb, 0, mc_group.id, GFP_ATOMIC));
 }
 
-/* Generates a unique datapath id.  It incorporates the datapath index
- * and a hardware address, if available.  If not, it generates a random
- * one.
- */
+/* Retrieves the datapath id, which is the MAC address of the "of" device. */
 static 
-uint64_t gen_datapath_id(uint16_t dp_idx)
+uint64_t get_datapath_id(struct net_device *dev)
 {
-       uint64_t id;
+       uint64_t id = 0;
        int i;
-       struct net_device *dev;
-
-       /* The top 16 bits are used to identify the datapath.  The lower 48 bits
-        * use an interface address.  */
-       id = (uint64_t)dp_idx << 48;
-       if ((dev = dev_get_by_name(&init_net, "ctl0")) 
-                       || (dev = dev_get_by_name(&init_net, "eth0"))) {
-               for (i=0; i<ETH_ALEN; i++) {
-                       id |= (uint64_t)dev->dev_addr[i] << (8*(ETH_ALEN-1 - i));
-               }
-               dev_put(dev);
-       } else {
-               /* Randomly choose the lower 48 bits if we cannot find an
-                * address and mark the most significant bit to indicate that
-                * this was randomly generated. */
-               uint8_t rand[ETH_ALEN];
-               get_random_bytes(rand, ETH_ALEN);
-               id |= (uint64_t)1 << 63;
-               for (i=0; i<ETH_ALEN; i++) {
-                       id |= (uint64_t)rand[i] << (8*(ETH_ALEN-1 - i));
-               }
-       }
+
+       for (i=0; i<ETH_ALEN; i++) 
+               id |= (uint64_t)dev->dev_addr[i] << (8*(ETH_ALEN-1 - i));
 
        return id;
 }
@@ -277,13 +261,13 @@ static int new_dp(int dp_idx)
        if (dp == NULL)
                goto err_unlock;
 
+       dp->dp_idx = dp_idx;
+
        /* Setup our "of" device */
        err = dp_dev_setup(dp);
        if (err)
                goto err_free_dp;
 
-       dp->dp_idx = dp_idx;
-       dp->id = gen_datapath_id(dp_idx);
        dp->chain = chain_create(dp);
        if (dp->chain == NULL)
                goto err_destroy_dp_dev;
@@ -323,7 +307,7 @@ err_unlock:
 static int find_portno(struct datapath *dp)
 {
        int i;
-       for (i = 0; i < OFPP_MAX; i++)
+       for (i = 0; i < DP_MAX_PORTS; i++)
                if (dp->ports[i] == NULL)
                        return i;
        return -EXFULL;
@@ -352,7 +336,7 @@ static struct net_bridge_port *new_nbp(struct datapath *dp,
        INIT_WORK(&p->port_task, NULL);
        if (port_no != OFPP_LOCAL)
                rcu_assign_pointer(dev->br_port, p);
-       if (port_no < OFPP_MAX)
+       if (port_no < DP_MAX_PORTS)
                rcu_assign_pointer(dp->ports[port_no], p); 
        list_add_rcu(&p->node, &dp->port_list);
 
@@ -387,6 +371,10 @@ int add_switch_port(struct datapath *dp, struct net_device *dev)
 /* Delete 'p' from switch. */
 int dp_del_switch_port(struct net_bridge_port *p)
 {
+#ifdef SUPPORT_SNAT
+       unsigned long flags;
+#endif
+
        /* First drop references to device. */
        cancel_work_sync(&p->port_task);
        rtnl_lock();
@@ -400,6 +388,13 @@ int dp_del_switch_port(struct net_bridge_port *p)
        /* Then wait until no one is still using it, and destroy it. */
        synchronize_rcu();
 
+#ifdef SUPPORT_SNAT
+       /* Free any SNAT configuration on the port. */
+       spin_lock_irqsave(&p->lock, flags);
+       snat_free_conf(p);
+       spin_unlock_irqrestore(&p->lock, flags);
+#endif
+
        /* Notify the ctlpath that this port no longer exists */
        dp_send_port_status(p, OFPPR_DELETE);
 
@@ -441,6 +436,16 @@ static int dp_maint_func(void *data)
        struct datapath *dp = (struct datapath *) data;
 
        while (!kthread_should_stop()) {
+#ifdef SUPPORT_SNAT
+               struct net_bridge_port *p;
+
+               /* Expire old SNAT entries */
+               rcu_read_lock();
+               list_for_each_entry_rcu (p, &dp->port_list, node) 
+                       snat_maint(p);
+               rcu_read_unlock();
+#endif
+
                /* Timeout old entries */
                chain_timeout(dp->chain);
                msleep_interruptible(MAINT_SLEEP_MSECS);
@@ -452,8 +457,24 @@ static int dp_maint_func(void *data)
 static void
 do_port_input(struct net_bridge_port *p, struct sk_buff *skb) 
 {
+       /* Make our own copy of the packet.  Otherwise we will mangle the
+        * packet for anyone who came before us (e.g. tcpdump via AF_PACKET).
+        * (No one comes after us, since we tell handle_bridge() that we took
+        * the packet.) */
+       skb = skb_share_check(skb, GFP_ATOMIC);
+       if (!skb)
+               return;
+
+#ifdef SUPPORT_SNAT
+       /* Check if this packet needs early SNAT processing. */
+       if (snat_pre_route(skb)) {
+               return;
+       }
+#endif
+
        /* Push the Ethernet header back on. */
        skb_push(skb, ETH_HLEN);
+       skb_reset_mac_header(skb);
        fwd_port_input(p->dp->chain, skb, p);
 }
 
@@ -536,16 +557,18 @@ output_all(struct datapath *dp, struct sk_buff *skb, int flood)
 void dp_set_origin(struct datapath *dp, uint16_t in_port,
                           struct sk_buff *skb)
 {
-       struct net_bridge_port *p = (in_port < OFPP_MAX ? dp->ports[in_port]
-                                    : in_port == OFPP_LOCAL ? dp->local_port
-                                    : NULL);
+       struct net_bridge_port *p;
+       p = (in_port < DP_MAX_PORTS ? dp->ports[in_port]
+            : in_port == OFPP_LOCAL ? dp->local_port
+            : NULL);
        if (p) 
                skb->dev = p->dev;
         else 
                skb->dev = NULL;
 }
 
-static int xmit_skb(struct sk_buff *skb)
+int 
+dp_xmit_skb(struct sk_buff *skb)
 {
        int len = skb->len;
        if (packet_length(skb) > skb->dev->mtu) {
@@ -576,7 +599,7 @@ int dp_output_port(struct datapath *dp, struct sk_buff *skb, int out_port,
                        kfree_skb(skb);
                        return -ESRCH;
                }
-               return xmit_skb(skb);
+               return dp_xmit_skb(skb);
                
        case OFPP_TABLE: {
                int retval = run_flow_through_tables(dp->chain, skb,
@@ -598,10 +621,13 @@ int dp_output_port(struct datapath *dp, struct sk_buff *skb, int out_port,
 
        case OFPP_LOCAL: {
                struct net_device *dev = dp->netdev;
+#ifdef SUPPORT_SNAT
+               snat_local_in(skb);
+#endif
                return dev ? dp_dev_recv(dev, skb) : -ESRCH;
        }
 
-       case 0 ... OFPP_MAX-1: {
+       case 0 ... DP_MAX_PORTS - 1: {
                struct net_bridge_port *p = dp->ports[out_port];
                if (p == NULL)
                        goto bad_port;
@@ -617,7 +643,7 @@ int dp_output_port(struct datapath *dp, struct sk_buff *skb, int out_port,
                        return 0;
                }
                skb->dev = p->dev; 
-               return xmit_skb(skb);
+               return dp_xmit_skb(skb);
        }
 
        default:
@@ -645,7 +671,6 @@ dp_output_control(struct datapath *dp, struct sk_buff *skb,
         * forward the whole packet? */
        struct sk_buff *f_skb;
        struct ofp_packet_in *opi;
-       struct net_bridge_port *p;
        size_t fwd_len, opi_len;
        int err;
 
@@ -661,11 +686,12 @@ dp_output_control(struct datapath *dp, struct sk_buff *skb,
        }
        opi->buffer_id      = htonl(buffer_id);
        opi->total_len      = htons(skb->len);
-       p = skb->dev->br_port;
-       opi->in_port        = htons(p ? p->port_no : OFPP_LOCAL);
+       opi->in_port        = htons(skb->dev && skb->dev->br_port
+                                   ? skb->dev->br_port->port_no
+                                   : OFPP_LOCAL);
        opi->reason         = reason;
        opi->pad            = 0;
-       memcpy(opi->data, skb_mac_header(skb), fwd_len);
+       skb_copy_bits(skb, 0, opi->data, fwd_len);
        err = send_openflow_skb(f_skb, NULL);
 
 out:
@@ -781,9 +807,10 @@ static int
 fill_features_reply(struct datapath *dp, struct ofp_switch_features *ofr)
 {
        struct net_bridge_port *p;
+       uint64_t dpid = get_datapath_id(dp->netdev);
        int port_count = 0;
 
-       ofr->datapath_id  = cpu_to_be64(dp->id); 
+       ofr->datapath_id  = cpu_to_be64(dpid);
 
        ofr->n_buffers    = htonl(N_PKT_BUFFERS);
        ofr->n_tables     = dp->chain->n_tables;
@@ -808,7 +835,7 @@ dp_send_features_reply(struct datapath *dp, const struct sender *sender)
        int port_count;
 
        /* Overallocate. */
-       port_max_len = sizeof(struct ofp_phy_port) * OFPP_MAX;
+       port_max_len = sizeof(struct ofp_phy_port) * DP_MAX_PORTS;
        ofr = alloc_openflow_skb(dp, sizeof(*ofr) + port_max_len,
                                 OFPT_FEATURES_REPLY, sender, &skb);
        if (!ofr)
@@ -898,9 +925,10 @@ dp_update_port_flags(struct datapath *dp, const struct ofp_port_mod *opm)
 {
        unsigned long int flags;
        int port_no = ntohs(opm->port_no);
-       struct net_bridge_port *p = (port_no < OFPP_MAX ? dp->ports[port_no]
-                                    : port_no == OFPP_LOCAL ? dp->local_port
-                                    : NULL);
+       struct net_bridge_port *p;
+       p = (port_no < DP_MAX_PORTS ? dp->ports[port_no]
+            : port_no == OFPP_LOCAL ? dp->local_port
+            : NULL);
 
        /* Make sure the port id hasn't changed since this was sent */
        if (!p || memcmp(opm->hw_addr, p->dev->dev_addr, ETH_ALEN))
@@ -971,34 +999,55 @@ dp_send_port_status(struct net_bridge_port *p, uint8_t status)
        return send_openflow_skb(skb, NULL);
 }
 
+/* Converts 'j', a 64-bit jiffies count, to milliseconds.
+ *
+ * When HZ is larger than MSEC_PER_SEC and a multiple of it, the result is
+ * rounded up; when neither divides the other, the result is truncated.
+ *
+ * The kernel doesn't support native 64-bit division on every architecture,
+ * so the 'do_div' macro is used instead of the '/' operator.  'do_div'
+ * replaces its first argument with the quotient. */
+static inline u64 jiffies_64_to_msecs(const u64 j)
+{
+       u64 msecs = j;
+
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+       /* A whole number of milliseconds per jiffy: multiply only. */
+       msecs *= MSEC_PER_SEC / HZ;
+#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+       /* A whole number of jiffies per millisecond: divide, rounding up. */
+       msecs += (HZ / MSEC_PER_SEC) - 1;
+       do_div(msecs, HZ / MSEC_PER_SEC);
+#else
+       /* Neither divides the other evenly: scale, then divide. */
+       msecs *= MSEC_PER_SEC;
+       do_div(msecs, HZ);
+#endif
+
+       return msecs;
+}
+
 int 
-dp_send_flow_expired(struct datapath *dp, struct sw_flow *flow,
-                    enum ofp_flow_expired_reason reason)
+dp_send_flow_end(struct datapath *dp, struct sw_flow *flow,
+                    enum nx_flow_end_reason reason)
 {
        struct sk_buff *skb;
-       struct ofp_flow_expired *ofe;
+       struct nx_flow_end *nfe;
 
-       if (!(dp->flags & OFPC_SEND_FLOW_EXP))
+       if (!dp->send_flow_end)
                return 0;
 
-       ofe = alloc_openflow_skb(dp, sizeof *ofe, OFPT_FLOW_EXPIRED, 0, &skb);
-       if (!ofe)
+       nfe = alloc_openflow_skb(dp, sizeof *nfe, OFPT_VENDOR, 0, &skb);
+       if (!nfe)
                return -ENOMEM;
 
-       flow_fill_match(&ofe->match, &flow->key);
+       nfe->header.vendor = htonl(NX_VENDOR_ID);
+       nfe->header.subtype = htonl(NXT_FLOW_END);
+
+       flow_fill_match(&nfe->match, &flow->key);
+
+       nfe->priority = htons(flow->priority);
+       nfe->reason = reason;
 
-       ofe->priority = htons(flow->priority);
-       ofe->reason = reason;
-       memset(ofe->pad, 0, sizeof ofe->pad);
+       nfe->tcp_flags = flow->tcp_flags;
+       nfe->ip_tos = flow->ip_tos;
 
-       ofe->duration     = htonl((jiffies - flow->init_time) / HZ);
-       memset(ofe->pad2, 0, sizeof ofe->pad2);
-       ofe->packet_count = cpu_to_be64(flow->packet_count);
-       ofe->byte_count   = cpu_to_be64(flow->byte_count);
+       memset(nfe->pad, 0, sizeof nfe->pad);
+
+       nfe->init_time = cpu_to_be64(jiffies_64_to_msecs(flow->created));
+       nfe->used_time = cpu_to_be64(jiffies_64_to_msecs(flow->used));
+       nfe->end_time = cpu_to_be64(jiffies_64_to_msecs(get_jiffies_64()));
+
+       nfe->packet_count = cpu_to_be64(flow->packet_count);
+       nfe->byte_count   = cpu_to_be64(flow->byte_count);
 
        return send_openflow_skb(skb, NULL);
 }
-EXPORT_SYMBOL(dp_send_flow_expired);
+EXPORT_SYMBOL(dp_send_flow_end);
 
 int
 dp_send_error_msg(struct datapath *dp, const struct sender *sender, 
@@ -1076,7 +1125,7 @@ static struct genl_ops dp_genl_ops_add_dp = {
 
 struct datapath *dp_get(int dp_idx)
 {
-       if (dp_idx < 0 || dp_idx > DP_MAX)
+       if (dp_idx < 0 || dp_idx >= DP_MAX)
                return NULL;
        return rcu_dereference(dps[dp_idx]);
 }
@@ -1089,7 +1138,7 @@ static int dp_genl_del(struct sk_buff *skb, struct genl_info *info)
        if (!info->attrs[DP_GENL_A_DP_IDX])
                return -EINVAL;
 
-       dp = dp_get(nla_get_u32((info->attrs[DP_GENL_A_DP_IDX])));
+       dp = dp_get(nla_get_u32(info->attrs[DP_GENL_A_DP_IDX]));
        if (!dp)
                err = -ENOENT;
        else {
@@ -1150,13 +1199,11 @@ static int dp_genl_query(struct sk_buff *skb, struct genl_info *info)
 
                genlmsg_end(ans_skb, data);
                err = genlmsg_reply(ans_skb, info);
-               if (!err)
-                       ans_skb = NULL;
+               ans_skb = NULL;
        }
 err:
 nla_put_failure:
-       if (ans_skb)
-               kfree_skb(ans_skb);
+       kfree_skb(ans_skb);
        rcu_read_unlock();
        return err;
 }
@@ -1308,6 +1355,7 @@ static int flow_stats_dump_callback(struct sw_flow *flow, void *private)
        struct flow_stats_state *s = private;
        struct ofp_flow_stats *ofs;
        int length;
+       uint64_t duration;
 
        length = sizeof *ofs + sf_acts->actions_len;
        if (length + s->bytes_used > s->bytes_allocated)
@@ -1329,7 +1377,14 @@ static int flow_stats_dump_callback(struct sw_flow *flow, void *private)
        ofs->match.pad       = 0;
        ofs->match.tp_src    = flow->key.tp_src;
        ofs->match.tp_dst    = flow->key.tp_dst;
-       ofs->duration        = htonl((jiffies - flow->init_time) / HZ);
+
+       /* The kernel doesn't support 64-bit division, so use the 'do_div' 
+        * macro instead.  The first argument is replaced with the quotient,
+        * while the remainder is the return value. */
+       duration = get_jiffies_64() - flow->created;
+       do_div(duration, HZ);
+       ofs->duration        = htonl(duration);
+
        ofs->priority        = htons(flow->priority);
        ofs->idle_timeout    = htons(flow->idle_timeout);
        ofs->hard_timeout    = htons(flow->hard_timeout);
@@ -1359,8 +1414,8 @@ static int flow_stats_dump(struct datapath *dp, void *state,
        {
                struct sw_table *table = dp->chain->tables[s->table_idx];
 
-               error = table->iterate(table, &match_key, &s->position,
-                                      flow_stats_dump_callback, s);
+               error = table->iterate(table, &match_key, s->rq->out_port, 
+                               &s->position, flow_stats_dump_callback, s);
                if (error)
                        break;
 
@@ -1424,7 +1479,7 @@ static int aggregate_stats_dump(struct datapath *dp, void *state,
                struct sw_table *table = dp->chain->tables[table_idx];
                int error;
 
-               error = table->iterate(table, &match_key, &position,
+               error = table->iterate(table, &match_key, rq->out_port, &position,
                                       aggregate_stats_dump_callback, rpy);
                if (error)
                        return error;
@@ -1492,7 +1547,7 @@ static int port_stats_dump(struct datapath *dp, void *state,
        ops = body;
 
        n_ports = 0;
-       for (i = s->port; i < OFPP_MAX && n_ports < max_ports; i++) {
+       for (i = s->port; i < DP_MAX_PORTS && n_ports < max_ports; i++) {
                struct net_bridge_port *p = dp->ports[i];
                struct net_device_stats *stats;
                if (!p)
@@ -1766,12 +1821,39 @@ static void dp_uninit_netlink(void)
        genl_unregister_family(&dp_genl_family);
 }
 
+/* Overrides the default manufacturer, hardware and serial number
+ * description strings with values from the DMI, if appropriate ones are
+ * available.
+ *
+ * The strings are only replaced when the DMI product UUID is a 36-character
+ * version 1 UUID whose last field starts with NICIRA_OUI_STR.  A DMI field
+ * that is not available leaves the corresponding default string untouched. */
+static void set_desc(void)
+{
+       const char *uuid = dmi_get_system_info(DMI_PRODUCT_UUID);
+       const char *vendor, *name, *version, *serial;
+
+       if (!uuid || strlen(uuid) != 36)
+               return;
+
+       /* We are only interested in version 1 UUIDs, since the last six
+        * bytes are an IEEE 802 MAC address. */
+       if (uuid[14] != '1')
+               return;
+
+       /* Only set if the UUID is from Nicira.  The MAC address field begins
+        * at offset 24 of the textual UUID. */
+       if (strncmp(uuid + 24, NICIRA_OUI_STR, strlen(NICIRA_OUI_STR)))
+               return;
+
+       /* dmi_get_system_info() returns NULL for a field that the DMI does
+        * not provide, so check every string before copying it. */
+       vendor = dmi_get_system_info(DMI_SYS_VENDOR);
+       if (vendor)
+               strlcpy(mfr_desc, vendor, sizeof(mfr_desc));
+
+       name = dmi_get_system_info(DMI_PRODUCT_NAME);
+       version = dmi_get_system_info(DMI_PRODUCT_VERSION);
+       if (name && version)
+               snprintf(hw_desc, sizeof(hw_desc), "%s %s", name, version);
+       else if (name)
+               strlcpy(hw_desc, name, sizeof(hw_desc));
+
+       serial = dmi_get_system_info(DMI_PRODUCT_SERIAL);
+       if (serial)
+               strlcpy(serial_num, serial, sizeof(serial_num));
+}
+
 static int __init dp_init(void)
 {
        int err;
 
-       printk("OpenFlow "VERSION", built "__DATE__" "__TIME__", "
-              "protocol 0x%02x\n", OFP_VERSION);
+       printk("OpenFlow %s, built "__DATE__" "__TIME__", "
+              "protocol 0x%02x\n", VERSION BUILDNR, OFP_VERSION);
 
        err = flow_init();
        if (err)
@@ -1785,6 +1867,10 @@ static int __init dp_init(void)
        if (err)
                goto error_unreg_notifier;
 
+       /* Check if better descriptions of the switch are available than the
+        * defaults. */
+       set_desc();
+
        /* Hook into callback used by the bridge to intercept packets.
         * Parasites we are. */
        if (br_handle_frame_hook)