Fix leaking of flows when output action validation fails.
[sliver-openvswitch.git] / datapath / datapath.c
index 564d303..27152c8 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/random.h>
 #include <asm/system.h>
 #include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv4.h>
 #include <linux/inetdevice.h>
 #include <linux/list.h>
 #include <linux/rculist.h>
@@ -33,6 +34,7 @@
 
 #include "openflow-netlink.h"
 #include "datapath.h"
+#include "nx_act_snat.h"
 #include "table.h"
 #include "chain.h"
 #include "dp_dev.h"
@@ -323,7 +325,7 @@ err_unlock:
 static int find_portno(struct datapath *dp)
 {
        int i;
-       for (i = 0; i < OFPP_MAX; i++)
+       for (i = 0; i < DP_MAX_PORTS; i++)
                if (dp->ports[i] == NULL)
                        return i;
        return -EXFULL;
@@ -352,7 +354,7 @@ static struct net_bridge_port *new_nbp(struct datapath *dp,
        INIT_WORK(&p->port_task, NULL);
        if (port_no != OFPP_LOCAL)
                rcu_assign_pointer(dev->br_port, p);
-       if (port_no < OFPP_MAX)
+       if (port_no < DP_MAX_PORTS)
                rcu_assign_pointer(dp->ports[port_no], p); 
        list_add_rcu(&p->node, &dp->port_list);
 
@@ -387,6 +389,10 @@ int add_switch_port(struct datapath *dp, struct net_device *dev)
 /* Delete 'p' from switch. */
 int dp_del_switch_port(struct net_bridge_port *p)
 {
+#ifdef SUPPORT_SNAT
+       unsigned long flags;
+#endif
+
        /* First drop references to device. */
        cancel_work_sync(&p->port_task);
        rtnl_lock();
@@ -400,6 +406,13 @@ int dp_del_switch_port(struct net_bridge_port *p)
        /* Then wait until no one is still using it, and destroy it. */
        synchronize_rcu();
 
+#ifdef SUPPORT_SNAT
+       /* Free any SNAT configuration on the port. */
+       spin_lock_irqsave(&p->lock, flags);
+       snat_free_conf(p);
+       spin_unlock_irqrestore(&p->lock, flags);
+#endif
+
        /* Notify the ctlpath that this port no longer exists */
        dp_send_port_status(p, OFPPR_DELETE);
 
@@ -441,6 +454,16 @@ static int dp_maint_func(void *data)
        struct datapath *dp = (struct datapath *) data;
 
        while (!kthread_should_stop()) {
+#ifdef SUPPORT_SNAT
+               struct net_bridge_port *p;
+
+               /* Expire old SNAT entries */
+               rcu_read_lock();
+               list_for_each_entry_rcu (p, &dp->port_list, node) 
+                       snat_maint(p);
+               rcu_read_unlock();
+#endif
+
                /* Timeout old entries */
                chain_timeout(dp->chain);
                msleep_interruptible(MAINT_SLEEP_MSECS);
@@ -452,6 +475,14 @@ static int dp_maint_func(void *data)
 static void
 do_port_input(struct net_bridge_port *p, struct sk_buff *skb) 
 {
+#ifdef SUPPORT_SNAT
+       /* Check if this packet needs early SNAT processing. */
+       if (snat_pre_route(skb)) {
+               kfree_skb(skb);
+               return;
+       }
+#endif
+
        /* Push the Ethernet header back on. */
        skb_push(skb, ETH_HLEN);
        fwd_port_input(p->dp->chain, skb, p);
@@ -506,12 +537,12 @@ static inline unsigned packet_length(const struct sk_buff *skb)
 static int
 output_all(struct datapath *dp, struct sk_buff *skb, int flood)
 {
-       u32 disable = flood ? OFPPFL_NO_FLOOD : 0;
+       u32 disable = flood ? OFPPC_NO_FLOOD : 0;
        struct net_bridge_port *p;
        int prev_port = -1;
 
        list_for_each_entry_rcu (p, &dp->port_list, node) {
-               if (skb->dev == p->dev || p->flags & disable)
+               if (skb->dev == p->dev || p->config & disable)
                        continue;
                if (prev_port != -1) {
                        struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
@@ -533,20 +564,21 @@ output_all(struct datapath *dp, struct sk_buff *skb, int flood)
 
 /* Marks 'skb' as having originated from 'in_port' in 'dp'.
    FIXME: how are devices reference counted? */
-int dp_set_origin(struct datapath *dp, uint16_t in_port,
+void dp_set_origin(struct datapath *dp, uint16_t in_port,
                            struct sk_buff *skb)
 {
-       struct net_bridge_port *p = (in_port < OFPP_MAX ? dp->ports[in_port]
-                                    : in_port == OFPP_LOCAL ? dp->local_port
-                                    : NULL);
-       if (p) {
+       struct net_bridge_port *p; /* port that 'in_port' resolves to, or NULL */
+       p = (in_port < DP_MAX_PORTS ? dp->ports[in_port] /* entry in dp->ports[] (may itself be NULL) */
+            : in_port == OFPP_LOCAL ? dp->local_port /* the datapath's local port */
+            : NULL); /* any other port number has no origin device */
+       if (p) 
                skb->dev = p->dev;
-               return 0;
-       }
-       return -ENOENT;
+        else 
+               skb->dev = NULL; /* replaces the old -ENOENT return: an unresolved port now leaves skb->dev NULL */
 }
 
-static int xmit_skb(struct sk_buff *skb)
+int 
+dp_xmit_skb(struct sk_buff *skb)
 {
        int len = skb->len;
        if (packet_length(skb) > skb->dev->mtu) {
@@ -574,10 +606,10 @@ int dp_output_port(struct datapath *dp, struct sk_buff *skb, int out_port,
                if (!skb->dev) {
                        if (net_ratelimit())
                                printk("skb device not set forwarding to in_port\n");
-                       kfree(skb);
+                       kfree_skb(skb);
                        return -ESRCH;
                }
-               return xmit_skb(skb);
+               return dp_xmit_skb(skb);
                
        case OFPP_TABLE: {
                int retval = run_flow_through_tables(dp->chain, skb,
@@ -599,10 +631,13 @@ int dp_output_port(struct datapath *dp, struct sk_buff *skb, int out_port,
 
        case OFPP_LOCAL: {
                struct net_device *dev = dp->netdev;
+#ifdef SUPPORT_SNAT
+               snat_local_in(skb);
+#endif
                return dev ? dp_dev_recv(dev, skb) : -ESRCH;
        }
 
-       case 0 ... OFPP_MAX-1: {
+       case 0 ... DP_MAX_PORTS - 1: {
                struct net_bridge_port *p = dp->ports[out_port];
                if (p == NULL)
                        goto bad_port;
@@ -613,12 +648,12 @@ int dp_output_port(struct datapath *dp, struct sk_buff *skb, int out_port,
                                printk("can't directly forward to input port\n");
                        return -EINVAL;
                }
-               if (p->flags & OFPPFL_NO_FWD && !ignore_no_fwd) {
+               if (p->config & OFPPC_NO_FWD && !ignore_no_fwd) {
                        kfree_skb(skb);
                        return 0;
                }
                skb->dev = p->dev; 
-               return xmit_skb(skb);
+               return dp_xmit_skb(skb);
        }
 
        default:
@@ -681,12 +716,14 @@ static void fill_port_desc(struct net_bridge_port *p, struct ofp_phy_port *desc)
        strncpy(desc->name, p->dev->name, OFP_MAX_PORT_NAME_LEN);
        desc->name[OFP_MAX_PORT_NAME_LEN-1] = '\0';
        memcpy(desc->hw_addr, p->dev->dev_addr, ETH_ALEN);
-       desc->flags = 0;
-       desc->features = 0;
-       desc->speed = 0;
+       desc->curr = 0;
+       desc->supported = 0;
+       desc->advertised = 0;
+       desc->peer = 0;
 
        spin_lock_irqsave(&p->lock, flags);
-       desc->flags = htonl(p->flags | p->status);
+       desc->config = htonl(p->config);
+       desc->state = htonl(p->state);
        spin_unlock_irqrestore(&p->lock, flags);
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,24)
@@ -694,27 +731,86 @@ static void fill_port_desc(struct net_bridge_port *p, struct ofp_phy_port *desc)
                struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET };
 
                if (!p->dev->ethtool_ops->get_settings(p->dev, &ecmd)) {
+                       /* Set the supported features */
                        if (ecmd.supported & SUPPORTED_10baseT_Half) 
-                               desc->features |= OFPPF_10MB_HD;
+                               desc->supported |= OFPPF_10MB_HD;
                        if (ecmd.supported & SUPPORTED_10baseT_Full)
-                               desc->features |= OFPPF_10MB_FD;
+                               desc->supported |= OFPPF_10MB_FD;
                        if (ecmd.supported & SUPPORTED_100baseT_Half) 
-                               desc->features |= OFPPF_100MB_HD;
+                               desc->supported |= OFPPF_100MB_HD;
                        if (ecmd.supported & SUPPORTED_100baseT_Full)
-                               desc->features |= OFPPF_100MB_FD;
+                               desc->supported |= OFPPF_100MB_FD;
                        if (ecmd.supported & SUPPORTED_1000baseT_Half)
-                               desc->features |= OFPPF_1GB_HD;
+                               desc->supported |= OFPPF_1GB_HD;
                        if (ecmd.supported & SUPPORTED_1000baseT_Full)
-                               desc->features |= OFPPF_1GB_FD;
-                       /* 10Gbps half-duplex doesn't exist... */
+                               desc->supported |= OFPPF_1GB_FD;
                        if (ecmd.supported & SUPPORTED_10000baseT_Full)
-                               desc->features |= OFPPF_10GB_FD;
-
-                       desc->speed = htonl(ecmd.speed);
+                               desc->supported |= OFPPF_10GB_FD;
+                       if (ecmd.supported & SUPPORTED_TP)
+                               desc->supported |= OFPPF_COPPER;
+                       if (ecmd.supported & SUPPORTED_FIBRE)
+                               desc->supported |= OFPPF_FIBER;
+                       if (ecmd.supported & SUPPORTED_Autoneg)
+                               desc->supported |= OFPPF_AUTONEG;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14)
+                       if (ecmd.supported & SUPPORTED_Pause)
+                               desc->supported |= OFPPF_PAUSE;
+                       if (ecmd.supported & SUPPORTED_Asym_Pause)
+                               desc->supported |= OFPPF_PAUSE_ASYM;
+#endif /* kernel >= 2.6.14 */
+
+                       /* Set the advertised features */
+                       if (ecmd.advertising & ADVERTISED_10baseT_Half) 
+                               desc->advertised |= OFPPF_10MB_HD;
+                       if (ecmd.advertising & ADVERTISED_10baseT_Full)
+                               desc->advertised |= OFPPF_10MB_FD;
+                       if (ecmd.advertising & ADVERTISED_100baseT_Half) 
+                               desc->advertised |= OFPPF_100MB_HD;
+                       if (ecmd.advertising & ADVERTISED_100baseT_Full)
+                               desc->advertised |= OFPPF_100MB_FD;
+                       if (ecmd.advertising & ADVERTISED_1000baseT_Half)
+                               desc->advertised |= OFPPF_1GB_HD;
+                       if (ecmd.advertising & ADVERTISED_1000baseT_Full)
+                               desc->advertised |= OFPPF_1GB_FD;
+                       if (ecmd.advertising & ADVERTISED_10000baseT_Full)
+                               desc->advertised |= OFPPF_10GB_FD;
+                       if (ecmd.advertising & ADVERTISED_TP)
+                               desc->advertised |= OFPPF_COPPER;
+                       if (ecmd.advertising & ADVERTISED_FIBRE)
+                               desc->advertised |= OFPPF_FIBER;
+                       if (ecmd.advertising & ADVERTISED_Autoneg)
+                               desc->advertised |= OFPPF_AUTONEG;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14)
+                       if (ecmd.advertising & ADVERTISED_Pause)
+                               desc->advertised |= OFPPF_PAUSE;
+                       if (ecmd.advertising & ADVERTISED_Asym_Pause)
+                               desc->advertised |= OFPPF_PAUSE_ASYM;
+#endif /* kernel >= 2.6.14 */
+
+                       /* Set the current features */
+                       if (ecmd.speed == SPEED_10)
+                               desc->curr = (ecmd.duplex) ? OFPPF_10MB_FD : OFPPF_10MB_HD;
+                       else if (ecmd.speed == SPEED_100)
+                               desc->curr = (ecmd.duplex) ? OFPPF_100MB_FD : OFPPF_100MB_HD;
+                       else if (ecmd.speed == SPEED_1000)
+                               desc->curr = (ecmd.duplex) ? OFPPF_1GB_FD : OFPPF_1GB_HD;
+                       else if (ecmd.speed == SPEED_10000)
+                               desc->curr = OFPPF_10GB_FD;
+
+                       if (ecmd.port == PORT_TP) 
+                               desc->curr |= OFPPF_COPPER;
+                       else if (ecmd.port == PORT_FIBRE) 
+                               desc->curr |= OFPPF_FIBER;
+
+                       if (ecmd.autoneg)
+                               desc->curr |= OFPPF_AUTONEG;
                }
        }
 #endif
-       desc->features = htonl(desc->features);
+       desc->curr = htonl(desc->curr);
+       desc->supported = htonl(desc->supported);
+       desc->advertised = htonl(desc->advertised);
+       desc->peer = htonl(desc->peer);
 }
 
 static int 
@@ -748,7 +844,7 @@ dp_send_features_reply(struct datapath *dp, const struct sender *sender)
        int port_count;
 
        /* Overallocate. */
-       port_max_len = sizeof(struct ofp_phy_port) * OFPP_MAX;
+       port_max_len = sizeof(struct ofp_phy_port) * DP_MAX_PORTS;
        ofr = alloc_openflow_skb(dp, sizeof(*ofr) + port_max_len,
                                 OFPT_FEATURES_REPLY, sender, &skb);
        if (!ofr)
@@ -815,7 +911,7 @@ down_port_cb(struct work_struct *work)
                if (net_ratelimit())
                        printk("problem bringing up port %s\n", p->dev->name);
        rtnl_unlock();
-       p->status |= OFPPFL_PORT_DOWN;
+       p->config |= OFPPC_PORT_DOWN;
 }
 
 /* Callback function for a workqueue to enable an interface */
@@ -830,42 +926,41 @@ up_port_cb(struct work_struct *work)
                if (net_ratelimit())
                        printk("problem bringing down port %s\n", p->dev->name);
        rtnl_unlock();
-       p->status &= ~OFPPFL_PORT_DOWN;
+       p->config &= ~OFPPC_PORT_DOWN;
 }
 
 int
 dp_update_port_flags(struct datapath *dp, const struct ofp_port_mod *opm)
 {
        unsigned long int flags;
-       const struct ofp_phy_port *opp = &opm->desc;
-       int port_no = ntohs(opp->port_no);
-       struct net_bridge_port *p = (port_no < OFPP_MAX ? dp->ports[port_no]
-                                    : port_no == OFPP_LOCAL ? dp->local_port
-                                    : NULL);
-       uint32_t flag_mask;
+       int port_no = ntohs(opm->port_no);
+       struct net_bridge_port *p;
+       p = (port_no < DP_MAX_PORTS ? dp->ports[port_no]
+            : port_no == OFPP_LOCAL ? dp->local_port
+            : NULL);
 
        /* Make sure the port id hasn't changed since this was sent */
-       if (!p || memcmp(opp->hw_addr, p->dev->dev_addr, ETH_ALEN))
+       if (!p || memcmp(opm->hw_addr, p->dev->dev_addr, ETH_ALEN))
                return -1;
 
        spin_lock_irqsave(&p->lock, flags);
-       flag_mask = ntohl(opm->mask) & PORT_FLAG_BITS;
-       if (flag_mask) {
-               p->flags &= ~flag_mask;
-               p->flags |= ntohl(opp->flags) & flag_mask;
+       if (opm->mask) {
+               uint32_t config_mask = ntohl(opm->mask);
+               p->config &= ~config_mask;
+               p->config |= ntohl(opm->config) & config_mask;
        }
 
        /* Modifying the status of an interface requires taking a lock
         * that cannot be done from here.  For this reason, we use a shared 
         * workqueue, which will cause it to be executed from a safer 
         * context. */
-       if (opm->mask & htonl(OFPPFL_PORT_DOWN)) {
-               if ((opp->flags & htonl(OFPPFL_PORT_DOWN))
-                   && (p->status & OFPPFL_PORT_DOWN) == 0) {
+       if (opm->mask & htonl(OFPPC_PORT_DOWN)) {
+               if ((opm->config & htonl(OFPPC_PORT_DOWN))
+                   && (p->config & OFPPC_PORT_DOWN) == 0) {
                        PREPARE_WORK(&p->port_task, down_port_cb);
                        schedule_work(&p->port_task);
-               } else if ((opp->flags & htonl(OFPPFL_PORT_DOWN)) == 0
-                          && (p->status & OFPPFL_PORT_DOWN)) {
+               } else if ((opm->config & htonl(OFPPC_PORT_DOWN)) == 0
+                          && (p->config & OFPPC_PORT_DOWN)) {
                        PREPARE_WORK(&p->port_task, up_port_cb);
                        schedule_work(&p->port_task);
                }
@@ -884,14 +979,14 @@ init_port_status(struct net_bridge_port *p)
        spin_lock_irqsave(&p->lock, flags);
 
        if (p->dev->flags & IFF_UP) 
-               p->status &= ~OFPPFL_PORT_DOWN;
+               p->config &= ~OFPPC_PORT_DOWN;
        else
-               p->status |= OFPPFL_PORT_DOWN;
+               p->config |= OFPPC_PORT_DOWN;
 
        if (netif_carrier_ok(p->dev))
-               p->status &= ~OFPPFL_LINK_DOWN;
+               p->state &= ~OFPPS_LINK_DOWN;
        else
-               p->status |= OFPPFL_LINK_DOWN;
+               p->state |= OFPPS_LINK_DOWN;
 
        spin_unlock_irqrestore(&p->lock, flags);
 }
@@ -1249,11 +1344,9 @@ static int flow_stats_dump_callback(struct sw_flow *flow, void *private)
        struct sw_flow_actions *sf_acts = rcu_dereference(flow->sf_acts);
        struct flow_stats_state *s = private;
        struct ofp_flow_stats *ofs;
-       int actions_length;
        int length;
 
-       actions_length = sizeof *ofs->actions * sf_acts->n_actions;
-       length = sizeof *ofs + actions_length;
+       length = sizeof *ofs + sf_acts->actions_len;
        if (length + s->bytes_used > s->bytes_allocated)
                return 1;
 
@@ -1280,7 +1373,7 @@ static int flow_stats_dump_callback(struct sw_flow *flow, void *private)
        memset(ofs->pad2, 0, sizeof ofs->pad2);
        ofs->packet_count    = cpu_to_be64(flow->packet_count);
        ofs->byte_count      = cpu_to_be64(flow->byte_count);
-       memcpy(ofs->actions, sf_acts->actions, actions_length);
+       memcpy(ofs->actions, sf_acts->actions, sf_acts->actions_len);
 
        s->bytes_used += length;
        return 0;
@@ -1436,7 +1529,7 @@ static int port_stats_dump(struct datapath *dp, void *state,
        ops = body;
 
        n_ports = 0;
-       for (i = s->port; i < OFPP_MAX && n_ports < max_ports; i++) {
+       for (i = s->port; i < DP_MAX_PORTS && n_ports < max_ports; i++) {
                struct net_bridge_port *p = dp->ports[i];
                struct net_device_stats *stats;
                if (!p)