vswitchd: Make the MAC entry aging time configurable.
[sliver-openvswitch.git] / datapath / datapath.c
index dee1b0f..e8ff4a5 100644 (file)
@@ -51,6 +51,7 @@
 #include "flow.h"
 #include "loop_counter.h"
 #include "table.h"
+#include "vlan.h"
 #include "vport-internal_dev.h"
 
 int (*dp_ioctl_hook)(struct net_device *dev, struct ifreq *rq, int cmd);
@@ -79,6 +80,8 @@ EXPORT_SYMBOL(dp_ioctl_hook);
 static LIST_HEAD(dps);
 
 static struct vport *new_vport(const struct vport_parms *);
+static int queue_control_packets(struct datapath *, struct sk_buff *,
+                                const struct dp_upcall_info *);
 
 /* Must be called with rcu_read_lock, genl_mutex, or RTNL lock. */
 struct datapath *get_dp(int dp_ifindex)
@@ -356,7 +359,6 @@ static void copy_and_csum_skb(struct sk_buff *skb, void *to)
 
        get_skb_csum_pointers(skb, &csum_start, &csum_offset);
        csum_start -= skb_headroom(skb);
-       BUG_ON(csum_start >= skb_headlen(skb));
 
        skb_copy_bits(skb, 0, to, csum_start);
 
@@ -365,13 +367,98 @@ static void copy_and_csum_skb(struct sk_buff *skb, void *to)
        *(__sum16 *)(to + csum_start + csum_offset) = csum_fold(csum);
 }
 
-static struct genl_family dp_packet_genl_family;
+static struct genl_family dp_packet_genl_family = {
+       .id = GENL_ID_GENERATE,
+       .hdrsize = sizeof(struct odp_header),
+       .name = ODP_PACKET_FAMILY,
+       .version = 1,
+       .maxattr = ODP_PACKET_ATTR_MAX
+};
+
+/* Generic Netlink multicast groups for upcalls.
+ *
+ * We really want three unique multicast groups per datapath, but we can't even
+ * get one, because genl_register_mc_group() takes genl_lock, which is also
+ * held during Generic Netlink message processing, so trying to acquire
+ * multicast groups during ODP_DP_NEW processing deadlocks.  Instead, we
+ * preallocate a few groups and use them round-robin for datapaths.  Collision
+ * isn't fatal--multicast listeners should check that the family is the one
+ * that they want and discard others--but it wastes time and memory to receive
+ * unwanted messages.
+ */
 #define PACKET_N_MC_GROUPS 16
+static struct genl_multicast_group packet_mc_groups[PACKET_N_MC_GROUPS];
 
-static int packet_mc_group(struct datapath *dp, u8 cmd)
+static u32 packet_mc_group(struct datapath *dp, u8 cmd)
 {
+       u32 idx;
        BUILD_BUG_ON_NOT_POWER_OF_2(PACKET_N_MC_GROUPS);
-       return jhash_2words(dp->dp_ifindex, cmd, 0) & (PACKET_N_MC_GROUPS - 1);
+
+       idx = jhash_2words(dp->dp_ifindex, cmd, 0) & (PACKET_N_MC_GROUPS - 1);
+       return packet_mc_groups[idx].id;
+}
+
+static int packet_register_mc_groups(void)
+{
+       int i;
+
+       for (i = 0; i < PACKET_N_MC_GROUPS; i++) {
+               struct genl_multicast_group *group = &packet_mc_groups[i];
+               int error;
+
+               sprintf(group->name, "packet%d", i);
+               error = genl_register_mc_group(&dp_packet_genl_family, group);
+               if (error)
+                       return error;
+       }
+       return 0;
+}
+
+int dp_upcall(struct datapath *dp, struct sk_buff *skb, const struct dp_upcall_info *upcall_info)
+{
+       struct dp_stats_percpu *stats;
+       int err;
+
+       WARN_ON_ONCE(skb_shared(skb));
+
+       forward_ip_summed(skb);
+
+       err = vswitch_skb_checksum_setup(skb);
+       if (err)
+               goto err_kfree_skb;
+
+       /* Break apart GSO packets into their component pieces.  Otherwise
+        * userspace may try to stuff a 64kB packet into a 1500-byte MTU. */
+       if (skb_is_gso(skb)) {
+               struct sk_buff *nskb = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
+               
+               kfree_skb(skb);
+               skb = nskb;
+               if (IS_ERR(skb)) {
+                       err = PTR_ERR(skb);
+                       goto err;
+               }
+       }
+
+       err = queue_control_packets(dp, skb, upcall_info);
+       if (err)
+               goto err;
+
+       return 0;
+
+err_kfree_skb:
+       kfree_skb(skb);
+err:
+       local_bh_disable();
+       stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());
+
+       write_seqcount_begin(&stats->seqlock);
+       stats->n_lost++;
+       write_seqcount_end(&stats->seqlock);
+
+       local_bh_enable();
+
+       return err;
 }
 
 /* Send each packet in the 'skb' list to userspace for 'dp' as directed by
@@ -400,8 +487,14 @@ static int queue_control_packets(struct datapath *dp, struct sk_buff *skb,
                nskb = skb->next;
                skb->next = NULL;
 
+               err = vlan_deaccel_tag(skb);
+               if (unlikely(err))
+                       goto err_kfree_skbs;
+
+               if (nla_attr_size(skb->len) > USHRT_MAX)
+                       goto err_kfree_skbs;
+
                len = sizeof(struct odp_header);
-               len += nla_total_size(4); /* ODP_PACKET_ATTR_TYPE. */
                len += nla_total_size(skb->len);
                len += nla_total_size(FLOW_BUFSIZE);
                if (upcall_info->userdata)
@@ -461,86 +554,6 @@ err_kfree_skbs:
        return err;
 }
 
-/* Generic Netlink multicast groups for upcalls.
- *
- * We really want three unique multicast groups per datapath, but we can't even
- * get one, because genl_register_mc_group() takes genl_lock, which is also
- * held during Generic Netlink message processing, so trying to acquire
- * multicast groups during ODP_DP_NEW processing deadlocks.  Instead, we
- * preallocate a few groups and use them round-robin for datapaths.  Collision
- * isn't fatal--multicast listeners should check that the family is the one
- * that they want and discard others--but it wastes time and memory to receive
- * unwanted messages.
- */
-static struct genl_multicast_group packet_mc_groups[PACKET_N_MC_GROUPS];
-
-static struct genl_family dp_packet_genl_family = {
-       .id = GENL_ID_GENERATE,
-       .hdrsize = sizeof(struct odp_header),
-       .name = ODP_PACKET_FAMILY,
-       .version = 1,
-       .maxattr = ODP_PACKET_ATTR_MAX
-};
-
-static int packet_register_mc_groups(void)
-{
-       int i;
-
-       for (i = 0; i < PACKET_N_MC_GROUPS; i++) {
-               struct genl_multicast_group *group = &packet_mc_groups[i];
-               int error;
-
-               sprintf(group->name, "packet%d", i);
-               error = genl_register_mc_group(&dp_packet_genl_family, group);
-               if (error)
-                       return error;
-       }
-       return 0;
-}
-
-int dp_upcall(struct datapath *dp, struct sk_buff *skb, const struct dp_upcall_info *upcall_info)
-{
-       struct dp_stats_percpu *stats;
-       int err;
-
-       WARN_ON_ONCE(skb_shared(skb));
-
-       forward_ip_summed(skb);
-
-       err = vswitch_skb_checksum_setup(skb);
-       if (err)
-               goto err_kfree_skb;
-
-       /* Break apart GSO packets into their component pieces.  Otherwise
-        * userspace may try to stuff a 64kB packet into a 1500-byte MTU. */
-       if (skb_is_gso(skb)) {
-               struct sk_buff *nskb = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
-               
-               kfree_skb(skb);
-               skb = nskb;
-               if (IS_ERR(skb)) {
-                       err = PTR_ERR(skb);
-                       goto err;
-               }
-       }
-
-       return queue_control_packets(dp, skb, upcall_info);
-
-err_kfree_skb:
-       kfree_skb(skb);
-err:
-       local_bh_disable();
-       stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());
-
-       write_seqcount_begin(&stats->seqlock);
-       stats->n_lost++;
-       write_seqcount_end(&stats->seqlock);
-
-       local_bh_enable();
-
-       return err;
-}
-
 /* Called with genl_mutex. */
 static int flush_flows(int dp_ifindex)
 {
@@ -570,58 +583,58 @@ static int validate_actions(const struct nlattr *attr)
        int rem;
 
        nla_for_each_nested(a, attr, rem) {
-               static const u32 action_lens[ODPAT_MAX + 1] = {
-                       [ODPAT_OUTPUT] = 4,
-                       [ODPAT_CONTROLLER] = 8,
-                       [ODPAT_SET_DL_TCI] = 2,
-                       [ODPAT_STRIP_VLAN] = 0,
-                       [ODPAT_SET_DL_SRC] = ETH_ALEN,
-                       [ODPAT_SET_DL_DST] = ETH_ALEN,
-                       [ODPAT_SET_NW_SRC] = 4,
-                       [ODPAT_SET_NW_DST] = 4,
-                       [ODPAT_SET_NW_TOS] = 1,
-                       [ODPAT_SET_TP_SRC] = 2,
-                       [ODPAT_SET_TP_DST] = 2,
-                       [ODPAT_SET_TUNNEL] = 8,
-                       [ODPAT_SET_PRIORITY] = 4,
-                       [ODPAT_POP_PRIORITY] = 0,
-                       [ODPAT_DROP_SPOOFED_ARP] = 0,
+               static const u32 action_lens[ODP_ACTION_ATTR_MAX + 1] = {
+                       [ODP_ACTION_ATTR_OUTPUT] = 4,
+                       [ODP_ACTION_ATTR_CONTROLLER] = 8,
+                       [ODP_ACTION_ATTR_SET_DL_TCI] = 2,
+                       [ODP_ACTION_ATTR_STRIP_VLAN] = 0,
+                       [ODP_ACTION_ATTR_SET_DL_SRC] = ETH_ALEN,
+                       [ODP_ACTION_ATTR_SET_DL_DST] = ETH_ALEN,
+                       [ODP_ACTION_ATTR_SET_NW_SRC] = 4,
+                       [ODP_ACTION_ATTR_SET_NW_DST] = 4,
+                       [ODP_ACTION_ATTR_SET_NW_TOS] = 1,
+                       [ODP_ACTION_ATTR_SET_TP_SRC] = 2,
+                       [ODP_ACTION_ATTR_SET_TP_DST] = 2,
+                       [ODP_ACTION_ATTR_SET_TUNNEL] = 8,
+                       [ODP_ACTION_ATTR_SET_PRIORITY] = 4,
+                       [ODP_ACTION_ATTR_POP_PRIORITY] = 0,
+                       [ODP_ACTION_ATTR_DROP_SPOOFED_ARP] = 0,
                };
                int type = nla_type(a);
 
-               if (type > ODPAT_MAX || nla_len(a) != action_lens[type])
+               if (type > ODP_ACTION_ATTR_MAX || nla_len(a) != action_lens[type])
                        return -EINVAL;
 
                switch (type) {
-               case ODPAT_UNSPEC:
+               case ODP_ACTION_ATTR_UNSPEC:
                        return -EINVAL;
 
-               case ODPAT_CONTROLLER:
-               case ODPAT_STRIP_VLAN:
-               case ODPAT_SET_DL_SRC:
-               case ODPAT_SET_DL_DST:
-               case ODPAT_SET_NW_SRC:
-               case ODPAT_SET_NW_DST:
-               case ODPAT_SET_TP_SRC:
-               case ODPAT_SET_TP_DST:
-               case ODPAT_SET_TUNNEL:
-               case ODPAT_SET_PRIORITY:
-               case ODPAT_POP_PRIORITY:
-               case ODPAT_DROP_SPOOFED_ARP:
+               case ODP_ACTION_ATTR_CONTROLLER:
+               case ODP_ACTION_ATTR_STRIP_VLAN:
+               case ODP_ACTION_ATTR_SET_DL_SRC:
+               case ODP_ACTION_ATTR_SET_DL_DST:
+               case ODP_ACTION_ATTR_SET_NW_SRC:
+               case ODP_ACTION_ATTR_SET_NW_DST:
+               case ODP_ACTION_ATTR_SET_TP_SRC:
+               case ODP_ACTION_ATTR_SET_TP_DST:
+               case ODP_ACTION_ATTR_SET_TUNNEL:
+               case ODP_ACTION_ATTR_SET_PRIORITY:
+               case ODP_ACTION_ATTR_POP_PRIORITY:
+               case ODP_ACTION_ATTR_DROP_SPOOFED_ARP:
                        /* No validation needed. */
                        break;
 
-               case ODPAT_OUTPUT:
+               case ODP_ACTION_ATTR_OUTPUT:
                        if (nla_get_u32(a) >= DP_MAX_PORTS)
                                return -EINVAL;
                        break;
 
-               case ODPAT_SET_DL_TCI:
+               case ODP_ACTION_ATTR_SET_DL_TCI:
                        if (nla_get_be16(a) & htons(VLAN_CFI_MASK))
                                return -EINVAL;
                        break;
 
-               case ODPAT_SET_NW_TOS:
+               case ODP_ACTION_ATTR_SET_NW_TOS:
                        if (nla_get_u8(a) & INET_ECN_MASK)
                                return -EINVAL;
                        break;
@@ -674,16 +687,16 @@ static int odp_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
        err = -EINVAL;
        if (!a[ODP_PACKET_ATTR_PACKET] || !a[ODP_PACKET_ATTR_ACTIONS] ||
            nla_len(a[ODP_PACKET_ATTR_PACKET]) < ETH_HLEN)
-               goto exit;
+               goto err;
 
        err = validate_actions(a[ODP_PACKET_ATTR_ACTIONS]);
        if (err)
-               goto exit;
+               goto err;
 
        packet = skb_clone(skb, GFP_KERNEL);
        err = -ENOMEM;
        if (!packet)
-               goto exit;
+               goto err;
        packet->data = nla_data(a[ODP_PACKET_ATTR_PACKET]);
        packet->len = nla_len(a[ODP_PACKET_ATTR_PACKET]);
 
@@ -698,20 +711,29 @@ static int odp_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
        else
                packet->protocol = htons(ETH_P_802_2);
 
+       /* Initialize OVS_CB (it came from Netlink so might not be zeroed). */
+       memset(OVS_CB(packet), 0, sizeof(struct ovs_skb_cb));
+
        err = flow_extract(packet, -1, &key, &is_frag);
        if (err)
-               goto exit;
+               goto err_kfree_skb;
 
        rcu_read_lock();
        dp = get_dp(odp_header->dp_ifindex);
        err = -ENODEV;
-       if (dp)
-               err = execute_actions(dp, packet, &key,
-                                     nla_data(a[ODP_PACKET_ATTR_ACTIONS]),
-                                     nla_len(a[ODP_PACKET_ATTR_ACTIONS]));
+       if (!dp)
+               goto err_unlock;
+       err = execute_actions(dp, packet, &key,
+                             nla_data(a[ODP_PACKET_ATTR_ACTIONS]),
+                             nla_len(a[ODP_PACKET_ATTR_ACTIONS]));
        rcu_read_unlock();
+       return err;
 
-exit:
+err_unlock:
+       rcu_read_unlock();
+err_kfree_skb:
+       kfree_skb(packet);
+err:
        return err;
 }
 
@@ -771,6 +793,8 @@ int dp_min_mtu(const struct datapath *dp)
                        continue;
 
                dev_mtu = vport_get_mtu(p);
+               if (!dev_mtu)
+                       continue;
                if (!mtu || dev_mtu < mtu)
                        mtu = dev_mtu;
        }
@@ -825,7 +849,6 @@ static int odp_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
        struct nlattr *nla;
        unsigned long used;
        u8 tcp_flags;
-       int nla_len;
        int err;
 
        sf_acts = rcu_dereference_protected(flow->sf_acts,
@@ -853,7 +876,7 @@ static int odp_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
        spin_unlock_bh(&flow->lock);
 
        if (used)
-               NLA_PUT_MSECS(skb, ODP_FLOW_ATTR_USED, used);
+               NLA_PUT_U64(skb, ODP_FLOW_ATTR_USED, flow_used_time(used));
 
        if (stats.n_packets)
                NLA_PUT(skb, ODP_FLOW_ATTR_STATS, sizeof(struct odp_flow_stats), &stats);
@@ -861,23 +884,20 @@ static int odp_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
        if (tcp_flags)
                NLA_PUT_U8(skb, ODP_FLOW_ATTR_TCP_FLAGS, tcp_flags);
 
-       /* If ODP_FLOW_ATTR_ACTIONS doesn't fit, and this is the first flow to
-        * be dumped into 'skb', then expand the skb.  This is unusual for
-        * Netlink but individual action lists can be longer than a page and
-        * thus entirely undumpable if we didn't do this. */
-       nla_len = nla_total_size(sf_acts->actions_len);
-       if (nla_len > skb_tailroom(skb) && !skb_orig_len) {
-               int hdr_off = (unsigned char *)odp_header - skb->data;
-
-               err = pskb_expand_head(skb, 0, nla_len - skb_tailroom(skb), GFP_KERNEL);
-               if (err)
-                       goto error;
-
-               odp_header = (struct odp_header *)(skb->data + hdr_off);
-       }
-       nla = nla_nest_start(skb, ODP_FLOW_ATTR_ACTIONS);
-       memcpy(__skb_put(skb, sf_acts->actions_len), sf_acts->actions, sf_acts->actions_len);
-       nla_nest_end(skb, nla);
+       /* If ODP_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
+        * this is the first flow to be dumped into 'skb'.  This is unusual for
+        * Netlink but individual action lists can be longer than
+        * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this.
+        * The userspace caller can always fetch the actions separately if it
+        * really wants them.  (Most userspace callers in fact don't care.)
+        *
+        * This can only fail for dump operations because the skb is always
+        * properly sized for single flows.
+        */
+       err = nla_put(skb, ODP_FLOW_ATTR_ACTIONS, sf_acts->actions_len,
+                     sf_acts->actions);
+       if (err < 0 && skb_orig_len)
+               goto error;
 
        return genlmsg_end(skb, odp_header);
 
@@ -1290,7 +1310,7 @@ static int odp_dp_cmd_validate(struct nlattr *a[ODP_DP_ATTR_MAX + 1])
                        return -EINVAL;
        }
 
-       return VERIFY_NUL_STRING(a[ODP_DP_ATTR_NAME], IFNAMSIZ - 1);
+       return CHECK_NUL_STRING(a[ODP_DP_ATTR_NAME], IFNAMSIZ - 1);
 }
 
 /* Called with genl_mutex and optionally with RTNL lock also. */
@@ -1442,12 +1462,20 @@ static int odp_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
        list_del(&dp->list_node);
        dp_detach_port(get_vport_protected(dp, ODPP_LOCAL));
 
+       /* rtnl_unlock() will wait until all the references to devices that
+        * are pending unregistration have been dropped.  We do it here to
+        * ensure that any internal devices (which contain DP pointers) are
+        * fully destroyed before freeing the datapath.
+        */
+       rtnl_unlock();
+
        call_rcu(&dp->rcu, destroy_dp_rcu);
        module_put(THIS_MODULE);
 
        genl_notify(reply, genl_info_net(info), info->snd_pid,
                    dp_datapath_multicast_group.id, info->nlhdr, GFP_KERNEL);
-       err = 0;
+
+       return 0;
 
 exit_unlock:
        rtnl_unlock();
@@ -1584,6 +1612,7 @@ static int odp_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
        struct odp_header *odp_header;
        struct nlattr *nla;
        int ifindex, iflink;
+       int mtu;
        int err;
 
        odp_header = genlmsg_put(skb, pid, seq, &dp_vport_genl_family,
@@ -1605,7 +1634,9 @@ static int odp_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
 
        NLA_PUT(skb, ODP_VPORT_ATTR_ADDRESS, ETH_ALEN, vport_get_addr(vport));
 
-       NLA_PUT_U32(skb, ODP_VPORT_ATTR_MTU, vport_get_mtu(vport));
+       mtu = vport_get_mtu(vport);
+       if (mtu)
+               NLA_PUT_U32(skb, ODP_VPORT_ATTR_MTU, mtu);
 
        err = vport_get_options(vport, skb);
        if (err == -EMSGSIZE)
@@ -1649,7 +1680,7 @@ static struct sk_buff *odp_vport_cmd_build_info(struct vport *vport, u32 pid,
 
 static int odp_vport_cmd_validate(struct nlattr *a[ODP_VPORT_ATTR_MAX + 1])
 {
-       return VERIFY_NUL_STRING(a[ODP_VPORT_ATTR_NAME], IFNAMSIZ - 1);
+       return CHECK_NUL_STRING(a[ODP_VPORT_ATTR_NAME], IFNAMSIZ - 1);
 }
 
 /* Called with RTNL lock or RCU read lock. */