datapath: Always allow tunnel mask to be specified in the netlink
[sliver-openvswitch.git] / datapath / flow.c
index fb4fc21..29d3062 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2011 Nicira Networks.
+ * Copyright (c) 2007-2013 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
 
 static struct kmem_cache *flow_cache;
 
+static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask,
+               struct sw_flow_key_range *range, u8 val);
+/*
+ * Record that bytes [offset, offset + size) of struct sw_flow_key have been
+ * written, widening the range tracked for the key (is_mask == false) or for
+ * the mask (is_mask == true).  Does nothing when a mask range is requested
+ * but 'match' carries no mask.
+ */
+static void update_range__(struct sw_flow_match *match,
+                         size_t offset, size_t size, bool is_mask)
+{
+       struct sw_flow_key_range *range = NULL;
+       size_t start = offset;
+       size_t end = offset + size;
+
+       if (!is_mask)
+               range = &match->range;
+       else if (match->mask)
+               range = &match->mask->range;
+
+       if (!range)
+               return;
+
+       if (range->start == range->end) { /* start == end: nothing recorded yet */
+               range->start = start;
+               range->end = end;
+               return;
+       }
+
+       if (range->start > start) /* otherwise only ever grow the range */
+               range->start = start;
+
+       if (range->end < end)
+               range->end = end;
+}
+/*
+ * Assign 'value' to 'field' of the match key (is_mask false) or of the mask's
+ * key (is_mask true, and only when the match has a mask).  The field's offset
+ * and size are first reported to update_range__() so the covered range grows
+ * to include it.  'match' and 'is_mask' are evaluated more than once.
+ */
+#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \
+       do { \
+               update_range__(match, offsetof(struct sw_flow_key, field),  \
+                                    sizeof((match)->key->field), is_mask); \
+               if (is_mask) {                                              \
+                       if ((match)->mask)                                  \
+                               (match)->mask->key.field = value;           \
+               } else {                                                    \
+                       (match)->key->field = value;                        \
+               }                                                           \
+       } while (0)
+/*
+ * Copy 'len' bytes from 'value_p' into 'field' of the match key (is_mask
+ * false) or of the mask's key (is_mask true, and only when the match has a
+ * mask).  The range passed to update_range__() uses 'len' rather than the
+ * field's own size.  'match', 'len' and 'is_mask' are evaluated more than
+ * once.
+ */
+#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \
+       do { \
+               update_range__(match, offsetof(struct sw_flow_key, field),  \
+                               len, is_mask);                              \
+               if (is_mask) {                                              \
+                       if ((match)->mask)                                  \
+                               memcpy(&(match)->mask->key.field, value_p, len);\
+               } else {                                                    \
+                       memcpy(&(match)->key->field, value_p, len);         \
+               }                                                           \
+       } while (0)
+/*
+ * Prepare 'match' for parsing: zero the match itself, attach 'key' and the
+ * optional 'mask', and clear the key.  'key' is cleared unconditionally, so
+ * it must point to valid storage.  When 'mask' is given, its key is zeroed
+ * and its range is reset to empty (start == end == 0).
+ */
+void ovs_match_init(struct sw_flow_match *match,
+                   struct sw_flow_key *key,
+                   struct sw_flow_mask *mask)
+{
+       memset(match, 0, sizeof(*match)); /* also leaves match->range empty */
+       match->key = key;
+       match->mask = mask;
+
+       memset(key, 0, sizeof(*key));
+
+       if (mask) {
+               memset(&mask->key, 0, sizeof(mask->key));
+               mask->range.start = mask->range.end = 0;
+       }
+}
+/*
+ * Check the attribute bitmaps collected for a flow against the values held
+ * in the match's key and mask.
+ *
+ * key_attrs must contain every attribute in key_expected, which is built
+ * from the key's EtherType, IP protocol and (for ICMPv6) the type stored in
+ * ipv6.tp.src.  mask_attrs may contain only attributes in mask_allowed:
+ * that set starts as key_attrs minus the protocol-specific attributes, the
+ * tunnel attribute is always added, and each protocol-specific attribute is
+ * added back only when the field selecting that protocol is masked with all
+ * ones.
+ *
+ * Returns true when both checks pass; otherwise reports the failing bitmaps
+ * through OVS_NLERR and returns false.
+ */
+static bool ovs_match_validate(const struct sw_flow_match *match,
+               u64 key_attrs, u64 mask_attrs)
+{
+       u64 key_expected = 1ULL << OVS_KEY_ATTR_ETHERNET;
+       u64 mask_allowed = key_attrs;  /* At most allow all key attributes */
+
+       /* The following mask attributes are allowed only if they
+        * pass the validation tests. */
+       mask_allowed &= ~((1ULL << OVS_KEY_ATTR_IPV4)
+                       | (1ULL << OVS_KEY_ATTR_IPV6)
+                       | (1ULL << OVS_KEY_ATTR_TCP)
+                       | (1ULL << OVS_KEY_ATTR_UDP)
+                       | (1ULL << OVS_KEY_ATTR_ICMP)
+                       | (1ULL << OVS_KEY_ATTR_ICMPV6)
+                       | (1ULL << OVS_KEY_ATTR_ARP)
+                       | (1ULL << OVS_KEY_ATTR_ND));
+
+       /* Tunnel mask is always allowed. */
+       mask_allowed |= (1ULL << OVS_KEY_ATTR_TUNNEL);
+
+       if (match->key->phy.in_port == DP_MAX_PORTS &&
+           match->mask && (match->mask->key.phy.in_port == 0xffff))
+               mask_allowed |= (1ULL << OVS_KEY_ATTR_IN_PORT);
+
+       if (match->key->eth.type == htons(ETH_P_802_2) &&
+           match->mask && (match->mask->key.eth.type == htons(0xffff)))
+               mask_allowed |= (1ULL << OVS_KEY_ATTR_ETHERTYPE);
+
+       /* Check key attributes. */
+       if (match->key->eth.type == htons(ETH_P_ARP)
+                       || match->key->eth.type == htons(ETH_P_RARP)) {
+               key_expected |= 1ULL << OVS_KEY_ATTR_ARP;
+               if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+                       mask_allowed |= 1ULL << OVS_KEY_ATTR_ARP;
+       }
+
+       if (match->key->eth.type == htons(ETH_P_IP)) {
+               key_expected |= 1ULL << OVS_KEY_ATTR_IPV4;
+               if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+                       mask_allowed |= 1ULL << OVS_KEY_ATTR_IPV4;
+
+               if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { /* L4 checks skipped for later fragments */
+                       if (match->key->ip.proto == IPPROTO_UDP) {
+                               key_expected |= 1ULL << OVS_KEY_ATTR_UDP;
+                               if (match->mask && (match->mask->key.ip.proto == 0xff))
+                                       mask_allowed |= 1ULL << OVS_KEY_ATTR_UDP;
+                       }
+
+                       if (match->key->ip.proto == IPPROTO_TCP) {
+                               key_expected |= 1ULL << OVS_KEY_ATTR_TCP;
+                               if (match->mask && (match->mask->key.ip.proto == 0xff))
+                                       mask_allowed |= 1ULL << OVS_KEY_ATTR_TCP;
+                       }
+
+                       if (match->key->ip.proto == IPPROTO_ICMP) {
+                               key_expected |= 1ULL << OVS_KEY_ATTR_ICMP;
+                               if (match->mask && (match->mask->key.ip.proto == 0xff))
+                                       mask_allowed |= 1ULL << OVS_KEY_ATTR_ICMP;
+                       }
+               }
+       }
+
+       if (match->key->eth.type == htons(ETH_P_IPV6)) {
+               key_expected |= 1ULL << OVS_KEY_ATTR_IPV6;
+               if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+                       mask_allowed |= 1ULL << OVS_KEY_ATTR_IPV6;
+
+               if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { /* L4 checks skipped for later fragments */
+                       if (match->key->ip.proto == IPPROTO_UDP) {
+                               key_expected |= 1ULL << OVS_KEY_ATTR_UDP;
+                               if (match->mask && (match->mask->key.ip.proto == 0xff))
+                                       mask_allowed |= 1ULL << OVS_KEY_ATTR_UDP;
+                       }
+
+                       if (match->key->ip.proto == IPPROTO_TCP) {
+                               key_expected |= 1ULL << OVS_KEY_ATTR_TCP;
+                               if (match->mask && (match->mask->key.ip.proto == 0xff))
+                                       mask_allowed |= 1ULL << OVS_KEY_ATTR_TCP;
+                       }
+
+                       if (match->key->ip.proto == IPPROTO_ICMPV6) {
+                               key_expected |= 1ULL << OVS_KEY_ATTR_ICMPV6;
+                               if (match->mask && (match->mask->key.ip.proto == 0xff))
+                                       mask_allowed |= 1ULL << OVS_KEY_ATTR_ICMPV6;
+
+                               if (match->key->ipv6.tp.src ==
+                                               htons(NDISC_NEIGHBOUR_SOLICITATION) ||
+                                   match->key->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
+                                       key_expected |= 1ULL << OVS_KEY_ATTR_ND;
+                                       if (match->mask && (match->mask->key.ipv6.tp.src == htons(0xffff)))
+                                               mask_allowed |= 1ULL << OVS_KEY_ATTR_ND;
+                               }
+                       }
+               }
+       }
+
+       if ((key_attrs & key_expected) != key_expected) {
+               /* Key attributes check failed. */
+               OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n",
+                               key_attrs, key_expected);
+               return false;
+       }
+
+       if ((mask_attrs & mask_allowed) != mask_attrs) {
+               /* Mask attributes check failed. */
+               OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n",
+                               mask_attrs, mask_allowed);
+               return false;
+       }
+
+       return true;
+}
+
 static int check_header(struct sk_buff *skb, int len)
 {
        if (unlikely(skb->len < len))
@@ -122,12 +306,7 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies)
        return cur_ms - idle_ms;
 }
 
-#define SW_FLOW_KEY_OFFSET(field)              \
-       (offsetof(struct sw_flow_key, field) +  \
-        FIELD_SIZEOF(struct sw_flow_key, field))
-
-static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key,
-                        int *key_lenp)
+static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
 {
        unsigned int nh_ofs = skb_network_offset(skb);
        unsigned int nh_len;
@@ -137,8 +316,6 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key,
        __be16 frag_off;
        int err;
 
-       *key_lenp = SW_FLOW_KEY_OFFSET(ipv6.label);
-
        err = check_header(skb, nh_ofs + sizeof(*nh));
        if (unlikely(err))
                return err;
@@ -177,6 +354,21 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
                                  sizeof(struct icmp6hdr));
 }
 
+void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src,
+                      const struct sw_flow_mask *mask)
+{      /* Build dst = src & mask->key, byte by byte, over the span that
+        * begins at mask->range.start; every other byte of 'dst' is left
+        * zero by the memset below. */
+       u8 *m = (u8 *)&mask->key + mask->range.start;
+       u8 *s = (u8 *)src + mask->range.start;
+       u8 *d = (u8 *)dst + mask->range.start;
+       int i;
+
+       memset(dst, 0, sizeof(*dst));
+       for (i = 0; i < ovs_sw_flow_mask_size_roundup(mask); i++) { /* span length is supplied by this helper */
+               *d = *s & *m;
+               d++, s++, m++;
+       }
+}
+
 #define TCP_FLAGS_OFFSET 13
 #define TCP_FLAG_MASK 0x3f
 
@@ -184,8 +376,10 @@ void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb)
 {
        u8 tcp_flags = 0;
 
-       if (flow->key.eth.type == htons(ETH_P_IP) &&
-           flow->key.ip.proto == IPPROTO_TCP) {
+       if ((flow->key.eth.type == htons(ETH_P_IP) ||
+            flow->key.eth.type == htons(ETH_P_IPV6)) &&
+           flow->key.ip.proto == IPPROTO_TCP &&
+           likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) {
                u8 *tcp = (u8 *)tcp_hdr(skb);
                tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK;
        }
@@ -198,20 +392,18 @@ void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb)
        spin_unlock(&flow->lock);
 }
 
-struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *actions)
+struct sw_flow_actions *ovs_flow_actions_alloc(int size)
 {
-       int actions_len = nla_len(actions);
        struct sw_flow_actions *sfa;
 
-       if (actions_len > MAX_ACTIONS_BUFSIZE)
+       if (size > MAX_ACTIONS_BUFSIZE)
                return ERR_PTR(-EINVAL);
 
-       sfa = kmalloc(sizeof(*sfa) + actions_len, GFP_KERNEL);
+       sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL);
        if (!sfa)
                return ERR_PTR(-ENOMEM);
 
-       sfa->actions_len = actions_len;
-       memcpy(sfa->actions, nla_data(actions), actions_len);
+       sfa->actions_len = 0;
        return sfa;
 }
 
@@ -224,9 +416,8 @@ struct sw_flow *ovs_flow_alloc(void)
                return ERR_PTR(-ENOMEM);
 
        spin_lock_init(&flow->lock);
-       atomic_set(&flow->refcnt, 1);
        flow->sf_acts = NULL;
-       flow->dead = false;
+       flow->mask = NULL;
 
        return flow;
 }
@@ -243,7 +434,7 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
        struct flex_array *buckets;
        int i, err;
 
-       buckets = flex_array_alloc(sizeof(struct hlist_head *),
+       buckets = flex_array_alloc(sizeof(struct hlist_head),
                                   n_buckets, GFP_KERNEL);
        if (!buckets)
                return NULL;
@@ -266,7 +457,7 @@ static void free_buckets(struct flex_array *buckets)
        flex_array_free(buckets);
 }
 
-struct flow_table *ovs_flow_tbl_alloc(int new_size)
+static struct flow_table *__flow_tbl_alloc(int new_size)
 {
        struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL);
 
@@ -284,63 +475,78 @@ struct flow_table *ovs_flow_tbl_alloc(int new_size)
        table->node_ver = 0;
        table->keep_flows = false;
        get_random_bytes(&table->hash_seed, sizeof(u32));
+       table->mask_list = NULL;
 
        return table;
 }
 
-static void flow_free(struct sw_flow *flow)
-{
-       flow->dead = true;
-       ovs_flow_put(flow);
-}
-
-void ovs_flow_tbl_destroy(struct flow_table *table)
+static void __flow_tbl_destroy(struct flow_table *table)
 {
        int i;
 
-       if (!table)
-               return;
-
        if (table->keep_flows)
                goto skip_flows;
 
        for (i = 0; i < table->n_buckets; i++) {
                struct sw_flow *flow;
                struct hlist_head *head = flex_array_get(table->buckets, i);
-               struct hlist_node *node, *n;
+               struct hlist_node *n;
                int ver = table->node_ver;
 
-               hlist_for_each_entry_safe(flow, node, n, head, hash_node[ver]) {
-                       hlist_del_rcu(&flow->hash_node[ver]);
-                       flow_free(flow);
+               hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) {
+                       hlist_del(&flow->hash_node[ver]);
+                       ovs_flow_free(flow, false);
                }
        }
 
+       BUG_ON(!list_empty(table->mask_list));
+       kfree(table->mask_list);
+
 skip_flows:
        free_buckets(table->buckets);
        kfree(table);
 }
 
+struct flow_table *ovs_flow_tbl_alloc(int new_size)
+{
+       struct flow_table *table = __flow_tbl_alloc(new_size);
+
+       if (!table)
+               return NULL;
+
+       table->mask_list = kmalloc(sizeof(struct list_head), GFP_KERNEL);
+       if (!table->mask_list) {
+               table->keep_flows = true;
+               __flow_tbl_destroy(table);
+               return NULL;
+       }
+       INIT_LIST_HEAD(table->mask_list);
+
+       return table;
+}
+
 static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
 {
        struct flow_table *table = container_of(rcu, struct flow_table, rcu);
 
-       ovs_flow_tbl_destroy(table);
+       __flow_tbl_destroy(table);
 }
 
-void ovs_flow_tbl_deferred_destroy(struct flow_table *table)
+void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred)
 {
        if (!table)
                return;
 
-       call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb);
+       if (deferred)
+               call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb);
+       else
+               __flow_tbl_destroy(table);
 }
 
-struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *last)
+struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *last)
 {
        struct sw_flow *flow;
        struct hlist_head *head;
-       struct hlist_node *n;
        int ver;
        int i;
 
@@ -348,7 +554,7 @@ struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *la
        while (*bucket < table->n_buckets) {
                i = 0;
                head = flex_array_get(table->buckets, *bucket);
-               hlist_for_each_entry_rcu(flow, n, head, hash_node[ver]) {
+               hlist_for_each_entry_rcu(flow, head, hash_node[ver]) {
                        if (i < *last) {
                                i++;
                                continue;
@@ -363,6 +569,16 @@ struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *la
        return NULL;
 }
 
+static void __tbl_insert(struct flow_table *table, struct sw_flow *flow)
+{      /* Link 'flow' at the head of the bucket that find_bucket() selects
+        * for flow->hash, using the hash_node slot of the table's current
+        * node_ver, and count it.  flow->hash must already be set. */
+       struct hlist_head *head;
+
+       head = find_bucket(table, flow->hash);
+       hlist_add_head_rcu(&flow->hash_node[table->node_ver], head);
+
+       table->count++;
+}
+
 static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new)
 {
        int old_ver;
@@ -375,13 +591,14 @@ static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new
        for (i = 0; i < old->n_buckets; i++) {
                struct sw_flow *flow;
                struct hlist_head *head;
-               struct hlist_node *n;
 
                head = flex_array_get(old->buckets, i);
 
-               hlist_for_each_entry(flow, n, head, hash_node[old_ver])
-                       ovs_flow_tbl_insert(new, flow);
+               hlist_for_each_entry(flow, head, hash_node[old_ver])
+                       __tbl_insert(new, flow);
        }
+
+       new->mask_list = old->mask_list;
        old->keep_flows = true;
 }
 
@@ -389,7 +606,7 @@ static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buck
 {
        struct flow_table *new_table;
 
-       new_table = ovs_flow_tbl_alloc(n_buckets);
+       new_table = __flow_tbl_alloc(n_buckets);
        if (!new_table)
                return ERR_PTR(-ENOMEM);
 
@@ -408,36 +625,30 @@ struct flow_table *ovs_flow_tbl_expand(struct flow_table *table)
        return __flow_tbl_rehash(table, table->n_buckets * 2);
 }
 
-/* RCU callback used by ovs_flow_deferred_free. */
-static void rcu_free_flow_callback(struct rcu_head *rcu)
+static void __flow_free(struct sw_flow *flow)
 {
-       struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
-
-       flow->dead = true;
-       ovs_flow_put(flow);
+       kfree((struct sf_flow_acts __force *)flow->sf_acts);
+       kmem_cache_free(flow_cache, flow);
 }
 
-/* Schedules 'flow' to be freed after the next RCU grace period.
- * The caller must hold rcu_read_lock for this to be sensible. */
-void ovs_flow_deferred_free(struct sw_flow *flow)
+static void rcu_free_flow_callback(struct rcu_head *rcu)
 {
-       call_rcu(&flow->rcu, rcu_free_flow_callback);
-}
+       struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
 
-void ovs_flow_hold(struct sw_flow *flow)
-{
-       atomic_inc(&flow->refcnt);
+       __flow_free(flow);
 }
 
-void ovs_flow_put(struct sw_flow *flow)
+void ovs_flow_free(struct sw_flow *flow, bool deferred)
 {
-       if (unlikely(!flow))
+       if (!flow)
                return;
 
-       if (atomic_dec_and_test(&flow->refcnt)) {
-               kfree((struct sf_flow_acts __force *)flow->sf_acts);
-               kmem_cache_free(flow_cache, flow);
-       }
+       ovs_sw_flow_mask_del_ref(flow->mask, deferred);
+
+       if (deferred)
+               call_rcu(&flow->rcu, rcu_free_flow_callback);
+       else
+               __flow_free(flow);
 }
 
 /* RCU callback used by ovs_flow_deferred_free_acts. */
@@ -492,7 +703,7 @@ static __be16 parse_ethertype(struct sk_buff *skb)
        proto = *(__be16 *) skb->data;
        __skb_pull(skb, sizeof(__be16));
 
-       if (ntohs(proto) >= 1536)
+       if (ntohs(proto) >= ETH_P_802_3_MIN)
                return proto;
 
        if (skb->len < sizeof(struct llc_snap_hdr))
@@ -508,22 +719,23 @@ static __be16 parse_ethertype(struct sk_buff *skb)
                return htons(ETH_P_802_2);
 
        __skb_pull(skb, sizeof(struct llc_snap_hdr));
-       return llc->ethertype;
+
+       if (ntohs(llc->ethertype) >= ETH_P_802_3_MIN)
+               return llc->ethertype;
+
+       return htons(ETH_P_802_2);
 }
 
 static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
-                       int *key_lenp, int nh_len)
+                       int nh_len)
 {
        struct icmp6hdr *icmp = icmp6_hdr(skb);
-       int error = 0;
-       int key_len;
 
        /* The ICMPv6 type and code fields use the 16-bit transport port
         * fields, so we need to store them in 16-bit network byte order.
         */
        key->ipv6.tp.src = htons(icmp->icmp6_type);
        key->ipv6.tp.dst = htons(icmp->icmp6_code);
-       key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
 
        if (icmp->icmp6_code == 0 &&
            (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION ||
@@ -532,21 +744,17 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
                struct nd_msg *nd;
                int offset;
 
-               key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
-
                /* In order to process neighbor discovery options, we need the
                 * entire packet.
                 */
                if (unlikely(icmp_len < sizeof(*nd)))
-                       goto out;
-               if (unlikely(skb_linearize(skb))) {
-                       error = -ENOMEM;
-                       goto out;
-               }
+                       return 0;
+
+               if (unlikely(skb_linearize(skb)))
+                       return -ENOMEM;
 
                nd = (struct nd_msg *)skb_transport_header(skb);
                key->ipv6.nd.target = nd->target;
-               key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
 
                icmp_len -= sizeof(*nd);
                offset = 0;
@@ -556,7 +764,7 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
                        int opt_len = nd_opt->nd_opt_len * 8;
 
                        if (unlikely(!opt_len || opt_len > icmp_len))
-                               goto invalid;
+                               return 0;
 
                        /* Store the link layer address if the appropriate
                         * option is provided.  It is considered an error if
@@ -581,16 +789,14 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
                }
        }
 
-       goto out;
+       return 0;
 
 invalid:
        memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target));
        memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll));
        memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll));
 
-out:
-       *key_lenp = key_len;
-       return error;
+       return 0;
 }
 
 /**
@@ -612,23 +818,23 @@ out:
  *    - skb->network_header: just past the Ethernet header, or just past the
  *      VLAN header, to the first byte of the Ethernet payload.
  *
- *    - skb->transport_header: If key->dl_type is ETH_P_IP or ETH_P_IPV6
+ *    - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6
  *      on output, then just past the IP header, if one is present and
  *      of a correct length, otherwise the same as skb->network_header.
- *      For other key->dl_type values it is left untouched.
+ *      For other key->eth.type values it is left untouched.
  */
-int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
-                int *key_lenp)
+int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
 {
-       int error = 0;
-       int key_len = SW_FLOW_KEY_OFFSET(eth);
+       int error;
        struct ethhdr *eth;
 
        memset(key, 0, sizeof(*key));
 
        key->phy.priority = skb->priority;
-       key->phy.tun_id = OVS_CB(skb)->tun_id;
+       if (OVS_CB(skb)->tun_key)
+               memcpy(&key->tun_key, OVS_CB(skb)->tun_key, sizeof(key->tun_key));
        key->phy.in_port = in_port;
+       key->phy.skb_mark = skb_get_mark(skb);
 
        skb_reset_mac_header(skb);
 
@@ -640,6 +846,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
        memcpy(key->eth.dst, eth->h_dest, ETH_ALEN);
 
        __skb_pull(skb, 2 * ETH_ALEN);
+       /* We are going to push all headers that we pull, so no need to
+        * update skb->csum here. */
 
        if (vlan_tx_tag_present(skb))
                key->eth.tci = htons(vlan_get_tci(skb));
@@ -659,15 +867,13 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
                struct iphdr *nh;
                __be16 offset;
 
-               key_len = SW_FLOW_KEY_OFFSET(ipv4.addr);
-
                error = check_iphdr(skb);
                if (unlikely(error)) {
                        if (error == -EINVAL) {
                                skb->transport_header = skb->network_header;
                                error = 0;
                        }
-                       goto out;
+                       return error;
                }
 
                nh = ip_hdr(skb);
@@ -681,7 +887,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
                offset = nh->frag_off & htons(IP_OFFSET);
                if (offset) {
                        key->ip.frag = OVS_FRAG_TYPE_LATER;
-                       goto out;
+                       return 0;
                }
                if (nh->frag_off & htons(IP_MF) ||
                         skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
@@ -689,21 +895,18 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
 
                /* Transport layer. */
                if (key->ip.proto == IPPROTO_TCP) {
-                       key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
                        if (tcphdr_ok(skb)) {
                                struct tcphdr *tcp = tcp_hdr(skb);
                                key->ipv4.tp.src = tcp->source;
                                key->ipv4.tp.dst = tcp->dest;
                        }
                } else if (key->ip.proto == IPPROTO_UDP) {
-                       key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
                        if (udphdr_ok(skb)) {
                                struct udphdr *udp = udp_hdr(skb);
                                key->ipv4.tp.src = udp->source;
                                key->ipv4.tp.dst = udp->dest;
                        }
                } else if (key->ip.proto == IPPROTO_ICMP) {
-                       key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
                        if (icmphdr_ok(skb)) {
                                struct icmphdr *icmp = icmp_hdr(skb);
                                /* The ICMP type and code fields use the 16-bit
@@ -714,7 +917,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
                        }
                }
 
-       } else if (key->eth.type == htons(ETH_P_ARP) && arphdr_ok(skb)) {
+       } else if ((key->eth.type == htons(ETH_P_ARP) ||
+                  key->eth.type == htons(ETH_P_RARP)) && arphdr_ok(skb)) {
                struct arp_eth_header *arp;
 
                arp = (struct arp_eth_header *)skb_network_header(skb);
@@ -727,103 +931,162 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
                        /* We only match on the lower 8 bits of the opcode. */
                        if (ntohs(arp->ar_op) <= 0xff)
                                key->ip.proto = ntohs(arp->ar_op);
-
-                       if (key->ip.proto == ARPOP_REQUEST
-                                       || key->ip.proto == ARPOP_REPLY) {
-                               memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src));
-                               memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
-                               memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN);
-                               memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN);
-                               key_len = SW_FLOW_KEY_OFFSET(ipv4.arp);
-                       }
+                       memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src));
+                       memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
+                       memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN);
+                       memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN);
                }
        } else if (key->eth.type == htons(ETH_P_IPV6)) {
                int nh_len;             /* IPv6 Header + Extensions */
 
-               nh_len = parse_ipv6hdr(skb, key, &key_len);
+               nh_len = parse_ipv6hdr(skb, key);
                if (unlikely(nh_len < 0)) {
-                       if (nh_len == -EINVAL)
+                       if (nh_len == -EINVAL) {
                                skb->transport_header = skb->network_header;
-                       else
+                               error = 0;
+                       } else {
                                error = nh_len;
-                       goto out;
+                       }
+                       return error;
                }
 
                if (key->ip.frag == OVS_FRAG_TYPE_LATER)
-                       goto out;
+                       return 0;
                if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
                        key->ip.frag = OVS_FRAG_TYPE_FIRST;
 
                /* Transport layer. */
                if (key->ip.proto == NEXTHDR_TCP) {
-                       key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
                        if (tcphdr_ok(skb)) {
                                struct tcphdr *tcp = tcp_hdr(skb);
                                key->ipv6.tp.src = tcp->source;
                                key->ipv6.tp.dst = tcp->dest;
                        }
                } else if (key->ip.proto == NEXTHDR_UDP) {
-                       key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
                        if (udphdr_ok(skb)) {
                                struct udphdr *udp = udp_hdr(skb);
                                key->ipv6.tp.src = udp->source;
                                key->ipv6.tp.dst = udp->dest;
                        }
                } else if (key->ip.proto == NEXTHDR_ICMP) {
-                       key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
                        if (icmp6hdr_ok(skb)) {
-                               error = parse_icmpv6(skb, key, &key_len, nh_len);
-                               if (error < 0)
-                                       goto out;
+                               error = parse_icmpv6(skb, key, nh_len);
+                               if (error)
+                                       return error;
                        }
                }
        }
 
-out:
-       *key_lenp = key_len;
-       return error;
+       return 0;
+}
+/*
+ * Hash the bytes of 'key' from offset key_start up to offset key_len with
+ * jhash2() and an initial value of 0.  The byte count is rounded up to a
+ * whole number of u32 words, so up to three bytes past key_len may be read.
+ */
+static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start, int key_len)
+{
+       return jhash2((u32 *)((u8 *)key + key_start),
+                     DIV_ROUND_UP(key_len - key_start, sizeof(u32)), 0);
+}
+/*
+ * Return the byte offset at which comparison of 'key' should begin: 0 when
+ * the key has a non-zero tunnel IPv4 destination, otherwise the offset of
+ * the 'phy' member (presumably skipping the unused tunnel key in front of
+ * it -- confirm against the struct sw_flow_key layout).
+ */
+static int flow_key_start(const struct sw_flow_key *key)
+{
+       if (key->tun_key.ipv4_dst)
+               return 0;
+       else
+               return offsetof(struct sw_flow_key, phy);
+}
+/*
+ * Return true when key1 and key2 hold identical bytes from offset key_start
+ * up to (but not including) offset key_len.
+ */
+static bool __cmp_key(const struct sw_flow_key *key1,
+               const struct sw_flow_key *key2,  int key_start, int key_len)
+{
+       return !memcmp((u8 *)key1 + key_start,
+                       (u8 *)key2 + key_start, (key_len - key_start));
+}
+/*
+ * Compare the key stored in 'flow' (flow->key) with 'key' over the byte
+ * range [key_start, key_len); returns true when they are equal.
+ */
+static bool __flow_cmp_key(const struct sw_flow *flow,
+               const struct sw_flow_key *key, int key_start, int key_len)
+{
+       return __cmp_key(&flow->key, key, key_start, key_len);
 }
 
-u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len)
+static bool __flow_cmp_unmasked_key(const struct sw_flow *flow,
+                 const struct sw_flow_key *key, int key_start, int key_len)
 {
-       return jhash2((u32 *)key, DIV_ROUND_UP(key_len, sizeof(u32)), 0);
+       return __cmp_key(&flow->unmasked_key, key, key_start, key_len);
+}
+/*
+ * Return true when the unmasked key of 'flow' equals 'key' up to offset
+ * key_len.  The comparison starts at the offset flow_key_start() returns
+ * for 'key'.
+ */
+bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
+               const struct sw_flow_key *key, int key_len)
+{
+       int key_start;
+       key_start = flow_key_start(key);
+
+       return __flow_cmp_unmasked_key(flow, key, key_start, key_len);
+
+}
+/*
+ * Look up the flow in 'table' for the key held by 'match' and accept it only
+ * if its unmasked key also equals that key, compared up to match->range.end.
+ * Returns the flow, or NULL when ovs_flow_lookup() finds nothing or the
+ * unmasked keys differ.
+ */
+struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table,
+                                      struct sw_flow_match *match)
+{
+       struct sw_flow_key *unmasked = match->key;
+       int key_len = match->range.end;
+       struct sw_flow *flow;
+
+       flow = ovs_flow_lookup(table, unmasked);
+       if (flow && (!ovs_flow_cmp_unmasked_key(flow, unmasked, key_len)))
+               flow = NULL;
+
+       return flow;
 }
 
-struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
-                               struct sw_flow_key *key, int key_len)
+static struct sw_flow *ovs_masked_flow_lookup(struct flow_table *table,
+                                   const struct sw_flow_key *flow_key,
+                                   struct sw_flow_mask *mask)
 {
        struct sw_flow *flow;
-       struct hlist_node *n;
        struct hlist_head *head;
+       int key_start = mask->range.start;
+       int key_len = mask->range.end;
        u32 hash;
+       struct sw_flow_key masked_key;
 
-       hash = ovs_flow_hash(key, key_len);
-
+       ovs_flow_key_mask(&masked_key, flow_key, mask);
+       hash = ovs_flow_hash(&masked_key, key_start, key_len);
        head = find_bucket(table, hash);
-       hlist_for_each_entry_rcu(flow, n, head, hash_node[table->node_ver]) {
-
-               if (flow->hash == hash &&
-                   !memcmp(&flow->key, key, key_len)) {
+       hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) {
+               if (flow->mask == mask &&
+                   __flow_cmp_key(flow, &masked_key, key_start, key_len))
                        return flow;
-               }
        }
        return NULL;
 }
 
-void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow)
+struct sw_flow *ovs_flow_lookup(struct flow_table *tbl,
+                               const struct sw_flow_key *key)
 {
-       struct hlist_head *head;
+       struct sw_flow *flow = NULL;
+       struct sw_flow_mask *mask;
 
-       head = find_bucket(table, flow->hash);
-       hlist_add_head_rcu(&flow->hash_node[table->node_ver], head);
-       table->count++;
+       list_for_each_entry_rcu(mask, tbl->mask_list, list) {
+               flow = ovs_masked_flow_lookup(tbl, key, mask);
+               if (flow)  /* Found */
+                       break;
+       }
+
+       return flow;
 }
 
-void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
+/*
+ * Compute flow->hash from flow->key over the range of the flow's mask, then
+ * add the flow to 'table' through __tbl_insert().  flow->mask is
+ * dereferenced without a check, so it must be set before the call.
+ */
+void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow)
+{
+       flow->hash = ovs_flow_hash(&flow->key, flow->mask->range.start,
+                       flow->mask->range.end);
+       __tbl_insert(table, flow);
+}
+
+void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow)
+{
+       BUG_ON(table->count == 0);
        hlist_del_rcu(&flow->hash_node[table->node_ver]);
        table->count--;
-       BUG_ON(table->count < 0);
 }
 
 /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute.  */
@@ -831,6 +1094,7 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
        [OVS_KEY_ATTR_ENCAP] = -1,
        [OVS_KEY_ATTR_PRIORITY] = sizeof(u32),
        [OVS_KEY_ATTR_IN_PORT] = sizeof(u32),
+       [OVS_KEY_ATTR_SKB_MARK] = sizeof(u32),
        [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet),
        [OVS_KEY_ATTR_VLAN] = sizeof(__be16),
        [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16),
@@ -842,317 +1106,555 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
        [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6),
        [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
        [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd),
-
-       /* Not upstream. */
-       [OVS_KEY_ATTR_TUN_ID] = sizeof(__be64),
+       [OVS_KEY_ATTR_TUNNEL] = -1,
 };
 
-static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
-                                 const struct nlattr *a[], u64 *attrs)
+/*
+ * Return true when the first 'size' bytes at 'fp' are all zero.
+ * A NULL 'fp' yields false; a 'size' of zero yields true.
+ */
+static bool is_all_zero(const u8 *fp, size_t size)
 {
-       const struct ovs_key_icmp *icmp_key;
-       const struct ovs_key_tcp *tcp_key;
-       const struct ovs_key_udp *udp_key;
-
-       switch (swkey->ip.proto) {
-       case IPPROTO_TCP:
-               if (!(*attrs & (1 << OVS_KEY_ATTR_TCP)))
-                       return -EINVAL;
-               *attrs &= ~(1 << OVS_KEY_ATTR_TCP);
-
-               *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
-               tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
-               swkey->ipv4.tp.src = tcp_key->tcp_src;
-               swkey->ipv4.tp.dst = tcp_key->tcp_dst;
-               break;
-
-       case IPPROTO_UDP:
-               if (!(*attrs & (1 << OVS_KEY_ATTR_UDP)))
-                       return -EINVAL;
-               *attrs &= ~(1 << OVS_KEY_ATTR_UDP);
-
-               *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
-               udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
-               swkey->ipv4.tp.src = udp_key->udp_src;
-               swkey->ipv4.tp.dst = udp_key->udp_dst;
-               break;
+       /* Same type as 'size' so the loop bound is not a signed/unsigned mix. */
+       size_t i;
 
-       case IPPROTO_ICMP:
-               if (!(*attrs & (1 << OVS_KEY_ATTR_ICMP)))
-                       return -EINVAL;
-               *attrs &= ~(1 << OVS_KEY_ATTR_ICMP);
+       if (!fp)
+               return false;
 
-               *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
-               icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]);
-               swkey->ipv4.tp.src = htons(icmp_key->icmp_type);
-               swkey->ipv4.tp.dst = htons(icmp_key->icmp_code);
-               break;
-       }
+       for (i = 0; i < size; i++)
+               if (fp[i])
+                       return false;
 
-       return 0;
+       return true;
 }
 
-static int ipv6_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
-                                 const struct nlattr *a[], u64 *attrs)
+/*
+ * Walk the attributes nested in 'attr' and index them by type.
+ *
+ * @attr:   nested attribute to iterate over.
+ * @a:      table indexed by attribute type; a[type] is written only for
+ *          attributes that are recorded, other slots are left untouched.
+ * @attrsp: in/out bitmap of recorded types.  Bits already set on entry
+ *          count as present, so a repeated type is rejected as a duplicate.
+ * @nz:     when true, attributes whose payload is entirely zero are skipped
+ *          (neither recorded in the bitmap nor stored in 'a').
+ *
+ * Returns 0 on success.  Returns -EINVAL for an unknown or duplicate type,
+ * a payload length that differs from ovs_key_lens[] (entries of -1 accept
+ * any length), or trailing bytes after the last attribute.  '*attrsp' is
+ * only updated on success.
+ */
+static int __parse_flow_nlattrs(const struct nlattr *attr,
+                             const struct nlattr *a[],
+                             u64 *attrsp, bool nz)
 {
-       const struct ovs_key_icmpv6 *icmpv6_key;
-       const struct ovs_key_tcp *tcp_key;
-       const struct ovs_key_udp *udp_key;
+       const struct nlattr *nla;
+       u64 attrs;
+       int rem;
 
-       switch (swkey->ip.proto) {
-       case IPPROTO_TCP:
-               if (!(*attrs & (1 << OVS_KEY_ATTR_TCP)))
-                       return -EINVAL;
-               *attrs &= ~(1 << OVS_KEY_ATTR_TCP);
+       attrs = *attrsp;
+       nla_for_each_nested(nla, attr, rem) {
+               u16 type = nla_type(nla);
+               int expected_len;
 
-               *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
-               tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
-               swkey->ipv6.tp.src = tcp_key->tcp_src;
-               swkey->ipv6.tp.dst = tcp_key->tcp_dst;
-               break;
+               /*
+                * Reject unknown types before 'type' is used as an index
+                * into ovs_key_lens[] and 'a', or as a shift count.
+                */
+               if (type > OVS_KEY_ATTR_MAX) {
+                       OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n",
+                                 type, OVS_KEY_ATTR_MAX);
+                       return -EINVAL;
+               }
 
-       case IPPROTO_UDP:
-               if (!(*attrs & (1 << OVS_KEY_ATTR_UDP)))
+               if (attrs & (1ULL << type)) {
+                       OVS_NLERR("Duplicate key attribute (type %d).\n", type);
                        return -EINVAL;
-               *attrs &= ~(1 << OVS_KEY_ATTR_UDP);
-
-               *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
-               udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
-               swkey->ipv6.tp.src = udp_key->udp_src;
-               swkey->ipv6.tp.dst = udp_key->udp_dst;
-               break;
+               }
 
-       case IPPROTO_ICMPV6:
-               if (!(*attrs & (1 << OVS_KEY_ATTR_ICMPV6)))
+               expected_len = ovs_key_lens[type];
+               if (nla_len(nla) != expected_len && expected_len != -1) {
+                       OVS_NLERR("Key attribute has unexpected length (type=%d"
+                                 ", length=%d, expected=%d).\n", type,
+                                 nla_len(nla), expected_len);
                        return -EINVAL;
-               *attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6);
+               }
 
-               *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
-               icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]);
-               swkey->ipv6.tp.src = htons(icmpv6_key->icmpv6_type);
-               swkey->ipv6.tp.dst = htons(icmpv6_key->icmpv6_code);
-
-               if (swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) ||
-                   swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
-                       const struct ovs_key_nd *nd_key;
-
-                       if (!(*attrs & (1 << OVS_KEY_ATTR_ND)))
-                               return -EINVAL;
-                       *attrs &= ~(1 << OVS_KEY_ATTR_ND);
-
-                       *key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
-                       nd_key = nla_data(a[OVS_KEY_ATTR_ND]);
-                       memcpy(&swkey->ipv6.nd.target, nd_key->nd_target,
-                              sizeof(swkey->ipv6.nd.target));
-                       memcpy(swkey->ipv6.nd.sll, nd_key->nd_sll, ETH_ALEN);
-                       memcpy(swkey->ipv6.nd.tll, nd_key->nd_tll, ETH_ALEN);
+               /*
+                * Scan the real payload length: expected_len is -1 for
+                * variable-length attributes and must not be used as a size.
+                * For fixed-length attributes the two are equal here.
+                */
+               if (!nz || !is_all_zero(nla_data(nla), nla_len(nla))) {
+                       attrs |= 1ULL << type;
+                       a[type] = nla;
                }
-               break;
+       }
+       if (rem) {
+               OVS_NLERR("Message has %d unknown bytes.\n", rem);
+               return -EINVAL;
        }
 
+       *attrsp = attrs;
        return 0;
 }
 
+/*
+ * Mask flavour of the attribute parser: forwards to __parse_flow_nlattrs()
+ * with 'nz' set to true, so attributes with an all-zero payload are skipped.
+ */
+static int parse_flow_mask_nlattrs(const struct nlattr *attr,
+                             const struct nlattr *a[], u64 *attrsp)
+{
+       return __parse_flow_nlattrs(attr, a, attrsp, true);
+}
+
+/*
+ * Key flavour of the attribute parser: forwards to __parse_flow_nlattrs()
+ * with 'nz' set to false, so every well-formed attribute is recorded.
+ * '*attrsp' is used as the starting bitmap, so callers must initialise it.
+ */
 static int parse_flow_nlattrs(const struct nlattr *attr,
                               const struct nlattr *a[], u64 *attrsp)
 {
-       const struct nlattr *nla;
-       u64 attrs;
-       int rem;
+       return __parse_flow_nlattrs(attr, a, attrsp, false);
+}
 
-       attrs = 0;
-       nla_for_each_nested(nla, attr, rem) {
-               u16 type = nla_type(nla);
-               int expected_len;
+/*
+ * Parse the nested OVS_TUNNEL_KEY_ATTR_* attributes in 'attr' into the
+ * tun_key fields of 'match' (the key, or the mask when 'is_mask' is true).
+ *
+ * Each attribute must have exactly the payload length listed in
+ * ovs_tunnel_key_lens[]; the flag attributes (DONT_FRAGMENT, CSUM) carry no
+ * payload.  The accumulated flag bits are stored in tun_key.tun_flags.
+ *
+ * Returns 0 on success and -EINVAL for an unknown attribute type, a bad
+ * length, trailing bytes, a zero tunnel destination in the key, or a
+ * missing TTL attribute.
+ */
+int ipv4_tun_from_nlattr(const struct nlattr *attr,
+                        struct sw_flow_match *match, bool is_mask)
+{
+       struct nlattr *a;
+       int rem;
+       bool ttl = false;
+       __be16 tun_flags = 0;
+
+       nla_for_each_nested(a, attr, rem) {
+               int type = nla_type(a);
+               static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = {
+                       [OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64),
+                       [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32),
+                       [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32),
+                       [OVS_TUNNEL_KEY_ATTR_TOS] = 1,
+                       [OVS_TUNNEL_KEY_ATTR_TTL] = 1,
+                       [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0,
+                       [OVS_TUNNEL_KEY_ATTR_CSUM] = 0,
+               };
+
+               if (type > OVS_TUNNEL_KEY_ATTR_MAX) {
+                       OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n",
+                       type, OVS_TUNNEL_KEY_ATTR_MAX);
+                       return -EINVAL;
+               }
 
-               if (type > OVS_KEY_ATTR_MAX || attrs & (1ULL << type))
+               if (ovs_tunnel_key_lens[type] != nla_len(a)) {
+                       OVS_NLERR("IPv4 tunnel attribute type has unexpected "
+                                 "length (type=%d, length=%d, expected=%d).\n",
+                                 type, nla_len(a), ovs_tunnel_key_lens[type]);
                        return -EINVAL;
+               }
 
-               expected_len = ovs_key_lens[type];
-               if (nla_len(nla) != expected_len && expected_len != -1)
+               switch (type) {
+               case OVS_TUNNEL_KEY_ATTR_ID:
+                       SW_FLOW_KEY_PUT(match, tun_key.tun_id,
+                                       nla_get_be64(a), is_mask);
+                       tun_flags |= TUNNEL_KEY;
+                       break;
+               case OVS_TUNNEL_KEY_ATTR_IPV4_SRC:
+                       SW_FLOW_KEY_PUT(match, tun_key.ipv4_src,
+                                       nla_get_be32(a), is_mask);
+                       break;
+               case OVS_TUNNEL_KEY_ATTR_IPV4_DST:
+                       SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst,
+                                       nla_get_be32(a), is_mask);
+                       break;
+               case OVS_TUNNEL_KEY_ATTR_TOS:
+                       SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos,
+                                       nla_get_u8(a), is_mask);
+                       break;
+               case OVS_TUNNEL_KEY_ATTR_TTL:
+                       SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl,
+                                       nla_get_u8(a), is_mask);
+                       ttl = true;
+                       break;
+               case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT:
+                       tun_flags |= TUNNEL_DONT_FRAGMENT;
+                       break;
+               case OVS_TUNNEL_KEY_ATTR_CSUM:
+                       tun_flags |= TUNNEL_CSUM;
+                       break;
+               default:
                        return -EINVAL;
+               }
+       }
 
-               attrs |= 1ULL << type;
-               a[type] = nla;
+       SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask);
+
+       if (rem > 0) {
+               OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem);
+               return -EINVAL;
        }
-       if (rem)
+
+       /*
+        * NOTE(review): the two checks below also run when is_mask is true.
+        * The destination test always reads the key (match->key), and the TTL
+        * test requires a mask to carry a TTL attribute -- confirm that both
+        * are intended for masks.
+        */
+       if (!match->key->tun_key.ipv4_dst) {
+               OVS_NLERR("IPv4 tunnel destination address is zero.\n");
                return -EINVAL;
+       }
+
+       if (!ttl) {
+               OVS_NLERR("IPv4 tunnel TTL not specified.\n");
+               return -EINVAL;
+       }
 
-       *attrsp = attrs;
        return 0;
 }
 
-/**
- * ovs_flow_from_nlattrs - parses Netlink attributes into a flow key.
- * @swkey: receives the extracted flow key.
- * @key_lenp: number of bytes used in @swkey.
- * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
- * sequence.
- */
-int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
-                     const struct nlattr *attr)
+/*
+ * Serialise the tunnel key 'output' into 'skb' as a nested
+ * OVS_KEY_ATTR_TUNNEL attribute.
+ *
+ * The tunnel ID is emitted only when TUNNEL_KEY is set in output->tun_flags;
+ * source, destination and TOS are emitted only when non-zero; TTL is always
+ * emitted; DONT_FRAGMENT and CSUM are emitted as flag attributes when the
+ * corresponding bit is set.  'tun_key' is not referenced by this function.
+ *
+ * Returns 0 on success or -EMSGSIZE when an attribute cannot be added.  On
+ * failure the nest opened by nla_nest_start() is neither ended nor cancelled
+ * here.
+ */
+int ipv4_tun_to_nlattr(struct sk_buff *skb,
+                       const struct ovs_key_ipv4_tunnel *tun_key,
+                       const struct ovs_key_ipv4_tunnel *output)
 {
-       const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
-       const struct ovs_key_ethernet *eth_key;
-       int key_len;
-       u64 attrs;
-       int err;
+       struct nlattr *nla;
 
-       memset(swkey, 0, sizeof(struct sw_flow_key));
-       key_len = SW_FLOW_KEY_OFFSET(eth);
+       nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL);
+       if (!nla)
+               return -EMSGSIZE;
+
+       if (output->tun_flags & TUNNEL_KEY &&
+           nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
+               return -EMSGSIZE;
+       if (output->ipv4_src &&
+               nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src))
+               return -EMSGSIZE;
+       if (output->ipv4_dst &&
+               nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst))
+               return -EMSGSIZE;
+       if (output->ipv4_tos &&
+               nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos))
+               return -EMSGSIZE;
+       if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl))
+               return -EMSGSIZE;
+       if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) &&
+               nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
+               return -EMSGSIZE;
+       if ((output->tun_flags & TUNNEL_CSUM) &&
+               nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM))
+               return -EMSGSIZE;
+
+       nla_nest_end(skb, nla);
+       return 0;
+}
 
-       err = parse_flow_nlattrs(attr, a, &attrs);
-       if (err)
-               return err;
 
-       /* Metadata attributes. */
-       if (attrs & (1 << OVS_KEY_ATTR_PRIORITY)) {
-               swkey->phy.priority = nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]);
-               attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY);
+/*
+ * Copy the metadata attributes (priority, in_port, skb_mark, tunnel) from
+ * the parsed table 'a' into 'match' -- into the key, or into the mask when
+ * 'is_mask' is true.  Each attribute that is consumed has its bit cleared in
+ * '*attrs'.
+ *
+ * Range checks (in_port < DP_MAX_PORTS, and the skb_mark restriction on old
+ * kernels) apply to keys only.  A key without an IN_PORT attribute gets
+ * in_port = DP_MAX_PORTS; a mask without one is left untouched.
+ *
+ * Returns 0 on success or -EINVAL on a failed check or tunnel parse error.
+ */
+static int metadata_from_nlattrs(struct sw_flow_match *match,  u64 *attrs,
+               const struct nlattr **a, bool is_mask)
+{
+       if (*attrs & (1ULL << OVS_KEY_ATTR_PRIORITY)) {
+               SW_FLOW_KEY_PUT(match, phy.priority,
+                         nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask);
+               *attrs &= ~(1ULL << OVS_KEY_ATTR_PRIORITY);
        }
-       if (attrs & (1 << OVS_KEY_ATTR_IN_PORT)) {
+
+       if (*attrs & (1ULL << OVS_KEY_ATTR_IN_PORT)) {
                u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]);
-               if (in_port >= DP_MAX_PORTS)
+
+               if (!is_mask && in_port >= DP_MAX_PORTS)
                        return -EINVAL;
-               swkey->phy.in_port = in_port;
-               attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT);
-       } else {
-               swkey->phy.in_port = DP_MAX_PORTS;
+               SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask);
+               *attrs &= ~(1ULL << OVS_KEY_ATTR_IN_PORT);
+       } else if (!is_mask) {
+               SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask);
        }
 
-       if (attrs & (1ULL << OVS_KEY_ATTR_TUN_ID)) {
-               swkey->phy.tun_id = nla_get_be64(a[OVS_KEY_ATTR_TUN_ID]);
-               attrs &= ~(1ULL << OVS_KEY_ATTR_TUN_ID);
+       if (*attrs & (1ULL << OVS_KEY_ATTR_SKB_MARK)) {
+               uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) && !defined(CONFIG_NETFILTER)
+               if (!is_mask && mark != 0) {
+                       OVS_NLERR("skb->mark must be zero on this kernel (mark=%d).\n", mark);
+                       return -EINVAL;
+               }
+#endif
+               SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask);
+               *attrs &= ~(1ULL << OVS_KEY_ATTR_SKB_MARK);
+       }
+       if (*attrs & (1ULL << OVS_KEY_ATTR_TUNNEL)) {
+               if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match,
+                                       is_mask))
+                       return -EINVAL;
+               *attrs &= ~(1ULL << OVS_KEY_ATTR_TUNNEL);
        }
+       return 0;
+}
 
-       /* Data attributes. */
-       if (!(attrs & (1 << OVS_KEY_ATTR_ETHERNET)))
-               return -EINVAL;
-       attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
+/*
+ * Fill 'match' from the parsed attribute table 'a': the key when 'is_mask'
+ * is false, the mask when it is true.  'attrs' is the bitmap of attributes
+ * present in 'a'; each attribute handled here is cleared from a local copy,
+ * and any bit still set at the end makes the call fail.
+ *
+ * Value checks (EtherType minimum, fragment type, ARP opcode) apply to keys
+ * only.  TCP/UDP ports are stored in the ipv4 or ipv6 part of the key
+ * depending on whether an IPV4 attribute was present in the original bitmap.
+ *
+ * Returns 0 on success, or -EINVAL / the error from metadata_from_nlattrs().
+ */
+static int ovs_key_from_nlattrs(struct sw_flow_match *match,  u64 attrs,
+               const struct nlattr **a, bool is_mask)
+{
+       int err;
+       u64 orig_attrs = attrs;
+
+       err = metadata_from_nlattrs(match, &attrs, a, is_mask);
+       if (err)
+               return err;
 
-       eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]);
-       memcpy(swkey->eth.src, eth_key->eth_src, ETH_ALEN);
-       memcpy(swkey->eth.dst, eth_key->eth_dst, ETH_ALEN);
+       if (attrs & (1ULL << OVS_KEY_ATTR_ETHERNET)) {
+               const struct ovs_key_ethernet *eth_key;
+
+               eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]);
+               SW_FLOW_KEY_MEMCPY(match, eth.src,
+                               eth_key->eth_src, ETH_ALEN, is_mask);
+               SW_FLOW_KEY_MEMCPY(match, eth.dst,
+                               eth_key->eth_dst, ETH_ALEN, is_mask);
+               attrs &= ~(1ULL << OVS_KEY_ATTR_ETHERNET);
+       }
 
-       if (attrs & (1u << OVS_KEY_ATTR_ETHERTYPE) &&
-           nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q)) {
-               const struct nlattr *encap;
+       if (attrs & (1ULL << OVS_KEY_ATTR_VLAN)) {
                __be16 tci;
 
-               if (attrs != ((1 << OVS_KEY_ATTR_VLAN) |
-                             (1 << OVS_KEY_ATTR_ETHERTYPE) |
-                             (1 << OVS_KEY_ATTR_ENCAP)))
+               tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+               if (!(tci & htons(VLAN_TAG_PRESENT))) {
+                       if (is_mask)
+                               OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n");
+                       else
+                               OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n");
+
                        return -EINVAL;
+               }
 
-               encap = a[OVS_KEY_ATTR_ENCAP];
-               tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
-               if (tci & htons(VLAN_TAG_PRESENT)) {
-                       swkey->eth.tci = tci;
+               SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask);
+               attrs &= ~(1ULL << OVS_KEY_ATTR_VLAN);
+       } else if (!is_mask)
+               /* Key without a VLAN attribute: write an all-ones TCI mask. */
+               SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true);
 
-                       err = parse_flow_nlattrs(encap, a, &attrs);
-                       if (err)
-                               return err;
-               } else if (!tci) {
-                       /* Corner case for truncated 802.1Q header. */
-                       if (nla_len(encap))
-                               return -EINVAL;
+       if (attrs & (1ULL << OVS_KEY_ATTR_ETHERTYPE)) {
+               __be16 eth_type;
 
-                       swkey->eth.type = htons(ETH_P_8021Q);
-                       *key_lenp = key_len;
-                       return 0;
-               } else {
+               eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+               if (!is_mask && ntohs(eth_type) < ETH_P_802_3_MIN) {
+                       OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n",
+                                       ntohs(eth_type), ETH_P_802_3_MIN);
                        return -EINVAL;
                }
-       }
 
-       if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
-               swkey->eth.type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
-               if (ntohs(swkey->eth.type) < 1536)
-                       return -EINVAL;
-               attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
-       } else {
-               swkey->eth.type = htons(ETH_P_802_2);
+               SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask);
+               attrs &= ~(1ULL << OVS_KEY_ATTR_ETHERTYPE);
+       } else if (!is_mask) {
+               SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
        }
 
-       if (swkey->eth.type == htons(ETH_P_IP)) {
+       if (attrs & (1ULL << OVS_KEY_ATTR_IPV4)) {
                const struct ovs_key_ipv4 *ipv4_key;
 
-               if (!(attrs & (1 << OVS_KEY_ATTR_IPV4)))
-                       return -EINVAL;
-               attrs &= ~(1 << OVS_KEY_ATTR_IPV4);
-
-               key_len = SW_FLOW_KEY_OFFSET(ipv4.addr);
                ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]);
-               if (ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX)
+               if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) {
+                       OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n",
+                               ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX);
                        return -EINVAL;
-               swkey->ip.proto = ipv4_key->ipv4_proto;
-               swkey->ip.tos = ipv4_key->ipv4_tos;
-               swkey->ip.ttl = ipv4_key->ipv4_ttl;
-               swkey->ip.frag = ipv4_key->ipv4_frag;
-               swkey->ipv4.addr.src = ipv4_key->ipv4_src;
-               swkey->ipv4.addr.dst = ipv4_key->ipv4_dst;
-
-               if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
-                       err = ipv4_flow_from_nlattrs(swkey, &key_len, a, &attrs);
-                       if (err)
-                               return err;
                }
-       } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+               SW_FLOW_KEY_PUT(match, ip.proto,
+                               ipv4_key->ipv4_proto, is_mask);
+               SW_FLOW_KEY_PUT(match, ip.tos,
+                               ipv4_key->ipv4_tos, is_mask);
+               SW_FLOW_KEY_PUT(match, ip.ttl,
+                               ipv4_key->ipv4_ttl, is_mask);
+               SW_FLOW_KEY_PUT(match, ip.frag,
+                               ipv4_key->ipv4_frag, is_mask);
+               SW_FLOW_KEY_PUT(match, ipv4.addr.src,
+                               ipv4_key->ipv4_src, is_mask);
+               SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
+                               ipv4_key->ipv4_dst, is_mask);
+               attrs &= ~(1ULL << OVS_KEY_ATTR_IPV4);
+       }
+
+       if (attrs & (1ULL << OVS_KEY_ATTR_IPV6)) {
                const struct ovs_key_ipv6 *ipv6_key;
 
-               if (!(attrs & (1 << OVS_KEY_ATTR_IPV6)))
+               ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]);
+               if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) {
+                       OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n",
+                               ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX);
                        return -EINVAL;
-               attrs &= ~(1 << OVS_KEY_ATTR_IPV6);
+               }
+               SW_FLOW_KEY_PUT(match, ipv6.label,
+                               ipv6_key->ipv6_label, is_mask);
+               SW_FLOW_KEY_PUT(match, ip.proto,
+                               ipv6_key->ipv6_proto, is_mask);
+               SW_FLOW_KEY_PUT(match, ip.tos,
+                               ipv6_key->ipv6_tclass, is_mask);
+               SW_FLOW_KEY_PUT(match, ip.ttl,
+                               ipv6_key->ipv6_hlimit, is_mask);
+               SW_FLOW_KEY_PUT(match, ip.frag,
+                               ipv6_key->ipv6_frag, is_mask);
+               SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src,
+                               ipv6_key->ipv6_src,
+                               sizeof(match->key->ipv6.addr.src),
+                               is_mask);
+               SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst,
+                               ipv6_key->ipv6_dst,
+                               sizeof(match->key->ipv6.addr.dst),
+                               is_mask);
+
+               attrs &= ~(1ULL << OVS_KEY_ATTR_IPV6);
+       }
 
-               key_len = SW_FLOW_KEY_OFFSET(ipv6.label);
-               ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]);
-               if (ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX)
+       if (attrs & (1ULL << OVS_KEY_ATTR_ARP)) {
+               const struct ovs_key_arp *arp_key;
+
+               arp_key = nla_data(a[OVS_KEY_ATTR_ARP]);
+               if (!is_mask && (arp_key->arp_op & htons(0xff00))) {
+                       /* arp_op is big-endian; convert before logging. */
+                       OVS_NLERR("Unknown ARP opcode (opcode=%d).\n",
+                                 ntohs(arp_key->arp_op));
                        return -EINVAL;
-               swkey->ipv6.label = ipv6_key->ipv6_label;
-               swkey->ip.proto = ipv6_key->ipv6_proto;
-               swkey->ip.tos = ipv6_key->ipv6_tclass;
-               swkey->ip.ttl = ipv6_key->ipv6_hlimit;
-               swkey->ip.frag = ipv6_key->ipv6_frag;
-               memcpy(&swkey->ipv6.addr.src, ipv6_key->ipv6_src,
-                      sizeof(swkey->ipv6.addr.src));
-               memcpy(&swkey->ipv6.addr.dst, ipv6_key->ipv6_dst,
-                      sizeof(swkey->ipv6.addr.dst));
-
-               if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
-                       err = ipv6_flow_from_nlattrs(swkey, &key_len, a, &attrs);
+               }
+
+               SW_FLOW_KEY_PUT(match, ipv4.addr.src,
+                               arp_key->arp_sip, is_mask);
+               SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
+                       arp_key->arp_tip, is_mask);
+               SW_FLOW_KEY_PUT(match, ip.proto,
+                               ntohs(arp_key->arp_op), is_mask);
+               SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha,
+                               arp_key->arp_sha, ETH_ALEN, is_mask);
+               SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha,
+                               arp_key->arp_tha, ETH_ALEN, is_mask);
+
+               attrs &= ~(1ULL << OVS_KEY_ATTR_ARP);
+       }
+
+       if (attrs & (1ULL << OVS_KEY_ATTR_TCP)) {
+               const struct ovs_key_tcp *tcp_key;
+
+               tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
+               if (orig_attrs & (1ULL << OVS_KEY_ATTR_IPV4)) {
+                       SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+                                       tcp_key->tcp_src, is_mask);
+                       SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+                                       tcp_key->tcp_dst, is_mask);
+               } else {
+                       SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+                                       tcp_key->tcp_src, is_mask);
+                       SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+                                       tcp_key->tcp_dst, is_mask);
+               }
+               attrs &= ~(1ULL << OVS_KEY_ATTR_TCP);
+       }
+
+       if (attrs & (1ULL << OVS_KEY_ATTR_UDP)) {
+               const struct ovs_key_udp *udp_key;
+
+               udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
+               if (orig_attrs & (1ULL << OVS_KEY_ATTR_IPV4)) {
+                       SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+                                       udp_key->udp_src, is_mask);
+                       SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+                                       udp_key->udp_dst, is_mask);
+               } else {
+                       SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+                                       udp_key->udp_src, is_mask);
+                       SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+                                       udp_key->udp_dst, is_mask);
+               }
+               attrs &= ~(1ULL << OVS_KEY_ATTR_UDP);
+       }
+
+       if (attrs & (1ULL << OVS_KEY_ATTR_ICMP)) {
+               const struct ovs_key_icmp *icmp_key;
+
+               icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]);
+               SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+                               htons(icmp_key->icmp_type), is_mask);
+               SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+                               htons(icmp_key->icmp_code), is_mask);
+               attrs &= ~(1ULL << OVS_KEY_ATTR_ICMP);
+       }
+
+       if (attrs & (1ULL << OVS_KEY_ATTR_ICMPV6)) {
+               const struct ovs_key_icmpv6 *icmpv6_key;
+
+               icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]);
+               SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+                               htons(icmpv6_key->icmpv6_type), is_mask);
+               SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+                               htons(icmpv6_key->icmpv6_code), is_mask);
+               attrs &= ~(1ULL << OVS_KEY_ATTR_ICMPV6);
+       }
+
+       if (attrs & (1ULL << OVS_KEY_ATTR_ND)) {
+               const struct ovs_key_nd *nd_key;
+
+               nd_key = nla_data(a[OVS_KEY_ATTR_ND]);
+               SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target,
+                       nd_key->nd_target,
+                       sizeof(match->key->ipv6.nd.target),
+                       is_mask);
+               SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll,
+                       nd_key->nd_sll, ETH_ALEN, is_mask);
+               SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll,
+                               nd_key->nd_tll, ETH_ALEN, is_mask);
+               attrs &= ~(1ULL << OVS_KEY_ATTR_ND);
+       }
+
+       /* Anything still set is an attribute this function does not handle. */
+       if (attrs != 0)
+               return -EINVAL;
+
+       return 0;
+}
+
+/**
+ * ovs_match_from_nlattrs - parses Netlink attributes into a flow key and
+ * mask. If 'mask' is NULL, the flow is treated as an exact match flow.
+ * Otherwise, it is treated as a wildcarded flow, although the mask may
+ * still contain no don't-care bits.
+ * @match: receives the extracted flow match information.
+ * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
+ * sequence. The fields should be those of the packet that triggered the
+ * creation of this flow.
+ * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink
+ * attribute sequence that specifies the mask of the wildcarded flow.
+ *
+ * Returns 0 on success, or a negative errno (-EINVAL for malformed or
+ * inconsistent attributes, or when ovs_match_validate() rejects the match).
+ */
+int ovs_match_from_nlattrs(struct sw_flow_match *match,
+                          const struct nlattr *key,
+                          const struct nlattr *mask)
+{
+       const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+       const struct nlattr *encap;
+       u64 key_attrs = 0;
+       u64 mask_attrs = 0;
+       bool encap_valid = false;
+       int err;
+
+       err = parse_flow_nlattrs(key, a, &key_attrs);
+       if (err)
+               return err;
+
+       /*
+        * 'a' is not initialised: a slot is only valid when its bit is set in
+        * the bitmap returned by the parser, so presence is always tested via
+        * key_attrs/mask_attrs and never via the pointer itself.
+        */
+       if (key_attrs & 1ULL << OVS_KEY_ATTR_ENCAP) {
+               encap = a[OVS_KEY_ATTR_ENCAP];
+               key_attrs &= ~(1ULL << OVS_KEY_ATTR_ENCAP);
+               if (nla_len(encap)) {
+                       __be16 eth_type = 0; /* Stays 0 if no EtherType given. */
+
+                       if (key_attrs & (1ULL << OVS_KEY_ATTR_ETHERTYPE))
+                               eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+
+                       if ((eth_type == htons(ETH_P_8021Q)) &&
+                           (key_attrs & (1ULL << OVS_KEY_ATTR_VLAN))) {
+                               encap_valid = true;
+                               /* The inner EtherType comes from the encap. */
+                               key_attrs &= ~(1ULL << OVS_KEY_ATTR_ETHERTYPE);
+                               err = parse_flow_nlattrs(encap, a, &key_attrs);
+                       } else {
+                               OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n");
+                               err = -EINVAL;
+                       }
+
                        if (err)
                                return err;
                }
-       } else if (swkey->eth.type == htons(ETH_P_ARP)) {
-               const struct ovs_key_arp *arp_key;
+       }
 
-               if (!(attrs & (1 << OVS_KEY_ATTR_ARP)))
-                       return -EINVAL;
-               attrs &= ~(1 << OVS_KEY_ATTR_ARP);
+       err = ovs_key_from_nlattrs(match, key_attrs, a, false);
+       if (err)
+               return err;
 
-               key_len = SW_FLOW_KEY_OFFSET(ipv4.arp);
-               arp_key = nla_data(a[OVS_KEY_ATTR_ARP]);
-               swkey->ipv4.addr.src = arp_key->arp_sip;
-               swkey->ipv4.addr.dst = arp_key->arp_tip;
-               if (arp_key->arp_op & htons(0xff00))
-                       return -EINVAL;
-               swkey->ip.proto = ntohs(arp_key->arp_op);
-               memcpy(swkey->ipv4.arp.sha, arp_key->arp_sha, ETH_ALEN);
-               memcpy(swkey->ipv4.arp.tha, arp_key->arp_tha, ETH_ALEN);
+       if (mask) {
+               /* 'a' is reused; slots not in mask_attrs hold stale key data. */
+               err = parse_flow_mask_nlattrs(mask, a, &mask_attrs);
+               if (err)
+                       return err;
+
+               if ((mask_attrs & 1ULL << OVS_KEY_ATTR_ENCAP) && encap_valid) {
+                       __be16 eth_type = 0;
+
+                       mask_attrs &= ~(1ULL << OVS_KEY_ATTR_ENCAP);
+                       if (mask_attrs & (1ULL << OVS_KEY_ATTR_ETHERTYPE))
+                               eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+                       if (eth_type == htons(0xffff)) {
+                               mask_attrs &= ~(1ULL << OVS_KEY_ATTR_ETHERTYPE);
+                               encap = a[OVS_KEY_ATTR_ENCAP];
+                               err = parse_flow_mask_nlattrs(encap, a, &mask_attrs);
+                       } else {
+                               OVS_NLERR("VLAN frames must have an exact match"
+                                        " on the TPID (mask=%x).\n",
+                                        ntohs(eth_type));
+                               err = -EINVAL;
+                       }
+
+                       if (err)
+                               return err;
+               }
+
+               err = ovs_key_from_nlattrs(match, mask_attrs, a, true);
+               if (err)
+                       return err;
+       } else {
+               /* Populate exact match flow's key mask. */
+               if (match->mask)
+                       ovs_sw_flow_mask_set(match->mask, &match->range, 0xff);
        }
 
-       if (attrs)
+       if (!ovs_match_validate(match, key_attrs, mask_attrs))
                return -EINVAL;
-       *key_lenp = key_len;
 
        return 0;
 }
 
 /**
  * ovs_flow_metadata_from_nlattrs - parses Netlink attributes into a flow key.
- * @in_port: receives the extracted input port.
- * @tun_id: receives the extracted tunnel ID.
- * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
+ * @flow: Receives extracted in_port, priority, tun_key and skb_mark.
+ * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
  * sequence.
  *
  * This parses a series of Netlink attributes that form a flow key, which must
@@ -1160,80 +1662,100 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
  * get the metadata, that is, the parts of the flow key that cannot be
  * extracted from the packet itself.
+ *
+ * Returns 0 on success, -EINVAL if parse_flow_nlattrs() rejects @attr, or
+ * the error code reported by metadata_from_nlattrs().
  */
-int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port, __be64 *tun_id,
-                                  const struct nlattr *attr)
+
+int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow,
+               const struct nlattr *attr)
 {
-       const struct nlattr *nla;
-       int rem;
+       struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key;
+       const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+       u64 attrs = 0;
+       int err;
+       struct sw_flow_match match;
 
-       *in_port = DP_MAX_PORTS;
-       *tun_id = 0;
-       *priority = 0;
+       /* Reset the metadata fields to their defaults before parsing. */
+       flow->key.phy.in_port = DP_MAX_PORTS;
+       flow->key.phy.priority = 0;
+       flow->key.phy.skb_mark = 0;
+       memset(tun_key, 0, sizeof(flow->key.tun_key));
 
-       nla_for_each_nested(nla, attr, rem) {
-               int type = nla_type(nla);
-
-               if (type <= OVS_KEY_ATTR_MAX && ovs_key_lens[type] > 0) {
-                       if (nla_len(nla) != ovs_key_lens[type])
-                               return -EINVAL;
-
-                       switch (type) {
-                       case OVS_KEY_ATTR_PRIORITY:
-                               *priority = nla_get_u32(nla);
-                               break;
-
-                       case OVS_KEY_ATTR_TUN_ID:
-                               *tun_id = nla_get_be64(nla);
-                               break;
-
-                       case OVS_KEY_ATTR_IN_PORT:
-                               if (nla_get_u32(nla) >= DP_MAX_PORTS)
-                                       return -EINVAL;
-                               *in_port = nla_get_u32(nla);
-                               break;
-                       }
-               }
-       }
-       if (rem)
+       /* Any failure from parse_flow_nlattrs() is reported as -EINVAL. */
+       err = parse_flow_nlattrs(attr, a, &attrs);
+       if (err)
                return -EINVAL;
+
+       /* Key-only match: the memset leaves match.mask NULL. */
+       memset(&match, 0, sizeof(match));
+       match.key = &flow->key;
+
+       err = metadata_from_nlattrs(&match, &attrs, a, false);
+       if (err)
+               return err;
+
        return 0;
 }
 
-int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
+int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey,
+               const struct sw_flow_key *output, struct sk_buff *skb)
 {
        struct ovs_key_ethernet *eth_key;
        struct nlattr *nla, *encap;
 
-       if (swkey->phy.priority)
-               NLA_PUT_U32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority);
+       if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
+               goto nla_put_failure;
 
-       if (swkey->phy.tun_id != cpu_to_be64(0))
-               NLA_PUT_BE64(skb, OVS_KEY_ATTR_TUN_ID, swkey->phy.tun_id);
+       if (swkey->tun_key.ipv4_dst &&
+           ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key))
+               goto nla_put_failure;
 
-       if (swkey->phy.in_port != DP_MAX_PORTS)
-               NLA_PUT_U32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port);
+       if (swkey->phy.in_port == DP_MAX_PORTS) {
+               if ((swkey != output) && (output->phy.in_port == 0xffff))
+                       if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff))
+                               goto nla_put_failure;
+       } else {
+               u16 upper_u16;
+               upper_u16 = (swkey == output) ? 0 : 0xffff;
+
+               if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT,
+                               (upper_u16 << 16) | output->phy.in_port))
+                       goto nla_put_failure;
+       }
+
+       if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
+               goto nla_put_failure;
 
        nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
        if (!nla)
                goto nla_put_failure;
+
        eth_key = nla_data(nla);
-       memcpy(eth_key->eth_src, swkey->eth.src, ETH_ALEN);
-       memcpy(eth_key->eth_dst, swkey->eth.dst, ETH_ALEN);
+       memcpy(eth_key->eth_src, output->eth.src, ETH_ALEN);
+       memcpy(eth_key->eth_dst, output->eth.dst, ETH_ALEN);
 
        if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) {
-               NLA_PUT_BE16(skb, OVS_KEY_ATTR_ETHERTYPE, htons(ETH_P_8021Q));
-               NLA_PUT_BE16(skb, OVS_KEY_ATTR_VLAN, swkey->eth.tci);
+               __be16 eth_type;
+               eth_type = (swkey == output) ? htons(ETH_P_8021Q) : htons(0xffff) ;
+               if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) ||
+                   nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci))
+                       goto nla_put_failure;
                encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
                if (!swkey->eth.tci)
                        goto unencap;
-       } else {
+       } else
                encap = NULL;
-       }
 
-       if (swkey->eth.type == htons(ETH_P_802_2))
+       if (swkey->eth.type == htons(ETH_P_802_2)) {
+               /*
+                * Ethertype 802.2 is represented in the netlink with omitted
+                * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and
+                * 0xffff in the mask attribute.  Ethertype can also
+                * be wildcarded.
+                */
+               if (swkey != output && output->eth.type)
+                       if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE,
+                                               output->eth.type))
+                               goto nla_put_failure;
                goto unencap;
+       }
 
-       NLA_PUT_BE16(skb, OVS_KEY_ATTR_ETHERTYPE, swkey->eth.type);
+       if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type))
+               goto nla_put_failure;
 
        if (swkey->eth.type == htons(ETH_P_IP)) {
                struct ovs_key_ipv4 *ipv4_key;
@@ -1242,12 +1764,12 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
                if (!nla)
                        goto nla_put_failure;
                ipv4_key = nla_data(nla);
-               ipv4_key->ipv4_src = swkey->ipv4.addr.src;
-               ipv4_key->ipv4_dst = swkey->ipv4.addr.dst;
-               ipv4_key->ipv4_proto = swkey->ip.proto;
-               ipv4_key->ipv4_tos = swkey->ip.tos;
-               ipv4_key->ipv4_ttl = swkey->ip.ttl;
-               ipv4_key->ipv4_frag = swkey->ip.frag;
+               ipv4_key->ipv4_src = output->ipv4.addr.src;
+               ipv4_key->ipv4_dst = output->ipv4.addr.dst;
+               ipv4_key->ipv4_proto = output->ip.proto;
+               ipv4_key->ipv4_tos = output->ip.tos;
+               ipv4_key->ipv4_ttl = output->ip.ttl;
+               ipv4_key->ipv4_frag = output->ip.frag;
        } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
                struct ovs_key_ipv6 *ipv6_key;
 
@@ -1255,16 +1777,17 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
                if (!nla)
                        goto nla_put_failure;
                ipv6_key = nla_data(nla);
-               memcpy(ipv6_key->ipv6_src, &swkey->ipv6.addr.src,
+               memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src,
                                sizeof(ipv6_key->ipv6_src));
-               memcpy(ipv6_key->ipv6_dst, &swkey->ipv6.addr.dst,
+               memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst,
                                sizeof(ipv6_key->ipv6_dst));
-               ipv6_key->ipv6_label = swkey->ipv6.label;
-               ipv6_key->ipv6_proto = swkey->ip.proto;
-               ipv6_key->ipv6_tclass = swkey->ip.tos;
-               ipv6_key->ipv6_hlimit = swkey->ip.ttl;
-               ipv6_key->ipv6_frag = swkey->ip.frag;
-       } else if (swkey->eth.type == htons(ETH_P_ARP)) {
+               ipv6_key->ipv6_label = output->ipv6.label;
+               ipv6_key->ipv6_proto = output->ip.proto;
+               ipv6_key->ipv6_tclass = output->ip.tos;
+               ipv6_key->ipv6_hlimit = output->ip.ttl;
+               ipv6_key->ipv6_frag = output->ip.frag;
+       } else if (swkey->eth.type == htons(ETH_P_ARP) ||
+                  swkey->eth.type == htons(ETH_P_RARP)) {
                struct ovs_key_arp *arp_key;
 
                nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key));
@@ -1272,11 +1795,11 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
                        goto nla_put_failure;
                arp_key = nla_data(nla);
                memset(arp_key, 0, sizeof(struct ovs_key_arp));
-               arp_key->arp_sip = swkey->ipv4.addr.src;
-               arp_key->arp_tip = swkey->ipv4.addr.dst;
-               arp_key->arp_op = htons(swkey->ip.proto);
-               memcpy(arp_key->arp_sha, swkey->ipv4.arp.sha, ETH_ALEN);
-               memcpy(arp_key->arp_tha, swkey->ipv4.arp.tha, ETH_ALEN);
+               arp_key->arp_sip = output->ipv4.addr.src;
+               arp_key->arp_tip = output->ipv4.addr.dst;
+               arp_key->arp_op = htons(output->ip.proto);
+               memcpy(arp_key->arp_sha, output->ipv4.arp.sha, ETH_ALEN);
+               memcpy(arp_key->arp_tha, output->ipv4.arp.tha, ETH_ALEN);
        }
 
        if ((swkey->eth.type == htons(ETH_P_IP) ||
@@ -1291,11 +1814,11 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
                                goto nla_put_failure;
                        tcp_key = nla_data(nla);
                        if (swkey->eth.type == htons(ETH_P_IP)) {
-                               tcp_key->tcp_src = swkey->ipv4.tp.src;
-                               tcp_key->tcp_dst = swkey->ipv4.tp.dst;
+                               tcp_key->tcp_src = output->ipv4.tp.src;
+                               tcp_key->tcp_dst = output->ipv4.tp.dst;
                        } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
-                               tcp_key->tcp_src = swkey->ipv6.tp.src;
-                               tcp_key->tcp_dst = swkey->ipv6.tp.dst;
+                               tcp_key->tcp_src = output->ipv6.tp.src;
+                               tcp_key->tcp_dst = output->ipv6.tp.dst;
                        }
                } else if (swkey->ip.proto == IPPROTO_UDP) {
                        struct ovs_key_udp *udp_key;
@@ -1305,11 +1828,11 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
                                goto nla_put_failure;
                        udp_key = nla_data(nla);
                        if (swkey->eth.type == htons(ETH_P_IP)) {
-                               udp_key->udp_src = swkey->ipv4.tp.src;
-                               udp_key->udp_dst = swkey->ipv4.tp.dst;
+                               udp_key->udp_src = output->ipv4.tp.src;
+                               udp_key->udp_dst = output->ipv4.tp.dst;
                        } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
-                               udp_key->udp_src = swkey->ipv6.tp.src;
-                               udp_key->udp_dst = swkey->ipv6.tp.dst;
+                               udp_key->udp_src = output->ipv6.tp.src;
+                               udp_key->udp_dst = output->ipv6.tp.dst;
                        }
                } else if (swkey->eth.type == htons(ETH_P_IP) &&
                           swkey->ip.proto == IPPROTO_ICMP) {
@@ -1319,8 +1842,8 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
                        if (!nla)
                                goto nla_put_failure;
                        icmp_key = nla_data(nla);
-                       icmp_key->icmp_type = ntohs(swkey->ipv4.tp.src);
-                       icmp_key->icmp_code = ntohs(swkey->ipv4.tp.dst);
+                       icmp_key->icmp_type = ntohs(output->ipv4.tp.src);
+                       icmp_key->icmp_code = ntohs(output->ipv4.tp.dst);
                } else if (swkey->eth.type == htons(ETH_P_IPV6) &&
                           swkey->ip.proto == IPPROTO_ICMPV6) {
                        struct ovs_key_icmpv6 *icmpv6_key;
@@ -1330,8 +1853,8 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
                        if (!nla)
                                goto nla_put_failure;
                        icmpv6_key = nla_data(nla);
-                       icmpv6_key->icmpv6_type = ntohs(swkey->ipv6.tp.src);
-                       icmpv6_key->icmpv6_code = ntohs(swkey->ipv6.tp.dst);
+                       icmpv6_key->icmpv6_type = ntohs(output->ipv6.tp.src);
+                       icmpv6_key->icmpv6_code = ntohs(output->ipv6.tp.dst);
 
                        if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION ||
                            icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) {
@@ -1341,10 +1864,10 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
                                if (!nla)
                                        goto nla_put_failure;
                                nd_key = nla_data(nla);
-                               memcpy(nd_key->nd_target, &swkey->ipv6.nd.target,
+                               memcpy(nd_key->nd_target, &output->ipv6.nd.target,
                                                        sizeof(nd_key->nd_target));
-                               memcpy(nd_key->nd_sll, swkey->ipv6.nd.sll, ETH_ALEN);
-                               memcpy(nd_key->nd_tll, swkey->ipv6.nd.tll, ETH_ALEN);
+                               memcpy(nd_key->nd_sll, output->ipv6.nd.sll, ETH_ALEN);
+                               memcpy(nd_key->nd_tll, output->ipv6.nd.tll, ETH_ALEN);
                        }
                }
        }
@@ -1376,3 +1899,91 @@ void ovs_flow_exit(void)
 {
        kmem_cache_destroy(flow_cache);
 }
+
+/**
+ * ovs_sw_flow_mask_alloc - allocate a flow mask that has no references yet.
+ *
+ * Only 'ref_count' is initialized (to zero); every other field is left as
+ * kmalloc() returned it.  Returns the new mask, or NULL when the
+ * allocation fails.
+ */
+struct sw_flow_mask *ovs_sw_flow_mask_alloc(void)
+{
+       struct sw_flow_mask *m = kmalloc(sizeof(*m), GFP_KERNEL);
+
+       if (!m)
+               return NULL;
+
+       m->ref_count = 0;
+       return m;
+}
+
+/* Take one more reference on 'mask' by bumping its plain counter. */
+void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *mask)
+{
+       mask->ref_count += 1;
+}
+
+/* RCU callback: recover the mask that embeds 'rcu' and free it. */
+static void rcu_free_sw_flow_mask_cb(struct rcu_head *rcu)
+{
+       kfree(container_of(rcu, struct sw_flow_mask, rcu));
+}
+
+/**
+ * ovs_sw_flow_mask_del_ref - drop one reference on a flow mask.
+ * @mask: mask to release; NULL is accepted and ignored.
+ * @deferred: when true the memory is released through call_rcu(),
+ * otherwise it is released immediately with kfree().
+ *
+ * Dropping the last reference unlinks the mask from its list and then
+ * frees it.  Calling this on a mask whose count is already zero is a BUG.
+ */
+void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred)
+{
+       if (!mask)
+               return;
+
+       BUG_ON(mask->ref_count == 0);
+
+       if (--mask->ref_count != 0)
+               return;
+
+       /* Last reference is gone: unlink first, then release the memory. */
+       list_del_rcu(&mask->list);
+       if (!deferred) {
+               kfree(mask);
+               return;
+       }
+       call_rcu(&mask->rcu, rcu_free_sw_flow_mask_cb);
+}
+
+/*
+ * Two masks are equal when they cover the same key range and their mask
+ * bytes, starting at the beginning of that range, are identical for
+ * ovs_sw_flow_mask_actual_size(a) bytes.
+ */
+static bool ovs_sw_flow_mask_equal(const struct sw_flow_mask *a,
+               const struct sw_flow_mask *b)
+{
+       const u8 *lhs, *rhs;
+
+       if (a->range.end != b->range.end)
+               return false;
+       if (a->range.start != b->range.start)
+               return false;
+
+       lhs = (const u8 *)&a->key + a->range.start;
+       rhs = (const u8 *)&b->key + b->range.start;
+
+       return memcmp(lhs, rhs, ovs_sw_flow_mask_actual_size(a)) == 0;
+}
+
+/**
+ * ovs_sw_flow_mask_find - look up a mask equal to 'mask' in 'tbl'.
+ *
+ * Walks the table's mask list in order and returns the first entry that
+ * ovs_sw_flow_mask_equal() reports as equal to 'mask', or NULL when no
+ * entry matches.
+ */
+struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *tbl,
+                                           const struct sw_flow_mask *mask)
+{
+       struct sw_flow_mask *candidate;
+
+       list_for_each_entry(candidate, tbl->mask_list, list) {
+               if (ovs_sw_flow_mask_equal(mask, candidate))
+                       return candidate;
+       }
+
+       return NULL;
+}
+
+/**
+ * ovs_sw_flow_mask_insert - link 'mask' into the mask list of 'tbl'.
+ *
+ * No duplicate check is done here: the caller has to make sure that no
+ * equal mask is already on the list.
+ */
+void ovs_sw_flow_mask_insert(struct flow_table *tbl, struct sw_flow_mask *mask)
+{
+       struct list_head *head = tbl->mask_list;
+
+       list_add_rcu(&mask->list, head);
+}
+
+/**
+ * Record 'range' as the range of 'mask' and fill the mask key bytes,
+ * starting at the range's start offset, with 'val'.  The number of bytes
+ * written is ovs_sw_flow_mask_size_roundup(mask).
+ */
+static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask,
+               struct sw_flow_key_range *range, u8 val)
+{
+       u8 *key_bytes = (u8 *)&mask->key;
+
+       /* The fill length is derived from mask->range: record it first. */
+       mask->range = *range;
+       memset(key_bytes + range->start, val,
+              ovs_sw_flow_mask_size_roundup(mask));
+}