X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=datapath%2Fflow.c;h=2dc87aee7f5a6524988c5508cf5ff56549aecc63;hb=cdb1a85bba8adf03e6dac40e603f0cb7206fe2d6;hp=e091f50173cb81e5efb61ef1529afd15393e889d;hpb=18886b60bc7face9e08bc7ef06da365ee5c39e0a;p=sliver-openvswitch.git diff --git a/datapath/flow.c b/datapath/flow.c index e091f5017..2dc87aee7 100644 --- a/datapath/flow.c +++ b/datapath/flow.c @@ -8,7 +8,7 @@ #include "flow.h" #include "datapath.h" -#include +#include #include #include #include @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -48,13 +49,13 @@ static int check_header(struct sk_buff *skb, int len) return 0; } -static inline bool arphdr_ok(struct sk_buff *skb) +static bool arphdr_ok(struct sk_buff *skb) { return pskb_may_pull(skb, skb_network_offset(skb) + sizeof(struct arp_eth_header)); } -static inline int check_iphdr(struct sk_buff *skb) +static int check_iphdr(struct sk_buff *skb) { unsigned int nh_ofs = skb_network_offset(skb); unsigned int ip_len; @@ -73,7 +74,7 @@ static inline int check_iphdr(struct sk_buff *skb) return 0; } -static inline bool tcphdr_ok(struct sk_buff *skb) +static bool tcphdr_ok(struct sk_buff *skb) { int th_ofs = skb_transport_offset(skb); int tcp_len; @@ -89,13 +90,13 @@ static inline bool tcphdr_ok(struct sk_buff *skb) return true; } -static inline bool udphdr_ok(struct sk_buff *skb) +static bool udphdr_ok(struct sk_buff *skb) { return pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)); } -static inline bool icmphdr_ok(struct sk_buff *skb) +static bool icmphdr_ok(struct sk_buff *skb) { return pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct icmphdr)); @@ -115,8 +116,69 @@ u64 flow_used_time(unsigned long flow_jiffies) } #define SW_FLOW_KEY_OFFSET(field) \ - offsetof(struct sw_flow_key, field) + \ - FIELD_SIZEOF(struct sw_flow_key, field) + (offsetof(struct sw_flow_key, field) + \ + FIELD_SIZEOF(struct sw_flow_key, field)) + +/** + * skip_exthdr - skip any IPv6 extension headers + * @skb: skbuff to parse + * @start: offset of first extension header + * @nexthdrp: Initially, points to the type of the extension header at @start. + * This function updates it to point to the extension header at the final + * offset. + * @tos_frag: Points to the @tos_frag member in a &struct sw_flow_key. This + * function sets an appropriate %OVS_FRAG_TYPE_* value. + * + * This is based on ipv6_skip_exthdr() but adds the updates to *@tos_frag. + * + * When there is more than one fragment header, this version reports whether + * the final fragment header that it examines is a first fragment. + * + * Returns the final payload offset, or -1 on error. + */ +static int skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, + u8 *tos_frag) +{ + u8 nexthdr = *nexthdrp; + + while (ipv6_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + int hdrlen; + + if (nexthdr == NEXTHDR_NONE) + return -1; + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); + if (hp == NULL) + return -1; + if (nexthdr == NEXTHDR_FRAGMENT) { + __be16 _frag_off, *fp; + fp = skb_header_pointer(skb, + start+offsetof(struct frag_hdr, + frag_off), + sizeof(_frag_off), + &_frag_off); + if (fp == NULL) + return -1; + + *tos_frag &= ~OVS_FRAG_TYPE_MASK; + if (ntohs(*fp) & ~0x7) { + *tos_frag |= OVS_FRAG_TYPE_LATER; + break; + } + *tos_frag |= OVS_FRAG_TYPE_FIRST; + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + nexthdr = hp->nexthdr; + start += hdrlen; + } + + *nexthdrp = nexthdr; + return start; +} static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key, int *key_lenp) @@ -139,11 +201,12 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key, payload_ofs = (u8 *)(nh + 1) - skb->data; key->ip.proto = NEXTHDR_NONE; - key->ip.tos = ipv6_get_dsfield(nh) & ~INET_ECN_MASK; + key->ip.tos_frag = ipv6_get_dsfield(nh) & ~INET_ECN_MASK; ipv6_addr_copy(&key->ipv6.addr.src, &nh->saddr); ipv6_addr_copy(&key->ipv6.addr.dst, &nh->daddr); - payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr); + payload_ofs = skip_exthdr(skb, payload_ofs, + &nexthdr, &key->ip.tos_frag); if (unlikely(payload_ofs < 0)) return -EINVAL; @@ -172,12 +235,12 @@ void flow_used(struct sw_flow *flow, struct sk_buff *skb) tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK; } - spin_lock_bh(&flow->lock); + spin_lock(&flow->lock); flow->used = jiffies; flow->packet_count++; flow->byte_count += skb->len; flow->tcp_flags |= tcp_flags; - spin_unlock_bh(&flow->lock); + spin_unlock(&flow->lock); } struct sw_flow_actions *flow_actions_alloc(const struct nlattr *actions) @@ -216,14 +279,153 @@ struct sw_flow *flow_alloc(void) return flow; } -void flow_free_tbl(struct tbl_node *node) +static struct hlist_head __rcu *find_bucket(struct flow_table * table, u32 hash) +{ + return flex_array_get(table->buckets, + (hash & (table->n_buckets - 1))); +} + +static struct flex_array __rcu *alloc_buckets(unsigned int n_buckets) { - struct sw_flow *flow = flow_cast(node); + struct flex_array __rcu *buckets; + int i, err; + + buckets = flex_array_alloc(sizeof(struct hlist_head *), + n_buckets, GFP_KERNEL); + if (!buckets) + return NULL; + + err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL); + if (err) { + flex_array_free(buckets); + return NULL; + } + + for (i = 0; i < n_buckets; i++) + INIT_HLIST_HEAD((struct hlist_head *) + flex_array_get(buckets, i)); + + return buckets; +} + +static void free_buckets(struct flex_array *buckets) +{ + flex_array_free(buckets); +} + +struct flow_table *flow_tbl_alloc(int new_size) +{ + struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL); + + if (!table) + return NULL; + + table->buckets = alloc_buckets(new_size); + + if (!table->buckets) { + kfree(table); + return NULL; + } + table->n_buckets = new_size; + table->count = 0; + + return table; +} +static void flow_free(struct sw_flow *flow) +{ flow->dead = true; flow_put(flow); } +void flow_tbl_destroy(struct flow_table *table) +{ + int i; + + if (!table) + return; + + for (i = 0; i < table->n_buckets; i++) { + struct sw_flow *flow; + struct hlist_head *head = flex_array_get(table->buckets, i); + struct hlist_node *node, *n; + + hlist_for_each_entry_safe(flow, node, n, head, hash_node) { + hlist_del_init_rcu(&flow->hash_node); + flow_free(flow); + } + } + + free_buckets(table->buckets); + kfree(table); +} + +static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) +{ + struct flow_table *table = container_of(rcu, struct flow_table, rcu); + + flow_tbl_destroy(table); +} + +void flow_tbl_deferred_destroy(struct flow_table *table) +{ + if (!table) + return; + + call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb); +} + +struct sw_flow *flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *last) +{ + struct sw_flow *flow; + struct hlist_head *head; + struct hlist_node *n; + int i; + + while (*bucket < table->n_buckets) { + i = 0; + head = flex_array_get(table->buckets, *bucket); + hlist_for_each_entry_rcu(flow, n, head, hash_node) { + if (i < *last) { + i++; + continue; + } + *last = i + 1; + return flow; + } + (*bucket)++; + *last = 0; + } + + return NULL; +} + +struct flow_table *flow_tbl_expand(struct flow_table *table) +{ + struct flow_table *new_table; + int n_buckets = table->n_buckets * 2; + int i; + + new_table = flow_tbl_alloc(n_buckets); + if (!new_table) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < table->n_buckets; i++) { + struct sw_flow *flow; + struct hlist_head *head; + struct hlist_node *n, *pos; + + head = flex_array_get(table->buckets, i); + + hlist_for_each_entry_safe(flow, n, pos, head, hash_node) { + hlist_del_init_rcu(&flow->hash_node); + flow_tbl_insert(new_table, flow); + } + } + + return new_table; +} + /* RCU callback used by flow_deferred_free. */ static void rcu_free_flow_callback(struct rcu_head *rcu) { @@ -364,7 +566,8 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, icmp_len -= sizeof(*nd); offset = 0; while (icmp_len >= 8) { - struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd->opt + offset); + struct nd_opt_hdr *nd_opt = + (struct nd_opt_hdr *)(nd->opt + offset); int opt_len = nd_opt->nd_opt_len * 8; if (unlikely(!opt_len || opt_len > icmp_len)) @@ -412,8 +615,6 @@ out: * @in_port: port number on which @skb was received. * @key: output flow key * @key_lenp: length of output flow key - * @is_frag: set to 1 if @skb contains an IPv4 fragment, or to 0 if @skb does - * not contain an IPv4 packet or if it is not a fragment. * * The caller must ensure that skb->len >= ETH_HLEN. * @@ -432,16 +633,17 @@ out: * For other key->dl_type values it is left untouched. */ int flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, - int *key_lenp, bool *is_frag) + int *key_lenp) { int error = 0; int key_len = SW_FLOW_KEY_OFFSET(eth); struct ethhdr *eth; memset(key, 0, sizeof(*key)); - key->eth.tun_id = OVS_CB(skb)->tun_id; - key->eth.in_port = in_port; - *is_frag = false; + + key->phy.priority = skb->priority; + key->phy.tun_id = OVS_CB(skb)->tun_id; + key->phy.in_port = in_port; skb_reset_mac_header(skb); @@ -470,6 +672,7 @@ int flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, /* Network layer. */ if (key->eth.type == htons(ETH_P_IP)) { struct iphdr *nh; + __be16 offset; key_len = SW_FLOW_KEY_OFFSET(ipv4.addr); @@ -485,35 +688,41 @@ int flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, nh = ip_hdr(skb); key->ipv4.addr.src = nh->saddr; key->ipv4.addr.dst = nh->daddr; - key->ip.tos = nh->tos & ~INET_ECN_MASK; + key->ip.proto = nh->protocol; + key->ip.tos_frag = nh->tos & ~INET_ECN_MASK; - /* Transport layer. */ - if ((nh->frag_off & htons(IP_MF | IP_OFFSET)) || - (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)) - *is_frag = true; + offset = nh->frag_off & htons(IP_OFFSET); + if (offset) { + key->ip.tos_frag |= OVS_FRAG_TYPE_LATER; + goto out; + } + if (nh->frag_off & htons(IP_MF) || + skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + key->ip.tos_frag |= OVS_FRAG_TYPE_FIRST; + /* Transport layer. */ if (key->ip.proto == IPPROTO_TCP) { key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); - if (!*is_frag && tcphdr_ok(skb)) { + if (tcphdr_ok(skb)) { struct tcphdr *tcp = tcp_hdr(skb); key->ipv4.tp.src = tcp->source; key->ipv4.tp.dst = tcp->dest; } } else if (key->ip.proto == IPPROTO_UDP) { key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); - if (!*is_frag && udphdr_ok(skb)) { + if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); key->ipv4.tp.src = udp->source; key->ipv4.tp.dst = udp->dest; } } else if (key->ip.proto == IPPROTO_ICMP) { key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); - if (!*is_frag && icmphdr_ok(skb)) { + if (icmphdr_ok(skb)) { struct icmphdr *icmp = icmp_hdr(skb); /* The ICMP type and code fields use the 16-bit - * transport port fields, so we need to store them - * in 16-bit network byte order. */ + * transport port fields, so we need to store + * them in 16-bit network byte order. */ key->ipv4.tp.src = htons(icmp->type); key->ipv4.tp.dst = htons(icmp->code); } @@ -554,6 +763,11 @@ int flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, goto out; } + if ((key->ip.tos_frag & OVS_FRAG_TYPE_MASK) == OVS_FRAG_TYPE_LATER) + goto out; + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + key->ip.tos_frag |= OVS_FRAG_TYPE_FIRST; + /* Transport layer. */ if (key->ip.proto == NEXTHDR_TCP) { key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); @@ -586,19 +800,60 @@ out: u32 flow_hash(const struct sw_flow_key *key, int key_len) { - return jhash2((u32*)key, DIV_ROUND_UP(key_len, sizeof(u32)), hash_seed); + return jhash2((u32 *)key, DIV_ROUND_UP(key_len, sizeof(u32)), hash_seed); +} + +struct sw_flow *flow_tbl_lookup(struct flow_table *table, + struct sw_flow_key *key, int key_len) +{ + struct sw_flow *flow; + struct hlist_node *n; + struct hlist_head *head; + u32 hash; + + hash = flow_hash(key, key_len); + + head = find_bucket(table, hash); + hlist_for_each_entry_rcu(flow, n, head, hash_node) { + + if (flow->hash == hash && + !memcmp(&flow->key, key, key_len)) { + return flow; + } + } + return NULL; +} + +void flow_tbl_insert(struct flow_table *table, struct sw_flow *flow) +{ + struct hlist_head *head; + + head = find_bucket(table, flow->hash); + hlist_add_head_rcu(&flow->hash_node, head); + table->count++; +} + +void flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) +{ + if (!hlist_unhashed(&flow->hash_node)) { + hlist_del_init_rcu(&flow->hash_node); + table->count--; + BUG_ON(table->count < 0); + } } -int flow_cmp(const struct tbl_node *node, void *key2_, int len) +static int parse_tos_frag(struct sw_flow_key *swkey, u8 tos, u8 frag) { - const struct sw_flow_key *key1 = &flow_cast(node)->key; - const struct sw_flow_key *key2 = key2_; + if (tos & INET_ECN_MASK || frag > OVS_FRAG_TYPE_MAX) + return -EINVAL; - return !memcmp(key1, key2, len); + swkey->ip.tos_frag = tos | frag; + return 0; } /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ -static const u32 key_lens[OVS_KEY_ATTR_MAX + 1] = { +const u32 ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { + [OVS_KEY_ATTR_PRIORITY] = 4, [OVS_KEY_ATTR_TUN_ID] = 8, [OVS_KEY_ATTR_IN_PORT] = 4, [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet), @@ -624,20 +879,24 @@ static const u32 key_lens[OVS_KEY_ATTR_MAX + 1] = { * This state machine accepts the following forms, with [] for optional * elements and | for alternatives: * - * [tun_id] [in_port] ethernet [8021q] [ethertype \ + * [priority] [tun_id] [in_port] ethernet [8021q] [ethertype \ * [IPv4 [TCP|UDP|ICMP] | IPv6 [TCP|UDP|ICMPv6 [ND]] | ARP]] + * + * except that IPv4 or IPv6 terminates the sequence if its @ipv4_frag or + * @ipv6_frag member, respectively, equals %OVS_FRAG_TYPE_LATER. */ int flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, const struct nlattr *attr) { int error = 0; + enum ovs_frag_type frag_type; const struct nlattr *nla; u16 prev_type; int rem; int key_len; memset(swkey, 0, sizeof(*swkey)); - swkey->eth.in_port = USHRT_MAX; + swkey->phy.in_port = USHRT_MAX; swkey->eth.type = htons(ETH_P_802_2); key_len = SW_FLOW_KEY_OFFSET(eth); @@ -654,25 +913,33 @@ int flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, const struct ovs_key_arp *arp_key; const struct ovs_key_nd *nd_key; - int type = nla_type(nla); + int type = nla_type(nla); - if (type > OVS_KEY_ATTR_MAX || nla_len(nla) != key_lens[type]) - goto invalid; + if (type > OVS_KEY_ATTR_MAX || + nla_len(nla) != ovs_key_lens[type]) + goto invalid; #define TRANSITION(PREV_TYPE, TYPE) (((PREV_TYPE) << 16) | (TYPE)) switch (TRANSITION(prev_type, type)) { + case TRANSITION(OVS_KEY_ATTR_UNSPEC, OVS_KEY_ATTR_PRIORITY): + swkey->phy.priority = nla_get_u32(nla); + break; + case TRANSITION(OVS_KEY_ATTR_UNSPEC, OVS_KEY_ATTR_TUN_ID): - swkey->eth.tun_id = nla_get_be64(nla); + case TRANSITION(OVS_KEY_ATTR_PRIORITY, OVS_KEY_ATTR_TUN_ID): + swkey->phy.tun_id = nla_get_be64(nla); break; case TRANSITION(OVS_KEY_ATTR_UNSPEC, OVS_KEY_ATTR_IN_PORT): + case TRANSITION(OVS_KEY_ATTR_PRIORITY, OVS_KEY_ATTR_IN_PORT): case TRANSITION(OVS_KEY_ATTR_TUN_ID, OVS_KEY_ATTR_IN_PORT): if (nla_get_u32(nla) >= DP_MAX_PORTS) goto invalid; - swkey->eth.in_port = nla_get_u32(nla); + swkey->phy.in_port = nla_get_u32(nla); break; case TRANSITION(OVS_KEY_ATTR_UNSPEC, OVS_KEY_ATTR_ETHERNET): + case TRANSITION(OVS_KEY_ATTR_PRIORITY, OVS_KEY_ATTR_ETHERNET): case TRANSITION(OVS_KEY_ATTR_TUN_ID, OVS_KEY_ATTR_ETHERNET): case TRANSITION(OVS_KEY_ATTR_IN_PORT, OVS_KEY_ATTR_ETHERNET): eth_key = nla_data(nla); @@ -703,11 +970,11 @@ int flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, goto invalid; ipv4_key = nla_data(nla); swkey->ip.proto = ipv4_key->ipv4_proto; - swkey->ip.tos = ipv4_key->ipv4_tos; + if (parse_tos_frag(swkey, ipv4_key->ipv4_tos, + ipv4_key->ipv4_frag)) + goto invalid; swkey->ipv4.addr.src = ipv4_key->ipv4_src; swkey->ipv4.addr.dst = ipv4_key->ipv4_dst; - if (swkey->ip.tos & INET_ECN_MASK) - goto invalid; break; case TRANSITION(OVS_KEY_ATTR_ETHERTYPE, OVS_KEY_ATTR_IPV6): @@ -716,13 +983,13 @@ int flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, goto invalid; ipv6_key = nla_data(nla); swkey->ip.proto = ipv6_key->ipv6_proto; - swkey->ip.tos = ipv6_key->ipv6_tos; + if (parse_tos_frag(swkey, ipv6_key->ipv6_tos, + ipv6_key->ipv6_frag)) + goto invalid; memcpy(&swkey->ipv6.addr.src, ipv6_key->ipv6_src, sizeof(swkey->ipv6.addr.src)); memcpy(&swkey->ipv6.addr.dst, ipv6_key->ipv6_dst, sizeof(swkey->ipv6.addr.dst)); - if (swkey->ip.tos & INET_ECN_MASK) - goto invalid; break; case TRANSITION(OVS_KEY_ATTR_IPV4, OVS_KEY_ATTR_TCP): @@ -814,10 +1081,12 @@ int flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, if (rem) goto invalid; + frag_type = swkey->ip.tos_frag & OVS_FRAG_TYPE_MASK; switch (prev_type) { case OVS_KEY_ATTR_UNSPEC: goto invalid; + case OVS_KEY_ATTR_PRIORITY: case OVS_KEY_ATTR_TUN_ID: case OVS_KEY_ATTR_IN_PORT: goto invalid; @@ -828,11 +1097,14 @@ int flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, case OVS_KEY_ATTR_ETHERTYPE: if (swkey->eth.type == htons(ETH_P_IP) || + swkey->eth.type == htons(ETH_P_IPV6) || swkey->eth.type == htons(ETH_P_ARP)) goto invalid; goto ok; case OVS_KEY_ATTR_IPV4: + if (frag_type == OVS_FRAG_TYPE_LATER) + goto ok; if (swkey->ip.proto == IPPROTO_TCP || swkey->ip.proto == IPPROTO_UDP || swkey->ip.proto == IPPROTO_ICMP) @@ -840,6 +1112,8 @@ int flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, goto ok; case OVS_KEY_ATTR_IPV6: + if (frag_type == OVS_FRAG_TYPE_LATER) + goto ok; if (swkey->ip.proto == IPPROTO_TCP || swkey->ip.proto == IPPROTO_UDP || swkey->ip.proto == IPPROTO_ICMPV6) @@ -848,15 +1122,20 @@ int flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, case OVS_KEY_ATTR_ICMPV6: if (swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) || - swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) + swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT) || + frag_type == OVS_FRAG_TYPE_LATER) goto invalid; goto ok; case OVS_KEY_ATTR_TCP: case OVS_KEY_ATTR_UDP: case OVS_KEY_ATTR_ICMP: - case OVS_KEY_ATTR_ARP: case OVS_KEY_ATTR_ND: + if (frag_type == OVS_FRAG_TYPE_LATER) + goto invalid; + goto ok; + + case OVS_KEY_ATTR_ARP: goto ok; default: @@ -867,7 +1146,6 @@ invalid: error = -EINVAL; ok: - WARN_ON_ONCE(!key_len && !error); *key_lenp = key_len; return error; } @@ -884,7 +1162,7 @@ ok: * get the metadata, that is, the parts of the flow key that cannot be * extracted from the packet itself. */ -int flow_metadata_from_nlattrs(u16 *in_port, __be64 *tun_id, +int flow_metadata_from_nlattrs(u32 *priority, u16 *in_port, __be64 *tun_id, const struct nlattr *attr) { const struct nlattr *nla; @@ -893,20 +1171,27 @@ int flow_metadata_from_nlattrs(u16 *in_port, __be64 *tun_id, *in_port = USHRT_MAX; *tun_id = 0; + *priority = 0; prev_type = OVS_KEY_ATTR_UNSPEC; nla_for_each_nested(nla, attr, rem) { - int type = nla_type(nla); + int type = nla_type(nla); - if (type > OVS_KEY_ATTR_MAX || nla_len(nla) != key_lens[type]) - return -EINVAL; + if (type > OVS_KEY_ATTR_MAX || nla_len(nla) != ovs_key_lens[type]) + return -EINVAL; switch (TRANSITION(prev_type, type)) { + case TRANSITION(OVS_KEY_ATTR_UNSPEC, OVS_KEY_ATTR_PRIORITY): + *priority = nla_get_u32(nla); + break; + case TRANSITION(OVS_KEY_ATTR_UNSPEC, OVS_KEY_ATTR_TUN_ID): + case TRANSITION(OVS_KEY_ATTR_PRIORITY, OVS_KEY_ATTR_TUN_ID): *tun_id = nla_get_be64(nla); break; case TRANSITION(OVS_KEY_ATTR_UNSPEC, OVS_KEY_ATTR_IN_PORT): + case TRANSITION(OVS_KEY_ATTR_PRIORITY, OVS_KEY_ATTR_IN_PORT): case TRANSITION(OVS_KEY_ATTR_TUN_ID, OVS_KEY_ATTR_IN_PORT): if (nla_get_u32(nla) >= DP_MAX_PORTS) return -EINVAL; @@ -929,16 +1214,14 @@ int flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) struct ovs_key_ethernet *eth_key; struct nlattr *nla; - /* This is an imperfect sanity-check that FLOW_BUFSIZE doesn't need - * to be updated, but will at least raise awareness when new - * datapath key types are added. */ - BUILD_BUG_ON(__OVS_KEY_ATTR_MAX != 14); + if (swkey->phy.priority) + NLA_PUT_U32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority); - if (swkey->eth.tun_id != cpu_to_be64(0)) - NLA_PUT_BE64(skb, OVS_KEY_ATTR_TUN_ID, swkey->eth.tun_id); + if (swkey->phy.tun_id != cpu_to_be64(0)) + NLA_PUT_BE64(skb, OVS_KEY_ATTR_TUN_ID, swkey->phy.tun_id); - if (swkey->eth.in_port != USHRT_MAX) - NLA_PUT_U32(skb, OVS_KEY_ATTR_IN_PORT, swkey->eth.in_port); + if (swkey->phy.in_port != USHRT_MAX) + NLA_PUT_U32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port); nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); if (!nla) @@ -971,7 +1254,8 @@ int flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) ipv4_key->ipv4_src = swkey->ipv4.addr.src; ipv4_key->ipv4_dst = swkey->ipv4.addr.dst; ipv4_key->ipv4_proto = swkey->ip.proto; - ipv4_key->ipv4_tos = swkey->ip.tos; + ipv4_key->ipv4_tos = swkey->ip.tos_frag & ~INET_ECN_MASK; + ipv4_key->ipv4_frag = swkey->ip.tos_frag & OVS_FRAG_TYPE_MASK; } else if (swkey->eth.type == htons(ETH_P_IPV6)) { struct ovs_key_ipv6 *ipv6_key; @@ -985,7 +1269,8 @@ int flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) memcpy(ipv6_key->ipv6_dst, &swkey->ipv6.addr.dst, sizeof(ipv6_key->ipv6_dst)); ipv6_key->ipv6_proto = swkey->ip.proto; - ipv6_key->ipv6_tos = swkey->ip.tos; + ipv6_key->ipv6_tos = swkey->ip.tos_frag & ~INET_ECN_MASK; + ipv6_key->ipv6_frag = swkey->ip.tos_frag & OVS_FRAG_TYPE_MASK; } else if (swkey->eth.type == htons(ETH_P_ARP)) { struct ovs_key_arp *arp_key; @@ -1001,8 +1286,9 @@ int flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) memcpy(arp_key->arp_tha, swkey->ipv4.arp.tha, ETH_ALEN); } - if (swkey->eth.type == htons(ETH_P_IP) || - swkey->eth.type == htons(ETH_P_IPV6)) { + if ((swkey->eth.type == htons(ETH_P_IP) || + swkey->eth.type == htons(ETH_P_IPV6)) && + (swkey->ip.tos_frag & OVS_FRAG_TYPE_MASK) != OVS_FRAG_TYPE_LATER) { if (swkey->ip.proto == IPPROTO_TCP) { struct ovs_key_tcp *tcp_key;