nicira-ext: Support matching IPv6 Neighbor Discovery messages.
[sliver-openvswitch.git] / datapath / flow.c
/*
 * Distributed under the terms of the GNU GPL version 2.
 * Copyright (c) 2007, 2008, 2009, 2010, 2011 Nicira Networks.
 *
 * Significant portions of this file may be copied from parts of the Linux
 * kernel, by Linus Torvalds and others.
 */

#include "flow.h"
#include "datapath.h"
#include <asm/uaccess.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <net/llc_pdu.h>
#include <linux/kernel.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/llc.h>
#include <linux/module.h>
#include <linux/in.h>
#include <linux/rcupdate.h>
#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ndisc.h>

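/* Slab cache backing struct sw_flow allocations and the random seed used by
 * flow_hash().  The seed is filled from get_random_bytes() in flow_init() so
 * that flow hash values are not predictable across boots. */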
static struct kmem_cache *flow_cache;
static unsigned int hash_seed __read_mostly;

static inline bool arphdr_ok(struct sk_buff *skb)
{
        return skb->len >= skb_network_offset(skb) + sizeof(struct arp_eth_header);
}

static inline int check_iphdr(struct sk_buff *skb)
{
        unsigned int nh_ofs = skb_network_offset(skb);
        unsigned int ip_len;

        if (skb->len < nh_ofs + sizeof(struct iphdr))
                return -EINVAL;

        ip_len = ip_hdrlen(skb);
        if (ip_len < sizeof(struct iphdr) || skb->len < nh_ofs + ip_len)
                return -EINVAL;

        /*
         * Pull enough header bytes to account for the IP header plus the
         * longest transport header that we parse, currently 20 bytes for TCP.
         */
        if (!pskb_may_pull(skb, min(nh_ofs + ip_len + 20, skb->len)))
                return -ENOMEM;

        skb_set_transport_header(skb, nh_ofs + ip_len);
        return 0;
}

static inline bool tcphdr_ok(struct sk_buff *skb)
{
        int th_ofs = skb_transport_offset(skb);
        if (skb->len >= th_ofs + sizeof(struct tcphdr)) {
                int tcp_len = tcp_hdrlen(skb);
                return (tcp_len >= sizeof(struct tcphdr)
                        && skb->len >= th_ofs + tcp_len);
        }
        return false;
}

static inline bool udphdr_ok(struct sk_buff *skb)
{
        return skb->len >= skb_transport_offset(skb) + sizeof(struct udphdr);
}

static inline bool icmphdr_ok(struct sk_buff *skb)
{
        return skb->len >= skb_transport_offset(skb) + sizeof(struct icmphdr);
}

u64 flow_used_time(unsigned long flow_jiffies)
{
        struct timespec cur_ts;
        u64 cur_ms, idle_ms;

        ktime_get_ts(&cur_ts);
        idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
        cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC +
                 cur_ts.tv_nsec / NSEC_PER_MSEC;

        return cur_ms - idle_ms;
}

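/* Parses the IPv6 header and any extension headers, filling in the
 * addresses, traffic class and final next-header protocol in 'key'.
 * Returns the total length of the network headers (fixed header plus
 * extension headers) on success, -EINVAL if the packet is truncated or
 * malformed, or -ENOMEM if the headers cannot be pulled into the linear
 * data area. */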
static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
{
        unsigned int nh_ofs = skb_network_offset(skb);
        unsigned int nh_len;
        int payload_ofs;
        int payload_len;
        struct ipv6hdr *nh;
        uint8_t nexthdr;

        if (unlikely(skb->len < nh_ofs + sizeof(*nh)))
                return -EINVAL;

        nh = ipv6_hdr(skb);
        nexthdr = nh->nexthdr;
        payload_ofs = (u8 *)(nh + 1) - skb->data;
        payload_len = ntohs(nh->payload_len);

        memcpy(key->ipv6_src, nh->saddr.in6_u.u6_addr8, sizeof(key->ipv6_src));
        memcpy(key->ipv6_dst, nh->daddr.in6_u.u6_addr8, sizeof(key->ipv6_dst));
        key->nw_tos = ipv6_get_dsfield(nh) & ~INET_ECN_MASK;
        key->nw_proto = NEXTHDR_NONE;

        /* We don't process jumbograms. */
        if (!payload_len)
                return -EINVAL;

        if (unlikely(skb->len < nh_ofs + sizeof(*nh) + payload_len))
                return -EINVAL;

        payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr);
        if (payload_ofs < 0)
                return -EINVAL;
        nh_len = payload_ofs - nh_ofs;

        /* Ensure that the payload length claimed is at least large enough
         * for the headers we've already processed. */
        if (payload_len < nh_len - sizeof(*nh))
                return -EINVAL;

        /* Pull enough header bytes to account for the IP header plus the
         * longest transport header that we parse, currently 20 bytes for TCP.
         * To dig deeper than the transport header, transport parsers may need
         * to pull more header bytes.
         */
        if (unlikely(!pskb_may_pull(skb, min(nh_ofs + nh_len + 20, skb->len))))
                return -ENOMEM;

        skb_set_transport_header(skb, nh_ofs + nh_len);
        key->nw_proto = nexthdr;
        return nh_len;
}

static bool icmp6hdr_ok(struct sk_buff *skb)
{
        return skb->len >= skb_transport_offset(skb) + sizeof(struct icmp6hdr);
}

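/* Offset of the flags byte within the TCP header, and a mask that keeps the
 * six original flag bits (FIN, SYN, RST, PSH, ACK, URG) while discarding the
 * ECN bits that share the same byte. */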
#define TCP_FLAGS_OFFSET 13
#define TCP_FLAG_MASK 0x3f

void flow_used(struct sw_flow *flow, struct sk_buff *skb)
{
        u8 tcp_flags = 0;

        if (flow->key.dl_type == htons(ETH_P_IP) &&
            flow->key.nw_proto == IPPROTO_TCP) {
                u8 *tcp = (u8 *)tcp_hdr(skb);
                tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK;
        }

        spin_lock_bh(&flow->lock);
        flow->used = jiffies;
        flow->packet_count++;
        flow->byte_count += skb->len;
        flow->tcp_flags |= tcp_flags;
        spin_unlock_bh(&flow->lock);
}

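/* Copies the nested Netlink attribute 'actions' into a newly allocated
 * struct sw_flow_actions.  Returns the new actions on success or an
 * ERR_PTR() on failure. */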
struct sw_flow_actions *flow_actions_alloc(const struct nlattr *actions)
{
        int actions_len = nla_len(actions);
        struct sw_flow_actions *sfa;

        /* At least DP_MAX_PORTS actions are required to be able to flood a
         * packet to every port.  Factor of 2 allows for setting VLAN tags,
         * etc. */
        if (actions_len > 2 * DP_MAX_PORTS * nla_total_size(4))
                return ERR_PTR(-EINVAL);

        sfa = kmalloc(sizeof(*sfa) + actions_len, GFP_KERNEL);
        if (!sfa)
                return ERR_PTR(-ENOMEM);

        sfa->actions_len = actions_len;
        memcpy(sfa->actions, nla_data(actions), actions_len);
        return sfa;
}

struct sw_flow *flow_alloc(void)
{
        struct sw_flow *flow;

        flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
        if (!flow)
                return ERR_PTR(-ENOMEM);

        spin_lock_init(&flow->lock);
        atomic_set(&flow->refcnt, 1);
        flow->dead = false;

        return flow;
}

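/* Intended for use as a 'tbl' node destructor: marks the flow as dead and
 * drops the reference that the flow table held on it. */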
void flow_free_tbl(struct tbl_node *node)
{
        struct sw_flow *flow = flow_cast(node);

        flow->dead = true;
        flow_put(flow);
}

/* RCU callback used by flow_deferred_free. */
static void rcu_free_flow_callback(struct rcu_head *rcu)
{
        struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);

        flow->dead = true;
        flow_put(flow);
}

/* Schedules 'flow' to be freed after the next RCU grace period.
 * The caller must hold rcu_read_lock for this to be sensible. */
void flow_deferred_free(struct sw_flow *flow)
{
        call_rcu(&flow->rcu, rcu_free_flow_callback);
}

void flow_hold(struct sw_flow *flow)
{
        atomic_inc(&flow->refcnt);
}

void flow_put(struct sw_flow *flow)
{
        if (unlikely(!flow))
                return;

        if (atomic_dec_and_test(&flow->refcnt)) {
                kfree((struct sw_flow_actions __force *)flow->sf_acts);
                kmem_cache_free(flow_cache, flow);
        }
}

/* RCU callback used by flow_deferred_free_acts. */
static void rcu_free_acts_callback(struct rcu_head *rcu)
{
        struct sw_flow_actions *sf_acts = container_of(rcu,
                        struct sw_flow_actions, rcu);
        kfree(sf_acts);
}

/* Schedules 'sf_acts' to be freed after the next RCU grace period.
 * The caller must hold rcu_read_lock for this to be sensible. */
void flow_deferred_free_acts(struct sw_flow_actions *sf_acts)
{
        call_rcu(&sf_acts->rcu, rcu_free_acts_callback);
}

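/* Parses the 802.1Q tag that follows the Ethernet addresses and records its
 * TCI in 'key->dl_tci'.  VLAN_TAG_PRESENT is ORed into the stored value so
 * that a tagged frame with TCI 0 can be distinguished from an untagged
 * frame.  The tag is pulled from the skb so that the EtherType that follows
 * it can be parsed next. */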
static void parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
{
        struct qtag_prefix {
                __be16 eth_type; /* ETH_P_8021Q */
                __be16 tci;
        };
        struct qtag_prefix *qp;

        if (skb->len < sizeof(struct qtag_prefix) + sizeof(__be16))
                return;

        qp = (struct qtag_prefix *) skb->data;
        key->dl_tci = qp->tci | htons(VLAN_TAG_PRESENT);
        __skb_pull(skb, sizeof(struct qtag_prefix));
}

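/* Returns the EtherType of the frame whose type/length field starts at
 * skb->data, advancing skb->data past that field (and past the LLC/SNAP
 * header, if one is present).  Frames that are neither Ethernet II nor
 * SNAP-encapsulated are reported as ETH_P_802_2. */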
static __be16 parse_ethertype(struct sk_buff *skb)
{
        struct llc_snap_hdr {
                u8  dsap;  /* Always 0xAA */
                u8  ssap;  /* Always 0xAA */
                u8  ctrl;
                u8  oui[3];
                __be16 ethertype;
        };
        struct llc_snap_hdr *llc;
        __be16 proto;

        proto = *(__be16 *) skb->data;
        __skb_pull(skb, sizeof(__be16));

        if (ntohs(proto) >= 1536)
                return proto;

        if (unlikely(skb->len < sizeof(struct llc_snap_hdr)))
                return htons(ETH_P_802_2);

        llc = (struct llc_snap_hdr *) skb->data;
        if (llc->dsap != LLC_SAP_SNAP ||
            llc->ssap != LLC_SAP_SNAP ||
            (llc->oui[0] | llc->oui[1] | llc->oui[2]) != 0)
                return htons(ETH_P_802_2);

        __skb_pull(skb, sizeof(struct llc_snap_hdr));
        return llc->ethertype;
}

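/* Fills in the ICMPv6 type and code in 'key' and, for neighbour solicitation
 * and advertisement messages, the neighbour discovery target address and any
 * source/target link-layer address options.  A malformed ND message is not
 * treated as an error; the ND fields of 'key' are simply cleared so that the
 * flow does not match on them. */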
static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
                int nh_len)
{
        struct ipv6hdr *nh = ipv6_hdr(skb);
        int icmp_len = ntohs(nh->payload_len) + sizeof(*nh) - nh_len;
        struct icmp6hdr *icmp = icmp6_hdr(skb);

        /* The ICMPv6 type and code fields use the 16-bit transport port
         * fields, so we need to store them in 16-bit network byte order. */
        key->tp_src = htons(icmp->icmp6_type);
        key->tp_dst = htons(icmp->icmp6_code);

        if (!icmp->icmp6_code
                        && ((icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION)
                          || (icmp->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT))) {
                struct nd_msg *nd;
                int offset;

                /* In order to process neighbor discovery options, we need the
                 * entire packet. */
                if (icmp_len < sizeof(*nd))
                        goto invalid;
                if (!pskb_may_pull(skb, skb_transport_offset(skb) + icmp_len))
                        return -ENOMEM;

                nd = (struct nd_msg *)skb_transport_header(skb);
                memcpy(key->nd_target, &nd->target, sizeof(key->nd_target));

                icmp_len -= sizeof(*nd);
                offset = 0;
                while (icmp_len >= 8) {
                        struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd->opt + offset);
                        int opt_len = nd_opt->nd_opt_len * 8;

                        if (!opt_len || (opt_len > icmp_len))
                                goto invalid;

                        /* Store the link layer address if the appropriate option is
                         * provided.  It is considered an error if the same link
                         * layer option is specified twice. */
                        if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LL_ADDR
                                        && opt_len == 8) {
                                if (!is_zero_ether_addr(key->arp_sha))
                                        goto invalid;
                                memcpy(key->arp_sha,
                                                &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
                        } else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR
                                        && opt_len == 8) {
                                if (!is_zero_ether_addr(key->arp_tha))
                                        goto invalid;
                                memcpy(key->arp_tha,
                                                &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
                        }

                        icmp_len -= opt_len;
                        offset += opt_len;
                }
        }

        return 0;

invalid:
        memset(key->nd_target, 0, sizeof(key->nd_target));
        memset(key->arp_sha, 0, sizeof(key->arp_sha));
        memset(key->arp_tha, 0, sizeof(key->arp_tha));

        return 0;
}

/**
 * flow_extract - extracts a flow key from an Ethernet frame.
 * @skb: sk_buff that contains the frame, with skb->data pointing to the
 * Ethernet header
 * @in_port: port number on which @skb was received.
 * @key: output flow key
 * @is_frag: set to %true if @skb contains an IPv4 fragment, or to %false if
 * @skb does not contain an IPv4 packet or if it is not a fragment.
 *
 * The caller must ensure that skb->len >= ETH_HLEN.
 *
 * Returns 0 if successful, otherwise a negative errno value.
 *
 * Initializes @skb header pointers as follows:
 *
 *    - skb->mac_header: the Ethernet header.
 *
 *    - skb->network_header: just past the Ethernet header, or just past the
 *      VLAN header, to the first byte of the Ethernet payload.
 *
 *    - skb->transport_header: If key->dl_type is ETH_P_IP or ETH_P_IPV6
 *      on output, then just past the IP header, if one is present and
 *      of a correct length, otherwise the same as skb->network_header.
 *      For other key->dl_type values it is left untouched.
 */
int flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
                 bool *is_frag)
{
        struct ethhdr *eth;

        memset(key, 0, sizeof(*key));
        key->tun_id = OVS_CB(skb)->tun_id;
        key->in_port = in_port;
        *is_frag = false;

        /*
         * We would really like to pull as many bytes as we could possibly
         * want to parse into the linear data area.  Currently, for IPv4,
         * that is:
         *
         *    14     Ethernet header
         *     4     VLAN header
         *    60     max IP header with options
         *    20     max TCP/UDP/ICMP header (don't care about options)
         *    --
         *    98
         *
         * But Xen only allocates 64 or 72 bytes for the linear data area in
         * netback, which means that we would reallocate and copy the skb's
         * linear data on every packet if we did that.  So instead just pull 64
         * bytes, which is always sufficient without IP options, and then check
         * whether we need to pull more later when we look at the IP header.
         */
        if (!pskb_may_pull(skb, min(skb->len, 64u)))
                return -ENOMEM;

        skb_reset_mac_header(skb);

        /* Link layer. */
        eth = eth_hdr(skb);
        memcpy(key->dl_src, eth->h_source, ETH_ALEN);
        memcpy(key->dl_dst, eth->h_dest, ETH_ALEN);

        /* dl_type, dl_vlan, dl_vlan_pcp. */
        __skb_pull(skb, 2 * ETH_ALEN);
        if (eth->h_proto == htons(ETH_P_8021Q))
                parse_vlan(skb, key);
        key->dl_type = parse_ethertype(skb);
        skb_reset_network_header(skb);
        __skb_push(skb, skb->data - (unsigned char *)eth);

        /* Network layer. */
        if (key->dl_type == htons(ETH_P_IP)) {
                struct iphdr *nh;
                int error;

                error = check_iphdr(skb);
                if (unlikely(error)) {
                        if (error == -EINVAL) {
                                skb->transport_header = skb->network_header;
                                return 0;
                        }
                        return error;
                }

                nh = ip_hdr(skb);
                key->ipv4_src = nh->saddr;
                key->ipv4_dst = nh->daddr;
                key->nw_tos = nh->tos & ~INET_ECN_MASK;
                key->nw_proto = nh->protocol;

                /* Transport layer. */
                if (!(nh->frag_off & htons(IP_MF | IP_OFFSET)) &&
                    !(skb_shinfo(skb)->gso_type & SKB_GSO_UDP)) {
                        if (key->nw_proto == IPPROTO_TCP) {
                                if (tcphdr_ok(skb)) {
                                        struct tcphdr *tcp = tcp_hdr(skb);
                                        key->tp_src = tcp->source;
                                        key->tp_dst = tcp->dest;
                                }
                        } else if (key->nw_proto == IPPROTO_UDP) {
                                if (udphdr_ok(skb)) {
                                        struct udphdr *udp = udp_hdr(skb);
                                        key->tp_src = udp->source;
                                        key->tp_dst = udp->dest;
                                }
                        } else if (key->nw_proto == IPPROTO_ICMP) {
                                if (icmphdr_ok(skb)) {
                                        struct icmphdr *icmp = icmp_hdr(skb);
                                        /* The ICMP type and code fields use the 16-bit
                                         * transport port fields, so we need to store them
                                         * in 16-bit network byte order. */
                                        key->tp_src = htons(icmp->type);
                                        key->tp_dst = htons(icmp->code);
                                }
                        }
                } else
                        *is_frag = true;

        } else if (key->dl_type == htons(ETH_P_ARP) && arphdr_ok(skb)) {
                struct arp_eth_header *arp;

                arp = (struct arp_eth_header *)skb_network_header(skb);

                if (arp->ar_hrd == htons(ARPHRD_ETHER)
                                && arp->ar_pro == htons(ETH_P_IP)
                                && arp->ar_hln == ETH_ALEN
                                && arp->ar_pln == 4) {

                        /* We only match on the lower 8 bits of the opcode. */
                        if (ntohs(arp->ar_op) <= 0xff)
                                key->nw_proto = ntohs(arp->ar_op);

                        if (key->nw_proto == ARPOP_REQUEST
                                        || key->nw_proto == ARPOP_REPLY) {
                                memcpy(&key->ipv4_src, arp->ar_sip, sizeof(key->ipv4_src));
                                memcpy(&key->ipv4_dst, arp->ar_tip, sizeof(key->ipv4_dst));
                                memcpy(key->arp_sha, arp->ar_sha, ETH_ALEN);
                                memcpy(key->arp_tha, arp->ar_tha, ETH_ALEN);
                        }
                }
        } else if (key->dl_type == htons(ETH_P_IPV6)) {
                int nh_len;             /* IPv6 Header + Extensions */

                nh_len = parse_ipv6hdr(skb, key);
                if (unlikely(nh_len < 0)) {
                        if (nh_len == -EINVAL) {
                                skb->transport_header = skb->network_header;
                                return 0;
                        }
                        return nh_len;
                }

                /* Transport layer. */
                if (key->nw_proto == NEXTHDR_TCP) {
                        if (tcphdr_ok(skb)) {
                                struct tcphdr *tcp = tcp_hdr(skb);
                                key->tp_src = tcp->source;
                                key->tp_dst = tcp->dest;
                        }
                } else if (key->nw_proto == NEXTHDR_UDP) {
                        if (udphdr_ok(skb)) {
                                struct udphdr *udp = udp_hdr(skb);
                                key->tp_src = udp->source;
                                key->tp_dst = udp->dest;
                        }
                } else if (key->nw_proto == NEXTHDR_ICMP) {
                        if (icmp6hdr_ok(skb)) {
                                int error = parse_icmpv6(skb, key, nh_len);
                                if (error < 0)
                                        return error;
                        }
                }
        }
        return 0;
}

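/* Hashes and compares flow keys as opaque arrays of 32-bit words.  For this
 * to be correct every byte of struct sw_flow_key, including any padding,
 * must be initialized; flow_extract() and flow_from_nlattrs() zero the key
 * before filling it in for exactly this reason. */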
u32 flow_hash(const struct sw_flow_key *key)
{
        return jhash2((u32*)key, sizeof(*key) / sizeof(u32), hash_seed);
}

int flow_cmp(const struct tbl_node *node, void *key2_)
{
        const struct sw_flow_key *key1 = &flow_cast(node)->key;
        const struct sw_flow_key *key2 = key2_;

        return !memcmp(key1, key2, sizeof(struct sw_flow_key));
}

/**
 * flow_from_nlattrs - parses Netlink attributes into a flow key.
 * @swkey: receives the extracted flow key.
 * @attr: Netlink attribute holding nested %ODP_KEY_ATTR_* Netlink attribute
 * sequence.
 *
 * This state machine accepts the following forms, with [] for optional
 * elements and | for alternatives:
 *
 * [tun_id] in_port ethernet [8021q] [ethertype \
 *              [IPv4 [TCP|UDP|ICMP] | IPv6 [TCP|UDP|ICMPv6 [ND]] | ARP]]
 *
 * Returns 0 if successful, otherwise a negative errno value.
 */
int flow_from_nlattrs(struct sw_flow_key *swkey, const struct nlattr *attr)
{
        const struct nlattr *nla;
        u16 prev_type;
        int rem;

        memset(swkey, 0, sizeof(*swkey));
        swkey->dl_type = htons(ETH_P_802_2);

        prev_type = ODP_KEY_ATTR_UNSPEC;
        nla_for_each_nested(nla, attr, rem) {
                static const u32 key_lens[ODP_KEY_ATTR_MAX + 1] = {
                        [ODP_KEY_ATTR_TUN_ID] = 8,
                        [ODP_KEY_ATTR_IN_PORT] = 4,
                        [ODP_KEY_ATTR_ETHERNET] = sizeof(struct odp_key_ethernet),
                        [ODP_KEY_ATTR_8021Q] = sizeof(struct odp_key_8021q),
                        [ODP_KEY_ATTR_ETHERTYPE] = 2,
                        [ODP_KEY_ATTR_IPV4] = sizeof(struct odp_key_ipv4),
                        [ODP_KEY_ATTR_IPV6] = sizeof(struct odp_key_ipv6),
                        [ODP_KEY_ATTR_TCP] = sizeof(struct odp_key_tcp),
                        [ODP_KEY_ATTR_UDP] = sizeof(struct odp_key_udp),
                        [ODP_KEY_ATTR_ICMP] = sizeof(struct odp_key_icmp),
                        [ODP_KEY_ATTR_ICMPV6] = sizeof(struct odp_key_icmpv6),
                        [ODP_KEY_ATTR_ARP] = sizeof(struct odp_key_arp),
                        [ODP_KEY_ATTR_ND] = sizeof(struct odp_key_nd),
                };

                const struct odp_key_ethernet *eth_key;
                const struct odp_key_8021q *q_key;
                const struct odp_key_ipv4 *ipv4_key;
                const struct odp_key_ipv6 *ipv6_key;
                const struct odp_key_tcp *tcp_key;
                const struct odp_key_udp *udp_key;
                const struct odp_key_icmp *icmp_key;
                const struct odp_key_icmpv6 *icmpv6_key;
                const struct odp_key_arp *arp_key;
                const struct odp_key_nd *nd_key;

                int type = nla_type(nla);

                if (type > ODP_KEY_ATTR_MAX || nla_len(nla) != key_lens[type])
                        return -EINVAL;

#define TRANSITION(PREV_TYPE, TYPE) (((PREV_TYPE) << 16) | (TYPE))
                switch (TRANSITION(prev_type, type)) {
                case TRANSITION(ODP_KEY_ATTR_UNSPEC, ODP_KEY_ATTR_TUN_ID):
                        swkey->tun_id = nla_get_be64(nla);
                        break;

                case TRANSITION(ODP_KEY_ATTR_UNSPEC, ODP_KEY_ATTR_IN_PORT):
                case TRANSITION(ODP_KEY_ATTR_TUN_ID, ODP_KEY_ATTR_IN_PORT):
                        if (nla_get_u32(nla) >= DP_MAX_PORTS)
                                return -EINVAL;
                        swkey->in_port = nla_get_u32(nla);
                        break;

                case TRANSITION(ODP_KEY_ATTR_IN_PORT, ODP_KEY_ATTR_ETHERNET):
                        eth_key = nla_data(nla);
                        memcpy(swkey->dl_src, eth_key->eth_src, ETH_ALEN);
                        memcpy(swkey->dl_dst, eth_key->eth_dst, ETH_ALEN);
                        break;

                case TRANSITION(ODP_KEY_ATTR_ETHERNET, ODP_KEY_ATTR_8021Q):
                        q_key = nla_data(nla);
                        /* Only standard 0x8100 VLANs currently supported. */
                        if (q_key->q_tpid != htons(ETH_P_8021Q))
                                return -EINVAL;
                        if (q_key->q_tci & htons(VLAN_TAG_PRESENT))
                                return -EINVAL;
                        swkey->dl_tci = q_key->q_tci | htons(VLAN_TAG_PRESENT);
                        break;

                case TRANSITION(ODP_KEY_ATTR_8021Q, ODP_KEY_ATTR_ETHERTYPE):
                case TRANSITION(ODP_KEY_ATTR_ETHERNET, ODP_KEY_ATTR_ETHERTYPE):
                        swkey->dl_type = nla_get_be16(nla);
                        if (ntohs(swkey->dl_type) < 1536)
                                return -EINVAL;
                        break;

                case TRANSITION(ODP_KEY_ATTR_ETHERTYPE, ODP_KEY_ATTR_IPV4):
                        if (swkey->dl_type != htons(ETH_P_IP))
                                return -EINVAL;
                        ipv4_key = nla_data(nla);
                        swkey->ipv4_src = ipv4_key->ipv4_src;
                        swkey->ipv4_dst = ipv4_key->ipv4_dst;
                        swkey->nw_proto = ipv4_key->ipv4_proto;
                        swkey->nw_tos = ipv4_key->ipv4_tos;
                        if (swkey->nw_tos & INET_ECN_MASK)
                                return -EINVAL;
                        break;

                case TRANSITION(ODP_KEY_ATTR_ETHERTYPE, ODP_KEY_ATTR_IPV6):
                        if (swkey->dl_type != htons(ETH_P_IPV6))
                                return -EINVAL;
                        ipv6_key = nla_data(nla);
                        memcpy(swkey->ipv6_src, ipv6_key->ipv6_src,
                                        sizeof(swkey->ipv6_src));
                        memcpy(swkey->ipv6_dst, ipv6_key->ipv6_dst,
                                        sizeof(swkey->ipv6_dst));
                        swkey->nw_proto = ipv6_key->ipv6_proto;
                        swkey->nw_tos = ipv6_key->ipv6_tos;
                        if (swkey->nw_tos & INET_ECN_MASK)
                                return -EINVAL;
                        break;

                case TRANSITION(ODP_KEY_ATTR_IPV4, ODP_KEY_ATTR_TCP):
                case TRANSITION(ODP_KEY_ATTR_IPV6, ODP_KEY_ATTR_TCP):
                        if (swkey->nw_proto != IPPROTO_TCP)
                                return -EINVAL;
                        tcp_key = nla_data(nla);
                        swkey->tp_src = tcp_key->tcp_src;
                        swkey->tp_dst = tcp_key->tcp_dst;
                        break;

                case TRANSITION(ODP_KEY_ATTR_IPV4, ODP_KEY_ATTR_UDP):
                case TRANSITION(ODP_KEY_ATTR_IPV6, ODP_KEY_ATTR_UDP):
                        if (swkey->nw_proto != IPPROTO_UDP)
                                return -EINVAL;
                        udp_key = nla_data(nla);
                        swkey->tp_src = udp_key->udp_src;
                        swkey->tp_dst = udp_key->udp_dst;
                        break;

                case TRANSITION(ODP_KEY_ATTR_IPV4, ODP_KEY_ATTR_ICMP):
                        if (swkey->nw_proto != IPPROTO_ICMP)
                                return -EINVAL;
                        icmp_key = nla_data(nla);
                        swkey->tp_src = htons(icmp_key->icmp_type);
                        swkey->tp_dst = htons(icmp_key->icmp_code);
                        break;

                case TRANSITION(ODP_KEY_ATTR_IPV6, ODP_KEY_ATTR_ICMPV6):
                        if (swkey->nw_proto != IPPROTO_ICMPV6)
                                return -EINVAL;
                        icmpv6_key = nla_data(nla);
                        swkey->tp_src = htons(icmpv6_key->icmpv6_type);
                        swkey->tp_dst = htons(icmpv6_key->icmpv6_code);
                        break;

                case TRANSITION(ODP_KEY_ATTR_ETHERTYPE, ODP_KEY_ATTR_ARP):
                        if (swkey->dl_type != htons(ETH_P_ARP))
                                return -EINVAL;
                        arp_key = nla_data(nla);
                        swkey->ipv4_src = arp_key->arp_sip;
                        swkey->ipv4_dst = arp_key->arp_tip;
                        if (arp_key->arp_op & htons(0xff00))
                                return -EINVAL;
                        swkey->nw_proto = ntohs(arp_key->arp_op);
                        memcpy(swkey->arp_sha, arp_key->arp_sha, ETH_ALEN);
                        memcpy(swkey->arp_tha, arp_key->arp_tha, ETH_ALEN);
                        break;

                case TRANSITION(ODP_KEY_ATTR_ICMPV6, ODP_KEY_ATTR_ND):
                        if (swkey->tp_src != htons(NDISC_NEIGHBOUR_SOLICITATION)
                                        && swkey->tp_src != htons(NDISC_NEIGHBOUR_ADVERTISEMENT))
                                return -EINVAL;
                        nd_key = nla_data(nla);
                        memcpy(swkey->nd_target, nd_key->nd_target,
                                        sizeof(swkey->nd_target));
                        memcpy(swkey->arp_sha, nd_key->nd_sll, ETH_ALEN);
                        memcpy(swkey->arp_tha, nd_key->nd_tll, ETH_ALEN);
                        break;

                default:
                        return -EINVAL;
                }

                prev_type = type;
        }
        if (rem)
                return -EINVAL;

        switch (prev_type) {
        case ODP_KEY_ATTR_UNSPEC:
                return -EINVAL;

        case ODP_KEY_ATTR_TUN_ID:
        case ODP_KEY_ATTR_IN_PORT:
                return -EINVAL;

        case ODP_KEY_ATTR_ETHERNET:
        case ODP_KEY_ATTR_8021Q:
                return 0;

        case ODP_KEY_ATTR_ETHERTYPE:
                if (swkey->dl_type == htons(ETH_P_IP) ||
                    swkey->dl_type == htons(ETH_P_ARP))
                        return -EINVAL;
                return 0;

        case ODP_KEY_ATTR_IPV4:
                if (swkey->nw_proto == IPPROTO_TCP ||
                    swkey->nw_proto == IPPROTO_UDP ||
                    swkey->nw_proto == IPPROTO_ICMP)
                        return -EINVAL;
                return 0;

        case ODP_KEY_ATTR_IPV6:
                if (swkey->nw_proto == IPPROTO_TCP ||
                    swkey->nw_proto == IPPROTO_UDP ||
                    swkey->nw_proto == IPPROTO_ICMPV6)
                        return -EINVAL;
                return 0;

        case ODP_KEY_ATTR_ICMPV6:
                if (swkey->tp_src == htons(NDISC_NEIGHBOUR_SOLICITATION) ||
                    swkey->tp_src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT))
                        return -EINVAL;
                return 0;

        case ODP_KEY_ATTR_TCP:
        case ODP_KEY_ATTR_UDP:
        case ODP_KEY_ATTR_ICMP:
        case ODP_KEY_ATTR_ARP:
        case ODP_KEY_ATTR_ND:
                return 0;
        }

        WARN_ON_ONCE(1);
        return -EINVAL;
}

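/* Serializes 'swkey' into nested ODP_KEY_ATTR_* attributes on 'skb', the
 * inverse of flow_from_nlattrs().  Attributes are emitted in the order that
 * the state machine above accepts them, and optional attributes (tunnel ID,
 * 802.1Q tag, L3 and L4 headers) are omitted when they do not apply to the
 * flow.  Returns 0 if successful or -EMSGSIZE if 'skb' runs out of room. */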
int flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
{
        struct odp_key_ethernet *eth_key;
        struct nlattr *nla;

        if (swkey->tun_id != cpu_to_be64(0))
                NLA_PUT_BE64(skb, ODP_KEY_ATTR_TUN_ID, swkey->tun_id);

        NLA_PUT_U32(skb, ODP_KEY_ATTR_IN_PORT, swkey->in_port);

        nla = nla_reserve(skb, ODP_KEY_ATTR_ETHERNET, sizeof(*eth_key));
        if (!nla)
                goto nla_put_failure;
        eth_key = nla_data(nla);
        memcpy(eth_key->eth_src, swkey->dl_src, ETH_ALEN);
        memcpy(eth_key->eth_dst, swkey->dl_dst, ETH_ALEN);

        if (swkey->dl_tci != htons(0)) {
                struct odp_key_8021q q_key;

                q_key.q_tpid = htons(ETH_P_8021Q);
                q_key.q_tci = swkey->dl_tci & ~htons(VLAN_TAG_PRESENT);
                NLA_PUT(skb, ODP_KEY_ATTR_8021Q, sizeof(q_key), &q_key);
        }

        if (swkey->dl_type == htons(ETH_P_802_2))
                return 0;

        NLA_PUT_BE16(skb, ODP_KEY_ATTR_ETHERTYPE, swkey->dl_type);

        if (swkey->dl_type == htons(ETH_P_IP)) {
                struct odp_key_ipv4 *ipv4_key;

                nla = nla_reserve(skb, ODP_KEY_ATTR_IPV4, sizeof(*ipv4_key));
                if (!nla)
                        goto nla_put_failure;
                ipv4_key = nla_data(nla);
                ipv4_key->ipv4_src = swkey->ipv4_src;
                ipv4_key->ipv4_dst = swkey->ipv4_dst;
                ipv4_key->ipv4_proto = swkey->nw_proto;
                ipv4_key->ipv4_tos = swkey->nw_tos;
        } else if (swkey->dl_type == htons(ETH_P_IPV6)) {
                struct odp_key_ipv6 *ipv6_key;

                nla = nla_reserve(skb, ODP_KEY_ATTR_IPV6, sizeof(*ipv6_key));
                if (!nla)
                        goto nla_put_failure;
                ipv6_key = nla_data(nla);
                memcpy(ipv6_key->ipv6_src, swkey->ipv6_src,
                                sizeof(ipv6_key->ipv6_src));
                memcpy(ipv6_key->ipv6_dst, swkey->ipv6_dst,
                                sizeof(ipv6_key->ipv6_dst));
                ipv6_key->ipv6_proto = swkey->nw_proto;
                ipv6_key->ipv6_tos = swkey->nw_tos;
        } else if (swkey->dl_type == htons(ETH_P_ARP)) {
                struct odp_key_arp *arp_key;

                nla = nla_reserve(skb, ODP_KEY_ATTR_ARP, sizeof(*arp_key));
                if (!nla)
                        goto nla_put_failure;
                arp_key = nla_data(nla);
                arp_key->arp_sip = swkey->ipv4_src;
                arp_key->arp_tip = swkey->ipv4_dst;
                arp_key->arp_op = htons(swkey->nw_proto);
                memcpy(arp_key->arp_sha, swkey->arp_sha, ETH_ALEN);
                memcpy(arp_key->arp_tha, swkey->arp_tha, ETH_ALEN);
        }

        if (swkey->dl_type == htons(ETH_P_IP)
                        || swkey->dl_type == htons(ETH_P_IPV6)) {

                if (swkey->nw_proto == IPPROTO_TCP) {
                        struct odp_key_tcp *tcp_key;

                        nla = nla_reserve(skb, ODP_KEY_ATTR_TCP, sizeof(*tcp_key));
                        if (!nla)
                                goto nla_put_failure;
                        tcp_key = nla_data(nla);
                        tcp_key->tcp_src = swkey->tp_src;
                        tcp_key->tcp_dst = swkey->tp_dst;
                } else if (swkey->nw_proto == IPPROTO_UDP) {
                        struct odp_key_udp *udp_key;

                        nla = nla_reserve(skb, ODP_KEY_ATTR_UDP, sizeof(*udp_key));
                        if (!nla)
                                goto nla_put_failure;
                        udp_key = nla_data(nla);
                        udp_key->udp_src = swkey->tp_src;
                        udp_key->udp_dst = swkey->tp_dst;
                } else if (swkey->dl_type == htons(ETH_P_IP)
                                && swkey->nw_proto == IPPROTO_ICMP) {
                        struct odp_key_icmp *icmp_key;

                        nla = nla_reserve(skb, ODP_KEY_ATTR_ICMP, sizeof(*icmp_key));
                        if (!nla)
                                goto nla_put_failure;
                        icmp_key = nla_data(nla);
                        icmp_key->icmp_type = ntohs(swkey->tp_src);
                        icmp_key->icmp_code = ntohs(swkey->tp_dst);
                } else if (swkey->dl_type == htons(ETH_P_IPV6)
                                && swkey->nw_proto == IPPROTO_ICMPV6) {
                        struct odp_key_icmpv6 *icmpv6_key;

                        nla = nla_reserve(skb, ODP_KEY_ATTR_ICMPV6, sizeof(*icmpv6_key));
                        if (!nla)
                                goto nla_put_failure;
                        icmpv6_key = nla_data(nla);
                        icmpv6_key->icmpv6_type = ntohs(swkey->tp_src);
                        icmpv6_key->icmpv6_code = ntohs(swkey->tp_dst);

                        if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION
                                        || icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) {
                                struct odp_key_nd *nd_key;

                                nla = nla_reserve(skb, ODP_KEY_ATTR_ND, sizeof(*nd_key));
                                if (!nla)
                                        goto nla_put_failure;
                                nd_key = nla_data(nla);
                                memcpy(nd_key->nd_target, swkey->nd_target,
                                                        sizeof(nd_key->nd_target));
                                memcpy(nd_key->nd_sll, swkey->arp_sha, ETH_ALEN);
                                memcpy(nd_key->nd_tll, swkey->arp_tha, ETH_ALEN);
                        }
                }
        }

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}

/* Initializes the flow module.
 * Returns zero if successful or a negative error code. */
int flow_init(void)
{
        flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
                                        0, NULL);
        if (flow_cache == NULL)
                return -ENOMEM;

        get_random_bytes(&hash_seed, sizeof(hash_seed));

        return 0;
}

/* Uninitializes the flow module. */
void flow_exit(void)
{
        kmem_cache_destroy(flow_cache);
}