datapath: Remove vport MAC address configuration.
[sliver-openvswitch.git] / datapath / tunnel.c
1 /*
2  * Copyright (c) 2007-2012 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/if_arp.h>
22 #include <linux/if_ether.h>
23 #include <linux/ip.h>
24 #include <linux/if_vlan.h>
25 #include <linux/igmp.h>
26 #include <linux/in.h>
27 #include <linux/in_route.h>
28 #include <linux/inetdevice.h>
29 #include <linux/jhash.h>
30 #include <linux/list.h>
31 #include <linux/kernel.h>
32 #include <linux/version.h>
33 #include <linux/workqueue.h>
34 #include <linux/rculist.h>
35
36 #include <net/dsfield.h>
37 #include <net/dst.h>
38 #include <net/icmp.h>
39 #include <net/inet_ecn.h>
40 #include <net/ip.h>
41 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
42 #include <net/ipv6.h>
43 #endif
44 #include <net/route.h>
45 #include <net/xfrm.h>
46
47 #include "checksum.h"
48 #include "datapath.h"
49 #include "tunnel.h"
50 #include "vlan.h"
51 #include "vport.h"
52 #include "vport-internal_dev.h"
53
54 #define PORT_TABLE_SIZE  1024
55
56 static struct hlist_head *port_table __read_mostly;
57
58 /*
59  * These are just used as an optimization: they don't require any kind of
60  * synchronization because we could have just as easily read the value before
61  * the port change happened.
62  */
63 static unsigned int key_local_remote_ports __read_mostly;
64 static unsigned int key_remote_ports __read_mostly;
65 static unsigned int key_multicast_ports __read_mostly;
66 static unsigned int local_remote_ports __read_mostly;
67 static unsigned int remote_ports __read_mostly;
68 static unsigned int null_ports __read_mostly;
69 static unsigned int multicast_ports __read_mostly;
70
71 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
72 #define rt_dst(rt) (rt->dst)
73 #else
74 #define rt_dst(rt) (rt->u.dst)
75 #endif
76
/* Returns the vport that vport_from_priv() associates with 'tnl_vport'. */
static struct vport *tnl_vport_to_vport(const struct tnl_vport *tnl_vport)
{
        struct vport *owner = vport_from_priv(tnl_vport);

        return owner;
}
81
82 static void free_config_rcu(struct rcu_head *rcu)
83 {
84         struct tnl_mutable_config *c = container_of(rcu, struct tnl_mutable_config, rcu);
85         kfree(c);
86 }
87
88 /* Frees the portion of 'mutable' that requires RTNL and thus can't happen
89  * within an RCU callback.  Fortunately this part doesn't require waiting for
90  * an RCU grace period.
91  */
92 static void free_mutable_rtnl(struct tnl_mutable_config *mutable)
93 {
94         ASSERT_RTNL();
95         if (ipv4_is_multicast(mutable->key.daddr) && mutable->mlink) {
96                 struct in_device *in_dev;
97                 in_dev = inetdev_by_index(port_key_get_net(&mutable->key), mutable->mlink);
98                 if (in_dev)
99                         ip_mc_dec_group(in_dev, mutable->key.daddr);
100         }
101 }
102
103 static void assign_config_rcu(struct vport *vport,
104                               struct tnl_mutable_config *new_config)
105 {
106         struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
107         struct tnl_mutable_config *old_config;
108
109         old_config = rtnl_dereference(tnl_vport->mutable);
110         rcu_assign_pointer(tnl_vport->mutable, new_config);
111
112         free_mutable_rtnl(old_config);
113         call_rcu(&old_config->rcu, free_config_rcu);
114 }
115
116 static unsigned int *find_port_pool(const struct tnl_mutable_config *mutable)
117 {
118         bool is_multicast = ipv4_is_multicast(mutable->key.daddr);
119
120         if (mutable->flags & TNL_F_IN_KEY_MATCH) {
121                 if (mutable->key.saddr)
122                         return &local_remote_ports;
123                 else if (is_multicast)
124                         return &multicast_ports;
125                 else
126                         return &remote_ports;
127         } else {
128                 if (mutable->key.saddr)
129                         return &key_local_remote_ports;
130                 else if (is_multicast)
131                         return &key_multicast_ports;
132                 else if (mutable->key.daddr)
133                         return &key_remote_ports;
134                 else
135                         return &null_ports;
136         }
137 }
138
139 static u32 port_hash(const struct port_lookup_key *key)
140 {
141         return jhash2((u32 *)key, (PORT_KEY_LEN / sizeof(u32)), 0);
142 }
143
144 static struct hlist_head *find_bucket(u32 hash)
145 {
146         return &port_table[(hash & (PORT_TABLE_SIZE - 1))];
147 }
148
149 static void port_table_add_port(struct vport *vport)
150 {
151         struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
152         const struct tnl_mutable_config *mutable;
153         u32 hash;
154
155         mutable = rtnl_dereference(tnl_vport->mutable);
156         hash = port_hash(&mutable->key);
157         hlist_add_head_rcu(&tnl_vport->hash_node, find_bucket(hash));
158
159         (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))++;
160 }
161
162 static void port_table_move_port(struct vport *vport,
163                       struct tnl_mutable_config *new_mutable)
164 {
165         struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
166         u32 hash;
167
168         hash = port_hash(&new_mutable->key);
169         hlist_del_init_rcu(&tnl_vport->hash_node);
170         hlist_add_head_rcu(&tnl_vport->hash_node, find_bucket(hash));
171
172         (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))--;
173         assign_config_rcu(vport, new_mutable);
174         (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))++;
175 }
176
177 static void port_table_remove_port(struct vport *vport)
178 {
179         struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
180
181         hlist_del_init_rcu(&tnl_vport->hash_node);
182
183         (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))--;
184 }
185
186 static struct vport *port_table_lookup(struct port_lookup_key *key,
187                                        const struct tnl_mutable_config **pmutable)
188 {
189         struct hlist_node *n;
190         struct hlist_head *bucket;
191         u32 hash = port_hash(key);
192         struct tnl_vport *tnl_vport;
193
194         bucket = find_bucket(hash);
195
196         hlist_for_each_entry_rcu(tnl_vport, n, bucket, hash_node) {
197                 struct tnl_mutable_config *mutable;
198
199                 mutable = rcu_dereference_rtnl(tnl_vport->mutable);
200                 if (!memcmp(&mutable->key, key, PORT_KEY_LEN)) {
201                         *pmutable = mutable;
202                         return tnl_vport_to_vport(tnl_vport);
203                 }
204         }
205
206         return NULL;
207 }
208
209 struct vport *ovs_tnl_find_port(struct net *net, __be32 saddr, __be32 daddr,
210                                 __be64 key, int tunnel_type,
211                                 const struct tnl_mutable_config **mutable)
212 {
213         struct port_lookup_key lookup;
214         struct vport *vport;
215         bool is_multicast = ipv4_is_multicast(saddr);
216
217         port_key_set_net(&lookup, net);
218         lookup.saddr = saddr;
219         lookup.daddr = daddr;
220
221         /* First try for exact match on in_key. */
222         lookup.in_key = key;
223         lookup.tunnel_type = tunnel_type | TNL_T_KEY_EXACT;
224         if (!is_multicast && key_local_remote_ports) {
225                 vport = port_table_lookup(&lookup, mutable);
226                 if (vport)
227                         return vport;
228         }
229         if (key_remote_ports) {
230                 lookup.saddr = 0;
231                 vport = port_table_lookup(&lookup, mutable);
232                 if (vport)
233                         return vport;
234
235                 lookup.saddr = saddr;
236         }
237
238         /* Then try matches that wildcard in_key. */
239         lookup.in_key = 0;
240         lookup.tunnel_type = tunnel_type | TNL_T_KEY_MATCH;
241         if (!is_multicast && local_remote_ports) {
242                 vport = port_table_lookup(&lookup, mutable);
243                 if (vport)
244                         return vport;
245         }
246         if (remote_ports) {
247                 lookup.saddr = 0;
248                 vport = port_table_lookup(&lookup, mutable);
249                 if (vport)
250                         return vport;
251         }
252
253         if (is_multicast) {
254                 lookup.saddr = 0;
255                 lookup.daddr = saddr;
256                 if (key_multicast_ports) {
257                         lookup.tunnel_type = tunnel_type | TNL_T_KEY_EXACT;
258                         lookup.in_key = key;
259                         vport = port_table_lookup(&lookup, mutable);
260                         if (vport)
261                                 return vport;
262                 }
263                 if (multicast_ports) {
264                         lookup.tunnel_type = tunnel_type | TNL_T_KEY_MATCH;
265                         lookup.in_key = 0;
266                         vport = port_table_lookup(&lookup, mutable);
267                         if (vport)
268                                 return vport;
269                 }
270         }
271
272         if (null_ports) {
273                 lookup.daddr = 0;
274                 lookup.saddr = 0;
275                 lookup.in_key = 0;
276                 lookup.tunnel_type = tunnel_type;
277                 vport = port_table_lookup(&lookup, mutable);
278                 if (vport)
279                         return vport;
280         }
281         return NULL;
282 }
283
284 static void ecn_decapsulate(struct sk_buff *skb)
285 {
286         if (unlikely(INET_ECN_is_ce(OVS_CB(skb)->tun_key->ipv4_tos))) {
287                 __be16 protocol = skb->protocol;
288
289                 skb_set_network_header(skb, ETH_HLEN);
290
291                 if (protocol == htons(ETH_P_8021Q)) {
292                         if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
293                                 return;
294
295                         protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
296                         skb_set_network_header(skb, VLAN_ETH_HLEN);
297                 }
298
299                 if (protocol == htons(ETH_P_IP)) {
300                         if (unlikely(!pskb_may_pull(skb, skb_network_offset(skb)
301                             + sizeof(struct iphdr))))
302                                 return;
303
304                         IP_ECN_set_ce(ip_hdr(skb));
305                 }
306 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
307                 else if (protocol == htons(ETH_P_IPV6)) {
308                         if (unlikely(!pskb_may_pull(skb, skb_network_offset(skb)
309                             + sizeof(struct ipv6hdr))))
310                                 return;
311
312                         IP6_ECN_set_ce(ipv6_hdr(skb));
313                 }
314 #endif
315         }
316 }
317
318 /**
319  *      ovs_tnl_rcv - ingress point for generic tunnel code
320  *
321  * @vport: port this packet was received on
322  * @skb: received packet
323  * @tos: ToS from encapsulating IP packet, used to copy ECN bits
324  *
325  * Must be called with rcu_read_lock.
326  *
327  * Packets received by this function are in the following state:
328  * - skb->data points to the inner Ethernet header.
329  * - The inner Ethernet header is in the linear data area.
330  * - skb->csum does not include the inner Ethernet header.
331  * - The layer pointers are undefined.
332  */
333 void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb)
334 {
335         struct ethhdr *eh;
336
337         skb_reset_mac_header(skb);
338         eh = eth_hdr(skb);
339
340         if (likely(ntohs(eh->h_proto) >= 1536))
341                 skb->protocol = eh->h_proto;
342         else
343                 skb->protocol = htons(ETH_P_802_2);
344
345         skb_dst_drop(skb);
346         nf_reset(skb);
347         skb_clear_rxhash(skb);
348         secpath_reset(skb);
349
350         ecn_decapsulate(skb);
351         vlan_set_tci(skb, 0);
352
353         if (unlikely(compute_ip_summed(skb, false))) {
354                 kfree_skb(skb);
355                 return;
356         }
357
358         ovs_vport_receive(vport, skb);
359 }
360
361 static struct rtable *find_route(struct net *net,
362                 __be32 *saddr, __be32 daddr, u8 ipproto,
363                 u8 tos)
364 {
365         struct rtable *rt;
366         /* Tunnel configuration keeps DSCP part of TOS bits, But Linux
367          * router expect RT_TOS bits only. */
368
369 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
370         struct flowi fl = { .nl_u = { .ip4_u = {
371                                         .daddr = daddr,
372                                         .saddr = *saddr,
373                                         .tos   = RT_TOS(tos) } },
374                                         .proto = ipproto };
375
376         if (unlikely(ip_route_output_key(net, &rt, &fl)))
377                 return ERR_PTR(-EADDRNOTAVAIL);
378         *saddr = fl.nl_u.ip4_u.saddr;
379         return rt;
380 #else
381         struct flowi4 fl = { .daddr = daddr,
382                              .saddr = *saddr,
383                              .flowi4_tos = RT_TOS(tos),
384                              .flowi4_proto = ipproto };
385
386         rt = ip_route_output_key(net, &fl);
387         *saddr = fl.saddr;
388         return rt;
389 #endif
390 }
391
392 static bool need_linearize(const struct sk_buff *skb)
393 {
394         int i;
395
396         if (unlikely(skb_shinfo(skb)->frag_list))
397                 return true;
398
399         /*
400          * Generally speaking we should linearize if there are paged frags.
401          * However, if all of the refcounts are 1 we know nobody else can
402          * change them from underneath us and we can skip the linearization.
403          */
404         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
405                 if (unlikely(page_count(skb_frag_page(&skb_shinfo(skb)->frags[i])) > 1))
406                         return true;
407
408         return false;
409 }
410
411 static struct sk_buff *handle_offloads(struct sk_buff *skb,
412                                        const struct tnl_mutable_config *mutable,
413                                        const struct rtable *rt,
414                                        int tunnel_hlen)
415 {
416         int min_headroom;
417         int err;
418
419         min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
420                         + tunnel_hlen
421                         + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
422
423         if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
424                 int head_delta = SKB_DATA_ALIGN(min_headroom -
425                                                 skb_headroom(skb) +
426                                                 16);
427                 err = pskb_expand_head(skb, max_t(int, head_delta, 0),
428                                         0, GFP_ATOMIC);
429                 if (unlikely(err))
430                         goto error_free;
431         }
432
433         forward_ip_summed(skb, true);
434
435         if (skb_is_gso(skb)) {
436                 struct sk_buff *nskb;
437
438                 nskb = skb_gso_segment(skb, 0);
439                 if (IS_ERR(nskb)) {
440                         kfree_skb(skb);
441                         err = PTR_ERR(nskb);
442                         goto error;
443                 }
444
445                 consume_skb(skb);
446                 skb = nskb;
447         } else if (get_ip_summed(skb) == OVS_CSUM_PARTIAL) {
448                 /* Pages aren't locked and could change at any time.
449                  * If this happens after we compute the checksum, the
450                  * checksum will be wrong.  We linearize now to avoid
451                  * this problem.
452                  */
453                 if (unlikely(need_linearize(skb))) {
454                         err = __skb_linearize(skb);
455                         if (unlikely(err))
456                                 goto error_free;
457                 }
458
459                 err = skb_checksum_help(skb);
460                 if (unlikely(err))
461                         goto error_free;
462         }
463
464         set_ip_summed(skb, OVS_CSUM_NONE);
465
466         return skb;
467
468 error_free:
469         kfree_skb(skb);
470 error:
471         return ERR_PTR(err);
472 }
473
474 static int send_frags(struct sk_buff *skb,
475                       int tunnel_hlen)
476 {
477         int sent_len;
478
479         sent_len = 0;
480         while (skb) {
481                 struct sk_buff *next = skb->next;
482                 int frag_len = skb->len - tunnel_hlen;
483                 int err;
484
485                 skb->next = NULL;
486                 memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
487
488                 err = ip_local_out(skb);
489                 skb = next;
490                 if (unlikely(net_xmit_eval(err)))
491                         goto free_frags;
492                 sent_len += frag_len;
493         }
494
495         return sent_len;
496
497 free_frags:
498         /*
499          * There's no point in continuing to send fragments once one has been
500          * dropped so just free the rest.  This may help improve the congestion
501          * that caused the first packet to be dropped.
502          */
503         ovs_tnl_free_linked_skbs(skb);
504         return sent_len;
505 }
506
507 int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
508 {
509         struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
510         const struct tnl_mutable_config *mutable = rcu_dereference(tnl_vport->mutable);
511         enum vport_err_type err = VPORT_E_TX_ERROR;
512         struct rtable *rt;
513         struct ovs_key_ipv4_tunnel tun_key;
514         int sent_len = 0;
515         int tunnel_hlen;
516         __be16 frag_off;
517         __be32 daddr;
518         __be32 saddr;
519         u8 ttl;
520         u8 tos;
521
522         /* Validate the protocol headers before we try to use them. */
523         if (skb->protocol == htons(ETH_P_8021Q) &&
524             !vlan_tx_tag_present(skb)) {
525                 if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
526                         goto error_free;
527
528                 skb->protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
529                 skb_set_network_header(skb, VLAN_ETH_HLEN);
530         }
531
532         if (skb->protocol == htons(ETH_P_IP)) {
533                 if (unlikely(!pskb_may_pull(skb, skb_network_offset(skb)
534                     + sizeof(struct iphdr))))
535                         skb->protocol = 0;
536         }
537 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
538         else if (skb->protocol == htons(ETH_P_IPV6)) {
539                 if (unlikely(!pskb_may_pull(skb, skb_network_offset(skb)
540                     + sizeof(struct ipv6hdr))))
541                         skb->protocol = 0;
542         }
543 #endif
544
545         /* If OVS_CB(skb)->tun_key is NULL, point it at the local tun_key here,
546          * and zero it out.
547          */
548         if (!OVS_CB(skb)->tun_key) {
549                 memset(&tun_key, 0, sizeof(tun_key));
550                 OVS_CB(skb)->tun_key = &tun_key;
551         }
552
553         tunnel_hlen = tnl_vport->tnl_ops->hdr_len(mutable, OVS_CB(skb)->tun_key);
554         if (unlikely(tunnel_hlen < 0)) {
555                 err = VPORT_E_TX_DROPPED;
556                 goto error_free;
557         }
558         tunnel_hlen += sizeof(struct iphdr);
559
560         if (OVS_CB(skb)->tun_key->ipv4_dst) {
561                 daddr = OVS_CB(skb)->tun_key->ipv4_dst;
562                 saddr = OVS_CB(skb)->tun_key->ipv4_src;
563                 tos = OVS_CB(skb)->tun_key->ipv4_tos;
564                 ttl = OVS_CB(skb)->tun_key->ipv4_ttl;
565                 frag_off = OVS_CB(skb)->tun_key->tun_flags &
566                                 OVS_TNL_F_DONT_FRAGMENT ?  htons(IP_DF) : 0;
567         } else {
568                 u8 inner_tos;
569                 daddr = mutable->key.daddr;
570                 saddr = mutable->key.saddr;
571
572                 if (unlikely(!daddr)) {
573                         /* Trying to sent packet from Null-port without
574                          * tunnel info? Drop this packet. */
575                         err = VPORT_E_TX_DROPPED;
576                         goto error_free;
577                 }
578
579                 /* ToS */
580                 if (skb->protocol == htons(ETH_P_IP))
581                         inner_tos = ip_hdr(skb)->tos;
582 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
583                 else if (skb->protocol == htons(ETH_P_IPV6))
584                         inner_tos = ipv6_get_dsfield(ipv6_hdr(skb));
585 #endif
586                 else
587                         inner_tos = 0;
588
589                 if (mutable->flags & TNL_F_TOS_INHERIT)
590                         tos = inner_tos;
591                 else
592                         tos = mutable->tos;
593
594                 tos = INET_ECN_encapsulate(tos, inner_tos);
595
596                 /* TTL */
597                 ttl = mutable->ttl;
598                 if (mutable->flags & TNL_F_TTL_INHERIT) {
599                         if (skb->protocol == htons(ETH_P_IP))
600                                 ttl = ip_hdr(skb)->ttl;
601 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
602                         else if (skb->protocol == htons(ETH_P_IPV6))
603                                 ttl = ipv6_hdr(skb)->hop_limit;
604 #endif
605                 }
606
607                 frag_off = mutable->flags & TNL_F_DF_DEFAULT ? htons(IP_DF) : 0;
608         }
609
610         /* Route lookup */
611         rt = find_route(port_key_get_net(&mutable->key), &saddr, daddr,
612                           tnl_vport->tnl_ops->ipproto, tos);
613         if (IS_ERR(rt))
614                 goto error_free;
615
616         /* Reset SKB */
617         nf_reset(skb);
618         secpath_reset(skb);
619         skb_dst_drop(skb);
620         skb_clear_rxhash(skb);
621
622         /* Offloading */
623         skb = handle_offloads(skb, mutable, rt, tunnel_hlen);
624         if (IS_ERR(skb)) {
625                 skb = NULL;
626                 goto err_free_rt;
627         }
628
629         /* TTL Fixup. */
630         if (!OVS_CB(skb)->tun_key->ipv4_dst) {
631                 if (!(mutable->flags & TNL_F_TTL_INHERIT)) {
632                         if (!ttl)
633                                 ttl = ip4_dst_hoplimit(&rt_dst(rt));
634                 }
635         }
636
637         while (skb) {
638                 struct iphdr *iph;
639                 struct sk_buff *next_skb = skb->next;
640                 skb->next = NULL;
641
642                 if (unlikely(vlan_deaccel_tag(skb)))
643                         goto next;
644
645                 skb_push(skb, tunnel_hlen);
646                 skb_reset_network_header(skb);
647                 skb_set_transport_header(skb, sizeof(struct iphdr));
648
649                 if (next_skb)
650                         skb_dst_set(skb, dst_clone(&rt_dst(rt)));
651                 else
652                         skb_dst_set(skb, &rt_dst(rt));
653
654                 /* Push IP header. */
655                 iph = ip_hdr(skb);
656                 iph->version    = 4;
657                 iph->ihl        = sizeof(struct iphdr) >> 2;
658                 iph->protocol   = tnl_vport->tnl_ops->ipproto;
659                 iph->daddr      = daddr;
660                 iph->saddr      = saddr;
661                 iph->tos        = tos;
662                 iph->ttl        = ttl;
663                 iph->frag_off   = frag_off;
664                 ip_select_ident(iph, &rt_dst(rt), NULL);
665
666                 /* Push Tunnel header. */
667                 skb = tnl_vport->tnl_ops->build_header(vport, mutable,
668                                                         &rt_dst(rt), skb, tunnel_hlen);
669                 if (unlikely(!skb))
670                         goto next;
671
672                 sent_len += send_frags(skb, tunnel_hlen);
673
674 next:
675                 skb = next_skb;
676         }
677
678         if (unlikely(sent_len == 0))
679                 ovs_vport_record_error(vport, VPORT_E_TX_DROPPED);
680
681         return sent_len;
682
683 err_free_rt:
684         ip_rt_put(rt);
685 error_free:
686         ovs_tnl_free_linked_skbs(skb);
687         ovs_vport_record_error(vport, err);
688         return sent_len;
689 }
690
691 static const struct nla_policy tnl_policy[OVS_TUNNEL_ATTR_MAX + 1] = {
692         [OVS_TUNNEL_ATTR_FLAGS]    = { .type = NLA_U32 },
693         [OVS_TUNNEL_ATTR_DST_IPV4] = { .type = NLA_U32 },
694         [OVS_TUNNEL_ATTR_SRC_IPV4] = { .type = NLA_U32 },
695         [OVS_TUNNEL_ATTR_OUT_KEY]  = { .type = NLA_U64 },
696         [OVS_TUNNEL_ATTR_IN_KEY]   = { .type = NLA_U64 },
697         [OVS_TUNNEL_ATTR_TOS]      = { .type = NLA_U8 },
698         [OVS_TUNNEL_ATTR_TTL]      = { .type = NLA_U8 },
699         [OVS_TUNNEL_ATTR_DST_PORT] = { .type = NLA_U16 },
700 };
701
702 /* Sets OVS_TUNNEL_ATTR_* fields in 'mutable', which must initially be
703  * zeroed. */
704 static int tnl_set_config(struct net *net, struct nlattr *options,
705                           const struct tnl_ops *tnl_ops,
706                           const struct vport *cur_vport,
707                           struct tnl_mutable_config *mutable)
708 {
709         const struct vport *old_vport;
710         const struct tnl_mutable_config *old_mutable;
711         struct nlattr *a[OVS_TUNNEL_ATTR_MAX + 1];
712         int err;
713
714         port_key_set_net(&mutable->key, net);
715         mutable->key.tunnel_type = tnl_ops->tunnel_type;
716         if (!options)
717                 goto out;
718
719         err = nla_parse_nested(a, OVS_TUNNEL_ATTR_MAX, options, tnl_policy);
720         if (err)
721                 return err;
722
723         /* Process attributes possibly useful for null_ports first */
724         if (a[OVS_TUNNEL_ATTR_DST_PORT])
725                 mutable->dst_port =
726                         htons(nla_get_u16(a[OVS_TUNNEL_ATTR_DST_PORT]));
727
728         if (a[OVS_TUNNEL_ATTR_DST_IPV4])
729                 mutable->key.daddr = nla_get_be32(a[OVS_TUNNEL_ATTR_DST_IPV4]);
730
731         /* Skip the rest if configuring a null_port */
732         if (!mutable->key.daddr)
733                 goto out;
734
735         if (a[OVS_TUNNEL_ATTR_FLAGS])
736                 mutable->flags = nla_get_u32(a[OVS_TUNNEL_ATTR_FLAGS])
737                         & TNL_F_PUBLIC;
738
739         if (a[OVS_TUNNEL_ATTR_SRC_IPV4]) {
740                 if (ipv4_is_multicast(mutable->key.daddr))
741                         return -EINVAL;
742                 mutable->key.saddr = nla_get_be32(a[OVS_TUNNEL_ATTR_SRC_IPV4]);
743         }
744
745         if (a[OVS_TUNNEL_ATTR_TOS]) {
746                 mutable->tos = nla_get_u8(a[OVS_TUNNEL_ATTR_TOS]);
747                 /* Reject ToS config with ECN bits set. */
748                 if (mutable->tos & INET_ECN_MASK)
749                         return -EINVAL;
750         }
751
752         if (a[OVS_TUNNEL_ATTR_TTL])
753                 mutable->ttl = nla_get_u8(a[OVS_TUNNEL_ATTR_TTL]);
754
755         if (!a[OVS_TUNNEL_ATTR_IN_KEY]) {
756                 mutable->key.tunnel_type |= TNL_T_KEY_MATCH;
757                 mutable->flags |= TNL_F_IN_KEY_MATCH;
758         } else {
759                 mutable->key.tunnel_type |= TNL_T_KEY_EXACT;
760                 mutable->key.in_key = nla_get_be64(a[OVS_TUNNEL_ATTR_IN_KEY]);
761         }
762
763         if (!a[OVS_TUNNEL_ATTR_OUT_KEY])
764                 mutable->flags |= TNL_F_OUT_KEY_ACTION;
765         else
766                 mutable->out_key = nla_get_be64(a[OVS_TUNNEL_ATTR_OUT_KEY]);
767
768         mutable->mlink = 0;
769         if (ipv4_is_multicast(mutable->key.daddr)) {
770                 struct net_device *dev;
771                 struct rtable *rt;
772                 __be32 saddr = mutable->key.saddr;
773
774                 rt = find_route(port_key_get_net(&mutable->key),
775                              &saddr, mutable->key.daddr,
776                              tnl_ops->ipproto, mutable->tos);
777                 if (IS_ERR(rt))
778                         return -EADDRNOTAVAIL;
779                 dev = rt_dst(rt).dev;
780                 ip_rt_put(rt);
781                 if (__in_dev_get_rtnl(dev) == NULL)
782                         return -EADDRNOTAVAIL;
783                 mutable->mlink = dev->ifindex;
784                 ip_mc_inc_group(__in_dev_get_rtnl(dev), mutable->key.daddr);
785         }
786
787 out:
788         old_vport = port_table_lookup(&mutable->key, &old_mutable);
789         if (old_vport && old_vport != cur_vport)
790                 return -EEXIST;
791
792         return 0;
793 }
794
795 struct vport *ovs_tnl_create(const struct vport_parms *parms,
796                              const struct vport_ops *vport_ops,
797                              const struct tnl_ops *tnl_ops)
798 {
799         struct vport *vport;
800         struct tnl_vport *tnl_vport;
801         struct tnl_mutable_config *mutable;
802         int initial_frag_id;
803         int err;
804
805         vport = ovs_vport_alloc(sizeof(struct tnl_vport), vport_ops, parms);
806         if (IS_ERR(vport)) {
807                 err = PTR_ERR(vport);
808                 goto error;
809         }
810
811         tnl_vport = tnl_vport_priv(vport);
812
813         strcpy(tnl_vport->name, parms->name);
814         tnl_vport->tnl_ops = tnl_ops;
815
816         mutable = kzalloc(sizeof(struct tnl_mutable_config), GFP_KERNEL);
817         if (!mutable) {
818                 err = -ENOMEM;
819                 goto error_free_vport;
820         }
821
822         get_random_bytes(&initial_frag_id, sizeof(int));
823         atomic_set(&tnl_vport->frag_id, initial_frag_id);
824
825         err = tnl_set_config(ovs_dp_get_net(parms->dp), parms->options, tnl_ops,
826                              NULL, mutable);
827         if (err)
828                 goto error_free_mutable;
829
830         rcu_assign_pointer(tnl_vport->mutable, mutable);
831
832         port_table_add_port(vport);
833         return vport;
834
835 error_free_mutable:
836         free_mutable_rtnl(mutable);
837         kfree(mutable);
838 error_free_vport:
839         ovs_vport_free(vport);
840 error:
841         return ERR_PTR(err);
842 }
843
844 int ovs_tnl_set_options(struct vport *vport, struct nlattr *options)
845 {
846         struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
847         const struct tnl_mutable_config *old_mutable;
848         struct tnl_mutable_config *mutable;
849         int err;
850
851         old_mutable = rtnl_dereference(tnl_vport->mutable);
852         if (!old_mutable->key.daddr)
853                 return -EINVAL;
854
855         mutable = kzalloc(sizeof(struct tnl_mutable_config), GFP_KERNEL);
856         if (!mutable) {
857                 err = -ENOMEM;
858                 goto error;
859         }
860
861         mutable->seq = old_mutable->seq + 1;
862
863         /* Parse the others configured by userspace. */
864         err = tnl_set_config(ovs_dp_get_net(vport->dp), options, tnl_vport->tnl_ops,
865                              vport, mutable);
866         if (err)
867                 goto error_free;
868
869         if (port_hash(&mutable->key) != port_hash(&old_mutable->key))
870                 port_table_move_port(vport, mutable);
871         else
872                 assign_config_rcu(vport, mutable);
873
874         return 0;
875
876 error_free:
877         free_mutable_rtnl(mutable);
878         kfree(mutable);
879 error:
880         return err;
881 }
882
883 int ovs_tnl_get_options(const struct vport *vport, struct sk_buff *skb)
884 {
885         const struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
886         const struct tnl_mutable_config *mutable = rcu_dereference_rtnl(tnl_vport->mutable);
887
888         if (mutable->dst_port && nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT,
889                                              ntohs(mutable->dst_port)))
890                 goto nla_put_failure;
891
892         /* Skip the rest for null_ports */
893         if (!mutable->key.daddr)
894                 return 0;
895
896         if (nla_put_be32(skb, OVS_TUNNEL_ATTR_DST_IPV4, mutable->key.daddr))
897                 goto nla_put_failure;
898         if (nla_put_u32(skb, OVS_TUNNEL_ATTR_FLAGS,
899                         mutable->flags & TNL_F_PUBLIC))
900                 goto nla_put_failure;
901         if (!(mutable->flags & TNL_F_IN_KEY_MATCH) &&
902             nla_put_be64(skb, OVS_TUNNEL_ATTR_IN_KEY, mutable->key.in_key))
903                 goto nla_put_failure;
904         if (!(mutable->flags & TNL_F_OUT_KEY_ACTION) &&
905             nla_put_be64(skb, OVS_TUNNEL_ATTR_OUT_KEY, mutable->out_key))
906                 goto nla_put_failure;
907         if (mutable->key.saddr &&
908             nla_put_be32(skb, OVS_TUNNEL_ATTR_SRC_IPV4, mutable->key.saddr))
909                 goto nla_put_failure;
910         if (mutable->tos && nla_put_u8(skb, OVS_TUNNEL_ATTR_TOS, mutable->tos))
911                 goto nla_put_failure;
912         if (mutable->ttl && nla_put_u8(skb, OVS_TUNNEL_ATTR_TTL, mutable->ttl))
913                 goto nla_put_failure;
914
915         return 0;
916
917 nla_put_failure:
918         return -EMSGSIZE;
919 }
920
921 static void free_port_rcu(struct rcu_head *rcu)
922 {
923         struct tnl_vport *tnl_vport = container_of(rcu,
924                                                    struct tnl_vport, rcu);
925
926         kfree((struct tnl_mutable __force *)tnl_vport->mutable);
927         ovs_vport_free(tnl_vport_to_vport(tnl_vport));
928 }
929
930 void ovs_tnl_destroy(struct vport *vport)
931 {
932         struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
933         struct tnl_mutable_config *mutable;
934
935         mutable = rtnl_dereference(tnl_vport->mutable);
936         port_table_remove_port(vport);
937         free_mutable_rtnl(mutable);
938         call_rcu(&tnl_vport->rcu, free_port_rcu);
939 }
940
941 const char *ovs_tnl_get_name(const struct vport *vport)
942 {
943         const struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
944         return tnl_vport->name;
945 }
946
947 void ovs_tnl_free_linked_skbs(struct sk_buff *skb)
948 {
949         while (skb) {
950                 struct sk_buff *next = skb->next;
951                 kfree_skb(skb);
952                 skb = next;
953         }
954 }
955
956 int ovs_tnl_init(void)
957 {
958         int i;
959
960         port_table = kmalloc(PORT_TABLE_SIZE * sizeof(struct hlist_head *),
961                              GFP_KERNEL);
962         if (!port_table)
963                 return -ENOMEM;
964
965         for (i = 0; i < PORT_TABLE_SIZE; i++)
966                 INIT_HLIST_HEAD(&port_table[i]);
967
968         return 0;
969 }
970
971 void ovs_tnl_exit(void)
972 {
973         kfree(port_table);
974 }