datapath: Abstract tunneling implementation from GRE.
[sliver-openvswitch.git] datapath/tunnel.c
/*
 * Copyright (c) 2010 Nicira Networks.
 * Distributed under the terms of the GNU GPL version 2.
 *
 * Significant portions of this file may be copied from parts of the Linux
 * kernel, by Linus Torvalds and others.
 */

#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/in_route.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/version.h>

#include <net/dsfield.h>
#include <net/dst.h>
#include <net/icmp.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
#include <net/ipv6.h>
#endif
#include <net/route.h>
#include <net/xfrm.h>

#include "actions.h"
#include "datapath.h"
#include "table.h"
#include "tunnel.h"
#include "vport.h"
#include "vport-generic.h"

/* Protected by RCU. */
static struct tbl *port_table;

/*
 * These are just used as an optimization: they don't require any kind of
 * synchronization because we could have just as easily read the value before
 * the port change happened.
 */
static unsigned int key_local_remote_ports;
static unsigned int key_remote_ports;
static unsigned int local_remote_ports;
static unsigned int remote_ports;

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
#define rt_dst(rt) (rt->dst)
#else
#define rt_dst(rt) (rt->u.dst)
#endif

static inline struct vport *tnl_vport_to_vport(const struct tnl_vport *tnl_vport)
{
        return vport_from_priv(tnl_vport);
}

static inline struct tnl_vport *tnl_vport_table_cast(const struct tbl_node *node)
{
        return container_of(node, struct tnl_vport, tbl_node);
}

/* RCU callback. */
static void free_config(struct rcu_head *rcu)
{
        struct tnl_mutable_config *c = container_of(rcu, struct tnl_mutable_config, rcu);
        kfree(c);
}

static void assign_config_rcu(struct vport *vport,
                              struct tnl_mutable_config *new_config)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct tnl_mutable_config *old_config;

        old_config = rcu_dereference(tnl_vport->mutable);
        rcu_assign_pointer(tnl_vport->mutable, new_config);
        call_rcu(&old_config->rcu, free_config);
}

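/*
 * Returns the counter for the class of ports that 'mutable' belongs to,
 * based on whether the port matches on key and whether it is bound to a
 * local address.  tnl_find_port() uses these counters to skip hash lookups
 * for classes that contain no ports.
 */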
static unsigned int *find_port_pool(const struct tnl_mutable_config *mutable)
{
        if (mutable->port_config.flags & TNL_F_IN_KEY_MATCH) {
                if (mutable->port_config.saddr)
                        return &local_remote_ports;
                else
                        return &remote_ports;
        } else {
                if (mutable->port_config.saddr)
                        return &key_local_remote_ports;
                else
                        return &key_remote_ports;
        }
}

enum lookup_key {
        LOOKUP_TUNNEL_TYPE      = 0,
        LOOKUP_SADDR            = 1,
        LOOKUP_DADDR            = 2,
        LOOKUP_KEY              = 3,
};

struct port_lookup_key {
        u32 vals[4];                    /* Contains enum lookup_key keys. */
        const struct tnl_mutable_config *mutable;
};

/*
 * Modifies 'target' to store the rcu_dereferenced pointer that was used to do
 * the comparison.
 */
static int port_cmp(const struct tbl_node *node, void *target)
{
        const struct tnl_vport *tnl_vport = tnl_vport_table_cast(node);
        struct port_lookup_key *lookup = target;

        lookup->mutable = rcu_dereference(tnl_vport->mutable);

        return (lookup->mutable->tunnel_type == lookup->vals[LOOKUP_TUNNEL_TYPE]) &&
               lookup->mutable->port_config.daddr == lookup->vals[LOOKUP_DADDR] &&
               lookup->mutable->port_config.in_key == lookup->vals[LOOKUP_KEY] &&
               lookup->mutable->port_config.saddr == lookup->vals[LOOKUP_SADDR];
}

static u32 port_hash(struct port_lookup_key *lookup)
{
        return jhash2(lookup->vals, ARRAY_SIZE(lookup->vals), 0);
}

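/*
 * Adds 'vport' to the port hash table, creating the table on first use and
 * expanding it once there are more entries than buckets.  On success, bumps
 * the counter for the port's lookup class.
 */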
static int add_port(struct vport *vport)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct port_lookup_key lookup;
        int err;

        if (!port_table) {
                struct tbl *new_table;

                new_table = tbl_create(0);
                if (!new_table)
                        return -ENOMEM;

                rcu_assign_pointer(port_table, new_table);

        } else if (tbl_count(port_table) > tbl_n_buckets(port_table)) {
                struct tbl *old_table = port_table;
                struct tbl *new_table;

                new_table = tbl_expand(old_table);
                if (IS_ERR(new_table))
                        return PTR_ERR(new_table);

                rcu_assign_pointer(port_table, new_table);
                tbl_deferred_destroy(old_table, NULL);
        }

        lookup.vals[LOOKUP_SADDR] = tnl_vport->mutable->port_config.saddr;
        lookup.vals[LOOKUP_DADDR] = tnl_vport->mutable->port_config.daddr;
        lookup.vals[LOOKUP_KEY] = tnl_vport->mutable->port_config.in_key;
        lookup.vals[LOOKUP_TUNNEL_TYPE] = tnl_vport->mutable->tunnel_type;

        err = tbl_insert(port_table, &tnl_vport->tbl_node, port_hash(&lookup));
        if (err)
                return err;

        (*find_port_pool(tnl_vport->mutable))++;

        return 0;
}

static int del_port(struct vport *vport)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        int err;

        err = tbl_remove(port_table, &tnl_vport->tbl_node);
        if (err)
                return err;

        (*find_port_pool(tnl_vport->mutable))--;

        return 0;
}

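/*
 * Looks up the tunnel vport that should receive a packet with the given
 * outer addresses and key.  Ports with an exact key are preferred over
 * key-match (wildcard) ports, and within each class a port bound to a local
 * address is preferred over one that accepts any local address.  On success,
 * sets '*mutable' to the matched port's config.
 */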
struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be32 key,
                            int tunnel_type,
                            const struct tnl_mutable_config **mutable)
{
        struct port_lookup_key lookup;
        struct tbl *table = rcu_dereference(port_table);
        struct tbl_node *tbl_node;

        if (!table)
                return NULL;

        lookup.vals[LOOKUP_SADDR] = saddr;
        lookup.vals[LOOKUP_DADDR] = daddr;

        if (tunnel_type & TNL_T_KEY_EXACT) {
                lookup.vals[LOOKUP_KEY] = key;
                lookup.vals[LOOKUP_TUNNEL_TYPE] = tunnel_type & ~TNL_T_KEY_MATCH;

                if (key_local_remote_ports) {
                        tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
                        if (tbl_node)
                                goto found;
                }

                if (key_remote_ports) {
                        lookup.vals[LOOKUP_SADDR] = 0;

                        tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
                        if (tbl_node)
                                goto found;

                        lookup.vals[LOOKUP_SADDR] = saddr;
                }
        }

        if (tunnel_type & TNL_T_KEY_MATCH) {
                lookup.vals[LOOKUP_KEY] = 0;
                lookup.vals[LOOKUP_TUNNEL_TYPE] = tunnel_type & ~TNL_T_KEY_EXACT;

                if (local_remote_ports) {
                        tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
                        if (tbl_node)
                                goto found;
                }

                if (remote_ports) {
                        lookup.vals[LOOKUP_SADDR] = 0;

                        tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
                        if (tbl_node)
                                goto found;
                }
        }

        return NULL;

found:
        *mutable = lookup.mutable;
        return tnl_vport_to_vport(tnl_vport_table_cast(tbl_node));
}

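/*
 * Returns true if 'addr' is a plausible unicast address: not multicast,
 * limited broadcast, loopback, or zero-network.
 */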
static bool check_ipv4_address(__be32 addr)
{
        if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr)
            || ipv4_is_loopback(addr) || ipv4_is_zeronet(addr))
                return false;

        return true;
}

static bool ipv4_should_icmp(struct sk_buff *skb)
{
        struct iphdr *old_iph = ip_hdr(skb);

        /* Don't respond to L2 broadcast. */
        if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
                return false;

        /* Don't respond to L3 broadcast or invalid addresses. */
        if (!check_ipv4_address(old_iph->daddr) ||
            !check_ipv4_address(old_iph->saddr))
                return false;

        /* Only respond to the first fragment. */
        if (old_iph->frag_off & htons(IP_OFFSET))
                return false;

        /* Don't respond to ICMP error messages. */
        if (old_iph->protocol == IPPROTO_ICMP) {
                u8 icmp_type, *icmp_typep;

                icmp_typep = skb_header_pointer(skb, (u8 *)old_iph +
                                                (old_iph->ihl << 2) +
                                                offsetof(struct icmphdr, type) -
                                                skb->data, sizeof(icmp_type),
                                                &icmp_type);

                if (!icmp_typep)
                        return false;

                if (*icmp_typep > NR_ICMP_TYPES
                        || (*icmp_typep <= ICMP_PARAMETERPROB
                                && *icmp_typep != ICMP_ECHOREPLY
                                && *icmp_typep != ICMP_ECHO))
                        return false;
        }

        return true;
}

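/*
 * Builds an ICMP fragmentation-needed message in 'nskb' advertising 'mtu',
 * addressed back to the sender of 'skb' and quoting the first
 * 'payload_length' bytes of the original IP packet.
 */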
static void ipv4_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
                            unsigned int mtu, unsigned int payload_length)
{
        struct iphdr *iph, *old_iph = ip_hdr(skb);
        struct icmphdr *icmph;
        u8 *payload;

        iph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
        icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr));
        payload = skb_put(nskb, payload_length);

        /* IP */
        iph->version            =       4;
        iph->ihl                =       sizeof(struct iphdr) >> 2;
        iph->tos                =       (old_iph->tos & IPTOS_TOS_MASK) |
                                        IPTOS_PREC_INTERNETCONTROL;
        iph->tot_len            =       htons(sizeof(struct iphdr)
                                              + sizeof(struct icmphdr)
                                              + payload_length);
        get_random_bytes(&iph->id, sizeof(iph->id));
        iph->frag_off           =       0;
        iph->ttl                =       IPDEFTTL;
        iph->protocol           =       IPPROTO_ICMP;
        iph->daddr              =       old_iph->saddr;
        iph->saddr              =       old_iph->daddr;

        ip_send_check(iph);

        /* ICMP */
        icmph->type             =       ICMP_DEST_UNREACH;
        icmph->code             =       ICMP_FRAG_NEEDED;
        icmph->un.gateway       =       htonl(mtu);
        icmph->checksum         =       0;

        nskb->csum = csum_partial((u8 *)icmph, sizeof(struct icmphdr), 0);
        nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_iph - skb->data,
                                            payload, payload_length,
                                            nskb->csum);
        icmph->checksum = csum_fold(nskb->csum);
}

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
static bool ipv6_should_icmp(struct sk_buff *skb)
{
        struct ipv6hdr *old_ipv6h = ipv6_hdr(skb);
        int addr_type;
        int payload_off = (u8 *)(old_ipv6h + 1) - skb->data;
        u8 nexthdr = ipv6_hdr(skb)->nexthdr;

        /* Check source address is valid. */
        addr_type = ipv6_addr_type(&old_ipv6h->saddr);
        if (addr_type & IPV6_ADDR_MULTICAST || addr_type == IPV6_ADDR_ANY)
                return false;

        /* Don't reply to unspecified addresses. */
        if (ipv6_addr_type(&old_ipv6h->daddr) == IPV6_ADDR_ANY)
                return false;

        /* Don't respond to ICMP error messages. */
        payload_off = ipv6_skip_exthdr(skb, payload_off, &nexthdr);
        if (payload_off < 0)
                return false;

        if (nexthdr == NEXTHDR_ICMP) {
                u8 icmp_type, *icmp_typep;

                icmp_typep = skb_header_pointer(skb, payload_off +
                                                offsetof(struct icmp6hdr,
                                                        icmp6_type),
                                                sizeof(icmp_type), &icmp_type);

                if (!icmp_typep || !(*icmp_typep & ICMPV6_INFOMSG_MASK))
                        return false;
        }

        return true;
}

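/*
 * Builds an ICMPv6 packet-too-big message in 'nskb' advertising 'mtu';
 * the IPv6 counterpart of ipv4_build_icmp().
 */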
static void ipv6_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
                            unsigned int mtu, unsigned int payload_length)
{
        struct ipv6hdr *ipv6h, *old_ipv6h = ipv6_hdr(skb);
        struct icmp6hdr *icmp6h;
        u8 *payload;

        ipv6h = (struct ipv6hdr *)skb_put(nskb, sizeof(struct ipv6hdr));
        icmp6h = (struct icmp6hdr *)skb_put(nskb, sizeof(struct icmp6hdr));
        payload = skb_put(nskb, payload_length);

        /* IPv6 */
        ipv6h->version          =       6;
        ipv6h->priority         =       0;
        memset(&ipv6h->flow_lbl, 0, sizeof(ipv6h->flow_lbl));
        ipv6h->payload_len      =       htons(sizeof(struct icmp6hdr)
                                              + payload_length);
        ipv6h->nexthdr          =       NEXTHDR_ICMP;
        ipv6h->hop_limit        =       IPV6_DEFAULT_HOPLIMIT;
        ipv6_addr_copy(&ipv6h->daddr, &old_ipv6h->saddr);
        ipv6_addr_copy(&ipv6h->saddr, &old_ipv6h->daddr);

        /* ICMPv6 */
        icmp6h->icmp6_type      =       ICMPV6_PKT_TOOBIG;
        icmp6h->icmp6_code      =       0;
        icmp6h->icmp6_cksum     =       0;
        icmp6h->icmp6_mtu       =       htonl(mtu);

        nskb->csum = csum_partial((u8 *)icmp6h, sizeof(struct icmp6hdr), 0);
        nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_ipv6h - skb->data,
                                            payload, payload_length,
                                            nskb->csum);
        icmp6h->icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
                                                sizeof(struct icmp6hdr)
                                                + payload_length,
                                                ipv6h->nexthdr, nskb->csum);
}
#endif /* IPv6 */

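/*
 * Generates an ICMP fragmentation-needed (IPv4) or packet-too-big (IPv6)
 * message for a packet that exceeds the tunnel MTU and feeds it back into
 * the datapath as if it had arrived on 'vport'.  Returns true if the
 * original packet should be dropped (a reply was generated or none was
 * warranted), false if the caller should transmit it anyway.
 */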
bool tnl_frag_needed(struct vport *vport, const struct tnl_mutable_config *mutable,
                     struct sk_buff *skb, unsigned int mtu, __be32 flow_key)
{
        unsigned int eth_hdr_len = ETH_HLEN;
        unsigned int total_length = 0, header_length = 0, payload_length;
        struct ethhdr *eh, *old_eh = eth_hdr(skb);
        struct sk_buff *nskb;

        /* Sanity check */
        if (skb->protocol == htons(ETH_P_IP)) {
                if (mtu < IP_MIN_MTU)
                        return false;

                if (!ipv4_should_icmp(skb))
                        return true;
        }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                if (mtu < IPV6_MIN_MTU)
                        return false;

                /*
                 * In theory we should do PMTUD on IPv6 multicast messages but
                 * we don't have an address to send from, so just fragment.
                 */
                if (ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST)
                        return false;

                if (!ipv6_should_icmp(skb))
                        return true;
        }
#endif
        else
                return false;

        /* Allocate */
        if (old_eh->h_proto == htons(ETH_P_8021Q))
                eth_hdr_len = VLAN_ETH_HLEN;

        payload_length = skb->len - eth_hdr_len;
        if (skb->protocol == htons(ETH_P_IP)) {
                header_length = sizeof(struct iphdr) + sizeof(struct icmphdr);
                total_length = min_t(unsigned int, header_length +
                                                   payload_length, 576);
        }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else {
                header_length = sizeof(struct ipv6hdr) +
                                sizeof(struct icmp6hdr);
                total_length = min_t(unsigned int, header_length +
                                                  payload_length, IPV6_MIN_MTU);
        }
#endif

        total_length = min(total_length, mutable->mtu);
        payload_length = total_length - header_length;

        nskb = dev_alloc_skb(NET_IP_ALIGN + eth_hdr_len + header_length +
                             payload_length);
        if (!nskb)
                return false;

        skb_reserve(nskb, NET_IP_ALIGN);

        /* Ethernet / VLAN */
        eh = (struct ethhdr *)skb_put(nskb, eth_hdr_len);
        memcpy(eh->h_dest, old_eh->h_source, ETH_ALEN);
        memcpy(eh->h_source, mutable->eth_addr, ETH_ALEN);
        nskb->protocol = eh->h_proto = old_eh->h_proto;
        if (old_eh->h_proto == htons(ETH_P_8021Q)) {
                struct vlan_ethhdr *vh = (struct vlan_ethhdr *)eh;

                vh->h_vlan_TCI = vlan_eth_hdr(skb)->h_vlan_TCI;
                vh->h_vlan_encapsulated_proto = skb->protocol;
        }
        skb_reset_mac_header(nskb);

        /* Protocol */
        if (skb->protocol == htons(ETH_P_IP))
                ipv4_build_icmp(skb, nskb, mtu, payload_length);
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else
                ipv6_build_icmp(skb, nskb, mtu, payload_length);
#endif

        /*
         * Assume that flow-based keys are symmetric with respect to input
         * and output and use the key that we were going to put on the
         * outgoing packet for the fake received packet.  If the keys are
         * not symmetric then PMTUD needs to be disabled since we won't have
         * any way of synthesizing packets.
         */
        if ((mutable->port_config.flags & (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION)) ==
            (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION))
                OVS_CB(nskb)->tun_id = flow_key;

        compute_ip_summed(nskb, false);
        vport_receive(vport, nskb);

        return true;
}

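/*
 * Ensures that 'skb' has at least 'headroom' bytes of headroom and that its
 * header is not shared with a clone.  Returns either the original skb or a
 * reallocated copy (in which case the original is freed), or an ERR_PTR on
 * allocation failure.
 */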
static struct sk_buff *check_headroom(struct sk_buff *skb, int headroom)
{
        if (skb_headroom(skb) < headroom || skb_header_cloned(skb)) {
                struct sk_buff *nskb = skb_realloc_headroom(skb, headroom + 16);
                if (unlikely(!nskb)) {
                        kfree_skb(skb);
                        return ERR_PTR(-ENOMEM);
                }

                set_skb_csum_bits(skb, nskb);

                if (skb->sk)
                        skb_set_owner_w(nskb, skb->sk);

                dev_kfree_skb(skb);
                return nskb;
        }

        return skb;
}

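/*
 * Combines the configured outer TOS with the inner packet's ECN bits to
 * produce the TOS for the encapsulating IP header.
 */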
static inline u8 ecn_encapsulate(u8 tos, struct sk_buff *skb)
{
        u8 inner;

        if (skb->protocol == htons(ETH_P_IP))
                inner = ((struct iphdr *)skb_network_header(skb))->tos;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else if (skb->protocol == htons(ETH_P_IPV6))
                inner = ipv6_get_dsfield((struct ipv6hdr *)skb_network_header(skb));
#endif
        else
                inner = 0;

        return INET_ECN_encapsulate(tos, inner);
}

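/*
 * Propagates a congestion-experienced (CE) mark from the outer IP header to
 * the inner packet on decapsulation, handling an optional VLAN tag.
 */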
static inline void ecn_decapsulate(struct sk_buff *skb)
{
        u8 tos = ip_hdr(skb)->tos;

        if (INET_ECN_is_ce(tos)) {
                __be16 protocol = skb->protocol;
                unsigned int nw_header = skb_network_header(skb) - skb->data;

                if (skb->protocol == htons(ETH_P_8021Q)) {
                        if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
                                return;

                        protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
                        nw_header += VLAN_HLEN;
                }

                if (protocol == htons(ETH_P_IP)) {
                        if (unlikely(!pskb_may_pull(skb, nw_header
                            + sizeof(struct iphdr))))
                                return;

                        IP_ECN_set_ce((struct iphdr *)(nw_header + skb->data));
                }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
                else if (protocol == htons(ETH_P_IPV6)) {
                        if (unlikely(!pskb_may_pull(skb, nw_header
                            + sizeof(struct ipv6hdr))))
                                return;

                        IP6_ECN_set_ce((struct ipv6hdr *)(nw_header
                                                          + skb->data));
                }
#endif
        }
}

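/*
 * Segments a GSO skb into a list of regular packets, freeing the original.
 * Non-GSO skbs are returned unchanged.
 */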
static struct sk_buff *handle_gso(struct sk_buff *skb)
{
        if (skb_is_gso(skb)) {
                struct sk_buff *nskb = skb_gso_segment(skb, 0);

                dev_kfree_skb(skb);
                return nskb;
        }

        return skb;
}

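/*
 * Completes any pending checksum computation in software, since hardware
 * cannot finish the inner checksum once the packet has been encapsulated.
 */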
static int handle_csum_offload(struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL)
                return skb_checksum_help(skb);
        else {
                skb->ip_summed = CHECKSUM_NONE;
                return 0;
        }
}

/* Called with rcu_read_lock. */
void tnl_rcv(struct vport *vport, struct sk_buff *skb)
{
        skb->pkt_type = PACKET_HOST;
        skb->protocol = eth_type_trans(skb, skb->dev);

        skb_dst_drop(skb);
        nf_reset(skb);
        secpath_reset(skb);
        skb_reset_network_header(skb);

        ecn_decapsulate(skb);

        skb_push(skb, ETH_HLEN);
        compute_ip_summed(skb, false);

        vport_receive(vport, skb);
}

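/*
 * Encapsulates one skb (a single GSO segment) with the outer IP header
 * template 'iph' and transmits it.  Returns the unencapsulated length on
 * success or 0 on failure, recording any error against 'vport'.
 */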
static int build_packet(struct vport *vport, const struct tnl_mutable_config *mutable,
                        struct iphdr *iph, struct rtable *rt, int max_headroom,
                        int mtu, struct sk_buff *skb)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        int err;
        struct iphdr *new_iph;
        int orig_len = skb->len;
        __be16 frag_off = iph->frag_off;

        skb = check_headroom(skb, max_headroom);
        if (unlikely(IS_ERR(skb)))
                goto error;

        err = handle_csum_offload(skb);
        if (unlikely(err))
                goto error_free;

        if (skb->protocol == htons(ETH_P_IP)) {
                struct iphdr *old_iph = ip_hdr(skb);

                if ((old_iph->frag_off & htons(IP_DF)) &&
                    mtu < ntohs(old_iph->tot_len)) {
                        if (tnl_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id))
                                goto error_free;
                }

        }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                unsigned int packet_length = skb->len - ETH_HLEN
                        - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);

                /* IPv6 requires PMTUD if the packet is above the minimum MTU. */
                if (packet_length > IPV6_MIN_MTU)
                        frag_off = htons(IP_DF);

                if (mtu < packet_length) {
                        if (tnl_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id))
                                goto error_free;
                }
        }
#endif

        new_iph = (struct iphdr *)skb_push(skb, mutable->tunnel_hlen);
        skb_reset_network_header(skb);
        skb_set_transport_header(skb, sizeof(struct iphdr));

        memcpy(new_iph, iph, sizeof(struct iphdr));
        new_iph->frag_off = frag_off;
        ip_select_ident(new_iph, &rt_dst(rt), NULL);

        tnl_vport->tnl_ops->build_header(skb, vport, mutable);

        /* As a last resort, allow our local IP stack to fragment the outer
         * packet even if the DF bit is set. */
        skb->local_df = 1;

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        IPCB(skb)->flags = 0;

        err = ip_local_out(skb);
        if (likely(net_xmit_eval(err) == 0))
                return orig_len;
        else {
                vport_record_error(vport, VPORT_E_TX_ERROR);
                return 0;
        }

error_free:
        kfree_skb(skb);
error:
        vport_record_error(vport, VPORT_E_TX_DROPPED);

        return 0;
}

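/*
 * Transmit path: validates the inner protocol headers, builds the outer IP
 * header template, routes to the tunnel destination, applies TOS/TTL
 * inheritance and the PMTUD policy, then encapsulates and sends each GSO
 * segment via build_packet().  Returns the number of bytes sent.
 */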
int tnl_send(struct vport *vport, struct sk_buff *skb)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        const struct tnl_mutable_config *mutable = rcu_dereference(tnl_vport->mutable);

        struct iphdr *old_iph;
        int orig_len;
        struct iphdr iph;
        struct rtable *rt;
        int max_headroom;
        int mtu;

        /* Validate the protocol headers before we try to use them. */
        if (skb->protocol == htons(ETH_P_8021Q)) {
                if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
                        goto error_free;

                skb->protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
                skb_set_network_header(skb, VLAN_ETH_HLEN);
        }

        if (skb->protocol == htons(ETH_P_IP)) {
                if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
                    + sizeof(struct iphdr) - skb->data)))
                        skb->protocol = 0;
        }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
                    + sizeof(struct ipv6hdr) - skb->data)))
                        skb->protocol = 0;
        }
#endif
        old_iph = ip_hdr(skb);

        iph.tos = mutable->port_config.tos;
        if (mutable->port_config.flags & TNL_F_TOS_INHERIT) {
                if (skb->protocol == htons(ETH_P_IP))
                        iph.tos = old_iph->tos;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        iph.tos = ipv6_get_dsfield(ipv6_hdr(skb));
#endif
        }
        iph.tos = ecn_encapsulate(iph.tos, skb);

        {
                struct flowi fl = { .nl_u = { .ip4_u =
                                              { .daddr = mutable->port_config.daddr,
                                                .saddr = mutable->port_config.saddr,
                                                .tos = RT_TOS(iph.tos) } },
                                    .proto = tnl_vport->tnl_ops->ipproto };

                if (unlikely(ip_route_output_key(&init_net, &rt, &fl)))
                        goto error_free;
        }

        iph.ttl = mutable->port_config.ttl;
        if (mutable->port_config.flags & TNL_F_TTL_INHERIT) {
                if (skb->protocol == htons(ETH_P_IP))
                        iph.ttl = old_iph->ttl;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        iph.ttl = ipv6_hdr(skb)->hop_limit;
#endif
        }
        if (!iph.ttl)
                iph.ttl = dst_metric(&rt_dst(rt), RTAX_HOPLIMIT);

        iph.frag_off = (mutable->port_config.flags & TNL_F_PMTUD) ? htons(IP_DF) : 0;
        if (iph.frag_off)
                mtu = dst_mtu(&rt_dst(rt))
                        - ETH_HLEN
                        - mutable->tunnel_hlen
                        - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);
        else
                mtu = mutable->mtu;

        if (skb->protocol == htons(ETH_P_IP)) {
                iph.frag_off |= old_iph->frag_off & htons(IP_DF);
                mtu = max(mtu, IP_MIN_MTU);
        }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else if (skb->protocol == htons(ETH_P_IPV6))
                mtu = max(mtu, IPV6_MIN_MTU);
#endif

        iph.version = 4;
        iph.ihl = sizeof(struct iphdr) >> 2;
        iph.protocol = tnl_vport->tnl_ops->ipproto;
        iph.daddr = rt->rt_dst;
        iph.saddr = rt->rt_src;

        nf_reset(skb);
        secpath_reset(skb);
        skb_dst_drop(skb);
        skb_dst_set(skb, &rt_dst(rt));

        /*
         * If we are doing GSO on a pskb, it is better to make sure that the
         * headroom is correct now.  We will only have to copy the portion in
         * the linear data area and GSO will preserve headroom when it creates
         * the segments.  This is particularly beneficial on Xen where we get
         * lots of GSO pskbs.  Conversely, we delay copying if it is just to
         * get our own writable clone because GSO may do the copy for us.
         */
        max_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
                        + mutable->tunnel_hlen;

        if (skb_headroom(skb) < max_headroom) {
                skb = check_headroom(skb, max_headroom);
                if (unlikely(IS_ERR(skb))) {
                        vport_record_error(vport, VPORT_E_TX_DROPPED);
                        goto error;
                }
        }

        forward_ip_summed(skb);

        if (unlikely(vswitch_skb_checksum_setup(skb)))
                goto error_free;

        skb = handle_gso(skb);
        if (unlikely(IS_ERR(skb))) {
                vport_record_error(vport, VPORT_E_TX_DROPPED);
                goto error;
        }

        /*
         * Process GSO segments.  Try to do any work for the entire packet that
         * doesn't involve actually writing to it before this point.
         */
        orig_len = 0;
        do {
                struct sk_buff *next_skb = skb->next;
                skb->next = NULL;

                orig_len += build_packet(vport, mutable, &iph, rt, max_headroom, mtu, skb);

                skb = next_skb;
        } while (skb);

        return orig_len;

error_free:
        kfree_skb(skb);
        vport_record_error(vport, VPORT_E_TX_ERROR);
error:
        return 0;
}

int tnl_init(void)
{
        return 0;
}

void tnl_exit(void)
{
        tbl_destroy(port_table, NULL);
        port_table = NULL;
}

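/*
 * Copies a port configuration in from userspace and validates it, computing
 * the tunnel header length and derived tunnel type.  Returns -EEXIST if a
 * port other than 'cur_vport' already matches the same lookup key.
 */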
static int set_config(const void __user *uconfig, const struct tnl_ops *tnl_ops,
                      const struct vport *cur_vport,
                      struct tnl_mutable_config *mutable)
{
        const struct vport *old_vport;
        const struct tnl_mutable_config *old_mutable;

        if (copy_from_user(&mutable->port_config, uconfig, sizeof(struct tnl_port_config)))
                return -EFAULT;

        mutable->tunnel_hlen = tnl_ops->hdr_len(&mutable->port_config);
        if (mutable->tunnel_hlen < 0)
                return mutable->tunnel_hlen;

        mutable->tunnel_hlen += sizeof(struct iphdr);

        if (mutable->port_config.daddr == 0)
                return -EINVAL;

        mutable->tunnel_type = tnl_ops->tunnel_type;
        if (mutable->port_config.flags & TNL_F_IN_KEY_MATCH) {
                mutable->tunnel_type |= TNL_T_KEY_MATCH;
                mutable->port_config.in_key = 0;
        } else
                mutable->tunnel_type |= TNL_T_KEY_EXACT;

        old_vport = tnl_find_port(mutable->port_config.saddr,
                                  mutable->port_config.daddr,
                                  mutable->port_config.in_key,
                                  mutable->tunnel_type,
                                  &old_mutable);

        if (old_vport && old_vport != cur_vport)
                return -EEXIST;

        if (mutable->port_config.flags & TNL_F_OUT_KEY_ACTION)
                mutable->port_config.out_key = 0;

        return 0;
}

struct vport *tnl_create(const char *name, const void __user *config,
                         const struct vport_ops *vport_ops,
                         const struct tnl_ops *tnl_ops)
{
        struct vport *vport;
        struct tnl_vport *tnl_vport;
        int err;

        vport = vport_alloc(sizeof(struct tnl_vport), vport_ops);
        if (IS_ERR(vport)) {
                err = PTR_ERR(vport);
                goto error;
        }

        tnl_vport = tnl_vport_priv(vport);

        strcpy(tnl_vport->name, name);
        tnl_vport->tnl_ops = tnl_ops;

        tnl_vport->mutable = kmalloc(sizeof(struct tnl_mutable_config), GFP_KERNEL);
        if (!tnl_vport->mutable) {
                err = -ENOMEM;
                goto error_free_vport;
        }

        vport_gen_rand_ether_addr(tnl_vport->mutable->eth_addr);
        tnl_vport->mutable->mtu = ETH_DATA_LEN;

        err = set_config(config, tnl_ops, NULL, tnl_vport->mutable);
        if (err)
                goto error_free_mutable;

        err = add_port(vport);
        if (err)
                goto error_free_mutable;

        return vport;

error_free_mutable:
        kfree(tnl_vport->mutable);
error_free_vport:
        vport_free(vport);
error:
        return ERR_PTR(err);
}

int tnl_modify(struct vport *vport, const void __user *config)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct tnl_mutable_config *mutable;
        int err;
        bool update_hash = false;

        mutable = kmemdup(tnl_vport->mutable, sizeof(struct tnl_mutable_config), GFP_KERNEL);
        if (!mutable) {
                err = -ENOMEM;
                goto error;
        }

        err = set_config(config, tnl_vport->tnl_ops, vport, mutable);
        if (err)
                goto error_free;

        /*
         * Only remove the port from the hash table if something that would
         * affect the lookup has changed.
         */
        if (tnl_vport->mutable->port_config.saddr != mutable->port_config.saddr ||
            tnl_vport->mutable->port_config.daddr != mutable->port_config.daddr ||
            tnl_vport->mutable->port_config.in_key != mutable->port_config.in_key ||
            (tnl_vport->mutable->port_config.flags & TNL_F_IN_KEY_MATCH) !=
            (mutable->port_config.flags & TNL_F_IN_KEY_MATCH))
                update_hash = true;

        /*
         * This update is not atomic but the lookup uses the config, which
         * serves as an inherent double check.
         */
        if (update_hash) {
                err = del_port(vport);
                if (err)
                        goto error_free;
        }

        assign_config_rcu(vport, mutable);

        if (update_hash) {
                err = add_port(vport);
                if (err)
                        goto error_free;
        }

        return 0;

error_free:
        kfree(mutable);
error:
        return err;
}

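/* RCU callback. */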
static void free_port(struct rcu_head *rcu)
{
        struct tnl_vport *tnl_vport = container_of(rcu, struct tnl_vport, rcu);

        kfree(tnl_vport->mutable);
        vport_free(tnl_vport_to_vport(tnl_vport));
}

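/*
 * Removes 'vport' from the hash table (if a lookup of its own config still
 * finds it) and frees it after an RCU grace period.
 */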
int tnl_destroy(struct vport *vport)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        const struct tnl_mutable_config *old_mutable;

        if (vport == tnl_find_port(tnl_vport->mutable->port_config.saddr,
            tnl_vport->mutable->port_config.daddr,
            tnl_vport->mutable->port_config.in_key,
            tnl_vport->mutable->tunnel_type,
            &old_mutable))
                del_port(vport);

        call_rcu(&tnl_vport->rcu, free_port);

        return 0;
}

int tnl_set_mtu(struct vport *vport, int mtu)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct tnl_mutable_config *mutable;

        mutable = kmemdup(tnl_vport->mutable, sizeof(struct tnl_mutable_config), GFP_KERNEL);
        if (!mutable)
                return -ENOMEM;

        mutable->mtu = mtu;
        assign_config_rcu(vport, mutable);

        return 0;
}

int tnl_set_addr(struct vport *vport, const unsigned char *addr)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct tnl_mutable_config *mutable;

        mutable = kmemdup(tnl_vport->mutable, sizeof(struct tnl_mutable_config), GFP_KERNEL);
        if (!mutable)
                return -ENOMEM;

        memcpy(mutable->eth_addr, addr, ETH_ALEN);
        assign_config_rcu(vport, mutable);

        return 0;
}

const char *tnl_get_name(const struct vport *vport)
{
        const struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        return tnl_vport->name;
}

const unsigned char *tnl_get_addr(const struct vport *vport)
{
        const struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        return rcu_dereference(tnl_vport->mutable)->eth_addr;
}

int tnl_get_mtu(const struct vport *vport)
{
        const struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        return rcu_dereference(tnl_vport->mutable)->mtu;
}