datapath/tunnel.c
/*
 * Copyright (c) 2010 Nicira Networks.
 * Distributed under the terms of the GNU GPL version 2.
 *
 * Significant portions of this file may be copied from parts of the Linux
 * kernel, by Linus Torvalds and others.
 */

#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/in_route.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/version.h>

#include <net/dsfield.h>
#include <net/dst.h>
#include <net/icmp.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
#include <net/ipv6.h>
#endif
#include <net/route.h>
#include <net/xfrm.h>

#include "actions.h"
#include "datapath.h"
#include "table.h"
#include "tunnel.h"
#include "vport.h"
#include "vport-generic.h"

/* Protected by RCU. */
static struct tbl *port_table;

/*
 * These are just used as an optimization: they don't require any kind of
 * synchronization because we could have just as easily read the value before
 * the port change happened.
 */
static unsigned int key_local_remote_ports;
static unsigned int key_remote_ports;
static unsigned int local_remote_ports;
static unsigned int remote_ports;

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
#define rt_dst(rt) (rt->dst)
#else
#define rt_dst(rt) (rt->u.dst)
#endif

static inline struct vport *tnl_vport_to_vport(const struct tnl_vport *tnl_vport)
{
        return vport_from_priv(tnl_vport);
}

static inline struct tnl_vport *tnl_vport_table_cast(const struct tbl_node *node)
{
        return container_of(node, struct tnl_vport, tbl_node);
}

/* RCU callback. */
static void free_config(struct rcu_head *rcu)
{
        struct tnl_mutable_config *c = container_of(rcu, struct tnl_mutable_config, rcu);
        kfree(c);
}

static void assign_config_rcu(struct vport *vport,
                              struct tnl_mutable_config *new_config)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct tnl_mutable_config *old_config;

        old_config = rcu_dereference(tnl_vport->mutable);
        rcu_assign_pointer(tnl_vport->mutable, new_config);
        call_rcu(&old_config->rcu, free_config);
}

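/*
 * Returns the counter for the lookup class that 'mutable' belongs to
 * (keyed vs. flow-matched, with or without a local address).  The
 * counters let tnl_find_port() skip hash lookups for classes that
 * currently have no ports.
 */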
static unsigned int *find_port_pool(const struct tnl_mutable_config *mutable)
{
        if (mutable->port_config.flags & TNL_F_IN_KEY_MATCH) {
                if (mutable->port_config.saddr)
                        return &local_remote_ports;
                else
                        return &remote_ports;
        } else {
                if (mutable->port_config.saddr)
                        return &key_local_remote_ports;
                else
                        return &key_remote_ports;
        }
}

enum lookup_key {
        LOOKUP_TUNNEL_TYPE      = 0,
        LOOKUP_SADDR            = 1,
        LOOKUP_DADDR            = 2,
        LOOKUP_KEY              = 3,
};

struct port_lookup_key {
        u32 vals[4];                    /* Contains enum lookup_key keys. */
        const struct tnl_mutable_config *mutable;
};

/*
 * Modifies 'target' to store the rcu_dereferenced pointer that was used to do
 * the comparison.
 */
static int port_cmp(const struct tbl_node *node, void *target)
{
        const struct tnl_vport *tnl_vport = tnl_vport_table_cast(node);
        struct port_lookup_key *lookup = target;

        lookup->mutable = rcu_dereference(tnl_vport->mutable);

        return (lookup->mutable->tunnel_type == lookup->vals[LOOKUP_TUNNEL_TYPE]) &&
               lookup->mutable->port_config.daddr == lookup->vals[LOOKUP_DADDR] &&
               lookup->mutable->port_config.in_key == lookup->vals[LOOKUP_KEY] &&
               lookup->mutable->port_config.saddr == lookup->vals[LOOKUP_SADDR];
}

static u32 port_hash(struct port_lookup_key *lookup)
{
        return jhash2(lookup->vals, ARRAY_SIZE(lookup->vals), 0);
}

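/*
 * Inserts 'vport' into the port hash table, creating the table on first
 * use and expanding it when it grows too full, then bumps the counter
 * for the port's lookup class.
 */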
static int add_port(struct vport *vport)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct port_lookup_key lookup;
        int err;

        if (!port_table) {
                struct tbl *new_table;

                new_table = tbl_create(0);
                if (!new_table)
                        return -ENOMEM;

                rcu_assign_pointer(port_table, new_table);

        } else if (tbl_count(port_table) > tbl_n_buckets(port_table)) {
                struct tbl *old_table = port_table;
                struct tbl *new_table;

                new_table = tbl_expand(old_table);
                if (IS_ERR(new_table))
                        return PTR_ERR(new_table);

                rcu_assign_pointer(port_table, new_table);
                tbl_deferred_destroy(old_table, NULL);
        }

        lookup.vals[LOOKUP_SADDR] = tnl_vport->mutable->port_config.saddr;
        lookup.vals[LOOKUP_DADDR] = tnl_vport->mutable->port_config.daddr;
        lookup.vals[LOOKUP_KEY] = tnl_vport->mutable->port_config.in_key;
        lookup.vals[LOOKUP_TUNNEL_TYPE] = tnl_vport->mutable->tunnel_type;

        err = tbl_insert(port_table, &tnl_vport->tbl_node, port_hash(&lookup));
        if (err)
                return err;

        (*find_port_pool(tnl_vport->mutable))++;

        return 0;
}

static int del_port(struct vport *vport)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        int err;

        err = tbl_remove(port_table, &tnl_vport->tbl_node);
        if (err)
                return err;

        (*find_port_pool(tnl_vport->mutable))--;

        return 0;
}

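/*
 * Finds the tunnel vport that should receive a packet with the given
 * outer addresses and key.  Lookups proceed from most to least
 * specific: exact key plus local address, exact key only, then the
 * flow-match equivalents, skipping any class whose port counter is
 * zero.  On success '*mutable' is set to the matching config.  Must be
 * called within an RCU read-side critical section.
 */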
struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be32 key,
                            int tunnel_type,
                            const struct tnl_mutable_config **mutable)
{
        struct port_lookup_key lookup;
        struct tbl *table = rcu_dereference(port_table);
        struct tbl_node *tbl_node;

        if (!table)
                return NULL;

        lookup.vals[LOOKUP_SADDR] = saddr;
        lookup.vals[LOOKUP_DADDR] = daddr;

        if (tunnel_type & TNL_T_KEY_EXACT) {
                lookup.vals[LOOKUP_KEY] = key;
                lookup.vals[LOOKUP_TUNNEL_TYPE] = tunnel_type & ~TNL_T_KEY_MATCH;

                if (key_local_remote_ports) {
                        tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
                        if (tbl_node)
                                goto found;
                }

                if (key_remote_ports) {
                        lookup.vals[LOOKUP_SADDR] = 0;

                        tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
                        if (tbl_node)
                                goto found;

                        lookup.vals[LOOKUP_SADDR] = saddr;
                }
        }

        if (tunnel_type & TNL_T_KEY_MATCH) {
                lookup.vals[LOOKUP_KEY] = 0;
                lookup.vals[LOOKUP_TUNNEL_TYPE] = tunnel_type & ~TNL_T_KEY_EXACT;

                if (local_remote_ports) {
                        tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
                        if (tbl_node)
                                goto found;
                }

                if (remote_ports) {
                        lookup.vals[LOOKUP_SADDR] = 0;

                        tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
                        if (tbl_node)
                                goto found;
                }
        }

        return NULL;

found:
        *mutable = lookup.mutable;
        return tnl_vport_to_vport(tnl_vport_table_cast(tbl_node));
}

static bool check_ipv4_address(__be32 addr)
{
        if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr)
            || ipv4_is_loopback(addr) || ipv4_is_zeronet(addr))
                return false;

        return true;
}

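/*
 * Decides whether it is acceptable to send an ICMP error in response to
 * 'skb', following the usual rules: no replies to L2/L3 broadcast,
 * invalid addresses, non-first fragments, or ICMP error messages.
 */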
static bool ipv4_should_icmp(struct sk_buff *skb)
{
        struct iphdr *old_iph = ip_hdr(skb);

        /* Don't respond to L2 broadcast. */
        if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
                return false;

        /* Don't respond to L3 broadcast or invalid addresses. */
        if (!check_ipv4_address(old_iph->daddr) ||
            !check_ipv4_address(old_iph->saddr))
                return false;

        /* Only respond to the first fragment. */
        if (old_iph->frag_off & htons(IP_OFFSET))
                return false;

        /* Don't respond to ICMP error messages. */
        if (old_iph->protocol == IPPROTO_ICMP) {
                u8 icmp_type, *icmp_typep;

                icmp_typep = skb_header_pointer(skb, (u8 *)old_iph +
                                                (old_iph->ihl << 2) +
                                                offsetof(struct icmphdr, type) -
                                                skb->data, sizeof(icmp_type),
                                                &icmp_type);

                if (!icmp_typep)
                        return false;

                if (*icmp_typep > NR_ICMP_TYPES
                        || (*icmp_typep <= ICMP_PARAMETERPROB
                                && *icmp_typep != ICMP_ECHOREPLY
                                && *icmp_typep != ICMP_ECHO))
                        return false;
        }

        return true;
}

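/*
 * Fills 'nskb' with an ICMP "fragmentation needed" message quoting the
 * first 'payload_length' bytes of the original packet, with the source
 * and destination addresses of 'skb' swapped.
 */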
static void ipv4_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
                            unsigned int mtu, unsigned int payload_length)
{
        struct iphdr *iph, *old_iph = ip_hdr(skb);
        struct icmphdr *icmph;
        u8 *payload;

        iph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
        icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr));
        payload = skb_put(nskb, payload_length);

        /* IP */
        iph->version            =       4;
        iph->ihl                =       sizeof(struct iphdr) >> 2;
        iph->tos                =       (old_iph->tos & IPTOS_TOS_MASK) |
                                        IPTOS_PREC_INTERNETCONTROL;
        iph->tot_len            =       htons(sizeof(struct iphdr)
                                              + sizeof(struct icmphdr)
                                              + payload_length);
        get_random_bytes(&iph->id, sizeof(iph->id));
        iph->frag_off           =       0;
        iph->ttl                =       IPDEFTTL;
        iph->protocol           =       IPPROTO_ICMP;
        iph->daddr              =       old_iph->saddr;
        iph->saddr              =       old_iph->daddr;

        ip_send_check(iph);

        /* ICMP */
        icmph->type             =       ICMP_DEST_UNREACH;
        icmph->code             =       ICMP_FRAG_NEEDED;
        icmph->un.gateway       =       htonl(mtu);
        icmph->checksum         =       0;

        nskb->csum = csum_partial((u8 *)icmph, sizeof(struct icmphdr), 0);
        nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_iph - skb->data,
                                            payload, payload_length,
                                            nskb->csum);
        icmph->checksum = csum_fold(nskb->csum);
}

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
static bool ipv6_should_icmp(struct sk_buff *skb)
{
        struct ipv6hdr *old_ipv6h = ipv6_hdr(skb);
        int addr_type;
        int payload_off = (u8 *)(old_ipv6h + 1) - skb->data;
        u8 nexthdr = ipv6_hdr(skb)->nexthdr;

        /* Check source address is valid. */
        addr_type = ipv6_addr_type(&old_ipv6h->saddr);
        if (addr_type & IPV6_ADDR_MULTICAST || addr_type == IPV6_ADDR_ANY)
                return false;

        /* Don't reply to unspecified addresses. */
        if (ipv6_addr_type(&old_ipv6h->daddr) == IPV6_ADDR_ANY)
                return false;

        /* Don't respond to ICMP error messages. */
        payload_off = ipv6_skip_exthdr(skb, payload_off, &nexthdr);
        if (payload_off < 0)
                return false;

        if (nexthdr == NEXTHDR_ICMP) {
                u8 icmp_type, *icmp_typep;

                icmp_typep = skb_header_pointer(skb, payload_off +
                                                offsetof(struct icmp6hdr,
                                                        icmp6_type),
                                                sizeof(icmp_type), &icmp_type);

                if (!icmp_typep || !(*icmp_typep & ICMPV6_INFOMSG_MASK))
                        return false;
        }

        return true;
}

static void ipv6_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
                            unsigned int mtu, unsigned int payload_length)
{
        struct ipv6hdr *ipv6h, *old_ipv6h = ipv6_hdr(skb);
        struct icmp6hdr *icmp6h;
        u8 *payload;

        ipv6h = (struct ipv6hdr *)skb_put(nskb, sizeof(struct ipv6hdr));
        icmp6h = (struct icmp6hdr *)skb_put(nskb, sizeof(struct icmp6hdr));
        payload = skb_put(nskb, payload_length);

        /* IPv6 */
        ipv6h->version          =       6;
        ipv6h->priority         =       0;
        memset(&ipv6h->flow_lbl, 0, sizeof(ipv6h->flow_lbl));
        ipv6h->payload_len      =       htons(sizeof(struct icmp6hdr)
                                              + payload_length);
        ipv6h->nexthdr          =       NEXTHDR_ICMP;
        ipv6h->hop_limit        =       IPV6_DEFAULT_HOPLIMIT;
        ipv6_addr_copy(&ipv6h->daddr, &old_ipv6h->saddr);
        ipv6_addr_copy(&ipv6h->saddr, &old_ipv6h->daddr);

        /* ICMPv6 */
        icmp6h->icmp6_type      =       ICMPV6_PKT_TOOBIG;
        icmp6h->icmp6_code      =       0;
        icmp6h->icmp6_cksum     =       0;
        icmp6h->icmp6_mtu       =       htonl(mtu);

        nskb->csum = csum_partial((u8 *)icmp6h, sizeof(struct icmp6hdr), 0);
        nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_ipv6h - skb->data,
                                            payload, payload_length,
                                            nskb->csum);
        icmp6h->icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
                                                sizeof(struct icmp6hdr)
                                                + payload_length,
                                                ipv6h->nexthdr, nskb->csum);
}
#endif /* IPv6 */

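/*
 * Handles an oversized packet by synthesizing an ICMP "fragmentation
 * needed" (or ICMPv6 "packet too big") reply and injecting it into the
 * datapath as if it had been received on 'vport'.  Returns true if the
 * original packet should be dropped, either because a reply was sent or
 * because the packet is one that must not trigger a reply; returns
 * false if no reply could be generated and the caller should proceed.
 */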
bool tnl_frag_needed(struct vport *vport, const struct tnl_mutable_config *mutable,
                     struct sk_buff *skb, unsigned int mtu, __be32 flow_key)
{
        unsigned int eth_hdr_len = ETH_HLEN;
        unsigned int total_length = 0, header_length = 0, payload_length;
        struct ethhdr *eh, *old_eh = eth_hdr(skb);
        struct sk_buff *nskb;

        /* Sanity check */
        if (skb->protocol == htons(ETH_P_IP)) {
                if (mtu < IP_MIN_MTU)
                        return false;

                if (!ipv4_should_icmp(skb))
                        return true;
        }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                if (mtu < IPV6_MIN_MTU)
                        return false;

                /*
                 * In theory we should do PMTUD on IPv6 multicast messages but
                 * we don't have an address to send from so just fragment.
                 */
                if (ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST)
                        return false;

                if (!ipv6_should_icmp(skb))
                        return true;
        }
#endif
        else
                return false;

        /* Allocate */
        if (old_eh->h_proto == htons(ETH_P_8021Q))
                eth_hdr_len = VLAN_ETH_HLEN;

        payload_length = skb->len - eth_hdr_len;
        if (skb->protocol == htons(ETH_P_IP)) {
                header_length = sizeof(struct iphdr) + sizeof(struct icmphdr);
                total_length = min_t(unsigned int, header_length +
                                                   payload_length, 576);
        }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else {
                header_length = sizeof(struct ipv6hdr) +
                                sizeof(struct icmp6hdr);
                total_length = min_t(unsigned int, header_length +
                                                   payload_length, IPV6_MIN_MTU);
        }
#endif

        total_length = min(total_length, mutable->mtu);
        payload_length = total_length - header_length;

        nskb = dev_alloc_skb(NET_IP_ALIGN + eth_hdr_len + header_length +
                             payload_length);
        if (!nskb)
                return false;

        skb_reserve(nskb, NET_IP_ALIGN);

        /* Ethernet / VLAN */
        eh = (struct ethhdr *)skb_put(nskb, eth_hdr_len);
        memcpy(eh->h_dest, old_eh->h_source, ETH_ALEN);
        memcpy(eh->h_source, mutable->eth_addr, ETH_ALEN);
        nskb->protocol = eh->h_proto = old_eh->h_proto;
        if (old_eh->h_proto == htons(ETH_P_8021Q)) {
                struct vlan_ethhdr *vh = (struct vlan_ethhdr *)eh;

                vh->h_vlan_TCI = vlan_eth_hdr(skb)->h_vlan_TCI;
                vh->h_vlan_encapsulated_proto = skb->protocol;
        }
        skb_reset_mac_header(nskb);

        /* Protocol */
        if (skb->protocol == htons(ETH_P_IP))
                ipv4_build_icmp(skb, nskb, mtu, payload_length);
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else
                ipv6_build_icmp(skb, nskb, mtu, payload_length);
#endif

        /*
         * Assume that flow-based keys are symmetric with respect to input
         * and output and use the key that we were going to put on the
         * outgoing packet for the fake received packet.  If the keys are
         * not symmetric then PMTUD needs to be disabled since we won't have
         * any way of synthesizing packets.
         */
        if ((mutable->port_config.flags & (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION)) ==
            (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION))
                OVS_CB(nskb)->tun_id = flow_key;

        compute_ip_summed(nskb, false);
        vport_receive(vport, nskb);

        return true;
}

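/*
 * Makes sure 'skb' has at least 'headroom' bytes of headroom and a
 * private header, reallocating if necessary.  Returns the skb to use or
 * an ERR_PTR; on failure the original skb has already been freed.
 */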
static struct sk_buff *check_headroom(struct sk_buff *skb, int headroom)
{
        if (skb_headroom(skb) < headroom || skb_header_cloned(skb)) {
                struct sk_buff *nskb = skb_realloc_headroom(skb, headroom + 16);
                if (unlikely(!nskb)) {
                        kfree_skb(skb);
                        return ERR_PTR(-ENOMEM);
                }

                set_skb_csum_bits(skb, nskb);

                if (skb->sk)
                        skb_set_owner_w(nskb, skb->sk);

                dev_kfree_skb(skb);
                return nskb;
        }

        return skb;
}

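/*
 * Computes the TOS byte for the outer IP header by combining the
 * configured outer 'tos' with the ECN state of the inner packet.
 */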
static inline u8 ecn_encapsulate(u8 tos, struct sk_buff *skb)
{
        u8 inner;

        if (skb->protocol == htons(ETH_P_IP))
                inner = ((struct iphdr *)skb_network_header(skb))->tos;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else if (skb->protocol == htons(ETH_P_IPV6))
                inner = ipv6_get_dsfield((struct ipv6hdr *)skb_network_header(skb));
#endif
        else
                inner = 0;

        return INET_ECN_encapsulate(tos, inner);
}

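/*
 * On decapsulation, propagates a congestion (CE) mark from the outer IP
 * header to the inner IPv4 or IPv6 header.
 */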
static inline void ecn_decapsulate(struct sk_buff *skb)
{
        u8 tos = ip_hdr(skb)->tos;

        if (INET_ECN_is_ce(tos)) {
                __be16 protocol = skb->protocol;
                unsigned int nw_header = skb_network_header(skb) - skb->data;

                if (skb->protocol == htons(ETH_P_8021Q)) {
                        if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
                                return;

                        protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
                        nw_header += VLAN_HLEN;
                }

                if (protocol == htons(ETH_P_IP)) {
                        if (unlikely(!pskb_may_pull(skb, nw_header
                            + sizeof(struct iphdr))))
                                return;

                        IP_ECN_set_ce((struct iphdr *)(nw_header + skb->data));
                }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
                else if (protocol == htons(ETH_P_IPV6)) {
                        if (unlikely(!pskb_may_pull(skb, nw_header
                            + sizeof(struct ipv6hdr))))
                                return;

                        IP6_ECN_set_ce((struct ipv6hdr *)(nw_header
                                                          + skb->data));
                }
#endif
        }
}

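/*
 * Segments a GSO skb into a list of ordinary skbs chained through
 * skb->next, freeing the original.  Non-GSO skbs pass through
 * unchanged.
 */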
static struct sk_buff *handle_gso(struct sk_buff *skb)
{
        if (skb_is_gso(skb)) {
                struct sk_buff *nskb = skb_gso_segment(skb, 0);

                dev_kfree_skb(skb);
                return nskb;
        }

        return skb;
}

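/*
 * Resolves any pending checksum offload in software, since after
 * encapsulation the hardware would checksum the wrong headers.
 */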
static int handle_csum_offload(struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL)
                return skb_checksum_help(skb);
        else {
                skb->ip_summed = CHECKSUM_NONE;
                return 0;
        }
}

/* Called with rcu_read_lock. */
void tnl_rcv(struct vport *vport, struct sk_buff *skb)
{
        skb->pkt_type = PACKET_HOST;
        skb->protocol = eth_type_trans(skb, skb->dev);

        skb_dst_drop(skb);
        nf_reset(skb);
        secpath_reset(skb);
        skb_reset_network_header(skb);

        ecn_decapsulate(skb);

        skb_push(skb, ETH_HLEN);
        compute_ip_summed(skb, false);

        vport_receive(vport, skb);
}

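/*
 * Prepends the outer IP header described by 'iph' to 'skb' (enforcing
 * PMTUD first), lets the tunnel implementation build its protocol
 * header, and transmits the resulting packet or chain of fragments.
 * Returns the number of bytes of the original packet sent, or 0 on
 * error.
 */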
static int build_packet(struct vport *vport, const struct tnl_mutable_config *mutable,
                        struct iphdr *iph, struct rtable *rt, int max_headroom,
                        int mtu, struct sk_buff *skb)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        int err;
        struct iphdr *new_iph;
        int orig_len = skb->len;
        __be16 frag_off = iph->frag_off;

        skb = check_headroom(skb, max_headroom);
        if (unlikely(IS_ERR(skb)))
                goto error;

        err = handle_csum_offload(skb);
        if (unlikely(err))
                goto error_free;

        if (skb->protocol == htons(ETH_P_IP)) {
                struct iphdr *old_iph = ip_hdr(skb);

                if ((old_iph->frag_off & htons(IP_DF)) &&
                    mtu < ntohs(old_iph->tot_len)) {
                        if (tnl_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id))
                                goto error_free;
                }
        }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                unsigned int packet_length = skb->len - ETH_HLEN
                        - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);

                /* IPv6 requires PMTUD if the packet is above the minimum MTU. */
                if (packet_length > IPV6_MIN_MTU)
                        frag_off = htons(IP_DF);

                if (mtu < packet_length) {
                        if (tnl_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id))
                                goto error_free;
                }
        }
#endif

        new_iph = (struct iphdr *)skb_push(skb, mutable->tunnel_hlen);
        skb_reset_network_header(skb);
        skb_set_transport_header(skb, sizeof(struct iphdr));

        memcpy(new_iph, iph, sizeof(struct iphdr));
        new_iph->frag_off = frag_off;
        ip_select_ident(new_iph, &rt_dst(rt), NULL);

        memset(&IPCB(skb)->opt, 0, sizeof(IPCB(skb)->opt));
        IPCB(skb)->flags = 0;

        skb = tnl_vport->tnl_ops->build_header(skb, vport, mutable, &rt_dst(rt));
        if (unlikely(!skb))
                goto error;

        while (skb) {
                struct sk_buff *next = skb->next;
                int frag_len = skb->len - mutable->tunnel_hlen;

                skb->next = NULL;

                err = ip_local_out(skb);
                if (unlikely(net_xmit_eval(err) != 0)) {
                        orig_len -= frag_len;
                        skb = next;
                        goto free_frags;
                }

                skb = next;
        }

        return orig_len;

error_free:
        kfree_skb(skb);
error:
        return 0;
free_frags:
        /*
         * There's no point in continuing to send fragments once one has been
         * dropped so just free the rest.  This may help relieve the congestion
         * that caused the first packet to be dropped.
         */
        while (skb) {
                struct sk_buff *next = skb->next;
                orig_len -= skb->len - mutable->tunnel_hlen;
                kfree_skb(skb);
                skb = next;
        }
        return orig_len;
}

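/*
 * Transmit path entry point.  Validates the inner headers, constructs
 * the outer IP header from the port config (honoring TOS and TTL
 * inheritance and PMTUD), routes the packet, fixes up checksum offload
 * and GSO, and sends each resulting segment through build_packet().
 * Returns the number of bytes sent, or 0 on error.
 */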
int tnl_send(struct vport *vport, struct sk_buff *skb)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        const struct tnl_mutable_config *mutable = rcu_dereference(tnl_vport->mutable);

        struct iphdr *old_iph;
        int orig_len;
        struct iphdr iph;
        struct rtable *rt;
        int max_headroom;
        int mtu;

        /* Validate the protocol headers before we try to use them. */
        if (skb->protocol == htons(ETH_P_8021Q)) {
                if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
                        goto error_free;

                skb->protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
                skb_set_network_header(skb, VLAN_ETH_HLEN);
        }

        if (skb->protocol == htons(ETH_P_IP)) {
                if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
                    + sizeof(struct iphdr) - skb->data)))
                        skb->protocol = 0;
        }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
                    + sizeof(struct ipv6hdr) - skb->data)))
                        skb->protocol = 0;
        }
#endif
        old_iph = ip_hdr(skb);

        iph.tos = mutable->port_config.tos;
        if (mutable->port_config.flags & TNL_F_TOS_INHERIT) {
                if (skb->protocol == htons(ETH_P_IP))
                        iph.tos = old_iph->tos;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        iph.tos = ipv6_get_dsfield(ipv6_hdr(skb));
#endif
        }
        iph.tos = ecn_encapsulate(iph.tos, skb);

        {
                struct flowi fl = { .nl_u = { .ip4_u =
                                              { .daddr = mutable->port_config.daddr,
                                                .saddr = mutable->port_config.saddr,
                                                .tos = RT_TOS(iph.tos) } },
                                    .proto = tnl_vport->tnl_ops->ipproto };

                if (unlikely(ip_route_output_key(&init_net, &rt, &fl)))
                        goto error_free;
        }

        iph.ttl = mutable->port_config.ttl;
        if (mutable->port_config.flags & TNL_F_TTL_INHERIT) {
                if (skb->protocol == htons(ETH_P_IP))
                        iph.ttl = old_iph->ttl;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        iph.ttl = ipv6_hdr(skb)->hop_limit;
#endif
        }
        if (!iph.ttl)
                iph.ttl = dst_metric(&rt_dst(rt), RTAX_HOPLIMIT);

        iph.frag_off = (mutable->port_config.flags & TNL_F_PMTUD) ? htons(IP_DF) : 0;
        if (iph.frag_off)
                mtu = dst_mtu(&rt_dst(rt))
                        - ETH_HLEN
                        - mutable->tunnel_hlen
                        - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);
        else
                mtu = mutable->mtu;

        if (skb->protocol == htons(ETH_P_IP)) {
                iph.frag_off |= old_iph->frag_off & htons(IP_DF);
                mtu = max(mtu, IP_MIN_MTU);
        }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else if (skb->protocol == htons(ETH_P_IPV6))
                mtu = max(mtu, IPV6_MIN_MTU);
#endif

        iph.version = 4;
        iph.ihl = sizeof(struct iphdr) >> 2;
        iph.protocol = tnl_vport->tnl_ops->ipproto;
        iph.daddr = rt->rt_dst;
        iph.saddr = rt->rt_src;

        nf_reset(skb);
        secpath_reset(skb);
        skb_dst_drop(skb);
        skb_dst_set(skb, &rt_dst(rt));

        /*
         * If we are doing GSO on a pskb it is better to make sure that the
         * headroom is correct now.  We will only have to copy the portion in
         * the linear data area and GSO will preserve headroom when it creates
         * the segments.  This is particularly beneficial on Xen where we get
         * lots of GSO pskbs.  Conversely, we delay copying if it is just to
         * get our own writable clone because GSO may do the copy for us.
         */
        max_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
                        + mutable->tunnel_hlen;

        if (skb_headroom(skb) < max_headroom) {
                skb = check_headroom(skb, max_headroom);
                if (unlikely(IS_ERR(skb))) {
                        vport_record_error(vport, VPORT_E_TX_DROPPED);
                        goto error;
                }
        }

        forward_ip_summed(skb);

        if (unlikely(vswitch_skb_checksum_setup(skb)))
                goto error_free;

        skb = handle_gso(skb);
        if (unlikely(IS_ERR(skb))) {
                vport_record_error(vport, VPORT_E_TX_DROPPED);
                goto error;
        }

        /*
         * Process GSO segments.  Try to do any work for the entire packet that
         * doesn't involve actually writing to it before this point.
         */
        orig_len = 0;
        do {
                struct sk_buff *next_skb = skb->next;
                skb->next = NULL;

                orig_len += build_packet(vport, mutable, &iph, rt, max_headroom, mtu, skb);

                skb = next_skb;
        } while (skb);

        if (unlikely(orig_len == 0))
                vport_record_error(vport, VPORT_E_TX_DROPPED);

        return orig_len;

error_free:
        kfree_skb(skb);
        vport_record_error(vport, VPORT_E_TX_ERROR);
error:
        return 0;
}

int tnl_init(void)
{
        return 0;
}

void tnl_exit(void)
{
        tbl_destroy(port_table, NULL);
        port_table = NULL;
}

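/*
 * Copies a tnl_port_config from userspace into 'mutable' and derives
 * tunnel_hlen and tunnel_type from it.  Returns -EEXIST if a port other
 * than 'cur_vport' already matches the resulting lookup key.
 */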
static int set_config(const void __user *uconfig, const struct tnl_ops *tnl_ops,
                      const struct vport *cur_vport,
                      struct tnl_mutable_config *mutable)
{
        const struct vport *old_vport;
        const struct tnl_mutable_config *old_mutable;

        if (copy_from_user(&mutable->port_config, uconfig, sizeof(struct tnl_port_config)))
                return -EFAULT;

        mutable->tunnel_hlen = tnl_ops->hdr_len(&mutable->port_config);
        if (mutable->tunnel_hlen < 0)
                return mutable->tunnel_hlen;

        mutable->tunnel_hlen += sizeof(struct iphdr);

        if (mutable->port_config.daddr == 0)
                return -EINVAL;

        mutable->tunnel_type = tnl_ops->tunnel_type;
        if (mutable->port_config.flags & TNL_F_IN_KEY_MATCH) {
                mutable->tunnel_type |= TNL_T_KEY_MATCH;
                mutable->port_config.in_key = 0;
        } else
                mutable->tunnel_type |= TNL_T_KEY_EXACT;

        old_vport = tnl_find_port(mutable->port_config.saddr,
                                  mutable->port_config.daddr,
                                  mutable->port_config.in_key,
                                  mutable->tunnel_type,
                                  &old_mutable);

        if (old_vport && old_vport != cur_vport)
                return -EEXIST;

        if (mutable->port_config.flags & TNL_F_OUT_KEY_ACTION)
                mutable->port_config.out_key = 0;

        return 0;
}

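/*
 * Common creation path for tunnel vports: allocates the vport and its
 * mutable config, applies the user-supplied config, and adds the port
 * to the lookup table.
 */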
struct vport *tnl_create(const char *name, const void __user *config,
                         const struct vport_ops *vport_ops,
                         const struct tnl_ops *tnl_ops)
{
        struct vport *vport;
        struct tnl_vport *tnl_vport;
        int initial_frag_id;
        int err;

        vport = vport_alloc(sizeof(struct tnl_vport), vport_ops);
        if (IS_ERR(vport)) {
                err = PTR_ERR(vport);
                goto error;
        }

        tnl_vport = tnl_vport_priv(vport);

        strcpy(tnl_vport->name, name);
        tnl_vport->tnl_ops = tnl_ops;

        tnl_vport->mutable = kmalloc(sizeof(struct tnl_mutable_config), GFP_KERNEL);
        if (!tnl_vport->mutable) {
                err = -ENOMEM;
                goto error_free_vport;
        }

        vport_gen_rand_ether_addr(tnl_vport->mutable->eth_addr);
        tnl_vport->mutable->mtu = ETH_DATA_LEN;

        get_random_bytes(&initial_frag_id, sizeof(int));
        atomic_set(&tnl_vport->frag_id, initial_frag_id);

        err = set_config(config, tnl_ops, NULL, tnl_vport->mutable);
        if (err)
                goto error_free_mutable;

        err = add_port(vport);
        if (err)
                goto error_free_mutable;

        return vport;

error_free_mutable:
        kfree(tnl_vport->mutable);
error_free_vport:
        vport_free(vport);
error:
        return ERR_PTR(err);
}

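/*
 * Applies a new user-supplied config to an existing tunnel vport,
 * re-hashing the port only when a field that affects lookup changes.
 */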
int tnl_modify(struct vport *vport, const void __user *config)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct tnl_mutable_config *mutable;
        int err;
        bool update_hash = false;

        mutable = kmemdup(tnl_vport->mutable, sizeof(struct tnl_mutable_config), GFP_KERNEL);
        if (!mutable) {
                err = -ENOMEM;
                goto error;
        }

        err = set_config(config, tnl_vport->tnl_ops, vport, mutable);
        if (err)
                goto error_free;

        /*
         * Only remove the port from the hash table if something that would
         * affect the lookup has changed.
         */
        if (tnl_vport->mutable->port_config.saddr != mutable->port_config.saddr ||
            tnl_vport->mutable->port_config.daddr != mutable->port_config.daddr ||
            tnl_vport->mutable->port_config.in_key != mutable->port_config.in_key ||
            (tnl_vport->mutable->port_config.flags & TNL_F_IN_KEY_MATCH) !=
            (mutable->port_config.flags & TNL_F_IN_KEY_MATCH))
                update_hash = true;

        /*
         * This update is not atomic but the lookup uses the config, which
         * serves as an inherent double check.
         */
        if (update_hash) {
                err = del_port(vport);
                if (err)
                        goto error_free;
        }

        assign_config_rcu(vport, mutable);

        if (update_hash) {
                err = add_port(vport);
                /*
                 * 'mutable' is now the live config and will be freed through
                 * the RCU callback, so it must not be freed here even if
                 * re-insertion fails.
                 */
                if (err)
                        goto error;
        }

        return 0;

error_free:
        kfree(mutable);
error:
        return err;
}

static void free_port(struct rcu_head *rcu)
{
        struct tnl_vport *tnl_vport = container_of(rcu, struct tnl_vport, rcu);

        kfree(tnl_vport->mutable);
        vport_free(tnl_vport_to_vport(tnl_vport));
}

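/*
 * Removes 'vport' from the lookup table if it is the port that a lookup
 * on its own config currently resolves to, then frees it after an RCU
 * grace period.
 */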
int tnl_destroy(struct vport *vport)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        const struct tnl_mutable_config *old_mutable;

        if (vport == tnl_find_port(tnl_vport->mutable->port_config.saddr,
            tnl_vport->mutable->port_config.daddr,
            tnl_vport->mutable->port_config.in_key,
            tnl_vport->mutable->tunnel_type,
            &old_mutable))
                del_port(vport);

        call_rcu(&tnl_vport->rcu, free_port);

        return 0;
}

int tnl_set_mtu(struct vport *vport, int mtu)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct tnl_mutable_config *mutable;

        mutable = kmemdup(tnl_vport->mutable, sizeof(struct tnl_mutable_config), GFP_KERNEL);
        if (!mutable)
                return -ENOMEM;

        mutable->mtu = mtu;
        assign_config_rcu(vport, mutable);

        return 0;
}

int tnl_set_addr(struct vport *vport, const unsigned char *addr)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct tnl_mutable_config *mutable;

        mutable = kmemdup(tnl_vport->mutable, sizeof(struct tnl_mutable_config), GFP_KERNEL);
        if (!mutable)
                return -ENOMEM;

        memcpy(mutable->eth_addr, addr, ETH_ALEN);
        assign_config_rcu(vport, mutable);

        return 0;
}

const char *tnl_get_name(const struct vport *vport)
{
        const struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        return tnl_vport->name;
}

const unsigned char *tnl_get_addr(const struct vport *vport)
{
        const struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        return rcu_dereference(tnl_vport->mutable)->eth_addr;
}

int tnl_get_mtu(const struct vport *vport)
{
        const struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        return rcu_dereference(tnl_vport->mutable)->mtu;
}