Merge "citrix" branch into "master".
[sliver-openvswitch.git] / datapath / linux-2.6 / compat-2.6 / ip_gre.c
1 /* ip_gre driver port to Linux 2.6.18 and greater plus enhancements */
2
3 #include <linux/version.h>
4 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
5 #define HAVE_NETDEV_STATS
6 #endif
7 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
8 #define HAVE_NETDEV_HEADER_OPS
9 #endif
10 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
11 #define HAVE_NETDEV_NEEDED_HEADROOM
12 #endif
13
14 /*
15  *      Linux NET3:     GRE over IP protocol decoder.
16  *
17  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
18  *
19  *      This program is free software; you can redistribute it and/or
20  *      modify it under the terms of the GNU General Public License
21  *      as published by the Free Software Foundation; either version
22  *      2 of the License, or (at your option) any later version.
23  *
24  */
25
26 #include <linux/capability.h>
27 #include <linux/ethtool.h>
28 #include <linux/module.h>
29 #include <linux/types.h>
30 #include <linux/kernel.h>
31 #include <asm/uaccess.h>
32 #include <linux/skbuff.h>
33 #include <linux/netdevice.h>
34 #include <linux/in.h>
35 #include <linux/tcp.h>
36 #include <linux/udp.h>
37 #include <linux/if_arp.h>
38 #include <linux/if_vlan.h>
39 #include <linux/mroute.h>
40 #include <linux/init.h>
41 #include <linux/in6.h>
42 #include <linux/inetdevice.h>
43 #include <linux/igmp.h>
44 #include <linux/netfilter_ipv4.h>
45 #include <linux/etherdevice.h>
46 #include <linux/if_ether.h>
47
48 #include <net/sock.h>
49 #include <net/ip.h>
50 #include <net/icmp.h>
51 #include <net/protocol.h>
52 #include <net/ipip.h>
53 #include <net/ipv6.h>
54 #include <net/arp.h>
55 #include <net/checksum.h>
56 #include <net/dsfield.h>
57 #include <net/inet_ecn.h>
58 #include <net/xfrm.h>
59 #include <net/net_namespace.h>
60 #include <net/netns/generic.h>
61
62 #ifdef CONFIG_IPV6
63 #include <net/ip6_fib.h>
64 #include <net/ip6_route.h>
65 #endif
66
67 #include "compat.h"
68 #include "openvswitch/gre.h"
69
70 #ifndef GRE_IOCTL_ONLY
71 #include <net/rtnetlink.h>
72 #endif
73
74 /*
75    Problems & solutions
76    --------------------
77
78    1. The most important issue is detecting local dead loops.
79    They would cause complete host lockup in transmit, which
80    would be "resolved" by stack overflow or, if queueing is enabled,
81    with infinite looping in net_bh.
82
83    We cannot track such dead loops during route installation,
84    it is infeasible task. The most general solutions would be
85    to keep skb->encapsulation counter (sort of local ttl),
86    and silently drop packet when it expires. It is the best
   solution, but it supposes maintaining a new variable in ALL
88    skb, even if no tunneling is used.
89
90    Current solution: HARD_TX_LOCK lock breaks dead loops.
91
92
93
94    2. Networking dead loops would not kill routers, but would really
95    kill network. IP hop limit plays role of "t->recursion" in this case,
96    if we copy it from packet being encapsulated to upper header.
97    It is very good solution, but it introduces two problems:
98
99    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
100      do not work over tunnels.
101    - traceroute does not work. I planned to relay ICMP from tunnel,
102      so that this problem would be solved and traceroute output
103      would even more informative. This idea appeared to be wrong:
104      only Linux complies to rfc1812 now (yes, guys, Linux is the only
105      true router now :-)), all routers (at least, in neighbourhood of mine)
106      return only 8 bytes of payload. It is the end.
107
108    Hence, if we want that OSPF worked or traceroute said something reasonable,
109    we should search for another solution.
110
111    One of them is to parse packet trying to detect inner encapsulation
112    made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, it is not a solution at all.
114
115    Current solution: The solution was UNEXPECTEDLY SIMPLE.
116    We force DF flag on tunnels with preconfigured hop limit,
117    that is ALL. :-) Well, it does not remove the problem completely,
118    but exponential growth of network traffic is changed to linear
119    (branches, that exceed pmtu are pruned) and tunnel mtu
   quickly degrades to a value <68, where looping stops.
121    Yes, it is not good if there exists a router in the loop,
122    which does not force DF, even when encapsulating packets have DF set.
123    But it is not our problem! Nobody could accuse us, we made
124    all that we could make. Even if it is your gated who injected
125    fatal route to network, even if it were you who configured
126    fatal static route: you are innocent. :-)
127
128    XXX: Forcing the DF flag on was done only when setting up tunnels via the
129         ioctl interface and not Netlink.  Since it prevents some operations
130         and isn't very transparent I removed it.  It seems nobody really
131         cared about it anyways.
132         Moral: don't create loops.
133
134    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
135    practically identical code. It would be good to glue them
136    together, but it is not very evident, how to make them modular.
137    sit is integral part of IPv6, ipip and gre are naturally modular.
138    We could extract common parts (hash table, ioctl etc)
139    to a separate module (ip_tunnel.c).
140
141    Alexey Kuznetsov.
142  */
143
#ifndef GRE_IOCTL_ONLY
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static struct rtnl_link_ops ipgre_tap_ops __read_mostly;
#endif
/* Forward declarations for routines defined later in this file. */
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static void ipgre_tap_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);
static bool send_frag_needed(struct sk_buff *skb, struct net_device *dev,
                             unsigned int mtu);

/* Buckets per tunnel hash table; must stay in sync with the 4-bit
 * HASH() macro below. */
#define HASH_SIZE  16

/* The absolute minimum fragment size.  Note that there are many other
 * definitions of the minimum MTU. */
#define IP_MIN_MTU 68
161 static inline __be16 *gre_flags(void *header_start)
162 {
163         return header_start;
164 }
165
166 static inline __be16 *gre_protocol(void *header_start)
167 {
168         return header_start + 2;
169 }
170
/* Key for looking up this module's per-network-namespace state. */
static int ipgre_net_id __read_mostly;
struct ipgre_net {
        /* Tunnel hash tables, indexed first by address specificity
         * (see the tunnels_* aliases below) and then by HASH() bucket. */
        struct ip_tunnel *tunnels[4][HASH_SIZE];

        /* Fallback device whose tunnel catches packets matching no
         * configured tunnel (see end of ipgre_tunnel_lookup()). */
        struct net_device *fb_tunnel_dev;
};
177
178 /* Tunnel hash table */
179
180 /*
181    4 hash tables:
182
183    3: (remote,local)
184    2: (remote,*)
185    1: (*,local)
186    0: (*,*)
187
188    We require exact key match i.e. if a key is present in packet
189    it will match only tunnel with the same key; if it is not present,
190    it will match only keyless tunnel.
191
   All keyless packets, if not matched to configured keyless tunnels,
   will match the fallback tunnel.
194  */
195
/* Fold an IPv4 address or tunnel key into a 4-bit bucket index.
 * Note: 'addr' is expanded twice, so it must be side-effect free. */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

/* Aliases for the four tables in struct ipgre_net, by specificity. */
#define tunnels_r_l     tunnels[3]
#define tunnels_r       tunnels[2]
#define tunnels_l       tunnels[1]
#define tunnels_wc      tunnels[0]
/*
 * Locking : hash tables are protected by RCU and a spinlock
 */
static DEFINE_SPINLOCK(ipgre_lock);

/* Walk one bucket's chain under rcu_read_lock().  Expects a local
 * 'struct ip_tunnel *t' to be in scope; it is the loop variable. */
#define for_each_ip_tunnel_rcu(start) \
        for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
209
210 /* Given src, dst and key, find appropriate for input tunnel. */
211
212 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
213                                               __be32 remote, __be32 local,
214                                               __be32 key, __be16 gre_proto)
215 {
216         struct net *net = dev_net(dev);
217         int link = dev->ifindex;
218         unsigned h0 = HASH(remote);
219         unsigned h1 = HASH(key);
220         struct ip_tunnel *t, *cand = NULL;
221         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
222         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
223                        ARPHRD_ETHER : ARPHRD_IPGRE;
224         int score, cand_score = 4;
225
226         for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
227                 if (local != t->parms.iph.saddr ||
228                     remote != t->parms.iph.daddr ||
229                     key != t->parms.i_key ||
230                     !(t->dev->flags & IFF_UP))
231                         continue;
232
233                 if (t->dev->type != ARPHRD_IPGRE &&
234                     t->dev->type != dev_type)
235                         continue;
236
237                 score = 0;
238                 if (t->parms.link != link)
239                         score |= 1;
240                 if (t->dev->type != dev_type)
241                         score |= 2;
242                 if (score == 0)
243                         return t;
244
245                 if (score < cand_score) {
246                         cand = t;
247                         cand_score = score;
248                 }
249         }
250
251         for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
252                 if (remote != t->parms.iph.daddr ||
253                     key != t->parms.i_key ||
254                     !(t->dev->flags & IFF_UP))
255                         continue;
256
257                 if (t->dev->type != ARPHRD_IPGRE &&
258                     t->dev->type != dev_type)
259                         continue;
260
261                 score = 0;
262                 if (t->parms.link != link)
263                         score |= 1;
264                 if (t->dev->type != dev_type)
265                         score |= 2;
266                 if (score == 0)
267                         return t;
268
269                 if (score < cand_score) {
270                         cand = t;
271                         cand_score = score;
272                 }
273         }
274
275         for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
276                 if ((local != t->parms.iph.saddr &&
277                      (local != t->parms.iph.daddr ||
278                       !ipv4_is_multicast(local))) ||
279                     key != t->parms.i_key ||
280                     !(t->dev->flags & IFF_UP))
281                         continue;
282
283                 if (t->dev->type != ARPHRD_IPGRE &&
284                     t->dev->type != dev_type)
285                         continue;
286
287                 score = 0;
288                 if (t->parms.link != link)
289                         score |= 1;
290                 if (t->dev->type != dev_type)
291                         score |= 2;
292                 if (score == 0)
293                         return t;
294
295                 if (score < cand_score) {
296                         cand = t;
297                         cand_score = score;
298                 }
299         }
300
301         for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
302                 if (t->parms.i_key != key ||
303                     !(t->dev->flags & IFF_UP))
304                         continue;
305
306                 if (t->dev->type != ARPHRD_IPGRE &&
307                     t->dev->type != dev_type)
308                         continue;
309
310                 score = 0;
311                 if (t->parms.link != link)
312                         score |= 1;
313                 if (t->dev->type != dev_type)
314                         score |= 2;
315                 if (score == 0)
316                         return t;
317
318                 if (score < cand_score) {
319                         cand = t;
320                         cand_score = score;
321                 }
322         }
323
324         if (cand != NULL)
325                 return cand;
326
327         dev = ign->fb_tunnel_dev;
328         if (dev->flags & IFF_UP)
329                 return netdev_priv(dev);
330
331         return NULL;
332 }
333
334 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
335                 struct ip_tunnel_parm *parms)
336 {
337         __be32 remote = parms->iph.daddr;
338         __be32 local = parms->iph.saddr;
339         __be32 key = parms->i_key;
340         unsigned h = HASH(key);
341         int prio = 0;
342
343         if (local)
344                 prio |= 1;
345         if (remote && !ipv4_is_multicast(remote)) {
346                 prio |= 2;
347                 h ^= HASH(remote);
348         }
349
350         return &ign->tunnels[prio][h];
351 }
352
353 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
354                 struct ip_tunnel *t)
355 {
356         return __ipgre_bucket(ign, &t->parms);
357 }
358
/* Insert 't' at the head of its hash bucket.  The spinlock serializes
 * writers; rcu_assign_pointer() publishes the node so lockless RCU
 * readers (ipgre_tunnel_lookup()) observe a fully initialized entry,
 * with 't->next' set before 't' becomes reachable. */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
        struct ip_tunnel **tp = ipgre_bucket(ign, t);

        spin_lock_bh(&ipgre_lock);
        t->next = *tp;
        rcu_assign_pointer(*tp, t);
        spin_unlock_bh(&ipgre_lock);
}
368
/* Remove 't' from its hash bucket, if present.  The unlinking store is
 * done under the spinlock so concurrent writers do not race; 't->next'
 * is left intact so RCU readers already past this node still have a
 * valid chain to follow.  No-op if 't' is not in the bucket. */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
        struct ip_tunnel **tp;

        for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
                if (t == *tp) {
                        spin_lock_bh(&ipgre_lock);
                        *tp = t->next;
                        spin_unlock_bh(&ipgre_lock);
                        break;
                }
        }
}
382
383 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
384                                            struct ip_tunnel_parm *parms,
385                                            int type)
386 {
387         __be32 remote = parms->iph.daddr;
388         __be32 local = parms->iph.saddr;
389         __be32 key = parms->i_key;
390         int link = parms->link;
391         struct ip_tunnel *t, **tp;
392         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
393
394         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
395                 if (local == t->parms.iph.saddr &&
396                     remote == t->parms.iph.daddr &&
397                     key == t->parms.i_key &&
398                     link == t->parms.link &&
399                     type == t->dev->type)
400                         break;
401
402         return t;
403 }
404
405 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
406                 struct ip_tunnel_parm *parms, int gretap, int create)
407 {
408         struct ip_tunnel *t, *nt;
409         struct net_device *dev;
410         char name[IFNAMSIZ];
411         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
412
413         t = ipgre_tunnel_find(net, parms, gretap ? ARPHRD_ETHER : ARPHRD_IPGRE);
414         if (t || !create)
415                 return t;
416
417         if (parms->name[0])
418                 strlcpy(name, parms->name, IFNAMSIZ);
419         else
420                 sprintf(name, "gre%%d");
421
422         dev = alloc_netdev(sizeof(*t), name, gretap ? ipgre_tap_setup
423                                                     : ipgre_tunnel_setup);
424         if (!dev)
425           return NULL;
426
427         dev_net_set(dev, net);
428
429         if (strchr(name, '%')) {
430                 if (dev_alloc_name(dev, name) < 0)
431                         goto failed_free;
432         }
433
434         if (gretap)
435                 random_ether_addr(dev->dev_addr);
436
437 #ifndef GRE_IOCTL_ONLY
438         dev->rtnl_link_ops = gretap ? &ipgre_tap_ops : &ipgre_link_ops;
439 #endif
440         nt = netdev_priv(dev);
441         nt->parms = *parms;
442
443         dev->mtu = ipgre_tunnel_bind_dev(dev);
444
445         if (register_netdevice(dev) < 0)
446                 goto failed_free;
447
448         dev_hold(dev);
449         ipgre_tunnel_link(ign, nt);
450         return nt;
451
452 failed_free:
453         free_netdev(dev);
454         return NULL;
455 }
456
457 static void ipgre_tunnel_uninit(struct net_device *dev)
458 {
459         struct net *net = dev_net(dev);
460         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
461
462         ipgre_tunnel_unlink(ign, netdev_priv(dev));
463         dev_put(dev);
464 }
465
466 static unsigned int tunnel_hard_header_len(struct net_device *dev)
467 {
468 #ifdef HAVE_NETDEV_NEEDED_HEADROOM
469         return dev->hard_header_len;
470 #else
471         return (dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0;
472 #endif
473 }
474
/* Relay an ICMP "fragmentation needed" error into the tunnel.
 *
 * 'skb' holds the outer IP header plus the (possibly truncated) GRE
 * packet echoed back inside the ICMP message.  Parse far enough into
 * the encapsulated packet to decide whether a frag-needed message can
 * be generated for the inner flow, then call send_frag_needed() with
 * the tunnel overhead subtracted from the reported MTU.  Header
 * offsets are saved and restored so the caller's view of 'skb' is
 * unchanged.  'encap_proto' is the GRE protocol field; for Ethernet
 * (gretap) tunnels it is replaced by the inner Ethernet type. */
static void icmp_err_frag(struct sk_buff *skb, struct ip_tunnel *t,
                          __be16 encap_proto)
{
        int mtu = ntohs(icmp_hdr(skb)->un.frag.mtu);
        /* Tunnel overhead: GRE header plus link-layer header, if any. */
        int header_len = t->hlen + tunnel_hard_header_len(t->dev);
        /* Save offsets, not pointers: pskb_may_pull() may reallocate. */
        unsigned int orig_mac_header = skb_mac_header(skb) - skb->data;
        unsigned int orig_nw_header = skb_network_header(skb) - skb->data;

        /* Add the size of the IP header since this is the smallest
         * packet size the we might do something with and we might as
         * well fail early if we don't have it.  Plus it allows us to
         * safely look at the VLAN header if there is one.  The final
         * size is checked before use. */
        if (!pskb_may_pull(skb, header_len + sizeof(struct iphdr)))
                return;

        if (t->dev->type == ARPHRD_ETHER) {
                /* Inner Ethernet frame starts right after the GRE header. */
                skb_set_mac_header(skb, t->hlen);
                encap_proto = eth_hdr(skb)->h_proto;

                if (encap_proto == htons(ETH_P_8021Q)) {
                        header_len += VLAN_HLEN;
                        encap_proto =
                                   vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
                }
        }

        skb_set_network_header(skb, header_len);
        skb->protocol = encap_proto;
        /* MTU seen by the inner packet excludes the tunnel overhead. */
        mtu -= header_len;

        if (skb->protocol == htons(ETH_P_IP)) {
                /* Never report less than the IPv4 minimum, but only if the
                 * inner packet was actually bigger than that minimum. */
                if (mtu < IP_MIN_MTU) {
                        if (ntohs(ip_hdr(skb)->tot_len) >= IP_MIN_MTU)
                                mtu = IP_MIN_MTU;
                        else
                                goto out;
                }

                header_len += sizeof(struct iphdr);
        } else if (skb->protocol == htons(ETH_P_IPV6)) {
                if (mtu < IPV6_MIN_MTU) {
                        unsigned int packet_length;

                        if (!pskb_may_pull(skb, header_len +
                                                sizeof(struct ipv6hdr)))
                                goto out;

                        packet_length = sizeof(struct ipv6hdr) +
                                              ntohs(ipv6_hdr(skb)->payload_len);

                        /* payload_len == 0 may mean jumbogram; treat it as
                         * large enough. */
                        if (packet_length >= IPV6_MIN_MTU
                            || ntohs(ipv6_hdr(skb)->payload_len) == 0)
                                mtu = IPV6_MIN_MTU;
                        else
                                goto out;
                }

                header_len += sizeof(struct ipv6hdr);
        } else
                goto out;

        /* Temporarily strip the GRE header so send_frag_needed() sees the
         * inner packet, then restore it. */
        if (pskb_may_pull(skb, header_len)) {
                __pskb_pull(skb, t->hlen);
                send_frag_needed(skb, t->dev, mtu);
                skb_push(skb, t->hlen);
        }

out:
        skb_set_mac_header(skb, orig_mac_header);
        skb_set_network_header(skb, orig_nw_header);
        skb->protocol = htons(ETH_P_IP);
}
548
/* ICMP error handler for the GRE protocol: called when an ICMP error
 * arrives for a GRE packet we sent.  'skb->data' points at the echoed
 * outer IP header.  Looks up the tunnel, relays frag-needed errors via
 * icmp_err_frag(), and otherwise records the error for the transmit
 * path's soft-state accounting. */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

        struct iphdr *iph = (struct iphdr *)skb->data;
        __be16 *p;
        int grehlen = (iph->ihl << 2) + 4;      /* outer IP hdr + base GRE hdr */
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        struct ip_tunnel *t;
        __be16 flags;
        __be16 gre_proto;

        WARN_ON_ONCE(skb_shared(skb));

        if (!pskb_may_pull(skb, grehlen))
                return;

        /* pskb_may_pull() may reallocate head: refetch pointers. */
        iph = (struct iphdr *)skb->data;
        p = (__be16 *)(skb->data + (iph->ihl << 2));
        flags = *gre_flags(p);
        gre_proto = *gre_protocol(p);

        if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
                if (flags&(GRE_VERSION|GRE_ROUTING))
                        return;
                /* Extend grehlen past the optional checksum word so the key
                 * word can be located below. */
                if (flags&GRE_KEY) {
                        grehlen += 4;
                        if (flags&GRE_CSUM)
                                grehlen += 4;
                }
        }

        /* If only 8 bytes returned, keyed message will be dropped here */
        if (!pskb_may_pull(skb, grehlen))
                return;

        iph = (struct iphdr *)skb->data;

        switch (type) {
        default:
        case ICMP_PARAMETERPROB:
                return;

        case ICMP_DEST_UNREACH:
                switch (code) {
                case ICMP_SR_FAILED:
                        /* Impossible event. */
                case ICMP_PORT_UNREACH:
                        return;
                case ICMP_FRAG_NEEDED:
                        /* Soft state for pmtu is maintained by IP core but we
                         * also want to relay the message back. */
                        break;
                default:
                        /* All others are translated to HOST_UNREACH.
                           rfc2003 contains "deep thoughts" about NET_UNREACH,
                           I believe they are just ether pollution. --ANK
                         */
                        break;
                }
                break;
        case ICMP_TIME_EXCEEDED:
                if (code != ICMP_EXC_TTL)
                        return;
                break;
        }

        rcu_read_lock();
        /* The key, when present, is the last word of the GRE header
         * (per the grehlen accounting above). */
        t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
                                flags & GRE_KEY ?
                                *(((__be32 *)skb->data) + (grehlen / 4) - 1)
                                : 0, gre_proto);

        if (t == NULL || t->parms.iph.daddr == 0 ||
            ipv4_is_multicast(t->parms.iph.daddr))
                goto out;

        /* With inherited TTL, time-exceeded is the inner packet's problem. */
        if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
                goto out;

        if (code == ICMP_FRAG_NEEDED) {
                /* Invalidates pointers. */
                icmp_err_frag(skb, t, gre_proto);
                goto out;
        }

        if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
                t->err_count++;
        else
                t->err_count = 1;
        t->err_time = jiffies;
out:
        rcu_read_unlock();
        return;
}
657
658 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
659 {
660         if (INET_ECN_is_ce(iph->tos)) {
661                 __be16 protocol = skb->protocol;
662                 unsigned int nw_header = skb_network_header(skb) - skb->data;
663
664                 if (skb->dev->type == ARPHRD_ETHER
665                     && skb->protocol == htons(ETH_P_8021Q)) {
666                         if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
667                                 return;
668
669                         protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
670                         nw_header += VLAN_HLEN;
671                 }
672
673                 if (protocol == htons(ETH_P_IP)) {
674                         if (unlikely(!pskb_may_pull(skb, nw_header
675                             + sizeof(struct iphdr))))
676                                 return;
677
678                         IP_ECN_set_ce((struct iphdr *)(nw_header + skb->data));
679                 } else if (protocol == htons(ETH_P_IPV6)) {
680                         if (unlikely(!pskb_may_pull(skb, nw_header
681                             + sizeof(struct ipv6hdr))))
682                                 return;
683
684                         IP6_ECN_set_ce((struct ipv6hdr *)(nw_header
685                                                           + skb->data));
686                 }
687         }
688 }
689
690 static inline u8
691 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
692 {
693         u8 inner = 0;
694         if (skb->protocol == htons(ETH_P_IP))
695                 inner = old_iph->tos;
696         else if (skb->protocol == htons(ETH_P_IPV6))
697                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
698         return INET_ECN_encapsulate(tos, inner);
699 }
700
/* GRE receive handler: parse the GRE header, find the owning tunnel,
 * validate checksum/sequence options, strip the encapsulation, and
 * deliver the inner packet to the tunnel device via netif_rx().
 * Sends ICMP port-unreachable if no tunnel matches.  Always consumes
 * 'skb' and returns 0. */
static int ipgre_rcv(struct sk_buff *skb)
{
        struct iphdr *iph;
        u8     *h;
        __be16    flags;
        __sum16   csum = 0;
        __be32 key = 0;
        u32    seqno = 0;
        struct ip_tunnel *tunnel;
        int    offset = 4;      /* bytes of GRE header consumed so far */
        __be16 gre_proto;
        unsigned int len;

        /* 16 bytes covers the 4-byte base header plus the maximum set of
         * optional fields we accept (checksum + key + sequence). */
        if (!pskb_may_pull(skb, 16))
                goto drop_nolock;

        iph = ip_hdr(skb);
        h = skb->data;
        flags = *gre_flags(h);

        if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
                /* - Version must be 0.
                   - We do not support routing headers.
                 */
                if (flags&(GRE_VERSION|GRE_ROUTING))
                        goto drop_nolock;

                if (flags&GRE_CSUM) {
                        switch (skb->ip_summed) {
                        case CHECKSUM_COMPLETE:
                                /* Hardware-computed sum: valid iff it folds
                                 * to zero; otherwise recompute in software. */
                                csum = csum_fold(skb->csum);
                                if (!csum)
                                        break;
                                /* fall through */
                        case CHECKSUM_NONE:
                                skb->csum = 0;
                                csum = __skb_checksum_complete(skb);
                                skb->ip_summed = CHECKSUM_COMPLETE;
                        }
                        offset += 4;
                }
                if (flags&GRE_KEY) {
                        key = *(__be32*)(h + offset);
                        offset += 4;
                }
                if (flags&GRE_SEQ) {
                        seqno = ntohl(*(__be32*)(h + offset));
                        offset += 4;
                }
        }

        gre_proto = *gre_protocol(h);

        rcu_read_lock();
        if ((tunnel = ipgre_tunnel_lookup(skb->dev,
                                          iph->saddr, iph->daddr, key,
                                          gre_proto))) {
                struct net_device_stats *stats;
#ifdef HAVE_NETDEV_STATS
                stats = &tunnel->dev->stats;
#else
                stats = &tunnel->stat;
#endif

                secpath_reset(skb);

                skb->protocol = gre_proto;
                /* WCCP version 1 and 2 protocol decoding.
                 * - Change protocol to IP
                 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
                 */
                if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
                        skb->protocol = htons(ETH_P_IP);
                        /* WCCPv2 has a redirect header before the IP header
                         * (which starts with version nibble 4). */
                        if ((*(h + offset) & 0xF0) != 0x40)
                                offset += 4;
                }

                /* Strip the GRE header and fix up the receive checksum. */
                skb->mac_header = skb->network_header;
                __pskb_pull(skb, offset);
                skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
                skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
                if (ipv4_is_multicast(iph->daddr)) {
                        /* Looped back packet, drop it! */
                        if (skb_rtable(skb)->fl.iif == 0)
                                goto drop;
                        stats->multicast++;
                        skb->pkt_type = PACKET_BROADCAST;
                }
#endif

                /* Drop on bad checksum, or when the tunnel requires a
                 * checksum the packet did not carry. */
                if (((flags&GRE_CSUM) && csum) ||
                    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
                        stats->rx_crc_errors++;
                        stats->rx_errors++;
                        goto drop;
                }
                /* Enforce in-order delivery when sequencing is enabled. */
                if (tunnel->parms.i_flags&GRE_SEQ) {
                        if (!(flags&GRE_SEQ) ||
                            (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
                                stats->rx_fifo_errors++;
                                stats->rx_errors++;
                                goto drop;
                        }
                        tunnel->i_seqno = seqno + 1;
                }

                len = skb->len;

                /* Warning: All skb pointers will be invalidated! */
                if (tunnel->dev->type == ARPHRD_ETHER) {
                        if (!pskb_may_pull(skb, ETH_HLEN)) {
                                stats->rx_length_errors++;
                                stats->rx_errors++;
                                goto drop;
                        }

                        iph = ip_hdr(skb);
                        skb->protocol = eth_type_trans(skb, tunnel->dev);
                        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
                }

                stats->rx_packets++;
                stats->rx_bytes += len;
                skb->dev = tunnel->dev;
                skb_dst_drop(skb);
                nf_reset(skb);

                skb_reset_network_header(skb);

                /* Invalidates pointers. */
                ipgre_ecn_decapsulate(iph, skb);

                netif_rx(skb);
                rcu_read_unlock();
                return(0);
        }
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
        rcu_read_unlock();
drop_nolock:
        kfree_skb(skb);
        return(0);
}
846
847 static bool check_ipv4_address(__be32 addr)
848 {
849         if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr)
850             || ipv4_is_loopback(addr) || ipv4_is_zeronet(addr))
851                 return false;
852
853         return true;
854 }
855
/* Decide whether an ICMP error may be generated in response to the
 * IPv4 packet in 'skb' (Ethernet header at skb_mac_header, IP header
 * at skb_network_header), applying the usual suppression rules:
 * never answer broadcasts, non-first fragments, or ICMP errors. */
static bool ipv4_should_icmp(struct sk_buff *skb)
{
        struct iphdr *old_iph = ip_hdr(skb);

        /* Don't respond to L2 broadcast. */
        if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
                return false;

        /* Don't respond to L3 broadcast or invalid addresses. */
        if (!check_ipv4_address(old_iph->daddr) ||
            !check_ipv4_address(old_iph->saddr))
                return false;

        /* Only respond to the first fragment. */
        if (old_iph->frag_off & htons(IP_OFFSET))
                return false;

        /* Don't respond to ICMP error messages. */
        if (old_iph->protocol == IPPROTO_ICMP) {
                u8 icmp_type, *icmp_typep;

                /* Offset of the ICMP type byte relative to skb->data;
                 * skb_header_pointer() copies it out if non-linear. */
                icmp_typep = skb_header_pointer(skb, (u8 *)old_iph +
                                                (old_iph->ihl << 2) +
                                                offsetof(struct icmphdr, type) -
                                                skb->data, sizeof(icmp_type),
                                                &icmp_type);

                if (!icmp_typep)
                        return false;

                /* Reject unknown types and all error types; echo request
                 * and reply are the only low-numbered types answered. */
                if (*icmp_typep > NR_ICMP_TYPES
                        || (*icmp_typep <= ICMP_PARAMETERPROB
                                && *icmp_typep != ICMP_ECHOREPLY
                                && *icmp_typep != ICMP_ECHO))
                        return false;
        }

        return true;
}
895
/*
 * Fill 'nskb' with an ICMP "fragmentation needed" message replying to
 * the IPv4 packet in 'skb' and advertising 'mtu'.  'payload_length'
 * bytes of the original packet (from its IP header onward) are quoted
 * in the ICMP payload.  The caller must have reserved enough tailroom
 * in 'nskb' for the IP header, ICMP header and payload.
 */
static void ipv4_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
			    unsigned int mtu, unsigned int payload_length)
{
	struct iphdr *iph, *old_iph = ip_hdr(skb);
	struct icmphdr *icmph;
	u8 *payload;

	iph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
	icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr));
	payload = skb_put(nskb, payload_length);

	/* IP */
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->tos		=	(old_iph->tos & IPTOS_TOS_MASK) |
					IPTOS_PREC_INTERNETCONTROL;
	iph->tot_len		=	htons(sizeof(struct iphdr)
					      + sizeof(struct icmphdr)
					      + payload_length);
	/* Random IP ID: we have no per-flow IP stack state to draw on. */
	get_random_bytes(&iph->id, sizeof iph->id);
	iph->frag_off		=	0;
	iph->ttl		=	IPDEFTTL;
	iph->protocol		=	IPPROTO_ICMP;
	/* Swap addresses so the reply returns to the original sender. */
	iph->daddr		=	old_iph->saddr;
	iph->saddr		=	old_iph->daddr;

	ip_send_check(iph);

	/* ICMP */
	icmph->type		=	ICMP_DEST_UNREACH;
	icmph->code		=	ICMP_FRAG_NEEDED;
	icmph->un.gateway	=	htonl(mtu);
	icmph->checksum		=	0;

	/* Checksum the ICMP header, then copy the quoted original packet
	 * into the payload while folding it into the same running
	 * checksum, and finish with csum_fold(). */
	nskb->csum = csum_partial((u8 *)icmph, sizeof *icmph, 0);
	nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_iph - skb->data,
					    payload, payload_length,
					    nskb->csum);
	icmph->checksum = csum_fold(nskb->csum);
}
936
/*
 * Decide whether it is legitimate to generate an ICMPv6 error in
 * response to the IPv6 packet in 'skb': never reply to packets from a
 * multicast or unspecified source, to an unspecified destination, or
 * to ICMPv6 error messages themselves.
 */
static bool ipv6_should_icmp(struct sk_buff *skb)
{
	struct ipv6hdr *old_ipv6h = ipv6_hdr(skb);
	int addr_type;
	int payload_off = (u8 *)(old_ipv6h + 1) - skb->data;
	u8 nexthdr = ipv6_hdr(skb)->nexthdr;

	/* Check source address is valid. */
	addr_type = ipv6_addr_type(&old_ipv6h->saddr);
	if (addr_type & IPV6_ADDR_MULTICAST || addr_type == IPV6_ADDR_ANY)
		return false;

	/* Don't reply to unspecified addresses. */
	if (ipv6_addr_type(&old_ipv6h->daddr) == IPV6_ADDR_ANY)
		return false;

	/* Don't respond to ICMP error messages. */
	/* Walk past any extension headers to find the transport protocol;
	 * ipv6_skip_exthdr() updates 'nexthdr' as it goes. */
	payload_off = ipv6_skip_exthdr(skb, payload_off, &nexthdr);
	if (payload_off < 0)
		return false;

	if (nexthdr == NEXTHDR_ICMP) {
		u8 icmp_type, *icmp_typep;

		/* Fetch the ICMPv6 type via skb_header_pointer() since it
		 * may not be in the linear data area. */
		icmp_typep = skb_header_pointer(skb, payload_off +
						offsetof(struct icmp6hdr,
							icmp6_type),
						sizeof(icmp_type), &icmp_type);

		/* ICMPv6 informational messages have the high type bit set
		 * (ICMPV6_INFOMSG_MASK); anything else is an error message
		 * that must not trigger another error. */
		if (!icmp_typep || !(*icmp_typep & ICMPV6_INFOMSG_MASK))
			return false;
	}

	return true;
}
972
/*
 * Fill 'nskb' with an ICMPv6 "packet too big" message replying to the
 * IPv6 packet in 'skb' and advertising 'mtu'.  'payload_length' bytes
 * of the original packet (from its IPv6 header onward) are quoted after
 * the ICMPv6 header.  The caller must have reserved enough tailroom.
 */
static void ipv6_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
			    unsigned int mtu, unsigned int payload_length)
{
	struct ipv6hdr *ipv6h, *old_ipv6h = ipv6_hdr(skb);
	struct icmp6hdr *icmp6h;
	u8 *payload;

	ipv6h = (struct ipv6hdr *)skb_put(nskb, sizeof(struct ipv6hdr));
	icmp6h = (struct icmp6hdr *)skb_put(nskb, sizeof(struct icmp6hdr));
	payload = skb_put(nskb, payload_length);

	/* IPv6 */
	ipv6h->version		=	6;
	ipv6h->priority		=	0;
	memset(&ipv6h->flow_lbl, 0, sizeof ipv6h->flow_lbl);
	ipv6h->payload_len	=	htons(sizeof(struct icmp6hdr)
					      + payload_length);
	ipv6h->nexthdr		=	NEXTHDR_ICMP;
	ipv6h->hop_limit	=	IPV6_DEFAULT_HOPLIMIT;
	/* Swap addresses so the reply returns to the original sender. */
	ipv6_addr_copy(&ipv6h->daddr, &old_ipv6h->saddr);
	ipv6_addr_copy(&ipv6h->saddr, &old_ipv6h->daddr);

	/* ICMPv6 */
	icmp6h->icmp6_type	=	ICMPV6_PKT_TOOBIG;
	icmp6h->icmp6_code	=	0;
	icmp6h->icmp6_cksum	=	0;
	icmp6h->icmp6_mtu	=	htonl(mtu);

	/* Checksum the ICMPv6 header, fold in the quoted payload while
	 * copying it, then finish with the IPv6 pseudo-header checksum
	 * required by ICMPv6. */
	nskb->csum = csum_partial((u8 *)icmp6h, sizeof *icmp6h, 0);
	nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_ipv6h - skb->data,
					    payload, payload_length,
					    nskb->csum);
	icmp6h->icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
						sizeof(struct icmp6hdr)
						+ payload_length,
						ipv6h->nexthdr, nskb->csum);
}
1010
/*
 * Generate an ICMP "fragmentation needed" (IPv4) or ICMPv6 "packet too
 * big" message advertising 'mtu' in reply to the over-sized packet in
 * 'skb'.
 *
 * Returns true if the caller should drop the original packet: either a
 * reply was sent, or the packet is one that must not trigger a reply.
 * Returns false if the caller should fall back to other handling
 * (e.g. fragmenting).
 */
static bool send_frag_needed(struct sk_buff *skb, struct net_device *dev,
			     unsigned int mtu)
{
	unsigned int eth_hdr_len = ETH_HLEN;
	unsigned int total_length, header_length, payload_length;
	struct ethhdr *eh, *old_eh = eth_hdr(skb);
	struct sk_buff *nskb;
	struct net_device_stats *stats;

	/* Normal IP stack. */
	/* br_port is only set while the device is attached to a bridge;
	 * when it is not, let the host IP stack build the error itself. */
	if (!dev->br_port) {
		if (skb->protocol == htons(ETH_P_IP)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				  htonl(mtu));
			return true;
		} else {
#ifdef CONFIG_IPV6
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
			return true;
#else
			return false;
#endif
		}
	}

	/* Sanity check */
	if (skb->protocol == htons(ETH_P_IP)) {
		if (mtu < IP_MIN_MTU)
			return false;

		if (!ipv4_should_icmp(skb))
			return true;
	} else {
		if (mtu < IPV6_MIN_MTU)
			return false;

		/* In theory we should do PMTUD on IPv6 multicast messages but
		 * we don't have an address to send from so just fragment. */
		if (ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST)
			return false;

		if (!ipv6_should_icmp(skb))
			return true;
	}

	/* Allocate */
	if (old_eh->h_proto == htons(ETH_P_8021Q))
		eth_hdr_len = VLAN_ETH_HLEN;

	payload_length = skb->len - eth_hdr_len;
	if (skb->protocol == htons(ETH_P_IP)) {
		header_length = sizeof(struct iphdr) + sizeof(struct icmphdr);
		/* ICMPv4 errors are capped at 576 bytes, the minimum IPv4
		 * reassembly buffer size. */
		total_length = min_t(unsigned int, header_length +
						   payload_length, 576);
	} else {
		header_length = sizeof(struct ipv6hdr) +
				sizeof(struct icmp6hdr);
		/* ICMPv6 errors must fit within the IPv6 minimum MTU. */
		total_length = min_t(unsigned int, header_length +
						  payload_length, IPV6_MIN_MTU);
	}
	total_length = min(total_length, dev->mtu);
	payload_length = total_length - header_length;

	nskb = netdev_alloc_skb_ip_align(dev, eth_hdr_len + header_length
					      + payload_length);
	if (!nskb)
		return false;

	/* Ethernet / VLAN */
	/* Build the reply's L2 header back toward the original sender,
	 * preserving any 802.1Q tag from the offending frame. */
	eh = (struct ethhdr *)skb_put(nskb, eth_hdr_len);
	memcpy(eh->h_dest, old_eh->h_source, ETH_ALEN);
	memcpy(eh->h_source, dev->dev_addr, ETH_ALEN);
	eh->h_proto = old_eh->h_proto;
	if (old_eh->h_proto == htons(ETH_P_8021Q)) {
		struct vlan_ethhdr *vh = (struct vlan_ethhdr *)eh;

		vh->h_vlan_TCI = vlan_eth_hdr(skb)->h_vlan_TCI;
		vh->h_vlan_encapsulated_proto = skb->protocol;
	}
	nskb->protocol = eth_type_trans(nskb, dev);

	/* Protocol */
	if (skb->protocol == htons(ETH_P_IP))
		ipv4_build_icmp(skb, nskb, mtu, payload_length);
	else
		ipv6_build_icmp(skb, nskb, mtu, payload_length);

	/* Send */
	/* Inject the reply as if it had been received on this device. */
#ifdef HAVE_NETDEV_STATS
	stats = &dev->stats;
#else
	stats = &((struct ip_tunnel *)netdev_priv(dev))->stat;
#endif
	stats->rx_packets++;
	stats->rx_bytes += nskb->len;

	netif_rx(nskb);
	return true;
}
1110
1111 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
1112 {
1113         struct ip_tunnel *tunnel = netdev_priv(dev);
1114         struct net_device_stats *stats;
1115 #ifdef HAVE_NETDEV_QUEUE_STATS
1116         struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
1117 #endif
1118         struct iphdr  *old_iph;
1119         struct ipv6hdr *old_ipv6h;
1120         struct iphdr  *tiph;
1121         u8     tos;
1122         __be16 df;
1123         struct rtable *rt;                      /* Route to the other host */
1124         struct net_device *tdev;                /* Device to other host */
1125         struct iphdr  *iph;                     /* Our new IP header */
1126         unsigned int max_headroom;              /* The extra header space needed */
1127         int    gre_hlen;
1128         __be32 dst;
1129         int    mtu;
1130         __be16 original_protocol;
1131         bool is_vlan = false;
1132
1133 #ifdef HAVE_NETDEV_STATS
1134         stats = &dev->stats;
1135 #else
1136         stats = &tunnel->stat;
1137 #endif
1138
1139         WARN_ON_ONCE(skb_shared(skb));
1140
1141         /* Validate the protocol headers before we try to use them. */
1142         original_protocol = skb->protocol;
1143
1144         if (dev->type == ARPHRD_ETHER && skb->protocol == htons(ETH_P_8021Q)) {
1145                 if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
1146                         goto tx_error;
1147
1148                 skb->protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
1149                 skb_set_network_header(skb, VLAN_ETH_HLEN);
1150                 is_vlan = true;
1151         }
1152
1153         old_iph = ip_hdr(skb);
1154         old_ipv6h = ipv6_hdr(skb);
1155
1156         if (skb->protocol == htons(ETH_P_IP)) {
1157                 if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
1158                     + sizeof(struct iphdr) - skb->data)))
1159                         skb->protocol = 0;
1160         } else if (skb->protocol == htons(ETH_P_IPV6)) {
1161                 if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
1162                     + sizeof(struct ipv6hdr) - skb->data)))
1163                         skb->protocol = 0;
1164         }
1165
1166         if (dev->type == ARPHRD_ETHER)
1167                 IPCB(skb)->flags = 0;
1168
1169 #ifdef HAVE_NETDEV_HEADER_OPS
1170         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
1171 #else
1172         if (dev->hard_header && dev->type == ARPHRD_IPGRE) {
1173 #endif
1174                 gre_hlen = 0;
1175                 tiph = (struct iphdr *)skb->data;
1176         } else {
1177                 gre_hlen = tunnel->hlen;
1178                 tiph = &tunnel->parms.iph;
1179         }
1180
1181         if ((dst = tiph->daddr) == 0) {
1182                 /* NBMA tunnel */
1183
1184                 if (skb_dst(skb) == NULL) {
1185                         stats->tx_fifo_errors++;
1186                         goto tx_error;
1187                 }
1188
1189                 if (skb->protocol == htons(ETH_P_IP)) {
1190                         rt = skb_rtable(skb);
1191                         if ((dst = rt->rt_gateway) == 0)
1192                                 goto tx_error_icmp;
1193                 }
1194 #ifdef CONFIG_IPV6
1195                 else if (skb->protocol == htons(ETH_P_IPV6)) {
1196                         struct in6_addr *addr6;
1197                         int addr_type;
1198                         struct neighbour *neigh = skb_dst(skb)->neighbour;
1199
1200                         if (neigh == NULL)
1201                                 goto tx_error;
1202
1203                         addr6 = (struct in6_addr *)&neigh->primary_key;
1204                         addr_type = ipv6_addr_type(addr6);
1205
1206                         if (addr_type == IPV6_ADDR_ANY) {
1207                                 addr6 = &ipv6_hdr(skb)->daddr;
1208                                 addr_type = ipv6_addr_type(addr6);
1209                         }
1210
1211                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
1212                                 goto tx_error_icmp;
1213
1214                         dst = addr6->s6_addr32[3];
1215                 }
1216 #endif
1217                 else
1218                         goto tx_error;
1219         }
1220
1221         tos = tiph->tos;
1222         if (tos == 1) {
1223                 tos = 0;
1224                 if (skb->protocol == htons(ETH_P_IP))
1225                         tos = old_iph->tos;
1226                 else if (skb->protocol == htons(ETH_P_IPV6))
1227                         tos = ipv6_get_dsfield(ipv6_hdr(skb));
1228         }
1229
1230         {
1231                 struct flowi fl = { .oif = tunnel->parms.link,
1232                                     .nl_u = { .ip4_u =
1233                                               { .daddr = dst,
1234                                                 .saddr = tiph->saddr,
1235                                                 .tos = RT_TOS(tos) } },
1236                                     .proto = IPPROTO_GRE };
1237                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
1238                         stats->tx_carrier_errors++;
1239                         goto tx_error;
1240                 }
1241         }
1242         tdev = rt->u.dst.dev;
1243
1244         if (tdev == dev) {
1245                 ip_rt_put(rt);
1246                 stats->collisions++;
1247                 goto tx_error;
1248         }
1249
1250         df = tiph->frag_off;
1251         if (df)
1252                 mtu = dst_mtu(&rt->u.dst) - tunnel_hard_header_len(dev)
1253                         - (is_vlan ? VLAN_HLEN : 0)
1254                         - tunnel->hlen;
1255         else
1256                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
1257
1258         if (skb->protocol == htons(ETH_P_IP))
1259                 mtu = max(mtu, IP_MIN_MTU);
1260         if (skb->protocol == htons(ETH_P_IPV6))
1261                 mtu = max(mtu, IPV6_MIN_MTU);
1262
1263         if (skb_dst(skb))
1264                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
1265
1266         if (skb->protocol == htons(ETH_P_IP)) {
1267                 df |= (old_iph->frag_off&htons(IP_DF));
1268
1269                 if ((old_iph->frag_off&htons(IP_DF)) &&
1270                     mtu < ntohs(old_iph->tot_len)) {
1271                         if (send_frag_needed(skb, dev, mtu)) {
1272                                 ip_rt_put(rt);
1273                                 goto tx_error;
1274                         }
1275                 }
1276         } else if (skb->protocol == htons(ETH_P_IPV6)) {
1277                 unsigned int packet_length = skb->len
1278                                              - tunnel_hard_header_len(dev)
1279                                              - (is_vlan ? VLAN_HLEN : 0);
1280
1281 #ifdef CONFIG_IPV6
1282                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
1283
1284                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
1285                         if ((tunnel->parms.iph.daddr &&
1286                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
1287                             rt6->rt6i_dst.plen == 128) {
1288                                 rt6->rt6i_flags |= RTF_MODIFIED;
1289                                 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
1290                         }
1291                 }
1292 #endif
1293
1294                 /* IPv6 requires PMTUD if the packet is above the minimum MTU.*/
1295                 if (packet_length > IPV6_MIN_MTU)
1296                         df = htons(IP_DF);
1297
1298                 if (mtu < packet_length - tunnel->hlen + gre_hlen) {
1299                         if (send_frag_needed(skb, dev, mtu)) {
1300                                 ip_rt_put(rt);
1301                                 goto tx_error;
1302                         }
1303                 }
1304         }
1305
1306         if (tunnel->err_count > 0) {
1307                 if (time_before(jiffies,
1308                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
1309                         tunnel->err_count--;
1310
1311                         dst_link_failure(skb);
1312                 } else
1313                         tunnel->err_count = 0;
1314         }
1315
1316         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
1317
1318         if (skb_headroom(skb) < max_headroom ||
1319             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
1320                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
1321                 if (!new_skb) {
1322                         ip_rt_put(rt);
1323 #ifdef HAVE_NETDEV_QUEUE_STATS
1324                         txq->tx_dropped++;
1325 #else
1326                         stats->tx_dropped++;
1327 #endif
1328                         dev_kfree_skb(skb);
1329                         return NETDEV_TX_OK;
1330                 }
1331                 if (skb->sk)
1332                         skb_set_owner_w(new_skb, skb->sk);
1333                 dev_kfree_skb(skb);
1334                 skb = new_skb;
1335                 old_iph = ip_hdr(skb);
1336         }
1337
1338         skb_reset_transport_header(skb);
1339         skb_push(skb, gre_hlen);
1340         skb_reset_network_header(skb);
1341         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1342         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
1343                               IPSKB_REROUTED);;
1344
1345         skb_dst_drop(skb);
1346         skb_dst_set(skb, &rt->u.dst);
1347
1348         /*
1349          *      Push down and install the GRE header.
1350          */
1351
1352         iph                     =       ip_hdr(skb);
1353         iph->version            =       4;
1354         iph->ihl                =       sizeof(struct iphdr) >> 2;
1355         iph->frag_off           =       df;
1356         iph->protocol           =       IPPROTO_GRE;
1357         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
1358         iph->daddr              =       rt->rt_dst;
1359         iph->saddr              =       rt->rt_src;
1360
1361         /* Allow our local IP stack to fragment the outer packet even if the
1362          * DF bit is set.  If we got this far there is nothing more that we
1363          * can do with the inner packet. */
1364         skb->local_df = 1;
1365
1366         if ((iph->ttl = tiph->ttl) == 0) {
1367                 if (skb->protocol == htons(ETH_P_IP))
1368                         iph->ttl = old_iph->ttl;
1369                 else if (skb->protocol == htons(ETH_P_IPV6))
1370                         iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
1371                 else
1372                         iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
1373         }
1374
1375         *gre_flags(iph + 1) = tunnel->parms.o_flags;
1376         *gre_protocol(iph + 1) = (dev->type == ARPHRD_ETHER) ?
1377                                    htons(ETH_P_TEB) : original_protocol;
1378
1379         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
1380                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
1381
1382                 if (tunnel->parms.o_flags&GRE_SEQ) {
1383                         ++tunnel->o_seqno;
1384                         *ptr = htonl(tunnel->o_seqno);
1385                         ptr--;
1386                 }
1387                 if (tunnel->parms.o_flags&GRE_KEY) {
1388                         *ptr = tunnel->parms.o_key;
1389                         ptr--;
1390                 }
1391                 if (tunnel->parms.o_flags&GRE_CSUM) {
1392                         *ptr = 0;
1393                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
1394                 }
1395         }
1396
1397         nf_reset(skb);
1398
1399         IPTUNNEL_XMIT();
1400         return NETDEV_TX_OK;
1401
1402 tx_error_icmp:
1403         dst_link_failure(skb);
1404
1405 tx_error:
1406         stats->tx_errors++;
1407         dev_kfree_skb(skb);
1408         return NETDEV_TX_OK;
1409 }
1410
/*
 * Bind the tunnel to its underlying output device: guess the device by
 * routing to the tunnel destination (or fall back to the configured
 * link), derive the needed headroom, precompute the GRE header length
 * (stored in tunnel->hlen) and return a suitable MTU for the tunnel
 * device.
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;	/* outer IP + basic GRE header */

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.saddr = iph->saddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
			tdev = rt->u.dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
#ifdef HAVE_NETDEV_NEEDED_HEADROOM
		hlen = tdev->hard_header_len + tdev->needed_headroom;
#else
		hlen = tdev->hard_header_len;
#endif
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
#ifdef HAVE_NETDEV_NEEDED_HEADROOM
	dev->needed_headroom = hlen + addend;
#else
	dev->hard_header_len = hlen + addend;
#endif
	/* The tunnel MTU is the underlying MTU minus all encapsulation. */
	mtu -= tunnel_hard_header_len(dev) + addend;
	tunnel->hlen = addend;

	if (mtu < IP_MIN_MTU)
		mtu = IP_MIN_MTU;

	/* If we could be connected to a bridge set the normal Ethernet MTU
	 * since all devices on the bridge are required to have the same MTU.
	 * Even though this isn't our optimal MTU we can handle it. */
	if (dev->type == ARPHRD_ETHER)
		mtu = ETH_DATA_LEN;

	return mtu;
}
1483
/*
 * Tunnel configuration ioctl handler: get, add, change and delete GRE
 * tunnels, including the GRETAP variants used for Ethernet-over-GRE.
 * Returns 0 on success or a negative errno.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int add_tunnel, gretap;

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		/* On the fallback device, look up the tunnel the caller
		 * described; on any other device, report that device. */
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, false, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
	case SIOCADDGRETAP:
	case SIOCCHGGRETAP:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Reject anything that is not plain GRE-over-IPv4 without
		 * routing or version extensions. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;

		add_tunnel = (cmd == SIOCADDTUNNEL || cmd == SIOCADDGRETAP);
		gretap = (cmd == SIOCADDGRETAP || cmd == SIOCCHGGRETAP);

		/* Ignore key values whose presence flag is not set. */
		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, gretap, add_tunnel);

		if (dev != ign->fb_tunnel_dev && !add_tunnel) {
			if (t != NULL) {
				/* The requested parameters already belong to
				 * a different device. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* Changing the addressing mode of an existing
				 * device is not allowed. */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Re-key the tunnel: unlink from the hash
				 * table, update its addresses/keys, re-link. */
				ipgre_tunnel_unlink(ign, t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (!add_tunnel) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					/* A new underlying link may change
					 * the usable MTU. */
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			/* Return the effective parameters to the caller. */
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (add_tunnel ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, false, 0)) == NULL)
				goto done;
			err = -EPERM;
			/* The fallback device itself cannot be deleted. */
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1617
1618 #ifndef HAVE_NETDEV_STATS
1619 static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
1620 {
1621         return &(((struct ip_tunnel*)netdev_priv(dev))->stat);
1622 }
1623 #endif
1624
1625 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1626 {
1627         struct ip_tunnel *tunnel = netdev_priv(dev);
1628         if (new_mtu < IP_MIN_MTU ||
1629             new_mtu > 0xFFF8 - tunnel_hard_header_len(dev) - tunnel->hlen)
1630                 return -EINVAL;
1631         dev->mtu = new_mtu;
1632         return 0;
1633 }
1634
1635 /* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
1637    over the Internet, provided multicast routing is tuned.
1638
1639
   I have no idea whether this bicycle was invented before me,
1641    so that I had to set ARPHRD_IPGRE to a random value.
1642    I have an impression, that Cisco could make something similar,
1643    but this feature is apparently missing in IOS<=11.2(8).
1644
1645    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1646    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1647
1648    ping -t 255 224.66.66.66
1649
1650    If nobody answers, mbone does not work.
1651
1652    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1653    ip addr add 10.66.66.<somewhat>/24 dev Universe
1654    ifconfig Universe up
1655    ifconfig Universe add fe80::<Your_real_addr>/10
1656    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1657    ftp 10.66.66.66
1658    ...
1659    ftp fec0:6666:6666::193.233.7.65
1660    ...
1661
1662  */
1663
1664 #ifdef HAVE_NETDEV_HEADER_OPS
1665 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1666                        unsigned short type,
1667                        const void *daddr, const void *saddr, unsigned len)
1668 #else
1669 static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
1670                         void *daddr, void *saddr, unsigned len)
1671 #endif
1672 {
1673         struct ip_tunnel *t = netdev_priv(dev);
1674         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1675         __be16 *p = (__be16*)(iph+1);
1676
1677         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1678         p[0]            = t->parms.o_flags;
1679         p[1]            = htons(type);
1680
1681         /*
1682          *      Set the source hardware address.
1683          */
1684
1685         if (saddr)
1686                 memcpy(&iph->saddr, saddr, 4);
1687
1688         if (daddr) {
1689                 memcpy(&iph->daddr, daddr, 4);
1690                 return t->hlen;
1691         }
1692         if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1693                 return t->hlen;
1694
1695         return -t->hlen;
1696 }
1697
1698 #ifdef HAVE_NETDEV_HEADER_OPS
1699 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1700 #else
1701 static int ipgre_header_parse(struct sk_buff *skb, unsigned char *haddr)
1702 #endif
1703 {
1704         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1705         memcpy(haddr, &iph->saddr, 4);
1706         return 4;
1707 }
1708
#ifdef HAVE_NETDEV_HEADER_OPS
/* Link-layer header operations used by broadcast/NBMA GRE devices. */
static const struct header_ops ipgre_header_ops = {
	.create = ipgre_header,
	.parse  = ipgre_header_parse,
};
#endif
1715
1716 #ifdef CONFIG_NET_IPGRE_BROADCAST
1717 static int ipgre_open(struct net_device *dev)
1718 {
1719         struct ip_tunnel *t = netdev_priv(dev);
1720
1721         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1722                 struct flowi fl = { .oif = t->parms.link,
1723                                     .nl_u = { .ip4_u =
1724                                               { .daddr = t->parms.iph.daddr,
1725                                                 .saddr = t->parms.iph.saddr,
1726                                                 .tos = RT_TOS(t->parms.iph.tos) } },
1727                                     .proto = IPPROTO_GRE };
1728                 struct rtable *rt;
1729                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1730                         return -EADDRNOTAVAIL;
1731                 dev = rt->u.dst.dev;
1732                 ip_rt_put(rt);
1733                 if (__in_dev_get_rtnl(dev) == NULL)
1734                         return -EADDRNOTAVAIL;
1735                 t->mlink = dev->ifindex;
1736                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1737         }
1738         return 0;
1739 }
1740
1741 static int ipgre_close(struct net_device *dev)
1742 {
1743         struct ip_tunnel *t = netdev_priv(dev);
1744
1745         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1746                 struct in_device *in_dev;
1747                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1748                 if (in_dev) {
1749                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1750                         in_dev_put(in_dev);
1751                 }
1752         }
1753         return 0;
1754 }
1755
1756 #endif
1757
1758 static void ethtool_getinfo(struct net_device *dev,
1759                             struct ethtool_drvinfo *info)
1760 {
1761         strcpy(info->driver, "ip_gre");
1762         strcpy(info->version, "Open vSwitch "VERSION BUILDNR);
1763         strcpy(info->bus_info, dev->type == ARPHRD_ETHER ? "gretap" : "gre");
1764 }
1765
/* Only drvinfo is provided; all other ethtool queries use defaults. */
static struct ethtool_ops ethtool_ops = {
	.get_drvinfo            = ethtool_getinfo,
};
1769
#ifdef HAVE_NET_DEVICE_OPS
/* Device operations for plain (ARPHRD_IPGRE, layer-3) tunnel devices. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init               = ipgre_tunnel_init,
	.ndo_uninit             = ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open               = ipgre_open,
	.ndo_stop               = ipgre_close,
#endif
	.ndo_start_xmit         = ipgre_tunnel_xmit,
	.ndo_do_ioctl           = ipgre_tunnel_ioctl,
	.ndo_change_mtu         = ipgre_tunnel_change_mtu,
};
#endif
1783
/* Basic netdev setup for a plain GRE tunnel device.  Installs the
 * operations (via net_device_ops on newer kernels, individual
 * function pointers on older ones) and the virtual device's
 * link-level parameters. */
static void ipgre_tunnel_setup(struct net_device *dev)
{
#ifdef HAVE_NET_DEVICE_OPS
	dev->netdev_ops         = &ipgre_netdev_ops;
#else
	dev->init               = ipgre_tunnel_init;
	dev->uninit             = ipgre_tunnel_uninit;
	dev->hard_start_xmit    = ipgre_tunnel_xmit;
#ifndef HAVE_NETDEV_STATS
	dev->get_stats          = ipgre_tunnel_get_stats;
#endif
	dev->do_ioctl           = ipgre_tunnel_ioctl;
	dev->change_mtu         = ipgre_tunnel_change_mtu;
#endif /* HAVE_NET_DEVICE_OPS */
	dev->destructor         = free_netdev;

	dev->type               = ARPHRD_IPGRE;
	/* Reserve room for the outer IP header plus 4-byte GRE header. */
#ifdef HAVE_NETDEV_NEEDED_HEADROOM
	dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
#else
	dev->hard_header_len    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
#endif
	dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags              = IFF_NOARP;
	dev->iflink             = 0;
	/* The device "hardware address" is an IPv4 address. */
	dev->addr_len           = 4;
	/* Tunnels are not movable between network namespaces. */
	dev->features           |= NETIF_F_NETNS_LOCAL;
	dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;

	SET_ETHTOOL_OPS(dev, &ethtool_ops);
}
1815
/* ndo_init for plain GRE devices.  Copies the tunnel endpoints into
 * dev_addr/broadcast (the GRE "hardware" addresses) and, for tunnels
 * with a multicast destination or with no fixed destination (NBMA),
 * installs the link-layer header ops so neighbors can be expressed. */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Broadcast GRE requires a fixed local address. */
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
#ifdef HAVE_NETDEV_HEADER_OPS
			dev->header_ops = &ipgre_header_ops;
#else
			dev->hard_header = ipgre_header;
			dev->hard_header_parse = ipgre_header_parse;
#endif
#ifndef HAVE_NET_DEVICE_OPS
			dev->open = ipgre_open;
			dev->stop = ipgre_close;
#endif
		}
#endif
	} else {
		/* NBMA tunnel: destination comes from the neighbor entry. */
#ifdef HAVE_NETDEV_HEADER_OPS
		dev->header_ops = &ipgre_header_ops;
#else
		dev->hard_header = ipgre_header;
		dev->hard_header_parse = ipgre_header_parse;
#endif
	}

	return 0;
}
1859
/* Initialize the per-namespace fallback device, which catches GRE
 * packets that match no configured tunnel.  The return type differs
 * by kernel version: called directly (void) with net_device_ops,
 * installed as dev->init (int) on older kernels. */
#ifdef HAVE_NET_DEVICE_OPS
static void ipgre_fb_tunnel_init(struct net_device *dev)
#else
static int ipgre_fb_tunnel_init(struct net_device *dev)
#endif
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version            = 4;
	iph->protocol           = IPPROTO_GRE;
	iph->ihl                = 5;
	/* Outer IP header plus the 4-byte base GRE header. */
	tunnel->hlen            = sizeof(struct iphdr) + 4;

	/* Hold a reference for the wildcard hash slot. */
	dev_hold(dev);
	ign->tunnels_wc[0]      = tunnel;

#ifndef HAVE_NET_DEVICE_OPS
	return 0;
#endif
}
1885
/* IPPROTO_GRE receive/error handlers.  struct net_protocol became
 * const in 2.6.32; the netns_ok flag appeared in 2.6.26. */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32)
static struct net_protocol ipgre_protocol = {
#else
static const struct net_protocol ipgre_protocol = {
#endif
	.handler        =       ipgre_rcv,
	.err_handler    =       ipgre_err,
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
	.netns_ok       =       1,
#endif
};
1897
1898 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1899 {
1900         int prio;
1901
1902         for (prio = 0; prio < 4; prio++) {
1903                 int h;
1904                 for (h = 0; h < HASH_SIZE; h++) {
1905                         struct ip_tunnel *t = ign->tunnels[prio][h];
1906
1907                         while (t != NULL) {
1908                                 unregister_netdevice_queue(t->dev, head);
1909                                 t = t->next;
1910                         }
1911                 }
1912         }
1913 }
1914
1915 static int ipgre_init_net(struct net *net)
1916 {
1917         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1918         int err;
1919
1920         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), GRE_IOCTL_DEVICE,
1921                                            ipgre_tunnel_setup);
1922         if (!ign->fb_tunnel_dev) {
1923                 err = -ENOMEM;
1924                 goto err_alloc_dev;
1925         }
1926         dev_net_set(ign->fb_tunnel_dev, net);
1927
1928 #ifdef HAVE_NET_DEVICE_OPS
1929         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1930 #else
1931         ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
1932 #endif
1933 #ifndef GRE_IOCTL_ONLY
1934         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1935 #endif
1936
1937         if ((err = register_netdev(ign->fb_tunnel_dev)))
1938                 goto err_reg_dev;
1939
1940         return 0;
1941
1942 err_reg_dev:
1943         free_netdev(ign->fb_tunnel_dev);
1944 err_alloc_dev:
1945         return err;
1946 }
1947
1948 static void ipgre_exit_net(struct net *net)
1949 {
1950         struct ipgre_net *ign;
1951         LIST_HEAD(list);
1952
1953         ign = net_generic(net, ipgre_net_id);
1954         rtnl_lock();
1955         ipgre_destroy_tunnels(ign, &list);
1956         unregister_netdevice_many(&list);
1957         rtnl_unlock();
1958 }
1959
/* Per-network-namespace lifecycle hooks; ipgre_net_id indexes the
 * net_generic() storage, whose size is given by .size. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};
1966
1967 static int ipgre_tap_init(struct net_device *dev)
1968 {
1969         struct ip_tunnel *tunnel;
1970
1971         tunnel = netdev_priv(dev);
1972
1973         tunnel->dev = dev;
1974         strcpy(tunnel->parms.name, dev->name);
1975
1976         ipgre_tunnel_bind_dev(dev);
1977
1978         return 0;
1979 }
1980
#ifdef HAVE_NET_DEVICE_OPS
/* Device operations for gretap devices.  These carry Ethernet frames,
 * so MAC address handling uses the standard Ethernet helpers. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init               = ipgre_tap_init,
	.ndo_uninit             = ipgre_tunnel_uninit,
	.ndo_start_xmit         = ipgre_tunnel_xmit,
	.ndo_set_mac_address    = eth_mac_addr,
	.ndo_validate_addr      = eth_validate_addr,
	.ndo_do_ioctl           = ipgre_tunnel_ioctl,
	.ndo_change_mtu         = ipgre_tunnel_change_mtu,
};
#endif
1992
/* netdev setup for gretap devices: start from ether_setup() so the
 * device presents as Ethernet, then install the tunnel operations. */
static void ipgre_tap_setup(struct net_device *dev)
{
	ether_setup(dev);

#ifdef HAVE_NET_DEVICE_OPS
	dev->netdev_ops         = &ipgre_tap_netdev_ops;
#else
	dev->init               = ipgre_tap_init;
	dev->uninit             = ipgre_tunnel_uninit;
	dev->hard_start_xmit    = ipgre_tunnel_xmit;
#ifndef HAVE_NETDEV_STATS
	dev->get_stats          = ipgre_tunnel_get_stats;
#endif
	dev->do_ioctl           = ipgre_tunnel_ioctl;
	dev->change_mtu         = ipgre_tunnel_change_mtu;
#endif /* HAVE_NET_DEVICE_OPS */
	dev->destructor         = free_netdev;

	dev->iflink             = 0;
	/* Tunnels are not movable between network namespaces. */
	dev->features           |= NETIF_F_NETNS_LOCAL;
	/* Virtual device: no transmit queue needed. */
	dev->tx_queue_len       = 0;

	SET_ETHTOOL_OPS(dev, &ethtool_ops);
}
2017
2018 #ifndef GRE_IOCTL_ONLY
2019 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
2020 {
2021         __be16 flags;
2022
2023         if (!data)
2024                 return 0;
2025
2026         flags = 0;
2027         if (data[IFLA_GRE_IFLAGS])
2028                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
2029         if (data[IFLA_GRE_OFLAGS])
2030                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
2031         if (flags & (GRE_VERSION|GRE_ROUTING))
2032                 return -EINVAL;
2033
2034         return 0;
2035 }
2036
2037 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
2038 {
2039         __be32 daddr;
2040
2041         if (tb[IFLA_ADDRESS]) {
2042                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
2043                         return -EINVAL;
2044                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
2045                         return -EADDRNOTAVAIL;
2046         }
2047
2048         if (!data)
2049                 goto out;
2050
2051         if (data[IFLA_GRE_REMOTE]) {
2052                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
2053                 if (!daddr)
2054                         return -EINVAL;
2055         }
2056
2057 out:
2058         return ipgre_tunnel_validate(tb, data);
2059 }
2060
/* Translate IFLA_GRE_* netlink attributes into ip_tunnel_parm.
 * Absent attributes leave their field zeroed, except path-MTU
 * discovery, which defaults to enabled (DF set) when unspecified. */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
2101
/* rtnl newlink handler shared by "gre" and "gretap".  Parses the
 * netlink parameters, refuses duplicate tunnels, gives gretap devices
 * a random MAC when none was supplied, and links the new tunnel into
 * the per-namespace hash.  The signature gained src_net in 2.6.33. */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33)
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
#else
static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
#endif
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		random_ether_addr(dev->dev_addr);

	/* Binding to the underlay may shrink the MTU; honor an
	 * explicit IFLA_MTU if one was given. */
	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	err = register_netdevice(dev);
	if (err)
		goto out;

	/* Hold a reference for the hash-table link. */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
2139
/* rtnl changelink handler: reconfigure an existing tunnel.  The
 * fallback device cannot be changed.  If the new endpoints/key match
 * a different tunnel, fail with -EEXIST; otherwise re-hash this
 * tunnel under the new addresses before applying the remaining
 * parameters. */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, false, 0);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* A device cannot switch between broadcast and
			 * point-to-point mode after creation. */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		/* Re-hash under the new endpoints and input key. */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		/* New underlay: rebind and recompute the MTU unless an
		 * explicit IFLA_MTU was supplied. */
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
2203
/* Worst-case netlink payload size for ipgre_fill_info(). */
static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}
2229
/* Dump the tunnel configuration into a netlink message.  The
 * NLA_PUT_* macros jump to nla_put_failure when the skb runs out of
 * room. */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	/* PMTU discovery is reported as a boolean derived from DF. */
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
2251
/* Netlink attribute validation policy for IFLA_GRE_*.  The endpoint
 * addresses are validated by length, as raw IPv4 addresses. */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]         = { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
	[IFLA_GRE_IKEY]         = { .type = NLA_U32 },
	[IFLA_GRE_OKEY]         = { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]          = { .type = NLA_U8 },
	[IFLA_GRE_TOS]          = { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
};
2264
/* rtnl link operations for plain "gre" (layer-3) tunnels. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind           = "gre",
	.maxtype        = IFLA_GRE_MAX,
	.policy         = ipgre_policy,
	.priv_size      = sizeof(struct ip_tunnel),
	.setup          = ipgre_tunnel_setup,
	.validate       = ipgre_tunnel_validate,
	.newlink        = ipgre_newlink,
	.changelink     = ipgre_changelink,
	.get_size       = ipgre_get_size,
	.fill_info      = ipgre_fill_info,
};
2277
/* rtnl link operations for "gretap" (Ethernet-over-GRE) tunnels. */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind           = "gretap",
	.maxtype        = IFLA_GRE_MAX,
	.policy         = ipgre_policy,
	.priv_size      = sizeof(struct ip_tunnel),
	.setup          = ipgre_tap_setup,
	.validate       = ipgre_tap_validate,
	.newlink        = ipgre_newlink,
	.changelink     = ipgre_changelink,
	.get_size       = ipgre_get_size,
	.fill_info      = ipgre_fill_info,
};
#endif /* !GRE_IOCTL_ONLY */
2291
2292 /*
2293  *      And now the modules code and kernel interface.
2294  */
2295
2296 static int __init ipgre_init(void)
2297 {
2298         int err;
2299
2300         printk(KERN_INFO "Open vSwitch GRE over IPv4, built "__DATE__" "
2301                          __TIME__"\n");
2302
2303         if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
2304                 printk(KERN_INFO "ipgre init: can't add protocol\n");
2305                 return -EAGAIN;
2306         }
2307
2308         err = register_pernet_device(&ipgre_net_ops);
2309         if (err < 0)
2310                 goto pernet_device_failed;
2311
2312 #ifndef GRE_IOCTL_ONLY
2313         err = rtnl_link_register(&ipgre_link_ops);
2314         if (err < 0)
2315                 goto rtnl_link_failed;
2316
2317         err = rtnl_link_register(&ipgre_tap_ops);
2318         if (err < 0)
2319                 goto tap_ops_failed;
2320 #endif
2321
2322 out:
2323         return err;
2324
2325 #ifndef GRE_IOCTL_ONLY
2326 tap_ops_failed:
2327         rtnl_link_unregister(&ipgre_link_ops);
2328 rtnl_link_failed:
2329         unregister_pernet_device(&ipgre_net_ops);
2330 #endif
2331 pernet_device_failed:
2332         inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
2333         goto out;
2334
2335 }
2336
2337 static void __exit ipgre_fini(void)
2338 {
2339 #ifndef GRE_IOCTL_ONLY
2340         rtnl_link_unregister(&ipgre_tap_ops);
2341         rtnl_link_unregister(&ipgre_link_ops);
2342 #endif
2343         unregister_pernet_device(&ipgre_net_ops);
2344         if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
2345                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
2346 }
2347
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_DESCRIPTION("GRE over IPv4 tunneling driver");
MODULE_LICENSE("GPL");
#ifndef GRE_IOCTL_ONLY
/* Aliases let "ip link add ... type gre|gretap" auto-load this module. */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
#endif
2356