1 /* ip_gre driver port to Linux 2.6.18 and greater */
2
3 #include <linux/version.h>
4 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
5 #define HAVE_NETDEV_STATS
6 #endif
7 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
8 #define HAVE_NETDEV_HEADER_OPS
9 #endif
10 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
11 #define HAVE_NETDEV_NEEDED_HEADROOM
12 #endif
13
14 /*
15  *      Linux NET3:     GRE over IP protocol decoder.
16  *
17  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
18  *
19  *      This program is free software; you can redistribute it and/or
20  *      modify it under the terms of the GNU General Public License
21  *      as published by the Free Software Foundation; either version
22  *      2 of the License, or (at your option) any later version.
23  *
24  */
25
26 #include <linux/capability.h>
27 #include <linux/module.h>
28 #include <linux/types.h>
29 #include <linux/kernel.h>
30 #include <asm/uaccess.h>
31 #include <linux/skbuff.h>
32 #include <linux/netdevice.h>
33 #include <linux/in.h>
34 #include <linux/tcp.h>
35 #include <linux/udp.h>
36 #include <linux/if_arp.h>
37 #include <linux/mroute.h>
38 #include <linux/init.h>
39 #include <linux/in6.h>
40 #include <linux/inetdevice.h>
41 #include <linux/igmp.h>
42 #include <linux/netfilter_ipv4.h>
43 #include <linux/etherdevice.h>
44 #include <linux/if_ether.h>
45
46 #include <net/sock.h>
47 #include <net/ip.h>
48 #include <net/icmp.h>
49 #include <net/protocol.h>
50 #include <net/ipip.h>
51 #include <net/arp.h>
52 #include <net/checksum.h>
53 #include <net/dsfield.h>
54 #include <net/inet_ecn.h>
55 #include <net/xfrm.h>
56 #include <net/net_namespace.h>
57 #include <net/netns/generic.h>
58
59 #ifdef CONFIG_IPV6
60 #include <net/ipv6.h>
61 #include <net/ip6_fib.h>
62 #include <net/ip6_route.h>
63 #endif
64
65 #include "compat.h"
66 #include "openvswitch/gre.h"
67
68 #ifndef GRE_IOCTL_ONLY
69 #include <net/rtnetlink.h>
70 #endif
71
72 /*
73    Problems & solutions
74    --------------------
75
76    1. The most important issue is detecting local dead loops.
77    They would cause complete host lockup in transmit, which
78    would be "resolved" by stack overflow or, if queueing is enabled,
79    with infinite looping in net_bh.
80
81    We cannot track such dead loops during route installation;
82    it is an infeasible task. The most general solution would be
83    to keep an skb->encapsulation counter (a sort of local ttl)
84    and silently drop the packet when it expires. It is the best
85    solution, but it supposes maintaining a new variable in ALL
86    skbs, even if no tunneling is used.
87
88    Current solution: HARD_TX_LOCK lock breaks dead loops.
89
90
91
92    2. Networking dead loops would not kill routers, but would really
93    kill the network. The IP hop limit plays the role of "t->recursion" in
94    this case, if we copy it from the packet being encapsulated to the
95    upper header. It is a very good solution, but it introduces two problems:
96
97    - Routing protocols using packets with ttl=1 (OSPF, RIP2)
98      do not work over tunnels.
99    - traceroute does not work. I planned to relay ICMP from the tunnel,
100      so that this problem would be solved and traceroute output
101      would be even more informative. This idea appeared to be wrong:
102      only Linux complies with rfc1812 now (yes, guys, Linux is the only
103      true router now :-)); all routers (at least in my neighbourhood)
104      return only 8 bytes of payload. That is the end of it.
105
106    Hence, if we want OSPF to work or traceroute to say something reasonable,
107    we should search for another solution.
108
109    One of them is to parse the packet, trying to detect inner encapsulation
110    made by our node. It is difficult or even impossible, especially
111    taking into account fragmentation. To be short, it is not a solution at all.
112
113    Current solution: The solution was UNEXPECTEDLY SIMPLE.
114    We force the DF flag on tunnels with a preconfigured hop limit,
115    that is ALL. :-) Well, it does not remove the problem completely,
116    but the exponential growth of network traffic is changed to linear
117    (branches that exceed the pmtu are pruned) and the tunnel mtu
118    quickly degrades to a value <68, where looping stops.
119    Yes, it is not good if there exists a router in the loop
120    which does not force DF, even when the encapsulating packets have DF set.
121    But it is not our problem! Nobody could accuse us; we did
122    all that we could do. Even if it was your gated that injected
123    the fatal route into the network, even if it was you who configured
124    the fatal static route: you are innocent. :-)
125
126
127
128    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
129    practically identical code. It would be good to glue them
130    together, but it is not very evident how to make them modular.
131    sit is an integral part of IPv6; ipip and gre are naturally modular.
132    We could extract the common parts (hash table, ioctl etc.)
133    into a separate module (ip_tunnel.c).
134
135    Alexey Kuznetsov.
136  */
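/*
 * Worked example for the DF trick in point 2 above (assuming a plain
 * IPv4+GRE encapsulation with no GRE options or key): every nesting level
 * adds 20 bytes of outer IP header plus 4 bytes of GRE header, so with DF
 * forced the usable tunnel mtu shrinks by 24 bytes per pass,
 * e.g. 1500 -> 1476 -> 1452 -> ..., and after roughly 60 iterations it
 * falls below 68 and the looping branch is pruned instead of growing
 * exponentially.
 */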
137
138 #ifndef GRE_IOCTL_ONLY
139 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
140 static struct rtnl_link_ops ipgre_tap_ops __read_mostly;
141 #endif
142 static int ipgre_tunnel_init(struct net_device *dev);
143 static void ipgre_tunnel_setup(struct net_device *dev);
144 static void ipgre_tap_setup(struct net_device *dev);
145 static int ipgre_tunnel_bind_dev(struct net_device *dev);
146
147 #define HASH_SIZE  16
148
149 static int ipgre_net_id;
150 struct ipgre_net {
151         struct ip_tunnel *tunnels[4][HASH_SIZE];
152
153         struct net_device *fb_tunnel_dev;
154 };
155
156 /* Tunnel hash table */
157
158 /*
159    4 hash tables:
160
161    3: (remote,local)
162    2: (remote,*)
163    1: (*,local)
164    0: (*,*)
165
166    We require an exact key match, i.e. if a key is present in the packet
167    it will match only a tunnel with the same key; if it is not present,
168    it will match only a keyless tunnel.
169
170    All keyless packets that do not match a configured keyless tunnel
171    will match the fallback tunnel (see the lookup sketch below).
172  */
173
174 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
175
176 #define tunnels_r_l     tunnels[3]
177 #define tunnels_r       tunnels[2]
178 #define tunnels_l       tunnels[1]
179 #define tunnels_wc      tunnels[0]
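/*
 * Illustrative sketch (not part of the driver, kept out of the build via
 * #if 0): how the receive-side lookup below walks these tables, given the
 * HASH() macro and the tunnels_* aliases above.  A packet whose remote,
 * local and key are all known is looked up in
 * tunnels_r_l[HASH(remote) ^ HASH(key)], then tunnels_r[HASH(remote) ^
 * HASH(key)], then tunnels_l[HASH(key)], then tunnels_wc[HASH(key)], and
 * finally falls back to the fb_tunnel_dev if that device is up.
 */
#if 0
static unsigned int example_full_match_bucket(__be32 remote, __be32 key)
{
	/* Both nibble hashes are in 0..15, so the XOR stays within HASH_SIZE. */
	return HASH(remote) ^ HASH(key);
}
#endif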
180
181 static DEFINE_RWLOCK(ipgre_lock);
182
183 /* Given src, dst and key, find the appropriate tunnel for input. */
184
185 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
186                                               __be32 remote, __be32 local,
187                                               __be32 key, __be16 gre_proto)
188 {
189         struct net *net = dev_net(dev);
190         int link = dev->ifindex;
191         unsigned h0 = HASH(remote);
192         unsigned h1 = HASH(key);
193         struct ip_tunnel *t, *cand = NULL;
194         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
195         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
196                        ARPHRD_ETHER : ARPHRD_IPGRE;
197         int score, cand_score = 4;
198
199         for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
200                 if (local != t->parms.iph.saddr ||
201                     remote != t->parms.iph.daddr ||
202                     key != t->parms.i_key ||
203                     !(t->dev->flags & IFF_UP))
204                         continue;
205
206                 if (t->dev->type != ARPHRD_IPGRE &&
207                     t->dev->type != dev_type)
208                         continue;
209
210                 score = 0;
211                 if (t->parms.link != link)
212                         score |= 1;
213                 if (t->dev->type != dev_type)
214                         score |= 2;
215                 if (score == 0)
216                         return t;
217
218                 if (score < cand_score) {
219                         cand = t;
220                         cand_score = score;
221                 }
222         }
223
224         for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
225                 if (remote != t->parms.iph.daddr ||
226                     key != t->parms.i_key ||
227                     !(t->dev->flags & IFF_UP))
228                         continue;
229
230                 if (t->dev->type != ARPHRD_IPGRE &&
231                     t->dev->type != dev_type)
232                         continue;
233
234                 score = 0;
235                 if (t->parms.link != link)
236                         score |= 1;
237                 if (t->dev->type != dev_type)
238                         score |= 2;
239                 if (score == 0)
240                         return t;
241
242                 if (score < cand_score) {
243                         cand = t;
244                         cand_score = score;
245                 }
246         }
247
248         for (t = ign->tunnels_l[h1]; t; t = t->next) {
249                 if ((local != t->parms.iph.saddr &&
250                      (local != t->parms.iph.daddr ||
251                       !ipv4_is_multicast(local))) ||
252                     key != t->parms.i_key ||
253                     !(t->dev->flags & IFF_UP))
254                         continue;
255
256                 if (t->dev->type != ARPHRD_IPGRE &&
257                     t->dev->type != dev_type)
258                         continue;
259
260                 score = 0;
261                 if (t->parms.link != link)
262                         score |= 1;
263                 if (t->dev->type != dev_type)
264                         score |= 2;
265                 if (score == 0)
266                         return t;
267
268                 if (score < cand_score) {
269                         cand = t;
270                         cand_score = score;
271                 }
272         }
273
274         for (t = ign->tunnels_wc[h1]; t; t = t->next) {
275                 if (t->parms.i_key != key ||
276                     !(t->dev->flags & IFF_UP))
277                         continue;
278
279                 if (t->dev->type != ARPHRD_IPGRE &&
280                     t->dev->type != dev_type)
281                         continue;
282
283                 score = 0;
284                 if (t->parms.link != link)
285                         score |= 1;
286                 if (t->dev->type != dev_type)
287                         score |= 2;
288                 if (score == 0)
289                         return t;
290
291                 if (score < cand_score) {
292                         cand = t;
293                         cand_score = score;
294                 }
295         }
296
297         if (cand != NULL)
298                 return cand;
299
300         if (ign->fb_tunnel_dev->flags & IFF_UP)
301                 return netdev_priv(ign->fb_tunnel_dev);
302
303         return NULL;
304 }
305
306 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
307                 struct ip_tunnel_parm *parms)
308 {
309         __be32 remote = parms->iph.daddr;
310         __be32 local = parms->iph.saddr;
311         __be32 key = parms->i_key;
312         unsigned h = HASH(key);
313         int prio = 0;
314
315         if (local)
316                 prio |= 1;
317         if (remote && !ipv4_is_multicast(remote)) {
318                 prio |= 2;
319                 h ^= HASH(remote);
320         }
321
322         return &ign->tunnels[prio][h];
323 }
324
325 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
326                 struct ip_tunnel *t)
327 {
328         return __ipgre_bucket(ign, &t->parms);
329 }
330
331 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
332 {
333         struct ip_tunnel **tp = ipgre_bucket(ign, t);
334
335         t->next = *tp;
336         write_lock_bh(&ipgre_lock);
337         *tp = t;
338         write_unlock_bh(&ipgre_lock);
339 }
340
341 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
342 {
343         struct ip_tunnel **tp;
344
345         for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
346                 if (t == *tp) {
347                         write_lock_bh(&ipgre_lock);
348                         *tp = t->next;
349                         write_unlock_bh(&ipgre_lock);
350                         break;
351                 }
352         }
353 }
354
355 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
356                                            struct ip_tunnel_parm *parms,
357                                            int type)
358 {
359         __be32 remote = parms->iph.daddr;
360         __be32 local = parms->iph.saddr;
361         __be32 key = parms->i_key;
362         int link = parms->link;
363         struct ip_tunnel *t, **tp;
364         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
365
366         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
367                 if (local == t->parms.iph.saddr &&
368                     remote == t->parms.iph.daddr &&
369                     key == t->parms.i_key &&
370                     link == t->parms.link &&
371                     type == t->dev->type)
372                         break;
373
374         return t;
375 }
376
377 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
378                 struct ip_tunnel_parm *parms, int gretap, int create)
379 {
380         struct ip_tunnel *t, *nt;
381         struct net_device *dev;
382         char name[IFNAMSIZ];
383         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
384
385         t = ipgre_tunnel_find(net, parms, gretap ? ARPHRD_ETHER : ARPHRD_IPGRE);
386         if (t || !create)
387                 return t;
388
389         if (parms->name[0])
390                 strlcpy(name, parms->name, IFNAMSIZ);
391         else
392                 sprintf(name, "gre%%d");
393
394         dev = alloc_netdev(sizeof(*t), name, gretap ? ipgre_tap_setup
395                                                     : ipgre_tunnel_setup);
396         if (!dev)
397                 return NULL;
398
399         dev_net_set(dev, net);
400
401         if (strchr(name, '%')) {
402                 if (dev_alloc_name(dev, name) < 0)
403                         goto failed_free;
404         }
405
406         if (gretap)
407                 random_ether_addr(dev->dev_addr);
408
409 #ifndef GRE_IOCTL_ONLY
410         dev->rtnl_link_ops = gretap ? &ipgre_tap_ops : &ipgre_link_ops;
411 #endif
412         nt = netdev_priv(dev);
413         nt->parms = *parms;
414
415         dev->mtu = ipgre_tunnel_bind_dev(dev);
416
417         if (register_netdevice(dev) < 0)
418                 goto failed_free;
419
420         dev_hold(dev);
421         ipgre_tunnel_link(ign, nt);
422         return nt;
423
424 failed_free:
425         free_netdev(dev);
426         return NULL;
427 }
428
429 static void ipgre_tunnel_uninit(struct net_device *dev)
430 {
431         struct net *net = dev_net(dev);
432         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
433
434         ipgre_tunnel_unlink(ign, netdev_priv(dev));
435         dev_put(dev);
436 }
437
438
439 static void ipgre_err(struct sk_buff *skb, u32 info)
440 {
441
442 /* All the routers (except for Linux) return only
443    8 bytes of packet payload. It means that precise relaying of
444    ICMP in the real Internet is absolutely infeasible.
445
446    Moreover, Cisco "wise men" put the GRE key in the third word
447    of the GRE header. That makes it impossible to maintain even soft state
448    for keyed GRE tunnels with checksums enabled. Tell them "thank you".
449
450    Well, I wonder: rfc1812 was written by a Cisco employee, so
451    why the hell do these idiots break standards established
452    by themselves???
453  */
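/* Concretely, with the flags+protocol / optional checksum / optional key
   layout used below: the 8 returned payload bytes cover the 2-byte flags,
   the 2-byte protocol and at most one 4-byte optional field.  For a tunnel
   using both GRE_CSUM and GRE_KEY the key sits in bytes 8-11 of the GRE
   header, so it is cut off and the lookup further down cannot identify the
   tunnel. */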
454
455         struct iphdr *iph = (struct iphdr *)skb->data;
456         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
457         int grehlen = (iph->ihl<<2) + 4;
458         const int type = icmp_hdr(skb)->type;
459         const int code = icmp_hdr(skb)->code;
460         struct ip_tunnel *t;
461         __be16 flags;
462
463         flags = p[0];
464         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
465                 if (flags&(GRE_VERSION|GRE_ROUTING))
466                         return;
467                 if (flags&GRE_KEY) {
468                         grehlen += 4;
469                         if (flags&GRE_CSUM)
470                                 grehlen += 4;
471                 }
472         }
473
474         /* If only 8 bytes are returned, keyed messages will be dropped here */
475         if (skb_headlen(skb) < grehlen)
476                 return;
477
478         switch (type) {
479         default:
480         case ICMP_PARAMETERPROB:
481                 return;
482
483         case ICMP_DEST_UNREACH:
484                 switch (code) {
485                 case ICMP_SR_FAILED:
486                 case ICMP_PORT_UNREACH:
487                         /* Impossible event. */
488                         return;
489                 case ICMP_FRAG_NEEDED:
490                         /* Soft state for pmtu is maintained by IP core. */
491                         return;
492                 default:
493                         /* All others are translated to HOST_UNREACH.
494                            rfc2003 contains "deep thoughts" about NET_UNREACH,
495                            I believe they are just ether pollution. --ANK
496                          */
497                         break;
498                 }
499                 break;
500         case ICMP_TIME_EXCEEDED:
501                 if (code != ICMP_EXC_TTL)
502                         return;
503                 break;
504         }
505
506         read_lock(&ipgre_lock);
507         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
508                                 flags & GRE_KEY ?
509                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
510                                 p[1]);
511         if (t == NULL || t->parms.iph.daddr == 0 ||
512             ipv4_is_multicast(t->parms.iph.daddr))
513                 goto out;
514
515         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
516                 goto out;
517
518         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
519                 t->err_count++;
520         else
521                 t->err_count = 1;
522         t->err_time = jiffies;
523 out:
524         read_unlock(&ipgre_lock);
525         return;
526 }
527
528 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
529 {
530         if (INET_ECN_is_ce(iph->tos)) {
531                 if (skb->protocol == htons(ETH_P_IP)) {
532                         IP_ECN_set_ce(ip_hdr(skb));
533                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
534                         IP6_ECN_set_ce(ipv6_hdr(skb));
535                 }
536         }
537 }
538
539 static inline u8
540 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
541 {
542         u8 inner = 0;
543         if (skb->protocol == htons(ETH_P_IP))
544                 inner = old_iph->tos;
545         else if (skb->protocol == htons(ETH_P_IPV6))
546                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
547         return INET_ECN_encapsulate(tos, inner);
548 }
549
550 static int ipgre_rcv(struct sk_buff *skb)
551 {
552         struct iphdr *iph;
553         u8     *h;
554         __be16    flags;
555         __sum16   csum = 0;
556         __be32 key = 0;
557         u32    seqno = 0;
558         struct ip_tunnel *tunnel;
559         int    offset = 4;
560         __be16 gre_proto;
561         unsigned int len;
562
563         if (!pskb_may_pull(skb, 16))
564                 goto drop_nolock;
565
566         iph = ip_hdr(skb);
567         h = skb->data;
568         flags = *(__be16*)h;
569
570         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
571                 /* - Version must be 0.
572                    - We do not support routing headers.
573                  */
574                 if (flags&(GRE_VERSION|GRE_ROUTING))
575                         goto drop_nolock;
576
577                 if (flags&GRE_CSUM) {
578                         switch (skb->ip_summed) {
579                         case CHECKSUM_COMPLETE:
580                                 csum = csum_fold(skb->csum);
581                                 if (!csum)
582                                         break;
583                                 /* fall through */
584                         case CHECKSUM_NONE:
585                                 skb->csum = 0;
586                                 csum = __skb_checksum_complete(skb);
587                                 skb->ip_summed = CHECKSUM_COMPLETE;
588                         }
589                         offset += 4;
590                 }
591                 if (flags&GRE_KEY) {
592                         key = *(__be32*)(h + offset);
593                         offset += 4;
594                 }
595                 if (flags&GRE_SEQ) {
596                         seqno = ntohl(*(__be32*)(h + offset));
597                         offset += 4;
598                 }
599         }
600
601         gre_proto = *(__be16 *)(h + 2);
602
603         read_lock(&ipgre_lock);
604         if ((tunnel = ipgre_tunnel_lookup(skb->dev,
605                                           iph->saddr, iph->daddr, key,
606                                           gre_proto))) {
607                 struct net_device_stats *stats;
608 #ifdef HAVE_NETDEV_STATS
609                 stats = &tunnel->dev->stats;
610 #else
611                 stats = &tunnel->stat;
612 #endif
613
614                 secpath_reset(skb);
615
616                 skb->protocol = gre_proto;
617                 /* WCCP version 1 and 2 protocol decoding.
618                  * - Change protocol to IP
619                  * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
620                  */
621                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
622                         skb->protocol = htons(ETH_P_IP);
623                         if ((*(h + offset) & 0xF0) != 0x40)
624                                 offset += 4;
625                 }
626
627                 skb->mac_header = skb->network_header;
628                 __pskb_pull(skb, offset);
629                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
630                 skb->pkt_type = PACKET_HOST;
631 #ifdef CONFIG_NET_IPGRE_BROADCAST
632                 if (ipv4_is_multicast(iph->daddr)) {
633                         /* Looped back packet, drop it! */
634                         if (skb_rtable(skb)->fl.iif == 0)
635                                 goto drop;
636                         stats->multicast++;
637                         skb->pkt_type = PACKET_BROADCAST;
638                 }
639 #endif
640
641                 if (((flags&GRE_CSUM) && csum) ||
642                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
643                         stats->rx_crc_errors++;
644                         stats->rx_errors++;
645                         goto drop;
646                 }
647                 if (tunnel->parms.i_flags&GRE_SEQ) {
648                         if (!(flags&GRE_SEQ) ||
649                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
650                                 stats->rx_fifo_errors++;
651                                 stats->rx_errors++;
652                                 goto drop;
653                         }
654                         tunnel->i_seqno = seqno + 1;
655                 }
656
657                 len = skb->len;
658
659                 /* Warning: All skb pointers will be invalidated! */
660                 if (tunnel->dev->type == ARPHRD_ETHER) {
661                         if (!pskb_may_pull(skb, ETH_HLEN)) {
662                                 stats->rx_length_errors++;
663                                 stats->rx_errors++;
664                                 goto drop;
665                         }
666
667                         iph = ip_hdr(skb);
668                         skb->protocol = eth_type_trans(skb, tunnel->dev);
669                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
670                 }
671
672                 stats->rx_packets++;
673                 stats->rx_bytes += len;
674                 skb->dev = tunnel->dev;
675                 skb_dst_drop(skb);
676                 nf_reset(skb);
677
678                 skb_reset_network_header(skb);
679                 ipgre_ecn_decapsulate(iph, skb);
680
681 #ifdef CHECKSUM_HW
682                 /* XXX: Temporary workaround to avoid a panic when doing
683                  * bridging due to multiple meanings of CHECKSUM_HW. */
684                 if (skb->ip_summed == CHECKSUM_HW)
685                         skb->ip_summed = CHECKSUM_NONE;
686 #endif
687
688                 netif_rx(skb);
689                 read_unlock(&ipgre_lock);
690                 return(0);
691         }
692         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
693
694 drop:
695         read_unlock(&ipgre_lock);
696 drop_nolock:
697         kfree_skb(skb);
698         return(0);
699 }
700
701 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
702 {
703         struct ip_tunnel *tunnel = netdev_priv(dev);
704         struct net_device_stats *stats;
705         struct iphdr  *old_iph = ip_hdr(skb);
706         struct iphdr  *tiph;
707         u8     tos;
708         __be16 df;
709         struct rtable *rt;                      /* Route to the other host */
710         struct net_device *tdev;                /* Device to other host */
711         struct iphdr  *iph;                     /* Our new IP header */
712         unsigned int max_headroom;              /* The extra header space needed */
713         int    gre_hlen;
714         __be32 dst;
715         int    mtu;
716
717 #ifdef HAVE_NETDEV_STATS
718         stats = &tunnel->dev->stats;
719 #else
720         stats = &tunnel->stat;
721 #endif
722
723         if (dev->type == ARPHRD_ETHER)
724                 IPCB(skb)->flags = 0;
725
726 #ifdef HAVE_NETDEV_HEADER_OPS
727         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
728 #else
729         if (dev->hard_header && dev->type == ARPHRD_IPGRE) {
730 #endif
731                 gre_hlen = 0;
732                 tiph = (struct iphdr *)skb->data;
733         } else {
734                 gre_hlen = tunnel->hlen;
735                 tiph = &tunnel->parms.iph;
736         }
737
738         if ((dst = tiph->daddr) == 0) {
739                 /* NBMA tunnel */
740
741                 if (skb_dst(skb) == NULL) {
742                         stats->tx_fifo_errors++;
743                         goto tx_error;
744                 }
745
746                 if (skb->protocol == htons(ETH_P_IP)) {
747                         rt = skb_rtable(skb);
748                         if ((dst = rt->rt_gateway) == 0)
749                                 goto tx_error_icmp;
750                 }
751 #ifdef CONFIG_IPV6
752                 else if (skb->protocol == htons(ETH_P_IPV6)) {
753                         struct in6_addr *addr6;
754                         int addr_type;
755                         struct neighbour *neigh = skb_dst(skb)->neighbour;
756
757                         if (neigh == NULL)
758                                 goto tx_error;
759
760                         addr6 = (struct in6_addr *)&neigh->primary_key;
761                         addr_type = ipv6_addr_type(addr6);
762
763                         if (addr_type == IPV6_ADDR_ANY) {
764                                 addr6 = &ipv6_hdr(skb)->daddr;
765                                 addr_type = ipv6_addr_type(addr6);
766                         }
767
768                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
769                                 goto tx_error_icmp;
770
771                         dst = addr6->s6_addr32[3];
772                 }
773 #endif
774                 else
775                         goto tx_error;
776         }
777
778         tos = tiph->tos;
779         if (tos == 1) {
780                 tos = 0;
781                 if (skb->protocol == htons(ETH_P_IP))
782                         tos = old_iph->tos;
783         }
784
785         {
786                 struct flowi fl = { .oif = tunnel->parms.link,
787                                     .nl_u = { .ip4_u =
788                                               { .daddr = dst,
789                                                 .saddr = tiph->saddr,
790                                                 .tos = RT_TOS(tos) } },
791                                     .proto = IPPROTO_GRE };
792                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
793                         stats->tx_carrier_errors++;
794                         goto tx_error;
795                 }
796         }
797         tdev = rt->u.dst.dev;
798
799         if (tdev == dev) {
800                 ip_rt_put(rt);
801                 stats->collisions++;
802                 goto tx_error;
803         }
804
805         df = tiph->frag_off;
806         if (df)
807 #ifdef HAVE_NETDEV_NEEDED_HEADROOM
808                 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
809 #else
810                 mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
811 #endif
812         else
813                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
814
815         if (skb_dst(skb))
816                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
817
818         /* XXX: Temporarily allow fragmentation since DF doesn't
819          * do the right thing with bridging. */
820 /*
821         if (skb->protocol == htons(ETH_P_IP)) {
822                 df |= (old_iph->frag_off&htons(IP_DF));
823
824                 if ((old_iph->frag_off&htons(IP_DF)) &&
825                     mtu < ntohs(old_iph->tot_len)) {
826                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
827                         ip_rt_put(rt);
828                         goto tx_error;
829                 }
830         }
831 #ifdef CONFIG_IPV6
832         else if (skb->protocol == htons(ETH_P_IPV6)) {
833                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
834
835                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
836                         if ((tunnel->parms.iph.daddr &&
837                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
838                             rt6->rt6i_dst.plen == 128) {
839                                 rt6->rt6i_flags |= RTF_MODIFIED;
840                                 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
841                         }
842                 }
843
844                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
845                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
846                         ip_rt_put(rt);
847                         goto tx_error;
848                 }
849         }
850 #endif
851 */
852         if (tunnel->err_count > 0) {
853                 if (time_before(jiffies,
854                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
855                         tunnel->err_count--;
856
857                         dst_link_failure(skb);
858                 } else
859                         tunnel->err_count = 0;
860         }
861
862         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
863
864         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
865             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
866                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
867                 if (!new_skb) {
868                         ip_rt_put(rt);
869                         stats->tx_dropped++;
870                         dev_kfree_skb(skb);
871                         return NETDEV_TX_OK;
872                 }
873                 if (skb->sk)
874                         skb_set_owner_w(new_skb, skb->sk);
875                 dev_kfree_skb(skb);
876                 skb = new_skb;
877                 old_iph = ip_hdr(skb);
878         }
879
880         skb_reset_transport_header(skb);
881         skb_push(skb, gre_hlen);
882         skb_reset_network_header(skb);
883         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
884         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
885                               IPSKB_REROUTED);
886         skb_dst_drop(skb);
887         skb_dst_set(skb, &rt->u.dst);
888
889         /*
890          *      Push down and install the outer IP header.
891          */
892
893         iph                     =       ip_hdr(skb);
894         iph->version            =       4;
895         iph->ihl                =       sizeof(struct iphdr) >> 2;
896         iph->frag_off           =       df;
897         iph->protocol           =       IPPROTO_GRE;
898         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
899         iph->daddr              =       rt->rt_dst;
900         iph->saddr              =       rt->rt_src;
901
902         if ((iph->ttl = tiph->ttl) == 0) {
903                 if (skb->protocol == htons(ETH_P_IP))
904                         iph->ttl = old_iph->ttl;
905 #ifdef CONFIG_IPV6
906                 else if (skb->protocol == htons(ETH_P_IPV6))
907                         iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
908 #endif
909                 else
910                         iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
911         }
912
913         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
914         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
915                                    htons(ETH_P_TEB) : skb->protocol;
916
917         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
918                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
919
920                 if (tunnel->parms.o_flags&GRE_SEQ) {
921                         ++tunnel->o_seqno;
922                         *ptr = htonl(tunnel->o_seqno);
923                         ptr--;
924                 }
925                 if (tunnel->parms.o_flags&GRE_KEY) {
926                         *ptr = tunnel->parms.o_key;
927                         ptr--;
928                 }
929                 if (tunnel->parms.o_flags&GRE_CSUM) {
930                         *ptr = 0;
931                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
932                 }
933         }
934
935         nf_reset(skb);
936
937         IPTUNNEL_XMIT();
938         return NETDEV_TX_OK;
939
940 tx_error_icmp:
941         dst_link_failure(skb);
942
943 tx_error:
944         stats->tx_errors++;
945         dev_kfree_skb(skb);
946         return NETDEV_TX_OK;
947 }
948
949 static int ipgre_tunnel_bind_dev(struct net_device *dev)
950 {
951         struct net_device *tdev = NULL;
952         struct ip_tunnel *tunnel;
953         struct iphdr *iph;
954         int hlen = LL_MAX_HEADER;
955         int mtu = ETH_DATA_LEN;
956         int addend = sizeof(struct iphdr) + 4;
957
958         tunnel = netdev_priv(dev);
959         iph = &tunnel->parms.iph;
960
961         /* Guess output device to choose reasonable mtu and needed_headroom */
962
963         if (iph->daddr) {
964                 struct flowi fl = { .oif = tunnel->parms.link,
965                                     .nl_u = { .ip4_u =
966                                               { .daddr = iph->daddr,
967                                                 .saddr = iph->saddr,
968                                                 .tos = RT_TOS(iph->tos) } },
969                                     .proto = IPPROTO_GRE };
970                 struct rtable *rt;
971                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
972                         tdev = rt->u.dst.dev;
973                         ip_rt_put(rt);
974                 }
975
976                 if (dev->type != ARPHRD_ETHER)
977                         dev->flags |= IFF_POINTOPOINT;
978         }
979
980         if (!tdev && tunnel->parms.link)
981                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
982
983         if (tdev) {
984 #ifdef HAVE_NETDEV_NEEDED_HEADROOM
985                 hlen = tdev->hard_header_len + tdev->needed_headroom;
986 #else
987                 hlen = tdev->hard_header_len;
988 #endif
989                 mtu = tdev->mtu;
990         }
991         dev->iflink = tunnel->parms.link;
992
993         /* Precalculate GRE options length */
994         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
995                 if (tunnel->parms.o_flags&GRE_CSUM)
996                         addend += 4;
997                 if (tunnel->parms.o_flags&GRE_KEY)
998                         addend += 4;
999                 if (tunnel->parms.o_flags&GRE_SEQ)
1000                         addend += 4;
1001         }
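        /* For example, a tunnel configured with GRE_CSUM and GRE_KEY but not
         * GRE_SEQ ends up with addend = 20 (iphdr) + 4 (flags/proto) + 4 + 4
         * = 32 bytes, which becomes tunnel->hlen below and feeds the mtu and
         * headroom calculations. */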
1002 #ifdef HAVE_NETDEV_NEEDED_HEADROOM
1003         dev->needed_headroom = hlen + addend;
1004         mtu -= dev->hard_header_len + addend;
1005 #else
1006         dev->hard_header_len = hlen + addend;
1007         mtu -= addend;
1008 #endif
1009         tunnel->hlen = addend;
1010
1011         if (mtu < 68)
1012                 mtu = 68;
1013
1014         /* XXX: Set MTU to the maximum possible value.  If we are bridged to a
1015          * device with a larger MTU, then packets will be dropped. */
1016         mtu = 65482;
1017
1018         return mtu;
1019 }
1020
1021 static int
1022 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1023 {
1024         int err = 0;
1025         struct ip_tunnel_parm p;
1026         struct ip_tunnel *t;
1027         struct net *net = dev_net(dev);
1028         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1029         int add_tunnel, gretap;
1030
1031         switch (cmd) {
1032         case SIOCGETTUNNEL:
1033                 t = NULL;
1034                 if (dev == ign->fb_tunnel_dev) {
1035                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
1036                                 err = -EFAULT;
1037                                 break;
1038                         }
1039                         t = ipgre_tunnel_locate(net, &p, false, 0);
1040                 }
1041                 if (t == NULL)
1042                         t = netdev_priv(dev);
1043                 memcpy(&p, &t->parms, sizeof(p));
1044                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1045                         err = -EFAULT;
1046                 break;
1047
1048         case SIOCADDTUNNEL:
1049         case SIOCCHGTUNNEL:
1050         case SIOCADDGRETAP:
1051         case SIOCCHGGRETAP:
1052                 err = -EPERM;
1053                 if (!capable(CAP_NET_ADMIN))
1054                         goto done;
1055
1056                 err = -EFAULT;
1057                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1058                         goto done;
1059
1060                 err = -EINVAL;
1061                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1062                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1063                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1064                         goto done;
1065
1066                 add_tunnel = (cmd == SIOCADDTUNNEL || cmd == SIOCADDGRETAP);
1067                 gretap = (cmd == SIOCADDGRETAP || cmd == SIOCCHGGRETAP);
1068
1069                 if (p.iph.ttl)
1070                         p.iph.frag_off |= htons(IP_DF);
1071
1072                 if (!(p.i_flags&GRE_KEY))
1073                         p.i_key = 0;
1074                 if (!(p.o_flags&GRE_KEY))
1075                         p.o_key = 0;
1076
1077                 t = ipgre_tunnel_locate(net, &p, gretap, add_tunnel);
1078
1079                 if (dev != ign->fb_tunnel_dev && !add_tunnel) {
1080                         if (t != NULL) {
1081                                 if (t->dev != dev) {
1082                                         err = -EEXIST;
1083                                         break;
1084                                 }
1085                         } else {
1086                                 unsigned nflags = 0;
1087
1088                                 t = netdev_priv(dev);
1089
1090                                 if (ipv4_is_multicast(p.iph.daddr))
1091                                         nflags = IFF_BROADCAST;
1092                                 else if (p.iph.daddr)
1093                                         nflags = IFF_POINTOPOINT;
1094
1095                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1096                                         err = -EINVAL;
1097                                         break;
1098                                 }
1099                                 ipgre_tunnel_unlink(ign, t);
1100                                 t->parms.iph.saddr = p.iph.saddr;
1101                                 t->parms.iph.daddr = p.iph.daddr;
1102                                 t->parms.i_key = p.i_key;
1103                                 t->parms.o_key = p.o_key;
1104                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1105                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1106                                 ipgre_tunnel_link(ign, t);
1107                                 netdev_state_change(dev);
1108                         }
1109                 }
1110
1111                 if (t) {
1112                         err = 0;
1113                         if (!add_tunnel) {
1114                                 t->parms.iph.ttl = p.iph.ttl;
1115                                 t->parms.iph.tos = p.iph.tos;
1116                                 t->parms.iph.frag_off = p.iph.frag_off;
1117                                 if (t->parms.link != p.link) {
1118                                         t->parms.link = p.link;
1119                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
1120                                         netdev_state_change(dev);
1121                                 }
1122                         }
1123                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1124                                 err = -EFAULT;
1125                 } else
1126                         err = (add_tunnel ? -ENOBUFS : -ENOENT);
1127                 break;
1128
1129         case SIOCDELTUNNEL:
1130                 err = -EPERM;
1131                 if (!capable(CAP_NET_ADMIN))
1132                         goto done;
1133
1134                 if (dev == ign->fb_tunnel_dev) {
1135                         err = -EFAULT;
1136                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1137                                 goto done;
1138                         err = -ENOENT;
1139                         if ((t = ipgre_tunnel_locate(net, &p, false, 0)) == NULL)
1140                                 goto done;
1141                         err = -EPERM;
1142                         if (t == netdev_priv(ign->fb_tunnel_dev))
1143                                 goto done;
1144                         dev = t->dev;
1145                 }
1146                 unregister_netdevice(dev);
1147                 err = 0;
1148                 break;
1149
1150         default:
1151                 err = -EINVAL;
1152         }
1153
1154 done:
1155         return err;
1156 }
1157
1158 #ifndef HAVE_NETDEV_STATS
1159 static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
1160 {
1161         return &(((struct ip_tunnel*)netdev_priv(dev))->stat);
1162 }
1163 #endif
1164
1165 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1166 {
1167         struct ip_tunnel *tunnel = netdev_priv(dev);
1168         if (new_mtu < 68 ||
1169 #ifdef HAVE_NETDEV_NEEDED_HEADROOM
1170         new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1171 #else
1172         new_mtu > 0xFFF8 - tunnel->hlen)
1173 #endif
1174                 return -EINVAL;
1175         dev->mtu = new_mtu;
1176         return 0;
1177 }
1178
1179 /* Nice toy. Unfortunately, useless in real life :-)
1180    It allows one to construct a virtual multiprotocol broadcast "LAN"
1181    over the Internet, provided multicast routing is tuned.
1182
1183
1184    I have no idea whether this bicycle was invented before me,
1185    so I had to set ARPHRD_IPGRE to a random value.
1186    I have an impression that Cisco could make something similar,
1187    but this feature is apparently missing in IOS<=11.2(8).
1188
1189    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1190    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1191
1192    ping -t 255 224.66.66.66
1193
1194    If nobody answers, mbone does not work.
1195
1196    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1197    ip addr add 10.66.66.<somewhat>/24 dev Universe
1198    ifconfig Universe up
1199    ifconfig Universe add fe80::<Your_real_addr>/10
1200    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1201    ftp 10.66.66.66
1202    ...
1203    ftp fec0:6666:6666::193.233.7.65
1204    ...
1205
1206  */
1207
1208 #ifdef HAVE_NETDEV_HEADER_OPS
1209 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1210                        unsigned short type,
1211                        const void *daddr, const void *saddr, unsigned len)
1212 #else
1213 static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
1214                         void *daddr, void *saddr, unsigned len)
1215 #endif
1216 {
1217         struct ip_tunnel *t = netdev_priv(dev);
1218         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1219         __be16 *p = (__be16*)(iph+1);
1220
1221         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1222         p[0]            = t->parms.o_flags;
1223         p[1]            = htons(type);
1224
1225         /*
1226          *      Set the source hardware address.
1227          */
1228
1229         if (saddr)
1230                 memcpy(&iph->saddr, saddr, 4);
1231
1232         if (daddr) {
1233                 memcpy(&iph->daddr, daddr, 4);
1234                 return t->hlen;
1235         }
1236         if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1237                 return t->hlen;
1238
1239         return -t->hlen;
1240 }
1241
1242 #ifdef HAVE_NETDEV_HEADER_OPS
1243 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1244 #else
1245 static int ipgre_header_parse(struct sk_buff *skb, unsigned char *haddr)
1246 #endif
1247 {
1248         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1249         memcpy(haddr, &iph->saddr, 4);
1250         return 4;
1251 }
1252
1253 #ifdef HAVE_NETDEV_HEADER_OPS
1254 static const struct header_ops ipgre_header_ops = {
1255         .create = ipgre_header,
1256         .parse  = ipgre_header_parse,
1257 };
1258 #endif
1259
1260 #ifdef CONFIG_NET_IPGRE_BROADCAST
1261 static int ipgre_open(struct net_device *dev)
1262 {
1263         struct ip_tunnel *t = netdev_priv(dev);
1264
1265         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1266                 struct flowi fl = { .oif = t->parms.link,
1267                                     .nl_u = { .ip4_u =
1268                                               { .daddr = t->parms.iph.daddr,
1269                                                 .saddr = t->parms.iph.saddr,
1270                                                 .tos = RT_TOS(t->parms.iph.tos) } },
1271                                     .proto = IPPROTO_GRE };
1272                 struct rtable *rt;
1273                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1274                         return -EADDRNOTAVAIL;
1275                 dev = rt->u.dst.dev;
1276                 ip_rt_put(rt);
1277                 if (__in_dev_get_rtnl(dev) == NULL)
1278                         return -EADDRNOTAVAIL;
1279                 t->mlink = dev->ifindex;
1280                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1281         }
1282         return 0;
1283 }
1284
1285 static int ipgre_close(struct net_device *dev)
1286 {
1287         struct ip_tunnel *t = netdev_priv(dev);
1288
1289         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1290                 struct in_device *in_dev;
1291                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1292                 if (in_dev) {
1293                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1294                         in_dev_put(in_dev);
1295                 }
1296         }
1297         return 0;
1298 }
1299
1300 #endif
1301
1302 #ifdef HAVE_NET_DEVICE_OPS
1303 static const struct net_device_ops ipgre_netdev_ops = {
1304         .ndo_init               = ipgre_tunnel_init,
1305         .ndo_uninit             = ipgre_tunnel_uninit,
1306 #ifdef CONFIG_NET_IPGRE_BROADCAST
1307         .ndo_open               = ipgre_open,
1308         .ndo_stop               = ipgre_close,
1309 #endif
1310         .ndo_start_xmit         = ipgre_tunnel_xmit,
1311         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1312         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1313 };
1314 #endif
1315
1316 static void ipgre_tunnel_setup(struct net_device *dev)
1317 {
1318 #ifdef HAVE_NET_DEVICE_OPS
1319         dev->netdev_ops         = &ipgre_netdev_ops;
1320 #else
1321         dev->init               = ipgre_tunnel_init;
1322         dev->uninit             = ipgre_tunnel_uninit;
1323         dev->hard_start_xmit    = ipgre_tunnel_xmit;
1324 #ifndef HAVE_NETDEV_STATS
1325         dev->get_stats          = ipgre_tunnel_get_stats;
1326 #endif
1327         dev->do_ioctl           = ipgre_tunnel_ioctl;
1328         dev->change_mtu         = ipgre_tunnel_change_mtu;
1329 #endif /* HAVE_NET_DEVICE_OPS */
1330         dev->destructor         = free_netdev;
1331
1332         dev->type               = ARPHRD_IPGRE;
1333 #ifdef HAVE_NETDEV_NEEDED_HEADROOM
1334         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1335 #else
1336         dev->hard_header_len    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1337 #endif
1338         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1339         dev->flags              = IFF_NOARP;
1340         dev->iflink             = 0;
1341         dev->addr_len           = 4;
1342         dev->features           |= NETIF_F_NETNS_LOCAL;
1343         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1344 }
1345
1346 static int ipgre_tunnel_init(struct net_device *dev)
1347 {
1348         struct ip_tunnel *tunnel;
1349         struct iphdr *iph;
1350
1351         tunnel = netdev_priv(dev);
1352         iph = &tunnel->parms.iph;
1353
1354         tunnel->dev = dev;
1355         strcpy(tunnel->parms.name, dev->name);
1356
1357         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1358         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1359
1360         if (iph->daddr) {
1361 #ifdef CONFIG_NET_IPGRE_BROADCAST
1362                 if (ipv4_is_multicast(iph->daddr)) {
1363                         if (!iph->saddr)
1364                                 return -EINVAL;
1365                         dev->flags = IFF_BROADCAST;
1366 #ifdef HAVE_NETDEV_HEADER_OPS
1367                         dev->header_ops = &ipgre_header_ops;
1368 #else
1369                         dev->hard_header = ipgre_header;
1370                         dev->hard_header_parse = ipgre_header_parse;
1371 #endif
1372 #ifndef HAVE_NET_DEVICE_OPS
1373                         dev->open = ipgre_open;
1374                         dev->stop = ipgre_close;
1375 #endif
1376                 }
1377 #endif
1378         } else {
1379 #ifdef HAVE_NETDEV_HEADER_OPS
1380                 dev->header_ops = &ipgre_header_ops;
1381 #else
1382                 dev->hard_header = ipgre_header;
1383                 dev->hard_header_parse = ipgre_header_parse;
1384 #endif
1385         }
1386
1387         return 0;
1388 }
1389
1390 #ifdef HAVE_NET_DEVICE_OPS
1391 static void ipgre_fb_tunnel_init(struct net_device *dev)
1392 #else
1393 static int ipgre_fb_tunnel_init(struct net_device *dev)
1394 #endif
1395 {
1396         struct ip_tunnel *tunnel = netdev_priv(dev);
1397         struct iphdr *iph = &tunnel->parms.iph;
1398         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1399
1400         tunnel->dev = dev;
1401         strcpy(tunnel->parms.name, dev->name);
1402
1403         iph->version            = 4;
1404         iph->protocol           = IPPROTO_GRE;
1405         iph->ihl                = 5;
1406         tunnel->hlen            = sizeof(struct iphdr) + 4;
1407
1408         dev_hold(dev);
1409         ign->tunnels_wc[0]      = tunnel;
1410
1411 #ifndef HAVE_NET_DEVICE_OPS
1412         return 0;
1413 #endif
1414 }
1415
1416 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32)
1417 static struct net_protocol ipgre_protocol = {
1418 #else
1419 static const struct net_protocol ipgre_protocol = {
1420 #endif
1421         .handler        =       ipgre_rcv,
1422         .err_handler    =       ipgre_err,
1423 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
1424         .netns_ok       =       1,
1425 #endif
1426 };
1427
1428 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1429 {
1430         int prio;
1431
1432         for (prio = 0; prio < 4; prio++) {
1433                 int h;
1434                 for (h = 0; h < HASH_SIZE; h++) {
1435                         struct ip_tunnel *t;
1436                         while ((t = ign->tunnels[prio][h]) != NULL)
1437                                 unregister_netdevice(t->dev);
1438                 }
1439         }
1440 }
1441
1442 static int ipgre_init_net(struct net *net)
1443 {
1444         int err;
1445         struct ipgre_net *ign;
1446
1447         err = -ENOMEM;
1448         ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1449         if (ign == NULL)
1450                 goto err_alloc;
1451
1452         err = net_assign_generic(net, ipgre_net_id, ign);
1453         if (err < 0)
1454                 goto err_assign;
1455
1456         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), GRE_IOCTL_DEVICE,
1457                                            ipgre_tunnel_setup);
1458         if (!ign->fb_tunnel_dev) {
1459                 err = -ENOMEM;
1460                 goto err_alloc_dev;
1461         }
1462         dev_net_set(ign->fb_tunnel_dev, net);
1463
1464 #ifdef HAVE_NET_DEVICE_OPS
1465         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1466 #else
1467         ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
1468 #endif
1469 #ifndef GRE_IOCTL_ONLY
1470         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1471 #endif
1472
1473         if ((err = register_netdev(ign->fb_tunnel_dev)))
1474                 goto err_reg_dev;
1475
1476         return 0;
1477
1478 err_reg_dev:
1479         free_netdev(ign->fb_tunnel_dev);
1480 err_alloc_dev:
1481         /* nothing */
1482 err_assign:
1483         kfree(ign);
1484 err_alloc:
1485         return err;
1486 }
1487
1488 static void ipgre_exit_net(struct net *net)
1489 {
1490         struct ipgre_net *ign;
1491
1492         ign = net_generic(net, ipgre_net_id);
1493         rtnl_lock();
1494         ipgre_destroy_tunnels(ign);
1495         rtnl_unlock();
1496         kfree(ign);
1497 }
1498
1499 static struct pernet_operations ipgre_net_ops = {
1500         .init = ipgre_init_net,
1501         .exit = ipgre_exit_net,
1502 };
1503
1504 static int ipgre_tap_init(struct net_device *dev)
1505 {
1506         struct ip_tunnel *tunnel;
1507
1508         tunnel = netdev_priv(dev);
1509
1510         tunnel->dev = dev;
1511         strcpy(tunnel->parms.name, dev->name);
1512
1513         ipgre_tunnel_bind_dev(dev);
1514
1515         return 0;
1516 }
1517
1518 #ifdef HAVE_NET_DEVICE_OPS
1519 static const struct net_device_ops ipgre_tap_netdev_ops = {
1520         .ndo_init               = ipgre_tap_init,
1521         .ndo_uninit             = ipgre_tunnel_uninit,
1522         .ndo_start_xmit         = ipgre_tunnel_xmit,
1523         .ndo_set_mac_address    = eth_mac_addr,
1524         .ndo_validate_addr      = eth_validate_addr,
1525         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1526         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1527 };
1528 #endif
1529
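/*
 * Set up a gretap device: an Ethernet-style interface whose frames are
 * transmitted through the same GRE xmit/ioctl paths as the layer-3
 * tunnel.  Newer kernels use the net_device_ops table above; older
 * ones have the individual function pointers filled in directly.
 */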
1530 static void ipgre_tap_setup(struct net_device *dev)
1531 {
1532         ether_setup(dev);
1533
1534 #ifdef HAVE_NET_DEVICE_OPS
1535         dev->netdev_ops         = &ipgre_tap_netdev_ops;
1536 #else
1537         dev->init               = ipgre_tap_init;
1538         dev->uninit             = ipgre_tunnel_uninit;
1539         dev->hard_start_xmit    = ipgre_tunnel_xmit;
1540 #ifndef HAVE_NETDEV_STATS
1541         dev->get_stats          = ipgre_tunnel_get_stats;
1542 #endif
1543         dev->do_ioctl           = ipgre_tunnel_ioctl;
1544         dev->change_mtu         = ipgre_tunnel_change_mtu;
1545 #endif /* HAVE_NET_DEVICE_OPS */
1546         dev->destructor         = free_netdev;
1547
1548         dev->iflink             = 0;
1549         dev->features           |= NETIF_F_NETNS_LOCAL;
1550 }
1551
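/*
 * The rtnetlink ("ip link ... type gre|gretap") interface below is
 * compiled out entirely when GRE_IOCTL_ONLY is defined.
 * ipgre_tunnel_validate() rejects GRE flag combinations the driver
 * cannot honour (version and routing bits).
 */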
1552 #ifndef GRE_IOCTL_ONLY
1553 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1554 {
1555         __be16 flags;
1556
1557         if (!data)
1558                 return 0;
1559
1560         flags = 0;
1561         if (data[IFLA_GRE_IFLAGS])
1562                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1563         if (data[IFLA_GRE_OFLAGS])
1564                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1565         if (flags & (GRE_VERSION|GRE_ROUTING))
1566                 return -EINVAL;
1567
1568         return 0;
1569 }
1570
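/*
 * gretap additionally checks that any supplied link-layer address is a
 * valid unicast MAC and that a remote address, if given, is non-zero,
 * then falls through to the common GRE validation.
 */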
1571 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1572 {
1573         __be32 daddr;
1574
1575         if (tb[IFLA_ADDRESS]) {
1576                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1577                         return -EINVAL;
1578                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1579                         return -EADDRNOTAVAIL;
1580         }
1581
1582         if (!data)
1583                 goto out;
1584
1585         if (data[IFLA_GRE_REMOTE]) {
1586                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1587                 if (!daddr)
1588                         return -EINVAL;
1589         }
1590
1591 out:
1592         return ipgre_tunnel_validate(tb, data);
1593 }
1594
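/*
 * Translate IFLA_GRE_* attributes into an ip_tunnel_parm.  Path MTU
 * discovery defaults to on (DF set) unless IFLA_GRE_PMTUDISC is
 * present and zero.
 */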
1595 static void ipgre_netlink_parms(struct nlattr *data[],
1596                                 struct ip_tunnel_parm *parms)
1597 {
1598         memset(parms, 0, sizeof(*parms));
1599
1600         parms->iph.protocol = IPPROTO_GRE;
1601
1602         if (!data)
1603                 return;
1604
1605         if (data[IFLA_GRE_LINK])
1606                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1607
1608         if (data[IFLA_GRE_IFLAGS])
1609                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1610
1611         if (data[IFLA_GRE_OFLAGS])
1612                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1613
1614         if (data[IFLA_GRE_IKEY])
1615                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1616
1617         if (data[IFLA_GRE_OKEY])
1618                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1619
1620         if (data[IFLA_GRE_LOCAL])
1621                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1622
1623         if (data[IFLA_GRE_REMOTE])
1624                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1625
1626         if (data[IFLA_GRE_TTL])
1627                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1628
1629         if (data[IFLA_GRE_TOS])
1630                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1631
1632         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1633                 parms->iph.frag_off = htons(IP_DF);
1634 }
1635
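/*
 * rtnetlink "newlink": create a tunnel from netlink attributes.  A
 * tunnel with identical parameters must not already exist; gretap
 * devices get a random MAC address when none was supplied, and the MTU
 * is taken from the underlying device unless IFLA_MTU overrides it.
 */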
1636 static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1637                          struct nlattr *data[])
1638 {
1639         struct ip_tunnel *nt;
1640         struct net *net = dev_net(dev);
1641         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1642         int mtu;
1643         int err;
1644
1645         nt = netdev_priv(dev);
1646         ipgre_netlink_parms(data, &nt->parms);
1647
1648         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1649                 return -EEXIST;
1650
1651         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1652                 random_ether_addr(dev->dev_addr);
1653
1654         mtu = ipgre_tunnel_bind_dev(dev);
1655         if (!tb[IFLA_MTU])
1656                 dev->mtu = mtu;
1657
1658         err = register_netdevice(dev);
1659         if (err)
1660                 goto out;
1661
1662         dev_hold(dev);
1663         ipgre_tunnel_link(ign, nt);
1664
1665 out:
1666         return err;
1667 }
1668
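/*
 * rtnetlink "changelink": modify an existing tunnel in place.  The
 * fallback device cannot be reconfigured this way, and -EEXIST is
 * returned if the new parameters already belong to another device.
 * Changing the endpoints or the input key moves the tunnel to a
 * different hash chain, hence the unlink/relink; TTL, TOS, output key
 * and DF handling are updated directly.
 */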
1669 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1670                             struct nlattr *data[])
1671 {
1672         struct ip_tunnel *t, *nt;
1673         struct net *net = dev_net(dev);
1674         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1675         struct ip_tunnel_parm p;
1676         int mtu;
1677
1678         if (dev == ign->fb_tunnel_dev)
1679                 return -EINVAL;
1680
1681         nt = netdev_priv(dev);
1682         ipgre_netlink_parms(data, &p);
1683
1684         t = ipgre_tunnel_locate(net, &p, false, 0);
1685
1686         if (t) {
1687                 if (t->dev != dev)
1688                         return -EEXIST;
1689         } else {
1690                 t = nt;
1691
1692                 if (dev->type != ARPHRD_ETHER) {
1693                         unsigned int nflags = 0;
1694
1695                         if (ipv4_is_multicast(p.iph.daddr))
1696                                 nflags = IFF_BROADCAST;
1697                         else if (p.iph.daddr)
1698                                 nflags = IFF_POINTOPOINT;
1699
1700                         if ((dev->flags ^ nflags) &
1701                             (IFF_POINTOPOINT | IFF_BROADCAST))
1702                                 return -EINVAL;
1703                 }
1704
1705                 ipgre_tunnel_unlink(ign, t);
1706                 t->parms.iph.saddr = p.iph.saddr;
1707                 t->parms.iph.daddr = p.iph.daddr;
1708                 t->parms.i_key = p.i_key;
1709                 if (dev->type != ARPHRD_ETHER) {
1710                         memcpy(dev->dev_addr, &p.iph.saddr, 4);
1711                         memcpy(dev->broadcast, &p.iph.daddr, 4);
1712                 }
1713                 ipgre_tunnel_link(ign, t);
1714                 netdev_state_change(dev);
1715         }
1716
1717         t->parms.o_key = p.o_key;
1718         t->parms.iph.ttl = p.iph.ttl;
1719         t->parms.iph.tos = p.iph.tos;
1720         t->parms.iph.frag_off = p.iph.frag_off;
1721
1722         if (t->parms.link != p.link) {
1723                 t->parms.link = p.link;
1724                 mtu = ipgre_tunnel_bind_dev(dev);
1725                 if (!tb[IFLA_MTU])
1726                         dev->mtu = mtu;
1727                 netdev_state_change(dev);
1728         }
1729
1730         return 0;
1731 }
1732
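/*
 * Netlink dump support: ipgre_get_size() bounds the attribute payload
 * and ipgre_fill_info() emits the tunnel's current parameters.
 */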
1733 static size_t ipgre_get_size(const struct net_device *dev)
1734 {
1735         return
1736                 /* IFLA_GRE_LINK */
1737                 nla_total_size(4) +
1738                 /* IFLA_GRE_IFLAGS */
1739                 nla_total_size(2) +
1740                 /* IFLA_GRE_OFLAGS */
1741                 nla_total_size(2) +
1742                 /* IFLA_GRE_IKEY */
1743                 nla_total_size(4) +
1744                 /* IFLA_GRE_OKEY */
1745                 nla_total_size(4) +
1746                 /* IFLA_GRE_LOCAL */
1747                 nla_total_size(4) +
1748                 /* IFLA_GRE_REMOTE */
1749                 nla_total_size(4) +
1750                 /* IFLA_GRE_TTL */
1751                 nla_total_size(1) +
1752                 /* IFLA_GRE_TOS */
1753                 nla_total_size(1) +
1754                 /* IFLA_GRE_PMTUDISC */
1755                 nla_total_size(1) +
1756                 0;
1757 }
1758
1759 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1760 {
1761         struct ip_tunnel *t = netdev_priv(dev);
1762         struct ip_tunnel_parm *p = &t->parms;
1763
1764         NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1765         NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1766         NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1767         NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1768         NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1769         NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1770         NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1771         NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1772         NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1773         NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1774
1775         return 0;
1776
1777 nla_put_failure:
1778         return -EMSGSIZE;
1779 }
1780
1781 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1782         [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1783         [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1784         [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1785         [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1786         [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1787         [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1788         [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1789         [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1790         [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1791         [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1792 };
1793
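/*
 * The two link types registered below back "ip link ... type gre" and
 * "... type gretap".  For example (addresses and key are illustrative
 * only):
 *
 *   ip link add gre1 type gretap local 192.0.2.1 remote 192.0.2.2 key 10
 */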
1794 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1795         .kind           = "gre",
1796         .maxtype        = IFLA_GRE_MAX,
1797         .policy         = ipgre_policy,
1798         .priv_size      = sizeof(struct ip_tunnel),
1799         .setup          = ipgre_tunnel_setup,
1800         .validate       = ipgre_tunnel_validate,
1801         .newlink        = ipgre_newlink,
1802         .changelink     = ipgre_changelink,
1803         .get_size       = ipgre_get_size,
1804         .fill_info      = ipgre_fill_info,
1805 };
1806
1807 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1808         .kind           = "gretap",
1809         .maxtype        = IFLA_GRE_MAX,
1810         .policy         = ipgre_policy,
1811         .priv_size      = sizeof(struct ip_tunnel),
1812         .setup          = ipgre_tap_setup,
1813         .validate       = ipgre_tap_validate,
1814         .newlink        = ipgre_newlink,
1815         .changelink     = ipgre_changelink,
1816         .get_size       = ipgre_get_size,
1817         .fill_info      = ipgre_fill_info,
1818 };
1819 #endif
1820
1821 /*
1822  *      And now the module code and kernel interface.
1823  */
1824
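/*
 * Module init: claim the IPPROTO_GRE handler first, then register the
 * per-namespace operations and (unless GRE_IOCTL_ONLY) the two
 * rtnl_link_ops.  Failures unwind in the reverse order.
 */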
1825 static int __init ipgre_init(void)
1826 {
1827         int err;
1828
1829         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1830
1831         if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1832                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1833                 return -EAGAIN;
1834         }
1835
1836         err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1837         if (err < 0)
1838                 goto gen_device_failed;
1839
1840 #ifndef GRE_IOCTL_ONLY
1841         err = rtnl_link_register(&ipgre_link_ops);
1842         if (err < 0)
1843                 goto rtnl_link_failed;
1844
1845         err = rtnl_link_register(&ipgre_tap_ops);
1846         if (err < 0)
1847                 goto tap_ops_failed;
1848 #endif
1849
1850 out:
1851         return err;
1852
1853 #ifndef GRE_IOCTL_ONLY
1854 tap_ops_failed:
1855         rtnl_link_unregister(&ipgre_link_ops);
1856 rtnl_link_failed:
1857         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1858 #endif
1859 gen_device_failed:
1860         inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1861         goto out;
1862
1863 }
1864
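/*
 * Module exit: tear everything down in the reverse order of
 * ipgre_init().
 */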
1865 static void __exit ipgre_fini(void)
1866 {
1867 #ifndef GRE_IOCTL_ONLY
1868         rtnl_link_unregister(&ipgre_tap_ops);
1869         rtnl_link_unregister(&ipgre_link_ops);
1870 #endif
1871         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1872         if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1873                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1874 }
1875
1876 module_init(ipgre_init);
1877 module_exit(ipgre_fini);
1878 MODULE_DESCRIPTION("GRE over IPv4 tunneling driver");
1879 MODULE_LICENSE("GPL");
1880 #ifndef GRE_IOCTL_ONLY
1881 MODULE_ALIAS_RTNL_LINK("gre");
1882 MODULE_ALIAS_RTNL_LINK("gretap");
1883 #endif
1884