/*
 * Copyright (c) 2010 Nicira Networks.
 * Distributed under the terms of the GNU GPL version 2.
 *
 * Significant portions of this file may be copied from parts of the Linux
 * kernel, by Linus Torvalds and others.
 */
#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/if_tunnel.h>
#include <linux/if_vlan.h>
#include <linux/in_route.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/version.h>

#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/protocol.h>
#include <net/route.h>

#include "openvswitch/gre.h"
/* The absolute minimum fragment size.  Note that there are many other
 * definitions of the minimum MTU. */
#define IP_MIN_MTU 68

/* The GRE header is composed of a series of sections: a base and then a
 * variable number of options. */
#define GRE_HEADER_SECTION 4
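/* For orientation only (an illustrative sketch, not a struct this file
 * uses): the base header that create_gre_header() and parse_gre_header()
 * walk field by field.  Each optional section that follows the base, i.e.
 * the checksum (plus two reserved bytes), the key, and the sequence
 * number, is one GRE_HEADER_SECTION of 4 bytes. */
struct gre_base_hdr_sketch {
	__be16 flags;		/* GRE_CSUM, GRE_KEY, GRE_SEQ, version. */
	__be16 protocol;	/* htons(ETH_P_TEB) for bridged Ethernet. */
};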
struct mutable_config {
	struct rcu_head rcu;

	unsigned char eth_addr[ETH_ALEN];
	unsigned int mtu;
	struct gre_port_config port_config;

	int tunnel_hlen; /* Tunnel header length. */
};
struct gre_vport {
	struct tbl_node tbl_node;

	char name[IFNAMSIZ];

	/* Protected by RCU. */
	struct mutable_config *mutable;
};
struct vport_ops gre_vport_ops;

/* Protected by RCU. */
static struct tbl *port_table;
/* These are just used as an optimization: they don't require any kind of
 * synchronization because we could have just as easily read the value before
 * the port change happened. */
static unsigned int key_local_remote_ports;
static unsigned int key_remote_ports;
static unsigned int local_remote_ports;
static unsigned int remote_ports;
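/* Example of the optimization above: find_port() consults these counters
 * and skips the hash computation and table lookup entirely for any class
 * of port (e.g. keyed ports bound to a local address) that currently has
 * no members. */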
static inline struct gre_vport *
gre_vport_priv(const struct vport *vport)
{
	return vport_priv(vport);
}

static inline struct vport *
gre_vport_to_vport(const struct gre_vport *gre_vport)
{
	return vport_from_priv(gre_vport);
}

static inline struct gre_vport *
gre_vport_table_cast(const struct tbl_node *node)
{
	return container_of(node, struct gre_vport, tbl_node);
}
static void
free_config(struct rcu_head *rcu)
{
	struct mutable_config *c = container_of(rcu, struct mutable_config, rcu);

	kfree(c);
}
static void
assign_config_rcu(struct vport *vport, struct mutable_config *new_config)
{
	struct gre_vport *gre_vport = gre_vport_priv(vport);
	struct mutable_config *old_config;

	old_config = rcu_dereference(gre_vport->mutable);
	rcu_assign_pointer(gre_vport->mutable, new_config);
	call_rcu(&old_config->rcu, free_config);
}
static unsigned int *
find_port_pool(const struct mutable_config *mutable)
{
	if (mutable->port_config.flags & GRE_F_IN_KEY_MATCH) {
		if (mutable->port_config.saddr)
			return &local_remote_ports;
		else
			return &remote_ports;
	} else {
		if (mutable->port_config.saddr)
			return &key_local_remote_ports;
		else
			return &key_remote_ports;
	}
}
enum lookup_key {
	LOOKUP_SADDR,
	LOOKUP_DADDR,
	LOOKUP_KEY,
	LOOKUP_KEY_MATCH,
};

struct port_lookup_key {
	u32 vals[4]; /* Contains enum lookup_key keys. */
	const struct mutable_config *mutable;
};
/* Modifies 'target' to store the rcu_dereferenced pointer that was used to do
 * the comparison. */
static int
port_cmp(const struct tbl_node *node, void *target)
{
	const struct gre_vport *gre_vport = gre_vport_table_cast(node);
	struct port_lookup_key *lookup = target;

	lookup->mutable = rcu_dereference(gre_vport->mutable);

	return ((lookup->mutable->port_config.flags & GRE_F_IN_KEY_MATCH) ==
			lookup->vals[LOOKUP_KEY_MATCH]) &&
	       lookup->mutable->port_config.daddr == lookup->vals[LOOKUP_DADDR] &&
	       lookup->mutable->port_config.in_key == lookup->vals[LOOKUP_KEY] &&
	       lookup->mutable->port_config.saddr == lookup->vals[LOOKUP_SADDR];
}
static u32
port_hash(struct port_lookup_key *lookup)
{
	return jhash2(lookup->vals, ARRAY_SIZE(lookup->vals), 0);
}
static int
add_port(struct vport *vport)
{
	struct gre_vport *gre_vport = gre_vport_priv(vport);
	struct port_lookup_key lookup;
	int err;

	if (!port_table) {
		struct tbl *new_table;

		new_table = tbl_create(0);
		if (!new_table)
			return -ENOMEM;

		rcu_assign_pointer(port_table, new_table);

	} else if (tbl_count(port_table) > tbl_n_buckets(port_table)) {
		struct tbl *old_table = port_table;
		struct tbl *new_table;

		new_table = tbl_expand(old_table);
		if (IS_ERR(new_table))
			return PTR_ERR(new_table);

		rcu_assign_pointer(port_table, new_table);
		tbl_deferred_destroy(old_table, NULL);
	}

	lookup.vals[LOOKUP_SADDR] = gre_vport->mutable->port_config.saddr;
	lookup.vals[LOOKUP_DADDR] = gre_vport->mutable->port_config.daddr;
	lookup.vals[LOOKUP_KEY] = gre_vport->mutable->port_config.in_key;
	lookup.vals[LOOKUP_KEY_MATCH] = gre_vport->mutable->port_config.flags & GRE_F_IN_KEY_MATCH;

	err = tbl_insert(port_table, &gre_vport->tbl_node, port_hash(&lookup));
	if (err)
		return err;

	(*find_port_pool(gre_vport->mutable))++;

	return 0;
}
static int
del_port(struct vport *vport)
{
	struct gre_vport *gre_vport = gre_vport_priv(vport);
	int err;

	err = tbl_remove(port_table, &gre_vport->tbl_node);
	if (err)
		return err;

	(*find_port_pool(gre_vport->mutable))--;

	return 0;
}
#define FIND_PORT_KEY   (1 << 0)
#define FIND_PORT_MATCH (1 << 1)
#define FIND_PORT_ANY   (FIND_PORT_KEY | FIND_PORT_MATCH)
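/* find_port() tries the more specific tables first: for FIND_PORT_KEY,
 * ports bound to both a local and a remote address before ports bound
 * only to a remote address; FIND_PORT_ANY falls back from keyed ports to
 * key-match ports. */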
static struct vport *
find_port(__be32 saddr, __be32 daddr, __be32 key, int port_type,
	  const struct mutable_config **mutable)
{
	struct port_lookup_key lookup;
	struct tbl *table = rcu_dereference(port_table);
	struct tbl_node *tbl_node;

	if (!table)
		return NULL;

	lookup.vals[LOOKUP_SADDR] = saddr;
	lookup.vals[LOOKUP_DADDR] = daddr;

	if (port_type & FIND_PORT_KEY) {
		lookup.vals[LOOKUP_KEY] = key;
		lookup.vals[LOOKUP_KEY_MATCH] = 0;

		if (key_local_remote_ports) {
			tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
			if (tbl_node)
				goto found;
		}

		if (key_remote_ports) {
			lookup.vals[LOOKUP_SADDR] = 0;
			tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
			if (tbl_node)
				goto found;
			lookup.vals[LOOKUP_SADDR] = saddr;
		}
	}

	if (port_type & FIND_PORT_MATCH) {
		lookup.vals[LOOKUP_KEY] = 0;
		lookup.vals[LOOKUP_KEY_MATCH] = GRE_F_IN_KEY_MATCH;

		if (local_remote_ports) {
			tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
			if (tbl_node)
				goto found;
		}

		if (remote_ports) {
			lookup.vals[LOOKUP_SADDR] = 0;
			tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
			if (tbl_node)
				goto found;
		}
	}

	return NULL;

found:
	*mutable = lookup.mutable;
	return gre_vport_to_vport(gre_vport_table_cast(tbl_node));
}
static bool
check_ipv4_address(__be32 addr)
{
	if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr)
	    || ipv4_is_loopback(addr) || ipv4_is_zeronet(addr))
		return false;

	return true;
}
static bool
ipv4_should_icmp(struct sk_buff *skb)
{
	struct iphdr *old_iph = ip_hdr(skb);

	/* Don't respond to L2 broadcast. */
	if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
		return false;

	/* Don't respond to L3 broadcast or invalid addresses. */
	if (!check_ipv4_address(old_iph->daddr) ||
	    !check_ipv4_address(old_iph->saddr))
		return false;

	/* Only respond to the first fragment. */
	if (old_iph->frag_off & htons(IP_OFFSET))
		return false;

	/* Don't respond to ICMP error messages. */
	if (old_iph->protocol == IPPROTO_ICMP) {
		u8 icmp_type, *icmp_typep;

		icmp_typep = skb_header_pointer(skb, (u8 *)old_iph +
						(old_iph->ihl << 2) +
						offsetof(struct icmphdr, type) -
						skb->data, sizeof(icmp_type),
						&icmp_type);
		if (!icmp_typep)
			return false;

		if (*icmp_typep > NR_ICMP_TYPES
			|| (*icmp_typep <= ICMP_PARAMETERPROB
			&& *icmp_typep != ICMP_ECHOREPLY
			&& *icmp_typep != ICMP_ECHO))
			return false;
	}

	return true;
}
static void
ipv4_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
		unsigned int mtu, unsigned int payload_length)
{
	struct iphdr *iph, *old_iph = ip_hdr(skb);
	struct icmphdr *icmph;
	u8 *payload;

	iph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
	icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr));
	payload = skb_put(nskb, payload_length);

	/* IP */
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr) >> 2;
	iph->tos = (old_iph->tos & IPTOS_TOS_MASK) |
		   IPTOS_PREC_INTERNETCONTROL;
	iph->tot_len = htons(sizeof(struct iphdr)
			     + sizeof(struct icmphdr)
			     + payload_length);
	get_random_bytes(&iph->id, sizeof(iph->id));
	iph->frag_off = 0;
	iph->ttl = IPDEFTTL;
	iph->protocol = IPPROTO_ICMP;
	iph->daddr = old_iph->saddr;
	iph->saddr = old_iph->daddr;

	ip_send_check(iph);

	/* ICMP */
	icmph->type = ICMP_DEST_UNREACH;
	icmph->code = ICMP_FRAG_NEEDED;
	icmph->checksum = 0;
	icmph->un.gateway = htonl(mtu);

	/* Checksum covers the ICMP header plus the copied payload. */
	nskb->csum = csum_partial((u8 *)icmph, sizeof(struct icmphdr), 0);
	nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_iph - skb->data,
					    payload, payload_length,
					    nskb->csum);
	icmph->checksum = csum_fold(nskb->csum);
}
static bool
ipv6_should_icmp(struct sk_buff *skb)
{
	struct ipv6hdr *old_ipv6h = ipv6_hdr(skb);
	int addr_type;
	int payload_off = (u8 *)(old_ipv6h + 1) - skb->data;
	u8 nexthdr = ipv6_hdr(skb)->nexthdr;

	/* Check source address is valid. */
	addr_type = ipv6_addr_type(&old_ipv6h->saddr);
	if (addr_type & IPV6_ADDR_MULTICAST || addr_type == IPV6_ADDR_ANY)
		return false;

	/* Don't reply to unspecified addresses. */
	if (ipv6_addr_type(&old_ipv6h->daddr) == IPV6_ADDR_ANY)
		return false;

	/* Don't respond to ICMP error messages. */
	payload_off = ipv6_skip_exthdr(skb, payload_off, &nexthdr);
	if (payload_off < 0)
		return false;

	if (nexthdr == NEXTHDR_ICMP) {
		u8 icmp_type, *icmp_typep;

		icmp_typep = skb_header_pointer(skb, payload_off +
						offsetof(struct icmp6hdr,
							 icmp6_type),
						sizeof(icmp_type), &icmp_type);
		if (!icmp_typep || !(*icmp_typep & ICMPV6_INFOMSG_MASK))
			return false;
	}

	return true;
}
static void
ipv6_build_icmp(struct sk_buff *skb, struct sk_buff *nskb, unsigned int mtu,
		unsigned int payload_length)
{
	struct ipv6hdr *ipv6h, *old_ipv6h = ipv6_hdr(skb);
	struct icmp6hdr *icmp6h;
	u8 *payload;

	ipv6h = (struct ipv6hdr *)skb_put(nskb, sizeof(struct ipv6hdr));
	icmp6h = (struct icmp6hdr *)skb_put(nskb, sizeof(struct icmp6hdr));
	payload = skb_put(nskb, payload_length);

	/* IPv6 */
	ipv6h->version = 6;
	ipv6h->priority = 0;
	memset(&ipv6h->flow_lbl, 0, sizeof(ipv6h->flow_lbl));
	ipv6h->payload_len = htons(sizeof(struct icmp6hdr)
				   + payload_length);
	ipv6h->nexthdr = NEXTHDR_ICMP;
	ipv6h->hop_limit = IPV6_DEFAULT_HOPLIMIT;
	ipv6_addr_copy(&ipv6h->daddr, &old_ipv6h->saddr);
	ipv6_addr_copy(&ipv6h->saddr, &old_ipv6h->daddr);

	/* ICMPv6 */
	icmp6h->icmp6_type = ICMPV6_PKT_TOOBIG;
	icmp6h->icmp6_code = 0;
	icmp6h->icmp6_cksum = 0;
	icmp6h->icmp6_mtu = htonl(mtu);

	nskb->csum = csum_partial((u8 *)icmp6h, sizeof(struct icmp6hdr), 0);
	nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_ipv6h - skb->data,
					    payload, payload_length,
					    nskb->csum);
	icmp6h->icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
					      sizeof(struct icmp6hdr)
					      + payload_length,
					      ipv6h->nexthdr, nskb->csum);
}
static bool
send_frag_needed(struct vport *vport, const struct mutable_config *mutable,
		 struct sk_buff *skb, unsigned int mtu)
{
	unsigned int eth_hdr_len = ETH_HLEN;
	unsigned int total_length, header_length, payload_length;
	struct ethhdr *eh, *old_eh = eth_hdr(skb);
	struct sk_buff *nskb;

	/* Sanity check */
	if (skb->protocol == htons(ETH_P_IP)) {
		if (mtu < IP_MIN_MTU)
			return false;

		if (!ipv4_should_icmp(skb))
			return true;
	} else {
		if (mtu < IPV6_MIN_MTU)
			return false;

		/* In theory we should do PMTUD on IPv6 multicast messages but
		 * we don't have an address to send from, so just fragment. */
		if (ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST)
			return false;

		if (!ipv6_should_icmp(skb))
			return true;
	}

	/* Allocate */
	if (old_eh->h_proto == htons(ETH_P_8021Q))
		eth_hdr_len = VLAN_ETH_HLEN;

	payload_length = skb->len - eth_hdr_len;
	if (skb->protocol == htons(ETH_P_IP)) {
		header_length = sizeof(struct iphdr) + sizeof(struct icmphdr);
		total_length = min_t(unsigned int, header_length +
						   payload_length, 576);
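		/* 576 is the datagram size every IPv4 host must be able to
		 * accept (RFC 791), so the ICMP error stays deliverable
		 * regardless of the path MTU. */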
	} else {
		header_length = sizeof(struct ipv6hdr) +
				sizeof(struct icmp6hdr);
		total_length = min_t(unsigned int, header_length +
						   payload_length, IPV6_MIN_MTU);
	}

	total_length = min(total_length, mutable->mtu);
	payload_length = total_length - header_length;

	nskb = dev_alloc_skb(NET_IP_ALIGN + eth_hdr_len + header_length +
			     payload_length);
	if (!nskb)
		return false;

	skb_reserve(nskb, NET_IP_ALIGN);

	/* Ethernet / VLAN */
	eh = (struct ethhdr *)skb_put(nskb, eth_hdr_len);
	memcpy(eh->h_dest, old_eh->h_source, ETH_ALEN);
	memcpy(eh->h_source, mutable->eth_addr, ETH_ALEN);
	nskb->protocol = eh->h_proto = old_eh->h_proto;
	if (old_eh->h_proto == htons(ETH_P_8021Q)) {
		struct vlan_ethhdr *vh = (struct vlan_ethhdr *)eh;

		vh->h_vlan_TCI = vlan_eth_hdr(skb)->h_vlan_TCI;
		vh->h_vlan_encapsulated_proto = skb->protocol;
	}
	skb_reset_mac_header(nskb);

	/* Protocol */
	if (skb->protocol == htons(ETH_P_IP))
		ipv4_build_icmp(skb, nskb, mtu, payload_length);
	else
		ipv6_build_icmp(skb, nskb, mtu, payload_length);

	/* Assume that flow based keys are symmetric with respect to input
	 * and output and use the key that we were going to put on the
	 * outgoing packet for the fake received packet.  If the keys are
	 * not symmetric then PMTUD needs to be disabled since we won't have
	 * any way of synthesizing packets. */
	if (mutable->port_config.flags & GRE_F_IN_KEY_MATCH) {
		if (mutable->port_config.flags & GRE_F_OUT_KEY_ACTION)
			OVS_CB(nskb)->tun_id = OVS_CB(skb)->tun_id;
		else
			OVS_CB(nskb)->tun_id = mutable->port_config.out_key;
	}

	vport_receive(vport, nskb);

	return true;
}
static struct sk_buff *
check_headroom(struct sk_buff *skb, int headroom)
{
	if (skb_headroom(skb) < headroom ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *nskb = skb_realloc_headroom(skb, headroom);
		if (!nskb) {
			kfree_skb(skb);
			return ERR_PTR(-ENOMEM);
		}

		set_skb_csum_bits(skb, nskb);
		if (skb->sk)
			skb_set_owner_w(nskb, skb->sk);

		kfree_skb(skb);
		return nskb;
	}

	return skb;
}
static void
create_gre_header(struct sk_buff *skb, const struct mutable_config *mutable)
{
	struct iphdr *iph = ip_hdr(skb);
	__be16 *flags = (__be16 *)(iph + 1);
	__be16 *protocol = flags + 1;
	__be32 *options = (__be32 *)((u8 *)iph + mutable->tunnel_hlen
					       - GRE_HEADER_SECTION);

	*protocol = htons(ETH_P_TEB);
	*flags = 0;

	/* Work backwards over the options so the checksum is last. */
	if (mutable->port_config.out_key ||
	    mutable->port_config.flags & GRE_F_OUT_KEY_ACTION) {
		*flags |= GRE_KEY;

		if (mutable->port_config.flags & GRE_F_OUT_KEY_ACTION)
			*options = OVS_CB(skb)->tun_id;
		else
			*options = mutable->port_config.out_key;

		options--;
	}

	if (mutable->port_config.flags & GRE_F_OUT_CSUM) {
		*flags |= GRE_CSUM;

		*options = 0;
		*(__sum16 *)options = csum_fold(skb_checksum(skb,
						sizeof(struct iphdr),
						skb->len - sizeof(struct iphdr),
						0));
	}
}
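/* Note on the checksum above: GRE's optional checksum covers the GRE
 * header and payload (RFC 2784), which is exactly the skb_checksum()
 * range used, i.e. everything past the outer IP header, with the
 * checksum field itself zeroed beforehand. */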
static int
check_checksum(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);
	__be16 flags = *(__be16 *)(iph + 1);
	__sum16 csum = 0;

	if (flags & GRE_CSUM) {
		switch (skb->ip_summed) {
		case CHECKSUM_COMPLETE:
			csum = csum_fold(skb->csum);
			if (!csum)
				break;
			/* Fall through. */
		case CHECKSUM_NONE:
			skb->csum = 0;
			csum = __skb_checksum_complete(skb);
			skb->ip_summed = CHECKSUM_COMPLETE;
			break;
		}
	}

	return (csum == 0);
}
static int
parse_gre_header(struct iphdr *iph, __be16 *flags, __be32 *key)
{
	__be16 *flagsp = (__be16 *)(iph + 1);
	__be16 *protocol = flagsp + 1;
	__be32 *options = (__be32 *)(protocol + 1);
	int hdr_len;

	*flags = *flagsp;

	if (*flags & (GRE_VERSION | GRE_ROUTING))
		return -EINVAL;

	if (*protocol != htons(ETH_P_TEB))
		return -EINVAL;

	hdr_len = GRE_HEADER_SECTION;

	if (*flags & GRE_CSUM) {
		hdr_len += GRE_HEADER_SECTION;
		options++;
	}

	if (*flags & GRE_KEY) {
		hdr_len += GRE_HEADER_SECTION;
		*key = *options++;
	} else
		*key = 0;

	if (*flags & GRE_SEQ)
		hdr_len += GRE_HEADER_SECTION;

	return hdr_len;
}
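/* Worked example: a header carrying GRE_CSUM | GRE_KEY parses to
 * hdr_len = 4 (base) + 4 (checksum) + 4 (key) = 12 bytes, with 'options'
 * stepped past the checksum section so the key is read correctly. */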
static u8
ecn_encapsulate(u8 tos, struct sk_buff *skb)
{
	u8 inner;

	if (skb->protocol == htons(ETH_P_IP))
		inner = ((struct iphdr *)skb_network_header(skb))->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((struct ipv6hdr *)skb_network_header(skb));
	else
		inner = 0;

	return INET_ECN_encapsulate(tos, inner);
}
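/* INET_ECN_encapsulate() carries the inner packet's ECN bits into the
 * outer ToS (inner CE maps to ECT(0)), so routers on the tunnel path can
 * congestion-mark the outer header; ecn_decapsulate() below then copies
 * such a mark back onto the inner packet. */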
static void
ecn_decapsulate(u8 tos, struct sk_buff *skb)
{
	if (INET_ECN_is_ce(tos)) {
		__be16 protocol = skb->protocol;
		unsigned int nw_header = skb_network_header(skb) - skb->data;

		if (skb->protocol == htons(ETH_P_8021Q)) {
			if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
				return;

			protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
			nw_header += VLAN_HLEN;
		}

		if (protocol == htons(ETH_P_IP)) {
			if (unlikely(!pskb_may_pull(skb, nw_header
			    + sizeof(struct iphdr))))
				return;

			IP_ECN_set_ce((struct iphdr *)(nw_header + skb->data));
		} else if (protocol == htons(ETH_P_IPV6)) {
			if (unlikely(!pskb_may_pull(skb, nw_header
			    + sizeof(struct ipv6hdr))))
				return;

			IP6_ECN_set_ce((struct ipv6hdr *)(nw_header
							  + skb->data));
		}
	}
}
static struct sk_buff *
handle_gso(struct sk_buff *skb)
{
	if (skb_is_gso(skb)) {
		struct sk_buff *nskb = skb_gso_segment(skb, NETIF_F_SG);

		dev_kfree_skb(skb);
		return nskb;
	}

	return skb;
}
static int
handle_csum_offload(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		return skb_checksum_help(skb);

	return 0;
}
/* Called with rcu_read_lock and bottom-halves disabled. */
static void
gre_err(struct sk_buff *skb, u32 info)
{
	struct vport *vport;
	const struct mutable_config *mutable;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	int mtu = ntohs(icmp_hdr(skb)->un.frag.mtu);

	struct iphdr *iph;
	__be16 flags;
	__be32 key;
	int tunnel_hdr_len, tot_hdr_len;
	unsigned int orig_mac_header;
	unsigned int orig_nw_header;

	if (type != ICMP_DEST_UNREACH || code != ICMP_FRAG_NEEDED)
		return;

	/* The minimum size packet that we would actually be able to process:
	 * encapsulating IP header, minimum GRE header, Ethernet header,
	 * inner IPv4 header. */
	if (!pskb_may_pull(skb, sizeof(struct iphdr) + GRE_HEADER_SECTION +
			   ETH_HLEN + sizeof(struct iphdr)))
		return;

	iph = (struct iphdr *)skb->data;

	tunnel_hdr_len = parse_gre_header(iph, &flags, &key);
	if (tunnel_hdr_len < 0)
		return;

	vport = find_port(iph->saddr, iph->daddr, key, FIND_PORT_ANY, &mutable);
	if (!vport)
		return;

	if ((mutable->port_config.flags & GRE_F_IN_CSUM) && !(flags & GRE_CSUM))
		return;

	tot_hdr_len = sizeof(struct iphdr) + tunnel_hdr_len;

	orig_mac_header = skb_mac_header(skb) - skb->data;
	orig_nw_header = skb_network_header(skb) - skb->data;
	skb_set_mac_header(skb, tot_hdr_len);

	tot_hdr_len += ETH_HLEN;

	skb->protocol = eth_hdr(skb)->h_proto;
	if (skb->protocol == htons(ETH_P_8021Q)) {
		tot_hdr_len += VLAN_HLEN;
		skb->protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
	}

	/* The inner network header starts here; extend tot_hdr_len past the
	 * inner L3 header so the pull below makes it linear. */
	skb_set_network_header(skb, tot_hdr_len);

	if (skb->protocol == htons(ETH_P_IP))
		tot_hdr_len += sizeof(struct iphdr);
	else if (skb->protocol == htons(ETH_P_IPV6))
		tot_hdr_len += sizeof(struct ipv6hdr);
	else
		goto out;

	if (!pskb_may_pull(skb, tot_hdr_len))
		goto out;

	/* Clamp a bogusly small reported MTU to the protocol minimum when
	 * the inner packet is large enough for that to make sense. */
	if (skb->protocol == htons(ETH_P_IP)) {
		if (mtu < IP_MIN_MTU) {
			if (ntohs(ip_hdr(skb)->tot_len) >= IP_MIN_MTU)
				mtu = IP_MIN_MTU;
			else
				goto out;
		}
	} else if (skb->protocol == htons(ETH_P_IPV6)) {
		if (mtu < IPV6_MIN_MTU) {
			unsigned int packet_length = sizeof(struct ipv6hdr) +
					ntohs(ipv6_hdr(skb)->payload_len);

			if (packet_length >= IPV6_MIN_MTU
			    || ntohs(ipv6_hdr(skb)->payload_len) == 0)
				mtu = IPV6_MIN_MTU;
			else
				goto out;
		}
	}

	__pskb_pull(skb, tunnel_hdr_len);
	send_frag_needed(vport, mutable, skb, mtu);
	skb_push(skb, tunnel_hdr_len);

out:
	skb_set_mac_header(skb, orig_mac_header);
	skb_set_network_header(skb, orig_nw_header);
	skb->protocol = htons(ETH_P_IP);
}
/* Called with rcu_read_lock and bottom-halves disabled. */
static int
gre_rcv(struct sk_buff *skb)
{
	struct vport *vport;
	const struct mutable_config *mutable;
	int hdr_len;
	struct iphdr *iph;
	__be16 flags;
	__be32 key;

	if (!pskb_may_pull(skb, GRE_HEADER_SECTION + ETH_HLEN))
		goto error;

	if (!check_checksum(skb))
		goto error;

	iph = ip_hdr(skb);

	hdr_len = parse_gre_header(iph, &flags, &key);
	if (hdr_len < 0)
		goto error;

	vport = find_port(iph->daddr, iph->saddr, key, FIND_PORT_ANY, &mutable);
	if (!vport) {
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
		goto error;
	}

	if ((mutable->port_config.flags & GRE_F_IN_CSUM) && !(flags & GRE_CSUM)) {
		vport_record_error(vport, VPORT_E_RX_CRC);
		goto error;
	}

	if (!pskb_pull(skb, hdr_len) || !pskb_may_pull(skb, ETH_HLEN)) {
		vport_record_error(vport, VPORT_E_RX_ERROR);
		goto error;
	}

	skb->pkt_type = PACKET_HOST;
	skb->protocol = eth_type_trans(skb, skb->dev);
	skb_postpull_rcsum(skb, skb_transport_header(skb), hdr_len + ETH_HLEN);

	skb_reset_network_header(skb);

	ecn_decapsulate(iph->tos, skb);

	if (mutable->port_config.flags & GRE_F_IN_KEY_MATCH)
		OVS_CB(skb)->tun_id = key;
	else
		OVS_CB(skb)->tun_id = 0;

	skb_push(skb, ETH_HLEN);
	vport_receive(vport, skb);

	return 0;

error:
	kfree_skb(skb);
	return 0;
}
static int
build_packet(struct vport *vport, const struct mutable_config *mutable,
	     struct iphdr *iph, struct rtable *rt, int max_headroom, int mtu,
	     struct sk_buff *skb)
{
	int err;
	struct iphdr *new_iph;
	int orig_len = skb->len;
	__be16 frag_off = iph->frag_off;

	skb = check_headroom(skb, max_headroom);
	if (unlikely(IS_ERR(skb)))
		goto error;

	err = handle_csum_offload(skb);
	if (err)
		goto error_free;

	if (skb->protocol == htons(ETH_P_IP)) {
		struct iphdr *old_iph = ip_hdr(skb);

		if ((old_iph->frag_off & htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			if (send_frag_needed(vport, mutable, skb, mtu))
				goto error_free;
		}

	} else if (skb->protocol == htons(ETH_P_IPV6)) {
		unsigned int packet_length = skb->len - ETH_HLEN
			- (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);

		/* IPv6 requires PMTUD if the packet is above the minimum MTU. */
		if (packet_length > IPV6_MIN_MTU)
			frag_off = htons(IP_DF);

		if (mtu < packet_length) {
			if (send_frag_needed(vport, mutable, skb, mtu))
				goto error_free;
		}
	}

	skb_reset_transport_header(skb);
	new_iph = (struct iphdr *)skb_push(skb, mutable->tunnel_hlen);
	skb_reset_network_header(skb);

	memcpy(new_iph, iph, sizeof(struct iphdr));
	new_iph->frag_off = frag_off;
	ip_select_ident(new_iph, &rt->u.dst, NULL);

	create_gre_header(skb, mutable);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags = 0;

	err = ip_local_out(skb);
	if (likely(net_xmit_eval(err) == 0))
		return orig_len;
	else {
		vport_record_error(vport, VPORT_E_TX_ERROR);
		return 0;
	}

error_free:
	kfree_skb(skb);
error:
	vport_record_error(vport, VPORT_E_TX_DROPPED);

	return 0;
}
static int
gre_send(struct vport *vport, struct sk_buff *skb)
{
	struct gre_vport *gre_vport = gre_vport_priv(vport);
	const struct mutable_config *mutable = rcu_dereference(gre_vport->mutable);

	struct iphdr *old_iph;
	struct ipv6hdr *old_ipv6h;
	struct iphdr iph;
	struct rtable *rt;
	int max_headroom;
	int mtu;
	int orig_len = 0;

	/* Validate the protocol headers before we try to use them. */
	if (skb->protocol == htons(ETH_P_8021Q)) {
		if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
			goto error_free;

		skb->protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
		skb_set_network_header(skb, VLAN_ETH_HLEN);
	}

	if (skb->protocol == htons(ETH_P_IP)) {
		if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
		    + sizeof(struct iphdr) - skb->data)))
			skb->protocol = 0;
	} else if (skb->protocol == htons(ETH_P_IPV6)) {
		if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
		    + sizeof(struct ipv6hdr) - skb->data)))
			skb->protocol = 0;
	}

	old_iph = ip_hdr(skb);
	old_ipv6h = ipv6_hdr(skb);

	iph.tos = mutable->port_config.tos;
	if (mutable->port_config.flags & GRE_F_TOS_INHERIT) {
		if (skb->protocol == htons(ETH_P_IP))
			iph.tos = old_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph.tos = ipv6_get_dsfield(ipv6_hdr(skb));
	}
	iph.tos = ecn_encapsulate(iph.tos, skb);

	{
		struct flowi fl = { .nl_u = { .ip4_u =
				    { .daddr = mutable->port_config.daddr,
				      .saddr = mutable->port_config.saddr,
				      .tos = RT_TOS(iph.tos) } },
				    .proto = IPPROTO_GRE };

		if (ip_route_output_key(&init_net, &rt, &fl))
			goto error_free;
	}

	iph.ttl = mutable->port_config.ttl;
	if (mutable->port_config.flags & GRE_F_TTL_INHERIT) {
		if (skb->protocol == htons(ETH_P_IP))
			iph.ttl = old_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph.ttl = old_ipv6h->hop_limit;
	}
	if (!iph.ttl)
		iph.ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);

	iph.frag_off = (mutable->port_config.flags & GRE_F_PMTUD) ? htons(IP_DF) : 0;

	mtu = dst_mtu(&rt->u.dst)
		- ETH_HLEN
		- mutable->tunnel_hlen
		- (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);

	if (skb->protocol == htons(ETH_P_IP)) {
		iph.frag_off |= old_iph->frag_off & htons(IP_DF);
		mtu = max(mtu, IP_MIN_MTU);
	} else if (skb->protocol == htons(ETH_P_IPV6))
		mtu = max(mtu, IPV6_MIN_MTU);

	iph.version = 4;
	iph.ihl = sizeof(struct iphdr) >> 2;
	iph.protocol = IPPROTO_GRE;
	iph.daddr = rt->rt_dst;
	iph.saddr = rt->rt_src;

	/* Allow our local IP stack to fragment the outer packet even if the
	 * DF bit is set as a last resort. */
	skb->local_df = 1;

	skb_dst_set(skb, &rt->u.dst);
	skb->ip_summed = CHECKSUM_NONE;

	/* If we are doing GSO on a pskb it is better to make sure that the
	 * headroom is correct now.  We will only have to copy the portion in
	 * the linear data area and GSO will preserve headroom when it creates
	 * the segments.  This is particularly beneficial on Xen where we get
	 * lots of GSO pskbs.  Conversely, we delay copying if it is just to
	 * get our own writable clone because GSO may do the copy for us. */
	max_headroom = LL_RESERVED_SPACE(rt->u.dst.dev) + mutable->tunnel_hlen;
	if (skb_headroom(skb) < max_headroom) {
		skb = check_headroom(skb, max_headroom);
		if (unlikely(IS_ERR(skb))) {
			vport_record_error(vport, VPORT_E_TX_DROPPED);
			goto error;
		}
	}

	vswitch_skb_checksum_setup(skb);
	skb = handle_gso(skb);
	if (unlikely(IS_ERR(skb))) {
		vport_record_error(vport, VPORT_E_TX_DROPPED);
		goto error;
	}

	/* Process GSO segments.  Try to do any work on the entire packet that
	 * doesn't involve actually writing to it before this point. */
	do {
		struct sk_buff *next_skb = skb->next;
		skb->next = NULL;

		orig_len += build_packet(vport, mutable, &iph, rt, max_headroom, mtu, skb);

		skb = next_skb;
	} while (skb);

	return orig_len;

error_free:
	kfree_skb(skb);
	vport_record_error(vport, VPORT_E_TX_ERROR);
error:
	return 0;
}
static struct net_protocol gre_protocol_handlers = {
	.handler = gre_rcv,
	.err_handler = gre_err,
};

static int
gre_init(void)
{
	int err;

	err = inet_add_protocol(&gre_protocol_handlers, IPPROTO_GRE);
	if (err)
		printk(KERN_WARNING "openvswitch: cannot register gre protocol handler\n");

	return err;
}
static void
gre_exit(void)
{
	tbl_destroy(port_table, NULL);
	inet_del_protocol(&gre_protocol_handlers, IPPROTO_GRE);
}
static int
set_config(const struct vport *cur_vport, struct mutable_config *mutable,
	   const void __user *uconfig)
{
	const struct vport *old_vport;
	const struct mutable_config *old_mutable;
	int port_type;

	if (copy_from_user(&mutable->port_config, uconfig, sizeof(struct gre_port_config)))
		return -EFAULT;

	if (mutable->port_config.daddr == 0)
		return -EINVAL;

	if (mutable->port_config.flags & GRE_F_IN_KEY_MATCH) {
		port_type = FIND_PORT_MATCH;
		mutable->port_config.in_key = 0;
	} else
		port_type = FIND_PORT_KEY;

	old_vport = find_port(mutable->port_config.saddr,
			      mutable->port_config.daddr,
			      mutable->port_config.in_key, port_type,
			      &old_mutable);
	if (old_vport && old_vport != cur_vport)
		return -EEXIST;

	mutable->tunnel_hlen = sizeof(struct iphdr) + GRE_HEADER_SECTION;

	if (mutable->port_config.flags & GRE_F_OUT_CSUM)
		mutable->tunnel_hlen += GRE_HEADER_SECTION;

	if (mutable->port_config.out_key ||
	    mutable->port_config.flags & GRE_F_OUT_KEY_ACTION)
		mutable->tunnel_hlen += GRE_HEADER_SECTION;

	return 0;
}
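/* Worked example for set_config(): with GRE_F_OUT_CSUM set and a nonzero
 * out_key, tunnel_hlen = 20 (outer IPv4) + 4 (base GRE) + 4 (checksum)
 * + 4 (key) = 32 bytes of encapsulation overhead per packet. */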
static struct vport *
gre_create(const char *name, const void __user *config)
{
	struct vport *vport;
	struct gre_vport *gre_vport;
	int err;

	vport = vport_alloc(sizeof(struct gre_vport), &gre_vport_ops);
	if (IS_ERR(vport)) {
		err = PTR_ERR(vport);
		goto error;
	}

	gre_vport = gre_vport_priv(vport);

	strcpy(gre_vport->name, name);

	gre_vport->mutable = kmalloc(sizeof(struct mutable_config), GFP_KERNEL);
	if (!gre_vport->mutable) {
		err = -ENOMEM;
		goto error_free_vport;
	}

	vport_gen_ether_addr(gre_vport->mutable->eth_addr);
	gre_vport->mutable->mtu = ETH_DATA_LEN;

	err = set_config(NULL, gre_vport->mutable, config);
	if (err)
		goto error_free_mutable;

	err = add_port(vport);
	if (err)
		goto error_free_mutable;

	return vport;

error_free_mutable:
	kfree(gre_vport->mutable);
error_free_vport:
	vport_free(vport);
error:
	return ERR_PTR(err);
}
static int
gre_modify(struct vport *vport, const void __user *config)
{
	struct gre_vport *gre_vport = gre_vport_priv(vport);
	struct mutable_config *mutable;
	int err;
	int update_hash = 0;

	mutable = kmemdup(gre_vport->mutable, sizeof(struct mutable_config), GFP_KERNEL);
	if (!mutable) {
		err = -ENOMEM;
		goto error;
	}

	err = set_config(vport, mutable, config);
	if (err)
		goto error_free;

	/* Only remove the port from the hash table if something that would
	 * affect the lookup has changed. */
	if (gre_vport->mutable->port_config.saddr != mutable->port_config.saddr ||
	    gre_vport->mutable->port_config.daddr != mutable->port_config.daddr ||
	    gre_vport->mutable->port_config.in_key != mutable->port_config.in_key ||
	    (gre_vport->mutable->port_config.flags & GRE_F_IN_KEY_MATCH) !=
	    (mutable->port_config.flags & GRE_F_IN_KEY_MATCH))
		update_hash = 1;

	/* This update is not atomic but the lookup uses the config, which
	 * serves as an inherent double check. */
	if (update_hash) {
		err = del_port(vport);
		if (err)
			goto error_free;
	}

	assign_config_rcu(vport, mutable);

	if (update_hash) {
		err = add_port(vport);
		if (err)
			goto error;
	}

	return 0;

error_free:
	kfree(mutable);
error:
	return err;
}
static int
gre_destroy(struct vport *vport)
{
	struct gre_vport *gre_vport = gre_vport_priv(vport);
	int port_type;
	const struct mutable_config *old_mutable;

	/* Do a hash table lookup to make sure that the port exists.  It should
	 * exist but might not if a modify failed earlier. */
	if (gre_vport->mutable->port_config.flags & GRE_F_IN_KEY_MATCH)
		port_type = FIND_PORT_MATCH;
	else
		port_type = FIND_PORT_KEY;

	if (vport == find_port(gre_vport->mutable->port_config.saddr,
			       gre_vport->mutable->port_config.daddr,
			       gre_vport->mutable->port_config.in_key,
			       port_type, &old_mutable))
		del_port(vport);

	kfree(gre_vport->mutable);
	vport_free(vport);

	return 0;
}
static int
gre_set_mtu(struct vport *vport, int mtu)
{
	struct gre_vport *gre_vport = gre_vport_priv(vport);
	struct mutable_config *mutable;
	struct dp_port *dp_port;

	mutable = kmemdup(gre_vport->mutable, sizeof(struct mutable_config), GFP_KERNEL);
	if (!mutable)
		return -ENOMEM;

	mutable->mtu = mtu;
	assign_config_rcu(vport, mutable);

	dp_port = vport_get_dp_port(vport);
	if (dp_port)
		set_internal_devs_mtu(dp_port->dp);

	return 0;
}
static int
gre_set_addr(struct vport *vport, const unsigned char *addr)
{
	struct gre_vport *gre_vport = gre_vport_priv(vport);
	struct mutable_config *mutable;

	mutable = kmemdup(gre_vport->mutable, sizeof(struct mutable_config), GFP_KERNEL);
	if (!mutable)
		return -ENOMEM;

	memcpy(mutable->eth_addr, addr, ETH_ALEN);
	assign_config_rcu(vport, mutable);

	return 0;
}
static const char *
gre_get_name(const struct vport *vport)
{
	const struct gre_vport *gre_vport = gre_vport_priv(vport);
	return gre_vport->name;
}

static const unsigned char *
gre_get_addr(const struct vport *vport)
{
	const struct gre_vport *gre_vport = gre_vport_priv(vport);
	return rcu_dereference(gre_vport->mutable)->eth_addr;
}

static unsigned
gre_get_dev_flags(const struct vport *vport)
{
	return IFF_UP | IFF_RUNNING | IFF_LOWER_UP;
}

static int
gre_is_running(const struct vport *vport)
{
	return 1;
}

static unsigned char
gre_get_operstate(const struct vport *vport)
{
	return IF_OPER_UP;
}

static int
gre_get_mtu(const struct vport *vport)
{
	const struct gre_vport *gre_vport = gre_vport_priv(vport);
	return rcu_dereference(gre_vport->mutable)->mtu;
}
struct vport_ops gre_vport_ops = {
	.type = "gre",
	.flags = VPORT_F_GEN_STATS | VPORT_F_TUN_ID,
	.init = gre_init,
	.exit = gre_exit,
	.create = gre_create,
	.modify = gre_modify,
	.destroy = gre_destroy,
	.set_mtu = gre_set_mtu,
	.set_addr = gre_set_addr,
	.get_name = gre_get_name,
	.get_addr = gre_get_addr,
	.get_dev_flags = gre_get_dev_flags,
	.is_running = gre_is_running,
	.get_operstate = gre_get_operstate,
	.get_mtu = gre_get_mtu,
	.send = gre_send,
};