/*
 * Copyright (c) 2007-2012 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/in.h>
#include <linux/in_route.h>
#include <linux/inetdevice.h>
#include <linux/jhash.h>
#include <linux/list.h>
#include <linux/kernel.h>
#include <linux/version.h>
#include <linux/workqueue.h>
#include <linux/rculist.h>
#include <net/route.h>
#include <net/xfrm.h>

#include "checksum.h"
#include "compat.h"
#include "datapath.h"
#include "tunnel.h"
#include "vlan.h"
#include "vport.h"

/**
 *      ovs_tnl_rcv - ingress point for generic tunnel code
 *
 * @vport: port this packet was received on
 * @skb: received packet
 * @tun_key: tunnel metadata extracted from the encapsulating IP header
 *
 * Must be called with rcu_read_lock.
 *
 * Packets received by this function are in the following state:
 * - skb->data points to the inner Ethernet header.
 * - The inner Ethernet header is in the linear data area.
 * - skb->csum does not include the inner Ethernet header.
 * - The layer pointers are undefined.
 */
void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb,
                 struct ovs_key_ipv4_tunnel *tun_key)
{
        struct ethhdr *eh;

        skb_reset_mac_header(skb);
        eh = eth_hdr(skb);

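        /* Values below ETH_P_802_3_MIN in this field are 802.3 length
         * fields rather than EtherTypes, so such frames carry no usable
         * protocol and are treated as raw 802.2 LLC.
         */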
        if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
                skb->protocol = eh->h_proto;
        else
                skb->protocol = htons(ETH_P_802_2);

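        /* Drop state inherited from the outer, encapsulating packet
         * (route, conntrack, rxhash, IPsec path, VLAN tag) before the
         * inner frame is handed to the datapath.
         */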
        skb_dst_drop(skb);
        nf_reset(skb);
        skb_clear_rxhash(skb);
        secpath_reset(skb);
        vlan_set_tci(skb, 0);

        if (unlikely(compute_ip_summed(skb, false))) {
                kfree_skb(skb);
                return;
        }

        ovs_vport_receive(vport, skb, tun_key);
}

static bool need_linearize(const struct sk_buff *skb)
{
        int i;

        if (unlikely(skb_shinfo(skb)->frag_list))
                return true;

        /*
         * Generally speaking we should linearize if there are paged frags.
         * However, if all of the refcounts are 1 we know nobody else can
         * change them from underneath us and we can skip the linearization.
         */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
                if (unlikely(page_count(skb_frag_page(&skb_shinfo(skb)->frags[i])) > 1))
                        return true;

        return false;
}

static struct sk_buff *handle_offloads(struct sk_buff *skb)
{
        int err;

        forward_ip_summed(skb, true);

        if (skb_is_gso(skb)) {
                struct sk_buff *nskb;
                char cb[sizeof(skb->cb)];

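                /* Software GSO below can overwrite skb->cb, which holds the
                 * OVS per-packet control block; save a copy here and restore
                 * it into every resulting segment.
                 */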
                memcpy(cb, skb->cb, sizeof(cb));

                nskb = __skb_gso_segment(skb, 0, false);
                if (IS_ERR(nskb)) {
                        err = PTR_ERR(nskb);
                        goto error;
                }

                consume_skb(skb);
                skb = nskb;
                while (nskb) {
                        memcpy(nskb->cb, cb, sizeof(cb));
                        nskb = nskb->next;
                }
        } else if (get_ip_summed(skb) == OVS_CSUM_PARTIAL) {
                /* Pages aren't locked and could change at any time.
                 * If this happens after we compute the checksum, the
                 * checksum will be wrong.  We linearize now to avoid
                 * this problem.
                 */
                if (unlikely(need_linearize(skb))) {
                        err = __skb_linearize(skb);
                        if (unlikely(err))
                                goto error;
                }

                err = skb_checksum_help(skb);
                if (unlikely(err))
                        goto error;
        }

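        /* Any checksum work the packet needed has been completed in software
         * above, so the offload state can be cleared before encapsulation.
         */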
        set_ip_summed(skb, OVS_CSUM_NONE);

        return skb;

error:
        return ERR_PTR(err);
}

/* Compute source UDP port for outgoing packet.
 * Currently we use the flow hash.
 */
u16 ovs_tnl_get_src_port(struct sk_buff *skb)
{
        int low;
        int high;
        unsigned int range;
        struct sw_flow_key *pkt_key = OVS_CB(skb)->pkt_key;
        u32 hash = jhash2((const u32 *)pkt_key,
                          sizeof(*pkt_key) / sizeof(u32), 0);

        inet_get_local_port_range(&low, &high);
        range = (high - low) + 1;
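        /* Scale the 32-bit hash onto [low, high] without a modulo:
         * ((u64)hash * range) >> 32 is uniform over 0..range-1.
         */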
        return (((u64) hash * range) >> 32) + low;
}

int ovs_tnl_send(struct vport *vport, struct sk_buff *skb,
                 u8 ipproto, int tunnel_hlen,
                 void (*build_header)(const struct vport *,
                                      struct sk_buff *,
                                      int tunnel_hlen))
{
        int min_headroom;
        struct rtable *rt;
        __be32 saddr;
        int sent_len = 0;
        int err;
        struct sk_buff *nskb;

        /* Route lookup */
        saddr = OVS_CB(skb)->tun_key->ipv4_src;
        rt = find_route(ovs_dp_get_net(vport->dp),
                        &saddr,
                        OVS_CB(skb)->tun_key->ipv4_dst,
                        ipproto,
                        OVS_CB(skb)->tun_key->ipv4_tos,
                        skb_get_mark(skb));
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                goto error;
        }

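        /* tunnel_hlen as passed in covers only the tunnel protocol header;
         * the outer IPv4 header pushed below is added on top of it.
         */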
        tunnel_hlen += sizeof(struct iphdr);

        min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
                        + tunnel_hlen
                        + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);

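        /* Make sure there is room in front of the data for the headers that
         * will be pushed (link layer, outer IP, tunnel), expanding and
         * un-sharing the header if needed.
         */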
        if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
                int head_delta = SKB_DATA_ALIGN(min_headroom -
                                                skb_headroom(skb) +
                                                16);

                err = pskb_expand_head(skb, max_t(int, head_delta, 0),
                                        0, GFP_ATOMIC);
                if (unlikely(err))
                        goto err_free_rt;
        }

        /* Offloading */
        nskb = handle_offloads(skb);
        if (IS_ERR(nskb)) {
                err = PTR_ERR(nskb);
                goto err_free_rt;
        }
        skb = nskb;

        /* Reset SKB */
        nf_reset(skb);
        secpath_reset(skb);
        skb_dst_drop(skb);
        skb_clear_rxhash(skb);

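        /* handle_offloads() may have turned the packet into a list of GSO
         * segments chained through skb->next; encapsulate and transmit each
         * segment individually.
         */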
        while (skb) {
                struct sk_buff *next_skb = skb->next;
                struct iphdr *iph;
                int frag_len;

                skb->next = NULL;

                if (unlikely(vlan_deaccel_tag(skb)))
                        goto next;

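                /* Record the inner frame length before pushing headers; the
                 * sum of these is what gets reported back as bytes sent.
                 */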
                frag_len = skb->len;
                skb_push(skb, tunnel_hlen);
                skb_reset_network_header(skb);
                skb_set_transport_header(skb, sizeof(struct iphdr));

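                /* Each transmitted segment consumes one reference to the
                 * route: clone it for all but the last segment, which takes
                 * over the reference returned by find_route().
                 */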
                if (next_skb)
                        skb_dst_set(skb, dst_clone(&rt_dst(rt)));
                else
                        skb_dst_set(skb, &rt_dst(rt));

                /* Push Tunnel header. */
                build_header(vport, skb, tunnel_hlen);

                /* Push IP header. */
                iph = ip_hdr(skb);
                iph->version    = 4;
                iph->ihl        = sizeof(struct iphdr) >> 2;
                iph->protocol   = ipproto;
                iph->daddr      = OVS_CB(skb)->tun_key->ipv4_dst;
                iph->saddr      = saddr;
                iph->tos        = OVS_CB(skb)->tun_key->ipv4_tos;
                iph->ttl        = OVS_CB(skb)->tun_key->ipv4_ttl;
                iph->frag_off   = OVS_CB(skb)->tun_key->tun_flags &
                                  TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
                /*
                 * Allow our local IP stack to fragment the outer packet even
                 * if the DF bit is set as a last resort.  We also need to
                 * force selection of an IP ID here with __ip_select_ident(),
                 * as ip_select_ident() assumes a proper ID is not needed
                 * when the DF bit is set.
                 */
                skb->local_df = 1;
                __ip_select_ident(iph, skb_dst(skb), 0);

                memset(IPCB(skb), 0, sizeof(*IPCB(skb)));

                err = ip_local_out(skb);
                if (unlikely(net_xmit_eval(err)))
                        goto next;

                sent_len += frag_len;

next:
                skb = next_skb;
        }

        return sent_len;

err_free_rt:
        ip_rt_put(rt);
error:
        return err;
}