Merge commit '259e0b1ad1bfea762a76f0098deb8f8d8db1dfa3'
[sliver-openvswitch.git] / datapath / tunnel.c
1 /*
2  * Copyright (c) 2007-2012 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/in.h>
22 #include <linux/in_route.h>
23 #include <linux/inetdevice.h>
24 #include <linux/jhash.h>
25 #include <linux/list.h>
26 #include <linux/kernel.h>
27 #include <linux/version.h>
28 #include <linux/workqueue.h>
29 #include <linux/rculist.h>
30 #include <net/route.h>
31 #include <net/xfrm.h>
32
33 #include "checksum.h"
34 #include "compat.h"
35 #include "datapath.h"
36 #include "tunnel.h"
37 #include "vlan.h"
38 #include "vport.h"
39
/**
 *	ovs_tnl_rcv - ingress point for generic tunnel code
 *
 * @vport: port this packet was received on
 * @skb: received packet
 * @tun_key: tunnel metadata (outer addresses, ToS, TTL, flags) extracted
 *	by the protocol-specific receive path; passed through to the
 *	datapath with the packet
 *
 * Must be called with rcu_read_lock.
 *
 * Packets received by this function are in the following state:
 * - skb->data points to the inner Ethernet header.
 * - The inner Ethernet header is in the linear data area.
 * - skb->csum does not include the inner Ethernet header.
 * - The layer pointers are undefined.
 */
void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb,
		 struct ovs_key_ipv4_tunnel *tun_key)
{
	struct ethhdr *eh;

	skb_reset_mac_header(skb);
	eh = eth_hdr(skb);

	/* Values below ETH_P_802_3_MIN are 802.3 length fields rather than
	 * EtherTypes; classify those frames as raw 802.2. */
	if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
		skb->protocol = eh->h_proto;
	else
		skb->protocol = htons(ETH_P_802_2);

	/* Scrub state inherited from the outer (encapsulating) packet
	 * before the inner frame re-enters the datapath. */
	skb_dst_drop(skb);
	nf_reset(skb);
	skb_clear_rxhash(skb);
	secpath_reset(skb);
	vlan_set_tci(skb, 0);

	if (unlikely(compute_ip_summed(skb, false))) {
		kfree_skb(skb);
		return;
	}

	ovs_vport_receive(vport, skb, tun_key);
}
81
/* Looks up the IPv4 route for a tunnel's outer header.
 *
 * @net: network namespace to perform the lookup in
 * @saddr: in/out outer source address; overwritten with the address the
 *	routing code selected
 * @daddr: outer destination address
 * @ipproto: IP protocol number of the tunnel
 * @tos: full ToS byte from the tunnel configuration (masked to the bits
 *	the router understands via RT_TOS below)
 * @skb_mark: skb mark, honored by policy routing
 *
 * Returns the route on success or an ERR_PTR() on failure.
 */
struct rtable *find_route(struct net *net,
			  __be32 *saddr, __be32 daddr, u8 ipproto,
			  u8 tos, u32 skb_mark)
{
	struct rtable *rt;
	/* Tunnel configuration keeps DSCP part of TOS bits, But Linux
	 * router expect RT_TOS bits only. */

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
	/* Pre-2.6.39 flow API: struct flowi with the nested nl_u/ip4_u
	 * union.  The routing mark moved from .fwmark to .mark in 2.6.20. */
	struct flowi fl = { .nl_u = { .ip4_u = {
					.daddr = daddr,
					.saddr = *saddr,
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
					.fwmark = skb_mark,
#endif
					.tos   = RT_TOS(tos) } },
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
					.mark = skb_mark,
#endif
					.proto = ipproto };

	if (unlikely(ip_route_output_key(net, &rt, &fl)))
		return ERR_PTR(-EADDRNOTAVAIL);
	*saddr = fl.nl_u.ip4_u.saddr;
	return rt;
#else
	/* 2.6.39+ flow API: flat struct flowi4; the lookup itself returns
	 * an ERR_PTR() on failure. */
	struct flowi4 fl = { .daddr = daddr,
			     .saddr = *saddr,
			     .flowi4_tos = RT_TOS(tos),
			     .flowi4_mark = skb_mark,
			     .flowi4_proto = ipproto };

	rt = ip_route_output_key(net, &fl);
	/* NOTE(review): *saddr is written back even when rt is an
	 * ERR_PTR(); callers must check IS_ERR(rt) before trusting it. */
	*saddr = fl.saddr;
	return rt;
#endif
}
119
120 static bool need_linearize(const struct sk_buff *skb)
121 {
122         int i;
123
124         if (unlikely(skb_shinfo(skb)->frag_list))
125                 return true;
126
127         /*
128          * Generally speaking we should linearize if there are paged frags.
129          * However, if all of the refcounts are 1 we know nobody else can
130          * change them from underneath us and we can skip the linearization.
131          */
132         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
133                 if (unlikely(page_count(skb_frag_page(&skb_shinfo(skb)->frags[i])) > 1))
134                         return true;
135
136         return false;
137 }
138
/* Prepares an skb for tunnel transmission: software-segments GSO packets
 * and completes any OVS_CSUM_PARTIAL checksum in software.
 *
 * Returns the ready skb — for the GSO case a list of segments chained via
 * skb->next — or an ERR_PTR() on failure.  None of the error paths here
 * free the skb passed in.
 */
static struct sk_buff *handle_offloads(struct sk_buff *skb)
{
	int err;

	forward_ip_summed(skb, true);

	if (skb_is_gso(skb)) {
		struct sk_buff *nskb;
		char cb[sizeof(skb->cb)];

		/* Save the control block before segmentation so it can be
		 * replicated onto every resulting segment. */
		memcpy(cb, skb->cb, sizeof(cb));

		nskb = __skb_gso_segment(skb, 0, false);
		if (IS_ERR(nskb)) {
			err = PTR_ERR(nskb);
			goto error;
		}

		/* The original skb is fully replaced by the segment list. */
		consume_skb(skb);
		skb = nskb;
		while (nskb) {
			memcpy(nskb->cb, cb, sizeof(cb));
			nskb = nskb->next;
		}
	} else if (get_ip_summed(skb) == OVS_CSUM_PARTIAL) {
		/* Pages aren't locked and could change at any time.
		 * If this happens after we compute the checksum, the
		 * checksum will be wrong.  We linearize now to avoid
		 * this problem.
		 */
		if (unlikely(need_linearize(skb))) {
			err = __skb_linearize(skb);
			if (unlikely(err))
				goto error;
		}

		err = skb_checksum_help(skb);
		if (unlikely(err))
			goto error;
	}

	set_ip_summed(skb, OVS_CSUM_NONE);

	return skb;

error:
	return ERR_PTR(err);
}
187
188 /* Compute source UDP port for outgoing packet.
189  * Currently we use the flow hash.
190  */
191 u16 ovs_tnl_get_src_port(struct sk_buff *skb)
192 {
193         int low;
194         int high;
195         unsigned int range;
196         struct sw_flow_key *pkt_key = OVS_CB(skb)->pkt_key;
197         u32 hash = jhash2((const u32 *)pkt_key,
198                           sizeof(*pkt_key) / sizeof(u32), 0);
199
200         inet_get_local_port_range(&low, &high);
201         range = (high - low) + 1;
202         return (((u64) hash * range) >> 32) + low;
203 }
204
/**
 *	ovs_tnl_send - egress point for generic tunnel code
 *
 * @vport: vport the packet is being sent from
 * @skb: packet to encapsulate; OVS_CB(skb)->tun_key supplies the outer
 *	header parameters (addresses, ToS, TTL, flags)
 * @ipproto: protocol number placed in the outer IPv4 header
 * @tunnel_hlen: length of the protocol-specific tunnel header (the outer
 *	IP header is added on top of it here)
 * @build_header: callback that writes the tunnel header between the outer
 *	IP header and the inner frame
 *
 * Returns the number of inner-packet bytes handed to the IP stack
 * (possibly 0), or a negative errno if the packet could not be prepared.
 */
int ovs_tnl_send(struct vport *vport, struct sk_buff *skb,
		 u8 ipproto, int tunnel_hlen,
		 void (*build_header)(const struct vport *,
				      struct sk_buff *,
				      int tunnel_hlen))
{
	int min_headroom;
	struct rtable *rt;
	__be32 saddr;
	int sent_len = 0;
	int err;
	struct sk_buff *nskb;

	/* Route lookup; find_route() may rewrite saddr with the source
	 * address the routing code selected. */
	saddr = OVS_CB(skb)->tun_key->ipv4_src;
	rt = find_route(ovs_dp_get_net(vport->dp),
			&saddr,
			OVS_CB(skb)->tun_key->ipv4_dst,
			ipproto,
			OVS_CB(skb)->tun_key->ipv4_tos,
			skb_get_mark(skb));
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		goto error;
	}

	tunnel_hlen += sizeof(struct iphdr);

	/* Room needed in front of the inner frame: link layer, outer IP +
	 * tunnel headers, and a VLAN tag if one is currently offloaded. */
	min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
			+ tunnel_hlen
			+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);

	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
		/* 16 extra bytes of slack — presumably to avoid repeated
		 * reallocations on later encapsulation; TODO confirm. */
		int head_delta = SKB_DATA_ALIGN(min_headroom -
						skb_headroom(skb) +
						16);

		err = pskb_expand_head(skb, max_t(int, head_delta, 0),
					0, GFP_ATOMIC);
		if (unlikely(err))
			goto err_free_rt;
	}

	/* Offloading: a GSO packet comes back as a list of segments
	 * chained through skb->next. */
	nskb = handle_offloads(skb);
	if (IS_ERR(nskb)) {
		err = PTR_ERR(nskb);
		goto err_free_rt;
	}
	skb = nskb;

	/* Reset SKB state inherited from the inner packet's path. */
	nf_reset(skb);
	secpath_reset(skb);
	skb_dst_drop(skb);
	skb_clear_rxhash(skb);

	/* Encapsulate and transmit each segment independently. */
	while (skb) {
		struct sk_buff *next_skb = skb->next;
		struct iphdr *iph;
		int frag_len;

		skb->next = NULL;

		/* Push any offloaded VLAN tag into the packet data. */
		if (unlikely(vlan_deaccel_tag(skb)))
			goto next;

		frag_len = skb->len;
		skb_push(skb, tunnel_hlen);
		skb_reset_network_header(skb);
		skb_set_transport_header(skb, sizeof(struct iphdr));

		/* Every segment needs its own dst reference; the final one
		 * takes over the reference obtained by find_route(). */
		if (next_skb)
			skb_dst_set(skb, dst_clone(&rt_dst(rt)));
		else
			skb_dst_set(skb, &rt_dst(rt));

		/* Push Tunnel header. */
		build_header(vport, skb, tunnel_hlen);

		/* Push IP header. */
		iph = ip_hdr(skb);
		iph->version	= 4;
		iph->ihl	= sizeof(struct iphdr) >> 2;
		iph->protocol	= ipproto;
		iph->daddr	= OVS_CB(skb)->tun_key->ipv4_dst;
		iph->saddr	= saddr;
		iph->tos	= OVS_CB(skb)->tun_key->ipv4_tos;
		iph->ttl	= OVS_CB(skb)->tun_key->ipv4_ttl;
		iph->frag_off	= OVS_CB(skb)->tun_key->tun_flags &
				  TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;
		/*
		 * Allow our local IP stack to fragment the outer packet even
		 * if the DF bit is set as a last resort.  We also need to
		 * force selection of an IP ID here with __ip_select_ident(),
		 * as ip_select_ident() assumes a proper ID is not needed
		 * when the DF bit is set.
		 */
		skb->local_df = 1;
		__ip_select_ident(iph, skb_dst(skb), 0);

		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));

		/* A rejected transmit only skips this segment's byte count;
		 * remaining segments are still attempted. */
		err = ip_local_out(skb);
		if (unlikely(net_xmit_eval(err)))
			goto next;

		sent_len += frag_len;

next:
		skb = next_skb;
	}

	return sent_len;

err_free_rt:
	ip_rt_put(rt);
error:
	return err;
}