datapath/tunnel.c
/*
 * Copyright (c) 2007-2012 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/in.h>
#include <linux/in_route.h>
#include <linux/inetdevice.h>
#include <linux/jhash.h>
#include <linux/list.h>
#include <linux/kernel.h>
#include <linux/version.h>
#include <linux/workqueue.h>
#include <linux/rculist.h>
#include <net/route.h>
#include <net/xfrm.h>

#include "checksum.h"
#include "compat.h"
#include "datapath.h"
#include "tunnel.h"
#include "vlan.h"
#include "vport.h"

/**
 *	ovs_tnl_rcv - ingress point for generic tunnel code
 *
 * @vport: port this packet was received on
 * @skb: received packet
 * @tun_key: tunnel metadata extracted from the encapsulating IP packet
 *
 * Must be called with rcu_read_lock.
 *
 * Packets received by this function are in the following state:
 * - skb->data points to the inner Ethernet header.
 * - The inner Ethernet header is in the linear data area.
 * - skb->csum does not include the inner Ethernet header.
 * - The layer pointers are undefined.
 */
void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb,
                 struct ovs_key_ipv4_tunnel *tun_key)
{
        struct ethhdr *eh;

        skb_reset_mac_header(skb);
        eh = eth_hdr(skb);

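        /* Ethertype values of at least ETH_P_802_3_MIN (0x0600) name the
         * inner protocol directly; smaller values are 802.3 length fields,
         * so fall back to 802.2 for those frames. */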
        if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
                skb->protocol = eh->h_proto;
        else
                skb->protocol = htons(ETH_P_802_2);

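        /* Scrub state inherited from the outer, encapsulating packet: the
         * route, netfilter and IPsec state, rxhash and VLAN tag all belong
         * to the outer headers and must not leak into inner processing. */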
        skb_dst_drop(skb);
        nf_reset(skb);
        skb_clear_rxhash(skb);
        secpath_reset(skb);
        vlan_set_tci(skb, 0);

        if (unlikely(compute_ip_summed(skb, false))) {
                kfree_skb(skb);
                return;
        }

        ovs_vport_receive(vport, skb, tun_key);
}
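
/* Illustrative only: a minimal sketch of how a protocol-specific receive
 * handler might hand a decapsulated frame to ovs_tnl_rcv().  The helper
 * name parse_outer_header() and the constant TUNNEL_HDR_LEN are
 * hypothetical and not part of this file; the point is that the caller
 * must leave skb->data at the inner Ethernet header, as documented above.
 *
 *	static void example_proto_rcv(struct vport *vport, struct sk_buff *skb)
 *	{
 *		struct ovs_key_ipv4_tunnel tun_key;
 *
 *		parse_outer_header(skb, &tun_key);
 *		__skb_pull(skb, TUNNEL_HDR_LEN);
 *		ovs_tnl_rcv(vport, skb, &tun_key);
 *	}
 */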

struct rtable *find_route(struct net *net,
                          __be32 *saddr, __be32 daddr, u8 ipproto,
                          u8 tos, u32 skb_mark)
{
        struct rtable *rt;
        /* The tunnel configuration keeps the full DSCP part of the ToS
         * bits, but the Linux routing code expects only the RT_TOS() bits. */

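        /* The flow lookup API changed in 2.6.39: older kernels fill a
         * struct flowi and return the route through an output argument of
         * ip_route_output_key(), newer ones take a struct flowi4 and return
         * the rtable directly.  The skb mark also moved between fields
         * (fwmark before 2.6.20, mark afterwards). */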
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
        struct flowi fl = { .nl_u = { .ip4_u = {
                                        .daddr = daddr,
                                        .saddr = *saddr,
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
                                        .fwmark = skb_mark,
#endif
                                        .tos   = RT_TOS(tos) } },
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
                                        .mark = skb_mark,
#endif
                                        .proto = ipproto };

        if (unlikely(ip_route_output_key(net, &rt, &fl)))
                return ERR_PTR(-EADDRNOTAVAIL);
        *saddr = fl.nl_u.ip4_u.saddr;
        return rt;
#else
        struct flowi4 fl = { .daddr = daddr,
                             .saddr = *saddr,
                             .flowi4_tos = RT_TOS(tos),
                             .flowi4_mark = skb_mark,
                             .flowi4_proto = ipproto };

        rt = ip_route_output_key(net, &fl);
        *saddr = fl.saddr;
        return rt;
#endif
}

static bool need_linearize(const struct sk_buff *skb)
{
        int i;

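        /* Data spread across a frag_list chain is always flattened before
         * the caller computes the checksum. */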
        if (unlikely(skb_shinfo(skb)->frag_list))
                return true;

        /*
         * Generally speaking we should linearize if there are paged frags.
         * However, if all of the refcounts are 1 we know nobody else can
         * change them from underneath us and we can skip the linearization.
         */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
                if (unlikely(page_count(skb_frag_page(&skb_shinfo(skb)->frags[i])) > 1))
                        return true;

        return false;
}

static struct sk_buff *handle_offloads(struct sk_buff *skb)
{
        int err;

        forward_ip_summed(skb, true);

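        /* GSO packets are segmented in software here: once the tunnel and
         * outer IP headers are pushed, the NIC can no longer segment the
         * inner payload itself.  __skb_gso_segment() returns the segments
         * chained through skb->next; the caller encapsulates and transmits
         * them one by one. */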
        if (skb_is_gso(skb)) {
                struct sk_buff *nskb;

                nskb = __skb_gso_segment(skb, 0, false);
                if (IS_ERR(nskb)) {
                        err = PTR_ERR(nskb);
                        goto error;
                }

                consume_skb(skb);
                skb = nskb;
        } else if (get_ip_summed(skb) == OVS_CSUM_PARTIAL) {
                /* Pages aren't locked and could change at any time.
                 * If this happens after we compute the checksum, the
                 * checksum will be wrong.  We linearize now to avoid
                 * this problem.
                 */
                if (unlikely(need_linearize(skb))) {
                        err = __skb_linearize(skb);
                        if (unlikely(err))
                                goto error;
                }

                err = skb_checksum_help(skb);
                if (unlikely(err))
                        goto error;
        }

        set_ip_summed(skb, OVS_CSUM_NONE);

        return skb;

error:
        return ERR_PTR(err);
}

/* Compute source UDP port for outgoing packet.
 * Currently we use the flow hash.
 */
u16 ovs_tnl_get_src_port(struct sk_buff *skb)
{
        int low;
        int high;
        unsigned int range;
        u32 hash = OVS_CB(skb)->flow->hash;

        inet_get_local_port_range(&low, &high);
        range = (high - low) + 1;
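        /* Scale the 32-bit flow hash into [low, high]: the 64-bit product
         * hash * range divided by 2^32 (the >> 32) lands in [0, range),
         * which is then offset into the local port range. */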
        return (((u64) hash * range) >> 32) + low;
}

int ovs_tnl_send(struct vport *vport, struct sk_buff *skb,
                 u8 ipproto, int tunnel_hlen,
                 void (*build_header)(const struct vport *,
                                      struct sk_buff *,
                                      int tunnel_hlen))
{
        int min_headroom;
        struct rtable *rt;
        __be32 saddr;
        int sent_len = 0;
        int err;
        struct sk_buff *nskb;

        /* Route lookup */
        saddr = OVS_CB(skb)->tun_key->ipv4_src;
        rt = find_route(ovs_dp_get_net(vport->dp),
                        &saddr,
                        OVS_CB(skb)->tun_key->ipv4_dst,
                        ipproto,
                        OVS_CB(skb)->tun_key->ipv4_tos,
                        skb_get_mark(skb));
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                goto error;
        }

        tunnel_hlen += sizeof(struct iphdr);

        min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
                        + tunnel_hlen
                        + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);

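        /* Reallocate the skb head if it lacks room for the outer headers,
         * or if the header area is shared with a clone and cannot be
         * written in place.  head_delta is rounded up with SKB_DATA_ALIGN()
         * and padded with 16 bytes of slack. */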
        if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
                int head_delta = SKB_DATA_ALIGN(min_headroom -
                                                skb_headroom(skb) +
                                                16);

                err = pskb_expand_head(skb, max_t(int, head_delta, 0),
                                        0, GFP_ATOMIC);
                if (unlikely(err))
                        goto err_free_rt;
        }

        /* Offloading */
        nskb = handle_offloads(skb);
        if (IS_ERR(nskb)) {
                err = PTR_ERR(nskb);
                goto err_free_rt;
        }
        skb = nskb;

        /* Reset SKB */
        nf_reset(skb);
        secpath_reset(skb);
        skb_dst_drop(skb);
        skb_clear_rxhash(skb);

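        /* handle_offloads() may have turned a GSO skb into a chain of
         * segments linked through skb->next.  Encapsulate and transmit each
         * segment separately, counting only successfully transmitted inner
         * bytes towards sent_len. */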
        while (skb) {
                struct sk_buff *next_skb = skb->next;
                struct iphdr *iph;
                int frag_len;

                skb->next = NULL;

                if (unlikely(vlan_deaccel_tag(skb)))
                        goto next;

                frag_len = skb->len;
                skb_push(skb, tunnel_hlen);
                skb_reset_network_header(skb);
                skb_set_transport_header(skb, sizeof(struct iphdr));

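                /* Each segment needs its own reference on the route; the
                 * last one consumes the reference taken by find_route(), so
                 * the rtable must not be released again on success. */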
                if (next_skb)
                        skb_dst_set(skb, dst_clone(&rt_dst(rt)));
                else
                        skb_dst_set(skb, &rt_dst(rt));

                /* Push Tunnel header. */
                build_header(vport, skb, tunnel_hlen);

                /* Push IP header. */
                iph = ip_hdr(skb);
                iph->version    = 4;
                iph->ihl        = sizeof(struct iphdr) >> 2;
                iph->protocol   = ipproto;
                iph->daddr      = OVS_CB(skb)->tun_key->ipv4_dst;
                iph->saddr      = saddr;
                iph->tos        = OVS_CB(skb)->tun_key->ipv4_tos;
                iph->ttl        = OVS_CB(skb)->tun_key->ipv4_ttl;
                iph->frag_off   = OVS_CB(skb)->tun_key->tun_flags &
                                  TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;
                /*
                 * Allow our local IP stack to fragment the outer packet even
                 * if the DF bit is set as a last resort.  We also need to
                 * force selection of an IP ID here with __ip_select_ident(),
                 * as ip_select_ident() assumes a proper ID is not needed
                 * when the DF bit is set.
                 */
                skb->local_df = 1;
                __ip_select_ident(iph, skb_dst(skb), 0);

                memset(IPCB(skb), 0, sizeof(*IPCB(skb)));

                err = ip_local_out(skb);
                if (unlikely(net_xmit_eval(err)))
                        goto next;

                sent_len += frag_len;

next:
                skb = next_skb;
        }

        return sent_len;

err_free_rt:
        ip_rt_put(rt);
error:
        return err;
}