datapath: Fix tunnel source port selection for mega flow
datapath/tunnel.c
/*
 * Copyright (c) 2007-2012 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/in.h>
#include <linux/in_route.h>
#include <linux/inetdevice.h>
#include <linux/jhash.h>
#include <linux/list.h>
#include <linux/kernel.h>
#include <linux/version.h>
#include <linux/workqueue.h>
#include <linux/rculist.h>
#include <net/route.h>
#include <net/xfrm.h>

#include "checksum.h"
#include "compat.h"
#include "datapath.h"
#include "tunnel.h"
#include "vlan.h"
#include "vport.h"

/**
 *	ovs_tnl_rcv - ingress point for generic tunnel code
 *
 * @vport: port this packet was received on
 * @skb: received packet
 * @tun_key: tunnel metadata (outer addresses, ToS, TTL, flags) parsed from
 * the encapsulating IP header
 *
 * Must be called with rcu_read_lock.
 *
 * Packets received by this function are in the following state:
 * - skb->data points to the inner Ethernet header.
 * - The inner Ethernet header is in the linear data area.
 * - skb->csum does not include the inner Ethernet header.
 * - The layer pointers are undefined.
 */
void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb,
                 struct ovs_key_ipv4_tunnel *tun_key)
{
        struct ethhdr *eh;

        skb_reset_mac_header(skb);
        eh = eth_hdr(skb);

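        /* EtherTypes below ETH_P_802_3_MIN are 802.3 length fields rather
         * than protocol numbers, so such frames are handed up as raw
         * 802.2 LLC.
         */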
        if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
                skb->protocol = eh->h_proto;
        else
                skb->protocol = htons(ETH_P_802_2);

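        /* The dst, conntrack, rxhash, IPsec and VLAN state on the skb all
         * describe the outer, encapsulating packet; scrub them before the
         * inner frame is handed up the pipeline.
         */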
        skb_dst_drop(skb);
        nf_reset(skb);
        skb_clear_rxhash(skb);
        secpath_reset(skb);
        vlan_set_tci(skb, 0);

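        /* compute_ip_summed() folds the kernel's checksum state into OVS's
         * compat representation; a state it cannot represent is fatal for
         * this packet.
         */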
        if (unlikely(compute_ip_summed(skb, false))) {
                kfree_skb(skb);
                return;
        }

        ovs_vport_receive(vport, skb, tun_key);
}

struct rtable *find_route(struct net *net,
                          __be32 *saddr, __be32 daddr, u8 ipproto,
                          u8 tos, u32 skb_mark)
{
        struct rtable *rt;
        /* Tunnel configuration keeps the DSCP part of the TOS bits, but the
         * Linux routing stack expects only the RT_TOS() bits. */

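        /* Kernels before 2.6.39 route on the multi-protocol struct flowi
         * and return the route through an output argument; later kernels
         * take a struct flowi4 and return the rtable (or an ERR_PTR)
         * directly.
         */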
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
        struct flowi fl = { .nl_u = { .ip4_u = {
                                        .daddr = daddr,
                                        .saddr = *saddr,
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
                                        .fwmark = skb_mark,
#endif
                                        .tos   = RT_TOS(tos) } },
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
                                        .mark = skb_mark,
#endif
                                        .proto = ipproto };

        if (unlikely(ip_route_output_key(net, &rt, &fl)))
                return ERR_PTR(-EADDRNOTAVAIL);
        *saddr = fl.nl_u.ip4_u.saddr;
        return rt;
#else
        struct flowi4 fl = { .daddr = daddr,
                             .saddr = *saddr,
                             .flowi4_tos = RT_TOS(tos),
                             .flowi4_mark = skb_mark,
                             .flowi4_proto = ipproto };

        rt = ip_route_output_key(net, &fl);
        *saddr = fl.saddr;
        return rt;
#endif
}

static bool need_linearize(const struct sk_buff *skb)
{
        int i;

        if (unlikely(skb_shinfo(skb)->frag_list))
                return true;

        /*
         * Generally speaking we should linearize if there are paged frags.
         * However, if all of the refcounts are 1 we know nobody else can
         * change them from underneath us and we can skip the linearization.
         */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
                if (unlikely(page_count(skb_frag_page(&skb_shinfo(skb)->frags[i])) > 1))
                        return true;

        return false;
}

static struct sk_buff *handle_offloads(struct sk_buff *skb)
{
        int err;

        forward_ip_summed(skb, true);

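        /* A GSO packet cannot be encapsulated as-is: segment it in
         * software now so that each resulting frame gets its own tunnel
         * and outer IP header in ovs_tnl_send().
         */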
        if (skb_is_gso(skb)) {
                struct sk_buff *nskb;

                nskb = __skb_gso_segment(skb, 0, false);
                if (IS_ERR(nskb)) {
                        err = PTR_ERR(nskb);
                        goto error;
                }

                consume_skb(skb);
                skb = nskb;
        } else if (get_ip_summed(skb) == OVS_CSUM_PARTIAL) {
                /* Pages aren't locked and could change at any time.
                 * If this happens after we compute the checksum, the
                 * checksum will be wrong.  We linearize now to avoid
                 * this problem.
                 */
                if (unlikely(need_linearize(skb))) {
                        err = __skb_linearize(skb);
                        if (unlikely(err))
                                goto error;
                }

                err = skb_checksum_help(skb);
                if (unlikely(err))
                        goto error;
        }

        set_ip_summed(skb, OVS_CSUM_NONE);

        return skb;

error:
        return ERR_PTR(err);
}

/* Compute the source UDP port for an outgoing tunneled packet.
 * Currently we hash the extracted flow key, so all packets of one flow
 * share a source port while distinct flows spread across the local
 * ephemeral port range.
 */
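/* For example, a UDP-based tunnel implementation might set, per packet:
 *
 *	udp_hdr(skb)->source = htons(ovs_tnl_get_src_port(skb));
 *
 * giving one flow a stable source port that still varies between flows.
 */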
u16 ovs_tnl_get_src_port(struct sk_buff *skb)
{
        int low;
        int high;
        unsigned int range;
        struct sw_flow_key *pkt_key = OVS_CB(skb)->pkt_key;
        u32 hash = jhash2((const u32 *)pkt_key,
                          sizeof(*pkt_key) / sizeof(u32), 0);

        inet_get_local_port_range(&low, &high);
        range = (high - low) + 1;
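        /* Map the 32-bit hash onto [low, high] with a multiply-and-shift,
         * which spreads the hash evenly across the range and avoids a
         * division on the transmit path.
         */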
        return (((u64) hash * range) >> 32) + low;
}

int ovs_tnl_send(struct vport *vport, struct sk_buff *skb,
                 u8 ipproto, int tunnel_hlen,
                 void (*build_header)(const struct vport *,
                                      struct sk_buff *,
                                      int tunnel_hlen))
{
        int min_headroom;
        struct rtable *rt;
        __be32 saddr;
        int sent_len = 0;
        int err;
        struct sk_buff *nskb;

        /* Route lookup */
        saddr = OVS_CB(skb)->tun_key->ipv4_src;
        rt = find_route(ovs_dp_get_net(vport->dp),
                        &saddr,
                        OVS_CB(skb)->tun_key->ipv4_dst,
                        ipproto,
                        OVS_CB(skb)->tun_key->ipv4_tos,
                        skb_get_mark(skb));
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                goto error;
        }

        tunnel_hlen += sizeof(struct iphdr);

        min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
                        + tunnel_hlen
                        + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);

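        /* Make sure there is writable headroom for the outer headers;
         * SKB_DATA_ALIGN() plus a little slack makes a second expansion
         * further down the stack unlikely.
         */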
        if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
                int head_delta = SKB_DATA_ALIGN(min_headroom -
                                                skb_headroom(skb) +
                                                16);

                err = pskb_expand_head(skb, max_t(int, head_delta, 0),
                                        0, GFP_ATOMIC);
                if (unlikely(err))
                        goto err_free_rt;
        }

        /* Offloading */
        nskb = handle_offloads(skb);
        if (IS_ERR(nskb)) {
                err = PTR_ERR(nskb);
                goto err_free_rt;
        }
        skb = nskb;

        /* Reset SKB */
        nf_reset(skb);
        secpath_reset(skb);
        skb_dst_drop(skb);
        skb_clear_rxhash(skb);

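        /* handle_offloads() may have produced a list of GSO segments;
         * encapsulate and transmit each one individually.
         */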
        while (skb) {
                struct sk_buff *next_skb = skb->next;
                struct iphdr *iph;
                int frag_len;

                skb->next = NULL;

                if (unlikely(vlan_deaccel_tag(skb)))
                        goto next;

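                /* Record the length before the outer headers are pushed so
                 * the returned byte count covers only the inner frame.
                 */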
                frag_len = skb->len;
                skb_push(skb, tunnel_hlen);
                skb_reset_network_header(skb);
                skb_set_transport_header(skb, sizeof(struct iphdr));

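                /* Transmitting consumes one reference on the route per
                 * segment, so clone the dst for all but the last one.
                 */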
                if (next_skb)
                        skb_dst_set(skb, dst_clone(&rt_dst(rt)));
                else
                        skb_dst_set(skb, &rt_dst(rt));

                /* Push Tunnel header. */
                build_header(vport, skb, tunnel_hlen);

                /* Push IP header. */
                iph = ip_hdr(skb);
                iph->version    = 4;
                iph->ihl        = sizeof(struct iphdr) >> 2;
                iph->protocol   = ipproto;
                iph->daddr      = OVS_CB(skb)->tun_key->ipv4_dst;
                iph->saddr      = saddr;
                iph->tos        = OVS_CB(skb)->tun_key->ipv4_tos;
                iph->ttl        = OVS_CB(skb)->tun_key->ipv4_ttl;
                iph->frag_off   = OVS_CB(skb)->tun_key->tun_flags &
                                  TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;
                /*
                 * Allow our local IP stack to fragment the outer packet even
                 * if the DF bit is set as a last resort.  We also need to
                 * force selection of an IP ID here with __ip_select_ident(),
                 * as ip_select_ident() assumes a proper ID is not needed
                 * when the DF bit is set.
                 */
                skb->local_df = 1;
                __ip_select_ident(iph, skb_dst(skb), 0);

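                /* Zero the IP control block so leftover state from the
                 * inner packet cannot influence ip_local_out().
                 */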
                memset(IPCB(skb), 0, sizeof(*IPCB(skb)));

                err = ip_local_out(skb);
                if (unlikely(net_xmit_eval(err)))
                        goto next;

                sent_len += frag_len;

next:
                skb = next_skb;
        }

        return sent_len;

err_free_rt:
        ip_rt_put(rt);
error:
        return err;
}