datapath: Remove redundant checks on SKBs.
[sliver-openvswitch.git] / datapath / actions.c
/*
 * Distributed under the terms of the GNU GPL version 2.
 * Copyright (c) 2007, 2008, 2009, 2010 Nicira Networks.
 *
 * Significant portions of this file may be copied from parts of the Linux
 * kernel, by Linus Torvalds and others.
 */

/* Functions for executing flow actions. */

#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/in6.h>
#include <linux/if_vlan.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#include <net/checksum.h>

#include "actions.h"
#include "datapath.h"
#include "openvswitch/datapath-protocol.h"
#include "vport.h"

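/* Returns a writable version of 'skb': if 'skb' is cloned, replaces it with an
 * unshared copy (reserving at least 'min_headroom' bytes of headroom);
 * otherwise just makes sure the headers are in the linear data area.  Frees
 * 'skb' and returns NULL on failure. */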
static struct sk_buff *make_writable(struct sk_buff *skb, unsigned min_headroom, gfp_t gfp)
{
        if (skb_cloned(skb)) {
                struct sk_buff *nskb;
                unsigned headroom = max(min_headroom, skb_headroom(skb));

                nskb = skb_copy_expand(skb, headroom, skb_tailroom(skb), gfp);
                if (nskb) {
                        set_skb_csum_bits(skb, nskb);
                        kfree_skb(skb);
                        return nskb;
                }
        } else {
                unsigned int hdr_len = (skb_transport_offset(skb)
                                        + sizeof(struct tcphdr));
                if (pskb_may_pull(skb, min(hdr_len, skb->len)))
                        return skb;
        }
        kfree_skb(skb);
        return NULL;
}

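/* Removes the outermost 802.1Q header from 'skb', if there is one, adjusting
 * skb->protocol, the MAC header offset, and any OVS_CSUM_COMPLETE checksum to
 * match.  'skb' must already be writable. */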
static struct sk_buff *vlan_pull_tag(struct sk_buff *skb)
{
        struct vlan_ethhdr *vh = vlan_eth_hdr(skb);
        struct ethhdr *eh;

        /* Verify we were given a vlan packet */
        if (vh->h_vlan_proto != htons(ETH_P_8021Q))
                return skb;

        if (OVS_CB(skb)->ip_summed == OVS_CSUM_COMPLETE)
                skb->csum = csum_sub(skb->csum, csum_partial(skb->data
                                        + ETH_HLEN, VLAN_HLEN, 0));

        memmove(skb->data + VLAN_HLEN, skb->data, 2 * VLAN_ETH_ALEN);

        eh = (struct ethhdr *)skb_pull(skb, VLAN_HLEN);

        skb->protocol = eh->h_proto;
        skb->mac_header += VLAN_HLEN;

        return skb;
}

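/* Implements the ODPAT_SET_VLAN_VID and ODPAT_SET_VLAN_PCP actions: rewrites
 * the TCI of an existing 802.1Q header or, if 'skb' is untagged, inserts a new
 * one.  For GSO packets the skb is segmented first, and the remaining actions
 * ('a' + 1 onward) are executed here for all but the last segment.  Returns
 * the (possibly replaced) skb on success or an ERR_PTR() on error; consumes
 * 'skb' on error. */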
static struct sk_buff *modify_vlan_tci(struct datapath *dp, struct sk_buff *skb,
                                       const struct odp_flow_key *key,
                                       const union odp_action *a, int n_actions,
                                       gfp_t gfp)
{
        u16 tci, mask;

        if (a->type == ODPAT_SET_VLAN_VID) {
                tci = ntohs(a->vlan_vid.vlan_vid);
                mask = VLAN_VID_MASK;
        } else {
                tci = a->vlan_pcp.vlan_pcp << VLAN_PCP_SHIFT;
                mask = VLAN_PCP_MASK;
        }

        skb = make_writable(skb, VLAN_HLEN, gfp);
        if (!skb)
                return ERR_PTR(-ENOMEM);

        if (skb->protocol == htons(ETH_P_8021Q)) {
                /* Modify the VID or PCP bits, but maintain the other TCI values. */
                struct vlan_ethhdr *vh = vlan_eth_hdr(skb);
                __be16 old_tci = vh->h_vlan_TCI;

                vh->h_vlan_TCI = htons((ntohs(vh->h_vlan_TCI) & ~mask) | tci);

                if (OVS_CB(skb)->ip_summed == OVS_CSUM_COMPLETE) {
                        __be16 diff[] = { ~old_tci, vh->h_vlan_TCI };

                        skb->csum = ~csum_partial((char *)diff, sizeof(diff),
                                                  ~skb->csum);
                }
        } else {
                int err;

                /* Add vlan header */

                /* Set up checksumming pointers for checksum-deferred packets
                 * on Xen.  Otherwise, dev_queue_xmit() will try to do this
                 * when we send the packet out on the wire, and it will fail at
                 * that point because skb_checksum_setup() will not look inside
                 * an 802.1Q header. */
                err = vswitch_skb_checksum_setup(skb);
                if (unlikely(err)) {
                        kfree_skb(skb);
                        return ERR_PTR(err);
                }

                /* GSO is not implemented for packets with an 802.1Q header, so
                 * we have to do segmentation before we add that header.
                 *
                 * GSO does work with hardware-accelerated VLAN tagging, but we
                 * can't use hardware-accelerated VLAN tagging since it
                 * requires the device to have a VLAN group configured (with
                 * e.g. vconfig(8)) and we don't do that.
                 *
                 * Having to do this here may be a performance loss, since we
                 * can't take advantage of TSO hardware support, although it
                 * does not make a measurable network performance difference
                 * for 1G Ethernet.  Fixing that would require patching the
                 * kernel (either to add GSO support to the VLAN protocol or to
                 * support hardware-accelerated VLAN tagging without VLAN
                 * groups configured). */
                if (skb_is_gso(skb)) {
                        struct sk_buff *segs;

                        segs = skb_gso_segment(skb, 0);
                        kfree_skb(skb);
                        if (unlikely(IS_ERR(segs)))
                                return ERR_CAST(segs);

                        do {
                                struct sk_buff *nskb = segs->next;
                                int err;

                                segs->next = NULL;

                                /* GSO can change the checksum type, so update it. */
                                compute_ip_summed(segs, true);

                                segs = __vlan_put_tag(segs, tci);
                                err = -ENOMEM;
                                if (segs) {
                                        err = execute_actions(dp, segs,
                                                              key, a + 1,
                                                              n_actions - 1,
                                                              gfp);
                                }

                                if (unlikely(err)) {
                                        while ((segs = nskb)) {
                                                nskb = segs->next;
                                                segs->next = NULL;
                                                kfree_skb(segs);
                                        }
                                        return ERR_PTR(err);
                                }

                                segs = nskb;
                        } while (segs->next);

                        skb = segs;
                        compute_ip_summed(skb, true);
                }

                /* The hardware-accelerated version of vlan_put_tag() works
                 * only for a device that has a VLAN group configured (with
                 * e.g. vconfig(8)), so call the software-only version
                 * __vlan_put_tag() directly instead.
                 */
                skb = __vlan_put_tag(skb, tci);
                if (!skb)
                        return ERR_PTR(-ENOMEM);

                /* GSO doesn't fix up the hardware computed checksum so this
                 * will only be hit in the non-GSO case. */
                if (OVS_CB(skb)->ip_summed == OVS_CSUM_COMPLETE)
                        skb->csum = csum_add(skb->csum, csum_partial(skb->data
                                                + ETH_HLEN, VLAN_HLEN, 0));
        }

        return skb;
}

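/* Implements the ODPAT_STRIP_VLAN action: removes the 802.1Q header, if any,
 * from 'skb'.  Returns the skb, or NULL (consuming 'skb') if it could not be
 * made writable. */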
static struct sk_buff *strip_vlan(struct sk_buff *skb, gfp_t gfp)
{
        skb = make_writable(skb, 0, gfp);
        if (skb)
                vlan_pull_tag(skb);

        return skb;
}

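/* Implements the ODPAT_SET_DL_SRC and ODPAT_SET_DL_DST actions: rewrites the
 * Ethernet source or destination address.  Returns the skb, or NULL (consuming
 * 'skb') if it could not be made writable. */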
static struct sk_buff *set_dl_addr(struct sk_buff *skb,
                                   const struct odp_action_dl_addr *a,
                                   gfp_t gfp)
{
        skb = make_writable(skb, 0, gfp);
        if (skb) {
                struct ethhdr *eh = eth_hdr(skb);
                if (a->type == ODPAT_SET_DL_SRC)
                        memcpy(eh->h_source, a->dl_addr, ETH_ALEN);
                else
                        memcpy(eh->h_dest, a->dl_addr, ETH_ALEN);
        }
        return skb;
}

/* Updates 'sum', which is a field in 'skb''s data, given that a 4-byte field
 * covered by the sum has been changed from 'from' to 'to'.  If set,
 * 'pseudohdr' indicates that the field is in the TCP or UDP pseudo-header.
 * Based on nf_proto_csum_replace4. */
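/* The Internet checksum is an end-around-carry one's-complement sum, so it can
 * be updated incrementally (cf. RFC 1071/1624): folding ~'from' and 'to' into
 * the existing sum has the same effect as recomputing the checksum over the
 * modified data. */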
static void update_csum(__sum16 *sum, struct sk_buff *skb,
                        __be32 from, __be32 to, int pseudohdr)
{
        __be32 diff[] = { ~from, to };

        if (OVS_CB(skb)->ip_summed != OVS_CSUM_PARTIAL) {
                *sum = csum_fold(csum_partial((char *)diff, sizeof(diff),
                                ~csum_unfold(*sum)));
                if (OVS_CB(skb)->ip_summed == OVS_CSUM_COMPLETE && pseudohdr)
                        skb->csum = ~csum_partial((char *)diff, sizeof(diff),
                                                  ~skb->csum);
        } else if (pseudohdr)
                *sum = ~csum_fold(csum_partial((char *)diff, sizeof(diff),
                                csum_unfold(*sum)));
}

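/* Implements the ODPAT_SET_NW_SRC and ODPAT_SET_NW_DST actions: rewrites the
 * IPv4 source or destination address and incrementally updates the IP header
 * checksum and, for TCP and UDP, the transport checksum.  Returns the skb, or
 * NULL (consuming 'skb') if it could not be made writable. */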
static struct sk_buff *set_nw_addr(struct sk_buff *skb,
                                   const struct odp_flow_key *key,
                                   const struct odp_action_nw_addr *a,
                                   gfp_t gfp)
{
        if (key->dl_type != htons(ETH_P_IP))
                return skb;

        skb = make_writable(skb, 0, gfp);
        if (skb) {
                struct iphdr *nh = ip_hdr(skb);
                u32 *f = a->type == ODPAT_SET_NW_SRC ? &nh->saddr : &nh->daddr;
                u32 old = *f;
                u32 new = a->nw_addr;

                if (key->nw_proto == IPPROTO_TCP) {
                        struct tcphdr *th = tcp_hdr(skb);
                        update_csum(&th->check, skb, old, new, 1);
                } else if (key->nw_proto == IPPROTO_UDP) {
                        struct udphdr *th = udp_hdr(skb);
                        update_csum(&th->check, skb, old, new, 1);
                }
                update_csum(&nh->check, skb, old, new, 0);
                *f = new;
        }
        return skb;
}

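/* Implements the ODPAT_SET_NW_TOS action: sets the DSCP bits of the IPv4 TOS
 * field, preserving the ECN bits, and updates the IP header checksum.  Returns
 * the skb, or NULL (consuming 'skb') if it could not be made writable. */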
static struct sk_buff *set_nw_tos(struct sk_buff *skb,
                                   const struct odp_flow_key *key,
                                   const struct odp_action_nw_tos *a,
                                   gfp_t gfp)
{
        if (key->dl_type != htons(ETH_P_IP))
                return skb;

        skb = make_writable(skb, 0, gfp);
        if (skb) {
                struct iphdr *nh = ip_hdr(skb);
                u8 *f = &nh->tos;
                u8 old = *f;
                u8 new;

                /* Set the DSCP bits and preserve the ECN bits. */
                new = a->nw_tos | (nh->tos & INET_ECN_MASK);
                update_csum(&nh->check, skb, htons((uint16_t)old),
                                htons((uint16_t)new), 0);
                *f = new;
        }
        return skb;
}

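/* Implements the ODPAT_SET_TP_SRC and ODPAT_SET_TP_DST actions: rewrites the
 * TCP or UDP source or destination port and updates the transport checksum.
 * The TCP and UDP port fields live at the same offsets, so the UDP header
 * structure is used to access them in both cases.  Returns the skb, or NULL
 * (consuming 'skb') if it could not be made writable. */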
static struct sk_buff *set_tp_port(struct sk_buff *skb,
                                   const struct odp_flow_key *key,
                                   const struct odp_action_tp_port *a, gfp_t gfp)
{
        int check_ofs;

        if (key->dl_type != htons(ETH_P_IP))
                return skb;

        if (key->nw_proto == IPPROTO_TCP)
                check_ofs = offsetof(struct tcphdr, check);
        else if (key->nw_proto == IPPROTO_UDP)
                check_ofs = offsetof(struct udphdr, check);
        else
                return skb;

        skb = make_writable(skb, 0, gfp);
        if (skb) {
                struct udphdr *th = udp_hdr(skb);
                u16 *f = a->type == ODPAT_SET_TP_SRC ? &th->source : &th->dest;
                u16 old = *f;
                u16 new = a->tp_port;
                update_csum((__sum16 *)(skb_transport_header(skb) + check_ofs),
                            skb, old, new, 0);
                *f = new;
        }
        return skb;
}

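/* Sends 'skb' out of the datapath port numbered 'out_port', or frees it if
 * that port does not exist.  Always consumes 'skb'. */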
static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
{
        struct dp_port *p;

        if (!skb)
                goto error;

        p = rcu_dereference(dp->ports[out_port]);
        if (!p)
                goto error;

        vport_send(p->vport, skb);
        return;

error:
        kfree_skb(skb);
}

/* Outputs a clone of 'skb' to each port in group 'group' other than the input
 * port, except for the last such port, whose number is returned so that the
 * caller can send 'skb' there itself.  Never consumes 'skb'.  Returns -1 if
 * there is no port to send to (or if cloning fails). */
static int output_group(struct datapath *dp, __u16 group,
                        struct sk_buff *skb, gfp_t gfp)
{
        struct dp_port_group *g = rcu_dereference(dp->groups[group]);
        int prev_port = -1;
        int i;

        if (!g)
                return -1;
        for (i = 0; i < g->n_ports; i++) {
                struct dp_port *p = rcu_dereference(dp->ports[g->ports[i]]);
                if (!p || OVS_CB(skb)->dp_port == p)
                        continue;
                if (prev_port != -1) {
                        struct sk_buff *clone = skb_clone(skb, gfp);
                        if (!clone)
                                return -1;
                        do_output(dp, clone, prev_port);
                }
                prev_port = p->port_no;
        }
        return prev_port;
}

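/* Sends a clone of 'skb' to userspace on the _ODPL_ACTION_NR queue, with 'arg'
 * as the argument supplied by the ODPAT_CONTROLLER action.  Never consumes
 * 'skb'.  Returns 0 on success, otherwise a negative errno value. */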
static int output_control(struct datapath *dp, struct sk_buff *skb, u32 arg,
                          gfp_t gfp)
{
        skb = skb_clone(skb, gfp);
        if (!skb)
                return -ENOMEM;
        return dp_output_control(dp, skb, _ODPL_ACTION_NR, arg);
}

/* Send a copy of this packet up to the sFlow agent, along with extra
 * information about what happened to it. */
static void sflow_sample(struct datapath *dp, struct sk_buff *skb,
                         const union odp_action *a, int n_actions,
                         gfp_t gfp, struct dp_port *dp_port)
{
        struct odp_sflow_sample_header *hdr;
        unsigned int actlen = n_actions * sizeof(union odp_action);
        unsigned int hdrlen = sizeof(struct odp_sflow_sample_header);
        struct sk_buff *nskb;

        nskb = skb_copy_expand(skb, actlen + hdrlen, 0, gfp);
        if (!nskb)
                return;

        memcpy(__skb_push(nskb, actlen), a, actlen);
        hdr = (struct odp_sflow_sample_header*)__skb_push(nskb, hdrlen);
        hdr->n_actions = n_actions;
        hdr->sample_pool = atomic_read(&dp_port->sflow_pool);
        dp_output_control(dp, nskb, _ODPL_SFLOW_NR, 0);
}

/* Execute a list of actions against 'skb'. */
int execute_actions(struct datapath *dp, struct sk_buff *skb,
                    const struct odp_flow_key *key,
                    const union odp_action *a, int n_actions,
                    gfp_t gfp)
{
        /* Every output action needs a separate clone of 'skb', but the common
         * case is just a single output action, so cloning and then freeing the
         * original skbuff would be wasteful.  The code below is therefore
         * slightly obscure, just to avoid that. */
        int prev_port = -1;
        u32 priority = skb->priority;
        int err;

        if (dp->sflow_probability) {
                struct dp_port *p = OVS_CB(skb)->dp_port;
                if (p) {
                        atomic_inc(&p->sflow_pool);
                        if (dp->sflow_probability == UINT_MAX ||
                            net_random() < dp->sflow_probability)
                                sflow_sample(dp, skb, a, n_actions, gfp, p);
                }
        }

        OVS_CB(skb)->tun_id = 0;

        for (; n_actions > 0; a++, n_actions--) {
                if (prev_port != -1) {
                        do_output(dp, skb_clone(skb, gfp), prev_port);
                        prev_port = -1;
                }

                switch (a->type) {
                case ODPAT_OUTPUT:
                        prev_port = a->output.port;
                        break;

                case ODPAT_OUTPUT_GROUP:
                        prev_port = output_group(dp, a->output_group.group,
                                                 skb, gfp);
                        break;

                case ODPAT_CONTROLLER:
                        err = output_control(dp, skb, a->controller.arg, gfp);
                        if (err) {
                                kfree_skb(skb);
                                return err;
                        }
                        break;

                case ODPAT_SET_TUNNEL:
                        OVS_CB(skb)->tun_id = a->tunnel.tun_id;
                        break;

                case ODPAT_SET_VLAN_VID:
                case ODPAT_SET_VLAN_PCP:
                        skb = modify_vlan_tci(dp, skb, key, a, n_actions, gfp);
                        if (IS_ERR(skb))
                                return PTR_ERR(skb);
                        break;

                case ODPAT_STRIP_VLAN:
                        skb = strip_vlan(skb, gfp);
                        break;

                case ODPAT_SET_DL_SRC:
                case ODPAT_SET_DL_DST:
                        skb = set_dl_addr(skb, &a->dl_addr, gfp);
                        break;

                case ODPAT_SET_NW_SRC:
                case ODPAT_SET_NW_DST:
                        skb = set_nw_addr(skb, key, &a->nw_addr, gfp);
                        break;

                case ODPAT_SET_NW_TOS:
                        skb = set_nw_tos(skb, key, &a->nw_tos, gfp);
                        break;

                case ODPAT_SET_TP_SRC:
                case ODPAT_SET_TP_DST:
                        skb = set_tp_port(skb, key, &a->tp_port, gfp);
                        break;

                case ODPAT_SET_PRIORITY:
                        skb->priority = a->priority.priority;
                        break;

                case ODPAT_POP_PRIORITY:
                        skb->priority = priority;
                        break;
                }
                if (!skb)
                        return -ENOMEM;
        }
        if (prev_port != -1)
                do_output(dp, skb, prev_port);
        else
                kfree_skb(skb);
        return 0;
}