Merge commit 'origin/citrix'
[sliver-openvswitch.git] / datapath / actions.c
1 /*
2  * Distributed under the terms of the GNU GPL version 2.
3  * Copyright (c) 2007, 2008, 2009 Nicira Networks.
4  *
5  * Significant portions of this file may be copied from parts of the Linux
6  * kernel, by Linus Torvalds and others.
7  */
8
9 /* Functions for executing flow actions. */
10
11 #include <linux/skbuff.h>
12 #include <linux/in.h>
13 #include <linux/ip.h>
14 #include <linux/tcp.h>
15 #include <linux/udp.h>
16 #include <linux/in6.h>
17 #include <linux/if_vlan.h>
18 #include <net/ip.h>
19 #include <net/checksum.h>
20 #include "datapath.h"
21 #include "dp_dev.h"
22 #include "actions.h"
23 #include "openvswitch/datapath-protocol.h"
24
25 struct sk_buff *
26 make_writable(struct sk_buff *skb, gfp_t gfp)
27 {
28         if (skb_shared(skb) || skb_cloned(skb)) {
29                 struct sk_buff *nskb = skb_copy(skb, gfp);
30                 if (nskb) {
31                         kfree_skb(skb);
32                         return nskb;
33                 }
34         } else {
35                 unsigned int hdr_len = (skb_transport_offset(skb)
36                                         + sizeof(struct tcphdr));
37                 if (pskb_may_pull(skb, min(hdr_len, skb->len)))
38                         return skb;
39         }
40         kfree_skb(skb);
41         return NULL;
42 }
43
44
45 static struct sk_buff *
46 vlan_pull_tag(struct sk_buff *skb)
47 {
48         struct vlan_ethhdr *vh = vlan_eth_hdr(skb);
49         struct ethhdr *eh;
50
51
52         /* Verify we were given a vlan packet */
53         if (vh->h_vlan_proto != htons(ETH_P_8021Q))
54                 return skb;
55
56         memmove(skb->data + VLAN_HLEN, skb->data, 2 * VLAN_ETH_ALEN);
57
58         eh = (struct ethhdr *)skb_pull(skb, VLAN_HLEN);
59
60         skb->protocol = eh->h_proto;
61         skb->mac_header += VLAN_HLEN;
62
63         return skb;
64 }
65
66
67 static struct sk_buff *
68 modify_vlan_tci(struct datapath *dp, struct sk_buff *skb,
69                 struct odp_flow_key *key, const union odp_action *a,
70                 int n_actions, gfp_t gfp)
71 {
72         u16 tci, mask;
73
74         if (a->type == ODPAT_SET_VLAN_VID) {
75                 tci = ntohs(a->vlan_vid.vlan_vid);
76                 mask = VLAN_VID_MASK;
77                 key->dl_vlan = htons(tci & mask);
78         } else {
79                 tci = a->vlan_pcp.vlan_pcp << 13;
80                 mask = VLAN_PCP_MASK;
81         }
82
83         skb = make_writable(skb, gfp);
84         if (!skb)
85                 return ERR_PTR(-ENOMEM);
86
87         if (skb->protocol == htons(ETH_P_8021Q)) {
88                 /* Modify vlan id, but maintain other TCI values */
89                 struct vlan_ethhdr *vh = vlan_eth_hdr(skb);
90                 vh->h_vlan_TCI = htons((ntohs(vh->h_vlan_TCI) & ~mask) | tci);
91         } else {
92                 /* Add vlan header */
93
94                 /* Set up checksumming pointers for checksum-deferred packets
95                  * on Xen.  Otherwise, dev_queue_xmit() will try to do this
96                  * when we send the packet out on the wire, and it will fail at
97                  * that point because skb_checksum_setup() will not look inside
98                  * an 802.1Q header. */
99                 vswitch_skb_checksum_setup(skb);
100
101                 /* GSO is not implemented for packets with an 802.1Q header, so
102                  * we have to do segmentation before we add that header.
103                  *
104                  * GSO does work with hardware-accelerated VLAN tagging, but we
105                  * can't use hardware-accelerated VLAN tagging since it
106                  * requires the device to have a VLAN group configured (with
107                  * e.g. vconfig(8)) and we don't do that.
108                  *
109                  * Having to do this here may be a performance loss, since we
110                  * can't take advantage of TSO hardware support, although it
111                  * does not make a measurable network performance difference
112                  * for 1G Ethernet.  Fixing that would require patching the
113                  * kernel (either to add GSO support to the VLAN protocol or to
114                  * support hardware-accelerated VLAN tagging without VLAN
115                  * groups configured). */
116                 if (skb_is_gso(skb)) {
117                         struct sk_buff *segs;
118
119                         segs = skb_gso_segment(skb, 0);
120                         kfree_skb(skb);
121                         if (unlikely(IS_ERR(segs)))
122                                 return ERR_CAST(segs);
123
124                         do {
125                                 struct sk_buff *nskb = segs->next;
126                                 int err;
127
128                                 segs->next = NULL;
129
130                                 segs = __vlan_put_tag(segs, tci);
131                                 err = -ENOMEM;
132                                 if (segs) {
133                                         struct odp_flow_key segkey = *key;
134                                         err = execute_actions(dp, segs,
135                                                               &segkey, a + 1,
136                                                               n_actions - 1,
137                                                               gfp);
138                                 }
139
140                                 if (unlikely(err)) {
141                                         while ((segs = nskb)) {
142                                                 nskb = segs->next;
143                                                 segs->next = NULL;
144                                                 kfree_skb(segs);
145                                         }
146                                         return ERR_PTR(err);
147                                 }
148
149                                 segs = nskb;
150                         } while (segs->next);
151
152                         skb = segs;
153                 }
154
155                 /* The hardware-accelerated version of vlan_put_tag() works
156                  * only for a device that has a VLAN group configured (with
157                  * e.g. vconfig(8)), so call the software-only version
158                  * __vlan_put_tag() directly instead.
159                  */
160                 skb = __vlan_put_tag(skb, tci);
161                 if (!skb)
162                         return ERR_PTR(-ENOMEM);
163         }
164
165         return skb;
166 }
167
168 static struct sk_buff *strip_vlan(struct sk_buff *skb,
169                                   struct odp_flow_key *key, gfp_t gfp)
170 {
171         skb = make_writable(skb, gfp);
172         if (skb) {
173                 vlan_pull_tag(skb);
174                 key->dl_vlan = htons(ODP_VLAN_NONE);
175         }
176         return skb;
177 }
178
179 static struct sk_buff *set_dl_addr(struct sk_buff *skb,
180                                    const struct odp_action_dl_addr *a,
181                                    gfp_t gfp)
182 {
183         skb = make_writable(skb, gfp);
184         if (skb) {
185                 struct ethhdr *eh = eth_hdr(skb);
186                 memcpy(a->type == ODPAT_SET_DL_SRC ? eh->h_source : eh->h_dest,
187                        a->dl_addr, ETH_ALEN);
188         }
189         return skb;
190 }
191
192 /* Updates 'sum', which is a field in 'skb''s data, given that a 4-byte field
193  * covered by the sum has been changed from 'from' to 'to'.  If set,
194  * 'pseudohdr' indicates that the field is in the TCP or UDP pseudo-header.
195  * Based on nf_proto_csum_replace4. */
196 static void update_csum(__sum16 *sum, struct sk_buff *skb,
197                         __be32 from, __be32 to, int pseudohdr)
198 {
199         __be32 diff[] = { ~from, to };
200         if (skb->ip_summed != CHECKSUM_PARTIAL) {
201                 *sum = csum_fold(csum_partial((char *)diff, sizeof(diff),
202                                 ~csum_unfold(*sum)));
203                 if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
204                         skb->csum = ~csum_partial((char *)diff, sizeof(diff),
205                                                 ~skb->csum);
206         } else if (pseudohdr)
207                 *sum = ~csum_fold(csum_partial((char *)diff, sizeof(diff),
208                                 csum_unfold(*sum)));
209 }
210
211 static struct sk_buff *set_nw_addr(struct sk_buff *skb,
212                                    struct odp_flow_key *key,
213                                    const struct odp_action_nw_addr *a,
214                                    gfp_t gfp)
215 {
216         if (key->dl_type != htons(ETH_P_IP))
217                 return skb;
218
219         skb = make_writable(skb, gfp);
220         if (skb) {
221                 struct iphdr *nh = ip_hdr(skb);
222                 u32 *f = a->type == ODPAT_SET_NW_SRC ? &nh->saddr : &nh->daddr;
223                 u32 old = *f;
224                 u32 new = a->nw_addr;
225
226                 if (key->nw_proto == IPPROTO_TCP) {
227                         struct tcphdr *th = tcp_hdr(skb);
228                         update_csum(&th->check, skb, old, new, 1);
229                 } else if (key->nw_proto == IPPROTO_UDP) {
230                         struct udphdr *th = udp_hdr(skb);
231                         update_csum(&th->check, skb, old, new, 1);
232                 }
233                 update_csum(&nh->check, skb, old, new, 0);
234                 *f = new;
235         }
236         return skb;
237 }
238
239 static struct sk_buff *
240 set_tp_port(struct sk_buff *skb, struct odp_flow_key *key,
241             const struct odp_action_tp_port *a,
242             gfp_t gfp)
243 {
244         int check_ofs;
245
246         if (key->dl_type != htons(ETH_P_IP))
247                 return skb;
248
249         if (key->nw_proto == IPPROTO_TCP)
250                 check_ofs = offsetof(struct tcphdr, check);
251         else if (key->nw_proto == IPPROTO_UDP)
252                 check_ofs = offsetof(struct udphdr, check);
253         else
254                 return skb;
255
256         skb = make_writable(skb, gfp);
257         if (skb) {
258                 struct udphdr *th = udp_hdr(skb);
259                 u16 *f = a->type == ODPAT_SET_TP_SRC ? &th->source : &th->dest;
260                 u16 old = *f;
261                 u16 new = a->tp_port;
262                 update_csum((u16*)((u8*)skb->data + check_ofs),
263                             skb, old, new, 1);
264                 *f = new;
265         }
266         return skb;
267 }
268
269 static inline unsigned packet_length(const struct sk_buff *skb)
270 {
271         unsigned length = skb->len - ETH_HLEN;
272         if (skb->protocol == htons(ETH_P_8021Q))
273                 length -= VLAN_HLEN;
274         return length;
275 }
276
277 int dp_xmit_skb(struct sk_buff *skb)
278 {
279         struct datapath *dp = skb->dev->br_port->dp;
280         int len = skb->len;
281
282         if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb)) {
283                 printk(KERN_WARNING "%s: dropped over-mtu packet: %d > %d\n",
284                        dp_name(dp), packet_length(skb), skb->dev->mtu);
285                 kfree_skb(skb);
286                 return -E2BIG;
287         }
288
289         dev_queue_xmit(skb);
290
291         return len;
292 }
293
294 static void
295 do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
296 {
297         struct net_bridge_port *p;
298         struct net_device *dev;
299
300         if (!skb)
301                 goto error;
302
303         p = dp->ports[out_port];
304         if (!p)
305                 goto error;
306
307         dev = skb->dev = p->dev;
308         if (is_dp_dev(dev))
309                 dp_dev_recv(dev, skb);
310         else
311                 dp_xmit_skb(skb);
312         return;
313
314 error:
315         kfree_skb(skb);
316 }
317
318 /* Never consumes 'skb'.  Returns a port that 'skb' should be sent to, -1 if
319  * none.  */
320 static int output_group(struct datapath *dp, __u16 group,
321                         struct sk_buff *skb, gfp_t gfp)
322 {
323         struct dp_port_group *g = rcu_dereference(dp->groups[group]);
324         int prev_port = -1;
325         int i;
326
327         if (!g)
328                 return -1;
329         for (i = 0; i < g->n_ports; i++) {
330                 struct net_bridge_port *p = dp->ports[g->ports[i]];
331                 if (!p || skb->dev == p->dev)
332                         continue;
333                 if (prev_port != -1) {
334                         struct sk_buff *clone = skb_clone(skb, gfp);
335                         if (!clone)
336                                 return -1;
337                         do_output(dp, clone, prev_port);
338                 }
339                 prev_port = p->port_no;
340         }
341         return prev_port;
342 }
343
344 static int
345 output_control(struct datapath *dp, struct sk_buff *skb, u32 arg, gfp_t gfp)
346 {
347         skb = skb_clone(skb, gfp);
348         if (!skb)
349                 return -ENOMEM;
350         return dp_output_control(dp, skb, _ODPL_ACTION_NR, arg);
351 }
352
353 /* Execute a list of actions against 'skb'. */
354 int execute_actions(struct datapath *dp, struct sk_buff *skb,
355                     struct odp_flow_key *key,
356                     const union odp_action *a, int n_actions,
357                     gfp_t gfp)
358 {
359         /* Every output action needs a separate clone of 'skb', but the common
360          * case is just a single output action, so that doing a clone and
361          * then freeing the original skbuff is wasteful.  So the following code
362          * is slightly obscure just to avoid that. */
363         int prev_port = -1;
364         int err;
365         for (; n_actions > 0; a++, n_actions--) {
366                 WARN_ON_ONCE(skb_shared(skb));
367                 if (prev_port != -1) {
368                         do_output(dp, skb_clone(skb, gfp), prev_port);
369                         prev_port = -1;
370                 }
371
372                 switch (a->type) {
373                 case ODPAT_OUTPUT:
374                         prev_port = a->output.port;
375                         break;
376
377                 case ODPAT_OUTPUT_GROUP:
378                         prev_port = output_group(dp, a->output_group.group,
379                                                  skb, gfp);
380                         break;
381
382                 case ODPAT_CONTROLLER:
383                         err = output_control(dp, skb, a->controller.arg, gfp);
384                         if (err) {
385                                 kfree_skb(skb);
386                                 return err;
387                         }
388                         break;
389
390                 case ODPAT_SET_VLAN_VID:
391                 case ODPAT_SET_VLAN_PCP:
392                         skb = modify_vlan_tci(dp, skb, key, a, n_actions, gfp);
393                         if (IS_ERR(skb))
394                                 return PTR_ERR(skb);
395                         break;
396
397                 case ODPAT_STRIP_VLAN:
398                         skb = strip_vlan(skb, key, gfp);
399                         break;
400
401                 case ODPAT_SET_DL_SRC:
402                 case ODPAT_SET_DL_DST:
403                         skb = set_dl_addr(skb, &a->dl_addr, gfp);
404                         break;
405
406                 case ODPAT_SET_NW_SRC:
407                 case ODPAT_SET_NW_DST:
408                         skb = set_nw_addr(skb, key, &a->nw_addr, gfp);
409                         break;
410
411                 case ODPAT_SET_TP_SRC:
412                 case ODPAT_SET_TP_DST:
413                         skb = set_tp_port(skb, key, &a->tp_port, gfp);
414                         break;
415                 }
416                 if (!skb)
417                         return -ENOMEM;
418         }
419         if (prev_port != -1)
420                 do_output(dp, skb, prev_port);
421         else
422                 kfree_skb(skb);
423         return 0;
424 }