Add support for exporting flow information in NetFlow v5 format.
[sliver-openvswitch.git] / datapath / flow.c
1 /*
2  * Distributed under the terms of the GNU GPL version 2.
3  * Copyright (c) 2007, 2008 The Board of Trustees of The Leland 
4  * Stanford Junior University
5  */
6
7 #include "flow.h"
8 #include <linux/netdevice.h>
9 #include <linux/etherdevice.h>
10 #include <linux/if_ether.h>
11 #include <linux/if_vlan.h>
12 #include <net/llc_pdu.h>
13 #include <linux/jiffies.h>
14 #include <linux/kernel.h>
15 #include <linux/llc.h>
16 #include <linux/module.h>
17 #include <linux/in.h>
18 #include <linux/rcupdate.h>
19
20 #include "openflow/openflow.h"
21 #include "openflow/nicira-ext.h"
22 #include "compat.h"
23
24 struct kmem_cache *flow_cache;
25
26 /* Internal function used to compare fields in flow. */
27 static inline
28 int flow_fields_match(const struct sw_flow_key *a, const struct sw_flow_key *b,
29                       uint32_t w, uint32_t src_mask, uint32_t dst_mask)
30 {
31         return ((w & OFPFW_IN_PORT || a->in_port == b->in_port)
32                 && (w & OFPFW_DL_VLAN || a->dl_vlan == b->dl_vlan)
33                 && (w & OFPFW_DL_SRC || !memcmp(a->dl_src, b->dl_src, ETH_ALEN))
34                 && (w & OFPFW_DL_DST || !memcmp(a->dl_dst, b->dl_dst, ETH_ALEN))
35                 && (w & OFPFW_DL_TYPE || a->dl_type == b->dl_type)
36                 && !((a->nw_src ^ b->nw_src) & src_mask)
37                 && !((a->nw_dst ^ b->nw_dst) & dst_mask)
38                 && (w & OFPFW_NW_PROTO || a->nw_proto == b->nw_proto)
39                 && (w & OFPFW_TP_SRC || a->tp_src == b->tp_src)
40                 && (w & OFPFW_TP_DST || a->tp_dst == b->tp_dst));
41 }
42
43 /* Returns nonzero if 'a' and 'b' match, that is, if their fields are equal
44  * modulo wildcards in 'b', zero otherwise. */
45 int flow_matches_1wild(const struct sw_flow_key *a,
46                        const struct sw_flow_key *b)
47 {
48         return flow_fields_match(a, b, b->wildcards,
49                                  b->nw_src_mask, b->nw_dst_mask);
50 }
51 EXPORT_SYMBOL(flow_matches_1wild);
52
53 /* Returns nonzero if 'a' and 'b' match, that is, if their fields are equal
54  * modulo wildcards in 'a' or 'b', zero otherwise. */
55 int flow_matches_2wild(const struct sw_flow_key *a,
56                        const struct sw_flow_key *b)
57 {
58         return flow_fields_match(a, b,
59                                  a->wildcards | b->wildcards,
60                                  a->nw_src_mask & b->nw_src_mask,
61                                  a->nw_dst_mask & b->nw_dst_mask);
62 }
63 EXPORT_SYMBOL(flow_matches_2wild);
64
65 /* Returns nonzero if 't' (the table entry's key) and 'd' (the key
66  * describing the match) match, that is, if their fields are
67  * equal modulo wildcards, zero otherwise.  If 'strict' is nonzero, the
68  * wildcards must match in both 't_key' and 'd_key'.  Note that the
69  * table's wildcards are ignored unless 'strict' is set. */
70 int flow_matches_desc(const struct sw_flow_key *t, const struct sw_flow_key *d, 
71                 int strict)
72 {
73         if (strict && d->wildcards != t->wildcards)
74                 return 0;
75         return flow_matches_1wild(t, d);
76 }
77 EXPORT_SYMBOL(flow_matches_desc);
78
79 static uint32_t make_nw_mask(int n_wild_bits)
80 {
81         n_wild_bits &= (1u << OFPFW_NW_SRC_BITS) - 1;
82         return n_wild_bits < 32 ? htonl(~((1u << n_wild_bits) - 1)) : 0;
83 }
84
/* Converts the wire-format match 'from' into the internal key 'to',
 * normalizing the wildcard bits: fields that cannot sensibly be matched
 * (because an enclosing protocol is wildcarded or unsupported) are forced
 * to either fully-wildcarded or exact-match-on-zero, as commented below. */
void flow_extract_match(struct sw_flow_key* to, const struct ofp_match* from)
{
	/* Mask off any wildcard bits beyond those the protocol defines. */
	to->wildcards = ntohl(from->wildcards) & OFPFW_ALL;
	to->pad = 0;
	to->in_port = from->in_port;
	to->dl_vlan = from->dl_vlan;
	memcpy(to->dl_src, from->dl_src, ETH_ALEN);
	memcpy(to->dl_dst, from->dl_dst, ETH_ALEN);
	to->dl_type = from->dl_type;

	/* Default network/transport fields to zero; only filled in for IP
	 * (and TCP/UDP/ICMP) below. */
	to->nw_src = to->nw_dst = to->nw_proto = 0;
	to->tp_src = to->tp_dst = 0;

#define OFPFW_TP (OFPFW_TP_SRC | OFPFW_TP_DST)
#define OFPFW_NW (OFPFW_NW_SRC_MASK | OFPFW_NW_DST_MASK | OFPFW_NW_PROTO)
	if (to->wildcards & OFPFW_DL_TYPE) {
		/* Can't sensibly match on network or transport headers if the
		 * data link type is unknown. */
		to->wildcards |= OFPFW_NW | OFPFW_TP;
	} else if (from->dl_type == htons(ETH_P_IP)) {
		to->nw_src   = from->nw_src;
		to->nw_dst   = from->nw_dst;
		to->nw_proto = from->nw_proto;

		if (to->wildcards & OFPFW_NW_PROTO) {
			/* Can't sensibly match on transport headers if the
			 * network protocol is unknown. */
			to->wildcards |= OFPFW_TP;
		} else if (from->nw_proto == IPPROTO_TCP
				|| from->nw_proto == IPPROTO_UDP
				|| from->nw_proto == IPPROTO_ICMP) {
			to->tp_src = from->tp_src;
			to->tp_dst = from->tp_dst;
		} else {
			/* Transport layer fields are undefined.  Mark them as
			 * exact-match to allow such flows to reside in
			 * table-hash, instead of falling into table-linear. */
			to->wildcards &= ~OFPFW_TP;
		}
	} else {
		/* Network and transport layer fields are undefined.  Mark them
		 * as exact-match to allow such flows to reside in table-hash,
		 * instead of falling into table-linear. */
		to->wildcards &= ~(OFPFW_NW | OFPFW_TP);
	}

	/* We set these late because code above adjusts to->wildcards. */
	to->nw_src_mask = make_nw_mask(to->wildcards >> OFPFW_NW_SRC_SHIFT);
	to->nw_dst_mask = make_nw_mask(to->wildcards >> OFPFW_NW_DST_SHIFT);
}
135
136 void flow_fill_match(struct ofp_match* to, const struct sw_flow_key* from)
137 {
138         to->wildcards = htonl(from->wildcards);
139         to->in_port   = from->in_port;
140         to->dl_vlan   = from->dl_vlan;
141         memcpy(to->dl_src, from->dl_src, ETH_ALEN);
142         memcpy(to->dl_dst, from->dl_dst, ETH_ALEN);
143         to->dl_type   = from->dl_type;
144         to->nw_src    = from->nw_src;
145         to->nw_dst    = from->nw_dst;
146         to->nw_proto  = from->nw_proto;
147         to->tp_src    = from->tp_src;
148         to->tp_dst    = from->tp_dst;
149         to->pad       = 0;
150 }
151
152 int flow_timeout(struct sw_flow *flow)
153 {
154         if (flow->idle_timeout != OFP_FLOW_PERMANENT
155             && time_after64(get_jiffies_64(), flow->used + flow->idle_timeout * HZ))
156                 return NXFER_IDLE_TIMEOUT;
157         else if (flow->hard_timeout != OFP_FLOW_PERMANENT
158                  && time_after64(get_jiffies_64(),
159                                flow->created + flow->hard_timeout * HZ))
160                 return NXFER_HARD_TIMEOUT;
161         else
162                 return -1;
163 }
164 EXPORT_SYMBOL(flow_timeout);
165
166 /* Returns nonzero if 'flow' contains an output action to 'out_port' or
167  * has the value OFPP_NONE. 'out_port' is in network-byte order. */
168 int flow_has_out_port(struct sw_flow *flow, uint16_t out_port)
169 {
170         struct sw_flow_actions *sf_acts;
171         size_t actions_len;
172         uint8_t *p;
173
174         if (out_port == htons(OFPP_NONE))
175                 return 1;
176
177         sf_acts = rcu_dereference(flow->sf_acts);
178
179         actions_len = sf_acts->actions_len;
180         p = (uint8_t *)sf_acts->actions;
181
182         while (actions_len > 0) {
183                 struct ofp_action_header *ah = (struct ofp_action_header *)p;
184                 size_t len = ntohs(ah->len);
185
186                 if (ah->type == htons(OFPAT_OUTPUT)) {
187                         struct ofp_action_output *oa = (struct ofp_action_output *)p;
188                         if (oa->port == out_port)
189                                 return 1;
190                 }
191
192                 p += len;
193                 actions_len -= len;
194         }
195
196         return 0;
197 }
198 EXPORT_SYMBOL(flow_has_out_port);
199
200 /* Allocates and returns a new flow with room for 'actions_len' actions, 
201  * using allocation flags 'flags'.  Returns the new flow or a null pointer 
202  * on failure. */
203 struct sw_flow *flow_alloc(size_t actions_len, gfp_t flags)
204 {
205         struct sw_flow_actions *sfa;
206         size_t size = sizeof *sfa + actions_len;
207         struct sw_flow *flow = kmem_cache_alloc(flow_cache, flags);
208         if (unlikely(!flow))
209                 return NULL;
210
211         sfa = kmalloc(size, flags);
212         if (unlikely(!sfa)) {
213                 kmem_cache_free(flow_cache, flow);
214                 return NULL;
215         }
216         sfa->actions_len = actions_len;
217         flow->sf_acts = sfa;
218
219         return flow;
220 }
221
222 /* Frees 'flow' immediately. */
223 void flow_free(struct sw_flow *flow)
224 {
225         if (unlikely(!flow))
226                 return;
227         kfree(flow->sf_acts);
228         kmem_cache_free(flow_cache, flow);
229 }
230 EXPORT_SYMBOL(flow_free);
231
232 /* RCU callback used by flow_deferred_free. */
233 static void rcu_free_flow_callback(struct rcu_head *rcu)
234 {
235         struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
236         flow_free(flow);
237 }
238
/* Schedules 'flow' to be freed after the next RCU grace period.
 * The caller must hold rcu_read_lock for this to be sensible.
 * The caller must not touch 'flow' after this returns, since it may be
 * freed at any time once the grace period elapses. */
void flow_deferred_free(struct sw_flow *flow)
{
	call_rcu(&flow->rcu, rcu_free_flow_callback);
}
EXPORT_SYMBOL(flow_deferred_free);
246
247 /* RCU callback used by flow_deferred_free_acts. */
248 static void rcu_free_acts_callback(struct rcu_head *rcu)
249 {
250         struct sw_flow_actions *sf_acts = container_of(rcu, 
251                         struct sw_flow_actions, rcu);
252         kfree(sf_acts);
253 }
254
/* Schedules 'sf_acts' to be freed after the next RCU grace period.
 * The caller must hold rcu_read_lock for this to be sensible.
 * Used when a flow's actions are replaced while readers may still be
 * traversing the old action list (see flow_replace_acts). */
void flow_deferred_free_acts(struct sw_flow_actions *sf_acts)
{
	call_rcu(&sf_acts->rcu, rcu_free_acts_callback);
}
EXPORT_SYMBOL(flow_deferred_free_acts);
262
263 /* Copies 'actions' into a newly allocated structure for use by 'flow'
264  * and safely frees the structure that defined the previous actions. */
265 void flow_replace_acts(struct sw_flow *flow, 
266                 const struct ofp_action_header *actions, size_t actions_len)
267 {
268         struct sw_flow_actions *sfa;
269         struct sw_flow_actions *orig_sfa = flow->sf_acts;
270         size_t size = sizeof *sfa + actions_len;
271
272         sfa = kmalloc(size, GFP_ATOMIC);
273         if (unlikely(!sfa))
274                 return;
275
276         sfa->actions_len = actions_len;
277         memcpy(sfa->actions, actions, actions_len);
278
279         rcu_assign_pointer(flow->sf_acts, sfa);
280         flow_deferred_free_acts(orig_sfa);
281
282         return;
283 }
284 EXPORT_SYMBOL(flow_replace_acts);
285
/* Prints a representation of 'key' to the kernel log, one line per flow.
 * NOTE(review): printk is called without a KERN_* level prefix, so the
 * message is emitted at the kernel's default log level — confirm that is
 * intended for what is essentially debug output. */
void print_flow(const struct sw_flow_key *key)
{
	printk("wild%08x port%04x:vlan%04x mac%02x:%02x:%02x:%02x:%02x:%02x"
			"->%02x:%02x:%02x:%02x:%02x:%02x "
			"proto%04x ip%u.%u.%u.%u->%u.%u.%u.%u port%d->%d\n",
			key->wildcards, ntohs(key->in_port), ntohs(key->dl_vlan),
			key->dl_src[0], key->dl_src[1], key->dl_src[2],
			key->dl_src[3], key->dl_src[4], key->dl_src[5],
			key->dl_dst[0], key->dl_dst[1], key->dl_dst[2],
			key->dl_dst[3], key->dl_dst[4], key->dl_dst[5],
			ntohs(key->dl_type),
			/* nw_src/nw_dst are network byte order, so indexing
			 * the bytes directly prints dotted-quad correctly. */
			((unsigned char *)&key->nw_src)[0],
			((unsigned char *)&key->nw_src)[1],
			((unsigned char *)&key->nw_src)[2],
			((unsigned char *)&key->nw_src)[3],
			((unsigned char *)&key->nw_dst)[0],
			((unsigned char *)&key->nw_dst)[1],
			((unsigned char *)&key->nw_dst)[2],
			((unsigned char *)&key->nw_dst)[3],
			ntohs(key->tp_src), ntohs(key->tp_dst));
}
EXPORT_SYMBOL(print_flow);
309
#define SNAP_OUI_LEN 3

/* Ethernet header immediately followed by an 802.2 LLC/SNAP header, as
 * laid out on the wire.  Packed so that sizeof matches the wire format
 * exactly, with no compiler-inserted padding before 'ethertype'. */
struct eth_snap_hdr
{
	struct ethhdr eth;
	uint8_t  dsap;  /* Always 0xAA */
	uint8_t  ssap;  /* Always 0xAA */
	uint8_t  ctrl;
	uint8_t  oui[SNAP_OUI_LEN];
	uint16_t ethertype;  /* Network byte order. */
} __attribute__ ((packed));
321
322 static int is_snap(const struct eth_snap_hdr *esh)
323 {
324         return (esh->dsap == LLC_SAP_SNAP
325                 && esh->ssap == LLC_SAP_SNAP
326                 && !memcmp(esh->oui, "\0\0\0", 3));
327 }
328
/* Parses the Ethernet frame in 'skb', which was received on 'in_port',
 * and initializes 'key' to match.  Returns 1 if 'skb' contains an IP
 * fragment, 0 otherwise.  On return, the skb's mac/network/transport
 * header offsets have been set to the parsed positions. */
int flow_extract(struct sk_buff *skb, uint16_t in_port,
                 struct sw_flow_key *key)
{
	struct ethhdr *eth;
	struct eth_snap_hdr *esh;
	int retval = 0;
	int nh_ofs;

	/* Start from an all-exact-match, all-zero key. */
	memset(key, 0, sizeof *key);
	key->dl_vlan = htons(OFP_VLAN_NONE);
	key->in_port = htons(in_port);

	if (skb->len < sizeof *eth)
		return 0;
	/* Linearize the first 64 bytes (or the whole packet if shorter);
	 * that covers the headers parsed directly below.  NOTE(review):
	 * headers beyond 64 bytes appear to rely on the later
	 * iphdr_ok/tcphdr_ok/etc. checks — confirm those helpers
	 * re-validate the linear data length. */
	if (!pskb_may_pull(skb, skb->len >= 64 ? 64 : skb->len)) {
		return 0;
	}

	skb_reset_mac_header(skb);
	eth = eth_hdr(skb);
	esh = (struct eth_snap_hdr *) eth;
	nh_ofs = sizeof *eth;
	/* Classify the frame: Ethernet II (h_proto is an ethertype),
	 * 802.2 LLC/SNAP (ethertype inside the SNAP header), or raw LLC
	 * (no ethertype at all). */
	if (likely(ntohs(eth->h_proto) >= OFP_DL_TYPE_ETH2_CUTOFF))
		key->dl_type = eth->h_proto;
	else if (skb->len >= sizeof *esh && is_snap(esh)) {
		key->dl_type = esh->ethertype;
		nh_ofs = sizeof *esh;
	} else {
		key->dl_type = htons(OFP_DL_TYPE_NOT_ETH_TYPE);
		if (skb->len >= nh_ofs + sizeof(struct llc_pdu_un)) {
			nh_ofs += sizeof(struct llc_pdu_un);
		}
	}

	/* Check for a VLAN tag */
	if (key->dl_type == htons(ETH_P_8021Q) &&
	    skb->len >= nh_ofs + sizeof(struct vlan_hdr)) {
		struct vlan_hdr *vh = (struct vlan_hdr*)(skb->data + nh_ofs);
		/* Replace the 802.1Q ethertype with the encapsulated one and
		 * keep only the VLAN ID bits of the TCI (PCP/CFI dropped). */
		key->dl_type = vh->h_vlan_encapsulated_proto;
		key->dl_vlan = vh->h_vlan_TCI & htons(VLAN_VID_MASK);
		nh_ofs += sizeof(struct vlan_hdr);
	}
	memcpy(key->dl_src, eth->h_source, ETH_ALEN);
	memcpy(key->dl_dst, eth->h_dest, ETH_ALEN);
	skb_set_network_header(skb, nh_ofs);

	/* Network layer. */
	if (key->dl_type == htons(ETH_P_IP) && iphdr_ok(skb)) {
		struct iphdr *nh = ip_hdr(skb);
		/* ihl is in 32-bit words, so the L4 header starts ihl*4
		 * bytes past the start of the IP header. */
		int th_ofs = nh_ofs + nh->ihl * 4;
		key->nw_src = nh->saddr;
		key->nw_dst = nh->daddr;
		key->nw_proto = nh->protocol;
		skb_set_transport_header(skb, th_ofs);

		/* Transport layer: only parsed for unfragmented packets
		 * (first fragments included would have MF set). */
		if (!(nh->frag_off & htons(IP_MF | IP_OFFSET))) {
			if (key->nw_proto == IPPROTO_TCP) {
				if (tcphdr_ok(skb)) {
					struct tcphdr *tcp = tcp_hdr(skb);
					key->tp_src = tcp->source;
					key->tp_dst = tcp->dest;
				} else {
					/* Avoid tricking other code into
					 * thinking that this packet has an L4
					 * header. */
					key->nw_proto = 0;
				}
			} else if (key->nw_proto == IPPROTO_UDP) {
				if (udphdr_ok(skb)) {
					struct udphdr *udp = udp_hdr(skb);
					key->tp_src = udp->source;
					key->tp_dst = udp->dest;
				} else {
					/* Avoid tricking other code into
					 * thinking that this packet has an L4
					 * header. */
					key->nw_proto = 0;
				}
			} else if (key->nw_proto == IPPROTO_ICMP) {
				if (icmphdr_ok(skb)) {
					struct icmphdr *icmp = icmp_hdr(skb);
					/* The ICMP type and code fields use the 16-bit
					 * transport port fields, so we need to store them
					 * in 16-bit network byte order. */
					key->icmp_type = htons(icmp->type);
					key->icmp_code = htons(icmp->code);
				} else {
					/* Avoid tricking other code into
					 * thinking that this packet has an L4
					 * header. */
					key->nw_proto = 0;
				}
			}
		} else {
			/* IP fragment: L4 ports stay zero and we tell the
			 * caller via the return value. */
			retval = 1;
		}
	} else {
		/* Non-IP (or truncated IP): no transport header. */
		skb_reset_transport_header(skb);
	}
	return retval;
}
434
435 /* Initializes the flow module.
436  * Returns zero if successful or a negative error code. */
437 int flow_init(void)
438 {
439         flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
440                                         0, NULL);
441         if (flow_cache == NULL)
442                 return -ENOMEM;
443
444         return 0;
445 }
446
/* Uninitializes the flow module, destroying the flow slab cache.
 * NOTE(review): assumes flow_init() succeeded and all flows have been
 * freed; destroying a cache with live objects is a kernel bug — confirm
 * callers order teardown accordingly. */
void flow_exit(void)
{
	kmem_cache_destroy(flow_cache);
}
452