Add support for exporting flow information in NetFlow v5 format.
[sliver-openvswitch.git] / datapath / datapath.c
1 /*
2  * Distributed under the terms of the GNU GPL version 2.
3  * Copyright (c) 2007, 2008 The Board of Trustees of The Leland 
4  * Stanford Junior University
5  */
6
7 /* Functions for managing the dp interface/device. */
8
9 #include <linux/init.h>
10 #include <linux/module.h>
11 #include <linux/if_arp.h>
12 #include <linux/if_bridge.h>
13 #include <linux/if_vlan.h>
14 #include <linux/in.h>
15 #include <net/genetlink.h>
16 #include <linux/ip.h>
17 #include <linux/delay.h>
18 #include <linux/time.h>
19 #include <linux/etherdevice.h>
20 #include <linux/kernel.h>
21 #include <linux/kthread.h>
22 #include <linux/mutex.h>
23 #include <linux/rtnetlink.h>
24 #include <linux/rcupdate.h>
25 #include <linux/version.h>
26 #include <linux/ethtool.h>
27 #include <linux/random.h>
28 #include <asm/system.h>
29 #include <asm/div64.h>
30 #include <linux/netfilter_bridge.h>
31 #include <linux/netfilter_ipv4.h>
32 #include <linux/inetdevice.h>
33 #include <linux/list.h>
34 #include <linux/rculist.h>
35 #include <linux/workqueue.h>
36 #include <linux/dmi.h>
37
38 #include "openflow/nicira-ext.h"
39 #include "openflow/openflow-netlink.h"
40 #include "datapath.h"
41 #include "nx_act_snat.h"
42 #include "table.h"
43 #include "chain.h"
44 #include "dp_dev.h"
45 #include "forward.h"
46 #include "flow.h"
47
48 #include "compat.h"
49
50
/* Strings to describe the manufacturer, hardware, and software.  This data
 * is queryable through the switch description stats message. */
static char mfr_desc[DESC_STR_LEN] = "Nicira Networks, Inc.";
static char hw_desc[DESC_STR_LEN] = "Reference Linux Kernel Module";
static char sw_desc[DESC_STR_LEN] = VERSION BUILDNR;
static char serial_num[SERIAL_NUM_LEN] = "None";

/* Expose the description strings as module parameters.  On 2.6 and later
 * kernels they are registered with mode 0444; older kernels use the legacy
 * MODULE_PARM string form. */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
module_param_string(mfr_desc, mfr_desc, sizeof mfr_desc, 0444);
module_param_string(hw_desc, hw_desc, sizeof hw_desc, 0444);
module_param_string(sw_desc, sw_desc, sizeof sw_desc, 0444);
module_param_string(serial_num, serial_num, sizeof serial_num, 0444);
#else
MODULE_PARM(mfr_desc, "s");
MODULE_PARM(hw_desc, "s");
MODULE_PARM(sw_desc, "s");
MODULE_PARM(serial_num, "s");
#endif
69
70
/* Number of milliseconds between runs of the maintenance thread. */
#define MAINT_SLEEP_MSECS 1000

/* Largest values representable in 32 and 16 unsigned bits. */
#define UINT32_MAX                        4294967295U
#define UINT16_MAX                        65535
/* Larger of X and Y.  The winning argument is evaluated twice, so do not
 * pass expressions with side effects. */
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
77
/* Generic Netlink family used to wrap OpenFlow messages, and the multicast
 * group that messages without a specific destination are sent to. */
static struct genl_family dp_genl_family;
static struct genl_multicast_group mc_group;

/* It's hard to imagine wanting more than one datapath, but... */
#define DP_MAX 32

/* Datapaths.  Protected on the read side by rcu_read_lock, on the write side
 * by dp_mutex.  dp_mutex is almost completely redundant with genl_mutex
 * maintained by the Generic Netlink code, but the timeout path needs mutual
 * exclusion too.
 *
 * It is safe to access the datapath and net_bridge_port structures with just
 * dp_mutex.
 */
static struct datapath *dps[DP_MAX];
DEFINE_MUTEX(dp_mutex);
EXPORT_SYMBOL(dp_mutex);

/* Forward declarations for functions used before their definitions. */
static int dp_maint_func(void *data);
static void init_port_status(struct net_bridge_port *p);
static int dp_genl_openflow_done(struct netlink_callback *);
static struct net_bridge_port *new_nbp(struct datapath *,
                                       struct net_device *, int port_no);
101
102 /* nla_shrink - reduce amount of space reserved by nla_reserve
103  * @skb: socket buffer from which to recover room
104  * @nla: netlink attribute to adjust
105  * @len: new length of attribute payload
106  *
107  * Reduces amount of space reserved by a call to nla_reserve.
108  *
109  * No other attributes may be added between calling nla_reserve and this
110  * function, since it will create a hole in the message.
111  */
112 void nla_shrink(struct sk_buff *skb, struct nlattr *nla, int len)
113 {
114         int delta = nla_total_size(len) - nla_total_size(nla_len(nla));
115         BUG_ON(delta > 0);
116         skb->tail += delta;
117         skb->len  += delta;
118         nla->nla_len = nla_attr_size(len);
119 }
120
121 /* Puts a set of openflow headers for a message of the given 'type' into 'skb'.
122  * If 'sender' is nonnull, then it is used as the message's destination.  'dp'
123  * must specify the datapath to use.
124  *
125  * '*max_openflow_len' receives the maximum number of bytes that are available
126  * for the embedded OpenFlow message.  The caller must call
127  * resize_openflow_skb() to set the actual size of the message to this number
128  * of bytes or less.
129  *
130  * Returns the openflow header if successful, otherwise (if 'skb' is too small)
131  * an error code. */
132 static void *
133 put_openflow_headers(struct datapath *dp, struct sk_buff *skb, uint8_t type,
134                      const struct sender *sender, int *max_openflow_len)
135 {
136         struct ofp_header *oh;
137         struct nlattr *attr;
138         int openflow_len;
139
140         /* Assemble the Generic Netlink wrapper. */
141         if (!genlmsg_put(skb,
142                          sender ? sender->pid : 0,
143                          sender ? sender->seq : 0,
144                          &dp_genl_family, 0, DP_GENL_C_OPENFLOW))
145                 return ERR_PTR(-ENOBUFS);
146         if (nla_put_u32(skb, DP_GENL_A_DP_IDX, dp->dp_idx) < 0)
147                 return ERR_PTR(-ENOBUFS);
148         openflow_len = (skb_tailroom(skb) - NLA_HDRLEN) & ~(NLA_ALIGNTO - 1);
149         if (openflow_len < sizeof *oh)
150                 return ERR_PTR(-ENOBUFS);
151         *max_openflow_len = openflow_len;
152         attr = nla_reserve(skb, DP_GENL_A_OPENFLOW, openflow_len);
153         BUG_ON(!attr);
154
155         /* Fill in the header.  The caller is responsible for the length. */
156         oh = nla_data(attr);
157         oh->version = OFP_VERSION;
158         oh->type = type;
159         oh->xid = sender ? sender->xid : 0;
160
161         return oh;
162 }
163
164 /* Resizes OpenFlow header 'oh', which must be at the tail end of 'skb', to new
165  * length 'new_length' (in bytes), adjusting pointers and size values as
166  * necessary. */
167 static void
168 resize_openflow_skb(struct sk_buff *skb,
169                     struct ofp_header *oh, size_t new_length)
170 {
171         struct nlattr *attr = ((void *) oh) - NLA_HDRLEN;
172         nla_shrink(skb, attr, new_length);
173         oh->length = htons(new_length);
174         nlmsg_end(skb, (struct nlmsghdr *) skb->data);
175 }
176
177 /* Allocates a new skb to contain an OpenFlow message 'openflow_len' bytes in
178  * length.  Returns a null pointer if memory is unavailable, otherwise returns
179  * the OpenFlow header and stores a pointer to the skb in '*pskb'. 
180  *
181  * 'type' is the OpenFlow message type.  If 'sender' is nonnull, then it is
182  * used as the message's destination.  'dp' must specify the datapath to
183  * use.  */
184 static void *
185 alloc_openflow_skb(struct datapath *dp, size_t openflow_len, uint8_t type,
186                    const struct sender *sender, struct sk_buff **pskb) 
187 {
188         struct ofp_header *oh;
189         size_t genl_len;
190         struct sk_buff *skb;
191         int max_openflow_len;
192
193         if ((openflow_len + sizeof(struct ofp_header)) > UINT16_MAX) {
194                 if (net_ratelimit())
195                         printk("alloc_openflow_skb: openflow message too large: %zu\n", 
196                                         openflow_len);
197                 return NULL;
198         }
199
200         genl_len = nlmsg_total_size(GENL_HDRLEN + dp_genl_family.hdrsize);
201         genl_len += nla_total_size(sizeof(uint32_t)); /* DP_GENL_A_DP_IDX */
202         genl_len += nla_total_size(openflow_len);    /* DP_GENL_A_OPENFLOW */
203         skb = *pskb = genlmsg_new(genl_len, GFP_ATOMIC);
204         if (!skb) {
205                 if (net_ratelimit())
206                         printk("alloc_openflow_skb: genlmsg_new failed\n");
207                 return NULL;
208         }
209
210         oh = put_openflow_headers(dp, skb, type, sender, &max_openflow_len);
211         BUG_ON(!oh || IS_ERR(oh));
212         resize_openflow_skb(skb, oh, openflow_len);
213
214         return oh;
215 }
216
217 /* Sends 'skb' to 'sender' if it is nonnull, otherwise multicasts 'skb' to all
218  * listeners. */
219 static int
220 send_openflow_skb(struct sk_buff *skb, const struct sender *sender) 
221 {
222         return (sender
223                 ? genlmsg_unicast(skb, sender->pid)
224                 : genlmsg_multicast(skb, 0, mc_group.id, GFP_ATOMIC));
225 }
226
227 /* Retrieves the datapath id, which is the MAC address of the "of" device. */
228 static 
229 uint64_t get_datapath_id(struct net_device *dev)
230 {
231         uint64_t id = 0;
232         int i;
233
234         for (i=0; i<ETH_ALEN; i++) 
235                 id |= (uint64_t)dev->dev_addr[i] << (8*(ETH_ALEN-1 - i));
236
237         return id;
238 }
239
240 /* Creates a new datapath numbered 'dp_idx'.  Returns 0 for success or a
241  * negative error code. */
242 static int new_dp(int dp_idx)
243 {
244         struct datapath *dp;
245         int err;
246
247         if (dp_idx < 0 || dp_idx >= DP_MAX)
248                 return -EINVAL;
249
250         if (!try_module_get(THIS_MODULE))
251                 return -ENODEV;
252
253         /* Exit early if a datapath with that number already exists. */
254         if (dps[dp_idx]) {
255                 err = -EEXIST;
256                 goto err_unlock;
257         }
258
259         err = -ENOMEM;
260         dp = kzalloc(sizeof *dp, GFP_KERNEL);
261         if (dp == NULL)
262                 goto err_unlock;
263
264         /* Setup our "of" device */
265         err = dp_dev_setup(dp);
266         if (err)
267                 goto err_free_dp;
268
269         dp->dp_idx = dp_idx;
270         dp->chain = chain_create(dp);
271         if (dp->chain == NULL)
272                 goto err_destroy_dp_dev;
273         INIT_LIST_HEAD(&dp->port_list);
274
275         dp->local_port = new_nbp(dp, dp->netdev, OFPP_LOCAL);
276         if (IS_ERR(dp->local_port)) {
277                 err = PTR_ERR(dp->local_port);
278                 goto err_destroy_local_port;
279         }
280
281         dp->flags = 0;
282         dp->miss_send_len = OFP_DEFAULT_MISS_SEND_LEN;
283
284         dp->dp_task = kthread_run(dp_maint_func, dp, "dp%d", dp_idx);
285         if (IS_ERR(dp->dp_task))
286                 goto err_destroy_chain;
287
288         dps[dp_idx] = dp;
289
290         return 0;
291
292 err_destroy_local_port:
293         dp_del_switch_port(dp->local_port);
294 err_destroy_chain:
295         chain_destroy(dp->chain);
296 err_destroy_dp_dev:
297         dp_dev_destroy(dp);
298 err_free_dp:
299         kfree(dp);
300 err_unlock:
301         module_put(THIS_MODULE);
302                 return err;
303 }
304
305 /* Find and return a free port number under 'dp'. */
306 static int find_portno(struct datapath *dp)
307 {
308         int i;
309         for (i = 0; i < DP_MAX_PORTS; i++)
310                 if (dp->ports[i] == NULL)
311                         return i;
312         return -EXFULL;
313 }
314
315 static struct net_bridge_port *new_nbp(struct datapath *dp,
316                                        struct net_device *dev, int port_no)
317 {
318         struct net_bridge_port *p;
319
320         if (dev->br_port != NULL)
321                 return ERR_PTR(-EBUSY);
322
323         p = kzalloc(sizeof(*p), GFP_KERNEL);
324         if (p == NULL)
325                 return ERR_PTR(-ENOMEM);
326
327         rtnl_lock();
328         dev_set_promiscuity(dev, 1);
329         rtnl_unlock();
330         dev_hold(dev);
331         p->dp = dp;
332         p->dev = dev;
333         p->port_no = port_no;
334         spin_lock_init(&p->lock);
335         INIT_WORK(&p->port_task, NULL);
336         if (port_no != OFPP_LOCAL)
337                 rcu_assign_pointer(dev->br_port, p);
338         if (port_no < DP_MAX_PORTS)
339                 rcu_assign_pointer(dp->ports[port_no], p); 
340         list_add_rcu(&p->node, &dp->port_list);
341
342         return p;
343 }
344
345 int add_switch_port(struct datapath *dp, struct net_device *dev)
346 {
347         struct net_bridge_port *p;
348         int port_no;
349
350         if (dev->flags & IFF_LOOPBACK || dev->type != ARPHRD_ETHER
351             || is_dp_dev(dev))
352                 return -EINVAL;
353
354         port_no = find_portno(dp);
355         if (port_no < 0)
356                 return port_no;
357
358         p = new_nbp(dp, dev, port_no);
359         if (IS_ERR(p))
360                 return PTR_ERR(p);
361
362         init_port_status(p);
363
364         /* Notify the ctlpath that this port has been added */
365         dp_send_port_status(p, OFPPR_ADD);
366
367         return 0;
368 }
369
370 /* Delete 'p' from switch. */
371 int dp_del_switch_port(struct net_bridge_port *p)
372 {
373 #ifdef SUPPORT_SNAT
374         unsigned long flags;
375 #endif
376
377         /* First drop references to device. */
378         cancel_work_sync(&p->port_task);
379         rtnl_lock();
380         dev_set_promiscuity(p->dev, -1);
381         rtnl_unlock();
382         list_del_rcu(&p->node);
383         if (p->port_no != OFPP_LOCAL)
384                 rcu_assign_pointer(p->dp->ports[p->port_no], NULL);
385         rcu_assign_pointer(p->dev->br_port, NULL);
386
387         /* Then wait until no one is still using it, and destroy it. */
388         synchronize_rcu();
389
390 #ifdef SUPPORT_SNAT
391         /* Free any SNAT configuration on the port. */
392         spin_lock_irqsave(&p->lock, flags);
393         snat_free_conf(p);
394         spin_unlock_irqrestore(&p->lock, flags);
395 #endif
396
397         /* Notify the ctlpath that this port no longer exists */
398         dp_send_port_status(p, OFPPR_DELETE);
399
400         dev_put(p->dev);
401         kfree(p);
402
403         return 0;
404 }
405
406 static void del_dp(struct datapath *dp)
407 {
408         struct net_bridge_port *p, *n;
409
410         kthread_stop(dp->dp_task);
411
412         /* Drop references to DP. */
413         list_for_each_entry_safe (p, n, &dp->port_list, node)
414                 dp_del_switch_port(p);
415         rcu_assign_pointer(dps[dp->dp_idx], NULL);
416
417         /* Kill off local_port dev references from buffered packets that have
418          * associated dst entries. */
419         synchronize_rcu();
420         fwd_discard_all();
421
422         /* Destroy dp->netdev.  (Must follow deleting switch ports since
423          * dp->local_port has a reference to it.) */
424         dp_dev_destroy(dp);
425
426         /* Wait until no longer in use, then destroy it. */
427         synchronize_rcu();
428         chain_destroy(dp->chain);
429         kfree(dp);
430         module_put(THIS_MODULE);
431 }
432
433 static int dp_maint_func(void *data)
434 {
435         struct datapath *dp = (struct datapath *) data;
436
437         while (!kthread_should_stop()) {
438 #ifdef SUPPORT_SNAT
439                 struct net_bridge_port *p;
440
441                 /* Expire old SNAT entries */
442                 rcu_read_lock();
443                 list_for_each_entry_rcu (p, &dp->port_list, node) 
444                         snat_maint(p);
445                 rcu_read_unlock();
446 #endif
447
448                 /* Timeout old entries */
449                 chain_timeout(dp->chain);
450                 msleep_interruptible(MAINT_SLEEP_MSECS);
451         }
452                 
453         return 0;
454 }
455
456 static void
457 do_port_input(struct net_bridge_port *p, struct sk_buff *skb) 
458 {
459         /* Make our own copy of the packet.  Otherwise we will mangle the
460          * packet for anyone who came before us (e.g. tcpdump via AF_PACKET).
461          * (No one comes after us, since we tell handle_bridge() that we took
462          * the packet.) */
463         skb = skb_share_check(skb, GFP_ATOMIC);
464         if (!skb)
465                 return;
466
467 #ifdef SUPPORT_SNAT
468         /* Check if this packet needs early SNAT processing. */
469         if (snat_pre_route(skb)) {
470                 return;
471         }
472 #endif
473
474         /* Push the Ethernet header back on. */
475         skb_push(skb, ETH_HLEN);
476         skb_reset_mac_header(skb);
477         fwd_port_input(p->dp->chain, skb, p);
478 }
479
480 /*
481  * Used as br_handle_frame_hook.  (Cannot run bridge at the same time, even on
482  * different set of devices!)
483  */
484 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
485 /* Called with rcu_read_lock. */
486 static struct sk_buff *dp_frame_hook(struct net_bridge_port *p,
487                                          struct sk_buff *skb)
488 {
489         do_port_input(p, skb);
490         return NULL;
491 }
492 #elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
493 static int dp_frame_hook(struct net_bridge_port *p, struct sk_buff **pskb)
494 {
495         do_port_input(p, *pskb);
496         return 1;
497 }
498 #else
499 /* NB: This has only been tested on 2.4.35 */
500 static void dp_frame_hook(struct sk_buff *skb)
501 {
502         struct net_bridge_port *p = skb->dev->br_port;
503         if (p) {
504                 rcu_read_lock();
505                 do_port_input(p, skb);
506                 rcu_read_unlock();
507         } else
508                 kfree_skb(skb);
509 }
510 #endif
511
512 /* Forwarding output path.
513  * Based on net/bridge/br_forward.c. */
514
515 static inline unsigned packet_length(const struct sk_buff *skb)
516 {
517         int length = skb->len - ETH_HLEN;
518         if (skb->protocol == htons(ETH_P_8021Q))
519                 length -= VLAN_HLEN;
520         return length;
521 }
522
523 /* Send packets out all the ports except the originating one.  If the
524  * "flood" argument is set, only send along the minimum spanning tree.
525  */
526 static int
527 output_all(struct datapath *dp, struct sk_buff *skb, int flood)
528 {
529         u32 disable = flood ? OFPPC_NO_FLOOD : 0;
530         struct net_bridge_port *p;
531         int prev_port = -1;
532
533         list_for_each_entry_rcu (p, &dp->port_list, node) {
534                 if (skb->dev == p->dev || p->config & disable)
535                         continue;
536                 if (prev_port != -1) {
537                         struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
538                         if (!clone) {
539                                 kfree_skb(skb);
540                                 return -ENOMEM;
541                         }
542                         dp_output_port(dp, clone, prev_port, 0); 
543                 }
544                 prev_port = p->port_no;
545         }
546         if (prev_port != -1)
547                 dp_output_port(dp, skb, prev_port, 0);
548         else
549                 kfree_skb(skb);
550
551         return 0;
552 }
553
554 /* Marks 'skb' as having originated from 'in_port' in 'dp'.
555    FIXME: how are devices reference counted? */
556 void dp_set_origin(struct datapath *dp, uint16_t in_port,
557                            struct sk_buff *skb)
558 {
559         struct net_bridge_port *p;
560         p = (in_port < DP_MAX_PORTS ? dp->ports[in_port]
561              : in_port == OFPP_LOCAL ? dp->local_port
562              : NULL);
563         if (p) 
564                 skb->dev = p->dev;
565          else 
566                 skb->dev = NULL;
567 }
568
569 int 
570 dp_xmit_skb(struct sk_buff *skb)
571 {
572         int len = skb->len;
573         if (packet_length(skb) > skb->dev->mtu) {
574                 printk("dropped over-mtu packet: %d > %d\n",
575                            packet_length(skb), skb->dev->mtu);
576                 kfree_skb(skb);
577                 return -E2BIG;
578         }
579
580         dev_queue_xmit(skb);
581
582         return len;
583 }
584
585 /* Takes ownership of 'skb' and transmits it to 'out_port' on 'dp'.
586  */
587 int dp_output_port(struct datapath *dp, struct sk_buff *skb, int out_port,
588                    int ignore_no_fwd)
589 {
590         BUG_ON(!skb);
591         switch (out_port){
592         case OFPP_IN_PORT:
593                 /* Send it out the port it came in on, which is already set in
594                  * the skb. */
595                 if (!skb->dev) {
596                         if (net_ratelimit())
597                                 printk("skb device not set forwarding to in_port\n");
598                         kfree_skb(skb);
599                         return -ESRCH;
600                 }
601                 return dp_xmit_skb(skb);
602                 
603         case OFPP_TABLE: {
604                 int retval = run_flow_through_tables(dp->chain, skb,
605                                                      skb->dev->br_port);
606                 if (retval)
607                         kfree_skb(skb);
608                 return retval;
609         }
610
611         case OFPP_FLOOD:
612                 return output_all(dp, skb, 1);
613
614         case OFPP_ALL:
615                 return output_all(dp, skb, 0);
616
617         case OFPP_CONTROLLER:
618                 return dp_output_control(dp, skb, fwd_save_skb(skb), 0,
619                                                   OFPR_ACTION);
620
621         case OFPP_LOCAL: {
622                 struct net_device *dev = dp->netdev;
623 #ifdef SUPPORT_SNAT
624                 snat_local_in(skb);
625 #endif
626                 return dev ? dp_dev_recv(dev, skb) : -ESRCH;
627         }
628
629         case 0 ... DP_MAX_PORTS - 1: {
630                 struct net_bridge_port *p = dp->ports[out_port];
631                 if (p == NULL)
632                         goto bad_port;
633                 if (p->dev == skb->dev) {
634                         /* To send to the input port, must use OFPP_IN_PORT */
635                         kfree_skb(skb);
636                         if (net_ratelimit())
637                                 printk("can't directly forward to input port\n");
638                         return -EINVAL;
639                 }
640                 if (p->config & OFPPC_NO_FWD && !ignore_no_fwd) {
641                         kfree_skb(skb);
642                         return 0;
643                 }
644                 skb->dev = p->dev; 
645                 return dp_xmit_skb(skb);
646         }
647
648         default:
649                 goto bad_port;
650         }
651
652 bad_port:
653         kfree_skb(skb);
654         if (net_ratelimit())
655                 printk("can't forward to bad port %d\n", out_port);
656         return -ENOENT;
657 }
658
659 /* Takes ownership of 'skb' and transmits it to 'dp''s control path.  If
660  * 'buffer_id' != -1, then only the first 64 bytes of 'skb' are sent;
661  * otherwise, all of 'skb' is sent.  'reason' indicates why 'skb' is being
662  * sent. 'max_len' sets the maximum number of bytes that the caller
663  * wants to be sent; a value of 0 indicates the entire packet should be
664  * sent. */
665 int
666 dp_output_control(struct datapath *dp, struct sk_buff *skb,
667                            uint32_t buffer_id, size_t max_len, int reason)
668 {
669         /* FIXME?  Can we avoid creating a new skbuff in the case where we
670          * forward the whole packet? */
671         struct sk_buff *f_skb;
672         struct ofp_packet_in *opi;
673         size_t fwd_len, opi_len;
674         int err;
675
676         fwd_len = skb->len;
677         if ((buffer_id != (uint32_t) -1) && max_len)
678                 fwd_len = min(fwd_len, max_len);
679
680         opi_len = offsetof(struct ofp_packet_in, data) + fwd_len;
681         opi = alloc_openflow_skb(dp, opi_len, OFPT_PACKET_IN, NULL, &f_skb);
682         if (!opi) {
683                 err = -ENOMEM;
684                 goto out;
685         }
686         opi->buffer_id      = htonl(buffer_id);
687         opi->total_len      = htons(skb->len);
688         opi->in_port        = htons(skb->dev && skb->dev->br_port
689                                     ? skb->dev->br_port->port_no
690                                     : OFPP_LOCAL);
691         opi->reason         = reason;
692         opi->pad            = 0;
693         skb_copy_bits(skb, 0, opi->data, fwd_len);
694         err = send_openflow_skb(f_skb, NULL);
695
696 out:
697         kfree_skb(skb);
698         return err;
699 }
700
701 static void fill_port_desc(struct net_bridge_port *p, struct ofp_phy_port *desc)
702 {
703         unsigned long flags;
704         desc->port_no = htons(p->port_no);
705         strncpy(desc->name, p->dev->name, OFP_MAX_PORT_NAME_LEN);
706         desc->name[OFP_MAX_PORT_NAME_LEN-1] = '\0';
707         memcpy(desc->hw_addr, p->dev->dev_addr, ETH_ALEN);
708         desc->curr = 0;
709         desc->supported = 0;
710         desc->advertised = 0;
711         desc->peer = 0;
712
713         spin_lock_irqsave(&p->lock, flags);
714         desc->config = htonl(p->config);
715         desc->state = htonl(p->state);
716         spin_unlock_irqrestore(&p->lock, flags);
717
718 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,24)
719         if (p->dev->ethtool_ops && p->dev->ethtool_ops->get_settings) {
720                 struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET };
721
722                 if (!p->dev->ethtool_ops->get_settings(p->dev, &ecmd)) {
723                         /* Set the supported features */
724                         if (ecmd.supported & SUPPORTED_10baseT_Half) 
725                                 desc->supported |= OFPPF_10MB_HD;
726                         if (ecmd.supported & SUPPORTED_10baseT_Full)
727                                 desc->supported |= OFPPF_10MB_FD;
728                         if (ecmd.supported & SUPPORTED_100baseT_Half) 
729                                 desc->supported |= OFPPF_100MB_HD;
730                         if (ecmd.supported & SUPPORTED_100baseT_Full)
731                                 desc->supported |= OFPPF_100MB_FD;
732                         if (ecmd.supported & SUPPORTED_1000baseT_Half)
733                                 desc->supported |= OFPPF_1GB_HD;
734                         if (ecmd.supported & SUPPORTED_1000baseT_Full)
735                                 desc->supported |= OFPPF_1GB_FD;
736                         if (ecmd.supported & SUPPORTED_10000baseT_Full)
737                                 desc->supported |= OFPPF_10GB_FD;
738                         if (ecmd.supported & SUPPORTED_TP)
739                                 desc->supported |= OFPPF_COPPER;
740                         if (ecmd.supported & SUPPORTED_FIBRE)
741                                 desc->supported |= OFPPF_FIBER;
742                         if (ecmd.supported & SUPPORTED_Autoneg)
743                                 desc->supported |= OFPPF_AUTONEG;
744 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14)
745                         if (ecmd.supported & SUPPORTED_Pause)
746                                 desc->supported |= OFPPF_PAUSE;
747                         if (ecmd.supported & SUPPORTED_Asym_Pause)
748                                 desc->supported |= OFPPF_PAUSE_ASYM;
749 #endif /* kernel >= 2.6.14 */
750
751                         /* Set the advertised features */
752                         if (ecmd.advertising & ADVERTISED_10baseT_Half) 
753                                 desc->advertised |= OFPPF_10MB_HD;
754                         if (ecmd.advertising & ADVERTISED_10baseT_Full)
755                                 desc->advertised |= OFPPF_10MB_FD;
756                         if (ecmd.advertising & ADVERTISED_100baseT_Half) 
757                                 desc->advertised |= OFPPF_100MB_HD;
758                         if (ecmd.advertising & ADVERTISED_100baseT_Full)
759                                 desc->advertised |= OFPPF_100MB_FD;
760                         if (ecmd.advertising & ADVERTISED_1000baseT_Half)
761                                 desc->advertised |= OFPPF_1GB_HD;
762                         if (ecmd.advertising & ADVERTISED_1000baseT_Full)
763                                 desc->advertised |= OFPPF_1GB_FD;
764                         if (ecmd.advertising & ADVERTISED_10000baseT_Full)
765                                 desc->advertised |= OFPPF_10GB_FD;
766                         if (ecmd.advertising & ADVERTISED_TP)
767                                 desc->advertised |= OFPPF_COPPER;
768                         if (ecmd.advertising & ADVERTISED_FIBRE)
769                                 desc->advertised |= OFPPF_FIBER;
770                         if (ecmd.advertising & ADVERTISED_Autoneg)
771                                 desc->advertised |= OFPPF_AUTONEG;
772 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14)
773                         if (ecmd.advertising & ADVERTISED_Pause)
774                                 desc->advertised |= OFPPF_PAUSE;
775                         if (ecmd.advertising & ADVERTISED_Asym_Pause)
776                                 desc->advertised |= OFPPF_PAUSE_ASYM;
777 #endif /* kernel >= 2.6.14 */
778
779                         /* Set the current features */
780                         if (ecmd.speed == SPEED_10)
781                                 desc->curr = (ecmd.duplex) ? OFPPF_10MB_FD : OFPPF_10MB_HD;
782                         else if (ecmd.speed == SPEED_100)
783                                 desc->curr = (ecmd.duplex) ? OFPPF_100MB_FD : OFPPF_100MB_HD;
784                         else if (ecmd.speed == SPEED_1000)
785                                 desc->curr = (ecmd.duplex) ? OFPPF_1GB_FD : OFPPF_1GB_HD;
786                         else if (ecmd.speed == SPEED_10000)
787                                 desc->curr = OFPPF_10GB_FD;
788
789                         if (ecmd.port == PORT_TP) 
790                                 desc->curr |= OFPPF_COPPER;
791                         else if (ecmd.port == PORT_FIBRE) 
792                                 desc->curr |= OFPPF_FIBER;
793
794                         if (ecmd.autoneg)
795                                 desc->curr |= OFPPF_AUTONEG;
796                 }
797         }
798 #endif
799         desc->curr = htonl(desc->curr);
800         desc->supported = htonl(desc->supported);
801         desc->advertised = htonl(desc->advertised);
802         desc->peer = htonl(desc->peer);
803 }
804
805 static int 
806 fill_features_reply(struct datapath *dp, struct ofp_switch_features *ofr)
807 {
808         struct net_bridge_port *p;
809         uint64_t dpid = get_datapath_id(dp->netdev);
810         int port_count = 0;
811
812         ofr->datapath_id  = cpu_to_be64(dpid);
813
814         ofr->n_buffers    = htonl(N_PKT_BUFFERS);
815         ofr->n_tables     = dp->chain->n_tables;
816         ofr->capabilities = htonl(OFP_SUPPORTED_CAPABILITIES);
817         ofr->actions      = htonl(OFP_SUPPORTED_ACTIONS);
818         memset(ofr->pad, 0, sizeof ofr->pad);
819
820         list_for_each_entry_rcu (p, &dp->port_list, node) {
821                 fill_port_desc(p, &ofr->ports[port_count]);
822                 port_count++;
823         }
824
825         return port_count;
826 }
827
828 int
829 dp_send_features_reply(struct datapath *dp, const struct sender *sender)
830 {
831         struct sk_buff *skb;
832         struct ofp_switch_features *ofr;
833         size_t ofr_len, port_max_len;
834         int port_count;
835
836         /* Overallocate. */
837         port_max_len = sizeof(struct ofp_phy_port) * DP_MAX_PORTS;
838         ofr = alloc_openflow_skb(dp, sizeof(*ofr) + port_max_len,
839                                  OFPT_FEATURES_REPLY, sender, &skb);
840         if (!ofr)
841                 return -ENOMEM;
842
843         /* Fill. */
844         port_count = fill_features_reply(dp, ofr);
845
846         /* Shrink to fit. */
847         ofr_len = sizeof(*ofr) + (sizeof(struct ofp_phy_port) * port_count);
848         resize_openflow_skb(skb, &ofr->header, ofr_len);
849         return send_openflow_skb(skb, sender);
850 }
851
852 int
853 dp_send_config_reply(struct datapath *dp, const struct sender *sender)
854 {
855         struct sk_buff *skb;
856         struct ofp_switch_config *osc;
857
858         osc = alloc_openflow_skb(dp, sizeof *osc, OFPT_GET_CONFIG_REPLY, sender,
859                                  &skb);
860         if (!osc)
861                 return -ENOMEM;
862
863         osc->flags = htons(dp->flags);
864         osc->miss_send_len = htons(dp->miss_send_len);
865
866         return send_openflow_skb(skb, sender);
867 }
868
869 int
870 dp_send_hello(struct datapath *dp, const struct sender *sender,
871               const struct ofp_header *request)
872 {
873         if (request->version < OFP_VERSION) {
874                 char err[64];
875                 sprintf(err, "Only version 0x%02x supported", OFP_VERSION);
876                 dp_send_error_msg(dp, sender, OFPET_HELLO_FAILED,
877                                   OFPHFC_INCOMPATIBLE, err, strlen(err));
878                 return -EINVAL;
879         } else {
880                 struct sk_buff *skb;
881                 struct ofp_header *reply;
882
883                 reply = alloc_openflow_skb(dp, sizeof *reply,
884                                            OFPT_HELLO, sender, &skb);
885                 if (!reply)
886                         return -ENOMEM;
887
888                 return send_openflow_skb(skb, sender);
889         }
890 }
891
892 /* Callback function for a workqueue to disable an interface */
893 static void
894 down_port_cb(struct work_struct *work)
895 {
896         struct net_bridge_port *p = container_of(work, struct net_bridge_port, 
897                         port_task);
898
899         rtnl_lock();
900         if (dev_change_flags(p->dev, p->dev->flags & ~IFF_UP) < 0)
901                 if (net_ratelimit())
902                         printk("problem bringing up port %s\n", p->dev->name);
903         rtnl_unlock();
904         p->config |= OFPPC_PORT_DOWN;
905 }
906
907 /* Callback function for a workqueue to enable an interface */
908 static void
909 up_port_cb(struct work_struct *work)
910 {
911         struct net_bridge_port *p = container_of(work, struct net_bridge_port, 
912                         port_task);
913
914         rtnl_lock();
915         if (dev_change_flags(p->dev, p->dev->flags | IFF_UP) < 0)
916                 if (net_ratelimit())
917                         printk("problem bringing down port %s\n", p->dev->name);
918         rtnl_unlock();
919         p->config &= ~OFPPC_PORT_DOWN;
920 }
921
922 int
923 dp_update_port_flags(struct datapath *dp, const struct ofp_port_mod *opm)
924 {
925         unsigned long int flags;
926         int port_no = ntohs(opm->port_no);
927         struct net_bridge_port *p;
928         p = (port_no < DP_MAX_PORTS ? dp->ports[port_no]
929              : port_no == OFPP_LOCAL ? dp->local_port
930              : NULL);
931
932         /* Make sure the port id hasn't changed since this was sent */
933         if (!p || memcmp(opm->hw_addr, p->dev->dev_addr, ETH_ALEN))
934                 return -1;
935
936         spin_lock_irqsave(&p->lock, flags);
937         if (opm->mask) {
938                 uint32_t config_mask = ntohl(opm->mask);
939                 p->config &= ~config_mask;
940                 p->config |= ntohl(opm->config) & config_mask;
941         }
942
943         /* Modifying the status of an interface requires taking a lock
944          * that cannot be done from here.  For this reason, we use a shared 
945          * workqueue, which will cause it to be executed from a safer 
946          * context. */
947         if (opm->mask & htonl(OFPPC_PORT_DOWN)) {
948                 if ((opm->config & htonl(OFPPC_PORT_DOWN))
949                     && (p->config & OFPPC_PORT_DOWN) == 0) {
950                         PREPARE_WORK(&p->port_task, down_port_cb);
951                         schedule_work(&p->port_task);
952                 } else if ((opm->config & htonl(OFPPC_PORT_DOWN)) == 0
953                            && (p->config & OFPPC_PORT_DOWN)) {
954                         PREPARE_WORK(&p->port_task, up_port_cb);
955                         schedule_work(&p->port_task);
956                 }
957         }
958         spin_unlock_irqrestore(&p->lock, flags);
959
960         return 0;
961 }
962
963 /* Initialize the port status field of the bridge port. */
964 static void
965 init_port_status(struct net_bridge_port *p)
966 {
967         unsigned long int flags;
968
969         spin_lock_irqsave(&p->lock, flags);
970
971         if (p->dev->flags & IFF_UP) 
972                 p->config &= ~OFPPC_PORT_DOWN;
973         else
974                 p->config |= OFPPC_PORT_DOWN;
975
976         if (netif_carrier_ok(p->dev))
977                 p->state &= ~OFPPS_LINK_DOWN;
978         else
979                 p->state |= OFPPS_LINK_DOWN;
980
981         spin_unlock_irqrestore(&p->lock, flags);
982 }
983
984 int
985 dp_send_port_status(struct net_bridge_port *p, uint8_t status)
986 {
987         struct sk_buff *skb;
988         struct ofp_port_status *ops;
989
990         ops = alloc_openflow_skb(p->dp, sizeof *ops, OFPT_PORT_STATUS, NULL,
991                                  &skb);
992         if (!ops)
993                 return -ENOMEM;
994         ops->reason = status;
995         memset(ops->pad, 0, sizeof ops->pad);
996         fill_port_desc(p, &ops->desc);
997
998         return send_openflow_skb(skb, NULL);
999 }
1000
1001 /* Convert jiffies_64 to milliseconds. */
1002 static u64 inline jiffies_64_to_msecs(const u64 j)
1003 {
1004 #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
1005                 return (MSEC_PER_SEC / HZ) * j;
1006 #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
1007                 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
1008 #else
1009                 return (j * MSEC_PER_SEC) / HZ;
1010 #endif
1011 }
1012
1013 int 
1014 dp_send_flow_end(struct datapath *dp, struct sw_flow *flow,
1015                      enum nx_flow_end_reason reason)
1016 {
1017         struct sk_buff *skb;
1018         struct nx_flow_end *nfe;
1019
1020         if (!dp->send_flow_end)
1021                 return 0;
1022
1023         nfe = alloc_openflow_skb(dp, sizeof *nfe, OFPT_VENDOR, 0, &skb);
1024         if (!nfe)
1025                 return -ENOMEM;
1026
1027         nfe->header.vendor = htonl(NX_VENDOR_ID);
1028         nfe->header.subtype = htonl(NXT_FLOW_END);
1029
1030         flow_fill_match(&nfe->match, &flow->key);
1031
1032         nfe->priority = htons(flow->priority);
1033         nfe->reason = reason;
1034
1035         nfe->tcp_flags = flow->tcp_flags;
1036         nfe->ip_tos = flow->ip_tos;
1037
1038         memset(nfe->pad, 0, sizeof nfe->pad);
1039
1040         nfe->init_time = cpu_to_be64(jiffies_64_to_msecs(flow->created));
1041         nfe->used_time = cpu_to_be64(jiffies_64_to_msecs(flow->used));
1042         nfe->end_time = cpu_to_be64(jiffies_64_to_msecs(get_jiffies_64()));
1043
1044         nfe->packet_count = cpu_to_be64(flow->packet_count);
1045         nfe->byte_count   = cpu_to_be64(flow->byte_count);
1046
1047         return send_openflow_skb(skb, NULL);
1048 }
1049 EXPORT_SYMBOL(dp_send_flow_end);
1050
1051 int
1052 dp_send_error_msg(struct datapath *dp, const struct sender *sender, 
1053                 uint16_t type, uint16_t code, const void *data, size_t len)
1054 {
1055         struct sk_buff *skb;
1056         struct ofp_error_msg *oem;
1057
1058
1059         oem = alloc_openflow_skb(dp, sizeof(*oem)+len, OFPT_ERROR, 
1060                         sender, &skb);
1061         if (!oem)
1062                 return -ENOMEM;
1063
1064         oem->type = htons(type);
1065         oem->code = htons(code);
1066         memcpy(oem->data, data, len);
1067
1068         return send_openflow_skb(skb, sender);
1069 }
1070
1071 int
1072 dp_send_echo_reply(struct datapath *dp, const struct sender *sender,
1073                    const struct ofp_header *rq)
1074 {
1075         struct sk_buff *skb;
1076         struct ofp_header *reply;
1077
1078         reply = alloc_openflow_skb(dp, ntohs(rq->length), OFPT_ECHO_REPLY,
1079                                    sender, &skb);
1080         if (!reply)
1081                 return -ENOMEM;
1082
1083         memcpy(reply + 1, rq + 1, ntohs(rq->length) - sizeof *rq);
1084         return send_openflow_skb(skb, sender);
1085 }
1086
1087 /* Generic Netlink interface.
1088  *
1089  * See netlink(7) for an introduction to netlink.  See
1090  * http://linux-net.osdl.org/index.php/Netlink for more information and
1091  * pointers on how to work with netlink and Generic Netlink in the kernel and
1092  * in userspace. */
1093
/* Generic Netlink family through which the datapath is controlled.  It asks
 * for a generated family id (GENL_ID_GENERATE), has no family-specific
 * header, and accepts attributes up to DP_GENL_A_MAX. */
static struct genl_family dp_genl_family = {
        .id = GENL_ID_GENERATE,
        .hdrsize = 0,
        .name = DP_GENL_FAMILY_NAME,
        .version = 1,
        .maxattr = DP_GENL_A_MAX,
};
1101
/* Attribute policy: what each attribute may contain.  The datapath index
 * and multicast group are 32-bit integers; the port name is a string.
 * This policy is shared by the add/delete/query datapath and add/delete
 * port operations below. */
static struct nla_policy dp_genl_policy[DP_GENL_A_MAX + 1] = {
        [DP_GENL_A_DP_IDX] = { .type = NLA_U32 },
        [DP_GENL_A_MC_GROUP] = { .type = NLA_U32 },
        [DP_GENL_A_PORTNAME] = { .type = NLA_STRING }
};
1108
1109 static int dp_genl_add(struct sk_buff *skb, struct genl_info *info)
1110 {
1111         if (!info->attrs[DP_GENL_A_DP_IDX])
1112                 return -EINVAL;
1113
1114         return new_dp(nla_get_u32(info->attrs[DP_GENL_A_DP_IDX]));
1115 }
1116
/* Generic Netlink operation DP_GENL_C_ADD_DP, handled by dp_genl_add(). */
static struct genl_ops dp_genl_ops_add_dp = {
        .cmd = DP_GENL_C_ADD_DP,
        .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
        .policy = dp_genl_policy,
        .doit = dp_genl_add,
        .dumpit = NULL,
};
1124
1125 struct datapath *dp_get(int dp_idx)
1126 {
1127         if (dp_idx < 0 || dp_idx > DP_MAX)
1128                 return NULL;
1129         return rcu_dereference(dps[dp_idx]);
1130 }
1131
1132 static int dp_genl_del(struct sk_buff *skb, struct genl_info *info)
1133 {
1134         struct datapath *dp;
1135         int err;
1136
1137         if (!info->attrs[DP_GENL_A_DP_IDX])
1138                 return -EINVAL;
1139
1140         dp = dp_get(nla_get_u32(info->attrs[DP_GENL_A_DP_IDX]));
1141         if (!dp)
1142                 err = -ENOENT;
1143         else {
1144                 del_dp(dp);
1145                 err = 0;
1146         }
1147         return err;
1148 }
1149
/* Generic Netlink operation DP_GENL_C_DEL_DP, handled by dp_genl_del(). */
static struct genl_ops dp_genl_ops_del_dp = {
        .cmd = DP_GENL_C_DEL_DP,
        .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
        .policy = dp_genl_policy,
        .doit = dp_genl_del,
        .dumpit = NULL,
};
1157
1158 /* Queries a datapath for related information.  Currently the only relevant
1159  * information is the datapath's multicast group ID.  Really we want one
1160  * multicast group per datapath, but because of locking issues[*] we can't
1161  * easily get one.  Thus, every datapath will currently return the same
1162  * global multicast group ID, but in the future it would be nice to fix that.
1163  *
1164  * [*] dp_genl_add, to add a new datapath, is called under the genl_lock
1165  *       mutex, and genl_register_mc_group, called to acquire a new multicast
1166  *       group ID, also acquires genl_lock, thus deadlock.
1167  */
1168 static int dp_genl_query(struct sk_buff *skb, struct genl_info *info)
1169 {
1170         struct datapath *dp;
1171         struct sk_buff *ans_skb = NULL;
1172         int dp_idx;
1173         int err = -ENOMEM;
1174
1175         if (!info->attrs[DP_GENL_A_DP_IDX])
1176                 return -EINVAL;
1177
1178         rcu_read_lock();
1179         dp_idx = nla_get_u32((info->attrs[DP_GENL_A_DP_IDX]));
1180         dp = dp_get(dp_idx);
1181         if (!dp)
1182                 err = -ENOENT;
1183         else {
1184                 void *data;
1185                 ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
1186                 if (!ans_skb) {
1187                         err = -ENOMEM;
1188                         goto err;
1189                 }
1190                 data = genlmsg_put_reply(ans_skb, info, &dp_genl_family,
1191                                          0, DP_GENL_C_QUERY_DP);
1192                 if (data == NULL) {
1193                         err = -ENOMEM;
1194                         goto err;
1195                 }
1196                 NLA_PUT_U32(ans_skb, DP_GENL_A_DP_IDX, dp_idx);
1197                 NLA_PUT_U32(ans_skb, DP_GENL_A_MC_GROUP, mc_group.id);
1198
1199                 genlmsg_end(ans_skb, data);
1200                 err = genlmsg_reply(ans_skb, info);
1201                 ans_skb = NULL;
1202         }
1203 err:
1204 nla_put_failure:
1205         kfree_skb(ans_skb);
1206         rcu_read_unlock();
1207         return err;
1208 }
1209
/* Generic Netlink operation DP_GENL_C_QUERY_DP, handled by
 * dp_genl_query(). */
static struct genl_ops dp_genl_ops_query_dp = {
        .cmd = DP_GENL_C_QUERY_DP,
        .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
        .policy = dp_genl_policy,
        .doit = dp_genl_query,
        .dumpit = NULL,
};
1217
1218 static int dp_genl_add_del_port(struct sk_buff *skb, struct genl_info *info)
1219 {
1220         struct datapath *dp;
1221         struct net_device *port;
1222         int err;
1223
1224         if (!info->attrs[DP_GENL_A_DP_IDX] || !info->attrs[DP_GENL_A_PORTNAME])
1225                 return -EINVAL;
1226
1227         /* Get datapath. */
1228         dp = dp_get(nla_get_u32(info->attrs[DP_GENL_A_DP_IDX]));
1229         if (!dp) {
1230                 err = -ENOENT;
1231                 goto out;
1232         }
1233
1234         /* Get interface to add/remove. */
1235         port = dev_get_by_name(&init_net, 
1236                         nla_data(info->attrs[DP_GENL_A_PORTNAME]));
1237         if (!port) {
1238                 err = -ENOENT;
1239                 goto out;
1240         }
1241
1242         /* Execute operation. */
1243         if (info->genlhdr->cmd == DP_GENL_C_ADD_PORT)
1244                 err = add_switch_port(dp, port);
1245         else {
1246                 if (port->br_port == NULL || port->br_port->dp != dp) {
1247                         err = -ENOENT;
1248                         goto out_put;
1249                 }
1250                 err = dp_del_switch_port(port->br_port);
1251         }
1252
1253 out_put:
1254         dev_put(port);
1255 out:
1256         return err;
1257 }
1258
/* Generic Netlink operation DP_GENL_C_ADD_PORT, handled by
 * dp_genl_add_del_port(). */
static struct genl_ops dp_genl_ops_add_port = {
        .cmd = DP_GENL_C_ADD_PORT,
        .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
        .policy = dp_genl_policy,
        .doit = dp_genl_add_del_port,
        .dumpit = NULL,
};
1266
/* Generic Netlink operation DP_GENL_C_DEL_PORT, handled by
 * dp_genl_add_del_port(). */
static struct genl_ops dp_genl_ops_del_port = {
        .cmd = DP_GENL_C_DEL_PORT,
        .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
        .policy = dp_genl_policy,
        .doit = dp_genl_add_del_port,
        .dumpit = NULL,
};
1274
1275 static int dp_genl_openflow(struct sk_buff *skb, struct genl_info *info)
1276 {
1277         struct nlattr *va = info->attrs[DP_GENL_A_OPENFLOW];
1278         struct datapath *dp;
1279         struct ofp_header *oh;
1280         struct sender sender;
1281         int err;
1282
1283         if (!info->attrs[DP_GENL_A_DP_IDX] || !va)
1284                 return -EINVAL;
1285
1286         dp = dp_get(nla_get_u32(info->attrs[DP_GENL_A_DP_IDX]));
1287         if (!dp)
1288                 return -ENOENT;
1289
1290         if (nla_len(va) < sizeof(struct ofp_header))
1291                 return -EINVAL;
1292         oh = nla_data(va);
1293
1294         sender.xid = oh->xid;
1295         sender.pid = info->snd_pid;
1296         sender.seq = info->snd_seq;
1297
1298         mutex_lock(&dp_mutex);
1299         err = fwd_control_input(dp->chain, &sender,
1300                                 nla_data(va), nla_len(va));
1301         mutex_unlock(&dp_mutex);
1302         return err;
1303 }
1304
/* Attribute policy for OpenFlow messages: only the datapath index is given
 * a type here (a 32-bit integer). */
static struct nla_policy dp_genl_openflow_policy[DP_GENL_A_MAX + 1] = {
        [DP_GENL_A_DP_IDX] = { .type = NLA_U32 },
};
1308
1309 static int desc_stats_dump(struct datapath *dp, void *state,
1310                             void *body, int *body_len)
1311 {
1312         struct ofp_desc_stats *ods = body;
1313         int n_bytes = sizeof *ods;
1314
1315         if (n_bytes > *body_len) {
1316                 return -ENOBUFS;
1317         }
1318         *body_len = n_bytes;
1319
1320         strncpy(ods->mfr_desc, mfr_desc, sizeof ods->mfr_desc);
1321         strncpy(ods->hw_desc, hw_desc, sizeof ods->hw_desc);
1322         strncpy(ods->sw_desc, sw_desc, sizeof ods->sw_desc);
1323         strncpy(ods->serial_num, serial_num, sizeof ods->serial_num);
1324
1325         return 0;
1326 }
1327
/* State carried across successive flow_stats_dump() calls for one flow
 * statistics request. */
struct flow_stats_state {
        int table_idx;                  /* Table currently being iterated. */
        struct sw_table_position position; /* Position within that table. */
        const struct ofp_flow_stats_request *rq; /* The original request. */

        /* Output buffer for the dump call in progress; reset at the start
         * of every flow_stats_dump() call. */
        void *body;
        int bytes_used, bytes_allocated;
};
1336
1337 static int flow_stats_init(struct datapath *dp, const void *body, int body_len,
1338                            void **state)
1339 {
1340         const struct ofp_flow_stats_request *fsr = body;
1341         struct flow_stats_state *s = kmalloc(sizeof *s, GFP_ATOMIC);
1342         if (!s)
1343                 return -ENOMEM;
1344         s->table_idx = fsr->table_id == 0xff ? 0 : fsr->table_id;
1345         memset(&s->position, 0, sizeof s->position);
1346         s->rq = fsr;
1347         *state = s;
1348         return 0;
1349 }
1350
1351 static int flow_stats_dump_callback(struct sw_flow *flow, void *private)
1352 {
1353         struct sw_flow_actions *sf_acts = rcu_dereference(flow->sf_acts);
1354         struct flow_stats_state *s = private;
1355         struct ofp_flow_stats *ofs;
1356         int length;
1357         uint64_t duration;
1358
1359         length = sizeof *ofs + sf_acts->actions_len;
1360         if (length + s->bytes_used > s->bytes_allocated)
1361                 return 1;
1362
1363         ofs = s->body + s->bytes_used;
1364         ofs->length          = htons(length);
1365         ofs->table_id        = s->table_idx;
1366         ofs->pad             = 0;
1367         ofs->match.wildcards = htonl(flow->key.wildcards);
1368         ofs->match.in_port   = flow->key.in_port;
1369         memcpy(ofs->match.dl_src, flow->key.dl_src, ETH_ALEN);
1370         memcpy(ofs->match.dl_dst, flow->key.dl_dst, ETH_ALEN);
1371         ofs->match.dl_vlan   = flow->key.dl_vlan;
1372         ofs->match.dl_type   = flow->key.dl_type;
1373         ofs->match.nw_src    = flow->key.nw_src;
1374         ofs->match.nw_dst    = flow->key.nw_dst;
1375         ofs->match.nw_proto  = flow->key.nw_proto;
1376         ofs->match.pad       = 0;
1377         ofs->match.tp_src    = flow->key.tp_src;
1378         ofs->match.tp_dst    = flow->key.tp_dst;
1379
1380         /* The kernel doesn't support 64-bit division, so use the 'do_div' 
1381          * macro instead.  The first argument is replaced with the quotient,
1382          * while the remainder is the return value. */
1383         duration = get_jiffies_64() - flow->created;
1384         do_div(duration, HZ);
1385         ofs->duration        = htonl(duration);
1386
1387         ofs->priority        = htons(flow->priority);
1388         ofs->idle_timeout    = htons(flow->idle_timeout);
1389         ofs->hard_timeout    = htons(flow->hard_timeout);
1390         memset(ofs->pad2, 0, sizeof ofs->pad2);
1391         ofs->packet_count    = cpu_to_be64(flow->packet_count);
1392         ofs->byte_count      = cpu_to_be64(flow->byte_count);
1393         memcpy(ofs->actions, sf_acts->actions, sf_acts->actions_len);
1394
1395         s->bytes_used += length;
1396         return 0;
1397 }
1398
1399 static int flow_stats_dump(struct datapath *dp, void *state,
1400                            void *body, int *body_len)
1401 {
1402         struct flow_stats_state *s = state;
1403         struct sw_flow_key match_key;
1404         int error = 0;
1405
1406         s->bytes_used = 0;
1407         s->bytes_allocated = *body_len;
1408         s->body = body;
1409
1410         flow_extract_match(&match_key, &s->rq->match);
1411         while (s->table_idx < dp->chain->n_tables
1412                && (s->rq->table_id == 0xff || s->rq->table_id == s->table_idx))
1413         {
1414                 struct sw_table *table = dp->chain->tables[s->table_idx];
1415
1416                 error = table->iterate(table, &match_key, s->rq->out_port, 
1417                                 &s->position, flow_stats_dump_callback, s);
1418                 if (error)
1419                         break;
1420
1421                 s->table_idx++;
1422                 memset(&s->position, 0, sizeof s->position);
1423         }
1424         *body_len = s->bytes_used;
1425
1426         /* If error is 0, we're done.
1427          * Otherwise, if some bytes were used, there are more flows to come.
1428          * Otherwise, we were not able to fit even a single flow in the body,
1429          * which indicates that we have a single flow with too many actions to
1430          * fit.  We won't ever make any progress at that rate, so give up. */
1431         return !error ? 0 : s->bytes_used ? 1 : -ENOMEM;
1432 }
1433
/* Done function for OFPST_FLOW: frees the state allocated by
 * flow_stats_init(). */
static void flow_stats_done(void *state)
{
        kfree(state);
}
1438
/* Init function for OFPST_AGGREGATE: no state is allocated; the request
 * body itself is handed to the dump function as '*state'.  Always returns
 * 0. */
static int aggregate_stats_init(struct datapath *dp,
                                const void *body, int body_len,
                                void **state)
{
        /* The cast drops 'const' so the pointer can be stored as state. */
        *state = (void *)body;
        return 0;
}
1446
1447 static int aggregate_stats_dump_callback(struct sw_flow *flow, void *private)
1448 {
1449         struct ofp_aggregate_stats_reply *rpy = private;
1450         rpy->packet_count += flow->packet_count;
1451         rpy->byte_count += flow->byte_count;
1452         rpy->flow_count++;
1453         return 0;
1454 }
1455
1456 static int aggregate_stats_dump(struct datapath *dp, void *state,
1457                                 void *body, int *body_len)
1458 {
1459         struct ofp_aggregate_stats_request *rq = state;
1460         struct ofp_aggregate_stats_reply *rpy;
1461         struct sw_table_position position;
1462         struct sw_flow_key match_key;
1463         int table_idx;
1464
1465         if (*body_len < sizeof *rpy)
1466                 return -ENOBUFS;
1467         rpy = body;
1468         *body_len = sizeof *rpy;
1469
1470         memset(rpy, 0, sizeof *rpy);
1471
1472         flow_extract_match(&match_key, &rq->match);
1473         table_idx = rq->table_id == 0xff ? 0 : rq->table_id;
1474         memset(&position, 0, sizeof position);
1475         while (table_idx < dp->chain->n_tables
1476                && (rq->table_id == 0xff || rq->table_id == table_idx))
1477         {
1478                 struct sw_table *table = dp->chain->tables[table_idx];
1479                 int error;
1480
1481                 error = table->iterate(table, &match_key, rq->out_port, &position,
1482                                        aggregate_stats_dump_callback, rpy);
1483                 if (error)
1484                         return error;
1485
1486                 table_idx++;
1487                 memset(&position, 0, sizeof position);
1488         }
1489
1490         rpy->packet_count = cpu_to_be64(rpy->packet_count);
1491         rpy->byte_count = cpu_to_be64(rpy->byte_count);
1492         rpy->flow_count = htonl(rpy->flow_count);
1493         return 0;
1494 }
1495
1496 static int table_stats_dump(struct datapath *dp, void *state,
1497                             void *body, int *body_len)
1498 {
1499         struct ofp_table_stats *ots;
1500         int n_bytes = dp->chain->n_tables * sizeof *ots;
1501         int i;
1502         if (n_bytes > *body_len)
1503                 return -ENOBUFS;
1504         *body_len = n_bytes;
1505         for (i = 0, ots = body; i < dp->chain->n_tables; i++, ots++) {
1506                 struct sw_table_stats stats;
1507                 dp->chain->tables[i]->stats(dp->chain->tables[i], &stats);
1508                 strncpy(ots->name, stats.name, sizeof ots->name);
1509                 ots->table_id = i;
1510                 ots->wildcards = htonl(stats.wildcards);
1511                 memset(ots->pad, 0, sizeof ots->pad);
1512                 ots->max_entries = htonl(stats.max_flows);
1513                 ots->active_count = htonl(stats.n_flows);
1514                 ots->lookup_count = cpu_to_be64(stats.n_lookup);
1515                 ots->matched_count = cpu_to_be64(stats.n_matched);
1516         }
1517         return 0;
1518 }
1519
/* State carried across successive port_stats_dump() calls. */
struct port_stats_state {
        int port;       /* Index into dp->ports at which the next call resumes. */
};
1523
1524 static int port_stats_init(struct datapath *dp, const void *body, int body_len,
1525                            void **state)
1526 {
1527         struct port_stats_state *s = kmalloc(sizeof *s, GFP_ATOMIC);
1528         if (!s)
1529                 return -ENOMEM;
1530         s->port = 0;
1531         *state = s;
1532         return 0;
1533 }
1534
1535 static int port_stats_dump(struct datapath *dp, void *state,
1536                            void *body, int *body_len)
1537 {
1538         struct port_stats_state *s = state;
1539         struct ofp_port_stats *ops;
1540         int n_ports, max_ports;
1541         int i;
1542
1543         max_ports = *body_len / sizeof *ops;
1544         if (!max_ports)
1545                 return -ENOMEM;
1546         ops = body;
1547
1548         n_ports = 0;
1549         for (i = s->port; i < DP_MAX_PORTS && n_ports < max_ports; i++) {
1550                 struct net_bridge_port *p = dp->ports[i];
1551                 struct net_device_stats *stats;
1552                 if (!p)
1553                         continue;
1554                 stats = p->dev->get_stats(p->dev);
1555                 ops->port_no = htons(p->port_no);
1556                 memset(ops->pad, 0, sizeof ops->pad);
1557                 ops->rx_packets   = cpu_to_be64(stats->rx_packets);
1558                 ops->tx_packets   = cpu_to_be64(stats->tx_packets);
1559                 ops->rx_bytes     = cpu_to_be64(stats->rx_bytes);
1560                 ops->tx_bytes     = cpu_to_be64(stats->tx_bytes);
1561                 ops->rx_dropped   = cpu_to_be64(stats->rx_dropped);
1562                 ops->tx_dropped   = cpu_to_be64(stats->tx_dropped);
1563                 ops->rx_errors    = cpu_to_be64(stats->rx_errors);
1564                 ops->tx_errors    = cpu_to_be64(stats->tx_errors);
1565                 ops->rx_frame_err = cpu_to_be64(stats->rx_frame_errors);
1566                 ops->rx_over_err  = cpu_to_be64(stats->rx_over_errors);
1567                 ops->rx_crc_err   = cpu_to_be64(stats->rx_crc_errors);
1568                 ops->collisions   = cpu_to_be64(stats->collisions);
1569                 n_ports++;
1570                 ops++;
1571         }
1572         s->port = i;
1573         *body_len = n_ports * sizeof *ops;
1574         return n_ports >= max_ports;
1575 }
1576
/* Done function for OFPST_PORT: frees the state allocated by
 * port_stats_init(). */
static void port_stats_done(void *state)
{
        kfree(state);
}
1581
/* Describes how one kind of OpenFlow statistics request is validated and
 * answered.  The 'stats' table below holds one of these per supported
 * statistics type. */
struct stats_type {
        /* Minimum and maximum acceptable number of bytes in body member of
         * struct ofp_stats_request. */
        size_t min_body, max_body;

        /* Prepares to dump some kind of statistics on 'dp'.  'body' and
         * 'body_len' are the 'body' member of the struct ofp_stats_request.
         * Returns zero if successful, otherwise a negative error code.
         * May initialize '*state' to state information.  May be null if no
         * initialization is required.*/
        int (*init)(struct datapath *dp, const void *body, int body_len,
                    void **state);

        /* Dumps statistics for 'dp' into the '*body_len' bytes at 'body', and
         * modifies '*body_len' to reflect the number of bytes actually used.
         * ('body' will be transmitted as the 'body' member of struct
         * ofp_stats_reply.)
         *
         * The dump functions in this file return 0 when there is nothing
         * more to dump, a positive value when more output remains for a
         * later call, and a negative error code on failure. */
        int (*dump)(struct datapath *dp, void *state,
                    void *body, int *body_len);

        /* Cleans any state created by the init or dump functions.  May be null
         * if no cleanup is required. */
        void (*done)(void *state);
};
1606
1607 static const struct stats_type stats[] = {
1608         [OFPST_DESC] = {
1609                 0,
1610                 0,
1611                 NULL,
1612                 desc_stats_dump,
1613                 NULL
1614         },
1615         [OFPST_FLOW] = {
1616                 sizeof(struct ofp_flow_stats_request),
1617                 sizeof(struct ofp_flow_stats_request),
1618                 flow_stats_init,
1619                 flow_stats_dump,
1620                 flow_stats_done
1621         },
1622         [OFPST_AGGREGATE] = {
1623                 sizeof(struct ofp_aggregate_stats_request),
1624                 sizeof(struct ofp_aggregate_stats_request),
1625                 aggregate_stats_init,
1626                 aggregate_stats_dump,
1627                 NULL
1628         },
1629         [OFPST_TABLE] = {
1630                 0,
1631                 0,
1632                 NULL,
1633                 table_stats_dump,
1634                 NULL
1635         },
1636         [OFPST_PORT] = {
1637                 0,
1638                 0,
1639                 port_stats_init,
1640                 port_stats_dump,
1641                 port_stats_done
1642         },
1643 };
1644
1645 static int
1646 dp_genl_openflow_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
1647 {
1648         struct datapath *dp;
1649         struct sender sender;
1650         const struct stats_type *s;
1651         struct ofp_stats_reply *osr;
1652         int dp_idx;
1653         int max_openflow_len, body_len;
1654         void *body;
1655         int err;
1656
1657         /* Set up the cleanup function for this dump.  Linux 2.6.20 and later
1658          * support setting up cleanup functions via the .doneit member of
1659          * struct genl_ops.  This kluge supports earlier versions also. */
1660         cb->done = dp_genl_openflow_done;
1661
1662         sender.pid = NETLINK_CB(cb->skb).pid;
1663         sender.seq = cb->nlh->nlmsg_seq;
1664         if (!cb->args[0]) {
1665                 struct nlattr *attrs[DP_GENL_A_MAX + 1];
1666                 struct ofp_stats_request *rq;
1667                 struct nlattr *va;
1668                 size_t len, body_len;
1669                 int type;
1670
1671                 err = nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, DP_GENL_A_MAX,
1672                                   dp_genl_openflow_policy);
1673                 if (err < 0)
1674                         return err;
1675
1676                 if (!attrs[DP_GENL_A_DP_IDX])
1677                         return -EINVAL;
1678                 dp_idx = nla_get_u16(attrs[DP_GENL_A_DP_IDX]);
1679                 dp = dp_get(dp_idx);
1680                 if (!dp)
1681                         return -ENOENT;
1682
1683                 va = attrs[DP_GENL_A_OPENFLOW];
1684                 len = nla_len(va);
1685                 if (!va || len < sizeof *rq)
1686                         return -EINVAL;
1687
1688                 rq = nla_data(va);
1689                 sender.xid = rq->header.xid;
1690                 type = ntohs(rq->type);
1691                 if (rq->header.version != OFP_VERSION) {
1692                         dp_send_error_msg(dp, &sender, OFPET_BAD_REQUEST,
1693                                           OFPBRC_BAD_VERSION, rq, len);
1694                         return -EINVAL;
1695                 }
1696                 if (rq->header.type != OFPT_STATS_REQUEST
1697                     || ntohs(rq->header.length) != len)
1698                         return -EINVAL;
1699
1700                 if (type >= ARRAY_SIZE(stats) || !stats[type].dump) {
1701                         dp_send_error_msg(dp, &sender, OFPET_BAD_REQUEST,
1702                                           OFPBRC_BAD_STAT, rq, len);
1703                         return -EINVAL;
1704                 }
1705
1706                 s = &stats[type];
1707                 body_len = len - offsetof(struct ofp_stats_request, body);
1708                 if (body_len < s->min_body || body_len > s->max_body)
1709                         return -EINVAL;
1710
1711                 cb->args[0] = 1;
1712                 cb->args[1] = dp_idx;
1713                 cb->args[2] = type;
1714                 cb->args[3] = rq->header.xid;
1715                 if (s->init) {
1716                         void *state;
1717                         err = s->init(dp, rq->body, body_len, &state);
1718                         if (err)
1719                                 return err;
1720                         cb->args[4] = (long) state;
1721                 }
1722         } else if (cb->args[0] == 1) {
1723                 sender.xid = cb->args[3];
1724                 dp_idx = cb->args[1];
1725                 s = &stats[cb->args[2]];
1726
1727                 dp = dp_get(dp_idx);
1728                 if (!dp)
1729                         return -ENOENT;
1730         } else {
1731                 return 0;
1732         }
1733
1734         osr = put_openflow_headers(dp, skb, OFPT_STATS_REPLY, &sender,
1735                                    &max_openflow_len);
1736         if (IS_ERR(osr))
1737                 return PTR_ERR(osr);
1738         osr->type = htons(s - stats);
1739         osr->flags = 0;
1740         resize_openflow_skb(skb, &osr->header, max_openflow_len);
1741         body = osr->body;
1742         body_len = max_openflow_len - offsetof(struct ofp_stats_reply, body);
1743
1744         err = s->dump(dp, (void *) cb->args[4], body, &body_len);
1745         if (err >= 0) {
1746                 if (!err)
1747                         cb->args[0] = 2;
1748                 else
1749                         osr->flags = ntohs(OFPSF_REPLY_MORE);
1750                 resize_openflow_skb(skb, &osr->header,
1751                                     (offsetof(struct ofp_stats_reply, body)
1752                                      + body_len));
1753                 err = skb->len;
1754         }
1755
1756         return err;
1757 }
1758
1759 static int
1760 dp_genl_openflow_done(struct netlink_callback *cb)
1761 {
1762         if (cb->args[0]) {
1763                 const struct stats_type *s = &stats[cb->args[2]];
1764                 if (s->done)
1765                         s->done((void *) cb->args[4]);
1766         }
1767         return 0;
1768 }
1769
1770 static struct genl_ops dp_genl_ops_openflow = {
1771         .cmd = DP_GENL_C_OPENFLOW,
1772         .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1773         .policy = dp_genl_openflow_policy,
1774         .doit = dp_genl_openflow,
1775         .dumpit = dp_genl_openflow_dumpit,
1776 };
1777
1778 static struct genl_ops *dp_genl_all_ops[] = {
1779         /* Keep this operation first.  Generic Netlink dispatching
1780          * looks up operations with linear search, so we want it at the
1781          * front. */
1782         &dp_genl_ops_openflow,
1783
1784         &dp_genl_ops_add_dp,
1785         &dp_genl_ops_del_dp,
1786         &dp_genl_ops_query_dp,
1787         &dp_genl_ops_add_port,
1788         &dp_genl_ops_del_port,
1789 };
1790
1791 static int dp_init_netlink(void)
1792 {
1793         int err;
1794         int i;
1795
1796         err = genl_register_family(&dp_genl_family);
1797         if (err)
1798                 return err;
1799
1800         for (i = 0; i < ARRAY_SIZE(dp_genl_all_ops); i++) {
1801                 err = genl_register_ops(&dp_genl_family, dp_genl_all_ops[i]);
1802                 if (err)
1803                         goto err_unregister;
1804         }
1805
1806         strcpy(mc_group.name, "openflow");
1807         err = genl_register_mc_group(&dp_genl_family, &mc_group);
1808         if (err < 0)
1809                 goto err_unregister;
1810
1811         return 0;
1812
1813 err_unregister:
1814         genl_unregister_family(&dp_genl_family);
1815                 return err;
1816 }
1817
1818 static void dp_uninit_netlink(void)
1819 {
1820         genl_unregister_family(&dp_genl_family);
1821 }
1822
1823 /* Set the description strings if appropriate values are available from
1824  * the DMI. */
1825 static void set_desc(void)
1826 {
1827         const char *uuid = dmi_get_system_info(DMI_PRODUCT_UUID);
1828         const char *uptr = uuid + 24;
1829
1830         if (!uuid || *uuid == '\0' || strlen(uuid) != 36) 
1831                 return;
1832
1833         /* We are only interested version 1 UUIDs, since the last six bytes
1834          * are an IEEE 802 MAC address. */
1835         if (uuid[14] != '1') 
1836                 return;
1837
1838         /* Only set if the UUID is from Nicira. */
1839         if (strncmp(uptr, NICIRA_OUI_STR, strlen(NICIRA_OUI_STR)))
1840                 return;
1841
1842         strlcpy(mfr_desc, dmi_get_system_info(DMI_SYS_VENDOR), sizeof(mfr_desc));
1843         snprintf(hw_desc, sizeof(hw_desc), "%s %s", 
1844                         dmi_get_system_info(DMI_PRODUCT_NAME), 
1845                         dmi_get_system_info(DMI_PRODUCT_VERSION));
1846         strlcpy(serial_num, dmi_get_system_info(DMI_PRODUCT_SERIAL), 
1847                         sizeof(serial_num));
1848 }
1849
1850 static int __init dp_init(void)
1851 {
1852         int err;
1853
1854         printk("OpenFlow %s, built "__DATE__" "__TIME__", "
1855                "protocol 0x%02x\n", VERSION BUILDNR, OFP_VERSION);
1856
1857         err = flow_init();
1858         if (err)
1859                 goto error;
1860
1861         err = register_netdevice_notifier(&dp_device_notifier);
1862         if (err)
1863                 goto error_flow_exit;
1864
1865         err = dp_init_netlink();
1866         if (err)
1867                 goto error_unreg_notifier;
1868
1869         /* Check if better descriptions of the switch are available than the
1870          * defaults. */
1871         set_desc();
1872
1873         /* Hook into callback used by the bridge to intercept packets.
1874          * Parasites we are. */
1875         if (br_handle_frame_hook)
1876                 printk("openflow: hijacking bridge hook\n");
1877         br_handle_frame_hook = dp_frame_hook;
1878
1879         return 0;
1880
1881 error_unreg_notifier:
1882         unregister_netdevice_notifier(&dp_device_notifier);
1883 error_flow_exit:
1884         flow_exit();
1885 error:
1886         printk(KERN_EMERG "openflow: failed to install!");
1887         return err;
1888 }
1889
1890 static void dp_cleanup(void)
1891 {
1892         fwd_exit();
1893         dp_uninit_netlink();
1894         unregister_netdevice_notifier(&dp_device_notifier);
1895         flow_exit();
1896         br_handle_frame_hook = NULL;
1897 }
1898
1899 module_init(dp_init);
1900 module_exit(dp_cleanup);
1901
1902 MODULE_DESCRIPTION("OpenFlow switching datapath");
1903 MODULE_AUTHOR("Copyright (c) 2007, 2008 The Board of Trustees of The Leland Stanford Junior University");
1904 MODULE_LICENSE("GPL");