datapath: Enforce mutual exclusion between bridge and brcompat_mod.
[sliver-openvswitch.git] / datapath / brcompat.c
1 /*
2  * Copyright (c) 2007-2012 Nicira Networks.
3  *
4  * Significant portions of this file may be copied from parts of the Linux
5  * kernel, by Linus Torvalds and others.
6  */
7
8 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
10 #include <linux/kernel.h>
11 #include <asm/uaccess.h>
12 #include <linux/completion.h>
13 #include <linux/etherdevice.h>
14 #include <linux/if_bridge.h>
15 #include <linux/netdevice.h>
16 #include <linux/rtnetlink.h>
17 #include <net/genetlink.h>
18
19 #include "openvswitch/brcompat-netlink.h"
20 #include "datapath.h"
21
22 static struct genl_family brc_genl_family;
23 static struct genl_multicast_group brc_mc_group;
24
25 /* Time to wait for ovs-vswitchd to respond to a datapath action, in
26  * jiffies. */
27 #define BRC_TIMEOUT (HZ * 5)
28
29 /* Mutex to serialize ovs-brcompatd callbacks.  (Some callbacks naturally hold
30  * br_ioctl_mutex, others hold rtnl_lock, but we can't take the former
31  * ourselves and we don't want to hold the latter over a potentially long
32  * period of time.) */
33 static DEFINE_MUTEX(brc_serial);
34
35 /* Userspace communication. */
36 static DEFINE_SPINLOCK(brc_lock);    /* Ensure atomic access to these vars. */
37 static DECLARE_COMPLETION(brc_done); /* Userspace signaled operation done? */
38 static struct sk_buff *brc_reply;    /* Reply from userspace. */
39 static u32 brc_seq;                  /* Sequence number for current op. */
40
41 static struct sk_buff *brc_send_command(struct sk_buff *, struct nlattr **attrs);
42 static int brc_send_simple_command(struct sk_buff *);
43
44 static struct sk_buff *brc_make_request(int op, const char *bridge,
45                                         const char *port)
46 {
47         struct sk_buff *skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
48         if (!skb)
49                 goto error;
50
51         genlmsg_put(skb, 0, 0, &brc_genl_family, 0, op);
52         if (bridge)
53                 NLA_PUT_STRING(skb, BRC_GENL_A_DP_NAME, bridge);
54         if (port)
55                 NLA_PUT_STRING(skb, BRC_GENL_A_PORT_NAME, port);
56         return skb;
57
58 nla_put_failure:
59         kfree_skb(skb);
60 error:
61         return NULL;
62 }
63
64 static int brc_send_simple_command(struct sk_buff *request)
65 {
66         struct nlattr *attrs[BRC_GENL_A_MAX + 1];
67         struct sk_buff *reply;
68         int error;
69
70         reply = brc_send_command(request, attrs);
71         if (IS_ERR(reply))
72                 return PTR_ERR(reply);
73
74         error = nla_get_u32(attrs[BRC_GENL_A_ERR_CODE]);
75         kfree_skb(reply);
76         return -error;
77 }
78
79 static int brc_add_del_bridge(char __user *uname, int add)
80 {
81         struct sk_buff *request;
82         char name[IFNAMSIZ];
83
84         if (!capable(CAP_NET_ADMIN))
85                 return -EPERM;
86
87         if (copy_from_user(name, uname, IFNAMSIZ))
88                 return -EFAULT;
89
90         name[IFNAMSIZ - 1] = 0;
91         request = brc_make_request(add ? BRC_GENL_C_DP_ADD : BRC_GENL_C_DP_DEL,
92                                    name, NULL);
93         if (!request)
94                 return -ENOMEM;
95
96         return brc_send_simple_command(request);
97 }
98
99 static int brc_get_indices(int op, const char *br_name,
100                            int __user *uindices, int n)
101 {
102         struct nlattr *attrs[BRC_GENL_A_MAX + 1];
103         struct sk_buff *request, *reply;
104         int *indices;
105         int ret;
106         int len;
107
108         if (n < 0)
109                 return -EINVAL;
110         if (n >= 2048)
111                 return -ENOMEM;
112
113         request = brc_make_request(op, br_name, NULL);
114         if (!request)
115                 return -ENOMEM;
116
117         reply = brc_send_command(request, attrs);
118         ret = PTR_ERR(reply);
119         if (IS_ERR(reply))
120                 goto exit;
121
122         ret = -nla_get_u32(attrs[BRC_GENL_A_ERR_CODE]);
123         if (ret < 0)
124                 goto exit_free_skb;
125
126         ret = -EINVAL;
127         if (!attrs[BRC_GENL_A_IFINDEXES])
128                 goto exit_free_skb;
129
130         len = nla_len(attrs[BRC_GENL_A_IFINDEXES]);
131         indices = nla_data(attrs[BRC_GENL_A_IFINDEXES]);
132         if (len % sizeof(int))
133                 goto exit_free_skb;
134
135         n = min_t(int, n, len / sizeof(int));
136         ret = copy_to_user(uindices, indices, n * sizeof(int)) ? -EFAULT : n;
137
138 exit_free_skb:
139         kfree_skb(reply);
140 exit:
141         return ret;
142 }
143
144 /* Called with br_ioctl_mutex. */
145 static int brc_get_bridges(int __user *uindices, int n)
146 {
147         return brc_get_indices(BRC_GENL_C_GET_BRIDGES, NULL, uindices, n);
148 }
149
150 /* Legacy deviceless bridge ioctl's.  Called with br_ioctl_mutex. */
151 static int old_deviceless(void __user *uarg)
152 {
153         unsigned long args[3];
154
155         if (copy_from_user(args, uarg, sizeof(args)))
156                 return -EFAULT;
157
158         switch (args[0]) {
159         case BRCTL_GET_BRIDGES:
160                 return brc_get_bridges((int __user *)args[1], args[2]);
161
162         case BRCTL_ADD_BRIDGE:
163                 return brc_add_del_bridge((void __user *)args[1], 1);
164         case BRCTL_DEL_BRIDGE:
165                 return brc_add_del_bridge((void __user *)args[1], 0);
166         }
167
168         return -EOPNOTSUPP;
169 }
170
171 /* Called with the br_ioctl_mutex. */
172 static int
173 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
174 brc_ioctl_deviceless_stub(unsigned int cmd, void __user *uarg)
175 #else
176 brc_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg)
177 #endif
178 {
179         switch (cmd) {
180         case SIOCGIFBR:
181         case SIOCSIFBR:
182                 return old_deviceless(uarg);
183
184         case SIOCBRADDBR:
185                 return brc_add_del_bridge(uarg, 1);
186         case SIOCBRDELBR:
187                 return brc_add_del_bridge(uarg, 0);
188         }
189
190         return -EOPNOTSUPP;
191 }
192
193 static int brc_add_del_port(struct net_device *dev, int port_ifindex, int add)
194 {
195         struct sk_buff *request;
196         struct net_device *port;
197         int err;
198
199         if (!capable(CAP_NET_ADMIN))
200                 return -EPERM;
201
202         port = __dev_get_by_index(&init_net, port_ifindex);
203         if (!port)
204                 return -EINVAL;
205
206         /* Save name of dev and port because there's a race between the
207          * rtnl_unlock() and the brc_send_simple_command(). */
208         request = brc_make_request(add ? BRC_GENL_C_PORT_ADD : BRC_GENL_C_PORT_DEL,
209                                    dev->name, port->name);
210         if (!request)
211                 return -ENOMEM;
212
213         rtnl_unlock();
214         err = brc_send_simple_command(request);
215         rtnl_lock();
216
217         return err;
218 }
219
220 static int brc_get_bridge_info(struct net_device *dev,
221                                struct __bridge_info __user *ub)
222 {
223         struct __bridge_info b;
224
225         memset(&b, 0, sizeof(struct __bridge_info));
226
227         /* First two bytes are the priority, which we should skip.  This comes
228          * from struct bridge_id in br_private.h, which is unavailable to us.
229          */
230         memcpy((u8 *)&b.bridge_id + 2, dev->dev_addr, ETH_ALEN);
231         b.stp_enabled = 0;
232
233         if (copy_to_user(ub, &b, sizeof(struct __bridge_info)))
234                 return -EFAULT;
235
236         return 0;
237 }
238
239 static int brc_get_port_list(struct net_device *dev, int __user *uindices,
240                              int num)
241 {
242         int retval;
243
244         rtnl_unlock();
245         retval = brc_get_indices(BRC_GENL_C_GET_PORTS, dev->name,
246                                  uindices, num);
247         rtnl_lock();
248
249         return retval;
250 }
251
252 /*
253  * Format up to a page worth of forwarding table entries
254  * userbuf -- where to copy result
255  * maxnum  -- maximum number of entries desired
256  *            (limited to a page for sanity)
257  * offset  -- number of records to skip
258  */
259 static int brc_get_fdb_entries(struct net_device *dev, void __user *userbuf,
260                                unsigned long maxnum, unsigned long offset)
261 {
262         struct nlattr *attrs[BRC_GENL_A_MAX + 1];
263         struct sk_buff *request, *reply;
264         int retval;
265         int len;
266
267         /* Clamp size to PAGE_SIZE, test maxnum to avoid overflow */
268         if (maxnum > PAGE_SIZE/sizeof(struct __fdb_entry))
269                 maxnum = PAGE_SIZE/sizeof(struct __fdb_entry);
270
271         request = brc_make_request(BRC_GENL_C_FDB_QUERY, dev->name, NULL);
272         if (!request)
273                 return -ENOMEM;
274         NLA_PUT_U64(request, BRC_GENL_A_FDB_COUNT, maxnum);
275         NLA_PUT_U64(request, BRC_GENL_A_FDB_SKIP, offset);
276
277         rtnl_unlock();
278         reply = brc_send_command(request, attrs);
279         retval = PTR_ERR(reply);
280         if (IS_ERR(reply))
281                 goto exit;
282
283         retval = -nla_get_u32(attrs[BRC_GENL_A_ERR_CODE]);
284         if (retval < 0)
285                 goto exit_free_skb;
286
287         retval = -EINVAL;
288         if (!attrs[BRC_GENL_A_FDB_DATA])
289                 goto exit_free_skb;
290         len = nla_len(attrs[BRC_GENL_A_FDB_DATA]);
291         if (len % sizeof(struct __fdb_entry) ||
292             len / sizeof(struct __fdb_entry) > maxnum)
293                 goto exit_free_skb;
294
295         retval = len / sizeof(struct __fdb_entry);
296         if (copy_to_user(userbuf, nla_data(attrs[BRC_GENL_A_FDB_DATA]), len))
297                 retval = -EFAULT;
298
299 exit_free_skb:
300         kfree_skb(reply);
301 exit:
302         rtnl_lock();
303         return retval;
304
305 nla_put_failure:
306         kfree_skb(request);
307         return -ENOMEM;
308 }
309
310 /* Legacy ioctl's through SIOCDEVPRIVATE.  Called with rtnl_lock. */
311 static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
312 {
313         unsigned long args[4];
314
315         if (copy_from_user(args, rq->ifr_data, sizeof(args)))
316                 return -EFAULT;
317
318         switch (args[0]) {
319         case BRCTL_ADD_IF:
320                 return brc_add_del_port(dev, args[1], 1);
321         case BRCTL_DEL_IF:
322                 return brc_add_del_port(dev, args[1], 0);
323
324         case BRCTL_GET_BRIDGE_INFO:
325                 return brc_get_bridge_info(dev, (struct __bridge_info __user *)args[1]);
326
327         case BRCTL_GET_PORT_LIST:
328                 return brc_get_port_list(dev, (int __user *)args[1], args[2]);
329
330         case BRCTL_GET_FDB_ENTRIES:
331                 return brc_get_fdb_entries(dev, (void __user *)args[1],
332                                            args[2], args[3]);
333         }
334
335         return -EOPNOTSUPP;
336 }
337
338 /* Called with the rtnl_lock. */
339 static int brc_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
340 {
341         int err;
342
343         switch (cmd) {
344                 case SIOCDEVPRIVATE:
345                         err = old_dev_ioctl(dev, rq, cmd);
346                         break;
347
348                 case SIOCBRADDIF:
349                         return brc_add_del_port(dev, rq->ifr_ifindex, 1);
350                 case SIOCBRDELIF:
351                         return brc_add_del_port(dev, rq->ifr_ifindex, 0);
352
353                 default:
354                         err = -EOPNOTSUPP;
355                         break;
356         }
357
358         return err;
359 }
360
361
362 static struct genl_family brc_genl_family = {
363         .id = GENL_ID_GENERATE,
364         .hdrsize = 0,
365         .name = BRC_GENL_FAMILY_NAME,
366         .version = 1,
367         .maxattr = BRC_GENL_A_MAX,
368 };
369
370 static int brc_genl_query(struct sk_buff *skb, struct genl_info *info)
371 {
372         int err = -EINVAL;
373         struct sk_buff *ans_skb;
374         void *data;
375
376         ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
377         if (!ans_skb)
378                 return -ENOMEM;
379
380         data = genlmsg_put_reply(ans_skb, info, &brc_genl_family,
381                                  0, BRC_GENL_C_QUERY_MC);
382         if (data == NULL) {
383                 err = -ENOMEM;
384                 goto err;
385         }
386         NLA_PUT_U32(ans_skb, BRC_GENL_A_MC_GROUP, brc_mc_group.id);
387
388         genlmsg_end(ans_skb, data);
389         return genlmsg_reply(ans_skb, info);
390
391 err:
392 nla_put_failure:
393         kfree_skb(ans_skb);
394         return err;
395 }
396
397 /* Attribute policy: what each attribute may contain.  */
398 static struct nla_policy brc_genl_policy[BRC_GENL_A_MAX + 1] = {
399         [BRC_GENL_A_ERR_CODE] = { .type = NLA_U32 },
400         [BRC_GENL_A_FDB_DATA] = { .type = NLA_UNSPEC },
401 };
402
403 static int brc_genl_dp_result(struct sk_buff *skb, struct genl_info *info)
404 {
405         unsigned long int flags;
406         int err;
407
408         if (!info->attrs[BRC_GENL_A_ERR_CODE])
409                 return -EINVAL;
410
411         skb = skb_clone(skb, GFP_KERNEL);
412         if (!skb)
413                 return -ENOMEM;
414
415         spin_lock_irqsave(&brc_lock, flags);
416         if (brc_seq == info->snd_seq) {
417                 brc_seq++;
418
419                 kfree_skb(brc_reply);
420                 brc_reply = skb;
421
422                 complete(&brc_done);
423                 err = 0;
424         } else {
425                 kfree_skb(skb);
426                 err = -ESTALE;
427         }
428         spin_unlock_irqrestore(&brc_lock, flags);
429
430         return err;
431 }
432
433 static struct genl_ops brc_genl_ops[] = {
434         { .cmd = BRC_GENL_C_QUERY_MC,
435           .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privelege. */
436           .policy = NULL,
437           .doit = brc_genl_query,
438         },
439         { .cmd = BRC_GENL_C_DP_RESULT,
440           .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privelege. */
441           .policy = brc_genl_policy,
442           .doit = brc_genl_dp_result,
443         },
444 };
445
446 static struct sk_buff *brc_send_command(struct sk_buff *request,
447                                         struct nlattr **attrs)
448 {
449         unsigned long int flags;
450         struct sk_buff *reply;
451         int error;
452
453         mutex_lock(&brc_serial);
454
455         /* Increment sequence number first, so that we ignore any replies
456          * to stale requests. */
457         spin_lock_irqsave(&brc_lock, flags);
458         nlmsg_hdr(request)->nlmsg_seq = ++brc_seq;
459         INIT_COMPLETION(brc_done);
460         spin_unlock_irqrestore(&brc_lock, flags);
461
462         nlmsg_end(request, nlmsg_hdr(request));
463
464         /* Send message. */
465         error = genlmsg_multicast(request, 0, brc_mc_group.id, GFP_KERNEL);
466         if (error < 0)
467                 goto error;
468
469         /* Wait for reply. */
470         error = -ETIMEDOUT;
471         if (!wait_for_completion_timeout(&brc_done, BRC_TIMEOUT)) {
472                 pr_warn("timed out waiting for userspace\n");
473                 goto error;
474     }
475
476         /* Grab reply. */
477         spin_lock_irqsave(&brc_lock, flags);
478         reply = brc_reply;
479         brc_reply = NULL;
480         spin_unlock_irqrestore(&brc_lock, flags);
481
482         mutex_unlock(&brc_serial);
483
484         /* Re-parse message.  Can't fail, since it parsed correctly once
485          * already. */
486         error = nlmsg_parse(nlmsg_hdr(reply), GENL_HDRLEN,
487                             attrs, BRC_GENL_A_MAX, brc_genl_policy);
488         WARN_ON(error);
489
490         return reply;
491
492 error:
493         mutex_unlock(&brc_serial);
494         return ERR_PTR(error);
495 }
496
497 static int __init brc_init(void)
498 {
499         int err;
500
501         printk("Open vSwitch Bridge Compatibility, built "__DATE__" "__TIME__"\n");
502
503         /* Set the bridge ioctl handler */
504         brioctl_set(brc_ioctl_deviceless_stub);
505
506         /* Set the openvswitch_mod device ioctl handler */
507         dp_ioctl_hook = brc_dev_ioctl;
508
509         /* Randomize the initial sequence number.  This is not a security
510          * feature; it only helps avoid crossed wires between userspace and
511          * the kernel when the module is unloaded and reloaded. */
512         brc_seq = net_random();
513
514         /* Register generic netlink family to communicate changes to
515          * userspace. */
516         err = genl_register_family_with_ops(&brc_genl_family,
517                                             brc_genl_ops, ARRAY_SIZE(brc_genl_ops));
518         if (err)
519                 goto error;
520
521         strcpy(brc_mc_group.name, "brcompat");
522         err = genl_register_mc_group(&brc_genl_family, &brc_mc_group);
523         if (err < 0)
524                 goto err_unregister;
525
526         return 0;
527
528 err_unregister:
529         genl_unregister_family(&brc_genl_family);
530 error:
531         pr_emerg("failed to install!\n");
532         return err;
533 }
534
535 static void brc_cleanup(void)
536 {
537         /* Unregister ioctl hooks */
538         dp_ioctl_hook = NULL;
539         brioctl_set(NULL);
540
541         genl_unregister_family(&brc_genl_family);
542 }
543
544 module_init(brc_init);
545 module_exit(brc_cleanup);
546
547 MODULE_DESCRIPTION("Open vSwitch bridge compatibility");
548 MODULE_AUTHOR("Nicira Networks");
549 MODULE_LICENSE("GPL");
550
551 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
552 /*
553  * In kernels 2.6.36 and later, Open vSwitch can safely coexist with
554  * the Linux bridge module, but it does not make sense to load both bridge and
555  * brcompat_mod, so this prevents it.
556  */
557 BRIDGE_MUTUAL_EXCLUSION;
558 #endif