d4a0acad926868011c81a25ad3333b90d9c4a4ef
[sliver-openvswitch.git] / datapath / brcompat_main.c
1 /*
2  * Copyright (c) 2007-2012 Nicira Networks.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/module.h>
22 #include <linux/kernel.h>
23 #include <linux/uaccess.h>
24 #include <linux/completion.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_bridge.h>
27 #include <linux/netdevice.h>
28 #include <linux/rtnetlink.h>
29 #include <net/genetlink.h>
30
31 #include "openvswitch/brcompat-netlink.h"
32 #include "datapath.h"
33
34 static struct genl_family brc_genl_family;
35 static struct genl_multicast_group brc_mc_group;
36
37 /* Time to wait for ovs-vswitchd to respond to a datapath action, in
38  * jiffies. */
39 #define BRC_TIMEOUT (HZ * 5)
40
41 /* Mutex to serialize ovs-brcompatd callbacks.  (Some callbacks naturally hold
42  * br_ioctl_mutex, others hold rtnl_lock, but we can't take the former
43  * ourselves and we don't want to hold the latter over a potentially long
44  * period of time.) */
45 static DEFINE_MUTEX(brc_serial);
46
47 /* Userspace communication. */
48 static DEFINE_SPINLOCK(brc_lock);    /* Ensure atomic access to these vars. */
49 static DECLARE_COMPLETION(brc_done); /* Userspace signaled operation done? */
50 static struct sk_buff *brc_reply;    /* Reply from userspace. */
51 static u32 brc_seq;                  /* Sequence number for current op. */
52
53 static struct sk_buff *brc_send_command(struct net *,
54                                         struct sk_buff *,
55                                         struct nlattr **attrs);
56 static int brc_send_simple_command(struct net *, struct sk_buff *);
57
58 static struct sk_buff *brc_make_request(int op, const char *bridge,
59                                         const char *port)
60 {
61         struct sk_buff *skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
62         if (!skb)
63                 goto error;
64
65         genlmsg_put(skb, 0, 0, &brc_genl_family, 0, op);
66         if (bridge)
67                 NLA_PUT_STRING(skb, BRC_GENL_A_DP_NAME, bridge);
68         if (port)
69                 NLA_PUT_STRING(skb, BRC_GENL_A_PORT_NAME, port);
70         return skb;
71
72 nla_put_failure:
73         kfree_skb(skb);
74 error:
75         return NULL;
76 }
77
78 static int brc_send_simple_command(struct net *net, struct sk_buff *request)
79 {
80         struct nlattr *attrs[BRC_GENL_A_MAX + 1];
81         struct sk_buff *reply;
82         int error;
83
84         reply = brc_send_command(net, request, attrs);
85         if (IS_ERR(reply))
86                 return PTR_ERR(reply);
87
88         error = nla_get_u32(attrs[BRC_GENL_A_ERR_CODE]);
89         kfree_skb(reply);
90         return -error;
91 }
92
93 static int brc_add_del_bridge(struct net *net, char __user *uname, int add)
94 {
95         struct sk_buff *request;
96         char name[IFNAMSIZ];
97
98         if (!capable(CAP_NET_ADMIN))
99                 return -EPERM;
100
101         if (copy_from_user(name, uname, IFNAMSIZ))
102                 return -EFAULT;
103
104         name[IFNAMSIZ - 1] = 0;
105         request = brc_make_request(add ? BRC_GENL_C_DP_ADD : BRC_GENL_C_DP_DEL,
106                                    name, NULL);
107         if (!request)
108                 return -ENOMEM;
109
110         return brc_send_simple_command(net, request);
111 }
112
113 static int brc_get_indices(struct net *net,
114                            int op, const char *br_name,
115                            int __user *uindices, int n)
116 {
117         struct nlattr *attrs[BRC_GENL_A_MAX + 1];
118         struct sk_buff *request, *reply;
119         int *indices;
120         int ret;
121         int len;
122
123         if (n < 0)
124                 return -EINVAL;
125         if (n >= 2048)
126                 return -ENOMEM;
127
128         request = brc_make_request(op, br_name, NULL);
129         if (!request)
130                 return -ENOMEM;
131
132         reply = brc_send_command(net, request, attrs);
133         ret = PTR_ERR(reply);
134         if (IS_ERR(reply))
135                 goto exit;
136
137         ret = -nla_get_u32(attrs[BRC_GENL_A_ERR_CODE]);
138         if (ret < 0)
139                 goto exit_free_skb;
140
141         ret = -EINVAL;
142         if (!attrs[BRC_GENL_A_IFINDEXES])
143                 goto exit_free_skb;
144
145         len = nla_len(attrs[BRC_GENL_A_IFINDEXES]);
146         indices = nla_data(attrs[BRC_GENL_A_IFINDEXES]);
147         if (len % sizeof(int))
148                 goto exit_free_skb;
149
150         n = min_t(int, n, len / sizeof(int));
151         ret = copy_to_user(uindices, indices, n * sizeof(int)) ? -EFAULT : n;
152
153 exit_free_skb:
154         kfree_skb(reply);
155 exit:
156         return ret;
157 }
158
159 /* Called with br_ioctl_mutex. */
160 static int brc_get_bridges(struct net *net, int __user *uindices, int n)
161 {
162         return brc_get_indices(net, BRC_GENL_C_GET_BRIDGES, NULL, uindices, n);
163 }
164
165 /* Legacy deviceless bridge ioctl's.  Called with br_ioctl_mutex. */
166 static int old_deviceless(struct net *net, void __user *uarg)
167 {
168         unsigned long args[3];
169
170         if (copy_from_user(args, uarg, sizeof(args)))
171                 return -EFAULT;
172
173         switch (args[0]) {
174         case BRCTL_GET_BRIDGES:
175                 return brc_get_bridges(net, (int __user *)args[1], args[2]);
176
177         case BRCTL_ADD_BRIDGE:
178                 return brc_add_del_bridge(net, (void __user *)args[1], 1);
179         case BRCTL_DEL_BRIDGE:
180                 return brc_add_del_bridge(net, (void __user *)args[1], 0);
181         }
182
183         return -EOPNOTSUPP;
184 }
185
186 /* Called with the br_ioctl_mutex. */
187 static int
188 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
189 brc_ioctl_deviceless_stub(unsigned int cmd, void __user *uarg)
190 {
191         struct net *net = NULL;
192 #else
193 brc_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg)
194 {
195 #endif
196         switch (cmd) {
197         case SIOCGIFBR:
198         case SIOCSIFBR:
199                 return old_deviceless(net, uarg);
200
201         case SIOCBRADDBR:
202                 return brc_add_del_bridge(net, uarg, 1);
203         case SIOCBRDELBR:
204                 return brc_add_del_bridge(net, uarg, 0);
205         }
206
207         return -EOPNOTSUPP;
208 }
209
210 static int brc_add_del_port(struct net_device *dev, int port_ifindex, int add)
211 {
212         struct sk_buff *request;
213         struct net_device *port;
214         int err;
215
216         if (!capable(CAP_NET_ADMIN))
217                 return -EPERM;
218
219         port = __dev_get_by_index(dev_net(dev), port_ifindex);
220         if (!port)
221                 return -EINVAL;
222
223         /* Save name of dev and port because there's a race between the
224          * rtnl_unlock() and the brc_send_simple_command(). */
225         request = brc_make_request(add ? BRC_GENL_C_PORT_ADD : BRC_GENL_C_PORT_DEL,
226                                    dev->name, port->name);
227         if (!request)
228                 return -ENOMEM;
229
230         rtnl_unlock();
231         err = brc_send_simple_command(dev_net(dev), request);
232         rtnl_lock();
233
234         return err;
235 }
236
237 static int brc_get_bridge_info(struct net_device *dev,
238                                struct __bridge_info __user *ub)
239 {
240         struct __bridge_info b;
241
242         memset(&b, 0, sizeof(struct __bridge_info));
243
244         /* First two bytes are the priority, which we should skip.  This comes
245          * from struct bridge_id in br_private.h, which is unavailable to us.
246          */
247         memcpy((u8 *)&b.bridge_id + 2, dev->dev_addr, ETH_ALEN);
248         b.stp_enabled = 0;
249
250         if (copy_to_user(ub, &b, sizeof(struct __bridge_info)))
251                 return -EFAULT;
252
253         return 0;
254 }
255
256 static int brc_get_port_list(struct net_device *dev, int __user *uindices,
257                              int num)
258 {
259         int retval;
260
261         rtnl_unlock();
262         retval = brc_get_indices(dev_net(dev), BRC_GENL_C_GET_PORTS, dev->name,
263                                  uindices, num);
264         rtnl_lock();
265
266         return retval;
267 }
268
269 /*
270  * Format up to a page worth of forwarding table entries
271  * userbuf -- where to copy result
272  * maxnum  -- maximum number of entries desired
273  *            (limited to a page for sanity)
274  * offset  -- number of records to skip
275  */
276 static int brc_get_fdb_entries(struct net_device *dev, void __user *userbuf,
277                                unsigned long maxnum, unsigned long offset)
278 {
279         struct nlattr *attrs[BRC_GENL_A_MAX + 1];
280         struct sk_buff *request, *reply;
281         int retval;
282         int len;
283
284         /* Clamp size to PAGE_SIZE, test maxnum to avoid overflow */
285         if (maxnum > PAGE_SIZE/sizeof(struct __fdb_entry))
286                 maxnum = PAGE_SIZE/sizeof(struct __fdb_entry);
287
288         request = brc_make_request(BRC_GENL_C_FDB_QUERY, dev->name, NULL);
289         if (!request)
290                 return -ENOMEM;
291         NLA_PUT_U64(request, BRC_GENL_A_FDB_COUNT, maxnum);
292         NLA_PUT_U64(request, BRC_GENL_A_FDB_SKIP, offset);
293
294         rtnl_unlock();
295         reply = brc_send_command(dev_net(dev), request, attrs);
296         retval = PTR_ERR(reply);
297         if (IS_ERR(reply))
298                 goto exit;
299
300         retval = -nla_get_u32(attrs[BRC_GENL_A_ERR_CODE]);
301         if (retval < 0)
302                 goto exit_free_skb;
303
304         retval = -EINVAL;
305         if (!attrs[BRC_GENL_A_FDB_DATA])
306                 goto exit_free_skb;
307         len = nla_len(attrs[BRC_GENL_A_FDB_DATA]);
308         if (len % sizeof(struct __fdb_entry) ||
309             len / sizeof(struct __fdb_entry) > maxnum)
310                 goto exit_free_skb;
311
312         retval = len / sizeof(struct __fdb_entry);
313         if (copy_to_user(userbuf, nla_data(attrs[BRC_GENL_A_FDB_DATA]), len))
314                 retval = -EFAULT;
315
316 exit_free_skb:
317         kfree_skb(reply);
318 exit:
319         rtnl_lock();
320         return retval;
321
322 nla_put_failure:
323         kfree_skb(request);
324         return -ENOMEM;
325 }
326
327 /* Legacy ioctl's through SIOCDEVPRIVATE.  Called with rtnl_lock. */
328 static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
329 {
330         unsigned long args[4];
331
332         if (copy_from_user(args, rq->ifr_data, sizeof(args)))
333                 return -EFAULT;
334
335         switch (args[0]) {
336         case BRCTL_ADD_IF:
337                 return brc_add_del_port(dev, args[1], 1);
338         case BRCTL_DEL_IF:
339                 return brc_add_del_port(dev, args[1], 0);
340
341         case BRCTL_GET_BRIDGE_INFO:
342                 return brc_get_bridge_info(dev, (struct __bridge_info __user *)args[1]);
343
344         case BRCTL_GET_PORT_LIST:
345                 return brc_get_port_list(dev, (int __user *)args[1], args[2]);
346
347         case BRCTL_GET_FDB_ENTRIES:
348                 return brc_get_fdb_entries(dev, (void __user *)args[1],
349                                            args[2], args[3]);
350         }
351
352         return -EOPNOTSUPP;
353 }
354
355 /* Called with the rtnl_lock. */
356 static int brc_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
357 {
358         int err;
359
360         switch (cmd) {
361         case SIOCDEVPRIVATE:
362                 err = old_dev_ioctl(dev, rq, cmd);
363                 break;
364
365         case SIOCBRADDIF:
366                 return brc_add_del_port(dev, rq->ifr_ifindex, 1);
367         case SIOCBRDELIF:
368                 return brc_add_del_port(dev, rq->ifr_ifindex, 0);
369
370         default:
371                 err = -EOPNOTSUPP;
372                 break;
373         }
374
375         return err;
376 }
377
378
379 static struct genl_family brc_genl_family = {
380         .id = GENL_ID_GENERATE,
381         .hdrsize = 0,
382         .name = BRC_GENL_FAMILY_NAME,
383         .version = 1,
384         .maxattr = BRC_GENL_A_MAX,
385          SET_NETNSOK
386 };
387
388 static int brc_genl_query(struct sk_buff *skb, struct genl_info *info)
389 {
390         int err = -EINVAL;
391         struct sk_buff *ans_skb;
392         void *data;
393
394         ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
395         if (!ans_skb)
396                 return -ENOMEM;
397
398         data = genlmsg_put_reply(ans_skb, info, &brc_genl_family,
399                                  0, BRC_GENL_C_QUERY_MC);
400         if (data == NULL) {
401                 err = -ENOMEM;
402                 goto err;
403         }
404         NLA_PUT_U32(ans_skb, BRC_GENL_A_MC_GROUP, brc_mc_group.id);
405
406         genlmsg_end(ans_skb, data);
407         return genlmsg_reply(ans_skb, info);
408
409 err:
410 nla_put_failure:
411         kfree_skb(ans_skb);
412         return err;
413 }
414
415 /* Attribute policy: what each attribute may contain.  */
416 static struct nla_policy brc_genl_policy[BRC_GENL_A_MAX + 1] = {
417         [BRC_GENL_A_ERR_CODE] = { .type = NLA_U32 },
418         [BRC_GENL_A_FDB_DATA] = { .type = NLA_UNSPEC },
419 };
420
421 static int brc_genl_dp_result(struct sk_buff *skb, struct genl_info *info)
422 {
423         unsigned long int flags;
424         int err;
425
426         if (!info->attrs[BRC_GENL_A_ERR_CODE])
427                 return -EINVAL;
428
429         skb = skb_clone(skb, GFP_KERNEL);
430         if (!skb)
431                 return -ENOMEM;
432
433         spin_lock_irqsave(&brc_lock, flags);
434         if (brc_seq == info->snd_seq) {
435                 brc_seq++;
436
437                 kfree_skb(brc_reply);
438                 brc_reply = skb;
439
440                 complete(&brc_done);
441                 err = 0;
442         } else {
443                 kfree_skb(skb);
444                 err = -ESTALE;
445         }
446         spin_unlock_irqrestore(&brc_lock, flags);
447
448         return err;
449 }
450
451 static struct genl_ops brc_genl_ops[] = {
452         { .cmd = BRC_GENL_C_QUERY_MC,
453           .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privelege. */
454           .policy = NULL,
455           .doit = brc_genl_query,
456         },
457         { .cmd = BRC_GENL_C_DP_RESULT,
458           .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privelege. */
459           .policy = brc_genl_policy,
460           .doit = brc_genl_dp_result,
461         },
462 };
463
464 static struct sk_buff *brc_send_command(struct net *net,
465                                         struct sk_buff *request,
466                                         struct nlattr **attrs)
467 {
468         unsigned long int flags;
469         struct sk_buff *reply;
470         int error;
471
472         mutex_lock(&brc_serial);
473
474         /* Increment sequence number first, so that we ignore any replies
475          * to stale requests. */
476         spin_lock_irqsave(&brc_lock, flags);
477         nlmsg_hdr(request)->nlmsg_seq = ++brc_seq;
478         INIT_COMPLETION(brc_done);
479         spin_unlock_irqrestore(&brc_lock, flags);
480
481         nlmsg_end(request, nlmsg_hdr(request));
482
483         /* Send message. */
484         error = genlmsg_multicast_netns(net, request, 0,
485                                         brc_mc_group.id, GFP_KERNEL);
486         if (error < 0)
487                 goto error;
488
489         /* Wait for reply. */
490         error = -ETIMEDOUT;
491         if (!wait_for_completion_timeout(&brc_done, BRC_TIMEOUT)) {
492                 pr_warn("timed out waiting for userspace\n");
493                 goto error;
494         }
495
496         /* Grab reply. */
497         spin_lock_irqsave(&brc_lock, flags);
498         reply = brc_reply;
499         brc_reply = NULL;
500         spin_unlock_irqrestore(&brc_lock, flags);
501
502         mutex_unlock(&brc_serial);
503
504         /* Re-parse message.  Can't fail, since it parsed correctly once
505          * already. */
506         error = nlmsg_parse(nlmsg_hdr(reply), GENL_HDRLEN,
507                             attrs, BRC_GENL_A_MAX, brc_genl_policy);
508         WARN_ON(error);
509
510         return reply;
511
512 error:
513         mutex_unlock(&brc_serial);
514         return ERR_PTR(error);
515 }
516
517 static int __init brc_init(void)
518 {
519         int err;
520
521         pr_info("Open vSwitch Bridge Compatibility, built "__DATE__" "__TIME__"\n");
522
523         /* Set the bridge ioctl handler */
524         brioctl_set(brc_ioctl_deviceless_stub);
525
526         /* Set the openvswitch_mod device ioctl handler */
527         ovs_dp_ioctl_hook = brc_dev_ioctl;
528
529         /* Randomize the initial sequence number.  This is not a security
530          * feature; it only helps avoid crossed wires between userspace and
531          * the kernel when the module is unloaded and reloaded. */
532         brc_seq = net_random();
533
534         /* Register generic netlink family to communicate changes to
535          * userspace. */
536         err = genl_register_family_with_ops(&brc_genl_family,
537                                             brc_genl_ops, ARRAY_SIZE(brc_genl_ops));
538         if (err)
539                 goto error;
540
541         strcpy(brc_mc_group.name, "brcompat");
542         err = genl_register_mc_group(&brc_genl_family, &brc_mc_group);
543         if (err < 0)
544                 goto err_unregister;
545
546         return 0;
547
548 err_unregister:
549         genl_unregister_family(&brc_genl_family);
550 error:
551         pr_emerg("failed to install!\n");
552         return err;
553 }
554
/* Module teardown: removes the ioctl hooks installed by brc_init() and
 * then unregisters the generic netlink family. */
static void brc_cleanup(void)
{
        /* Unregister ioctl hooks */
        ovs_dp_ioctl_hook = NULL;
        brioctl_set(NULL);

        genl_unregister_family(&brc_genl_family);
}
563
/* Module entry and exit points. */
module_init(brc_init);
module_exit(brc_cleanup);

/* Module metadata. */
MODULE_DESCRIPTION("Open vSwitch bridge compatibility");
MODULE_AUTHOR("Nicira Networks");
MODULE_LICENSE("GPL");

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
/*
 * In kernels 2.6.36 and later, Open vSwitch can safely coexist with
 * the Linux bridge module, but it does not make sense to load both bridge and
 * brcompat_mod, so this prevents it.
 */
BRIDGE_MUTUAL_EXCLUSION;
#endif