2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/gen_stats.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_tun.h>
28 #include <linux/types.h>
29 #include <linux/ethtool.h>
30 #include <linux/mii.h>
31 #include <linux/pkt_cls.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
51 #include "dpif-linux.h"
52 #include "dynamic-string.h"
53 #include "fatal-signal.h"
56 #include "netdev-provider.h"
57 #include "netdev-vport.h"
58 #include "netlink-notifier.h"
59 #include "netlink-socket.h"
62 #include "openflow/openflow.h"
64 #include "poll-loop.h"
65 #include "rtnetlink-link.h"
67 #include "socket-util.h"
70 #include "unaligned.h"
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75 COVERAGE_DEFINE(netdev_set_policing);
76 COVERAGE_DEFINE(netdev_arp_lookup);
77 COVERAGE_DEFINE(netdev_get_ifindex);
78 COVERAGE_DEFINE(netdev_get_hwaddr);
79 COVERAGE_DEFINE(netdev_set_hwaddr);
80 COVERAGE_DEFINE(netdev_get_ethtool);
81 COVERAGE_DEFINE(netdev_set_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
120 VALID_FEATURES = 1 << 8,
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
133 * Each TC implementation subclasses this with whatever additional data it
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
144 /* One traffic control queue.
146 * Each TC implementation subclasses this with whatever additional data it
149 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
150 unsigned int queue_id; /* OpenFlow queue ID. */
153 /* A particular kind of traffic control. Each implementation generally maps to
154 * one particular Linux qdisc class.
156 * The functions below return 0 if successful or a positive errno value on
157 * failure, except where otherwise noted. All of them must be provided, except
158 * where otherwise noted. */
160 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
161 * This is null for tc_ops_default and tc_ops_other, for which there are no
162 * appropriate values. */
163 const char *linux_name;
165 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
166 const char *ovs_name;
168 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
169 * queues. The queues are numbered 0 through n_queues - 1. */
170 unsigned int n_queues;
172 /* Called to install this TC class on 'netdev'. The implementation should
173 * make the Netlink calls required to set up 'netdev' with the right qdisc
174 * and configure it according to 'details'. The implementation may assume
175 * that the current qdisc is the default; that is, there is no need for it
176 * to delete the current qdisc before installing itself.
178 * The contents of 'details' should be documented as valid for 'ovs_name'
179 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
180 * (which is built as ovs-vswitchd.conf.db(8)).
182 * This function must return 0 if and only if it sets 'netdev->tc' to an
183 * initialized 'struct tc'.
185 * (This function is null for tc_ops_other, which cannot be installed. For
186 * other TC classes it should always be nonnull.) */
187 int (*tc_install)(struct netdev *netdev, const struct smap *details);
189 /* Called when the netdev code determines (through a Netlink query) that
190 * this TC class's qdisc is installed on 'netdev', but we didn't install
191 * it ourselves and so don't know any of the details.
193 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
194 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
195 * implementation should parse the other attributes of 'nlmsg' as
196 * necessary to determine its configuration. If necessary it should also
197 * use Netlink queries to determine the configuration of queues on
200 * This function must return 0 if and only if it sets 'netdev->tc' to an
201 * initialized 'struct tc'. */
202 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
204 /* Destroys the data structures allocated by the implementation as part of
205 * 'tc'. (This includes destroying 'tc->queues' by calling
208 * The implementation should not need to perform any Netlink calls. If
209 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
210 * (But it may not be desirable.)
212 * This function may be null if 'tc' is trivial. */
213 void (*tc_destroy)(struct tc *tc);
215 /* Retrieves details of 'netdev->tc' configuration into 'details'.
217 * The implementation should not need to perform any Netlink calls, because
218 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
219 * cached the configuration.
221 * The contents of 'details' should be documented as valid for 'ovs_name'
222 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
223 * (which is built as ovs-vswitchd.conf.db(8)).
225 * This function may be null if 'tc' is not configurable.
227 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
229 /* Reconfigures 'netdev->tc' according to 'details', performing any
230 * required Netlink calls to complete the reconfiguration.
232 * The contents of 'details' should be documented as valid for 'ovs_name'
233 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
234 * (which is built as ovs-vswitchd.conf.db(8)).
236 * This function may be null if 'tc' is not configurable.
238 int (*qdisc_set)(struct netdev *, const struct smap *details);
240 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
241 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
243 * The contents of 'details' should be documented as valid for 'ovs_name'
244 * in the "other_config" column in the "Queue" table in
245 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
247 * The implementation should not need to perform any Netlink calls, because
248 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
249 * cached the queue configuration.
251 * This function may be null if 'tc' does not have queues ('n_queues' is
253 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
254 struct smap *details);
256 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
257 * 'details', performing any required Netlink calls to complete the
258 * reconfiguration. The caller ensures that 'queue_id' is less than
261 * The contents of 'details' should be documented as valid for 'ovs_name'
262 * in the "other_config" column in the "Queue" table in
263 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
265 * This function may be null if 'tc' does not have queues or its queues are
266 * not configurable. */
267 int (*class_set)(struct netdev *, unsigned int queue_id,
268 const struct smap *details);
270 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
271 * tc_queue's within 'netdev->tc->queues'.
273 * This function may be null if 'tc' does not have queues or its queues
274 * cannot be deleted. */
275 int (*class_delete)(struct netdev *, struct tc_queue *queue);
277 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
278 * 'struct tc_queue's within 'netdev->tc->queues'.
280 * On success, initializes '*stats'.
282 * This function may be null if 'tc' does not have queues or if it cannot
283 * report queue statistics. */
284 int (*class_get_stats)(const struct netdev *netdev,
285 const struct tc_queue *queue,
286 struct netdev_queue_stats *stats);
288 /* Extracts queue stats from 'nlmsg', which is a response to a
289 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
291 * This function may be null if 'tc' does not have queues or if it cannot
292 * report queue statistics. */
293 int (*class_dump_stats)(const struct netdev *netdev,
294 const struct ofpbuf *nlmsg,
295 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes 'tc' by setting up its 'queues' hmap with hmap_init().
 *
 * NOTE(review): how 'ops' is used is not visible in this excerpt; presumably
 * it is stored in 'tc->ops' -- confirm. */
299 tc_init(struct tc *tc, const struct tc_ops *ops)
302 hmap_init(&tc->queues);
/* Tears down the generic part of 'tc' by destroying its 'queues' hmap. */
306 tc_destroy(struct tc *tc)
308 hmap_destroy(&tc->queues);
311 static const struct tc_ops tc_ops_htb;
312 static const struct tc_ops tc_ops_hfsc;
313 static const struct tc_ops tc_ops_default;
314 static const struct tc_ops tc_ops_other;
316 static const struct tc_ops *const tcs[] = {
317 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
318 &tc_ops_hfsc, /* Hierarchical fair service curve. */
319 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
320 &tc_ops_other, /* Some other qdisc. */
324 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
325 static unsigned int tc_get_major(unsigned int handle);
326 static unsigned int tc_get_minor(unsigned int handle);
328 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
329 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
330 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
332 static struct tcmsg *tc_make_request(const struct netdev *, int type,
333 unsigned int flags, struct ofpbuf *);
334 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
335 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
336 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
339 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
340 struct nlattr **options);
341 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
342 struct nlattr **options,
343 struct netdev_queue_stats *);
344 static int tc_query_class(const struct netdev *,
345 unsigned int handle, unsigned int parent,
346 struct ofpbuf **replyp);
347 static int tc_delete_class(const struct netdev *, unsigned int handle);
349 static int tc_del_qdisc(struct netdev *netdev);
350 static int tc_query_qdisc(const struct netdev *netdev);
352 static int tc_calc_cell_log(unsigned int mtu);
353 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
354 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
355 const struct tc_ratespec *rate);
356 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Device state kept by this file.  It embeds the generic 'struct netdev_dev'
 * and caches values fetched from the kernel together with the error code of
 * the call that fetched them.  The 'cache_valid' bitmap of VALID_* bits records
 * which cached members may currently be trusted. */
358 struct netdev_dev_linux {
359 struct netdev_dev netdev_dev;
361 struct shash_node *shash_node;
362 unsigned int cache_valid;
363 unsigned int change_seq;
365 bool miimon; /* Link status of last poll. */
366 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
367 struct timer miimon_timer;
369 /* The following are figured out "on demand" only. They are only valid
370 * when the corresponding VALID_* bit in 'cache_valid' is set. */
372 uint8_t etheraddr[ETH_ADDR_LEN];
373 struct in_addr address, netmask;
376 unsigned int ifi_flags;
377 long long int carrier_resets;
378 uint32_t kbits_rate; /* Policing data. */
379 uint32_t kbits_burst;
380 int vport_stats_error; /* Cached error code from vport_get_stats().
381 0 or an errno value. */
382 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
383 int ether_addr_error; /* Cached error code from set/get etheraddr. */
384 int netdev_policing_error; /* Cached error code from set policing. */
385 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
386 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
388 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
391 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
393 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
397 struct tap_state tap;
401 struct netdev_linux {
402 struct netdev netdev;
406 /* Sockets used for ioctl operations. */
407 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
409 /* A Netlink routing socket that is not subscribed to any multicast groups. */
410 static struct nl_sock *rtnl_sock;
412 /* This is set pretty low because we probably won't learn anything from the
413 * additional log messages. */
414 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
416 static int netdev_linux_init(void);
418 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
419 int cmd, const char *cmd_name);
420 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
421 const char *cmd_name);
422 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
423 int cmd, const char *cmd_name);
424 static int get_flags(const struct netdev_dev *, unsigned int *flags);
425 static int set_flags(const char *, unsigned int flags);
426 static int do_get_ifindex(const char *netdev_name);
427 static int get_ifindex(const struct netdev *, int *ifindexp);
428 static int do_set_addr(struct netdev *netdev,
429 int ioctl_nr, const char *ioctl_name,
430 struct in_addr addr);
431 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
432 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
433 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
434 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
435 static int af_packet_sock(void);
436 static void netdev_linux_miimon_run(void);
437 static void netdev_linux_miimon_wait(void);
/* Returns true if 'netdev_class' uses netdev_linux_init() as its 'init'
 * callback.  The cast helpers in this file use this test to check that a
 * device or netdev belongs to one of this file's classes. */
440 is_netdev_linux_class(const struct netdev_class *netdev_class)
442 return netdev_class->init == netdev_linux_init;
/* Converts the generic 'netdev_dev' into the 'struct netdev_dev_linux' that
 * embeds it.  Asserts that the device's class passes is_netdev_linux_class()
 * before applying CONTAINER_OF(). */
445 static struct netdev_dev_linux *
446 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
448 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
449 ovs_assert(is_netdev_linux_class(netdev_class));
451 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts the generic 'netdev' into the 'struct netdev_linux' that embeds it.
 * The class of the underlying device is looked up through netdev_get_dev() and
 * asserted to pass is_netdev_linux_class() before applying CONTAINER_OF(). */
454 static struct netdev_linux *
455 netdev_linux_cast(const struct netdev *netdev)
457 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
458 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
459 ovs_assert(is_netdev_linux_class(netdev_class));
461 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Sets up the file-level sockets used by the rest of this file: the
 * AF_INET/SOCK_DGRAM socket 'af_inet_sock' and the NETLINK_ROUTE socket
 * 'rtnl_sock'.  Each step stores its outcome in the static 'status' (0 or
 * 'errno' for socket(), the result of nl_sock_create() for the second step)
 * and logs an error when it fails.
 *
 * NOTE(review): 'status' is static and starts at -1, presumably so that the
 * setup runs only on the first call; the guarding test and the return
 * statement are not visible in this excerpt -- confirm. */
465 netdev_linux_init(void)
467 static int status = -1;
469 /* Create AF_INET socket. */
470 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
471 status = af_inet_sock >= 0 ? 0 : errno;
473 VLOG_ERR("failed to create inet socket: %s", strerror(status));
476 /* Create rtnetlink socket. */
478 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
480 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic work hook for this file: calls rtnetlink_link_run() and then
 * netdev_linux_miimon_run(), the MII polling pass defined in this file. */
489 netdev_linux_run(void)
491 rtnetlink_link_run();
492 netdev_linux_miimon_run();
/* Wait-side counterpart of netdev_linux_run(): calls rtnetlink_link_wait() and
 * then netdev_linux_miimon_wait(). */
496 netdev_linux_wait(void)
498 rtnetlink_link_wait();
499 netdev_linux_miimon_wait();
/* Records that something about 'dev' changed.
 *
 * Bumps 'carrier_resets' when IFF_RUNNING differs between the cached flags and
 * 'ifi_flags', stores 'ifi_flags' as the new cached flags, and clears every
 * 'cache_valid' bit that is not set in 'mask' (so a 'mask' of 0 invalidates
 * the whole cache).
 *
 * NOTE(review): the statements around the '!dev->change_seq' test are not
 * visible in this excerpt; presumably the sequence number is incremented and
 * 0 is skipped on wrap-around -- confirm. */
503 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
504 unsigned int ifi_flags,
508 if (!dev->change_seq) {
/* XOR isolates the flag bits that differ from the cached value. */
512 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
513 dev->carrier_resets++;
515 dev->ifi_flags = ifi_flags;
517 dev->cache_valid &= mask;
/* Applies the rtnetlink link notification 'change' to 'dev'.
 *
 * For RTM_NEWLINK the change is recorded while keeping only the VALID_DRVINFO
 * cache bit, and then the MTU, the Ethernet address (only if the reported
 * address is nonzero) and the ifindex are refreshed straight from the message:
 * each value is stored, its VALID_* bit is set and its cached error is
 * cleared.
 *
 * NOTE(review): the final netdev_dev_linux_changed() call with mask 0
 * presumably sits in an 'else' branch for other message types; the branch
 * keyword is not visible in this excerpt -- confirm. */
521 netdev_dev_linux_update(struct netdev_dev_linux *dev,
522 const struct rtnetlink_link_change *change)
524 if (change->nlmsg_type == RTM_NEWLINK) {
526 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
528 /* Update netdev from rtnl-change msg. */
530 dev->mtu = change->mtu;
531 dev->cache_valid |= VALID_MTU;
532 dev->netdev_mtu_error = 0;
535 if (!eth_addr_is_zero(change->addr)) {
536 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
537 dev->cache_valid |= VALID_ETHERADDR;
538 dev->ether_addr_error = 0;
541 dev->ifindex = change->ifi_index;
542 dev->cache_valid |= VALID_IFINDEX;
543 dev->get_ifindex_error = 0;
546 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
/* Callback handed to rtnetlink_link_notifier_create() by cache_notifier_ref().
 *
 * First part: looks up the device named 'change->ifname' and, if its class
 * passes is_netdev_linux_class(), applies the notification with
 * netdev_dev_linux_update().
 *
 * Second part: collects every device of 'netdev_linux_class' into a temporary
 * shash, re-reads each device's flags with get_flags() and records a change
 * with mask 0, which invalidates all of that device's cached values.
 *
 * NOTE(review): the condition that selects between the two parts is not
 * visible in this excerpt; presumably the second part runs when 'change' is
 * null -- confirm.  The return value of get_flags() is not checked in the
 * visible code. */
551 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
552 void *aux OVS_UNUSED)
554 struct netdev_dev_linux *dev;
556 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
558 const struct netdev_class *netdev_class =
559 netdev_dev_get_class(base_dev);
561 if (is_netdev_linux_class(netdev_class)) {
562 dev = netdev_dev_linux_cast(base_dev);
563 netdev_dev_linux_update(dev, change);
567 struct shash device_shash;
568 struct shash_node *node;
570 shash_init(&device_shash);
571 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
572 SHASH_FOR_EACH (node, &device_shash) {
577 get_flags(&dev->netdev_dev, &flags);
578 netdev_dev_linux_changed(dev, flags, 0);
580 shash_destroy(&device_shash);
/* Takes a reference on the shared rtnetlink link notifier.
 *
 * When the reference count is zero the notifier must not exist yet (asserted);
 * it is then created with netdev_linux_cache_cb() as its callback.  The count
 * is incremented afterwards.  The handling of a failed creation is not visible
 * in this excerpt. */
585 cache_notifier_ref(void)
587 if (!cache_notifier_refcount) {
588 ovs_assert(!netdev_linux_cache_notifier);
590 netdev_linux_cache_notifier =
591 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
593 if (!netdev_linux_cache_notifier) {
597 cache_notifier_refcount++;
/* Drops one reference on the shared rtnetlink link notifier.  The count must
 * be positive (asserted).  When it reaches zero the notifier is destroyed and
 * the global pointer is reset to NULL so that a later cache_notifier_ref() can
 * create it again. */
603 cache_notifier_unref(void)
605 ovs_assert(cache_notifier_refcount > 0);
606 if (!--cache_notifier_refcount) {
607 ovs_assert(netdev_linux_cache_notifier);
608 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
609 netdev_linux_cache_notifier = NULL;
613 /* Creates system and internal devices.
 *
 * Takes a reference on the shared cache notifier, allocates a zeroed
 * 'struct netdev_dev_linux' whose 'change_seq' starts at 1, initializes the
 * generic part with 'name' and 'class', primes the cached 'ifi_flags' through
 * get_flags() (its return value is not checked in the visible code), and
 * hands the new device back through '*netdev_devp'. */
615 netdev_linux_create(const struct netdev_class *class, const char *name,
616 struct netdev_dev **netdev_devp)
618 struct netdev_dev_linux *netdev_dev;
621 error = cache_notifier_ref();
626 netdev_dev = xzalloc(sizeof *netdev_dev);
627 netdev_dev->change_seq = 1;
628 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
629 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
631 *netdev_devp = &netdev_dev->netdev_dev;
635 /* For most types of netdevs we open the device for each call of
636 * netdev_open(). However, this is not the case with tap devices,
637 * since it is only possible to open the device once. In this
638 * situation we share a single file descriptor, and consequently
639 * buffers, across all readers. Therefore once data is read it will
640 * be unavailable to other reads for tap devices. */
/* Steps performed below: allocate the device, take a cache-notifier
 * reference, open "/dev/net/tun", bind the fd to a tap interface named 'name'
 * with TUNSETIFF (IFF_TAP | IFF_NO_PI), switch the fd to non-blocking mode,
 * and finally initialize the generic device with class 'netdev_tap_class' and
 * return it through '*netdev_devp'.
 *
 * NOTE(review): every failure after open() jumps to 'error_unref_notifier'.
 * Whether 'state->fd' is closed on those paths is not visible in this
 * excerpt -- confirm that the descriptor is not leaked. */
642 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
643 const char *name, struct netdev_dev **netdev_devp)
645 struct netdev_dev_linux *netdev_dev;
646 struct tap_state *state;
647 static const char tap_dev[] = "/dev/net/tun";
651 netdev_dev = xzalloc(sizeof *netdev_dev);
652 state = &netdev_dev->state.tap;
654 error = cache_notifier_ref();
659 /* Open tap device. */
660 state->fd = open(tap_dev, O_RDWR);
663 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
664 goto error_unref_notifier;
667 /* Create tap device. */
668 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
669 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
670 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
671 VLOG_WARN("%s: creating tap device failed: %s", name,
674 goto error_unref_notifier;
677 /* Make non-blocking. */
678 error = set_nonblocking(state->fd);
680 goto error_unref_notifier;
683 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
684 *netdev_devp = &netdev_dev->netdev_dev;
687 error_unref_notifier:
688 cache_notifier_unref();
/* Releases the tap-specific state of 'netdev_dev'.  Acts only when the shared
 * tap descriptor 'state.tap.fd' is valid (>= 0); the action itself is not
 * visible in this excerpt (presumably the descriptor is closed). */
695 destroy_tap(struct netdev_dev_linux *netdev_dev)
697 struct tap_state *state = &netdev_dev->state.tap;
699 if (state->fd >= 0) {
704 /* Destroys the netdev device 'netdev_dev_'.
 *
 * If the device has a traffic-control instance whose ops provide
 * 'tc_destroy', that hook is invoked.  Devices of class 'netdev_tap_class'
 * also get destroy_tap().  Finally the cache-notifier reference held by the
 * device is dropped. */
706 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
708 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
709 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
711 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
712 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
715 if (class == &netdev_tap_class) {
716 destroy_tap(netdev_dev);
720 cache_notifier_unref();
/* Opens a handle on 'netdev_dev_': allocates a zeroed 'struct netdev_linux',
 * initializes its generic part, and on success stores it in '*netdevp'.
 *
 * For every class except 'netdev_internal_class' the device's flags are read
 * as an existence check, with ENODEV singled out.  The last visible statement,
 * netdev_uninit(), belongs to a failure path whose label and surrounding
 * statements are not visible in this excerpt. */
724 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
726 struct netdev_linux *netdev;
727 enum netdev_flags flags;
730 /* Allocate network device. */
731 netdev = xzalloc(sizeof *netdev);
733 netdev_init(&netdev->netdev, netdev_dev_);
735 /* Verify that the device really exists, by attempting to read its flags.
736 * (The flags might be cached, in which case this won't actually do an
739 * Don't do this for "internal" netdevs, though, because those have to be
740 * created as netdev objects before they exist in the kernel, because
741 * creating them in the kernel happens by passing a netdev object to
742 * dpif_port_add(). */
743 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
744 error = netdev_get_flags(&netdev->netdev, &flags);
745 if (error == ENODEV) {
750 *netdevp = &netdev->netdev;
754 netdev_uninit(&netdev->netdev, true);
758 /* Closes and destroys 'netdev'.
 *
 * The descriptor test below is true only for devices whose type is not "tap"
 * (strcmp() returns nonzero), consistent with tap handles borrowing the
 * device-wide descriptor in netdev_linux_listen().
 *
 * NOTE(review): the test is 'fd > 0', whereas the other functions in this file
 * treat any 'fd >= 0' as valid and 'fd < 0' as "not listening".  A socket that
 * happens to be descriptor 0 would be skipped here -- confirm whether '>= 0'
 * was intended. */
760 netdev_linux_close(struct netdev *netdev_)
762 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
764 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
/* Prepares 'netdev_' for receiving packets.
 *
 * - A handle that already has a descriptor (fd >= 0) takes the first branch;
 *   its body is not visible in this excerpt.
 * - A "tap" handle whose device-wide descriptor has not been claimed yet takes
 *   over 'state.tap.fd' and marks it as opened.
 * - Otherwise a PF_PACKET/SOCK_RAW socket is created, made non-blocking, and
 *   bound to the device's ifindex with protocol ETH_P_ALL.
 *
 * Failures to create or bind the socket are logged with VLOG_ERR. */
771 netdev_linux_listen(struct netdev *netdev_)
773 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
774 struct netdev_dev_linux *netdev_dev =
775 netdev_dev_linux_cast(netdev_get_dev(netdev_));
776 struct sockaddr_ll sll;
781 if (netdev->fd >= 0) {
785 if (!strcmp(netdev_get_type(netdev_), "tap")
786 && !netdev_dev->state.tap.opened) {
787 netdev->fd = netdev_dev->state.tap.fd;
788 netdev_dev->state.tap.opened = true;
792 /* Create file descriptor. */
793 fd = socket(PF_PACKET, SOCK_RAW, 0);
796 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
800 /* Set non-blocking mode. */
801 error = set_nonblocking(fd);
806 /* Get ethernet device index. */
807 error = get_ifindex(&netdev->netdev, &ifindex);
812 /* Bind to specific ethernet device. */
813 memset(&sll, 0, sizeof sll);
814 sll.sll_family = AF_PACKET;
815 sll.sll_ifindex = ifindex;
816 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
817 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
819 VLOG_ERR("%s: failed to bind raw socket (%s)",
820 netdev_get_name(netdev_), strerror(error));
/* Receives one packet from 'netdev_' into the 'size'-byte buffer at 'data'.
 *
 * Devices of class 'netdev_tap_class' are read with read(); all others use
 * recv() with MSG_TRUNC, so that a packet larger than 'size' yields a return
 * value greater than 'size' and is reported as -EMSGSIZE.  On success the
 * return value is the number of bytes received.  EINTR bypasses the error
 * branch; any error other than EAGAIN is logged (rate-limited) with the
 * device name followed by the error text.
 *
 * A handle without a descriptor (fd < 0) is not listening and cannot
 * receive. */
835 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
837 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
839 if (netdev->fd < 0) {
840 /* Device is not listening. */
847 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
848 ? read(netdev->fd, data, size)
849 : recv(netdev->fd, data, size, MSG_TRUNC));
851 return retval <= size ? retval : -EMSGSIZE;
852 } else if (errno != EINTR) {
853 if (errno != EAGAIN) {
/* Argument order must match the format: device name, then error. */
854 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
855 netdev_get_name(netdev_), strerror(errno));
862 /* Registers with the poll loop to wake up from the next call to poll_block()
863 * when a packet is ready to be received with netdev_recv() on 'netdev'.
 *
 * Nothing is registered for a handle without a descriptor (fd < 0). */
865 netdev_linux_recv_wait(struct netdev *netdev_)
867 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
868 if (netdev->fd >= 0) {
869 poll_fd_wait(netdev->fd, POLLIN);
873 /* Discards all packets waiting to be received from 'netdev'.
 *
 * A handle without a descriptor takes the first branch (body not visible in
 * this excerpt).  For "tap" devices the transmit queue length is fetched with
 * SIOCGIFTXQLEN and passed to drain_fd() as its second argument.  Any other
 * device has its socket receive buffer drained with drain_rcvbuf(). */
875 netdev_linux_drain(struct netdev *netdev_)
877 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
878 if (netdev->fd < 0) {
880 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
882 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
883 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
887 drain_fd(netdev->fd, ifr.ifr_qlen);
890 return drain_rcvbuf(netdev->fd);
894 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
895 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
896 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
897 * the packet is too big or too small to transmit on the device.
899 * The caller retains ownership of 'buffer' in all cases.
901 * The kernel maintains a packet transmission queue, so the caller is not
902 * expected to do additional queuing of packets. */
/* Two transmit paths exist.  A handle without its own descriptor (fd < 0)
 * sends through the socket returned by af_packet_sock(), using sendmsg() with
 * a 'sockaddr_ll' that names the device by ifindex.  A handle with a
 * descriptor writes the packet directly to it. */
904 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
906 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
910 if (netdev->fd < 0) {
911 /* Use our AF_PACKET socket to send to this device. */
912 struct sockaddr_ll sll;
919 sock = af_packet_sock();
924 error = get_ifindex(netdev_, &ifindex);
929 /* We don't bother setting most fields in sockaddr_ll because the
930 * kernel ignores them for SOCK_RAW. */
931 memset(&sll, 0, sizeof sll);
932 sll.sll_family = AF_PACKET;
933 sll.sll_ifindex = ifindex;
935 iov.iov_base = CONST_CAST(void *, data);
939 msg.msg_namelen = sizeof sll;
942 msg.msg_control = NULL;
943 msg.msg_controllen = 0;
946 retval = sendmsg(sock, &msg, 0);
948 /* Use the netdev's own fd to send to this device. This is
949 * essential for tap devices, because packets sent to a tap device
950 * with an AF_PACKET socket will loop back to be *received* again
951 * on the tap device. */
952 retval = write(netdev->fd, data, size);
956 /* The Linux AF_PACKET implementation never blocks waiting for room
957 * for packets, instead returning ENOBUFS. Translate this into
958 * EAGAIN for the caller. */
959 if (errno == ENOBUFS) {
961 } else if (errno == EINTR) {
963 } else if (errno != EAGAIN) {
964 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
965 netdev_get_name(netdev_), strerror(errno));
/* A short send is logged here. */
968 } else if (retval != size) {
969 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
970 "%zu) on %s", retval, size, netdev_get_name(netdev_));
978 /* Registers with the poll loop to wake up from the next call to poll_block()
979 * when the packet transmission queue has sufficient room to transmit a packet
980 * with netdev_send().
982 * The kernel maintains a packet transmission queue, so the client is not
983 * expected to do additional queuing of packets. Thus, this function is
984 * unlikely to ever be used. It is included for completeness. */
/* Handles without a descriptor take the first branch (body not visible in this
 * excerpt).  Non-tap devices wait for POLLOUT on their descriptor; tap devices
 * request an immediate wake-up instead. */
986 netdev_linux_send_wait(struct netdev *netdev_)
988 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
989 if (netdev->fd < 0) {
991 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
992 poll_fd_wait(netdev->fd, POLLOUT);
994 /* TAP device always accepts packets.*/
995 poll_immediate_wake();
999 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1000 * otherwise a positive errno value. */
/* Cache handling: when the cached address is valid, a cached error is
 * returned as is, and a request for the address that is already cached is
 * detected by eth_addr_equals(); otherwise the cache bit is cleared before
 * the address is changed.  The outcome is cached again only when the call
 * succeeded or failed with ENODEV.
 *
 * A "tap" device that is currently up is brought down first; the saved flags
 * in 'sf' are handed to netdev_restore_flags() afterwards. */
1002 netdev_linux_set_etheraddr(struct netdev *netdev_,
1003 const uint8_t mac[ETH_ADDR_LEN])
1005 struct netdev_dev_linux *netdev_dev =
1006 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1007 struct netdev_saved_flags *sf = NULL;
1010 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1011 if (netdev_dev->ether_addr_error) {
1012 return netdev_dev->ether_addr_error;
1014 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
1017 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1020 /* Tap devices must be brought down before setting the address. */
1021 if (!strcmp(netdev_get_type(netdev_), "tap")) {
1022 enum netdev_flags flags;
1024 if (!netdev_get_flags(netdev_, &flags) && (flags & NETDEV_UP)) {
1025 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
1028 error = set_etheraddr(netdev_get_name(netdev_), mac);
1029 if (!error || error == ENODEV) {
1030 netdev_dev->ether_addr_error = error;
1031 netdev_dev->cache_valid |= VALID_ETHERADDR;
1033 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1037 netdev_restore_flags(sf);
1042 /* Copies 'netdev''s MAC address to 'mac' which is passed as param.
 *
 * The address is fetched with get_etheraddr() only when the VALID_ETHERADDR
 * cache bit is clear; the error code of that fetch is cached alongside the
 * address.  'mac' is written only when the cached error is 0.  Returns the
 * cached error code. */
1044 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1045 uint8_t mac[ETH_ADDR_LEN])
1047 struct netdev_dev_linux *netdev_dev =
1048 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1050 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1051 int error = get_etheraddr(netdev_get_name(netdev_),
1052 netdev_dev->etheraddr);
1054 netdev_dev->ether_addr_error = error;
1055 netdev_dev->cache_valid |= VALID_ETHERADDR;
1058 if (!netdev_dev->ether_addr_error) {
1059 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1062 return netdev_dev->ether_addr_error;
1065 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1066 * in bytes, not including the hardware header; thus, this is typically 1500
1067 * bytes for Ethernet devices. */
/* The value is fetched with SIOCGIFMTU only when the VALID_MTU cache bit is
 * clear; both the MTU and the ioctl's error code are then cached.  '*mtup' is
 * written only when the cached error is 0, and the cached error code is the
 * return value. */
1069 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1071 struct netdev_dev_linux *netdev_dev =
1072 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1073 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1077 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1078 SIOCGIFMTU, "SIOCGIFMTU");
1080 netdev_dev->netdev_mtu_error = error;
1081 netdev_dev->mtu = ifr.ifr_mtu;
1082 netdev_dev->cache_valid |= VALID_MTU;
1085 if (!netdev_dev->netdev_mtu_error) {
1086 *mtup = netdev_dev->mtu;
1088 return netdev_dev->netdev_mtu_error;
1091 /* Sets the MTU (the maximum size of transmitted packets) of the given device
1092 * using the Linux networking ioctl interface. */
/* Changes the MTU of 'netdev_' to 'mtu' with the SIOCSIFMTU ioctl.
 *
 * When the cached MTU is valid, a cached error is returned as is, and a
 * request for the value that is already cached is detected before any ioctl
 * is issued; otherwise the VALID_MTU bit is cleared first.  The result is
 * cached again only when the ioctl succeeded or failed with ENODEV. */
1095 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1097 struct netdev_dev_linux *netdev_dev =
1098 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1102 if (netdev_dev->cache_valid & VALID_MTU) {
1103 if (netdev_dev->netdev_mtu_error) {
1104 return netdev_dev->netdev_mtu_error;
1106 if (netdev_dev->mtu == mtu) {
1109 netdev_dev->cache_valid &= ~VALID_MTU;
1112 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1113 SIOCSIFMTU, "SIOCSIFMTU");
1114 if (!error || error == ENODEV) {
1115 netdev_dev->netdev_mtu_error = error;
1116 netdev_dev->mtu = ifr.ifr_mtu;
1117 netdev_dev->cache_valid |= VALID_MTU;
1122 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1123 * On failure, returns a negative errno value.
 *
 * Thin wrapper around get_ifindex(), whose positive error code is negated so
 * that it cannot be mistaken for an index. */
1125 netdev_linux_get_ifindex(const struct netdev *netdev)
1129 error = get_ifindex(netdev, &ifindex);
1130 return error ? -error : ifindex;
/* Stores the carrier state of 'netdev_' in '*carrier'.
 *
 * When MII monitoring is enabled for the device (miimon_interval > 0) the
 * result of the last MII poll is reported; otherwise the IFF_RUNNING bit of
 * the cached interface flags is used. */
1134 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1136 struct netdev_dev_linux *netdev_dev =
1137 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1139 if (netdev_dev->miimon_interval > 0) {
1140 *carrier = netdev_dev->miimon;
1142 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the 'carrier_resets' counter of the device behind 'netdev'.  The
 * counter is incremented by netdev_dev_linux_changed() each time IFF_RUNNING
 * flips. */
1148 static long long int
1149 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1151 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
/* Issues the MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name'.
 *
 * '*data' is copied into the 'ifr_data' member of a zeroed 'struct ifreq'
 * before the call and copied back out of it afterwards, so the caller sees
 * whatever the ioctl wrote.  The copy back is unconditional in the visible
 * code. */
1155 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1156 struct mii_ioctl_data *data)
1161 memset(&ifr, 0, sizeof ifr);
1162 memcpy(&ifr.ifr_data, data, sizeof *data);
1163 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1164 memcpy(data, &ifr.ifr_data, sizeof *data);
// Determines the link status of device 'name' and stores it in '*miimon'.
// MII is tried first (SIOCGMIIPHY, then SIOCGMIIREG on register MII_BMSR);
// an ethtool ETHTOOL_GLINK query serves as the fallback, as the log messages
// below state.
1170 netdev_linux_get_miimon(const char *name, bool *miimon)
1172 struct mii_ioctl_data data;
1177 memset(&data, 0, sizeof data);
1178 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1180 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1181 data.reg_num = MII_BMSR;
1182 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
// Link is reported up when BMSR_LSTATUS is set in the register value read.
1186 *miimon = !!(data.val_out & BMSR_LSTATUS);
1188 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1191 struct ethtool_cmd ecmd;
1193 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1196 COVERAGE_INC(netdev_get_ethtool);
1197 memset(&ecmd, 0, sizeof ecmd);
1198 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1201 struct ethtool_value eval;
// The ETHTOOL_GLINK reply was received in 'ecmd'; its leading bytes are
// copied into an ethtool_value so that the 'data' member can be read.
1203 memcpy(&eval, &ecmd, sizeof eval);
1204 *miimon = !!eval.data;
1206 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
// Sets the MII polling interval of 'netdev_'.  A positive 'interval' is
// raised to at least 100; a non-positive one becomes 0.  When the stored
// interval actually changes, the miimon timer is marked as expired.
1214 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1215 long long int interval)
1217 struct netdev_dev_linux *netdev_dev;
1219 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1221 interval = interval > 0 ? MAX(interval, 100) : 0;
1222 if (netdev_dev->miimon_interval != interval) {
1223 netdev_dev->miimon_interval = interval;
1224 timer_set_expired(&netdev_dev->miimon_timer);
// Walks every device of netdev_linux_class, re-queries its link status with
// netdev_linux_get_miimon(), reports a change through
// netdev_dev_linux_changed() when the result differs from the stored one,
// and re-arms the device's miimon timer with its interval.
1231 netdev_linux_miimon_run(void)
1233 struct shash device_shash;
1234 struct shash_node *node;
1236 shash_init(&device_shash);
1237 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1238 SHASH_FOR_EACH (node, &device_shash) {
1239 struct netdev_dev_linux *dev = node->data;
// Matches devices with monitoring disabled or whose timer has not expired
// yet; NOTE(review): presumably these are skipped -- confirm.
1242 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1246 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1247 if (miimon != dev->miimon) {
1248 dev->miimon = miimon;
1249 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1252 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1255 shash_destroy(&device_shash);
// Calls timer_wait() on the miimon timer of every netdev_linux_class device
// that has a positive miimon_interval.
1259 netdev_linux_miimon_wait(void)
1261 struct shash device_shash;
1262 struct shash_node *node;
1264 shash_init(&device_shash);
1265 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1266 SHASH_FOR_EACH (node, &device_shash) {
1267 struct netdev_dev_linux *dev = node->data;
1269 if (dev->miimon_interval > 0) {
1270 timer_wait(&dev->miimon_timer);
1273 shash_destroy(&device_shash);
1276 /* Checks whether we can use RTM_GETLINK to get network device statistics.
1277 * In pre-2.6.19 kernels, this was only available if wireless extensions were
 * enabled. */
// Probes the "lo" device: looks up its ifindex and tries
// get_stats_via_netlink() on it, logging which statistics source (rtnetlink
// or proc) will be used as a result.
1280 check_for_working_netlink_stats(void)
1282 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1283 * preferable, so if that works, we'll use it. */
1284 int ifindex = do_get_ifindex("lo");
1286 VLOG_WARN("failed to get ifindex for lo, "
1287 "obtaining netdev stats from proc");
1290 struct netdev_stats stats;
1291 int error = get_stats_via_netlink(ifindex, &stats);
1293 VLOG_DBG("obtaining netdev stats via rtnetlink");
1296 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1297 "via proc (you are probably running a pre-2.6.19 "
1298 "kernel)", strerror(error));
// NOTE(review): presumably exchanges the values stored at 'a' and 'b';
// netdev_tap_get_stats() passes rx/tx counter pairs to it.  Confirm against
// the function body.
1305 swap_uint64(uint64_t *a, uint64_t *b)
1312 /* Copies 'src' into 'dst', performing format conversion in the process.
1314 * 'src' is allowed to be misaligned. */
// Eight packet/byte/error/drop counters are read from 'src' through
// get_unaligned_u64(), which is what permits a misaligned 'src'.
1316 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1317 const struct ovs_vport_stats *src)
1319 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1320 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1321 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1322 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1323 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1324 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1325 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1326 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
// The counters below are not taken from 'src'; they are set to zero.
1328 dst->collisions = 0;
1329 dst->rx_length_errors = 0;
1330 dst->rx_over_errors = 0;
1331 dst->rx_crc_errors = 0;
1332 dst->rx_frame_errors = 0;
1333 dst->rx_fifo_errors = 0;
1334 dst->rx_missed_errors = 0;
1335 dst->tx_aborted_errors = 0;
1336 dst->tx_carrier_errors = 0;
1337 dst->tx_fifo_errors = 0;
1338 dst->tx_heartbeat_errors = 0;
1339 dst->tx_window_errors = 0;
// Fetches the vport record named after 'netdev' with dpif_linux_vport_get()
// and converts 'reply.stats' into '*stats'.  A reply that carries no stats
// is tested for separately before the conversion.
1343 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1345 struct dpif_linux_vport reply;
1349 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1352 } else if (!reply.stats) {
1357 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
// Queries vport statistics for 'netdev_' and records the outcome in
// 'vport_stats_error' together with the VALID_VPORT_STAT_ERROR cache bit.
1365 get_stats_via_vport(const struct netdev *netdev_,
1366 struct netdev_stats *stats)
1368 struct netdev_dev_linux *netdev_dev =
1369 netdev_dev_linux_cast(netdev_get_dev(netdev_));
// The query runs when no error is recorded, or when the recorded error has
// not been marked valid in the cache.
1371 if (!netdev_dev->vport_stats_error ||
1372 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1375 error = get_stats_via_vport__(netdev_, stats);
// ENOENT is not logged.
1376 if (error && error != ENOENT) {
1377 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1378 "(%s)", netdev_get_name(netdev_), strerror(error));
1380 netdev_dev->vport_stats_error = error;
1381 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
// Obtains statistics for 'netdev_' from the kernel: through rtnetlink (by
// ifindex) when netlink stats are usable, through get_stats_via_proc()
// otherwise.  A failure is logged with its numeric error code.
1386 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1387 struct netdev_stats *stats)
// Function-local static: -1 means "not probed yet"; the probe result is
// stored on the first call and reused afterwards.
1389 static int use_netlink_stats = -1;
1392 if (use_netlink_stats < 0) {
1393 use_netlink_stats = check_for_working_netlink_stats();
1396 if (use_netlink_stats) {
1399 error = get_ifindex(netdev_, &ifindex);
1401 error = get_stats_via_netlink(ifindex, stats);
1404 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1408 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1409 netdev_get_name(netdev_), error);
1415 /* Retrieves current device stats for 'netdev-linux'. */
// Gathers statistics from two sources: the vport layer (into 'stats') and
// the kernel (into the local 'dev_stats'), then combines them depending on
// whether the vport query recorded an error.
1417 netdev_linux_get_stats(const struct netdev *netdev_,
1418 struct netdev_stats *stats)
1420 struct netdev_dev_linux *netdev_dev =
1421 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1422 struct netdev_stats dev_stats;
1425 get_stats_via_vport(netdev_, stats);
1427 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1430 if (netdev_dev->vport_stats_error) {
1437 if (netdev_dev->vport_stats_error) {
1438 /* stats not available from OVS then use ioctl stats. */
// The error, drop, multicast and collision counters below are accumulated
// from 'dev_stats' on top of what is already in 'stats'.
1441 stats->rx_errors += dev_stats.rx_errors;
1442 stats->tx_errors += dev_stats.tx_errors;
1443 stats->rx_dropped += dev_stats.rx_dropped;
1444 stats->tx_dropped += dev_stats.tx_dropped;
1445 stats->multicast += dev_stats.multicast;
1446 stats->collisions += dev_stats.collisions;
1447 stats->rx_length_errors += dev_stats.rx_length_errors;
1448 stats->rx_over_errors += dev_stats.rx_over_errors;
1449 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1450 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1451 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1452 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1453 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1454 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1455 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1456 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1457 stats->tx_window_errors += dev_stats.tx_window_errors;
1462 /* Retrieves current device stats for 'netdev-tap' netdev or
1463 * netdev-internal. */
// Gathers statistics from the vport layer (into 'stats') and from the kernel
// (into the local 'dev_stats') and combines them with the rx/tx direction
// adjusted as explained in the comment inside the function.
1465 netdev_tap_get_stats(const struct netdev *netdev_,
1466 struct netdev_stats *stats)
1468 struct netdev_dev_linux *netdev_dev =
1469 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1470 struct netdev_stats dev_stats;
1473 get_stats_via_vport(netdev_, stats);
1475 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1477 if (netdev_dev->vport_stats_error) {
1484 /* If this port is an internal port then the transmit and receive stats
1485 * will appear to be swapped relative to the other ports since we are the
1486 * one sending the data, not a remote computer. For consistency, we swap
1487 * them back here. This does not apply if we are getting stats from the
1488 * vport layer because it always tracks stats from the perspective of the
1490 if (netdev_dev->vport_stats_error) {
// The four rx/tx pairs are exchanged and the detailed error counters are
// zeroed.
1492 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1493 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1494 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1495 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1496 stats->rx_length_errors = 0;
1497 stats->rx_over_errors = 0;
1498 stats->rx_crc_errors = 0;
1499 stats->rx_frame_errors = 0;
1500 stats->rx_fifo_errors = 0;
1501 stats->rx_missed_errors = 0;
1502 stats->tx_aborted_errors = 0;
1503 stats->tx_carrier_errors = 0;
1504 stats->tx_fifo_errors = 0;
1505 stats->tx_heartbeat_errors = 0;
1506 stats->tx_window_errors = 0;
// Kernel drop and error counters are added crosswise: the device's tx
// counters go into the rx totals and vice versa.
1508 stats->rx_dropped += dev_stats.tx_dropped;
1509 stats->tx_dropped += dev_stats.rx_dropped;
1511 stats->rx_errors += dev_stats.tx_errors;
1512 stats->tx_errors += dev_stats.rx_errors;
1514 stats->multicast += dev_stats.multicast;
1515 stats->collisions += dev_stats.collisions;
// Fills '*stats' from the vport layer only and returns the vport stats error
// recorded by get_stats_via_vport().
1521 netdev_internal_get_stats(const struct netdev *netdev_,
1522 struct netdev_stats *stats)
1524 struct netdev_dev_linux *netdev_dev =
1525 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1527 get_stats_via_vport(netdev_, stats);
1528 return netdev_dev->vport_stats_error;
// Pushes eight packet/byte/error/drop counters from 'stats' to the vport
// named after 'netdev' using an OVS_VPORT_CMD_SET transaction.
1532 netdev_internal_set_stats(struct netdev *netdev,
1533 const struct netdev_stats *stats)
1535 struct ovs_vport_stats vport_stats;
1536 struct dpif_linux_vport vport;
1539 vport_stats.rx_packets = stats->rx_packets;
1540 vport_stats.tx_packets = stats->tx_packets;
1541 vport_stats.rx_bytes = stats->rx_bytes;
1542 vport_stats.tx_bytes = stats->tx_bytes;
1543 vport_stats.rx_errors = stats->rx_errors;
1544 vport_stats.tx_errors = stats->tx_errors;
1545 vport_stats.rx_dropped = stats->rx_dropped;
1546 vport_stats.tx_dropped = stats->tx_dropped;
1548 dpif_linux_vport_init(&vport);
1549 vport.cmd = OVS_VPORT_CMD_SET;
1550 vport.name = netdev_get_name(netdev);
1551 vport.stats = &vport_stats;
1553 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1555 /* If the vport layer doesn't know about the device, that doesn't mean it
1556 * doesn't exist (after all, we were able to open it when netdev_open() was
1557 * called), it just means that it isn't attached and we'll be getting
1558 * stats a different way. */
1559 if (err == ENODEV) {
// Refreshes the feature information cached in 'netdev_dev'.  An
// ETHTOOL_GSET query supplies 'ecmd'; its 'supported' and 'advertising'
// bitmasks are translated bit by bit into NETDEV_F_* values, 'current' is
// derived from speed, duplex and port, and the outcome is recorded in
// 'get_features_error' with VALID_FEATURES set.
1567 netdev_linux_read_features(struct netdev_dev_linux *netdev_dev)
1569 struct ethtool_cmd ecmd;
// NOTE(review): presumably returns early when the cache is already valid.
1573 if (netdev_dev->cache_valid & VALID_FEATURES) {
1577 COVERAGE_INC(netdev_get_ethtool);
1578 memset(&ecmd, 0, sizeof ecmd);
1579 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name, &ecmd,
1580 ETHTOOL_GSET, "ETHTOOL_GSET");
1585 /* Supported features. */
1586 netdev_dev->supported = 0;
1587 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1588 netdev_dev->supported |= NETDEV_F_10MB_HD;
1590 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1591 netdev_dev->supported |= NETDEV_F_10MB_FD;
1593 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1594 netdev_dev->supported |= NETDEV_F_100MB_HD;
1596 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1597 netdev_dev->supported |= NETDEV_F_100MB_FD;
1599 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1600 netdev_dev->supported |= NETDEV_F_1GB_HD;
1602 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1603 netdev_dev->supported |= NETDEV_F_1GB_FD;
1605 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1606 netdev_dev->supported |= NETDEV_F_10GB_FD;
1608 if (ecmd.supported & SUPPORTED_TP) {
1609 netdev_dev->supported |= NETDEV_F_COPPER;
1611 if (ecmd.supported & SUPPORTED_FIBRE) {
1612 netdev_dev->supported |= NETDEV_F_FIBER;
1614 if (ecmd.supported & SUPPORTED_Autoneg) {
1615 netdev_dev->supported |= NETDEV_F_AUTONEG;
1617 if (ecmd.supported & SUPPORTED_Pause) {
1618 netdev_dev->supported |= NETDEV_F_PAUSE;
1620 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1621 netdev_dev->supported |= NETDEV_F_PAUSE_ASYM;
1624 /* Advertised features. */
1625 netdev_dev->advertised = 0;
1626 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1627 netdev_dev->advertised |= NETDEV_F_10MB_HD;
1629 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1630 netdev_dev->advertised |= NETDEV_F_10MB_FD;
1632 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1633 netdev_dev->advertised |= NETDEV_F_100MB_HD;
1635 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1636 netdev_dev->advertised |= NETDEV_F_100MB_FD;
1638 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1639 netdev_dev->advertised |= NETDEV_F_1GB_HD;
1641 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1642 netdev_dev->advertised |= NETDEV_F_1GB_FD;
1644 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1645 netdev_dev->advertised |= NETDEV_F_10GB_FD;
1647 if (ecmd.advertising & ADVERTISED_TP) {
1648 netdev_dev->advertised |= NETDEV_F_COPPER;
1650 if (ecmd.advertising & ADVERTISED_FIBRE) {
1651 netdev_dev->advertised |= NETDEV_F_FIBER;
1653 if (ecmd.advertising & ADVERTISED_Autoneg) {
1654 netdev_dev->advertised |= NETDEV_F_AUTONEG;
1656 if (ecmd.advertising & ADVERTISED_Pause) {
1657 netdev_dev->advertised |= NETDEV_F_PAUSE;
1659 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1660 netdev_dev->advertised |= NETDEV_F_PAUSE_ASYM;
1663 /* Current settings. */
// Up to 1 Gb/s a non-zero 'ecmd.duplex' selects the full-duplex bit and
// zero the half-duplex bit; 10 Gb/s and above map to full duplex only.
1665 if (speed == SPEED_10) {
1666 netdev_dev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1667 } else if (speed == SPEED_100) {
1668 netdev_dev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1669 } else if (speed == SPEED_1000) {
1670 netdev_dev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1671 } else if (speed == SPEED_10000) {
1672 netdev_dev->current = NETDEV_F_10GB_FD;
// The next three speeds are compared against literal numbers instead of
// SPEED_* macros (40000 pairs with NETDEV_F_40GB_FD, so the unit appears to
// be Mb/s).
1673 } else if (speed == 40000) {
1674 netdev_dev->current = NETDEV_F_40GB_FD;
1675 } else if (speed == 100000) {
1676 netdev_dev->current = NETDEV_F_100GB_FD;
1677 } else if (speed == 1000000) {
1678 netdev_dev->current = NETDEV_F_1TB_FD;
1680 netdev_dev->current = 0;
1683 if (ecmd.port == PORT_TP) {
1684 netdev_dev->current |= NETDEV_F_COPPER;
1685 } else if (ecmd.port == PORT_FIBRE) {
1686 netdev_dev->current |= NETDEV_F_FIBER;
1690 netdev_dev->current |= NETDEV_F_AUTONEG;
1693 /* Peer advertisements. */
1694 netdev_dev->peer = 0; /* XXX */
// The error from the ethtool query is stored alongside the cache bit, so a
// failed query is cached as well.
1697 netdev_dev->cache_valid |= VALID_FEATURES;
1698 netdev_dev->get_features_error = error;
1701 /* Stores the features supported by 'netdev' into each of '*current',
1702 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1703 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1706 netdev_linux_get_features(const struct netdev *netdev_,
1707 enum netdev_features *current,
1708 enum netdev_features *advertised,
1709 enum netdev_features *supported,
1710 enum netdev_features *peer)
1712 struct netdev_dev_linux *netdev_dev =
1713 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1715 netdev_linux_read_features(netdev_dev);
1717 if (!netdev_dev->get_features_error) {
1718 *current = netdev_dev->current;
1719 *advertised = netdev_dev->advertised;
1720 *supported = netdev_dev->supported;
1721 *peer = netdev_dev->peer;
1723 return netdev_dev->get_features_error;
1726 /* Set the features advertised by 'netdev' to 'advertise'. */
// Reads the device's current ethtool settings with ETHTOOL_GSET, rebuilds
// 'ecmd.advertising' from scratch out of the NETDEV_F_* bits in 'advertise',
// and writes the settings back with ETHTOOL_SSET, whose result is returned.
1728 netdev_linux_set_advertisements(struct netdev *netdev,
1729 enum netdev_features advertise)
1731 struct ethtool_cmd ecmd;
1734 COVERAGE_INC(netdev_get_ethtool);
1735 memset(&ecmd, 0, sizeof ecmd);
1736 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1737 ETHTOOL_GSET, "ETHTOOL_GSET");
// Only the NETDEV_F_* bits tested below are translated; any other bit in
// 'advertise' has no effect on 'ecmd.advertising'.
1742 ecmd.advertising = 0;
1743 if (advertise & NETDEV_F_10MB_HD) {
1744 ecmd.advertising |= ADVERTISED_10baseT_Half;
1746 if (advertise & NETDEV_F_10MB_FD) {
1747 ecmd.advertising |= ADVERTISED_10baseT_Full;
1749 if (advertise & NETDEV_F_100MB_HD) {
1750 ecmd.advertising |= ADVERTISED_100baseT_Half;
1752 if (advertise & NETDEV_F_100MB_FD) {
1753 ecmd.advertising |= ADVERTISED_100baseT_Full;
1755 if (advertise & NETDEV_F_1GB_HD) {
1756 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1758 if (advertise & NETDEV_F_1GB_FD) {
1759 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1761 if (advertise & NETDEV_F_10GB_FD) {
1762 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1764 if (advertise & NETDEV_F_COPPER) {
1765 ecmd.advertising |= ADVERTISED_TP;
1767 if (advertise & NETDEV_F_FIBER) {
1768 ecmd.advertising |= ADVERTISED_FIBRE;
1770 if (advertise & NETDEV_F_AUTONEG) {
1771 ecmd.advertising |= ADVERTISED_Autoneg;
1773 if (advertise & NETDEV_F_PAUSE) {
1774 ecmd.advertising |= ADVERTISED_Pause;
1776 if (advertise & NETDEV_F_PAUSE_ASYM) {
1777 ecmd.advertising |= ADVERTISED_Asym_Pause;
1779 COVERAGE_INC(netdev_set_ethtool);
1780 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1781 ETHTOOL_SSET, "ETHTOOL_SSET");
1784 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1785 * successful, otherwise a positive errno value. */
// Configures ingress policing on 'netdev' at 'kbits_rate' with burst
// 'kbits_burst'.  The function consults the policing cache first, then
// removes any existing ingress qdisc, installs a new qdisc and policer, and
// finally stores the settings and outcome back into the cache.
1787 netdev_linux_set_policing(struct netdev *netdev,
1788 uint32_t kbits_rate, uint32_t kbits_burst)
1790 struct netdev_dev_linux *netdev_dev =
1791 netdev_dev_linux_cast(netdev_get_dev(netdev));
1792 const char *netdev_name = netdev_get_name(netdev);
1796 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1797 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1798 : kbits_burst); /* Stick with user-specified value. */
// With a valid cache entry, a previously recorded error is returned as-is.
1800 if (netdev_dev->cache_valid & VALID_POLICING) {
1801 if (netdev_dev->netdev_policing_error) {
1802 return netdev_dev->netdev_policing_error;
1805 if (netdev_dev->kbits_rate == kbits_rate &&
1806 netdev_dev->kbits_burst == kbits_burst) {
1807 /* Assume that settings haven't changed since we last set them. */
1810 netdev_dev->cache_valid &= ~VALID_POLICING;
1813 COVERAGE_INC(netdev_set_policing);
1814 /* Remove any existing ingress qdisc. */
1815 error = tc_add_del_ingress_qdisc(netdev, false);
1817 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1818 netdev_name, strerror(error));
1823 error = tc_add_del_ingress_qdisc(netdev, true);
1825 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1826 netdev_name, strerror(error));
1830 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1832 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1833 netdev_name, strerror(error));
1838 netdev_dev->kbits_rate = kbits_rate;
1839 netdev_dev->kbits_burst = kbits_burst;
// Only success and ENODEV are recorded in the cache.
1842 if (!error || error == ENODEV) {
1843 netdev_dev->netdev_policing_error = error;
1844 netdev_dev->cache_valid |= VALID_POLICING;
// Adds to 'types' the 'ovs_name' of every entry in the null-terminated 'tcs'
// array that has a 'tc_install' callback and a non-empty 'ovs_name'.
1850 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1853 const struct tc_ops *const *opsp;
1855 for (opsp = tcs; *opsp != NULL; opsp++) {
1856 const struct tc_ops *ops = *opsp;
1857 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1858 sset_add(types, ops->ovs_name);
// Scans the null-terminated 'tcs' array for an entry whose 'ovs_name' equals
// 'name'.
1864 static const struct tc_ops *
1865 tc_lookup_ovs_name(const char *name)
1867 const struct tc_ops *const *opsp;
1869 for (opsp = tcs; *opsp != NULL; opsp++) {
1870 const struct tc_ops *ops = *opsp;
1871 if (!strcmp(name, ops->ovs_name)) {
// Scans the null-terminated 'tcs' array for an entry whose 'linux_name' is
// non-null and equals 'name'.
1878 static const struct tc_ops *
1879 tc_lookup_linux_name(const char *name)
1881 const struct tc_ops *const *opsp;
1883 for (opsp = tcs; *opsp != NULL; opsp++) {
1884 const struct tc_ops *ops = *opsp;
1885 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
// Searches the 'hash' bucket of the device's 'tc->queues' map for a queue
// whose 'queue_id' matches.
1892 static struct tc_queue *
1893 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1896 struct netdev_dev_linux *netdev_dev =
1897 netdev_dev_linux_cast(netdev_get_dev(netdev));
1898 struct tc_queue *queue;
1900 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1901 if (queue->queue_id == queue_id) {
// Convenience wrapper around tc_find_queue__() that computes the hash as
// hash_int(queue_id, 0).
1908 static struct tc_queue *
1909 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1911 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
// Looks up the tc_ops for QoS 'type' by its OVS name and reports that
// implementation's 'n_queues' in 'caps'.
1915 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1917 struct netdev_qos_capabilities *caps)
1919 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1923 caps->n_queues = ops->n_queues;
// Queries the qdisc of 'netdev', stores the OVS name of its tc_ops in
// '*typep', and calls the ops' 'qdisc_get' callback when one is provided.
1928 netdev_linux_get_qos(const struct netdev *netdev,
1929 const char **typep, struct smap *details)
1931 struct netdev_dev_linux *netdev_dev =
1932 netdev_dev_linux_cast(netdev_get_dev(netdev));
1935 error = tc_query_qdisc(netdev);
1940 *typep = netdev_dev->tc->ops->ovs_name;
1941 return (netdev_dev->tc->ops->qdisc_get
1942 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
// Configures QoS 'type' with 'details' on 'netdev'.  If the requested type is
// the one already in use, the existing qdisc is reconfigured through
// 'qdisc_set' (0 is returned when there is no such callback).  The remaining
// statements delete the existing qdisc and install the new one.
1947 netdev_linux_set_qos(struct netdev *netdev,
1948 const char *type, const struct smap *details)
1950 struct netdev_dev_linux *netdev_dev =
1951 netdev_dev_linux_cast(netdev_get_dev(netdev));
1952 const struct tc_ops *new_ops;
// Types that are unknown or cannot be installed are tested for up front.
1955 new_ops = tc_lookup_ovs_name(type);
1956 if (!new_ops || !new_ops->tc_install) {
1960 error = tc_query_qdisc(netdev);
1965 if (new_ops == netdev_dev->tc->ops) {
1966 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1968 /* Delete existing qdisc. */
1969 error = tc_del_qdisc(netdev);
1973 ovs_assert(netdev_dev->tc == NULL);
1975 /* Install new qdisc. */
1976 error = new_ops->tc_install(netdev, details);
// Invariant: 'tc' is set exactly when the install reported success.
1977 ovs_assert((error == 0) == (netdev_dev->tc != NULL));
// Queries the qdisc of 'netdev', finds queue 'queue_id', and obtains its
// configuration through the ops' 'class_get' callback into 'details'.
1984 netdev_linux_get_queue(const struct netdev *netdev,
1985 unsigned int queue_id, struct smap *details)
1987 struct netdev_dev_linux *netdev_dev =
1988 netdev_dev_linux_cast(netdev_get_dev(netdev));
1991 error = tc_query_qdisc(netdev);
1995 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1997 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
// Configures queue 'queue_id' of 'netdev' with 'details' through the ops'
// 'class_set' callback, after querying the qdisc.
2003 netdev_linux_set_queue(struct netdev *netdev,
2004 unsigned int queue_id, const struct smap *details)
2006 struct netdev_dev_linux *netdev_dev =
2007 netdev_dev_linux_cast(netdev_get_dev(netdev));
2010 error = tc_query_qdisc(netdev);
// Tests for a queue id outside the ops' 'n_queues' range and for a qdisc
// type without a 'class_set' callback.
2013 } else if (queue_id >= netdev_dev->tc->ops->n_queues
2014 || !netdev_dev->tc->ops->class_set) {
2018 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
// Deletes queue 'queue_id' of 'netdev' through the ops' 'class_delete'
// callback, after querying the qdisc and looking the queue up.
2022 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
2024 struct netdev_dev_linux *netdev_dev =
2025 netdev_dev_linux_cast(netdev_get_dev(netdev));
2028 error = tc_query_qdisc(netdev);
2031 } else if (!netdev_dev->tc->ops->class_delete) {
2034 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2036 ? netdev_dev->tc->ops->class_delete(netdev, queue)
// Retrieves statistics for queue 'queue_id' of 'netdev' into 'stats' through
// the ops' 'class_get_stats' callback, after querying the qdisc and looking
// the queue up.
2042 netdev_linux_get_queue_stats(const struct netdev *netdev,
2043 unsigned int queue_id,
2044 struct netdev_queue_stats *stats)
2046 struct netdev_dev_linux *netdev_dev =
2047 netdev_dev_linux_cast(netdev_get_dev(netdev));
2050 error = tc_query_qdisc(netdev);
2053 } else if (!netdev_dev->tc->ops->class_get_stats) {
2056 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2058 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
// Builds an RTM_GETTCLASS request for 'netdev' with 'tcm_parent' set to 0 and
// starts a netlink dump on 'rtnl_sock' into 'dump'.  The request buffer is
// uninitialized again once the dump has been started.
2064 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2066 struct ofpbuf request;
2067 struct tcmsg *tcmsg;
2069 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2073 tcmsg->tcm_parent = 0;
2074 nl_dump_start(dump, rtnl_sock, &request);
2075 ofpbuf_uninit(&request);
// Iterates over every queue in the device's 'tc->queues' map, fetches its
// configuration with the ops' 'class_get' callback into a scratch smap, and
// hands queue id and details to 'cb' together with 'aux'.
2080 netdev_linux_dump_queues(const struct netdev *netdev,
2081 netdev_dump_queues_cb *cb, void *aux)
2083 struct netdev_dev_linux *netdev_dev =
2084 netdev_dev_linux_cast(netdev_get_dev(netdev));
2085 struct tc_queue *queue, *next_queue;
2086 struct smap details;
2090 error = tc_query_qdisc(netdev);
2093 } else if (!netdev_dev->tc->ops->class_get) {
2098 smap_init(&details);
2099 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2100 &netdev_dev->tc->queues) {
// The scratch map is reused across iterations, so it is emptied first.
2101 smap_clear(&details);
2103 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
2105 (*cb)(queue->queue_id, &details, aux);
2110 smap_destroy(&details);
// Dumps the traffic classes of 'netdev' over netlink and passes each reply
// message to the ops' 'class_dump_stats' callback along with 'cb' and 'aux'.
2116 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2117 netdev_dump_queue_stats_cb *cb, void *aux)
2119 struct netdev_dev_linux *netdev_dev =
2120 netdev_dev_linux_cast(netdev_get_dev(netdev));
2121 struct nl_dump dump;
2126 error = tc_query_qdisc(netdev);
2129 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2134 if (!start_queue_dump(netdev, &dump)) {
2137 while (nl_dump_next(&dump, &msg)) {
2138 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
// An error from finishing the dump takes precedence over 'last_error'.
2144 error = nl_dump_done(&dump);
2145 return error ? error : last_error;
// Returns the IPv4 address and netmask of 'netdev_' in '*address' and
// '*netmask'.  The values come from the cache in 'netdev_dev'; when VALID_IN4
// is not set they are fetched with SIOCGIFADDR and SIOCGIFNETMASK first.
2149 netdev_linux_get_in4(const struct netdev *netdev_,
2150 struct in_addr *address, struct in_addr *netmask)
2152 struct netdev_dev_linux *netdev_dev =
2153 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2155 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2158 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2159 SIOCGIFADDR, "SIOCGIFADDR");
2164 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2165 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2170 netdev_dev->cache_valid |= VALID_IN4;
2172 *address = netdev_dev->address;
2173 *netmask = netdev_dev->netmask;
// An address of INADDR_ANY is reported as EADDRNOTAVAIL.
2174 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
// Assigns IPv4 'address' to 'netdev_' with SIOCSIFADDR and stores 'address'
// and 'netmask' in the cache.  The netmask is applied with SIOCSIFNETMASK
// only when 'address' is not INADDR_ANY.
2178 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2179 struct in_addr netmask)
2181 struct netdev_dev_linux *netdev_dev =
2182 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2185 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
// NOTE(review): the cache is updated with 'netmask' before the
// SIOCSIFNETMASK call below has reported its result.
2187 netdev_dev->cache_valid |= VALID_IN4;
2188 netdev_dev->address = address;
2189 netdev_dev->netmask = netmask;
2190 if (address.s_addr != INADDR_ANY) {
2191 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2192 "SIOCSIFNETMASK", netmask);
// Parses one text line (netdev_linux_get_in6() feeds it lines read from
// /proc/net/if_inet6).  The format expects sixteen two-digit hex bytes, which
// are stored into 'in6->s6_addr', then four hex fields that are skipped, and
// finally an interface name of at most 16 characters stored in 'ifname'.
2199 parse_if_inet6_line(const char *line,
2200 struct in6_addr *in6, char ifname[16 + 1])
2202 uint8_t *s6 = in6->s6_addr;
2203 #define X8 "%2"SCNx8
2205 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2206 "%*x %*x %*x %*x %16s\n",
2207 &s6[0], &s6[1], &s6[2], &s6[3],
2208 &s6[4], &s6[5], &s6[6], &s6[7],
2209 &s6[8], &s6[9], &s6[10], &s6[11],
2210 &s6[12], &s6[13], &s6[14], &s6[15],
2214 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2215 * 'in6' is non-null) and returns true. Otherwise, returns false. */
// Reports the IPv6 address cached for 'netdev_' in '*in6'.  When VALID_IN6 is
// not set, the cache is first reset to in6addr_any and then filled from the
// first line of /proc/net/if_inet6 that parses and names this device.
// NOTE(review): the header comment allows a null 'in6', but the store into
// '*in6' at the end is not guarded by any visible check -- confirm.
2217 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2219 struct netdev_dev_linux *netdev_dev =
2220 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2221 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2225 netdev_dev->in6 = in6addr_any;
2227 file = fopen("/proc/net/if_inet6", "r");
2229 const char *name = netdev_get_name(netdev_);
2230 while (fgets(line, sizeof line, file)) {
2231 struct in6_addr in6_tmp;
2232 char ifname[16 + 1];
2233 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2234 && !strcmp(name, ifname))
2236 netdev_dev->in6 = in6_tmp;
2242 netdev_dev->cache_valid |= VALID_IN6;
2244 *in6 = netdev_dev->in6;
// Fills '*sa' with an AF_INET socket address for 'addr'.  The address is
// built in a zeroed sockaddr_in and then copied over a zeroed '*sa'.
2249 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2251 struct sockaddr_in sin;
2252 memset(&sin, 0, sizeof sin);
2253 sin.sin_family = AF_INET;
2254 sin.sin_addr = addr;
2257 memset(sa, 0, sizeof *sa);
2258 memcpy(sa, &sin, sizeof sin);
// Issues address-setting ioctl 'ioctl_nr' on 'netdev': the ifreq carries the
// device name and 'addr' converted to an AF_INET sockaddr.
2262 do_set_addr(struct netdev *netdev,
2263 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2266 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2267 make_in4_sockaddr(&ifr.ifr_addr, addr);
2269 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2273 /* Adds 'router' as a default IP gateway. */
// The route entry uses INADDR_ANY for both destination and genmask, 'router'
// as the gateway, and flags RTF_UP | RTF_GATEWAY; it is submitted with
// SIOCADDRT on 'af_inet_sock'.  'netdev' itself is not used.
2275 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2277 struct in_addr any = { INADDR_ANY };
2281 memset(&rt, 0, sizeof rt);
2282 make_in4_sockaddr(&rt.rt_dst, any);
2283 make_in4_sockaddr(&rt.rt_gateway, router);
2284 make_in4_sockaddr(&rt.rt_genmask, any);
2285 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2286 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2288 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
// Looks for a route to 'host' by reading /proc/net/route line by line.  For a
// route that is up and whose masked destination matches 'host', the next hop
// is stored in '*next_hop' and a copy of the interface name in
// '*netdev_name'.
2294 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2297 static const char fn[] = "/proc/net/route";
// '*netdev_name' is null unless a matching route sets it further down.
2302 *netdev_name = NULL;
2303 stream = fopen(fn, "r");
2304 if (stream == NULL) {
2305 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2310 while (fgets(line, sizeof line, stream)) {
2313 ovs_be32 dest, gateway, mask;
2314 int refcnt, metric, mtu;
2315 unsigned int flags, use, window, irtt;
// A line only counts as parsed when all 11 fields convert.
2318 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2320 iface, &dest, &gateway, &flags, &refcnt,
2321 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2323 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2327 if (!(flags & RTF_UP)) {
2328 /* Skip routes that aren't up. */
2332 /* The output of 'dest', 'mask', and 'gateway' were given in
2333 * network byte order, so we don't need any endian
2334 * conversions here. */
2335 if ((dest & mask) == (host->s_addr & mask)) {
2337 /* The host is directly reachable. */
2338 next_hop->s_addr = 0;
2340 /* To reach the host, we must go through a gateway. */
2341 next_hop->s_addr = gateway;
// The name is an xstrdup() copy; presumably the caller must free it.
2343 *netdev_name = xstrdup(iface);
// Adds driver name, driver version and firmware version of 'netdev' to
// 'smap'.  The values come from 'netdev_dev->drvinfo', which is filled by an
// ETHTOOL_GDRVINFO query when VALID_DRVINFO is not yet set.
2355 netdev_linux_get_status(const struct netdev *netdev, struct smap *smap)
2357 struct netdev_dev_linux *netdev_dev;
2360 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2361 if (!(netdev_dev->cache_valid & VALID_DRVINFO)) {
// netdev_linux_do_ethtool() is handed the drvinfo buffer through an
// ethtool_cmd pointer, hence the cast.
2362 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev_dev->drvinfo;
2364 COVERAGE_INC(netdev_get_ethtool);
2365 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
2366 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
2369 "ETHTOOL_GDRVINFO");
2371 netdev_dev->cache_valid |= VALID_DRVINFO;
2376 smap_add(smap, "driver_name", netdev_dev->drvinfo.driver);
2377 smap_add(smap, "driver_version", netdev_dev->drvinfo.version);
2378 smap_add(smap, "firmware_version", netdev_dev->drvinfo.fw_version);
// Reports a fixed driver name for internal devices; 'netdev' is not used.
2384 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2387 smap_add(smap, "driver_name", "openvswitch");
2391 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2392 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2393 * returns 0. Otherwise, it returns a positive errno value; in particular,
2394 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
// Builds an ARP request for 'ip' (an AF_INET protocol address, with hardware
// family ARPHRD_ETHER and the name of 'netdev') and submits it with SIOCGARP
// on 'af_inet_sock'.
2396 netdev_linux_arp_lookup(const struct netdev *netdev,
2397 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2400 struct sockaddr_in sin;
2403 memset(&r, 0, sizeof r);
2404 memset(&sin, 0, sizeof sin);
2405 sin.sin_family = AF_INET;
2406 sin.sin_addr.s_addr = ip;
2408 memcpy(&r.arp_pa, &sin, sizeof sin);
2409 r.arp_ha.sa_family = ARPHRD_ETHER;
2411 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2412 COVERAGE_INC(netdev_arp_lookup);
2413 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2415 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
// ENXIO is not logged; every other failure is.
2416 } else if (retval != ENXIO) {
2417 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2418 netdev_get_name(netdev), IP_ARGS(ip), strerror(retval));
// Translates netdev flags into interface (IFF_*) flags by testing NETDEV_UP
// and NETDEV_PROMISC in 'nd'.  NOTE(review): presumably these map to IFF_UP
// and IFF_PROMISC respectively -- confirm.
2424 nd_to_iff_flags(enum netdev_flags nd)
2427 if (nd & NETDEV_UP) {
2430 if (nd & NETDEV_PROMISC) {
// Translates interface (IFF_*) flags in 'iff' into netdev flags, starting
// from an empty set; IFF_PROMISC maps to NETDEV_PROMISC.
2437 iff_to_nd_flags(int iff)
2439 enum netdev_flags nd = 0;
2443 if (iff & IFF_PROMISC) {
2444 nd |= NETDEV_PROMISC;
// Clears the flags in 'off' and sets the flags in 'on' for device 'dev_'.
// The flags in effect before the change are reported in '*old_flagsp' as
// netdev flags.
2450 netdev_linux_update_flags(struct netdev_dev *dev_, enum netdev_flags off,
2451 enum netdev_flags on, enum netdev_flags *old_flagsp)
2453 struct netdev_dev_linux *netdev_dev;
2454 int old_flags, new_flags;
2457 netdev_dev = netdev_dev_linux_cast(dev_);
2458 old_flags = netdev_dev->ifi_flags;
2459 *old_flagsp = iff_to_nd_flags(old_flags);
2460 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
// The kernel is only touched when the flag word changes; the stored flags
// are then re-read with get_flags().
2461 if (new_flags != old_flags) {
2462 error = set_flags(netdev_dev_get_name(dev_), new_flags);
2463 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
// Returns the 'change_seq' value stored in the netdev_dev_linux that backs
// 'netdev'.
2469 netdev_linux_change_seq(const struct netdev *netdev)
2471 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
// Expands to an initializer list of netdev provider callbacks.  Most entries
// are fixed netdev_linux_* functions shared by every class built from this
// macro; the parameters NAME, CREATE, GET_STATS, SET_STATS, GET_FEATURES and
// GET_STATUS supply the parts that differ per class.  Entries given as NULL
// carry a trailing comment naming the slot they occupy.
2474 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2475 GET_FEATURES, GET_STATUS) \
2479 netdev_linux_init, \
2481 netdev_linux_wait, \
2484 netdev_linux_destroy, \
2485 NULL, /* get_config */ \
2486 NULL, /* set_config */ \
2487 NULL, /* get_tunnel_config */ \
2489 netdev_linux_open, \
2490 netdev_linux_close, \
2492 netdev_linux_listen, \
2493 netdev_linux_recv, \
2494 netdev_linux_recv_wait, \
2495 netdev_linux_drain, \
2497 netdev_linux_send, \
2498 netdev_linux_send_wait, \
2500 netdev_linux_set_etheraddr, \
2501 netdev_linux_get_etheraddr, \
2502 netdev_linux_get_mtu, \
2503 netdev_linux_set_mtu, \
2504 netdev_linux_get_ifindex, \
2505 netdev_linux_get_carrier, \
2506 netdev_linux_get_carrier_resets, \
2507 netdev_linux_set_miimon_interval, \
2512 netdev_linux_set_advertisements, \
2514 netdev_linux_set_policing, \
2515 netdev_linux_get_qos_types, \
2516 netdev_linux_get_qos_capabilities, \
2517 netdev_linux_get_qos, \
2518 netdev_linux_set_qos, \
2519 netdev_linux_get_queue, \
2520 netdev_linux_set_queue, \
2521 netdev_linux_delete_queue, \
2522 netdev_linux_get_queue_stats, \
2523 netdev_linux_dump_queues, \
2524 netdev_linux_dump_queue_stats, \
2526 netdev_linux_get_in4, \
2527 netdev_linux_set_in4, \
2528 netdev_linux_get_in6, \
2529 netdev_linux_add_router, \
2530 netdev_linux_get_next_hop, \
2532 netdev_linux_arp_lookup, \
2534 netdev_linux_update_flags, \
2536 netdev_linux_change_seq \
2539 const struct netdev_class netdev_linux_class =
2542 netdev_linux_create,
2543 netdev_linux_get_stats,
2544 NULL, /* set_stats */
2545 netdev_linux_get_features,
2546 netdev_linux_get_status);
2548 const struct netdev_class netdev_tap_class =
2551 netdev_linux_create_tap,
2552 netdev_tap_get_stats,
2553 NULL, /* set_stats */
2554 netdev_linux_get_features,
2555 netdev_linux_get_status);
2557 const struct netdev_class netdev_internal_class =
2560 netdev_linux_create,
2561 netdev_internal_get_stats,
2562 netdev_internal_set_stats,
2563 NULL, /* get_features */
2564 netdev_internal_get_status);
2566 /* HTB traffic control class. */
2568 #define HTB_N_QUEUES 0xf000
2572 unsigned int max_rate; /* In bytes/s. */
2576 struct tc_queue tc_queue;
2577 unsigned int min_rate; /* In bytes/s. */
2578 unsigned int max_rate; /* In bytes/s. */
2579 unsigned int burst; /* In bytes. */
2580 unsigned int priority; /* Lower values are higher priorities. */
2584 htb_get__(const struct netdev *netdev)
2586 struct netdev_dev_linux *netdev_dev =
2587 netdev_dev_linux_cast(netdev_get_dev(netdev));
2588 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2592 htb_install__(struct netdev *netdev, uint64_t max_rate)
2594 struct netdev_dev_linux *netdev_dev =
2595 netdev_dev_linux_cast(netdev_get_dev(netdev));
2598 htb = xmalloc(sizeof *htb);
2599 tc_init(&htb->tc, &tc_ops_htb);
2600 htb->max_rate = max_rate;
2602 netdev_dev->tc = &htb->tc;
2605 /* Create an HTB qdisc.
2607 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2609 htb_setup_qdisc__(struct netdev *netdev)
2612 struct tc_htb_glob opt;
2613 struct ofpbuf request;
2614 struct tcmsg *tcmsg;
2616 tc_del_qdisc(netdev);
2618 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2619 NLM_F_EXCL | NLM_F_CREATE, &request);
2623 tcmsg->tcm_handle = tc_make_handle(1, 0);
2624 tcmsg->tcm_parent = TC_H_ROOT;
2626 nl_msg_put_string(&request, TCA_KIND, "htb");
2628 memset(&opt, 0, sizeof opt);
2629 opt.rate2quantum = 10;
2633 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2634 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2635 nl_msg_end_nested(&request, opt_offset);
2637 return tc_transact(&request, NULL);
2640 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2641 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2643 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2644 unsigned int parent, struct htb_class *class)
2647 struct tc_htb_opt opt;
2648 struct ofpbuf request;
2649 struct tcmsg *tcmsg;
2653 error = netdev_get_mtu(netdev, &mtu);
2655 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2656 netdev_get_name(netdev));
2660 memset(&opt, 0, sizeof opt);
2661 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2662 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2663 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2664 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2665 opt.prio = class->priority;
2667 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2671 tcmsg->tcm_handle = handle;
2672 tcmsg->tcm_parent = parent;
2674 nl_msg_put_string(&request, TCA_KIND, "htb");
2675 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2676 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2677 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2678 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2679 nl_msg_end_nested(&request, opt_offset);
2681 error = tc_transact(&request, NULL);
2683 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2684 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2685 netdev_get_name(netdev),
2686 tc_get_major(handle), tc_get_minor(handle),
2687 tc_get_major(parent), tc_get_minor(parent),
2688 class->min_rate, class->max_rate,
2689 class->burst, class->priority, strerror(error));
2694 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2695 * description of them into 'details'. The description complies with the
2696 * specification given in the vswitch database documentation for linux-htb
2699 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2701 static const struct nl_policy tca_htb_policy[] = {
2702 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2703 .min_len = sizeof(struct tc_htb_opt) },
2706 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2707 const struct tc_htb_opt *htb;
2709 if (!nl_parse_nested(nl_options, tca_htb_policy,
2710 attrs, ARRAY_SIZE(tca_htb_policy))) {
2711 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2715 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2716 class->min_rate = htb->rate.rate;
2717 class->max_rate = htb->ceil.rate;
2718 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2719 class->priority = htb->prio;
2724 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2725 struct htb_class *options,
2726 struct netdev_queue_stats *stats)
2728 struct nlattr *nl_options;
2729 unsigned int handle;
2732 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2733 if (!error && queue_id) {
2734 unsigned int major = tc_get_major(handle);
2735 unsigned int minor = tc_get_minor(handle);
2736 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2737 *queue_id = minor - 1;
2742 if (!error && options) {
2743 error = htb_parse_tca_options__(nl_options, options);
2749 htb_parse_qdisc_details__(struct netdev *netdev,
2750 const struct smap *details, struct htb_class *hc)
2752 const char *max_rate_s;
2754 max_rate_s = smap_get(details, "max-rate");
2755 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2756 if (!hc->max_rate) {
2757 enum netdev_features current;
2759 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2760 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2762 hc->min_rate = hc->max_rate;
2768 htb_parse_class_details__(struct netdev *netdev,
2769 const struct smap *details, struct htb_class *hc)
2771 const struct htb *htb = htb_get__(netdev);
2772 const char *min_rate_s = smap_get(details, "min-rate");
2773 const char *max_rate_s = smap_get(details, "max-rate");
2774 const char *burst_s = smap_get(details, "burst");
2775 const char *priority_s = smap_get(details, "priority");
2778 error = netdev_get_mtu(netdev, &mtu);
2780 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2781 netdev_get_name(netdev));
2785 /* HTB requires at least an mtu sized min-rate to send any traffic even
2786 * on uncongested links. */
2787 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2788 hc->min_rate = MAX(hc->min_rate, mtu);
2789 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2792 hc->max_rate = (max_rate_s
2793 ? strtoull(max_rate_s, NULL, 10) / 8
2795 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2796 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2800 * According to hints in the documentation that I've read, it is important
2801 * that 'burst' be at least as big as the largest frame that might be
2802 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2803 * but having it a bit too small is a problem. Since netdev_get_mtu()
2804 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2805 * the MTU. We actually add 64, instead of 14, as a guard against
2806 * additional headers get tacked on somewhere that we're not aware of. */
2807 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2808 hc->burst = MAX(hc->burst, mtu + 64);
2811 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2817 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2818 unsigned int parent, struct htb_class *options,
2819 struct netdev_queue_stats *stats)
2821 struct ofpbuf *reply;
2824 error = tc_query_class(netdev, handle, parent, &reply);
2826 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2827 ofpbuf_delete(reply);
2833 htb_tc_install(struct netdev *netdev, const struct smap *details)
2837 error = htb_setup_qdisc__(netdev);
2839 struct htb_class hc;
2841 htb_parse_qdisc_details__(netdev, details, &hc);
2842 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2843 tc_make_handle(1, 0), &hc);
2845 htb_install__(netdev, hc.max_rate);
2851 static struct htb_class *
2852 htb_class_cast__(const struct tc_queue *queue)
2854 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2858 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2859 const struct htb_class *hc)
2861 struct htb *htb = htb_get__(netdev);
2862 size_t hash = hash_int(queue_id, 0);
2863 struct tc_queue *queue;
2864 struct htb_class *hcp;
2866 queue = tc_find_queue__(netdev, queue_id, hash);
2868 hcp = htb_class_cast__(queue);
2870 hcp = xmalloc(sizeof *hcp);
2871 queue = &hcp->tc_queue;
2872 queue->queue_id = queue_id;
2873 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2876 hcp->min_rate = hc->min_rate;
2877 hcp->max_rate = hc->max_rate;
2878 hcp->burst = hc->burst;
2879 hcp->priority = hc->priority;
2883 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2886 struct nl_dump dump;
2887 struct htb_class hc;
2889 /* Get qdisc options. */
2891 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2892 htb_install__(netdev, hc.max_rate);
2895 if (!start_queue_dump(netdev, &dump)) {
2898 while (nl_dump_next(&dump, &msg)) {
2899 unsigned int queue_id;
2901 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2902 htb_update_queue__(netdev, queue_id, &hc);
2905 nl_dump_done(&dump);
2911 htb_tc_destroy(struct tc *tc)
2913 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2914 struct htb_class *hc, *next;
2916 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2917 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2925 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2927 const struct htb *htb = htb_get__(netdev);
2928 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
2933 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2935 struct htb_class hc;
2938 htb_parse_qdisc_details__(netdev, details, &hc);
2939 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2940 tc_make_handle(1, 0), &hc);
2942 htb_get__(netdev)->max_rate = hc.max_rate;
2948 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2949 const struct tc_queue *queue, struct smap *details)
2951 const struct htb_class *hc = htb_class_cast__(queue);
2953 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2954 if (hc->min_rate != hc->max_rate) {
2955 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2957 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2959 smap_add_format(details, "priority", "%u", hc->priority);
2965 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2966 const struct smap *details)
2968 struct htb_class hc;
2971 error = htb_parse_class_details__(netdev, details, &hc);
2976 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2977 tc_make_handle(1, 0xfffe), &hc);
2982 htb_update_queue__(netdev, queue_id, &hc);
2987 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2989 struct htb_class *hc = htb_class_cast__(queue);
2990 struct htb *htb = htb_get__(netdev);
2993 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2995 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3002 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3003 struct netdev_queue_stats *stats)
3005 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3006 tc_make_handle(1, 0xfffe), NULL, stats);
3010 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3011 const struct ofpbuf *nlmsg,
3012 netdev_dump_queue_stats_cb *cb, void *aux)
3014 struct netdev_queue_stats stats;
3015 unsigned int handle, major, minor;
3018 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3023 major = tc_get_major(handle);
3024 minor = tc_get_minor(handle);
3025 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3026 (*cb)(minor - 1, &stats, aux);
3031 static const struct tc_ops tc_ops_htb = {
3032 "htb", /* linux_name */
3033 "linux-htb", /* ovs_name */
3034 HTB_N_QUEUES, /* n_queues */
3043 htb_class_get_stats,
3044 htb_class_dump_stats
3047 /* "linux-hfsc" traffic control class. */
3049 #define HFSC_N_QUEUES 0xf000
3057 struct tc_queue tc_queue;
3062 static struct hfsc *
3063 hfsc_get__(const struct netdev *netdev)
3065 struct netdev_dev_linux *netdev_dev;
3066 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3067 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
3070 static struct hfsc_class *
3071 hfsc_class_cast__(const struct tc_queue *queue)
3073 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3077 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
3079 struct netdev_dev_linux * netdev_dev;
3082 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3083 hfsc = xmalloc(sizeof *hfsc);
3084 tc_init(&hfsc->tc, &tc_ops_hfsc);
3085 hfsc->max_rate = max_rate;
3086 netdev_dev->tc = &hfsc->tc;
3090 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3091 const struct hfsc_class *hc)
3095 struct hfsc_class *hcp;
3096 struct tc_queue *queue;
3098 hfsc = hfsc_get__(netdev);
3099 hash = hash_int(queue_id, 0);
3101 queue = tc_find_queue__(netdev, queue_id, hash);
3103 hcp = hfsc_class_cast__(queue);
3105 hcp = xmalloc(sizeof *hcp);
3106 queue = &hcp->tc_queue;
3107 queue->queue_id = queue_id;
3108 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3111 hcp->min_rate = hc->min_rate;
3112 hcp->max_rate = hc->max_rate;
3116 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3118 const struct tc_service_curve *rsc, *fsc, *usc;
3119 static const struct nl_policy tca_hfsc_policy[] = {
3121 .type = NL_A_UNSPEC,
3123 .min_len = sizeof(struct tc_service_curve),
3126 .type = NL_A_UNSPEC,
3128 .min_len = sizeof(struct tc_service_curve),
3131 .type = NL_A_UNSPEC,
3133 .min_len = sizeof(struct tc_service_curve),
3136 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3138 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3139 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3140 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3144 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3145 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3146 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3148 if (rsc->m1 != 0 || rsc->d != 0 ||
3149 fsc->m1 != 0 || fsc->d != 0 ||
3150 usc->m1 != 0 || usc->d != 0) {
3151 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3152 "Non-linear service curves are not supported.");
3156 if (rsc->m2 != fsc->m2) {
3157 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3158 "Real-time service curves are not supported ");
3162 if (rsc->m2 > usc->m2) {
3163 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3164 "Min-rate service curve is greater than "
3165 "the max-rate service curve.");
3169 class->min_rate = fsc->m2;
3170 class->max_rate = usc->m2;
3175 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3176 struct hfsc_class *options,
3177 struct netdev_queue_stats *stats)
3180 unsigned int handle;
3181 struct nlattr *nl_options;
3183 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3189 unsigned int major, minor;
3191 major = tc_get_major(handle);
3192 minor = tc_get_minor(handle);
3193 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3194 *queue_id = minor - 1;
3201 error = hfsc_parse_tca_options__(nl_options, options);
3208 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3209 unsigned int parent, struct hfsc_class *options,
3210 struct netdev_queue_stats *stats)
3213 struct ofpbuf *reply;
3215 error = tc_query_class(netdev, handle, parent, &reply);
3220 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3221 ofpbuf_delete(reply);
3226 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3227 struct hfsc_class *class)
3230 const char *max_rate_s;
3232 max_rate_s = smap_get(details, "max-rate");
3233 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3236 enum netdev_features current;
3238 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3239 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3242 class->min_rate = max_rate;
3243 class->max_rate = max_rate;
3247 hfsc_parse_class_details__(struct netdev *netdev,
3248 const struct smap *details,
3249 struct hfsc_class * class)
3251 const struct hfsc *hfsc;
3252 uint32_t min_rate, max_rate;
3253 const char *min_rate_s, *max_rate_s;
3255 hfsc = hfsc_get__(netdev);
3256 min_rate_s = smap_get(details, "min-rate");
3257 max_rate_s = smap_get(details, "max-rate");
3259 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3260 min_rate = MAX(min_rate, 1);
3261 min_rate = MIN(min_rate, hfsc->max_rate);
3263 max_rate = (max_rate_s
3264 ? strtoull(max_rate_s, NULL, 10) / 8
3266 max_rate = MAX(max_rate, min_rate);
3267 max_rate = MIN(max_rate, hfsc->max_rate);
3269 class->min_rate = min_rate;
3270 class->max_rate = max_rate;
3275 /* Create an HFSC qdisc.
3277 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3279 hfsc_setup_qdisc__(struct netdev * netdev)
3281 struct tcmsg *tcmsg;
3282 struct ofpbuf request;
3283 struct tc_hfsc_qopt opt;
3285 tc_del_qdisc(netdev);
3287 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3288 NLM_F_EXCL | NLM_F_CREATE, &request);
3294 tcmsg->tcm_handle = tc_make_handle(1, 0);
3295 tcmsg->tcm_parent = TC_H_ROOT;
3297 memset(&opt, 0, sizeof opt);
3300 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3301 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3303 return tc_transact(&request, NULL);
3306 /* Create an HFSC class.
3308 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3309 * sc rate <min_rate> ul rate <max_rate>" */
3311 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3312 unsigned int parent, struct hfsc_class *class)
3316 struct tcmsg *tcmsg;
3317 struct ofpbuf request;
3318 struct tc_service_curve min, max;
3320 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3326 tcmsg->tcm_handle = handle;
3327 tcmsg->tcm_parent = parent;
3331 min.m2 = class->min_rate;
3335 max.m2 = class->max_rate;
3337 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3338 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3339 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3340 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3341 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3342 nl_msg_end_nested(&request, opt_offset);
3344 error = tc_transact(&request, NULL);
3346 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3347 "min-rate %ubps, max-rate %ubps (%s)",
3348 netdev_get_name(netdev),
3349 tc_get_major(handle), tc_get_minor(handle),
3350 tc_get_major(parent), tc_get_minor(parent),
3351 class->min_rate, class->max_rate, strerror(error));
3358 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3361 struct hfsc_class class;
3363 error = hfsc_setup_qdisc__(netdev);
3369 hfsc_parse_qdisc_details__(netdev, details, &class);
3370 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3371 tc_make_handle(1, 0), &class);
3377 hfsc_install__(netdev, class.max_rate);
3382 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3385 struct nl_dump dump;
3386 struct hfsc_class hc;
3389 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3390 hfsc_install__(netdev, hc.max_rate);
3392 if (!start_queue_dump(netdev, &dump)) {
3396 while (nl_dump_next(&dump, &msg)) {
3397 unsigned int queue_id;
3399 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3400 hfsc_update_queue__(netdev, queue_id, &hc);
3404 nl_dump_done(&dump);
3409 hfsc_tc_destroy(struct tc *tc)
3412 struct hfsc_class *hc, *next;
3414 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3416 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3417 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3426 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3428 const struct hfsc *hfsc;
3429 hfsc = hfsc_get__(netdev);
3430 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3435 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3438 struct hfsc_class class;
3440 hfsc_parse_qdisc_details__(netdev, details, &class);
3441 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3442 tc_make_handle(1, 0), &class);
3445 hfsc_get__(netdev)->max_rate = class.max_rate;
3452 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3453 const struct tc_queue *queue, struct smap *details)
3455 const struct hfsc_class *hc;
3457 hc = hfsc_class_cast__(queue);
3458 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3459 if (hc->min_rate != hc->max_rate) {
3460 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3466 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3467 const struct smap *details)
3470 struct hfsc_class class;
3472 error = hfsc_parse_class_details__(netdev, details, &class);
3477 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3478 tc_make_handle(1, 0xfffe), &class);
3483 hfsc_update_queue__(netdev, queue_id, &class);
3488 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3492 struct hfsc_class *hc;
3494 hc = hfsc_class_cast__(queue);
3495 hfsc = hfsc_get__(netdev);
3497 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3499 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3506 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3507 struct netdev_queue_stats *stats)
3509 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3510 tc_make_handle(1, 0xfffe), NULL, stats);
3514 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3515 const struct ofpbuf *nlmsg,
3516 netdev_dump_queue_stats_cb *cb, void *aux)
3518 struct netdev_queue_stats stats;
3519 unsigned int handle, major, minor;
3522 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3527 major = tc_get_major(handle);
3528 minor = tc_get_minor(handle);
3529 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3530 (*cb)(minor - 1, &stats, aux);
3535 static const struct tc_ops tc_ops_hfsc = {
3536 "hfsc", /* linux_name */
3537 "linux-hfsc", /* ovs_name */
3538 HFSC_N_QUEUES, /* n_queues */
3539 hfsc_tc_install, /* tc_install */
3540 hfsc_tc_load, /* tc_load */
3541 hfsc_tc_destroy, /* tc_destroy */
3542 hfsc_qdisc_get, /* qdisc_get */
3543 hfsc_qdisc_set, /* qdisc_set */
3544 hfsc_class_get, /* class_get */
3545 hfsc_class_set, /* class_set */
3546 hfsc_class_delete, /* class_delete */
3547 hfsc_class_get_stats, /* class_get_stats */
3548 hfsc_class_dump_stats /* class_dump_stats */
3551 /* "linux-default" traffic control class.
3553 * This class represents the default, unnamed Linux qdisc. It corresponds to
3554 * the "" (empty string) QoS type in the OVS database. */
3557 default_install__(struct netdev *netdev)
3559 struct netdev_dev_linux *netdev_dev =
3560 netdev_dev_linux_cast(netdev_get_dev(netdev));
3561 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3563 /* Nothing but a tc class implementation is allowed to write to a tc. This
3564 * class never does that, so we can legitimately use a const tc object. */
3565 netdev_dev->tc = CONST_CAST(struct tc *, &tc);
3569 default_tc_install(struct netdev *netdev,
3570 const struct smap *details OVS_UNUSED)
3572 default_install__(netdev);
3577 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3579 default_install__(netdev);
3583 static const struct tc_ops tc_ops_default = {
3584 NULL, /* linux_name */
3589 NULL, /* tc_destroy */
3590 NULL, /* qdisc_get */
3591 NULL, /* qdisc_set */
3592 NULL, /* class_get */
3593 NULL, /* class_set */
3594 NULL, /* class_delete */
3595 NULL, /* class_get_stats */
3596 NULL /* class_dump_stats */
3599 /* "linux-other" traffic control class.
3604 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3606 struct netdev_dev_linux *netdev_dev =
3607 netdev_dev_linux_cast(netdev_get_dev(netdev));
3608 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3610 /* Nothing but a tc class implementation is allowed to write to a tc. This
3611 * class never does that, so we can legitimately use a const tc object. */
3612 netdev_dev->tc = CONST_CAST(struct tc *, &tc);
3616 static const struct tc_ops tc_ops_other = {
3617 NULL, /* linux_name */
3618 "linux-other", /* ovs_name */
3620 NULL, /* tc_install */
3622 NULL, /* tc_destroy */
3623 NULL, /* qdisc_get */
3624 NULL, /* qdisc_set */
3625 NULL, /* class_get */
3626 NULL, /* class_set */
3627 NULL, /* class_delete */
3628 NULL, /* class_get_stats */
3629 NULL /* class_dump_stats */
3632 /* Traffic control. */
3634 /* Number of kernel "tc" ticks per second. */
3635 static double ticks_per_s;
3637 /* Number of kernel "jiffies" per second. This is used for the purpose of
3638 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3639 * one jiffy's worth of data.
3641 * There are two possibilities here:
3643 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3644 * approximate range of 100 to 1024. That means that we really need to
3645 * make sure that the qdisc can buffer that much data.
3647 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3648 * has finely granular timers and there's no need to fudge additional room
3649 * for buffers. (There's no extra effort needed to implement that: the
3650 * large 'buffer_hz' is used as a divisor, so practically any number will
3651 * come out as 0 in the division. Small integer results in the case of
3652 * really high dividends won't have any real effect anyhow.)
3654 static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor', i.e. 'major' in the upper 16 bits and
 * 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
/* Returns the major number (upper 16 bits) from 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Returns the minor number (lower 16 bits) from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
3677 static struct tcmsg *
3678 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3679 struct ofpbuf *request)
3681 struct tcmsg *tcmsg;
3685 error = get_ifindex(netdev, &ifindex);
3690 ofpbuf_init(request, 512);
3691 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3692 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3693 tcmsg->tcm_family = AF_UNSPEC;
3694 tcmsg->tcm_ifindex = ifindex;
3695 /* Caller should fill in tcmsg->tcm_handle. */
3696 /* Caller should fill in tcmsg->tcm_parent. */
3702 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3704 int error = nl_sock_transact(rtnl_sock, request, replyp);
3705 ofpbuf_uninit(request);
3709 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3710 * policing configuration.
3712 * This function is equivalent to running the following when 'add' is true:
3713 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3715 * This function is equivalent to running the following when 'add' is false:
3716 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3718 * The configuration and stats may be seen with the following command:
3719 * /sbin/tc -s qdisc show dev <devname>
3721 * Returns 0 if successful, otherwise a positive errno value.
3724 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3726 struct ofpbuf request;
3727 struct tcmsg *tcmsg;
3729 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3730 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3732 tcmsg = tc_make_request(netdev, type, flags, &request);
3736 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3737 tcmsg->tcm_parent = TC_H_INGRESS;
3738 nl_msg_put_string(&request, TCA_KIND, "ingress");
3739 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3741 error = tc_transact(&request, NULL);
3743 /* If we're deleting the qdisc, don't worry about some of the
3744 * error conditions. */
3745 if (!add && (error == ENOENT || error == EINVAL)) {
3754 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3757 * This function is equivalent to running:
3758 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3759 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3762 * The configuration and stats may be seen with the following command:
3763 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3765 * Returns 0 if successful, otherwise a positive errno value.
3768 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3770 struct tc_police tc_police;
3771 struct ofpbuf request;
3772 struct tcmsg *tcmsg;
3773 size_t basic_offset;
3774 size_t police_offset;
3778 memset(&tc_police, 0, sizeof tc_police);
3779 tc_police.action = TC_POLICE_SHOT;
3780 tc_police.mtu = mtu;
3781 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3782 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3783 kbits_burst * 1024);
3785 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3786 NLM_F_EXCL | NLM_F_CREATE, &request);
3790 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3791 tcmsg->tcm_info = tc_make_handle(49,
3792 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3794 nl_msg_put_string(&request, TCA_KIND, "basic");
3795 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3796 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3797 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3798 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3799 nl_msg_end_nested(&request, police_offset);
3800 nl_msg_end_nested(&request, basic_offset);
3802 error = tc_transact(&request, NULL);
3813 /* The values in psched are not individually very meaningful, but they are
3814 * important. The tables below show some values seen in the wild.
3818 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3819 * (Before that, there are hints that it was 1000000000.)
3821 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3825 * -----------------------------------
3826 * [1] 000c8000 000f4240 000f4240 00000064
3827 * [2] 000003e8 00000400 000f4240 3b9aca00
3828 * [3] 000003e8 00000400 000f4240 3b9aca00
3829 * [4] 000003e8 00000400 000f4240 00000064
3830 * [5] 000003e8 00000040 000f4240 3b9aca00
3831 * [6] 000003e8 00000040 000f4240 000000f9
3833 * a b c d ticks_per_s buffer_hz
3834 * ------- --------- ---------- ------------- ----------- -------------
3835 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3836 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3837 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3838 * [4] 1,000 1,024 1,000,000 100 976,562 100
3839 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3840 * [6] 1,000 64 1,000,000 249 15,625,000 249
3842 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3843 * [2] 2.6.26-1-686-bigmem from Debian lenny
3844 * [3] 2.6.26-2-sparc64 from Debian lenny
3845 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3846 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3847 * [6] 2.6.34 from kernel.org on KVM
3849 static const char fn[] = "/proc/net/psched";
3850 unsigned int a, b, c, d;
3856 stream = fopen(fn, "r");
3858 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3862 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3863 VLOG_WARN("%s: read failed", fn);
3867 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3871 VLOG_WARN("%s: invalid scheduler parameters", fn);
3875 ticks_per_s = (double) a * c / b;
3879 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3882 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3885 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3886 * rate of 'rate' bytes per second. */
3888 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3893 return (rate * ticks) / ticks_per_s;
3896 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3897 * rate of 'rate' bytes per second. */
3899 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3904 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3907 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3908 * a transmission rate of 'rate' bytes per second. */
3910 tc_buffer_per_jiffy(unsigned int rate)
3915 return rate / buffer_hz;
3918 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3919 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3920 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3921 * stores NULL into it if it is absent.
3923 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3926 * Returns 0 if successful, otherwise a positive errno value. */
3928 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3929 struct nlattr **options)
3931 static const struct nl_policy tca_policy[] = {
3932 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3933 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3935 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3937 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3938 tca_policy, ta, ARRAY_SIZE(ta))) {
3939 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3944 *kind = nl_attr_get_string(ta[TCA_KIND]);
3948 *options = ta[TCA_OPTIONS];
3963 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3964 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3965 * into '*options', and its queue statistics into '*stats'. Any of the output
3966 * arguments may be null.
3968 * Returns 0 if successful, otherwise a positive errno value. */
3970 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3971 struct nlattr **options, struct netdev_queue_stats *stats)
3973 static const struct nl_policy tca_policy[] = {
3974 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3975 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3977 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3979 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3980 tca_policy, ta, ARRAY_SIZE(ta))) {
3981 VLOG_WARN_RL(&rl, "failed to parse class message");
3986 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3987 *handlep = tc->tcm_handle;
3991 *options = ta[TCA_OPTIONS];
3995 const struct gnet_stats_queue *gsq;
3996 struct gnet_stats_basic gsb;
3998 static const struct nl_policy stats_policy[] = {
3999 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4000 .min_len = sizeof gsb },
4001 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4002 .min_len = sizeof *gsq },
4004 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4006 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4007 sa, ARRAY_SIZE(sa))) {
4008 VLOG_WARN_RL(&rl, "failed to parse class stats");
4012 /* Alignment issues screw up the length of struct gnet_stats_basic on
4013 * some arch/bitsize combinations. Newer versions of Linux have a
4014 * struct gnet_stats_basic_packed, but we can't depend on that. The
4015 * easiest thing to do is just to make a copy. */
4016 memset(&gsb, 0, sizeof gsb);
4017 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4018 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4019 stats->tx_bytes = gsb.bytes;
4020 stats->tx_packets = gsb.packets;
4022 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4023 stats->tx_errors = gsq->drops;
4033 memset(stats, 0, sizeof *stats);
4038 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4041 tc_query_class(const struct netdev *netdev,
4042 unsigned int handle, unsigned int parent,
4043 struct ofpbuf **replyp)
4045 struct ofpbuf request;
4046 struct tcmsg *tcmsg;
4049 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4053 tcmsg->tcm_handle = handle;
4054 tcmsg->tcm_parent = parent;
4056 error = tc_transact(&request, replyp);
4058 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4059 netdev_get_name(netdev),
4060 tc_get_major(handle), tc_get_minor(handle),
4061 tc_get_major(parent), tc_get_minor(parent),
4067 /* Equivalent to "tc class del dev <name> handle <handle>". */
4069 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4071 struct ofpbuf request;
4072 struct tcmsg *tcmsg;
4075 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4079 tcmsg->tcm_handle = handle;
4080 tcmsg->tcm_parent = 0;
4082 error = tc_transact(&request, NULL);
4084 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4085 netdev_get_name(netdev),
4086 tc_get_major(handle), tc_get_minor(handle),
4092 /* Equivalent to "tc qdisc del dev <name> root". */
4094 tc_del_qdisc(struct netdev *netdev)
4096 struct netdev_dev_linux *netdev_dev =
4097 netdev_dev_linux_cast(netdev_get_dev(netdev));
4098 struct ofpbuf request;
4099 struct tcmsg *tcmsg;
4102 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
4106 tcmsg->tcm_handle = tc_make_handle(1, 0);
4107 tcmsg->tcm_parent = TC_H_ROOT;
4109 error = tc_transact(&request, NULL);
4110 if (error == EINVAL) {
4111 /* EINVAL probably means that the default qdisc was in use, in which
4112 * case we've accomplished our purpose. */
4115 if (!error && netdev_dev->tc) {
4116 if (netdev_dev->tc->ops->tc_destroy) {
4117 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
4119 netdev_dev->tc = NULL;
4124 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4125 * kernel to determine what they are. Returns 0 if successful, otherwise a
4126 * positive errno value. */
4128 tc_query_qdisc(const struct netdev *netdev)
4130 struct netdev_dev_linux *netdev_dev =
4131 netdev_dev_linux_cast(netdev_get_dev(netdev));
4132 struct ofpbuf request, *qdisc;
4133 const struct tc_ops *ops;
4134 struct tcmsg *tcmsg;
4138 if (netdev_dev->tc) {
4142 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4143 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4144 * 2.6.35 without that fix backported to it.
4146 * To avoid the OOPS, we must not make a request that would attempt to dump
4147 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4148 * few others. There are a few ways that I can see to do this, but most of
4149 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4150 * technique chosen here is to assume that any non-default qdisc that we
4151 * create will have a class with handle 1:0. The built-in qdiscs only have
4152 * a class with handle 0:0.
4154 * We could check for Linux 2.6.35+ and use a more straightforward method
4156 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
4160 tcmsg->tcm_handle = tc_make_handle(1, 0);
4161 tcmsg->tcm_parent = 0;
4163 /* Figure out what tc class to instantiate. */
4164 error = tc_transact(&request, &qdisc);
4168 error = tc_parse_qdisc(qdisc, &kind, NULL);
4170 ops = &tc_ops_other;
4172 ops = tc_lookup_linux_name(kind);
4174 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4175 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4177 ops = &tc_ops_other;
4180 } else if (error == ENOENT) {
4181 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4182 * other entity that doesn't have a handle 1:0. We will assume
4183 * that it's the system default qdisc. */
4184 ops = &tc_ops_default;
4187 /* Who knows? Maybe the device got deleted. */
4188 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4189 netdev_get_name(netdev), strerror(error));
4190 ops = &tc_ops_other;
4193 /* Instantiate it. */
4194 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev), qdisc);
4195 ovs_assert((load_error == 0) == (netdev_dev->tc != NULL));
4196 ofpbuf_delete(qdisc);
4198 return error ? error : load_error;
4201 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4202 approximate the time to transmit packets of various lengths. For an MTU of
4203 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4204 represents two possible packet lengths; for a MTU of 513 through 1024, four
4205 possible lengths; and so on.
4207 Returns, for the specified 'mtu', the number of bits that packet lengths
4208 need to be shifted right to fit within such a 256-entry table. */
4210 tc_calc_cell_log(unsigned int mtu)
4215 mtu = ETH_PAYLOAD_MAX;
4217 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4219 for (cell_log = 0; mtu >= 256; cell_log++) {
4226 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4229 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4231 memset(rate, 0, sizeof *rate);
4232 rate->cell_log = tc_calc_cell_log(mtu);
4233 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4234 /* rate->cell_align = 0; */ /* distro headers. */
4235 rate->mpu = ETH_TOTAL_MIN;
/* Appends an "rtab" table computed for 'rate' to 'msg', as a Netlink attribute
 * of the given 'type'.  Entry 'i' of the table holds the number of ticks
 * needed to send a packet of (i + 1) << cell_log bytes, with packet sizes
 * below 'rate->mpu' rounded up to 'rate->mpu'.
 *
 * See tc_calc_cell_log() for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *table = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    const unsigned int n_entries = TC_RTAB_SIZE / sizeof *table;
    unsigned int idx;

    for (idx = 0; idx < n_entries; idx++) {
        unsigned int len = (idx + 1) << rate->cell_log;

        table[idx] = tc_bytes_to_ticks(rate->rate,
                                       len < rate->mpu ? rate->mpu : len);
    }
}
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data at
 * 'Bps' plus one 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = MAX(burst_bytes, floor_bytes);

    return tc_bytes_to_ticks(Bps, burst);
}
4270 /* Linux-only functions declared in netdev-linux.h */
4272 /* Returns a fd for an AF_INET socket or a negative errno value. */
4274 netdev_linux_get_af_inet_sock(void)
4276 int error = netdev_linux_init();
4277 return error ? -error : af_inet_sock;
4280 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4281 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4283 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4284 const char *flag_name, bool enable)
4286 const char *netdev_name = netdev_get_name(netdev);
4287 struct ethtool_value evalue;
4291 COVERAGE_INC(netdev_get_ethtool);
4292 memset(&evalue, 0, sizeof evalue);
4293 error = netdev_linux_do_ethtool(netdev_name,
4294 (struct ethtool_cmd *)&evalue,
4295 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4300 COVERAGE_INC(netdev_set_ethtool);
4301 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4302 error = netdev_linux_do_ethtool(netdev_name,
4303 (struct ethtool_cmd *)&evalue,
4304 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4309 COVERAGE_INC(netdev_get_ethtool);
4310 memset(&evalue, 0, sizeof evalue);
4311 error = netdev_linux_do_ethtool(netdev_name,
4312 (struct ethtool_cmd *)&evalue,
4313 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4318 if (new_flags != evalue.data) {
4319 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4320 "device %s failed", enable ? "enable" : "disable",
4321 flag_name, netdev_name);
4328 /* Utility functions. */
4330 /* Copies 'src' into 'dst', performing format conversion in the process. */
4332 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4333 const struct rtnl_link_stats *src)
4335 dst->rx_packets = src->rx_packets;
4336 dst->tx_packets = src->tx_packets;
4337 dst->rx_bytes = src->rx_bytes;
4338 dst->tx_bytes = src->tx_bytes;
4339 dst->rx_errors = src->rx_errors;
4340 dst->tx_errors = src->tx_errors;
4341 dst->rx_dropped = src->rx_dropped;
4342 dst->tx_dropped = src->tx_dropped;
4343 dst->multicast = src->multicast;
4344 dst->collisions = src->collisions;
4345 dst->rx_length_errors = src->rx_length_errors;
4346 dst->rx_over_errors = src->rx_over_errors;
4347 dst->rx_crc_errors = src->rx_crc_errors;
4348 dst->rx_frame_errors = src->rx_frame_errors;
4349 dst->rx_fifo_errors = src->rx_fifo_errors;
4350 dst->rx_missed_errors = src->rx_missed_errors;
4351 dst->tx_aborted_errors = src->tx_aborted_errors;
4352 dst->tx_carrier_errors = src->tx_carrier_errors;
4353 dst->tx_fifo_errors = src->tx_fifo_errors;
4354 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4355 dst->tx_window_errors = src->tx_window_errors;
4359 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4361 /* Policy for RTNLGRP_LINK messages.
4363 * There are *many* more fields in these messages, but currently we only
4364 * care about these fields. */
4365 static const struct nl_policy rtnlgrp_link_policy[] = {
4366 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4367 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4368 .min_len = sizeof(struct rtnl_link_stats) },
4371 struct ofpbuf request;
4372 struct ofpbuf *reply;
4373 struct ifinfomsg *ifi;
4374 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4377 ofpbuf_init(&request, 0);
4378 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4379 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4380 ifi->ifi_family = PF_UNSPEC;
4381 ifi->ifi_index = ifindex;
4382 error = nl_sock_transact(rtnl_sock, &request, &reply);
4383 ofpbuf_uninit(&request);
4388 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4389 rtnlgrp_link_policy,
4390 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4391 ofpbuf_delete(reply);
4395 if (!attrs[IFLA_STATS]) {
4396 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4397 ofpbuf_delete(reply);
4401 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4403 ofpbuf_delete(reply);
4409 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4411 static const char fn[] = "/proc/net/dev";
4416 stream = fopen(fn, "r");
4418 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4423 while (fgets(line, sizeof line, stream)) {
4426 #define X64 "%"SCNu64
4429 X64 X64 X64 X64 X64 X64 X64 "%*u"
4430 X64 X64 X64 X64 X64 X64 X64 "%*u",
4436 &stats->rx_fifo_errors,
4437 &stats->rx_frame_errors,
4443 &stats->tx_fifo_errors,
4445 &stats->tx_carrier_errors) != 15) {
4446 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4447 } else if (!strcmp(devname, netdev_name)) {
4448 stats->rx_length_errors = UINT64_MAX;
4449 stats->rx_over_errors = UINT64_MAX;
4450 stats->rx_crc_errors = UINT64_MAX;
4451 stats->rx_missed_errors = UINT64_MAX;
4452 stats->tx_aborted_errors = UINT64_MAX;
4453 stats->tx_heartbeat_errors = UINT64_MAX;
4454 stats->tx_window_errors = UINT64_MAX;
4460 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4466 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4472 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4475 *flags = ifr.ifr_flags;
4481 set_flags(const char *name, unsigned int flags)
4485 ifr.ifr_flags = flags;
4486 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4490 do_get_ifindex(const char *netdev_name)
4494 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4495 COVERAGE_INC(netdev_get_ifindex);
4496 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4497 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4498 netdev_name, strerror(errno));
4501 return ifr.ifr_ifindex;
4505 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4507 struct netdev_dev_linux *netdev_dev =
4508 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4510 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4511 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4514 netdev_dev->get_ifindex_error = -ifindex;
4515 netdev_dev->ifindex = 0;
4517 netdev_dev->get_ifindex_error = 0;
4518 netdev_dev->ifindex = ifindex;
4520 netdev_dev->cache_valid |= VALID_IFINDEX;
4523 *ifindexp = netdev_dev->ifindex;
4524 return netdev_dev->get_ifindex_error;
4528 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4533 memset(&ifr, 0, sizeof ifr);
4534 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4535 COVERAGE_INC(netdev_get_hwaddr);
4536 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4537 /* ENODEV probably means that a vif disappeared asynchronously and
4538 * hasn't been removed from the database yet, so reduce the log level
4539 * to INFO for that case. */
4540 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4541 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4542 netdev_name, strerror(errno));
4545 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4546 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4547 VLOG_WARN("%s device has unknown hardware address family %d",
4548 netdev_name, hwaddr_family);
4550 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4555 set_etheraddr(const char *netdev_name,
4556 const uint8_t mac[ETH_ADDR_LEN])
4560 memset(&ifr, 0, sizeof ifr);
4561 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4562 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4563 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4564 COVERAGE_INC(netdev_set_hwaddr);
4565 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4566 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4567 netdev_name, strerror(errno));
4574 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4575 int cmd, const char *cmd_name)
4579 memset(&ifr, 0, sizeof ifr);
4580 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4581 ifr.ifr_data = (caddr_t) ecmd;
4584 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4587 if (errno != EOPNOTSUPP) {
4588 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4589 "failed: %s", cmd_name, name, strerror(errno));
4591 /* The device doesn't support this operation. That's pretty
4592 * common, so there's no point in logging anything. */
4599 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4600 const char *cmd_name)
4602 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4603 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4604 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4612 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4613 int cmd, const char *cmd_name)
4618 ifr.ifr_addr.sa_family = AF_INET;
4619 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4621 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4622 *ip = sin->sin_addr;
4627 /* Returns an AF_PACKET raw socket or a negative errno value. */
4629 af_packet_sock(void)
4631 static int sock = INT_MIN;
4633 if (sock == INT_MIN) {
4634 sock = socket(AF_PACKET, SOCK_RAW, 0);
4636 int error = set_nonblocking(sock);
4643 VLOG_ERR("failed to create packet socket: %s", strerror(errno));