2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/gen_stats.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_tun.h>
28 #include <linux/types.h>
29 #include <linux/ethtool.h>
30 #include <linux/mii.h>
31 #include <linux/pkt_cls.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
51 #include "dpif-linux.h"
52 #include "dynamic-string.h"
53 #include "fatal-signal.h"
56 #include "netdev-provider.h"
57 #include "netdev-vport.h"
58 #include "netlink-notifier.h"
59 #include "netlink-socket.h"
62 #include "openflow/openflow.h"
64 #include "poll-loop.h"
65 #include "rtnetlink-link.h"
67 #include "socket-util.h"
70 #include "unaligned.h"
/* Logging module and coverage counters for this file.  For example,
 * netdev_get_ethtool is incremented with COVERAGE_INC() before the
 * ETHTOOL_GLINK query in netdev_linux_get_miimon(). */
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75 COVERAGE_DEFINE(netdev_set_policing);
76 COVERAGE_DEFINE(netdev_arp_lookup);
77 COVERAGE_DEFINE(netdev_get_ifindex);
78 COVERAGE_DEFINE(netdev_get_hwaddr);
79 COVERAGE_DEFINE(netdev_set_hwaddr);
80 COVERAGE_DEFINE(netdev_get_ethtool);
81 COVERAGE_DEFINE(netdev_set_ethtool);
/* Fallback definitions for constants that older kernel headers may not
 * provide.  NOTE(review): the #endif lines that close each #ifndef, and the
 * guard around TC_RTAB_SIZE, are not visible in this excerpt. */
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
105 #define TC_RTAB_SIZE 1024
/* Shared rtnetlink link-change notifier and the number of references held on
 * it.  Both are managed by cache_notifier_ref() and cache_notifier_unref():
 * the notifier is created when the count goes from 0 to 1 and destroyed when
 * it drops back to 0. */
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
/* Bits for 'cache_valid' in struct netdev_dev_linux.  Each bit says that the
 * corresponding cached field is up to date.  NOTE(review): the enum header
 * and several members (for example VALID_MTU, which is used by the code
 * below) are elided from this excerpt. */
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
120 VALID_FEATURES = 1 << 8,
/* Generic traffic-control state: 'ops' selects the implementation and
 * 'queues' holds that implementation's struct tc_queue entries.
 * TC_INITIALIZER(TC, OPS) expands to a static initializer that sets both
 * members. */
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
133 * Each TC implementation subclasses this with whatever additional data it
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
/* Generic per-queue record: identified by its OpenFlow queue ID and linked
 * into the owning tc's 'queues' hmap through 'hmap_node'. */
144 /* One traffic control queue.
146 * Each TC implementation subclasses this with whatever additional data it
149 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
150 unsigned int queue_id; /* OpenFlow queue ID. */
/* Callback table supplied by each traffic-control (TC) implementation.  The
 * instances declared in this file are tc_ops_htb, tc_ops_hfsc, tc_ops_default
 * and tc_ops_other. */
153 /* A particular kind of traffic control. Each implementation generally maps to
154 * one particular Linux qdisc class.
156 * The functions below return 0 if successful or a positive errno value on
157 * failure, except where otherwise noted. All of them must be provided, except
158 * where otherwise noted. */
160 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
161 * This is null for tc_ops_default and tc_ops_other, for which there are no
162 * appropriate values. */
163 const char *linux_name;
165 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
166 const char *ovs_name;
168 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
169 * queues. The queues are numbered 0 through n_queues - 1. */
170 unsigned int n_queues;
172 /* Called to install this TC class on 'netdev'. The implementation should
173 * make the Netlink calls required to set up 'netdev' with the right qdisc
174 * and configure it according to 'details'. The implementation may assume
175 * that the current qdisc is the default; that is, there is no need for it
176 * to delete the current qdisc before installing itself.
178 * The contents of 'details' should be documented as valid for 'ovs_name'
179 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
180 * (which is built as ovs-vswitchd.conf.db(8)).
182 * This function must return 0 if and only if it sets 'netdev->tc' to an
183 * initialized 'struct tc'.
185 * (This function is null for tc_ops_other, which cannot be installed. For
186 * other TC classes it should always be nonnull.) */
187 int (*tc_install)(struct netdev *netdev, const struct smap *details);
189 /* Called when the netdev code determines (through a Netlink query) that
190 * this TC class's qdisc is installed on 'netdev', but we didn't install
191 * it ourselves and so don't know any of the details.
193 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
194 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
195 * implementation should parse the other attributes of 'nlmsg' as
196 * necessary to determine its configuration. If necessary it should also
197 * use Netlink queries to determine the configuration of queues on
200 * This function must return 0 if and only if it sets 'netdev->tc' to an
201 * initialized 'struct tc'. */
202 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
204 /* Destroys the data structures allocated by the implementation as part of
205 * 'tc'. (This includes destroying 'tc->queues' by calling
208 * The implementation should not need to perform any Netlink calls. If
209 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
210 * (But it may not be desirable.)
212 * This function may be null if 'tc' is trivial. */
213 void (*tc_destroy)(struct tc *tc);
215 /* Retrieves details of 'netdev->tc' configuration into 'details'.
217 * The implementation should not need to perform any Netlink calls, because
218 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
219 * cached the configuration.
221 * The contents of 'details' should be documented as valid for 'ovs_name'
222 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
223 * (which is built as ovs-vswitchd.conf.db(8)).
225 * This function may be null if 'tc' is not configurable.
227 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
229 /* Reconfigures 'netdev->tc' according to 'details', performing any
230 * required Netlink calls to complete the reconfiguration.
232 * The contents of 'details' should be documented as valid for 'ovs_name'
233 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
234 * (which is built as ovs-vswitchd.conf.db(8)).
236 * This function may be null if 'tc' is not configurable.
238 int (*qdisc_set)(struct netdev *, const struct smap *details);
240 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
241 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
243 * The contents of 'details' should be documented as valid for 'ovs_name'
244 * in the "other_config" column in the "Queue" table in
245 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
247 * The implementation should not need to perform any Netlink calls, because
248 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
249 * cached the queue configuration.
251 * This function may be null if 'tc' does not have queues ('n_queues' is
253 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
254 struct smap *details);
256 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
257 * 'details', performing any required Netlink calls to complete the
258 * reconfiguration. The caller ensures that 'queue_id' is less than
261 * The contents of 'details' should be documented as valid for 'ovs_name'
262 * in the "other_config" column in the "Queue" table in
263 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
265 * This function may be null if 'tc' does not have queues or its queues are
266 * not configurable. */
267 int (*class_set)(struct netdev *, unsigned int queue_id,
268 const struct smap *details);
270 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
271 * tc_queue's within 'netdev->tc->queues'.
273 * This function may be null if 'tc' does not have queues or its queues
274 * cannot be deleted. */
275 int (*class_delete)(struct netdev *, struct tc_queue *queue);
277 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
278 * 'struct tc_queue's within 'netdev->tc->queues'.
280 * On success, initializes '*stats'.
282 * This function may be null if 'tc' does not have queues or if it cannot
283 * report queue statistics. */
284 int (*class_get_stats)(const struct netdev *netdev,
285 const struct tc_queue *queue,
286 struct netdev_queue_stats *stats);
288 /* Extracts queue stats from 'nlmsg', which is a response to a
289 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
291 * This function may be null if 'tc' does not have queues or if it cannot
292 * report queue statistics. */
293 int (*class_dump_stats)(const struct netdev *netdev,
294 const struct ofpbuf *nlmsg,
295 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes the generic part of 'tc' for use with 'ops'.  The visible
 * statement sets up an empty 'queues' hmap.  NOTE(review): the statement that
 * records 'ops' is presumably on an elided line -- confirm. */
299 tc_init(struct tc *tc, const struct tc_ops *ops)
302 hmap_init(&tc->queues);
/* Releases the generic part of 'tc' by destroying its 'queues' hmap. */
306 tc_destroy(struct tc *tc)
308 hmap_destroy(&tc->queues);
/* Forward declarations of the TC implementations, followed by the table that
 * lists them.  NOTE(review): the lines that terminate and close the 'tcs'
 * array are elided from this excerpt. */
311 static const struct tc_ops tc_ops_htb;
312 static const struct tc_ops tc_ops_hfsc;
313 static const struct tc_ops tc_ops_default;
314 static const struct tc_ops tc_ops_other;
316 static const struct tc_ops *const tcs[] = {
317 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
318 &tc_ops_hfsc, /* Hierarchical fair service curve. */
319 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
320 &tc_ops_other, /* Some other qdisc. */
/* Forward declarations of the traffic-control helpers (handle arithmetic,
 * rate/tick conversion, Netlink request construction and parsing, qdisc and
 * class management).  Their definitions are not part of this excerpt. */
324 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
325 static unsigned int tc_get_major(unsigned int handle);
326 static unsigned int tc_get_minor(unsigned int handle);
328 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
329 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
330 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
332 static struct tcmsg *tc_make_request(const struct netdev *, int type,
333 unsigned int flags, struct ofpbuf *);
334 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
335 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
336 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
339 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
340 struct nlattr **options);
341 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
342 struct nlattr **options,
343 struct netdev_queue_stats *);
344 static int tc_query_class(const struct netdev *,
345 unsigned int handle, unsigned int parent,
346 struct ofpbuf **replyp);
347 static int tc_delete_class(const struct netdev *, unsigned int handle);
349 static int tc_del_qdisc(struct netdev *netdev);
350 static int tc_query_qdisc(const struct netdev *netdev);
352 static int tc_calc_cell_log(unsigned int mtu);
353 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
354 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
355 const struct tc_ratespec *rate);
356 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Per-device state for a Linux network device: a cache of kernel attributes
 * whose validity is tracked bit-by-bit in 'cache_valid', the cached error
 * code of each query, MII link-monitoring state, and tap-device state.
 * NOTE(review): several members used by the code below ('ifindex', 'mtu',
 * 'tc', and the 'state' wrapper around 'tap') are on lines elided from this
 * excerpt. */
358 struct netdev_dev_linux {
359 struct netdev_dev netdev_dev;
361 struct shash_node *shash_node;
362 unsigned int cache_valid;
363 unsigned int change_seq;
365 bool miimon; /* Link status of last poll. */
366 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
367 struct timer miimon_timer;
369 /* The following are figured out "on demand" only. They are only valid
370 * when the corresponding VALID_* bit in 'cache_valid' is set. */
372 uint8_t etheraddr[ETH_ADDR_LEN];
373 struct in_addr address, netmask;
376 unsigned int ifi_flags;
377 long long int carrier_resets;
378 uint32_t kbits_rate; /* Policing data. */
379 uint32_t kbits_burst;
380 int vport_stats_error; /* Cached error code from vport_get_stats().
381 0 or an errno value. */
382 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
383 int ether_addr_error; /* Cached error code from set/get etheraddr. */
384 int netdev_policing_error; /* Cached error code from set policing. */
385 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
386 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
388 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
391 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
393 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
397 struct tap_state tap;
/* Per-open handle on a Linux network device.  The code below also uses an
 * 'fd' member, negative while the handle is not listening; its declaration is
 * on a line elided from this excerpt. */
401 struct netdev_linux {
402 struct netdev netdev;
/* File-wide sockets and log rate limiter.  'af_inet_sock' and 'rtnl_sock' are
 * created by netdev_linux_init(). */
406 /* Sockets used for ioctl operations. */
407 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
409 /* A Netlink routing socket that is not subscribed to any multicast groups. */
410 static struct nl_sock *rtnl_sock;
412 /* This is set pretty low because we probably won't learn anything from the
413 * additional log messages. */
414 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
/* Forward declarations for helpers that are used before they are defined:
 * ioctl/ethtool wrappers, interface flag and ifindex accessors, address and
 * statistics helpers, and the MII monitoring hooks. */
416 static int netdev_linux_init(void);
418 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
419 int cmd, const char *cmd_name);
420 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
421 const char *cmd_name);
422 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
423 int cmd, const char *cmd_name);
424 static int get_flags(const struct netdev_dev *, unsigned int *flags);
425 static int set_flags(struct netdev *, unsigned int flags);
426 static int do_get_ifindex(const char *netdev_name);
427 static int get_ifindex(const struct netdev *, int *ifindexp);
428 static int do_set_addr(struct netdev *netdev,
429 int ioctl_nr, const char *ioctl_name,
430 struct in_addr addr);
431 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
432 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
433 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
434 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
435 static int af_packet_sock(void);
436 static void netdev_linux_miimon_run(void);
437 static void netdev_linux_miimon_wait(void);
/* Tests whether 'netdev_class' uses netdev_linux_init() as its 'init'
 * callback.  The cast helpers below rely on this to recognize the netdev
 * classes that carry this file's private structures. */
440 is_netdev_linux_class(const struct netdev_class *netdev_class)
442 return netdev_class->init == netdev_linux_init;
/* Converts generic 'netdev_dev' into the struct netdev_dev_linux that embeds
 * it, after asserting that its class passes is_netdev_linux_class(). */
445 static struct netdev_dev_linux *
446 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
448 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
449 ovs_assert(is_netdev_linux_class(netdev_class));
451 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts generic 'netdev' into the struct netdev_linux that embeds it, after
 * asserting that the class of its underlying device passes
 * is_netdev_linux_class(). */
454 static struct netdev_linux *
455 netdev_linux_cast(const struct netdev *netdev)
457 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
458 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
459 ovs_assert(is_netdev_linux_class(netdev_class));
461 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Creates the sockets shared by this file: the AF_INET datagram socket
 * 'af_inet_sock' and the rtnetlink socket 'rtnl_sock'.  'status' is static, so
 * the result (0, or a positive errno value) persists between calls.
 * NOTE(review): the conditionals that guard each step and the final return
 * are on lines elided from this excerpt. */
465 netdev_linux_init(void)
467 static int status = -1;
469 /* Create AF_INET socket. */
470 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
471 status = af_inet_sock >= 0 ? 0 : errno;
473 VLOG_ERR("failed to create inet socket: %s", strerror(status));
476 /* Create rtnetlink socket. */
478 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
480 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic work for this file: runs the rtnetlink link notifier and then MII
 * link monitoring. */
489 netdev_linux_run(void)
491 rtnetlink_link_run();
492 netdev_linux_miimon_run();
/* Calls the wait-side counterparts of the two steps performed by
 * netdev_linux_run(). */
496 netdev_linux_wait(void)
498 rtnetlink_link_wait();
499 netdev_linux_miimon_wait();
/* Records a change to 'dev': counts a carrier reset when IFF_RUNNING differs
 * between the cached flags and 'ifi_flags', stores 'ifi_flags', and keeps only
 * the 'cache_valid' bits that are also set in 'mask' (so a 'mask' of 0
 * invalidates the whole cache).  NOTE(review): the 'mask' parameter
 * declaration and the 'change_seq' update are on elided lines. */
503 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
504 unsigned int ifi_flags,
508 if (!dev->change_seq) {
512 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
513 dev->carrier_resets++;
515 dev->ifi_flags = ifi_flags;
517 dev->cache_valid &= mask;
/* Applies rtnetlink notification 'change' to 'dev'.
 *
 * For RTM_NEWLINK, everything except VALID_DRVINFO is invalidated and the
 * cache is then refilled from the message: MTU, Ethernet address (only when
 * nonzero), and ifindex, each with its error code cleared.  The final call
 * invalidates the whole cache; NOTE(review): it is presumably the 'else'
 * branch for other message types, but the braces and some guards are elided
 * from this excerpt. */
521 netdev_dev_linux_update(struct netdev_dev_linux *dev,
522 const struct rtnetlink_link_change *change)
524 if (change->nlmsg_type == RTM_NEWLINK) {
526 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
528 /* Update netdev from rtnl-change msg. */
530 dev->mtu = change->mtu;
531 dev->cache_valid |= VALID_MTU;
532 dev->netdev_mtu_error = 0;
535 if (!eth_addr_is_zero(change->addr)) {
536 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
537 dev->cache_valid |= VALID_ETHERADDR;
538 dev->ether_addr_error = 0;
541 dev->ifindex = change->ifi_index;
542 dev->cache_valid |= VALID_IFINDEX;
543 dev->get_ifindex_error = 0;
546 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
/* Callback registered with the rtnetlink link notifier (see
 * cache_notifier_ref()).
 *
 * The first part looks up the device named by 'change->ifname' and, if it
 * belongs to a Linux netdev class, forwards 'change' to
 * netdev_dev_linux_update().  The second part walks every netdev_linux_class
 * device, re-reads its flags, and invalidates its whole cache.
 * NOTE(review): the condition selecting between the two parts is elided;
 * presumably the second runs when no specific change is supplied. */
551 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
552 void *aux OVS_UNUSED)
554 struct netdev_dev_linux *dev;
556 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
558 const struct netdev_class *netdev_class =
559 netdev_dev_get_class(base_dev);
561 if (is_netdev_linux_class(netdev_class)) {
562 dev = netdev_dev_linux_cast(base_dev);
563 netdev_dev_linux_update(dev, change);
567 struct shash device_shash;
568 struct shash_node *node;
570 shash_init(&device_shash);
571 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
572 SHASH_FOR_EACH (node, &device_shash) {
577 get_flags(&dev->netdev_dev, &flags);
578 netdev_dev_linux_changed(dev, flags, 0);
580 shash_destroy(&device_shash);
/* Takes a reference on the shared link notifier, creating it (with
 * netdev_linux_cache_cb() as its callback) when the reference count is 0.
 * Callers treat a nonzero return as an error code.  NOTE(review): the return
 * statements, including the one for a failed creation, are elided from this
 * excerpt. */
585 cache_notifier_ref(void)
587 if (!cache_notifier_refcount) {
588 ovs_assert(!netdev_linux_cache_notifier);
590 netdev_linux_cache_notifier =
591 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
593 if (!netdev_linux_cache_notifier) {
597 cache_notifier_refcount++;
/* Drops one reference on the shared link notifier.  When the last reference
 * goes away the notifier is destroyed and the pointer reset to NULL.  Asserts
 * that a reference is actually held. */
603 cache_notifier_unref(void)
605 ovs_assert(cache_notifier_refcount > 0);
606 if (!--cache_notifier_refcount) {
607 ovs_assert(netdev_linux_cache_notifier);
608 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
609 netdev_linux_cache_notifier = NULL;
613 /* Creates system and internal devices.
 *
 * Takes a reference on the shared link notifier, allocates a zeroed
 * netdev_dev_linux whose 'change_seq' starts at 1, primes the cached
 * interface flags with get_flags() (its result is ignored), and stores the
 * new device in '*netdev_devp'.  NOTE(review): the error check after
 * cache_notifier_ref() and the return statements are elided from this
 * excerpt. */
615 netdev_linux_create(const struct netdev_class *class, const char *name,
616 struct netdev_dev **netdev_devp)
618 struct netdev_dev_linux *netdev_dev;
621 error = cache_notifier_ref();
626 netdev_dev = xzalloc(sizeof *netdev_dev);
627 netdev_dev->change_seq = 1;
628 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
629 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
631 *netdev_devp = &netdev_dev->netdev_dev;
635 /* For most types of netdevs we open the device for each call of
636 * netdev_open(). However, this is not the case with tap devices,
637 * since it is only possible to open the device once. In this
638 * situation we share a single file descriptor, and consequently
639 * buffers, across all readers. Therefore once data is read it will
640 * be unavailable to other reads for tap devices. */
642 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
643 const char *name, struct netdev_dev **netdev_devp)
645 struct netdev_dev_linux *netdev_dev;
646 struct tap_state *state;
647 static const char tap_dev[] = "/dev/net/tun";
651 netdev_dev = xzalloc(sizeof *netdev_dev);
652 state = &netdev_dev->state.tap;
654 error = cache_notifier_ref();
659 /* Open tap device. */
660 state->fd = open(tap_dev, O_RDWR);
663 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
664 goto error_unref_notifier;
667 /* Create tap device. */
668 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
669 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
670 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
671 VLOG_WARN("%s: creating tap device failed: %s", name,
674 goto error_unref_notifier;
677 /* Make non-blocking. */
678 error = set_nonblocking(state->fd);
680 goto error_unref_notifier;
683 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
684 *netdev_devp = &netdev_dev->netdev_dev;
687 error_unref_notifier:
688 cache_notifier_unref();
/* Releases the tap-specific state of 'netdev_dev'.  Acts only when the tap
 * file descriptor is valid (>= 0).  NOTE(review): the body of the conditional
 * is elided from this excerpt; presumably it closes the descriptor. */
695 destroy_tap(struct netdev_dev_linux *netdev_dev)
697 struct tap_state *state = &netdev_dev->state.tap;
699 if (state->fd >= 0) {
704 /* Destroys the netdev device 'netdev_dev_'.
 *
 * Invokes the traffic-control implementation's 'tc_destroy' hook when a tc is
 * attached and the hook is nonnull, releases tap state for tap-class devices,
 * and drops this device's reference on the shared link notifier. */
706 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
708 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
709 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
711 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
712 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
715 if (class == &netdev_tap_class) {
716 destroy_tap(netdev_dev);
720 cache_notifier_unref();
/* Opens a new handle on 'netdev_dev_' and returns it in '*netdevp'.
 *
 * Except for internal-class devices, the device's flags are read to verify
 * that it exists, and ENODEV from that check is singled out.
 * NOTE(review): the handling of ENODEV and the return statements are on
 * elided lines; presumably the failure path is the one ending in
 * netdev_uninit(). */
724 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
726 struct netdev_linux *netdev;
727 enum netdev_flags flags;
730 /* Allocate network device. */
731 netdev = xzalloc(sizeof *netdev);
733 netdev_init(&netdev->netdev, netdev_dev_);
735 /* Verify that the device really exists, by attempting to read its flags.
736 * (The flags might be cached, in which case this won't actually do an
739 * Don't do this for "internal" netdevs, though, because those have to be
740 * created as netdev objects before they exist in the kernel, because
741 * creating them in the kernel happens by passing a netdev object to
742 * dpif_port_add(). */
743 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
744 error = netdev_get_flags(&netdev->netdev, &flags);
745 if (error == ENODEV) {
750 *netdevp = &netdev->netdev;
754 netdev_uninit(&netdev->netdev, true);
758 /* Closes and destroys 'netdev'. */
760 netdev_linux_close(struct netdev *netdev_)
762 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
764 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
/* Prepares 'netdev_' for receiving packets.
 *
 * First checks whether 'netdev_' already has an fd (presumably returning
 * early).  The first listener on a tap device adopts the device's shared tap
 * fd and marks it as opened.  Every other case creates a non-blocking
 * PF_PACKET raw socket and binds it to the device's ifindex for all protocols
 * (ETH_P_ALL).  NOTE(review): the return statements and the error cleanup
 * path are on lines elided from this excerpt. */
771 netdev_linux_listen(struct netdev *netdev_)
773 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
774 struct netdev_dev_linux *netdev_dev =
775 netdev_dev_linux_cast(netdev_get_dev(netdev_));
776 struct sockaddr_ll sll;
781 if (netdev->fd >= 0) {
785 if (!strcmp(netdev_get_type(netdev_), "tap")
786 && !netdev_dev->state.tap.opened) {
787 netdev->fd = netdev_dev->state.tap.fd;
788 netdev_dev->state.tap.opened = true;
792 /* Create file descriptor. */
793 fd = socket(PF_PACKET, SOCK_RAW, 0);
796 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
800 /* Set non-blocking mode. */
801 error = set_nonblocking(fd);
806 /* Get ethernet device index. */
807 error = get_ifindex(&netdev->netdev, &ifindex);
812 /* Bind to specific ethernet device. */
813 memset(&sll, 0, sizeof sll);
814 sll.sll_family = AF_PACKET;
815 sll.sll_ifindex = ifindex;
816 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
817 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
819 VLOG_ERR("%s: failed to bind raw socket (%s)",
820 netdev_get_name(netdev_), strerror(error));
835 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
837 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
839 if (netdev->fd < 0) {
840 /* Device is not listening. */
847 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
848 ? read(netdev->fd, data, size)
849 : recv(netdev->fd, data, size, MSG_TRUNC));
851 return retval <= size ? retval : -EMSGSIZE;
852 } else if (errno != EINTR) {
853 if (errno != EAGAIN) {
854 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
855 strerror(errno), netdev_get_name(netdev_));
862 /* Registers with the poll loop to wake up from the next call to poll_block()
863 * when a packet is ready to be received with netdev_recv() on 'netdev'.
 *
 * Does nothing unless 'netdev' has a file descriptor, that is, unless it is
 * listening. */
865 netdev_linux_recv_wait(struct netdev *netdev_)
867 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
868 if (netdev->fd >= 0) {
869 poll_fd_wait(netdev->fd, POLLIN);
873 /* Discards all packets waiting to be received from 'netdev'.
 *
 * For a tap device, queries the transmit queue length with SIOCGIFTXQLEN and
 * passes it to drain_fd() as the limit.  For any other listening device,
 * returns the result of drain_rcvbuf() on its socket.  NOTE(review): the
 * handling of a non-listening device and of a failed ioctl is on elided
 * lines. */
875 netdev_linux_drain(struct netdev *netdev_)
877 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
878 if (netdev->fd < 0) {
880 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
882 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
883 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
887 drain_fd(netdev->fd, ifr.ifr_qlen);
890 return drain_rcvbuf(netdev->fd);
894 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
895 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
896 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
897 * the packet is too big or too small to transmit on the device.
899 * The caller retains ownership of 'buffer' in all cases.
901 * The kernel maintains a packet transmission queue, so the caller is not
902 * expected to do additional queuing of packets.
 *
 * A handle that is not listening (fd < 0) sends through the shared AF_PACKET
 * socket with sendmsg(), addressed by the device's ifindex; a listening
 * handle writes to its own fd.  NOTE(review): the parameters are named 'data'
 * and 'size' in the signature, so 'buffer' above refers to 'data'. */
904 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
906 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
910 if (netdev->fd < 0) {
911 /* Use our AF_PACKET socket to send to this device. */
912 struct sockaddr_ll sll;
919 sock = af_packet_sock();
924 error = get_ifindex(netdev_, &ifindex);
929 /* We don't bother setting most fields in sockaddr_ll because the
930 * kernel ignores them for SOCK_RAW. */
931 memset(&sll, 0, sizeof sll);
932 sll.sll_family = AF_PACKET;
933 sll.sll_ifindex = ifindex;
935 iov.iov_base = CONST_CAST(void *, data);
939 msg.msg_namelen = sizeof sll;
942 msg.msg_control = NULL;
943 msg.msg_controllen = 0;
946 retval = sendmsg(sock, &msg, 0);
948 /* Use the netdev's own fd to send to this device. This is
949 * essential for tap devices, because packets sent to a tap device
950 * with an AF_PACKET socket will loop back to be *received* again
951 * on the tap device. */
952 retval = write(netdev->fd, data, size);
956 /* The Linux AF_PACKET implementation never blocks waiting for room
957 * for packets, instead returning ENOBUFS. Translate this into
958 * EAGAIN for the caller. */
959 if (errno == ENOBUFS) {
961 } else if (errno == EINTR) {
963 } else if (errno != EAGAIN) {
964 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
965 netdev_get_name(netdev_), strerror(errno));
968 } else if (retval != size) {
969 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
970 "%zu) on %s", retval, size, netdev_get_name(netdev_));
978 /* Registers with the poll loop to wake up from the next call to poll_block()
979 * when the packet transmission queue has sufficient room to transmit a packet
980 * with netdev_send().
982 * The kernel maintains a packet transmission queue, so the client is not
983 * expected to do additional queuing of packets. Thus, this function is
984 * unlikely to ever be used. It is included for completeness.
 *
 * A listening non-tap device waits for POLLOUT on its fd; a listening tap
 * device wakes the poll loop immediately.  NOTE(review): the action taken
 * for a non-listening device is on an elided line. */
986 netdev_linux_send_wait(struct netdev *netdev_)
988 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
989 if (netdev->fd < 0) {
991 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
992 poll_fd_wait(netdev->fd, POLLOUT);
994 /* TAP device always accepts packets.*/
995 poll_immediate_wake();
999 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1000 * otherwise a positive errno value.
 *
 * When the address cache is valid, a cached error is returned as-is and a
 * request for the address already cached is detected (presumably returning
 * early; that line is elided); otherwise the cache bit is cleared.  A tap
 * device that is up is brought down before the change and turned back on
 * afterwards.  A result of 0 or ENODEV is cached as the new address state. */
1002 netdev_linux_set_etheraddr(struct netdev *netdev_,
1003 const uint8_t mac[ETH_ADDR_LEN])
1005 struct netdev_dev_linux *netdev_dev =
1006 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1008 bool up_again = false;
1010 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1011 if (netdev_dev->ether_addr_error) {
1012 return netdev_dev->ether_addr_error;
1014 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
1017 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1020 /* Tap devices must be brought down before setting the address. */
1021 if (!strcmp(netdev_get_type(netdev_), "tap")) {
1022 enum netdev_flags flags;
1024 if (!netdev_get_flags(netdev_, &flags) && (flags & NETDEV_UP)) {
1025 netdev_turn_flags_off(netdev_, NETDEV_UP, false);
1029 error = set_etheraddr(netdev_get_name(netdev_), mac);
1030 if (!error || error == ENODEV) {
1031 netdev_dev->ether_addr_error = error;
1032 netdev_dev->cache_valid |= VALID_ETHERADDR;
1034 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1039 netdev_turn_flags_on(netdev_, NETDEV_UP, false);
1045 /* Copies 'netdev''s MAC address to 'mac' which is passed as param.
 *
 * The address and the error code of the query are fetched with
 * get_etheraddr() on first use and cached.  Returns the cached error code;
 * 'mac' is written only when that code is 0. */
1047 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1048 uint8_t mac[ETH_ADDR_LEN])
1050 struct netdev_dev_linux *netdev_dev =
1051 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1053 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1054 int error = get_etheraddr(netdev_get_name(netdev_),
1055 netdev_dev->etheraddr);
1057 netdev_dev->ether_addr_error = error;
1058 netdev_dev->cache_valid |= VALID_ETHERADDR;
1061 if (!netdev_dev->ether_addr_error) {
1062 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1065 return netdev_dev->ether_addr_error;
1068 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1069 * in bytes, not including the hardware header; thus, this is typically 1500
1070 * bytes for Ethernet devices.
 *
 * The MTU and the error code of the SIOCGIFMTU query are cached on first use.
 * The return value is that cached error code (0 on success); '*mtup' is
 * written only when it is 0. */
1072 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1074 struct netdev_dev_linux *netdev_dev =
1075 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1076 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1080 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1081 SIOCGIFMTU, "SIOCGIFMTU");
1083 netdev_dev->netdev_mtu_error = error;
1084 netdev_dev->mtu = ifr.ifr_mtu;
1085 netdev_dev->cache_valid |= VALID_MTU;
1088 if (!netdev_dev->netdev_mtu_error) {
1089 *mtup = netdev_dev->mtu;
1091 return netdev_dev->netdev_mtu_error;
/* Summary of the visible logic: when the MTU cache is valid, a cached error
 * is returned as-is and a request equal to the cached MTU is detected
 * (presumably returning early; that line is elided); otherwise the cache bit
 * is cleared and SIOCSIFMTU is issued.  A result of 0 or ENODEV is cached
 * together with 'ifr.ifr_mtu'. */
1094 /* Sets the maximum size of transmitted (MTU) for given device using linux
1095 * networking ioctl interface.
1098 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1100 struct netdev_dev_linux *netdev_dev =
1101 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1105 if (netdev_dev->cache_valid & VALID_MTU) {
1106 if (netdev_dev->netdev_mtu_error) {
1107 return netdev_dev->netdev_mtu_error;
1109 if (netdev_dev->mtu == mtu) {
1112 netdev_dev->cache_valid &= ~VALID_MTU;
1115 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1116 SIOCSIFMTU, "SIOCSIFMTU");
1117 if (!error || error == ENODEV) {
1118 netdev_dev->netdev_mtu_error = error;
1119 netdev_dev->mtu = ifr.ifr_mtu;
1120 netdev_dev->cache_valid |= VALID_MTU;
1125 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1126 * On failure, returns a negative errno value.
 *
 * Thin wrapper around get_ifindex(), whose nonzero error result is negated
 * here. */
1128 netdev_linux_get_ifindex(const struct netdev *netdev)
1132 error = get_ifindex(netdev, &ifindex);
1133 return error ? -error : ifindex;
/* Reports the link state of 'netdev_' in '*carrier': the result of the last
 * MII poll when MII monitoring is enabled ('miimon_interval' > 0), otherwise
 * whether IFF_RUNNING is set in the cached interface flags. */
1137 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1139 struct netdev_dev_linux *netdev_dev =
1140 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1142 if (netdev_dev->miimon_interval > 0) {
1143 *carrier = netdev_dev->miimon;
1145 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the 'carrier_resets' counter of the device underlying 'netdev'.
 * The counter is incremented by netdev_dev_linux_changed() whenever
 * IFF_RUNNING toggles. */
1151 static long long int
1152 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1154 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
/* Issues MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name'.
 * '*data' is copied into the ifreq's 'ifr_data' area before the call and
 * copied back out afterwards, so that values filled in by the ioctl reach the
 * caller. */
1158 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1159 struct mii_ioctl_data *data)
1164 memset(&ifr, 0, sizeof ifr);
1165 memcpy(&ifr.ifr_data, data, sizeof *data);
1166 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1167 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of the device called 'name' and stores it in
 * '*miimon'.
 *
 * The PHY is queried first: SIOCGMIIPHY, then SIOCGMIIREG on the basic mode
 * status register (MII_BMSR), whose BMSR_LSTATUS bit gives the link state.
 * If the MII query fails, the function falls back to ethtool's ETHTOOL_GLINK
 * command. */
1173 netdev_linux_get_miimon(const char *name, bool *miimon)
1175 struct mii_ioctl_data data;
1180 memset(&data, 0, sizeof data);
1181 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1183 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1184 data.reg_num = MII_BMSR;
1185 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1189 *miimon = !!(data.val_out & BMSR_LSTATUS);
1191 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1194 struct ethtool_cmd ecmd;
1196 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1199 COVERAGE_INC(netdev_get_ethtool);
1200 memset(&ecmd, 0, sizeof ecmd);
1201 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1204 struct ethtool_value eval;
/* The ETHTOOL_GLINK reply is read back as a 'struct ethtool_value'; it is
 * copied out of the larger 'ecmd' buffer rather than accessed through a
 * pointer cast. */
1206 memcpy(&eval, &ecmd, sizeof eval);
1207 *miimon = !!eval.data;
1209 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII polling interval of 'netdev_' to 'interval' (presumably in
 * milliseconds -- the unit is not visible here).
 *
 * A positive interval is raised to at least 100; zero or a negative value is
 * stored as 0, which the other miimon functions treat as "disabled".  When
 * the stored interval changes, the poll timer is marked expired. */
1217 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1218 long long int interval)
1220 struct netdev_dev_linux *netdev_dev;
1222 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1224 interval = interval > 0 ? MAX(interval, 100) : 0;
1225 if (netdev_dev->miimon_interval != interval) {
1226 netdev_dev->miimon_interval = interval;
1227 timer_set_expired(&netdev_dev->miimon_timer);
/* Polls MII link status for the devices of class 'netdev_linux_class'.
 *
 * The check at line 1245 singles out devices whose monitoring is disabled or
 * whose timer has not expired (presumably they are skipped -- the body of
 * that branch is not visible).  For a polled device the link state is read
 * again; a change is stored and reported via netdev_dev_linux_changed(), and
 * the timer is re-armed with the device's interval. */
1234 netdev_linux_miimon_run(void)
1236 struct shash device_shash;
1237 struct shash_node *node;
1239 shash_init(&device_shash);
1240 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1241 SHASH_FOR_EACH (node, &device_shash) {
1242 struct netdev_dev_linux *dev = node->data;
1245 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1249 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1250 if (miimon != dev->miimon) {
1251 dev->miimon = miimon;
1252 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1255 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1258 shash_destroy(&device_shash);
/* Calls timer_wait() on the MII poll timer of every device of class
 * 'netdev_linux_class' that has MII monitoring enabled
 * (miimon_interval > 0). */
1262 netdev_linux_miimon_wait(void)
1264 struct shash device_shash;
1265 struct shash_node *node;
1267 shash_init(&device_shash);
1268 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1269 SHASH_FOR_EACH (node, &device_shash) {
1270 struct netdev_dev_linux *dev = node->data;
1272 if (dev->miimon_interval > 0) {
1273 timer_wait(&dev->miimon_timer);
1276 shash_destroy(&device_shash);
1279 /* Check whether we can use RTM_GETLINK to get network device statistics.
1280 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1283 check_for_working_netlink_stats(void)
1285 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1286 * preferable, so if that works, we'll use it. */
/* The loopback device "lo" serves as the probe target. */
1287 int ifindex = do_get_ifindex("lo");
1289 VLOG_WARN("failed to get ifindex for lo, "
1290 "obtaining netdev stats from proc");
1293 struct netdev_stats stats;
/* Try one real statistics request; only its error code matters here. */
1294 int error = get_stats_via_netlink(ifindex, &stats);
1296 VLOG_DBG("obtaining netdev stats via rtnetlink");
1299 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1300 "via proc (you are probably running a pre-2.6.19 "
1301 "kernel)", strerror(error));
1308 swap_uint64(uint64_t *a, uint64_t *b)
1315 /* Copies 'src' into 'dst', performing format conversion in the process.
1317 * 'src' is allowed to be misaligned. */
1319 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1320 const struct ovs_vport_stats *src)
/* The eight packet/byte/error/drop counters are read with
 * get_unaligned_u64() because 'src' may be misaligned. */
1322 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1323 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1324 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1325 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1326 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1327 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1328 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1329 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* The detailed counters below are not taken from 'src'; they are set to
 * zero. */
1331 dst->collisions = 0;
1332 dst->rx_length_errors = 0;
1333 dst->rx_over_errors = 0;
1334 dst->rx_crc_errors = 0;
1335 dst->rx_frame_errors = 0;
1336 dst->rx_fifo_errors = 0;
1337 dst->rx_missed_errors = 0;
1338 dst->tx_aborted_errors = 0;
1339 dst->tx_carrier_errors = 0;
1340 dst->tx_fifo_errors = 0;
1341 dst->tx_heartbeat_errors = 0;
1342 dst->tx_window_errors = 0;
/* Looks up the vport named after 'netdev' with dpif_linux_vport_get() and,
 * when the reply carries statistics, converts them into '*stats'.
 *
 * NOTE(review): the return statements and the handling of 'buf' are not
 * visible in this extract. */
1346 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1348 struct dpif_linux_vport reply;
1352 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1355 } else if (!reply.stats) {
1360 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Fetches the statistics of 'netdev_' from the vport layer into '*stats' and
 * records the outcome in the device's 'vport_stats_error' field.
 *
 * The query is made only when no error is currently recorded or when the
 * recorded value is not marked valid (VALID_VPORT_STAT_ERROR); a cached
 * failure therefore suppresses further attempts. */
1368 get_stats_via_vport(const struct netdev *netdev_,
1369 struct netdev_stats *stats)
1371 struct netdev_dev_linux *netdev_dev =
1372 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1374 if (!netdev_dev->vport_stats_error ||
1375 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1378 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT is not logged; every other failure gets a rate-limited warning. */
1379 if (error && error != ENOENT) {
1380 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1381 "(%s)", netdev_get_name(netdev_), strerror(error));
1383 netdev_dev->vport_stats_error = error;
1384 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Reads the statistics of 'netdev_' from the operating system into '*stats',
 * via rtnetlink or via the proc-based helper. */
1389 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1390 struct netdev_stats *stats)
/* -1 means "not decided yet"; the probe runs once, on the first call, and
 * its answer is kept in this static for later calls. */
1392 static int use_netlink_stats = -1;
1395 if (use_netlink_stats < 0) {
1396 use_netlink_stats = check_for_working_netlink_stats();
1399 if (use_netlink_stats) {
1402 error = get_ifindex(netdev_, &ifindex);
1404 error = get_stats_via_netlink(ifindex, stats);
1407 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1411 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1412 netdev_get_name(netdev_), error);
1418 /* Retrieves current device stats for 'netdev-linux'.
 *
 * Statistics are requested from the vport layer (into '*stats') and from the
 * operating system (into a local 'dev_stats').  Which of the two ends up in
 * '*stats' depends on whether the vport query failed; see the comments
 * below.  NOTE(review): the return statements are not visible in this
 * extract. */
1420 netdev_linux_get_stats(const struct netdev *netdev_,
1421 struct netdev_stats *stats)
1423 struct netdev_dev_linux *netdev_dev =
1424 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1425 struct netdev_stats dev_stats;
1428 get_stats_via_vport(netdev_, stats);
1430 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1433 if (netdev_dev->vport_stats_error) {
1440 if (netdev_dev->vport_stats_error) {
1441 /* Stats are not available from OVS, so use the system stats instead. */
/* The additions below apply when vport stats are available (presumably the
 * else branch): the system's error and drop counters are added on top. */
1444 stats->rx_errors += dev_stats.rx_errors;
1445 stats->tx_errors += dev_stats.tx_errors;
1446 stats->rx_dropped += dev_stats.rx_dropped;
1447 stats->tx_dropped += dev_stats.tx_dropped;
1448 stats->multicast += dev_stats.multicast;
1449 stats->collisions += dev_stats.collisions;
1450 stats->rx_length_errors += dev_stats.rx_length_errors;
1451 stats->rx_over_errors += dev_stats.rx_over_errors;
1452 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1453 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1454 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1455 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1456 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1457 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1458 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1459 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1460 stats->tx_window_errors += dev_stats.tx_window_errors;
1465 /* Retrieves current device stats for 'netdev-tap' netdev or
1466 * netdev-internal. */
1468 netdev_tap_get_stats(const struct netdev *netdev_,
1469 struct netdev_stats *stats)
1471 struct netdev_dev_linux *netdev_dev =
1472 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1473 struct netdev_stats dev_stats;
/* Ask the vport layer first ('*stats'), then the operating system
 * ('dev_stats'); 'vport_stats_error' decides how the two are combined. */
1476 get_stats_via_vport(netdev_, stats);
1478 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1480 if (netdev_dev->vport_stats_error) {
1487 /* If this port is an internal port then the transmit and receive stats
1488 * will appear to be swapped relative to the other ports since we are the
1489 * one sending the data, not a remote computer. For consistency, we swap
1490 * them back here. This does not apply if we are getting stats from the
1491 * vport layer because it always tracks stats from the perspective of the
1493 if (netdev_dev->vport_stats_error) {
1495 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1496 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1497 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1498 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1499 stats->rx_length_errors = 0;
1500 stats->rx_over_errors = 0;
1501 stats->rx_crc_errors = 0;
1502 stats->rx_frame_errors = 0;
1503 stats->rx_fifo_errors = 0;
1504 stats->rx_missed_errors = 0;
1505 stats->tx_aborted_errors = 0;
1506 stats->tx_carrier_errors = 0;
1507 stats->tx_fifo_errors = 0;
1508 stats->tx_heartbeat_errors = 0;
1509 stats->tx_window_errors = 0;
1511 stats->rx_dropped += dev_stats.tx_dropped;
1512 stats->tx_dropped += dev_stats.rx_dropped;
1514 stats->rx_errors += dev_stats.tx_errors;
1515 stats->tx_errors += dev_stats.rx_errors;
1517 stats->multicast += dev_stats.multicast;
1518 stats->collisions += dev_stats.collisions;
/* Retrieves statistics for an internal device from the vport layer only.
 * Returns the error recorded by get_stats_via_vport() in the device's
 * 'vport_stats_error' field. */
1524 netdev_internal_get_stats(const struct netdev *netdev_,
1525 struct netdev_stats *stats)
1527 struct netdev_dev_linux *netdev_dev =
1528 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1530 get_stats_via_vport(netdev_, stats);
1531 return netdev_dev->vport_stats_error;
/* Pushes the packet, byte, error and drop counters from '*stats' to the vport
 * named after 'netdev' using an OVS_VPORT_CMD_SET transaction.  The other
 * fields of '*stats' are not sent. */
1535 netdev_internal_set_stats(struct netdev *netdev,
1536 const struct netdev_stats *stats)
1538 struct ovs_vport_stats vport_stats;
1539 struct dpif_linux_vport vport;
1542 vport_stats.rx_packets = stats->rx_packets;
1543 vport_stats.tx_packets = stats->tx_packets;
1544 vport_stats.rx_bytes = stats->rx_bytes;
1545 vport_stats.tx_bytes = stats->tx_bytes;
1546 vport_stats.rx_errors = stats->rx_errors;
1547 vport_stats.tx_errors = stats->tx_errors;
1548 vport_stats.rx_dropped = stats->rx_dropped;
1549 vport_stats.tx_dropped = stats->tx_dropped;
1551 dpif_linux_vport_init(&vport);
1552 vport.cmd = OVS_VPORT_CMD_SET;
1553 vport.name = netdev_get_name(netdev);
1554 vport.stats = &vport_stats;
1556 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1558 /* If the vport layer doesn't know about the device, that doesn't mean it
1559 * doesn't exist (after all, we were able to open it when netdev_open() was
1560 * called), it just means that it isn't attached and we'll be getting
1561 * stats a different way. */
1562 if (err == ENODEV) {
/* Refreshes the cached feature bitmaps of 'netdev_dev' ('supported',
 * 'advertised', 'current' and 'peer') from an ETHTOOL_GSET query, translating
 * ethtool's SUPPORTED_* / ADVERTISED_* bits, speed, duplex and port type into
 * NETDEV_F_* bits.
 *
 * Nothing is queried when VALID_FEATURES is already set.  The ethtool result
 * is stored in 'get_features_error' and VALID_FEATURES is set afterwards. */
1570 netdev_linux_read_features(struct netdev_dev_linux *netdev_dev)
1572 struct ethtool_cmd ecmd;
1576 if (netdev_dev->cache_valid & VALID_FEATURES) {
1580 COVERAGE_INC(netdev_get_ethtool);
1581 memset(&ecmd, 0, sizeof ecmd);
1582 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name, &ecmd,
1583 ETHTOOL_GSET, "ETHTOOL_GSET");
1588 /* Supported features. */
1589 netdev_dev->supported = 0;
1590 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1591 netdev_dev->supported |= NETDEV_F_10MB_HD;
1593 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1594 netdev_dev->supported |= NETDEV_F_10MB_FD;
1596 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1597 netdev_dev->supported |= NETDEV_F_100MB_HD;
1599 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1600 netdev_dev->supported |= NETDEV_F_100MB_FD;
1602 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1603 netdev_dev->supported |= NETDEV_F_1GB_HD;
1605 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1606 netdev_dev->supported |= NETDEV_F_1GB_FD;
1608 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1609 netdev_dev->supported |= NETDEV_F_10GB_FD;
1611 if (ecmd.supported & SUPPORTED_TP) {
1612 netdev_dev->supported |= NETDEV_F_COPPER;
1614 if (ecmd.supported & SUPPORTED_FIBRE) {
1615 netdev_dev->supported |= NETDEV_F_FIBER;
1617 if (ecmd.supported & SUPPORTED_Autoneg) {
1618 netdev_dev->supported |= NETDEV_F_AUTONEG;
1620 if (ecmd.supported & SUPPORTED_Pause) {
1621 netdev_dev->supported |= NETDEV_F_PAUSE;
1623 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1624 netdev_dev->supported |= NETDEV_F_PAUSE_ASYM;
1627 /* Advertised features. */
1628 netdev_dev->advertised = 0;
1629 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1630 netdev_dev->advertised |= NETDEV_F_10MB_HD;
1632 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1633 netdev_dev->advertised |= NETDEV_F_10MB_FD;
1635 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1636 netdev_dev->advertised |= NETDEV_F_100MB_HD;
1638 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1639 netdev_dev->advertised |= NETDEV_F_100MB_FD;
1641 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1642 netdev_dev->advertised |= NETDEV_F_1GB_HD;
1644 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1645 netdev_dev->advertised |= NETDEV_F_1GB_FD;
1647 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1648 netdev_dev->advertised |= NETDEV_F_10GB_FD;
1650 if (ecmd.advertising & ADVERTISED_TP) {
1651 netdev_dev->advertised |= NETDEV_F_COPPER;
1653 if (ecmd.advertising & ADVERTISED_FIBRE) {
1654 netdev_dev->advertised |= NETDEV_F_FIBER;
1656 if (ecmd.advertising & ADVERTISED_Autoneg) {
1657 netdev_dev->advertised |= NETDEV_F_AUTONEG;
1659 if (ecmd.advertising & ADVERTISED_Pause) {
1660 netdev_dev->advertised |= NETDEV_F_PAUSE;
1662 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1663 netdev_dev->advertised |= NETDEV_F_PAUSE_ASYM;
1666 /* Current settings. */
/* 10/100/1000 map to a half- or full-duplex bit depending on 'ecmd.duplex';
 * 10G and the numeric speeds above it map to full-duplex bits only; any
 * other speed yields 0. */
1668 if (speed == SPEED_10) {
1669 netdev_dev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1670 } else if (speed == SPEED_100) {
1671 netdev_dev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1672 } else if (speed == SPEED_1000) {
1673 netdev_dev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1674 } else if (speed == SPEED_10000) {
1675 netdev_dev->current = NETDEV_F_10GB_FD;
1676 } else if (speed == 40000) {
1677 netdev_dev->current = NETDEV_F_40GB_FD;
1678 } else if (speed == 100000) {
1679 netdev_dev->current = NETDEV_F_100GB_FD;
1680 } else if (speed == 1000000) {
1681 netdev_dev->current = NETDEV_F_1TB_FD;
1683 netdev_dev->current = 0;
1686 if (ecmd.port == PORT_TP) {
1687 netdev_dev->current |= NETDEV_F_COPPER;
1688 } else if (ecmd.port == PORT_FIBRE) {
1689 netdev_dev->current |= NETDEV_F_FIBER;
1693 netdev_dev->current |= NETDEV_F_AUTONEG;
1696 /* Peer advertisements. */
1697 netdev_dev->peer = 0; /* XXX */
1700 netdev_dev->cache_valid |= VALID_FEATURES;
1701 netdev_dev->get_features_error = error;
1704 /* Stores the features supported by 'netdev' into each of '*current',
1705 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1706 * bitmap of NETDEV_F_* bits. Returns 0 if successful, otherwise a positive
1709 netdev_linux_get_features(const struct netdev *netdev_,
1710 enum netdev_features *current,
1711 enum netdev_features *advertised,
1712 enum netdev_features *supported,
1713 enum netdev_features *peer)
1715 struct netdev_dev_linux *netdev_dev =
1716 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Make sure the cached bitmaps and 'get_features_error' are populated. */
1718 netdev_linux_read_features(netdev_dev);
/* The outputs are written only on success.
 * NOTE(review): all four pointers are dereferenced without a null check
 * here, so this function itself requires them to be non-null -- confirm that
 * every caller guarantees this. */
1720 if (!netdev_dev->get_features_error) {
1721 *current = netdev_dev->current;
1722 *advertised = netdev_dev->advertised;
1723 *supported = netdev_dev->supported;
1724 *peer = netdev_dev->peer;
1726 return netdev_dev->get_features_error;
1729 /* Set the features advertised by 'netdev' to 'advertise'.
 *
 * The current ethtool settings are read with ETHTOOL_GSET, the 'advertising'
 * mask is rebuilt from the NETDEV_F_* bits in 'advertise', and the result is
 * written back with ETHTOOL_SSET, whose error code is returned. */
1731 netdev_linux_set_advertisements(struct netdev *netdev,
1732 enum netdev_features advertise)
1734 struct ethtool_cmd ecmd;
1737 COVERAGE_INC(netdev_get_ethtool);
1738 memset(&ecmd, 0, sizeof ecmd);
1739 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1740 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Start from an empty mask so that only the requested bits end up set. */
1745 ecmd.advertising = 0;
1746 if (advertise & NETDEV_F_10MB_HD) {
1747 ecmd.advertising |= ADVERTISED_10baseT_Half;
1749 if (advertise & NETDEV_F_10MB_FD) {
1750 ecmd.advertising |= ADVERTISED_10baseT_Full;
1752 if (advertise & NETDEV_F_100MB_HD) {
1753 ecmd.advertising |= ADVERTISED_100baseT_Half;
1755 if (advertise & NETDEV_F_100MB_FD) {
1756 ecmd.advertising |= ADVERTISED_100baseT_Full;
1758 if (advertise & NETDEV_F_1GB_HD) {
1759 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1761 if (advertise & NETDEV_F_1GB_FD) {
1762 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1764 if (advertise & NETDEV_F_10GB_FD) {
1765 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1767 if (advertise & NETDEV_F_COPPER) {
1768 ecmd.advertising |= ADVERTISED_TP;
1770 if (advertise & NETDEV_F_FIBER) {
1771 ecmd.advertising |= ADVERTISED_FIBRE;
1773 if (advertise & NETDEV_F_AUTONEG) {
1774 ecmd.advertising |= ADVERTISED_Autoneg;
1776 if (advertise & NETDEV_F_PAUSE) {
1777 ecmd.advertising |= ADVERTISED_Pause;
1779 if (advertise & NETDEV_F_PAUSE_ASYM) {
1780 ecmd.advertising |= ADVERTISED_Asym_Pause;
1782 COVERAGE_INC(netdev_set_ethtool);
1783 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1784 ETHTOOL_SSET, "ETHTOOL_SSET");
1787 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1788 * successful, otherwise a positive errno value.
 *
 * 'kbits_rate' and 'kbits_burst' are the requested rate and burst size.  The
 * last applied values and the last result are cached per device
 * (VALID_POLICING) so that repeated identical requests do no work. */
1790 netdev_linux_set_policing(struct netdev *netdev,
1791 uint32_t kbits_rate, uint32_t kbits_burst)
1793 struct netdev_dev_linux *netdev_dev =
1794 netdev_dev_linux_cast(netdev_get_dev(netdev));
1795 const char *netdev_name = netdev_get_name(netdev);
1799 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1800 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1801 : kbits_burst); /* Stick with user-specified value. */
/* A cached error is returned as-is, even when the requested values differ
 * from the ones that produced it. */
1803 if (netdev_dev->cache_valid & VALID_POLICING) {
1804 if (netdev_dev->netdev_policing_error) {
1805 return netdev_dev->netdev_policing_error;
1808 if (netdev_dev->kbits_rate == kbits_rate &&
1809 netdev_dev->kbits_burst == kbits_burst) {
1810 /* Assume that settings haven't changed since we last set them. */
1813 netdev_dev->cache_valid &= ~VALID_POLICING;
1816 COVERAGE_INC(netdev_set_policing);
1817 /* Remove any existing ingress qdisc. */
1818 error = tc_add_del_ingress_qdisc(netdev, false);
1820 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1821 netdev_name, strerror(error));
/* Re-create the ingress qdisc and attach a policer with the requested rate
 * and burst. */
1826 error = tc_add_del_ingress_qdisc(netdev, true);
1828 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1829 netdev_name, strerror(error));
1833 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1835 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1836 netdev_name, strerror(error));
1841 netdev_dev->kbits_rate = kbits_rate;
1842 netdev_dev->kbits_burst = kbits_burst;
/* Only success and ENODEV are cached; other errors leave VALID_POLICING
 * cleared so that the next call tries again. */
1845 if (!error || error == ENODEV) {
1846 netdev_dev->netdev_policing_error = error;
1847 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the NULL-terminated 'tcs'
 * table that has a 'tc_install' callback and a non-empty 'ovs_name'. */
1853 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1856 const struct tc_ops *const *opsp;
1858 for (opsp = tcs; *opsp != NULL; opsp++) {
1859 const struct tc_ops *ops = *opsp;
1860 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1861 sset_add(types, ops->ovs_name);
/* Searches the NULL-terminated 'tcs' table for the entry whose 'ovs_name'
 * equals 'name'.  (The return statements are not visible in this extract;
 * presumably the matching entry, or NULL when there is none.) */
1867 static const struct tc_ops *
1868 tc_lookup_ovs_name(const char *name)
1870 const struct tc_ops *const *opsp;
1872 for (opsp = tcs; *opsp != NULL; opsp++) {
1873 const struct tc_ops *ops = *opsp;
1874 if (!strcmp(name, ops->ovs_name)) {
/* Searches the NULL-terminated 'tcs' table for the entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' never match.  (The return
 * statements are not visible in this extract.) */
1881 static const struct tc_ops *
1882 tc_lookup_linux_name(const char *name)
1884 const struct tc_ops *const *opsp;
1886 for (opsp = tcs; *opsp != NULL; opsp++) {
1887 const struct tc_ops *ops = *opsp;
1888 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks for the queue with id 'queue_id' in the 'queues' hash map of the
 * traffic-control state of 'netdev', scanning only the bucket selected by
 * 'hash'.  (The return statements are not visible in this extract.) */
1895 static struct tc_queue *
1896 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1899 struct netdev_dev_linux *netdev_dev =
1900 netdev_dev_linux_cast(netdev_get_dev(netdev));
1901 struct tc_queue *queue;
1903 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1904 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the hash from
 * 'queue_id' with hash_int(queue_id, 0). */
1911 static struct tc_queue *
1912 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1914 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the QoS discipline named 'type' and stores its queue count in
 * 'caps->n_queues'.  (Handling of an unknown 'type' and the return value are
 * not visible in this extract.) */
1918 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1920 struct netdev_qos_capabilities *caps)
1922 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1926 caps->n_queues = ops->n_queues;
/* Queries the qdisc of 'netdev', stores the OVS name of its discipline in
 * '*typep' and, when the discipline provides a 'qdisc_get' callback, lets
 * that callback fill in 'details'. */
1931 netdev_linux_get_qos(const struct netdev *netdev,
1932 const char **typep, struct smap *details)
1934 struct netdev_dev_linux *netdev_dev =
1935 netdev_dev_linux_cast(netdev_get_dev(netdev));
1938 error = tc_query_qdisc(netdev);
1943 *typep = netdev_dev->tc->ops->ovs_name;
1944 return (netdev_dev->tc->ops->qdisc_get
1945 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures the QoS discipline 'type' with parameters 'details' on
 * 'netdev'.
 *
 * If the requested discipline is the one already installed, only its
 * parameters are updated through 'qdisc_set' (a discipline without that
 * callback yields 0).  Otherwise the existing qdisc is deleted and the new
 * one installed through 'tc_install'. */
1950 netdev_linux_set_qos(struct netdev *netdev,
1951 const char *type, const struct smap *details)
1953 struct netdev_dev_linux *netdev_dev =
1954 netdev_dev_linux_cast(netdev_get_dev(netdev));
1955 const struct tc_ops *new_ops;
/* Unknown disciplines and ones that cannot be installed are singled out
 * here (the branch body is not visible in this extract). */
1958 new_ops = tc_lookup_ovs_name(type);
1959 if (!new_ops || !new_ops->tc_install) {
1963 error = tc_query_qdisc(netdev);
1968 if (new_ops == netdev_dev->tc->ops) {
1969 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1971 /* Delete existing qdisc. */
1972 error = tc_del_qdisc(netdev);
1976 ovs_assert(netdev_dev->tc == NULL);
1978 /* Install new qdisc. */
1979 error = new_ops->tc_install(netdev, details);
/* Invariant: 'tc' is set exactly when installation succeeded. */
1980 ovs_assert((error == 0) == (netdev_dev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev' into 'details'
 * by looking the queue up and calling the discipline's 'class_get' callback.
 * (The condition guarding the call and the fallback value are not visible in
 * this extract.) */
1987 netdev_linux_get_queue(const struct netdev *netdev,
1988 unsigned int queue_id, struct smap *details)
1990 struct netdev_dev_linux *netdev_dev =
1991 netdev_dev_linux_cast(netdev_get_dev(netdev));
1994 error = tc_query_qdisc(netdev);
1998 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2000 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' from 'details' through the
 * discipline's 'class_set' callback.  Queue ids at or above the discipline's
 * 'n_queues', and disciplines without 'class_set', are handled by a separate
 * branch (its body is not visible in this extract). */
2006 netdev_linux_set_queue(struct netdev *netdev,
2007 unsigned int queue_id, const struct smap *details)
2009 struct netdev_dev_linux *netdev_dev =
2010 netdev_dev_linux_cast(netdev_get_dev(netdev));
2013 error = tc_query_qdisc(netdev);
2016 } else if (queue_id >= netdev_dev->tc->ops->n_queues
2017 || !netdev_dev->tc->ops->class_set) {
2021 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev' through the discipline's
 * 'class_delete' callback, when the discipline provides one.  (The condition
 * guarding the call and the fallback value are not visible in this
 * extract.) */
2025 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
2027 struct netdev_dev_linux *netdev_dev =
2028 netdev_dev_linux_cast(netdev_get_dev(netdev));
2031 error = tc_query_qdisc(netdev);
2034 } else if (!netdev_dev->tc->ops->class_delete) {
2037 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2039 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev' into '*stats'
 * through the discipline's 'class_get_stats' callback, when the discipline
 * provides one.  (The condition guarding the call and the fallback value are
 * not visible in this extract.) */
2045 netdev_linux_get_queue_stats(const struct netdev *netdev,
2046 unsigned int queue_id,
2047 struct netdev_queue_stats *stats)
2049 struct netdev_dev_linux *netdev_dev =
2050 netdev_dev_linux_cast(netdev_get_dev(netdev));
2053 error = tc_query_qdisc(netdev);
2056 } else if (!netdev_dev->tc->ops->class_get_stats) {
2059 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2061 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a netlink dump of the traffic-control classes of 'netdev' in
 * '*dump': builds an RTM_GETTCLASS request with 'tcm_parent' 0, hands it to
 * nl_dump_start() on 'rtnl_sock', then releases the request buffer. */
2067 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2069 struct ofpbuf request;
2070 struct tcmsg *tcmsg;
2072 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2076 tcmsg->tcm_parent = 0;
2077 nl_dump_start(dump, rtnl_sock, &request);
2078 ofpbuf_uninit(&request);
/* Walks every queue known for 'netdev', fetches its configuration with the
 * discipline's 'class_get' callback and passes the queue id, the details and
 * 'aux' to 'cb'. */
2083 netdev_linux_dump_queues(const struct netdev *netdev,
2084 netdev_dump_queues_cb *cb, void *aux)
2086 struct netdev_dev_linux *netdev_dev =
2087 netdev_dev_linux_cast(netdev_get_dev(netdev));
2088 struct tc_queue *queue, *next_queue;
2089 struct smap details;
2093 error = tc_query_qdisc(netdev);
2096 } else if (!netdev_dev->tc->ops->class_get) {
2101 smap_init(&details);
/* The "safe" iteration form is used (presumably so that 'cb' may remove the
 * current queue -- confirm). */
2102 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2103 &netdev_dev->tc->queues) {
/* 'details' is reused across iterations, so clear it for each queue. */
2104 smap_clear(&details);
2106 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
2108 (*cb)(queue->queue_id, &details, aux);
2113 smap_destroy(&details);
/* Dumps the traffic-control classes of 'netdev' over netlink and feeds each
 * reply, together with 'cb' and 'aux', to the discipline's
 * 'class_dump_stats' callback.
 *
 * An error from finishing the dump takes precedence in the return value;
 * otherwise 'last_error' is returned. */
2119 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2120 netdev_dump_queue_stats_cb *cb, void *aux)
2122 struct netdev_dev_linux *netdev_dev =
2123 netdev_dev_linux_cast(netdev_get_dev(netdev));
2124 struct nl_dump dump;
2129 error = tc_query_qdisc(netdev);
2132 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2137 if (!start_queue_dump(netdev, &dump)) {
2140 while (nl_dump_next(&dump, &msg)) {
2141 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2147 error = nl_dump_done(&dump);
2148 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * The values are read with SIOCGIFADDR and SIOCGIFNETMASK on first use and
 * cached in the device (VALID_IN4).  Returns EADDRNOTAVAIL when the address
 * is INADDR_ANY, otherwise 0.  (The error paths of the two ioctls are not
 * visible in this extract.) */
2152 netdev_linux_get_in4(const struct netdev *netdev_,
2153 struct in_addr *address, struct in_addr *netmask)
2155 struct netdev_dev_linux *netdev_dev =
2156 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2158 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2161 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2162 SIOCGIFADDR, "SIOCGIFADDR");
2167 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2168 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2173 netdev_dev->cache_valid |= VALID_IN4;
2175 *address = netdev_dev->address;
2176 *netmask = netdev_dev->netmask;
2177 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_' using SIOCSIFADDR and,
 * unless 'address' is INADDR_ANY, SIOCSIFNETMASK. */
2181 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2182 struct in_addr netmask)
2184 struct netdev_dev_linux *netdev_dev =
2185 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2188 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
/* NOTE(review): the cache is marked valid and both values are stored before
 * SIOCSIFNETMASK is attempted.  If that second ioctl fails, the cached
 * netmask may not match what the kernel holds -- confirm whether the cache
 * should be invalidated on that path. */
2190 netdev_dev->cache_valid |= VALID_IN4;
2191 netdev_dev->address = address;
2192 netdev_dev->netmask = netmask;
2193 if (address.s_addr != INADDR_ANY) {
2194 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2195 "SIOCSIFNETMASK", netmask);
/* Parses one 'line' in the format the caller reads from /proc/net/if_inet6:
 * 32 hex digits of IPv6 address, four hex fields that are skipped, and an
 * interface name of at most 16 characters.  The address bytes go to
 * 'in6->s6_addr' and the name to 'ifname'. */
2202 parse_if_inet6_line(const char *line,
2203 struct in6_addr *in6, char ifname[16 + 1])
2205 uint8_t *s6 = in6->s6_addr;
/* X8 scans exactly two hex digits into one uint8_t. */
2206 #define X8 "%2"SCNx8
2208 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2209 "%*x %*x %*x %*x %16s\n",
2210 &s6[0], &s6[1], &s6[2], &s6[3],
2211 &s6[4], &s6[5], &s6[6], &s6[7],
2212 &s6[8], &s6[9], &s6[10], &s6[11],
2213 &s6[12], &s6[13], &s6[14], &s6[15],
2217 /* Stores the IPv6 address of 'netdev' in '*in6'.  The address is taken from
2218 * the first /proc/net/if_inet6 line whose interface name matches; when no
 * line matches, or the file cannot be opened, in6addr_any is stored instead.
 * The result is cached in the device (VALID_IN6).
 *
 * NOTE(review): '*in6' is written unconditionally, so 'in6' must be
 * non-null.  The return statement is not visible in this extract. */
2220 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2222 struct netdev_dev_linux *netdev_dev =
2223 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2224 if (!(netdev_dev->cache_valid & VALID_IN6)) {
/* Default when no address is found. */
2228 netdev_dev->in6 = in6addr_any;
2230 file = fopen("/proc/net/if_inet6", "r");
2232 const char *name = netdev_get_name(netdev_);
2233 while (fgets(line, sizeof line, file)) {
2234 struct in6_addr in6_tmp;
2235 char ifname[16 + 1];
2236 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2237 && !strcmp(name, ifname))
2239 netdev_dev->in6 = in6_tmp;
2245 netdev_dev->cache_valid |= VALID_IN6;
2247 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr'.  The address is
 * built in a zeroed 'struct sockaddr_in' and then copied over a zeroed
 * '*sa'. */
2252 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2254 struct sockaddr_in sin;
2255 memset(&sin, 0, sizeof sin);
2256 sin.sin_family = AF_INET;
2257 sin.sin_addr = addr;
2260 memset(sa, 0, sizeof *sa);
2261 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl 'ioctl_nr' on 'netdev' with 'addr' as its
 * IPv4 argument.  The interface name and the address are placed in an ifreq
 * that is passed to netdev_linux_do_ioctl(). */
2265 do_set_addr(struct netdev *netdev,
2266 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2269 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2270 make_in4_sockaddr(&ifr.ifr_addr, addr);
2272 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2276 /* Adds 'router' as a default IP gateway.
 *
 * The route is installed with SIOCADDRT on 'af_inet_sock'; destination and
 * mask are both INADDR_ANY.  'netdev' is unused. */
2278 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2280 struct in_addr any = { INADDR_ANY };
2284 memset(&rt, 0, sizeof rt);
2285 make_in4_sockaddr(&rt.rt_dst, any);
2286 make_in4_sockaddr(&rt.rt_gateway, router);
2287 make_in4_sockaddr(&rt.rt_genmask, any);
2288 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2289 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2291 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Finds how to reach 'host' by scanning /proc/net/route line by line.
 *
 * For a route that is up and whose masked destination matches 'host',
 * '*next_hop' receives either 0 (host directly reachable) or the gateway
 * address, and '*netdev_name' receives a copy of the interface name made
 * with xstrdup().  '*netdev_name' is set to NULL on entry. */
2297 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2300 static const char fn[] = "/proc/net/route";
2305 *netdev_name = NULL;
2306 stream = fopen(fn, "r");
2307 if (stream == NULL) {
2308 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2313 while (fgets(line, sizeof line, stream)) {
2316 ovs_be32 dest, gateway, mask;
2317 int refcnt, metric, mtu;
2318 unsigned int flags, use, window, irtt;
2321 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2323 iface, &dest, &gateway, &flags, &refcnt,
2324 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2326 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2330 if (!(flags & RTF_UP)) {
2331 /* Skip routes that aren't up. */
2335 /* The values of 'dest', 'mask', and 'gateway' were given in
2336 * network byte order, so we don't need any endian
2337 * conversions here. */
2338 if ((dest & mask) == (host->s_addr & mask)) {
2340 /* The host is directly reachable. */
2341 next_hop->s_addr = 0;
2343 /* To reach the host, we must go through a gateway. */
2344 next_hop->s_addr = gateway;
2346 *netdev_name = xstrdup(iface);
/* Adds the driver name, driver version and firmware version of 'netdev' to
 * 'smap' under the keys "driver_name", "driver_version" and
 * "firmware_version".
 *
 * The information comes from an ETHTOOL_GDRVINFO query whose result is cached
 * in the device (VALID_DRVINFO). */
2358 netdev_linux_get_status(const struct netdev *netdev, struct smap *smap)
2360 struct netdev_dev_linux *netdev_dev;
2363 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2364 if (!(netdev_dev->cache_valid & VALID_DRVINFO)) {
/* netdev_linux_do_ethtool() takes a 'struct ethtool_cmd *', so the drvinfo
 * buffer is passed through a cast. */
2365 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev_dev->drvinfo;
2367 COVERAGE_INC(netdev_get_ethtool);
2368 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
2369 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
2372 "ETHTOOL_GDRVINFO");
2374 netdev_dev->cache_valid |= VALID_DRVINFO;
2379 smap_add(smap, "driver_name", netdev_dev->drvinfo.driver);
2380 smap_add(smap, "driver_version", netdev_dev->drvinfo.version);
2381 smap_add(smap, "firmware_version", netdev_dev->drvinfo.fw_version);
/* Reports the fixed driver name "openvswitch" in 'smap' under the key
 * "driver_name"; 'netdev' is unused. */
2387 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2390 smap_add(smap, "driver_name", "openvswitch");
2394 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2395 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2396 * returns 0. Otherwise, it returns a positive errno value; in particular,
2397 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2399 netdev_linux_arp_lookup(const struct netdev *netdev,
2400 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2403 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address 'ip', hardware type Ethernet,
 * restricted to the device named after 'netdev'. */
2406 memset(&r, 0, sizeof r);
2407 memset(&sin, 0, sizeof sin);
2408 sin.sin_family = AF_INET;
2409 sin.sin_addr.s_addr = ip;
2411 memcpy(&r.arp_pa, &sin, sizeof sin);
2412 r.arp_ha.sa_family = ARPHRD_ETHER;
2414 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2415 COVERAGE_INC(netdev_arp_lookup);
2416 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2418 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO ("no entry") is an expected outcome and is not logged. */
2419 } else if (retval != ENXIO) {
2420 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2421 netdev_get_name(netdev), IP_ARGS(ip), strerror(retval));
/* Translates the NETDEV_UP and NETDEV_PROMISC bits of 'nd' into interface
 * flags (presumably IFF_UP and IFF_PROMISC -- the assignments are not visible
 * in this extract). */
2427 nd_to_iff_flags(enum netdev_flags nd)
2430 if (nd & NETDEV_UP) {
2433 if (nd & NETDEV_PROMISC) {
/* Translates interface flags 'iff' into 'enum netdev_flags' bits; the visible
 * part maps IFF_PROMISC to NETDEV_PROMISC.  (Other mappings are not visible
 * in this extract.) */
2440 iff_to_nd_flags(int iff)
2442 enum netdev_flags nd = 0;
2446 if (iff & IFF_PROMISC) {
2447 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' on 'netdev', storing
 * the previous flags (as netdev flags) in '*old_flagsp'.
 *
 * The previous state is taken from the cached 'ifi_flags'.  The kernel is
 * updated only when the computed flags differ from it, after which the cached
 * flags are re-read with get_flags(). */
2453 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2454 enum netdev_flags on, enum netdev_flags *old_flagsp)
2456 struct netdev_dev_linux *netdev_dev;
2457 int old_flags, new_flags;
2460 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2461 old_flags = netdev_dev->ifi_flags;
2462 *old_flagsp = iff_to_nd_flags(old_flags);
2463 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2464 if (new_flags != old_flags) {
2465 error = set_flags(netdev, new_flags);
2466 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
/* Returns the 'change_seq' value stored in the device behind 'netdev'. */
2472 netdev_linux_change_seq(const struct netdev *netdev)
2474 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to the initializer of a 'struct netdev_class' whose entries are the
 * netdev_linux_* functions listed below.  The macro parameters (NAME, CREATE,
 * GET_STATS, SET_STATS, GET_FEATURES, GET_STATUS) supply the members that
 * differ between the classes defined after the macro.
 *
 * NOTE(review): the initializer is positional, so the order of the entries
 * must match the member order of 'struct netdev_class'. */
2477 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2478 GET_FEATURES, GET_STATUS) \
2482 netdev_linux_init, \
2484 netdev_linux_wait, \
2487 netdev_linux_destroy, \
2488 NULL, /* get_config */ \
2489 NULL, /* set_config */ \
2490 NULL, /* get_tunnel_config */ \
2492 netdev_linux_open, \
2493 netdev_linux_close, \
2495 netdev_linux_listen, \
2496 netdev_linux_recv, \
2497 netdev_linux_recv_wait, \
2498 netdev_linux_drain, \
2500 netdev_linux_send, \
2501 netdev_linux_send_wait, \
2503 netdev_linux_set_etheraddr, \
2504 netdev_linux_get_etheraddr, \
2505 netdev_linux_get_mtu, \
2506 netdev_linux_set_mtu, \
2507 netdev_linux_get_ifindex, \
2508 netdev_linux_get_carrier, \
2509 netdev_linux_get_carrier_resets, \
2510 netdev_linux_set_miimon_interval, \
2515 netdev_linux_set_advertisements, \
2517 netdev_linux_set_policing, \
2518 netdev_linux_get_qos_types, \
2519 netdev_linux_get_qos_capabilities, \
2520 netdev_linux_get_qos, \
2521 netdev_linux_set_qos, \
2522 netdev_linux_get_queue, \
2523 netdev_linux_set_queue, \
2524 netdev_linux_delete_queue, \
2525 netdev_linux_get_queue_stats, \
2526 netdev_linux_dump_queues, \
2527 netdev_linux_dump_queue_stats, \
2529 netdev_linux_get_in4, \
2530 netdev_linux_set_in4, \
2531 netdev_linux_get_in6, \
2532 netdev_linux_add_router, \
2533 netdev_linux_get_next_hop, \
2535 netdev_linux_arp_lookup, \
2537 netdev_linux_update_flags, \
2539 netdev_linux_change_seq \
/* Netdev class whose per-class hooks are netdev_linux_create,
 * netdev_linux_get_stats, netdev_linux_get_features and
 * netdev_linux_get_status; it provides no set_stats hook.
 * NOTE(review): the class name string is on a line not visible here. */
2542 const struct netdev_class netdev_linux_class =
2545 netdev_linux_create,
2546 netdev_linux_get_stats,
2547 NULL, /* set_stats */
2548 netdev_linux_get_features,
2549 netdev_linux_get_status);
/* Netdev class for tap devices: created with netdev_linux_create_tap and
 * using netdev_tap_get_stats for statistics; features and status use the
 * same hooks as netdev_linux_class.  It provides no set_stats hook. */
2551 const struct netdev_class netdev_tap_class =
2554 netdev_linux_create_tap,
2555 netdev_tap_get_stats,
2556 NULL, /* set_stats */
2557 netdev_linux_get_features,
2558 netdev_linux_get_status);
/* Netdev class for internal devices: the only class here that provides a
 * set_stats hook (netdev_internal_set_stats), and it provides no
 * get_features hook. */
2560 const struct netdev_class netdev_internal_class =
2563 netdev_linux_create,
2564 netdev_internal_get_stats,
2565 netdev_internal_set_stats,
2566 NULL, /* get_features */
2567 netdev_internal_get_status);
2569 /* HTB traffic control class. */
/* HTB_N_QUEUES bounds the class minor numbers that are accepted as queues
 * (queue ID = minor - 1) and is the 'n_queues' value in tc_ops_htb. */
2571 #define HTB_N_QUEUES 0xf000
2575 unsigned int max_rate; /* In bytes/s. */
/* Per-queue HTB settings.  'tc_queue' is the embedded member from which
 * htb_class_cast__() recovers the enclosing structure. */
2579 struct tc_queue tc_queue;
2580 unsigned int min_rate; /* In bytes/s. */
2581 unsigned int max_rate; /* In bytes/s. */
2582 unsigned int burst; /* In bytes. */
2583 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds the tc currently attached to
 * 'netdev''s device.  No check is made here that the attached tc really is an
 * HTB one, so the caller must already know that. */
2587 htb_get__(const struct netdev *netdev)
2589 struct netdev_dev_linux *netdev_dev =
2590 netdev_dev_linux_cast(netdev_get_dev(netdev));
2591 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a new 'struct htb', initializes its tc with tc_ops_htb, records
 * 'max_rate' and makes it the tc of 'netdev''s device.  The visible code does
 * not free whatever 'netdev_dev->tc' pointed to before.
 * NOTE(review): 'max_rate' arrives as uint64_t; the field it is stored in
 * appears to be the 'unsigned int max_rate' declared above -- confirm whether
 * narrowing is intended. */
2595 htb_install__(struct netdev *netdev, uint64_t max_rate)
2597 struct netdev_dev_linux *netdev_dev =
2598 netdev_dev_linux_cast(netdev_get_dev(netdev));
2601 htb = xmalloc(sizeof *htb);
2602 tc_init(&htb->tc, &tc_ops_htb);
2603 htb->max_rate = max_rate;
2605 netdev_dev->tc = &htb->tc;
2608 /* Create an HTB qdisc.
2610 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2612 htb_setup_qdisc__(struct netdev *netdev)
2615 struct tc_htb_glob opt;
2616 struct ofpbuf request;
2617 struct tcmsg *tcmsg;
/* tc_del_qdisc() is called first; its return value is not checked here. */
2619 tc_del_qdisc(netdev);
2621 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2622 NLM_F_EXCL | NLM_F_CREATE, &request);
/* The new qdisc gets handle 1:0 and is attached at the root. */
2626 tcmsg->tcm_handle = tc_make_handle(1, 0);
2627 tcmsg->tcm_parent = TC_H_ROOT;
2629 nl_msg_put_string(&request, TCA_KIND, "htb");
2631 memset(&opt, 0, sizeof opt);
2632 opt.rate2quantum = 10;
/* The HTB global options travel as TCA_HTB_INIT nested inside TCA_OPTIONS. */
2636 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2637 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2638 nl_msg_end_nested(&request, opt_offset);
2640 return tc_transact(&request, NULL);
2643 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2644 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2646 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2647 unsigned int parent, struct htb_class *class)
2650 struct tc_htb_opt opt;
2651 struct ofpbuf request;
2652 struct tcmsg *tcmsg;
/* The device MTU is needed by tc_fill_rate() and tc_calc_buffer() below. */
2656 error = netdev_get_mtu(netdev, &mtu);
2658 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2659 netdev_get_name(netdev));
2663 memset(&opt, 0, sizeof opt);
2664 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2665 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
/* Buffers are sized from the rates as stored by tc_fill_rate(), both using
 * the same 'class->burst'. */
2666 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2667 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2668 opt.prio = class->priority;
2670 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2674 tcmsg->tcm_handle = handle;
2675 tcmsg->tcm_parent = parent;
/* TCA_OPTIONS nests the HTB parameters followed by the rate tables for
 * 'rate' and 'ceil'. */
2677 nl_msg_put_string(&request, TCA_KIND, "htb");
2678 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2679 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2680 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2681 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2682 nl_msg_end_nested(&request, opt_offset);
2684 error = tc_transact(&request, NULL);
2686 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2687 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2688 netdev_get_name(netdev),
2689 tc_get_major(handle), tc_get_minor(handle),
2690 tc_get_major(parent), tc_get_minor(parent),
2691 class->min_rate, class->max_rate,
2692 class->burst, class->priority, strerror(error));
2697 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2698 * description of them into 'class'. The description complies with the
2699 * specification given in the vswitch database documentation for linux-htb
 * queues.
 *
 * A TCA_HTB_PARMS attribute of at least sizeof(struct tc_htb_opt) bytes is
 * mandatory.  Rates are copied as-is and the burst is converted from ticks
 * to bytes at the class's 'rate'. */
2702 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2704 static const struct nl_policy tca_htb_policy[] = {
2705 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2706 .min_len = sizeof(struct tc_htb_opt) },
2709 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2710 const struct tc_htb_opt *htb;
2712 if (!nl_parse_nested(nl_options, tca_htb_policy,
2713 attrs, ARRAY_SIZE(tca_htb_policy))) {
2714 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2718 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2719 class->min_rate = htb->rate.rate;
2720 class->max_rate = htb->ceil.rate;
2721 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2722 class->priority = htb->prio;
/* Parses the class message 'tcmsg' with tc_parse_class(), passing 'stats'
 * through to it.
 *
 * If 'queue_id' is nonnull, a handle of the form 1:N with
 * 1 <= N <= HTB_N_QUEUES yields queue ID N - 1; what happens for other
 * handles is on lines not visible in this excerpt.  If 'options' is nonnull,
 * the HTB options are parsed into it. */
2727 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2728 struct htb_class *options,
2729 struct netdev_queue_stats *stats)
2731 struct nlattr *nl_options;
2732 unsigned int handle;
2735 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2736 if (!error && queue_id) {
2737 unsigned int major = tc_get_major(handle);
2738 unsigned int minor = tc_get_minor(handle);
2739 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2740 *queue_id = minor - 1;
2745 if (!error && options) {
2746 error = htb_parse_tca_options__(nl_options, options);
/* Fills in 'hc' with the qdisc-wide rates described by 'details'.
 *
 * The "max-rate" value in 'details' is divided by 8 before being stored.  If
 * the key is absent or the result is zero, the rate is derived from the
 * device's current features instead, passing 100 * 1000 * 1000 as the second
 * argument of netdev_features_to_bps().  'min_rate' is set equal to
 * 'max_rate'. */
2752 htb_parse_qdisc_details__(struct netdev *netdev,
2753 const struct smap *details, struct htb_class *hc)
2755 const char *max_rate_s;
2757 max_rate_s = smap_get(details, "max-rate");
2758 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2759 if (!hc->max_rate) {
2760 enum netdev_features current;
2762 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2763 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2765 hc->min_rate = hc->max_rate;
/* Fills in 'hc' from the "min-rate", "max-rate", "burst" and "priority" keys
 * of 'details', clamping the rates against the device MTU and the qdisc's
 * 'max_rate'.  Rate and burst values are divided by 8 before being stored. */
2771 htb_parse_class_details__(struct netdev *netdev,
2772 const struct smap *details, struct htb_class *hc)
2774 const struct htb *htb = htb_get__(netdev);
2775 const char *min_rate_s = smap_get(details, "min-rate");
2776 const char *max_rate_s = smap_get(details, "max-rate");
2777 const char *burst_s = smap_get(details, "burst");
2778 const char *priority_s = smap_get(details, "priority");
2781 error = netdev_get_mtu(netdev, &mtu);
2783 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2784 netdev_get_name(netdev));
2788 /* HTB requires at least an mtu sized min-rate to send any traffic even
2789 * on uncongested links. */
2790 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2791 hc->min_rate = MAX(hc->min_rate, mtu);
2792 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
/* 'max_rate' ends up within [min_rate, htb->max_rate]. */
2795 hc->max_rate = (max_rate_s
2796 ? strtoull(max_rate_s, NULL, 10) / 8
2798 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2799 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2803 /* According to hints in the documentation that I've read, it is important
2804 * that 'burst' be at least as big as the largest frame that might be
2805 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2806 * but having it a bit too small is a problem. Since netdev_get_mtu()
2807 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2808 * the MTU. We actually add 64, instead of 14, in case additional
2809 * headers get tacked on somewhere that we're not aware of. */
2810 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2811 hc->burst = MAX(hc->burst, mtu + 64);
2814 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the class 'handle' under 'parent' on 'netdev' and parses the reply
 * into 'options' and 'stats' via htb_parse_tcmsg__() (with a null
 * 'queue_id').  The reply buffer is deleted after parsing. */
2820 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2821 unsigned int parent, struct htb_class *options,
2822 struct netdev_queue_stats *stats)
2824 struct ofpbuf *reply;
2827 error = tc_query_class(netdev, handle, parent, &reply);
2829 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2830 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': creates the qdisc, then sets up class 1:fffe
 * under 1:0 with the rates parsed from 'details', then records the result
 * with htb_install__().
 * NOTE(review): the conditions guarding each step are on lines not visible
 * in this excerpt. */
2836 htb_tc_install(struct netdev *netdev, const struct smap *details)
2840 error = htb_setup_qdisc__(netdev);
2842 struct htb_class hc;
2844 htb_parse_qdisc_details__(netdev, details, &hc);
2845 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2846 tc_make_handle(1, 0), &hc);
2848 htb_install__(netdev, hc.max_rate);
/* Returns the 'struct htb_class' whose 'tc_queue' member is 'queue'. */
2854 static struct htb_class *
2855 htb_class_cast__(const struct tc_queue *queue)
2857 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the settings in 'hc' for queue 'queue_id' in 'netdev''s HTB state.
 * The queue is looked up by ID; a newly allocated htb_class is inserted into
 * the queue map when needed (the branch condition is on a line not visible
 * here).  Only the local bookkeeping is updated by the visible code. */
2861 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2862 const struct htb_class *hc)
2864 struct htb *htb = htb_get__(netdev);
2865 size_t hash = hash_int(queue_id, 0);
2866 struct tc_queue *queue;
2867 struct htb_class *hcp;
2869 queue = tc_find_queue__(netdev, queue_id, hash);
2871 hcp = htb_class_cast__(queue);
2873 hcp = xmalloc(sizeof *hcp);
2874 queue = &hcp->tc_queue;
2875 queue->queue_id = queue_id;
2876 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2879 hcp->min_rate = hc->min_rate;
2880 hcp->max_rate = hc->max_rate;
2881 hcp->burst = hc->burst;
2882 hcp->priority = hc->priority;
/* Rebuilds the local HTB state of 'netdev' from what is configured in the
 * kernel: class 1:fffe supplies the qdisc-wide 'max_rate', then every class
 * returned by the queue dump that parses successfully is recorded as a
 * queue.  'nlmsg' is unused. */
2886 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2889 struct nl_dump dump;
2890 struct htb_class hc;
2892 /* Get qdisc options. */
/* The result of this query is not checked by the visible code. */
2894 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2895 htb_install__(netdev, hc.max_rate);
2898 if (!start_queue_dump(netdev, &dump)) {
2901 while (nl_dump_next(&dump, &msg)) {
2902 unsigned int queue_id;
2904 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2905 htb_update_queue__(netdev, queue_id, &hc);
2908 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc': every queue is removed from the
 * queue map, using the _SAFE iterator because entries are removed during the
 * walk.  NOTE(review): the statements that release memory are on lines not
 * visible in this excerpt. */
2914 htb_tc_destroy(struct tc *tc)
2916 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2917 struct htb_class *hc, *next;
2919 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2920 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds "max-rate" to 'details', reported as 8 times the stored qdisc
 * 'max_rate' (computed in unsigned long long). */
2928 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2930 const struct htb *htb = htb_get__(netdev);
2931 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* Reconfigures class 1:fffe (parent 1:0) of 'netdev' from 'details' and
 * stores the new 'max_rate' in the local HTB state.  NOTE(review): the
 * condition guarding the final assignment is on a line not visible here. */
2936 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2938 struct htb_class hc;
2941 htb_parse_qdisc_details__(netdev, details, &hc);
2942 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2943 tc_make_handle(1, 0), &hc);
2945 htb_get__(netdev)->max_rate = hc.max_rate;
/* Describes 'queue' in 'details'.  Rates and burst are reported as 8 times
 * the stored values; "max-rate" is only added when it differs from
 * "min-rate".  NOTE(review): a line between "burst" and "priority" is not
 * visible in this excerpt, so "priority" may be conditional. */
2951 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2952 const struct tc_queue *queue, struct smap *details)
2954 const struct htb_class *hc = htb_class_cast__(queue);
2956 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2957 if (hc->min_rate != hc->max_rate) {
2958 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2960 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2962 smap_add_format(details, "priority", "%u", hc->priority);
/* Configures queue 'queue_id' of 'netdev' from 'details': parses the
 * settings, sets up class 1:(queue_id + 1) under 1:fffe, and records the
 * settings locally.  NOTE(review): the error checks between the steps are on
 * lines not visible in this excerpt. */
2968 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2969 const struct smap *details)
2971 struct htb_class hc;
2974 error = htb_parse_class_details__(netdev, details, &hc);
2979 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2980 tc_make_handle(1, 0xfffe), &hc);
2985 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes class 1:(queue_id + 1) from 'netdev' and removes 'queue' from the
 * local queue map.  NOTE(review): the condition guarding the removal and any
 * freeing of the htb_class are on lines not visible in this excerpt. */
2990 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2992 struct htb_class *hc = htb_class_cast__(queue);
2993 struct htb *htb = htb_get__(netdev);
2996 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2998 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying class
 * 1:(queue_id + 1) under 1:fffe; class options are not requested. */
3005 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3006 struct netdev_queue_stats *stats)
3008 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3009 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses one class message 'nlmsg' and, when its handle is 1:N with
 * 1 <= N <= HTB_N_QUEUES, invokes 'cb' with queue ID N - 1, the parsed
 * statistics and 'aux'.  'netdev' is unused. */
3013 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3014 const struct ofpbuf *nlmsg,
3015 netdev_dump_queue_stats_cb *cb, void *aux)
3017 struct netdev_queue_stats stats;
3018 unsigned int handle, major, minor;
3021 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3026 major = tc_get_major(handle);
3027 minor = tc_get_minor(handle);
3028 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3029 (*cb)(minor - 1, &stats, aux);
/* Operations table for the "linux-htb" QoS type (kernel qdisc "htb").
 * NOTE(review): the members between 'n_queues' and the last two are on lines
 * not visible in this excerpt. */
3034 static const struct tc_ops tc_ops_htb = {
3035 "htb", /* linux_name */
3036 "linux-htb", /* ovs_name */
3037 HTB_N_QUEUES, /* n_queues */
3046 htb_class_get_stats,
3047 htb_class_dump_stats
3050 /* "linux-hfsc" traffic control class. */
/* HFSC_N_QUEUES bounds the class minor numbers that are accepted as queues
 * (queue ID = minor - 1) and is the 'n_queues' value in tc_ops_hfsc. */
3052 #define HFSC_N_QUEUES 0xf000
/* Embedded member from which hfsc_class_cast__() recovers the enclosing
 * 'struct hfsc_class'. */
3060 struct tc_queue tc_queue;
/* Returns the 'struct hfsc' that embeds the tc currently attached to
 * 'netdev''s device.  No check is made here that the attached tc really is an
 * HFSC one, so the caller must already know that. */
3065 static struct hfsc *
3066 hfsc_get__(const struct netdev *netdev)
3068 struct netdev_dev_linux *netdev_dev;
3069 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3070 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the 'struct hfsc_class' whose 'tc_queue' member is 'queue'. */
3073 static struct hfsc_class *
3074 hfsc_class_cast__(const struct tc_queue *queue)
3076 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new 'struct hfsc', initializes its tc with tc_ops_hfsc, records
 * 'max_rate' and makes it the tc of 'netdev''s device.  The visible code does
 * not free whatever 'netdev_dev->tc' pointed to before. */
3080 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
3082 struct netdev_dev_linux * netdev_dev;
3085 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3086 hfsc = xmalloc(sizeof *hfsc);
3087 tc_init(&hfsc->tc, &tc_ops_hfsc);
3088 hfsc->max_rate = max_rate;
3089 netdev_dev->tc = &hfsc->tc;
/* Records the rates in 'hc' for queue 'queue_id' in 'netdev''s HFSC state.
 * The queue is looked up by ID; a newly allocated hfsc_class is inserted into
 * the queue map when needed (the branch condition is on a line not visible
 * here).  Only the local bookkeeping is updated by the visible code. */
3093 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3094 const struct hfsc_class *hc)
3098 struct hfsc_class *hcp;
3099 struct tc_queue *queue;
3101 hfsc = hfsc_get__(netdev);
3102 hash = hash_int(queue_id, 0);
3104 queue = tc_find_queue__(netdev, queue_id, hash);
3106 hcp = hfsc_class_cast__(queue);
3108 hcp = xmalloc(sizeof *hcp);
3109 queue = &hcp->tc_queue;
3110 queue->queue_id = queue_id;
3111 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3114 hcp->min_rate = hc->min_rate;
3115 hcp->max_rate = hc->max_rate;
/* Parses the HFSC class attributes nested in 'nl_options' into 'class'.
 *
 * Three service curves are read (TCA_HFSC_RSC, TCA_HFSC_FSC, TCA_HFSC_USC),
 * each at least sizeof(struct tc_service_curve) bytes long.  Only the
 * configuration this file itself produces is accepted: all curves linear
 * (m1 and d zero), RSC and FSC with the same m2, and that m2 not above the
 * USC m2.  On success 'min_rate' is the FSC m2 and 'max_rate' the USC m2. */
3119 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3121 const struct tc_service_curve *rsc, *fsc, *usc;
3122 static const struct nl_policy tca_hfsc_policy[] = {
3124 .type = NL_A_UNSPEC,
3126 .min_len = sizeof(struct tc_service_curve),
3129 .type = NL_A_UNSPEC,
3131 .min_len = sizeof(struct tc_service_curve),
3134 .type = NL_A_UNSPEC,
3136 .min_len = sizeof(struct tc_service_curve),
3139 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3141 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3142 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3143 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3147 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3148 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3149 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3151 if (rsc->m1 != 0 || rsc->d != 0 ||
3152 fsc->m1 != 0 || fsc->d != 0 ||
3153 usc->m1 != 0 || usc->d != 0) {
3154 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3155 "Non-linear service curves are not supported.");
3159 if (rsc->m2 != fsc->m2) {
3160 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3161 "Real-time service curves are not supported ");
3165 if (rsc->m2 > usc->m2) {
3166 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3167 "Min-rate service curve is greater than "
3168 "the max-rate service curve.");
3172 class->min_rate = fsc->m2;
3173 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class(), passing 'stats'
 * through to it.  A handle of the form 1:N with 1 <= N <= HFSC_N_QUEUES
 * yields queue ID N - 1 in '*queue_id', and the HFSC options are parsed into
 * 'options'.  NOTE(review): the null checks guarding 'queue_id' and
 * 'options' are on lines not visible in this excerpt. */
3178 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3179 struct hfsc_class *options,
3180 struct netdev_queue_stats *stats)
3183 unsigned int handle;
3184 struct nlattr *nl_options;
3186 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3192 unsigned int major, minor;
3194 major = tc_get_major(handle);
3195 minor = tc_get_minor(handle);
3196 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3197 *queue_id = minor - 1;
3204 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the class 'handle' under 'parent' on 'netdev' and parses the reply
 * into 'options' and 'stats' via hfsc_parse_tcmsg__() (with a null
 * 'queue_id').  The reply buffer is deleted after parsing. */
3211 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3212 unsigned int parent, struct hfsc_class *options,
3213 struct netdev_queue_stats *stats)
3216 struct ofpbuf *reply;
3218 error = tc_query_class(netdev, handle, parent, &reply);
3223 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3224 ofpbuf_delete(reply);
/* Fills in 'class' with the qdisc-wide rate described by 'details'.
 *
 * The "max-rate" value in 'details' is divided by 8.  A fallback rate is
 * derived from the device's current features, passing 100 * 1000 * 1000 as
 * the second argument of netdev_features_to_bps(); the condition selecting
 * the fallback is on a line not visible in this excerpt.  Both 'min_rate'
 * and 'max_rate' are set to the resulting value. */
3229 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3230 struct hfsc_class *class)
3233 const char *max_rate_s;
3235 max_rate_s = smap_get(details, "max-rate");
3236 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3239 enum netdev_features current;
3241 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3242 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3245 class->min_rate = max_rate;
3246 class->max_rate = max_rate;
/* Fills in 'class' from the "min-rate" and "max-rate" keys of 'details'.
 * Both values are divided by 8.  'min_rate' is forced to at least 1 and at
 * most the qdisc's 'max_rate'; 'max_rate' is then forced into
 * [min_rate, qdisc max_rate]. */
3250 hfsc_parse_class_details__(struct netdev *netdev,
3251 const struct smap *details,
3252 struct hfsc_class * class)
3254 const struct hfsc *hfsc;
3255 uint32_t min_rate, max_rate;
3256 const char *min_rate_s, *max_rate_s;
3258 hfsc = hfsc_get__(netdev);
3259 min_rate_s = smap_get(details, "min-rate");
3260 max_rate_s = smap_get(details, "max-rate");
3262 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3263 min_rate = MAX(min_rate, 1);
3264 min_rate = MIN(min_rate, hfsc->max_rate);
3266 max_rate = (max_rate_s
3267 ? strtoull(max_rate_s, NULL, 10) / 8
3269 max_rate = MAX(max_rate, min_rate);
3270 max_rate = MIN(max_rate, hfsc->max_rate);
3272 class->min_rate = min_rate;
3273 class->max_rate = max_rate;
3278 /* Create an HFSC qdisc.
3280 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3282 hfsc_setup_qdisc__(struct netdev * netdev)
3284 struct tcmsg *tcmsg;
3285 struct ofpbuf request;
3286 struct tc_hfsc_qopt opt;
/* tc_del_qdisc() is called first; its return value is not checked here. */
3288 tc_del_qdisc(netdev);
3290 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3291 NLM_F_EXCL | NLM_F_CREATE, &request);
/* The new qdisc gets handle 1:0 and is attached at the root. */
3297 tcmsg->tcm_handle = tc_make_handle(1, 0);
3298 tcmsg->tcm_parent = TC_H_ROOT;
3300 memset(&opt, 0, sizeof opt);
/* Here the options structure is put directly as TCA_OPTIONS, not nested. */
3303 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3304 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3306 return tc_transact(&request, NULL);
3309 /* Create an HFSC class.
3311 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3312 * sc rate <min_rate> ul rate <max_rate>" */
3314 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3315 unsigned int parent, struct hfsc_class *class)
3319 struct tcmsg *tcmsg;
3320 struct ofpbuf request;
3321 struct tc_service_curve min, max;
3323 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3329 tcmsg->tcm_handle = handle;
3330 tcmsg->tcm_parent = parent;
/* 'min' carries the class's min_rate and 'max' its max_rate as the m2 of the
 * respective service curve. */
3334 min.m2 = class->min_rate;
3338 max.m2 = class->max_rate;
/* The same 'min' curve is sent as both RSC and FSC; 'max' is sent as USC. */
3340 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3341 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3342 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3343 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3344 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3345 nl_msg_end_nested(&request, opt_offset);
3347 error = tc_transact(&request, NULL);
3349 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3350 "min-rate %ubps, max-rate %ubps (%s)",
3351 netdev_get_name(netdev),
3352 tc_get_major(handle), tc_get_minor(handle),
3353 tc_get_major(parent), tc_get_minor(parent),
3354 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev': creates the qdisc, then sets up class 1:fffe
 * under 1:0 with the rate parsed from 'details', then records the result
 * with hfsc_install__().
 * NOTE(review): the error checks between the steps are on lines not visible
 * in this excerpt. */
3361 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3364 struct hfsc_class class;
3366 error = hfsc_setup_qdisc__(netdev);
3372 hfsc_parse_qdisc_details__(netdev, details, &class);
3373 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3374 tc_make_handle(1, 0), &class);
3380 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the local HFSC state of 'netdev' from what is configured in the
 * kernel: class 1:fffe supplies the qdisc-wide 'max_rate', then every class
 * returned by the queue dump that parses successfully is recorded as a
 * queue.  'nlmsg' is unused. */
3385 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3388 struct nl_dump dump;
3389 struct hfsc_class hc;
/* The result of this query is not checked by the visible code. */
3392 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3393 hfsc_install__(netdev, hc.max_rate);
3395 if (!start_queue_dump(netdev, &dump)) {
3399 while (nl_dump_next(&dump, &msg)) {
3400 unsigned int queue_id;
3402 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3403 hfsc_update_queue__(netdev, queue_id, &hc);
3407 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc': every queue is removed from the
 * queue map, using the _SAFE iterator because entries are removed during the
 * walk.  NOTE(review): the statements that release memory are on lines not
 * visible in this excerpt. */
3412 hfsc_tc_destroy(struct tc *tc)
3415 struct hfsc_class *hc, *next;
3417 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3419 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3420 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds "max-rate" to 'details', reported as 8 times the stored qdisc
 * 'max_rate' (computed in unsigned long long). */
3429 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3431 const struct hfsc *hfsc;
3432 hfsc = hfsc_get__(netdev);
3433 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* Reconfigures class 1:fffe (parent 1:0) of 'netdev' from 'details' and
 * stores the new 'max_rate' in the local HFSC state.  NOTE(review): the
 * condition guarding the final assignment is on a line not visible here. */
3438 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3441 struct hfsc_class class;
3443 hfsc_parse_qdisc_details__(netdev, details, &class);
3444 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3445 tc_make_handle(1, 0), &class);
3448 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Describes 'queue' in 'details'.  Rates are reported as 8 times the stored
 * values; "max-rate" is only added when it differs from "min-rate".
 * 'netdev' is unused. */
3455 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3456 const struct tc_queue *queue, struct smap *details)
3458 const struct hfsc_class *hc;
3460 hc = hfsc_class_cast__(queue);
3461 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3462 if (hc->min_rate != hc->max_rate) {
3463 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* Configures queue 'queue_id' of 'netdev' from 'details': parses the rates,
 * sets up class 1:(queue_id + 1) under 1:fffe, and records the rates locally.
 * NOTE(review): the error checks between the steps are on lines not visible
 * in this excerpt. */
3469 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3470 const struct smap *details)
3473 struct hfsc_class class;
3475 error = hfsc_parse_class_details__(netdev, details, &class);
3480 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3481 tc_make_handle(1, 0xfffe), &class);
3486 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes class 1:(queue_id + 1) from 'netdev' and removes 'queue' from the
 * local queue map.  NOTE(review): the condition guarding the removal and any
 * freeing of the hfsc_class are on lines not visible in this excerpt. */
3491 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3495 struct hfsc_class *hc;
3497 hc = hfsc_class_cast__(queue);
3498 hfsc = hfsc_get__(netdev);
3500 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3502 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying class
 * 1:(queue_id + 1) under 1:fffe; class options are not requested. */
3509 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3510 struct netdev_queue_stats *stats)
3512 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3513 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses one class message 'nlmsg' and, when its handle is 1:N with
 * 1 <= N <= HFSC_N_QUEUES, invokes 'cb' with queue ID N - 1, the parsed
 * statistics and 'aux'.  'netdev' is unused. */
3517 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3518 const struct ofpbuf *nlmsg,
3519 netdev_dump_queue_stats_cb *cb, void *aux)
3521 struct netdev_queue_stats stats;
3522 unsigned int handle, major, minor;
3525 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3530 major = tc_get_major(handle);
3531 minor = tc_get_minor(handle);
3532 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3533 (*cb)(minor - 1, &stats, aux);
/* Operations table for the "linux-hfsc" QoS type (kernel qdisc "hfsc"); every
 * hook is implemented by the hfsc_*() functions above. */
3538 static const struct tc_ops tc_ops_hfsc = {
3539 "hfsc", /* linux_name */
3540 "linux-hfsc", /* ovs_name */
3541 HFSC_N_QUEUES, /* n_queues */
3542 hfsc_tc_install, /* tc_install */
3543 hfsc_tc_load, /* tc_load */
3544 hfsc_tc_destroy, /* tc_destroy */
3545 hfsc_qdisc_get, /* qdisc_get */
3546 hfsc_qdisc_set, /* qdisc_set */
3547 hfsc_class_get, /* class_get */
3548 hfsc_class_set, /* class_set */
3549 hfsc_class_delete, /* class_delete */
3550 hfsc_class_get_stats, /* class_get_stats */
3551 hfsc_class_dump_stats /* class_dump_stats */
3554 /* "linux-default" traffic control class.
3556 * This class represents the default, unnamed Linux qdisc. It corresponds to
3557 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev''s device at a single function-static tc object that uses
 * tc_ops_default; nothing is allocated. */
3560 default_install__(struct netdev *netdev)
3562 struct netdev_dev_linux *netdev_dev =
3563 netdev_dev_linux_cast(netdev_get_dev(netdev));
3564 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3566 /* Nothing but a tc class implementation is allowed to write to a tc. This
3567 * class never does that, so we can legitimately use a const tc object. */
3568 netdev_dev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install hook for the default class: ignores 'details' and attaches the
 * shared default tc via default_install__(). */
3572 default_tc_install(struct netdev *netdev,
3573 const struct smap *details OVS_UNUSED)
3575 default_install__(netdev);
/* tc_load hook for the default class: ignores 'nlmsg' and attaches the shared
 * default tc via default_install__(). */
3580 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3582 default_install__(netdev);
/* Operations table for the default class.  It has no kernel qdisc name and
 * none of the destroy/qdisc/class hooks.  NOTE(review): the 'ovs_name',
 * 'n_queues', 'tc_install' and 'tc_load' members are on lines not visible in
 * this excerpt. */
3586 static const struct tc_ops tc_ops_default = {
3587 NULL, /* linux_name */
3592 NULL, /* tc_destroy */
3593 NULL, /* qdisc_get */
3594 NULL, /* qdisc_set */
3595 NULL, /* class_get */
3596 NULL, /* class_set */
3597 NULL, /* class_delete */
3598 NULL, /* class_get_stats */
3599 NULL /* class_dump_stats */
3602 /* "linux-other" traffic control class. */
/* tc_load hook for the "linux-other" class: ignores 'nlmsg' and points
 * 'netdev''s device at a single function-static tc object that uses
 * tc_ops_other; nothing is allocated. */
3607 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3609 struct netdev_dev_linux *netdev_dev =
3610 netdev_dev_linux_cast(netdev_get_dev(netdev));
3611 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3613 /* Nothing but a tc class implementation is allowed to write to a tc. This
3614 * class never does that, so we can legitimately use a const tc object. */
3615 netdev_dev->tc = CONST_CAST(struct tc *, &tc);
/* Operations table for the "linux-other" QoS type.  It has no kernel qdisc
 * name, cannot be installed (null tc_install) and has none of the
 * destroy/qdisc/class hooks.  NOTE(review): the 'n_queues' and 'tc_load'
 * members are on lines not visible in this excerpt. */
3619 static const struct tc_ops tc_ops_other = {
3620 NULL, /* linux_name */
3621 "linux-other", /* ovs_name */
3623 NULL, /* tc_install */
3625 NULL, /* tc_destroy */
3626 NULL, /* qdisc_get */
3627 NULL, /* qdisc_set */
3628 NULL, /* class_get */
3629 NULL, /* class_set */
3630 NULL, /* class_delete */
3631 NULL, /* class_get_stats */
3632 NULL /* class_dump_stats */
3635 /* Traffic control. */
3637 /* Number of kernel "tc" ticks per second. */
3638 static double ticks_per_s;
3640 /* Number of kernel "jiffies" per second. This is used for the purpose of
3641 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3642 * one jiffy's worth of data.
3644 * There are two possibilities here:
3646 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3647 * approximate range of 100 to 1024. That means that we really need to
3648 * make sure that the qdisc can buffer that much data.
3650 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3651 * has finely granular timers and there's no need to fudge additional room
3652 * for buffers. (There's no extra effort needed to implement that: the
3653 * large 'buffer_hz' is used as a divisor, so practically any number will
3654 * come out as 0 in the division. Small integer results in the case of
3655 * really high dividends won't have any real effect anyhow.) */
3657 static unsigned int buffer_hz;
3659 /* Returns tc handle 'major':'minor'.
 *
 * 'major' is shifted into the upper 16 bits and combined with 'minor' by
 * TC_H_MAKE. */
3661 tc_make_handle(unsigned int major, unsigned int minor)
3663 return TC_H_MAKE(major << 16, minor);
3666 /* Returns the major number from 'handle', i.e. the TC_H_MAJ part shifted
 * down by 16 bits (the inverse of what tc_make_handle() does to 'major'). */
3668 tc_get_major(unsigned int handle)
3670 return TC_H_MAJ(handle) >> 16;
3673 /* Returns the minor number from 'handle', as extracted by TC_H_MIN. */
3675 tc_get_minor(unsigned int handle)
3677 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request of the given 'type' for 'netdev'
 * in 'request', which is initialized here with a 512-byte initial size.
 * NLM_F_REQUEST is always added to 'flags'.  The embedded tcmsg is zeroed
 * except for the address family (AF_UNSPEC) and the interface index.
 * NOTE(review): the return statements, including what is returned when
 * get_ifindex() fails, are on lines not visible in this excerpt. */
3680 static struct tcmsg *
3681 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3682 struct ofpbuf *request)
3684 struct tcmsg *tcmsg;
3688 error = get_ifindex(netdev, &ifindex);
3693 ofpbuf_init(request, 512);
3694 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3695 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3696 tcmsg->tcm_family = AF_UNSPEC;
3697 tcmsg->tcm_ifindex = ifindex;
3698 /* Caller should fill in tcmsg->tcm_handle. */
3699 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on 'rtnl_sock' and, if 'replyp' is passed on as nonnull,
 * lets nl_sock_transact() store the reply there.  'request' is always
 * uninitialized afterwards, whatever the outcome, so callers must not reuse
 * or free it. */
3705 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3707 int error = nl_sock_transact(rtnl_sock, request, replyp);
3708 ofpbuf_uninit(request);
3712 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3713 * policing configuration.
3715 * This function is equivalent to running the following when 'add' is true:
3716 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3718 * This function is equivalent to running the following when 'add' is false:
3719 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3721 * The configuration and stats may be seen with the following command:
3722 * /sbin/tc -s qdisc show dev <devname>
3724 * Returns 0 if successful, otherwise a positive errno value. */
3727 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3729 struct ofpbuf request;
3730 struct tcmsg *tcmsg;
/* Creation is exclusive (NLM_F_EXCL | NLM_F_CREATE); deletion needs no extra
 * flags. */
3732 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3733 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3735 tcmsg = tc_make_request(netdev, type, flags, &request);
3739 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3740 tcmsg->tcm_parent = TC_H_INGRESS;
/* The "ingress" kind takes an empty TCA_OPTIONS attribute. */
3741 nl_msg_put_string(&request, TCA_KIND, "ingress");
3742 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3744 error = tc_transact(&request, NULL);
3746 /* If we're deleting the qdisc, don't worry about some of the
3747 * error conditions. */
3748 if (!add && (error == ENOENT || error == EINVAL)) {
3757 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3760 * This function is equivalent to running:
3761 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3762 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3765 * The configuration and stats may be seen with the following command:
3766 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3768 * Returns 0 if successful, otherwise a positive errno value.
3771 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3773 struct tc_police tc_police;
3774 struct ofpbuf request;
3775 struct tcmsg *tcmsg;
3776 size_t basic_offset;
3777 size_t police_offset;
3781 memset(&tc_police, 0, sizeof tc_police);
3782 tc_police.action = TC_POLICE_SHOT;
3783 tc_police.mtu = mtu;
3784 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3785 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3786 kbits_burst * 1024);
3788 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3789 NLM_F_EXCL | NLM_F_CREATE, &request);
3793 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3794 tcmsg->tcm_info = tc_make_handle(49,
3795 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3797 nl_msg_put_string(&request, TCA_KIND, "basic");
3798 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3799 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3800 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3801 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3802 nl_msg_end_nested(&request, police_offset);
3803 nl_msg_end_nested(&request, basic_offset);
3805 error = tc_transact(&request, NULL);
3816 /* The values in psched are not individually very meaningful, but they are
3817 * important. The tables below show some values seen in the wild.
3821 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3822 * (Before that, there are hints that it was 1000000000.)
3824 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3828 * -----------------------------------
3829 * [1] 000c8000 000f4240 000f4240 00000064
3830 * [2] 000003e8 00000400 000f4240 3b9aca00
3831 * [3] 000003e8 00000400 000f4240 3b9aca00
3832 * [4] 000003e8 00000400 000f4240 00000064
3833 * [5] 000003e8 00000040 000f4240 3b9aca00
3834 * [6] 000003e8 00000040 000f4240 000000f9
3836 * a b c d ticks_per_s buffer_hz
3837 * ------- --------- ---------- ------------- ----------- -------------
3838 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3839 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3840 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3841 * [4] 1,000 1,024 1,000,000 100 976,562 100
3842 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3843 * [6] 1,000 64 1,000,000 249 15,625,000 249
3845 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3846 * [2] 2.6.26-1-686-bigmem from Debian lenny
3847 * [3] 2.6.26-2-sparc64 from Debian lenny
3848 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3849 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3850 * [6] 2.6.34 from kernel.org on KVM */
3852 static const char fn[] = "/proc/net/psched";
3853 unsigned int a, b, c, d;
3859 stream = fopen(fn, "r");
3861 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
/* The file holds four hexadecimal words, read here as a, b, c and d. */
3865 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3866 VLOG_WARN("%s: read failed", fn);
3870 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3874 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* Computed in floating point, so the product a * c cannot wrap. */
3878 ticks_per_s = (double) a * c / b;
3882 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3885 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3888 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3889 * rate of 'rate' bytes per second. */
3891 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3896 return (rate * ticks) / ticks_per_s;
3899 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3900 * rate of 'rate' bytes per second. */
3902 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3907 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3910 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3911 * a transmission rate of 'rate' bytes per second. */
3913 tc_buffer_per_jiffy(unsigned int rate)
3918 return rate / buffer_hz;
3921 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3922 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3923 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3924 * stores NULL into it if it is absent.
3926 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3929 * Returns 0 if successful, otherwise a positive errno value. */
3931 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3932 struct nlattr **options)
3934 static const struct nl_policy tca_policy[] = {
3935 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3936 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3938 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3940 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3941 tca_policy, ta, ARRAY_SIZE(ta))) {
3942 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3947 *kind = nl_attr_get_string(ta[TCA_KIND]);
3951 *options = ta[TCA_OPTIONS];
3966 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3967 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3968 * into '*options', and its queue statistics into '*stats'. Any of the output
3969 * arguments may be null.
3971 * Returns 0 if successful, otherwise a positive errno value. */
3973 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3974 struct nlattr **options, struct netdev_queue_stats *stats)
3976 static const struct nl_policy tca_policy[] = {
3977 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3978 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3980 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3982 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3983 tca_policy, ta, ARRAY_SIZE(ta))) {
3984 VLOG_WARN_RL(&rl, "failed to parse class message");
3989 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3990 *handlep = tc->tcm_handle;
3994 *options = ta[TCA_OPTIONS];
3998 const struct gnet_stats_queue *gsq;
3999 struct gnet_stats_basic gsb;
4001 static const struct nl_policy stats_policy[] = {
4002 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4003 .min_len = sizeof gsb },
4004 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4005 .min_len = sizeof *gsq },
4007 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4009 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4010 sa, ARRAY_SIZE(sa))) {
4011 VLOG_WARN_RL(&rl, "failed to parse class stats");
4015 /* Alignment issues screw up the length of struct gnet_stats_basic on
4016 * some arch/bitsize combinations. Newer versions of Linux have a
4017 * struct gnet_stats_basic_packed, but we can't depend on that. The
4018 * easiest thing to do is just to make a copy. */
4019 memset(&gsb, 0, sizeof gsb);
4020 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4021 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4022 stats->tx_bytes = gsb.bytes;
4023 stats->tx_packets = gsb.packets;
4025 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4026 stats->tx_errors = gsq->drops;
4036 memset(stats, 0, sizeof *stats);
4041 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4044 tc_query_class(const struct netdev *netdev,
4045 unsigned int handle, unsigned int parent,
4046 struct ofpbuf **replyp)
4048 struct ofpbuf request;
4049 struct tcmsg *tcmsg;
4052 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4056 tcmsg->tcm_handle = handle;
4057 tcmsg->tcm_parent = parent;
4059 error = tc_transact(&request, replyp);
4061 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4062 netdev_get_name(netdev),
4063 tc_get_major(handle), tc_get_minor(handle),
4064 tc_get_major(parent), tc_get_minor(parent),
4070 /* Equivalent to "tc class del dev <name> handle <handle>". */
4072 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4074 struct ofpbuf request;
4075 struct tcmsg *tcmsg;
4078 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4082 tcmsg->tcm_handle = handle;
4083 tcmsg->tcm_parent = 0;
4085 error = tc_transact(&request, NULL);
4087 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4088 netdev_get_name(netdev),
4089 tc_get_major(handle), tc_get_minor(handle),
4095 /* Equivalent to "tc qdisc del dev <name> root". */
4097 tc_del_qdisc(struct netdev *netdev)
4099 struct netdev_dev_linux *netdev_dev =
4100 netdev_dev_linux_cast(netdev_get_dev(netdev));
4101 struct ofpbuf request;
4102 struct tcmsg *tcmsg;
4105 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
4109 tcmsg->tcm_handle = tc_make_handle(1, 0);
4110 tcmsg->tcm_parent = TC_H_ROOT;
4112 error = tc_transact(&request, NULL);
4113 if (error == EINVAL) {
4114 /* EINVAL probably means that the default qdisc was in use, in which
4115 * case we've accomplished our purpose. */
4118 if (!error && netdev_dev->tc) {
4119 if (netdev_dev->tc->ops->tc_destroy) {
4120 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
4122 netdev_dev->tc = NULL;
4127 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4128 * kernel to determine what they are. Returns 0 if successful, otherwise a
4129 * positive errno value. */
4131 tc_query_qdisc(const struct netdev *netdev)
4133 struct netdev_dev_linux *netdev_dev =
4134 netdev_dev_linux_cast(netdev_get_dev(netdev));
4135 struct ofpbuf request, *qdisc;
4136 const struct tc_ops *ops;
4137 struct tcmsg *tcmsg;
4141 if (netdev_dev->tc) {
4145 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4146 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4147 * 2.6.35 without that fix backported to it.
4149 * To avoid the OOPS, we must not make a request that would attempt to dump
4150 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4151 * few others. There are a few ways that I can see to do this, but most of
4152 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4153 * technique chosen here is to assume that any non-default qdisc that we
4154 * create will have a class with handle 1:0. The built-in qdiscs only have
4155 * a class with handle 0:0.
4157 * We could check for Linux 2.6.35+ and use a more straightforward method
4159 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
4163 tcmsg->tcm_handle = tc_make_handle(1, 0);
4164 tcmsg->tcm_parent = 0;
4166 /* Figure out what tc class to instantiate. */
4167 error = tc_transact(&request, &qdisc);
4171 error = tc_parse_qdisc(qdisc, &kind, NULL);
4173 ops = &tc_ops_other;
4175 ops = tc_lookup_linux_name(kind);
4177 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4178 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4180 ops = &tc_ops_other;
4183 } else if (error == ENOENT) {
4184 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4185 * other entity that doesn't have a handle 1:0. We will assume
4186 * that it's the system default qdisc. */
4187 ops = &tc_ops_default;
4190 /* Who knows? Maybe the device got deleted. */
4191 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4192 netdev_get_name(netdev), strerror(error));
4193 ops = &tc_ops_other;
4196 /* Instantiate it. */
4197 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev), qdisc);
4198 ovs_assert((load_error == 0) == (netdev_dev->tc != NULL));
4199 ofpbuf_delete(qdisc);
4201 return error ? error : load_error;
4204 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4205 approximate the time to transmit packets of various lengths. For an MTU of
4206 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4207 represents two possible packet lengths; for a MTU of 513 through 1024, four
4208 possible lengths; and so on.
4210 Returns, for the specified 'mtu', the number of bits that packet lengths
4211 need to be shifted right to fit within such a 256-entry table. */
4213 tc_calc_cell_log(unsigned int mtu)
4218 mtu = ETH_PAYLOAD_MAX;
4220 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4222 for (cell_log = 0; mtu >= 256; cell_log++) {
4229 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4232 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4234 memset(rate, 0, sizeof *rate);
4235 rate->cell_log = tc_calc_cell_log(mtu);
4236 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4237 /* rate->cell_align = 0; */ /* distro headers. */
4238 rate->mpu = ETH_TOTAL_MIN;
4242 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4243 * attribute of the specified "type".
4245 * See tc_calc_cell_log() above for a description of "rtab"s. */
4247 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4252 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4253 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4254 unsigned packet_size = (i + 1) << rate->cell_log;
4255 if (packet_size < rate->mpu) {
4256 packet_size = rate->mpu;
4258 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst used is never smaller than one jiffy's worth of buffering at
 * 'Bps' plus 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = MAX(burst_bytes, min_burst);

    return tc_bytes_to_ticks(Bps, burst);
}
4273 /* Linux-only functions declared in netdev-linux.h */
4275 /* Returns a fd for an AF_INET socket or a negative errno value. */
4277 netdev_linux_get_af_inet_sock(void)
4279 int error = netdev_linux_init();
4280 return error ? -error : af_inet_sock;
4283 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4284 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4286 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4287 const char *flag_name, bool enable)
4289 const char *netdev_name = netdev_get_name(netdev);
4290 struct ethtool_value evalue;
4294 COVERAGE_INC(netdev_get_ethtool);
4295 memset(&evalue, 0, sizeof evalue);
4296 error = netdev_linux_do_ethtool(netdev_name,
4297 (struct ethtool_cmd *)&evalue,
4298 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4303 COVERAGE_INC(netdev_set_ethtool);
4304 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4305 error = netdev_linux_do_ethtool(netdev_name,
4306 (struct ethtool_cmd *)&evalue,
4307 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4312 COVERAGE_INC(netdev_get_ethtool);
4313 memset(&evalue, 0, sizeof evalue);
4314 error = netdev_linux_do_ethtool(netdev_name,
4315 (struct ethtool_cmd *)&evalue,
4316 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4321 if (new_flags != evalue.data) {
4322 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4323 "device %s failed", enable ? "enable" : "disable",
4324 flag_name, netdev_name);
4331 /* Utility functions. */
4333 /* Copies 'src' into 'dst', performing format conversion in the process. */
4335 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4336 const struct rtnl_link_stats *src)
4338 dst->rx_packets = src->rx_packets;
4339 dst->tx_packets = src->tx_packets;
4340 dst->rx_bytes = src->rx_bytes;
4341 dst->tx_bytes = src->tx_bytes;
4342 dst->rx_errors = src->rx_errors;
4343 dst->tx_errors = src->tx_errors;
4344 dst->rx_dropped = src->rx_dropped;
4345 dst->tx_dropped = src->tx_dropped;
4346 dst->multicast = src->multicast;
4347 dst->collisions = src->collisions;
4348 dst->rx_length_errors = src->rx_length_errors;
4349 dst->rx_over_errors = src->rx_over_errors;
4350 dst->rx_crc_errors = src->rx_crc_errors;
4351 dst->rx_frame_errors = src->rx_frame_errors;
4352 dst->rx_fifo_errors = src->rx_fifo_errors;
4353 dst->rx_missed_errors = src->rx_missed_errors;
4354 dst->tx_aborted_errors = src->tx_aborted_errors;
4355 dst->tx_carrier_errors = src->tx_carrier_errors;
4356 dst->tx_fifo_errors = src->tx_fifo_errors;
4357 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4358 dst->tx_window_errors = src->tx_window_errors;
4362 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4364 /* Policy for RTNLGRP_LINK messages.
4366 * There are *many* more fields in these messages, but currently we only
4367 * care about these fields. */
4368 static const struct nl_policy rtnlgrp_link_policy[] = {
4369 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4370 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4371 .min_len = sizeof(struct rtnl_link_stats) },
4374 struct ofpbuf request;
4375 struct ofpbuf *reply;
4376 struct ifinfomsg *ifi;
4377 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4380 ofpbuf_init(&request, 0);
4381 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4382 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4383 ifi->ifi_family = PF_UNSPEC;
4384 ifi->ifi_index = ifindex;
4385 error = nl_sock_transact(rtnl_sock, &request, &reply);
4386 ofpbuf_uninit(&request);
4391 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4392 rtnlgrp_link_policy,
4393 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4394 ofpbuf_delete(reply);
4398 if (!attrs[IFLA_STATS]) {
4399 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4400 ofpbuf_delete(reply);
4404 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4406 ofpbuf_delete(reply);
4412 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4414 static const char fn[] = "/proc/net/dev";
4419 stream = fopen(fn, "r");
4421 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4426 while (fgets(line, sizeof line, stream)) {
4429 #define X64 "%"SCNu64
4432 X64 X64 X64 X64 X64 X64 X64 "%*u"
4433 X64 X64 X64 X64 X64 X64 X64 "%*u",
4439 &stats->rx_fifo_errors,
4440 &stats->rx_frame_errors,
4446 &stats->tx_fifo_errors,
4448 &stats->tx_carrier_errors) != 15) {
4449 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4450 } else if (!strcmp(devname, netdev_name)) {
4451 stats->rx_length_errors = UINT64_MAX;
4452 stats->rx_over_errors = UINT64_MAX;
4453 stats->rx_crc_errors = UINT64_MAX;
4454 stats->rx_missed_errors = UINT64_MAX;
4455 stats->tx_aborted_errors = UINT64_MAX;
4456 stats->tx_heartbeat_errors = UINT64_MAX;
4457 stats->tx_window_errors = UINT64_MAX;
4463 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4469 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4475 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4478 *flags = ifr.ifr_flags;
4484 set_flags(struct netdev *netdev, unsigned int flags)
4488 ifr.ifr_flags = flags;
4489 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4494 do_get_ifindex(const char *netdev_name)
4498 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4499 COVERAGE_INC(netdev_get_ifindex);
4500 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4501 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4502 netdev_name, strerror(errno));
4505 return ifr.ifr_ifindex;
4509 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4511 struct netdev_dev_linux *netdev_dev =
4512 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4514 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4515 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4518 netdev_dev->get_ifindex_error = -ifindex;
4519 netdev_dev->ifindex = 0;
4521 netdev_dev->get_ifindex_error = 0;
4522 netdev_dev->ifindex = ifindex;
4524 netdev_dev->cache_valid |= VALID_IFINDEX;
4527 *ifindexp = netdev_dev->ifindex;
4528 return netdev_dev->get_ifindex_error;
4532 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4537 memset(&ifr, 0, sizeof ifr);
4538 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4539 COVERAGE_INC(netdev_get_hwaddr);
4540 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4541 /* ENODEV probably means that a vif disappeared asynchronously and
4542 * hasn't been removed from the database yet, so reduce the log level
4543 * to INFO for that case. */
4544 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4545 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4546 netdev_name, strerror(errno));
4549 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4550 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4551 VLOG_WARN("%s device has unknown hardware address family %d",
4552 netdev_name, hwaddr_family);
4554 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4559 set_etheraddr(const char *netdev_name,
4560 const uint8_t mac[ETH_ADDR_LEN])
4564 memset(&ifr, 0, sizeof ifr);
4565 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4566 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4567 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4568 COVERAGE_INC(netdev_set_hwaddr);
4569 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4570 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4571 netdev_name, strerror(errno));
4578 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4579 int cmd, const char *cmd_name)
4583 memset(&ifr, 0, sizeof ifr);
4584 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4585 ifr.ifr_data = (caddr_t) ecmd;
4588 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4591 if (errno != EOPNOTSUPP) {
4592 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4593 "failed: %s", cmd_name, name, strerror(errno));
4595 /* The device doesn't support this operation. That's pretty
4596 * common, so there's no point in logging anything. */
4603 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4604 const char *cmd_name)
4606 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4607 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4608 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4616 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4617 int cmd, const char *cmd_name)
4622 ifr.ifr_addr.sa_family = AF_INET;
4623 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4625 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4626 *ip = sin->sin_addr;
/* Returns a nonblocking AF_PACKET raw socket, or a negative errno value.
 *
 * The socket is created on the first call.  The outcome, whether a file
 * descriptor or a negative errno, is remembered and returned by every later
 * call. */
static int
af_packet_sock(void)
{
    static int sock = INT_MIN;  /* INT_MIN means "not yet attempted". */
    int error;

    if (sock != INT_MIN) {
        return sock;
    }

    sock = socket(AF_PACKET, SOCK_RAW, 0);
    if (sock < 0) {
        sock = -errno;
        VLOG_ERR("failed to create packet socket: %s", strerror(errno));
        return sock;
    }

    error = set_nonblocking(sock);
    if (error) {
        close(sock);
        sock = -error;
    }
    return sock;
}