2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
67 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
106 #define TC_RTAB_SIZE 1024
109 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
110 static int cache_notifier_refcount;
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7,
121 VALID_FEATURES = 1 << 8,
129 /* Traffic control. */
131 /* An instance of a traffic control class. Always associated with a particular
134 * Each TC implementation subclasses this with whatever additional data it
137 const struct tc_ops *ops;
138 struct hmap queues; /* Contains "struct tc_queue"s.
139 * Read by generic TC layer.
140 * Written only by TC implementation. */
143 /* One traffic control queue.
145 * Each TC implementation subclasses this with whatever additional data it
148 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
149 unsigned int queue_id; /* OpenFlow queue ID. */
152 /* A particular kind of traffic control. Each implementation generally maps to
153 * one particular Linux qdisc class.
155 * The functions below return 0 if successful or a positive errno value on
156 * failure, except where otherwise noted. All of them must be provided, except
157 * where otherwise noted. */
159 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
160 * This is null for tc_ops_default and tc_ops_other, for which there are no
161 * appropriate values. */
162 const char *linux_name;
164 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
165 const char *ovs_name;
167 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
168 * queues. The queues are numbered 0 through n_queues - 1. */
169 unsigned int n_queues;
171 /* Called to install this TC class on 'netdev'. The implementation should
172 * make the Netlink calls required to set up 'netdev' with the right qdisc
173 * and configure it according to 'details'. The implementation may assume
174 * that the current qdisc is the default; that is, there is no need for it
175 * to delete the current qdisc before installing itself.
177 * The contents of 'details' should be documented as valid for 'ovs_name'
178 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
179 * (which is built as ovs-vswitchd.conf.db(8)).
181 * This function must return 0 if and only if it sets 'netdev->tc' to an
182 * initialized 'struct tc'.
184 * (This function is null for tc_ops_other, which cannot be installed. For
185 * other TC classes it should always be nonnull.) */
186 int (*tc_install)(struct netdev *netdev, const struct smap *details);
188 /* Called when the netdev code determines (through a Netlink query) that
189 * this TC class's qdisc is installed on 'netdev', but we didn't install
190 * it ourselves and so don't know any of the details.
192 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
193 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
194 * implementation should parse the other attributes of 'nlmsg' as
195 * necessary to determine its configuration. If necessary it should also
196 * use Netlink queries to determine the configuration of queues on
199 * This function must return 0 if and only if it sets 'netdev->tc' to an
200 * initialized 'struct tc'. */
201 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
203 /* Destroys the data structures allocated by the implementation as part of
204 * 'tc'. (This includes destroying 'tc->queues' by calling
207 * The implementation should not need to perform any Netlink calls. If
208 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
209 * (But it may not be desirable.)
211 * This function may be null if 'tc' is trivial. */
212 void (*tc_destroy)(struct tc *tc);
214 /* Retrieves details of 'netdev->tc' configuration into 'details'.
216 * The implementation should not need to perform any Netlink calls, because
217 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
218 * cached the configuration.
220 * The contents of 'details' should be documented as valid for 'ovs_name'
221 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
222 * (which is built as ovs-vswitchd.conf.db(8)).
224 * This function may be null if 'tc' is not configurable.
226 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
228 /* Reconfigures 'netdev->tc' according to 'details', performing any
229 * required Netlink calls to complete the reconfiguration.
231 * The contents of 'details' should be documented as valid for 'ovs_name'
232 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
233 * (which is built as ovs-vswitchd.conf.db(8)).
235 * This function may be null if 'tc' is not configurable.
237 int (*qdisc_set)(struct netdev *, const struct smap *details);
239 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
240 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
242 * The contents of 'details' should be documented as valid for 'ovs_name'
243 * in the "other_config" column in the "Queue" table in
244 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
246 * The implementation should not need to perform any Netlink calls, because
247 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
248 * cached the queue configuration.
250 * This function may be null if 'tc' does not have queues ('n_queues' is
252 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
253 struct smap *details);
255 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
256 * 'details', performing any required Netlink calls to complete the
257 * reconfiguration. The caller ensures that 'queue_id' is less than
260 * The contents of 'details' should be documented as valid for 'ovs_name'
261 * in the "other_config" column in the "Queue" table in
262 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
264 * This function may be null if 'tc' does not have queues or its queues are
265 * not configurable. */
266 int (*class_set)(struct netdev *, unsigned int queue_id,
267 const struct smap *details);
269 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
270 * tc_queue's within 'netdev->tc->queues'.
272 * This function may be null if 'tc' does not have queues or its queues
273 * cannot be deleted. */
274 int (*class_delete)(struct netdev *, struct tc_queue *queue);
276 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
277 * 'struct tc_queue's within 'netdev->tc->queues'.
279 * On success, initializes '*stats'.
281 * This function may be null if 'tc' does not have queues or if it cannot
282 * report queue statistics. */
283 int (*class_get_stats)(const struct netdev *netdev,
284 const struct tc_queue *queue,
285 struct netdev_queue_stats *stats);
287 /* Extracts queue stats from 'nlmsg', which is a response to a
288 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
290 * This function may be null if 'tc' does not have queues or if it cannot
291 * report queue statistics. */
292 int (*class_dump_stats)(const struct netdev *netdev,
293 const struct ofpbuf *nlmsg,
294 netdev_dump_queue_stats_cb *cb, void *aux);
/* Prepares 'tc' for use by initializing its 'queues' hmap.
 * NOTE(review): 'ops' is presumably recorded in 'tc->ops' on a line that is
 * not visible in this view -- confirm. */
298 tc_init(struct tc *tc, const struct tc_ops *ops)
301 hmap_init(&tc->queues);
/* Tears down the generic part of 'tc' by destroying its 'queues' hmap. */
305 tc_destroy(struct tc *tc)
307 hmap_destroy(&tc->queues);
310 static const struct tc_ops tc_ops_htb;
311 static const struct tc_ops tc_ops_hfsc;
312 static const struct tc_ops tc_ops_default;
313 static const struct tc_ops tc_ops_other;
315 static const struct tc_ops *tcs[] = {
316 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
317 &tc_ops_hfsc, /* Hierarchical fair service curve. */
318 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
319 &tc_ops_other, /* Some other qdisc. */
323 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
324 static unsigned int tc_get_major(unsigned int handle);
325 static unsigned int tc_get_minor(unsigned int handle);
327 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
328 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
329 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
331 static struct tcmsg *tc_make_request(const struct netdev *, int type,
332 unsigned int flags, struct ofpbuf *);
333 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
334 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
335 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
338 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
339 struct nlattr **options);
340 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
341 struct nlattr **options,
342 struct netdev_queue_stats *);
343 static int tc_query_class(const struct netdev *,
344 unsigned int handle, unsigned int parent,
345 struct ofpbuf **replyp);
346 static int tc_delete_class(const struct netdev *, unsigned int handle);
348 static int tc_del_qdisc(struct netdev *netdev);
349 static int tc_query_qdisc(const struct netdev *netdev);
351 static int tc_calc_cell_log(unsigned int mtu);
352 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
353 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
354 const struct tc_ratespec *rate);
355 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Per-device state for the netdev classes implemented in this file.  Embeds
 * the generic 'struct netdev_dev' (recovered with netdev_dev_linux_cast())
 * plus cached kernel state and the errors seen while fetching it. */
357 struct netdev_dev_linux {
358 struct netdev_dev netdev_dev;
360 struct shash_node *shash_node;
361 unsigned int cache_valid; /* Bitmask of VALID_* flags. */
362 unsigned int change_seq; /* Set to 1 when the device is created. */
364 bool miimon; /* Link status of last poll. */
365 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
366 struct timer miimon_timer;
368 /* The following are figured out "on demand" only. They are only valid
369 * when the corresponding VALID_* bit in 'cache_valid' is set. */
371 uint8_t etheraddr[ETH_ADDR_LEN];
372 struct in_addr address, netmask;
375 unsigned int ifi_flags; /* Last interface flags recorded for the device. */
376 long long int carrier_resets; /* Bumped when IFF_RUNNING toggles. */
377 uint32_t kbits_rate; /* Policing data. */
378 uint32_t kbits_burst;
379 int vport_stats_error; /* Cached error code from vport_get_stats().
380 0 or an errno value. */
381 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
382 int ether_addr_error; /* Cached error code from set/get etheraddr. */
383 int netdev_policing_error; /* Cached error code from set policing. */
384 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
385 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
387 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
388 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
392 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
/* Tap-specific state; the rest of the file reaches it as 'state.tap', so it
 * sits inside an enclosing 'state' aggregate that is not visible here. */
396 struct tap_state tap;
400 struct netdev_linux {
401 struct netdev netdev;
405 /* Sockets used for ioctl operations. */
406 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
408 /* A Netlink routing socket that is not subscribed to any multicast groups. */
409 static struct nl_sock *rtnl_sock;
411 /* This is set pretty low because we probably won't learn anything from the
412 * additional log messages. */
413 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
415 static int netdev_linux_init(void);
417 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
418 int cmd, const char *cmd_name);
419 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
420 const char *cmd_name);
421 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
422 int cmd, const char *cmd_name);
423 static int get_flags(const struct netdev_dev *, unsigned int *flags);
424 static int set_flags(struct netdev *, unsigned int flags);
425 static int do_get_ifindex(const char *netdev_name);
426 static int get_ifindex(const struct netdev *, int *ifindexp);
427 static int do_set_addr(struct netdev *netdev,
428 int ioctl_nr, const char *ioctl_name,
429 struct in_addr addr);
430 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
431 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
432 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
433 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
434 static int af_packet_sock(void);
435 static void netdev_linux_miimon_run(void);
436 static void netdev_linux_miimon_wait(void);
/* Returns true if 'netdev_class' is one of the classes implemented by this
 * file, which is detected by its 'init' callback being netdev_linux_init(). */
439 is_netdev_linux_class(const struct netdev_class *netdev_class)
441 return netdev_class->init == netdev_linux_init;
/* Converts the generic 'netdev_dev' into the 'struct netdev_dev_linux' that
 * embeds it.  Asserts first that the device's class passes
 * is_netdev_linux_class(), because the CONTAINER_OF() downcast is only
 * meaningful for devices created by this file. */
444 static struct netdev_dev_linux *
445 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
447 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
448 assert(is_netdev_linux_class(netdev_class));
450 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts the generic 'netdev' into the 'struct netdev_linux' that embeds
 * it.  The class of the underlying device is asserted to pass
 * is_netdev_linux_class() before the CONTAINER_OF() downcast. */
453 static struct netdev_linux *
454 netdev_linux_cast(const struct netdev *netdev)
456 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
457 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
458 assert(is_netdev_linux_class(netdev_class));
460 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Sets up the file-level sockets shared by every device: 'af_inet_sock' (an
 * AF_INET datagram socket) and 'rtnl_sock' (a NETLINK_ROUTE socket).  Failures
 * are logged with the corresponding error string.
 *
 * NOTE(review): 'status' is static and starts at -1, presumably so that the
 * setup runs only once and later calls return the remembered result; the
 * guarding conditions and the return statement are not visible here. */
464 netdev_linux_init(void)
466 static int status = -1;
468 /* Create AF_INET socket. */
469 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
470 status = af_inet_sock >= 0 ? 0 : errno;
472 VLOG_ERR("failed to create inet socket: %s", strerror(status));
475 /* Create rtnetlink socket. */
477 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
479 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic work for this file: runs the rtnetlink link machinery first and
 * the MII link monitor second. */
488 netdev_linux_run(void)
490 rtnetlink_link_run();
491 netdev_linux_miimon_run();
/* Counterpart of netdev_linux_run(): calls the wait functions of the
 * rtnetlink link machinery and of the MII link monitor. */
495 netdev_linux_wait(void)
497 rtnetlink_link_wait();
498 netdev_linux_miimon_wait();
/* Fills 'netdev_dev->drvinfo' with an ethtool request and marks it cached
 * with VALID_DRVINFO.  When VALID_DRVINFO is already set the first branch is
 * taken (presumably returning early; its body is not visible here). */
502 netdev_linux_get_drvinfo(struct netdev_dev_linux *netdev_dev)
507 if (netdev_dev->cache_valid & VALID_DRVINFO) {
511 COVERAGE_INC(netdev_get_ethtool);
/* Start from a zeroed structure before issuing the request. */
512 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
/* The ethtool helper is declared to take a 'struct ethtool_cmd *', hence the
 * cast of the drvinfo buffer. */
513 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
514 (struct ethtool_cmd *)&netdev_dev->drvinfo,
/* NOTE(review): presumably only reached when 'error' is 0 -- the guard is
 * not visible in this view. */
518 netdev_dev->cache_valid |= VALID_DRVINFO;
/* Records that 'dev' changed: stores the new interface flags 'ifi_flags',
 * counts a carrier reset when IFF_RUNNING differs from the stored flags, and
 * keeps only the cache bits selected by 'mask' (so a 'mask' of 0 invalidates
 * every cached item).
 *
 * NOTE(review): 'mask' is declared on a parameter line that is not visible
 * here, and 'change_seq' is presumably advanced just before the zero test
 * below so that it never stays at 0 -- confirm. */
524 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
525 unsigned int ifi_flags,
529 if (!dev->change_seq) {
/* XOR isolates the bits that differ between the old and the new flags. */
533 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
534 dev->carrier_resets++;
536 dev->ifi_flags = ifi_flags;
538 dev->cache_valid &= mask;
/* Applies the rtnetlink notification 'change' to the cached state of 'dev'.
 *
 * For RTM_NEWLINK the cache is first reduced to the driver info (the only
 * bit kept by the VALID_DRVINFO mask) and then the MTU, Ethernet address and
 * ifindex are taken from the message and marked valid with their error codes
 * cleared.  The Ethernet address is only taken when it is nonzero.
 *
 * NOTE(review): the MTU update may be conditional as well, and the final
 * netdev_dev_linux_changed(..., 0) call presumably belongs to the branch for
 * other message types; those lines are not visible in this view. */
542 netdev_dev_linux_update(struct netdev_dev_linux *dev,
543 const struct rtnetlink_link_change *change)
545 if (change->nlmsg_type == RTM_NEWLINK) {
547 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
549 /* Update netdev from rtnl-change msg. */
551 dev->mtu = change->mtu;
552 dev->cache_valid |= VALID_MTU;
553 dev->netdev_mtu_error = 0;
556 if (!eth_addr_is_zero(change->addr)) {
557 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
558 dev->cache_valid |= VALID_ETHERADDR;
559 dev->ether_addr_error = 0;
562 dev->ifindex = change->ifi_index;
563 dev->cache_valid |= VALID_IFINDEX;
564 dev->get_ifindex_error = 0;
567 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
/* rtnetlink link notifier callback (registered by cache_notifier_ref()).
 *
 * Two code paths are visible.  In the first, the device named by
 * 'change->ifname' is looked up and, if it belongs to a netdev-linux class,
 * updated from 'change'.  In the second, every device of
 * 'netdev_linux_class' has its flags re-read and its whole cache invalidated
 * (mask 0).
 *
 * NOTE(review): the condition selecting between the two paths is not visible
 * here; presumably the second path handles a notification without a specific
 * 'change' -- confirm.  The result of get_flags() is not checked there. */
572 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
573 void *aux OVS_UNUSED)
575 struct netdev_dev_linux *dev;
577 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
579 const struct netdev_class *netdev_class =
580 netdev_dev_get_class(base_dev);
/* Devices of other classes are not ours to cast or update. */
582 if (is_netdev_linux_class(netdev_class)) {
583 dev = netdev_dev_linux_cast(base_dev);
584 netdev_dev_linux_update(dev, change);
588 struct shash device_shash;
589 struct shash_node *node;
591 shash_init(&device_shash);
592 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
593 SHASH_FOR_EACH (node, &device_shash) {
598 get_flags(&dev->netdev_dev, &flags);
599 netdev_dev_linux_changed(dev, flags, 0);
601 shash_destroy(&device_shash);
/* Takes a reference on the shared rtnetlink link notifier.  The notifier is
 * created, with netdev_linux_cache_cb() as its callback, when the reference
 * count is zero; afterwards the count is incremented.
 *
 * NOTE(review): the handling of a failed notifier creation and the return
 * values are on lines not visible here; callers treat a nonzero return as an
 * error. */
606 cache_notifier_ref(void)
608 if (!cache_notifier_refcount) {
609 assert(!netdev_linux_cache_notifier);
611 netdev_linux_cache_notifier =
612 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
614 if (!netdev_linux_cache_notifier) {
618 cache_notifier_refcount++;
/* Drops one reference on the shared rtnetlink link notifier.  When the count
 * reaches zero the notifier is destroyed and the global pointer is reset to
 * NULL so that a later cache_notifier_ref() creates a fresh one. */
624 cache_notifier_unref(void)
626 assert(cache_notifier_refcount > 0);
627 if (!--cache_notifier_refcount) {
628 assert(netdev_linux_cache_notifier);
629 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
630 netdev_linux_cache_notifier = NULL;
634 /* Creates system and internal devices. */
/* Takes a reference on the shared link notifier, allocates a zeroed
 * 'struct netdev_dev_linux' for 'name' of the given 'class', and stores the
 * embedded generic device in '*netdev_devp'.
 *
 * NOTE(review): the check of 'error' after cache_notifier_ref() and the
 * return statements are not visible in this view. */
636 netdev_linux_create(const struct netdev_class *class, const char *name,
637 struct netdev_dev **netdev_devp)
639 struct netdev_dev_linux *netdev_dev;
642 error = cache_notifier_ref();
647 netdev_dev = xzalloc(sizeof *netdev_dev);
648 netdev_dev->change_seq = 1;
649 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
/* Seed the stored interface flags; the result of get_flags() is not
 * checked here. */
650 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
652 *netdev_devp = &netdev_dev->netdev_dev;
656 /* For most types of netdevs we open the device for each call of
657 * netdev_open(). However, this is not the case with tap devices,
658 * since it is only possible to open the device once. In this
659 * situation we share a single file descriptor, and consequently
660 * buffers, across all readers. Therefore once data is read it will
661 * be unavailable to other reads for tap devices. */
/* Creates the tap device 'name': opens "/dev/net/tun", issues TUNSETIFF with
 * IFF_TAP | IFF_NO_PI, makes the descriptor non-blocking and stores the new
 * device in '*netdev_devp'.  'class' is unused; the device is always
 * initialized with 'netdev_tap_class'.
 *
 * NOTE(review): the failure paths jump to 'error_unref_notifier', where only
 * cache_notifier_unref() is visible.  Confirm that 'state->fd' is closed and
 * 'netdev_dev' is released on those paths, and that 'errno' is captured
 * before the TUNSETIFF warning is logged; those lines are not in this
 * view. */
663 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
664 const char *name, struct netdev_dev **netdev_devp)
666 struct netdev_dev_linux *netdev_dev;
667 struct tap_state *state;
668 static const char tap_dev[] = "/dev/net/tun";
672 netdev_dev = xzalloc(sizeof *netdev_dev);
673 state = &netdev_dev->state.tap;
675 error = cache_notifier_ref();
680 /* Open tap device. */
681 state->fd = open(tap_dev, O_RDWR);
684 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
685 goto error_unref_notifier;
688 /* Create tap device. */
689 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
690 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
691 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
692 VLOG_WARN("%s: creating tap device failed: %s", name,
695 goto error_unref_notifier;
698 /* Make non-blocking. */
699 error = set_nonblocking(state->fd);
701 goto error_unref_notifier;
704 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
705 *netdev_devp = &netdev_dev->netdev_dev;
708 error_unref_notifier:
709 cache_notifier_unref();
/* Releases the tap-specific state of 'netdev_dev'.  Only acts when the tap
 * descriptor is valid (nonnegative).
 * NOTE(review): the action itself, presumably closing 'state->fd', is on a
 * line not visible here. */
716 destroy_tap(struct netdev_dev_linux *netdev_dev)
718 struct tap_state *state = &netdev_dev->state.tap;
720 if (state->fd >= 0) {
725 /* Destroys the netdev device 'netdev_dev_'. */
/* Order of teardown visible here: the traffic-control state (through the
 * implementation's optional 'tc_destroy' hook), then the tap state for the
 * two tap classes, and finally the reference on the shared link notifier
 * taken at creation time.
 * NOTE(review): freeing of 'netdev_dev' itself is not visible in this
 * view. */
727 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
729 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
730 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
/* 'tc_destroy' may be null for trivial implementations, so test it. */
732 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
733 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
736 if (class == &netdev_tap_class || class == &netdev_tap_pl_class) {
737 destroy_tap(netdev_dev);
741 cache_notifier_unref();
/* Opens a new 'struct netdev_linux' handle on the device 'netdev_dev_' and
 * stores its generic part in '*netdevp'.
 *
 * Except for internal devices, the device's existence is checked by reading
 * its flags.  The first handle opened on a tap-type device receives the
 * device's shared tap descriptor.  A failure path releases the handle with
 * netdev_uninit().
 *
 * NOTE(review): the initial value given to 'netdev->fd', the handling of
 * ENODEV and the return statements are on lines not visible here. */
745 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
747 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
748 struct netdev_linux *netdev;
749 enum netdev_flags flags;
752 /* Allocate network device. */
753 netdev = xzalloc(sizeof *netdev);
755 netdev_init(&netdev->netdev, netdev_dev_);
757 /* Verify that the device really exists, by attempting to read its flags.
758 * (The flags might be cached, in which case this won't actually do an
761 * Don't do this for "internal" netdevs, though, because those have to be
762 * created as netdev objects before they exist in the kernel, because
763 * creating them in the kernel happens by passing a netdev object to
764 * dpif_port_add(). */
765 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
766 error = netdev_get_flags(&netdev->netdev, &flags);
767 if (error == ENODEV) {
/* The prefix comparison matches every device type whose name starts with
 * "tap". */
772 if (!strncmp(netdev_dev_get_type(netdev_dev_), "tap", 3) &&
773 !netdev_dev->state.tap.opened) {
775 /* We assume that the first user of the tap device is the primary user
776 * and give them the tap FD. Subsequent users probably just expect
777 * this to be a system device so open it normally to avoid send/receive
778 * directions appearing to be reversed. */
779 netdev->fd = netdev_dev->state.tap.fd;
780 netdev_dev->state.tap.opened = true;
783 *netdevp = &netdev->netdev;
787 netdev_uninit(&netdev->netdev, true);
791 /* Closes and destroys 'netdev'. */
/* The handle's own descriptor is only released for non-tap types: a tap
 * handle borrows the descriptor owned by the device (see destroy_tap()).
 *
 * Descriptor 0 is a valid descriptor, and the rest of this file uses
 * "fd < 0" to mean "no descriptor", so the test is "fd >= 0" rather than
 * "fd > 0"; otherwise a socket that happened to be assigned descriptor 0
 * would never be released. */
793 netdev_linux_close(struct netdev *netdev_)
795 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
797 if (netdev->fd >= 0 && strncmp(netdev_get_type(netdev_), "tap", 3)) {
/* Prepares 'netdev_' for receiving packets: creates a raw PF_PACKET socket,
 * makes it non-blocking, looks up the device's ifindex and binds the socket
 * to that device for all protocols (ETH_P_ALL).
 *
 * If the handle already has a descriptor (fd >= 0) the first branch is taken
 * instead (presumably returning at once; its body is not visible here).
 *
 * NOTE(review): the error checks after each step, the cleanup of 'fd' on
 * failure and the final assignment to 'netdev->fd' are on lines not visible
 * in this view. */
804 netdev_linux_listen(struct netdev *netdev_)
806 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
807 struct sockaddr_ll sll;
812 if (netdev->fd >= 0) {
816 /* Create file descriptor. */
817 fd = socket(PF_PACKET, SOCK_RAW, 0);
820 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
824 /* Set non-blocking mode. */
825 error = set_nonblocking(fd);
830 /* Get ethernet device index. */
831 error = get_ifindex(&netdev->netdev, &ifindex);
836 /* Bind to specific ethernet device. */
837 memset(&sll, 0, sizeof sll);
838 sll.sll_family = AF_PACKET;
839 sll.sll_ifindex = ifindex;
840 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
841 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
843 VLOG_ERR("%s: failed to bind raw socket (%s)",
844 netdev_get_name(netdev_), strerror(error));
/* Receives one packet from 'netdev_' into the 'size'-byte buffer 'data'.
 *
 * Devices of the two tap classes are read with read(); every other device
 * uses recv() with MSG_TRUNC.  When the read succeeds, a length no larger
 * than 'size' is returned as is and anything larger becomes -EMSGSIZE.
 * EINTR is excluded from the error branch (presumably the read is retried;
 * the loop is not visible in this view).  Errors other than EINTR and EAGAIN
 * are logged, rate-limited. */
859 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
861 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
863 if (netdev->fd < 0) {
864 /* Device is not listening. */
871 retval = ((netdev_->netdev_dev->netdev_class == &netdev_tap_class ||
872 netdev_->netdev_dev->netdev_class == &netdev_tap_pl_class)
873 ? read(netdev->fd, data, size)
874 : recv(netdev->fd, data, size, MSG_TRUNC));
876 return retval <= size ? retval : -EMSGSIZE;
877 } else if (errno != EINTR) {
878 if (errno != EAGAIN) {
/* The format string names the device first and the error second, so the
 * arguments are passed in that order. */
879 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
880 netdev_get_name(netdev_), strerror(errno));
887 /* Registers with the poll loop to wake up from the next call to poll_block()
888 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
/* Nothing is registered when the handle has no descriptor (fd < 0). */
890 netdev_linux_recv_wait(struct netdev *netdev_)
892 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
893 if (netdev->fd >= 0) {
894 poll_fd_wait(netdev->fd, POLLIN);
898 /* Discards all packets waiting to be received from 'netdev'. */
/* Three cases are distinguished: no descriptor (fd < 0), a tap-type device,
 * and everything else.  For tap types the transmit queue length is queried
 * with SIOCGIFTXQLEN and passed to drain_fd() as the amount to drain; other
 * devices are drained with drain_rcvbuf().
 * NOTE(review): the return values of the first two cases and the check of
 * 'error' are on lines not visible here. */
900 netdev_linux_drain(struct netdev *netdev_)
902 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
903 if (netdev->fd < 0) {
905 } else if (!strncmp(netdev_get_type(netdev_), "tap", 3)) {
907 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
908 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
912 drain_fd(netdev->fd, ifr.ifr_qlen);
915 return drain_rcvbuf(netdev->fd);
919 /* Sends the 'size' bytes at 'data' on 'netdev_'. Returns 0 if successful,
920 * otherwise a positive errno value. Returns EAGAIN without blocking if the
921 * packet cannot be queued immediately. Returns EMSGSIZE if a partial packet
922 * was transmitted or if the packet is too big or too small to transmit.
924 * The caller retains ownership of 'data' in all cases.
926 * The kernel maintains a packet transmission queue, so the caller is not
927 * expected to do additional queuing of packets. */
929 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
931 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* A handle without its own descriptor sends through the shared AF_PACKET
 * socket, addressed by ifindex; otherwise the handle's descriptor is written
 * directly. */
935 if (netdev->fd < 0) {
936 /* Use our AF_PACKET socket to send to this device. */
937 struct sockaddr_ll sll;
944 sock = af_packet_sock();
949 error = get_ifindex(netdev_, &ifindex);
954 /* We don't bother setting most fields in sockaddr_ll because the
955 * kernel ignores them for SOCK_RAW. */
956 memset(&sll, 0, sizeof sll);
957 sll.sll_family = AF_PACKET;
958 sll.sll_ifindex = ifindex;
/* sendmsg() takes a non-const iovec base, so the constness of 'data' is cast
 * away here. */
960 iov.iov_base = CONST_CAST(void *, data);
964 msg.msg_namelen = sizeof sll;
967 msg.msg_control = NULL;
968 msg.msg_controllen = 0;
971 retval = sendmsg(sock, &msg, 0);
973 /* Use the netdev's own fd to send to this device. This is
974 * essential for tap devices, because packets sent to a tap device
975 * with an AF_PACKET socket will loop back to be *received* again
976 * on the tap device. */
977 retval = write(netdev->fd, data, size);
981 /* The Linux AF_PACKET implementation never blocks waiting for room
982 * for packets, instead returning ENOBUFS. Translate this into
983 * EAGAIN for the caller. */
984 if (errno == ENOBUFS) {
986 } else if (errno == EINTR) {
/* EAGAIN is an expected outcome and therefore not logged. */
988 } else if (errno != EAGAIN) {
989 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
990 netdev_get_name(netdev_), strerror(errno));
993 } else if (retval != size) {
994 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
995 "%zu) on %s", retval, size, netdev_get_name(netdev_));
1003 /* Registers with the poll loop to wake up from the next call to poll_block()
1004 * when the packet transmission queue has sufficient room to transmit a packet
1005 * with netdev_send().
1007 * The kernel maintains a packet transmission queue, so the client is not
1008 * expected to do additional queuing of packets. Thus, this function is
1009 * unlikely to ever be used. It is included for completeness. */
1011 netdev_linux_send_wait(struct netdev *netdev_)
1013 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1014 if (netdev->fd < 0) {
1015 /* Nothing to do. */
/* strncmp() is nonzero for types that do not start with "tap", so this branch
 * handles the non-tap devices. */
1016 } else if (strncmp(netdev_get_type(netdev_), "tap", 3)) {
1017 poll_fd_wait(netdev->fd, POLLOUT);
1019 /* TAP device always accepts packets.*/
1020 poll_immediate_wake();
1024 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1025 * otherwise a positive errno value. */
1027 netdev_linux_set_etheraddr(struct netdev *netdev_,
1028 const uint8_t mac[ETH_ADDR_LEN])
1030 struct netdev_dev_linux *netdev_dev =
1031 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* With a valid cache entry, a cached error is returned unchanged and an
 * address equal to the cached one takes a separate branch (presumably
 * returning success; its body is not visible here).  Otherwise the entry is
 * invalidated before the address is changed. */
1034 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1035 if (netdev_dev->ether_addr_error) {
1036 return netdev_dev->ether_addr_error;
1038 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
1041 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1044 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Only success and ENODEV are recorded in the cache; any other error leaves
 * the entry invalid. */
1045 if (!error || error == ENODEV) {
1046 netdev_dev->ether_addr_error = error;
1047 netdev_dev->cache_valid |= VALID_ETHERADDR;
/* NOTE(review): presumably guarded by "if (!error)" on a line not visible
 * here. */
1049 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1056 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
/* Returns the cached 'ether_addr_error'; 'mac' is written only when that
 * error is 0.  The address is fetched with get_etheraddr() the first time,
 * i.e. when VALID_ETHERADDR is not set, and both the address and the
 * resulting error code are cached. */
1058 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1059 uint8_t mac[ETH_ADDR_LEN])
1061 struct netdev_dev_linux *netdev_dev =
1062 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1064 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1065 int error = get_etheraddr(netdev_get_name(netdev_),
1066 netdev_dev->etheraddr);
1068 netdev_dev->ether_addr_error = error;
1069 netdev_dev->cache_valid |= VALID_ETHERADDR;
1072 if (!netdev_dev->ether_addr_error) {
1073 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1076 return netdev_dev->ether_addr_error;
1079 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1080 * in bytes, not including the hardware header; thus, this is typically 1500
1081 * bytes for Ethernet devices. */
/* The value is stored in '*mtup'; the function's return value is the cached
 * 'netdev_mtu_error', and '*mtup' is written only when that error is 0.  The
 * MTU is queried with SIOCGIFMTU when VALID_MTU is not set, and the result
 * and its error code are then cached. */
1083 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1085 struct netdev_dev_linux *netdev_dev =
1086 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1087 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1091 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1092 SIOCGIFMTU, "SIOCGIFMTU");
1094 netdev_dev->netdev_mtu_error = error;
1095 netdev_dev->mtu = ifr.ifr_mtu;
1096 netdev_dev->cache_valid |= VALID_MTU;
1099 if (!netdev_dev->netdev_mtu_error) {
1100 *mtup = netdev_dev->mtu;
1102 return netdev_dev->netdev_mtu_error;
/* Review notes for netdev_linux_set_mtu(), which follows:
 *
 *   - With a valid cache entry, a cached error is returned unchanged and a
 *     request equal to the cached MTU takes a separate branch (presumably
 *     returning success; its body is not visible in this view).  Otherwise
 *     the entry is invalidated before SIOCSIFMTU is issued.
 *
 *   - Only success and ENODEV are recorded in the cache afterwards; any
 *     other error leaves the entry invalid.
 *
 *   - NOTE(review): the assignment of 'mtu' to 'ifr.ifr_mtu' before the
 *     ioctl and the final return are on lines not visible here. */
1105 /* Sets the maximum size of transmitted (MTU) for given device using linux
1106 * networking ioctl interface.
1109 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1111 struct netdev_dev_linux *netdev_dev =
1112 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1116 if (netdev_dev->cache_valid & VALID_MTU) {
1117 if (netdev_dev->netdev_mtu_error) {
1118 return netdev_dev->netdev_mtu_error;
1120 if (netdev_dev->mtu == mtu) {
1123 netdev_dev->cache_valid &= ~VALID_MTU;
1126 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1127 SIOCSIFMTU, "SIOCSIFMTU");
1128 if (!error || error == ENODEV) {
1129 netdev_dev->netdev_mtu_error = error;
1130 netdev_dev->mtu = ifr.ifr_mtu;
1131 netdev_dev->cache_valid |= VALID_MTU;
1136 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1137 * On failure, returns a negative errno value. */
1139 netdev_linux_get_ifindex(const struct netdev *netdev)
/* get_ifindex() reports a positive errno value, which is negated here to
 * keep errors distinguishable from valid indexes. */
1143 error = get_ifindex(netdev, &ifindex);
1144 return error ? -error : ifindex;
/* Stores the carrier state of 'netdev_' in '*carrier'.  When MII monitoring
 * is enabled (positive 'miimon_interval') the result of the last MII poll is
 * used; otherwise the IFF_RUNNING bit of the stored interface flags is
 * used. */
1148 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1150 struct netdev_dev_linux *netdev_dev =
1151 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1153 if (netdev_dev->miimon_interval > 0) {
1154 *carrier = netdev_dev->miimon;
1156 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the 'carrier_resets' counter of the device behind 'netdev', which
 * netdev_dev_linux_changed() increments whenever IFF_RUNNING toggles. */
1162 static long long int
1163 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1165 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
/* Issues ioctl 'cmd' on device 'name' through netdev_linux_do_ioctl(), passing
 * 'cmd_name' along.  '*data' is both input and output: it is copied into the
 * 'ifr_data' area of a zeroed ifreq before the call and copied back out of it
 * afterwards. */
1169 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1170 struct mii_ioctl_data *data)
1175 memset(&ifr, 0, sizeof ifr);
1176 memcpy(&ifr.ifr_data, data, sizeof *data);
1177 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1178 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 *
 * Two mechanisms are visible here: reading the MII basic status register
 * (MII_BMSR, bit BMSR_LSTATUS) via SIOCGMIIPHY followed by SIOCGMIIREG, and an
 * ETHTOOL_GLINK query, which the debug message below describes as the fallback
 * used when the MII query fails. */
1184 netdev_linux_get_miimon(const char *name, bool *miimon)
1186 struct mii_ioctl_data data;
1191 memset(&data, 0, sizeof data);
1192 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1194 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1195 data.reg_num = MII_BMSR;
1196 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1200 *miimon = !!(data.val_out & BMSR_LSTATUS);
1202 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1205 struct ethtool_cmd ecmd;
1207 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1210 COVERAGE_INC(netdev_get_ethtool);
1211 memset(&ecmd, 0, sizeof ecmd);
1212 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1215 struct ethtool_value eval;
/* The ETHTOOL_GLINK reply is reinterpreted as a struct ethtool_value; a
 * nonzero 'data' field means the link is up. */
1217 memcpy(&eval, &ecmd, sizeof eval);
1218 *miimon = !!eval.data;
1220 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring interval of 'netdev_' to 'interval'.  Positive
 * values are raised to at least 100; non-positive values become 0.  When the
 * stored interval actually changes, the miimon timer is marked expired. */
1228 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1229 long long int interval)
1231 struct netdev_dev_linux *netdev_dev;
1233 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1235 interval = interval > 0 ? MAX(interval, 100) : 0;
1236 if (netdev_dev->miimon_interval != interval) {
1237 netdev_dev->miimon_interval = interval;
1238 timer_set_expired(&netdev_dev->miimon_timer);
/* Walks all devices of netdev_linux_class.  Each device is first tested for
 * disabled MII monitoring or an unexpired miimon timer; the visible remainder
 * of the loop polls the link status with netdev_linux_get_miimon(), calls
 * netdev_dev_linux_changed() when the result differs from the stored 'miimon'
 * value, and re-arms the timer for 'miimon_interval'. */
1245 netdev_linux_miimon_run(void)
1247 struct shash device_shash;
1248 struct shash_node *node;
1250 shash_init(&device_shash);
1251 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1252 SHASH_FOR_EACH (node, &device_shash) {
1253 struct netdev_dev_linux *dev = node->data;
1256 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1260 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1261 if (miimon != dev->miimon) {
1262 dev->miimon = miimon;
1263 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1266 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1269 shash_destroy(&device_shash);
/* Calls timer_wait() on the miimon timer of every netdev_linux_class device
 * that has MII monitoring enabled ('miimon_interval' > 0). */
1273 netdev_linux_miimon_wait(void)
1275 struct shash device_shash;
1276 struct shash_node *node;
1278 shash_init(&device_shash);
1279 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1280 SHASH_FOR_EACH (node, &device_shash) {
1281 struct netdev_dev_linux *dev = node->data;
1283 if (dev->miimon_interval > 0) {
1284 timer_wait(&dev->miimon_timer);
1287 shash_destroy(&device_shash);
1290 /* Check whether we can use RTM_GETLINK to get network device statistics.
1291 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1294 check_for_working_netlink_stats(void)
1296 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1297 * preferable, so if that works, we'll use it. */
1298 int ifindex = do_get_ifindex("lo");
1300 VLOG_WARN("failed to get ifindex for lo, "
1301 "obtaining netdev stats from proc");
1304 struct netdev_stats stats;
/* Probe by issuing a real netlink stats request against "lo"; the log
 * messages below record which implementation ends up being used. */
1305 int error = get_stats_via_netlink(ifindex, &stats);
1307 VLOG_DBG("obtaining netdev stats via rtnetlink");
1310 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1311 "via proc (you are probably running a pre-2.6.19 "
1312 "kernel)", strerror(error));
/* Takes pointers to two uint64_t values.  Callers in this file pass pairs of
 * rx/tx counters.  NOTE(review): the body is not visible in this listing;
 * presumably it exchanges '*a' and '*b' -- confirm. */
1319 swap_uint64(uint64_t *a, uint64_t *b)
/* Fetches 'stats' for 'netdev_' from the vport layer with
 * netdev_vport_get_stats().  The call is made unless a nonzero
 * 'vport_stats_error' is already cached (VALID_VPORT_STAT_ERROR set); the
 * resulting error code is stored in the device and the cache flag is set. */
1327 get_stats_via_vport(const struct netdev *netdev_,
1328 struct netdev_stats *stats)
1330 struct netdev_dev_linux *netdev_dev =
1331 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1333 if (!netdev_dev->vport_stats_error ||
1334 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1337 error = netdev_vport_get_stats(netdev_, stats);
1339 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1340 "(%s)", netdev_get_name(netdev_), strerror(error));
1342 netdev_dev->vport_stats_error = error;
1343 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Fills 'stats' for 'netdev_' from the kernel.  Whether rtnetlink can be used
 * is determined once by check_for_working_netlink_stats() and remembered in a
 * function-local static; the netlink path looks up the ifindex and calls
 * get_stats_via_netlink(), the other visible path calls get_stats_via_proc().
 * The warning near the end logs the numeric error code. */
1348 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1349 struct netdev_stats *stats)
1351 static int use_netlink_stats = -1;
1354 if (use_netlink_stats < 0) {
1355 use_netlink_stats = check_for_working_netlink_stats();
1358 if (use_netlink_stats) {
1361 error = get_ifindex(netdev_, &ifindex);
1363 error = get_stats_via_netlink(ifindex, stats);
1366 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1370 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1371 netdev_get_name(netdev_), error);
1377 /* Retrieves current device stats for 'netdev-linux'.
 *
 * Statistics are requested both from the vport layer (into 'stats') and from
 * the kernel (into 'dev_stats').  The visible tail of the function adds the
 * kernel's error, drop, multicast and collision counters onto 'stats'.
 * NOTE(review): the branch structure around the two 'vport_stats_error' tests
 * is only partly visible in this listing. */
1379 netdev_linux_get_stats(const struct netdev *netdev_,
1380 struct netdev_stats *stats)
1382 struct netdev_dev_linux *netdev_dev =
1383 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1384 struct netdev_stats dev_stats;
1387 get_stats_via_vport(netdev_, stats);
1389 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1392 if (netdev_dev->vport_stats_error) {
1399 if (netdev_dev->vport_stats_error) {
1400 /* stats not available from OVS then use ioctl stats. */
1403 stats->rx_errors += dev_stats.rx_errors;
1404 stats->tx_errors += dev_stats.tx_errors;
1405 stats->rx_dropped += dev_stats.rx_dropped;
1406 stats->tx_dropped += dev_stats.tx_dropped;
1407 stats->multicast += dev_stats.multicast;
1408 stats->collisions += dev_stats.collisions;
1409 stats->rx_length_errors += dev_stats.rx_length_errors;
1410 stats->rx_over_errors += dev_stats.rx_over_errors;
1411 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1412 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1413 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1414 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1415 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1416 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1417 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1418 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1419 stats->tx_window_errors += dev_stats.tx_window_errors;
1424 /* Retrieves current device stats for 'netdev-tap' netdev or
1425 * netdev-internal.
 *
 * vport-layer statistics are collected in 'stats' and kernel statistics in
 * 'dev_stats'.  Where kernel drop and error counters are added onto 'stats'
 * at the end of the function, the kernel's tx counters feed the rx fields and
 * vice versa; multicast and collision counters are added unswapped. */
1427 netdev_tap_get_stats(const struct netdev *netdev_,
1428 struct netdev_stats *stats)
1430 struct netdev_dev_linux *netdev_dev =
1431 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1432 struct netdev_stats dev_stats;
1435 get_stats_via_vport(netdev_, stats);
1437 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1439 if (netdev_dev->vport_stats_error) {
1446 /* If this port is an internal port then the transmit and receive stats
1447 * will appear to be swapped relative to the other ports since we are the
1448 * one sending the data, not a remote computer. For consistency, we swap
1449 * them back here. This does not apply if we are getting stats from the
1450 * vport layer because it always tracks stats from the perspective of the
1452 if (netdev_dev->vport_stats_error) {
1454 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1455 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1456 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1457 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1458 stats->rx_length_errors = 0;
1459 stats->rx_over_errors = 0;
1460 stats->rx_crc_errors = 0;
1461 stats->rx_frame_errors = 0;
1462 stats->rx_fifo_errors = 0;
1463 stats->rx_missed_errors = 0;
1464 stats->tx_aborted_errors = 0;
1465 stats->tx_carrier_errors = 0;
1466 stats->tx_fifo_errors = 0;
1467 stats->tx_heartbeat_errors = 0;
1468 stats->tx_window_errors = 0;
1470 stats->rx_dropped += dev_stats.tx_dropped;
1471 stats->tx_dropped += dev_stats.rx_dropped;
1473 stats->rx_errors += dev_stats.tx_errors;
1474 stats->tx_errors += dev_stats.rx_errors;
1476 stats->multicast += dev_stats.multicast;
1477 stats->collisions += dev_stats.collisions;
/* Retrieves 'stats' for 'netdev_' from the vport layer only and returns the
 * 'vport_stats_error' recorded in the device by get_stats_via_vport(). */
1483 netdev_internal_get_stats(const struct netdev *netdev_,
1484 struct netdev_stats *stats)
1486 struct netdev_dev_linux *netdev_dev =
1487 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1489 get_stats_via_vport(netdev_, stats);
1490 return netdev_dev->vport_stats_error;
/* Refreshes the cached feature bitmaps ('supported', 'advertised', 'current'
 * and 'peer') of 'netdev_dev' from an ETHTOOL_GSET query, translating ethtool
 * SUPPORTED_* and ADVERTISED_* bits and the speed, duplex and port values into
 * NETDEV_F_* bits.
 *
 * The query is preceded by a test of VALID_FEATURES in 'cache_valid'.  At the
 * end VALID_FEATURES is set and the error code is stored in
 * 'get_features_error', which netdev_linux_get_features() reports to its
 * caller. */
1494 netdev_linux_read_features(struct netdev_dev_linux *netdev_dev)
1496 struct ethtool_cmd ecmd;
1500 if (netdev_dev->cache_valid & VALID_FEATURES) {
1504 COVERAGE_INC(netdev_get_ethtool);
1505 memset(&ecmd, 0, sizeof ecmd);
1506 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name, &ecmd,
1507 ETHTOOL_GSET, "ETHTOOL_GSET");
1512 /* Supported features. */
1513 netdev_dev->supported = 0;
1514 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1515 netdev_dev->supported |= NETDEV_F_10MB_HD;
1517 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1518 netdev_dev->supported |= NETDEV_F_10MB_FD;
1520 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1521 netdev_dev->supported |= NETDEV_F_100MB_HD;
1523 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1524 netdev_dev->supported |= NETDEV_F_100MB_FD;
1526 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1527 netdev_dev->supported |= NETDEV_F_1GB_HD;
1529 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1530 netdev_dev->supported |= NETDEV_F_1GB_FD;
1532 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1533 netdev_dev->supported |= NETDEV_F_10GB_FD;
1535 if (ecmd.supported & SUPPORTED_TP) {
1536 netdev_dev->supported |= NETDEV_F_COPPER;
1538 if (ecmd.supported & SUPPORTED_FIBRE) {
1539 netdev_dev->supported |= NETDEV_F_FIBER;
1541 if (ecmd.supported & SUPPORTED_Autoneg) {
1542 netdev_dev->supported |= NETDEV_F_AUTONEG;
1544 if (ecmd.supported & SUPPORTED_Pause) {
1545 netdev_dev->supported |= NETDEV_F_PAUSE;
1547 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1548 netdev_dev->supported |= NETDEV_F_PAUSE_ASYM;
1551 /* Advertised features. */
1552 netdev_dev->advertised = 0;
1553 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1554 netdev_dev->advertised |= NETDEV_F_10MB_HD;
1556 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1557 netdev_dev->advertised |= NETDEV_F_10MB_FD;
1559 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1560 netdev_dev->advertised |= NETDEV_F_100MB_HD;
1562 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1563 netdev_dev->advertised |= NETDEV_F_100MB_FD;
1565 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1566 netdev_dev->advertised |= NETDEV_F_1GB_HD;
1568 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1569 netdev_dev->advertised |= NETDEV_F_1GB_FD;
1571 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1572 netdev_dev->advertised |= NETDEV_F_10GB_FD;
1574 if (ecmd.advertising & ADVERTISED_TP) {
1575 netdev_dev->advertised |= NETDEV_F_COPPER;
1577 if (ecmd.advertising & ADVERTISED_FIBRE) {
1578 netdev_dev->advertised |= NETDEV_F_FIBER;
1580 if (ecmd.advertising & ADVERTISED_Autoneg) {
1581 netdev_dev->advertised |= NETDEV_F_AUTONEG;
1583 if (ecmd.advertising & ADVERTISED_Pause) {
1584 netdev_dev->advertised |= NETDEV_F_PAUSE;
1586 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1587 netdev_dev->advertised |= NETDEV_F_PAUSE_ASYM;
1590 /* Current settings. */
/* Speeds of 10G and above are always reported as full duplex; the 40G, 100G
 * and 1T speeds are matched by literal value rather than a SPEED_* macro. */
1592 if (speed == SPEED_10) {
1593 netdev_dev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1594 } else if (speed == SPEED_100) {
1595 netdev_dev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1596 } else if (speed == SPEED_1000) {
1597 netdev_dev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1598 } else if (speed == SPEED_10000) {
1599 netdev_dev->current = NETDEV_F_10GB_FD;
1600 } else if (speed == 40000) {
1601 netdev_dev->current = NETDEV_F_40GB_FD;
1602 } else if (speed == 100000) {
1603 netdev_dev->current = NETDEV_F_100GB_FD;
1604 } else if (speed == 1000000) {
1605 netdev_dev->current = NETDEV_F_1TB_FD;
1607 netdev_dev->current = 0;
1610 if (ecmd.port == PORT_TP) {
1611 netdev_dev->current |= NETDEV_F_COPPER;
1612 } else if (ecmd.port == PORT_FIBRE) {
1613 netdev_dev->current |= NETDEV_F_FIBER;
1617 netdev_dev->current |= NETDEV_F_AUTONEG;
1620 /* Peer advertisements. */
1621 netdev_dev->peer = 0; /* XXX */
1624 netdev_dev->cache_valid |= VALID_FEATURES;
1625 netdev_dev->get_features_error = error;
1628 /* Stores the features supported by 'netdev' into each of '*current',
1629 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1630 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1633 netdev_linux_get_features(const struct netdev *netdev_,
1634 enum netdev_features *current,
1635 enum netdev_features *advertised,
1636 enum netdev_features *supported,
1637 enum netdev_features *peer)
1639 struct netdev_dev_linux *netdev_dev =
1640 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Refresh (or reuse) the cached feature bitmaps and their error code. */
1642 netdev_linux_read_features(netdev_dev);
/* The outputs are written only when the cached query succeeded.
 * NOTE(review): all four pointers are dereferenced here without a NULL check;
 * confirm that callers never pass NULL. */
1644 if (!netdev_dev->get_features_error) {
1645 *current = netdev_dev->current;
1646 *advertised = netdev_dev->advertised;
1647 *supported = netdev_dev->supported;
1648 *peer = netdev_dev->peer;
1650 return netdev_dev->get_features_error;
1653 /* Set the features advertised by 'netdev' to 'advertise'.
 *
 * The current settings are read with ETHTOOL_GSET, the 'advertising' mask is
 * rebuilt from scratch out of the NETDEV_F_* bits in 'advertise', and the
 * result is written back with ETHTOOL_SSET, whose result is returned. */
1655 netdev_linux_set_advertisements(struct netdev *netdev,
1656 enum netdev_features advertise)
1658 struct ethtool_cmd ecmd;
1661 COVERAGE_INC(netdev_get_ethtool);
1662 memset(&ecmd, 0, sizeof ecmd);
1663 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1664 ETHTOOL_GSET, "ETHTOOL_GSET");
1669 ecmd.advertising = 0;
1670 if (advertise & NETDEV_F_10MB_HD) {
1671 ecmd.advertising |= ADVERTISED_10baseT_Half;
1673 if (advertise & NETDEV_F_10MB_FD) {
1674 ecmd.advertising |= ADVERTISED_10baseT_Full;
1676 if (advertise & NETDEV_F_100MB_HD) {
1677 ecmd.advertising |= ADVERTISED_100baseT_Half;
1679 if (advertise & NETDEV_F_100MB_FD) {
1680 ecmd.advertising |= ADVERTISED_100baseT_Full;
1682 if (advertise & NETDEV_F_1GB_HD) {
1683 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1685 if (advertise & NETDEV_F_1GB_FD) {
1686 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1688 if (advertise & NETDEV_F_10GB_FD) {
1689 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1691 if (advertise & NETDEV_F_COPPER) {
1692 ecmd.advertising |= ADVERTISED_TP;
1694 if (advertise & NETDEV_F_FIBER) {
1695 ecmd.advertising |= ADVERTISED_FIBRE;
1697 if (advertise & NETDEV_F_AUTONEG) {
1698 ecmd.advertising |= ADVERTISED_Autoneg;
1700 if (advertise & NETDEV_F_PAUSE) {
1701 ecmd.advertising |= ADVERTISED_Pause;
1703 if (advertise & NETDEV_F_PAUSE_ASYM) {
1704 ecmd.advertising |= ADVERTISED_Asym_Pause;
1706 COVERAGE_INC(netdev_set_ethtool);
1707 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1708 ETHTOOL_SSET, "ETHTOOL_SSET");
1711 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1712 * successful, otherwise a positive errno value. */
1714 netdev_linux_set_policing(struct netdev *netdev,
1715 uint32_t kbits_rate, uint32_t kbits_burst)
1717 struct netdev_dev_linux *netdev_dev =
1718 netdev_dev_linux_cast(netdev_get_dev(netdev));
1719 const char *netdev_name = netdev_get_name(netdev);
1723 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1724 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1725 : kbits_burst); /* Stick with user-specified value. */
/* Consult the cached result of the previous request: a cached error is
 * returned as-is, the cached rate and burst are compared with the (normalized)
 * request, and the cache entry is invalidated before reprogramming. */
1727 if (netdev_dev->cache_valid & VALID_POLICING) {
1728 if (netdev_dev->netdev_policing_error) {
1729 return netdev_dev->netdev_policing_error;
1732 if (netdev_dev->kbits_rate == kbits_rate &&
1733 netdev_dev->kbits_burst == kbits_burst) {
1734 /* Assume that settings haven't changed since we last set them. */
1737 netdev_dev->cache_valid &= ~VALID_POLICING;
1740 COVERAGE_INC(netdev_set_policing);
1741 /* Remove any existing ingress qdisc. */
1742 error = tc_add_del_ingress_qdisc(netdev, false);
1744 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1745 netdev_name, strerror(error));
1750 error = tc_add_del_ingress_qdisc(netdev, true);
1752 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1753 netdev_name, strerror(error));
1757 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1759 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1760 netdev_name, strerror(error));
1765 netdev_dev->kbits_rate = kbits_rate;
1766 netdev_dev->kbits_burst = kbits_burst;
/* Only success or ENODEV is cached; other errors leave the policing cache
 * invalid. */
1769 if (!error || error == ENODEV) {
1770 netdev_dev->netdev_policing_error = error;
1771 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the NULL-terminated 'tcs'
 * array that has a 'tc_install' function and a non-empty 'ovs_name'. */
1777 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1780 const struct tc_ops **opsp;
1782 for (opsp = tcs; *opsp != NULL; opsp++) {
1783 const struct tc_ops *ops = *opsp;
1784 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1785 sset_add(types, ops->ovs_name);
/* Creates a tap device for 'name': allocates the netdev_dev, takes a cache
 * notifier reference, opens the tap with tun_alloc(IFF_TAP, ...), makes the
 * descriptor non-blocking, initializes the netdev_dev as netdev_tap_pl_class
 * and stores it in '*netdev_devp'.  A warning is logged when the name of the
 * created device differs from 'name'.
 *
 * NOTE(review): a set_nonblocking() failure jumps to error_unref_notifier,
 * and no close of 'state->fd' is visible on that path -- confirm that the
 * descriptor is not leaked. */
1792 netdev_linux_create_tap_pl(const struct netdev_class *class OVS_UNUSED,
1793 const char *name, struct netdev_dev **netdev_devp)
1795 struct netdev_dev_linux *netdev_dev;
1796 struct tap_state *state;
1797 char real_name[IFNAMSIZ];
1800 netdev_dev = xzalloc(sizeof *netdev_dev);
1801 state = &netdev_dev->state.tap;
1803 error = cache_notifier_ref();
1808 /* Open tap device. */
1809 state->fd = tun_alloc(IFF_TAP, real_name);
1810 if (state->fd < 0) {
1812 VLOG_WARN("tun_alloc(IFF_TAP, %s) failed: %s", name, strerror(error));
1813 goto error_unref_notifier;
1815 if (strcmp(name, real_name)) {
1816 VLOG_WARN("tap_pl: requested %s, created %s", name, real_name);
1819 /* Make non-blocking. */
1820 error = set_nonblocking(state->fd);
1822 goto error_unref_notifier;
1825 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_pl_class);
1826 *netdev_devp = &netdev_dev->netdev_dev;
1829 error_unref_notifier:
1830 cache_notifier_unref();
/* Searches the NULL-terminated 'tcs' array for the tc_ops whose 'ovs_name'
 * equals 'name'. */
1836 static const struct tc_ops *
1837 tc_lookup_ovs_name(const char *name)
1839 const struct tc_ops **opsp;
1841 for (opsp = tcs; *opsp != NULL; opsp++) {
1842 const struct tc_ops *ops = *opsp;
1843 if (!strcmp(name, ops->ovs_name)) {
/* Searches the NULL-terminated 'tcs' array for the tc_ops whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' never match. */
1850 static const struct tc_ops *
1851 tc_lookup_linux_name(const char *name)
1853 const struct tc_ops **opsp;
1855 for (opsp = tcs; *opsp != NULL; opsp++) {
1856 const struct tc_ops *ops = *opsp;
1857 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the bucket for 'hash' in the queue map of 'netdev''s traffic
 * control state for a queue whose id is 'queue_id'. */
1864 static struct tc_queue *
1865 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1868 struct netdev_dev_linux *netdev_dev =
1869 netdev_dev_linux_cast(netdev_get_dev(netdev));
1870 struct tc_queue *queue;
1872 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1873 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the hash from
 * 'queue_id' with hash_int(queue_id, 0). */
1880 static struct tc_queue *
1881 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1883 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the tc_ops registered under OVS name 'type' and reports its
 * 'n_queues' in 'caps->n_queues'. */
1887 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1889 struct netdev_qos_capabilities *caps)
1891 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1895 caps->n_queues = ops->n_queues;
/* Refreshes the qdisc state of 'netdev' with tc_query_qdisc(), stores the OVS
 * name of the current tc_ops in '*typep' and, when those tc_ops provide
 * 'qdisc_get', lets it fill in 'details'. */
1900 netdev_linux_get_qos(const struct netdev *netdev,
1901 const char **typep, struct smap *details)
1903 struct netdev_dev_linux *netdev_dev =
1904 netdev_dev_linux_cast(netdev_get_dev(netdev));
1907 error = tc_query_qdisc(netdev);
1912 *typep = netdev_dev->tc->ops->ovs_name;
1913 return (netdev_dev->tc->ops->qdisc_get
1914 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS 'type' with 'details' on 'netdev'.
 *
 * 'type' is looked up by OVS name and tested for a 'tc_install' function.  If
 * the resulting tc_ops are the ones already in use, only their 'qdisc_set'
 * (if any) is invoked.  The remaining visible code deletes the existing qdisc
 * and installs the new one; the asserts check that the device's 'tc' is NULL
 * after deletion and non-NULL exactly when installation succeeded. */
1919 netdev_linux_set_qos(struct netdev *netdev,
1920 const char *type, const struct smap *details)
1922 struct netdev_dev_linux *netdev_dev =
1923 netdev_dev_linux_cast(netdev_get_dev(netdev));
1924 const struct tc_ops *new_ops;
1927 new_ops = tc_lookup_ovs_name(type);
1928 if (!new_ops || !new_ops->tc_install) {
1932 error = tc_query_qdisc(netdev);
1937 if (new_ops == netdev_dev->tc->ops) {
1938 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1940 /* Delete existing qdisc. */
1941 error = tc_del_qdisc(netdev);
1945 assert(netdev_dev->tc == NULL);
1947 /* Install new qdisc. */
1948 error = new_ops->tc_install(netdev, details);
1949 assert((error == 0) == (netdev_dev->tc != NULL));
/* Fetches the configuration of queue 'queue_id' on 'netdev' into 'details'
 * through the current tc_ops' 'class_get', after refreshing the qdisc state
 * with tc_query_qdisc() and locating the queue with tc_find_queue(). */
1956 netdev_linux_get_queue(const struct netdev *netdev,
1957 unsigned int queue_id, struct smap *details)
1959 struct netdev_dev_linux *netdev_dev =
1960 netdev_dev_linux_cast(netdev_get_dev(netdev));
1963 error = tc_query_qdisc(netdev);
1967 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1969 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Applies 'details' to queue 'queue_id' on 'netdev' through the current
 * tc_ops' 'class_set'.  Queue ids at or beyond the tc_ops' 'n_queues', and
 * tc_ops without a 'class_set' function, are diverted to a separate branch
 * ahead of that call. */
1975 netdev_linux_set_queue(struct netdev *netdev,
1976 unsigned int queue_id, const struct smap *details)
1978 struct netdev_dev_linux *netdev_dev =
1979 netdev_dev_linux_cast(netdev_get_dev(netdev));
1982 error = tc_query_qdisc(netdev);
1985 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1986 || !netdev_dev->tc->ops->class_set) {
1990 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev' through the current tc_ops'
 * 'class_delete', after refreshing the qdisc state with tc_query_qdisc() and
 * locating the queue with tc_find_queue().  tc_ops without 'class_delete' are
 * diverted to a separate branch. */
1994 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1996 struct netdev_dev_linux *netdev_dev =
1997 netdev_dev_linux_cast(netdev_get_dev(netdev));
2000 error = tc_query_qdisc(netdev);
2003 } else if (!netdev_dev->tc->ops->class_delete) {
2006 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2008 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics of queue 'queue_id' on 'netdev' into 'stats' through
 * the current tc_ops' 'class_get_stats', after refreshing the qdisc state with
 * tc_query_qdisc() and locating the queue with tc_find_queue().  tc_ops
 * without 'class_get_stats' are diverted to a separate branch. */
2014 netdev_linux_get_queue_stats(const struct netdev *netdev,
2015 unsigned int queue_id,
2016 struct netdev_queue_stats *stats)
2018 struct netdev_dev_linux *netdev_dev =
2019 netdev_dev_linux_cast(netdev_get_dev(netdev));
2022 error = tc_query_qdisc(netdev);
2025 } else if (!netdev_dev->tc->ops->class_get_stats) {
2028 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2030 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Builds an RTM_GETTCLASS request for 'netdev' with 'tcm_parent' set to 0 and
 * starts a netlink dump for it on 'rtnl_sock', initializing 'dump'.  The
 * request buffer is released before returning. */
2036 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2038 struct ofpbuf request;
2039 struct tcmsg *tcmsg;
2041 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2045 tcmsg->tcm_parent = 0;
2046 nl_dump_start(dump, rtnl_sock, &request);
2047 ofpbuf_uninit(&request);
/* Iterates over the queues of 'netdev''s current traffic control state,
 * obtaining each queue's details with the tc_ops' 'class_get' and passing the
 * queue id, the details and 'aux' to 'cb'.  The 'details' map is cleared
 * before each queue and destroyed at the end.  The _SAFE iteration form is
 * used, which keeps a 'next_queue' cursor alongside the current queue. */
2052 netdev_linux_dump_queues(const struct netdev *netdev,
2053 netdev_dump_queues_cb *cb, void *aux)
2055 struct netdev_dev_linux *netdev_dev =
2056 netdev_dev_linux_cast(netdev_get_dev(netdev));
2057 struct tc_queue *queue, *next_queue;
2058 struct smap details;
2062 error = tc_query_qdisc(netdev);
2065 } else if (!netdev_dev->tc->ops->class_get) {
2070 smap_init(&details);
2071 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2072 &netdev_dev->tc->queues) {
2073 smap_clear(&details);
2075 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
2077 (*cb)(queue->queue_id, &details, aux);
2082 smap_destroy(&details);
/* Dumps the traffic classes of 'netdev' over netlink and hands every reply to
 * the tc_ops' 'class_dump_stats' together with 'cb' and 'aux'.  Returns the
 * error from nl_dump_done() if there is one, otherwise 'last_error'. */
2088 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2089 netdev_dump_queue_stats_cb *cb, void *aux)
2091 struct netdev_dev_linux *netdev_dev =
2092 netdev_dev_linux_cast(netdev_get_dev(netdev));
2093 struct nl_dump dump;
2098 error = tc_query_qdisc(netdev);
2101 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2106 if (!start_queue_dump(netdev, &dump)) {
2109 while (nl_dump_next(&dump, &msg)) {
2110 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2116 error = nl_dump_done(&dump);
2117 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.  Both are obtained with SIOCGIFADDR and SIOCGIFNETMASK and kept
 * in the device under the VALID_IN4 cache flag.  The final return statement
 * yields EADDRNOTAVAIL when the address is INADDR_ANY and 0 otherwise. */
2121 netdev_linux_get_in4(const struct netdev *netdev_,
2122 struct in_addr *address, struct in_addr *netmask)
2124 struct netdev_dev_linux *netdev_dev =
2125 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2127 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2130 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2131 SIOCGIFADDR, "SIOCGIFADDR");
2136 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2137 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2142 netdev_dev->cache_valid |= VALID_IN4;
2144 *address = netdev_dev->address;
2145 *netmask = netdev_dev->netmask;
2146 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns 'address' to 'netdev_' with SIOCSIFADDR, records 'address' and
 * 'netmask' in the device cache (VALID_IN4), and sets the netmask with
 * SIOCSIFNETMASK when 'address' is not INADDR_ANY.
 *
 * NOTE(review): the cache is marked valid and updated with 'netmask' before
 * the SIOCSIFNETMASK request is issued, so a failure of that request would
 * leave a netmask in the cache that was not applied -- confirm whether this
 * matters to callers. */
2150 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2151 struct in_addr netmask)
2153 struct netdev_dev_linux *netdev_dev =
2154 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2157 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2159 netdev_dev->cache_valid |= VALID_IN4;
2160 netdev_dev->address = address;
2161 netdev_dev->netmask = netmask;
2162 if (address.s_addr != INADDR_ANY) {
2163 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2164 "SIOCSIFNETMASK", netmask);
/* Parses 'line' with a scanf format consisting of 16 two-digit hex bytes
 * (stored into the 16 bytes of 'in6->s6_addr'), four hex fields that are
 * skipped, and an interface name of at most 16 characters (presumably stored
 * into 'ifname'; the final argument is not visible in this listing). */
2171 parse_if_inet6_line(const char *line,
2172 struct in6_addr *in6, char ifname[16 + 1])
2174 uint8_t *s6 = in6->s6_addr;
2175 #define X8 "%2"SCNx8
2177 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2178 "%*x %*x %*x %*x %16s\n",
2179 &s6[0], &s6[1], &s6[2], &s6[3],
2180 &s6[4], &s6[5], &s6[6], &s6[7],
2181 &s6[8], &s6[9], &s6[10], &s6[11],
2182 &s6[12], &s6[13], &s6[14], &s6[15],
2186 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2187 * 'in6' is non-null) and returns true. Otherwise, returns false.
 *
 * The address is looked up by scanning /proc/net/if_inet6 for a line whose
 * interface name matches the device name, and is cached under VALID_IN6; the
 * cached value starts out as in6addr_any.
 *
 * NOTE(review): '*in6' is assigned below without a visible NULL check, and
 * the return statement is not visible in this listing -- confirm both against
 * the contract stated above. */
2189 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2191 struct netdev_dev_linux *netdev_dev =
2192 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2193 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2197 netdev_dev->in6 = in6addr_any;
2199 file = fopen("/proc/net/if_inet6", "r");
2201 const char *name = netdev_get_name(netdev_);
2202 while (fgets(line, sizeof line, file)) {
2203 struct in6_addr in6_tmp;
2204 char ifname[16 + 1];
2205 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2206 && !strcmp(name, ifname))
2208 netdev_dev->in6 = in6_tmp;
2214 netdev_dev->cache_valid |= VALID_IN6;
2216 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr'.  A zeroed
 * sockaddr_in is built first; '*sa' is then zeroed and overwritten with it. */
2221 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2223 struct sockaddr_in sin;
2224 memset(&sin, 0, sizeof sin);
2225 sin.sin_family = AF_INET;
2226 sin.sin_addr = addr;
2229 memset(sa, 0, sizeof *sa);
2230 memcpy(sa, &sin, sizeof sin);
/* Issues ioctl 'ioctl_nr' for 'netdev' with an ifreq that carries the device
 * name and 'addr' as an AF_INET socket address, and returns the result of
 * netdev_linux_do_ioctl(). */
2234 do_set_addr(struct netdev *netdev,
2235 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2238 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2239 make_in4_sockaddr(&ifr.ifr_addr, addr);
2241 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2245 /* Adds 'router' as a default IP gateway.
 *
 * The route is installed with SIOCADDRT on 'af_inet_sock', using INADDR_ANY
 * as both destination and netmask and the flags RTF_UP | RTF_GATEWAY.  The
 * 'netdev' argument is unused. */
2247 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2249 struct in_addr any = { INADDR_ANY };
2253 memset(&rt, 0, sizeof rt);
2254 make_in4_sockaddr(&rt.rt_dst, any);
2255 make_in4_sockaddr(&rt.rt_gateway, router);
2256 make_in4_sockaddr(&rt.rt_genmask, any);
2257 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2258 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2260 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Scans /proc/net/route for a route that is up and whose masked destination
 * matches '*host'.  For a matching route, '*next_hop' receives either 0 (host
 * directly reachable) or the route's gateway, and '*netdev_name' receives an
 * xstrdup() copy of the interface name.  '*netdev_name' is set to NULL before
 * the file is opened. */
2266 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2269 static const char fn[] = "/proc/net/route";
2274 *netdev_name = NULL;
2275 stream = fopen(fn, "r");
2276 if (stream == NULL) {
2277 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2282 while (fgets(line, sizeof line, stream)) {
2285 ovs_be32 dest, gateway, mask;
2286 int refcnt, metric, mtu;
2287 unsigned int flags, use, window, irtt;
2290 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2292 iface, &dest, &gateway, &flags, &refcnt,
2293 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2295 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2299 if (!(flags & RTF_UP)) {
2300 /* Skip routes that aren't up. */
2304 /* The output of 'dest', 'mask', and 'gateway' were given in
2305 * network byte order, so we don't need any endian
2306 * conversions here. */
2307 if ((dest & mask) == (host->s_addr & mask)) {
2309 /* The host is directly reachable. */
2310 next_hop->s_addr = 0;
2312 /* To reach the host, we must go through a gateway. */
2313 next_hop->s_addr = gateway;
2315 *netdev_name = xstrdup(iface);
/* Adds "driver_name", "driver_version" and "firmware_version" entries to
 * 'smap', taken from the 'drvinfo' of the device after a call to
 * netdev_linux_get_drvinfo(). */
2327 netdev_linux_get_drv_info(const struct netdev *netdev, struct smap *smap)
2330 struct netdev_dev_linux *netdev_dev =
2331 netdev_dev_linux_cast(netdev_get_dev(netdev));
2333 error = netdev_linux_get_drvinfo(netdev_dev);
2335 smap_add(smap, "driver_name", netdev_dev->drvinfo.driver);
2336 smap_add(smap, "driver_version", netdev_dev->drvinfo.version);
2337 smap_add(smap, "firmware_version", netdev_dev->drvinfo.fw_version);
/* Reports "openvswitch" as the "driver_name" in 'smap'; 'netdev' is unused. */
2343 netdev_internal_get_drv_info(const struct netdev *netdev OVS_UNUSED,
2346 smap_add(smap, "driver_name", "openvswitch");
2350 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2351 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2352 * returns 0. Otherwise, it returns a positive errno value; in particular,
2353 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'.
 *
 * The lookup is a SIOCGARP ioctl on 'af_inet_sock'.  Failures other than
 * ENXIO are logged with a rate-limited warning. */
2355 netdev_linux_arp_lookup(const struct netdev *netdev,
2356 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2359 struct sockaddr_in sin;
2362 memset(&r, 0, sizeof r);
2363 memset(&sin, 0, sizeof sin);
2364 sin.sin_family = AF_INET;
2365 sin.sin_addr.s_addr = ip;
2367 memcpy(&r.arp_pa, &sin, sizeof sin);
2368 r.arp_ha.sa_family = ARPHRD_ETHER;
2370 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2371 COVERAGE_INC(netdev_arp_lookup);
2372 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2374 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2375 } else if (retval != ENXIO) {
2376 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2377 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates the NETDEV_* flags in 'nd'; the NETDEV_UP and NETDEV_PROMISC
 * bits are the ones examined.  Used by netdev_linux_update_flags() to build
 * interface flag masks. */
2383 nd_to_iff_flags(enum netdev_flags nd)
2386 if (nd & NETDEV_UP) {
2389 if (nd & NETDEV_PROMISC) {
/* Translates the interface flags in 'iff' into NETDEV_* flags, starting from
 * an empty set; IFF_PROMISC maps to NETDEV_PROMISC. */
2396 iff_to_nd_flags(int iff)
2398 enum netdev_flags nd = 0;
2402 if (iff & IFF_PROMISC) {
2403 nd |= NETDEV_PROMISC;
/* Clears the 'off' flags and sets the 'on' flags of 'netdev', starting from
 * the cached 'ifi_flags'.  '*old_flagsp' receives the previous flags as
 * NETDEV_* values.  set_flags() is called only when the resulting flags
 * differ from the cached ones, after which the cache is refreshed with
 * get_flags(). */
2409 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2410 enum netdev_flags on, enum netdev_flags *old_flagsp)
2412 struct netdev_dev_linux *netdev_dev;
2413 int old_flags, new_flags;
2416 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2417 old_flags = netdev_dev->ifi_flags;
2418 *old_flagsp = iff_to_nd_flags(old_flags);
2419 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2420 if (new_flags != old_flags) {
2421 error = set_flags(netdev, new_flags);
2422 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
/* Flag-update hook installed in netdev_tap_pl_class.  Every parameter is
 * marked OVS_UNUSED; the body is not visible in this listing. */
2428 netdev_tap_pl_update_flags(struct netdev *netdev OVS_UNUSED, enum netdev_flags off OVS_UNUSED,
2429 enum netdev_flags on OVS_UNUSED, enum netdev_flags *old_flagsp OVS_UNUSED)
/* Returns the 'change_seq' value of the device backing 'netdev'. */
2435 netdev_linux_change_seq(const struct netdev *netdev)
2437 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to the body of a netdev_class initializer shared by the class
 * definitions that follow.  The macro parameters supply the slots that differ
 * between those classes; every other slot is filled with the common
 * netdev_linux_* implementation or NULL. */
2440 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2441 GET_FEATURES, GET_STATUS, \
2446 netdev_linux_init, \
2448 netdev_linux_wait, \
2451 netdev_linux_destroy, \
2452 NULL, /* get_config */ \
2453 NULL, /* set_config */ \
2455 netdev_linux_open, \
2456 netdev_linux_close, \
2458 netdev_linux_listen, \
2459 netdev_linux_recv, \
2460 netdev_linux_recv_wait, \
2461 netdev_linux_drain, \
2463 netdev_linux_send, \
2464 netdev_linux_send_wait, \
2466 netdev_linux_set_etheraddr, \
2467 netdev_linux_get_etheraddr, \
2468 netdev_linux_get_mtu, \
2469 netdev_linux_set_mtu, \
2470 netdev_linux_get_ifindex, \
2471 netdev_linux_get_carrier, \
2472 netdev_linux_get_carrier_resets, \
2473 netdev_linux_set_miimon_interval, \
2478 netdev_linux_set_advertisements, \
2480 netdev_linux_set_policing, \
2481 netdev_linux_get_qos_types, \
2482 netdev_linux_get_qos_capabilities, \
2483 netdev_linux_get_qos, \
2484 netdev_linux_set_qos, \
2485 netdev_linux_get_queue, \
2486 netdev_linux_set_queue, \
2487 netdev_linux_delete_queue, \
2488 netdev_linux_get_queue_stats, \
2489 netdev_linux_dump_queues, \
2490 netdev_linux_dump_queue_stats, \
2492 netdev_linux_get_in4, \
2493 netdev_linux_set_in4, \
2494 netdev_linux_get_in6, \
2495 netdev_linux_add_router, \
2496 netdev_linux_get_next_hop, \
2498 netdev_linux_arp_lookup, \
2502 netdev_linux_change_seq \
/* Class whose per-class slots use netdev_linux_create, the combined
 * vport/kernel netdev_linux_get_stats, and ethtool-based features and driver
 * info; it provides no set_stats. */
2505 const struct netdev_class netdev_linux_class =
2508 netdev_linux_create,
2509 netdev_linux_get_stats,
2510 NULL, /* set_stats */
2511 netdev_linux_get_features,
2512 netdev_linux_get_drv_info,
2513 netdev_linux_update_flags);
/* Class for tap devices: created with netdev_linux_create_tap and reporting
 * statistics through netdev_tap_get_stats; it provides no set_stats. */
2515 const struct netdev_class netdev_tap_class =
2518 netdev_linux_create_tap,
2519 netdev_tap_get_stats,
2520 NULL, /* set_stats */
2521 netdev_linux_get_features,
2522 netdev_linux_get_drv_info,
2523 netdev_linux_update_flags);
/* Class for internal devices: statistics come from the vport layer only
 * (netdev_internal_get_stats, netdev_vport_set_stats), there is no
 * get_features, and the driver is reported as "openvswitch". */
2525 const struct netdev_class netdev_internal_class =
2528 netdev_linux_create,
2529 netdev_internal_get_stats,
2530 netdev_vport_set_stats,
2531 NULL, /* get_features */
2532 netdev_internal_get_drv_info,
2533 netdev_linux_update_flags);
2535 const struct netdev_class netdev_tap_pl_class =
2538 netdev_linux_create_tap_pl,
2539 netdev_tap_get_stats,
2540 NULL, /* set_stats */
2541 netdev_linux_get_features,
2542 netdev_linux_get_drv_info,
2543 netdev_tap_pl_update_flags);
2545 /* HTB traffic control class. */
2547 #define HTB_N_QUEUES 0xf000
2551 unsigned int max_rate; /* In bytes/s. */
2555 struct tc_queue tc_queue;
2556 unsigned int min_rate; /* In bytes/s. */
2557 unsigned int max_rate; /* In bytes/s. */
2558 unsigned int burst; /* In bytes. */
2559 unsigned int priority; /* Lower values are higher priorities. */
2563 htb_get__(const struct netdev *netdev)
2565 struct netdev_dev_linux *netdev_dev =
2566 netdev_dev_linux_cast(netdev_get_dev(netdev));
2567 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2571 htb_install__(struct netdev *netdev, uint64_t max_rate)
2573 struct netdev_dev_linux *netdev_dev =
2574 netdev_dev_linux_cast(netdev_get_dev(netdev));
2577 htb = xmalloc(sizeof *htb);
2578 tc_init(&htb->tc, &tc_ops_htb);
2579 htb->max_rate = max_rate;
2581 netdev_dev->tc = &htb->tc;
2584 /* Create an HTB qdisc.
2586 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2588 htb_setup_qdisc__(struct netdev *netdev)
2591 struct tc_htb_glob opt;
2592 struct ofpbuf request;
2593 struct tcmsg *tcmsg;
2595 tc_del_qdisc(netdev);
2597 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2598 NLM_F_EXCL | NLM_F_CREATE, &request);
2602 tcmsg->tcm_handle = tc_make_handle(1, 0);
2603 tcmsg->tcm_parent = TC_H_ROOT;
2605 nl_msg_put_string(&request, TCA_KIND, "htb");
2607 memset(&opt, 0, sizeof opt);
2608 opt.rate2quantum = 10;
2612 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2613 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2614 nl_msg_end_nested(&request, opt_offset);
2616 return tc_transact(&request, NULL);
2619 /* Equivalent to "tc class replace dev <dev> classid <handle> parent <parent>
2620 * htb rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2622 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2623 unsigned int parent, struct htb_class *class)
2626 struct tc_htb_opt opt;
2627 struct ofpbuf request;
2628 struct tcmsg *tcmsg;
2632 error = netdev_get_mtu(netdev, &mtu);
2634 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2635 netdev_get_name(netdev));
2639 memset(&opt, 0, sizeof opt);
2640 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2641 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2642 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2643 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2644 opt.prio = class->priority;
2646 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2650 tcmsg->tcm_handle = handle;
2651 tcmsg->tcm_parent = parent;
2653 nl_msg_put_string(&request, TCA_KIND, "htb");
2654 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2655 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2656 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2657 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2658 nl_msg_end_nested(&request, opt_offset);
2660 error = tc_transact(&request, NULL);
2662 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2663 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2664 netdev_get_name(netdev),
2665 tc_get_major(handle), tc_get_minor(handle),
2666 tc_get_major(parent), tc_get_minor(parent),
2667 class->min_rate, class->max_rate,
2668 class->burst, class->priority, strerror(error));
2673 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2674 * description of them into 'class'. The description complies with the
2675 * specification given in the vswitch database documentation for linux-htb
2678 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2680 static const struct nl_policy tca_htb_policy[] = {
2681 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2682 .min_len = sizeof(struct tc_htb_opt) },
2685 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2686 const struct tc_htb_opt *htb;
2688 if (!nl_parse_nested(nl_options, tca_htb_policy,
2689 attrs, ARRAY_SIZE(tca_htb_policy))) {
2690 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2694 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2695 class->min_rate = htb->rate.rate;
2696 class->max_rate = htb->ceil.rate;
2697 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2698 class->priority = htb->prio;
2703 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2704 struct htb_class *options,
2705 struct netdev_queue_stats *stats)
2707 struct nlattr *nl_options;
2708 unsigned int handle;
2711 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2712 if (!error && queue_id) {
2713 unsigned int major = tc_get_major(handle);
2714 unsigned int minor = tc_get_minor(handle);
2715 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2716 *queue_id = minor - 1;
2721 if (!error && options) {
2722 error = htb_parse_tca_options__(nl_options, options);
/* Fills in 'hc' from the qdisc-level configuration in 'details'.  The
 * "max-rate" value is parsed as a decimal number and divided by 8 before it
 * is stored in 'hc->max_rate'.  If the key is absent or the result is zero,
 * the rate is instead derived from the device's current features via
 * netdev_features_to_bps(), again divided by 8.  'hc->min_rate' is set to
 * the same value as 'hc->max_rate'. */
2728 htb_parse_qdisc_details__(struct netdev *netdev,
2729 const struct smap *details, struct htb_class *hc)
2731 const char *max_rate_s;
2733 max_rate_s = smap_get(details, "max-rate");
2734 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2735 if (!hc->max_rate) {
2736 enum netdev_features current;
2738 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2739 hc->max_rate = netdev_features_to_bps(current) / 8;
2741 hc->min_rate = hc->max_rate;
2747 htb_parse_class_details__(struct netdev *netdev,
2748 const struct smap *details, struct htb_class *hc)
2750 const struct htb *htb = htb_get__(netdev);
2751 const char *min_rate_s = smap_get(details, "min-rate");
2752 const char *max_rate_s = smap_get(details, "max-rate");
2753 const char *burst_s = smap_get(details, "burst");
2754 const char *priority_s = smap_get(details, "priority");
2757 error = netdev_get_mtu(netdev, &mtu);
2759 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2760 netdev_get_name(netdev));
2764 /* HTB requires at least an mtu sized min-rate to send any traffic even
2765 * on uncongested links. */
2766 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2767 hc->min_rate = MAX(hc->min_rate, mtu);
2768 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2771 hc->max_rate = (max_rate_s
2772 ? strtoull(max_rate_s, NULL, 10) / 8
2774 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2775 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2779 * According to hints in the documentation that I've read, it is important
2780 * that 'burst' be at least as big as the largest frame that might be
2781 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2782 * but having it a bit too small is a problem. Since netdev_get_mtu()
2783 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2784 * the MTU. We actually add 64, instead of 14, as a guard against
2785 * additional headers getting tacked on somewhere that we're not aware of. */
2786 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2787 hc->burst = MAX(hc->burst, mtu + 64);
2790 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2796 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2797 unsigned int parent, struct htb_class *options,
2798 struct netdev_queue_stats *stats)
2800 struct ofpbuf *reply;
2803 error = tc_query_class(netdev, handle, parent, &reply);
2805 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2806 ofpbuf_delete(reply);
2812 htb_tc_install(struct netdev *netdev, const struct smap *details)
2816 error = htb_setup_qdisc__(netdev);
2818 struct htb_class hc;
2820 htb_parse_qdisc_details__(netdev, details, &hc);
2821 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2822 tc_make_handle(1, 0), &hc);
2824 htb_install__(netdev, hc.max_rate);
2830 static struct htb_class *
2831 htb_class_cast__(const struct tc_queue *queue)
2833 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2837 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2838 const struct htb_class *hc)
2840 struct htb *htb = htb_get__(netdev);
2841 size_t hash = hash_int(queue_id, 0);
2842 struct tc_queue *queue;
2843 struct htb_class *hcp;
2845 queue = tc_find_queue__(netdev, queue_id, hash);
2847 hcp = htb_class_cast__(queue);
2849 hcp = xmalloc(sizeof *hcp);
2850 queue = &hcp->tc_queue;
2851 queue->queue_id = queue_id;
2852 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2855 hcp->min_rate = hc->min_rate;
2856 hcp->max_rate = hc->max_rate;
2857 hcp->burst = hc->burst;
2858 hcp->priority = hc->priority;
2862 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2865 struct nl_dump dump;
2866 struct htb_class hc;
2868 /* Get qdisc options. */
2870 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2871 htb_install__(netdev, hc.max_rate);
2874 if (!start_queue_dump(netdev, &dump)) {
2877 while (nl_dump_next(&dump, &msg)) {
2878 unsigned int queue_id;
2880 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2881 htb_update_queue__(netdev, queue_id, &hc);
2884 nl_dump_done(&dump);
2890 htb_tc_destroy(struct tc *tc)
2892 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2893 struct htb_class *hc, *next;
2895 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2896 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2904 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2906 const struct htb *htb = htb_get__(netdev);
2907 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
2912 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2914 struct htb_class hc;
2917 htb_parse_qdisc_details__(netdev, details, &hc);
2918 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2919 tc_make_handle(1, 0), &hc);
2921 htb_get__(netdev)->max_rate = hc.max_rate;
2927 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2928 const struct tc_queue *queue, struct smap *details)
2930 const struct htb_class *hc = htb_class_cast__(queue);
2932 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2933 if (hc->min_rate != hc->max_rate) {
2934 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2936 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2938 smap_add_format(details, "priority", "%u", hc->priority);
2944 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2945 const struct smap *details)
2947 struct htb_class hc;
2950 error = htb_parse_class_details__(netdev, details, &hc);
2955 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2956 tc_make_handle(1, 0xfffe), &hc);
2961 htb_update_queue__(netdev, queue_id, &hc);
2966 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2968 struct htb_class *hc = htb_class_cast__(queue);
2969 struct htb *htb = htb_get__(netdev);
2972 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2974 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2981 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2982 struct netdev_queue_stats *stats)
2984 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2985 tc_make_handle(1, 0xfffe), NULL, stats);
2989 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2990 const struct ofpbuf *nlmsg,
2991 netdev_dump_queue_stats_cb *cb, void *aux)
2993 struct netdev_queue_stats stats;
2994 unsigned int handle, major, minor;
2997 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3002 major = tc_get_major(handle);
3003 minor = tc_get_minor(handle);
3004 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3005 (*cb)(minor - 1, &stats, aux);
3010 static const struct tc_ops tc_ops_htb = {
3011 "htb", /* linux_name */
3012 "linux-htb", /* ovs_name */
3013 HTB_N_QUEUES, /* n_queues */
3022 htb_class_get_stats,
3023 htb_class_dump_stats
3026 /* "linux-hfsc" traffic control class. */
3028 #define HFSC_N_QUEUES 0xf000
3036 struct tc_queue tc_queue;
3041 static struct hfsc *
3042 hfsc_get__(const struct netdev *netdev)
3044 struct netdev_dev_linux *netdev_dev;
3045 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3046 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
3049 static struct hfsc_class *
3050 hfsc_class_cast__(const struct tc_queue *queue)
3052 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3056 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
3058 struct netdev_dev_linux * netdev_dev;
3061 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3062 hfsc = xmalloc(sizeof *hfsc);
3063 tc_init(&hfsc->tc, &tc_ops_hfsc);
3064 hfsc->max_rate = max_rate;
3065 netdev_dev->tc = &hfsc->tc;
3069 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3070 const struct hfsc_class *hc)
3074 struct hfsc_class *hcp;
3075 struct tc_queue *queue;
3077 hfsc = hfsc_get__(netdev);
3078 hash = hash_int(queue_id, 0);
3080 queue = tc_find_queue__(netdev, queue_id, hash);
3082 hcp = hfsc_class_cast__(queue);
3084 hcp = xmalloc(sizeof *hcp);
3085 queue = &hcp->tc_queue;
3086 queue->queue_id = queue_id;
3087 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3090 hcp->min_rate = hc->min_rate;
3091 hcp->max_rate = hc->max_rate;
3095 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3097 const struct tc_service_curve *rsc, *fsc, *usc;
3098 static const struct nl_policy tca_hfsc_policy[] = {
3100 .type = NL_A_UNSPEC,
3102 .min_len = sizeof(struct tc_service_curve),
3105 .type = NL_A_UNSPEC,
3107 .min_len = sizeof(struct tc_service_curve),
3110 .type = NL_A_UNSPEC,
3112 .min_len = sizeof(struct tc_service_curve),
3115 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3117 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3118 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3119 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3123 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3124 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3125 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3127 if (rsc->m1 != 0 || rsc->d != 0 ||
3128 fsc->m1 != 0 || fsc->d != 0 ||
3129 usc->m1 != 0 || usc->d != 0) {
3130 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3131 "Non-linear service curves are not supported.");
3135 if (rsc->m2 != fsc->m2) {
3136 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3137 "Real-time service curves are not supported ");
3141 if (rsc->m2 > usc->m2) {
3142 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3143 "Min-rate service curve is greater than "
3144 "the max-rate service curve.");
3148 class->min_rate = fsc->m2;
3149 class->max_rate = usc->m2;
3154 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3155 struct hfsc_class *options,
3156 struct netdev_queue_stats *stats)
3159 unsigned int handle;
3160 struct nlattr *nl_options;
3162 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3168 unsigned int major, minor;
3170 major = tc_get_major(handle);
3171 minor = tc_get_minor(handle);
3172 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3173 *queue_id = minor - 1;
3180 error = hfsc_parse_tca_options__(nl_options, options);
3187 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3188 unsigned int parent, struct hfsc_class *options,
3189 struct netdev_queue_stats *stats)
3192 struct ofpbuf *reply;
3194 error = tc_query_class(netdev, handle, parent, &reply);
3199 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3200 ofpbuf_delete(reply);
/* Fills in 'class' from the qdisc-level configuration in 'details'.  The
 * "max-rate" value is parsed as a decimal number and divided by 8.  The
 * netdev_get_features() / netdev_features_to_bps() pair below computes an
 * alternative rate from the device's current features, likewise divided by
 * 8.  Both 'class->min_rate' and 'class->max_rate' are set to the resulting
 * rate. */
3205 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3206 struct hfsc_class *class)
3209 const char *max_rate_s;
3211 max_rate_s = smap_get(details, "max-rate");
3212 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3215 enum netdev_features current;
3217 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3218 max_rate = netdev_features_to_bps(current) / 8;
3221 class->min_rate = max_rate;
3222 class->max_rate = max_rate;
3226 hfsc_parse_class_details__(struct netdev *netdev,
3227 const struct smap *details,
3228 struct hfsc_class * class)
3230 const struct hfsc *hfsc;
3231 uint32_t min_rate, max_rate;
3232 const char *min_rate_s, *max_rate_s;
3234 hfsc = hfsc_get__(netdev);
3235 min_rate_s = smap_get(details, "min-rate");
3236 max_rate_s = smap_get(details, "max-rate");
3238 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3239 min_rate = MAX(min_rate, 1);
3240 min_rate = MIN(min_rate, hfsc->max_rate);
3242 max_rate = (max_rate_s
3243 ? strtoull(max_rate_s, NULL, 10) / 8
3245 max_rate = MAX(max_rate, min_rate);
3246 max_rate = MIN(max_rate, hfsc->max_rate);
3248 class->min_rate = min_rate;
3249 class->max_rate = max_rate;
3254 /* Create an HFSC qdisc.
3256 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3258 hfsc_setup_qdisc__(struct netdev * netdev)
3260 struct tcmsg *tcmsg;
3261 struct ofpbuf request;
3262 struct tc_hfsc_qopt opt;
3264 tc_del_qdisc(netdev);
3266 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3267 NLM_F_EXCL | NLM_F_CREATE, &request);
3273 tcmsg->tcm_handle = tc_make_handle(1, 0);
3274 tcmsg->tcm_parent = TC_H_ROOT;
3276 memset(&opt, 0, sizeof opt);
3279 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3280 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3282 return tc_transact(&request, NULL);
3285 /* Create an HFSC class.
3287 * Equivalent to "tc class add dev <dev> parent <parent> classid <handle> hfsc
3288 * sc rate <min_rate> ul rate <max_rate>" */
3290 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3291 unsigned int parent, struct hfsc_class *class)
3295 struct tcmsg *tcmsg;
3296 struct ofpbuf request;
3297 struct tc_service_curve min, max;
3299 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3305 tcmsg->tcm_handle = handle;
3306 tcmsg->tcm_parent = parent;
3310 min.m2 = class->min_rate;
3314 max.m2 = class->max_rate;
3316 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3317 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3318 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3319 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3320 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3321 nl_msg_end_nested(&request, opt_offset);
3323 error = tc_transact(&request, NULL);
3325 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3326 "min-rate %ubps, max-rate %ubps (%s)",
3327 netdev_get_name(netdev),
3328 tc_get_major(handle), tc_get_minor(handle),
3329 tc_get_major(parent), tc_get_minor(parent),
3330 class->min_rate, class->max_rate, strerror(error));
3337 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3340 struct hfsc_class class;
3342 error = hfsc_setup_qdisc__(netdev);
3348 hfsc_parse_qdisc_details__(netdev, details, &class);
3349 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3350 tc_make_handle(1, 0), &class);
3356 hfsc_install__(netdev, class.max_rate);
3361 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3364 struct nl_dump dump;
3365 struct hfsc_class hc;
3368 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3369 hfsc_install__(netdev, hc.max_rate);
3371 if (!start_queue_dump(netdev, &dump)) {
3375 while (nl_dump_next(&dump, &msg)) {
3376 unsigned int queue_id;
3378 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3379 hfsc_update_queue__(netdev, queue_id, &hc);
3383 nl_dump_done(&dump);
3388 hfsc_tc_destroy(struct tc *tc)
3391 struct hfsc_class *hc, *next;
3393 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3395 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3396 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3405 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3407 const struct hfsc *hfsc;
3408 hfsc = hfsc_get__(netdev);
3409 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3414 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3417 struct hfsc_class class;
3419 hfsc_parse_qdisc_details__(netdev, details, &class);
3420 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3421 tc_make_handle(1, 0), &class);
3424 hfsc_get__(netdev)->max_rate = class.max_rate;
3431 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3432 const struct tc_queue *queue, struct smap *details)
3434 const struct hfsc_class *hc;
3436 hc = hfsc_class_cast__(queue);
3437 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3438 if (hc->min_rate != hc->max_rate) {
3439 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3445 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3446 const struct smap *details)
3449 struct hfsc_class class;
3451 error = hfsc_parse_class_details__(netdev, details, &class);
3456 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3457 tc_make_handle(1, 0xfffe), &class);
3462 hfsc_update_queue__(netdev, queue_id, &class);
3467 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3471 struct hfsc_class *hc;
3473 hc = hfsc_class_cast__(queue);
3474 hfsc = hfsc_get__(netdev);
3476 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3478 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3485 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3486 struct netdev_queue_stats *stats)
3488 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3489 tc_make_handle(1, 0xfffe), NULL, stats);
3493 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3494 const struct ofpbuf *nlmsg,
3495 netdev_dump_queue_stats_cb *cb, void *aux)
3497 struct netdev_queue_stats stats;
3498 unsigned int handle, major, minor;
3501 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3506 major = tc_get_major(handle);
3507 minor = tc_get_minor(handle);
3508 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3509 (*cb)(minor - 1, &stats, aux);
3514 static const struct tc_ops tc_ops_hfsc = {
3515 "hfsc", /* linux_name */
3516 "linux-hfsc", /* ovs_name */
3517 HFSC_N_QUEUES, /* n_queues */
3518 hfsc_tc_install, /* tc_install */
3519 hfsc_tc_load, /* tc_load */
3520 hfsc_tc_destroy, /* tc_destroy */
3521 hfsc_qdisc_get, /* qdisc_get */
3522 hfsc_qdisc_set, /* qdisc_set */
3523 hfsc_class_get, /* class_get */
3524 hfsc_class_set, /* class_set */
3525 hfsc_class_delete, /* class_delete */
3526 hfsc_class_get_stats, /* class_get_stats */
3527 hfsc_class_dump_stats /* class_dump_stats */
3530 /* "linux-default" traffic control class.
3532 * This class represents the default, unnamed Linux qdisc. It corresponds to
3533 * the "" (empty string) QoS type in the OVS database. */
3536 default_install__(struct netdev *netdev)
3538 struct netdev_dev_linux *netdev_dev =
3539 netdev_dev_linux_cast(netdev_get_dev(netdev));
3540 static struct tc *tc;
3543 tc = xmalloc(sizeof *tc);
3544 tc_init(tc, &tc_ops_default);
3546 netdev_dev->tc = tc;
3550 default_tc_install(struct netdev *netdev,
3551 const struct smap *details OVS_UNUSED)
3553 default_install__(netdev);
3558 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3560 default_install__(netdev);
3564 static const struct tc_ops tc_ops_default = {
3565 NULL, /* linux_name */
3570 NULL, /* tc_destroy */
3571 NULL, /* qdisc_get */
3572 NULL, /* qdisc_set */
3573 NULL, /* class_get */
3574 NULL, /* class_set */
3575 NULL, /* class_delete */
3576 NULL, /* class_get_stats */
3577 NULL /* class_dump_stats */
3580 /* "linux-other" traffic control class.
3585 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3587 struct netdev_dev_linux *netdev_dev =
3588 netdev_dev_linux_cast(netdev_get_dev(netdev));
3589 static struct tc *tc;
3592 tc = xmalloc(sizeof *tc);
3593 tc_init(tc, &tc_ops_other);
3595 netdev_dev->tc = tc;
3599 static const struct tc_ops tc_ops_other = {
3600 NULL, /* linux_name */
3601 "linux-other", /* ovs_name */
3603 NULL, /* tc_install */
3605 NULL, /* tc_destroy */
3606 NULL, /* qdisc_get */
3607 NULL, /* qdisc_set */
3608 NULL, /* class_get */
3609 NULL, /* class_set */
3610 NULL, /* class_delete */
3611 NULL, /* class_get_stats */
3612 NULL /* class_dump_stats */
3615 /* Traffic control. */
3617 /* Number of kernel "tc" ticks per second. */
3618 static double ticks_per_s;
3620 /* Number of kernel "jiffies" per second. This is used for the purpose of
3621 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3622 * one jiffy's worth of data.
3624 * There are two possibilities here:
3626 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3627 * approximate range of 100 to 1024. That means that we really need to
3628 * make sure that the qdisc can buffer that much data.
3630 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3631 * has finely granular timers and there's no need to fudge additional room
3632 * for buffers. (There's no extra effort needed to implement that: the
3633 * large 'buffer_hz' is used as a divisor, so practically any number will
3634 * come out as 0 in the division. Small integer results in the case of
3635 * really high dividends won't have any real effect anyhow.)
3637 static unsigned int buffer_hz;
3639 /* Returns tc handle 'major':'minor'. */
3641 tc_make_handle(unsigned int major, unsigned int minor)
3643 return TC_H_MAKE(major << 16, minor);
3646 /* Returns the major number from 'handle'. */
3648 tc_get_major(unsigned int handle)
3650 return TC_H_MAJ(handle) >> 16;
3653 /* Returns the minor number from 'handle'. */
3655 tc_get_minor(unsigned int handle)
3657 return TC_H_MIN(handle);
3660 static struct tcmsg *
3661 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3662 struct ofpbuf *request)
3664 struct tcmsg *tcmsg;
3668 error = get_ifindex(netdev, &ifindex);
3673 ofpbuf_init(request, 512);
3674 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3675 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3676 tcmsg->tcm_family = AF_UNSPEC;
3677 tcmsg->tcm_ifindex = ifindex;
3678 /* Caller should fill in tcmsg->tcm_handle. */
3679 /* Caller should fill in tcmsg->tcm_parent. */
3685 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3687 int error = nl_sock_transact(rtnl_sock, request, replyp);
3688 ofpbuf_uninit(request);
3692 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3693 * policing configuration.
3695 * This function is equivalent to running the following when 'add' is true:
3696 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3698 * This function is equivalent to running the following when 'add' is false:
3699 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3701 * The configuration and stats may be seen with the following command:
3702 * /sbin/tc -s qdisc show dev <devname>
3704 * Returns 0 if successful, otherwise a positive errno value.
3707 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3709 struct ofpbuf request;
3710 struct tcmsg *tcmsg;
3712 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3713 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3715 tcmsg = tc_make_request(netdev, type, flags, &request);
3719 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3720 tcmsg->tcm_parent = TC_H_INGRESS;
3721 nl_msg_put_string(&request, TCA_KIND, "ingress");
3722 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3724 error = tc_transact(&request, NULL);
3726 /* If we're deleting the qdisc, don't worry about some of the
3727 * error conditions. */
3728 if (!add && (error == ENOENT || error == EINVAL)) {
3737 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3740 * This function is equivalent to running:
3741 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3742 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3745 * The configuration and stats may be seen with the following command:
3746 * /sbin/tc -s filter show dev <devname> parent ffff:
3748 * Returns 0 if successful, otherwise a positive errno value.
3751 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3753 struct tc_police tc_police;
3754 struct ofpbuf request;
3755 struct tcmsg *tcmsg;
3756 size_t basic_offset;
3757 size_t police_offset;
3761 memset(&tc_police, 0, sizeof tc_police);
3762 tc_police.action = TC_POLICE_SHOT;
3763 tc_police.mtu = mtu;
/* Convert kbit/s to bytes/s.  Multiply before dividing so that rates that
 * are not a multiple of 8 kbit/s are not truncated (7 kbit/s must yield 875
 * bytes/s, not 0).  The 64-bit cast keeps the intermediate product from
 * overflowing 'int'. */
3764 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000) / 8, mtu);
3765 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3766 kbits_burst * 1024);
3768 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3769 NLM_F_EXCL | NLM_F_CREATE, &request);
3773 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3774 tcmsg->tcm_info = tc_make_handle(49,
3775 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3777 nl_msg_put_string(&request, TCA_KIND, "basic");
3778 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3779 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3780 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3781 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3782 nl_msg_end_nested(&request, police_offset);
3783 nl_msg_end_nested(&request, basic_offset);
3785 error = tc_transact(&request, NULL);
3796 /* The values in psched are not individually very meaningful, but they are
3797 * important. The tables below show some values seen in the wild.
3801 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3802 * (Before that, there are hints that it was 1000000000.)
3804 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3808 * -----------------------------------
3809 * [1] 000c8000 000f4240 000f4240 00000064
3810 * [2] 000003e8 00000400 000f4240 3b9aca00
3811 * [3] 000003e8 00000400 000f4240 3b9aca00
3812 * [4] 000003e8 00000400 000f4240 00000064
3813 * [5] 000003e8 00000040 000f4240 3b9aca00
3814 * [6] 000003e8 00000040 000f4240 000000f9
3816 * a b c d ticks_per_s buffer_hz
3817 * ------- --------- ---------- ------------- ----------- -------------
3818 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3819 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3820 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3821 * [4] 1,000 1,024 1,000,000 100 976,562 100
3822 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3823 * [6] 1,000 64 1,000,000 249 15,625,000 249
3825 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3826 * [2] 2.6.26-1-686-bigmem from Debian lenny
3827 * [3] 2.6.26-2-sparc64 from Debian lenny
3828 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3829 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3830 * [6] 2.6.34 from kernel.org on KVM
3832 static const char fn[] = "/proc/net/psched";
3833 unsigned int a, b, c, d;
3839 stream = fopen(fn, "r");
3841 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3845 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3846 VLOG_WARN("%s: read failed", fn);
3850 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3854 VLOG_WARN("%s: invalid scheduler parameters", fn);
3858 ticks_per_s = (double) a * c / b;
3862 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3865 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3868 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3869 * rate of 'rate' bytes per second. */
3871 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
/* Form the product in floating point: 'rate * ticks' evaluated in
 * 'unsigned int' wraps around for realistic inputs (e.g. a 1 Gbps rate of
 * 125,000,000 bytes/s times a few thousand ticks) before the division by the
 * 'double' ticks_per_s is applied. */
3876 return ((double) rate * ticks) / ticks_per_s;
3879 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3880 * rate of 'rate' bytes per second. */
3882 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3887 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3890 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3891 * a transmission rate of 'rate' bytes per second.
 *
 * The result is 'rate' divided by 'buffer_hz'.  NOTE(review): no guard
 * against 'buffer_hz' being 0 is visible in this excerpt -- confirm that it
 * is always initialized to a nonzero value before this division. */
3893 tc_buffer_per_jiffy(unsigned int rate)
3898 return rate / buffer_hz;
3901 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3902 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3903 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3904 * stores NULL into it if it is absent.
 *
3906 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
3909 * Returns 0 if successful, otherwise a positive errno value. */
3911 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3912 struct nlattr **options)
/* TCA_KIND (a string) is mandatory; TCA_OPTIONS (nested) may be absent. */
3914 static const struct nl_policy tca_policy[] = {
3915 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3916 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3918 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* The attributes follow the Netlink header and the struct tcmsg. */
3920 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3921 tca_policy, ta, ARRAY_SIZE(ta))) {
3922 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3927 *kind = nl_attr_get_string(ta[TCA_KIND]);
3931 *options = ta[TCA_OPTIONS];
3946 /* Given Netlink 'msg' that describes a class, extracts the class handle (the
3947 * 'tcm_handle' field of its struct tcmsg) into '*handlep', its TCA_OPTIONS attribute
3948 * into '*options', and its queue statistics into '*stats'. Any of the output
3949 * arguments may be null.
 *
 * Of the statistics, only 'tx_bytes', 'tx_packets' and 'tx_errors' are filled
 * in from the message; 'tx_errors' is taken from the queue's drop count.
 *
3951 * Returns 0 if successful, otherwise a positive errno value. */
3953 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3954 struct nlattr **options, struct netdev_queue_stats *stats)
/* Both TCA_OPTIONS and TCA_STATS2 are required in a class message. */
3956 static const struct nl_policy tca_policy[] = {
3957 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3958 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3960 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* The attributes follow the Netlink header and the struct tcmsg. */
3962 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3963 tca_policy, ta, ARRAY_SIZE(ta))) {
3964 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The whole handle is stored, not only its minor number. */
3969 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3970 *handlep = tc->tcm_handle;
3974 *options = ta[TCA_OPTIONS];
3978 const struct gnet_stats_queue *gsq;
3979 struct gnet_stats_basic gsb;
/* Inside TCA_STATS2, the basic and queue statistics are both required and
 * must each be at least as large as the corresponding structure. */
3981 static const struct nl_policy stats_policy[] = {
3982 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3983 .min_len = sizeof gsb },
3984 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3985 .min_len = sizeof *gsq },
3987 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3989 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3990 sa, ARRAY_SIZE(sa))) {
3991 VLOG_WARN_RL(&rl, "failed to parse class stats");
3995 /* Alignment issues screw up the length of struct gnet_stats_basic on
3996 * some arch/bitsize combinations. Newer versions of Linux have a
3997 * struct gnet_stats_basic_packed, but we can't depend on that. The
3998 * easiest thing to do is just to make a copy. */
3999 memset(&gsb, 0, sizeof gsb);
4000 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4001 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4002 stats->tx_bytes = gsb.bytes;
4003 stats->tx_packets = gsb.packets;
/* Drops reported by the queue are exposed as transmit errors. */
4005 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4006 stats->tx_errors = gsq->drops;
/* NOTE(review): presumably the failure path, which zeroes '*stats'; the
 * enclosing condition is not visible in this excerpt. */
4016 memset(stats, 0, sizeof *stats);
4021 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4024 tc_query_class(const struct netdev *netdev,
4025 unsigned int handle, unsigned int parent,
4026 struct ofpbuf **replyp)
4028 struct ofpbuf request;
4029 struct tcmsg *tcmsg;
4032 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4036 tcmsg->tcm_handle = handle;
4037 tcmsg->tcm_parent = parent;
4039 error = tc_transact(&request, replyp);
4041 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4042 netdev_get_name(netdev),
4043 tc_get_major(handle), tc_get_minor(handle),
4044 tc_get_major(parent), tc_get_minor(parent),
4050 /* Equivalent to "tc class del dev <name> handle <handle>".
 *
 * Sends an RTM_DELTCLASS request for 'handle' on 'netdev' with a parent of 0
 * and does not ask tc_transact() for a reply.  A failure is logged with a
 * rate-limited warning. */
4052 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4054 struct ofpbuf request;
4055 struct tcmsg *tcmsg;
4058 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4062 tcmsg->tcm_handle = handle;
4063 tcmsg->tcm_parent = 0;
4065 error = tc_transact(&request, NULL);
4067 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4068 netdev_get_name(netdev),
4069 tc_get_major(handle), tc_get_minor(handle),
4075 /* Equivalent to "tc qdisc del dev <name> root".
 *
 * When there is no error and the device has cached tc state in
 * 'netdev_dev->tc', that state is destroyed through its 'tc_destroy' hook
 * (if the hook is set) and the pointer is cleared. */
4077 tc_del_qdisc(struct netdev *netdev)
4079 struct netdev_dev_linux *netdev_dev =
4080 netdev_dev_linux_cast(netdev_get_dev(netdev));
4081 struct ofpbuf request;
4082 struct tcmsg *tcmsg;
4085 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
/* Address the root qdisc, using handle 1:0. */
4089 tcmsg->tcm_handle = tc_make_handle(1, 0);
4090 tcmsg->tcm_parent = TC_H_ROOT;
4092 error = tc_transact(&request, NULL);
4093 if (error == EINVAL) {
4094 /* EINVAL probably means that the default qdisc was in use, in which
4095 * case we've accomplished our purpose. */
/* Drop the cached tc state only when the request did not fail. */
4098 if (!error && netdev_dev->tc) {
4099 if (netdev_dev->tc->ops->tc_destroy) {
4100 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
4102 netdev_dev->tc = NULL;
4107 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4108 * kernel to determine what they are. Returns 0 if successful, otherwise a
4109 * positive errno value.
 *
 * The tc implementation is chosen from the kind reported by the kernel and
 * then asked to load itself through its 'tc_load' hook.  An error from the
 * query or from parsing the reply takes precedence over an error returned
 * by 'tc_load'. */
4111 tc_query_qdisc(const struct netdev *netdev)
4113 struct netdev_dev_linux *netdev_dev =
4114 netdev_dev_linux_cast(netdev_get_dev(netdev));
4115 struct ofpbuf request, *qdisc;
4116 const struct tc_ops *ops;
4117 struct tcmsg *tcmsg;
/* A nonnull 'netdev_dev->tc' means the qdisc has already been loaded. */
4121 if (netdev_dev->tc) {
4125 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4126 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4127 * 2.6.35 without that fix backported to it.
 *
4129 * To avoid the OOPS, we must not make a request that would attempt to dump
4130 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4131 * few others. There are a few ways that I can see to do this, but most of
4132 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4133 * technique chosen here is to assume that any non-default qdisc that we
4134 * create will have a class with handle 1:0. The built-in qdiscs only have
4135 * a class with handle 0:0.
 *
4137 * We could check for Linux 2.6.35+ and use a more straightforward method
 * on those kernels. */
4139 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
4143 tcmsg->tcm_handle = tc_make_handle(1, 0);
4144 tcmsg->tcm_parent = 0;
4146 /* Figure out what tc class to instantiate. */
4147 error = tc_transact(&request, &qdisc);
4151 error = tc_parse_qdisc(qdisc, &kind, NULL);
4153 ops = &tc_ops_other;
4155 ops = tc_lookup_linux_name(kind);
4157 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4158 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4160 ops = &tc_ops_other;
4163 } else if (error == ENOENT) {
4164 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4165 * other entity that doesn't have a handle 1:0. We will assume
4166 * that it's the system default qdisc. */
4167 ops = &tc_ops_default;
4170 /* Who knows? Maybe the device got deleted. */
4171 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4172 netdev_get_name(netdev), strerror(error));
4173 ops = &tc_ops_other;
4176 /* Instantiate it.  The 'tc_load' hook must leave 'netdev_dev->tc' nonnull
 * exactly when it returns 0, which the assertion below checks. */
4177 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev), qdisc);
4178 assert((load_error == 0) == (netdev_dev->tc != NULL));
4179 ofpbuf_delete(qdisc);
4181 return error ? error : load_error;
4184 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4185 approximate the time to transmit packets of various lengths. For an MTU of
4186 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4187 represents two possible packet lengths; for a MTU of 513 through 1024, four
4188 possible lengths; and so on.

4190 Returns, for the specified 'mtu', the number of bits that packet lengths
4191 need to be shifted right to fit within such a 256-entry table. */
4193 tc_calc_cell_log(unsigned int mtu)
/* NOTE(review): presumably a fallback for an unusable 'mtu'; the condition
 * guarding this assignment is not visible in this excerpt. */
4198 mtu = ETH_PAYLOAD_MAX;
/* Add room for the Ethernet and VLAN headers. */
4200 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Counts loop iterations while 'mtu' is still 256 or more.  NOTE(review):
 * the loop body is not visible here; presumably it shrinks 'mtu'. */
4202 for (cell_log = 0; mtu >= 256; cell_log++) {
4209 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'.
 *
 * '*rate' is zeroed first, then 'cell_log' is derived from 'mtu' with
 * tc_calc_cell_log() and 'mpu' is set to ETH_TOTAL_MIN.  NOTE(review): the
 * statement that stores 'Bps' into '*rate' is not visible in this excerpt. */
4212 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4214 memset(rate, 0, sizeof *rate);
4215 rate->cell_log = tc_calc_cell_log(mtu);
4216 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4217 /* rate->cell_align = 0; */ /* distro headers. */
4218 rate->mpu = ETH_TOTAL_MIN;
4222 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4223 * attribute of the specified "type".
 *
 * The attribute payload is TC_RTAB_SIZE bytes.  Entry 'i' holds the number
 * of ticks needed to send a packet of (i + 1) << 'rate->cell_log' bytes at
 * 'rate->rate', with the packet size raised to at least 'rate->mpu'.
 *
4225 * See tc_calc_cell_log() above for a description of "rtab"s. */
4227 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve the attribute payload uninitialized; the loop fills every entry. */
4232 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4233 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4234 unsigned packet_size = (i + 1) << rate->cell_log;
4235 if (packet_size < rate->mpu) {
4236 packet_size = rate->mpu;
4238 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4242 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4243 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4244 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 is fine: the burst actually used is never smaller than
 * tc_buffer_per_jiffy(Bps) + 'mtu' bytes.)
 *
 * The chosen burst size is converted to ticks at 'Bps' with
 * tc_bytes_to_ticks(). */
4247 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4249 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4250 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4253 /* Linux-only functions declared in netdev-linux.h */
4255 /* Returns a fd for an AF_INET socket or a negative errno value.
 *
 * netdev_linux_init() is called first; if it reports an error, that error is
 * returned negated instead of 'af_inet_sock'. */
4257 netdev_linux_get_af_inet_sock(void)
4259 int error = netdev_linux_init();
4260 return error ? -error : af_inet_sock;
4263 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4264 * 'enable' is true, the bit is set. Otherwise, it is cleared.
 *
 * The flags are read with ETHTOOL_GFLAGS, written back with ETHTOOL_SFLAGS,
 * and then read a second time.  If the value read back differs from the
 * value that was written, a rate-limited warning naming 'flag_name' is
 * logged. */
4266 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4267 const char *flag_name, bool enable)
4269 const char *netdev_name = netdev_get_name(netdev);
4270 struct ethtool_value evalue;
/* Step 1: fetch the current flags. */
4274 COVERAGE_INC(netdev_get_ethtool);
4275 memset(&evalue, 0, sizeof evalue);
4276 error = netdev_linux_do_ethtool(netdev_name,
4277 (struct ethtool_cmd *)&evalue,
4278 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: write the flags with 'flag' set or cleared, remembering the value
 * written in 'new_flags'. */
4283 COVERAGE_INC(netdev_set_ethtool);
4284 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4285 error = netdev_linux_do_ethtool(netdev_name,
4286 (struct ethtool_cmd *)&evalue,
4287 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags again to see whether the change took effect. */
4292 COVERAGE_INC(netdev_get_ethtool);
4293 memset(&evalue, 0, sizeof evalue);
4294 error = netdev_linux_do_ethtool(netdev_name,
4295 (struct ethtool_cmd *)&evalue,
4296 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4301 if (new_flags != evalue.data) {
4302 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4303 "device %s failed", enable ? "enable" : "disable",
4304 flag_name, netdev_name);
4311 /* Utility functions. */
4313 /* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * Each counter listed below is assigned from the member of 'src' with the
 * same name; the conversion is whatever the assignment between the two
 * member types implies. */
4315 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4316 const struct rtnl_link_stats *src)
4318 dst->rx_packets = src->rx_packets;
4319 dst->tx_packets = src->tx_packets;
4320 dst->rx_bytes = src->rx_bytes;
4321 dst->tx_bytes = src->tx_bytes;
4322 dst->rx_errors = src->rx_errors;
4323 dst->tx_errors = src->tx_errors;
4324 dst->rx_dropped = src->rx_dropped;
4325 dst->tx_dropped = src->tx_dropped;
4326 dst->multicast = src->multicast;
4327 dst->collisions = src->collisions;
4328 dst->rx_length_errors = src->rx_length_errors;
4329 dst->rx_over_errors = src->rx_over_errors;
4330 dst->rx_crc_errors = src->rx_crc_errors;
4331 dst->rx_frame_errors = src->rx_frame_errors;
4332 dst->rx_fifo_errors = src->rx_fifo_errors;
4333 dst->rx_missed_errors = src->rx_missed_errors;
4334 dst->tx_aborted_errors = src->tx_aborted_errors;
4335 dst->tx_carrier_errors = src->tx_carrier_errors;
4336 dst->tx_fifo_errors = src->tx_fifo_errors;
4337 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4338 dst->tx_window_errors = src->tx_window_errors;
/* Fills '*stats' with the link statistics of the interface whose index is
 * 'ifindex', obtained by sending an RTM_GETLINK request on 'rtnl_sock'.
 *
 * The reply is parsed for IFLA_IFNAME and IFLA_STATS.  A reply that lacks
 * IFLA_STATS is logged with a rate-limited warning.  The reply buffer is
 * deleted on every path visible here.  NOTE(review): the return statements
 * are not visible in this excerpt. */
4342 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4344 /* Policy for RTNLGRP_LINK messages.
 *
4346 * There are *many* more fields in these messages, but currently we only
4347 * care about these fields. */
4348 static const struct nl_policy rtnlgrp_link_policy[] = {
4349 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4350 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4351 .min_len = sizeof(struct rtnl_link_stats) },
4354 struct ofpbuf request;
4355 struct ofpbuf *reply;
4356 struct ifinfomsg *ifi;
4357 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a Netlink header followed by a zeroed ifinfomsg that
 * selects the interface by index. */
4360 ofpbuf_init(&request, 0);
4361 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4362 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4363 ifi->ifi_family = PF_UNSPEC;
4364 ifi->ifi_index = ifindex;
4365 error = nl_sock_transact(rtnl_sock, &request, &reply);
4366 ofpbuf_uninit(&request);
/* The attributes follow the Netlink header and the struct ifinfomsg. */
4371 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4372 rtnlgrp_link_policy,
4373 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4374 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence is checked here. */
4378 if (!attrs[IFLA_STATS]) {
4379 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4380 ofpbuf_delete(reply);
4384 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4386 ofpbuf_delete(reply);
/* Fills '*stats' for the device named 'netdev_name' by scanning the lines of
 * /proc/net/dev.
 *
 * A line that does not yield the 15 expected fields is reported as a parse
 * error.  For the line whose device name equals 'netdev_name', the counters
 * that were not scanned from the file are set to UINT64_MAX.  If the warning
 * at the end is reached, no line matched 'netdev_name'.  NOTE(review): the
 * return statements and part of the sscanf() call are not visible in this
 * excerpt. */
4392 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4394 static const char fn[] = "/proc/net/dev";
4399 stream = fopen(fn, "r");
4401 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4406 while (fgets(line, sizeof line, stream)) {
4409 #define X64 "%"SCNu64
4412 X64 X64 X64 X64 X64 X64 X64 "%*u"
4413 X64 X64 X64 X64 X64 X64 X64 "%*u",
4419 &stats->rx_fifo_errors,
4420 &stats->rx_frame_errors,
4426 &stats->tx_fifo_errors,
4428 &stats->tx_carrier_errors) != 15) {
4429 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4430 } else if (!strcmp(devname, netdev_name)) {
/* These counters are not read from the file; all-ones presumably marks them
 * as unavailable -- TODO confirm against the netdev_stats contract. */
4431 stats->rx_length_errors = UINT64_MAX;
4432 stats->rx_over_errors = UINT64_MAX;
4433 stats->rx_crc_errors = UINT64_MAX;
4434 stats->rx_missed_errors = UINT64_MAX;
4435 stats->tx_aborted_errors = UINT64_MAX;
4436 stats->tx_heartbeat_errors = UINT64_MAX;
4437 stats->tx_window_errors = UINT64_MAX;
4443 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'dev' with the SIOCGIFFLAGS ioctl, issued by
 * device name through netdev_linux_do_ioctl(), and stores 'ifr.ifr_flags'
 * into '*flags'.  NOTE(review): the return statement is not visible in this
 * excerpt. */
4449 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4455 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4458 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS
 * ioctl and returns the result of netdev_linux_do_ioctl(). */
4464 set_flags(struct netdev *netdev, unsigned int flags)
4468 ifr.ifr_flags = flags;
4469 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with the
 * SIOCGIFINDEX ioctl on 'af_inet_sock' and, on success, returns
 * 'ifr.ifr_ifindex'.  An ioctl failure is logged with a rate-limited
 * warning.  NOTE(review): the value returned on failure is not visible in
 * this excerpt. */
4474 do_get_ifindex(const char *netdev_name)
4478 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4479 COVERAGE_INC(netdev_get_ifindex);
4480 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4481 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4482 netdev_name, strerror(errno));
4485 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' into '*ifindexp' and returns the
 * cached 'get_ifindex_error'.
 *
 * do_get_ifindex() is consulted only when VALID_IFINDEX is not yet set in
 * the device's 'cache_valid'; both the index and the error are then cached
 * and VALID_IFINDEX is set, so later calls reuse the cached values. */
4489 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4491 struct netdev_dev_linux *netdev_dev =
4492 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4494 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4495 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* NOTE(review): presumably the branch for a failed lookup, in which case
 * 'ifindex' holds a negated errno value; the condition is not visible. */
4498 netdev_dev->get_ifindex_error = -ifindex;
4499 netdev_dev->ifindex = 0;
4501 netdev_dev->get_ifindex_error = 0;
4502 netdev_dev->ifindex = ifindex;
/* The result is cached whether or not the lookup produced an index. */
4504 netdev_dev->cache_valid |= VALID_IFINDEX;
4507 *ifindexp = netdev_dev->ifindex;
4508 return netdev_dev->get_ifindex_error;
/* Reads the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies its first ETH_ADDR_LEN
 * bytes into 'ea'.
 *
 * An address family other than AF_UNSPEC or ARPHRD_ETHER is logged as a
 * warning; the copy into 'ea' follows in the visible code.
 * NOTE(review): the return statements are not visible in this excerpt. */
4512 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4517 memset(&ifr, 0, sizeof ifr);
4518 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4519 COVERAGE_INC(netdev_get_hwaddr);
4520 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4521 /* ENODEV probably means that a vif disappeared asynchronously and
4522 * hasn't been removed from the database yet, so reduce the log level
4523 * to INFO for that case. */
4524 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4525 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4526 netdev_name, strerror(errno));
4529 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4530 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4531 VLOG_WARN("%s device has unknown hardware address family %d",
4532 netdev_name, hwaddr_family);
4534 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac' with the SIOCSIFHWADDR ioctl on
 * 'af_inet_sock', declaring the address family as ARPHRD_ETHER.  An ioctl
 * failure is logged as an error.  NOTE(review): the return statements are
 * not visible in this excerpt. */
4539 set_etheraddr(const char *netdev_name,
4540 const uint8_t mac[ETH_ADDR_LEN])
4544 memset(&ifr, 0, sizeof ifr);
4545 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4546 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4547 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4548 COVERAGE_INC(netdev_set_hwaddr);
4549 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4550 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4551 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for the device named 'name',
 * with 'ecmd' attached as the request data.  'cmd_name' is used only in the
 * log message.
 *
 * A failure is logged with a rate-limited warning unless errno is
 * EOPNOTSUPP.  NOTE(review): how 'cmd' is stored into '*ecmd' and what the
 * function returns are not visible in this excerpt. */
4558 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4559 int cmd, const char *cmd_name)
4563 memset(&ifr, 0, sizeof ifr);
4564 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4565 ifr.ifr_data = (caddr_t) ecmd;
4568 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4571 if (errno != EOPNOTSUPP) {
4572 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4573 "failed: %s", cmd_name, name, strerror(errno));
4575 /* The device doesn't support this operation. That's pretty
4576 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' with 'ifr' on
 * 'af_inet_sock'.  A failure is logged at debug level (rate-limited) using
 * 'cmd_name'.  The caller fills in any other members of '*ifr' beforehand.
 * NOTE(review): the return statements are not visible in this excerpt. */
4583 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4584 const char *cmd_name)
4586 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4587 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4588 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs the address-fetching ioctl 'cmd' (described by 'cmd_name' in log
 * messages) on 'netdev' with the address family preset to AF_INET, and
 * stores the 'sin_addr' of the resulting 'ifr.ifr_addr' into '*ip'.
 * NOTE(review): the condition guarding the store and the return statement
 * are not visible in this excerpt. */
4596 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4597 int cmd, const char *cmd_name)
4602 ifr.ifr_addr.sa_family = AF_INET;
4603 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* 'ifr.ifr_addr' is reinterpreted as a struct sockaddr_in. */
4605 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4606 *ip = sin->sin_addr;
4611 /* Returns an AF_PACKET raw socket or a negative errno value. */
4613 af_packet_sock(void)
4615 static int sock = INT_MIN;
4617 if (sock == INT_MIN) {
4618 sock = socket(AF_PACKET, SOCK_RAW, 0);
4620 set_nonblocking(sock);
4623 VLOG_ERR("failed to create packet socket: %s", strerror(errno));