2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/gen_stats.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_tun.h>
28 #include <linux/types.h>
29 #include <linux/ethtool.h>
30 #include <linux/mii.h>
31 #include <linux/pkt_cls.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
51 #include "dpif-linux.h"
52 #include "dynamic-string.h"
53 #include "fatal-signal.h"
56 #include "netdev-provider.h"
57 #include "netdev-vport.h"
58 #include "netlink-notifier.h"
59 #include "netlink-socket.h"
62 #include "openflow/openflow.h"
64 #include "poll-loop.h"
65 #include "rtnetlink-link.h"
67 #include "socket-util.h"
70 #include "unaligned.h"
/* Declares the logging module name used by the VLOG_* calls in this file and
 * the coverage counters it bumps (for example COVERAGE_INC(netdev_get_ethtool)
 * in netdev_linux_get_miimon()). */
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75 COVERAGE_DEFINE(netdev_set_policing);
76 COVERAGE_DEFINE(netdev_arp_lookup);
77 COVERAGE_DEFINE(netdev_get_ifindex);
78 COVERAGE_DEFINE(netdev_get_hwaddr);
79 COVERAGE_DEFINE(netdev_set_hwaddr);
80 COVERAGE_DEFINE(netdev_get_ethtool);
81 COVERAGE_DEFINE(netdev_set_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
/* Fallback values for ADVERTISED_Pause and ADVERTISED_Asym_Pause, supplied
 * only when the included headers have not already defined them. */
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers.  Each one is defined here only when the headers did
 * not already define it. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap (ethtool_value). */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap (ethtool_value). */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
105 #define TC_RTAB_SIZE 1024
/* Link-change notifier shared by every device in this file.  It is created by
 * cache_notifier_ref() when the reference count is zero and destroyed by
 * cache_notifier_unref() when the count drops back to zero. */
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
/* Reference count for 'netdev_linux_cache_notifier'. */
109 static int cache_notifier_refcount;
/* Bits for the 'cache_valid' bitmap in struct netdev_dev_linux.  A cached
 * field may be trusted only while its VALID_* bit is set; code in this file
 * sets a bit after filling the field and clears it to force a re-query. */
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
120 VALID_FEATURES = 1 << 8,
128 /* Traffic control. */
/* Fields of a traffic-control instance: 'ops' is the struct tc_ops whose
 * callbacks operate on it (netdev_linux_destroy() calls ops->tc_destroy), and
 * 'queues' is the hmap that tc_init() initializes and tc_destroy() destroys. */
130 /* An instance of a traffic control class. Always associated with a particular
133 * Each TC implementation subclasses this with whatever additional data it
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
/* Fields of one queue record: 'hmap_node' links it into a struct tc's
 * 'queues' hmap and 'queue_id' is its OpenFlow queue number. */
142 /* One traffic control queue.
144 * Each TC implementation subclasses this with whatever additional data it
147 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
148 unsigned int queue_id; /* OpenFlow queue ID. */
151 /* A particular kind of traffic control. Each implementation generally maps to
152 * one particular Linux qdisc class.
154 * The functions below return 0 if successful or a positive errno value on
155 * failure, except where otherwise noted. All of them must be provided, except
156 * where otherwise noted. */
158 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
159 * This is null for tc_ops_default and tc_ops_other, for which there are no
160 * appropriate values. */
161 const char *linux_name;
163 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
164 const char *ovs_name;
166 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
167 * queues. The queues are numbered 0 through n_queues - 1. */
168 unsigned int n_queues;
170 /* Called to install this TC class on 'netdev'. The implementation should
171 * make the Netlink calls required to set up 'netdev' with the right qdisc
172 * and configure it according to 'details'. The implementation may assume
173 * that the current qdisc is the default; that is, there is no need for it
174 * to delete the current qdisc before installing itself.
176 * The contents of 'details' should be documented as valid for 'ovs_name'
177 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
178 * (which is built as ovs-vswitchd.conf.db(8)).
180 * This function must return 0 if and only if it sets 'netdev->tc' to an
181 * initialized 'struct tc'.
183 * (This function is null for tc_ops_other, which cannot be installed. For
184 * other TC classes it should always be nonnull.) */
185 int (*tc_install)(struct netdev *netdev, const struct smap *details);
187 /* Called when the netdev code determines (through a Netlink query) that
188 * this TC class's qdisc is installed on 'netdev', but we didn't install
189 * it ourselves and so don't know any of the details.
191 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
192 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
193 * implementation should parse the other attributes of 'nlmsg' as
194 * necessary to determine its configuration. If necessary it should also
195 * use Netlink queries to determine the configuration of queues on
198 * This function must return 0 if and only if it sets 'netdev->tc' to an
199 * initialized 'struct tc'. */
200 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
202 /* Destroys the data structures allocated by the implementation as part of
203 * 'tc'. (This includes destroying 'tc->queues' by calling
206 * The implementation should not need to perform any Netlink calls. If
207 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
208 * (But it may not be desirable.)
210 * This function may be null if 'tc' is trivial. */
211 void (*tc_destroy)(struct tc *tc);
213 /* Retrieves details of 'netdev->tc' configuration into 'details'.
215 * The implementation should not need to perform any Netlink calls, because
216 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
217 * cached the configuration.
219 * The contents of 'details' should be documented as valid for 'ovs_name'
220 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
221 * (which is built as ovs-vswitchd.conf.db(8)).
223 * This function may be null if 'tc' is not configurable.
225 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
227 /* Reconfigures 'netdev->tc' according to 'details', performing any
228 * required Netlink calls to complete the reconfiguration.
230 * The contents of 'details' should be documented as valid for 'ovs_name'
231 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
232 * (which is built as ovs-vswitchd.conf.db(8)).
234 * This function may be null if 'tc' is not configurable.
236 int (*qdisc_set)(struct netdev *, const struct smap *details);
238 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
239 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
241 * The contents of 'details' should be documented as valid for 'ovs_name'
242 * in the "other_config" column in the "Queue" table in
243 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
245 * The implementation should not need to perform any Netlink calls, because
246 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
247 * cached the queue configuration.
249 * This function may be null if 'tc' does not have queues ('n_queues' is
251 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
252 struct smap *details);
254 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
255 * 'details', performing any required Netlink calls to complete the
256 * reconfiguration. The caller ensures that 'queue_id' is less than
259 * The contents of 'details' should be documented as valid for 'ovs_name'
260 * in the "other_config" column in the "Queue" table in
261 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
263 * This function may be null if 'tc' does not have queues or its queues are
264 * not configurable. */
265 int (*class_set)(struct netdev *, unsigned int queue_id,
266 const struct smap *details);
268 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
269 * tc_queue's within 'netdev->tc->queues'.
271 * This function may be null if 'tc' does not have queues or its queues
272 * cannot be deleted. */
273 int (*class_delete)(struct netdev *, struct tc_queue *queue);
275 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
276 * 'struct tc_queue's within 'netdev->tc->queues'.
278 * On success, initializes '*stats'.
280 * This function may be null if 'tc' does not have queues or if it cannot
281 * report queue statistics. */
282 int (*class_get_stats)(const struct netdev *netdev,
283 const struct tc_queue *queue,
284 struct netdev_queue_stats *stats);
286 /* Extracts queue stats from 'nlmsg', which is a response to a
287 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
289 * This function may be null if 'tc' does not have queues or if it cannot
290 * report queue statistics. */
291 int (*class_dump_stats)(const struct netdev *netdev,
292 const struct ofpbuf *nlmsg,
293 netdev_dump_queue_stats_cb *cb, void *aux);
/* Prepares 'tc' for use by the implementation 'ops'.  The visible code
 * initializes 'tc->queues' as an empty hmap. */
297 tc_init(struct tc *tc, const struct tc_ops *ops)
300 hmap_init(&tc->queues);
/* Releases the generic part of 'tc' by destroying the 'tc->queues' hmap. */
304 tc_destroy(struct tc *tc)
306 hmap_destroy(&tc->queues);
/* Forward declarations of the traffic-control implementations, followed by
 * the 'tcs' table that lists a pointer to each of them. */
309 static const struct tc_ops tc_ops_htb;
310 static const struct tc_ops tc_ops_hfsc;
311 static const struct tc_ops tc_ops_default;
312 static const struct tc_ops tc_ops_other;
314 static const struct tc_ops *tcs[] = {
315 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
316 &tc_ops_hfsc, /* Hierarchical fair service curve. */
317 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
318 &tc_ops_other, /* Some other qdisc. */
/* Forward declarations of the file-local traffic-control helpers. */

/* Handle packing and unpacking (major/minor). */
322 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
323 static unsigned int tc_get_major(unsigned int handle);
324 static unsigned int tc_get_minor(unsigned int handle);
/* Conversions between ticks, bytes and rates. */
326 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
327 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
328 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
/* Request construction, transactions, and ingress policing. */
330 static struct tcmsg *tc_make_request(const struct netdev *, int type,
331 unsigned int flags, struct ofpbuf *);
332 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
333 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
334 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
/* Parsing and querying of qdiscs and classes. */
337 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
338 struct nlattr **options);
339 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
340 struct nlattr **options,
341 struct netdev_queue_stats *);
342 static int tc_query_class(const struct netdev *,
343 unsigned int handle, unsigned int parent,
344 struct ofpbuf **replyp);
345 static int tc_delete_class(const struct netdev *, unsigned int handle);
347 static int tc_del_qdisc(struct netdev *netdev);
348 static int tc_query_qdisc(const struct netdev *netdev);
/* Rate-table helpers. */
350 static int tc_calc_cell_log(unsigned int mtu);
351 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
352 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
353 const struct tc_ratespec *rate);
354 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Per-device state shared by every open handle on one Linux network device.
 * It embeds the generic 'struct netdev_dev' (netdev_dev_linux_cast() recovers
 * this structure from it) and caches values fetched from the kernel. */
356 struct netdev_dev_linux {
357 struct netdev_dev netdev_dev;
359 struct shash_node *shash_node;
/* Bitmap of VALID_* bits describing which cached fields below are current. */
360 unsigned int cache_valid;
/* Set to 1 by netdev_linux_create(); netdev_dev_linux_changed() tests it for
 * zero. */
361 unsigned int change_seq;
363 bool miimon; /* Link status of last poll. */
364 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
365 struct timer miimon_timer;
367 /* The following are figured out "on demand" only. They are only valid
368 * when the corresponding VALID_* bit in 'cache_valid' is set. */
370 uint8_t etheraddr[ETH_ADDR_LEN];
371 struct in_addr address, netmask;
/* Interface flags as last recorded by netdev_dev_linux_changed(). */
374 unsigned int ifi_flags;
/* Incremented each time IFF_RUNNING differs between old and new flags. */
375 long long int carrier_resets;
376 uint32_t kbits_rate; /* Policing data. */
377 uint32_t kbits_burst;
378 int vport_stats_error; /* Cached error code from vport_get_stats().
379 0 or an errno value. */
380 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
381 int ether_addr_error; /* Cached error code from set/get etheraddr. */
382 int netdev_policing_error; /* Cached error code from set policing. */
383 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
384 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
386 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
387 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
388 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
391 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
/* Tap-specific state; code in this file reaches it as 'state.tap' and uses
 * its 'fd' and 'opened' members. */
395 struct tap_state tap;
/* Per-open handle on a Linux network device.  It embeds the generic
 * 'struct netdev'; netdev_linux_cast() recovers this structure from it.
 * NOTE(review): functions in this file also read and write 'netdev->fd',
 * treating a negative value as "not listening". */
399 struct netdev_linux {
400 struct netdev netdev;
404 /* Sockets used for ioctl operations.  'af_inet_sock' is opened by
 * netdev_linux_init() and stays -1 until then. */
405 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
407 /* A Netlink routing socket that is not subscribed to any multicast groups.
 * Created by netdev_linux_init() with nl_sock_create(NETLINK_ROUTE, ...). */
408 static struct nl_sock *rtnl_sock;
410 /* This is set pretty low because we probably won't learn anything from the
411 * additional log messages. */
412 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
/* Forward declarations of file-local helpers used before their definitions:
 * ioctl/ethtool wrappers, flag and ifindex accessors, address and statistics
 * helpers, and the MII monitor hooks called from netdev_linux_run() and
 * netdev_linux_wait(). */
414 static int netdev_linux_init(void);
416 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
417 int cmd, const char *cmd_name);
418 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
419 const char *cmd_name);
420 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
421 int cmd, const char *cmd_name);
422 static int get_flags(const struct netdev_dev *, unsigned int *flags);
423 static int set_flags(struct netdev *, unsigned int flags);
424 static int do_get_ifindex(const char *netdev_name);
425 static int get_ifindex(const struct netdev *, int *ifindexp);
426 static int do_set_addr(struct netdev *netdev,
427 int ioctl_nr, const char *ioctl_name,
428 struct in_addr addr);
429 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
430 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
431 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
432 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
433 static int af_packet_sock(void);
434 static void netdev_linux_miimon_run(void);
435 static void netdev_linux_miimon_wait(void);
/* Returns true if 'netdev_class' uses netdev_linux_init() as its 'init'
 * callback.  The cast helpers below assert on this before converting a generic
 * object to its Linux-specific container. */
438 is_netdev_linux_class(const struct netdev_class *netdev_class)
440 return netdev_class->init == netdev_linux_init;
/* Converts generic 'netdev_dev' to the struct netdev_dev_linux that contains
 * it, after asserting that its class is one handled by this file. */
443 static struct netdev_dev_linux *
444 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
446 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
447 ovs_assert(is_netdev_linux_class(netdev_class));
449 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts generic 'netdev' to the struct netdev_linux that contains it,
 * after asserting that the class of its underlying device is one handled by
 * this file. */
452 static struct netdev_linux *
453 netdev_linux_cast(const struct netdev *netdev)
455 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
456 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
457 ovs_assert(is_netdev_linux_class(netdev_class));
459 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Class 'init' callback.  Opens the AF_INET datagram socket used for ioctls
 * and the NETLINK_ROUTE socket 'rtnl_sock', logging an error if either step
 * fails.  'status' is a function-local static, so its value persists between
 * calls. */
463 netdev_linux_init(void)
465 static int status = -1;
467 /* Create AF_INET socket. */
468 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
/* socket() reports failure through errno; keep it as a positive errno. */
469 status = af_inet_sock >= 0 ? 0 : errno;
471 VLOG_ERR("failed to create inet socket: %s", strerror(status));
474 /* Create rtnetlink socket. */
476 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
478 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic work for this netdev class: runs the rtnetlink link-change
 * processing and then the MII link monitor. */
487 netdev_linux_run(void)
489 rtnetlink_link_run();
490 netdev_linux_miimon_run();
/* Counterpart of netdev_linux_run(): calls the wait functions of the
 * rtnetlink link-change code and of the MII link monitor. */
494 netdev_linux_wait(void)
496 rtnetlink_link_wait();
497 netdev_linux_miimon_wait();
/* Records that 'dev' changed.  Counts a carrier reset when IFF_RUNNING
 * differs between the stored flags and 'ifi_flags', stores 'ifi_flags', and
 * keeps only the 'cache_valid' bits that are also set in 'mask' (so a mask of
 * 0 invalidates every cached field). */
501 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
502 unsigned int ifi_flags,
506 if (!dev->change_seq) {
/* XOR isolates the flag bits that changed. */
510 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
511 dev->carrier_resets++;
513 dev->ifi_flags = ifi_flags;
515 dev->cache_valid &= mask;
/* Applies the rtnetlink link notification 'change' to 'dev'.  For
 * RTM_NEWLINK the cache is invalidated except for VALID_DRVINFO, and the MTU,
 * Ethernet address (when nonzero) and ifindex are then refreshed from the
 * message with their error codes cleared.  The other visible path only
 * records the new flags and invalidates the whole cache. */
519 netdev_dev_linux_update(struct netdev_dev_linux *dev,
520 const struct rtnetlink_link_change *change)
522 if (change->nlmsg_type == RTM_NEWLINK) {
524 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
526 /* Update netdev from rtnl-change msg. */
528 dev->mtu = change->mtu;
529 dev->cache_valid |= VALID_MTU;
530 dev->netdev_mtu_error = 0;
533 if (!eth_addr_is_zero(change->addr)) {
534 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
535 dev->cache_valid |= VALID_ETHERADDR;
536 dev->ether_addr_error = 0;
539 dev->ifindex = change->ifi_index;
540 dev->cache_valid |= VALID_IFINDEX;
541 dev->get_ifindex_error = 0;
544 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
/* Callback registered with the rtnetlink link notifier by
 * cache_notifier_ref().
 *
 * One path looks up the device named by 'change->ifname' and, if its class
 * belongs to this file, applies 'change' to it.  The other path walks every
 * netdev_linux_class device, re-reads its flags with get_flags(), and
 * invalidates its whole cache.
 *
 * NOTE(review): the condition selecting between the two paths is not visible
 * here; presumably the second runs when 'change' is null -- confirm. */
549 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
550 void *aux OVS_UNUSED)
552 struct netdev_dev_linux *dev;
554 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
556 const struct netdev_class *netdev_class =
557 netdev_dev_get_class(base_dev);
559 if (is_netdev_linux_class(netdev_class)) {
560 dev = netdev_dev_linux_cast(base_dev);
561 netdev_dev_linux_update(dev, change);
565 struct shash device_shash;
566 struct shash_node *node;
568 shash_init(&device_shash);
569 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
570 SHASH_FOR_EACH (node, &device_shash) {
/* The return value of get_flags() is not checked here. */
575 get_flags(&dev->netdev_dev, &flags);
576 netdev_dev_linux_changed(dev, flags, 0);
578 shash_destroy(&device_shash);
/* Takes a reference on the shared link-change notifier.  When the count is
 * zero the notifier is created first, with netdev_linux_cache_cb() as its
 * callback; a failed creation is detected by the null check below. */
583 cache_notifier_ref(void)
585 if (!cache_notifier_refcount) {
586 ovs_assert(!netdev_linux_cache_notifier);
588 netdev_linux_cache_notifier =
589 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
591 if (!netdev_linux_cache_notifier) {
595 cache_notifier_refcount++;
/* Drops one reference on the shared link-change notifier and destroys it
 * when the count reaches zero.  Asserts that a reference is outstanding. */
601 cache_notifier_unref(void)
603 ovs_assert(cache_notifier_refcount > 0);
604 if (!--cache_notifier_refcount) {
605 ovs_assert(netdev_linux_cache_notifier);
606 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
607 netdev_linux_cache_notifier = NULL;
611 /* Creates system and internal devices.
 *
 * Takes a reference on the link-change notifier, allocates a zeroed
 * struct netdev_dev_linux for 'name' with 'change_seq' starting at 1, reads
 * the initial interface flags, and stores the new device in '*netdev_devp'. */
613 netdev_linux_create(const struct netdev_class *class, const char *name,
614 struct netdev_dev **netdev_devp)
616 struct netdev_dev_linux *netdev_dev;
619 error = cache_notifier_ref();
624 netdev_dev = xzalloc(sizeof *netdev_dev);
625 netdev_dev->change_seq = 1;
626 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
/* The result of get_flags() is not checked here. */
627 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
629 *netdev_devp = &netdev_dev->netdev_dev;
633 /* For most types of netdevs we open the device for each call of
634 * netdev_open(). However, this is not the case with tap devices,
635 * since it is only possible to open the device once. In this
636 * situation we share a single file descriptor, and consequently
637 * buffers, across all readers. Therefore once data is read it will
638 * be unavailable to other reads for tap devices.
 *
 * Creates tap device 'name': opens "/dev/net/tun", issues TUNSETIFF with
 * IFF_TAP | IFF_NO_PI, makes the descriptor non-blocking, and stores the new
 * device in '*netdev_devp'.  Failures after the notifier reference is taken
 * jump to 'error_unref_notifier', which drops that reference.
 *
 * NOTE(review): the visible error paths do not close 'state->fd' once it has
 * been opened -- confirm whether the cleanup code handles it. */
640 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
641 const char *name, struct netdev_dev **netdev_devp)
643 struct netdev_dev_linux *netdev_dev;
644 struct tap_state *state;
645 static const char tap_dev[] = "/dev/net/tun";
649 netdev_dev = xzalloc(sizeof *netdev_dev);
650 state = &netdev_dev->state.tap;
652 error = cache_notifier_ref();
657 /* Open tap device. */
658 state->fd = open(tap_dev, O_RDWR);
661 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
662 goto error_unref_notifier;
665 /* Create tap device. */
666 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
667 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
668 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
669 VLOG_WARN("%s: creating tap device failed: %s", name,
672 goto error_unref_notifier;
675 /* Make non-blocking. */
676 error = set_nonblocking(state->fd);
678 goto error_unref_notifier;
681 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
682 *netdev_devp = &netdev_dev->netdev_dev;
685 error_unref_notifier:
686 cache_notifier_unref();
/* Tears down the tap-specific state of 'netdev_dev'; acts only when the tap
 * file descriptor is valid (non-negative). */
693 destroy_tap(struct netdev_dev_linux *netdev_dev)
695 struct tap_state *state = &netdev_dev->state.tap;
697 if (state->fd >= 0) {
702 /* Destroys the netdev device 'netdev_dev_'.
 *
 * Calls the traffic-control implementation's 'tc_destroy' hook when a tc is
 * attached and the hook is non-null, releases tap state for devices of
 * netdev_tap_class, and drops this device's reference on the link-change
 * notifier. */
704 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
706 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
707 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
709 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
710 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
713 if (class == &netdev_tap_class) {
714 destroy_tap(netdev_dev);
718 cache_notifier_unref();
/* Opens a new handle on 'netdev_dev_' and stores it in '*netdevp'.  For every
 * class except netdev_internal_class the device's flags are read first, and
 * an ENODEV result is treated specially (see below).  The trailing
 * netdev_uninit() call belongs to the failure path. */
722 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
724 struct netdev_linux *netdev;
725 enum netdev_flags flags;
728 /* Allocate network device. */
729 netdev = xzalloc(sizeof *netdev);
731 netdev_init(&netdev->netdev, netdev_dev_);
733 /* Verify that the device really exists, by attempting to read its flags.
734 * (The flags might be cached, in which case this won't actually do an
737 * Don't do this for "internal" netdevs, though, because those have to be
738 * created as netdev objects before they exist in the kernel, because
739 * creating them in the kernel happens by passing a netdev object to
740 * dpif_port_add(). */
741 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
742 error = netdev_get_flags(&netdev->netdev, &flags);
743 if (error == ENODEV) {
748 *netdevp = &netdev->netdev;
752 netdev_uninit(&netdev->netdev, true);
756 /* Closes and destroys 'netdev'.
 *
 * The guarded action applies only to non-tap devices: strcmp() returns
 * nonzero when the type is not "tap".
 *
 * NOTE(review): the test is 'fd > 0', while the rest of this file treats
 * 'fd < 0' as "no descriptor"; a descriptor equal to 0 would be skipped here.
 * Confirm whether '>= 0' was intended. */
758 netdev_linux_close(struct netdev *netdev_)
760 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
762 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
/* Prepares 'netdev_' for receiving packets.
 *
 * A handle that already has a descriptor is handled by the first check.  A tap
 * device whose shared descriptor has not been claimed yet takes it over and
 * marks it opened.  Otherwise a non-blocking PF_PACKET/SOCK_RAW socket is
 * created and bound to the device's ifindex for all protocols (ETH_P_ALL). */
769 netdev_linux_listen(struct netdev *netdev_)
771 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
772 struct netdev_dev_linux *netdev_dev =
773 netdev_dev_linux_cast(netdev_get_dev(netdev_));
774 struct sockaddr_ll sll;
779 if (netdev->fd >= 0) {
783 if (!strcmp(netdev_get_type(netdev_), "tap")
784 && !netdev_dev->state.tap.opened) {
785 netdev->fd = netdev_dev->state.tap.fd;
786 netdev_dev->state.tap.opened = true;
790 /* Create file descriptor. */
791 fd = socket(PF_PACKET, SOCK_RAW, 0);
794 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
798 /* Set non-blocking mode. */
799 error = set_nonblocking(fd);
804 /* Get ethernet device index. */
805 error = get_ifindex(&netdev->netdev, &ifindex);
810 /* Bind to specific ethernet device. */
811 memset(&sll, 0, sizeof sll);
812 sll.sll_family = AF_PACKET;
813 sll.sll_ifindex = ifindex;
814 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
815 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
817 VLOG_ERR("%s: failed to bind raw socket (%s)",
818 netdev_get_name(netdev_), strerror(error));
/* Receives one packet from 'netdev_' into the 'size'-byte buffer at 'data'.
 *
 * Tap devices are read with read(); other devices use recv() with MSG_TRUNC,
 * so the returned length can exceed 'size', in which case -EMSGSIZE is
 * returned instead of the length.  A handle that is not listening (negative
 * fd) is rejected up front.  EINTR is not treated as an error, and EAGAIN is
 * not logged; any other errno is logged, rate-limited.
 *
 * The log arguments are passed in the order the format string expects:
 * device name first, then the error text. */
833 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
835 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
837 if (netdev->fd < 0) {
838 /* Device is not listening. */
845 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
846 ? read(netdev->fd, data, size)
847 : recv(netdev->fd, data, size, MSG_TRUNC));
849 return retval <= size ? retval : -EMSGSIZE;
850 } else if (errno != EINTR) {
851 if (errno != EAGAIN) {
852 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
853 netdev_get_name(netdev_), strerror(errno));
860 /* Registers with the poll loop to wake up from the next call to poll_block()
861 * when a packet is ready to be received with netdev_recv() on 'netdev'.
 * Does nothing when the handle has no descriptor (negative fd). */
863 netdev_linux_recv_wait(struct netdev *netdev_)
865 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
866 if (netdev->fd >= 0) {
867 poll_fd_wait(netdev->fd, POLLIN);
871 /* Discards all packets waiting to be received from 'netdev'.
 *
 * For tap devices the transmit queue length is fetched with SIOCGIFTXQLEN
 * and passed to drain_fd() as the amount to drain; other devices drain the
 * socket receive buffer with drain_rcvbuf(). */
873 netdev_linux_drain(struct netdev *netdev_)
875 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
876 if (netdev->fd < 0) {
878 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
880 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
881 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
885 drain_fd(netdev->fd, ifr.ifr_qlen);
888 return drain_rcvbuf(netdev->fd);
892 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
893 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
894 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
895 * the packet is too big or too small to transmit on the device.
897 * The caller retains ownership of 'buffer' in all cases.
899 * The kernel maintains a packet transmission queue, so the caller is not
900 * expected to do additional queuing of packets.
 *
 * The packet is the 'size' bytes at 'data'.  A handle without its own
 * descriptor sends through the shared AF_PACKET socket, addressed by
 * ifindex; otherwise the handle's own descriptor is written directly. */
902 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
904 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
908 if (netdev->fd < 0) {
909 /* Use our AF_PACKET socket to send to this device. */
910 struct sockaddr_ll sll;
917 sock = af_packet_sock();
922 error = get_ifindex(netdev_, &ifindex);
927 /* We don't bother setting most fields in sockaddr_ll because the
928 * kernel ignores them for SOCK_RAW. */
929 memset(&sll, 0, sizeof sll);
930 sll.sll_family = AF_PACKET;
931 sll.sll_ifindex = ifindex;
933 iov.iov_base = CONST_CAST(void *, data);
937 msg.msg_namelen = sizeof sll;
940 msg.msg_control = NULL;
941 msg.msg_controllen = 0;
944 retval = sendmsg(sock, &msg, 0);
946 /* Use the netdev's own fd to send to this device. This is
947 * essential for tap devices, because packets sent to a tap device
948 * with an AF_PACKET socket will loop back to be *received* again
949 * on the tap device. */
950 retval = write(netdev->fd, data, size);
954 /* The Linux AF_PACKET implementation never blocks waiting for room
955 * for packets, instead returning ENOBUFS. Translate this into
956 * EAGAIN for the caller. */
957 if (errno == ENOBUFS) {
959 } else if (errno == EINTR) {
961 } else if (errno != EAGAIN) {
962 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
963 netdev_get_name(netdev_), strerror(errno));
/* A short write is reported separately from an outright failure. */
966 } else if (retval != size) {
967 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
968 "%zu) on %s", retval, size, netdev_get_name(netdev_));
976 /* Registers with the poll loop to wake up from the next call to poll_block()
977 * when the packet transmission queue has sufficient room to transmit a packet
978 * with netdev_send().
980 * The kernel maintains a packet transmission queue, so the client is not
981 * expected to do additional queuing of packets. Thus, this function is
982 * unlikely to ever be used. It is included for completeness.
 *
 * Non-tap devices wait for POLLOUT on their descriptor; tap devices request
 * an immediate wakeup instead. */
984 netdev_linux_send_wait(struct netdev *netdev_)
986 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
987 if (netdev->fd < 0) {
/* strcmp() is nonzero here, i.e. the device type is not "tap". */
989 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
990 poll_fd_wait(netdev->fd, POLLOUT);
992 /* TAP device always accepts packets.*/
993 poll_immediate_wake();
997 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
998 * otherwise a positive errno value.
 *
 * When the cached address is valid, a cached error is returned as is and a
 * request for the address already in place is detected before any kernel
 * call; otherwise the cache entry is invalidated.  Tap devices that are up
 * are brought down around the change.  Only success and ENODEV results are
 * cached. */
1000 netdev_linux_set_etheraddr(struct netdev *netdev_,
1001 const uint8_t mac[ETH_ADDR_LEN])
1003 struct netdev_dev_linux *netdev_dev =
1004 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1006 bool up_again = false;
1008 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1009 if (netdev_dev->ether_addr_error) {
1010 return netdev_dev->ether_addr_error;
1012 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
1015 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1018 /* Tap devices must be brought down before setting the address. */
1019 if (!strcmp(netdev_get_type(netdev_), "tap")) {
1020 enum netdev_flags flags;
1022 if (!netdev_get_flags(netdev_, &flags) && (flags & NETDEV_UP)) {
1023 netdev_turn_flags_off(netdev_, NETDEV_UP, false);
1027 error = set_etheraddr(netdev_get_name(netdev_), mac);
1028 if (!error || error == ENODEV) {
1029 netdev_dev->ether_addr_error = error;
1030 netdev_dev->cache_valid |= VALID_ETHERADDR;
1032 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1037 netdev_turn_flags_on(netdev_, NETDEV_UP, false);
1043 /* Copies 'netdev''s MAC address to 'mac' which is passed as param.
 *
 * The address is fetched with get_etheraddr() only when the cache entry is
 * not valid; the resulting error code is cached alongside it.  Returns the
 * cached error code, and writes 'mac' only when that code is 0. */
1045 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1046 uint8_t mac[ETH_ADDR_LEN])
1048 struct netdev_dev_linux *netdev_dev =
1049 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1051 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1052 int error = get_etheraddr(netdev_get_name(netdev_),
1053 netdev_dev->etheraddr);
1055 netdev_dev->ether_addr_error = error;
1056 netdev_dev->cache_valid |= VALID_ETHERADDR;
1059 if (!netdev_dev->ether_addr_error) {
1060 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1063 return netdev_dev->ether_addr_error;
1066 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1067 * in bytes, not including the hardware header; thus, this is typically 1500
1068 * bytes for Ethernet devices.
 *
 * The value is stored in '*mtup' only when the cached error code is 0; the
 * function's return value is that error code.  SIOCGIFMTU is issued only
 * when the cached MTU is not valid. */
1070 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1072 struct netdev_dev_linux *netdev_dev =
1073 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1074 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1078 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1079 SIOCGIFMTU, "SIOCGIFMTU");
1081 netdev_dev->netdev_mtu_error = error;
1082 netdev_dev->mtu = ifr.ifr_mtu;
1083 netdev_dev->cache_valid |= VALID_MTU;
1086 if (!netdev_dev->netdev_mtu_error) {
1087 *mtup = netdev_dev->mtu;
1089 return netdev_dev->netdev_mtu_error;
1092 /* Sets the maximum size of transmitted (MTU) for given device using linux
1093 * networking ioctl interface. */
/* Sets the MTU of 'netdev_' to 'mtu' with SIOCSIFMTU.  When the cached MTU is
 * valid, a cached error is returned as is and a request for the MTU already
 * in place is detected before any kernel call; otherwise the cache entry is
 * invalidated first.  Only success and ENODEV results are cached. */
1096 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1098 struct netdev_dev_linux *netdev_dev =
1099 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1103 if (netdev_dev->cache_valid & VALID_MTU) {
1104 if (netdev_dev->netdev_mtu_error) {
1105 return netdev_dev->netdev_mtu_error;
1107 if (netdev_dev->mtu == mtu) {
1110 netdev_dev->cache_valid &= ~VALID_MTU;
1113 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1114 SIOCSIFMTU, "SIOCSIFMTU");
1115 if (!error || error == ENODEV) {
1116 netdev_dev->netdev_mtu_error = error;
1117 netdev_dev->mtu = ifr.ifr_mtu;
1118 netdev_dev->cache_valid |= VALID_MTU;
1123 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1124 * On failure, returns a negative errno value.  This wraps get_ifindex(),
 * which reports a positive errno, and negates that error for the caller. */
1126 netdev_linux_get_ifindex(const struct netdev *netdev)
1130 error = get_ifindex(netdev, &ifindex);
1131 return error ? -error : ifindex;
/* Stores the carrier state of 'netdev_' in '*carrier'.  When MII monitoring
 * is enabled (positive 'miimon_interval') the last polled MII link status is
 * used; otherwise the state is the IFF_RUNNING bit of the cached interface
 * flags. */
1135 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1137 struct netdev_dev_linux *netdev_dev =
1138 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1140 if (netdev_dev->miimon_interval > 0) {
1141 *carrier = netdev_dev->miimon;
1143 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the device's 'carrier_resets' counter, which
 * netdev_dev_linux_changed() increments whenever IFF_RUNNING toggles. */
1149 static long long int
1150 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1152 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
/* Runs MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name'.
 * '*data' is copied into a zeroed ifreq at the location of 'ifr_data' before
 * the call and copied back out afterwards, so the caller sees whatever the
 * ioctl wrote there.  The copy-back happens regardless of the ioctl result. */
1156 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1157 struct mii_ioctl_data *data)
1162 memset(&ifr, 0, sizeof ifr);
1163 memcpy(&ifr.ifr_data, data, sizeof *data);
1164 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1165 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 *
 * The MII path issues SIOCGMIIPHY and then SIOCGMIIREG for the MII_BMSR
 * register, reporting the BMSR_LSTATUS bit.  The fallback path queries
 * ETHTOOL_GLINK and reports whether the returned value is nonzero. */
1171 netdev_linux_get_miimon(const char *name, bool *miimon)
1173 struct mii_ioctl_data data;
1178 memset(&data, 0, sizeof data);
1179 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1181 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1182 data.reg_num = MII_BMSR;
1183 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1187 *miimon = !!(data.val_out & BMSR_LSTATUS);
1189 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1192 struct ethtool_cmd ecmd;
1194 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1197 COVERAGE_INC(netdev_get_ethtool);
1198 memset(&ecmd, 0, sizeof ecmd);
1199 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1202 struct ethtool_value eval;
/* Reinterpret the leading bytes of the reply as an ethtool_value. */
1204 memcpy(&eval, &ecmd, sizeof eval);
1205 *miimon = !!eval.data;
1207 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1215 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1216 long long int interval)
1218 struct netdev_dev_linux *netdev_dev;
1220 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1222 interval = interval > 0 ? MAX(interval, 100) : 0;
1223 if (netdev_dev->miimon_interval != interval) {
1224 netdev_dev->miimon_interval = interval;
1225 timer_set_expired(&netdev_dev->miimon_timer);
1232 netdev_linux_miimon_run(void)
1234 struct shash device_shash;
1235 struct shash_node *node;
1237 shash_init(&device_shash);
1238 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1239 SHASH_FOR_EACH (node, &device_shash) {
1240 struct netdev_dev_linux *dev = node->data;
1243 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1247 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1248 if (miimon != dev->miimon) {
1249 dev->miimon = miimon;
1250 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1253 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1256 shash_destroy(&device_shash);
1260 netdev_linux_miimon_wait(void)
1262 struct shash device_shash;
1263 struct shash_node *node;
1265 shash_init(&device_shash);
1266 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1267 SHASH_FOR_EACH (node, &device_shash) {
1268 struct netdev_dev_linux *dev = node->data;
1270 if (dev->miimon_interval > 0) {
1271 timer_wait(&dev->miimon_timer);
1274 shash_destroy(&device_shash);
1277 /* Checks whether we can use RTM_GETLINK to get network device statistics.
1278 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1281 check_for_working_netlink_stats(void)
1283 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1284 * preferable, so if that works, we'll use it. */
1285 int ifindex = do_get_ifindex("lo");
1287 VLOG_WARN("failed to get ifindex for lo, "
1288 "obtaining netdev stats from proc");
1291 struct netdev_stats stats;
1292 int error = get_stats_via_netlink(ifindex, &stats);
1294 VLOG_DBG("obtaining netdev stats via rtnetlink");
1297 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1298 "via proc (you are probably running a pre-2.6.19 "
1299 "kernel)", strerror(error));
1306 swap_uint64(uint64_t *a, uint64_t *b)
1313 /* Copies 'src' into 'dst', performing format conversion in the process.
1315 * 'src' is allowed to be misaligned. */
1317 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1318 const struct ovs_vport_stats *src)
1320 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1321 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1322 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1323 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1324 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1325 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1326 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1327 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1329 dst->collisions = 0;
1330 dst->rx_length_errors = 0;
1331 dst->rx_over_errors = 0;
1332 dst->rx_crc_errors = 0;
1333 dst->rx_frame_errors = 0;
1334 dst->rx_fifo_errors = 0;
1335 dst->rx_missed_errors = 0;
1336 dst->tx_aborted_errors = 0;
1337 dst->tx_carrier_errors = 0;
1338 dst->tx_fifo_errors = 0;
1339 dst->tx_heartbeat_errors = 0;
1340 dst->tx_window_errors = 0;
1344 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1346 struct dpif_linux_vport reply;
1350 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1353 } else if (!reply.stats) {
1358 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1366 get_stats_via_vport(const struct netdev *netdev_,
1367 struct netdev_stats *stats)
1369 struct netdev_dev_linux *netdev_dev =
1370 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1372 if (!netdev_dev->vport_stats_error ||
1373 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1376 error = get_stats_via_vport__(netdev_, stats);
1377 if (error && error != ENOENT) {
1378 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1379 "(%s)", netdev_get_name(netdev_), strerror(error));
1381 netdev_dev->vport_stats_error = error;
1382 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
1387 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1388 struct netdev_stats *stats)
1390 static int use_netlink_stats = -1;
1393 if (use_netlink_stats < 0) {
1394 use_netlink_stats = check_for_working_netlink_stats();
1397 if (use_netlink_stats) {
1400 error = get_ifindex(netdev_, &ifindex);
1402 error = get_stats_via_netlink(ifindex, stats);
1405 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1409 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1410 netdev_get_name(netdev_), error);
1416 /* Retrieves current device stats for 'netdev-linux'. */
1418 netdev_linux_get_stats(const struct netdev *netdev_,
1419 struct netdev_stats *stats)
1421 struct netdev_dev_linux *netdev_dev =
1422 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1423 struct netdev_stats dev_stats;
1426 get_stats_via_vport(netdev_, stats);
1428 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1431 if (netdev_dev->vport_stats_error) {
1438 if (netdev_dev->vport_stats_error) {
1439 /* Stats are not available from OVS, so use the ioctl stats. */
1442 stats->rx_errors += dev_stats.rx_errors;
1443 stats->tx_errors += dev_stats.tx_errors;
1444 stats->rx_dropped += dev_stats.rx_dropped;
1445 stats->tx_dropped += dev_stats.tx_dropped;
1446 stats->multicast += dev_stats.multicast;
1447 stats->collisions += dev_stats.collisions;
1448 stats->rx_length_errors += dev_stats.rx_length_errors;
1449 stats->rx_over_errors += dev_stats.rx_over_errors;
1450 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1451 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1452 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1453 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1454 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1455 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1456 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1457 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1458 stats->tx_window_errors += dev_stats.tx_window_errors;
1463 /* Retrieves current device stats for 'netdev-tap' netdev or
1464 * netdev-internal. */
1466 netdev_tap_get_stats(const struct netdev *netdev_,
1467 struct netdev_stats *stats)
1469 struct netdev_dev_linux *netdev_dev =
1470 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1471 struct netdev_stats dev_stats;
1474 get_stats_via_vport(netdev_, stats);
1476 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1478 if (netdev_dev->vport_stats_error) {
1485 /* If this port is an internal port then the transmit and receive stats
1486 * will appear to be swapped relative to the other ports since we are the
1487 * one sending the data, not a remote computer. For consistency, we swap
1488 * them back here. This does not apply if we are getting stats from the
1489 * vport layer because it always tracks stats from the perspective of the
1491 if (netdev_dev->vport_stats_error) {
1493 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1494 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1495 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1496 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1497 stats->rx_length_errors = 0;
1498 stats->rx_over_errors = 0;
1499 stats->rx_crc_errors = 0;
1500 stats->rx_frame_errors = 0;
1501 stats->rx_fifo_errors = 0;
1502 stats->rx_missed_errors = 0;
1503 stats->tx_aborted_errors = 0;
1504 stats->tx_carrier_errors = 0;
1505 stats->tx_fifo_errors = 0;
1506 stats->tx_heartbeat_errors = 0;
1507 stats->tx_window_errors = 0;
1509 stats->rx_dropped += dev_stats.tx_dropped;
1510 stats->tx_dropped += dev_stats.rx_dropped;
1512 stats->rx_errors += dev_stats.tx_errors;
1513 stats->tx_errors += dev_stats.rx_errors;
1515 stats->multicast += dev_stats.multicast;
1516 stats->collisions += dev_stats.collisions;
1522 netdev_internal_get_stats(const struct netdev *netdev_,
1523 struct netdev_stats *stats)
1525 struct netdev_dev_linux *netdev_dev =
1526 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1528 get_stats_via_vport(netdev_, stats);
1529 return netdev_dev->vport_stats_error;
1533 netdev_internal_set_stats(struct netdev *netdev,
1534 const struct netdev_stats *stats)
1536 struct ovs_vport_stats vport_stats;
1537 struct dpif_linux_vport vport;
1540 vport_stats.rx_packets = stats->rx_packets;
1541 vport_stats.tx_packets = stats->tx_packets;
1542 vport_stats.rx_bytes = stats->rx_bytes;
1543 vport_stats.tx_bytes = stats->tx_bytes;
1544 vport_stats.rx_errors = stats->rx_errors;
1545 vport_stats.tx_errors = stats->tx_errors;
1546 vport_stats.rx_dropped = stats->rx_dropped;
1547 vport_stats.tx_dropped = stats->tx_dropped;
1549 dpif_linux_vport_init(&vport);
1550 vport.cmd = OVS_VPORT_CMD_SET;
1551 vport.name = netdev_get_name(netdev);
1552 vport.stats = &vport_stats;
1554 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1556 /* If the vport layer doesn't know about the device, that doesn't mean it
1557 * doesn't exist (after all, we were able to open it when netdev_open() was
1558 * called), it just means that it isn't attached and we'll be getting
1559 * stats a different way. */
1560 if (err == ENODEV) {
1568 netdev_linux_read_features(struct netdev_dev_linux *netdev_dev)
1570 struct ethtool_cmd ecmd;
1574 if (netdev_dev->cache_valid & VALID_FEATURES) {
1578 COVERAGE_INC(netdev_get_ethtool);
1579 memset(&ecmd, 0, sizeof ecmd);
1580 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name, &ecmd,
1581 ETHTOOL_GSET, "ETHTOOL_GSET");
1586 /* Supported features. */
1587 netdev_dev->supported = 0;
1588 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1589 netdev_dev->supported |= NETDEV_F_10MB_HD;
1591 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1592 netdev_dev->supported |= NETDEV_F_10MB_FD;
1594 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1595 netdev_dev->supported |= NETDEV_F_100MB_HD;
1597 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1598 netdev_dev->supported |= NETDEV_F_100MB_FD;
1600 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1601 netdev_dev->supported |= NETDEV_F_1GB_HD;
1603 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1604 netdev_dev->supported |= NETDEV_F_1GB_FD;
1606 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1607 netdev_dev->supported |= NETDEV_F_10GB_FD;
1609 if (ecmd.supported & SUPPORTED_TP) {
1610 netdev_dev->supported |= NETDEV_F_COPPER;
1612 if (ecmd.supported & SUPPORTED_FIBRE) {
1613 netdev_dev->supported |= NETDEV_F_FIBER;
1615 if (ecmd.supported & SUPPORTED_Autoneg) {
1616 netdev_dev->supported |= NETDEV_F_AUTONEG;
1618 if (ecmd.supported & SUPPORTED_Pause) {
1619 netdev_dev->supported |= NETDEV_F_PAUSE;
1621 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1622 netdev_dev->supported |= NETDEV_F_PAUSE_ASYM;
1625 /* Advertised features. */
1626 netdev_dev->advertised = 0;
1627 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1628 netdev_dev->advertised |= NETDEV_F_10MB_HD;
1630 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1631 netdev_dev->advertised |= NETDEV_F_10MB_FD;
1633 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1634 netdev_dev->advertised |= NETDEV_F_100MB_HD;
1636 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1637 netdev_dev->advertised |= NETDEV_F_100MB_FD;
1639 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1640 netdev_dev->advertised |= NETDEV_F_1GB_HD;
1642 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1643 netdev_dev->advertised |= NETDEV_F_1GB_FD;
1645 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1646 netdev_dev->advertised |= NETDEV_F_10GB_FD;
1648 if (ecmd.advertising & ADVERTISED_TP) {
1649 netdev_dev->advertised |= NETDEV_F_COPPER;
1651 if (ecmd.advertising & ADVERTISED_FIBRE) {
1652 netdev_dev->advertised |= NETDEV_F_FIBER;
1654 if (ecmd.advertising & ADVERTISED_Autoneg) {
1655 netdev_dev->advertised |= NETDEV_F_AUTONEG;
1657 if (ecmd.advertising & ADVERTISED_Pause) {
1658 netdev_dev->advertised |= NETDEV_F_PAUSE;
1660 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1661 netdev_dev->advertised |= NETDEV_F_PAUSE_ASYM;
1664 /* Current settings. */
1666 if (speed == SPEED_10) {
1667 netdev_dev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1668 } else if (speed == SPEED_100) {
1669 netdev_dev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1670 } else if (speed == SPEED_1000) {
1671 netdev_dev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1672 } else if (speed == SPEED_10000) {
1673 netdev_dev->current = NETDEV_F_10GB_FD;
1674 } else if (speed == 40000) {
1675 netdev_dev->current = NETDEV_F_40GB_FD;
1676 } else if (speed == 100000) {
1677 netdev_dev->current = NETDEV_F_100GB_FD;
1678 } else if (speed == 1000000) {
1679 netdev_dev->current = NETDEV_F_1TB_FD;
1681 netdev_dev->current = 0;
1684 if (ecmd.port == PORT_TP) {
1685 netdev_dev->current |= NETDEV_F_COPPER;
1686 } else if (ecmd.port == PORT_FIBRE) {
1687 netdev_dev->current |= NETDEV_F_FIBER;
1691 netdev_dev->current |= NETDEV_F_AUTONEG;
1694 /* Peer advertisements. */
1695 netdev_dev->peer = 0; /* XXX */
1698 netdev_dev->cache_valid |= VALID_FEATURES;
1699 netdev_dev->get_features_error = error;
1702 /* Stores the features supported by 'netdev' into each of '*current',
1703 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1704 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1707 netdev_linux_get_features(const struct netdev *netdev_,
1708 enum netdev_features *current,
1709 enum netdev_features *advertised,
1710 enum netdev_features *supported,
1711 enum netdev_features *peer)
1713 struct netdev_dev_linux *netdev_dev =
1714 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1716 netdev_linux_read_features(netdev_dev);
1718 if (!netdev_dev->get_features_error) {
1719 *current = netdev_dev->current;
1720 *advertised = netdev_dev->advertised;
1721 *supported = netdev_dev->supported;
1722 *peer = netdev_dev->peer;
1724 return netdev_dev->get_features_error;
1727 /* Set the features advertised by 'netdev' to 'advertise'. */
1729 netdev_linux_set_advertisements(struct netdev *netdev,
1730 enum netdev_features advertise)
1732 struct ethtool_cmd ecmd;
1735 COVERAGE_INC(netdev_get_ethtool);
1736 memset(&ecmd, 0, sizeof ecmd);
1737 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1738 ETHTOOL_GSET, "ETHTOOL_GSET");
1743 ecmd.advertising = 0;
1744 if (advertise & NETDEV_F_10MB_HD) {
1745 ecmd.advertising |= ADVERTISED_10baseT_Half;
1747 if (advertise & NETDEV_F_10MB_FD) {
1748 ecmd.advertising |= ADVERTISED_10baseT_Full;
1750 if (advertise & NETDEV_F_100MB_HD) {
1751 ecmd.advertising |= ADVERTISED_100baseT_Half;
1753 if (advertise & NETDEV_F_100MB_FD) {
1754 ecmd.advertising |= ADVERTISED_100baseT_Full;
1756 if (advertise & NETDEV_F_1GB_HD) {
1757 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1759 if (advertise & NETDEV_F_1GB_FD) {
1760 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1762 if (advertise & NETDEV_F_10GB_FD) {
1763 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1765 if (advertise & NETDEV_F_COPPER) {
1766 ecmd.advertising |= ADVERTISED_TP;
1768 if (advertise & NETDEV_F_FIBER) {
1769 ecmd.advertising |= ADVERTISED_FIBRE;
1771 if (advertise & NETDEV_F_AUTONEG) {
1772 ecmd.advertising |= ADVERTISED_Autoneg;
1774 if (advertise & NETDEV_F_PAUSE) {
1775 ecmd.advertising |= ADVERTISED_Pause;
1777 if (advertise & NETDEV_F_PAUSE_ASYM) {
1778 ecmd.advertising |= ADVERTISED_Asym_Pause;
1780 COVERAGE_INC(netdev_set_ethtool);
1781 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1782 ETHTOOL_SSET, "ETHTOOL_SSET");
1785 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1786 * successful, otherwise a positive errno value. */
1788 netdev_linux_set_policing(struct netdev *netdev,
1789 uint32_t kbits_rate, uint32_t kbits_burst)
1791 struct netdev_dev_linux *netdev_dev =
1792 netdev_dev_linux_cast(netdev_get_dev(netdev));
1793 const char *netdev_name = netdev_get_name(netdev);
1797 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1798 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1799 : kbits_burst); /* Stick with user-specified value. */
1801 if (netdev_dev->cache_valid & VALID_POLICING) {
1802 if (netdev_dev->netdev_policing_error) {
1803 return netdev_dev->netdev_policing_error;
1806 if (netdev_dev->kbits_rate == kbits_rate &&
1807 netdev_dev->kbits_burst == kbits_burst) {
1808 /* Assume that settings haven't changed since we last set them. */
1811 netdev_dev->cache_valid &= ~VALID_POLICING;
1814 COVERAGE_INC(netdev_set_policing);
1815 /* Remove any existing ingress qdisc. */
1816 error = tc_add_del_ingress_qdisc(netdev, false);
1818 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1819 netdev_name, strerror(error));
1824 error = tc_add_del_ingress_qdisc(netdev, true);
1826 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1827 netdev_name, strerror(error));
1831 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1833 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1834 netdev_name, strerror(error));
1839 netdev_dev->kbits_rate = kbits_rate;
1840 netdev_dev->kbits_burst = kbits_burst;
1843 if (!error || error == ENODEV) {
1844 netdev_dev->netdev_policing_error = error;
1845 netdev_dev->cache_valid |= VALID_POLICING;
1851 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1854 const struct tc_ops **opsp;
1856 for (opsp = tcs; *opsp != NULL; opsp++) {
1857 const struct tc_ops *ops = *opsp;
1858 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1859 sset_add(types, ops->ovs_name);
1865 static const struct tc_ops *
1866 tc_lookup_ovs_name(const char *name)
1868 const struct tc_ops **opsp;
1870 for (opsp = tcs; *opsp != NULL; opsp++) {
1871 const struct tc_ops *ops = *opsp;
1872 if (!strcmp(name, ops->ovs_name)) {
1879 static const struct tc_ops *
1880 tc_lookup_linux_name(const char *name)
1882 const struct tc_ops **opsp;
1884 for (opsp = tcs; *opsp != NULL; opsp++) {
1885 const struct tc_ops *ops = *opsp;
1886 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1893 static struct tc_queue *
1894 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1897 struct netdev_dev_linux *netdev_dev =
1898 netdev_dev_linux_cast(netdev_get_dev(netdev));
1899 struct tc_queue *queue;
1901 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1902 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the hash of
 * 'queue_id' itself, using hash_int() with a basis of 0. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    uint32_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1916 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1918 struct netdev_qos_capabilities *caps)
1920 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1924 caps->n_queues = ops->n_queues;
1929 netdev_linux_get_qos(const struct netdev *netdev,
1930 const char **typep, struct smap *details)
1932 struct netdev_dev_linux *netdev_dev =
1933 netdev_dev_linux_cast(netdev_get_dev(netdev));
1936 error = tc_query_qdisc(netdev);
1941 *typep = netdev_dev->tc->ops->ovs_name;
1942 return (netdev_dev->tc->ops->qdisc_get
1943 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1948 netdev_linux_set_qos(struct netdev *netdev,
1949 const char *type, const struct smap *details)
1951 struct netdev_dev_linux *netdev_dev =
1952 netdev_dev_linux_cast(netdev_get_dev(netdev));
1953 const struct tc_ops *new_ops;
1956 new_ops = tc_lookup_ovs_name(type);
1957 if (!new_ops || !new_ops->tc_install) {
1961 error = tc_query_qdisc(netdev);
1966 if (new_ops == netdev_dev->tc->ops) {
1967 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1969 /* Delete existing qdisc. */
1970 error = tc_del_qdisc(netdev);
1974 ovs_assert(netdev_dev->tc == NULL);
1976 /* Install new qdisc. */
1977 error = new_ops->tc_install(netdev, details);
1978 ovs_assert((error == 0) == (netdev_dev->tc != NULL));
1985 netdev_linux_get_queue(const struct netdev *netdev,
1986 unsigned int queue_id, struct smap *details)
1988 struct netdev_dev_linux *netdev_dev =
1989 netdev_dev_linux_cast(netdev_get_dev(netdev));
1992 error = tc_query_qdisc(netdev);
1996 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1998 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
2004 netdev_linux_set_queue(struct netdev *netdev,
2005 unsigned int queue_id, const struct smap *details)
2007 struct netdev_dev_linux *netdev_dev =
2008 netdev_dev_linux_cast(netdev_get_dev(netdev));
2011 error = tc_query_qdisc(netdev);
2014 } else if (queue_id >= netdev_dev->tc->ops->n_queues
2015 || !netdev_dev->tc->ops->class_set) {
2019 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
2023 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
2025 struct netdev_dev_linux *netdev_dev =
2026 netdev_dev_linux_cast(netdev_get_dev(netdev));
2029 error = tc_query_qdisc(netdev);
2032 } else if (!netdev_dev->tc->ops->class_delete) {
2035 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2037 ? netdev_dev->tc->ops->class_delete(netdev, queue)
2043 netdev_linux_get_queue_stats(const struct netdev *netdev,
2044 unsigned int queue_id,
2045 struct netdev_queue_stats *stats)
2047 struct netdev_dev_linux *netdev_dev =
2048 netdev_dev_linux_cast(netdev_get_dev(netdev));
2051 error = tc_query_qdisc(netdev);
2054 } else if (!netdev_dev->tc->ops->class_get_stats) {
2057 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2059 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
2065 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2067 struct ofpbuf request;
2068 struct tcmsg *tcmsg;
2070 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2074 tcmsg->tcm_parent = 0;
2075 nl_dump_start(dump, rtnl_sock, &request);
2076 ofpbuf_uninit(&request);
2081 netdev_linux_dump_queues(const struct netdev *netdev,
2082 netdev_dump_queues_cb *cb, void *aux)
2084 struct netdev_dev_linux *netdev_dev =
2085 netdev_dev_linux_cast(netdev_get_dev(netdev));
2086 struct tc_queue *queue, *next_queue;
2087 struct smap details;
2091 error = tc_query_qdisc(netdev);
2094 } else if (!netdev_dev->tc->ops->class_get) {
2099 smap_init(&details);
2100 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2101 &netdev_dev->tc->queues) {
2102 smap_clear(&details);
2104 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
2106 (*cb)(queue->queue_id, &details, aux);
2111 smap_destroy(&details);
2117 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2118 netdev_dump_queue_stats_cb *cb, void *aux)
2120 struct netdev_dev_linux *netdev_dev =
2121 netdev_dev_linux_cast(netdev_get_dev(netdev));
2122 struct nl_dump dump;
2127 error = tc_query_qdisc(netdev);
2130 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2135 if (!start_queue_dump(netdev, &dump)) {
2138 while (nl_dump_next(&dump, &msg)) {
2139 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2145 error = nl_dump_done(&dump);
2146 return error ? error : last_error;
2150 netdev_linux_get_in4(const struct netdev *netdev_,
2151 struct in_addr *address, struct in_addr *netmask)
2153 struct netdev_dev_linux *netdev_dev =
2154 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2156 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2159 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2160 SIOCGIFADDR, "SIOCGIFADDR");
2165 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2166 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2171 netdev_dev->cache_valid |= VALID_IN4;
2173 *address = netdev_dev->address;
2174 *netmask = netdev_dev->netmask;
2175 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2179 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2180 struct in_addr netmask)
2182 struct netdev_dev_linux *netdev_dev =
2183 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2186 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2188 netdev_dev->cache_valid |= VALID_IN4;
2189 netdev_dev->address = address;
2190 netdev_dev->netmask = netmask;
2191 if (address.s_addr != INADDR_ANY) {
2192 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2193 "SIOCSIFNETMASK", netmask);
/* Parses 'line', which is expected to consist of 32 hex digits (an IPv6
 * address), four further hex fields that are skipped, and an interface name
 * of at most 16 characters.
 *
 * On a full match, stores the 16 address bytes in '*in6' and the name in
 * 'ifname', and returns true.  Returns false if fewer than all 17 fields were
 * converted, in which case '*in6' and 'ifname' may be partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *bytes = in6->s6_addr;
    int n_converted;

#define X8 "%2"SCNx8
    n_converted = sscanf(line,
                         " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                         "%*x %*x %*x %*x %16s\n",
                         &bytes[0], &bytes[1], &bytes[2], &bytes[3],
                         &bytes[4], &bytes[5], &bytes[6], &bytes[7],
                         &bytes[8], &bytes[9], &bytes[10], &bytes[11],
                         &bytes[12], &bytes[13], &bytes[14], &bytes[15],
                         ifname);

    /* 16 address bytes plus the interface name. */
    return n_converted == 17;
}
2215 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2216 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2218 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2220 struct netdev_dev_linux *netdev_dev =
2221 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2222 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2226 netdev_dev->in6 = in6addr_any;
2228 file = fopen("/proc/net/if_inet6", "r");
2230 const char *name = netdev_get_name(netdev_);
2231 while (fgets(line, sizeof line, file)) {
2232 struct in6_addr in6_tmp;
2233 char ifname[16 + 1];
2234 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2235 && !strcmp(name, ifname))
2237 netdev_dev->in6 = in6_tmp;
2243 netdev_dev->cache_valid |= VALID_IN6;
2245 *in6 = netdev_dev->in6;
/* Overwrites '*sa' with an AF_INET socket address that carries 'addr'.
 *
 * The address is first built in a zeroed 'struct sockaddr_in', so the port
 * and any remaining bytes are 0, and is then copied over a zeroed '*sa'. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2263 do_set_addr(struct netdev *netdev,
2264 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2267 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2268 make_in4_sockaddr(&ifr.ifr_addr, addr);
2270 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2274 /* Adds 'router' as a default IP gateway. */
2276 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2278 struct in_addr any = { INADDR_ANY };
2282 memset(&rt, 0, sizeof rt);
2283 make_in4_sockaddr(&rt.rt_dst, any);
2284 make_in4_sockaddr(&rt.rt_gateway, router);
2285 make_in4_sockaddr(&rt.rt_genmask, any);
2286 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2287 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2289 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2295 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2298 static const char fn[] = "/proc/net/route";
2303 *netdev_name = NULL;
2304 stream = fopen(fn, "r");
2305 if (stream == NULL) {
2306 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2311 while (fgets(line, sizeof line, stream)) {
2314 ovs_be32 dest, gateway, mask;
2315 int refcnt, metric, mtu;
2316 unsigned int flags, use, window, irtt;
2319 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2321 iface, &dest, &gateway, &flags, &refcnt,
2322 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2324 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2328 if (!(flags & RTF_UP)) {
2329 /* Skip routes that aren't up. */
2333 /* The output of 'dest', 'mask', and 'gateway' were given in
2334 * network byte order, so we don't need any endian
2335 * conversions here. */
2336 if ((dest & mask) == (host->s_addr & mask)) {
2338 /* The host is directly reachable. */
2339 next_hop->s_addr = 0;
2341 /* To reach the host, we must go through a gateway. */
2342 next_hop->s_addr = gateway;
2344 *netdev_name = xstrdup(iface);
2356 netdev_linux_get_status(const struct netdev *netdev, struct smap *smap)
2358 struct netdev_dev_linux *netdev_dev;
2361 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2362 if (!(netdev_dev->cache_valid & VALID_DRVINFO)) {
2363 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev_dev->drvinfo;
2365 COVERAGE_INC(netdev_get_ethtool);
2366 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
2367 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
2370 "ETHTOOL_GDRVINFO");
2372 netdev_dev->cache_valid |= VALID_DRVINFO;
2377 smap_add(smap, "driver_name", netdev_dev->drvinfo.driver);
2378 smap_add(smap, "driver_version", netdev_dev->drvinfo.version);
2379 smap_add(smap, "firmware_version", netdev_dev->drvinfo.fw_version);
2385 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2388 smap_add(smap, "driver_name", "openvswitch");
2392 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2393 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2394 * returns 0. Otherwise, it returns a positive errno value; in particular,
2395 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2397 netdev_linux_arp_lookup(const struct netdev *netdev,
2398 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2401 struct sockaddr_in sin;
2404 memset(&r, 0, sizeof r);
2405 memset(&sin, 0, sizeof sin);
2406 sin.sin_family = AF_INET;
2407 sin.sin_addr.s_addr = ip;
2409 memcpy(&r.arp_pa, &sin, sizeof sin);
2410 r.arp_ha.sa_family = ARPHRD_ETHER;
2412 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2413 COVERAGE_INC(netdev_arp_lookup);
2414 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2416 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2417 } else if (retval != ENXIO) {
2418 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2419 netdev_get_name(netdev), IP_ARGS(ip), strerror(retval));
2425 nd_to_iff_flags(enum netdev_flags nd)
2428 if (nd & NETDEV_UP) {
2431 if (nd & NETDEV_PROMISC) {
2438 iff_to_nd_flags(int iff)
2440 enum netdev_flags nd = 0;
2444 if (iff & IFF_PROMISC) {
2445 nd |= NETDEV_PROMISC;
2451 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2452 enum netdev_flags on, enum netdev_flags *old_flagsp)
2454 struct netdev_dev_linux *netdev_dev;
2455 int old_flags, new_flags;
2458 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2459 old_flags = netdev_dev->ifi_flags;
2460 *old_flagsp = iff_to_nd_flags(old_flags);
2461 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2462 if (new_flags != old_flags) {
2463 error = set_flags(netdev, new_flags);
2464 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
2470 netdev_linux_change_seq(const struct netdev *netdev)
2472 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Member list shared by the Linux 'struct netdev_class' definitions.  Only
 * the macro arguments (NAME, CREATE, GET_STATS, SET_STATS, GET_FEATURES,
 * GET_STATUS) vary between classes; every other slot listed here is a fixed
 * netdev_linux_* function or NULL. */
2475 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2476 GET_FEATURES, GET_STATUS) \
2480 netdev_linux_init, \
2482 netdev_linux_wait, \
2485 netdev_linux_destroy, \
2486 NULL, /* get_config */ \
2487 NULL, /* set_config */ \
2488 NULL, /* get_tunnel_config */ \
2490 netdev_linux_open, \
2491 netdev_linux_close, \
2493 netdev_linux_listen, \
2494 netdev_linux_recv, \
2495 netdev_linux_recv_wait, \
2496 netdev_linux_drain, \
2498 netdev_linux_send, \
2499 netdev_linux_send_wait, \
2501 netdev_linux_set_etheraddr, \
2502 netdev_linux_get_etheraddr, \
2503 netdev_linux_get_mtu, \
2504 netdev_linux_set_mtu, \
2505 netdev_linux_get_ifindex, \
2506 netdev_linux_get_carrier, \
2507 netdev_linux_get_carrier_resets, \
2508 netdev_linux_set_miimon_interval, \
2513 netdev_linux_set_advertisements, \
2515 netdev_linux_set_policing, \
2516 netdev_linux_get_qos_types, \
2517 netdev_linux_get_qos_capabilities, \
2518 netdev_linux_get_qos, \
2519 netdev_linux_set_qos, \
2520 netdev_linux_get_queue, \
2521 netdev_linux_set_queue, \
2522 netdev_linux_delete_queue, \
2523 netdev_linux_get_queue_stats, \
2524 netdev_linux_dump_queues, \
2525 netdev_linux_dump_queue_stats, \
2527 netdev_linux_get_in4, \
2528 netdev_linux_set_in4, \
2529 netdev_linux_get_in6, \
2530 netdev_linux_add_router, \
2531 netdev_linux_get_next_hop, \
2533 netdev_linux_arp_lookup, \
2535 netdev_linux_update_flags, \
2537 netdev_linux_change_seq \
/* Netdev class created with netdev_linux_create().  It reports statistics,
 * features and status through the netdev_linux_* getters and provides no
 * set_stats hook. */
2540 const struct netdev_class netdev_linux_class =
2543 netdev_linux_create,
2544 netdev_linux_get_stats,
2545 NULL, /* set_stats */
2546 netdev_linux_get_features,
2547 netdev_linux_get_status);
/* Netdev class for tap devices: created with netdev_linux_create_tap() and
 * reporting statistics through netdev_tap_get_stats().  Features and status
 * use the same getters as netdev_linux_class; there is no set_stats hook. */
2549 const struct netdev_class netdev_tap_class =
2552 netdev_linux_create_tap,
2553 netdev_tap_get_stats,
2554 NULL, /* set_stats */
2555 netdev_linux_get_features,
2556 netdev_linux_get_status);
/* Netdev class for internal devices: created with netdev_linux_create(), with
 * its own get_stats, set_stats and get_status implementations and no
 * get_features hook. */
2558 const struct netdev_class netdev_internal_class =
2561 netdev_linux_create,
2562 netdev_internal_get_stats,
2563 netdev_internal_set_stats,
2564 NULL, /* get_features */
2565 netdev_internal_get_status);
2567 /* HTB traffic control class. */
/* HTB_N_QUEUES is the 'n_queues' entry of tc_ops_htb and the largest class
 * minor number that this file accepts as a queue: class 1:N is queue N - 1 for
 * 1 <= N <= HTB_N_QUEUES.
 *
 * The 'max_rate' member that follows is presumably the qdisc-wide rate kept
 * in 'struct htb' (read as htb->max_rate elsewhere) -- TODO confirm. */
2569 #define HTB_N_QUEUES 0xf000
2573 unsigned int max_rate; /* In bytes/s. */
/* Parameters of one HTB class.  'tc_queue' is the embedded member through
 * which htb_class_cast__() recovers the containing 'struct htb_class'. */
2577 struct tc_queue tc_queue;
2578 unsigned int min_rate; /* In bytes/s. */
2579 unsigned int max_rate; /* In bytes/s. */
2580 unsigned int burst; /* In bytes. */
2581 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' whose 'tc' member is the 'tc' pointer of the device
 * underlying 'netdev'.  This is a plain CONTAINER_OF conversion, so it is
 * valid only while that 'tc' really belongs to a 'struct htb'. */
2585 htb_get__(const struct netdev *netdev)
2587 struct netdev_dev_linux *netdev_dev =
2588 netdev_dev_linux_cast(netdev_get_dev(netdev));
2589 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a 'struct htb', initializes its 'tc' with tc_ops_htb, records
 * 'max_rate' in it and makes it the 'tc' of the device underlying 'netdev'.
 *
 * NOTE(review): 'max_rate' is uint64_t here while the rate members shown in
 * this file are 'unsigned int'; if htb->max_rate is also 'unsigned int', large
 * values are silently narrowed -- confirm. */
2593 htb_install__(struct netdev *netdev, uint64_t max_rate)
2595 struct netdev_dev_linux *netdev_dev =
2596 netdev_dev_linux_cast(netdev_get_dev(netdev));
2599 htb = xmalloc(sizeof *htb);
2600 tc_init(&htb->tc, &tc_ops_htb);
2601 htb->max_rate = max_rate;
2603 netdev_dev->tc = &htb->tc;
/* Replaces the qdisc on 'netdev': tc_del_qdisc() is called first, then an
 * RTM_NEWQDISC request (NLM_F_EXCL | NLM_F_CREATE) is sent for an "htb" root
 * qdisc with handle 1:0.  The 'struct tc_htb_glob' options are zeroed and
 * 'rate2quantum' is set to 10 before being nested under TCA_OPTIONS as
 * TCA_HTB_INIT.  The final statement returns the tc_transact() result. */
2606 /* Create an HTB qdisc.
2608 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2610 htb_setup_qdisc__(struct netdev *netdev)
2613 struct tc_htb_glob opt;
2614 struct ofpbuf request;
2615 struct tcmsg *tcmsg;
2617 tc_del_qdisc(netdev);
2619 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2620 NLM_F_EXCL | NLM_F_CREATE, &request);
2624 tcmsg->tcm_handle = tc_make_handle(1, 0);
2625 tcmsg->tcm_parent = TC_H_ROOT;
2627 nl_msg_put_string(&request, TCA_KIND, "htb");
2629 memset(&opt, 0, sizeof opt);
2630 opt.rate2quantum = 10;
2634 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2635 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2636 nl_msg_end_nested(&request, opt_offset);
2638 return tc_transact(&request, NULL);
/* Configures HTB class 'handle' under 'parent' on 'netdev' from '*class' by
 * sending an RTM_NEWTCLASS request with NLM_F_CREATE.
 *
 * The device MTU from netdev_get_mtu() feeds both the rate tables
 * (tc_fill_rate()) and the buffer sizes (tc_calc_buffer()); a warning is
 * logged when the device has no MTU.  The rate and ceil tables are attached
 * as TCA_HTB_RTAB and TCA_HTB_CTAB next to TCA_HTB_PARMS.  A rate-limited
 * "failed to replace" warning reports the class parameters together with
 * strerror() of the transaction error. */
2641 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2642 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2644 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2645 unsigned int parent, struct htb_class *class)
2648 struct tc_htb_opt opt;
2649 struct ofpbuf request;
2650 struct tcmsg *tcmsg;
2654 error = netdev_get_mtu(netdev, &mtu);
2656 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2657 netdev_get_name(netdev));
2661 memset(&opt, 0, sizeof opt);
2662 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2663 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2664 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2665 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2666 opt.prio = class->priority;
2668 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2672 tcmsg->tcm_handle = handle;
2673 tcmsg->tcm_parent = parent;
2675 nl_msg_put_string(&request, TCA_KIND, "htb");
2676 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2677 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2678 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2679 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2680 nl_msg_end_nested(&request, opt_offset);
2682 error = tc_transact(&request, NULL);
2684 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2685 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2686 netdev_get_name(netdev),
2687 tc_get_major(handle), tc_get_minor(handle),
2688 tc_get_major(parent), tc_get_minor(parent),
2689 class->min_rate, class->max_rate,
2690 class->burst, class->priority, strerror(error));
/* Decodes the nested HTB attributes in 'nl_options' into '*class'.
 *
 * The policy requires TCA_HTB_PARMS and requires it to be at least as long as
 * 'struct tc_htb_opt'.  min_rate, max_rate and priority are copied from the
 * 'rate', 'ceil' and 'prio' members; burst is obtained by converting the
 * 'buffer' tick count to bytes at the configured rate with
 * tc_ticks_to_bytes().  A rate-limited warning is logged when the attributes
 * do not satisfy the policy. */
2695 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2696 * description of them into 'details'. The description complies with the
2697 * specification given in the vswitch database documentation for linux-htb
2700 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2702 static const struct nl_policy tca_htb_policy[] = {
2703 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2704 .min_len = sizeof(struct tc_htb_opt) },
2707 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2708 const struct tc_htb_opt *htb;
2710 if (!nl_parse_nested(nl_options, tca_htb_policy,
2711 attrs, ARRAY_SIZE(tca_htb_policy))) {
2712 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2716 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2717 class->min_rate = htb->rate.rate;
2718 class->max_rate = htb->ceil.rate;
2719 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2720 class->priority = htb->prio;
/* Parses the class message 'tcmsg' with tc_parse_class(), which also receives
 * 'stats'.
 *
 * If 'queue_id' is nonnull and parsing succeeded, a handle of the form 1:N
 * with 1 <= N <= HTB_N_QUEUES yields queue ID N - 1.  If 'options' is nonnull
 * and no error has occurred, the class's HTB options are decoded into it with
 * htb_parse_tca_options__(). */
2725 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2726 struct htb_class *options,
2727 struct netdev_queue_stats *stats)
2729 struct nlattr *nl_options;
2730 unsigned int handle;
2733 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2734 if (!error && queue_id) {
2735 unsigned int major = tc_get_major(handle);
2736 unsigned int minor = tc_get_minor(handle);
2737 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2738 *queue_id = minor - 1;
2743 if (!error && options) {
2744 error = htb_parse_tca_options__(nl_options, options);
2750 htb_parse_qdisc_details__(struct netdev *netdev,
2751 const struct smap *details, struct htb_class *hc)
2753 const char *max_rate_s;
2755 max_rate_s = smap_get(details, "max-rate");
2756 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2757 if (!hc->max_rate) {
2758 enum netdev_features current;
2760 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2761 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2763 hc->min_rate = hc->max_rate;
/* Translates the per-queue QoS 'details' ("min-rate", "max-rate", "burst",
 * "priority") into '*hc'.  Rates and burst are divided by 8 on the way in.
 *
 *   - min_rate is raised to at least the device MTU, then capped at the
 *     qdisc-wide max_rate.
 *   - max_rate is raised to at least min_rate, then capped at the qdisc-wide
 *     max_rate.
 *   - burst is raised to at least MTU + 64.
 *   - priority defaults to 0 when the key is absent.
 *
 * A rate-limited warning is logged when the device has no MTU. */
2769 htb_parse_class_details__(struct netdev *netdev,
2770 const struct smap *details, struct htb_class *hc)
2772 const struct htb *htb = htb_get__(netdev);
2773 const char *min_rate_s = smap_get(details, "min-rate");
2774 const char *max_rate_s = smap_get(details, "max-rate");
2775 const char *burst_s = smap_get(details, "burst");
2776 const char *priority_s = smap_get(details, "priority");
2779 error = netdev_get_mtu(netdev, &mtu);
2781 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2782 netdev_get_name(netdev));
2786 /* HTB requires at least an mtu sized min-rate to send any traffic even
2787 * on uncongested links. */
2788 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2789 hc->min_rate = MAX(hc->min_rate, mtu);
2790 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2793 hc->max_rate = (max_rate_s
2794 ? strtoull(max_rate_s, NULL, 10) / 8
2796 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2797 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2801 * According to hints in the documentation that I've read, it is important
2802 * that 'burst' be at least as big as the largest frame that might be
2803 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2804 * but having it a bit too small is a problem. Since netdev_get_mtu()
2805 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2806 * the MTU. We actually add 64, instead of 14, as a guard against
2807 * additional headers get tacked on somewhere that we're not aware of. */
2808 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2809 hc->burst = MAX(hc->burst, mtu + 64);
2812 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' under 'parent' on 'netdev' with
 * tc_query_class() and decodes the reply into 'options' and 'stats' through
 * htb_parse_tcmsg__() (no queue ID is requested).  The reply buffer is
 * released with ofpbuf_delete() after parsing. */
2818 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2819 unsigned int parent, struct htb_class *options,
2820 struct netdev_queue_stats *stats)
2822 struct ofpbuf *reply;
2825 error = tc_query_class(netdev, handle, parent, &reply);
2827 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2828 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': sets up the qdisc, configures class 1:fffe under
 * 1:0 from the qdisc-level 'details', and records the in-memory state with
 * htb_install__() using the max_rate that was parsed. */
2834 htb_tc_install(struct netdev *netdev, const struct smap *details)
2838 error = htb_setup_qdisc__(netdev);
2840 struct htb_class hc;
2842 htb_parse_qdisc_details__(netdev, details, &hc);
2843 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2844 tc_make_handle(1, 0), &hc);
2846 htb_install__(netdev, hc.max_rate);
/* Returns the 'struct htb_class' that embeds 'queue' as its 'tc_queue'
 * member. */
2852 static struct htb_class *
2853 htb_class_cast__(const struct tc_queue *queue)
2855 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Stores the parameters in '*hc' as the in-memory record of 'queue_id'.
 *
 * The queue is looked up with tc_find_queue__(); the two paths shown either
 * reuse the record that contains the queue found or allocate a new record and
 * insert it into htb->tc.queues under hash_int(queue_id, 0).  In both cases
 * the four parameters are then copied over. */
2859 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2860 const struct htb_class *hc)
2862 struct htb *htb = htb_get__(netdev);
2863 size_t hash = hash_int(queue_id, 0);
2864 struct tc_queue *queue;
2865 struct htb_class *hcp;
2867 queue = tc_find_queue__(netdev, queue_id, hash);
2869 hcp = htb_class_cast__(queue);
2871 hcp = xmalloc(sizeof *hcp);
2872 queue = &hcp->tc_queue;
2873 queue->queue_id = queue_id;
2874 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2877 hcp->min_rate = hc->min_rate;
2878 hcp->max_rate = hc->max_rate;
2879 hcp->burst = hc->burst;
2880 hcp->priority = hc->priority;
/* Rebuilds the in-memory HTB state of 'netdev' from the kernel.
 *
 * Class 1:fffe is queried for the qdisc-wide max_rate, which is handed to
 * htb_install__().  A queue dump is then walked and every message that
 * htb_parse_tcmsg__() accepts is recorded with htb_update_queue__().
 *
 * NOTE(review): the result of htb_query_class__() is not examined in the
 * statements shown, so 'hc.max_rate' must already hold a defined value before
 * that call -- confirm. */
2884 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2887 struct nl_dump dump;
2888 struct htb_class hc;
2890 /* Get qdisc options. */
2892 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2893 htb_install__(netdev, hc.max_rate);
2896 if (!start_queue_dump(netdev, &dump)) {
2899 while (nl_dump_next(&dump, &msg)) {
2900 unsigned int queue_id;
2902 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2903 htb_update_queue__(netdev, queue_id, &hc);
2906 nl_dump_done(&dump);
/* Tears down the 'struct htb' that contains 'tc'.  Every queue record is
 * removed from htb->tc.queues; the _SAFE iterator is used because the map is
 * modified while it is being walked. */
2912 htb_tc_destroy(struct tc *tc)
2914 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2915 struct htb_class *hc, *next;
2917 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2918 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds "max-rate" to 'details'.  The stored value is multiplied by 8 in
 * 'unsigned long long' arithmetic, which undoes the division by 8 applied
 * when the setting was parsed. */
2926 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2928 const struct htb *htb = htb_get__(netdev);
2929 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* Applies new qdisc-level 'details' to 'netdev': class 1:fffe under 1:0 is
 * reconfigured with htb_setup_class__(), and the parsed max_rate is stored in
 * the in-memory 'struct htb' (presumably only on success -- TODO confirm). */
2934 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2936 struct htb_class hc;
2939 htb_parse_qdisc_details__(netdev, details, &hc);
2940 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2941 tc_make_handle(1, 0), &hc);
2943 htb_get__(netdev)->max_rate = hc.max_rate;
/* Describes 'queue' in 'details' using the keys "min-rate", "max-rate",
 * "burst" and "priority".  Rates and burst are multiplied by 8 in
 * 'unsigned long long' arithmetic.  "max-rate" is tested against min-rate
 * and reported when the two differ.
 *
 * NOTE(review): confirm whether "priority" is emitted unconditionally. */
2949 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2950 const struct tc_queue *queue, struct smap *details)
2952 const struct htb_class *hc = htb_class_cast__(queue);
2954 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2955 if (hc->min_rate != hc->max_rate) {
2956 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2958 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2960 smap_add_format(details, "priority", "%u", hc->priority);
/* Creates or reconfigures queue 'queue_id' on 'netdev' from 'details': the
 * details are parsed, kernel class 1:(queue_id + 1) is set up under parent
 * 1:fffe, and the in-memory record is updated to match. */
2966 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2967 const struct smap *details)
2969 struct htb_class hc;
2972 error = htb_parse_class_details__(netdev, details, &hc);
2977 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2978 tc_make_handle(1, 0xfffe), &hc);
2983 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes kernel class 1:(queue_id + 1) for 'queue' and removes the queue's
 * record from htb->tc.queues. */
2988 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2990 struct htb_class *hc = htb_class_cast__(queue);
2991 struct htb *htb = htb_get__(netdev);
2994 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2996 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves statistics for 'queue' into 'stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  No class options are requested. */
3003 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3004 struct netdev_queue_stats *stats)
3006 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3007 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message from a statistics dump.  The message is parsed
 * with tc_parse_class(); if its handle is 1:N with 1 <= N <= HTB_N_QUEUES,
 * 'cb' is invoked with queue ID N - 1, the parsed statistics and 'aux'. */
3011 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3012 const struct ofpbuf *nlmsg,
3013 netdev_dump_queue_stats_cb *cb, void *aux)
3015 struct netdev_queue_stats stats;
3016 unsigned int handle, major, minor;
3019 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3024 major = tc_get_major(handle);
3025 minor = tc_get_minor(handle);
3026 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3027 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB implementation: Linux qdisc name "htb", OVS
 * QoS type "linux-htb", and at most HTB_N_QUEUES queues. */
3032 static const struct tc_ops tc_ops_htb = {
3033 "htb", /* linux_name */
3034 "linux-htb", /* ovs_name */
3035 HTB_N_QUEUES, /* n_queues */
3044 htb_class_get_stats,
3045 htb_class_dump_stats
3048 /* "linux-hfsc" traffic control class. */
/* HFSC_N_QUEUES is the 'n_queues' entry of tc_ops_hfsc and the largest class
 * minor number accepted as a queue: class 1:N is queue N - 1 for
 * 1 <= N <= HFSC_N_QUEUES.
 *
 * The 'tc_queue' member that follows is the one hfsc_class_cast__() uses to
 * recover the containing 'struct hfsc_class'. */
3050 #define HFSC_N_QUEUES 0xf000
3058 struct tc_queue tc_queue;
/* Returns the 'struct hfsc' whose 'tc' member is the 'tc' pointer of the
 * device underlying 'netdev'.  This is a plain CONTAINER_OF conversion, so it
 * is valid only while that 'tc' really belongs to a 'struct hfsc'. */
3063 static struct hfsc *
3064 hfsc_get__(const struct netdev *netdev)
3066 struct netdev_dev_linux *netdev_dev;
3067 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3068 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the 'struct hfsc_class' that embeds 'queue' as its 'tc_queue'
 * member. */
3071 static struct hfsc_class *
3072 hfsc_class_cast__(const struct tc_queue *queue)
3074 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a 'struct hfsc', initializes its 'tc' with tc_ops_hfsc, records
 * 'max_rate' in it and makes it the 'tc' of the device underlying 'netdev'. */
3078 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
3080 struct netdev_dev_linux * netdev_dev;
3083 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3084 hfsc = xmalloc(sizeof *hfsc);
3085 tc_init(&hfsc->tc, &tc_ops_hfsc);
3086 hfsc->max_rate = max_rate;
3087 netdev_dev->tc = &hfsc->tc;
/* Stores the rates in '*hc' as the in-memory record of 'queue_id'.
 *
 * The queue is looked up with tc_find_queue__(); the two paths shown either
 * reuse the record that contains the queue found or allocate a new record and
 * insert it into hfsc->tc.queues under hash_int(queue_id, 0).  min_rate and
 * max_rate are then copied over. */
3091 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3092 const struct hfsc_class *hc)
3096 struct hfsc_class *hcp;
3097 struct tc_queue *queue;
3099 hfsc = hfsc_get__(netdev);
3100 hash = hash_int(queue_id, 0);
3102 queue = tc_find_queue__(netdev, queue_id, hash);
3104 hcp = hfsc_class_cast__(queue);
3106 hcp = xmalloc(sizeof *hcp);
3107 queue = &hcp->tc_queue;
3108 queue->queue_id = queue_id;
3109 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3112 hcp->min_rate = hc->min_rate;
3113 hcp->max_rate = hc->max_rate;
/* Decodes the nested HFSC attributes in 'nl_options' into '*class'.
 *
 * The RSC, FSC and USC service curves are read and only the shape that
 * hfsc_setup_class__() generates is accepted:
 *
 *   - every curve is linear (m1 == 0 and d == 0),
 *   - the RSC and FSC curves have the same m2, and
 *   - the RSC m2 does not exceed the USC m2.
 *
 * Each violation, like a policy parse failure, logs a rate-limited warning.
 * On success min_rate is the FSC m2 and max_rate is the USC m2. */
3117 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3119 const struct tc_service_curve *rsc, *fsc, *usc;
3120 static const struct nl_policy tca_hfsc_policy[] = {
3122 .type = NL_A_UNSPEC,
3124 .min_len = sizeof(struct tc_service_curve),
3127 .type = NL_A_UNSPEC,
3129 .min_len = sizeof(struct tc_service_curve),
3132 .type = NL_A_UNSPEC,
3134 .min_len = sizeof(struct tc_service_curve),
3137 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3139 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3140 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3141 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3145 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3146 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3147 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3149 if (rsc->m1 != 0 || rsc->d != 0 ||
3150 fsc->m1 != 0 || fsc->d != 0 ||
3151 usc->m1 != 0 || usc->d != 0) {
3152 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3153 "Non-linear service curves are not supported.");
3157 if (rsc->m2 != fsc->m2) {
3158 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3159 "Real-time service curves are not supported ");
3163 if (rsc->m2 > usc->m2) {
3164 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3165 "Min-rate service curve is greater than "
3166 "the max-rate service curve.");
3170 class->min_rate = fsc->m2;
3171 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class(), which also receives
 * 'stats'.  A handle of the form 1:N with 1 <= N <= HFSC_N_QUEUES yields queue
 * ID N - 1 in '*queue_id', and the class's HFSC options are decoded into
 * 'options' with hfsc_parse_tca_options__().
 *
 * NOTE(review): callers in this file pass NULL for 'queue_id' or 'options';
 * presumably each output is guarded by a null check -- confirm. */
3176 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3177 struct hfsc_class *options,
3178 struct netdev_queue_stats *stats)
3181 unsigned int handle;
3182 struct nlattr *nl_options;
3184 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3190 unsigned int major, minor;
3192 major = tc_get_major(handle);
3193 minor = tc_get_minor(handle);
3194 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3195 *queue_id = minor - 1;
3202 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' under 'parent' on 'netdev' with
 * tc_query_class() and decodes the reply into 'options' and 'stats' through
 * hfsc_parse_tcmsg__() (no queue ID is requested).  The reply buffer is
 * released with ofpbuf_delete() after parsing. */
3209 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3210 unsigned int parent, struct hfsc_class *options,
3211 struct netdev_queue_stats *stats)
3214 struct ofpbuf *reply;
3216 error = tc_query_class(netdev, handle, parent, &reply);
3221 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3222 ofpbuf_delete(reply);
3227 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3228 struct hfsc_class *class)
3231 const char *max_rate_s;
3233 max_rate_s = smap_get(details, "max-rate");
3234 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3237 enum netdev_features current;
3239 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3240 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3243 class->min_rate = max_rate;
3244 class->max_rate = max_rate;
/* Translates the per-queue QoS 'details' ("min-rate", "max-rate") into
 * '*class'.  Both values are divided by 8 on the way in.
 *
 *   - min_rate is raised to at least 1, then capped at the qdisc-wide
 *     max_rate.
 *   - max_rate is raised to at least min_rate, then capped at the qdisc-wide
 *     max_rate. */
3248 hfsc_parse_class_details__(struct netdev *netdev,
3249 const struct smap *details,
3250 struct hfsc_class * class)
3252 const struct hfsc *hfsc;
3253 uint32_t min_rate, max_rate;
3254 const char *min_rate_s, *max_rate_s;
3256 hfsc = hfsc_get__(netdev);
3257 min_rate_s = smap_get(details, "min-rate");
3258 max_rate_s = smap_get(details, "max-rate");
3260 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3261 min_rate = MAX(min_rate, 1);
3262 min_rate = MIN(min_rate, hfsc->max_rate);
3264 max_rate = (max_rate_s
3265 ? strtoull(max_rate_s, NULL, 10) / 8
3267 max_rate = MAX(max_rate, min_rate);
3268 max_rate = MIN(max_rate, hfsc->max_rate);
3270 class->min_rate = min_rate;
3271 class->max_rate = max_rate;
/* Replaces the qdisc on 'netdev': tc_del_qdisc() is called first, then an
 * RTM_NEWQDISC request (NLM_F_EXCL | NLM_F_CREATE) is sent for an "hfsc" root
 * qdisc with handle 1:0.  A 'struct tc_hfsc_qopt', zeroed with memset(), is
 * attached as TCA_OPTIONS.  The final statement returns the tc_transact()
 * result. */
3276 /* Create an HFSC qdisc.
3278 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3280 hfsc_setup_qdisc__(struct netdev * netdev)
3282 struct tcmsg *tcmsg;
3283 struct ofpbuf request;
3284 struct tc_hfsc_qopt opt;
3286 tc_del_qdisc(netdev);
3288 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3289 NLM_F_EXCL | NLM_F_CREATE, &request);
3295 tcmsg->tcm_handle = tc_make_handle(1, 0);
3296 tcmsg->tcm_parent = TC_H_ROOT;
3298 memset(&opt, 0, sizeof opt);
3301 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3302 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3304 return tc_transact(&request, NULL);
/* Configures HFSC class 'handle' under 'parent' on 'netdev' from '*class' by
 * sending an RTM_NEWTCLASS request with NLM_F_CREATE.
 *
 * One service curve, 'min' (m2 = min_rate), is sent as both TCA_HFSC_RSC and
 * TCA_HFSC_FSC; 'max' (m2 = max_rate) is sent as TCA_HFSC_USC.  A
 * rate-limited "failed to replace" warning reports the handles and rates
 * together with strerror() of the transaction error. */
3307 /* Create an HFSC class.
3309 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3310 * sc rate <min_rate> ul rate <max_rate>" */
3312 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3313 unsigned int parent, struct hfsc_class *class)
3317 struct tcmsg *tcmsg;
3318 struct ofpbuf request;
3319 struct tc_service_curve min, max;
3321 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3327 tcmsg->tcm_handle = handle;
3328 tcmsg->tcm_parent = parent;
3332 min.m2 = class->min_rate;
3336 max.m2 = class->max_rate;
3338 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3339 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3340 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3341 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3342 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3343 nl_msg_end_nested(&request, opt_offset);
3345 error = tc_transact(&request, NULL);
3347 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3348 "min-rate %ubps, max-rate %ubps (%s)",
3349 netdev_get_name(netdev),
3350 tc_get_major(handle), tc_get_minor(handle),
3351 tc_get_major(parent), tc_get_minor(parent),
3352 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev': sets up the qdisc, configures class 1:fffe under
 * 1:0 from the qdisc-level 'details', and records the in-memory state with
 * hfsc_install__() using the max_rate that was parsed. */
3359 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3362 struct hfsc_class class;
3364 error = hfsc_setup_qdisc__(netdev);
3370 hfsc_parse_qdisc_details__(netdev, details, &class);
3371 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3372 tc_make_handle(1, 0), &class);
3378 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the in-memory HFSC state of 'netdev' from the kernel.
 *
 * Class 1:fffe is queried for the qdisc-wide max_rate, which is handed to
 * hfsc_install__().  A queue dump is then walked and every message that
 * hfsc_parse_tcmsg__() accepts is recorded with hfsc_update_queue__().
 *
 * NOTE(review): the result of hfsc_query_class__() is not examined in the
 * statements shown, so 'hc.max_rate' must already hold a defined value before
 * that call -- confirm. */
3383 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3386 struct nl_dump dump;
3387 struct hfsc_class hc;
3390 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3391 hfsc_install__(netdev, hc.max_rate);
3393 if (!start_queue_dump(netdev, &dump)) {
3397 while (nl_dump_next(&dump, &msg)) {
3398 unsigned int queue_id;
3400 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3401 hfsc_update_queue__(netdev, queue_id, &hc);
3405 nl_dump_done(&dump);
/* Tears down the 'struct hfsc' that contains 'tc'.  Every queue record is
 * removed from hfsc->tc.queues; the _SAFE iterator is used because the map is
 * modified while it is being walked. */
3410 hfsc_tc_destroy(struct tc *tc)
3413 struct hfsc_class *hc, *next;
3415 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3417 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3418 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds "max-rate" to 'details'.  The stored value is multiplied by 8 in
 * 'unsigned long long' arithmetic, which undoes the division by 8 applied
 * when the setting was parsed. */
3427 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3429 const struct hfsc *hfsc;
3430 hfsc = hfsc_get__(netdev);
3431 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* Applies new qdisc-level 'details' to 'netdev': class 1:fffe under 1:0 is
 * reconfigured with hfsc_setup_class__(), and the parsed max_rate is stored
 * in the in-memory 'struct hfsc' (presumably only on success -- TODO
 * confirm). */
3436 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3439 struct hfsc_class class;
3441 hfsc_parse_qdisc_details__(netdev, details, &class);
3442 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3443 tc_make_handle(1, 0), &class);
3446 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Describes 'queue' in 'details': "min-rate" is always added and "max-rate"
 * is added when it differs from min-rate.  Both are multiplied by 8 in
 * 'unsigned long long' arithmetic. */
3453 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3454 const struct tc_queue *queue, struct smap *details)
3456 const struct hfsc_class *hc;
3458 hc = hfsc_class_cast__(queue);
3459 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3460 if (hc->min_rate != hc->max_rate) {
3461 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* Creates or reconfigures queue 'queue_id' on 'netdev' from 'details': the
 * details are parsed, kernel class 1:(queue_id + 1) is set up under parent
 * 1:fffe, and the in-memory record is updated to match. */
3467 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3468 const struct smap *details)
3471 struct hfsc_class class;
3473 error = hfsc_parse_class_details__(netdev, details, &class);
3478 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3479 tc_make_handle(1, 0xfffe), &class);
3484 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes kernel class 1:(queue_id + 1) for 'queue' and removes the queue's
 * record from hfsc->tc.queues. */
3489 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3493 struct hfsc_class *hc;
3495 hc = hfsc_class_cast__(queue);
3496 hfsc = hfsc_get__(netdev);
3498 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3500 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves statistics for 'queue' into 'stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  No class options are requested. */
3507 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3508 struct netdev_queue_stats *stats)
3510 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3511 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message from a statistics dump.  The message is parsed
 * with tc_parse_class(); if its handle is 1:N with 1 <= N <= HFSC_N_QUEUES,
 * 'cb' is invoked with queue ID N - 1, the parsed statistics and 'aux'. */
3515 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3516 const struct ofpbuf *nlmsg,
3517 netdev_dump_queue_stats_cb *cb, void *aux)
3519 struct netdev_queue_stats stats;
3520 unsigned int handle, major, minor;
3523 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3528 major = tc_get_major(handle);
3529 minor = tc_get_minor(handle);
3530 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3531 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC implementation: Linux qdisc name "hfsc", OVS
 * QoS type "linux-hfsc", at most HFSC_N_QUEUES queues, and an hfsc_*
 * function in every operation slot. */
3536 static const struct tc_ops tc_ops_hfsc = {
3537 "hfsc", /* linux_name */
3538 "linux-hfsc", /* ovs_name */
3539 HFSC_N_QUEUES, /* n_queues */
3540 hfsc_tc_install, /* tc_install */
3541 hfsc_tc_load, /* tc_load */
3542 hfsc_tc_destroy, /* tc_destroy */
3543 hfsc_qdisc_get, /* qdisc_get */
3544 hfsc_qdisc_set, /* qdisc_set */
3545 hfsc_class_get, /* class_get */
3546 hfsc_class_set, /* class_set */
3547 hfsc_class_delete, /* class_delete */
3548 hfsc_class_get_stats, /* class_get_stats */
3549 hfsc_class_dump_stats /* class_dump_stats */
/* default_install__() points the 'tc' of the device underlying 'netdev' at a
 * 'struct tc' initialized with tc_ops_default.  That object is reached
 * through a function-local 'static' pointer, so it is shared by every device
 * installed this way (presumably it is allocated only on first use -- TODO
 * confirm). */
3552 /* "linux-default" traffic control class.
3554 * This class represents the default, unnamed Linux qdisc. It corresponds to
3555 * the "" (empty string) QoS type in the OVS database. */
3558 default_install__(struct netdev *netdev)
3560 struct netdev_dev_linux *netdev_dev =
3561 netdev_dev_linux_cast(netdev_get_dev(netdev));
3562 static struct tc *tc;
3565 tc = xmalloc(sizeof *tc);
3566 tc_init(tc, &tc_ops_default);
3568 netdev_dev->tc = tc;
/* 'tc_install' hook of the default class: ignores 'details' and delegates to
 * default_install__(). */
3572 default_tc_install(struct netdev *netdev,
3573 const struct smap *details OVS_UNUSED)
3575 default_install__(netdev);
/* 'tc_load' hook of the default class: ignores 'nlmsg' and delegates to
 * default_install__(). */
3580 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3582 default_install__(netdev);
/* Operations table for the default qdisc.  It has no Linux qdisc name, and
 * every slot from tc_destroy through class_dump_stats is NULL. */
3586 static const struct tc_ops tc_ops_default = {
3587 NULL, /* linux_name */
3592 NULL, /* tc_destroy */
3593 NULL, /* qdisc_get */
3594 NULL, /* qdisc_set */
3595 NULL, /* class_get */
3596 NULL, /* class_set */
3597 NULL, /* class_delete */
3598 NULL, /* class_get_stats */
3599 NULL /* class_dump_stats */
/* other_tc_load() ignores 'nlmsg' and points the 'tc' of the device
 * underlying 'netdev' at a 'struct tc' initialized with tc_ops_other.  That
 * object is reached through a function-local 'static' pointer, so it is
 * shared by every device loaded this way (presumably it is allocated only on
 * first use -- TODO confirm). */
3602 /* "linux-other" traffic control class.
3607 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3609 struct netdev_dev_linux *netdev_dev =
3610 netdev_dev_linux_cast(netdev_get_dev(netdev));
3611 static struct tc *tc;
3614 tc = xmalloc(sizeof *tc);
3615 tc_init(tc, &tc_ops_other);
3617 netdev_dev->tc = tc;
/* Operations table for qdiscs handled as "linux-other".  It has no Linux
 * qdisc name and no tc_install hook, and every slot from tc_destroy through
 * class_dump_stats is NULL. */
3621 static const struct tc_ops tc_ops_other = {
3622 NULL, /* linux_name */
3623 "linux-other", /* ovs_name */
3625 NULL, /* tc_install */
3627 NULL, /* tc_destroy */
3628 NULL, /* qdisc_get */
3629 NULL, /* qdisc_set */
3630 NULL, /* class_get */
3631 NULL, /* class_set */
3632 NULL, /* class_delete */
3633 NULL, /* class_get_stats */
3634 NULL /* class_dump_stats */
3637 /* Traffic control. */
/* Derived from the first three fields (a, b, c) of /proc/net/psched as
 * a * c / b.  Used by tc_ticks_to_bytes() and tc_bytes_to_ticks(). */
3639 /* Number of kernel "tc" ticks per second. */
3640 static double ticks_per_s;
/* 'buffer_hz' is the divisor in tc_buffer_per_jiffy(), so it must be nonzero
 * by the time that function divides by it. */
3642 /* Number of kernel "jiffies" per second. This is used for the purpose of
3643 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3644 * one jiffy's worth of data.
3646 * There are two possibilities here:
3648 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3649 * approximate range of 100 to 1024. That means that we really need to
3650 * make sure that the qdisc can buffer that much data.
3652 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3653 * has finely granular timers and there's no need to fudge additional room
3654 * for buffers. (There's no extra effort needed to implement that: the
3655 * large 'buffer_hz' is used as a divisor, so practically any number will
3656 * come out as 0 in the division. Small integer results in the case of
3657 * really high dividends won't have any real effect anyhow.)
3659 static unsigned int buffer_hz;
/* 'major' is shifted left by 16 bits before TC_H_MAKE combines it with
 * 'minor'. */
3661 /* Returns tc handle 'major':'minor'. */
3663 tc_make_handle(unsigned int major, unsigned int minor)
3665 return TC_H_MAKE(major << 16, minor);
/* Inverse of the major part of tc_make_handle(): the TC_H_MAJ() bits of
 * 'handle' are shifted right by 16. */
3668 /* Returns the major number from 'handle'. */
3670 tc_get_major(unsigned int handle)
3672 return TC_H_MAJ(handle) >> 16;
/* Inverse of the minor part of tc_make_handle(): returns TC_H_MIN('handle')
 * without any shift. */
3675 /* Returns the minor number from 'handle'. */
3677 tc_get_minor(unsigned int handle)
3679 return TC_H_MIN(handle);
/* Begins a traffic-control Netlink request of 'type' for 'netdev' in
 * 'request'.
 *
 * The device's ifindex is looked up with get_ifindex().  'request' is then
 * initialized with room for 512 bytes, given an nlmsghdr carrying
 * NLM_F_REQUEST | 'flags', and extended with a zeroed 'struct tcmsg' whose
 * tcm_family is AF_UNSPEC and whose tcm_ifindex is the device's ifindex.
 * Callers in this file treat the result as a pointer to that tcmsg and fill
 * in tcm_handle and tcm_parent themselves. */
3682 static struct tcmsg *
3683 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3684 struct ofpbuf *request)
3686 struct tcmsg *tcmsg;
3690 error = get_ifindex(netdev, &ifindex);
3695 ofpbuf_init(request, 512);
3696 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3697 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3698 tcmsg->tcm_family = AF_UNSPEC;
3699 tcmsg->tcm_ifindex = ifindex;
3700 /* Caller should fill in tcmsg->tcm_handle. */
3701 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over 'rtnl_sock' with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request' whatever the
 * outcome.  Callers use the result as an error code (presumably the value
 * from nl_sock_transact() -- TODO confirm). */
3707 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3709 int error = nl_sock_transact(rtnl_sock, request, replyp);
3710 ofpbuf_uninit(request);
/* Implementation notes for tc_add_del_ingress_qdisc(): 'add' selects
 * RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE, otherwise RTM_DELQDISC with no
 * extra flags.  The request uses handle ffff:0, parent TC_H_INGRESS, kind
 * "ingress" and an empty TCA_OPTIONS attribute.  When deleting, ENOENT and
 * EINVAL from the transaction are singled out for lenient treatment. */
3714 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3715 * policing configuration.
3717 * This function is equivalent to running the following when 'add' is true:
3718 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3720 * This function is equivalent to running the following when 'add' is false:
3721 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3723 * The configuration and stats may be seen with the following command:
3724 * /sbin/tc -s qdisc show dev <devname>
3726 * Returns 0 if successful, otherwise a positive errno value.
3729 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3731 struct ofpbuf request;
3732 struct tcmsg *tcmsg;
3734 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3735 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3737 tcmsg = tc_make_request(netdev, type, flags, &request);
3741 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3742 tcmsg->tcm_parent = TC_H_INGRESS;
3743 nl_msg_put_string(&request, TCA_KIND, "ingress");
3744 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3746 error = tc_transact(&request, NULL);
3748 /* If we're deleting the qdisc, don't worry about some of the
3749 * error conditions. */
3750 if (!add && (error == ENOENT || error == EINVAL)) {
/* Implementation notes for tc_add_policer(): the policer drops excess
 * traffic (TC_POLICE_SHOT).  Its rate is 'kbits_rate' * 1000 / 8 bytes/s and
 * its burst is 'kbits_burst' * 1024 converted to ticks at that rate.  The
 * filter is a "basic" classifier attached to parent ffff:0 with priority 49
 * and protocol ETH_P_ALL, carrying the policer nested under
 * TCA_BASIC_POLICE.
 *
 * NOTE(review): 'kbits_rate * 1000' and 'kbits_burst * 1024' are evaluated
 * in 'int'; values above roughly 2,147,483 kbit/s or 2,097,151 kbit overflow
 * a 32-bit 'int' -- confirm that callers bound these inputs. */
3759 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3762 * This function is equivalent to running:
3763 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3764 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3767 * The configuration and stats may be seen with the following command:
3768 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3770 * Returns 0 if successful, otherwise a positive errno value.
3773 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3775 struct tc_police tc_police;
3776 struct ofpbuf request;
3777 struct tcmsg *tcmsg;
3778 size_t basic_offset;
3779 size_t police_offset;
3783 memset(&tc_police, 0, sizeof tc_police);
3784 tc_police.action = TC_POLICE_SHOT;
3785 tc_police.mtu = mtu;
3786 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3787 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3788 kbits_burst * 1024);
3790 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3791 NLM_F_EXCL | NLM_F_CREATE, &request);
3795 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3796 tcmsg->tcm_info = tc_make_handle(49,
3797 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3799 nl_msg_put_string(&request, TCA_KIND, "basic");
3800 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3801 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3802 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3803 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3804 nl_msg_end_nested(&request, police_offset);
3805 nl_msg_end_nested(&request, basic_offset);
3807 error = tc_transact(&request, NULL);
/* Reads the kernel's packet scheduler timing parameters.
 *
 * /proc/net/psched is opened and four hexadecimal fields a, b, c, d are
 * scanned from it.  'ticks_per_s' is computed as a * c / b in floating point.
 * Failure to open or read the file, as well as invalid or unexpected
 * parameter values, are reported with VLOG_WARN; the raw parameters and the
 * derived 'ticks_per_s' and 'buffer_hz' are logged at debug level. */
3818 /* The values in psched are not individually very meaningful, but they are
3819 * important. The tables below show some values seen in the wild.
3823 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3824 * (Before that, there are hints that it was 1000000000.)
3826 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3830 * -----------------------------------
3831 * [1] 000c8000 000f4240 000f4240 00000064
3832 * [2] 000003e8 00000400 000f4240 3b9aca00
3833 * [3] 000003e8 00000400 000f4240 3b9aca00
3834 * [4] 000003e8 00000400 000f4240 00000064
3835 * [5] 000003e8 00000040 000f4240 3b9aca00
3836 * [6] 000003e8 00000040 000f4240 000000f9
3838 * a b c d ticks_per_s buffer_hz
3839 * ------- --------- ---------- ------------- ----------- -------------
3840 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3841 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3842 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3843 * [4] 1,000 1,024 1,000,000 100 976,562 100
3844 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3845 * [6] 1,000 64 1,000,000 249 15,625,000 249
3847 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3848 * [2] 2.6.26-1-686-bigmem from Debian lenny
3849 * [3] 2.6.26-2-sparc64 from Debian lenny
3850 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3851 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3852 * [6] 2.6.34 from kernel.org on KVM
3854 static const char fn[] = "/proc/net/psched";
3855 unsigned int a, b, c, d;
3861 stream = fopen(fn, "r");
3863 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3867 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3868 VLOG_WARN("%s: read failed", fn);
3872 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3876 VLOG_WARN("%s: invalid scheduler parameters", fn);
3880 ticks_per_s = (double) a * c / b;
3884 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3887 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3890 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3891 * rate of 'rate' bytes per second.
 *
 * NOTE(review): both operands of 'rate * ticks' are unsigned int, so the
 * product is computed in unsigned int and wraps before the division for
 * large rate/tick combinations.  tc_bytes_to_ticks() widens to unsigned long
 * long int first; consider doing the same here. */
3893 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3898 return (rate * ticks) / ticks_per_s;
3901 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3902 * rate of 'rate' bytes per second.
 *
 * A 'rate' of 0 yields 0 instead of dividing by zero.  The cast applies to
 * 'ticks_per_s' alone, so the multiplication and the division are both done
 * in unsigned long long int. */
3904 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3909 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3912 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3913 * a transmission rate of 'rate' bytes per second.
 *
 * The result is simply 'rate' divided by 'buffer_hz'.
 *
 * NOTE(review): this relies on 'buffer_hz' being nonzero by the time of the
 * division -- confirm that it is always initialized first. */
3915 tc_buffer_per_jiffy(unsigned int rate)
3920 return rate / buffer_hz;
3923 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3924 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3925 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3926 * stores NULL into it if it is absent.
3928 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
3931 * Returns 0 if successful, otherwise a positive errno value. */
3933 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3934 struct nlattr **options)
/* TCA_KIND is required by this policy; TCA_OPTIONS is optional. */
3936 static const struct nl_policy tca_policy[] = {
3937 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3938 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3940 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* The attributes start after the Netlink header and the struct tcmsg. */
3942 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3943 tca_policy, ta, ARRAY_SIZE(ta))) {
3944 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3949 *kind = nl_attr_get_string(ta[TCA_KIND]);
3953 *options = ta[TCA_OPTIONS];
3968 /* Given Netlink 'msg' that describes a class, extracts the class's handle
3969 * (the 'tcm_handle' field of its struct tcmsg) into '*handlep', its TCA_OPTIONS
3970 * attribute into '*options', and its queue statistics into '*stats'. Any of the
3971 * output arguments may be null.
3973 * Returns 0 if successful, otherwise a positive errno value. */
3975 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3976 struct nlattr **options, struct netdev_queue_stats *stats)
/* Both TCA_OPTIONS and TCA_STATS2 are required by this policy. */
3978 static const struct nl_policy tca_policy[] = {
3979 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3980 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3982 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* The attributes start after the Netlink header and the struct tcmsg. */
3984 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3985 tca_policy, ta, ARRAY_SIZE(ta))) {
3986 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The struct tcmsg sits immediately after the Netlink header. */
3991 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3992 *handlep = tc->tcm_handle;
3996 *options = ta[TCA_OPTIONS];
4000 const struct gnet_stats_queue *gsq;
4001 struct gnet_stats_basic gsb;
/* Within TCA_STATS2, both the basic and the queue statistics are required
 * and must be at least as large as the structures read below. */
4003 static const struct nl_policy stats_policy[] = {
4004 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4005 .min_len = sizeof gsb },
4006 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4007 .min_len = sizeof *gsq },
4009 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4011 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4012 sa, ARRAY_SIZE(sa))) {
4013 VLOG_WARN_RL(&rl, "failed to parse class stats");
4017 /* Alignment issues screw up the length of struct gnet_stats_basic on
4018 * some arch/bitsize combinations. Newer versions of Linux have a
4019 * struct gnet_stats_basic_packed, but we can't depend on that. The
4020 * easiest thing to do is just to make a copy. */
4021 memset(&gsb, 0, sizeof gsb);
4022 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4023 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4024 stats->tx_bytes = gsb.bytes;
4025 stats->tx_packets = gsb.packets;
/* Queue drops are reported to the caller as transmit errors. */
4027 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4028 stats->tx_errors = gsq->drops;
4038 memset(stats, 0, sizeof *stats);
4043 /* Queries the kernel for the class with identifier 'handle' and parent 'parent'
 * on 'netdev'. */
4046 tc_query_class(const struct netdev *netdev,
4047 unsigned int handle, unsigned int parent,
4048 struct ofpbuf **replyp)
4050 struct ofpbuf request;
4051 struct tcmsg *tcmsg;
/* Build an RTM_GETTCLASS request that names the class by handle and parent. */
4054 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4058 tcmsg->tcm_handle = handle;
4059 tcmsg->tcm_parent = parent;
/* The reply from the transaction is handed to the caller through 'replyp'. */
4061 error = tc_transact(&request, replyp);
/* Rate-limited warning that prints both handles as major:minor. */
4063 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4064 netdev_get_name(netdev),
4065 tc_get_major(handle), tc_get_minor(handle),
4066 tc_get_major(parent), tc_get_minor(parent),
4072 /* Equivalent to "tc class del dev <name> handle <handle>".
 *
 * Sends an RTM_DELTCLASS request for class 'handle' on 'netdev' without
 * asking for a reply, and logs a rate-limited warning naming the class as
 * major:minor when the deletion fails. */
4074 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4076 struct ofpbuf request;
4077 struct tcmsg *tcmsg;
4080 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
/* Only the handle identifies the class; the parent is left as 0. */
4084 tcmsg->tcm_handle = handle;
4085 tcmsg->tcm_parent = 0;
4087 error = tc_transact(&request, NULL);
4089 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4090 netdev_get_name(netdev),
4091 tc_get_major(handle), tc_get_minor(handle),
4097 /* Equivalent to "tc qdisc del dev <name> root".
 *
 * Besides asking the kernel to delete the qdisc, this also discards the
 * traffic-control state cached in the netdev's 'tc' member once the deletion
 * has succeeded. */
4099 tc_del_qdisc(struct netdev *netdev)
4101 struct netdev_dev_linux *netdev_dev =
4102 netdev_dev_linux_cast(netdev_get_dev(netdev));
4103 struct ofpbuf request;
4104 struct tcmsg *tcmsg;
4107 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
/* Target the qdisc with handle 1:0 attached at the root. */
4111 tcmsg->tcm_handle = tc_make_handle(1, 0);
4112 tcmsg->tcm_parent = TC_H_ROOT;
4114 error = tc_transact(&request, NULL);
4115 if (error == EINVAL) {
4116 /* EINVAL probably means that the default qdisc was in use, in which
4117 * case we've accomplished our purpose. */
/* With no error outstanding, destroy the cached tc state (through its
 * 'tc_destroy' callback, if it has one) and forget it. */
4120 if (!error && netdev_dev->tc) {
4121 if (netdev_dev->tc->ops->tc_destroy) {
4122 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
4124 netdev_dev->tc = NULL;
4129 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4130 * kernel to determine what they are. Returns 0 if successful, otherwise a
4131 * positive errno value.
 *
 * The return value is the error from the query/parse step if there was one,
 * otherwise the result of the selected tc_ops' 'tc_load' callback. */
4133 tc_query_qdisc(const struct netdev *netdev)
4135 struct netdev_dev_linux *netdev_dev =
4136 netdev_dev_linux_cast(netdev_get_dev(netdev));
4137 struct ofpbuf request, *qdisc;
4138 const struct tc_ops *ops;
4139 struct tcmsg *tcmsg;
4143 if (netdev_dev->tc) {
4147 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4148 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4149 * 2.6.35 without that fix backported to it.
4151 * To avoid the OOPS, we must not make a request that would attempt to dump
4152 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4153 * few others. There are a few ways that I can see to do this, but most of
4154 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4155 * technique chosen here is to assume that any non-default qdisc that we
4156 * create will have a class with handle 1:0. The built-in qdiscs only have
4157 * a class with handle 0:0.
4159 * We could check for Linux 2.6.35+ and use a more straightforward method
4161 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
4165 tcmsg->tcm_handle = tc_make_handle(1, 0);
4166 tcmsg->tcm_parent = 0;
4168 /* Figure out what tc class to instantiate. */
4169 error = tc_transact(&request, &qdisc);
4173 error = tc_parse_qdisc(qdisc, &kind, NULL);
4175 ops = &tc_ops_other;
4177 ops = tc_lookup_linux_name(kind);
4179 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4180 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4182 ops = &tc_ops_other;
4185 } else if (error == ENOENT) {
4186 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4187 * other entity that doesn't have a handle 1:0. We will assume
4188 * that it's the system default qdisc. */
4189 ops = &tc_ops_default;
4192 /* Who knows? Maybe the device got deleted. */
4193 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4194 netdev_get_name(netdev), strerror(error));
4195 ops = &tc_ops_other;
4198 /* Instantiate it. */
4199 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev), qdisc);
/* 'tc_load' must leave netdev_dev->tc set exactly when it reports success. */
4200 ovs_assert((load_error == 0) == (netdev_dev->tc != NULL));
4201 ofpbuf_delete(qdisc);
4203 return error ? error : load_error;
4206 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4207 approximate the time to transmit packets of various lengths. For an MTU of
4208 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4209 represents two possible packet lengths; for an MTU of 513 through 1024, four
4210 possible lengths; and so on.
4212 Returns, for the specified 'mtu', the number of bits that packet lengths
4213 need to be shifted right to fit within such a 256-entry table. */
4215 tc_calc_cell_log(unsigned int mtu)
4220 mtu = ETH_PAYLOAD_MAX;
/* Add the Ethernet and VLAN header lengths on top of 'mtu' before sizing. */
4222 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Count iterations until the adjusted length drops below 256. */
4224 for (cell_log = 0; mtu >= 256; cell_log++) {
4231 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'. */
4234 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
/* Zero the whole structure first, so that any member not assigned below
 * (including 'overhead' and 'cell_align' where the headers have them) is 0. */
4236 memset(rate, 0, sizeof *rate);
4237 rate->cell_log = tc_calc_cell_log(mtu);
4238 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4239 /* rate->cell_align = 0; */ /* distro headers. */
/* The minimum packet unit is set to ETH_TOTAL_MIN. */
4240 rate->mpu = ETH_TOTAL_MIN;
4244 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4245 * attribute of the specified "type".
4247 * See tc_calc_cell_log() above for a description of "rtab"s.
 *
 * Entry 'i' of the table is the time, in ticks, to transmit a packet of
 * (i + 1) << rate->cell_log bytes at 'rate->rate', with the packet size
 * raised to 'rate->mpu' when it would otherwise be smaller. */
4249 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve TC_RTAB_SIZE bytes of attribute payload and fill them in place. */
4254 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4255 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4256 unsigned packet_size = (i + 1) << rate->cell_log;
4257 if (packet_size < rate->mpu) {
4258 packet_size = rate->mpu;
4260 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4264 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4265 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4266 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 is fine: the computed minimum burst is used instead.) */
4269 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* The burst is never smaller than one tc_buffer_per_jiffy() worth of data
 * plus one 'mtu'. */
4271 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
/* NOTE(review): 'burst_bytes' is 64 bits wide but tc_bytes_to_ticks() takes
 * an unsigned int size, so a burst above UINT_MAX is truncated in the call --
 * confirm whether callers can pass such values. */
4272 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4275 /* Linux-only functions declared in netdev-linux.h */
4277 /* Returns a fd for an AF_INET socket or a negative errno value.
 *
 * netdev_linux_init() is called first; if it reports an error, that error is
 * returned negated, otherwise the shared 'af_inet_sock' is returned. */
4279 netdev_linux_get_af_inet_sock(void)
4281 int error = netdev_linux_init();
4282 return error ? -error : af_inet_sock;
4285 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4286 * 'enable' is true, the bit is set. Otherwise, it is cleared.
 *
 * The flags are read, rewritten with the bit changed, and then read back; a
 * rate-limited warning naming 'flag_name' is logged when the value read back
 * differs from the value that was written. */
4288 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4289 const char *flag_name, bool enable)
4291 const char *netdev_name = netdev_get_name(netdev);
4292 struct ethtool_value evalue;
/* Step 1: fetch the current flags. */
4296 COVERAGE_INC(netdev_get_ethtool);
4297 memset(&evalue, 0, sizeof evalue);
4298 error = netdev_linux_do_ethtool(netdev_name,
4299 (struct ethtool_cmd *)&evalue,
4300 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: write the flags back with 'flag' set or cleared. */
4305 COVERAGE_INC(netdev_set_ethtool);
4306 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4307 error = netdev_linux_do_ethtool(netdev_name,
4308 (struct ethtool_cmd *)&evalue,
4309 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: fetch the flags again to check that the change took effect. */
4314 COVERAGE_INC(netdev_get_ethtool);
4315 memset(&evalue, 0, sizeof evalue);
4316 error = netdev_linux_do_ethtool(netdev_name,
4317 (struct ethtool_cmd *)&evalue,
4318 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4323 if (new_flags != evalue.data) {
4324 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4325 "device %s failed", enable ? "enable" : "disable",
4326 flag_name, netdev_name);
4333 /* Utility functions. */
4335 /* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * Every counter listed below is copied by plain assignment into the member of
 * 'dst' that has the same name as the member of 'src'. */
4337 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4338 const struct rtnl_link_stats *src)
4340 dst->rx_packets = src->rx_packets;
4341 dst->tx_packets = src->tx_packets;
4342 dst->rx_bytes = src->rx_bytes;
4343 dst->tx_bytes = src->tx_bytes;
4344 dst->rx_errors = src->rx_errors;
4345 dst->tx_errors = src->tx_errors;
4346 dst->rx_dropped = src->rx_dropped;
4347 dst->tx_dropped = src->tx_dropped;
4348 dst->multicast = src->multicast;
4349 dst->collisions = src->collisions;
4350 dst->rx_length_errors = src->rx_length_errors;
4351 dst->rx_over_errors = src->rx_over_errors;
4352 dst->rx_crc_errors = src->rx_crc_errors;
4353 dst->rx_frame_errors = src->rx_frame_errors;
4354 dst->rx_fifo_errors = src->rx_fifo_errors;
4355 dst->rx_missed_errors = src->rx_missed_errors;
4356 dst->tx_aborted_errors = src->tx_aborted_errors;
4357 dst->tx_carrier_errors = src->tx_carrier_errors;
4358 dst->tx_fifo_errors = src->tx_fifo_errors;
4359 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4360 dst->tx_window_errors = src->tx_window_errors;
/* Retrieves statistics for the network device with index 'ifindex' by sending
 * an RTM_GETLINK request over 'rtnl_sock' and converting the IFLA_STATS
 * attribute of the reply into '*stats'.  A reply without IFLA_STATS is logged
 * with a rate-limited warning.  The reply buffer is deleted on every path
 * shown here once it is no longer needed. */
4364 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4366 /* Policy for RTNLGRP_LINK messages.
4368 * There are *many* more fields in these messages, but currently we only
4369 * care about these fields. */
4370 static const struct nl_policy rtnlgrp_link_policy[] = {
4371 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4372 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4373 .min_len = sizeof(struct rtnl_link_stats) },
4376 struct ofpbuf request;
4377 struct ofpbuf *reply;
4378 struct ifinfomsg *ifi;
4379 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Compose the RTM_GETLINK request; the link is selected by 'ifi_index'. */
4382 ofpbuf_init(&request, 0);
4383 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4384 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4385 ifi->ifi_family = PF_UNSPEC;
4386 ifi->ifi_index = ifindex;
4387 error = nl_sock_transact(rtnl_sock, &request, &reply);
4388 ofpbuf_uninit(&request);
/* The attributes start after the Netlink header and the struct ifinfomsg. */
4393 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4394 rtnlgrp_link_policy,
4395 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4396 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence is checked here. */
4400 if (!attrs[IFLA_STATS]) {
4401 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4402 ofpbuf_delete(reply);
4406 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4408 ofpbuf_delete(reply);
/* Retrieves statistics for 'netdev_name' by reading /proc/net/dev line by line
 * and parsing the counters on each line into '*stats'.  For the line whose
 * device name equals 'netdev_name', the counters listed in that branch are
 * set to UINT64_MAX (presumably because the file does not report them --
 * TODO confirm).  Warnings are rate-limited and cover a failed open, a line
 * that does not parse, and a device that has no stats in the file. */
4414 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4416 static const char fn[] = "/proc/net/dev";
4421 stream = fopen(fn, "r");
4423 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4428 while (fgets(line, sizeof line, stream)) {
4431 #define X64 "%"SCNu64
4434 X64 X64 X64 X64 X64 X64 X64 "%*u"
4435 X64 X64 X64 X64 X64 X64 X64 "%*u",
4441 &stats->rx_fifo_errors,
4442 &stats->rx_frame_errors,
4448 &stats->tx_fifo_errors,
4450 &stats->tx_carrier_errors) != 15) {
4451 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4452 } else if (!strcmp(devname, netdev_name)) {
4453 stats->rx_length_errors = UINT64_MAX;
4454 stats->rx_over_errors = UINT64_MAX;
4455 stats->rx_crc_errors = UINT64_MAX;
4456 stats->rx_missed_errors = UINT64_MAX;
4457 stats->tx_aborted_errors = UINT64_MAX;
4458 stats->tx_heartbeat_errors = UINT64_MAX;
4459 stats->tx_window_errors = UINT64_MAX;
4465 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'dev' with the SIOCGIFFLAGS ioctl, looked up
 * by 'dev->name', and stores them in '*flags'. */
4471 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4477 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4480 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS ioctl
 * and returns whatever netdev_linux_do_ioctl() returns. */
4486 set_flags(struct netdev *netdev, unsigned int flags)
4490 ifr.ifr_flags = flags;
4491 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of 'netdev_name' with the SIOCGIFINDEX ioctl on
 * 'af_inet_sock'.  Returns the index on success.  A failed ioctl is logged
 * with a rate-limited warning; get_ifindex() negates a failing result to
 * obtain an errno value, so the failure return is presumably a negative
 * errno -- TODO confirm. */
4496 do_get_ifindex(const char *netdev_name)
4500 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4501 COVERAGE_INC(netdev_get_ifindex);
4502 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4503 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4504 netdev_name, strerror(errno));
4507 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp' and returns the
 * associated error value (0 when the lookup succeeded).
 *
 * Both the index and the error are cached in the netdev_dev: the lookup is
 * only performed while VALID_IFINDEX is clear in 'cache_valid', and a failed
 * lookup is cached too, with the index recorded as 0. */
4511 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4513 struct netdev_dev_linux *netdev_dev =
4514 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4516 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4517 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4520 netdev_dev->get_ifindex_error = -ifindex;
4521 netdev_dev->ifindex = 0;
4523 netdev_dev->get_ifindex_error = 0;
4524 netdev_dev->ifindex = ifindex;
4526 netdev_dev->cache_valid |= VALID_IFINDEX;
4529 *ifindexp = netdev_dev->ifindex;
4530 return netdev_dev->get_ifindex_error;
/* Reads the hardware address of 'netdev_name' with the SIOCGIFHWADDR ioctl on
 * 'af_inet_sock' and copies its first ETH_ADDR_LEN bytes into 'ea'.  An
 * unexpected address family is only logged as a warning. */
4534 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4539 memset(&ifr, 0, sizeof ifr);
4540 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4541 COVERAGE_INC(netdev_get_hwaddr);
4542 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4543 /* ENODEV probably means that a vif disappeared asynchronously and
4544 * hasn't been removed from the database yet, so reduce the log level
4545 * to INFO for that case. */
4546 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4547 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4548 netdev_name, strerror(errno));
/* NOTE(review): 'sa_family' is compared against one AF_* constant and one
 * ARPHRD_* constant, which come from different namespaces -- confirm that
 * this mix is intended. */
4551 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4552 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4553 VLOG_WARN("%s device has unknown hardware address family %d",
4554 netdev_name, hwaddr_family);
4556 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of 'netdev_name' to the ETH_ADDR_LEN bytes in
 * 'mac' with the SIOCSIFHWADDR ioctl on 'af_inet_sock', using ARPHRD_ETHER as
 * the address family.  A failed ioctl is logged at error level. */
4561 set_etheraddr(const char *netdev_name,
4562 const uint8_t mac[ETH_ADDR_LEN])
4566 memset(&ifr, 0, sizeof ifr);
4567 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4568 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4569 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4570 COVERAGE_INC(netdev_set_hwaddr);
4571 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4572 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4573 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for the device called 'name',
 * passing 'ecmd' as the request data.  'cmd_name' is used only in the log
 * message.  A failure is logged with a rate-limited warning unless errno is
 * EOPNOTSUPP. */
4580 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4581 int cmd, const char *cmd_name)
4585 memset(&ifr, 0, sizeof ifr);
4586 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4587 ifr.ifr_data = (caddr_t) ecmd;
4590 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4593 if (errno != EOPNOTSUPP) {
4594 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4595 "failed: %s", cmd_name, name, strerror(errno));
4597 /* The device doesn't support this operation. That's pretty
4598 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' on 'af_inet_sock'
 * with 'ifr' as its argument.  'cmd_name' is used only in the rate-limited
 * debug message that is logged when the ioctl fails. */
4605 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4606 const char *cmd_name)
4608 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4609 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4610 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs the address-returning ioctl 'cmd' (named 'cmd_name' for logging) on
 * 'netdev' with an AF_INET request and stores the IPv4 address found in the
 * returned 'ifr_addr' into '*ip'. */
4618 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4619 int cmd, const char *cmd_name)
4624 ifr.ifr_addr.sa_family = AF_INET;
4625 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* 'ifr_addr' is reinterpreted as a struct sockaddr_in to get at the address. */
4627 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4628 *ip = sin->sin_addr;
4633 /* Returns an AF_PACKET raw socket or a negative errno value. */
4635 af_packet_sock(void)
4637 static int sock = INT_MIN;
4639 if (sock == INT_MIN) {
4640 sock = socket(AF_PACKET, SOCK_RAW, 0);
4642 int error = set_nonblocking(sock);
4649 VLOG_ERR("failed to create packet socket: %s", strerror(errno));