2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
67 #include "socket-util.h"
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75 COVERAGE_DEFINE(netdev_set_policing);
76 COVERAGE_DEFINE(netdev_arp_lookup);
77 COVERAGE_DEFINE(netdev_get_ifindex);
78 COVERAGE_DEFINE(netdev_get_hwaddr);
79 COVERAGE_DEFINE(netdev_set_hwaddr);
80 COVERAGE_DEFINE(netdev_get_ethtool);
81 COVERAGE_DEFINE(netdev_set_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have old headers. */
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old headers. */
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
120 VALID_FEATURES = 1 << 8,
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
133 * Each TC implementation subclasses this with whatever additional data it
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 /* One traffic control queue.
144 * Each TC implementation subclasses this with whatever additional data it
147 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
148 unsigned int queue_id; /* OpenFlow queue ID. */
151 /* A particular kind of traffic control. Each implementation generally maps to
152 * one particular Linux qdisc class.
154 * The functions below return 0 if successful or a positive errno value on
155 * failure, except where otherwise noted. All of them must be provided, except
156 * where otherwise noted. */
158 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
159 * This is null for tc_ops_default and tc_ops_other, for which there are no
160 * appropriate values. */
161 const char *linux_name;
163 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
164 const char *ovs_name;
166 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
167 * queues. The queues are numbered 0 through n_queues - 1. */
168 unsigned int n_queues;
170 /* Called to install this TC class on 'netdev'. The implementation should
171 * make the Netlink calls required to set up 'netdev' with the right qdisc
172 * and configure it according to 'details'. The implementation may assume
173 * that the current qdisc is the default; that is, there is no need for it
174 * to delete the current qdisc before installing itself.
176 * The contents of 'details' should be documented as valid for 'ovs_name'
177 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
178 * (which is built as ovs-vswitchd.conf.db(8)).
180 * This function must return 0 if and only if it sets 'netdev->tc' to an
181 * initialized 'struct tc'.
183 * (This function is null for tc_ops_other, which cannot be installed. For
184 * other TC classes it should always be nonnull.) */
185 int (*tc_install)(struct netdev *netdev, const struct smap *details);
187 /* Called when the netdev code determines (through a Netlink query) that
188 * this TC class's qdisc is installed on 'netdev', but we didn't install
189 * it ourselves and so don't know any of the details.
191 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
192 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
193 * implementation should parse the other attributes of 'nlmsg' as
194 * necessary to determine its configuration. If necessary it should also
195 * use Netlink queries to determine the configuration of queues on
198 * This function must return 0 if and only if it sets 'netdev->tc' to an
199 * initialized 'struct tc'. */
200 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
202 /* Destroys the data structures allocated by the implementation as part of
203 * 'tc'. (This includes destroying 'tc->queues' by calling
206 * The implementation should not need to perform any Netlink calls. If
207 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
208 * (But it may not be desirable.)
210 * This function may be null if 'tc' is trivial. */
211 void (*tc_destroy)(struct tc *tc);
213 /* Retrieves details of 'netdev->tc' configuration into 'details'.
215 * The implementation should not need to perform any Netlink calls, because
216 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
217 * cached the configuration.
219 * The contents of 'details' should be documented as valid for 'ovs_name'
220 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
221 * (which is built as ovs-vswitchd.conf.db(8)).
223 * This function may be null if 'tc' is not configurable.
225 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
227 /* Reconfigures 'netdev->tc' according to 'details', performing any
228 * required Netlink calls to complete the reconfiguration.
230 * The contents of 'details' should be documented as valid for 'ovs_name'
231 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
232 * (which is built as ovs-vswitchd.conf.db(8)).
234 * This function may be null if 'tc' is not configurable.
236 int (*qdisc_set)(struct netdev *, const struct smap *details);
238 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
239 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
241 * The contents of 'details' should be documented as valid for 'ovs_name'
242 * in the "other_config" column in the "Queue" table in
243 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
245 * The implementation should not need to perform any Netlink calls, because
246 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
247 * cached the queue configuration.
249 * This function may be null if 'tc' does not have queues ('n_queues' is
251 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
252 struct smap *details);
254 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
255 * 'details', performing any required Netlink calls to complete the
256 * reconfiguration. The caller ensures that 'queue_id' is less than
259 * The contents of 'details' should be documented as valid for 'ovs_name'
260 * in the "other_config" column in the "Queue" table in
261 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
263 * This function may be null if 'tc' does not have queues or its queues are
264 * not configurable. */
265 int (*class_set)(struct netdev *, unsigned int queue_id,
266 const struct smap *details);
268 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
269 * tc_queue's within 'netdev->tc->queues'.
271 * This function may be null if 'tc' does not have queues or its queues
272 * cannot be deleted. */
273 int (*class_delete)(struct netdev *, struct tc_queue *queue);
275 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
276 * 'struct tc_queue's within 'netdev->tc->queues'.
278 * On success, initializes '*stats'.
280 * This function may be null if 'tc' does not have queues or if it cannot
281 * report queue statistics. */
282 int (*class_get_stats)(const struct netdev *netdev,
283 const struct tc_queue *queue,
284 struct netdev_queue_stats *stats);
286 /* Extracts queue stats from 'nlmsg', which is a response to a
287 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
289 * This function may be null if 'tc' does not have queues or if it cannot
290 * report queue statistics. */
291 int (*class_dump_stats)(const struct netdev *netdev,
292 const struct ofpbuf *nlmsg,
293 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes the generic part of 'tc' by initializing its 'queues' hmap.
 * NOTE(review): this excerpt omits some lines of the definition; presumably
 * one of them records 'ops' in 'tc' -- confirm against the full file. */
297 tc_init(struct tc *tc, const struct tc_ops *ops)
300 hmap_init(&tc->queues);
/* Releases the generic part of 'tc' by destroying its 'queues' hmap. */
304 tc_destroy(struct tc *tc)
306 hmap_destroy(&tc->queues);
309 static const struct tc_ops tc_ops_htb;
310 static const struct tc_ops tc_ops_hfsc;
311 static const struct tc_ops tc_ops_default;
312 static const struct tc_ops tc_ops_other;
314 static const struct tc_ops *tcs[] = {
315 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
316 &tc_ops_hfsc, /* Hierarchical fair service curve. */
317 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
318 &tc_ops_other, /* Some other qdisc. */
322 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
323 static unsigned int tc_get_major(unsigned int handle);
324 static unsigned int tc_get_minor(unsigned int handle);
326 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
327 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
328 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
330 static struct tcmsg *tc_make_request(const struct netdev *, int type,
331 unsigned int flags, struct ofpbuf *);
332 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
333 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
334 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
337 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
338 struct nlattr **options);
339 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
340 struct nlattr **options,
341 struct netdev_queue_stats *);
342 static int tc_query_class(const struct netdev *,
343 unsigned int handle, unsigned int parent,
344 struct ofpbuf **replyp);
345 static int tc_delete_class(const struct netdev *, unsigned int handle);
347 static int tc_del_qdisc(struct netdev *netdev);
348 static int tc_query_qdisc(const struct netdev *netdev);
350 static int tc_calc_cell_log(unsigned int mtu);
351 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
352 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
353 const struct tc_ratespec *rate);
354 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
356 struct netdev_dev_linux {
357 struct netdev_dev netdev_dev;
359 struct shash_node *shash_node;
360 unsigned int cache_valid;
361 unsigned int change_seq;
363 bool miimon; /* Link status of last poll. */
364 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
365 struct timer miimon_timer;
367 /* The following are figured out "on demand" only. They are only valid
368 * when the corresponding VALID_* bit in 'cache_valid' is set. */
370 uint8_t etheraddr[ETH_ADDR_LEN];
371 struct in_addr address, netmask;
374 unsigned int ifi_flags;
375 long long int carrier_resets;
376 uint32_t kbits_rate; /* Policing data. */
377 uint32_t kbits_burst;
378 int vport_stats_error; /* Cached error code from vport_get_stats().
379 0 or an errno value. */
380 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
381 int ether_addr_error; /* Cached error code from set/get etheraddr. */
382 int netdev_policing_error; /* Cached error code from set policing. */
383 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
384 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
386 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
387 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
388 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
391 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
395 struct tap_state tap;
399 struct netdev_linux {
400 struct netdev netdev;
404 /* Sockets used for ioctl operations. */
405 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
407 /* A Netlink routing socket that is not subscribed to any multicast groups. */
408 static struct nl_sock *rtnl_sock;
410 /* This is set pretty low because we probably won't learn anything from the
411 * additional log messages. */
412 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
414 static int netdev_linux_init(void);
416 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
417 int cmd, const char *cmd_name);
418 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
419 const char *cmd_name);
420 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
421 int cmd, const char *cmd_name);
422 static int get_flags(const struct netdev_dev *, unsigned int *flags);
423 static int set_flags(struct netdev *, unsigned int flags);
424 static int do_get_ifindex(const char *netdev_name);
425 static int get_ifindex(const struct netdev *, int *ifindexp);
426 static int do_set_addr(struct netdev *netdev,
427 int ioctl_nr, const char *ioctl_name,
428 struct in_addr addr);
429 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
430 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
431 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
432 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
433 static int af_packet_sock(void);
434 static void netdev_linux_miimon_run(void);
435 static void netdev_linux_miimon_wait(void);
/* Reports whether 'netdev_class' is one of the classes implemented in this
 * file.  Such classes are recognized by their 'init' callback being
 * netdev_linux_init(). */
438 is_netdev_linux_class(const struct netdev_class *netdev_class)
440 return netdev_class->init == netdev_linux_init;
/* Converts the generic 'netdev_dev' into the struct netdev_dev_linux that
 * contains it, asserting first that its class is a Linux netdev class. */
443 static struct netdev_dev_linux *
444 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
446 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
447 assert(is_netdev_linux_class(netdev_class));
449 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts the generic 'netdev' into the struct netdev_linux that contains
 * it, asserting first that the class of its device is a Linux netdev class. */
452 static struct netdev_linux *
453 netdev_linux_cast(const struct netdev *netdev)
455 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
456 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
457 assert(is_netdev_linux_class(netdev_class));
459 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Sets up the file-level sockets: 'af_inet_sock' (an AF_INET datagram socket)
 * and 'rtnl_sock' (a NETLINK_ROUTE socket).  The outcome is kept in the
 * function-static 'status', which is 0 or an errno value.
 * NOTE(review): the conditionals guarding each step and the return statement
 * are not visible in this excerpt. */
463 netdev_linux_init(void)
465 static int status = -1;
467 /* Create AF_INET socket. */
468 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
469 status = af_inet_sock >= 0 ? 0 : errno;
471 VLOG_ERR("failed to create inet socket: %s", strerror(status));
474 /* Create rtnetlink socket. */
476 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
478 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Performs periodic work: runs the rtnetlink link machinery, then the MII
 * monitor pass over the devices. */
487 netdev_linux_run(void)
489 rtnetlink_link_run();
490 netdev_linux_miimon_run();
/* Counterpart of netdev_linux_run(): calls the wait functions of the
 * rtnetlink link machinery and of the MII monitor. */
494 netdev_linux_wait(void)
496 rtnetlink_link_wait();
497 netdev_linux_miimon_wait();
/* Records a change to 'dev'.  A difference in IFF_RUNNING between the stored
 * flags and 'ifi_flags' counts as a carrier reset.  The new flags are then
 * stored, and only the 'cache_valid' bits that are also set in 'mask' stay
 * valid.
 * NOTE(review): the declaration of 'mask' and the body of the
 * '!dev->change_seq' branch are not visible in this excerpt. */
501 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
502 unsigned int ifi_flags,
506 if (!dev->change_seq) {
510 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
511 dev->carrier_resets++;
513 dev->ifi_flags = ifi_flags;
515 dev->cache_valid &= mask;
/* Applies the rtnetlink link notification 'change' to 'dev'.
 *
 * For RTM_NEWLINK, every cached item except VALID_DRVINFO is invalidated and
 * the MTU, Ethernet address and ifindex caches are refilled from 'change',
 * with their cached error codes cleared.  The Ethernet address is only taken
 * when it is not all-zeros.
 *
 * The final call invalidates the whole cache (mask 0).
 * NOTE(review): presumably that call is the non-RTM_NEWLINK branch, and the
 * MTU update may be guarded by a condition; neither is visible here. */
519 netdev_dev_linux_update(struct netdev_dev_linux *dev,
520 const struct rtnetlink_link_change *change)
522 if (change->nlmsg_type == RTM_NEWLINK) {
524 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
526 /* Update netdev from rtnl-change msg. */
528 dev->mtu = change->mtu;
529 dev->cache_valid |= VALID_MTU;
530 dev->netdev_mtu_error = 0;
533 if (!eth_addr_is_zero(change->addr)) {
534 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
535 dev->cache_valid |= VALID_ETHERADDR;
536 dev->ether_addr_error = 0;
539 dev->ifindex = change->ifi_index;
540 dev->cache_valid |= VALID_IFINDEX;
541 dev->get_ifindex_error = 0;
544 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
/* Callback registered with the rtnetlink link notifier (see
 * cache_notifier_ref()).  'aux' is unused.
 *
 * Two paths are visible.  The first looks up the device named by
 * 'change->ifname' and, if it belongs to a Linux netdev class, applies
 * 'change' to it.  The second walks every device of netdev_linux_class,
 * re-reads its flags and invalidates its whole cache (mask 0).
 * NOTE(review): the condition that selects between the two paths is not
 * visible in this excerpt. */
549 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
550 void *aux OVS_UNUSED)
552 struct netdev_dev_linux *dev;
554 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
556 const struct netdev_class *netdev_class =
557 netdev_dev_get_class(base_dev);
559 if (is_netdev_linux_class(netdev_class)) {
560 dev = netdev_dev_linux_cast(base_dev);
561 netdev_dev_linux_update(dev, change);
565 struct shash device_shash;
566 struct shash_node *node;
568 shash_init(&device_shash);
569 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
570 SHASH_FOR_EACH (node, &device_shash) {
575 get_flags(&dev->netdev_dev, &flags);
576 netdev_dev_linux_changed(dev, flags, 0);
578 shash_destroy(&device_shash);
/* Takes a reference on the shared rtnetlink link notifier.  When the count is
 * zero the notifier is first created with netdev_linux_cache_cb() as its
 * callback.
 * NOTE(review): the handling of a failed creation and the return value are
 * not visible in this excerpt. */
583 cache_notifier_ref(void)
585 if (!cache_notifier_refcount) {
586 assert(!netdev_linux_cache_notifier);
588 netdev_linux_cache_notifier =
589 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
591 if (!netdev_linux_cache_notifier) {
595 cache_notifier_refcount++;
/* Drops one reference on the shared rtnetlink link notifier.  When the count
 * reaches zero the notifier is destroyed and its pointer reset to NULL.
 * Calling this without an outstanding reference trips the assertion. */
601 cache_notifier_unref(void)
603 assert(cache_notifier_refcount > 0);
604 if (!--cache_notifier_refcount) {
605 assert(netdev_linux_cache_notifier);
606 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
607 netdev_linux_cache_notifier = NULL;
611 /* Creates system and internal devices. */
/* Takes a reference on the cache notifier, allocates a zeroed device with
 * 'change_seq' starting at 1, initializes the generic part with 'name' and
 * 'class', reads the initial interface flags into 'ifi_flags', and stores the
 * new device in '*netdev_devp'.
 * NOTE(review): handling of a cache_notifier_ref() failure is not visible
 * in this excerpt. */
613 netdev_linux_create(const struct netdev_class *class, const char *name,
614 struct netdev_dev **netdev_devp)
616 struct netdev_dev_linux *netdev_dev;
619 error = cache_notifier_ref();
624 netdev_dev = xzalloc(sizeof *netdev_dev);
625 netdev_dev->change_seq = 1;
626 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
627 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
629 *netdev_devp = &netdev_dev->netdev_dev;
633 /* For most types of netdevs we open the device for each call of
634 * netdev_open(). However, this is not the case with tap devices,
635 * since it is only possible to open the device once. In this
636 * situation we share a single file descriptor, and consequently
637 * buffers, across all readers. Therefore once data is read it will
638 * be unavailable to other reads for tap devices. */
/* Creates the tap device 'name'.  It opens "/dev/net/tun", issues TUNSETIFF
 * with IFF_TAP | IFF_NO_PI, makes the fd non-blocking, and initializes the
 * device with netdev_tap_class; the 'class' argument is unused.  On success
 * the new device is stored in '*netdev_devp'. */
640 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
641 const char *name, struct netdev_dev **netdev_devp)
643 struct netdev_dev_linux *netdev_dev;
644 struct tap_state *state;
645 static const char tap_dev[] = "/dev/net/tun";
649 netdev_dev = xzalloc(sizeof *netdev_dev);
650 state = &netdev_dev->state.tap;
652 error = cache_notifier_ref();
657 /* Open tap device. */
658 state->fd = open(tap_dev, O_RDWR);
661 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
662 goto error_unref_notifier;
665 /* Create tap device. */
666 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
667 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
668 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
669 VLOG_WARN("%s: creating tap device failed: %s", name,
672 goto error_unref_notifier;
675 /* Make non-blocking. */
676 error = set_nonblocking(state->fd);
678 goto error_unref_notifier;
681 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
682 *netdev_devp = &netdev_dev->netdev_dev;
/* Failure path: give back the notifier reference taken above.
 * NOTE(review): no close() of 'state->fd' is visible on this path -- check
 * the full file for a descriptor leak when TUNSETIFF or set_nonblocking()
 * fails. */
685 error_unref_notifier:
686 cache_notifier_unref();
/* Tears down the tap-specific state of 'netdev_dev'.  Work is only done when
 * the tap fd is valid (>= 0).
 * NOTE(review): the body of the conditional is not visible in this excerpt. */
693 destroy_tap(struct netdev_dev_linux *netdev_dev)
695 struct tap_state *state = &netdev_dev->state.tap;
697 if (state->fd >= 0) {
702 /* Destroys the netdev device 'netdev_dev_'. */
704 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
706 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
707 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
/* Let the traffic-control implementation free its data, if it has a
 * 'tc_destroy' hook (the hook is optional). */
709 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
710 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
/* Tap devices carry extra per-device state (the tap fd). */
713 if (class == &netdev_tap_class) {
714 destroy_tap(netdev_dev);
/* Balance the cache_notifier_ref() taken when the device was created. */
718 cache_notifier_unref();
/* Opens a new netdev handle on 'netdev_dev_' and stores it in '*netdevp'.
 * For every class except netdev_internal_class, the device's existence is
 * checked by reading its flags, with ENODEV as the tested failure.  The
 * netdev_uninit() call near the end is the cleanup for a failed open.
 * NOTE(review): the labels and return statements are not visible in this
 * excerpt. */
722 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
724 struct netdev_linux *netdev;
725 enum netdev_flags flags;
728 /* Allocate network device. */
729 netdev = xzalloc(sizeof *netdev);
731 netdev_init(&netdev->netdev, netdev_dev_);
733 /* Verify that the device really exists, by attempting to read its flags.
734 * (The flags might be cached, in which case this won't actually do an
737 * Don't do this for "internal" netdevs, though, because those have to be
738 * created as netdev objects before they exist in the kernel, because
739 * creating them in the kernel happens by passing a netdev object to
740 * dpif_port_add(). */
741 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
742 error = netdev_get_flags(&netdev->netdev, &flags);
743 if (error == ENODEV) {
748 *netdevp = &netdev->netdev;
752 netdev_uninit(&netdev->netdev, true);
756 /* Closes and destroys 'netdev'. */
758 netdev_linux_close(struct netdev *netdev_)
760 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
762 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
/* Prepares 'netdev_' for receiving packets.
 *
 * Nothing more is set up if 'netdev->fd' is already valid.  A tap device that
 * has not been opened for listening yet reuses the device-wide tap fd and is
 * marked as opened.  Otherwise a PF_PACKET/SOCK_RAW socket is created, made
 * non-blocking, and bound to the device's ifindex for all protocols
 * (ETH_P_ALL).
 * NOTE(review): the error checks, cleanup path and return statements are not
 * visible in this excerpt. */
769 netdev_linux_listen(struct netdev *netdev_)
771 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
772 struct netdev_dev_linux *netdev_dev =
773 netdev_dev_linux_cast(netdev_get_dev(netdev_));
774 struct sockaddr_ll sll;
779 if (netdev->fd >= 0) {
783 if (!strcmp(netdev_get_type(netdev_), "tap")
784 && !netdev_dev->state.tap.opened) {
785 netdev->fd = netdev_dev->state.tap.fd;
786 netdev_dev->state.tap.opened = true;
790 /* Create file descriptor. */
791 fd = socket(PF_PACKET, SOCK_RAW, 0);
794 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
798 /* Set non-blocking mode. */
799 error = set_nonblocking(fd);
804 /* Get ethernet device index. */
805 error = get_ifindex(&netdev->netdev, &ifindex);
810 /* Bind to specific ethernet device. */
811 memset(&sll, 0, sizeof sll);
812 sll.sll_family = AF_PACKET;
813 sll.sll_ifindex = ifindex;
814 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
815 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
817 VLOG_ERR("%s: failed to bind raw socket (%s)",
818 netdev_get_name(netdev_), strerror(error));
833 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
835 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
837 if (netdev->fd < 0) {
838 /* Device is not listening. */
845 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
846 ? read(netdev->fd, data, size)
847 : recv(netdev->fd, data, size, MSG_TRUNC));
849 return retval <= size ? retval : -EMSGSIZE;
850 } else if (errno != EINTR) {
851 if (errno != EAGAIN) {
852 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
853 strerror(errno), netdev_get_name(netdev_));
860 /* Registers with the poll loop to wake up from the next call to poll_block()
861 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
/* Nothing is registered when 'netdev' has no valid fd (fd < 0). */
863 netdev_linux_recv_wait(struct netdev *netdev_)
865 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
866 if (netdev->fd >= 0) {
867 poll_fd_wait(netdev->fd, POLLIN);
871 /* Discards all packets waiting to be received from 'netdev'. */
/* Three cases are visible: no valid fd (fd < 0); a tap device, where the
 * transmit queue length obtained with SIOCGIFTXQLEN is passed to drain_fd();
 * and any other device, which is drained with drain_rcvbuf(). */
873 netdev_linux_drain(struct netdev *netdev_)
875 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
876 if (netdev->fd < 0) {
878 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
880 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
881 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
885 drain_fd(netdev->fd, ifr.ifr_qlen);
888 return drain_rcvbuf(netdev->fd);
892 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
893 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
894 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
895 * the packet is too big or too small to transmit on the device.
897 * The caller retains ownership of 'buffer' in all cases.
899 * The kernel maintains a packet transmission queue, so the caller is not
900 * expected to do additional queuing of packets. */
/* NOTE(review): the comment above says 'buffer', but the parameters are
 * 'data' and 'size'.
 *
 * When the netdev has no fd of its own (fd < 0), the packet goes out through
 * the AF_PACKET socket from af_packet_sock() with sendmsg(), addressed by
 * ifindex.  Otherwise it is written to the netdev's own fd. */
902 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
904 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
908 if (netdev->fd < 0) {
909 /* Use our AF_PACKET socket to send to this device. */
910 struct sockaddr_ll sll;
917 sock = af_packet_sock();
922 error = get_ifindex(netdev_, &ifindex);
927 /* We don't bother setting most fields in sockaddr_ll because the
928 * kernel ignores them for SOCK_RAW. */
929 memset(&sll, 0, sizeof sll);
930 sll.sll_family = AF_PACKET;
931 sll.sll_ifindex = ifindex;
933 iov.iov_base = CONST_CAST(void *, data);
937 msg.msg_namelen = sizeof sll;
940 msg.msg_control = NULL;
941 msg.msg_controllen = 0;
944 retval = sendmsg(sock, &msg, 0);
946 /* Use the netdev's own fd to send to this device. This is
947 * essential for tap devices, because packets sent to a tap device
948 * with an AF_PACKET socket will loop back to be *received* again
949 * on the tap device. */
950 retval = write(netdev->fd, data, size);
954 /* The Linux AF_PACKET implementation never blocks waiting for room
955 * for packets, instead returning ENOBUFS. Translate this into
956 * EAGAIN for the caller. */
957 if (errno == ENOBUFS) {
959 } else if (errno == EINTR) {
961 } else if (errno != EAGAIN) {
962 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
963 netdev_get_name(netdev_), strerror(errno));
/* A short write is logged here; per the contract above it is reported to the
 * caller as EMSGSIZE. */
966 } else if (retval != size) {
967 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
968 "%zu) on %s", retval, size, netdev_get_name(netdev_));
976 /* Registers with the poll loop to wake up from the next call to poll_block()
977 * when the packet transmission queue has sufficient room to transmit a packet
978 * with netdev_send().
980 * The kernel maintains a packet transmission queue, so the client is not
981 * expected to do additional queuing of packets. Thus, this function is
982 * unlikely to ever be used. It is included for completeness. */
/* Non-tap devices with a valid fd wait for POLLOUT on it; tap devices request
 * an immediate wake-up instead. */
984 netdev_linux_send_wait(struct netdev *netdev_)
986 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
987 if (netdev->fd < 0) {
989 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
990 poll_fd_wait(netdev->fd, POLLOUT);
992 /* TAP device always accepts packets.*/
993 poll_immediate_wake();
997 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
998 * otherwise a positive errno value. */
1000 netdev_linux_set_etheraddr(struct netdev *netdev_,
1001 const uint8_t mac[ETH_ADDR_LEN])
1003 struct netdev_dev_linux *netdev_dev =
1004 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1006 bool up_again = false;
/* With a valid cache entry, a cached error is returned as-is and a request
 * for the address already cached needs no further work; otherwise the entry
 * is invalidated before the address is changed. */
1008 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1009 if (netdev_dev->ether_addr_error) {
1010 return netdev_dev->ether_addr_error;
1012 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
1015 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1018 /* Tap devices must be brought down before setting the address. */
1019 if (!strcmp(netdev_get_type(netdev_), "tap")) {
1020 enum netdev_flags flags;
1022 if (!netdev_get_flags(netdev_, &flags) && (flags & NETDEV_UP)) {
1023 netdev_turn_flags_off(netdev_, NETDEV_UP, false);
1027 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Only success and ENODEV are cached; other errors leave the cache entry
 * invalid. */
1028 if (!error || error == ENODEV) {
1029 netdev_dev->ether_addr_error = error;
1030 netdev_dev->cache_valid |= VALID_ETHERADDR;
1032 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
/* NOTE(review): presumably guarded by 'up_again'; the guard is not visible
 * in this excerpt. */
1037 netdev_turn_flags_on(netdev_, NETDEV_UP, false);
1043 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
/* The address is fetched with get_etheraddr() only when the cache entry is
 * not valid; the resulting error code is cached with it.  'mac' is written
 * only when the cached error is 0.  Returns the cached error code. */
1045 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1046 uint8_t mac[ETH_ADDR_LEN])
1048 struct netdev_dev_linux *netdev_dev =
1049 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1051 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1052 int error = get_etheraddr(netdev_get_name(netdev_),
1053 netdev_dev->etheraddr);
1055 netdev_dev->ether_addr_error = error;
1056 netdev_dev->cache_valid |= VALID_ETHERADDR;
1059 if (!netdev_dev->ether_addr_error) {
1060 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1063 return netdev_dev->ether_addr_error;
1066 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1067 * in bytes, not including the hardware header; thus, this is typically 1500
1068 * bytes for Ethernet devices. */
/* The MTU is stored in '*mtup'; the return value is the cached error code
 * (0 on success).  SIOCGIFMTU is issued only when the cache entry is not
 * valid, and its result, including any error, is cached.  '*mtup' is left
 * untouched when the cached error is nonzero. */
1070 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1072 struct netdev_dev_linux *netdev_dev =
1073 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1074 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1078 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1079 SIOCGIFMTU, "SIOCGIFMTU");
1081 netdev_dev->netdev_mtu_error = error;
1082 netdev_dev->mtu = ifr.ifr_mtu;
1083 netdev_dev->cache_valid |= VALID_MTU;
1086 if (!netdev_dev->netdev_mtu_error) {
1087 *mtup = netdev_dev->mtu;
1089 return netdev_dev->netdev_mtu_error;
1092 /* Sets the maximum size of transmitted packets (MTU) for the given device
1093 * using the Linux networking ioctl interface. */
/* Sets the MTU of 'netdev_' to 'mtu' with SIOCSIFMTU.
 *
 * With a valid cache entry, a cached error is returned as-is and a request
 * for the MTU already cached needs no ioctl; otherwise the entry is
 * invalidated first.  After the ioctl, only success and ENODEV are cached.
 * NOTE(review): the statement that loads 'mtu' into 'ifr' and the return
 * statements are not visible in this excerpt. */
1096 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1098 struct netdev_dev_linux *netdev_dev =
1099 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1103 if (netdev_dev->cache_valid & VALID_MTU) {
1104 if (netdev_dev->netdev_mtu_error) {
1105 return netdev_dev->netdev_mtu_error;
1107 if (netdev_dev->mtu == mtu) {
1110 netdev_dev->cache_valid &= ~VALID_MTU;
1113 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1114 SIOCSIFMTU, "SIOCSIFMTU");
1115 if (!error || error == ENODEV) {
1116 netdev_dev->netdev_mtu_error = error;
1117 netdev_dev->mtu = ifr.ifr_mtu;
1118 netdev_dev->cache_valid |= VALID_MTU;
1123 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1124 * On failure, returns a negative errno value. */
/* Thin wrapper that converts get_ifindex()'s positive error code and
 * out-parameter into a single signed return value. */
1126 netdev_linux_get_ifindex(const struct netdev *netdev)
1130 error = get_ifindex(netdev, &ifindex);
1131 return error ? -error : ifindex;
/* Stores the carrier status of 'netdev_' in '*carrier'.  When MII monitoring
 * is enabled (miimon_interval > 0) the result of the last MII poll is used;
 * otherwise the status comes from IFF_RUNNING in the cached interface
 * flags. */
1135 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1137 struct netdev_dev_linux *netdev_dev =
1138 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1140 if (netdev_dev->miimon_interval > 0) {
1141 *carrier = netdev_dev->miimon;
1143 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the device's 'carrier_resets' counter, which
 * netdev_dev_linux_changed() increments whenever IFF_RUNNING changes. */
1149 static long long int
1150 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1152 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
/* Issues the MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name'.
 * '*data' is copied into the 'ifr_data' area of a zeroed ifreq before the
 * ioctl and copied back out of it afterwards, so it serves as both input and
 * output. */
1156 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1157 struct mii_ioctl_data *data)
1162 memset(&ifr, 0, sizeof ifr);
1163 memcpy(&ifr.ifr_data, data, sizeof *data);
1164 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1165 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 *
 * The MII path issues SIOCGMIIPHY, then reads the MII_BMSR register with
 * SIOCGMIIREG and reports its BMSR_LSTATUS bit.  The fallback path asks
 * ethtool (ETHTOOL_GLINK) and reinterprets the reply as a struct
 * ethtool_value.
 * NOTE(review): the conditions that chain these steps and the return value
 * are not visible in this excerpt. */
1171 netdev_linux_get_miimon(const char *name, bool *miimon)
1173 struct mii_ioctl_data data;
1178 memset(&data, 0, sizeof data);
1179 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1181 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1182 data.reg_num = MII_BMSR;
1183 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1187 *miimon = !!(data.val_out & BMSR_LSTATUS);
1189 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1192 struct ethtool_cmd ecmd;
1194 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1197 COVERAGE_INC(netdev_get_ethtool);
1198 memset(&ecmd, 0, sizeof ecmd);
1199 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1202 struct ethtool_value eval;
1204 memcpy(&eval, &ecmd, sizeof eval);
1205 *miimon = !!eval.data;
1207 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1215 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1216 long long int interval)
1218 struct netdev_dev_linux *netdev_dev;
1220 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1222 interval = interval > 0 ? MAX(interval, 100) : 0;
1223 if (netdev_dev->miimon_interval != interval) {
1224 netdev_dev->miimon_interval = interval;
1225 timer_set_expired(&netdev_dev->miimon_timer);
1232 netdev_linux_miimon_run(void)
1234 struct shash device_shash;
1235 struct shash_node *node;
1237 shash_init(&device_shash);
1238 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1239 SHASH_FOR_EACH (node, &device_shash) {
1240 struct netdev_dev_linux *dev = node->data;
1243 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1247 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1248 if (miimon != dev->miimon) {
1249 dev->miimon = miimon;
1250 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1253 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1256 shash_destroy(&device_shash);
1260 netdev_linux_miimon_wait(void)
1262 struct shash device_shash;
1263 struct shash_node *node;
1265 shash_init(&device_shash);
1266 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1267 SHASH_FOR_EACH (node, &device_shash) {
1268 struct netdev_dev_linux *dev = node->data;
1270 if (dev->miimon_interval > 0) {
1271 timer_wait(&dev->miimon_timer);
1274 shash_destroy(&device_shash);
1277 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1278 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1281 check_for_working_netlink_stats(void)
1283 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1284 * preferable, so if that works, we'll use it. */
1285 int ifindex = do_get_ifindex("lo");
1287 VLOG_WARN("failed to get ifindex for lo, "
1288 "obtaining netdev stats from proc");
1291 struct netdev_stats stats;
1292 int error = get_stats_via_netlink(ifindex, &stats);
1294 VLOG_DBG("obtaining netdev stats via rtnetlink");
1297 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1298 "via proc (you are probably running a pre-2.6.19 "
1299 "kernel)", strerror(error));
/* Exchanges the values stored in '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_b = *b;

    *b = *a;
    *a = old_b;
}
1314 get_stats_via_vport(const struct netdev *netdev_,
1315 struct netdev_stats *stats)
1317 struct netdev_dev_linux *netdev_dev =
1318 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1320 if (!netdev_dev->vport_stats_error ||
1321 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1324 error = netdev_vport_get_stats(netdev_, stats);
1325 if (error && error != ENOENT) {
1326 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1327 "(%s)", netdev_get_name(netdev_), strerror(error));
1329 netdev_dev->vport_stats_error = error;
1330 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
1335 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1336 struct netdev_stats *stats)
1338 static int use_netlink_stats = -1;
1341 if (use_netlink_stats < 0) {
1342 use_netlink_stats = check_for_working_netlink_stats();
1345 if (use_netlink_stats) {
1348 error = get_ifindex(netdev_, &ifindex);
1350 error = get_stats_via_netlink(ifindex, stats);
1353 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1357 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1358 netdev_get_name(netdev_), error);
1364 /* Retrieves current device stats for 'netdev-linux'. */
1366 netdev_linux_get_stats(const struct netdev *netdev_,
1367 struct netdev_stats *stats)
1369 struct netdev_dev_linux *netdev_dev =
1370 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1371 struct netdev_stats dev_stats;
1374 get_stats_via_vport(netdev_, stats);
1376 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1379 if (netdev_dev->vport_stats_error) {
1386 if (netdev_dev->vport_stats_error) {
1387 /* stats not available from OVS then use ioctl stats. */
1390 stats->rx_errors += dev_stats.rx_errors;
1391 stats->tx_errors += dev_stats.tx_errors;
1392 stats->rx_dropped += dev_stats.rx_dropped;
1393 stats->tx_dropped += dev_stats.tx_dropped;
1394 stats->multicast += dev_stats.multicast;
1395 stats->collisions += dev_stats.collisions;
1396 stats->rx_length_errors += dev_stats.rx_length_errors;
1397 stats->rx_over_errors += dev_stats.rx_over_errors;
1398 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1399 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1400 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1401 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1402 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1403 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1404 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1405 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1406 stats->tx_window_errors += dev_stats.tx_window_errors;
1411 /* Retrieves current device stats for 'netdev-tap' netdev or
1412 * netdev-internal. */
1414 netdev_tap_get_stats(const struct netdev *netdev_,
1415 struct netdev_stats *stats)
1417 struct netdev_dev_linux *netdev_dev =
1418 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1419 struct netdev_stats dev_stats;
1422 get_stats_via_vport(netdev_, stats);
1424 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1426 if (netdev_dev->vport_stats_error) {
1433 /* If this port is an internal port then the transmit and receive stats
1434 * will appear to be swapped relative to the other ports since we are the
1435 * one sending the data, not a remote computer. For consistency, we swap
1436 * them back here. This does not apply if we are getting stats from the
1437 * vport layer because it always tracks stats from the perspective of the
1439 if (netdev_dev->vport_stats_error) {
1441 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1442 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1443 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1444 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1445 stats->rx_length_errors = 0;
1446 stats->rx_over_errors = 0;
1447 stats->rx_crc_errors = 0;
1448 stats->rx_frame_errors = 0;
1449 stats->rx_fifo_errors = 0;
1450 stats->rx_missed_errors = 0;
1451 stats->tx_aborted_errors = 0;
1452 stats->tx_carrier_errors = 0;
1453 stats->tx_fifo_errors = 0;
1454 stats->tx_heartbeat_errors = 0;
1455 stats->tx_window_errors = 0;
1457 stats->rx_dropped += dev_stats.tx_dropped;
1458 stats->tx_dropped += dev_stats.rx_dropped;
1460 stats->rx_errors += dev_stats.tx_errors;
1461 stats->tx_errors += dev_stats.rx_errors;
1463 stats->multicast += dev_stats.multicast;
1464 stats->collisions += dev_stats.collisions;
1470 netdev_internal_get_stats(const struct netdev *netdev_,
1471 struct netdev_stats *stats)
1473 struct netdev_dev_linux *netdev_dev =
1474 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1476 get_stats_via_vport(netdev_, stats);
1477 return netdev_dev->vport_stats_error;
1481 netdev_internal_set_stats(struct netdev *netdev,
1482 const struct netdev_stats *stats)
1484 struct ovs_vport_stats vport_stats;
1485 struct dpif_linux_vport vport;
1488 vport_stats.rx_packets = stats->rx_packets;
1489 vport_stats.tx_packets = stats->tx_packets;
1490 vport_stats.rx_bytes = stats->rx_bytes;
1491 vport_stats.tx_bytes = stats->tx_bytes;
1492 vport_stats.rx_errors = stats->rx_errors;
1493 vport_stats.tx_errors = stats->tx_errors;
1494 vport_stats.rx_dropped = stats->rx_dropped;
1495 vport_stats.tx_dropped = stats->tx_dropped;
1497 dpif_linux_vport_init(&vport);
1498 vport.cmd = OVS_VPORT_CMD_SET;
1499 vport.name = netdev_get_name(netdev);
1500 vport.stats = &vport_stats;
1502 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1504 /* If the vport layer doesn't know about the device, that doesn't mean it
1505 * doesn't exist (after all were able to open it when netdev_open() was
1506 * called), it just means that it isn't attached and we'll be getting
1507 * stats a different way. */
1508 if (err == ENODEV) {
1516 netdev_linux_read_features(struct netdev_dev_linux *netdev_dev)
1518 struct ethtool_cmd ecmd;
1522 if (netdev_dev->cache_valid & VALID_FEATURES) {
1526 COVERAGE_INC(netdev_get_ethtool);
1527 memset(&ecmd, 0, sizeof ecmd);
1528 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name, &ecmd,
1529 ETHTOOL_GSET, "ETHTOOL_GSET");
1534 /* Supported features. */
1535 netdev_dev->supported = 0;
1536 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1537 netdev_dev->supported |= NETDEV_F_10MB_HD;
1539 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1540 netdev_dev->supported |= NETDEV_F_10MB_FD;
1542 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1543 netdev_dev->supported |= NETDEV_F_100MB_HD;
1545 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1546 netdev_dev->supported |= NETDEV_F_100MB_FD;
1548 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1549 netdev_dev->supported |= NETDEV_F_1GB_HD;
1551 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1552 netdev_dev->supported |= NETDEV_F_1GB_FD;
1554 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1555 netdev_dev->supported |= NETDEV_F_10GB_FD;
1557 if (ecmd.supported & SUPPORTED_TP) {
1558 netdev_dev->supported |= NETDEV_F_COPPER;
1560 if (ecmd.supported & SUPPORTED_FIBRE) {
1561 netdev_dev->supported |= NETDEV_F_FIBER;
1563 if (ecmd.supported & SUPPORTED_Autoneg) {
1564 netdev_dev->supported |= NETDEV_F_AUTONEG;
1566 if (ecmd.supported & SUPPORTED_Pause) {
1567 netdev_dev->supported |= NETDEV_F_PAUSE;
1569 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1570 netdev_dev->supported |= NETDEV_F_PAUSE_ASYM;
1573 /* Advertised features. */
1574 netdev_dev->advertised = 0;
1575 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1576 netdev_dev->advertised |= NETDEV_F_10MB_HD;
1578 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1579 netdev_dev->advertised |= NETDEV_F_10MB_FD;
1581 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1582 netdev_dev->advertised |= NETDEV_F_100MB_HD;
1584 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1585 netdev_dev->advertised |= NETDEV_F_100MB_FD;
1587 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1588 netdev_dev->advertised |= NETDEV_F_1GB_HD;
1590 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1591 netdev_dev->advertised |= NETDEV_F_1GB_FD;
1593 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1594 netdev_dev->advertised |= NETDEV_F_10GB_FD;
1596 if (ecmd.advertising & ADVERTISED_TP) {
1597 netdev_dev->advertised |= NETDEV_F_COPPER;
1599 if (ecmd.advertising & ADVERTISED_FIBRE) {
1600 netdev_dev->advertised |= NETDEV_F_FIBER;
1602 if (ecmd.advertising & ADVERTISED_Autoneg) {
1603 netdev_dev->advertised |= NETDEV_F_AUTONEG;
1605 if (ecmd.advertising & ADVERTISED_Pause) {
1606 netdev_dev->advertised |= NETDEV_F_PAUSE;
1608 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1609 netdev_dev->advertised |= NETDEV_F_PAUSE_ASYM;
1612 /* Current settings. */
1614 if (speed == SPEED_10) {
1615 netdev_dev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1616 } else if (speed == SPEED_100) {
1617 netdev_dev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1618 } else if (speed == SPEED_1000) {
1619 netdev_dev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1620 } else if (speed == SPEED_10000) {
1621 netdev_dev->current = NETDEV_F_10GB_FD;
1622 } else if (speed == 40000) {
1623 netdev_dev->current = NETDEV_F_40GB_FD;
1624 } else if (speed == 100000) {
1625 netdev_dev->current = NETDEV_F_100GB_FD;
1626 } else if (speed == 1000000) {
1627 netdev_dev->current = NETDEV_F_1TB_FD;
1629 netdev_dev->current = 0;
1632 if (ecmd.port == PORT_TP) {
1633 netdev_dev->current |= NETDEV_F_COPPER;
1634 } else if (ecmd.port == PORT_FIBRE) {
1635 netdev_dev->current |= NETDEV_F_FIBER;
1639 netdev_dev->current |= NETDEV_F_AUTONEG;
1642 /* Peer advertisements. */
1643 netdev_dev->peer = 0; /* XXX */
1646 netdev_dev->cache_valid |= VALID_FEATURES;
1647 netdev_dev->get_features_error = error;
1650 /* Stores the features supported by 'netdev' into each of '*current',
1651 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1652 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1655 netdev_linux_get_features(const struct netdev *netdev_,
1656 enum netdev_features *current,
1657 enum netdev_features *advertised,
1658 enum netdev_features *supported,
1659 enum netdev_features *peer)
1661 struct netdev_dev_linux *netdev_dev =
1662 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1664 netdev_linux_read_features(netdev_dev);
1666 if (!netdev_dev->get_features_error) {
1667 *current = netdev_dev->current;
1668 *advertised = netdev_dev->advertised;
1669 *supported = netdev_dev->supported;
1670 *peer = netdev_dev->peer;
1672 return netdev_dev->get_features_error;
1675 /* Set the features advertised by 'netdev' to 'advertise'. */
1677 netdev_linux_set_advertisements(struct netdev *netdev,
1678 enum netdev_features advertise)
1680 struct ethtool_cmd ecmd;
1683 COVERAGE_INC(netdev_get_ethtool);
1684 memset(&ecmd, 0, sizeof ecmd);
1685 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1686 ETHTOOL_GSET, "ETHTOOL_GSET");
1691 ecmd.advertising = 0;
1692 if (advertise & NETDEV_F_10MB_HD) {
1693 ecmd.advertising |= ADVERTISED_10baseT_Half;
1695 if (advertise & NETDEV_F_10MB_FD) {
1696 ecmd.advertising |= ADVERTISED_10baseT_Full;
1698 if (advertise & NETDEV_F_100MB_HD) {
1699 ecmd.advertising |= ADVERTISED_100baseT_Half;
1701 if (advertise & NETDEV_F_100MB_FD) {
1702 ecmd.advertising |= ADVERTISED_100baseT_Full;
1704 if (advertise & NETDEV_F_1GB_HD) {
1705 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1707 if (advertise & NETDEV_F_1GB_FD) {
1708 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1710 if (advertise & NETDEV_F_10GB_FD) {
1711 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1713 if (advertise & NETDEV_F_COPPER) {
1714 ecmd.advertising |= ADVERTISED_TP;
1716 if (advertise & NETDEV_F_FIBER) {
1717 ecmd.advertising |= ADVERTISED_FIBRE;
1719 if (advertise & NETDEV_F_AUTONEG) {
1720 ecmd.advertising |= ADVERTISED_Autoneg;
1722 if (advertise & NETDEV_F_PAUSE) {
1723 ecmd.advertising |= ADVERTISED_Pause;
1725 if (advertise & NETDEV_F_PAUSE_ASYM) {
1726 ecmd.advertising |= ADVERTISED_Asym_Pause;
1728 COVERAGE_INC(netdev_set_ethtool);
1729 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1730 ETHTOOL_SSET, "ETHTOOL_SSET");
1733 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1734 * successful, otherwise a positive errno value. */
1736 netdev_linux_set_policing(struct netdev *netdev,
1737 uint32_t kbits_rate, uint32_t kbits_burst)
1739 struct netdev_dev_linux *netdev_dev =
1740 netdev_dev_linux_cast(netdev_get_dev(netdev));
1741 const char *netdev_name = netdev_get_name(netdev);
1745 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1746 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1747 : kbits_burst); /* Stick with user-specified value. */
1749 if (netdev_dev->cache_valid & VALID_POLICING) {
1750 if (netdev_dev->netdev_policing_error) {
1751 return netdev_dev->netdev_policing_error;
1754 if (netdev_dev->kbits_rate == kbits_rate &&
1755 netdev_dev->kbits_burst == kbits_burst) {
1756 /* Assume that settings haven't changed since we last set them. */
1759 netdev_dev->cache_valid &= ~VALID_POLICING;
1762 COVERAGE_INC(netdev_set_policing);
1763 /* Remove any existing ingress qdisc. */
1764 error = tc_add_del_ingress_qdisc(netdev, false);
1766 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1767 netdev_name, strerror(error));
1772 error = tc_add_del_ingress_qdisc(netdev, true);
1774 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1775 netdev_name, strerror(error));
1779 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1781 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1782 netdev_name, strerror(error));
1787 netdev_dev->kbits_rate = kbits_rate;
1788 netdev_dev->kbits_burst = kbits_burst;
1791 if (!error || error == ENODEV) {
1792 netdev_dev->netdev_policing_error = error;
1793 netdev_dev->cache_valid |= VALID_POLICING;
1799 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1802 const struct tc_ops **opsp;
1804 for (opsp = tcs; *opsp != NULL; opsp++) {
1805 const struct tc_ops *ops = *opsp;
1806 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1807 sset_add(types, ops->ovs_name);
1813 static const struct tc_ops *
1814 tc_lookup_ovs_name(const char *name)
1816 const struct tc_ops **opsp;
1818 for (opsp = tcs; *opsp != NULL; opsp++) {
1819 const struct tc_ops *ops = *opsp;
1820 if (!strcmp(name, ops->ovs_name)) {
1827 static const struct tc_ops *
1828 tc_lookup_linux_name(const char *name)
1830 const struct tc_ops **opsp;
1832 for (opsp = tcs; *opsp != NULL; opsp++) {
1833 const struct tc_ops *ops = *opsp;
1834 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1841 static struct tc_queue *
1842 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1845 struct netdev_dev_linux *netdev_dev =
1846 netdev_dev_linux_cast(netdev_get_dev(netdev));
1847 struct tc_queue *queue;
1849 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1850 if (queue->queue_id == queue_id) {
/* Returns the queue numbered 'queue_id' on 'netdev', or NULL if there is
 * none.  The bucket is selected with hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1864 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1866 struct netdev_qos_capabilities *caps)
1868 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1872 caps->n_queues = ops->n_queues;
1877 netdev_linux_get_qos(const struct netdev *netdev,
1878 const char **typep, struct smap *details)
1880 struct netdev_dev_linux *netdev_dev =
1881 netdev_dev_linux_cast(netdev_get_dev(netdev));
1884 error = tc_query_qdisc(netdev);
1889 *typep = netdev_dev->tc->ops->ovs_name;
1890 return (netdev_dev->tc->ops->qdisc_get
1891 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1896 netdev_linux_set_qos(struct netdev *netdev,
1897 const char *type, const struct smap *details)
1899 struct netdev_dev_linux *netdev_dev =
1900 netdev_dev_linux_cast(netdev_get_dev(netdev));
1901 const struct tc_ops *new_ops;
1904 new_ops = tc_lookup_ovs_name(type);
1905 if (!new_ops || !new_ops->tc_install) {
1909 error = tc_query_qdisc(netdev);
1914 if (new_ops == netdev_dev->tc->ops) {
1915 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1917 /* Delete existing qdisc. */
1918 error = tc_del_qdisc(netdev);
1922 assert(netdev_dev->tc == NULL);
1924 /* Install new qdisc. */
1925 error = new_ops->tc_install(netdev, details);
1926 assert((error == 0) == (netdev_dev->tc != NULL));
1933 netdev_linux_get_queue(const struct netdev *netdev,
1934 unsigned int queue_id, struct smap *details)
1936 struct netdev_dev_linux *netdev_dev =
1937 netdev_dev_linux_cast(netdev_get_dev(netdev));
1940 error = tc_query_qdisc(netdev);
1944 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1946 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1952 netdev_linux_set_queue(struct netdev *netdev,
1953 unsigned int queue_id, const struct smap *details)
1955 struct netdev_dev_linux *netdev_dev =
1956 netdev_dev_linux_cast(netdev_get_dev(netdev));
1959 error = tc_query_qdisc(netdev);
1962 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1963 || !netdev_dev->tc->ops->class_set) {
1967 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1971 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1973 struct netdev_dev_linux *netdev_dev =
1974 netdev_dev_linux_cast(netdev_get_dev(netdev));
1977 error = tc_query_qdisc(netdev);
1980 } else if (!netdev_dev->tc->ops->class_delete) {
1983 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1985 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1991 netdev_linux_get_queue_stats(const struct netdev *netdev,
1992 unsigned int queue_id,
1993 struct netdev_queue_stats *stats)
1995 struct netdev_dev_linux *netdev_dev =
1996 netdev_dev_linux_cast(netdev_get_dev(netdev));
1999 error = tc_query_qdisc(netdev);
2002 } else if (!netdev_dev->tc->ops->class_get_stats) {
2005 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2007 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
2013 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2015 struct ofpbuf request;
2016 struct tcmsg *tcmsg;
2018 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2022 tcmsg->tcm_parent = 0;
2023 nl_dump_start(dump, rtnl_sock, &request);
2024 ofpbuf_uninit(&request);
2029 netdev_linux_dump_queues(const struct netdev *netdev,
2030 netdev_dump_queues_cb *cb, void *aux)
2032 struct netdev_dev_linux *netdev_dev =
2033 netdev_dev_linux_cast(netdev_get_dev(netdev));
2034 struct tc_queue *queue, *next_queue;
2035 struct smap details;
2039 error = tc_query_qdisc(netdev);
2042 } else if (!netdev_dev->tc->ops->class_get) {
2047 smap_init(&details);
2048 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2049 &netdev_dev->tc->queues) {
2050 smap_clear(&details);
2052 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
2054 (*cb)(queue->queue_id, &details, aux);
2059 smap_destroy(&details);
2065 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2066 netdev_dump_queue_stats_cb *cb, void *aux)
2068 struct netdev_dev_linux *netdev_dev =
2069 netdev_dev_linux_cast(netdev_get_dev(netdev));
2070 struct nl_dump dump;
2075 error = tc_query_qdisc(netdev);
2078 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2083 if (!start_queue_dump(netdev, &dump)) {
2086 while (nl_dump_next(&dump, &msg)) {
2087 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2093 error = nl_dump_done(&dump);
2094 return error ? error : last_error;
2098 netdev_linux_get_in4(const struct netdev *netdev_,
2099 struct in_addr *address, struct in_addr *netmask)
2101 struct netdev_dev_linux *netdev_dev =
2102 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2104 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2107 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2108 SIOCGIFADDR, "SIOCGIFADDR");
2113 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2114 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2119 netdev_dev->cache_valid |= VALID_IN4;
2121 *address = netdev_dev->address;
2122 *netmask = netdev_dev->netmask;
2123 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2127 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2128 struct in_addr netmask)
2130 struct netdev_dev_linux *netdev_dev =
2131 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2134 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2136 netdev_dev->cache_valid |= VALID_IN4;
2137 netdev_dev->address = address;
2138 netdev_dev->netmask = netmask;
2139 if (address.s_addr != INADDR_ANY) {
2140 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2141 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line in the format of /proc/net/if_inet6: 32 hex digits
 * of IPv6 address, four hex fields that are skipped, and an interface name of
 * at most 16 characters.  Stores the address in '*in6' and the name in
 * 'ifname'.  Returns true only if all 17 fields were converted. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
#define X8 "%2"SCNx8
    uint8_t *octet = in6->s6_addr;
    int n_converted;

    n_converted = sscanf(line,
                         " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                         "%*x %*x %*x %*x %16s\n",
                         &octet[0], &octet[1], &octet[2], &octet[3],
                         &octet[4], &octet[5], &octet[6], &octet[7],
                         &octet[8], &octet[9], &octet[10], &octet[11],
                         &octet[12], &octet[13], &octet[14], &octet[15],
                         ifname);
    return n_converted == 17;
}
2163 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2164 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2166 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2168 struct netdev_dev_linux *netdev_dev =
2169 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2170 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2174 netdev_dev->in6 = in6addr_any;
2176 file = fopen("/proc/net/if_inet6", "r");
2178 const char *name = netdev_get_name(netdev_);
2179 while (fgets(line, sizeof line, file)) {
2180 struct in6_addr in6_tmp;
2181 char ifname[16 + 1];
2182 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2183 && !strcmp(name, ifname))
2185 netdev_dev->in6 = in6_tmp;
2191 netdev_dev->cache_valid |= VALID_IN6;
2193 *in6 = netdev_dev->in6;
/* Initializes '*sa' as an AF_INET socket address for 'addr' with port 0.
 * Every byte of '*sa' not covered by the IPv4 address structure is zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2211 do_set_addr(struct netdev *netdev,
2212 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2215 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2216 make_in4_sockaddr(&ifr.ifr_addr, addr);
2218 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2222 /* Adds 'router' as a default IP gateway. */
2224 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2226 struct in_addr any = { INADDR_ANY };
2230 memset(&rt, 0, sizeof rt);
2231 make_in4_sockaddr(&rt.rt_dst, any);
2232 make_in4_sockaddr(&rt.rt_gateway, router);
2233 make_in4_sockaddr(&rt.rt_genmask, any);
2234 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2235 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2237 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2243 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2246 static const char fn[] = "/proc/net/route";
2251 *netdev_name = NULL;
2252 stream = fopen(fn, "r");
2253 if (stream == NULL) {
2254 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2259 while (fgets(line, sizeof line, stream)) {
2262 ovs_be32 dest, gateway, mask;
2263 int refcnt, metric, mtu;
2264 unsigned int flags, use, window, irtt;
2267 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2269 iface, &dest, &gateway, &flags, &refcnt,
2270 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2272 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2276 if (!(flags & RTF_UP)) {
2277 /* Skip routes that aren't up. */
2281 /* The output of 'dest', 'mask', and 'gateway' were given in
2282 * network byte order, so we don't need need any endian
2283 * conversions here. */
2284 if ((dest & mask) == (host->s_addr & mask)) {
2286 /* The host is directly reachable. */
2287 next_hop->s_addr = 0;
2289 /* To reach the host, we must go through a gateway. */
2290 next_hop->s_addr = gateway;
2292 *netdev_name = xstrdup(iface);
2304 netdev_linux_get_status(const struct netdev *netdev, struct smap *smap)
2306 struct netdev_dev_linux *netdev_dev;
2309 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2310 if (!(netdev_dev->cache_valid & VALID_DRVINFO)) {
2311 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev_dev->drvinfo;
2313 COVERAGE_INC(netdev_get_ethtool);
2314 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
2315 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
2318 "ETHTOOL_GDRVINFO");
2320 netdev_dev->cache_valid |= VALID_DRVINFO;
2325 smap_add(smap, "driver_name", netdev_dev->drvinfo.driver);
2326 smap_add(smap, "driver_version", netdev_dev->drvinfo.version);
2327 smap_add(smap, "firmware_version", netdev_dev->drvinfo.fw_version);
2333 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2336 smap_add(smap, "driver_name", "openvswitch");
2340 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2341 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2342 * returns 0. Otherwise, it returns a positive errno value; in particular,
2343 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2345 netdev_linux_arp_lookup(const struct netdev *netdev,
2346 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2349 struct sockaddr_in sin;
2352 memset(&r, 0, sizeof r);
2353 memset(&sin, 0, sizeof sin);
2354 sin.sin_family = AF_INET;
2355 sin.sin_addr.s_addr = ip;
2357 memcpy(&r.arp_pa, &sin, sizeof sin);
2358 r.arp_ha.sa_family = ARPHRD_ETHER;
2360 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2361 COVERAGE_INC(netdev_arp_lookup);
2362 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2364 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2365 } else if (retval != ENXIO) {
2366 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2367 netdev_get_name(netdev), IP_ARGS(ip), strerror(retval));
2373 nd_to_iff_flags(enum netdev_flags nd)
2376 if (nd & NETDEV_UP) {
2379 if (nd & NETDEV_PROMISC) {
2386 iff_to_nd_flags(int iff)
2388 enum netdev_flags nd = 0;
2392 if (iff & IFF_PROMISC) {
2393 nd |= NETDEV_PROMISC;
2399 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2400 enum netdev_flags on, enum netdev_flags *old_flagsp)
2402 struct netdev_dev_linux *netdev_dev;
2403 int old_flags, new_flags;
2406 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2407 old_flags = netdev_dev->ifi_flags;
2408 *old_flagsp = iff_to_nd_flags(old_flags);
2409 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2410 if (new_flags != old_flags) {
2411 error = set_flags(netdev, new_flags);
2412 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
2418 netdev_linux_change_seq(const struct netdev *netdev)
2420 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to a positional initializer for a 'struct netdev_class' whose
 * callbacks are the netdev_linux_*() functions in this file, except for the
 * entries supplied as macro arguments: the class name (NAME) and the create,
 * get_stats, set_stats, get_features and get_status callbacks.
 *
 * The initializer is positional, so the order of the entries must match the
 * member order of 'struct netdev_class' exactly. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS,  \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
                                                                \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
                                                                \
    netdev_linux_open,                                          \
    netdev_linux_close,                                         \
                                                                \
    netdev_linux_listen,                                        \
    netdev_linux_recv,                                          \
    netdev_linux_recv_wait,                                     \
    netdev_linux_drain,                                         \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    SET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_change_seq                                     \
}
/* The "system" netdev class: combined vport and kernel stats, no way to set
 * stats, ethtool-based features and driver status. */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_create,
        netdev_linux_get_stats,
        NULL,                    /* set_stats */
        netdev_linux_get_features,
        netdev_linux_get_status);
/* The "tap" netdev class: created with netdev_linux_create_tap() and using
 * netdev_tap_get_stats(), which exchanges rx and tx; otherwise configured
 * like the "system" class. */
const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_create_tap,
        netdev_tap_get_stats,
        NULL,                   /* set_stats */
        netdev_linux_get_features,
        netdev_linux_get_status);
/* The "internal" netdev class: stats are read from and written to the vport
 * layer only, features are not available, and status reports a fixed driver
 * name. */
const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_create,
        netdev_internal_get_stats,
        netdev_internal_set_stats,
        NULL,                  /* get_features */
        netdev_internal_get_status);
2515 /* HTB traffic control class. */
/* Largest class minor number that the HTB code accepts as a queue: a class
 * 1:N with 1 <= N <= HTB_N_QUEUES is reported as queue N - 1.  Also used as
 * the n_queues entry of tc_ops_htb. */
2517 #define HTB_N_QUEUES 0xf000
/* Qdisc-wide ceiling: stored by htb_install__() and htb_qdisc_set(), reported
 * (multiplied by 8) by htb_qdisc_get(), and used to cap per-class rates. */
2521 unsigned int max_rate; /* In bytes/s. */
/* Per-queue HTB parameters.  'tc_queue' is embedded so that
 * htb_class_cast__() can recover the enclosing structure with
 * CONTAINER_OF(). */
2525 struct tc_queue tc_queue;
2526 unsigned int min_rate; /* In bytes/s. */
2527 unsigned int max_rate; /* In bytes/s. */
2528 unsigned int burst; /* In bytes. */
2529 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that embeds the 'tc' currently attached to the
 * device underlying 'netdev'.
 *
 * NOTE(review): presumably only meaningful while that 'tc' was created by
 * htb_install__() -- confirm against callers. */
2533 htb_get__(const struct netdev *netdev)
2535 struct netdev_dev_linux *netdev_dev =
2536 netdev_dev_linux_cast(netdev_get_dev(netdev));
2537 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a struct htb, initializes its embedded 'tc' with tc_ops_htb,
 * records 'max_rate' in it, and attaches it to the device underlying
 * 'netdev'.
 *
 * NOTE(review): 'max_rate' is a uint64_t here, while the HTB rate fields
 * visible nearby are unsigned int, so large values may be narrowed on
 * assignment -- confirm. */
2541 htb_install__(struct netdev *netdev, uint64_t max_rate)
2543 struct netdev_dev_linux *netdev_dev =
2544 netdev_dev_linux_cast(netdev_get_dev(netdev));
2547 htb = xmalloc(sizeof *htb);
2548 tc_init(&htb->tc, &tc_ops_htb);
2549 htb->max_rate = max_rate;
2551 netdev_dev->tc = &htb->tc;
2554 /* Create an HTB qdisc.
2556 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Any existing qdisc is removed first with tc_del_qdisc().  The new qdisc is
 * requested with NLM_F_EXCL | NLM_F_CREATE as handle 1:0 under TC_H_ROOT,
 * carrying a zero-filled struct tc_htb_glob (rate2quantum set to 10) as
 * TCA_HTB_INIT.  The value returned is that of tc_transact(). */
2558 htb_setup_qdisc__(struct netdev *netdev)
2561 struct tc_htb_glob opt;
2562 struct ofpbuf request;
2563 struct tcmsg *tcmsg;
2565 tc_del_qdisc(netdev);
2567 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2568 NLM_F_EXCL | NLM_F_CREATE, &request);
2572 tcmsg->tcm_handle = tc_make_handle(1, 0);
2573 tcmsg->tcm_parent = TC_H_ROOT;
2575 nl_msg_put_string(&request, TCA_KIND, "htb");
2577 memset(&opt, 0, sizeof opt);
2578 opt.rate2quantum = 10;
2582 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2583 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2584 nl_msg_end_nested(&request, opt_offset);
2586 return tc_transact(&request, NULL);
2589 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2590 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* The device MTU is looked up first (a rate-limited warning is logged when it
 * is unavailable) because the rate tables and buffer sizes depend on it.
 * 'class' supplies min_rate (rate), max_rate (ceil), burst and priority.  The
 * request is an RTM_NEWTCLASS with NLM_F_CREATE carrying TCA_HTB_PARMS plus
 * rate tables for both rate and ceil.  A failed transaction is logged with a
 * rate-limited warning that includes the class parameters. */
2592 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2593 unsigned int parent, struct htb_class *class)
2596 struct tc_htb_opt opt;
2597 struct ofpbuf request;
2598 struct tcmsg *tcmsg;
2602 error = netdev_get_mtu(netdev, &mtu);
2604 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2605 netdev_get_name(netdev));
2609 memset(&opt, 0, sizeof opt);
2610 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2611 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2612 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2613 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2614 opt.prio = class->priority;
2616 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2620 tcmsg->tcm_handle = handle;
2621 tcmsg->tcm_parent = parent;
2623 nl_msg_put_string(&request, TCA_KIND, "htb");
2624 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2625 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2626 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2627 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2628 nl_msg_end_nested(&request, opt_offset);
2630 error = tc_transact(&request, NULL);
2632 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2633 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2634 netdev_get_name(netdev),
2635 tc_get_major(handle), tc_get_minor(handle),
2636 tc_get_major(parent), tc_get_minor(parent),
2637 class->min_rate, class->max_rate,
2638 class->burst, class->priority, strerror(error));
/* The nested attributes must include a TCA_HTB_PARMS of at least
 * sizeof(struct tc_htb_opt); otherwise a rate-limited warning is logged.
 * From that structure, 'class' receives rate.rate as min_rate, ceil.rate as
 * max_rate, the buffer converted from ticks to bytes as burst, and prio as
 * priority. */
2643 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2644 * description of them into 'details'. The description complies with the
2645 * specification given in the vswitch database documentation for linux-htb
2648 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2650 static const struct nl_policy tca_htb_policy[] = {
2651 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2652 .min_len = sizeof(struct tc_htb_opt) },
2655 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2656 const struct tc_htb_opt *htb;
2658 if (!nl_parse_nested(nl_options, tca_htb_policy,
2659 attrs, ARRAY_SIZE(tca_htb_policy))) {
2660 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2664 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2665 class->min_rate = htb->rate.rate;
2666 class->max_rate = htb->ceil.rate;
2667 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2668 class->priority = htb->prio;
/* Parses the class message 'tcmsg' with tc_parse_class(), which also fills
 * in 'stats'.
 *
 * If parsing succeeds and 'queue_id' is nonnull, a handle of the form 1:N
 * with 1 <= N <= HTB_N_QUEUES is translated to queue ID N - 1.  If there is
 * still no error and 'options' is nonnull, the HTB options are parsed into
 * '*options' with htb_parse_tca_options__(). */
2673 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2674 struct htb_class *options,
2675 struct netdev_queue_stats *stats)
2677 struct nlattr *nl_options;
2678 unsigned int handle;
2681 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2682 if (!error && queue_id) {
2683 unsigned int major = tc_get_major(handle);
2684 unsigned int minor = tc_get_minor(handle);
2685 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2686 *queue_id = minor - 1;
2691 if (!error && options) {
2692 error = htb_parse_tca_options__(nl_options, options);
2698 htb_parse_qdisc_details__(struct netdev *netdev,
2699 const struct smap *details, struct htb_class *hc)
2701 const char *max_rate_s;
2703 max_rate_s = smap_get(details, "max-rate");
2704 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2705 if (!hc->max_rate) {
2706 enum netdev_features current;
2708 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2709 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2711 hc->min_rate = hc->max_rate;
/* Fills in 'hc' from the per-queue settings in 'details'.
 *
 * "min-rate", "max-rate" and "burst" are parsed as decimal numbers and
 * divided by 8 before being stored.  min_rate is raised to at least the
 * device MTU and then capped at the qdisc's max_rate.  max_rate is raised to
 * at least min_rate and then capped at the qdisc's max_rate.  burst is raised
 * to at least MTU + 64.  "priority" defaults to 0 when absent.
 *
 * A rate-limited warning is logged when the device MTU cannot be obtained. */
2717 htb_parse_class_details__(struct netdev *netdev,
2718 const struct smap *details, struct htb_class *hc)
2720 const struct htb *htb = htb_get__(netdev);
2721 const char *min_rate_s = smap_get(details, "min-rate");
2722 const char *max_rate_s = smap_get(details, "max-rate");
2723 const char *burst_s = smap_get(details, "burst");
2724 const char *priority_s = smap_get(details, "priority");
2727 error = netdev_get_mtu(netdev, &mtu);
2729 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2730 netdev_get_name(netdev));
2734 /* HTB requires at least an mtu sized min-rate to send any traffic even
2735 * on uncongested links. */
2736 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2737 hc->min_rate = MAX(hc->min_rate, mtu);
2738 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2741 hc->max_rate = (max_rate_s
2742 ? strtoull(max_rate_s, NULL, 10) / 8
2744 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2745 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2749 * According to hints in the documentation that I've read, it is important
2750 * that 'burst' be at least as big as the largest frame that might be
2751 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2752 * but having it a bit too small is a problem. Since netdev_get_mtu()
2753 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2754 * the MTU. We actually add 64, instead of 14, as a guard against
2755 * additional headers get tacked on somewhere that we're not aware of. */
2756 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2757 hc->burst = MAX(hc->burst, mtu + 64);
2760 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' (whose parent is 'parent') on 'netdev'
 * with tc_query_class(), then parses the reply into 'options' and 'stats'
 * using htb_parse_tcmsg__().  No queue ID is requested.  The reply buffer is
 * deleted after parsing. */
2766 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2767 unsigned int parent, struct htb_class *options,
2768 struct netdev_queue_stats *stats)
2770 struct ofpbuf *reply;
2773 error = tc_query_class(netdev, handle, parent, &reply);
2775 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2776 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': creates the qdisc, creates class 1:fffe under
 * 1:0 from the qdisc-level settings in 'details', and records the resulting
 * max_rate with htb_install__().
 *
 * NOTE(review): presumably each step runs only when the previous one
 * succeeded -- confirm. */
2782 htb_tc_install(struct netdev *netdev, const struct smap *details)
2786 error = htb_setup_qdisc__(netdev);
2788 struct htb_class hc;
2790 htb_parse_qdisc_details__(netdev, details, &hc);
2791 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2792 tc_make_handle(1, 0), &hc);
2794 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class whose embedded 'tc_queue' member is
 * 'queue'. */
2800 static struct htb_class *
2801 htb_class_cast__(const struct tc_queue *queue)
2803 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' as the cached state of queue 'queue_id'.
 *
 * The queue is looked up with tc_find_queue__().  The visible code both
 * reuses an existing entry and allocates a new htb_class that is inserted
 * into the HTB's queue map (presumably when the lookup finds nothing).
 * Either way the rates, burst and priority are then copied from 'hc'. */
2807 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2808 const struct htb_class *hc)
2810 struct htb *htb = htb_get__(netdev);
2811 size_t hash = hash_int(queue_id, 0);
2812 struct tc_queue *queue;
2813 struct htb_class *hcp;
2815 queue = tc_find_queue__(netdev, queue_id, hash);
2817 hcp = htb_class_cast__(queue);
2819 hcp = xmalloc(sizeof *hcp);
2820 queue = &hcp->tc_queue;
2821 queue->queue_id = queue_id;
2822 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2825 hcp->min_rate = hc->min_rate;
2826 hcp->max_rate = hc->max_rate;
2827 hcp->burst = hc->burst;
2828 hcp->priority = hc->priority;
/* Rebuilds the in-memory HTB state for 'netdev' from the kernel.
 *
 * Class 1:fffe is queried for the qdisc-level max_rate (the query's return
 * value is not checked) and an htb is installed with it.  The classes are
 * then dumped, and each message that htb_parse_tcmsg__() accepts is recorded
 * with htb_update_queue__().  'nlmsg' is unused. */
2832 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2835 struct nl_dump dump;
2836 struct htb_class hc;
2838 /* Get qdisc options. */
2840 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2841 htb_install__(netdev, hc.max_rate);
2844 if (!start_queue_dump(netdev, &dump)) {
2847 while (nl_dump_next(&dump, &msg)) {
2848 unsigned int queue_id;
2850 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2851 htb_update_queue__(netdev, queue_id, &hc);
2854 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc' by removing every cached class
 * from its queue map.  The _SAFE iterator is used because entries are
 * removed during the walk.
 *
 * NOTE(review): presumably each class and the htb itself are also freed --
 * confirm. */
2860 htb_tc_destroy(struct tc *tc)
2862 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2863 struct htb_class *hc, *next;
2865 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2866 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds the qdisc-level "max-rate" to 'details'.  The stored value is
 * multiplied by 8 before it is formatted. */
2874 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2876 const struct htb *htb = htb_get__(netdev);
2877 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* Applies new qdisc-level settings from 'details': re-creates class 1:fffe
 * under 1:0 with the parsed parameters and stores the new max_rate in the
 * cached htb.
 *
 * NOTE(review): presumably the cached value is updated only when the kernel
 * accepted the class -- confirm. */
2882 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2884 struct htb_class hc;
2887 htb_parse_qdisc_details__(netdev, details, &hc);
2888 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2889 tc_make_handle(1, 0), &hc);
2891 htb_get__(netdev)->max_rate = hc.max_rate;
/* Describes the cached HTB class for 'queue' in 'details'.
 *
 * "min-rate" is always added.  "max-rate" is added only when it differs from
 * min-rate.  "burst" and "priority" are also added.  The rates and the burst
 * are multiplied by 8 before being formatted.  'netdev' is unused. */
2897 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2898 const struct tc_queue *queue, struct smap *details)
2900 const struct htb_class *hc = htb_class_cast__(queue);
2902 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2903 if (hc->min_rate != hc->max_rate) {
2904 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2906 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2908 smap_add_format(details, "priority", "%u", hc->priority);
/* Configures queue 'queue_id' on 'netdev' from 'details': parses the
 * settings, creates or replaces class 1:(queue_id + 1) under 1:fffe, and
 * updates the cached queue.
 *
 * NOTE(review): presumably an error at either of the first two steps is
 * returned before the cache is touched -- confirm. */
2914 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2915 const struct smap *details)
2917 struct htb_class hc;
2920 error = htb_parse_class_details__(netdev, details, &hc);
2925 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2926 tc_make_handle(1, 0xfffe), &hc);
2931 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes kernel class 1:(queue_id + 1) that backs 'queue' and removes the
 * queue from the HTB's queue map.
 *
 * NOTE(review): presumably the map entry is removed (and freed) only when the
 * kernel deletion succeeded -- confirm. */
2936 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2938 struct htb_class *hc = htb_class_cast__(queue);
2939 struct htb *htb = htb_get__(netdev);
2942 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2944 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves statistics for 'queue' into 'stats' by querying class
 * 1:(queue_id + 1), child of 1:fffe.  Class options are not requested. */
2951 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2952 struct netdev_queue_stats *stats)
2954 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2955 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class statistics in 'nlmsg'.  If the class handle has the form
 * 1:N with 1 <= N <= HTB_N_QUEUES, 'cb' is invoked with queue ID N - 1, the
 * statistics, and 'aux'.  'netdev' is unused. */
2959 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2960 const struct ofpbuf *nlmsg,
2961 netdev_dump_queue_stats_cb *cb, void *aux)
2963 struct netdev_queue_stats stats;
2964 unsigned int handle, major, minor;
2967 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2972 major = tc_get_major(handle);
2973 minor = tc_get_minor(handle);
2974 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2975 (*cb)(minor - 1, &stats, aux);
/* Operations table for HTB: kernel qdisc name "htb", OVS QoS type
 * "linux-htb", and up to HTB_N_QUEUES queues. */
2980 static const struct tc_ops tc_ops_htb = {
2981 "htb", /* linux_name */
2982 "linux-htb", /* ovs_name */
2983 HTB_N_QUEUES, /* n_queues */
2992 htb_class_get_stats,
2993 htb_class_dump_stats
2996 /* "linux-hfsc" traffic control class. */
/* Largest class minor number that the HFSC code accepts as a queue: a class
 * 1:N with 1 <= N <= HFSC_N_QUEUES is reported as queue N - 1.  Also used as
 * the n_queues entry of tc_ops_hfsc. */
2998 #define HFSC_N_QUEUES 0xf000
/* Embedded queue node; hfsc_class_cast__() uses it with CONTAINER_OF() to
 * recover the enclosing struct hfsc_class. */
3006 struct tc_queue tc_queue;
/* Returns the struct hfsc that embeds the 'tc' currently attached to the
 * device underlying 'netdev'.
 *
 * NOTE(review): presumably only meaningful while that 'tc' was created by
 * hfsc_install__() -- confirm against callers. */
3011 static struct hfsc *
3012 hfsc_get__(const struct netdev *netdev)
3014 struct netdev_dev_linux *netdev_dev;
3015 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3016 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class whose embedded 'tc_queue' member is
 * 'queue'. */
3019 static struct hfsc_class *
3020 hfsc_class_cast__(const struct tc_queue *queue)
3022 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a struct hfsc, initializes its embedded 'tc' with tc_ops_hfsc,
 * records 'max_rate' in it, and attaches it to the device underlying
 * 'netdev'. */
3026 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
3028 struct netdev_dev_linux * netdev_dev;
3031 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3032 hfsc = xmalloc(sizeof *hfsc);
3033 tc_init(&hfsc->tc, &tc_ops_hfsc);
3034 hfsc->max_rate = max_rate;
3035 netdev_dev->tc = &hfsc->tc;
/* Records the rates in 'hc' as the cached state of queue 'queue_id'.
 *
 * The queue is looked up with tc_find_queue__().  The visible code both
 * reuses an existing entry and allocates a new hfsc_class that is inserted
 * into the HFSC's queue map (presumably when the lookup finds nothing).
 * Either way min_rate and max_rate are then copied from 'hc'. */
3039 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3040 const struct hfsc_class *hc)
3044 struct hfsc_class *hcp;
3045 struct tc_queue *queue;
3047 hfsc = hfsc_get__(netdev);
3048 hash = hash_int(queue_id, 0);
3050 queue = tc_find_queue__(netdev, queue_id, hash);
3052 hcp = hfsc_class_cast__(queue);
3054 hcp = xmalloc(sizeof *hcp);
3055 queue = &hcp->tc_queue;
3056 queue->queue_id = queue_id;
3057 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3060 hcp->min_rate = hc->min_rate;
3061 hcp->max_rate = hc->max_rate;
/* Extracts HFSC class parameters from the nested attributes 'nl_options'
 * into 'class'.
 *
 * Three service curves are read: TCA_HFSC_RSC, TCA_HFSC_FSC and
 * TCA_HFSC_USC.  Each condition below logs a rate-limited warning:
 *
 *   - the attributes fail to parse against the policy;
 *   - any curve has a nonzero m1 or d (a non-linear curve);
 *   - the RSC and FSC curves have different m2 values;
 *   - the RSC m2 exceeds the USC m2.
 *
 * Otherwise min_rate is taken from the FSC m2 and max_rate from the USC
 * m2. */
3065 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3067 const struct tc_service_curve *rsc, *fsc, *usc;
3068 static const struct nl_policy tca_hfsc_policy[] = {
3070 .type = NL_A_UNSPEC,
3072 .min_len = sizeof(struct tc_service_curve),
3075 .type = NL_A_UNSPEC,
3077 .min_len = sizeof(struct tc_service_curve),
3080 .type = NL_A_UNSPEC,
3082 .min_len = sizeof(struct tc_service_curve),
3085 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3087 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3088 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3089 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3093 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3094 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3095 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3097 if (rsc->m1 != 0 || rsc->d != 0 ||
3098 fsc->m1 != 0 || fsc->d != 0 ||
3099 usc->m1 != 0 || usc->d != 0) {
3100 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3101 "Non-linear service curves are not supported.");
3105 if (rsc->m2 != fsc->m2) {
3106 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3107 "Real-time service curves are not supported ");
3111 if (rsc->m2 > usc->m2) {
3112 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3113 "Min-rate service curve is greater than "
3114 "the max-rate service curve.");
3118 class->min_rate = fsc->m2;
3119 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class(), which also fills
 * in 'stats'.
 *
 * A handle of the form 1:N with 1 <= N <= HFSC_N_QUEUES is translated to
 * queue ID N - 1 and stored in '*queue_id'.  The class's HFSC options are
 * parsed into '*options' with hfsc_parse_tca_options__().
 *
 * NOTE(review): presumably each output is produced only when its pointer is
 * nonnull, as hfsc_query_class__() passes NULL for 'queue_id' -- confirm. */
3124 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3125 struct hfsc_class *options,
3126 struct netdev_queue_stats *stats)
3129 unsigned int handle;
3130 struct nlattr *nl_options;
3132 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3138 unsigned int major, minor;
3140 major = tc_get_major(handle);
3141 minor = tc_get_minor(handle);
3142 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3143 *queue_id = minor - 1;
3150 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' (whose parent is 'parent') on 'netdev'
 * with tc_query_class(), then parses the reply into 'options' and 'stats'
 * using hfsc_parse_tcmsg__().  No queue ID is requested.  The reply buffer is
 * deleted after parsing. */
3157 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3158 unsigned int parent, struct hfsc_class *options,
3159 struct netdev_queue_stats *stats)
3162 struct ofpbuf *reply;
3164 error = tc_query_class(netdev, handle, parent, &reply);
3169 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3170 ofpbuf_delete(reply);
3175 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3176 struct hfsc_class *class)
3179 const char *max_rate_s;
3181 max_rate_s = smap_get(details, "max-rate");
3182 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3185 enum netdev_features current;
3187 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3188 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3191 class->min_rate = max_rate;
3192 class->max_rate = max_rate;
/* Fills in 'class' from the per-queue settings in 'details'.
 *
 * "min-rate" and "max-rate" are parsed as decimal numbers and divided by 8.
 * min_rate is raised to at least 1 and then capped at the qdisc's max_rate.
 * max_rate is raised to at least min_rate and then capped at the qdisc's
 * max_rate. */
3196 hfsc_parse_class_details__(struct netdev *netdev,
3197 const struct smap *details,
3198 struct hfsc_class * class)
3200 const struct hfsc *hfsc;
3201 uint32_t min_rate, max_rate;
3202 const char *min_rate_s, *max_rate_s;
3204 hfsc = hfsc_get__(netdev);
3205 min_rate_s = smap_get(details, "min-rate");
3206 max_rate_s = smap_get(details, "max-rate");
3208 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3209 min_rate = MAX(min_rate, 1);
3210 min_rate = MIN(min_rate, hfsc->max_rate);
3212 max_rate = (max_rate_s
3213 ? strtoull(max_rate_s, NULL, 10) / 8
3215 max_rate = MAX(max_rate, min_rate);
3216 max_rate = MIN(max_rate, hfsc->max_rate);
3218 class->min_rate = min_rate;
3219 class->max_rate = max_rate;
3224 /* Create an HFSC qdisc.
3226 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Any existing qdisc is removed first with tc_del_qdisc().  The new qdisc is
 * requested with NLM_F_EXCL | NLM_F_CREATE as handle 1:0 under TC_H_ROOT.
 * Unlike the HTB variant, the options structure is sent directly as
 * TCA_OPTIONS rather than inside a nested attribute.  The value returned is
 * that of tc_transact(). */
3228 hfsc_setup_qdisc__(struct netdev * netdev)
3230 struct tcmsg *tcmsg;
3231 struct ofpbuf request;
3232 struct tc_hfsc_qopt opt;
3234 tc_del_qdisc(netdev);
3236 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3237 NLM_F_EXCL | NLM_F_CREATE, &request);
3243 tcmsg->tcm_handle = tc_make_handle(1, 0);
3244 tcmsg->tcm_parent = TC_H_ROOT;
3246 memset(&opt, 0, sizeof opt);
3249 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3250 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3252 return tc_transact(&request, NULL);
3255 /* Create an HFSC class.
3257 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3258 * sc rate <min_rate> ul rate <max_rate>" */
/* The request is an RTM_NEWTCLASS with NLM_F_CREATE.  The same 'min' curve
 * (m2 = class->min_rate) is sent as both TCA_HFSC_RSC and TCA_HFSC_FSC, and
 * the 'max' curve (m2 = class->max_rate) is sent as TCA_HFSC_USC.  A failed
 * transaction is logged with a rate-limited warning. */
3260 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3261 unsigned int parent, struct hfsc_class *class)
3265 struct tcmsg *tcmsg;
3266 struct ofpbuf request;
3267 struct tc_service_curve min, max;
3269 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3275 tcmsg->tcm_handle = handle;
3276 tcmsg->tcm_parent = parent;
3280 min.m2 = class->min_rate;
3284 max.m2 = class->max_rate;
3286 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3287 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3288 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3289 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3290 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3291 nl_msg_end_nested(&request, opt_offset);
3293 error = tc_transact(&request, NULL);
3295 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3296 "min-rate %ubps, max-rate %ubps (%s)",
3297 netdev_get_name(netdev),
3298 tc_get_major(handle), tc_get_minor(handle),
3299 tc_get_major(parent), tc_get_minor(parent),
3300 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev': creates the qdisc, creates class 1:fffe under
 * 1:0 from the qdisc-level settings in 'details', and records the resulting
 * max_rate with hfsc_install__().
 *
 * NOTE(review): presumably each step runs only when the previous one
 * succeeded -- confirm. */
3307 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3310 struct hfsc_class class;
3312 error = hfsc_setup_qdisc__(netdev);
3318 hfsc_parse_qdisc_details__(netdev, details, &class);
3319 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3320 tc_make_handle(1, 0), &class);
3326 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the in-memory HFSC state for 'netdev' from the kernel.
 *
 * Class 1:fffe is queried for the qdisc-level max_rate (the query's return
 * value is not checked) and an hfsc is installed with it.  The classes are
 * then dumped, and each message that hfsc_parse_tcmsg__() accepts is recorded
 * with hfsc_update_queue__().  'nlmsg' is unused. */
3331 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3334 struct nl_dump dump;
3335 struct hfsc_class hc;
3338 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3339 hfsc_install__(netdev, hc.max_rate);
3341 if (!start_queue_dump(netdev, &dump)) {
3345 while (nl_dump_next(&dump, &msg)) {
3346 unsigned int queue_id;
3348 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3349 hfsc_update_queue__(netdev, queue_id, &hc);
3353 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc' by removing every cached class
 * from its queue map.  The _SAFE iterator is used because entries are
 * removed during the walk.
 *
 * NOTE(review): presumably each class and the hfsc itself are also freed --
 * confirm. */
3358 hfsc_tc_destroy(struct tc *tc)
3361 struct hfsc_class *hc, *next;
3363 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3365 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3366 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds the qdisc-level "max-rate" to 'details'.  The stored value is
 * multiplied by 8 before it is formatted. */
3375 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3377 const struct hfsc *hfsc;
3378 hfsc = hfsc_get__(netdev);
3379 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* Applies new qdisc-level settings from 'details': re-creates class 1:fffe
 * under 1:0 with the parsed parameters and stores the new max_rate in the
 * cached hfsc.
 *
 * NOTE(review): presumably the cached value is updated only when the kernel
 * accepted the class -- confirm. */
3384 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3387 struct hfsc_class class;
3389 hfsc_parse_qdisc_details__(netdev, details, &class);
3390 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3391 tc_make_handle(1, 0), &class);
3394 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Describes the cached HFSC class for 'queue' in 'details'.  "min-rate" is
 * always added.  "max-rate" is added only when it differs from min-rate.
 * Both are multiplied by 8 before being formatted.  'netdev' is unused. */
3401 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3402 const struct tc_queue *queue, struct smap *details)
3404 const struct hfsc_class *hc;
3406 hc = hfsc_class_cast__(queue);
3407 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3408 if (hc->min_rate != hc->max_rate) {
3409 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* Configures queue 'queue_id' on 'netdev' from 'details': parses the
 * settings, creates or replaces class 1:(queue_id + 1) under 1:fffe, and
 * updates the cached queue.
 *
 * NOTE(review): presumably an error at either of the first two steps is
 * returned before the cache is touched -- confirm. */
3415 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3416 const struct smap *details)
3419 struct hfsc_class class;
3421 error = hfsc_parse_class_details__(netdev, details, &class);
3426 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3427 tc_make_handle(1, 0xfffe), &class);
3432 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes kernel class 1:(queue_id + 1) that backs 'queue' and removes the
 * queue from the HFSC's queue map.
 *
 * NOTE(review): presumably the map entry is removed (and freed) only when the
 * kernel deletion succeeded -- confirm. */
3437 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3441 struct hfsc_class *hc;
3443 hc = hfsc_class_cast__(queue);
3444 hfsc = hfsc_get__(netdev);
3446 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3448 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves statistics for 'queue' into 'stats' by querying class
 * 1:(queue_id + 1), child of 1:fffe.  Class options are not requested. */
3455 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3456 struct netdev_queue_stats *stats)
3458 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3459 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class statistics in 'nlmsg'.  If the class handle has the form
 * 1:N with 1 <= N <= HFSC_N_QUEUES, 'cb' is invoked with queue ID N - 1, the
 * statistics, and 'aux'.  'netdev' is unused. */
3463 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3464 const struct ofpbuf *nlmsg,
3465 netdev_dump_queue_stats_cb *cb, void *aux)
3467 struct netdev_queue_stats stats;
3468 unsigned int handle, major, minor;
3471 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3476 major = tc_get_major(handle);
3477 minor = tc_get_minor(handle);
3478 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3479 (*cb)(minor - 1, &stats, aux);
/* Operations table for HFSC: kernel qdisc name "hfsc", OVS QoS type
 * "linux-hfsc", up to HFSC_N_QUEUES queues, and every hook implemented by
 * the corresponding hfsc_*() function. */
3484 static const struct tc_ops tc_ops_hfsc = {
3485 "hfsc", /* linux_name */
3486 "linux-hfsc", /* ovs_name */
3487 HFSC_N_QUEUES, /* n_queues */
3488 hfsc_tc_install, /* tc_install */
3489 hfsc_tc_load, /* tc_load */
3490 hfsc_tc_destroy, /* tc_destroy */
3491 hfsc_qdisc_get, /* qdisc_get */
3492 hfsc_qdisc_set, /* qdisc_set */
3493 hfsc_class_get, /* class_get */
3494 hfsc_class_set, /* class_set */
3495 hfsc_class_delete, /* class_delete */
3496 hfsc_class_get_stats, /* class_get_stats */
3497 hfsc_class_dump_stats /* class_dump_stats */
3500 /* "linux-default" traffic control class.
3502 * This class represents the default, unnamed Linux qdisc. It corresponds to
3503 * the "" (empty string) QoS type in the OVS database. */
/* Attaches the default traffic-control object to the device underlying
 * 'netdev'.
 *
 * 'tc' has static storage duration, so one object initialized with
 * tc_ops_default is shared by every device that uses the default qdisc.
 *
 * NOTE(review): presumably the allocation is guarded so that it happens only
 * on the first call -- confirm. */
3506 default_install__(struct netdev *netdev)
3508 struct netdev_dev_linux *netdev_dev =
3509 netdev_dev_linux_cast(netdev_get_dev(netdev));
3510 static struct tc *tc;
3513 tc = xmalloc(sizeof *tc);
3514 tc_init(tc, &tc_ops_default);
3516 netdev_dev->tc = tc;
/* tc_install hook for the default qdisc: attaches the shared default 'tc' to
 * 'netdev'.  'details' is unused. */
3520 default_tc_install(struct netdev *netdev,
3521 const struct smap *details OVS_UNUSED)
3523 default_install__(netdev);
/* tc_load hook for the default qdisc: attaches the shared default 'tc' to
 * 'netdev'.  'nlmsg' is unused. */
3528 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3530 default_install__(netdev);
/* Operations table for the default qdisc.  It has no kernel qdisc name, and
 * the hooks from tc_destroy through class_dump_stats are all NULL. */
3534 static const struct tc_ops tc_ops_default = {
3535 NULL, /* linux_name */
3540 NULL, /* tc_destroy */
3541 NULL, /* qdisc_get */
3542 NULL, /* qdisc_set */
3543 NULL, /* class_get */
3544 NULL, /* class_set */
3545 NULL, /* class_delete */
3546 NULL, /* class_get_stats */
3547 NULL /* class_dump_stats */
3550 /* "linux-other" traffic control class.
/* tc_load hook for the "linux-other" QoS type: attaches a 'tc' initialized
 * with tc_ops_other to the device underlying 'netdev'.  'nlmsg' is unused.
 *
 * 'tc' has static storage duration, so one object is shared by every device
 * that uses this type.
 *
 * NOTE(review): presumably the allocation is guarded so that it happens only
 * on the first call -- confirm. */
3555 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3557 struct netdev_dev_linux *netdev_dev =
3558 netdev_dev_linux_cast(netdev_get_dev(netdev));
3559 static struct tc *tc;
3562 tc = xmalloc(sizeof *tc);
3563 tc_init(tc, &tc_ops_other);
3565 netdev_dev->tc = tc;
/* Operations table for the "linux-other" QoS type.  It has no kernel qdisc
 * name and no tc_install hook, and the hooks from tc_destroy through
 * class_dump_stats are all NULL. */
3569 static const struct tc_ops tc_ops_other = {
3570 NULL, /* linux_name */
3571 "linux-other", /* ovs_name */
3573 NULL, /* tc_install */
3575 NULL, /* tc_destroy */
3576 NULL, /* qdisc_get */
3577 NULL, /* qdisc_set */
3578 NULL, /* class_get */
3579 NULL, /* class_set */
3580 NULL, /* class_delete */
3581 NULL, /* class_get_stats */
3582 NULL /* class_dump_stats */
3585 /* Traffic control. */
3587 /* Number of kernel "tc" ticks per second. */
/* Computed as a * c / b from the first three fields of /proc/net/psched and
 * used by tc_ticks_to_bytes() and tc_bytes_to_ticks(). */
3588 static double ticks_per_s;
/* 'buffer_hz' is the divisor that tc_buffer_per_jiffy() applies to a rate. */
3590 /* Number of kernel "jiffies" per second. This is used for the purpose of
3591 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3592 * one jiffy's worth of data.
3594 * There are two possibilities here:
3596 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3597 * approximate range of 100 to 1024. That means that we really need to
3598 * make sure that the qdisc can buffer that much data.
3600 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3601 * has finely granular timers and there's no need to fudge additional room
3602 * for buffers. (There's no extra effort needed to implement that: the
3603 * large 'buffer_hz' is used as a divisor, so practically any number will
3604 * come out as 0 in the division. Small integer results in the case of
3605 * really high dividends won't have any real effect anyhow.)
3607 static unsigned int buffer_hz;
3609 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits before TC_H_MAKE() combines it
 * with 'minor'. */
3611 tc_make_handle(unsigned int major, unsigned int minor)
3613 return TC_H_MAKE(major << 16, minor);
3616 /* Returns the major number from 'handle'. */
/* The TC_H_MAJ() result is shifted right by 16 bits, which undoes the shift
 * that tc_make_handle() applies to the major number. */
3618 tc_get_major(unsigned int handle)
3620 return TC_H_MAJ(handle) >> 16;
3623 /* Returns the minor number from 'handle'. */
/* Thin wrapper around TC_H_MIN(). */
3625 tc_get_minor(unsigned int handle)
3627 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * 'request' is initialized with room for 512 bytes and receives a Netlink
 * header of 'type' with flags NLM_F_REQUEST | 'flags', followed by a
 * zero-filled struct tcmsg whose family is AF_UNSPEC and whose ifindex is
 * that of 'netdev'.  The tcm_handle and tcm_parent fields are left for the
 * caller to fill in.
 *
 * NOTE(review): the path taken when get_ifindex() fails is not visible in
 * this view; presumably NULL is returned without initializing 'request' --
 * confirm. */
3630 static struct tcmsg *
3631 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3632 struct ofpbuf *request)
3634 struct tcmsg *tcmsg;
3638 error = get_ifindex(netdev, &ifindex);
3643 ofpbuf_init(request, 512);
3644 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3645 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3646 tcmsg->tcm_family = AF_UNSPEC;
3647 tcmsg->tcm_ifindex = ifindex;
3648 /* Caller should fill in tcmsg->tcm_handle. */
3649 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on rtnl_sock with nl_sock_transact(), passing 'replyp'
 * through for the reply.  'request' is uninitialized afterward whether or
 * not the transaction succeeded, so callers must not reuse it. */
3655 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3657 int error = nl_sock_transact(rtnl_sock, request, replyp);
3658 ofpbuf_uninit(request);
/* Implementation notes: the request uses RTM_NEWQDISC with NLM_F_EXCL |
 * NLM_F_CREATE when adding and RTM_DELQDISC with no extra flags when
 * deleting.  The qdisc handle is ffff:0, its parent is TC_H_INGRESS, its kind
 * is "ingress", and TCA_OPTIONS is sent empty.  When deleting, ENOENT and
 * EINVAL from the kernel are singled out for lenient treatment. */
3662 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3663 * policing configuration.
3665 * This function is equivalent to running the following when 'add' is true:
3666 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3668 * This function is equivalent to running the following when 'add' is false:
3669 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3671 * The configuration and stats may be seen with the following command:
3672 * /sbin/tc -s qdisc show dev <devname>
3674 * Returns 0 if successful, otherwise a positive errno value.
3677 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3679 struct ofpbuf request;
3680 struct tcmsg *tcmsg;
3682 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3683 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3685 tcmsg = tc_make_request(netdev, type, flags, &request);
3689 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3690 tcmsg->tcm_parent = TC_H_INGRESS;
3691 nl_msg_put_string(&request, TCA_KIND, "ingress");
3692 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3694 error = tc_transact(&request, NULL);
3696 /* If we're deleting the qdisc, don't worry about some of the
3697 * error conditions. */
3698 if (!add && (error == ENOENT || error == EINVAL)) {
3707 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3710 * This function is equivalent to running:
3711 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3712 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3715 * The configuration and stats may be seen with the following command:
3716 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3718 * Returns 0 if successful, otherwise a positive errno value.
3721 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3723 struct tc_police tc_police;
3724 struct ofpbuf request;
3725 struct tcmsg *tcmsg;
3726 size_t basic_offset;
3727 size_t police_offset;
3731 memset(&tc_police, 0, sizeof tc_police);
3732 tc_police.action = TC_POLICE_SHOT;
3733 tc_police.mtu = mtu;
3734 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3735 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3736 kbits_burst * 1024);
3738 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3739 NLM_F_EXCL | NLM_F_CREATE, &request);
3743 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3744 tcmsg->tcm_info = tc_make_handle(49,
3745 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3747 nl_msg_put_string(&request, TCA_KIND, "basic");
3748 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3749 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3750 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3751 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3752 nl_msg_end_nested(&request, police_offset);
3753 nl_msg_end_nested(&request, basic_offset);
3755 error = tc_transact(&request, NULL);
/* Summary of the code that follows: four hexadecimal fields (a, b, c, d) are
 * read from /proc/net/psched and logged at debug level, and ticks_per_s is
 * derived as a * c / b.  A warning is logged when the file cannot be opened
 * or read, or when the parameters are judged invalid or unexpected. */
3766 /* The values in psched are not individually very meaningful, but they are
3767 * important. The tables below show some values seen in the wild.
3771 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3772 * (Before that, there are hints that it was 1000000000.)
3774 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3778 * -----------------------------------
3779 * [1] 000c8000 000f4240 000f4240 00000064
3780 * [2] 000003e8 00000400 000f4240 3b9aca00
3781 * [3] 000003e8 00000400 000f4240 3b9aca00
3782 * [4] 000003e8 00000400 000f4240 00000064
3783 * [5] 000003e8 00000040 000f4240 3b9aca00
3784 * [6] 000003e8 00000040 000f4240 000000f9
3786 * a b c d ticks_per_s buffer_hz
3787 * ------- --------- ---------- ------------- ----------- -------------
3788 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3789 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3790 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3791 * [4] 1,000 1,024 1,000,000 100 976,562 100
3792 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3793 * [6] 1,000 64 1,000,000 249 15,625,000 249
3795 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3796 * [2] 2.6.26-1-686-bigmem from Debian lenny
3797 * [3] 2.6.26-2-sparc64 from Debian lenny
3798 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3799 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3800 * [6] 2.6.34 from kernel.org on KVM
3802 static const char fn[] = "/proc/net/psched";
3803 unsigned int a, b, c, d;
3809 stream = fopen(fn, "r");
3811 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3815 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3816 VLOG_WARN("%s: read failed", fn);
3820 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3824 VLOG_WARN("%s: invalid scheduler parameters", fn);
3828 ticks_per_s = (double) a * c / b;
3832 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3835 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3838 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3839 * rate of 'rate' bytes per second. */
3841 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3846 return (rate * ticks) / ticks_per_s;
3849 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3850 * rate of 'rate' bytes per second. */
/* A 'rate' of 0 yields 0 instead of dividing by zero.  ticks_per_s is
 * truncated to an unsigned long long before it is multiplied by 'size', so
 * the product is computed in 64-bit integer arithmetic. */
3852 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3857 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3860 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3861 * a transmission rate of 'rate' bytes per second. */
/* The result is the integer quotient of 'rate' and buffer_hz, so a very large
 * buffer_hz yields 0. */
3863 tc_buffer_per_jiffy(unsigned int rate)
3868 return rate / buffer_hz;
3871 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3872 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3873 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3874 * stores NULL into it if it is absent.
3876 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3879 * Returns 0 if successful, otherwise a positive errno value. */
/* The attributes are parsed starting just past the Netlink header and the
 * struct tcmsg.  The policy makes TCA_KIND (a string) mandatory and
 * TCA_OPTIONS (nested) optional.  A parse failure is logged with a
 * rate-limited warning. */
3881 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3882 struct nlattr **options)
3884 static const struct nl_policy tca_policy[] = {
3885 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3886 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3888 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3890 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3891 tca_policy, ta, ARRAY_SIZE(ta))) {
3892 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3897 *kind = nl_attr_get_string(ta[TCA_KIND]);
3901 *options = ta[TCA_OPTIONS];
3916 /* Given Netlink 'msg' that describes a class, extracts the class handle from
3917 * the message's struct tcmsg header into '*handlep', its TCA_OPTIONS attribute
3918 * into '*options', and its queue statistics into '*stats'. Any of the output
3919 * arguments may be null.
3921 * Returns 0 if successful, otherwise a positive errno value. */
3923 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3924 struct nlattr **options, struct netdev_queue_stats *stats)
/* Both attributes are marked as required in this policy, so presumably a class
 * message that lacks either one fails the nl_policy_parse() call below. */
3926 static const struct nl_policy tca_policy[] = {
3927 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3928 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3930 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attribute parsing starts at the offset just past the Netlink header and the
 * struct tcmsg. */
3932 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3933 tca_policy, ta, ARRAY_SIZE(ta))) {
3934 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The struct tcmsg is read from the offset right after the Netlink header.
 * The whole 'tcm_handle' value is stored, not only a minor number. */
3939 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3940 *handlep = tc->tcm_handle;
3944 *options = ta[TCA_OPTIONS];
3948 const struct gnet_stats_queue *gsq;
3949 struct gnet_stats_basic gsb;
/* The nested TCA_STATS2 attribute must carry both a basic and a queue
 * statistics attribute, each at least as long as the matching struct. */
3951 static const struct nl_policy stats_policy[] = {
3952 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3953 .min_len = sizeof gsb },
3954 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3955 .min_len = sizeof *gsq },
3957 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3959 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3960 sa, ARRAY_SIZE(sa))) {
3961 VLOG_WARN_RL(&rl, "failed to parse class stats");
3965 /* Alignment issues screw up the length of struct gnet_stats_basic on
3966 * some arch/bitsize combinations. Newer versions of Linux have a
3967 * struct gnet_stats_basic_packed, but we can't depend on that. The
3968 * easiest thing to do is just to make a copy. */
/* The copy is zero-filled first and the copied length is capped at
 * sizeof gsb, so a longer attribute cannot overrun the local struct. */
3969 memset(&gsb, 0, sizeof gsb);
3970 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3971 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3972 stats->tx_bytes = gsb.bytes;
3973 stats->tx_packets = gsb.packets;
3975 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
/* The queue's drop counter is what gets reported as transmit errors. */
3976 stats->tx_errors = gsq->drops;
/* NOTE(review): the control flow that leads to this statement is not visible
 * here; presumably it clears '*stats' when parsing fails -- confirm. */
3986 memset(stats, 0, sizeof *stats);
/* Builds an RTM_GETTCLASS request with NLM_F_ECHO for 'netdev' through
 * tc_make_request(), stores 'handle' and 'parent' in the request's struct
 * tcmsg, and sends it with tc_transact(), which is given 'replyp' for the
 * reply.  The visible failure handling logs a rate-limited warning that names
 * the device and prints both 'handle' and 'parent' as major:minor pairs. */
3991 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3994 tc_query_class(const struct netdev *netdev,
3995 unsigned int handle, unsigned int parent,
3996 struct ofpbuf **replyp)
3998 struct ofpbuf request;
3999 struct tcmsg *tcmsg;
4002 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4006 tcmsg->tcm_handle = handle;
4007 tcmsg->tcm_parent = parent;
4009 error = tc_transact(&request, replyp);
4011 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4012 netdev_get_name(netdev),
4013 tc_get_major(handle), tc_get_minor(handle),
4014 tc_get_major(parent), tc_get_minor(parent),
4020 /* Equivalent to "tc class del dev <name> handle <handle>". */
4022 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4024 struct ofpbuf request;
4025 struct tcmsg *tcmsg;
/* The request is an RTM_DELTCLASS message built with no extra Netlink flags. */
4028 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
/* Only the class handle identifies the target; the parent is left as 0. */
4032 tcmsg->tcm_handle = handle;
4033 tcmsg->tcm_parent = 0;
/* No reply buffer is requested (NULL). */
4035 error = tc_transact(&request, NULL);
/* The warning is rate-limited and prints the handle as major:minor. */
4037 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4038 netdev_get_name(netdev),
4039 tc_get_major(handle), tc_get_minor(handle),
4045 /* Equivalent to "tc qdisc del dev <name> root". */
4047 tc_del_qdisc(struct netdev *netdev)
4049 struct netdev_dev_linux *netdev_dev =
4050 netdev_dev_linux_cast(netdev_get_dev(netdev));
4051 struct ofpbuf request;
4052 struct tcmsg *tcmsg;
4055 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
/* The request targets the qdisc with handle 1:0 attached at the root. */
4059 tcmsg->tcm_handle = tc_make_handle(1, 0);
4060 tcmsg->tcm_parent = TC_H_ROOT;
/* No reply buffer is requested (NULL). */
4062 error = tc_transact(&request, NULL);
/* NOTE(review): the statement inside this branch is not visible here;
 * presumably it resets 'error' so that EINVAL counts as success -- confirm. */
4063 if (error == EINVAL) {
4064 /* EINVAL probably means that the default qdisc was in use, in which
4065 * case we've accomplished our purpose. */
/* Once the deletion has succeeded, the cached traffic-control state is torn
 * down: 'tc_destroy' is optional in the ops table, so it is called only when
 * it is nonnull, and the cached pointer is then cleared. */
4068 if (!error && netdev_dev->tc) {
4069 if (netdev_dev->tc->ops->tc_destroy) {
4070 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
4072 netdev_dev->tc = NULL;
4077 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4078 * kernel to determine what they are. Returns 0 if successful, otherwise a
4079 * positive errno value. */
4081 tc_query_qdisc(const struct netdev *netdev)
4083 struct netdev_dev_linux *netdev_dev =
4084 netdev_dev_linux_cast(netdev_get_dev(netdev));
4085 struct ofpbuf request, *qdisc;
4086 const struct tc_ops *ops;
4087 struct tcmsg *tcmsg;
/* A nonnull 'tc' means the qdisc state is already cached.
 * NOTE(review): the body of this branch is not visible here; presumably it
 * returns early -- confirm. */
4091 if (netdev_dev->tc) {
4095 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4096 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4097 * 2.6.35 without that fix backported to it.
4099 * To avoid the OOPS, we must not make a request that would attempt to dump
4100 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4101 * few others. There are a few ways that I can see to do this, but most of
4102 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4103 * technique chosen here is to assume that any non-default qdisc that we
4104 * create will have a class with handle 1:0. The built-in qdiscs only have
4105 * a class with handle 0:0.
4107 * We could check for Linux 2.6.35+ and use a more straightforward method
4109 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
4113 tcmsg->tcm_handle = tc_make_handle(1, 0);
4114 tcmsg->tcm_parent = 0;
4116 /* Figure out what tc class to instantiate. */
4117 error = tc_transact(&request, &qdisc);
/* Only the qdisc's kind is needed at this point, so no options pointer is
 * requested (NULL). */
4121 error = tc_parse_qdisc(qdisc, &kind, NULL);
4123 ops = &tc_ops_other;
/* The kernel's qdisc name is mapped to one of this file's tc_ops tables. */
4125 ops = tc_lookup_linux_name(kind);
/* An unrecognized kind is logged at INFO level through its own rate limiter
 * ('rl2') and falls back to tc_ops_other. */
4127 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4128 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4130 ops = &tc_ops_other;
4133 } else if (error == ENOENT) {
4134 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4135 * other entity that doesn't have a handle 1:0. We will assume
4136 * that it's the system default qdisc. */
4137 ops = &tc_ops_default;
4140 /* Who knows? Maybe the device got deleted. */
4141 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4142 netdev_get_name(netdev), strerror(error));
4143 ops = &tc_ops_other;
4146 /* Instantiate it. */
/* 'netdev' is const in this function, so its constness is cast away for the
 * tc_load() call. */
4147 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev), qdisc);
/* Invariant: tc_load() must leave 'netdev_dev->tc' nonnull exactly when it
 * reports success. */
4148 assert((load_error == 0) == (netdev_dev->tc != NULL));
4149 ofpbuf_delete(qdisc);
/* An error from the query above takes precedence over an error from loading. */
4151 return error ? error : load_error;
4154 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4155 approximate the time to transmit packets of various lengths. For an MTU of
4156 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4157 represents two possible packet lengths; for a MTU of 513 through 1024, four
4158 possible lengths; and so on.
4160 Returns, for the specified 'mtu', the number of bits that packet lengths
4161 need to be shifted right to fit within such a 256-entry table. */
4163 tc_calc_cell_log(unsigned int mtu)
/* NOTE(review): the condition guarding this default is not visible here;
 * presumably it applies when no usable 'mtu' was supplied -- confirm. */
4168 mtu = ETH_PAYLOAD_MAX;
/* The Ethernet and VLAN header lengths are added on top of 'mtu'. */
4170 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* 'cell_log' counts the iterations made while 'mtu' is still at least 256.
 * NOTE(review): the loop body is not visible here; presumably it halves 'mtu'
 * on each iteration -- confirm. */
4172 for (cell_log = 0; mtu >= 256; cell_log++) {
/* Zeroes '*rate', then sets its 'cell_log' from tc_calc_cell_log(mtu) and its
 * 'mpu' to ETH_TOTAL_MIN.  The 'overhead' and 'cell_align' assignments are
 * deliberately left commented out; the memset() already leaves those bytes
 * zero.
 * NOTE(review): no store of 'Bps' into '*rate' is visible in these lines --
 * confirm where the rate itself is filled in. */
4179 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4182 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4184 memset(rate, 0, sizeof *rate);
4185 rate->cell_log = tc_calc_cell_log(mtu);
4186 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4187 /* rate->cell_align = 0; */ /* distro headers. */
4188 rate->mpu = ETH_TOTAL_MIN;
4192 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4193 * attribute of the specified "type".
4195 * See tc_calc_cell_log() above for a description of "rtab"s. */
4197 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* The attribute payload is reserved without being initialized, so the loop
 * below has to write every slot. */
4202 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
/* The slot count is the payload size divided by the size of one entry. */
4203 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Slot 'i' is computed for a packet size of (i + 1) << cell_log, raised to
 * 'rate->mpu' when it would otherwise be smaller. */
4204 unsigned packet_size = (i + 1) << rate->cell_log;
4205 if (packet_size < rate->mpu) {
4206 packet_size = rate->mpu;
/* Each entry holds the number of ticks needed to send that many bytes. */
4208 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* The minimum burst is tc_buffer_per_jiffy(Bps) plus 'mtu'.  The result is
 * the tick count, from tc_bytes_to_ticks(), for the larger of 'burst_bytes'
 * and that minimum.
 * NOTE(review): 'burst_bytes' is uint64_t, while tc_bytes_to_ticks() in this
 * file takes 'unsigned int size', so a burst above UINT_MAX would be truncated
 * at the call -- confirm that callers bound 'burst_bytes'. */
4212 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4213 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4214 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4217 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4219 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4220 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4223 /* Linux-only functions declared in netdev-linux.h */
4225 /* Returns a fd for an AF_INET socket or a negative errno value. */
4227 netdev_linux_get_af_inet_sock(void)
/* netdev_linux_init() is run first.  Its error, if any, is negated for the
 * return value; otherwise the file-level 'af_inet_sock' is returned. */
4229 int error = netdev_linux_init();
4230 return error ? -error : af_inet_sock;
4233 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4234 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4236 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4237 const char *flag_name, bool enable)
4239 const char *netdev_name = netdev_get_name(netdev);
4240 struct ethtool_value evalue;
/* Step 1: read the current flags with ETHTOOL_GFLAGS.  The ethtool_value is
 * cast because netdev_linux_do_ethtool() takes a struct ethtool_cmd pointer. */
4244 COVERAGE_INC(netdev_get_ethtool);
4245 memset(&evalue, 0, sizeof evalue);
4246 error = netdev_linux_do_ethtool(netdev_name,
4247 (struct ethtool_cmd *)&evalue,
4248 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: write the flags back with only the 'flag' bits changed, and keep
 * the requested value in 'new_flags' for the check below. */
4253 COVERAGE_INC(netdev_set_ethtool);
4254 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4255 error = netdev_linux_do_ethtool(netdev_name,
4256 (struct ethtool_cmd *)&evalue,
4257 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags again so that the outcome can be verified. */
4262 COVERAGE_INC(netdev_get_ethtool);
4263 memset(&evalue, 0, sizeof evalue);
4264 error = netdev_linux_do_ethtool(netdev_name,
4265 (struct ethtool_cmd *)&evalue,
4266 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* A rate-limited warning is logged when the value read back differs from the
 * value that was requested. */
4271 if (new_flags != evalue.data) {
4272 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4273 "device %s failed", enable ? "enable" : "disable",
4274 flag_name, netdev_name);
4281 /* Utility functions. */
4283 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* The copy is done one member at a time, with same-named members on both
 * sides, so any conversion is the implicit one performed by each assignment.
 * Only the members listed here are written; anything else in '*dst' is left
 * untouched by the visible code. */
4285 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4286 const struct rtnl_link_stats *src)
4288 dst->rx_packets = src->rx_packets;
4289 dst->tx_packets = src->tx_packets;
4290 dst->rx_bytes = src->rx_bytes;
4291 dst->tx_bytes = src->tx_bytes;
4292 dst->rx_errors = src->rx_errors;
4293 dst->tx_errors = src->tx_errors;
4294 dst->rx_dropped = src->rx_dropped;
4295 dst->tx_dropped = src->tx_dropped;
4296 dst->multicast = src->multicast;
4297 dst->collisions = src->collisions;
4298 dst->rx_length_errors = src->rx_length_errors;
4299 dst->rx_over_errors = src->rx_over_errors;
4300 dst->rx_crc_errors = src->rx_crc_errors;
4301 dst->rx_frame_errors = src->rx_frame_errors;
4302 dst->rx_fifo_errors = src->rx_fifo_errors;
4303 dst->rx_missed_errors = src->rx_missed_errors;
4304 dst->tx_aborted_errors = src->tx_aborted_errors;
4305 dst->tx_carrier_errors = src->tx_carrier_errors;
4306 dst->tx_fifo_errors = src->tx_fifo_errors;
4307 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4308 dst->tx_window_errors = src->tx_window_errors;
/* Fetches the link statistics of the interface numbered 'ifindex' with an
 * RTM_GETLINK request on 'rtnl_sock' and converts the IFLA_STATS attribute of
 * the reply into '*stats'. */
4312 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4314 /* Policy for RTNLGRP_LINK messages.
4316 * There are *many* more fields in these messages, but currently we only
4317 * care about these fields. */
/* IFLA_IFNAME is required.  IFLA_STATS is optional, but when present it must
 * be at least as long as struct rtnl_link_stats. */
4318 static const struct nl_policy rtnlgrp_link_policy[] = {
4319 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4320 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4321 .min_len = sizeof(struct rtnl_link_stats) },
4324 struct ofpbuf request;
4325 struct ofpbuf *reply;
4326 struct ifinfomsg *ifi;
4327 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* The request is a Netlink header followed by a zeroed struct ifinfomsg in
 * which only the family and the interface index are filled in. */
4330 ofpbuf_init(&request, 0);
4331 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4332 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4333 ifi->ifi_family = PF_UNSPEC;
4334 ifi->ifi_index = ifindex;
/* The request buffer is released right after the transaction. */
4335 error = nl_sock_transact(rtnl_sock, &request, &reply);
4336 ofpbuf_uninit(&request);
/* Attribute parsing starts just past the Netlink header and the struct
 * ifinfomsg.  The reply is deleted on each of the paths visible below. */
4341 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4342 rtnlgrp_link_policy,
4343 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4344 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence is checked here. */
4348 if (!attrs[IFLA_STATS]) {
4349 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4350 ofpbuf_delete(reply);
4354 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4356 ofpbuf_delete(reply);
/* Looks for 'netdev_name' in /proc/net/dev, reading the file one line at a
 * time with fgets().  Each line is expected to produce 15 conversions: the
 * format uses fourteen 64-bit counters, and the remaining conversion is
 * presumably the device name that is compared below.  The two "%*u"
 * conversions each skip one column without storing it. */
4362 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4364 static const char fn[] = "/proc/net/dev";
4369 stream = fopen(fn, "r");
4371 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4376 while (fgets(line, sizeof line, stream)) {
/* X64 is the scanf conversion for one uint64_t counter. */
4379 #define X64 "%"SCNu64
4382 X64 X64 X64 X64 X64 X64 X64 "%*u"
4383 X64 X64 X64 X64 X64 X64 X64 "%*u",
4389 &stats->rx_fifo_errors,
4390 &stats->rx_frame_errors,
4396 &stats->tx_fifo_errors,
4398 &stats->tx_carrier_errors) != 15) {
4399 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4400 } else if (!strcmp(devname, netdev_name)) {
/* These counters are not among the fields scanned above, so they are set to
 * UINT64_MAX; presumably that value marks them as unavailable -- confirm. */
4401 stats->rx_length_errors = UINT64_MAX;
4402 stats->rx_over_errors = UINT64_MAX;
4403 stats->rx_crc_errors = UINT64_MAX;
4404 stats->rx_missed_errors = UINT64_MAX;
4405 stats->tx_aborted_errors = UINT64_MAX;
4406 stats->tx_heartbeat_errors = UINT64_MAX;
4407 stats->tx_window_errors = UINT64_MAX;
/* This warning covers the case where no line matched 'netdev_name'. */
4413 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of the device named 'dev->name' with a
 * SIOCGIFFLAGS request issued through netdev_linux_do_ioctl(), and stores
 * 'ifr.ifr_flags' into '*flags'.
 * NOTE(review): the check of 'error' around the store is not visible here;
 * presumably '*flags' is written only on success -- confirm. */
4419 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4425 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4428 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with a SIOCSIFFLAGS request
 * issued through netdev_linux_do_ioctl(), and returns that call's result. */
4434 set_flags(struct netdev *netdev, unsigned int flags)
4438 ifr.ifr_flags = flags;
4439 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with a
 * SIOCGIFINDEX ioctl on 'af_inet_sock'.  On success the result is
 * 'ifr.ifr_ifindex'.  A failure is logged with a rate-limited warning.
 * NOTE(review): the value returned on failure is not visible here; the caller
 * get_ifindex() negates it to obtain an error code, so presumably it is a
 * negative errno value -- confirm. */
4444 do_get_ifindex(const char *netdev_name)
4448 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4449 COVERAGE_INC(netdev_get_ifindex);
4450 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4451 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4452 netdev_name, strerror(errno));
4455 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' into '*ifindexp' and returns the
 * cached lookup error.  The lookup itself runs only while the VALID_IFINDEX
 * bit is clear in 'cache_valid'; both its result and its error are cached, so
 * later calls return the same outcome without querying again. */
4459 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4461 struct netdev_dev_linux *netdev_dev =
4462 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4464 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4465 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Failure case: the negated result is kept as the error and the cached index
 * is set to 0.  (The test that selects this case is not visible here.) */
4468 netdev_dev->get_ifindex_error = -ifindex;
4469 netdev_dev->ifindex = 0;
/* Success case: no error, and the index is cached. */
4471 netdev_dev->get_ifindex_error = 0;
4472 netdev_dev->ifindex = ifindex;
/* The cache is marked valid after either outcome. */
4474 netdev_dev->cache_valid |= VALID_IFINDEX;
4477 *ifindexp = netdev_dev->ifindex;
4478 return netdev_dev->get_ifindex_error;
/* Reads the hardware address of the device named 'netdev_name' with a
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies ETH_ADDR_LEN bytes of it
 * into 'ea'. */
4482 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4487 memset(&ifr, 0, sizeof ifr);
4488 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4489 COVERAGE_INC(netdev_get_hwaddr);
4490 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4491 /* ENODEV probably means that a vif disappeared asynchronously and
4492 * hasn't been removed from the database yet, so reduce the log level
4493 * to INFO for that case. */
4494 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4495 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4496 netdev_name, strerror(errno));
/* Any address family other than AF_UNSPEC or ARPHRD_ETHER triggers a warning.
 * As far as the visible lines show, the address bytes are still copied
 * afterwards. */
4499 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4500 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4501 VLOG_WARN("%s device has unknown hardware address family %d",
4502 netdev_name, hwaddr_family);
4504 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to 'mac' with a
 * SIOCSIFHWADDR ioctl on 'af_inet_sock'.  The request is zeroed first, the
 * address family is set to ARPHRD_ETHER, and ETH_ADDR_LEN bytes of 'mac' are
 * copied in.  A failure is logged at error level without rate limiting. */
4509 set_etheraddr(const char *netdev_name,
4510 const uint8_t mac[ETH_ADDR_LEN])
4514 memset(&ifr, 0, sizeof ifr);
4515 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4516 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4517 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4518 COVERAGE_INC(netdev_set_hwaddr);
4519 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4520 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4521 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for the device called 'name',
 * with 'ecmd' attached as the request's data pointer.  'cmd_name' is used only
 * in the log message.
 * NOTE(review): no use of 'cmd' is visible in these lines; presumably it is
 * stored into '*ecmd' before the ioctl -- confirm. */
4528 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4529 int cmd, const char *cmd_name)
4533 memset(&ifr, 0, sizeof ifr);
4534 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4535 ifr.ifr_data = (caddr_t) ecmd;
4538 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
/* Every failure except EOPNOTSUPP is logged, with rate limiting. */
4541 if (errno != EOPNOTSUPP) {
4542 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4543 "failed: %s", cmd_name, name, strerror(errno));
4545 /* The device doesn't support this operation. That's pretty
4546 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' on 'af_inet_sock'
 * with 'ifr' as its argument.  A failure (ioctl() returning -1) is logged at
 * debug level with rate limiting; 'cmd_name' appears in that message. */
4553 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4554 const char *cmd_name)
4556 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4557 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4558 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs the address ioctl 'cmd' for 'netdev' through netdev_linux_do_ioctl(),
 * with the request's address family preset to AF_INET, and copies the IPv4
 * address out of the returned sockaddr into '*ip'.
 * NOTE(review): the check of 'error' around the copy is not visible here;
 * presumably '*ip' is written only on success -- confirm. */
4566 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4567 int cmd, const char *cmd_name)
4572 ifr.ifr_addr.sa_family = AF_INET;
4573 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* The generic sockaddr in the request is reinterpreted as a sockaddr_in. */
4575 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4576 *ip = sin->sin_addr;
4581 /* Returns an AF_PACKET raw socket or a negative errno value. */
4583 af_packet_sock(void)
/* INT_MIN is the "not created yet" sentinel of the function-static 'sock', so
 * socket() is attempted only while 'sock' still holds that value. */
4585 static int sock = INT_MIN;
4587 if (sock == INT_MIN) {
4588 sock = socket(AF_PACKET, SOCK_RAW, 0);
/* NOTE(review): the conditions selecting the two statements below are not
 * visible here; presumably the socket is made nonblocking on success and the
 * error is logged on failure -- confirm. */
4590 set_nonblocking(sock);
4593 VLOG_ERR("failed to create packet socket: %s", strerror(errno));