2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
67 #include "socket-util.h"
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75 COVERAGE_DEFINE(netdev_set_policing);
76 COVERAGE_DEFINE(netdev_arp_lookup);
77 COVERAGE_DEFINE(netdev_get_ifindex);
78 COVERAGE_DEFINE(netdev_get_hwaddr);
79 COVERAGE_DEFINE(netdev_set_hwaddr);
80 COVERAGE_DEFINE(netdev_ethtool);
82 /* These were introduced in Linux 2.6.14, so they might be missing if we have
84 #ifndef ADVERTISED_Pause
85 #define ADVERTISED_Pause (1 << 13)
87 #ifndef ADVERTISED_Asym_Pause
88 #define ADVERTISED_Asym_Pause (1 << 14)
91 /* These were introduced in Linux 2.6.24, so they might be missing if we
92 * have old headers. */
93 #ifndef ETHTOOL_GFLAGS
94 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
96 #ifndef ETHTOOL_SFLAGS
97 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
100 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
103 #define TC_RTAB_SIZE 1024
106 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
107 static int cache_notifier_refcount;
110 VALID_IFINDEX = 1 << 0,
111 VALID_ETHERADDR = 1 << 1,
115 VALID_POLICING = 1 << 5,
116 VALID_HAVE_VPORT_STATS = 1 << 6
124 /* Traffic control. */
126 /* An instance of a traffic control class. Always associated with a particular
129 * Each TC implementation subclasses this with whatever additional data it
132 const struct tc_ops *ops;
133 struct hmap queues; /* Contains "struct tc_queue"s.
134 * Read by generic TC layer.
135 * Written only by TC implementation. */
138 /* One traffic control queue.
140 * Each TC implementation subclasses this with whatever additional data it
143 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
144 unsigned int queue_id; /* OpenFlow queue ID. */
147 /* A particular kind of traffic control. Each implementation generally maps to
148 * one particular Linux qdisc class.
150 * The functions below return 0 if successful or a positive errno value on
151 * failure, except where otherwise noted. All of them must be provided, except
152 * where otherwise noted. */
154 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
155 * This is null for tc_ops_default and tc_ops_other, for which there are no
156 * appropriate values. */
157 const char *linux_name;
159 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
160 const char *ovs_name;
162 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
163 * queues. The queues are numbered 0 through n_queues - 1. */
164 unsigned int n_queues;
166 /* Called to install this TC class on 'netdev'. The implementation should
167 * make the Netlink calls required to set up 'netdev' with the right qdisc
168 * and configure it according to 'details'. The implementation may assume
169 * that the current qdisc is the default; that is, there is no need for it
170 * to delete the current qdisc before installing itself.
172 * The contents of 'details' should be documented as valid for 'ovs_name'
173 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
174 * (which is built as ovs-vswitchd.conf.db(8)).
176 * This function must return 0 if and only if it sets 'netdev->tc' to an
177 * initialized 'struct tc'.
179 * (This function is null for tc_ops_other, which cannot be installed. For
180 * other TC classes it should always be nonnull.) */
181 int (*tc_install)(struct netdev *netdev, const struct shash *details);
183 /* Called when the netdev code determines (through a Netlink query) that
184 * this TC class's qdisc is installed on 'netdev', but we didn't install
185 * it ourselves and so don't know any of the details.
187 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
188 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
189 * implementation should parse the other attributes of 'nlmsg' as
190 * necessary to determine its configuration. If necessary it should also
191 * use Netlink queries to determine the configuration of queues on
194 * This function must return 0 if and only if it sets 'netdev->tc' to an
195 * initialized 'struct tc'. */
196 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
198 /* Destroys the data structures allocated by the implementation as part of
199 * 'tc'. (This includes destroying 'tc->queues' by calling
202 * The implementation should not need to perform any Netlink calls. If
203 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
204 * (But it may not be desirable.)
206 * This function may be null if 'tc' is trivial. */
207 void (*tc_destroy)(struct tc *tc);
209 /* Retrieves details of 'netdev->tc' configuration into 'details'.
211 * The implementation should not need to perform any Netlink calls, because
212 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
213 * cached the configuration.
215 * The contents of 'details' should be documented as valid for 'ovs_name'
216 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
217 * (which is built as ovs-vswitchd.conf.db(8)).
219 * This function may be null if 'tc' is not configurable.
221 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
223 /* Reconfigures 'netdev->tc' according to 'details', performing any
224 * required Netlink calls to complete the reconfiguration.
226 * The contents of 'details' should be documented as valid for 'ovs_name'
227 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
228 * (which is built as ovs-vswitchd.conf.db(8)).
230 * This function may be null if 'tc' is not configurable.
232 int (*qdisc_set)(struct netdev *, const struct shash *details);
234 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
235 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
237 * The contents of 'details' should be documented as valid for 'ovs_name'
238 * in the "other_config" column in the "Queue" table in
239 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
241 * The implementation should not need to perform any Netlink calls, because
242 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
243 * cached the queue configuration.
245 * This function may be null if 'tc' does not have queues ('n_queues' is
247 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
248 struct shash *details);
250 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
     * 'details', performing any required Netlink calls to complete the
252 * reconfiguration. The caller ensures that 'queue_id' is less than
255 * The contents of 'details' should be documented as valid for 'ovs_name'
256 * in the "other_config" column in the "Queue" table in
257 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
259 * This function may be null if 'tc' does not have queues or its queues are
260 * not configurable. */
261 int (*class_set)(struct netdev *, unsigned int queue_id,
262 const struct shash *details);
264 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
265 * tc_queue's within 'netdev->tc->queues'.
267 * This function may be null if 'tc' does not have queues or its queues
268 * cannot be deleted. */
269 int (*class_delete)(struct netdev *, struct tc_queue *queue);
271 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
272 * 'struct tc_queue's within 'netdev->tc->queues'.
274 * On success, initializes '*stats'.
276 * This function may be null if 'tc' does not have queues or if it cannot
277 * report queue statistics. */
278 int (*class_get_stats)(const struct netdev *netdev,
279 const struct tc_queue *queue,
280 struct netdev_queue_stats *stats);
282 /* Extracts queue stats from 'nlmsg', which is a response to a
283 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
285 * This function may be null if 'tc' does not have queues or if it cannot
286 * report queue statistics. */
287 int (*class_dump_stats)(const struct netdev *netdev,
288 const struct ofpbuf *nlmsg,
289 netdev_dump_queue_stats_cb *cb, void *aux);
293 tc_init(struct tc *tc, const struct tc_ops *ops)
296 hmap_init(&tc->queues);
300 tc_destroy(struct tc *tc)
302 hmap_destroy(&tc->queues);
305 static const struct tc_ops tc_ops_htb;
306 static const struct tc_ops tc_ops_hfsc;
307 static const struct tc_ops tc_ops_default;
308 static const struct tc_ops tc_ops_other;
310 static const struct tc_ops *tcs[] = {
311 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
312 &tc_ops_hfsc, /* Hierarchical fair service curve. */
313 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
314 &tc_ops_other, /* Some other qdisc. */
318 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
319 static unsigned int tc_get_major(unsigned int handle);
320 static unsigned int tc_get_minor(unsigned int handle);
322 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
323 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
324 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
326 static struct tcmsg *tc_make_request(const struct netdev *, int type,
327 unsigned int flags, struct ofpbuf *);
328 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
330 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
331 struct nlattr **options);
332 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
333 struct nlattr **options,
334 struct netdev_queue_stats *);
335 static int tc_query_class(const struct netdev *,
336 unsigned int handle, unsigned int parent,
337 struct ofpbuf **replyp);
338 static int tc_delete_class(const struct netdev *, unsigned int handle);
340 static int tc_del_qdisc(struct netdev *netdev);
341 static int tc_query_qdisc(const struct netdev *netdev);
343 static int tc_calc_cell_log(unsigned int mtu);
344 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
345 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
346 const struct tc_ratespec *rate);
347 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
349 struct netdev_dev_linux {
350 struct netdev_dev netdev_dev;
352 struct shash_node *shash_node;
353 unsigned int cache_valid;
354 unsigned int change_seq;
356 bool miimon; /* Link status of last poll. */
357 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
358 struct timer miimon_timer;
360 /* The following are figured out "on demand" only. They are only valid
361 * when the corresponding VALID_* bit in 'cache_valid' is set. */
363 uint8_t etheraddr[ETH_ADDR_LEN];
364 struct in_addr address, netmask;
368 long long int carrier_resets;
369 uint32_t kbits_rate; /* Policing data. */
370 uint32_t kbits_burst;
371 bool have_vport_stats;
375 struct tap_state tap;
379 struct netdev_linux {
380 struct netdev netdev;
384 /* Sockets used for ioctl operations. */
385 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
387 /* A Netlink routing socket that is not subscribed to any multicast groups. */
388 static struct nl_sock *rtnl_sock;
390 /* This is set pretty low because we probably won't learn anything from the
391 * additional log messages. */
392 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
394 static int netdev_linux_init(void);
396 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
397 int cmd, const char *cmd_name);
398 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
399 const char *cmd_name);
400 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
401 int cmd, const char *cmd_name);
402 static int get_flags(const struct netdev *, int *flagsp);
403 static int set_flags(struct netdev *, int flags);
404 static int do_get_ifindex(const char *netdev_name);
405 static int get_ifindex(const struct netdev *, int *ifindexp);
406 static int do_set_addr(struct netdev *netdev,
407 int ioctl_nr, const char *ioctl_name,
408 struct in_addr addr);
409 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
410 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
411 const uint8_t[ETH_ADDR_LEN]);
412 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
413 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
414 static int get_carrier_via_sysfs(const char *name, bool *carrier);
415 static int af_packet_sock(void);
416 static void netdev_linux_miimon_run(void);
417 static void netdev_linux_miimon_wait(void);
420 is_netdev_linux_class(const struct netdev_class *netdev_class)
422 return netdev_class->init == netdev_linux_init;
425 static struct netdev_dev_linux *
426 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
428 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
429 assert(is_netdev_linux_class(netdev_class));
431 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
434 static struct netdev_linux *
435 netdev_linux_cast(const struct netdev *netdev)
437 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
438 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
439 assert(is_netdev_linux_class(netdev_class));
441 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
445 netdev_linux_init(void)
447 static int status = -1;
449 /* Create AF_INET socket. */
450 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
451 status = af_inet_sock >= 0 ? 0 : errno;
453 VLOG_ERR("failed to create inet socket: %s", strerror(status));
456 /* Create rtnetlink socket. */
458 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
460 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
469 netdev_linux_run(void)
471 rtnetlink_link_run();
472 netdev_linux_miimon_run();
476 netdev_linux_wait(void)
478 rtnetlink_link_wait();
479 netdev_linux_miimon_wait();
483 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
486 if (!dev->change_seq) {
489 dev->cache_valid = 0;
493 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
494 void *aux OVS_UNUSED)
496 struct netdev_dev_linux *dev;
498 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
500 const struct netdev_class *netdev_class =
501 netdev_dev_get_class(base_dev);
503 if (is_netdev_linux_class(netdev_class)) {
504 dev = netdev_dev_linux_cast(base_dev);
506 if (dev->carrier != change->running) {
507 dev->carrier = change->running;
508 dev->carrier_resets++;
511 netdev_dev_linux_changed(dev);
515 struct shash device_shash;
516 struct shash_node *node;
518 shash_init(&device_shash);
519 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
520 SHASH_FOR_EACH (node, &device_shash) {
525 get_carrier_via_sysfs(node->name, &carrier);
526 if (dev->carrier != carrier) {
527 dev->carrier = carrier;
528 dev->carrier_resets++;
531 netdev_dev_linux_changed(dev);
533 shash_destroy(&device_shash);
537 /* Creates system and internal devices. */
539 netdev_linux_create(const struct netdev_class *class, const char *name,
540 struct netdev_dev **netdev_devp)
542 struct netdev_dev_linux *netdev_dev;
544 if (!cache_notifier_refcount) {
545 assert(!netdev_linux_cache_notifier);
547 netdev_linux_cache_notifier =
548 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
550 if (!netdev_linux_cache_notifier) {
554 cache_notifier_refcount++;
556 netdev_dev = xzalloc(sizeof *netdev_dev);
557 netdev_dev->change_seq = 1;
558 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
559 get_carrier_via_sysfs(name, &netdev_dev->carrier);
561 *netdev_devp = &netdev_dev->netdev_dev;
565 /* For most types of netdevs we open the device for each call of
566 * netdev_open(). However, this is not the case with tap devices,
567 * since it is only possible to open the device once. In this
568 * situation we share a single file descriptor, and consequently
569 * buffers, across all readers. Therefore once data is read it will
570 * be unavailable to other reads for tap devices. */
572 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
573 const char *name, struct netdev_dev **netdev_devp)
575 struct netdev_dev_linux *netdev_dev;
576 struct tap_state *state;
577 static const char tap_dev[] = "/dev/net/tun";
581 netdev_dev = xzalloc(sizeof *netdev_dev);
582 state = &netdev_dev->state.tap;
584 /* Open tap device. */
585 state->fd = open(tap_dev, O_RDWR);
588 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
592 /* Create tap device. */
593 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
594 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
595 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
596 VLOG_WARN("%s: creating tap device failed: %s", name,
602 /* Make non-blocking. */
603 error = set_nonblocking(state->fd);
608 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
609 *netdev_devp = &netdev_dev->netdev_dev;
618 destroy_tap(struct netdev_dev_linux *netdev_dev)
620 struct tap_state *state = &netdev_dev->state.tap;
622 if (state->fd >= 0) {
627 /* Destroys the netdev device 'netdev_dev_'. */
629 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
631 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
632 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
634 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
635 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
638 if (class == &netdev_linux_class || class == &netdev_internal_class) {
639 cache_notifier_refcount--;
641 if (!cache_notifier_refcount) {
642 assert(netdev_linux_cache_notifier);
643 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
644 netdev_linux_cache_notifier = NULL;
646 } else if (class == &netdev_tap_class) {
647 destroy_tap(netdev_dev);
656 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
658 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
659 struct netdev_linux *netdev;
660 enum netdev_flags flags;
663 /* Allocate network device. */
664 netdev = xzalloc(sizeof *netdev);
666 netdev_init(&netdev->netdev, netdev_dev_);
668 /* Verify that the device really exists, by attempting to read its flags.
669 * (The flags might be cached, in which case this won't actually do an
672 * Don't do this for "internal" netdevs, though, because those have to be
673 * created as netdev objects before they exist in the kernel, because
674 * creating them in the kernel happens by passing a netdev object to
675 * dpif_port_add(). */
676 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
677 error = netdev_get_flags(&netdev->netdev, &flags);
678 if (error == ENODEV) {
683 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
684 !netdev_dev->state.tap.opened) {
686 /* We assume that the first user of the tap device is the primary user
687 * and give them the tap FD. Subsequent users probably just expect
688 * this to be a system device so open it normally to avoid send/receive
689 * directions appearing to be reversed. */
690 netdev->fd = netdev_dev->state.tap.fd;
691 netdev_dev->state.tap.opened = true;
694 *netdevp = &netdev->netdev;
698 netdev_uninit(&netdev->netdev, true);
702 /* Closes and destroys 'netdev'. */
704 netdev_linux_close(struct netdev *netdev_)
706 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
708 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
715 netdev_linux_listen(struct netdev *netdev_)
717 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
718 struct sockaddr_ll sll;
723 if (netdev->fd >= 0) {
727 /* Create file descriptor. */
728 fd = socket(PF_PACKET, SOCK_RAW, 0);
731 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
735 /* Set non-blocking mode. */
736 error = set_nonblocking(fd);
741 /* Get ethernet device index. */
742 error = get_ifindex(&netdev->netdev, &ifindex);
747 /* Bind to specific ethernet device. */
748 memset(&sll, 0, sizeof sll);
749 sll.sll_family = AF_PACKET;
750 sll.sll_ifindex = ifindex;
751 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
752 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
754 VLOG_ERR("%s: failed to bind raw socket (%s)",
755 netdev_get_name(netdev_), strerror(error));
770 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
772 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
774 if (netdev->fd < 0) {
775 /* Device is not listening. */
780 ssize_t retval = read(netdev->fd, data, size);
783 } else if (errno != EINTR) {
784 if (errno != EAGAIN) {
785 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
786 strerror(errno), netdev_get_name(netdev_));
793 /* Registers with the poll loop to wake up from the next call to poll_block()
794 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
796 netdev_linux_recv_wait(struct netdev *netdev_)
798 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
799 if (netdev->fd >= 0) {
800 poll_fd_wait(netdev->fd, POLLIN);
804 /* Discards all packets waiting to be received from 'netdev'. */
806 netdev_linux_drain(struct netdev *netdev_)
808 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
809 if (netdev->fd < 0) {
811 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
813 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
814 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
818 drain_fd(netdev->fd, ifr.ifr_qlen);
821 return drain_rcvbuf(netdev->fd);
825 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
826 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
827 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
828 * the packet is too big or too small to transmit on the device.
830 * The caller retains ownership of 'buffer' in all cases.
832 * The kernel maintains a packet transmission queue, so the caller is not
833 * expected to do additional queuing of packets. */
835 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
837 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
841 if (netdev->fd < 0) {
842 /* Use our AF_PACKET socket to send to this device. */
843 struct sockaddr_ll sll;
850 sock = af_packet_sock();
855 error = get_ifindex(netdev_, &ifindex);
860 /* We don't bother setting most fields in sockaddr_ll because the
861 * kernel ignores them for SOCK_RAW. */
862 memset(&sll, 0, sizeof sll);
863 sll.sll_family = AF_PACKET;
864 sll.sll_ifindex = ifindex;
866 iov.iov_base = (void *) data;
870 msg.msg_namelen = sizeof sll;
873 msg.msg_control = NULL;
874 msg.msg_controllen = 0;
877 retval = sendmsg(sock, &msg, 0);
879 /* Use the netdev's own fd to send to this device. This is
880 * essential for tap devices, because packets sent to a tap device
881 * with an AF_PACKET socket will loop back to be *received* again
882 * on the tap device. */
883 retval = write(netdev->fd, data, size);
887 /* The Linux AF_PACKET implementation never blocks waiting for room
888 * for packets, instead returning ENOBUFS. Translate this into
889 * EAGAIN for the caller. */
890 if (errno == ENOBUFS) {
892 } else if (errno == EINTR) {
894 } else if (errno != EAGAIN) {
895 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
896 netdev_get_name(netdev_), strerror(errno));
899 } else if (retval != size) {
900 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
901 "%zu) on %s", retval, size, netdev_get_name(netdev_));
909 /* Registers with the poll loop to wake up from the next call to poll_block()
910 * when the packet transmission queue has sufficient room to transmit a packet
911 * with netdev_send().
913 * The kernel maintains a packet transmission queue, so the client is not
914 * expected to do additional queuing of packets. Thus, this function is
915 * unlikely to ever be used. It is included for completeness. */
917 netdev_linux_send_wait(struct netdev *netdev_)
919 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
920 if (netdev->fd < 0) {
922 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
923 poll_fd_wait(netdev->fd, POLLOUT);
925 /* TAP device always accepts packets.*/
926 poll_immediate_wake();
930 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
931 * otherwise a positive errno value. */
933 netdev_linux_set_etheraddr(struct netdev *netdev_,
934 const uint8_t mac[ETH_ADDR_LEN])
936 struct netdev_dev_linux *netdev_dev =
937 netdev_dev_linux_cast(netdev_get_dev(netdev_));
940 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
941 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
942 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
944 netdev_dev->cache_valid |= VALID_ETHERADDR;
945 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
953 /* Returns a pointer to 'netdev''s MAC address. The caller must not modify or
954 * free the returned buffer. */
956 netdev_linux_get_etheraddr(const struct netdev *netdev_,
957 uint8_t mac[ETH_ADDR_LEN])
959 struct netdev_dev_linux *netdev_dev =
960 netdev_dev_linux_cast(netdev_get_dev(netdev_));
961 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
962 int error = get_etheraddr(netdev_get_name(netdev_),
963 netdev_dev->etheraddr);
967 netdev_dev->cache_valid |= VALID_ETHERADDR;
969 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
973 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
974 * in bytes, not including the hardware header; thus, this is typically 1500
975 * bytes for Ethernet devices. */
977 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
979 struct netdev_dev_linux *netdev_dev =
980 netdev_dev_linux_cast(netdev_get_dev(netdev_));
981 if (!(netdev_dev->cache_valid & VALID_MTU)) {
985 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
986 SIOCGIFMTU, "SIOCGIFMTU");
990 netdev_dev->mtu = ifr.ifr_mtu;
991 netdev_dev->cache_valid |= VALID_MTU;
993 *mtup = netdev_dev->mtu;
997 /* Sets the maximum size of transmitted (MTU) for given device using linux
998 * networking ioctl interface.
1001 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1003 struct netdev_dev_linux *netdev_dev =
1004 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1009 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1010 SIOCSIFMTU, "SIOCSIFMTU");
1015 netdev_dev->mtu = ifr.ifr_mtu;
1016 netdev_dev->cache_valid |= VALID_MTU;
/* Looks up the ifindex of 'netdev'.
 *
 * Returns the ifindex, a positive number, if successful.  Otherwise returns
 * a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1032 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1034 struct netdev_dev_linux *netdev_dev =
1035 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1037 if (netdev_dev->miimon_interval > 0) {
1038 *carrier = netdev_dev->miimon;
1040 *carrier = netdev_dev->carrier;
1046 static long long int
1047 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1049 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1053 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1054 struct mii_ioctl_data *data)
1059 memset(&ifr, 0, sizeof ifr);
1060 memcpy(&ifr.ifr_data, data, sizeof *data);
1061 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1062 memcpy(data, &ifr.ifr_data, sizeof *data);
1068 netdev_linux_get_miimon(const char *name, bool *miimon)
1070 struct mii_ioctl_data data;
1075 memset(&data, 0, sizeof data);
1076 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1078 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1079 data.reg_num = MII_BMSR;
1080 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1084 *miimon = !!(data.val_out & BMSR_LSTATUS);
1086 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1089 struct ethtool_cmd ecmd;
1091 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1094 memset(&ecmd, 0, sizeof ecmd);
1095 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1098 struct ethtool_value eval;
1100 memcpy(&eval, &ecmd, sizeof eval);
1101 *miimon = !!eval.data;
1103 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1111 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1112 long long int interval)
1114 struct netdev_dev_linux *netdev_dev;
1116 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1118 interval = interval > 0 ? MAX(interval, 100) : 0;
1119 if (netdev_dev->miimon_interval != interval) {
1120 netdev_dev->miimon_interval = interval;
1121 timer_set_expired(&netdev_dev->miimon_timer);
1128 netdev_linux_miimon_run(void)
1130 struct shash device_shash;
1131 struct shash_node *node;
1133 shash_init(&device_shash);
1134 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1135 SHASH_FOR_EACH (node, &device_shash) {
1136 struct netdev_dev_linux *dev = node->data;
1139 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1143 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1144 if (miimon != dev->miimon) {
1145 dev->miimon = miimon;
1146 netdev_dev_linux_changed(dev);
1149 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1152 shash_destroy(&device_shash);
1156 netdev_linux_miimon_wait(void)
1158 struct shash device_shash;
1159 struct shash_node *node;
1161 shash_init(&device_shash);
1162 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1163 SHASH_FOR_EACH (node, &device_shash) {
1164 struct netdev_dev_linux *dev = node->data;
1166 if (dev->miimon_interval > 0) {
1167 timer_wait(&dev->miimon_timer);
1170 shash_destroy(&device_shash);
/* Check whether we can use RTM_GETLINK to get network device statistics.
1174 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1177 check_for_working_netlink_stats(void)
1179 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1180 * preferable, so if that works, we'll use it. */
1181 int ifindex = do_get_ifindex("lo");
1183 VLOG_WARN("failed to get ifindex for lo, "
1184 "obtaining netdev stats from proc");
1187 struct netdev_stats stats;
1188 int error = get_stats_via_netlink(ifindex, &stats);
1190 VLOG_DBG("obtaining netdev stats via rtnetlink");
1193 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1194 "via proc (you are probably running a pre-2.6.19 "
1195 "kernel)", strerror(error));
/* Exchanges the values stored in '*a' and '*b'.  Safe when 'a' == 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved_b = *b;

    *b = *a;
    *a = saved_b;
}
1210 get_stats_via_vport(const struct netdev *netdev_,
1211 struct netdev_stats *stats)
1213 struct netdev_dev_linux *netdev_dev =
1214 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1216 if (netdev_dev->have_vport_stats ||
1217 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1220 error = netdev_vport_get_stats(netdev_, stats);
1222 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed %d",
1223 netdev_get_name(netdev_), error);
1225 netdev_dev->have_vport_stats = !error;
1226 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1231 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1232 struct netdev_stats *stats)
1234 static int use_netlink_stats = -1;
1237 if (use_netlink_stats < 0) {
1238 use_netlink_stats = check_for_working_netlink_stats();
1241 if (use_netlink_stats) {
1244 error = get_ifindex(netdev_, &ifindex);
1246 error = get_stats_via_netlink(ifindex, stats);
1249 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1253 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1254 netdev_get_name(netdev_), error);
1260 /* Retrieves current device stats for 'netdev-linux'. */
1262 netdev_linux_get_stats(const struct netdev *netdev_,
1263 struct netdev_stats *stats)
1265 struct netdev_dev_linux *netdev_dev =
1266 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1267 struct netdev_stats dev_stats;
1270 get_stats_via_vport(netdev_, stats);
1272 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1275 if (!netdev_dev->have_vport_stats) {
1282 if (!netdev_dev->have_vport_stats) {
1283 /* stats not available from OVS then use ioctl stats. */
1286 stats->rx_errors += dev_stats.rx_errors;
1287 stats->tx_errors += dev_stats.tx_errors;
1288 stats->rx_dropped += dev_stats.rx_dropped;
1289 stats->tx_dropped += dev_stats.tx_dropped;
1290 stats->multicast += dev_stats.multicast;
1291 stats->collisions += dev_stats.collisions;
1292 stats->rx_length_errors += dev_stats.rx_length_errors;
1293 stats->rx_over_errors += dev_stats.rx_over_errors;
1294 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1295 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1296 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1297 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1298 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1299 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1300 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1301 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1302 stats->tx_window_errors += dev_stats.tx_window_errors;
1307 /* Retrieves current device stats for 'netdev-tap' netdev or
1308 * netdev-internal. */
1310 netdev_pseudo_get_stats(const struct netdev *netdev_,
1311 struct netdev_stats *stats)
1313 struct netdev_dev_linux *netdev_dev =
1314 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1315 struct netdev_stats dev_stats;
1318 get_stats_via_vport(netdev_, stats);
1320 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1322 if (!netdev_dev->have_vport_stats) {
1329 /* If this port is an internal port then the transmit and receive stats
1330 * will appear to be swapped relative to the other ports since we are the
1331 * one sending the data, not a remote computer. For consistency, we swap
1332 * them back here. This does not apply if we are getting stats from the
1333 * vport layer because it always tracks stats from the perspective of the
1335 if (!netdev_dev->have_vport_stats) {
1337 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1338 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1339 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1340 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1341 stats->rx_length_errors = 0;
1342 stats->rx_over_errors = 0;
1343 stats->rx_crc_errors = 0;
1344 stats->rx_frame_errors = 0;
1345 stats->rx_fifo_errors = 0;
1346 stats->rx_missed_errors = 0;
1347 stats->tx_aborted_errors = 0;
1348 stats->tx_carrier_errors = 0;
1349 stats->tx_fifo_errors = 0;
1350 stats->tx_heartbeat_errors = 0;
1351 stats->tx_window_errors = 0;
1353 stats->rx_dropped += dev_stats.tx_dropped;
1354 stats->tx_dropped += dev_stats.rx_dropped;
1356 stats->rx_errors += dev_stats.tx_errors;
1357 stats->tx_errors += dev_stats.rx_errors;
1359 stats->multicast += dev_stats.multicast;
1360 stats->collisions += dev_stats.collisions;
1365 /* Stores the features supported by 'netdev' into each of '*current',
1366 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1367 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1368 * successful, otherwise a positive errno value. */
1370 netdev_linux_get_features(const struct netdev *netdev,
1371 uint32_t *current, uint32_t *advertised,
1372 uint32_t *supported, uint32_t *peer)
1374 struct ethtool_cmd ecmd;
1377 memset(&ecmd, 0, sizeof ecmd);
1378 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1379 ETHTOOL_GSET, "ETHTOOL_GSET");
1384 /* Supported features. */
1386 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1387 *supported |= OFPPF_10MB_HD;
1389 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1390 *supported |= OFPPF_10MB_FD;
1392 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1393 *supported |= OFPPF_100MB_HD;
1395 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1396 *supported |= OFPPF_100MB_FD;
1398 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1399 *supported |= OFPPF_1GB_HD;
1401 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1402 *supported |= OFPPF_1GB_FD;
1404 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1405 *supported |= OFPPF_10GB_FD;
1407 if (ecmd.supported & SUPPORTED_TP) {
1408 *supported |= OFPPF_COPPER;
1410 if (ecmd.supported & SUPPORTED_FIBRE) {
1411 *supported |= OFPPF_FIBER;
1413 if (ecmd.supported & SUPPORTED_Autoneg) {
1414 *supported |= OFPPF_AUTONEG;
1416 if (ecmd.supported & SUPPORTED_Pause) {
1417 *supported |= OFPPF_PAUSE;
1419 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1420 *supported |= OFPPF_PAUSE_ASYM;
1423 /* Advertised features. */
1425 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1426 *advertised |= OFPPF_10MB_HD;
1428 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1429 *advertised |= OFPPF_10MB_FD;
1431 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1432 *advertised |= OFPPF_100MB_HD;
1434 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1435 *advertised |= OFPPF_100MB_FD;
1437 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1438 *advertised |= OFPPF_1GB_HD;
1440 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1441 *advertised |= OFPPF_1GB_FD;
1443 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1444 *advertised |= OFPPF_10GB_FD;
1446 if (ecmd.advertising & ADVERTISED_TP) {
1447 *advertised |= OFPPF_COPPER;
1449 if (ecmd.advertising & ADVERTISED_FIBRE) {
1450 *advertised |= OFPPF_FIBER;
1452 if (ecmd.advertising & ADVERTISED_Autoneg) {
1453 *advertised |= OFPPF_AUTONEG;
1455 if (ecmd.advertising & ADVERTISED_Pause) {
1456 *advertised |= OFPPF_PAUSE;
1458 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1459 *advertised |= OFPPF_PAUSE_ASYM;
1462 /* Current settings. */
1463 if (ecmd.speed == SPEED_10) {
1464 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1465 } else if (ecmd.speed == SPEED_100) {
1466 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1467 } else if (ecmd.speed == SPEED_1000) {
1468 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1469 } else if (ecmd.speed == SPEED_10000) {
1470 *current = OFPPF_10GB_FD;
1475 if (ecmd.port == PORT_TP) {
1476 *current |= OFPPF_COPPER;
1477 } else if (ecmd.port == PORT_FIBRE) {
1478 *current |= OFPPF_FIBER;
1482 *current |= OFPPF_AUTONEG;
1485 /* Peer advertisements. */
1486 *peer = 0; /* XXX */
1491 /* Set the features advertised by 'netdev' to 'advertise'. */
1493 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1495 struct ethtool_cmd ecmd;
1498 memset(&ecmd, 0, sizeof ecmd);
1499 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1500 ETHTOOL_GSET, "ETHTOOL_GSET");
1505 ecmd.advertising = 0;
1506 if (advertise & OFPPF_10MB_HD) {
1507 ecmd.advertising |= ADVERTISED_10baseT_Half;
1509 if (advertise & OFPPF_10MB_FD) {
1510 ecmd.advertising |= ADVERTISED_10baseT_Full;
1512 if (advertise & OFPPF_100MB_HD) {
1513 ecmd.advertising |= ADVERTISED_100baseT_Half;
1515 if (advertise & OFPPF_100MB_FD) {
1516 ecmd.advertising |= ADVERTISED_100baseT_Full;
1518 if (advertise & OFPPF_1GB_HD) {
1519 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1521 if (advertise & OFPPF_1GB_FD) {
1522 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1524 if (advertise & OFPPF_10GB_FD) {
1525 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1527 if (advertise & OFPPF_COPPER) {
1528 ecmd.advertising |= ADVERTISED_TP;
1530 if (advertise & OFPPF_FIBER) {
1531 ecmd.advertising |= ADVERTISED_FIBRE;
1533 if (advertise & OFPPF_AUTONEG) {
1534 ecmd.advertising |= ADVERTISED_Autoneg;
1536 if (advertise & OFPPF_PAUSE) {
1537 ecmd.advertising |= ADVERTISED_Pause;
1539 if (advertise & OFPPF_PAUSE_ASYM) {
1540 ecmd.advertising |= ADVERTISED_Asym_Pause;
1542 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1543 ETHTOOL_SSET, "ETHTOOL_SSET");
/* Shell command templates used by netdev_linux_set_policing().
 *
 * POLICE_ADD_CMD takes the device name (%s).  POLICE_CONFIG_CMD takes the
 * device name (%s), then the rate in kbit and the burst in kb; both numbers
 * are passed as uint32_t, hence %u. */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %ukbit burst %uk mtu 65535 drop flowid :1"
1549 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1550 * positive errno value.
1552 * This function is equivalent to running
1553 * /sbin/tc qdisc del dev %s handle ffff: ingress
1554 * but it is much, much faster.
1557 netdev_linux_remove_policing(struct netdev *netdev)
1559 struct netdev_dev_linux *netdev_dev =
1560 netdev_dev_linux_cast(netdev_get_dev(netdev));
1561 const char *netdev_name = netdev_get_name(netdev);
1563 struct ofpbuf request;
1564 struct tcmsg *tcmsg;
1567 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1571 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1572 tcmsg->tcm_parent = TC_H_INGRESS;
1573 nl_msg_put_string(&request, TCA_KIND, "ingress");
1574 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1576 error = tc_transact(&request, NULL);
1577 if (error && error != ENOENT && error != EINVAL) {
1578 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1579 netdev_name, strerror(error));
1583 netdev_dev->kbits_rate = 0;
1584 netdev_dev->kbits_burst = 0;
1585 netdev_dev->cache_valid |= VALID_POLICING;
1589 /* Attempts to set input rate limiting (policing) policy. */
1591 netdev_linux_set_policing(struct netdev *netdev,
1592 uint32_t kbits_rate, uint32_t kbits_burst)
1594 struct netdev_dev_linux *netdev_dev =
1595 netdev_dev_linux_cast(netdev_get_dev(netdev));
1596 const char *netdev_name = netdev_get_name(netdev);
1599 COVERAGE_INC(netdev_set_policing);
1601 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1602 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1603 : kbits_burst); /* Stick with user-specified value. */
1605 if (netdev_dev->cache_valid & VALID_POLICING
1606 && netdev_dev->kbits_rate == kbits_rate
1607 && netdev_dev->kbits_burst == kbits_burst) {
1608 /* Assume that settings haven't changed since we last set them. */
1612 netdev_linux_remove_policing(netdev);
1614 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1615 if (system(command) != 0) {
1616 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1620 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1621 kbits_rate, kbits_burst);
1622 if (system(command) != 0) {
1623 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1628 netdev_dev->kbits_rate = kbits_rate;
1629 netdev_dev->kbits_burst = kbits_burst;
1630 netdev_dev->cache_valid |= VALID_POLICING;
1637 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1640 const struct tc_ops **opsp;
1642 for (opsp = tcs; *opsp != NULL; opsp++) {
1643 const struct tc_ops *ops = *opsp;
1644 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1645 sset_add(types, ops->ovs_name);
1651 static const struct tc_ops *
1652 tc_lookup_ovs_name(const char *name)
1654 const struct tc_ops **opsp;
1656 for (opsp = tcs; *opsp != NULL; opsp++) {
1657 const struct tc_ops *ops = *opsp;
1658 if (!strcmp(name, ops->ovs_name)) {
1665 static const struct tc_ops *
1666 tc_lookup_linux_name(const char *name)
1668 const struct tc_ops **opsp;
1670 for (opsp = tcs; *opsp != NULL; opsp++) {
1671 const struct tc_ops *ops = *opsp;
1672 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1679 static struct tc_queue *
1680 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1683 struct netdev_dev_linux *netdev_dev =
1684 netdev_dev_linux_cast(netdev_get_dev(netdev));
1685 struct tc_queue *queue;
1687 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1688 if (queue->queue_id == queue_id) {
/* Returns the queue of 'netdev' with id 'queue_id', or NULL if there is none.
 * The bucket hash is hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1702 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1704 struct netdev_qos_capabilities *caps)
1706 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1710 caps->n_queues = ops->n_queues;
1715 netdev_linux_get_qos(const struct netdev *netdev,
1716 const char **typep, struct shash *details)
1718 struct netdev_dev_linux *netdev_dev =
1719 netdev_dev_linux_cast(netdev_get_dev(netdev));
1722 error = tc_query_qdisc(netdev);
1727 *typep = netdev_dev->tc->ops->ovs_name;
1728 return (netdev_dev->tc->ops->qdisc_get
1729 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1734 netdev_linux_set_qos(struct netdev *netdev,
1735 const char *type, const struct shash *details)
1737 struct netdev_dev_linux *netdev_dev =
1738 netdev_dev_linux_cast(netdev_get_dev(netdev));
1739 const struct tc_ops *new_ops;
1742 new_ops = tc_lookup_ovs_name(type);
1743 if (!new_ops || !new_ops->tc_install) {
1747 error = tc_query_qdisc(netdev);
1752 if (new_ops == netdev_dev->tc->ops) {
1753 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1755 /* Delete existing qdisc. */
1756 error = tc_del_qdisc(netdev);
1760 assert(netdev_dev->tc == NULL);
1762 /* Install new qdisc. */
1763 error = new_ops->tc_install(netdev, details);
1764 assert((error == 0) == (netdev_dev->tc != NULL));
1771 netdev_linux_get_queue(const struct netdev *netdev,
1772 unsigned int queue_id, struct shash *details)
1774 struct netdev_dev_linux *netdev_dev =
1775 netdev_dev_linux_cast(netdev_get_dev(netdev));
1778 error = tc_query_qdisc(netdev);
1782 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1784 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1790 netdev_linux_set_queue(struct netdev *netdev,
1791 unsigned int queue_id, const struct shash *details)
1793 struct netdev_dev_linux *netdev_dev =
1794 netdev_dev_linux_cast(netdev_get_dev(netdev));
1797 error = tc_query_qdisc(netdev);
1800 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1801 || !netdev_dev->tc->ops->class_set) {
1805 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1809 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1811 struct netdev_dev_linux *netdev_dev =
1812 netdev_dev_linux_cast(netdev_get_dev(netdev));
1815 error = tc_query_qdisc(netdev);
1818 } else if (!netdev_dev->tc->ops->class_delete) {
1821 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1823 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1829 netdev_linux_get_queue_stats(const struct netdev *netdev,
1830 unsigned int queue_id,
1831 struct netdev_queue_stats *stats)
1833 struct netdev_dev_linux *netdev_dev =
1834 netdev_dev_linux_cast(netdev_get_dev(netdev));
1837 error = tc_query_qdisc(netdev);
1840 } else if (!netdev_dev->tc->ops->class_get_stats) {
1843 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1845 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1851 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1853 struct ofpbuf request;
1854 struct tcmsg *tcmsg;
1856 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1860 tcmsg->tcm_parent = 0;
1861 nl_dump_start(dump, rtnl_sock, &request);
1862 ofpbuf_uninit(&request);
1867 netdev_linux_dump_queues(const struct netdev *netdev,
1868 netdev_dump_queues_cb *cb, void *aux)
1870 struct netdev_dev_linux *netdev_dev =
1871 netdev_dev_linux_cast(netdev_get_dev(netdev));
1872 struct tc_queue *queue, *next_queue;
1873 struct shash details;
1877 error = tc_query_qdisc(netdev);
1880 } else if (!netdev_dev->tc->ops->class_get) {
1885 shash_init(&details);
1886 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
1887 &netdev_dev->tc->queues) {
1888 shash_clear(&details);
1890 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1892 (*cb)(queue->queue_id, &details, aux);
1897 shash_destroy(&details);
1903 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1904 netdev_dump_queue_stats_cb *cb, void *aux)
1906 struct netdev_dev_linux *netdev_dev =
1907 netdev_dev_linux_cast(netdev_get_dev(netdev));
1908 struct nl_dump dump;
1913 error = tc_query_qdisc(netdev);
1916 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1921 if (!start_queue_dump(netdev, &dump)) {
1924 while (nl_dump_next(&dump, &msg)) {
1925 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1931 error = nl_dump_done(&dump);
1932 return error ? error : last_error;
1936 netdev_linux_get_in4(const struct netdev *netdev_,
1937 struct in_addr *address, struct in_addr *netmask)
1939 struct netdev_dev_linux *netdev_dev =
1940 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1942 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1945 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1946 SIOCGIFADDR, "SIOCGIFADDR");
1951 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1952 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1957 netdev_dev->cache_valid |= VALID_IN4;
1959 *address = netdev_dev->address;
1960 *netmask = netdev_dev->netmask;
1961 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1965 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1966 struct in_addr netmask)
1968 struct netdev_dev_linux *netdev_dev =
1969 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1972 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1974 netdev_dev->cache_valid |= VALID_IN4;
1975 netdev_dev->address = address;
1976 netdev_dev->netmask = netmask;
1977 if (address.s_addr != INADDR_ANY) {
1978 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1979 "SIOCSIFNETMASK", netmask);
/* Parses one 'line' in the format of /proc/net/if_inet6: 32 hex digits of
 * IPv6 address, four hex fields that are skipped, then the interface name.
 *
 * On success stores the 16 address bytes in '*in6' and the name (at most 16
 * characters plus terminator) in 'ifname', and returns true.  Returns false
 * if the line does not match; the outputs may then be partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    bool ok;

    /* X8 reads one address byte (two hex digits). */
#define X8 "%2"SCNx8
    ok = sscanf(line,
                " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                "%*x %*x %*x %*x %16s\n",
                &s6[0], &s6[1], &s6[2], &s6[3],
                &s6[4], &s6[5], &s6[6], &s6[7],
                &s6[8], &s6[9], &s6[10], &s6[11],
                &s6[12], &s6[13], &s6[14], &s6[15],
                ifname) == 17;
#undef X8                       /* Keep the helper macro local. */

    return ok;
}
2001 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2002 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2004 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2006 struct netdev_dev_linux *netdev_dev =
2007 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2008 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2012 netdev_dev->in6 = in6addr_any;
2014 file = fopen("/proc/net/if_inet6", "r");
2016 const char *name = netdev_get_name(netdev_);
2017 while (fgets(line, sizeof line, file)) {
2018 struct in6_addr in6_tmp;
2019 char ifname[16 + 1];
2020 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2021 && !strcmp(name, ifname))
2023 netdev_dev->in6 = in6_tmp;
2029 netdev_dev->cache_valid |= VALID_IN6;
2031 *in6 = netdev_dev->in6;
/* Initializes '*sa' as an AF_INET socket address for 'addr' with port 0.  All
 * other bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(sa, 0, sizeof *sa);

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;
    memcpy(sa, &in4, sizeof in4);
}
2049 do_set_addr(struct netdev *netdev,
2050 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2053 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2054 make_in4_sockaddr(&ifr.ifr_addr, addr);
2056 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2060 /* Adds 'router' as a default IP gateway. */
2062 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2064 struct in_addr any = { INADDR_ANY };
2068 memset(&rt, 0, sizeof rt);
2069 make_in4_sockaddr(&rt.rt_dst, any);
2070 make_in4_sockaddr(&rt.rt_gateway, router);
2071 make_in4_sockaddr(&rt.rt_genmask, any);
2072 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2073 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2075 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2081 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2084 static const char fn[] = "/proc/net/route";
2089 *netdev_name = NULL;
2090 stream = fopen(fn, "r");
2091 if (stream == NULL) {
2092 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2097 while (fgets(line, sizeof line, stream)) {
2100 ovs_be32 dest, gateway, mask;
2101 int refcnt, metric, mtu;
2102 unsigned int flags, use, window, irtt;
2105 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2107 iface, &dest, &gateway, &flags, &refcnt,
2108 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2110 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2114 if (!(flags & RTF_UP)) {
2115 /* Skip routes that aren't up. */
2119 /* The output of 'dest', 'mask', and 'gateway' were given in
2120 * network byte order, so we don't need need any endian
2121 * conversions here. */
2122 if ((dest & mask) == (host->s_addr & mask)) {
2124 /* The host is directly reachable. */
2125 next_hop->s_addr = 0;
2127 /* To reach the host, we must go through a gateway. */
2128 next_hop->s_addr = gateway;
2130 *netdev_name = xstrdup(iface);
2142 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2144 struct ethtool_drvinfo drvinfo;
2147 memset(&drvinfo, 0, sizeof drvinfo);
2148 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2149 (struct ethtool_cmd *)&drvinfo,
2151 "ETHTOOL_GDRVINFO");
2153 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2154 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2155 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2161 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2162 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2163 * returns 0. Otherwise, it returns a positive errno value; in particular,
2164 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2166 netdev_linux_arp_lookup(const struct netdev *netdev,
2167 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2170 struct sockaddr_in sin;
2173 memset(&r, 0, sizeof r);
2174 memset(&sin, 0, sizeof sin);
2175 sin.sin_family = AF_INET;
2176 sin.sin_addr.s_addr = ip;
2178 memcpy(&r.arp_pa, &sin, sizeof sin);
2179 r.arp_ha.sa_family = ARPHRD_ETHER;
2181 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2182 COVERAGE_INC(netdev_arp_lookup);
2183 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2185 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2186 } else if (retval != ENXIO) {
2187 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2188 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
2194 nd_to_iff_flags(enum netdev_flags nd)
2197 if (nd & NETDEV_UP) {
2200 if (nd & NETDEV_PROMISC) {
2207 iff_to_nd_flags(int iff)
2209 enum netdev_flags nd = 0;
2213 if (iff & IFF_PROMISC) {
2214 nd |= NETDEV_PROMISC;
2220 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2221 enum netdev_flags on, enum netdev_flags *old_flagsp)
2223 int old_flags, new_flags;
2226 error = get_flags(netdev, &old_flags);
2228 *old_flagsp = iff_to_nd_flags(old_flags);
2229 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2230 if (new_flags != old_flags) {
2231 error = set_flags(netdev, new_flags);
2238 netdev_linux_change_seq(const struct netdev *netdev)
2240 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to a positional initializer for a "struct netdev_class" that shares
 * every callback between the Linux netdev flavors except the class name
 * (NAME), the create function (CREATE), and the stats accessors (GET_STATS,
 * SET_STATS).  The order of entries must match the member order of
 * struct netdev_class. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS)  \
{                                                               \
    NAME,                                                       \
                                                                \
    /* Class-wide init/run/wait. */                             \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    /* Device lifecycle and configuration. */                   \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
                                                                \
    netdev_linux_open,                                          \
    netdev_linux_close,                                         \
                                                                \
    /* Receive path. */                                         \
    netdev_linux_listen,                                        \
    netdev_linux_recv,                                          \
    netdev_linux_recv_wait,                                     \
    netdev_linux_drain,                                         \
                                                                \
    /* Transmit path. */                                        \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    /* Link-level properties and statistics. */                 \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    SET_STATS,                                                  \
                                                                \
    netdev_linux_get_features,                                  \
    netdev_linux_set_advertisements,                            \
                                                                \
    /* Policing and QoS. */                                     \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    /* IP-level properties. */                                  \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    netdev_linux_get_status,                                    \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_change_seq                                     \
}
2306 const struct netdev_class netdev_linux_class =
2309 netdev_linux_create,
2310 netdev_linux_get_stats,
2311 NULL); /* set_stats */
2313 const struct netdev_class netdev_tap_class =
2316 netdev_linux_create_tap,
2317 netdev_pseudo_get_stats,
2318 NULL); /* set_stats */
2320 const struct netdev_class netdev_internal_class =
2323 netdev_linux_create,
2324 netdev_pseudo_get_stats,
2325 netdev_vport_set_stats);
2327 /* HTB traffic control class. */
2329 #define HTB_N_QUEUES 0xf000
2333 unsigned int max_rate; /* In bytes/s. */
2337 struct tc_queue tc_queue;
2338 unsigned int min_rate; /* In bytes/s. */
2339 unsigned int max_rate; /* In bytes/s. */
2340 unsigned int burst; /* In bytes. */
2341 unsigned int priority; /* Lower values are higher priorities. */
2345 htb_get__(const struct netdev *netdev)
2347 struct netdev_dev_linux *netdev_dev =
2348 netdev_dev_linux_cast(netdev_get_dev(netdev));
2349 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2353 htb_install__(struct netdev *netdev, uint64_t max_rate)
2355 struct netdev_dev_linux *netdev_dev =
2356 netdev_dev_linux_cast(netdev_get_dev(netdev));
2359 htb = xmalloc(sizeof *htb);
2360 tc_init(&htb->tc, &tc_ops_htb);
2361 htb->max_rate = max_rate;
2363 netdev_dev->tc = &htb->tc;
2366 /* Create an HTB qdisc.
2368 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2370 htb_setup_qdisc__(struct netdev *netdev)
2373 struct tc_htb_glob opt;
2374 struct ofpbuf request;
2375 struct tcmsg *tcmsg;
2377 tc_del_qdisc(netdev);
2379 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2380 NLM_F_EXCL | NLM_F_CREATE, &request);
2384 tcmsg->tcm_handle = tc_make_handle(1, 0);
2385 tcmsg->tcm_parent = TC_H_ROOT;
2387 nl_msg_put_string(&request, TCA_KIND, "htb");
2389 memset(&opt, 0, sizeof opt);
2390 opt.rate2quantum = 10;
2394 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2395 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2396 nl_msg_end_nested(&request, opt_offset);
2398 return tc_transact(&request, NULL);
2401 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2402 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2404 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2405 unsigned int parent, struct htb_class *class)
2408 struct tc_htb_opt opt;
2409 struct ofpbuf request;
2410 struct tcmsg *tcmsg;
2414 error = netdev_get_mtu(netdev, &mtu);
2416 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2417 netdev_get_name(netdev));
2421 memset(&opt, 0, sizeof opt);
2422 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2423 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2424 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2425 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2426 opt.prio = class->priority;
2428 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2432 tcmsg->tcm_handle = handle;
2433 tcmsg->tcm_parent = parent;
2435 nl_msg_put_string(&request, TCA_KIND, "htb");
2436 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2437 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2438 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2439 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2440 nl_msg_end_nested(&request, opt_offset);
2442 error = tc_transact(&request, NULL);
2444 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2445 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2446 netdev_get_name(netdev),
2447 tc_get_major(handle), tc_get_minor(handle),
2448 tc_get_major(parent), tc_get_minor(parent),
2449 class->min_rate, class->max_rate,
2450 class->burst, class->priority, strerror(error));
2455 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2456 * description of them into 'details'. The description complies with the
2457 * specification given in the vswitch database documentation for linux-htb
2460 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2462 static const struct nl_policy tca_htb_policy[] = {
2463 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2464 .min_len = sizeof(struct tc_htb_opt) },
2467 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2468 const struct tc_htb_opt *htb;
2470 if (!nl_parse_nested(nl_options, tca_htb_policy,
2471 attrs, ARRAY_SIZE(tca_htb_policy))) {
2472 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2476 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2477 class->min_rate = htb->rate.rate;
2478 class->max_rate = htb->ceil.rate;
2479 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2480 class->priority = htb->prio;
2485 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2486 struct htb_class *options,
2487 struct netdev_queue_stats *stats)
2489 struct nlattr *nl_options;
2490 unsigned int handle;
2493 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2494 if (!error && queue_id) {
2495 unsigned int major = tc_get_major(handle);
2496 unsigned int minor = tc_get_minor(handle);
2497 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2498 *queue_id = minor - 1;
2503 if (!error && options) {
2504 error = htb_parse_tca_options__(nl_options, options);
2510 htb_parse_qdisc_details__(struct netdev *netdev,
2511 const struct shash *details, struct htb_class *hc)
2513 const char *max_rate_s;
2515 max_rate_s = shash_find_data(details, "max-rate");
2516 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2517 if (!hc->max_rate) {
2520 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2521 hc->max_rate = netdev_features_to_bps(current) / 8;
2523 hc->min_rate = hc->max_rate;
/* Fills 'hc' with HTB class parameters taken from 'details'.
 *
 *   - "min-rate": parsed base 10 and divided by 8, then raised to at least
 *     the device MTU and capped at the qdisc's 'max_rate'.
 *   - "max-rate": parsed base 10 and divided by 8 when present, then raised
 *     to at least 'hc->min_rate' and capped at the qdisc's 'max_rate'.
 *   - "burst": parsed base 10 and divided by 8, then raised to at least
 *     MTU + 64.
 *   - "priority": parsed base 10, 0 when absent.
 *
 * The MTU comes from netdev_get_mtu(); a warning is logged for a device that
 * lacks one.
 *
 * NOTE(review): the MTU error check, the fallback used when "max-rate" is
 * absent, and the return statement are on lines not visible in this
 * excerpt. */
2529 htb_parse_class_details__(struct netdev *netdev,
2530 const struct shash *details, struct htb_class *hc)
2532 const struct htb *htb = htb_get__(netdev);
2533 const char *min_rate_s = shash_find_data(details, "min-rate");
2534 const char *max_rate_s = shash_find_data(details, "max-rate");
2535 const char *burst_s = shash_find_data(details, "burst");
2536 const char *priority_s = shash_find_data(details, "priority");
2539 error = netdev_get_mtu(netdev, &mtu);
2541 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2542 netdev_get_name(netdev));
2546 /* HTB requires at least an mtu sized min-rate to send any traffic even
2547 * on uncongested links. */
2548 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2549 hc->min_rate = MAX(hc->min_rate, mtu);
2550 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2553 hc->max_rate = (max_rate_s
2554 ? strtoull(max_rate_s, NULL, 10) / 8
2556 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2557 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2561 * According to hints in the documentation that I've read, it is important
2562 * that 'burst' be at least as big as the largest frame that might be
2563 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2564 * but having it a bit too small is a problem. Since netdev_get_mtu()
2565 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2566 * the MTU. We actually add 64, instead of 14, as a guard against
2567 * additional headers get tacked on somewhere that we're not aware of. */
2568 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2569 hc->burst = MAX(hc->burst, mtu + 64);
2572 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' under 'parent' with tc_query_class(),
 * parses the reply into 'options' and 'stats' with htb_parse_tcmsg__() (no
 * queue ID is requested), and frees the reply.
 *
 * NOTE(review): the check between the query and the parse, and the return
 * statement, are on lines not visible in this excerpt. */
2578 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2579 unsigned int parent, struct htb_class *options,
2580 struct netdev_queue_stats *stats)
2582 struct ofpbuf *reply;
2585 error = tc_query_class(netdev, handle, parent, &reply);
2587 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2588 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': sets up the qdisc with htb_setup_qdisc__(),
 * derives the root class parameters from 'details', creates class 1:fffe
 * under 1:0 with them, and records the resulting max rate through
 * htb_install__().
 *
 * NOTE(review): the conditions guarding each step are on lines not visible
 * in this excerpt. */
2594 htb_tc_install(struct netdev *netdev, const struct shash *details)
2598 error = htb_setup_qdisc__(netdev);
2600 struct htb_class hc;
2602 htb_parse_qdisc_details__(netdev, details, &hc);
2603 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2604 tc_make_handle(1, 0), &hc);
2606 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that embeds 'queue' as its 'tc_queue'
 * member. */
2612 static struct htb_class *
2613 htb_class_cast__(const struct tc_queue *queue)
2615 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' for queue 'queue_id' in 'netdev''s HTB
 * state.
 *
 * The queue is looked up with tc_find_queue__().  One path reuses the
 * existing htb_class; the other allocates a new one and inserts it into
 * 'htb->tc.queues' under hash_int(queue_id, 0).  Either way min_rate,
 * max_rate, burst and priority are copied from 'hc'.
 *
 * NOTE(review): the branch condition choosing between the two paths is on a
 * line not visible in this excerpt. */
2619 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2620 const struct htb_class *hc)
2622 struct htb *htb = htb_get__(netdev);
2623 size_t hash = hash_int(queue_id, 0);
2624 struct tc_queue *queue;
2625 struct htb_class *hcp;
2627 queue = tc_find_queue__(netdev, queue_id, hash);
2629 hcp = htb_class_cast__(queue);
2631 hcp = xmalloc(sizeof *hcp);
2632 queue = &hcp->tc_queue;
2633 queue->queue_id = queue_id;
2634 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2637 hcp->min_rate = hc->min_rate;
2638 hcp->max_rate = hc->max_rate;
2639 hcp->burst = hc->burst;
2640 hcp->priority = hc->priority;
/* Rebuilds 'netdev''s HTB state from the kernel: queries class 1:fffe for
 * the qdisc-wide max rate, installs the HTB state with it, then dumps the
 * queues and records every class message that htb_parse_tcmsg__() accepts.
 *
 * NOTE(review): the result of htb_query_class__() is discarded here, so
 * 'hc.max_rate' is whatever it held before the call if the query fails;
 * whether it is initialized beforehand is not visible in this excerpt. */
2644 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2647 struct nl_dump dump;
2648 struct htb_class hc;
2650 /* Get qdisc options. */
2652 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2653 htb_install__(netdev, hc.max_rate);
2656 if (!start_queue_dump(netdev, &dump)) {
2659 while (nl_dump_next(&dump, &msg)) {
2660 unsigned int queue_id;
/* htb_parse_tcmsg__() returns 0 on success; failed messages are skipped. */
2662 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2663 htb_update_queue__(netdev, queue_id, &hc);
2666 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc': walks every queue in
 * 'htb->tc.queues' with the removal-safe iterator and unlinks it from the
 * map.
 *
 * NOTE(review): the lines that release the memory are not visible in this
 * excerpt. */
2672 htb_tc_destroy(struct tc *tc)
2674 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2675 struct htb_class *hc, *next;
2677 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2678 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds a "max-rate" entry to 'details' holding 8 times the qdisc's stored
 * 'max_rate', formatted as a decimal string.  The string is allocated by
 * xasprintf(). */
2686 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2688 const struct htb *htb = htb_get__(netdev);
2689 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures the HTB root class from 'details': parses them into a
 * temporary htb_class, applies it to class 1:fffe under 1:0, and stores the
 * new max rate in 'netdev''s HTB state.
 *
 * NOTE(review): the error check guarding the final store is on a line not
 * visible in this excerpt. */
2694 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2696 struct htb_class hc;
2699 htb_parse_qdisc_details__(netdev, details, &hc);
2700 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2701 tc_make_handle(1, 0), &hc);
2703 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the configuration of 'queue' into 'details'.
 *
 * "min-rate" and "burst" are stored as 8 times the cached value; "max-rate"
 * is added only when it differs from the min rate; "priority" is stored
 * unscaled.  All values are decimal strings allocated by xasprintf().
 *
 * NOTE(review): lines around the "burst" and "priority" entries are not
 * visible in this excerpt, so those entries may be conditional. */
2709 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2710 const struct tc_queue *queue, struct shash *details)
2712 const struct htb_class *hc = htb_class_cast__(queue);
2714 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2715 if (hc->min_rate != hc->max_rate) {
2716 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2718 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2720 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' from 'details': parses them, applies the
 * result to class 1:(queue_id + 1) under parent 1:fffe, and updates the
 * cached queue entry.
 *
 * NOTE(review): the error checks between the steps are on lines not visible
 * in this excerpt. */
2726 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2727 const struct shash *details)
2729 struct htb_class hc;
2732 error = htb_parse_class_details__(netdev, details, &hc);
2737 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2738 tc_make_handle(1, 0xfffe), &hc);
2743 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes the kernel class 1:(queue_id + 1) that backs 'queue' and unlinks
 * the queue from 'netdev''s HTB queue map.
 *
 * NOTE(review): the error check guarding the unlink, and any freeing of the
 * class, are on lines not visible in this excerpt. */
2748 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2750 struct htb_class *hc = htb_class_cast__(queue);
2751 struct htb *htb = htb_get__(netdev);
2754 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2756 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying class
 * 1:(queue_id + 1) under parent 1:fffe.  NULL is passed for the options
 * argument, so only the statistics are requested.  Returns the result of
 * htb_query_class__(). */
2763 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2764 struct netdev_queue_stats *stats)
2766 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2767 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message from a statistics dump: extracts the handle and
 * statistics from 'nlmsg' and, when the handle is 1:N with
 * 1 <= N <= HTB_N_QUEUES, invokes 'cb' with queue ID N - 1, the statistics
 * and 'aux'.
 *
 * NOTE(review): the error check after tc_parse_class() and the return
 * statement are on lines not visible in this excerpt. */
2771 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2772 const struct ofpbuf *nlmsg,
2773 netdev_dump_queue_stats_cb *cb, void *aux)
2775 struct netdev_queue_stats stats;
2776 unsigned int handle, major, minor;
2779 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2784 major = tc_get_major(handle);
2785 minor = tc_get_minor(handle);
2786 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2787 (*cb)(minor - 1, &stats, aux);
/* Operations table for HTB: Linux qdisc name "htb", OVS name "linux-htb",
 * HTB_N_QUEUES queues.  Only some of the callback entries are visible in
 * this excerpt. */
2792 static const struct tc_ops tc_ops_htb = {
2793 "htb", /* linux_name */
2794 "linux-htb", /* ovs_name */
2795 HTB_N_QUEUES, /* n_queues */
2804 htb_class_get_stats,
2805 htb_class_dump_stats
2808 /* "linux-hfsc" traffic control class. */
/* Largest class minor number that the HFSC code maps onto a queue; a class
 * 1:N with 1 <= N <= HFSC_N_QUEUES corresponds to queue ID N - 1. */
2810 #define HFSC_N_QUEUES 0xf000
2818 struct tc_queue tc_queue;
/* Returns the struct hfsc that embeds the 'tc' currently attached to
 * 'netdev''s device.  The caller is presumably expected to know that the
 * attached tc really is an HFSC one; nothing here checks it. */
2823 static struct hfsc *
2824 hfsc_get__(const struct netdev *netdev)
2826 struct netdev_dev_linux *netdev_dev;
2827 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2828 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that embeds 'queue' as its 'tc_queue'
 * member. */
2831 static struct hfsc_class *
2832 hfsc_class_cast__(const struct tc_queue *queue)
2834 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new struct hfsc, initializes its embedded tc with the HFSC
 * operations table, records 'max_rate' in it, and attaches it to 'netdev''s
 * device as its current tc. */
2838 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2840 struct netdev_dev_linux * netdev_dev;
2843 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2844 hfsc = xmalloc(sizeof *hfsc);
2845 tc_init(&hfsc->tc, &tc_ops_hfsc);
2846 hfsc->max_rate = max_rate;
2847 netdev_dev->tc = &hfsc->tc;
/* Records the rates in 'hc' for queue 'queue_id' in 'netdev''s HFSC state.
 *
 * The queue is looked up with tc_find_queue__().  One path reuses the
 * existing hfsc_class; the other allocates a new one and inserts it into
 * 'hfsc->tc.queues' under hash_int(queue_id, 0).  Either way min_rate and
 * max_rate are copied from 'hc'.
 *
 * NOTE(review): the branch condition choosing between the two paths is on a
 * line not visible in this excerpt. */
2851 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2852 const struct hfsc_class *hc)
2856 struct hfsc_class *hcp;
2857 struct tc_queue *queue;
2859 hfsc = hfsc_get__(netdev);
2860 hash = hash_int(queue_id, 0);
2862 queue = tc_find_queue__(netdev, queue_id, hash);
2864 hcp = hfsc_class_cast__(queue);
2866 hcp = xmalloc(sizeof *hcp);
2867 queue = &hcp->tc_queue;
2868 queue->queue_id = queue_id;
2869 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2872 hcp->min_rate = hc->min_rate;
2873 hcp->max_rate = hc->max_rate;
/* Extracts HFSC class rates from the nested attribute 'nl_options' into
 * 'class'.
 *
 * The policy has three entries, each of type NL_A_UNSPEC and at least
 * sizeof(struct tc_service_curve) bytes long; the code then reads the
 * TCA_HFSC_RSC, TCA_HFSC_FSC and TCA_HFSC_USC attributes.  A warning is
 * logged when:
 *
 *   - any curve has a nonzero 'm1' or 'd' (non-linear curve),
 *   - the RSC and FSC 'm2' values differ, or
 *   - the RSC 'm2' exceeds the USC 'm2'.
 *
 * Otherwise 'class->min_rate' is taken from the FSC 'm2' and
 * 'class->max_rate' from the USC 'm2'.
 *
 * NOTE(review): the return statements, including what is returned after each
 * warning, are on lines not visible in this excerpt. */
2877 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2879 const struct tc_service_curve *rsc, *fsc, *usc;
2880 static const struct nl_policy tca_hfsc_policy[] = {
2882 .type = NL_A_UNSPEC,
2884 .min_len = sizeof(struct tc_service_curve),
2887 .type = NL_A_UNSPEC,
2889 .min_len = sizeof(struct tc_service_curve),
2892 .type = NL_A_UNSPEC,
2894 .min_len = sizeof(struct tc_service_curve),
2897 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2899 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2900 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2901 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2905 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2906 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2907 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2909 if (rsc->m1 != 0 || rsc->d != 0 ||
2910 fsc->m1 != 0 || fsc->d != 0 ||
2911 usc->m1 != 0 || usc->d != 0) {
2912 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2913 "Non-linear service curves are not supported.");
2917 if (rsc->m2 != fsc->m2) {
2918 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2919 "Real-time service curves are not supported ");
2923 if (rsc->m2 > usc->m2) {
2924 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2925 "Min-rate service curve is greater than "
2926 "the max-rate service curve.");
2930 class->min_rate = fsc->m2;
2931 class->max_rate = usc->m2;
/* Parses the traffic-control class message 'tcmsg' with tc_parse_class().
 *
 * When the class handle is 1:N with 1 <= N <= HFSC_N_QUEUES, N - 1 is stored
 * into '*queue_id'; the message's options are parsed into 'options' through
 * hfsc_parse_tca_options__().
 *
 * NOTE(review): the null checks on 'queue_id' and 'options', the error
 * handling and the return statement are on lines not visible in this
 * excerpt. */
2936 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2937 struct hfsc_class *options,
2938 struct netdev_queue_stats *stats)
2941 unsigned int handle;
2942 struct nlattr *nl_options;
2944 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2950 unsigned int major, minor;
2952 major = tc_get_major(handle);
2953 minor = tc_get_minor(handle);
2954 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2955 *queue_id = minor - 1;
2962 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' under 'parent' with tc_query_class(),
 * parses the reply into 'options' and 'stats' with hfsc_parse_tcmsg__() (no
 * queue ID is requested), and frees the reply.
 *
 * NOTE(review): the check between the query and the parse, and the return
 * statement, are on lines not visible in this excerpt. */
2969 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2970 unsigned int parent, struct hfsc_class *options,
2971 struct netdev_queue_stats *stats)
2974 struct ofpbuf *reply;
2976 error = tc_query_class(netdev, handle, parent, &reply);
2981 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2982 ofpbuf_delete(reply);
/* Fills 'class' with the parameters for the HFSC root class from 'details'.
 *
 * The "max-rate" value is parsed as a base-10 integer and divided by 8
 * (presumably converting bits/s to bytes/s).  A fallback path takes the rate
 * from the device's current features instead, via netdev_get_features() and
 * netdev_features_to_bps().  Both 'class->min_rate' and 'class->max_rate'
 * are set to the resulting value.
 *
 * NOTE(review): the condition selecting the fallback and the declarations of
 * 'max_rate' and 'current' are on lines not visible in this excerpt. */
2987 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2988 struct hfsc_class *class)
2991 const char *max_rate_s;
2993 max_rate_s = shash_find_data(details, "max-rate");
2994 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
/* The argument had been corrupted to a currency-sign character followed by
 * 't' (an HTML-entity decoding artifact); it is the address of 'current'. */
2999 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3000 max_rate = netdev_features_to_bps(current) / 8;
3003 class->min_rate = max_rate;
3004 class->max_rate = max_rate;
/* Fills 'class' with HFSC class rates taken from 'details'.
 *
 *   - "min-rate": parsed base 10 and divided by 8, then raised to at least 1
 *     and capped at the qdisc's 'max_rate'.
 *   - "max-rate": parsed base 10 and divided by 8 when present, then raised
 *     to at least the min rate and capped at the qdisc's 'max_rate'.
 *
 * The working values are uint32_t, so the result of strtoull() / 8 is
 * narrowed on assignment.
 *
 * NOTE(review): the fallback used when "max-rate" is absent and the return
 * statement are on lines not visible in this excerpt. */
3008 hfsc_parse_class_details__(struct netdev *netdev,
3009 const struct shash *details,
3010 struct hfsc_class * class)
3012 const struct hfsc *hfsc;
3013 uint32_t min_rate, max_rate;
3014 const char *min_rate_s, *max_rate_s;
3016 hfsc = hfsc_get__(netdev);
3017 min_rate_s = shash_find_data(details, "min-rate");
3018 max_rate_s = shash_find_data(details, "max-rate");
3020 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3021 min_rate = MAX(min_rate, 1);
3022 min_rate = MIN(min_rate, hfsc->max_rate);
3024 max_rate = (max_rate_s
3025 ? strtoull(max_rate_s, NULL, 10) / 8
3027 max_rate = MAX(max_rate, min_rate);
3028 max_rate = MIN(max_rate, hfsc->max_rate);
3030 class->min_rate = min_rate;
3031 class->max_rate = max_rate;
/* Replaces whatever root qdisc 'netdev' has with an HFSC one.
 *
 * Any existing qdisc is removed first with tc_del_qdisc().  The new qdisc is
 * requested with NLM_F_EXCL | NLM_F_CREATE, handle 1:0 and parent TC_H_ROOT,
 * kind "hfsc", and a tc_hfsc_qopt that is zeroed before use.  Returns the
 * result of tc_transact().
 *
 * NOTE(review): the handling of a failed tc_make_request() and any fields set
 * in 'opt' after the memset are on lines not visible in this excerpt. */
3036 /* Create an HFSC qdisc.
3038 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3040 hfsc_setup_qdisc__(struct netdev * netdev)
3042 struct tcmsg *tcmsg;
3043 struct ofpbuf request;
3044 struct tc_hfsc_qopt opt;
3046 tc_del_qdisc(netdev);
3048 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3049 NLM_F_EXCL | NLM_F_CREATE, &request);
3055 tcmsg->tcm_handle = tc_make_handle(1, 0);
3056 tcmsg->tcm_parent = TC_H_ROOT;
3058 memset(&opt, 0, sizeof opt);
3061 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3062 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3064 return tc_transact(&request, NULL);
/* Creates or replaces HFSC class 'handle' under 'parent' on 'netdev' using
 * the rates in 'class'.
 *
 * The request is RTM_NEWTCLASS with NLM_F_CREATE.  Two service curves are
 * built: 'min' with m2 = class->min_rate and 'max' with
 * m2 = class->max_rate.  'min' is sent as both TCA_HFSC_RSC and
 * TCA_HFSC_FSC, and 'max' as TCA_HFSC_USC, nested inside TCA_OPTIONS.  A
 * rate-limited warning is logged when the transaction fails.
 *
 * NOTE(review): the remaining service-curve fields, the handling of a failed
 * tc_make_request() and the return statement are on lines not visible in
 * this excerpt. */
3067 /* Create an HFSC class.
3069 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3070 * sc rate <min_rate> ul rate <max_rate>" */
3072 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3073 unsigned int parent, struct hfsc_class *class)
3077 struct tcmsg *tcmsg;
3078 struct ofpbuf request;
3079 struct tc_service_curve min, max;
3081 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3087 tcmsg->tcm_handle = handle;
3088 tcmsg->tcm_parent = parent;
3092 min.m2 = class->min_rate;
3096 max.m2 = class->max_rate;
3098 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3099 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3100 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3101 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3102 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3103 nl_msg_end_nested(&request, opt_offset);
3105 error = tc_transact(&request, NULL);
3107 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3108 "min-rate %ubps, max-rate %ubps (%s)",
3109 netdev_get_name(netdev),
3110 tc_get_major(handle), tc_get_minor(handle),
3111 tc_get_major(parent), tc_get_minor(parent),
3112 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev': sets up the qdisc with hfsc_setup_qdisc__(),
 * derives the root class parameters from 'details', creates class 1:fffe
 * under 1:0 with them, and records the resulting max rate through
 * hfsc_install__().
 *
 * NOTE(review): the error checks between the steps and the return statement
 * are on lines not visible in this excerpt. */
3119 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3122 struct hfsc_class class;
3124 error = hfsc_setup_qdisc__(netdev);
3130 hfsc_parse_qdisc_details__(netdev, details, &class);
3131 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3132 tc_make_handle(1, 0), &class);
3138 hfsc_install__(netdev, class.max_rate);
/* Rebuilds 'netdev''s HFSC state from the kernel: queries class 1:fffe for
 * the qdisc-wide max rate, installs the HFSC state with it, then dumps the
 * queues and records every class message that hfsc_parse_tcmsg__() accepts.
 *
 * NOTE(review): the result of hfsc_query_class__() is discarded here, so
 * 'hc.max_rate' is whatever it held before the call if the query fails;
 * whether it is initialized beforehand is not visible in this excerpt. */
3143 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3146 struct nl_dump dump;
3147 struct hfsc_class hc;
3150 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3151 hfsc_install__(netdev, hc.max_rate);
3153 if (!start_queue_dump(netdev, &dump)) {
3157 while (nl_dump_next(&dump, &msg)) {
3158 unsigned int queue_id;
/* hfsc_parse_tcmsg__() returns 0 on success; failed messages are skipped. */
3160 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3161 hfsc_update_queue__(netdev, queue_id, &hc);
3165 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc': walks every queue in
 * 'hfsc->tc.queues' with the removal-safe iterator and unlinks it from the
 * map.
 *
 * NOTE(review): the lines that release the memory are not visible in this
 * excerpt. */
3170 hfsc_tc_destroy(struct tc *tc)
3173 struct hfsc_class *hc, *next;
3175 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3177 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3178 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds a "max-rate" entry to 'details' holding 8 times the qdisc's stored
 * 'max_rate', formatted as a decimal string.  The string is allocated by
 * xasprintf(). */
3187 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3189 const struct hfsc *hfsc;
3190 hfsc = hfsc_get__(netdev);
3191 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures the HFSC root class from 'details': parses them into a
 * temporary hfsc_class, applies it to class 1:fffe under 1:0, and stores the
 * new max rate in 'netdev''s HFSC state.
 *
 * NOTE(review): the error check guarding the final store is on a line not
 * visible in this excerpt. */
3196 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3199 struct hfsc_class class;
3201 hfsc_parse_qdisc_details__(netdev, details, &class);
3202 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3203 tc_make_handle(1, 0), &class);
3206 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the configuration of 'queue' into 'details': "min-rate" as 8 times
 * the cached min rate, and "max-rate" (8 times the cached max rate) only
 * when it differs from the min rate.  Values are decimal strings allocated
 * by xasprintf(). */
3213 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3214 const struct tc_queue *queue, struct shash *details)
3216 const struct hfsc_class *hc;
3218 hc = hfsc_class_cast__(queue);
3219 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3220 if (hc->min_rate != hc->max_rate) {
3221 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Configures queue 'queue_id' from 'details': parses them, applies the
 * result to class 1:(queue_id + 1) under parent 1:fffe, and updates the
 * cached queue entry.
 *
 * NOTE(review): the error checks between the steps are on lines not visible
 * in this excerpt. */
3227 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3228 const struct shash *details)
3231 struct hfsc_class class;
3233 error = hfsc_parse_class_details__(netdev, details, &class);
3238 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3239 tc_make_handle(1, 0xfffe), &class);
3244 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes the kernel class 1:(queue_id + 1) that backs 'queue' and unlinks
 * the queue from 'netdev''s HFSC queue map.
 *
 * NOTE(review): the error check guarding the unlink, and any freeing of the
 * class, are on lines not visible in this excerpt. */
3249 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3253 struct hfsc_class *hc;
3255 hc = hfsc_class_cast__(queue);
3256 hfsc = hfsc_get__(netdev);
3258 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3260 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying class
 * 1:(queue_id + 1) under parent 1:fffe.  NULL is passed for the options
 * argument, so only the statistics are requested.  Returns the result of
 * hfsc_query_class__(). */
3267 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3268 struct netdev_queue_stats *stats)
3270 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3271 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message from a statistics dump: extracts the handle and
 * statistics from 'nlmsg' and, when the handle is 1:N with
 * 1 <= N <= HFSC_N_QUEUES, invokes 'cb' with queue ID N - 1, the statistics
 * and 'aux'.
 *
 * NOTE(review): the error check after tc_parse_class() and the return
 * statement are on lines not visible in this excerpt. */
3275 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3276 const struct ofpbuf *nlmsg,
3277 netdev_dump_queue_stats_cb *cb, void *aux)
3279 struct netdev_queue_stats stats;
3280 unsigned int handle, major, minor;
3283 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3288 major = tc_get_major(handle);
3289 minor = tc_get_minor(handle);
3290 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3291 (*cb)(minor - 1, &stats, aux);
/* Operations table for HFSC: Linux qdisc name "hfsc", OVS name "linux-hfsc",
 * HFSC_N_QUEUES queues, and one hfsc_* callback for every operation. */
3296 static const struct tc_ops tc_ops_hfsc = {
3297 "hfsc", /* linux_name */
3298 "linux-hfsc", /* ovs_name */
3299 HFSC_N_QUEUES, /* n_queues */
3300 hfsc_tc_install, /* tc_install */
3301 hfsc_tc_load, /* tc_load */
3302 hfsc_tc_destroy, /* tc_destroy */
3303 hfsc_qdisc_get, /* qdisc_get */
3304 hfsc_qdisc_set, /* qdisc_set */
3305 hfsc_class_get, /* class_get */
3306 hfsc_class_set, /* class_set */
3307 hfsc_class_delete, /* class_delete */
3308 hfsc_class_get_stats, /* class_get_stats */
3309 hfsc_class_dump_stats /* class_dump_stats */
3312 /* "linux-default" traffic control class.
3314 * This class represents the default, unnamed Linux qdisc. It corresponds to
3315 * the "" (empty string) QoS type in the OVS database. */
/* Attaches the "default" tc object to 'netdev''s device.
 *
 * 'tc' is a function-local static pointer, so the object it points to
 * outlives the call and is presumably shared by every device that uses the
 * default qdisc.
 *
 * NOTE(review): the condition guarding the allocation is on a line not
 * visible in this excerpt. */
3318 default_install__(struct netdev *netdev)
3320 struct netdev_dev_linux *netdev_dev =
3321 netdev_dev_linux_cast(netdev_get_dev(netdev));
3322 static struct tc *tc;
3325 tc = xmalloc(sizeof *tc);
3326 tc_init(tc, &tc_ops_default);
3328 netdev_dev->tc = tc;
/* Install callback for the default qdisc: ignores 'details' and attaches the
 * default tc object to 'netdev'.  (The return statement is on a line not
 * visible in this excerpt.) */
3332 default_tc_install(struct netdev *netdev,
3333 const struct shash *details OVS_UNUSED)
3335 default_install__(netdev);
/* Load callback for the default qdisc: ignores 'nlmsg' and attaches the
 * default tc object to 'netdev'.  (The return statement is on a line not
 * visible in this excerpt.) */
3340 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3342 default_install__(netdev);
/* Operations table for the default qdisc.  It has no Linux qdisc name and,
 * among the entries visible in this excerpt, no destroy, qdisc or class
 * callbacks. */
3346 static const struct tc_ops tc_ops_default = {
3347 NULL, /* linux_name */
3352 NULL, /* tc_destroy */
3353 NULL, /* qdisc_get */
3354 NULL, /* qdisc_set */
3355 NULL, /* class_get */
3356 NULL, /* class_set */
3357 NULL, /* class_delete */
3358 NULL, /* class_get_stats */
3359 NULL /* class_dump_stats */
/* other_tc_load(): load callback for qdiscs handled by the "linux-other"
 * operations table.  It ignores 'nlmsg' and attaches a tc object initialized
 * with tc_ops_other to 'netdev''s device.  'tc' is a function-local static
 * pointer, so the object outlives the call.
 *
 * NOTE(review): the condition guarding the allocation and the return
 * statement are on lines not visible in this excerpt, as is the end of the
 * section comment that follows. */
3362 /* "linux-other" traffic control class.
3367 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3369 struct netdev_dev_linux *netdev_dev =
3370 netdev_dev_linux_cast(netdev_get_dev(netdev));
3371 static struct tc *tc;
3374 tc = xmalloc(sizeof *tc);
3375 tc_init(tc, &tc_ops_other);
3377 netdev_dev->tc = tc;
3381 static const struct tc_ops tc_ops_other = {
3382 NULL, /* linux_name */
/* Operations table for "linux-other": no Linux qdisc name, no install
 * callback, and no destroy, qdisc or class callbacks.  The n_queues and
 * tc_load entries are on lines not visible in this excerpt. */
3383 "linux-other", /* ovs_name */
3385 NULL, /* tc_install */
3387 NULL, /* tc_destroy */
3388 NULL, /* qdisc_get */
3389 NULL, /* qdisc_set */
3390 NULL, /* class_get */
3391 NULL, /* class_set */
3392 NULL, /* class_delete */
3393 NULL, /* class_get_stats */
3394 NULL /* class_dump_stats */
3397 /* Traffic control. */
3399 /* Number of kernel "tc" ticks per second. */
3400 static double ticks_per_s;
3402 /* Number of kernel "jiffies" per second. This is used for the purpose of
3403 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3404 * one jiffy's worth of data.
3406 * There are two possibilities here:
3408 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3409 * approximate range of 100 to 1024. That means that we really need to
3410 * make sure that the qdisc can buffer that much data.
3412 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3413 * has finely granular timers and there's no need to fudge additional room
3414 * for buffers. (There's no extra effort needed to implement that: the
3415 * large 'buffer_hz' is used as a divisor, so practically any number will
3416 * come out as 0 in the division. Small integer results in the case of
3417 * really high dividends won't have any real effect anyhow.)
3419 static unsigned int buffer_hz;
/* The handle is built with TC_H_MAKE() from 'major' shifted left by 16 bits
 * and 'minor'. */
3421 /* Returns tc handle 'major':'minor'. */
3423 tc_make_handle(unsigned int major, unsigned int minor)
3425 return TC_H_MAKE(major << 16, minor);
/* Inverse of the major part of tc_make_handle(): TC_H_MAJ() selects the
 * major bits, which are then shifted down by 16. */
3428 /* Returns the major number from 'handle'. */
3430 tc_get_major(unsigned int handle)
3432 return TC_H_MAJ(handle) >> 16;
/* Inverse of the minor part of tc_make_handle(): the result is whatever
 * TC_H_MIN() selects from 'handle'. */
3435 /* Returns the minor number from 'handle'. */
3437 tc_get_minor(unsigned int handle)
3439 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request of the given 'type' for 'netdev'
 * in 'request'.
 *
 * Looks up the device's ifindex, initializes 'request' with 512 bytes of
 * initial space, appends a Netlink header carrying NLM_F_REQUEST | 'flags',
 * and appends a zeroed struct tcmsg whose family is AF_UNSPEC and whose
 * ifindex is the device's.  The caller must still set 'tcm_handle' and
 * 'tcm_parent'.
 *
 * NOTE(review): the handling of a get_ifindex() failure and the return
 * statement are on lines not visible in this excerpt; callers in this file
 * use the returned struct tcmsg pointer. */
3442 static struct tcmsg *
3443 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3444 struct ofpbuf *request)
3446 struct tcmsg *tcmsg;
3450 error = get_ifindex(netdev, &ifindex);
3455 ofpbuf_init(request, 512);
3456 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3457 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3458 tcmsg->tcm_family = AF_UNSPEC;
3459 tcmsg->tcm_ifindex = ifindex;
3460 /* Caller should fill in tcmsg->tcm_handle. */
3461 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over 'rtnl_sock' with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request' whether or not the
 * transaction succeeded.  (The return statement is on a line not visible in
 * this excerpt.) */
3467 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3469 int error = nl_sock_transact(rtnl_sock, request, replyp);
3470 ofpbuf_uninit(request);
/* Body of the routine that reads /proc/net/psched (its signature line is not
 * visible in this excerpt).
 *
 * It opens the file, reads four hexadecimal values a, b, c and d with
 * fscanf(), logs them, and computes ticks_per_s as a * c / b in floating
 * point.  Warnings are logged when the file cannot be opened, when fewer
 * than four values are read, and when the parameters are invalid or
 * unexpected.
 *
 * NOTE(review): the validity conditions, the assignment of 'buffer_hz' and
 * the closing of the stream are on lines not visible in this excerpt. */
3477 /* The values in psched are not individually very meaningful, but they are
3478 * important. The tables below show some values seen in the wild.
3482 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3483 * (Before that, there are hints that it was 1000000000.)
3485 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3489 * -----------------------------------
3490 * [1] 000c8000 000f4240 000f4240 00000064
3491 * [2] 000003e8 00000400 000f4240 3b9aca00
3492 * [3] 000003e8 00000400 000f4240 3b9aca00
3493 * [4] 000003e8 00000400 000f4240 00000064
3494 * [5] 000003e8 00000040 000f4240 3b9aca00
3495 * [6] 000003e8 00000040 000f4240 000000f9
3497 * a b c d ticks_per_s buffer_hz
3498 * ------- --------- ---------- ------------- ----------- -------------
3499 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3500 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3501 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3502 * [4] 1,000 1,024 1,000,000 100 976,562 100
3503 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3504 * [6] 1,000 64 1,000,000 249 15,625,000 249
3506 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3507 * [2] 2.6.26-1-686-bigmem from Debian lenny
3508 * [3] 2.6.26-2-sparc64 from Debian lenny
3509 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3510 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3511 * [6] 2.6.34 from kernel.org on KVM
3513 static const char fn[] = "/proc/net/psched";
3514 unsigned int a, b, c, d;
3520 stream = fopen(fn, "r");
3522 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3526 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3527 VLOG_WARN("%s: read failed", fn);
3531 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3535 VLOG_WARN("%s: invalid scheduler parameters", fn);
3539 ticks_per_s = (double) a * c / b;
3543 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3546 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
/* The product is formed in double precision.  'rate' and 'ticks' are both
 * unsigned int, so multiplying them directly wraps modulo UINT_MAX + 1
 * before the division by 'ticks_per_s', which yields a wrong byte count for
 * realistic rates and buffer sizes.
 *
 * NOTE(review): the lines between the signature and the return statement are
 * not visible in this excerpt. */
3549 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3550 * rate of 'rate' bytes per second. */
3552 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3557 return ((double) rate * ticks) / ticks_per_s;
/* A 'rate' of 0 yields 0 rather than a division by zero.  'ticks_per_s' is
 * truncated to an unsigned long long before the multiplication, so the
 * product with 'size' is computed in integer arithmetic.
 *
 * NOTE(review): the lines between the signature and the return statement are
 * not visible in this excerpt. */
3560 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3561 * rate of 'rate' bytes per second. */
3563 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3568 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
/* The result is 'rate' divided by 'buffer_hz' in integer arithmetic.
 *
 * NOTE(review): 'buffer_hz' must be nonzero at this point; the lines between
 * the signature and the return statement, which presumably ensure that, are
 * not visible in this excerpt. */
3571 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3572 * a transmission rate of 'rate' bytes per second. */
3574 tc_buffer_per_jiffy(unsigned int rate)
3579 return rate / buffer_hz;
/* Parsing starts after the Netlink header and the struct tcmsg.  The policy
 * requires TCA_KIND to be a string and accepts TCA_OPTIONS as an optional
 * nested attribute; a rate-limited warning is logged when the message does
 * not satisfy it.
 *
 * NOTE(review): the null checks on 'kind' and 'options', and the error path
 * taken after the warning, are on lines not visible in this excerpt. */
3582 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3583 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3584 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3585 * stores NULL into it if it is absent.
3587 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3590 * Returns 0 if successful, otherwise a positive errno value. */
3592 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3593 struct nlattr **options)
3595 static const struct nl_policy tca_policy[] = {
3596 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3597 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3599 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3601 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3602 tca_policy, ta, ARRAY_SIZE(ta))) {
3603 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3608 *kind = nl_attr_get_string(ta[TCA_KIND]);
3612 *options = ta[TCA_OPTIONS];
/* NOTE(review): the comment below speaks of '*queue_id', but the parameter
 * is 'handlep' and it receives the raw 'tcm_handle' from the message; callers
 * in this file derive the queue ID from the handle's minor number.
 *
 * Both TCA_OPTIONS and TCA_STATS2 are required nested attributes.  For the
 * statistics, TCA_STATS_BASIC and TCA_STATS_QUEUE are required inside
 * TCA_STATS2: tx_bytes and tx_packets come from a local copy of the basic
 * statistics, and tx_errors from the queue statistics' 'drops' field.
 * 'stats' is zeroed on one path whose condition is not visible in this
 * excerpt.
 *
 * NOTE(review): the null checks on the output arguments and the error paths
 * are on lines not visible in this excerpt. */
3627 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3628 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3629 * into '*options', and its queue statistics into '*stats'. Any of the output
3630 * arguments may be null.
3632 * Returns 0 if successful, otherwise a positive errno value. */
3634 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3635 struct nlattr **options, struct netdev_queue_stats *stats)
3637 static const struct nl_policy tca_policy[] = {
3638 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3639 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3641 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3643 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3644 tca_policy, ta, ARRAY_SIZE(ta))) {
3645 VLOG_WARN_RL(&rl, "failed to parse class message");
3650 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3651 *handlep = tc->tcm_handle;
3655 *options = ta[TCA_OPTIONS];
3659 const struct gnet_stats_queue *gsq;
3660 struct gnet_stats_basic gsb;
3662 static const struct nl_policy stats_policy[] = {
3663 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3664 .min_len = sizeof gsb },
3665 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3666 .min_len = sizeof *gsq },
3668 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3670 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3671 sa, ARRAY_SIZE(sa))) {
3672 VLOG_WARN_RL(&rl, "failed to parse class stats");
3676 /* Alignment issues screw up the length of struct gnet_stats_basic on
3677 * some arch/bitsize combinations. Newer versions of Linux have a
3678 * struct gnet_stats_basic_packed, but we can't depend on that. The
3679 * easiest thing to do is just to make a copy. */
3680 memset(&gsb, 0, sizeof gsb);
3681 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3682 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3683 stats->tx_bytes = gsb.bytes;
3684 stats->tx_packets = gsb.packets;
3686 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3687 stats->tx_errors = gsq->drops;
3697 memset(stats, 0, sizeof *stats);
/* tc_query_class(): builds an RTM_GETTCLASS request with NLM_F_ECHO for
 * 'netdev', sets its handle and parent to 'handle' and 'parent', and sends
 * it with tc_transact(), which stores the reply through 'replyp'.  A
 * rate-limited warning naming the device and both handles is logged on
 * failure.
 *
 * NOTE(review): the rest of the original header comment, the handling of a
 * failed tc_make_request() and the return statement are on lines not visible
 * in this excerpt. */
3702 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3705 tc_query_class(const struct netdev *netdev,
3706 unsigned int handle, unsigned int parent,
3707 struct ofpbuf **replyp)
3709 struct ofpbuf request;
3710 struct tcmsg *tcmsg;
3713 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3717 tcmsg->tcm_handle = handle;
3718 tcmsg->tcm_parent = parent;
3720 error = tc_transact(&request, replyp);
3722 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3723 netdev_get_name(netdev),
3724 tc_get_major(handle), tc_get_minor(handle),
3725 tc_get_major(parent), tc_get_minor(parent),
/* Builds an RTM_DELTCLASS request with no extra flags for 'netdev', with the
 * given 'handle' and a parent of 0, and sends it without asking for a reply.
 * A rate-limited warning naming the device and the handle is logged on
 * failure.
 *
 * NOTE(review): the handling of a failed tc_make_request() and the return
 * statement are on lines not visible in this excerpt. */
3731 /* Equivalent to "tc class del dev <name> handle <handle>". */
3733 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3735 struct ofpbuf request;
3736 struct tcmsg *tcmsg;
3739 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3743 tcmsg->tcm_handle = handle;
3744 tcmsg->tcm_parent = 0;
3746 error = tc_transact(&request, NULL);
3748 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3749 netdev_get_name(netdev),
3750 tc_get_major(handle), tc_get_minor(handle),
/* Sends RTM_DELQDISC for handle 1:0 with parent TC_H_ROOT.  EINVAL from the
 * kernel gets special treatment, as explained in the comment inside.  When
 * there is no error and the device has a cached tc object, its tc_destroy
 * callback is invoked if it has one and the device's 'tc' pointer is
 * cleared.
 *
 * NOTE(review): what is done with 'error' in the EINVAL case, the handling
 * of a failed tc_make_request() and the return statement are on lines not
 * visible in this excerpt. */
3756 /* Equivalent to "tc qdisc del dev <name> root". */
3758 tc_del_qdisc(struct netdev *netdev)
3760 struct netdev_dev_linux *netdev_dev =
3761 netdev_dev_linux_cast(netdev_get_dev(netdev));
3762 struct ofpbuf request;
3763 struct tcmsg *tcmsg;
3766 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3770 tcmsg->tcm_handle = tc_make_handle(1, 0);
3771 tcmsg->tcm_parent = TC_H_ROOT;
3773 error = tc_transact(&request, NULL);
3774 if (error == EINVAL) {
3775 /* EINVAL probably means that the default qdisc was in use, in which
3776 * case we've accomplished our purpose. */
3779 if (!error && netdev_dev->tc) {
3780 if (netdev_dev->tc->ops->tc_destroy) {
3781 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3783 netdev_dev->tc = NULL;
/* Outline of the steps below:
 *
 *   1. If the device already has a cached tc object, nothing needs to be
 *      queried.
 *   2. An RTM_GETQDISC request for handle 1:0 is sent.
 *   3. An operations table is chosen: the one matching the qdisc's kind when
 *      tc_lookup_linux_name() knows it, tc_ops_other for an unknown kind,
 *      tc_ops_default when the kernel answers ENOENT, and tc_ops_other (with
 *      a warning) for any other failure.
 *   4. The table's tc_load callback is invoked; an assertion checks that it
 *      reported success exactly when it attached a tc object.  The reply is
 *      then freed.
 *
 * The query error takes precedence over the load error in the return value.
 *
 * NOTE(review): several branch conditions are on lines not visible in this
 * excerpt, so the mapping in step 3 is read from the visible assignments and
 * comments only. */
3788 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3789 * kernel to determine what they are. Returns 0 if successful, otherwise a
3790 * positive errno value. */
3792 tc_query_qdisc(const struct netdev *netdev)
3794 struct netdev_dev_linux *netdev_dev =
3795 netdev_dev_linux_cast(netdev_get_dev(netdev));
3796 struct ofpbuf request, *qdisc;
3797 const struct tc_ops *ops;
3798 struct tcmsg *tcmsg;
3802 if (netdev_dev->tc) {
3806 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3807 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3808 * 2.6.35 without that fix backported to it.
3810 * To avoid the OOPS, we must not make a request that would attempt to dump
3811 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3812 * few others. There are a few ways that I can see to do this, but most of
3813 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3814 * technique chosen here is to assume that any non-default qdisc that we
3815 * create will have a class with handle 1:0. The built-in qdiscs only have
3816 * a class with handle 0:0.
3818 * We could check for Linux 2.6.35+ and use a more straightforward method
3820 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3824 tcmsg->tcm_handle = tc_make_handle(1, 0);
3825 tcmsg->tcm_parent = 0;
3827 /* Figure out what tc class to instantiate. */
3828 error = tc_transact(&request, &qdisc);
3832 error = tc_parse_qdisc(qdisc, &kind, NULL);
3834 ops = &tc_ops_other;
3836 ops = tc_lookup_linux_name(kind);
3838 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3839 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3841 ops = &tc_ops_other;
3844 } else if (error == ENOENT) {
3845 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3846 * other entity that doesn't have a handle 1:0. We will assume
3847 * that it's the system default qdisc. */
3848 ops = &tc_ops_default;
3851 /* Who knows? Maybe the device got deleted. */
3852 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3853 netdev_get_name(netdev), strerror(error));
3854 ops = &tc_ops_other;
3857 /* Instantiate it. */
3858 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3859 assert((load_error == 0) == (netdev_dev->tc != NULL));
3860 ofpbuf_delete(qdisc);
3862 return error ? error : load_error;
/* One path substitutes ETH_PAYLOAD_MAX for 'mtu'.  The Ethernet and VLAN
 * header lengths are then added, and 'cell_log' is incremented for as long
 * as the working value is at least 256.
 *
 * NOTE(review): the condition for the substitution, the loop body that
 * shrinks the working value and the return statement are on lines not
 * visible in this excerpt. */
3865 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3866 approximate the time to transmit packets of various lengths. For an MTU of
3867 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3868 represents two possible packet lengths; for a MTU of 513 through 1024, four
3869 possible lengths; and so on.
3871 Returns, for the specified 'mtu', the number of bits that packet lengths
3872 need to be shifted right to fit within such a 256-entry table. */
3874 tc_calc_cell_log(unsigned int mtu)
3879 mtu = ETH_PAYLOAD_MAX;
3881 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3883 for (cell_log = 0; mtu >= 256; cell_log++) {
/* tc_fill_rate(): zeroes '*rate', sets its 'cell_log' from
 * tc_calc_cell_log(mtu) and its 'mpu' to ETH_TOTAL_MIN.  The 'overhead' and
 * 'cell_align' fields are left at zero by the memset, as the commented-out
 * assignments record.
 *
 * NOTE(review): the end of the original header comment and the line that
 * stores 'Bps' are not visible in this excerpt. */
3890 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3893 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3895 memset(rate, 0, sizeof *rate);
3896 rate->cell_log = tc_calc_cell_log(mtu);
3897 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3898 /* rate->cell_align = 0; */ /* distro headers. */
3899 rate->mpu = ETH_TOTAL_MIN;
/* The table occupies TC_RTAB_SIZE bytes of uninitialized attribute payload.
 * Entry i is the number of ticks needed to send a packet of
 * (i + 1) << cell_log bytes at 'rate->rate', with the packet size raised to
 * 'rate->mpu' when it would be smaller.
 *
 * NOTE(review): the declarations of 'rtab' and 'i' are on lines not visible
 * in this excerpt. */
3903 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3904 * attribute of the specified "type".
3906 * See tc_calc_cell_log() above for a description of "rtab"s. */
3908 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3913 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3914 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3915 unsigned packet_size = (i + 1) << rate->cell_log;
3916 if (packet_size < rate->mpu) {
3917 packet_size = rate->mpu;
3919 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Computes the value for 'buffer' or 'cbuffer' in HTB options, given a rate
 * of 'Bps' bytes per second, the specified 'mtu', and a user-requested burst
 * size of 'burst_bytes'.
 *
 * The burst is never allowed to fall below tc_buffer_per_jiffy(Bps) + 'mtu',
 * so passing 0 for 'burst_bytes' simply selects that minimum. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t effective_burst = burst_bytes;

    if (effective_burst < floor_bytes) {
        effective_burst = floor_bytes;
    }
    return tc_bytes_to_ticks(Bps, effective_burst);
}
3934 /* Linux-only functions declared in netdev-linux.h */
3936 /* Returns a fd for an AF_INET socket or a negative errno value. */
3938 netdev_linux_get_af_inet_sock(void)
3940 int error = netdev_linux_init();
3941 return error ? -error : af_inet_sock;
3944 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
3945 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
3947 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
3948 const char *flag_name, bool enable)
3950 const char *netdev_name = netdev_get_name(netdev);
3951 struct ethtool_value evalue;
3955 memset(&evalue, 0, sizeof evalue);
3956 error = netdev_linux_do_ethtool(netdev_name,
3957 (struct ethtool_cmd *)&evalue,
3958 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
3963 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
3964 error = netdev_linux_do_ethtool(netdev_name,
3965 (struct ethtool_cmd *)&evalue,
3966 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
3971 memset(&evalue, 0, sizeof evalue);
3972 error = netdev_linux_do_ethtool(netdev_name,
3973 (struct ethtool_cmd *)&evalue,
3974 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
3979 if (new_flags != evalue.data) {
3980 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
3981 "device %s failed", enable ? "enable" : "disable",
3982 flag_name, netdev_name);
3989 /* Utility functions. */
3991 /* Copies 'src' into 'dst', performing format conversion in the process. */
3993 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
3994 const struct rtnl_link_stats *src)
3996 dst->rx_packets = src->rx_packets;
3997 dst->tx_packets = src->tx_packets;
3998 dst->rx_bytes = src->rx_bytes;
3999 dst->tx_bytes = src->tx_bytes;
4000 dst->rx_errors = src->rx_errors;
4001 dst->tx_errors = src->tx_errors;
4002 dst->rx_dropped = src->rx_dropped;
4003 dst->tx_dropped = src->tx_dropped;
4004 dst->multicast = src->multicast;
4005 dst->collisions = src->collisions;
4006 dst->rx_length_errors = src->rx_length_errors;
4007 dst->rx_over_errors = src->rx_over_errors;
4008 dst->rx_crc_errors = src->rx_crc_errors;
4009 dst->rx_frame_errors = src->rx_frame_errors;
4010 dst->rx_fifo_errors = src->rx_fifo_errors;
4011 dst->rx_missed_errors = src->rx_missed_errors;
4012 dst->tx_aborted_errors = src->tx_aborted_errors;
4013 dst->tx_carrier_errors = src->tx_carrier_errors;
4014 dst->tx_fifo_errors = src->tx_fifo_errors;
4015 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4016 dst->tx_window_errors = src->tx_window_errors;
4020 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4022 /* Policy for RTNLGRP_LINK messages.
4024 * There are *many* more fields in these messages, but currently we only
4025 * care about these fields. */
4026 static const struct nl_policy rtnlgrp_link_policy[] = {
4027 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4028 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4029 .min_len = sizeof(struct rtnl_link_stats) },
4032 struct ofpbuf request;
4033 struct ofpbuf *reply;
4034 struct ifinfomsg *ifi;
4035 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4038 ofpbuf_init(&request, 0);
4039 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4040 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4041 ifi->ifi_family = PF_UNSPEC;
4042 ifi->ifi_index = ifindex;
4043 error = nl_sock_transact(rtnl_sock, &request, &reply);
4044 ofpbuf_uninit(&request);
4049 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4050 rtnlgrp_link_policy,
4051 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4052 ofpbuf_delete(reply);
4056 if (!attrs[IFLA_STATS]) {
4057 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4058 ofpbuf_delete(reply);
4062 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4064 ofpbuf_delete(reply);
4070 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4072 static const char fn[] = "/proc/net/dev";
4077 stream = fopen(fn, "r");
4079 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4084 while (fgets(line, sizeof line, stream)) {
4087 #define X64 "%"SCNu64
4090 X64 X64 X64 X64 X64 X64 X64 "%*u"
4091 X64 X64 X64 X64 X64 X64 X64 "%*u",
4097 &stats->rx_fifo_errors,
4098 &stats->rx_frame_errors,
4104 &stats->tx_fifo_errors,
4106 &stats->tx_carrier_errors) != 15) {
4107 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4108 } else if (!strcmp(devname, netdev_name)) {
4109 stats->rx_length_errors = UINT64_MAX;
4110 stats->rx_over_errors = UINT64_MAX;
4111 stats->rx_crc_errors = UINT64_MAX;
4112 stats->rx_missed_errors = UINT64_MAX;
4113 stats->tx_aborted_errors = UINT64_MAX;
4114 stats->tx_heartbeat_errors = UINT64_MAX;
4115 stats->tx_window_errors = UINT64_MAX;
4121 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4127 get_carrier_via_sysfs(const char *name, bool *carrier)
4138 fn = xasprintf("/sys/class/net/%s/carrier", name);
4139 fd = open(fn, O_RDONLY);
4142 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
4146 retval = read(fd, line, sizeof line);
4149 if (error == EINVAL) {
4150 /* This is the normal return value when we try to check carrier if
4151 * the network device is not up. */
4153 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
4156 } else if (retval == 0) {
4158 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
4162 if (line[0] != '0' && line[0] != '1') {
4164 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)", fn, line[0]);
4167 *carrier = line[0] != '0';
4179 get_flags(const struct netdev *netdev, int *flags)
4184 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4186 *flags = ifr.ifr_flags;
4191 set_flags(struct netdev *netdev, int flags)
4195 ifr.ifr_flags = flags;
4196 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4201 do_get_ifindex(const char *netdev_name)
4205 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4206 COVERAGE_INC(netdev_get_ifindex);
4207 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4208 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4209 netdev_name, strerror(errno));
4212 return ifr.ifr_ifindex;
4216 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4218 struct netdev_dev_linux *netdev_dev =
4219 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4221 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4222 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4226 netdev_dev->cache_valid |= VALID_IFINDEX;
4227 netdev_dev->ifindex = ifindex;
4229 *ifindexp = netdev_dev->ifindex;
4234 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4239 memset(&ifr, 0, sizeof ifr);
4240 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4241 COVERAGE_INC(netdev_get_hwaddr);
4242 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4243 /* ENODEV probably means that a vif disappeared asynchronously and
4244 * hasn't been removed from the database yet, so reduce the log level
4245 * to INFO for that case. */
4246 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4247 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4248 netdev_name, strerror(errno));
4251 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4252 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4253 VLOG_WARN("%s device has unknown hardware address family %d",
4254 netdev_name, hwaddr_family);
4256 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4261 set_etheraddr(const char *netdev_name, int hwaddr_family,
4262 const uint8_t mac[ETH_ADDR_LEN])
4266 memset(&ifr, 0, sizeof ifr);
4267 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4268 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4269 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4270 COVERAGE_INC(netdev_set_hwaddr);
4271 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4272 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4273 netdev_name, strerror(errno));
4280 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4281 int cmd, const char *cmd_name)
4285 memset(&ifr, 0, sizeof ifr);
4286 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4287 ifr.ifr_data = (caddr_t) ecmd;
4290 COVERAGE_INC(netdev_ethtool);
4291 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4294 if (errno != EOPNOTSUPP) {
4295 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4296 "failed: %s", cmd_name, name, strerror(errno));
4298 /* The device doesn't support this operation. That's pretty
4299 * common, so there's no point in logging anything. */
4306 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4307 const char *cmd_name)
4309 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4310 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4311 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4319 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4320 int cmd, const char *cmd_name)
4325 ifr.ifr_addr.sa_family = AF_INET;
4326 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4328 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4329 *ip = sin->sin_addr;
4334 /* Returns an AF_PACKET raw socket or a negative errno value. */
4336 af_packet_sock(void)
4338 static int sock = INT_MIN;
4340 if (sock == INT_MIN) {
4341 sock = socket(AF_PACKET, SOCK_RAW, 0);
4343 set_nonblocking(sock);
4346 VLOG_ERR("failed to create packet socket: %s", strerror(errno));