2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
51 #include "dpif-linux.h"
52 #include "dynamic-string.h"
53 #include "fatal-signal.h"
56 #include "netdev-provider.h"
57 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
62 #include "openflow/openflow.h"
64 #include "poll-loop.h"
65 #include "rtnetlink-link.h"
66 #include "socket-util.h"
72 VLOG_DEFINE_THIS_MODULE(netdev_linux);
74 COVERAGE_DEFINE(netdev_get_vlan_vid);
75 COVERAGE_DEFINE(netdev_set_policing);
76 COVERAGE_DEFINE(netdev_arp_lookup);
77 COVERAGE_DEFINE(netdev_get_ifindex);
78 COVERAGE_DEFINE(netdev_get_hwaddr);
79 COVERAGE_DEFINE(netdev_set_hwaddr);
80 COVERAGE_DEFINE(netdev_ethtool);
82 /* These were introduced in Linux 2.6.14, so they might be missing if we have
84 #ifndef ADVERTISED_Pause
85 #define ADVERTISED_Pause (1 << 13)
87 #ifndef ADVERTISED_Asym_Pause
88 #define ADVERTISED_Asym_Pause (1 << 14)
91 /* These were introduced in Linux 2.6.24, so they might be missing if we
92 * have old headers. */
93 #ifndef ETHTOOL_GFLAGS
94 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
96 #ifndef ETHTOOL_SFLAGS
97 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
100 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
103 #define TC_RTAB_SIZE 1024
106 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
107 static int cache_notifier_refcount;
110 VALID_IFINDEX = 1 << 0,
111 VALID_ETHERADDR = 1 << 1,
115 VALID_CARRIER = 1 << 5,
116 VALID_POLICING = 1 << 6,
117 VALID_HAVE_VPORT_STATS = 1 << 7
125 /* Traffic control. */
127 /* An instance of a traffic control class. Always associated with a particular
130 * Each TC implementation subclasses this with whatever additional data it
133 const struct tc_ops *ops;
134 struct hmap queues; /* Contains "struct tc_queue"s.
135 * Read by generic TC layer.
136 * Written only by TC implementation. */
139 /* One traffic control queue.
141 * Each TC implementation subclasses this with whatever additional data it
144 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
145 unsigned int queue_id; /* OpenFlow queue ID. */
148 /* A particular kind of traffic control. Each implementation generally maps to
149 * one particular Linux qdisc class.
151 * The functions below return 0 if successful or a positive errno value on
152 * failure, except where otherwise noted. All of them must be provided, except
153 * where otherwise noted. */
155 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
156 * This is null for tc_ops_default and tc_ops_other, for which there are no
157 * appropriate values. */
158 const char *linux_name;
160 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
161 const char *ovs_name;
163 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
164 * queues. The queues are numbered 0 through n_queues - 1. */
165 unsigned int n_queues;
167 /* Called to install this TC class on 'netdev'. The implementation should
168 * make the Netlink calls required to set up 'netdev' with the right qdisc
169 * and configure it according to 'details'. The implementation may assume
170 * that the current qdisc is the default; that is, there is no need for it
171 * to delete the current qdisc before installing itself.
173 * The contents of 'details' should be documented as valid for 'ovs_name'
174 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
175 * (which is built as ovs-vswitchd.conf.db(8)).
177 * This function must return 0 if and only if it sets 'netdev->tc' to an
178 * initialized 'struct tc'.
180 * (This function is null for tc_ops_other, which cannot be installed. For
181 * other TC classes it should always be nonnull.) */
182 int (*tc_install)(struct netdev *netdev, const struct shash *details);
184 /* Called when the netdev code determines (through a Netlink query) that
185 * this TC class's qdisc is installed on 'netdev', but we didn't install
186 * it ourselves and so don't know any of the details.
188 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
189 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
190 * implementation should parse the other attributes of 'nlmsg' as
191 * necessary to determine its configuration. If necessary it should also
192 * use Netlink queries to determine the configuration of queues on
195 * This function must return 0 if and only if it sets 'netdev->tc' to an
196 * initialized 'struct tc'. */
197 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
199 /* Destroys the data structures allocated by the implementation as part of
200 * 'tc'. (This includes destroying 'tc->queues' by calling
203 * The implementation should not need to perform any Netlink calls. If
204 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
205 * (But it may not be desirable.)
207 * This function may be null if 'tc' is trivial. */
208 void (*tc_destroy)(struct tc *tc);
210 /* Retrieves details of 'netdev->tc' configuration into 'details'.
212 * The implementation should not need to perform any Netlink calls, because
213 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
214 * cached the configuration.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
220 * This function may be null if 'tc' is not configurable.
222 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
224 /* Reconfigures 'netdev->tc' according to 'details', performing any
225 * required Netlink calls to complete the reconfiguration.
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
229 * (which is built as ovs-vswitchd.conf.db(8)).
231 * This function may be null if 'tc' is not configurable.
233 int (*qdisc_set)(struct netdev *, const struct shash *details);
235 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
236 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
238 * The contents of 'details' should be documented as valid for 'ovs_name'
239 * in the "other_config" column in the "Queue" table in
240 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
242 * The implementation should not need to perform any Netlink calls, because
243 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
244 * cached the queue configuration.
246 * This function may be null if 'tc' does not have queues ('n_queues' is
248 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
249 struct shash *details);
251 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
252 * 'details', performing any required Netlink calls to complete the
253 * reconfiguration. The caller ensures that 'queue_id' is less than
256 * The contents of 'details' should be documented as valid for 'ovs_name'
257 * in the "other_config" column in the "Queue" table in
258 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
260 * This function may be null if 'tc' does not have queues or its queues are
261 * not configurable. */
262 int (*class_set)(struct netdev *, unsigned int queue_id,
263 const struct shash *details);
265 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
266 * tc_queue's within 'netdev->tc->queues'.
268 * This function may be null if 'tc' does not have queues or its queues
269 * cannot be deleted. */
270 int (*class_delete)(struct netdev *, struct tc_queue *queue);
272 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
273 * 'struct tc_queue's within 'netdev->tc->queues'.
275 * On success, initializes '*stats'.
277 * This function may be null if 'tc' does not have queues or if it cannot
278 * report queue statistics. */
279 int (*class_get_stats)(const struct netdev *netdev,
280 const struct tc_queue *queue,
281 struct netdev_queue_stats *stats);
283 /* Extracts queue stats from 'nlmsg', which is a response to a
284 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
286 * This function may be null if 'tc' does not have queues or if it cannot
287 * report queue statistics. */
288 int (*class_dump_stats)(const struct netdev *netdev,
289 const struct ofpbuf *nlmsg,
290 netdev_dump_queue_stats_cb *cb, void *aux);
294 tc_init(struct tc *tc, const struct tc_ops *ops)
297 hmap_init(&tc->queues);
301 tc_destroy(struct tc *tc)
303 hmap_destroy(&tc->queues);
306 static const struct tc_ops tc_ops_htb;
307 static const struct tc_ops tc_ops_hfsc;
308 static const struct tc_ops tc_ops_default;
309 static const struct tc_ops tc_ops_other;
311 static const struct tc_ops *tcs[] = {
312 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
313 &tc_ops_hfsc, /* Hierarchical fair service curve. */
314 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
315 &tc_ops_other, /* Some other qdisc. */
319 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
320 static unsigned int tc_get_major(unsigned int handle);
321 static unsigned int tc_get_minor(unsigned int handle);
323 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
324 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
325 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
327 static struct tcmsg *tc_make_request(const struct netdev *, int type,
328 unsigned int flags, struct ofpbuf *);
329 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
331 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
332 struct nlattr **options);
333 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
334 struct nlattr **options,
335 struct netdev_queue_stats *);
336 static int tc_query_class(const struct netdev *,
337 unsigned int handle, unsigned int parent,
338 struct ofpbuf **replyp);
339 static int tc_delete_class(const struct netdev *, unsigned int handle);
341 static int tc_del_qdisc(struct netdev *netdev);
342 static int tc_query_qdisc(const struct netdev *netdev);
344 static int tc_calc_cell_log(unsigned int mtu);
345 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
346 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
347 const struct tc_ratespec *rate);
348 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
350 struct netdev_dev_linux {
351 struct netdev_dev netdev_dev;
353 struct shash_node *shash_node;
354 unsigned int cache_valid;
355 unsigned int change_seq;
357 bool miimon; /* Link status of last poll. */
358 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
359 struct timer miimon_timer;
361 /* The following are figured out "on demand" only. They are only valid
362 * when the corresponding VALID_* bit in 'cache_valid' is set. */
364 uint8_t etheraddr[ETH_ADDR_LEN];
365 struct in_addr address, netmask;
369 uint32_t kbits_rate; /* Policing data. */
370 uint32_t kbits_burst;
371 bool have_vport_stats;
375 struct tap_state tap;
379 struct netdev_linux {
380 struct netdev netdev;
384 /* Sockets used for ioctl operations. */
385 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
387 /* A Netlink routing socket that is not subscribed to any multicast groups. */
388 static struct nl_sock *rtnl_sock;
390 /* This is set pretty low because we probably won't learn anything from the
391 * additional log messages. */
392 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
394 static int netdev_linux_init(void);
396 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
397 int cmd, const char *cmd_name);
398 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
399 const char *cmd_name);
400 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
401 int cmd, const char *cmd_name);
402 static int get_flags(const struct netdev *, int *flagsp);
403 static int set_flags(struct netdev *, int flags);
404 static int do_get_ifindex(const char *netdev_name);
405 static int get_ifindex(const struct netdev *, int *ifindexp);
406 static int do_set_addr(struct netdev *netdev,
407 int ioctl_nr, const char *ioctl_name,
408 struct in_addr addr);
409 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
410 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
411 const uint8_t[ETH_ADDR_LEN]);
412 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
413 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
414 static int af_packet_sock(void);
415 static void netdev_linux_miimon_run(void);
416 static void netdev_linux_miimon_wait(void);
419 is_netdev_linux_class(const struct netdev_class *netdev_class)
421 return netdev_class->init == netdev_linux_init;
/* Returns the 'struct netdev_dev_linux' that embeds 'netdev_dev'.
 *
 * Asserts that the class of 'netdev_dev' is one that
 * is_netdev_linux_class() accepts, i.e. one whose 'init' member is
 * netdev_linux_init, before the CONTAINER_OF conversion is applied. */
424 static struct netdev_dev_linux *
425 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
427 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
428 assert(is_netdev_linux_class(netdev_class));
430 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Returns the 'struct netdev_linux' that embeds 'netdev'.
 *
 * The class is looked up through the netdev's underlying netdev_dev, and the
 * function asserts that is_netdev_linux_class() accepts it before the
 * CONTAINER_OF conversion is applied. */
433 static struct netdev_linux *
434 netdev_linux_cast(const struct netdev *netdev)
436 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
437 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
438 assert(is_netdev_linux_class(netdev_class));
440 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
444 netdev_linux_init(void)
446 static int status = -1;
448 /* Create AF_INET socket. */
449 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
450 status = af_inet_sock >= 0 ? 0 : errno;
452 VLOG_ERR("failed to create inet socket: %s", strerror(status));
455 /* Create rtnetlink socket. */
457 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
459 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
468 netdev_linux_run(void)
470 rtnetlink_link_run();
471 netdev_linux_miimon_run();
475 netdev_linux_wait(void)
477 rtnetlink_link_wait();
478 netdev_linux_miimon_wait();
482 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
485 if (!dev->change_seq) {
488 dev->cache_valid = 0;
492 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
493 void *aux OVS_UNUSED)
495 struct netdev_dev_linux *dev;
497 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
499 const struct netdev_class *netdev_class =
500 netdev_dev_get_class(base_dev);
502 if (is_netdev_linux_class(netdev_class)) {
503 dev = netdev_dev_linux_cast(base_dev);
504 netdev_dev_linux_changed(dev);
508 struct shash device_shash;
509 struct shash_node *node;
511 shash_init(&device_shash);
512 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
513 SHASH_FOR_EACH (node, &device_shash) {
515 netdev_dev_linux_changed(dev);
517 shash_destroy(&device_shash);
521 /* Creates system and internal devices. */
523 netdev_linux_create(const struct netdev_class *class, const char *name,
524 struct netdev_dev **netdev_devp)
526 struct netdev_dev_linux *netdev_dev;
528 if (!cache_notifier_refcount) {
529 assert(!netdev_linux_cache_notifier);
531 netdev_linux_cache_notifier =
532 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
534 if (!netdev_linux_cache_notifier) {
538 cache_notifier_refcount++;
540 netdev_dev = xzalloc(sizeof *netdev_dev);
541 netdev_dev->change_seq = 1;
542 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
544 *netdev_devp = &netdev_dev->netdev_dev;
548 /* For most types of netdevs we open the device for each call of
549 * netdev_open(). However, this is not the case with tap devices,
550 * since it is only possible to open the device once. In this
551 * situation we share a single file descriptor, and consequently
552 * buffers, across all readers. Therefore once data is read it will
553 * be unavailable to other reads for tap devices. */
555 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
556 const char *name, struct netdev_dev **netdev_devp)
558 struct netdev_dev_linux *netdev_dev;
559 struct tap_state *state;
560 static const char tap_dev[] = "/dev/net/tun";
564 netdev_dev = xzalloc(sizeof *netdev_dev);
565 state = &netdev_dev->state.tap;
567 /* Open tap device. */
568 state->fd = open(tap_dev, O_RDWR);
571 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
575 /* Create tap device. */
576 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
577 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
578 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
579 VLOG_WARN("%s: creating tap device failed: %s", name,
585 /* Make non-blocking. */
586 error = set_nonblocking(state->fd);
591 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
592 *netdev_devp = &netdev_dev->netdev_dev;
601 destroy_tap(struct netdev_dev_linux *netdev_dev)
603 struct tap_state *state = &netdev_dev->state.tap;
605 if (state->fd >= 0) {
610 /* Destroys the netdev device 'netdev_dev_'. */
612 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
614 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
615 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
617 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
618 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
621 if (class == &netdev_linux_class || class == &netdev_internal_class) {
622 cache_notifier_refcount--;
624 if (!cache_notifier_refcount) {
625 assert(netdev_linux_cache_notifier);
626 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
627 netdev_linux_cache_notifier = NULL;
629 } else if (class == &netdev_tap_class) {
630 destroy_tap(netdev_dev);
639 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
641 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
642 struct netdev_linux *netdev;
643 enum netdev_flags flags;
646 /* Allocate network device. */
647 netdev = xzalloc(sizeof *netdev);
649 netdev_init(&netdev->netdev, netdev_dev_);
651 /* Verify that the device really exists, by attempting to read its flags.
652 * (The flags might be cached, in which case this won't actually do an
655 * Don't do this for "internal" netdevs, though, because those have to be
656 * created as netdev objects before they exist in the kernel, because
657 * creating them in the kernel happens by passing a netdev object to
658 * dpif_port_add(). */
659 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
660 error = netdev_get_flags(&netdev->netdev, &flags);
661 if (error == ENODEV) {
666 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
667 !netdev_dev->state.tap.opened) {
669 /* We assume that the first user of the tap device is the primary user
670 * and give them the tap FD. Subsequent users probably just expect
671 * this to be a system device so open it normally to avoid send/receive
672 * directions appearing to be reversed. */
673 netdev->fd = netdev_dev->state.tap.fd;
674 netdev_dev->state.tap.opened = true;
677 *netdevp = &netdev->netdev;
681 netdev_uninit(&netdev->netdev, true);
685 /* Closes and destroys 'netdev'. */
687 netdev_linux_close(struct netdev *netdev_)
689 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Descriptor 0 is a valid open file descriptor, so "open" is tested with
 * >= 0, matching every other 'netdev->fd' check in this file.  Only non-tap
 * netdevs are handled here: a tap netdev's fd is the one shared through
 * netdev_dev->state.tap.fd (see netdev_linux_open()). */
691 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
698 netdev_linux_listen(struct netdev *netdev_)
700 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
701 struct sockaddr_ll sll;
706 if (netdev->fd >= 0) {
710 /* Create file descriptor. */
711 fd = socket(PF_PACKET, SOCK_RAW, 0);
714 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
718 /* Set non-blocking mode. */
719 error = set_nonblocking(fd);
724 /* Get ethernet device index. */
725 error = get_ifindex(&netdev->netdev, &ifindex);
730 /* Bind to specific ethernet device. */
731 memset(&sll, 0, sizeof sll);
732 sll.sll_family = AF_PACKET;
733 sll.sll_ifindex = ifindex;
734 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
735 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
737 VLOG_ERR("%s: failed to bind raw socket (%s)",
738 netdev_get_name(netdev_), strerror(error));
/* Reads from 'netdev_''s file descriptor into 'data', a buffer of 'size'
 * bytes.  A netdev whose fd is negative is not listening and is not read.
 * A read that fails with EINTR is not treated as an error; failures other
 * than EINTR and EAGAIN are logged through the rate-limited logger. */
753 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
755 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
757 if (netdev->fd < 0) {
758 /* Device is not listening. */
763 ssize_t retval = read(netdev->fd, data, size);
766 } else if (errno != EINTR) {
767 if (errno != EAGAIN) {
/* The format is "... on <device>: <error>", so the device name must be the
 * first argument and the error text the second (same order as the matching
 * message in netdev_linux_send()). */
768 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
769 netdev_get_name(netdev_), strerror(errno));
776 /* Registers with the poll loop to wake up from the next call to poll_block()
777 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
779 netdev_linux_recv_wait(struct netdev *netdev_)
781 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
782 if (netdev->fd >= 0) {
783 poll_fd_wait(netdev->fd, POLLIN);
787 /* Discards all packets waiting to be received from 'netdev'. */
789 netdev_linux_drain(struct netdev *netdev_)
791 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
792 if (netdev->fd < 0) {
794 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
796 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
797 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
801 drain_fd(netdev->fd, ifr.ifr_qlen);
804 return drain_rcvbuf(netdev->fd);
808 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
809 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
810 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
811 * the packet is too big or too small to transmit on the device.
813 * The caller retains ownership of 'buffer' in all cases.
815 * The kernel maintains a packet transmission queue, so the caller is not
816 * expected to do additional queuing of packets. */
818 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
820 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
824 if (netdev->fd < 0) {
825 /* Use our AF_PACKET socket to send to this device. */
826 struct sockaddr_ll sll;
833 sock = af_packet_sock();
838 error = get_ifindex(netdev_, &ifindex);
843 /* We don't bother setting most fields in sockaddr_ll because the
844 * kernel ignores them for SOCK_RAW. */
845 memset(&sll, 0, sizeof sll);
846 sll.sll_family = AF_PACKET;
847 sll.sll_ifindex = ifindex;
849 iov.iov_base = (void *) data;
853 msg.msg_namelen = sizeof sll;
856 msg.msg_control = NULL;
857 msg.msg_controllen = 0;
860 retval = sendmsg(sock, &msg, 0);
862 /* Use the netdev's own fd to send to this device. This is
863 * essential for tap devices, because packets sent to a tap device
864 * with an AF_PACKET socket will loop back to be *received* again
865 * on the tap device. */
866 retval = write(netdev->fd, data, size);
870 /* The Linux AF_PACKET implementation never blocks waiting for room
871 * for packets, instead returning ENOBUFS. Translate this into
872 * EAGAIN for the caller. */
873 if (errno == ENOBUFS) {
875 } else if (errno == EINTR) {
877 } else if (errno != EAGAIN) {
878 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
879 netdev_get_name(netdev_), strerror(errno));
882 } else if (retval != size) {
883 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
884 "%zu) on %s", retval, size, netdev_get_name(netdev_));
892 /* Registers with the poll loop to wake up from the next call to poll_block()
893 * when the packet transmission queue has sufficient room to transmit a packet
894 * with netdev_send().
896 * The kernel maintains a packet transmission queue, so the client is not
897 * expected to do additional queuing of packets. Thus, this function is
898 * unlikely to ever be used. It is included for completeness. */
900 netdev_linux_send_wait(struct netdev *netdev_)
902 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
903 if (netdev->fd < 0) {
905 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
906 poll_fd_wait(netdev->fd, POLLOUT);
908 /* TAP device always accepts packets.*/
909 poll_immediate_wake();
913 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
914 * otherwise a positive errno value. */
916 netdev_linux_set_etheraddr(struct netdev *netdev_,
917 const uint8_t mac[ETH_ADDR_LEN])
919 struct netdev_dev_linux *netdev_dev =
920 netdev_dev_linux_cast(netdev_get_dev(netdev_));
923 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
924 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
925 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
927 netdev_dev->cache_valid |= VALID_ETHERADDR;
928 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
936 /* Copies 'netdev''s MAC address into 'mac'. The address is looked up with
937 * get_etheraddr() when it is not already cached, and cached afterward. */
939 netdev_linux_get_etheraddr(const struct netdev *netdev_,
940 uint8_t mac[ETH_ADDR_LEN])
942 struct netdev_dev_linux *netdev_dev =
943 netdev_dev_linux_cast(netdev_get_dev(netdev_));
944 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
945 int error = get_etheraddr(netdev_get_name(netdev_),
946 netdev_dev->etheraddr);
950 netdev_dev->cache_valid |= VALID_ETHERADDR;
952 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
956 /* Stores in '*mtup' the maximum size of transmitted (and received) packets on
957 * 'netdev', in bytes, not including the hardware header; thus, this is
958 * typically 1500 bytes for Ethernet devices. */
960 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
962 struct netdev_dev_linux *netdev_dev =
963 netdev_dev_linux_cast(netdev_get_dev(netdev_));
964 if (!(netdev_dev->cache_valid & VALID_MTU)) {
968 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
969 SIOCGIFMTU, "SIOCGIFMTU");
973 netdev_dev->mtu = ifr.ifr_mtu;
974 netdev_dev->cache_valid |= VALID_MTU;
976 *mtup = netdev_dev->mtu;
980 /* Sets the maximum transmission unit (MTU) of 'netdev' to 'mtu' using the
981 * Linux networking ioctl interface (SIOCSIFMTU).
984 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
986 struct netdev_dev_linux *netdev_dev =
987 netdev_dev_linux_cast(netdev_get_dev(netdev_));
992 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
993 SIOCSIFMTU, "SIOCSIFMTU");
998 netdev_dev->mtu = ifr.ifr_mtu;
999 netdev_dev->cache_valid |= VALID_MTU;
1003 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1004 * On failure, returns a negative errno value. */
1006 netdev_linux_get_ifindex(const struct netdev *netdev)
1010 error = get_ifindex(netdev, &ifindex);
1011 return error ? -error : ifindex;
1015 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1017 struct netdev_dev_linux *netdev_dev =
1018 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1023 if (netdev_dev->miimon_interval > 0) {
1024 *carrier = netdev_dev->miimon;
1028 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
1032 fn = xasprintf("/sys/class/net/%s/carrier",
1033 netdev_get_name(netdev_));
1034 fd = open(fn, O_RDONLY);
1037 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1041 retval = read(fd, line, sizeof line);
1044 if (error == EINVAL) {
1045 /* This is the normal return value when we try to check carrier
1046 * if the network device is not up. */
1048 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1051 } else if (retval == 0) {
1053 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
1057 if (line[0] != '0' && line[0] != '1') {
1059 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1063 netdev_dev->carrier = line[0] != '0';
1064 netdev_dev->cache_valid |= VALID_CARRIER;
1066 *carrier = netdev_dev->carrier;
/* Issues the MII ioctl 'cmd' on the device named 'name' through
 * netdev_linux_do_ioctl(); 'cmd_name' is passed along to that helper.
 *
 * '*data' is the request/response payload: it is copied into the ifr_data
 * storage of a zeroed 'struct ifreq' before the ioctl and copied back out of
 * it afterward, so values filled in by the ioctl reach the caller. */
1078 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1079 struct mii_ioctl_data *data)
1084 memset(&ifr, 0, sizeof ifr);
/* The MII payload is stored in place of the ifr_data member itself. */
1085 memcpy(&ifr.ifr_data, data, sizeof *data);
1086 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
/* Hand whatever the ioctl wrote into the ifreq back to the caller. */
1087 memcpy(data, &ifr.ifr_data, sizeof *data);
1093 netdev_linux_get_miimon(const char *name, bool *miimon)
1095 struct mii_ioctl_data data;
1100 memset(&data, 0, sizeof data);
1101 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1103 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1104 data.reg_num = MII_BMSR;
1105 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1109 *miimon = !!(data.val_out & BMSR_LSTATUS);
1111 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1114 struct ethtool_cmd ecmd;
1116 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1119 memset(&ecmd, 0, sizeof ecmd);
1120 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1123 struct ethtool_value eval;
1125 memcpy(&eval, &ecmd, sizeof eval);
1126 *miimon = !!eval.data;
1128 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring poll interval of 'netdev_' to 'interval'.
 *
 * A positive 'interval' is raised to at least 100; any other value is stored
 * as 0, which disables polling (netdev_linux_miimon_run() skips devices whose
 * interval is <= 0).  NOTE(review): the unit is presumably milliseconds --
 * confirm against the timer API. */
1136 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1137 long long int interval)
1139 struct netdev_dev_linux *netdev_dev;
1141 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Clamp: positive values have a floor of 100, everything else becomes 0. */
1143 interval = interval > 0 ? MAX(interval, 100) : 0;
1144 if (netdev_dev->miimon_interval != interval) {
1145 netdev_dev->miimon_interval = interval;
/* Mark the timer expired so that the next netdev_linux_miimon_run() polls
 * this device instead of waiting out the previous interval. */
1146 timer_set_expired(&netdev_dev->miimon_timer);
/* Polls MII link status for the devices that netdev_dev_get_devices() reports
 * for netdev_linux_class.
 *
 * Only devices with a positive 'miimon_interval' whose 'miimon_timer' has
 * expired are polled.  When the polled status differs from the cached
 * 'dev->miimon', the cache is updated and netdev_dev_linux_changed() is
 * called.  The timer is then set to run for 'miimon_interval' again. */
1153 netdev_linux_miimon_run(void)
1155 struct shash device_shash;
1156 struct shash_node *node;
1158 shash_init(&device_shash);
1159 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1160 SHASH_FOR_EACH (node, &device_shash) {
1161 struct netdev_dev_linux *dev = node->data;
/* Monitoring disabled for this device, or its poll is not due yet. */
1164 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1168 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
/* Only a change in link status is reported. */
1169 if (miimon != dev->miimon) {
1170 dev->miimon = miimon;
1171 netdev_dev_linux_changed(dev);
1174 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1177 shash_destroy(&device_shash);
/* Calls timer_wait() on the 'miimon_timer' of each device that
 * netdev_dev_get_devices() reports for netdev_linux_class and that has MII
 * monitoring enabled ('miimon_interval' > 0).  Counterpart of
 * netdev_linux_miimon_run(). */
1181 netdev_linux_miimon_wait(void)
1183 struct shash device_shash;
1184 struct shash_node *node;
1186 shash_init(&device_shash);
1187 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1188 SHASH_FOR_EACH (node, &device_shash) {
1189 struct netdev_dev_linux *dev = node->data;
/* Devices with monitoring disabled have no timer to wait on. */
1191 if (dev->miimon_interval > 0) {
1192 timer_wait(&dev->miimon_timer);
1195 shash_destroy(&device_shash);
1198 /* Checks whether we can use RTM_GETLINK to get network device statistics.
1199 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1202 check_for_working_netlink_stats(void)
1204 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1205 * preferable, so if that works, we'll use it. */
/* The probe is made against the loopback device "lo". */
1206 int ifindex = do_get_ifindex("lo");
1208 VLOG_WARN("failed to get ifindex for lo, "
1209 "obtaining netdev stats from proc");
/* 'stats' is a local scratch buffer; the visible code that follows only
 * looks at 'error' to choose which message to log. */
1212 struct netdev_stats stats;
1213 int error = get_stats_via_netlink(ifindex, &stats);
1215 VLOG_DBG("obtaining netdev stats via rtnetlink");
1218 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1219 "via proc (you are probably running a pre-2.6.19 "
1220 "kernel)", strerror(error));
/* NOTE(review): the body is not visible in this excerpt.  Judging by the name
 * and by its use in netdev_pseudo_get_stats(), it presumably exchanges the
 * values of '*a' and '*b' -- confirm. */
1227 swap_uint64(uint64_t *a, uint64_t *b)
/* Tries to fill 'stats' for 'netdev_' from the vport layer.
 *
 * netdev_vport_get_stats() is called when vport stats are already known to
 * work ('have_vport_stats') or when that has not been determined yet
 * (VALID_HAVE_VPORT_STATS is clear in 'cache_valid').  The outcome is cached:
 * 'have_vport_stats' becomes true exactly when the call returned 0, and
 * VALID_HAVE_VPORT_STATS is set. */
1235 get_stats_via_vport(const struct netdev *netdev_,
1236 struct netdev_stats *stats)
1238 struct netdev_dev_linux *netdev_dev =
1239 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1241 if (netdev_dev->have_vport_stats ||
1242 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1245 error = netdev_vport_get_stats(netdev_, stats);
1247 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed %d",
1248 netdev_get_name(netdev_), error);
1250 netdev_dev->have_vport_stats = !error;
1251 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
/* Fetches the kernel's statistics for 'netdev_' into 'stats'.
 *
 * The first call decides, through check_for_working_netlink_stats(), whether
 * netlink can be used; the answer is kept in the function-local static
 * 'use_netlink_stats'.  With netlink, the device's ifindex is looked up and
 * passed to get_stats_via_netlink(); otherwise get_stats_via_proc() is used
 * with the device name.  A rate-limited warning is logged for a failure. */
1256 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1257 struct netdev_stats *stats)
1259 static int use_netlink_stats = -1;
1262 if (use_netlink_stats < 0) {
1263 use_netlink_stats = check_for_working_netlink_stats();
1266 if (use_netlink_stats) {
1269 error = get_ifindex(netdev_, &ifindex);
1271 error = get_stats_via_netlink(ifindex, stats);
1274 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1278 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1279 netdev_get_name(netdev_), error);
1285 /* Retrieves current device stats for 'netdev-linux'.
 *
 * Vport stats are requested into 'stats' first, then the kernel's stats are
 * read into the local 'dev_stats'.  When vport stats are available, only the
 * error, drop, multicast and collision counters listed below are added from
 * 'dev_stats' on top of them. */
1287 netdev_linux_get_stats(const struct netdev *netdev_,
1288 struct netdev_stats *stats)
1290 struct netdev_dev_linux *netdev_dev =
1291 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1292 struct netdev_stats dev_stats;
1295 get_stats_via_vport(netdev_, stats);
1297 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1300 if (!netdev_dev->have_vport_stats) {
1307 if (!netdev_dev->have_vport_stats) {
1308 /* stats not available from OVS then use ioctl stats. */
1311 stats->rx_errors += dev_stats.rx_errors;
1312 stats->tx_errors += dev_stats.tx_errors;
1313 stats->rx_dropped += dev_stats.rx_dropped;
1314 stats->tx_dropped += dev_stats.tx_dropped;
1315 stats->multicast += dev_stats.multicast;
1316 stats->collisions += dev_stats.collisions;
1317 stats->rx_length_errors += dev_stats.rx_length_errors;
1318 stats->rx_over_errors += dev_stats.rx_over_errors;
1319 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1320 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1321 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1322 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1323 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1324 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1325 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1326 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1327 stats->tx_window_errors += dev_stats.tx_window_errors;
1332 /* Retrieves current device stats for 'netdev-tap' netdev or
1333 * netdev-internal.
 *
 * Vport stats are requested into 'stats' first, then the kernel's stats are
 * read into the local 'dev_stats'.  Without vport stats, rx and tx counters
 * are swapped and the detailed error counters are zeroed; the remaining
 * statements add drop and error counters from 'dev_stats' with rx and tx
 * crossed over, plus multicast and collisions. */
1335 netdev_pseudo_get_stats(const struct netdev *netdev_,
1336 struct netdev_stats *stats)
1338 struct netdev_dev_linux *netdev_dev =
1339 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1340 struct netdev_stats dev_stats;
1343 get_stats_via_vport(netdev_, stats);
1345 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1347 if (!netdev_dev->have_vport_stats) {
1354 /* If this port is an internal port then the transmit and receive stats
1355 * will appear to be swapped relative to the other ports since we are the
1356 * one sending the data, not a remote computer. For consistency, we swap
1357 * them back here. This does not apply if we are getting stats from the
1358 * vport layer because it always tracks stats from the perspective of the
1360 if (!netdev_dev->have_vport_stats) {
1362 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1363 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1364 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1365 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1366 stats->rx_length_errors = 0;
1367 stats->rx_over_errors = 0;
1368 stats->rx_crc_errors = 0;
1369 stats->rx_frame_errors = 0;
1370 stats->rx_fifo_errors = 0;
1371 stats->rx_missed_errors = 0;
1372 stats->tx_aborted_errors = 0;
1373 stats->tx_carrier_errors = 0;
1374 stats->tx_fifo_errors = 0;
1375 stats->tx_heartbeat_errors = 0;
1376 stats->tx_window_errors = 0;
1378 stats->rx_dropped += dev_stats.tx_dropped;
1379 stats->tx_dropped += dev_stats.rx_dropped;
1381 stats->rx_errors += dev_stats.tx_errors;
1382 stats->tx_errors += dev_stats.rx_errors;
1384 stats->multicast += dev_stats.multicast;
1385 stats->collisions += dev_stats.collisions;
1390 /* Stores the features supported by 'netdev' into each of '*current',
1391 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1392 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1393 * successful, otherwise a positive errno value.
 *
 * The link settings are read with one ETHTOOL_GSET request; the SUPPORTED_*,
 * ADVERTISED_*, speed/duplex and port fields of the reply are translated to
 * OFPPF_* bits.  '*peer' is set to 0 (peer advertisements are not retrieved).
 *
 * NOTE(review): no null checks on the output pointers are visible in this
 * excerpt -- confirm that callers never pass null here or that a wrapper
 * substitutes dummy storage. */
1395 netdev_linux_get_features(const struct netdev *netdev,
1396 uint32_t *current, uint32_t *advertised,
1397 uint32_t *supported, uint32_t *peer)
1399 struct ethtool_cmd ecmd;
1402 memset(&ecmd, 0, sizeof ecmd);
1403 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1404 ETHTOOL_GSET, "ETHTOOL_GSET");
1409 /* Supported features. */
1411 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1412 *supported |= OFPPF_10MB_HD;
1414 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1415 *supported |= OFPPF_10MB_FD;
1417 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1418 *supported |= OFPPF_100MB_HD;
1420 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1421 *supported |= OFPPF_100MB_FD;
1423 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1424 *supported |= OFPPF_1GB_HD;
1426 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1427 *supported |= OFPPF_1GB_FD;
1429 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1430 *supported |= OFPPF_10GB_FD;
1432 if (ecmd.supported & SUPPORTED_TP) {
1433 *supported |= OFPPF_COPPER;
1435 if (ecmd.supported & SUPPORTED_FIBRE) {
1436 *supported |= OFPPF_FIBER;
1438 if (ecmd.supported & SUPPORTED_Autoneg) {
1439 *supported |= OFPPF_AUTONEG;
1441 if (ecmd.supported & SUPPORTED_Pause) {
1442 *supported |= OFPPF_PAUSE;
1444 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1445 *supported |= OFPPF_PAUSE_ASYM;
1448 /* Advertised features. */
1450 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1451 *advertised |= OFPPF_10MB_HD;
1453 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1454 *advertised |= OFPPF_10MB_FD;
1456 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1457 *advertised |= OFPPF_100MB_HD;
1459 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1460 *advertised |= OFPPF_100MB_FD;
1462 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1463 *advertised |= OFPPF_1GB_HD;
1465 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1466 *advertised |= OFPPF_1GB_FD;
1468 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1469 *advertised |= OFPPF_10GB_FD;
1471 if (ecmd.advertising & ADVERTISED_TP) {
1472 *advertised |= OFPPF_COPPER;
1474 if (ecmd.advertising & ADVERTISED_FIBRE) {
1475 *advertised |= OFPPF_FIBER;
1477 if (ecmd.advertising & ADVERTISED_Autoneg) {
1478 *advertised |= OFPPF_AUTONEG;
1480 if (ecmd.advertising & ADVERTISED_Pause) {
1481 *advertised |= OFPPF_PAUSE;
1483 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1484 *advertised |= OFPPF_PAUSE_ASYM;
1487 /* Current settings. */
/* Half/full duplex is distinguished for 10, 100 and 1000 Mb/s; 10 Gb/s is
 * always reported as full duplex. */
1488 if (ecmd.speed == SPEED_10) {
1489 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1490 } else if (ecmd.speed == SPEED_100) {
1491 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1492 } else if (ecmd.speed == SPEED_1000) {
1493 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1494 } else if (ecmd.speed == SPEED_10000) {
1495 *current = OFPPF_10GB_FD;
1500 if (ecmd.port == PORT_TP) {
1501 *current |= OFPPF_COPPER;
1502 } else if (ecmd.port == PORT_FIBRE) {
1503 *current |= OFPPF_FIBER;
1507 *current |= OFPPF_AUTONEG;
1510 /* Peer advertisements. */
1511 *peer = 0; /* XXX */
1516 /* Sets the features advertised by 'netdev' to 'advertise'.
 *
 * The current settings are read with ETHTOOL_GSET, the advertising mask is
 * rebuilt from scratch out of the OFPPF_* bits in 'advertise' (each mapped to
 * its ADVERTISED_* counterpart), and the result is written back with
 * ETHTOOL_SSET, whose result is returned. */
1518 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1520 struct ethtool_cmd ecmd;
1523 memset(&ecmd, 0, sizeof ecmd);
1524 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1525 ETHTOOL_GSET, "ETHTOOL_GSET");
1530 ecmd.advertising = 0;
1531 if (advertise & OFPPF_10MB_HD) {
1532 ecmd.advertising |= ADVERTISED_10baseT_Half;
1534 if (advertise & OFPPF_10MB_FD) {
1535 ecmd.advertising |= ADVERTISED_10baseT_Full;
1537 if (advertise & OFPPF_100MB_HD) {
1538 ecmd.advertising |= ADVERTISED_100baseT_Half;
1540 if (advertise & OFPPF_100MB_FD) {
1541 ecmd.advertising |= ADVERTISED_100baseT_Full;
1543 if (advertise & OFPPF_1GB_HD) {
1544 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1546 if (advertise & OFPPF_1GB_FD) {
1547 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1549 if (advertise & OFPPF_10GB_FD) {
1550 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1552 if (advertise & OFPPF_COPPER) {
1553 ecmd.advertising |= ADVERTISED_TP;
1555 if (advertise & OFPPF_FIBER) {
1556 ecmd.advertising |= ADVERTISED_FIBRE;
1558 if (advertise & OFPPF_AUTONEG) {
1559 ecmd.advertising |= ADVERTISED_Autoneg;
1561 if (advertise & OFPPF_PAUSE) {
1562 ecmd.advertising |= ADVERTISED_Pause;
1564 if (advertise & OFPPF_PAUSE_ASYM) {
1565 ecmd.advertising |= ADVERTISED_Asym_Pause;
1567 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1568 ETHTOOL_SSET, "ETHTOOL_SSET");
1571 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1572 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1573 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1574 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1575 * sets '*vlan_vid' to -1. */
1577 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1579 const char *netdev_name = netdev_get_name(netdev);
1580 struct ds line = DS_EMPTY_INITIALIZER;
1581 FILE *stream = NULL;
1585 COVERAGE_INC(netdev_get_vlan_vid);
1586 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1587 stream = fopen(fn, "r");
1593 if (ds_get_line(&line, stream)) {
1594 if (ferror(stream)) {
1596 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1599 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1604 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1606 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1607 fn, ds_cstr(&line));
/* Shell command templates used by netdev_linux_set_policing().
 *
 * POLICE_ADD_CMD takes the device name (%s) and creates the ingress qdisc.
 * POLICE_CONFIG_CMD takes the device name (%s), the rate in kbit (%u) and the
 * burst in k (%u) and attaches the policing filter.  The numeric arguments
 * are uint32_t, so they are formatted as unsigned. */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %ukbit burst %uk mtu 65535 drop flowid :1"
1628 /* Removes ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1629 * positive errno value.
 *
1631 * This function is equivalent to running
1632 * /sbin/tc qdisc del dev %s handle ffff: ingress
1633 * but it is much, much faster.
 */
1636 netdev_linux_remove_policing(struct netdev *netdev)
1638 struct netdev_dev_linux *netdev_dev =
1639 netdev_dev_linux_cast(netdev_get_dev(netdev));
1640 const char *netdev_name = netdev_get_name(netdev);
1642 struct ofpbuf request;
1643 struct tcmsg *tcmsg;
/* Build an RTM_DELQDISC request that names the ingress qdisc: handle ffff:0,
 * parent TC_H_INGRESS, kind "ingress", empty options. */
1646 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1650 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1651 tcmsg->tcm_parent = TC_H_INGRESS;
1652 nl_msg_put_string(&request, TCA_KIND, "ingress");
1653 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
/* ENOENT and EINVAL are not logged as failures. */
1655 error = tc_transact(&request, NULL);
1656 if (error && error != ENOENT && error != EINVAL) {
1657 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1658 netdev_name, strerror(error));
/* Cache the fact that no policing is configured now. */
1662 netdev_dev->kbits_rate = 0;
1663 netdev_dev->kbits_burst = 0;
1664 netdev_dev->cache_valid |= VALID_POLICING;
1668 /* Attempts to set input rate limiting (policing) policy.
 *
 * 'kbits_burst' is normalized first (see below).  If the cached settings are
 * valid and equal to the requested ones, the cached state is trusted.
 * Otherwise existing policing is removed and two tc(8) command lines built
 * from POLICE_ADD_CMD and POLICE_CONFIG_CMD are run through system(); on
 * completion the new values are cached and VALID_POLICING is set.
 *
 * NOTE(review): the device name is interpolated into a shell command line;
 * confirm that device names are validated before they get here. */
1670 netdev_linux_set_policing(struct netdev *netdev,
1671 uint32_t kbits_rate, uint32_t kbits_burst)
1673 struct netdev_dev_linux *netdev_dev =
1674 netdev_dev_linux_cast(netdev_get_dev(netdev));
1675 const char *netdev_name = netdev_get_name(netdev);
1678 COVERAGE_INC(netdev_set_policing);
1680 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1681 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1682 : kbits_burst); /* Stick with user-specified value. */
1684 if (netdev_dev->cache_valid & VALID_POLICING
1685 && netdev_dev->kbits_rate == kbits_rate
1686 && netdev_dev->kbits_burst == kbits_burst) {
1687 /* Assume that settings haven't changed since we last set them. */
/* Start from a clean slate; the result of the removal is not checked. */
1691 netdev_linux_remove_policing(netdev);
1693 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1694 if (system(command) != 0) {
1695 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1699 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1700 kbits_rate, kbits_burst);
1701 if (system(command) != 0) {
1702 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1707 netdev_dev->kbits_rate = kbits_rate;
1708 netdev_dev->kbits_burst = kbits_burst;
1709 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry of the NULL-terminated 'tcs'
 * array that has a 'tc_install' callback and a nonempty 'ovs_name'.  'netdev'
 * is unused. */
1716 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1719 const struct tc_ops **opsp;
1721 for (opsp = tcs; *opsp != NULL; opsp++) {
1722 const struct tc_ops *ops = *opsp;
1723 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1724 sset_add(types, ops->ovs_name);
/* Searches the NULL-terminated 'tcs' array for the tc_ops whose 'ovs_name'
 * equals 'name'.  (The return statements are not visible in this excerpt;
 * netdev_linux_set_qos() treats a null result as "no such type".) */
1730 static const struct tc_ops *
1731 tc_lookup_ovs_name(const char *name)
1733 const struct tc_ops **opsp;
1735 for (opsp = tcs; *opsp != NULL; opsp++) {
1736 const struct tc_ops *ops = *opsp;
1737 if (!strcmp(name, ops->ovs_name)) {
/* Searches the NULL-terminated 'tcs' array for the tc_ops whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' never match. */
1744 static const struct tc_ops *
1745 tc_lookup_linux_name(const char *name)
1747 const struct tc_ops **opsp;
1749 for (opsp = tcs; *opsp != NULL; opsp++) {
1750 const struct tc_ops *ops = *opsp;
1751 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks for the queue with ID 'queue_id' among the queues of netdev's tc,
 * examining only the hmap bucket selected by 'hash'.  The caller supplies the
 * hash; see tc_find_queue() for the hash function in use. */
1758 static struct tc_queue *
1759 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1762 struct netdev_dev_linux *netdev_dev =
1763 netdev_dev_linux_cast(netdev_get_dev(netdev));
1764 struct tc_queue *queue;
1766 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1767 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the hash as
 * hash_int(queue_id, 0). */
1774 static struct tc_queue *
1775 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1777 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the capabilities of the QoS type named 'type': 'caps->n_queues' is
 * copied from the matching tc_ops found by tc_lookup_ovs_name().  'netdev' is
 * unused.  (Handling of an unknown 'type' is not visible in this excerpt.) */
1781 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1783 struct netdev_qos_capabilities *caps)
1785 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1789 caps->n_queues = ops->n_queues;
/* Reports the QoS configuration of 'netdev'.
 *
 * After tc_query_qdisc(), '*typep' is set to the OVS name of the active
 * qdisc's ops, and the ops' 'qdisc_get' callback, when present, is asked to
 * fill 'details'. */
1794 netdev_linux_get_qos(const struct netdev *netdev,
1795 const char **typep, struct shash *details)
1797 struct netdev_dev_linux *netdev_dev =
1798 netdev_dev_linux_cast(netdev_get_dev(netdev));
1801 error = tc_query_qdisc(netdev);
1806 *typep = netdev_dev->tc->ops->ovs_name;
1807 return (netdev_dev->tc->ops->qdisc_get
1808 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures 'netdev' to use the QoS type named 'type' with 'details'.
 *
 * 'type' must map to a tc_ops that has a 'tc_install' callback.  If that
 * tc_ops is already the active one, only its parameters are updated through
 * 'qdisc_set' (0 is returned when there is no such callback).  Otherwise the
 * existing qdisc is deleted and the new one installed.  The assertions state
 * the expected invariants: 'netdev_dev->tc' is null after deletion, and it is
 * non-null exactly when installation reports success. */
1813 netdev_linux_set_qos(struct netdev *netdev,
1814 const char *type, const struct shash *details)
1816 struct netdev_dev_linux *netdev_dev =
1817 netdev_dev_linux_cast(netdev_get_dev(netdev));
1818 const struct tc_ops *new_ops;
1821 new_ops = tc_lookup_ovs_name(type);
1822 if (!new_ops || !new_ops->tc_install) {
1826 error = tc_query_qdisc(netdev);
1831 if (new_ops == netdev_dev->tc->ops) {
1832 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1834 /* Delete existing qdisc. */
1835 error = tc_del_qdisc(netdev);
1839 assert(netdev_dev->tc == NULL);
1841 /* Install new qdisc. */
1842 error = new_ops->tc_install(netdev, details);
1843 assert((error == 0) == (netdev_dev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev' into 'details'.
 *
 * After tc_query_qdisc(), the queue is looked up with tc_find_queue() and
 * handed to the active ops' 'class_get' callback.  (The error branches and
 * the other arm of the conditional are not visible in this excerpt.) */
1850 netdev_linux_get_queue(const struct netdev *netdev,
1851 unsigned int queue_id, struct shash *details)
1853 struct netdev_dev_linux *netdev_dev =
1854 netdev_dev_linux_cast(netdev_get_dev(netdev));
1857 error = tc_query_qdisc(netdev);
1861 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1863 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' according to 'details'.
 *
 * After tc_query_qdisc(), the request is passed to the active ops'
 * 'class_set' callback.  A 'queue_id' at or beyond the ops' 'n_queues', or a
 * missing 'class_set' callback, takes a separate branch whose body is not
 * visible in this excerpt (presumably an error return). */
1869 netdev_linux_set_queue(struct netdev *netdev,
1870 unsigned int queue_id, const struct shash *details)
1872 struct netdev_dev_linux *netdev_dev =
1873 netdev_dev_linux_cast(netdev_get_dev(netdev));
1876 error = tc_query_qdisc(netdev);
1879 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1880 || !netdev_dev->tc->ops->class_set) {
1884 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev'.
 *
 * After tc_query_qdisc(), and provided the active ops have a 'class_delete'
 * callback, the queue is looked up with tc_find_queue() and handed to that
 * callback.  (The error branches are not visible in this excerpt.) */
1888 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1890 struct netdev_dev_linux *netdev_dev =
1891 netdev_dev_linux_cast(netdev_get_dev(netdev));
1894 error = tc_query_qdisc(netdev);
1897 } else if (!netdev_dev->tc->ops->class_delete) {
1900 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1902 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev' into 'stats'.
 *
 * After tc_query_qdisc(), and provided the active ops have a
 * 'class_get_stats' callback, the queue is looked up with tc_find_queue() and
 * handed to that callback.  (The error branches are not visible in this
 * excerpt.) */
1908 netdev_linux_get_queue_stats(const struct netdev *netdev,
1909 unsigned int queue_id,
1910 struct netdev_queue_stats *stats)
1912 struct netdev_dev_linux *netdev_dev =
1913 netdev_dev_linux_cast(netdev_get_dev(netdev));
1916 error = tc_query_qdisc(netdev);
1919 } else if (!netdev_dev->tc->ops->class_get_stats) {
1922 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1924 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes of 'netdev' into 'dump'.
 *
 * An RTM_GETTCLASS request with 'tcm_parent' 0 is built, the dump is started
 * on 'rtnl_sock', and the request buffer is then released.  Judging by the
 * caller, a false (zero) result means that the dump could not be started. */
1930 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1932 struct ofpbuf request;
1933 struct tcmsg *tcmsg;
1935 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1939 tcmsg->tcm_parent = 0;
1940 nl_dump_start(dump, rtnl_sock, &request);
1941 ofpbuf_uninit(&request);
/* Iterates over the queues cached for 'netdev' and reports each to 'cb'.
 *
 * After tc_query_qdisc(), and provided the active ops have a 'class_get'
 * callback, every queue in the tc's queue map is converted to a fresh
 * 'details' map via 'class_get' and then passed to
 * (*cb)(queue_id, &details, aux).  'details' is cleared before each queue and
 * destroyed at the end. */
1946 netdev_linux_dump_queues(const struct netdev *netdev,
1947 netdev_dump_queues_cb *cb, void *aux)
1949 struct netdev_dev_linux *netdev_dev =
1950 netdev_dev_linux_cast(netdev_get_dev(netdev));
1951 struct tc_queue *queue;
1952 struct shash details;
1956 error = tc_query_qdisc(netdev);
1959 } else if (!netdev_dev->tc->ops->class_get) {
1964 shash_init(&details);
1965 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1966 shash_clear(&details);
1968 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1970 (*cb)(queue->queue_id, &details, aux);
1975 shash_destroy(&details);
/* Dumps per-queue statistics of 'netdev' from the kernel, reporting them
 * through 'cb' with 'aux'.
 *
 * After tc_query_qdisc(), and provided the active ops have a
 * 'class_dump_stats' callback, a class dump is started with
 * start_queue_dump() and every reply message is handed to 'class_dump_stats'.
 * The result of nl_dump_done() takes precedence over 'last_error' in the
 * return value. */
1981 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1982 netdev_dump_queue_stats_cb *cb, void *aux)
1984 struct netdev_dev_linux *netdev_dev =
1985 netdev_dev_linux_cast(netdev_get_dev(netdev));
1986 struct nl_dump dump;
1991 error = tc_query_qdisc(netdev);
1994 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1999 if (!start_queue_dump(netdev, &dump)) {
2002 while (nl_dump_next(&dump, &msg)) {
2003 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2009 error = nl_dump_done(&dump);
2010 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * Unless VALID_IN4 is set in 'cache_valid', both values are fetched with the
 * SIOCGIFADDR and SIOCGIFNETMASK ioctls and cached in the device.  Returns
 * EADDRNOTAVAIL when the resulting address is INADDR_ANY, otherwise 0. */
2014 netdev_linux_get_in4(const struct netdev *netdev_,
2015 struct in_addr *address, struct in_addr *netmask)
2017 struct netdev_dev_linux *netdev_dev =
2018 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2020 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2023 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2024 SIOCGIFADDR, "SIOCGIFADDR");
2029 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2030 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2035 netdev_dev->cache_valid |= VALID_IN4;
2037 *address = netdev_dev->address;
2038 *netmask = netdev_dev->netmask;
2039 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set with SIOCSIFADDR; the cached address and netmask are
 * then updated and VALID_IN4 is set.  The netmask is pushed to the kernel
 * with SIOCSIFNETMASK only when 'address' is not INADDR_ANY.
 *
 * NOTE(review): the cache is updated before the netmask ioctl runs -- confirm
 * that a failing SIOCSIFNETMASK cannot leave a stale netmask cached. */
2043 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2044 struct in_addr netmask)
2046 struct netdev_dev_linux *netdev_dev =
2047 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2050 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2052 netdev_dev->cache_valid |= VALID_IN4;
2053 netdev_dev->address = address;
2054 netdev_dev->netmask = netmask;
2055 if (address.s_addr != INADDR_ANY) {
2056 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2057 "SIOCSIFNETMASK", netmask);
/* Parses 'line', in the format that netdev_linux_get_in6() reads from
 * /proc/net/if_inet6, into 'in6' and 'ifname'.
 *
 * The format consists of 16 two-digit hexadecimal bytes (the address, stored
 * into in6->s6_addr[0..15]), four hexadecimal fields that are skipped, and an
 * interface name of at most 16 characters.  (The sscanf() call itself and the
 * return expression are not visible in this excerpt.) */
2064 parse_if_inet6_line(const char *line,
2065 struct in6_addr *in6, char ifname[16 + 1])
2067 uint8_t *s6 = in6->s6_addr;
2068 #define X8 "%2"SCNx8
2070 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2071 "%*x %*x %*x %*x %16s\n",
2072 &s6[0], &s6[1], &s6[2], &s6[3],
2073 &s6[4], &s6[5], &s6[6], &s6[7],
2074 &s6[8], &s6[9], &s6[10], &s6[11],
2075 &s6[12], &s6[13], &s6[14], &s6[15],
2079 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2080 * 'in6' is non-null) and returns true. Otherwise, returns false.
 *
 * The address is looked up on first use by scanning /proc/net/if_inet6 for a
 * line whose interface name equals the device name, and is then cached
 * (VALID_IN6).  The cached value starts out as in6addr_any.
 *
 * NOTE(review): the visible code stores to '*in6' unconditionally and the
 * return statement is not visible -- confirm the "non-null" and
 * "returns true/false" wording above against the full source. */
2082 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2084 struct netdev_dev_linux *netdev_dev =
2085 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2086 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2090 netdev_dev->in6 = in6addr_any;
2092 file = fopen("/proc/net/if_inet6", "r");
2094 const char *name = netdev_get_name(netdev_);
2095 while (fgets(line, sizeof line, file)) {
2096 struct in6_addr in6_tmp;
2097 char ifname[16 + 1];
2098 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2099 && !strcmp(name, ifname))
2101 netdev_dev->in6 = in6_tmp;
2107 netdev_dev->cache_valid |= VALID_IN6;
2109 *in6 = netdev_dev->in6;
/* Initializes '*sa' as an AF_INET socket address that holds 'addr'.
 *
 * A zeroed struct sockaddr_in is filled in locally and then copied over the
 * zeroed '*sa', so every byte of '*sa' that is not part of the address is 0. */
2114 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2116 struct sockaddr_in sin;
2117 memset(&sin, 0, sizeof sin);
2118 sin.sin_family = AF_INET;
2119 sin.sin_addr = addr;
2122 memset(sa, 0, sizeof *sa);
2123 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl 'ioctl_nr' (named 'ioctl_name', which
 * appears to be used for diagnostics) on 'netdev' with IPv4 address 'addr'.
 *
 * The request carries the device name in 'ifr_name' and 'addr' as an AF_INET
 * socket address in 'ifr_addr'; the result of netdev_linux_do_ioctl() is
 * returned. */
2127 do_set_addr(struct netdev *netdev,
2128 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2131 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2132 make_in4_sockaddr(&ifr.ifr_addr, addr);
2134 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2138 /* Adds 'router' as a default IP gateway.
 *
 * The route is added with the SIOCADDRT ioctl on 'af_inet_sock': destination
 * and genmask are both INADDR_ANY, the gateway is 'router', and the flags are
 * RTF_UP | RTF_GATEWAY.  A failure is logged with the errno text.  'netdev'
 * is unused. */
2140 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2142 struct in_addr any = { INADDR_ANY };
2146 memset(&rt, 0, sizeof rt);
2147 make_in4_sockaddr(&rt.rt_dst, any);
2148 make_in4_sockaddr(&rt.rt_gateway, router);
2149 make_in4_sockaddr(&rt.rt_genmask, any);
2150 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2151 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2153 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Finds the route that applies to 'host' by scanning /proc/net/route.
 *
 * '*netdev_name' starts out as NULL.  For a line that describes a route that
 * is up and whose masked destination matches 'host', '*next_hop' is set to 0
 * for a directly reachable host or to the route's gateway otherwise, and
 * '*netdev_name' is set to a newly allocated copy of the interface name
 * (which the caller is then responsible for freeing).  Lines that cannot be
 * parsed are logged with a rate-limited warning. */
2159 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2162 static const char fn[] = "/proc/net/route";
2167 *netdev_name = NULL;
2168 stream = fopen(fn, "r");
2169 if (stream == NULL) {
2170 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2175 while (fgets(line, sizeof line, stream)) {
2178 ovs_be32 dest, gateway, mask;
2179 int refcnt, metric, mtu;
2180 unsigned int flags, use, window, irtt;
2183 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2185 iface, &dest, &gateway, &flags, &refcnt,
2186 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2188 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2192 if (!(flags & RTF_UP)) {
2193 /* Skip routes that aren't up. */
2197 /* The output of 'dest', 'mask', and 'gateway' were given in
2198 * network byte order, so we don't need any endian
2199 * conversions here. */
2200 if ((dest & mask) == (host->s_addr & mask)) {
2202 /* The host is directly reachable. */
2203 next_hop->s_addr = 0;
2205 /* To reach the host, we must go through a gateway. */
2206 next_hop->s_addr = gateway;
2208 *netdev_name = xstrdup(iface);
/* Adds driver information for 'netdev' to 'sh'.
 *
 * The information is obtained with an ETHTOOL_GDRVINFO request.  Newly
 * allocated copies of the driver name, driver version and firmware version
 * are stored under the keys "driver_name", "driver_version" and
 * "firmware_version". */
2220 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2222 struct ethtool_drvinfo drvinfo;
2225 memset(&drvinfo, 0, sizeof drvinfo);
2226 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2227 (struct ethtool_cmd *)&drvinfo,
2229 "ETHTOOL_GDRVINFO");
2231 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2232 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2233 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2239 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2240 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2241 * returns 0. Otherwise, it returns a positive errno value; in particular,
2242 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'.
 *
 * The lookup is done with the SIOCGARP ioctl on 'af_inet_sock'.  ENXIO is not
 * logged; any other failure produces a rate-limited warning. */
2244 netdev_linux_arp_lookup(const struct netdev *netdev,
2245 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2248 struct sockaddr_in sin;
2251 memset(&r, 0, sizeof r);
2252 memset(&sin, 0, sizeof sin);
2253 sin.sin_family = AF_INET;
2254 sin.sin_addr.s_addr = ip;
2256 memcpy(&r.arp_pa, &sin, sizeof sin);
2257 r.arp_ha.sa_family = ARPHRD_ETHER;
2259 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2260 COVERAGE_INC(netdev_arp_lookup);
2261 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2263 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2264 } else if (retval != ENXIO) {
2265 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2266 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates the NETDEV_UP and NETDEV_PROMISC bits of 'nd' into interface
 * flags.  (The assignments are not visible in this excerpt; presumably they
 * set IFF_UP and IFF_PROMISC, mirroring iff_to_nd_flags() -- confirm.) */
2272 nd_to_iff_flags(enum netdev_flags nd)
2275 if (nd & NETDEV_UP) {
2278 if (nd & NETDEV_PROMISC) {
/* Translates the interface flags 'iff' into "enum netdev_flags" bits,
 * starting from 0.  IFF_PROMISC maps to NETDEV_PROMISC; handling of further
 * flags is not visible in this excerpt. */
2285 iff_to_nd_flags(int iff)
2287 enum netdev_flags nd = 0;
2291 if (iff & IFF_PROMISC) {
2292 nd |= NETDEV_PROMISC;
/* Turns off the flags in 'off' and turns on the flags in 'on' for 'netdev'.
 *
 * The current interface flags are read with get_flags() and reported through
 * '*old_flagsp' in "enum netdev_flags" form.  The new flags are written with
 * set_flags() only when they differ from the old ones. */
2298 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2299 enum netdev_flags on, enum netdev_flags *old_flagsp)
2301 int old_flags, new_flags;
2304 error = get_flags(netdev, &old_flags);
2306 *old_flagsp = iff_to_nd_flags(old_flags);
2307 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2308 if (new_flags != old_flags) {
2309 error = set_flags(netdev, new_flags);
/* Returns the 'change_seq' value of the device underlying 'netdev'. */
2316 netdev_linux_change_seq(const struct netdev *netdev)
2318 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to the initializer shared by the "struct netdev_class" definitions
 * that follow.  Only NAME, CREATE, GET_STATS and SET_STATS vary between the
 * classes; every other slot uses the same netdev_linux_*() implementation (or
 * NULL where an operation is not provided, e.g. get_config and set_config). */
2321 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS) \
2325 netdev_linux_init, \
2327 netdev_linux_wait, \
2330 netdev_linux_destroy, \
2331 NULL, /* get_config */ \
2332 NULL, /* set_config */ \
2334 netdev_linux_open, \
2335 netdev_linux_close, \
2337 netdev_linux_listen, \
2338 netdev_linux_recv, \
2339 netdev_linux_recv_wait, \
2340 netdev_linux_drain, \
2342 netdev_linux_send, \
2343 netdev_linux_send_wait, \
2345 netdev_linux_set_etheraddr, \
2346 netdev_linux_get_etheraddr, \
2347 netdev_linux_get_mtu, \
2348 netdev_linux_set_mtu, \
2349 netdev_linux_get_ifindex, \
2350 netdev_linux_get_carrier, \
2351 netdev_linux_set_miimon_interval, \
2355 netdev_linux_get_features, \
2356 netdev_linux_set_advertisements, \
2357 netdev_linux_get_vlan_vid, \
2359 netdev_linux_set_policing, \
2360 netdev_linux_get_qos_types, \
2361 netdev_linux_get_qos_capabilities, \
2362 netdev_linux_get_qos, \
2363 netdev_linux_set_qos, \
2364 netdev_linux_get_queue, \
2365 netdev_linux_set_queue, \
2366 netdev_linux_delete_queue, \
2367 netdev_linux_get_queue_stats, \
2368 netdev_linux_dump_queues, \
2369 netdev_linux_dump_queue_stats, \
2371 netdev_linux_get_in4, \
2372 netdev_linux_set_in4, \
2373 netdev_linux_get_in6, \
2374 netdev_linux_add_router, \
2375 netdev_linux_get_next_hop, \
2376 netdev_linux_get_status, \
2377 netdev_linux_arp_lookup, \
2379 netdev_linux_update_flags, \
2381 netdev_linux_change_seq \
/* Netdev class created with netdev_linux_create(); statistics come from
 * netdev_linux_get_stats() and setting statistics is not supported. */
2384 const struct netdev_class netdev_linux_class =
2387 netdev_linux_create,
2388 netdev_linux_get_stats,
2389 NULL); /* set_stats */
/* Netdev class created with netdev_linux_create_tap(); statistics come from
 * netdev_pseudo_get_stats() and setting statistics is not supported. */
2391 const struct netdev_class netdev_tap_class =
2394 netdev_linux_create_tap,
2395 netdev_pseudo_get_stats,
2396 NULL); /* set_stats */
/* Netdev class created with netdev_linux_create(); statistics come from
 * netdev_pseudo_get_stats() and can be set with netdev_vport_set_stats(). */
2398 const struct netdev_class netdev_internal_class =
2401 netdev_linux_create,
2402 netdev_pseudo_get_stats,
2403 netdev_vport_set_stats);
2405 /* HTB traffic control class. */
/* Upper bound on HTB class minor numbers that map to queues: in
 * htb_parse_tcmsg__(), minor numbers 1...HTB_N_QUEUES of major 1 map to queue
 * IDs 0...HTB_N_QUEUES - 1. */
2407 #define HTB_N_QUEUES 0xf000
/* Qdisc-level state; htb_install__() fills in 'max_rate' next to an embedded
 * 'tc' member (the struct header is not visible in this excerpt). */
2411 unsigned int max_rate; /* In bytes/s. */
/* Per-queue (HTB class) parameters. */
2415 struct tc_queue tc_queue;
2416 unsigned int min_rate; /* In bytes/s. */
2417 unsigned int max_rate; /* In bytes/s. */
2418 unsigned int burst; /* In bytes. */
2419 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the "struct htb" that contains the 'tc' installed on 'netdev',
 * using CONTAINER_OF on the embedded 'tc' member.  This is only meaningful
 * while the installed tc is the one created by htb_install__(). */
2423 htb_get__(const struct netdev *netdev)
2425 struct netdev_dev_linux *netdev_dev =
2426 netdev_dev_linux_cast(netdev_get_dev(netdev));
2427 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Creates the in-memory HTB state for 'netdev': allocates a "struct htb",
 * initializes its 'tc' member with tc_ops_htb, records 'max_rate' and makes
 * it the device's 'tc'.  No kernel request is visible in this function.
 *
 * NOTE(review): 'max_rate' is a uint64_t here while the 'max_rate' member
 * declared above is an unsigned int -- confirm that the narrowing is
 * intended. */
2431 htb_install__(struct netdev *netdev, uint64_t max_rate)
2433 struct netdev_dev_linux *netdev_dev =
2434 netdev_dev_linux_cast(netdev_get_dev(netdev));
2437 htb = xmalloc(sizeof *htb);
2438 tc_init(&htb->tc, &tc_ops_htb);
2439 htb->max_rate = max_rate;
2441 netdev_dev->tc = &htb->tc;
2444 /* Create an HTB qdisc.
2446 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2448 htb_setup_qdisc__(struct netdev *netdev)
2451 struct tc_htb_glob opt;
2452 struct ofpbuf request;
2453 struct tcmsg *tcmsg;
/* Remove whatever qdisc is installed first; the result is not checked. */
2455 tc_del_qdisc(netdev);
/* NLM_F_EXCL | NLM_F_CREATE asks for a new qdisc rather than a replacement
 * of an existing one. */
2457 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2458 NLM_F_EXCL | NLM_F_CREATE, &request);
2462 tcmsg->tcm_handle = tc_make_handle(1, 0);
2463 tcmsg->tcm_parent = TC_H_ROOT;
2465 nl_msg_put_string(&request, TCA_KIND, "htb");
2467 memset(&opt, 0, sizeof opt);
2468 opt.rate2quantum = 10;
/* The global HTB options travel as TCA_HTB_INIT nested in TCA_OPTIONS. */
2472 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2473 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2474 nl_msg_end_nested(&request, opt_offset);
2476 return tc_transact(&request, NULL);
2479 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2480 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * The rate and ceil tables depend on the device MTU, so the MTU is fetched
 * first; a failure to get it is logged.  A failed transaction is logged with
 * the class and parent handles and all class parameters. */
2482 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2483 unsigned int parent, struct htb_class *class)
2486 struct tc_htb_opt opt;
2487 struct ofpbuf request;
2488 struct tcmsg *tcmsg;
2492 error = netdev_get_mtu(netdev, &mtu);
2494 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2495 netdev_get_name(netdev));
/* 'rate' is derived from 'min_rate' and 'ceil' from 'max_rate'; both buffer
 * sizes are computed from the same 'burst'. */
2499 memset(&opt, 0, sizeof opt);
2500 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2501 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2502 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2503 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2504 opt.prio = class->priority;
2506 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2510 tcmsg->tcm_handle = handle;
2511 tcmsg->tcm_parent = parent;
2513 nl_msg_put_string(&request, TCA_KIND, "htb");
2514 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2515 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2516 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2517 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2518 nl_msg_end_nested(&request, opt_offset);
2520 error = tc_transact(&request, NULL);
2522 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2523 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2524 netdev_get_name(netdev),
2525 tc_get_major(handle), tc_get_minor(handle),
2526 tc_get_major(parent), tc_get_minor(parent),
2527 class->min_rate, class->max_rate,
2528 class->burst, class->priority, strerror(error));
2533 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2534 * description of them into 'class'. The description complies with the
2535 * specification given in the vswitch database documentation for linux-htb
 * queues. */
2538 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2540 static const struct nl_policy tca_htb_policy[] = {
2541 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2542 .min_len = sizeof(struct tc_htb_opt) },
2545 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2546 const struct tc_htb_opt *htb;
/* TCA_HTB_PARMS is mandatory and must be at least as large as
 * struct tc_htb_opt, per the policy above. */
2548 if (!nl_parse_nested(nl_options, tca_htb_policy,
2549 attrs, ARRAY_SIZE(tca_htb_policy))) {
2550 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
/* min_rate comes from 'rate', max_rate from 'ceil'; the burst is converted
 * from ticks to bytes at the 'rate' speed. */
2554 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2555 class->min_rate = htb->rate.rate;
2556 class->max_rate = htb->ceil.rate;
2557 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2558 class->priority = htb->prio;
/* Parses the traffic class message 'tcmsg' with tc_parse_class(), which is
 * also passed 'stats'.
 *
 * If 'queue_id' is nonnull, a class handle with major number 1 and a minor
 * number in the range 1...HTB_N_QUEUES yields '*queue_id' = minor - 1 (the
 * handling of other handles is not visible in this excerpt).  If 'options' is
 * nonnull, the class's HTB options are parsed into it with
 * htb_parse_tca_options__(). */
2563 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2564 struct htb_class *options,
2565 struct netdev_queue_stats *stats)
2567 struct nlattr *nl_options;
2568 unsigned int handle;
2571 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2572 if (!error && queue_id) {
2573 unsigned int major = tc_get_major(handle);
2574 unsigned int minor = tc_get_minor(handle);
2575 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2576 *queue_id = minor - 1;
2581 if (!error && options) {
2582 error = htb_parse_tca_options__(nl_options, options);
2588 htb_parse_qdisc_details__(struct netdev *netdev,
2589 const struct shash *details, struct htb_class *hc)
2591 const char *max_rate_s;
2593 max_rate_s = shash_find_data(details, "max-rate");
2594 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2595 if (!hc->max_rate) {
2598 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2599 hc->max_rate = netdev_features_to_bps(current) / 8;
2601 hc->min_rate = hc->max_rate;
2607 htb_parse_class_details__(struct netdev *netdev,
2608 const struct shash *details, struct htb_class *hc)
2610 const struct htb *htb = htb_get__(netdev);
2611 const char *min_rate_s = shash_find_data(details, "min-rate");
2612 const char *max_rate_s = shash_find_data(details, "max-rate");
2613 const char *burst_s = shash_find_data(details, "burst");
2614 const char *priority_s = shash_find_data(details, "priority");
2617 error = netdev_get_mtu(netdev, &mtu);
2619 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2620 netdev_get_name(netdev));
2624 /* HTB requires at least an mtu sized min-rate to send any traffic even
2625 * on uncongested links. */
2626 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2627 hc->min_rate = MAX(hc->min_rate, mtu);
2628 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2631 hc->max_rate = (max_rate_s
2632 ? strtoull(max_rate_s, NULL, 10) / 8
2634 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2635 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2639 * According to hints in the documentation that I've read, it is important
2640 * that 'burst' be at least as big as the largest frame that might be
2641 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2642 * but having it a bit too small is a problem. Since netdev_get_mtu()
2643 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2644 * the MTU. We actually add 64, instead of 14, as a guard against
2645 * additional headers get tacked on somewhere that we're not aware of. */
2646 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2647 hc->burst = MAX(hc->burst, mtu + 64);
2650 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2656 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2657 unsigned int parent, struct htb_class *options,
2658 struct netdev_queue_stats *stats)
2660 struct ofpbuf *reply;
2663 error = tc_query_class(netdev, handle, parent, &reply);
2665 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2666 ofpbuf_delete(reply);
2672 htb_tc_install(struct netdev *netdev, const struct shash *details)
2676 error = htb_setup_qdisc__(netdev);
2678 struct htb_class hc;
2680 htb_parse_qdisc_details__(netdev, details, &hc);
2681 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2682 tc_make_handle(1, 0), &hc);
2684 htb_install__(netdev, hc.max_rate);
2690 static struct htb_class *
2691 htb_class_cast__(const struct tc_queue *queue)
2693 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2697 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2698 const struct htb_class *hc)
2700 struct htb *htb = htb_get__(netdev);
2701 size_t hash = hash_int(queue_id, 0);
2702 struct tc_queue *queue;
2703 struct htb_class *hcp;
2705 queue = tc_find_queue__(netdev, queue_id, hash);
2707 hcp = htb_class_cast__(queue);
2709 hcp = xmalloc(sizeof *hcp);
2710 queue = &hcp->tc_queue;
2711 queue->queue_id = queue_id;
2712 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2715 hcp->min_rate = hc->min_rate;
2716 hcp->max_rate = hc->max_rate;
2717 hcp->burst = hc->burst;
2718 hcp->priority = hc->priority;
2722 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2725 struct nl_dump dump;
2726 struct htb_class hc;
2728 /* Get qdisc options. */
2730 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2731 htb_install__(netdev, hc.max_rate);
2734 if (!start_queue_dump(netdev, &dump)) {
2737 while (nl_dump_next(&dump, &msg)) {
2738 unsigned int queue_id;
2740 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2741 htb_update_queue__(netdev, queue_id, &hc);
2744 nl_dump_done(&dump);
2750 htb_tc_destroy(struct tc *tc)
2752 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2753 struct htb_class *hc, *next;
2755 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2756 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2764 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2766 const struct htb *htb = htb_get__(netdev);
2767 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2772 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2774 struct htb_class hc;
2777 htb_parse_qdisc_details__(netdev, details, &hc);
2778 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2779 tc_make_handle(1, 0), &hc);
2781 htb_get__(netdev)->max_rate = hc.max_rate;
2787 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2788 const struct tc_queue *queue, struct shash *details)
2790 const struct htb_class *hc = htb_class_cast__(queue);
2792 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2793 if (hc->min_rate != hc->max_rate) {
2794 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2796 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2798 shash_add(details, "priority", xasprintf("%u", hc->priority));
2804 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2805 const struct shash *details)
2807 struct htb_class hc;
2810 error = htb_parse_class_details__(netdev, details, &hc);
2815 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2816 tc_make_handle(1, 0xfffe), &hc);
2821 htb_update_queue__(netdev, queue_id, &hc);
2826 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2828 struct htb_class *hc = htb_class_cast__(queue);
2829 struct htb *htb = htb_get__(netdev);
2832 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2834 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2841 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2842 struct netdev_queue_stats *stats)
2844 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2845 tc_make_handle(1, 0xfffe), NULL, stats);
2849 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2850 const struct ofpbuf *nlmsg,
2851 netdev_dump_queue_stats_cb *cb, void *aux)
2853 struct netdev_queue_stats stats;
2854 unsigned int handle, major, minor;
2857 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2862 major = tc_get_major(handle);
2863 minor = tc_get_minor(handle);
2864 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2865 (*cb)(minor - 1, &stats, aux);
2870 static const struct tc_ops tc_ops_htb = {
2871 "htb", /* linux_name */
2872 "linux-htb", /* ovs_name */
2873 HTB_N_QUEUES, /* n_queues */
2882 htb_class_get_stats,
2883 htb_class_dump_stats
2886 /* "linux-hfsc" traffic control class. */
2888 #define HFSC_N_QUEUES 0xf000
2896 struct tc_queue tc_queue;
2901 static struct hfsc *
2902 hfsc_get__(const struct netdev *netdev)
2904 struct netdev_dev_linux *netdev_dev;
2905 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2906 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2909 static struct hfsc_class *
2910 hfsc_class_cast__(const struct tc_queue *queue)
2912 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2916 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2918 struct netdev_dev_linux * netdev_dev;
2921 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2922 hfsc = xmalloc(sizeof *hfsc);
2923 tc_init(&hfsc->tc, &tc_ops_hfsc);
2924 hfsc->max_rate = max_rate;
2925 netdev_dev->tc = &hfsc->tc;
2929 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2930 const struct hfsc_class *hc)
2934 struct hfsc_class *hcp;
2935 struct tc_queue *queue;
2937 hfsc = hfsc_get__(netdev);
2938 hash = hash_int(queue_id, 0);
2940 queue = tc_find_queue__(netdev, queue_id, hash);
2942 hcp = hfsc_class_cast__(queue);
2944 hcp = xmalloc(sizeof *hcp);
2945 queue = &hcp->tc_queue;
2946 queue->queue_id = queue_id;
2947 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2950 hcp->min_rate = hc->min_rate;
2951 hcp->max_rate = hc->max_rate;
2955 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2957 const struct tc_service_curve *rsc, *fsc, *usc;
2958 static const struct nl_policy tca_hfsc_policy[] = {
2960 .type = NL_A_UNSPEC,
2962 .min_len = sizeof(struct tc_service_curve),
2965 .type = NL_A_UNSPEC,
2967 .min_len = sizeof(struct tc_service_curve),
2970 .type = NL_A_UNSPEC,
2972 .min_len = sizeof(struct tc_service_curve),
2975 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2977 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2978 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2979 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2983 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2984 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2985 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2987 if (rsc->m1 != 0 || rsc->d != 0 ||
2988 fsc->m1 != 0 || fsc->d != 0 ||
2989 usc->m1 != 0 || usc->d != 0) {
2990 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2991 "Non-linear service curves are not supported.");
2995 if (rsc->m2 != fsc->m2) {
2996 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2997 "Real-time service curves are not supported ");
3001 if (rsc->m2 > usc->m2) {
3002 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3003 "Min-rate service curve is greater than "
3004 "the max-rate service curve.");
3008 class->min_rate = fsc->m2;
3009 class->max_rate = usc->m2;
3014 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3015 struct hfsc_class *options,
3016 struct netdev_queue_stats *stats)
3019 unsigned int handle;
3020 struct nlattr *nl_options;
3022 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3028 unsigned int major, minor;
3030 major = tc_get_major(handle);
3031 minor = tc_get_minor(handle);
3032 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3033 *queue_id = minor - 1;
3040 error = hfsc_parse_tca_options__(nl_options, options);
3047 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3048 unsigned int parent, struct hfsc_class *options,
3049 struct netdev_queue_stats *stats)
3052 struct ofpbuf *reply;
3054 error = tc_query_class(netdev, handle, parent, &reply);
3059 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3060 ofpbuf_delete(reply);
3065 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3066 struct hfsc_class *class)
3069 const char *max_rate_s;
3071 max_rate_s = shash_find_data(details, "max-rate");
3072 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3077 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3078 max_rate = netdev_features_to_bps(current) / 8;
3081 class->min_rate = max_rate;
3082 class->max_rate = max_rate;
3086 hfsc_parse_class_details__(struct netdev *netdev,
3087 const struct shash *details,
3088 struct hfsc_class * class)
3090 const struct hfsc *hfsc;
3091 uint32_t min_rate, max_rate;
3092 const char *min_rate_s, *max_rate_s;
3094 hfsc = hfsc_get__(netdev);
3095 min_rate_s = shash_find_data(details, "min-rate");
3096 max_rate_s = shash_find_data(details, "max-rate");
3098 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3099 min_rate = MAX(min_rate, 1);
3100 min_rate = MIN(min_rate, hfsc->max_rate);
3102 max_rate = (max_rate_s
3103 ? strtoull(max_rate_s, NULL, 10) / 8
3105 max_rate = MAX(max_rate, min_rate);
3106 max_rate = MIN(max_rate, hfsc->max_rate);
3108 class->min_rate = min_rate;
3109 class->max_rate = max_rate;
3114 /* Create an HFSC qdisc.
3116 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3118 hfsc_setup_qdisc__(struct netdev * netdev)
3120 struct tcmsg *tcmsg;
3121 struct ofpbuf request;
3122 struct tc_hfsc_qopt opt;
3124 tc_del_qdisc(netdev);
3126 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3127 NLM_F_EXCL | NLM_F_CREATE, &request);
3133 tcmsg->tcm_handle = tc_make_handle(1, 0);
3134 tcmsg->tcm_parent = TC_H_ROOT;
3136 memset(&opt, 0, sizeof opt);
3139 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3140 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3142 return tc_transact(&request, NULL);
3145 /* Create an HFSC class.
3147 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3148 * sc rate <min_rate> ul rate <max_rate>" */
3150 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3151 unsigned int parent, struct hfsc_class *class)
3155 struct tcmsg *tcmsg;
3156 struct ofpbuf request;
3157 struct tc_service_curve min, max;
3159 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3165 tcmsg->tcm_handle = handle;
3166 tcmsg->tcm_parent = parent;
3170 min.m2 = class->min_rate;
3174 max.m2 = class->max_rate;
3176 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3177 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3178 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3179 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3180 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3181 nl_msg_end_nested(&request, opt_offset);
3183 error = tc_transact(&request, NULL);
3185 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3186 "min-rate %ubps, max-rate %ubps (%s)",
3187 netdev_get_name(netdev),
3188 tc_get_major(handle), tc_get_minor(handle),
3189 tc_get_major(parent), tc_get_minor(parent),
3190 class->min_rate, class->max_rate, strerror(error));
3197 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3200 struct hfsc_class class;
3202 error = hfsc_setup_qdisc__(netdev);
3208 hfsc_parse_qdisc_details__(netdev, details, &class);
3209 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3210 tc_make_handle(1, 0), &class);
3216 hfsc_install__(netdev, class.max_rate);
3221 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3224 struct nl_dump dump;
3225 struct hfsc_class hc;
3228 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3229 hfsc_install__(netdev, hc.max_rate);
3231 if (!start_queue_dump(netdev, &dump)) {
3235 while (nl_dump_next(&dump, &msg)) {
3236 unsigned int queue_id;
3238 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3239 hfsc_update_queue__(netdev, queue_id, &hc);
3243 nl_dump_done(&dump);
3248 hfsc_tc_destroy(struct tc *tc)
3251 struct hfsc_class *hc, *next;
3253 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3255 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3256 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3265 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3267 const struct hfsc *hfsc;
3268 hfsc = hfsc_get__(netdev);
3269 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3274 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3277 struct hfsc_class class;
3279 hfsc_parse_qdisc_details__(netdev, details, &class);
3280 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3281 tc_make_handle(1, 0), &class);
3284 hfsc_get__(netdev)->max_rate = class.max_rate;
3291 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3292 const struct tc_queue *queue, struct shash *details)
3294 const struct hfsc_class *hc;
3296 hc = hfsc_class_cast__(queue);
3297 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3298 if (hc->min_rate != hc->max_rate) {
3299 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3305 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3306 const struct shash *details)
3309 struct hfsc_class class;
3311 error = hfsc_parse_class_details__(netdev, details, &class);
3316 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3317 tc_make_handle(1, 0xfffe), &class);
3322 hfsc_update_queue__(netdev, queue_id, &class);
3327 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3331 struct hfsc_class *hc;
3333 hc = hfsc_class_cast__(queue);
3334 hfsc = hfsc_get__(netdev);
3336 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3338 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3345 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3346 struct netdev_queue_stats *stats)
3348 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3349 tc_make_handle(1, 0xfffe), NULL, stats);
3353 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3354 const struct ofpbuf *nlmsg,
3355 netdev_dump_queue_stats_cb *cb, void *aux)
3357 struct netdev_queue_stats stats;
3358 unsigned int handle, major, minor;
3361 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3366 major = tc_get_major(handle);
3367 minor = tc_get_minor(handle);
3368 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3369 (*cb)(minor - 1, &stats, aux);
3374 static const struct tc_ops tc_ops_hfsc = {
3375 "hfsc", /* linux_name */
3376 "linux-hfsc", /* ovs_name */
3377 HFSC_N_QUEUES, /* n_queues */
3378 hfsc_tc_install, /* tc_install */
3379 hfsc_tc_load, /* tc_load */
3380 hfsc_tc_destroy, /* tc_destroy */
3381 hfsc_qdisc_get, /* qdisc_get */
3382 hfsc_qdisc_set, /* qdisc_set */
3383 hfsc_class_get, /* class_get */
3384 hfsc_class_set, /* class_set */
3385 hfsc_class_delete, /* class_delete */
3386 hfsc_class_get_stats, /* class_get_stats */
3387 hfsc_class_dump_stats /* class_dump_stats */
3390 /* "linux-default" traffic control class.
3392 * This class represents the default, unnamed Linux qdisc. It corresponds to
3393 * the "" (empty string) QoS type in the OVS database. */
3396 default_install__(struct netdev *netdev)
3398 struct netdev_dev_linux *netdev_dev =
3399 netdev_dev_linux_cast(netdev_get_dev(netdev));
3400 static struct tc *tc;
3403 tc = xmalloc(sizeof *tc);
3404 tc_init(tc, &tc_ops_default);
3406 netdev_dev->tc = tc;
3410 default_tc_install(struct netdev *netdev,
3411 const struct shash *details OVS_UNUSED)
3413 default_install__(netdev);
3418 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3420 default_install__(netdev);
3424 static const struct tc_ops tc_ops_default = {
3425 NULL, /* linux_name */
3430 NULL, /* tc_destroy */
3431 NULL, /* qdisc_get */
3432 NULL, /* qdisc_set */
3433 NULL, /* class_get */
3434 NULL, /* class_set */
3435 NULL, /* class_delete */
3436 NULL, /* class_get_stats */
3437 NULL /* class_dump_stats */
3440 /* "linux-other" traffic control class.
3445 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3447 struct netdev_dev_linux *netdev_dev =
3448 netdev_dev_linux_cast(netdev_get_dev(netdev));
3449 static struct tc *tc;
3452 tc = xmalloc(sizeof *tc);
3453 tc_init(tc, &tc_ops_other);
3455 netdev_dev->tc = tc;
3459 static const struct tc_ops tc_ops_other = {
3460 NULL, /* linux_name */
3461 "linux-other", /* ovs_name */
3463 NULL, /* tc_install */
3465 NULL, /* tc_destroy */
3466 NULL, /* qdisc_get */
3467 NULL, /* qdisc_set */
3468 NULL, /* class_get */
3469 NULL, /* class_set */
3470 NULL, /* class_delete */
3471 NULL, /* class_get_stats */
3472 NULL /* class_dump_stats */
3475 /* Traffic control. */
3477 /* Number of kernel "tc" ticks per second. */
3478 static double ticks_per_s;
/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
3497 static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'.  Only the low 16 bits of each argument
 * end up in the handle. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
/* Returns the major number from 'handle', that is, its upper 16 bits. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Returns the minor number from 'handle', that is, its lower 16 bits. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3520 static struct tcmsg *
3521 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3522 struct ofpbuf *request)
3524 struct tcmsg *tcmsg;
3528 error = get_ifindex(netdev, &ifindex);
3533 ofpbuf_init(request, 512);
3534 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3535 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3536 tcmsg->tcm_family = AF_UNSPEC;
3537 tcmsg->tcm_ifindex = ifindex;
3538 /* Caller should fill in tcmsg->tcm_handle. */
3539 /* Caller should fill in tcmsg->tcm_parent. */
3545 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3547 int error = nl_sock_transact(rtnl_sock, request, replyp);
3548 ofpbuf_uninit(request);
3555 /* The values in psched are not individually very meaningful, but they are
3556 * important. The tables below show some values seen in the wild.
3560 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3561 * (Before that, there are hints that it was 1000000000.)
3563 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3567 * -----------------------------------
3568 * [1] 000c8000 000f4240 000f4240 00000064
3569 * [2] 000003e8 00000400 000f4240 3b9aca00
3570 * [3] 000003e8 00000400 000f4240 3b9aca00
3571 * [4] 000003e8 00000400 000f4240 00000064
3572 * [5] 000003e8 00000040 000f4240 3b9aca00
3573 * [6] 000003e8 00000040 000f4240 000000f9
3575 * a b c d ticks_per_s buffer_hz
3576 * ------- --------- ---------- ------------- ----------- -------------
3577 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3578 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3579 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3580 * [4] 1,000 1,024 1,000,000 100 976,562 100
3581 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3582 * [6] 1,000 64 1,000,000 249 15,625,000 249
3584 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3585 * [2] 2.6.26-1-686-bigmem from Debian lenny
3586 * [3] 2.6.26-2-sparc64 from Debian lenny
3587 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3588 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3589 * [6] 2.6.34 from kernel.org on KVM
3591 static const char fn[] = "/proc/net/psched";
3592 unsigned int a, b, c, d;
3598 stream = fopen(fn, "r");
3600 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3604 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3605 VLOG_WARN("%s: read failed", fn);
3609 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3613 VLOG_WARN("%s: invalid scheduler parameters", fn);
3617 ticks_per_s = (double) a * c / b;
3621 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3624 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3627 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3628 * rate of 'rate' bytes per second. */
3630 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3635 return (rate * ticks) / ticks_per_s;
3638 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3639 * rate of 'rate' bytes per second. */
3641 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3646 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3649 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3650 * a transmission rate of 'rate' bytes per second. */
3652 tc_buffer_per_jiffy(unsigned int rate)
3657 return rate / buffer_hz;
3660 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3661 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3662 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3663 * stores NULL into it if it is absent.
3665 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3668 * Returns 0 if successful, otherwise a positive errno value. */
3670 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3671 struct nlattr **options)
3673 static const struct nl_policy tca_policy[] = {
3674 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3675 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3677 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3679 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3680 tca_policy, ta, ARRAY_SIZE(ta))) {
3681 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3686 *kind = nl_attr_get_string(ta[TCA_KIND]);
3690 *options = ta[TCA_OPTIONS];
3705 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3706 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3707 * into '*options', and its queue statistics into '*stats'. Any of the output
3708 * arguments may be null.
3710 * Returns 0 if successful, otherwise a positive errno value. */
3712 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3713 struct nlattr **options, struct netdev_queue_stats *stats)
3715 static const struct nl_policy tca_policy[] = {
3716 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3717 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3719 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3721 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3722 tca_policy, ta, ARRAY_SIZE(ta))) {
3723 VLOG_WARN_RL(&rl, "failed to parse class message");
3728 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3729 *handlep = tc->tcm_handle;
3733 *options = ta[TCA_OPTIONS];
3737 const struct gnet_stats_queue *gsq;
3738 struct gnet_stats_basic gsb;
3740 static const struct nl_policy stats_policy[] = {
3741 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3742 .min_len = sizeof gsb },
3743 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3744 .min_len = sizeof *gsq },
3746 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3748 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3749 sa, ARRAY_SIZE(sa))) {
3750 VLOG_WARN_RL(&rl, "failed to parse class stats");
3754 /* Alignment issues screw up the length of struct gnet_stats_basic on
3755 * some arch/bitsize combinations. Newer versions of Linux have a
3756 * struct gnet_stats_basic_packed, but we can't depend on that. The
3757 * easiest thing to do is just to make a copy. */
3758 memset(&gsb, 0, sizeof gsb);
3759 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3760 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3761 stats->tx_bytes = gsb.bytes;
3762 stats->tx_packets = gsb.packets;
3764 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3765 stats->tx_errors = gsq->drops;
3775 memset(stats, 0, sizeof *stats);
3780 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3783 tc_query_class(const struct netdev *netdev,
3784 unsigned int handle, unsigned int parent,
3785 struct ofpbuf **replyp)
3787 struct ofpbuf request;
3788 struct tcmsg *tcmsg;
3791 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3795 tcmsg->tcm_handle = handle;
3796 tcmsg->tcm_parent = parent;
3798 error = tc_transact(&request, replyp);
3800 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3801 netdev_get_name(netdev),
3802 tc_get_major(handle), tc_get_minor(handle),
3803 tc_get_major(parent), tc_get_minor(parent),
3809 /* Equivalent to "tc class del dev <name> handle <handle>". */
3811 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3813 struct ofpbuf request;
3814 struct tcmsg *tcmsg;
3817 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3821 tcmsg->tcm_handle = handle;
3822 tcmsg->tcm_parent = 0;
3824 error = tc_transact(&request, NULL);
3826 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3827 netdev_get_name(netdev),
3828 tc_get_major(handle), tc_get_minor(handle),
3834 /* Equivalent to "tc qdisc del dev <name> root". */
3836 tc_del_qdisc(struct netdev *netdev)
3838 struct netdev_dev_linux *netdev_dev =
3839 netdev_dev_linux_cast(netdev_get_dev(netdev));
3840 struct ofpbuf request;
3841 struct tcmsg *tcmsg;
3844 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3848 tcmsg->tcm_handle = tc_make_handle(1, 0);
3849 tcmsg->tcm_parent = TC_H_ROOT;
3851 error = tc_transact(&request, NULL);
3852 if (error == EINVAL) {
3853 /* EINVAL probably means that the default qdisc was in use, in which
3854 * case we've accomplished our purpose. */
3857 if (!error && netdev_dev->tc) {
3858 if (netdev_dev->tc->ops->tc_destroy) {
3859 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3861 netdev_dev->tc = NULL;
3866 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3867 * kernel to determine what they are. Returns 0 if successful, otherwise a
3868 * positive errno value. */
3870 tc_query_qdisc(const struct netdev *netdev)
3872 struct netdev_dev_linux *netdev_dev =
3873 netdev_dev_linux_cast(netdev_get_dev(netdev));
3874 struct ofpbuf request, *qdisc;
3875 const struct tc_ops *ops;
3876 struct tcmsg *tcmsg;
3880 if (netdev_dev->tc) {
3884 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3885 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3886 * 2.6.35 without that fix backported to it.
3888 * To avoid the OOPS, we must not make a request that would attempt to dump
3889 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3890 * few others. There are a few ways that I can see to do this, but most of
3891 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3892 * technique chosen here is to assume that any non-default qdisc that we
3893 * create will have a class with handle 1:0. The built-in qdiscs only have
3894 * a class with handle 0:0.
3896 * We could check for Linux 2.6.35+ and use a more straightforward method
3898 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3902 tcmsg->tcm_handle = tc_make_handle(1, 0);
3903 tcmsg->tcm_parent = 0;
3905 /* Figure out what tc class to instantiate. */
3906 error = tc_transact(&request, &qdisc);
3910 error = tc_parse_qdisc(qdisc, &kind, NULL);
3912 ops = &tc_ops_other;
3914 ops = tc_lookup_linux_name(kind);
3916 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3917 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3919 ops = &tc_ops_other;
3922 } else if (error == ENOENT) {
3923 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3924 * other entity that doesn't have a handle 1:0. We will assume
3925 * that it's the system default qdisc. */
3926 ops = &tc_ops_default;
3929 /* Who knows? Maybe the device got deleted. */
3930 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3931 netdev_get_name(netdev), strerror(error));
3932 ops = &tc_ops_other;
3935 /* Instantiate it. */
3936 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3937 assert((load_error == 0) == (netdev_dev->tc != NULL));
3938 ofpbuf_delete(qdisc);
3940 return error ? error : load_error;
3943 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3944 approximate the time to transmit packets of various lengths. For an MTU of
3945 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3946 represents two possible packet lengths; for an MTU of 513 through 1024, four
3947 possible lengths; and so on.
3949 Returns, for the specified 'mtu', the number of bits that packet lengths
3950 need to be shifted right to fit within such a 256-entry table. */
/* NOTE(review): the return type, the declaration of 'cell_log', the loop body
 * and the return statement do not appear in the lines below -- confirm them
 * against the complete function before relying on these comments. */
3952 tc_calc_cell_log(unsigned int mtu)
/* Substitute ETH_PAYLOAD_MAX for 'mtu'.  Presumably this assignment is guarded
 * by a test for a zero 'mtu'; the guard is not shown -- TODO confirm. */
3957 mtu = ETH_PAYLOAD_MAX;
/* Add the Ethernet and VLAN header lengths to the size being measured. */
3959 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* 'cell_log' counts loop iterations for as long as 'mtu' is at least 256.
 * Presumably the (unseen) body halves 'mtu' on each pass -- TODO confirm. */
3961 for (cell_log = 0; mtu >= 256; cell_log++) {
3968 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU of 'mtu'. */
/* Fills in the tc_ratespec 'rate': zeroes the whole structure, sets 'cell_log'
 * to tc_calc_cell_log(mtu) and sets 'mpu' to ETH_TOTAL_MIN.
 *
 * NOTE(review): no statement that stores 'Bps' into 'rate' appears in the
 * lines below; presumably one follows -- TODO confirm.  Note also that 'mtu'
 * is an int here, while tc_calc_cell_log() takes an unsigned int. */
3971 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
/* Start from all-zero bytes so that every field not assigned below is 0. */
3973 memset(rate, 0, sizeof *rate);
3974 rate->cell_log = tc_calc_cell_log(mtu);
/* The next two assignments are deliberately left commented out; the trailing
 * remarks give the reason.  The memset() above already zeroes the whole
 * structure. */
3975 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3976 /* rate->cell_align = 0; */ /* distro headers. */
3977 rate->mpu = ETH_TOTAL_MIN;
3981 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3982 * attribute of the specified "type".
3984 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* The fields of 'rate' read here are 'cell_log', 'mpu' and 'rate'.
 *
 * NOTE(review): the return type and the declarations of 'rtab' and 'i' do not
 * appear in the lines below. */
3986 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Obtain the attribute payload of TC_RTAB_SIZE bytes that the loop fills in.
 * Presumably nl_msg_put_unspec_uninit() leaves it uninitialized, as its name
 * suggests -- TODO confirm. */
3991 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
/* One iteration per table entry: TC_RTAB_SIZE is a byte count, so divide by
 * the size of one entry. */
3992 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry 'i' is computed for a packet of (i + 1) << cell_log bytes... */
3993 unsigned packet_size = (i + 1) << rate->cell_log;
/* ...raised to 'rate->mpu' when it would otherwise be smaller. */
3994 if (packet_size < rate->mpu) {
3995 packet_size = rate->mpu;
/* Store what tc_bytes_to_ticks() returns for that size at 'rate->rate'. */
3997 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4001 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4002 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4003 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of 0 is fine.) */
/* Returns tc_bytes_to_ticks() applied to 'Bps' and a burst size, where the
 * burst size is the larger of 'burst_bytes' and a minimum computed as
 * tc_buffer_per_jiffy(Bps) + mtu.  A 'burst_bytes' below that minimum
 * (including 0) therefore yields the minimum.
 *
 * NOTE(review): 'Bps' is an unsigned int here but a uint64_t in
 * tc_fill_rate(); confirm that rates above UINT_MAX cannot reach this
 * function.  The return type is not shown in the lines below. */
4006 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* Smallest burst allowed: the tc_buffer_per_jiffy() value for 'Bps' plus one
 * 'mtu'. */
4008 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4009 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4012 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Each of the 21 assignments below copies one counter from 'src' to the field
 * of the same name in 'dst'; no other field of 'dst' is written by the lines
 * shown.
 *
 * The only conversion visible here is the implicit integer conversion done by
 * each assignment.  The 'dst' fields are scanned with SCNu64 elsewhere in this
 * file, so they are 64 bits wide; the width of the 'src' fields is not visible
 * here (presumably narrower) -- TODO confirm. */
4014 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4015 const struct rtnl_link_stats *src)
4017 dst->rx_packets = src->rx_packets;
4018 dst->tx_packets = src->tx_packets;
4019 dst->rx_bytes = src->rx_bytes;
4020 dst->tx_bytes = src->tx_bytes;
4021 dst->rx_errors = src->rx_errors;
4022 dst->tx_errors = src->tx_errors;
4023 dst->rx_dropped = src->rx_dropped;
4024 dst->tx_dropped = src->tx_dropped;
4025 dst->multicast = src->multicast;
4026 dst->collisions = src->collisions;
4027 dst->rx_length_errors = src->rx_length_errors;
4028 dst->rx_over_errors = src->rx_over_errors;
4029 dst->rx_crc_errors = src->rx_crc_errors;
4030 dst->rx_frame_errors = src->rx_frame_errors;
4031 dst->rx_fifo_errors = src->rx_fifo_errors;
4032 dst->rx_missed_errors = src->rx_missed_errors;
4033 dst->tx_aborted_errors = src->tx_aborted_errors;
4034 dst->tx_carrier_errors = src->tx_carrier_errors;
4035 dst->tx_fifo_errors = src->tx_fifo_errors;
4036 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4037 dst->tx_window_errors = src->tx_window_errors;
4041 /* Utility functions. */
/* Fetches the link statistics of the interface numbered 'ifindex' by sending
 * an RTM_GETLINK request over 'rtnl_sock', and converts the IFLA_STATS
 * attribute of the reply into 'stats' with
 * netdev_stats_from_rtnl_link_stats().
 *
 * NOTE(review): the return type, the declaration of 'error', the handling of
 * a failed nl_sock_transact() and every return statement do not appear in the
 * lines below.  Presumably the function returns 0 on success and a positive
 * errno value otherwise -- TODO confirm. */
4044 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4046 /* Policy for RTNLGRP_LINK messages.
4048 * There are *many* more fields in these messages, but currently we only
4049 * care about these fields. */
4050 static const struct nl_policy rtnlgrp_link_policy[] = {
4051 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
/* IFLA_STATS is optional in the policy, so its presence is checked by hand
 * after parsing; when present it must hold at least one rtnl_link_stats. */
4052 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4053 .min_len = sizeof(struct rtnl_link_stats) },
4056 struct ofpbuf request;
4057 struct ofpbuf *reply;
4058 struct ifinfomsg *ifi;
4059 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a Netlink header followed by a zeroed ifinfomsg whose
 * only nonzero field is the interface index (PF_UNSPEC is assigned to the
 * family explicitly). */
4062 ofpbuf_init(&request, 0);
4063 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4064 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4065 ifi->ifi_family = PF_UNSPEC;
4066 ifi->ifi_index = ifindex;
4067 error = nl_sock_transact(rtnl_sock, &request, &reply);
/* The request buffer is released whether or not the transaction worked. */
4068 ofpbuf_uninit(&request);
/* Parse the attributes that follow the Netlink header and the ifinfomsg.  On
 * a parse failure the reply is freed before leaving. */
4073 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4074 rtnlgrp_link_policy,
4075 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4076 ofpbuf_delete(reply);
/* A reply without statistics is logged (rate-limited) and freed. */
4080 if (!attrs[IFLA_STATS]) {
4081 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4082 ofpbuf_delete(reply);
/* Convert the attribute payload while 'reply' is still alive: the call is
 * made before 'reply' is deleted below. */
4086 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4088 ofpbuf_delete(reply);
/* Reads the counters for the device named 'netdev_name' from /proc/net/dev
 * into 'stats'.  The file is read one line at a time with fgets() and each
 * line is parsed with sscanf(); a line counts as parsed only when sscanf()
 * reports exactly 15 conversions.
 *
 * NOTE(review): the return type, the declarations of 'line', 'stream', 'ln'
 * and 'devname', the start of the sscanf() call, several of its output
 * arguments, the fclose() calls and every return statement do not appear in
 * the lines below -- confirm them against the complete function.  Because
 * sscanf() writes straight into 'stats' for every line, lines belonging to
 * other devices also overwrite 'stats' along the way. */
4094 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4096 static const char fn[] = "/proc/net/dev";
4101 stream = fopen(fn, "r");
/* Logged (rate-limited) when the file cannot be opened. */
4103 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4108 while (fgets(line, sizeof line, stream)) {
/* X64 is the sscanf() conversion for one uint64_t counter.  The two format
 * lines shown contain fourteen X64 conversions; each "%*u" reads one more
 * number and discards it. */
4111 #define X64 "%"SCNu64
4114 X64 X64 X64 X64 X64 X64 X64 "%*u"
4115 X64 X64 X64 X64 X64 X64 X64 "%*u",
4121 &stats->rx_fifo_errors,
4122 &stats->rx_frame_errors,
4128 &stats->tx_fifo_errors,
4130 &stats->tx_carrier_errors) != 15) {
4131 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4132 } else if (!strcmp(devname, netdev_name)) {
/* These seven counters are not among the sscanf() outputs shown above.
 * Presumably UINT64_MAX marks them as unavailable -- TODO confirm. */
4133 stats->rx_length_errors = UINT64_MAX;
4134 stats->rx_over_errors = UINT64_MAX;
4135 stats->rx_crc_errors = UINT64_MAX;
4136 stats->rx_missed_errors = UINT64_MAX;
4137 stats->tx_aborted_errors = UINT64_MAX;
4138 stats->tx_heartbeat_errors = UINT64_MAX;
4139 stats->tx_window_errors = UINT64_MAX;
/* Presumably reached only after the whole file was read without finding
 * 'netdev_name' -- TODO confirm. */
4145 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'netdev' with a SIOCGIFFLAGS request issued
 * through netdev_linux_do_ioctl(), and stores 'ifr.ifr_flags' in '*flags'.
 *
 * NOTE(review): the return type, the declarations of 'ifr' and 'error', the
 * end of the netdev_linux_do_ioctl() call and the return statement do not
 * appear in the lines below.  Presumably 'error' is what gets returned, and
 * '*flags' is assigned even when the ioctl failed -- TODO confirm. */
4151 get_flags(const struct netdev *netdev, int *flags)
4156 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4158 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with a SIOCSIFFLAGS request
 * issued through netdev_linux_do_ioctl(), and returns that function's result.
 *
 * NOTE(review): the return type, the declaration of 'ifr' and the end of the
 * netdev_linux_do_ioctl() call do not appear in the lines below. */
4163 set_flags(struct netdev *netdev, int flags)
4167 ifr.ifr_flags = flags;
4168 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Asks the kernel for the interface index of the device named 'netdev_name'
 * with a SIOCGIFINDEX ioctl on 'af_inet_sock' and, when the ioctl succeeds,
 * returns 'ifr.ifr_ifindex'.  Every call bumps the netdev_get_ifindex coverage
 * counter.
 *
 * NOTE(review): the return type, the declaration of 'ifr' and the value
 * returned when the ioctl fails do not appear in the lines below.  Presumably
 * a failure is reported as a negative value -- TODO confirm. */
4173 do_get_ifindex(const char *netdev_name)
/* Copy the device name into the request. */
4177 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4178 COVERAGE_INC(netdev_get_ifindex);
4179 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4180 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4181 netdev_name, strerror(errno));
4184 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp', using the value
 * cached in the device structure when one is available.
 *
 * The cache is the 'ifindex' member of the netdev_dev_linux behind 'netdev_';
 * the VALID_IFINDEX bit of 'cache_valid' records whether it has been filled
 * in.  do_get_ifindex() is called only while that bit is clear.
 *
 * NOTE(review): the return type, the check of do_get_ifindex()'s result and
 * the return statements do not appear in the lines below.  Presumably a
 * failed lookup returns early without marking the cache valid -- TODO
 * confirm. */
4188 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4190 struct netdev_dev_linux *netdev_dev =
4191 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4193 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4194 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Record the freshly obtained index and mark it as cached. */
4198 netdev_dev->cache_valid |= VALID_IFINDEX;
4199 netdev_dev->ifindex = ifindex;
4201 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with a
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies its first ETH_ADDR_LEN
 * bytes into 'ea'.  Every call bumps the netdev_get_hwaddr coverage counter.
 *
 * Neither log message in this function is rate-limited.
 *
 * NOTE(review): the return type, the declarations of 'ifr' and
 * 'hwaddr_family' and every return statement do not appear in the lines
 * below.  Presumably a failed ioctl returns early with an errno value, while
 * an unexpected address family is only warned about and the copy still
 * happens -- TODO confirm. */
4206 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
/* Zero the whole request before filling in the device name. */
4211 memset(&ifr, 0, sizeof ifr);
4212 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4213 COVERAGE_INC(netdev_get_hwaddr);
4214 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4215 /* ENODEV probably means that a vif disappeared asynchronously and
4216 * hasn't been removed from the database yet, so reduce the log level
4217 * to INFO for that case. */
4218 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4219 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4220 netdev_name, strerror(errno));
/* Only AF_UNSPEC and ARPHRD_ETHER are accepted silently as the address
 * family reported by the kernel; anything else is logged. */
4223 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4224 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4225 VLOG_WARN("%s device has unknown hardware address family %d",
4226 netdev_name, hwaddr_family);
4228 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac' with a SIOCSIFHWADDR ioctl on 'af_inet_sock'.
 * 'hwaddr_family' is passed to the kernel as the address family of the new
 * address.  Every call bumps the netdev_set_hwaddr coverage counter.
 *
 * NOTE(review): the return type, the declaration of 'ifr' and the return
 * statements do not appear in the lines below.  Presumably the function
 * returns 0 on success and an errno value after a failed ioctl -- TODO
 * confirm. */
4233 set_etheraddr(const char *netdev_name, int hwaddr_family,
4234 const uint8_t mac[ETH_ADDR_LEN])
/* Zero the whole request, then fill in the name, the family and the address. */
4238 memset(&ifr, 0, sizeof ifr);
4239 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4240 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4241 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4242 COVERAGE_INC(netdev_set_hwaddr);
/* A failure is logged at error level, without rate limiting. */
4243 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4244 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4245 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for the device named 'name',
 * passing 'ecmd' as the request data.  'cmd_name' is used only in the log
 * message.  Every call bumps the netdev_ethtool coverage counter.
 *
 * A failure is logged (rate-limited) unless errno is EOPNOTSUPP, which is
 * deliberately left unlogged.
 *
 * NOTE(review): 'cmd' is not used in the lines below; presumably it is stored
 * into 'ecmd' before the ioctl.  The return type, the declaration of 'ifr'
 * and the return statements are also not shown; presumably the function
 * returns 0 on success and errno otherwise -- TODO confirm. */
4252 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4253 int cmd, const char *cmd_name)
4257 memset(&ifr, 0, sizeof ifr);
4258 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* The ethtool request travels through the ifreq's data pointer. */
4259 ifr.ifr_data = (caddr_t) ecmd;
4262 COVERAGE_INC(netdev_ethtool);
4263 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4266 if (errno != EOPNOTSUPP) {
4267 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4268 "failed: %s", cmd_name, name, strerror(errno));
4270 /* The device doesn't support this operation. That's pretty
4271 * common, so there's no point in logging anything. */
4277 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4278 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* The update is a read-modify-write followed by a read-back: the current
 * flags are fetched with ETHTOOL_GFLAGS, the new value is written with
 * ETHTOOL_SFLAGS, and the flags are fetched again and compared with what was
 * written.  A mismatch is logged (rate-limited), naming the flag with
 * 'flag_name'.
 *
 * 'evalue' is a struct ethtool_value, cast to struct ethtool_cmd * because
 * that is the parameter type of netdev_linux_do_ethtool().
 *
 * NOTE(review): the return type, the declarations of 'new_flags' and 'error',
 * the checks of 'error' after each request and every return statement do not
 * appear in the lines below.  Presumably each failed request returns its
 * error at once, and a mismatch returns an error too -- TODO confirm. */
4280 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4281 const char *flag_name, bool enable)
4283 const char *netdev_name = netdev_get_name(netdev);
4284 struct ethtool_value evalue;
/* Step 1: read the current flags. */
4288 memset(&evalue, 0, sizeof evalue);
4289 error = netdev_linux_do_ethtool(netdev_name,
4290 (struct ethtool_cmd *)&evalue,
4291 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: clear 'flag', set it again if 'enable', and write the result.
 * 'new_flags' keeps the written value for the comparison in step 3. */
4296 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4297 error = netdev_linux_do_ethtool(netdev_name,
4298 (struct ethtool_cmd *)&evalue,
4299 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags back to check that the write took effect. */
4304 memset(&evalue, 0, sizeof evalue);
4305 error = netdev_linux_do_ethtool(netdev_name,
4306 (struct ethtool_cmd *)&evalue,
4307 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4312 if (new_flags != evalue.data) {
4313 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4314 "device %s failed", enable ? "enable" : "disable",
4315 flag_name, netdev_name);
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' on 'af_inet_sock'
 * with 'ifr' as the argument.  The caller prepares every other field of 'ifr'
 * that the request needs.  A failure is logged at debug level (rate-limited),
 * with 'cmd_name' identifying the request.
 *
 * NOTE(review): the return type, the end of the log call and the return
 * statements do not appear in the lines below.  Presumably the function
 * returns 0 on success and errno on failure -- TODO confirm. */
4323 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4324 const char *cmd_name)
4326 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4327 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4328 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs the address-fetching ioctl 'cmd' for 'netdev' through
 * netdev_linux_do_ioctl() and copies the IPv4 address found in the result
 * into '*ip'.  'cmd_name' is passed along for logging.
 *
 * NOTE(review): the return type, the declarations of 'ifr' and 'error', the
 * condition guarding the copy and the return statement do not appear in the
 * lines below.  Presumably the copy happens only when 'error' is 0, and
 * 'error' is returned -- TODO confirm. */
4336 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4337 int cmd, const char *cmd_name)
/* Set the address family of the request to AF_INET before the ioctl. */
4342 ifr.ifr_addr.sa_family = AF_INET;
4343 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* Reinterpret the generic socket address in the result as an IPv4 one. */
4345 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4346 *ip = sin->sin_addr;
4351 /* Returns an AF_PACKET raw socket or a negative errno value. */
4353 af_packet_sock(void)
4355 static int sock = INT_MIN;
4357 if (sock == INT_MIN) {
4358 sock = socket(AF_PACKET, SOCK_RAW, 0);
4360 set_nonblocking(sock);
4363 VLOG_ERR("failed to create packet socket: %s", strerror(errno));