2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
67 #include "socket-util.h"
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75 COVERAGE_DEFINE(netdev_get_vlan_vid);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
83 /* These were introduced in Linux 2.6.14, so they might be missing if we have
85 #ifndef ADVERTISED_Pause
86 #define ADVERTISED_Pause (1 << 13)
88 #ifndef ADVERTISED_Asym_Pause
89 #define ADVERTISED_Asym_Pause (1 << 14)
92 /* These were introduced in Linux 2.6.24, so they might be missing if we
93 * have old headers. */
94 #ifndef ETHTOOL_GFLAGS
95 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
97 #ifndef ETHTOOL_SFLAGS
98 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
101 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
104 #define TC_RTAB_SIZE 1024
107 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
108 static int cache_notifier_refcount;
111 VALID_IFINDEX = 1 << 0,
112 VALID_ETHERADDR = 1 << 1,
116 VALID_POLICING = 1 << 5,
117 VALID_HAVE_VPORT_STATS = 1 << 6
125 /* Traffic control. */
127 /* An instance of a traffic control class. Always associated with a particular
130 * Each TC implementation subclasses this with whatever additional data it
133 const struct tc_ops *ops;
134 struct hmap queues; /* Contains "struct tc_queue"s.
135 * Read by generic TC layer.
136 * Written only by TC implementation. */
139 /* One traffic control queue.
141 * Each TC implementation subclasses this with whatever additional data it
144 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
145 unsigned int queue_id; /* OpenFlow queue ID. */
148 /* A particular kind of traffic control. Each implementation generally maps to
149 * one particular Linux qdisc class.
151 * The functions below return 0 if successful or a positive errno value on
152 * failure, except where otherwise noted. All of them must be provided, except
153 * where otherwise noted. */
155 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
156 * This is null for tc_ops_default and tc_ops_other, for which there are no
157 * appropriate values. */
158 const char *linux_name;
160 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
161 const char *ovs_name;
163 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
164 * queues. The queues are numbered 0 through n_queues - 1. */
165 unsigned int n_queues;
167 /* Called to install this TC class on 'netdev'. The implementation should
168 * make the Netlink calls required to set up 'netdev' with the right qdisc
169 * and configure it according to 'details'. The implementation may assume
170 * that the current qdisc is the default; that is, there is no need for it
171 * to delete the current qdisc before installing itself.
173 * The contents of 'details' should be documented as valid for 'ovs_name'
174 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
175 * (which is built as ovs-vswitchd.conf.db(8)).
177 * This function must return 0 if and only if it sets 'netdev->tc' to an
178 * initialized 'struct tc'.
180 * (This function is null for tc_ops_other, which cannot be installed. For
181 * other TC classes it should always be nonnull.) */
182 int (*tc_install)(struct netdev *netdev, const struct shash *details);
184 /* Called when the netdev code determines (through a Netlink query) that
185 * this TC class's qdisc is installed on 'netdev', but we didn't install
186 * it ourselves and so don't know any of the details.
188 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
189 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
190 * implementation should parse the other attributes of 'nlmsg' as
191 * necessary to determine its configuration. If necessary it should also
192 * use Netlink queries to determine the configuration of queues on
195 * This function must return 0 if and only if it sets 'netdev->tc' to an
196 * initialized 'struct tc'. */
197 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
199 /* Destroys the data structures allocated by the implementation as part of
200 * 'tc'. (This includes destroying 'tc->queues' by calling
203 * The implementation should not need to perform any Netlink calls. If
204 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
205 * (But it may not be desirable.)
207 * This function may be null if 'tc' is trivial. */
208 void (*tc_destroy)(struct tc *tc);
210 /* Retrieves details of 'netdev->tc' configuration into 'details'.
212 * The implementation should not need to perform any Netlink calls, because
213 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
214 * cached the configuration.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
220 * This function may be null if 'tc' is not configurable.
222 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
224 /* Reconfigures 'netdev->tc' according to 'details', performing any
225 * required Netlink calls to complete the reconfiguration.
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
229 * (which is built as ovs-vswitchd.conf.db(8)).
231 * This function may be null if 'tc' is not configurable.
233 int (*qdisc_set)(struct netdev *, const struct shash *details);
235 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
236 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
238 * The contents of 'details' should be documented as valid for 'ovs_name'
239 * in the "other_config" column in the "Queue" table in
240 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
242 * The implementation should not need to perform any Netlink calls, because
243 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
244 * cached the queue configuration.
246 * This function may be null if 'tc' does not have queues ('n_queues' is
248 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
249 struct shash *details);
251 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
252 * 'details', performing any required Netlink calls to complete the
253 * reconfiguration. The caller ensures that 'queue_id' is less than
256 * The contents of 'details' should be documented as valid for 'ovs_name'
257 * in the "other_config" column in the "Queue" table in
258 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
260 * This function may be null if 'tc' does not have queues or its queues are
261 * not configurable. */
262 int (*class_set)(struct netdev *, unsigned int queue_id,
263 const struct shash *details);
265 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
266 * tc_queue's within 'netdev->tc->queues'.
268 * This function may be null if 'tc' does not have queues or its queues
269 * cannot be deleted. */
270 int (*class_delete)(struct netdev *, struct tc_queue *queue);
272 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
273 * 'struct tc_queue's within 'netdev->tc->queues'.
275 * On success, initializes '*stats'.
277 * This function may be null if 'tc' does not have queues or if it cannot
278 * report queue statistics. */
279 int (*class_get_stats)(const struct netdev *netdev,
280 const struct tc_queue *queue,
281 struct netdev_queue_stats *stats);
283 /* Extracts queue stats from 'nlmsg', which is a response to a
284 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
286 * This function may be null if 'tc' does not have queues or if it cannot
287 * report queue statistics. */
288 int (*class_dump_stats)(const struct netdev *netdev,
289 const struct ofpbuf *nlmsg,
290 netdev_dump_queue_stats_cb *cb, void *aux);
294 tc_init(struct tc *tc, const struct tc_ops *ops)
297 hmap_init(&tc->queues);
301 tc_destroy(struct tc *tc)
303 hmap_destroy(&tc->queues);
306 static const struct tc_ops tc_ops_htb;
307 static const struct tc_ops tc_ops_hfsc;
308 static const struct tc_ops tc_ops_default;
309 static const struct tc_ops tc_ops_other;
311 static const struct tc_ops *tcs[] = {
312 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
313 &tc_ops_hfsc, /* Hierarchical fair service curve. */
314 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
315 &tc_ops_other, /* Some other qdisc. */
319 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
320 static unsigned int tc_get_major(unsigned int handle);
321 static unsigned int tc_get_minor(unsigned int handle);
323 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
324 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
325 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
327 static struct tcmsg *tc_make_request(const struct netdev *, int type,
328 unsigned int flags, struct ofpbuf *);
329 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
331 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
332 struct nlattr **options);
333 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
334 struct nlattr **options,
335 struct netdev_queue_stats *);
336 static int tc_query_class(const struct netdev *,
337 unsigned int handle, unsigned int parent,
338 struct ofpbuf **replyp);
339 static int tc_delete_class(const struct netdev *, unsigned int handle);
341 static int tc_del_qdisc(struct netdev *netdev);
342 static int tc_query_qdisc(const struct netdev *netdev);
344 static int tc_calc_cell_log(unsigned int mtu);
345 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
346 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
347 const struct tc_ratespec *rate);
348 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
350 struct netdev_dev_linux {
351 struct netdev_dev netdev_dev;
353 struct shash_node *shash_node;
354 unsigned int cache_valid;
355 unsigned int change_seq;
357 bool miimon; /* Link status of last poll. */
358 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
359 struct timer miimon_timer;
361 /* The following are figured out "on demand" only. They are only valid
362 * when the corresponding VALID_* bit in 'cache_valid' is set. */
364 uint8_t etheraddr[ETH_ADDR_LEN];
365 struct in_addr address, netmask;
369 uint32_t kbits_rate; /* Policing data. */
370 uint32_t kbits_burst;
371 bool have_vport_stats;
375 struct tap_state tap;
379 struct netdev_linux {
380 struct netdev netdev;
384 /* Sockets used for ioctl operations. */
385 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
387 /* A Netlink routing socket that is not subscribed to any multicast groups. */
388 static struct nl_sock *rtnl_sock;
390 /* This is set pretty low because we probably won't learn anything from the
391 * additional log messages. */
392 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
394 static int netdev_linux_init(void);
396 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
397 int cmd, const char *cmd_name);
398 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
399 const char *cmd_name);
400 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
401 int cmd, const char *cmd_name);
402 static int get_flags(const struct netdev *, int *flagsp);
403 static int set_flags(struct netdev *, int flags);
404 static int do_get_ifindex(const char *netdev_name);
405 static int get_ifindex(const struct netdev *, int *ifindexp);
406 static int do_set_addr(struct netdev *netdev,
407 int ioctl_nr, const char *ioctl_name,
408 struct in_addr addr);
409 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
410 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
411 const uint8_t[ETH_ADDR_LEN]);
412 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
413 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
414 static int get_carrier_via_sysfs(const char *name, bool *carrier);
415 static int af_packet_sock(void);
416 static void netdev_linux_miimon_run(void);
417 static void netdev_linux_miimon_wait(void);
420 is_netdev_linux_class(const struct netdev_class *netdev_class)
422 return netdev_class->init == netdev_linux_init;
425 static struct netdev_dev_linux *
426 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
428 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
429 assert(is_netdev_linux_class(netdev_class));
431 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
434 static struct netdev_linux *
435 netdev_linux_cast(const struct netdev *netdev)
437 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
438 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
439 assert(is_netdev_linux_class(netdev_class));
441 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
445 netdev_linux_init(void)
447 static int status = -1;
449 /* Create AF_INET socket. */
450 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
451 status = af_inet_sock >= 0 ? 0 : errno;
453 VLOG_ERR("failed to create inet socket: %s", strerror(status));
456 /* Create rtnetlink socket. */
458 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
460 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
469 netdev_linux_run(void)
471 rtnetlink_link_run();
472 netdev_linux_miimon_run();
476 netdev_linux_wait(void)
478 rtnetlink_link_wait();
479 netdev_linux_miimon_wait();
483 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
486 if (!dev->change_seq) {
489 dev->cache_valid = 0;
493 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
494 void *aux OVS_UNUSED)
496 struct netdev_dev_linux *dev;
498 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
500 const struct netdev_class *netdev_class =
501 netdev_dev_get_class(base_dev);
503 if (is_netdev_linux_class(netdev_class)) {
504 dev = netdev_dev_linux_cast(base_dev);
506 if (dev->carrier != change->running) {
507 dev->carrier = change->running;
510 netdev_dev_linux_changed(dev);
514 struct shash device_shash;
515 struct shash_node *node;
517 shash_init(&device_shash);
518 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
519 SHASH_FOR_EACH (node, &device_shash) {
524 get_carrier_via_sysfs(node->name, &carrier);
525 if (dev->carrier != carrier) {
526 dev->carrier = carrier;
529 netdev_dev_linux_changed(dev);
531 shash_destroy(&device_shash);
535 /* Creates system and internal devices. */
537 netdev_linux_create(const struct netdev_class *class, const char *name,
538 struct netdev_dev **netdev_devp)
540 struct netdev_dev_linux *netdev_dev;
542 if (!cache_notifier_refcount) {
543 assert(!netdev_linux_cache_notifier);
545 netdev_linux_cache_notifier =
546 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
548 if (!netdev_linux_cache_notifier) {
552 cache_notifier_refcount++;
554 netdev_dev = xzalloc(sizeof *netdev_dev);
555 netdev_dev->change_seq = 1;
556 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
557 get_carrier_via_sysfs(name, &netdev_dev->carrier);
559 *netdev_devp = &netdev_dev->netdev_dev;
563 /* For most types of netdevs we open the device for each call of
564 * netdev_open(). However, this is not the case with tap devices,
565 * since it is only possible to open the device once. In this
566 * situation we share a single file descriptor, and consequently
567 * buffers, across all readers. Therefore once data is read it will
568 * be unavailable to other reads for tap devices. */
570 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
571 const char *name, struct netdev_dev **netdev_devp)
573 struct netdev_dev_linux *netdev_dev;
574 struct tap_state *state;
575 static const char tap_dev[] = "/dev/net/tun";
579 netdev_dev = xzalloc(sizeof *netdev_dev);
580 state = &netdev_dev->state.tap;
582 /* Open tap device. */
583 state->fd = open(tap_dev, O_RDWR);
586 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
590 /* Create tap device. */
591 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
592 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
593 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
594 VLOG_WARN("%s: creating tap device failed: %s", name,
600 /* Make non-blocking. */
601 error = set_nonblocking(state->fd);
606 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
607 *netdev_devp = &netdev_dev->netdev_dev;
616 destroy_tap(struct netdev_dev_linux *netdev_dev)
618 struct tap_state *state = &netdev_dev->state.tap;
620 if (state->fd >= 0) {
625 /* Destroys the netdev device 'netdev_dev_'. */
627 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
629 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
630 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
632 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
633 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
636 if (class == &netdev_linux_class || class == &netdev_internal_class) {
637 cache_notifier_refcount--;
639 if (!cache_notifier_refcount) {
640 assert(netdev_linux_cache_notifier);
641 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
642 netdev_linux_cache_notifier = NULL;
644 } else if (class == &netdev_tap_class) {
645 destroy_tap(netdev_dev);
654 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
656 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
657 struct netdev_linux *netdev;
658 enum netdev_flags flags;
661 /* Allocate network device. */
662 netdev = xzalloc(sizeof *netdev);
664 netdev_init(&netdev->netdev, netdev_dev_);
666 /* Verify that the device really exists, by attempting to read its flags.
667 * (The flags might be cached, in which case this won't actually do an
670 * Don't do this for "internal" netdevs, though, because those have to be
671 * created as netdev objects before they exist in the kernel, because
672 * creating them in the kernel happens by passing a netdev object to
673 * dpif_port_add(). */
674 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
675 error = netdev_get_flags(&netdev->netdev, &flags);
676 if (error == ENODEV) {
681 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
682 !netdev_dev->state.tap.opened) {
684 /* We assume that the first user of the tap device is the primary user
685 * and give them the tap FD. Subsequent users probably just expect
686 * this to be a system device so open it normally to avoid send/receive
687 * directions appearing to be reversed. */
688 netdev->fd = netdev_dev->state.tap.fd;
689 netdev_dev->state.tap.opened = true;
692 *netdevp = &netdev->netdev;
696 netdev_uninit(&netdev->netdev, true);
700 /* Closes and destroys 'netdev'. */
702 netdev_linux_close(struct netdev *netdev_)
704 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
706 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
713 netdev_linux_listen(struct netdev *netdev_)
715 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
716 struct sockaddr_ll sll;
721 if (netdev->fd >= 0) {
725 /* Create file descriptor. */
726 fd = socket(PF_PACKET, SOCK_RAW, 0);
729 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
733 /* Set non-blocking mode. */
734 error = set_nonblocking(fd);
739 /* Get ethernet device index. */
740 error = get_ifindex(&netdev->netdev, &ifindex);
745 /* Bind to specific ethernet device. */
746 memset(&sll, 0, sizeof sll);
747 sll.sll_family = AF_PACKET;
748 sll.sll_ifindex = ifindex;
749 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
750 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
752 VLOG_ERR("%s: failed to bind raw socket (%s)",
753 netdev_get_name(netdev_), strerror(error));
768 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
770 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
772 if (netdev->fd < 0) {
773 /* Device is not listening. */
778 ssize_t retval = read(netdev->fd, data, size);
781 } else if (errno != EINTR) {
782 if (errno != EAGAIN) {
783 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
784 netdev_get_name(netdev_), strerror(errno));
791 /* Registers with the poll loop to wake up from the next call to poll_block()
792 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
794 netdev_linux_recv_wait(struct netdev *netdev_)
796 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
797 if (netdev->fd >= 0) {
798 poll_fd_wait(netdev->fd, POLLIN);
802 /* Discards all packets waiting to be received from 'netdev'. */
804 netdev_linux_drain(struct netdev *netdev_)
806 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
807 if (netdev->fd < 0) {
809 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
811 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
812 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
816 drain_fd(netdev->fd, ifr.ifr_qlen);
819 return drain_rcvbuf(netdev->fd);
823 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
824 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
825 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
826 * the packet is too big or too small to transmit on the device.
828 * The caller retains ownership of 'buffer' in all cases.
830 * The kernel maintains a packet transmission queue, so the caller is not
831 * expected to do additional queuing of packets. */
833 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
835 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
839 if (netdev->fd < 0) {
840 /* Use our AF_PACKET socket to send to this device. */
841 struct sockaddr_ll sll;
848 sock = af_packet_sock();
853 error = get_ifindex(netdev_, &ifindex);
858 /* We don't bother setting most fields in sockaddr_ll because the
859 * kernel ignores them for SOCK_RAW. */
860 memset(&sll, 0, sizeof sll);
861 sll.sll_family = AF_PACKET;
862 sll.sll_ifindex = ifindex;
864 iov.iov_base = (void *) data;
868 msg.msg_namelen = sizeof sll;
871 msg.msg_control = NULL;
872 msg.msg_controllen = 0;
875 retval = sendmsg(sock, &msg, 0);
877 /* Use the netdev's own fd to send to this device. This is
878 * essential for tap devices, because packets sent to a tap device
879 * with an AF_PACKET socket will loop back to be *received* again
880 * on the tap device. */
881 retval = write(netdev->fd, data, size);
885 /* The Linux AF_PACKET implementation never blocks waiting for room
886 * for packets, instead returning ENOBUFS. Translate this into
887 * EAGAIN for the caller. */
888 if (errno == ENOBUFS) {
890 } else if (errno == EINTR) {
892 } else if (errno != EAGAIN) {
893 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
894 netdev_get_name(netdev_), strerror(errno));
897 } else if (retval != size) {
898 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
899 "%zu) on %s", retval, size, netdev_get_name(netdev_));
907 /* Registers with the poll loop to wake up from the next call to poll_block()
908 * when the packet transmission queue has sufficient room to transmit a packet
909 * with netdev_send().
911 * The kernel maintains a packet transmission queue, so the client is not
912 * expected to do additional queuing of packets. Thus, this function is
913 * unlikely to ever be used. It is included for completeness. */
915 netdev_linux_send_wait(struct netdev *netdev_)
917 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
918 if (netdev->fd < 0) {
920 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
921 poll_fd_wait(netdev->fd, POLLOUT);
923 /* TAP device always accepts packets.*/
924 poll_immediate_wake();
928 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
929 * otherwise a positive errno value. */
931 netdev_linux_set_etheraddr(struct netdev *netdev_,
932 const uint8_t mac[ETH_ADDR_LEN])
934 struct netdev_dev_linux *netdev_dev =
935 netdev_dev_linux_cast(netdev_get_dev(netdev_));
938 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
939 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
940 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
942 netdev_dev->cache_valid |= VALID_ETHERADDR;
943 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
951 /* Copies 'netdev''s MAC address into the caller-provided buffer 'mac',
952 * querying the device first if no valid address is cached. */
954 netdev_linux_get_etheraddr(const struct netdev *netdev_,
955 uint8_t mac[ETH_ADDR_LEN])
957 struct netdev_dev_linux *netdev_dev =
958 netdev_dev_linux_cast(netdev_get_dev(netdev_));
959 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
960 int error = get_etheraddr(netdev_get_name(netdev_),
961 netdev_dev->etheraddr);
965 netdev_dev->cache_valid |= VALID_ETHERADDR;
967 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
971 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
972 * in bytes, not including the hardware header; thus, this is typically 1500
973 * bytes for Ethernet devices. */
975 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
977 struct netdev_dev_linux *netdev_dev =
978 netdev_dev_linux_cast(netdev_get_dev(netdev_));
979 if (!(netdev_dev->cache_valid & VALID_MTU)) {
983 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
984 SIOCGIFMTU, "SIOCGIFMTU");
988 netdev_dev->mtu = ifr.ifr_mtu;
989 netdev_dev->cache_valid |= VALID_MTU;
991 *mtup = netdev_dev->mtu;
995 /* Sets the maximum transmission unit (MTU) of the given device using the
996 * Linux networking ioctl interface.
999 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1001 struct netdev_dev_linux *netdev_dev =
1002 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1007 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1008 SIOCSIFMTU, "SIOCSIFMTU");
1013 netdev_dev->mtu = ifr.ifr_mtu;
1014 netdev_dev->cache_valid |= VALID_MTU;
1018 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1019 * On failure, returns a negative errno value. */
1021 netdev_linux_get_ifindex(const struct netdev *netdev)
1025 error = get_ifindex(netdev, &ifindex);
1026 return error ? -error : ifindex;
1030 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1032 struct netdev_dev_linux *netdev_dev =
1033 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1035 if (netdev_dev->miimon_interval > 0) {
1036 *carrier = netdev_dev->miimon;
1038 *carrier = netdev_dev->carrier;
1045 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1046 struct mii_ioctl_data *data)
1051 memset(&ifr, 0, sizeof ifr);
1052 memcpy(&ifr.ifr_data, data, sizeof *data);
1053 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1054 memcpy(data, &ifr.ifr_data, sizeof *data);
1060 netdev_linux_get_miimon(const char *name, bool *miimon)
1062 struct mii_ioctl_data data;
1067 memset(&data, 0, sizeof data);
1068 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1070 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1071 data.reg_num = MII_BMSR;
1072 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1076 *miimon = !!(data.val_out & BMSR_LSTATUS);
1078 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1081 struct ethtool_cmd ecmd;
1083 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1086 memset(&ecmd, 0, sizeof ecmd);
1087 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1090 struct ethtool_value eval;
1092 memcpy(&eval, &ecmd, sizeof eval);
1093 *miimon = !!eval.data;
1095 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1103 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1104 long long int interval)
1106 struct netdev_dev_linux *netdev_dev;
1108 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1110 interval = interval > 0 ? MAX(interval, 100) : 0;
1111 if (netdev_dev->miimon_interval != interval) {
1112 netdev_dev->miimon_interval = interval;
1113 timer_set_expired(&netdev_dev->miimon_timer);
1120 netdev_linux_miimon_run(void)
1122 struct shash device_shash;
1123 struct shash_node *node;
1125 shash_init(&device_shash);
1126 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1127 SHASH_FOR_EACH (node, &device_shash) {
1128 struct netdev_dev_linux *dev = node->data;
1131 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1135 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1136 if (miimon != dev->miimon) {
1137 dev->miimon = miimon;
1138 netdev_dev_linux_changed(dev);
1141 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1144 shash_destroy(&device_shash);
1148 netdev_linux_miimon_wait(void)
1150 struct shash device_shash;
1151 struct shash_node *node;
1153 shash_init(&device_shash);
1154 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1155 SHASH_FOR_EACH (node, &device_shash) {
1156 struct netdev_dev_linux *dev = node->data;
1158 if (dev->miimon_interval > 0) {
1159 timer_wait(&dev->miimon_timer);
1162 shash_destroy(&device_shash);
1165 /* Check whether we can use RTM_GETLINK to get network device statistics.
1166 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1169 check_for_working_netlink_stats(void)
1171 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1172 * preferable, so if that works, we'll use it. */
1173 int ifindex = do_get_ifindex("lo");
1175 VLOG_WARN("failed to get ifindex for lo, "
1176 "obtaining netdev stats from proc");
1179 struct netdev_stats stats;
1180 int error = get_stats_via_netlink(ifindex, &stats);
1182 VLOG_DBG("obtaining netdev stats via rtnetlink");
1185 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1186 "via proc (you are probably running a pre-2.6.19 "
1187 "kernel)", strerror(error));
1194 swap_uint64(uint64_t *a, uint64_t *b)
/* Tries to fill '*stats' for 'netdev_' through netdev_vport_get_stats().
 *
 * The attempt is made when vport stats are already known to work
 * ('have_vport_stats' set) or when that has not been determined yet
 * (VALID_HAVE_VPORT_STATS clear in 'cache_valid').  The outcome is cached:
 * 'have_vport_stats' records whether the call succeeded and
 * VALID_HAVE_VPORT_STATS is set in 'cache_valid', so after a failure the
 * call is not repeated while that bit stays set. */
1202 get_stats_via_vport(const struct netdev *netdev_,
1203 struct netdev_stats *stats)
1205 struct netdev_dev_linux *netdev_dev =
1206 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1208 if (netdev_dev->have_vport_stats ||
1209 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1212 error = netdev_vport_get_stats(netdev_, stats);
1214 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed %d",
1215 netdev_get_name(netdev_), error);
/* Remember the result so callers can test 'have_vport_stats' afterwards. */
1217 netdev_dev->have_vport_stats = !error;
1218 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
/* Retrieves stats for 'netdev_' into '*stats' using either
 * get_stats_via_netlink() or get_stats_via_proc().
 *
 * Which one to use is decided once, on the first call, by
 * check_for_working_netlink_stats(); the answer is kept in the function-local
 * static 'use_netlink_stats', whose initial value -1 means "not decided
 * yet".  A failure is logged with a rate-limited warning. */
1223 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1224 struct netdev_stats *stats)
1226 static int use_netlink_stats = -1;
1229 if (use_netlink_stats < 0) {
1230 use_netlink_stats = check_for_working_netlink_stats();
1233 if (use_netlink_stats) {
/* The netlink path needs the interface index rather than the name. */
1236 error = get_ifindex(netdev_, &ifindex);
1238 error = get_stats_via_netlink(ifindex, stats);
1241 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1245 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1246 netdev_get_name(netdev_), error);
1252 /* Retrieves current device stats for 'netdev-linux'. */
/* Two sources are consulted: get_stats_via_vport() writes into '*stats'
 * first, then netdev_linux_sys_get_stats() writes into the local
 * 'dev_stats'.  'have_vport_stats' (updated by get_stats_via_vport()) is
 * then used to decide how the two are combined.
 *
 * NOTE(review): the block of "+=" statements below appears to belong to the
 * branch taken when vport stats ARE available, adding the kernel's error and
 * drop counters on top of the vport counters -- confirm against the full
 * branch structure. */
1254 netdev_linux_get_stats(const struct netdev *netdev_,
1255 struct netdev_stats *stats)
1257 struct netdev_dev_linux *netdev_dev =
1258 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1259 struct netdev_stats dev_stats;
1262 get_stats_via_vport(netdev_, stats);
1264 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1267 if (!netdev_dev->have_vport_stats) {
1274 if (!netdev_dev->have_vport_stats) {
1275 /* stats not available from OVS then use ioctl stats. */
1278 stats->rx_errors += dev_stats.rx_errors;
1279 stats->tx_errors += dev_stats.tx_errors;
1280 stats->rx_dropped += dev_stats.rx_dropped;
1281 stats->tx_dropped += dev_stats.tx_dropped;
1282 stats->multicast += dev_stats.multicast;
1283 stats->collisions += dev_stats.collisions;
1284 stats->rx_length_errors += dev_stats.rx_length_errors;
1285 stats->rx_over_errors += dev_stats.rx_over_errors;
1286 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1287 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1288 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1289 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1290 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1291 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1292 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1293 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1294 stats->tx_window_errors += dev_stats.tx_window_errors;
1299 /* Retrieves current device stats for 'netdev-tap' netdev or
1300 * netdev-internal. */
/* Like netdev_linux_get_stats(), this consults get_stats_via_vport() (into
 * '*stats') and netdev_linux_sys_get_stats() (into the local 'dev_stats').
 * The difference is direction: kernel counters for these devices are swapped
 * so that rx and tx are reported from the same perspective as other ports.
 *
 * Two treatments are visible below:
 *   - swap the rx/tx packet, byte, error and drop counters in '*stats' in
 *     place and zero the detailed rx_* / tx_* error counters;
 *   - add 'dev_stats' drop and error counters to '*stats' crosswise
 *     (rx += tx, tx += rx), plus multicast and collisions.
 *
 * NOTE(review): presumably the first applies when only kernel stats are
 * available and the second when vport stats are available -- confirm against
 * the full branch structure. */
1302 netdev_pseudo_get_stats(const struct netdev *netdev_,
1303 struct netdev_stats *stats)
1305 struct netdev_dev_linux *netdev_dev =
1306 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1307 struct netdev_stats dev_stats;
1310 get_stats_via_vport(netdev_, stats);
1312 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1314 if (!netdev_dev->have_vport_stats) {
1321 /* If this port is an internal port then the transmit and receive stats
1322 * will appear to be swapped relative to the other ports since we are the
1323 * one sending the data, not a remote computer. For consistency, we swap
1324 * them back here. This does not apply if we are getting stats from the
1325 * vport layer because it always tracks stats from the perspective of the
1327 if (!netdev_dev->have_vport_stats) {
1329 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1330 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1331 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1332 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1333 stats->rx_length_errors = 0;
1334 stats->rx_over_errors = 0;
1335 stats->rx_crc_errors = 0;
1336 stats->rx_frame_errors = 0;
1337 stats->rx_fifo_errors = 0;
1338 stats->rx_missed_errors = 0;
1339 stats->tx_aborted_errors = 0;
1340 stats->tx_carrier_errors = 0;
1341 stats->tx_fifo_errors = 0;
1342 stats->tx_heartbeat_errors = 0;
1343 stats->tx_window_errors = 0;
1345 stats->rx_dropped += dev_stats.tx_dropped;
1346 stats->tx_dropped += dev_stats.rx_dropped;
1348 stats->rx_errors += dev_stats.tx_errors;
1349 stats->tx_errors += dev_stats.rx_errors;
1351 stats->multicast += dev_stats.multicast;
1352 stats->collisions += dev_stats.collisions;
1357 /* Stores the features supported by 'netdev' into each of '*current',
1358 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1359 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1360 * successful, otherwise a positive errno value. */
/* NOTE(review): despite the "that are non-null" wording above, the code
 * below writes through all four pointers without testing them.  Presumably
 * callers always supply non-null pointers -- confirm before passing NULL.
 *
 * The data comes from a single ETHTOOL_GSET query; the SUPPORTED_* and
 * ADVERTISED_* bits and the speed/duplex/port fields of the reply are
 * translated one by one into OFPPF_* bits. */
1362 netdev_linux_get_features(const struct netdev *netdev,
1363 uint32_t *current, uint32_t *advertised,
1364 uint32_t *supported, uint32_t *peer)
1366 struct ethtool_cmd ecmd;
1369 memset(&ecmd, 0, sizeof ecmd);
1370 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1371 ETHTOOL_GSET, "ETHTOOL_GSET");
1376 /* Supported features. */
1378 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1379 *supported |= OFPPF_10MB_HD;
1381 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1382 *supported |= OFPPF_10MB_FD;
1384 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1385 *supported |= OFPPF_100MB_HD;
1387 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1388 *supported |= OFPPF_100MB_FD;
1390 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1391 *supported |= OFPPF_1GB_HD;
1393 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1394 *supported |= OFPPF_1GB_FD;
1396 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1397 *supported |= OFPPF_10GB_FD;
1399 if (ecmd.supported & SUPPORTED_TP) {
1400 *supported |= OFPPF_COPPER;
1402 if (ecmd.supported & SUPPORTED_FIBRE) {
1403 *supported |= OFPPF_FIBER;
1405 if (ecmd.supported & SUPPORTED_Autoneg) {
1406 *supported |= OFPPF_AUTONEG;
1408 if (ecmd.supported & SUPPORTED_Pause) {
1409 *supported |= OFPPF_PAUSE;
1411 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1412 *supported |= OFPPF_PAUSE_ASYM;
1415 /* Advertised features. */
1417 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1418 *advertised |= OFPPF_10MB_HD;
1420 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1421 *advertised |= OFPPF_10MB_FD;
1423 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1424 *advertised |= OFPPF_100MB_HD;
1426 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1427 *advertised |= OFPPF_100MB_FD;
1429 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1430 *advertised |= OFPPF_1GB_HD;
1432 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1433 *advertised |= OFPPF_1GB_FD;
1435 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1436 *advertised |= OFPPF_10GB_FD;
1438 if (ecmd.advertising & ADVERTISED_TP) {
1439 *advertised |= OFPPF_COPPER;
1441 if (ecmd.advertising & ADVERTISED_FIBRE) {
1442 *advertised |= OFPPF_FIBER;
1444 if (ecmd.advertising & ADVERTISED_Autoneg) {
1445 *advertised |= OFPPF_AUTONEG;
1447 if (ecmd.advertising & ADVERTISED_Pause) {
1448 *advertised |= OFPPF_PAUSE;
1450 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1451 *advertised |= OFPPF_PAUSE_ASYM;
1454 /* Current settings. */
/* 10 Gb/s has no half-duplex OFPPF_* bit, so 'duplex' is not consulted
 * for SPEED_10000. */
1455 if (ecmd.speed == SPEED_10) {
1456 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1457 } else if (ecmd.speed == SPEED_100) {
1458 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1459 } else if (ecmd.speed == SPEED_1000) {
1460 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1461 } else if (ecmd.speed == SPEED_10000) {
1462 *current = OFPPF_10GB_FD;
1467 if (ecmd.port == PORT_TP) {
1468 *current |= OFPPF_COPPER;
1469 } else if (ecmd.port == PORT_FIBRE) {
1470 *current |= OFPPF_FIBER;
1474 *current |= OFPPF_AUTONEG;
1477 /* Peer advertisements. */
1478 *peer = 0; /* XXX */
1483 /* Set the features advertised by 'netdev' to 'advertise'. */
/* 'advertise' is a bitmap of OFPPF_* bits.  The current settings are first
 * read with ETHTOOL_GSET so that every field other than 'advertising' is
 * written back unchanged; 'advertising' is rebuilt from scratch from
 * 'advertise' and the result is applied with ETHTOOL_SSET, whose return value
 * is returned to the caller. */
1485 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1487 struct ethtool_cmd ecmd;
1490 memset(&ecmd, 0, sizeof ecmd);
1491 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1492 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Start from an empty mask: bits absent from 'advertise' are cleared. */
1497 ecmd.advertising = 0;
1498 if (advertise & OFPPF_10MB_HD) {
1499 ecmd.advertising |= ADVERTISED_10baseT_Half;
1501 if (advertise & OFPPF_10MB_FD) {
1502 ecmd.advertising |= ADVERTISED_10baseT_Full;
1504 if (advertise & OFPPF_100MB_HD) {
1505 ecmd.advertising |= ADVERTISED_100baseT_Half;
1507 if (advertise & OFPPF_100MB_FD) {
1508 ecmd.advertising |= ADVERTISED_100baseT_Full;
1510 if (advertise & OFPPF_1GB_HD) {
1511 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1513 if (advertise & OFPPF_1GB_FD) {
1514 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1516 if (advertise & OFPPF_10GB_FD) {
1517 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1519 if (advertise & OFPPF_COPPER) {
1520 ecmd.advertising |= ADVERTISED_TP;
1522 if (advertise & OFPPF_FIBER) {
1523 ecmd.advertising |= ADVERTISED_FIBRE;
1525 if (advertise & OFPPF_AUTONEG) {
1526 ecmd.advertising |= ADVERTISED_Autoneg;
1528 if (advertise & OFPPF_PAUSE) {
1529 ecmd.advertising |= ADVERTISED_Pause;
1531 if (advertise & OFPPF_PAUSE_ASYM) {
1532 ecmd.advertising |= ADVERTISED_Asym_Pause;
1534 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1535 ETHTOOL_SSET, "ETHTOOL_SSET");
1538 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1539 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1540 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1541 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1542 * sets '*vlan_vid' to -1. */
1544 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1546 const char *netdev_name = netdev_get_name(netdev);
1547 struct ds line = DS_EMPTY_INITIALIZER;
1548 FILE *stream = NULL;
1552 COVERAGE_INC(netdev_get_vlan_vid);
1553 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1554 stream = fopen(fn, "r");
1560 if (ds_get_line(&line, stream)) {
1561 if (ferror(stream)) {
1563 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1566 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1571 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1573 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1574 fn, ds_cstr(&line));
/* printf-style templates for the tc(8) command lines run by
 * netdev_linux_set_policing().  POLICE_ADD_CMD takes the device name;
 * POLICE_CONFIG_CMD takes the device name, the policing rate and the burst,
 * in that order. */
1592 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1593 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
1595 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1596 * positive errno value.
 *
 * An RTM_DELQDISC request for the "ingress" qdisc with handle ffff: is sent
 * through tc_transact().  ENOENT and EINVAL replies are not logged as
 * failures.  The cached policing settings are reset to rate 0 / burst 0 and
 * VALID_POLICING is set in 'cache_valid'.
 *
1598 * This function is equivalent to running
1599 * /sbin/tc qdisc del dev %s handle ffff: ingress
1600 * but it is much, much faster.
1603 netdev_linux_remove_policing(struct netdev *netdev)
1605 struct netdev_dev_linux *netdev_dev =
1606 netdev_dev_linux_cast(netdev_get_dev(netdev));
1607 const char *netdev_name = netdev_get_name(netdev);
1609 struct ofpbuf request;
1610 struct tcmsg *tcmsg;
1613 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1617 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1618 tcmsg->tcm_parent = TC_H_INGRESS;
1619 nl_msg_put_string(&request, TCA_KIND, "ingress");
1620 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1622 error = tc_transact(&request, NULL);
1623 if (error && error != ENOENT && error != EINVAL) {
1624 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1625 netdev_name, strerror(error));
1629 netdev_dev->kbits_rate = 0;
1630 netdev_dev->kbits_burst = 0;
1631 netdev_dev->cache_valid |= VALID_POLICING;
1635 /* Attempts to set input rate limiting (policing) policy. */
/* 'kbits_burst' is normalized first (see below).  If the cached settings are
 * valid and already equal to the request, nothing is done.  Otherwise any
 * existing policing is removed with netdev_linux_remove_policing() and the
 * new policy is installed by running the POLICE_ADD_CMD and POLICE_CONFIG_CMD
 * command lines through system(); the cache is then updated.
 *
 * NOTE(review): the commands are built by formatting the device name into a
 * shell command line passed to system() -- confirm that device names can
 * never contain shell metacharacters.
 * NOTE(review): POLICE_CONFIG_CMD formats the uint32_t rate and burst with
 * "%d"; "%u" would match the argument type. */
1637 netdev_linux_set_policing(struct netdev *netdev,
1638 uint32_t kbits_rate, uint32_t kbits_burst)
1640 struct netdev_dev_linux *netdev_dev =
1641 netdev_dev_linux_cast(netdev_get_dev(netdev));
1642 const char *netdev_name = netdev_get_name(netdev);
1645 COVERAGE_INC(netdev_set_policing);
1647 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1648 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1649 : kbits_burst); /* Stick with user-specified value. */
1651 if (netdev_dev->cache_valid & VALID_POLICING
1652 && netdev_dev->kbits_rate == kbits_rate
1653 && netdev_dev->kbits_burst == kbits_burst) {
1654 /* Assume that settings haven't changed since we last set them. */
1658 netdev_linux_remove_policing(netdev);
1660 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1661 if (system(command) != 0) {
1662 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1666 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1667 kbits_rate, kbits_burst);
1668 if (system(command) != 0) {
1669 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1674 netdev_dev->kbits_rate = kbits_rate;
1675 netdev_dev->kbits_burst = kbits_burst;
1676 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the 'ovs_name' of every entry of the NULL-terminated 'tcs'
 * array that has a 'tc_install' callback and a non-empty 'ovs_name'.
 * 'netdev' is not used. */
1683 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1686 const struct tc_ops **opsp;
1688 for (opsp = tcs; *opsp != NULL; opsp++) {
1689 const struct tc_ops *ops = *opsp;
1690 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1691 sset_add(types, ops->ovs_name);
/* Searches the NULL-terminated 'tcs' array for an entry whose 'ovs_name'
 * equals 'name'.
 * NOTE(review): presumably returns the matching entry, or NULL when none
 * matches (callers test the result against NULL) -- confirm. */
1697 static const struct tc_ops *
1698 tc_lookup_ovs_name(const char *name)
1700 const struct tc_ops **opsp;
1702 for (opsp = tcs; *opsp != NULL; opsp++) {
1703 const struct tc_ops *ops = *opsp;
1704 if (!strcmp(name, ops->ovs_name)) {
/* Searches the NULL-terminated 'tcs' array for an entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' are skipped.
 * NOTE(review): presumably returns the matching entry, or NULL when none
 * matches -- confirm. */
1711 static const struct tc_ops *
1712 tc_lookup_linux_name(const char *name)
1714 const struct tc_ops **opsp;
1716 for (opsp = tcs; *opsp != NULL; opsp++) {
1717 const struct tc_ops *ops = *opsp;
1718 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks in the hash bucket selected by 'hash' of the queue map of 'netdev''s
 * current traffic-control state ('netdev_dev->tc->queues') for the queue
 * whose 'queue_id' equals 'queue_id'.  'netdev_dev->tc' is dereferenced
 * unconditionally, so it must be non-null.
 * NOTE(review): presumably returns the queue, or NULL if absent -- confirm. */
1725 static struct tc_queue *
1726 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1729 struct netdev_dev_linux *netdev_dev =
1730 netdev_dev_linux_cast(netdev_get_dev(netdev));
1731 struct tc_queue *queue;
1733 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1734 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the hash of
 * 'queue_id' itself, as hash_int(queue_id, 0). */
1741 static struct tc_queue *
1742 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1744 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the QoS type named 'type' with tc_lookup_ovs_name() and stores its
 * 'n_queues' into 'caps->n_queues'.  'netdev' is not used. */
1748 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1750 struct netdev_qos_capabilities *caps)
1752 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1756 caps->n_queues = ops->n_queues;
/* Reports the QoS configuration of 'netdev'.
 *
 * After tc_query_qdisc() has been called, '*typep' is set to the 'ovs_name'
 * of the device's current tc_ops.  If those ops provide 'qdisc_get', it is
 * called to fill 'details' and its result is returned. */
1761 netdev_linux_get_qos(const struct netdev *netdev,
1762 const char **typep, struct shash *details)
1764 struct netdev_dev_linux *netdev_dev =
1765 netdev_dev_linux_cast(netdev_get_dev(netdev));
1768 error = tc_query_qdisc(netdev);
1773 *typep = netdev_dev->tc->ops->ovs_name;
1774 return (netdev_dev->tc->ops->qdisc_get
1775 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS of kind 'type' with parameters 'details' on 'netdev'.
 *
 * 'type' must name a tc_ops that has a 'tc_install' callback.  If the device
 * already uses that tc_ops, the existing qdisc is reconfigured through
 * 'qdisc_set' (or left alone, returning 0, when the ops has no 'qdisc_set').
 * Otherwise the existing qdisc is deleted with tc_del_qdisc() and the new one
 * installed with 'tc_install'.
 *
 * The assertions document the contract of the helpers: after a successful
 * tc_del_qdisc() 'netdev_dev->tc' is null, and 'tc_install' leaves it
 * non-null exactly when it returns 0. */
1780 netdev_linux_set_qos(struct netdev *netdev,
1781 const char *type, const struct shash *details)
1783 struct netdev_dev_linux *netdev_dev =
1784 netdev_dev_linux_cast(netdev_get_dev(netdev));
1785 const struct tc_ops *new_ops;
1788 new_ops = tc_lookup_ovs_name(type);
1789 if (!new_ops || !new_ops->tc_install) {
1793 error = tc_query_qdisc(netdev);
1798 if (new_ops == netdev_dev->tc->ops) {
1799 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1801 /* Delete existing qdisc. */
1802 error = tc_del_qdisc(netdev);
1806 assert(netdev_dev->tc == NULL);
1808 /* Install new qdisc. */
1809 error = new_ops->tc_install(netdev, details);
1810 assert((error == 0) == (netdev_dev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev' into 'details'.
 *
 * After tc_query_qdisc(), the queue is looked up with tc_find_queue() and, in
 * the branch visible below, handed to the current tc_ops' 'class_get'. */
1817 netdev_linux_get_queue(const struct netdev *netdev,
1818 unsigned int queue_id, struct shash *details)
1820 struct netdev_dev_linux *netdev_dev =
1821 netdev_dev_linux_cast(netdev_get_dev(netdev));
1824 error = tc_query_qdisc(netdev);
1828 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1830 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' according to 'details'.
 *
 * After tc_query_qdisc(), the request is rejected if 'queue_id' is not below
 * the current tc_ops' 'n_queues' or if the ops has no 'class_set' callback;
 * otherwise 'class_set' is called and its result returned. */
1836 netdev_linux_set_queue(struct netdev *netdev,
1837 unsigned int queue_id, const struct shash *details)
1839 struct netdev_dev_linux *netdev_dev =
1840 netdev_dev_linux_cast(netdev_get_dev(netdev));
1843 error = tc_query_qdisc(netdev);
1846 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1847 || !netdev_dev->tc->ops->class_set) {
1851 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev'.
 *
 * After tc_query_qdisc(), the request is rejected if the current tc_ops has
 * no 'class_delete' callback.  Otherwise the queue is looked up with
 * tc_find_queue() and, in the branch visible below, passed to
 * 'class_delete'. */
1855 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1857 struct netdev_dev_linux *netdev_dev =
1858 netdev_dev_linux_cast(netdev_get_dev(netdev));
1861 error = tc_query_qdisc(netdev);
1864 } else if (!netdev_dev->tc->ops->class_delete) {
1867 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1869 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev' into '*stats'.
 *
 * After tc_query_qdisc(), the request is rejected if the current tc_ops has
 * no 'class_get_stats' callback.  Otherwise the queue is looked up with
 * tc_find_queue() and, in the branch visible below, passed to
 * 'class_get_stats'. */
1875 netdev_linux_get_queue_stats(const struct netdev *netdev,
1876 unsigned int queue_id,
1877 struct netdev_queue_stats *stats)
1879 struct netdev_dev_linux *netdev_dev =
1880 netdev_dev_linux_cast(netdev_get_dev(netdev));
1883 error = tc_query_qdisc(netdev);
1886 } else if (!netdev_dev->tc->ops->class_get_stats) {
1889 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1891 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes of 'netdev' in '*dump'.
 *
 * An RTM_GETTCLASS request with 'tcm_parent' 0 is built with
 * tc_make_request(), the dump is started on 'rtnl_sock', and the request
 * buffer is released.  Callers treat the return value as a success flag. */
1897 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1899 struct ofpbuf request;
1900 struct tcmsg *tcmsg;
1902 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1906 tcmsg->tcm_parent = 0;
1907 nl_dump_start(dump, rtnl_sock, &request);
1908 ofpbuf_uninit(&request);
/* Invokes 'cb' for the queues known for 'netdev'.
 *
 * After tc_query_qdisc(), the request is rejected if the current tc_ops has
 * no 'class_get' callback.  Otherwise every queue in
 * 'netdev_dev->tc->queues' is visited: 'details' is cleared, filled by
 * 'class_get', and passed to 'cb' together with the queue id and 'aux'.
 * A single 'details' shash is reused across iterations and destroyed at the
 * end. */
1913 netdev_linux_dump_queues(const struct netdev *netdev,
1914 netdev_dump_queues_cb *cb, void *aux)
1916 struct netdev_dev_linux *netdev_dev =
1917 netdev_dev_linux_cast(netdev_get_dev(netdev));
1918 struct tc_queue *queue;
1919 struct shash details;
1923 error = tc_query_qdisc(netdev);
1926 } else if (!netdev_dev->tc->ops->class_get) {
1931 shash_init(&details);
1932 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1933 shash_clear(&details);
1935 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1937 (*cb)(queue->queue_id, &details, aux);
1942 shash_destroy(&details);
/* Reports per-queue statistics for 'netdev' through 'cb'.
 *
 * After tc_query_qdisc(), the request is rejected if the current tc_ops has
 * no 'class_dump_stats' callback.  Otherwise a class dump is started with
 * start_queue_dump() and each reply message is handed to 'class_dump_stats'
 * along with 'cb' and 'aux'.
 *
 * The result prefers an error from nl_dump_done() over 'last_error', which
 * holds an error remembered from processing the individual messages. */
1948 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1949 netdev_dump_queue_stats_cb *cb, void *aux)
1951 struct netdev_dev_linux *netdev_dev =
1952 netdev_dev_linux_cast(netdev_get_dev(netdev));
1953 struct nl_dump dump;
1958 error = tc_query_qdisc(netdev);
1961 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1966 if (!start_queue_dump(netdev, &dump)) {
1969 while (nl_dump_next(&dump, &msg)) {
1970 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1976 error = nl_dump_done(&dump);
1977 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' into '*address' and
 * '*netmask'.
 *
 * The values are cached in the device: when VALID_IN4 is not set in
 * 'cache_valid' they are fetched with SIOCGIFADDR and SIOCGIFNETMASK and the
 * bit is then set.  Returns EADDRNOTAVAIL when the address is INADDR_ANY
 * (the outputs are still written in that case), otherwise 0. */
1981 netdev_linux_get_in4(const struct netdev *netdev_,
1982 struct in_addr *address, struct in_addr *netmask)
1984 struct netdev_dev_linux *netdev_dev =
1985 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1987 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1990 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1991 SIOCGIFADDR, "SIOCGIFADDR");
1996 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1997 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2002 netdev_dev->cache_valid |= VALID_IN4;
2004 *address = netdev_dev->address;
2005 *netmask = netdev_dev->netmask;
2006 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set first with SIOCSIFADDR.  The cached address and netmask
 * are then updated and VALID_IN4 is set.  The netmask is applied with
 * SIOCSIFNETMASK only when 'address' is not INADDR_ANY.
 *
 * NOTE(review): the cache is updated with 'netmask' before SIOCSIFNETMASK is
 * attempted, so the cache may presumably hold a netmask that was never
 * applied if that call fails -- confirm whether this is intended. */
2010 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2011 struct in_addr netmask)
2013 struct netdev_dev_linux *netdev_dev =
2014 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2017 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2019 netdev_dev->cache_valid |= VALID_IN4;
2020 netdev_dev->address = address;
2021 netdev_dev->netmask = netmask;
2022 if (address.s_addr != INADDR_ANY) {
2023 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2024 "SIOCSIFNETMASK", netmask);
/* Parses one line of /proc/net/if_inet6 (see netdev_linux_get_in6()).
 *
 * The format consists of 16 two-digit hexadecimal bytes, stored into
 * 'in6->s6_addr' in order, four hexadecimal fields that are skipped, and an
 * interface name of at most 16 characters stored into 'ifname' (which has
 * room for the terminating null). */
2031 parse_if_inet6_line(const char *line,
2032 struct in6_addr *in6, char ifname[16 + 1])
2034 uint8_t *s6 = in6->s6_addr;
/* X8 scans one address byte: exactly up to two hex digits into a uint8_t. */
2035 #define X8 "%2"SCNx8
2037 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2038 "%*x %*x %*x %*x %16s\n",
2039 &s6[0], &s6[1], &s6[2], &s6[3],
2040 &s6[4], &s6[5], &s6[6], &s6[7],
2041 &s6[8], &s6[9], &s6[10], &s6[11],
2042 &s6[12], &s6[13], &s6[14], &s6[15],
2046 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2047 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* NOTE(review): the comment above does not match the visible code, which
 * writes '*in6' unconditionally (so 'in6' must be non-null) and stores
 * 'in6addr_any' when no address is found -- confirm the return value.
 *
 * The address is cached: when VALID_IN6 is not set in 'cache_valid',
 * /proc/net/if_inet6 is scanned line by line for an entry whose interface
 * name equals the device name, and the bit is then set.  The cached value
 * starts out as 'in6addr_any'. */
2049 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2051 struct netdev_dev_linux *netdev_dev =
2052 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2053 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2057 netdev_dev->in6 = in6addr_any;
2059 file = fopen("/proc/net/if_inet6", "r");
2061 const char *name = netdev_get_name(netdev_);
2062 while (fgets(line, sizeof line, file)) {
2063 struct in6_addr in6_tmp;
2064 char ifname[16 + 1];
2065 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2066 && !strcmp(name, ifname))
2068 netdev_dev->in6 = in6_tmp;
2074 netdev_dev->cache_valid |= VALID_IN6;
2076 *in6 = netdev_dev->in6;
/* Initializes '*sa' as an AF_INET socket address holding 'addr'.
 *
 * A zeroed 'struct sockaddr_in' is filled in locally and then copied over a
 * zeroed '*sa', so every byte of '*sa' not covered by the sockaddr_in fields
 * that are set is zero. */
2081 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2083 struct sockaddr_in sin;
2084 memset(&sin, 0, sizeof sin);
2085 sin.sin_family = AF_INET;
2086 sin.sin_addr = addr;
2089 memset(sa, 0, sizeof *sa);
2090 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl 'ioctl_nr' for 'netdev' with IPv4 address
 * 'addr'.  'ioctl_name' is the ioctl's name as a string, passed by callers
 * alongside 'ioctl_nr'.  The request structure carries the device name and
 * 'addr' encoded by make_in4_sockaddr().  Returns the result of
 * netdev_linux_do_ioctl(). */
2094 do_set_addr(struct netdev *netdev,
2095 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2098 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2099 make_in4_sockaddr(&ifr.ifr_addr, addr);
2101 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2105 /* Adds 'router' as a default IP gateway. */
/* The route is added with the SIOCADDRT ioctl on 'af_inet_sock': destination
 * and genmask are both INADDR_ANY (i.e. the default route), the gateway is
 * 'router', and the flags are RTF_UP | RTF_GATEWAY.  'netdev' is not used.
 * A failure is logged with a warning. */
2107 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2109 struct in_addr any = { INADDR_ANY };
2113 memset(&rt, 0, sizeof rt);
2114 make_in4_sockaddr(&rt.rt_dst, any);
2115 make_in4_sockaddr(&rt.rt_gateway, router);
2116 make_in4_sockaddr(&rt.rt_genmask, any);
2117 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2118 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2120 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks up the route to '*host' in /proc/net/route.
 *
 * '*netdev_name' is set to NULL up front.  Each line of the file is parsed
 * into its 11 fields; lines that fail to parse are logged and routes that
 * are not RTF_UP are skipped.  For a route whose masked destination matches
 * the masked host address, '*next_hop' is set to either 0 (host directly
 * reachable) or the route's gateway, and '*netdev_name' to a newly allocated
 * copy (xstrdup()) of the interface name, which the caller must presumably
 * free. */
2126 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2129 static const char fn[] = "/proc/net/route";
2134 *netdev_name = NULL;
2135 stream = fopen(fn, "r");
2136 if (stream == NULL) {
2137 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2142 while (fgets(line, sizeof line, stream)) {
2145 ovs_be32 dest, gateway, mask;
2146 int refcnt, metric, mtu;
2147 unsigned int flags, use, window, irtt;
2150 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2152 iface, &dest, &gateway, &flags, &refcnt,
2153 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2155 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2159 if (!(flags & RTF_UP)) {
2160 /* Skip routes that aren't up. */
2164 /* The output of 'dest', 'mask', and 'gateway' were given in
2165 * network byte order, so we don't need any endian
2166 * conversions here. */
2167 if ((dest & mask) == (host->s_addr & mask)) {
2169 /* The host is directly reachable. */
2170 next_hop->s_addr = 0;
2172 /* To reach the host, we must go through a gateway. */
2173 next_hop->s_addr = gateway;
2175 *netdev_name = xstrdup(iface);
/* Adds driver information about 'netdev' to 'sh'.
 *
 * The information is obtained with an ETHTOOL_GDRVINFO query; the reply
 * buffer is a 'struct ethtool_drvinfo' cast to the 'struct ethtool_cmd *'
 * that netdev_linux_do_ethtool() expects.  The keys "driver_name",
 * "driver_version" and "firmware_version" are added with newly allocated
 * copies (xstrdup()) of the corresponding reply fields. */
2187 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2189 struct ethtool_drvinfo drvinfo;
2192 memset(&drvinfo, 0, sizeof drvinfo);
2193 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2194 (struct ethtool_cmd *)&drvinfo,
2196 "ETHTOOL_GDRVINFO");
2198 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2199 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2200 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2206 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2207 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2208 * returns 0. Otherwise, it returns a positive errno value; in particular,
2209 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
/* The lookup is a SIOCGARP ioctl on 'af_inet_sock'.  The request carries
 * 'ip' as an AF_INET protocol address, ARPHRD_ETHER as the hardware address
 * family, and the device name.  ENXIO is an expected outcome and is
 * therefore not logged; other errors get a rate-limited warning. */
2211 netdev_linux_arp_lookup(const struct netdev *netdev,
2212 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2215 struct sockaddr_in sin;
2218 memset(&r, 0, sizeof r);
2219 memset(&sin, 0, sizeof sin);
2220 sin.sin_family = AF_INET;
2221 sin.sin_addr.s_addr = ip;
2223 memcpy(&r.arp_pa, &sin, sizeof sin);
2224 r.arp_ha.sa_family = ARPHRD_ETHER;
2226 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2227 COVERAGE_INC(netdev_arp_lookup);
2228 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2230 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2231 } else if (retval != ENXIO) {
2232 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2233 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Converts the NETDEV_* flags in 'nd' to interface flags.  NETDEV_UP and
 * NETDEV_PROMISC are the flags examined.
 * NOTE(review): presumably these map to IFF_UP and IFF_PROMISC, mirroring
 * iff_to_nd_flags() -- confirm against the full body. */
2239 nd_to_iff_flags(enum netdev_flags nd)
2242 if (nd & NETDEV_UP) {
2245 if (nd & NETDEV_PROMISC) {
/* Converts the interface flags in 'iff' to NETDEV_* flags, starting from an
 * empty set.  IFF_PROMISC maps to NETDEV_PROMISC.
 * NOTE(review): presumably IFF_UP maps to NETDEV_UP as well -- confirm
 * against the full body. */
2252 iff_to_nd_flags(int iff)
2254 enum netdev_flags nd = 0;
2258 if (iff & IFF_PROMISC) {
2259 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' on 'netdev'.
 *
 * The current interface flags are read with get_flags() and reported through
 * '*old_flagsp' as NETDEV_* flags.  The new flag word is the old one with
 * the 'off' bits removed and the 'on' bits added, so a flag present in both
 * ends up set.  set_flags() is called only when the flag word actually
 * changes. */
2265 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2266 enum netdev_flags on, enum netdev_flags *old_flagsp)
2268 int old_flags, new_flags;
2271 error = get_flags(netdev, &old_flags);
2273 *old_flagsp = iff_to_nd_flags(old_flags);
2274 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2275 if (new_flags != old_flags) {
2276 error = set_flags(netdev, new_flags);
/* Returns the 'change_seq' member of the netdev_dev_linux underlying
 * 'netdev'. */
2283 netdev_linux_change_seq(const struct netdev *netdev)
2285 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to the initializer of a 'struct netdev_class' whose callbacks are
 * the netdev_linux_* functions of this file.  The three classes defined
 * below pass a create function, a get_stats function and a set_stats
 * function (or NULL) as CREATE, GET_STATS and SET_STATS.
 * NOTE(review): NAME is presumably the class's type name -- confirm.
 * Slots explicitly marked NULL (get_config, set_config) are unimplemented
 * for these classes. */
2288 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS) \
2292 netdev_linux_init, \
2294 netdev_linux_wait, \
2297 netdev_linux_destroy, \
2298 NULL, /* get_config */ \
2299 NULL, /* set_config */ \
2301 netdev_linux_open, \
2302 netdev_linux_close, \
2304 netdev_linux_listen, \
2305 netdev_linux_recv, \
2306 netdev_linux_recv_wait, \
2307 netdev_linux_drain, \
2309 netdev_linux_send, \
2310 netdev_linux_send_wait, \
2312 netdev_linux_set_etheraddr, \
2313 netdev_linux_get_etheraddr, \
2314 netdev_linux_get_mtu, \
2315 netdev_linux_set_mtu, \
2316 netdev_linux_get_ifindex, \
2317 netdev_linux_get_carrier, \
2318 netdev_linux_set_miimon_interval, \
2322 netdev_linux_get_features, \
2323 netdev_linux_set_advertisements, \
2324 netdev_linux_get_vlan_vid, \
2326 netdev_linux_set_policing, \
2327 netdev_linux_get_qos_types, \
2328 netdev_linux_get_qos_capabilities, \
2329 netdev_linux_get_qos, \
2330 netdev_linux_set_qos, \
2331 netdev_linux_get_queue, \
2332 netdev_linux_set_queue, \
2333 netdev_linux_delete_queue, \
2334 netdev_linux_get_queue_stats, \
2335 netdev_linux_dump_queues, \
2336 netdev_linux_dump_queue_stats, \
2338 netdev_linux_get_in4, \
2339 netdev_linux_set_in4, \
2340 netdev_linux_get_in6, \
2341 netdev_linux_add_router, \
2342 netdev_linux_get_next_hop, \
2343 netdev_linux_get_status, \
2344 netdev_linux_arp_lookup, \
2346 netdev_linux_update_flags, \
2348 netdev_linux_change_seq \
/* Class built from NETDEV_LINUX_CLASS with netdev_linux_create(),
 * netdev_linux_get_stats() and no set_stats implementation. */
2351 const struct netdev_class netdev_linux_class =
2354 netdev_linux_create,
2355 netdev_linux_get_stats,
2356 NULL); /* set_stats */
/* Class built from NETDEV_LINUX_CLASS with netdev_linux_create_tap(),
 * netdev_pseudo_get_stats() (which accounts for the swapped rx/tx
 * perspective) and no set_stats implementation. */
2358 const struct netdev_class netdev_tap_class =
2361 netdev_linux_create_tap,
2362 netdev_pseudo_get_stats,
2363 NULL); /* set_stats */
/* Class built from NETDEV_LINUX_CLASS with netdev_linux_create(),
 * netdev_pseudo_get_stats() and netdev_vport_set_stats(); it is the only one
 * of the three classes here that supplies a set_stats implementation. */
2365 const struct netdev_class netdev_internal_class =
2368 netdev_linux_create,
2369 netdev_pseudo_get_stats,
2370 netdev_vport_set_stats);
2372 /* HTB traffic control class. */
/* HTB_N_QUEUES bounds the class minor numbers accepted by
 * htb_parse_tcmsg__(), which maps minor numbers 1...HTB_N_QUEUES to queue
 * ids 0...HTB_N_QUEUES-1. */
2374 #define HTB_N_QUEUES 0xf000
/* Member of the per-device HTB state ('struct htb'); set by htb_install__()
 * and used by htb_parse_class_details__() as the upper limit for class
 * rates. */
2378 unsigned int max_rate; /* In bytes/s. */
/* Members describing one HTB class; the code in this file accesses them
 * through 'struct htb_class'.  The values are sent to the kernel by
 * htb_setup_class__() and read back by htb_parse_tca_options__(). */
2382 struct tc_queue tc_queue;
2383 unsigned int min_rate; /* In bytes/s. */
2384 unsigned int max_rate; /* In bytes/s. */
2385 unsigned int burst; /* In bytes. */
2386 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds 'netdev''s current traffic-control
 * state ('netdev_dev->tc') as its 'tc' member.  This is only meaningful when
 * that state was installed by htb_install__(). */
2390 htb_get__(const struct netdev *netdev)
2392 struct netdev_dev_linux *netdev_dev =
2393 netdev_dev_linux_cast(netdev_get_dev(netdev));
2394 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a 'struct htb', initializes its embedded 'tc' with tc_ops_htb,
 * records 'max_rate', and makes it the traffic-control state of 'netdev'
 * ('netdev_dev->tc').
 *
 * NOTE(review): 'max_rate' is a uint64_t but 'htb->max_rate' is declared
 * 'unsigned int', so large values are presumably truncated on assignment --
 * confirm whether callers can pass values that do not fit. */
2398 htb_install__(struct netdev *netdev, uint64_t max_rate)
2400 struct netdev_dev_linux *netdev_dev =
2401 netdev_dev_linux_cast(netdev_get_dev(netdev));
2404 htb = xmalloc(sizeof *htb);
2405 tc_init(&htb->tc, &tc_ops_htb);
2406 htb->max_rate = max_rate;
2408 netdev_dev->tc = &htb->tc;
2411 /* Create an HTB qdisc.
2413 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Any existing qdisc is deleted first with tc_del_qdisc(); the result of
 * that call is ignored.  The RTM_NEWQDISC request uses
 * NLM_F_EXCL | NLM_F_CREATE, handle 1:0 and parent TC_H_ROOT, and carries a
 * 'struct tc_htb_glob' (zeroed except for the fields set below) as
 * TCA_HTB_INIT nested inside TCA_OPTIONS.  Returns the result of
 * tc_transact(). */
2415 htb_setup_qdisc__(struct netdev *netdev)
2418 struct tc_htb_glob opt;
2419 struct ofpbuf request;
2420 struct tcmsg *tcmsg;
2422 tc_del_qdisc(netdev);
2424 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2425 NLM_F_EXCL | NLM_F_CREATE, &request);
2429 tcmsg->tcm_handle = tc_make_handle(1, 0);
2430 tcmsg->tcm_parent = TC_H_ROOT;
2432 nl_msg_put_string(&request, TCA_KIND, "htb");
2434 memset(&opt, 0, sizeof opt);
2435 opt.rate2quantum = 10;
2439 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2440 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2441 nl_msg_end_nested(&request, opt_offset);
2443 return tc_transact(&request, NULL);
2446 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2447 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* The device MTU is required because the rate tables and buffer sizes are
 * computed from it; without it the function warns and gives up.
 *
 * The RTM_NEWTCLASS request (NLM_F_CREATE) carries, nested in TCA_OPTIONS,
 * a 'struct tc_htb_opt' as TCA_HTB_PARMS plus the rate and ceil tables as
 * TCA_HTB_RTAB and TCA_HTB_CTAB.  A failed transaction is logged with the
 * full class description. */
2449 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2450 unsigned int parent, struct htb_class *class)
2453 struct tc_htb_opt opt;
2454 struct ofpbuf request;
2455 struct tcmsg *tcmsg;
2459 error = netdev_get_mtu(netdev, &mtu);
2461 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2462 netdev_get_name(netdev));
2466 memset(&opt, 0, sizeof opt);
2467 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2468 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
/* The same 'burst' is used for both the rate and the ceil buffer. */
2469 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2470 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2471 opt.prio = class->priority;
2473 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2477 tcmsg->tcm_handle = handle;
2478 tcmsg->tcm_parent = parent;
2480 nl_msg_put_string(&request, TCA_KIND, "htb");
2481 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2482 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2483 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2484 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2485 nl_msg_end_nested(&request, opt_offset);
2487 error = tc_transact(&request, NULL);
2489 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2490 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2491 netdev_get_name(netdev),
2492 tc_get_major(handle), tc_get_minor(handle),
2493 tc_get_major(parent), tc_get_minor(parent),
2494 class->min_rate, class->max_rate,
2495 class->burst, class->priority, strerror(error));
2500 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2501 * description of them into '*class'. The description complies with the
2502 * specification given in the vswitch database documentation for linux-htb
 *
 * Only TCA_HTB_PARMS is read; it is mandatory and must be at least
 * sizeof(struct tc_htb_opt) bytes long.  'min_rate' and 'max_rate' come
 * from the rate and ceil fields, 'burst' is converted from ticks with
 * tc_ticks_to_bytes(), and 'priority' comes from 'prio'.  A parse failure
 * is logged with a rate-limited warning.
2505 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2507 static const struct nl_policy tca_htb_policy[] = {
2508 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2509 .min_len = sizeof(struct tc_htb_opt) },
2512 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2513 const struct tc_htb_opt *htb;
2515 if (!nl_parse_nested(nl_options, tca_htb_policy,
2516 attrs, ARRAY_SIZE(tca_htb_policy))) {
2517 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2521 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2522 class->min_rate = htb->rate.rate;
2523 class->max_rate = htb->ceil.rate;
2524 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2525 class->priority = htb->prio;
/* Parses the traffic-class Netlink message 'tcmsg' with tc_parse_class().
 *
 * 'queue_id' and 'options' are both optional (tested against NULL before
 * use):
 *   - if 'queue_id' is nonnull, the class handle is converted to a queue id:
 *     a handle with major number 1 and minor number in 1...HTB_N_QUEUES maps
 *     to queue id minor - 1;
 *   - if 'options' is nonnull, the HTB parameters are extracted into it with
 *     htb_parse_tca_options__().
 * 'stats' is passed straight through to tc_parse_class(). */
2530 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2531 struct htb_class *options,
2532 struct netdev_queue_stats *stats)
2534 struct nlattr *nl_options;
2535 unsigned int handle;
2538 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2539 if (!error && queue_id) {
2540 unsigned int major = tc_get_major(handle);
2541 unsigned int minor = tc_get_minor(handle);
2542 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2543 *queue_id = minor - 1;
2548 if (!error && options) {
2549 error = htb_parse_tca_options__(nl_options, options);
2555 htb_parse_qdisc_details__(struct netdev *netdev,
2556 const struct shash *details, struct htb_class *hc)
2558 const char *max_rate_s;
2560 max_rate_s = shash_find_data(details, "max-rate");
2561 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2562 if (!hc->max_rate) {
2565 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2566 hc->max_rate = netdev_features_to_bps(current) / 8;
2568 hc->min_rate = hc->max_rate;
2574 htb_parse_class_details__(struct netdev *netdev,
2575 const struct shash *details, struct htb_class *hc)
2577 const struct htb *htb = htb_get__(netdev);
2578 const char *min_rate_s = shash_find_data(details, "min-rate");
2579 const char *max_rate_s = shash_find_data(details, "max-rate");
2580 const char *burst_s = shash_find_data(details, "burst");
2581 const char *priority_s = shash_find_data(details, "priority");
2584 error = netdev_get_mtu(netdev, &mtu);
2586 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2587 netdev_get_name(netdev));
2591 /* HTB requires at least an mtu sized min-rate to send any traffic even
2592 * on uncongested links. */
2593 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2594 hc->min_rate = MAX(hc->min_rate, mtu);
2595 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2598 hc->max_rate = (max_rate_s
2599 ? strtoull(max_rate_s, NULL, 10) / 8
2601 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2602 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2606 * According to hints in the documentation that I've read, it is important
2607 * that 'burst' be at least as big as the largest frame that might be
2608 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2609 * but having it a bit too small is a problem. Since netdev_get_mtu()
2610 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2611 * the MTU. We actually add 64, instead of 14, as a guard against
2612 * additional headers get tacked on somewhere that we're not aware of. */
2613 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2614 hc->burst = MAX(hc->burst, mtu + 64);
2617 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about the class 'handle' (child of 'parent') on 'netdev'
 * and parses the reply into 'options' and 'stats' (see htb_parse_tcmsg__()).
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2639 htb_tc_install(struct netdev *netdev, const struct shash *details)
2643 error = htb_setup_qdisc__(netdev);
2645 struct htb_class hc;
2647 htb_parse_qdisc_details__(netdev, details, &hc);
2648 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2649 tc_make_handle(1, 0), &hc);
2651 htb_install__(netdev, hc.max_rate);
2657 static struct htb_class *
2658 htb_class_cast__(const struct tc_queue *queue)
2660 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2664 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2665 const struct htb_class *hc)
2667 struct htb *htb = htb_get__(netdev);
2668 size_t hash = hash_int(queue_id, 0);
2669 struct tc_queue *queue;
2670 struct htb_class *hcp;
2672 queue = tc_find_queue__(netdev, queue_id, hash);
2674 hcp = htb_class_cast__(queue);
2676 hcp = xmalloc(sizeof *hcp);
2677 queue = &hcp->tc_queue;
2678 queue->queue_id = queue_id;
2679 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2682 hcp->min_rate = hc->min_rate;
2683 hcp->max_rate = hc->max_rate;
2684 hcp->burst = hc->burst;
2685 hcp->priority = hc->priority;
2689 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2692 struct nl_dump dump;
2693 struct htb_class hc;
2695 /* Get qdisc options. */
2697 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2698 htb_install__(netdev, hc.max_rate);
2701 if (!start_queue_dump(netdev, &dump)) {
2704 while (nl_dump_next(&dump, &msg)) {
2705 unsigned int queue_id;
2707 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2708 htb_update_queue__(netdev, queue_id, &hc);
2711 nl_dump_done(&dump);
2717 htb_tc_destroy(struct tc *tc)
2719 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2720 struct htb_class *hc, *next;
2722 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2723 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2731 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2733 const struct htb *htb = htb_get__(netdev);
2734 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2739 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2741 struct htb_class hc;
2744 htb_parse_qdisc_details__(netdev, details, &hc);
2745 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2746 tc_make_handle(1, 0), &hc);
2748 htb_get__(netdev)->max_rate = hc.max_rate;
2754 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2755 const struct tc_queue *queue, struct shash *details)
2757 const struct htb_class *hc = htb_class_cast__(queue);
2759 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2760 if (hc->min_rate != hc->max_rate) {
2761 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2763 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2765 shash_add(details, "priority", xasprintf("%u", hc->priority));
2771 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2772 const struct shash *details)
2774 struct htb_class hc;
2777 error = htb_parse_class_details__(netdev, details, &hc);
2782 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2783 tc_make_handle(1, 0xfffe), &hc);
2788 htb_update_queue__(netdev, queue_id, &hc);
2793 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2795 struct htb_class *hc = htb_class_cast__(queue);
2796 struct htb *htb = htb_get__(netdev);
2799 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2801 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2808 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2809 struct netdev_queue_stats *stats)
2811 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2812 tc_make_handle(1, 0xfffe), NULL, stats);
2816 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2817 const struct ofpbuf *nlmsg,
2818 netdev_dump_queue_stats_cb *cb, void *aux)
2820 struct netdev_queue_stats stats;
2821 unsigned int handle, major, minor;
2824 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2829 major = tc_get_major(handle);
2830 minor = tc_get_minor(handle);
2831 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2832 (*cb)(minor - 1, &stats, aux);
2837 static const struct tc_ops tc_ops_htb = {
2838 "htb", /* linux_name */
2839 "linux-htb", /* ovs_name */
2840 HTB_N_QUEUES, /* n_queues */
2849 htb_class_get_stats,
2850 htb_class_dump_stats
2853 /* "linux-hfsc" traffic control class. */
2855 #define HFSC_N_QUEUES 0xf000
2863 struct tc_queue tc_queue;
2868 static struct hfsc *
2869 hfsc_get__(const struct netdev *netdev)
2871 struct netdev_dev_linux *netdev_dev;
2872 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2873 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2876 static struct hfsc_class *
2877 hfsc_class_cast__(const struct tc_queue *queue)
2879 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2883 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2885 struct netdev_dev_linux * netdev_dev;
2888 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2889 hfsc = xmalloc(sizeof *hfsc);
2890 tc_init(&hfsc->tc, &tc_ops_hfsc);
2891 hfsc->max_rate = max_rate;
2892 netdev_dev->tc = &hfsc->tc;
2896 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2897 const struct hfsc_class *hc)
2901 struct hfsc_class *hcp;
2902 struct tc_queue *queue;
2904 hfsc = hfsc_get__(netdev);
2905 hash = hash_int(queue_id, 0);
2907 queue = tc_find_queue__(netdev, queue_id, hash);
2909 hcp = hfsc_class_cast__(queue);
2911 hcp = xmalloc(sizeof *hcp);
2912 queue = &hcp->tc_queue;
2913 queue->queue_id = queue_id;
2914 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2917 hcp->min_rate = hc->min_rate;
2918 hcp->max_rate = hc->max_rate;
2922 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2924 const struct tc_service_curve *rsc, *fsc, *usc;
2925 static const struct nl_policy tca_hfsc_policy[] = {
2927 .type = NL_A_UNSPEC,
2929 .min_len = sizeof(struct tc_service_curve),
2932 .type = NL_A_UNSPEC,
2934 .min_len = sizeof(struct tc_service_curve),
2937 .type = NL_A_UNSPEC,
2939 .min_len = sizeof(struct tc_service_curve),
2942 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2944 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2945 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2946 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2950 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2951 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2952 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2954 if (rsc->m1 != 0 || rsc->d != 0 ||
2955 fsc->m1 != 0 || fsc->d != 0 ||
2956 usc->m1 != 0 || usc->d != 0) {
2957 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2958 "Non-linear service curves are not supported.");
2962 if (rsc->m2 != fsc->m2) {
2963 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2964 "Real-time service curves are not supported ");
2968 if (rsc->m2 > usc->m2) {
2969 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2970 "Min-rate service curve is greater than "
2971 "the max-rate service curve.");
2975 class->min_rate = fsc->m2;
2976 class->max_rate = usc->m2;
2981 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2982 struct hfsc_class *options,
2983 struct netdev_queue_stats *stats)
2986 unsigned int handle;
2987 struct nlattr *nl_options;
2989 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2995 unsigned int major, minor;
2997 major = tc_get_major(handle);
2998 minor = tc_get_minor(handle);
2999 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3000 *queue_id = minor - 1;
3007 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about the class 'handle' (child of 'parent') on 'netdev'
 * and parses the reply into 'options' and 'stats' (see
 * hfsc_parse_tcmsg__()).
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3032 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3033 struct hfsc_class *class)
3036 const char *max_rate_s;
3038 max_rate_s = shash_find_data(details, "max-rate");
3039 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3044 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3045 max_rate = netdev_features_to_bps(current) / 8;
3048 class->min_rate = max_rate;
3049 class->max_rate = max_rate;
3053 hfsc_parse_class_details__(struct netdev *netdev,
3054 const struct shash *details,
3055 struct hfsc_class * class)
3057 const struct hfsc *hfsc;
3058 uint32_t min_rate, max_rate;
3059 const char *min_rate_s, *max_rate_s;
3061 hfsc = hfsc_get__(netdev);
3062 min_rate_s = shash_find_data(details, "min-rate");
3063 max_rate_s = shash_find_data(details, "max-rate");
3065 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3066 min_rate = MAX(min_rate, 1);
3067 min_rate = MIN(min_rate, hfsc->max_rate);
3069 max_rate = (max_rate_s
3070 ? strtoull(max_rate_s, NULL, 10) / 8
3072 max_rate = MAX(max_rate, min_rate);
3073 max_rate = MIN(max_rate, hfsc->max_rate);
3075 class->min_rate = min_rate;
3076 class->max_rate = max_rate;
3081 /* Create an HFSC qdisc.
3083 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3085 hfsc_setup_qdisc__(struct netdev * netdev)
3087 struct tcmsg *tcmsg;
3088 struct ofpbuf request;
3089 struct tc_hfsc_qopt opt;
3091 tc_del_qdisc(netdev);
3093 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3094 NLM_F_EXCL | NLM_F_CREATE, &request);
3100 tcmsg->tcm_handle = tc_make_handle(1, 0);
3101 tcmsg->tcm_parent = TC_H_ROOT;
3103 memset(&opt, 0, sizeof opt);
3106 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3107 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3109 return tc_transact(&request, NULL);
3112 /* Create an HFSC class.
3114 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3115 * sc rate <min_rate> ul rate <max_rate>" */
3117 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3118 unsigned int parent, struct hfsc_class *class)
3122 struct tcmsg *tcmsg;
3123 struct ofpbuf request;
3124 struct tc_service_curve min, max;
3126 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3132 tcmsg->tcm_handle = handle;
3133 tcmsg->tcm_parent = parent;
3137 min.m2 = class->min_rate;
3141 max.m2 = class->max_rate;
3143 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3144 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3145 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3146 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3147 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3148 nl_msg_end_nested(&request, opt_offset);
3150 error = tc_transact(&request, NULL);
3152 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3153 "min-rate %ubps, max-rate %ubps (%s)",
3154 netdev_get_name(netdev),
3155 tc_get_major(handle), tc_get_minor(handle),
3156 tc_get_major(parent), tc_get_minor(parent),
3157 class->min_rate, class->max_rate, strerror(error));
3164 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3167 struct hfsc_class class;
3169 error = hfsc_setup_qdisc__(netdev);
3175 hfsc_parse_qdisc_details__(netdev, details, &class);
3176 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3177 tc_make_handle(1, 0), &class);
3183 hfsc_install__(netdev, class.max_rate);
3188 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3191 struct nl_dump dump;
3192 struct hfsc_class hc;
3195 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3196 hfsc_install__(netdev, hc.max_rate);
3198 if (!start_queue_dump(netdev, &dump)) {
3202 while (nl_dump_next(&dump, &msg)) {
3203 unsigned int queue_id;
3205 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3206 hfsc_update_queue__(netdev, queue_id, &hc);
3210 nl_dump_done(&dump);
3215 hfsc_tc_destroy(struct tc *tc)
3218 struct hfsc_class *hc, *next;
3220 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3222 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3223 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3232 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3234 const struct hfsc *hfsc;
3235 hfsc = hfsc_get__(netdev);
3236 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3241 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3244 struct hfsc_class class;
3246 hfsc_parse_qdisc_details__(netdev, details, &class);
3247 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3248 tc_make_handle(1, 0), &class);
3251 hfsc_get__(netdev)->max_rate = class.max_rate;
3258 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3259 const struct tc_queue *queue, struct shash *details)
3261 const struct hfsc_class *hc;
3263 hc = hfsc_class_cast__(queue);
3264 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3265 if (hc->min_rate != hc->max_rate) {
3266 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3272 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3273 const struct shash *details)
3276 struct hfsc_class class;
3278 error = hfsc_parse_class_details__(netdev, details, &class);
3283 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3284 tc_make_handle(1, 0xfffe), &class);
3289 hfsc_update_queue__(netdev, queue_id, &class);
3294 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3298 struct hfsc_class *hc;
3300 hc = hfsc_class_cast__(queue);
3301 hfsc = hfsc_get__(netdev);
3303 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3305 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3312 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3313 struct netdev_queue_stats *stats)
3315 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3316 tc_make_handle(1, 0xfffe), NULL, stats);
3320 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3321 const struct ofpbuf *nlmsg,
3322 netdev_dump_queue_stats_cb *cb, void *aux)
3324 struct netdev_queue_stats stats;
3325 unsigned int handle, major, minor;
3328 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3333 major = tc_get_major(handle);
3334 minor = tc_get_minor(handle);
3335 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3336 (*cb)(minor - 1, &stats, aux);
3341 static const struct tc_ops tc_ops_hfsc = {
3342 "hfsc", /* linux_name */
3343 "linux-hfsc", /* ovs_name */
3344 HFSC_N_QUEUES, /* n_queues */
3345 hfsc_tc_install, /* tc_install */
3346 hfsc_tc_load, /* tc_load */
3347 hfsc_tc_destroy, /* tc_destroy */
3348 hfsc_qdisc_get, /* qdisc_get */
3349 hfsc_qdisc_set, /* qdisc_set */
3350 hfsc_class_get, /* class_get */
3351 hfsc_class_set, /* class_set */
3352 hfsc_class_delete, /* class_delete */
3353 hfsc_class_get_stats, /* class_get_stats */
3354 hfsc_class_dump_stats /* class_dump_stats */
3357 /* "linux-default" traffic control class.
3359 * This class represents the default, unnamed Linux qdisc. It corresponds to
3360 * the "" (empty string) QoS type in the OVS database. */
3363 default_install__(struct netdev *netdev)
3365 struct netdev_dev_linux *netdev_dev =
3366 netdev_dev_linux_cast(netdev_get_dev(netdev));
3367 static struct tc *tc;
3370 tc = xmalloc(sizeof *tc);
3371 tc_init(tc, &tc_ops_default);
3373 netdev_dev->tc = tc;
3377 default_tc_install(struct netdev *netdev,
3378 const struct shash *details OVS_UNUSED)
3380 default_install__(netdev);
3385 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3387 default_install__(netdev);
3391 static const struct tc_ops tc_ops_default = {
3392 NULL, /* linux_name */
3397 NULL, /* tc_destroy */
3398 NULL, /* qdisc_get */
3399 NULL, /* qdisc_set */
3400 NULL, /* class_get */
3401 NULL, /* class_set */
3402 NULL, /* class_delete */
3403 NULL, /* class_get_stats */
3404 NULL /* class_dump_stats */
3407 /* "linux-other" traffic control class.
3412 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3414 struct netdev_dev_linux *netdev_dev =
3415 netdev_dev_linux_cast(netdev_get_dev(netdev));
3416 static struct tc *tc;
3419 tc = xmalloc(sizeof *tc);
3420 tc_init(tc, &tc_ops_other);
3422 netdev_dev->tc = tc;
3426 static const struct tc_ops tc_ops_other = {
3427 NULL, /* linux_name */
3428 "linux-other", /* ovs_name */
3430 NULL, /* tc_install */
3432 NULL, /* tc_destroy */
3433 NULL, /* qdisc_get */
3434 NULL, /* qdisc_set */
3435 NULL, /* class_get */
3436 NULL, /* class_set */
3437 NULL, /* class_delete */
3438 NULL, /* class_get_stats */
3439 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
/* Returns the major number from 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Returns the minor number from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3487 static struct tcmsg *
3488 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3489 struct ofpbuf *request)
3491 struct tcmsg *tcmsg;
3495 error = get_ifindex(netdev, &ifindex);
3500 ofpbuf_init(request, 512);
3501 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3502 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3503 tcmsg->tcm_family = AF_UNSPEC;
3504 tcmsg->tcm_ifindex = ifindex;
3505 /* Caller should fill in tcmsg->tcm_handle. */
3506 /* Caller should fill in tcmsg->tcm_parent. */
3512 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3514 int error = nl_sock_transact(rtnl_sock, request, replyp);
3515 ofpbuf_uninit(request);
3522 /* The values in psched are not individually very meaningful, but they are
3523 * important. The tables below show some values seen in the wild.
3527 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3528 * (Before that, there are hints that it was 1000000000.)
3530 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3534 * -----------------------------------
3535 * [1] 000c8000 000f4240 000f4240 00000064
3536 * [2] 000003e8 00000400 000f4240 3b9aca00
3537 * [3] 000003e8 00000400 000f4240 3b9aca00
3538 * [4] 000003e8 00000400 000f4240 00000064
3539 * [5] 000003e8 00000040 000f4240 3b9aca00
3540 * [6] 000003e8 00000040 000f4240 000000f9
3542 * a b c d ticks_per_s buffer_hz
3543 * ------- --------- ---------- ------------- ----------- -------------
3544 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3545 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3546 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3547 * [4] 1,000 1,024 1,000,000 100 976,562 100
3548 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3549 * [6] 1,000 64 1,000,000 249 15,625,000 249
3551 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3552 * [2] 2.6.26-1-686-bigmem from Debian lenny
3553 * [3] 2.6.26-2-sparc64 from Debian lenny
3554 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3555 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3556 * [6] 2.6.34 from kernel.org on KVM
3558 static const char fn[] = "/proc/net/psched";
3559 unsigned int a, b, c, d;
3565 stream = fopen(fn, "r");
3567 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3571 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3572 VLOG_WARN("%s: read failed", fn);
3576 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3580 VLOG_WARN("%s: invalid scheduler parameters", fn);
3584 ticks_per_s = (double) a * c / b;
3588 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3591 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3594 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3595 * rate of 'rate' bytes per second. */
3597 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3602 return (rate * ticks) / ticks_per_s;
3605 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3606 * rate of 'rate' bytes per second. */
3608 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3613 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3616 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3617 * a transmission rate of 'rate' bytes per second. */
3619 tc_buffer_per_jiffy(unsigned int rate)
3624 return rate / buffer_hz;
3627 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3628 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3629 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3630 * stores NULL into it if it is absent.
3632 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3635 * Returns 0 if successful, otherwise a positive errno value. */
3637 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3638 struct nlattr **options)
3640 static const struct nl_policy tca_policy[] = {
3641 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3642 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3644 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3646 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3647 tca_policy, ta, ARRAY_SIZE(ta))) {
3648 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3653 *kind = nl_attr_get_string(ta[TCA_KIND]);
3657 *options = ta[TCA_OPTIONS];
3672 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3673 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3674 * into '*options', and its queue statistics into '*stats'. Any of the output
3675 * arguments may be null.
3677 * Returns 0 if successful, otherwise a positive errno value. */
3679 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3680 struct nlattr **options, struct netdev_queue_stats *stats)
3682 static const struct nl_policy tca_policy[] = {
3683 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3684 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3686 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3688 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3689 tca_policy, ta, ARRAY_SIZE(ta))) {
3690 VLOG_WARN_RL(&rl, "failed to parse class message");
3695 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3696 *handlep = tc->tcm_handle;
3700 *options = ta[TCA_OPTIONS];
3704 const struct gnet_stats_queue *gsq;
3705 struct gnet_stats_basic gsb;
3707 static const struct nl_policy stats_policy[] = {
3708 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3709 .min_len = sizeof gsb },
3710 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3711 .min_len = sizeof *gsq },
3713 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3715 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3716 sa, ARRAY_SIZE(sa))) {
3717 VLOG_WARN_RL(&rl, "failed to parse class stats");
3721 /* Alignment issues screw up the length of struct gnet_stats_basic on
3722 * some arch/bitsize combinations. Newer versions of Linux have a
3723 * struct gnet_stats_basic_packed, but we can't depend on that. The
3724 * easiest thing to do is just to make a copy. */
3725 memset(&gsb, 0, sizeof gsb);
3726 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3727 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3728 stats->tx_bytes = gsb.bytes;
3729 stats->tx_packets = gsb.packets;
3731 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3732 stats->tx_errors = gsq->drops;
3742 memset(stats, 0, sizeof *stats);
3747 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3750 tc_query_class(const struct netdev *netdev,
3751 unsigned int handle, unsigned int parent,
3752 struct ofpbuf **replyp)
3754 struct ofpbuf request;
3755 struct tcmsg *tcmsg;
3758 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3762 tcmsg->tcm_handle = handle;
3763 tcmsg->tcm_parent = parent;
3765 error = tc_transact(&request, replyp);
3767 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3768 netdev_get_name(netdev),
3769 tc_get_major(handle), tc_get_minor(handle),
3770 tc_get_major(parent), tc_get_minor(parent),
3776 /* Equivalent to "tc class del dev <name> handle <handle>". */
3778 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3780 struct ofpbuf request;
3781 struct tcmsg *tcmsg;
3784 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3788 tcmsg->tcm_handle = handle;
3789 tcmsg->tcm_parent = 0;
3791 error = tc_transact(&request, NULL);
3793 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3794 netdev_get_name(netdev),
3795 tc_get_major(handle), tc_get_minor(handle),
3801 /* Equivalent to "tc qdisc del dev <name> root". */
3803 tc_del_qdisc(struct netdev *netdev)
3805 struct netdev_dev_linux *netdev_dev =
3806 netdev_dev_linux_cast(netdev_get_dev(netdev));
3807 struct ofpbuf request;
3808 struct tcmsg *tcmsg;
3811 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3815 tcmsg->tcm_handle = tc_make_handle(1, 0);
3816 tcmsg->tcm_parent = TC_H_ROOT;
3818 error = tc_transact(&request, NULL);
3819 if (error == EINVAL) {
3820 /* EINVAL probably means that the default qdisc was in use, in which
3821 * case we've accomplished our purpose. */
3824 if (!error && netdev_dev->tc) {
3825 if (netdev_dev->tc->ops->tc_destroy) {
3826 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3828 netdev_dev->tc = NULL;
3833 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3834 * kernel to determine what they are. Returns 0 if successful, otherwise a
3835 * positive errno value. */
3837 tc_query_qdisc(const struct netdev *netdev)
3839 struct netdev_dev_linux *netdev_dev =
3840 netdev_dev_linux_cast(netdev_get_dev(netdev));
3841 struct ofpbuf request, *qdisc;
3842 const struct tc_ops *ops;
3843 struct tcmsg *tcmsg;
3847 if (netdev_dev->tc) {
3851 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3852 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3853 * 2.6.35 without that fix backported to it.
3855 * To avoid the OOPS, we must not make a request that would attempt to dump
3856 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3857 * few others. There are a few ways that I can see to do this, but most of
3858 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3859 * technique chosen here is to assume that any non-default qdisc that we
3860 * create will have a class with handle 1:0. The built-in qdiscs only have
3861 * a class with handle 0:0.
3863 * We could check for Linux 2.6.35+ and use a more straightforward method
3865 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3869 tcmsg->tcm_handle = tc_make_handle(1, 0);
3870 tcmsg->tcm_parent = 0;
3872 /* Figure out what tc class to instantiate. */
3873 error = tc_transact(&request, &qdisc);
3877 error = tc_parse_qdisc(qdisc, &kind, NULL);
3879 ops = &tc_ops_other;
3881 ops = tc_lookup_linux_name(kind);
3883 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3884 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3886 ops = &tc_ops_other;
3889 } else if (error == ENOENT) {
3890 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3891 * other entity that doesn't have a handle 1:0. We will assume
3892 * that it's the system default qdisc. */
3893 ops = &tc_ops_default;
3896 /* Who knows? Maybe the device got deleted. */
3897 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3898 netdev_get_name(netdev), strerror(error));
3899 ops = &tc_ops_other;
3902 /* Instantiate it. */
3903 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3904 assert((load_error == 0) == (netdev_dev->tc != NULL));
3905 ofpbuf_delete(qdisc);
3907 return error ? error : load_error;
3910 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3911 approximate the time to transmit packets of various lengths. For an MTU of
3912 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3913 represents two possible packet lengths; for an MTU of 513 through 1024, four
3914 possible lengths; and so on.
3916 Returns, for the specified 'mtu', the number of bits that packet lengths
3917 need to be shifted right to fit within such a 256-entry table. */
3919 tc_calc_cell_log(unsigned int mtu)
/* Substitute the maximum Ethernet payload size for 'mtu'.  NOTE(review): the
 * condition guarding this assignment is not visible in this view; presumably
 * it applies only when the caller passes an 'mtu' of 0 -- confirm. */
3924 mtu = ETH_PAYLOAD_MAX;
/* Add the Ethernet and VLAN header lengths, so that the size being scaled
 * includes those headers and not only the payload. */
3926 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* 'cell_log' goes up by one per iteration for as long as 'mtu' is at least
 * 256.  NOTE(review): the loop body is not visible in this view; presumably
 * it halves 'mtu' on each pass -- confirm. */
3928 for (cell_log = 0; mtu >= 256; cell_log++) {
/* tc_fill_rate(): clears every byte of the ratespec that 'rate' points to and
 * then fills in two fields: 'cell_log', computed by tc_calc_cell_log() from
 * 'mtu', and 'mpu', set to ETH_TOTAL_MIN.  The 'overhead' and 'cell_align'
 * assignments are deliberately left commented out, so those fields keep the
 * zero written by memset().
 *
 * NOTE(review): no statement that stores 'Bps' is visible in this view, and
 * the header comment that follows has lost its closing line -- confirm both
 * against the complete file. */
3935 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3938 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3940 memset(rate, 0, sizeof *rate);
3941 rate->cell_log = tc_calc_cell_log(mtu);
3942 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3943 /* rate->cell_align = 0; */ /* distro headers. */
3944 rate->mpu = ETH_TOTAL_MIN;
3948 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3949 * attribute of the specified "type".
3951 * See tc_calc_cell_log() above for a description of "rtab"s. */
3953 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve TC_RTAB_SIZE bytes of attribute payload in 'msg'; the loop below
 * writes every entry.  NOTE(review): the helper's name suggests the payload
 * starts out uninitialized -- confirm. */
3958 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
/* TC_RTAB_SIZE is a byte count, so dividing by the entry size gives the
 * number of table slots. */
3959 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Packet size that slot 'i' stands for: slots are 1 << cell_log bytes
 * apart and slot 0 covers sizes up to the first multiple. */
3960 unsigned packet_size = (i + 1) << rate->cell_log;
/* Anything smaller than the minimum packet unit is charged as 'mpu'. */
3961 if (packet_size < rate->mpu) {
3962 packet_size = rate->mpu;
/* Store the result of tc_bytes_to_ticks() for 'packet_size' bytes at
 * rate->rate. */
3964 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* tc_calc_buffer(): the burst that is used is the larger of 'burst_bytes' and
 * a floor of tc_buffer_per_jiffy(Bps) plus 'mtu'; the return value is that
 * byte count passed through tc_bytes_to_ticks() at rate 'Bps'.
 *
 * NOTE(review): MAX() here compares a uint64_t with an unsigned int; whether
 * a very large 'burst_bytes' is narrowed depends on the parameter type of
 * tc_bytes_to_ticks(), which is not visible in this view -- confirm.
 * NOTE(review): the header comment that follows has lost its closing line in
 * this view. */
3968 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3969 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3970 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
3973 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
3975 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
3976 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3979 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Each of the 21 counters visible here is transferred by plain assignment
 * into the field of the same name, so the only conversion is the implicit one
 * between the two field types.  NOTE(review): both structures are declared
 * elsewhere -- confirm how their field widths differ. */
3981 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
3982 const struct rtnl_link_stats *src)
/* Packet, byte, error and drop totals, plus multicast and collisions. */
3984 dst->rx_packets = src->rx_packets;
3985 dst->tx_packets = src->tx_packets;
3986 dst->rx_bytes = src->rx_bytes;
3987 dst->tx_bytes = src->tx_bytes;
3988 dst->rx_errors = src->rx_errors;
3989 dst->tx_errors = src->tx_errors;
3990 dst->rx_dropped = src->rx_dropped;
3991 dst->tx_dropped = src->tx_dropped;
3992 dst->multicast = src->multicast;
3993 dst->collisions = src->collisions;
/* Receive error counters, by cause. */
3994 dst->rx_length_errors = src->rx_length_errors;
3995 dst->rx_over_errors = src->rx_over_errors;
3996 dst->rx_crc_errors = src->rx_crc_errors;
3997 dst->rx_frame_errors = src->rx_frame_errors;
3998 dst->rx_fifo_errors = src->rx_fifo_errors;
3999 dst->rx_missed_errors = src->rx_missed_errors;
/* Transmit error counters, by cause. */
4000 dst->tx_aborted_errors = src->tx_aborted_errors;
4001 dst->tx_carrier_errors = src->tx_carrier_errors;
4002 dst->tx_fifo_errors = src->tx_fifo_errors;
4003 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4004 dst->tx_window_errors = src->tx_window_errors;
4008 /* Utility functions. */
/* Fetches statistics for the interface whose index is 'ifindex': sends an
 * RTM_GETLINK request over 'rtnl_sock', parses the reply against the policy
 * below, and converts the reply's IFLA_STATS attribute into 'stats' with
 * netdev_stats_from_rtnl_link_stats().
 *
 * NOTE(review): the return statements are not visible in this view, so the
 * exact error values are not documented here. */
4011 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4013 /* Policy for RTNLGRP_LINK messages.
4015 * There are *many* more fields in these messages, but currently we only
4016 * care about these fields. */
4017 static const struct nl_policy rtnlgrp_link_policy[] = {
4018 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4019 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4020 .min_len = sizeof(struct rtnl_link_stats) },
4023 struct ofpbuf request;
4024 struct ofpbuf *reply;
4025 struct ifinfomsg *ifi;
4026 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a Netlink header followed by a zeroed ifinfomsg that
 * selects the interface by index. */
4029 ofpbuf_init(&request, 0);
4030 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4031 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4032 ifi->ifi_family = PF_UNSPEC;
4033 ifi->ifi_index = ifindex;
/* The request buffer is released right after the transaction, before
 * 'error' is examined. */
4034 error = nl_sock_transact(rtnl_sock, &request, &reply);
4035 ofpbuf_uninit(&request);
/* Attributes are parsed starting after the Netlink header and the
 * ifinfomsg.  If parsing fails the reply is freed here. */
4040 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4041 rtnlgrp_link_policy,
4042 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4043 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence has to be checked
 * separately; the reply is freed on this path too. */
4047 if (!attrs[IFLA_STATS]) {
4048 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4049 ofpbuf_delete(reply);
/* The policy's 'min_len' requires the attribute payload to be at least
 * sizeof(struct rtnl_link_stats) bytes. */
4053 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4055 ofpbuf_delete(reply);
/* Looks for 'netdev_name' in /proc/net/dev, reading the file one line at a
 * time, and fills in 'stats' from the matching line.  Seven counters are set
 * to UINT64_MAX for the matching device instead of being read from the file.
 *
 * The scan stores straight into 'stats' for every line it parses and the name
 * is compared only afterwards, so 'stats' can hold another device's counters
 * when the "no stats for" warning is logged.
 *
 * NOTE(review): most of the sscanf() call, the line counter handling and the
 * return statements are not visible in this view. */
4061 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4063 static const char fn[] = "/proc/net/dev";
4068 stream = fopen(fn, "r");
4070 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4075 while (fgets(line, sizeof line, stream)) {
/* X64 is the conversion for one uint64_t column.  Each of the two format
 * rows below reads seven such columns and skips an eighth, which gives 14
 * stored counters; the comparison against 15 presumably counts the device
 * name as well -- confirm against the complete call. */
4078 #define X64 "%"SCNu64
4081 X64 X64 X64 X64 X64 X64 X64 "%*u"
4082 X64 X64 X64 X64 X64 X64 X64 "%*u",
4088 &stats->rx_fifo_errors,
4089 &stats->rx_frame_errors,
4095 &stats->tx_fifo_errors,
4097 &stats->tx_carrier_errors) != 15) {
4098 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4099 } else if (!strcmp(devname, netdev_name)) {
/* These counters are not among the scan targets visible above; they are
 * given the all-ones value instead. */
4100 stats->rx_length_errors = UINT64_MAX;
4101 stats->rx_over_errors = UINT64_MAX;
4102 stats->rx_crc_errors = UINT64_MAX;
4103 stats->rx_missed_errors = UINT64_MAX;
4104 stats->tx_aborted_errors = UINT64_MAX;
4105 stats->tx_heartbeat_errors = UINT64_MAX;
4106 stats->tx_window_errors = UINT64_MAX;
4112 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the carrier state of the device called 'name' from
 * /sys/class/net/<name>/carrier and stores it in the bool that 'carrier'
 * points to: false if the file starts with '0', true if it starts with '1'.
 * Open failures, read failures other than EINVAL, an empty file and any other
 * first character are each logged as a rate-limited warning.
 *
 * NOTE(review): the cleanup of 'fn' and 'fd' and the return statements are
 * not visible in this view. */
4118 get_carrier_via_sysfs(const char *name, bool *carrier)
4129 fn = xasprintf("/sys/class/net/%s/carrier", name);
4130 fd = open(fn, O_RDONLY);
4133 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
4137 retval = read(fd, line, sizeof line);
4140 if (error == EINVAL) {
4141 /* This is the normal return value when we try to check carrier if
4142 * the network device is not up. */
4144 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
4147 } else if (retval == 0) {
4149 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
/* Only the first byte that was read is examined. */
4153 if (line[0] != '0' && line[0] != '1') {
4155 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)", fn, line[0]);
4158 *carrier = line[0] != '0';
/* Reads the interface flags of 'netdev' with the SIOCGIFFLAGS ioctl, issued
 * through netdev_linux_do_ioctl() on the device's name, and stores
 * ifr.ifr_flags in the int that 'flags' points to.
 *
 * NOTE(review): the end of the ioctl call and the return statement are not
 * visible in this view, so it cannot be told from here whether 'flags' is
 * also written when the ioctl fails. */
4170 get_flags(const struct netdev *netdev, int *flags)
4175 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4177 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS
 * ioctl and returns whatever netdev_linux_do_ioctl() returns.  'flags'
 * replaces the whole flag word; no existing flags are read or merged here. */
4182 set_flags(struct netdev *netdev, int flags)
4186 ifr.ifr_flags = flags;
4187 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Asks the kernel for the interface index of the device called
 * 'netdev_name', using the SIOCGIFINDEX ioctl on 'af_inet_sock'.  Returns
 * ifr.ifr_ifindex when the ioctl succeeds.  When it fails, a rate-limited
 * warning is logged.
 *
 * NOTE(review): the value returned on failure is not visible in this view. */
4192 do_get_ifindex(const char *netdev_name)
4196 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4197 COVERAGE_INC(netdev_get_ifindex);
4198 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4199 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4200 netdev_name, strerror(errno));
4203 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in the int that 'ifindexp' points
 * to.  The index is cached in the underlying netdev_dev_linux: the kernel is
 * queried through do_get_ifindex() only while the VALID_IFINDEX bit is clear
 * in 'cache_valid', and the bit is set once the index has been stored.
 *
 * NOTE(review): the handling of a failed do_get_ifindex() call and the
 * return statements are not visible in this view. */
4207 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4209 struct netdev_dev_linux *netdev_dev =
4210 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4212 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4213 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Remember the index so that later calls skip the ioctl. */
4217 netdev_dev->cache_valid |= VALID_IFINDEX;
4218 netdev_dev->ifindex = ifindex;
4220 *ifindexp = netdev_dev->ifindex;
/* Copies the hardware address of the device called 'netdev_name' into 'ea',
 * using the SIOCGIFHWADDR ioctl on 'af_inet_sock'.  Exactly ETH_ADDR_LEN
 * bytes are copied out of ifr.ifr_hwaddr.sa_data.
 *
 * NOTE(review): the return statements are not visible in this view. */
4225 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4230 memset(&ifr, 0, sizeof ifr);
4231 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4232 COVERAGE_INC(netdev_get_hwaddr);
4233 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4234 /* ENODEV probably means that a vif disappeared asynchronously and
4235 * hasn't been removed from the database yet, so reduce the log level
4236 * to INFO for that case. */
4237 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4238 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4239 netdev_name, strerror(errno));
/* An address family other than AF_UNSPEC or ARPHRD_ETHER is only warned
 * about.  NOTE(review): no early exit is visible between the warning and
 * the copy below -- confirm that the copy still happens in that case. */
4242 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4243 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4244 VLOG_WARN("%s device has unknown hardware address family %d",
4245 netdev_name, hwaddr_family);
4247 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device called 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac', using the SIOCSIFHWADDR ioctl on
 * 'af_inet_sock'.  'hwaddr_family' is passed to the kernel unchanged as the
 * address family of the new address.  A failed ioctl is logged at error
 * level without rate limiting.
 *
 * NOTE(review): the return statements are not visible in this view. */
4252 set_etheraddr(const char *netdev_name, int hwaddr_family,
4253 const uint8_t mac[ETH_ADDR_LEN])
/* Start from an all-zero request so that unused fields are well defined. */
4257 memset(&ifr, 0, sizeof ifr);
4258 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4259 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4260 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4261 COVERAGE_INC(netdev_set_hwaddr);
4262 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4263 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4264 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for the device called 'name',
 * with 'ecmd' as the ioctl's data buffer.  'cmd_name' is used only in the log
 * message.  A failure is logged as a rate-limited warning unless errno is
 * EOPNOTSUPP, which is treated as too common to be worth logging.
 *
 * NOTE(review): how 'cmd' is placed into 'ecmd' and what is returned are not
 * visible in this view. */
4271 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4272 int cmd, const char *cmd_name)
4276 memset(&ifr, 0, sizeof ifr);
4277 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* The ethtool command structure travels through the ifreq's data pointer. */
4278 ifr.ifr_data = (caddr_t) ecmd;
4281 COVERAGE_INC(netdev_ethtool);
4282 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4285 if (errno != EOPNOTSUPP) {
4286 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4287 "failed: %s", cmd_name, name, strerror(errno));
4289 /* The device doesn't support this operation. That's pretty
4290 * common, so there's no point in logging anything. */
4296 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4297 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* Works in three ethtool calls: read the current flags, write them back with
 * only 'flag' changed, then read them again to check that the change took
 * effect.  'flag_name' is used only in the warning message.
 *
 * NOTE(review): the error checks after each call and the return statements
 * are not visible in this view. */
4299 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4300 const char *flag_name, bool enable)
4302 const char *netdev_name = netdev_get_name(netdev);
4303 struct ethtool_value evalue;
/* Step 1: fetch the current flags into 'evalue'.  The ethtool_value is
 * passed through a cast because netdev_linux_do_ethtool() takes a
 * struct ethtool_cmd pointer. */
4307 memset(&evalue, 0, sizeof evalue);
4308 error = netdev_linux_do_ethtool(netdev_name,
4309 (struct ethtool_cmd *)&evalue,
4310 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: clear 'flag', set it again if 'enable' is true, and write the
 * result back.  'new_flags' keeps the value that was requested. */
4315 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4316 error = netdev_linux_do_ethtool(netdev_name,
4317 (struct ethtool_cmd *)&evalue,
4318 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags back into a freshly zeroed 'evalue'. */
4323 memset(&evalue, 0, sizeof evalue);
4324 error = netdev_linux_do_ethtool(netdev_name,
4325 (struct ethtool_cmd *)&evalue,
4326 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* A mismatch means the device did not end up with the requested flags even
 * though the calls above were made. */
4331 if (new_flags != evalue.data) {
4332 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4333 "device %s failed", enable ? "enable" : "disable",
4334 flag_name, netdev_name);
/* Runs ioctl 'cmd' on 'af_inet_sock' for the device called 'name'.  The
 * caller supplies 'ifr' with any command-specific fields already filled in;
 * this function overwrites only ifr->ifr_name.  'cmd_name' is used only in
 * the log message.  A failure (ioctl() returning -1) is logged at debug
 * level, rate-limited.
 *
 * NOTE(review): the rest of the log call and the return statements are not
 * visible in this view. */
4342 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4343 const char *cmd_name)
4345 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4346 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4347 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Retrieves an IPv4 address of 'netdev' by running ioctl 'cmd' through
 * netdev_linux_do_ioctl() with the request's address family set to AF_INET,
 * then stores the sin_addr of the returned address in the in_addr that 'ip'
 * points to.  Which address is fetched depends on 'cmd'; 'cmd_name' is
 * passed along for logging.
 *
 * NOTE(review): the condition guarding the copy into 'ip' and the return
 * statement are not visible in this view. */
4355 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4356 int cmd, const char *cmd_name)
4361 ifr.ifr_addr.sa_family = AF_INET;
4362 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* The generic sockaddr in the ifreq is reinterpreted as a sockaddr_in. */
4364 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4365 *ip = sin->sin_addr;
4370 /* Returns an AF_PACKET raw socket or a negative errno value. */
4372 af_packet_sock(void)
4374 static int sock = INT_MIN;
4376 if (sock == INT_MIN) {
4377 sock = socket(AF_PACKET, SOCK_RAW, 0);
4379 set_nonblocking(sock);
4382 VLOG_ERR("failed to create packet socket: %s", strerror(errno));