2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/mii.h>
29 #include <linux/pkt_sched.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/sockios.h>
32 #include <linux/version.h>
33 #include <sys/types.h>
34 #include <sys/ioctl.h>
35 #include <sys/socket.h>
36 #include <netpacket/packet.h>
37 #include <net/ethernet.h>
39 #include <linux/if_tunnel.h>
40 #include <net/if_arp.h>
41 #include <net/if_packet.h>
42 #include <net/route.h>
43 #include <netinet/in.h>
50 #include "dpif-linux.h"
51 #include "dynamic-string.h"
52 #include "fatal-signal.h"
55 #include "netdev-provider.h"
56 #include "netdev-vport.h"
58 #include "netlink-socket.h"
60 #include "openflow/openflow.h"
62 #include "poll-loop.h"
63 #include "rtnetlink.h"
64 #include "rtnetlink-link.h"
65 #include "socket-util.h"
70 VLOG_DEFINE_THIS_MODULE(netdev_linux);
72 COVERAGE_DEFINE(netdev_get_vlan_vid);
73 COVERAGE_DEFINE(netdev_set_policing);
74 COVERAGE_DEFINE(netdev_arp_lookup);
75 COVERAGE_DEFINE(netdev_get_ifindex);
76 COVERAGE_DEFINE(netdev_get_hwaddr);
77 COVERAGE_DEFINE(netdev_set_hwaddr);
78 COVERAGE_DEFINE(netdev_ethtool);
80 /* These were introduced in Linux 2.6.14, so they might be missing if we have
82 #ifndef ADVERTISED_Pause
83 #define ADVERTISED_Pause (1 << 13)
85 #ifndef ADVERTISED_Asym_Pause
86 #define ADVERTISED_Asym_Pause (1 << 14)
89 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
92 #define TC_RTAB_SIZE 1024
95 static struct rtnetlink_notifier netdev_linux_cache_notifier;
96 static int cache_notifier_refcount;
99 VALID_IFINDEX = 1 << 0,
100 VALID_ETHERADDR = 1 << 1,
104 VALID_CARRIER = 1 << 5,
105 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
106 VALID_POLICING = 1 << 7,
107 VALID_HAVE_VPORT_STATS = 1 << 8
115 /* Traffic control. */
117 /* An instance of a traffic control class. Always associated with a particular
120 * Each TC implementation subclasses this with whatever additional data it
123 const struct tc_ops *ops;
124 struct hmap queues; /* Contains "struct tc_queue"s.
125 * Read by generic TC layer.
126 * Written only by TC implementation. */
129 /* One traffic control queue.
131 * Each TC implementation subclasses this with whatever additional data it
134 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
135 unsigned int queue_id; /* OpenFlow queue ID. */
138 /* A particular kind of traffic control. Each implementation generally maps to
139 * one particular Linux qdisc class.
141 * The functions below return 0 if successful or a positive errno value on
142 * failure, except where otherwise noted. All of them must be provided, except
143 * where otherwise noted. */
145 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
146 * This is null for tc_ops_default and tc_ops_other, for which there are no
147 * appropriate values. */
148 const char *linux_name;
150 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
151 const char *ovs_name;
153 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
154 * queues. The queues are numbered 0 through n_queues - 1. */
155 unsigned int n_queues;
157 /* Called to install this TC class on 'netdev'. The implementation should
158 * make the Netlink calls required to set up 'netdev' with the right qdisc
159 * and configure it according to 'details'. The implementation may assume
160 * that the current qdisc is the default; that is, there is no need for it
161 * to delete the current qdisc before installing itself.
163 * The contents of 'details' should be documented as valid for 'ovs_name'
164 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
165 * (which is built as ovs-vswitchd.conf.db(8)).
167 * This function must return 0 if and only if it sets 'netdev->tc' to an
168 * initialized 'struct tc'.
170 * (This function is null for tc_ops_other, which cannot be installed. For
171 * other TC classes it should always be nonnull.) */
172 int (*tc_install)(struct netdev *netdev, const struct shash *details);
174 /* Called when the netdev code determines (through a Netlink query) that
175 * this TC class's qdisc is installed on 'netdev', but we didn't install
176 * it ourselves and so don't know any of the details.
178 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
179 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
180 * implementation should parse the other attributes of 'nlmsg' as
181 * necessary to determine its configuration. If necessary it should also
182 * use Netlink queries to determine the configuration of queues on
185 * This function must return 0 if and only if it sets 'netdev->tc' to an
186 * initialized 'struct tc'. */
187 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
189 /* Destroys the data structures allocated by the implementation as part of
190 * 'tc'. (This includes destroying 'tc->queues' by calling
193 * The implementation should not need to perform any Netlink calls. If
194 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
195 * (But it may not be desirable.)
197 * This function may be null if 'tc' is trivial. */
198 void (*tc_destroy)(struct tc *tc);
200 /* Retrieves details of 'netdev->tc' configuration into 'details'.
202 * The implementation should not need to perform any Netlink calls, because
203 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
204 * cached the configuration.
206 * The contents of 'details' should be documented as valid for 'ovs_name'
207 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
208 * (which is built as ovs-vswitchd.conf.db(8)).
210 * This function may be null if 'tc' is not configurable.
212 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
214 /* Reconfigures 'netdev->tc' according to 'details', performing any
215 * required Netlink calls to complete the reconfiguration.
217 * The contents of 'details' should be documented as valid for 'ovs_name'
218 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
219 * (which is built as ovs-vswitchd.conf.db(8)).
221 * This function may be null if 'tc' is not configurable.
223 int (*qdisc_set)(struct netdev *, const struct shash *details);
225 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
226 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
228 * The contents of 'details' should be documented as valid for 'ovs_name'
229 * in the "other_config" column in the "Queue" table in
230 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
232 * The implementation should not need to perform any Netlink calls, because
233 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
234 * cached the queue configuration.
236 * This function may be null if 'tc' does not have queues ('n_queues' is
238 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
239 struct shash *details);
241 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
242 * 'details', performing any required Netlink calls to complete the
243 * reconfiguration. The caller ensures that 'queue_id' is less than
246 * The contents of 'details' should be documented as valid for 'ovs_name'
247 * in the "other_config" column in the "Queue" table in
248 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
250 * This function may be null if 'tc' does not have queues or its queues are
251 * not configurable. */
252 int (*class_set)(struct netdev *, unsigned int queue_id,
253 const struct shash *details);
255 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
256 * tc_queue's within 'netdev->tc->queues'.
258 * This function may be null if 'tc' does not have queues or its queues
259 * cannot be deleted. */
260 int (*class_delete)(struct netdev *, struct tc_queue *queue);
262 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
263 * 'struct tc_queue's within 'netdev->tc->queues'.
265 * On success, initializes '*stats'.
267 * This function may be null if 'tc' does not have queues or if it cannot
268 * report queue statistics. */
269 int (*class_get_stats)(const struct netdev *netdev,
270 const struct tc_queue *queue,
271 struct netdev_queue_stats *stats);
273 /* Extracts queue stats from 'nlmsg', which is a response to a
274 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
276 * This function may be null if 'tc' does not have queues or if it cannot
277 * report queue statistics. */
278 int (*class_dump_stats)(const struct netdev *netdev,
279 const struct ofpbuf *nlmsg,
280 netdev_dump_queue_stats_cb *cb, void *aux);
284 tc_init(struct tc *tc, const struct tc_ops *ops)
287 hmap_init(&tc->queues);
291 tc_destroy(struct tc *tc)
293 hmap_destroy(&tc->queues);
296 static const struct tc_ops tc_ops_htb;
297 static const struct tc_ops tc_ops_hfsc;
298 static const struct tc_ops tc_ops_default;
299 static const struct tc_ops tc_ops_other;
301 static const struct tc_ops *tcs[] = {
302 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
303 &tc_ops_hfsc, /* Hierarchical fair service curve. */
304 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
305 &tc_ops_other, /* Some other qdisc. */
309 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
310 static unsigned int tc_get_major(unsigned int handle);
311 static unsigned int tc_get_minor(unsigned int handle);
313 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
314 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
315 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
317 static struct tcmsg *tc_make_request(const struct netdev *, int type,
318 unsigned int flags, struct ofpbuf *);
319 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
321 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
322 struct nlattr **options);
323 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
324 struct nlattr **options,
325 struct netdev_queue_stats *);
326 static int tc_query_class(const struct netdev *,
327 unsigned int handle, unsigned int parent,
328 struct ofpbuf **replyp);
329 static int tc_delete_class(const struct netdev *, unsigned int handle);
331 static int tc_del_qdisc(struct netdev *netdev);
332 static int tc_query_qdisc(const struct netdev *netdev);
334 static int tc_calc_cell_log(unsigned int mtu);
335 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
336 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
337 const struct tc_ratespec *rate);
338 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
340 struct netdev_dev_linux {
341 struct netdev_dev netdev_dev;
343 struct shash_node *shash_node;
344 unsigned int cache_valid;
346 /* The following are figured out "on demand" only. They are only valid
347 * when the corresponding VALID_* bit in 'cache_valid' is set. */
349 uint8_t etheraddr[ETH_ADDR_LEN];
350 struct in_addr address, netmask;
354 bool is_internal; /* Is this an openvswitch internal device? */
355 bool is_tap; /* Is this a tuntap device? */
356 uint32_t kbits_rate; /* Policing data. */
357 uint32_t kbits_burst;
358 bool have_vport_stats;
362 struct tap_state tap;
366 struct netdev_linux {
367 struct netdev netdev;
371 /* Sockets used for ioctl operations. */
372 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
373 static int af_packet_sock = -1; /* AF_PACKET, SOCK_RAW. */
375 /* A Netlink routing socket that is not subscribed to any multicast groups. */
376 static struct nl_sock *rtnl_sock;
378 struct netdev_linux_notifier {
379 struct netdev_notifier notifier;
383 static struct shash netdev_linux_notifiers =
384 SHASH_INITIALIZER(&netdev_linux_notifiers);
385 static struct rtnetlink_notifier netdev_linux_poll_notifier;
387 /* This is set pretty low because we probably won't learn anything from the
388 * additional log messages. */
389 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
391 static int netdev_linux_init(void);
393 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
394 int cmd, const char *cmd_name);
395 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
396 const char *cmd_name);
397 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
398 int cmd, const char *cmd_name);
399 static int get_flags(const struct netdev *, int *flagsp);
400 static int set_flags(struct netdev *, int flags);
401 static int do_get_ifindex(const char *netdev_name);
402 static int get_ifindex(const struct netdev *, int *ifindexp);
403 static int do_set_addr(struct netdev *netdev,
404 int ioctl_nr, const char *ioctl_name,
405 struct in_addr addr);
406 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
407 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
408 const uint8_t[ETH_ADDR_LEN]);
409 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
410 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
413 is_netdev_linux_class(const struct netdev_class *netdev_class)
415 return netdev_class->init == netdev_linux_init;
418 static struct netdev_dev_linux *
419 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
421 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
422 assert(is_netdev_linux_class(netdev_class));
424 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
427 static struct netdev_linux *
428 netdev_linux_cast(const struct netdev *netdev)
430 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
431 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
432 assert(is_netdev_linux_class(netdev_class));
434 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
438 netdev_linux_init(void)
440 static int status = -1;
442 /* Create AF_INET socket. */
443 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
444 status = af_inet_sock >= 0 ? 0 : errno;
446 VLOG_ERR("failed to create inet socket: %s", strerror(status));
448 /* Create AF_PACKET socket. */
449 af_packet_sock = socket(AF_PACKET, SOCK_RAW, 0);
450 status = af_packet_sock >= 0 ? 0 : errno;
452 VLOG_ERR("failed to create packet socket: %s",
455 set_nonblocking(af_packet_sock);
458 /* Create rtnetlink socket. */
460 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
462 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
471 netdev_linux_run(void)
473 rtnetlink_link_notifier_run();
477 netdev_linux_wait(void)
479 rtnetlink_link_notifier_wait();
483 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
484 void *aux OVS_UNUSED)
486 struct netdev_dev_linux *dev;
488 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
490 const struct netdev_class *netdev_class =
491 netdev_dev_get_class(base_dev);
493 if (is_netdev_linux_class(netdev_class)) {
494 dev = netdev_dev_linux_cast(base_dev);
495 dev->cache_valid = 0;
499 struct shash device_shash;
500 struct shash_node *node;
502 shash_init(&device_shash);
503 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
504 SHASH_FOR_EACH (node, &device_shash) {
506 dev->cache_valid = 0;
508 shash_destroy(&device_shash);
512 /* Creates system and internal devices. */
514 netdev_linux_create(const struct netdev_class *class,
515 const char *name, const struct shash *args,
516 struct netdev_dev **netdev_devp)
518 struct netdev_dev_linux *netdev_dev;
521 if (!shash_is_empty(args)) {
522 VLOG_WARN("%s: arguments for %s devices should be empty",
526 if (!cache_notifier_refcount) {
527 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
528 netdev_linux_cache_cb, NULL);
533 cache_notifier_refcount++;
535 netdev_dev = xzalloc(sizeof *netdev_dev);
536 netdev_dev_init(&netdev_dev->netdev_dev, name, args, class);
538 *netdev_devp = &netdev_dev->netdev_dev;
542 /* For most types of netdevs we open the device for each call of
543 * netdev_open(). However, this is not the case with tap devices,
544 * since it is only possible to open the device once. In this
545 * situation we share a single file descriptor, and consequently
546 * buffers, across all readers. Therefore once data is read it will
547 * be unavailable to other reads for tap devices. */
549 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
550 const char *name, const struct shash *args,
551 struct netdev_dev **netdev_devp)
553 struct netdev_dev_linux *netdev_dev;
554 struct tap_state *state;
555 static const char tap_dev[] = "/dev/net/tun";
559 if (!shash_is_empty(args)) {
560 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
563 netdev_dev = xzalloc(sizeof *netdev_dev);
564 state = &netdev_dev->state.tap;
566 /* Open tap device. */
567 state->fd = open(tap_dev, O_RDWR);
570 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
574 /* Create tap device. */
575 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
576 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
577 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
578 VLOG_WARN("%s: creating tap device failed: %s", name,
584 /* Make non-blocking. */
585 error = set_nonblocking(state->fd);
590 netdev_dev_init(&netdev_dev->netdev_dev, name, args, &netdev_tap_class);
591 *netdev_devp = &netdev_dev->netdev_dev;
600 destroy_tap(struct netdev_dev_linux *netdev_dev)
602 struct tap_state *state = &netdev_dev->state.tap;
604 if (state->fd >= 0) {
609 /* Destroys the netdev device 'netdev_dev_'. */
611 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
613 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
614 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
616 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
617 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
620 if (class == &netdev_linux_class || class == &netdev_internal_class) {
621 cache_notifier_refcount--;
623 if (!cache_notifier_refcount) {
624 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
626 } else if (class == &netdev_tap_class) {
627 destroy_tap(netdev_dev);
636 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
637 struct netdev **netdevp)
639 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
640 struct netdev_linux *netdev;
641 enum netdev_flags flags;
644 /* Allocate network device. */
645 netdev = xzalloc(sizeof *netdev);
647 netdev_init(&netdev->netdev, netdev_dev_);
649 /* Verify that the device really exists, by attempting to read its flags.
650 * (The flags might be cached, in which case this won't actually do an
653 * Don't do this for "internal" netdevs, though, because those have to be
654 * created as netdev objects before they exist in the kernel, because
655 * creating them in the kernel happens by passing a netdev object to
656 * dpif_port_add(). */
657 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
658 error = netdev_get_flags(&netdev->netdev, &flags);
659 if (error == ENODEV) {
664 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
665 !netdev_dev->state.tap.opened) {
667 /* We assume that the first user of the tap device is the primary user
668 * and give them the tap FD. Subsequent users probably just expect
669 * this to be a system device so open it normally to avoid send/receive
670 * directions appearing to be reversed. */
671 netdev->fd = netdev_dev->state.tap.fd;
672 netdev_dev->state.tap.opened = true;
673 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
674 struct sockaddr_ll sll;
678 /* Create file descriptor. */
679 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
680 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
682 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
683 if (netdev->fd < 0) {
688 /* Set non-blocking mode. */
689 error = set_nonblocking(netdev->fd);
694 /* Get ethernet device index. */
695 error = get_ifindex(&netdev->netdev, &ifindex);
700 /* Bind to specific ethernet device. */
701 memset(&sll, 0, sizeof sll);
702 sll.sll_family = AF_PACKET;
703 sll.sll_ifindex = ifindex;
705 (struct sockaddr *) &sll, sizeof sll) < 0) {
707 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
712 /* Between the socket() and bind() calls above, the socket receives all
713 * packets of the requested type on all system interfaces. We do not
714 * want to receive that data, but there is no way to avoid it. So we
715 * must now drain out the receive queue. */
716 error = drain_rcvbuf(netdev->fd);
722 *netdevp = &netdev->netdev;
726 netdev_uninit(&netdev->netdev, true);
730 /* Closes and destroys 'netdev'. */
732 netdev_linux_close(struct netdev *netdev_)
734 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
736 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
742 /* Initializes 'sset' with a list of the names of all known network devices. */
744 netdev_linux_enumerate(struct sset *sset)
746 struct if_nameindex *names;
748 names = if_nameindex();
752 for (i = 0; names[i].if_name != NULL; i++) {
753 sset_add(sset, names[i].if_name);
755 if_freenameindex(names);
758 VLOG_WARN("could not obtain list of network device names: %s",
765 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
767 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
769 if (netdev->fd < 0) {
770 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
775 ssize_t retval = read(netdev->fd, data, size);
778 } else if (errno != EINTR) {
779 if (errno != EAGAIN) {
780 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
781 strerror(errno), netdev_get_name(netdev_));
788 /* Registers with the poll loop to wake up from the next call to poll_block()
789 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
791 netdev_linux_recv_wait(struct netdev *netdev_)
793 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
794 if (netdev->fd >= 0) {
795 poll_fd_wait(netdev->fd, POLLIN);
799 /* Discards all packets waiting to be received from 'netdev'. */
801 netdev_linux_drain(struct netdev *netdev_)
803 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
804 if (netdev->fd < 0) {
806 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
808 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
809 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
813 drain_fd(netdev->fd, ifr.ifr_qlen);
816 return drain_rcvbuf(netdev->fd);
820 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
821 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
822 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
823 * the packet is too big or too small to transmit on the device.
825 * The caller retains ownership of 'buffer' in all cases.
827 * The kernel maintains a packet transmission queue, so the caller is not
828 * expected to do additional queuing of packets. */
830 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
832 struct sockaddr_ll sll;
838 error = get_ifindex(netdev_, &ifindex);
843 /* We don't bother setting most fields in sockaddr_ll because the kernel
844 * ignores them for SOCK_RAW. */
845 memset(&sll, 0, sizeof sll);
846 sll.sll_family = AF_PACKET;
847 sll.sll_ifindex = ifindex;
849 iov.iov_base = (void *) data;
853 msg.msg_namelen = sizeof sll;
856 msg.msg_control = NULL;
857 msg.msg_controllen = 0;
861 ssize_t retval = sendmsg(af_packet_sock, &msg, 0);
863 /* The Linux AF_PACKET implementation never blocks waiting for room
864 * for packets, instead returning ENOBUFS. Translate this into
865 * EAGAIN for the caller. */
866 if (errno == ENOBUFS) {
868 } else if (errno == EINTR) {
870 } else if (errno != EAGAIN) {
871 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
872 netdev_get_name(netdev_), strerror(errno));
875 } else if (retval != size) {
876 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
877 "%zu) on %s", retval, size, netdev_get_name(netdev_));
885 /* Registers with the poll loop to wake up from the next call to poll_block()
886 * when the packet transmission queue has sufficient room to transmit a packet
887 * with netdev_send().
889 * The kernel maintains a packet transmission queue, so the client is not
890 * expected to do additional queuing of packets. Thus, this function is
891 * unlikely to ever be used. It is included for completeness. */
893 netdev_linux_send_wait(struct netdev *netdev_)
895 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
896 if (netdev->fd < 0) {
898 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
899 poll_fd_wait(netdev->fd, POLLOUT);
901 /* TAP device always accepts packets.*/
902 poll_immediate_wake();
906 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
907 * otherwise a positive errno value. */
909 netdev_linux_set_etheraddr(struct netdev *netdev_,
910 const uint8_t mac[ETH_ADDR_LEN])
912 struct netdev_dev_linux *netdev_dev =
913 netdev_dev_linux_cast(netdev_get_dev(netdev_));
916 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
917 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
918 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
920 netdev_dev->cache_valid |= VALID_ETHERADDR;
921 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
929 /* Copies 'netdev''s MAC address into 'mac', first refreshing the cached
930 * address with get_etheraddr() if the cache is not valid. */
932 netdev_linux_get_etheraddr(const struct netdev *netdev_,
933 uint8_t mac[ETH_ADDR_LEN])
935 struct netdev_dev_linux *netdev_dev =
936 netdev_dev_linux_cast(netdev_get_dev(netdev_));
937 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
938 int error = get_etheraddr(netdev_get_name(netdev_),
939 netdev_dev->etheraddr);
943 netdev_dev->cache_valid |= VALID_ETHERADDR;
945 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
949 /* Stores in '*mtup' the maximum size of transmitted (and received) packets on
950 * 'netdev', in bytes, not including the hardware header; thus, this is
951 * typically 1500 bytes for Ethernet devices. */
953 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
955 struct netdev_dev_linux *netdev_dev =
956 netdev_dev_linux_cast(netdev_get_dev(netdev_));
957 if (!(netdev_dev->cache_valid & VALID_MTU)) {
961 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
962 SIOCGIFMTU, "SIOCGIFMTU");
966 netdev_dev->mtu = ifr.ifr_mtu;
967 netdev_dev->cache_valid |= VALID_MTU;
969 *mtup = netdev_dev->mtu;
/* Looks up the ifindex of 'netdev' with get_ifindex().  On success, returns
 * the ifindex as a positive number.  On failure, returns a negative errno
 * value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
985 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
987 struct netdev_dev_linux *netdev_dev =
988 netdev_dev_linux_cast(netdev_get_dev(netdev_));
993 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
997 fn = xasprintf("/sys/class/net/%s/carrier",
998 netdev_get_name(netdev_));
999 fd = open(fn, O_RDONLY);
1002 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1006 retval = read(fd, line, sizeof line);
1009 if (error == EINVAL) {
1010 /* This is the normal return value when we try to check carrier
1011 * if the network device is not up. */
1013 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1016 } else if (retval == 0) {
1018 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
1022 if (line[0] != '0' && line[0] != '1') {
1024 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1028 netdev_dev->carrier = line[0] != '0';
1029 netdev_dev->cache_valid |= VALID_CARRIER;
1031 *carrier = netdev_dev->carrier;
1043 netdev_linux_do_miimon(const struct netdev *netdev, int cmd,
1044 const char *cmd_name, struct mii_ioctl_data *data)
1049 memset(&ifr, 0, sizeof ifr);
1050 memcpy(&ifr.ifr_data, data, sizeof *data);
1051 error = netdev_linux_do_ioctl(netdev_get_name(netdev),
1052 &ifr, cmd, cmd_name);
1053 memcpy(data, &ifr.ifr_data, sizeof *data);
1059 netdev_linux_get_miimon(const struct netdev *netdev, bool *miimon)
1061 const char *name = netdev_get_name(netdev);
1062 struct mii_ioctl_data data;
1067 memset(&data, 0, sizeof data);
1068 error = netdev_linux_do_miimon(netdev, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1070 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1071 data.reg_num = MII_BMSR;
1072 error = netdev_linux_do_miimon(netdev, SIOCGMIIREG, "SIOCGMIIREG",
1076 *miimon = !!(data.val_out & BMSR_LSTATUS);
1078 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1081 struct ethtool_cmd ecmd;
1083 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1086 memset(&ecmd, 0, sizeof ecmd);
1087 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1090 struct ethtool_value eval;
1092 memcpy(&eval, &ecmd, sizeof eval);
1093 *miimon = !!eval.data;
1095 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1102 /* Check whether we can use RTM_GETLINK to get network device statistics.
1103 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1106 check_for_working_netlink_stats(void)
1108 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1109 * preferable, so if that works, we'll use it. */
1110 int ifindex = do_get_ifindex("lo");
1112 VLOG_WARN("failed to get ifindex for lo, "
1113 "obtaining netdev stats from proc");
1116 struct netdev_stats stats;
1117 int error = get_stats_via_netlink(ifindex, &stats);
1119 VLOG_DBG("obtaining netdev stats via rtnetlink");
1122 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1123 "via proc (you are probably running a pre-2.6.19 "
1124 "kernel)", strerror(error));
1130 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1132 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1134 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1135 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1136 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1138 netdev_dev->is_tap = !strcmp(type, "tap");
1139 netdev_dev->is_internal = (!netdev_dev->is_tap
1140 && dpif_linux_is_internal_device(name));
1141 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1146 swap_uint64(uint64_t *a, uint64_t *b)
1153 /* Retrieves current device stats for 'netdev'.
 *
 * Sources, in the order the visible code consults them:
 *
 *   1. netdev_vport_get_stats(), tried while 'have_vport_stats' is set or has
 *      not been determined yet (VALID_HAVE_VPORT_STATS clear).  Whether it
 *      succeeded is cached in 'have_vport_stats'.
 *
 *   2. If vport stats are unavailable: get_stats_via_netlink() (after looking
 *      up the ifindex) or get_stats_via_proc(), selected by
 *      'use_netlink_stats'.  That flag is a function-local static which is
 *      probed once with check_for_working_netlink_stats().
 *
 * For internal and tap devices whose stats did not come from the vport layer,
 * the rx/tx packet, byte, error and dropped counters are swapped and the
 * detailed rx and tx error counters are reset to 0. */
1155 netdev_linux_get_stats(const struct netdev *netdev_,
1156 struct netdev_stats *stats)
1158 struct netdev_dev_linux *netdev_dev =
1159 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* -1 means "not probed yet"; see the '< 0' test below. */
1160 static int use_netlink_stats = -1;
1163 if (netdev_dev->have_vport_stats ||
1164 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1166 error = netdev_vport_get_stats(netdev_, stats);
1167 netdev_dev->have_vport_stats = !error;
1168 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
/* Fall back to the kernel's generic per-device statistics. */
1171 if (!netdev_dev->have_vport_stats) {
1172 if (use_netlink_stats < 0) {
1173 use_netlink_stats = check_for_working_netlink_stats();
1175 if (use_netlink_stats) {
1178 error = get_ifindex(netdev_, &ifindex);
1180 error = get_stats_via_netlink(ifindex, stats);
1183 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1187 /* If this port is an internal port then the transmit and receive stats
1188 * will appear to be swapped relative to the other ports since we are the
1189 * one sending the data, not a remote computer. For consistency, we swap
1190 * them back here. This does not apply if we are getting stats from the
1191 * vport layer because it always tracks stats from the perspective of the
1193 netdev_linux_update_is_pseudo(netdev_dev);
1194 if (!error && !netdev_dev->have_vport_stats &&
1195 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1196 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1197 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1198 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1199 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1200 stats->rx_length_errors = 0;
1201 stats->rx_over_errors = 0;
1202 stats->rx_crc_errors = 0;
1203 stats->rx_frame_errors = 0;
1204 stats->rx_fifo_errors = 0;
1205 stats->rx_missed_errors = 0;
1206 stats->tx_aborted_errors = 0;
1207 stats->tx_carrier_errors = 0;
1208 stats->tx_fifo_errors = 0;
1209 stats->tx_heartbeat_errors = 0;
1210 stats->tx_window_errors = 0;
1216 /* Stores the features supported by 'netdev' into each of '*current',
1217 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1218 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1219 * successful, otherwise a positive errno value. */
/* The data comes from a single ETHTOOL_GSET query; the ethtool SUPPORTED_*,
 * ADVERTISED_*, speed/duplex and port values are then translated one by one
 * into OFPPF_* bits.
 *
 * NOTE(review): no null checks on the output pointers are visible in this
 * excerpt even though the comment above allows null -- presumably the public
 * netdev_get_features() wrapper substitutes non-null pointers; confirm. */
1221 netdev_linux_get_features(const struct netdev *netdev,
1222 uint32_t *current, uint32_t *advertised,
1223 uint32_t *supported, uint32_t *peer)
1225 struct ethtool_cmd ecmd;
1228 memset(&ecmd, 0, sizeof ecmd);
1229 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1230 ETHTOOL_GSET, "ETHTOOL_GSET");
1235 /* Supported features. */
1237 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1238 *supported |= OFPPF_10MB_HD;
1240 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1241 *supported |= OFPPF_10MB_FD;
1243 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1244 *supported |= OFPPF_100MB_HD;
1246 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1247 *supported |= OFPPF_100MB_FD;
1249 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1250 *supported |= OFPPF_1GB_HD;
1252 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1253 *supported |= OFPPF_1GB_FD;
1255 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1256 *supported |= OFPPF_10GB_FD;
1258 if (ecmd.supported & SUPPORTED_TP) {
1259 *supported |= OFPPF_COPPER;
1261 if (ecmd.supported & SUPPORTED_FIBRE) {
1262 *supported |= OFPPF_FIBER;
1264 if (ecmd.supported & SUPPORTED_Autoneg) {
1265 *supported |= OFPPF_AUTONEG;
1267 if (ecmd.supported & SUPPORTED_Pause) {
1268 *supported |= OFPPF_PAUSE;
1270 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1271 *supported |= OFPPF_PAUSE_ASYM;
1274 /* Advertised features. */
1276 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1277 *advertised |= OFPPF_10MB_HD;
1279 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1280 *advertised |= OFPPF_10MB_FD;
1282 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1283 *advertised |= OFPPF_100MB_HD;
1285 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1286 *advertised |= OFPPF_100MB_FD;
1288 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1289 *advertised |= OFPPF_1GB_HD;
1291 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1292 *advertised |= OFPPF_1GB_FD;
1294 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1295 *advertised |= OFPPF_10GB_FD;
1297 if (ecmd.advertising & ADVERTISED_TP) {
1298 *advertised |= OFPPF_COPPER;
1300 if (ecmd.advertising & ADVERTISED_FIBRE) {
1301 *advertised |= OFPPF_FIBER;
1303 if (ecmd.advertising & ADVERTISED_Autoneg) {
1304 *advertised |= OFPPF_AUTONEG;
1306 if (ecmd.advertising & ADVERTISED_Pause) {
1307 *advertised |= OFPPF_PAUSE;
1309 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1310 *advertised |= OFPPF_PAUSE_ASYM;
1313 /* Current settings. */
/* Exactly one speed/duplex bit is chosen from 'ecmd.speed'; a nonzero
 * 'ecmd.duplex' selects the full-duplex variant.  10G is always reported as
 * full duplex. */
1314 if (ecmd.speed == SPEED_10) {
1315 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1316 } else if (ecmd.speed == SPEED_100) {
1317 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1318 } else if (ecmd.speed == SPEED_1000) {
1319 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1320 } else if (ecmd.speed == SPEED_10000) {
1321 *current = OFPPF_10GB_FD;
/* The medium bit is added on top of the speed bit chosen above. */
1326 if (ecmd.port == PORT_TP) {
1327 *current |= OFPPF_COPPER;
1328 } else if (ecmd.port == PORT_FIBRE) {
1329 *current |= OFPPF_FIBER;
/* NOTE(review): the condition guarding the next statement is not visible in
 * this excerpt; presumably it tests the device's autonegotiation setting. */
1333 *current |= OFPPF_AUTONEG;
1336 /* Peer advertisements. */
/* Peer features are not queried: the result is always 0. */
1337 *peer = 0; /* XXX */
1342 /* Set the features advertised by 'netdev' to 'advertise'.
 *
 * 'advertise' is a bitmap of OFPPF_* bits.  The current settings are read with
 * ETHTOOL_GSET, the 'advertising' mask is rebuilt from scratch out of the
 * OFPPF_* bits that have an ADVERTISED_* equivalent below, and the result is
 * written back with ETHTOOL_SSET.  Returns the result of that final ethtool
 * call. */
1344 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1346 struct ethtool_cmd ecmd;
1349 memset(&ecmd, 0, sizeof ecmd);
/* Fetch the existing settings so that only 'advertising' is changed. */
1350 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1351 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Start from an empty mask: bits absent from 'advertise' are withdrawn. */
1356 ecmd.advertising = 0;
1357 if (advertise & OFPPF_10MB_HD) {
1358 ecmd.advertising |= ADVERTISED_10baseT_Half;
1360 if (advertise & OFPPF_10MB_FD) {
1361 ecmd.advertising |= ADVERTISED_10baseT_Full;
1363 if (advertise & OFPPF_100MB_HD) {
1364 ecmd.advertising |= ADVERTISED_100baseT_Half;
1366 if (advertise & OFPPF_100MB_FD) {
1367 ecmd.advertising |= ADVERTISED_100baseT_Full;
1369 if (advertise & OFPPF_1GB_HD) {
1370 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1372 if (advertise & OFPPF_1GB_FD) {
1373 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1375 if (advertise & OFPPF_10GB_FD) {
1376 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1378 if (advertise & OFPPF_COPPER) {
1379 ecmd.advertising |= ADVERTISED_TP;
1381 if (advertise & OFPPF_FIBER) {
1382 ecmd.advertising |= ADVERTISED_FIBRE;
1384 if (advertise & OFPPF_AUTONEG) {
1385 ecmd.advertising |= ADVERTISED_Autoneg;
1387 if (advertise & OFPPF_PAUSE) {
1388 ecmd.advertising |= ADVERTISED_Pause;
1390 if (advertise & OFPPF_PAUSE_ASYM) {
1391 ecmd.advertising |= ADVERTISED_Asym_Pause;
1393 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1394 ETHTOOL_SSET, "ETHTOOL_SSET");
1397 /* If 'netdev' is the name of a VLAN network device (e.g. one created with
1398 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1399 * and returns 0. Otherwise returns an errno value (specifically ENOENT if
1400 * 'netdev' is the name of a network device that is not a VLAN device) and
1401 * sets '*vlan_vid' to -1. */
/* The VID is taken from the first line of /proc/net/vlan/<name>, which is
 * expected to look like "<word> VID: <number> ...". */
1403 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1405 const char *netdev_name = netdev_get_name(netdev);
1406 struct ds line = DS_EMPTY_INITIALIZER;
1407 FILE *stream = NULL;
1411 COVERAGE_INC(netdev_get_vlan_vid);
1412 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1413 stream = fopen(fn, "r");
/* A nonzero result from ds_get_line() means no line could be read; ferror()
 * tells a read error apart from a premature end of file. */
1419 if (ds_get_line(&line, stream)) {
1420 if (ferror(stream)) {
1422 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1425 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
/* Require exactly one converted item.  sscanf() returns EOF (-1) when the
 * input ends before the first conversion (e.g. an empty line), and testing
 * with '!' would treat that as success and leave '*vlan_vid' unset. */
1430 if (sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid) != 1) {
1432 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1433 fn, ds_cstr(&line));
/* Shell command templates used by netdev_linux_set_policing().
 *
 * POLICE_ADD_CMD takes the device name (%s) and adds an ingress qdisc with
 * handle ffff:.  POLICE_CONFIG_CMD takes the device name (%s), the rate in
 * kbit and the burst in k, and attaches a policing filter to that qdisc.
 *
 * The rate and burst arguments are uint32_t, so they are formatted with
 * PRIu32; "%d" would print values above INT_MAX as negative numbers. */
1451 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1452 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %"PRIu32"kbit burst %"PRIu32"k mtu 65535 drop flowid :1"
/* Implementation notes for netdev_linux_remove_policing() below: the qdisc is
 * deleted with an RTM_DELQDISC request for handle ffff: under TC_H_INGRESS.
 * ENOENT and EINVAL results from the kernel are not logged.  The visible code
 * then records rate 0 / burst 0 in the device's policing cache and marks it
 * valid (VALID_POLICING). */
1454 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1455 * positive errno value.
1457 * This function is equivalent to running
1458 * /sbin/tc qdisc del dev %s handle ffff: ingress
1459 * but it is much, much faster.
1462 netdev_linux_remove_policing(struct netdev *netdev)
1464 struct netdev_dev_linux *netdev_dev =
1465 netdev_dev_linux_cast(netdev_get_dev(netdev));
1466 const char *netdev_name = netdev_get_name(netdev);
1468 struct ofpbuf request;
1469 struct tcmsg *tcmsg;
1472 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1476 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1477 tcmsg->tcm_parent = TC_H_INGRESS;
1478 nl_msg_put_string(&request, TCA_KIND, "ingress");
1479 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1481 error = tc_transact(&request, NULL);
1482 if (error && error != ENOENT && error != EINVAL) {
1483 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1484 netdev_name, strerror(error));
1488 netdev_dev->kbits_rate = 0;
1489 netdev_dev->kbits_burst = 0;
1490 netdev_dev->cache_valid |= VALID_POLICING;
1494 /* Attempts to set input rate limiting (policing) policy.
 *
 * 'kbits_rate' and 'kbits_burst' are passed to tc as "rate <n>kbit" and
 * "burst <n>k".  The request is skipped when the cached rate and burst
 * (VALID_POLICING) already match.  Otherwise any existing policing is removed
 * first and the new configuration is applied by running the POLICE_ADD_CMD
 * and POLICE_CONFIG_CMD shell commands, after which the cache is updated.
 *
 * NOTE(review): the device name is interpolated into a string handed to
 * system(); if device names can come from an untrusted source this is a shell
 * injection risk and should be replaced by a netlink request or an exec with
 * an explicit argument vector. */
1496 netdev_linux_set_policing(struct netdev *netdev,
1497 uint32_t kbits_rate, uint32_t kbits_burst)
1499 struct netdev_dev_linux *netdev_dev =
1500 netdev_dev_linux_cast(netdev_get_dev(netdev));
1501 const char *netdev_name = netdev_get_name(netdev);
1504 COVERAGE_INC(netdev_set_policing);
1506 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1507 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1508 : kbits_burst); /* Stick with user-specified value. */
1510 if (netdev_dev->cache_valid & VALID_POLICING
1511 && netdev_dev->kbits_rate == kbits_rate
1512 && netdev_dev->kbits_burst == kbits_burst) {
1513 /* Assume that settings haven't changed since we last set them. */
/* Always start from a clean slate before installing the new policer. */
1517 netdev_linux_remove_policing(netdev);
1519 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1520 if (system(command) != 0) {
1521 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1525 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1526 kbits_rate, kbits_burst);
1527 if (system(command) != 0) {
1528 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
/* Remember what was configured so that an identical request is a no-op. */
1533 netdev_dev->kbits_rate = kbits_rate;
1534 netdev_dev->kbits_burst = kbits_burst;
1535 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the NULL-terminated 'tcs'
 * table that has a 'tc_install' callback and a nonempty 'ovs_name'.  'netdev'
 * is unused. */
1542 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1545 const struct tc_ops **opsp;
1547 for (opsp = tcs; *opsp != NULL; opsp++) {
1548 const struct tc_ops *ops = *opsp;
1549 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1550 sset_add(types, ops->ovs_name);
/* Searches the NULL-terminated 'tcs' table for the entry whose 'ovs_name'
 * equals 'name'.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably the match is returned, or NULL if there is none (callers test
 * the result against NULL). */
1556 static const struct tc_ops *
1557 tc_lookup_ovs_name(const char *name)
1559 const struct tc_ops **opsp;
1561 for (opsp = tcs; *opsp != NULL; opsp++) {
1562 const struct tc_ops *ops = *opsp;
1563 if (!strcmp(name, ops->ovs_name)) {
/* Searches the NULL-terminated 'tcs' table for the entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' are skipped.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably the match is returned, or NULL if there is none. */
1570 static const struct tc_ops *
1571 tc_lookup_linux_name(const char *name)
1573 const struct tc_ops **opsp;
1575 for (opsp = tcs; *opsp != NULL; opsp++) {
1576 const struct tc_ops *ops = *opsp;
1577 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks for the queue with id 'queue_id' in the 'queues' hmap of 'netdev''s
 * traffic-control state, scanning only the bucket selected by the
 * caller-supplied hash.
 *
 * NOTE(review): 'netdev_dev->tc' is dereferenced without a visible null
 * check, so callers presumably guarantee it is set (e.g. by calling
 * tc_query_qdisc() first) -- confirm. */
1584 static struct tc_queue *
1585 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1588 struct netdev_dev_linux *netdev_dev =
1589 netdev_dev_linux_cast(netdev_get_dev(netdev));
1590 struct tc_queue *queue;
1592 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1593 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the hash of
 * 'queue_id' itself, as hash_int(queue_id, 0). */
1600 static struct tc_queue *
1601 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1603 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Fills '*caps' for the QoS type named 'type': the traffic-control
 * implementation is looked up by its OVS name and its 'n_queues' is copied
 * into 'caps->n_queues'.  'netdev' is unused.
 *
 * NOTE(review): the handling of an unknown 'type' is not visible in this
 * excerpt. */
1607 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1609 struct netdev_qos_capabilities *caps)
1611 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1615 caps->n_queues = ops->n_queues;
/* Reports the QoS configuration of 'netdev'.  After tc_query_qdisc() has
 * refreshed the device's traffic-control state, '*typep' is set to the OVS
 * name of the active implementation and, if that implementation provides a
 * 'qdisc_get' callback, it is invoked on 'details'. */
1620 netdev_linux_get_qos(const struct netdev *netdev,
1621 const char **typep, struct shash *details)
1623 struct netdev_dev_linux *netdev_dev =
1624 netdev_dev_linux_cast(netdev_get_dev(netdev));
1627 error = tc_query_qdisc(netdev);
1632 *typep = netdev_dev->tc->ops->ovs_name;
1633 return (netdev_dev->tc->ops->qdisc_get
1634 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS type 'type' with 'details' on 'netdev'.
 *
 * 'type' must name an implementation that has a 'tc_install' callback.  If
 * the device already uses that implementation, only its 'qdisc_set' callback
 * (if any) is invoked.  Otherwise the existing qdisc is deleted and the new
 * one installed with 'tc_install'. */
1639 netdev_linux_set_qos(struct netdev *netdev,
1640 const char *type, const struct shash *details)
1642 struct netdev_dev_linux *netdev_dev =
1643 netdev_dev_linux_cast(netdev_get_dev(netdev));
1644 const struct tc_ops *new_ops;
1647 new_ops = tc_lookup_ovs_name(type);
1648 if (!new_ops || !new_ops->tc_install) {
1652 error = tc_query_qdisc(netdev);
/* Same implementation as before: reconfigure in place. */
1657 if (new_ops == netdev_dev->tc->ops) {
1658 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1660 /* Delete existing qdisc. */
1661 error = tc_del_qdisc(netdev);
/* A successful delete must have cleared the device's tc state. */
1665 assert(netdev_dev->tc == NULL);
1667 /* Install new qdisc. */
1668 error = new_ops->tc_install(netdev, details);
/* Invariant: 'tc' is set if and only if the install succeeded. */
1669 assert((error == 0) == (netdev_dev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev' into 'details'
 * by refreshing the traffic-control state, locating the queue and calling the
 * active implementation's 'class_get' callback.
 *
 * NOTE(review): the error paths and the condition preceding the '?' are not
 * visible in this excerpt. */
1676 netdev_linux_get_queue(const struct netdev *netdev,
1677 unsigned int queue_id, struct shash *details)
1679 struct netdev_dev_linux *netdev_dev =
1680 netdev_dev_linux_cast(netdev_get_dev(netdev));
1683 error = tc_query_qdisc(netdev);
1687 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1689 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' from 'details' through the active
 * implementation's 'class_set' callback.  The request takes a separate
 * (not visible) branch when 'queue_id' is not below the implementation's
 * 'n_queues' or when it has no 'class_set' callback. */
1695 netdev_linux_set_queue(struct netdev *netdev,
1696 unsigned int queue_id, const struct shash *details)
1698 struct netdev_dev_linux *netdev_dev =
1699 netdev_dev_linux_cast(netdev_get_dev(netdev));
1702 error = tc_query_qdisc(netdev);
1705 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1706 || !netdev_dev->tc->ops->class_set) {
1710 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev' through the active implementation's
 * 'class_delete' callback, after refreshing the traffic-control state and
 * locating the queue.
 *
 * NOTE(review): the values returned on the error branches, and the condition
 * preceding the '?', are not visible in this excerpt. */
1714 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1716 struct netdev_dev_linux *netdev_dev =
1717 netdev_dev_linux_cast(netdev_get_dev(netdev));
1720 error = tc_query_qdisc(netdev);
1723 } else if (!netdev_dev->tc->ops->class_delete) {
1726 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1728 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev' into '*stats' through
 * the active implementation's 'class_get_stats' callback, after refreshing
 * the traffic-control state and locating the queue.
 *
 * NOTE(review): the values returned on the error branches, and the condition
 * preceding the '?', are not visible in this excerpt. */
1734 netdev_linux_get_queue_stats(const struct netdev *netdev,
1735 unsigned int queue_id,
1736 struct netdev_queue_stats *stats)
1738 struct netdev_dev_linux *netdev_dev =
1739 netdev_dev_linux_cast(netdev_get_dev(netdev));
1742 error = tc_query_qdisc(netdev);
1745 } else if (!netdev_dev->tc->ops->class_get_stats) {
1748 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1750 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes on 'netdev': builds an
 * RTM_GETTCLASS request with 'tcm_parent' 0, begins the dump on 'rtnl_sock'
 * into 'dump', and releases the request buffer.
 *
 * NOTE(review): the return statements are not visible here; the caller treats
 * a zero result as failure. */
1756 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1758 struct ofpbuf request;
1759 struct tcmsg *tcmsg;
1761 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1765 tcmsg->tcm_parent = 0;
1766 nl_dump_start(dump, rtnl_sock, &request);
1767 ofpbuf_uninit(&request);
/* Calls 'cb' once per queue in 'netdev''s traffic-control queue map, passing
 * the queue id, the queue's details as reported by the active
 * implementation's 'class_get' callback, and 'aux'.
 *
 * A single 'details' shash is reused for all queues: it is cleared before
 * each 'class_get' call and destroyed after the loop. */
1772 netdev_linux_dump_queues(const struct netdev *netdev,
1773 netdev_dump_queues_cb *cb, void *aux)
1775 struct netdev_dev_linux *netdev_dev =
1776 netdev_dev_linux_cast(netdev_get_dev(netdev));
1777 struct tc_queue *queue;
1778 struct shash details;
1782 error = tc_query_qdisc(netdev);
1785 } else if (!netdev_dev->tc->ops->class_get) {
1790 shash_init(&details);
1791 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1792 shash_clear(&details);
1794 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1796 (*cb)(queue->queue_id, &details, aux);
1801 shash_destroy(&details);
/* Dumps per-queue statistics for 'netdev': starts a traffic-class dump and
 * hands every reply message to the active implementation's
 * 'class_dump_stats' callback together with 'cb' and 'aux'.
 *
 * The final result is the error from nl_dump_done() if there is one,
 * otherwise 'last_error' (presumably the most recent failure reported while
 * processing replies -- its assignment is not visible in this excerpt). */
1807 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1808 netdev_dump_queue_stats_cb *cb, void *aux)
1810 struct netdev_dev_linux *netdev_dev =
1811 netdev_dev_linux_cast(netdev_get_dev(netdev));
1812 struct nl_dump dump;
1817 error = tc_query_qdisc(netdev);
1820 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1825 if (!start_queue_dump(netdev, &dump)) {
1828 while (nl_dump_next(&dump, &msg)) {
1829 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1835 error = nl_dump_done(&dump);
1836 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' into '*address' and
 * '*netmask'.
 *
 * The values are cached in the device: while VALID_IN4 is clear they are
 * fetched with the SIOCGIFADDR and SIOCGIFNETMASK ioctls, then the flag is
 * set.  Returns EADDRNOTAVAIL if the resulting address is INADDR_ANY,
 * otherwise 0 (error returns from the ioctl path are not visible in this
 * excerpt). */
1840 netdev_linux_get_in4(const struct netdev *netdev_,
1841 struct in_addr *address, struct in_addr *netmask)
1843 struct netdev_dev_linux *netdev_dev =
1844 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1846 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1849 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1850 SIOCGIFADDR, "SIOCGIFADDR");
1855 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1856 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1861 netdev_dev->cache_valid |= VALID_IN4;
1863 *address = netdev_dev->address;
1864 *netmask = netdev_dev->netmask;
1865 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set first with SIOCSIFADDR.  The visible code then updates
 * the cached address/netmask (marking VALID_IN4) and, unless 'address' is
 * INADDR_ANY, sets the netmask with SIOCSIFNETMASK. */
1869 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1870 struct in_addr netmask)
1872 struct netdev_dev_linux *netdev_dev =
1873 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1876 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1878 netdev_dev->cache_valid |= VALID_IN4;
1879 netdev_dev->address = address;
1880 netdev_dev->netmask = netmask;
1881 if (address.s_addr != INADDR_ANY) {
1882 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1883 "SIOCSIFNETMASK", netmask);
/* Parses one line in the format used by /proc/net/if_inet6: 16 address bytes
 * written as 32 hex digits, four hex fields that are skipped, and an
 * interface name of at most 16 characters.  The address bytes are stored into
 * 'in6->s6_addr' and the name into 'ifname'.
 *
 * NOTE(review): the start and end of the scanning call are not visible in
 * this excerpt; callers use the result as a boolean "parsed successfully". */
1890 parse_if_inet6_line(const char *line,
1891 struct in6_addr *in6, char ifname[16 + 1])
1893 uint8_t *s6 = in6->s6_addr;
/* X8 scans exactly two hex digits into a uint8_t. */
1894 #define X8 "%2"SCNx8
1896 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1897 "%*x %*x %*x %*x %16s\n",
1898 &s6[0], &s6[1], &s6[2], &s6[3],
1899 &s6[4], &s6[5], &s6[6], &s6[7],
1900 &s6[8], &s6[9], &s6[10], &s6[11],
1901 &s6[12], &s6[13], &s6[14], &s6[15],
1905 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1906 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* The address is cached in the device under VALID_IN6.  On a cache miss the
 * cached value is first reset to in6addr_any and /proc/net/if_inet6 is
 * scanned line by line for an entry whose interface name equals the device
 * name. */
1908 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1910 struct netdev_dev_linux *netdev_dev =
1911 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1912 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1916 netdev_dev->in6 = in6addr_any;
1918 file = fopen("/proc/net/if_inet6", "r");
1920 const char *name = netdev_get_name(netdev_);
1921 while (fgets(line, sizeof line, file)) {
1922 struct in6_addr in6_tmp;
1923 char ifname[16 + 1];
1924 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1925 && !strcmp(name, ifname))
1927 netdev_dev->in6 = in6_tmp;
1933 netdev_dev->cache_valid |= VALID_IN6;
1935 *in6 = netdev_dev->in6;
/* Initializes '*sa' as an AF_INET socket address holding 'addr'.  A zeroed
 * 'struct sockaddr_in' is filled in locally; '*sa' is then zeroed and the
 * local structure is copied over it. */
1940 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
1942 struct sockaddr_in sin;
1943 memset(&sin, 0, sizeof sin);
1944 sin.sin_family = AF_INET;
1945 sin.sin_addr = addr;
1948 memset(sa, 0, sizeof *sa);
1949 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' on 'netdev' with 'addr' as the
 * IPv4 address in the request.  'ioctl_name' is a human-readable name for the
 * ioctl, presumably passed on to netdev_linux_do_ioctl() (the end of that
 * call is not visible in this excerpt). */
1953 do_set_addr(struct netdev *netdev,
1954 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1957 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1958 make_in4_sockaddr(&ifr.ifr_addr, addr);
1960 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1964 /* Adds 'router' as a default IP gateway.
 *
 * The route is installed with the SIOCADDRT ioctl on 'af_inet_sock' using
 * destination and genmask INADDR_ANY and flags RTF_UP | RTF_GATEWAY.  A
 * failure is logged as a warning.  'netdev' is unused. */
1966 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1968 struct in_addr any = { INADDR_ANY };
1972 memset(&rt, 0, sizeof rt);
1973 make_in4_sockaddr(&rt.rt_dst, any);
1974 make_in4_sockaddr(&rt.rt_gateway, router);
1975 make_in4_sockaddr(&rt.rt_genmask, any);
1976 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1977 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1979 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks up the route to 'host' in /proc/net/route.
 *
 * Each parsed route that is up (RTF_UP) is compared against 'host' under the
 * route's mask.  For a match, '*next_hop' is set to 0 when the host is
 * directly reachable or to the gateway address otherwise, and '*netdev_name'
 * receives an xstrdup()'d copy of the interface name (so the caller
 * presumably has to free it).  '*netdev_name' is set to NULL up front. */
1985 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1988 static const char fn[] = "/proc/net/route";
1993 *netdev_name = NULL;
1994 stream = fopen(fn, "r");
1995 if (stream == NULL) {
1996 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2001 while (fgets(line, sizeof line, stream)) {
2004 uint32_t dest, gateway, mask;
2005 int refcnt, metric, mtu;
2006 unsigned int flags, use, window, irtt;
2009 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2011 iface, &dest, &gateway, &flags, &refcnt,
2012 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2014 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2018 if (!(flags & RTF_UP)) {
2019 /* Skip routes that aren't up. */
2023 /* The values of 'dest', 'mask', and 'gateway' were given in
2024 * network byte order, so we don't need any endian
2025 * conversions here. */
2026 if ((dest & mask) == (host->s_addr & mask)) {
2028 /* The host is directly reachable. */
2029 next_hop->s_addr = 0;
2031 /* To reach the host, we must go through a gateway. */
2032 next_hop->s_addr = gateway;
2034 *netdev_name = xstrdup(iface);
/* Adds driver information for 'netdev' to 'sh'.  The data is obtained with an
 * ETHTOOL_GDRVINFO request and stored as xstrdup()'d strings under the keys
 * "driver_name", "driver_version" and "firmware_version". */
2046 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2048 struct ethtool_drvinfo drvinfo;
2051 memset(&drvinfo, 0, sizeof drvinfo);
/* netdev_linux_do_ethtool() takes a 'struct ethtool_cmd *', hence the cast. */
2052 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2053 (struct ethtool_cmd *)&drvinfo,
2055 "ETHTOOL_GDRVINFO");
2057 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2058 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2059 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2065 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2066 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2067 * returns 0. Otherwise, it returns a positive errno value; in particular,
2068 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
/* The lookup is a SIOCGARP ioctl on 'af_inet_sock'.  'ip' is stored into
 * 'sin_addr.s_addr' unchanged.  Failures other than ENXIO are logged. */
2070 netdev_linux_arp_lookup(const struct netdev *netdev,
2071 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
2074 struct sockaddr_in sin;
2077 memset(&r, 0, sizeof r);
2078 memset(&sin, 0, sizeof sin);
2079 sin.sin_family = AF_INET;
2080 sin.sin_addr.s_addr = ip;
2082 memcpy(&r.arp_pa, &sin, sizeof sin);
2083 r.arp_ha.sa_family = ARPHRD_ETHER;
2085 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2086 COVERAGE_INC(netdev_arp_lookup);
2087 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2089 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2090 } else if (retval != ENXIO) {
2091 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2092 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates NETDEV_* flags in 'nd' into a flag word for the kernel.  Only
 * NETDEV_UP and NETDEV_PROMISC are examined in the visible lines.
 *
 * NOTE(review): the assignments and the return are not visible in this
 * excerpt; presumably the matching IFF_* bits are set -- confirm. */
2098 nd_to_iff_flags(enum netdev_flags nd)
2101 if (nd & NETDEV_UP) {
2104 if (nd & NETDEV_PROMISC) {
/* Translates kernel interface flags 'iff' into NETDEV_* flags.  The visible
 * lines map IFF_PROMISC to NETDEV_PROMISC.
 *
 * NOTE(review): other mappings and the return are not visible in this
 * excerpt. */
2111 iff_to_nd_flags(int iff)
2113 enum netdev_flags nd = 0;
2117 if (iff & IFF_PROMISC) {
2118 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' on 'netdev', storing
 * the previous flags (as NETDEV_* bits) in '*old_flagsp'.
 *
 * The kernel flags are written back with set_flags() only if the computed
 * value differs from the current one.  Bits in 'on' take precedence over
 * bits in 'off' because they are OR'ed in after the mask is applied. */
2124 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2125 enum netdev_flags on, enum netdev_flags *old_flagsp)
2127 int old_flags, new_flags;
2130 error = get_flags(netdev, &old_flags);
2132 *old_flagsp = iff_to_nd_flags(old_flags);
2133 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2134 if (new_flags != old_flags) {
2135 error = set_flags(netdev, new_flags);
/* Walks every 'struct netdev_linux_notifier' on 'list' and obtains a pointer
 * to its embedded 'struct netdev_notifier'.
 *
 * NOTE(review): the statement that uses 'n' is not visible in this excerpt;
 * presumably it invokes the notifier's callback. */
2142 poll_notify(struct list *list)
2144 struct netdev_linux_notifier *notifier;
2145 LIST_FOR_EACH (notifier, node, list) {
2146 struct netdev_notifier *n = &notifier->notifier;
/* rtnetlink link-change callback.  One visible path looks up a notifier list
 * in 'netdev_linux_notifiers' (the lookup key is not visible in this
 * excerpt); another iterates over every entry in 'netdev_linux_notifiers' and
 * calls poll_notify() on its list.  'aux' is unused.
 *
 * NOTE(review): the condition selecting between the two paths is not visible;
 * presumably it depends on whether 'change' is non-null. */
2152 netdev_linux_poll_cb(const struct rtnetlink_link_change *change,
2153 void *aux OVS_UNUSED)
2156 struct list *list = shash_find_data(&netdev_linux_notifiers,
2162 struct shash_node *node;
2163 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2164 poll_notify(node->data);
/* Registers 'cb' (with 'aux') to be notified about changes to 'netdev' and
 * stores the new notifier in '*notifierp'.
 *
 * Notifiers are kept in per-device lists stored in 'netdev_linux_notifiers',
 * keyed by device name.  When that table is empty, the shared rtnetlink link
 * notifier is registered first.  A device's list is created on first use. */
2170 netdev_linux_poll_add(struct netdev *netdev,
2171 void (*cb)(struct netdev_notifier *), void *aux,
2172 struct netdev_notifier **notifierp)
2174 const char *netdev_name = netdev_get_name(netdev);
2175 struct netdev_linux_notifier *notifier;
2178 if (shash_is_empty(&netdev_linux_notifiers)) {
2180 error = rtnetlink_link_notifier_register(&netdev_linux_poll_notifier,
2181 netdev_linux_poll_cb, NULL);
2187 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2189 list = xmalloc(sizeof *list);
2191 shash_add(&netdev_linux_notifiers, netdev_name, list);
/* Allocate the notifier, append it to the device's list, and hand its
 * embedded 'struct netdev_notifier' back to the caller. */
2194 notifier = xmalloc(sizeof *notifier);
2195 netdev_notifier_init(&notifier->notifier, netdev, cb, aux);
2196 list_push_back(list, &notifier->node);
2197 *notifierp = &notifier->notifier;
/* Unregisters 'notifier_', which must have been created by
 * netdev_linux_poll_add().  The enclosing 'struct netdev_linux_notifier' is
 * recovered with CONTAINER_OF.  If its per-device list becomes empty, the
 * list's entry is deleted from 'netdev_linux_notifiers'; if that table
 * becomes empty too, the shared rtnetlink link notifier is unregistered. */
2202 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2204 struct netdev_linux_notifier *notifier =
2205 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2208 /* Remove 'notifier' from its list. */
2209 list = list_remove(&notifier->node);
2210 if (list_is_empty(list)) {
2211 /* The list is now empty. Remove it from the hash and free it. */
2212 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2213 shash_delete(&netdev_linux_notifiers,
2214 shash_find(&netdev_linux_notifiers, netdev_name));
2219 /* If that was the last notifier, unregister. */
2220 if (shash_is_empty(&netdev_linux_notifiers)) {
2221 rtnetlink_link_notifier_unregister(&netdev_linux_poll_notifier);
/* Expands to the initializer of a netdev class backed by the
 * netdev_linux_*() functions in this file.  The parameters supply the parts
 * that differ between the classes instantiated below: NAME, CREATE,
 * ENUMERATE and SET_STATS.  Every other listed slot is shared, and a NULL
 * entry marks an operation that is not provided. */
2225 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS) \
2229 netdev_linux_init, \
2231 netdev_linux_wait, \
2234 netdev_linux_destroy, \
2235 NULL, /* set_config */ \
2237 netdev_linux_open, \
2238 netdev_linux_close, \
2242 netdev_linux_recv, \
2243 netdev_linux_recv_wait, \
2244 netdev_linux_drain, \
2246 netdev_linux_send, \
2247 netdev_linux_send_wait, \
2249 netdev_linux_set_etheraddr, \
2250 netdev_linux_get_etheraddr, \
2251 netdev_linux_get_mtu, \
2252 netdev_linux_get_ifindex, \
2253 netdev_linux_get_carrier, \
2254 netdev_linux_get_miimon, \
2255 netdev_linux_get_stats, \
2258 netdev_linux_get_features, \
2259 netdev_linux_set_advertisements, \
2260 netdev_linux_get_vlan_vid, \
2262 netdev_linux_set_policing, \
2263 netdev_linux_get_qos_types, \
2264 netdev_linux_get_qos_capabilities, \
2265 netdev_linux_get_qos, \
2266 netdev_linux_set_qos, \
2267 netdev_linux_get_queue, \
2268 netdev_linux_set_queue, \
2269 netdev_linux_delete_queue, \
2270 netdev_linux_get_queue_stats, \
2271 netdev_linux_dump_queues, \
2272 netdev_linux_dump_queue_stats, \
2274 netdev_linux_get_in4, \
2275 netdev_linux_set_in4, \
2276 netdev_linux_get_in6, \
2277 netdev_linux_add_router, \
2278 netdev_linux_get_next_hop, \
2279 netdev_linux_get_status, \
2280 netdev_linux_arp_lookup, \
2282 netdev_linux_update_flags, \
2284 netdev_linux_poll_add, \
2285 netdev_linux_poll_remove \
/* Class instance created with netdev_linux_create(); the only one of the
 * three that supports enumeration.  It has no set_stats operation. */
2288 const struct netdev_class netdev_linux_class =
2291 netdev_linux_create,
2292 netdev_linux_enumerate,
2293 NULL); /* set_stats */
/* Class instance created with netdev_linux_create_tap(); no enumeration and
 * no set_stats. */
2295 const struct netdev_class netdev_tap_class =
2298 netdev_linux_create_tap,
2299 NULL, /* enumerate */
2300 NULL); /* set_stats */
/* Class instance created with netdev_linux_create(); no enumeration, and
 * set_stats is delegated to netdev_vport_set_stats(). */
2302 const struct netdev_class netdev_internal_class =
2305 netdev_linux_create,
2306 NULL, /* enumerate */
2307 netdev_vport_set_stats);
2309 /* HTB traffic control class. */
/* Limit on HTB class minor numbers: htb_parse_tcmsg__() accepts minors in the
 * range 1..HTB_N_QUEUES and maps them to queue ids by subtracting 1. */
2311 #define HTB_N_QUEUES 0xf000
/* Member of the qdisc-level HTB state (see htb_install__(), which sets
 * 'htb->max_rate'); the enclosing struct line is not visible here. */
2315 unsigned int max_rate; /* In bytes/s. */
/* Members of the per-queue HTB state ('struct htb_class', per
 * htb_class_cast__()); 'tc_queue' is the generic queue embedded in it. */
2319 struct tc_queue tc_queue;
2320 unsigned int min_rate; /* In bytes/s. */
2321 unsigned int max_rate; /* In bytes/s. */
2322 unsigned int burst; /* In bytes. */
2323 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds 'netdev''s current traffic-control
 * state ('netdev_dev->tc') as its 'tc' member.
 *
 * NOTE(review): this is only meaningful while the device's tc state really
 * belongs to an HTB instance; no check is made here. */
2327 htb_get__(const struct netdev *netdev)
2329 struct netdev_dev_linux *netdev_dev =
2330 netdev_dev_linux_cast(netdev_get_dev(netdev));
2331 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a new 'struct htb', initializes its embedded tc state with
 * 'tc_ops_htb', records 'max_rate' in it and makes it 'netdev''s current
 * traffic-control state.
 *
 * NOTE(review): 'max_rate' is a uint64_t stored into an 'unsigned int'
 * member, so values beyond the range of 'unsigned int' are truncated. */
2335 htb_install__(struct netdev *netdev, uint64_t max_rate)
2337 struct netdev_dev_linux *netdev_dev =
2338 netdev_dev_linux_cast(netdev_get_dev(netdev));
2341 htb = xmalloc(sizeof *htb);
2342 tc_init(&htb->tc, &tc_ops_htb);
2343 htb->max_rate = max_rate;
2345 netdev_dev->tc = &htb->tc;
2348 /* Create an HTB qdisc.
2350 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Any existing qdisc is deleted first (the result of that call is ignored).
 * The new qdisc is requested with RTM_NEWQDISC, handle 1:0 under TC_H_ROOT,
 * carrying a 'struct tc_htb_glob' nested inside TCA_OPTIONS as
 * TCA_HTB_INIT.  Returns the result of the Netlink transaction. */
2352 htb_setup_qdisc__(struct netdev *netdev)
2355 struct tc_htb_glob opt;
2356 struct ofpbuf request;
2357 struct tcmsg *tcmsg;
2359 tc_del_qdisc(netdev);
2361 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2362 NLM_F_EXCL | NLM_F_CREATE, &request);
2366 tcmsg->tcm_handle = tc_make_handle(1, 0);
2367 tcmsg->tcm_parent = TC_H_ROOT;
2369 nl_msg_put_string(&request, TCA_KIND, "htb");
2371 memset(&opt, 0, sizeof opt);
2372 opt.rate2quantum = 10;
2376 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2377 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2378 nl_msg_end_nested(&request, opt_offset);
2380 return tc_transact(&request, NULL);
2383 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2384 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* 'handle' and 'parent' are tc handles (major:minor); 'class' supplies the
 * rates, burst and priority.  The device MTU is needed to fill in the rate
 * specifications and buffer sizes; an MTU of INT_MAX is treated as "device
 * has no MTU" and reported with a warning.  A failed Netlink transaction is
 * logged together with all the parameters. */
2386 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2387 unsigned int parent, struct htb_class *class)
2390 struct tc_htb_opt opt;
2391 struct ofpbuf request;
2392 struct tcmsg *tcmsg;
2396 netdev_get_mtu(netdev, &mtu);
2397 if (mtu == INT_MAX) {
2398 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2399 netdev_get_name(netdev));
/* 'rate' is built from min_rate and 'ceil' from max_rate; the same burst is
 * used to size both the rate buffer and the ceil buffer. */
2403 memset(&opt, 0, sizeof opt);
2404 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2405 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2406 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2407 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2408 opt.prio = class->priority;
2410 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2414 tcmsg->tcm_handle = handle;
2415 tcmsg->tcm_parent = parent;
2417 nl_msg_put_string(&request, TCA_KIND, "htb");
2418 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2419 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2420 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2421 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2422 nl_msg_end_nested(&request, opt_offset);
2424 error = tc_transact(&request, NULL);
2426 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2427 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2428 netdev_get_name(netdev),
2429 tc_get_major(handle), tc_get_minor(handle),
2430 tc_get_major(parent), tc_get_minor(parent),
2431 class->min_rate, class->max_rate,
2432 class->burst, class->priority, strerror(error));
/* Implementation notes for htb_parse_tca_options__() below: the nested
 * attributes are validated against a policy that requires TCA_HTB_PARMS to be
 * present and at least sizeof(struct tc_htb_opt) long.  'class' is then
 * filled from that structure: min_rate from rate.rate, max_rate from
 * ceil.rate, burst by converting 'buffer' from ticks to bytes at rate.rate,
 * and priority from prio.  A parse failure is logged as a warning. */
2437 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2438 * description of them into 'details'. The description complies with the
2439 * specification given in the vswitch database documentation for linux-htb
2442 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2444 static const struct nl_policy tca_htb_policy[] = {
2445 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2446 .min_len = sizeof(struct tc_htb_opt) },
2449 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2450 const struct tc_htb_opt *htb;
2452 if (!nl_parse_nested(nl_options, tca_htb_policy,
2453 attrs, ARRAY_SIZE(tca_htb_policy))) {
2454 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2458 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2459 class->min_rate = htb->rate.rate;
2460 class->max_rate = htb->ceil.rate;
2461 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2462 class->priority = htb->prio;
/* Parses the traffic-class message 'tcmsg'.
 *
 * tc_parse_class() extracts the class handle, its nested options and, via
 * 'stats', its statistics.  If 'queue_id' is nonnull, the handle must have
 * major number 1 and a minor number in 1..HTB_N_QUEUES; the queue id is the
 * minor number minus 1.  If 'options' is nonnull, the nested options are
 * parsed into it with htb_parse_tca_options__().
 *
 * NOTE(review): the handling of an out-of-range handle is not visible in this
 * excerpt. */
2467 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2468 struct htb_class *options,
2469 struct netdev_queue_stats *stats)
2471 struct nlattr *nl_options;
2472 unsigned int handle;
2475 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2476 if (!error && queue_id) {
2477 unsigned int major = tc_get_major(handle);
2478 unsigned int minor = tc_get_minor(handle);
2479 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2480 *queue_id = minor - 1;
2485 if (!error && options) {
2486 error = htb_parse_tca_options__(nl_options, options);
/* Fills in 'hc' for the qdisc-level HTB class from 'details'.
 *
 * "max-rate" is parsed as a decimal number and divided by 8 (the struct
 * members are documented as bytes/s, so the setting is presumably in bit/s).
 * If it is absent or yields 0, the rate is derived from the device's current
 * link features instead.  'min_rate' is set equal to 'max_rate'. */
2492 htb_parse_qdisc_details__(struct netdev *netdev,
2493 const struct shash *details, struct htb_class *hc)
2495 const char *max_rate_s;
2497 max_rate_s = shash_find_data(details, "max-rate");
2498 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2499 if (!hc->max_rate) {
/* Only the current features are requested; the other outputs are unused. */
2502 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2503 hc->max_rate = netdev_features_to_bps(current) / 8;
2505 hc->min_rate = hc->max_rate;
/* Fills in 'hc' for one HTB queue from the "min-rate", "max-rate", "burst"
 * and "priority" entries of 'details'.
 *
 * Rates and burst are parsed as decimal numbers and divided by 8 (presumably
 * converting bits to bytes, matching the byte units documented on the struct
 * members).  The values are then clamped: min_rate to the range
 * [mtu, qdisc max_rate], max_rate to [min_rate, qdisc max_rate], and burst to
 * at least mtu + 64.  An MTU of INT_MAX is treated as "device has no MTU"
 * and reported with a warning. */
2511 htb_parse_class_details__(struct netdev *netdev,
2512 const struct shash *details, struct htb_class *hc)
2514 const struct htb *htb = htb_get__(netdev);
2515 const char *min_rate_s = shash_find_data(details, "min-rate");
2516 const char *max_rate_s = shash_find_data(details, "max-rate");
2517 const char *burst_s = shash_find_data(details, "burst");
2518 const char *priority_s = shash_find_data(details, "priority");
2521 netdev_get_mtu(netdev, &mtu);
2522 if (mtu == INT_MAX) {
2523 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2524 netdev_get_name(netdev));
2528 /* HTB requires at least an mtu sized min-rate to send any traffic even
2529 * on uncongested links. */
2530 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2531 hc->min_rate = MAX(hc->min_rate, mtu);
2532 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
/* NOTE(review): the fallback used when "max-rate" is absent is not visible
 * in this excerpt. */
2535 hc->max_rate = (max_rate_s
2536 ? strtoull(max_rate_s, NULL, 10) / 8
2538 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2539 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2543 * According to hints in the documentation that I've read, it is important
2544 * that 'burst' be at least as big as the largest frame that might be
2545 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2546 * but having it a bit too small is a problem. Since netdev_get_mtu()
2547 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2548 * the MTU. We actually add 64, instead of 14, as a guard against
2549 * additional headers get tacked on somewhere that we're not aware of. */
2550 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2551 hc->burst = MAX(hc->burst, mtu + 64);
/* Priority defaults to 0 when the setting is absent. */
2554 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about the HTB class 'handle' (with parent 'parent') on
 * 'netdev' and decodes the reply into '*options' and '*stats', either of
 * which may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2576 htb_tc_install(struct netdev *netdev, const struct shash *details)
2580 error = htb_setup_qdisc__(netdev);
2582 struct htb_class hc;
2584 htb_parse_qdisc_details__(netdev, details, &hc);
2585 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2586 tc_make_handle(1, 0), &hc);
2588 htb_install__(netdev, hc.max_rate);
2594 static struct htb_class *
2595 htb_class_cast__(const struct tc_queue *queue)
2597 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2601 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2602 const struct htb_class *hc)
2604 struct htb *htb = htb_get__(netdev);
2605 size_t hash = hash_int(queue_id, 0);
2606 struct tc_queue *queue;
2607 struct htb_class *hcp;
2609 queue = tc_find_queue__(netdev, queue_id, hash);
2611 hcp = htb_class_cast__(queue);
2613 hcp = xmalloc(sizeof *hcp);
2614 queue = &hcp->tc_queue;
2615 queue->queue_id = queue_id;
2616 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2619 hcp->min_rate = hc->min_rate;
2620 hcp->max_rate = hc->max_rate;
2621 hcp->burst = hc->burst;
2622 hcp->priority = hc->priority;
2626 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2629 struct nl_dump dump;
2630 struct htb_class hc;
2632 /* Get qdisc options. */
2634 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2635 htb_install__(netdev, hc.max_rate);
2638 if (!start_queue_dump(netdev, &dump)) {
2641 while (nl_dump_next(&dump, &msg)) {
2642 unsigned int queue_id;
2644 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2645 htb_update_queue__(netdev, queue_id, &hc);
2648 nl_dump_done(&dump);
2654 htb_tc_destroy(struct tc *tc)
2656 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2657 struct htb_class *hc, *next;
2659 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2660 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2668 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2670 const struct htb *htb = htb_get__(netdev);
2671 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2676 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2678 struct htb_class hc;
2681 htb_parse_qdisc_details__(netdev, details, &hc);
2682 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2683 tc_make_handle(1, 0), &hc);
2685 htb_get__(netdev)->max_rate = hc.max_rate;
2691 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2692 const struct tc_queue *queue, struct shash *details)
2694 const struct htb_class *hc = htb_class_cast__(queue);
2696 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2697 if (hc->min_rate != hc->max_rate) {
2698 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2700 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2702 shash_add(details, "priority", xasprintf("%u", hc->priority));
2708 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2709 const struct shash *details)
2711 struct htb_class hc;
2714 error = htb_parse_class_details__(netdev, details, &hc);
2719 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2720 tc_make_handle(1, 0xfffe), &hc);
2725 htb_update_queue__(netdev, queue_id, &hc);
2730 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2732 struct htb_class *hc = htb_class_cast__(queue);
2733 struct htb *htb = htb_get__(netdev);
2736 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2738 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2745 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2746 struct netdev_queue_stats *stats)
2748 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2749 tc_make_handle(1, 0xfffe), NULL, stats);
2753 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2754 const struct ofpbuf *nlmsg,
2755 netdev_dump_queue_stats_cb *cb, void *aux)
2757 struct netdev_queue_stats stats;
2758 unsigned int handle, major, minor;
2761 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2766 major = tc_get_major(handle);
2767 minor = tc_get_minor(handle);
2768 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2769 (*cb)(minor - 1, &stats, aux);
2774 static const struct tc_ops tc_ops_htb = {
2775 "htb", /* linux_name */
2776 "linux-htb", /* ovs_name */
2777 HTB_N_QUEUES, /* n_queues */
2786 htb_class_get_stats,
2787 htb_class_dump_stats
2790 /* "linux-hfsc" traffic control class. */
2792 #define HFSC_N_QUEUES 0xf000
2800 struct tc_queue tc_queue;
2805 static struct hfsc *
2806 hfsc_get__(const struct netdev *netdev)
2808 struct netdev_dev_linux *netdev_dev;
2809 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2810 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2813 static struct hfsc_class *
2814 hfsc_class_cast__(const struct tc_queue *queue)
2816 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2820 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2822 struct netdev_dev_linux * netdev_dev;
2825 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2826 hfsc = xmalloc(sizeof *hfsc);
2827 tc_init(&hfsc->tc, &tc_ops_hfsc);
2828 hfsc->max_rate = max_rate;
2829 netdev_dev->tc = &hfsc->tc;
2833 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2834 const struct hfsc_class *hc)
2838 struct hfsc_class *hcp;
2839 struct tc_queue *queue;
2841 hfsc = hfsc_get__(netdev);
2842 hash = hash_int(queue_id, 0);
2844 queue = tc_find_queue__(netdev, queue_id, hash);
2846 hcp = hfsc_class_cast__(queue);
2848 hcp = xmalloc(sizeof *hcp);
2849 queue = &hcp->tc_queue;
2850 queue->queue_id = queue_id;
2851 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2854 hcp->min_rate = hc->min_rate;
2855 hcp->max_rate = hc->max_rate;
2859 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2861 const struct tc_service_curve *rsc, *fsc, *usc;
2862 static const struct nl_policy tca_hfsc_policy[] = {
2864 .type = NL_A_UNSPEC,
2866 .min_len = sizeof(struct tc_service_curve),
2869 .type = NL_A_UNSPEC,
2871 .min_len = sizeof(struct tc_service_curve),
2874 .type = NL_A_UNSPEC,
2876 .min_len = sizeof(struct tc_service_curve),
2879 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2881 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2882 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2883 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2887 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2888 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2889 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2891 if (rsc->m1 != 0 || rsc->d != 0 ||
2892 fsc->m1 != 0 || fsc->d != 0 ||
2893 usc->m1 != 0 || usc->d != 0) {
2894 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2895 "Non-linear service curves are not supported.");
2899 if (rsc->m2 != fsc->m2) {
2900 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2901 "Real-time service curves are not supported ");
2905 if (rsc->m2 > usc->m2) {
2906 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2907 "Min-rate service curve is greater than "
2908 "the max-rate service curve.");
2912 class->min_rate = fsc->m2;
2913 class->max_rate = usc->m2;
2918 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2919 struct hfsc_class *options,
2920 struct netdev_queue_stats *stats)
2923 unsigned int handle;
2924 struct nlattr *nl_options;
2926 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2932 unsigned int major, minor;
2934 major = tc_get_major(handle);
2935 minor = tc_get_minor(handle);
2936 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2937 *queue_id = minor - 1;
2944 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about the HFSC class 'handle' (with parent 'parent') on
 * 'netdev' and decodes the reply into '*options' and '*stats', either of
 * which may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
2969 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2970 struct hfsc_class *class)
2973 const char *max_rate_s;
2975 max_rate_s = shash_find_data(details, "max-rate");
2976 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2981 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2982 max_rate = netdev_features_to_bps(current) / 8;
2985 class->min_rate = max_rate;
2986 class->max_rate = max_rate;
2990 hfsc_parse_class_details__(struct netdev *netdev,
2991 const struct shash *details,
2992 struct hfsc_class * class)
2994 const struct hfsc *hfsc;
2995 uint32_t min_rate, max_rate;
2996 const char *min_rate_s, *max_rate_s;
2998 hfsc = hfsc_get__(netdev);
2999 min_rate_s = shash_find_data(details, "min-rate");
3000 max_rate_s = shash_find_data(details, "max-rate");
3002 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3003 min_rate = MAX(min_rate, 1);
3004 min_rate = MIN(min_rate, hfsc->max_rate);
3006 max_rate = (max_rate_s
3007 ? strtoull(max_rate_s, NULL, 10) / 8
3009 max_rate = MAX(max_rate, min_rate);
3010 max_rate = MIN(max_rate, hfsc->max_rate);
3012 class->min_rate = min_rate;
3013 class->max_rate = max_rate;
3018 /* Create an HFSC qdisc.
3020 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3022 hfsc_setup_qdisc__(struct netdev * netdev)
3024 struct tcmsg *tcmsg;
3025 struct ofpbuf request;
3026 struct tc_hfsc_qopt opt;
3028 tc_del_qdisc(netdev);
3030 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3031 NLM_F_EXCL | NLM_F_CREATE, &request);
3037 tcmsg->tcm_handle = tc_make_handle(1, 0);
3038 tcmsg->tcm_parent = TC_H_ROOT;
3040 memset(&opt, 0, sizeof opt);
3043 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3044 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3046 return tc_transact(&request, NULL);
3049 /* Create an HFSC class.
3051 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3052 * sc rate <min_rate> ul rate <max_rate>" */
3054 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3055 unsigned int parent, struct hfsc_class *class)
3059 struct tcmsg *tcmsg;
3060 struct ofpbuf request;
3061 struct tc_service_curve min, max;
3063 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3069 tcmsg->tcm_handle = handle;
3070 tcmsg->tcm_parent = parent;
3074 min.m2 = class->min_rate;
3078 max.m2 = class->max_rate;
3080 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3081 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3082 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3083 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3084 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3085 nl_msg_end_nested(&request, opt_offset);
3087 error = tc_transact(&request, NULL);
3089 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3090 "min-rate %ubps, max-rate %ubps (%s)",
3091 netdev_get_name(netdev),
3092 tc_get_major(handle), tc_get_minor(handle),
3093 tc_get_major(parent), tc_get_minor(parent),
3094 class->min_rate, class->max_rate, strerror(error));
3101 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3104 struct hfsc_class class;
3106 error = hfsc_setup_qdisc__(netdev);
3112 hfsc_parse_qdisc_details__(netdev, details, &class);
3113 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3114 tc_make_handle(1, 0), &class);
3120 hfsc_install__(netdev, class.max_rate);
3125 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3128 struct nl_dump dump;
3129 struct hfsc_class hc;
3132 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3133 hfsc_install__(netdev, hc.max_rate);
3135 if (!start_queue_dump(netdev, &dump)) {
3139 while (nl_dump_next(&dump, &msg)) {
3140 unsigned int queue_id;
3142 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3143 hfsc_update_queue__(netdev, queue_id, &hc);
3147 nl_dump_done(&dump);
3152 hfsc_tc_destroy(struct tc *tc)
3155 struct hfsc_class *hc, *next;
3157 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3159 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3160 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3169 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3171 const struct hfsc *hfsc;
3172 hfsc = hfsc_get__(netdev);
3173 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3178 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3181 struct hfsc_class class;
3183 hfsc_parse_qdisc_details__(netdev, details, &class);
3184 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3185 tc_make_handle(1, 0), &class);
3188 hfsc_get__(netdev)->max_rate = class.max_rate;
3195 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3196 const struct tc_queue *queue, struct shash *details)
3198 const struct hfsc_class *hc;
3200 hc = hfsc_class_cast__(queue);
3201 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3202 if (hc->min_rate != hc->max_rate) {
3203 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3209 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3210 const struct shash *details)
3213 struct hfsc_class class;
3215 error = hfsc_parse_class_details__(netdev, details, &class);
3220 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3221 tc_make_handle(1, 0xfffe), &class);
3226 hfsc_update_queue__(netdev, queue_id, &class);
3231 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3235 struct hfsc_class *hc;
3237 hc = hfsc_class_cast__(queue);
3238 hfsc = hfsc_get__(netdev);
3240 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3242 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3249 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3250 struct netdev_queue_stats *stats)
3252 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3253 tc_make_handle(1, 0xfffe), NULL, stats);
3257 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3258 const struct ofpbuf *nlmsg,
3259 netdev_dump_queue_stats_cb *cb, void *aux)
3261 struct netdev_queue_stats stats;
3262 unsigned int handle, major, minor;
3265 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3270 major = tc_get_major(handle);
3271 minor = tc_get_minor(handle);
3272 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3273 (*cb)(minor - 1, &stats, aux);
3278 static const struct tc_ops tc_ops_hfsc = {
3279 "hfsc", /* linux_name */
3280 "linux-hfsc", /* ovs_name */
3281 HFSC_N_QUEUES, /* n_queues */
3282 hfsc_tc_install, /* tc_install */
3283 hfsc_tc_load, /* tc_load */
3284 hfsc_tc_destroy, /* tc_destroy */
3285 hfsc_qdisc_get, /* qdisc_get */
3286 hfsc_qdisc_set, /* qdisc_set */
3287 hfsc_class_get, /* class_get */
3288 hfsc_class_set, /* class_set */
3289 hfsc_class_delete, /* class_delete */
3290 hfsc_class_get_stats, /* class_get_stats */
3291 hfsc_class_dump_stats /* class_dump_stats */
3294 /* "linux-default" traffic control class.
3296 * This class represents the default, unnamed Linux qdisc. It corresponds to
3297 * the "" (empty string) QoS type in the OVS database. */
3300 default_install__(struct netdev *netdev)
3302 struct netdev_dev_linux *netdev_dev =
3303 netdev_dev_linux_cast(netdev_get_dev(netdev));
3304 static struct tc *tc;
3307 tc = xmalloc(sizeof *tc);
3308 tc_init(tc, &tc_ops_default);
3310 netdev_dev->tc = tc;
3314 default_tc_install(struct netdev *netdev,
3315 const struct shash *details OVS_UNUSED)
3317 default_install__(netdev);
3322 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3324 default_install__(netdev);
3328 static const struct tc_ops tc_ops_default = {
3329 NULL, /* linux_name */
3334 NULL, /* tc_destroy */
3335 NULL, /* qdisc_get */
3336 NULL, /* qdisc_set */
3337 NULL, /* class_get */
3338 NULL, /* class_set */
3339 NULL, /* class_delete */
3340 NULL, /* class_get_stats */
3341 NULL /* class_dump_stats */
3344 /* "linux-other" traffic control class.
3349 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3351 struct netdev_dev_linux *netdev_dev =
3352 netdev_dev_linux_cast(netdev_get_dev(netdev));
3353 static struct tc *tc;
3356 tc = xmalloc(sizeof *tc);
3357 tc_init(tc, &tc_ops_other);
3359 netdev_dev->tc = tc;
3363 static const struct tc_ops tc_ops_other = {
3364 NULL, /* linux_name */
3365 "linux-other", /* ovs_name */
3367 NULL, /* tc_install */
3369 NULL, /* tc_destroy */
3370 NULL, /* qdisc_get */
3371 NULL, /* qdisc_set */
3372 NULL, /* class_get */
3373 NULL, /* class_set */
3374 NULL, /* class_delete */
3375 NULL, /* class_get_stats */
3376 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second.  Set by read_psched(). */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 *
 * A value of 0 means that read_psched() has not run yet. */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
/* Returns the major number from 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Returns the minor number from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3424 static struct tcmsg *
3425 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3426 struct ofpbuf *request)
3428 struct tcmsg *tcmsg;
3432 error = get_ifindex(netdev, &ifindex);
3437 ofpbuf_init(request, 512);
3438 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3439 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3440 tcmsg->tcm_family = AF_UNSPEC;
3441 tcmsg->tcm_ifindex = ifindex;
3442 /* Caller should fill in tcmsg->tcm_handle. */
3443 /* Caller should fill in tcmsg->tcm_parent. */
3449 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3451 int error = nl_sock_transact(rtnl_sock, request, replyp);
3452 ofpbuf_uninit(request);
3459 /* The values in psched are not individually very meaningful, but they are
3460 * important. The tables below show some values seen in the wild.
3464 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3465 * (Before that, there are hints that it was 1000000000.)
3467 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3471 * -----------------------------------
3472 * [1] 000c8000 000f4240 000f4240 00000064
3473 * [2] 000003e8 00000400 000f4240 3b9aca00
3474 * [3] 000003e8 00000400 000f4240 3b9aca00
3475 * [4] 000003e8 00000400 000f4240 00000064
3476 * [5] 000003e8 00000040 000f4240 3b9aca00
3477 * [6] 000003e8 00000040 000f4240 000000f9
3479 * a b c d ticks_per_s buffer_hz
3480 * ------- --------- ---------- ------------- ----------- -------------
3481 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3482 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3483 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3484 * [4] 1,000 1,024 1,000,000 100 976,562 100
3485 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3486 * [6] 1,000 64 1,000,000 249 15,625,000 249
3488 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3489 * [2] 2.6.26-1-686-bigmem from Debian lenny
3490 * [3] 2.6.26-2-sparc64 from Debian lenny
3491 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3492 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3493 * [6] 2.6.34 from kernel.org on KVM
3495 static const char fn[] = "/proc/net/psched";
3496 unsigned int a, b, c, d;
3502 stream = fopen(fn, "r");
3504 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3508 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3509 VLOG_WARN("%s: read failed", fn);
3513 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3517 VLOG_WARN("%s: invalid scheduler parameters", fn);
3521 ticks_per_s = (double) a * c / b;
3525 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3528 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3531 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3532 * rate of 'rate' bytes per second. */
3534 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3539 return (rate * ticks) / ticks_per_s;
3542 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3543 * rate of 'rate' bytes per second. */
3545 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3550 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3553 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3554 * a transmission rate of 'rate' bytes per second. */
3556 tc_buffer_per_jiffy(unsigned int rate)
3561 return rate / buffer_hz;
3564 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3565 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3566 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3567 * stores NULL into it if it is absent.
3569 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3572 * Returns 0 if successful, otherwise a positive errno value. */
3574 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3575 struct nlattr **options)
3577 static const struct nl_policy tca_policy[] = {
3578 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3579 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3581 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3583 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3584 tca_policy, ta, ARRAY_SIZE(ta))) {
3585 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3590 *kind = nl_attr_get_string(ta[TCA_KIND]);
3594 *options = ta[TCA_OPTIONS];
3609 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3610 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3611 * into '*options', and its queue statistics into '*stats'. Any of the output
3612 * arguments may be null.
3614 * Returns 0 if successful, otherwise a positive errno value. */
3616 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3617 struct nlattr **options, struct netdev_queue_stats *stats)
3619 static const struct nl_policy tca_policy[] = {
3620 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3621 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3623 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3625 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3626 tca_policy, ta, ARRAY_SIZE(ta))) {
3627 VLOG_WARN_RL(&rl, "failed to parse class message");
3632 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3633 *handlep = tc->tcm_handle;
3637 *options = ta[TCA_OPTIONS];
3641 const struct gnet_stats_queue *gsq;
3642 struct gnet_stats_basic gsb;
3644 static const struct nl_policy stats_policy[] = {
3645 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3646 .min_len = sizeof gsb },
3647 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3648 .min_len = sizeof *gsq },
3650 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3652 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3653 sa, ARRAY_SIZE(sa))) {
3654 VLOG_WARN_RL(&rl, "failed to parse class stats");
3658 /* Alignment issues screw up the length of struct gnet_stats_basic on
3659 * some arch/bitsize combinations. Newer versions of Linux have a
3660 * struct gnet_stats_basic_packed, but we can't depend on that. The
3661 * easiest thing to do is just to make a copy. */
3662 memset(&gsb, 0, sizeof gsb);
3663 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3664 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3665 stats->tx_bytes = gsb.bytes;
3666 stats->tx_packets = gsb.packets;
3668 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3669 stats->tx_errors = gsq->drops;
3679 memset(stats, 0, sizeof *stats);
3684 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3687 tc_query_class(const struct netdev *netdev,
3688 unsigned int handle, unsigned int parent,
3689 struct ofpbuf **replyp)
3691 struct ofpbuf request;
3692 struct tcmsg *tcmsg;
3695 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3699 tcmsg->tcm_handle = handle;
3700 tcmsg->tcm_parent = parent;
3702 error = tc_transact(&request, replyp);
3704 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3705 netdev_get_name(netdev),
3706 tc_get_major(handle), tc_get_minor(handle),
3707 tc_get_major(parent), tc_get_minor(parent),
3713 /* Equivalent to "tc class del dev <name> handle <handle>". */
3715 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3717 struct ofpbuf request;
3718 struct tcmsg *tcmsg;
3721 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3725 tcmsg->tcm_handle = handle;
3726 tcmsg->tcm_parent = 0;
3728 error = tc_transact(&request, NULL);
3730 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3731 netdev_get_name(netdev),
3732 tc_get_major(handle), tc_get_minor(handle),
3738 /* Equivalent to "tc qdisc del dev <name> root". */
3740 tc_del_qdisc(struct netdev *netdev)
3742 struct netdev_dev_linux *netdev_dev =
3743 netdev_dev_linux_cast(netdev_get_dev(netdev));
3744 struct ofpbuf request;
3745 struct tcmsg *tcmsg;
3748 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3752 tcmsg->tcm_handle = tc_make_handle(1, 0);
3753 tcmsg->tcm_parent = TC_H_ROOT;
3755 error = tc_transact(&request, NULL);
3756 if (error == EINVAL) {
3757 /* EINVAL probably means that the default qdisc was in use, in which
3758 * case we've accomplished our purpose. */
3761 if (!error && netdev_dev->tc) {
3762 if (netdev_dev->tc->ops->tc_destroy) {
3763 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3765 netdev_dev->tc = NULL;
3770 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3771 * kernel to determine what they are. Returns 0 if successful, otherwise a
3772 * positive errno value. */
3774 tc_query_qdisc(const struct netdev *netdev)
3776 struct netdev_dev_linux *netdev_dev =
3777 netdev_dev_linux_cast(netdev_get_dev(netdev));
3778 struct ofpbuf request, *qdisc;
3779 const struct tc_ops *ops;
3780 struct tcmsg *tcmsg;
3784 if (netdev_dev->tc) {
3788 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3789 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3790 * 2.6.35 without that fix backported to it.
3792 * To avoid the OOPS, we must not make a request that would attempt to dump
3793 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3794 * few others. There are a few ways that I can see to do this, but most of
3795 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3796 * technique chosen here is to assume that any non-default qdisc that we
3797 * create will have a class with handle 1:0. The built-in qdiscs only have
3798 * a class with handle 0:0.
3800 * We could check for Linux 2.6.35+ and use a more straightforward method
3802 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3806 tcmsg->tcm_handle = tc_make_handle(1, 0);
3807 tcmsg->tcm_parent = 0;
3809 /* Figure out what tc class to instantiate. */
3810 error = tc_transact(&request, &qdisc);
3814 error = tc_parse_qdisc(qdisc, &kind, NULL);
3816 ops = &tc_ops_other;
3818 ops = tc_lookup_linux_name(kind);
3820 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3821 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3823 ops = &tc_ops_other;
3826 } else if (error == ENOENT) {
3827 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3828 * other entity that doesn't have a handle 1:0. We will assume
3829 * that it's the system default qdisc. */
3830 ops = &tc_ops_default;
3833 /* Who knows? Maybe the device got deleted. */
3834 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3835 netdev_get_name(netdev), strerror(error));
3836 ops = &tc_ops_other;
3839 /* Instantiate it. */
3840 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3841 assert((load_error == 0) == (netdev_dev->tc != NULL));
3842 ofpbuf_delete(qdisc);
3844 return error ? error : load_error;
3847 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3848 approximate the time to transmit packets of various lengths. For an MTU of
3849 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3850 represents two possible packet lengths; for a MTU of 513 through 1024, four
3851 possible lengths; and so on.
3853 Returns, for the specified 'mtu', the number of bits that packet lengths
3854 need to be shifted right to fit within such a 256-entry table. */
3856 tc_calc_cell_log(unsigned int mtu)
3861 mtu = ETH_PAYLOAD_MAX;
3863 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3865 for (cell_log = 0; mtu >= 256; cell_log++) {
3872 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3875 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3877 memset(rate, 0, sizeof *rate);
3878 rate->cell_log = tc_calc_cell_log(mtu);
3879 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3880 /* rate->cell_align = 0; */ /* distro headers. */
3881 rate->mpu = ETH_TOTAL_MIN;
3885 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3886 * attribute of the specified "type".
3888 * See tc_calc_cell_log() above for a description of "rtab"s. */
3890 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3895 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3896 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3897 unsigned packet_size = (i + 1) << rate->cell_log;
3898 if (packet_size < rate->mpu) {
3899 packet_size = rate->mpu;
3901 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst is never allowed to fall below one jiffy's worth of data at 'Bps'
 * plus one MTU; the result is expressed in ticks. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > min_burst ? burst_bytes : min_burst;

    return tc_bytes_to_ticks(Bps, burst);
}
3917 /* Utility functions. */
/* Asks the kernel, with an RTM_GETLINK request sent on 'rtnl_sock', about the
 * link whose interface index is 'ifindex', and copies the counters found in
 * the IFLA_STATS attribute of the reply into the structure that 'stats'
 * points to.
 *
 * NOTE(review): the return statements are not visible in this excerpt, so the
 * exact error codes for the failure paths below must be confirmed against the
 * full file. */
3920 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3922 /* Policy for RTNLGRP_LINK messages.
3924 * There are *many* more fields in these messages, but currently we only
3925 * care about these fields. */
3926 static const struct nl_policy rtnlgrp_link_policy[] = {
3927 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3928 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3929 .min_len = sizeof(struct rtnl_link_stats) },
3932 struct ofpbuf request;
3933 struct ofpbuf *reply;
3934 struct ifinfomsg *ifi;
3935 const struct rtnl_link_stats *rtnl_stats;
3936 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a Netlink header followed by a zeroed ifinfomsg that
 * selects the link by index. */
3939 ofpbuf_init(&request, 0);
3940 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3941 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3942 ifi->ifi_family = PF_UNSPEC;
3943 ifi->ifi_index = ifindex;
/* Perform the transaction.  The request buffer is released right after it,
 * before 'error' is examined. */
3944 error = nl_sock_transact(rtnl_sock, &request, &reply);
3945 ofpbuf_uninit(&request);
/* Parse the attributes that follow the Netlink and ifinfomsg headers.  The
 * reply is freed here when it does not satisfy the policy. */
3950 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3951 rtnlgrp_link_policy,
3952 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3953 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so a reply can parse successfully
 * and still lack it; that case is logged and the reply freed. */
3957 if (!attrs[IFLA_STATS]) {
3958 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3959 ofpbuf_delete(reply);
/* Copy the counters one by one.  'rtnl_stats' points into 'reply', so the
 * reply is only deleted after the last field has been read. */
3963 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3964 stats->rx_packets = rtnl_stats->rx_packets;
3965 stats->tx_packets = rtnl_stats->tx_packets;
3966 stats->rx_bytes = rtnl_stats->rx_bytes;
3967 stats->tx_bytes = rtnl_stats->tx_bytes;
3968 stats->rx_errors = rtnl_stats->rx_errors;
3969 stats->tx_errors = rtnl_stats->tx_errors;
3970 stats->rx_dropped = rtnl_stats->rx_dropped;
3971 stats->tx_dropped = rtnl_stats->tx_dropped;
3972 stats->multicast = rtnl_stats->multicast;
3973 stats->collisions = rtnl_stats->collisions;
3974 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3975 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3976 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3977 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3978 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3979 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3980 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3981 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3982 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3983 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3984 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3986 ofpbuf_delete(reply);
/* Reads counters for the device named 'netdev_name' from the text file named
 * by 'fn' and stores them in the structure that 'stats' points to.  The file
 * is read one line at a time with fgets(); the line whose device name equals
 * 'netdev_name' supplies the values.
 *
 * NOTE(review): the return statements and the cleanup of 'stream' are not
 * visible in this excerpt.  If the open-failure path returns 'errno' after
 * logging, keep in mind that the logging call could have changed it; saving
 * 'errno' in a local before logging would avoid that -- verify. */
3992 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3994 static const char fn[] = "/proc/net/dev";
3999 stream = fopen(fn, "r");
4001 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4006 while (fgets(line, sizeof line, stream)) {
/* Shorthand for the scanf conversion of one uint64_t counter. */
4009 #define X64 "%"SCNu64
/* Two groups of seven stored counters, each group followed by one field that
 * is parsed but discarded (the "%*u" conversion). */
4012 X64 X64 X64 X64 X64 X64 X64 "%*u"
4013 X64 X64 X64 X64 X64 X64 X64 "%*u",
4019 &stats->rx_fifo_errors,
4020 &stats->rx_frame_errors,
4026 &stats->tx_fifo_errors,
/* 15 successful conversions are required.  Fourteen of them are the X64
 * counters above; the remaining one is presumably the device name read into
 * 'devname' (its conversion is not visible here) -- confirm. */
4028 &stats->tx_carrier_errors) != 15) {
4029 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4030 } else if (!strcmp(devname, netdev_name)) {
/* These counters are assigned UINT64_MAX instead of a parsed value;
 * presumably that marks them as unavailable from this source -- confirm. */
4031 stats->rx_length_errors = UINT64_MAX;
4032 stats->rx_over_errors = UINT64_MAX;
4033 stats->rx_crc_errors = UINT64_MAX;
4034 stats->rx_missed_errors = UINT64_MAX;
4035 stats->tx_aborted_errors = UINT64_MAX;
4036 stats->tx_heartbeat_errors = UINT64_MAX;
4037 stats->tx_window_errors = UINT64_MAX;
/* Presumably reached only when no line matched 'netdev_name' (the end of the
 * loop is not visible here) -- confirm. */
4043 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Fetches the interface flags of 'netdev' with a SIOCGIFFLAGS request issued
 * through netdev_linux_do_ioctl() and stores ifr.ifr_flags in the int that
 * 'flags' points to.
 *
 * NOTE(review): the remaining arguments of the ioctl call and the return
 * statement are not visible in this excerpt, so it cannot be told from here
 * whether the store happens when the ioctl fails -- confirm. */
4049 get_flags(const struct netdev *netdev, int *flags)
4054 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4056 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with a SIOCSIFFLAGS request
 * issued through netdev_linux_do_ioctl(), and returns that helper's result.
 * The flags word is replaced as a whole; nothing here reads the current
 * value first. */
4061 set_flags(struct netdev *netdev, int flags)
4065 ifr.ifr_flags = flags;
4066 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with a
 * SIOCGIFINDEX ioctl on 'af_inet_sock' and returns ifr.ifr_ifindex.  A failed
 * ioctl is logged (rate-limited).
 *
 * NOTE(review): the value returned on failure is not visible in this excerpt;
 * presumably it is derived from 'errno' in a form that callers can tell apart
 * from a valid index -- confirm. */
4071 do_get_ifindex(const char *netdev_name)
/* Only the name member of 'ifr' is filled in before the request. */
4075 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4076 COVERAGE_INC(netdev_get_ifindex);
4077 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4078 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4079 netdev_name, strerror(errno));
4082 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in the int that 'ifindexp' points
 * to.  The index is cached in the underlying netdev_dev_linux: do_get_ifindex()
 * is called only while the VALID_IFINDEX bit of 'cache_valid' is clear. */
4086 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4088 struct netdev_dev_linux *netdev_dev =
4089 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4091 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4092 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* NOTE(review): the lines between the lookup and this point are not visible;
 * presumably a failed lookup returns early so that a bad value is never
 * cached -- confirm. */
4096 netdev_dev->cache_valid |= VALID_IFINDEX;
4097 netdev_dev->ifindex = ifindex;
4099 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with a
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies its first ETH_ADDR_LEN
 * bytes into 'ea'.
 *
 * NOTE(review): the return statements are not visible in this excerpt.  Both
 * log calls here (VLOG_ERR, VLOG_WARN) are not rate-limited, unlike the
 * VLOG_WARN_RL calls used by neighbouring helpers; consider whether that is
 * intended. */
4104 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4109 memset(&ifr, 0, sizeof ifr);
4110 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4111 COVERAGE_INC(netdev_get_hwaddr);
4112 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4113 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4114 netdev_name, strerror(errno));
/* An address family other than AF_UNSPEC or ARPHRD_ETHER is reported with a
 * warning.  As far as the visible lines show, the address bytes are still
 * copied afterwards -- confirm. */
4117 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4118 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4119 VLOG_WARN("%s device has unknown hardware address family %d",
4120 netdev_name, hwaddr_family);
4122 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac', tagged with address family 'hwaddr_family',
 * using a SIOCSIFHWADDR ioctl on 'af_inet_sock'.  A failed ioctl is logged
 * with VLOG_ERR (not rate-limited).
 *
 * NOTE(review): the return statements are not visible in this excerpt. */
4127 set_etheraddr(const char *netdev_name, int hwaddr_family,
4128 const uint8_t mac[ETH_ADDR_LEN])
/* Start from an all-zero request so that every member not set below is
 * zero. */
4132 memset(&ifr, 0, sizeof ifr);
4133 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4134 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4135 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4136 COVERAGE_INC(netdev_set_hwaddr);
4137 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4138 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4139 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for the device called 'name',
 * passing 'ecmd' as the request data.  'cmd_name' is used only in the log
 * message.  A failure whose 'errno' is EOPNOTSUPP is deliberately not logged;
 * any other failure is logged (rate-limited).
 *
 * NOTE(review): no use of 'cmd' and no return statement are visible in this
 * excerpt; presumably 'cmd' is stored into the structure that 'ecmd' points
 * to before the ioctl -- confirm against the full file. */
4146 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4147 int cmd, const char *cmd_name)
4151 memset(&ifr, 0, sizeof ifr);
4152 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* The ifreq carries only a pointer; the kernel exchanges the ethtool data
 * through the caller-owned 'ecmd' structure. */
4153 ifr.ifr_data = (caddr_t) ecmd;
4156 COVERAGE_INC(netdev_ethtool);
4157 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4160 if (errno != EOPNOTSUPP) {
4161 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4162 "failed: %s", cmd_name, name, strerror(errno));
4164 /* The device doesn't support this operation. That's pretty
4165 * common, so there's no point in logging anything. */
/* Copies 'name' into ifr->ifr_name and issues ioctl 'cmd' on 'af_inet_sock'
 * with 'ifr' as its argument.  The caller fills in any other members of
 * 'ifr' beforehand; this helper writes only the name.  A failure (ioctl
 * returning -1) is logged at debug level, rate-limited, with 'cmd_name'
 * identifying the request.
 *
 * NOTE(review): the return statements are not visible in this excerpt. */
4172 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4173 const char *cmd_name)
4175 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4176 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4177 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4185 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4186 int cmd, const char *cmd_name)
4191 ifr.ifr_addr.sa_family = AF_INET;
4192 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4194 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4195 *ip = sin->sin_addr;