2 * Copyright (c) 2009, 2010 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/mii.h>
29 #include <linux/pkt_sched.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/sockios.h>
32 #include <linux/version.h>
33 #include <sys/types.h>
34 #include <sys/ioctl.h>
35 #include <sys/socket.h>
36 #include <netpacket/packet.h>
37 #include <net/ethernet.h>
39 #include <linux/if_tunnel.h>
40 #include <net/if_arp.h>
41 #include <net/if_packet.h>
42 #include <net/route.h>
43 #include <netinet/in.h>
50 #include "dynamic-string.h"
51 #include "fatal-signal.h"
54 #include "netdev-provider.h"
55 #include "netdev-vport.h"
57 #include "netlink-socket.h"
59 #include "openflow/openflow.h"
61 #include "poll-loop.h"
62 #include "rtnetlink.h"
63 #include "rtnetlink-link.h"
64 #include "socket-util.h"
69 VLOG_DEFINE_THIS_MODULE(netdev_linux);
71 COVERAGE_DEFINE(netdev_get_vlan_vid);
72 COVERAGE_DEFINE(netdev_set_policing);
73 COVERAGE_DEFINE(netdev_arp_lookup);
74 COVERAGE_DEFINE(netdev_get_ifindex);
75 COVERAGE_DEFINE(netdev_get_hwaddr);
76 COVERAGE_DEFINE(netdev_set_hwaddr);
77 COVERAGE_DEFINE(netdev_ethtool);
79 /* These were introduced in Linux 2.6.14, so they might be missing if we have
81 #ifndef ADVERTISED_Pause
82 #define ADVERTISED_Pause (1 << 13)
84 #ifndef ADVERTISED_Asym_Pause
85 #define ADVERTISED_Asym_Pause (1 << 14)
88 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
91 #define TC_RTAB_SIZE 1024
94 static struct rtnetlink_notifier netdev_linux_cache_notifier;
95 static int cache_notifier_refcount;
98 VALID_IFINDEX = 1 << 0,
99 VALID_ETHERADDR = 1 << 1,
103 VALID_CARRIER = 1 << 5,
104 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
105 VALID_POLICING = 1 << 7,
106 VALID_HAVE_VPORT_STATS = 1 << 8
114 /* Traffic control. */
116 /* An instance of a traffic control class. Always associated with a particular
119 * Each TC implementation subclasses this with whatever additional data it
122 const struct tc_ops *ops;
123 struct hmap queues; /* Contains "struct tc_queue"s.
124 * Read by generic TC layer.
125 * Written only by TC implementation. */
128 /* One traffic control queue.
130 * Each TC implementation subclasses this with whatever additional data it
133 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
134 unsigned int queue_id; /* OpenFlow queue ID. */
137 /* A particular kind of traffic control. Each implementation generally maps to
138 * one particular Linux qdisc class.
140 * The functions below return 0 if successful or a positive errno value on
141 * failure, except where otherwise noted. All of them must be provided, except
142 * where otherwise noted. */
144 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
145 * This is null for tc_ops_default and tc_ops_other, for which there are no
146 * appropriate values. */
147 const char *linux_name;
149 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
150 const char *ovs_name;
152 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
153 * queues. The queues are numbered 0 through n_queues - 1. */
154 unsigned int n_queues;
156 /* Called to install this TC class on 'netdev'. The implementation should
157 * make the Netlink calls required to set up 'netdev' with the right qdisc
158 * and configure it according to 'details'. The implementation may assume
159 * that the current qdisc is the default; that is, there is no need for it
160 * to delete the current qdisc before installing itself.
162 * The contents of 'details' should be documented as valid for 'ovs_name'
163 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
164 * (which is built as ovs-vswitchd.conf.db(8)).
166 * This function must return 0 if and only if it sets 'netdev->tc' to an
167 * initialized 'struct tc'.
169 * (This function is null for tc_ops_other, which cannot be installed. For
170 * other TC classes it should always be nonnull.) */
171 int (*tc_install)(struct netdev *netdev, const struct shash *details);
173 /* Called when the netdev code determines (through a Netlink query) that
174 * this TC class's qdisc is installed on 'netdev', but we didn't install
175 * it ourselves and so don't know any of the details.
177 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
178 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
179 * implementation should parse the other attributes of 'nlmsg' as
180 * necessary to determine its configuration. If necessary it should also
181 * use Netlink queries to determine the configuration of queues on
184 * This function must return 0 if and only if it sets 'netdev->tc' to an
185 * initialized 'struct tc'. */
186 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
188 /* Destroys the data structures allocated by the implementation as part of
189 * 'tc'. (This includes destroying 'tc->queues' by calling
192 * The implementation should not need to perform any Netlink calls. If
193 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
194 * (But it may not be desirable.)
196 * This function may be null if 'tc' is trivial. */
197 void (*tc_destroy)(struct tc *tc);
199 /* Retrieves details of 'netdev->tc' configuration into 'details'.
201 * The implementation should not need to perform any Netlink calls, because
202 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
203 * cached the configuration.
205 * The contents of 'details' should be documented as valid for 'ovs_name'
206 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
207 * (which is built as ovs-vswitchd.conf.db(8)).
209 * This function may be null if 'tc' is not configurable.
211 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
213 /* Reconfigures 'netdev->tc' according to 'details', performing any
214 * required Netlink calls to complete the reconfiguration.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
220 * This function may be null if 'tc' is not configurable.
222 int (*qdisc_set)(struct netdev *, const struct shash *details);
224 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
225 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "Queue" table in
229 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
231 * The implementation should not need to perform any Netlink calls, because
232 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
233 * cached the queue configuration.
235 * This function may be null if 'tc' does not have queues ('n_queues' is
237 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
238 struct shash *details);
240 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
241 * 'details', performing any required Netlink calls to complete the
242 * reconfiguration. The caller ensures that 'queue_id' is less than
245 * The contents of 'details' should be documented as valid for 'ovs_name'
246 * in the "other_config" column in the "Queue" table in
247 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
249 * This function may be null if 'tc' does not have queues or its queues are
250 * not configurable. */
251 int (*class_set)(struct netdev *, unsigned int queue_id,
252 const struct shash *details);
254 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
255 * tc_queue's within 'netdev->tc->queues'.
257 * This function may be null if 'tc' does not have queues or its queues
258 * cannot be deleted. */
259 int (*class_delete)(struct netdev *, struct tc_queue *queue);
261 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
262 * 'struct tc_queue's within 'netdev->tc->queues'.
264 * On success, initializes '*stats'.
266 * This function may be null if 'tc' does not have queues or if it cannot
267 * report queue statistics. */
268 int (*class_get_stats)(const struct netdev *netdev,
269 const struct tc_queue *queue,
270 struct netdev_queue_stats *stats);
272 /* Extracts queue stats from 'nlmsg', which is a response to a
273 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
275 * This function may be null if 'tc' does not have queues or if it cannot
276 * report queue statistics. */
277 int (*class_dump_stats)(const struct netdev *netdev,
278 const struct ofpbuf *nlmsg,
279 netdev_dump_queue_stats_cb *cb, void *aux);
283 tc_init(struct tc *tc, const struct tc_ops *ops)
286 hmap_init(&tc->queues);
290 tc_destroy(struct tc *tc)
292 hmap_destroy(&tc->queues);
295 static const struct tc_ops tc_ops_htb;
296 static const struct tc_ops tc_ops_hfsc;
297 static const struct tc_ops tc_ops_default;
298 static const struct tc_ops tc_ops_other;
300 static const struct tc_ops *tcs[] = {
301 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
302 &tc_ops_hfsc, /* Hierarchical fair service curve. */
303 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
304 &tc_ops_other, /* Some other qdisc. */
308 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
309 static unsigned int tc_get_major(unsigned int handle);
310 static unsigned int tc_get_minor(unsigned int handle);
312 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
313 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
314 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
316 static struct tcmsg *tc_make_request(const struct netdev *, int type,
317 unsigned int flags, struct ofpbuf *);
318 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
320 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
321 struct nlattr **options);
322 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
323 struct nlattr **options,
324 struct netdev_queue_stats *);
325 static int tc_query_class(const struct netdev *,
326 unsigned int handle, unsigned int parent,
327 struct ofpbuf **replyp);
328 static int tc_delete_class(const struct netdev *, unsigned int handle);
330 static int tc_del_qdisc(struct netdev *netdev);
331 static int tc_query_qdisc(const struct netdev *netdev);
333 static int tc_calc_cell_log(unsigned int mtu);
334 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
335 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
336 const struct tc_ratespec *rate);
337 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
339 struct netdev_dev_linux {
340 struct netdev_dev netdev_dev;
342 struct shash_node *shash_node;
343 unsigned int cache_valid;
345 /* The following are figured out "on demand" only. They are only valid
346 * when the corresponding VALID_* bit in 'cache_valid' is set. */
348 uint8_t etheraddr[ETH_ADDR_LEN];
349 struct in_addr address, netmask;
353 bool is_internal; /* Is this an openvswitch internal device? */
354 bool is_tap; /* Is this a tuntap device? */
355 uint32_t kbits_rate; /* Policing data. */
356 uint32_t kbits_burst;
357 bool have_vport_stats;
361 struct tap_state tap;
365 struct netdev_linux {
366 struct netdev netdev;
370 /* An AF_INET socket (used for ioctl operations). */
371 static int af_inet_sock = -1;
373 /* A Netlink routing socket that is not subscribed to any multicast groups. */
374 static struct nl_sock *rtnl_sock;
376 struct netdev_linux_notifier {
377 struct netdev_notifier notifier;
381 static struct shash netdev_linux_notifiers =
382 SHASH_INITIALIZER(&netdev_linux_notifiers);
383 static struct rtnetlink_notifier netdev_linux_poll_notifier;
385 /* This is set pretty low because we probably won't learn anything from the
386 * additional log messages. */
387 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
389 static int netdev_linux_init(void);
391 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
392 int cmd, const char *cmd_name);
393 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
394 const char *cmd_name);
395 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
396 int cmd, const char *cmd_name);
397 static int get_flags(const struct netdev *, int *flagsp);
398 static int set_flags(struct netdev *, int flags);
399 static int do_get_ifindex(const char *netdev_name);
400 static int get_ifindex(const struct netdev *, int *ifindexp);
401 static int do_set_addr(struct netdev *netdev,
402 int ioctl_nr, const char *ioctl_name,
403 struct in_addr addr);
404 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
405 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
406 const uint8_t[ETH_ADDR_LEN]);
407 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
408 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
411 is_netdev_linux_class(const struct netdev_class *netdev_class)
413 return netdev_class->init == netdev_linux_init;
416 static struct netdev_dev_linux *
417 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
419 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
420 assert(is_netdev_linux_class(netdev_class));
422 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
425 static struct netdev_linux *
426 netdev_linux_cast(const struct netdev *netdev)
428 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
429 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
430 assert(is_netdev_linux_class(netdev_class));
432 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
436 netdev_linux_init(void)
438 static int status = -1;
440 /* Create AF_INET socket. */
441 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
442 status = af_inet_sock >= 0 ? 0 : errno;
444 VLOG_ERR("failed to create inet socket: %s", strerror(status));
447 /* Create rtnetlink socket. */
449 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
451 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
460 netdev_linux_run(void)
462 rtnetlink_link_notifier_run();
466 netdev_linux_wait(void)
468 rtnetlink_link_notifier_wait();
472 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
473 void *aux OVS_UNUSED)
475 struct netdev_dev_linux *dev;
477 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
479 const struct netdev_class *netdev_class =
480 netdev_dev_get_class(base_dev);
482 if (is_netdev_linux_class(netdev_class)) {
483 dev = netdev_dev_linux_cast(base_dev);
484 dev->cache_valid = 0;
488 struct shash device_shash;
489 struct shash_node *node;
491 shash_init(&device_shash);
492 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
493 SHASH_FOR_EACH (node, &device_shash) {
495 dev->cache_valid = 0;
497 shash_destroy(&device_shash);
501 /* Creates system and internal devices. */
503 netdev_linux_create(const struct netdev_class *class,
504 const char *name, const struct shash *args,
505 struct netdev_dev **netdev_devp)
507 struct netdev_dev_linux *netdev_dev;
510 if (!shash_is_empty(args)) {
511 VLOG_WARN("%s: arguments for %s devices should be empty",
515 if (!cache_notifier_refcount) {
516 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
517 netdev_linux_cache_cb, NULL);
522 cache_notifier_refcount++;
524 netdev_dev = xzalloc(sizeof *netdev_dev);
525 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
527 *netdev_devp = &netdev_dev->netdev_dev;
531 /* For most types of netdevs we open the device for each call of
532 * netdev_open(). However, this is not the case with tap devices,
533 * since it is only possible to open the device once. In this
534 * situation we share a single file descriptor, and consequently
535 * buffers, across all readers. Therefore once data is read it will
536 * be unavailable to other reads for tap devices. */
538 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
539 const char *name, const struct shash *args,
540 struct netdev_dev **netdev_devp)
542 struct netdev_dev_linux *netdev_dev;
543 struct tap_state *state;
544 static const char tap_dev[] = "/dev/net/tun";
548 if (!shash_is_empty(args)) {
549 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
552 netdev_dev = xzalloc(sizeof *netdev_dev);
553 state = &netdev_dev->state.tap;
555 /* Open tap device. */
556 state->fd = open(tap_dev, O_RDWR);
559 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
563 /* Create tap device. */
564 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
565 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
566 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
567 VLOG_WARN("%s: creating tap device failed: %s", name,
573 /* Make non-blocking. */
574 error = set_nonblocking(state->fd);
579 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
580 *netdev_devp = &netdev_dev->netdev_dev;
589 destroy_tap(struct netdev_dev_linux *netdev_dev)
591 struct tap_state *state = &netdev_dev->state.tap;
593 if (state->fd >= 0) {
598 /* Destroys the netdev device 'netdev_dev_'. */
600 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
602 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
603 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
605 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
606 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
609 if (class == &netdev_linux_class || class == &netdev_internal_class) {
610 cache_notifier_refcount--;
612 if (!cache_notifier_refcount) {
613 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
615 } else if (class == &netdev_tap_class) {
616 destroy_tap(netdev_dev);
625 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
626 struct netdev **netdevp)
628 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
629 struct netdev_linux *netdev;
630 enum netdev_flags flags;
633 /* Allocate network device. */
634 netdev = xzalloc(sizeof *netdev);
636 netdev_init(&netdev->netdev, netdev_dev_);
638 /* Verify that the device really exists, by attempting to read its flags.
639 * (The flags might be cached, in which case this won't actually do an
642 * Don't do this for "internal" netdevs, though, because those have to be
643 * created as netdev objects before they exist in the kernel, because
644 * creating them in the kernel happens by passing a netdev object to
645 * dpif_port_add(). */
646 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
647 error = netdev_get_flags(&netdev->netdev, &flags);
648 if (error == ENODEV) {
653 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
654 !netdev_dev->state.tap.opened) {
656 /* We assume that the first user of the tap device is the primary user
657 * and give them the tap FD. Subsequent users probably just expect
658 * this to be a system device so open it normally to avoid send/receive
659 * directions appearing to be reversed. */
660 netdev->fd = netdev_dev->state.tap.fd;
661 netdev_dev->state.tap.opened = true;
662 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
663 struct sockaddr_ll sll;
667 /* Create file descriptor. */
668 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
669 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
671 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
672 if (netdev->fd < 0) {
677 /* Set non-blocking mode. */
678 error = set_nonblocking(netdev->fd);
683 /* Get ethernet device index. */
684 error = get_ifindex(&netdev->netdev, &ifindex);
689 /* Bind to specific ethernet device. */
690 memset(&sll, 0, sizeof sll);
691 sll.sll_family = AF_PACKET;
692 sll.sll_ifindex = ifindex;
694 (struct sockaddr *) &sll, sizeof sll) < 0) {
696 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
701 /* Between the socket() and bind() calls above, the socket receives all
702 * packets of the requested type on all system interfaces. We do not
703 * want to receive that data, but there is no way to avoid it. So we
704 * must now drain out the receive queue. */
705 error = drain_rcvbuf(netdev->fd);
711 *netdevp = &netdev->netdev;
715 netdev_uninit(&netdev->netdev, true);
719 /* Closes and destroys 'netdev'. Any valid fd (0 included) is closed here, except for a tap device's fd. */
721 netdev_linux_close(struct netdev *netdev_)
723 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
725 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
731 /* Initializes 'svec' with a list of the names of all known network devices. */
733 netdev_linux_enumerate(struct svec *svec)
735 struct if_nameindex *names;
737 names = if_nameindex();
741 for (i = 0; names[i].if_name != NULL; i++) {
742 svec_add(svec, names[i].if_name);
744 if_freenameindex(names);
747 VLOG_WARN("could not obtain list of network device names: %s",
754 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
756 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
758 if (netdev->fd < 0) {
759 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
764 ssize_t retval = read(netdev->fd, data, size);
767 } else if (errno != EINTR) {
768 if (errno != EAGAIN) { /* Arguments below follow the format: device name, then error text. */
769 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
770 netdev_get_name(netdev_), strerror(errno));
777 /* Registers with the poll loop to wake up from the next call to poll_block()
778 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
780 netdev_linux_recv_wait(struct netdev *netdev_)
782 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
783 if (netdev->fd >= 0) {
784 poll_fd_wait(netdev->fd, POLLIN);
788 /* Discards all packets waiting to be received from 'netdev'. */
790 netdev_linux_drain(struct netdev *netdev_)
792 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
793 if (netdev->fd < 0) {
795 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
797 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
798 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
802 drain_fd(netdev->fd, ifr.ifr_qlen);
805 return drain_rcvbuf(netdev->fd);
809 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
810 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
811 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
812 * the packet is too big or too small to transmit on the device.
814 * The caller retains ownership of 'buffer' in all cases.
816 * The kernel maintains a packet transmission queue, so the caller is not
817 * expected to do additional queuing of packets. */
819 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
821 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
823 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
825 if (netdev->fd < 0) {
830 ssize_t retval = write(netdev->fd, data, size);
832 /* The Linux AF_PACKET implementation never blocks waiting for room
833 * for packets, instead returning ENOBUFS. Translate this into
834 * EAGAIN for the caller. */
835 if (errno == ENOBUFS) {
837 } else if (errno == EINTR) {
839 } else if (errno != EAGAIN) {
840 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
841 netdev_get_name(netdev_), strerror(errno));
844 } else if (retval != size) {
845 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
846 "%zu) on %s", retval, size, netdev_get_name(netdev_));
854 /* Registers with the poll loop to wake up from the next call to poll_block()
855 * when the packet transmission queue has sufficient room to transmit a packet
856 * with netdev_send().
858 * The kernel maintains a packet transmission queue, so the client is not
859 * expected to do additional queuing of packets. Thus, this function is
860 * unlikely to ever be used. It is included for completeness. */
862 netdev_linux_send_wait(struct netdev *netdev_)
864 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
865 if (netdev->fd < 0) {
867 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
868 poll_fd_wait(netdev->fd, POLLOUT);
870 /* TAP device always accepts packets.*/
871 poll_immediate_wake();
875 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
876 * otherwise a positive errno value. */
878 netdev_linux_set_etheraddr(struct netdev *netdev_,
879 const uint8_t mac[ETH_ADDR_LEN])
881 struct netdev_dev_linux *netdev_dev =
882 netdev_dev_linux_cast(netdev_get_dev(netdev_));
885 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
886 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
887 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
889 netdev_dev->cache_valid |= VALID_ETHERADDR;
890 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
898 /* Copies 'netdev''s MAC address into the caller-supplied buffer 'mac', calling
899 * get_etheraddr() first if the address is not already cached. */
901 netdev_linux_get_etheraddr(const struct netdev *netdev_,
902 uint8_t mac[ETH_ADDR_LEN])
904 struct netdev_dev_linux *netdev_dev =
905 netdev_dev_linux_cast(netdev_get_dev(netdev_));
906 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
907 int error = get_etheraddr(netdev_get_name(netdev_),
908 netdev_dev->etheraddr);
912 netdev_dev->cache_valid |= VALID_ETHERADDR;
914 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
918 /* Stores in '*mtup' the maximum size of transmitted (and received) packets on
919 * 'netdev', in bytes, not including the hardware header; thus, this is typically
920 * 1500 bytes for Ethernet devices. */
922 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
924 struct netdev_dev_linux *netdev_dev =
925 netdev_dev_linux_cast(netdev_get_dev(netdev_));
926 if (!(netdev_dev->cache_valid & VALID_MTU)) {
930 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
931 SIOCGIFMTU, "SIOCGIFMTU");
935 netdev_dev->mtu = ifr.ifr_mtu;
936 netdev_dev->cache_valid |= VALID_MTU;
938 *mtup = netdev_dev->mtu;
942 /* Returns the ifindex of 'netdev', if successful, as a positive number.
943 * On failure, returns a negative errno value. */
945 netdev_linux_get_ifindex(const struct netdev *netdev)
949 error = get_ifindex(netdev, &ifindex);
950 return error ? -error : ifindex;
954 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
956 struct netdev_dev_linux *netdev_dev =
957 netdev_dev_linux_cast(netdev_get_dev(netdev_));
962 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
966 fn = xasprintf("/sys/class/net/%s/carrier",
967 netdev_get_name(netdev_));
968 fd = open(fn, O_RDONLY);
971 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
975 retval = read(fd, line, sizeof line);
978 if (error == EINVAL) {
979 /* This is the normal return value when we try to check carrier
980 * if the network device is not up. */
982 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
985 } else if (retval == 0) {
987 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
991 if (line[0] != '0' && line[0] != '1') {
993 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
997 netdev_dev->carrier = line[0] != '0';
998 netdev_dev->cache_valid |= VALID_CARRIER;
1000 *carrier = netdev_dev->carrier;
1012 netdev_linux_get_miimon(const struct netdev *netdev_, bool *miimon)
1016 const char *name = netdev_get_name(netdev_);
1019 memset(&ifr, 0, sizeof ifr);
1021 error = netdev_linux_do_ioctl(name, &ifr, SIOCGMIIPHY, "SIOCGMIIPHY");
1023 struct mii_ioctl_data *data = (struct mii_ioctl_data *)&ifr.ifr_data;
1025 /* data->phy_id is filled out by previous SIOCGMIIPHY ioctl call. */
1026 data->reg_num = MII_BMSR;
1027 error = netdev_linux_do_ioctl(name, &ifr, SIOCGMIIREG, "SIOCGMIIREG");
1030 *miimon = !!(data->val_out & BMSR_LSTATUS);
1032 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1035 struct ethtool_cmd ecmd;
1036 struct ethtool_value *eval = (struct ethtool_value *) &ecmd;
1038 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1041 memset(&ecmd, 0, sizeof ecmd);
1042 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1045 *miimon = !!eval->data;
1047 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1054 /* Check whether we can use RTM_GETLINK to get network device statistics.
1055 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1058 check_for_working_netlink_stats(void)
1060 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1061 * preferable, so if that works, we'll use it. */
1062 int ifindex = do_get_ifindex("lo");
1064 VLOG_WARN("failed to get ifindex for lo, "
1065 "obtaining netdev stats from proc");
1068 struct netdev_stats stats;
1069 int error = get_stats_via_netlink(ifindex, &stats);
1071 VLOG_DBG("obtaining netdev stats via rtnetlink");
1074 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1075 "via proc (you are probably running a pre-2.6.19 "
1076 "kernel)", strerror(error));
1082 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1084 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1086 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1087 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1088 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1090 netdev_dev->is_tap = !strcmp(type, "tap");
1091 netdev_dev->is_internal = false;
1092 if (!netdev_dev->is_tap) {
1093 struct ethtool_drvinfo drvinfo;
1096 memset(&drvinfo, 0, sizeof drvinfo);
1097 error = netdev_linux_do_ethtool(name,
1098 (struct ethtool_cmd *)&drvinfo,
1100 "ETHTOOL_GDRVINFO");
1102 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1103 netdev_dev->is_internal = true;
1107 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1112 swap_uint64(uint64_t *a, uint64_t *b)
1119 /* Retrieves current device stats for 'netdev'. */
1121 netdev_linux_get_stats(const struct netdev *netdev_,
1122 struct netdev_stats *stats)
1124 struct netdev_dev_linux *netdev_dev =
1125 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1126 static int use_netlink_stats = -1;
1129 if (netdev_dev->have_vport_stats ||
1130 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1132 error = netdev_vport_get_stats(netdev_, stats);
1133 netdev_dev->have_vport_stats = !error;
1134 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1137 if (!netdev_dev->have_vport_stats) {
1138 if (use_netlink_stats < 0) {
1139 use_netlink_stats = check_for_working_netlink_stats();
1141 if (use_netlink_stats) {
1144 error = get_ifindex(netdev_, &ifindex);
1146 error = get_stats_via_netlink(ifindex, stats);
1149 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1153 /* If this port is an internal port then the transmit and receive stats
1154 * will appear to be swapped relative to the other ports since we are the
1155 * one sending the data, not a remote computer. For consistency, we swap
1156 * them back here. This does not apply if we are getting stats from the
1157 * vport layer because it always tracks stats from the perspective of the
1159 netdev_linux_update_is_pseudo(netdev_dev);
1160 if (!error && !netdev_dev->have_vport_stats &&
1161 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1162 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1163 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1164 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1165 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1166 stats->rx_length_errors = 0;
1167 stats->rx_over_errors = 0;
1168 stats->rx_crc_errors = 0;
1169 stats->rx_frame_errors = 0;
1170 stats->rx_fifo_errors = 0;
1171 stats->rx_missed_errors = 0;
1172 stats->tx_aborted_errors = 0;
1173 stats->tx_carrier_errors = 0;
1174 stats->tx_fifo_errors = 0;
1175 stats->tx_heartbeat_errors = 0;
1176 stats->tx_window_errors = 0;
1182 /* Stores the features supported by 'netdev' into each of '*current',
1183 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1184 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1185 * successful, otherwise a positive errno value. */
1187 netdev_linux_get_features(struct netdev *netdev,
1188 uint32_t *current, uint32_t *advertised,
1189 uint32_t *supported, uint32_t *peer)
1191 struct ethtool_cmd ecmd;
1194 memset(&ecmd, 0, sizeof ecmd);
1195 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1196 ETHTOOL_GSET, "ETHTOOL_GSET");
1201 /* Supported features. */
1203 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1204 *supported |= OFPPF_10MB_HD;
1206 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1207 *supported |= OFPPF_10MB_FD;
1209 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1210 *supported |= OFPPF_100MB_HD;
1212 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1213 *supported |= OFPPF_100MB_FD;
1215 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1216 *supported |= OFPPF_1GB_HD;
1218 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1219 *supported |= OFPPF_1GB_FD;
1221 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1222 *supported |= OFPPF_10GB_FD;
1224 if (ecmd.supported & SUPPORTED_TP) {
1225 *supported |= OFPPF_COPPER;
1227 if (ecmd.supported & SUPPORTED_FIBRE) {
1228 *supported |= OFPPF_FIBER;
1230 if (ecmd.supported & SUPPORTED_Autoneg) {
1231 *supported |= OFPPF_AUTONEG;
1233 if (ecmd.supported & SUPPORTED_Pause) {
1234 *supported |= OFPPF_PAUSE;
1236 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1237 *supported |= OFPPF_PAUSE_ASYM;
1240 /* Advertised features. */
1242 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1243 *advertised |= OFPPF_10MB_HD;
1245 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1246 *advertised |= OFPPF_10MB_FD;
1248 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1249 *advertised |= OFPPF_100MB_HD;
1251 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1252 *advertised |= OFPPF_100MB_FD;
1254 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1255 *advertised |= OFPPF_1GB_HD;
1257 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1258 *advertised |= OFPPF_1GB_FD;
1260 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1261 *advertised |= OFPPF_10GB_FD;
1263 if (ecmd.advertising & ADVERTISED_TP) {
1264 *advertised |= OFPPF_COPPER;
1266 if (ecmd.advertising & ADVERTISED_FIBRE) {
1267 *advertised |= OFPPF_FIBER;
1269 if (ecmd.advertising & ADVERTISED_Autoneg) {
1270 *advertised |= OFPPF_AUTONEG;
1272 if (ecmd.advertising & ADVERTISED_Pause) {
1273 *advertised |= OFPPF_PAUSE;
1275 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1276 *advertised |= OFPPF_PAUSE_ASYM;
1279 /* Current settings. */
1280 if (ecmd.speed == SPEED_10) {
1281 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1282 } else if (ecmd.speed == SPEED_100) {
1283 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1284 } else if (ecmd.speed == SPEED_1000) {
1285 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1286 } else if (ecmd.speed == SPEED_10000) {
1287 *current = OFPPF_10GB_FD;
1292 if (ecmd.port == PORT_TP) {
1293 *current |= OFPPF_COPPER;
1294 } else if (ecmd.port == PORT_FIBRE) {
1295 *current |= OFPPF_FIBER;
1299 *current |= OFPPF_AUTONEG;
1302 /* Peer advertisements. */
1303 *peer = 0; /* XXX */
1308 /* Set the features advertised by 'netdev' to 'advertise'. */
1310 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1312 struct ethtool_cmd ecmd;
1315 memset(&ecmd, 0, sizeof ecmd);
1316 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1317 ETHTOOL_GSET, "ETHTOOL_GSET");
1322 ecmd.advertising = 0;
1323 if (advertise & OFPPF_10MB_HD) {
1324 ecmd.advertising |= ADVERTISED_10baseT_Half;
1326 if (advertise & OFPPF_10MB_FD) {
1327 ecmd.advertising |= ADVERTISED_10baseT_Full;
1329 if (advertise & OFPPF_100MB_HD) {
1330 ecmd.advertising |= ADVERTISED_100baseT_Half;
1332 if (advertise & OFPPF_100MB_FD) {
1333 ecmd.advertising |= ADVERTISED_100baseT_Full;
1335 if (advertise & OFPPF_1GB_HD) {
1336 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1338 if (advertise & OFPPF_1GB_FD) {
1339 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1341 if (advertise & OFPPF_10GB_FD) {
1342 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1344 if (advertise & OFPPF_COPPER) {
1345 ecmd.advertising |= ADVERTISED_TP;
1347 if (advertise & OFPPF_FIBER) {
1348 ecmd.advertising |= ADVERTISED_FIBRE;
1350 if (advertise & OFPPF_AUTONEG) {
1351 ecmd.advertising |= ADVERTISED_Autoneg;
1353 if (advertise & OFPPF_PAUSE) {
1354 ecmd.advertising |= ADVERTISED_Pause;
1356 if (advertise & OFPPF_PAUSE_ASYM) {
1357 ecmd.advertising |= ADVERTISED_Asym_Pause;
1359 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1360 ETHTOOL_SSET, "ETHTOOL_SSET");
1363 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1364 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1365 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1366 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1367 * sets '*vlan_vid' to -1. */
1369 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1371 const char *netdev_name = netdev_get_name(netdev);
1372 struct ds line = DS_EMPTY_INITIALIZER;
1373 FILE *stream = NULL;
1377 COVERAGE_INC(netdev_get_vlan_vid);
1378 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1379 stream = fopen(fn, "r");
1385 if (ds_get_line(&line, stream)) {
1386 if (ferror(stream)) {
1388 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1391 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1396 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1398 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1399 fn, ds_cstr(&line));
/* Shell command templates used to install ingress policing.
 *
 * POLICE_ADD_CMD takes the device name (%s).  POLICE_CONFIG_CMD takes the
 * device name (%s) followed by the rate in kbit/s and the burst size, both
 * unsigned (%u) to match the uint32_t values they are formatted from. */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %ukbit burst %uk mtu 65535 drop flowid :1"
1420 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1421 * positive errno value.
1423 * This function is equivalent to running
1424 * /sbin/tc qdisc del dev %s handle ffff: ingress
1425 * but it is much, much faster.
1428 netdev_linux_remove_policing(struct netdev *netdev)
1430 struct netdev_dev_linux *netdev_dev =
1431 netdev_dev_linux_cast(netdev_get_dev(netdev));
1432 const char *netdev_name = netdev_get_name(netdev);
1434 struct ofpbuf request;
1435 struct tcmsg *tcmsg;
1438 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1442 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1443 tcmsg->tcm_parent = TC_H_INGRESS;
1444 nl_msg_put_string(&request, TCA_KIND, "ingress");
1445 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1447 error = tc_transact(&request, NULL);
1448 if (error && error != ENOENT && error != EINVAL) {
1449 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1450 netdev_name, strerror(error));
1454 netdev_dev->kbits_rate = 0;
1455 netdev_dev->kbits_burst = 0;
1456 netdev_dev->cache_valid |= VALID_POLICING;
1460 /* Attempts to set input rate limiting (policing) policy. */
1462 netdev_linux_set_policing(struct netdev *netdev,
1463 uint32_t kbits_rate, uint32_t kbits_burst)
1465 struct netdev_dev_linux *netdev_dev =
1466 netdev_dev_linux_cast(netdev_get_dev(netdev));
1467 const char *netdev_name = netdev_get_name(netdev);
1470 COVERAGE_INC(netdev_set_policing);
1472 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1473 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1474 : kbits_burst); /* Stick with user-specified value. */
1476 if (netdev_dev->cache_valid & VALID_POLICING
1477 && netdev_dev->kbits_rate == kbits_rate
1478 && netdev_dev->kbits_burst == kbits_burst) {
1479 /* Assume that settings haven't changed since we last set them. */
1483 netdev_linux_remove_policing(netdev);
1485 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1486 if (system(command) != 0) {
1487 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1491 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1492 kbits_rate, kbits_burst);
1493 if (system(command) != 0) {
1494 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1499 netdev_dev->kbits_rate = kbits_rate;
1500 netdev_dev->kbits_burst = kbits_burst;
1501 netdev_dev->cache_valid |= VALID_POLICING;
1508 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1511 const struct tc_ops **opsp;
1513 for (opsp = tcs; *opsp != NULL; opsp++) {
1514 const struct tc_ops *ops = *opsp;
1515 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1516 svec_add(types, ops->ovs_name);
1522 static const struct tc_ops *
1523 tc_lookup_ovs_name(const char *name)
1525 const struct tc_ops **opsp;
1527 for (opsp = tcs; *opsp != NULL; opsp++) {
1528 const struct tc_ops *ops = *opsp;
1529 if (!strcmp(name, ops->ovs_name)) {
1536 static const struct tc_ops *
1537 tc_lookup_linux_name(const char *name)
1539 const struct tc_ops **opsp;
1541 for (opsp = tcs; *opsp != NULL; opsp++) {
1542 const struct tc_ops *ops = *opsp;
1543 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1550 static struct tc_queue *
1551 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1554 struct netdev_dev_linux *netdev_dev =
1555 netdev_dev_linux_cast(netdev_get_dev(netdev));
1556 struct tc_queue *queue;
1558 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1559 if (queue->queue_id == queue_id) {
/* Returns the queue numbered 'queue_id' on 'netdev', or a null pointer if
 * there is none.  The bucket hash is hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1573 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1575 struct netdev_qos_capabilities *caps)
1577 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1581 caps->n_queues = ops->n_queues;
1586 netdev_linux_get_qos(const struct netdev *netdev,
1587 const char **typep, struct shash *details)
1589 struct netdev_dev_linux *netdev_dev =
1590 netdev_dev_linux_cast(netdev_get_dev(netdev));
1593 error = tc_query_qdisc(netdev);
1598 *typep = netdev_dev->tc->ops->ovs_name;
1599 return (netdev_dev->tc->ops->qdisc_get
1600 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1605 netdev_linux_set_qos(struct netdev *netdev,
1606 const char *type, const struct shash *details)
1608 struct netdev_dev_linux *netdev_dev =
1609 netdev_dev_linux_cast(netdev_get_dev(netdev));
1610 const struct tc_ops *new_ops;
1613 new_ops = tc_lookup_ovs_name(type);
1614 if (!new_ops || !new_ops->tc_install) {
1618 error = tc_query_qdisc(netdev);
1623 if (new_ops == netdev_dev->tc->ops) {
1624 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1626 /* Delete existing qdisc. */
1627 error = tc_del_qdisc(netdev);
1631 assert(netdev_dev->tc == NULL);
1633 /* Install new qdisc. */
1634 error = new_ops->tc_install(netdev, details);
1635 assert((error == 0) == (netdev_dev->tc != NULL));
1642 netdev_linux_get_queue(const struct netdev *netdev,
1643 unsigned int queue_id, struct shash *details)
1645 struct netdev_dev_linux *netdev_dev =
1646 netdev_dev_linux_cast(netdev_get_dev(netdev));
1649 error = tc_query_qdisc(netdev);
1653 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1655 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1661 netdev_linux_set_queue(struct netdev *netdev,
1662 unsigned int queue_id, const struct shash *details)
1664 struct netdev_dev_linux *netdev_dev =
1665 netdev_dev_linux_cast(netdev_get_dev(netdev));
1668 error = tc_query_qdisc(netdev);
1671 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1672 || !netdev_dev->tc->ops->class_set) {
1676 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1680 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1682 struct netdev_dev_linux *netdev_dev =
1683 netdev_dev_linux_cast(netdev_get_dev(netdev));
1686 error = tc_query_qdisc(netdev);
1689 } else if (!netdev_dev->tc->ops->class_delete) {
1692 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1694 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1700 netdev_linux_get_queue_stats(const struct netdev *netdev,
1701 unsigned int queue_id,
1702 struct netdev_queue_stats *stats)
1704 struct netdev_dev_linux *netdev_dev =
1705 netdev_dev_linux_cast(netdev_get_dev(netdev));
1708 error = tc_query_qdisc(netdev);
1711 } else if (!netdev_dev->tc->ops->class_get_stats) {
1714 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1716 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1722 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1724 struct ofpbuf request;
1725 struct tcmsg *tcmsg;
1727 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1731 tcmsg->tcm_parent = 0;
1732 nl_dump_start(dump, rtnl_sock, &request);
1733 ofpbuf_uninit(&request);
1738 netdev_linux_dump_queues(const struct netdev *netdev,
1739 netdev_dump_queues_cb *cb, void *aux)
1741 struct netdev_dev_linux *netdev_dev =
1742 netdev_dev_linux_cast(netdev_get_dev(netdev));
1743 struct tc_queue *queue;
1744 struct shash details;
1748 error = tc_query_qdisc(netdev);
1751 } else if (!netdev_dev->tc->ops->class_get) {
1756 shash_init(&details);
1757 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1758 shash_clear(&details);
1760 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1762 (*cb)(queue->queue_id, &details, aux);
1767 shash_destroy(&details);
1773 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1774 netdev_dump_queue_stats_cb *cb, void *aux)
1776 struct netdev_dev_linux *netdev_dev =
1777 netdev_dev_linux_cast(netdev_get_dev(netdev));
1778 struct nl_dump dump;
1783 error = tc_query_qdisc(netdev);
1786 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1791 if (!start_queue_dump(netdev, &dump)) {
1794 while (nl_dump_next(&dump, &msg)) {
1795 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1801 error = nl_dump_done(&dump);
1802 return error ? error : last_error;
1806 netdev_linux_get_in4(const struct netdev *netdev_,
1807 struct in_addr *address, struct in_addr *netmask)
1809 struct netdev_dev_linux *netdev_dev =
1810 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1812 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1815 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1816 SIOCGIFADDR, "SIOCGIFADDR");
1821 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1822 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1827 netdev_dev->cache_valid |= VALID_IN4;
1829 *address = netdev_dev->address;
1830 *netmask = netdev_dev->netmask;
1831 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1835 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1836 struct in_addr netmask)
1838 struct netdev_dev_linux *netdev_dev =
1839 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1842 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1844 netdev_dev->cache_valid |= VALID_IN4;
1845 netdev_dev->address = address;
1846 netdev_dev->netmask = netmask;
1847 if (address.s_addr != INADDR_ANY) {
1848 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1849 "SIOCSIFNETMASK", netmask);
/* Parses 'line', which should have the layout used by /proc/net/if_inet6:
 * 32 hexadecimal digits of address, four hexadecimal fields that are skipped,
 * and an interface name of at most 16 characters.
 *
 * On success stores the 16 address bytes in 'in6' and the name in 'ifname'
 * and returns true.  Returns false if 'line' does not match, in which case
 * 'in6' and 'ifname' may have been partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    bool ok;

    /* X8 converts two hex digits into one byte.  It is local to this
     * function and undefined again below so that it cannot leak into the rest
     * of the file. */
#define X8 "%2" SCNx8
    ok = sscanf(line,
                " " X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                "%*x %*x %*x %*x %16s\n",
                &s6[0], &s6[1], &s6[2], &s6[3],
                &s6[4], &s6[5], &s6[6], &s6[7],
                &s6[8], &s6[9], &s6[10], &s6[11],
                &s6[12], &s6[13], &s6[14], &s6[15],
                ifname) == 17;
#undef X8

    return ok;
}
1871 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1872 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1874 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1876 struct netdev_dev_linux *netdev_dev =
1877 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1878 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1882 netdev_dev->in6 = in6addr_any;
1884 file = fopen("/proc/net/if_inet6", "r");
1886 const char *name = netdev_get_name(netdev_);
1887 while (fgets(line, sizeof line, file)) {
1888 struct in6_addr in6_tmp;
1889 char ifname[16 + 1];
1890 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1891 && !strcmp(name, ifname))
1893 netdev_dev->in6 = in6_tmp;
1899 netdev_dev->cache_valid |= VALID_IN6;
1901 *in6 = netdev_dev->in6;
/* Initializes '*sa' as an AF_INET socket address holding 'addr' with port 0.
 * All other bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Build the IPv4 form in a properly typed temporary first... */
    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    /* ...then copy it over a zeroed generic sockaddr. */
    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
1919 do_set_addr(struct netdev *netdev,
1920 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1923 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1924 make_in4_sockaddr(&ifr.ifr_addr, addr);
1926 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1930 /* Adds 'router' as a default IP gateway. */
1932 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1934 struct in_addr any = { INADDR_ANY };
1938 memset(&rt, 0, sizeof rt);
1939 make_in4_sockaddr(&rt.rt_dst, any);
1940 make_in4_sockaddr(&rt.rt_gateway, router);
1941 make_in4_sockaddr(&rt.rt_genmask, any);
1942 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1943 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1945 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
1951 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1954 static const char fn[] = "/proc/net/route";
1959 *netdev_name = NULL;
1960 stream = fopen(fn, "r");
1961 if (stream == NULL) {
1962 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1967 while (fgets(line, sizeof line, stream)) {
1970 uint32_t dest, gateway, mask;
1971 int refcnt, metric, mtu;
1972 unsigned int flags, use, window, irtt;
1975 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1977 iface, &dest, &gateway, &flags, &refcnt,
1978 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1980 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
1984 if (!(flags & RTF_UP)) {
1985 /* Skip routes that aren't up. */
1989 /* The output of 'dest', 'mask', and 'gateway' were given in
1990 * network byte order, so we don't need need any endian
1991 * conversions here. */
1992 if ((dest & mask) == (host->s_addr & mask)) {
1994 /* The host is directly reachable. */
1995 next_hop->s_addr = 0;
1997 /* To reach the host, we must go through a gateway. */
1998 next_hop->s_addr = gateway;
2000 *netdev_name = xstrdup(iface);
2011 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2012 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2013 * returns 0. Otherwise, it returns a positive errno value; in particular,
2014 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2016 netdev_linux_arp_lookup(const struct netdev *netdev,
2017 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
2020 struct sockaddr_in sin;
2023 memset(&r, 0, sizeof r);
2024 sin.sin_family = AF_INET;
2025 sin.sin_addr.s_addr = ip;
2027 memcpy(&r.arp_pa, &sin, sizeof sin);
2028 r.arp_ha.sa_family = ARPHRD_ETHER;
2030 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2031 COVERAGE_INC(netdev_arp_lookup);
2032 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2034 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2035 } else if (retval != ENXIO) {
2036 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2037 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
2043 nd_to_iff_flags(enum netdev_flags nd)
2046 if (nd & NETDEV_UP) {
2049 if (nd & NETDEV_PROMISC) {
2056 iff_to_nd_flags(int iff)
2058 enum netdev_flags nd = 0;
2062 if (iff & IFF_PROMISC) {
2063 nd |= NETDEV_PROMISC;
2069 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2070 enum netdev_flags on, enum netdev_flags *old_flagsp)
2072 int old_flags, new_flags;
2075 error = get_flags(netdev, &old_flags);
2077 *old_flagsp = iff_to_nd_flags(old_flags);
2078 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2079 if (new_flags != old_flags) {
2080 error = set_flags(netdev, new_flags);
2087 poll_notify(struct list *list)
2089 struct netdev_linux_notifier *notifier;
2090 LIST_FOR_EACH (notifier, node, list) {
2091 struct netdev_notifier *n = ¬ifier->notifier;
2097 netdev_linux_poll_cb(const struct rtnetlink_link_change *change,
2098 void *aux OVS_UNUSED)
2101 struct list *list = shash_find_data(&netdev_linux_notifiers,
2107 struct shash_node *node;
2108 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2109 poll_notify(node->data);
2115 netdev_linux_poll_add(struct netdev *netdev,
2116 void (*cb)(struct netdev_notifier *), void *aux,
2117 struct netdev_notifier **notifierp)
2119 const char *netdev_name = netdev_get_name(netdev);
2120 struct netdev_linux_notifier *notifier;
2123 if (shash_is_empty(&netdev_linux_notifiers)) {
2125 error = rtnetlink_link_notifier_register(&netdev_linux_poll_notifier,
2126 netdev_linux_poll_cb, NULL);
2132 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2134 list = xmalloc(sizeof *list);
2136 shash_add(&netdev_linux_notifiers, netdev_name, list);
2139 notifier = xmalloc(sizeof *notifier);
2140 netdev_notifier_init(¬ifier->notifier, netdev, cb, aux);
2141 list_push_back(list, ¬ifier->node);
2142 *notifierp = ¬ifier->notifier;
2147 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2149 struct netdev_linux_notifier *notifier =
2150 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2153 /* Remove 'notifier' from its list. */
2154 list = list_remove(¬ifier->node);
2155 if (list_is_empty(list)) {
2156 /* The list is now empty. Remove it from the hash and free it. */
2157 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2158 shash_delete(&netdev_linux_notifiers,
2159 shash_find(&netdev_linux_notifiers, netdev_name));
2164 /* If that was the last notifier, unregister. */
2165 if (shash_is_empty(&netdev_linux_notifiers)) {
2166 rtnetlink_link_notifier_unregister(&netdev_linux_poll_notifier);
/* Expands to a positional initializer for a 'struct netdev_class' that uses
 * the netdev_linux_* implementations, parameterized by the class NAME and by
 * the CREATE, ENUMERATE and SET_STATS callbacks.  The entries must stay in
 * the order of the members of 'struct netdev_class'. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS)  \
{                                                               \
    NAME,                                                       \
                                                                \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* reconfigure */               \
                                                                \
    netdev_linux_open,                                          \
    netdev_linux_close,                                         \
                                                                \
    ENUMERATE,                                                  \
                                                                \
    netdev_linux_recv,                                          \
    netdev_linux_recv_wait,                                     \
    netdev_linux_drain,                                         \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_miimon,                                    \
    netdev_linux_get_stats,                                     \
    SET_STATS,                                                  \
                                                                \
    netdev_linux_get_features,                                  \
    netdev_linux_set_advertisements,                            \
    netdev_linux_get_vlan_vid,                                  \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    NULL,                       /* get_status */                \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_poll_add,                                      \
    netdev_linux_poll_remove                                    \
}
2233 const struct netdev_class netdev_linux_class =
2236 netdev_linux_create,
2237 netdev_linux_enumerate,
2238 NULL); /* set_stats */
2240 const struct netdev_class netdev_tap_class =
2243 netdev_linux_create_tap,
2244 NULL, /* enumerate */
2245 NULL); /* set_stats */
2247 const struct netdev_class netdev_internal_class =
2250 netdev_linux_create,
2251 NULL, /* enumerate */
2252 netdev_vport_set_stats);
2254 /* HTB traffic control class. */
2256 #define HTB_N_QUEUES 0xf000
2260 unsigned int max_rate; /* In bytes/s. */
2264 struct tc_queue tc_queue;
2265 unsigned int min_rate; /* In bytes/s. */
2266 unsigned int max_rate; /* In bytes/s. */
2267 unsigned int burst; /* In bytes. */
2268 unsigned int priority; /* Lower values are higher priorities. */
2272 htb_get__(const struct netdev *netdev)
2274 struct netdev_dev_linux *netdev_dev =
2275 netdev_dev_linux_cast(netdev_get_dev(netdev));
2276 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2280 htb_install__(struct netdev *netdev, uint64_t max_rate)
2282 struct netdev_dev_linux *netdev_dev =
2283 netdev_dev_linux_cast(netdev_get_dev(netdev));
2286 htb = xmalloc(sizeof *htb);
2287 tc_init(&htb->tc, &tc_ops_htb);
2288 htb->max_rate = max_rate;
2290 netdev_dev->tc = &htb->tc;
2295 /* Create an HTB qdisc.
2297 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2299 htb_setup_qdisc__(struct netdev *netdev)
2302 struct tc_htb_glob opt;
2303 struct ofpbuf request;
2304 struct tcmsg *tcmsg;
2306 tc_del_qdisc(netdev);
2308 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2309 NLM_F_EXCL | NLM_F_CREATE, &request);
2313 tcmsg->tcm_handle = tc_make_handle(1, 0);
2314 tcmsg->tcm_parent = TC_H_ROOT;
2316 nl_msg_put_string(&request, TCA_KIND, "htb");
2318 memset(&opt, 0, sizeof opt);
2319 opt.rate2quantum = 10;
2323 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2324 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2325 nl_msg_end_nested(&request, opt_offset);
2327 return tc_transact(&request, NULL);
2330 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2331 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2333 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2334 unsigned int parent, struct htb_class *class)
2337 struct tc_htb_opt opt;
2338 struct ofpbuf request;
2339 struct tcmsg *tcmsg;
2343 netdev_get_mtu(netdev, &mtu);
2345 memset(&opt, 0, sizeof opt);
2346 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2347 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2348 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2349 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2350 opt.prio = class->priority;
2352 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2356 tcmsg->tcm_handle = handle;
2357 tcmsg->tcm_parent = parent;
2359 nl_msg_put_string(&request, TCA_KIND, "htb");
2360 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2361 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2362 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2363 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2364 nl_msg_end_nested(&request, opt_offset);
2366 error = tc_transact(&request, NULL);
2368 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2369 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2370 netdev_get_name(netdev),
2371 tc_get_major(handle), tc_get_minor(handle),
2372 tc_get_major(parent), tc_get_minor(parent),
2373 class->min_rate, class->max_rate,
2374 class->burst, class->priority, strerror(error));
2379 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2380 * description of them into 'details'. The description complies with the
2381 * specification given in the vswitch database documentation for linux-htb
2384 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2386 static const struct nl_policy tca_htb_policy[] = {
2387 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2388 .min_len = sizeof(struct tc_htb_opt) },
2391 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2392 const struct tc_htb_opt *htb;
2394 if (!nl_parse_nested(nl_options, tca_htb_policy,
2395 attrs, ARRAY_SIZE(tca_htb_policy))) {
2396 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2400 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2401 class->min_rate = htb->rate.rate;
2402 class->max_rate = htb->ceil.rate;
2403 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2404 class->priority = htb->prio;
2409 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2410 struct htb_class *options,
2411 struct netdev_queue_stats *stats)
2413 struct nlattr *nl_options;
2414 unsigned int handle;
2417 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2418 if (!error && queue_id) {
2419 unsigned int major = tc_get_major(handle);
2420 unsigned int minor = tc_get_minor(handle);
2421 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2422 *queue_id = minor - 1;
2427 if (!error && options) {
2428 error = htb_parse_tca_options__(nl_options, options);
2434 htb_parse_qdisc_details__(struct netdev *netdev,
2435 const struct shash *details, struct htb_class *hc)
2437 const char *max_rate_s;
2439 max_rate_s = shash_find_data(details, "max-rate");
2440 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2441 if (!hc->max_rate) {
2444 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2445 hc->max_rate = netdev_features_to_bps(current) / 8;
2447 hc->min_rate = hc->max_rate;
2453 htb_parse_class_details__(struct netdev *netdev,
2454 const struct shash *details, struct htb_class *hc)
2456 const struct htb *htb = htb_get__(netdev);
2457 const char *min_rate_s = shash_find_data(details, "min-rate");
2458 const char *max_rate_s = shash_find_data(details, "max-rate");
2459 const char *burst_s = shash_find_data(details, "burst");
2460 const char *priority_s = shash_find_data(details, "priority");
2463 /* min-rate. Don't allow a min-rate below 1500 bytes/s. */
2465 /* min-rate is required. */
2468 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2469 hc->min_rate = MAX(hc->min_rate, 1500);
2470 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2473 hc->max_rate = (max_rate_s
2474 ? strtoull(max_rate_s, NULL, 10) / 8
2476 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2477 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2481 * According to hints in the documentation that I've read, it is important
2482 * that 'burst' be at least as big as the largest frame that might be
2483 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2484 * but having it a bit too small is a problem. Since netdev_get_mtu()
2485 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2486 * the MTU. We actually add 64, instead of 14, as a guard against
2487 * additional headers get tacked on somewhere that we're not aware of. */
2488 netdev_get_mtu(netdev, &mtu);
2489 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2490 hc->burst = MAX(hc->burst, mtu + 64);
2493 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and parses
 * the reply into 'options' and 'stats' (see htb_parse_tcmsg__()).  Returns 0
 * if successful, otherwise the error from the query or from parsing. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
/* Installs an HTB qdisc on 'netdev' configured from 'details'.
 *
 * Sets up the qdisc itself with htb_setup_qdisc__(), derives the parameters of
 * the top-level class from 'details', creates that class as 1:fffe under
 * 1:0, and records its max_rate in the netdev via htb_install__(). */
2515 htb_tc_install(struct netdev *netdev, const struct shash *details)
2519 error = htb_setup_qdisc__(netdev);
2521 struct htb_class hc;
2523 htb_parse_qdisc_details__(netdev, details, &hc);
2524 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2525 tc_make_handle(1, 0), &hc);
2527 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that embeds 'queue' as its 'tc_queue'
 * member. */
2533 static struct htb_class *
2534 htb_class_cast__(const struct tc_queue *queue)
2536 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the class parameters in 'hc' for queue 'queue_id' in 'netdev''s
 * in-memory HTB state.
 *
 * The queue is looked up with tc_find_queue__() under hash_int(queue_id, 0).
 * A freshly allocated htb_class is inserted into htb->tc.queues under the same
 * hash.  NOTE(review): the allocation presumably happens only when the lookup
 * finds nothing -- confirm against the full source. */
2540 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2541 const struct htb_class *hc)
2543 struct htb *htb = htb_get__(netdev);
2544 size_t hash = hash_int(queue_id, 0);
2545 struct tc_queue *queue;
2546 struct htb_class *hcp;
2548 queue = tc_find_queue__(netdev, queue_id, hash);
2550 hcp = htb_class_cast__(queue);
2552 hcp = xmalloc(sizeof *hcp);
2553 queue = &hcp->tc_queue;
2554 queue->queue_id = queue_id;
2555 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
/* Copy every configurable field from the caller's 'hc'. */
2558 hcp->min_rate = hc->min_rate;
2559 hcp->max_rate = hc->max_rate;
2560 hcp->burst = hc->burst;
2561 hcp->priority = hc->priority;
/* Builds 'netdev''s in-memory HTB state from what the kernel reports.
 *
 * Class 1:fffe is queried for the qdisc-level max_rate, which is passed to
 * htb_install__().  Then every message of a queue dump that
 * htb_parse_tcmsg__() parses successfully (returns 0) is recorded with
 * htb_update_queue__().  'nlmsg' is unused. */
2565 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2568 struct nl_dump dump;
2569 struct htb_class hc;
2572 /* Get qdisc options. */
2574 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2575 htb = htb_install__(netdev, hc.max_rate);
2578 if (!start_queue_dump(netdev, &dump)) {
2581 while (nl_dump_next(&dump, &msg)) {
2582 unsigned int queue_id;
2584 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2585 htb_update_queue__(netdev, queue_id, &hc);
2588 nl_dump_done(&dump);
/* Tears down the struct htb that embeds 'tc', removing every htb_class from
 * its queue map.  The _SAFE iterator is used because nodes are removed while
 * iterating. */
2594 htb_tc_destroy(struct tc *tc)
2596 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2597 struct htb_class *hc, *next;
2599 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2600 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-level HTB configuration of 'netdev' by adding a
 * "max-rate" entry to 'details'.  The stored rate is multiplied by 8 for
 * output, the inverse of the division by 8 applied when rates are parsed. */
2608 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2610 const struct htb *htb = htb_get__(netdev);
2611 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures the qdisc-level HTB parameters of 'netdev' from 'details':
 * re-creates the top-level class 1:fffe (parent 1:0) with the parsed settings
 * and stores the new max_rate in the netdev's HTB state. */
2616 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2618 struct htb_class hc;
2621 htb_parse_qdisc_details__(netdev, details, &hc);
2622 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2623 tc_make_handle(1, 0), &hc);
2625 htb_get__(netdev)->max_rate = hc.max_rate;
/* Describes the HTB class behind 'queue' by adding entries to 'details'.
 * Rates and burst are multiplied by 8 for output; "priority" is printed
 * as-is.  'netdev' is unused. */
2631 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2632 const struct tc_queue *queue, struct shash *details)
2634 const struct htb_class *hc = htb_class_cast__(queue);
2636 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
/* "max-rate" is reported only when it differs from "min-rate". */
2637 if (hc->min_rate != hc->max_rate) {
2638 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2640 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2642 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Creates or reconfigures the HTB class for queue 'queue_id' on 'netdev' from
 * 'details'.  Queue N is kernel class 1:(N + 1), a child of class 1:fffe.
 * The parsed parameters are also recorded in the in-memory state with
 * htb_update_queue__(). */
2648 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2649 const struct shash *details)
2651 struct htb_class hc;
2654 error = htb_parse_class_details__(netdev, details, &hc);
2659 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2660 tc_make_handle(1, 0xfffe), &hc);
2665 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes the kernel class 1:(queue_id + 1) that backs 'queue' on 'netdev'
 * and unlinks the queue from the HTB queue map. */
2670 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2672 struct htb_class *hc = htb_class_cast__(queue);
2673 struct htb *htb = htb_get__(netdev);
2676 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2678 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  Class options are not requested
 * (NULL is passed for them). */
2685 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2686 struct netdev_queue_stats *stats)
2688 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2689 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class handle and statistics out of dump message 'nlmsg' and
 * reports them through 'cb' (with 'aux' passed along).  'netdev' is unused. */
2693 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2694 const struct ofpbuf *nlmsg,
2695 netdev_dump_queue_stats_cb *cb, void *aux)
2697 struct netdev_queue_stats stats;
2698 unsigned int handle, major, minor;
2701 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2706 major = tc_get_major(handle);
2707 minor = tc_get_minor(handle);
/* Only classes 1:1 through 1:HTB_N_QUEUES are reported; class 1:N is
 * reported as queue N - 1. */
2708 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2709 (*cb)(minor - 1, &stats, aux);
/* Operations table for the "linux-htb" QoS type, backed by the Linux "htb"
 * qdisc with up to HTB_N_QUEUES queues. */
2714 static const struct tc_ops tc_ops_htb = {
2715 "htb", /* linux_name */
2716 "linux-htb", /* ovs_name */
2717 HTB_N_QUEUES, /* n_queues */
2726 htb_class_get_stats, /* class_get_stats */
2727 htb_class_dump_stats /* class_dump_stats */
2730 /* "linux-hfsc" traffic control class. */
/* Number of HFSC queues: class minors 1 through HFSC_N_QUEUES are mapped to
 * queue ids 0 through HFSC_N_QUEUES - 1 by the parsing code in this file. */
2732 #define HFSC_N_QUEUES 0xf000
2740 struct tc_queue tc_queue;
/* Returns the struct hfsc that embeds the 'tc' currently attached to
 * 'netdev''s underlying Linux netdev_dev. */
2745 static struct hfsc *
2746 hfsc_get__(const struct netdev *netdev)
2748 struct netdev_dev_linux *netdev_dev;
2749 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2750 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that embeds 'queue' as its 'tc_queue'
 * member. */
2753 static struct hfsc_class *
2754 hfsc_class_cast__(const struct tc_queue *queue)
2756 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a struct hfsc for 'netdev', initializes its embedded tc with
 * tc_ops_hfsc, stores 'max_rate' in it, and attaches it to the netdev_dev as
 * the current traffic-control state. */
2759 static struct hfsc *
2760 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2762 struct netdev_dev_linux * netdev_dev;
2765 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2766 hfsc = xmalloc(sizeof *hfsc);
2767 tc_init(&hfsc->tc, &tc_ops_hfsc);
2768 hfsc->max_rate = max_rate;
2769 netdev_dev->tc = &hfsc->tc;
/* Records the class parameters in 'hc' for queue 'queue_id' in 'netdev''s
 * in-memory HFSC state.
 *
 * The queue is looked up with tc_find_queue__() under hash_int(queue_id, 0).
 * A freshly allocated hfsc_class is inserted into hfsc->tc.queues under the
 * same hash.  NOTE(review): the allocation presumably happens only when the
 * lookup finds nothing -- confirm against the full source. */
2775 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2776 const struct hfsc_class *hc)
2780 struct hfsc_class *hcp;
2781 struct tc_queue *queue;
2783 hfsc = hfsc_get__(netdev);
2784 hash = hash_int(queue_id, 0);
2786 queue = tc_find_queue__(netdev, queue_id, hash);
2788 hcp = hfsc_class_cast__(queue);
2790 hcp = xmalloc(sizeof *hcp);
2791 queue = &hcp->tc_queue;
2792 queue->queue_id = queue_id;
2793 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
/* Copy both configurable rates from the caller's 'hc'. */
2796 hcp->min_rate = hc->min_rate;
2797 hcp->max_rate = hc->max_rate;
/* Extracts the min and max rate of an HFSC class from the nested options
 * attribute 'nl_options' into 'class'.
 *
 * Three service curves are read: TCA_HFSC_RSC, TCA_HFSC_FSC and TCA_HFSC_USC,
 * each required to be at least sizeof(struct tc_service_curve) bytes.  A
 * rate-limited warning is logged when parsing fails, when any curve is
 * non-linear (nonzero 'm1' or 'd'), when the RSC and FSC slopes differ, or
 * when the RSC slope exceeds the USC slope.  Otherwise min_rate is taken from
 * fsc->m2 and max_rate from usc->m2. */
2801 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2803 const struct tc_service_curve *rsc, *fsc, *usc;
2804 static const struct nl_policy tca_hfsc_policy[] = {
2806 .type = NL_A_UNSPEC,
2808 .min_len = sizeof(struct tc_service_curve),
2811 .type = NL_A_UNSPEC,
2813 .min_len = sizeof(struct tc_service_curve),
2816 .type = NL_A_UNSPEC,
2818 .min_len = sizeof(struct tc_service_curve),
2821 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2823 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2824 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2825 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2829 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2830 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2831 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* Only purely linear curves can be expressed as a single rate. */
2833 if (rsc->m1 != 0 || rsc->d != 0 ||
2834 fsc->m1 != 0 || fsc->d != 0 ||
2835 usc->m1 != 0 || usc->d != 0) {
2836 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2837 "Non-linear service curves are not supported.");
/* hfsc_setup_class__() writes the same curve for RSC and FSC, so a
 * mismatch means the class was not configured by this code. */
2841 if (rsc->m2 != fsc->m2) {
2842 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2843 "Real-time service curves are not supported ");
2847 if (rsc->m2 > usc->m2) {
2848 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2849 "Min-rate service curve is greater than "
2850 "the max-rate service curve.");
2854 class->min_rate = fsc->m2;
2855 class->max_rate = usc->m2;
/* Parses class message 'tcmsg': the handle and statistics come from
 * tc_parse_class(), the HFSC rates from hfsc_parse_tca_options__().
 *
 * A handle 1:N with N in 1...HFSC_N_QUEUES is stored in '*queue_id' as
 * N - 1.  Callers in this file pass NULL for outputs they do not need. */
2860 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2861 struct hfsc_class *options,
2862 struct netdev_queue_stats *stats)
2865 unsigned int handle;
2866 struct nlattr *nl_options;
2868 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2874 unsigned int major, minor;
2876 major = tc_get_major(handle);
2877 minor = tc_get_minor(handle);
2878 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2879 *queue_id = minor - 1;
2886 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for class 'handle' (with parent 'parent') on 'netdev'
 * using tc_query_class(), then passes the reply to hfsc_parse_tcmsg__() so
 * that 'options' and 'stats' can be filled in from it.  The reply buffer is
 * released with ofpbuf_delete() after parsing. */
2893 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2894 unsigned int parent, struct hfsc_class *options,
2895 struct netdev_queue_stats *stats)
2898 struct ofpbuf *reply;
2900 error = tc_query_class(netdev, handle, parent, &reply);
2905 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2906 ofpbuf_delete(reply);
/* Computes the parameters of the top-level HFSC class for 'netdev' from
 * 'details' and stores them in 'class'.
 *
 * "max-rate" is read from 'details' and divided by 8; when the key is absent
 * the rate starts out as 0.  A rate derived from the device's current
 * features (netdev_get_features() + netdev_features_to_bps(), also divided
 * by 8) is used as a fallback.  NOTE(review): the fallback presumably applies
 * only when no usable "max-rate" was supplied -- confirm against the full
 * source.  The same value is stored as both min_rate and max_rate. */
2911 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2912 struct hfsc_class *class)
2915 const char *max_rate_s;
2917 max_rate_s = shash_find_data(details, "max-rate");
2918 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
/* 'current' receives the device's current feature bits. */
2923 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2924 max_rate = netdev_features_to_bps(current) / 8;
2927 class->min_rate = max_rate;
2928 class->max_rate = max_rate;
/* Computes the min and max rate of an HFSC class on 'netdev' from 'details'
 * and stores them in 'class'.
 *
 * "min-rate" is divided by 8, raised to at least 1500, and then capped at
 * the qdisc's max_rate.  "max-rate", when present, is divided by 8; the
 * result is raised to at least the computed min_rate and capped at the
 * qdisc's max_rate. */
2932 hfsc_parse_class_details__(struct netdev *netdev,
2933 const struct shash *details,
2934 struct hfsc_class * class)
2936 const struct hfsc *hfsc;
2937 uint32_t min_rate, max_rate;
2938 const char *min_rate_s, *max_rate_s;
2940 hfsc = hfsc_get__(netdev);
2941 min_rate_s = shash_find_data(details, "min-rate");
2942 max_rate_s = shash_find_data(details, "max-rate");
2948 min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2949 min_rate = MAX(min_rate, 1500);
2950 min_rate = MIN(min_rate, hfsc->max_rate);
2952 max_rate = (max_rate_s
2953 ? strtoull(max_rate_s, NULL, 10) / 8
2955 max_rate = MAX(max_rate, min_rate);
2956 max_rate = MIN(max_rate, hfsc->max_rate);
2958 class->min_rate = min_rate;
2959 class->max_rate = max_rate;
2964 /* Create an HFSC qdisc.
 *
 * Any existing qdisc is first removed with tc_del_qdisc().  The new qdisc is
 * requested with NLM_F_EXCL | NLM_F_CREATE, handle 1:0 and parent TC_H_ROOT.
 *
2966 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
2968 hfsc_setup_qdisc__(struct netdev * netdev)
2970 struct tcmsg *tcmsg;
2971 struct ofpbuf request;
2972 struct tc_hfsc_qopt opt;
2974 tc_del_qdisc(netdev);
2976 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2977 NLM_F_EXCL | NLM_F_CREATE, &request);
2983 tcmsg->tcm_handle = tc_make_handle(1, 0);
2984 tcmsg->tcm_parent = TC_H_ROOT;
2986 memset(&opt, 0, sizeof opt);
2989 nl_msg_put_string(&request, TCA_KIND, "hfsc");
2990 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
2992 return tc_transact(&request, NULL);
2995 /* Create an HFSC class.
 *
 * The same service curve, with slope 'class->min_rate', is sent as both
 * TCA_HFSC_RSC and TCA_HFSC_FSC; a curve with slope 'class->max_rate' is sent
 * as TCA_HFSC_USC.  A rate-limited warning is logged if the request fails.
 *
2997 * Equivalent to "tc class add dev <dev> parent <parent> classid <handle> hfsc
2998 * sc rate <min_rate> ul rate <max_rate>" */
3000 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3001 unsigned int parent, struct hfsc_class *class)
3005 struct tcmsg *tcmsg;
3006 struct ofpbuf request;
3007 struct tc_service_curve min, max;
3009 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3015 tcmsg->tcm_handle = handle;
3016 tcmsg->tcm_parent = parent;
3020 min.m2 = class->min_rate;
3024 max.m2 = class->max_rate;
3026 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3027 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3028 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3029 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3030 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3031 nl_msg_end_nested(&request, opt_offset);
3033 error = tc_transact(&request, NULL);
3035 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3036 "min-rate %ubps, max-rate %ubps (%s)",
3037 netdev_get_name(netdev),
3038 tc_get_major(handle), tc_get_minor(handle),
3039 tc_get_major(parent), tc_get_minor(parent),
3040 class->min_rate, class->max_rate, strerror(error));
/* Installs an HFSC qdisc on 'netdev' configured from 'details'.
 *
 * Sets up the qdisc with hfsc_setup_qdisc__(), derives the parameters of the
 * top-level class from 'details', creates that class as 1:fffe under 1:0,
 * and records its max_rate in the netdev via hfsc_install__(). */
3047 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3050 struct hfsc_class class;
3052 error = hfsc_setup_qdisc__(netdev);
3058 hfsc_parse_qdisc_details__(netdev, details, &class);
3059 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3060 tc_make_handle(1, 0), &class);
3066 hfsc_install__(netdev, class.max_rate);
/* Builds 'netdev''s in-memory HFSC state from what the kernel reports.
 *
 * Class 1:fffe is queried for the qdisc-level max_rate, which is passed to
 * hfsc_install__().  Then every message of a queue dump that
 * hfsc_parse_tcmsg__() parses successfully (returns 0) is recorded with
 * hfsc_update_queue__().  'nlmsg' is unused. */
3071 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3075 struct nl_dump dump;
3076 struct hfsc_class hc;
3079 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3080 hfsc = hfsc_install__(netdev, hc.max_rate);
3082 if (!start_queue_dump(netdev, &dump)) {
3086 while (nl_dump_next(&dump, &msg)) {
3087 unsigned int queue_id;
3089 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3090 hfsc_update_queue__(netdev, queue_id, &hc);
3094 nl_dump_done(&dump);
/* Tears down the struct hfsc that embeds 'tc', removing every hfsc_class from
 * its queue map.  The _SAFE iterator is used because nodes are removed while
 * iterating. */
3099 hfsc_tc_destroy(struct tc *tc)
3102 struct hfsc_class *hc, *next;
3104 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3106 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3107 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-level HFSC configuration of 'netdev' by adding a
 * "max-rate" entry to 'details'.  The stored rate is multiplied by 8 for
 * output, the inverse of the division by 8 applied when rates are parsed. */
3116 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3118 const struct hfsc *hfsc;
3119 hfsc = hfsc_get__(netdev);
3120 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures the qdisc-level HFSC parameters of 'netdev' from 'details':
 * re-creates the top-level class 1:fffe (parent 1:0) with the parsed settings
 * and stores the new max_rate in the netdev's HFSC state. */
3125 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3128 struct hfsc_class class;
3130 hfsc_parse_qdisc_details__(netdev, details, &class);
3131 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3132 tc_make_handle(1, 0), &class);
3135 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Describes the HFSC class behind 'queue' by adding "min-rate" (and, when it
 * differs from min-rate, "max-rate") to 'details'.  Rates are multiplied by 8
 * for output.  'netdev' is unused. */
3142 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3143 const struct tc_queue *queue, struct shash *details)
3145 const struct hfsc_class *hc;
3147 hc = hfsc_class_cast__(queue);
3148 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3149 if (hc->min_rate != hc->max_rate) {
3150 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Creates or reconfigures the HFSC class for queue 'queue_id' on 'netdev'
 * from 'details'.  Queue N is kernel class 1:(N + 1), a child of class
 * 1:fffe.  The parsed parameters are also recorded in the in-memory state
 * with hfsc_update_queue__(). */
3156 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3157 const struct shash *details)
3160 struct hfsc_class class;
3162 error = hfsc_parse_class_details__(netdev, details, &class);
3167 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3168 tc_make_handle(1, 0xfffe), &class);
3173 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes the kernel class 1:(queue_id + 1) that backs 'queue' on 'netdev'
 * and unlinks the queue from the HFSC queue map. */
3178 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3182 struct hfsc_class *hc;
3184 hc = hfsc_class_cast__(queue);
3185 hfsc = hfsc_get__(netdev);
3187 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3189 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  Class options are not requested
 * (NULL is passed for them). */
3196 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3197 struct netdev_queue_stats *stats)
3199 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3200 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class handle and statistics out of dump message 'nlmsg' and
 * reports them through 'cb' (with 'aux' passed along).  'netdev' is unused. */
3204 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3205 const struct ofpbuf *nlmsg,
3206 netdev_dump_queue_stats_cb *cb, void *aux)
3208 struct netdev_queue_stats stats;
3209 unsigned int handle, major, minor;
3212 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3217 major = tc_get_major(handle);
3218 minor = tc_get_minor(handle);
/* Only classes 1:1 through 1:HFSC_N_QUEUES are reported; class 1:N is
 * reported as queue N - 1. */
3219 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3220 (*cb)(minor - 1, &stats, aux);
/* Operations table for the "linux-hfsc" QoS type, backed by the Linux "hfsc"
 * qdisc with up to HFSC_N_QUEUES queues. */
3225 static const struct tc_ops tc_ops_hfsc = {
3226 "hfsc", /* linux_name */
3227 "linux-hfsc", /* ovs_name */
3228 HFSC_N_QUEUES, /* n_queues */
3229 hfsc_tc_install, /* tc_install */
3230 hfsc_tc_load, /* tc_load */
3231 hfsc_tc_destroy, /* tc_destroy */
3232 hfsc_qdisc_get, /* qdisc_get */
3233 hfsc_qdisc_set, /* qdisc_set */
3234 hfsc_class_get, /* class_get */
3235 hfsc_class_set, /* class_set */
3236 hfsc_class_delete, /* class_delete */
3237 hfsc_class_get_stats, /* class_get_stats */
3238 hfsc_class_dump_stats /* class_dump_stats */
3241 /* "linux-default" traffic control class.
3243 * This class represents the default, unnamed Linux qdisc. It corresponds to
3244 * the "" (empty string) QoS type in the OVS database. */
/* Attaches a tc initialized with tc_ops_default to 'netdev''s netdev_dev.
 *
 * NOTE(review): 'tc' is a function-local static, so presumably a single
 * instance is allocated on first use and shared by every netdev -- confirm
 * against the full source. */
3247 default_install__(struct netdev *netdev)
3249 struct netdev_dev_linux *netdev_dev =
3250 netdev_dev_linux_cast(netdev_get_dev(netdev));
3251 static struct tc *tc;
3254 tc = xmalloc(sizeof *tc);
3255 tc_init(tc, &tc_ops_default);
3257 netdev_dev->tc = tc;
/* tc_install hook for the default qdisc: attaches the default tc to 'netdev'
 * via default_install__().  'details' is unused. */
3261 default_tc_install(struct netdev *netdev,
3262 const struct shash *details OVS_UNUSED)
3264 default_install__(netdev);
/* tc_load hook for the default qdisc: attaches the default tc to 'netdev'
 * via default_install__().  'nlmsg' is unused. */
3269 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3271 default_install__(netdev);
/* Operations table for the default qdisc.  It has no Linux qdisc name, and
 * every qdisc/class accessor listed below, as well as tc_destroy, is NULL. */
3275 static const struct tc_ops tc_ops_default = {
3276 NULL, /* linux_name */
3281 NULL, /* tc_destroy */
3282 NULL, /* qdisc_get */
3283 NULL, /* qdisc_set */
3284 NULL, /* class_get */
3285 NULL, /* class_set */
3286 NULL, /* class_delete */
3287 NULL, /* class_get_stats */
3288 NULL /* class_dump_stats */
3291 /* "linux-other" traffic control class. */
/* tc_load hook for the "linux-other" type: attaches a tc initialized with
 * tc_ops_other to 'netdev''s netdev_dev.  'nlmsg' is unused.
 *
 * NOTE(review): 'tc' is a function-local static, so presumably a single
 * instance is allocated on first use and shared by every netdev -- confirm
 * against the full source. */
3296 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3298 struct netdev_dev_linux *netdev_dev =
3299 netdev_dev_linux_cast(netdev_get_dev(netdev));
3300 static struct tc *tc;
3303 tc = xmalloc(sizeof *tc);
3304 tc_init(tc, &tc_ops_other);
3306 netdev_dev->tc = tc;
/* Operations table for the "linux-other" QoS type.  It has no Linux qdisc
 * name and cannot be installed (tc_install is NULL); every qdisc/class
 * accessor listed below is NULL as well. */
3310 static const struct tc_ops tc_ops_other = {
3311 NULL, /* linux_name */
3312 "linux-other", /* ovs_name */
3314 NULL, /* tc_install */
3316 NULL, /* tc_destroy */
3317 NULL, /* qdisc_get */
3318 NULL, /* qdisc_set */
3319 NULL, /* class_get */
3320 NULL, /* class_set */
3321 NULL, /* class_delete */
3322 NULL, /* class_get_stats */
3323 NULL /* class_dump_stats */
3326 /* Traffic control. */
3328 /* Number of kernel "tc" ticks per second.
 *
 * Derived from the first three /proc/net/psched parameters as
 * (double) a * c / b. */
3329 static double ticks_per_s;
3331 /* Number of kernel "jiffies" per second. This is used for the purpose of
3332 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3333 * one jiffy's worth of data.
3335 * There are two possibilities here:
3337 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3338 * approximate range of 100 to 1024. That means that we really need to
3339 * make sure that the qdisc can buffer that much data.
3341 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3342 * has finely granular timers and there's no need to fudge additional room
3343 * for buffers. (There's no extra effort needed to implement that: the
3344 * large 'buffer_hz' is used as a divisor, so practically any number will
3345 * come out as 0 in the division. Small integer results in the case of
3346 * really high dividends won't have any real effect anyhow.)
 */
/* Used as the divisor in tc_buffer_per_jiffy(). */
3348 static unsigned int buffer_hz;
3350 /* Returns tc handle 'major':'minor'.
 *
 * 'major' is shifted into the upper 16 bits before being combined with
 * 'minor' by TC_H_MAKE. */
3352 tc_make_handle(unsigned int major, unsigned int minor)
3354 return TC_H_MAKE(major << 16, minor);
3357 /* Returns the major number from 'handle', i.e. the TC_H_MAJ() part shifted
 * down by 16 bits (the inverse of the shift in tc_make_handle()). */
3359 tc_get_major(unsigned int handle)
3361 return TC_H_MAJ(handle) >> 16;
3364 /* Returns the minor number from 'handle', as extracted by TC_H_MIN(). */
3366 tc_get_minor(unsigned int handle)
3368 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request of the given 'type' for 'netdev'
 * in 'request'.
 *
 * 'request' is initialized here with an nlmsghdr (flags NLM_F_REQUEST |
 * 'flags') followed by a zeroed struct tcmsg whose family is AF_UNSPEC and
 * whose ifindex is looked up with get_ifindex().  The returned tcmsg points
 * into 'request'; the caller still has to fill in its handle and parent. */
3371 static struct tcmsg *
3372 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3373 struct ofpbuf *request)
3375 struct tcmsg *tcmsg;
3379 error = get_ifindex(netdev, &ifindex);
3384 ofpbuf_init(request, 512);
3385 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3386 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3387 tcmsg->tcm_family = AF_UNSPEC;
3388 tcmsg->tcm_ifindex = ifindex;
3389 /* Caller should fill in tcmsg->tcm_handle. */
3390 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on rtnl_sock with nl_sock_transact(), passing 'replyp'
 * through for the reply.  'request' is always uninitialized afterwards, so
 * the caller must not use or free it again. */
3396 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3398 int error = nl_sock_transact(rtnl_sock, request, replyp);
3399 ofpbuf_uninit(request);
3406 /* The values in psched are not individually very meaningful, but they are
3407 * important. The tables below show some values seen in the wild.
3411 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3412 * (Before that, there are hints that it was 1000000000.)
3414 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3418 * -----------------------------------
3419 * [1] 000c8000 000f4240 000f4240 00000064
3420 * [2] 000003e8 00000400 000f4240 3b9aca00
3421 * [3] 000003e8 00000400 000f4240 3b9aca00
3422 * [4] 000003e8 00000400 000f4240 00000064
3423 * [5] 000003e8 00000040 000f4240 3b9aca00
3424 * [6] 000003e8 00000040 000f4240 000000f9
3426 * a b c d ticks_per_s buffer_hz
3427 * ------- --------- ---------- ------------- ----------- -------------
3428 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3429 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3430 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3431 * [4] 1,000 1,024 1,000,000 100 976,562 100
3432 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3433 * [6] 1,000 64 1,000,000 249 15,625,000 249
3435 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3436 * [2] 2.6.26-1-686-bigmem from Debian lenny
3437 * [3] 2.6.26-2-sparc64 from Debian lenny
3438 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3439 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3440 * [6] 2.6.34 from kernel.org on KVM
 */
3442 static const char fn[] = "/proc/net/psched";
3443 unsigned int a, b, c, d;
3449 stream = fopen(fn, "r");
3451 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
/* The file holds four hexadecimal fields. */
3455 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3456 VLOG_WARN("%s: read failed", fn);
3460 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3464 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* The multiplication is done in double, so 'a * c' cannot wrap. */
3468 ticks_per_s = (double) a * c / b;
3472 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3475 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3478 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3479 * rate of 'rate' bytes per second.
 *
 * NOTE(review): 'rate * ticks' is evaluated in unsigned int before the
 * division by the double 'ticks_per_s', so a large product wraps around --
 * confirm that callers keep it in range. */
3481 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3486 return (rate * ticks) / ticks_per_s;
3489 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3490 * rate of 'rate' bytes per second.
 *
 * Returns 0 when 'rate' is 0.  'ticks_per_s' is truncated to an integer by
 * the cast before it is multiplied by 'size'. */
3492 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3497 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3500 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3501 * a transmission rate of 'rate' bytes per second.
 *
 * This is an integer division by 'buffer_hz', so a very large 'buffer_hz'
 * yields 0. */
3503 tc_buffer_per_jiffy(unsigned int rate)
3508 return rate / buffer_hz;
3511 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3512 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3513 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3514 * stores NULL into it if it is absent.
 *
3516 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
 * TCA_KIND is a required string attribute; TCA_OPTIONS is an optional nested
 * attribute.  A rate-limited warning is logged if 'msg' does not parse.
 *
3519 * Returns 0 if successful, otherwise a positive errno value. */
3521 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3522 struct nlattr **options)
3524 static const struct nl_policy tca_policy[] = {
3525 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3526 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3528 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3530 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3531 tca_policy, ta, ARRAY_SIZE(ta))) {
3532 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3537 *kind = nl_attr_get_string(ta[TCA_KIND]);
3541 *options = ta[TCA_OPTIONS];
3556 /* Given Netlink 'msg' that describes a class, extracts its class handle (whose
3557 * minor number identifies the queue) into '*handlep', its TCA_OPTIONS attribute
3558 * into '*options', and its queue statistics into '*stats'. Any of the output
3559 * arguments may be null.
 *
 * Both TCA_OPTIONS and TCA_STATS2 are required nested attributes.  From
 * TCA_STATS2, the basic byte and packet counters become tx_bytes and
 * tx_packets, and the queue's drop counter becomes tx_errors.
 *
3561 * Returns 0 if successful, otherwise a positive errno value. */
3563 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3564 struct nlattr **options, struct netdev_queue_stats *stats)
3566 static const struct nl_policy tca_policy[] = {
3567 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3568 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3570 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3572 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3573 tca_policy, ta, ARRAY_SIZE(ta))) {
3574 VLOG_WARN_RL(&rl, "failed to parse class message");
3579 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3580 *handlep = tc->tcm_handle;
3584 *options = ta[TCA_OPTIONS];
3588 const struct gnet_stats_queue *gsq;
3589 struct gnet_stats_basic gsb;
3591 static const struct nl_policy stats_policy[] = {
3592 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3593 .min_len = sizeof gsb },
3594 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3595 .min_len = sizeof *gsq },
3597 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3599 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3600 sa, ARRAY_SIZE(sa))) {
3601 VLOG_WARN_RL(&rl, "failed to parse class stats");
3605 /* Alignment issues screw up the length of struct gnet_stats_basic on
3606 * some arch/bitsize combinations. Newer versions of Linux have a
3607 * struct gnet_stats_basic_packed, but we can't depend on that. The
3608 * easiest thing to do is just to make a copy. */
3609 memset(&gsb, 0, sizeof gsb);
3610 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3611 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3612 stats->tx_bytes = gsb.bytes;
3613 stats->tx_packets = gsb.packets;
3615 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3616 stats->tx_errors = gsq->drops;
3626 memset(stats, 0, sizeof *stats);
3631 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev', storing the kernel's reply in '*replyp'. */
3634 tc_query_class(const struct netdev *netdev,
3635 unsigned int handle, unsigned int parent,
3636 struct ofpbuf **replyp)
3638 struct ofpbuf request;
3639 struct tcmsg *tcmsg;
/* Build an RTM_GETTCLASS request (with NLM_F_ECHO) for the class. */
3642 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3646 tcmsg->tcm_handle = handle;
3647 tcmsg->tcm_parent = parent;
3649 error = tc_transact(&request, replyp);
/* Name the device, class and parent in the rate-limited warning. */
3651 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3652 netdev_get_name(netdev),
3653 tc_get_major(handle), tc_get_minor(handle),
3654 tc_get_major(parent), tc_get_minor(parent),
3660 /* Equivalent to "tc class del dev <name> handle <handle>".
 *
 * Sends an RTM_DELTCLASS request for 'handle' with parent 0 and logs a
 * rate-limited warning naming the device and class if it fails. */
3662 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3664 struct ofpbuf request;
3665 struct tcmsg *tcmsg;
3668 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3672 tcmsg->tcm_handle = handle;
3673 tcmsg->tcm_parent = 0;
3675 error = tc_transact(&request, NULL);
3677 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3678 netdev_get_name(netdev),
3679 tc_get_major(handle), tc_get_minor(handle),
3685 /* Equivalent to "tc qdisc del dev <name> root".
 *
 * Sends an RTM_DELQDISC request for handle 1:0 with parent TC_H_ROOT.  When
 * there is no error and the netdev_dev has a tc attached, that tc is
 * destroyed through its tc_destroy hook (if it has one) and the pointer is
 * cleared. */
3687 tc_del_qdisc(struct netdev *netdev)
3689 struct netdev_dev_linux *netdev_dev =
3690 netdev_dev_linux_cast(netdev_get_dev(netdev));
3691 struct ofpbuf request;
3692 struct tcmsg *tcmsg;
3695 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3699 tcmsg->tcm_handle = tc_make_handle(1, 0);
3700 tcmsg->tcm_parent = TC_H_ROOT;
3702 error = tc_transact(&request, NULL);
3703 if (error == EINVAL) {
3704 /* EINVAL probably means that the default qdisc was in use, in which
3705 * case we've accomplished our purpose. */
3708 if (!error && netdev_dev->tc) {
/* Not every tc_ops table provides a tc_destroy hook. */
3709 if (netdev_dev->tc->ops->tc_destroy) {
3710 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3712 netdev_dev->tc = NULL;
3717 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3718 * kernel to determine what they are. Returns 0 if successful, otherwise a
3719 * positive errno value.
 *
 * The qdisc with handle 1:0 is requested.  Its kind selects the tc_ops to
 * instantiate via tc_lookup_linux_name(); tc_ops_other, tc_ops_default and
 * tc_ops_other again are the choices visible below for the remaining
 * outcomes.  The selected ops' tc_load hook then builds the in-memory
 * state. */
3721 tc_query_qdisc(const struct netdev *netdev)
3723 struct netdev_dev_linux *netdev_dev =
3724 netdev_dev_linux_cast(netdev_get_dev(netdev));
3725 struct ofpbuf request, *qdisc;
3726 const struct tc_ops *ops;
3727 struct tcmsg *tcmsg;
3731 if (netdev_dev->tc) {
3735 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3736 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3737 * 2.6.35 without that fix backported to it.
3739 * To avoid the OOPS, we must not make a request that would attempt to dump
3740 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3741 * few others. There are a few ways that I can see to do this, but most of
3742 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3743 * technique chosen here is to assume that any non-default qdisc that we
3744 * create will have a class with handle 1:0. The built-in qdiscs only have
3745 * a class with handle 0:0.
3747 * We could check for Linux 2.6.35+ and use a more straightforward method
3749 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3753 tcmsg->tcm_handle = tc_make_handle(1, 0);
3754 tcmsg->tcm_parent = 0;
3756 /* Figure out what tc class to instantiate. */
3757 error = tc_transact(&request, &qdisc);
3761 error = tc_parse_qdisc(qdisc, &kind, NULL);
3763 ops = &tc_ops_other;
3765 ops = tc_lookup_linux_name(kind);
3767 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3768 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3770 ops = &tc_ops_other;
3773 } else if (error == ENOENT) {
3774 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3775 * other entity that doesn't have a handle 1:0. We will assume
3776 * that it's the system default qdisc. */
3777 ops = &tc_ops_default;
3780 /* Who knows? Maybe the device got deleted. */
3781 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3782 netdev_get_name(netdev), strerror(error));
3783 ops = &tc_ops_other;
3786 /* Instantiate it. */
3787 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
/* A tc_load hook must attach a tc exactly when it reports success. */
3788 assert((load_error == 0) == (netdev_dev->tc != NULL));
3789 ofpbuf_delete(qdisc);
/* A query error takes precedence over a load error. */
3791 return error ? error : load_error;
3794 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3795 approximate the time to transmit packets of various lengths. For an MTU of
3796 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3797 represents two possible packet lengths; for an MTU of 513 through 1024, four
3798 possible lengths; and so on.

   ETH_HEADER_LEN + VLAN_HEADER_LEN is added to 'mtu' before the shift count
   is computed.

3800 Returns, for the specified 'mtu', the number of bits that packet lengths
3801 need to be shifted right to fit within such a 256-entry table. */
3803 tc_calc_cell_log(unsigned int mtu)
3808 mtu = ETH_PAYLOAD_MAX;
3810 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3812 for (cell_log = 0; mtu >= 256; cell_log++) {
3819 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'. */
3822 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
/* Start from an all-zero ratespec so that fields which are not assigned
 * below (see the commented-out ones) are 0. */
3824 memset(rate, 0, sizeof *rate);
3825 rate->cell_log = tc_calc_cell_log(mtu);
3826 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3827 /* rate->cell_align = 0; */ /* distro headers. */
3828 rate->mpu = ETH_TOTAL_MIN;
3832 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3833 * attribute of the specified "type".
 *
 * The attribute is TC_RTAB_SIZE bytes long.  Entry 'i' holds the number of
 * ticks needed to send a packet of (i + 1) << rate->cell_log bytes at
 * rate->rate, with the packet size raised to at least rate->mpu.
 *
3835 * See tc_calc_cell_log() above for a description of "rtab"s. */
3837 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3842 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3843 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3844 unsigned packet_size = (i + 1) << rate->cell_log;
3845 if (packet_size < rate->mpu) {
3846 packet_size = rate->mpu;
3848 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3852 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3853 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3854 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
3857 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* The burst is never smaller than one jiffy's worth of data at 'Bps' plus
 * one MTU; the result is expressed in ticks. */
3859 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
3860 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3864 /* Utility functions. */
/* Retrieves link statistics for the interface with index 'ifindex' into
 * 'stats'.
 *
 * Sends an RTM_GETLINK request on rtnl_sock and copies the counters of the
 * reply's IFLA_STATS attribute (a struct rtnl_link_stats) into 'stats' field
 * by field.  The reply is freed on every path shown here, and a rate-limited
 * warning is logged if the reply has no IFLA_STATS attribute. */
3867 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3869 /* Policy for RTNLGRP_LINK messages.
3871 * There are *many* more fields in these messages, but currently we only
3872 * care about these fields. */
3873 static const struct nl_policy rtnlgrp_link_policy[] = {
3874 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3875 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3876 .min_len = sizeof(struct rtnl_link_stats) },
3879 struct ofpbuf request;
3880 struct ofpbuf *reply;
3881 struct ifinfomsg *ifi;
3882 const struct rtnl_link_stats *rtnl_stats;
3883 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
3886 ofpbuf_init(&request, 0);
3887 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3888 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3889 ifi->ifi_family = PF_UNSPEC;
3890 ifi->ifi_index = ifindex;
3891 error = nl_sock_transact(rtnl_sock, &request, &reply);
3892 ofpbuf_uninit(&request);
3897 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3898 rtnlgrp_link_policy,
3899 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3900 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its presence is checked
 * separately. */
3904 if (!attrs[IFLA_STATS]) {
3905 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3906 ofpbuf_delete(reply);
3910 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3911 stats->rx_packets = rtnl_stats->rx_packets;
3912 stats->tx_packets = rtnl_stats->tx_packets;
3913 stats->rx_bytes = rtnl_stats->rx_bytes;
3914 stats->tx_bytes = rtnl_stats->tx_bytes;
3915 stats->rx_errors = rtnl_stats->rx_errors;
3916 stats->tx_errors = rtnl_stats->tx_errors;
3917 stats->rx_dropped = rtnl_stats->rx_dropped;
3918 stats->tx_dropped = rtnl_stats->tx_dropped;
3919 stats->multicast = rtnl_stats->multicast;
3920 stats->collisions = rtnl_stats->collisions;
3921 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3922 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3923 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3924 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3925 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3926 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3927 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3928 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3929 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3930 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3931 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3933 ofpbuf_delete(reply);
/* Scans /proc/net/dev line by line, parsing each line's counters directly
 * into 'stats', and looks for the line whose device name equals
 * 'netdev_name'.
 *
 * NOTE(review): several lines of this function (local declarations, the
 * update of the line counter 'ln', the start of the sscanf() call and the
 * return statements) are not part of this listing.  Presumably the function
 * returns as soon as the matching line has been handled -- confirm. */
3939 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3941 static const char fn[] = "/proc/net/dev";
/* Open the statistics file for reading. */
3946 stream = fopen(fn, "r");
/* Report why the file could not be opened. */
3948 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3953 while (fgets(line, sizeof line, stream)) {
/* X64 scans one uint64_t counter.  Each of the two visible format rows reads
 * seven counters and skips an eighth field with "%*u".  sscanf() must report
 * 15 conversions (presumably the device name plus the fourteen counters) for
 * the line to count as well formed. */
3956 #define X64 "%"SCNu64
3959 X64 X64 X64 X64 X64 X64 X64 "%*u"
3960 X64 X64 X64 X64 X64 X64 X64 "%*u",
3966 &stats->rx_fifo_errors,
3967 &stats->rx_frame_errors,
3973 &stats->tx_fifo_errors,
3975 &stats->tx_carrier_errors) != 15) {
3976 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
3977 } else if (!strcmp(devname, netdev_name)) {
/* These seven fields are not among the sscanf() targets visible above;
 * presumably UINT64_MAX marks them as unavailable from this file --
 * confirm against the consumers of 'struct netdev_stats'. */
3978 stats->rx_length_errors = UINT64_MAX;
3979 stats->rx_over_errors = UINT64_MAX;
3980 stats->rx_crc_errors = UINT64_MAX;
3981 stats->rx_missed_errors = UINT64_MAX;
3982 stats->tx_aborted_errors = UINT64_MAX;
3983 stats->tx_heartbeat_errors = UINT64_MAX;
3984 stats->tx_window_errors = UINT64_MAX;
/* Reported when no line for 'netdev_name' was found. */
3990 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'netdev' with a SIOCGIFFLAGS request and
 * stores 'ifr.ifr_flags' in '*flags'.
 *
 * NOTE(review): no test of 'error' is visible between the request and the
 * store, so '*flags' appears to be written even when the request fails --
 * confirm against the full function. */
3996 get_flags(const struct netdev *netdev, int *flags)
/* netdev_linux_do_ioctl() copies the device name into 'ifr' itself. */
4001 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4003 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with a SIOCSIFFLAGS
 * request and returns whatever netdev_linux_do_ioctl() returns. */
4008 set_flags(struct netdev *netdev, int flags)
/* Only the flags need to be filled in here; netdev_linux_do_ioctl() copies
 * the device name into 'ifr' itself. */
4012 ifr.ifr_flags = flags;
4013 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Asks for the interface index of 'netdev_name' with ioctl(SIOCGIFINDEX) on
 * 'af_inet_sock' and returns 'ifr.ifr_ifindex' when the ioctl succeeds.  A
 * failed ioctl is logged; the value returned in that case is not part of
 * this listing. */
4018 do_get_ifindex(const char *netdev_name)
/* NOTE(review): strncpy() leaves 'ifr.ifr_name' without a terminating NUL
 * when 'netdev_name' is at least sizeof ifr.ifr_name bytes long, and no
 * length check is visible here -- confirm that callers bound the name. */
4022 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
/* Count this lookup in the netdev_get_ifindex coverage counter. */
4023 COVERAGE_INC(netdev_get_ifindex);
4024 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4025 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4026 netdev_name, strerror(errno));
4029 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp'.  The value cached
 * in the device's netdev_dev_linux is used when VALID_IFINDEX is set in its
 * 'cache_valid'; otherwise the index is looked up with do_get_ifindex() and
 * cached. */
4033 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4035 struct netdev_dev_linux *netdev_dev =
4036 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Cache miss: ask the kernel by device name. */
4038 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4039 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* NOTE(review): any check of 'ifindex' for a failed lookup before it is
 * cached is not part of this listing -- confirm against the full function. */
4043 netdev_dev->cache_valid |= VALID_IFINDEX;
4044 netdev_dev->ifindex = ifindex;
/* Hand back the cached value. */
4046 *ifindexp = netdev_dev->ifindex;
/* Retrieves the hardware address of 'netdev_name' with ioctl(SIOCGIFHWADDR)
 * on 'af_inet_sock' and copies its first ETH_ADDR_LEN bytes into 'ea'.
 *
 * NOTE(review): the return statements are not part of this listing. */
4051 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
/* Start from an all-zero request, then fill in the device name.
 * NOTE(review): strncpy() with the full buffer size overwrites the last
 * zeroed byte when 'netdev_name' is at least sizeof ifr.ifr_name bytes long,
 * leaving the name unterminated -- confirm that callers bound the name. */
4056 memset(&ifr, 0, sizeof ifr);
4057 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
/* Count this call in the netdev_get_hwaddr coverage counter. */
4058 COVERAGE_INC(netdev_get_hwaddr);
4059 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4060 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4061 netdev_name, strerror(errno));
/* Any family other than AF_UNSPEC or ARPHRD_ETHER is reported with a
 * warning. */
4064 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4065 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4066 VLOG_WARN("%s device has unknown hardware address family %d",
4067 netdev_name, hwaddr_family);
/* The address bytes sit at the start of 'sa_data'. */
4069 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of 'netdev_name' to the ETH_ADDR_LEN bytes at
 * 'mac', tagged with address family 'hwaddr_family', using
 * ioctl(SIOCSIFHWADDR) on 'af_inet_sock'.  A failed ioctl is logged at error
 * level.
 *
 * NOTE(review): the return statements are not part of this listing. */
4074 set_etheraddr(const char *netdev_name, int hwaddr_family,
4075 const uint8_t mac[ETH_ADDR_LEN])
/* Start from an all-zero request, then fill in name, family and address.
 * NOTE(review): strncpy() with the full buffer size leaves the name
 * unterminated when 'netdev_name' is at least sizeof ifr.ifr_name bytes
 * long -- confirm that callers bound the name. */
4079 memset(&ifr, 0, sizeof ifr);
4080 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4081 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4082 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
/* Count this call in the netdev_set_hwaddr coverage counter. */
4083 COVERAGE_INC(netdev_set_hwaddr);
4084 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4085 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4086 netdev_name, strerror(errno));
/* Issues the ethtool request in 'ecmd' for device 'name' through
 * ioctl(SIOCETHTOOL) on 'af_inet_sock'.  'cmd_name' appears only in the
 * warning logged for a failed request.
 *
 * NOTE(review): 'cmd' is not used on any line of this listing; presumably it
 * is stored into 'ecmd' before the ioctl -- confirm.  The return statements
 * are not part of this listing either. */
4093 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4094 int cmd, const char *cmd_name)
/* Start from an all-zero request and fill in the device name. */
4098 memset(&ifr, 0, sizeof ifr);
4099 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* The ioctl receives the ethtool structure through 'ifr_data'. */
4100 ifr.ifr_data = (caddr_t) ecmd;
/* Count this call in the netdev_ethtool coverage counter. */
4103 COVERAGE_INC(netdev_ethtool);
/* ioctl() returning zero is the success case. */
4104 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
/* Only failures other than EOPNOTSUPP are logged. */
4107 if (errno != EOPNOTSUPP) {
4108 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4109 "failed: %s", cmd_name, name, strerror(errno));
4111 /* The device doesn't support this operation. That's pretty
4112 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and runs ioctl request 'cmd' on
 * 'af_inet_sock' with 'ifr' as its argument.  'cmd_name' is the request's
 * name for the debug message logged when ioctl() returns -1.
 *
 * Apart from the name, 'ifr' is passed to the kernel exactly as the caller
 * prepared it.
 *
 * NOTE(review): the return statements are not part of this listing. */
4119 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4120 const char *cmd_name)
/* NOTE(review): strncpy() leaves 'ifr->ifr_name' unterminated when 'name' is
 * at least sizeof ifr->ifr_name bytes long -- confirm that callers bound the
 * name. */
4122 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4123 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4124 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4132 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4133 int cmd, const char *cmd_name)
4138 ifr.ifr_addr.sa_family = AF_INET;
4139 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4141 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4142 *ip = sin->sin_addr;