2 * Copyright (c) 2009, 2010 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/pkt_sched.h>
29 #include <linux/rtnetlink.h>
30 #include <linux/sockios.h>
31 #include <linux/version.h>
32 #include <sys/types.h>
33 #include <sys/ioctl.h>
34 #include <sys/socket.h>
35 #include <netpacket/packet.h>
36 #include <net/ethernet.h>
38 #include <linux/if_tunnel.h>
39 #include <net/if_arp.h>
40 #include <net/if_packet.h>
41 #include <net/route.h>
42 #include <netinet/in.h>
49 #include "dynamic-string.h"
50 #include "fatal-signal.h"
53 #include "netdev-provider.h"
54 #include "netdev-vport.h"
57 #include "openflow/openflow.h"
59 #include "poll-loop.h"
60 #include "rtnetlink.h"
61 #include "socket-util.h"
66 VLOG_DEFINE_THIS_MODULE(netdev_linux)
68 /* These were introduced in Linux 2.6.14, so they might be missing if we have
70 #ifndef ADVERTISED_Pause
71 #define ADVERTISED_Pause (1 << 13)
73 #ifndef ADVERTISED_Asym_Pause
74 #define ADVERTISED_Asym_Pause (1 << 14)
77 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
80 #define TC_RTAB_SIZE 1024
83 static struct rtnetlink_notifier netdev_linux_cache_notifier;
84 static int cache_notifier_refcount;
87 VALID_IFINDEX = 1 << 0,
88 VALID_ETHERADDR = 1 << 1,
92 VALID_CARRIER = 1 << 5,
93 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
94 VALID_POLICING = 1 << 7,
95 VALID_HAVE_VPORT_STATS = 1 << 8
103 /* Traffic control. */
105 /* An instance of a traffic control class. Always associated with a particular
108 * Each TC implementation subclasses this with whatever additional data it
111 const struct tc_ops *ops;
112 struct hmap queues; /* Contains "struct tc_queue"s.
113 * Read by generic TC layer.
114 * Written only by TC implementation. */
117 /* One traffic control queue.
119 * Each TC implementation subclasses this with whatever additional data it
122 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
123 unsigned int queue_id; /* OpenFlow queue ID. */
126 /* A particular kind of traffic control. Each implementation generally maps to
127 * one particular Linux qdisc class.
129 * The functions below return 0 if successful or a positive errno value on
130 * failure, except where otherwise noted. All of them must be provided, except
131 * where otherwise noted. */
133 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
134 * This is null for tc_ops_default and tc_ops_other, for which there are no
135 * appropriate values. */
136 const char *linux_name;
138 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
139 const char *ovs_name;
141 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
142 * queues. The queues are numbered 0 through n_queues - 1. */
143 unsigned int n_queues;
145 /* Called to install this TC class on 'netdev'. The implementation should
146 * make the Netlink calls required to set up 'netdev' with the right qdisc
147 * and configure it according to 'details'. The implementation may assume
148 * that the current qdisc is the default; that is, there is no need for it
149 * to delete the current qdisc before installing itself.
151 * The contents of 'details' should be documented as valid for 'ovs_name'
152 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
153 * (which is built as ovs-vswitchd.conf.db(8)).
155 * This function must return 0 if and only if it sets 'netdev->tc' to an
156 * initialized 'struct tc'.
158 * (This function is null for tc_ops_other, which cannot be installed. For
159 * other TC classes it should always be nonnull.) */
160 int (*tc_install)(struct netdev *netdev, const struct shash *details);
162 /* Called when the netdev code determines (through a Netlink query) that
163 * this TC class's qdisc is installed on 'netdev', but we didn't install
164 * it ourselves and so don't know any of the details.
166 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
167 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
168 * implementation should parse the other attributes of 'nlmsg' as
169 * necessary to determine its configuration. If necessary it should also
170 * use Netlink queries to determine the configuration of queues on
173 * This function must return 0 if and only if it sets 'netdev->tc' to an
174 * initialized 'struct tc'. */
175 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
177 /* Destroys the data structures allocated by the implementation as part of
178 * 'tc'. (This includes destroying 'tc->queues' by calling
181 * The implementation should not need to perform any Netlink calls. If
182 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
183 * (But it may not be desirable.)
185 * This function may be null if 'tc' is trivial. */
186 void (*tc_destroy)(struct tc *tc);
188 /* Retrieves details of 'netdev->tc' configuration into 'details'.
190 * The implementation should not need to perform any Netlink calls, because
191 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
192 * cached the configuration.
194 * The contents of 'details' should be documented as valid for 'ovs_name'
195 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
196 * (which is built as ovs-vswitchd.conf.db(8)).
198 * This function may be null if 'tc' is not configurable.
200 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
202 /* Reconfigures 'netdev->tc' according to 'details', performing any
203 * required Netlink calls to complete the reconfiguration.
205 * The contents of 'details' should be documented as valid for 'ovs_name'
206 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
207 * (which is built as ovs-vswitchd.conf.db(8)).
209 * This function may be null if 'tc' is not configurable.
211 int (*qdisc_set)(struct netdev *, const struct shash *details);
213 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
214 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "Queue" table in
218 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
220 * The implementation should not need to perform any Netlink calls, because
221 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
222 * cached the queue configuration.
224 * This function may be null if 'tc' does not have queues ('n_queues' is
226 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
227 struct shash *details);
229 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
230 * 'details', performing any required Netlink calls to complete the
231 * reconfiguration. The caller ensures that 'queue_id' is less than
234 * The contents of 'details' should be documented as valid for 'ovs_name'
235 * in the "other_config" column in the "Queue" table in
236 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
238 * This function may be null if 'tc' does not have queues or its queues are
239 * not configurable. */
240 int (*class_set)(struct netdev *, unsigned int queue_id,
241 const struct shash *details);
243 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
244 * tc_queue's within 'netdev->tc->queues'.
246 * This function may be null if 'tc' does not have queues or its queues
247 * cannot be deleted. */
248 int (*class_delete)(struct netdev *, struct tc_queue *queue);
250 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
251 * 'struct tc_queue's within 'netdev->tc->queues'.
253 * On success, initializes '*stats'.
255 * This function may be null if 'tc' does not have queues or if it cannot
256 * report queue statistics. */
257 int (*class_get_stats)(const struct netdev *netdev,
258 const struct tc_queue *queue,
259 struct netdev_queue_stats *stats);
261 /* Extracts queue stats from 'nlmsg', which is a response to a
262 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
264 * This function may be null if 'tc' does not have queues or if it cannot
265 * report queue statistics. */
266 int (*class_dump_stats)(const struct netdev *netdev,
267 const struct ofpbuf *nlmsg,
268 netdev_dump_queue_stats_cb *cb, void *aux);
272 tc_init(struct tc *tc, const struct tc_ops *ops)
275 hmap_init(&tc->queues);
279 tc_destroy(struct tc *tc)
281 hmap_destroy(&tc->queues);
284 static const struct tc_ops tc_ops_htb;
285 static const struct tc_ops tc_ops_default;
286 static const struct tc_ops tc_ops_other;
288 static const struct tc_ops *tcs[] = {
289 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
290 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
291 &tc_ops_other, /* Some other qdisc. */
295 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
296 static unsigned int tc_get_major(unsigned int handle);
297 static unsigned int tc_get_minor(unsigned int handle);
299 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
300 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
301 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
303 static struct tcmsg *tc_make_request(const struct netdev *, int type,
304 unsigned int flags, struct ofpbuf *);
305 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
307 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
308 struct nlattr **options);
309 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
310 struct nlattr **options,
311 struct netdev_queue_stats *);
312 static int tc_query_class(const struct netdev *,
313 unsigned int handle, unsigned int parent,
314 struct ofpbuf **replyp);
315 static int tc_delete_class(const struct netdev *, unsigned int handle);
317 static int tc_del_qdisc(struct netdev *netdev);
318 static int tc_query_qdisc(const struct netdev *netdev);
320 static int tc_calc_cell_log(unsigned int mtu);
321 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
322 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
323 const struct tc_ratespec *rate);
324 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
326 struct netdev_dev_linux {
327 struct netdev_dev netdev_dev;
329 struct shash_node *shash_node;
330 unsigned int cache_valid;
332 /* The following are figured out "on demand" only. They are only valid
333 * when the corresponding VALID_* bit in 'cache_valid' is set. */
335 uint8_t etheraddr[ETH_ADDR_LEN];
336 struct in_addr address, netmask;
340 bool is_internal; /* Is this an openvswitch internal device? */
341 bool is_tap; /* Is this a tuntap device? */
342 uint32_t kbits_rate; /* Policing data. */
343 uint32_t kbits_burst;
344 bool have_vport_stats;
348 struct tap_state tap;
352 struct netdev_linux {
353 struct netdev netdev;
357 /* An AF_INET socket (used for ioctl operations). */
358 static int af_inet_sock = -1;
360 /* A Netlink routing socket that is not subscribed to any multicast groups. */
361 static struct nl_sock *rtnl_sock;
363 struct netdev_linux_notifier {
364 struct netdev_notifier notifier;
368 static struct shash netdev_linux_notifiers =
369 SHASH_INITIALIZER(&netdev_linux_notifiers);
370 static struct rtnetlink_notifier netdev_linux_poll_notifier;
372 /* This is set pretty low because we probably won't learn anything from the
373 * additional log messages. */
374 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
376 static int netdev_linux_init(void);
378 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
379 int cmd, const char *cmd_name);
380 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
381 const char *cmd_name);
382 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
383 int cmd, const char *cmd_name);
384 static int get_flags(const struct netdev *, int *flagsp);
385 static int set_flags(struct netdev *, int flags);
386 static int do_get_ifindex(const char *netdev_name);
387 static int get_ifindex(const struct netdev *, int *ifindexp);
388 static int do_set_addr(struct netdev *netdev,
389 int ioctl_nr, const char *ioctl_name,
390 struct in_addr addr);
391 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
392 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
393 const uint8_t[ETH_ADDR_LEN]);
394 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
395 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
/* Reports whether 'netdev_class' uses netdev_linux_init() as its 'init' hook.
 * The cast helpers in this file use this as the test for "is a Linux netdev
 * class" before downcasting. */
398 is_netdev_linux_class(const struct netdev_class *netdev_class)
400 return netdev_class->init == netdev_linux_init;
/* Converts the generic 'netdev_dev' into the struct netdev_dev_linux that
 * embeds it (as member 'netdev_dev'), after asserting that the device's class
 * passes is_netdev_linux_class(). */
403 static struct netdev_dev_linux *
404 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
406 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
407 assert(is_netdev_linux_class(netdev_class));
409 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts the generic 'netdev' into the struct netdev_linux that embeds it
 * (as member 'netdev').  The class checked by the assertion is that of the
 * netdev's underlying device, obtained through netdev_get_dev(). */
412 static struct netdev_linux *
413 netdev_linux_cast(const struct netdev *netdev)
415 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
416 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
417 assert(is_netdev_linux_class(netdev_class));
419 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
423 netdev_linux_init(void)
425 static int status = -1;
427 /* Create AF_INET socket. */
428 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
429 status = af_inet_sock >= 0 ? 0 : errno;
431 VLOG_ERR("failed to create inet socket: %s", strerror(status));
434 /* Create rtnetlink socket. */
436 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
438 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
447 netdev_linux_run(void)
449 rtnetlink_notifier_run();
453 netdev_linux_wait(void)
455 rtnetlink_notifier_wait();
459 netdev_linux_cache_cb(const struct rtnetlink_change *change,
460 void *aux OVS_UNUSED)
462 struct netdev_dev_linux *dev;
464 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
466 const struct netdev_class *netdev_class =
467 netdev_dev_get_class(base_dev);
469 if (is_netdev_linux_class(netdev_class)) {
470 dev = netdev_dev_linux_cast(base_dev);
471 dev->cache_valid = 0;
475 struct shash device_shash;
476 struct shash_node *node;
478 shash_init(&device_shash);
479 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
480 SHASH_FOR_EACH (node, &device_shash) {
482 dev->cache_valid = 0;
484 shash_destroy(&device_shash);
488 /* Creates the netdev device of 'type' with 'name'. */
490 netdev_linux_create_system(const char *name, const char *type OVS_UNUSED,
491 const struct shash *args, struct netdev_dev **netdev_devp)
493 struct netdev_dev_linux *netdev_dev;
496 if (!shash_is_empty(args)) {
497 VLOG_WARN("%s: arguments for system devices should be empty", name);
500 if (!cache_notifier_refcount) {
501 error = rtnetlink_notifier_register(&netdev_linux_cache_notifier,
502 netdev_linux_cache_cb, NULL);
507 cache_notifier_refcount++;
509 netdev_dev = xzalloc(sizeof *netdev_dev);
510 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_linux_class);
512 *netdev_devp = &netdev_dev->netdev_dev;
516 /* For most types of netdevs we open the device for each call of
517 * netdev_open(). However, this is not the case with tap devices,
518 * since it is only possible to open the device once. In this
519 * situation we share a single file descriptor, and consequently
520 * buffers, across all readers. Therefore once data is read it will
521 * be unavailable to other reads for tap devices. */
523 netdev_linux_create_tap(const char *name, const char *type OVS_UNUSED,
524 const struct shash *args, struct netdev_dev **netdev_devp)
526 struct netdev_dev_linux *netdev_dev;
527 struct tap_state *state;
528 static const char tap_dev[] = "/dev/net/tun";
532 if (!shash_is_empty(args)) {
533 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
536 netdev_dev = xzalloc(sizeof *netdev_dev);
537 state = &netdev_dev->state.tap;
539 /* Open tap device. */
540 state->fd = open(tap_dev, O_RDWR);
543 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
547 /* Create tap device. */
548 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
549 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
550 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
551 VLOG_WARN("%s: creating tap device failed: %s", name,
557 /* Make non-blocking. */
558 error = set_nonblocking(state->fd);
563 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
564 *netdev_devp = &netdev_dev->netdev_dev;
573 destroy_tap(struct netdev_dev_linux *netdev_dev)
575 struct tap_state *state = &netdev_dev->state.tap;
577 if (state->fd >= 0) {
582 /* Destroys the netdev device 'netdev_dev_'. */
584 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
586 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
587 const char *type = netdev_dev_get_type(netdev_dev_);
589 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
590 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
593 if (!strcmp(type, "system")) {
594 cache_notifier_refcount--;
596 if (!cache_notifier_refcount) {
597 rtnetlink_notifier_unregister(&netdev_linux_cache_notifier);
599 } else if (!strcmp(type, "tap")) {
600 destroy_tap(netdev_dev);
607 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
608 struct netdev **netdevp)
610 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
611 struct netdev_linux *netdev;
612 enum netdev_flags flags;
615 /* Allocate network device. */
616 netdev = xzalloc(sizeof *netdev);
618 netdev_init(&netdev->netdev, netdev_dev_);
620 error = netdev_get_flags(&netdev->netdev, &flags);
621 if (error == ENODEV) {
625 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
626 !netdev_dev->state.tap.opened) {
628 /* We assume that the first user of the tap device is the primary user
629 * and give them the tap FD. Subsequent users probably just expect
630 * this to be a system device so open it normally to avoid send/receive
631 * directions appearing to be reversed. */
632 netdev->fd = netdev_dev->state.tap.fd;
633 netdev_dev->state.tap.opened = true;
634 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
635 struct sockaddr_ll sll;
639 /* Create file descriptor. */
640 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
641 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
643 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
644 if (netdev->fd < 0) {
649 /* Set non-blocking mode. */
650 error = set_nonblocking(netdev->fd);
655 /* Get ethernet device index. */
656 error = get_ifindex(&netdev->netdev, &ifindex);
661 /* Bind to specific ethernet device. */
662 memset(&sll, 0, sizeof sll);
663 sll.sll_family = AF_PACKET;
664 sll.sll_ifindex = ifindex;
666 (struct sockaddr *) &sll, sizeof sll) < 0) {
668 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
673 /* Between the socket() and bind() calls above, the socket receives all
674 * packets of the requested type on all system interfaces. We do not
675 * want to receive that data, but there is no way to avoid it. So we
676 * must now drain out the receive queue. */
677 error = drain_rcvbuf(netdev->fd);
683 *netdevp = &netdev->netdev;
687 netdev_uninit(&netdev->netdev, true);
691 /* Closes and destroys 'netdev'.
 *
 * The test below singles out a real descriptor on a device whose type is not
 * "tap".  Descriptor 0 is a valid descriptor, and the rest of this file treats
 * only negative values as "no descriptor" (see netdev_linux_recv()), so the
 * comparison is ">= 0" rather than "> 0". */
693 netdev_linux_close(struct netdev *netdev_)
695 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
697 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
703 /* Initializes 'svec' with a list of the names of all known network devices. */
705 netdev_linux_enumerate(struct svec *svec)
707 struct if_nameindex *names;
709 names = if_nameindex();
713 for (i = 0; names[i].if_name != NULL; i++) {
714 svec_add(svec, names[i].if_name);
716 if_freenameindex(names);
719 VLOG_WARN("could not obtain list of network device names: %s",
/* Receives a packet from 'netdev_' into the 'size'-byte buffer 'data' by
 * read()ing the netdev's file descriptor.  A negative descriptor means the
 * device was opened with NETDEV_ETH_TYPE_NONE, so there is nothing to read
 * from.  Read failures other than EINTR and EAGAIN are logged, rate-limited
 * by 'rl'. */
726 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
728 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
730 if (netdev->fd < 0) {
731 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
736 ssize_t retval = read(netdev->fd, data, size);
739 } else if (errno != EINTR) {
740 if (errno != EAGAIN) {
/* The format string expects the device name first and the error text
 * second, the same order netdev_linux_send() uses. */
741 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
742 netdev_get_name(netdev_), strerror(errno));
749 /* Registers with the poll loop to wake up from the next call to poll_block()
750 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
752 netdev_linux_recv_wait(struct netdev *netdev_)
754 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
755 if (netdev->fd >= 0) {
756 poll_fd_wait(netdev->fd, POLLIN);
760 /* Discards all packets waiting to be received from 'netdev'. */
762 netdev_linux_drain(struct netdev *netdev_)
764 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
765 if (netdev->fd < 0) {
767 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
769 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
770 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
774 drain_fd(netdev->fd, ifr.ifr_qlen);
777 return drain_rcvbuf(netdev->fd);
781 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
782 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
783 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
784 * the packet is too big or too small to transmit on the device.
786 * The caller retains ownership of 'buffer' in all cases.
788 * The kernel maintains a packet transmission queue, so the caller is not
789 * expected to do additional queuing of packets. */
791 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
793 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
795 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
797 if (netdev->fd < 0) {
802 ssize_t retval = write(netdev->fd, data, size);
804 /* The Linux AF_PACKET implementation never blocks waiting for room
805 * for packets, instead returning ENOBUFS. Translate this into
806 * EAGAIN for the caller. */
807 if (errno == ENOBUFS) {
809 } else if (errno == EINTR) {
811 } else if (errno != EAGAIN) {
812 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
813 netdev_get_name(netdev_), strerror(errno));
816 } else if (retval != size) {
817 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
818 "%zu) on %s", retval, size, netdev_get_name(netdev_));
826 /* Registers with the poll loop to wake up from the next call to poll_block()
827 * when the packet transmission queue has sufficient room to transmit a packet
828 * with netdev_send().
830 * The kernel maintains a packet transmission queue, so the client is not
831 * expected to do additional queuing of packets. Thus, this function is
832 * unlikely to ever be used. It is included for completeness. */
834 netdev_linux_send_wait(struct netdev *netdev_)
836 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
837 if (netdev->fd < 0) {
839 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
840 poll_fd_wait(netdev->fd, POLLOUT);
842 /* TAP device always accepts packets.*/
843 poll_immediate_wake();
847 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
848 * otherwise a positive errno value. */
850 netdev_linux_set_etheraddr(struct netdev *netdev_,
851 const uint8_t mac[ETH_ADDR_LEN])
853 struct netdev_dev_linux *netdev_dev =
854 netdev_dev_linux_cast(netdev_get_dev(netdev_));
857 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
858 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
859 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
861 netdev_dev->cache_valid |= VALID_ETHERADDR;
862 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
870 /* Copies 'netdev''s MAC address into the caller-provided 'mac' buffer of
871 * ETH_ADDR_LEN bytes. */
873 netdev_linux_get_etheraddr(const struct netdev *netdev_,
874 uint8_t mac[ETH_ADDR_LEN])
876 struct netdev_dev_linux *netdev_dev =
877 netdev_dev_linux_cast(netdev_get_dev(netdev_));
878 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
879 int error = get_etheraddr(netdev_get_name(netdev_),
880 netdev_dev->etheraddr);
884 netdev_dev->cache_valid |= VALID_ETHERADDR;
886 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
890 /* Stores in '*mtup' the maximum size of transmitted (and received) packets on
891 * 'netdev', in bytes, not including the hardware header; thus, this is
892 * typically 1500 bytes for Ethernet devices. */
894 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
896 struct netdev_dev_linux *netdev_dev =
897 netdev_dev_linux_cast(netdev_get_dev(netdev_));
898 if (!(netdev_dev->cache_valid & VALID_MTU)) {
902 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
903 SIOCGIFMTU, "SIOCGIFMTU");
907 netdev_dev->mtu = ifr.ifr_mtu;
908 netdev_dev->cache_valid |= VALID_MTU;
910 *mtup = netdev_dev->mtu;
914 /* Returns the ifindex of 'netdev', if successful, as a positive number.
915 * On failure, returns a negative errno value. */
917 netdev_linux_get_ifindex(const struct netdev *netdev)
921 error = get_ifindex(netdev, &ifindex);
922 return error ? -error : ifindex;
/* Stores the carrier state of 'netdev_' in '*carrier'.
 *
 * The state is cached in the device: unless VALID_CARRIER is already set in
 * 'cache_valid', the function opens /sys/class/net/<name>/carrier, reads it,
 * and requires the first character to be '0' or '1'.  Any value other than
 * '0' is recorded as carrier present, and VALID_CARRIER is then set.
 *
 * Failures to open or read the file, an empty file, and an unexpected first
 * character are logged, rate-limited by 'rl'.  NOTE(review): an EINVAL read
 * error appears to be exempt from logging -- confirm against the full
 * source. */
926 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
928 struct netdev_dev_linux *netdev_dev =
929 netdev_dev_linux_cast(netdev_get_dev(netdev_));
934 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
938 fn = xasprintf("/sys/class/net/%s/carrier",
939 netdev_get_name(netdev_));
940 fd = open(fn, O_RDONLY);
943 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
947 retval = read(fd, line, sizeof line);
950 if (error == EINVAL) {
951 /* This is the normal return value when we try to check carrier
952 * if the network device is not up. */
954 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
957 } else if (retval == 0) {
959 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
/* Only the first character of the file is examined. */
963 if (line[0] != '0' && line[0] != '1') {
965 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
969 netdev_dev->carrier = line[0] != '0';
970 netdev_dev->cache_valid |= VALID_CARRIER;
972 *carrier = netdev_dev->carrier;
983 /* Check whether we can use RTM_GETLINK to get network device statistics.
984 * In pre-2.6.19 kernels, this was only available if wireless extensions were
987 check_for_working_netlink_stats(void)
989 /* Decide on the netdev_get_stats() implementation to use. Netlink is
990 * preferable, so if that works, we'll use it. */
991 int ifindex = do_get_ifindex("lo");
993 VLOG_WARN("failed to get ifindex for lo, "
994 "obtaining netdev stats from proc");
997 struct netdev_stats stats;
998 int error = get_stats_via_netlink(ifindex, &stats);
1000 VLOG_DBG("obtaining netdev stats via rtnetlink");
1003 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1004 "via proc (you are probably running a pre-2.6.19 "
1005 "kernel)", strerror(error));
1011 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1013 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1015 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1016 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1017 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1019 netdev_dev->is_tap = !strcmp(type, "tap");
1020 netdev_dev->is_internal = false;
1021 if (!netdev_dev->is_tap) {
1022 struct ethtool_drvinfo drvinfo;
1025 memset(&drvinfo, 0, sizeof drvinfo);
1026 error = netdev_linux_do_ethtool(name,
1027 (struct ethtool_cmd *)&drvinfo,
1029 "ETHTOOL_GDRVINFO");
1031 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1032 netdev_dev->is_internal = true;
1036 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1041 swap_uint64(uint64_t *a, uint64_t *b)
1048 /* Retrieves current device stats for 'netdev'. */
1050 netdev_linux_get_stats(const struct netdev *netdev_,
1051 struct netdev_stats *stats)
1053 struct netdev_dev_linux *netdev_dev =
1054 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1055 static int use_netlink_stats = -1;
1058 COVERAGE_INC(netdev_get_stats);
1060 if (netdev_dev->have_vport_stats ||
1061 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1063 error = netdev_vport_get_stats(netdev_, stats);
1064 netdev_dev->have_vport_stats = !error;
1065 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1068 if (!netdev_dev->have_vport_stats) {
1069 if (use_netlink_stats < 0) {
1070 use_netlink_stats = check_for_working_netlink_stats();
1072 if (use_netlink_stats) {
1075 error = get_ifindex(netdev_, &ifindex);
1077 error = get_stats_via_netlink(ifindex, stats);
1080 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1084 /* If this port is an internal port then the transmit and receive stats
1085 * will appear to be swapped relative to the other ports since we are the
1086 * one sending the data, not a remote computer. For consistency, we swap
1087 * them back here. This does not apply if we are getting stats from the
1088 * vport layer because it always tracks stats from the perspective of the
1090 netdev_linux_update_is_pseudo(netdev_dev);
1091 if (!error && !netdev_dev->have_vport_stats &&
1092 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1093 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1094 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1095 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1096 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1097 stats->rx_length_errors = 0;
1098 stats->rx_over_errors = 0;
1099 stats->rx_crc_errors = 0;
1100 stats->rx_frame_errors = 0;
1101 stats->rx_fifo_errors = 0;
1102 stats->rx_missed_errors = 0;
1103 stats->tx_aborted_errors = 0;
1104 stats->tx_carrier_errors = 0;
1105 stats->tx_fifo_errors = 0;
1106 stats->tx_heartbeat_errors = 0;
1107 stats->tx_window_errors = 0;
1113 /* Stores the features supported by 'netdev' into each of '*current',
1114 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1115 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1116 * successful, otherwise a positive errno value. */
1118 netdev_linux_get_features(struct netdev *netdev,
1119 uint32_t *current, uint32_t *advertised,
1120 uint32_t *supported, uint32_t *peer)
1122 struct ethtool_cmd ecmd;
1125 memset(&ecmd, 0, sizeof ecmd);
1126 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1127 ETHTOOL_GSET, "ETHTOOL_GSET");
1132 /* Supported features. */
1134 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1135 *supported |= OFPPF_10MB_HD;
1137 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1138 *supported |= OFPPF_10MB_FD;
1140 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1141 *supported |= OFPPF_100MB_HD;
1143 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1144 *supported |= OFPPF_100MB_FD;
1146 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1147 *supported |= OFPPF_1GB_HD;
1149 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1150 *supported |= OFPPF_1GB_FD;
1152 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1153 *supported |= OFPPF_10GB_FD;
1155 if (ecmd.supported & SUPPORTED_TP) {
1156 *supported |= OFPPF_COPPER;
1158 if (ecmd.supported & SUPPORTED_FIBRE) {
1159 *supported |= OFPPF_FIBER;
1161 if (ecmd.supported & SUPPORTED_Autoneg) {
1162 *supported |= OFPPF_AUTONEG;
1164 if (ecmd.supported & SUPPORTED_Pause) {
1165 *supported |= OFPPF_PAUSE;
1167 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1168 *supported |= OFPPF_PAUSE_ASYM;
1171 /* Advertised features. */
1173 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1174 *advertised |= OFPPF_10MB_HD;
1176 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1177 *advertised |= OFPPF_10MB_FD;
1179 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1180 *advertised |= OFPPF_100MB_HD;
1182 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1183 *advertised |= OFPPF_100MB_FD;
1185 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1186 *advertised |= OFPPF_1GB_HD;
1188 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1189 *advertised |= OFPPF_1GB_FD;
1191 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1192 *advertised |= OFPPF_10GB_FD;
1194 if (ecmd.advertising & ADVERTISED_TP) {
1195 *advertised |= OFPPF_COPPER;
1197 if (ecmd.advertising & ADVERTISED_FIBRE) {
1198 *advertised |= OFPPF_FIBER;
1200 if (ecmd.advertising & ADVERTISED_Autoneg) {
1201 *advertised |= OFPPF_AUTONEG;
1203 if (ecmd.advertising & ADVERTISED_Pause) {
1204 *advertised |= OFPPF_PAUSE;
1206 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1207 *advertised |= OFPPF_PAUSE_ASYM;
1210 /* Current settings. */
1211 if (ecmd.speed == SPEED_10) {
1212 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1213 } else if (ecmd.speed == SPEED_100) {
1214 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1215 } else if (ecmd.speed == SPEED_1000) {
1216 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1217 } else if (ecmd.speed == SPEED_10000) {
1218 *current = OFPPF_10GB_FD;
1223 if (ecmd.port == PORT_TP) {
1224 *current |= OFPPF_COPPER;
1225 } else if (ecmd.port == PORT_FIBRE) {
1226 *current |= OFPPF_FIBER;
1230 *current |= OFPPF_AUTONEG;
1233 /* Peer advertisements. */
1234 *peer = 0; /* XXX */
1239 /* Set the features advertised by 'netdev' to 'advertise'. */
1241 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1243 struct ethtool_cmd ecmd;
1246 memset(&ecmd, 0, sizeof ecmd);
1247 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1248 ETHTOOL_GSET, "ETHTOOL_GSET");
1253 ecmd.advertising = 0;
1254 if (advertise & OFPPF_10MB_HD) {
1255 ecmd.advertising |= ADVERTISED_10baseT_Half;
1257 if (advertise & OFPPF_10MB_FD) {
1258 ecmd.advertising |= ADVERTISED_10baseT_Full;
1260 if (advertise & OFPPF_100MB_HD) {
1261 ecmd.advertising |= ADVERTISED_100baseT_Half;
1263 if (advertise & OFPPF_100MB_FD) {
1264 ecmd.advertising |= ADVERTISED_100baseT_Full;
1266 if (advertise & OFPPF_1GB_HD) {
1267 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1269 if (advertise & OFPPF_1GB_FD) {
1270 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1272 if (advertise & OFPPF_10GB_FD) {
1273 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1275 if (advertise & OFPPF_COPPER) {
1276 ecmd.advertising |= ADVERTISED_TP;
1278 if (advertise & OFPPF_FIBER) {
1279 ecmd.advertising |= ADVERTISED_FIBRE;
1281 if (advertise & OFPPF_AUTONEG) {
1282 ecmd.advertising |= ADVERTISED_Autoneg;
1284 if (advertise & OFPPF_PAUSE) {
1285 ecmd.advertising |= ADVERTISED_Pause;
1287 if (advertise & OFPPF_PAUSE_ASYM) {
1288 ecmd.advertising |= ADVERTISED_Asym_Pause;
1290 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1291 ETHTOOL_SSET, "ETHTOOL_SSET");
1294 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1295 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1296 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1297 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1298 * sets '*vlan_vid' to -1. */
1300 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1302 const char *netdev_name = netdev_get_name(netdev);
1303 struct ds line = DS_EMPTY_INITIALIZER;
1304 FILE *stream = NULL;
1308 COVERAGE_INC(netdev_get_vlan_vid);
1309 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1310 stream = fopen(fn, "r");
1316 if (ds_get_line(&line, stream)) {
1317 if (ferror(stream)) {
1319 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1322 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1327 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1329 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1330 fn, ds_cstr(&line));
1348 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1349 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
1351 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1352 * positive errno value.
1354 * This function is equivalent to running
1355 * /sbin/tc qdisc del dev %s handle ffff: ingress
1356 * but it is much, much faster.
1359 netdev_linux_remove_policing(struct netdev *netdev)
1361 struct netdev_dev_linux *netdev_dev =
1362 netdev_dev_linux_cast(netdev_get_dev(netdev));
1363 const char *netdev_name = netdev_get_name(netdev);
1365 struct ofpbuf request;
1366 struct tcmsg *tcmsg;
1369 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1370 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1371 tcmsg->tcm_parent = TC_H_INGRESS;
1372 nl_msg_put_string(&request, TCA_KIND, "ingress");
1373 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1375 error = tc_transact(&request, NULL);
1376 if (error && error != ENOENT && error != EINVAL) {
1377 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1378 netdev_name, strerror(error));
1382 netdev_dev->kbits_rate = 0;
1383 netdev_dev->kbits_burst = 0;
1384 netdev_dev->cache_valid |= VALID_POLICING;
1388 /* Attempts to set input rate limiting (policing) policy. */
1390 netdev_linux_set_policing(struct netdev *netdev,
1391 uint32_t kbits_rate, uint32_t kbits_burst)
1393 struct netdev_dev_linux *netdev_dev =
1394 netdev_dev_linux_cast(netdev_get_dev(netdev));
1395 const char *netdev_name = netdev_get_name(netdev);
1398 COVERAGE_INC(netdev_set_policing);
1400 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1401 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1402 : kbits_burst); /* Stick with user-specified value. */
1404 if (netdev_dev->cache_valid & VALID_POLICING
1405 && netdev_dev->kbits_rate == kbits_rate
1406 && netdev_dev->kbits_burst == kbits_burst) {
1407 /* Assume that settings haven't changed since we last set them. */
1411 netdev_linux_remove_policing(netdev);
1413 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1414 if (system(command) != 0) {
1415 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1419 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1420 kbits_rate, kbits_burst);
1421 if (system(command) != 0) {
1422 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1427 netdev_dev->kbits_rate = kbits_rate;
1428 netdev_dev->kbits_burst = kbits_burst;
1429 netdev_dev->cache_valid |= VALID_POLICING;
1436 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1439 const struct tc_ops **opsp;
1441 for (opsp = tcs; *opsp != NULL; opsp++) {
1442 const struct tc_ops *ops = *opsp;
1443 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1444 svec_add(types, ops->ovs_name);
1450 static const struct tc_ops *
1451 tc_lookup_ovs_name(const char *name)
1453 const struct tc_ops **opsp;
1455 for (opsp = tcs; *opsp != NULL; opsp++) {
1456 const struct tc_ops *ops = *opsp;
1457 if (!strcmp(name, ops->ovs_name)) {
1464 static const struct tc_ops *
1465 tc_lookup_linux_name(const char *name)
1467 const struct tc_ops **opsp;
1469 for (opsp = tcs; *opsp != NULL; opsp++) {
1470 const struct tc_ops *ops = *opsp;
1471 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1478 static struct tc_queue *
1479 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1482 struct netdev_dev_linux *netdev_dev =
1483 netdev_dev_linux_cast(netdev_get_dev(netdev));
1484 struct tc_queue *queue;
1486 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1487 if (queue->queue_id == queue_id) {
1494 static struct tc_queue *
1495 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1497 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
1501 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1503 struct netdev_qos_capabilities *caps)
1505 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1509 caps->n_queues = ops->n_queues;
1514 netdev_linux_get_qos(const struct netdev *netdev,
1515 const char **typep, struct shash *details)
1517 struct netdev_dev_linux *netdev_dev =
1518 netdev_dev_linux_cast(netdev_get_dev(netdev));
1521 error = tc_query_qdisc(netdev);
1526 *typep = netdev_dev->tc->ops->ovs_name;
1527 return (netdev_dev->tc->ops->qdisc_get
1528 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1533 netdev_linux_set_qos(struct netdev *netdev,
1534 const char *type, const struct shash *details)
1536 struct netdev_dev_linux *netdev_dev =
1537 netdev_dev_linux_cast(netdev_get_dev(netdev));
1538 const struct tc_ops *new_ops;
1541 new_ops = tc_lookup_ovs_name(type);
1542 if (!new_ops || !new_ops->tc_install) {
1546 error = tc_query_qdisc(netdev);
1551 if (new_ops == netdev_dev->tc->ops) {
1552 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1554 /* Delete existing qdisc. */
1555 error = tc_del_qdisc(netdev);
1559 assert(netdev_dev->tc == NULL);
1561 /* Install new qdisc. */
1562 error = new_ops->tc_install(netdev, details);
1563 assert((error == 0) == (netdev_dev->tc != NULL));
1570 netdev_linux_get_queue(const struct netdev *netdev,
1571 unsigned int queue_id, struct shash *details)
1573 struct netdev_dev_linux *netdev_dev =
1574 netdev_dev_linux_cast(netdev_get_dev(netdev));
1577 error = tc_query_qdisc(netdev);
1581 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1583 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1589 netdev_linux_set_queue(struct netdev *netdev,
1590 unsigned int queue_id, const struct shash *details)
1592 struct netdev_dev_linux *netdev_dev =
1593 netdev_dev_linux_cast(netdev_get_dev(netdev));
1596 error = tc_query_qdisc(netdev);
1599 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1600 || !netdev_dev->tc->ops->class_set) {
1604 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1608 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1610 struct netdev_dev_linux *netdev_dev =
1611 netdev_dev_linux_cast(netdev_get_dev(netdev));
1614 error = tc_query_qdisc(netdev);
1617 } else if (!netdev_dev->tc->ops->class_delete) {
1620 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1622 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1628 netdev_linux_get_queue_stats(const struct netdev *netdev,
1629 unsigned int queue_id,
1630 struct netdev_queue_stats *stats)
1632 struct netdev_dev_linux *netdev_dev =
1633 netdev_dev_linux_cast(netdev_get_dev(netdev));
1636 error = tc_query_qdisc(netdev);
1639 } else if (!netdev_dev->tc->ops->class_get_stats) {
1642 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1644 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1650 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1652 struct ofpbuf request;
1653 struct tcmsg *tcmsg;
1655 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1656 tcmsg->tcm_parent = 0;
1657 nl_dump_start(dump, rtnl_sock, &request);
1658 ofpbuf_uninit(&request);
1662 netdev_linux_dump_queues(const struct netdev *netdev,
1663 netdev_dump_queues_cb *cb, void *aux)
1665 struct netdev_dev_linux *netdev_dev =
1666 netdev_dev_linux_cast(netdev_get_dev(netdev));
1667 struct tc_queue *queue;
1668 struct shash details;
1672 error = tc_query_qdisc(netdev);
1675 } else if (!netdev_dev->tc->ops->class_get) {
1680 shash_init(&details);
1681 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1682 shash_clear(&details);
1684 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1686 (*cb)(queue->queue_id, &details, aux);
1691 shash_destroy(&details);
1697 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1698 netdev_dump_queue_stats_cb *cb, void *aux)
1700 struct netdev_dev_linux *netdev_dev =
1701 netdev_dev_linux_cast(netdev_get_dev(netdev));
1702 struct nl_dump dump;
1707 error = tc_query_qdisc(netdev);
1710 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1715 start_queue_dump(netdev, &dump);
1716 while (nl_dump_next(&dump, &msg)) {
1717 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1723 error = nl_dump_done(&dump);
1724 return error ? error : last_error;
1728 netdev_linux_get_in4(const struct netdev *netdev_,
1729 struct in_addr *address, struct in_addr *netmask)
1731 struct netdev_dev_linux *netdev_dev =
1732 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1734 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1737 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1738 SIOCGIFADDR, "SIOCGIFADDR");
1743 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1744 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1749 netdev_dev->cache_valid |= VALID_IN4;
1751 *address = netdev_dev->address;
1752 *netmask = netdev_dev->netmask;
1753 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1757 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1758 struct in_addr netmask)
1760 struct netdev_dev_linux *netdev_dev =
1761 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1764 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1766 netdev_dev->cache_valid |= VALID_IN4;
1767 netdev_dev->address = address;
1768 netdev_dev->netmask = netmask;
1769 if (address.s_addr != INADDR_ANY) {
1770 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1771 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line in the format of /proc/net/if_inet6: 32 hex digits
 * of IPv6 address, four hex fields that are skipped, and an interface name of
 * at most 16 characters.  Stores the address bytes in '*in6' and the
 * null-terminated name in 'ifname'.
 *
 * Returns true only if all 16 address bytes and the name were converted.  On
 * failure, '*in6' and 'ifname' may have been partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    /* 16 address bytes plus the interface name. */
    const int n_expected = 17;
    uint8_t *octet = in6->s6_addr;
    int n_converted;

#define X8 "%2"SCNx8
    n_converted = sscanf(line,
                         " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                         "%*x %*x %*x %*x %16s\n",
                         &octet[0], &octet[1], &octet[2], &octet[3],
                         &octet[4], &octet[5], &octet[6], &octet[7],
                         &octet[8], &octet[9], &octet[10], &octet[11],
                         &octet[12], &octet[13], &octet[14], &octet[15],
                         ifname);
    return n_converted == n_expected;
}
1793 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1794 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1796 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1798 struct netdev_dev_linux *netdev_dev =
1799 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1800 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1804 netdev_dev->in6 = in6addr_any;
1806 file = fopen("/proc/net/if_inet6", "r");
1808 const char *name = netdev_get_name(netdev_);
1809 while (fgets(line, sizeof line, file)) {
1810 struct in6_addr in6_tmp;
1811 char ifname[16 + 1];
1812 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1813 && !strcmp(name, ifname))
1815 netdev_dev->in6 = in6_tmp;
1821 netdev_dev->cache_valid |= VALID_IN6;
1823 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  All
 * bytes of '*sa' not occupied by the address are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Build the IPv4 form in a correctly typed temporary... */
    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    /* ...then copy it over a cleared generic sockaddr. */
    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
1841 do_set_addr(struct netdev *netdev,
1842 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1845 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1846 make_in4_sockaddr(&ifr.ifr_addr, addr);
1848 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1852 /* Adds 'router' as a default IP gateway. */
1854 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1856 struct in_addr any = { INADDR_ANY };
1860 memset(&rt, 0, sizeof rt);
1861 make_in4_sockaddr(&rt.rt_dst, any);
1862 make_in4_sockaddr(&rt.rt_gateway, router);
1863 make_in4_sockaddr(&rt.rt_genmask, any);
1864 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1865 COVERAGE_INC(netdev_add_router);
1866 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1868 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
1874 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1877 static const char fn[] = "/proc/net/route";
1882 *netdev_name = NULL;
1883 stream = fopen(fn, "r");
1884 if (stream == NULL) {
1885 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1890 while (fgets(line, sizeof line, stream)) {
1893 uint32_t dest, gateway, mask;
1894 int refcnt, metric, mtu;
1895 unsigned int flags, use, window, irtt;
1898 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1900 iface, &dest, &gateway, &flags, &refcnt,
1901 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1903 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
1907 if (!(flags & RTF_UP)) {
1908 /* Skip routes that aren't up. */
1912 /* The output of 'dest', 'mask', and 'gateway' were given in
1913 * network byte order, so we don't need need any endian
1914 * conversions here. */
1915 if ((dest & mask) == (host->s_addr & mask)) {
1917 /* The host is directly reachable. */
1918 next_hop->s_addr = 0;
1920 /* To reach the host, we must go through a gateway. */
1921 next_hop->s_addr = gateway;
1923 *netdev_name = xstrdup(iface);
1934 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
1935 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
1936 * returns 0. Otherwise, it returns a positive errno value; in particular,
1937 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
1939 netdev_linux_arp_lookup(const struct netdev *netdev,
1940 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
1943 struct sockaddr_in sin;
1946 memset(&r, 0, sizeof r);
1947 sin.sin_family = AF_INET;
1948 sin.sin_addr.s_addr = ip;
1950 memcpy(&r.arp_pa, &sin, sizeof sin);
1951 r.arp_ha.sa_family = ARPHRD_ETHER;
1953 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
1954 COVERAGE_INC(netdev_arp_lookup);
1955 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
1957 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
1958 } else if (retval != ENXIO) {
1959 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
1960 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
1966 nd_to_iff_flags(enum netdev_flags nd)
1969 if (nd & NETDEV_UP) {
1972 if (nd & NETDEV_PROMISC) {
1979 iff_to_nd_flags(int iff)
1981 enum netdev_flags nd = 0;
1985 if (iff & IFF_PROMISC) {
1986 nd |= NETDEV_PROMISC;
1992 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
1993 enum netdev_flags on, enum netdev_flags *old_flagsp)
1995 int old_flags, new_flags;
1998 error = get_flags(netdev, &old_flags);
2000 *old_flagsp = iff_to_nd_flags(old_flags);
2001 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2002 if (new_flags != old_flags) {
2003 error = set_flags(netdev, new_flags);
2010 poll_notify(struct list *list)
2012 struct netdev_linux_notifier *notifier;
2013 LIST_FOR_EACH (notifier, node, list) {
2014 struct netdev_notifier *n = ¬ifier->notifier;
2020 netdev_linux_poll_cb(const struct rtnetlink_change *change,
2021 void *aux OVS_UNUSED)
2024 struct list *list = shash_find_data(&netdev_linux_notifiers,
2030 struct shash_node *node;
2031 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2032 poll_notify(node->data);
2038 netdev_linux_poll_add(struct netdev *netdev,
2039 void (*cb)(struct netdev_notifier *), void *aux,
2040 struct netdev_notifier **notifierp)
2042 const char *netdev_name = netdev_get_name(netdev);
2043 struct netdev_linux_notifier *notifier;
2046 if (shash_is_empty(&netdev_linux_notifiers)) {
2047 int error = rtnetlink_notifier_register(&netdev_linux_poll_notifier,
2048 netdev_linux_poll_cb, NULL);
2054 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2056 list = xmalloc(sizeof *list);
2058 shash_add(&netdev_linux_notifiers, netdev_name, list);
2061 notifier = xmalloc(sizeof *notifier);
2062 netdev_notifier_init(¬ifier->notifier, netdev, cb, aux);
2063 list_push_back(list, ¬ifier->node);
2064 *notifierp = ¬ifier->notifier;
2069 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2071 struct netdev_linux_notifier *notifier =
2072 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2075 /* Remove 'notifier' from its list. */
2076 list = list_remove(¬ifier->node);
2077 if (list_is_empty(list)) {
2078 /* The list is now empty. Remove it from the hash and free it. */
2079 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2080 shash_delete(&netdev_linux_notifiers,
2081 shash_find(&netdev_linux_notifiers, netdev_name));
2086 /* If that was the last notifier, unregister. */
2087 if (shash_is_empty(&netdev_linux_notifiers)) {
2088 rtnetlink_notifier_unregister(&netdev_linux_poll_notifier);
2092 const struct netdev_class netdev_linux_class = {
2099 netdev_linux_create_system,
2100 netdev_linux_destroy,
2101 NULL, /* reconfigure */
2106 netdev_linux_enumerate,
2109 netdev_linux_recv_wait,
2113 netdev_linux_send_wait,
2115 netdev_linux_set_etheraddr,
2116 netdev_linux_get_etheraddr,
2117 netdev_linux_get_mtu,
2118 netdev_linux_get_ifindex,
2119 netdev_linux_get_carrier,
2120 netdev_linux_get_stats,
2121 netdev_vport_set_stats,
2123 netdev_linux_get_features,
2124 netdev_linux_set_advertisements,
2125 netdev_linux_get_vlan_vid,
2127 netdev_linux_set_policing,
2128 netdev_linux_get_qos_types,
2129 netdev_linux_get_qos_capabilities,
2130 netdev_linux_get_qos,
2131 netdev_linux_set_qos,
2132 netdev_linux_get_queue,
2133 netdev_linux_set_queue,
2134 netdev_linux_delete_queue,
2135 netdev_linux_get_queue_stats,
2136 netdev_linux_dump_queues,
2137 netdev_linux_dump_queue_stats,
2139 netdev_linux_get_in4,
2140 netdev_linux_set_in4,
2141 netdev_linux_get_in6,
2142 netdev_linux_add_router,
2143 netdev_linux_get_next_hop,
2144 netdev_linux_arp_lookup,
2146 netdev_linux_update_flags,
2148 netdev_linux_poll_add,
2149 netdev_linux_poll_remove,
2152 const struct netdev_class netdev_tap_class = {
2159 netdev_linux_create_tap,
2160 netdev_linux_destroy,
2161 NULL, /* reconfigure */
2166 NULL, /* enumerate */
2169 netdev_linux_recv_wait,
2173 netdev_linux_send_wait,
2175 netdev_linux_set_etheraddr,
2176 netdev_linux_get_etheraddr,
2177 netdev_linux_get_mtu,
2178 netdev_linux_get_ifindex,
2179 netdev_linux_get_carrier,
2180 netdev_linux_get_stats,
2181 NULL, /* set_stats */
2183 netdev_linux_get_features,
2184 netdev_linux_set_advertisements,
2185 netdev_linux_get_vlan_vid,
2187 netdev_linux_set_policing,
2188 netdev_linux_get_qos_types,
2189 netdev_linux_get_qos_capabilities,
2190 netdev_linux_get_qos,
2191 netdev_linux_set_qos,
2192 netdev_linux_get_queue,
2193 netdev_linux_set_queue,
2194 netdev_linux_delete_queue,
2195 netdev_linux_get_queue_stats,
2196 netdev_linux_dump_queues,
2197 netdev_linux_dump_queue_stats,
2199 netdev_linux_get_in4,
2200 netdev_linux_set_in4,
2201 netdev_linux_get_in6,
2202 netdev_linux_add_router,
2203 netdev_linux_get_next_hop,
2204 netdev_linux_arp_lookup,
2206 netdev_linux_update_flags,
2208 netdev_linux_poll_add,
2209 netdev_linux_poll_remove,
2212 /* HTB traffic control class. */
2214 #define HTB_N_QUEUES 0xf000
2218 unsigned int max_rate; /* In bytes/s. */
2222 struct tc_queue tc_queue;
2223 unsigned int min_rate; /* In bytes/s. */
2224 unsigned int max_rate; /* In bytes/s. */
2225 unsigned int burst; /* In bytes. */
2226 unsigned int priority; /* Lower values are higher priorities. */
2230 htb_get__(const struct netdev *netdev)
2232 struct netdev_dev_linux *netdev_dev =
2233 netdev_dev_linux_cast(netdev_get_dev(netdev));
2234 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2238 htb_install__(struct netdev *netdev, uint64_t max_rate)
2240 struct netdev_dev_linux *netdev_dev =
2241 netdev_dev_linux_cast(netdev_get_dev(netdev));
2244 htb = xmalloc(sizeof *htb);
2245 tc_init(&htb->tc, &tc_ops_htb);
2246 htb->max_rate = max_rate;
2248 netdev_dev->tc = &htb->tc;
2253 /* Create an HTB qdisc.
2255 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default
2258 htb_setup_qdisc__(struct netdev *netdev)
2261 struct tc_htb_glob opt;
2262 struct ofpbuf request;
2263 struct tcmsg *tcmsg;
2265 tc_del_qdisc(netdev);
2267 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2268 NLM_F_EXCL | NLM_F_CREATE, &request);
2269 tcmsg->tcm_handle = tc_make_handle(1, 0);
2270 tcmsg->tcm_parent = TC_H_ROOT;
2272 nl_msg_put_string(&request, TCA_KIND, "htb");
2274 memset(&opt, 0, sizeof opt);
2275 opt.rate2quantum = 10;
2279 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2280 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2281 nl_msg_end_nested(&request, opt_offset);
2283 return tc_transact(&request, NULL);
2286 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2287 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2289 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2290 unsigned int parent, struct htb_class *class)
2293 struct tc_htb_opt opt;
2294 struct ofpbuf request;
2295 struct tcmsg *tcmsg;
2299 netdev_get_mtu(netdev, &mtu);
2301 memset(&opt, 0, sizeof opt);
2302 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2303 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2304 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2305 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2306 opt.prio = class->priority;
2308 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2309 tcmsg->tcm_handle = handle;
2310 tcmsg->tcm_parent = parent;
2312 nl_msg_put_string(&request, TCA_KIND, "htb");
2313 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2314 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2315 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2316 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2317 nl_msg_end_nested(&request, opt_offset);
2319 error = tc_transact(&request, NULL);
2321 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2322 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2323 netdev_get_name(netdev),
2324 tc_get_major(handle), tc_get_minor(handle),
2325 tc_get_major(parent), tc_get_minor(parent),
2326 class->min_rate, class->max_rate,
2327 class->burst, class->priority, strerror(error));
2332 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2333 * description of them into 'class'. The description complies with the
2334 * specification given in the vswitch database documentation for linux-htb
2337 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2339 static const struct nl_policy tca_htb_policy[] = {
2340 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2341 .min_len = sizeof(struct tc_htb_opt) },
2344 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2345 const struct tc_htb_opt *htb;
2347 if (!nl_parse_nested(nl_options, tca_htb_policy,
2348 attrs, ARRAY_SIZE(tca_htb_policy))) {
2349 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2353 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2354 class->min_rate = htb->rate.rate;
2355 class->max_rate = htb->ceil.rate;
2356 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2357 class->priority = htb->prio;
2362 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2363 struct htb_class *options,
2364 struct netdev_queue_stats *stats)
2366 struct nlattr *nl_options;
2367 unsigned int handle;
2370 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2371 if (!error && queue_id) {
2372 unsigned int major = tc_get_major(handle);
2373 unsigned int minor = tc_get_minor(handle);
2374 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2375 *queue_id = minor - 1;
2380 if (!error && options) {
2381 error = htb_parse_tca_options__(nl_options, options);
2387 htb_parse_qdisc_details__(struct netdev *netdev,
2388 const struct shash *details, struct htb_class *hc)
2390 const char *max_rate_s;
2392 max_rate_s = shash_find_data(details, "max-rate");
2393 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2394 if (!hc->max_rate) {
2397 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2398 hc->max_rate = netdev_features_to_bps(current) / 8;
2400 hc->min_rate = hc->max_rate;
2406 htb_parse_class_details__(struct netdev *netdev,
2407 const struct shash *details, struct htb_class *hc)
2409 const struct htb *htb = htb_get__(netdev);
2410 const char *min_rate_s = shash_find_data(details, "min-rate");
2411 const char *max_rate_s = shash_find_data(details, "max-rate");
2412 const char *burst_s = shash_find_data(details, "burst");
2413 const char *priority_s = shash_find_data(details, "priority");
2418 /* min-rate is required. */
2421 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2422 hc->min_rate = MAX(hc->min_rate, 0);
2423 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2426 hc->max_rate = (max_rate_s
2427 ? strtoull(max_rate_s, NULL, 10) / 8
2429 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2430 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2434 * According to hints in the documentation that I've read, it is important
2435 * that 'burst' be at least as big as the largest frame that might be
2436 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2437 * but having it a bit too small is a problem. Since netdev_get_mtu()
2438 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2439 * the MTU. We actually add 64, instead of 14, as a guard against
2440 * additional headers get tacked on somewhere that we're not aware of. */
2441 netdev_get_mtu(netdev, &mtu);
2442 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2443 hc->burst = MAX(hc->burst, mtu + 64);
2446 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2452 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2453 unsigned int parent, struct htb_class *options,
2454 struct netdev_queue_stats *stats)
2456 struct ofpbuf *reply;
2459 error = tc_query_class(netdev, handle, parent, &reply);
2461 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2462 ofpbuf_delete(reply);
2468 htb_tc_install(struct netdev *netdev, const struct shash *details)
2472 error = htb_setup_qdisc__(netdev);
2474 struct htb_class hc;
2476 htb_parse_qdisc_details__(netdev, details, &hc);
2477 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2478 tc_make_handle(1, 0), &hc);
2480 htb_install__(netdev, hc.max_rate);
2486 static struct htb_class *
2487 htb_class_cast__(const struct tc_queue *queue)
2489 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2493 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2494 const struct htb_class *hc)
2496 struct htb *htb = htb_get__(netdev);
2497 size_t hash = hash_int(queue_id, 0);
2498 struct tc_queue *queue;
2499 struct htb_class *hcp;
2501 queue = tc_find_queue__(netdev, queue_id, hash);
2503 hcp = htb_class_cast__(queue);
2505 hcp = xmalloc(sizeof *hcp);
2506 queue = &hcp->tc_queue;
2507 queue->queue_id = queue_id;
2508 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2511 hcp->min_rate = hc->min_rate;
2512 hcp->max_rate = hc->max_rate;
2513 hcp->burst = hc->burst;
2514 hcp->priority = hc->priority;
2518 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2520 struct shash details = SHASH_INITIALIZER(&details);
2522 struct nl_dump dump;
2523 struct htb_class hc;
2526 /* Get qdisc options. */
2528 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2529 htb = htb_install__(netdev, hc.max_rate);
2532 start_queue_dump(netdev, &dump);
2533 shash_init(&details);
2534 while (nl_dump_next(&dump, &msg)) {
2535 unsigned int queue_id;
2537 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2538 htb_update_queue__(netdev, queue_id, &hc);
2541 nl_dump_done(&dump);
/* Tears down the HTB state that contains 'tc'.  Every class in
 * htb->tc.queues is unlinked from the map; the _SAFE iteration form is used
 * because entries are removed while walking.
 *
 * NOTE(review): the statements that release each class and the htb object
 * itself are not visible in this excerpt. */
2547 htb_tc_destroy(struct tc *tc)
2549 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2550 struct htb_class *hc, *next;
2552 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2553 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds the qdisc-level HTB setting of 'netdev' to 'details': key "max-rate"
 * with the decimal value 8 * htb->max_rate.  The string value is allocated by
 * xasprintf().
 *
 * NOTE(review): the factor of 8 presumably converts a cached bytes/s rate
 * into bits/s -- confirm the units. */
2561 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2563 const struct htb *htb = htb_get__(netdev);
2564 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Applies new qdisc-level settings from 'details' to 'netdev': the details
 * are parsed into 'hc', class 1:fffe (parent 1:0) is set up from 'hc', and
 * the cached max_rate is updated to hc.max_rate.
 *
 * NOTE(review): the test on 'error' that presumably guards the cache update
 * is not visible in this excerpt -- confirm. */
2569 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2571 struct htb_class hc;
2574 htb_parse_qdisc_details__(netdev, details, &hc);
2575 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2576 tc_make_handle(1, 0), &hc);
2578 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the configuration of HTB class 'queue' in 'details'.  'netdev' is
 * unused.
 *
 * "min-rate" is added with value 8 * min_rate.  "max-rate" (8 * max_rate) is
 * added only when it differs from min_rate.  "burst" (8 * burst) and
 * "priority" are also added; all values are strings allocated by
 * xasprintf().
 *
 * NOTE(review): lines between the "burst" and "priority" entries are not
 * visible in this excerpt, so one of them may be conditional -- confirm. */
2584 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2585 const struct tc_queue *queue, struct shash *details)
2587 const struct htb_class *hc = htb_class_cast__(queue);
2589 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2590 if (hc->min_rate != hc->max_rate) {
2591 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2593 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2595 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' on 'netdev' from 'details'.
 *
 * The details are parsed into 'hc', the kernel class 1:(queue_id + 1) is set
 * up under parent 1:fffe, and the cached queue is updated with
 * htb_update_queue__().
 *
 * NOTE(review): the early exits taken when parsing or class setup fails are
 * not visible in this excerpt -- confirm the cache is only updated on
 * success. */
2601 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2602 const struct shash *details)
2604 struct htb_class hc;
2607 error = htb_parse_class_details__(netdev, details, &hc);
/* Queue IDs are zero-based while class minor numbers start at 1. */
2612 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2613 tc_make_handle(1, 0xfffe), &hc);
2618 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes 'queue' from 'netdev': asks the kernel to delete class
 * 1:(queue_id + 1) and unlinks the cached class from htb->tc.queues.
 *
 * NOTE(review): the test on 'error' that presumably guards the unlink, and
 * the release of 'hc', are not visible in this excerpt -- confirm. */
2623 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2625 struct htb_class *hc = htb_class_cast__(queue);
2626 struct htb *htb = htb_get__(netdev);
2629 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2631 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches the statistics of 'queue' on 'netdev' into '*stats' by querying
 * class 1:(queue_id + 1) with parent 1:fffe.  The class options are not
 * requested (NULL is passed for them).  Returns whatever
 * htb_query_class__() returns. */
2638 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2639 struct netdev_queue_stats *stats)
2641 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2642 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a queue statistics dump.  'netdev'
 * is unused.
 *
 * The class handle and statistics are parsed out of 'nlmsg'.  If the handle
 * has major number 1 and a minor number in the range 1...HTB_N_QUEUES, 'cb'
 * is invoked with queue ID (minor - 1), the statistics, and 'aux'.
 *
 * NOTE(review): the handling of a nonzero 'error' from tc_parse_class() is
 * not visible in this excerpt. */
2646 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2647 const struct ofpbuf *nlmsg,
2648 netdev_dump_queue_stats_cb *cb, void *aux)
2650 struct netdev_queue_stats stats;
2651 unsigned int handle, major, minor;
2654 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2659 major = tc_get_major(handle);
2660 minor = tc_get_minor(handle);
/* Minor 0 and minors above HTB_N_QUEUES do not correspond to a queue. */
2661 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2662 (*cb)(minor - 1, &stats, aux);
/* tc_ops table for HTB: Linux qdisc name "htb", OVS QoS type name
 * "linux-htb", and HTB_N_QUEUES queues.
 *
 * NOTE(review): the members between 'n_queues' and 'class_get_stats' are not
 * visible in this excerpt. */
2667 static const struct tc_ops tc_ops_htb = {
2668 "htb", /* linux_name */
2669 "linux-htb", /* ovs_name */
2670 HTB_N_QUEUES, /* n_queues */
2679 htb_class_get_stats, /* class_get_stats */
2680 htb_class_dump_stats /* class_dump_stats */
2683 /* "linux-default" traffic control class.
2685 * This class represents the default, unnamed Linux qdisc. It corresponds to
2686 * the "" (empty string) QoS type in the OVS database. */
/* Points the device behind 'netdev' at a tc object that uses
 * tc_ops_default.
 *
 * 'tc' has static storage duration, so its value persists across calls and
 * the same pointer can be handed to more than one device.
 * NOTE(review): the test that guards the allocation and tc_init() is not
 * visible in this excerpt; presumably they run only while 'tc' is still
 * null -- confirm. */
2689 default_install__(struct netdev *netdev)
2691 struct netdev_dev_linux *netdev_dev =
2692 netdev_dev_linux_cast(netdev_get_dev(netdev));
2693 static struct tc *tc;
2696 tc = xmalloc(sizeof *tc);
2697 tc_init(tc, &tc_ops_default);
2699 netdev_dev->tc = tc;
/* tc_install entry point for the default qdisc type: ignores 'details' and
 * delegates to default_install__(). */
2703 default_tc_install(struct netdev *netdev,
2704 const struct shash *details OVS_UNUSED)
2706 default_install__(netdev);
/* tc_load entry point for the default qdisc type: ignores 'nlmsg' and
 * delegates to default_install__(). */
2711 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2713 default_install__(netdev);
/* tc_ops table for the default qdisc type.  It has no Linux qdisc name and
 * every qdisc/class callback visible in this excerpt is NULL.
 *
 * NOTE(review): the members between 'linux_name' and 'tc_destroy' are not
 * visible in this excerpt. */
2717 static const struct tc_ops tc_ops_default = {
2718 NULL, /* linux_name */
2723 NULL, /* tc_destroy */
2724 NULL, /* qdisc_get */
2725 NULL, /* qdisc_set */
2726 NULL, /* class_get */
2727 NULL, /* class_set */
2728 NULL, /* class_delete */
2729 NULL, /* class_get_stats */
2730 NULL /* class_dump_stats */
/* other_tc_load() points the device behind 'netdev' at a tc object that uses
 * tc_ops_other; 'nlmsg' is unused.
 *
 * 'tc' has static storage duration, so its value persists across calls and
 * the same pointer can be handed to more than one device.
 * NOTE(review): the test that guards the allocation and tc_init() is not
 * visible in this excerpt; presumably they run only while 'tc' is still
 * null -- confirm. */
2733 /* "linux-other" traffic control class.
2738 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2740 struct netdev_dev_linux *netdev_dev =
2741 netdev_dev_linux_cast(netdev_get_dev(netdev));
2742 static struct tc *tc;
2745 tc = xmalloc(sizeof *tc);
2746 tc_init(tc, &tc_ops_other);
2748 netdev_dev->tc = tc;
2752 static const struct tc_ops tc_ops_other = {
2753 NULL, /* linux_name */
/* OVS QoS type name "linux-other".  tc_install and every qdisc/class callback
 * visible in this excerpt is NULL.
 * NOTE(review): the members between 'ovs_name' and 'tc_install', and between
 * 'tc_install' and 'tc_destroy', are not visible in this excerpt. */
2754 "linux-other", /* ovs_name */
2756 NULL, /* tc_install */
2758 NULL, /* tc_destroy */
2759 NULL, /* qdisc_get */
2760 NULL, /* qdisc_set */
2761 NULL, /* class_get */
2762 NULL, /* class_set */
2763 NULL, /* class_delete */
2764 NULL, /* class_get_stats */
2765 NULL /* class_dump_stats */
2768 /* Traffic control. */
2770 /* Number of kernel "tc" ticks per second. */
/* Assigned where /proc/net/psched is parsed later in this file, as a * c / b
 * over the first three fields of that file. */
2771 static double ticks_per_s;
2773 /* Number of kernel "jiffies" per second. This is used for the purpose of
2774 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
2775 * one jiffy's worth of data.
2777 * There are two possibilities here:
2779 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
2780 * approximate range of 100 to 1024. That means that we really need to
2781 * make sure that the qdisc can buffer that much data.
2783 * - 'buffer_hz' is an absurdly large number. That means that the kernel
2784 * has finely granular timers and there's no need to fudge additional room
2785 * for buffers. (There's no extra effort needed to implement that: the
2786 * large 'buffer_hz' is used as a divisor, so practically any number will
2787 * come out as 0 in the division. Small integer results in the case of
2788 * really high dividends won't have any real effect anyhow.)
2790 static unsigned int buffer_hz;
2792 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits and combined with 'minor' by
 * TC_H_MAKE. */
2794 tc_make_handle(unsigned int major, unsigned int minor)
2796 return TC_H_MAKE(major << 16, minor);
2799 /* Returns the major number from 'handle'. */
/* TC_H_MAJ() leaves the major part in its position in the upper 16 bits, so
 * it is shifted down to yield the plain number passed to tc_make_handle(). */
2801 tc_get_major(unsigned int handle)
2803 return TC_H_MAJ(handle) >> 16;
2806 /* Returns the minor number from 'handle'. */
/* Unlike the major part, no shift is applied to the TC_H_MIN() result. */
2808 tc_get_minor(unsigned int handle)
2810 return TC_H_MIN(handle);
/* Initializes 'request' as a Netlink message of the given 'type' with flags
 * NLM_F_REQUEST | 'flags', followed by a zeroed struct tcmsg whose family is
 * AF_UNSPEC and whose ifindex is that of 'netdev'.  'request' is initialized
 * with an initial size of 512 bytes.  The caller fills in tcm_handle and
 * tcm_parent through the returned struct tcmsg pointer.
 *
 * NOTE(review): the handling of a get_ifindex() failure and the return
 * statements are not visible in this excerpt.  Callers in this file
 * dereference the result without checking it, so confirm that the failure
 * path cannot hand them a null pointer. */
2813 static struct tcmsg *
2814 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
2815 struct ofpbuf *request)
2817 struct tcmsg *tcmsg;
2821 error = get_ifindex(netdev, &ifindex);
2826 ofpbuf_init(request, 512);
2827 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
2828 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
2829 tcmsg->tcm_family = AF_UNSPEC;
2830 tcmsg->tcm_ifindex = ifindex;
2831 /* Caller should fill in tcmsg->tcm_handle. */
2832 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on rtnl_sock with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request' whether or not the
 * transaction succeeded, so the caller must not reuse or free it.
 *
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably 'error' is returned -- confirm. */
2838 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
2840 int error = nl_sock_transact(rtnl_sock, request, replyp);
2841 ofpbuf_uninit(request);
/* Body of the routine that parses /proc/net/psched.  It opens the file,
 * reads four hexadecimal fields 'a', 'b', 'c', 'd', logs them at debug level,
 * and computes ticks_per_s = a * c / b.  A warning is logged when the open or
 * the read fails and when the parameters are judged invalid or unexpected.
 *
 * NOTE(review): the function header, the tests that trigger the "invalid" and
 * "unexpected" warnings, the closing of 'stream', and the assignment to
 * buffer_hz are not visible in this excerpt.  The tables in the comment that
 * follows show buffer_hz equal to 'd' in every sample -- confirm. */
2848 /* The values in psched are not individually very meaningful, but they are
2849 * important. The tables below show some values seen in the wild.
2853 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
2854 * (Before that, there are hints that it was 1000000000.)
2856 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
2860 * -----------------------------------
2861 * [1] 000c8000 000f4240 000f4240 00000064
2862 * [2] 000003e8 00000400 000f4240 3b9aca00
2863 * [3] 000003e8 00000400 000f4240 3b9aca00
2864 * [4] 000003e8 00000400 000f4240 00000064
2865 * [5] 000003e8 00000040 000f4240 3b9aca00
2866 * [6] 000003e8 00000040 000f4240 000000f9
2868 * a b c d ticks_per_s buffer_hz
2869 * ------- --------- ---------- ------------- ----------- -------------
2870 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
2871 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
2872 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
2873 * [4] 1,000 1,024 1,000,000 100 976,562 100
2874 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
2875 * [6] 1,000 64 1,000,000 249 15,625,000 249
2877 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
2878 * [2] 2.6.26-1-686-bigmem from Debian lenny
2879 * [3] 2.6.26-2-sparc64 from Debian lenny
2880 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
2881 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
2882 * [6] 2.6.34 from kernel.org on KVM
2884 static const char fn[] = "/proc/net/psched";
2885 unsigned int a, b, c, d;
2891 stream = fopen(fn, "r");
2893 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
2897 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
2898 VLOG_WARN("%s: read failed", fn);
2902 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
2906 VLOG_WARN("%s: invalid scheduler parameters", fn);
2910 ticks_per_s = (double) a * c / b;
2914 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
2917 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
2920 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
2921 * rate of 'rate' bytes per second. */
/* NOTE(review): 'rate * ticks' is evaluated in unsigned int before it is
 * divided by the double 'ticks_per_s', so the product can wrap for large
 * operands.  tc_bytes_to_ticks() widens to unsigned long long before
 * multiplying; consider widening one operand here as well.
 * NOTE(review): the lines before the return are not visible in this excerpt;
 * presumably they make sure 'ticks_per_s' has been computed -- confirm. */
2923 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
2928 return (rate * ticks) / ticks_per_s;
2931 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
2932 * rate of 'rate' bytes per second. */
/* The cast applies to 'ticks_per_s' alone, so its fractional part is dropped
 * before the multiplication, which is then carried out in unsigned long
 * long.
 * NOTE(review): the result is divided by 'rate'; no guard against a zero
 * 'rate' is visible in this excerpt -- confirm callers never pass 0. */
2934 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
2939 return ((unsigned long long int) ticks_per_s * size) / rate;
2942 /* Returns the number of bytes that need to be reserved for qdisc buffering at
2943 * a transmission rate of 'rate' bytes per second. */
/* Integer division: with a very large 'buffer_hz' the result is 0, as the
 * comment on 'buffer_hz' describes.
 * NOTE(review): the lines before the return are not visible in this excerpt;
 * presumably they make sure 'buffer_hz' is nonzero before it is used as a
 * divisor -- confirm. */
2945 tc_buffer_per_jiffy(unsigned int rate)
2950 return rate / buffer_hz;
2953 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
2954 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
2955 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
2956 * stores NULL into it if it is absent.
2958 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
2961 * Returns 0 if successful, otherwise a positive errno value. */
2963 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
2964 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS may be missing. */
2966 static const struct nl_policy tca_policy[] = {
2967 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
2968 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
2970 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start right after the Netlink header and struct tcmsg. */
2972 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
2973 tca_policy, ta, ARRAY_SIZE(ta))) {
2974 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
2979 *kind = nl_attr_get_string(ta[TCA_KIND]);
2983 *options = ta[TCA_OPTIONS];
2998 /* Given Netlink 'msg' that describes a class, extracts its class handle
2999 * (the raw tcm_handle, major and minor together) into '*handlep', its
3000 * TCA_OPTIONS attribute into '*options', and its queue statistics into
3001 * '*stats'. Any of the output arguments may be null.
 *
 * Callers that need a queue ID derive it from the handle themselves, e.g.
 * with tc_get_major() and tc_get_minor().
 *
3003 * Returns 0 if successful, otherwise a positive errno value. */
3005 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3006 struct nlattr **options, struct netdev_queue_stats *stats)
/* Both TCA_OPTIONS and TCA_STATS2 are mandatory in a class message. */
3008 static const struct nl_policy tca_policy[] = {
3009 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3010 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3012 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3014 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3015 tca_policy, ta, ARRAY_SIZE(ta))) {
3016 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The struct tcmsg sits immediately after the Netlink header. */
3021 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3022 *handlep = tc->tcm_handle;
3026 *options = ta[TCA_OPTIONS];
3030 const struct gnet_stats_queue *gsq;
3031 struct gnet_stats_basic gsb;
/* Both nested statistics attributes are mandatory and must be at least as
 * large as the corresponding structure. */
3033 static const struct nl_policy stats_policy[] = {
3034 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3035 .min_len = sizeof gsb },
3036 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3037 .min_len = sizeof *gsq },
3039 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3041 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3042 sa, ARRAY_SIZE(sa))) {
3043 VLOG_WARN_RL(&rl, "failed to parse class stats");
3047 /* Alignment issues screw up the length of struct gnet_stats_basic on
3048 * some arch/bitsize combinations. Newer versions of Linux have a
3049 * struct gnet_stats_basic_packed, but we can't depend on that. The
3050 * easiest thing to do is just to make a copy. */
3051 memset(&gsb, 0, sizeof gsb);
3052 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3053 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3054 stats->tx_bytes = gsb.bytes;
3055 stats->tx_packets = gsb.packets;
/* Drops reported by the queue statistics are surfaced as tx_errors. */
3057 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3058 stats->tx_errors = gsq->drops;
/* NOTE(review): the label or condition under which 'stats' is zeroed is not
 * visible in this excerpt; presumably this is the failure path -- confirm. */
3068 memset(stats, 0, sizeof *stats);
/* tc_query_class() builds an RTM_GETTCLASS request (with NLM_F_ECHO) for
 * 'handle' and 'parent', transacts it, and passes 'replyp' to tc_transact()
 * for the reply.  A rate-limited warning naming the device, class, and parent
 * is logged on failure.
 *
 * NOTE(review): the pointer returned by tc_make_request() is dereferenced on
 * the very next line without a check; confirm that tc_make_request() cannot
 * return null when the ifindex lookup fails. */
3073 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3076 tc_query_class(const struct netdev *netdev,
3077 unsigned int handle, unsigned int parent,
3078 struct ofpbuf **replyp)
3080 struct ofpbuf request;
3081 struct tcmsg *tcmsg;
3084 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3085 tcmsg->tcm_handle = handle;
3086 tcmsg->tcm_parent = parent;
3088 error = tc_transact(&request, replyp);
3090 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3091 netdev_get_name(netdev),
3092 tc_get_major(handle), tc_get_minor(handle),
3093 tc_get_major(parent), tc_get_minor(parent),
3099 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Sends RTM_DELTCLASS for 'handle' with parent 0 and requests no reply.  A
 * rate-limited warning is logged on failure.
 * NOTE(review): the pointer returned by tc_make_request() is dereferenced
 * without a check; confirm it cannot be null. */
3101 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3103 struct ofpbuf request;
3104 struct tcmsg *tcmsg;
3107 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3108 tcmsg->tcm_handle = handle;
3109 tcmsg->tcm_parent = 0;
3111 error = tc_transact(&request, NULL);
3113 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3114 netdev_get_name(netdev),
3115 tc_get_major(handle), tc_get_minor(handle),
3121 /* Equivalent to "tc qdisc del dev <name> root". */
/* Sends RTM_DELQDISC for handle 1:0 with parent TC_H_ROOT.  When the request
 * succeeds and the device has cached tc state, that state's tc_destroy
 * callback is invoked if it has one and the cached pointer is cleared.
 * NOTE(review): the statement executed for EINVAL is not visible in this
 * excerpt; the comment there suggests the error is treated as success --
 * confirm.  The pointer returned by tc_make_request() is also dereferenced
 * without a check. */
3123 tc_del_qdisc(struct netdev *netdev)
3125 struct netdev_dev_linux *netdev_dev =
3126 netdev_dev_linux_cast(netdev_get_dev(netdev));
3127 struct ofpbuf request;
3128 struct tcmsg *tcmsg;
3131 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3132 tcmsg->tcm_handle = tc_make_handle(1, 0);
3133 tcmsg->tcm_parent = TC_H_ROOT;
3135 error = tc_transact(&request, NULL);
3136 if (error == EINVAL) {
3137 /* EINVAL probably means that the default qdisc was in use, in which
3138 * case we've accomplished our purpose. */
3141 if (!error && netdev_dev->tc) {
/* tc_destroy is optional; the default and "other" tables leave it NULL. */
3142 if (netdev_dev->tc->ops->tc_destroy) {
3143 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3145 netdev_dev->tc = NULL;
3150 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3151 * kernel to determine what they are. Returns 0 if successful, otherwise a
3152 * positive errno value. */
/* The visible flow is: request the qdisc with handle 1:0, choose a tc_ops
 * table from the outcome (tc_ops_other, the table found by
 * tc_lookup_linux_name(), or tc_ops_default when the query fails with
 * ENOENT), then call that table's tc_load.
 * NOTE(review): several branch conditions and the early return taken when
 * netdev_dev->tc is already set are not visible in this excerpt. */
3154 tc_query_qdisc(const struct netdev *netdev)
3156 struct netdev_dev_linux *netdev_dev =
3157 netdev_dev_linux_cast(netdev_get_dev(netdev));
3158 struct ofpbuf request, *qdisc;
3159 const struct tc_ops *ops;
3160 struct tcmsg *tcmsg;
3164 if (netdev_dev->tc) {
3168 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3169 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3170 * 2.6.35 without that fix backported to it.
3172 * To avoid the OOPS, we must not make a request that would attempt to dump
3173 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3174 * few others. There are a few ways that I can see to do this, but most of
3175 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3176 * technique chosen here is to assume that any non-default qdisc that we
3177 * create will have a class with handle 1:0. The built-in qdiscs only have
3178 * a class with handle 0:0.
3180 * We could check for Linux 2.6.35+ and use a more straightforward method
3182 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3183 tcmsg->tcm_handle = tc_make_handle(1, 0);
3184 tcmsg->tcm_parent = 0;
3186 /* Figure out what tc class to instantiate. */
3187 error = tc_transact(&request, &qdisc);
3191 error = tc_parse_qdisc(qdisc, &kind, NULL);
3193 ops = &tc_ops_other;
3195 ops = tc_lookup_linux_name(kind);
3197 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3198 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3200 ops = &tc_ops_other;
3203 } else if (error == ENOENT) {
3204 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3205 * other entity that doesn't have a handle 1:0. We will assume
3206 * that it's the system default qdisc. */
3207 ops = &tc_ops_default;
3210 /* Who knows? Maybe the device got deleted. */
3211 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3212 netdev_get_name(netdev), strerror(error));
3213 ops = &tc_ops_other;
3216 /* Instantiate it. */
/* The cast drops const because tc_load is passed a non-const netdev. */
3217 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
/* Invariant: tc_load succeeded if and only if it installed tc state. */
3218 assert((load_error == 0) == (netdev_dev->tc != NULL));
3219 ofpbuf_delete(qdisc);
/* A query error takes precedence over a load error in the return value. */
3221 return error ? error : load_error;
3224 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3225 approximate the time to transmit packets of various lengths. For an MTU of
3226 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3227 represents two possible packet lengths; for a MTU of 513 through 1024, four
3228 possible lengths; and so on.
3230 Returns, for the specified 'mtu', the number of bits that packet lengths
3231 need to be shifted right to fit within such a 256-entry table. */
/* The Ethernet and VLAN header lengths are added to 'mtu' before the shift
 * count is computed, so the count is based on the full frame size.
 * NOTE(review): the condition under which 'mtu' is replaced by
 * ETH_PAYLOAD_MAX and the loop body are not visible in this excerpt;
 * presumably the default applies when 'mtu' is 0 and the loop halves 'mtu'
 * each iteration -- confirm. */
3233 tc_calc_cell_log(unsigned int mtu)
3238 mtu = ETH_PAYLOAD_MAX;
3240 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3242 for (cell_log = 0; mtu >= 256; cell_log++) {
/* tc_fill_rate() zeroes '*rate', sets cell_log from tc_calc_cell_log(mtu),
 * and sets mpu to ETH_TOTAL_MIN.  'overhead' and 'cell_align' are left at
 * zero by the memset rather than assigned by name, per the commented-out
 * lines.
 * NOTE(review): the assignment of 'Bps' to rate->rate is not visible in this
 * excerpt; 'Bps' is uint64_t, so confirm how it is narrowed. */
3249 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3252 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3254 memset(rate, 0, sizeof *rate);
3255 rate->cell_log = tc_calc_cell_log(mtu);
3256 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3257 /* rate->cell_align = 0; */ /* distro headers. */
3258 rate->mpu = ETH_TOTAL_MIN;
3262 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3263 * attribute of the specified "type".
3265 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* The attribute payload is TC_RTAB_SIZE bytes and is filled completely by the
 * loop.  Entry 'i' holds the transmit time in ticks for a packet of
 * (i + 1) << cell_log bytes, with sizes below rate->mpu rounded up to
 * rate->mpu. */
3267 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3272 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3273 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3274 unsigned packet_size = (i + 1) << rate->cell_log;
3275 if (packet_size < rate->mpu) {
3276 packet_size = rate->mpu;
3278 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* tc_calc_buffer() returns, in ticks, the larger of 'burst_bytes' and a
 * minimum burst of one jiffy's worth of data at 'Bps' plus one 'mtu'.  A
 * 'burst_bytes' of 0 therefore selects the minimum.
 * NOTE(review): MAX(burst_bytes, min_burst) has type uint64_t, but
 * tc_bytes_to_ticks() takes its size argument as unsigned int, so a
 * 'burst_bytes' above UINT_MAX is narrowed at the call -- confirm callers
 * bound it. */
3282 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3283 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3284 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
3289 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
3291 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
3292 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3296 /* Utility functions. */
/* Fills '*stats' for the device with index 'ifindex' by sending an
 * RTM_GETLINK request on rtnl_sock and copying the counters out of the
 * IFLA_STATS attribute of the reply.
 *
 * The reply is deleted on every visible path: when attribute parsing fails,
 * when IFLA_STATS is absent (a rate-limited warning is logged), and after the
 * counters have been copied.
 * NOTE(review): the return values on each path are not visible in this
 * excerpt. */
3299 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3301 /* Policy for RTNLGRP_LINK messages.
3303 * There are *many* more fields in these messages, but currently we only
3304 * care about these fields. */
3305 static const struct nl_policy rtnlgrp_link_policy[] = {
3306 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3307 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3308 .min_len = sizeof(struct rtnl_link_stats) },
3311 struct ofpbuf request;
3312 struct ofpbuf *reply;
3313 struct ifinfomsg *ifi;
3314 const struct rtnl_link_stats *rtnl_stats;
3315 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: Netlink header plus a zeroed ifinfomsg that selects the
 * device by index. */
3318 ofpbuf_init(&request, 0);
3319 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3320 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3321 ifi->ifi_family = PF_UNSPEC;
3322 ifi->ifi_index = ifindex;
3323 error = nl_sock_transact(rtnl_sock, &request, &reply);
3324 ofpbuf_uninit(&request);
3329 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3330 rtnlgrp_link_policy,
3331 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3332 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its presence is checked here. */
3336 if (!attrs[IFLA_STATS]) {
3337 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3338 ofpbuf_delete(reply);
/* 'rtnl_stats' points into 'reply', so it is read before 'reply' is
 * deleted. */
3342 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3343 stats->rx_packets = rtnl_stats->rx_packets;
3344 stats->tx_packets = rtnl_stats->tx_packets;
3345 stats->rx_bytes = rtnl_stats->rx_bytes;
3346 stats->tx_bytes = rtnl_stats->tx_bytes;
3347 stats->rx_errors = rtnl_stats->rx_errors;
3348 stats->tx_errors = rtnl_stats->tx_errors;
3349 stats->rx_dropped = rtnl_stats->rx_dropped;
3350 stats->tx_dropped = rtnl_stats->tx_dropped;
3351 stats->multicast = rtnl_stats->multicast;
3352 stats->collisions = rtnl_stats->collisions;
3353 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3354 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3355 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3356 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3357 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3358 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3359 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3360 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3361 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3362 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3363 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3365 ofpbuf_delete(reply);
/* Fills '*stats' for the device named 'netdev_name' by scanning
 * /proc/net/dev line by line.  A line is accepted when all 15 expected
 * conversions succeed; otherwise a rate-limited parse-error warning is
 * logged.  If no line matches the device, a "no stats" warning is logged.
 *
 * NOTE(review): the scan call and part of its argument list, the closing of
 * 'stream', and the return values are not visible in this excerpt.  The
 * counters are scanned directly into '*stats' for every line, so '*stats' may
 * hold another device's values when no line matches -- confirm callers ignore
 * it on failure. */
3371 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3373 static const char fn[] = "/proc/net/dev";
3378 stream = fopen(fn, "r");
3380 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3385 while (fgets(line, sizeof line, stream)) {
3388 #define X64 "%"SCNu64
3391 X64 X64 X64 X64 X64 X64 X64 "%*u"
3392 X64 X64 X64 X64 X64 X64 X64 "%*u",
3398 &stats->rx_fifo_errors,
3399 &stats->rx_frame_errors,
3405 &stats->tx_fifo_errors,
3407 &stats->tx_carrier_errors) != 15) {
3408 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
3409 } else if (!strcmp(devname, netdev_name)) {
/* These counters are not among the fields scanned from the file, so they are
 * set to UINT64_MAX. */
3410 stats->rx_length_errors = UINT64_MAX;
3411 stats->rx_over_errors = UINT64_MAX;
3412 stats->rx_crc_errors = UINT64_MAX;
3413 stats->rx_missed_errors = UINT64_MAX;
3414 stats->tx_aborted_errors = UINT64_MAX;
3415 stats->tx_heartbeat_errors = UINT64_MAX;
3416 stats->tx_window_errors = UINT64_MAX;
3422 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'netdev' with the SIOCGIFFLAGS ioctl and
 * stores ifr.ifr_flags into '*flags'.
 * NOTE(review): whether '*flags' is written when the ioctl fails cannot be
 * determined from this excerpt. */
3428 get_flags(const struct netdev *netdev, int *flags)
3433 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
3435 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS
 * ioctl and returns the result of netdev_linux_do_ioctl(). */
3440 set_flags(struct netdev *netdev, int flags)
3444 ifr.ifr_flags = flags;
3445 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with the
 * SIOCGIFINDEX ioctl on af_inet_sock and returns ifr.ifr_ifindex on success.
 * A rate-limited warning is logged when the ioctl fails.
 *
 * NOTE(review): the value returned on failure is not visible in this
 * excerpt.  Unlike get_etheraddr(), no memset of 'ifr' is visible before the
 * strncpy(), and strncpy() does not NUL-terminate when the name fills the
 * whole field -- confirm this is acceptable. */
3450 do_get_ifindex(const char *netdev_name)
3454 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3455 COVERAGE_INC(netdev_get_ifindex);
3456 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
3457 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
3458 netdev_name, strerror(errno));
3461 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' into '*ifindexp', using the value
 * cached in the device when VALID_IFINDEX is set in cache_valid and
 * otherwise querying it with do_get_ifindex() and caching the result.
 *
 * NOTE(review): the handling of a failed do_get_ifindex() and the return
 * values are not visible in this excerpt. */
3465 get_ifindex(const struct netdev *netdev_, int *ifindexp)
3467 struct netdev_dev_linux *netdev_dev =
3468 netdev_dev_linux_cast(netdev_get_dev(netdev_));
3470 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
3471 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
3475 netdev_dev->cache_valid |= VALID_IFINDEX;
3476 netdev_dev->ifindex = ifindex;
3478 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl and copies its first ETH_ADDR_LEN bytes into 'ea'.
 *
 * An error is logged when the ioctl fails.  An address family other than
 * AF_UNSPEC or ARPHRD_ETHER only produces a warning; no early exit is visible
 * between that warning and the copy.
 * NOTE(review): the return values are not visible in this excerpt. */
3483 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
3488 memset(&ifr, 0, sizeof ifr);
3489 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3490 COVERAGE_INC(netdev_get_hwaddr);
3491 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
3492 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
3493 netdev_name, strerror(errno));
3496 hwaddr_family = ifr.ifr_hwaddr.sa_family;
3497 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
3498 VLOG_WARN("%s device has unknown hardware address family %d",
3499 netdev_name, hwaddr_family);
3501 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to 'mac' with
 * address family 'hwaddr_family', using the SIOCSIFHWADDR ioctl.  An error is
 * logged when the ioctl fails.
 * NOTE(review): the return values are not visible in this excerpt. */
3506 set_etheraddr(const char *netdev_name, int hwaddr_family,
3507 const uint8_t mac[ETH_ADDR_LEN])
3511 memset(&ifr, 0, sizeof ifr);
3512 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3513 ifr.ifr_hwaddr.sa_family = hwaddr_family;
3514 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
3515 COVERAGE_INC(netdev_set_hwaddr);
3516 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
3517 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
3518 netdev_name, strerror(errno));
/* Issues the SIOCETHTOOL ioctl for the device named 'name', passing 'ecmd'
 * as the request data.  'cmd_name' is used only in the log message.
 *
 * A failure with errno other than EOPNOTSUPP is logged with a rate-limited
 * warning; EOPNOTSUPP is deliberately not logged.
 * NOTE(review): the use of 'cmd' and the return values are not visible in
 * this excerpt; presumably 'cmd' is stored into ecmd->cmd -- confirm. */
3525 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
3526 int cmd, const char *cmd_name)
3530 memset(&ifr, 0, sizeof ifr);
3531 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
3532 ifr.ifr_data = (caddr_t) ecmd;
3535 COVERAGE_INC(netdev_ethtool);
3536 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
3539 if (errno != EOPNOTSUPP) {
3540 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
3541 "failed: %s", cmd_name, name, strerror(errno));
3543 /* The device doesn't support this operation. That's pretty
3544 * common, so there's no point in logging anything. */
/* Copies 'name' into ifr->ifr_name and issues ioctl 'cmd' on af_inet_sock
 * with 'ifr' as its argument.  The other fields of '*ifr' are passed through
 * as the caller set them.  A failure is logged at debug level, rate-limited,
 * using 'cmd_name' to identify the ioctl.
 * NOTE(review): the return values are not visible in this excerpt. */
3551 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
3552 const char *cmd_name)
3554 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
3555 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
3556 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
3564 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
3565 int cmd, const char *cmd_name)
3570 ifr.ifr_addr.sa_family = AF_INET;
3571 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
3573 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
3574 *ip = sin->sin_addr;