2 * Copyright (c) 2009, 2010 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/pkt_sched.h>
29 #include <linux/rtnetlink.h>
30 #include <linux/sockios.h>
31 #include <linux/version.h>
32 #include <sys/types.h>
33 #include <sys/ioctl.h>
34 #include <sys/socket.h>
35 #include <netpacket/packet.h>
36 #include <net/ethernet.h>
38 #include <linux/if_tunnel.h>
39 #include <net/if_arp.h>
40 #include <net/if_packet.h>
41 #include <net/route.h>
42 #include <netinet/in.h>
49 #include "dynamic-string.h"
50 #include "fatal-signal.h"
51 #include "netdev-provider.h"
52 #include "netdev-vport.h"
55 #include "openflow/openflow.h"
57 #include "poll-loop.h"
58 #include "port-array.h"
59 #include "rtnetlink.h"
60 #include "socket-util.h"
65 VLOG_DEFINE_THIS_MODULE(netdev_linux)
67 /* These were introduced in Linux 2.6.14, so they might be missing if we have
69 #ifndef ADVERTISED_Pause
70 #define ADVERTISED_Pause (1 << 13)
72 #ifndef ADVERTISED_Asym_Pause
73 #define ADVERTISED_Asym_Pause (1 << 14)
76 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
79 #define TC_RTAB_SIZE 1024
82 static struct rtnetlink_notifier netdev_linux_cache_notifier;
83 static int cache_notifier_refcount;
86 VALID_IFINDEX = 1 << 0,
87 VALID_ETHERADDR = 1 << 1,
91 VALID_CARRIER = 1 << 5,
92 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
93 VALID_POLICING = 1 << 7,
94 VALID_HAVE_VPORT_STATS = 1 << 8
102 /* Traffic control. */
104 /* An instance of a traffic control class. Always associated with a particular
107 const struct tc_ops *ops;
109 /* Maps from queue ID to tc-specific data.
111 * The generic netdev TC layer uses this to the following extent: if an
112 * entry is nonnull, then the queue whose ID is the index is assumed to
113 * exist; if an entry is null, then that queue is assumed not to exist.
114 * Implementations must adhere to this scheme, although they may store
115 * whatever they like as data.
117 struct port_array queues;
120 /* A particular kind of traffic control. Each implementation generally maps to
121 * one particular Linux qdisc class.
123 * The functions below return 0 if successful or a positive errno value on
124 * failure, except where otherwise noted. All of them must be provided, except
125 * where otherwise noted. */
127 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
128 * This is null for tc_ops_default and tc_ops_other, for which there are no
129 * appropriate values. */
130 const char *linux_name;
132 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
133 const char *ovs_name;
135 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
136 * queues. The queues are numbered 0 through n_queues - 1. */
137 unsigned int n_queues;
139 /* Called to install this TC class on 'netdev'. The implementation should
140 * make the Netlink calls required to set up 'netdev' with the right qdisc
141 * and configure it according to 'details'. The implementation may assume
142 * that the current qdisc is the default; that is, there is no need for it
143 * to delete the current qdisc before installing itself.
145 * The contents of 'details' should be documented as valid for 'ovs_name'
146 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
147 * (which is built as ovs-vswitchd.conf.db(8)).
149 * This function must return 0 if and only if it sets 'netdev->tc' to an
150 * initialized 'struct tc'.
152 * (This function is null for tc_ops_other, which cannot be installed. For
153 * other TC classes it should always be nonnull.) */
154 int (*tc_install)(struct netdev *netdev, const struct shash *details);
156 /* Called when the netdev code determines (through a Netlink query) that
157 * this TC class's qdisc is installed on 'netdev', but we didn't install
158 * it ourselves and so don't know any of the details.
160 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
161 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
162 * implementation should parse the other attributes of 'nlmsg' as
163 * necessary to determine its configuration. If necessary it should also
164 * use Netlink queries to determine the configuration of queues on
167 * This function must return 0 if and only if it sets 'netdev->tc' to an
168 * initialized 'struct tc'. */
169 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
171 /* Destroys the data structures allocated by the implementation as part of
172 * 'tc'. (This includes destroying 'tc->queues' by calling
175 * The implementation should not need to perform any Netlink calls. If
176 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
177 * (But it may not be desirable.)
179 * This function may be null if 'tc' is trivial. */
180 void (*tc_destroy)(struct tc *tc);
182 /* Retrieves details of 'netdev->tc' configuration into 'details'.
184 * The implementation should not need to perform any Netlink calls, because
185 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
186 * cached the configuration.
188 * The contents of 'details' should be documented as valid for 'ovs_name'
189 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
190 * (which is built as ovs-vswitchd.conf.db(8)).
192 * This function may be null if 'tc' is not configurable.
194 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
196 /* Reconfigures 'netdev->tc' according to 'details', performing any
197 * required Netlink calls to complete the reconfiguration.
199 * The contents of 'details' should be documented as valid for 'ovs_name'
200 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
201 * (which is built as ovs-vswitchd.conf.db(8)).
203 * This function may be null if 'tc' is not configurable.
205 int (*qdisc_set)(struct netdev *, const struct shash *details);
207 /* Retrieves details of 'queue_id' on 'netdev->tc' into 'details'. The
208 * caller ensures that 'queues' has a nonnull value for index 'queue_id'.
210 * The contents of 'details' should be documented as valid for 'ovs_name'
211 * in the "other_config" column in the "Queue" table in
212 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the queue configuration.
218 * This function may be null if 'tc' does not have queues ('n_queues' is
220 int (*class_get)(const struct netdev *netdev, unsigned int queue_id,
221 struct shash *details);
223 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
224 * 'details', performing any required Netlink calls to complete the
225 * reconfiguration. The caller ensures that 'queue_id' is less than
228 * The contents of 'details' should be documented as valid for 'ovs_name'
229 * in the "other_config" column in the "Queue" table in
230 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
232 * This function may be null if 'tc' does not have queues or its queues are
233 * not configurable. */
234 int (*class_set)(struct netdev *, unsigned int queue_id,
235 const struct shash *details);
237 /* Deletes 'queue_id' from 'netdev->tc'. The caller ensures that 'queues'
238 * has a nonnull value for index 'queue_id'.
240 * This function may be null if 'tc' does not have queues or its queues
241 * cannot be deleted. */
242 int (*class_delete)(struct netdev *, unsigned int queue_id);
244 /* Obtains stats for 'queue_id' from 'netdev->tc'. The caller ensures that
245 * 'queues' has a nonnull value for index 'queue_id'.
247 * On success, initializes '*stats'.
249 * This function may be null if 'tc' does not have queues or if it cannot
250 * report queue statistics. */
251 int (*class_get_stats)(const struct netdev *netdev, unsigned int queue_id,
252 struct netdev_queue_stats *stats);
254 /* Extracts queue stats from 'nlmsg', which is a response to a
255 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
257 * This function may be null if 'tc' does not have queues or if it cannot
258 * report queue statistics. */
259 int (*class_dump_stats)(const struct netdev *netdev,
260 const struct ofpbuf *nlmsg,
261 netdev_dump_queue_stats_cb *cb, void *aux);
/* Prepares the generic part of 'tc' by initializing its 'queues' port_array.
 *
 * NOTE(review): 'ops' is presumably recorded in 'tc' as well; that statement
 * is not visible in this listing -- confirm. */
265 tc_init(struct tc *tc, const struct tc_ops *ops)
268 port_array_init(&tc->queues);
/* Releases the generic part of 'tc' by destroying its 'queues' port_array. */
272 tc_destroy(struct tc *tc)
274 port_array_destroy(&tc->queues);
277 static const struct tc_ops tc_ops_htb;
278 static const struct tc_ops tc_ops_default;
279 static const struct tc_ops tc_ops_other;
281 static const struct tc_ops *tcs[] = {
282 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
283 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
284 &tc_ops_other, /* Some other qdisc. */
288 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
289 static unsigned int tc_get_major(unsigned int handle);
290 static unsigned int tc_get_minor(unsigned int handle);
292 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
293 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
294 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
296 static struct tcmsg *tc_make_request(const struct netdev *, int type,
297 unsigned int flags, struct ofpbuf *);
298 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
300 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
301 struct nlattr **options);
302 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
303 struct nlattr **options,
304 struct netdev_queue_stats *);
305 static int tc_query_class(const struct netdev *,
306 unsigned int handle, unsigned int parent,
307 struct ofpbuf **replyp);
308 static int tc_delete_class(const struct netdev *, unsigned int handle);
310 static int tc_del_qdisc(struct netdev *netdev);
311 static int tc_query_qdisc(const struct netdev *netdev);
313 static int tc_calc_cell_log(unsigned int mtu);
314 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
315 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
316 const struct tc_ratespec *rate);
317 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
319 struct netdev_dev_linux {
320 struct netdev_dev netdev_dev;
322 struct shash_node *shash_node;
323 unsigned int cache_valid;
325 /* The following are figured out "on demand" only. They are only valid
326 * when the corresponding VALID_* bit in 'cache_valid' is set. */
328 uint8_t etheraddr[ETH_ADDR_LEN];
329 struct in_addr address, netmask;
333 bool is_internal; /* Is this an openvswitch internal device? */
334 bool is_tap; /* Is this a tuntap device? */
335 uint32_t kbits_rate; /* Policing data. */
336 uint32_t kbits_burst;
337 bool have_vport_stats;
341 struct tap_state tap;
345 struct netdev_linux {
346 struct netdev netdev;
350 /* An AF_INET socket (used for ioctl operations). */
351 static int af_inet_sock = -1;
353 /* A Netlink routing socket that is not subscribed to any multicast groups. */
354 static struct nl_sock *rtnl_sock;
356 struct netdev_linux_notifier {
357 struct netdev_notifier notifier;
361 static struct shash netdev_linux_notifiers =
362 SHASH_INITIALIZER(&netdev_linux_notifiers);
363 static struct rtnetlink_notifier netdev_linux_poll_notifier;
365 /* This is set pretty low because we probably won't learn anything from the
366 * additional log messages. */
367 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
369 static int netdev_linux_init(void);
371 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
372 int cmd, const char *cmd_name);
373 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
374 const char *cmd_name);
375 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
376 int cmd, const char *cmd_name);
377 static int get_flags(const struct netdev *, int *flagsp);
378 static int set_flags(struct netdev *, int flags);
379 static int do_get_ifindex(const char *netdev_name);
380 static int get_ifindex(const struct netdev *, int *ifindexp);
381 static int do_set_addr(struct netdev *netdev,
382 int ioctl_nr, const char *ioctl_name,
383 struct in_addr addr);
384 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
385 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
386 const uint8_t[ETH_ADDR_LEN]);
387 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
388 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
/* Tells whether 'netdev_class' is implemented by this file.  The test is
 * whether the class's 'init' hook is netdev_linux_init(). */
391 is_netdev_linux_class(const struct netdev_class *netdev_class)
393 return netdev_class->init == netdev_linux_init;
/* Converts 'netdev_dev' to the struct netdev_dev_linux that contains it.
 * Asserts that the device's class passes is_netdev_linux_class(). */
396 static struct netdev_dev_linux *
397 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
399 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
400 assert(is_netdev_linux_class(netdev_class));
402 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts 'netdev' to the struct netdev_linux that contains it.  Asserts
 * that the class of the underlying device passes is_netdev_linux_class(). */
405 static struct netdev_linux *
406 netdev_linux_cast(const struct netdev *netdev)
408 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
409 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
410 assert(is_netdev_linux_class(netdev_class));
412 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Sets up the two file-scope sockets used by this file: 'af_inet_sock' (an
 * AF_INET datagram socket) and 'rtnl_sock' (a NETLINK_ROUTE socket).  A
 * failure of either step is logged.
 *
 * 'status' is static, so its value survives between calls.
 * NOTE(review): presumably it caches the first result so the sockets are
 * created only once; the guarding conditions and the return statement are not
 * visible in this listing -- confirm. */
416 netdev_linux_init(void)
418 static int status = -1;
420 /* Create AF_INET socket. */
421 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
422 status = af_inet_sock >= 0 ? 0 : errno;
424 VLOG_ERR("failed to create inet socket: %s", strerror(status));
427 /* Create rtnetlink socket. */
429 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
431 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic work for this provider: hands control to
 * rtnetlink_notifier_run(). */
440 netdev_linux_run(void)
442 rtnetlink_notifier_run();
/* Poll-loop wait hook for this provider: hands control to
 * rtnetlink_notifier_wait(). */
446 netdev_linux_wait(void)
448 rtnetlink_notifier_wait();
/* rtnetlink change callback (registered by netdev_linux_create_system()) that
 * invalidates cached device attributes by zeroing 'cache_valid'.
 *
 * Two paths are visible: one looks up the single device named by
 * 'change->ifname' and clears its cache if it belongs to a Linux class; the
 * other collects every device of netdev_linux_class into a temporary shash
 * and clears each one's cache.
 *
 * NOTE(review): the condition that selects between the two paths is not
 * visible in this listing -- confirm.  'aux' is unused. */
452 netdev_linux_cache_cb(const struct rtnetlink_change *change,
453 void *aux OVS_UNUSED)
455 struct netdev_dev_linux *dev;
457 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
459 const struct netdev_class *netdev_class =
460 netdev_dev_get_class(base_dev);
462 if (is_netdev_linux_class(netdev_class)) {
463 dev = netdev_dev_linux_cast(base_dev);
464 dev->cache_valid = 0;
468 struct shash device_shash;
469 struct shash_node *node;
471 shash_init(&device_shash);
472 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
473 SHASH_FOR_EACH (node, &device_shash) {
475 dev->cache_valid = 0;
477 shash_destroy(&device_shash);
481 /* Creates a netdev device named 'name' of class netdev_linux_class and stores
 * it in '*netdev_devp'.  'type' is unused.
 *
 * 'args' is expected to be empty; a warning is logged otherwise.
 *
 * The first device created while 'cache_notifier_refcount' is zero registers
 * netdev_linux_cache_cb() as an rtnetlink notifier; every creation increments
 * the count (netdev_linux_destroy() decrements it).
 *
 * NOTE(review): error handling for the registration and the return statements
 * are not visible in this listing. */
483 netdev_linux_create_system(const char *name, const char *type OVS_UNUSED,
484 const struct shash *args, struct netdev_dev **netdev_devp)
486 struct netdev_dev_linux *netdev_dev;
489 if (!shash_is_empty(args)) {
490 VLOG_WARN("%s: arguments for system devices should be empty", name);
493 if (!cache_notifier_refcount) {
494 error = rtnetlink_notifier_register(&netdev_linux_cache_notifier,
495 netdev_linux_cache_cb, NULL);
500 cache_notifier_refcount++;
502 netdev_dev = xzalloc(sizeof *netdev_dev);
503 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_linux_class);
505 *netdev_devp = &netdev_dev->netdev_dev;
509 /* For most types of netdevs we open the device for each call of
510 * netdev_open(). However, this is not the case with tap devices,
511 * since it is only possible to open the device once. In this
512 * situation we share a single file descriptor, and consequently
513 * buffers, across all readers. Therefore once data is read it will
514 * be unavailable to other reads for tap devices.
 *
 * Creates a tap device named 'name' of class netdev_tap_class and stores it
 * in '*netdev_devp'.  'type' is unused and 'args' is expected to be empty (a
 * warning is logged otherwise).  The steps are: open "/dev/net/tun", issue
 * TUNSETIFF with IFF_TAP | IFF_NO_PI, and make the descriptor non-blocking.
 * The descriptor is kept in the device's tap state.
 *
 * NOTE(review): strncpy() leaves 'ifr.ifr_name' without a terminating null
 * byte when 'name' is at least as long as the field.  The initialization of
 * 'ifr' and the error paths are not visible in this listing -- confirm that
 * over-long names are rejected or terminated elsewhere. */
516 netdev_linux_create_tap(const char *name, const char *type OVS_UNUSED,
517 const struct shash *args, struct netdev_dev **netdev_devp)
519 struct netdev_dev_linux *netdev_dev;
520 struct tap_state *state;
521 static const char tap_dev[] = "/dev/net/tun";
525 if (!shash_is_empty(args)) {
526 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
529 netdev_dev = xzalloc(sizeof *netdev_dev);
530 state = &netdev_dev->state.tap;
532 /* Open tap device. */
533 state->fd = open(tap_dev, O_RDWR);
536 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
540 /* Create tap device. */
541 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
542 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
543 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
544 VLOG_WARN("%s: creating tap device failed: %s", name,
550 /* Make non-blocking. */
551 error = set_nonblocking(state->fd);
556 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
557 *netdev_devp = &netdev_dev->netdev_dev;
/* Tears down the tap-specific state of 'netdev_dev'.  Acts only when the tap
 * descriptor is valid (non-negative).
 *
 * NOTE(review): the guarded action (presumably closing the descriptor) is not
 * visible in this listing. */
566 destroy_tap(struct netdev_dev_linux *netdev_dev)
568 struct tap_state *state = &netdev_dev->state.tap;
570 if (state->fd >= 0) {
575 /* Destroys the netdev device 'netdev_dev_'.
 *
 * Any traffic-control state is released through its optional 'tc_destroy'
 * hook.  For a "system" device the cache-notifier reference count is dropped
 * and the rtnetlink notifier is unregistered when the count reaches zero; for
 * a "tap" device the tap state is torn down with destroy_tap(). */
577 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
579 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
580 const char *type = netdev_dev_get_type(netdev_dev_);
/* 'tc' may be absent and 'tc_destroy' may be null, so check both. */
582 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
583 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
586 if (!strcmp(type, "system")) {
587 cache_notifier_refcount--;
/* Last "system" device gone: stop listening for rtnetlink changes. */
589 if (!cache_notifier_refcount) {
590 rtnetlink_notifier_unregister(&netdev_linux_cache_notifier);
592 } else if (!strcmp(type, "tap")) {
593 destroy_tap(netdev_dev);
/* Opens a new handle on 'netdev_dev_' and stores it in '*netdevp'.
 *
 * The first opener of a "tap" device receives the shared tap descriptor.
 * Otherwise, unless 'ethertype' is NETDEV_ETH_TYPE_NONE, a PF_PACKET raw
 * socket is created for the requested protocol, made non-blocking, bound to
 * the device's ifindex and drained of packets that arrived before the bind.
 *
 * NOTE(review): the error-path branches and return statements are not
 * visible in this listing; netdev_uninit() near the end is presumably the
 * failure cleanup -- confirm. */
600 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
601 struct netdev **netdevp)
603 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
604 struct netdev_linux *netdev;
605 enum netdev_flags flags;
608 /* Allocate network device. */
609 netdev = xzalloc(sizeof *netdev);
611 netdev_init(&netdev->netdev, netdev_dev_);
/* Querying the flags doubles as an existence check (ENODEV). */
613 error = netdev_get_flags(&netdev->netdev, &flags);
614 if (error == ENODEV) {
618 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
619 !netdev_dev->state.tap.opened) {
621 /* We assume that the first user of the tap device is the primary user
622 * and give them the tap FD. Subsequent users probably just expect
623 * this to be a system device so open it normally to avoid send/receive
624 * directions appearing to be reversed. */
625 netdev->fd = netdev_dev->state.tap.fd;
626 netdev_dev->state.tap.opened = true;
627 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
628 struct sockaddr_ll sll;
632 /* Create file descriptor. */
633 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
634 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
636 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
637 if (netdev->fd < 0) {
642 /* Set non-blocking mode. */
643 error = set_nonblocking(netdev->fd);
648 /* Get ethernet device index. */
649 error = get_ifindex(&netdev->netdev, &ifindex);
654 /* Bind to specific ethernet device. */
655 memset(&sll, 0, sizeof sll);
656 sll.sll_family = AF_PACKET;
657 sll.sll_ifindex = ifindex;
659 (struct sockaddr *) &sll, sizeof sll) < 0) {
661 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
666 /* Between the socket() and bind() calls above, the socket receives all
667 * packets of the requested type on all system interfaces. We do not
668 * want to receive that data, but there is no way to avoid it. So we
669 * must now drain out the receive queue. */
670 error = drain_rcvbuf(netdev->fd);
676 *netdevp = &netdev->netdev;
680 netdev_uninit(&netdev->netdev, true);
684 /* Closes and destroys 'netdev'.
 *
 * The descriptor-related cleanup is performed only for devices whose type is
 * not "tap" and only when 'fd' is a valid descriptor.  Any non-negative value
 * is a valid descriptor, 0 included, so the test is 'fd >= 0'; this matches
 * the other functions in this file, which treat only a negative 'fd' as "no
 * descriptor".
 *
 * NOTE(review): the guarded statement (presumably close()) and the release
 * of 'netdev' itself are not visible in this listing. */
686 netdev_linux_close(struct netdev *netdev_)
688 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
690 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
696 /* Adds to 'svec' the name of every network device reported by
 * if_nameindex(), then releases the list with if_freenameindex().  A warning
 * is logged when the list of names cannot be obtained.
 *
 * NOTE(review): the success/failure branch structure and the return
 * statements are not visible in this listing. */
698 netdev_linux_enumerate(struct svec *svec)
700 struct if_nameindex *names;
702 names = if_nameindex();
706 for (i = 0; names[i].if_name != NULL; i++) {
707 svec_add(svec, names[i].if_name);
709 if_freenameindex(names);
712 VLOG_WARN("could not obtain list of network device names: %s",
/* Attempts to receive one packet from 'netdev_' into the 'size' bytes at
 * 'data' by read()ing the device's file descriptor.
 *
 * A negative descriptor means the device was opened with
 * NETDEV_ETH_TYPE_NONE, so there is nothing to read from.  A read() failure
 * other than EINTR or EAGAIN is reported through a rate-limited warning whose
 * arguments follow the format string: device name first, error text second
 * (the same order netdev_linux_send() uses).
 *
 * NOTE(review): the return statements are not visible in this listing;
 * presumably the byte count on success and a negative errno value on
 * failure -- confirm. */
719 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
721 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
723 if (netdev->fd < 0) {
724 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
729 ssize_t retval = read(netdev->fd, data, size);
732 } else if (errno != EINTR) {
733 if (errno != EAGAIN) {
734 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
735 netdev_get_name(netdev_), strerror(errno));
742 /* Registers with the poll loop to wake up from the next call to poll_block()
743 * when a packet is ready to be received with netdev_recv() on 'netdev'.
 *
 * Nothing is registered when the handle has no descriptor ('fd' is
 * negative). */
745 netdev_linux_recv_wait(struct netdev *netdev_)
747 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
748 if (netdev->fd >= 0) {
749 poll_fd_wait(netdev->fd, POLLIN);
753 /* Discards all packets waiting to be received from 'netdev'.
 *
 * Three cases are distinguished: a handle without a descriptor ('fd' is
 * negative); a "tap" device, for which the transmit queue length is obtained
 * with SIOCGIFTXQLEN and that many packets are drained with drain_fd(); and
 * any other device, whose socket receive buffer is drained with
 * drain_rcvbuf().
 *
 * NOTE(review): the return values of the first two cases are not visible in
 * this listing. */
755 netdev_linux_drain(struct netdev *netdev_)
757 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
758 if (netdev->fd < 0) {
760 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
762 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
763 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
767 drain_fd(netdev->fd, ifr.ifr_qlen);
770 return drain_rcvbuf(netdev->fd);
774 /* Sends the 'size' bytes at 'data' on 'netdev'. Returns 0 if successful,
775 * otherwise a positive errno value. Returns EAGAIN without blocking if the
776 * packet cannot be queued immediately. Returns EMSGSIZE if a partial packet
777 * was transmitted or if the packet is too big or too small to transmit on the
 * device.
779 * The caller retains ownership of 'data' in all cases.
781 * The kernel maintains a packet transmission queue, so the caller is not
782 * expected to do additional queuing of packets. */
784 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
786 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
788 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
790 if (netdev->fd < 0) {
795 ssize_t retval = write(netdev->fd, data, size);
797 /* The Linux AF_PACKET implementation never blocks waiting for room
798 * for packets, instead returning ENOBUFS. Translate this into
799 * EAGAIN for the caller. */
800 if (errno == ENOBUFS) {
802 } else if (errno == EINTR) {
804 } else if (errno != EAGAIN) {
805 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
806 netdev_get_name(netdev_), strerror(errno));
/* A short write is logged; the header comment maps it to EMSGSIZE. */
809 } else if (retval != size) {
810 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
811 "%zu) on %s", retval, size, netdev_get_name(netdev_));
819 /* Registers with the poll loop to wake up from the next call to poll_block()
820 * when the packet transmission queue has sufficient room to transmit a packet
821 * with netdev_send().
823 * The kernel maintains a packet transmission queue, so the client is not
824 * expected to do additional queuing of packets. Thus, this function is
825 * unlikely to ever be used. It is included for completeness.
 *
 * A handle without a descriptor ('fd' is negative) is handled separately.
 * Non-"tap" devices wait for POLLOUT on the descriptor; "tap" devices cause
 * an immediate wake-up. */
827 netdev_linux_send_wait(struct netdev *netdev_)
829 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
830 if (netdev->fd < 0) {
/* strcmp() is nonzero here, i.e. the device is not a tap device. */
832 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
833 poll_fd_wait(netdev->fd, POLLOUT);
835 /* TAP device always accepts packets.*/
836 poll_immediate_wake();
840 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
841 * otherwise a positive errno value.
 *
 * The kernel is asked to change the address only when no address is cached
 * or the cached address differs from 'mac'.  The cache is then updated with
 * 'mac' and marked valid. */
843 netdev_linux_set_etheraddr(struct netdev *netdev_,
844 const uint8_t mac[ETH_ADDR_LEN])
846 struct netdev_dev_linux *netdev_dev =
847 netdev_dev_linux_cast(netdev_get_dev(netdev_));
850 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
851 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
852 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
854 netdev_dev->cache_valid |= VALID_ETHERADDR;
855 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
863 /* Copies 'netdev''s MAC address into the caller-provided buffer 'mac'
864 * (ETH_ADDR_LEN bytes).
 *
 * The address is fetched with get_etheraddr() only when it is not already
 * cached; afterwards the cached copy is marked valid and reused.
 *
 * NOTE(review): the return statements are not visible in this listing;
 * presumably 0 on success and the get_etheraddr() error otherwise. */
866 netdev_linux_get_etheraddr(const struct netdev *netdev_,
867 uint8_t mac[ETH_ADDR_LEN])
869 struct netdev_dev_linux *netdev_dev =
870 netdev_dev_linux_cast(netdev_get_dev(netdev_));
871 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
872 int error = get_etheraddr(netdev_get_name(netdev_),
873 netdev_dev->etheraddr);
877 netdev_dev->cache_valid |= VALID_ETHERADDR;
879 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
883 /* Stores in '*mtup' the maximum size of transmitted (and received) packets
884 * on 'netdev', in bytes, not including the hardware header; thus, this is
885 * typically 1500 bytes for Ethernet devices.
 *
 * The value is obtained with the SIOCGIFMTU ioctl only when it is not
 * already cached; afterwards the cached copy is marked valid and reused.
 *
 * NOTE(review): the return statements are not visible in this listing;
 * presumably 0 on success and the ioctl error otherwise. */
887 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
889 struct netdev_dev_linux *netdev_dev =
890 netdev_dev_linux_cast(netdev_get_dev(netdev_));
891 if (!(netdev_dev->cache_valid & VALID_MTU)) {
895 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
896 SIOCGIFMTU, "SIOCGIFMTU");
900 netdev_dev->mtu = ifr.ifr_mtu;
901 netdev_dev->cache_valid |= VALID_MTU;
903 *mtup = netdev_dev->mtu;
907 /* Returns the ifindex of 'netdev', if successful, as a positive number.
908 * On failure, returns a negative errno value.
 *
 * This wraps get_ifindex(), which reports a positive errno value separately
 * from the index; the two are folded into a single signed result here. */
910 netdev_linux_get_ifindex(const struct netdev *netdev)
914 error = get_ifindex(netdev, &ifindex);
915 return error ? -error : ifindex;
/* Stores in '*carrier' whether 'netdev_' has carrier.
 *
 * When no value is cached, the first character of
 * "/sys/class/net/<name>/carrier" is read: '0' means no carrier, '1' means
 * carrier, and anything else is logged as unexpected.  Open failures, read
 * failures other than EINVAL, and an empty file are logged as well.  A
 * successfully parsed value is cached and marked valid.
 *
 * NOTE(review): the cleanup of 'fn' and 'fd' and the return statements are
 * not visible in this listing. */
919 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
921 struct netdev_dev_linux *netdev_dev =
922 netdev_dev_linux_cast(netdev_get_dev(netdev_));
927 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
931 fn = xasprintf("/sys/class/net/%s/carrier",
932 netdev_get_name(netdev_));
933 fd = open(fn, O_RDONLY);
936 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
940 retval = read(fd, line, sizeof line);
943 if (error == EINVAL) {
944 /* This is the normal return value when we try to check carrier
945 * if the network device is not up. */
947 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
950 } else if (retval == 0) {
952 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
/* Only the first byte matters; it must be the character '0' or '1'. */
956 if (line[0] != '0' && line[0] != '1') {
958 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
962 netdev_dev->carrier = line[0] != '0';
963 netdev_dev->cache_valid |= VALID_CARRIER;
965 *carrier = netdev_dev->carrier;
976 /* Check whether we can use RTM_GETLINK to get network device statistics.
977 * In pre-2.6.19 kernels, this was only available if wireless extensions were
980 check_for_working_netlink_stats(void)
982 /* Decide on the netdev_get_stats() implementation to use. Netlink is
983 * preferable, so if that works, we'll use it. */
984 int ifindex = do_get_ifindex("lo");
986 VLOG_WARN("failed to get ifindex for lo, "
987 "obtaining netdev stats from proc");
990 struct netdev_stats stats;
991 int error = get_stats_via_netlink(ifindex, &stats);
993 VLOG_DBG("obtaining netdev stats via rtnetlink");
996 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
997 "via proc (you are probably running a pre-2.6.19 "
998 "kernel)", strerror(error));
1004 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date.
 *
 * Does nothing when VALID_IS_PSEUDO is already set in 'cache_valid'.
 * Otherwise 'is_tap' is true exactly when the device type is "tap".  For any
 * other type the driver name is queried with ETHTOOL_GDRVINFO, and
 * 'is_internal' is true exactly when that query succeeds and the driver is
 * "openvswitch".  The result is then marked valid. */
1006 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1008 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1009 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1010 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1012 netdev_dev->is_tap = !strcmp(type, "tap");
1013 netdev_dev->is_internal = false;
1014 if (!netdev_dev->is_tap) {
1015 struct ethtool_drvinfo drvinfo;
1018 memset(&drvinfo, 0, sizeof drvinfo);
/* The ethtool helper takes a struct ethtool_cmd pointer, hence the cast. */
1019 error = netdev_linux_do_ethtool(name,
1020 (struct ethtool_cmd *)&drvinfo,
1022 "ETHTOOL_GDRVINFO");
1024 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1025 netdev_dev->is_internal = true;
1029 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1034 swap_uint64(uint64_t *a, uint64_t *b)
1041 /* Retrieves current device stats for 'netdev' into '*stats'.
 *
 * The vport layer is asked first, both when it is known to work for this
 * device and when it has never been tried; whether it worked is cached in
 * 'have_vport_stats'.  Without vport stats, the function falls back to
 * RTM_GETLINK (if check_for_working_netlink_stats() approved it, a decision
 * made once and kept in the static 'use_netlink_stats') or to
 * get_stats_via_proc().
 *
 * For internal and tap devices that did not use vport stats, the rx and tx
 * packet, byte, error and drop counters are exchanged and the detailed error
 * counters are zeroed.
 *
 * NOTE(review): the return statement is not visible in this listing. */
1043 netdev_linux_get_stats(const struct netdev *netdev_,
1044 struct netdev_stats *stats)
1046 struct netdev_dev_linux *netdev_dev =
1047 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1048 static int use_netlink_stats = -1;
1051 COVERAGE_INC(netdev_get_stats);
/* Try the vport layer if it worked before or has not been probed yet. */
1053 if (netdev_dev->have_vport_stats ||
1054 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1056 error = netdev_vport_get_stats(netdev_, stats);
1057 netdev_dev->have_vport_stats = !error;
1058 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1061 if (!netdev_dev->have_vport_stats) {
/* A negative value means the netlink-vs-proc choice is still undecided. */
1062 if (use_netlink_stats < 0) {
1063 use_netlink_stats = check_for_working_netlink_stats();
1065 if (use_netlink_stats) {
1068 error = get_ifindex(netdev_, &ifindex);
1070 error = get_stats_via_netlink(ifindex, stats);
1073 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1077 /* If this port is an internal port then the transmit and receive stats
1078 * will appear to be swapped relative to the other ports since we are the
1079 * one sending the data, not a remote computer. For consistency, we swap
1080 * them back here. This does not apply if we are getting stats from the
1081 * vport layer because it always tracks stats from the perspective of the
1083 netdev_linux_update_is_pseudo(netdev_dev);
1084 if (!error && !netdev_dev->have_vport_stats &&
1085 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1086 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1087 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1088 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1089 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1090 stats->rx_length_errors = 0;
1091 stats->rx_over_errors = 0;
1092 stats->rx_crc_errors = 0;
1093 stats->rx_frame_errors = 0;
1094 stats->rx_fifo_errors = 0;
1095 stats->rx_missed_errors = 0;
1096 stats->tx_aborted_errors = 0;
1097 stats->tx_carrier_errors = 0;
1098 stats->tx_fifo_errors = 0;
1099 stats->tx_heartbeat_errors = 0;
1100 stats->tx_window_errors = 0;
1106 /* Stores the features supported by 'netdev' into each of '*current',
1107 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1108 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1109 * successful, otherwise a positive errno value.
 *
 * The data comes from one ETHTOOL_GSET query: the 'supported' and
 * 'advertising' masks are translated bit by bit into OFPPF_* flags, the
 * current setting is derived from the reported speed, duplex and port type,
 * and the peer bitmap is always 0.
 *
 * NOTE(review): the visible statements write through 'supported',
 * 'advertised', 'current' and 'peer' without a null check, which does not
 * match the "that are non-null" wording above.  Either guards exist on lines
 * not shown in this listing or callers must pass all four pointers --
 * confirm. */
1111 netdev_linux_get_features(struct netdev *netdev,
1112 uint32_t *current, uint32_t *advertised,
1113 uint32_t *supported, uint32_t *peer)
1115 struct ethtool_cmd ecmd;
1118 memset(&ecmd, 0, sizeof ecmd);
1119 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1120 ETHTOOL_GSET, "ETHTOOL_GSET");
1125 /* Supported features. */
1127 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1128 *supported |= OFPPF_10MB_HD;
1130 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1131 *supported |= OFPPF_10MB_FD;
1133 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1134 *supported |= OFPPF_100MB_HD;
1136 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1137 *supported |= OFPPF_100MB_FD;
1139 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1140 *supported |= OFPPF_1GB_HD;
1142 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1143 *supported |= OFPPF_1GB_FD;
1145 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1146 *supported |= OFPPF_10GB_FD;
1148 if (ecmd.supported & SUPPORTED_TP) {
1149 *supported |= OFPPF_COPPER;
1151 if (ecmd.supported & SUPPORTED_FIBRE) {
1152 *supported |= OFPPF_FIBER;
1154 if (ecmd.supported & SUPPORTED_Autoneg) {
1155 *supported |= OFPPF_AUTONEG;
1157 if (ecmd.supported & SUPPORTED_Pause) {
1158 *supported |= OFPPF_PAUSE;
1160 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1161 *supported |= OFPPF_PAUSE_ASYM;
1164 /* Advertised features. */
1166 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1167 *advertised |= OFPPF_10MB_HD;
1169 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1170 *advertised |= OFPPF_10MB_FD;
1172 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1173 *advertised |= OFPPF_100MB_HD;
1175 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1176 *advertised |= OFPPF_100MB_FD;
1178 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1179 *advertised |= OFPPF_1GB_HD;
1181 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1182 *advertised |= OFPPF_1GB_FD;
1184 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1185 *advertised |= OFPPF_10GB_FD;
1187 if (ecmd.advertising & ADVERTISED_TP) {
1188 *advertised |= OFPPF_COPPER;
1190 if (ecmd.advertising & ADVERTISED_FIBRE) {
1191 *advertised |= OFPPF_FIBER;
1193 if (ecmd.advertising & ADVERTISED_Autoneg) {
1194 *advertised |= OFPPF_AUTONEG;
1196 if (ecmd.advertising & ADVERTISED_Pause) {
1197 *advertised |= OFPPF_PAUSE;
1199 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1200 *advertised |= OFPPF_PAUSE_ASYM;
1203 /* Current settings. */
1204 if (ecmd.speed == SPEED_10) {
1205 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1206 } else if (ecmd.speed == SPEED_100) {
1207 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1208 } else if (ecmd.speed == SPEED_1000) {
1209 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1210 } else if (ecmd.speed == SPEED_10000) {
1211 *current = OFPPF_10GB_FD;
1216 if (ecmd.port == PORT_TP) {
1217 *current |= OFPPF_COPPER;
1218 } else if (ecmd.port == PORT_FIBRE) {
1219 *current |= OFPPF_FIBER;
1223 *current |= OFPPF_AUTONEG;
1226 /* Peer advertisements. */
1227 *peer = 0; /* XXX */
1232 /* Set the features advertised by 'netdev' to 'advertise'. */
1234 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1236 struct ethtool_cmd ecmd;
1239 memset(&ecmd, 0, sizeof ecmd);
1240 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1241 ETHTOOL_GSET, "ETHTOOL_GSET");
1246 ecmd.advertising = 0;
1247 if (advertise & OFPPF_10MB_HD) {
1248 ecmd.advertising |= ADVERTISED_10baseT_Half;
1250 if (advertise & OFPPF_10MB_FD) {
1251 ecmd.advertising |= ADVERTISED_10baseT_Full;
1253 if (advertise & OFPPF_100MB_HD) {
1254 ecmd.advertising |= ADVERTISED_100baseT_Half;
1256 if (advertise & OFPPF_100MB_FD) {
1257 ecmd.advertising |= ADVERTISED_100baseT_Full;
1259 if (advertise & OFPPF_1GB_HD) {
1260 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1262 if (advertise & OFPPF_1GB_FD) {
1263 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1265 if (advertise & OFPPF_10GB_FD) {
1266 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1268 if (advertise & OFPPF_COPPER) {
1269 ecmd.advertising |= ADVERTISED_TP;
1271 if (advertise & OFPPF_FIBER) {
1272 ecmd.advertising |= ADVERTISED_FIBRE;
1274 if (advertise & OFPPF_AUTONEG) {
1275 ecmd.advertising |= ADVERTISED_Autoneg;
1277 if (advertise & OFPPF_PAUSE) {
1278 ecmd.advertising |= ADVERTISED_Pause;
1280 if (advertise & OFPPF_PAUSE_ASYM) {
1281 ecmd.advertising |= ADVERTISED_Asym_Pause;
1283 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1284 ETHTOOL_SSET, "ETHTOOL_SSET");
1287 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1288 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
 * and returns 0.  Otherwise returns an errno value (specifically ENOENT if
1290 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1291 * sets '*vlan_vid' to -1. */
1293 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1295 const char *netdev_name = netdev_get_name(netdev);
1296 struct ds line = DS_EMPTY_INITIALIZER;
1297 FILE *stream = NULL;
1301 COVERAGE_INC(netdev_get_vlan_vid);
1302 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1303 stream = fopen(fn, "r");
1309 if (ds_get_line(&line, stream)) {
1310 if (ferror(stream)) {
1312 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1315 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1320 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1322 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1323 fn, ds_cstr(&line));
1341 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1342 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
1344 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1345 * positive errno value.
1347 * This function is equivalent to running
1348 * /sbin/tc qdisc del dev %s handle ffff: ingress
1349 * but it is much, much faster.
1352 netdev_linux_remove_policing(struct netdev *netdev)
1354 struct netdev_dev_linux *netdev_dev =
1355 netdev_dev_linux_cast(netdev_get_dev(netdev));
1356 const char *netdev_name = netdev_get_name(netdev);
1358 struct ofpbuf request;
1359 struct tcmsg *tcmsg;
1362 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1363 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1364 tcmsg->tcm_parent = TC_H_INGRESS;
1365 nl_msg_put_string(&request, TCA_KIND, "ingress");
1366 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1368 error = tc_transact(&request, NULL);
1369 if (error && error != ENOENT && error != EINVAL) {
1370 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1371 netdev_name, strerror(error));
1375 netdev_dev->kbits_rate = 0;
1376 netdev_dev->kbits_burst = 0;
1377 netdev_dev->cache_valid |= VALID_POLICING;
1381 /* Attempts to set input rate limiting (policing) policy. */
1383 netdev_linux_set_policing(struct netdev *netdev,
1384 uint32_t kbits_rate, uint32_t kbits_burst)
1386 struct netdev_dev_linux *netdev_dev =
1387 netdev_dev_linux_cast(netdev_get_dev(netdev));
1388 const char *netdev_name = netdev_get_name(netdev);
1391 COVERAGE_INC(netdev_set_policing);
1393 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1394 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1395 : kbits_burst); /* Stick with user-specified value. */
1397 if (netdev_dev->cache_valid & VALID_POLICING
1398 && netdev_dev->kbits_rate == kbits_rate
1399 && netdev_dev->kbits_burst == kbits_burst) {
1400 /* Assume that settings haven't changed since we last set them. */
1404 netdev_linux_remove_policing(netdev);
1406 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1407 if (system(command) != 0) {
1408 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1412 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1413 kbits_rate, kbits_burst);
1414 if (system(command) != 0) {
1415 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1420 netdev_dev->kbits_rate = kbits_rate;
1421 netdev_dev->kbits_burst = kbits_burst;
1422 netdev_dev->cache_valid |= VALID_POLICING;
1429 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1432 const struct tc_ops **opsp;
1434 for (opsp = tcs; *opsp != NULL; opsp++) {
1435 const struct tc_ops *ops = *opsp;
1436 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1437 svec_add(types, ops->ovs_name);
1443 static const struct tc_ops *
1444 tc_lookup_ovs_name(const char *name)
1446 const struct tc_ops **opsp;
1448 for (opsp = tcs; *opsp != NULL; opsp++) {
1449 const struct tc_ops *ops = *opsp;
1450 if (!strcmp(name, ops->ovs_name)) {
1457 static const struct tc_ops *
1458 tc_lookup_linux_name(const char *name)
1460 const struct tc_ops **opsp;
1462 for (opsp = tcs; *opsp != NULL; opsp++) {
1463 const struct tc_ops *ops = *opsp;
1464 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1472 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1474 struct netdev_qos_capabilities *caps)
1476 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1480 caps->n_queues = ops->n_queues;
1485 netdev_linux_get_qos(const struct netdev *netdev,
1486 const char **typep, struct shash *details)
1488 struct netdev_dev_linux *netdev_dev =
1489 netdev_dev_linux_cast(netdev_get_dev(netdev));
1492 error = tc_query_qdisc(netdev);
1497 *typep = netdev_dev->tc->ops->ovs_name;
1498 return (netdev_dev->tc->ops->qdisc_get
1499 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1504 netdev_linux_set_qos(struct netdev *netdev,
1505 const char *type, const struct shash *details)
1507 struct netdev_dev_linux *netdev_dev =
1508 netdev_dev_linux_cast(netdev_get_dev(netdev));
1509 const struct tc_ops *new_ops;
1512 new_ops = tc_lookup_ovs_name(type);
1513 if (!new_ops || !new_ops->tc_install) {
1517 error = tc_query_qdisc(netdev);
1522 if (new_ops == netdev_dev->tc->ops) {
1523 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1525 /* Delete existing qdisc. */
1526 error = tc_del_qdisc(netdev);
1530 assert(netdev_dev->tc == NULL);
1532 /* Install new qdisc. */
1533 error = new_ops->tc_install(netdev, details);
1534 assert((error == 0) == (netdev_dev->tc != NULL));
1541 netdev_linux_get_queue(const struct netdev *netdev,
1542 unsigned int queue_id, struct shash *details)
1544 struct netdev_dev_linux *netdev_dev =
1545 netdev_dev_linux_cast(netdev_get_dev(netdev));
1548 error = tc_query_qdisc(netdev);
1551 } else if (queue_id > UINT16_MAX
1552 || !port_array_get(&netdev_dev->tc->queues, queue_id)) {
1556 return netdev_dev->tc->ops->class_get(netdev, queue_id, details);
1560 netdev_linux_set_queue(struct netdev *netdev,
1561 unsigned int queue_id, const struct shash *details)
1563 struct netdev_dev_linux *netdev_dev =
1564 netdev_dev_linux_cast(netdev_get_dev(netdev));
1567 error = tc_query_qdisc(netdev);
1570 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1571 || !netdev_dev->tc->ops->class_set) {
1575 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1579 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1581 struct netdev_dev_linux *netdev_dev =
1582 netdev_dev_linux_cast(netdev_get_dev(netdev));
1585 error = tc_query_qdisc(netdev);
1588 } else if (!netdev_dev->tc->ops->class_delete) {
1590 } else if (queue_id > UINT16_MAX
1591 || !port_array_get(&netdev_dev->tc->queues, queue_id)) {
1595 return netdev_dev->tc->ops->class_delete(netdev, queue_id);
1599 netdev_linux_get_queue_stats(const struct netdev *netdev,
1600 unsigned int queue_id,
1601 struct netdev_queue_stats *stats)
1603 struct netdev_dev_linux *netdev_dev =
1604 netdev_dev_linux_cast(netdev_get_dev(netdev));
1607 error = tc_query_qdisc(netdev);
1610 } else if (queue_id > UINT16_MAX
1611 || !port_array_get(&netdev_dev->tc->queues, queue_id)) {
1613 } else if (!netdev_dev->tc->ops->class_get_stats) {
1617 return netdev_dev->tc->ops->class_get_stats(netdev, queue_id, stats);
1621 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1623 struct ofpbuf request;
1624 struct tcmsg *tcmsg;
1626 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1627 tcmsg->tcm_parent = 0;
1628 nl_dump_start(dump, rtnl_sock, &request);
1629 ofpbuf_uninit(&request);
1633 netdev_linux_dump_queues(const struct netdev *netdev,
1634 netdev_dump_queues_cb *cb, void *aux)
1636 struct netdev_dev_linux *netdev_dev =
1637 netdev_dev_linux_cast(netdev_get_dev(netdev));
1638 unsigned int queue_id;
1639 struct shash details;
1644 error = tc_query_qdisc(netdev);
1647 } else if (!netdev_dev->tc->ops->class_get) {
1652 shash_init(&details);
1653 PORT_ARRAY_FOR_EACH (queue, &netdev_dev->tc->queues, queue_id) {
1654 shash_clear(&details);
1656 error = netdev_dev->tc->ops->class_get(netdev, queue_id, &details);
1658 (*cb)(queue_id, &details, aux);
1663 shash_destroy(&details);
1669 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1670 netdev_dump_queue_stats_cb *cb, void *aux)
1672 struct netdev_dev_linux *netdev_dev =
1673 netdev_dev_linux_cast(netdev_get_dev(netdev));
1674 struct nl_dump dump;
1679 error = tc_query_qdisc(netdev);
1682 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1687 start_queue_dump(netdev, &dump);
1688 while (nl_dump_next(&dump, &msg)) {
1689 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1695 error = nl_dump_done(&dump);
1696 return error ? error : last_error;
1700 netdev_linux_get_in4(const struct netdev *netdev_,
1701 struct in_addr *address, struct in_addr *netmask)
1703 struct netdev_dev_linux *netdev_dev =
1704 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1706 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1709 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1710 SIOCGIFADDR, "SIOCGIFADDR");
1715 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1716 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1721 netdev_dev->cache_valid |= VALID_IN4;
1723 *address = netdev_dev->address;
1724 *netmask = netdev_dev->netmask;
1725 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1729 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1730 struct in_addr netmask)
1732 struct netdev_dev_linux *netdev_dev =
1733 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1736 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1738 netdev_dev->cache_valid |= VALID_IN4;
1739 netdev_dev->address = address;
1740 netdev_dev->netmask = netmask;
1741 if (address.s_addr != INADDR_ANY) {
1742 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1743 "SIOCSIFNETMASK", netmask);
1750 parse_if_inet6_line(const char *line,
1751 struct in6_addr *in6, char ifname[16 + 1])
1753 uint8_t *s6 = in6->s6_addr;
1754 #define X8 "%2"SCNx8
1756 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1757 "%*x %*x %*x %*x %16s\n",
1758 &s6[0], &s6[1], &s6[2], &s6[3],
1759 &s6[4], &s6[5], &s6[6], &s6[7],
1760 &s6[8], &s6[9], &s6[10], &s6[11],
1761 &s6[12], &s6[13], &s6[14], &s6[15],
1765 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1766 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1768 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1770 struct netdev_dev_linux *netdev_dev =
1771 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1772 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1776 netdev_dev->in6 = in6addr_any;
1778 file = fopen("/proc/net/if_inet6", "r");
1780 const char *name = netdev_get_name(netdev_);
1781 while (fgets(line, sizeof line, file)) {
1782 struct in6_addr in6;
1783 char ifname[16 + 1];
1784 if (parse_if_inet6_line(line, &in6, ifname)
1785 && !strcmp(name, ifname))
1787 netdev_dev->in6 = in6;
1793 netdev_dev->cache_valid |= VALID_IN6;
1795 *in6 = netdev_dev->in6;
1800 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
1802 struct sockaddr_in sin;
1803 memset(&sin, 0, sizeof sin);
1804 sin.sin_family = AF_INET;
1805 sin.sin_addr = addr;
1808 memset(sa, 0, sizeof *sa);
1809 memcpy(sa, &sin, sizeof sin);
1813 do_set_addr(struct netdev *netdev,
1814 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1817 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1818 make_in4_sockaddr(&ifr.ifr_addr, addr);
1820 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1824 /* Adds 'router' as a default IP gateway. */
1826 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1828 struct in_addr any = { INADDR_ANY };
1832 memset(&rt, 0, sizeof rt);
1833 make_in4_sockaddr(&rt.rt_dst, any);
1834 make_in4_sockaddr(&rt.rt_gateway, router);
1835 make_in4_sockaddr(&rt.rt_genmask, any);
1836 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1837 COVERAGE_INC(netdev_add_router);
1838 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1840 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
1846 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1849 static const char fn[] = "/proc/net/route";
1854 *netdev_name = NULL;
1855 stream = fopen(fn, "r");
1856 if (stream == NULL) {
1857 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1862 while (fgets(line, sizeof line, stream)) {
1865 uint32_t dest, gateway, mask;
1866 int refcnt, metric, mtu;
1867 unsigned int flags, use, window, irtt;
1870 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1872 iface, &dest, &gateway, &flags, &refcnt,
1873 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1875 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
1879 if (!(flags & RTF_UP)) {
1880 /* Skip routes that aren't up. */
            /* The values of 'dest', 'mask', and 'gateway' were given in
             * network byte order, so we don't need any endian
             * conversions here. */
1887 if ((dest & mask) == (host->s_addr & mask)) {
1889 /* The host is directly reachable. */
1890 next_hop->s_addr = 0;
1892 /* To reach the host, we must go through a gateway. */
1893 next_hop->s_addr = gateway;
1895 *netdev_name = xstrdup(iface);
1906 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
1907 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
1908 * returns 0. Otherwise, it returns a positive errno value; in particular,
 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
1911 netdev_linux_arp_lookup(const struct netdev *netdev,
1912 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
1915 struct sockaddr_in sin;
1918 memset(&r, 0, sizeof r);
1919 sin.sin_family = AF_INET;
1920 sin.sin_addr.s_addr = ip;
1922 memcpy(&r.arp_pa, &sin, sizeof sin);
1923 r.arp_ha.sa_family = ARPHRD_ETHER;
1925 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
1926 COVERAGE_INC(netdev_arp_lookup);
1927 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
1929 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
1930 } else if (retval != ENXIO) {
1931 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
1932 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
1938 nd_to_iff_flags(enum netdev_flags nd)
1941 if (nd & NETDEV_UP) {
1944 if (nd & NETDEV_PROMISC) {
1951 iff_to_nd_flags(int iff)
1953 enum netdev_flags nd = 0;
1957 if (iff & IFF_PROMISC) {
1958 nd |= NETDEV_PROMISC;
1964 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
1965 enum netdev_flags on, enum netdev_flags *old_flagsp)
1967 int old_flags, new_flags;
1970 error = get_flags(netdev, &old_flags);
1972 *old_flagsp = iff_to_nd_flags(old_flags);
1973 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
1974 if (new_flags != old_flags) {
1975 error = set_flags(netdev, new_flags);
1982 poll_notify(struct list *list)
1984 struct netdev_linux_notifier *notifier;
1985 LIST_FOR_EACH (notifier, struct netdev_linux_notifier, node, list) {
1986 struct netdev_notifier *n = ¬ifier->notifier;
1992 netdev_linux_poll_cb(const struct rtnetlink_change *change,
1993 void *aux OVS_UNUSED)
1996 struct list *list = shash_find_data(&netdev_linux_notifiers,
2002 struct shash_node *node;
2003 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2004 poll_notify(node->data);
2010 netdev_linux_poll_add(struct netdev *netdev,
2011 void (*cb)(struct netdev_notifier *), void *aux,
2012 struct netdev_notifier **notifierp)
2014 const char *netdev_name = netdev_get_name(netdev);
2015 struct netdev_linux_notifier *notifier;
2018 if (shash_is_empty(&netdev_linux_notifiers)) {
2019 int error = rtnetlink_notifier_register(&netdev_linux_poll_notifier,
2020 netdev_linux_poll_cb, NULL);
2026 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2028 list = xmalloc(sizeof *list);
2030 shash_add(&netdev_linux_notifiers, netdev_name, list);
2033 notifier = xmalloc(sizeof *notifier);
2034 netdev_notifier_init(¬ifier->notifier, netdev, cb, aux);
2035 list_push_back(list, ¬ifier->node);
2036 *notifierp = ¬ifier->notifier;
2041 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2043 struct netdev_linux_notifier *notifier =
2044 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2047 /* Remove 'notifier' from its list. */
2048 list = list_remove(¬ifier->node);
2049 if (list_is_empty(list)) {
2050 /* The list is now empty. Remove it from the hash and free it. */
2051 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2052 shash_delete(&netdev_linux_notifiers,
2053 shash_find(&netdev_linux_notifiers, netdev_name));
2058 /* If that was the last notifier, unregister. */
2059 if (shash_is_empty(&netdev_linux_notifiers)) {
2060 rtnetlink_notifier_unregister(&netdev_linux_poll_notifier);
2064 const struct netdev_class netdev_linux_class = {
2071 netdev_linux_create_system,
2072 netdev_linux_destroy,
2073 NULL, /* reconfigure */
2078 netdev_linux_enumerate,
2081 netdev_linux_recv_wait,
2085 netdev_linux_send_wait,
2087 netdev_linux_set_etheraddr,
2088 netdev_linux_get_etheraddr,
2089 netdev_linux_get_mtu,
2090 netdev_linux_get_ifindex,
2091 netdev_linux_get_carrier,
2092 netdev_linux_get_stats,
2093 netdev_vport_set_stats,
2095 netdev_linux_get_features,
2096 netdev_linux_set_advertisements,
2097 netdev_linux_get_vlan_vid,
2099 netdev_linux_set_policing,
2100 netdev_linux_get_qos_types,
2101 netdev_linux_get_qos_capabilities,
2102 netdev_linux_get_qos,
2103 netdev_linux_set_qos,
2104 netdev_linux_get_queue,
2105 netdev_linux_set_queue,
2106 netdev_linux_delete_queue,
2107 netdev_linux_get_queue_stats,
2108 netdev_linux_dump_queues,
2109 netdev_linux_dump_queue_stats,
2111 netdev_linux_get_in4,
2112 netdev_linux_set_in4,
2113 netdev_linux_get_in6,
2114 netdev_linux_add_router,
2115 netdev_linux_get_next_hop,
2116 netdev_linux_arp_lookup,
2118 netdev_linux_update_flags,
2120 netdev_linux_poll_add,
2121 netdev_linux_poll_remove,
2124 const struct netdev_class netdev_tap_class = {
2131 netdev_linux_create_tap,
2132 netdev_linux_destroy,
2133 NULL, /* reconfigure */
2138 NULL, /* enumerate */
2141 netdev_linux_recv_wait,
2145 netdev_linux_send_wait,
2147 netdev_linux_set_etheraddr,
2148 netdev_linux_get_etheraddr,
2149 netdev_linux_get_mtu,
2150 netdev_linux_get_ifindex,
2151 netdev_linux_get_carrier,
2152 netdev_linux_get_stats,
2153 NULL, /* set_stats */
2155 netdev_linux_get_features,
2156 netdev_linux_set_advertisements,
2157 netdev_linux_get_vlan_vid,
2159 netdev_linux_set_policing,
2160 netdev_linux_get_qos_types,
2161 netdev_linux_get_qos_capabilities,
2162 netdev_linux_get_qos,
2163 netdev_linux_set_qos,
2164 netdev_linux_get_queue,
2165 netdev_linux_set_queue,
2166 netdev_linux_delete_queue,
2167 netdev_linux_get_queue_stats,
2168 netdev_linux_dump_queues,
2169 netdev_linux_dump_queue_stats,
2171 netdev_linux_get_in4,
2172 netdev_linux_set_in4,
2173 netdev_linux_get_in6,
2174 netdev_linux_add_router,
2175 netdev_linux_get_next_hop,
2176 netdev_linux_arp_lookup,
2178 netdev_linux_update_flags,
2180 netdev_linux_poll_add,
2181 netdev_linux_poll_remove,
2184 /* HTB traffic control class. */
2186 #define HTB_N_QUEUES 0xf000
2190 unsigned int max_rate; /* In bytes/s. */
2194 unsigned int min_rate; /* In bytes/s. */
2195 unsigned int max_rate; /* In bytes/s. */
2196 unsigned int burst; /* In bytes. */
2197 unsigned int priority; /* Lower values are higher priorities. */
2201 htb_get__(const struct netdev *netdev)
2203 struct netdev_dev_linux *netdev_dev =
2204 netdev_dev_linux_cast(netdev_get_dev(netdev));
2205 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2209 htb_install__(struct netdev *netdev, uint64_t max_rate)
2211 struct netdev_dev_linux *netdev_dev =
2212 netdev_dev_linux_cast(netdev_get_dev(netdev));
2215 htb = xmalloc(sizeof *htb);
2216 tc_init(&htb->tc, &tc_ops_htb);
2217 htb->max_rate = max_rate;
2219 netdev_dev->tc = &htb->tc;
2224 /* Create an HTB qdisc.
2226 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default
2229 htb_setup_qdisc__(struct netdev *netdev)
2232 struct tc_htb_glob opt;
2233 struct ofpbuf request;
2234 struct tcmsg *tcmsg;
2236 tc_del_qdisc(netdev);
2238 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2239 NLM_F_EXCL | NLM_F_CREATE, &request);
2240 tcmsg->tcm_handle = tc_make_handle(1, 0);
2241 tcmsg->tcm_parent = TC_H_ROOT;
2243 nl_msg_put_string(&request, TCA_KIND, "htb");
2245 memset(&opt, 0, sizeof opt);
2246 opt.rate2quantum = 10;
2250 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2251 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2252 nl_msg_end_nested(&request, opt_offset);
2254 return tc_transact(&request, NULL);
2257 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2258 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2260 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2261 unsigned int parent, struct htb_class *class)
2264 struct tc_htb_opt opt;
2265 struct ofpbuf request;
2266 struct tcmsg *tcmsg;
2270 netdev_get_mtu(netdev, &mtu);
2272 memset(&opt, 0, sizeof opt);
2273 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2274 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2275 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2276 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2277 opt.prio = class->priority;
2279 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2280 tcmsg->tcm_handle = handle;
2281 tcmsg->tcm_parent = parent;
2283 nl_msg_put_string(&request, TCA_KIND, "htb");
2284 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2285 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2286 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2287 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2288 nl_msg_end_nested(&request, opt_offset);
2290 error = tc_transact(&request, NULL);
2292 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2293 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2294 netdev_get_name(netdev),
2295 tc_get_major(handle), tc_get_minor(handle),
2296 tc_get_major(parent), tc_get_minor(parent),
2297 class->min_rate, class->max_rate,
2298 class->burst, class->priority, strerror(error));
2303 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2304 * description of them into 'details'. The description complies with the
2305 * specification given in the vswitch database documentation for linux-htb
2308 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2310 static const struct nl_policy tca_htb_policy[] = {
2311 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2312 .min_len = sizeof(struct tc_htb_opt) },
2315 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2316 const struct tc_htb_opt *htb;
2318 if (!nl_parse_nested(nl_options, tca_htb_policy,
2319 attrs, ARRAY_SIZE(tca_htb_policy))) {
2320 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2324 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2325 class->min_rate = htb->rate.rate;
2326 class->max_rate = htb->ceil.rate;
2327 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2328 class->priority = htb->prio;
2333 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2334 struct htb_class *options,
2335 struct netdev_queue_stats *stats)
2337 struct nlattr *nl_options;
2338 unsigned int handle;
2341 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2342 if (!error && queue_id) {
2343 unsigned int major = tc_get_major(handle);
2344 unsigned int minor = tc_get_minor(handle);
2345 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2346 *queue_id = minor - 1;
2351 if (!error && options) {
2352 error = htb_parse_tca_options__(nl_options, options);
2358 htb_parse_qdisc_details__(struct netdev *netdev,
2359 const struct shash *details, struct htb_class *hc)
2361 const char *max_rate_s;
2363 max_rate_s = shash_find_data(details, "max-rate");
2364 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2365 if (!hc->max_rate) {
2368 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2369 hc->max_rate = netdev_features_to_bps(current) / 8;
2371 hc->min_rate = hc->max_rate;
2377 htb_parse_class_details__(struct netdev *netdev,
2378 const struct shash *details, struct htb_class *hc)
2380 const struct htb *htb = htb_get__(netdev);
2381 const char *min_rate_s = shash_find_data(details, "min-rate");
2382 const char *max_rate_s = shash_find_data(details, "max-rate");
2383 const char *burst_s = shash_find_data(details, "burst");
2384 const char *priority_s = shash_find_data(details, "priority");
2389 /* min-rate is required. */
2392 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2393 hc->min_rate = MAX(hc->min_rate, 0);
2394 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2397 hc->max_rate = (max_rate_s
2398 ? strtoull(max_rate_s, NULL, 10) / 8
2400 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2401 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2405 * According to hints in the documentation that I've read, it is important
2406 * that 'burst' be at least as big as the largest frame that might be
2407 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2408 * but having it a bit too small is a problem. Since netdev_get_mtu()
2409 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2410 * the MTU. We actually add 64, instead of 14, as a guard against
     * additional headers getting tacked on somewhere that we're not aware of. */
2412 netdev_get_mtu(netdev, &mtu);
2413 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2414 hc->burst = MAX(hc->burst, mtu + 64);
2417 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2423 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2424 unsigned int parent, struct htb_class *options,
2425 struct netdev_queue_stats *stats)
2427 struct ofpbuf *reply;
2430 error = tc_query_class(netdev, handle, parent, &reply);
2432 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2433 ofpbuf_delete(reply);
2439 htb_tc_install(struct netdev *netdev, const struct shash *details)
2443 error = htb_setup_qdisc__(netdev);
2445 struct htb_class hc;
2447 htb_parse_qdisc_details__(netdev, details, &hc);
2448 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2449 tc_make_handle(1, 0), &hc);
2451 htb_install__(netdev, hc.max_rate);
2458 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2459 const struct htb_class *hc)
2461 struct htb *htb = htb_get__(netdev);
2462 struct htb_class *hcp;
2464 hcp = port_array_get(&htb->tc.queues, queue_id);
2466 hcp = xmalloc(sizeof *hcp);
2467 port_array_set(&htb->tc.queues, queue_id, hcp);
2473 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2475 struct shash details = SHASH_INITIALIZER(&details);
2477 struct nl_dump dump;
2478 struct htb_class hc;
2481 /* Get qdisc options. */
2483 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2484 htb = htb_install__(netdev, hc.max_rate);
2487 start_queue_dump(netdev, &dump);
2488 shash_init(&details);
2489 while (nl_dump_next(&dump, &msg)) {
2490 unsigned int queue_id;
2492 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2493 htb_update_queue__(netdev, queue_id, &hc);
2496 nl_dump_done(&dump);
2502 htb_tc_destroy(struct tc *tc)
2504 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2505 unsigned int queue_id;
2506 struct htb_class *hc;
2508 PORT_ARRAY_FOR_EACH (hc, &htb->tc.queues, queue_id) {
2516 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2518 const struct htb *htb = htb_get__(netdev);
2519 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2524 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2526 struct htb_class hc;
2529 htb_parse_qdisc_details__(netdev, details, &hc);
2530 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2531 tc_make_handle(1, 0), &hc);
2533 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the configuration of queue 'queue_id' on 'netdev' in 'details'.
 *
 * Adds "min-rate", then "max-rate" only if it differs from the minimum rate,
 * then "burst" and "priority".  Rates and burst are reported as the stored
 * value multiplied by 8 (presumably bytes -> bits; confirm the stored unit).
 *
 * NOTE(review): the handling of a queue that does not exist, and any
 * condition guarding "burst", are on lines not present in this extract. */
2539 htb_class_get(const struct netdev *netdev, unsigned int queue_id,
2540 struct shash *details)
2542 const struct htb *htb = htb_get__(netdev);
2543 const struct htb_class *hc;
2545 hc = port_array_get(&htb->tc.queues, queue_id);
2548 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2549 if (hc->min_rate != hc->max_rate) {
2550 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2552 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2554 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Creates or reconfigures queue 'queue_id' on 'netdev' from 'details'.
 *
 * The queue is represented by kernel class 1:('queue_id' + 1), a child of
 * class 1:0xfffe.  After parsing 'details' and setting up the class, the
 * parsed parameters are recorded locally with htb_update_queue__().
 *
 * NOTE(review): the error checks between these steps are on lines not present
 * in this extract. */
2560 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2561 const struct shash *details)
2563 struct htb_class hc;
2566 error = htb_parse_class_details__(netdev, details, &hc);
2571 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2572 tc_make_handle(1, 0xfffe), &hc);
2577 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes queue 'queue_id' from 'netdev': looks up its local record, asks the
 * kernel to delete class 1:('queue_id' + 1), and removes the entry from the
 * local queue array.
 *
 * NOTE(review): the checks on the lookup result and on 'error', and the
 * release of the record itself, are on lines not present in this extract. */
2582 htb_class_delete(struct netdev *netdev, unsigned int queue_id)
2584 struct htb *htb = htb_get__(netdev);
2585 struct htb_class *hc;
2588 hc = port_array_get(&htb->tc.queues, queue_id);
2591 error = tc_delete_class(netdev, tc_make_handle(1, queue_id + 1));
2594 port_array_delete(&htb->tc.queues, queue_id);
/* Retrieves statistics for queue 'queue_id' on 'netdev' into 'stats' by
 * querying kernel class 1:('queue_id' + 1) with parent 1:0xfffe.  Only the
 * statistics are requested (the class-options output argument is NULL).
 * Returns whatever htb_query_class__() returns. */
2600 htb_class_get_stats(const struct netdev *netdev, unsigned int queue_id,
2601 struct netdev_queue_stats *stats)
2603 return htb_query_class__(netdev, tc_make_handle(1, queue_id + 1),
2604 tc_make_handle(1, 0xfffe), NULL, stats);
2608 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2609 const struct ofpbuf *nlmsg,
2610 netdev_dump_queue_stats_cb *cb, void *aux)
2612 struct netdev_queue_stats stats;
2613 unsigned int handle, major, minor;
2616 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2621 major = tc_get_major(handle);
2622 minor = tc_get_minor(handle);
2623 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2624 (*cb)(tc_get_minor(handle), &stats, aux);
/* Operations table for the HTB traffic-control class: Linux qdisc name
 * "htb", exposed under the name "linux-htb", supporting HTB_N_QUEUES queues.
 *
 * NOTE(review): the initializers between n_queues and the two statistics
 * callbacks are on lines not present in this extract. */
2629 static const struct tc_ops tc_ops_htb = {
2630 "htb", /* linux_name */
2631 "linux-htb", /* ovs_name */
2632 HTB_N_QUEUES, /* n_queues */
2641 htb_class_get_stats,
2642 htb_class_dump_stats
2645 /* "linux-default" traffic control class.
2647 * This class represents the default, unnamed Linux qdisc. It corresponds to
2648 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev''s device at the struct tc for the default class
 * (tc_ops_default).
 *
 * 'tc' has static storage duration, so the same struct tc object is handed to
 * every device that gets the default class.  NOTE(review): the condition
 * guarding the allocation (presumably "only if not yet allocated") is on a
 * line not present in this extract. */
2651 default_install__(struct netdev *netdev)
2653 struct netdev_dev_linux *netdev_dev =
2654 netdev_dev_linux_cast(netdev_get_dev(netdev));
2655 static struct tc *tc;
2658 tc = xmalloc(sizeof *tc);
2659 tc_init(tc, &tc_ops_default);
2661 netdev_dev->tc = tc;
/* tc_install hook for the default class: ignores 'details' and simply
 * attaches the default struct tc to 'netdev' via default_install__(). */
2665 default_tc_install(struct netdev *netdev,
2666 const struct shash *details OVS_UNUSED)
2668 default_install__(netdev);
/* tc_load hook for the default class: ignores 'nlmsg' and simply attaches the
 * default struct tc to 'netdev' via default_install__(). */
2673 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2675 default_install__(netdev);
/* Operations table for the default class.  It has no Linux qdisc name and
 * leaves tc_destroy and every qdisc/class operation unimplemented (NULL).
 *
 * NOTE(review): the initializers between linux_name and tc_destroy are on
 * lines not present in this extract. */
2679 static const struct tc_ops tc_ops_default = {
2680 NULL, /* linux_name */
2685 NULL, /* tc_destroy */
2686 NULL, /* qdisc_get */
2687 NULL, /* qdisc_set */
2688 NULL, /* class_get */
2689 NULL, /* class_set */
2690 NULL, /* class_delete */
2691 NULL, /* class_get_stats */
2692 NULL /* class_dump_stats */
2695 /* "linux-other" traffic control class.
/* tc_load hook for the "linux-other" class: ignores 'nlmsg' and points
 * 'netdev''s device at a struct tc initialized with tc_ops_other.
 *
 * 'tc' has static storage duration, so the same object is shared by every
 * device that uses this class.  NOTE(review): the condition guarding the
 * allocation is on a line not present in this extract. */
2700 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2702 struct netdev_dev_linux *netdev_dev =
2703 netdev_dev_linux_cast(netdev_get_dev(netdev));
2704 static struct tc *tc;
2707 tc = xmalloc(sizeof *tc);
2708 tc_init(tc, &tc_ops_other);
2710 netdev_dev->tc = tc;
/* Operations table for the "linux-other" class.  It has no Linux qdisc name,
 * cannot be installed (tc_install is NULL), and leaves tc_destroy and every
 * qdisc/class operation unimplemented (NULL).
 *
 * NOTE(review): the n_queues and tc_load initializers are on lines not
 * present in this extract. */
2714 static const struct tc_ops tc_ops_other = {
2715 NULL, /* linux_name */
2716 "linux-other", /* ovs_name */
2718 NULL, /* tc_install */
2720 NULL, /* tc_destroy */
2721 NULL, /* qdisc_get */
2722 NULL, /* qdisc_set */
2723 NULL, /* class_get */
2724 NULL, /* class_set */
2725 NULL, /* class_delete */
2726 NULL, /* class_get_stats */
2727 NULL /* class_dump_stats */
2730 /* Traffic control. */
2732 /* Number of kernel "tc" ticks per second.  Derived from the first three
 * fields of /proc/net/psched as a * c / b; used to convert between ticks and
 * bytes at a given rate. */
2733 static double ticks_per_s;
/* Divisor used by tc_buffer_per_jiffy() (rate / buffer_hz); the comment that
 * follows explains how to interpret its magnitude. */
2735 /* Number of kernel "jiffies" per second. This is used for the purpose of
2736 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
2737 * one jiffy's worth of data.
2739 * There are two possibilities here:
2741 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
2742 * approximate range of 100 to 1024. That means that we really need to
2743 * make sure that the qdisc can buffer that much data.
2745 * - 'buffer_hz' is an absurdly large number. That means that the kernel
2746 * has finely granular timers and there's no need to fudge additional room
2747 * for buffers. (There's no extra effort needed to implement that: the
2748 * large 'buffer_hz' is used as a divisor, so practically any number will
2749 * come out as 0 in the division. Small integer results in the case of
2750 * really high dividends won't have any real effect anyhow.)
2752 static unsigned int buffer_hz;
2754 /* Returns tc handle 'major':'minor'.  'major' is placed in the upper 16
 * bits (it is shifted left by 16 before being combined with 'minor' by
 * TC_H_MAKE). */
2756 tc_make_handle(unsigned int major, unsigned int minor)
2758 return TC_H_MAKE(major << 16, minor);
2761 /* Returns the major number from 'handle', i.e. the TC_H_MAJ part shifted
 * down by 16 bits.  Inverse of the 'major' argument of tc_make_handle(). */
2763 tc_get_major(unsigned int handle)
2765 return TC_H_MAJ(handle) >> 16;
2768 /* Returns the minor number from 'handle', i.e. its TC_H_MIN part.  Inverse
 * of the 'minor' argument of tc_make_handle(). */
2770 tc_get_minor(unsigned int handle)
2772 return TC_H_MIN(handle);
/* Starts composing a traffic-control Netlink request for 'netdev'.
 *
 * Initializes 'request' (512 bytes of initial space) with a Netlink header of
 * the given 'type' and flags NLM_F_REQUEST | 'flags', followed by a zeroed
 * struct tcmsg whose family is AF_UNSPEC and whose ifindex is that of
 * 'netdev'.  The caller must fill in tcm_handle and tcm_parent.
 *
 * The ifindex lookup happens before 'request' is initialized.
 * NOTE(review): what is returned when get_ifindex() fails is decided on lines
 * not present in this extract (presumably NULL, since the return type is a
 * pointer) -- callers should be checked against that. */
2775 static struct tcmsg *
2776 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
2777 struct ofpbuf *request)
2779 struct tcmsg *tcmsg;
2783 error = get_ifindex(netdev, &ifindex);
2788 ofpbuf_init(request, 512);
2789 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
2790 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
2791 tcmsg->tcm_family = AF_UNSPEC;
2792 tcmsg->tcm_ifindex = ifindex;
2793 /* Caller should fill in tcmsg->tcm_handle. */
2794 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on rtnl_sock with nl_sock_transact(), passing 'replyp'
 * straight through, and then uninitializes 'request' whether or not the
 * transaction succeeded -- so the caller must not reuse or free 'request'
 * afterward. */
2800 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
2802 int error = nl_sock_transact(rtnl_sock, request, replyp);
2803 ofpbuf_uninit(request);
/* Body of the /proc/net/psched reader (its header line is not present in this
 * extract).  The file is expected to hold four hexadecimal fields, read as
 * a, b, c and d; 'ticks_per_s' is computed as a * c / b.  Failure to open or
 * read the file, and invalid or unexpected parameters, are logged as
 * warnings. */
2810 /* The values in psched are not individually very meaningful, but they are
2811 * important. The tables below show some values seen in the wild.
2815 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
2816 * (Before that, there are hints that it was 1000000000.)
2818 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
2822 * -----------------------------------
2823 * [1] 000c8000 000f4240 000f4240 00000064
2824 * [2] 000003e8 00000400 000f4240 3b9aca00
2825 * [3] 000003e8 00000400 000f4240 3b9aca00
2826 * [4] 000003e8 00000400 000f4240 00000064
2827 * [5] 000003e8 00000040 000f4240 3b9aca00
2828 * [6] 000003e8 00000040 000f4240 000000f9
2830 * a b c d ticks_per_s buffer_hz
2831 * ------- --------- ---------- ------------- ----------- -------------
2832 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
2833 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
2834 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
2835 * [4] 1,000 1,024 1,000,000 100 976,562 100
2836 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
2837 * [6] 1,000 64 1,000,000 249 15,625,000 249
2839 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
2840 * [2] 2.6.26-1-686-bigmem from Debian lenny
2841 * [3] 2.6.26-2-sparc64 from Debian lenny
2842 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
2843 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
2844 * [6] 2.6.34 from kernel.org on KVM
2846 static const char fn[] = "/proc/net/psched";
2847 unsigned int a, b, c, d;
2853 stream = fopen(fn, "r");
2855 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
2859 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
2860 VLOG_WARN("%s: read failed", fn);
2864 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
2868 VLOG_WARN("%s: invalid scheduler parameters", fn);
2872 ticks_per_s = (double) a * c / b;
2876 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
2879 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
2882 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
2883 * rate of 'rate' bytes per second. */
2885 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
2890 return (rate * ticks) / ticks_per_s;
2893 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
2894 * rate of 'rate' bytes per second. */
2896 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
2901 return ((unsigned long long int) ticks_per_s * size) / rate;
2904 /* Returns the number of bytes that need to be reserved for qdisc buffering at
2905 * a transmission rate of 'rate' bytes per second.
 *
 * Computed as 'rate' / 'buffer_hz' with integer division, so a very large
 * 'buffer_hz' makes the result 0 (see the comment on 'buffer_hz'). */
2907 tc_buffer_per_jiffy(unsigned int rate)
2912 return rate / buffer_hz;
2915 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
2916 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
2917 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
2918 * stores NULL into it if it is absent.
2920 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
 * Per the parse policy, TCA_KIND (a string) is mandatory and TCA_OPTIONS (a
 * nested attribute) is optional.  Attributes are looked for after the Netlink
 * header and the struct tcmsg that follows it.  A parse failure is logged
 * (rate-limited).
 *
2923 * Returns 0 if successful, otherwise a positive errno value. */
2925 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
2926 struct nlattr **options)
2928 static const struct nl_policy tca_policy[] = {
2929 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
2930 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
2932 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
2934 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
2935 tca_policy, ta, ARRAY_SIZE(ta))) {
2936 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
2941 *kind = nl_attr_get_string(ta[TCA_KIND]);
2945 *options = ta[TCA_OPTIONS];
2960 /* Given Netlink 'msg' that describes a class, extracts its handle (the full
2961 * class ID taken from tcm_handle, not just the minor number) into '*handlep',
2962 * its TCA_OPTIONS attribute into '*options', and its queue statistics into
2963 * '*stats'. Any of the output arguments may be null.
 *
 * Both TCA_OPTIONS and TCA_STATS2 are mandatory per the parse policy.  The
 * statistics are taken from the nested TCA_STATS2 attribute: tx_bytes and
 * tx_packets come from TCA_STATS_BASIC, and tx_errors is filled from the
 * "drops" counter of TCA_STATS_QUEUE.  Parse failures are logged
 * (rate-limited).
 *
2965 * Returns 0 if successful, otherwise a positive errno value. */
2967 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
2968 struct nlattr **options, struct netdev_queue_stats *stats)
2970 static const struct nl_policy tca_policy[] = {
2971 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
2972 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
2974 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
2976 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
2977 tca_policy, ta, ARRAY_SIZE(ta))) {
2978 VLOG_WARN_RL(&rl, "failed to parse class message");
2983 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
2984 *handlep = tc->tcm_handle;
2988 *options = ta[TCA_OPTIONS];
2992 const struct gnet_stats_queue *gsq;
2993 struct gnet_stats_basic gsb;
2995 static const struct nl_policy stats_policy[] = {
2996 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
2997 .min_len = sizeof gsb },
2998 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
2999 .min_len = sizeof *gsq },
3001 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3003 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3004 sa, ARRAY_SIZE(sa))) {
3005 VLOG_WARN_RL(&rl, "failed to parse class stats");
3009 /* Alignment issues screw up the length of struct gnet_stats_basic on
3010 * some arch/bitsize combinations. Newer versions of Linux have a
3011 * struct gnet_stats_basic_packed, but we can't depend on that. The
3012 * easiest thing to do is just to make a copy. */
3013 memset(&gsb, 0, sizeof gsb);
3014 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3015 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3016 stats->tx_bytes = gsb.bytes;
3017 stats->tx_packets = gsb.packets;
3019 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3020 stats->tx_errors = gsq->drops;
3030 memset(stats, 0, sizeof *stats);
3035 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3038 tc_query_class(const struct netdev *netdev,
3039 unsigned int handle, unsigned int parent,
3040 struct ofpbuf **replyp)
3042 struct ofpbuf request;
3043 struct tcmsg *tcmsg;
3046 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3047 tcmsg->tcm_handle = handle;
3048 tcmsg->tcm_parent = parent;
3050 error = tc_transact(&request, replyp);
3052 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3053 netdev_get_name(netdev),
3054 tc_get_major(handle), tc_get_minor(handle),
3055 tc_get_major(parent), tc_get_minor(parent),
3061 /* Equivalent to "tc class del dev <name> handle <handle>". */
3063 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3065 struct ofpbuf request;
3066 struct tcmsg *tcmsg;
3069 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3070 tcmsg->tcm_handle = handle;
3071 tcmsg->tcm_parent = 0;
3073 error = tc_transact(&request, NULL);
3075 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3076 netdev_get_name(netdev),
3077 tc_get_major(handle), tc_get_minor(handle),
3083 /* Equivalent to "tc qdisc del dev <name> root". */
3085 tc_del_qdisc(struct netdev *netdev)
3087 struct netdev_dev_linux *netdev_dev =
3088 netdev_dev_linux_cast(netdev_get_dev(netdev));
3089 struct ofpbuf request;
3090 struct tcmsg *tcmsg;
3093 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3094 tcmsg->tcm_handle = tc_make_handle(1, 0);
3095 tcmsg->tcm_parent = TC_H_ROOT;
3097 error = tc_transact(&request, NULL);
3098 if (error == EINVAL) {
3099 /* EINVAL probably means that the default qdisc was in use, in which
3100 * case we've accomplished our purpose. */
3103 if (!error && netdev_dev->tc) {
3104 if (netdev_dev->tc->ops->tc_destroy) {
3105 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3107 netdev_dev->tc = NULL;
3112 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3113 * kernel to determine what they are. Returns 0 if successful, otherwise a
3114 * positive errno value. */
3116 tc_query_qdisc(const struct netdev *netdev)
3118 struct netdev_dev_linux *netdev_dev =
3119 netdev_dev_linux_cast(netdev_get_dev(netdev));
3120 struct ofpbuf request, *qdisc;
3121 const struct tc_ops *ops;
3122 struct tcmsg *tcmsg;
3126 if (netdev_dev->tc) {
3130 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3131 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3132 * 2.6.35 without that fix backported to it.
3134 * To avoid the OOPS, we must not make a request that would attempt to dump
3135 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3136 * few others. There are a few ways that I can see to do this, but most of
3137 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3138 * technique chosen here is to assume that any non-default qdisc that we
3139 * create will have a class with handle 1:0. The built-in qdiscs only have
3140 * a class with handle 0:0.
3142 * We could check for Linux 2.6.35+ and use a more straightforward method
3144 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3145 tcmsg->tcm_handle = tc_make_handle(1, 0);
3146 tcmsg->tcm_parent = 0;
3148 /* Figure out what tc class to instantiate. */
3149 error = tc_transact(&request, &qdisc);
3153 error = tc_parse_qdisc(qdisc, &kind, NULL);
3155 ops = &tc_ops_other;
3157 ops = tc_lookup_linux_name(kind);
3159 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3160 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3162 ops = &tc_ops_other;
3165 } else if (error == ENOENT) {
3166 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3167 * other entity that doesn't have a handle 1:0. We will assume
3168 * that it's the system default qdisc. */
3169 ops = &tc_ops_default;
3172 /* Who knows? Maybe the device got deleted. */
3173 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3174 netdev_get_name(netdev), strerror(error));
3175 ops = &tc_ops_other;
3178 /* Instantiate it. */
3179 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3180 assert((load_error == 0) == (netdev_dev->tc != NULL));
3181 ofpbuf_delete(qdisc);
3183 return error ? error : load_error;
3186 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3187 approximate the time to transmit packets of various lengths. For an MTU of
3188 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3189 represents two possible packet lengths; for a MTU of 513 through 1024, four
3190 possible lengths; and so on.
3192 Returns, for the specified 'mtu', the number of bits that packet lengths
3193 need to be shifted right to fit within such a 256-entry table. */
/* The Ethernet and VLAN header lengths are added to 'mtu' before the shift
 * count is computed; ETH_PAYLOAD_MAX is substituted for 'mtu' on a path whose
 * condition is on a line not present in this extract.  The loop counts
 * iterations for as long as the working value is at least 256 (the statement
 * that shrinks it each round is likewise not shown). */
3195 tc_calc_cell_log(unsigned int mtu)
3200 mtu = ETH_PAYLOAD_MAX;
3202 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3204 for (cell_log = 0; mtu >= 256; cell_log++) {
/* Fills in '*rate' for use in a tc request: the structure is zeroed, cell_log
 * is derived from 'mtu' by tc_calc_cell_log(), and the minimum packet unit
 * (mpu) is set to ETH_TOTAL_MIN.  The overhead and cell_align members are
 * deliberately not assigned by name because some distribution headers lack
 * them; the memset already leaves them zero.
 *
 * NOTE(review): the line that stores 'Bps' is not present in this extract. */
3211 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3214 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3216 memset(rate, 0, sizeof *rate);
3217 rate->cell_log = tc_calc_cell_log(mtu);
3218 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3219 /* rate->cell_align = 0; */ /* distro headers. */
3220 rate->mpu = ETH_TOTAL_MIN;
3224 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3225 * attribute of the specified "type".
 *
 * The attribute payload is TC_RTAB_SIZE bytes.  Entry i holds the number of
 * ticks needed to send a packet of (i + 1) << cell_log bytes at rate->rate,
 * with packet sizes below rate->mpu rounded up to rate->mpu.
 *
3227 * See tc_calc_cell_log() above for a description of "rtab"s. */
3229 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3234 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3235 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3236 unsigned packet_size = (i + 1) << rate->cell_log;
3237 if (packet_size < rate->mpu) {
3238 packet_size = rate->mpu;
3240 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Returns, in ticks, the HTB buffer value for rate 'Bps': the time needed at
 * that rate to send the larger of 'burst_bytes' and a minimum burst of one
 * jiffy's worth of data (tc_buffer_per_jiffy()) plus 'mtu' bytes.
 *
 * NOTE(review): 'burst_bytes' is 64-bit but tc_bytes_to_ticks() takes an
 * unsigned int size, so very large bursts are truncated at the call. */
3244 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3245 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3246 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
3251 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
3253 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
3254 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3258 /* Utility functions. */
/* Retrieves interface statistics for the device with index 'ifindex' into
 * '*stats' using an RTM_GETLINK request on rtnl_sock.
 *
 * The reply must carry IFLA_IFNAME; IFLA_STATS is optional in the parse policy
 * but its absence is treated as a failure here (logged, rate-limited).  Each
 * member of the kernel's struct rtnl_link_stats is copied to the like-named
 * member of '*stats'.  The reply buffer is freed on every path visible here.
 *
 * NOTE(review): the return statements are on lines not present in this
 * extract. */
3261 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3263 /* Policy for RTNLGRP_LINK messages.
3265 * There are *many* more fields in these messages, but currently we only
3266 * care about these fields. */
3267 static const struct nl_policy rtnlgrp_link_policy[] = {
3268 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3269 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3270 .min_len = sizeof(struct rtnl_link_stats) },
3273 struct ofpbuf request;
3274 struct ofpbuf *reply;
3275 struct ifinfomsg *ifi;
3276 const struct rtnl_link_stats *rtnl_stats;
3277 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
3280 ofpbuf_init(&request, 0);
3281 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3282 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3283 ifi->ifi_family = PF_UNSPEC;
3284 ifi->ifi_index = ifindex;
3285 error = nl_sock_transact(rtnl_sock, &request, &reply);
3286 ofpbuf_uninit(&request);
3291 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3292 rtnlgrp_link_policy,
3293 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3294 ofpbuf_delete(reply);
3298 if (!attrs[IFLA_STATS]) {
3299 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3300 ofpbuf_delete(reply);
3304 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3305 stats->rx_packets = rtnl_stats->rx_packets;
3306 stats->tx_packets = rtnl_stats->tx_packets;
3307 stats->rx_bytes = rtnl_stats->rx_bytes;
3308 stats->tx_bytes = rtnl_stats->tx_bytes;
3309 stats->rx_errors = rtnl_stats->rx_errors;
3310 stats->tx_errors = rtnl_stats->tx_errors;
3311 stats->rx_dropped = rtnl_stats->rx_dropped;
3312 stats->tx_dropped = rtnl_stats->tx_dropped;
3313 stats->multicast = rtnl_stats->multicast;
3314 stats->collisions = rtnl_stats->collisions;
3315 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3316 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3317 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3318 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3319 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3320 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3321 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3322 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3323 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3324 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3325 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3327 ofpbuf_delete(reply);
/* Retrieves interface statistics for 'netdev_name' into '*stats' by scanning
 * /proc/net/dev line by line.
 *
 * A line counts as parsed when sscanf() yields 15 conversions; other lines
 * are reported as parse errors (rate-limited).  For the line whose device
 * name equals 'netdev_name', the counters that this file does not provide
 * (rx_length, rx_over, rx_crc, rx_missed, tx_aborted, tx_heartbeat and
 * tx_window errors) are set to UINT64_MAX.  If no line matches, a "no stats"
 * warning is logged.
 *
 * NOTE(review): several lines of the sscanf() call and the return statements
 * are not present in this extract. */
3333 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3335 static const char fn[] = "/proc/net/dev";
3340 stream = fopen(fn, "r");
3342 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3347 while (fgets(line, sizeof line, stream)) {
3350 #define X64 "%"SCNu64
3353 X64 X64 X64 X64 X64 X64 X64 "%*u"
3354 X64 X64 X64 X64 X64 X64 X64 "%*u",
3360 &stats->rx_fifo_errors,
3361 &stats->rx_frame_errors,
3367 &stats->tx_fifo_errors,
3369 &stats->tx_carrier_errors) != 15) {
3370 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
3371 } else if (!strcmp(devname, netdev_name)) {
3372 stats->rx_length_errors = UINT64_MAX;
3373 stats->rx_over_errors = UINT64_MAX;
3374 stats->rx_crc_errors = UINT64_MAX;
3375 stats->rx_missed_errors = UINT64_MAX;
3376 stats->tx_aborted_errors = UINT64_MAX;
3377 stats->tx_heartbeat_errors = UINT64_MAX;
3378 stats->tx_window_errors = UINT64_MAX;
3384 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'netdev' with the SIOCGIFFLAGS ioctl (issued
 * through netdev_linux_do_ioctl()) and stores ifr_flags into '*flags'.
 *
 * NOTE(review): the return statement is on a line not present in this
 * extract; '*flags' is assigned unconditionally in the lines shown. */
3390 get_flags(const struct netdev *netdev, int *flags)
3395 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
3397 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS ioctl
 * and returns the result of netdev_linux_do_ioctl(). */
3402 set_flags(struct netdev *netdev, int flags)
3406 ifr.ifr_flags = flags;
3407 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with the
 * SIOCGIFINDEX ioctl on af_inet_sock.  Returns ifr_ifindex on success; a
 * failure is logged (rate-limited).
 *
 * NOTE(review): strncpy() leaves ifr_name without a terminating null byte
 * when 'netdev_name' is at least as long as the field.  The value returned on
 * failure is on a line not present in this extract. */
3412 do_get_ifindex(const char *netdev_name)
3416 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3417 COVERAGE_INC(netdev_get_ifindex);
3418 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
3419 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
3420 netdev_name, strerror(errno));
3423 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' into '*ifindexp', using the value
 * cached in the device when the VALID_IFINDEX bit of cache_valid is set and
 * otherwise looking it up with do_get_ifindex() and caching the result.
 *
 * NOTE(review): the handling of a failed lookup and the return statements are
 * on lines not present in this extract. */
3427 get_ifindex(const struct netdev *netdev_, int *ifindexp)
3429 struct netdev_dev_linux *netdev_dev =
3430 netdev_dev_linux_cast(netdev_get_dev(netdev_));
3432 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
3433 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
3437 netdev_dev->cache_valid |= VALID_IFINDEX;
3438 netdev_dev->ifindex = ifindex;
3440 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl and copies its first ETH_ADDR_LEN bytes into 'ea'.
 *
 * An ioctl failure is logged as an error.  An address family other than
 * AF_UNSPEC or ARPHRD_ETHER only produces a warning; in the lines shown the
 * address is copied regardless.
 *
 * NOTE(review): the return statements are on lines not present in this
 * extract. */
3445 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
3450 memset(&ifr, 0, sizeof ifr);
3451 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3452 COVERAGE_INC(netdev_get_hwaddr);
3453 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
3454 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
3455 netdev_name, strerror(errno));
3458 hwaddr_family = ifr.ifr_hwaddr.sa_family;
3459 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
3460 VLOG_WARN("%s device has unknown hardware address family %d",
3461 netdev_name, hwaddr_family);
3463 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to 'mac', with
 * address family 'hwaddr_family', using the SIOCSIFHWADDR ioctl.  A failure
 * is logged as an error.
 *
 * NOTE(review): the return statements are on lines not present in this
 * extract. */
3468 set_etheraddr(const char *netdev_name, int hwaddr_family,
3469 const uint8_t mac[ETH_ADDR_LEN])
3473 memset(&ifr, 0, sizeof ifr);
3474 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3475 ifr.ifr_hwaddr.sa_family = hwaddr_family;
3476 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
3477 COVERAGE_INC(netdev_set_hwaddr);
3478 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
3479 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
3480 netdev_name, strerror(errno));
/* Issues an ethtool request for the device named 'name' through the
 * SIOCETHTOOL ioctl, with 'ecmd' as the request/response buffer.  'cmd_name'
 * is used only in log messages.
 *
 * Failures other than EOPNOTSUPP are logged (rate-limited); EOPNOTSUPP is
 * deliberately silent because many devices lack ethtool support.
 *
 * NOTE(review): the line that stores 'cmd' into '*ecmd' and the return
 * statements are not present in this extract. */
3487 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
3488 int cmd, const char *cmd_name)
3492 memset(&ifr, 0, sizeof ifr);
3493 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
3494 ifr.ifr_data = (caddr_t) ecmd;
3497 COVERAGE_INC(netdev_ethtool);
3498 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
3501 if (errno != EOPNOTSUPP) {
3502 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
3503 "failed: %s", cmd_name, name, strerror(errno));
3505 /* The device doesn't support this operation. That's pretty
3506 * common, so there's no point in logging anything. */
/* Copies 'name' into ifr->ifr_name and issues ioctl 'cmd' on af_inet_sock
 * with 'ifr' as the argument.  The other members of '*ifr' are used as the
 * caller left them.  A failure is logged at debug level (rate-limited), with
 * 'cmd_name' identifying the request.
 *
 * NOTE(review): the return statements are on lines not present in this
 * extract. */
3513 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
3514 const char *cmd_name)
3516 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
3517 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
3518 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
3526 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
3527 int cmd, const char *cmd_name)
3532 ifr.ifr_addr.sa_family = AF_INET;
3533 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
3535 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
3536 *ip = sin->sin_addr;