2 * Copyright (c) 2009, 2010 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/pkt_sched.h>
29 #include <linux/rtnetlink.h>
30 #include <linux/sockios.h>
31 #include <linux/version.h>
32 #include <sys/types.h>
33 #include <sys/ioctl.h>
34 #include <sys/socket.h>
35 #include <netpacket/packet.h>
36 #include <net/ethernet.h>
38 #include <linux/if_tunnel.h>
39 #include <net/if_arp.h>
40 #include <net/if_packet.h>
41 #include <net/route.h>
42 #include <netinet/in.h>
49 #include "dynamic-string.h"
50 #include "fatal-signal.h"
51 #include "netdev-provider.h"
52 #include "netdev-vport.h"
55 #include "openflow/openflow.h"
57 #include "poll-loop.h"
58 #include "port-array.h"
59 #include "rtnetlink.h"
60 #include "socket-util.h"
65 VLOG_DEFINE_THIS_MODULE(netdev_linux)
67 /* These were introduced in Linux 2.6.14, so they might be missing if we have
69 #ifndef ADVERTISED_Pause
70 #define ADVERTISED_Pause (1 << 13)
72 #ifndef ADVERTISED_Asym_Pause
73 #define ADVERTISED_Asym_Pause (1 << 14)
76 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
79 #define TC_RTAB_SIZE 1024
82 static struct rtnetlink_notifier netdev_linux_cache_notifier;
83 static int cache_notifier_refcount;
86 VALID_IFINDEX = 1 << 0,
87 VALID_ETHERADDR = 1 << 1,
91 VALID_CARRIER = 1 << 5,
92 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
93 VALID_POLICING = 1 << 7,
94 VALID_HAVE_VPORT_STATS = 1 << 8
102 /* Traffic control. */
104 /* An instance of a traffic control class. Always associated with a particular
107 const struct tc_ops *ops;
109 /* Maps from queue ID to tc-specific data.
111 * The generic netdev TC layer uses this to the following extent: if an
112 * entry is nonnull, then the queue whose ID is the index is assumed to
113 * exist; if an entry is null, then that queue is assumed not to exist.
114 * Implementations must adhere to this scheme, although they may store
115 * whatever they like as data.
117 struct port_array queues;
120 /* A particular kind of traffic control. Each implementation generally maps to
121 * one particular Linux qdisc class.
123 * The functions below return 0 if successful or a positive errno value on
124 * failure, except where otherwise noted. All of them must be provided, except
125 * where otherwise noted. */
127 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
128 * This is null for tc_ops_default and tc_ops_other, for which there are no
129 * appropriate values. */
130 const char *linux_name;
132 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
133 const char *ovs_name;
135 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
136 * queues. The queues are numbered 0 through n_queues - 1. */
137 unsigned int n_queues;
139 /* Called to install this TC class on 'netdev'. The implementation should
140 * make the Netlink calls required to set up 'netdev' with the right qdisc
141 * and configure it according to 'details'. The implementation may assume
142 * that the current qdisc is the default; that is, there is no need for it
143 * to delete the current qdisc before installing itself.
145 * The contents of 'details' should be documented as valid for 'ovs_name'
146 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
147 * (which is built as ovs-vswitchd.conf.db(8)).
149 * This function must return 0 if and only if it sets 'netdev->tc' to an
150 * initialized 'struct tc'.
152 * (This function is null for tc_ops_other, which cannot be installed. For
153 * other TC classes it should always be nonnull.) */
154 int (*tc_install)(struct netdev *netdev, const struct shash *details);
156 /* Called when the netdev code determines (through a Netlink query) that
157 * this TC class's qdisc is installed on 'netdev', but we didn't install
158 * it ourselves and so don't know any of the details.
160 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
161 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
162 * implementation should parse the other attributes of 'nlmsg' as
163 * necessary to determine its configuration. If necessary it should also
164 * use Netlink queries to determine the configuration of queues on
167 * This function must return 0 if and only if it sets 'netdev->tc' to an
168 * initialized 'struct tc'. */
169 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
171 /* Destroys the data structures allocated by the implementation as part of
172 * 'tc'. (This includes destroying 'tc->queues' by calling
175 * The implementation should not need to perform any Netlink calls. If
176 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
177 * (But it may not be desirable.)
179 * This function may be null if 'tc' is trivial. */
180 void (*tc_destroy)(struct tc *tc);
182 /* Retrieves details of 'netdev->tc' configuration into 'details'.
184 * The implementation should not need to perform any Netlink calls, because
185 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
186 * cached the configuration.
188 * The contents of 'details' should be documented as valid for 'ovs_name'
189 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
190 * (which is built as ovs-vswitchd.conf.db(8)).
192 * This function may be null if 'tc' is not configurable.
194 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
196 /* Reconfigures 'netdev->tc' according to 'details', performing any
197 * required Netlink calls to complete the reconfiguration.
199 * The contents of 'details' should be documented as valid for 'ovs_name'
200 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
201 * (which is built as ovs-vswitchd.conf.db(8)).
203 * This function may be null if 'tc' is not configurable.
205 int (*qdisc_set)(struct netdev *, const struct shash *details);
207 /* Retrieves details of 'queue_id' on 'netdev->tc' into 'details'. The
208 * caller ensures that 'queues' has a nonnull value for index 'queue_id'.
210 * The contents of 'details' should be documented as valid for 'ovs_name'
211 * in the "other_config" column in the "Queue" table in
212 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the queue configuration.
218 * This function may be null if 'tc' does not have queues ('n_queues' is
220 int (*class_get)(const struct netdev *netdev, unsigned int queue_id,
221 struct shash *details);
223 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
224 * 'details', performing any required Netlink calls to complete the
225 * reconfiguration. The caller ensures that 'queue_id' is less than
228 * The contents of 'details' should be documented as valid for 'ovs_name'
229 * in the "other_config" column in the "Queue" table in
230 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
232 * This function may be null if 'tc' does not have queues or its queues are
233 * not configurable. */
234 int (*class_set)(struct netdev *, unsigned int queue_id,
235 const struct shash *details);
237 /* Deletes 'queue_id' from 'netdev->tc'. The caller ensures that 'queues'
238 * has a nonnull value for index 'queue_id'.
240 * This function may be null if 'tc' does not have queues or its queues
241 * cannot be deleted. */
242 int (*class_delete)(struct netdev *, unsigned int queue_id);
244 /* Obtains stats for 'queue_id' from 'netdev->tc'. The caller ensures that
245 * 'queues' has a nonnull value for index 'queue_id'.
247 * On success, initializes '*stats'.
249 * This function may be null if 'tc' does not have queues or if it cannot
250 * report queue statistics. */
251 int (*class_get_stats)(const struct netdev *netdev, unsigned int queue_id,
252 struct netdev_queue_stats *stats);
254 /* Extracts queue stats from 'nlmsg', which is a response to a
255 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
257 * This function may be null if 'tc' does not have queues or if it cannot
258 * report queue statistics. */
259 int (*class_dump_stats)(const struct netdev *netdev,
260 const struct ofpbuf *nlmsg,
261 netdev_dump_queue_stats_cb *cb, void *aux);
265 tc_init(struct tc *tc, const struct tc_ops *ops)
268 port_array_init(&tc->queues);
272 tc_destroy(struct tc *tc)
274 port_array_destroy(&tc->queues);
277 static const struct tc_ops tc_ops_htb;
278 static const struct tc_ops tc_ops_default;
279 static const struct tc_ops tc_ops_other;
281 static const struct tc_ops *tcs[] = {
282 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
283 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
284 &tc_ops_other, /* Some other qdisc. */
288 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
289 static unsigned int tc_get_major(unsigned int handle);
290 static unsigned int tc_get_minor(unsigned int handle);
292 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
293 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
294 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
296 static struct tcmsg *tc_make_request(const struct netdev *, int type,
297 unsigned int flags, struct ofpbuf *);
298 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
300 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
301 struct nlattr **options);
302 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
303 struct nlattr **options,
304 struct netdev_queue_stats *);
305 static int tc_query_class(const struct netdev *,
306 unsigned int handle, unsigned int parent,
307 struct ofpbuf **replyp);
308 static int tc_delete_class(const struct netdev *, unsigned int handle);
310 static int tc_del_qdisc(struct netdev *netdev);
311 static int tc_query_qdisc(const struct netdev *netdev);
313 static int tc_calc_cell_log(unsigned int mtu);
314 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
315 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
316 const struct tc_ratespec *rate);
317 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
319 struct netdev_dev_linux {
320 struct netdev_dev netdev_dev;
322 struct shash_node *shash_node;
323 unsigned int cache_valid;
325 /* The following are figured out "on demand" only. They are only valid
326 * when the corresponding VALID_* bit in 'cache_valid' is set. */
328 uint8_t etheraddr[ETH_ADDR_LEN];
329 struct in_addr address, netmask;
333 bool is_internal; /* Is this an openvswitch internal device? */
334 bool is_tap; /* Is this a tuntap device? */
335 uint32_t kbits_rate; /* Policing data. */
336 uint32_t kbits_burst;
337 bool have_vport_stats;
341 struct tap_state tap;
345 struct netdev_linux {
346 struct netdev netdev;
350 /* An AF_INET socket (used for ioctl operations). */
351 static int af_inet_sock = -1;
353 /* A Netlink routing socket that is not subscribed to any multicast groups. */
354 static struct nl_sock *rtnl_sock;
356 struct netdev_linux_notifier {
357 struct netdev_notifier notifier;
361 static struct shash netdev_linux_notifiers =
362 SHASH_INITIALIZER(&netdev_linux_notifiers);
363 static struct rtnetlink_notifier netdev_linux_poll_notifier;
365 /* This is set pretty low because we probably won't learn anything from the
366 * additional log messages. */
367 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
369 static int netdev_linux_init(void);
371 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
372 int cmd, const char *cmd_name);
373 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
374 const char *cmd_name);
375 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
376 int cmd, const char *cmd_name);
377 static int get_flags(const struct netdev *, int *flagsp);
378 static int set_flags(struct netdev *, int flags);
379 static int do_get_ifindex(const char *netdev_name);
380 static int get_ifindex(const struct netdev *, int *ifindexp);
381 static int do_set_addr(struct netdev *netdev,
382 int ioctl_nr, const char *ioctl_name,
383 struct in_addr addr);
384 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
385 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
386 const uint8_t[ETH_ADDR_LEN]);
387 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
388 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
/* Reports whether 'netdev_class' uses netdev_linux_init as its 'init' member.
 * The cast helpers in this file assert on this before downcasting. */
391 is_netdev_linux_class(const struct netdev_class *netdev_class)
393 return netdev_class->init == netdev_linux_init;
/* Downcasts 'netdev_dev' to the struct netdev_dev_linux that embeds it as its
 * 'netdev_dev' member.  Asserts first that the device's class satisfies
 * is_netdev_linux_class(). */
396 static struct netdev_dev_linux *
397 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
399 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
400 assert(is_netdev_linux_class(netdev_class));
402 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Downcasts 'netdev' to the struct netdev_linux that embeds it as its 'netdev'
 * member.  Asserts first that the class of the underlying device satisfies
 * is_netdev_linux_class(). */
405 static struct netdev_linux *
406 netdev_linux_cast(const struct netdev *netdev)
408 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
409 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
410 assert(is_netdev_linux_class(netdev_class));
412 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
416 netdev_linux_init(void)
418 static int status = -1;
420 /* Create AF_INET socket. */
421 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
422 status = af_inet_sock >= 0 ? 0 : errno;
424 VLOG_ERR("failed to create inet socket: %s", strerror(status));
427 /* Create rtnetlink socket. */
429 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
431 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
440 netdev_linux_run(void)
442 rtnetlink_notifier_run();
446 netdev_linux_wait(void)
448 rtnetlink_notifier_wait();
/* rtnetlink change callback, registered by netdev_linux_create_system().
 * Invalidates cached device state by zeroing 'cache_valid'.  One path does so
 * for the single device named by 'change->ifname', provided its class
 * satisfies is_netdev_linux_class(); the other walks the devices of
 * netdev_linux_class.  'aux' is unused.
 *
 * NOTE(review): the conditions that select between the two paths, and the
 * assignment of 'dev' inside the loop, are not visible in this excerpt --
 * confirm against the complete function. */
452 netdev_linux_cache_cb(const struct rtnetlink_change *change,
453 void *aux OVS_UNUSED)
455 struct netdev_dev_linux *dev;
457 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
459 const struct netdev_class *netdev_class =
460 netdev_dev_get_class(base_dev);
462 if (is_netdev_linux_class(netdev_class)) {
463 dev = netdev_dev_linux_cast(base_dev);
464 dev->cache_valid = 0;
468 struct shash device_shash;
469 struct shash_node *node;
471 shash_init(&device_shash);
472 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
473 SHASH_FOR_EACH (node, &device_shash) {
475 dev->cache_valid = 0;
477 shash_destroy(&device_shash);
481 /* Creates the netdev device of 'type' with 'name'. */
483 netdev_linux_create_system(const char *name, const char *type OVS_UNUSED,
484 const struct shash *args, struct netdev_dev **netdev_devp)
486 struct netdev_dev_linux *netdev_dev;
489 if (!shash_is_empty(args)) {
490 VLOG_WARN("%s: arguments for system devices should be empty", name);
493 if (!cache_notifier_refcount) {
494 error = rtnetlink_notifier_register(&netdev_linux_cache_notifier,
495 netdev_linux_cache_cb, NULL);
500 cache_notifier_refcount++;
502 netdev_dev = xzalloc(sizeof *netdev_dev);
503 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_linux_class);
505 *netdev_devp = &netdev_dev->netdev_dev;
509 /* For most types of netdevs we open the device for each call of
510 * netdev_open(). However, this is not the case with tap devices,
511 * since it is only possible to open the device once. In this
512 * situation we share a single file descriptor, and consequently
513 * buffers, across all readers. Therefore once data is read it will
514 * be unavailable to other reads for tap devices. */
516 netdev_linux_create_tap(const char *name, const char *type OVS_UNUSED,
517 const struct shash *args, struct netdev_dev **netdev_devp)
519 struct netdev_dev_linux *netdev_dev;
520 struct tap_state *state;
521 static const char tap_dev[] = "/dev/net/tun";
525 if (!shash_is_empty(args)) {
526 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
529 netdev_dev = xzalloc(sizeof *netdev_dev);
530 state = &netdev_dev->state.tap;
532 /* Open tap device. */
533 state->fd = open(tap_dev, O_RDWR);
536 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
540 /* Create tap device. */
541 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
542 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
543 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
544 VLOG_WARN("%s: creating tap device failed: %s", name,
550 /* Make non-blocking. */
551 error = set_nonblocking(state->fd);
556 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
557 *netdev_devp = &netdev_dev->netdev_dev;
566 destroy_tap(struct netdev_dev_linux *netdev_dev)
568 struct tap_state *state = &netdev_dev->state.tap;
570 if (state->fd >= 0) {
575 /* Destroys the netdev device 'netdev_dev_'. */
577 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
579 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
580 const char *type = netdev_dev_get_type(netdev_dev_);
582 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
583 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
586 if (!strcmp(type, "system")) {
587 cache_notifier_refcount--;
589 if (!cache_notifier_refcount) {
590 rtnetlink_notifier_unregister(&netdev_linux_cache_notifier);
592 } else if (!strcmp(type, "tap")) {
593 destroy_tap(netdev_dev);
600 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
601 struct netdev **netdevp)
603 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
604 struct netdev_linux *netdev;
605 enum netdev_flags flags;
608 /* Allocate network device. */
609 netdev = xzalloc(sizeof *netdev);
611 netdev_init(&netdev->netdev, netdev_dev_);
613 error = netdev_get_flags(&netdev->netdev, &flags);
614 if (error == ENODEV) {
618 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
619 !netdev_dev->state.tap.opened) {
621 /* We assume that the first user of the tap device is the primary user
622 * and give them the tap FD. Subsequent users probably just expect
623 * this to be a system device so open it normally to avoid send/receive
624 * directions appearing to be reversed. */
625 netdev->fd = netdev_dev->state.tap.fd;
626 netdev_dev->state.tap.opened = true;
627 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
628 struct sockaddr_ll sll;
632 /* Create file descriptor. */
633 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
634 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
636 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
637 if (netdev->fd < 0) {
642 /* Set non-blocking mode. */
643 error = set_nonblocking(netdev->fd);
648 /* Get ethernet device index. */
649 error = get_ifindex(&netdev->netdev, &ifindex);
654 /* Bind to specific ethernet device. */
655 memset(&sll, 0, sizeof sll);
656 sll.sll_family = AF_PACKET;
657 sll.sll_ifindex = ifindex;
659 (struct sockaddr *) &sll, sizeof sll) < 0) {
661 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
666 /* Between the socket() and bind() calls above, the socket receives all
667 * packets of the requested type on all system interfaces. We do not
668 * want to receive that data, but there is no way to avoid it. So we
669 * must now drain out the receive queue. */
670 error = drain_rcvbuf(netdev->fd);
676 *netdevp = &netdev->netdev;
680 netdev_uninit(&netdev->netdev, true);
684 /* Closes and destroys 'netdev'. */
686 netdev_linux_close(struct netdev *netdev_)
688 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Descriptor 0 is a valid file descriptor; only a negative 'fd' means "none",
 * matching the 'fd < 0' / 'fd >= 0' tests used elsewhere in this file.
 * strcmp() is nonzero for every device type other than "tap". */
690 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
696 /* Initializes 'svec' with a list of the names of all known network devices. */
698 netdev_linux_enumerate(struct svec *svec)
700 struct if_nameindex *names;
702 names = if_nameindex();
706 for (i = 0; names[i].if_name != NULL; i++) {
707 svec_add(svec, names[i].if_name);
709 if_freenameindex(names);
712 VLOG_WARN("could not obtain list of network device names: %s",
/* Reads from the file descriptor of 'netdev_' into the 'size'-byte buffer at
 * 'data'.  A negative descriptor means the device was opened with
 * NETDEV_ETH_TYPE_NONE.  read() failures other than EINTR and EAGAIN are
 * reported through a rate-limited warning.
 *
 * NOTE(review): the return statements are not visible in this excerpt, so the
 * return convention is not restated here -- confirm against the complete
 * function. */
719 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
721 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
723 if (netdev->fd < 0) {
724 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
729 ssize_t retval = read(netdev->fd, data, size);
732 } else if (errno != EINTR) {
733 if (errno != EAGAIN) {
/* Argument order follows the format string: device name, then error text
 * (same order as the warning in netdev_linux_send()). */
734 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
735 netdev_get_name(netdev_), strerror(errno));
742 /* Registers with the poll loop to wake up from the next call to poll_block()
743 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
745 netdev_linux_recv_wait(struct netdev *netdev_)
747 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
748 if (netdev->fd >= 0) {
749 poll_fd_wait(netdev->fd, POLLIN);
753 /* Discards all packets waiting to be received from 'netdev'. */
755 netdev_linux_drain(struct netdev *netdev_)
757 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
758 if (netdev->fd < 0) {
760 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
762 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
763 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
767 drain_fd(netdev->fd, ifr.ifr_qlen);
770 return drain_rcvbuf(netdev->fd);
774 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
775 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
776 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
777 * the packet is too big or too small to transmit on the device.
779 * The caller retains ownership of 'buffer' in all cases.
781 * The kernel maintains a packet transmission queue, so the caller is not
782 * expected to do additional queuing of packets. */
784 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
786 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
788 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
790 if (netdev->fd < 0) {
795 ssize_t retval = write(netdev->fd, data, size);
797 /* The Linux AF_PACKET implementation never blocks waiting for room
798 * for packets, instead returning ENOBUFS. Translate this into
799 * EAGAIN for the caller. */
800 if (errno == ENOBUFS) {
802 } else if (errno == EINTR) {
804 } else if (errno != EAGAIN) {
805 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
806 netdev_get_name(netdev_), strerror(errno));
809 } else if (retval != size) {
810 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
811 "%zu) on %s", retval, size, netdev_get_name(netdev_));
819 /* Registers with the poll loop to wake up from the next call to poll_block()
820 * when the packet transmission queue has sufficient room to transmit a packet
821 * with netdev_send().
823 * The kernel maintains a packet transmission queue, so the client is not
824 * expected to do additional queuing of packets. Thus, this function is
825 * unlikely to ever be used. It is included for completeness. */
827 netdev_linux_send_wait(struct netdev *netdev_)
829 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
830 if (netdev->fd < 0) {
832 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
833 poll_fd_wait(netdev->fd, POLLOUT);
835 /* TAP device always accepts packets.*/
836 poll_immediate_wake();
840 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
841 * otherwise a positive errno value. */
843 netdev_linux_set_etheraddr(struct netdev *netdev_,
844 const uint8_t mac[ETH_ADDR_LEN])
846 struct netdev_dev_linux *netdev_dev =
847 netdev_dev_linux_cast(netdev_get_dev(netdev_));
850 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
851 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
852 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
854 netdev_dev->cache_valid |= VALID_ETHERADDR;
855 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
863 /* Copies 'netdev''s MAC address into the ETH_ADDR_LEN-byte buffer 'mac'.
864 * The address is looked up with get_etheraddr() only while the
 * VALID_ETHERADDR bit is clear in the device's 'cache_valid'; afterwards the
 * copy cached in the device is used. */
866 netdev_linux_get_etheraddr(const struct netdev *netdev_,
867 uint8_t mac[ETH_ADDR_LEN])
869 struct netdev_dev_linux *netdev_dev =
870 netdev_dev_linux_cast(netdev_get_dev(netdev_));
871 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
872 int error = get_etheraddr(netdev_get_name(netdev_),
873 netdev_dev->etheraddr);
877 netdev_dev->cache_valid |= VALID_ETHERADDR;
879 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
883 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
884 * in bytes, not including the hardware header; thus, this is typically 1500
885 * bytes for Ethernet devices. */
887 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
889 struct netdev_dev_linux *netdev_dev =
890 netdev_dev_linux_cast(netdev_get_dev(netdev_));
891 if (!(netdev_dev->cache_valid & VALID_MTU)) {
895 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
896 SIOCGIFMTU, "SIOCGIFMTU");
900 netdev_dev->mtu = ifr.ifr_mtu;
901 netdev_dev->cache_valid |= VALID_MTU;
903 *mtup = netdev_dev->mtu;
907 /* Returns the ifindex of 'netdev', if successful, as a positive number.
908 * On failure, returns a negative errno value. */
910 netdev_linux_get_ifindex(const struct netdev *netdev)
914 error = get_ifindex(netdev, &ifindex);
915 return error ? -error : ifindex;
/* Stores the carrier state of 'netdev_' in '*carrier'.  The state is cached in
 * the device under the VALID_CARRIER bit of 'cache_valid'.  On a cache miss it
 * is read from /sys/class/net/<name>/carrier, whose first byte must be '0' or
 * '1'; any value other than '0' is recorded as carrier present. */
919 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
921 struct netdev_dev_linux *netdev_dev =
922 netdev_dev_linux_cast(netdev_get_dev(netdev_));
927 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
931 fn = xasprintf("/sys/class/net/%s/carrier",
932 netdev_get_name(netdev_));
933 fd = open(fn, O_RDONLY);
936 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
940 retval = read(fd, line, sizeof line);
943 if (error == EINVAL) {
944 /* This is the normal return value when we try to check carrier
945 * if the network device is not up. */
947 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
950 } else if (retval == 0) {
952 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
956 if (line[0] != '0' && line[0] != '1') {
958 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
962 netdev_dev->carrier = line[0] != '0';
963 netdev_dev->cache_valid |= VALID_CARRIER;
965 *carrier = netdev_dev->carrier;
976 /* Checks whether we can use RTM_GETLINK to get network device statistics.
977 * In pre-2.6.19 kernels, this was only available if wireless extensions were
980 check_for_working_netlink_stats(void)
982 /* Decide on the netdev_get_stats() implementation to use. Netlink is
983 * preferable, so if that works, we'll use it. */
984 int ifindex = do_get_ifindex("lo");
986 VLOG_WARN("failed to get ifindex for lo, "
987 "obtaining netdev stats from proc");
990 struct netdev_stats stats;
991 int error = get_stats_via_netlink(ifindex, &stats);
993 VLOG_DBG("obtaining netdev stats via rtnetlink");
996 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
997 "via proc (you are probably running a pre-2.6.19 "
998 "kernel)", strerror(error));
1004 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1006 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1008 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1009 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1010 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1012 netdev_dev->is_tap = !strcmp(type, "tap");
1013 netdev_dev->is_internal = false;
1014 if (!netdev_dev->is_tap) {
1015 struct ethtool_drvinfo drvinfo;
1018 memset(&drvinfo, 0, sizeof drvinfo);
1019 error = netdev_linux_do_ethtool(name,
1020 (struct ethtool_cmd *)&drvinfo,
1022 "ETHTOOL_GDRVINFO");
1024 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1025 netdev_dev->is_internal = true;
1029 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1034 swap_uint64(uint64_t *a, uint64_t *b)
1041 /* Retrieves current device stats for 'netdev'. */
1043 netdev_linux_get_stats(const struct netdev *netdev_,
1044 struct netdev_stats *stats)
1046 struct netdev_dev_linux *netdev_dev =
1047 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1048 static int use_netlink_stats = -1;
1051 COVERAGE_INC(netdev_get_stats);
1053 if (netdev_dev->have_vport_stats ||
1054 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1056 error = netdev_vport_get_stats(netdev_, stats);
1057 netdev_dev->have_vport_stats = !error;
1058 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1061 if (!netdev_dev->have_vport_stats) {
1062 if (use_netlink_stats < 0) {
1063 use_netlink_stats = check_for_working_netlink_stats();
1065 if (use_netlink_stats) {
1068 error = get_ifindex(netdev_, &ifindex);
1070 error = get_stats_via_netlink(ifindex, stats);
1073 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1077 /* If this port is an internal port then the transmit and receive stats
1078 * will appear to be swapped relative to the other ports since we are the
1079 * one sending the data, not a remote computer. For consistency, we swap
1080 * them back here. This does not apply if we are getting stats from the
1081 * vport layer because it always tracks stats from the perspective of the
1083 netdev_linux_update_is_pseudo(netdev_dev);
1084 if (!error && !netdev_dev->have_vport_stats &&
1085 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1086 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1087 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1088 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1089 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1090 stats->rx_length_errors = 0;
1091 stats->rx_over_errors = 0;
1092 stats->rx_crc_errors = 0;
1093 stats->rx_frame_errors = 0;
1094 stats->rx_fifo_errors = 0;
1095 stats->rx_missed_errors = 0;
1096 stats->tx_aborted_errors = 0;
1097 stats->tx_carrier_errors = 0;
1098 stats->tx_fifo_errors = 0;
1099 stats->tx_heartbeat_errors = 0;
1100 stats->tx_window_errors = 0;
1106 /* Stores the features supported by 'netdev' into each of '*current',
1107 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1108 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1109 * successful, otherwise a positive errno value. */
1111 netdev_linux_get_features(struct netdev *netdev,
1112 uint32_t *current, uint32_t *advertised,
1113 uint32_t *supported, uint32_t *peer)
1115 struct ethtool_cmd ecmd;
1118 memset(&ecmd, 0, sizeof ecmd);
1119 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1120 ETHTOOL_GSET, "ETHTOOL_GSET");
1125 /* Supported features. */
1127 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1128 *supported |= OFPPF_10MB_HD;
1130 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1131 *supported |= OFPPF_10MB_FD;
1133 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1134 *supported |= OFPPF_100MB_HD;
1136 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1137 *supported |= OFPPF_100MB_FD;
1139 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1140 *supported |= OFPPF_1GB_HD;
1142 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1143 *supported |= OFPPF_1GB_FD;
1145 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1146 *supported |= OFPPF_10GB_FD;
1148 if (ecmd.supported & SUPPORTED_TP) {
1149 *supported |= OFPPF_COPPER;
1151 if (ecmd.supported & SUPPORTED_FIBRE) {
1152 *supported |= OFPPF_FIBER;
1154 if (ecmd.supported & SUPPORTED_Autoneg) {
1155 *supported |= OFPPF_AUTONEG;
1157 if (ecmd.supported & SUPPORTED_Pause) {
1158 *supported |= OFPPF_PAUSE;
1160 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1161 *supported |= OFPPF_PAUSE_ASYM;
1164 /* Advertised features. */
1166 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1167 *advertised |= OFPPF_10MB_HD;
1169 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1170 *advertised |= OFPPF_10MB_FD;
1172 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1173 *advertised |= OFPPF_100MB_HD;
1175 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1176 *advertised |= OFPPF_100MB_FD;
1178 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1179 *advertised |= OFPPF_1GB_HD;
1181 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1182 *advertised |= OFPPF_1GB_FD;
1184 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1185 *advertised |= OFPPF_10GB_FD;
1187 if (ecmd.advertising & ADVERTISED_TP) {
1188 *advertised |= OFPPF_COPPER;
1190 if (ecmd.advertising & ADVERTISED_FIBRE) {
1191 *advertised |= OFPPF_FIBER;
1193 if (ecmd.advertising & ADVERTISED_Autoneg) {
1194 *advertised |= OFPPF_AUTONEG;
1196 if (ecmd.advertising & ADVERTISED_Pause) {
1197 *advertised |= OFPPF_PAUSE;
1199 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1200 *advertised |= OFPPF_PAUSE_ASYM;
1203 /* Current settings. */
1204 if (ecmd.speed == SPEED_10) {
1205 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1206 } else if (ecmd.speed == SPEED_100) {
1207 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1208 } else if (ecmd.speed == SPEED_1000) {
1209 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1210 } else if (ecmd.speed == SPEED_10000) {
1211 *current = OFPPF_10GB_FD;
1216 if (ecmd.port == PORT_TP) {
1217 *current |= OFPPF_COPPER;
1218 } else if (ecmd.port == PORT_FIBRE) {
1219 *current |= OFPPF_FIBER;
1223 *current |= OFPPF_AUTONEG;
1226 /* Peer advertisements. */
1227 *peer = 0; /* XXX */
1232 /* Set the features advertised by 'netdev' to 'advertise'. */
1234 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1236 struct ethtool_cmd ecmd;
1239 memset(&ecmd, 0, sizeof ecmd);
1240 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1241 ETHTOOL_GSET, "ETHTOOL_GSET");
1246 ecmd.advertising = 0;
1247 if (advertise & OFPPF_10MB_HD) {
1248 ecmd.advertising |= ADVERTISED_10baseT_Half;
1250 if (advertise & OFPPF_10MB_FD) {
1251 ecmd.advertising |= ADVERTISED_10baseT_Full;
1253 if (advertise & OFPPF_100MB_HD) {
1254 ecmd.advertising |= ADVERTISED_100baseT_Half;
1256 if (advertise & OFPPF_100MB_FD) {
1257 ecmd.advertising |= ADVERTISED_100baseT_Full;
1259 if (advertise & OFPPF_1GB_HD) {
1260 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1262 if (advertise & OFPPF_1GB_FD) {
1263 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1265 if (advertise & OFPPF_10GB_FD) {
1266 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1268 if (advertise & OFPPF_COPPER) {
1269 ecmd.advertising |= ADVERTISED_TP;
1271 if (advertise & OFPPF_FIBER) {
1272 ecmd.advertising |= ADVERTISED_FIBRE;
1274 if (advertise & OFPPF_AUTONEG) {
1275 ecmd.advertising |= ADVERTISED_Autoneg;
1277 if (advertise & OFPPF_PAUSE) {
1278 ecmd.advertising |= ADVERTISED_Pause;
1280 if (advertise & OFPPF_PAUSE_ASYM) {
1281 ecmd.advertising |= ADVERTISED_Asym_Pause;
1283 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1284 ETHTOOL_SSET, "ETHTOOL_SSET");
1287 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1288 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1289 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1290 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1291 * sets '*vlan_vid' to -1. */
1293 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1295 const char *netdev_name = netdev_get_name(netdev);
1296 struct ds line = DS_EMPTY_INITIALIZER;
1297 FILE *stream = NULL;
1301 COVERAGE_INC(netdev_get_vlan_vid);
1302 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1303 stream = fopen(fn, "r");
1309 if (ds_get_line(&line, stream)) {
1310 if (ferror(stream)) {
1312 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1315 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1320 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1322 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1323 fn, ds_cstr(&line));
/* Shell command templates used to configure ingress policing.
 *
 * POLICE_ADD_CMD takes the device name.  POLICE_CONFIG_CMD takes the device
 * name followed by the rate (in kbit/s) and the burst size, both uint32_t,
 * hence the PRIu32 conversions: a plain "%d" would print values above INT_MAX
 * as negative numbers. */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %"PRIu32"kbit burst %"PRIu32"k mtu 65535 drop flowid :1"
1344 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1345 * positive errno value.
1347 * This function is equivalent to running
1348 * /sbin/tc qdisc del dev %s handle ffff: ingress
1349 * but it is much, much faster.
1352 netdev_linux_remove_policing(struct netdev *netdev)
1354 struct netdev_dev_linux *netdev_dev =
1355 netdev_dev_linux_cast(netdev_get_dev(netdev));
1356 const char *netdev_name = netdev_get_name(netdev);
1358 struct ofpbuf request;
1359 struct tcmsg *tcmsg;
1362 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1363 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1364 tcmsg->tcm_parent = TC_H_INGRESS;
1365 nl_msg_put_string(&request, TCA_KIND, "ingress");
1366 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1368 error = tc_transact(&request, NULL);
1369 if (error && error != ENOENT && error != EINVAL) {
1370 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1371 netdev_name, strerror(error));
1375 netdev_dev->kbits_rate = 0;
1376 netdev_dev->kbits_burst = 0;
1377 netdev_dev->cache_valid |= VALID_POLICING;
1381 /* Attempts to set input rate limiting (policing) policy. */
1383 netdev_linux_set_policing(struct netdev *netdev,
1384 uint32_t kbits_rate, uint32_t kbits_burst)
1386 struct netdev_dev_linux *netdev_dev =
1387 netdev_dev_linux_cast(netdev_get_dev(netdev));
1388 const char *netdev_name = netdev_get_name(netdev);
1391 COVERAGE_INC(netdev_set_policing);
1393 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1394 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1395 : kbits_burst); /* Stick with user-specified value. */
1397 if (netdev_dev->cache_valid & VALID_POLICING
1398 && netdev_dev->kbits_rate == kbits_rate
1399 && netdev_dev->kbits_burst == kbits_burst) {
1400 /* Assume that settings haven't changed since we last set them. */
1404 netdev_linux_remove_policing(netdev);
1406 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1407 if (system(command) != 0) {
1408 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1412 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1413 kbits_rate, kbits_burst);
1414 if (system(command) != 0) {
1415 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1420 netdev_dev->kbits_rate = kbits_rate;
1421 netdev_dev->kbits_burst = kbits_burst;
1422 netdev_dev->cache_valid |= VALID_POLICING;
1429 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1432 const struct tc_ops **opsp;
1434 for (opsp = tcs; *opsp != NULL; opsp++) {
1435 const struct tc_ops *ops = *opsp;
1436 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1437 svec_add(types, ops->ovs_name);
1443 static const struct tc_ops *
1444 tc_lookup_ovs_name(const char *name)
1446 const struct tc_ops **opsp;
1448 for (opsp = tcs; *opsp != NULL; opsp++) {
1449 const struct tc_ops *ops = *opsp;
1450 if (!strcmp(name, ops->ovs_name)) {
1457 static const struct tc_ops *
1458 tc_lookup_linux_name(const char *name)
1460 const struct tc_ops **opsp;
1462 for (opsp = tcs; *opsp != NULL; opsp++) {
1463 const struct tc_ops *ops = *opsp;
1464 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1472 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1474 struct netdev_qos_capabilities *caps)
1476 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1480 caps->n_queues = ops->n_queues;
1485 netdev_linux_get_qos(const struct netdev *netdev,
1486 const char **typep, struct shash *details)
1488 struct netdev_dev_linux *netdev_dev =
1489 netdev_dev_linux_cast(netdev_get_dev(netdev));
1492 error = tc_query_qdisc(netdev);
1497 *typep = netdev_dev->tc->ops->ovs_name;
1498 return (netdev_dev->tc->ops->qdisc_get
1499 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1504 netdev_linux_set_qos(struct netdev *netdev,
1505 const char *type, const struct shash *details)
1507 struct netdev_dev_linux *netdev_dev =
1508 netdev_dev_linux_cast(netdev_get_dev(netdev));
1509 const struct tc_ops *new_ops;
1512 new_ops = tc_lookup_ovs_name(type);
1513 if (!new_ops || !new_ops->tc_install) {
1517 error = tc_query_qdisc(netdev);
1522 if (new_ops == netdev_dev->tc->ops) {
1523 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1525 /* Delete existing qdisc. */
1526 error = tc_del_qdisc(netdev);
1530 assert(netdev_dev->tc == NULL);
1532 /* Install new qdisc. */
1533 error = new_ops->tc_install(netdev, details);
1534 assert((error == 0) == (netdev_dev->tc != NULL));
1541 netdev_linux_get_queue(const struct netdev *netdev,
1542 unsigned int queue_id, struct shash *details)
1544 struct netdev_dev_linux *netdev_dev =
1545 netdev_dev_linux_cast(netdev_get_dev(netdev));
1548 error = tc_query_qdisc(netdev);
1551 } else if (queue_id > UINT16_MAX
1552 || !port_array_get(&netdev_dev->tc->queues, queue_id)) {
1556 return netdev_dev->tc->ops->class_get(netdev, queue_id, details);
1560 netdev_linux_set_queue(struct netdev *netdev,
1561 unsigned int queue_id, const struct shash *details)
1563 struct netdev_dev_linux *netdev_dev =
1564 netdev_dev_linux_cast(netdev_get_dev(netdev));
1567 error = tc_query_qdisc(netdev);
1570 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1571 || !netdev_dev->tc->ops->class_set) {
1575 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1579 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1581 struct netdev_dev_linux *netdev_dev =
1582 netdev_dev_linux_cast(netdev_get_dev(netdev));
1585 error = tc_query_qdisc(netdev);
1588 } else if (!netdev_dev->tc->ops->class_delete) {
1590 } else if (queue_id > UINT16_MAX
1591 || !port_array_get(&netdev_dev->tc->queues, queue_id)) {
1595 return netdev_dev->tc->ops->class_delete(netdev, queue_id);
1599 netdev_linux_get_queue_stats(const struct netdev *netdev,
1600 unsigned int queue_id,
1601 struct netdev_queue_stats *stats)
1603 struct netdev_dev_linux *netdev_dev =
1604 netdev_dev_linux_cast(netdev_get_dev(netdev));
1607 error = tc_query_qdisc(netdev);
1610 } else if (queue_id > UINT16_MAX
1611 || !port_array_get(&netdev_dev->tc->queues, queue_id)) {
1613 } else if (!netdev_dev->tc->ops->class_get_stats) {
1617 return netdev_dev->tc->ops->class_get_stats(netdev, queue_id, stats);
1621 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1623 struct ofpbuf request;
1624 struct tcmsg *tcmsg;
1626 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1627 tcmsg->tcm_parent = 0;
1628 nl_dump_start(dump, rtnl_sock, &request);
1629 ofpbuf_uninit(&request);
1633 netdev_linux_dump_queues(const struct netdev *netdev,
1634 netdev_dump_queues_cb *cb, void *aux)
1636 struct netdev_dev_linux *netdev_dev =
1637 netdev_dev_linux_cast(netdev_get_dev(netdev));
1638 unsigned int queue_id;
1639 struct shash details;
1644 error = tc_query_qdisc(netdev);
1647 } else if (!netdev_dev->tc->ops->class_get) {
1652 shash_init(&details);
1653 PORT_ARRAY_FOR_EACH (queue, &netdev_dev->tc->queues, queue_id) {
1654 shash_clear(&details);
1656 error = netdev_dev->tc->ops->class_get(netdev, queue_id, &details);
1658 (*cb)(queue_id, &details, aux);
1663 shash_destroy(&details);
1669 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1670 netdev_dump_queue_stats_cb *cb, void *aux)
1672 struct netdev_dev_linux *netdev_dev =
1673 netdev_dev_linux_cast(netdev_get_dev(netdev));
1674 struct nl_dump dump;
1679 error = tc_query_qdisc(netdev);
1682 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1687 start_queue_dump(netdev, &dump);
1688 while (nl_dump_next(&dump, &msg)) {
1689 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1695 error = nl_dump_done(&dump);
1696 return error ? error : last_error;
1700 netdev_linux_get_in4(const struct netdev *netdev_,
1701 struct in_addr *address, struct in_addr *netmask)
1703 struct netdev_dev_linux *netdev_dev =
1704 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1706 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1709 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1710 SIOCGIFADDR, "SIOCGIFADDR");
1715 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1716 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1721 netdev_dev->cache_valid |= VALID_IN4;
1723 *address = netdev_dev->address;
1724 *netmask = netdev_dev->netmask;
1725 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1729 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1730 struct in_addr netmask)
1732 struct netdev_dev_linux *netdev_dev =
1733 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1736 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1738 netdev_dev->cache_valid |= VALID_IN4;
1739 netdev_dev->address = address;
1740 netdev_dev->netmask = netmask;
1741 if (address.s_addr != INADDR_ANY) {
1742 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1743 "SIOCSIFNETMASK", netmask);
1750 parse_if_inet6_line(const char *line,
1751 struct in6_addr *in6, char ifname[16 + 1])
1753 uint8_t *s6 = in6->s6_addr;
1754 #define X8 "%2"SCNx8
1756 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1757 "%*x %*x %*x %*x %16s\n",
1758 &s6[0], &s6[1], &s6[2], &s6[3],
1759 &s6[4], &s6[5], &s6[6], &s6[7],
1760 &s6[8], &s6[9], &s6[10], &s6[11],
1761 &s6[12], &s6[13], &s6[14], &s6[15],
1765 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1766 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1768 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1770 struct netdev_dev_linux *netdev_dev =
1771 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1772 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1776 netdev_dev->in6 = in6addr_any;
1778 file = fopen("/proc/net/if_inet6", "r");
1780 const char *name = netdev_get_name(netdev_);
1781 while (fgets(line, sizeof line, file)) {
1782 struct in6_addr in6;
1783 char ifname[16 + 1];
1784 if (parse_if_inet6_line(line, &in6, ifname)
1785 && !strcmp(name, ifname))
1787 netdev_dev->in6 = in6;
1793 netdev_dev->cache_valid |= VALID_IN6;
1795 *in6 = netdev_dev->in6;
1800 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
1802 struct sockaddr_in sin;
1803 memset(&sin, 0, sizeof sin);
1804 sin.sin_family = AF_INET;
1805 sin.sin_addr = addr;
1808 memset(sa, 0, sizeof *sa);
1809 memcpy(sa, &sin, sizeof sin);
1813 do_set_addr(struct netdev *netdev,
1814 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1817 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1818 make_in4_sockaddr(&ifr.ifr_addr, addr);
1820 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1824 /* Adds 'router' as a default IP gateway. */
1826 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1828 struct in_addr any = { INADDR_ANY };
1832 memset(&rt, 0, sizeof rt);
1833 make_in4_sockaddr(&rt.rt_dst, any);
1834 make_in4_sockaddr(&rt.rt_gateway, router);
1835 make_in4_sockaddr(&rt.rt_genmask, any);
1836 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1837 COVERAGE_INC(netdev_add_router);
1838 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1840 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
1846 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1849 static const char fn[] = "/proc/net/route";
1854 *netdev_name = NULL;
1855 stream = fopen(fn, "r");
1856 if (stream == NULL) {
1857 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1862 while (fgets(line, sizeof line, stream)) {
1865 uint32_t dest, gateway, mask;
1866 int refcnt, metric, mtu;
1867 unsigned int flags, use, window, irtt;
1870 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1872 iface, &dest, &gateway, &flags, &refcnt,
1873 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1875 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
1879 if (!(flags & RTF_UP)) {
1880 /* Skip routes that aren't up. */
1884 /* The output of 'dest', 'mask', and 'gateway' were given in
1885 * network byte order, so we don't need any endian
1886 * conversions here. */
1887 if ((dest & mask) == (host->s_addr & mask)) {
1889 /* The host is directly reachable. */
1890 next_hop->s_addr = 0;
1892 /* To reach the host, we must go through a gateway. */
1893 next_hop->s_addr = gateway;
1895 *netdev_name = xstrdup(iface);
1906 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
1907 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
1908 * returns 0. Otherwise, it returns a positive errno value; in particular,
1909 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
1911 netdev_linux_arp_lookup(const struct netdev *netdev,
1912 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
1915 struct sockaddr_in sin;
1918 memset(&r, 0, sizeof r);
1919 sin.sin_family = AF_INET;
1920 sin.sin_addr.s_addr = ip;
1922 memcpy(&r.arp_pa, &sin, sizeof sin);
1923 r.arp_ha.sa_family = ARPHRD_ETHER;
1925 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
1926 COVERAGE_INC(netdev_arp_lookup);
1927 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
1929 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
1930 } else if (retval != ENXIO) {
1931 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
1932 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
1938 nd_to_iff_flags(enum netdev_flags nd)
1941 if (nd & NETDEV_UP) {
1944 if (nd & NETDEV_PROMISC) {
1951 iff_to_nd_flags(int iff)
1953 enum netdev_flags nd = 0;
1957 if (iff & IFF_PROMISC) {
1958 nd |= NETDEV_PROMISC;
1964 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
1965 enum netdev_flags on, enum netdev_flags *old_flagsp)
1967 int old_flags, new_flags;
1970 error = get_flags(netdev, &old_flags);
1972 *old_flagsp = iff_to_nd_flags(old_flags);
1973 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
1974 if (new_flags != old_flags) {
1975 error = set_flags(netdev, new_flags);
1982 poll_notify(struct list *list)
1984 struct netdev_linux_notifier *notifier;
1985 LIST_FOR_EACH (notifier, struct netdev_linux_notifier, node, list) {
1986 struct netdev_notifier *n = ¬ifier->notifier;
1992 netdev_linux_poll_cb(const struct rtnetlink_change *change,
1993 void *aux OVS_UNUSED)
1996 struct list *list = shash_find_data(&netdev_linux_notifiers,
2002 struct shash_node *node;
2003 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2004 poll_notify(node->data);
2010 netdev_linux_poll_add(struct netdev *netdev,
2011 void (*cb)(struct netdev_notifier *), void *aux,
2012 struct netdev_notifier **notifierp)
2014 const char *netdev_name = netdev_get_name(netdev);
2015 struct netdev_linux_notifier *notifier;
2018 if (shash_is_empty(&netdev_linux_notifiers)) {
2019 int error = rtnetlink_notifier_register(&netdev_linux_poll_notifier,
2020 netdev_linux_poll_cb, NULL);
2026 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2028 list = xmalloc(sizeof *list);
2030 shash_add(&netdev_linux_notifiers, netdev_name, list);
2033 notifier = xmalloc(sizeof *notifier);
2034 netdev_notifier_init(¬ifier->notifier, netdev, cb, aux);
2035 list_push_back(list, ¬ifier->node);
2036 *notifierp = ¬ifier->notifier;
2041 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2043 struct netdev_linux_notifier *notifier =
2044 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2047 /* Remove 'notifier' from its list. */
2048 list = list_remove(¬ifier->node);
2049 if (list_is_empty(list)) {
2050 /* The list is now empty. Remove it from the hash and free it. */
2051 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2052 shash_delete(&netdev_linux_notifiers,
2053 shash_find(&netdev_linux_notifiers, netdev_name));
2058 /* If that was the last notifier, unregister. */
2059 if (shash_is_empty(&netdev_linux_notifiers)) {
2060 rtnetlink_notifier_unregister(&netdev_linux_poll_notifier);
2064 const struct netdev_class netdev_linux_class = {
2071 netdev_linux_create_system,
2072 netdev_linux_destroy,
2073 NULL, /* reconfigure */
2078 netdev_linux_enumerate,
2081 netdev_linux_recv_wait,
2083 NULL, /* dispatch */
2089 netdev_linux_send_wait,
2091 netdev_linux_set_etheraddr,
2092 netdev_linux_get_etheraddr,
2093 netdev_linux_get_mtu,
2094 netdev_linux_get_ifindex,
2095 netdev_linux_get_carrier,
2096 netdev_linux_get_stats,
2097 netdev_vport_set_stats,
2099 netdev_linux_get_features,
2100 netdev_linux_set_advertisements,
2101 netdev_linux_get_vlan_vid,
2103 netdev_linux_set_policing,
2104 netdev_linux_get_qos_types,
2105 netdev_linux_get_qos_capabilities,
2106 netdev_linux_get_qos,
2107 netdev_linux_set_qos,
2108 netdev_linux_get_queue,
2109 netdev_linux_set_queue,
2110 netdev_linux_delete_queue,
2111 netdev_linux_get_queue_stats,
2112 netdev_linux_dump_queues,
2113 netdev_linux_dump_queue_stats,
2115 netdev_linux_get_in4,
2116 netdev_linux_set_in4,
2117 netdev_linux_get_in6,
2118 netdev_linux_add_router,
2119 netdev_linux_get_next_hop,
2120 netdev_linux_arp_lookup,
2122 netdev_linux_update_flags,
2124 netdev_linux_poll_add,
2125 netdev_linux_poll_remove,
2128 const struct netdev_class netdev_tap_class = {
2135 netdev_linux_create_tap,
2136 netdev_linux_destroy,
2137 NULL, /* reconfigure */
2142 NULL, /* enumerate */
2145 netdev_linux_recv_wait,
2147 NULL, /* dispatch */
2153 netdev_linux_send_wait,
2155 netdev_linux_set_etheraddr,
2156 netdev_linux_get_etheraddr,
2157 netdev_linux_get_mtu,
2158 netdev_linux_get_ifindex,
2159 netdev_linux_get_carrier,
2160 netdev_linux_get_stats,
2161 NULL, /* set_stats */
2163 netdev_linux_get_features,
2164 netdev_linux_set_advertisements,
2165 netdev_linux_get_vlan_vid,
2167 netdev_linux_set_policing,
2168 netdev_linux_get_qos_types,
2169 netdev_linux_get_qos_capabilities,
2170 netdev_linux_get_qos,
2171 netdev_linux_set_qos,
2172 netdev_linux_get_queue,
2173 netdev_linux_set_queue,
2174 netdev_linux_delete_queue,
2175 netdev_linux_get_queue_stats,
2176 netdev_linux_dump_queues,
2177 netdev_linux_dump_queue_stats,
2179 netdev_linux_get_in4,
2180 netdev_linux_set_in4,
2181 netdev_linux_get_in6,
2182 netdev_linux_add_router,
2183 netdev_linux_get_next_hop,
2184 netdev_linux_arp_lookup,
2186 netdev_linux_update_flags,
2188 netdev_linux_poll_add,
2189 netdev_linux_poll_remove,
2192 /* HTB traffic control class. */
2194 #define HTB_N_QUEUES 0xf000
2198 unsigned int max_rate; /* In bytes/s. */
2202 unsigned int min_rate; /* In bytes/s. */
2203 unsigned int max_rate; /* In bytes/s. */
2204 unsigned int burst; /* In bytes. */
2205 unsigned int priority; /* Lower values are higher priorities. */
2209 htb_get__(const struct netdev *netdev)
2211 struct netdev_dev_linux *netdev_dev =
2212 netdev_dev_linux_cast(netdev_get_dev(netdev));
2213 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2217 htb_install__(struct netdev *netdev, uint64_t max_rate)
2219 struct netdev_dev_linux *netdev_dev =
2220 netdev_dev_linux_cast(netdev_get_dev(netdev));
2223 htb = xmalloc(sizeof *htb);
2224 tc_init(&htb->tc, &tc_ops_htb);
2225 htb->max_rate = max_rate;
2227 netdev_dev->tc = &htb->tc;
2232 /* Create an HTB qdisc.
2234 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default
2237 htb_setup_qdisc__(struct netdev *netdev)
2240 struct tc_htb_glob opt;
2241 struct ofpbuf request;
2242 struct tcmsg *tcmsg;
2244 tc_del_qdisc(netdev);
2246 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2247 NLM_F_EXCL | NLM_F_CREATE, &request);
2248 tcmsg->tcm_handle = tc_make_handle(1, 0);
2249 tcmsg->tcm_parent = TC_H_ROOT;
2251 nl_msg_put_string(&request, TCA_KIND, "htb");
2253 memset(&opt, 0, sizeof opt);
2254 opt.rate2quantum = 10;
2258 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2259 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2260 nl_msg_end_nested(&request, opt_offset);
2262 return tc_transact(&request, NULL);
2265 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2266 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2268 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2269 unsigned int parent, struct htb_class *class)
2272 struct tc_htb_opt opt;
2273 struct ofpbuf request;
2274 struct tcmsg *tcmsg;
2278 netdev_get_mtu(netdev, &mtu);
2280 memset(&opt, 0, sizeof opt);
2281 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2282 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2283 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2284 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2285 opt.prio = class->priority;
2287 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2288 tcmsg->tcm_handle = handle;
2289 tcmsg->tcm_parent = parent;
2291 nl_msg_put_string(&request, TCA_KIND, "htb");
2292 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2293 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2294 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2295 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2296 nl_msg_end_nested(&request, opt_offset);
2298 error = tc_transact(&request, NULL);
2300 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2301 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2302 netdev_get_name(netdev),
2303 tc_get_major(handle), tc_get_minor(handle),
2304 tc_get_major(parent), tc_get_minor(parent),
2305 class->min_rate, class->max_rate,
2306 class->burst, class->priority, strerror(error));
2311 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2312 * description of them into 'class'. The description complies with the
2313 * specification given in the vswitch database documentation for linux-htb
2316 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2318 static const struct nl_policy tca_htb_policy[] = {
2319 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2320 .min_len = sizeof(struct tc_htb_opt) },
2323 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2324 const struct tc_htb_opt *htb;
2326 if (!nl_parse_nested(nl_options, tca_htb_policy,
2327 attrs, ARRAY_SIZE(tca_htb_policy))) {
2328 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2332 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2333 class->min_rate = htb->rate.rate;
2334 class->max_rate = htb->ceil.rate;
2335 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2336 class->priority = htb->prio;
2341 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2342 struct htb_class *options,
2343 struct netdev_queue_stats *stats)
2345 struct nlattr *nl_options;
2346 unsigned int handle;
2349 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2350 if (!error && queue_id) {
2351 unsigned int major = tc_get_major(handle);
2352 unsigned int minor = tc_get_minor(handle);
2353 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2354 *queue_id = minor - 1;
2359 if (!error && options) {
2360 error = htb_parse_tca_options__(nl_options, options);
2366 htb_parse_qdisc_details__(struct netdev *netdev,
2367 const struct shash *details, struct htb_class *hc)
2369 const char *max_rate_s;
2371 max_rate_s = shash_find_data(details, "max-rate");
2372 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2373 if (!hc->max_rate) {
2376 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2377 hc->max_rate = netdev_features_to_bps(current) / 8;
2379 hc->min_rate = hc->max_rate;
2385 htb_parse_class_details__(struct netdev *netdev,
2386 const struct shash *details, struct htb_class *hc)
2388 const struct htb *htb = htb_get__(netdev);
2389 const char *min_rate_s = shash_find_data(details, "min-rate");
2390 const char *max_rate_s = shash_find_data(details, "max-rate");
2391 const char *burst_s = shash_find_data(details, "burst");
2392 const char *priority_s = shash_find_data(details, "priority");
2397 /* min-rate is required. */
2400 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2401 hc->min_rate = MAX(hc->min_rate, 0);
2402 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2405 hc->max_rate = (max_rate_s
2406 ? strtoull(max_rate_s, NULL, 10) / 8
2408 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2409 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2413 * According to hints in the documentation that I've read, it is important
2414 * that 'burst' be at least as big as the largest frame that might be
2415 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2416 * but having it a bit too small is a problem. Since netdev_get_mtu()
2417 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2418 * the MTU. We actually add 64, instead of 14, as a guard against
1419 * additional headers getting tacked on somewhere that we're not aware of. */
2420 netdev_get_mtu(netdev, &mtu);
2421 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2422 hc->burst = MAX(hc->burst, mtu + 64);
2425 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2431 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2432 unsigned int parent, struct htb_class *options,
2433 struct netdev_queue_stats *stats)
2435 struct ofpbuf *reply;
2438 error = tc_query_class(netdev, handle, parent, &reply);
2440 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2441 ofpbuf_delete(reply);
2447 htb_tc_install(struct netdev *netdev, const struct shash *details)
2451 error = htb_setup_qdisc__(netdev);
2453 struct htb_class hc;
2455 htb_parse_qdisc_details__(netdev, details, &hc);
2456 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2457 tc_make_handle(1, 0), &hc);
2459 htb_install__(netdev, hc.max_rate);
2466 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2467 const struct htb_class *hc)
2469 struct htb *htb = htb_get__(netdev);
2470 struct htb_class *hcp;
2472 hcp = port_array_get(&htb->tc.queues, queue_id);
2474 hcp = xmalloc(sizeof *hcp);
2475 port_array_set(&htb->tc.queues, queue_id, hcp);
2481 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2483 struct shash details = SHASH_INITIALIZER(&details);
2485 struct nl_dump dump;
2486 struct htb_class hc;
2489 /* Get qdisc options. */
2491 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2492 htb = htb_install__(netdev, hc.max_rate);
2495 start_queue_dump(netdev, &dump);
2496 shash_init(&details);
2497 while (nl_dump_next(&dump, &msg)) {
2498 unsigned int queue_id;
2500 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2501 htb_update_queue__(netdev, queue_id, &hc);
2504 nl_dump_done(&dump);
2510 htb_tc_destroy(struct tc *tc)
2512 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2513 unsigned int queue_id;
2514 struct htb_class *hc;
2516 PORT_ARRAY_FOR_EACH (hc, &htb->tc.queues, queue_id) {
2524 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2526 const struct htb *htb = htb_get__(netdev);
2527 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2532 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2534 struct htb_class hc;
2537 htb_parse_qdisc_details__(netdev, details, &hc);
2538 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2539 tc_make_handle(1, 0), &hc);
2541 htb_get__(netdev)->max_rate = hc.max_rate;
2547 htb_class_get(const struct netdev *netdev, unsigned int queue_id,
2548 struct shash *details)
2550 const struct htb *htb = htb_get__(netdev);
2551 const struct htb_class *hc;
2553 hc = port_array_get(&htb->tc.queues, queue_id);
2556 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2557 if (hc->min_rate != hc->max_rate) {
2558 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2560 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2562 shash_add(details, "priority", xasprintf("%u", hc->priority));
2568 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2569 const struct shash *details)
2571 struct htb_class hc;
2574 error = htb_parse_class_details__(netdev, details, &hc);
2579 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2580 tc_make_handle(1, 0xfffe), &hc);
2585 htb_update_queue__(netdev, queue_id, &hc);
2590 htb_class_delete(struct netdev *netdev, unsigned int queue_id)
2592 struct htb *htb = htb_get__(netdev);
2593 struct htb_class *hc;
2596 hc = port_array_get(&htb->tc.queues, queue_id);
2599 error = tc_delete_class(netdev, tc_make_handle(1, queue_id + 1));
2602 port_array_delete(&htb->tc.queues, queue_id);
2608 htb_class_get_stats(const struct netdev *netdev, unsigned int queue_id,
2609 struct netdev_queue_stats *stats)
2611 return htb_query_class__(netdev, tc_make_handle(1, queue_id + 1),
2612 tc_make_handle(1, 0xfffe), NULL, stats);
2616 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2617 const struct ofpbuf *nlmsg,
2618 netdev_dump_queue_stats_cb *cb, void *aux)
2620 struct netdev_queue_stats stats;
2621 unsigned int handle, major, minor;
2624 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2629 major = tc_get_major(handle);
2630 minor = tc_get_minor(handle);
2631 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2632 (*cb)(tc_get_minor(handle), &stats, aux);
2637 static const struct tc_ops tc_ops_htb = {
2638 "htb", /* linux_name */
2639 "linux-htb", /* ovs_name */
2640 HTB_N_QUEUES, /* n_queues */
2649 htb_class_get_stats,
2650 htb_class_dump_stats
2653 /* "linux-default" traffic control class.
2655 * This class represents the default, unnamed Linux qdisc. It corresponds to
2656 * the "" (empty string) QoS type in the OVS database. */
2659 default_install__(struct netdev *netdev)
2661 struct netdev_dev_linux *netdev_dev =
2662 netdev_dev_linux_cast(netdev_get_dev(netdev));
2663 static struct tc *tc;
2666 tc = xmalloc(sizeof *tc);
2667 tc_init(tc, &tc_ops_default);
2669 netdev_dev->tc = tc;
2673 default_tc_install(struct netdev *netdev,
2674 const struct shash *details OVS_UNUSED)
2676 default_install__(netdev);
2681 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2683 default_install__(netdev);
2687 static const struct tc_ops tc_ops_default = {
2688 NULL, /* linux_name */
2693 NULL, /* tc_destroy */
2694 NULL, /* qdisc_get */
2695 NULL, /* qdisc_set */
2696 NULL, /* class_get */
2697 NULL, /* class_set */
2698 NULL, /* class_delete */
2699 NULL, /* class_get_stats */
2700 NULL /* class_dump_stats */
2703 /* "linux-other" traffic control class.
2708 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2710 struct netdev_dev_linux *netdev_dev =
2711 netdev_dev_linux_cast(netdev_get_dev(netdev));
2712 static struct tc *tc;
2715 tc = xmalloc(sizeof *tc);
2716 tc_init(tc, &tc_ops_other);
2718 netdev_dev->tc = tc;
2722 static const struct tc_ops tc_ops_other = {
2723 NULL, /* linux_name */
2724 "linux-other", /* ovs_name */
2726 NULL, /* tc_install */
2728 NULL, /* tc_destroy */
2729 NULL, /* qdisc_get */
2730 NULL, /* qdisc_set */
2731 NULL, /* class_get */
2732 NULL, /* class_set */
2733 NULL, /* class_delete */
2734 NULL, /* class_get_stats */
2735 NULL /* class_dump_stats */
2738 /* Traffic control. */
2740 /* Number of kernel "tc" ticks per second. */
2741 static double ticks_per_s;
2743 /* Number of kernel "jiffies" per second. This is used for the purpose of
2744 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
2745 * one jiffy's worth of data.
2747 * There are two possibilities here:
2749 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
2750 * approximate range of 100 to 1024. That means that we really need to
2751 * make sure that the qdisc can buffer that much data.
2753 * - 'buffer_hz' is an absurdly large number. That means that the kernel
2754 * has finely granular timers and there's no need to fudge additional room
2755 * for buffers. (There's no extra effort needed to implement that: the
2756 * large 'buffer_hz' is used as a divisor, so practically any number will
2757 * come out as 0 in the division. Small integer results in the case of
2758 * really high dividends won't have any real effect anyhow.)
2760 static unsigned int buffer_hz;
2762 /* Returns tc handle 'major':'minor'. */
2764 tc_make_handle(unsigned int major, unsigned int minor)
2766 return TC_H_MAKE(major << 16, minor);
2769 /* Returns the major number from 'handle'. */
2771 tc_get_major(unsigned int handle)
2773 return TC_H_MAJ(handle) >> 16;
2776 /* Returns the minor number from 'handle'. */
2778 tc_get_minor(unsigned int handle)
2780 return TC_H_MIN(handle);
2783 static struct tcmsg *
2784 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
2785 struct ofpbuf *request)
2787 struct tcmsg *tcmsg;
2791 error = get_ifindex(netdev, &ifindex);
2796 ofpbuf_init(request, 512);
2797 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
2798 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
2799 tcmsg->tcm_family = AF_UNSPEC;
2800 tcmsg->tcm_ifindex = ifindex;
2801 /* Caller should fill in tcmsg->tcm_handle. */
2802 /* Caller should fill in tcmsg->tcm_parent. */
2808 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
2810 int error = nl_sock_transact(rtnl_sock, request, replyp);
2811 ofpbuf_uninit(request);
2818 /* The values in psched are not individually very meaningful, but they are
2819 * important. The tables below show some values seen in the wild.
2823 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
2824 * (Before that, there are hints that it was 1000000000.)
2826 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
2830 * -----------------------------------
2831 * [1] 000c8000 000f4240 000f4240 00000064
2832 * [2] 000003e8 00000400 000f4240 3b9aca00
2833 * [3] 000003e8 00000400 000f4240 3b9aca00
2834 * [4] 000003e8 00000400 000f4240 00000064
2835 * [5] 000003e8 00000040 000f4240 3b9aca00
2836 * [6] 000003e8 00000040 000f4240 000000f9
2838 * a b c d ticks_per_s buffer_hz
2839 * ------- --------- ---------- ------------- ----------- -------------
2840 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
2841 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
2842 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
2843 * [4] 1,000 1,024 1,000,000 100 976,562 100
2844 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
2845 * [6] 1,000 64 1,000,000 249 15,625,000 249
2847 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
2848 * [2] 2.6.26-1-686-bigmem from Debian lenny
2849 * [3] 2.6.26-2-sparc64 from Debian lenny
2850 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
2851 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
2852 * [6] 2.6.34 from kernel.org on KVM
2854 static const char fn[] = "/proc/net/psched";
2855 unsigned int a, b, c, d;
2861 stream = fopen(fn, "r");
2863 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
2867 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
2868 VLOG_WARN("%s: read failed", fn);
2872 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
2876 VLOG_WARN("%s: invalid scheduler parameters", fn);
2880 ticks_per_s = (double) a * c / b;
2884 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
2887 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
2890 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
2891 * rate of 'rate' bytes per second. */
2893 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
2898 return (rate * ticks) / ticks_per_s;
2901 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
2902 * rate of 'rate' bytes per second. */
2904 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
2909 return ((unsigned long long int) ticks_per_s * size) / rate;
2912 /* Returns the number of bytes that need to be reserved for qdisc buffering at
2913 * a transmission rate of 'rate' bytes per second. */
2915 tc_buffer_per_jiffy(unsigned int rate)
2920 return rate / buffer_hz;
2923 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
2924 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
2925 * extracts 'msg''s TCA_OPTIONS attribute into '*options' if it is present or
2926 * stores NULL into it if it is absent.
2928 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
2931 * Returns 0 if successful, otherwise a positive errno value. */
2933 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
2934 struct nlattr **options)
2936 static const struct nl_policy tca_policy[] = {
2937 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
2938 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
2940 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
2942 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
2943 tca_policy, ta, ARRAY_SIZE(ta))) {
2944 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
2949 *kind = nl_attr_get_string(ta[TCA_KIND]);
2953 *options = ta[TCA_OPTIONS];
2968 /* Given Netlink 'msg' that describes a class, extracts its class ID (the
2969 * full tc handle, major and minor) into '*handlep', its TCA_OPTIONS attribute
2970 * into '*options', and its queue statistics into '*stats'. Any of the output
2971 * arguments may be null.
2973 * Returns 0 if successful, otherwise a positive errno value. */
2975 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
2976 struct nlattr **options, struct netdev_queue_stats *stats)
2978 static const struct nl_policy tca_policy[] = {
2979 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
2980 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
2982 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
2984 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
2985 tca_policy, ta, ARRAY_SIZE(ta))) {
2986 VLOG_WARN_RL(&rl, "failed to parse class message");
2991 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
2992 *handlep = tc->tcm_handle;
2996 *options = ta[TCA_OPTIONS];
3000 const struct gnet_stats_queue *gsq;
3001 struct gnet_stats_basic gsb;
3003 static const struct nl_policy stats_policy[] = {
3004 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3005 .min_len = sizeof gsb },
3006 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3007 .min_len = sizeof *gsq },
3009 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3011 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3012 sa, ARRAY_SIZE(sa))) {
3013 VLOG_WARN_RL(&rl, "failed to parse class stats");
3017 /* Alignment issues screw up the length of struct gnet_stats_basic on
3018 * some arch/bitsize combinations. Newer versions of Linux have a
3019 * struct gnet_stats_basic_packed, but we can't depend on that. The
3020 * easiest thing to do is just to make a copy. */
3021 memset(&gsb, 0, sizeof gsb);
3022 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3023 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3024 stats->tx_bytes = gsb.bytes;
3025 stats->tx_packets = gsb.packets;
3027 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3028 stats->tx_errors = gsq->drops;
3038 memset(stats, 0, sizeof *stats);
3043 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3046 tc_query_class(const struct netdev *netdev,
3047 unsigned int handle, unsigned int parent,
3048 struct ofpbuf **replyp)
3050 struct ofpbuf request;
3051 struct tcmsg *tcmsg;
3054 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3055 tcmsg->tcm_handle = handle;
3056 tcmsg->tcm_parent = parent;
3058 error = tc_transact(&request, replyp);
3060 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3061 netdev_get_name(netdev),
3062 tc_get_major(handle), tc_get_minor(handle),
3063 tc_get_major(parent), tc_get_minor(parent),
3069 /* Equivalent to "tc class del dev <name> classid <handle>". */
3071 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3073 struct ofpbuf request;
3074 struct tcmsg *tcmsg;
3077 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3078 tcmsg->tcm_handle = handle;
3079 tcmsg->tcm_parent = 0;
3081 error = tc_transact(&request, NULL);
3083 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3084 netdev_get_name(netdev),
3085 tc_get_major(handle), tc_get_minor(handle),
3091 /* Equivalent to "tc qdisc del dev <name> root". */
3093 tc_del_qdisc(struct netdev *netdev)
3095 struct netdev_dev_linux *netdev_dev =
3096 netdev_dev_linux_cast(netdev_get_dev(netdev));
3097 struct ofpbuf request;
3098 struct tcmsg *tcmsg;
3101 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3102 tcmsg->tcm_handle = tc_make_handle(1, 0);
3103 tcmsg->tcm_parent = TC_H_ROOT;
3105 error = tc_transact(&request, NULL);
3106 if (error == EINVAL) {
3107 /* EINVAL probably means that the default qdisc was in use, in which
3108 * case we've accomplished our purpose. */
3111 if (!error && netdev_dev->tc) {
3112 if (netdev_dev->tc->ops->tc_destroy) {
3113 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3115 netdev_dev->tc = NULL;
3120 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3121 * kernel to determine what they are. Returns 0 if successful, otherwise a
3122 * positive errno value. */
3124 tc_query_qdisc(const struct netdev *netdev)
3126 struct netdev_dev_linux *netdev_dev =
3127 netdev_dev_linux_cast(netdev_get_dev(netdev));
3128 struct ofpbuf request, *qdisc;
3129 const struct tc_ops *ops;
3130 struct tcmsg *tcmsg;
3134 if (netdev_dev->tc) {
3138 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3139 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3140 * 2.6.35 without that fix backported to it.
3142 * To avoid the OOPS, we must not make a request that would attempt to dump
3143 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3144 * few others. There are a few ways that I can see to do this, but most of
3145 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3146 * technique chosen here is to assume that any non-default qdisc that we
3147 * create will have a class with handle 1:0. The built-in qdiscs only have
3148 * a class with handle 0:0.
3150 * We could check for Linux 2.6.35+ and use a more straightforward method
3152 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3153 tcmsg->tcm_handle = tc_make_handle(1, 0);
3154 tcmsg->tcm_parent = 0;
3156 /* Figure out what tc class to instantiate. */
3157 error = tc_transact(&request, &qdisc);
3161 error = tc_parse_qdisc(qdisc, &kind, NULL);
3163 ops = &tc_ops_other;
3165 ops = tc_lookup_linux_name(kind);
3167 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3168 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3170 ops = &tc_ops_other;
3173 } else if (error == ENOENT) {
3174 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3175 * other entity that doesn't have a handle 1:0. We will assume
3176 * that it's the system default qdisc. */
3177 ops = &tc_ops_default;
3180 /* Who knows? Maybe the device got deleted. */
3181 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3182 netdev_get_name(netdev), strerror(error));
3183 ops = &tc_ops_other;
3186 /* Instantiate it. */
3187 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3188 assert((load_error == 0) == (netdev_dev->tc != NULL));
3189 ofpbuf_delete(qdisc);
3191 return error ? error : load_error;
3194 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3195 approximate the time to transmit packets of various lengths. For an MTU of
3196 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3197 represents two possible packet lengths; for an MTU of 513 through 1024, four
3198 possible lengths; and so on.
3200 Returns, for the specified 'mtu', the number of bits that packet lengths
3201 need to be shifted right to fit within such a 256-entry table. */
3203 tc_calc_cell_log(unsigned int mtu)
3208 mtu = ETH_PAYLOAD_MAX;
3210 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3212 for (cell_log = 0; mtu >= 256; cell_log++) {
3219 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3222 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3224 memset(rate, 0, sizeof *rate);
3225 rate->cell_log = tc_calc_cell_log(mtu);
3226 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3227 /* rate->cell_align = 0; */ /* distro headers. */
3228 rate->mpu = ETH_TOTAL_MIN;
3232 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3233 * attribute of the specified "type".
3235 * See tc_calc_cell_log() above for a description of "rtab"s. */
3237 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3242 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3243 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3244 unsigned packet_size = (i + 1) << rate->cell_log;
3245 if (packet_size < rate->mpu) {
3246 packet_size = rate->mpu;
3248 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3252 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3253 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3254 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
3259 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
3261 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
3262 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3266 /* Utility functions. */
3269 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3271 /* Policy for RTNLGRP_LINK messages.
3273 * There are *many* more fields in these messages, but currently we only
3274 * care about these fields. */
3275 static const struct nl_policy rtnlgrp_link_policy[] = {
3276 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3277 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3278 .min_len = sizeof(struct rtnl_link_stats) },
3281 struct ofpbuf request;
3282 struct ofpbuf *reply;
3283 struct ifinfomsg *ifi;
3284 const struct rtnl_link_stats *rtnl_stats;
3285 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
3288 ofpbuf_init(&request, 0);
3289 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3290 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3291 ifi->ifi_family = PF_UNSPEC;
3292 ifi->ifi_index = ifindex;
3293 error = nl_sock_transact(rtnl_sock, &request, &reply);
3294 ofpbuf_uninit(&request);
3299 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3300 rtnlgrp_link_policy,
3301 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3302 ofpbuf_delete(reply);
3306 if (!attrs[IFLA_STATS]) {
3307 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3308 ofpbuf_delete(reply);
3312 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3313 stats->rx_packets = rtnl_stats->rx_packets;
3314 stats->tx_packets = rtnl_stats->tx_packets;
3315 stats->rx_bytes = rtnl_stats->rx_bytes;
3316 stats->tx_bytes = rtnl_stats->tx_bytes;
3317 stats->rx_errors = rtnl_stats->rx_errors;
3318 stats->tx_errors = rtnl_stats->tx_errors;
3319 stats->rx_dropped = rtnl_stats->rx_dropped;
3320 stats->tx_dropped = rtnl_stats->tx_dropped;
3321 stats->multicast = rtnl_stats->multicast;
3322 stats->collisions = rtnl_stats->collisions;
3323 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3324 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3325 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3326 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3327 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3328 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3329 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3330 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3331 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3332 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3333 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3335 ofpbuf_delete(reply);
3341 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3343 static const char fn[] = "/proc/net/dev";
3348 stream = fopen(fn, "r");
3350 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3355 while (fgets(line, sizeof line, stream)) {
3358 #define X64 "%"SCNu64
3361 X64 X64 X64 X64 X64 X64 X64 "%*u"
3362 X64 X64 X64 X64 X64 X64 X64 "%*u",
3368 &stats->rx_fifo_errors,
3369 &stats->rx_frame_errors,
3375 &stats->tx_fifo_errors,
3377 &stats->tx_carrier_errors) != 15) {
3378 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
3379 } else if (!strcmp(devname, netdev_name)) {
3380 stats->rx_length_errors = UINT64_MAX;
3381 stats->rx_over_errors = UINT64_MAX;
3382 stats->rx_crc_errors = UINT64_MAX;
3383 stats->rx_missed_errors = UINT64_MAX;
3384 stats->tx_aborted_errors = UINT64_MAX;
3385 stats->tx_heartbeat_errors = UINT64_MAX;
3386 stats->tx_window_errors = UINT64_MAX;
3392 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
3398 get_flags(const struct netdev *netdev, int *flags)
3403 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
3405 *flags = ifr.ifr_flags;
3410 set_flags(struct netdev *netdev, int flags)
3414 ifr.ifr_flags = flags;
3415 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
3420 do_get_ifindex(const char *netdev_name)
3424 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3425 COVERAGE_INC(netdev_get_ifindex);
3426 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
3427 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
3428 netdev_name, strerror(errno));
3431 return ifr.ifr_ifindex;
3435 get_ifindex(const struct netdev *netdev_, int *ifindexp)
3437 struct netdev_dev_linux *netdev_dev =
3438 netdev_dev_linux_cast(netdev_get_dev(netdev_));
3440 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
3441 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
3445 netdev_dev->cache_valid |= VALID_IFINDEX;
3446 netdev_dev->ifindex = ifindex;
3448 *ifindexp = netdev_dev->ifindex;
3453 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
3458 memset(&ifr, 0, sizeof ifr);
3459 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3460 COVERAGE_INC(netdev_get_hwaddr);
3461 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
3462 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
3463 netdev_name, strerror(errno));
3466 hwaddr_family = ifr.ifr_hwaddr.sa_family;
3467 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
3468 VLOG_WARN("%s device has unknown hardware address family %d",
3469 netdev_name, hwaddr_family);
3471 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
3476 set_etheraddr(const char *netdev_name, int hwaddr_family,
3477 const uint8_t mac[ETH_ADDR_LEN])
3481 memset(&ifr, 0, sizeof ifr);
3482 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3483 ifr.ifr_hwaddr.sa_family = hwaddr_family;
3484 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
3485 COVERAGE_INC(netdev_set_hwaddr);
3486 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
3487 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
3488 netdev_name, strerror(errno));
3495 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
3496 int cmd, const char *cmd_name)
3500 memset(&ifr, 0, sizeof ifr);
3501 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
3502 ifr.ifr_data = (caddr_t) ecmd;
3505 COVERAGE_INC(netdev_ethtool);
3506 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
3509 if (errno != EOPNOTSUPP) {
3510 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
3511 "failed: %s", cmd_name, name, strerror(errno));
3513 /* The device doesn't support this operation. That's pretty
3514 * common, so there's no point in logging anything. */
3521 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
3522 const char *cmd_name)
3524 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
3525 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
3526 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
3534 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
3535 int cmd, const char *cmd_name)
3540 ifr.ifr_addr.sa_family = AF_INET;
3541 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
3543 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
3544 *ip = sin->sin_addr;