2 * Copyright (c) 2009, 2010 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/pkt_sched.h>
29 #include <linux/rtnetlink.h>
30 #include <linux/sockios.h>
31 #include <linux/version.h>
32 #include <sys/types.h>
33 #include <sys/ioctl.h>
34 #include <sys/socket.h>
35 #include <netpacket/packet.h>
36 #include <net/ethernet.h>
38 #include <linux/if_tunnel.h>
39 #include <net/if_arp.h>
40 #include <net/if_packet.h>
41 #include <net/route.h>
42 #include <netinet/in.h>
49 #include "dynamic-string.h"
50 #include "fatal-signal.h"
51 #include "netdev-provider.h"
52 #include "netdev-vport.h"
55 #include "openflow/openflow.h"
57 #include "poll-loop.h"
58 #include "port-array.h"
59 #include "rtnetlink.h"
60 #include "socket-util.h"
65 VLOG_DEFINE_THIS_MODULE(netdev_linux)
67 /* These were introduced in Linux 2.6.14, so they might be missing if we have
69 #ifndef ADVERTISED_Pause
70 #define ADVERTISED_Pause (1 << 13)
72 #ifndef ADVERTISED_Asym_Pause
73 #define ADVERTISED_Asym_Pause (1 << 14)
76 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
79 #define TC_RTAB_SIZE 1024
82 static struct rtnetlink_notifier netdev_linux_cache_notifier;
83 static int cache_notifier_refcount;
86 VALID_IFINDEX = 1 << 0,
87 VALID_ETHERADDR = 1 << 1,
91 VALID_CARRIER = 1 << 5,
92 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
93 VALID_POLICING = 1 << 7,
94 VALID_HAVE_VPORT_STATS = 1 << 8
102 /* Traffic control. */
104 /* An instance of a traffic control class. Always associated with a particular
107 const struct tc_ops *ops;
109 /* Maps from queue ID to tc-specific data.
111 * The generic netdev TC layer uses this to the following extent: if an
112 * entry is nonnull, then the queue whose ID is the index is assumed to
113 * exist; if an entry is null, then that queue is assumed not to exist.
114 * Implementations must adhere to this scheme, although they may store
115 * whatever they like as data.
117 struct port_array queues;
120 /* A particular kind of traffic control. Each implementation generally maps to
121 * one particular Linux qdisc class.
123 * The functions below return 0 if successful or a positive errno value on
124 * failure, except where otherwise noted. All of them must be provided, except
125 * where otherwise noted. */
127 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
128 * This is null for tc_ops_default and tc_ops_other, for which there are no
129 * appropriate values. */
130 const char *linux_name;
132 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
133 const char *ovs_name;
135 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
136 * queues. The queues are numbered 0 through n_queues - 1. */
137 unsigned int n_queues;
139 /* Called to install this TC class on 'netdev'. The implementation should
140 * make the Netlink calls required to set up 'netdev' with the right qdisc
141 * and configure it according to 'details'. The implementation may assume
142 * that the current qdisc is the default; that is, there is no need for it
143 * to delete the current qdisc before installing itself.
145 * The contents of 'details' should be documented as valid for 'ovs_name'
146 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
147 * (which is built as ovs-vswitchd.conf.db(8)).
149 * This function must return 0 if and only if it sets 'netdev->tc' to an
150 * initialized 'struct tc'.
152 * (This function is null for tc_ops_other, which cannot be installed. For
153 * other TC classes it should always be nonnull.) */
154 int (*tc_install)(struct netdev *netdev, const struct shash *details);
156 /* Called when the netdev code determines (through a Netlink query) that
157 * this TC class's qdisc is installed on 'netdev', but we didn't install
158 * it ourselves and so don't know any of the details.
160 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
161 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
162 * implementation should parse the other attributes of 'nlmsg' as
163 * necessary to determine its configuration. If necessary it should also
164 * use Netlink queries to determine the configuration of queues on
167 * This function must return 0 if and only if it sets 'netdev->tc' to an
168 * initialized 'struct tc'. */
169 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
171 /* Destroys the data structures allocated by the implementation as part of
172 * 'tc'. (This includes destroying 'tc->queues' by calling
175 * The implementation should not need to perform any Netlink calls. If
176 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
177 * (But it may not be desirable.)
179 * This function may be null if 'tc' is trivial. */
180 void (*tc_destroy)(struct tc *tc);
182 /* Retrieves details of 'netdev->tc' configuration into 'details'.
184 * The implementation should not need to perform any Netlink calls, because
185 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
186 * cached the configuration.
188 * The contents of 'details' should be documented as valid for 'ovs_name'
189 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
190 * (which is built as ovs-vswitchd.conf.db(8)).
192 * This function may be null if 'tc' is not configurable.
194 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
196 /* Reconfigures 'netdev->tc' according to 'details', performing any
197 * required Netlink calls to complete the reconfiguration.
199 * The contents of 'details' should be documented as valid for 'ovs_name'
200 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
201 * (which is built as ovs-vswitchd.conf.db(8)).
203 * This function may be null if 'tc' is not configurable.
205 int (*qdisc_set)(struct netdev *, const struct shash *details);
207 /* Retrieves details of 'queue_id' on 'netdev->tc' into 'details'. The
208 * caller ensures that 'queues' has a nonnull value for index 'queue_id'.
210 * The contents of 'details' should be documented as valid for 'ovs_name'
211 * in the "other_config" column in the "Queue" table in
212 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the queue configuration.
218 * This function may be null if 'tc' does not have queues ('n_queues' is
220 int (*class_get)(const struct netdev *netdev, unsigned int queue_id,
221 struct shash *details);
223 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
224 * 'details', performing any required Netlink calls to complete the
225 * reconfiguration. The caller ensures that 'queue_id' is less than
228 * The contents of 'details' should be documented as valid for 'ovs_name'
229 * in the "other_config" column in the "Queue" table in
230 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
232 * This function may be null if 'tc' does not have queues or its queues are
233 * not configurable. */
234 int (*class_set)(struct netdev *, unsigned int queue_id,
235 const struct shash *details);
237 /* Deletes 'queue_id' from 'netdev->tc'. The caller ensures that 'queues'
238 * has a nonnull value for index 'queue_id'.
240 * This function may be null if 'tc' does not have queues or its queues
241 * cannot be deleted. */
242 int (*class_delete)(struct netdev *, unsigned int queue_id);
244 /* Obtains stats for 'queue' from 'netdev->tc'. The caller ensures that
245 * 'queues' has a nonnull value for index 'queue_id'.
247 * On success, initializes '*stats'.
249 * This function may be null if 'tc' does not have queues or if it cannot
250 * report queue statistics. */
251 int (*class_get_stats)(const struct netdev *netdev, unsigned int queue_id,
252 struct netdev_queue_stats *stats);
254 /* Extracts queue stats from 'nlmsg', which is a response to a
255 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
257 * This function may be null if 'tc' does not have queues or if it cannot
258 * report queue statistics. */
259 int (*class_dump_stats)(const struct netdev *netdev,
260 const struct ofpbuf *nlmsg,
261 netdev_dump_queue_stats_cb *cb, void *aux);
265 tc_init(struct tc *tc, const struct tc_ops *ops)
268 port_array_init(&tc->queues);
272 tc_destroy(struct tc *tc)
274 port_array_destroy(&tc->queues);
277 static const struct tc_ops tc_ops_htb;
278 static const struct tc_ops tc_ops_default;
279 static const struct tc_ops tc_ops_other;
281 static const struct tc_ops *tcs[] = {
282 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
283 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
284 &tc_ops_other, /* Some other qdisc. */
288 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
289 static unsigned int tc_get_major(unsigned int handle);
290 static unsigned int tc_get_minor(unsigned int handle);
292 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
293 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
294 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
296 static struct tcmsg *tc_make_request(const struct netdev *, int type,
297 unsigned int flags, struct ofpbuf *);
298 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
300 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
301 struct nlattr **options);
302 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
303 struct nlattr **options,
304 struct netdev_queue_stats *);
305 static int tc_query_class(const struct netdev *,
306 unsigned int handle, unsigned int parent,
307 struct ofpbuf **replyp);
308 static int tc_delete_class(const struct netdev *, unsigned int handle);
310 static int tc_del_qdisc(struct netdev *netdev);
311 static int tc_query_qdisc(const struct netdev *netdev);
313 static int tc_calc_cell_log(unsigned int mtu);
314 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
315 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
316 const struct tc_ratespec *rate);
317 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
319 struct netdev_dev_linux {
320 struct netdev_dev netdev_dev;
322 struct shash_node *shash_node;
323 unsigned int cache_valid;
325 /* The following are figured out "on demand" only. They are only valid
326 * when the corresponding VALID_* bit in 'cache_valid' is set. */
328 uint8_t etheraddr[ETH_ADDR_LEN];
329 struct in_addr address, netmask;
333 bool is_internal; /* Is this an openvswitch internal device? */
334 bool is_tap; /* Is this a tuntap device? */
335 uint32_t kbits_rate; /* Policing data. */
336 uint32_t kbits_burst;
337 bool have_vport_stats;
341 struct tap_state tap;
345 struct netdev_linux {
346 struct netdev netdev;
350 /* An AF_INET socket (used for ioctl operations). */
351 static int af_inet_sock = -1;
353 /* A Netlink routing socket that is not subscribed to any multicast groups. */
354 static struct nl_sock *rtnl_sock;
356 struct netdev_linux_notifier {
357 struct netdev_notifier notifier;
361 static struct shash netdev_linux_notifiers =
362 SHASH_INITIALIZER(&netdev_linux_notifiers);
363 static struct rtnetlink_notifier netdev_linux_poll_notifier;
365 /* This is set pretty low because we probably won't learn anything from the
366 * additional log messages. */
367 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
369 static int netdev_linux_init(void);
371 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
372 int cmd, const char *cmd_name);
373 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
374 const char *cmd_name);
375 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
376 int cmd, const char *cmd_name);
377 static int get_flags(const struct netdev *, int *flagsp);
378 static int set_flags(struct netdev *, int flags);
379 static int do_get_ifindex(const char *netdev_name);
380 static int get_ifindex(const struct netdev *, int *ifindexp);
381 static int do_set_addr(struct netdev *netdev,
382 int ioctl_nr, const char *ioctl_name,
383 struct in_addr addr);
384 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
385 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
386 const uint8_t[ETH_ADDR_LEN]);
387 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
388 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
391 is_netdev_linux_class(const struct netdev_class *netdev_class)
393 return netdev_class->init == netdev_linux_init;
396 static struct netdev_dev_linux *
397 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
399 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
400 assert(is_netdev_linux_class(netdev_class));
402 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
405 static struct netdev_linux *
406 netdev_linux_cast(const struct netdev *netdev)
408 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
409 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
410 assert(is_netdev_linux_class(netdev_class));
412 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
416 netdev_linux_init(void)
418 static int status = -1;
420 /* Create AF_INET socket. */
421 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
422 status = af_inet_sock >= 0 ? 0 : errno;
424 VLOG_ERR("failed to create inet socket: %s", strerror(status));
427 /* Create rtnetlink socket. */
429 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
431 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic work for this netdev provider: lets the rtnetlink notifier
 * process whatever it has pending. */
static void
netdev_linux_run(void)
{
    rtnetlink_notifier_run();
}
/* Counterpart of netdev_linux_run(): asks the rtnetlink notifier to arrange
 * its own wakeup. */
static void
netdev_linux_wait(void)
{
    rtnetlink_notifier_wait();
}
452 netdev_linux_cache_cb(const struct rtnetlink_change *change,
453 void *aux OVS_UNUSED)
455 struct netdev_dev_linux *dev;
457 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
459 const struct netdev_class *netdev_class =
460 netdev_dev_get_class(base_dev);
462 if (is_netdev_linux_class(netdev_class)) {
463 dev = netdev_dev_linux_cast(base_dev);
464 dev->cache_valid = 0;
468 struct shash device_shash;
469 struct shash_node *node;
471 shash_init(&device_shash);
472 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
473 SHASH_FOR_EACH (node, &device_shash) {
475 dev->cache_valid = 0;
477 shash_destroy(&device_shash);
481 /* Creates the netdev device of 'type' with 'name'. */
483 netdev_linux_create_system(const char *name, const char *type OVS_UNUSED,
484 const struct shash *args, struct netdev_dev **netdev_devp)
486 struct netdev_dev_linux *netdev_dev;
489 if (!shash_is_empty(args)) {
490 VLOG_WARN("%s: arguments for system devices should be empty", name);
493 if (!cache_notifier_refcount) {
494 error = rtnetlink_notifier_register(&netdev_linux_cache_notifier,
495 netdev_linux_cache_cb, NULL);
500 cache_notifier_refcount++;
502 netdev_dev = xzalloc(sizeof *netdev_dev);
503 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_linux_class);
505 *netdev_devp = &netdev_dev->netdev_dev;
509 /* For most types of netdevs we open the device for each call of
510 * netdev_open(). However, this is not the case with tap devices,
511 * since it is only possible to open the device once. In this
512 * situation we share a single file descriptor, and consequently
513 * buffers, across all readers. Therefore once data is read it will
514 * be unavailable to other reads for tap devices. */
516 netdev_linux_create_tap(const char *name, const char *type OVS_UNUSED,
517 const struct shash *args, struct netdev_dev **netdev_devp)
519 struct netdev_dev_linux *netdev_dev;
520 struct tap_state *state;
521 static const char tap_dev[] = "/dev/net/tun";
525 if (!shash_is_empty(args)) {
526 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
529 netdev_dev = xzalloc(sizeof *netdev_dev);
530 state = &netdev_dev->state.tap;
532 /* Open tap device. */
533 state->fd = open(tap_dev, O_RDWR);
536 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
540 /* Create tap device. */
541 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
542 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
543 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
544 VLOG_WARN("%s: creating tap device failed: %s", name,
550 /* Make non-blocking. */
551 error = set_nonblocking(state->fd);
556 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
557 *netdev_devp = &netdev_dev->netdev_dev;
566 destroy_tap(struct netdev_dev_linux *netdev_dev)
568 struct tap_state *state = &netdev_dev->state.tap;
570 if (state->fd >= 0) {
575 /* Destroys the netdev device 'netdev_dev_'. */
577 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
579 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
580 const char *type = netdev_dev_get_type(netdev_dev_);
582 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
583 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
586 if (!strcmp(type, "system")) {
587 cache_notifier_refcount--;
589 if (!cache_notifier_refcount) {
590 rtnetlink_notifier_unregister(&netdev_linux_cache_notifier);
592 } else if (!strcmp(type, "tap")) {
593 destroy_tap(netdev_dev);
600 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
601 struct netdev **netdevp)
603 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
604 struct netdev_linux *netdev;
605 enum netdev_flags flags;
608 /* Allocate network device. */
609 netdev = xzalloc(sizeof *netdev);
611 netdev_init(&netdev->netdev, netdev_dev_);
613 error = netdev_get_flags(&netdev->netdev, &flags);
614 if (error == ENODEV) {
618 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
619 !netdev_dev->state.tap.opened) {
621 /* We assume that the first user of the tap device is the primary user
622 * and give them the tap FD. Subsequent users probably just expect
623 * this to be a system device so open it normally to avoid send/receive
624 * directions appearing to be reversed. */
625 netdev->fd = netdev_dev->state.tap.fd;
626 netdev_dev->state.tap.opened = true;
627 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
628 struct sockaddr_ll sll;
632 /* Create file descriptor. */
633 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
634 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
636 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
637 if (netdev->fd < 0) {
642 /* Set non-blocking mode. */
643 error = set_nonblocking(netdev->fd);
648 /* Get ethernet device index. */
649 error = get_ifindex(&netdev->netdev, &ifindex);
654 /* Bind to specific ethernet device. */
655 memset(&sll, 0, sizeof sll);
656 sll.sll_family = AF_PACKET;
657 sll.sll_ifindex = ifindex;
659 (struct sockaddr *) &sll, sizeof sll) < 0) {
661 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
666 /* Between the socket() and bind() calls above, the socket receives all
667 * packets of the requested type on all system interfaces. We do not
668 * want to receive that data, but there is no way to avoid it. So we
669 * must now drain out the receive queue. */
670 error = drain_rcvbuf(netdev->fd);
676 *netdevp = &netdev->netdev;
680 netdev_uninit(&netdev->netdev, true);
684 /* Closes and destroys 'netdev'. */
686 netdev_linux_close(struct netdev *netdev_)
688 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
690 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
/* Adds to 'svec' the names of all network devices reported by
 * if_nameindex().  Returns 0 if successful, otherwise a positive errno
 * value. */
static int
netdev_linux_enumerate(struct svec *svec)
{
    struct if_nameindex *names = if_nameindex();
    size_t i;

    if (!names) {
        /* Capture errno before logging so that the value handed back to the
         * caller is the one if_nameindex() set, even if logging changes it. */
        int error = errno;

        VLOG_WARN("could not obtain list of network device names: %s",
                  strerror(error));
        return error;
    }

    /* The list is terminated by an entry whose 'if_name' is null. */
    for (i = 0; names[i].if_name != NULL; i++) {
        svec_add(svec, names[i].if_name);
    }
    if_freenameindex(names);
    return 0;
}
719 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
721 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
723 if (netdev->fd < 0) {
724 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
729 ssize_t retval = read(netdev->fd, data, size);
732 } else if (errno != EINTR) {
733 if (errno != EAGAIN) {
734 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
735 strerror(errno), netdev_get_name(netdev_));
742 /* Registers with the poll loop to wake up from the next call to poll_block()
743 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
745 netdev_linux_recv_wait(struct netdev *netdev_)
747 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
748 if (netdev->fd >= 0) {
749 poll_fd_wait(netdev->fd, POLLIN);
753 /* Discards all packets waiting to be received from 'netdev'. */
755 netdev_linux_drain(struct netdev *netdev_)
757 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
758 if (netdev->fd < 0) {
760 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
762 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
763 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
767 drain_fd(netdev->fd, ifr.ifr_qlen);
770 return drain_rcvbuf(netdev->fd);
774 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
775 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
776 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
777 * the packet is too big or too small to transmit on the device.
779 * The caller retains ownership of 'buffer' in all cases.
781 * The kernel maintains a packet transmission queue, so the caller is not
782 * expected to do additional queuing of packets. */
784 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
786 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
788 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
790 if (netdev->fd < 0) {
795 ssize_t retval = write(netdev->fd, data, size);
797 /* The Linux AF_PACKET implementation never blocks waiting for room
798 * for packets, instead returning ENOBUFS. Translate this into
799 * EAGAIN for the caller. */
800 if (errno == ENOBUFS) {
802 } else if (errno == EINTR) {
804 } else if (errno != EAGAIN) {
805 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
806 netdev_get_name(netdev_), strerror(errno));
809 } else if (retval != size) {
810 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
811 "%zu) on %s", retval, size, netdev_get_name(netdev_));
819 /* Registers with the poll loop to wake up from the next call to poll_block()
820 * when the packet transmission queue has sufficient room to transmit a packet
821 * with netdev_send().
823 * The kernel maintains a packet transmission queue, so the client is not
824 * expected to do additional queuing of packets. Thus, this function is
825 * unlikely to ever be used. It is included for completeness. */
827 netdev_linux_send_wait(struct netdev *netdev_)
829 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
830 if (netdev->fd < 0) {
832 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
833 poll_fd_wait(netdev->fd, POLLOUT);
835 /* TAP device always accepts packets.*/
836 poll_immediate_wake();
840 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
841 * otherwise a positive errno value. */
843 netdev_linux_set_etheraddr(struct netdev *netdev_,
844 const uint8_t mac[ETH_ADDR_LEN])
846 struct netdev_dev_linux *netdev_dev =
847 netdev_dev_linux_cast(netdev_get_dev(netdev_));
850 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
851 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
852 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
854 netdev_dev->cache_valid |= VALID_ETHERADDR;
855 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
863 /* Returns a pointer to 'netdev''s MAC address. The caller must not modify or
864 * free the returned buffer. */
866 netdev_linux_get_etheraddr(const struct netdev *netdev_,
867 uint8_t mac[ETH_ADDR_LEN])
869 struct netdev_dev_linux *netdev_dev =
870 netdev_dev_linux_cast(netdev_get_dev(netdev_));
871 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
872 int error = get_etheraddr(netdev_get_name(netdev_),
873 netdev_dev->etheraddr);
877 netdev_dev->cache_valid |= VALID_ETHERADDR;
879 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
883 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
884 * in bytes, not including the hardware header; thus, this is typically 1500
885 * bytes for Ethernet devices. */
887 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
889 struct netdev_dev_linux *netdev_dev =
890 netdev_dev_linux_cast(netdev_get_dev(netdev_));
891 if (!(netdev_dev->cache_valid & VALID_MTU)) {
895 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
896 SIOCGIFMTU, "SIOCGIFMTU");
900 netdev_dev->mtu = ifr.ifr_mtu;
901 netdev_dev->cache_valid |= VALID_MTU;
903 *mtup = netdev_dev->mtu;
/* Looks up the ifindex of 'netdev'.  Returns it as a positive number if
 * successful, otherwise a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
919 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
921 struct netdev_dev_linux *netdev_dev =
922 netdev_dev_linux_cast(netdev_get_dev(netdev_));
927 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
931 fn = xasprintf("/sys/class/net/%s/carrier",
932 netdev_get_name(netdev_));
933 fd = open(fn, O_RDONLY);
936 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
940 retval = read(fd, line, sizeof line);
943 if (error == EINVAL) {
944 /* This is the normal return value when we try to check carrier
945 * if the network device is not up. */
947 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
950 } else if (retval == 0) {
952 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
956 if (line[0] != '0' && line[0] != '1') {
958 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
962 netdev_dev->carrier = line[0] != '0';
963 netdev_dev->cache_valid |= VALID_CARRIER;
965 *carrier = netdev_dev->carrier;
976 /* Check whether we can use RTM_GETLINK to get network device statistics.
977 * In pre-2.6.19 kernels, this was only available if wireless extensions were
980 check_for_working_netlink_stats(void)
982 /* Decide on the netdev_get_stats() implementation to use. Netlink is
983 * preferable, so if that works, we'll use it. */
984 int ifindex = do_get_ifindex("lo");
986 VLOG_WARN("failed to get ifindex for lo, "
987 "obtaining netdev stats from proc");
990 struct netdev_stats stats;
991 int error = get_stats_via_netlink(ifindex, &stats);
993 VLOG_DBG("obtaining netdev stats via rtnetlink");
996 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
997 "via proc (you are probably running a pre-2.6.19 "
998 "kernel)", strerror(error));
1004 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1006 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1008 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1009 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1010 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1012 netdev_dev->is_tap = !strcmp(type, "tap");
1013 netdev_dev->is_internal = false;
1014 if (!netdev_dev->is_tap) {
1015 struct ethtool_drvinfo drvinfo;
1018 memset(&drvinfo, 0, sizeof drvinfo);
1019 error = netdev_linux_do_ethtool(name,
1020 (struct ethtool_cmd *)&drvinfo,
1022 "ETHTOOL_GDRVINFO");
1024 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1025 netdev_dev->is_internal = true;
1029 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
/* Exchanges the 64-bit values stored at 'a' and 'b'.
 *
 * A temporary is used so that the call is also safe when 'a' and 'b' point to
 * the same object, in which case the value is left unchanged. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;

    *a = *b;
    *b = tmp;
}
1041 /* Retrieves current device stats for 'netdev'. */
1043 netdev_linux_get_stats(const struct netdev *netdev_,
1044 struct netdev_stats *stats)
1046 struct netdev_dev_linux *netdev_dev =
1047 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1048 static int use_netlink_stats = -1;
1051 COVERAGE_INC(netdev_get_stats);
1053 if (netdev_dev->have_vport_stats ||
1054 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1056 error = netdev_vport_get_stats(netdev_, stats);
1057 netdev_dev->have_vport_stats = !error;
1058 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1061 if (!netdev_dev->have_vport_stats) {
1062 if (use_netlink_stats < 0) {
1063 use_netlink_stats = check_for_working_netlink_stats();
1065 if (use_netlink_stats) {
1068 error = get_ifindex(netdev_, &ifindex);
1070 error = get_stats_via_netlink(ifindex, stats);
1073 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1077 /* If this port is an internal port then the transmit and receive stats
1078 * will appear to be swapped relative to the other ports since we are the
1079 * one sending the data, not a remote computer. For consistency, we swap
1080 * them back here. This does not apply if we are getting stats from the
1081 * vport layer because it always tracks stats from the perspective of the
1083 netdev_linux_update_is_pseudo(netdev_dev);
1084 if (!error && !netdev_dev->have_vport_stats &&
1085 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1086 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1087 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1088 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1089 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1090 stats->rx_length_errors = 0;
1091 stats->rx_over_errors = 0;
1092 stats->rx_crc_errors = 0;
1093 stats->rx_frame_errors = 0;
1094 stats->rx_fifo_errors = 0;
1095 stats->rx_missed_errors = 0;
1096 stats->tx_aborted_errors = 0;
1097 stats->tx_carrier_errors = 0;
1098 stats->tx_fifo_errors = 0;
1099 stats->tx_heartbeat_errors = 0;
1100 stats->tx_window_errors = 0;
1106 /* Stores the features supported by 'netdev' into each of '*current',
1107 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1108 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1109 * successful, otherwise a positive errno value. */
/* Implementation notes: the link settings are fetched once with
 * ETHTOOL_GSET, and each SUPPORTED_* and ADVERTISED_* bit is translated to
 * the matching OFPPF_* bit.  '*current' is derived from the reported speed,
 * duplex and port type.  '*peer' is always set to 0.
 *
 * NOTE(review): every store visible below goes through the output pointers
 * without a null test, so the "that are non-null" wording above is not
 * enforced by the visible code -- presumably the caller never passes NULL
 * here; confirm. */
1111 netdev_linux_get_features(struct netdev *netdev,
1112 uint32_t *current, uint32_t *advertised,
1113 uint32_t *supported, uint32_t *peer)
1115 struct ethtool_cmd ecmd;
/* Ask the driver for its link settings. */
1118 memset(&ecmd, 0, sizeof ecmd);
1119 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1120 ETHTOOL_GSET, "ETHTOOL_GSET");
1125 /* Supported features. */
1127 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1128 *supported |= OFPPF_10MB_HD;
1130 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1131 *supported |= OFPPF_10MB_FD;
1133 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1134 *supported |= OFPPF_100MB_HD;
1136 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1137 *supported |= OFPPF_100MB_FD;
1139 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1140 *supported |= OFPPF_1GB_HD;
1142 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1143 *supported |= OFPPF_1GB_FD;
1145 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1146 *supported |= OFPPF_10GB_FD;
1148 if (ecmd.supported & SUPPORTED_TP) {
1149 *supported |= OFPPF_COPPER;
1151 if (ecmd.supported & SUPPORTED_FIBRE) {
1152 *supported |= OFPPF_FIBER;
1154 if (ecmd.supported & SUPPORTED_Autoneg) {
1155 *supported |= OFPPF_AUTONEG;
1157 if (ecmd.supported & SUPPORTED_Pause) {
1158 *supported |= OFPPF_PAUSE;
1160 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1161 *supported |= OFPPF_PAUSE_ASYM;
1164 /* Advertised features. */
1166 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1167 *advertised |= OFPPF_10MB_HD;
1169 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1170 *advertised |= OFPPF_10MB_FD;
1172 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1173 *advertised |= OFPPF_100MB_HD;
1175 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1176 *advertised |= OFPPF_100MB_FD;
1178 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1179 *advertised |= OFPPF_1GB_HD;
1181 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1182 *advertised |= OFPPF_1GB_FD;
1184 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1185 *advertised |= OFPPF_10GB_FD;
1187 if (ecmd.advertising & ADVERTISED_TP) {
1188 *advertised |= OFPPF_COPPER;
1190 if (ecmd.advertising & ADVERTISED_FIBRE) {
1191 *advertised |= OFPPF_FIBER;
1193 if (ecmd.advertising & ADVERTISED_Autoneg) {
1194 *advertised |= OFPPF_AUTONEG;
1196 if (ecmd.advertising & ADVERTISED_Pause) {
1197 *advertised |= OFPPF_PAUSE;
1199 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1200 *advertised |= OFPPF_PAUSE_ASYM;
1203 /* Current settings. */
1204 if (ecmd.speed == SPEED_10) {
1205 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1206 } else if (ecmd.speed == SPEED_100) {
1207 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1208 } else if (ecmd.speed == SPEED_1000) {
1209 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1210 } else if (ecmd.speed == SPEED_10000) {
1211 *current = OFPPF_10GB_FD;
/* Media type of the port in use. */
1216 if (ecmd.port == PORT_TP) {
1217 *current |= OFPPF_COPPER;
1218 } else if (ecmd.port == PORT_FIBRE) {
1219 *current |= OFPPF_FIBER;
/* NOTE(review): the condition guarding the next store is not visible in this
 * copy; presumably it tests the autonegotiation setting -- confirm. */
1223 *current |= OFPPF_AUTONEG;
1226 /* Peer advertisements. */
1227 *peer = 0; /* XXX */
1232 /* Set the features advertised by 'netdev' to 'advertise'.
 *
 * 'advertise' is tested against OFPPF_* bits.  The current settings are read
 * with ETHTOOL_GSET, the 'advertising' field is rebuilt from zero using the
 * ADVERTISED_* equivalents of the bits set in 'advertise', and the result is
 * written back with ETHTOOL_SSET, whose status is returned. */
1234 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1236 struct ethtool_cmd ecmd;
/* Start from the existing settings so that only 'advertising' changes. */
1239 memset(&ecmd, 0, sizeof ecmd);
1240 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1241 ETHTOOL_GSET, "ETHTOOL_GSET");
1246 ecmd.advertising = 0;
1247 if (advertise & OFPPF_10MB_HD) {
1248 ecmd.advertising |= ADVERTISED_10baseT_Half;
1250 if (advertise & OFPPF_10MB_FD) {
1251 ecmd.advertising |= ADVERTISED_10baseT_Full;
1253 if (advertise & OFPPF_100MB_HD) {
1254 ecmd.advertising |= ADVERTISED_100baseT_Half;
1256 if (advertise & OFPPF_100MB_FD) {
1257 ecmd.advertising |= ADVERTISED_100baseT_Full;
1259 if (advertise & OFPPF_1GB_HD) {
1260 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1262 if (advertise & OFPPF_1GB_FD) {
1263 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1265 if (advertise & OFPPF_10GB_FD) {
1266 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1268 if (advertise & OFPPF_COPPER) {
1269 ecmd.advertising |= ADVERTISED_TP;
1271 if (advertise & OFPPF_FIBER) {
1272 ecmd.advertising |= ADVERTISED_FIBRE;
1274 if (advertise & OFPPF_AUTONEG) {
1275 ecmd.advertising |= ADVERTISED_Autoneg;
1277 if (advertise & OFPPF_PAUSE) {
1278 ecmd.advertising |= ADVERTISED_Pause;
1280 if (advertise & OFPPF_PAUSE_ASYM) {
1281 ecmd.advertising |= ADVERTISED_Asym_Pause;
1283 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1284 ETHTOOL_SSET, "ETHTOOL_SSET");
1287 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1288 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1289 * and returns 0. Otherwise returns an errno value (specifically ENOENT if
1290 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1291 * sets '*vlan_vid' to -1.
 *
 * The VID is parsed from the first line of /proc/net/vlan/<name>, which is
 * matched against the pattern "%*s VID: %d". */
1293 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1295 const char *netdev_name = netdev_get_name(netdev);
1296 struct ds line = DS_EMPTY_INITIALIZER;
1297 FILE *stream = NULL;
1301 COVERAGE_INC(netdev_get_vlan_vid);
1302 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1303 stream = fopen(fn, "r");
/* A nonzero result from ds_get_line() is handled as a failure: either a read
 * error or an unexpected end of file. */
1309 if (ds_get_line(&line, stream)) {
1310 if (ferror(stream)) {
1312 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1315 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
/* NOTE(review): sscanf() returns EOF (a negative value) when input runs out
 * before the first conversion, and "!sscanf(...)" does not treat that as a
 * failure.  Comparing the result against 1 would cover both cases -- confirm
 * before changing. */
1320 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1322 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1323 fn, ds_cstr(&line));
/* Command-line templates that netdev_linux_set_policing() formats with
 * snprintf() and runs with system().  POLICE_ADD_CMD takes the device name
 * (%s).  POLICE_CONFIG_CMD takes the device name (%s), then the rate and the
 * burst (both %d).
 *
 * NOTE(review): the rate and burst passed in are uint32_t, which does not
 * match the %d conversions -- confirm whether that matters for the values in
 * use. */
1341 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1342 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
1344 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1345 * positive errno value.
1347 * This function is equivalent to running
1348 * /sbin/tc qdisc del dev %s handle ffff: ingress
1349 * but it is much, much faster. */
/* Deletes the ingress qdisc of 'netdev' (handle ffff:, parent TC_H_INGRESS)
 * with one RTM_DELQDISC Netlink transaction.  ENOENT and EINVAL results are
 * not logged.  The visible tail resets the cached policing rate and burst to 0
 * and marks them valid in 'cache_valid'. */
1352 netdev_linux_remove_policing(struct netdev *netdev)
1354 struct netdev_dev_linux *netdev_dev =
1355 netdev_dev_linux_cast(netdev_get_dev(netdev));
1356 const char *netdev_name = netdev_get_name(netdev);
1358 struct ofpbuf request;
1359 struct tcmsg *tcmsg;
1362 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
/* Identify the qdisc to delete: the "ingress" qdisc with handle ffff:0. */
1366 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1367 tcmsg->tcm_parent = TC_H_INGRESS;
1368 nl_msg_put_string(&request, TCA_KIND, "ingress");
1369 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1371 error = tc_transact(&request, NULL);
1372 if (error && error != ENOENT && error != EINVAL) {
1373 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1374 netdev_name, strerror(error));
/* Record that no policing is configured. */
1378 netdev_dev->kbits_rate = 0;
1379 netdev_dev->kbits_burst = 0;
1380 netdev_dev->cache_valid |= VALID_POLICING;
1384 /* Attempts to set input rate limiting (policing) policy.
 *
 * 'kbits_burst' is forced to 0 when 'kbits_rate' is 0, and defaults to 1000
 * when it is 0 but a rate is given.  When the cached rate and burst already
 * match, the settings are assumed unchanged.  Otherwise the visible code
 * removes any existing policing and runs the POLICE_ADD_CMD and
 * POLICE_CONFIG_CMD command lines through system(), then caches the new
 * values.
 *
 * NOTE(review): the device name is interpolated into a command line that is
 * handed to system(), and 'kbits_rate' / 'kbits_burst' are uint32_t but are
 * formatted with %d -- confirm both are acceptable for all callers. */
1386 netdev_linux_set_policing(struct netdev *netdev,
1387 uint32_t kbits_rate, uint32_t kbits_burst)
1389 struct netdev_dev_linux *netdev_dev =
1390 netdev_dev_linux_cast(netdev_get_dev(netdev));
1391 const char *netdev_name = netdev_get_name(netdev);
1394 COVERAGE_INC(netdev_set_policing);
1396 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1397 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1398 : kbits_burst); /* Stick with user-specified value. */
1400 if (netdev_dev->cache_valid & VALID_POLICING
1401 && netdev_dev->kbits_rate == kbits_rate
1402 && netdev_dev->kbits_burst == kbits_burst) {
1403 /* Assume that settings haven't changed since we last set them. */
/* Start from a clean state before installing the new policy. */
1407 netdev_linux_remove_policing(netdev);
1409 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1410 if (system(command) != 0) {
1411 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1415 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1416 kbits_rate, kbits_burst);
1417 if (system(command) != 0) {
1418 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
/* Remember what was configured so that an identical request can be skipped. */
1423 netdev_dev->kbits_rate = kbits_rate;
1424 netdev_dev->kbits_burst = kbits_burst;
1425 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the 'ovs_name' of every entry in the null-terminated 'tcs'
 * array that has a 'tc_install' function and a nonempty 'ovs_name'. */
1432 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1435 const struct tc_ops **opsp;
1437 for (opsp = tcs; *opsp != NULL; opsp++) {
1438 const struct tc_ops *ops = *opsp;
1439 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1440 svec_add(types, ops->ovs_name);
/* Searches the null-terminated 'tcs' array for the entry whose 'ovs_name'
 * equals 'name'.  The return statements are not visible in this copy;
 * presumably the match, or NULL if there is none, is returned -- confirm. */
1446 static const struct tc_ops *
1447 tc_lookup_ovs_name(const char *name)
1449 const struct tc_ops **opsp;
1451 for (opsp = tcs; *opsp != NULL; opsp++) {
1452 const struct tc_ops *ops = *opsp;
1453 if (!strcmp(name, ops->ovs_name)) {
/* Searches the null-terminated 'tcs' array for the entry whose 'linux_name'
 * equals 'name', skipping entries whose 'linux_name' is null.  The return
 * statements are not visible in this copy; presumably the match, or NULL if
 * there is none, is returned -- confirm. */
1460 static const struct tc_ops *
1461 tc_lookup_linux_name(const char *name)
1463 const struct tc_ops **opsp;
1465 for (opsp = tcs; *opsp != NULL; opsp++) {
1466 const struct tc_ops *ops = *opsp;
1467 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks up the traffic-control implementation named 'type' with
 * tc_lookup_ovs_name() and copies its 'n_queues' into 'caps->n_queues'.
 * Handling of an unknown 'type' is on lines not visible in this copy. */
1475 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1477 struct netdev_qos_capabilities *caps)
1479 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1483 caps->n_queues = ops->n_queues;
/* Refreshes the cached qdisc state with tc_query_qdisc(), stores the OVS name
 * of the active traffic-control implementation in '*typep', and, when that
 * implementation provides 'qdisc_get', returns its result after it fills in
 * 'details'.  The value returned when 'qdisc_get' is absent is on a line not
 * visible in this copy. */
1488 netdev_linux_get_qos(const struct netdev *netdev,
1489 const char **typep, struct shash *details)
1491 struct netdev_dev_linux *netdev_dev =
1492 netdev_dev_linux_cast(netdev_get_dev(netdev));
1495 error = tc_query_qdisc(netdev);
1500 *typep = netdev_dev->tc->ops->ovs_name;
1501 return (netdev_dev->tc->ops->qdisc_get
1502 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS of kind 'type' on 'netdev' using 'details'.
 *
 * 'type' must name an implementation that has a 'tc_install' function.  If it
 * is the implementation already in use, only its 'qdisc_set' (when present) is
 * called; otherwise the existing qdisc is deleted and the new one installed.
 * The assertions require 'netdev_dev->tc' to be null after deletion, and to be
 * non-null exactly when installation reports success. */
1507 netdev_linux_set_qos(struct netdev *netdev,
1508 const char *type, const struct shash *details)
1510 struct netdev_dev_linux *netdev_dev =
1511 netdev_dev_linux_cast(netdev_get_dev(netdev));
1512 const struct tc_ops *new_ops;
1515 new_ops = tc_lookup_ovs_name(type);
1516 if (!new_ops || !new_ops->tc_install) {
1520 error = tc_query_qdisc(netdev);
1525 if (new_ops == netdev_dev->tc->ops) {
1526 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1528 /* Delete existing qdisc. */
1529 error = tc_del_qdisc(netdev);
1533 assert(netdev_dev->tc == NULL);
1535 /* Install new qdisc. */
1536 error = new_ops->tc_install(netdev, details);
1537 assert((error == 0) == (netdev_dev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev' into 'details'
 * by calling the active implementation's 'class_get'.  A 'queue_id' above
 * UINT16_MAX, or one with no entry in the 'queues' port array, takes a
 * separate branch whose body is not visible in this copy. */
1544 netdev_linux_get_queue(const struct netdev *netdev,
1545 unsigned int queue_id, struct shash *details)
1547 struct netdev_dev_linux *netdev_dev =
1548 netdev_dev_linux_cast(netdev_get_dev(netdev));
1551 error = tc_query_qdisc(netdev);
1554 } else if (queue_id > UINT16_MAX
1555 || !port_array_get(&netdev_dev->tc->queues, queue_id)) {
1559 return netdev_dev->tc->ops->class_get(netdev, queue_id, details);
/* Configures queue 'queue_id' on 'netdev' from 'details' by calling the active
 * implementation's 'class_set'.  A 'queue_id' at or beyond the
 * implementation's 'n_queues', or a missing 'class_set', takes a separate
 * branch whose body is not visible in this copy. */
1563 netdev_linux_set_queue(struct netdev *netdev,
1564 unsigned int queue_id, const struct shash *details)
1566 struct netdev_dev_linux *netdev_dev =
1567 netdev_dev_linux_cast(netdev_get_dev(netdev));
1570 error = tc_query_qdisc(netdev);
1573 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1574 || !netdev_dev->tc->ops->class_set) {
1578 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev' by calling the active
 * implementation's 'class_delete'.  A missing 'class_delete', a 'queue_id'
 * above UINT16_MAX, or a queue with no entry in the 'queues' port array each
 * take a separate branch whose body is not visible in this copy. */
1582 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1584 struct netdev_dev_linux *netdev_dev =
1585 netdev_dev_linux_cast(netdev_get_dev(netdev));
1588 error = tc_query_qdisc(netdev);
1591 } else if (!netdev_dev->tc->ops->class_delete) {
1593 } else if (queue_id > UINT16_MAX
1594 || !port_array_get(&netdev_dev->tc->queues, queue_id)) {
1598 return netdev_dev->tc->ops->class_delete(netdev, queue_id);
/* Retrieves statistics for queue 'queue_id' on 'netdev' into 'stats' by
 * calling the active implementation's 'class_get_stats'.  A 'queue_id' above
 * UINT16_MAX, a queue with no entry in the 'queues' port array, or a missing
 * 'class_get_stats' each take a separate branch whose body is not visible in
 * this copy. */
1602 netdev_linux_get_queue_stats(const struct netdev *netdev,
1603 unsigned int queue_id,
1604 struct netdev_queue_stats *stats)
1606 struct netdev_dev_linux *netdev_dev =
1607 netdev_dev_linux_cast(netdev_get_dev(netdev));
1610 error = tc_query_qdisc(netdev);
1613 } else if (queue_id > UINT16_MAX
1614 || !port_array_get(&netdev_dev->tc->queues, queue_id)) {
1616 } else if (!netdev_dev->tc->ops->class_get_stats) {
1620 return netdev_dev->tc->ops->class_get_stats(netdev, queue_id, stats);
/* Starts a Netlink dump of the traffic classes of 'netdev' into 'dump': builds
 * an RTM_GETTCLASS request with 'tcm_parent' set to 0, starts the dump on
 * 'rtnl_sock', and releases the request buffer.  Callers in this file test the
 * result with "!", so it is presumably a success flag -- confirm. */
1624 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1626 struct ofpbuf request;
1627 struct tcmsg *tcmsg;
1629 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1633 tcmsg->tcm_parent = 0;
1634 nl_dump_start(dump, rtnl_sock, &request);
1635 ofpbuf_uninit(&request);
/* Iterates over every queue recorded in the 'queues' port array of 'netdev'.
 * For each one, 'details' is cleared, refilled by the active implementation's
 * 'class_get', and passed to 'cb' together with the queue id and 'aux'.  A
 * missing 'class_get' takes a separate branch whose body is not visible in
 * this copy. */
1640 netdev_linux_dump_queues(const struct netdev *netdev,
1641 netdev_dump_queues_cb *cb, void *aux)
1643 struct netdev_dev_linux *netdev_dev =
1644 netdev_dev_linux_cast(netdev_get_dev(netdev));
1645 unsigned int queue_id;
1646 struct shash details;
1651 error = tc_query_qdisc(netdev);
1654 } else if (!netdev_dev->tc->ops->class_get) {
1659 shash_init(&details);
1660 PORT_ARRAY_FOR_EACH (queue, &netdev_dev->tc->queues, queue_id) {
/* Reuse one shash for every queue. */
1661 shash_clear(&details);
1663 error = netdev_dev->tc->ops->class_get(netdev, queue_id, &details);
1665 (*cb)(queue_id, &details, aux);
1670 shash_destroy(&details);
/* Dumps the traffic classes of 'netdev' over Netlink and passes each reply
 * message, with 'cb' and 'aux', to the active implementation's
 * 'class_dump_stats'.  Returns the error from nl_dump_done() if there is one,
 * otherwise 'last_error'.  A missing 'class_dump_stats' takes a separate
 * branch whose body is not visible in this copy. */
1676 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1677 netdev_dump_queue_stats_cb *cb, void *aux)
1679 struct netdev_dev_linux *netdev_dev =
1680 netdev_dev_linux_cast(netdev_get_dev(netdev));
1681 struct nl_dump dump;
1686 error = tc_query_qdisc(netdev);
1689 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1694 if (!start_queue_dump(netdev, &dump)) {
1697 while (nl_dump_next(&dump, &msg)) {
1698 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
/* An error from finishing the dump takes precedence over 'last_error'. */
1704 error = nl_dump_done(&dump);
1705 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.  The values are fetched with SIOCGIFADDR and SIOCGIFNETMASK
 * when VALID_IN4 is not yet set in 'cache_valid', and read from the cache
 * otherwise.  Returns EADDRNOTAVAIL if the address is INADDR_ANY, otherwise
 * 0. */
1709 netdev_linux_get_in4(const struct netdev *netdev_,
1710 struct in_addr *address, struct in_addr *netmask)
1712 struct netdev_dev_linux *netdev_dev =
1713 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1715 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1718 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1719 SIOCGIFADDR, "SIOCGIFADDR");
1724 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1725 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1730 netdev_dev->cache_valid |= VALID_IN4;
1732 *address = netdev_dev->address;
1733 *netmask = netdev_dev->netmask;
1734 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' to 'netdev_' with SIOCSIFADDR and records 'address'
 * and 'netmask' in the cache.  The netmask is applied with SIOCSIFNETMASK only
 * when 'address' is not INADDR_ANY.
 *
 * NOTE(review): the condition under which the cache is updated sits on a line
 * not visible in this copy; presumably it requires the first ioctl to have
 * succeeded -- confirm. */
1738 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1739 struct in_addr netmask)
1741 struct netdev_dev_linux *netdev_dev =
1742 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1745 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1747 netdev_dev->cache_valid |= VALID_IN4;
1748 netdev_dev->address = address;
1749 netdev_dev->netmask = netmask;
1750 if (address.s_addr != INADDR_ANY) {
1751 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1752 "SIOCSIFNETMASK", netmask);
/* Parses one 'line' in the layout described by the format below: sixteen
 * two-hex-digit bytes stored into 'in6->s6_addr', four hexadecimal fields that
 * are skipped, and an interface name of at most 16 characters.  The scanning
 * call itself and the test of its result are on lines not visible in this
 * copy; the caller uses the result as a truth value. */
1759 parse_if_inet6_line(const char *line,
1760 struct in6_addr *in6, char ifname[16 + 1])
1762 uint8_t *s6 = in6->s6_addr;
/* X8 scans exactly one address byte written as two hex digits. */
1763 #define X8 "%2"SCNx8
1765 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1766 "%*x %*x %*x %*x %16s\n",
1767 &s6[0], &s6[1], &s6[2], &s6[3],
1768 &s6[4], &s6[5], &s6[6], &s6[7],
1769 &s6[8], &s6[9], &s6[10], &s6[11],
1770 &s6[12], &s6[13], &s6[14], &s6[15],
1774 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1775 * 'in6' is non-null) and returns true. Otherwise, returns false.
 *
 * When VALID_IN6 is not yet set in 'cache_valid', the address is looked up by
 * scanning /proc/net/if_inet6 for a line whose interface name equals the
 * name of 'netdev_'; the cached value defaults to in6addr_any.
 *
 * NOTE(review): the visible store through 'in6' has no null test, and the
 * return statements are not visible in this copy, so the wording above about
 * a null 'in6' and a true/false result could not be checked -- confirm. */
1777 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1779 struct netdev_dev_linux *netdev_dev =
1780 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1781 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1785 netdev_dev->in6 = in6addr_any;
1787 file = fopen("/proc/net/if_inet6", "r");
1789 const char *name = netdev_get_name(netdev_);
1790 while (fgets(line, sizeof line, file)) {
/* NOTE(review): this local shadows the 'in6' parameter inside the loop. */
1791 struct in6_addr in6;
1792 char ifname[16 + 1];
1793 if (parse_if_inet6_line(line, &in6, ifname)
1794 && !strcmp(name, ifname))
1796 netdev_dev->in6 = in6;
1802 netdev_dev->cache_valid |= VALID_IN6;
1804 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr'.  A zeroed
 * struct sockaddr_in is built with the family and address set, '*sa' is
 * cleared in full, and the sockaddr_in is then copied over its start. */
1809 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
1811 struct sockaddr_in sin;
1812 memset(&sin, 0, sizeof sin);
1813 sin.sin_family = AF_INET;
1814 sin.sin_addr = addr;
1817 memset(sa, 0, sizeof *sa);
1818 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' (named 'ioctl_name' for logging) on
 * 'netdev' with IPv4 address 'addr', and returns the result of
 * netdev_linux_do_ioctl().
 *
 * NOTE(review): strncpy() leaves 'ifr.ifr_name' without a terminating null
 * byte when the device name is at least as long as the field.  Whether 'ifr'
 * is zeroed beforehand is not visible in this copy -- confirm. */
1822 do_set_addr(struct netdev *netdev,
1823 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1826 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1827 make_in4_sockaddr(&ifr.ifr_addr, addr);
1829 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1833 /* Adds 'router' as a default IP gateway.
 *
 * The route has destination and genmask INADDR_ANY, gateway 'router', and
 * flags RTF_UP | RTF_GATEWAY, and is installed with the SIOCADDRT ioctl on
 * 'af_inet_sock'.  A failure is logged with the errno description. */
1835 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1837 struct in_addr any = { INADDR_ANY };
1841 memset(&rt, 0, sizeof rt);
1842 make_in4_sockaddr(&rt.rt_dst, any);
1843 make_in4_sockaddr(&rt.rt_gateway, router);
1844 make_in4_sockaddr(&rt.rt_genmask, any);
1845 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1846 COVERAGE_INC(netdev_add_router);
1847 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1849 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Scans /proc/net/route for a route that is up and whose destination matches
 * 'host' under the route's mask.  For a match, '*next_hop' is set either to 0
 * (directly reachable) or to the route's gateway, and '*netdev_name' receives
 * an xstrdup() copy of the interface name.  '*netdev_name' is set to NULL
 * before the file is opened.  The test that picks between the two next-hop
 * cases, and the return statements, are on lines not visible in this copy. */
1855 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1858 static const char fn[] = "/proc/net/route";
1863 *netdev_name = NULL;
1864 stream = fopen(fn, "r");
1865 if (stream == NULL) {
1866 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1871 while (fgets(line, sizeof line, stream)) {
1874 uint32_t dest, gateway, mask;
1875 int refcnt, metric, mtu;
1876 unsigned int flags, use, window, irtt;
/* A line counts as parsed only if all eleven fields convert. */
1879 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1881 iface, &dest, &gateway, &flags, &refcnt,
1882 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1884 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
1888 if (!(flags & RTF_UP)) {
1889 /* Skip routes that aren't up. */
1893 /* The values of 'dest', 'mask', and 'gateway' were given in
1894 * network byte order, so we don't need any endian
1895 * conversions here. */
1896 if ((dest & mask) == (host->s_addr & mask)) {
1898 /* The host is directly reachable. */
1899 next_hop->s_addr = 0;
1901 /* To reach the host, we must go through a gateway. */
1902 next_hop->s_addr = gateway;
1904 *netdev_name = xstrdup(iface);
1915 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
1916 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
1917 * returns 0. Otherwise, it returns a positive errno value; in particular,
1918 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'.
 *
 * The lookup is a SIOCGARP ioctl on 'af_inet_sock'.  Failures other than
 * ENXIO are logged. */
1920 netdev_linux_arp_lookup(const struct netdev *netdev,
1921 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
1924 struct sockaddr_in sin;
1927 memset(&r, 0, sizeof r);
1928 sin.sin_family = AF_INET;
1929 sin.sin_addr.s_addr = ip;
1931 memcpy(&r.arp_pa, &sin, sizeof sin);
1932 r.arp_ha.sa_family = ARPHRD_ETHER;
/* NOTE(review): strncpy() does not add a terminating null byte when the name
 * fills 'r.arp_dev'; here 'r' was zeroed first, so only a name of at least
 * sizeof r.arp_dev bytes would be left unterminated -- confirm that cannot
 * happen. */
1934 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
1935 COVERAGE_INC(netdev_arp_lookup);
1936 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
1938 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
1939 } else if (retval != ENXIO) {
1940 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
1941 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Converts the netdev flags in 'nd' for use as interface flags.  The visible
 * code tests NETDEV_UP and NETDEV_PROMISC; the values produced for each are on
 * lines not visible in this copy (presumably the IFF_* equivalents --
 * confirm). */
1947 nd_to_iff_flags(enum netdev_flags nd)
1950 if (nd & NETDEV_UP) {
1953 if (nd & NETDEV_PROMISC) {
/* Converts the interface flags in 'iff' to "enum netdev_flags" bits, starting
 * from 0.  The visible code maps IFF_PROMISC to NETDEV_PROMISC; any other
 * mappings are on lines not visible in this copy. */
1960 iff_to_nd_flags(int iff)
1962 enum netdev_flags nd = 0;
1966 if (iff & IFF_PROMISC) {
1967 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and then sets the flags in 'on' on 'netdev'.  The
 * flags in effect before the change are stored, as netdev flags, in
 * '*old_flagsp'.  set_flags() is called only when the computed flag word
 * differs from the current one. */
1973 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
1974 enum netdev_flags on, enum netdev_flags *old_flagsp)
1976 int old_flags, new_flags;
1979 error = get_flags(netdev, &old_flags);
1981 *old_flagsp = iff_to_nd_flags(old_flags);
1982 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
1983 if (new_flags != old_flags) {
1984 error = set_flags(netdev, new_flags);
/* Walks every netdev_linux_notifier linked into 'list' and obtains a pointer
 * to the netdev_notifier embedded in it.  What is then done with that pointer
 * is on lines not visible in this copy (presumably its callback is invoked --
 * confirm).
 *
 * NOTE(review): the initializer of 'n' below contains a mis-encoded
 * character; it looks like it should take the address of
 * 'notifier->notifier' -- confirm against a clean copy of the file. */
1991 poll_notify(struct list *list)
1993 struct netdev_linux_notifier *notifier;
1994 LIST_FOR_EACH (notifier, struct netdev_linux_notifier, node, list) {
1995 struct netdev_notifier *n = ¬ifier->notifier;
/* rtnetlink change callback, registered by netdev_linux_poll_add().  One
 * visible path looks up a notifier list in 'netdev_linux_notifiers'; the
 * other runs poll_notify() on every list in that table.  The conditions that
 * select between them, and the lookup key, are on lines not visible in this
 * copy. */
2001 netdev_linux_poll_cb(const struct rtnetlink_change *change,
2002 void *aux OVS_UNUSED)
2005 struct list *list = shash_find_data(&netdev_linux_notifiers,
2011 struct shash_node *node;
2012 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2013 poll_notify(node->data);
/* Registers 'cb' (with 'aux') to be notified about 'netdev' and stores the new
 * notifier in '*notifierp'.
 *
 * Notifiers are kept in per-device lists held in the
 * 'netdev_linux_notifiers' shash, keyed by device name.  The rtnetlink
 * notifier is registered when that table is empty on entry.
 *
 * NOTE(review): three lines below contain a mis-encoded character where the
 * address of 'notifier->notifier' or 'notifier->node' appears to be intended
 * -- confirm against a clean copy of the file. */
2019 netdev_linux_poll_add(struct netdev *netdev,
2020 void (*cb)(struct netdev_notifier *), void *aux,
2021 struct netdev_notifier **notifierp)
2023 const char *netdev_name = netdev_get_name(netdev);
2024 struct netdev_linux_notifier *notifier;
2027 if (shash_is_empty(&netdev_linux_notifiers)) {
2028 int error = rtnetlink_notifier_register(&netdev_linux_poll_notifier,
2029 netdev_linux_poll_cb, NULL);
/* Find the list for this device name; a new list is allocated and added to
 * the table below (the test guarding that is not visible in this copy). */
2035 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2037 list = xmalloc(sizeof *list);
2039 shash_add(&netdev_linux_notifiers, netdev_name, list);
2042 notifier = xmalloc(sizeof *notifier);
2043 netdev_notifier_init(¬ifier->notifier, netdev, cb, aux);
2044 list_push_back(list, ¬ifier->node);
2045 *notifierp = ¬ifier->notifier;
/* Unregisters 'notifier_', which must be embedded in a
 * struct netdev_linux_notifier (it is recovered with CONTAINER_OF).  The
 * notifier is unlinked from its list; an emptied list is removed from
 * 'netdev_linux_notifiers'; and when the table becomes empty the rtnetlink
 * notifier is unregistered.
 *
 * NOTE(review): the list_remove() argument below contains a mis-encoded
 * character where the address of 'notifier->node' appears to be intended --
 * confirm against a clean copy of the file. */
2050 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2052 struct netdev_linux_notifier *notifier =
2053 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2056 /* Remove 'notifier' from its list. */
2057 list = list_remove(¬ifier->node);
2058 if (list_is_empty(list)) {
2059 /* The list is now empty. Remove it from the hash and free it. */
2060 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2061 shash_delete(&netdev_linux_notifiers,
2062 shash_find(&netdev_linux_notifiers, netdev_name));
2067 /* If that was the last notifier, unregister. */
2068 if (shash_is_empty(&netdev_linux_notifiers)) {
2069 rtnetlink_notifier_unregister(&netdev_linux_poll_notifier);
/* Operations table for the netdev class created by
 * netdev_linux_create_system().  The initializer is positional, so each entry
 * must stay in the order of the members of struct netdev_class.  Several
 * entries are on lines not visible in this copy. */
2073 const struct netdev_class netdev_linux_class = {
2080 netdev_linux_create_system,
2081 netdev_linux_destroy,
2082 NULL, /* reconfigure */
2087 netdev_linux_enumerate,
2090 netdev_linux_recv_wait,
2094 netdev_linux_send_wait,
2096 netdev_linux_set_etheraddr,
2097 netdev_linux_get_etheraddr,
2098 netdev_linux_get_mtu,
2099 netdev_linux_get_ifindex,
2100 netdev_linux_get_carrier,
2101 netdev_linux_get_stats,
2102 netdev_vport_set_stats,
2104 netdev_linux_get_features,
2105 netdev_linux_set_advertisements,
2106 netdev_linux_get_vlan_vid,
2108 netdev_linux_set_policing,
2109 netdev_linux_get_qos_types,
2110 netdev_linux_get_qos_capabilities,
2111 netdev_linux_get_qos,
2112 netdev_linux_set_qos,
2113 netdev_linux_get_queue,
2114 netdev_linux_set_queue,
2115 netdev_linux_delete_queue,
2116 netdev_linux_get_queue_stats,
2117 netdev_linux_dump_queues,
2118 netdev_linux_dump_queue_stats,
2120 netdev_linux_get_in4,
2121 netdev_linux_set_in4,
2122 netdev_linux_get_in6,
2123 netdev_linux_add_router,
2124 netdev_linux_get_next_hop,
2125 netdev_linux_arp_lookup,
2127 netdev_linux_update_flags,
2129 netdev_linux_poll_add,
2130 netdev_linux_poll_remove,
/* Operations table for the netdev class created by netdev_linux_create_tap().
 * Among the visible entries it differs from netdev_linux_class in three
 * places: the create function, a null 'enumerate', and a null 'set_stats'.
 * The initializer is positional, so each entry must stay in the order of the
 * members of struct netdev_class.  Several entries are on lines not visible
 * in this copy. */
2133 const struct netdev_class netdev_tap_class = {
2140 netdev_linux_create_tap,
2141 netdev_linux_destroy,
2142 NULL, /* reconfigure */
2147 NULL, /* enumerate */
2150 netdev_linux_recv_wait,
2154 netdev_linux_send_wait,
2156 netdev_linux_set_etheraddr,
2157 netdev_linux_get_etheraddr,
2158 netdev_linux_get_mtu,
2159 netdev_linux_get_ifindex,
2160 netdev_linux_get_carrier,
2161 netdev_linux_get_stats,
2162 NULL, /* set_stats */
2164 netdev_linux_get_features,
2165 netdev_linux_set_advertisements,
2166 netdev_linux_get_vlan_vid,
2168 netdev_linux_set_policing,
2169 netdev_linux_get_qos_types,
2170 netdev_linux_get_qos_capabilities,
2171 netdev_linux_get_qos,
2172 netdev_linux_set_qos,
2173 netdev_linux_get_queue,
2174 netdev_linux_set_queue,
2175 netdev_linux_delete_queue,
2176 netdev_linux_get_queue_stats,
2177 netdev_linux_dump_queues,
2178 netdev_linux_dump_queue_stats,
2180 netdev_linux_get_in4,
2181 netdev_linux_set_in4,
2182 netdev_linux_get_in6,
2183 netdev_linux_add_router,
2184 netdev_linux_get_next_hop,
2185 netdev_linux_arp_lookup,
2187 netdev_linux_update_flags,
2189 netdev_linux_poll_add,
2190 netdev_linux_poll_remove,
2193 /* HTB traffic control class. */
/* HTB_N_QUEUES is the 'n_queues' entry of tc_ops_htb and the largest class
 * minor number that the HTB parsing code accepts as a queue. */
2195 #define HTB_N_QUEUES 0xf000
/* The next field is read elsewhere in this file as 'htb->max_rate' on a
 * struct htb; the struct's opening lines are not visible in this copy. */
2199 unsigned int max_rate; /* In bytes/s. */
/* The remaining fields are read elsewhere in this file through
 * struct htb_class pointers; that struct's opening line is not visible in
 * this copy. */
2203 unsigned int min_rate; /* In bytes/s. */
2204 unsigned int max_rate; /* In bytes/s. */
2205 unsigned int burst; /* In bytes. */
2206 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that contains the 'tc' currently attached to
 * 'netdev'.  The CONTAINER_OF conversion is only meaningful when that 'tc' is
 * embedded in a struct htb. */
2210 htb_get__(const struct netdev *netdev)
2212 struct netdev_dev_linux *netdev_dev =
2213 netdev_dev_linux_cast(netdev_get_dev(netdev));
2214 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a struct htb, initializes its embedded 'tc' with tc_ops_htb,
 * records 'max_rate', and attaches it to 'netdev' as the active traffic
 * control state.
 *
 * NOTE(review): 'max_rate' is a uint64_t but is stored in an unsigned int
 * field, so larger values would be truncated -- confirm the intended range. */
2218 htb_install__(struct netdev *netdev, uint64_t max_rate)
2220 struct netdev_dev_linux *netdev_dev =
2221 netdev_dev_linux_cast(netdev_get_dev(netdev));
2224 htb = xmalloc(sizeof *htb);
2225 tc_init(&htb->tc, &tc_ops_htb);
2226 htb->max_rate = max_rate;
2228 netdev_dev->tc = &htb->tc;
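2233 /* Create an HTB qdisc.
2235 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default
 * <defcls>", where <defcls> is the default class id.  (The end of this
 * comment is missing from this copy -- TODO confirm the value.) */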
/* Replaces the root qdisc of 'netdev' with an HTB qdisc that has handle 1:0.
 * Any existing qdisc is deleted first, then an RTM_NEWQDISC request carrying
 * a TCA_HTB_INIT option block is sent.  Returns the result of
 * tc_transact(). */
2238 htb_setup_qdisc__(struct netdev *netdev)
2241 struct tc_htb_glob opt;
2242 struct ofpbuf request;
2243 struct tcmsg *tcmsg;
/* The result of the deletion is not checked here. */
2245 tc_del_qdisc(netdev);
2247 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2248 NLM_F_EXCL | NLM_F_CREATE, &request);
2252 tcmsg->tcm_handle = tc_make_handle(1, 0);
2253 tcmsg->tcm_parent = TC_H_ROOT;
2255 nl_msg_put_string(&request, TCA_KIND, "htb");
2257 memset(&opt, 0, sizeof opt);
2258 opt.rate2quantum = 10;
/* The HTB options travel as a nested TCA_OPTIONS attribute. */
2262 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2263 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2264 nl_msg_end_nested(&request, opt_offset);
2266 return tc_transact(&request, NULL);
2269 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2270 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * The rate and ceiling tables are computed from 'class' and the MTU of
 * 'netdev', and sent in one RTM_NEWTCLASS request with NLM_F_CREATE.  A
 * failed transaction is logged together with the class parameters. */
2272 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2273 unsigned int parent, struct htb_class *class)
2276 struct tc_htb_opt opt;
2277 struct ofpbuf request;
2278 struct tcmsg *tcmsg;
/* The result of the MTU query is not checked here. */
2282 netdev_get_mtu(netdev, &mtu);
2284 memset(&opt, 0, sizeof opt);
2285 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2286 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2287 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2288 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2289 opt.prio = class->priority;
2291 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2295 tcmsg->tcm_handle = handle;
2296 tcmsg->tcm_parent = parent;
2298 nl_msg_put_string(&request, TCA_KIND, "htb");
2299 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2300 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2301 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2302 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2303 nl_msg_end_nested(&request, opt_offset);
2305 error = tc_transact(&request, NULL);
2307 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2308 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2309 netdev_get_name(netdev),
2310 tc_get_major(handle), tc_get_minor(handle),
2311 tc_get_major(parent), tc_get_minor(parent),
2312 class->min_rate, class->max_rate,
2313 class->burst, class->priority, strerror(error));
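2318 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2319 * description of them into 'class'. The description complies with the
2320 * specification given in the vswitch database documentation for linux-htb
 * queues.  (The end of this comment was cut off in this copy; its last
 * words are reconstructed -- TODO confirm.) */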
/* Extracts the mandatory TCA_HTB_PARMS attribute from the nested attributes
 * in 'nl_options' and fills '*class' from it: 'min_rate' from the rate,
 * 'max_rate' from the ceiling, 'burst' converted from ticks to bytes at the
 * rate, and 'priority'.  A parse failure is logged. */
2323 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2325 static const struct nl_policy tca_htb_policy[] = {
2326 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2327 .min_len = sizeof(struct tc_htb_opt) },
2330 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2331 const struct tc_htb_opt *htb;
2333 if (!nl_parse_nested(nl_options, tca_htb_policy,
2334 attrs, ARRAY_SIZE(tca_htb_policy))) {
2335 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2339 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2340 class->min_rate = htb->rate.rate;
2341 class->max_rate = htb->ceil.rate;
2342 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2343 class->priority = htb->prio;
/* Parses the traffic class message 'tcmsg' with tc_parse_class(), which also
 * receives 'stats'.
 *
 * If 'queue_id' is nonnull and the handle has major number 1 and a minor
 * number from 1 to HTB_N_QUEUES, '*queue_id' is set to the minor number minus
 * 1; what happens for other handles is on lines not visible in this copy.  If
 * 'options' is nonnull, the HTB options are parsed into it. */
2348 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2349 struct htb_class *options,
2350 struct netdev_queue_stats *stats)
2352 struct nlattr *nl_options;
2353 unsigned int handle;
2356 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2357 if (!error && queue_id) {
2358 unsigned int major = tc_get_major(handle);
2359 unsigned int minor = tc_get_minor(handle);
2360 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
/* Class minor numbers are queue ids offset by one. */
2361 *queue_id = minor - 1;
2366 if (!error && options) {
2367 error = htb_parse_tca_options__(nl_options, options);
/* Fills 'hc' from the qdisc-level 'details'.  'max_rate' is the "max-rate"
 * value divided by 8; when that is missing or yields 0, it falls back to
 * netdev_features_to_bps() of the current features of 'netdev', divided by
 * 8.  'min_rate' is set equal to 'max_rate'.
 *
 * NOTE(review): the second argument of netdev_get_features() below contains
 * a mis-encoded character; it looks like it should take the address of
 * 'current' -- confirm against a clean copy of the file. */
2373 htb_parse_qdisc_details__(struct netdev *netdev,
2374 const struct shash *details, struct htb_class *hc)
2376 const char *max_rate_s;
2378 max_rate_s = shash_find_data(details, "max-rate");
2379 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2380 if (!hc->max_rate) {
2383 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2384 hc->max_rate = netdev_features_to_bps(current) / 8;
2386 hc->min_rate = hc->max_rate;
/* Fills 'hc' from the per-queue 'details' ("min-rate", "max-rate", "burst"
 * and "priority").  The three numeric rate and burst values are divided by 8
 * after parsing.
 *
 * 'min_rate' is raised to at least 1500 and capped at the qdisc's
 * 'max_rate'; 'max_rate' is raised to at least 'min_rate' and capped the same
 * way; 'burst' is raised to at least the MTU plus 64; 'priority' defaults to
 * 0.
 *
 * NOTE(review): the strtoull() results are narrowed into unsigned int
 * fields, and no parse errors are detected in the visible code -- confirm
 * that is acceptable for the expected inputs. */
2392 htb_parse_class_details__(struct netdev *netdev,
2393 const struct shash *details, struct htb_class *hc)
2395 const struct htb *htb = htb_get__(netdev);
2396 const char *min_rate_s = shash_find_data(details, "min-rate");
2397 const char *max_rate_s = shash_find_data(details, "max-rate");
2398 const char *burst_s = shash_find_data(details, "burst");
2399 const char *priority_s = shash_find_data(details, "priority");
2402 /* min-rate. Don't allow a min-rate below 1500 bytes/s. */
2404 /* min-rate is required. */
2407 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2408 hc->min_rate = MAX(hc->min_rate, 1500);
2409 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2412 hc->max_rate = (max_rate_s
2413 ? strtoull(max_rate_s, NULL, 10) / 8
2415 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2416 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
/* NOTE(review): the opening line of the comment that follows is missing from
 * this copy. */
2420 * According to hints in the documentation that I've read, it is important
2421 * that 'burst' be at least as big as the largest frame that might be
2422 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2423 * but having it a bit too small is a problem. Since netdev_get_mtu()
2424 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2425 * the MTU. We actually add 64, instead of 14, as a guard against
2426 * additional headers get tacked on somewhere that we're not aware of. */
2427 netdev_get_mtu(netdev, &mtu);
2428 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2429 hc->burst = MAX(hc->burst, mtu + 64);
2432 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the class with the given 'handle' and 'parent' on
 * 'netdev', then parses the reply into 'options' and 'stats' with
 * htb_parse_tcmsg__() (passing a null queue id) and frees the reply. */
2438 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2439 unsigned int parent, struct htb_class *options,
2440 struct netdev_queue_stats *stats)
2442 struct ofpbuf *reply;
2445 error = tc_query_class(netdev, handle, parent, &reply);
2447 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2448 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': creates the qdisc, creates class 1:fffe under 1:0
 * from the qdisc-level 'details', and records the resulting maximum rate with
 * htb_install__().  The error checks between these steps are on lines not
 * visible in this copy. */
2454 htb_tc_install(struct netdev *netdev, const struct shash *details)
2458 error = htb_setup_qdisc__(netdev);
2460 struct htb_class hc;
2462 htb_parse_qdisc_details__(netdev, details, &hc);
2463 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2464 tc_make_handle(1, 0), &hc);
2466 htb_install__(netdev, hc.max_rate);
/* Records 'hc' as the configuration of queue 'queue_id' in the HTB state of
 * 'netdev'.  The existing entry is looked up in the 'queues' port array; a
 * new entry is allocated and stored in the array below (the test guarding
 * that, and the copy of the fields from 'hc', are on lines not visible in
 * this copy). */
2473 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2474 const struct htb_class *hc)
2476 struct htb *htb = htb_get__(netdev);
2477 struct htb_class *hcp;
2479 hcp = port_array_get(&htb->tc.queues, queue_id);
2481 hcp = xmalloc(sizeof *hcp);
2482 port_array_set(&htb->tc.queues, queue_id, hcp);
/* Rebuilds the in-memory HTB state of 'netdev' from the kernel: queries class
 * 1:fffe for the qdisc-level rate, installs a struct htb with it, then dumps
 * all classes and records every one that parses as an HTB queue.
 *
 * NOTE(review): 'details' is initialized twice (by SHASH_INITIALIZER and by
 * shash_init()) and is not otherwise used in the visible code; the result of
 * htb_query_class__() is also not checked before 'hc.max_rate' is read --
 * confirm both. */
2488 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2490 struct shash details = SHASH_INITIALIZER(&details);
2492 struct nl_dump dump;
2493 struct htb_class hc;
2496 /* Get qdisc options. */
2498 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2499 htb = htb_install__(netdev, hc.max_rate);
2502 if (!start_queue_dump(netdev, &dump)) {
2505 shash_init(&details);
2506 while (nl_dump_next(&dump, &msg)) {
2507 unsigned int queue_id;
/* A zero result means the message parsed as a valid HTB queue class. */
2509 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2510 htb_update_queue__(netdev, queue_id, &hc);
2513 nl_dump_done(&dump);
/* Tears down the struct htb that contains 'tc'.  The visible code iterates
 * over every entry of the 'queues' port array; the loop body and the rest of
 * the function are on lines not visible in this copy (presumably each entry
 * and the struct itself are freed -- confirm). */
2519 htb_tc_destroy(struct tc *tc)
2521 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2522 unsigned int queue_id;
2523 struct htb_class *hc;
2525 PORT_ARRAY_FOR_EACH (hc, &htb->tc.queues, queue_id) {
/* Adds a "max-rate" entry to 'details' whose value is the qdisc's 'max_rate'
 * multiplied by 8, formatted as a decimal string allocated by xasprintf(). */
2533 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2535 const struct htb *htb = htb_get__(netdev);
2536 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Applies new qdisc-level 'details' to 'netdev' by re-creating class 1:fffe
 * under 1:0, then records the new maximum rate in the struct htb.  The test
 * that guards the final store is on a line not visible in this copy
 * (presumably it requires the class setup to have succeeded -- confirm). */
2541 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2543 struct htb_class hc;
2546 htb_parse_qdisc_details__(netdev, details, &hc);
2547 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2548 tc_make_handle(1, 0), &hc);
2550 htb_get__(netdev)->max_rate = hc.max_rate;
/* Describes queue 'queue_id' of 'netdev' in 'details'.  "min-rate",
 * "max-rate" and "burst" are the stored values multiplied by 8; "max-rate"
 * is added only when it differs from the minimum rate.  The conditions that
 * guard "burst" and "priority", if any, are on lines not visible in this
 * copy. */
2556 htb_class_get(const struct netdev *netdev, unsigned int queue_id,
2557 struct shash *details)
2559 const struct htb *htb = htb_get__(netdev);
2560 const struct htb_class *hc;
2562 hc = port_array_get(&htb->tc.queues, queue_id);
2565 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2566 if (hc->min_rate != hc->max_rate) {
2567 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2569 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2571 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Creates or reconfigures the HTB class backing 'queue_id' on 'netdev'.
 *
 * 'details' is parsed into 'hc' by htb_parse_class_details__(); the class is
 * then programmed with handle 1:(queue_id + 1) under parent 1:fffe, and the
 * result is recorded locally with htb_update_queue__().
 *
 * NOTE(review): the error checks between these steps are not visible in this
 * excerpt. */
2577 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2578 const struct shash *details)
2580 struct htb_class hc;
2583 error = htb_parse_class_details__(netdev, details, &hc);
/* Queue N maps to class minor number N + 1. */
2588 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2589 tc_make_handle(1, 0xfffe), &hc);
2594 htb_update_queue__(netdev, queue_id, &hc);
/* Removes the HTB class backing 'queue_id' from 'netdev'.
 *
 * Looks up the locally recorded class, asks the kernel to delete class
 * 1:(queue_id + 1) with tc_delete_class(), and drops the entry from the
 * queue array with port_array_delete().
 *
 * NOTE(review): handling of a missing entry or of a failed kernel delete is
 * not visible in this excerpt. */
2599 htb_class_delete(struct netdev *netdev, unsigned int queue_id)
2601 struct htb *htb = htb_get__(netdev);
2602 struct htb_class *hc;
2605 hc = port_array_get(&htb->tc.queues, queue_id);
2608 error = tc_delete_class(netdev, tc_make_handle(1, queue_id + 1));
2611 port_array_delete(&htb->tc.queues, queue_id);
/* Fetches statistics for the HTB class backing 'queue_id' into '*stats' by
 * querying class 1:(queue_id + 1) under parent 1:fffe.  Only statistics are
 * requested: the htb_class output argument is passed as NULL.  Returns
 * whatever htb_query_class__() returns. */
2617 htb_class_get_stats(const struct netdev *netdev, unsigned int queue_id,
2618 struct netdev_queue_stats *stats)
2620 return htb_query_class__(netdev, tc_make_handle(1, queue_id + 1),
2621 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one message from a class dump: parses 'nlmsg' with
 * tc_parse_class() to obtain the class handle and statistics and, when the
 * handle is 1:N with 1 <= N <= HTB_N_QUEUES, invokes 'cb' with queue ID
 * N - 1, the statistics, and 'aux'.  'netdev' is unused. */
2625 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2626 const struct ofpbuf *nlmsg,
2627 netdev_dump_queue_stats_cb *cb, void *aux)
2629 struct netdev_queue_stats stats;
2630 unsigned int handle, major, minor;
/* No options are requested (NULL); only the handle and the statistics. */
2633 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2638 major = tc_get_major(handle);
2639 minor = tc_get_minor(handle);
/* Class minor numbers are queue IDs offset by one. */
2640 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2641 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB qdisc: kernel qdisc name "htb", OVS QoS type
 * "linux-htb", with HTB_N_QUEUES as its queue count. */
2646 static const struct tc_ops tc_ops_htb = {
2647 "htb", /* linux_name */
2648 "linux-htb", /* ovs_name */
2649 HTB_N_QUEUES, /* n_queues */
2658 htb_class_get_stats,
2659 htb_class_dump_stats
2662 /* "linux-default" traffic control class.
2664 * This class represents the default, unnamed Linux qdisc. It corresponds to
2665 * the "" (empty string) QoS type in the OVS database. */
/* Attaches a 'struct tc' that uses tc_ops_default to 'netdev''s device.
 *
 * 'tc' is a function-level static, so the pointer value persists across
 * calls and is what gets stored into netdev_dev->tc.
 * NOTE(review): the allocation is presumably guarded so that it happens only
 * once; the guard is not visible in this excerpt -- confirm. */
2668 default_install__(struct netdev *netdev)
2670 struct netdev_dev_linux *netdev_dev =
2671 netdev_dev_linux_cast(netdev_get_dev(netdev));
2672 static struct tc *tc;
2675 tc = xmalloc(sizeof *tc);
2676 tc_init(tc, &tc_ops_default);
2678 netdev_dev->tc = tc;
/* Install hook for the default qdisc: delegates to default_install__().
 * 'details' is unused. */
2682 default_tc_install(struct netdev *netdev,
2683 const struct shash *details OVS_UNUSED)
2685 default_install__(netdev);
/* Load hook for the default qdisc: delegates to default_install__().
 * 'nlmsg' is unused. */
2690 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2692 default_install__(netdev);
/* Operations table for the default qdisc.  It has no Linux qdisc name, and
 * the destroy, qdisc and class hooks listed below are all NULL. */
2696 static const struct tc_ops tc_ops_default = {
2697 NULL, /* linux_name */
2702 NULL, /* tc_destroy */
2703 NULL, /* qdisc_get */
2704 NULL, /* qdisc_set */
2705 NULL, /* class_get */
2706 NULL, /* class_set */
2707 NULL, /* class_delete */
2708 NULL, /* class_get_stats */
2709 NULL /* class_dump_stats */
2712 /* "linux-other" traffic control class. */
/* Load hook for the "linux-other" class: attaches a 'struct tc' that uses
 * tc_ops_other to 'netdev''s device.  'nlmsg' is unused.
 *
 * 'tc' is a function-level static, so the pointer value persists across
 * calls.  NOTE(review): the allocation is presumably guarded so that it
 * happens only once; the guard is not visible in this excerpt -- confirm. */
2717 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2719 struct netdev_dev_linux *netdev_dev =
2720 netdev_dev_linux_cast(netdev_get_dev(netdev));
2721 static struct tc *tc;
2724 tc = xmalloc(sizeof *tc);
2725 tc_init(tc, &tc_ops_other);
2727 netdev_dev->tc = tc;
/* Operations table for the "linux-other" class.  It has no Linux qdisc name
 * and no install hook (tc_install is NULL), and the destroy, qdisc and class
 * hooks listed below are all NULL as well. */
2731 static const struct tc_ops tc_ops_other = {
2732 NULL, /* linux_name */
2733 "linux-other", /* ovs_name */
2735 NULL, /* tc_install */
2737 NULL, /* tc_destroy */
2738 NULL, /* qdisc_get */
2739 NULL, /* qdisc_set */
2740 NULL, /* class_get */
2741 NULL, /* class_set */
2742 NULL, /* class_delete */
2743 NULL, /* class_get_stats */
2744 NULL /* class_dump_stats */
2747 /* Traffic control. */
2749 /* Number of kernel "tc" ticks per second. */
2750 static double ticks_per_s;
2752 /* Number of kernel "jiffies" per second. This is used for the purpose of
2753 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
2754 * one jiffy's worth of data.
2756 * There are two possibilities here:
2758 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
2759 * approximate range of 100 to 1024. That means that we really need to
2760 * make sure that the qdisc can buffer that much data.
2762 * - 'buffer_hz' is an absurdly large number. That means that the kernel
2763 * has finely granular timers and there's no need to fudge additional room
2764 * for buffers. (There's no extra effort needed to implement that: the
2765 * large 'buffer_hz' is used as a divisor, so practically any number will
2766 * come out as 0 in the division. Small integer results in the case of
2767 * really high dividends won't have any real effect anyhow.) */
2769 static unsigned int buffer_hz;
2771 /* Returns tc handle 'major':'minor'. */
2773 tc_make_handle(unsigned int major, unsigned int minor)
/* 'major' is shifted into bits 16 and up before TC_H_MAKE() merges in
 * 'minor'. */
2775 return TC_H_MAKE(major << 16, minor);
2778 /* Returns the major number from 'handle'. */
2780 tc_get_major(unsigned int handle)
/* TC_H_MAJ() leaves the major part in the upper bits of the handle, so it is
 * shifted down by 16 to yield the plain major number. */
2782 return TC_H_MAJ(handle) >> 16;
2785 /* Returns the minor number from 'handle'. */
2787 tc_get_minor(unsigned int handle)
/* Unlike the major number, the result of TC_H_MIN() is returned without any
 * shift. */
2789 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request of the given 'type' for 'netdev'.
 *
 * Resolves the device's ifindex with get_ifindex(), initializes 'request'
 * with an initial size of 512 bytes, and appends an nlmsghdr (flags
 * NLM_F_REQUEST | 'flags') followed by a zeroed 'struct tcmsg' whose family
 * is AF_UNSPEC and whose ifindex is the device's.
 *
 * The return type is 'struct tcmsg *'; the caller is expected to fill in
 * tcm_handle and tcm_parent, as noted at the end of the body.
 * NOTE(review): the handling of a get_ifindex() failure is not visible in
 * this excerpt; callers presumably must check for a null return -- confirm. */
2792 static struct tcmsg *
2793 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
2794 struct ofpbuf *request)
2796 struct tcmsg *tcmsg;
2800 error = get_ifindex(netdev, &ifindex);
2805 ofpbuf_init(request, 512);
2806 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
2807 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
2808 tcmsg->tcm_family = AF_UNSPEC;
2809 tcmsg->tcm_ifindex = ifindex;
2810 /* Caller should fill in tcmsg->tcm_handle. */
2811 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on rtnl_sock with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request' whether or not the
 * transaction succeeded.  Callers therefore must not use 'request' again. */
2817 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
2819 int error = nl_sock_transact(rtnl_sock, request, replyp);
2820 ofpbuf_uninit(request);
2827 /* The values in psched are not individually very meaningful, but they are
2828 * important. The tables below show some values seen in the wild.
2832 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
2833 * (Before that, there are hints that it was 1000000000.)
2835 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
2839 * -----------------------------------
2840 * [1] 000c8000 000f4240 000f4240 00000064
2841 * [2] 000003e8 00000400 000f4240 3b9aca00
2842 * [3] 000003e8 00000400 000f4240 3b9aca00
2843 * [4] 000003e8 00000400 000f4240 00000064
2844 * [5] 000003e8 00000040 000f4240 3b9aca00
2845 * [6] 000003e8 00000040 000f4240 000000f9
2847 * a b c d ticks_per_s buffer_hz
2848 * ------- --------- ---------- ------------- ----------- -------------
2849 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
2850 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
2851 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
2852 * [4] 1,000 1,024 1,000,000 100 976,562 100
2853 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
2854 * [6] 1,000 64 1,000,000 249 15,625,000 249
2856 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
2857 * [2] 2.6.26-1-686-bigmem from Debian lenny
2858 * [3] 2.6.26-2-sparc64 from Debian lenny
2859 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
2860 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
2861 * [6] 2.6.34 from kernel.org on KVM */
2863 static const char fn[] = "/proc/net/psched";
2864 unsigned int a, b, c, d;
2870 stream = fopen(fn, "r");
2872 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
2876 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
2877 VLOG_WARN("%s: read failed", fn);
2881 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
2885 VLOG_WARN("%s: invalid scheduler parameters", fn);
2889 ticks_per_s = (double) a * c / b;
2893 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
2896 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
2899 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
2900 * rate of 'rate' bytes per second. */
2902 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
/* NOTE(review): 'rate * ticks' is evaluated in unsigned int, so it can wrap
 * around before the division by the double 'ticks_per_s' takes place. */
2907 return (rate * ticks) / ticks_per_s;
2910 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
2911 * rate of 'rate' bytes per second. */
2913 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
/* A 'rate' of 0 yields 0 rather than dividing by zero.  'ticks_per_s' is
 * converted to unsigned long long (dropping any fraction), so the product
 * with 'size' is computed in that wider type. */
2918 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
2921 /* Returns the number of bytes that need to be reserved for qdisc buffering at
2922 * a transmission rate of 'rate' bytes per second. */
2924 tc_buffer_per_jiffy(unsigned int rate)
/* Integer division, so a 'buffer_hz' larger than 'rate' gives 0.
 * NOTE(review): 'buffer_hz' must be nonzero here; its initialization is not
 * visible in this excerpt -- confirm. */
2929 return rate / buffer_hz;
2932 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
2933 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
2934 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
2935 * stores NULL into it if it is absent.
2937 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
2940 * Returns 0 if successful, otherwise a positive errno value. */
2942 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
2943 struct nlattr **options)
/* Attribute policy: TCA_KIND is a mandatory string, TCA_OPTIONS an optional
 * nested attribute. */
2945 static const struct nl_policy tca_policy[] = {
2946 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
2947 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
2949 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes are parsed starting after the Netlink header and the fixed-size
 * 'struct tcmsg'. */
2951 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
2952 tca_policy, ta, ARRAY_SIZE(ta))) {
2953 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
2958 *kind = nl_attr_get_string(ta[TCA_KIND]);
/* TCA_OPTIONS is optional in the policy, so this stores the parsed attribute
 * pointer as-is. */
2962 *options = ta[TCA_OPTIONS];
2977 /* Given Netlink 'msg' that describes a class, extracts its class handle
2978 * (tcm_handle, i.e. major and minor numbers together) into '*handlep', its
2979 * TCA_OPTIONS attribute into '*options', and its queue statistics into
2980 * '*stats'. Any of the output arguments may be null.
2982 * Returns 0 if successful, otherwise a positive errno value. */
2984 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
2985 struct nlattr **options, struct netdev_queue_stats *stats)
/* Attribute policy: both TCA_OPTIONS and TCA_STATS2 are mandatory nested
 * attributes, so a message lacking either fails to parse. */
2987 static const struct nl_policy tca_policy[] = {
2988 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
2989 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
2991 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes are parsed starting after the Netlink header and the fixed-size
 * 'struct tcmsg'. */
2993 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
2994 tca_policy, ta, ARRAY_SIZE(ta))) {
2995 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The full handle (not just its minor number) is reported to the caller. */
3000 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3001 *handlep = tc->tcm_handle;
3005 *options = ta[TCA_OPTIONS];
3009 const struct gnet_stats_queue *gsq;
3010 struct gnet_stats_basic gsb;
/* Nested policy for TCA_STATS2: both the basic and the queue statistics are
 * mandatory and must be at least as large as their structures. */
3012 static const struct nl_policy stats_policy[] = {
3013 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3014 .min_len = sizeof gsb },
3015 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3016 .min_len = sizeof *gsq },
3018 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3020 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3021 sa, ARRAY_SIZE(sa))) {
3022 VLOG_WARN_RL(&rl, "failed to parse class stats");
3026 /* Alignment issues screw up the length of struct gnet_stats_basic on
3027 * some arch/bitsize combinations. Newer versions of Linux have a
3028 * struct gnet_stats_basic_packed, but we can't depend on that. The
3029 * easiest thing to do is just to make a copy. */
3030 memset(&gsb, 0, sizeof gsb);
3031 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3032 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3033 stats->tx_bytes = gsb.bytes;
3034 stats->tx_packets = gsb.packets;
/* The queue's drop counter is what gets reported as tx_errors. */
3036 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3037 stats->tx_errors = gsq->drops;
/* NOTE(review): presumably on a failure path, so that callers never see
 * uninitialized statistics; the surrounding control flow is not visible in
 * this excerpt -- confirm. */
3047 memset(stats, 0, sizeof *stats);
3052 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev', passing 'replyp' to tc_transact() to receive the reply. */
/* Sends an RTM_GETTCLASS request (with NLM_F_ECHO) for the class 'handle'
 * under 'parent' on 'netdev' and passes 'replyp' to tc_transact() to receive
 * the reply.  The warning below is rate-limited and names the device and
 * both handles as major:minor pairs. */
3055 tc_query_class(const struct netdev *netdev,
3056 unsigned int handle, unsigned int parent,
3057 struct ofpbuf **replyp)
3059 struct ofpbuf request;
3060 struct tcmsg *tcmsg;
3063 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3067 tcmsg->tcm_handle = handle;
3068 tcmsg->tcm_parent = parent;
3070 error = tc_transact(&request, replyp);
3072 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3073 netdev_get_name(netdev),
3074 tc_get_major(handle), tc_get_minor(handle),
3075 tc_get_major(parent), tc_get_minor(parent),
3081 /* Equivalent to "tc class del dev <name> handle <handle>". */
3083 tc_delete_class(const struct netdev *netdev, unsigned int handle)
/* Builds an RTM_DELTCLASS request for 'handle' with parent 0.  No reply is
 * requested: NULL is passed to tc_transact().  The warning below is
 * rate-limited. */
3085 struct ofpbuf request;
3086 struct tcmsg *tcmsg;
3089 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3093 tcmsg->tcm_handle = handle;
3094 tcmsg->tcm_parent = 0;
3096 error = tc_transact(&request, NULL);
3098 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3099 netdev_get_name(netdev),
3100 tc_get_major(handle), tc_get_minor(handle),
3106 /* Equivalent to "tc qdisc del dev <name> root". */
3108 tc_del_qdisc(struct netdev *netdev)
/* Requests deletion of the qdisc with handle 1:0 at TC_H_ROOT.  When the
 * outcome counts as success and the device has a tc object, that object's
 * tc_destroy hook (if it has one) is run and netdev_dev->tc is cleared, so
 * the qdisc type is no longer cached. */
3110 struct netdev_dev_linux *netdev_dev =
3111 netdev_dev_linux_cast(netdev_get_dev(netdev));
3112 struct ofpbuf request;
3113 struct tcmsg *tcmsg;
3116 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3120 tcmsg->tcm_handle = tc_make_handle(1, 0);
3121 tcmsg->tcm_parent = TC_H_ROOT;
3123 error = tc_transact(&request, NULL);
3124 if (error == EINVAL) {
3125 /* EINVAL probably means that the default qdisc was in use, in which
3126 * case we've accomplished our purpose. */
3129 if (!error && netdev_dev->tc) {
/* tc_destroy is optional; some tc_ops tables leave it NULL. */
3130 if (netdev_dev->tc->ops->tc_destroy) {
3131 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3133 netdev_dev->tc = NULL;
3138 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3139 * kernel to determine what they are. Returns 0 if successful, otherwise a
3140 * positive errno value. */
3142 tc_query_qdisc(const struct netdev *netdev)
3144 struct netdev_dev_linux *netdev_dev =
3145 netdev_dev_linux_cast(netdev_get_dev(netdev));
3146 struct ofpbuf request, *qdisc;
3147 const struct tc_ops *ops;
3148 struct tcmsg *tcmsg;
/* A non-null netdev_dev->tc means the qdisc type has already been
 * determined. */
3152 if (netdev_dev->tc) {
3156 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3157 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3158 * 2.6.35 without that fix backported to it.
3160 * To avoid the OOPS, we must not make a request that would attempt to dump
3161 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3162 * few others. There are a few ways that I can see to do this, but most of
3163 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3164 * technique chosen here is to assume that any non-default qdisc that we
3165 * create will have a class with handle 1:0. The built-in qdiscs only have
3166 * a class with handle 0:0.
3168 * We could check for Linux 2.6.35+ and use a more straightforward method
3170 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3174 tcmsg->tcm_handle = tc_make_handle(1, 0);
3175 tcmsg->tcm_parent = 0;
3177 /* Figure out what tc class to instantiate. */
3178 error = tc_transact(&request, &qdisc);
/* Only the qdisc's kind is needed here; its options are not requested. */
3182 error = tc_parse_qdisc(qdisc, &kind, NULL);
3184 ops = &tc_ops_other;
3186 ops = tc_lookup_linux_name(kind);
3188 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3189 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3191 ops = &tc_ops_other;
3194 } else if (error == ENOENT) {
3195 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3196 * other entity that doesn't have a handle 1:0. We will assume
3197 * that it's the system default qdisc. */
3198 ops = &tc_ops_default;
3201 /* Who knows? Maybe the device got deleted. */
3202 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3203 netdev_get_name(netdev), strerror(error));
3204 ops = &tc_ops_other;
3207 /* Instantiate it. */
/* The const qualifier is cast away because tc_load() takes a non-const
 * netdev. */
3208 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
/* tc_load() must leave netdev_dev->tc set exactly when it reports success. */
3209 assert((load_error == 0) == (netdev_dev->tc != NULL));
3210 ofpbuf_delete(qdisc);
/* An error from the query takes precedence over one from tc_load(). */
3212 return error ? error : load_error;
3215 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3216 approximate the time to transmit packets of various lengths. For an MTU of
3217 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3218 represents two possible packet lengths; for an MTU of 513 through 1024, four
3219 possible lengths; and so on.
3221 Returns, for the specified 'mtu', the number of bits that packet lengths
3222 need to be shifted right to fit within such a 256-entry table. */
3224 tc_calc_cell_log(unsigned int mtu)
/* Fallback value for 'mtu'.  NOTE(review): the condition that selects the
 * fallback is not visible in this excerpt -- confirm. */
3229 mtu = ETH_PAYLOAD_MAX;
/* The Ethernet and VLAN headers are added on top of 'mtu'. */
3231 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Counts iterations while 'mtu' is still 256 or more.  NOTE(review): the
 * loop body is not visible here; presumably it halves 'mtu' -- confirm. */
3233 for (cell_log = 0; mtu >= 256; cell_log++) {
3240 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'. */
/* Fills in '*rate' for an MTU of 'mtu': zeroes the whole structure, sets
 * 'cell_log' from tc_calc_cell_log(), and sets 'mpu' to ETH_TOTAL_MIN.
 * 'overhead' and 'cell_align' are left at the zero written by memset().
 *
 * NOTE(review): the statement that stores 'Bps' into '*rate' is not visible
 * in this excerpt -- confirm. */
3243 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3245 memset(rate, 0, sizeof *rate);
3246 rate->cell_log = tc_calc_cell_log(mtu);
3247 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3248 /* rate->cell_align = 0; */ /* distro headers. */
3249 rate->mpu = ETH_TOTAL_MIN;
3253 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3254 * attribute of the specified "type".
3256 * See tc_calc_cell_log() above for a description of "rtab"s. */
3258 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserves TC_RTAB_SIZE bytes in 'msg' as an attribute of 'type'; the loop
 * below then fills in every entry. */
3263 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3264 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry 'i' stands for a packet of (i + 1) << cell_log bytes. */
3265 unsigned packet_size = (i + 1) << rate->cell_log;
/* Sizes below rate->mpu are rounded up to it. */
3266 if (packet_size < rate->mpu) {
3267 packet_size = rate->mpu;
3269 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3273 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3274 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3275 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 is fine: the computed minimum burst is then used instead.) */
/* Returns, as a number of ticks at rate 'Bps', the larger of 'burst_bytes'
 * and a minimum burst of tc_buffer_per_jiffy(Bps) + 'mtu' bytes. */
3278 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
3280 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
3281 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3285 /* Utility functions. */
/* Retrieves statistics for the device with index 'ifindex' into '*stats' by
 * sending an RTM_GETLINK request on rtnl_sock and copying the fields of the
 * IFLA_STATS attribute ('struct rtnl_link_stats') out of the reply.
 *
 * The reply is freed on every exit path visible here: after a policy parse
 * failure, when IFLA_STATS is absent, and after the fields are copied. */
3288 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3290 /* Policy for RTNLGRP_LINK messages.
3292 * There are *many* more fields in these messages, but currently we only
3293 * care about these fields. */
3294 static const struct nl_policy rtnlgrp_link_policy[] = {
3295 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3296 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3297 .min_len = sizeof(struct rtnl_link_stats) },
3300 struct ofpbuf request;
3301 struct ofpbuf *reply;
3302 struct ifinfomsg *ifi;
3303 const struct rtnl_link_stats *rtnl_stats;
3304 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
3307 ofpbuf_init(&request, 0);
3308 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3309 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3310 ifi->ifi_family = PF_UNSPEC;
3311 ifi->ifi_index = ifindex;
3312 error = nl_sock_transact(rtnl_sock, &request, &reply);
/* The request buffer is released as soon as the transaction returns. */
3313 ofpbuf_uninit(&request);
3318 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3319 rtnlgrp_link_policy,
3320 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3321 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence is checked
 * explicitly. */
3325 if (!attrs[IFLA_STATS]) {
3326 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3327 ofpbuf_delete(reply);
/* Field-by-field copy from the kernel's structure into '*stats'. */
3331 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3332 stats->rx_packets = rtnl_stats->rx_packets;
3333 stats->tx_packets = rtnl_stats->tx_packets;
3334 stats->rx_bytes = rtnl_stats->rx_bytes;
3335 stats->tx_bytes = rtnl_stats->tx_bytes;
3336 stats->rx_errors = rtnl_stats->rx_errors;
3337 stats->tx_errors = rtnl_stats->tx_errors;
3338 stats->rx_dropped = rtnl_stats->rx_dropped;
3339 stats->tx_dropped = rtnl_stats->tx_dropped;
3340 stats->multicast = rtnl_stats->multicast;
3341 stats->collisions = rtnl_stats->collisions;
3342 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3343 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3344 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3345 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3346 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3347 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3348 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3349 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3350 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3351 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3352 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3354 ofpbuf_delete(reply);
/* Retrieves statistics for the device named 'netdev_name' into '*stats' by
 * scanning /proc/net/dev one line at a time.
 *
 * A line counts as parsed when 15 items are converted; unparsable lines are
 * logged with a rate-limited warning.  For the line whose device name equals
 * 'netdev_name', the counters listed in that branch are set to UINT64_MAX.
 *
 * NOTE(review): the scan arguments point directly into '*stats', and the
 * device-name comparison comes after the scan, so '*stats' appears to be
 * overwritten by lines for other devices too -- confirm that callers do not
 * rely on its contents when no matching device is found. */
3360 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3362 static const char fn[] = "/proc/net/dev";
3367 stream = fopen(fn, "r");
3369 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3374 while (fgets(line, sizeof line, stream)) {
3377 #define X64 "%"SCNu64
3380 X64 X64 X64 X64 X64 X64 X64 "%*u"
3381 X64 X64 X64 X64 X64 X64 X64 "%*u",
3387 &stats->rx_fifo_errors,
3388 &stats->rx_frame_errors,
3394 &stats->tx_fifo_errors,
3396 &stats->tx_carrier_errors) != 15) {
3397 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
3398 } else if (!strcmp(devname, netdev_name)) {
3399 stats->rx_length_errors = UINT64_MAX;
3400 stats->rx_over_errors = UINT64_MAX;
3401 stats->rx_crc_errors = UINT64_MAX;
3402 stats->rx_missed_errors = UINT64_MAX;
3403 stats->tx_aborted_errors = UINT64_MAX;
3404 stats->tx_heartbeat_errors = UINT64_MAX;
3405 stats->tx_window_errors = UINT64_MAX;
3411 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'netdev' with the SIOCGIFFLAGS ioctl (issued
 * through netdev_linux_do_ioctl()) and stores ifr.ifr_flags in '*flags'.
 *
 * NOTE(review): the store into '*flags' directly follows the ioctl call, so
 * it appears to happen even when the ioctl fails -- confirm that callers
 * check the error before using '*flags'. */
3417 get_flags(const struct netdev *netdev, int *flags)
3422 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
3424 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS
 * ioctl and returns the result of netdev_linux_do_ioctl(). */
3429 set_flags(struct netdev *netdev, int flags)
3433 ifr.ifr_flags = flags;
3434 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with the
 * SIOCGIFINDEX ioctl on af_inet_sock and returns ifr.ifr_ifindex.  An ioctl
 * failure is logged with a rate-limited warning.
 *
 * NOTE(review): the value returned after a failed ioctl is not visible in
 * this excerpt.  Also, strncpy() leaves ifr_name without a terminating NUL
 * when 'netdev_name' is at least sizeof ifr.ifr_name bytes long. */
3439 do_get_ifindex(const char *netdev_name)
3443 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3444 COVERAGE_INC(netdev_get_ifindex);
3445 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
3446 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
3447 netdev_name, strerror(errno));
3450 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp', using the value
 * cached in the device when the VALID_IFINDEX bit of 'cache_valid' is set.
 * Otherwise the index is fetched with do_get_ifindex(), stored in
 * netdev_dev->ifindex, and VALID_IFINDEX is set.
 *
 * NOTE(review): the handling of a failed do_get_ifindex() is not visible in
 * this excerpt. */
3454 get_ifindex(const struct netdev *netdev_, int *ifindexp)
3456 struct netdev_dev_linux *netdev_dev =
3457 netdev_dev_linux_cast(netdev_get_dev(netdev_));
3459 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
3460 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
3464 netdev_dev->cache_valid |= VALID_IFINDEX;
3465 netdev_dev->ifindex = ifindex;
3467 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl on af_inet_sock and copies its first ETH_ADDR_LEN
 * bytes into 'ea'.  An ioctl failure is logged as an error.
 *
 * An address family other than AF_UNSPEC or ARPHRD_ETHER only produces a
 * warning.  NOTE(review): the copy into 'ea' appears to happen for such
 * families as well -- confirm. */
3472 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
3477 memset(&ifr, 0, sizeof ifr);
3478 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3479 COVERAGE_INC(netdev_get_hwaddr);
3480 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
3481 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
3482 netdev_name, strerror(errno));
3485 hwaddr_family = ifr.ifr_hwaddr.sa_family;
3486 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
3487 VLOG_WARN("%s device has unknown hardware address family %d",
3488 netdev_name, hwaddr_family);
3490 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac', tagged with address family 'hwaddr_family',
 * using the SIOCSIFHWADDR ioctl on af_inet_sock.  An ioctl failure is logged
 * as an error. */
3495 set_etheraddr(const char *netdev_name, int hwaddr_family,
3496 const uint8_t mac[ETH_ADDR_LEN])
3500 memset(&ifr, 0, sizeof ifr);
3501 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3502 ifr.ifr_hwaddr.sa_family = hwaddr_family;
3503 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
3504 COVERAGE_INC(netdev_set_hwaddr);
3505 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
3506 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
3507 netdev_name, strerror(errno));
/* Issues an ethtool request for the device named 'name' through the
 * SIOCETHTOOL ioctl on af_inet_sock, with 'ecmd' attached as the ioctl data.
 * 'cmd_name' is used only in the log message.
 *
 * A failure is logged with a rate-limited warning unless errno is
 * EOPNOTSUPP, which is not logged at all.
 * NOTE(review): the statement that stores 'cmd' into '*ecmd' is not visible
 * in this excerpt -- confirm. */
3514 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
3515 int cmd, const char *cmd_name)
3519 memset(&ifr, 0, sizeof ifr);
3520 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
3521 ifr.ifr_data = (caddr_t) ecmd;
3524 COVERAGE_INC(netdev_ethtool);
3525 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
3528 if (errno != EOPNOTSUPP) {
3529 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
3530 "failed: %s", cmd_name, name, strerror(errno));
3532 /* The device doesn't support this operation. That's pretty
3533 * common, so there's no point in logging anything. */
/* Issues ioctl 'cmd' on af_inet_sock for the device named 'name'.  The name
 * is copied into ifr->ifr_name first; the other fields of '*ifr' are left as
 * the caller set them.  'cmd_name' is used only in the rate-limited debug
 * message logged when the ioctl returns -1. */
3540 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
3541 const char *cmd_name)
3543 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
3544 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
3545 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
3553 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
3554 int cmd, const char *cmd_name)
3559 ifr.ifr_addr.sa_family = AF_INET;
3560 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
3562 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
3563 *ip = sin->sin_addr;