2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
40 #include <net/ethernet.h>
42 #include <linux/if_tunnel.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_get_vlan_vid);
77 COVERAGE_DEFINE(netdev_set_policing);
78 COVERAGE_DEFINE(netdev_arp_lookup);
79 COVERAGE_DEFINE(netdev_get_ifindex);
80 COVERAGE_DEFINE(netdev_get_hwaddr);
81 COVERAGE_DEFINE(netdev_set_hwaddr);
82 COVERAGE_DEFINE(netdev_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
96 #define TC_RTAB_SIZE 1024
99 static struct rtnetlink_notifier netdev_linux_cache_notifier;
100 static int cache_notifier_refcount;
103 VALID_IFINDEX = 1 << 0,
104 VALID_ETHERADDR = 1 << 1,
108 VALID_CARRIER = 1 << 5,
109 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
110 VALID_POLICING = 1 << 7,
111 VALID_HAVE_VPORT_STATS = 1 << 8
119 /* Traffic control. */
121 /* An instance of a traffic control class. Always associated with a particular
124 * Each TC implementation subclasses this with whatever additional data it
127 const struct tc_ops *ops;
128 struct hmap queues; /* Contains "struct tc_queue"s.
129 * Read by generic TC layer.
130 * Written only by TC implementation. */
133 /* One traffic control queue.
135 * Each TC implementation subclasses this with whatever additional data it
138 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
139 unsigned int queue_id; /* OpenFlow queue ID. */
142 /* A particular kind of traffic control. Each implementation generally maps to
143 * one particular Linux qdisc class.
145 * The functions below return 0 if successful or a positive errno value on
146 * failure, except where otherwise noted. All of them must be provided, except
147 * where otherwise noted. */
149 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
150 * This is null for tc_ops_default and tc_ops_other, for which there are no
151 * appropriate values. */
152 const char *linux_name;
154 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
155 const char *ovs_name;
157 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
158 * queues. The queues are numbered 0 through n_queues - 1. */
159 unsigned int n_queues;
161 /* Called to install this TC class on 'netdev'. The implementation should
162 * make the Netlink calls required to set up 'netdev' with the right qdisc
163 * and configure it according to 'details'. The implementation may assume
164 * that the current qdisc is the default; that is, there is no need for it
165 * to delete the current qdisc before installing itself.
167 * The contents of 'details' should be documented as valid for 'ovs_name'
168 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
169 * (which is built as ovs-vswitchd.conf.db(8)).
171 * This function must return 0 if and only if it sets 'netdev->tc' to an
172 * initialized 'struct tc'.
174 * (This function is null for tc_ops_other, which cannot be installed. For
175 * other TC classes it should always be nonnull.) */
176 int (*tc_install)(struct netdev *netdev, const struct shash *details);
178 /* Called when the netdev code determines (through a Netlink query) that
179 * this TC class's qdisc is installed on 'netdev', but we didn't install
180 * it ourselves and so don't know any of the details.
182 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
183 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
184 * implementation should parse the other attributes of 'nlmsg' as
185 * necessary to determine its configuration. If necessary it should also
186 * use Netlink queries to determine the configuration of queues on
189 * This function must return 0 if and only if it sets 'netdev->tc' to an
190 * initialized 'struct tc'. */
191 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
193 /* Destroys the data structures allocated by the implementation as part of
194 * 'tc'. (This includes destroying 'tc->queues' by calling
197 * The implementation should not need to perform any Netlink calls. If
198 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
199 * (But it may not be desirable.)
201 * This function may be null if 'tc' is trivial. */
202 void (*tc_destroy)(struct tc *tc);
204 /* Retrieves details of 'netdev->tc' configuration into 'details'.
206 * The implementation should not need to perform any Netlink calls, because
207 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
208 * cached the configuration.
210 * The contents of 'details' should be documented as valid for 'ovs_name'
211 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
212 * (which is built as ovs-vswitchd.conf.db(8)).
214 * This function may be null if 'tc' is not configurable.
216 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
218 /* Reconfigures 'netdev->tc' according to 'details', performing any
219 * required Netlink calls to complete the reconfiguration.
221 * The contents of 'details' should be documented as valid for 'ovs_name'
222 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
223 * (which is built as ovs-vswitchd.conf.db(8)).
225 * This function may be null if 'tc' is not configurable.
227 int (*qdisc_set)(struct netdev *, const struct shash *details);
229 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
230 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
232 * The contents of 'details' should be documented as valid for 'ovs_name'
233 * in the "other_config" column in the "Queue" table in
234 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
236 * The implementation should not need to perform any Netlink calls, because
237 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
238 * cached the queue configuration.
240 * This function may be null if 'tc' does not have queues ('n_queues' is
242 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
243 struct shash *details);
245 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
246 * 'details', performing any required Netlink calls to complete the
247 * reconfiguration. The caller ensures that 'queue_id' is less than
250 * The contents of 'details' should be documented as valid for 'ovs_name'
251 * in the "other_config" column in the "Queue" table in
252 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
254 * This function may be null if 'tc' does not have queues or its queues are
255 * not configurable. */
256 int (*class_set)(struct netdev *, unsigned int queue_id,
257 const struct shash *details);
259 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
260 * tc_queue's within 'netdev->tc->queues'.
262 * This function may be null if 'tc' does not have queues or its queues
263 * cannot be deleted. */
264 int (*class_delete)(struct netdev *, struct tc_queue *queue);
266 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
267 * 'struct tc_queue's within 'netdev->tc->queues'.
269 * On success, initializes '*stats'.
271 * This function may be null if 'tc' does not have queues or if it cannot
272 * report queue statistics. */
273 int (*class_get_stats)(const struct netdev *netdev,
274 const struct tc_queue *queue,
275 struct netdev_queue_stats *stats);
277 /* Extracts queue stats from 'nlmsg', which is a response to a
278 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
280 * This function may be null if 'tc' does not have queues or if it cannot
281 * report queue statistics. */
282 int (*class_dump_stats)(const struct netdev *netdev,
283 const struct ofpbuf *nlmsg,
284 netdev_dump_queue_stats_cb *cb, void *aux);
288 tc_init(struct tc *tc, const struct tc_ops *ops)
291 hmap_init(&tc->queues);
295 tc_destroy(struct tc *tc)
297 hmap_destroy(&tc->queues);
300 static const struct tc_ops tc_ops_htb;
301 static const struct tc_ops tc_ops_hfsc;
302 static const struct tc_ops tc_ops_default;
303 static const struct tc_ops tc_ops_other;
305 static const struct tc_ops *tcs[] = {
306 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
307 &tc_ops_hfsc, /* Hierarchical fair service curve. */
308 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
309 &tc_ops_other, /* Some other qdisc. */
313 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
314 static unsigned int tc_get_major(unsigned int handle);
315 static unsigned int tc_get_minor(unsigned int handle);
317 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
318 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
319 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
321 static struct tcmsg *tc_make_request(const struct netdev *, int type,
322 unsigned int flags, struct ofpbuf *);
323 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
325 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
326 struct nlattr **options);
327 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
328 struct nlattr **options,
329 struct netdev_queue_stats *);
330 static int tc_query_class(const struct netdev *,
331 unsigned int handle, unsigned int parent,
332 struct ofpbuf **replyp);
333 static int tc_delete_class(const struct netdev *, unsigned int handle);
335 static int tc_del_qdisc(struct netdev *netdev);
336 static int tc_query_qdisc(const struct netdev *netdev);
338 static int tc_calc_cell_log(unsigned int mtu);
339 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
340 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
341 const struct tc_ratespec *rate);
342 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
344 struct netdev_dev_linux {
345 struct netdev_dev netdev_dev;
347 struct shash_node *shash_node;
348 unsigned int cache_valid;
349 unsigned int change_seq;
351 bool miimon; /* Link status of last poll. */
352 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
353 struct timer miimon_timer;
355 /* The following are figured out "on demand" only. They are only valid
356 * when the corresponding VALID_* bit in 'cache_valid' is set. */
358 uint8_t etheraddr[ETH_ADDR_LEN];
359 struct in_addr address, netmask;
363 bool is_internal; /* Is this an openvswitch internal device? */
364 bool is_tap; /* Is this a tuntap device? */
365 uint32_t kbits_rate; /* Policing data. */
366 uint32_t kbits_burst;
367 bool have_vport_stats;
371 struct tap_state tap;
375 struct netdev_linux {
376 struct netdev netdev;
380 /* Sockets used for ioctl operations. */
381 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
383 /* A Netlink routing socket that is not subscribed to any multicast groups. */
384 static struct nl_sock *rtnl_sock;
386 /* This is set pretty low because we probably won't learn anything from the
387 * additional log messages. */
388 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
390 static int netdev_linux_init(void);
392 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
393 int cmd, const char *cmd_name);
394 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
395 const char *cmd_name);
396 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
397 int cmd, const char *cmd_name);
398 static int get_flags(const struct netdev *, int *flagsp);
399 static int set_flags(struct netdev *, int flags);
400 static int do_get_ifindex(const char *netdev_name);
401 static int get_ifindex(const struct netdev *, int *ifindexp);
402 static int do_set_addr(struct netdev *netdev,
403 int ioctl_nr, const char *ioctl_name,
404 struct in_addr addr);
405 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
406 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
407 const uint8_t[ETH_ADDR_LEN]);
408 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
409 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
410 static int af_packet_sock(void);
411 static void netdev_linux_miimon_run(void);
412 static void netdev_linux_miimon_wait(void);
415 is_netdev_linux_class(const struct netdev_class *netdev_class)
417 return netdev_class->init == netdev_linux_init;
420 static struct netdev_dev_linux *
421 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
423 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
424 assert(is_netdev_linux_class(netdev_class));
426 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
429 static struct netdev_linux *
430 netdev_linux_cast(const struct netdev *netdev)
432 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
433 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
434 assert(is_netdev_linux_class(netdev_class));
436 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
440 netdev_linux_init(void)
442 static int status = -1;
444 /* Create AF_INET socket. */
445 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
446 status = af_inet_sock >= 0 ? 0 : errno;
448 VLOG_ERR("failed to create inet socket: %s", strerror(status));
451 /* Create rtnetlink socket. */
453 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
455 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
464 netdev_linux_run(void)
466 rtnetlink_link_notifier_run();
467 netdev_linux_miimon_run();
471 netdev_linux_wait(void)
473 rtnetlink_link_notifier_wait();
474 netdev_linux_miimon_wait();
478 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
481 if (!dev->change_seq) {
484 dev->cache_valid = 0;
488 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
489 void *aux OVS_UNUSED)
491 struct netdev_dev_linux *dev;
493 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
495 const struct netdev_class *netdev_class =
496 netdev_dev_get_class(base_dev);
498 if (is_netdev_linux_class(netdev_class)) {
499 dev = netdev_dev_linux_cast(base_dev);
500 netdev_dev_linux_changed(dev);
504 struct shash device_shash;
505 struct shash_node *node;
507 shash_init(&device_shash);
508 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
509 SHASH_FOR_EACH (node, &device_shash) {
511 netdev_dev_linux_changed(dev);
513 shash_destroy(&device_shash);
517 /* Creates system and internal devices. */
519 netdev_linux_create(const struct netdev_class *class,
520 const char *name, const struct shash *args,
521 struct netdev_dev **netdev_devp)
523 struct netdev_dev_linux *netdev_dev;
526 if (!shash_is_empty(args)) {
527 VLOG_WARN("%s: arguments for %s devices should be empty",
531 if (!cache_notifier_refcount) {
532 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
533 netdev_linux_cache_cb, NULL);
538 cache_notifier_refcount++;
540 netdev_dev = xzalloc(sizeof *netdev_dev);
541 netdev_dev->change_seq = 1;
542 netdev_dev_init(&netdev_dev->netdev_dev, name, args, class);
544 *netdev_devp = &netdev_dev->netdev_dev;
548 /* For most types of netdevs we open the device for each call of
549 * netdev_open(). However, this is not the case with tap devices,
550 * since it is only possible to open the device once. In this
551 * situation we share a single file descriptor, and consequently
552 * buffers, across all readers. Therefore once data is read it will
553 * be unavailable to other reads for tap devices. */
555 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
556 const char *name, const struct shash *args,
557 struct netdev_dev **netdev_devp)
559 struct netdev_dev_linux *netdev_dev;
560 struct tap_state *state;
561 static const char tap_dev[] = "/dev/net/tun";
565 if (!shash_is_empty(args)) {
566 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
569 netdev_dev = xzalloc(sizeof *netdev_dev);
570 state = &netdev_dev->state.tap;
572 /* Open tap device. */
573 state->fd = open(tap_dev, O_RDWR);
576 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
580 /* Create tap device. */
581 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
582 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
583 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
584 VLOG_WARN("%s: creating tap device failed: %s", name,
590 /* Make non-blocking. */
591 error = set_nonblocking(state->fd);
596 netdev_dev_init(&netdev_dev->netdev_dev, name, args, &netdev_tap_class);
597 *netdev_devp = &netdev_dev->netdev_dev;
606 destroy_tap(struct netdev_dev_linux *netdev_dev)
608 struct tap_state *state = &netdev_dev->state.tap;
610 if (state->fd >= 0) {
615 /* Destroys the netdev device 'netdev_dev_'. */
617 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
619 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
620 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
622 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
623 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
626 if (class == &netdev_linux_class || class == &netdev_internal_class) {
627 cache_notifier_refcount--;
629 if (!cache_notifier_refcount) {
630 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
632 } else if (class == &netdev_tap_class) {
633 destroy_tap(netdev_dev);
642 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
644 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
645 struct netdev_linux *netdev;
646 enum netdev_flags flags;
649 /* Allocate network device. */
650 netdev = xzalloc(sizeof *netdev);
652 netdev_init(&netdev->netdev, netdev_dev_);
654 /* Verify that the device really exists, by attempting to read its flags.
655 * (The flags might be cached, in which case this won't actually do an
658 * Don't do this for "internal" netdevs, though, because those have to be
659 * created as netdev objects before they exist in the kernel, because
660 * creating them in the kernel happens by passing a netdev object to
661 * dpif_port_add(). */
662 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
663 error = netdev_get_flags(&netdev->netdev, &flags);
664 if (error == ENODEV) {
669 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
670 !netdev_dev->state.tap.opened) {
672 /* We assume that the first user of the tap device is the primary user
673 * and give them the tap FD. Subsequent users probably just expect
674 * this to be a system device so open it normally to avoid send/receive
675 * directions appearing to be reversed. */
676 netdev->fd = netdev_dev->state.tap.fd;
677 netdev_dev->state.tap.opened = true;
680 *netdevp = &netdev->netdev;
684 netdev_uninit(&netdev->netdev, true);
688 /* Closes and destroys 'netdev'. */
690 netdev_linux_close(struct netdev *netdev_)
692 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
694 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
700 /* Initializes 'sset' with a list of the names of all known network devices. */
702 netdev_linux_enumerate(struct sset *sset)
704 struct if_nameindex *names;
706 names = if_nameindex();
710 for (i = 0; names[i].if_name != NULL; i++) {
711 sset_add(sset, names[i].if_name);
713 if_freenameindex(names);
716 VLOG_WARN("could not obtain list of network device names: %s",
723 netdev_linux_listen(struct netdev *netdev_)
725 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
726 struct sockaddr_ll sll;
731 if (netdev->fd >= 0) {
735 /* Create file descriptor. */
736 fd = socket(PF_PACKET, SOCK_RAW, 0);
739 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
743 /* Set non-blocking mode. */
744 error = set_nonblocking(fd);
749 /* Get ethernet device index. */
750 error = get_ifindex(&netdev->netdev, &ifindex);
755 /* Bind to specific ethernet device. */
756 memset(&sll, 0, sizeof sll);
757 sll.sll_family = AF_PACKET;
758 sll.sll_ifindex = ifindex;
759 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
760 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
762 VLOG_ERR("%s: failed to bind raw socket (%s)",
763 netdev_get_name(netdev_), strerror(error));
778 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
780 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
782 if (netdev->fd < 0) {
783 /* Device is not listening. */
788 ssize_t retval = read(netdev->fd, data, size);
791 } else if (errno != EINTR) {
792 if (errno != EAGAIN) {
793 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
794 netdev_get_name(netdev_), strerror(errno));
801 /* Registers with the poll loop to wake up from the next call to poll_block()
802 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
804 netdev_linux_recv_wait(struct netdev *netdev_)
806 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
807 if (netdev->fd >= 0) {
808 poll_fd_wait(netdev->fd, POLLIN);
812 /* Discards all packets waiting to be received from 'netdev'. */
814 netdev_linux_drain(struct netdev *netdev_)
816 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
817 if (netdev->fd < 0) {
819 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
821 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
822 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
826 drain_fd(netdev->fd, ifr.ifr_qlen);
829 return drain_rcvbuf(netdev->fd);
833 /* Sends 'data' on 'netdev'. Returns 0 if successful, otherwise a positive
834 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
835 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
836 * the packet is too big or too small to transmit on the device.
838 * The caller retains ownership of 'data' in all cases.
840 * The kernel maintains a packet transmission queue, so the caller is not
841 * expected to do additional queuing of packets. */
843 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
845 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
849 if (netdev->fd < 0) {
850 /* Use our AF_PACKET socket to send to this device. */
851 struct sockaddr_ll sll;
858 sock = af_packet_sock();
863 error = get_ifindex(netdev_, &ifindex);
868 /* We don't bother setting most fields in sockaddr_ll because the
869 * kernel ignores them for SOCK_RAW. */
870 memset(&sll, 0, sizeof sll);
871 sll.sll_family = AF_PACKET;
872 sll.sll_ifindex = ifindex;
874 iov.iov_base = (void *) data;
878 msg.msg_namelen = sizeof sll;
881 msg.msg_control = NULL;
882 msg.msg_controllen = 0;
885 retval = sendmsg(sock, &msg, 0);
887 /* Use the netdev's own fd to send to this device. This is
888 * essential for tap devices, because packets sent to a tap device
889 * with an AF_PACKET socket will loop back to be *received* again
890 * on the tap device. */
891 retval = write(netdev->fd, data, size);
895 /* The Linux AF_PACKET implementation never blocks waiting for room
896 * for packets, instead returning ENOBUFS. Translate this into
897 * EAGAIN for the caller. */
898 if (errno == ENOBUFS) {
900 } else if (errno == EINTR) {
902 } else if (errno != EAGAIN) {
903 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
904 netdev_get_name(netdev_), strerror(errno));
907 } else if (retval != size) {
908 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
909 "%zu) on %s", retval, size, netdev_get_name(netdev_));
917 /* Registers with the poll loop to wake up from the next call to poll_block()
918 * when the packet transmission queue has sufficient room to transmit a packet
919 * with netdev_send().
921 * The kernel maintains a packet transmission queue, so the client is not
922 * expected to do additional queuing of packets. Thus, this function is
923 * unlikely to ever be used. It is included for completeness. */
925 netdev_linux_send_wait(struct netdev *netdev_)
927 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
928 if (netdev->fd < 0) {
930 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
931 poll_fd_wait(netdev->fd, POLLOUT);
933 /* TAP device always accepts packets.*/
934 poll_immediate_wake();
938 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
939 * otherwise a positive errno value. */
941 netdev_linux_set_etheraddr(struct netdev *netdev_,
942 const uint8_t mac[ETH_ADDR_LEN])
944 struct netdev_dev_linux *netdev_dev =
945 netdev_dev_linux_cast(netdev_get_dev(netdev_));
948 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
949 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
950 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
952 netdev_dev->cache_valid |= VALID_ETHERADDR;
953 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
961 /* Copies 'netdev''s MAC address into 'mac', a buffer of ETH_ADDR_LEN bytes
962 * supplied by the caller. */
964 netdev_linux_get_etheraddr(const struct netdev *netdev_,
965 uint8_t mac[ETH_ADDR_LEN])
967 struct netdev_dev_linux *netdev_dev =
968 netdev_dev_linux_cast(netdev_get_dev(netdev_));
969 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
970 int error = get_etheraddr(netdev_get_name(netdev_),
971 netdev_dev->etheraddr);
975 netdev_dev->cache_valid |= VALID_ETHERADDR;
977 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
981 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
982 * in bytes, not including the hardware header; thus, this is typically 1500
983 * bytes for Ethernet devices. */
985 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
987 struct netdev_dev_linux *netdev_dev =
988 netdev_dev_linux_cast(netdev_get_dev(netdev_));
989 if (!(netdev_dev->cache_valid & VALID_MTU)) {
993 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
994 SIOCGIFMTU, "SIOCGIFMTU");
998 netdev_dev->mtu = ifr.ifr_mtu;
999 netdev_dev->cache_valid |= VALID_MTU;
1001 *mtup = netdev_dev->mtu;
1005 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1006 * On failure, returns a negative errno value. */
1008 netdev_linux_get_ifindex(const struct netdev *netdev)
1012 error = get_ifindex(netdev, &ifindex);
1013 return error ? -error : ifindex;
1017 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1019 struct netdev_dev_linux *netdev_dev =
1020 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1025 if (netdev_dev->miimon_interval > 0) {
1026 *carrier = netdev_dev->miimon;
1030 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
1034 fn = xasprintf("/sys/class/net/%s/carrier",
1035 netdev_get_name(netdev_));
1036 fd = open(fn, O_RDONLY);
1039 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1043 retval = read(fd, line, sizeof line);
1046 if (error == EINVAL) {
1047 /* This is the normal return value when we try to check carrier
1048 * if the network device is not up. */
1050 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1053 } else if (retval == 0) {
1055 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
1059 if (line[0] != '0' && line[0] != '1') {
1061 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1065 netdev_dev->carrier = line[0] != '0';
1066 netdev_dev->cache_valid |= VALID_CARRIER;
1068 *carrier = netdev_dev->carrier;
1080 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1081 struct mii_ioctl_data *data)
1086 memset(&ifr, 0, sizeof ifr);
1087 memcpy(&ifr.ifr_data, data, sizeof *data);
1088 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1089 memcpy(data, &ifr.ifr_data, sizeof *data);
1095 netdev_linux_get_miimon(const char *name, bool *miimon)
1097 struct mii_ioctl_data data;
1102 memset(&data, 0, sizeof data);
1103 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1105 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1106 data.reg_num = MII_BMSR;
1107 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1111 *miimon = !!(data.val_out & BMSR_LSTATUS);
1113 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1116 struct ethtool_cmd ecmd;
1118 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1121 memset(&ecmd, 0, sizeof ecmd);
1122 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1125 struct ethtool_value eval;
1127 memcpy(&eval, &ecmd, sizeof eval);
1128 *miimon = !!eval.data;
1130 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1138 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1139 long long int interval)
1141 struct netdev_dev_linux *netdev_dev;
1143 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1145 interval = interval > 0 ? MAX(interval, 100) : 0;
1146 if (netdev_dev->miimon_interval != interval) {
1147 netdev_dev->miimon_interval = interval;
1148 timer_set_expired(&netdev_dev->miimon_timer);
1155 netdev_linux_miimon_run(void)
1157 struct shash device_shash;
1158 struct shash_node *node;
1160 shash_init(&device_shash);
1161 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1162 SHASH_FOR_EACH (node, &device_shash) {
1163 struct netdev_dev_linux *dev = node->data;
1166 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1170 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1171 if (miimon != dev->miimon) {
1172 dev->miimon = miimon;
1173 netdev_dev_linux_changed(dev);
1176 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1179 shash_destroy(&device_shash);
1183 netdev_linux_miimon_wait(void)
1185 struct shash device_shash;
1186 struct shash_node *node;
1188 shash_init(&device_shash);
1189 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1190 SHASH_FOR_EACH (node, &device_shash) {
1191 struct netdev_dev_linux *dev = node->data;
1193 if (dev->miimon_interval > 0) {
1194 timer_wait(&dev->miimon_timer);
1197 shash_destroy(&device_shash);
1200 /* Check whether we can use RTM_GETLINK to get network device statistics.
1201 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1204 check_for_working_netlink_stats(void)
1206 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1207 * preferable, so if that works, we'll use it. */
1208 int ifindex = do_get_ifindex("lo");
1210 VLOG_WARN("failed to get ifindex for lo, "
1211 "obtaining netdev stats from proc");
1214 struct netdev_stats stats;
1215 int error = get_stats_via_netlink(ifindex, &stats);
1217 VLOG_DBG("obtaining netdev stats via rtnetlink");
1220 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1221 "via proc (you are probably running a pre-2.6.19 "
1222 "kernel)", strerror(error));
1228 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1230 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1232 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1233 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1234 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1236 netdev_dev->is_tap = !strcmp(type, "tap");
1237 netdev_dev->is_internal = (!netdev_dev->is_tap
1238 && dpif_linux_is_internal_device(name));
1239 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1244 swap_uint64(uint64_t *a, uint64_t *b)
1251 /* Retrieves current device stats for 'netdev'. */
1253 netdev_linux_get_stats(const struct netdev *netdev_,
1254 struct netdev_stats *stats)
1256 struct netdev_dev_linux *netdev_dev =
1257 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1258 static int use_netlink_stats = -1;
1261 if (netdev_dev->have_vport_stats ||
1262 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1264 error = netdev_vport_get_stats(netdev_, stats);
1265 netdev_dev->have_vport_stats = !error;
1266 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1269 if (!netdev_dev->have_vport_stats) {
1270 if (use_netlink_stats < 0) {
1271 use_netlink_stats = check_for_working_netlink_stats();
1273 if (use_netlink_stats) {
1276 error = get_ifindex(netdev_, &ifindex);
1278 error = get_stats_via_netlink(ifindex, stats);
1281 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1285 /* If this port is an internal port then the transmit and receive stats
1286 * will appear to be swapped relative to the other ports since we are the
1287 * one sending the data, not a remote computer. For consistency, we swap
1288 * them back here. This does not apply if we are getting stats from the
1289 * vport layer because it always tracks stats from the perspective of the
1291 netdev_linux_update_is_pseudo(netdev_dev);
1292 if (!error && !netdev_dev->have_vport_stats &&
1293 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1294 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1295 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1296 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1297 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1298 stats->rx_length_errors = 0;
1299 stats->rx_over_errors = 0;
1300 stats->rx_crc_errors = 0;
1301 stats->rx_frame_errors = 0;
1302 stats->rx_fifo_errors = 0;
1303 stats->rx_missed_errors = 0;
1304 stats->tx_aborted_errors = 0;
1305 stats->tx_carrier_errors = 0;
1306 stats->tx_fifo_errors = 0;
1307 stats->tx_heartbeat_errors = 0;
1308 stats->tx_window_errors = 0;
1314 /* Stores the features supported by 'netdev' into each of '*current',
1315 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1316 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1317 * successful, otherwise a positive errno value. */
1319 netdev_linux_get_features(const struct netdev *netdev,
1320 uint32_t *current, uint32_t *advertised,
1321 uint32_t *supported, uint32_t *peer)
1323 struct ethtool_cmd ecmd;
1326 memset(&ecmd, 0, sizeof ecmd);
1327 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1328 ETHTOOL_GSET, "ETHTOOL_GSET");
1333 /* Supported features. */
1335 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1336 *supported |= OFPPF_10MB_HD;
1338 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1339 *supported |= OFPPF_10MB_FD;
1341 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1342 *supported |= OFPPF_100MB_HD;
1344 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1345 *supported |= OFPPF_100MB_FD;
1347 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1348 *supported |= OFPPF_1GB_HD;
1350 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1351 *supported |= OFPPF_1GB_FD;
1353 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1354 *supported |= OFPPF_10GB_FD;
1356 if (ecmd.supported & SUPPORTED_TP) {
1357 *supported |= OFPPF_COPPER;
1359 if (ecmd.supported & SUPPORTED_FIBRE) {
1360 *supported |= OFPPF_FIBER;
1362 if (ecmd.supported & SUPPORTED_Autoneg) {
1363 *supported |= OFPPF_AUTONEG;
1365 if (ecmd.supported & SUPPORTED_Pause) {
1366 *supported |= OFPPF_PAUSE;
1368 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1369 *supported |= OFPPF_PAUSE_ASYM;
1372 /* Advertised features. */
1374 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1375 *advertised |= OFPPF_10MB_HD;
1377 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1378 *advertised |= OFPPF_10MB_FD;
1380 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1381 *advertised |= OFPPF_100MB_HD;
1383 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1384 *advertised |= OFPPF_100MB_FD;
1386 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1387 *advertised |= OFPPF_1GB_HD;
1389 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1390 *advertised |= OFPPF_1GB_FD;
1392 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1393 *advertised |= OFPPF_10GB_FD;
1395 if (ecmd.advertising & ADVERTISED_TP) {
1396 *advertised |= OFPPF_COPPER;
1398 if (ecmd.advertising & ADVERTISED_FIBRE) {
1399 *advertised |= OFPPF_FIBER;
1401 if (ecmd.advertising & ADVERTISED_Autoneg) {
1402 *advertised |= OFPPF_AUTONEG;
1404 if (ecmd.advertising & ADVERTISED_Pause) {
1405 *advertised |= OFPPF_PAUSE;
1407 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1408 *advertised |= OFPPF_PAUSE_ASYM;
1411 /* Current settings. */
1412 if (ecmd.speed == SPEED_10) {
1413 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1414 } else if (ecmd.speed == SPEED_100) {
1415 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1416 } else if (ecmd.speed == SPEED_1000) {
1417 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1418 } else if (ecmd.speed == SPEED_10000) {
1419 *current = OFPPF_10GB_FD;
1424 if (ecmd.port == PORT_TP) {
1425 *current |= OFPPF_COPPER;
1426 } else if (ecmd.port == PORT_FIBRE) {
1427 *current |= OFPPF_FIBER;
1431 *current |= OFPPF_AUTONEG;
1434 /* Peer advertisements. */
1435 *peer = 0; /* XXX */
1440 /* Set the features advertised by 'netdev' to 'advertise'. */
1442 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1444 struct ethtool_cmd ecmd;
1447 memset(&ecmd, 0, sizeof ecmd);
1448 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1449 ETHTOOL_GSET, "ETHTOOL_GSET");
1454 ecmd.advertising = 0;
1455 if (advertise & OFPPF_10MB_HD) {
1456 ecmd.advertising |= ADVERTISED_10baseT_Half;
1458 if (advertise & OFPPF_10MB_FD) {
1459 ecmd.advertising |= ADVERTISED_10baseT_Full;
1461 if (advertise & OFPPF_100MB_HD) {
1462 ecmd.advertising |= ADVERTISED_100baseT_Half;
1464 if (advertise & OFPPF_100MB_FD) {
1465 ecmd.advertising |= ADVERTISED_100baseT_Full;
1467 if (advertise & OFPPF_1GB_HD) {
1468 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1470 if (advertise & OFPPF_1GB_FD) {
1471 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1473 if (advertise & OFPPF_10GB_FD) {
1474 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1476 if (advertise & OFPPF_COPPER) {
1477 ecmd.advertising |= ADVERTISED_TP;
1479 if (advertise & OFPPF_FIBER) {
1480 ecmd.advertising |= ADVERTISED_FIBRE;
1482 if (advertise & OFPPF_AUTONEG) {
1483 ecmd.advertising |= ADVERTISED_Autoneg;
1485 if (advertise & OFPPF_PAUSE) {
1486 ecmd.advertising |= ADVERTISED_Pause;
1488 if (advertise & OFPPF_PAUSE_ASYM) {
1489 ecmd.advertising |= ADVERTISED_Asym_Pause;
1491 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1492 ETHTOOL_SSET, "ETHTOOL_SSET");
1495 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1496 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1497 * and returns 0. Otherwise returns an errno value (specifically ENOENT if
1498 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1499 * sets '*vlan_vid' to -1. */
1501 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1503 const char *netdev_name = netdev_get_name(netdev);
1504 struct ds line = DS_EMPTY_INITIALIZER;
1505 FILE *stream = NULL;
1509 COVERAGE_INC(netdev_get_vlan_vid);
1510 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1511 stream = fopen(fn, "r");
1517 if (ds_get_line(&line, stream)) {
1518 if (ferror(stream)) {
1520 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1523 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1528 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1530 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1531 fn, ds_cstr(&line));
/* Shell command templates used to install ingress policing.
 *
 * POLICE_ADD_CMD takes the device name (%s).
 *
 * POLICE_CONFIG_CMD takes the device name (%s) followed by the rate and the
 * burst, both uint32_t.  They are formatted with PRIu32 so that values of
 * 2^31 and above are not printed as negative numbers. */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %"PRIu32"kbit burst %"PRIu32"k mtu 65535 drop flowid :1"
1552 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1553 * positive errno value.
1555 * This function is equivalent to running
1556 * /sbin/tc qdisc del dev %s handle ffff: ingress
1557 * but it is much, much faster.
1560 netdev_linux_remove_policing(struct netdev *netdev)
1562 struct netdev_dev_linux *netdev_dev =
1563 netdev_dev_linux_cast(netdev_get_dev(netdev));
1564 const char *netdev_name = netdev_get_name(netdev);
1566 struct ofpbuf request;
1567 struct tcmsg *tcmsg;
1570 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1574 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1575 tcmsg->tcm_parent = TC_H_INGRESS;
1576 nl_msg_put_string(&request, TCA_KIND, "ingress");
1577 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1579 error = tc_transact(&request, NULL);
1580 if (error && error != ENOENT && error != EINVAL) {
1581 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1582 netdev_name, strerror(error));
1586 netdev_dev->kbits_rate = 0;
1587 netdev_dev->kbits_burst = 0;
1588 netdev_dev->cache_valid |= VALID_POLICING;
1592 /* Attempts to set input rate limiting (policing) policy. */
1594 netdev_linux_set_policing(struct netdev *netdev,
1595 uint32_t kbits_rate, uint32_t kbits_burst)
1597 struct netdev_dev_linux *netdev_dev =
1598 netdev_dev_linux_cast(netdev_get_dev(netdev));
1599 const char *netdev_name = netdev_get_name(netdev);
1602 COVERAGE_INC(netdev_set_policing);
1604 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1605 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1606 : kbits_burst); /* Stick with user-specified value. */
1608 if (netdev_dev->cache_valid & VALID_POLICING
1609 && netdev_dev->kbits_rate == kbits_rate
1610 && netdev_dev->kbits_burst == kbits_burst) {
1611 /* Assume that settings haven't changed since we last set them. */
1615 netdev_linux_remove_policing(netdev);
1617 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1618 if (system(command) != 0) {
1619 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1623 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1624 kbits_rate, kbits_burst);
1625 if (system(command) != 0) {
1626 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1631 netdev_dev->kbits_rate = kbits_rate;
1632 netdev_dev->kbits_burst = kbits_burst;
1633 netdev_dev->cache_valid |= VALID_POLICING;
1640 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1643 const struct tc_ops **opsp;
1645 for (opsp = tcs; *opsp != NULL; opsp++) {
1646 const struct tc_ops *ops = *opsp;
1647 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1648 sset_add(types, ops->ovs_name);
1654 static const struct tc_ops *
1655 tc_lookup_ovs_name(const char *name)
1657 const struct tc_ops **opsp;
1659 for (opsp = tcs; *opsp != NULL; opsp++) {
1660 const struct tc_ops *ops = *opsp;
1661 if (!strcmp(name, ops->ovs_name)) {
1668 static const struct tc_ops *
1669 tc_lookup_linux_name(const char *name)
1671 const struct tc_ops **opsp;
1673 for (opsp = tcs; *opsp != NULL; opsp++) {
1674 const struct tc_ops *ops = *opsp;
1675 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1682 static struct tc_queue *
1683 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1686 struct netdev_dev_linux *netdev_dev =
1687 netdev_dev_linux_cast(netdev_get_dev(netdev));
1688 struct tc_queue *queue;
1690 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1691 if (queue->queue_id == queue_id) {
/* Looks up the queue numbered 'queue_id' on 'netdev', hashing the id with
 * hash_int() (basis 0) before delegating to tc_find_queue__(). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1705 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1707 struct netdev_qos_capabilities *caps)
1709 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1713 caps->n_queues = ops->n_queues;
1718 netdev_linux_get_qos(const struct netdev *netdev,
1719 const char **typep, struct shash *details)
1721 struct netdev_dev_linux *netdev_dev =
1722 netdev_dev_linux_cast(netdev_get_dev(netdev));
1725 error = tc_query_qdisc(netdev);
1730 *typep = netdev_dev->tc->ops->ovs_name;
1731 return (netdev_dev->tc->ops->qdisc_get
1732 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1737 netdev_linux_set_qos(struct netdev *netdev,
1738 const char *type, const struct shash *details)
1740 struct netdev_dev_linux *netdev_dev =
1741 netdev_dev_linux_cast(netdev_get_dev(netdev));
1742 const struct tc_ops *new_ops;
1745 new_ops = tc_lookup_ovs_name(type);
1746 if (!new_ops || !new_ops->tc_install) {
1750 error = tc_query_qdisc(netdev);
1755 if (new_ops == netdev_dev->tc->ops) {
1756 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1758 /* Delete existing qdisc. */
1759 error = tc_del_qdisc(netdev);
1763 assert(netdev_dev->tc == NULL);
1765 /* Install new qdisc. */
1766 error = new_ops->tc_install(netdev, details);
1767 assert((error == 0) == (netdev_dev->tc != NULL));
1774 netdev_linux_get_queue(const struct netdev *netdev,
1775 unsigned int queue_id, struct shash *details)
1777 struct netdev_dev_linux *netdev_dev =
1778 netdev_dev_linux_cast(netdev_get_dev(netdev));
1781 error = tc_query_qdisc(netdev);
1785 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1787 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1793 netdev_linux_set_queue(struct netdev *netdev,
1794 unsigned int queue_id, const struct shash *details)
1796 struct netdev_dev_linux *netdev_dev =
1797 netdev_dev_linux_cast(netdev_get_dev(netdev));
1800 error = tc_query_qdisc(netdev);
1803 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1804 || !netdev_dev->tc->ops->class_set) {
1808 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1812 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1814 struct netdev_dev_linux *netdev_dev =
1815 netdev_dev_linux_cast(netdev_get_dev(netdev));
1818 error = tc_query_qdisc(netdev);
1821 } else if (!netdev_dev->tc->ops->class_delete) {
1824 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1826 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1832 netdev_linux_get_queue_stats(const struct netdev *netdev,
1833 unsigned int queue_id,
1834 struct netdev_queue_stats *stats)
1836 struct netdev_dev_linux *netdev_dev =
1837 netdev_dev_linux_cast(netdev_get_dev(netdev));
1840 error = tc_query_qdisc(netdev);
1843 } else if (!netdev_dev->tc->ops->class_get_stats) {
1846 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1848 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1854 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1856 struct ofpbuf request;
1857 struct tcmsg *tcmsg;
1859 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1863 tcmsg->tcm_parent = 0;
1864 nl_dump_start(dump, rtnl_sock, &request);
1865 ofpbuf_uninit(&request);
1870 netdev_linux_dump_queues(const struct netdev *netdev,
1871 netdev_dump_queues_cb *cb, void *aux)
1873 struct netdev_dev_linux *netdev_dev =
1874 netdev_dev_linux_cast(netdev_get_dev(netdev));
1875 struct tc_queue *queue;
1876 struct shash details;
1880 error = tc_query_qdisc(netdev);
1883 } else if (!netdev_dev->tc->ops->class_get) {
1888 shash_init(&details);
1889 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1890 shash_clear(&details);
1892 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1894 (*cb)(queue->queue_id, &details, aux);
1899 shash_destroy(&details);
1905 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1906 netdev_dump_queue_stats_cb *cb, void *aux)
1908 struct netdev_dev_linux *netdev_dev =
1909 netdev_dev_linux_cast(netdev_get_dev(netdev));
1910 struct nl_dump dump;
1915 error = tc_query_qdisc(netdev);
1918 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1923 if (!start_queue_dump(netdev, &dump)) {
1926 while (nl_dump_next(&dump, &msg)) {
1927 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1933 error = nl_dump_done(&dump);
1934 return error ? error : last_error;
1938 netdev_linux_get_in4(const struct netdev *netdev_,
1939 struct in_addr *address, struct in_addr *netmask)
1941 struct netdev_dev_linux *netdev_dev =
1942 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1944 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1947 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1948 SIOCGIFADDR, "SIOCGIFADDR");
1953 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1954 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1959 netdev_dev->cache_valid |= VALID_IN4;
1961 *address = netdev_dev->address;
1962 *netmask = netdev_dev->netmask;
1963 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1967 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1968 struct in_addr netmask)
1970 struct netdev_dev_linux *netdev_dev =
1971 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1974 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1976 netdev_dev->cache_valid |= VALID_IN4;
1977 netdev_dev->address = address;
1978 netdev_dev->netmask = netmask;
1979 if (address.s_addr != INADDR_ANY) {
1980 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1981 "SIOCSIFNETMASK", netmask);
/* Parses 'line' as 32 hexadecimal digits (the 16 address bytes), four further
 * hexadecimal fields that are skipped, and an interface name of at most 16
 * characters.  Stores the bytes into 'in6' and the name into 'ifname'.
 * Returns true only if all 17 items were converted. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *octet = in6->s6_addr;
    int n_converted;

#define X8 "%2"SCNx8
    n_converted = sscanf(line,
                         " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                         "%*x %*x %*x %*x %16s\n",
                         octet + 0, octet + 1, octet + 2, octet + 3,
                         octet + 4, octet + 5, octet + 6, octet + 7,
                         octet + 8, octet + 9, octet + 10, octet + 11,
                         octet + 12, octet + 13, octet + 14, octet + 15,
                         ifname);
    return n_converted == 17;
}
2003 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2004 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2006 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2008 struct netdev_dev_linux *netdev_dev =
2009 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2010 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2014 netdev_dev->in6 = in6addr_any;
2016 file = fopen("/proc/net/if_inet6", "r");
2018 const char *name = netdev_get_name(netdev_);
2019 while (fgets(line, sizeof line, file)) {
2020 struct in6_addr in6_tmp;
2021 char ifname[16 + 1];
2022 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2023 && !strcmp(name, ifname))
2025 netdev_dev->in6 = in6_tmp;
2031 netdev_dev->cache_valid |= VALID_IN6;
2033 *in6 = netdev_dev->in6;
/* Fills '*sa' with an all-zero sockaddr and then overlays an AF_INET socket
 * address carrying 'addr' and port 0. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Clear the destination first so no stale bytes survive the overlay. */
    memset(sa, 0, sizeof *sa);

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memcpy(sa, &in4, sizeof in4);
}
2051 do_set_addr(struct netdev *netdev,
2052 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2055 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2056 make_in4_sockaddr(&ifr.ifr_addr, addr);
2058 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2062 /* Adds 'router' as a default IP gateway. */
2064 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2066 struct in_addr any = { INADDR_ANY };
2070 memset(&rt, 0, sizeof rt);
2071 make_in4_sockaddr(&rt.rt_dst, any);
2072 make_in4_sockaddr(&rt.rt_gateway, router);
2073 make_in4_sockaddr(&rt.rt_genmask, any);
2074 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2075 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2077 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2083 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2086 static const char fn[] = "/proc/net/route";
2091 *netdev_name = NULL;
2092 stream = fopen(fn, "r");
2093 if (stream == NULL) {
2094 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2099 while (fgets(line, sizeof line, stream)) {
2102 ovs_be32 dest, gateway, mask;
2103 int refcnt, metric, mtu;
2104 unsigned int flags, use, window, irtt;
2107 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2109 iface, &dest, &gateway, &flags, &refcnt,
2110 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2112 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2116 if (!(flags & RTF_UP)) {
2117 /* Skip routes that aren't up. */
2121 /* The output of 'dest', 'mask', and 'gateway' were given in
2122 * network byte order, so we don't need any endian
2123 * conversions here. */
2124 if ((dest & mask) == (host->s_addr & mask)) {
2126 /* The host is directly reachable. */
2127 next_hop->s_addr = 0;
2129 /* To reach the host, we must go through a gateway. */
2130 next_hop->s_addr = gateway;
2132 *netdev_name = xstrdup(iface);
2144 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2146 struct ethtool_drvinfo drvinfo;
2149 memset(&drvinfo, 0, sizeof drvinfo);
2150 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2151 (struct ethtool_cmd *)&drvinfo,
2153 "ETHTOOL_GDRVINFO");
2155 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2156 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2157 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2163 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2164 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2165 * returns 0. Otherwise, it returns a positive errno value; in particular,
2166 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2168 netdev_linux_arp_lookup(const struct netdev *netdev,
2169 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2172 struct sockaddr_in sin;
2175 memset(&r, 0, sizeof r);
2176 memset(&sin, 0, sizeof sin);
2177 sin.sin_family = AF_INET;
2178 sin.sin_addr.s_addr = ip;
2180 memcpy(&r.arp_pa, &sin, sizeof sin);
2181 r.arp_ha.sa_family = ARPHRD_ETHER;
2183 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2184 COVERAGE_INC(netdev_arp_lookup);
2185 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2187 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2188 } else if (retval != ENXIO) {
2189 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2190 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
2196 nd_to_iff_flags(enum netdev_flags nd)
2199 if (nd & NETDEV_UP) {
2202 if (nd & NETDEV_PROMISC) {
2209 iff_to_nd_flags(int iff)
2211 enum netdev_flags nd = 0;
2215 if (iff & IFF_PROMISC) {
2216 nd |= NETDEV_PROMISC;
2222 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2223 enum netdev_flags on, enum netdev_flags *old_flagsp)
2225 int old_flags, new_flags;
2228 error = get_flags(netdev, &old_flags);
2230 *old_flagsp = iff_to_nd_flags(old_flags);
2231 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2232 if (new_flags != old_flags) {
2233 error = set_flags(netdev, new_flags);
2240 netdev_linux_change_seq(const struct netdev *netdev)
2242 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
2245 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS) \
2249 netdev_linux_init, \
2251 netdev_linux_wait, \
2254 netdev_linux_destroy, \
2255 NULL, /* set_config */ \
2256 NULL, /* config_equal */ \
2258 netdev_linux_open, \
2259 netdev_linux_close, \
2263 netdev_linux_listen, \
2264 netdev_linux_recv, \
2265 netdev_linux_recv_wait, \
2266 netdev_linux_drain, \
2268 netdev_linux_send, \
2269 netdev_linux_send_wait, \
2271 netdev_linux_set_etheraddr, \
2272 netdev_linux_get_etheraddr, \
2273 netdev_linux_get_mtu, \
2274 netdev_linux_get_ifindex, \
2275 netdev_linux_get_carrier, \
2276 netdev_linux_set_miimon_interval, \
2277 netdev_linux_get_stats, \
2280 netdev_linux_get_features, \
2281 netdev_linux_set_advertisements, \
2282 netdev_linux_get_vlan_vid, \
2284 netdev_linux_set_policing, \
2285 netdev_linux_get_qos_types, \
2286 netdev_linux_get_qos_capabilities, \
2287 netdev_linux_get_qos, \
2288 netdev_linux_set_qos, \
2289 netdev_linux_get_queue, \
2290 netdev_linux_set_queue, \
2291 netdev_linux_delete_queue, \
2292 netdev_linux_get_queue_stats, \
2293 netdev_linux_dump_queues, \
2294 netdev_linux_dump_queue_stats, \
2296 netdev_linux_get_in4, \
2297 netdev_linux_set_in4, \
2298 netdev_linux_get_in6, \
2299 netdev_linux_add_router, \
2300 netdev_linux_get_next_hop, \
2301 netdev_linux_get_status, \
2302 netdev_linux_arp_lookup, \
2304 netdev_linux_update_flags, \
2306 netdev_linux_change_seq \
2309 const struct netdev_class netdev_linux_class =
2312 netdev_linux_create,
2313 netdev_linux_enumerate,
2314 NULL); /* set_stats */
2316 const struct netdev_class netdev_tap_class =
2319 netdev_linux_create_tap,
2320 NULL, /* enumerate */
2321 NULL); /* set_stats */
2323 const struct netdev_class netdev_internal_class =
2326 netdev_linux_create,
2327 NULL, /* enumerate */
2328 netdev_vport_set_stats);
2330 /* HTB traffic control class. */
2332 #define HTB_N_QUEUES 0xf000
2336 unsigned int max_rate; /* In bytes/s. */
2340 struct tc_queue tc_queue;
2341 unsigned int min_rate; /* In bytes/s. */
2342 unsigned int max_rate; /* In bytes/s. */
2343 unsigned int burst; /* In bytes. */
2344 unsigned int priority; /* Lower values are higher priorities. */
2348 htb_get__(const struct netdev *netdev)
2350 struct netdev_dev_linux *netdev_dev =
2351 netdev_dev_linux_cast(netdev_get_dev(netdev));
2352 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2356 htb_install__(struct netdev *netdev, uint64_t max_rate)
2358 struct netdev_dev_linux *netdev_dev =
2359 netdev_dev_linux_cast(netdev_get_dev(netdev));
2362 htb = xmalloc(sizeof *htb);
2363 tc_init(&htb->tc, &tc_ops_htb);
2364 htb->max_rate = max_rate;
2366 netdev_dev->tc = &htb->tc;
2369 /* Create an HTB qdisc.
2371 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2373 htb_setup_qdisc__(struct netdev *netdev)
2376 struct tc_htb_glob opt;
2377 struct ofpbuf request;
2378 struct tcmsg *tcmsg;
2380 tc_del_qdisc(netdev);
2382 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2383 NLM_F_EXCL | NLM_F_CREATE, &request);
2387 tcmsg->tcm_handle = tc_make_handle(1, 0);
2388 tcmsg->tcm_parent = TC_H_ROOT;
2390 nl_msg_put_string(&request, TCA_KIND, "htb");
2392 memset(&opt, 0, sizeof opt);
2393 opt.rate2quantum = 10;
2397 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2398 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2399 nl_msg_end_nested(&request, opt_offset);
2401 return tc_transact(&request, NULL);
2404 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2405 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2407 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2408 unsigned int parent, struct htb_class *class)
2411 struct tc_htb_opt opt;
2412 struct ofpbuf request;
2413 struct tcmsg *tcmsg;
2417 netdev_get_mtu(netdev, &mtu);
2418 if (mtu == INT_MAX) {
2419 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2420 netdev_get_name(netdev));
2424 memset(&opt, 0, sizeof opt);
2425 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2426 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2427 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2428 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2429 opt.prio = class->priority;
2431 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2435 tcmsg->tcm_handle = handle;
2436 tcmsg->tcm_parent = parent;
2438 nl_msg_put_string(&request, TCA_KIND, "htb");
2439 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2440 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2441 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2442 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2443 nl_msg_end_nested(&request, opt_offset);
2445 error = tc_transact(&request, NULL);
2447 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2448 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2449 netdev_get_name(netdev),
2450 tc_get_major(handle), tc_get_minor(handle),
2451 tc_get_major(parent), tc_get_minor(parent),
2452 class->min_rate, class->max_rate,
2453 class->burst, class->priority, strerror(error));
2458 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2459 * description of them into 'details'. The description complies with the
2460 * specification given in the vswitch database documentation for linux-htb
2463 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2465 static const struct nl_policy tca_htb_policy[] = {
2466 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2467 .min_len = sizeof(struct tc_htb_opt) },
2470 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2471 const struct tc_htb_opt *htb;
2473 if (!nl_parse_nested(nl_options, tca_htb_policy,
2474 attrs, ARRAY_SIZE(tca_htb_policy))) {
2475 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2479 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2480 class->min_rate = htb->rate.rate;
2481 class->max_rate = htb->ceil.rate;
2482 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2483 class->priority = htb->prio;
2488 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2489 struct htb_class *options,
2490 struct netdev_queue_stats *stats)
2492 struct nlattr *nl_options;
2493 unsigned int handle;
2496 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2497 if (!error && queue_id) {
2498 unsigned int major = tc_get_major(handle);
2499 unsigned int minor = tc_get_minor(handle);
2500 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2501 *queue_id = minor - 1;
2506 if (!error && options) {
2507 error = htb_parse_tca_options__(nl_options, options);
2513 htb_parse_qdisc_details__(struct netdev *netdev,
2514 const struct shash *details, struct htb_class *hc)
2516 const char *max_rate_s;
2518 max_rate_s = shash_find_data(details, "max-rate");
2519 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2520 if (!hc->max_rate) {
2523 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2524 hc->max_rate = netdev_features_to_bps(current) / 8;
2526 hc->min_rate = hc->max_rate;
/* Fills in 'hc' from the per-class 'details' ("min-rate", "max-rate", "burst"
 * and "priority"), using the MTU of 'netdev' and the qdisc-wide max_rate
 * obtained through htb_get__() as limits.
 *
 * Rates and burst are parsed as decimal numbers and divided by 8 (presumably
 * bits to bytes -- confirm).  The result is then clamped:
 *
 *   - min_rate is raised to at least the MTU and capped at the qdisc max_rate.
 *   - max_rate is raised to at least min_rate and capped at the qdisc
 *     max_rate.
 *   - burst is raised to at least MTU + 64.
 *
 * A missing "priority" becomes 0.  An MTU of INT_MAX is treated as "device
 * has no MTU" and produces a rate-limited warning. */
2532 htb_parse_class_details__(struct netdev *netdev,
2533 const struct shash *details, struct htb_class *hc)
2535 const struct htb *htb = htb_get__(netdev);
2536 const char *min_rate_s = shash_find_data(details, "min-rate");
2537 const char *max_rate_s = shash_find_data(details, "max-rate");
2538 const char *burst_s = shash_find_data(details, "burst");
2539 const char *priority_s = shash_find_data(details, "priority");
2542 netdev_get_mtu(netdev, &mtu);
2543 if (mtu == INT_MAX) {
2544 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2545 netdev_get_name(netdev));
2549 /* HTB requires at least an mtu sized min-rate to send any traffic even
2550 * on uncongested links. */
2551 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2552 hc->min_rate = MAX(hc->min_rate, mtu);
2553 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2556 hc->max_rate = (max_rate_s
2557 ? strtoull(max_rate_s, NULL, 10) / 8
2559 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2560 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2564 * According to hints in the documentation that I've read, it is important
2565 * that 'burst' be at least as big as the largest frame that might be
2566 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2567 * but having it a bit too small is a problem. Since netdev_get_mtu()
2568 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2569 * the MTU. We actually add 64, instead of 14, as a guard against
2570 * additional headers get tacked on somewhere that we're not aware of. */
2571 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2572 hc->burst = MAX(hc->burst, mtu + 64);
2575 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about the class 'handle' (with parent 'parent') on 'netdev'
 * using tc_query_class(), then decodes the reply with htb_parse_tcmsg__()
 * into 'options' and 'stats'.  No queue ID is requested from the parser.  The
 * reply buffer is deleted after it has been parsed. */
2581 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2582 unsigned int parent, struct htb_class *options,
2583 struct netdev_queue_stats *stats)
2585 struct ofpbuf *reply;
2588 error = tc_query_class(netdev, handle, parent, &reply);
2590 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2591 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': sets up the qdisc with htb_setup_qdisc__(),
 * derives the qdisc-level class parameters from 'details', creates class
 * 1:fffe under parent 1:0 with them, and finally records the resulting
 * max_rate through htb_install__(). */
2597 htb_tc_install(struct netdev *netdev, const struct shash *details)
2601 error = htb_setup_qdisc__(netdev);
2603 struct htb_class hc;
2605 htb_parse_qdisc_details__(netdev, details, &hc);
2606 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2607 tc_make_handle(1, 0), &hc);
2609 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that contains 'queue' as its 'tc_queue'
 * member. */
2615 static struct htb_class *
2616 htb_class_cast__(const struct tc_queue *queue)
2618 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' for queue 'queue_id' in the HTB state of
 * 'netdev'.
 *
 * The queue is looked up with tc_find_queue__() using hash_int(queue_id, 0).
 * An existing entry is reused; otherwise a new struct htb_class is allocated
 * and inserted into the queue hmap under the same hash.  In both cases
 * min_rate, max_rate, burst and priority are then copied from 'hc'. */
2622 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2623 const struct htb_class *hc)
2625 struct htb *htb = htb_get__(netdev);
2626 size_t hash = hash_int(queue_id, 0);
2627 struct tc_queue *queue;
2628 struct htb_class *hcp;
2630 queue = tc_find_queue__(netdev, queue_id, hash);
2632 hcp = htb_class_cast__(queue);
2634 hcp = xmalloc(sizeof *hcp);
2635 queue = &hcp->tc_queue;
2636 queue->queue_id = queue_id;
2637 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2640 hcp->min_rate = hc->min_rate;
2641 hcp->max_rate = hc->max_rate;
2642 hcp->burst = hc->burst;
2643 hcp->priority = hc->priority;
/* Rebuilds the in-memory HTB state of 'netdev' from the kernel.
 *
 * Class 1:fffe is queried to obtain the qdisc-wide max_rate, which is handed
 * to htb_install__().  A queue dump is then started, and every dumped class
 * that htb_parse_tcmsg__() accepts is recorded with htb_update_queue__().
 * 'nlmsg' is unused. */
2647 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2650 struct nl_dump dump;
2651 struct htb_class hc;
2653 /* Get qdisc options. */
2655 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2656 htb_install__(netdev, hc.max_rate);
2659 if (!start_queue_dump(netdev, &dump)) {
2662 while (nl_dump_next(&dump, &msg)) {
2663 unsigned int queue_id;
2665 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2666 htb_update_queue__(netdev, queue_id, &hc);
2669 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc'.  Every queue is removed from the
 * queue hmap; the _SAFE iterator is used because entries are removed while
 * iterating.
 *
 * NOTE(review): presumably each class and the struct htb itself are also
 * freed -- confirm. */
2675 htb_tc_destroy(struct tc *tc)
2677 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2678 struct htb_class *hc, *next;
2680 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2681 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc configuration of 'netdev' by adding a "max-rate" entry to
 * 'details'.  The value is the stored max_rate multiplied by 8, formatted as
 * a decimal string allocated with xasprintf(). */
2689 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2691 const struct htb *htb = htb_get__(netdev);
2692 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures the HTB qdisc of 'netdev' from 'details': parses the
 * qdisc-level parameters, re-creates class 1:fffe under parent 1:0 with them,
 * and stores the new max_rate in the HTB state of the device. */
2697 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2699 struct htb_class hc;
2702 htb_parse_qdisc_details__(netdev, details, &hc);
2703 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2704 tc_make_handle(1, 0), &hc);
2706 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the configuration of 'queue' by adding entries to 'details'.
 *
 * "min-rate" and "burst" are the stored values multiplied by 8.  "max-rate"
 * (also multiplied by 8) is added only when it differs from min_rate.
 * "priority" is formatted with %u.  All values are strings allocated with
 * xasprintf().  'netdev' is unused. */
2712 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2713 const struct tc_queue *queue, struct shash *details)
2715 const struct htb_class *hc = htb_class_cast__(queue);
2717 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2718 if (hc->min_rate != hc->max_rate) {
2719 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2721 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2723 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Creates or reconfigures queue 'queue_id' on 'netdev' from 'details'.
 *
 * The details are validated by htb_parse_class_details__(), the kernel class
 * 1:(queue_id + 1) is set up under parent 1:fffe, and the parameters are then
 * recorded locally with htb_update_queue__(). */
2729 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2730 const struct shash *details)
2732 struct htb_class hc;
2735 error = htb_parse_class_details__(netdev, details, &hc);
2740 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2741 tc_make_handle(1, 0xfffe), &hc);
2746 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes 'queue' from 'netdev': asks the kernel to delete class
 * 1:(queue_id + 1) and removes the queue from the local queue hmap.
 *
 * NOTE(review): presumably the local removal happens only when the kernel
 * call succeeds, and the class is freed afterwards -- confirm. */
2751 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2753 struct htb_class *hc = htb_class_cast__(queue);
2754 struct htb *htb = htb_get__(netdev);
2757 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2759 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves statistics for 'queue' on 'netdev' into 'stats' by querying
 * kernel class 1:(queue_id + 1) with parent 1:fffe.  Class options are not
 * requested.  Returns whatever htb_query_class__() returns. */
2766 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2767 struct netdev_queue_stats *stats)
2769 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2770 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one message of a class dump.  The handle and statistics are parsed
 * out of 'nlmsg' with tc_parse_class() (options are not requested).  If the
 * handle is 1:N with 1 <= N <= HTB_N_QUEUES, 'cb' is invoked with queue ID
 * N - 1, the parsed statistics and 'aux'.  'netdev' is unused. */
2774 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2775 const struct ofpbuf *nlmsg,
2776 netdev_dump_queue_stats_cb *cb, void *aux)
2778 struct netdev_queue_stats stats;
2779 unsigned int handle, major, minor;
2782 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2787 major = tc_get_major(handle);
2788 minor = tc_get_minor(handle);
2789 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2790 (*cb)(minor - 1, &stats, aux);
/* Operations table for the Linux "htb" qdisc, known to OVS as "linux-htb",
 * offering HTB_N_QUEUES queues. */
2795 static const struct tc_ops tc_ops_htb = {
2796 "htb", /* linux_name */
2797 "linux-htb", /* ovs_name */
2798 HTB_N_QUEUES, /* n_queues */
2807 htb_class_get_stats, /* class_get_stats */
2808 htb_class_dump_stats /* class_dump_stats */
2811 /* "linux-hfsc" traffic control class. */
/* Number of queues offered by the HFSC implementation.  Class minors
 * 1...HFSC_N_QUEUES map to queue IDs 0...HFSC_N_QUEUES - 1. */
2813 #define HFSC_N_QUEUES 0xf000
2821 struct tc_queue tc_queue;
/* Returns the struct hfsc that contains the 'tc' currently attached to the
 * device underlying 'netdev'.  The caller must already know that the tc of
 * the device is an HFSC one, since no check is made here. */
2826 static struct hfsc *
2827 hfsc_get__(const struct netdev *netdev)
2829 struct netdev_dev_linux *netdev_dev;
2830 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2831 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that contains 'queue' as its 'tc_queue'
 * member. */
2834 static struct hfsc_class *
2835 hfsc_class_cast__(const struct tc_queue *queue)
2837 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Creates the in-memory HFSC state for 'netdev': allocates a struct hfsc,
 * initializes its tc with tc_ops_hfsc, records 'max_rate', and attaches it as
 * the tc of the underlying device. */
2841 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2843 struct netdev_dev_linux * netdev_dev;
2846 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2847 hfsc = xmalloc(sizeof *hfsc);
2848 tc_init(&hfsc->tc, &tc_ops_hfsc);
2849 hfsc->max_rate = max_rate;
2850 netdev_dev->tc = &hfsc->tc;
/* Records the parameters in 'hc' for queue 'queue_id' in the HFSC state of
 * 'netdev'.
 *
 * The queue is looked up with tc_find_queue__() using hash_int(queue_id, 0).
 * An existing entry is reused; otherwise a new struct hfsc_class is allocated
 * and inserted into the queue hmap under the same hash.  In both cases
 * min_rate and max_rate are then copied from 'hc'. */
2854 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2855 const struct hfsc_class *hc)
2859 struct hfsc_class *hcp;
2860 struct tc_queue *queue;
2862 hfsc = hfsc_get__(netdev);
2863 hash = hash_int(queue_id, 0);
2865 queue = tc_find_queue__(netdev, queue_id, hash);
2867 hcp = hfsc_class_cast__(queue);
2869 hcp = xmalloc(sizeof *hcp);
2870 queue = &hcp->tc_queue;
2871 queue->queue_id = queue_id;
2872 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2875 hcp->min_rate = hc->min_rate;
2876 hcp->max_rate = hc->max_rate;
/* Decodes the nested HFSC class options in 'nl_options' into 'class'.
 *
 * The policy requires each attribute to be at least as long as a struct
 * tc_service_curve.  The RSC, FSC and USC curves are then checked, and the
 * options are rejected with a rate-limited warning when:
 *
 *   - any curve has a nonzero m1 or d (non-linear curve),
 *   - the m2 of RSC differs from the m2 of FSC, or
 *   - the m2 of RSC exceeds the m2 of USC.
 *
 * When all checks pass, min_rate is taken from the m2 of FSC and max_rate
 * from the m2 of USC. */
2880 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2882 const struct tc_service_curve *rsc, *fsc, *usc;
2883 static const struct nl_policy tca_hfsc_policy[] = {
2885 .type = NL_A_UNSPEC,
2887 .min_len = sizeof(struct tc_service_curve),
2890 .type = NL_A_UNSPEC,
2892 .min_len = sizeof(struct tc_service_curve),
2895 .type = NL_A_UNSPEC,
2897 .min_len = sizeof(struct tc_service_curve),
2900 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2902 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2903 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2904 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2908 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2909 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2910 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2912 if (rsc->m1 != 0 || rsc->d != 0 ||
2913 fsc->m1 != 0 || fsc->d != 0 ||
2914 usc->m1 != 0 || usc->d != 0) {
2915 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2916 "Non-linear service curves are not supported.");
2920 if (rsc->m2 != fsc->m2) {
2921 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2922 "Real-time service curves are not supported ");
2926 if (rsc->m2 > usc->m2) {
2927 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2928 "Min-rate service curve is greater than "
2929 "the max-rate service curve.");
2933 class->min_rate = fsc->m2;
2934 class->max_rate = usc->m2;
/* Parses the Netlink class message 'tcmsg' with tc_parse_class(), passing
 * 'stats' straight through to it.
 *
 * A handle of the form 1:N with 1 <= N <= HFSC_N_QUEUES is translated to
 * queue ID N - 1 and stored into '*queue_id'.  The class options are decoded
 * into '*options' with hfsc_parse_tca_options__().
 *
 * NOTE(review): presumably 'queue_id' and 'options' may each be null to skip
 * the corresponding step (hfsc_query_class__() passes a null 'queue_id') --
 * confirm. */
2939 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2940 struct hfsc_class *options,
2941 struct netdev_queue_stats *stats)
2944 unsigned int handle;
2945 struct nlattr *nl_options;
2947 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2953 unsigned int major, minor;
2955 major = tc_get_major(handle);
2956 minor = tc_get_minor(handle);
2957 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2958 *queue_id = minor - 1;
2965 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about the class 'handle' (with parent 'parent') on 'netdev'
 * using tc_query_class(), then decodes the reply with hfsc_parse_tcmsg__()
 * into 'options' and 'stats'.  No queue ID is requested from the parser.  The
 * reply buffer is deleted after it has been parsed. */
2972 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2973 unsigned int parent, struct hfsc_class *options,
2974 struct netdev_queue_stats *stats)
2977 struct ofpbuf *reply;
2979 error = tc_query_class(netdev, handle, parent, &reply);
2984 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2985 ofpbuf_delete(reply);
/* Fills in 'class' from the qdisc-level 'details'.
 *
 * The "max-rate" value, if present, is parsed as a decimal number and divided
 * by 8 (presumably bits/s to bytes/s -- confirm).  The current link features
 * of 'netdev' provide an alternative rate, again divided by 8 (presumably
 * used only when no usable "max-rate" was given -- confirm).  Both min_rate
 * and max_rate of 'class' are set to the resulting value. */
2990 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2991 struct hfsc_class *class)
2994 const char *max_rate_s;
2996 max_rate_s = shash_find_data(details, "max-rate");
2997 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3002 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3003 max_rate = netdev_features_to_bps(current) / 8;
3006 class->min_rate = max_rate;
3007 class->max_rate = max_rate;
/* Fills in 'class' from the per-class 'details' ("min-rate" and "max-rate"),
 * using the qdisc-wide max_rate obtained through hfsc_get__() as the upper
 * limit.
 *
 * Both values are parsed as decimal numbers and divided by 8 (presumably bits
 * to bytes -- confirm).  min_rate is raised to at least 1 and capped at the
 * qdisc max_rate.  max_rate is raised to at least min_rate and capped at the
 * qdisc max_rate. */
3011 hfsc_parse_class_details__(struct netdev *netdev,
3012 const struct shash *details,
3013 struct hfsc_class * class)
3015 const struct hfsc *hfsc;
3016 uint32_t min_rate, max_rate;
3017 const char *min_rate_s, *max_rate_s;
3019 hfsc = hfsc_get__(netdev);
3020 min_rate_s = shash_find_data(details, "min-rate");
3021 max_rate_s = shash_find_data(details, "max-rate");
3023 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3024 min_rate = MAX(min_rate, 1);
3025 min_rate = MIN(min_rate, hfsc->max_rate);
3027 max_rate = (max_rate_s
3028 ? strtoull(max_rate_s, NULL, 10) / 8
3030 max_rate = MAX(max_rate, min_rate);
3031 max_rate = MIN(max_rate, hfsc->max_rate);
3033 class->min_rate = min_rate;
3034 class->max_rate = max_rate;
3039 /* Create an HFSC qdisc.
3041 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Any existing qdisc is deleted with tc_del_qdisc() first.  The request is an
 * RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE for handle 1:0 at TC_H_ROOT,
 * with kind "hfsc" and a struct tc_hfsc_qopt (cleared with memset() before
 * use) as TCA_OPTIONS.  Returns the result of tc_transact(). */
3043 hfsc_setup_qdisc__(struct netdev * netdev)
3045 struct tcmsg *tcmsg;
3046 struct ofpbuf request;
3047 struct tc_hfsc_qopt opt;
3049 tc_del_qdisc(netdev);
3051 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3052 NLM_F_EXCL | NLM_F_CREATE, &request);
3058 tcmsg->tcm_handle = tc_make_handle(1, 0);
3059 tcmsg->tcm_parent = TC_H_ROOT;
3061 memset(&opt, 0, sizeof opt);
3064 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3065 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3067 return tc_transact(&request, NULL);
3070 /* Create an HFSC class.
3072 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3073 * sc rate <min_rate> ul rate <max_rate>" */
/* The request is an RTM_NEWTCLASS with NLM_F_CREATE.  The same curve, whose
 * m2 is the min_rate of 'class', is sent as both TCA_HFSC_RSC and
 * TCA_HFSC_FSC; the curve whose m2 is the max_rate of 'class' is sent as
 * TCA_HFSC_USC.  A failure is reported with a rate-limited warning that names
 * the device, the handles and both rates. */
3075 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3076 unsigned int parent, struct hfsc_class *class)
3080 struct tcmsg *tcmsg;
3081 struct ofpbuf request;
3082 struct tc_service_curve min, max;
3084 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3090 tcmsg->tcm_handle = handle;
3091 tcmsg->tcm_parent = parent;
3095 min.m2 = class->min_rate;
3099 max.m2 = class->max_rate;
3101 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3102 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3103 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3104 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3105 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3106 nl_msg_end_nested(&request, opt_offset);
3108 error = tc_transact(&request, NULL);
3110 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3111 "min-rate %ubps, max-rate %ubps (%s)",
3112 netdev_get_name(netdev),
3113 tc_get_major(handle), tc_get_minor(handle),
3114 tc_get_major(parent), tc_get_minor(parent),
3115 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev': sets up the qdisc with hfsc_setup_qdisc__(),
 * derives the qdisc-level class parameters from 'details', creates class
 * 1:fffe under parent 1:0 with them, and finally records the resulting
 * max_rate through hfsc_install__(). */
3122 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3125 struct hfsc_class class;
3127 error = hfsc_setup_qdisc__(netdev);
3133 hfsc_parse_qdisc_details__(netdev, details, &class);
3134 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3135 tc_make_handle(1, 0), &class);
3141 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the in-memory HFSC state of 'netdev' from the kernel.
 *
 * Class 1:fffe is queried to obtain the qdisc-wide max_rate, which is handed
 * to hfsc_install__().  A queue dump is then started, and every dumped class
 * that hfsc_parse_tcmsg__() accepts is recorded with hfsc_update_queue__().
 * 'nlmsg' is unused. */
3146 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3149 struct nl_dump dump;
3150 struct hfsc_class hc;
3153 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3154 hfsc_install__(netdev, hc.max_rate);
3156 if (!start_queue_dump(netdev, &dump)) {
3160 while (nl_dump_next(&dump, &msg)) {
3161 unsigned int queue_id;
3163 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3164 hfsc_update_queue__(netdev, queue_id, &hc);
3168 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc'.  Every queue is removed from
 * the queue hmap; the _SAFE iterator is used because entries are removed
 * while iterating.
 *
 * NOTE(review): presumably each class and the struct hfsc itself are also
 * freed -- confirm. */
3173 hfsc_tc_destroy(struct tc *tc)
3176 struct hfsc_class *hc, *next;
3178 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3180 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3181 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc configuration of 'netdev' by adding a "max-rate" entry to
 * 'details'.  The value is the stored max_rate multiplied by 8, formatted as
 * a decimal string allocated with xasprintf(). */
3190 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3192 const struct hfsc *hfsc;
3193 hfsc = hfsc_get__(netdev);
3194 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures the HFSC qdisc of 'netdev' from 'details': parses the
 * qdisc-level parameters, re-creates class 1:fffe under parent 1:0 with them,
 * and stores the new max_rate in the HFSC state of the device. */
3199 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3202 struct hfsc_class class;
3204 hfsc_parse_qdisc_details__(netdev, details, &class);
3205 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3206 tc_make_handle(1, 0), &class);
3209 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the configuration of 'queue' by adding entries to 'details'.
 * "min-rate" is the stored min_rate multiplied by 8.  "max-rate" (also
 * multiplied by 8) is added only when it differs from min_rate.  Values are
 * strings allocated with xasprintf().  'netdev' is unused. */
3216 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3217 const struct tc_queue *queue, struct shash *details)
3219 const struct hfsc_class *hc;
3221 hc = hfsc_class_cast__(queue);
3222 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3223 if (hc->min_rate != hc->max_rate) {
3224 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Creates or reconfigures queue 'queue_id' on 'netdev' from 'details'.
 *
 * The details are validated by hfsc_parse_class_details__(), the kernel class
 * 1:(queue_id + 1) is set up under parent 1:fffe, and the parameters are then
 * recorded locally with hfsc_update_queue__(). */
3230 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3231 const struct shash *details)
3234 struct hfsc_class class;
3236 error = hfsc_parse_class_details__(netdev, details, &class);
3241 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3242 tc_make_handle(1, 0xfffe), &class);
3247 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes 'queue' from 'netdev': asks the kernel to delete class
 * 1:(queue_id + 1) and removes the queue from the local queue hmap.
 *
 * NOTE(review): presumably the local removal happens only when the kernel
 * call succeeds, and the class is freed afterwards -- confirm. */
3252 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3256 struct hfsc_class *hc;
3258 hc = hfsc_class_cast__(queue);
3259 hfsc = hfsc_get__(netdev);
3261 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3263 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves statistics for 'queue' on 'netdev' into 'stats' by querying
 * kernel class 1:(queue_id + 1) with parent 1:fffe.  Class options are not
 * requested.  Returns whatever hfsc_query_class__() returns. */
3270 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3271 struct netdev_queue_stats *stats)
3273 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3274 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one message of a class dump.  The handle and statistics are parsed
 * out of 'nlmsg' with tc_parse_class() (options are not requested).  If the
 * handle is 1:N with 1 <= N <= HFSC_N_QUEUES, 'cb' is invoked with queue ID
 * N - 1, the parsed statistics and 'aux'.  'netdev' is unused. */
3278 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3279 const struct ofpbuf *nlmsg,
3280 netdev_dump_queue_stats_cb *cb, void *aux)
3282 struct netdev_queue_stats stats;
3283 unsigned int handle, major, minor;
3286 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3291 major = tc_get_major(handle);
3292 minor = tc_get_minor(handle);
3293 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3294 (*cb)(minor - 1, &stats, aux);
/* Operations table for the Linux "hfsc" qdisc, known to OVS as "linux-hfsc",
 * offering HFSC_N_QUEUES queues.  Every callback slot is filled with the
 * corresponding hfsc_*() function. */
3299 static const struct tc_ops tc_ops_hfsc = {
3300 "hfsc", /* linux_name */
3301 "linux-hfsc", /* ovs_name */
3302 HFSC_N_QUEUES, /* n_queues */
3303 hfsc_tc_install, /* tc_install */
3304 hfsc_tc_load, /* tc_load */
3305 hfsc_tc_destroy, /* tc_destroy */
3306 hfsc_qdisc_get, /* qdisc_get */
3307 hfsc_qdisc_set, /* qdisc_set */
3308 hfsc_class_get, /* class_get */
3309 hfsc_class_set, /* class_set */
3310 hfsc_class_delete, /* class_delete */
3311 hfsc_class_get_stats, /* class_get_stats */
3312 hfsc_class_dump_stats /* class_dump_stats */
3315 /* "linux-default" traffic control class.
3317 * This class represents the default, unnamed Linux qdisc. It corresponds to
3318 * the "" (empty string) QoS type in the OVS database. */
/* Attaches a tc that uses tc_ops_default to the device underlying 'netdev'.
 *
 * 'tc' has static storage duration, so the same object is handed to every
 * device that goes through this function.
 *
 * NOTE(review): presumably the allocation and tc_init() happen only on the
 * first call -- confirm the guard around them. */
3321 default_install__(struct netdev *netdev)
3323 struct netdev_dev_linux *netdev_dev =
3324 netdev_dev_linux_cast(netdev_get_dev(netdev));
3325 static struct tc *tc;
3328 tc = xmalloc(sizeof *tc);
3329 tc_init(tc, &tc_ops_default);
3331 netdev_dev->tc = tc;
/* tc_install implementation for the default qdisc: simply attaches the
 * default tc to 'netdev' via default_install__().  'details' is unused. */
3335 default_tc_install(struct netdev *netdev,
3336 const struct shash *details OVS_UNUSED)
3338 default_install__(netdev);
/* tc_load implementation for the default qdisc: simply attaches the default
 * tc to 'netdev' via default_install__().  'nlmsg' is unused. */
3343 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3345 default_install__(netdev);
/* Operations table for the default qdisc.  It has no Linux qdisc name, and
 * the destroy, qdisc and class callbacks listed here are all null. */
3349 static const struct tc_ops tc_ops_default = {
3350 NULL, /* linux_name */
3355 NULL, /* tc_destroy */
3356 NULL, /* qdisc_get */
3357 NULL, /* qdisc_set */
3358 NULL, /* class_get */
3359 NULL, /* class_set */
3360 NULL, /* class_delete */
3361 NULL, /* class_get_stats */
3362 NULL /* class_dump_stats */
3365 /* "linux-other" traffic control class. */
/* tc_load implementation for "linux-other": attaches a tc that uses
 * tc_ops_other to the device underlying 'netdev'.  'nlmsg' is unused.
 *
 * 'tc' has static storage duration, so the same object is handed to every
 * device that goes through this function.
 *
 * NOTE(review): presumably the allocation and tc_init() happen only on the
 * first call -- confirm the guard around them. */
3370 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3372 struct netdev_dev_linux *netdev_dev =
3373 netdev_dev_linux_cast(netdev_get_dev(netdev));
3374 static struct tc *tc;
3377 tc = xmalloc(sizeof *tc);
3378 tc_init(tc, &tc_ops_other);
3380 netdev_dev->tc = tc;
/* Operations table for "linux-other".  It has no Linux qdisc name and cannot
 * be installed (tc_install is null); the destroy, qdisc and class callbacks
 * are all null as well. */
3384 static const struct tc_ops tc_ops_other = {
3385 NULL, /* linux_name */
3386 "linux-other", /* ovs_name */
3388 NULL, /* tc_install */
3390 NULL, /* tc_destroy */
3391 NULL, /* qdisc_get */
3392 NULL, /* qdisc_set */
3393 NULL, /* class_get */
3394 NULL, /* class_set */
3395 NULL, /* class_delete */
3396 NULL, /* class_get_stats */
3397 NULL /* class_dump_stats */
3400 /* Traffic control. */
3402 /* Number of kernel "tc" ticks per second. */
3403 static double ticks_per_s;
3405 /* Number of kernel "jiffies" per second. This is used for the purpose of
3406 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3407 * one jiffy's worth of data.
3409 * There are two possibilities here:
3411 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3412 * approximate range of 100 to 1024. That means that we really need to
3413 * make sure that the qdisc can buffer that much data.
3415 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3416 * has finely granular timers and there's no need to fudge additional room
3417 * for buffers. (There's no extra effort needed to implement that: the
3418 * large 'buffer_hz' is used as a divisor, so practically any number will
3419 * come out as 0 in the division. Small integer results in the case of
3420 * really high dividends won't have any real effect anyhow.) */
3422 static unsigned int buffer_hz;
3424 /* Returns tc handle 'major':'minor', that is, 'major' shifted into the upper
 * 16 bits and combined with 'minor' by TC_H_MAKE. */
3426 tc_make_handle(unsigned int major, unsigned int minor)
3428 return TC_H_MAKE(major << 16, minor);
3431 /* Returns the major number from 'handle': the TC_H_MAJ part shifted right by
 * 16 bits. */
3433 tc_get_major(unsigned int handle)
3435 return TC_H_MAJ(handle) >> 16;
3438 /* Returns the minor number from 'handle', as extracted by TC_H_MIN. */
3440 tc_get_minor(unsigned int handle)
3442 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * 'request' is initialized with room for 512 bytes and receives a Netlink
 * header of the given 'type' with flags NLM_F_REQUEST | 'flags', followed by
 * a zeroed struct tcmsg whose family is AF_UNSPEC and whose ifindex is that
 * of 'netdev' as reported by get_ifindex().  The caller must fill in
 * tcm_handle and tcm_parent.
 *
 * NOTE(review): presumably returns the embedded tcmsg, or a null pointer when
 * get_ifindex() fails -- confirm. */
3445 static struct tcmsg *
3446 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3447 struct ofpbuf *request)
3449 struct tcmsg *tcmsg;
3453 error = get_ifindex(netdev, &ifindex);
3458 ofpbuf_init(request, 512);
3459 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3460 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3461 tcmsg->tcm_family = AF_UNSPEC;
3462 tcmsg->tcm_ifindex = ifindex;
3463 /* Caller should fill in tcmsg->tcm_handle. */
3464 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over rtnl_sock with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request', so the caller
 * must not use or free it afterwards. */
3470 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3472 int error = nl_sock_transact(rtnl_sock, request, replyp);
3473 ofpbuf_uninit(request);
3480 /* The values in psched are not individually very meaningful, but they are
3481 * important. The tables below show some values seen in the wild.
3485 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3486 * (Before that, there are hints that it was 1000000000.)
3488 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3492 * -----------------------------------
3493 * [1] 000c8000 000f4240 000f4240 00000064
3494 * [2] 000003e8 00000400 000f4240 3b9aca00
3495 * [3] 000003e8 00000400 000f4240 3b9aca00
3496 * [4] 000003e8 00000400 000f4240 00000064
3497 * [5] 000003e8 00000040 000f4240 3b9aca00
3498 * [6] 000003e8 00000040 000f4240 000000f9
3500 * a b c d ticks_per_s buffer_hz
3501 * ------- --------- ---------- ------------- ----------- -------------
3502 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3503 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3504 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3505 * [4] 1,000 1,024 1,000,000 100 976,562 100
3506 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3507 * [6] 1,000 64 1,000,000 249 15,625,000 249
3509 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3510 * [2] 2.6.26-1-686-bigmem from Debian lenny
3511 * [3] 2.6.26-2-sparc64 from Debian lenny
3512 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3513 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3514 * [6] 2.6.34 from kernel.org on KVM */
/* Reads four hexadecimal numbers a, b, c and d from /proc/net/psched and
 * derives ticks_per_s as a * c / b, computed in floating point.  Failure to
 * open or read the file, or invalid or unexpected values, are each logged
 * with a warning; the raw parameters and the derived values are logged at
 * debug level. */
3516 static const char fn[] = "/proc/net/psched";
3517 unsigned int a, b, c, d;
3523 stream = fopen(fn, "r");
3525 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3529 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3530 VLOG_WARN("%s: read failed", fn);
3534 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3538 VLOG_WARN("%s: invalid scheduler parameters", fn);
3542 ticks_per_s = (double) a * c / b;
3546 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3549 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3552 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3553 * rate of 'rate' bytes per second. */
/* NOTE(review): 'rate * ticks' is evaluated in unsigned int before the
 * floating-point division by ticks_per_s, so a large product can wrap around.
 * Confirm that callers keep the product in range. */
3555 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3560 return (rate * ticks) / ticks_per_s;
3563 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3564 * rate of 'rate' bytes per second. */
/* A 'rate' of 0 yields 0 instead of dividing by zero.  ticks_per_s is
 * truncated to an integer before the multiplication, which is carried out in
 * unsigned long long int. */
3566 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3571 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3574 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3575 * a transmission rate of 'rate' bytes per second. */
/* Computed as the integer quotient rate / buffer_hz.
 *
 * NOTE(review): relies on buffer_hz being nonzero at this point -- confirm
 * that it is always initialized first. */
3577 tc_buffer_per_jiffy(unsigned int rate)
3582 return rate / buffer_hz;
3585 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3586 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3587 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3588 * stores NULL into it if it is absent.
3590 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3593 * Returns 0 if successful, otherwise a positive errno value. */
/* Attributes are parsed starting after the Netlink header and the struct
 * tcmsg.  The policy makes TCA_KIND (a string) mandatory and TCA_OPTIONS (a
 * nested attribute) optional.  A message that does not satisfy the policy is
 * reported with a rate-limited warning. */
3595 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3596 struct nlattr **options)
3598 static const struct nl_policy tca_policy[] = {
3599 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3600 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3602 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3604 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3605 tca_policy, ta, ARRAY_SIZE(ta))) {
3606 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3611 *kind = nl_attr_get_string(ta[TCA_KIND]);
3615 *options = ta[TCA_OPTIONS];
3630 /* Given Netlink 'msg' that describes a class, extracts its class handle (the
 * tcm_handle field of the struct tcmsg that follows the Netlink header) into
 * '*handlep', its TCA_OPTIONS attribute into '*options', and its queue
 * statistics into '*stats'.  Any of the output arguments may be null.
 *
 * Both TCA_OPTIONS and TCA_STATS2 are mandatory in 'msg'.  The statistics
 * come from the TCA_STATS_BASIC and TCA_STATS_QUEUE attributes nested in
 * TCA_STATS2: tx_bytes and tx_packets from the former, tx_errors from the
 * drop count of the latter.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
3637 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3638 struct nlattr **options, struct netdev_queue_stats *stats)
3640 static const struct nl_policy tca_policy[] = {
3641 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3642 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3644 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3646 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3647 tca_policy, ta, ARRAY_SIZE(ta))) {
3648 VLOG_WARN_RL(&rl, "failed to parse class message");
3653 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3654 *handlep = tc->tcm_handle;
3658 *options = ta[TCA_OPTIONS];
3662 const struct gnet_stats_queue *gsq;
3663 struct gnet_stats_basic gsb;
3665 static const struct nl_policy stats_policy[] = {
3666 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3667 .min_len = sizeof gsb },
3668 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3669 .min_len = sizeof *gsq },
3671 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3673 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3674 sa, ARRAY_SIZE(sa))) {
3675 VLOG_WARN_RL(&rl, "failed to parse class stats");
3679 /* Alignment issues screw up the length of struct gnet_stats_basic on
3680 * some arch/bitsize combinations. Newer versions of Linux have a
3681 * struct gnet_stats_basic_packed, but we can't depend on that. The
3682 * easiest thing to do is just to make a copy. */
3683 memset(&gsb, 0, sizeof gsb);
3684 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3685 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3686 stats->tx_bytes = gsb.bytes;
3687 stats->tx_packets = gsb.packets;
3689 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3690 stats->tx_errors = gsq->drops;
3700 memset(stats, 0, sizeof *stats);
3705 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev'. */
/* Sends an RTM_GETTCLASS request with NLM_F_ECHO for class 'handle' under
 * 'parent' on 'netdev' and passes 'replyp' to tc_transact() for the reply.
 * A failure is reported with a rate-limited warning that names the device
 * and both handles. */
3708 tc_query_class(const struct netdev *netdev,
3709 unsigned int handle, unsigned int parent,
3710 struct ofpbuf **replyp)
3712 struct ofpbuf request;
3713 struct tcmsg *tcmsg;
3716 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3720 tcmsg->tcm_handle = handle;
3721 tcmsg->tcm_parent = parent;
3723 error = tc_transact(&request, replyp);
3725 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3726 netdev_get_name(netdev),
3727 tc_get_major(handle), tc_get_minor(handle),
3728 tc_get_major(parent), tc_get_minor(parent),
3734 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Sends an RTM_DELTCLASS request for 'handle' with a parent of 0 and no extra
 * flags, without asking for a reply.  A failure is reported with a
 * rate-limited warning that names the device and the handle. */
3736 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3738 struct ofpbuf request;
3739 struct tcmsg *tcmsg;
3742 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3746 tcmsg->tcm_handle = handle;
3747 tcmsg->tcm_parent = 0;
3749 error = tc_transact(&request, NULL);
3751 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3752 netdev_get_name(netdev),
3753 tc_get_major(handle), tc_get_minor(handle),
3759 /* Equivalent to "tc qdisc del dev <name> root". */
/* Sends an RTM_DELQDISC request for handle 1:0 at TC_H_ROOT.  An EINVAL
 * result is handled specially, as explained in the body.  When the outcome
 * counts as success and the device has a cached tc, the tc_destroy callback
 * of that tc (if it has one) is invoked and the cached pointer is cleared. */
3761 tc_del_qdisc(struct netdev *netdev)
3763 struct netdev_dev_linux *netdev_dev =
3764 netdev_dev_linux_cast(netdev_get_dev(netdev));
3765 struct ofpbuf request;
3766 struct tcmsg *tcmsg;
3769 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3773 tcmsg->tcm_handle = tc_make_handle(1, 0);
3774 tcmsg->tcm_parent = TC_H_ROOT;
3776 error = tc_transact(&request, NULL);
3777 if (error == EINVAL) {
3778 /* EINVAL probably means that the default qdisc was in use, in which
3779 * case we've accomplished our purpose. */
3782 if (!error && netdev_dev->tc) {
3783 if (netdev_dev->tc->ops->tc_destroy) {
3784 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3786 netdev_dev->tc = NULL;
3791 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3792 * kernel to determine what they are. Returns 0 if successful, otherwise a
3793 * positive errno value. */
/* Outline: the qdisc is probed with an RTM_GETQDISC request for handle 1:0.
 * When a reply arrives, its kind is looked up with tc_lookup_linux_name();
 * an unparsable reply or an unknown kind (logged at INFO level) falls back
 * to tc_ops_other.  ENOENT selects tc_ops_default.  Any other failure is
 * logged and selects tc_ops_other.  The tc_load callback of the chosen ops
 * then instantiates the tc; the assertion checks that a tc is attached
 * exactly when loading succeeded.  The return value is the query error if it
 * is nonzero, otherwise the load error. */
3795 tc_query_qdisc(const struct netdev *netdev)
3797 struct netdev_dev_linux *netdev_dev =
3798 netdev_dev_linux_cast(netdev_get_dev(netdev));
3799 struct ofpbuf request, *qdisc;
3800 const struct tc_ops *ops;
3801 struct tcmsg *tcmsg;
3805 if (netdev_dev->tc) {
3809 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3810 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3811 * 2.6.35 without that fix backported to it.
3813 * To avoid the OOPS, we must not make a request that would attempt to dump
3814 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3815 * few others. There are a few ways that I can see to do this, but most of
3816 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3817 * technique chosen here is to assume that any non-default qdisc that we
3818 * create will have a class with handle 1:0. The built-in qdiscs only have
3819 * a class with handle 0:0.
3821 * We could check for Linux 2.6.35+ and use a more straightforward method
3823 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3827 tcmsg->tcm_handle = tc_make_handle(1, 0);
3828 tcmsg->tcm_parent = 0;
3830 /* Figure out what tc class to instantiate. */
3831 error = tc_transact(&request, &qdisc);
3835 error = tc_parse_qdisc(qdisc, &kind, NULL);
3837 ops = &tc_ops_other;
3839 ops = tc_lookup_linux_name(kind);
3841 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3842 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3844 ops = &tc_ops_other;
3847 } else if (error == ENOENT) {
3848 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3849 * other entity that doesn't have a handle 1:0. We will assume
3850 * that it's the system default qdisc. */
3851 ops = &tc_ops_default;
3854 /* Who knows? Maybe the device got deleted. */
3855 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3856 netdev_get_name(netdev), strerror(error));
3857 ops = &tc_ops_other;
3860 /* Instantiate it. */
3861 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3862 assert((load_error == 0) == (netdev_dev->tc != NULL));
3863 ofpbuf_delete(qdisc);
3865 return error ? error : load_error;
3868 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3869 approximate the time to transmit packets of various lengths. For an MTU of
3870 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3871 represents two possible packet lengths; for a MTU of 513 through 1024, four
3872 possible lengths; and so on.
3874 Returns, for the specified 'mtu', the number of bits that packet lengths
3875 need to be shifted right to fit within such a 256-entry table. */
/* The size considered is 'mtu' plus ETH_HEADER_LEN + VLAN_HEADER_LEN.  The
 * loop counts iterations while that size is still 256 or more.
 *
 * NOTE(review): presumably ETH_PAYLOAD_MAX stands in for a missing or invalid
 * 'mtu' -- confirm the condition. */
3877 tc_calc_cell_log(unsigned int mtu)
3882 mtu = ETH_PAYLOAD_MAX;
3884 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3886 for (cell_log = 0; mtu >= 256; cell_log++) {
3893 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'. */
/* Clears '*rate' and then sets cell_log from tc_calc_cell_log(mtu) and mpu to
 * ETH_TOTAL_MIN.  The overhead and cell_align members are deliberately not
 * named in code (see the notes below); they stay zero from the memset(). */
3896 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3898 memset(rate, 0, sizeof *rate);
3899 rate->cell_log = tc_calc_cell_log(mtu);
3900 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3901 /* rate->cell_align = 0; */ /* distro headers. */
3902 rate->mpu = ETH_TOTAL_MIN;
3906 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3907 * attribute of the specified "type".
3909 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* The attribute payload is TC_RTAB_SIZE bytes.  Entry i describes a packet of
 * (i + 1) << rate->cell_log bytes, raised to rate->mpu if it is smaller, and
 * holds the number of ticks tc_bytes_to_ticks() reports for that size at
 * rate->rate. */
3911 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3916 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3917 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3918 unsigned packet_size = (i + 1) << rate->cell_log;
3919 if (packet_size < rate->mpu) {
3920 packet_size = rate->mpu;
3922 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* tc_calc_buffer() never goes below tc_buffer_per_jiffy(Bps) + mtu bytes: the
 * larger of that floor and 'burst_bytes' is what gets converted by
 * tc_bytes_to_ticks().
 *
 * NOTE(review): 'burst_bytes' is uint64_t while 'min_burst' is unsigned int;
 * confirm that tc_bytes_to_ticks() takes the wider value without truncation. */
3926 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3927 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3928 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
3931 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
3933 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
3934 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3937 /* Public utility functions. */
/* Copies the counters listed below, field by field, from '*src' to '*dst'.
 *
 * The macro expands to bare assignment statements, so pointers named 'dst'
 * and 'src' must be in scope where it is used, and the last assignment takes
 * its terminating semicolon from the point of use.  Every field named here
 * must exist under the same name in both structures.  (Presumably used by the
 * netdev_stats/rtnl_link_stats converters that follow -- TODO confirm.) */
3939 #define COPY_NETDEV_STATS \
3940 dst->rx_packets = src->rx_packets; \
3941 dst->tx_packets = src->tx_packets; \
3942 dst->rx_bytes = src->rx_bytes; \
3943 dst->tx_bytes = src->tx_bytes; \
3944 dst->rx_errors = src->rx_errors; \
3945 dst->tx_errors = src->tx_errors; \
3946 dst->rx_dropped = src->rx_dropped; \
3947 dst->tx_dropped = src->tx_dropped; \
3948 dst->multicast = src->multicast; \
3949 dst->collisions = src->collisions; \
3950 dst->rx_length_errors = src->rx_length_errors; \
3951 dst->rx_over_errors = src->rx_over_errors; \
3952 dst->rx_crc_errors = src->rx_crc_errors; \
3953 dst->rx_frame_errors = src->rx_frame_errors; \
3954 dst->rx_fifo_errors = src->rx_fifo_errors; \
3955 dst->rx_missed_errors = src->rx_missed_errors; \
3956 dst->tx_aborted_errors = src->tx_aborted_errors; \
3957 dst->tx_carrier_errors = src->tx_carrier_errors; \
3958 dst->tx_fifo_errors = src->tx_fifo_errors; \
3959 dst->tx_heartbeat_errors = src->tx_heartbeat_errors; \
3960 dst->tx_window_errors = src->tx_window_errors
3962 /* Copies the link statistics in 'src' (a struct rtnl_link_stats) into 'dst'
 * (a struct netdev_stats), performing format conversion in the process. */
3964 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
3965 const struct rtnl_link_stats *src)
3970 /* Copies the link statistics in 'src' (a struct rtnl_link_stats64) into
 * 'dst' (a struct netdev_stats), performing format conversion in the
 * process. */
3972 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
3973 const struct rtnl_link_stats64 *src)
3978 /* Copies the statistics in 'src' (a struct netdev_stats) into 'dst' (a
 * struct rtnl_link_stats64), performing format conversion in the process. */
3980 netdev_stats_to_rtnl_link_stats64(struct rtnl_link_stats64 *dst,
3981 const struct netdev_stats *src)
/* The compressed-packet counters are always reported as zero (presumably
 * struct netdev_stats has no equivalent -- TODO confirm). */
3984 dst->rx_compressed = 0;
3985 dst->tx_compressed = 0;
3988 /* Utility functions. */
/* Sends an RTM_GETLINK request for the interface whose index is 'ifindex' over
 * 'rtnl_sock' and, when the reply carries an IFLA_STATS attribute, converts
 * that attribute into '*stats' with netdev_stats_from_rtnl_link_stats().
 *
 * The reply buffer is released with ofpbuf_delete() after a parse failure,
 * after a reply without statistics, and after a successful conversion. */
3991 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3993 /* Policy for RTNLGRP_LINK messages.
3995 * There are *many* more fields in these messages, but currently we only
3996 * care about these fields. */
3997 static const struct nl_policy rtnlgrp_link_policy[] = {
3998 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3999 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4000 .min_len = sizeof(struct rtnl_link_stats) },
4003 struct ofpbuf request;
4004 struct ofpbuf *reply;
4005 struct ifinfomsg *ifi;
4006 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build an RTM_GETLINK request that selects the interface by index. */
4009 ofpbuf_init(&request, 0);
4010 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4011 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4012 ifi->ifi_family = PF_UNSPEC;
4013 ifi->ifi_index = ifindex;
4014 error = nl_sock_transact(rtnl_sock, &request, &reply);
/* The request buffer is not used again after the transaction. */
4015 ofpbuf_uninit(&request);
/* Parse the attributes that follow the Netlink and ifinfomsg headers. */
4020 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4021 rtnlgrp_link_policy,
4022 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4023 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence is checked here. */
4027 if (!attrs[IFLA_STATS]) {
4028 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4029 ofpbuf_delete(reply);
4033 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4035 ofpbuf_delete(reply);
/* Reads /proc/net/dev line by line looking for the entry of 'netdev_name' and
 * fills '*stats' from it.  A line that does not yield all 15 expected fields
 * is reported as a parse error; if no line matches 'netdev_name' a warning is
 * logged.  All warnings are rate-limited. */
4041 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4043 static const char fn[] = "/proc/net/dev";
4048 stream = fopen(fn, "r");
4050 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4055 while (fgets(line, sizeof line, stream)) {
/* X64 scans one uint64_t counter; each "%*u" parses one unsigned column and
 * discards it. */
4058 #define X64 "%"SCNu64
4061 X64 X64 X64 X64 X64 X64 X64 "%*u"
4062 X64 X64 X64 X64 X64 X64 X64 "%*u",
4068 &stats->rx_fifo_errors,
4069 &stats->rx_frame_errors,
4075 &stats->tx_fifo_errors,
4077 &stats->tx_carrier_errors) != 15) {
4078 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4079 } else if (!strcmp(devname, netdev_name)) {
/* These counters are set to UINT64_MAX rather than parsed (presumably because
 * /proc/net/dev does not report them -- TODO confirm). */
4080 stats->rx_length_errors = UINT64_MAX;
4081 stats->rx_over_errors = UINT64_MAX;
4082 stats->rx_crc_errors = UINT64_MAX;
4083 stats->rx_missed_errors = UINT64_MAX;
4084 stats->tx_aborted_errors = UINT64_MAX;
4085 stats->tx_heartbeat_errors = UINT64_MAX;
4086 stats->tx_window_errors = UINT64_MAX;
4092 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Fetches the interface flags of 'netdev' with the SIOCGIFFLAGS ioctl, issued
 * through netdev_linux_do_ioctl(), and stores ifr.ifr_flags in '*flags'. */
4098 get_flags(const struct netdev *netdev, int *flags)
4103 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4105 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS ioctl
 * and returns whatever netdev_linux_do_ioctl() returns. */
4110 set_flags(struct netdev *netdev, int flags)
4114 ifr.ifr_flags = flags;
4115 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Queries the kernel for the interface index of 'netdev_name' using the
 * SIOCGIFINDEX ioctl on 'af_inet_sock'.  Returns ifr.ifr_ifindex when the
 * ioctl succeeds; a failure is reported with a rate-limited warning. */
4120 do_get_ifindex(const char *netdev_name)
4124 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4125 COVERAGE_INC(netdev_get_ifindex);
4126 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4127 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4128 netdev_name, strerror(errno));
4131 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp'.
 *
 * The index is cached in the underlying netdev_dev_linux: do_get_ifindex() is
 * consulted only while VALID_IFINDEX is not yet set in 'cache_valid', and the
 * result is then remembered in 'ifindex'. */
4135 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4137 struct netdev_dev_linux *netdev_dev =
4138 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4140 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4141 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4145 netdev_dev->cache_valid |= VALID_IFINDEX;
4146 netdev_dev->ifindex = ifindex;
4148 *ifindexp = netdev_dev->ifindex;
/* Retrieves the hardware address of 'netdev_name' with the SIOCGIFHWADDR
 * ioctl on 'af_inet_sock' and copies its first ETH_ADDR_LEN bytes into 'ea'.
 *
 * An ioctl failure is logged (at INFO level for ENODEV, ERR otherwise).  A
 * warning is logged when the reported address family is neither AF_UNSPEC nor
 * ARPHRD_ETHER. */
4153 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4158 memset(&ifr, 0, sizeof ifr);
4159 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4160 COVERAGE_INC(netdev_get_hwaddr);
4161 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4162 /* ENODEV probably means that a vif disappeared asynchronously and
4163 * hasn't been removed from the database yet, so reduce the log level
4164 * to INFO for that case. */
4165 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4166 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4167 netdev_name, strerror(errno));
4170 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4171 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4172 VLOG_WARN("%s device has unknown hardware address family %d",
4173 netdev_name, hwaddr_family);
4175 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of 'netdev_name' to the ETH_ADDR_LEN bytes in
 * 'mac', tagged with address family 'hwaddr_family', using the SIOCSIFHWADDR
 * ioctl on 'af_inet_sock'.  An ioctl failure is logged at ERR level. */
4180 set_etheraddr(const char *netdev_name, int hwaddr_family,
4181 const uint8_t mac[ETH_ADDR_LEN])
4185 memset(&ifr, 0, sizeof ifr);
4186 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4187 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4188 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4189 COVERAGE_INC(netdev_set_hwaddr);
4190 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4191 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4192 netdev_name, strerror(errno));
/* Issues an ethtool request for network device 'name' through the SIOCETHTOOL
 * ioctl on 'af_inet_sock', with 'ecmd' passed as the ifreq data pointer.
 *
 * 'cmd_name' is used only in the log message.  A failure other than
 * EOPNOTSUPP is logged as a rate-limited warning; EOPNOTSUPP is not logged. */
4199 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4200 int cmd, const char *cmd_name)
4204 memset(&ifr, 0, sizeof ifr);
4205 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4206 ifr.ifr_data = (caddr_t) ecmd;
4209 COVERAGE_INC(netdev_ethtool);
4210 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4213 if (errno != EOPNOTSUPP) {
4214 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4215 "failed: %s", cmd_name, name, strerror(errno));
4217 /* The device doesn't support this operation. That's pretty
4218 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' with 'ifr' on
 * 'af_inet_sock'.  The caller fills in any other 'ifr' members beforehand.
 * When the ioctl returns -1 a rate-limited debug message naming 'cmd_name' is
 * logged. */
4225 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4226 const char *cmd_name)
4228 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4229 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4230 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs address-query ioctl 'cmd' (described by 'cmd_name' for logging) on
 * 'netdev' with the request address family set to AF_INET, then stores the
 * 'sin_addr' of the returned 'ifr_addr' in '*ip'. */
4238 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4239 int cmd, const char *cmd_name)
4244 ifr.ifr_addr.sa_family = AF_INET;
4245 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* 'ifr_addr' is a generic sockaddr; view it as sockaddr_in to reach the IPv4
 * address. */
4247 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4248 *ip = sin->sin_addr;
4253 /* Returns an AF_PACKET raw socket or a negative errno value. */
4255 af_packet_sock(void)
4257 static int sock = INT_MIN;
4259 if (sock == INT_MIN) {
4260 sock = socket(AF_PACKET, SOCK_RAW, 0);
4262 set_nonblocking(sock);
4265 VLOG_ERR("failed to create packet socket: %s", strerror(errno));